From 8e89bca78b373c98be82b2b4d98d75b40c9c831e Mon Sep 17 00:00:00 2001 From: animetosho Date: Sat, 27 May 2023 13:57:04 +1000 Subject: [PATCH 01/91] Exploit PMULL+EOR fusion capability on Apple M1 --- gf16/gf16_clmul_neon.c | 47 ++++++++++++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 11 deletions(-) diff --git a/gf16/gf16_clmul_neon.c b/gf16/gf16_clmul_neon.c index b7645184..6af64ff0 100644 --- a/gf16/gf16_clmul_neon.c +++ b/gf16/gf16_clmul_neon.c @@ -11,9 +11,6 @@ static HEDLEY_ALWAYS_INLINE poly8x16_t veorq_p8(poly8x16_t a, poly8x16_t b) { return vreinterpretq_p8_u8(veorq_u8(vreinterpretq_u8_p8(a), vreinterpretq_u8_p8(b))); } -static HEDLEY_ALWAYS_INLINE poly16x8_t veorq_p16(poly16x8_t a, poly16x8_t b) { - return vreinterpretq_p16_u16(veorq_u16(vreinterpretq_u16_p16(a), vreinterpretq_u16_p16(b))); -} #ifdef __aarch64__ typedef poly8x16_t coeff_t; @@ -48,6 +45,34 @@ typedef poly8x8_t coeff_t; # define pmull_high(x, y) vmull_p8(vget_high_p8(x), y) #endif +#if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__)) && defined(__APPLE__) +// Apple M1 supports fusing PMULL+EOR, so ensure these are paired +static HEDLEY_ALWAYS_INLINE poly16x8_t pmacl_low(poly16x8_t sum, poly8x16_t a, poly8x16_t b) { + poly16x8_t result; + __asm__ ("pmull %0.8h,%1.8b,%2.8b\n" + "eor %0.16b,%0.16b,%3.16b\n" + : "=&w"(result) + : "w"(a), "w"(b), "w"(sum) + : /* No clobbers */); + return result; +} +static HEDLEY_ALWAYS_INLINE poly16x8_t pmacl_high(poly16x8_t sum, poly8x16_t a, poly8x16_t b) { + poly16x8_t result; + __asm__ ("pmull2 %0.8h,%1.16b,%2.16b\n" + "eor %0.16b,%0.16b,%3.16b\n" + : "=&w"(result) + : "w"(a), "w"(b), "w"(sum) + : /* No clobbers */); + return result; +} +#else +static HEDLEY_ALWAYS_INLINE poly16x8_t veorq_p16(poly16x8_t a, poly16x8_t b) { + return vreinterpretq_p16_u16(veorq_u16(vreinterpretq_u16_p16(a), vreinterpretq_u16_p16(b))); +} +# define pmacl_low(sum, a, b) veorq_p16(sum, pmull_low(a, b)) +# define pmacl_high(sum, a, b) 
veorq_p16(sum, pmull_high(a, b)) +#endif + static HEDLEY_ALWAYS_INLINE void gf16_clmul_neon_round1(const void* src, poly16x8_t* low1, poly16x8_t* low2, poly16x8_t* mid1, poly16x8_t* mid2, poly16x8_t* high1, poly16x8_t* high2, const coeff_t* coeff) { poly8x16x2_t data = vld2q_p8((const poly8_t*)src); *low1 = pmull_low(data.val[0], coeff[0]); @@ -76,14 +101,14 @@ static HEDLEY_ALWAYS_INLINE void gf16_clmul_neon_round1(const void* src, poly16x } static HEDLEY_ALWAYS_INLINE void gf16_clmul_neon_round(const void* src, poly16x8_t* low1, poly16x8_t* low2, poly16x8_t* mid1, poly16x8_t* mid2, poly16x8_t* high1, poly16x8_t* high2, const coeff_t* coeff) { - poly16x8_t _low1, _low2, _mid1, _mid2, _high1, _high2; - gf16_clmul_neon_round1(src, &_low1, &_low2, &_mid1, &_mid2, &_high1, &_high2, coeff); - *low1 = veorq_p16(*low1, _low1); - *low2 = veorq_p16(*low2, _low2); - *mid1 = veorq_p16(*mid1, _mid1); - *mid2 = veorq_p16(*mid2, _mid2); - *high1 = veorq_p16(*high1, _high1); - *high2 = veorq_p16(*high2, _high2); + poly8x16x2_t data = vld2q_p8((const poly8_t*)src); + *low1 = pmacl_low(*low1, data.val[0], coeff[0]); + *low2 = pmacl_high(*low2, data.val[0], coeff[0]); + poly8x16_t mid = veorq_p8(data.val[0], data.val[1]); + *mid1 = pmacl_low(*mid1, mid, coeff[2]); + *mid2 = pmacl_high(*mid2, mid, coeff[2]); + *high1 = pmacl_low(*high1, data.val[1], coeff[1]); + *high2 = pmacl_high(*high2, data.val[1], coeff[1]); } static HEDLEY_ALWAYS_INLINE void gf16_clmul_neon_reduction(poly16x8_t* low1, poly16x8_t low2, poly16x8_t mid1, poly16x8_t mid2, poly16x8_t* high1, poly16x8_t high2) { From 95b5d3ed14c5b49c2fb1a6ad3107960c158ed72e Mon Sep 17 00:00:00 2001 From: animetosho Date: Sat, 27 May 2023 13:57:36 +1000 Subject: [PATCH 02/91] Set dev version --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 021cf5e9..fcb7340d 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@animetosho/parpar", - "version": 
"0.4.1", + "version": "0.4.2-dev", "description": "High performance multi-threaded PAR2 creation library", "keywords": [ "par2", From 81da84166b8de020bfe33d7452b04e8f2b91d587 Mon Sep 17 00:00:00 2001 From: animetosho Date: Sun, 28 May 2023 13:03:57 +1000 Subject: [PATCH 03/91] Clear OpenCL platform list when unloading --- gf16/controller_ocl.cpp | 5 +---- gf16/controller_ocl.h | 1 + 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/gf16/controller_ocl.cpp b/gf16/controller_ocl.cpp index bee23f60..f1d1e881 100644 --- a/gf16/controller_ocl.cpp +++ b/gf16/controller_ocl.cpp @@ -6,10 +6,6 @@ std::vector PAR2ProcOCL::platforms; -// buffer for zeroing GPU memory -#define ZERO_MEM_SIZE 65536 -#include "gfmat_coeff.h" - int PAR2ProcOCL::load_runtime() { if(load_opencl()) { return 1; @@ -384,6 +380,7 @@ bool PAR2ProcOCL::fillInput(const void* buffer) { return false; } +#include "gfmat_coeff.h" void PAR2ProcOCL::set_coeffs(PAR2ProcOCLStaging& area, unsigned idx, uint16_t inputNum) { uint16_t inputLog = gfmat_input_log(inputNum); auto& coeffs = area.procCoeffs; diff --git a/gf16/controller_ocl.h b/gf16/controller_ocl.h index 09988958..f0e7f7c2 100644 --- a/gf16/controller_ocl.h +++ b/gf16/controller_ocl.h @@ -165,6 +165,7 @@ class PAR2ProcOCL : public IPAR2ProcBackend { public: static int load_runtime(); static inline int unload_runtime() { + platforms.clear(); return unload_opencl(); } static int defaultPlatformId(); From 12fce934067216788e11fedeebd4069f4edf2786 Mon Sep 17 00:00:00 2001 From: animetosho Date: Sun, 28 May 2023 13:49:18 +1000 Subject: [PATCH 04/91] Fix parpar_gf_init signature when building as an internal module --- nexe/build.js | 1 + 1 file changed, 1 insertion(+) diff --git a/nexe/build.js b/nexe/build.js index dd7b8b8c..25464dae 100644 --- a/nexe/build.js +++ b/nexe/build.js @@ -198,6 +198,7 @@ nexe.compile({ data = data.contents.toString(); const internalModuleRegister = (parseFloat(nodeVer) >= 12) ? 
'NODE_MODULE_CONTEXT_AWARE_INTERNAL' : 'NODE_BUILTIN_MODULE_CONTEXT_AWARE'; data = data.replace(/NODE_MODULE\(/, '#define NODE_WANT_INTERNALS 1\n#include \n' + internalModuleRegister + '('); + data = data.replace(/Local module,\s*void\* priv/, 'Local module, v8::Local context, void* priv'); await compiler.setFileContentsAsync('deps/parpar/src/gf.cc', data); data = await compiler.readFileAsync('deps/parpar/binding.gyp'); From 32338b52cd38b64d96a9099faee79c5badeef8ab Mon Sep 17 00:00:00 2001 From: animetosho Date: Sat, 3 Jun 2023 21:22:14 +1000 Subject: [PATCH 05/91] Initial matrix inversion implementation --- gf16/gf16_shuffle.h | 5 ++ gf16/gf16_shuffle_ssse3.c | 23 ++++++++ gf16/gf16mul.cpp | 35 ++++++++++-- gf16/gf16mul.h | 12 +++- gf16/gfmat_coeff.c | 24 ++++++++ gf16/gfmat_inv.cpp | 115 ++++++++++++++++++++++++++++++++++++++ gf16/gfmat_inv.h | 12 ++++ 7 files changed, 219 insertions(+), 7 deletions(-) create mode 100644 gf16/gfmat_inv.cpp create mode 100644 gf16/gfmat_inv.h diff --git a/gf16/gf16_shuffle.h b/gf16/gf16_shuffle.h index 3bff4658..2c12db49 100644 --- a/gf16/gf16_shuffle.h +++ b/gf16/gf16_shuffle.h @@ -121,3 +121,8 @@ void* gf16_shuffle_init_128_sve(int polynomial); void* gf16_shuffle_init_512_sve(int polynomial); int gf16_sve_get_size(); + +uint16_t gf16_shuffle8_replace_word(void* data, size_t index, uint16_t newValue); +uint16_t gf16_shuffle16_replace_word(void* data, size_t index, uint16_t newValue); +uint16_t gf16_shuffle32_replace_word(void* data, size_t index, uint16_t newValue); +uint16_t gf16_shuffle64_replace_word(void* data, size_t index, uint16_t newValue); diff --git a/gf16/gf16_shuffle_ssse3.c b/gf16/gf16_shuffle_ssse3.c index b6b9de4a..ddd8247c 100644 --- a/gf16/gf16_shuffle_ssse3.c +++ b/gf16/gf16_shuffle_ssse3.c @@ -21,6 +21,29 @@ #undef _FNSUFFIX #undef _MM_END +static HEDLEY_ALWAYS_INLINE uint16_t gf16_shuffleX_replace_word(void* data, size_t index, uint16_t newValue, size_t width) { + uint8_t* base = (uint8_t*)data + (index & 
~(width-1)) * 2; + unsigned pos = index & (width-1); + if(width > 16) + pos = (pos & 7) | ((pos & ((width/2)-8)) << 1) | ((pos & (width/2)) ? 8 : 0); // handle awkward positioning due to avoiding cross-lane shuffles + uint16_t oldValue = base[pos + width] | (base[pos] << 8); + base[pos + width] = newValue & 0xff; + base[pos] = newValue >> 8; + return oldValue; +} + +uint16_t gf16_shuffle8_replace_word(void* data, size_t index, uint16_t newValue) { // only used for Affine2x + return gf16_shuffleX_replace_word(data, index, newValue, 8); +} +uint16_t gf16_shuffle16_replace_word(void* data, size_t index, uint16_t newValue) { + return gf16_shuffleX_replace_word(data, index, newValue, 16); +} +uint16_t gf16_shuffle32_replace_word(void* data, size_t index, uint16_t newValue) { + return gf16_shuffleX_replace_word(data, index, newValue, 32); +} +uint16_t gf16_shuffle64_replace_word(void* data, size_t index, uint16_t newValue) { + return gf16_shuffleX_replace_word(data, index, newValue, 64); +} void* gf16_shuffle_init_x86(int polynomial) { diff --git a/gf16/gf16mul.cpp b/gf16/gf16mul.cpp index 1def382b..917de3e0 100644 --- a/gf16/gf16mul.cpp +++ b/gf16/gf16mul.cpp @@ -507,6 +507,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_shuffle_finish_partial_packsum_ssse3; copy_cksum = &gf16_cksum_copy_sse2; copy_cksum_check = &gf16_cksum_copy_check_sse2; + replace_word = &gf16_shuffle16_replace_word; break; case GF16_SHUFFLE_AVX: METHOD_REQUIRES(gf16_shuffle_available_avx && scratch) @@ -526,6 +527,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_shuffle_finish_partial_packsum_avx; copy_cksum = &gf16_cksum_copy_sse2; copy_cksum_check = &gf16_cksum_copy_check_sse2; + replace_word = &gf16_shuffle16_replace_word; break; case GF16_SHUFFLE_AVX2: METHOD_REQUIRES(gf16_shuffle_available_avx2 && scratch) @@ -545,6 +547,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = 
&gf16_shuffle_finish_partial_packsum_avx2; copy_cksum = &gf16_cksum_copy_avx2; copy_cksum_check = &gf16_cksum_copy_check_avx2; + replace_word = &gf16_shuffle32_replace_word; break; case GF16_SHUFFLE_AVX512: METHOD_REQUIRES(gf16_shuffle_available_avx512 && scratch) @@ -573,6 +576,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_shuffle_finish_partial_packsum_avx512; copy_cksum = &gf16_cksum_copy_avx512; copy_cksum_check = &gf16_cksum_copy_check_avx512; + replace_word = &gf16_shuffle64_replace_word; break; default: break; // for pedantic compilers } @@ -604,6 +608,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_shuffle_finish_partial_packsum_avx512; copy_cksum = &gf16_cksum_copy_avx512; copy_cksum_check = &gf16_cksum_copy_check_avx512; + replace_word = &gf16_shuffle64_replace_word; break; case GF16_SHUFFLE2X_AVX512: scratch = gf16_shuffle_init_x86(GF16_POLYNOMIAL); @@ -631,6 +636,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_shuffle2x_finish_partial_packsum_avx512; copy_cksum = &gf16_cksum_copy_avx512; copy_cksum_check = &gf16_cksum_copy_check_avx512; + replace_word = &gf16_shuffle32_replace_word; break; case GF16_SHUFFLE2X_AVX2: scratch = gf16_shuffle_init_x86(GF16_POLYNOMIAL); @@ -658,6 +664,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_shuffle2x_finish_partial_packsum_avx2; copy_cksum = &gf16_cksum_copy_avx2; copy_cksum_check = &gf16_cksum_copy_check_avx2; + replace_word = &gf16_shuffle16_replace_word; break; case GF16_SHUFFLE_NEON: @@ -842,6 +849,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_shuffle_finish_partial_packsum_avx512; copy_cksum = &gf16_cksum_copy_avx512; copy_cksum_check = &gf16_cksum_copy_check_avx512; + replace_word = &gf16_shuffle64_replace_word; break; case GF16_AFFINE_AVX2: @@ -871,6 +879,7 @@ void 
Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_shuffle_finish_partial_packsum_avx2; copy_cksum = &gf16_cksum_copy_avx2; copy_cksum_check = &gf16_cksum_copy_check_avx2; + replace_word = &gf16_shuffle32_replace_word; break; case GF16_AFFINE_GFNI: @@ -900,6 +909,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_shuffle_finish_partial_packsum_ssse3; copy_cksum = &gf16_cksum_copy_sse2; copy_cksum_check = &gf16_cksum_copy_check_sse2; + replace_word = &gf16_shuffle16_replace_word; break; case GF16_AFFINE2X_AVX512: @@ -927,6 +937,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_affine2x_finish_partial_packsum_avx512; copy_cksum = &gf16_cksum_copy_avx512; copy_cksum_check = &gf16_cksum_copy_check_avx512; + replace_word = &gf16_shuffle32_replace_word; break; case GF16_AFFINE2X_AVX2: @@ -954,6 +965,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_affine2x_finish_partial_packsum_avx2; copy_cksum = &gf16_cksum_copy_avx2; copy_cksum_check = &gf16_cksum_copy_check_avx2; + replace_word = &gf16_shuffle16_replace_word; break; case GF16_AFFINE2X_GFNI: @@ -981,6 +993,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_affine2x_finish_partial_packsum_gfni; copy_cksum = &gf16_cksum_copy_sse2; copy_cksum_check = &gf16_cksum_copy_check_sse2; + replace_word = &gf16_shuffle8_replace_word; break; case GF16_XOR_JIT_AVX512: @@ -1018,6 +1031,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_xor_finish_partial_packsum_sse2; copy_cksum = &gf16_cksum_copy_sse2; copy_cksum_check = &gf16_cksum_copy_check_sse2; + replace_word = NULL; break; /* case GF16_XOR_JIT_AVX: @@ -1039,6 +1053,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_xor_finish_partial_packsum_avx; copy_cksum = &gf16_cksum_copy_sse2; 
copy_cksum_check = &gf16_cksum_copy_check_sse2; + replace_word = NULL; break; */ case GF16_XOR_JIT_AVX2: @@ -1060,6 +1075,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_xor_finish_partial_packsum_avx2; copy_cksum = &gf16_cksum_copy_avx2; copy_cksum_check = &gf16_cksum_copy_check_avx2; + replace_word = NULL; break; case GF16_XOR_JIT_AVX512: METHOD_REQUIRES(gf16_xor_available_avx512) @@ -1082,6 +1098,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_xor_finish_partial_packsum_avx512; copy_cksum = &gf16_cksum_copy_avx512; copy_cksum_check = &gf16_cksum_copy_check_avx512; + replace_word = NULL; break; default: break; // for pedantic compilers } @@ -1140,6 +1157,7 @@ Galois16Mul::Galois16Mul(Galois16Methods method) { prepare_packed = &Galois16Mul::_prepare_packed_none; finish = &Galois16Mul::_finish_none; finish_packed = NULL; + replace_word = &Galois16Mul::_replace_word; _mul = NULL; _mul_add_pf = NULL; @@ -1218,9 +1236,11 @@ void Galois16Mul::mutScratch_free(void* mutScratch) const { } } -Galois16Methods Galois16Mul::default_method(size_t regionSizeHint, unsigned /*outputs*/) { +Galois16Methods Galois16Mul::default_method(size_t regionSizeHint, unsigned inputs, unsigned /*outputs*/, bool forInvert) { const CpuCap caps(true); (void)regionSizeHint; + (void)inputs; + (void)forInvert; #ifdef PLATFORM_X86 if(caps.hasGFNI) { @@ -1237,7 +1257,7 @@ Galois16Methods Galois16Mul::default_method(size_t regionSizeHint, unsigned /*ou } if(caps.hasAVX2) { # ifdef PLATFORM_AMD64 - if(gf16_xor_available_avx2 && caps.canMemWX && caps.propFastJit && !caps.isEmulated) // TODO: check size hint? + if(gf16_xor_available_avx2 && caps.canMemWX && caps.propFastJit && !caps.isEmulated && !forInvert) // TODO: check size hint? 
return GF16_XOR_JIT_AVX2; # endif if(gf16_shuffle_available_avx2) @@ -1245,7 +1265,7 @@ Galois16Methods Galois16Mul::default_method(size_t regionSizeHint, unsigned /*ou } if(gf16_affine_available_gfni && caps.hasGFNI && gf16_shuffle_available_ssse3 && caps.hasSSSE3) return GF16_AFFINE2X_GFNI; // this should beat XOR-JIT; even seems to generally beat Shuffle2x AVX2 - if(!caps.isEmulated && (!regionSizeHint || regionSizeHint > caps.propPrefShuffleThresh)) { + if(!caps.isEmulated && regionSizeHint > caps.propPrefShuffleThresh && !forInvert) { // TODO: if only a few recovery slices being made (e.g. 3), prefer shuffle //if(gf16_xor_available_avx && caps.hasAVX && caps.canMemWX) // return GF16_XOR_JIT_AVX; @@ -1260,12 +1280,15 @@ Galois16Methods Galois16Mul::default_method(size_t regionSizeHint, unsigned /*ou return GF16_XOR_SSE2; #endif #ifdef PLATFORM_ARM - if(caps.hasSVE2) - return gf16_sve_get_size() >= 64 ? GF16_SHUFFLE_512_SVE2 : GF16_CLMUL_SVE2; + if(caps.hasSVE2) { + if(gf16_sve_get_size() >= 64) + return GF16_SHUFFLE_512_SVE2; + return inputs > 3 ? GF16_CLMUL_SVE2 : GF16_SHUFFLE_128_SVE2; + } if(caps.hasSVE && gf16_sve_get_size() > 16) return GF16_SHUFFLE_128_SVE; if(gf16_available_neon && caps.hasNEON) - return GF16_CLMUL_NEON; + return inputs > 3 ? 
GF16_CLMUL_NEON : GF16_SHUFFLE_NEON; #endif diff --git a/gf16/gf16mul.h b/gf16/gf16mul.h index d5baa952..9d3e7a59 100644 --- a/gf16/gf16mul.h +++ b/gf16/gf16mul.h @@ -15,6 +15,9 @@ typedef void(*Galois16MulUntransformPacked) (void *HEDLEY_RESTRICT dst, const vo typedef int(*Galois16MulUntransformPackedCksum) (void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen); typedef int(*Galois16MulUntransformPackedCksumPartial) (void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen, size_t partOffset, size_t partLen); +typedef uint16_t(*Galois16ReplaceWord) (void* data, size_t index, uint16_t newValue); + + typedef void(*Galois16MulFunc) (const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); typedef void(*Galois16MulPfFunc) (const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch, const void *HEDLEY_RESTRICT prefetch); typedef void(*Galois16PowFunc) (const void *HEDLEY_RESTRICT scratch, unsigned outputs, size_t offset, void **HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); @@ -122,6 +125,12 @@ class Galois16Mul { } static void _finish_none(void *HEDLEY_RESTRICT, size_t) {} static void _prepare_packed_none(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); + static uint16_t _replace_word(void* data, size_t index, uint16_t newValue) { + uint16_t* p = (uint16_t*)data + index; + uint16_t oldValue = *p; + *p = newValue; + return oldValue; + } Galois16Methods _method; @@ -136,7 +145,7 @@ class Galois16Mul { #endif public: - static 
Galois16Methods default_method(size_t regionSizeHint = 0, unsigned outputs = 0); + static Galois16Methods default_method(size_t regionSizeHint = 1048576, unsigned inputs = 32768, unsigned outputs = 65535, bool forInvert = false); Galois16Mul(Galois16Methods method = GF16_AUTO); ~Galois16Mul(); @@ -199,6 +208,7 @@ class Galois16Mul { Galois16MulUntransformPacked finish_packed; Galois16MulUntransformPackedCksum finish_packed_cksum; Galois16MulUntransformPackedCksumPartial finish_partial_packsum; + Galois16ReplaceWord replace_word; Galois16AddMultiFunc add_multi; Galois16AddPackedFunc add_multi_packed; Galois16AddPackPfFunc add_multi_packpf; diff --git a/gf16/gfmat_coeff.c b/gf16/gfmat_coeff.c index df31276b..4cd8e8b8 100644 --- a/gf16/gfmat_coeff.c +++ b/gf16/gfmat_coeff.c @@ -3,15 +3,24 @@ static int8_t* input_diff = NULL; // difference between predicted input coefficient and actual (number range is -4...5, so could be compressed to 4 bits, but I don't feel it's worth the savings) static uint16_t* gf_exp = NULL; // pre-calculated exponents in GF(2^16), missing bottom 3 bits, followed by 128-entry polynomial shift table +#ifdef PARPAR_INVERT_SUPPORT +uint16_t* gf16_recip = NULL; // full GF(2^16) reciprocal table +#endif void gfmat_init() { if(input_diff) return; input_diff = (int8_t*)malloc(32768); gf_exp = (uint16_t*)malloc((8192+128)*2); +#ifdef PARPAR_INVERT_SUPPORT + gf16_recip = (uint16_t*)malloc(65536*2); +#endif int exp = 0, n = 1; for (int i = 0; i < 32768; i++) { do { +#ifdef PARPAR_INVERT_SUPPORT + gf16_recip[n] = exp; // essentially construct a log table, then alter it later to get the reciprocal +#endif if((exp & 7) == 0) gf_exp[exp>>3] = n; exp++; // exp will reach 65534 by the end of the loop n <<= 1; @@ -20,6 +29,9 @@ void gfmat_init() { input_diff[i] = exp - i*2; } +#ifdef PARPAR_INVERT_SUPPORT + gf16_recip[n] = exp; +#endif // correction values for handling the missing bottom 3 bits of exp // essentially this is a table to speed up multiplication by 
0...127 by applying the effects of polynomial masking @@ -31,6 +43,14 @@ void gfmat_init() { } gf_exp[8192+i] = n; } + +#ifdef PARPAR_INVERT_SUPPORT + gf16_recip[1] = 65535; + // exponentiate for reciprocals + for (int i = 1; i < 65536; i++) { + gf16_recip[i] = gf16_exp(65535 - gf16_recip[i]); + } +#endif } void gfmat_free() { @@ -38,6 +58,10 @@ void gfmat_free() { free(gf_exp); input_diff = NULL; gf_exp = NULL; +#ifdef PARPAR_INVERT_SUPPORT + free(gf16_recip); + gf16_recip = NULL; +#endif } HEDLEY_CONST uint16_t gf16_exp(uint_fast16_t v) { diff --git a/gf16/gfmat_inv.cpp b/gf16/gfmat_inv.cpp new file mode 100644 index 00000000..2bc5cd1a --- /dev/null +++ b/gf16/gfmat_inv.cpp @@ -0,0 +1,115 @@ +#include "gfmat_coeff.h" + +#ifdef PARPAR_INVERT_SUPPORT +extern "C" uint16_t* gf16_recip; + +#include +#include "../src/platform.h" // for ALIGN_* +#include "gf16mul.h" + +uint16_t* compute_recovery_matrix(const std::vector& inputValid, unsigned validCount, std::vector& recovery, unsigned& stride) { + unsigned matWidth = inputValid.size() * sizeof(uint16_t); + Galois16Mul gf(Galois16Mul::default_method(matWidth, inputValid.size(), inputValid.size(), true)); + stride = gf.alignToStride(matWidth); + const auto gfInfo = gf.info(); + void* gfScratch = gf.mutScratch_alloc(); + + unsigned invalidCount = inputValid.size() - validCount; + assert(validCount < inputValid.size()); // i.e. invalidCount > 0 + + uint16_t* mat; + ALIGN_ALLOC(mat, invalidCount * stride, gfInfo.alignment); + + unsigned validCol, missingCol; + unsigned stride16 = stride / sizeof(uint16_t); + assert(stride16 * sizeof(uint16_t) == stride); + + invert_loop: { // loop, in the unlikely case we hit the PAR2 un-invertability flaw; TODO: is there a faster way than just retrying? 
+ if(invalidCount > recovery.size()) { // not enough recovery + gf.mutScratch_free(gfScratch); + ALIGN_FREE(mat); + return nullptr; + } + + // generate matrix + validCol = 0; + missingCol = validCount; + for(unsigned input = 0; input < inputValid.size(); input++) { + uint16_t inputLog = gfmat_input_log(input); + unsigned targetCol = inputValid.at(input) ? validCol++ : missingCol++; + for(unsigned rec = 0; rec < invalidCount; rec++) { + mat[rec * stride16 + targetCol] = gfmat_coeff_from_log(inputLog, recovery.at(rec)); + } + } + assert(validCol == validCount); + + // pre-transform + if(gf.needPrepare()) { + for(unsigned rec = 0; rec < invalidCount; rec++) { + uint16_t* row = mat + rec * stride16; + //memset(row + matWidth, 0, stride - matWidth); // not necessary, but do this to avoid uninitialized memory + gf.prepare(row, row, stride); + } + } + + // invert + // TODO: optimise: multi-thread + packed arrangement + // TODO: progress hook + missingCol = validCount; + for(unsigned rec = 0; rec < invalidCount; rec++) { + uint16_t* row = mat + rec * stride16; + // scale down factor + uint16_t baseCoeff = gf.replace_word(row, missingCol, 1); + if(HEDLEY_UNLIKELY(baseCoeff == 0)) { // bad recovery coeff + // ignore this recovery row and try again + recovery.erase(recovery.begin() + rec); + goto invert_loop; + } + baseCoeff = gf16_recip[baseCoeff]; // TODO: consider prefetching this? + if(HEDLEY_LIKELY(baseCoeff != 1)) { + gf.mul(row, row, stride, baseCoeff, gfScratch); + } + + for(unsigned rec2 = 0; rec2 < invalidCount; rec2++) { + if(HEDLEY_UNLIKELY(rec == rec2)) continue; + uint16_t* row2 = mat + rec2 * stride16; + uint16_t coeff = gf.replace_word(row2, missingCol, 0); + if(HEDLEY_LIKELY(coeff != 0)) { + gf.mul_add(row2, row, stride, coeff, gfScratch); + } // TODO: is a coefficient of 0 ever correct? 
+ } + + missingCol++; + } + + // post transform + if(gf.needPrepare()) { + for(unsigned rec = 0; rec < invalidCount; rec++) { + uint16_t* row = mat + rec * stride16; + gf.finish(row, stride); + + /* + // check for zeroes; TODO: does this need to be the full row? + for(unsigned col = validCount; col < inputValid.size(); col++) { + if(HEDLEY_UNLIKELY(row[col] == 0)) { // bad coeff + recovery.erase(recovery.begin() + rec); + goto invert_loop; + } + } + */ + } + } + } + + // remove excess recovery + recovery.resize(invalidCount); + + gf.mutScratch_free(gfScratch); + return mat; +} + +void free_recovery_matrix(uint16_t* mat) { + ALIGN_FREE(mat); +} + +#endif diff --git a/gf16/gfmat_inv.h b/gf16/gfmat_inv.h new file mode 100644 index 00000000..c70cacba --- /dev/null +++ b/gf16/gfmat_inv.h @@ -0,0 +1,12 @@ +#ifndef GFMAT_INV_H +#define GFMAT_INV_H + +#include +#include "../src/stdint.h" + +#ifdef PARPAR_INVERT_SUPPORT +uint16_t* compute_recovery_matrix(const std::vector& inputValid, unsigned validCount, std::vector& recovery, unsigned& stride); +void free_recovery_matrix(uint16_t* mat); +#endif + +#endif From 7accee96259b6fa7dd501734cddd991994310d4f Mon Sep 17 00:00:00 2001 From: animetosho Date: Sat, 3 Jun 2023 21:25:56 +1000 Subject: [PATCH 06/91] Compiler warning --- gf16/gf_add_x86.h | 55 ++++++++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/gf16/gf_add_x86.h b/gf16/gf_add_x86.h index f44921b3..0024f0df 100644 --- a/gf16/gf_add_x86.h +++ b/gf16/gf_add_x86.h @@ -51,34 +51,35 @@ static HEDLEY_ALWAYS_INLINE void _FN(gf_add_x)( if(vecStride == 16) { // for xor kernels, need to do 4x prefetch - const char* pfBase; - if(doPrefetch) pfBase = _pf+(ptr>>1); - if(doPrefetch == 1) { - _mm_prefetch(pfBase, MM_HINT_WT1); - _mm_prefetch(pfBase+64, MM_HINT_WT1); - if(sizeof(_mword) > 16) { - _mm_prefetch(pfBase+128, MM_HINT_WT1); - _mm_prefetch(pfBase+192, MM_HINT_WT1); + if(doPrefetch) { + const char* pfBase = _pf+(ptr>>1); + 
if(doPrefetch == 1) { + _mm_prefetch(pfBase, MM_HINT_WT1); + _mm_prefetch(pfBase+64, MM_HINT_WT1); + if(sizeof(_mword) > 16) { + _mm_prefetch(pfBase+128, MM_HINT_WT1); + _mm_prefetch(pfBase+192, MM_HINT_WT1); + } + if(sizeof(_mword) > 32) { + _mm_prefetch(pfBase+256, MM_HINT_WT1); + _mm_prefetch(pfBase+320, MM_HINT_WT1); + _mm_prefetch(pfBase+384, MM_HINT_WT1); + _mm_prefetch(pfBase+448, MM_HINT_WT1); + } } - if(sizeof(_mword) > 32) { - _mm_prefetch(pfBase+256, MM_HINT_WT1); - _mm_prefetch(pfBase+320, MM_HINT_WT1); - _mm_prefetch(pfBase+384, MM_HINT_WT1); - _mm_prefetch(pfBase+448, MM_HINT_WT1); - } - } - if(doPrefetch == 2) { - _mm_prefetch(pfBase, _MM_HINT_T1); - _mm_prefetch(pfBase+64, _MM_HINT_T1); - if(sizeof(_mword) > 16) { - _mm_prefetch(pfBase+128, _MM_HINT_T1); - _mm_prefetch(pfBase+192, _MM_HINT_T1); - } - if(sizeof(_mword) > 32) { - _mm_prefetch(pfBase+256, _MM_HINT_T1); - _mm_prefetch(pfBase+320, _MM_HINT_T1); - _mm_prefetch(pfBase+384, _MM_HINT_T1); - _mm_prefetch(pfBase+448, _MM_HINT_T1); + if(doPrefetch == 2) { + _mm_prefetch(pfBase, _MM_HINT_T1); + _mm_prefetch(pfBase+64, _MM_HINT_T1); + if(sizeof(_mword) > 16) { + _mm_prefetch(pfBase+128, _MM_HINT_T1); + _mm_prefetch(pfBase+192, _MM_HINT_T1); + } + if(sizeof(_mword) > 32) { + _mm_prefetch(pfBase+256, _MM_HINT_T1); + _mm_prefetch(pfBase+320, _MM_HINT_T1); + _mm_prefetch(pfBase+384, _MM_HINT_T1); + _mm_prefetch(pfBase+448, _MM_HINT_T1); + } } } } else { From ae7af0fa682c12a882dd7173b7f73f4a2ed77627 Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 6 Jun 2023 12:27:19 +1000 Subject: [PATCH 07/91] Enable in-place mul/prepare for use in inversion + replace_word fixes --- gf16/gf16_affine.h | 5 +- gf16/gf16_affine2x_x86.h | 6 +- gf16/gf16_affine_avx2.c | 25 +++- gf16/gf16_affine_avx512.c | 25 +++- gf16/gf16_affine_gfni.c | 24 +++- gf16/gf16_clmul.h | 1 + gf16/gf16_clmul_neon.c | 43 ++++-- gf16/gf16_clmul_sve2.c | 18 +++ gf16/gf16_global.h | 14 +- gf16/gf16_lookup.c | 4 +- gf16/gf16_lookup.h | 6 +- 
gf16/gf16_lookup_sse2.c | 2 +- gf16/gf16_shuffle.h | 20 +-- gf16/gf16_shuffle2x128_sve2.c | 37 ++++++ gf16/gf16_shuffle2x_x86.h | 6 +- gf16/gf16_shuffle512_sve2.c | 24 ++++ gf16/gf16_shuffle_neon.c | 2 +- gf16/gf16_shuffle_ssse3.c | 19 ++- gf16/gf16_shuffle_vbmi.c | 2 +- gf16/gf16_shuffle_x86.h | 2 +- gf16/gf16_shuffle_x86_prepare.h | 4 +- gf16/gf16_xor.h | 10 +- gf16/gf16_xor_avx2.c | 73 +++++++---- gf16/gf16_xor_avx512.c | 16 +-- gf16/gf16_xor_common.h | 25 +++- gf16/gf16_xor_common_funcs.h | 95 ++++++++++---- gf16/gf16_xor_sse2.c | 225 ++++++++++++++++++++++---------- gf16/gf16mul.cpp | 37 ++++-- gf16/gf16mul.h | 49 +++---- 29 files changed, 586 insertions(+), 233 deletions(-) diff --git a/gf16/gf16_affine.h b/gf16/gf16_affine.h index b25f4eb9..254e5334 100644 --- a/gf16/gf16_affine.h +++ b/gf16/gf16_affine.h @@ -2,7 +2,7 @@ #include "../src/hedley.h" #define FUNCS(v) \ - void gf16_affine_mul_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ + void gf16_affine_mul_##v(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ void gf16_affine_muladd_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ void gf16_affine_muladd_prefetch_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch, const void *HEDLEY_RESTRICT prefetch); \ void gf16_affine_muladd_multi_##v(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); \ @@ -21,11 +21,12 @@ FUNCS(avx512); #undef FUNCS 
#define FUNCS(v) \ + void gf16_affine2x_mul_##v(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ void gf16_affine2x_muladd_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ void gf16_affine2x_muladd_multi_##v(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); \ void gf16_affine2x_muladd_multi_packed_##v(const void *HEDLEY_RESTRICT scratch, unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); \ void gf16_affine2x_muladd_multi_packpf_##v(const void *HEDLEY_RESTRICT scratch, unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut); \ - void gf16_affine2x_prepare_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen); \ + void gf16_affine2x_prepare_##v(void* dst, const void* src, size_t srcLen); \ void gf16_affine2x_prepare_packed_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); \ void gf16_affine2x_prepare_packed_cksum_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); \ void gf16_affine2x_prepare_partial_packsum_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t 
sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen, size_t partOffset, size_t partLen); \ diff --git a/gf16/gf16_affine2x_x86.h b/gf16/gf16_affine2x_x86.h index acf522ce..b6391622 100644 --- a/gf16/gf16_affine2x_x86.h +++ b/gf16/gf16_affine2x_x86.h @@ -4,12 +4,12 @@ #ifdef _AVAILABLE # include "gf16_checksum_x86.h" -static HEDLEY_ALWAYS_INLINE void _FN(gf16_affine2x_prepare_block)(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src) { +static HEDLEY_ALWAYS_INLINE void _FN(gf16_affine2x_prepare_block)(void* dst, const void* src) { _mword data = _MMI(loadu)((_mword*)src); data = separate_low_high(data); _MMI(store)((_mword*)dst, data); } -static HEDLEY_ALWAYS_INLINE void _FN(gf16_affine2x_prepare_blocku)(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t remaining) { +static HEDLEY_ALWAYS_INLINE void _FN(gf16_affine2x_prepare_blocku)(void* dst, const void* src, size_t remaining) { _mword data = partial_load(src, remaining); data = separate_low_high(data); _MMI(store)((_mword*)dst, data); @@ -62,7 +62,7 @@ static HEDLEY_ALWAYS_INLINE void _FN(gf16_affine2x_finish_copy_blocku)(void *HED } #endif -void _FN(gf16_affine2x_prepare)(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen) { +void _FN(gf16_affine2x_prepare)(void* dst, const void* src, size_t srcLen) { #ifdef _AVAILABLE gf16_prepare(dst, src, srcLen, sizeof(_mword), &_FN(gf16_affine2x_prepare_block), &_FN(gf16_affine2x_prepare_blocku)); _MM_END diff --git a/gf16/gf16_affine_avx2.c b/gf16/gf16_affine_avx2.c index 80eaea34..1d42e328 100644 --- a/gf16/gf16_affine_avx2.c +++ b/gf16/gf16_affine_avx2.c @@ -53,7 +53,7 @@ static HEDLEY_ALWAYS_INLINE __m256i gf16_affine_load_matrix(const void *HEDLEY_R } #endif -void gf16_affine_mul_avx2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { +void gf16_affine_mul_avx2(const void *HEDLEY_RESTRICT 
scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { UNUSED(mutScratch); #if defined(__GFNI__) && defined(__AVX2__) __m256i depmask = gf16_affine_load_matrix(scratch, coefficient); @@ -336,6 +336,29 @@ static HEDLEY_ALWAYS_INLINE void gf16_affine2x_muladd_x_avx2( } #endif /*defined(__GFNI__) && defined(__AVX2__)*/ +void gf16_affine2x_mul_avx2(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { + UNUSED(mutScratch); +#if defined(__GFNI__) && defined(__AVX2__) + __m256i depmask = gf16_affine_load_matrix(scratch, coefficient); + __m256i matNorm = _mm256_inserti128_si256(depmask, _mm256_castsi256_si128(depmask), 1); + __m256i matSwap = _mm256_permute2x128_si256(depmask, depmask, 0x11); + + uint8_t* _src = (uint8_t*)src + len; + uint8_t* _dst = (uint8_t*)dst + len; + + for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(__m256i)) { + __m256i data = _mm256_load_si256((__m256i*)(_src + ptr)); + __m256i result1 = _mm256_gf2p8affine_epi64_epi8(data, matNorm, 0); + __m256i result2 = _mm256_gf2p8affine_epi64_epi8(data, matSwap, 0); + + result1 = _mm256_xor_si256(result1, _mm256_shuffle_epi32(result2, _MM_SHUFFLE(1,0,3,2))); + _mm256_store_si256((__m256i*)(_dst + ptr), result1); + } +#else + UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficient); +#endif +} + void gf16_affine2x_muladd_avx2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { UNUSED(mutScratch); #if defined(__GFNI__) && defined(__AVX2__) diff --git a/gf16/gf16_affine_avx512.c b/gf16/gf16_affine_avx512.c index a848d0d2..67f11f36 100644 --- a/gf16/gf16_affine_avx512.c +++ b/gf16/gf16_affine_avx512.c @@ -87,7 +87,7 @@ static HEDLEY_ALWAYS_INLINE __m512i gf16_affine_load2_matrix(const void *HEDLEY_ } #endif -void gf16_affine_mul_avx512(const void 
*HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { +void gf16_affine_mul_avx512(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { UNUSED(mutScratch); #if defined(__GFNI__) && defined(__AVX512BW__) && defined(__AVX512VL__) __m256i depmask = gf16_affine_load_matrix(scratch, coefficient); @@ -465,6 +465,29 @@ static HEDLEY_ALWAYS_INLINE void gf16_affine2x_muladd_x_avx512( #endif /*defined(__GFNI__) && defined(__AVX512BW__) && defined(__AVX512VL__)*/ +void gf16_affine2x_mul_avx512(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { + UNUSED(mutScratch); +#if defined(__GFNI__) && defined(__AVX512BW__) && defined(__AVX512VL__) + __m512i depmask = _mm512_castsi256_si512(gf16_affine_load_matrix(scratch, coefficient)); + __m512i matNorm = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); + __m512i matSwap = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); + + uint8_t* _src = (uint8_t*)src + len; + uint8_t* _dst = (uint8_t*)dst + len; + + for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(__m512i)) { + __m512i data = _mm512_load_si512((__m512i*)(_src + ptr)); + __m512i result = _mm512_gf2p8affine_epi64_epi8(data, matNorm, 0); + __m512i swapped = _mm512_gf2p8affine_epi64_epi8(data, matSwap, 0); + + result = _mm512_xor_si512(result, _mm512_shuffle_epi32(swapped, _MM_SHUFFLE(1,0,3,2))); + _mm512_store_si512((__m512i*)(_dst + ptr), result); + } +#else + UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficient); +#endif +} + void gf16_affine2x_muladd_avx512(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { UNUSED(mutScratch); #if defined(__GFNI__) && 
defined(__AVX512BW__) && defined(__AVX512VL__) diff --git a/gf16/gf16_affine_gfni.c b/gf16/gf16_affine_gfni.c index be1bc9e4..e7668cdb 100644 --- a/gf16/gf16_affine_gfni.c +++ b/gf16/gf16_affine_gfni.c @@ -56,7 +56,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_affine_load_matrix(const void *HEDLEY_REST } #endif -void gf16_affine_mul_gfni(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { +void gf16_affine_mul_gfni(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { UNUSED(mutScratch); #if defined(__GFNI__) && defined(__SSSE3__) __m128i depmask1, depmask2; @@ -364,6 +364,28 @@ static HEDLEY_ALWAYS_INLINE void gf16_affine2x_muladd_x_gfni( } #endif /*defined(__GFNI__) && defined(__SSSE3__)*/ +void gf16_affine2x_mul_gfni(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { + UNUSED(mutScratch); +#if defined(__GFNI__) && defined(__SSSE3__) + __m128i matNorm, matSwap; + gf16_affine_load_matrix(scratch, coefficient, &matNorm, &matSwap); + + uint8_t* _src = (uint8_t*)src + len; + uint8_t* _dst = (uint8_t*)dst + len; + + for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(__m128i)) { + __m128i data = _mm_load_si128((__m128i*)(_src + ptr)); + __m128i result1 = _mm_gf2p8affine_epi64_epi8(data, matNorm, 0); + __m128i result2 = _mm_gf2p8affine_epi64_epi8(data, matSwap, 0); + + result1 = _mm_xor_si128(result1, _mm_shuffle_epi32(result2, _MM_SHUFFLE(1,0,3,2))); + _mm_store_si128((__m128i*)(_dst + ptr), result1); + } +#else + UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficient); +#endif +} + void gf16_affine2x_muladd_gfni(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) 
{ UNUSED(mutScratch); #if defined(__GFNI__) && defined(__SSSE3__) diff --git a/gf16/gf16_clmul.h b/gf16/gf16_clmul.h index b5be1ff0..696f0dcc 100644 --- a/gf16/gf16_clmul.h +++ b/gf16/gf16_clmul.h @@ -5,6 +5,7 @@ void gf16_clmul_muladd_multi_##v(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); \ void gf16_clmul_muladd_multi_packed_##v(const void *HEDLEY_RESTRICT scratch, unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); \ void gf16_clmul_muladd_multi_packpf_##v(const void *HEDLEY_RESTRICT scratch, unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut); \ + void gf16_clmul_mul_##v(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ void gf16_clmul_muladd_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ void gf16_clmul_prepare_packed_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); \ void gf16_clmul_prepare_packed_cksum_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); \ diff --git a/gf16/gf16_clmul_neon.c b/gf16/gf16_clmul_neon.c index 6af64ff0..7d73a938 100644 --- a/gf16/gf16_clmul_neon.c +++ b/gf16/gf16_clmul_neon.c @@ 
-36,6 +36,7 @@ static HEDLEY_ALWAYS_INLINE poly16x8_t pmull_high(poly8x16_t a, poly8x16_t b) { # define pmull_low(x, y) vmull_p8(vget_low_p8(x), vget_low_p8(y)) # define pmull_high vmull_high_p8 # endif +# define coeff_fn(f1, f2) f1##q_##f2 #else static HEDLEY_ALWAYS_INLINE poly8x8_t veor_p8(poly8x8_t a, poly8x8_t b) { return vreinterpret_p8_u8(veor_u8(vreinterpret_u8_p8(a), vreinterpret_u8_p8(b))); @@ -43,6 +44,7 @@ static HEDLEY_ALWAYS_INLINE poly8x8_t veor_p8(poly8x8_t a, poly8x8_t b) { typedef poly8x8_t coeff_t; # define pmull_low(x, y) vmull_p8(vget_low_p8(x), y) # define pmull_high(x, y) vmull_p8(vget_high_p8(x), y) +# define coeff_fn(f1, f2) f1##_##f2 #endif #if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__)) && defined(__APPLE__) @@ -178,18 +180,11 @@ static HEDLEY_ALWAYS_INLINE void gf16_clmul_muladd_x_neon( for(int src=0; src> 8; -#ifdef __aarch64__ - coeff[src*CLMUL_COEFF_PER_REGION +0] = vdupq_n_p8(lo); - coeff[src*CLMUL_COEFF_PER_REGION +1] = vdupq_n_p8(hi); - coeff[src*CLMUL_COEFF_PER_REGION +2] = veorq_p8(coeff[src*CLMUL_COEFF_PER_REGION +0], coeff[src*CLMUL_COEFF_PER_REGION +1]); - - // if we want to have one register per region, at the expense of 2 extra instructions per region + coeff[src*CLMUL_COEFF_PER_REGION +0] = coeff_fn(vdup, n_p8)(lo); + coeff[src*CLMUL_COEFF_PER_REGION +1] = coeff_fn(vdup, n_p8)(hi); + coeff[src*CLMUL_COEFF_PER_REGION +2] = coeff_fn(veor, p8)(coeff[src*CLMUL_COEFF_PER_REGION +0], coeff[src*CLMUL_COEFF_PER_REGION +1]); + // if we want to have one register per region (AArch64), at the expense of 2 extra instructions per region //coeff[src] = vcombine_p8(vdup_n_p8(lo), vdup_n_p8(hi)); -#else - coeff[src*CLMUL_COEFF_PER_REGION +0] = vdup_n_p8(lo); - coeff[src*CLMUL_COEFF_PER_REGION +1] = vdup_n_p8(hi); - coeff[src*CLMUL_COEFF_PER_REGION +2] = veor_p8(coeff[src*CLMUL_COEFF_PER_REGION +0], coeff[src*CLMUL_COEFF_PER_REGION +1]); -#endif } poly16x8_t low1, low2, mid1, mid2, high1, high2; @@ -249,6 +244,32 @@ 
static HEDLEY_ALWAYS_INLINE void gf16_clmul_muladd_x_neon( +void gf16_clmul_mul_neon(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t val, void *HEDLEY_RESTRICT mutScratch) { + UNUSED(mutScratch); UNUSED(scratch); +#if defined(__ARM_NEON) + + coeff_t coeff[3]; + coeff[0] = coeff_fn(vdup, n_p8)(val & 0xff); + coeff[1] = coeff_fn(vdup, n_p8)(val >> 8); + coeff[2] = coeff_fn(veor, p8)(coeff[0], coeff[1]); + + uint8_t* _src = (uint8_t*)src + len; + uint8_t* _dst = (uint8_t*)dst + len; + poly16x8_t low1, low2, mid1, mid2, high1, high2; + for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(uint8x16_t)*2) { + gf16_clmul_neon_round1(_src+ptr, &low1, &low2, &mid1, &mid2, &high1, &high2, coeff); + gf16_clmul_neon_reduction(&low1, low2, mid1, mid2, &high1, high2); + uint8x16x2_t out; + out.val[0] = vreinterpretq_u8_p16(low1); + out.val[1] = vreinterpretq_u8_p16(high1); + vst2q_u8(_dst+ptr, out); + } +#else + UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(val); +#endif +} + + void gf16_clmul_muladd_neon(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t val, void *HEDLEY_RESTRICT mutScratch) { UNUSED(mutScratch); #if defined(__ARM_NEON) diff --git a/gf16/gf16_clmul_sve2.c b/gf16/gf16_clmul_sve2.c index ad225827..913dc8a9 100644 --- a/gf16/gf16_clmul_sve2.c +++ b/gf16/gf16_clmul_sve2.c @@ -178,6 +178,24 @@ static HEDLEY_ALWAYS_INLINE void gf16_clmul_muladd_x_sve2( +void gf16_clmul_mul_sve2(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t val, void *HEDLEY_RESTRICT mutScratch) { + UNUSED(mutScratch); UNUSED(scratch); +#if defined(__ARM_FEATURE_SVE2) + svuint8_t coeff = svreinterpret_u8_u16(svdup_n_u16(val)); + uint8_t* _src = (uint8_t*)src + len; + uint8_t* _dst = (uint8_t*)dst + len; + + svuint8_t low1, low2, mid1, mid2, high1, high2; + for(intptr_t ptr = -(intptr_t)len; ptr; ptr += svcntb()*2) { + gf16_clmul_sve2_round(_src+ptr, &low1, 
&low2, &mid1, &mid2, &high1, &high2, coeff); + gf16_clmul_sve2_reduction(&low1, low2, mid1, mid2, &high1, high2); + svst2_u8(svptrue_b8(), _dst+ptr, svcreate2_u8(low1, high1)); + } +#else + UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(val); +#endif +} + void gf16_clmul_muladd_sve2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t val, void *HEDLEY_RESTRICT mutScratch) { UNUSED(mutScratch); #if defined(__ARM_FEATURE_SVE2) diff --git a/gf16/gf16_global.h b/gf16/gf16_global.h index ce72593e..4951b544 100644 --- a/gf16/gf16_global.h +++ b/gf16/gf16_global.h @@ -31,13 +31,15 @@ typedef void (CONST_PTR gf16_checksum_exp)(void *HEDLEY_RESTRICT checksum, uint16_t exp); typedef void (CONST_PTR gf16_checksum_block)(const void *HEDLEY_RESTRICT src, void *HEDLEY_RESTRICT checksum, const size_t blockLen, const int aligned); typedef void (CONST_PTR gf16_checksum_blocku)(const void *HEDLEY_RESTRICT src, size_t amount, void *HEDLEY_RESTRICT checksum); -typedef void (CONST_PTR gf16_transform_block)(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src); -typedef void (CONST_PTR gf16_transform_blocku)(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t remaining); -typedef void (CONST_PTR gf16_prepare_checksum)(void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT checksum, const size_t blockLen, gf16_transform_block prepareBlock); +typedef void (CONST_PTR gf16_transform_block)(void* dst, const void* src); +typedef void (CONST_PTR gf16_transform_blocku)(void* dst, const void* src, size_t remaining); +typedef void (CONST_PTR gf16_transform_block_rst)(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src); +typedef void (CONST_PTR gf16_transform_blocku_rst)(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t remaining); +typedef void (CONST_PTR gf16_prepare_checksum)(void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT checksum, const size_t blockLen, gf16_transform_block_rst 
prepareBlock); typedef void (CONST_PTR gf16_finish_block)(void *HEDLEY_RESTRICT dst); #undef CONST_PTR -static HEDLEY_ALWAYS_INLINE void gf16_prepare(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, const size_t blockLen, gf16_transform_block prepareBlock, gf16_transform_blocku prepareBlockU) { +static HEDLEY_ALWAYS_INLINE void gf16_prepare(void* dst, const void* src, size_t srcLen, const size_t blockLen, gf16_transform_block prepareBlock, gf16_transform_blocku prepareBlockU) { size_t remaining = srcLen % blockLen; size_t len = srcLen - remaining; uint8_t* _src = (uint8_t*)src + len; @@ -79,7 +81,7 @@ static HEDLEY_ALWAYS_INLINE void* gf16_checksum_ptr(void* ptr, size_t sliceLen, #include #include "gfmat_coeff.h" static HEDLEY_ALWAYS_INLINE void gf16_prepare_packed( - void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, const size_t blockLen, gf16_transform_block prepareBlock, gf16_transform_blocku prepareBlockU, + void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, const size_t blockLen, gf16_transform_block_rst prepareBlock, gf16_transform_blocku_rst prepareBlockU, unsigned inputPackSize, unsigned inputNum, size_t chunkLen, const unsigned interleaveSize, size_t partOffset, size_t partLen, void *HEDLEY_RESTRICT checksum, gf16_checksum_block checksumBlock, gf16_checksum_blocku checksumBlockU, gf16_checksum_exp checksumExp, gf16_prepare_checksum prepareChecksum @@ -217,7 +219,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_prepare_packed( static HEDLEY_ALWAYS_INLINE int gf16_finish_packed( - void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT src, size_t sliceLen, const size_t blockLen, gf16_transform_block finishBlock, gf16_transform_blocku finishBlockU, + void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT src, size_t sliceLen, const size_t blockLen, gf16_transform_block_rst finishBlock, gf16_transform_blocku_rst finishBlockU, unsigned numOutputs, unsigned outputNum, 
size_t chunkLen, const unsigned interleaveSize, size_t partOffset, size_t partLen, gf16_checksum_block checksumBlock, gf16_checksum_blocku checksumBlockU, gf16_checksum_exp checksumExp, gf16_finish_block inlineFinishBlock, diff --git a/gf16/gf16_lookup.c b/gf16/gf16_lookup.c index 0e9b4060..c8ceef7a 100644 --- a/gf16/gf16_lookup.c +++ b/gf16/gf16_lookup.c @@ -182,7 +182,7 @@ static HEDLEY_ALWAYS_INLINE void calc_table(uint16_t coefficient, uint16_t* lhta #endif -void gf16_lookup_mul(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { +void gf16_lookup_mul(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { UNUSED(scratch); UNUSED(mutScratch); uint16_t lhtable[512]; calc_table(coefficient, lhtable); @@ -393,7 +393,7 @@ static HEDLEY_ALWAYS_INLINE void calc_3table(uint16_t coefficient, struct gf16_l } } -void gf16_lookup3_mul(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { +void gf16_lookup3_mul(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { UNUSED(scratch); UNUSED(mutScratch); struct gf16_lookup3_tables lookup; calc_3table(coefficient, &lookup); diff --git a/gf16/gf16_lookup.h b/gf16/gf16_lookup.h index adce8b5d..9eaaf6f1 100644 --- a/gf16/gf16_lookup.h +++ b/gf16/gf16_lookup.h @@ -5,15 +5,15 @@ #include "../src/stdint.h" #include -void gf16_lookup_mul(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); +void gf16_lookup_mul(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); 
void gf16_lookup_muladd(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); void gf16_lookup_powadd(const void *HEDLEY_RESTRICT scratch, unsigned outputs, size_t offset, void **HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); -void gf16_lookup3_mul(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); +void gf16_lookup3_mul(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); void gf16_lookup3_muladd(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); void gf16_lookup3_muladd_multi_packed(const void *HEDLEY_RESTRICT scratch, unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); -void gf16_lookup_mul_sse2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); +void gf16_lookup_mul_sse2(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); void gf16_lookup_muladd_sse2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); diff --git a/gf16/gf16_lookup_sse2.c b/gf16/gf16_lookup_sse2.c index 7831582f..a95d9a7f 100644 --- a/gf16/gf16_lookup_sse2.c +++ b/gf16/gf16_lookup_sse2.c @@ -65,7 +65,7 @@ static HEDLEY_ALWAYS_INLINE void calc_table(uint16_t val, 
uint16_t* lhtable) { } #endif -void gf16_lookup_mul_sse2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { +void gf16_lookup_mul_sse2(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { UNUSED(scratch); UNUSED(mutScratch); #ifdef __SSE2__ ALIGN_TO(16, uint16_t lhtable[513]); // +1 for potential misaligned load at end diff --git a/gf16/gf16_shuffle.h b/gf16/gf16_shuffle.h index 2c12db49..0344bcef 100644 --- a/gf16/gf16_shuffle.h +++ b/gf16/gf16_shuffle.h @@ -3,7 +3,7 @@ // basic #define FUNCS(v) \ - void gf16_shuffle_prepare_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen); \ + void gf16_shuffle_prepare_##v(void* dst, const void* src, size_t srcLen); \ void gf16_shuffle_prepare_packed_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); \ void gf16_shuffle_prepare_packed_cksum_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); \ void gf16_shuffle_prepare_partial_packsum_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen, size_t partOffset, size_t partLen); \ @@ -11,7 +11,7 @@ void gf16_shuffle_finish_packed_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen); \ int gf16_shuffle_finish_packed_cksum_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen); \ int gf16_shuffle_finish_partial_packsum_##v(void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT src, size_t 
sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen, size_t partOffset, size_t partLen); \ - void gf16_shuffle_mul_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ + void gf16_shuffle_mul_##v(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ void gf16_shuffle_muladd_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ void gf16_shuffle_muladd_prefetch_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch, const void *HEDLEY_RESTRICT prefetch); \ extern int gf16_shuffle_available_##v @@ -38,7 +38,7 @@ FUNCS(512_sve2); #undef FUNCS -void gf16_shuffle_mul_vbmi(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); +void gf16_shuffle_mul_vbmi(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); void gf16_shuffle_muladd_vbmi(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); void gf16_shuffle_muladd_prefetch_vbmi(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch, const void *HEDLEY_RESTRICT prefetch); void gf16_shuffle_prepare_packed_vbmi(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); @@ -47,14 +47,13 @@ 
void gf16_shuffle_prepare_partial_packsum_vbmi(void *HEDLEY_RESTRICT dst, const extern int gf16_shuffle_available_vbmi; #define FUNCS(v) \ - void gf16_shuffle_mul_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ + void gf16_shuffle_mul_##v(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ void gf16_shuffle_muladd_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) FUNCS(neon); FUNCS(128_sve); FUNCS(128_sve2); -void gf16_shuffle_muladd_512_sve2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); - +FUNCS(512_sve2); #undef FUNCS @@ -83,7 +82,7 @@ extern int gf16_available_sve2; // shuffle2x #define FUNCS(v) \ - void gf16_shuffle2x_prepare_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen); \ + void gf16_shuffle2x_prepare_##v(void* dst, const void* src, size_t srcLen); \ void gf16_shuffle2x_prepare_packed_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); \ void gf16_shuffle2x_prepare_packed_cksum_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); \ void gf16_shuffle2x_prepare_partial_packsum_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen, size_t partOffset, size_t partLen); \ @@ -91,7 +90,7 @@ extern int gf16_available_sve2; void gf16_shuffle2x_finish_packed_##v(void *HEDLEY_RESTRICT dst, const 
void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen); \ int gf16_shuffle2x_finish_packed_cksum_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen); \ int gf16_shuffle2x_finish_partial_packsum_##v(void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen, size_t partOffset, size_t partLen); \ - void gf16_shuffle2x_mul_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ + void gf16_shuffle2x_mul_##v(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ void gf16_shuffle2x_muladd_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ void gf16_shuffle2x_muladd_multi_##v(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); \ void gf16_shuffle2x_muladd_multi_packed_##v(const void *HEDLEY_RESTRICT scratch, unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); \ @@ -108,6 +107,7 @@ void gf16_shuffle2x_prepare_partial_packsum_sve(void *HEDLEY_RESTRICT dst, const void gf16_shuffle2x_finish_packed_sve(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen); int gf16_shuffle2x_finish_packed_cksum_sve(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t 
sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen); int gf16_shuffle2x_finish_partial_packsum_sve(void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen, size_t partOffset, size_t partLen); +void gf16_shuffle2x_mul_128_sve2(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); void gf16_shuffle2x_muladd_128_sve2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); void gf16_shuffle2x_muladd_multi_128_sve2(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); void gf16_shuffle2x_muladd_multi_packed_128_sve2(const void *HEDLEY_RESTRICT scratch, unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); @@ -122,7 +122,9 @@ void* gf16_shuffle_init_512_sve(int polynomial); int gf16_sve_get_size(); -uint16_t gf16_shuffle8_replace_word(void* data, size_t index, uint16_t newValue); +uint16_t gf16_affine2x_replace_word(void* data, size_t index, uint16_t newValue); uint16_t gf16_shuffle16_replace_word(void* data, size_t index, uint16_t newValue); uint16_t gf16_shuffle32_replace_word(void* data, size_t index, uint16_t newValue); uint16_t gf16_shuffle64_replace_word(void* data, size_t index, uint16_t newValue); +uint16_t gf16_shuffle2x16_replace_word(void* data, size_t index, uint16_t newValue); +uint16_t gf16_shuffle2x32_replace_word(void* data, size_t index, uint16_t newValue); diff --git a/gf16/gf16_shuffle2x128_sve2.c b/gf16/gf16_shuffle2x128_sve2.c index ac1cda5d..4e358b6d 100644 --- 
a/gf16/gf16_shuffle2x128_sve2.c +++ b/gf16/gf16_shuffle2x128_sve2.c @@ -251,6 +251,43 @@ static HEDLEY_ALWAYS_INLINE void gf16_shuffle2x_muladd_x_sve2( #endif /*defined(__ARM_FEATURE_SVE2)*/ +void gf16_shuffle2x_mul_128_sve2(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t val, void *HEDLEY_RESTRICT mutScratch) { + UNUSED(mutScratch); + UNUSED(scratch); +#if defined(__ARM_FEATURE_SVE2) + svuint8_t tbl_ln, tbl_ls, tbl_hn, tbl_hs; + gf16_shuffle2x128_sve2_calc_tables(1, &val, + &tbl_ln, &tbl_ls, &tbl_hn, &tbl_hs, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL + ); + + svuint8_t mask = svreinterpret_u8_u16(svdup_n_u16(0x1000)); + uint8_t* _src = (uint8_t*)src + len; + uint8_t* _dst = (uint8_t*)dst + len; + for(intptr_t ptr = -(intptr_t)len; ptr; ptr += svcntb()) { + svuint8_t data = svld1_u8(svptrue_b8(), _src+ptr);; + svuint8_t tmp1 = svbsl_n_u8(data, mask, 0xf); + svuint8_t tmp2 = svsri_n_u8(mask, data, 4); + data = sveor3_u8( + svtbl_u8(tbl_ln, tmp1), + svtbl_u8(tbl_hn, tmp2), + svreinterpret_u8_u16(svxar_n_u16( + svreinterpret_u16_u8(svtbl_u8(tbl_ls, tmp1)), + svreinterpret_u16_u8(svtbl_u8(tbl_hs, tmp2)), + 8 + )) + ); + svst1_u8(svptrue_b8(), _dst+ptr, data); + } +#else + UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(val); +#endif +} + void gf16_shuffle2x_muladd_128_sve2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t val, void *HEDLEY_RESTRICT mutScratch) { UNUSED(mutScratch); #if defined(__ARM_FEATURE_SVE2) diff --git a/gf16/gf16_shuffle2x_x86.h b/gf16/gf16_shuffle2x_x86.h index 718feed0..43bba321 100644 --- a/gf16/gf16_shuffle2x_x86.h +++ b/gf16/gf16_shuffle2x_x86.h @@ -4,7 +4,7 @@ #ifdef _AVAILABLE # include "gf16_checksum_x86.h" -static HEDLEY_ALWAYS_INLINE void _FN(gf16_shuffle2x_prepare_block)(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src) { +static 
HEDLEY_ALWAYS_INLINE void _FN(gf16_shuffle2x_prepare_block)(void* dst, const void* src) { _mword data = _MMI(loadu)((_mword*)src); data = separate_low_high(data); @@ -16,7 +16,7 @@ static HEDLEY_ALWAYS_INLINE void _FN(gf16_shuffle2x_prepare_block)(void *HEDLEY_ _MMI(store)((_mword*)dst, data); } -static HEDLEY_ALWAYS_INLINE void _FN(gf16_shuffle2x_prepare_blocku)(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t remaining) { +static HEDLEY_ALWAYS_INLINE void _FN(gf16_shuffle2x_prepare_blocku)(void* dst, const void* src, size_t remaining) { _mword data = partial_load(src, remaining); data = separate_low_high(data); @@ -91,7 +91,7 @@ static HEDLEY_ALWAYS_INLINE void _FN(gf16_shuffle2x_finish_copy_blocku)(void *HE } #endif -void _FN(gf16_shuffle2x_prepare)(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen) { +void _FN(gf16_shuffle2x_prepare)(void* dst, const void* src, size_t srcLen) { #ifdef _AVAILABLE gf16_prepare(dst, src, srcLen, sizeof(_mword), &_FN(gf16_shuffle2x_prepare_block), &_FN(gf16_shuffle2x_prepare_blocku)); _MM_END diff --git a/gf16/gf16_shuffle512_sve2.c b/gf16/gf16_shuffle512_sve2.c index b2a2682c..83881062 100644 --- a/gf16/gf16_shuffle512_sve2.c +++ b/gf16/gf16_shuffle512_sve2.c @@ -310,6 +310,30 @@ static HEDLEY_ALWAYS_INLINE void gf16_shuffle512_muladd_x_sve2( #endif /*defined(__ARM_FEATURE_SVE2)*/ +void gf16_shuffle_mul_512_sve2(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t val, void *HEDLEY_RESTRICT mutScratch) { + UNUSED(mutScratch); +#if defined(__ARM_FEATURE_SVE2) + svuint8_t tbl_l0, tbl_l1, tbl_l2, tbl_h0, tbl_h1, tbl_h2; + gf16_shuffle512_sve2_calc_tables(scratch, 1, &val, + &tbl_l0, &tbl_l1, &tbl_l2, &tbl_h0, &tbl_h1, &tbl_h2, + NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL + ); + + uint8_t* _src = (uint8_t*)src + len; + uint8_t* _dst = (uint8_t*)dst + len; + + svuint8_t rl, rh; + for(intptr_t 
ptr = -(intptr_t)len; ptr; ptr += svcntb()*2) { + gf16_shuffle512_sve2_round1(svld2_u8(svptrue_b8(), _src+ptr), &rl, &rh, tbl_l0, tbl_l1, tbl_l2, tbl_h0, tbl_h1, tbl_h2); + svst2_u8(svptrue_b8(), _dst+ptr, svcreate2_u8(rl, rh)); + } +#else + UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(val); +#endif +} + void gf16_shuffle_muladd_512_sve2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t val, void *HEDLEY_RESTRICT mutScratch) { UNUSED(mutScratch); #ifdef __ARM_FEATURE_SVE2 diff --git a/gf16/gf16_shuffle_neon.c b/gf16/gf16_shuffle_neon.c index c2ef7519..90b643a6 100644 --- a/gf16/gf16_shuffle_neon.c +++ b/gf16/gf16_shuffle_neon.c @@ -274,7 +274,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_shuffle_muladd_x_neon( -void gf16_shuffle_mul_neon(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t val, void *HEDLEY_RESTRICT mutScratch) { +void gf16_shuffle_mul_neon(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t val, void *HEDLEY_RESTRICT mutScratch) { UNUSED(mutScratch); #if defined(__ARM_NEON) qtbl_t tbl_h[4], tbl_l[4]; diff --git a/gf16/gf16_shuffle_ssse3.c b/gf16/gf16_shuffle_ssse3.c index ddd8247c..1c8daeec 100644 --- a/gf16/gf16_shuffle_ssse3.c +++ b/gf16/gf16_shuffle_ssse3.c @@ -32,8 +32,17 @@ static HEDLEY_ALWAYS_INLINE uint16_t gf16_shuffleX_replace_word(void* data, size return oldValue; } -uint16_t gf16_shuffle8_replace_word(void* data, size_t index, uint16_t newValue) { // only used for Affine2x - return gf16_shuffleX_replace_word(data, index, newValue, 8); +static HEDLEY_ALWAYS_INLINE uint16_t gf16_shuffle2X_replace_word(void* data, size_t index, uint16_t newValue, size_t width) { + uint8_t* base = (uint8_t*)data + (index & ~(width-1)) * 2; + unsigned pos = index & (width-1); + uint16_t oldValue = base[pos] | (base[pos + width] << 8); + base[pos] = newValue & 0xff; + base[pos + 
width] = newValue >> 8; + return oldValue; +} + +uint16_t gf16_affine2x_replace_word(void* data, size_t index, uint16_t newValue) { + return gf16_shuffle2X_replace_word(data, index, newValue, 8); } uint16_t gf16_shuffle16_replace_word(void* data, size_t index, uint16_t newValue) { return gf16_shuffleX_replace_word(data, index, newValue, 16); @@ -44,6 +53,12 @@ uint16_t gf16_shuffle32_replace_word(void* data, size_t index, uint16_t newValue uint16_t gf16_shuffle64_replace_word(void* data, size_t index, uint16_t newValue) { return gf16_shuffleX_replace_word(data, index, newValue, 64); } +uint16_t gf16_shuffle2x16_replace_word(void* data, size_t index, uint16_t newValue) { + return gf16_shuffle2X_replace_word(data, index, newValue, 16); +} +uint16_t gf16_shuffle2x32_replace_word(void* data, size_t index, uint16_t newValue) { + return gf16_shuffle2X_replace_word(data, index, newValue, 32); +} void* gf16_shuffle_init_x86(int polynomial) { diff --git a/gf16/gf16_shuffle_vbmi.c b/gf16/gf16_shuffle_vbmi.c index e2d1da4b..399d9e5f 100644 --- a/gf16/gf16_shuffle_vbmi.c +++ b/gf16/gf16_shuffle_vbmi.c @@ -382,7 +382,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_shuffle_muladd_x_vbmi( } #endif -void gf16_shuffle_mul_vbmi(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t val, void *HEDLEY_RESTRICT mutScratch) { +void gf16_shuffle_mul_vbmi(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t val, void *HEDLEY_RESTRICT mutScratch) { UNUSED(mutScratch); #if defined(__AVX512VBMI__) && defined(__AVX512VL__) __m512i lo0, lo1, lo2, hi0, hi1, hi2; diff --git a/gf16/gf16_shuffle_x86.h b/gf16/gf16_shuffle_x86.h index 1556e2fb..bd8fd9dc 100644 --- a/gf16/gf16_shuffle_x86.h +++ b/gf16/gf16_shuffle_x86.h @@ -10,7 +10,7 @@ int _FN(gf16_shuffle_available) = 1; int _FN(gf16_shuffle_available) = 0; #endif -void _FN(gf16_shuffle_prepare)(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, 
size_t srcLen) { +void _FN(gf16_shuffle_prepare)(void* dst, const void* src, size_t srcLen) { #ifdef _AVAILABLE gf16_prepare(dst, src, srcLen, sizeof(_mword)*2, &_FN(gf16_shuffle_prepare_block), &_FN(gf16_shuffle_prepare_blocku)); _MM_END diff --git a/gf16/gf16_shuffle_x86_prepare.h b/gf16/gf16_shuffle_x86_prepare.h index 9754ebc4..e29070fd 100644 --- a/gf16/gf16_shuffle_x86_prepare.h +++ b/gf16/gf16_shuffle_x86_prepare.h @@ -2,7 +2,7 @@ #include "gf16_shuffle_x86_common.h" #include "gf16_checksum_x86.h" -static HEDLEY_ALWAYS_INLINE void _FN(gf16_shuffle_prepare_block)(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src) { +static HEDLEY_ALWAYS_INLINE void _FN(gf16_shuffle_prepare_block)(void* dst, const void* src) { _mword ta = _MMI(loadu)((_mword*)src); _mword tb = _MMI(loadu)((_mword*)src + 1); @@ -17,7 +17,7 @@ static HEDLEY_ALWAYS_INLINE void _FN(gf16_shuffle_prepare_block)(void *HEDLEY_RE ); } // final block -static HEDLEY_ALWAYS_INLINE void _FN(gf16_shuffle_prepare_blocku)(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t remaining) { +static HEDLEY_ALWAYS_INLINE void _FN(gf16_shuffle_prepare_blocku)(void* dst, const void* src, size_t remaining) { _mword ta, tb; if(remaining & sizeof(_mword)) ta = _MMI(loadu)((_mword*)src); diff --git a/gf16/gf16_xor.h b/gf16/gf16_xor.h index bd0b50e6..54e246bd 100644 --- a/gf16/gf16_xor.h +++ b/gf16/gf16_xor.h @@ -4,7 +4,7 @@ #define FUNCS(v) \ void* gf16_xor_jit_init_##v(int polynomial, int jitOptStrat); \ void* gf16_xor_jit_init_mut_##v(); \ - void gf16_xor_prepare_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen); \ + void gf16_xor_prepare_##v(void* dst, const void* src, size_t srcLen); \ void gf16_xor_prepare_packed_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); \ void gf16_xor_prepare_packed_cksum_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT 
src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); \ void gf16_xor_prepare_partial_packsum_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen, size_t partOffset, size_t partLen); \ @@ -12,7 +12,7 @@ void gf16_xor_finish_packed_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen); \ int gf16_xor_finish_packed_cksum_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen); \ int gf16_xor_finish_partial_packsum_##v(void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen, size_t partOffset, size_t partLen); \ - void gf16_xor_jit_mul_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ + void gf16_xor_jit_mul_##v(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ void gf16_xor_jit_muladd_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ void gf16_xor_jit_muladd_prefetch_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch, const void *HEDLEY_RESTRICT prefetch); \ extern int gf16_xor_available_##v @@ -30,7 +30,7 @@ void gf16_xor_jit_uninit(void* scratch); // non-JIT version void* gf16_xor_init_sse2(int polynomial); -void gf16_xor_mul_sse2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t 
coefficient, void *HEDLEY_RESTRICT mutScratch); +void gf16_xor_mul_sse2(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); void gf16_xor_muladd_sse2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); @@ -40,3 +40,7 @@ void gf16_xor_muladd_sse2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_REST #define GF16_XOR_JIT_STRAT_COPY 2 #define GF16_XOR_JIT_STRAT_CLR 3 + +uint16_t gf16_xor16_replace_word(void* data, size_t index, uint16_t newValue); +uint16_t gf16_xor32_replace_word(void* data, size_t index, uint16_t newValue); +uint16_t gf16_xor64_replace_word(void* data, size_t index, uint16_t newValue); diff --git a/gf16/gf16_xor_avx2.c b/gf16/gf16_xor_avx2.c index f492f830..a81329bc 100644 --- a/gf16/gf16_xor_avx2.c +++ b/gf16/gf16_xor_avx2.c @@ -182,7 +182,7 @@ static inline int xor_write_avx_main_part(void* jitptr, uint8_t dep1, uint8_t de return xor256_jit_len[dep]; } -static inline void* xor_write_jit_avx(const struct gf16_xor_scratch *HEDLEY_RESTRICT scratch, uint8_t *HEDLEY_RESTRICT jitptr, uint16_t val, const int xor, const int prefetch) { +static inline void* xor_write_jit_avx(const struct gf16_xor_scratch *HEDLEY_RESTRICT scratch, uint8_t *HEDLEY_RESTRICT jitptr, uint16_t val, const int mode, const int prefetch) { uint_fast32_t bit; __m256i depmask = _mm256_load_si256((__m256i*)scratch->deps + (val & 0xf)*4); @@ -226,7 +226,7 @@ static inline void* xor_write_jit_avx(const struct gf16_xor_scratch *HEDLEY_REST tmp3 = _mm_xor_si128(tmp3, common_elim); tmp4 = _mm_xor_si128(tmp4, common_elim); - if(!xor) { + if(mode != XORDEP_JIT_MODE_MULADD) { lowest = ssse3_tzcnt_epi16(tmp3); _mm_store_si128((__m128i*)dep1_lowest, lowest); tmp3 = _mm_and_si128(tmp3, _mm_add_epi16(tmp3, _mm_set1_epi16(-1))); @@ -292,7 +292,7 @@ static inline void* xor_write_jit_avx(const struct 
gf16_xor_scratch *HEDLEY_REST #define _C_PXOR_R(rD, r2, r1, c) jitptr += _jit_vpxor_r(jitptr, rD, r2, r1) & -(c) /* generate code */ - if(xor) { + if(mode == XORDEP_JIT_MODE_MULADD) { for(bit=0; bit<8; bit++) { int destOffs = (bit<<6)-128; int destOffs2 = destOffs+32; @@ -385,29 +385,40 @@ static inline void* xor_write_jit_avx(const struct gf16_xor_scratch *HEDLEY_REST return jitptr+5; } -static HEDLEY_ALWAYS_INLINE void gf16_xor_jit_mul_avx2_base(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch, const int add, const int doPrefetch, const void *HEDLEY_RESTRICT prefetch) { +static HEDLEY_ALWAYS_INLINE void gf16_xor_jit_mul_avx2_base(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch, const int mode, const int doPrefetch, const void *HEDLEY_RESTRICT prefetch) { jit_wx_pair* jit = (jit_wx_pair*)mutScratch; - gf16_xorjit_write_jit(scratch, coefficient, jit, add, doPrefetch, &xor_write_jit_avx); - - gf16_xor256_jit_stub( - (intptr_t)src - 384, - (intptr_t)dst + len - 384, - (intptr_t)dst - 384, - (intptr_t)prefetch - 128, - jit->x - ); + gf16_xorjit_write_jit(scratch, coefficient, jit, mode, doPrefetch, &xor_write_jit_avx); + + if(mode == XORDEP_JIT_MODE_MUL_INSITU) { + ALIGN_TO(32, __m256i spill[3]); + gf16_xor256_jit_stub( + (intptr_t)spill + 128, + (intptr_t)dst + len - 384, + (intptr_t)dst - 384, + (intptr_t)prefetch - 128, + (uint8_t*)jit->x + XORDEP_JIT_SIZE/2 + ); + } else { + gf16_xor256_jit_stub( + (intptr_t)src - 384, + (intptr_t)dst + len - 384, + (intptr_t)dst - 384, + (intptr_t)prefetch - 128, + jit->x + ); + } _mm256_zeroupper(); } #endif /* defined(__AVX2__) && defined(PLATFORM_AMD64) */ -void gf16_xor_jit_mul_avx2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void 
*HEDLEY_RESTRICT mutScratch) { +void gf16_xor_jit_mul_avx2(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { #if defined(__AVX2__) && defined(PLATFORM_AMD64) if(coefficient == 0) { memset(dst, 0, len); return; } - gf16_xor_jit_mul_avx2_base(scratch, dst, src, len, coefficient, mutScratch, 0, 0, NULL); + gf16_xor_jit_mul_avx2_base(scratch, dst, src, len, coefficient, mutScratch, dst==src ? XORDEP_JIT_MODE_MUL_INSITU : XORDEP_JIT_MODE_MUL, 0, NULL); #else UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficient); UNUSED(mutScratch); #endif @@ -416,7 +427,7 @@ void gf16_xor_jit_mul_avx2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RES void gf16_xor_jit_muladd_avx2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { #if defined(__AVX2__) && defined(PLATFORM_AMD64) if(coefficient == 0) return; - gf16_xor_jit_mul_avx2_base(scratch, dst, src, len, coefficient, mutScratch, 1, 0, NULL); + gf16_xor_jit_mul_avx2_base(scratch, dst, src, len, coefficient, mutScratch, XORDEP_JIT_MODE_MULADD, 0, NULL); #else UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficient); UNUSED(mutScratch); #endif @@ -425,7 +436,7 @@ void gf16_xor_jit_muladd_avx2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_ void gf16_xor_jit_muladd_prefetch_avx2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch, const void *HEDLEY_RESTRICT prefetch) { #if defined(__AVX2__) && defined(PLATFORM_AMD64) if(coefficient == 0) return; - gf16_xor_jit_mul_avx2_base(scratch, dst, src, len, coefficient, mutScratch, 1, _MM_HINT_T1, prefetch); + gf16_xor_jit_mul_avx2_base(scratch, dst, src, len, coefficient, mutScratch, XORDEP_JIT_MODE_MULADD, _MM_HINT_T1, prefetch); #else 
UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficient); UNUSED(mutScratch); UNUSED(prefetch); #endif @@ -664,16 +675,28 @@ GF_FINISH_PACKED_FUNCS_STUB(gf16_xor, _avx2) #if defined(__AVX2__) && defined(PLATFORM_AMD64) -static size_t xor_write_init_jit(uint8_t *jitCode) { - uint8_t *jitCodeStart = jitCode; - jitCode += _jit_add_i(jitCode, AX, 512); - jitCode += _jit_add_i(jitCode, DX, 512); +static void xor_write_init_jit(uint8_t *jitCodeNorm, uint8_t *jitCodeInsitu, uint_fast8_t* sizeNorm, uint_fast8_t* sizeInsitu) { + uint8_t *jitCodeStart = jitCodeNorm; + jitCodeNorm += _jit_add_i(jitCodeNorm, AX, 512); + jitCodeNorm += _jit_add_i(jitCodeNorm, DX, 512); /* only 64-bit supported*/ for(int i=3; i<16; i++) { - jitCode += _jit_vmovdqa_load(jitCode, i, AX, lshift32(i-4, 5)); + jitCodeNorm += _jit_vmovdqa_load(jitCodeNorm, i, AX, lshift32(i-4, 5)); + } + if(sizeNorm) *sizeNorm = jitCodeNorm-jitCodeStart; + + + jitCodeStart = jitCodeInsitu; + jitCodeInsitu += _jit_add_i(jitCodeInsitu, DX, 512); + + for(int i=0; i<16; i++) { + jitCodeInsitu += _jit_vmovdqa_load(jitCodeInsitu, i, DX, lshift32(i-4, 5)); + } + for(int i=0; i<3; i++) { + jitCodeInsitu += _jit_vmovdqa_store(jitCodeInsitu, AX, lshift32(i-4, 5), i); } - return jitCode-jitCodeStart; + if(sizeInsitu) *sizeInsitu = jitCodeInsitu-jitCodeStart; } # include "gf16_bitdep_init_avx2.h" @@ -691,7 +714,7 @@ void* gf16_xor_jit_init_avx2(int polynomial, int jitOptStrat) { gf16_xor_create_jit_lut_avx2(); ret->jitOptStrat = jitOptStrat; - ret->codeStart = (uint_fast8_t)xor_write_init_jit(tmpCode); + xor_write_init_jit(tmpCode, tmpCode, &(ret->codeStart), &(ret->codeStartInsitu)); return ret; #else UNUSED(polynomial); UNUSED(jitOptStrat); @@ -703,7 +726,7 @@ void* gf16_xor_jit_init_mut_avx2() { #if defined(__AVX2__) && defined(PLATFORM_AMD64) jit_wx_pair *jitCode = jit_alloc(XORDEP_JIT_SIZE); if(!jitCode) return NULL; - xor_write_init_jit(jitCode->w); + xor_write_init_jit(jitCode->w, jitCode->w + 
XORDEP_JIT_SIZE/2, NULL, NULL); return jitCode; #else return NULL; diff --git a/gf16/gf16_xor_avx512.c b/gf16/gf16_xor_avx512.c index 487612bd..cce0625b 100644 --- a/gf16/gf16_xor_avx512.c +++ b/gf16/gf16_xor_avx512.c @@ -364,7 +364,7 @@ static HEDLEY_ALWAYS_INLINE int xor_avx512_merge_part(uint8_t *HEDLEY_RESTRICT j } -static inline void* xor_write_jit_avx512(const struct gf16_xor_scratch *HEDLEY_RESTRICT scratch, uint8_t *HEDLEY_RESTRICT jitptr, uint16_t val, const int xor, const int prefetch) { +static inline void* xor_write_jit_avx512(const struct gf16_xor_scratch *HEDLEY_RESTRICT scratch, uint8_t *HEDLEY_RESTRICT jitptr, uint16_t val, const int mode, const int prefetch) { uint_fast32_t bit; __m256i depmask = _mm256_load_si256((__m256i*)scratch->deps + (val & 0xf)*4); @@ -420,7 +420,7 @@ static inline void* xor_write_jit_avx512(const struct gf16_xor_scratch *HEDLEY_R jitptr += _jit_vmovdqa32_load(jitptr, 16, DX, 0); /* generate code */ - if(xor) { + if(mode == XORDEP_JIT_MODE_MULADD) { for(bit=0; bit<8; bit++) { int destOffs = bit<<7; int destOffs2 = destOffs+64; @@ -743,9 +743,9 @@ static void* xor_write_jit_avx512_multi(const struct gf16_xor_scratch *HEDLEY_RE return jitptr; } -static HEDLEY_ALWAYS_INLINE void gf16_xor_jit_mul_avx512_base(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch, const int add, const int doPrefetch, const void *HEDLEY_RESTRICT prefetch) { +static HEDLEY_ALWAYS_INLINE void gf16_xor_jit_mul_avx512_base(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch, const int mode, const int doPrefetch, const void *HEDLEY_RESTRICT prefetch) { jit_wx_pair* jit = (jit_wx_pair*)mutScratch; - gf16_xorjit_write_jit(scratch, coefficient, jit, add, doPrefetch, &xor_write_jit_avx512); + gf16_xorjit_write_jit(scratch, coefficient, jit, mode, doPrefetch, 
&xor_write_jit_avx512); gf16_xor512_jit_stub( (intptr_t)dst - 1024, @@ -760,13 +760,13 @@ static HEDLEY_ALWAYS_INLINE void gf16_xor_jit_mul_avx512_base(const void *HEDLEY #endif /* defined(__AVX512BW__) && defined(__AVX512VL__) && defined(PLATFORM_AMD64) */ -void gf16_xor_jit_mul_avx512(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { +void gf16_xor_jit_mul_avx512(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { #if defined(__AVX512BW__) && defined(__AVX512VL__) && defined(PLATFORM_AMD64) if(coefficient == 0) { memset(dst, 0, len); return; } - gf16_xor_jit_mul_avx512_base(scratch, dst, src, len, coefficient, mutScratch, 0, 0, NULL); + gf16_xor_jit_mul_avx512_base(scratch, dst, src, len, coefficient, mutScratch, XORDEP_JIT_MODE_MUL, 0, NULL); #else UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficient); UNUSED(mutScratch); #endif @@ -775,7 +775,7 @@ void gf16_xor_jit_mul_avx512(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_R void gf16_xor_jit_muladd_avx512(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { #if defined(__AVX512BW__) && defined(__AVX512VL__) && defined(PLATFORM_AMD64) if(coefficient == 0) return; - gf16_xor_jit_mul_avx512_base(scratch, dst, src, len, coefficient, mutScratch, 1, 0, NULL); + gf16_xor_jit_mul_avx512_base(scratch, dst, src, len, coefficient, mutScratch, XORDEP_JIT_MODE_MULADD, 0, NULL); #else UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficient); UNUSED(mutScratch); #endif @@ -784,7 +784,7 @@ void gf16_xor_jit_muladd_avx512(const void *HEDLEY_RESTRICT scratch, void *HEDLE void gf16_xor_jit_muladd_prefetch_avx512(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, 
const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch, const void *HEDLEY_RESTRICT prefetch) { #if defined(__AVX512BW__) && defined(__AVX512VL__) && defined(PLATFORM_AMD64) if(coefficient == 0) return; - gf16_xor_jit_mul_avx512_base(scratch, dst, src, len, coefficient, mutScratch, 1, _MM_HINT_T1, prefetch); + gf16_xor_jit_mul_avx512_base(scratch, dst, src, len, coefficient, mutScratch, XORDEP_JIT_MODE_MULADD, _MM_HINT_T1, prefetch); #else UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficient); UNUSED(mutScratch); UNUSED(prefetch); #endif diff --git a/gf16/gf16_xor_common.h b/gf16/gf16_xor_common.h index 730671a7..80877440 100644 --- a/gf16/gf16_xor_common.h +++ b/gf16/gf16_xor_common.h @@ -8,6 +8,9 @@ #define XORDEP_JIT_SIZE 4096 #define XORDEP_JIT_CODE_SIZE 1280 +#define XORDEP_JIT_MODE_MUL 0 +#define XORDEP_JIT_MODE_MULADD 1 +#define XORDEP_JIT_MODE_MUL_INSITU 2 /* we support MSVC and GCC style ASM */ #ifdef PLATFORM_AMD64 @@ -113,18 +116,26 @@ struct gf16_xor_scratch { uint8_t deps[16*16*2*4]; int jitOptStrat; // GF16_XOR_JIT_STRAT_* uint_fast8_t codeStart; + uint_fast8_t codeStartInsitu; }; #ifdef __SSE2__ typedef void*(*gf16_xorjit_write_func)(const struct gf16_xor_scratch *HEDLEY_RESTRICT scratch, uint8_t *HEDLEY_RESTRICT jitptr, uint16_t val, const int xor, const int prefetch); -static HEDLEY_ALWAYS_INLINE void gf16_xorjit_write_jit(const void *HEDLEY_RESTRICT scratch, uint16_t coefficient, jit_wx_pair* jit, const int add, const int prefetch, gf16_xorjit_write_func writeFunc) { +static HEDLEY_ALWAYS_INLINE void gf16_xorjit_write_jit(const void *HEDLEY_RESTRICT scratch, uint16_t coefficient, jit_wx_pair* jit, const int mode, const int prefetch, gf16_xorjit_write_func writeFunc) { const struct gf16_xor_scratch *HEDLEY_RESTRICT info = (const struct gf16_xor_scratch*)scratch; - uint8_t* jitptr = (uint8_t*)jit->w + info->codeStart; + uint8_t* jitWPtr = (uint8_t*)jit->w; + uint8_t* jitptr; + 
if(mode == XORDEP_JIT_MODE_MUL_INSITU) { + jitWPtr += XORDEP_JIT_SIZE/2; + jitptr = jitWPtr + info->codeStartInsitu; + } else { + jitptr = jitWPtr + info->codeStart; + } if(info->jitOptStrat == GF16_XOR_JIT_STRAT_COPYNT || info->jitOptStrat == GF16_XOR_JIT_STRAT_COPY) { ALIGN_TO(_GF16_XORJIT_COPY_ALIGN, uint8_t jitTemp[XORDEP_JIT_CODE_SIZE]); - uintptr_t copyOffset = info->codeStart; + uintptr_t copyOffset = (mode == XORDEP_JIT_MODE_MUL_INSITU) ? info->codeStartInsitu : info->codeStart; if((uintptr_t)jitptr & (_GF16_XORJIT_COPY_ALIGN-1)) { // copy unaligned part #if _GF16_XORJIT_COPY_ALIGN == 32 && defined(__AVX2__) @@ -138,13 +149,13 @@ static HEDLEY_ALWAYS_INLINE void gf16_xorjit_write_jit(const void *HEDLEY_RESTRI else jitptr = jitTemp; - jitptr = writeFunc(info, jitptr, coefficient, add, prefetch); + jitptr = writeFunc(info, jitptr, coefficient, mode, prefetch); write32(jitptr, (int32_t)(jitTemp - copyOffset - jitptr -4)); jitptr[4] = 0xC3; /* ret */ jitptr += 5; /* memcpy to destination */ - uint8_t* jitdst = (uint8_t*)jit->w + copyOffset; + uint8_t* jitdst = jitWPtr + copyOffset; if(info->jitOptStrat == GF16_XOR_JIT_STRAT_COPYNT) { // 256-bit NT copies never seem to be better, so just stick to 128-bit for(uint_fast32_t i=0; i<(uint_fast32_t)(jitptr-jitTemp); i+=64) { @@ -185,8 +196,8 @@ static HEDLEY_ALWAYS_INLINE void gf16_xorjit_write_jit(const void *HEDLEY_RESTRI for(int i=0; iw - jitptr -4)); + jitptr = writeFunc(info, jitptr, coefficient, mode, prefetch); + write32(jitptr, (int32_t)(jitWPtr - jitptr -4)); jitptr[4] = 0xC3; /* ret */ } #ifdef GF16_XORJIT_ENABLE_DUAL_MAPPING diff --git a/gf16/gf16_xor_common_funcs.h b/gf16/gf16_xor_common_funcs.h index 435e866e..c607da06 100644 --- a/gf16/gf16_xor_common_funcs.h +++ b/gf16/gf16_xor_common_funcs.h @@ -1,6 +1,7 @@ #include "../src/hedley.h" #include +#include /* type returned by *movemask* function */ #if MWORD_SIZE == 64 @@ -18,16 +19,16 @@ #ifdef _AVAILABLE # include "gf16_checksum_x86.h" -static 
HEDLEY_ALWAYS_INLINE void gf16_xor_prep_write(_mword ta, _mword tb, umask_t* _dst) { +static HEDLEY_ALWAYS_INLINE void gf16_xor_prep_split(_mword ta, _mword tb, _mword* tl, _mword* th) { /* split to high/low parts */ #if MWORD_SIZE == 64 // arrange to hlhl... _mword tmp1 = _mm512_shuffle_epi8(ta, _mm512_set4_epi32(0x0f0d0b09, 0x07050301, 0x0e0c0a08, 0x06040200)); _mword tmp2 = _mm512_shuffle_epi8(tb, _mm512_set4_epi32(0x0f0d0b09, 0x07050301, 0x0e0c0a08, 0x06040200)); - _mword th = _mm512_permutex2var_epi64(tmp1, _mm512_set_epi64( + *th = _mm512_permutex2var_epi64(tmp1, _mm512_set_epi64( 15, 13, 11, 9, 7, 5, 3, 1 ), tmp2); - _mword tl = _mm512_permutex2var_epi64(tmp1, _mm512_set_epi64( + *tl = _mm512_permutex2var_epi64(tmp1, _mm512_set_epi64( 14, 12, 10, 8, 6, 4, 2, 0 ), tmp2); #elif MWORD_SIZE == 32 @@ -41,48 +42,83 @@ static HEDLEY_ALWAYS_INLINE void gf16_xor_prep_write(_mword ta, _mword tb, umask 0x0e0c0a08, 0x06040200, 0x0f0d0b09, 0x07050301, 0x0e0c0a08, 0x06040200, 0x0f0d0b09, 0x07050301 )); - _mword th = _mm256_blend_epi32(tmp1, tmp2, 0x33); - _mword tl = _mm256_blend_epi32(tmp2, tmp1, 0x33); - tl = _mm256_permute4x64_epi64(tl, _MM_SHUFFLE(3,1,2,0)); - th = _mm256_permute4x64_epi64(th, _MM_SHUFFLE(2,0,3,1)); + *th = _mm256_blend_epi32(tmp1, tmp2, 0x33); + *tl = _mm256_blend_epi32(tmp2, tmp1, 0x33); + *tl = _mm256_permute4x64_epi64(*tl, _MM_SHUFFLE(3,1,2,0)); + *th = _mm256_permute4x64_epi64(*th, _MM_SHUFFLE(2,0,3,1)); #else - _mword th = _mm_packus_epi16( + *th = _mm_packus_epi16( _mm_srli_epi16(tb, 8), _mm_srli_epi16(ta, 8) ); - _mword tl = _mm_packus_epi16( + *tl = _mm_packus_epi16( _mm_and_si128(tb, _mm_set1_epi16(0xff)), _mm_and_si128(ta, _mm_set1_epi16(0xff)) ); #endif - - /* save to dest by extracting masks */ - _dst[0] = MOVMASK(th); - for(int i=1; i<8; i++) { - th = _MM(add_epi8)(th, th); - _dst[i*8] = MOVMASK(th); - } - _dst[64] = MOVMASK(tl); +} +static HEDLEY_ALWAYS_INLINE void gf16_xor_prep_write(umask_t* _dst, _mword bytes) { + _dst[0] = 
MOVMASK(bytes); for(int i=1; i<8; i++) { - tl = _MM(add_epi8)(tl, tl); - _dst[64+i*8] = MOVMASK(tl); + bytes = _MM(add_epi8)(bytes, bytes); + _dst[i*8] = MOVMASK(bytes); } } static HEDLEY_ALWAYS_INLINE void _FN(gf16_xor_prepare_block)(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src) { uint8_t* _src = (uint8_t*)src; umask_t* _dst = (umask_t*)dst; + _mword tl, th; for(int j=0; j<8; j++) { - gf16_xor_prep_write( - _MMI(loadu)((_mword*)_src), - _MMI(loadu)((_mword*)_src + 1), - _dst - ); + gf16_xor_prep_split(_MMI(loadu)((_mword*)_src), _MMI(loadu)((_mword*)_src + 1), &tl, &th); + + /* save to dest by extracting masks */ + gf16_xor_prep_write(_dst, th); + gf16_xor_prep_write(_dst+64, tl); + _src += sizeof(_mword)*2; _dst++; } } -static HEDLEY_ALWAYS_INLINE void _FN(gf16_xor_prepare_blocku)(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t remaining) { +static HEDLEY_ALWAYS_INLINE void _FN(gf16_xor_prepare_block_insitu)(void* dst, const void* src) { + assert(dst == src); + _mword* _src = (_mword*)src; + umask_t* _dst = (umask_t*)dst; + + _mword tl0, tl1, tl2, tl3, tl4, tl5, tl6, tl7; + _mword th0, th1, th2, th3, th4, th5, th6, th7; + + // load 8 registers (need to load the first half of the block) + gf16_xor_prep_split(_MMI(loadu)(_src + 0), _MMI(loadu)(_src + 1), &tl0, &th0); + gf16_xor_prep_split(_MMI(loadu)(_src + 2), _MMI(loadu)(_src + 3), &tl1, &th1); + gf16_xor_prep_split(_MMI(loadu)(_src + 4), _MMI(loadu)(_src + 5), &tl2, &th2); + gf16_xor_prep_split(_MMI(loadu)(_src + 6), _MMI(loadu)(_src + 7), &tl3, &th3); + + // free up 4 of them (th* can now be freely written) + gf16_xor_prep_write(_dst+0, th0); + gf16_xor_prep_write(_dst+1, th1); + gf16_xor_prep_write(_dst+2, th2); + gf16_xor_prep_write(_dst+3, th3); + + gf16_xor_prep_split(_MMI(loadu)(_src + 8), _MMI(loadu)(_src + 9), &tl4, &th4); + gf16_xor_prep_write(_dst+4, th4); + gf16_xor_prep_split(_MMI(loadu)(_src + 10), _MMI(loadu)(_src + 11), &tl5, &th5); + gf16_xor_prep_write(_dst+5, 
th5); + gf16_xor_prep_split(_MMI(loadu)(_src + 12), _MMI(loadu)(_src + 13), &tl6, &th6); + gf16_xor_prep_write(_dst+6, th6); + gf16_xor_prep_split(_MMI(loadu)(_src + 14), _MMI(loadu)(_src + 15), &tl7, &th7); + gf16_xor_prep_write(_dst+7, th7); + + gf16_xor_prep_write(_dst+64, tl0); + gf16_xor_prep_write(_dst+65, tl1); + gf16_xor_prep_write(_dst+66, tl2); + gf16_xor_prep_write(_dst+67, tl3); + gf16_xor_prep_write(_dst+68, tl4); + gf16_xor_prep_write(_dst+69, tl5); + gf16_xor_prep_write(_dst+70, tl6); + gf16_xor_prep_write(_dst+71, tl7); +} +static HEDLEY_ALWAYS_INLINE void _FN(gf16_xor_prepare_blocku)(void* dst, const void* src, size_t remaining) { // handle unaligned area with a simple copy and repeat uint8_t tmp[MWORD_SIZE*16] = {0}; memcpy(tmp, src, remaining); @@ -92,9 +128,14 @@ static HEDLEY_ALWAYS_INLINE void _FN(gf16_xor_prepare_blocku)(void *HEDLEY_RESTR -void _FN(gf16_xor_prepare)(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen) { +void _FN(gf16_xor_prepare)(void* dst, const void* src, size_t srcLen) { #ifdef _AVAILABLE - gf16_prepare(dst, src, srcLen, sizeof(_mword)*16, &_FN(gf16_xor_prepare_block), &_FN(gf16_xor_prepare_blocku)); + if(dst == src) { + // prepare_blocku is unused for in-situ prepare + assert(srcLen % (sizeof(_mword)*16) == 0); + gf16_prepare(dst, src, srcLen, sizeof(_mword)*16, &_FN(gf16_xor_prepare_block_insitu), &_FN(gf16_xor_prepare_blocku)); + } else + gf16_prepare(dst, src, srcLen, sizeof(_mword)*16, &_FN(gf16_xor_prepare_block), &_FN(gf16_xor_prepare_blocku)); _MM_END #else UNUSED(dst); UNUSED(src); UNUSED(srcLen); diff --git a/gf16/gf16_xor_sse2.c b/gf16/gf16_xor_sse2.c index ff4117c0..e4f27287 100644 --- a/gf16/gf16_xor_sse2.c +++ b/gf16/gf16_xor_sse2.c @@ -315,7 +315,7 @@ static HEDLEY_ALWAYS_INLINE uint_fast16_t xor_jit_bitpair3_nc_noxor(uint8_t* des -static inline void* xor_write_jit_sse(const struct gf16_xor_scratch *HEDLEY_RESTRICT scratch, uint8_t *HEDLEY_RESTRICT jitptr, uint16_t val, const int xor, 
const int prefetch) { +static inline void* xor_write_jit_sse(const struct gf16_xor_scratch *HEDLEY_RESTRICT scratch, uint8_t *HEDLEY_RESTRICT jitptr, uint16_t val, const int mode, const int prefetch) { uint_fast32_t bit; ALIGN_TO(16, uint32_t lumask[8]); @@ -495,7 +495,7 @@ static inline void* xor_write_jit_sse(const struct gf16_xor_scratch *HEDLEY_REST jitptr += ((c)<<2)+(c) /* generate code */ - if(xor) { + if(mode == XORDEP_JIT_MODE_MULADD) { for(bit=0; bit<8; bit++) { int destOffs = (bit<<5)-128; int destOffs2 = destOffs+16; @@ -669,30 +669,46 @@ static inline void* xor_write_jit_sse(const struct gf16_xor_scratch *HEDLEY_REST return jitptr; } -static HEDLEY_ALWAYS_INLINE void gf16_xor_jit_mul_sse2_base(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch, const int add, const int doPrefetch, const void *HEDLEY_RESTRICT prefetch) { +static HEDLEY_ALWAYS_INLINE void gf16_xor_jit_mul_sse2_base(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch, const int mode, const int doPrefetch, const void *HEDLEY_RESTRICT prefetch) { jit_wx_pair* jit = (jit_wx_pair*)mutScratch; - gf16_xorjit_write_jit(scratch, coefficient, jit, add, doPrefetch, &xor_write_jit_sse); + gf16_xorjit_write_jit(scratch, coefficient, jit, mode, doPrefetch, &xor_write_jit_sse); // exec /* adding 128 to the destination pointer allows the register offset to be coded in 1 byte * eg: 'movdqa xmm0, [rdx+0x90]' is 8 bytes, whilst 'movdqa xmm0, [rdx-0x60]' is 5 bytes */ - gf16_xor_jit_stub( - (intptr_t)src - 128, - (intptr_t)dst + len - 128, - (intptr_t)dst - 128, - (intptr_t)prefetch - 128, - jit->x - ); + if(mode == XORDEP_JIT_MODE_MUL_INSITU) { + // need a place to store a copy of the source, that won't fit in registers; these will be used as the memory source +#ifdef PLATFORM_AMD64 + ALIGN_TO(16, __m128i spill[3]); 
+#else + ALIGN_TO(16, __m128i spill[11]); +#endif + gf16_xor_jit_stub( + (intptr_t)spill + 128, + (intptr_t)dst + len - 128, + (intptr_t)dst - 128, + (intptr_t)prefetch - 128, + (uint8_t*)jit->x + XORDEP_JIT_SIZE/2 + ); + } else { + gf16_xor_jit_stub( + (intptr_t)src - 128, + (intptr_t)dst + len - 128, + (intptr_t)dst - 128, + (intptr_t)prefetch - 128, + jit->x + ); + } } #endif /* defined(__SSE2__) */ -void gf16_xor_jit_mul_sse2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { +void gf16_xor_jit_mul_sse2(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { #ifdef __SSE2__ if(coefficient == 0) { memset(dst, 0, len); return; } - gf16_xor_jit_mul_sse2_base(scratch, dst, src, len, coefficient, mutScratch, 0, 0, NULL); + gf16_xor_jit_mul_sse2_base(scratch, dst, src, len, coefficient, mutScratch, dst==src ? 
XORDEP_JIT_MODE_MUL_INSITU : XORDEP_JIT_MODE_MUL, 0, NULL); #else UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficient); UNUSED(mutScratch); #endif @@ -701,7 +717,7 @@ void gf16_xor_jit_mul_sse2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RES void gf16_xor_jit_muladd_sse2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { #ifdef __SSE2__ if(coefficient == 0) return; - gf16_xor_jit_mul_sse2_base(scratch, dst, src, len, coefficient, mutScratch, 1, 0, NULL); + gf16_xor_jit_mul_sse2_base(scratch, dst, src, len, coefficient, mutScratch, XORDEP_JIT_MODE_MULADD, 0, NULL); #else UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficient); UNUSED(mutScratch); #endif @@ -710,7 +726,7 @@ void gf16_xor_jit_muladd_sse2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_ void gf16_xor_jit_muladd_prefetch_sse2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch, const void *HEDLEY_RESTRICT prefetch) { #ifdef __SSE2__ if(coefficient == 0) return; - gf16_xor_jit_mul_sse2_base(scratch, dst, src, len, coefficient, mutScratch, 1, _MM_HINT_T1, prefetch); + gf16_xor_jit_mul_sse2_base(scratch, dst, src, len, coefficient, mutScratch, XORDEP_JIT_MODE_MULADD, _MM_HINT_T1, prefetch); #else UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficient); UNUSED(mutScratch); UNUSED(prefetch); #endif @@ -818,9 +834,53 @@ static HEDLEY_ALWAYS_INLINE void gf16_xor_write_deptable(intptr_t *HEDLEY_RESTRI */ } } + +static HEDLEY_ALWAYS_INLINE void gf16_xor_mul_block_sse2(const uint8_t* inP, uint8_t* outP, uint_fast32_t counts[16], intptr_t deptable[256]) { + /* Note that we assume that all counts are at least 1; I don't think it's possible for that to be false */ + #define STEP(bit, type, typev, typed) { \ + intptr_t* 
deps = deptable + bit*16; \ + typev tmp = _mm_load_ ## type((typed*)(inP + deps[ 0])); \ + HEDLEY_ASSUME(counts[bit] <= 15); \ + switch(counts[bit]) { \ + case 15: tmp = _mm_xor_ ## type(tmp, *(typev*)(inP + deps[15])); /* FALLTHRU */ \ + case 14: tmp = _mm_xor_ ## type(tmp, *(typev*)(inP + deps[14])); /* FALLTHRU */ \ + case 13: tmp = _mm_xor_ ## type(tmp, *(typev*)(inP + deps[13])); /* FALLTHRU */ \ + case 12: tmp = _mm_xor_ ## type(tmp, *(typev*)(inP + deps[12])); /* FALLTHRU */ \ + case 11: tmp = _mm_xor_ ## type(tmp, *(typev*)(inP + deps[11])); /* FALLTHRU */ \ + case 10: tmp = _mm_xor_ ## type(tmp, *(typev*)(inP + deps[10])); /* FALLTHRU */ \ + case 9: tmp = _mm_xor_ ## type(tmp, *(typev*)(inP + deps[ 9])); /* FALLTHRU */ \ + case 8: tmp = _mm_xor_ ## type(tmp, *(typev*)(inP + deps[ 8])); /* FALLTHRU */ \ + case 7: tmp = _mm_xor_ ## type(tmp, *(typev*)(inP + deps[ 7])); /* FALLTHRU */ \ + case 6: tmp = _mm_xor_ ## type(tmp, *(typev*)(inP + deps[ 6])); /* FALLTHRU */ \ + case 5: tmp = _mm_xor_ ## type(tmp, *(typev*)(inP + deps[ 5])); /* FALLTHRU */ \ + case 4: tmp = _mm_xor_ ## type(tmp, *(typev*)(inP + deps[ 4])); /* FALLTHRU */ \ + case 3: tmp = _mm_xor_ ## type(tmp, *(typev*)(inP + deps[ 3])); /* FALLTHRU */ \ + case 2: tmp = _mm_xor_ ## type(tmp, *(typev*)(inP + deps[ 2])); /* FALLTHRU */ \ + case 1: tmp = _mm_xor_ ## type(tmp, *(typev*)(inP + deps[ 1])); /* FALLTHRU */ \ + } \ + _mm_store_ ## type((typed*)outP + bit, tmp); \ + } + STEP( 0, si128, __m128i, __m128i) + STEP( 1, si128, __m128i, __m128i) + STEP( 2, si128, __m128i, __m128i) + STEP( 3, si128, __m128i, __m128i) + STEP( 4, si128, __m128i, __m128i) + STEP( 5, si128, __m128i, __m128i) + STEP( 6, si128, __m128i, __m128i) + STEP( 7, si128, __m128i, __m128i) + STEP( 8, si128, __m128i, __m128i) + STEP( 9, si128, __m128i, __m128i) + STEP(10, si128, __m128i, __m128i) + STEP(11, si128, __m128i, __m128i) + STEP(12, si128, __m128i, __m128i) + STEP(13, si128, __m128i, __m128i) + STEP(14, si128, __m128i, 
__m128i) + STEP(15, si128, __m128i, __m128i) + #undef STEP +} #endif -void gf16_xor_mul_sse2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t val, void *HEDLEY_RESTRICT mutScratch) { +void gf16_xor_mul_sse2(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t val, void *HEDLEY_RESTRICT mutScratch) { UNUSED(mutScratch); #ifdef __SSE2__ if(val == 0) { @@ -831,51 +891,24 @@ void gf16_xor_mul_sse2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRIC ALIGN_TO(16, intptr_t deptable[256]); uint8_t* _dst = (uint8_t*)dst + len; - gf16_xor_write_deptable(deptable, counts, (uint8_t*)scratch, val, (uintptr_t)src - (uintptr_t)dst); - - for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(__m128i)*16) { - uint8_t* p = _dst + ptr; - /* Note that we assume that all counts are at least 1; I don't think it's possible for that to be false */ - #define STEP(bit, type, typev, typed) { \ - intptr_t* deps = deptable + bit*16; \ - typev tmp = _mm_load_ ## type((typed*)(p + deps[ 0])); \ - HEDLEY_ASSUME(counts[bit] <= 15); \ - switch(counts[bit]) { \ - case 15: tmp = _mm_xor_ ## type(tmp, *(typev*)(p + deps[15])); /* FALLTHRU */ \ - case 14: tmp = _mm_xor_ ## type(tmp, *(typev*)(p + deps[14])); /* FALLTHRU */ \ - case 13: tmp = _mm_xor_ ## type(tmp, *(typev*)(p + deps[13])); /* FALLTHRU */ \ - case 12: tmp = _mm_xor_ ## type(tmp, *(typev*)(p + deps[12])); /* FALLTHRU */ \ - case 11: tmp = _mm_xor_ ## type(tmp, *(typev*)(p + deps[11])); /* FALLTHRU */ \ - case 10: tmp = _mm_xor_ ## type(tmp, *(typev*)(p + deps[10])); /* FALLTHRU */ \ - case 9: tmp = _mm_xor_ ## type(tmp, *(typev*)(p + deps[ 9])); /* FALLTHRU */ \ - case 8: tmp = _mm_xor_ ## type(tmp, *(typev*)(p + deps[ 8])); /* FALLTHRU */ \ - case 7: tmp = _mm_xor_ ## type(tmp, *(typev*)(p + deps[ 7])); /* FALLTHRU */ \ - case 6: tmp = _mm_xor_ ## type(tmp, *(typev*)(p + deps[ 6])); /* FALLTHRU */ \ - case 5: tmp = _mm_xor_ ## 
type(tmp, *(typev*)(p + deps[ 5])); /* FALLTHRU */ \ - case 4: tmp = _mm_xor_ ## type(tmp, *(typev*)(p + deps[ 4])); /* FALLTHRU */ \ - case 3: tmp = _mm_xor_ ## type(tmp, *(typev*)(p + deps[ 3])); /* FALLTHRU */ \ - case 2: tmp = _mm_xor_ ## type(tmp, *(typev*)(p + deps[ 2])); /* FALLTHRU */ \ - case 1: tmp = _mm_xor_ ## type(tmp, *(typev*)(p + deps[ 1])); /* FALLTHRU */ \ - } \ - _mm_store_ ## type((typed*)p + bit, tmp); \ + if(dst == src) { + // for in-situ mul, write to a temp block and copy back + ALIGN_TO(16, uint8_t tmp[256]); + __m128i* _tmp = (__m128i*)tmp; + gf16_xor_write_deptable(deptable, counts, (uint8_t*)scratch, val, 0); + for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(__m128i)*16) { + uint8_t* p = _dst + ptr; + gf16_xor_mul_block_sse2(p, tmp, counts, deptable); + for(int i=0; i<16; i++) { + _mm_store_si128((__m128i*)p + i, _mm_load_si128(_tmp+i)); + } + } + } else { + gf16_xor_write_deptable(deptable, counts, (uint8_t*)scratch, val, (uintptr_t)src - (uintptr_t)dst); + + for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(__m128i)*16) { + gf16_xor_mul_block_sse2(_dst + ptr, _dst + ptr, counts, deptable); } - STEP( 0, si128, __m128i, __m128i) - STEP( 1, si128, __m128i, __m128i) - STEP( 2, si128, __m128i, __m128i) - STEP( 3, si128, __m128i, __m128i) - STEP( 4, si128, __m128i, __m128i) - STEP( 5, si128, __m128i, __m128i) - STEP( 6, si128, __m128i, __m128i) - STEP( 7, si128, __m128i, __m128i) - STEP( 8, si128, __m128i, __m128i) - STEP( 9, si128, __m128i, __m128i) - STEP(10, si128, __m128i, __m128i) - STEP(11, si128, __m128i, __m128i) - STEP(12, si128, __m128i, __m128i) - STEP(13, si128, __m128i, __m128i) - STEP(14, si128, __m128i, __m128i) - STEP(15, si128, __m128i, __m128i) - #undef STEP } #else UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(val); @@ -1164,23 +1197,47 @@ GF_FINISH_PACKED_FUNCS_STUB(gf16_xor, _sse2) #include "gf16_bitdep_init_sse2.h" #ifdef PLATFORM_X86 -static size_t xor_write_init_jit(uint8_t *jitCode) { - 
uint8_t *jitCodeStart = jitCode; - jitCode += _jit_add_i(jitCode, AX, 256); - jitCode += _jit_add_i(jitCode, DX, 256); +static void xor_write_init_jit(uint8_t *jitCodeNorm, uint8_t *jitCodeInsitu, uint_fast8_t* sizeNorm, uint_fast8_t* sizeInsitu) { + uint8_t *jitCodeStart = jitCodeNorm; + jitCodeNorm += _jit_add_i(jitCodeNorm, AX, 256); + jitCodeNorm += _jit_add_i(jitCodeNorm, DX, 256); # ifdef PLATFORM_AMD64 /* preload upper 13 inputs into registers */ for(int i=3; i<16; i++) { - jitCode += _jit_movaps_load(jitCode, i, AX, lshift32(i-8, 4)); + jitCodeNorm += _jit_movaps_load(jitCodeNorm, i, AX, lshift32(i-8, 4)); } # else /* can only fit 5 in 32-bit mode :( */ for(int i=3; i<8; i++) { /* despite appearances, we're actually loading the top 5, not mid 5 */ - jitCode += _jit_movaps_load(jitCode, i, AX, i<<4); + jitCodeNorm += _jit_movaps_load(jitCodeNorm, i, AX, i<<4); } # endif - return jitCode-jitCodeStart; + + if(sizeNorm) *sizeNorm = jitCodeNorm-jitCodeStart; + + // in-situ version + jitCodeStart = jitCodeInsitu; + jitCodeInsitu += _jit_add_i(jitCodeInsitu, DX, 256); + +# ifdef PLATFORM_AMD64 + for(int i=0; i<16; i++) { + jitCodeInsitu += _jit_movaps_load(jitCodeInsitu, i, DX, lshift32(i-8, 4)); + } + for(int i=0; i<3; i++) { + jitCodeInsitu += _jit_movaps_store(jitCodeInsitu, AX, lshift32(i-8, 4), i); + } +# else + for(int i=0; i<11; i++) { + jitCodeInsitu += _jit_movaps_load(jitCodeInsitu, 0, DX, lshift32(i-8, 4)); + jitCodeInsitu += _jit_movaps_store(jitCodeInsitu, AX, lshift32(i-8, 4), 0); + } + for(int i=3; i<8; i++) { /* despite appearances, we're actually loading the top 5, not mid 5 */ + jitCodeInsitu += _jit_movaps_load(jitCodeInsitu, i, DX, i<<4); + } +# endif + + if(sizeInsitu) *sizeInsitu = jitCodeInsitu-jitCodeStart; } #endif @@ -1195,7 +1252,7 @@ void* gf16_xor_jit_init_sse2(int polynomial, int jitOptStrat) { gf16_xor_create_jit_lut_sse2(); ret->jitOptStrat = jitOptStrat; - ret->codeStart = (uint_fast8_t)xor_write_init_jit(tmpCode); + 
xor_write_init_jit(tmpCode, tmpCode, &(ret->codeStart), &(ret->codeStartInsitu)); return ret; #else UNUSED(polynomial); UNUSED(jitOptStrat); @@ -1207,7 +1264,7 @@ void* gf16_xor_jit_init_mut_sse2() { #ifdef PLATFORM_X86 jit_wx_pair *jitCode = jit_alloc(XORDEP_JIT_SIZE); if(!jitCode) return NULL; - xor_write_init_jit(jitCode->w); + xor_write_init_jit(jitCode->w, jitCode->w + XORDEP_JIT_SIZE/2, NULL, NULL); return jitCode; #else return NULL; @@ -1222,6 +1279,36 @@ void gf16_xor_jit_uninit(void* scratch) { #endif } +static HEDLEY_ALWAYS_INLINE uint16_t gf16_xorX_replace_word(void* data, size_t index, uint16_t newValue, size_t width, unsigned byteFlip) { + uint8_t* base = (uint8_t*)data + (index & ~(width*8-1)) * 2; // advance pointer to correct group + base += ((index >> 3) & (width-1)) ^ byteFlip; // advance to correct byte + // TODO: remove byteFlip parameter + + unsigned bitIndex = index&7; + uint16_t oldValue = 0; + unsigned _newValue = newValue << bitIndex; + uint8_t byteMask = 1 << bitIndex; + for(int i=0; i<16; i++) { + uint8_t byte = base[i*width]; + oldValue <<= 1; + _newValue <<= 1; + oldValue |= (byte >> bitIndex) & 1; + + base[i*width] = (byte & ~byteMask) | ((_newValue >> 16) & byteMask); + } + return oldValue; +} +uint16_t gf16_xor16_replace_word(void* data, size_t index, uint16_t newValue) { + return gf16_xorX_replace_word(data, index, newValue, 16, 1); +} +uint16_t gf16_xor32_replace_word(void* data, size_t index, uint16_t newValue) { + return gf16_xorX_replace_word(data, index, newValue, 32, 0); +} +uint16_t gf16_xor64_replace_word(void* data, size_t index, uint16_t newValue) { + return gf16_xorX_replace_word(data, index, newValue, 64, 0); +} + + void* gf16_xor_init_sse2(int polynomial) { #ifdef __SSE2__ void* ret; diff --git a/gf16/gf16mul.cpp b/gf16/gf16mul.cpp index 917de3e0..690d4a88 100644 --- a/gf16/gf16mul.cpp +++ b/gf16/gf16mul.cpp @@ -636,7 +636,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = 
&gf16_shuffle2x_finish_partial_packsum_avx512; copy_cksum = &gf16_cksum_copy_avx512; copy_cksum_check = &gf16_cksum_copy_check_avx512; - replace_word = &gf16_shuffle32_replace_word; + replace_word = &gf16_shuffle2x32_replace_word; break; case GF16_SHUFFLE2X_AVX2: scratch = gf16_shuffle_init_x86(GF16_POLYNOMIAL); @@ -664,7 +664,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_shuffle2x_finish_partial_packsum_avx2; copy_cksum = &gf16_cksum_copy_avx2; copy_cksum_check = &gf16_cksum_copy_check_avx2; - replace_word = &gf16_shuffle16_replace_word; + replace_word = &gf16_shuffle2x16_replace_word; break; case GF16_SHUFFLE_NEON: @@ -697,6 +697,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { int available = gf16_clmul_init_arm(GF16_POLYNOMIAL); METHOD_REQUIRES(gf16_available_neon && available) + _mul = &gf16_clmul_mul_neon; _mul_add = &gf16_clmul_muladd_neon; _mul_add_multi = &gf16_clmul_muladd_multi_neon; _mul_add_multi_packed = &gf16_clmul_muladd_multi_packed_neon; @@ -762,6 +763,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { case GF16_SHUFFLE2X_128_SVE2: METHOD_REQUIRES(gf16_available_sve2 && gf16_sve_get_size() >= 32) + _mul = &gf16_shuffle2x_mul_128_sve2; _mul_add = &gf16_shuffle2x_muladd_128_sve2; _mul_add_multi = &gf16_shuffle2x_muladd_multi_128_sve2; _mul_add_multi_packed = &gf16_shuffle2x_muladd_multi_packed_128_sve2; @@ -785,6 +787,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { scratch = gf16_shuffle_init_512_sve(GF16_POLYNOMIAL); + _mul = &gf16_shuffle_mul_512_sve2; _mul_add = &gf16_shuffle_muladd_512_sve2; _mul_add_multi = &gf16_shuffle_muladd_multi_512_sve2; _mul_add_multi_packed = &gf16_shuffle_muladd_multi_packed_512_sve2; @@ -805,6 +808,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { case GF16_CLMUL_SVE2: METHOD_REQUIRES(gf16_available_sve2) + _mul = &gf16_clmul_mul_sve2; _mul_add = &gf16_clmul_muladd_sve2; _mul_add_multi = &gf16_clmul_muladd_multi_sve2; 
_mul_add_multi_packed = &gf16_clmul_muladd_multi_packed_sve2; @@ -915,6 +919,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { case GF16_AFFINE2X_AVX512: scratch = gf16_affine_init_avx512(GF16_POLYNOMIAL); METHOD_REQUIRES(gf16_affine_available_avx512 && gf16_shuffle_available_avx512) + _mul = &gf16_affine2x_mul_avx512; _mul_add = &gf16_affine2x_muladd_avx512; _mul_add_multi = &gf16_affine2x_muladd_multi_avx512; _mul_add_multi_packed = &gf16_affine2x_muladd_multi_packed_avx512; @@ -937,12 +942,13 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_affine2x_finish_partial_packsum_avx512; copy_cksum = &gf16_cksum_copy_avx512; copy_cksum_check = &gf16_cksum_copy_check_avx512; - replace_word = &gf16_shuffle32_replace_word; + replace_word = &gf16_affine2x_replace_word; break; case GF16_AFFINE2X_AVX2: scratch = gf16_affine_init_avx2(GF16_POLYNOMIAL); METHOD_REQUIRES(gf16_affine_available_avx2 && gf16_shuffle_available_avx2) + _mul = &gf16_affine2x_mul_avx2; _mul_add = &gf16_affine2x_muladd_avx2; _mul_add_multi = &gf16_affine2x_muladd_multi_avx2; _mul_add_multi_packed = &gf16_affine2x_muladd_multi_packed_avx2; @@ -965,12 +971,13 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_affine2x_finish_partial_packsum_avx2; copy_cksum = &gf16_cksum_copy_avx2; copy_cksum_check = &gf16_cksum_copy_check_avx2; - replace_word = &gf16_shuffle16_replace_word; + replace_word = &gf16_affine2x_replace_word; break; case GF16_AFFINE2X_GFNI: scratch = gf16_affine_init_gfni(GF16_POLYNOMIAL); METHOD_REQUIRES(gf16_affine_available_gfni && gf16_shuffle_available_ssse3) + _mul = &gf16_affine2x_mul_gfni; _mul_add = &gf16_affine2x_muladd_gfni; _mul_add_multi = &gf16_affine2x_muladd_multi_gfni; _mul_add_multi_packed = &gf16_affine2x_muladd_multi_packed_gfni; @@ -993,7 +1000,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_affine2x_finish_partial_packsum_gfni; 
copy_cksum = &gf16_cksum_copy_sse2; copy_cksum_check = &gf16_cksum_copy_check_sse2; - replace_word = &gf16_shuffle8_replace_word; + replace_word = &gf16_affine2x_replace_word; break; case GF16_XOR_JIT_AVX512: @@ -1031,7 +1038,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_xor_finish_partial_packsum_sse2; copy_cksum = &gf16_cksum_copy_sse2; copy_cksum_check = &gf16_cksum_copy_check_sse2; - replace_word = NULL; + replace_word = gf16_xor16_replace_word; break; /* case GF16_XOR_JIT_AVX: @@ -1053,7 +1060,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_xor_finish_partial_packsum_avx; copy_cksum = &gf16_cksum_copy_sse2; copy_cksum_check = &gf16_cksum_copy_check_sse2; - replace_word = NULL; + replace_word = gf16_xor16_replace_word; break; */ case GF16_XOR_JIT_AVX2: @@ -1075,7 +1082,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_xor_finish_partial_packsum_avx2; copy_cksum = &gf16_cksum_copy_avx2; copy_cksum_check = &gf16_cksum_copy_check_avx2; - replace_word = NULL; + replace_word = gf16_xor32_replace_word; break; case GF16_XOR_JIT_AVX512: METHOD_REQUIRES(gf16_xor_available_avx512) @@ -1098,7 +1105,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_xor_finish_partial_packsum_avx512; copy_cksum = &gf16_cksum_copy_avx512; copy_cksum_check = &gf16_cksum_copy_check_avx512; - replace_word = NULL; + replace_word = gf16_xor64_replace_word; break; default: break; // for pedantic compilers } @@ -1159,7 +1166,6 @@ Galois16Mul::Galois16Mul(Galois16Methods method) { finish_packed = NULL; replace_word = &Galois16Mul::_replace_word; - _mul = NULL; _mul_add_pf = NULL; add_multi = &gf_add_multi_generic; add_multi_packed = &gf_add_multi_packed_generic; @@ -1208,6 +1214,7 @@ void Galois16Mul::move(Galois16Mul& other) { _pow_add = other._pow_add; copy_cksum = other.copy_cksum; copy_cksum_check = 
other.copy_cksum_check; + replace_word = other.replace_word; } #endif @@ -1283,12 +1290,18 @@ Galois16Methods Galois16Mul::default_method(size_t regionSizeHint, unsigned inpu if(caps.hasSVE2) { if(gf16_sve_get_size() >= 64) return GF16_SHUFFLE_512_SVE2; - return inputs > 3 ? GF16_CLMUL_SVE2 : GF16_SHUFFLE_128_SVE2; + return inputs > 3 && !forInvert ? GF16_CLMUL_SVE2 : GF16_SHUFFLE_128_SVE2; } if(caps.hasSVE && gf16_sve_get_size() > 16) return GF16_SHUFFLE_128_SVE; if(gf16_available_neon && caps.hasNEON) - return inputs > 3 ? GF16_CLMUL_NEON : GF16_SHUFFLE_NEON; + return +# ifdef __aarch64__ + inputs > 3 +# else + inputs > 1 +# endif + && !forInvert ? GF16_CLMUL_NEON : GF16_SHUFFLE_NEON; #endif diff --git a/gf16/gf16mul.h b/gf16/gf16mul.h index 9d3e7a59..8b3223c0 100644 --- a/gf16/gf16mul.h +++ b/gf16/gf16mul.h @@ -7,7 +7,7 @@ #include #include -typedef void(*Galois16MulTransform) (void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen); +typedef void(*Galois16MulTransform) (void* dst, const void* src, size_t srcLen); typedef void(*Galois16MulTransformPacked) (void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); typedef void(*Galois16MulTransformPackedPartial) (void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen, size_t partOffset, size_t partLen); typedef void(*Galois16MulUntransform) (void *HEDLEY_RESTRICT dst, size_t len); @@ -18,7 +18,8 @@ typedef int(*Galois16MulUntransformPackedCksumPartial) (void *HEDLEY_RESTRICT ds typedef uint16_t(*Galois16ReplaceWord) (void* data, size_t index, uint16_t newValue); -typedef void(*Galois16MulFunc) (const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); +typedef void(*Galois16MulFunc) (const void 
*HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); +typedef void(*Galois16MulRstFunc) (const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); typedef void(*Galois16MulPfFunc) (const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch, const void *HEDLEY_RESTRICT prefetch); typedef void(*Galois16PowFunc) (const void *HEDLEY_RESTRICT scratch, unsigned outputs, size_t offset, void **HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); typedef void(*Galois16MulMultiFunc) (const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); @@ -112,7 +113,7 @@ class Galois16Mul { Galois16MethodInfo _info; Galois16MulFunc _mul; - Galois16MulFunc _mul_add; + Galois16MulRstFunc _mul_add; Galois16MulPfFunc _mul_add_pf; Galois16PowFunc _pow; Galois16PowFunc _pow_add; @@ -120,8 +121,9 @@ class Galois16Mul { Galois16MulPackedFunc _mul_add_multi_packed; Galois16MulPackPfFunc _mul_add_multi_packpf; - static void _prepare_none(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen) { - memcpy(dst, src, srcLen); + static void _prepare_none(void* dst, const void* src, size_t srcLen) { + if(dst != src) + memcpy(dst, src, srcLen); } static void _finish_none(void *HEDLEY_RESTRICT, size_t) {} static void _prepare_packed_none(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); @@ -185,7 +187,7 @@ class Galois16Mul { inline HEDLEY_CONST bool 
isMultipleOfStride(size_t len) const { #if defined(_M_ARM64) || defined(__aarch64__) // SVE can have non-power-of-2 strides - if((_info.stride & (_info.stride-1)) != 0) // ...but most of the time, expect stride to be a power of 2 + if(HEDLEY_UNLIKELY((_info.stride & (_info.stride-1)) != 0)) // ...but most of the time, expect stride to be a power of 2 return (len % _info.stride) == 0; #endif return (len & (_info.stride-1)) == 0; @@ -193,7 +195,7 @@ class Galois16Mul { inline HEDLEY_CONST size_t alignToStride(size_t len) const { size_t alignMask = _info.stride-1; #if defined(_M_ARM64) || defined(__aarch64__) - if((_info.stride & (_info.stride-1)) != 0) { + if(HEDLEY_UNLIKELY((_info.stride & (_info.stride-1)) != 0)) { return ((len + alignMask) / _info.stride) * _info.stride; } #endif @@ -218,32 +220,24 @@ class Galois16Mul { HEDLEY_MALLOC void* mutScratch_alloc() const; void mutScratch_free(void* mutScratch) const; - inline void mul(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) const { - assert(((uintptr_t)dst & (_info.alignment-1)) == 0); - assert(((uintptr_t)src & (_info.alignment-1)) == 0); + inline void mul(void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) const { assert(isMultipleOfStride(len)); assert(len > 0); - if(!(coefficient & 0xfffe)) { + if(HEDLEY_UNLIKELY(!(coefficient & 0xfffe))) { if(coefficient == 0) memset(dst, 0, len); - else + else if(dst != src) memcpy(dst, src, len); } - else if(_mul) + else _mul(scratch, dst, src, len, coefficient, mutScratch); - else { - memset(dst, 0, len); - _mul_add(scratch, dst, src, len, coefficient, mutScratch); - } } inline void mul_add(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) const { - assert(((uintptr_t)dst & (_info.alignment-1)) == 0); - assert(((uintptr_t)src & (_info.alignment-1)) == 0); 
assert(isMultipleOfStride(len)); assert(len > 0); - if(coefficient == 0) return; + if(HEDLEY_UNLIKELY(coefficient == 0)) return; _mul_add(scratch, dst, src, len, coefficient, mutScratch); } @@ -252,7 +246,7 @@ class Galois16Mul { assert(len > 0); assert(outputs > 0); - if(!(coefficient & 0xfffe)) { + if(HEDLEY_UNLIKELY(!(coefficient & 0xfffe))) { if(coefficient == 0) { for(unsigned output = 0; output < outputs; output++) memset((uint8_t*)dst[output] + offset, 0, len); @@ -268,20 +262,11 @@ class Galois16Mul { memset((uint8_t*)dst[output] + offset, 0, len); _pow_add(scratch, outputs, offset, dst, src, len, coefficient, mutScratch); } - else if(_mul) { - void* prev = (uint8_t*)src + offset; - for(unsigned output = 0; output < outputs; output++) { - void* cur = (uint8_t*)dst[output] + offset; - _mul(scratch, cur, prev, len, coefficient, mutScratch); - prev = cur; - } - } else { void* prev = (uint8_t*)src + offset; for(unsigned output = 0; output < outputs; output++) { void* cur = (uint8_t*)dst[output] + offset; - memset(cur, 0, len); - _mul_add(scratch, cur, prev, len, coefficient, mutScratch); + _mul(scratch, cur, prev, len, coefficient, mutScratch); prev = cur; } } @@ -291,7 +276,7 @@ class Galois16Mul { assert(len > 0); assert(outputs > 0); - if(coefficient == 0) return; + if(HEDLEY_UNLIKELY(coefficient == 0)) return; _pow_add(scratch, outputs, offset, dst, src, len, coefficient, mutScratch); } From 9362d265355800cde41268d4e2d8e735aa58540b Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 6 Jun 2023 12:31:35 +1000 Subject: [PATCH 08/91] Make Xor SSE2 memory layout consistent with AVX* implementations --- gf16/gf16_xor_common_funcs.h | 8 ++-- gf16/gf16_xor_sse2.c | 75 ++++++++++++++++++------------------ 2 files changed, 41 insertions(+), 42 deletions(-) diff --git a/gf16/gf16_xor_common_funcs.h b/gf16/gf16_xor_common_funcs.h index c607da06..90235423 100644 --- a/gf16/gf16_xor_common_funcs.h +++ b/gf16/gf16_xor_common_funcs.h @@ -48,12 +48,12 @@ static 
HEDLEY_ALWAYS_INLINE void gf16_xor_prep_split(_mword ta, _mword tb, _mwor *th = _mm256_permute4x64_epi64(*th, _MM_SHUFFLE(2,0,3,1)); #else *th = _mm_packus_epi16( - _mm_srli_epi16(tb, 8), - _mm_srli_epi16(ta, 8) + _mm_srli_epi16(ta, 8), + _mm_srli_epi16(tb, 8) ); *tl = _mm_packus_epi16( - _mm_and_si128(tb, _mm_set1_epi16(0xff)), - _mm_and_si128(ta, _mm_set1_epi16(0xff)) + _mm_and_si128(ta, _mm_set1_epi16(0xff)), + _mm_and_si128(tb, _mm_set1_epi16(0xff)) ); #endif } diff --git a/gf16/gf16_xor_sse2.c b/gf16/gf16_xor_sse2.c index e4f27287..723ad74f 100644 --- a/gf16/gf16_xor_sse2.c +++ b/gf16/gf16_xor_sse2.c @@ -1065,14 +1065,14 @@ void gf16_xor_finish_block_sse2(void *HEDLEY_RESTRICT dst) { srcVec = _mm_add_epi8(srcVec, srcVec); \ write16((target)+0, _mm_movemask_epi8(srcVec)); \ } - EXTRACT_BITS_HALF(_dst + 0, dstA, 0, srcDQb) - EXTRACT_BITS_HALF(_dst + 8, dstA, 1, srcDQa) - EXTRACT_BITS_HALF(_dst + 16, dstB, 0, srcDQd) - EXTRACT_BITS_HALF(_dst + 24, dstB, 1, srcDQc) - EXTRACT_BITS_HALF(_dst + 32, dstC, 0, srcDQf) - EXTRACT_BITS_HALF(_dst + 40, dstC, 1, srcDQe) - EXTRACT_BITS_HALF(_dst + 48, dstD, 0, srcDQh) - EXTRACT_BITS_HALF(_dst + 56, dstD, 1, srcDQg) + EXTRACT_BITS_HALF(_dst + 0, dstA, 0, srcDQa) + EXTRACT_BITS_HALF(_dst + 8, dstA, 1, srcDQb) + EXTRACT_BITS_HALF(_dst + 16, dstB, 0, srcDQc) + EXTRACT_BITS_HALF(_dst + 24, dstB, 1, srcDQd) + EXTRACT_BITS_HALF(_dst + 32, dstC, 0, srcDQe) + EXTRACT_BITS_HALF(_dst + 40, dstC, 1, srcDQf) + EXTRACT_BITS_HALF(_dst + 48, dstD, 0, srcDQg) + EXTRACT_BITS_HALF(_dst + 56, dstD, 1, srcDQh) #undef EXTRACT_BITS_HALF @@ -1098,14 +1098,14 @@ void gf16_xor_finish_block_sse2(void *HEDLEY_RESTRICT dst) { // extract & write all bits // TODO: consider saving some to a register to reduce write ops - EXTRACT_BITS(_dst + 64 + 0, srcDQb) - EXTRACT_BITS(_dst + 64 + 8, srcDQa) - EXTRACT_BITS(_dst + 64 + 16, srcDQd) - EXTRACT_BITS(_dst + 64 + 24, srcDQc) - EXTRACT_BITS(_dst + 64 + 32, srcDQf) - EXTRACT_BITS(_dst + 64 + 40, srcDQe) - 
EXTRACT_BITS(_dst + 64 + 48, srcDQh) - EXTRACT_BITS(_dst + 64 + 56, srcDQg) + EXTRACT_BITS(_dst + 64 + 0, srcDQa) + EXTRACT_BITS(_dst + 64 + 8, srcDQb) + EXTRACT_BITS(_dst + 64 + 16, srcDQc) + EXTRACT_BITS(_dst + 64 + 24, srcDQd) + EXTRACT_BITS(_dst + 64 + 32, srcDQe) + EXTRACT_BITS(_dst + 64 + 40, srcDQf) + EXTRACT_BITS(_dst + 64 + 48, srcDQg) + EXTRACT_BITS(_dst + 64 + 56, srcDQh) } void gf16_xor_finish_copy_block_sse2(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src) { uint16_t* _dst = (uint16_t*)dst; @@ -1125,14 +1125,14 @@ void gf16_xor_finish_copy_block_sse2(void *HEDLEY_RESTRICT dst, const void *HEDL UNPACK_VECTS; // write extracted bits - EXTRACT_BITS(_dst + 0, srcDQb) - EXTRACT_BITS(_dst + 8, srcDQa) - EXTRACT_BITS(_dst + 16, srcDQd) - EXTRACT_BITS(_dst + 24, srcDQc) - EXTRACT_BITS(_dst + 32, srcDQf) - EXTRACT_BITS(_dst + 40, srcDQe) - EXTRACT_BITS(_dst + 48, srcDQh) - EXTRACT_BITS(_dst + 56, srcDQg) + EXTRACT_BITS(_dst + 0, srcDQa) + EXTRACT_BITS(_dst + 8, srcDQb) + EXTRACT_BITS(_dst + 16, srcDQc) + EXTRACT_BITS(_dst + 24, srcDQd) + EXTRACT_BITS(_dst + 32, srcDQe) + EXTRACT_BITS(_dst + 40, srcDQf) + EXTRACT_BITS(_dst + 48, srcDQg) + EXTRACT_BITS(_dst + 56, srcDQh) // load second half @@ -1143,14 +1143,14 @@ void gf16_xor_finish_copy_block_sse2(void *HEDLEY_RESTRICT dst, const void *HEDL UNPACK_VECTS; - EXTRACT_BITS(_dst + 64 + 0, srcDQb) - EXTRACT_BITS(_dst + 64 + 8, srcDQa) - EXTRACT_BITS(_dst + 64 + 16, srcDQd) - EXTRACT_BITS(_dst + 64 + 24, srcDQc) - EXTRACT_BITS(_dst + 64 + 32, srcDQf) - EXTRACT_BITS(_dst + 64 + 40, srcDQe) - EXTRACT_BITS(_dst + 64 + 48, srcDQh) - EXTRACT_BITS(_dst + 64 + 56, srcDQg) + EXTRACT_BITS(_dst + 64 + 0, srcDQa) + EXTRACT_BITS(_dst + 64 + 8, srcDQb) + EXTRACT_BITS(_dst + 64 + 16, srcDQc) + EXTRACT_BITS(_dst + 64 + 24, srcDQd) + EXTRACT_BITS(_dst + 64 + 32, srcDQe) + EXTRACT_BITS(_dst + 64 + 40, srcDQf) + EXTRACT_BITS(_dst + 64 + 48, srcDQg) + EXTRACT_BITS(_dst + 64 + 56, srcDQh) } void 
gf16_xor_finish_copy_blocku_sse2(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t bytes) { uint16_t block[128]; @@ -1279,10 +1279,9 @@ void gf16_xor_jit_uninit(void* scratch) { #endif } -static HEDLEY_ALWAYS_INLINE uint16_t gf16_xorX_replace_word(void* data, size_t index, uint16_t newValue, size_t width, unsigned byteFlip) { +static HEDLEY_ALWAYS_INLINE uint16_t gf16_xorX_replace_word(void* data, size_t index, uint16_t newValue, size_t width) { uint8_t* base = (uint8_t*)data + (index & ~(width*8-1)) * 2; // advance pointer to correct group - base += ((index >> 3) & (width-1)) ^ byteFlip; // advance to correct byte - // TODO: remove byteFlip parameter + base += ((index >> 3) & (width-1)); // advance to correct byte unsigned bitIndex = index&7; uint16_t oldValue = 0; @@ -1299,13 +1298,13 @@ static HEDLEY_ALWAYS_INLINE uint16_t gf16_xorX_replace_word(void* data, size_t i return oldValue; } uint16_t gf16_xor16_replace_word(void* data, size_t index, uint16_t newValue) { - return gf16_xorX_replace_word(data, index, newValue, 16, 1); + return gf16_xorX_replace_word(data, index, newValue, 16); } uint16_t gf16_xor32_replace_word(void* data, size_t index, uint16_t newValue) { - return gf16_xorX_replace_word(data, index, newValue, 32, 0); + return gf16_xorX_replace_word(data, index, newValue, 32); } uint16_t gf16_xor64_replace_word(void* data, size_t index, uint16_t newValue) { - return gf16_xorX_replace_word(data, index, newValue, 64, 0); + return gf16_xorX_replace_word(data, index, newValue, 64); } From 0dc711448531ccc96a1c656082d021965322607f Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 6 Jun 2023 18:20:51 +1000 Subject: [PATCH 09/91] Adjust inversion for integration with par2cmdline-turbo --- gf16/gfmat_inv.cpp | 30 +++++++++++++++++++++++------- gf16/gfmat_inv.h | 14 ++++++++++++-- 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/gf16/gfmat_inv.cpp b/gf16/gfmat_inv.cpp index 2bc5cd1a..a9b512e9 100644 --- a/gf16/gfmat_inv.cpp +++ 
b/gf16/gfmat_inv.cpp @@ -1,4 +1,5 @@ #include "gfmat_coeff.h" +#include "gfmat_inv.h" #ifdef PARPAR_INVERT_SUPPORT extern "C" uint16_t* gf16_recip; @@ -7,7 +8,9 @@ extern "C" uint16_t* gf16_recip; #include "../src/platform.h" // for ALIGN_* #include "gf16mul.h" -uint16_t* compute_recovery_matrix(const std::vector& inputValid, unsigned validCount, std::vector& recovery, unsigned& stride) { +bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned validCount, std::vector& recovery, std::function progressCb) { + if(mat) ALIGN_FREE(mat); + unsigned matWidth = inputValid.size() * sizeof(uint16_t); Galois16Mul gf(Galois16Mul::default_method(matWidth, inputValid.size(), inputValid.size(), true)); stride = gf.alignToStride(matWidth); @@ -16,21 +19,27 @@ uint16_t* compute_recovery_matrix(const std::vector& inputValid, unsigned unsigned invalidCount = inputValid.size() - validCount; assert(validCount < inputValid.size()); // i.e. invalidCount > 0 + assert(inputValid.size() <= 32768); + assert(recovery.size() <= 65535); - uint16_t* mat; ALIGN_ALLOC(mat, invalidCount * stride, gfInfo.alignment); unsigned validCol, missingCol; unsigned stride16 = stride / sizeof(uint16_t); assert(stride16 * sizeof(uint16_t) == stride); + uint16_t totalProgress = invalidCount + (gf.needPrepare() ? 3 : 1); // provision for prepare/finish/init-calc + invert_loop: { // loop, in the unlikely case we hit the PAR2 un-invertability flaw; TODO: is there a faster way than just retrying? 
if(invalidCount > recovery.size()) { // not enough recovery gf.mutScratch_free(gfScratch); ALIGN_FREE(mat); - return nullptr; + mat = nullptr; + return false; } + if(progressCb) progressCb(0, totalProgress); + // generate matrix validCol = 0; missingCol = validCount; @@ -44,7 +53,11 @@ uint16_t* compute_recovery_matrix(const std::vector& inputValid, unsigned assert(validCol == validCount); // pre-transform + uint16_t progressOffset = 1; if(gf.needPrepare()) { + if(progressCb) progressCb(1, totalProgress); + progressOffset = 2; + for(unsigned rec = 0; rec < invalidCount; rec++) { uint16_t* row = mat + rec * stride16; //memset(row + matWidth, 0, stride - matWidth); // not necessary, but do this to avoid uninitialized memory @@ -54,9 +67,10 @@ uint16_t* compute_recovery_matrix(const std::vector& inputValid, unsigned // invert // TODO: optimise: multi-thread + packed arrangement - // TODO: progress hook missingCol = validCount; for(unsigned rec = 0; rec < invalidCount; rec++) { + if(progressCb) progressCb(rec + progressOffset, totalProgress); + uint16_t* row = mat + rec * stride16; // scale down factor uint16_t baseCoeff = gf.replace_word(row, missingCol, 1); @@ -84,6 +98,8 @@ uint16_t* compute_recovery_matrix(const std::vector& inputValid, unsigned // post transform if(gf.needPrepare()) { + if(progressCb) progressCb(totalProgress-1, totalProgress); + for(unsigned rec = 0; rec < invalidCount; rec++) { uint16_t* row = mat + rec * stride16; gf.finish(row, stride); @@ -105,11 +121,11 @@ uint16_t* compute_recovery_matrix(const std::vector& inputValid, unsigned recovery.resize(invalidCount); gf.mutScratch_free(gfScratch); - return mat; + return true; } -void free_recovery_matrix(uint16_t* mat) { - ALIGN_FREE(mat); +Galois16RecMatrix::~Galois16RecMatrix() { + if(mat) ALIGN_FREE(mat); } #endif diff --git a/gf16/gfmat_inv.h b/gf16/gfmat_inv.h index c70cacba..3aa4aa25 100644 --- a/gf16/gfmat_inv.h +++ b/gf16/gfmat_inv.h @@ -2,11 +2,21 @@ #define GFMAT_INV_H #include +#include 
#include "../src/stdint.h" #ifdef PARPAR_INVERT_SUPPORT -uint16_t* compute_recovery_matrix(const std::vector& inputValid, unsigned validCount, std::vector& recovery, unsigned& stride); -void free_recovery_matrix(uint16_t* mat); +class Galois16RecMatrix { + uint16_t* mat; + unsigned stride; +public: + Galois16RecMatrix() : mat(nullptr) {} + ~Galois16RecMatrix(); + bool Compute(const std::vector& inputValid, unsigned validCount, std::vector& recovery, std::function progressCb = nullptr); + inline uint16_t GetFactor(uint16_t inIdx, uint16_t recIdx) const { + return mat[recIdx * stride/sizeof(uint16_t) + inIdx]; + } +}; #endif #endif From 83c9f8b066c6b8b20ebbbc318688c0559a822d21 Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 6 Jun 2023 18:21:09 +1000 Subject: [PATCH 10/91] Compile bugfix --- gf16/gf16_xor_avx2.c | 2 +- gf16/gf16_xor_sse2.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/gf16/gf16_xor_avx2.c b/gf16/gf16_xor_avx2.c index a81329bc..d9ba9578 100644 --- a/gf16/gf16_xor_avx2.c +++ b/gf16/gf16_xor_avx2.c @@ -726,7 +726,7 @@ void* gf16_xor_jit_init_mut_avx2() { #if defined(__AVX2__) && defined(PLATFORM_AMD64) jit_wx_pair *jitCode = jit_alloc(XORDEP_JIT_SIZE); if(!jitCode) return NULL; - xor_write_init_jit(jitCode->w, jitCode->w + XORDEP_JIT_SIZE/2, NULL, NULL); + xor_write_init_jit(jitCode->w, (uint8_t*)jitCode->w + XORDEP_JIT_SIZE/2, NULL, NULL); return jitCode; #else return NULL; diff --git a/gf16/gf16_xor_sse2.c b/gf16/gf16_xor_sse2.c index 723ad74f..1f428c6c 100644 --- a/gf16/gf16_xor_sse2.c +++ b/gf16/gf16_xor_sse2.c @@ -1264,7 +1264,7 @@ void* gf16_xor_jit_init_mut_sse2() { #ifdef PLATFORM_X86 jit_wx_pair *jitCode = jit_alloc(XORDEP_JIT_SIZE); if(!jitCode) return NULL; - xor_write_init_jit(jitCode->w, jitCode->w + XORDEP_JIT_SIZE/2, NULL, NULL); + xor_write_init_jit(jitCode->w, (uint8_t*)jitCode->w + XORDEP_JIT_SIZE/2, NULL, NULL); return jitCode; #else return NULL; From 451422641a8c830855e214b784caaeb441a802d0 Mon Sep 17 
00:00:00 2001 From: animetosho Date: Wed, 7 Jun 2023 20:52:48 +1000 Subject: [PATCH 11/91] Add option to specify exact recovery exponents to use --- README.md | 1 + bin/parpar.js | 13 ++++++++ help.txt | 5 ++++ lib/par2gen.js | 74 +++++++++++++++++++++++++++++++++------------- lib/par2outfile.js | 4 +-- 5 files changed, 73 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 6fbf5a4e..67a5d505 100644 --- a/README.md +++ b/README.md @@ -156,6 +156,7 @@ var par2creator = require('@animetosho/parpar').run( value: 65537 }, recoveryOffset: 0, + recoveryExponents: null, // if an array of numbers is specified, recoveryOffset is ignored, and a single output file is produced regardless of output* options memoryLimit: null, // 0 to specify no limit minChunkSize: 128*1024, processBatchSize: 12, diff --git a/bin/parpar.js b/bin/parpar.js index 42082df9..f03af0b6 100755 --- a/bin/parpar.js +++ b/bin/parpar.js @@ -61,6 +61,10 @@ var opts = { type: 'int', map: 'recoveryOffset' }, + 'recovery-exponents': { + type: 'list', + map: 'recoveryExponents' + }, 'comment': { alias: 'c', type: 'array', @@ -417,6 +421,15 @@ if(argv['md5-method']) { require('../lib/par2.js').set_outhash_method(argv['md5-method']); } +if(argv['recovery-exponents']) { + ['recovery-slices', 'min-recovery-slices', 'max-recovery-slices', 'recovery-offset', 'slice-dist', 'slices-per-file', 'slices-first-file', 'recovery-files'].forEach(function(conflictOpt) { + if(argv[conflictOpt]) + error('`--recovery-exponents` cannot be used with `--' + conflictOpt + '`'); + }); + if(!argv.noindex) + error('`--recovery-exponents` cannot be used with `--noindex`'); +} + var inputFiles = argv._; // copied from Nyuu; TODO: dedupe this somehow? diff --git a/help.txt b/help.txt index 9ed758ea..da6a3a05 100644 --- a/help.txt +++ b/help.txt @@ -88,6 +88,11 @@ PAR2 Options: This default also causes an error to be thrown if the number exceeds the maximum of 65535. -e, --recovery-offset Recovery slice start offset. 
Default 0. + --recovery-exponents Comma-separated list of exact recovery exponents + to use. If specified, `--recovery-offset` and + `recovery-slices` related options cannot be used, + and only a single PAR2 output file will be + produced. -c, --comment Add PAR2 comment. Can be specified multiple times. --packet-redundancy How many copies of critical packets to use in recovery files. This option uses the same syntax diff --git a/lib/par2gen.js b/lib/par2gen.js index 278fdf9d..3545550d 100644 --- a/lib/par2gen.js +++ b/lib/par2gen.js @@ -181,6 +181,7 @@ function PAR2Gen(fileInfo, sliceSize, opts) { value: 65536 }, recoveryOffset: 0, + recoveryExponents: null, // if an array of numbers is specified, recoveryOffset is ignored, and a single output file is produced regardless of output* options memoryLimit: null, // 0 to specify no limit minChunkSize: 128*1024, // 0 to disable chunking processBatchSize: null, // default is typically 12 (may be adjusted based on GF method's preferred multiple) @@ -222,6 +223,30 @@ function PAR2Gen(fileInfo, sliceSize, opts) { }; if(opts) Par2._extend(o, opts); + if(o.recoveryExponents) { + o.outputIndex = false; + o.outputSizeScheme = 'equal'; + o.outputFirstFileSlices = null; + o.outputFileMaxSlices = { + unit: 'slices', + value: 65535 + }; + o.outputFileCount = 0; + o.recoveryOffset = 0; + + var used = {}; + o.recoveryExponents = o.recoveryExponents.map(function(exp) { + exp = exp|0; + if(exp < 0 || exp > 65534) + throw new Error('Invalid recovery exponent ' + exp); + if(exp in used) + throw new Error('Duplicate recovery exponent ' + exp); + used[exp] = 1; + return exp; + }); + o.recoverySlices = o.recoveryExponents.length; + } + if(o.criticalRedundancyScheme === 'pow2') o.criticalRedundancyScheme = {unit: 'log', value: 2}; // backwards compatibility if(!fileInfo || (typeof fileInfo != 'object')) throw new Error('No input files supplied'); var totalSize = 0, dataFiles = 0; @@ -356,21 +381,23 @@ function PAR2Gen(fileInfo, sliceSize, 
opts) { if(o.seqReadSize > MAX_BUFFER_SIZE_MOD2) throw new Error('Read buffer size (' + o.seqReadSize + ') exceeds maximum size supported by this version of Node.js of ' + MAX_BUFFER_SIZE_MOD2 + ' bytes'); - o.recoverySlices = calcNumRecoverySlices(o.recoverySlices, o.sliceSize, this.inputSlices, fileInfo); - // check+apply min/max limits - var minRecSlices = Math.ceil(o.recoverySlices), maxRecSlices = Math.floor(o.recoverySlices); - if(o.minRecoverySlices !== null) - minRecSlices = Math.ceil(calcNumRecoverySlices(o.minRecoverySlices, o.sliceSize, this.inputSlices, fileInfo)); - if(o.maxRecoverySlices !== null) - maxRecSlices = Math.floor(calcNumRecoverySlices(o.maxRecoverySlices, o.sliceSize, this.inputSlices, fileInfo)); - o.recoverySlices = Math.max(o.recoverySlices, minRecSlices); - o.recoverySlices = Math.min(o.recoverySlices, maxRecSlices); - o.recoverySlices = Math.round(o.recoverySlices); - if(o.recoverySlices < minRecSlices || o.recoverySlices > maxRecSlices /*pedant check*/) - throw new Error('Could not satisfy specified min/max recovery slice count constraints'); - - if(o.recoverySlices < 0 || isNaN(o.recoverySlices) || !isFinite(o.recoverySlices)) throw new Error('Invalid number of recovery slices'); - if(o.recoverySlices+o.recoveryOffset > 65535) throw new Error('Cannot generate specified number of recovery slices: ' + (o.recoverySlices+o.recoveryOffset) + ' exceeds maximum of 65535'); + if(!o.recoveryExponents) { + o.recoverySlices = calcNumRecoverySlices(o.recoverySlices, o.sliceSize, this.inputSlices, fileInfo); + // check+apply min/max limits + var minRecSlices = Math.ceil(o.recoverySlices), maxRecSlices = Math.floor(o.recoverySlices); + if(o.minRecoverySlices !== null) + minRecSlices = Math.ceil(calcNumRecoverySlices(o.minRecoverySlices, o.sliceSize, this.inputSlices, fileInfo)); + if(o.maxRecoverySlices !== null) + maxRecSlices = Math.floor(calcNumRecoverySlices(o.maxRecoverySlices, o.sliceSize, this.inputSlices, fileInfo)); + o.recoverySlices = 
Math.max(o.recoverySlices, minRecSlices); + o.recoverySlices = Math.min(o.recoverySlices, maxRecSlices); + o.recoverySlices = Math.round(o.recoverySlices); + if(o.recoverySlices < minRecSlices || o.recoverySlices > maxRecSlices /*pedant check*/) + throw new Error('Could not satisfy specified min/max recovery slice count constraints'); + + if(o.recoverySlices < 0 || isNaN(o.recoverySlices) || !isFinite(o.recoverySlices)) throw new Error('Invalid number of recovery slices'); + if(o.recoverySlices+o.recoveryOffset > 65535) throw new Error('Cannot generate specified number of recovery slices: ' + (o.recoverySlices+o.recoveryOffset) + ' exceeds maximum of 65535'); + } if(this.inputSlices < 1 && o.recoverySlices > 0) throw new Error('Cannot generate recovery from empty input data'); @@ -789,7 +816,10 @@ function PAR2Gen(fileInfo, sliceSize, opts) { throw new Error('Cannot allocate '+o.recoverySlices+' recovery slices to '+o.outputFileCount+' volumes as there aren\'t enough slices'); if(this.totalSize > 0) { - if(o.outputSizeScheme == 'pow2') { + if(o.recoveryExponents) { + this._rfPush(o.recoveryExponents.length, o.recoveryExponents, critPackets, creatorPkt); + } + else if(o.outputSizeScheme == 'pow2') { var slices = o.outputFirstFileSlices || 1, totalSlices = o.recoverySlices + o.recoveryOffset; var getSliceNumsOffsets = function(slices) { var result = []; @@ -917,7 +947,7 @@ PAR2Gen.prototype = { readSize: 0, _buf: null, - _rfPush: function(numSlices, sliceOffset, critPackets, creator) { + _rfPush: function(numSlices, sliceOffsetOrExponents, critPackets, creator) { var packets, recvSize = 0, critTotalSize = sumSize(critPackets); if(numSlices) recvSize = this.par2.packetRecoverySize(); @@ -933,7 +963,8 @@ PAR2Gen.prototype = { packets = Array(numSlices + critNum +1); var pos = 0, critWritten = 0; for(var i=0; i Date: Wed, 7 Jun 2023 22:30:51 +1000 Subject: [PATCH 12/91] Add dot-product optimisation to matrix inversion --- gf16/gfmat_inv.cpp | 138 
++++++++++++++++++++++++++++++++++++--------- gf16/gfmat_inv.h | 4 ++ 2 files changed, 115 insertions(+), 27 deletions(-) diff --git a/gf16/gfmat_inv.cpp b/gf16/gfmat_inv.cpp index a9b512e9..eab26279 100644 --- a/gf16/gfmat_inv.cpp +++ b/gf16/gfmat_inv.cpp @@ -8,6 +8,96 @@ extern "C" uint16_t* gf16_recip; #include "../src/platform.h" // for ALIGN_* #include "gf16mul.h" +template +int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, unsigned invalidCount, Galois16Mul& gf, void* gfScratch) { + unsigned missingCol = validCount + rec; + + uint16_t baseCoeff; + uint16_t coeff[rows]; + + void* srcRows[rows]; + srcRows[0] = mat + rec * (stride / sizeof(uint16_t)); + for(unsigned i=1; i= 2) { + // multiply-add to the next row + MULADD_ROW(srcRows[1], 0); + // scale it, and multiply-add back + SCALE_ROW(1); + MULADD_ROW(srcRows[0], 1); + } + if(rows >= 3) { + MULADD_MULTI_ROW(srcRows[2], 0, 2); + SCALE_ROW(2); + if(rows >= 4) { + MULADD_MULTI_ROW(srcRows[3], 0, 2); + MULADD_ROW(srcRows[3], 2); + SCALE_ROW(3); + MULADD_ROW(srcRows[2], 3); + MULADD_MULTI_ROW(srcRows[0], 2, 2); + MULADD_MULTI_ROW(srcRows[1], 2, 2); + } else { + MULADD_ROW(srcRows[0], 2); + MULADD_ROW(srcRows[1], 2); + } + } + if(rows >= 5) { + MULADD_MULTI_ROW(srcRows[4], 0, 4); + SCALE_ROW(4); + if(rows >= 6) { + MULADD_MULTI_ROW(srcRows[5], 0, 4); + MULADD_ROW(srcRows[5], 4); + SCALE_ROW(5); + MULADD_ROW(srcRows[4], 5); + for(unsigned rec2 = 0; rec2 < 4; rec2++) { + MULADD_MULTI_ROW(srcRows[rec2], 4, 2); + } + } else { + for(unsigned rec2 = 0; rec2 < 4; rec2++) { + MULADD_ROW(srcRows[rec2], 4); + } + } + } + + for(unsigned rec2 = 0; rec2 < invalidCount; rec2++) { + if(HEDLEY_UNLIKELY(rec2 >= rec && rec2 < rec+rows)) continue; + uint16_t* row2 = mat + rec2 * (stride / sizeof(uint16_t)); + if(rows > 1) { + MULADD_MULTI_ROW(row2, 0, rows); + } else { + MULADD_ROW(row2, 0); + } + } + + #undef SCALE_ROW + #undef MULADD_ROW + #undef MULADD_MULTI_ROW + + return -1; +} + bool 
Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned validCount, std::vector& recovery, std::function progressCb) { if(mat) ALIGN_FREE(mat); @@ -67,34 +157,28 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va // invert // TODO: optimise: multi-thread + packed arrangement - missingCol = validCount; - for(unsigned rec = 0; rec < invalidCount; rec++) { - if(progressCb) progressCb(rec + progressOffset, totalProgress); - - uint16_t* row = mat + rec * stride16; - // scale down factor - uint16_t baseCoeff = gf.replace_word(row, missingCol, 1); - if(HEDLEY_UNLIKELY(baseCoeff == 0)) { // bad recovery coeff - // ignore this recovery row and try again - recovery.erase(recovery.begin() + rec); - goto invert_loop; - } - baseCoeff = gf16_recip[baseCoeff]; // TODO: consider prefetching this? - if(HEDLEY_LIKELY(baseCoeff != 1)) { - gf.mul(row, row, stride, baseCoeff, gfScratch); - } - - for(unsigned rec2 = 0; rec2 < invalidCount; rec2++) { - if(HEDLEY_UNLIKELY(rec == rec2)) continue; - uint16_t* row2 = mat + rec2 * stride16; - uint16_t coeff = gf.replace_word(row2, missingCol, 0); - if(HEDLEY_LIKELY(coeff != 0)) { - gf.mul_add(row2, row, stride, coeff, gfScratch); - } // TODO: is a coefficient of 0 ever correct? + unsigned rec = 0; + #define INVERT_GROUP(rows) \ + if(gfInfo.idealInputMultiple >= rows && invalidCount >= rows) { \ + for(; rec <= invalidCount-rows; rec+=rows) { \ + if(progressCb) progressCb(rec + progressOffset, totalProgress); \ + \ + int badRowOffset = processRow(rec, validCount, invalidCount, gf, gfScratch); \ + if(badRowOffset >= 0) { \ + /* ignore this recovery row and try again */ \ + recovery.erase(recovery.begin() + rec + badRowOffset); \ + goto invert_loop; \ + } \ + } \ } - - missingCol++; - } + // max out at 6 groups (registers + cache assoc?) 
+ INVERT_GROUP(6) + INVERT_GROUP(5) + INVERT_GROUP(4) + INVERT_GROUP(3) + INVERT_GROUP(2) + INVERT_GROUP(1) + #undef INVERT_GROUP // post transform if(gf.needPrepare()) { diff --git a/gf16/gfmat_inv.h b/gf16/gfmat_inv.h index 3aa4aa25..ee29911c 100644 --- a/gf16/gfmat_inv.h +++ b/gf16/gfmat_inv.h @@ -6,9 +6,13 @@ #include "../src/stdint.h" #ifdef PARPAR_INVERT_SUPPORT +class Galois16Mul; class Galois16RecMatrix { uint16_t* mat; unsigned stride; + + template + int processRow(unsigned rec, unsigned validCount, unsigned invalidCount, Galois16Mul& gf, void* gfScratch); public: Galois16RecMatrix() : mat(nullptr) {} ~Galois16RecMatrix(); From d2a4e99b3ca8807aec821e3d5453ae4536291ca5 Mon Sep 17 00:00:00 2001 From: animetosho Date: Sat, 10 Jun 2023 18:16:21 +1000 Subject: [PATCH 13/91] Add prefetching to matrix inversion --- gf16/gf16_affine.h | 2 + gf16/gf16_clmul.h | 1 + gf16/gf16_muladd_multi.h | 99 ++++++++++++++++++++++++++++++++++++++-- gf16/gf16_shuffle.h | 3 ++ gf16/gf16mul.cpp | 19 ++++++++ gf16/gf16mul.h | 29 ++++++++++++ gf16/gfmat_inv.cpp | 97 +++++++++++++++++++++++++++++++-------- 7 files changed, 226 insertions(+), 24 deletions(-) diff --git a/gf16/gf16_affine.h b/gf16/gf16_affine.h index 254e5334..33c3b074 100644 --- a/gf16/gf16_affine.h +++ b/gf16/gf16_affine.h @@ -6,6 +6,7 @@ void gf16_affine_muladd_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ void gf16_affine_muladd_prefetch_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch, const void *HEDLEY_RESTRICT prefetch); \ void gf16_affine_muladd_multi_##v(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT 
mutScratch); \ + void gf16_affine_muladd_multi_stridepf_##v(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t srcStride, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetch); \ void gf16_affine_muladd_multi_packed_##v(const void *HEDLEY_RESTRICT scratch, unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); \ void gf16_affine_muladd_multi_packpf_##v(const void *HEDLEY_RESTRICT scratch, unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut); \ void gf16_affine_prepare_packed_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); \ @@ -24,6 +25,7 @@ FUNCS(avx512); void gf16_affine2x_mul_##v(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ void gf16_affine2x_muladd_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ void gf16_affine2x_muladd_multi_##v(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); \ + void gf16_affine2x_muladd_multi_stridepf_##v(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t srcStride, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, 
const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetch); \ void gf16_affine2x_muladd_multi_packed_##v(const void *HEDLEY_RESTRICT scratch, unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); \ void gf16_affine2x_muladd_multi_packpf_##v(const void *HEDLEY_RESTRICT scratch, unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut); \ void gf16_affine2x_prepare_##v(void* dst, const void* src, size_t srcLen); \ diff --git a/gf16/gf16_clmul.h b/gf16/gf16_clmul.h index 696f0dcc..d8f189c2 100644 --- a/gf16/gf16_clmul.h +++ b/gf16/gf16_clmul.h @@ -3,6 +3,7 @@ #define FUNCS(v) \ void gf16_clmul_muladd_multi_##v(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); \ + void gf16_clmul_muladd_multi_stridepf_##v(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t srcStride, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetch); \ void gf16_clmul_muladd_multi_packed_##v(const void *HEDLEY_RESTRICT scratch, unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); \ void gf16_clmul_muladd_multi_packpf_##v(const void *HEDLEY_RESTRICT scratch, unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* 
HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut); \ void gf16_clmul_mul_##v(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ diff --git a/gf16/gf16_muladd_multi.h b/gf16/gf16_muladd_multi.h index 41a7e36f..98ccc9d4 100644 --- a/gf16/gf16_muladd_multi.h +++ b/gf16/gf16_muladd_multi.h @@ -29,6 +29,11 @@ void fnpre ## _muladd_multi ## fnsuf(const void *HEDLEY_RESTRICT scratch, unsign gf16_muladd_multi(scratch, &xfn, procRegions, regions, offset, dst, src, len, coefficients); \ finisher; \ } \ +void fnpre ## _muladd_multi_stridepf ## fnsuf(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t srcStride, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetch) { \ + UNUSED(mutScratch); \ + gf16_muladd_multi_stridepf(scratch, &xfn, procRegions, regions, srcStride, dst, src, len, coefficients, pfFactor, prefetch); \ + finisher; \ +} \ void fnpre ## _muladd_multi_packed ## fnsuf(const void *HEDLEY_RESTRICT scratch, unsigned packedRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch) { \ UNUSED(mutScratch); \ gf16_muladd_multi_packed(scratch, &xfn, procRegions, procRegions, packedRegions, regions, dst, src, len, blocksize, coefficients); \ @@ -45,6 +50,10 @@ void fnpre ## _muladd_multi ## fnsuf(const void *HEDLEY_RESTRICT scratch, unsign UNUSED(mutScratch); \ UNUSED(scratch); UNUSED(regions); UNUSED(offset); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficients); \ } \ +void fnpre ## _muladd_multi_stridepf ## fnsuf(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t 
srcStride, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetch) { \ + UNUSED(mutScratch); \ + UNUSED(scratch); UNUSED(regions); UNUSED(srcStride); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficients); UNUSED(prefetch); \ +} \ void fnpre ## _muladd_multi_packed ## fnsuf(const void *HEDLEY_RESTRICT scratch, unsigned packedRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch) { \ UNUSED(mutScratch); \ UNUSED(scratch); UNUSED(packedRegions); UNUSED(regions); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficients); \ @@ -102,8 +111,8 @@ static HEDLEY_ALWAYS_INLINE void gf16_muladd_multi(const void *HEDLEY_RESTRICT s uint8_t* _dst = (uint8_t*)dst + offset + len; #define _SRC(limit, n) limit > n ? (const uint8_t*)src[region+n] + offset + len : NULL - unsigned region = 0; - if(regions >= interleave) do { + unsigned region; + for(region = 0; region + interleave <= regions; region += interleave) { muladd_pf( scratch, _dst, 1, interleave, (const uint8_t*)src[region] + offset + len, @@ -114,8 +123,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_muladd_multi(const void *HEDLEY_RESTRICT s _SRC(interleave,17), len, coefficients + region, 0, NULL ); - region += interleave; - } while(interleave <= regions - region); + } unsigned remaining = regions - region; HEDLEY_ASSUME(remaining < interleave); // doesn't seem to always work, so we have additional checks in the switch cases switch(remaining) { @@ -140,6 +148,89 @@ static HEDLEY_ALWAYS_INLINE void gf16_muladd_multi(const void *HEDLEY_RESTRICT s #undef _SRC } +static HEDLEY_ALWAYS_INLINE void gf16_muladd_multi_stridepf(const void *HEDLEY_RESTRICT scratch, fMuladdPF muladd_pf, const unsigned interleave, unsigned regions, size_t srcStride, void *HEDLEY_RESTRICT dst, 
const void *HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, const unsigned pfFactor, const void* HEDLEY_RESTRICT prefetch) { + uint8_t* _dst = (uint8_t*)dst + len; + uint8_t* srcEnd = (uint8_t*)src + len; + + size_t pfLen = len>>pfFactor; + const char* _pf = (const char*)prefetch + pfLen; + unsigned outputPfRounds = 1<<pfFactor; + + #define _SRC(limit, n) limit > n ? srcEnd + srcStride*n : NULL + unsigned region; + for(region = 0; region + interleave <= regions && outputPfRounds; region += interleave) { + muladd_pf( + scratch, _dst, 1, interleave, + srcEnd, + _SRC(interleave, 1), _SRC(interleave, 2), _SRC(interleave, 3), _SRC(interleave, 4), + _SRC(interleave, 5), _SRC(interleave, 6), _SRC(interleave, 7), _SRC(interleave, 8), + _SRC(interleave, 9), _SRC(interleave, 10), _SRC(interleave, 11), _SRC(interleave, 12), + _SRC(interleave,13), _SRC(interleave, 14), _SRC(interleave, 15), _SRC(interleave, 16), + _SRC(interleave,17), + len, coefficients + region, 1, _pf + ); + srcEnd += srcStride*interleave; + outputPfRounds--; + _pf += pfLen; + } + for(; region + interleave <= regions; region += interleave) { + muladd_pf( + scratch, _dst, 1, interleave, + srcEnd, + _SRC(interleave, 1), _SRC(interleave, 2), _SRC(interleave, 3), _SRC(interleave, 4), + _SRC(interleave, 5), _SRC(interleave, 6), _SRC(interleave, 7), _SRC(interleave, 8), + _SRC(interleave, 9), _SRC(interleave, 10), _SRC(interleave, 11), _SRC(interleave, 12), + _SRC(interleave,13), _SRC(interleave, 14), _SRC(interleave, 15), _SRC(interleave, 16), + _SRC(interleave,17), + len, coefficients + region, 0, NULL + ); + srcEnd += srcStride*interleave; + } + unsigned remaining = regions - region; + HEDLEY_ASSUME(remaining < interleave); // doesn't seem to always work, so we have additional checks in the switch cases + if(outputPfRounds) { + switch(remaining) { + #define CASE(x) \ + case x: \ + HEDLEY_ASSUME(x < interleave); \ + muladd_pf( \ + scratch, _dst, 1, x, \ + srcEnd, \ + _SRC(x, 1), _SRC(x, 2), _SRC(x, 3), _SRC(x, 4), \ + 
_SRC(x, 5), _SRC(x, 6), _SRC(x, 7), _SRC(x, 8), \ + _SRC(x, 9), _SRC(x, 10), _SRC(x, 11), _SRC(x, 12), \ + _SRC(x,13), _SRC(x, 14), _SRC(x, 15), _SRC(x, 16), \ + _SRC(x,17), \ + len, coefficients + region, 1, _pf \ + ); \ + break + REMAINING_CASES; + #undef CASE + default: break; + } + } else { + switch(remaining) { + #define CASE(x) \ + case x: \ + HEDLEY_ASSUME(x < interleave); \ + muladd_pf( \ + scratch, _dst, 1, x, \ + srcEnd, \ + _SRC(x, 1), _SRC(x, 2), _SRC(x, 3), _SRC(x, 4), \ + _SRC(x, 5), _SRC(x, 6), _SRC(x, 7), _SRC(x, 8), \ + _SRC(x, 9), _SRC(x, 10), _SRC(x, 11), _SRC(x, 12), \ + _SRC(x,13), _SRC(x, 14), _SRC(x, 15), _SRC(x, 16), \ + _SRC(x,17), \ + len, coefficients + region, 0, NULL \ + ); \ + break + REMAINING_CASES; + #undef CASE + default: break; + } + } + #undef _SRC +} static HEDLEY_ALWAYS_INLINE void gf16_muladd_multi_packed(const void *HEDLEY_RESTRICT scratch, fMuladdPF muladd_pf, const unsigned interleave, unsigned regionsPerCall, unsigned inputPackSize, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, size_t blockLen, const uint16_t *HEDLEY_RESTRICT coefficients) { ASSUME(regions <= inputPackSize); diff --git a/gf16/gf16_shuffle.h b/gf16/gf16_shuffle.h index 0344bcef..87f3fe05 100644 --- a/gf16/gf16_shuffle.h +++ b/gf16/gf16_shuffle.h @@ -26,6 +26,7 @@ FUNCS(avx512); // multi-region #define FUNCS(v) \ void gf16_shuffle_muladd_multi_##v(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); \ + void gf16_shuffle_muladd_multi_stridepf_##v(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t srcStride, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetch); \ void 
gf16_shuffle_muladd_multi_packed_##v(const void *HEDLEY_RESTRICT scratch, unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); \ void gf16_shuffle_muladd_multi_packpf_##v(const void *HEDLEY_RESTRICT scratch, unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut) @@ -93,6 +94,7 @@ extern int gf16_available_sve2; void gf16_shuffle2x_mul_##v(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ void gf16_shuffle2x_muladd_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ void gf16_shuffle2x_muladd_multi_##v(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); \ + void gf16_shuffle2x_muladd_multi_stridepf_##v(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t srcStride, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetch); \ void gf16_shuffle2x_muladd_multi_packed_##v(const void *HEDLEY_RESTRICT scratch, unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); \ void gf16_shuffle2x_muladd_multi_packpf_##v(const void *HEDLEY_RESTRICT scratch, unsigned packRegions, 
unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut) @@ -110,6 +112,7 @@ int gf16_shuffle2x_finish_partial_packsum_sve(void *HEDLEY_RESTRICT dst, void *H void gf16_shuffle2x_mul_128_sve2(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); void gf16_shuffle2x_muladd_128_sve2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); void gf16_shuffle2x_muladd_multi_128_sve2(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); +void gf16_shuffle2x_muladd_multi_stridepf_128_sve2(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t srcStride, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetch); void gf16_shuffle2x_muladd_multi_packed_128_sve2(const void *HEDLEY_RESTRICT scratch, unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); void gf16_shuffle2x_muladd_multi_packpf_128_sve2(const void *HEDLEY_RESTRICT scratch, unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut); diff --git a/gf16/gf16mul.cpp b/gf16/gf16mul.cpp 
index 690d4a88..292805b3 100644 --- a/gf16/gf16mul.cpp +++ b/gf16/gf16mul.cpp @@ -558,6 +558,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { #ifdef PLATFORM_AMD64 // if 32 registers are available, can do multi-region _mul_add_multi = &gf16_shuffle_muladd_multi_avx512; + _mul_add_multi_stridepf = &gf16_shuffle_muladd_multi_stridepf_avx512; _mul_add_multi_packed = &gf16_shuffle_muladd_multi_packed_avx512; _mul_add_multi_packpf = &gf16_shuffle_muladd_multi_packpf_avx512; add_multi_packed = &gf_add_multi_packed_v2i3_avx512; @@ -590,6 +591,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { add_multi = &gf_add_multi_avx512; #ifdef PLATFORM_AMD64 _mul_add_multi = &gf16_shuffle_muladd_multi_vbmi; + _mul_add_multi_stridepf = &gf16_shuffle_muladd_multi_stridepf_vbmi; _mul_add_multi_packed = &gf16_shuffle_muladd_multi_packed_vbmi; _mul_add_multi_packpf = &gf16_shuffle_muladd_multi_packpf_vbmi; add_multi_packed = &gf_add_multi_packed_v2i4_avx512; @@ -618,6 +620,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { add_multi = &gf_add_multi_avx512; #ifdef PLATFORM_AMD64 _mul_add_multi = &gf16_shuffle2x_muladd_multi_avx512; + _mul_add_multi_stridepf = &gf16_shuffle2x_muladd_multi_stridepf_avx512; _mul_add_multi_packed = &gf16_shuffle2x_muladd_multi_packed_avx512; _mul_add_multi_packpf = &gf16_shuffle2x_muladd_multi_packpf_avx512; add_multi_packed = &gf_add_multi_packed_v1i6_avx512; @@ -646,6 +649,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { add_multi = &gf_add_multi_avx2; #ifdef PLATFORM_AMD64 _mul_add_multi = &gf16_shuffle2x_muladd_multi_avx2; + _mul_add_multi_stridepf = &gf16_shuffle2x_muladd_multi_stridepf_avx2; _mul_add_multi_packed = &gf16_shuffle2x_muladd_multi_packed_avx2; _mul_add_multi_packpf = &gf16_shuffle2x_muladd_multi_packpf_avx2; add_multi_packed = &gf_add_multi_packed_v1i2_avx2; @@ -677,6 +681,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { #ifdef __aarch64__ // enable only if 32 registers available 
_mul_add_multi = &gf16_shuffle_muladd_multi_neon; + _mul_add_multi_stridepf = &gf16_shuffle_muladd_multi_stridepf_neon; _mul_add_multi_packed = &gf16_shuffle_muladd_multi_packed_neon; // TODO: on Cortex A53, prefetching seems to be slower, so disabled for now //_mul_add_multi_packpf = &gf16_shuffle_muladd_multi_packpf_neon; @@ -700,6 +705,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { _mul = &gf16_clmul_mul_neon; _mul_add = &gf16_clmul_muladd_neon; _mul_add_multi = &gf16_clmul_muladd_multi_neon; + _mul_add_multi_stridepf = &gf16_clmul_muladd_multi_stridepf_neon; _mul_add_multi_packed = &gf16_clmul_muladd_multi_packed_neon; add_multi = &gf_add_multi_neon; add_multi_packed = &gf_add_multi_packed_clmul_neon; @@ -724,6 +730,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { _mul = &gf16_shuffle_mul_128_sve; _mul_add = &gf16_shuffle_muladd_128_sve; _mul_add_multi = &gf16_shuffle_muladd_multi_128_sve; + _mul_add_multi_stridepf = &gf16_shuffle_muladd_multi_stridepf_128_sve; _mul_add_multi_packed = &gf16_shuffle_muladd_multi_packed_128_sve; //_mul_add_multi_packpf = &gf16_shuffle_muladd_multi_packpf_128_sve; add_multi = &gf_add_multi_sve; @@ -745,6 +752,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { _mul = &gf16_shuffle_mul_128_sve2; _mul_add = &gf16_shuffle_muladd_128_sve2; _mul_add_multi = &gf16_shuffle_muladd_multi_128_sve2; + _mul_add_multi_stridepf = &gf16_shuffle_muladd_multi_stridepf_128_sve2; _mul_add_multi_packed = &gf16_shuffle_muladd_multi_packed_128_sve2; //_mul_add_multi_packpf = &gf16_shuffle_muladd_multi_packpf_128_sve2; add_multi = &gf_add_multi_sve2; @@ -766,6 +774,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { _mul = &gf16_shuffle2x_mul_128_sve2; _mul_add = &gf16_shuffle2x_muladd_128_sve2; _mul_add_multi = &gf16_shuffle2x_muladd_multi_128_sve2; + _mul_add_multi_stridepf = &gf16_shuffle2x_muladd_multi_stridepf_128_sve2; _mul_add_multi_packed = &gf16_shuffle2x_muladd_multi_packed_128_sve2; 
//_mul_add_multi_packpf = &gf16_shuffle2x_muladd_multi_packpf_128_sve2; add_multi = &gf_add_multi_sve2; @@ -790,6 +799,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { _mul = &gf16_shuffle_mul_512_sve2; _mul_add = &gf16_shuffle_muladd_512_sve2; _mul_add_multi = &gf16_shuffle_muladd_multi_512_sve2; + _mul_add_multi_stridepf = &gf16_shuffle_muladd_multi_stridepf_512_sve2; _mul_add_multi_packed = &gf16_shuffle_muladd_multi_packed_512_sve2; //_mul_add_multi_packpf = &gf16_shuffle_muladd_multi_packpf_512_sve2; add_multi = &gf_add_multi_sve2; @@ -811,6 +821,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { _mul = &gf16_clmul_mul_sve2; _mul_add = &gf16_clmul_muladd_sve2; _mul_add_multi = &gf16_clmul_muladd_multi_sve2; + _mul_add_multi_stridepf = &gf16_clmul_muladd_multi_stridepf_sve2; _mul_add_multi_packed = &gf16_clmul_muladd_multi_packed_sve2; //_mul_add_multi_packpf = &gf16_clmul_muladd_multi_packpf_sve2; add_multi = &gf_add_multi_sve2; @@ -835,6 +846,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { add_multi = &gf_add_multi_avx512; #ifdef PLATFORM_AMD64 _mul_add_multi = &gf16_affine_muladd_multi_avx512; + _mul_add_multi_stridepf = &gf16_affine_muladd_multi_stridepf_avx512; _mul_add_multi_packed = &gf16_affine_muladd_multi_packed_avx512; _mul_add_multi_packpf = &gf16_affine_muladd_multi_packpf_avx512; add_multi_packed = &gf_add_multi_packed_v2i6_avx512; @@ -865,6 +877,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { add_multi = &gf_add_multi_avx2; #ifdef PLATFORM_AMD64 _mul_add_multi = &gf16_affine_muladd_multi_avx2; + _mul_add_multi_stridepf = &gf16_affine_muladd_multi_stridepf_avx2; _mul_add_multi_packed = &gf16_affine_muladd_multi_packed_avx2; _mul_add_multi_packpf = &gf16_affine_muladd_multi_packpf_avx2; add_multi_packed = &gf_add_multi_packed_v2i3_avx2; @@ -895,6 +908,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { add_multi = &gf_add_multi_sse2; #ifdef PLATFORM_AMD64 _mul_add_multi = 
&gf16_affine_muladd_multi_gfni; + _mul_add_multi_stridepf = &gf16_affine_muladd_multi_stridepf_gfni; _mul_add_multi_packed = &gf16_affine_muladd_multi_packed_gfni; _mul_add_multi_packpf = &gf16_affine_muladd_multi_packpf_gfni; add_multi_packed = &gf_add_multi_packed_v2i3_sse2; @@ -922,6 +936,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { _mul = &gf16_affine2x_mul_avx512; _mul_add = &gf16_affine2x_muladd_avx512; _mul_add_multi = &gf16_affine2x_muladd_multi_avx512; + _mul_add_multi_stridepf = &gf16_affine2x_muladd_multi_stridepf_avx512; _mul_add_multi_packed = &gf16_affine2x_muladd_multi_packed_avx512; _mul_add_multi_packpf = &gf16_affine2x_muladd_multi_packpf_avx512; add_multi = &gf_add_multi_avx512; @@ -951,6 +966,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { _mul = &gf16_affine2x_mul_avx2; _mul_add = &gf16_affine2x_muladd_avx2; _mul_add_multi = &gf16_affine2x_muladd_multi_avx2; + _mul_add_multi_stridepf = &gf16_affine2x_muladd_multi_stridepf_avx2; _mul_add_multi_packed = &gf16_affine2x_muladd_multi_packed_avx2; _mul_add_multi_packpf = &gf16_affine2x_muladd_multi_packpf_avx2; add_multi = &gf_add_multi_avx2; @@ -980,6 +996,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { _mul = &gf16_affine2x_mul_gfni; _mul_add = &gf16_affine2x_muladd_gfni; _mul_add_multi = &gf16_affine2x_muladd_multi_gfni; + _mul_add_multi_stridepf = &gf16_affine2x_muladd_multi_stridepf_gfni; _mul_add_multi_packed = &gf16_affine2x_muladd_multi_packed_gfni; _mul_add_multi_packpf = &gf16_affine2x_muladd_multi_packpf_gfni; add_multi = &gf_add_multi_sse2; @@ -1171,6 +1188,7 @@ Galois16Mul::Galois16Mul(Galois16Methods method) { add_multi_packed = &gf_add_multi_packed_generic; add_multi_packpf = &gf_add_multi_packpf_generic; _mul_add_multi = NULL; + _mul_add_multi_stridepf = NULL; _mul_add_multi_packed = NULL; _mul_add_multi_packpf = NULL; copy_cksum = &gf16_cksum_copy_generic; @@ -1208,6 +1226,7 @@ void Galois16Mul::move(Galois16Mul& other) { _mul_add = 
other._mul_add; _mul_add_pf = other._mul_add_pf; _mul_add_multi = other._mul_add_multi; + _mul_add_multi_stridepf = other._mul_add_multi_stridepf; _mul_add_multi_packed = other._mul_add_multi_packed; _mul_add_multi_packpf = other._mul_add_multi_packpf; _pow = other._pow; diff --git a/gf16/gf16mul.h b/gf16/gf16mul.h index 8b3223c0..53540b26 100644 --- a/gf16/gf16mul.h +++ b/gf16/gf16mul.h @@ -23,6 +23,7 @@ typedef void(*Galois16MulRstFunc) (const void *HEDLEY_RESTRICT scratch, void *HE typedef void(*Galois16MulPfFunc) (const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch, const void *HEDLEY_RESTRICT prefetch); typedef void(*Galois16PowFunc) (const void *HEDLEY_RESTRICT scratch, unsigned outputs, size_t offset, void **HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); typedef void(*Galois16MulMultiFunc) (const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); +typedef void(*Galois16MulStridePfFunc) (const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t srcStride, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void *HEDLEY_RESTRICT prefetch); typedef void(*Galois16MulPackedFunc) (const void *HEDLEY_RESTRICT scratch, unsigned packedRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); typedef void(*Galois16MulPackPfFunc) (const void *HEDLEY_RESTRICT scratch, unsigned packedRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const 
uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut); typedef void(*Galois16AddFunc) (void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len); @@ -118,6 +119,7 @@ class Galois16Mul { Galois16PowFunc _pow; Galois16PowFunc _pow_add; Galois16MulMultiFunc _mul_add_multi; + Galois16MulStridePfFunc _mul_add_multi_stridepf; Galois16MulPackedFunc _mul_add_multi_packed; Galois16MulPackPfFunc _mul_add_multi_packpf; @@ -241,6 +243,17 @@ class Galois16Mul { _mul_add(scratch, dst, src, len, coefficient, mutScratch); } + inline void mul_add_pf(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch, const void *HEDLEY_RESTRICT prefetch) const { + assert(isMultipleOfStride(len)); + assert(len > 0); + + if(HEDLEY_UNLIKELY(coefficient == 0)) return; + if(_mul_add_pf) + _mul_add_pf(scratch, dst, src, len, coefficient, mutScratch, prefetch); + else + _mul_add(scratch, dst, src, len, coefficient, mutScratch); + } + inline void pow(unsigned outputs, size_t offset, void **HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) const { assert(isMultipleOfStride(len)); assert(len > 0); @@ -294,6 +307,22 @@ class Galois16Mul { } } + inline void mul_add_multi_stridepf(unsigned regions, size_t srcStride, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetch) const { + assert(isMultipleOfStride(len)); + assert(len > 0); + assert(srcStride > 0); + assert(regions > 0); + + if(_mul_add_multi_stridepf) + _mul_add_multi_stridepf(scratch, regions, srcStride, dst, src, len, coefficients, mutScratch, prefetch); + else { + // TODO: _mul_add_pf fallback; _mul_add_multi shouldn't be set (exception: XorJit AVX512) + 
for(unsigned region = 0; region 0); diff --git a/gf16/gfmat_inv.cpp b/gf16/gfmat_inv.cpp index eab26279..ad72e830 100644 --- a/gf16/gfmat_inv.cpp +++ b/gf16/gfmat_inv.cpp @@ -15,8 +15,10 @@ int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, unsigned in uint16_t baseCoeff; uint16_t coeff[rows]; + #define MAT_ROW(r) (mat + (r) * (stride / sizeof(uint16_t))) + void* srcRows[rows]; - srcRows[0] = mat + rec * (stride / sizeof(uint16_t)); + srcRows[0] = MAT_ROW(rec); for(unsigned i=1; i= 2) { // multiply-add to the next row MULADD_ROW(srcRows[1], 0); // scale it, and multiply-add back SCALE_ROW(1); - MULADD_ROW(srcRows[0], 1); + if(rows > 2) { + MULADD_ROW_PF(srcRows[0], 1, srcRows[2]); + } else MULADD_LASTROW(srcRows[0], 1) + } else { + if(rec2 >= invalidCount) + return -1; } if(rows >= 3) { - MULADD_MULTI_ROW(srcRows[2], 0, 2); - SCALE_ROW(2); if(rows >= 4) { + MULADD_MULTI_ROW_PF(srcRows[2], 0, 2, srcRows[3]); + SCALE_ROW(2); MULADD_MULTI_ROW(srcRows[3], 0, 2); MULADD_ROW(srcRows[3], 2); SCALE_ROW(3); MULADD_ROW(srcRows[2], 3); MULADD_MULTI_ROW(srcRows[0], 2, 2); - MULADD_MULTI_ROW(srcRows[1], 2, 2); + if(rows > 4) { + MULADD_MULTI_ROW_PF(srcRows[1], 2, 2, srcRows[4]); + } else MULADD_MULTI_LASTROW(srcRows[1], 2, 2) } else { + MULADD_MULTI_ROW(srcRows[2], 0, 2); + SCALE_ROW(2); MULADD_ROW(srcRows[0], 2); - MULADD_ROW(srcRows[1], 2); + MULADD_LASTROW(srcRows[1], 2) } } if(rows >= 5) { - MULADD_MULTI_ROW(srcRows[4], 0, 4); - SCALE_ROW(4); if(rows >= 6) { + MULADD_MULTI_ROW_PF(srcRows[4], 0, 4, srcRows[5]); + SCALE_ROW(4); MULADD_MULTI_ROW(srcRows[5], 0, 4); MULADD_ROW(srcRows[5], 4); SCALE_ROW(5); MULADD_ROW(srcRows[4], 5); - for(unsigned rec2 = 0; rec2 < 4; rec2++) { - MULADD_MULTI_ROW(srcRows[rec2], 4, 2); + for(unsigned r = 0; r < 3; r++) { + MULADD_MULTI_ROW(srcRows[r], 4, 2); } + MULADD_MULTI_LASTROW(srcRows[3], 4, 2) } else { - for(unsigned rec2 = 0; rec2 < 4; rec2++) { - MULADD_ROW(srcRows[rec2], 4); + MULADD_MULTI_ROW(srcRows[4], 0, 4); + 
SCALE_ROW(4); + for(unsigned r = 0; r < 3; r++) { + MULADD_ROW(srcRows[r], 4); } + MULADD_LASTROW(srcRows[3], 4) } } - for(unsigned rec2 = 0; rec2 < invalidCount; rec2++) { - if(HEDLEY_UNLIKELY(rec2 >= rec && rec2 < rec+rows)) continue; - uint16_t* row2 = mat + rec2 * (stride / sizeof(uint16_t)); + // do main elimination, using the source group + while(1) { + uint16_t* row2 = MAT_ROW(rec2); + rec2++; + if(HEDLEY_UNLIKELY(rec2 == rec)) + rec2 += rows; if(rows > 1) { - MULADD_MULTI_ROW(row2, 0, rows); + MULADD_MULTI_LASTROW(row2, 0, rows) } else { - MULADD_ROW(row2, 0); + MULADD_LASTROW(row2, 0) } } + #undef MAT_ROW #undef SCALE_ROW #undef MULADD_ROW + #undef MULADD_ROW_PF #undef MULADD_MULTI_ROW - - return -1; + #undef MULADD_MULTI_ROW_PF + #undef MULADD_LASTROW + #undef MULADD_MULTI_LASTROW } bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned validCount, std::vector& recovery, std::function progressCb) { From 2e73db2d7665dae48f835cdbafeda5d7cd30d1c2 Mon Sep 17 00:00:00 2001 From: animetosho Date: Sat, 10 Jun 2023 18:55:05 +1000 Subject: [PATCH 14/91] Faster matrix construction for inversion + prefetch fallback for mul_add_multi_stridepf --- gf16/gf16mul.h | 21 +++++++++++++++------ gf16/gfmat_inv.cpp | 45 +++++++++++++++++++++++++++++++++++---------- 2 files changed, 50 insertions(+), 16 deletions(-) diff --git a/gf16/gf16mul.h b/gf16/gf16mul.h index 53540b26..b1e91b7b 100644 --- a/gf16/gf16mul.h +++ b/gf16/gf16mul.h @@ -313,13 +313,22 @@ class Galois16Mul { assert(srcStride > 0); assert(regions > 0); - if(_mul_add_multi_stridepf) + if(_mul_add_multi_stridepf) { _mul_add_multi_stridepf(scratch, regions, srcStride, dst, src, len, coefficients, mutScratch, prefetch); - else { - // TODO: _mul_add_pf fallback; _mul_add_multi shouldn't be set (exception: XorJit AVX512) - for(unsigned region = 0; region>_info.prefetchDownscale; + const char* _pf = (const char*)prefetch; + for(unsigned outputPfRounds = 1<<_info.prefetchDownscale; region& inputValid, 
unsigned va unsigned invalidCount = inputValid.size() - validCount; assert(validCount < inputValid.size()); // i.e. invalidCount > 0 - assert(inputValid.size() <= 32768); - assert(recovery.size() <= 65535); + assert(inputValid.size() <= 32768 && inputValid.size() > 0); + assert(recovery.size() <= 65535 && recovery.size() > 0); ALIGN_ALLOC(mat, invalidCount * stride, gfInfo.alignment); @@ -190,11 +190,36 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va // generate matrix validCol = 0; missingCol = validCount; - for(unsigned input = 0; input < inputValid.size(); input++) { - uint16_t inputLog = gfmat_input_log(input); - unsigned targetCol = inputValid.at(input) ? validCol++ : missingCol++; - for(unsigned rec = 0; rec < invalidCount; rec++) { - mat[rec * stride16 + targetCol] = gfmat_coeff_from_log(inputLog, recovery.at(rec)); + unsigned rec, recStart = 0; + if(recovery.at(0) == 0) { // first recovery has exponent 0 is a common case + for(unsigned input = 0; input < inputValid.size(); input++) { + mat[input] = 1; + } + recStart++; + } + { + unsigned input = 0; + const unsigned GROUP_AMOUNT = 4; + for(; input + GROUP_AMOUNT <= inputValid.size(); input+=GROUP_AMOUNT) { + uint16_t inputLog[GROUP_AMOUNT]; + unsigned targetCol[GROUP_AMOUNT]; + for(unsigned i=0; i& inputValid, unsigned va if(progressCb) progressCb(1, totalProgress); progressOffset = 2; - for(unsigned rec = 0; rec < invalidCount; rec++) { + for(rec = 0; rec < invalidCount; rec++) { uint16_t* row = mat + rec * stride16; //memset(row + matWidth, 0, stride - matWidth); // not necessary, but do this to avoid uninitialized memory gf.prepare(row, row, stride); @@ -214,7 +239,7 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va // invert // TODO: optimise: multi-thread + packed arrangement - unsigned rec = 0; + rec = 0; #define INVERT_GROUP(rows) \ if(gfInfo.idealInputMultiple >= rows && invalidCount >= rows) { \ for(; rec <= invalidCount-rows; rec+=rows) { \ 
@@ -241,7 +266,7 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va if(gf.needPrepare()) { if(progressCb) progressCb(totalProgress-1, totalProgress); - for(unsigned rec = 0; rec < invalidCount; rec++) { + for(rec = 0; rec < invalidCount; rec++) { uint16_t* row = mat + rec * stride16; gf.finish(row, stride); From cddb4663dbe6b650e551a1e081240034f953206d Mon Sep 17 00:00:00 2001 From: animetosho Date: Sun, 11 Jun 2023 11:36:17 +1000 Subject: [PATCH 15/91] Construct matrix for inversion via multiplication, if possible Point multiplication via SIMD is often faster than lookups for exponentiation --- gf16/gf16pmul.cpp | 28 +++++++ gf16/gf16pmul.h | 20 +++++ gf16/gf16pmul_clmul_sse.c | 90 ++++++++++++++++++++ gf16/gfmat_inv.cpp | 169 +++++++++++++++++++++++++++----------- gf16/gfmat_inv.h | 1 + 5 files changed, 258 insertions(+), 50 deletions(-) create mode 100644 gf16/gf16pmul.cpp create mode 100644 gf16/gf16pmul.h create mode 100644 gf16/gf16pmul_clmul_sse.c diff --git a/gf16/gf16pmul.cpp b/gf16/gf16pmul.cpp new file mode 100644 index 00000000..b9bb3b24 --- /dev/null +++ b/gf16/gf16pmul.cpp @@ -0,0 +1,28 @@ +#include "gf16pmul.h" +#include "../src/cpuid.h" + +Gf16PMulFunc gf16pmul = nullptr; +size_t gf16pmul_alignment = 1; +size_t gf16pmul_blocklen = 1; + +void setup_pmul() { + gf16pmul = nullptr; + gf16pmul_alignment = 1; + gf16pmul_blocklen = 1; + + // CPU detection +#ifdef PLATFORM_X86 + int cpuInfo[4]; + _cpuid(cpuInfo, 1); + bool hasClMul = ((cpuInfo[2] & 0x80202) == 0x80202); // SSE4.1 + SSSE3 + CLMUL + if(hasClMul && gf16pmul_clmul_sse_available) { + gf16pmul = &gf16pmul_clmul_sse; + gf16pmul_alignment = 16; + gf16pmul_blocklen = 16; + } else + gf16pmul_clmul_sse_available = 0; +#endif + +#ifdef PLATFORM_ARM +#endif +} diff --git a/gf16/gf16pmul.h b/gf16/gf16pmul.h new file mode 100644 index 00000000..a7819721 --- /dev/null +++ b/gf16/gf16pmul.h @@ -0,0 +1,20 @@ +#ifndef __GF16PMUL_H__ +#define __GF16PMUL_H__ + +#include 
"../src/hedley.h" +#include + +// TODO: consider multi-dest +typedef void(*Gf16PMulFunc)(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len); +extern Gf16PMulFunc gf16pmul; +extern size_t gf16pmul_alignment; +extern size_t gf16pmul_blocklen; + +void setup_pmul(); + +HEDLEY_BEGIN_C_DECLS +void gf16pmul_clmul_sse(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len); +extern int gf16pmul_clmul_sse_available; +HEDLEY_END_C_DECLS + +#endif // defined(__GF16PMUL_H__) diff --git a/gf16/gf16pmul_clmul_sse.c b/gf16/gf16pmul_clmul_sse.c new file mode 100644 index 00000000..d7fde702 --- /dev/null +++ b/gf16/gf16pmul_clmul_sse.c @@ -0,0 +1,90 @@ +#include "gf16_global.h" + +#if defined(__PCLMUL__) && defined(__SSSE3__) && defined(__SSE4_1__) +int gf16pmul_clmul_sse_available = 1; + +void gf16pmul_clmul_sse(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { + assert(len % sizeof(__m128i) == 0); + + const uint8_t* _src1 = (const uint8_t*)src1 + len; + const uint8_t* _src2 = (const uint8_t*)src2 + len; + uint8_t* _dst = (uint8_t*)dst + len; + + __m128i wordMask = _mm_set1_epi32(0xffff); + __m128i shufLoHi = _mm_set_epi16(0x0f0e, 0x0b0a, 0x0706, 0x0302, 0x0d0c, 0x0908, 0x0504, 0x0100); + for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(__m128i)) { + __m128i data1 = _mm_load_si128((__m128i*)(_src1 + ptr)); + __m128i data2 = _mm_load_si128((__m128i*)(_src2 + ptr)); + + // do multiply + __m128i data1Even = _mm_and_si128(wordMask, data1); + __m128i data1Odd = _mm_andnot_si128(wordMask, data1); + __m128i data2Even = _mm_and_si128(wordMask, data2); + __m128i data2Odd = _mm_andnot_si128(wordMask, data2); + __m128i prod1Even = _mm_clmulepi64_si128(data1Even, data2Even, 0x00); + __m128i prod2Even = _mm_clmulepi64_si128(data1Even, data2Even, 0x11); + __m128i prod1Odd = _mm_clmulepi64_si128(data1Odd, data2Odd, 0x00); + __m128i prod2Odd = _mm_clmulepi64_si128(data1Odd, data2Odd, 0x11); + __m128i prod1 = 
_mm_blend_epi16(prod1Even, prod1Odd, 0xCC); + __m128i prod2 = _mm_blend_epi16(prod2Even, prod2Odd, 0xCC); + + // do reduction + /* obvious Barret reduction strategy, using CLMUL instructions + const __m128i barretConst = _mm_set_epi32(0, 0x1100b, 0, 0x1111a); + + __m128i quot1 = _mm_srli_epi32(prod1, 16); + __m128i quot2 = _mm_srli_epi32(prod2, 16); + __m128i quot11 = _mm_clmulepi64_si128(quot1, barretConst, 0x00); + __m128i quot12 = _mm_clmulepi64_si128(quot1, barretConst, 0x01); + __m128i quot21 = _mm_clmulepi64_si128(quot2, barretConst, 0x00); + __m128i quot22 = _mm_clmulepi64_si128(quot2, barretConst, 0x01); + quot1 = _mm_unpacklo_epi64(quot11, quot12); + quot2 = _mm_unpacklo_epi64(quot21, quot22); + + quot1 = _mm_srli_epi32(quot1, 16); + quot2 = _mm_srli_epi32(quot2, 16); + quot11 = _mm_clmulepi64_si128(quot1, barretConst, 0x10); + quot12 = _mm_clmulepi64_si128(quot1, barretConst, 0x11); + quot21 = _mm_clmulepi64_si128(quot2, barretConst, 0x10); + quot22 = _mm_clmulepi64_si128(quot2, barretConst, 0x11); + quot1 = _mm_unpacklo_epi64(quot11, quot12); + quot2 = _mm_unpacklo_epi64(quot21, quot22); + + quot1 = _mm_xor_si128(quot1, prod1); + quot2 = _mm_xor_si128(quot2, prod2); + + __m128i result = _mm_packus_epi32( + _mm_and_si128(wordMask, quot1), + _mm_and_si128(wordMask, quot2) + ); + */ + + // since there aren't that many bits in the Barret constants, doing manual shift+xor is more efficient + // split low/high 16-bit parts + __m128i tmp1 = _mm_shuffle_epi8(prod1, shufLoHi); + __m128i tmp2 = _mm_shuffle_epi8(prod2, shufLoHi); + __m128i rem = _mm_unpacklo_epi64(tmp1, tmp2); + __m128i quot = _mm_unpackhi_epi64(tmp1, tmp2); + + // multiply by 0x1111a (or rather, 0x11118, since the '2' bit doesn't matter due to the product being at most 31 bits) and retain high half + tmp1 = _mm_xor_si128(quot, _mm_srli_epi16(quot, 4)); + tmp1 = _mm_xor_si128(tmp1, _mm_srli_epi16(tmp1, 8)); + quot = _mm_xor_si128(tmp1, _mm_srli_epi16(quot, 13)); + + // multiply by 0x100b, retain 
low half + tmp1 = _mm_xor_si128(quot, _mm_slli_epi16(quot, 3)); + tmp1 = _mm_xor_si128(tmp1, _mm_add_epi16(quot, quot)); + quot = _mm_xor_si128(tmp1, _mm_slli_epi16(quot, 12)); + + __m128i result = _mm_xor_si128(quot, rem); + + _mm_store_si128((__m128i*)(_dst + ptr), result); + } +} + +#else +int gf16pmul_clmul_sse_available = 0; +void gf16pmul_clmul_sse(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { + UNUSED(dst); UNUSED(src1); UNUSED(src2); UNUSED(len); +} +#endif diff --git a/gf16/gfmat_inv.cpp b/gf16/gfmat_inv.cpp index 7e8077cc..fc2c97d1 100644 --- a/gf16/gfmat_inv.cpp +++ b/gf16/gfmat_inv.cpp @@ -1,5 +1,7 @@ #include "gfmat_coeff.h" #include "gfmat_inv.h" +#include "gf16pmul.h" +#include #ifdef PARPAR_INVERT_SUPPORT extern "C" uint16_t* gf16_recip; @@ -155,8 +157,107 @@ int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, unsigned in #undef MULADD_MULTI_LASTROW } + +// construct initial matrix (pre-inversion) +void Galois16RecMatrix::Construct(const std::vector& inputValid, unsigned validCount, const std::vector& recovery) { + unsigned validCol = 0; + unsigned missingCol = validCount; + unsigned recStart = 0; + unsigned stride16 = stride / sizeof(uint16_t); + unsigned invalidCount = inputValid.size() - validCount; + if(recovery.at(0) == 0) { // first recovery having exponent 0 is a common case + for(unsigned input = 0; input < inputValid.size(); input++) { + mat[input] = 1; + } + recStart++; + } + if(recStart >= recovery.size()) return; + + + unsigned input = 0; + const unsigned GROUP_AMOUNT = 4; + #define CONSTRUCT_VIA_EXP(loopcond) \ + for(; input + GROUP_AMOUNT <= inputValid.size(); input+=GROUP_AMOUNT) { \ + uint16_t inputLog[GROUP_AMOUNT]; \ + unsigned targetCol[GROUP_AMOUNT]; \ + for(unsigned i=0; i recSkips; + recSkips.reserve(invalidCount); + recSkips.push_back(recStart); + unsigned maxSkips = invalidCount/2; // TODO: tune threshold + uint16_t lastExp = 1; + for(unsigned rec = recStart+1; rec < 
invalidCount; rec++) { + uint16_t exp = recovery.at(rec); + if(exp != lastExp+1) { + recSkips.push_back(rec); + if(recSkips.size() >= maxSkips) break; + } + lastExp = exp; + } + + if(recSkips.size() < maxSkips) { + // not many gaps - use the strategy of filling these gaps first... + CONSTRUCT_VIA_EXP(uint16_t rec : recSkips); + + // ...then compute most of the rows via multiplication + lastExp = 1; + uint16_t* src1 = mat + recStart * stride16; + for(unsigned rec = recStart+1; rec < invalidCount; rec++) { + uint16_t exp = recovery.at(rec); + bool skip = (exp != lastExp+1); + lastExp = exp; + if(skip) continue; + + gf16pmul(mat + rec * stride16, src1, mat + (rec-1) * stride16, stride); + } + + return; + } + } + } + + CONSTRUCT_VIA_EXP(unsigned rec = recStart; rec < invalidCount; rec++); + #undef CONSTRUCT_VIA_EXP +} + bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned validCount, std::vector& recovery, std::function progressCb) { - if(mat) ALIGN_FREE(mat); + unsigned invalidCount = inputValid.size() - validCount; + assert(validCount < inputValid.size()); // i.e. invalidCount > 0 + assert(inputValid.size() <= 32768 && inputValid.size() > 0); + assert(recovery.size() <= 65535 && recovery.size() > 0); + + if(invalidCount > recovery.size()) return false; + unsigned matWidth = inputValid.size() * sizeof(uint16_t); Galois16Mul gf(Galois16Mul::default_method(matWidth, inputValid.size(), inputValid.size(), true)); @@ -164,19 +265,23 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va const auto gfInfo = gf.info(); void* gfScratch = gf.mutScratch_alloc(); - unsigned invalidCount = inputValid.size() - validCount; - assert(validCount < inputValid.size()); // i.e. 
invalidCount > 0 - assert(inputValid.size() <= 32768 && inputValid.size() > 0); - assert(recovery.size() <= 65535 && recovery.size() > 0); - + if(mat) ALIGN_FREE(mat); ALIGN_ALLOC(mat, invalidCount * stride, gfInfo.alignment); - unsigned validCol, missingCol; unsigned stride16 = stride / sizeof(uint16_t); assert(stride16 * sizeof(uint16_t) == stride); uint16_t totalProgress = invalidCount + (gf.needPrepare() ? 3 : 1); // provision for prepare/finish/init-calc + // easier to handle if exponents are in order + std::sort(recovery.begin(), recovery.end()); + + static bool pmulInit = false; + if(!pmulInit) { + pmulInit = true; + setup_pmul(); + } + invert_loop: { // loop, in the unlikely case we hit the PAR2 un-invertability flaw; TODO: is there a faster way than just retrying? if(invalidCount > recovery.size()) { // not enough recovery gf.mutScratch_free(gfScratch); @@ -186,43 +291,7 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va } if(progressCb) progressCb(0, totalProgress); - - // generate matrix - validCol = 0; - missingCol = validCount; - unsigned rec, recStart = 0; - if(recovery.at(0) == 0) { // first recovery has exponent 0 is a common case - for(unsigned input = 0; input < inputValid.size(); input++) { - mat[input] = 1; - } - recStart++; - } - { - unsigned input = 0; - const unsigned GROUP_AMOUNT = 4; - for(; input + GROUP_AMOUNT <= inputValid.size(); input+=GROUP_AMOUNT) { - uint16_t inputLog[GROUP_AMOUNT]; - unsigned targetCol[GROUP_AMOUNT]; - for(unsigned i=0; i& inputValid, unsigned va if(progressCb) progressCb(1, totalProgress); progressOffset = 2; - for(rec = 0; rec < invalidCount; rec++) { - uint16_t* row = mat + rec * stride16; + for(unsigned r = 0; r < invalidCount; r++) { + uint16_t* row = mat + r * stride16; //memset(row + matWidth, 0, stride - matWidth); // not necessary, but do this to avoid uninitialized memory gf.prepare(row, row, stride); } @@ -239,7 +308,7 @@ bool Galois16RecMatrix::Compute(const std::vector& 
inputValid, unsigned va // invert // TODO: optimise: multi-thread + packed arrangement - rec = 0; + unsigned rec = 0; #define INVERT_GROUP(rows) \ if(gfInfo.idealInputMultiple >= rows && invalidCount >= rows) { \ for(; rec <= invalidCount-rows; rec+=rows) { \ @@ -266,15 +335,15 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va if(gf.needPrepare()) { if(progressCb) progressCb(totalProgress-1, totalProgress); - for(rec = 0; rec < invalidCount; rec++) { - uint16_t* row = mat + rec * stride16; + for(unsigned r = 0; r < invalidCount; r++) { + uint16_t* row = mat + r * stride16; gf.finish(row, stride); /* // check for zeroes; TODO: does this need to be the full row? for(unsigned col = validCount; col < inputValid.size(); col++) { if(HEDLEY_UNLIKELY(row[col] == 0)) { // bad coeff - recovery.erase(recovery.begin() + rec); + recovery.erase(recovery.begin() + r); goto invert_loop; } } diff --git a/gf16/gfmat_inv.h b/gf16/gfmat_inv.h index ee29911c..b5290900 100644 --- a/gf16/gfmat_inv.h +++ b/gf16/gfmat_inv.h @@ -11,6 +11,7 @@ class Galois16RecMatrix { uint16_t* mat; unsigned stride; + void Construct(const std::vector& inputValid, unsigned validCount, const std::vector& recovery); template int processRow(unsigned rec, unsigned validCount, unsigned invalidCount, Galois16Mul& gf, void* gfScratch); public: From 9686b903498d04dc7cffd8db801ba526fb65254a Mon Sep 17 00:00:00 2001 From: animetosho Date: Sun, 11 Jun 2023 21:01:46 +1000 Subject: [PATCH 16/91] Add AVX2+VPCLMUL+GFNI variants for PointMultiply --- gf16/gf16pmul.cpp | 43 ++++++- gf16/gf16pmul.h | 12 +- gf16/gf16pmul_clmul_avx2.c | 12 ++ gf16/gf16pmul_clmul_sse.c | 90 -------------- gf16/gf16pmul_clmul_vpclgfni.c | 15 +++ gf16/gf16pmul_clmul_vpclmul.c | 14 +++ gf16/gf16pmul_clmul_x86.h | 213 +++++++++++++++++++++++++++++++++ 7 files changed, 304 insertions(+), 95 deletions(-) create mode 100644 gf16/gf16pmul_clmul_avx2.c delete mode 100644 gf16/gf16pmul_clmul_sse.c create mode 100644 
gf16/gf16pmul_clmul_vpclgfni.c create mode 100644 gf16/gf16pmul_clmul_vpclmul.c create mode 100644 gf16/gf16pmul_clmul_x86.h diff --git a/gf16/gf16pmul.cpp b/gf16/gf16pmul.cpp index b9bb3b24..0e1e99a9 100644 --- a/gf16/gf16pmul.cpp +++ b/gf16/gf16pmul.cpp @@ -13,14 +13,51 @@ void setup_pmul() { // CPU detection #ifdef PLATFORM_X86 int cpuInfo[4]; + int cpuInfoX[4]; _cpuid(cpuInfo, 1); bool hasClMul = ((cpuInfo[2] & 0x80202) == 0x80202); // SSE4.1 + SSSE3 + CLMUL - if(hasClMul && gf16pmul_clmul_sse_available) { + bool hasAVX2 = false, hasVPCLMUL = false, hasGFNI = false; + +#if !defined(_MSC_VER) || _MSC_VER >= 1600 + _cpuidX(cpuInfoX, 7, 0); + if((cpuInfo[2] & 0x1C000000) == 0x1C000000) { // has AVX + OSXSAVE + XSAVE + int xcr = _GET_XCR() & 0xff; + if((xcr & 6) == 6) { // AVX enabled + hasAVX2 = cpuInfoX[1] & 0x20; + hasVPCLMUL = hasAVX2 && (cpuInfoX[2] & 0x400); + } + } + hasGFNI = (cpuInfoX[2] & 0x100) == 0x100; +#endif + + if(!hasGFNI) gf16pmul_clmul_available_vpclgfni = 0; + if(!hasVPCLMUL) { + gf16pmul_clmul_available_vpclmul = 0; + gf16pmul_clmul_available_vpclgfni = 0; + } + if(!hasAVX2) gf16pmul_clmul_available_avx2 = 0; + if(!hasClMul) gf16pmul_clmul_available_sse = 0; + + if(gf16pmul_clmul_available_vpclgfni) { + gf16pmul = &gf16pmul_clmul_vpclgfni; + gf16pmul_alignment = 32; + gf16pmul_blocklen = 64; + } + else if(gf16pmul_clmul_available_vpclmul) { + gf16pmul = &gf16pmul_clmul_vpclmul; + gf16pmul_alignment = 32; + gf16pmul_blocklen = 32; + } + else if(gf16pmul_clmul_available_avx2) { + gf16pmul = &gf16pmul_clmul_avx2; + gf16pmul_alignment = 32; + gf16pmul_blocklen = 32; + } + else if(gf16pmul_clmul_available_sse) { gf16pmul = &gf16pmul_clmul_sse; gf16pmul_alignment = 16; gf16pmul_blocklen = 16; - } else - gf16pmul_clmul_sse_available = 0; + } #endif #ifdef PLATFORM_ARM diff --git a/gf16/gf16pmul.h b/gf16/gf16pmul.h index a7819721..ac472f12 100644 --- a/gf16/gf16pmul.h +++ b/gf16/gf16pmul.h @@ -13,8 +13,16 @@ extern size_t gf16pmul_blocklen; void 
setup_pmul(); HEDLEY_BEGIN_C_DECLS -void gf16pmul_clmul_sse(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len); -extern int gf16pmul_clmul_sse_available; +#define _PMUL_DECL(f) \ + void gf16pmul_clmul_##f(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len); \ + extern int gf16pmul_clmul_available_##f + +_PMUL_DECL(sse); +_PMUL_DECL(avx2); +_PMUL_DECL(vpclmul); +_PMUL_DECL(vpclgfni); + +#undef _PMUL_DECL HEDLEY_END_C_DECLS #endif // defined(__GF16PMUL_H__) diff --git a/gf16/gf16pmul_clmul_avx2.c b/gf16/gf16pmul_clmul_avx2.c new file mode 100644 index 00000000..ce965f4c --- /dev/null +++ b/gf16/gf16pmul_clmul_avx2.c @@ -0,0 +1,12 @@ +#include "../src/platform.h" + +#define _mword __m256i +#define _MM(f) _mm256_ ## f +#define _MMI(f) _mm256_ ## f ## _si256 +#define MWORD_SIZE 32 +#define _FNSUFFIX _avx2 + +#if defined(__PCLMUL__) && defined(__AVX2__) +# define _AVAILABLE 1 +#endif +#include "gf16pmul_clmul_x86.h" diff --git a/gf16/gf16pmul_clmul_sse.c b/gf16/gf16pmul_clmul_sse.c deleted file mode 100644 index d7fde702..00000000 --- a/gf16/gf16pmul_clmul_sse.c +++ /dev/null @@ -1,90 +0,0 @@ -#include "gf16_global.h" - -#if defined(__PCLMUL__) && defined(__SSSE3__) && defined(__SSE4_1__) -int gf16pmul_clmul_sse_available = 1; - -void gf16pmul_clmul_sse(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { - assert(len % sizeof(__m128i) == 0); - - const uint8_t* _src1 = (const uint8_t*)src1 + len; - const uint8_t* _src2 = (const uint8_t*)src2 + len; - uint8_t* _dst = (uint8_t*)dst + len; - - __m128i wordMask = _mm_set1_epi32(0xffff); - __m128i shufLoHi = _mm_set_epi16(0x0f0e, 0x0b0a, 0x0706, 0x0302, 0x0d0c, 0x0908, 0x0504, 0x0100); - for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(__m128i)) { - __m128i data1 = _mm_load_si128((__m128i*)(_src1 + ptr)); - __m128i data2 = _mm_load_si128((__m128i*)(_src2 + ptr)); - - // do multiply - __m128i data1Even = _mm_and_si128(wordMask, data1); - __m128i 
data1Odd = _mm_andnot_si128(wordMask, data1); - __m128i data2Even = _mm_and_si128(wordMask, data2); - __m128i data2Odd = _mm_andnot_si128(wordMask, data2); - __m128i prod1Even = _mm_clmulepi64_si128(data1Even, data2Even, 0x00); - __m128i prod2Even = _mm_clmulepi64_si128(data1Even, data2Even, 0x11); - __m128i prod1Odd = _mm_clmulepi64_si128(data1Odd, data2Odd, 0x00); - __m128i prod2Odd = _mm_clmulepi64_si128(data1Odd, data2Odd, 0x11); - __m128i prod1 = _mm_blend_epi16(prod1Even, prod1Odd, 0xCC); - __m128i prod2 = _mm_blend_epi16(prod2Even, prod2Odd, 0xCC); - - // do reduction - /* obvious Barret reduction strategy, using CLMUL instructions - const __m128i barretConst = _mm_set_epi32(0, 0x1100b, 0, 0x1111a); - - __m128i quot1 = _mm_srli_epi32(prod1, 16); - __m128i quot2 = _mm_srli_epi32(prod2, 16); - __m128i quot11 = _mm_clmulepi64_si128(quot1, barretConst, 0x00); - __m128i quot12 = _mm_clmulepi64_si128(quot1, barretConst, 0x01); - __m128i quot21 = _mm_clmulepi64_si128(quot2, barretConst, 0x00); - __m128i quot22 = _mm_clmulepi64_si128(quot2, barretConst, 0x01); - quot1 = _mm_unpacklo_epi64(quot11, quot12); - quot2 = _mm_unpacklo_epi64(quot21, quot22); - - quot1 = _mm_srli_epi32(quot1, 16); - quot2 = _mm_srli_epi32(quot2, 16); - quot11 = _mm_clmulepi64_si128(quot1, barretConst, 0x10); - quot12 = _mm_clmulepi64_si128(quot1, barretConst, 0x11); - quot21 = _mm_clmulepi64_si128(quot2, barretConst, 0x10); - quot22 = _mm_clmulepi64_si128(quot2, barretConst, 0x11); - quot1 = _mm_unpacklo_epi64(quot11, quot12); - quot2 = _mm_unpacklo_epi64(quot21, quot22); - - quot1 = _mm_xor_si128(quot1, prod1); - quot2 = _mm_xor_si128(quot2, prod2); - - __m128i result = _mm_packus_epi32( - _mm_and_si128(wordMask, quot1), - _mm_and_si128(wordMask, quot2) - ); - */ - - // since there aren't that many bits in the Barret constants, doing manual shift+xor is more efficient - // split low/high 16-bit parts - __m128i tmp1 = _mm_shuffle_epi8(prod1, shufLoHi); - __m128i tmp2 = 
_mm_shuffle_epi8(prod2, shufLoHi); - __m128i rem = _mm_unpacklo_epi64(tmp1, tmp2); - __m128i quot = _mm_unpackhi_epi64(tmp1, tmp2); - - // multiply by 0x1111a (or rather, 0x11118, since the '2' bit doesn't matter due to the product being at most 31 bits) and retain high half - tmp1 = _mm_xor_si128(quot, _mm_srli_epi16(quot, 4)); - tmp1 = _mm_xor_si128(tmp1, _mm_srli_epi16(tmp1, 8)); - quot = _mm_xor_si128(tmp1, _mm_srli_epi16(quot, 13)); - - // multiply by 0x100b, retain low half - tmp1 = _mm_xor_si128(quot, _mm_slli_epi16(quot, 3)); - tmp1 = _mm_xor_si128(tmp1, _mm_add_epi16(quot, quot)); - quot = _mm_xor_si128(tmp1, _mm_slli_epi16(quot, 12)); - - __m128i result = _mm_xor_si128(quot, rem); - - _mm_store_si128((__m128i*)(_dst + ptr), result); - } -} - -#else -int gf16pmul_clmul_sse_available = 0; -void gf16pmul_clmul_sse(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { - UNUSED(dst); UNUSED(src1); UNUSED(src2); UNUSED(len); -} -#endif diff --git a/gf16/gf16pmul_clmul_vpclgfni.c b/gf16/gf16pmul_clmul_vpclgfni.c new file mode 100644 index 00000000..26ad0478 --- /dev/null +++ b/gf16/gf16pmul_clmul_vpclgfni.c @@ -0,0 +1,15 @@ +#include "../src/platform.h" + +#define _mword __m256i +#define _MM(f) _mm256_ ## f +#define _MMI(f) _mm256_ ## f ## _si256 +#define MWORD_SIZE 32 +#define _FNSUFFIX _vpclgfni + +#define _USE_VPCLMUL 1 +#define _USE_GFNI 1 + +#if defined(__VPCLMULQDQ__) && defined(__GFNI__) && defined(__AVX2__) +# define _AVAILABLE 1 +#endif +#include "gf16pmul_clmul_x86.h" diff --git a/gf16/gf16pmul_clmul_vpclmul.c b/gf16/gf16pmul_clmul_vpclmul.c new file mode 100644 index 00000000..715544a9 --- /dev/null +++ b/gf16/gf16pmul_clmul_vpclmul.c @@ -0,0 +1,14 @@ +#include "../src/platform.h" + +#define _mword __m256i +#define _MM(f) _mm256_ ## f +#define _MMI(f) _mm256_ ## f ## _si256 +#define MWORD_SIZE 32 +#define _FNSUFFIX _vpclmul + +#define _USE_VPCLMUL 1 + +#if defined(__VPCLMULQDQ__) && defined(__AVX2__) +# define _AVAILABLE 1 
+#endif +#include "gf16pmul_clmul_x86.h" diff --git a/gf16/gf16pmul_clmul_x86.h b/gf16/gf16pmul_clmul_x86.h new file mode 100644 index 00000000..03589049 --- /dev/null +++ b/gf16/gf16pmul_clmul_x86.h @@ -0,0 +1,213 @@ +#include "gf16_global.h" + +#if defined(_AVAILABLE) +int _FN(gf16pmul_clmul_available) = 1; + +static HEDLEY_ALWAYS_INLINE void _FN(gf16pmul_clmul_initmul)(const _mword* src1, const _mword* src2, _mword* prod1, _mword* prod2) { + _mword wordMask = _MM(set1_epi32)(0xffff); + + _mword data1 = _MMI(load)(src1); + _mword data2 = _MMI(load)(src2); + + // do multiply + _mword data1Even = _MMI(and)(wordMask, data1); + _mword data1Odd = _MMI(andnot)(wordMask, data1); + _mword data2Even = _MMI(and)(wordMask, data2); + _mword data2Odd = _MMI(andnot)(wordMask, data2); +#if MWORD_SIZE == 32 && !defined(_USE_VPCLMUL) + __m128i data1EvenA = _mm256_castsi256_si128(data1Even); + __m128i data1EvenB = _mm256_extracti128_si256(data1Even, 1); + __m128i data1OddA = _mm256_castsi256_si128(data1Odd); + __m128i data1OddB = _mm256_extracti128_si256(data1Odd, 1); + __m128i data2EvenA = _mm256_castsi256_si128(data2Even); + __m128i data2EvenB = _mm256_extracti128_si256(data2Even, 1); + __m128i data2OddA = _mm256_castsi256_si128(data2Odd); + __m128i data2OddB = _mm256_extracti128_si256(data2Odd, 1); + + __m128i prod1EvenA = _mm_clmulepi64_si128(data1EvenA, data2EvenA, 0x00); + __m128i prod1EvenB = _mm_clmulepi64_si128(data1EvenB, data2EvenB, 0x00); + __m128i prod2EvenA = _mm_clmulepi64_si128(data1EvenA, data2EvenA, 0x11); + __m128i prod2EvenB = _mm_clmulepi64_si128(data1EvenB, data2EvenB, 0x11); + __m128i prod1OddA = _mm_clmulepi64_si128(data1OddA, data2OddA, 0x00); + __m128i prod1OddB = _mm_clmulepi64_si128(data1OddB, data2OddB, 0x00); + __m128i prod2OddA = _mm_clmulepi64_si128(data1OddA, data2OddA, 0x11); + __m128i prod2OddB = _mm_clmulepi64_si128(data1OddB, data2OddB, 0x11); + + __m128i prod1A = _mm_blend_epi16(prod1EvenA, prod1OddA, 0xCC); + __m128i prod1B = 
_mm_blend_epi16(prod1EvenB, prod1OddB, 0xCC); + __m128i prod2A = _mm_blend_epi16(prod2EvenA, prod2OddA, 0xCC); + __m128i prod2B = _mm_blend_epi16(prod2EvenB, prod2OddB, 0xCC); + *prod1 = _mm256_inserti128_si256(_mm256_castsi128_si256(prod1A), prod1B, 1); + *prod2 = _mm256_inserti128_si256(_mm256_castsi128_si256(prod2A), prod2B, 1); +#else +# if MWORD_SIZE == 16 + _mword prod1Even = _mm_clmulepi64_si128(data1Even, data2Even, 0x00); + _mword prod2Even = _mm_clmulepi64_si128(data1Even, data2Even, 0x11); + _mword prod1Odd = _mm_clmulepi64_si128(data1Odd, data2Odd, 0x00); + _mword prod2Odd = _mm_clmulepi64_si128(data1Odd, data2Odd, 0x11); +# else + _mword prod1Even = _MM(clmulepi64_epi128)(data1Even, data2Even, 0x00); + _mword prod2Even = _MM(clmulepi64_epi128)(data1Even, data2Even, 0x11); + _mword prod1Odd = _MM(clmulepi64_epi128)(data1Odd, data2Odd, 0x00); + _mword prod2Odd = _MM(clmulepi64_epi128)(data1Odd, data2Odd, 0x11); +# endif + *prod1 = _MM(blend_epi16)(prod1Even, prod1Odd, 0xCC); + *prod2 = _MM(blend_epi16)(prod2Even, prod2Odd, 0xCC); +#endif +} + +void _FN(gf16pmul_clmul)(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { + assert(len % sizeof(_mword) == 0); + + const uint8_t* _src1 = (const uint8_t*)src1 + len; + const uint8_t* _src2 = (const uint8_t*)src2 + len; + uint8_t* _dst = (uint8_t*)dst + len; + + _mword shufLoHi = _MM(set_epi16)( +#if MWORD_SIZE >= 32 + 0x0f0e, 0x0b0a, 0x0706, 0x0302, 0x0d0c, 0x0908, 0x0504, 0x0100, +#endif + 0x0f0e, 0x0b0a, 0x0706, 0x0302, 0x0d0c, 0x0908, 0x0504, 0x0100 + ); + +#ifdef _USE_GFNI + assert(len % (sizeof(_mword)*2) == 0); + _mword shufBLoHi = _MM(set_epi8)( +# if MWORD_SIZE >= 32 + 15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0, +# endif + 15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0 + ); + for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(_mword)*2) { + _mword prod1, prod2, prod3, prod4; + _FN(gf16pmul_clmul_initmul)((_mword*)(_src1 + ptr), (_mword*)(_src2 + ptr), &prod1, &prod2); + 
_FN(gf16pmul_clmul_initmul)((_mword*)(_src1 + ptr) +1, (_mword*)(_src2 + ptr) +1, &prod3, &prod4); + + // split low/high + _mword tmp1 = _MM(shuffle_epi8)(prod1, shufLoHi); + _mword tmp2 = _MM(shuffle_epi8)(prod2, shufLoHi); + _mword rem1 = _MM(unpacklo_epi64)(tmp1, tmp2); + _mword quot1 = _MM(unpackhi_epi64)(tmp1, tmp2); + tmp1 = _MM(shuffle_epi8)(prod3, shufLoHi); + tmp2 = _MM(shuffle_epi8)(prod4, shufLoHi); + _mword rem2 = _MM(unpacklo_epi64)(tmp1, tmp2); + _mword quot2 = _MM(unpackhi_epi64)(tmp1, tmp2); + + // split quot into bytes + tmp1 = _MM(shuffle_epi8)(quot1, shufBLoHi); + tmp2 = _MM(shuffle_epi8)(quot2, shufBLoHi); + quot1 = _MM(unpacklo_epi64)(tmp1, tmp2); + quot2 = _MM(unpackhi_epi64)(tmp1, tmp2); + + // do reduction + tmp2 = _MMI(xor)( + _MM(gf2p8affine_epi64_epi8)(quot2, _MM(set1_epi64x)(0xbb77eedd0b162c58), 0), + _MM(gf2p8affine_epi64_epi8)(quot1, _MM(set1_epi64x)(0xa040800011224488), 0) + ); + tmp1 = _MMI(xor)( + _MM(gf2p8affine_epi64_epi8)(quot2, _MM(set1_epi64x)(0xb1d3a6fdfbf7eedd), 0), + _MM(gf2p8affine_epi64_epi8)(quot1, _MM(set1_epi64x)(0x113366ddba74e8d0), 0) + ); + + /* mappings for above affine matrices: (tmp1 = bottom, tmp2 = top) + * Mul by 0x1111a + * top->top: top ^ top>>4 + * top->bot: top ^ top>>4 ^ top<<4 ^ top>>5 ^ top>>7 + * bot->bot: bot ^ bot>>4 + * Mul by 0x100b + * top->top: top ^ top<<1 ^ top<<3 + * bot->top: bot>>7 ^ bot>>5 ^ bot<<4 + * bot->bot: bot ^ bot<<1 ^ bot<<3 + * Together: + * top->top: + * b = top ^ top<<4 ^ top>>4 ^ top>>5 ^ top>>7 + * top ^= top>>4 + * top ^= top<<1 ^ top<<3 + * top ^= b>>7 ^ b>>5 ^ b<<4 + * top->bot: + * bot = top ^ top<<4 ^ top>>4 ^ top>>5 ^ top>>7 + * bot ^= bot<<1 ^ bot<<3 + * bot->top: + * b = bot ^ bot>>4 + * top = b>>7 ^ b>>5 ^ b<<4 + * bot->bot: + * bot ^= bot>>4 + * bot ^= bot<<1 ^ bot<<3 + */ + + // join together + quot1 = _MM(unpacklo_epi8)(tmp1, tmp2); + quot2 = _MM(unpackhi_epi8)(tmp1, tmp2); + + // xor with rem + quot1 = _MMI(xor)(quot1, rem1); + quot2 = _MMI(xor)(quot2, rem2); + + 
_MMI(store)((_mword*)(_dst + ptr), quot1); + _MMI(store)((_mword*)(_dst + ptr) + 1, quot2); + } +#else + for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(_mword)) { + _mword prod1, prod2; + _FN(gf16pmul_clmul_initmul)((_mword*)(_src1 + ptr), (_mword*)(_src2 + ptr), &prod1, &prod2); + + // do reduction + /* obvious Barret reduction strategy, using CLMUL instructions + const __m128i barretConst = _mm_set_epi32(0, 0x1100b, 0, 0x1111a); + + __m128i quot1 = _mm_srli_epi32(prod1, 16); + __m128i quot2 = _mm_srli_epi32(prod2, 16); + __m128i quot11 = _mm_clmulepi64_si128(quot1, barretConst, 0x00); + __m128i quot12 = _mm_clmulepi64_si128(quot1, barretConst, 0x01); + __m128i quot21 = _mm_clmulepi64_si128(quot2, barretConst, 0x00); + __m128i quot22 = _mm_clmulepi64_si128(quot2, barretConst, 0x01); + quot1 = _mm_unpacklo_epi64(quot11, quot12); + quot2 = _mm_unpacklo_epi64(quot21, quot22); + + quot1 = _mm_srli_epi32(quot1, 16); + quot2 = _mm_srli_epi32(quot2, 16); + quot11 = _mm_clmulepi64_si128(quot1, barretConst, 0x10); + quot12 = _mm_clmulepi64_si128(quot1, barretConst, 0x11); + quot21 = _mm_clmulepi64_si128(quot2, barretConst, 0x10); + quot22 = _mm_clmulepi64_si128(quot2, barretConst, 0x11); + quot1 = _mm_unpacklo_epi64(quot11, quot12); + quot2 = _mm_unpacklo_epi64(quot21, quot22); + + quot1 = _mm_xor_si128(quot1, prod1); + quot2 = _mm_xor_si128(quot2, prod2); + + __m128i result = _mm_packus_epi32( + _mm_and_si128(wordMask, quot1), + _mm_and_si128(wordMask, quot2) + ); + */ + + // since there aren't that many bits in the Barret constants, doing manual shift+xor is more efficient + // split low/high 16-bit parts + _mword tmp1 = _MM(shuffle_epi8)(prod1, shufLoHi); + _mword tmp2 = _MM(shuffle_epi8)(prod2, shufLoHi); + _mword rem = _MM(unpacklo_epi64)(tmp1, tmp2); + _mword quot = _MM(unpackhi_epi64)(tmp1, tmp2); + + // multiply by 0x1111a (or rather, 0x11118, since the '2' bit doesn't matter due to the product being at most 31 bits) and retain high half + tmp1 = 
_MMI(xor)(quot, _MM(srli_epi16)(quot, 4)); + tmp1 = _MMI(xor)(tmp1, _MM(srli_epi16)(tmp1, 8)); + quot = _MMI(xor)(tmp1, _MM(srli_epi16)(quot, 13)); + + // multiply by 0x100b, retain low half + tmp1 = _MMI(xor)(quot, _MM(slli_epi16)(quot, 3)); + tmp1 = _MMI(xor)(tmp1, _MM(add_epi16)(quot, quot)); + quot = _MMI(xor)(tmp1, _MM(slli_epi16)(quot, 12)); + + _mword result = _MMI(xor)(quot, rem); + _MMI(store)((_mword*)(_dst + ptr), result); + } +#endif +} + +#else +int _FN(gf16pmul_clmul_available) = 0; +void _FN(gf16pmul_clmul)(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { + UNUSED(dst); UNUSED(src1); UNUSED(src2); UNUSED(len); +} +#endif From 43b6d41159ac258076c2911b9117add1a1a1481d Mon Sep 17 00:00:00 2001 From: animetosho Date: Sun, 11 Jun 2023 21:02:27 +1000 Subject: [PATCH 17/91] Missed in last commit --- gf16/gf16pmul_clmul_sse.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 gf16/gf16pmul_clmul_sse.c diff --git a/gf16/gf16pmul_clmul_sse.c b/gf16/gf16pmul_clmul_sse.c new file mode 100644 index 00000000..5338858d --- /dev/null +++ b/gf16/gf16pmul_clmul_sse.c @@ -0,0 +1,12 @@ +#include "../src/platform.h" + +#define _mword __m128i +#define _MM(f) _mm_ ## f +#define _MMI(f) _mm_ ## f ## _si128 +#define MWORD_SIZE 16 +#define _FNSUFFIX _sse + +#if defined(__PCLMUL__) && defined(__SSSE3__) && defined(__SSE4_1__) +# define _AVAILABLE 1 +#endif +#include "gf16pmul_clmul_x86.h" From 84bef8eda29aec56d650d9de2661158c937860f8 Mon Sep 17 00:00:00 2001 From: animetosho Date: Mon, 12 Jun 2023 20:35:07 +1000 Subject: [PATCH 18/91] Make VPCLMUL PointMultiply compatible with AVX-512, should it ever be useful --- gf16/gf16pmul_clmul_x86.h | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/gf16/gf16pmul_clmul_x86.h b/gf16/gf16pmul_clmul_x86.h index 03589049..9dfa801e 100644 --- a/gf16/gf16pmul_clmul_x86.h +++ b/gf16/gf16pmul_clmul_x86.h @@ -51,8 +51,13 @@ static 
HEDLEY_ALWAYS_INLINE void _FN(gf16pmul_clmul_initmul)(const _mword* src1, _mword prod1Odd = _MM(clmulepi64_epi128)(data1Odd, data2Odd, 0x00); _mword prod2Odd = _MM(clmulepi64_epi128)(data1Odd, data2Odd, 0x11); # endif +# if MWORD_SIZE >= 64 + *prod1 = _MM(mask_blend_epi32)(0xAAAA, prod1Even, prod1Odd); + *prod2 = _MM(mask_blend_epi32)(0xAAAA, prod2Even, prod2Odd); +# else *prod1 = _MM(blend_epi16)(prod1Even, prod1Odd, 0xCC); *prod2 = _MM(blend_epi16)(prod2Even, prod2Odd, 0xCC); +# endif #endif } @@ -63,21 +68,29 @@ void _FN(gf16pmul_clmul)(void *HEDLEY_RESTRICT dst, const void* src1, const void const uint8_t* _src2 = (const uint8_t*)src2 + len; uint8_t* _dst = (uint8_t*)dst + len; +#if MWORD_SIZE >= 64 + _mword shufLoHi = _MM(set4_epi32)(0x0f0e0b0a, 0x07060302, 0x0d0c0908, 0x05040100); +#else _mword shufLoHi = _MM(set_epi16)( -#if MWORD_SIZE >= 32 +# if MWORD_SIZE >= 32 0x0f0e, 0x0b0a, 0x0706, 0x0302, 0x0d0c, 0x0908, 0x0504, 0x0100, -#endif +# endif 0x0f0e, 0x0b0a, 0x0706, 0x0302, 0x0d0c, 0x0908, 0x0504, 0x0100 ); +#endif #ifdef _USE_GFNI assert(len % (sizeof(_mword)*2) == 0); +# if MWORD_SIZE >= 64 + _mword shufBLoHi = _MM(set4_epi32)(0x0f0d0b09, 0x07050301, 0x0e0c0a08, 0x06040200); +# else _mword shufBLoHi = _MM(set_epi8)( -# if MWORD_SIZE >= 32 +# if MWORD_SIZE >= 32 15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0, -# endif +# endif 15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0 ); +# endif for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(_mword)*2) { _mword prod1, prod2, prod3, prod4; _FN(gf16pmul_clmul_initmul)((_mword*)(_src1 + ptr), (_mword*)(_src2 + ptr), &prod1, &prod2); @@ -100,14 +113,20 @@ void _FN(gf16pmul_clmul)(void *HEDLEY_RESTRICT dst, const void* src1, const void quot2 = _MM(unpackhi_epi64)(tmp1, tmp2); // do reduction + #if MWORD_SIZE >= 64 + # define SET1_EPI64 _MM(set1_epi64) + #else + # define SET1_EPI64 _MM(set1_epi64x) + #endif tmp2 = _MMI(xor)( - _MM(gf2p8affine_epi64_epi8)(quot2, _MM(set1_epi64x)(0xbb77eedd0b162c58), 0), - 
_MM(gf2p8affine_epi64_epi8)(quot1, _MM(set1_epi64x)(0xa040800011224488), 0) + _MM(gf2p8affine_epi64_epi8)(quot2, SET1_EPI64(0xbb77eedd0b162c58), 0), + _MM(gf2p8affine_epi64_epi8)(quot1, SET1_EPI64(0xa040800011224488), 0) ); tmp1 = _MMI(xor)( - _MM(gf2p8affine_epi64_epi8)(quot2, _MM(set1_epi64x)(0xb1d3a6fdfbf7eedd), 0), - _MM(gf2p8affine_epi64_epi8)(quot1, _MM(set1_epi64x)(0x113366ddba74e8d0), 0) + _MM(gf2p8affine_epi64_epi8)(quot2, SET1_EPI64(0xb1d3a6fdfbf7eedd), 0), + _MM(gf2p8affine_epi64_epi8)(quot1, SET1_EPI64(0x113366ddba74e8d0), 0) ); + #undef SET1_EPI64 /* mappings for above affine matrices: (tmp1 = bottom, tmp2 = top) * Mul by 0x1111a From 19195b904ea27ce40373434b4bb9ff1b5b47f591 Mon Sep 17 00:00:00 2001 From: animetosho Date: Mon, 12 Jun 2023 20:46:36 +1000 Subject: [PATCH 19/91] Prep support for EOR3 in ClMul NEON --- gf16/gf16_clmul_neon.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/gf16/gf16_clmul_neon.c b/gf16/gf16_clmul_neon.c index 7d73a938..1f301a79 100644 --- a/gf16/gf16_clmul_neon.c +++ b/gf16/gf16_clmul_neon.c @@ -3,7 +3,6 @@ #include "gf16_muladd_multi.h" // TODO: for any multiplicand byte that's 0 (e.g. 
for coeff < 256), can shortcut a bunch of stuff, but may not be worth the effort -// can also look at BCAX/EOR3 from SHA3 if bored; SVE2 implementation can also use XAR #if defined(__ARM_NEON) @@ -47,6 +46,7 @@ typedef poly8x8_t coeff_t; # define coeff_fn(f1, f2) f1##_##f2 #endif +// NOTE: we avoid EOR3 in pmacl* - only chip which supports NEON-SHA3 without SVE2, are the Apple chips and Neoverse V1; the former has PMULL+EOR fusion, which is better than EOR3 #if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__)) && defined(__APPLE__) // Apple M1 supports fusing PMULL+EOR, so ensure these are paired static HEDLEY_ALWAYS_INLINE poly16x8_t pmacl_low(poly16x8_t sum, poly8x16_t a, poly8x16_t b) { @@ -113,6 +113,10 @@ static HEDLEY_ALWAYS_INLINE void gf16_clmul_neon_round(const void* src, poly16x8 *high2 = pmacl_high(*high2, data.val[1], coeff[1]); } +static HEDLEY_ALWAYS_INLINE uint8x16_t eor3q_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) { + return veorq_u8(a, veorq_u8(b, c)); +} + static HEDLEY_ALWAYS_INLINE void gf16_clmul_neon_reduction(poly16x8_t* low1, poly16x8_t low2, poly16x8_t mid1, poly16x8_t mid2, poly16x8_t* high1, poly16x8_t high2) { // put data in proper form uint8x16x2_t hibytes = vuzpq_u8(vreinterpretq_u8_p16(*high1), vreinterpretq_u8_p16(high2)); @@ -121,8 +125,8 @@ static HEDLEY_ALWAYS_INLINE void gf16_clmul_neon_reduction(poly16x8_t* low1, pol // merge mid into high/low uint8x16x2_t midbytes = vuzpq_u8(vreinterpretq_u8_p16(mid1), vreinterpretq_u8_p16(mid2)); uint8x16_t libytes = veorq_u8(hibytes.val[0], lobytes.val[1]); - lobytes.val[1] = veorq_u8(libytes, veorq_u8(lobytes.val[0], midbytes.val[0])); - hibytes.val[0] = veorq_u8(libytes, veorq_u8(hibytes.val[1], midbytes.val[1])); + lobytes.val[1] = eor3q_u8(libytes, lobytes.val[0], midbytes.val[0]); + hibytes.val[0] = eor3q_u8(libytes, hibytes.val[1], midbytes.val[1]); // Barrett reduction @@ -130,7 +134,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_clmul_neon_reduction(poly16x8_t* low1, pol 
// multiply hibytes by 0x11100 uint8x16_t highest_nibble = vshrq_n_u8(hibytes.val[1], 4); uint8x16_t th0 = vsriq_n_u8(vshlq_n_u8(hibytes.val[1], 4), hibytes.val[0], 4); - th0 = veorq_u8(th0, veorq_u8(hibytes.val[0], hibytes.val[1])); + th0 = eor3q_u8(th0, hibytes.val[0], hibytes.val[1]); uint8x16_t th1 = veorq_u8(hibytes.val[1], highest_nibble); // subsequent polynomial multiplication doesn't need the low bits of th0 to be correct, so trim these now for a shorter dep chain @@ -154,11 +158,11 @@ static HEDLEY_ALWAYS_INLINE void gf16_clmul_neon_reduction(poly16x8_t* low1, pol poly8x16_t redL = vdupq_n_p8(0x0b); hibytes.val[1] = veorq_u8(th0_hi3, th0_hi1); hibytes.val[1] = vsliq_n_u8(hibytes.val[1], th0, 4); - lobytes.val[1] = veorq_u8(lobytes.val[1], vreinterpretq_u8_p8(vmulq_p8(vreinterpretq_p8_u8(th1), redL))); + th1 = vreinterpretq_u8_p8(vmulq_p8(vreinterpretq_p8_u8(th1), redL)); hibytes.val[0] = vreinterpretq_u8_p8(vmulq_p8(vreinterpretq_p8_u8(th0), redL)); *low1 = vreinterpretq_p16_u8(veorq_u8(lobytes.val[0], hibytes.val[0])); - *high1 = vreinterpretq_p16_u8(veorq_u8(lobytes.val[1], hibytes.val[1])); + *high1 = vreinterpretq_p16_u8(eor3q_u8(hibytes.val[1], lobytes.val[1], th1)); } #ifdef __aarch64__ From 908b74b0a9ad4dcb3e1ae35b49bd1fbb9835a54e Mon Sep 17 00:00:00 2001 From: animetosho Date: Mon, 12 Jun 2023 22:08:44 +1000 Subject: [PATCH 20/91] Add NEON+SVE2 versions of PointMultiply --- gf16/gf16_clmul_neon.c | 93 +----------------------------------- gf16/gf16_clmul_neon.h | 97 ++++++++++++++++++++++++++++++++++++++ gf16/gf16_clmul_sve2.c | 49 +------------------ gf16/gf16_clmul_sve2.h | 52 ++++++++++++++++++++ gf16/gf16pmul.cpp | 13 +++++ gf16/gf16pmul.h | 5 ++ gf16/gf16pmul_clmul_neon.c | 39 +++++++++++++++ gf16/gf16pmul_clmul_sve2.c | 44 +++++++++++++++++ 8 files changed, 252 insertions(+), 140 deletions(-) create mode 100644 gf16/gf16_clmul_neon.h create mode 100644 gf16/gf16_clmul_sve2.h create mode 100644 gf16/gf16pmul_clmul_neon.c create mode 100644 
gf16/gf16pmul_clmul_sve2.c diff --git a/gf16/gf16_clmul_neon.c b/gf16/gf16_clmul_neon.c index 1f301a79..95a89cf2 100644 --- a/gf16/gf16_clmul_neon.c +++ b/gf16/gf16_clmul_neon.c @@ -1,51 +1,11 @@ -#include "gf16_neon_common.h" +#include "gf16_clmul_neon.h" #include "gf16_muladd_multi.h" // TODO: for any multiplicand byte that's 0 (e.g. for coeff < 256), can shortcut a bunch of stuff, but may not be worth the effort #if defined(__ARM_NEON) -// `vaddq_p8` and co seems to be missing from some compilers (like GCC), so define our own variant -static HEDLEY_ALWAYS_INLINE poly8x16_t veorq_p8(poly8x16_t a, poly8x16_t b) { - return vreinterpretq_p8_u8(veorq_u8(vreinterpretq_u8_p8(a), vreinterpretq_u8_p8(b))); -} - -#ifdef __aarch64__ -typedef poly8x16_t coeff_t; -# if defined(__GNUC__) || defined(__clang__) -// because GCC/CLang doesn't seem to handle these cases well, explicitly tell them what to do -static HEDLEY_ALWAYS_INLINE poly16x8_t pmull_low(poly8x16_t a, poly8x16_t b) { - poly16x8_t result; - __asm__ ("pmull %0.8h,%1.8b,%2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} -static HEDLEY_ALWAYS_INLINE poly16x8_t pmull_high(poly8x16_t a, poly8x16_t b) { - poly16x8_t result; - __asm__ ("pmull2 %0.8h,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} -# else -# define pmull_low(x, y) vmull_p8(vget_low_p8(x), vget_low_p8(y)) -# define pmull_high vmull_high_p8 -# endif -# define coeff_fn(f1, f2) f1##q_##f2 -#else -static HEDLEY_ALWAYS_INLINE poly8x8_t veor_p8(poly8x8_t a, poly8x8_t b) { - return vreinterpret_p8_u8(veor_u8(vreinterpret_u8_p8(a), vreinterpret_u8_p8(b))); -} -typedef poly8x8_t coeff_t; -# define pmull_low(x, y) vmull_p8(vget_low_p8(x), y) -# define pmull_high(x, y) vmull_p8(vget_high_p8(x), y) -# define coeff_fn(f1, f2) f1##_##f2 -#endif - // NOTE: we avoid EOR3 in pmacl* - only chip which supports NEON-SHA3 without SVE2, are the Apple chips and Neoverse V1; the former has 
PMULL+EOR fusion, which is better than EOR3 #if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__)) && defined(__APPLE__) // Apple M1 supports fusing PMULL+EOR, so ensure these are paired @@ -113,57 +73,6 @@ static HEDLEY_ALWAYS_INLINE void gf16_clmul_neon_round(const void* src, poly16x8 *high2 = pmacl_high(*high2, data.val[1], coeff[1]); } -static HEDLEY_ALWAYS_INLINE uint8x16_t eor3q_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) { - return veorq_u8(a, veorq_u8(b, c)); -} - -static HEDLEY_ALWAYS_INLINE void gf16_clmul_neon_reduction(poly16x8_t* low1, poly16x8_t low2, poly16x8_t mid1, poly16x8_t mid2, poly16x8_t* high1, poly16x8_t high2) { - // put data in proper form - uint8x16x2_t hibytes = vuzpq_u8(vreinterpretq_u8_p16(*high1), vreinterpretq_u8_p16(high2)); - uint8x16x2_t lobytes = vuzpq_u8(vreinterpretq_u8_p16(*low1), vreinterpretq_u8_p16(low2)); - - // merge mid into high/low - uint8x16x2_t midbytes = vuzpq_u8(vreinterpretq_u8_p16(mid1), vreinterpretq_u8_p16(mid2)); - uint8x16_t libytes = veorq_u8(hibytes.val[0], lobytes.val[1]); - lobytes.val[1] = eor3q_u8(libytes, lobytes.val[0], midbytes.val[0]); - hibytes.val[0] = eor3q_u8(libytes, hibytes.val[1], midbytes.val[1]); - - - // Barrett reduction - // first reduction coefficient is 0x1111a - // multiply hibytes by 0x11100 - uint8x16_t highest_nibble = vshrq_n_u8(hibytes.val[1], 4); - uint8x16_t th0 = vsriq_n_u8(vshlq_n_u8(hibytes.val[1], 4), hibytes.val[0], 4); - th0 = eor3q_u8(th0, hibytes.val[0], hibytes.val[1]); - uint8x16_t th1 = veorq_u8(hibytes.val[1], highest_nibble); - - // subsequent polynomial multiplication doesn't need the low bits of th0 to be correct, so trim these now for a shorter dep chain - uint8x16_t th0_hi3 = vshrq_n_u8(th0, 5); - uint8x16_t th0_hi1 = vshrq_n_u8(th0_hi3, 2); // or is `vshrq_n_u8(th0, 7)` better? 
- - // mul by 0x1a => we only care about upper byte -#ifdef __aarch64__ - th0 = veorq_u8(th0, vqtbl1q_u8( - vmakeq_u8(0,1,3,2,6,7,5,4,13,12,14,15,11,10,8,9), - highest_nibble - )); -#else - th0 = veorq_u8(th0, vshrq_n_u8(vreinterpretq_u8_p8(vmulq_p8( - vreinterpretq_p8_u8(highest_nibble), - vdupq_n_p8(0x1a) - )), 4)); -#endif - - // multiply by polynomial: 0x100b - poly8x16_t redL = vdupq_n_p8(0x0b); - hibytes.val[1] = veorq_u8(th0_hi3, th0_hi1); - hibytes.val[1] = vsliq_n_u8(hibytes.val[1], th0, 4); - th1 = vreinterpretq_u8_p8(vmulq_p8(vreinterpretq_p8_u8(th1), redL)); - hibytes.val[0] = vreinterpretq_u8_p8(vmulq_p8(vreinterpretq_p8_u8(th0), redL)); - - *low1 = vreinterpretq_p16_u8(veorq_u8(lobytes.val[0], hibytes.val[0])); - *high1 = vreinterpretq_p16_u8(eor3q_u8(hibytes.val[1], lobytes.val[1], th1)); -} #ifdef __aarch64__ # define CLMUL_NUM_REGIONS 8 diff --git a/gf16/gf16_clmul_neon.h b/gf16/gf16_clmul_neon.h new file mode 100644 index 00000000..a638101f --- /dev/null +++ b/gf16/gf16_clmul_neon.h @@ -0,0 +1,97 @@ +#include "gf16_neon_common.h" + +#if defined(__ARM_NEON) + +// `vaddq_p8` and co seems to be missing from some compilers (like GCC), so define our own variant +static HEDLEY_ALWAYS_INLINE poly8x16_t veorq_p8(poly8x16_t a, poly8x16_t b) { + return vreinterpretq_p8_u8(veorq_u8(vreinterpretq_u8_p8(a), vreinterpretq_u8_p8(b))); +} + +#ifdef __aarch64__ +typedef poly8x16_t coeff_t; +# if defined(__GNUC__) || defined(__clang__) +// because GCC/CLang doesn't seem to handle these cases well, explicitly tell them what to do +static HEDLEY_ALWAYS_INLINE poly16x8_t pmull_low(poly8x16_t a, poly8x16_t b) { + poly16x8_t result; + __asm__ ("pmull %0.8h,%1.8b,%2.8b" + : "=w"(result) + : "w"(a), "w"(b) + : /* No clobbers */); + return result; +} +static HEDLEY_ALWAYS_INLINE poly16x8_t pmull_high(poly8x16_t a, poly8x16_t b) { + poly16x8_t result; + __asm__ ("pmull2 %0.8h,%1.16b,%2.16b" + : "=w"(result) + : "w"(a), "w"(b) + : /* No clobbers */); + return result; +} +# 
else +# define pmull_low(x, y) vmull_p8(vget_low_p8(x), vget_low_p8(y)) +# define pmull_high vmull_high_p8 +# endif +# define coeff_fn(f1, f2) f1##q_##f2 +#else +static HEDLEY_ALWAYS_INLINE poly8x8_t veor_p8(poly8x8_t a, poly8x8_t b) { + return vreinterpret_p8_u8(veor_u8(vreinterpret_u8_p8(a), vreinterpret_u8_p8(b))); +} +typedef poly8x8_t coeff_t; +# define pmull_low(x, y) vmull_p8(vget_low_p8(x), y) +# define pmull_high(x, y) vmull_p8(vget_high_p8(x), y) +# define coeff_fn(f1, f2) f1##_##f2 +#endif + +static HEDLEY_ALWAYS_INLINE uint8x16_t eor3q_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) { + return veorq_u8(a, veorq_u8(b, c)); +} + +static HEDLEY_ALWAYS_INLINE void gf16_clmul_neon_reduction(poly16x8_t* low1, poly16x8_t low2, poly16x8_t mid1, poly16x8_t mid2, poly16x8_t* high1, poly16x8_t high2) { + // put data in proper form + uint8x16x2_t hibytes = vuzpq_u8(vreinterpretq_u8_p16(*high1), vreinterpretq_u8_p16(high2)); + uint8x16x2_t lobytes = vuzpq_u8(vreinterpretq_u8_p16(*low1), vreinterpretq_u8_p16(low2)); + + // merge mid into high/low + uint8x16x2_t midbytes = vuzpq_u8(vreinterpretq_u8_p16(mid1), vreinterpretq_u8_p16(mid2)); + uint8x16_t libytes = veorq_u8(hibytes.val[0], lobytes.val[1]); + lobytes.val[1] = eor3q_u8(libytes, lobytes.val[0], midbytes.val[0]); + hibytes.val[0] = eor3q_u8(libytes, hibytes.val[1], midbytes.val[1]); + + + // Barrett reduction + // first reduction coefficient is 0x1111a + // multiply hibytes by 0x11100 + uint8x16_t highest_nibble = vshrq_n_u8(hibytes.val[1], 4); + uint8x16_t th0 = vsriq_n_u8(vshlq_n_u8(hibytes.val[1], 4), hibytes.val[0], 4); + th0 = eor3q_u8(th0, hibytes.val[0], hibytes.val[1]); + uint8x16_t th1 = veorq_u8(hibytes.val[1], highest_nibble); + + // subsequent polynomial multiplication doesn't need the low bits of th0 to be correct, so trim these now for a shorter dep chain + uint8x16_t th0_hi3 = vshrq_n_u8(th0, 5); + uint8x16_t th0_hi1 = vshrq_n_u8(th0_hi3, 2); // or is `vshrq_n_u8(th0, 7)` better? 
+ + // mul by 0x1a => we only care about upper byte +#ifdef __aarch64__ + th0 = veorq_u8(th0, vqtbl1q_u8( + vmakeq_u8(0,1,3,2,6,7,5,4,13,12,14,15,11,10,8,9), + highest_nibble + )); +#else + th0 = veorq_u8(th0, vshrq_n_u8(vreinterpretq_u8_p8(vmulq_p8( + vreinterpretq_p8_u8(highest_nibble), + vdupq_n_p8(0x1a) + )), 4)); +#endif + + // multiply by polynomial: 0x100b + poly8x16_t redL = vdupq_n_p8(0x0b); + hibytes.val[1] = veorq_u8(th0_hi3, th0_hi1); + hibytes.val[1] = vsliq_n_u8(hibytes.val[1], th0, 4); + th1 = vreinterpretq_u8_p8(vmulq_p8(vreinterpretq_p8_u8(th1), redL)); + hibytes.val[0] = vreinterpretq_u8_p8(vmulq_p8(vreinterpretq_p8_u8(th0), redL)); + + *low1 = vreinterpretq_p16_u8(veorq_u8(lobytes.val[0], hibytes.val[0])); + *high1 = vreinterpretq_p16_u8(eor3q_u8(hibytes.val[1], lobytes.val[1], th1)); +} + +#endif diff --git a/gf16/gf16_clmul_sve2.c b/gf16/gf16_clmul_sve2.c index 913dc8a9..c038ad47 100644 --- a/gf16/gf16_clmul_sve2.c +++ b/gf16/gf16_clmul_sve2.c @@ -1,5 +1,5 @@ -#include "gf16_sve_common.h" +#include "gf16_clmul_sve2.h" #include "gf16_muladd_multi.h" #if defined(__ARM_FEATURE_SVE2) @@ -42,53 +42,6 @@ static HEDLEY_ALWAYS_INLINE void gf16_clmul_sve2_merge2( *high2 = sveor3_u8(*high2, high2b, high2c); } -static HEDLEY_ALWAYS_INLINE void gf16_clmul_sve2_reduction(svuint8_t* low1, svuint8_t low2, svuint8_t mid1, svuint8_t mid2, svuint8_t* high1, svuint8_t high2) { - // put data in proper form - svuint8_t hibytesL = svtrn1_u8(*high1, high2); - svuint8_t hibytesH = svtrn2_u8(*high1, high2); - svuint8_t lobytesL = svtrn1_u8(*low1, low2); - svuint8_t lobytesH = svtrn2_u8(*low1, low2); - - // merge mid into high/low - svuint8_t midbytesL = svtrn1_u8(mid1, mid2); - svuint8_t midbytesH = svtrn2_u8(mid1, mid2); - svuint8_t libytes = NOMASK(sveor_u8, hibytesL, lobytesH); - lobytesH = sveor3_u8(midbytesL, lobytesL, libytes); - hibytesL = sveor3_u8(midbytesH, hibytesH, libytes); - - // Barrett reduction - // first reduction coefficient is 0x1111a - svuint8_t 
highest_nibble = NOMASK(svlsr_n_u8, hibytesH, 4); - - svuint8_t th0 = svsri_n_u8(NOMASK(svlsl_n_u8, hibytesH, 4), hibytesL, 4); - th0 = sveor3_u8(th0, hibytesH, hibytesL); - svuint8_t th0_hi3 = NOMASK(svlsr_n_u8, th0, 5); - th0 = NOMASK(sveor_u8, th0, NOMASK(svlsr_n_u8, - svpmul_n_u8(highest_nibble, 0x1a), 4 - )); - - // alternative strategy to above, using nibble flipped ops; looks like one less op, but 0xf vector needs to be constructed, so still the same; maybe there's a better way to leverage it? - // svuint8_t th0 = svxar_n_u8(hibytesH, hibytesL, 4); - // th0 = svbcax_n_u8(th0, svpmul_n_u8(highest_nibble, 0x1a), 0xf); - // th0 = svxar_n_u8(th0, svbsl_n_u8(hibytesH, hibytesL, 0xf), 4); - // svuint8_t th0_hi3 = NOMASK(svlsr_n_u8, th0, 5); - - svuint8_t th1 = NOMASK(sveor_u8, hibytesH, highest_nibble); - - - // multiply by polynomial: 0x100b - lobytesH = sveor3_u8( - lobytesH, - svpmul_n_u8(th1, 0x0b), - NOMASK(svlsr_n_u8, th0_hi3, 2) - ); - lobytesH = NOMASK(sveor_u8, lobytesH, svsli_n_u8(th0_hi3, th0, 4)); - lobytesL = NOMASK(sveor_u8, lobytesL, svpmul_n_u8(th0, 0x0b)); - - *low1 = lobytesL; - *high1 = lobytesH; -} - #define CLMUL_NUM_REGIONS 8 static HEDLEY_ALWAYS_INLINE void gf16_clmul_muladd_x_sve2( diff --git a/gf16/gf16_clmul_sve2.h b/gf16/gf16_clmul_sve2.h new file mode 100644 index 00000000..f5fc40f6 --- /dev/null +++ b/gf16/gf16_clmul_sve2.h @@ -0,0 +1,52 @@ +#include "gf16_sve_common.h" + +#if defined(__ARM_FEATURE_SVE2) + +static HEDLEY_ALWAYS_INLINE void gf16_clmul_sve2_reduction(svuint8_t* low1, svuint8_t low2, svuint8_t mid1, svuint8_t mid2, svuint8_t* high1, svuint8_t high2) { + // put data in proper form + svuint8_t hibytesL = svtrn1_u8(*high1, high2); + svuint8_t hibytesH = svtrn2_u8(*high1, high2); + svuint8_t lobytesL = svtrn1_u8(*low1, low2); + svuint8_t lobytesH = svtrn2_u8(*low1, low2); + + // merge mid into high/low + svuint8_t midbytesL = svtrn1_u8(mid1, mid2); + svuint8_t midbytesH = svtrn2_u8(mid1, mid2); + svuint8_t libytes = 
NOMASK(sveor_u8, hibytesL, lobytesH); + lobytesH = sveor3_u8(midbytesL, lobytesL, libytes); + hibytesL = sveor3_u8(midbytesH, hibytesH, libytes); + + // Barrett reduction + // first reduction coefficient is 0x1111a + svuint8_t highest_nibble = NOMASK(svlsr_n_u8, hibytesH, 4); + + svuint8_t th0 = svsri_n_u8(NOMASK(svlsl_n_u8, hibytesH, 4), hibytesL, 4); + th0 = sveor3_u8(th0, hibytesH, hibytesL); + svuint8_t th0_hi3 = NOMASK(svlsr_n_u8, th0, 5); + th0 = NOMASK(sveor_u8, th0, NOMASK(svlsr_n_u8, + svpmul_n_u8(highest_nibble, 0x1a), 4 + )); + + // alternative strategy to above, using nibble flipped ops; looks like one less op, but 0xf vector needs to be constructed, so still the same; maybe there's a better way to leverage it? + // svuint8_t th0 = svxar_n_u8(hibytesH, hibytesL, 4); + // th0 = svbcax_n_u8(th0, svpmul_n_u8(highest_nibble, 0x1a), 0xf); + // th0 = svxar_n_u8(th0, svbsl_n_u8(hibytesH, hibytesL, 0xf), 4); + // svuint8_t th0_hi3 = NOMASK(svlsr_n_u8, th0, 5); + + svuint8_t th1 = NOMASK(sveor_u8, hibytesH, highest_nibble); + + + // multiply by polynomial: 0x100b + lobytesH = sveor3_u8( + lobytesH, + svpmul_n_u8(th1, 0x0b), + NOMASK(svlsr_n_u8, th0_hi3, 2) + ); + lobytesH = NOMASK(sveor_u8, lobytesH, svsli_n_u8(th0_hi3, th0, 4)); + lobytesL = NOMASK(sveor_u8, lobytesL, svpmul_n_u8(th0, 0x0b)); + + *low1 = lobytesL; + *high1 = lobytesH; +} + +#endif diff --git a/gf16/gf16pmul.cpp b/gf16/gf16pmul.cpp index 0e1e99a9..ffc59f35 100644 --- a/gf16/gf16pmul.cpp +++ b/gf16/gf16pmul.cpp @@ -61,5 +61,18 @@ void setup_pmul() { #endif #ifdef PLATFORM_ARM + if(!CPU_HAS_SVE2) gf16pmul_clmul_available_sve2 = 0; + if(!CPU_HAS_NEON) gf16pmul_clmul_available_neon = 0; + + if(gf16pmul_clmul_available_sve2) { + gf16pmul = &gf16pmul_clmul_sve2; + gf16pmul_alignment = gf16pmul_clmul_sve2_width(); + gf16pmul_blocklen = gf16pmul_alignment*2; + } + else if(gf16pmul_clmul_available_neon) { + gf16pmul = &gf16pmul_clmul_neon; + gf16pmul_alignment = 16; + gf16pmul_blocklen = 32; + } #endif } 
diff --git a/gf16/gf16pmul.h b/gf16/gf16pmul.h index ac472f12..180a9025 100644 --- a/gf16/gf16pmul.h +++ b/gf16/gf16pmul.h @@ -21,8 +21,13 @@ _PMUL_DECL(sse); _PMUL_DECL(avx2); _PMUL_DECL(vpclmul); _PMUL_DECL(vpclgfni); +_PMUL_DECL(neon); +_PMUL_DECL(sve2); #undef _PMUL_DECL + +unsigned gf16pmul_clmul_sve2_width(); + HEDLEY_END_C_DECLS #endif // defined(__GF16PMUL_H__) diff --git a/gf16/gf16pmul_clmul_neon.c b/gf16/gf16pmul_clmul_neon.c new file mode 100644 index 00000000..c4d8f76c --- /dev/null +++ b/gf16/gf16pmul_clmul_neon.c @@ -0,0 +1,39 @@ +#include "gf16_global.h" +#include "gf16_clmul_neon.h" + +#ifdef __ARM_NEON +int gf16pmul_clmul_available_neon = 1; + +void gf16pmul_clmul_neon(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { + assert(len % sizeof(uint8x16_t)*2 == 0); + + const poly8_t* _src1 = (const poly8_t*)src1 + len; + const poly8_t* _src2 = (const poly8_t*)src2 + len; + uint8_t* _dst = (uint8_t*)dst + len; + + for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(uint8x16_t)*2) { + poly8x16x2_t data1 = vld2q_p8(_src1+ptr); + poly8x16x2_t data2 = vld2q_p8(_src2+ptr); + poly16x8_t low1 = pmull_low(data1.val[0], data2.val[0]); + poly16x8_t low2 = pmull_high(data1.val[0], data2.val[0]); + poly8x16_t dataMid1 = veorq_p8(data1.val[0], data1.val[1]); + poly8x16_t dataMid2 = veorq_p8(data2.val[0], data2.val[1]); + poly16x8_t mid1 = pmull_low(dataMid1, dataMid2); + poly16x8_t mid2 = pmull_high(dataMid1, dataMid2); + poly16x8_t high1 = pmull_low(data1.val[1], data2.val[1]); + poly16x8_t high2 = pmull_high(data1.val[1], data2.val[1]); + + gf16_clmul_neon_reduction(&low1, low2, mid1, mid2, &high1, high2); + uint8x16x2_t out; + out.val[0] = vreinterpretq_u8_p16(low1); + out.val[1] = vreinterpretq_u8_p16(high1); + vst2q_u8(_dst+ptr, out); + } +} + +#else // defined(__ARM_NEON) +int gf16pmul_clmul_available_neon = 0; +void gf16pmul_clmul_neon(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { + UNUSED(dst); 
UNUSED(src1); UNUSED(src2); UNUSED(len); +} +#endif diff --git a/gf16/gf16pmul_clmul_sve2.c b/gf16/gf16pmul_clmul_sve2.c new file mode 100644 index 00000000..b88fe2c2 --- /dev/null +++ b/gf16/gf16pmul_clmul_sve2.c @@ -0,0 +1,44 @@ +#include "gf16_global.h" +#include "gf16_clmul_sve2.h" + +#ifdef __ARM_FEATURE_SVE2 +int gf16pmul_clmul_available_sve2 = 1; + +void gf16pmul_clmul_sve2(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { + assert(len % svcntb()*2 == 0); + + const uint8_t* _src1 = (const uint8_t*)src1 + len; + const uint8_t* _src2 = (const uint8_t*)src2 + len; + uint8_t* _dst = (uint8_t*)dst + len; + + for(intptr_t ptr = -(intptr_t)len; ptr; ptr += svcntb()*2) { + svuint8x2_t data1 = svld2_u8(svptrue_b8(), _src1+ptr); + svuint8x2_t data2 = svld2_u8(svptrue_b8(), _src2+ptr); + svuint8_t low1 = svpmullb_pair_u8(svget2(data1, 0), svget2(data2, 0)); + svuint8_t low2 = svpmullt_pair_u8(svget2(data1, 0), svget2(data2, 0)); + svuint8_t dataMid1 = NOMASK(sveor_u8, svget2(data1, 0), svget2(data1, 1)); + svuint8_t dataMid2 = NOMASK(sveor_u8, svget2(data2, 0), svget2(data2, 1)); + svuint8_t mid1 = svpmullb_pair_u8(dataMid1, dataMid2); + svuint8_t mid2 = svpmullt_pair_u8(dataMid1, dataMid2); + svuint8_t high1 = svpmullb_pair_u8(svget2(data1, 1), svget2(data2, 1)); + svuint8_t high2 = svpmullt_pair_u8(svget2(data1, 1), svget2(data2, 1)); + + gf16_clmul_sve2_reduction(&low1, low2, mid1, mid2, &high1, high2); + svst2_u8(svptrue_b8(), _dst+ptr, svcreate2_u8(low1, high1)); + } +} + +unsigned gf16pmul_clmul_sve2_width() { + return svcntb(); +} + +#else // defined(__ARM_FEATURE_SVE2) +int gf16pmul_clmul_available_sve2 = 0; +void gf16pmul_clmul_sve2(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { + UNUSED(dst); UNUSED(src1); UNUSED(src2); UNUSED(len); +} + +unsigned gf16pmul_clmul_sve2_width() { + return 1; +} +#endif From b4e37d8b85ef9074d759b8b8dff0bfecb17f8538 Mon Sep 17 00:00:00 2001 From: animetosho Date: Fri, 16 Jun 
2023 11:54:16 +1000 Subject: [PATCH 21/91] Initial loop-tiling + striping for matrix inversion --- gf16/gfmat_inv.cpp | 232 ++++++++++++++++++++++++++------------------- gf16/gfmat_inv.h | 11 ++- 2 files changed, 145 insertions(+), 98 deletions(-) diff --git a/gf16/gfmat_inv.cpp b/gf16/gfmat_inv.cpp index fc2c97d1..d5874406 100644 --- a/gf16/gfmat_inv.cpp +++ b/gf16/gfmat_inv.cpp @@ -11,48 +11,61 @@ extern "C" uint16_t* gf16_recip; #include "gf16mul.h" template -int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, unsigned invalidCount, Galois16Mul& gf, void* gfScratch) { +int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, unsigned invalidCount, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs) { unsigned missingCol = validCount + rec; uint16_t baseCoeff; uint16_t coeff[rows]; - #define MAT_ROW(r) (mat + (r) * (stride / sizeof(uint16_t))) + unsigned sw16 = stripeWidth / sizeof(uint16_t); + // TODO: consider optimisation for numStripes == 1 ? + + + #define MAT_ROW(s, r) (mat + (((s)*invalidCount) + (r)) * sw16) + #define REPLACE_WORD(r, c, v) gf.replace_word(MAT_ROW((c)/sw16, r), (c)%sw16, v) void* srcRows[rows]; - srcRows[0] = MAT_ROW(rec); + srcRows[0] = MAT_ROW(0, rec); for(unsigned i=1; i= 2) { // multiply-add to the next row - MULADD_ROW(srcRows[1], 0); + MULADD_ROW(rec+1, 0); // scale it, and multiply-add back SCALE_ROW(1); if(rows > 2) { - MULADD_ROW_PF(srcRows[0], 1, srcRows[2]); - } else MULADD_LASTROW(srcRows[0], 1) + MULADD_ROW_PF(rec+0, 1, srcRows[2]); + } else MULADD_LASTROW(rec+0, 1) } else { - if(rec2 >= invalidCount) + if(recFirst >= invalidCount) return -1; } if(rows >= 3) { if(rows >= 4) { - MULADD_MULTI_ROW_PF(srcRows[2], 0, 2, srcRows[3]); + MULADD_MULTI_ROW_PF(rec+2, 0, 2, srcRows[3]); SCALE_ROW(2); - MULADD_MULTI_ROW(srcRows[3], 0, 2); - MULADD_ROW(srcRows[3], 2); + MULADD_MULTI_ROW(rec+3, 0, 2); + MULADD_ROW(rec+3, 2); SCALE_ROW(3); - MULADD_ROW(srcRows[2], 3); - MULADD_MULTI_ROW(srcRows[0], 2, 2); + 
MULADD_ROW(rec+2, 3); + MULADD_MULTI_ROW(rec+0, 2, 2); if(rows > 4) { - MULADD_MULTI_ROW_PF(srcRows[1], 2, 2, srcRows[4]); - } else MULADD_MULTI_LASTROW(srcRows[1], 2, 2) + MULADD_MULTI_ROW_PF(rec+1, 2, 2, srcRows[4]); + } else MULADD_MULTI_LASTROW(rec+1, 2, 2) } else { - MULADD_MULTI_ROW(srcRows[2], 0, 2); + MULADD_MULTI_ROW(rec+2, 0, 2); SCALE_ROW(2); - MULADD_ROW(srcRows[0], 2); - MULADD_LASTROW(srcRows[1], 2) + MULADD_ROW(rec+0, 2); + MULADD_LASTROW(rec+1, 2) } } if(rows >= 5) { if(rows >= 6) { - MULADD_MULTI_ROW_PF(srcRows[4], 0, 4, srcRows[5]); + MULADD_MULTI_ROW_PF(rec+4, 0, 4, srcRows[5]); SCALE_ROW(4); - MULADD_MULTI_ROW(srcRows[5], 0, 4); - MULADD_ROW(srcRows[5], 4); + MULADD_MULTI_ROW(rec+5, 0, 4); + MULADD_ROW(rec+5, 4); SCALE_ROW(5); - MULADD_ROW(srcRows[4], 5); + MULADD_ROW(rec+4, 5); for(unsigned r = 0; r < 3; r++) { - MULADD_MULTI_ROW(srcRows[r], 4, 2); + MULADD_MULTI_ROW(rec+r, 4, 2); } - MULADD_MULTI_LASTROW(srcRows[3], 4, 2) + MULADD_MULTI_LASTROW(rec+3, 4, 2) } else { - MULADD_MULTI_ROW(srcRows[4], 0, 4); + MULADD_MULTI_ROW(rec+4, 0, 4); SCALE_ROW(4); for(unsigned r = 0; r < 3; r++) { - MULADD_ROW(srcRows[r], 4); + MULADD_ROW(rec+r, 4); } - MULADD_LASTROW(srcRows[3], 4) + MULADD_LASTROW(rec+3, 4) } } // do main elimination, using the source group - while(1) { - uint16_t* row2 = MAT_ROW(rec2); - rec2++; - if(HEDLEY_UNLIKELY(rec2 == rec)) - rec2 += rows; - if(rows > 1) { - MULADD_MULTI_LASTROW(row2, 0, rows) + // first, gather all relevant coefficients + for(unsigned r=0; r 1) { + if(HEDLEY_LIKELY(pf)) + gf.mul_add_multi_stridepf(rows, stripeWidth, MAT_ROW(stripe, curRec2), MAT_ROW(stripe, rec), stripeWidth, rowCoeffs + curRec2*rows, gfScratch, pf); + else + gf.mul_add_multi(rows, stripeWidth*stripe, MAT_ROW(0, curRec2), srcRows, stripeWidth, rowCoeffs + curRec2*rows, gfScratch); + } else { + if(HEDLEY_LIKELY(pf)) + gf.mul_add_pf(MAT_ROW(stripe, curRec2), MAT_ROW(stripe, rec), stripeWidth, rowCoeffs[curRec2], gfScratch, pf); + else + 
gf.mul_add(MAT_ROW(stripe, curRec2), MAT_ROW(stripe, rec), stripeWidth, rowCoeffs[curRec2], gfScratch); + } } } + return -1; + #undef MAT_ROW + #undef REPLACE_WORD #undef SCALE_ROW #undef MULADD_ROW #undef MULADD_ROW_PF @@ -163,11 +203,12 @@ void Galois16RecMatrix::Construct(const std::vector& inputValid, unsigned unsigned validCol = 0; unsigned missingCol = validCount; unsigned recStart = 0; - unsigned stride16 = stride / sizeof(uint16_t); + unsigned sw16 = stripeWidth/sizeof(uint16_t); unsigned invalidCount = inputValid.size() - validCount; if(recovery.at(0) == 0) { // first recovery having exponent 0 is a common case - for(unsigned input = 0; input < inputValid.size(); input++) { - mat[input] = 1; + for(unsigned stripe=0; stripe& inputValid, unsigned for(unsigned i=0; i& inputValid, unsigned CONSTRUCT_VIA_EXP(uint16_t rec : recSkips); // ...then compute most of the rows via multiplication - lastExp = 1; - uint16_t* src1 = mat + recStart * stride16; - for(unsigned rec = recStart+1; rec < invalidCount; rec++) { - uint16_t exp = recovery.at(rec); - bool skip = (exp != lastExp+1); - lastExp = exp; - if(skip) continue; - - gf16pmul(mat + rec * stride16, src1, mat + (rec-1) * stride16, stride); + for(unsigned stripe=0; stripe& inputValid, unsigned #undef CONSTRUCT_VIA_EXP } +#define CEIL_DIV(a, b) (((a) + (b)-1) / (b)) +#define ROUND_DIV(a, b) (((a) + ((b)>>1)) / (b)) + bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned validCount, std::vector& recovery, std::function progressCb) { unsigned invalidCount = inputValid.size() - validCount; assert(validCount < inputValid.size()); // i.e. 
invalidCount > 0 @@ -261,15 +312,20 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va unsigned matWidth = inputValid.size() * sizeof(uint16_t); Galois16Mul gf(Galois16Mul::default_method(matWidth, inputValid.size(), inputValid.size(), true)); - stride = gf.alignToStride(matWidth); const auto gfInfo = gf.info(); + + // divide the matrix up into evenly sized stripes (for loop tiling optimisation) + numStripes = ROUND_DIV(matWidth, gfInfo.idealChunkSize); + if(numStripes < 1) numStripes = 1; + stripeWidth = gf.alignToStride(CEIL_DIV(matWidth, numStripes)); + numStripes = CEIL_DIV(matWidth, stripeWidth); + assert(numStripes >= 1); + void* gfScratch = gf.mutScratch_alloc(); if(mat) ALIGN_FREE(mat); - ALIGN_ALLOC(mat, invalidCount * stride, gfInfo.alignment); - - unsigned stride16 = stride / sizeof(uint16_t); - assert(stride16 * sizeof(uint16_t) == stride); + unsigned matSize = invalidCount * stripeWidth*numStripes; + ALIGN_ALLOC(mat, matSize, gfInfo.alignment); uint16_t totalProgress = invalidCount + (gf.needPrepare() ? 
3 : 1); // provision for prepare/finish/init-calc @@ -299,22 +355,17 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va if(progressCb) progressCb(1, totalProgress); progressOffset = 2; - for(unsigned r = 0; r < invalidCount; r++) { - uint16_t* row = mat + r * stride16; - //memset(row + matWidth, 0, stride - matWidth); // not necessary, but do this to avoid uninitialized memory - gf.prepare(row, row, stride); - } + gf.prepare(mat, mat, matSize); } // invert - // TODO: optimise: multi-thread + packed arrangement unsigned rec = 0; #define INVERT_GROUP(rows) \ if(gfInfo.idealInputMultiple >= rows && invalidCount >= rows) { \ for(; rec <= invalidCount-rows; rec+=rows) { \ if(progressCb) progressCb(rec + progressOffset, totalProgress); \ \ - int badRowOffset = processRow(rec, validCount, invalidCount, gf, gfScratch); \ + int badRowOffset = processRow(rec, validCount, invalidCount, gf, gfScratch, rowCoeffs); \ if(badRowOffset >= 0) { \ /* ignore this recovery row and try again */ \ recovery.erase(recovery.begin() + rec + badRowOffset); \ @@ -323,37 +374,28 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va } \ } // max out at 6 groups (registers + cache assoc?) + uint16_t* rowCoeffs = new uint16_t[invalidCount*6]; INVERT_GROUP(6) INVERT_GROUP(5) INVERT_GROUP(4) INVERT_GROUP(3) INVERT_GROUP(2) INVERT_GROUP(1) + delete[] rowCoeffs; #undef INVERT_GROUP // post transform if(gf.needPrepare()) { if(progressCb) progressCb(totalProgress-1, totalProgress); - for(unsigned r = 0; r < invalidCount; r++) { - uint16_t* row = mat + r * stride16; - gf.finish(row, stride); - - /* - // check for zeroes; TODO: does this need to be the full row? - for(unsigned col = validCount; col < inputValid.size(); col++) { - if(HEDLEY_UNLIKELY(row[col] == 0)) { // bad coeff - recovery.erase(recovery.begin() + r); - goto invert_loop; - } - } - */ - } + gf.finish(mat, matSize); + // TODO: check for zeroes?? 
} } // remove excess recovery recovery.resize(invalidCount); + numRec = invalidCount; gf.mutScratch_free(gfScratch); return true; diff --git a/gf16/gfmat_inv.h b/gf16/gfmat_inv.h index b5290900..bb0be942 100644 --- a/gf16/gfmat_inv.h +++ b/gf16/gfmat_inv.h @@ -9,17 +9,22 @@ class Galois16Mul; class Galois16RecMatrix { uint16_t* mat; - unsigned stride; + unsigned numStripes; + unsigned stripeWidth; + unsigned numRec; void Construct(const std::vector& inputValid, unsigned validCount, const std::vector& recovery); template - int processRow(unsigned rec, unsigned validCount, unsigned invalidCount, Galois16Mul& gf, void* gfScratch); + int processRow(unsigned rec, unsigned validCount, unsigned invalidCount, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs); public: Galois16RecMatrix() : mat(nullptr) {} ~Galois16RecMatrix(); bool Compute(const std::vector& inputValid, unsigned validCount, std::vector& recovery, std::function progressCb = nullptr); inline uint16_t GetFactor(uint16_t inIdx, uint16_t recIdx) const { - return mat[recIdx * stride/sizeof(uint16_t) + inIdx]; + // TODO: check if numStripes==1? consider optimising division? 
+ unsigned sw = stripeWidth/sizeof(uint16_t); + unsigned stripe = inIdx / sw; + return mat[stripe * numRec*sw + recIdx * sw + (inIdx % sw)]; } }; #endif From 54ee89f2139151cb196dd85574745e7f6003962c Mon Sep 17 00:00:00 2001 From: animetosho Date: Sat, 17 Jun 2023 18:29:19 +1000 Subject: [PATCH 22/91] Extract main invert loop to own function --- gf16/gfmat_inv.cpp | 119 ++++++++++++++++++++++++--------------------- gf16/gfmat_inv.h | 5 +- 2 files changed, 67 insertions(+), 57 deletions(-) diff --git a/gf16/gfmat_inv.cpp b/gf16/gfmat_inv.cpp index d5874406..bbd15dd4 100644 --- a/gf16/gfmat_inv.cpp +++ b/gf16/gfmat_inv.cpp @@ -10,8 +10,41 @@ extern "C" uint16_t* gf16_recip; #include "../src/platform.h" // for ALIGN_* #include "gf16mul.h" +#define MAT_ROW(s, r) (mat + (((s)*numRec) + (r)) * (stripeWidth / sizeof(uint16_t))) + template -int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, unsigned invalidCount, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs) { +void Galois16RecMatrix::invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, uint16_t* rowCoeffs, void* srcRows[rows], Galois16Mul& gf, void* gfScratch, const void* nextPf) { + for(unsigned stripe=stripeStart; stripe 1) { + if(HEDLEY_LIKELY(pf)) + gf.mul_add_multi_stridepf(rows, stripeWidth, MAT_ROW(stripe, curRec2), MAT_ROW(stripe, recSrc), stripeWidth, rowCoeffs + curRec2*rows, gfScratch, pf); + else + gf.mul_add_multi(rows, stripeWidth*stripe, MAT_ROW(0, curRec2), srcRows, stripeWidth, rowCoeffs + curRec2*rows, gfScratch); + } else { + if(HEDLEY_LIKELY(pf)) + gf.mul_add_pf(MAT_ROW(stripe, curRec2), MAT_ROW(stripe, recSrc), stripeWidth, rowCoeffs[curRec2], gfScratch, pf); + else + gf.mul_add(MAT_ROW(stripe, curRec2), MAT_ROW(stripe, recSrc), stripeWidth, rowCoeffs[curRec2], gfScratch); + } + } + } +} + +template +int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs) 
{ unsigned missingCol = validCount + rec; uint16_t baseCoeff; @@ -21,7 +54,6 @@ int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, unsigned in // TODO: consider optimisation for numStripes == 1 ? - #define MAT_ROW(s, r) (mat + (((s)*invalidCount) + (r)) * sw16) #define REPLACE_WORD(r, c, v) gf.replace_word(MAT_ROW((c)/sw16, r), (c)%sw16, v) void* srcRows[rows]; @@ -64,7 +96,7 @@ int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, unsigned in gf.mul_add_multi_stridepf(numRows, stripeWidth, MAT_ROW(stripe, rowDst), MAT_ROW(stripe, rec+srcOffs), stripeWidth, coeff, gfScratch, (uint8_t*)(rowPf) + stripe*stripeWidth) #define MULADD_LASTROW(rowDst, rowSrc) \ - if(HEDLEY_LIKELY(recFirst < invalidCount)) { \ + if(HEDLEY_LIKELY(recFirst < numRec)) { \ MULADD_ROW_PF(rowDst, rowSrc, MAT_ROW(0, recFirst)); \ } else { \ if(nextScaleRow) { \ @@ -75,7 +107,7 @@ int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, unsigned in return -1; \ } #define MULADD_MULTI_LASTROW(rowDst, srcOffs, numRows) \ - if(HEDLEY_LIKELY(recFirst < invalidCount)) { \ + if(HEDLEY_LIKELY(recFirst < numRec)) { \ MULADD_MULTI_ROW_PF(rowDst, srcOffs, numRows, MAT_ROW(0, recFirst)); \ } else { \ if(nextScaleRow) { \ @@ -88,7 +120,9 @@ int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, unsigned in unsigned recFirst = rec == 0 ? rows : 0; // the next row when `processRow` is called; last action will prefetch this row - uint16_t* nextScaleRow = (rec+rows < invalidCount) ? MAT_ROW(0, rec+rows) : nullptr; + uint16_t* nextScaleRow = (rec+rows < numRec) ? 
MAT_ROW(0, rec+rows) : nullptr; + + // TODO: consider loop tiling this stuff; requires extracting a small matrix (rows*rows), and solving that, which means a scalar multiply is necessary // rescale the row SCALE_ROW(0); @@ -103,7 +137,7 @@ int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, unsigned in MULADD_ROW_PF(rec+0, 1, srcRows[2]); } else MULADD_LASTROW(rec+0, 1) } else { - if(recFirst >= invalidCount) + if(recFirst >= numRec) return -1; } if(rows >= 3) { @@ -149,7 +183,7 @@ int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, unsigned in // do main elimination, using the source group // first, gather all relevant coefficients - for(unsigned r=0; r 1) { - if(HEDLEY_LIKELY(pf)) - gf.mul_add_multi_stridepf(rows, stripeWidth, MAT_ROW(stripe, curRec2), MAT_ROW(stripe, rec), stripeWidth, rowCoeffs + curRec2*rows, gfScratch, pf); - else - gf.mul_add_multi(rows, stripeWidth*stripe, MAT_ROW(0, curRec2), srcRows, stripeWidth, rowCoeffs + curRec2*rows, gfScratch); - } else { - if(HEDLEY_LIKELY(pf)) - gf.mul_add_pf(MAT_ROW(stripe, curRec2), MAT_ROW(stripe, rec), stripeWidth, rowCoeffs[curRec2], gfScratch, pf); - else - gf.mul_add(MAT_ROW(stripe, curRec2), MAT_ROW(stripe, rec), stripeWidth, rowCoeffs[curRec2], gfScratch); - } - } - } + invertLoop(0, numStripes, recFirst, numRec, rec, rowCoeffs, srcRows, gf, gfScratch, nextScaleRow); return -1; - #undef MAT_ROW #undef REPLACE_WORD #undef SCALE_ROW #undef MULADD_ROW @@ -196,6 +204,7 @@ int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, unsigned in #undef MULADD_LASTROW #undef MULADD_MULTI_LASTROW } +#undef MAT_ROW // construct initial matrix (pre-inversion) @@ -204,11 +213,10 @@ void Galois16RecMatrix::Construct(const std::vector& inputValid, unsigned unsigned missingCol = validCount; unsigned recStart = 0; unsigned sw16 = stripeWidth/sizeof(uint16_t); - unsigned invalidCount = inputValid.size() - validCount; if(recovery.at(0) == 0) { // first recovery having exponent 0 
is a common case for(unsigned stripe=0; stripe& inputValid, unsigned for(unsigned i=0; i& inputValid, unsigned for(; input < inputValid.size(); input++) { \ uint16_t inputLog = gfmat_input_log(input); \ unsigned targetCol = inputValid.at(input) ? validCol++ : missingCol++; \ - targetCol = (targetCol/sw16)*sw16*invalidCount + (targetCol%sw16); \ + targetCol = (targetCol/sw16)*sw16*numRec + (targetCol%sw16); \ for(loopcond) { \ mat[rec * sw16 + targetCol] = gfmat_coeff_from_log(inputLog, recovery.at(rec)); \ } \ @@ -257,11 +265,11 @@ void Galois16RecMatrix::Construct(const std::vector& inputValid, unsigned // there's a good chance that we have a mostly sequential sequence of recovery blocks // check this by looking for gaps in the sequence std::vector recSkips; - recSkips.reserve(invalidCount); + recSkips.reserve(numRec); recSkips.push_back(recStart); - unsigned maxSkips = invalidCount/2; // TODO: tune threshold + unsigned maxSkips = numRec/2; // TODO: tune threshold uint16_t lastExp = 1; - for(unsigned rec = recStart+1; rec < invalidCount; rec++) { + for(unsigned rec = recStart+1; rec < numRec; rec++) { uint16_t exp = recovery.at(rec); if(exp != lastExp+1) { recSkips.push_back(rec); @@ -277,9 +285,9 @@ void Galois16RecMatrix::Construct(const std::vector& inputValid, unsigned // ...then compute most of the rows via multiplication for(unsigned stripe=0; stripe& inputValid, unsigned } } - CONSTRUCT_VIA_EXP(unsigned rec = recStart; rec < invalidCount; rec++); + CONSTRUCT_VIA_EXP(unsigned rec = recStart; rec < numRec; rec++); #undef CONSTRUCT_VIA_EXP } @@ -302,12 +310,12 @@ void Galois16RecMatrix::Construct(const std::vector& inputValid, unsigned #define ROUND_DIV(a, b) (((a) + ((b)>>1)) / (b)) bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned validCount, std::vector& recovery, std::function progressCb) { - unsigned invalidCount = inputValid.size() - validCount; - assert(validCount < inputValid.size()); // i.e. 
invalidCount > 0 + numRec = inputValid.size() - validCount; + assert(validCount < inputValid.size()); // i.e. numRec > 0 assert(inputValid.size() <= 32768 && inputValid.size() > 0); assert(recovery.size() <= 65535 && recovery.size() > 0); - if(invalidCount > recovery.size()) return false; + if(numRec > recovery.size()) return false; unsigned matWidth = inputValid.size() * sizeof(uint16_t); @@ -324,10 +332,10 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va void* gfScratch = gf.mutScratch_alloc(); if(mat) ALIGN_FREE(mat); - unsigned matSize = invalidCount * stripeWidth*numStripes; + unsigned matSize = numRec * stripeWidth*numStripes; ALIGN_ALLOC(mat, matSize, gfInfo.alignment); - uint16_t totalProgress = invalidCount + (gf.needPrepare() ? 3 : 1); // provision for prepare/finish/init-calc + uint16_t totalProgress = numRec + (gf.needPrepare() ? 3 : 1); // provision for prepare/finish/init-calc // easier to handle if exponents are in order std::sort(recovery.begin(), recovery.end()); @@ -339,7 +347,7 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va } invert_loop: { // loop, in the unlikely case we hit the PAR2 un-invertability flaw; TODO: is there a faster way than just retrying? 
- if(invalidCount > recovery.size()) { // not enough recovery + if(numRec > recovery.size()) { // not enough recovery gf.mutScratch_free(gfScratch); ALIGN_FREE(mat); mat = nullptr; @@ -361,11 +369,11 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va // invert unsigned rec = 0; #define INVERT_GROUP(rows) \ - if(gfInfo.idealInputMultiple >= rows && invalidCount >= rows) { \ - for(; rec <= invalidCount-rows; rec+=rows) { \ + if(gfInfo.idealInputMultiple >= rows && numRec >= rows) { \ + for(; rec <= numRec-rows; rec+=rows) { \ if(progressCb) progressCb(rec + progressOffset, totalProgress); \ \ - int badRowOffset = processRow(rec, validCount, invalidCount, gf, gfScratch, rowCoeffs); \ + int badRowOffset = processRow(rec, validCount, gf, gfScratch, rowCoeffs); \ if(badRowOffset >= 0) { \ /* ignore this recovery row and try again */ \ recovery.erase(recovery.begin() + rec + badRowOffset); \ @@ -374,7 +382,7 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va } \ } // max out at 6 groups (registers + cache assoc?) 
- uint16_t* rowCoeffs = new uint16_t[invalidCount*6]; + uint16_t* rowCoeffs = new uint16_t[numRec*6]; INVERT_GROUP(6) INVERT_GROUP(5) INVERT_GROUP(4) @@ -394,8 +402,7 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va } // remove excess recovery - recovery.resize(invalidCount); - numRec = invalidCount; + recovery.resize(numRec); gf.mutScratch_free(gfScratch); return true; diff --git a/gf16/gfmat_inv.h b/gf16/gfmat_inv.h index bb0be942..489f3dc3 100644 --- a/gf16/gfmat_inv.h +++ b/gf16/gfmat_inv.h @@ -14,8 +14,11 @@ class Galois16RecMatrix { unsigned numRec; void Construct(const std::vector& inputValid, unsigned validCount, const std::vector& recovery); + + template + void invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, uint16_t* rowCoeffs, void* srcRows[rows], Galois16Mul& gf, void* gfScratch, const void* nextPf); template - int processRow(unsigned rec, unsigned validCount, unsigned invalidCount, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs); + int processRow(unsigned rec, unsigned validCount, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs); public: Galois16RecMatrix() : mat(nullptr) {} ~Galois16RecMatrix(); From 7f98988e17c5283dd249f1ced43cc7a8ec3e8279 Mon Sep 17 00:00:00 2001 From: animetosho Date: Sun, 18 Jun 2023 16:34:11 +1000 Subject: [PATCH 23/91] Add basic threading support to matrix inversion --- gf16/gfmat_inv.cpp | 164 ++++++++++++++++++++++++++++++++++++++++++--- gf16/gfmat_inv.h | 12 ++-- 2 files changed, 162 insertions(+), 14 deletions(-) diff --git a/gf16/gfmat_inv.cpp b/gf16/gfmat_inv.cpp index bbd15dd4..dd876676 100644 --- a/gf16/gfmat_inv.cpp +++ b/gf16/gfmat_inv.cpp @@ -9,17 +9,64 @@ extern "C" uint16_t* gf16_recip; #include #include "../src/platform.h" // for ALIGN_* #include "gf16mul.h" +#include "threadqueue.h" +#include + +static const unsigned MIN_THREAD_REC = 10; // minimum number of rows to process on a thread + +class 
Galois16RecMatrixWorker { + const Galois16Mul& gf; +public: + MessageThread thread; + void* gfScratch; + + explicit Galois16RecMatrixWorker(const Galois16Mul& _gf) : gf(_gf) { + gfScratch = _gf.mutScratch_alloc(); + } + Galois16RecMatrixWorker(Galois16RecMatrixWorker&& other) noexcept : gf(other.gf) { + thread = std::move(other.thread); + gfScratch = other.gfScratch; + } + ~Galois16RecMatrixWorker() { + thread.end(); + gf.mutScratch_free(gfScratch); + } +}; + +struct Galois16RecMatrixWorkerMessage { + unsigned stripeStart, stripeEnd; + unsigned recFirst, recLast; + unsigned recSrc; uint16_t* rowCoeffs; void** srcRows; Galois16Mul* gf; void* gfScratch; + void(Galois16RecMatrix::*fn)(unsigned, unsigned, unsigned, unsigned, unsigned, uint16_t*, void**, Galois16Mul&, void*, const void*); + Galois16RecMatrix* parent; + std::atomic* procRefs; + std::promise* done; +}; + +static void invert_worker(ThreadMessageQueue& q) { + Galois16RecMatrixWorkerMessage* req; + while((req = static_cast(q.pop())) != NULL) { + (req->parent->*(req->fn))(req->stripeStart, req->stripeEnd, req->recFirst, req->recLast, req->recSrc, req->rowCoeffs, req->srcRows, *(req->gf), req->gfScratch, nullptr); + if(req->procRefs->fetch_sub(1, std::memory_order_acq_rel) <= 1) { + req->done->set_value(); + } + delete req; + } +} #define MAT_ROW(s, r) (mat + (((s)*numRec) + (r)) * (stripeWidth / sizeof(uint16_t))) +#define CEIL_DIV(a, b) (((a) + (b)-1) / (b)) +#define ROUND_DIV(a, b) (((a) + ((b)>>1)) / (b)) template -void Galois16RecMatrix::invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, uint16_t* rowCoeffs, void* srcRows[rows], Galois16Mul& gf, void* gfScratch, const void* nextPf) { +void Galois16RecMatrix::invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, uint16_t* rowCoeffs, void** srcRows, Galois16Mul& gf, void* gfScratch, const void* nextPf) { for(unsigned stripe=stripeStart; stripe -int 
Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs) { +int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs, std::vector& workers) { unsigned missingCol = validCount + rec; uint16_t baseCoeff; @@ -191,7 +238,85 @@ int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, Galois16Mul rowCoeffs[r*rows + c] = REPLACE_WORD(r, missingCol+c, 0); } } - invertLoop(0, numStripes, recFirst, numRec, rec, rowCoeffs, srcRows, gf, gfScratch, nextScaleRow); + if(workers.empty()) + // process elimination directly + invertLoop(0, numStripes, recFirst, numRec, rec, rowCoeffs, srcRows, gf, gfScratch, nextScaleRow); + else { + // process using workers + std::atomic procRefs; + std::promise done; + auto makeReq = [&, this]() -> Galois16RecMatrixWorkerMessage* { + auto* req = new Galois16RecMatrixWorkerMessage; + req->recFirst = recFirst; + req->recLast = numRec; + req->recSrc = rec; + req->rowCoeffs = rowCoeffs; + req->srcRows = srcRows; + req->gf = &gf; + req->fn = &Galois16RecMatrix::invertLoop; + req->parent = this; + req->procRefs = &procRefs; + req->done = &done; + return req; + }; + if(numStripes >= workers.size()) { // split full stripes across workers + float stripesPerWorker = (float)numStripes / workers.size(); + float stripe = 0.5; + procRefs.store(workers.size()); + for(auto& worker : workers) { + auto* req = makeReq(); + req->stripeStart = (unsigned)stripe; + req->stripeEnd = (unsigned)(stripe + stripesPerWorker); + req->gfScratch = worker.gfScratch; + worker.thread.send(req); + stripe += stripesPerWorker; + } + } else { // each stripe may need >1 worker + std::vector reqs; + reqs.reserve(workers.size()); + float workersPerStripe = (float)workers.size() / numStripes; + float workerCnt = 0.5; + for(unsigned stripe=0; stripe rec && rowPos <= rec) + // need to send extra to compensate for the gap + sendRows += rows; + 
if(rowPos+sendRows > numRec) + sendRows = numRec - rowPos; + + auto* req = makeReq(); + req->stripeStart = stripe; + req->stripeEnd = stripe+1; + req->recFirst = rowPos; + req->recLast = rowPos+sendRows; + reqs.push_back(req); + + rowPos += sendRows; + if(rowPos == rec) rowPos += rows; + } + + workerCnt += workersPerStripe; + } + assert(reqs.size() <= workers.size()); + procRefs.store(reqs.size()); + + for(unsigned i=0; igfScratch = worker.gfScratch; + worker.thread.send(req); + } + } + + // wait for threads to finish + done.get_future().wait(); + } return -1; @@ -306,9 +431,6 @@ void Galois16RecMatrix::Construct(const std::vector& inputValid, unsigned #undef CONSTRUCT_VIA_EXP } -#define CEIL_DIV(a, b) (((a) + (b)-1) / (b)) -#define ROUND_DIV(a, b) (((a) + ((b)>>1)) / (b)) - bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned validCount, std::vector& recovery, std::function progressCb) { numRec = inputValid.size() - validCount; assert(validCount < inputValid.size()); // i.e. 
numRec > 0 @@ -329,7 +451,6 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va numStripes = CEIL_DIV(matWidth, stripeWidth); assert(numStripes >= 1); - void* gfScratch = gf.mutScratch_alloc(); if(mat) ALIGN_FREE(mat); unsigned matSize = numRec * stripeWidth*numStripes; @@ -346,9 +467,24 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va setup_pmul(); } + std::vector workers; + void* gfScratch; + unsigned _numThreads = numThreads; + if(numRec < MIN_THREAD_REC) _numThreads = 1; // don't spawn threads if not enough work + if(_numThreads > 1) { + for(unsigned i=0; i<_numThreads; i++) { + workers.push_back(Galois16RecMatrixWorker(gf)); + workers[i].thread.name = "gauss_worker"; + workers[i].thread.setCallback(invert_worker); + } + gfScratch = nullptr; // ...otherwise MSVC won't be happy + } else + gfScratch = gf.mutScratch_alloc(); + invert_loop: { // loop, in the unlikely case we hit the PAR2 un-invertability flaw; TODO: is there a faster way than just retrying? 
if(numRec > recovery.size()) { // not enough recovery - gf.mutScratch_free(gfScratch); + if(_numThreads <= 1) + gf.mutScratch_free(gfScratch); ALIGN_FREE(mat); mat = nullptr; return false; @@ -373,7 +509,7 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va for(; rec <= numRec-rows; rec+=rows) { \ if(progressCb) progressCb(rec + progressOffset, totalProgress); \ \ - int badRowOffset = processRow(rec, validCount, gf, gfScratch, rowCoeffs); \ + int badRowOffset = processRow(rec, validCount, gf, gfScratch, rowCoeffs, workers); \ if(badRowOffset >= 0) { \ /* ignore this recovery row and try again */ \ recovery.erase(recovery.begin() + rec + badRowOffset); \ @@ -404,10 +540,18 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va // remove excess recovery recovery.resize(numRec); - gf.mutScratch_free(gfScratch); + if(_numThreads <= 1) + gf.mutScratch_free(gfScratch); return true; } +Galois16RecMatrix::Galois16RecMatrix() : mat(nullptr) { + numThreads = hardware_concurrency(); + numRec = 0; + numStripes = 0; + stripeWidth = 0; +} + Galois16RecMatrix::~Galois16RecMatrix() { if(mat) ALIGN_FREE(mat); } diff --git a/gf16/gfmat_inv.h b/gf16/gfmat_inv.h index 489f3dc3..79e84795 100644 --- a/gf16/gfmat_inv.h +++ b/gf16/gfmat_inv.h @@ -7,21 +7,25 @@ #ifdef PARPAR_INVERT_SUPPORT class Galois16Mul; +class Galois16RecMatrixWorker; class Galois16RecMatrix { uint16_t* mat; unsigned numStripes; unsigned stripeWidth; unsigned numRec; - + unsigned numThreads; void Construct(const std::vector& inputValid, unsigned validCount, const std::vector& recovery); template - void invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, uint16_t* rowCoeffs, void* srcRows[rows], Galois16Mul& gf, void* gfScratch, const void* nextPf); + void invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, uint16_t* rowCoeffs, void** srcRows, Galois16Mul& gf, 
void* gfScratch, const void* nextPf); template - int processRow(unsigned rec, unsigned validCount, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs); + int processRow(unsigned rec, unsigned validCount, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs, std::vector& workers); public: - Galois16RecMatrix() : mat(nullptr) {} + Galois16RecMatrix(); ~Galois16RecMatrix(); + void setNumThreads(int threads) { + numThreads = threads; + } bool Compute(const std::vector& inputValid, unsigned validCount, std::vector& recovery, std::function progressCb = nullptr); inline uint16_t GetFactor(uint16_t inIdx, uint16_t recIdx) const { // TODO: check if numStripes==1? consider optimising division? From a1efeb3ed8d53adcdc21a3ac254042654526e899 Mon Sep 17 00:00:00 2001 From: animetosho Date: Mon, 17 Jul 2023 21:51:36 +1000 Subject: [PATCH 24/91] Fix missing CpuCap on non-x86/ARM platforms Ref animetosho/par2cmdline-turbo#12 --- gf16/gf16mul.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/gf16/gf16mul.cpp b/gf16/gf16mul.cpp index 292805b3..5d6a33c4 100644 --- a/gf16/gf16mul.cpp +++ b/gf16/gf16mul.cpp @@ -1263,12 +1263,12 @@ void Galois16Mul::mutScratch_free(void* mutScratch) const { } Galois16Methods Galois16Mul::default_method(size_t regionSizeHint, unsigned inputs, unsigned /*outputs*/, bool forInvert) { - const CpuCap caps(true); (void)regionSizeHint; (void)inputs; (void)forInvert; #ifdef PLATFORM_X86 + const CpuCap caps(true); if(caps.hasGFNI) { if(gf16_affine_available_avx512 && caps.hasAVX512VLBW) return GF16_AFFINE_AVX512; @@ -1306,6 +1306,7 @@ Galois16Methods Galois16Mul::default_method(size_t regionSizeHint, unsigned inpu return GF16_XOR_SSE2; #endif #ifdef PLATFORM_ARM + const CpuCap caps(true); if(caps.hasSVE2) { if(gf16_sve_get_size() >= 64) return GF16_SHUFFLE_512_SVE2; @@ -1335,8 +1336,8 @@ std::vector Galois16Mul::availableMethods(bool checkCpuid) { if(gf16_lookup3_stride()) ret.push_back(GF16_LOOKUP3); - const CpuCap caps(checkCpuid); 
#ifdef PLATFORM_X86 + const CpuCap caps(checkCpuid); if(gf16_shuffle_available_ssse3 && caps.hasSSSE3) ret.push_back(GF16_SHUFFLE_SSSE3); if(gf16_shuffle_available_avx && caps.hasAVX) @@ -1384,6 +1385,7 @@ std::vector Galois16Mul::availableMethods(bool checkCpuid) { } #endif #ifdef PLATFORM_ARM + const CpuCap caps(checkCpuid); if(gf16_available_neon && caps.hasNEON) { ret.push_back(GF16_SHUFFLE_NEON); ret.push_back(GF16_CLMUL_NEON); From 69e3f0c5490e334316facbf915b323450626a4d0 Mon Sep 17 00:00:00 2001 From: animetosho Date: Mon, 17 Jul 2023 22:27:19 +1000 Subject: [PATCH 25/91] Warning suppression for non-x86/ARM build --- hasher/hasher.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hasher/hasher.cpp b/hasher/hasher.cpp index 7caf79d0..cff2ac4a 100644 --- a/hasher/hasher.cpp +++ b/hasher/hasher.cpp @@ -26,6 +26,7 @@ void setup_hasher() { CRC32_Calc = &CRC32_Calc_Slice4; struct _CpuCap CpuCap; + (void)CpuCap; // CPU detection #ifdef PLATFORM_X86 @@ -222,6 +223,7 @@ void set_hasherMD5MultiLevel(MD5MultiLevels level) { case MD5MULT_AVX2: case MD5MULT_SSE: #endif + default: case MD5MULT_SCALAR: break; } #undef SET_LEVEL From d75bfad8ebe7db364818340654830a18790edba7 Mon Sep 17 00:00:00 2001 From: animetosho Date: Mon, 17 Jul 2023 22:44:58 +1000 Subject: [PATCH 26/91] Use non-GNUC compatible inline-asm declaration --- gf16/gf16_xor_common.h | 10 +++++----- gf16/gf16_xor_sse2.c | 2 +- hasher/md5-arm-asm.h | 2 +- hasher/md5-arm64-asm.h | 14 +++++++------- hasher/md5-avx512-asm.h | 8 ++++---- hasher/md5-scalar-base.h | 6 +++--- hasher/md5-x86-asm.h | 8 ++++---- hasher/md5x2-arm-asm.h | 2 +- hasher/md5x2-neon-asm.h | 4 ++-- hasher/md5x2-sse-asm.h | 22 +++++++++++----------- hasher/md5x2-x86-asm.h | 20 ++++++++++---------- 11 files changed, 49 insertions(+), 49 deletions(-) diff --git a/gf16/gf16_xor_common.h b/gf16/gf16_xor_common.h index 80877440..8c987ad3 100644 --- a/gf16/gf16_xor_common.h +++ b/gf16/gf16_xor_common.h @@ -37,7 +37,7 @@ extern void 
gf16_xor256_jit_multi_stub(intptr_t dst, intptr_t dstEnd, const void # endif static HEDLEY_ALWAYS_INLINE void gf16_xor_jit_stub(intptr_t src, intptr_t dEnd, intptr_t dest, intptr_t pf, void* fn) { WRITE_JIT(2048) - asm volatile( + __asm__ volatile( "callq *%q[f]\n" : "+a"(src), "+d"(dest), "+S"(pf) : "c"(dEnd), [f]"r"(fn) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" @@ -46,7 +46,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_xor_jit_stub(intptr_t src, intptr_t dEnd, # ifdef __AVX2__ static HEDLEY_ALWAYS_INLINE void gf16_xor256_jit_stub(intptr_t src, intptr_t dEnd, intptr_t dest, intptr_t pf, void* fn) { WRITE_JIT(2048) - asm volatile( + __asm__ volatile( "callq *%q[f]\n" : "+a"(src), "+d"(dest), "+S"(pf) : "c"(dEnd), [f]"r"(fn) : "memory" // GCC pre 4.9 doesn't accept YMM registers @@ -59,7 +59,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_xor256_jit_stub(intptr_t src, intptr_t dEn # ifdef __AVX512F__ static HEDLEY_ALWAYS_INLINE void gf16_xor512_jit_stub(intptr_t src, intptr_t dEnd, intptr_t dest, intptr_t pf, void* fn) { WRITE_JIT(2048) - asm volatile( + __asm__ volatile( "callq *%q[f]\n" : "+a"(src), "+d"(dest), "+S"(pf) : "c"(dEnd), [f]"r"(fn) : "%zmm1", "%zmm2", "%zmm3", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31", "memory" @@ -69,7 +69,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_xor512_jit_multi_stub( intptr_t dst, intptr_t dstEnd, const void** src, void* fn ) { WRITE_JIT(8192) - asm volatile( + __asm__ volatile( "movq 8(%%rdx), %%rsi\n" "movq 16(%%rdx), %%rdi\n" "movq 24(%%rdx), %%r8\n" @@ -102,7 +102,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_xor_jit_stub(intptr_t src, intptr_t dEnd, } # else static HEDLEY_ALWAYS_INLINE void gf16_xor_jit_stub(intptr_t src, intptr_t dEnd, intptr_t dest, intptr_t pf, void* fn) { - asm volatile( + __asm__ 
volatile( "calll *%[f]\n" : "+a"(src), "+d"(dest), "+S"(pf) : "c"(dEnd), [f]"r"(fn) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "memory" diff --git a/gf16/gf16_xor_sse2.c b/gf16/gf16_xor_sse2.c index 1f428c6c..bce368ec 100644 --- a/gf16/gf16_xor_sse2.c +++ b/gf16/gf16_xor_sse2.c @@ -262,7 +262,7 @@ static HEDLEY_ALWAYS_INLINE void STOREU_XMM(void* dest, __m128i xmm) { /* conditional move, because, for whatever reason, no-one thought of making a CMOVcc intrinsic */ #if defined(__GNUC__) || defined(__clang__) - #define CMOV(cond, dst, src) asm( \ + #define CMOV(cond, dst, src) __asm__( \ "test %[c], %[c]\n" \ "cmovnz %[s], %[d]\n" \ : [d]"+r"(dst): [c]"r"(cond), [s]"r"(src)) diff --git a/hasher/md5-arm-asm.h b/hasher/md5-arm-asm.h index 20e53e7b..66996b76 100644 --- a/hasher/md5-arm-asm.h +++ b/hasher/md5-arm-asm.h @@ -142,7 +142,7 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_scalar(uint32_t* HEDLEY_RESTR D = state[3]; #endif - asm( + __asm__( "ldr " REG(TMP2) ", [%[in]]\n" REV(TMP2) #ifdef ARM_THUMB_LIMIT_REGS diff --git a/hasher/md5-arm64-asm.h b/hasher/md5-arm64-asm.h index 48180964..695a4e00 100644 --- a/hasher/md5-arm64-asm.h +++ b/hasher/md5-arm64-asm.h @@ -106,7 +106,7 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_scalar(uint32_t* HEDLEY_RESTR #define RF4(i0, i1, i2, i3, i4, i5, kr) \ - asm( \ + __asm__( \ ROUND_F(A, A, B, C, D, "%w[cache0]", "k0", "lsr %[k0], %[k0], #32", 25, "ldp %w[cache2], %w[cache3], " LDP_SRC(2)) \ ROUND_F(D, D, A, B, C, "%w[cache1]", "k0", "", 20, "") \ ROUND_F(C, C, D, A, B, "%w[cache2]", "k1", "lsr %[k1], %[k1], #32", 15, "ldp %w[cache4], %w[cache5], " LDP_SRC(4)) \ @@ -117,7 +117,7 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_scalar(uint32_t* HEDLEY_RESTR :); #define RG4(i0, i1, i2, i3, kr) \ - asm( \ + __asm__( \ ROUND_G(A, B, C, D, "%w[cache0]", "k0", "lsr %[k0], %[k0], #32", 27) \ ROUND_G(D, A, B, C, "%w[cache1]", "k0", "", 23) \ ROUND_G(C, D, A, B, "%w[cache2]", "k1", "lsr 
%[k1], %[k1], #32", 18) \ @@ -127,7 +127,7 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_scalar(uint32_t* HEDLEY_RESTR :); #define RH4(i0, i1, i2, i3, kr) \ - asm( \ + __asm__( \ ROUND_H(A, B, C, D, "%w[cache0]", "k0", "lsr %[k0], %[k0], #32", 28) \ ROUND_H(D, A, B, C, "%w[cache1]", "k0", "", 21) \ ROUND_H(C, D, A, B, "%w[cache2]", "k1", "lsr %[k1], %[k1], #32", 16) \ @@ -137,7 +137,7 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_scalar(uint32_t* HEDLEY_RESTR :); #define RI4(i0, i1, i2, i3, kr) \ - asm( \ + __asm__( \ ROUND_I(A, B, C, D, "%w[cache0]", "k0", "lsr %[k0], %[k0], #32", 26) \ ROUND_I(D, A, B, C, "%w[cache1]", "k0", "", 22) \ ROUND_I(C, D, A, B, "%w[cache2]", "k1", "lsr %[k1], %[k1], #32", 17) \ @@ -146,7 +146,7 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_scalar(uint32_t* HEDLEY_RESTR : [kM]"r"(md5_constants_aarch64), [cache0]"r"(cache[i0]), [cache1]"r"(cache[i1]), [cache2]"r"(cache[i2]), [cache3]"r"(cache[i3]) \ :); - asm( + __asm__( "ldp %w[cache0], %w[cache1], " LDP_SRC(0) "\n" "ldp %[k0], %[k1], [%[kM]]\n" ROUND_F(IA, A, IB, IC, ID, "%w[cache0]", "k0", "lsr %[k0], %[k0], #32", 25, "ldp %w[cache2], %w[cache3], " LDP_SRC(2)) @@ -162,7 +162,7 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_scalar(uint32_t* HEDLEY_RESTR RF4( 4, 5, 6, 7, 8, 9, 32) RF4( 8, 9, 10, 11, 12, 13, 48) - asm( + __asm__( ROUND_F(A, A, B, C, D, "%w[cache0]", "k0", "lsr %[k0], %[k0], #32", 25, "ldp %w[cache2], %w[cache3], " LDP_SRC(14)) ROUND_F(D, D, A, B, C, "%w[cache1]", "k0", "", 20, "") ROUND_F(C, C, D, A, B, "%w[cache2]", "k1", "lsr %[k1], %[k1], #32", 15, "") @@ -187,7 +187,7 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_scalar(uint32_t* HEDLEY_RESTR RI4( 7, 14, 5,12, 208) RI4( 3, 10, 1, 8, 224) RI4( 15, 6, 13, 4, 240) - asm( + __asm__( ROUND_I(A, B, C, D, "%w[cache0]", "k0", "lsr %[k0], %[k0], #32", 26) ROUND_I(D, A, B, C, "%w[cache1]", "k0", "", 22) ROUND_I(C, D, A, B, "%w[cache2]", "k1", "lsr %[k1], %[k1], #32", 17) diff --git 
a/hasher/md5-avx512-asm.h b/hasher/md5-avx512-asm.h index 9a625df5..1eb0e484 100644 --- a/hasher/md5-avx512-asm.h +++ b/hasher/md5-avx512-asm.h @@ -113,7 +113,7 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_avx512(uint32_t* HEDLEY_RESTR "vprord $" STR(R) ", %[" STR(A) "], %[" STR(A) "]\n" \ "vpaddd %[" STR(B) "], %[" STR(A) "], %[" STR(A) "]\n" - asm( + __asm__( "vmovdqa %[ID], %[TMP2]\n" RF4_FIRST(0) RF4(4) @@ -132,7 +132,7 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_avx512(uint32_t* HEDLEY_RESTR : [k0]"m"(md5_constants_avx512[n]), [k1]"m"(md5_constants_avx512[n+4]), [k2]"m"(md5_constants_avx512[n+8]), [k3]"m"(md5_constants_avx512[n+12]) \ : - asm( + __asm__( "vpaddd %[k0], %[in0], %[in0]\n" "vpaddd %[k1], %[in4], %[in4]\n" "vpaddd %[k2], %[in8], %[in8]\n" @@ -143,7 +143,7 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_avx512(uint32_t* HEDLEY_RESTR RG4("%[in12]", "%[in0]", "%[in4]") : ASM_PARAMS(16)); - asm( + __asm__( "vpaddd %[k1], %[in4], %[in4]\n" "vpsrlq $32, %[in4], %[TMP1]\n" @@ -168,7 +168,7 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_avx512(uint32_t* HEDLEY_RESTR "vmovdqa %[D], %[TMP2]\n" : ASM_PARAMS(32)); - asm( + __asm__( "vpaddd %[k0], %[in0], %[in0]\n" "vpaddd %[k1], %[in4], %[in4]\n" "vpaddd %[k3], %[in12], %[in12]\n" diff --git a/hasher/md5-scalar-base.h b/hasher/md5-scalar-base.h index a05c4fcb..df2823b6 100644 --- a/hasher/md5-scalar-base.h +++ b/hasher/md5-scalar-base.h @@ -38,7 +38,7 @@ */ # if defined(__i386) || defined(__i386__) || defined(__x86_64) || defined(__x86_64__) # define ROTATE(a,n) ({ unsigned int ret; \ - asm ( \ + __asm__ ( \ "roll %1,%0" \ : "=r"(ret) \ : "I"(n), "0"((unsigned int)(a)) \ @@ -48,7 +48,7 @@ # elif defined(_ARCH_PPC) || defined(_ARCH_PPC64) || \ defined(__powerpc) || defined(__ppc__) || defined(__powerpc64__) # define ROTATE(a,n) ({ unsigned int ret; \ - asm ( \ + __asm__ ( \ "rlwinm %0,%1,%2,0,31" \ : "=r"(ret) \ : "r"(a), "I"(n)); \ @@ -56,7 +56,7 @@ }) # elif defined(__s390x__) 
# define ROTATE(a,n) ({ unsigned int ret; \ - asm ("rll %0,%1,%2" \ + __asm__ ("rll %0,%1,%2" \ : "=r"(ret) \ : "r"(a), "I"(n)); \ ret; \ diff --git a/hasher/md5-x86-asm.h b/hasher/md5-x86-asm.h index 6aca35ba..17499bdd 100644 --- a/hasher/md5-x86-asm.h +++ b/hasher/md5-x86-asm.h @@ -221,7 +221,7 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_scalar(uint32_t* HEDLEY_RESTR #endif - asm( + __asm__( #ifdef PLATFORM_AMD64 "movl %[input0], %k[TMP2]\n" "movl %k[ID], %k[TMP1]\n" @@ -249,7 +249,7 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_scalar(uint32_t* HEDLEY_RESTR ASM_INPUTS :); - asm( + __asm__( RG4( 6, 11, 0, 5, -0x09e1da9e, -0x3fbf4cc0, 0x265e5a51, -0x16493856) RG4(10, 15, 4, 9, -0x29d0efa3, 0x02441453, -0x275e197f, -0x182c0438) RG4(14, 3, 8, 13, 0x21e1cde6, -0x3cc8f82a, -0x0b2af279, 0x455a14ed) @@ -429,7 +429,7 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_nolea(uint32_t* HEDLEY_RESTRI D = state[3]; - asm( + __asm__( "addl %[input0], %k[A]\n" "movl %k[D], %k[TMP1]\n" RF4(, 1, 2, 3, 4, -0x28955b88, -0x173848aa, 0x242070db, -0x3e423112) @@ -444,7 +444,7 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_nolea(uint32_t* HEDLEY_RESTRI : ASM_INPUTS :); - asm( + __asm__( RG4( 6, 11, 0, 5, -0x09e1da9e, -0x3fbf4cc0, 0x265e5a51, -0x16493856) RG4(10, 15, 4, 9, -0x29d0efa3, 0x02441453, -0x275e197f, -0x182c0438) RG4(14, 3, 8, 13, 0x21e1cde6, -0x3cc8f82a, -0x0b2af279, 0x455a14ed) diff --git a/hasher/md5x2-arm-asm.h b/hasher/md5x2-arm-asm.h index f931609d..f01e5195 100644 --- a/hasher/md5x2-arm-asm.h +++ b/hasher/md5x2-arm-asm.h @@ -176,7 +176,7 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_x2_scalar(uint32_t* state, co ROUND_I(C1, D1, A1, B1, C2, D2, A2, B2, "[%[i0], #" STR(i2) "]", "[%[i1], #" STR(i2) "]", k2l, k2h, 17) \ ROUND_I(B1, C1, D1, A1, B2, C2, D2, A2, "[%[i0], #" STR(i3) "]", "[%[i1], #" STR(i3) "]", k3l, k3h, 11) - asm( + __asm__( "ldr " REG(TMP1) ", [%[i0]]\n" "ldr " REG(TMP2) ", [%[i1]]\n" #if __BYTE_ORDER__ == 
__ORDER_BIG_ENDIAN__ diff --git a/hasher/md5x2-neon-asm.h b/hasher/md5x2-neon-asm.h index 1789ab5a..e55e7cad 100644 --- a/hasher/md5x2-neon-asm.h +++ b/hasher/md5x2-neon-asm.h @@ -128,7 +128,7 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_x2_neon(uint32x2_t* state, co ROUND_I(C, D, A, B, "v18", 17, 15) \ ROUND_I(B, C, D, A, "v19", 11, 21) - asm( + __asm__( "ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%[i0]]\n" "ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [%[i1]]\n" "zip1 v24.4s, v20.4s, v28.4s\n" @@ -237,7 +237,7 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_x2_neon(uint32x2_t* state, co ROUND_I(C, D, A, B, "d14", 17, 15) \ ROUND_I(B, C, D, A, "d15", 11, 21) - asm( + __asm__( "vld1.8 {d16-d19}, [%[i0]]\n" "add r4, %[i0], #32\n" "vld1.8 {d24-d27}, [%[i1]]\n" diff --git a/hasher/md5x2-sse-asm.h b/hasher/md5x2-sse-asm.h index d9391698..3516e5dd 100644 --- a/hasher/md5x2-sse-asm.h +++ b/hasher/md5x2-sse-asm.h @@ -152,7 +152,7 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_x2_sse(__m128i* state, const "psrlq $" STR(R) ", %[" STR(A) "]\n" \ "paddd %[" STR(B) "], %[" STR(A) "]\n" -#define RF4(offs, r1, r2) asm( \ +#define RF4(offs, r1, r2) __asm__( \ READ4 \ ROUND_F(A, B, C, D, "%[TMPI1]", 25) \ "psrlq $32, %[TMPI1]\n" \ @@ -216,21 +216,21 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_x2_sse(__m128i* state, const RF4(2, 4, 5) RF4(3, 6, 7) - asm( + __asm__( RG4(0, 0, 3, 5) RG4(1, 2, 5, 7) RG4(2, 4, 7, 1) RG4(3, 6, 1, 3) : ASM_PARAMS(32)); - asm( + __asm__( RH4(0, ROUND_H_FIRST, 2, 4, 5, 7) RH4(1, ROUND_H, 0, 2, 3, 5) RH4(2, ROUND_H, 6, 0, 1, 3) RH4(3, ROUND_H, 4, 6, 7, 1) : ASM_PARAMS(64)); - asm( + __asm__( "pcmpeqb %[TMPF2], %[TMPF2]\n" RI4(0, 0, 3, 7, 2) RI4(1, 6, 1, 5, 0) @@ -314,7 +314,7 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_x2_avx(__m128i* state, const "vpsrlq $" STR(R) ", %[" STR(A) "], %[" STR(A) "]\n" \ "vpaddd %[" STR(B) "], %[" STR(A) "], %[" STR(A) "]\n" -#define RF4(offs, r1, r2) asm( \ +#define RF4(offs, r1, r2) 
__asm__( \ READ4 \ ROUND_F(A, B, C, D, "%[TMPI1]", 25) \ "vpsrlq $32, %[TMPI1], %[TMPI1]\n" \ @@ -380,21 +380,21 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_x2_avx(__m128i* state, const RF4(2, 4, 5) RF4(3, 6, 7) - asm( + __asm__( RG4(0, 0, 3, 5) RG4(1, 2, 5, 7) RG4(2, 4, 7, 1) RG4(3, 6, 1, 3) : ASM_PARAMS(32)); - asm( + __asm__( RH4(0, ROUND_H, 2, 4, 5, 7) RH4(1, ROUND_H, 0, 2, 3, 5) RH4(2, ROUND_H, 6, 0, 1, 3) RH4(3, ROUND_H, 4, 6, 7, 1) : ASM_PARAMS(64)); - asm( + __asm__( "vpcmpeqb %[TMPF2], %[TMPF2], %[TMPF2]\n" RI4(0, 0, 3, 7, 2) RI4(1, 6, 1, 5, 0) @@ -460,21 +460,21 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_x2_avx512(__m128i* state, con RF4(2, 4, 5) RF4(3, 6, 7) - asm( + __asm__( RG4(0, 0, 3, 5) RG4(1, 2, 5, 7) RG4(2, 4, 7, 1) RG4(3, 6, 1, 3) : ASM_PARAMS(32)); - asm( + __asm__( RH4(0, ROUND_H_FIRST, 2, 4, 5, 7) RH4(1, ROUND_H, 0, 2, 3, 5) RH4(2, ROUND_H, 6, 0, 1, 3) RH4(3, ROUND_H, 4, 6, 7, 1) : ASM_PARAMS(64)); - asm( + __asm__( RI4(0, 0, 3, 7, 2) RI4(1, 6, 1, 5, 0) RI4(2, 4, 7, 3, 6) diff --git a/hasher/md5x2-x86-asm.h b/hasher/md5x2-x86-asm.h index e0f18b96..64322d2f 100644 --- a/hasher/md5x2-x86-asm.h +++ b/hasher/md5x2-x86-asm.h @@ -163,34 +163,34 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_x2_scalar(uint32_t* state, co [i1_0]"m"(_data[1][i0]), [i1_1]"m"(_data[1][i1]) ASM_PARAMS_ONES \ : -#define RF4(i0, i1, i2, i3, k0, k1, k2, k3) asm( \ +#define RF4(i0, i1, i2, i3, k0, k1, k2, k3) __asm__( \ ROUND_F(A1, B1, C1, D1, A2, B2, C2, D2, "%[i0_0]", "%[i1_0]", k0, 7) \ ROUND_F(D1, A1, B1, C1, D2, A2, B2, C2, "%[i0_1]", "%[i1_1]", k1, 12) \ -: ASM_PARAMS(i0, i1)); asm( \ +: ASM_PARAMS(i0, i1)); __asm__( \ ROUND_F(C1, D1, A1, B1, C2, D2, A2, B2, "%[i0_0]", "%[i1_0]", k2, 17) \ ROUND_F(B1, C1, D1, A1, B2, C2, D2, A2, "%[i0_1]", "%[i1_1]", k3, 22) \ : ASM_PARAMS(i2, i3)); -#define RG4(i0, i1, i2, i3, k0, k1, k2, k3) asm( \ +#define RG4(i0, i1, i2, i3, k0, k1, k2, k3) __asm__( \ ROUND_G(A1, B1, C1, D1, A2, B2, C2, D2, "%[i0_0]", 
"%[i1_0]", k0, 5) \ ROUND_G(D1, A1, B1, C1, D2, A2, B2, C2, "%[i0_1]", "%[i1_1]", k1, 9) \ -: ASM_PARAMS(i0, i1)); asm( \ +: ASM_PARAMS(i0, i1)); __asm__( \ ROUND_G(C1, D1, A1, B1, C2, D2, A2, B2, "%[i0_0]", "%[i1_0]", k2, 14) \ ROUND_G(B1, C1, D1, A1, B2, C2, D2, A2, "%[i0_1]", "%[i1_1]", k3, 20) \ : ASM_PARAMS(i2, i3)); -#define RH4(i0, i1, i2, i3, k0, k1, k2, k3) asm( \ +#define RH4(i0, i1, i2, i3, k0, k1, k2, k3) __asm__( \ ROUND_H(A1, B1, C1, D1, A2, B2, C2, D2, "%[i0_0]", "%[i1_0]", k0, 4) \ ROUND_H(D1, A1, B1, C1, D2, A2, B2, C2, "%[i0_1]", "%[i1_1]", k1, 11) \ -: ASM_PARAMS(i0, i1)); asm( \ +: ASM_PARAMS(i0, i1)); __asm__( \ ROUND_H(C1, D1, A1, B1, C2, D2, A2, B2, "%[i0_0]", "%[i1_0]", k2, 16) \ ROUND_H(B1, C1, D1, A1, B2, C2, D2, A2, "%[i0_1]", "%[i1_1]", k3, 23) \ : ASM_PARAMS(i2, i3)); -#define RI4(i0, i1, i2, i3, k0, k1, k2, k3) asm( \ +#define RI4(i0, i1, i2, i3, k0, k1, k2, k3) __asm__( \ ROUND_I(A1, B1, C1, D1, A2, B2, C2, D2, "%[i0_0]", "%[i1_0]", k0, 6) \ ROUND_I(D1, A1, B1, C1, D2, A2, B2, C2, "%[i0_1]", "%[i1_1]", k1, 10) \ -: ASM_PARAMS(i0, i1)); asm( \ +: ASM_PARAMS(i0, i1)); __asm__( \ ROUND_I(C1, D1, A1, B1, C2, D2, A2, B2, "%[i0_0]", "%[i1_0]", k2, 15) \ ROUND_I(B1, C1, D1, A1, B2, C2, D2, A2, "%[i0_1]", "%[i1_1]", k3, 21) \ : ASM_PARAMS(i2, i3)); @@ -217,10 +217,10 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_x2_scalar(uint32_t* state, co RI4( 3, 10, 1, 8, 0x655b59c3, -0x70f3336e, -0x00100b83, -0x7a7ba22f) RI4(15, 6, 13, 4, 0x6fa87e4f, -0x01d31920, -0x5cfebcec, 0x4e0811a1) - asm( + __asm__( ROUND_I(A1, B1, C1, D1, A2, B2, C2, D2, "%[i0_0]", "%[i1_0]", -0x08ac817e, 6) ROUND_I(D1, A1, B1, C1, D2, A2, B2, C2, "%[i0_1]", "%[i1_1]", -0x42c50dcb, 10) - : ASM_PARAMS(11, 2)); asm( + : ASM_PARAMS(11, 2)); __asm__( ROUND_I(C1, D1, A1, B1, C2, D2, A2, B2, "%[i0_0]", "%[i1_0]", 0x2ad7d2bb, 15) ROUND_I_LAST(B1, C1, D1, A1, B2, C2, D2, A2, -0x14792c6f, 21) : ASM_PARAMS(9, 0)); From 465d1f7c1e1e90bb352b30a148f6e70094f17c1e Mon Sep 17 00:00:00 
2001 From: animetosho Date: Mon, 17 Jul 2023 22:59:44 +1000 Subject: [PATCH 27/91] Add missing C99 flags --- binding.gyp | 1 + 1 file changed, 1 insertion(+) diff --git a/binding.gyp b/binding.gyp index 8d62640e..fd4af8ec 100644 --- a/binding.gyp +++ b/binding.gyp @@ -21,6 +21,7 @@ ] }] ], + "cflags": ["-std=gnu99", "-D_POSIX_C_SOURCE=200112L"], "cxxflags": ["-std=c++11"], "msvs_settings": {"VCCLCompilerTool": {"Optimization": "MaxSpeed"}}, "configurations": {"Release": { From 5b2c917e2831307189f4e845ec19a662288b836d Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 18 Jul 2023 18:19:00 +1000 Subject: [PATCH 28/91] Try to use MAP_ANONYMOUS flag if available --- gf16/x86_jit.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/gf16/x86_jit.h b/gf16/x86_jit.h index 0a8fbe50..2597d9d8 100644 --- a/gf16/x86_jit.h +++ b/gf16/x86_jit.h @@ -738,7 +738,13 @@ static HEDLEY_ALWAYS_INLINE jit_wx_pair* jit_alloc(size_t len) { if(!ret) return NULL; ret->len = len; - void* mem = mmap(NULL, len, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANON, -1, 0); + void* mem = mmap(NULL, len, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | +# ifdef MAP_ANONYMOUS + MAP_ANONYMOUS, +# else + MAP_ANON, +# endif + -1, 0); if(mem) { if((uintptr_t)mem & 63) { // page not cacheline aligned? something's gone wrong... 
munmap(mem, len); From 277c458ff5441a7c542a7e6bb6ceb487285d218f Mon Sep 17 00:00:00 2001 From: animetosho Date: Wed, 19 Jul 2023 15:13:17 +1000 Subject: [PATCH 29/91] Row grouping in matrix inversion + bug fixes --- gf16/gfmat_inv.cpp | 200 ++++++++++++++++++++++++++++++++------------- gf16/gfmat_inv.h | 9 +- 2 files changed, 151 insertions(+), 58 deletions(-) diff --git a/gf16/gfmat_inv.cpp b/gf16/gfmat_inv.cpp index dd876676..84f1fdc7 100644 --- a/gf16/gfmat_inv.cpp +++ b/gf16/gfmat_inv.cpp @@ -26,10 +26,12 @@ class Galois16RecMatrixWorker { Galois16RecMatrixWorker(Galois16RecMatrixWorker&& other) noexcept : gf(other.gf) { thread = std::move(other.thread); gfScratch = other.gfScratch; + other.gfScratch = nullptr; } ~Galois16RecMatrixWorker() { thread.end(); - gf.mutScratch_free(gfScratch); + if(gfScratch) + gf.mutScratch_free(gfScratch); } }; @@ -37,7 +39,8 @@ struct Galois16RecMatrixWorkerMessage { unsigned stripeStart, stripeEnd; unsigned recFirst, recLast; unsigned recSrc; uint16_t* rowCoeffs; void** srcRows; Galois16Mul* gf; void* gfScratch; - void(Galois16RecMatrix::*fn)(unsigned, unsigned, unsigned, unsigned, unsigned, uint16_t*, void**, Galois16Mul&, void*, const void*); + unsigned coeffWidth; + void(Galois16RecMatrix::*fn)(unsigned, unsigned, unsigned, unsigned, unsigned, uint16_t*, unsigned, void**, Galois16Mul&, void*, const void*); Galois16RecMatrix* parent; std::atomic* procRefs; std::promise* done; @@ -46,7 +49,7 @@ struct Galois16RecMatrixWorkerMessage { static void invert_worker(ThreadMessageQueue& q) { Galois16RecMatrixWorkerMessage* req; while((req = static_cast(q.pop())) != NULL) { - (req->parent->*(req->fn))(req->stripeStart, req->stripeEnd, req->recFirst, req->recLast, req->recSrc, req->rowCoeffs, req->srcRows, *(req->gf), req->gfScratch, nullptr); + (req->parent->*(req->fn))(req->stripeStart, req->stripeEnd, req->recFirst, req->recLast, req->recSrc, req->rowCoeffs, req->coeffWidth, req->srcRows, *(req->gf), req->gfScratch, nullptr); 
if(req->procRefs->fetch_sub(1, std::memory_order_acq_rel) <= 1) { req->done->set_value(); } @@ -59,7 +62,7 @@ static void invert_worker(ThreadMessageQueue& q) { #define ROUND_DIV(a, b) (((a) + ((b)>>1)) / (b)) template -void Galois16RecMatrix::invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, uint16_t* rowCoeffs, void** srcRows, Galois16Mul& gf, void* gfScratch, const void* nextPf) { +void Galois16RecMatrix::invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, uint16_t* rowCoeffs, unsigned coeffWidth, void** srcRows, Galois16Mul& gf, void* gfScratch, const void* nextPf) { for(unsigned stripe=stripeStart; stripe 1) { if(HEDLEY_LIKELY(pf)) - gf.mul_add_multi_stridepf(rows, stripeWidth, MAT_ROW(stripe, curRec2), MAT_ROW(stripe, recSrc), stripeWidth, rowCoeffs + curRec2*rows, gfScratch, pf); + gf.mul_add_multi_stridepf(rows, stripeWidth, MAT_ROW(stripe, curRec2), MAT_ROW(stripe, recSrc), stripeWidth, coeffPtr, gfScratch, pf); else - gf.mul_add_multi(rows, stripeWidth*stripe, MAT_ROW(0, curRec2), srcRows, stripeWidth, rowCoeffs + curRec2*rows, gfScratch); + gf.mul_add_multi(rows, stripeWidth*numRec*stripe, MAT_ROW(0, curRec2), srcRows, stripeWidth, coeffPtr, gfScratch); } else { if(HEDLEY_LIKELY(pf)) - gf.mul_add_pf(MAT_ROW(stripe, curRec2), MAT_ROW(stripe, recSrc), stripeWidth, rowCoeffs[curRec2], gfScratch, pf); + gf.mul_add_pf(MAT_ROW(stripe, curRec2), MAT_ROW(stripe, recSrc), stripeWidth, *coeffPtr, gfScratch, pf); else - gf.mul_add(MAT_ROW(stripe, curRec2), MAT_ROW(stripe, recSrc), stripeWidth, rowCoeffs[curRec2], gfScratch); + gf.mul_add(MAT_ROW(stripe, curRec2), MAT_ROW(stripe, recSrc), stripeWidth, *coeffPtr, gfScratch); } } } } +#define REPLACE_WORD(r, c, v) gf.replace_word(MAT_ROW((c)/(stripeWidth / sizeof(uint16_t)), r), (c)%(stripeWidth / sizeof(uint16_t)), v) + template -int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, 
Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs, std::vector& workers) { +int Galois16RecMatrix::initScale(unsigned rec, unsigned validCount, unsigned recFirst, unsigned recLast, Galois16Mul& gf, void* gfScratch) { + assert(recFirst <= recLast); + unsigned missingCol = validCount + rec; uint16_t baseCoeff; uint16_t coeff[rows]; - unsigned sw16 = stripeWidth / sizeof(uint16_t); - // TODO: consider optimisation for numStripes == 1 ? - - - #define REPLACE_WORD(r, c, v) gf.replace_word(MAT_ROW((c)/sw16, r), (c)%sw16, v) - void* srcRows[rows]; srcRows[0] = MAT_ROW(0, rec); for(unsigned i=1; i 2) { - MULADD_ROW_PF(rec+0, 1, srcRows[2]); + MULADD_ROW_PF(rec+0, 1, MAT_ROW(0, 2)); } else MULADD_LASTROW(rec+0, 1) } else { if(recFirst >= numRec) @@ -189,7 +191,7 @@ int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, Galois16Mul } if(rows >= 3) { if(rows >= 4) { - MULADD_MULTI_ROW_PF(rec+2, 0, 2, srcRows[3]); + MULADD_MULTI_ROW_PF(rec+2, 0, 2, MAT_ROW(0, 3)); SCALE_ROW(2); MULADD_MULTI_ROW(rec+3, 0, 2); MULADD_ROW(rec+3, 2); @@ -197,7 +199,7 @@ int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, Galois16Mul MULADD_ROW(rec+2, 3); MULADD_MULTI_ROW(rec+0, 2, 2); if(rows > 4) { - MULADD_MULTI_ROW_PF(rec+1, 2, 2, srcRows[4]); + MULADD_MULTI_ROW_PF(rec+1, 2, 2, MAT_ROW(0, 4)); } else MULADD_MULTI_LASTROW(rec+1, 2, 2) } else { MULADD_MULTI_ROW(rec+2, 0, 2); @@ -208,7 +210,7 @@ int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, Galois16Mul } if(rows >= 5) { if(rows >= 6) { - MULADD_MULTI_ROW_PF(rec+4, 0, 4, srcRows[5]); + MULADD_MULTI_ROW_PF(rec+4, 0, 4, MAT_ROW(0, 5)); SCALE_ROW(4); MULADD_MULTI_ROW(rec+5, 0, 4); MULADD_ROW(rec+5, 4); @@ -227,20 +229,50 @@ int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, Galois16Mul MULADD_LASTROW(rec+3, 4) } } - - // do main elimination, using the source group - // first, gather all relevant coefficients - for(unsigned r=0; r +void Galois16RecMatrix::processRow(unsigned 
rec, unsigned recFirst, unsigned recLast, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs, unsigned coeffWidth, std::vector& workers) { + // TODO: consider optimisation for numStripes == 1 ? + + assert(recFirst <= recLast); + + void* srcRows[rows]; + srcRows[0] = MAT_ROW(0, rec); + for(unsigned i=1; i= recLast) return; + + // do main elimination, using the source group if(workers.empty()) // process elimination directly - invertLoop(0, numStripes, recFirst, numRec, rec, rowCoeffs, srcRows, gf, gfScratch, nextScaleRow); + invertLoop(0, numStripes, recFirst, recLast, rec, rowCoeffs, coeffWidth, srcRows, gf, gfScratch, nextScaleRow); else { // process using workers std::atomic procRefs; @@ -248,11 +280,12 @@ int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, Galois16Mul auto makeReq = [&, this]() -> Galois16RecMatrixWorkerMessage* { auto* req = new Galois16RecMatrixWorkerMessage; req->recFirst = recFirst; - req->recLast = numRec; + req->recLast = recLast; req->recSrc = rec; req->rowCoeffs = rowCoeffs; req->srcRows = srcRows; req->gf = &gf; + req->coeffWidth = coeffWidth; req->fn = &Galois16RecMatrix::invertLoop; req->parent = this; req->procRefs = &procRefs; @@ -278,23 +311,26 @@ int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, Galois16Mul float workerCnt = 0.5; for(unsigned stripe=0; stripe= recFirst && rec < recLast) numRows -= rows; + numRows = CEIL_DIV(numRows, workerNum); if(numRows < MIN_THREAD_REC) numRows = MIN_THREAD_REC; // ensure workers have a half decent amount of stuff to do unsigned rowPos = recFirst; - while(rowPos < numRec) { + while(rowPos < recLast) { unsigned sendRows = numRows; if(rowPos+sendRows > rec && rowPos <= rec) // need to send extra to compensate for the gap sendRows += rows; - if(rowPos+sendRows > numRec) - sendRows = numRec - rowPos; + if(rowPos+sendRows > recLast) + sendRows = recLast - rowPos; auto* req = makeReq(); req->stripeStart = stripe; req->stripeEnd = stripe+1; req->recFirst = 
rowPos; req->recLast = rowPos+sendRows; + req->rowCoeffs += (rowPos-recFirst) * coeffWidth; reqs.push_back(req); rowPos += sendRows; @@ -305,6 +341,7 @@ int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, Galois16Mul } assert(reqs.size() <= workers.size()); procRefs.store(reqs.size()); + assert(procRefs > 0); for(unsigned i=0; i +int Galois16RecMatrix::processRows(unsigned& rec, unsigned rowGroupSize, unsigned validCount, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs, std::vector& workers, std::function progressCb, uint16_t progressOffset, uint16_t totalProgress) { + unsigned alignedRowGroupSize = (rowGroupSize / rows) * rows; + while(rec <= numRec-rows) { + + unsigned curRowGroupSize = alignedRowGroupSize; + if(numRec-rec < curRowGroupSize) { + curRowGroupSize = numRec-rec; + curRowGroupSize -= curRowGroupSize % rows; + } + assert(curRowGroupSize > 0); + + unsigned recStart = rec; + // for progress indicator, we'll even it out by computing a ratio to advance by + unsigned progressRatio = (curRowGroupSize<<16)/numRec; + unsigned progressBase = recStart + progressOffset; + + // loop through this row group (normalize values) + for(; rec < curRowGroupSize+recStart; rec+=rows) { + if(progressCb) progressCb(progressBase + (((rec-recStart)*progressRatio+32768)>>16), totalProgress); + int badRowOffset = initScale(rec, validCount, recStart, curRowGroupSize+recStart, gf, gfScratch); + if(badRowOffset >= 0) return rec+badRowOffset; + fillCoeffs(rowCoeffs, rows, validCount, recStart, curRowGroupSize+recStart, rec, rows, gf); + processRow(rec, recStart, curRowGroupSize+recStart, gf, gfScratch, rowCoeffs, rows, workers); + } + + + // apply current row group to all other row groups + for(unsigned recGroup=0; recGroup>16), totalProgress); + } + + unsigned curRowGroupSize2 = rowGroupSize; + if(numRec-recGroup < curRowGroupSize2) + curRowGroupSize2 = numRec-recGroup; + if(recGroup < recStart && recGroup+curRowGroupSize2 > recStart) + curRowGroupSize2 = 
recStart-recGroup; // don't let this group cross into the normalized group + fillCoeffs(rowCoeffs, curRowGroupSize, validCount, recGroup, recGroup+curRowGroupSize2, recStart, curRowGroupSize, gf); + for(unsigned rec2=recStart; rec2 < curRowGroupSize+recStart; rec2+=rows) { + processRow(rec2, recGroup, recGroup+curRowGroupSize2, gf, gfScratch, rowCoeffs + (rec2-recStart), curRowGroupSize, workers); + } + recGroup += curRowGroupSize2; + } + } + return -1; +} + + + // construct initial matrix (pre-inversion) void Galois16RecMatrix::Construct(const std::vector& inputValid, unsigned validCount, const std::vector& recovery) { unsigned validCol = 0; @@ -472,15 +555,23 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va unsigned _numThreads = numThreads; if(numRec < MIN_THREAD_REC) _numThreads = 1; // don't spawn threads if not enough work if(_numThreads > 1) { + workers.reserve(_numThreads); for(unsigned i=0; i<_numThreads; i++) { - workers.push_back(Galois16RecMatrixWorker(gf)); + workers.emplace_back(gf); workers[i].thread.name = "gauss_worker"; workers[i].thread.setCallback(invert_worker); } - gfScratch = nullptr; // ...otherwise MSVC won't be happy + gfScratch = workers[0].gfScratch; } else gfScratch = gf.mutScratch_alloc(); + // target L3 slice? use 1MB target for now; TODO: improve this + unsigned rowGroupSize = (1024*1024 / stripeWidth); + // if it's going to be split amongst cores, increase the number of rows in a group + if(numStripes < _numThreads) rowGroupSize *= _numThreads/numStripes; + if(rowGroupSize < gfInfo.idealInputMultiple*2) rowGroupSize = gfInfo.idealInputMultiple*2; + if(rowGroupSize > numRec) rowGroupSize = numRec; + invert_loop: { // loop, in the unlikely case we hit the PAR2 un-invertability flaw; TODO: is there a faster way than just retrying? 
if(numRec > recovery.size()) { // not enough recovery if(_numThreads <= 1) @@ -506,19 +597,15 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va unsigned rec = 0; #define INVERT_GROUP(rows) \ if(gfInfo.idealInputMultiple >= rows && numRec >= rows) { \ - for(; rec <= numRec-rows; rec+=rows) { \ - if(progressCb) progressCb(rec + progressOffset, totalProgress); \ - \ - int badRowOffset = processRow(rec, validCount, gf, gfScratch, rowCoeffs, workers); \ - if(badRowOffset >= 0) { \ - /* ignore this recovery row and try again */ \ - recovery.erase(recovery.begin() + rec + badRowOffset); \ - goto invert_loop; \ - } \ + int badRow = processRows(rec, rowGroupSize, validCount, gf, gfScratch, rowCoeffs, workers, progressCb, progressOffset, totalProgress); \ + if(badRow >= 0) { \ + /* ignore this recovery row and try again */ \ + recovery.erase(recovery.begin() + badRow); \ + goto invert_loop; \ } \ } // max out at 6 groups (registers + cache assoc?) - uint16_t* rowCoeffs = new uint16_t[numRec*6]; + uint16_t* rowCoeffs = new uint16_t[rowGroupSize*rowGroupSize]; INVERT_GROUP(6) INVERT_GROUP(5) INVERT_GROUP(4) @@ -547,6 +634,7 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va Galois16RecMatrix::Galois16RecMatrix() : mat(nullptr) { numThreads = hardware_concurrency(); + if(numThreads > 4) numThreads = 4; // by default, cap at 4 threads, as scaling doesn't work so well; TODO: tweak this later numRec = 0; numStripes = 0; stripeWidth = 0; diff --git a/gf16/gfmat_inv.h b/gf16/gfmat_inv.h index 79e84795..477b08a0 100644 --- a/gf16/gfmat_inv.h +++ b/gf16/gfmat_inv.h @@ -17,9 +17,14 @@ class Galois16RecMatrix { void Construct(const std::vector& inputValid, unsigned validCount, const std::vector& recovery); template - void invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, uint16_t* rowCoeffs, void** srcRows, Galois16Mul& gf, void* gfScratch, const void* nextPf); + void 
invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, uint16_t* rowCoeffs, unsigned coeffWidth, void** srcRows, Galois16Mul& gf, void* gfScratch, const void* nextPf); template - int processRow(unsigned rec, unsigned validCount, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs, std::vector& workers); + int initScale(unsigned rec, unsigned validCount, unsigned recFirst, unsigned recLast, Galois16Mul& gf, void* gfScratch); + void fillCoeffs(uint16_t* rowCoeffs, unsigned rows, unsigned validCount, unsigned recFirst, unsigned recLast, unsigned rec, unsigned coeffWidth, Galois16Mul& gf); + template + void processRow(unsigned rec, unsigned recFirst, unsigned recLast, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs, unsigned coeffWidth, std::vector& workers); + template + int processRows(unsigned& rec, unsigned rowGroupSize, unsigned validCount, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs, std::vector& workers, std::function progressCb, uint16_t progressOffset, uint16_t totalProgress); public: Galois16RecMatrix(); ~Galois16RecMatrix(); From 070f14a725280144ae02dd750ac1277073369f98 Mon Sep 17 00:00:00 2001 From: animetosho Date: Wed, 19 Jul 2023 15:13:54 +1000 Subject: [PATCH 30/91] Update CFLAGS --- binding.gyp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/binding.gyp b/binding.gyp index fd4af8ec..70a36159 100644 --- a/binding.gyp +++ b/binding.gyp @@ -21,7 +21,7 @@ ] }] ], - "cflags": ["-std=gnu99", "-D_POSIX_C_SOURCE=200112L"], + "cflags": ["-std=c99", "-D_POSIX_C_SOURCE=200112L", "-D_DARWIN_C_SOURCE", "-D_GNU_SOURCE"], "cxxflags": ["-std=c++11"], "msvs_settings": {"VCCLCompilerTool": {"Optimization": "MaxSpeed"}}, "configurations": {"Release": { From 41a7a4c0b9b23deae9ca78e887382abe837aff2b Mon Sep 17 00:00:00 2001 From: animetosho Date: Wed, 19 Jul 2023 15:14:46 +1000 Subject: [PATCH 31/91] Clarification on --auto-slice-size --- help.txt | 4 +++- 1 file changed, 3 
insertions(+), 1 deletion(-) diff --git a/help.txt b/help.txt index da6a3a05..66674bbd 100644 --- a/help.txt +++ b/help.txt @@ -33,7 +33,9 @@ PAR2 Options: equals that size, otherwise is 4B -S, --auto-slice-size Automatically scale up input slice size if the number of input slices would exceed the maximum - allowed. This option takes no parameters. + allowed. The chosen slice size will respect + `--slice-size-multiple` when scaling up the slice. + This option takes no parameters. Alias for `--max-input-slices=32768` -r, --recovery-slices Number of recovery slices to generate. You can append a suffix to auto-calculate this, as in the From 2274f71f4abc5e42ac030ca1c658633d61afd504 Mon Sep 17 00:00:00 2001 From: animetosho Date: Wed, 19 Jul 2023 15:24:50 +1000 Subject: [PATCH 32/91] Fix max length of Windows thread name --- gf16/threadqueue.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gf16/threadqueue.h b/gf16/threadqueue.h index ecb535b4..ee11a8c2 100644 --- a/gf16/threadqueue.h +++ b/gf16/threadqueue.h @@ -282,7 +282,7 @@ class MessageThread { if(fnSetTD) { wchar_t nameUCS2[17]; //assert(strlen(self->name) <= 16); // always hard-coded string, plus Linux limits it to 16 chars, so shouldn't ever overflow - MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, self->name, -1, nameUCS2, 50); + MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, self->name, -1, nameUCS2, sizeof(nameUCS2)/sizeof(wchar_t) -1); fnSetTD(GetCurrentThread(), nameUCS2); } } From f2cf95d3fe5bc4d18ff309271b5b85d182febc56 Mon Sep 17 00:00:00 2001 From: animetosho Date: Wed, 19 Jul 2023 15:50:43 +1000 Subject: [PATCH 33/91] Fix checksum-prepare function prototype --- gf16/gf16_checksum_arm.h | 2 +- gf16/gf16_checksum_x86.h | 2 +- gf16/gf16_lookup.c | 4 ++-- gf16/gf16_sve_common.h | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/gf16/gf16_checksum_arm.h b/gf16/gf16_checksum_arm.h index b53cbb9a..2b945a31 100644 --- a/gf16/gf16_checksum_arm.h +++ b/gf16/gf16_checksum_arm.h @@ 
-76,7 +76,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_checksum_exp_neon(void *HEDLEY_RESTRICT ch gf16_checksum_store(checksum, res); } -static HEDLEY_ALWAYS_INLINE void gf16_checksum_prepare_neon(void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT checksum, const size_t blockLen, gf16_transform_block prepareBlock) { +static HEDLEY_ALWAYS_INLINE void gf16_checksum_prepare_neon(void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT checksum, const size_t blockLen, gf16_transform_block_rst prepareBlock) { #define _X(bl) \ ALIGN_TO(16, uint8_t tmp[bl]) = {0}; \ vst1q_u8(tmp, gf16_checksum_load(checksum)); \ diff --git a/gf16/gf16_checksum_x86.h b/gf16/gf16_checksum_x86.h index e7404888..37144352 100644 --- a/gf16/gf16_checksum_x86.h +++ b/gf16/gf16_checksum_x86.h @@ -149,7 +149,7 @@ static HEDLEY_ALWAYS_INLINE void _FN(gf16_checksum_exp)(void *HEDLEY_RESTRICT ch *(_mword*)checksum = res; } -static HEDLEY_ALWAYS_INLINE void _FN(gf16_checksum_prepare)(void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT checksum, const size_t blockLen, gf16_transform_block prepareBlock) { +static HEDLEY_ALWAYS_INLINE void _FN(gf16_checksum_prepare)(void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT checksum, const size_t blockLen, gf16_transform_block_rst prepareBlock) { // because some compilers don't like `tmp[blockLen]` despite blockLen being constant, just implement every possibility #define _X(bl) \ ALIGN_TO(MWORD_SIZE, uint8_t tmp[bl]) = {0}; \ diff --git a/gf16/gf16_lookup.c b/gf16/gf16_lookup.c index c8ceef7a..dc293de8 100644 --- a/gf16/gf16_lookup.c +++ b/gf16/gf16_lookup.c @@ -579,7 +579,7 @@ HEDLEY_CONST size_t gf16_lookup3_stride() { -static HEDLEY_ALWAYS_INLINE void gf16_lookup_checksum_prepare(void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT checksum, const size_t blockLen, gf16_transform_block prepareBlock) { +static HEDLEY_ALWAYS_INLINE void gf16_lookup_checksum_prepare(void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT checksum, const size_t blockLen, gf16_transform_block_rst 
prepareBlock) { UNUSED(prepareBlock); memset(dst, 0, blockLen); if(sizeof(uintptr_t) >= 8) @@ -628,7 +628,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_lookup3_prepare_blocku(void *HEDLEY_RESTRI memcpy(&data, src, remaining); gf16_lookup3_prepare_block(dst, &data); } -static HEDLEY_ALWAYS_INLINE void gf16_lookup3_checksum_prepare(void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT checksum, const size_t blockLen, gf16_transform_block prepareBlock) { +static HEDLEY_ALWAYS_INLINE void gf16_lookup3_checksum_prepare(void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT checksum, const size_t blockLen, gf16_transform_block_rst prepareBlock) { UNUSED(prepareBlock); gf16_lookup3_prepare_block(dst, checksum); memset((char*)dst+gf16_lookup3_stride(), 0, blockLen-gf16_lookup3_stride()); diff --git a/gf16/gf16_sve_common.h b/gf16/gf16_sve_common.h index 0dabf676..2c9b7c6b 100644 --- a/gf16/gf16_sve_common.h +++ b/gf16/gf16_sve_common.h @@ -45,7 +45,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_finish_half_blocku_sve(void *HEDLEY_RESTRI svst1_u8(svwhilelt_b8((uint64_t)0, (uint64_t)remaining), dst, svld1_u8(svptrue_b8(), src)); } -static HEDLEY_ALWAYS_INLINE void gf16_checksum_prepare_sve(void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT checksum, const size_t blockLen, gf16_transform_block prepareBlock) { +static HEDLEY_ALWAYS_INLINE void gf16_checksum_prepare_sve(void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT checksum, const size_t blockLen, gf16_transform_block_rst prepareBlock) { ALIGN_TO(16, int16_t tmp[blockLen/2]); memset(tmp, 0, blockLen); svst1_s16(svptrue_b16(), tmp, *(svint16_t*)checksum); From f910bfa48de73583e755daeec957199230503050 Mon Sep 17 00:00:00 2001 From: animetosho Date: Wed, 19 Jul 2023 15:55:47 +1000 Subject: [PATCH 34/91] Fix type of JIT offset, as it's typically >255 --- gf16/gf16_xor_avx2.c | 6 +++--- gf16/gf16_xor_avx512.c | 2 +- gf16/gf16_xor_common.h | 4 ++-- gf16/gf16_xor_sse2.c | 6 +++--- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git 
a/gf16/gf16_xor_avx2.c b/gf16/gf16_xor_avx2.c index d9ba9578..219b05ae 100644 --- a/gf16/gf16_xor_avx2.c +++ b/gf16/gf16_xor_avx2.c @@ -675,7 +675,7 @@ GF_FINISH_PACKED_FUNCS_STUB(gf16_xor, _avx2) #if defined(__AVX2__) && defined(PLATFORM_AMD64) -static void xor_write_init_jit(uint8_t *jitCodeNorm, uint8_t *jitCodeInsitu, uint_fast8_t* sizeNorm, uint_fast8_t* sizeInsitu) { +static void xor_write_init_jit(uint8_t *jitCodeNorm, uint8_t *jitCodeInsitu, uint_fast16_t* sizeNorm, uint_fast16_t* sizeInsitu) { uint8_t *jitCodeStart = jitCodeNorm; jitCodeNorm += _jit_add_i(jitCodeNorm, AX, 512); jitCodeNorm += _jit_add_i(jitCodeNorm, DX, 512); @@ -684,7 +684,7 @@ static void xor_write_init_jit(uint8_t *jitCodeNorm, uint8_t *jitCodeInsitu, uin for(int i=3; i<16; i++) { jitCodeNorm += _jit_vmovdqa_load(jitCodeNorm, i, AX, lshift32(i-4, 5)); } - if(sizeNorm) *sizeNorm = jitCodeNorm-jitCodeStart; + if(sizeNorm) *sizeNorm = (uint_fast16_t)(jitCodeNorm-jitCodeStart); jitCodeStart = jitCodeInsitu; @@ -696,7 +696,7 @@ static void xor_write_init_jit(uint8_t *jitCodeNorm, uint8_t *jitCodeInsitu, uin for(int i=0; i<3; i++) { jitCodeInsitu += _jit_vmovdqa_store(jitCodeInsitu, AX, lshift32(i-4, 5), i); } - if(sizeInsitu) *sizeInsitu = jitCodeInsitu-jitCodeStart; + if(sizeInsitu) *sizeInsitu = (uint_fast16_t)(jitCodeInsitu-jitCodeStart); } # include "gf16_bitdep_init_avx2.h" diff --git a/gf16/gf16_xor_avx512.c b/gf16/gf16_xor_avx512.c index cce0625b..694f5ef5 100644 --- a/gf16/gf16_xor_avx512.c +++ b/gf16/gf16_xor_avx512.c @@ -1206,7 +1206,7 @@ void* gf16_xor_jit_init_avx512(int polynomial, int jitOptStrat) { gf16_bitdep_init256(ret->deps, polynomial, 0); ret->jitOptStrat = jitOptStrat; - ret->codeStart = (uint_fast8_t)xor_write_init_jit(tmpCode); + ret->codeStart = (uint_fast16_t)xor_write_init_jit(tmpCode); return ret; #else UNUSED(polynomial); UNUSED(jitOptStrat); diff --git a/gf16/gf16_xor_common.h b/gf16/gf16_xor_common.h index 8c987ad3..21b7ea80 100644 --- a/gf16/gf16_xor_common.h 
+++ b/gf16/gf16_xor_common.h @@ -115,8 +115,8 @@ static HEDLEY_ALWAYS_INLINE void gf16_xor_jit_stub(intptr_t src, intptr_t dEnd, struct gf16_xor_scratch { uint8_t deps[16*16*2*4]; int jitOptStrat; // GF16_XOR_JIT_STRAT_* - uint_fast8_t codeStart; - uint_fast8_t codeStartInsitu; + uint_fast16_t codeStart; + uint_fast16_t codeStartInsitu; }; diff --git a/gf16/gf16_xor_sse2.c b/gf16/gf16_xor_sse2.c index bce368ec..fe68fbbf 100644 --- a/gf16/gf16_xor_sse2.c +++ b/gf16/gf16_xor_sse2.c @@ -1197,7 +1197,7 @@ GF_FINISH_PACKED_FUNCS_STUB(gf16_xor, _sse2) #include "gf16_bitdep_init_sse2.h" #ifdef PLATFORM_X86 -static void xor_write_init_jit(uint8_t *jitCodeNorm, uint8_t *jitCodeInsitu, uint_fast8_t* sizeNorm, uint_fast8_t* sizeInsitu) { +static void xor_write_init_jit(uint8_t *jitCodeNorm, uint8_t *jitCodeInsitu, uint_fast16_t* sizeNorm, uint_fast16_t* sizeInsitu) { uint8_t *jitCodeStart = jitCodeNorm; jitCodeNorm += _jit_add_i(jitCodeNorm, AX, 256); jitCodeNorm += _jit_add_i(jitCodeNorm, DX, 256); @@ -1214,7 +1214,7 @@ static void xor_write_init_jit(uint8_t *jitCodeNorm, uint8_t *jitCodeInsitu, uin } # endif - if(sizeNorm) *sizeNorm = jitCodeNorm-jitCodeStart; + if(sizeNorm) *sizeNorm = (uint_fast16_t)(jitCodeNorm-jitCodeStart); // in-situ version jitCodeStart = jitCodeInsitu; @@ -1237,7 +1237,7 @@ static void xor_write_init_jit(uint8_t *jitCodeNorm, uint8_t *jitCodeInsitu, uin } # endif - if(sizeInsitu) *sizeInsitu = jitCodeInsitu-jitCodeStart; + if(sizeInsitu) *sizeInsitu = (uint_fast16_t)(jitCodeInsitu-jitCodeStart); } #endif From 731897f32d8a9e3e5ec995a9dd22658ca9706f0e Mon Sep 17 00:00:00 2001 From: animetosho Date: Wed, 19 Jul 2023 22:37:56 +1000 Subject: [PATCH 35/91] Enable ClMul to be used for inversion, using Shuffle for single region multiplies --- gf16/gf16mul.cpp | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/gf16/gf16mul.cpp b/gf16/gf16mul.cpp index 5d6a33c4..ac7765f3 100644 --- a/gf16/gf16mul.cpp +++ 
b/gf16/gf16mul.cpp @@ -702,8 +702,16 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { int available = gf16_clmul_init_arm(GF16_POLYNOMIAL); METHOD_REQUIRES(gf16_available_neon && available) - _mul = &gf16_clmul_mul_neon; - _mul_add = &gf16_clmul_muladd_neon; + + // use Shuffle for single region multiplies, because it's faster + scratch = gf16_shuffle_init_arm(GF16_POLYNOMIAL); + if(scratch) { + _mul = &gf16_shuffle_mul_neon; + _mul_add = &gf16_shuffle_muladd_neon; + } else { + _mul = &gf16_clmul_mul_neon; + _mul_add = &gf16_clmul_muladd_neon; + } _mul_add_multi = &gf16_clmul_muladd_multi_neon; _mul_add_multi_stridepf = &gf16_clmul_muladd_multi_stridepf_neon; _mul_add_multi_packed = &gf16_clmul_muladd_multi_packed_neon; @@ -818,8 +826,9 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { case GF16_CLMUL_SVE2: METHOD_REQUIRES(gf16_available_sve2) - _mul = &gf16_clmul_mul_sve2; - _mul_add = &gf16_clmul_muladd_sve2; + // single region multiplies (_mul/add) use Shuffle-128 instead + _mul = &gf16_shuffle_mul_128_sve2; + _mul_add = &gf16_shuffle_muladd_128_sve2; _mul_add_multi = &gf16_clmul_muladd_multi_sve2; _mul_add_multi_stridepf = &gf16_clmul_muladd_multi_stridepf_sve2; _mul_add_multi_packed = &gf16_clmul_muladd_multi_packed_sve2; @@ -1310,7 +1319,7 @@ Galois16Methods Galois16Mul::default_method(size_t regionSizeHint, unsigned inpu if(caps.hasSVE2) { if(gf16_sve_get_size() >= 64) return GF16_SHUFFLE_512_SVE2; - return inputs > 3 && !forInvert ? GF16_CLMUL_SVE2 : GF16_SHUFFLE_128_SVE2; + return inputs > 3 ? GF16_CLMUL_SVE2 : GF16_SHUFFLE_128_SVE2; } if(caps.hasSVE && gf16_sve_get_size() > 16) return GF16_SHUFFLE_128_SVE; @@ -1321,7 +1330,7 @@ Galois16Methods Galois16Mul::default_method(size_t regionSizeHint, unsigned inpu # else inputs > 1 # endif - && !forInvert ? GF16_CLMUL_NEON : GF16_SHUFFLE_NEON; + ? 
GF16_CLMUL_NEON : GF16_SHUFFLE_NEON; #endif From 8abd213d5c1278cd0de5bd260644a7e2593077d0 Mon Sep 17 00:00:00 2001 From: animetosho Date: Thu, 20 Jul 2023 16:11:55 +1000 Subject: [PATCH 36/91] Nest row group apply loop inside striping loop during inversion Hopefully better cache usage + less waiting on threads --- gf16/gfmat_inv.cpp | 75 ++++++++++++++++++++++++---------------------- gf16/gfmat_inv.h | 4 +-- 2 files changed, 42 insertions(+), 37 deletions(-) diff --git a/gf16/gfmat_inv.cpp b/gf16/gfmat_inv.cpp index 84f1fdc7..fbf03900 100644 --- a/gf16/gfmat_inv.cpp +++ b/gf16/gfmat_inv.cpp @@ -38,9 +38,9 @@ class Galois16RecMatrixWorker { struct Galois16RecMatrixWorkerMessage { unsigned stripeStart, stripeEnd; unsigned recFirst, recLast; - unsigned recSrc; uint16_t* rowCoeffs; void** srcRows; Galois16Mul* gf; void* gfScratch; + unsigned recSrc; unsigned recSrcCount; uint16_t* rowCoeffs; void** srcRows; Galois16Mul* gf; void* gfScratch; unsigned coeffWidth; - void(Galois16RecMatrix::*fn)(unsigned, unsigned, unsigned, unsigned, unsigned, uint16_t*, unsigned, void**, Galois16Mul&, void*, const void*); + void(Galois16RecMatrix::*fn)(unsigned, unsigned, unsigned, unsigned, unsigned, unsigned, uint16_t*, unsigned, void**, Galois16Mul&, void*, const void*); Galois16RecMatrix* parent; std::atomic* procRefs; std::promise* done; @@ -49,7 +49,7 @@ struct Galois16RecMatrixWorkerMessage { static void invert_worker(ThreadMessageQueue& q) { Galois16RecMatrixWorkerMessage* req; while((req = static_cast(q.pop())) != NULL) { - (req->parent->*(req->fn))(req->stripeStart, req->stripeEnd, req->recFirst, req->recLast, req->recSrc, req->rowCoeffs, req->coeffWidth, req->srcRows, *(req->gf), req->gfScratch, nullptr); + (req->parent->*(req->fn))(req->stripeStart, req->stripeEnd, req->recFirst, req->recLast, req->recSrc, req->recSrcCount, req->rowCoeffs, req->coeffWidth, req->srcRows, *(req->gf), req->gfScratch, nullptr); if(req->procRefs->fetch_sub(1, std::memory_order_acq_rel) <= 1) { 
req->done->set_value(); } @@ -62,33 +62,39 @@ static void invert_worker(ThreadMessageQueue& q) { #define ROUND_DIV(a, b) (((a) + ((b)>>1)) / (b)) template -void Galois16RecMatrix::invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, uint16_t* rowCoeffs, unsigned coeffWidth, void** srcRows, Galois16Mul& gf, void* gfScratch, const void* nextPf) { +void Galois16RecMatrix::invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, unsigned recSrcCount, uint16_t* rowCoeffs, unsigned coeffWidth, void** srcRows, Galois16Mul& gf, void* gfScratch, const void* nextPf) { + assert(recSrcCount % rows == 0); for(unsigned stripe=stripeStart; stripe 1) { - if(HEDLEY_LIKELY(pf)) - gf.mul_add_multi_stridepf(rows, stripeWidth, MAT_ROW(stripe, curRec2), MAT_ROW(stripe, recSrc), stripeWidth, coeffPtr, gfScratch, pf); - else - gf.mul_add_multi(rows, stripeWidth*numRec*stripe, MAT_ROW(0, curRec2), srcRows, stripeWidth, coeffPtr, gfScratch); - } else { - if(HEDLEY_LIKELY(pf)) - gf.mul_add_pf(MAT_ROW(stripe, curRec2), MAT_ROW(stripe, recSrc), stripeWidth, *coeffPtr, gfScratch, pf); - else - gf.mul_add(MAT_ROW(stripe, curRec2), MAT_ROW(stripe, recSrc), stripeWidth, *coeffPtr, gfScratch); + for(unsigned recI = 0; recI < recSrcCount; recI += rows) { + unsigned rec = recI+recSrc; + for(unsigned rec2=recFirst; rec2 1) { + if(HEDLEY_LIKELY(pf)) + gf.mul_add_multi_stridepf(rows, stripeWidth, MAT_ROW(stripe, curRec2), MAT_ROW(stripe, rec), stripeWidth, coeffPtr, gfScratch, pf); + else + gf.mul_add_multi(rows, stripeWidth*(numRec*stripe + recI), MAT_ROW(0, curRec2) - recI*stripeWidth/sizeof(uint16_t), srcRows, stripeWidth, coeffPtr, gfScratch); + } else { + if(HEDLEY_LIKELY(pf)) + gf.mul_add_pf(MAT_ROW(stripe, curRec2), MAT_ROW(stripe, rec), stripeWidth, *coeffPtr, gfScratch, pf); + else + gf.mul_add(MAT_ROW(stripe, curRec2), MAT_ROW(stripe, rec), stripeWidth, *coeffPtr, gfScratch); + } } } } @@ 
-253,7 +259,7 @@ void Galois16RecMatrix::fillCoeffs(uint16_t* rowCoeffs, unsigned rows, unsigned } template -void Galois16RecMatrix::processRow(unsigned rec, unsigned recFirst, unsigned recLast, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs, unsigned coeffWidth, std::vector& workers) { +void Galois16RecMatrix::processRow(unsigned rec, unsigned recCount, unsigned recFirst, unsigned recLast, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs, unsigned coeffWidth, std::vector& workers) { // TODO: consider optimisation for numStripes == 1 ? assert(recFirst <= recLast); @@ -272,7 +278,7 @@ void Galois16RecMatrix::processRow(unsigned rec, unsigned recFirst, unsigned rec // do main elimination, using the source group if(workers.empty()) // process elimination directly - invertLoop(0, numStripes, recFirst, recLast, rec, rowCoeffs, coeffWidth, srcRows, gf, gfScratch, nextScaleRow); + invertLoop(0, numStripes, recFirst, recLast, rec, recCount, rowCoeffs, coeffWidth, srcRows, gf, gfScratch, nextScaleRow); else { // process using workers std::atomic procRefs; @@ -282,6 +288,7 @@ void Galois16RecMatrix::processRow(unsigned rec, unsigned recFirst, unsigned rec req->recFirst = recFirst; req->recLast = recLast; req->recSrc = rec; + req->recSrcCount = recCount; req->rowCoeffs = rowCoeffs; req->srcRows = srcRows; req->gf = &gf; @@ -382,7 +389,7 @@ int Galois16RecMatrix::processRows(unsigned& rec, unsigned rowGroupSize, unsigne int badRowOffset = initScale(rec, validCount, recStart, curRowGroupSize+recStart, gf, gfScratch); if(badRowOffset >= 0) return rec+badRowOffset; fillCoeffs(rowCoeffs, rows, validCount, recStart, curRowGroupSize+recStart, rec, rows, gf); - processRow(rec, recStart, curRowGroupSize+recStart, gf, gfScratch, rowCoeffs, rows, workers); + processRow(rec, rows, recStart, curRowGroupSize+recStart, gf, gfScratch, rowCoeffs, rows, workers); } @@ -404,9 +411,7 @@ int Galois16RecMatrix::processRows(unsigned& rec, unsigned rowGroupSize, unsigne if(recGroup < 
recStart && recGroup+curRowGroupSize2 > recStart) curRowGroupSize2 = recStart-recGroup; // don't let this group cross into the normalized group fillCoeffs(rowCoeffs, curRowGroupSize, validCount, recGroup, recGroup+curRowGroupSize2, recStart, curRowGroupSize, gf); - for(unsigned rec2=recStart; rec2 < curRowGroupSize+recStart; rec2+=rows) { - processRow(rec2, recGroup, recGroup+curRowGroupSize2, gf, gfScratch, rowCoeffs + (rec2-recStart), curRowGroupSize, workers); - } + processRow(recStart, curRowGroupSize, recGroup, recGroup+curRowGroupSize2, gf, gfScratch, rowCoeffs, curRowGroupSize, workers); recGroup += curRowGroupSize2; } } diff --git a/gf16/gfmat_inv.h b/gf16/gfmat_inv.h index 477b08a0..7490dcaf 100644 --- a/gf16/gfmat_inv.h +++ b/gf16/gfmat_inv.h @@ -17,12 +17,12 @@ class Galois16RecMatrix { void Construct(const std::vector& inputValid, unsigned validCount, const std::vector& recovery); template - void invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, uint16_t* rowCoeffs, unsigned coeffWidth, void** srcRows, Galois16Mul& gf, void* gfScratch, const void* nextPf); + void invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, unsigned recSrcCount, uint16_t* rowCoeffs, unsigned coeffWidth, void** srcRows, Galois16Mul& gf, void* gfScratch, const void* nextPf); template int initScale(unsigned rec, unsigned validCount, unsigned recFirst, unsigned recLast, Galois16Mul& gf, void* gfScratch); void fillCoeffs(uint16_t* rowCoeffs, unsigned rows, unsigned validCount, unsigned recFirst, unsigned recLast, unsigned rec, unsigned coeffWidth, Galois16Mul& gf); template - void processRow(unsigned rec, unsigned recFirst, unsigned recLast, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs, unsigned coeffWidth, std::vector& workers); + void processRow(unsigned rec, unsigned recCount, unsigned recFirst, unsigned recLast, Galois16Mul& gf, void* gfScratch, uint16_t* 
rowCoeffs, unsigned coeffWidth, std::vector& workers); template int processRows(unsigned& rec, unsigned rowGroupSize, unsigned validCount, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs, std::vector& workers, std::function progressCb, uint16_t progressOffset, uint16_t totalProgress); public: From 415729c5e3dbd6340133e423fea17a618f09c791 Mon Sep 17 00:00:00 2001 From: animetosho Date: Thu, 20 Jul 2023 16:51:36 +1000 Subject: [PATCH 37/91] Enable VPCLMULQDQ in MSVC --- src/platform.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/platform.h b/src/platform.h index 40c36fb3..11d7f6d6 100644 --- a/src/platform.h +++ b/src/platform.h @@ -81,6 +81,9 @@ #if defined(__AVX512F__) && _MSC_VER >= 1914 #define __AVX512VBMI__ 1 #endif +#if defined(__AVX2__) && _MSC_VER >= 1915 + #define __VPCLMULQDQ__ 1 +#endif #if defined(__SSE2__) && _MSC_VER >= 1920 #define __GFNI__ 1 #endif From 8b5caa38120d40846f09d79f6bd076b3b9671a82 Mon Sep 17 00:00:00 2001 From: animetosho Date: Thu, 20 Jul 2023 21:30:45 +1000 Subject: [PATCH 38/91] Inversion code tweaks --- gf16/gfmat_inv.cpp | 203 ++++++++++++++++++++++++--------------------- gf16/gfmat_inv.h | 13 +-- 2 files changed, 116 insertions(+), 100 deletions(-) diff --git a/gf16/gfmat_inv.cpp b/gf16/gfmat_inv.cpp index fbf03900..921a930c 100644 --- a/gf16/gfmat_inv.cpp +++ b/gf16/gfmat_inv.cpp @@ -14,6 +14,17 @@ extern "C" uint16_t* gf16_recip; static const unsigned MIN_THREAD_REC = 10; // minimum number of rows to process on a thread +struct Galois16RecMatrixComputeState { + uint16_t* coeff; + Galois16Mul gf; + void* gfScratch; + unsigned validCount; + void* srcRowsBase[PP_INVERT_MAX_MULTI_ROWS]; + std::vector workers; + + Galois16RecMatrixComputeState(Galois16Methods method) : gf(method) {} +}; + class Galois16RecMatrixWorker { const Galois16Mul& gf; public: @@ -38,18 +49,22 @@ class Galois16RecMatrixWorker { struct Galois16RecMatrixWorkerMessage { unsigned stripeStart, stripeEnd; unsigned recFirst, recLast; - unsigned 
recSrc; unsigned recSrcCount; uint16_t* rowCoeffs; void** srcRows; Galois16Mul* gf; void* gfScratch; + unsigned recSrc; unsigned recSrcCount; uint16_t* rowCoeffs; Galois16Mul* gf; void* gfScratch; + void* (&srcRowsBase)[PP_INVERT_MAX_MULTI_ROWS]; unsigned coeffWidth; - void(Galois16RecMatrix::*fn)(unsigned, unsigned, unsigned, unsigned, unsigned, unsigned, uint16_t*, unsigned, void**, Galois16Mul&, void*, const void*); + void(Galois16RecMatrix::*fn)(unsigned, unsigned, unsigned, unsigned, unsigned, unsigned, uint16_t*, unsigned, void*(&)[PP_INVERT_MAX_MULTI_ROWS], Galois16Mul&, void*, const void*); Galois16RecMatrix* parent; std::atomic* procRefs; std::promise* done; + + Galois16RecMatrixWorkerMessage(Galois16RecMatrixComputeState& state) + : rowCoeffs(state.coeff), gf(&state.gf), srcRowsBase(state.srcRowsBase) {} }; static void invert_worker(ThreadMessageQueue& q) { Galois16RecMatrixWorkerMessage* req; while((req = static_cast(q.pop())) != NULL) { - (req->parent->*(req->fn))(req->stripeStart, req->stripeEnd, req->recFirst, req->recLast, req->recSrc, req->recSrcCount, req->rowCoeffs, req->coeffWidth, req->srcRows, *(req->gf), req->gfScratch, nullptr); + (req->parent->*(req->fn))(req->stripeStart, req->stripeEnd, req->recFirst, req->recLast, req->recSrc, req->recSrcCount, req->rowCoeffs, req->coeffWidth, req->srcRowsBase, *(req->gf), req->gfScratch, nullptr); if(req->procRefs->fetch_sub(1, std::memory_order_acq_rel) <= 1) { req->done->set_value(); } @@ -62,7 +77,7 @@ static void invert_worker(ThreadMessageQueue& q) { #define ROUND_DIV(a, b) (((a) + ((b)>>1)) / (b)) template -void Galois16RecMatrix::invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, unsigned recSrcCount, uint16_t* rowCoeffs, unsigned coeffWidth, void** srcRows, Galois16Mul& gf, void* gfScratch, const void* nextPf) { +void Galois16RecMatrix::invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, 
unsigned recSrcCount, uint16_t* rowCoeffs, unsigned coeffWidth, void* (&srcRowsBase)[PP_INVERT_MAX_MULTI_ROWS], Galois16Mul& gf, void* gfScratch, const void* nextPf) { assert(recSrcCount % rows == 0); for(unsigned stripe=stripeStart; stripe 1) { if(HEDLEY_LIKELY(pf)) gf.mul_add_multi_stridepf(rows, stripeWidth, MAT_ROW(stripe, curRec2), MAT_ROW(stripe, rec), stripeWidth, coeffPtr, gfScratch, pf); - else - gf.mul_add_multi(rows, stripeWidth*(numRec*stripe + recI), MAT_ROW(0, curRec2) - recI*stripeWidth/sizeof(uint16_t), srcRows, stripeWidth, coeffPtr, gfScratch); + else { + unsigned offset = rec*stripeWidth; + gf.mul_add_multi(rows, stripeWidth*numRec*stripe + offset, MAT_ROW(0, curRec2) - offset/sizeof(uint16_t), srcRowsBase, stripeWidth, coeffPtr, gfScratch); + } } else { if(HEDLEY_LIKELY(pf)) gf.mul_add_pf(MAT_ROW(stripe, curRec2), MAT_ROW(stripe, rec), stripeWidth, *coeffPtr, gfScratch, pf); @@ -100,55 +117,50 @@ void Galois16RecMatrix::invertLoop(unsigned stripeStart, unsigned stripeEnd, uns } } -#define REPLACE_WORD(r, c, v) gf.replace_word(MAT_ROW((c)/(stripeWidth / sizeof(uint16_t)), r), (c)%(stripeWidth / sizeof(uint16_t)), v) +#define REPLACE_WORD(r, c, v) state.gf.replace_word(MAT_ROW((c)/(stripeWidth / sizeof(uint16_t)), r), (c)%(stripeWidth / sizeof(uint16_t)), v) template -int Galois16RecMatrix::initScale(unsigned rec, unsigned validCount, unsigned recFirst, unsigned recLast, Galois16Mul& gf, void* gfScratch) { +int Galois16RecMatrix::initScale(Galois16RecMatrixComputeState& state, unsigned rec, unsigned recFirst, unsigned recLast) { assert(recFirst <= recLast); + assert(rec != recFirst); - unsigned missingCol = validCount + rec; - - uint16_t baseCoeff; - uint16_t coeff[rows]; - - void* srcRows[rows]; - srcRows[0] = MAT_ROW(0, rec); - for(unsigned i=1; i 6 case not handled"); + return -1; #undef SCALE_ROW #undef MULADD_ROW @@ -245,53 +258,42 @@ int Galois16RecMatrix::initScale(unsigned rec, unsigned validCount, unsigned rec #undef MULADD_MULTI_LASTROW 
} -void Galois16RecMatrix::fillCoeffs(uint16_t* rowCoeffs, unsigned rows, unsigned validCount, unsigned recFirst, unsigned recLast, unsigned rec, unsigned coeffWidth, Galois16Mul& gf) { - unsigned missingCol = validCount + rec; - if(recFirst == rec) recFirst += rows; +void Galois16RecMatrix::fillCoeffs(Galois16RecMatrixComputeState& state, unsigned rows, unsigned recFirst, unsigned recLast, unsigned rec, unsigned coeffWidth) { + assert(rec != recFirst); + unsigned missingCol = state.validCount + rec; for(unsigned r=recFirst; r -void Galois16RecMatrix::processRow(unsigned rec, unsigned recCount, unsigned recFirst, unsigned recLast, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs, unsigned coeffWidth, std::vector& workers) { +void Galois16RecMatrix::processRow(Galois16RecMatrixComputeState& state, unsigned rec, unsigned recCount, unsigned recFirst, unsigned recLast, unsigned coeffWidth) { // TODO: consider optimisation for numStripes == 1 ? - assert(recFirst <= recLast); - - void* srcRows[rows]; - srcRows[0] = MAT_ROW(0, rec); - for(unsigned i=1; i= recLast) return; - // do main elimination, using the source group - if(workers.empty()) + if(state.workers.empty()) // process elimination directly - invertLoop(0, numStripes, recFirst, recLast, rec, recCount, rowCoeffs, coeffWidth, srcRows, gf, gfScratch, nextScaleRow); + invertLoop(0, numStripes, recFirst, recLast, rec, recCount, state.coeff, coeffWidth, state.srcRowsBase, state.gf, state.gfScratch, nextScaleRow); else { // process using workers std::atomic procRefs; std::promise done; auto makeReq = [&, this]() -> Galois16RecMatrixWorkerMessage* { - auto* req = new Galois16RecMatrixWorkerMessage; + auto* req = new Galois16RecMatrixWorkerMessage(state); req->recFirst = recFirst; req->recLast = recLast; req->recSrc = rec; req->recSrcCount = recCount; - req->rowCoeffs = rowCoeffs; - req->srcRows = srcRows; - req->gf = &gf; req->coeffWidth = coeffWidth; req->fn = &Galois16RecMatrix::invertLoop; req->parent = this; @@ 
-299,11 +301,11 @@ void Galois16RecMatrix::processRow(unsigned rec, unsigned recCount, unsigned rec req->done = &done; return req; }; - if(numStripes >= workers.size()) { // split full stripes across workers - float stripesPerWorker = (float)numStripes / workers.size(); + if(numStripes >= state.workers.size()) { // split full stripes across workers + float stripesPerWorker = (float)numStripes / state.workers.size(); float stripe = 0.5; - procRefs.store(workers.size()); - for(auto& worker : workers) { + procRefs.store((int)state.workers.size()); + for(auto& worker : state.workers) { auto* req = makeReq(); req->stripeStart = (unsigned)stripe; req->stripeEnd = (unsigned)(stripe + stripesPerWorker); @@ -311,10 +313,11 @@ void Galois16RecMatrix::processRow(unsigned rec, unsigned recCount, unsigned rec worker.thread.send(req); stripe += stripesPerWorker; } + assert((unsigned)stripe == numStripes); } else { // each stripe may need >1 worker std::vector reqs; - reqs.reserve(workers.size()); - float workersPerStripe = (float)workers.size() / numStripes; + reqs.reserve(state.workers.size()); + float workersPerStripe = (float)state.workers.size() / numStripes; float workerCnt = 0.5; for(unsigned stripe=0; stripe 0); for(unsigned i=0; igfScratch = worker.gfScratch; worker.thread.send(req); @@ -367,7 +371,7 @@ void Galois16RecMatrix::processRow(unsigned rec, unsigned recCount, unsigned rec template -int Galois16RecMatrix::processRows(unsigned& rec, unsigned rowGroupSize, unsigned validCount, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs, std::vector& workers, std::function progressCb, uint16_t progressOffset, uint16_t totalProgress) { +int Galois16RecMatrix::processRows(Galois16RecMatrixComputeState& state, unsigned& rec, unsigned rowGroupSize, std::function progressCb, uint16_t progressOffset, uint16_t totalProgress) { unsigned alignedRowGroupSize = (rowGroupSize / rows) * rows; while(rec <= numRec-rows) { @@ -386,10 +390,14 @@ int 
Galois16RecMatrix::processRows(unsigned& rec, unsigned rowGroupSize, unsigne // loop through this row group (normalize values) for(; rec < curRowGroupSize+recStart; rec+=rows) { if(progressCb) progressCb(progressBase + (((rec-recStart)*progressRatio+32768)>>16), totalProgress); - int badRowOffset = initScale(rec, validCount, recStart, curRowGroupSize+recStart, gf, gfScratch); + unsigned recFirst = recStart; + if(recFirst == rec) recFirst += rows; + + int badRowOffset = initScale(state, rec, recFirst, curRowGroupSize+recStart); if(badRowOffset >= 0) return rec+badRowOffset; - fillCoeffs(rowCoeffs, rows, validCount, recStart, curRowGroupSize+recStart, rec, rows, gf); - processRow(rec, rows, recStart, curRowGroupSize+recStart, gf, gfScratch, rowCoeffs, rows, workers); + if(recFirst == curRowGroupSize+recStart) continue; + fillCoeffs(state, rows, recFirst, curRowGroupSize+recStart, rec, rows); + processRow(state, rec, rows, recFirst, curRowGroupSize+recStart, rows); } @@ -410,8 +418,9 @@ int Galois16RecMatrix::processRows(unsigned& rec, unsigned rowGroupSize, unsigne curRowGroupSize2 = numRec-recGroup; if(recGroup < recStart && recGroup+curRowGroupSize2 > recStart) curRowGroupSize2 = recStart-recGroup; // don't let this group cross into the normalized group - fillCoeffs(rowCoeffs, curRowGroupSize, validCount, recGroup, recGroup+curRowGroupSize2, recStart, curRowGroupSize, gf); - processRow(recStart, curRowGroupSize, recGroup, recGroup+curRowGroupSize2, gf, gfScratch, rowCoeffs, curRowGroupSize, workers); + assert(curRowGroupSize2 > 0); + fillCoeffs(state, curRowGroupSize, recGroup, recGroup+curRowGroupSize2, recStart, curRowGroupSize); + processRow(state, recStart, curRowGroupSize, recGroup, recGroup+curRowGroupSize2, curRowGroupSize); recGroup += curRowGroupSize2; } } @@ -520,7 +529,7 @@ void Galois16RecMatrix::Construct(const std::vector& inputValid, unsigned } bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned validCount, std::vector& recovery, 
std::function progressCb) { - numRec = inputValid.size() - validCount; + numRec = (unsigned)inputValid.size() - validCount; assert(validCount < inputValid.size()); // i.e. numRec > 0 assert(inputValid.size() <= 32768 && inputValid.size() > 0); assert(recovery.size() <= 65535 && recovery.size() > 0); @@ -528,14 +537,15 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va if(numRec > recovery.size()) return false; - unsigned matWidth = inputValid.size() * sizeof(uint16_t); - Galois16Mul gf(Galois16Mul::default_method(matWidth, inputValid.size(), inputValid.size(), true)); - const auto gfInfo = gf.info(); + unsigned matWidth = (unsigned)inputValid.size() * sizeof(uint16_t); + Galois16RecMatrixComputeState state(Galois16Mul::default_method(matWidth, (unsigned)inputValid.size(), (unsigned)inputValid.size(), true)); + state.validCount = validCount; + const auto gfInfo = state.gf.info(); // divide the matrix up into evenly sized stripes (for loop tiling optimisation) - numStripes = ROUND_DIV(matWidth, gfInfo.idealChunkSize); + numStripes = ROUND_DIV(matWidth, (unsigned)gfInfo.idealChunkSize); if(numStripes < 1) numStripes = 1; - stripeWidth = gf.alignToStride(CEIL_DIV(matWidth, numStripes)); + stripeWidth = (unsigned)state.gf.alignToStride(CEIL_DIV(matWidth, numStripes)); numStripes = CEIL_DIV(matWidth, stripeWidth); assert(numStripes >= 1); @@ -544,7 +554,7 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va unsigned matSize = numRec * stripeWidth*numStripes; ALIGN_ALLOC(mat, matSize, gfInfo.alignment); - uint16_t totalProgress = numRec + (gf.needPrepare() ? 3 : 1); // provision for prepare/finish/init-calc + uint16_t totalProgress = numRec + (state.gf.needPrepare() ? 
3 : 1); // provision for prepare/finish/init-calc // easier to handle if exponents are in order std::sort(recovery.begin(), recovery.end()); @@ -555,32 +565,35 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va setup_pmul(); } - std::vector workers; - void* gfScratch; + state.srcRowsBase[0] = mat; + for(unsigned i=1; i 1) { - workers.reserve(_numThreads); + state.workers.reserve(_numThreads); for(unsigned i=0; i<_numThreads; i++) { - workers.emplace_back(gf); - workers[i].thread.name = "gauss_worker"; - workers[i].thread.setCallback(invert_worker); + state.workers.emplace_back(state.gf); + state.workers[i].thread.name = "gauss_worker"; + state.workers[i].thread.setCallback(invert_worker); } - gfScratch = workers[0].gfScratch; + state.gfScratch = state.workers[0].gfScratch; } else - gfScratch = gf.mutScratch_alloc(); + state.gfScratch = state.gf.mutScratch_alloc(); // target L3 slice? use 1MB target for now; TODO: improve this unsigned rowGroupSize = (1024*1024 / stripeWidth); // if it's going to be split amongst cores, increase the number of rows in a group if(numStripes < _numThreads) rowGroupSize *= _numThreads/numStripes; - if(rowGroupSize < gfInfo.idealInputMultiple*2) rowGroupSize = gfInfo.idealInputMultiple*2; + unsigned rowMultiple = (std::min)(gfInfo.idealInputMultiple, PP_INVERT_MAX_MULTI_ROWS); + if(rowGroupSize < rowMultiple*2) rowGroupSize = rowMultiple*2; if(rowGroupSize > numRec) rowGroupSize = numRec; invert_loop: { // loop, in the unlikely case we hit the PAR2 un-invertability flaw; TODO: is there a faster way than just retrying? 
if(numRec > recovery.size()) { // not enough recovery if(_numThreads <= 1) - gf.mutScratch_free(gfScratch); + state.gf.mutScratch_free(state.gfScratch); ALIGN_FREE(mat); mat = nullptr; return false; @@ -591,18 +604,18 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va // pre-transform uint16_t progressOffset = 1; - if(gf.needPrepare()) { + if(state.gf.needPrepare()) { if(progressCb) progressCb(1, totalProgress); progressOffset = 2; - gf.prepare(mat, mat, matSize); + state.gf.prepare(mat, mat, matSize); } // invert unsigned rec = 0; #define INVERT_GROUP(rows) \ if(gfInfo.idealInputMultiple >= rows && numRec >= rows) { \ - int badRow = processRows(rec, rowGroupSize, validCount, gf, gfScratch, rowCoeffs, workers, progressCb, progressOffset, totalProgress); \ + int badRow = processRows(state, rec, rowGroupSize, progressCb, progressOffset, totalProgress); \ if(badRow >= 0) { \ /* ignore this recovery row and try again */ \ recovery.erase(recovery.begin() + badRow); \ @@ -610,21 +623,21 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va } \ } // max out at 6 groups (registers + cache assoc?) - uint16_t* rowCoeffs = new uint16_t[rowGroupSize*rowGroupSize]; + state.coeff = new uint16_t[rowGroupSize*rowGroupSize]; INVERT_GROUP(6) INVERT_GROUP(5) INVERT_GROUP(4) INVERT_GROUP(3) INVERT_GROUP(2) INVERT_GROUP(1) - delete[] rowCoeffs; + delete[] state.coeff; #undef INVERT_GROUP // post transform - if(gf.needPrepare()) { + if(state.gf.needPrepare()) { if(progressCb) progressCb(totalProgress-1, totalProgress); - gf.finish(mat, matSize); + state.gf.finish(mat, matSize); // TODO: check for zeroes?? 
} } @@ -633,7 +646,7 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va recovery.resize(numRec); if(_numThreads <= 1) - gf.mutScratch_free(gfScratch); + state.gf.mutScratch_free(state.gfScratch); return true; } diff --git a/gf16/gfmat_inv.h b/gf16/gfmat_inv.h index 7490dcaf..3ad049c6 100644 --- a/gf16/gfmat_inv.h +++ b/gf16/gfmat_inv.h @@ -6,8 +6,11 @@ #include "../src/stdint.h" #ifdef PARPAR_INVERT_SUPPORT +const unsigned PP_INVERT_MAX_MULTI_ROWS = 6; // process up to 6 rows in a multi-mul call + class Galois16Mul; class Galois16RecMatrixWorker; +struct Galois16RecMatrixComputeState; class Galois16RecMatrix { uint16_t* mat; unsigned numStripes; @@ -17,14 +20,14 @@ class Galois16RecMatrix { void Construct(const std::vector& inputValid, unsigned validCount, const std::vector& recovery); template - void invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, unsigned recSrcCount, uint16_t* rowCoeffs, unsigned coeffWidth, void** srcRows, Galois16Mul& gf, void* gfScratch, const void* nextPf); + void invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, unsigned recSrcCount, uint16_t* rowCoeffs, unsigned coeffWidth, void* (&srcRowsBase)[PP_INVERT_MAX_MULTI_ROWS], Galois16Mul& gf, void* gfScratch, const void* nextPf); template - int initScale(unsigned rec, unsigned validCount, unsigned recFirst, unsigned recLast, Galois16Mul& gf, void* gfScratch); - void fillCoeffs(uint16_t* rowCoeffs, unsigned rows, unsigned validCount, unsigned recFirst, unsigned recLast, unsigned rec, unsigned coeffWidth, Galois16Mul& gf); + int initScale(Galois16RecMatrixComputeState& state, unsigned rec, unsigned recFirst, unsigned recLast); + void fillCoeffs(Galois16RecMatrixComputeState& state, unsigned rows, unsigned recFirst, unsigned recLast, unsigned rec, unsigned coeffWidth); template - void processRow(unsigned rec, unsigned recCount, unsigned recFirst, unsigned 
recLast, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs, unsigned coeffWidth, std::vector& workers); + void processRow(Galois16RecMatrixComputeState& state, unsigned rec, unsigned recCount, unsigned recFirst, unsigned recLast, unsigned coeffWidth); template - int processRows(unsigned& rec, unsigned rowGroupSize, unsigned validCount, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs, std::vector& workers, std::function progressCb, uint16_t progressOffset, uint16_t totalProgress); + int processRows(Galois16RecMatrixComputeState& state, unsigned& rec, unsigned rowGroupSize, std::function progressCb, uint16_t progressOffset, uint16_t totalProgress); public: Galois16RecMatrix(); ~Galois16RecMatrix(); From 2c0f4005898626612284a502745377acc227ee4b Mon Sep 17 00:00:00 2001 From: animetosho Date: Thu, 20 Jul 2023 21:31:14 +1000 Subject: [PATCH 39/91] Add missing VZEROUPPER in x86 pmul kernel --- gf16/gf16pmul_clmul_x86.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gf16/gf16pmul_clmul_x86.h b/gf16/gf16pmul_clmul_x86.h index 9dfa801e..654449fa 100644 --- a/gf16/gf16pmul_clmul_x86.h +++ b/gf16/gf16pmul_clmul_x86.h @@ -222,6 +222,9 @@ void _FN(gf16pmul_clmul)(void *HEDLEY_RESTRICT dst, const void* src1, const void _MMI(store)((_mword*)(_dst + ptr), result); } #endif +#if MWORD_SIZE >= 32 + _mm256_zeroupper(); +#endif } #else From a57701a4aa757fb3306072acf2e321b82fa82285 Mon Sep 17 00:00:00 2001 From: animetosho Date: Fri, 21 Jul 2023 11:30:54 +1000 Subject: [PATCH 40/91] Tweak prefetching during inversion --- gf16/gf16_muladd_multi.h | 4 +- gf16/gfmat_inv.cpp | 94 +++++++++++++++++++++++++--------------- gf16/gfmat_inv.h | 14 +++--- 3 files changed, 68 insertions(+), 44 deletions(-) diff --git a/gf16/gf16_muladd_multi.h b/gf16/gf16_muladd_multi.h index 98ccc9d4..cb938cb4 100644 --- a/gf16/gf16_muladd_multi.h +++ b/gf16/gf16_muladd_multi.h @@ -167,7 +167,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_muladd_multi_stridepf(const void *HEDLEY_R _SRC(interleave, 9), 
_SRC(interleave, 10), _SRC(interleave, 11), _SRC(interleave, 12), _SRC(interleave,13), _SRC(interleave, 14), _SRC(interleave, 15), _SRC(interleave, 16), _SRC(interleave,17), - len, coefficients + region, 1, _pf + len, coefficients + region, 2, _pf ); srcEnd += srcStride*interleave; outputPfRounds--; @@ -201,7 +201,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_muladd_multi_stridepf(const void *HEDLEY_R _SRC(x, 9), _SRC(x, 10), _SRC(x, 11), _SRC(x, 12), \ _SRC(x,13), _SRC(x, 14), _SRC(x, 15), _SRC(x, 16), \ _SRC(x,17), \ - len, coefficients + region, 1, _pf \ + len, coefficients + region, 2, _pf \ ); \ break REMAINING_CASES; diff --git a/gf16/gfmat_inv.cpp b/gf16/gfmat_inv.cpp index 921a930c..8ba80fbe 100644 --- a/gf16/gfmat_inv.cpp +++ b/gf16/gfmat_inv.cpp @@ -21,6 +21,7 @@ struct Galois16RecMatrixComputeState { unsigned validCount; void* srcRowsBase[PP_INVERT_MAX_MULTI_ROWS]; std::vector workers; + unsigned pfFactor; Galois16RecMatrixComputeState(Galois16Methods method) : gf(method) {} }; @@ -52,19 +53,20 @@ struct Galois16RecMatrixWorkerMessage { unsigned recSrc; unsigned recSrcCount; uint16_t* rowCoeffs; Galois16Mul* gf; void* gfScratch; void* (&srcRowsBase)[PP_INVERT_MAX_MULTI_ROWS]; unsigned coeffWidth; - void(Galois16RecMatrix::*fn)(unsigned, unsigned, unsigned, unsigned, unsigned, unsigned, uint16_t*, unsigned, void*(&)[PP_INVERT_MAX_MULTI_ROWS], Galois16Mul&, void*, const void*); + void(Galois16RecMatrix::*fn)(unsigned, unsigned, unsigned, unsigned, unsigned, unsigned, uint16_t*, unsigned, void*(&)[PP_INVERT_MAX_MULTI_ROWS], Galois16Mul&, void*, const void*, unsigned); + unsigned pfFactor; Galois16RecMatrix* parent; std::atomic* procRefs; std::promise* done; Galois16RecMatrixWorkerMessage(Galois16RecMatrixComputeState& state) - : rowCoeffs(state.coeff), gf(&state.gf), srcRowsBase(state.srcRowsBase) {} + : rowCoeffs(state.coeff), gf(&state.gf), srcRowsBase(state.srcRowsBase), pfFactor(state.pfFactor) {} }; static void invert_worker(ThreadMessageQueue& q) { 
Galois16RecMatrixWorkerMessage* req; while((req = static_cast(q.pop())) != NULL) { - (req->parent->*(req->fn))(req->stripeStart, req->stripeEnd, req->recFirst, req->recLast, req->recSrc, req->recSrcCount, req->rowCoeffs, req->coeffWidth, req->srcRowsBase, *(req->gf), req->gfScratch, nullptr); + (req->parent->*(req->fn))(req->stripeStart, req->stripeEnd, req->recFirst, req->recLast, req->recSrc, req->recSrcCount, req->rowCoeffs, req->coeffWidth, req->srcRowsBase, *(req->gf), req->gfScratch, nullptr, req->pfFactor); if(req->procRefs->fetch_sub(1, std::memory_order_acq_rel) <= 1) { req->done->set_value(); } @@ -76,9 +78,14 @@ static void invert_worker(ThreadMessageQueue& q) { #define CEIL_DIV(a, b) (((a) + (b)-1) / (b)) #define ROUND_DIV(a, b) (((a) + ((b)>>1)) / (b)) -template -void Galois16RecMatrix::invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, unsigned recSrcCount, uint16_t* rowCoeffs, unsigned coeffWidth, void* (&srcRowsBase)[PP_INVERT_MAX_MULTI_ROWS], Galois16Mul& gf, void* gfScratch, const void* nextPf) { +template +void Galois16RecMatrix::invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, unsigned recSrcCount, uint16_t* rowCoeffs, unsigned coeffWidth, void* (&srcRowsBase)[PP_INVERT_MAX_MULTI_ROWS], Galois16Mul& gf, void* gfScratch, const void* nextPf, unsigned pfFactor) { assert(recSrcCount % rows == 0); + // when to start prefetching the next stripe + unsigned recStartPf = 0; + if(recSrcCount > rows<= recStartPf) { + if(recI == recStartPf && curRec2 == recFirst) { + if(stripe < stripeEnd-1) + // prefetch next stripe + pf = (const uint8_t*)(MAT_ROW(stripe+1, recFirst)); + else + pf = (const uint8_t*)nextPf; + } else if(pf) { + pf += stripeWidth >> pfFactor; + } + // TODO: if numStripes==1, we might want to avoid prefetching the same row group as the first applyRows loop would + } else + pf = nullptr; + uint16_t* target = MAT_ROW(stripe, 
curRec2); uint16_t* coeffPtr = rowCoeffs + (curRec2-recFirst)*coeffWidth + recI; if(rows > 1) { - if(HEDLEY_LIKELY(pf)) - gf.mul_add_multi_stridepf(rows, stripeWidth, MAT_ROW(stripe, curRec2), MAT_ROW(stripe, rec), stripeWidth, coeffPtr, gfScratch, pf); + if(pf) + gf.mul_add_multi_stridepf(rows, stripeWidth, target, MAT_ROW(stripe, rec), stripeWidth, coeffPtr, gfScratch, pf); else { unsigned offset = rec*stripeWidth; gf.mul_add_multi(rows, stripeWidth*numRec*stripe + offset, MAT_ROW(0, curRec2) - offset/sizeof(uint16_t), srcRowsBase, stripeWidth, coeffPtr, gfScratch); } } else { - if(HEDLEY_LIKELY(pf)) - gf.mul_add_pf(MAT_ROW(stripe, curRec2), MAT_ROW(stripe, rec), stripeWidth, *coeffPtr, gfScratch, pf); + if(pf) + gf.mul_add_pf(target, MAT_ROW(stripe, rec), stripeWidth, *coeffPtr, gfScratch, pf); else - gf.mul_add(MAT_ROW(stripe, curRec2), MAT_ROW(stripe, rec), stripeWidth, *coeffPtr, gfScratch); + gf.mul_add(target, MAT_ROW(stripe, rec), stripeWidth, *coeffPtr, gfScratch); } } } @@ -119,8 +130,8 @@ void Galois16RecMatrix::invertLoop(unsigned stripeStart, unsigned stripeEnd, uns #define REPLACE_WORD(r, c, v) state.gf.replace_word(MAT_ROW((c)/(stripeWidth / sizeof(uint16_t)), r), (c)%(stripeWidth / sizeof(uint16_t)), v) -template -int Galois16RecMatrix::initScale(Galois16RecMatrixComputeState& state, unsigned rec, unsigned recFirst, unsigned recLast) { +template +int Galois16RecMatrix::scaleRows(Galois16RecMatrixComputeState& state, unsigned rec, unsigned recFirst, unsigned recLast) { assert(recFirst <= recLast); assert(rec != recFirst); @@ -185,8 +196,10 @@ int Galois16RecMatrix::initScale(Galois16RecMatrixComputeState& state, unsigned return -1; \ } - // the next row when `processRow` is called; last action will prefetch this row - uint16_t* nextScaleRow = (rec+rows < recLast) ? 
MAT_ROW(0, rec+rows) : nullptr; + // the next row when `applyRows` is called; last action will prefetch this row + uint16_t* nextScaleRow = nullptr; + if(!state.workers.empty() && recFirst < recLast) + nextScaleRow = MAT_ROW(0, recFirst); // only prefetch if we're not sending data to threads // TODO: consider loop tiling this stuff; requires extracting a small matrix (rows*rows), and solving that, which means a scalar multiply is necessary @@ -271,20 +284,19 @@ void Galois16RecMatrix::fillCoeffs(Galois16RecMatrixComputeState& state, unsigne } } -template -void Galois16RecMatrix::processRow(Galois16RecMatrixComputeState& state, unsigned rec, unsigned recCount, unsigned recFirst, unsigned recLast, unsigned coeffWidth) { +template +void Galois16RecMatrix::applyRows(Galois16RecMatrixComputeState& state, unsigned rec, unsigned recCount, unsigned recFirst, unsigned recLast, unsigned coeffWidth, int nextRow) { // TODO: consider optimisation for numStripes == 1 ? assert(recFirst < recLast); assert(rec != recFirst); - // the next row when `processRow` is called; last action will prefetch this row - uint16_t* nextScaleRow = (rec+rows < recLast) ? MAT_ROW(0, rec+rows) : nullptr; // do main elimination, using the source group - if(state.workers.empty()) + if(state.workers.empty()) { // process elimination directly - invertLoop(0, numStripes, recFirst, recLast, rec, recCount, state.coeff, coeffWidth, state.srcRowsBase, state.gf, state.gfScratch, nextScaleRow); - else { + uint16_t* nextScaleRow = nextRow >= 0 ? 
MAT_ROW(0, (unsigned)nextRow) : nullptr; + invertLoop(0, numStripes, recFirst, recLast, rec, recCount, state.coeff, coeffWidth, state.srcRowsBase, state.gf, state.gfScratch, nextScaleRow, state.pfFactor); + } else { // process using workers std::atomic procRefs; std::promise done; @@ -370,7 +382,7 @@ void Galois16RecMatrix::processRow(Galois16RecMatrixComputeState& state, unsigne #undef MAT_ROW -template +template int Galois16RecMatrix::processRows(Galois16RecMatrixComputeState& state, unsigned& rec, unsigned rowGroupSize, std::function progressCb, uint16_t progressOffset, uint16_t totalProgress) { unsigned alignedRowGroupSize = (rowGroupSize / rows) * rows; while(rec <= numRec-rows) { @@ -393,11 +405,15 @@ int Galois16RecMatrix::processRows(Galois16RecMatrixComputeState& state, unsigne unsigned recFirst = recStart; if(recFirst == rec) recFirst += rows; - int badRowOffset = initScale(state, rec, recFirst, curRowGroupSize+recStart); + int badRowOffset = scaleRows(state, rec, recFirst, curRowGroupSize+recStart); if(badRowOffset >= 0) return rec+badRowOffset; if(recFirst == curRowGroupSize+recStart) continue; fillCoeffs(state, rows, recFirst, curRowGroupSize+recStart, rec, rows); - processRow(state, rec, rows, recFirst, curRowGroupSize+recStart, rows); + + int nextRow = recStart; + if(rec+rows == curRowGroupSize+recStart) + nextRow = recStart > 0 ? 0 : (numRec>=curRowGroupSize*2 ? 
curRowGroupSize : -1); + applyRows(state, rec, rows, recFirst, curRowGroupSize+recStart, rows, nextRow); } @@ -420,7 +436,14 @@ int Galois16RecMatrix::processRows(Galois16RecMatrixComputeState& state, unsigne curRowGroupSize2 = recStart-recGroup; // don't let this group cross into the normalized group assert(curRowGroupSize2 > 0); fillCoeffs(state, curRowGroupSize, recGroup, recGroup+curRowGroupSize2, recStart, curRowGroupSize); - processRow(state, recStart, curRowGroupSize, recGroup, recGroup+curRowGroupSize2, curRowGroupSize); + + int nextRow = recGroup + curRowGroupSize2; + if((unsigned)nextRow >= numRec) + nextRow = rec+curRowGroupSize2 < numRec ? rec : -1; + else if((unsigned)nextRow+curRowGroupSize2 > numRec) + nextRow = -1; // don't over prefetch; TODO: is there a way to still prefetch something? + applyRows(state, recStart, curRowGroupSize, recGroup, recGroup+curRowGroupSize2, curRowGroupSize, nextRow); + recGroup += curRowGroupSize2; } } @@ -541,6 +564,7 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va Galois16RecMatrixComputeState state(Galois16Mul::default_method(matWidth, (unsigned)inputValid.size(), (unsigned)inputValid.size(), true)); state.validCount = validCount; const auto gfInfo = state.gf.info(); + state.pfFactor = gfInfo.prefetchDownscale; // divide the matrix up into evenly sized stripes (for loop tiling optimisation) numStripes = ROUND_DIV(matWidth, (unsigned)gfInfo.idealChunkSize); diff --git a/gf16/gfmat_inv.h b/gf16/gfmat_inv.h index 3ad049c6..ce53d7fe 100644 --- a/gf16/gfmat_inv.h +++ b/gf16/gfmat_inv.h @@ -19,14 +19,14 @@ class Galois16RecMatrix { unsigned numThreads; void Construct(const std::vector& inputValid, unsigned validCount, const std::vector& recovery); - template - void invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, unsigned recSrcCount, uint16_t* rowCoeffs, unsigned coeffWidth, void* (&srcRowsBase)[PP_INVERT_MAX_MULTI_ROWS], Galois16Mul& 
gf, void* gfScratch, const void* nextPf); - template - int initScale(Galois16RecMatrixComputeState& state, unsigned rec, unsigned recFirst, unsigned recLast); + template + void invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, unsigned recSrcCount, uint16_t* rowCoeffs, unsigned coeffWidth, void* (&srcRowsBase)[PP_INVERT_MAX_MULTI_ROWS], Galois16Mul& gf, void* gfScratch, const void* nextPf, unsigned pfFactor); + template + int scaleRows(Galois16RecMatrixComputeState& state, unsigned rec, unsigned recFirst, unsigned recLast); void fillCoeffs(Galois16RecMatrixComputeState& state, unsigned rows, unsigned recFirst, unsigned recLast, unsigned rec, unsigned coeffWidth); - template - void processRow(Galois16RecMatrixComputeState& state, unsigned rec, unsigned recCount, unsigned recFirst, unsigned recLast, unsigned coeffWidth); - template + template + void applyRows(Galois16RecMatrixComputeState& state, unsigned rec, unsigned recCount, unsigned recFirst, unsigned recLast, unsigned coeffWidth, int nextRow); + template int processRows(Galois16RecMatrixComputeState& state, unsigned& rec, unsigned rowGroupSize, std::function progressCb, uint16_t progressOffset, uint16_t totalProgress); public: Galois16RecMatrix(); From f5497dd4f4497d26dfed3f5b82785ca0a269b10b Mon Sep 17 00:00:00 2001 From: animetosho Date: Fri, 21 Jul 2023 13:26:29 +1000 Subject: [PATCH 41/91] Also prefetch next rowSrc in inversion --- gf16/gfmat_inv.cpp | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/gf16/gfmat_inv.cpp b/gf16/gfmat_inv.cpp index 8ba80fbe..7f0d91e4 100644 --- a/gf16/gfmat_inv.cpp +++ b/gf16/gfmat_inv.cpp @@ -82,9 +82,11 @@ template void Galois16RecMatrix::invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, unsigned recSrcCount, uint16_t* rowCoeffs, unsigned coeffWidth, void* (&srcRowsBase)[PP_INVERT_MAX_MULTI_ROWS], Galois16Mul& gf, void* 
gfScratch, const void* nextPf, unsigned pfFactor) { assert(recSrcCount % rows == 0); // when to start prefetching the next stripe - unsigned recStartPf = 0; + unsigned recStartPf = 0, recSrcStartPf = recFirst; if(recSrcCount > rows< rows<> pfFactor; } // TODO: if numStripes==1, we might want to avoid prefetching the same row group as the first applyRows loop would - } else - pf = nullptr; + } else { + if(curRec2 == recSrcStartPf) + // prefetch next rowSrc + pf = (const uint8_t*)(MAT_ROW(stripe, rec+rows)); + else if(curRec2 < recSrcStartPf) + pf = nullptr; + } uint16_t* target = MAT_ROW(stripe, curRec2); uint16_t* coeffPtr = rowCoeffs + (curRec2-recFirst)*coeffWidth + recI; @@ -123,6 +128,7 @@ void Galois16RecMatrix::invertLoop(unsigned stripeStart, unsigned stripeEnd, uns else gf.mul_add(target, MAT_ROW(stripe, rec), stripeWidth, *coeffPtr, gfScratch); } + if(pf) pf += stripeWidth >> pfFactor; } } } @@ -676,7 +682,6 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va Galois16RecMatrix::Galois16RecMatrix() : mat(nullptr) { numThreads = hardware_concurrency(); - if(numThreads > 4) numThreads = 4; // by default, cap at 4 threads, as scaling doesn't work so well; TODO: tweak this later numRec = 0; numStripes = 0; stripeWidth = 0; From 6f90be9ad4b095d4a978360b991d03e4426b6790 Mon Sep 17 00:00:00 2001 From: animetosho Date: Fri, 21 Jul 2023 15:33:28 +1000 Subject: [PATCH 42/91] Target 512K cache for inversion + ensure hardware_concurrency() > 0 --- gf16/gfmat_inv.cpp | 5 +++-- gf16/threadqueue.h | 7 ++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/gf16/gfmat_inv.cpp b/gf16/gfmat_inv.cpp index 7f0d91e4..26b4055b 100644 --- a/gf16/gfmat_inv.cpp +++ b/gf16/gfmat_inv.cpp @@ -612,8 +612,9 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va } else state.gfScratch = state.gf.mutScratch_alloc(); - // target L3 slice? 
use 1MB target for now; TODO: improve this - unsigned rowGroupSize = (1024*1024 / stripeWidth); + // target L2 size - 512K seems to be a reasonable guess for now; TODO: improve this + // - targeting larger L2 (e.g. >1MB) seems to perform worse, so a fixed size might end up being better + unsigned rowGroupSize = (512*1024 / stripeWidth); // if it's going to be split amongst cores, increase the number of rows in a group if(numStripes < _numThreads) rowGroupSize *= _numThreads/numStripes; unsigned rowMultiple = (std::min)(gfInfo.idealInputMultiple, PP_INVERT_MAX_MULTI_ROWS); diff --git a/gf16/threadqueue.h b/gf16/threadqueue.h index ee11a8c2..3db53cb3 100644 --- a/gf16/threadqueue.h +++ b/gf16/threadqueue.h @@ -392,8 +392,8 @@ class MessageThread { }; static inline int hardware_concurrency() { -#ifdef USE_LIBUV int threads; +#ifdef USE_LIBUV #if UV_VERSION_HEX >= 0x12c00 // 1.44.0 threads = uv_available_parallelism(); #else @@ -401,10 +401,11 @@ static inline int hardware_concurrency() { uv_cpu_info(&info, &threads); uv_free_cpu_info(info, threads); #endif - return threads; #else - return (int)std::thread::hardware_concurrency(); + threads = (int)std::thread::hardware_concurrency(); #endif + if(threads < 1) threads = 1; + return threads; } From 10739248ebf4ab5abe75d20ac6fc57a0beb93f2a Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 25 Jul 2023 18:26:55 +1000 Subject: [PATCH 43/91] Rename gf16pmul_clmul* -> gf16_pmul* --- gf16/gf16pmul.cpp | 40 +++++++++---------- gf16/gf16pmul.h | 6 +-- ...{gf16pmul_clmul_avx2.c => gf16pmul_avx2.c} | 2 +- ...{gf16pmul_clmul_neon.c => gf16pmul_neon.c} | 8 ++-- gf16/{gf16pmul_clmul_sse.c => gf16pmul_sse.c} | 2 +- ...{gf16pmul_clmul_sve2.c => gf16pmul_sve2.c} | 12 +++--- ...l_clmul_vpclgfni.c => gf16pmul_vpclgfni.c} | 2 +- ...mul_clmul_vpclmul.c => gf16pmul_vpclmul.c} | 2 +- gf16/{gf16pmul_clmul_x86.h => gf16pmul_x86.h} | 16 ++++---- 9 files changed, 45 insertions(+), 45 deletions(-) rename gf16/{gf16pmul_clmul_avx2.c => 
gf16pmul_avx2.c} (88%) rename gf16/{gf16pmul_clmul_neon.c => gf16pmul_neon.c} (80%) rename gf16/{gf16pmul_clmul_sse.c => gf16pmul_sse.c} (88%) rename gf16/{gf16pmul_clmul_sve2.c => gf16pmul_sve2.c} (77%) rename gf16/{gf16pmul_clmul_vpclgfni.c => gf16pmul_vpclgfni.c} (90%) rename gf16/{gf16pmul_clmul_vpclmul.c => gf16pmul_vpclmul.c} (89%) rename gf16/{gf16pmul_clmul_x86.h => gf16pmul_x86.h} (92%) diff --git a/gf16/gf16pmul.cpp b/gf16/gf16pmul.cpp index ffc59f35..5e1f7504 100644 --- a/gf16/gf16pmul.cpp +++ b/gf16/gf16pmul.cpp @@ -30,47 +30,47 @@ void setup_pmul() { hasGFNI = (cpuInfoX[2] & 0x100) == 0x100; #endif - if(!hasGFNI) gf16pmul_clmul_available_vpclgfni = 0; + if(!hasGFNI) gf16pmul_available_vpclgfni = 0; if(!hasVPCLMUL) { - gf16pmul_clmul_available_vpclmul = 0; - gf16pmul_clmul_available_vpclgfni = 0; + gf16pmul_available_vpclmul = 0; + gf16pmul_available_vpclgfni = 0; } - if(!hasAVX2) gf16pmul_clmul_available_avx2 = 0; - if(!hasClMul) gf16pmul_clmul_available_sse = 0; + if(!hasAVX2) gf16pmul_available_avx2 = 0; + if(!hasClMul) gf16pmul_available_sse = 0; - if(gf16pmul_clmul_available_vpclgfni) { - gf16pmul = &gf16pmul_clmul_vpclgfni; + if(gf16pmul_available_vpclgfni) { + gf16pmul = &gf16pmul_vpclgfni; gf16pmul_alignment = 32; gf16pmul_blocklen = 64; } - else if(gf16pmul_clmul_available_vpclmul) { - gf16pmul = &gf16pmul_clmul_vpclmul; + else if(gf16pmul_available_vpclmul) { + gf16pmul = &gf16pmul_vpclmul; gf16pmul_alignment = 32; gf16pmul_blocklen = 32; } - else if(gf16pmul_clmul_available_avx2) { - gf16pmul = &gf16pmul_clmul_avx2; + else if(gf16pmul_available_avx2) { + gf16pmul = &gf16pmul_avx2; gf16pmul_alignment = 32; gf16pmul_blocklen = 32; } - else if(gf16pmul_clmul_available_sse) { - gf16pmul = &gf16pmul_clmul_sse; + else if(gf16pmul_available_sse) { + gf16pmul = &gf16pmul_sse; gf16pmul_alignment = 16; gf16pmul_blocklen = 16; } #endif #ifdef PLATFORM_ARM - if(!CPU_HAS_SVE2) gf16pmul_clmul_available_sve2 = 0; - if(!CPU_HAS_NEON) 
gf16pmul_clmul_available_neon = 0; + if(!CPU_HAS_SVE2) gf16pmul_available_sve2 = 0; + if(!CPU_HAS_NEON) gf16pmul_available_neon = 0; - if(gf16pmul_clmul_available_sve2) { - gf16pmul = &gf16pmul_clmul_sve2; - gf16pmul_alignment = gf16pmul_clmul_sve2_width(); + if(gf16pmul_available_sve2) { + gf16pmul = &gf16pmul_sve2; + gf16pmul_alignment = gf16pmul_sve2_width(); gf16pmul_blocklen = gf16pmul_alignment*2; } - else if(gf16pmul_clmul_available_neon) { - gf16pmul = &gf16pmul_clmul_neon; + else if(gf16pmul_available_neon) { + gf16pmul = &gf16pmul_neon; gf16pmul_alignment = 16; gf16pmul_blocklen = 32; } diff --git a/gf16/gf16pmul.h b/gf16/gf16pmul.h index 180a9025..7ef94ded 100644 --- a/gf16/gf16pmul.h +++ b/gf16/gf16pmul.h @@ -14,8 +14,8 @@ void setup_pmul(); HEDLEY_BEGIN_C_DECLS #define _PMUL_DECL(f) \ - void gf16pmul_clmul_##f(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len); \ - extern int gf16pmul_clmul_available_##f + void gf16pmul_##f(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len); \ + extern int gf16pmul_available_##f _PMUL_DECL(sse); _PMUL_DECL(avx2); @@ -26,7 +26,7 @@ _PMUL_DECL(sve2); #undef _PMUL_DECL -unsigned gf16pmul_clmul_sve2_width(); +unsigned gf16pmul_sve2_width(); HEDLEY_END_C_DECLS diff --git a/gf16/gf16pmul_clmul_avx2.c b/gf16/gf16pmul_avx2.c similarity index 88% rename from gf16/gf16pmul_clmul_avx2.c rename to gf16/gf16pmul_avx2.c index ce965f4c..1c0cc4a1 100644 --- a/gf16/gf16pmul_clmul_avx2.c +++ b/gf16/gf16pmul_avx2.c @@ -9,4 +9,4 @@ #if defined(__PCLMUL__) && defined(__AVX2__) # define _AVAILABLE 1 #endif -#include "gf16pmul_clmul_x86.h" +#include "gf16pmul_x86.h" diff --git a/gf16/gf16pmul_clmul_neon.c b/gf16/gf16pmul_neon.c similarity index 80% rename from gf16/gf16pmul_clmul_neon.c rename to gf16/gf16pmul_neon.c index c4d8f76c..c23cc3c4 100644 --- a/gf16/gf16pmul_clmul_neon.c +++ b/gf16/gf16pmul_neon.c @@ -2,9 +2,9 @@ #include "gf16_clmul_neon.h" #ifdef __ARM_NEON -int 
gf16pmul_clmul_available_neon = 1; +int gf16pmul_available_neon = 1; -void gf16pmul_clmul_neon(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { +void gf16pmul_neon(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { assert(len % sizeof(uint8x16_t)*2 == 0); const poly8_t* _src1 = (const poly8_t*)src1 + len; @@ -32,8 +32,8 @@ void gf16pmul_clmul_neon(void *HEDLEY_RESTRICT dst, const void* src1, const void } #else // defined(__ARM_NEON) -int gf16pmul_clmul_available_neon = 0; -void gf16pmul_clmul_neon(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { +int gf16pmul_available_neon = 0; +void gf16pmul_neon(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { UNUSED(dst); UNUSED(src1); UNUSED(src2); UNUSED(len); } #endif diff --git a/gf16/gf16pmul_clmul_sse.c b/gf16/gf16pmul_sse.c similarity index 88% rename from gf16/gf16pmul_clmul_sse.c rename to gf16/gf16pmul_sse.c index 5338858d..2def2279 100644 --- a/gf16/gf16pmul_clmul_sse.c +++ b/gf16/gf16pmul_sse.c @@ -9,4 +9,4 @@ #if defined(__PCLMUL__) && defined(__SSSE3__) && defined(__SSE4_1__) # define _AVAILABLE 1 #endif -#include "gf16pmul_clmul_x86.h" +#include "gf16pmul_x86.h" diff --git a/gf16/gf16pmul_clmul_sve2.c b/gf16/gf16pmul_sve2.c similarity index 77% rename from gf16/gf16pmul_clmul_sve2.c rename to gf16/gf16pmul_sve2.c index b88fe2c2..43adc210 100644 --- a/gf16/gf16pmul_clmul_sve2.c +++ b/gf16/gf16pmul_sve2.c @@ -2,9 +2,9 @@ #include "gf16_clmul_sve2.h" #ifdef __ARM_FEATURE_SVE2 -int gf16pmul_clmul_available_sve2 = 1; +int gf16pmul_available_sve2 = 1; -void gf16pmul_clmul_sve2(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { +void gf16pmul_sve2(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { assert(len % svcntb()*2 == 0); const uint8_t* _src1 = (const uint8_t*)src1 + len; @@ -28,17 +28,17 @@ void gf16pmul_clmul_sve2(void *HEDLEY_RESTRICT dst, const void* 
src1, const void } } -unsigned gf16pmul_clmul_sve2_width() { +unsigned gf16pmul_sve2_width() { return svcntb(); } #else // defined(__ARM_FEATURE_SVE2) -int gf16pmul_clmul_available_sve2 = 0; -void gf16pmul_clmul_sve2(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { +int gf16pmul_available_sve2 = 0; +void gf16pmul_sve2(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { UNUSED(dst); UNUSED(src1); UNUSED(src2); UNUSED(len); } -unsigned gf16pmul_clmul_sve2_width() { +unsigned gf16pmul_sve2_width() { return 1; } #endif diff --git a/gf16/gf16pmul_clmul_vpclgfni.c b/gf16/gf16pmul_vpclgfni.c similarity index 90% rename from gf16/gf16pmul_clmul_vpclgfni.c rename to gf16/gf16pmul_vpclgfni.c index 26ad0478..4474f6fe 100644 --- a/gf16/gf16pmul_clmul_vpclgfni.c +++ b/gf16/gf16pmul_vpclgfni.c @@ -12,4 +12,4 @@ #if defined(__VPCLMULQDQ__) && defined(__GFNI__) && defined(__AVX2__) # define _AVAILABLE 1 #endif -#include "gf16pmul_clmul_x86.h" +#include "gf16pmul_x86.h" diff --git a/gf16/gf16pmul_clmul_vpclmul.c b/gf16/gf16pmul_vpclmul.c similarity index 89% rename from gf16/gf16pmul_clmul_vpclmul.c rename to gf16/gf16pmul_vpclmul.c index 715544a9..a4140b10 100644 --- a/gf16/gf16pmul_clmul_vpclmul.c +++ b/gf16/gf16pmul_vpclmul.c @@ -11,4 +11,4 @@ #if defined(__VPCLMULQDQ__) && defined(__AVX2__) # define _AVAILABLE 1 #endif -#include "gf16pmul_clmul_x86.h" +#include "gf16pmul_x86.h" diff --git a/gf16/gf16pmul_clmul_x86.h b/gf16/gf16pmul_x86.h similarity index 92% rename from gf16/gf16pmul_clmul_x86.h rename to gf16/gf16pmul_x86.h index 654449fa..b74d24cf 100644 --- a/gf16/gf16pmul_clmul_x86.h +++ b/gf16/gf16pmul_x86.h @@ -1,9 +1,9 @@ #include "gf16_global.h" #if defined(_AVAILABLE) -int _FN(gf16pmul_clmul_available) = 1; +int _FN(gf16pmul_available) = 1; -static HEDLEY_ALWAYS_INLINE void _FN(gf16pmul_clmul_initmul)(const _mword* src1, const _mword* src2, _mword* prod1, _mword* prod2) { +static HEDLEY_ALWAYS_INLINE void 
_FN(gf16pmul_initmul)(const _mword* src1, const _mword* src2, _mword* prod1, _mword* prod2) { _mword wordMask = _MM(set1_epi32)(0xffff); _mword data1 = _MMI(load)(src1); @@ -61,7 +61,7 @@ static HEDLEY_ALWAYS_INLINE void _FN(gf16pmul_clmul_initmul)(const _mword* src1, #endif } -void _FN(gf16pmul_clmul)(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { +void _FN(gf16pmul)(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { assert(len % sizeof(_mword) == 0); const uint8_t* _src1 = (const uint8_t*)src1 + len; @@ -93,8 +93,8 @@ void _FN(gf16pmul_clmul)(void *HEDLEY_RESTRICT dst, const void* src1, const void # endif for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(_mword)*2) { _mword prod1, prod2, prod3, prod4; - _FN(gf16pmul_clmul_initmul)((_mword*)(_src1 + ptr), (_mword*)(_src2 + ptr), &prod1, &prod2); - _FN(gf16pmul_clmul_initmul)((_mword*)(_src1 + ptr) +1, (_mword*)(_src2 + ptr) +1, &prod3, &prod4); + _FN(gf16pmul_initmul)((_mword*)(_src1 + ptr), (_mword*)(_src2 + ptr), &prod1, &prod2); + _FN(gf16pmul_initmul)((_mword*)(_src1 + ptr) +1, (_mword*)(_src2 + ptr) +1, &prod3, &prod4); // split low/high _mword tmp1 = _MM(shuffle_epi8)(prod1, shufLoHi); @@ -168,7 +168,7 @@ void _FN(gf16pmul_clmul)(void *HEDLEY_RESTRICT dst, const void* src1, const void #else for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(_mword)) { _mword prod1, prod2; - _FN(gf16pmul_clmul_initmul)((_mword*)(_src1 + ptr), (_mword*)(_src2 + ptr), &prod1, &prod2); + _FN(gf16pmul_initmul)((_mword*)(_src1 + ptr), (_mword*)(_src2 + ptr), &prod1, &prod2); // do reduction /* obvious Barret reduction strategy, using CLMUL instructions @@ -228,8 +228,8 @@ void _FN(gf16pmul_clmul)(void *HEDLEY_RESTRICT dst, const void* src1, const void } #else -int _FN(gf16pmul_clmul_available) = 0; -void _FN(gf16pmul_clmul)(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { +int _FN(gf16pmul_available) = 0; +void _FN(gf16pmul)(void 
*HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { UNUSED(dst); UNUSED(src1); UNUSED(src2); UNUSED(len); } #endif From e294f6e51683f5312b61d9e2d1d2fa044af5d18d Mon Sep 17 00:00:00 2001 From: animetosho Date: Thu, 27 Jul 2023 23:12:09 +1000 Subject: [PATCH 44/91] Prepare Affine kernel for AVX10 --- gf16/gf16_affine2x_x86.h | 4 +- gf16/gf16_affine_avx10.h | 438 ++++++++++++++++++++++++++++++++++++++ gf16/gf16_affine_avx2.c | 2 + gf16/gf16_affine_avx512.c | 420 ++---------------------------------- gf16/gf16_affine_gfni.c | 2 + gf16/gf16_muladd_multi.h | 16 +- gf16/gf16mul.cpp | 4 +- 7 files changed, 468 insertions(+), 418 deletions(-) create mode 100644 gf16/gf16_affine_avx10.h diff --git a/gf16/gf16_affine2x_x86.h b/gf16/gf16_affine2x_x86.h index b6391622..c8e494b9 100644 --- a/gf16/gf16_affine2x_x86.h +++ b/gf16/gf16_affine2x_x86.h @@ -73,9 +73,9 @@ void _FN(gf16_affine2x_prepare)(void* dst, const void* src, size_t srcLen) { #ifdef _AVAILABLE # ifdef PLATFORM_AMD64 -GF_PREPARE_PACKED_FUNCS(gf16_affine2x, _FNSUFFIX, sizeof(_mword), _FN(gf16_affine2x_prepare_block), _FN(gf16_affine2x_prepare_blocku), 6 + (MWORD_SIZE==64)*6, _MM_END, _mword checksum = _MMI(setzero)(), _FN(gf16_checksum_block), _FN(gf16_checksum_blocku), _FN(gf16_checksum_exp), _FN(gf16_checksum_prepare), sizeof(_mword)) +GF_PREPARE_PACKED_FUNCS(gf16_affine2x, _FNSUFFIX, sizeof(_mword), _FNPREP(gf16_affine2x_prepare_block), _FNPREP(gf16_affine2x_prepare_blocku), 6 + (MWORD_SIZE==64)*6, _MM_END, _mword checksum = _MMI(setzero)(), _FNPREP(gf16_checksum_block), _FNPREP(gf16_checksum_blocku), _FNPREP(gf16_checksum_exp), _FNPREP(gf16_checksum_prepare), sizeof(_mword)) # else -GF_PREPARE_PACKED_FUNCS(gf16_affine2x, _FNSUFFIX, sizeof(_mword), _FN(gf16_affine2x_prepare_block), _FN(gf16_affine2x_prepare_blocku), 2, _MM_END, _mword checksum = _MMI(setzero)(), _FN(gf16_checksum_block), _FN(gf16_checksum_blocku), _FN(gf16_checksum_exp), _FN(gf16_checksum_prepare), sizeof(_mword)) 
+GF_PREPARE_PACKED_FUNCS(gf16_affine2x, _FNSUFFIX, sizeof(_mword), _FNPREP(gf16_affine2x_prepare_block), _FNPREP(gf16_affine2x_prepare_blocku), 2, _MM_END, _mword checksum = _MMI(setzero)(), _FNPREP(gf16_checksum_block), _FNPREP(gf16_checksum_blocku), _FNPREP(gf16_checksum_exp), _FNPREP(gf16_checksum_prepare), sizeof(_mword)) # endif #else GF_PREPARE_PACKED_FUNCS_STUB(gf16_affine2x, _FNSUFFIX) diff --git a/gf16/gf16_affine_avx10.h b/gf16/gf16_affine_avx10.h new file mode 100644 index 00000000..f2e9de1a --- /dev/null +++ b/gf16/gf16_affine_avx10.h @@ -0,0 +1,438 @@ + +#ifdef _AVAILABLE +int _FN(gf16_affine_available) = 1; +# include "gf16_shuffle_x86_prepare.h" +# include "gf16_checksum_x86.h" +#else +int _FN(gf16_affine_available) = 0; +#endif + +#include "gf16_affine2x_x86.h" +#include "gf16_muladd_multi.h" + + +#ifdef _AVAILABLE +# ifdef PLATFORM_AMD64 +GF_PREPARE_PACKED_FUNCS(gf16_affine, _FNSUFFIX, sizeof(_mword)*2, _FNPREP(gf16_shuffle_prepare_block), _FNPREP(gf16_shuffle_prepare_blocku), 6, _mm256_zeroupper(), _mword checksum = _MMI(setzero)(), _FNPREP(gf16_checksum_block), _FNPREP(gf16_checksum_blocku), _FNPREP(gf16_checksum_exp), _FNPREP(gf16_checksum_prepare), sizeof(_mword)) +# else +GF_PREPARE_PACKED_FUNCS(gf16_affine, _FNSUFFIX, sizeof(_mword)*2, _FNPREP(gf16_shuffle_prepare_block), _FNPREP(gf16_shuffle_prepare_blocku), 1, _mm256_zeroupper(), _mword checksum = _MMI(setzero)(), _FNPREP(gf16_checksum_block), _FNPREP(gf16_checksum_blocku), _FNPREP(gf16_checksum_exp), _FNPREP(gf16_checksum_prepare), sizeof(_mword)) +# endif +#else +GF_PREPARE_PACKED_FUNCS_STUB(gf16_affine, _FNSUFFIX) +#endif + + +#ifdef _AVAILABLE +static HEDLEY_ALWAYS_INLINE __m256i gf16_affine_load_matrix(const void *HEDLEY_RESTRICT scratch, uint16_t coefficient) { + __m256i depmask = _mm256_xor_si256( + _mm256_load_si256((__m256i*)scratch + (coefficient & 0xf)*4), + _mm256_load_si256((__m256i*)((char*)scratch + ((coefficient << 3) & 0x780)) + 1) + ); + depmask = 
_mm256_ternarylogic_epi32( + depmask, + _mm256_load_si256((__m256i*)((char*)scratch + ((coefficient >> 1) & 0x780)) + 2), + _mm256_load_si256((__m256i*)((char*)scratch + ((coefficient >> 5) & 0x780)) + 3), + 0x96 + ); + return depmask; +} +#endif + + +#ifdef _AVAILABLE +static HEDLEY_ALWAYS_INLINE void _FN(gf16_affine_muladd_round)(const _mword* src, _mword* tpl, _mword* tph, _mword mat_ll, _mword mat_hl, _mword mat_lh, _mword mat_hh) { + _mword ta = _MMI(load)(src); + _mword tb = _MMI(load)(src + 1); + + *tpl = _MM(ternarylogic_epi32)( + _MM(gf2p8affine_epi64_epi8)(ta, mat_lh, 0), + _MM(gf2p8affine_epi64_epi8)(tb, mat_ll, 0), + *tpl, + 0x96 + ); + *tph = _MM(ternarylogic_epi32)( + _MM(gf2p8affine_epi64_epi8)(ta, mat_hh, 0), + _MM(gf2p8affine_epi64_epi8)(tb, mat_hl, 0), + *tph, + 0x96 + ); +} +static HEDLEY_ALWAYS_INLINE void _FN(gf16_affine_muladd_x)( + const void *HEDLEY_RESTRICT scratch, + uint8_t *HEDLEY_RESTRICT _dst, const unsigned srcScale, + GF16_MULADD_MULTI_SRCLIST, + size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, const int doPrefetch, const char* _pf +) { + GF16_MULADD_MULTI_SRC_UNUSED(6); + + _mword mat_All, mat_Alh, mat_Ahl, mat_Ahh; + _mword mat_Bll, mat_Blh, mat_Bhl, mat_Bhh; + _mword mat_Cll, mat_Clh, mat_Chl, mat_Chh; + _mword mat_Dll, mat_Dlh, mat_Dhl, mat_Dhh; + _mword mat_Ell, mat_Elh, mat_Ehl, mat_Ehh; + _mword mat_Fll, mat_Flh, mat_Fhl, mat_Fhh; + + _mword depmask1; + #if MWORD_SIZE == 64 + __m256i depmask256; + __m512i depmask2; + #define PERM1(dstVec, srcLL) \ + dstVec##hh = _mm512_permutex_epi64(depmask2, _MM_SHUFFLE(3,3,3,3)); \ + dstVec##lh = _mm512_permutex_epi64(depmask2, _MM_SHUFFLE(1,1,1,1)); \ + dstVec##ll = _mm512_broadcastq_epi64(srcLL); \ + dstVec##hl = _mm512_broadcastq_epi64(_mm512_castsi512_si128(depmask2)) + #define PERM2(dstVec) \ + depmask2 = _mm512_shuffle_i64x2(depmask1, depmask1, _MM_SHUFFLE(2,3,2,3)); \ + dstVec##hh = _mm512_permutex_epi64(depmask2, _MM_SHUFFLE(3,3,3,3)); \ + dstVec##lh = 
_mm512_permutex_epi64(depmask2, _MM_SHUFFLE(1,1,1,1)); \ + dstVec##ll = _mm512_permutex_epi64(depmask2, _MM_SHUFFLE(2,2,2,2)); \ + dstVec##hl = _mm512_broadcastq_epi64(_mm512_castsi512_si128(depmask2)) + + if(srcCount == 1) { + depmask256 = gf16_affine_load_matrix(scratch, coefficients[0]); + depmask2 = _mm512_castsi256_si512(depmask256); + depmask2 = _mm512_shuffle_i64x2(depmask2, depmask2, _MM_SHUFFLE(0,1,0,1)); + PERM1(mat_A, _mm256_castsi256_si128(depmask256)); + } else if(srcCount > 1) { + depmask1 = gf16_affine_load2_matrix(scratch, coefficients[0], coefficients[1]); + depmask2 = _mm512_shuffle_i64x2(depmask1, depmask1, _MM_SHUFFLE(0,1,0,1)); + PERM1(mat_A, _mm512_castsi512_si128(depmask1)); + PERM2(mat_B); + } + if(srcCount == 3) { + depmask256 = gf16_affine_load_matrix(scratch, coefficients[2]); + depmask2 = _mm512_castsi256_si512(depmask256); + depmask2 = _mm512_shuffle_i64x2(depmask2, depmask2, _MM_SHUFFLE(0,1,0,1)); + PERM1(mat_C, _mm256_castsi256_si128(depmask256)); + } else if(srcCount > 3) { + depmask1 = gf16_affine_load2_matrix(scratch, coefficients[2], coefficients[3]); + depmask2 = _mm512_shuffle_i64x2(depmask1, depmask1, _MM_SHUFFLE(0,1,0,1)); + PERM1(mat_C, _mm512_castsi512_si128(depmask1)); + PERM2(mat_D); + } + if(srcCount == 5) { + depmask256 = gf16_affine_load_matrix(scratch, coefficients[4]); + depmask2 = _mm512_castsi256_si512(depmask256); + depmask2 = _mm512_shuffle_i64x2(depmask2, depmask2, _MM_SHUFFLE(0,1,0,1)); + PERM1(mat_E, _mm256_castsi256_si128(depmask256)); + } else if(srcCount > 5) { + depmask1 = gf16_affine_load2_matrix(scratch, coefficients[4], coefficients[5]); + depmask2 = _mm512_shuffle_i64x2(depmask1, depmask1, _MM_SHUFFLE(0,1,0,1)); + PERM1(mat_E, _mm512_castsi512_si128(depmask1)); + PERM2(mat_F); + } + #undef PERM2 + #else + #define PERM1(dstVec) \ + dstVec##hh = _mm256_permute4x64_epi64(depmask1, _MM_SHUFFLE(1,1,1,1)); \ + dstVec##lh = _mm256_permute4x64_epi64(depmask1, _MM_SHUFFLE(3,3,3,3)); \ + dstVec##ll = 
_mm256_broadcastq_epi64(_mm256_castsi256_si128(depmask1)); \ + dstVec##hl = _mm256_permute4x64_epi64(depmask1, _MM_SHUFFLE(2,2,2,2)) + #define LOAD_SRC(n, dstVec) \ + if(srcCount > n) { \ + depmask1 = gf16_affine_load_matrix(scratch, coefficients[n]); \ + PERM1(dstVec); \ + } + + LOAD_SRC(0, mat_A) + LOAD_SRC(1, mat_B) + LOAD_SRC(2, mat_C) + LOAD_SRC(3, mat_D) + LOAD_SRC(4, mat_E) + LOAD_SRC(5, mat_F) + #undef LOAD_SRC + #endif + #undef PERM1 + + + for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(_mword)*2) { + _mword tph = _MMI(load)((_mword*)(_dst + ptr)); + _mword tpl = _MMI(load)((_mword*)(_dst + ptr) + 1); + _FN(gf16_affine_muladd_round)((_mword*)(_src1 + ptr*srcScale), &tpl, &tph, mat_All, mat_Ahl, mat_Alh, mat_Ahh); + if(srcCount >= 2) + _FN(gf16_affine_muladd_round)((_mword*)(_src2 + ptr*srcScale), &tpl, &tph, mat_Bll, mat_Bhl, mat_Blh, mat_Bhh); + if(srcCount >= 3) + _FN(gf16_affine_muladd_round)((_mword*)(_src3 + ptr*srcScale), &tpl, &tph, mat_Cll, mat_Chl, mat_Clh, mat_Chh); + if(srcCount >= 4) + _FN(gf16_affine_muladd_round)((_mword*)(_src4 + ptr*srcScale), &tpl, &tph, mat_Dll, mat_Dhl, mat_Dlh, mat_Dhh); + if(srcCount >= 5) + _FN(gf16_affine_muladd_round)((_mword*)(_src5 + ptr*srcScale), &tpl, &tph, mat_Ell, mat_Ehl, mat_Elh, mat_Ehh); + if(srcCount >= 6) + _FN(gf16_affine_muladd_round)((_mword*)(_src6 + ptr*srcScale), &tpl, &tph, mat_Fll, mat_Fhl, mat_Flh, mat_Fhh); + _MMI(store)((_mword*)(_dst + ptr), tph); + _MMI(store)((_mword*)(_dst + ptr)+1, tpl); + + if(doPrefetch == 1) + _mm_prefetch(_pf+(ptr>>1), MM_HINT_WT1); + if(doPrefetch == 2) + _mm_prefetch(_pf+(ptr>>1), _MM_HINT_T1); + } +} +#endif /*defined(_AVAILABLE)*/ + +void _FN(gf16_affine_muladd)(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { + UNUSED(mutScratch); +#ifdef _AVAILABLE + gf16_muladd_single(scratch, &_FN(gf16_affine_muladd_x), dst, src, len, coefficient); + 
_mm256_zeroupper(); +#else + UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficient); +#endif +} + +void _FN(gf16_affine_muladd_prefetch)(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch, const void *HEDLEY_RESTRICT prefetch) { + UNUSED(mutScratch); +#ifdef _AVAILABLE + gf16_muladd_prefetch_single(scratch, &_FN(gf16_affine_muladd_x), dst, src, len, coefficient, prefetch); + _mm256_zeroupper(); +#else + UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficient); UNUSED(prefetch); +#endif +} + +#if defined(_AVAILABLE) && defined(PLATFORM_AMD64) +GF16_MULADD_MULTI_FUNCS(gf16_affine, _FNSUFFIX, _FN(gf16_affine_muladd_x), 6, sizeof(_mword)*2, 1, _mm256_zeroupper()) +#else +GF16_MULADD_MULTI_FUNCS_STUB(gf16_affine, _FNSUFFIX) +#endif + + + +#ifdef _AVAILABLE +static HEDLEY_ALWAYS_INLINE void _FN(gf16_affine2x_muladd_2round)(const int srcCountOffs, const void* _src1, const void* _src2, _mword* result, _mword* swapped, _mword matNorm1, _mword matSwap1, _mword matNorm2, _mword matSwap2) { + if(srcCountOffs < 0) return; + + _mword data1 = _MMI(load)(_src1); + if(srcCountOffs == 0) { + *result = _MMI(xor)( + *result, + _MM(gf2p8affine_epi64_epi8)(data1, matNorm1, 0) + ); + *swapped = _MMI(xor)( + *swapped, + _MM(gf2p8affine_epi64_epi8)(data1, matSwap1, 0) + ); + } + else { // if(srcCountOffs > 0) + _mword data2 = _MMI(load)(_src2); + *result = _MM(ternarylogic_epi32)( + *result, + _MM(gf2p8affine_epi64_epi8)(data1, matNorm1, 0), + _MM(gf2p8affine_epi64_epi8)(data2, matNorm2, 0), + 0x96 + ); + *swapped = _MM(ternarylogic_epi32)( + *swapped, + _MM(gf2p8affine_epi64_epi8)(data1, matSwap1, 0), + _MM(gf2p8affine_epi64_epi8)(data2, matSwap2, 0), + 0x96 + ); + } +} +static HEDLEY_ALWAYS_INLINE void _FN(gf16_affine2x_muladd_x)( + const void *HEDLEY_RESTRICT scratch, uint8_t *HEDLEY_RESTRICT _dst, const unsigned srcScale, + 
GF16_MULADD_MULTI_SRCLIST, + size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, const int doPrefetch, const char* _pf +) { + GF16_MULADD_MULTI_SRC_UNUSED(13); + + _mword depmask; + _mword matNormA, matSwapA; + _mword matNormB, matSwapB; + _mword matNormC, matSwapC; + _mword matNormD, matSwapD; + _mword matNormE, matSwapE; + _mword matNormF, matSwapF; + _mword matNormG, matSwapG; + _mword matNormH, matSwapH; + _mword matNormI, matSwapI; + _mword matNormJ, matSwapJ; + _mword matNormK, matSwapK; + _mword matNormL, matSwapL; + _mword matNormM, matSwapM; + + // prevent MSVC whining + matNormB = matSwapB = matNormC = matSwapC = matNormD = matSwapD = matNormE = matSwapE = matNormF = matSwapF = matNormG = matSwapG = matNormH = matSwapH = matNormI = matSwapI = matNormJ = matSwapJ = matNormK = matSwapK = matNormL = matSwapL = matNormM = matSwapM = +# if MWORD_SIZE == 64 + _mm512_undefined_epi32(); +# else + _mm256_undefined_si256(); +# endif + +# if MWORD_SIZE == 64 + if(srcCount == 1) { + depmask = _mm512_castsi256_si512(gf16_affine_load_matrix(scratch, coefficients[0])); + matNormA = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); + matSwapA = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); + } + if(srcCount > 1) { + depmask = gf16_affine_load2_matrix(scratch, coefficients[0], coefficients[1]); + matNormA = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); + matSwapA = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); + matNormB = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(2,2,2,2)); + matSwapB = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(3,3,3,3)); + } + if(srcCount == 3) { + depmask = _mm512_castsi256_si512(gf16_affine_load_matrix(scratch, coefficients[2])); + matNormC = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); + matSwapC = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); + } + if(srcCount > 3) { + depmask = gf16_affine_load2_matrix(scratch, coefficients[2], 
coefficients[3]); + matNormC = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); + matSwapC = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); + matNormD = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(2,2,2,2)); + matSwapD = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(3,3,3,3)); + } + if(srcCount == 5) { + depmask = _mm512_castsi256_si512(gf16_affine_load_matrix(scratch, coefficients[4])); + matNormE = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); + matSwapE = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); + } + if(srcCount > 5) { + depmask = gf16_affine_load2_matrix(scratch, coefficients[4], coefficients[5]); + matNormE = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); + matSwapE = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); + matNormF = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(2,2,2,2)); + matSwapF = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(3,3,3,3)); + } + if(srcCount == 7) { + depmask = _mm512_castsi256_si512(gf16_affine_load_matrix(scratch, coefficients[6])); + matNormG = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); + matSwapG = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); + } + if(srcCount > 7) { + depmask = gf16_affine_load2_matrix(scratch, coefficients[6], coefficients[7]); + matNormG = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); + matSwapG = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); + matNormH = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(2,2,2,2)); + matSwapH = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(3,3,3,3)); + } + if(srcCount == 9) { + depmask = _mm512_castsi256_si512(gf16_affine_load_matrix(scratch, coefficients[8])); + matNormI = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); + matSwapI = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); + } + if(srcCount > 9) { + depmask = gf16_affine_load2_matrix(scratch, 
coefficients[8], coefficients[9]); + matNormI = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); + matSwapI = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); + matNormJ = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(2,2,2,2)); + matSwapJ = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(3,3,3,3)); + } + if(srcCount == 11) { + depmask = _mm512_castsi256_si512(gf16_affine_load_matrix(scratch, coefficients[10])); + matNormK = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); + matSwapK = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); + } + if(srcCount > 11) { + depmask = gf16_affine_load2_matrix(scratch, coefficients[10], coefficients[11]); + matNormK = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); + matSwapK = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); + matNormL = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(2,2,2,2)); + matSwapL = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(3,3,3,3)); + } + if(srcCount == 13) { + depmask = _mm512_castsi256_si512(gf16_affine_load_matrix(scratch, coefficients[12])); + matNormM = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); + matSwapM = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); + } +# else + #define LOAD_SRC(n, mat) \ + if(srcCount > n) { \ + depmask = gf16_affine_load_matrix(scratch, coefficients[n]); \ + matNorm##mat = _mm256_inserti128_si256(depmask, _mm256_castsi256_si128(depmask), 1); \ + matSwap##mat = _mm256_permute2x128_si256(depmask, depmask, 0x11); \ + } + LOAD_SRC(0, A) + LOAD_SRC(1, B) + LOAD_SRC(2, C) + LOAD_SRC(3, D) + LOAD_SRC(4, E) + LOAD_SRC(5, F) + LOAD_SRC(6, G) + LOAD_SRC(7, H) + LOAD_SRC(8, I) + LOAD_SRC(9, J) + LOAD_SRC(10, K) + LOAD_SRC(11, L) + LOAD_SRC(12, M) + #undef LOAD_SRC +# endif + + for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(_mword)) { + _mword data = _MMI(load)((_mword*)(_src1 + ptr*srcScale)); + _mword result = _MM(gf2p8affine_epi64_epi8)(data, 
matNormA, 0); + _mword swapped = _MM(gf2p8affine_epi64_epi8)(data, matSwapA, 0); + if(srcCount > 1) + data = _MMI(load)((_mword*)(_src2 + ptr*srcScale)); + if(srcCount >= 3) { + _mword data2 = _MMI(load)((_mword*)(_src3 + ptr*srcScale)); + result = _MM(ternarylogic_epi32)( + result, + _MM(gf2p8affine_epi64_epi8)(data, matNormB, 0), + _MM(gf2p8affine_epi64_epi8)(data2, matNormC, 0), + 0x96 + ); + swapped = _MM(ternarylogic_epi32)( + swapped, + _MM(gf2p8affine_epi64_epi8)(data, matSwapB, 0), + _MM(gf2p8affine_epi64_epi8)(data2, matSwapC, 0), + 0x96 + ); + } else if(srcCount == 2) { + result = _MMI(xor)( + result, + _MM(gf2p8affine_epi64_epi8)(data, matNormB, 0) + ); + swapped = _MMI(xor)( + swapped, + _MM(gf2p8affine_epi64_epi8)(data, matSwapB, 0) + ); + } + + _FN(gf16_affine2x_muladd_2round)(srcCount - 4, _src4 + ptr*srcScale, _src5 + ptr*srcScale, &result, &swapped, matNormD, matSwapD, matNormE, matSwapE); + _FN(gf16_affine2x_muladd_2round)(srcCount - 6, _src6 + ptr*srcScale, _src7 + ptr*srcScale, &result, &swapped, matNormF, matSwapF, matNormG, matSwapG); + _FN(gf16_affine2x_muladd_2round)(srcCount - 8, _src8 + ptr*srcScale, _src9 + ptr*srcScale, &result, &swapped, matNormH, matSwapH, matNormI, matSwapI); + _FN(gf16_affine2x_muladd_2round)(srcCount - 10, _src10 + ptr*srcScale, _src11 + ptr*srcScale, &result, &swapped, matNormJ, matSwapJ, matNormK, matSwapK); + _FN(gf16_affine2x_muladd_2round)(srcCount - 12, _src12 + ptr*srcScale, _src13 + ptr*srcScale, &result, &swapped, matNormL, matSwapL, matNormM, matSwapM); + + result = _MM(ternarylogic_epi32)( + result, + _MM(shuffle_epi32)(swapped, _MM_SHUFFLE(1,0,3,2)), + _MMI(load)((_mword*)(_dst + ptr)), + 0x96 + ); + _MMI(store) ((_mword*)(_dst + ptr), result); + + if(doPrefetch == 1) + _mm_prefetch(_pf+ptr, MM_HINT_WT1); + if(doPrefetch == 2) + _mm_prefetch(_pf+ptr, _MM_HINT_T1); + } +} +#endif /*defined(_AVAILABLE)*/ + + +void _FN(gf16_affine2x_muladd)(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, 
const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { + UNUSED(mutScratch); +#ifdef _AVAILABLE + gf16_muladd_single(scratch, &_FN(gf16_affine2x_muladd_x), dst, src, len, coefficient); + _mm256_zeroupper(); +#else + UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficient); +#endif +} + + +#ifdef _AVAILABLE +# ifdef PLATFORM_AMD64 +// TODO: may not want 12 regions for non-packed variant +GF16_MULADD_MULTI_FUNCS(gf16_affine2x, _FNSUFFIX, _FN(gf16_affine2x_muladd_x), 12, sizeof(_mword), 0, _mm256_zeroupper()) +# else +// if only 8 registers available, only allow 2 parallel regions +GF16_MULADD_MULTI_FUNCS(gf16_affine2x, _FNSUFFIX, _FN(gf16_affine2x_muladd_x), 2, sizeof(_mword), 0, _mm256_zeroupper()) +# endif +#else +GF16_MULADD_MULTI_FUNCS_STUB(gf16_affine2x, _FNSUFFIX) +#endif diff --git a/gf16/gf16_affine_avx2.c b/gf16/gf16_affine_avx2.c index 1d42e328..67879d09 100644 --- a/gf16/gf16_affine_avx2.c +++ b/gf16/gf16_affine_avx2.c @@ -7,6 +7,7 @@ #define _MM(f) _mm256_ ## f #define _MMI(f) _mm256_ ## f ## _si256 #define _FNSUFFIX _avx2 +#define _FNPREP(f) f##_avx2 #define _MM_END _mm256_zeroupper(); #if defined(__GFNI__) && defined(__AVX2__) @@ -24,6 +25,7 @@ int gf16_affine_available_avx2 = 0; #endif #undef _MM_END #undef _FNSUFFIX +#undef _FNPREP #undef _MMI #undef _MM #undef _mword diff --git a/gf16/gf16_affine_avx512.c b/gf16/gf16_affine_avx512.c index 67f11f36..279ea3d0 100644 --- a/gf16/gf16_affine_avx512.c +++ b/gf16/gf16_affine_avx512.c @@ -7,55 +7,12 @@ #define _MM(f) _mm512_ ## f #define _MMI(f) _mm512_ ## f ## _si512 #define _FNSUFFIX _avx512 +#define _FNPREP(f) f##_avx512 #define _MM_END _mm256_zeroupper(); #if defined(__GFNI__) && defined(__AVX512BW__) && defined(__AVX512VL__) -int gf16_affine_available_avx512 = 1; # define _AVAILABLE 1 -# include "gf16_shuffle_x86_prepare.h" -# include "gf16_checksum_x86.h" -#else -int gf16_affine_available_avx512 = 0; -#endif - -#include 
"gf16_affine2x_x86.h" -#ifdef _AVAILABLE -# undef _AVAILABLE -#endif -#undef _MM_END -#undef _FNSUFFIX -#undef _MMI -#undef _MM -#undef _mword -#undef MWORD_SIZE - -#include "gf16_muladd_multi.h" - -#if defined(__GFNI__) && defined(__AVX512BW__) && defined(__AVX512VL__) -# ifdef PLATFORM_AMD64 -GF_PREPARE_PACKED_FUNCS(gf16_affine, _avx512, sizeof(__m512i)*2, gf16_shuffle_prepare_block_avx512, gf16_shuffle_prepare_blocku_avx512, 6, _mm256_zeroupper(), __m512i checksum = _mm512_setzero_si512(), gf16_checksum_block_avx512, gf16_checksum_blocku_avx512, gf16_checksum_exp_avx512, gf16_checksum_prepare_avx512, sizeof(__m512i)) -# else -GF_PREPARE_PACKED_FUNCS(gf16_affine, _avx512, sizeof(__m512i)*2, gf16_shuffle_prepare_block_avx512, gf16_shuffle_prepare_blocku_avx512, 1, _mm256_zeroupper(), __m512i checksum = _mm512_setzero_si512(), gf16_checksum_block_avx512, gf16_checksum_blocku_avx512, gf16_checksum_exp_avx512, gf16_checksum_prepare_avx512, sizeof(__m512i)) -# endif -#else -GF_PREPARE_PACKED_FUNCS_STUB(gf16_affine, _avx512) -#endif - -#if defined(__GFNI__) && defined(__AVX512BW__) && defined(__AVX512VL__) -static HEDLEY_ALWAYS_INLINE __m256i gf16_affine_load_matrix(const void *HEDLEY_RESTRICT scratch, uint16_t coefficient) { - __m256i depmask = _mm256_xor_si256( - _mm256_load_si256((__m256i*)scratch + (coefficient & 0xf)*4), - _mm256_load_si256((__m256i*)((char*)scratch + ((coefficient << 3) & 0x780)) + 1) - ); - depmask = _mm256_ternarylogic_epi32( - depmask, - _mm256_load_si256((__m256i*)((char*)scratch + ((coefficient >> 1) & 0x780)) + 2), - _mm256_load_si256((__m256i*)((char*)scratch + ((coefficient >> 5) & 0x780)) + 3), - 0x96 - ); - return depmask; -} static HEDLEY_ALWAYS_INLINE __m512i gf16_affine_load2_matrix(const void *HEDLEY_RESTRICT scratch, uint16_t coeff1, uint16_t coeff2) { __m512i depmask = _mm512_xor_si512( _mm512_inserti64x4( @@ -87,6 +44,19 @@ static HEDLEY_ALWAYS_INLINE __m512i gf16_affine_load2_matrix(const void *HEDLEY_ } #endif +#include 
"gf16_affine_avx10.h" +#ifdef _AVAILABLE +# undef _AVAILABLE +#endif +#undef _MM_END +#undef _FNSUFFIX +#undef _FNPREP +#undef _MMI +#undef _MM +#undef _mword +#undef MWORD_SIZE + + void gf16_affine_mul_avx512(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { UNUSED(mutScratch); #if defined(__GFNI__) && defined(__AVX512BW__) && defined(__AVX512VL__) @@ -125,345 +95,6 @@ void gf16_affine_mul_avx512(const void *HEDLEY_RESTRICT scratch, void* dst, cons } -#if defined(__GFNI__) && defined(__AVX512BW__) && defined(__AVX512VL__) -static HEDLEY_ALWAYS_INLINE void gf16_affine_muladd_round(const __m512i* src, __m512i* tpl, __m512i* tph, __m512i mat_ll, __m512i mat_hl, __m512i mat_lh, __m512i mat_hh) { - __m512i ta = _mm512_load_si512(src); - __m512i tb = _mm512_load_si512(src + 1); - - *tpl = _mm512_ternarylogic_epi32( - _mm512_gf2p8affine_epi64_epi8(ta, mat_lh, 0), - _mm512_gf2p8affine_epi64_epi8(tb, mat_ll, 0), - *tpl, - 0x96 - ); - *tph = _mm512_ternarylogic_epi32( - _mm512_gf2p8affine_epi64_epi8(ta, mat_hh, 0), - _mm512_gf2p8affine_epi64_epi8(tb, mat_hl, 0), - *tph, - 0x96 - ); -} -static HEDLEY_ALWAYS_INLINE void gf16_affine_muladd_x_avx512( - const void *HEDLEY_RESTRICT scratch, - uint8_t *HEDLEY_RESTRICT _dst, const unsigned srcScale, - GF16_MULADD_MULTI_SRCLIST, - size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, const int doPrefetch, const char* _pf -) { - GF16_MULADD_MULTI_SRC_UNUSED(6); - - __m512i mat_All, mat_Alh, mat_Ahl, mat_Ahh; - __m512i mat_Bll, mat_Blh, mat_Bhl, mat_Bhh; - __m512i mat_Cll, mat_Clh, mat_Chl, mat_Chh; - __m512i mat_Dll, mat_Dlh, mat_Dhl, mat_Dhh; - __m512i mat_Ell, mat_Elh, mat_Ehl, mat_Ehh; - __m512i mat_Fll, mat_Flh, mat_Fhl, mat_Fhh; - - #define PERM1(dstVec, srcLL) \ - dstVec##hh = _mm512_permutex_epi64(depmask2, _MM_SHUFFLE(3,3,3,3)); \ - dstVec##lh = _mm512_permutex_epi64(depmask2, _MM_SHUFFLE(1,1,1,1)); \ - dstVec##ll = 
_mm512_broadcastq_epi64(srcLL); \ - dstVec##hl = _mm512_broadcastq_epi64(_mm512_castsi512_si128(depmask2)) - #define PERM2(dstVec) \ - depmask2 = _mm512_shuffle_i64x2(depmask1, depmask1, _MM_SHUFFLE(2,3,2,3)); \ - dstVec##hh = _mm512_permutex_epi64(depmask2, _MM_SHUFFLE(3,3,3,3)); \ - dstVec##lh = _mm512_permutex_epi64(depmask2, _MM_SHUFFLE(1,1,1,1)); \ - dstVec##ll = _mm512_permutex_epi64(depmask2, _MM_SHUFFLE(2,2,2,2)); \ - dstVec##hl = _mm512_broadcastq_epi64(_mm512_castsi512_si128(depmask2)) - - __m256i depmask256; - __m512i depmask1, depmask2; - if(srcCount == 1) { - depmask256 = gf16_affine_load_matrix(scratch, coefficients[0]); - depmask2 = _mm512_castsi256_si512(depmask256); - depmask2 = _mm512_shuffle_i64x2(depmask2, depmask2, _MM_SHUFFLE(0,1,0,1)); - PERM1(mat_A, _mm256_castsi256_si128(depmask256)); - } else if(srcCount > 1) { - depmask1 = gf16_affine_load2_matrix(scratch, coefficients[0], coefficients[1]); - depmask2 = _mm512_shuffle_i64x2(depmask1, depmask1, _MM_SHUFFLE(0,1,0,1)); - PERM1(mat_A, _mm512_castsi512_si128(depmask1)); - PERM2(mat_B); - } - if(srcCount == 3) { - depmask256 = gf16_affine_load_matrix(scratch, coefficients[2]); - depmask2 = _mm512_castsi256_si512(depmask256); - depmask2 = _mm512_shuffle_i64x2(depmask2, depmask2, _MM_SHUFFLE(0,1,0,1)); - PERM1(mat_C, _mm256_castsi256_si128(depmask256)); - } else if(srcCount > 3) { - depmask1 = gf16_affine_load2_matrix(scratch, coefficients[2], coefficients[3]); - depmask2 = _mm512_shuffle_i64x2(depmask1, depmask1, _MM_SHUFFLE(0,1,0,1)); - PERM1(mat_C, _mm512_castsi512_si128(depmask1)); - PERM2(mat_D); - } - if(srcCount == 5) { - depmask256 = gf16_affine_load_matrix(scratch, coefficients[4]); - depmask2 = _mm512_castsi256_si512(depmask256); - depmask2 = _mm512_shuffle_i64x2(depmask2, depmask2, _MM_SHUFFLE(0,1,0,1)); - PERM1(mat_E, _mm256_castsi256_si128(depmask256)); - } else if(srcCount > 5) { - depmask1 = gf16_affine_load2_matrix(scratch, coefficients[4], coefficients[5]); - depmask2 = 
_mm512_shuffle_i64x2(depmask1, depmask1, _MM_SHUFFLE(0,1,0,1)); - PERM1(mat_E, _mm512_castsi512_si128(depmask1)); - PERM2(mat_F); - } - #undef PERM1 - #undef PERM2 - - for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(__m512i)*2) { - __m512i tph = _mm512_load_si512((__m512i*)(_dst + ptr)); - __m512i tpl = _mm512_load_si512((__m512i*)(_dst + ptr) + 1); - gf16_affine_muladd_round((__m512i*)(_src1 + ptr*srcScale), &tpl, &tph, mat_All, mat_Ahl, mat_Alh, mat_Ahh); - if(srcCount >= 2) - gf16_affine_muladd_round((__m512i*)(_src2 + ptr*srcScale), &tpl, &tph, mat_Bll, mat_Bhl, mat_Blh, mat_Bhh); - if(srcCount >= 3) - gf16_affine_muladd_round((__m512i*)(_src3 + ptr*srcScale), &tpl, &tph, mat_Cll, mat_Chl, mat_Clh, mat_Chh); - if(srcCount >= 4) - gf16_affine_muladd_round((__m512i*)(_src4 + ptr*srcScale), &tpl, &tph, mat_Dll, mat_Dhl, mat_Dlh, mat_Dhh); - if(srcCount >= 5) - gf16_affine_muladd_round((__m512i*)(_src5 + ptr*srcScale), &tpl, &tph, mat_Ell, mat_Ehl, mat_Elh, mat_Ehh); - if(srcCount >= 6) - gf16_affine_muladd_round((__m512i*)(_src6 + ptr*srcScale), &tpl, &tph, mat_Fll, mat_Fhl, mat_Flh, mat_Fhh); - _mm512_store_si512((__m512i*)(_dst + ptr), tph); - _mm512_store_si512((__m512i*)(_dst + ptr)+1, tpl); - - if(doPrefetch == 1) - _mm_prefetch(_pf+(ptr>>1), MM_HINT_WT1); - if(doPrefetch == 2) - _mm_prefetch(_pf+(ptr>>1), _MM_HINT_T1); - } -} -#endif /*defined(__GFNI__) && defined(__AVX512BW__) && defined(__AVX512VL__)*/ - -void gf16_affine_muladd_avx512(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { - UNUSED(mutScratch); -#if defined(__GFNI__) && defined(__AVX512BW__) && defined(__AVX512VL__) - gf16_muladd_single(scratch, &gf16_affine_muladd_x_avx512, dst, src, len, coefficient); - _mm256_zeroupper(); -#else - UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficient); -#endif -} - -void gf16_affine_muladd_prefetch_avx512(const void 
*HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch, const void *HEDLEY_RESTRICT prefetch) { - UNUSED(mutScratch); -#if defined(__GFNI__) && defined(__AVX512BW__) && defined(__AVX512VL__) - gf16_muladd_prefetch_single(scratch, &gf16_affine_muladd_x_avx512, dst, src, len, coefficient, prefetch); - _mm256_zeroupper(); -#else - UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficient); UNUSED(prefetch); -#endif -} - -#if defined(__GFNI__) && defined(__AVX512BW__) && defined(__AVX512VL__) && defined(PLATFORM_AMD64) -GF16_MULADD_MULTI_FUNCS(gf16_affine, _avx512, gf16_affine_muladd_x_avx512, 6, sizeof(__m512i)*2, 1, _mm256_zeroupper()) -#else -GF16_MULADD_MULTI_FUNCS_STUB(gf16_affine, _avx512) -#endif - - -#if defined(__GFNI__) && defined(__AVX512BW__) && defined(__AVX512VL__) -# include "gf16_bitdep_init_avx2.h" -#endif -void* gf16_affine_init_avx512(int polynomial) { -#if defined(__GFNI__) && defined(__AVX512BW__) && defined(__AVX512VL__) - __m128i* ret; - ALIGN_ALLOC(ret, sizeof(__m256i)*16*4, 32); - gf16_bitdep_init256(ret, polynomial, 1); - return ret; -#else - UNUSED(polynomial); - return NULL; -#endif -} - - -#if defined(__GFNI__) && defined(__AVX512BW__) && defined(__AVX512VL__) -static HEDLEY_ALWAYS_INLINE void gf16_affine2x_muladd_2round(const int srcCountOffs, const void* _src1, const void* _src2, __m512i* result, __m512i* swapped, __m512i matNorm1, __m512i matSwap1, __m512i matNorm2, __m512i matSwap2) { - if(srcCountOffs < 0) return; - - __m512i data1 = _mm512_load_si512(_src1); - if(srcCountOffs == 0) { - *result = _mm512_xor_si512( - *result, - _mm512_gf2p8affine_epi64_epi8(data1, matNorm1, 0) - ); - *swapped = _mm512_xor_si512( - *swapped, - _mm512_gf2p8affine_epi64_epi8(data1, matSwap1, 0) - ); - } - else { // if(srcCountOffs > 0) - __m512i data2 = _mm512_load_si512(_src2); - *result = _mm512_ternarylogic_epi32( - *result, - 
_mm512_gf2p8affine_epi64_epi8(data1, matNorm1, 0), - _mm512_gf2p8affine_epi64_epi8(data2, matNorm2, 0), - 0x96 - ); - *swapped = _mm512_ternarylogic_epi32( - *swapped, - _mm512_gf2p8affine_epi64_epi8(data1, matSwap1, 0), - _mm512_gf2p8affine_epi64_epi8(data2, matSwap2, 0), - 0x96 - ); - } -} -static HEDLEY_ALWAYS_INLINE void gf16_affine2x_muladd_x_avx512( - const void *HEDLEY_RESTRICT scratch, uint8_t *HEDLEY_RESTRICT _dst, const unsigned srcScale, - GF16_MULADD_MULTI_SRCLIST, - size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, const int doPrefetch, const char* _pf -) { - GF16_MULADD_MULTI_SRC_UNUSED(13); - - __m512i depmask; - __m512i matNormA, matSwapA; - __m512i matNormB, matSwapB; - __m512i matNormC, matSwapC; - __m512i matNormD, matSwapD; - __m512i matNormE, matSwapE; - __m512i matNormF, matSwapF; - __m512i matNormG, matSwapG; - __m512i matNormH, matSwapH; - __m512i matNormI, matSwapI; - __m512i matNormJ, matSwapJ; - __m512i matNormK, matSwapK; - __m512i matNormL, matSwapL; - __m512i matNormM, matSwapM; - - // prevent MSVC whining - matNormB = matSwapB = matNormC = matSwapC = matNormD = matSwapD = matNormE = matSwapE = matNormF = matSwapF = matNormG = matSwapG = matNormH = matSwapH = matNormI = matSwapI = matNormJ = matSwapJ = matNormK = matSwapK = matNormL = matSwapL = matNormM = matSwapM = _mm512_undefined_epi32(); - - if(srcCount == 1) { - depmask = _mm512_castsi256_si512(gf16_affine_load_matrix(scratch, coefficients[0])); - matNormA = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); - matSwapA = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); - } - if(srcCount > 1) { - depmask = gf16_affine_load2_matrix(scratch, coefficients[0], coefficients[1]); - matNormA = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); - matSwapA = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); - matNormB = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(2,2,2,2)); - matSwapB = _mm512_shuffle_i64x2(depmask, depmask, 
_MM_SHUFFLE(3,3,3,3)); - } - if(srcCount == 3) { - depmask = _mm512_castsi256_si512(gf16_affine_load_matrix(scratch, coefficients[2])); - matNormC = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); - matSwapC = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); - } - if(srcCount > 3) { - depmask = gf16_affine_load2_matrix(scratch, coefficients[2], coefficients[3]); - matNormC = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); - matSwapC = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); - matNormD = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(2,2,2,2)); - matSwapD = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(3,3,3,3)); - } - if(srcCount == 5) { - depmask = _mm512_castsi256_si512(gf16_affine_load_matrix(scratch, coefficients[4])); - matNormE = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); - matSwapE = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); - } - if(srcCount > 5) { - depmask = gf16_affine_load2_matrix(scratch, coefficients[4], coefficients[5]); - matNormE = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); - matSwapE = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); - matNormF = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(2,2,2,2)); - matSwapF = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(3,3,3,3)); - } - if(srcCount == 7) { - depmask = _mm512_castsi256_si512(gf16_affine_load_matrix(scratch, coefficients[6])); - matNormG = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); - matSwapG = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); - } - if(srcCount > 7) { - depmask = gf16_affine_load2_matrix(scratch, coefficients[6], coefficients[7]); - matNormG = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); - matSwapG = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); - matNormH = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(2,2,2,2)); - matSwapH = _mm512_shuffle_i64x2(depmask, 
depmask, _MM_SHUFFLE(3,3,3,3)); - } - if(srcCount == 9) { - depmask = _mm512_castsi256_si512(gf16_affine_load_matrix(scratch, coefficients[8])); - matNormI = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); - matSwapI = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); - } - if(srcCount > 9) { - depmask = gf16_affine_load2_matrix(scratch, coefficients[8], coefficients[9]); - matNormI = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); - matSwapI = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); - matNormJ = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(2,2,2,2)); - matSwapJ = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(3,3,3,3)); - } - if(srcCount == 11) { - depmask = _mm512_castsi256_si512(gf16_affine_load_matrix(scratch, coefficients[10])); - matNormK = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); - matSwapK = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); - } - if(srcCount > 11) { - depmask = gf16_affine_load2_matrix(scratch, coefficients[10], coefficients[11]); - matNormK = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); - matSwapK = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); - matNormL = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(2,2,2,2)); - matSwapL = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(3,3,3,3)); - } - if(srcCount == 13) { - depmask = _mm512_castsi256_si512(gf16_affine_load_matrix(scratch, coefficients[12])); - matNormM = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); - matSwapM = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); - } - - - for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(__m512i)) { - __m512i data = _mm512_load_si512((__m512i*)(_src1 + ptr*srcScale)); - __m512i result = _mm512_gf2p8affine_epi64_epi8(data, matNormA, 0); - __m512i swapped = _mm512_gf2p8affine_epi64_epi8(data, matSwapA, 0); - if(srcCount > 1) - data = _mm512_load_si512((__m512i*)(_src2 + 
ptr*srcScale)); - if(srcCount >= 3) { - __m512i data2 = _mm512_load_si512((__m512i*)(_src3 + ptr*srcScale)); - result = _mm512_ternarylogic_epi32( - result, - _mm512_gf2p8affine_epi64_epi8(data, matNormB, 0), - _mm512_gf2p8affine_epi64_epi8(data2, matNormC, 0), - 0x96 - ); - swapped = _mm512_ternarylogic_epi32( - swapped, - _mm512_gf2p8affine_epi64_epi8(data, matSwapB, 0), - _mm512_gf2p8affine_epi64_epi8(data2, matSwapC, 0), - 0x96 - ); - } else if(srcCount == 2) { - result = _mm512_xor_si512( - result, - _mm512_gf2p8affine_epi64_epi8(data, matNormB, 0) - ); - swapped = _mm512_xor_si512( - swapped, - _mm512_gf2p8affine_epi64_epi8(data, matSwapB, 0) - ); - } - - gf16_affine2x_muladd_2round(srcCount - 4, _src4 + ptr*srcScale, _src5 + ptr*srcScale, &result, &swapped, matNormD, matSwapD, matNormE, matSwapE); - gf16_affine2x_muladd_2round(srcCount - 6, _src6 + ptr*srcScale, _src7 + ptr*srcScale, &result, &swapped, matNormF, matSwapF, matNormG, matSwapG); - gf16_affine2x_muladd_2round(srcCount - 8, _src8 + ptr*srcScale, _src9 + ptr*srcScale, &result, &swapped, matNormH, matSwapH, matNormI, matSwapI); - gf16_affine2x_muladd_2round(srcCount - 10, _src10 + ptr*srcScale, _src11 + ptr*srcScale, &result, &swapped, matNormJ, matSwapJ, matNormK, matSwapK); - gf16_affine2x_muladd_2round(srcCount - 12, _src12 + ptr*srcScale, _src13 + ptr*srcScale, &result, &swapped, matNormL, matSwapL, matNormM, matSwapM); - - result = _mm512_ternarylogic_epi32( - result, - _mm512_shuffle_epi32(swapped, _MM_SHUFFLE(1,0,3,2)), - _mm512_load_si512((__m512i*)(_dst + ptr)), - 0x96 - ); - _mm512_store_si512 ((__m512i*)(_dst + ptr), result); - - if(doPrefetch == 1) - _mm_prefetch(_pf+ptr, MM_HINT_WT1); - if(doPrefetch == 2) - _mm_prefetch(_pf+ptr, _MM_HINT_T1); - } -} -#endif /*defined(__GFNI__) && defined(__AVX512BW__) && defined(__AVX512VL__)*/ - void gf16_affine2x_mul_avx512(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT 
mutScratch) { UNUSED(mutScratch); @@ -487,26 +118,3 @@ void gf16_affine2x_mul_avx512(const void *HEDLEY_RESTRICT scratch, void* dst, co UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficient); #endif } - -void gf16_affine2x_muladd_avx512(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { - UNUSED(mutScratch); -#if defined(__GFNI__) && defined(__AVX512BW__) && defined(__AVX512VL__) - gf16_muladd_single(scratch, &gf16_affine2x_muladd_x_avx512, dst, src, len, coefficient); - _mm256_zeroupper(); -#else - UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficient); -#endif -} - - -#if defined(__GFNI__) && defined(__AVX512BW__) && defined(__AVX512VL__) -# ifdef PLATFORM_AMD64 -// TODO: may not want 12 regions for non-packed variant -GF16_MULADD_MULTI_FUNCS(gf16_affine2x, _avx512, gf16_affine2x_muladd_x_avx512, 12, sizeof(__m512i), 0, _mm256_zeroupper()) -# else -// if only 8 registers available, only allow 2 parallel regions -GF16_MULADD_MULTI_FUNCS(gf16_affine2x, _avx512, gf16_affine2x_muladd_x_avx512, 2, sizeof(__m512i), 0, _mm256_zeroupper()) -# endif -#else -GF16_MULADD_MULTI_FUNCS_STUB(gf16_affine2x, _avx512) -#endif diff --git a/gf16/gf16_affine_gfni.c b/gf16/gf16_affine_gfni.c index e7668cdb..e2539ea3 100644 --- a/gf16/gf16_affine_gfni.c +++ b/gf16/gf16_affine_gfni.c @@ -8,6 +8,7 @@ #define _MM(f) _mm_ ## f #define _MMI(f) _mm_ ## f ## _si128 #define _FNSUFFIX _gfni +#define _FNPREP(f) f##_gfni #define _MM_END #if defined(__GFNI__) && defined(__SSSE3__) @@ -25,6 +26,7 @@ int gf16_affine_available_gfni = 0; #endif #undef _MM_END #undef _FNSUFFIX +#undef _FNPREP #undef _MMI #undef _MM #undef _mword diff --git a/gf16/gf16_muladd_multi.h b/gf16/gf16_muladd_multi.h index cb938cb4..f445dbb3 100644 --- a/gf16/gf16_muladd_multi.h +++ b/gf16/gf16_muladd_multi.h @@ -24,41 +24,41 @@ if(max < 18) UNUSED(_src18) #define 
GF16_MULADD_MULTI_FUNCS(fnpre, fnsuf, xfn, procRegions, blocksize, pfFactor, finisher) \ -void fnpre ## _muladd_multi ## fnsuf(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch) { \ +void TOKENPASTE3(fnpre, _muladd_multi, fnsuf)(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch) { \ UNUSED(mutScratch); \ gf16_muladd_multi(scratch, &xfn, procRegions, regions, offset, dst, src, len, coefficients); \ finisher; \ } \ -void fnpre ## _muladd_multi_stridepf ## fnsuf(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t srcStride, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetch) { \ +void TOKENPASTE3(fnpre, _muladd_multi_stridepf, fnsuf)(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t srcStride, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetch) { \ UNUSED(mutScratch); \ gf16_muladd_multi_stridepf(scratch, &xfn, procRegions, regions, srcStride, dst, src, len, coefficients, pfFactor, prefetch); \ finisher; \ } \ -void fnpre ## _muladd_multi_packed ## fnsuf(const void *HEDLEY_RESTRICT scratch, unsigned packedRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch) { \ +void TOKENPASTE3(fnpre, _muladd_multi_packed, fnsuf)(const void *HEDLEY_RESTRICT scratch, unsigned packedRegions, unsigned regions, void 
*HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch) { \ UNUSED(mutScratch); \ gf16_muladd_multi_packed(scratch, &xfn, procRegions, procRegions, packedRegions, regions, dst, src, len, blocksize, coefficients); \ finisher; \ } \ -void fnpre ## _muladd_multi_packpf ## fnsuf(const void *HEDLEY_RESTRICT scratch, unsigned packedRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut) { \ +void TOKENPASTE3(fnpre, _muladd_multi_packpf, fnsuf)(const void *HEDLEY_RESTRICT scratch, unsigned packedRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut) { \ UNUSED(mutScratch); \ gf16_muladd_multi_packpf(scratch, &xfn, procRegions, procRegions, packedRegions, regions, dst, src, len, blocksize, coefficients, pfFactor, prefetchIn, prefetchOut); \ finisher; \ } #define GF16_MULADD_MULTI_FUNCS_STUB(fnpre, fnsuf) \ -void fnpre ## _muladd_multi ## fnsuf(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch) { \ +void TOKENPASTE3(fnpre, _muladd_multi, fnsuf)(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch) { \ UNUSED(mutScratch); \ UNUSED(scratch); UNUSED(regions); UNUSED(offset); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficients); \ } \ 
-void fnpre ## _muladd_multi_stridepf ## fnsuf(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t srcStride, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetch) { \ +void TOKENPASTE3(fnpre, _muladd_multi_stridepf, fnsuf)(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t srcStride, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetch) { \ UNUSED(mutScratch); \ UNUSED(scratch); UNUSED(regions); UNUSED(srcStride); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficients); UNUSED(prefetch); \ } \ -void fnpre ## _muladd_multi_packed ## fnsuf(const void *HEDLEY_RESTRICT scratch, unsigned packedRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch) { \ +void TOKENPASTE3(fnpre, _muladd_multi_packed, fnsuf)(const void *HEDLEY_RESTRICT scratch, unsigned packedRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch) { \ UNUSED(mutScratch); \ UNUSED(scratch); UNUSED(packedRegions); UNUSED(regions); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficients); \ } \ -void fnpre ## _muladd_multi_packpf ## fnsuf(const void *HEDLEY_RESTRICT scratch, unsigned packedRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut) { \ +void TOKENPASTE3(fnpre, _muladd_multi_packpf, fnsuf)(const void *HEDLEY_RESTRICT scratch, unsigned packedRegions, unsigned 
regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut) { \ UNUSED(mutScratch); \ UNUSED(scratch); UNUSED(packedRegions); UNUSED(regions); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficients); UNUSED(prefetchIn); UNUSED(prefetchOut); \ } diff --git a/gf16/gf16mul.cpp b/gf16/gf16mul.cpp index ac7765f3..0d48bf73 100644 --- a/gf16/gf16mul.cpp +++ b/gf16/gf16mul.cpp @@ -847,7 +847,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { break; case GF16_AFFINE_AVX512: - scratch = gf16_affine_init_avx512(GF16_POLYNOMIAL); + scratch = gf16_affine_init_avx2(GF16_POLYNOMIAL); METHOD_REQUIRES(gf16_affine_available_avx512 && gf16_shuffle_available_avx512) _mul = &gf16_affine_mul_avx512; _mul_add = &gf16_affine_muladd_avx512; @@ -940,7 +940,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { break; case GF16_AFFINE2X_AVX512: - scratch = gf16_affine_init_avx512(GF16_POLYNOMIAL); + scratch = gf16_affine_init_avx2(GF16_POLYNOMIAL); METHOD_REQUIRES(gf16_affine_available_avx512 && gf16_shuffle_available_avx512) _mul = &gf16_affine2x_mul_avx512; _mul_add = &gf16_affine2x_muladd_avx512; From b5858b29c46971f938ba27d86cb35c17046345bb Mon Sep 17 00:00:00 2001 From: animetosho Date: Mon, 31 Jul 2023 23:22:01 +1000 Subject: [PATCH 45/91] Remove references to deprecated String.prototype.substr + update arg_parser.js --- bin/parpar.js | 20 ++++++++++---------- lib/arg_parser.js | 42 ++++++++++++++++++++++++++++-------------- lib/par2gen.js | 2 +- test/par-compare.js | 2 +- 4 files changed, 40 insertions(+), 26 deletions(-) diff --git a/bin/parpar.js b/bin/parpar.js index f03af0b6..b123948a 100755 --- a/bin/parpar.js +++ b/bin/parpar.js @@ -393,7 +393,7 @@ if(argv.json) print_json('progress', data); }; else if(argv.progress == 'stdout' || argv.progress == 'stderr') { - var decimalPoint 
= (1.1).toLocaleString().substr(1, 1); + var decimalPoint = (1.1).toLocaleString().substring(1, 2); // TODO: display slices processed, pass# if verbose progress requested writeProgress = function(data) { // add formatting for aesthetics @@ -457,7 +457,7 @@ var inputFiles = argv._; stdInUsed = true; stream = process.stdin; } else { - stream = fs.createReadStream(null, {fd: fl[0].substr(5)|0}); + stream = fs.createReadStream(null, {fd: fl[0].substring(5)|0}); } // read from stream var data = ''; @@ -469,7 +469,7 @@ var inputFiles = argv._; }); stream.once('error', cb); } else if(/^proc:\/\//i.test(fl[0])) { - require('child_process').exec(fl[0].substr(7), {maxBuffer: 1048576*32, encoding: inlistEnc}, function(err, stdout, stderr) { + require('child_process').exec(fl[0].substring(7), {maxBuffer: 1048576*32, encoding: inlistEnc}, function(err, stdout, stderr) { cb(err, [fl[1], stdout]); }); } else { @@ -502,7 +502,7 @@ var inputFiles = argv._; creator: creator }; if(argv.out.match(/\.par2$/i)) - ppo.outputBase = argv.out.substr(0, argv.out.length-5); + ppo.outputBase = argv.out.substring(0, argv.out.length-5); for(var k in opts) { if(opts[k].map && (k in argv)) @@ -521,7 +521,7 @@ var inputFiles = argv._; var parseSizeOrNum = function(arg, input, multiple) { var m; - var isRec = (arg.substr(-15) == 'recovery-slices' || arg == 'slices-per-file' || arg == 'slices-first-file' || arg == 'packet-redundancy'); + var isRec = (arg.slice(-15) == 'recovery-slices' || arg == 'slices-per-file' || arg == 'slices-first-file' || arg == 'packet-redundancy'); input = input || argv[arg]; if(typeof input == 'number' || /^-?\d+$/.test(input)) { input = input|0; @@ -540,7 +540,7 @@ var inputFiles = argv._; error('Invalid value specified for `'+arg+'`'); var scale = 1; if(m[2].length > 2) { - scale = +(m[2].substr(2)); + scale = +(m[2].substring(2)); if(isNaN(scale) || !isFinite(scale)) error('Invalid value specified for `'+arg+'`'); if(m[2][1] == '/') { scale = 1/scale; @@ -588,7 +588,7 @@ 
var inputFiles = argv._; if(/^slices-/.test(k[0]) && (val[0] == '<' || val[0] == '>')) { // TODO: also do this for packet-redundancy? ppo[k[1]+'Rounding'] = (val[0] == '<' ? 'floor' : 'ceil'); - val = val.substr(1); + val = val.substring(1); } var expr = val.replace(/^[\-+]/, function(x) { if(x == '-') return '0-'; // hack to get initial negative term to work @@ -626,7 +626,7 @@ var inputFiles = argv._; var ret = {}; if(data.process) { ret.ratio = parseFloat(data.process); - if(data.process.substr(-1) == '%') + if(data.process.slice(-1) == '%') ret.ratio /= 100; } if(data.device) { @@ -675,8 +675,8 @@ var inputFiles = argv._; }; var openclOpts = {}; for(var k in argv) - if(k.substr(0, 7) == 'opencl-') - openclOpts[k.substr(7)] = argv[k]; + if(k.substring(0, 7) == 'opencl-') + openclOpts[k.substring(7)] = argv[k]; openclOpts = openclMap(openclOpts); if(argv.opencl) { ppo.openclDevices = argv.opencl.map(function(spec) { diff --git a/lib/arg_parser.js b/lib/arg_parser.js index 3502e682..0234802b 100644 --- a/lib/arg_parser.js +++ b/lib/arg_parser.js @@ -52,11 +52,12 @@ module.exports = function(argv, opts) { aliasMap[opts[k].alias] = k; } + var applyFn = {}; var setKey = function(key, val, explicit) { var o = opts[key]; if(o === undefined) throw new Error('Unknown option `' + key + '`'); - var isMultiple = (['list','array','map'].indexOf(o.type) !== -1); + var isMultiple = (['list','array','map','map2'].indexOf(o.type) !== -1); if((key in ret) && !isMultiple) throw new Error('Option `' + key + '` specified more than once'); @@ -104,7 +105,7 @@ module.exports = function(argv, opts) { } if(!(key in ret)) - ret[key] = (o.type == 'map') ? {} : []; + ret[key] = (o.type == 'map' || o.type == 'map2') ? 
{} : []; else if(!ret[key]) { // option set to a special scalar value if(ret[key] === null) throw new Error('No value specified for `' + key + '`'); @@ -122,13 +123,17 @@ module.exports = function(argv, opts) { ret[key].push(val); break; case 'map': + case 'map2': var m; if(m = val.match(/^(.+?)[=:](.*)$/)) ret[key][m[1].trim()] = m[2].trim(); + else if(o.type == 'map2') + ret[key][val.trim()] = undefined; else throw new Error('Invalid format for `' + key + '`'); break; } + if(o.fn) applyFn[key] = 1; } else { if(val === undefined || (val === '' && !explicit)) { if(o.ifSetDefault !== undefined) @@ -169,8 +174,8 @@ module.exports = function(argv, opts) { default: // string ret[key] = val; } + if(o.fn) ret[key] = o.fn(ret[key]); } - if(o.fn) ret[key] = o.fn(ret[key]); }; for(var i=0; i parse to object ret[k] = {}; v.forEach(function(s) { + if(typeof s !== 'string') + throw new Error('Invalid format for `' + k + '`'); var m; - if(typeof s === 'string' && (m = s.match(/^(.+?)[=:](.*)$/))) + if(m = s.match(/^(.+?)[=:](.*)$/)) ret[k][m[1].trim()] = m[2].trim(); + else if(opt.type == 'map2') + ret[k][s.trim()] = undefined; else throw new Error('Invalid format for `' + k + '`'); }); diff --git a/lib/par2gen.js b/lib/par2gen.js index 3545550d..b5bcf8df 100644 --- a/lib/par2gen.js +++ b/lib/par2gen.js @@ -777,7 +777,7 @@ function PAR2Gen(fileInfo, sliceSize, opts) { var stripLen = common_root.join(path.sep).length + 1; fileInfo.forEach(function(file) { if(!('displayName' in file) && ('name' in file)) - file.displayName = pathToPar2(file._fullPath.substr(stripLen)); + file.displayName = pathToPar2(file._fullPath.substring(stripLen)); delete file._fullPath; }); } diff --git a/test/par-compare.js b/test/par-compare.js index 3cd91049..87bf7854 100644 --- a/test/par-compare.js +++ b/test/par-compare.js @@ -183,7 +183,7 @@ function compare_files(file1, file2) { for(var k in file1) { // ignore Creator packet + unicode filename // TODO: consider comparing unicode filename packets - 
if(k == 'creator' || k.substr(0, 5) == 'unifn') continue; + if(k == 'creator' || k.substring(0, 5) == 'unifn') continue; if(!packet_eq(file1[k], file2[k])) { //console.log('Packet mismatch for ' + k, file1[k], file2[k]); From 40d25684bdcac663db3000543d1be4050d27be11 Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 1 Aug 2023 14:07:56 +1000 Subject: [PATCH 46/91] Address deprecated NodeJS functions --- benchmarks/bench.js | 5 +++-- bin/parpar.js | 7 ++++--- lib/arg_parser.js | 4 ++-- lib/filechunkreader.js | 3 ++- lib/par2.js | 21 +++++++++++---------- lib/par2gen.js | 9 +++++---- test/par-compare.js | 17 +++++++++-------- 7 files changed, 36 insertions(+), 30 deletions(-) diff --git a/benchmarks/bench.js b/benchmarks/bench.js index 074c3ceb..23a1b8ba 100644 --- a/benchmarks/bench.js +++ b/benchmarks/bench.js @@ -211,6 +211,7 @@ for(var i in benchmarks) { delete benchmarks[i]; } +var bufferSlice = Buffer.prototype.subarray || Buffer.prototype.slice; var fsWriteSync = function(fd, data) { fs.writeSync(fd, data, 0, data.length, null); }; @@ -259,13 +260,13 @@ async.eachSeries(Object.keys(benchmarks), function getVersion(prog, cb) { if(fs.statSync(tmpDir + name).size == size) return; } var fd = fs.openSync(tmpDir + name, 'w'); - var rand = require('crypto').createCipher('rc4', 'my_incredibly_strong_password' + name); + var rand = require('crypto').createCipheriv('rc4', 'my_incredibly_strong_password' + name, ''); rand.setAutoPadding(false); var nullBuf = new Buffer(1024*16); nullBuf.fill(0); var written = 0; while(written < size) { - var b = rand.update(nullBuf).slice(0, size-written); + var b = bufferSlice.call(rand.update(nullBuf), 0, size-written); fsWriteSync(fd, b); written += b.length; } diff --git a/bin/parpar.js b/bin/parpar.js index b123948a..de4a57c0 100755 --- a/bin/parpar.js +++ b/bin/parpar.js @@ -309,12 +309,13 @@ var fs = require('fs'); /*{{!include_in_executable! 
if(!argv['skip-self-check']) { // if this is a compiled EXE, do a self MD5 check to detect corruption + var bufferSlice = Buffer.prototype.subarray || Buffer.prototype.slice; var executable = fs.readFileSync(process.execPath); - var md5loc = executable.slice(-1024, -16).indexOf('\0='); + var md5loc = bufferSlice.call(executable, -1024, -16).indexOf('\0='); if(md5loc < 0) error('Could not find self-check hash - this executable may be truncated or corrupt. If you are certain this is not a problem, you may use the `--skip-self-check` flag to bypass this check.'); - var expectedMd5 = executable.slice(-1024 + md5loc + 16, (-1024 + md5loc + 32) || undefined).toString('hex'); - var actualMd5 = require('crypto').createHash('md5').update(executable.slice(0, -1024 + md5loc)).digest('hex'); + var expectedMd5 = bufferSlice.call(executable, -1024 + md5loc + 16, (-1024 + md5loc + 32) || undefined).toString('hex'); + var actualMd5 = require('crypto').createHash('md5').update(bufferSlice.call(executable, 0, -1024 + md5loc)).digest('hex'); if(expectedMd5 != actualMd5) error('Self-check failed - this executable may be corrupt. 
If you are certain this is not a problem, you may use the `--skip-self-check` flag to bypass this check.'); } diff --git a/lib/arg_parser.js b/lib/arg_parser.js index 0234802b..a4c77b71 100644 --- a/lib/arg_parser.js +++ b/lib/arg_parser.js @@ -3,7 +3,7 @@ var RE_DIGITS = /^\d+$/; var parseSize = function(s) { - if(typeof s == 'number' || RE_DIGITS.test(s)) return Math.max(0, Math.floor(s)); + if(typeof s == 'number' || (''+s).search(RE_DIGITS) >= 0) return Math.max(0, Math.floor(s)); var parts = (''+s).toUpperCase().match(/^([0-9.]+)([BKMGTPE])$/); if(parts) { var num = +(parts[1]); @@ -22,7 +22,7 @@ var parseSize = function(s) { return false; }; var parseTime = function(s) { - if(typeof s == 'number' || RE_DIGITS.test(s)) return Math.max(0, Math.floor(s*1000)); + if(typeof s == 'number' || (''+s).search(RE_DIGITS) >= 0) return Math.max(0, Math.floor(s*1000)); var parts = (''+s).toLowerCase().match(/^([0-9.]+)(m?s|[mhdw])$/); if(parts) { var num = +(parts[1]); diff --git a/lib/filechunkreader.js b/lib/filechunkreader.js index 77d283d9..788bcff9 100644 --- a/lib/filechunkreader.js +++ b/lib/filechunkreader.js @@ -3,6 +3,7 @@ var fs = require('fs'); var async = require('async'); var ProcQueue = require('./procqueue'); +var bufferSlice = Buffer.prototype.subarray || Buffer.prototype.slice; function FileChunkReader(files, sliceSize, chunkSize, chunkOffset, bufPool, concurrency, cbChunk, cb) { var readQ = new ProcQueue(concurrency); @@ -31,7 +32,7 @@ function FileChunkReader(files, sliceSize, chunkSize, chunkOffset, bufPool, conc if(readErr) return cb(readErr); fs.read(fd, buffer, 0, chunkSize, filePos, function(err, bytesRead) { if(err) readErr = err; - else cbChunk(file, buffer.slice(0, bytesRead), sliceNum, bufPool.put.bind(bufPool, buffer)); + else cbChunk(file, bufferSlice.call(buffer, 0, bytesRead), sliceNum, bufPool.put.bind(bufPool, buffer)); if(--chunksLeft == 0) { // all chunks read from this file, so close it diff --git a/lib/par2.js b/lib/par2.js index 
f9eca434..d9e98119 100644 --- a/lib/par2.js +++ b/lib/par2.js @@ -6,6 +6,7 @@ var async = require('async'); var allocBuffer = (Buffer.allocUnsafe || Buffer); var toBuffer = (Buffer.alloc ? Buffer.from : Buffer); +var bufferSlice = Buffer.prototype.subarray || Buffer.prototype.slice; var SAFE_INT = 0xffffffff; // JS only does 32-bit bit operations @@ -142,7 +143,7 @@ var GFWrapper = { self.finish(item.data, item.cb); break; } - else if(self.gf.add(item.num, item.data.slice(0, item.len), function() { + else if(self.gf.add(item.num, bufferSlice.call(item.data, 0, item.len), function() { //this.cb(this.num, this.data); this.cb(); }.bind(item))) { @@ -152,7 +153,7 @@ var GFWrapper = { }); } - if(this.gf.add(sliceNum, dataSlice.slice(0, len), function() { + if(this.gf.add(sliceNum, bufferSlice.call(dataSlice, 0, len), function() { //cb(sliceNum, dataSlice); cb(); })) @@ -310,7 +311,7 @@ var GFWrapper = { for(var i=0; i= self.recData.length) baseBufIdx -= self.recData.length; - bufs[i] = self.recData[baseBufIdx + i].slice(0, self.chunkSize); + bufs[i] = bufferSlice.call(self.recData[baseBufIdx + i], 0, self.chunkSize); } self.recDataHashers[hasherIdx].update(bufs, function() { @@ -344,12 +345,12 @@ var GFWrapper = { setImmediate(cb.bind( null, this.recDataPtr, - new PAR2OutputData(this.recDataPtr, this.recData[this.recDataPtr % this.recData.length].slice(0, this.chunkSize), this) + new PAR2OutputData(this.recDataPtr, bufferSlice.call(this.recData[this.recDataPtr % this.recData.length], 0, this.chunkSize), this) )); } else { var self = this; this.recDataFetchCb[this.recDataPtr] = function(idx, buffer) { - cb(idx, new PAR2OutputData(idx, buffer.slice(0, self.chunkSize), self)); + cb(idx, new PAR2OutputData(idx, bufferSlice.call(buffer, 0, self.chunkSize), self)); }; } this.recDataPtr++; @@ -382,7 +383,7 @@ var GFWrapper = { // return requested MD5 var offset = 16*(idx % self._gfOpts.hashBatchSize); - cb(self.recDataMD5.slice(offset, offset+16)); + 
cb(bufferSlice.call(self.recDataMD5, offset, offset+16)); }); }, _isRecoveryProcessed: function() { @@ -440,7 +441,7 @@ function PAR2(files, sliceSize, opts) { offs += 16; }); - this.setID = crypto.createHash('md5').update(this.pktMain.slice(64)).digest(); + this.setID = crypto.createHash('md5').update(bufferSlice.call(this.pktMain, 64)).digest(); // lastly, header this._writePktHeader(this.pktMain, 'PAR 2.0\0Main\0\0\0\0'); @@ -488,7 +489,7 @@ PAR2.prototype = { // put in packet hash if(!skipMD5) { crypto.createHash('md5') - .update(buf.slice(offset+32, offset+pktLen)) + .update(bufferSlice.call(buf, offset+32, offset+pktLen)) .digest() .copy(buf, offset+16); } @@ -521,7 +522,7 @@ PAR2.prototype = { makeRecoveryHeader: function(chunks, num) { if(!Array.isArray(chunks)) chunks = [chunks]; - var md5 = crypto.createHash('md5').update(pkt.slice(32)); + var md5 = crypto.createHash('md5').update(bufferSlice.call(pkt, 32)); var len = this.sliceSize; chunks.forEach(function(chunk) { @@ -679,7 +680,7 @@ function PAR2File(par2, file) { if(file.size == 0) { this.md5 = toBuffer('d41d8cd98f00b204e9800998ecf8427e', 'hex'); } else { - this._md5ctx = new binding.HasherInput(par2.sliceSize, this.pktCheck.slice(64 + 16)); + this._md5ctx = new binding.HasherInput(par2.sliceSize, bufferSlice.call(this.pktCheck, 64 + 16)); } } diff --git a/lib/par2gen.js b/lib/par2gen.js index b5bcf8df..38e631c6 100644 --- a/lib/par2gen.js +++ b/lib/par2gen.js @@ -9,6 +9,7 @@ var FileSeqReader = require('./fileseqreader'); var FileChunkReader = require('./filechunkreader'); var BufferPool = require('./bufferpool'); var PAR2OutFile = require('./par2outfile'); +var bufferSlice = Buffer.prototype.subarray || Buffer.prototype.slice; var MAX_BUFFER_SIZE = (require('buffer').kMaxLength || (1024*1024*1024-1)) - 1024-68; // the '-1024-68' is padding to deal with alignment issues (XorJit512 can have 1KB block) + 68-byte header @@ -1117,7 +1118,7 @@ PAR2Gen.prototype = { if(this._chunker) { async.parallel([ 
file.processHash.bind(file, buf), - this._chunker.processData.bind(this._chunker, file, buf.slice(0, this._chunkSize)) + this._chunker.processData.bind(this._chunker, file, bufferSlice.call(buf, 0, this._chunkSize)) ], cb); } else { file.process(buf, cb); @@ -1288,7 +1289,7 @@ PAR2Gen.prototype = { if(cbProgress) cbProgress('processing_slice', data.file, slicePos); self._chunker.processData( data.file.sliceOffset + slicePos, - data.buffer.slice(chunk, Math.min(chunk+chunkSize, data.file.size)), + bufferSlice.call(data.buffer, chunk, Math.min(chunk+chunkSize, data.file.size)), cb ); slicePos++; @@ -1306,7 +1307,7 @@ PAR2Gen.prototype = { async.times(numSlices, function(sliceOffNum, cb) { if(cbProgress) cbProgress('processing_slice', data.file, slicePos + sliceOffNum); var bp = sliceOffNum * self.opts.sliceSize; - data.file.processData(data.buffer.slice(bp, Math.min(data.buffer.length, bp+self.opts.sliceSize)), cb); + data.file.processData(bufferSlice.call(data.buffer, bp, Math.min(data.buffer.length, bp+self.opts.sliceSize)), cb); }, data.release.bind(data)); } }, cb); @@ -1516,7 +1517,7 @@ module.exports = { fs.read(fd, buf, 0, 16384, null, cb); }, function(bytesRead, buffer, cb) { - info.md5_16k = crypto.createHash('md5').update(buffer.slice(0, bytesRead)).digest(); + info.md5_16k = crypto.createHash('md5').update(bufferSlice.call(buffer, 0, bytesRead)).digest(); if(info.size < 16384) info.md5 = info.md5_16k; fs.close(fd, cb); } diff --git a/test/par-compare.js b/test/par-compare.js index 87bf7854..ad0963be 100644 --- a/test/par-compare.js +++ b/test/par-compare.js @@ -26,6 +26,7 @@ var fsRead = function(fd, len) { return buf; }; +var bufferSlice = Buffer.prototype.subarray || Buffer.prototype.slice; var BufferCompare; if(Buffer.compare) BufferCompare = Buffer.compare; else BufferCompare = function(a, b) { @@ -57,29 +58,29 @@ function parse_file(file) { while(pos != stat.size) { // != ensures that size should exactly match expected var header = fsRead(fd, 64); - 
if(header.slice(0, 8).toString() != 'PAR2\0PKT') + if(bufferSlice.call(header, 0, 8).toString() != 'PAR2\0PKT') throw new Error('Invalid packet signature @' + pos); var pkt = { len: header.readUInt32LE(8) + header.readUInt32LE(12) * 4294967296, offset: pos, - md5: header.slice(16, 32), - type: header.slice(48, 64).toString().replace(/\0+$/, '') + md5: bufferSlice.call(header, 16, 32), + type: bufferSlice.call(header, 48, 64).toString().replace(/\0+$/, '') }; try { if(pkt.len % 4 || pkt.len < 64) throw new Error('Invalid packet length specified'); if(ret.rsId) { - if(BufferCompare(ret.rsId, header.slice(32, 48))) + if(BufferCompare(ret.rsId, bufferSlice.call(header, 32, 48))) throw new Error('Mismatching recovery set ID'); } else { ret.rsId = new Buffer(16); - header.slice(32, 48).copy(ret.rsId); + bufferSlice.call(header, 32, 48).copy(ret.rsId); } var md5 = crypto.createHash('md5'); - md5.update(header.slice(32)); + md5.update(bufferSlice.call(header, 32)); var pktPos = 64; var idLen = 0; @@ -294,13 +295,13 @@ console.log('Creating random input files...'); function writeRndFile(name, size) { if(skipFileCreate && fs.existsSync(tmpDir + name)) return; var fd = fs.openSync(tmpDir + name, 'w'); - var rand = require('crypto').createCipher('rc4', 'my_incredibly_strong_password' + name); + var rand = require('crypto').createCipheriv('rc4', 'my_incredibly_strong_password' + name, ''); rand.setAutoPadding(false); var nullBuf = new Buffer(1024*16); nullBuf.fill(0); var written = 0; while(written < size) { - var b = rand.update(nullBuf).slice(0, Math.min(1024*16, size-written)); + var b = bufferSlice.call(rand.update(nullBuf), 0, Math.min(1024*16, size-written)); fsWriteSync(fd, b); written += b.length; } From 26f1a914d9e7466030a3272b9a0138045f6f558e Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 1 Aug 2023 16:17:08 +1000 Subject: [PATCH 47/91] Handle Buffer constructor deprecation in extra scripts --- benchmarks/bench.js | 5 +++-- test/par-compare.js | 11 ++++++----- 2 
files changed, 9 insertions(+), 7 deletions(-) diff --git a/benchmarks/bench.js b/benchmarks/bench.js index 23a1b8ba..ba6e7d29 100644 --- a/benchmarks/bench.js +++ b/benchmarks/bench.js @@ -223,9 +223,10 @@ var findFile = function(dir, re) { return ret; }; +var allocBuffer = (Buffer.allocUnsafe || Buffer); var async = require('async'); var fs = require('fs'); -var nullBuf = new Buffer(1024*16); +var nullBuf = allocBuffer(1024*16); nullBuf.fill(0); var results = {}; var testFiles = []; @@ -262,7 +263,7 @@ async.eachSeries(Object.keys(benchmarks), function getVersion(prog, cb) { var fd = fs.openSync(tmpDir + name, 'w'); var rand = require('crypto').createCipheriv('rc4', 'my_incredibly_strong_password' + name, ''); rand.setAutoPadding(false); - var nullBuf = new Buffer(1024*16); + var nullBuf = allocBuffer(1024*16); nullBuf.fill(0); var written = 0; while(written < size) { diff --git a/test/par-compare.js b/test/par-compare.js index ad0963be..a8eff33d 100644 --- a/test/par-compare.js +++ b/test/par-compare.js @@ -18,15 +18,16 @@ var skipFileCreate = true; // skip creating test files if they already exist (sp var fs = require('fs'); var crypto = require('crypto'); +var bufferSlice = Buffer.prototype.subarray || Buffer.prototype.slice; +var allocBuffer = (Buffer.allocUnsafe || Buffer); var fsRead = function(fd, len) { - var buf = new Buffer(len); + var buf = allocBuffer(len); var readLen = fs.readSync(fd, buf, 0, len, null); if(readLen != len) throw new Error("Couldn't read requested data: got " + readLen + " bytes instead of " + len); return buf; }; -var bufferSlice = Buffer.prototype.subarray || Buffer.prototype.slice; var BufferCompare; if(Buffer.compare) BufferCompare = Buffer.compare; else BufferCompare = function(a, b) { @@ -75,7 +76,7 @@ function parse_file(file) { if(BufferCompare(ret.rsId, bufferSlice.call(header, 32, 48))) throw new Error('Mismatching recovery set ID'); } else { - ret.rsId = new Buffer(16); + ret.rsId = allocBuffer(16); 
bufferSlice.call(header, 32, 48).copy(ret.rsId); } @@ -297,7 +298,7 @@ function writeRndFile(name, size) { var fd = fs.openSync(tmpDir + name, 'w'); var rand = require('crypto').createCipheriv('rc4', 'my_incredibly_strong_password' + name, ''); rand.setAutoPadding(false); - var nullBuf = new Buffer(1024*16); + var nullBuf = allocBuffer(1024*16); nullBuf.fill(0); var written = 0; while(written < size) { @@ -588,7 +589,7 @@ async.timesSeries(allTests.length, function(testNum, cb) { for(var k in f) { ret[k] = { type: f[k].type, - md5: new Buffer(f[k].md5, 'hex'), + md5: (Buffer.alloc ? Buffer.from : Buffer)(f[k].md5, 'hex'), len: f[k].len }; } From bbd2728f863b4b55d780e8f644acb99a6aa23517 Mon Sep 17 00:00:00 2001 From: animetosho Date: Thu, 3 Aug 2023 11:08:02 +1000 Subject: [PATCH 48/91] Buffer.subarray is not identical to Buffer.slice on Node 4.x --- benchmarks/bench.js | 2 +- bin/parpar.js | 2 +- lib/filechunkreader.js | 2 +- lib/par2.js | 2 +- lib/par2gen.js | 2 +- test/par-compare.js | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmarks/bench.js b/benchmarks/bench.js index ba6e7d29..759bf57d 100644 --- a/benchmarks/bench.js +++ b/benchmarks/bench.js @@ -211,7 +211,7 @@ for(var i in benchmarks) { delete benchmarks[i]; } -var bufferSlice = Buffer.prototype.subarray || Buffer.prototype.slice; +var bufferSlice = Buffer.prototype.readBigInt64BE ? Buffer.prototype.subarray : Buffer.prototype.slice; var fsWriteSync = function(fd, data) { fs.writeSync(fd, data, 0, data.length, null); }; diff --git a/bin/parpar.js b/bin/parpar.js index de4a57c0..a2d3c18f 100755 --- a/bin/parpar.js +++ b/bin/parpar.js @@ -309,7 +309,7 @@ var fs = require('fs'); /*{{!include_in_executable! if(!argv['skip-self-check']) { // if this is a compiled EXE, do a self MD5 check to detect corruption - var bufferSlice = Buffer.prototype.subarray || Buffer.prototype.slice; + var bufferSlice = Buffer.prototype.readBigInt64BE ? 
Buffer.prototype.subarray : Buffer.prototype.slice; var executable = fs.readFileSync(process.execPath); var md5loc = bufferSlice.call(executable, -1024, -16).indexOf('\0='); if(md5loc < 0) diff --git a/lib/filechunkreader.js b/lib/filechunkreader.js index 788bcff9..716ba811 100644 --- a/lib/filechunkreader.js +++ b/lib/filechunkreader.js @@ -3,7 +3,7 @@ var fs = require('fs'); var async = require('async'); var ProcQueue = require('./procqueue'); -var bufferSlice = Buffer.prototype.subarray || Buffer.prototype.slice; +var bufferSlice = Buffer.prototype.readBigInt64BE ? Buffer.prototype.subarray : Buffer.prototype.slice; function FileChunkReader(files, sliceSize, chunkSize, chunkOffset, bufPool, concurrency, cbChunk, cb) { var readQ = new ProcQueue(concurrency); diff --git a/lib/par2.js b/lib/par2.js index d9e98119..5fa7203c 100644 --- a/lib/par2.js +++ b/lib/par2.js @@ -6,7 +6,7 @@ var async = require('async'); var allocBuffer = (Buffer.allocUnsafe || Buffer); var toBuffer = (Buffer.alloc ? Buffer.from : Buffer); -var bufferSlice = Buffer.prototype.subarray || Buffer.prototype.slice; +var bufferSlice = Buffer.prototype.readBigInt64BE ? Buffer.prototype.subarray : Buffer.prototype.slice; var SAFE_INT = 0xffffffff; // JS only does 32-bit bit operations diff --git a/lib/par2gen.js b/lib/par2gen.js index 38e631c6..012cbe6e 100644 --- a/lib/par2gen.js +++ b/lib/par2gen.js @@ -9,7 +9,7 @@ var FileSeqReader = require('./fileseqreader'); var FileChunkReader = require('./filechunkreader'); var BufferPool = require('./bufferpool'); var PAR2OutFile = require('./par2outfile'); -var bufferSlice = Buffer.prototype.subarray || Buffer.prototype.slice; +var bufferSlice = Buffer.prototype.readBigInt64BE ? 
Buffer.prototype.subarray : Buffer.prototype.slice; var MAX_BUFFER_SIZE = (require('buffer').kMaxLength || (1024*1024*1024-1)) - 1024-68; // the '-1024-68' is padding to deal with alignment issues (XorJit512 can have 1KB block) + 68-byte header diff --git a/test/par-compare.js b/test/par-compare.js index a8eff33d..60832de8 100644 --- a/test/par-compare.js +++ b/test/par-compare.js @@ -18,7 +18,7 @@ var skipFileCreate = true; // skip creating test files if they already exist (sp var fs = require('fs'); var crypto = require('crypto'); -var bufferSlice = Buffer.prototype.subarray || Buffer.prototype.slice; +var bufferSlice = Buffer.prototype.readBigInt64BE ? Buffer.prototype.subarray : Buffer.prototype.slice; var allocBuffer = (Buffer.allocUnsafe || Buffer); var fsRead = function(fd, len) { var buf = allocBuffer(len); From 1af436ec6067925971f159d087ce89d5c5cac08b Mon Sep 17 00:00:00 2001 From: animetosho Date: Sat, 12 Aug 2023 22:11:12 +1000 Subject: [PATCH 49/91] Add RVV implementation of Shuffle128 --- binding.gyp | 53 +++++++ gf16/gf16_checksum_rvv.h | 71 ++++++++++ gf16/gf16_cksum.h | 1 + gf16/gf16_cksum_rvv.c | 19 +++ gf16/gf16_rvv_common.h | 37 +++++ gf16/gf16_shuffle.h | 46 +++--- gf16/gf16_shuffle128_rvv.c | 281 +++++++++++++++++++++++++++++++++++++ gf16/gf16mul.cpp | 53 ++++++- gf16/gf16mul.h | 2 + gf16/gf_add.h | 3 + gf16/gf_add_rvv.c | 83 +++++++++++ help.txt | 2 + lib/par2.js | 1 + src/cpuid.h | 28 ++++ 14 files changed, 661 insertions(+), 19 deletions(-) create mode 100644 gf16/gf16_checksum_rvv.h create mode 100644 gf16/gf16_cksum_rvv.c create mode 100644 gf16/gf16_rvv_common.h create mode 100644 gf16/gf16_shuffle128_rvv.c create mode 100644 gf16/gf_add_rvv.c diff --git a/binding.gyp b/binding.gyp index 70a36159..42d1e9f1 100644 --- a/binding.gyp +++ b/binding.gyp @@ -867,6 +867,59 @@ ] }] ] + }, + { + "target_name": "gf16_rvv", + "type": "static_library", + "defines": ["NDEBUG"], + "sources": [ + "gf16/gf16_shuffle128_rvv.c", + "gf16/gf_add_rvv.c", + 
"gf16/gf16_cksum_rvv.c" + ], + "cflags": ["-Wno-unused-function", "-std=c99"], + "xcode_settings": { + "OTHER_CFLAGS": ["-Wno-unused-function"], + "OTHER_CFLAGS!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"] + }, + "cflags!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"], + "msvs_settings": {"VCCLCompilerTool": {"BufferSecurityCheck": "false"}}, + "conditions": [ + ['target_arch=="riscv64" and OS!="win"', { + "variables": {"supports_rvv%": "/dev/null || true)"}, + "conditions": [ + ['supports_rvv!=""', { + "cflags!": ["-march=native"], + "cxxflags!": ["-march=native"], + "cflags": ["-march=rv64gcv"], + "cxxflags": ["-march=rv64gcv"], + "xcode_settings": { + "OTHER_CFLAGS!": ["-march=native"], + "OTHER_CXXFLAGS!": ["-march=native"], + "OTHER_CFLAGS": ["-march=rv64gcv"], + "OTHER_CXXFLAGS": ["-march=rv64gcv"], + } + }] + ] + }], + ['target_arch=="riscv32" and OS!="win"', { + "variables": {"supports_rvv%": "/dev/null || true)"}, + "conditions": [ + ['supports_rvv!=""', { + "cflags!": ["-march=native"], + "cxxflags!": ["-march=native"], + "cflags": ["-march=rv32gcv"], + "cxxflags": ["-march=rv32gcv"], + "xcode_settings": { + "OTHER_CFLAGS!": ["-march=native"], + "OTHER_CXXFLAGS!": ["-march=native"], + "OTHER_CFLAGS": ["-march=rv32gcv"], + "OTHER_CXXFLAGS": ["-march=rv32gcv"], + } + }] + ] + }] + ] } ] } diff --git a/gf16/gf16_checksum_rvv.h b/gf16/gf16_checksum_rvv.h new file mode 100644 index 00000000..615109b9 --- /dev/null +++ b/gf16/gf16_checksum_rvv.h @@ -0,0 +1,71 @@ +#ifndef __GF16_CHECKSUM_H +#define __GF16_CHECKSUM_H + +#include "gf16_rvv_common.h" + +#ifdef __RVV_LE +static HEDLEY_ALWAYS_INLINE void gf16_checksum_block_rvv(const void *HEDLEY_RESTRICT src, void *HEDLEY_RESTRICT checksum, const size_t blockLen, const int aligned) { + size_t vl = RV(vsetvlmax_e8m1)(); + const unsigned words = blockLen/vl; + + vint16m1_t v = *(vint16m1_t*)checksum; + v = gf16_vec_mul2_rvv(v); + if(aligned) { + vl = 
RV(vsetvlmax_e16m1)(); + int16_t* _src = (int16_t*)src; + for(unsigned i=0; i +# if defined(__clang__) && __clang_major__ < 16 +# define RV(f) f +# else +# define RV(f) __riscv_##f +# endif + + +// TODO: evaluate endian requirements +# if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ +# define __RVV_LE +# endif + +static HEDLEY_ALWAYS_INLINE vint16m1_t gf16_vec_mul2_rvv(vint16m1_t v) { + size_t vl = RV(vsetvlmax_e16m1)(); + vbool16_t maskPoly = RV(vmslt_vx_i16m1_b16)(v, 0, vl); + v = RV(vadd_vv_i16m1)(v, v, vl); + return RV(vxor_vx_i16m1_m)( + maskPoly, + v, v, + GF16_POLYNOMIAL & 0xffff, + vl + ); +} + + +#endif + +#endif \ No newline at end of file diff --git a/gf16/gf16_shuffle.h b/gf16/gf16_shuffle.h index 87f3fe05..d4ebbbb1 100644 --- a/gf16/gf16_shuffle.h +++ b/gf16/gf16_shuffle.h @@ -36,15 +36,13 @@ FUNCS(neon); FUNCS(128_sve); FUNCS(128_sve2); FUNCS(512_sve2); +FUNCS(128_rvv); #undef FUNCS void gf16_shuffle_mul_vbmi(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); void gf16_shuffle_muladd_vbmi(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); void gf16_shuffle_muladd_prefetch_vbmi(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch, const void *HEDLEY_RESTRICT prefetch); -void gf16_shuffle_prepare_packed_vbmi(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); -void gf16_shuffle_prepare_packed_cksum_vbmi(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); -void gf16_shuffle_prepare_partial_packsum_vbmi(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT 
src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen, size_t partOffset, size_t partLen); extern int gf16_shuffle_available_vbmi; #define FUNCS(v) \ @@ -55,32 +53,42 @@ FUNCS(neon); FUNCS(128_sve); FUNCS(128_sve2); FUNCS(512_sve2); +FUNCS(128_rvv); #undef FUNCS -void gf16_shuffle_prepare_packed_neon(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); -void gf16_shuffle_prepare_packed_cksum_neon(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); -void gf16_shuffle_prepare_partial_packsum_neon(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen, size_t partOffset, size_t partLen); -void gf16_shuffle_finish_packed_neon(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen); -int gf16_shuffle_finish_packed_cksum_neon(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen); -int gf16_shuffle_finish_partial_packsum_neon(void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen, size_t partOffset, size_t partLen); +#define FUNCS(v) \ + void gf16_shuffle_prepare_packed_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); \ + void gf16_shuffle_prepare_packed_cksum_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); \ + void gf16_shuffle_prepare_partial_packsum_##v(void *HEDLEY_RESTRICT dst, const void 
*HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen, size_t partOffset, size_t partLen) -void gf16_shuffle_prepare_packed_sve(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); -void gf16_shuffle_prepare_packed_cksum_sve(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); -void gf16_shuffle_prepare_partial_packsum_sve(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen, size_t partOffset, size_t partLen); -void gf16_shuffle_finish_packed_sve(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen); -int gf16_shuffle_finish_packed_cksum_sve(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen); -int gf16_shuffle_finish_partial_packsum_sve(void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen, size_t partOffset, size_t partLen); +FUNCS(vbmi); +FUNCS(neon); +FUNCS(sve); +FUNCS(512_sve2); +FUNCS(rvv); + +#undef FUNCS + +#define FUNCS(v) \ + void gf16_shuffle_finish_packed_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen); \ + int gf16_shuffle_finish_packed_cksum_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen); \ + int gf16_shuffle_finish_partial_packsum_##v(void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen, size_t partOffset, 
size_t partLen) + +FUNCS(neon); +FUNCS(sve); +FUNCS(rvv); + +#undef FUNCS -void gf16_shuffle_prepare_packed_512_sve2(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); -void gf16_shuffle_prepare_packed_cksum_512_sve2(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); -void gf16_shuffle_prepare_partial_packsum_512_sve2(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen, size_t partOffset, size_t partLen); // also used for clmul, but declared here for convenience extern int gf16_available_neon; extern int gf16_available_sve; extern int gf16_available_sve2; +extern int gf16_available_rvv; + // shuffle2x #define FUNCS(v) \ void gf16_shuffle2x_prepare_##v(void* dst, const void* src, size_t srcLen); \ @@ -122,8 +130,10 @@ void* gf16_shuffle_init_vbmi(int polynomial); void* gf16_shuffle_init_arm(int polynomial); void* gf16_shuffle_init_128_sve(int polynomial); void* gf16_shuffle_init_512_sve(int polynomial); +void* gf16_shuffle_init_128_rvv(int polynomial); int gf16_sve_get_size(); +int gf16_rvv_get_size(); uint16_t gf16_affine2x_replace_word(void* data, size_t index, uint16_t newValue); uint16_t gf16_shuffle16_replace_word(void* data, size_t index, uint16_t newValue); diff --git a/gf16/gf16_shuffle128_rvv.c b/gf16/gf16_shuffle128_rvv.c new file mode 100644 index 00000000..ba23a1a9 --- /dev/null +++ b/gf16/gf16_shuffle128_rvv.c @@ -0,0 +1,281 @@ +#include "gf16_rvv_common.h" + +#if defined(__RVV_LE) +int gf16_available_rvv = 1; +#else +int gf16_available_rvv = 0; +#endif + +#include "gf16_muladd_multi.h" + +#if defined(__RVV_LE) +// TODO: detect intrinsics version +# if 1 +// intrinsics v0.11.x (up to at least GCC 13 / Clang 16) +# define _vlseg2e8 RV(vlseg2e8_v_u8m1) +# 
define _vsseg2e8 RV(vsseg2e8_v_u8m1) +# else +// intrinsics v0.12.x +static HEDLEY_ALWAYS_INLINE void _vlseg2e8(vuint8m1_t* v0, vuint8m1_t* v1, const uint8_t* src, size_t vl) { + vuint8m1x2_t d = RV(vlseg2e8_v_u8m1x2)(src, vl); + *v0 = RV(vget_v_u8m1x2_u8m1)(vd, 0); + *v1 = RV(vget_v_u8m1x2_u8m1)(vd, 1); +} +static HEDLEY_ALWAYS_INLINE void _vsseg2e8(uint8_t* dst, vuint8m1_t v0, vuint8m1_t v1, size_t vl) { + vuint8m1x2_t d; + d = RV(vset_v_u8m1_u8m1x2)(d, 0, v0); + d = RV(vset_v_u8m1_u8m1x2)(d, 1, v1); + RV(vsseg2e8_v_u8m1x2)(dst, d, vl); +} +# endif + +static HEDLEY_ALWAYS_INLINE void gf16_shuffle_128_rvv_calc_table(vuint8m1_t poly_l, uint16_t val, + vuint8m1_t* tbl_l0, vuint8m1_t* tbl_l1, vuint8m1_t* tbl_l2, vuint8m1_t* tbl_l3, + vuint8m1_t* tbl_h0, vuint8m1_t* tbl_h1, vuint8m1_t* tbl_h2, vuint8m1_t* tbl_h3 +) { + uint16_t val2 = GF16_MULTBY_TWO(val); + uint16_t val4 = GF16_MULTBY_TWO(val2); + uint16_t val8 = GF16_MULTBY_TWO(val4); + + vuint16m1_t tmp0 = RV(vmv_v_x_u16m1)(val ^ val2, 8); + tmp0 = RV(vslide1up_vx_u16m1)(tmp0, val2, 8); + tmp0 = RV(vslide1up_vx_u16m1)(tmp0, val, 8); + tmp0 = RV(vslide1up_vx_u16m1)(tmp0, 0, 8); + + vuint16m1_t tmp4 = RV(vxor_vv_u16m1)(RV(vmv_v_x_u16m1)(val4, 8), tmp0, 8); + tmp0 = RV(vslideup_vx_u16m1)(tmp0, tmp4, 4, 8); + + vuint16m1_t tmp8 = RV(vxor_vv_u16m1)(tmp0, RV(vmv_v_x_u16m1)(val8, 8), 8); + + vuint8mf2_t tmpL0, tmpL1, tmpH0, tmpH1; + tmpL0 = RV(vnsrl_wx_u8mf2)(tmp0, 0, 8); + tmpL1 = RV(vnsrl_wx_u8mf2)(tmp8, 0, 8); + tmpH0 = RV(vnsrl_wx_u8mf2)(tmp0, 8, 8); + tmpH1 = RV(vnsrl_wx_u8mf2)(tmp8, 8, 8); + + *tbl_l0 = RV(vslideup_vx_u8m1)(RV(vlmul_ext_v_u8mf2_u8m1)(tmpL0), RV(vlmul_ext_v_u8mf2_u8m1)(tmpL1), 8, 16); + *tbl_h0 = RV(vslideup_vx_u8m1)(RV(vlmul_ext_v_u8mf2_u8m1)(tmpH0), RV(vlmul_ext_v_u8mf2_u8m1)(tmpH1), 8, 16); + + vuint8m1_t ri, rh, rl; + + // could replace the sll+or with a macc, but probably not worth it + #define MUL16(p, c) \ + ri = RV(vsrl_vx_u8m1)(*tbl_h##p, 4, 16); \ + rl = RV(vsll_vx_u8m1)(*tbl_l##p, 4, 16); 
\ + rh = RV(vxor_vv_u8m1)(*tbl_h##p, ri, 16); \ + *tbl_l##c = RV(vxor_vv_u8m1)(rl, RV(vrgather_vv_u8m1)(poly_l, ri, 16), 16); \ + *tbl_h##c = RV(vor_vv_u8m1)( \ + RV(vsll_vx_u8m1)(rh, 4, 16), \ + RV(vsrl_vx_u8m1)(*tbl_l##p, 4, 16), \ + 16 \ + ) + + MUL16(0, 1); + MUL16(1, 2); + MUL16(2, 3); + #undef MUL16 +} + + +static HEDLEY_ALWAYS_INLINE void gf16_shuffle_128_rvv_round(size_t vl, vuint8m1_t src0, vuint8m1_t src1, vuint8m1_t* rl, vuint8m1_t* rh, + vuint8m1_t tbl_l0, vuint8m1_t tbl_l1, vuint8m1_t tbl_l2, vuint8m1_t tbl_l3, + vuint8m1_t tbl_h0, vuint8m1_t tbl_h1, vuint8m1_t tbl_h2, vuint8m1_t tbl_h3 +) { + vuint8m1_t tmp = RV(vand_vx_u8m1)(src0, 0xf, vl); + *rl = RV(vxor_vv_u8m1)(*rl, RV(vrgather_vv_u8m1)(tbl_l0, tmp, vl), vl); + *rh = RV(vxor_vv_u8m1)(*rh, RV(vrgather_vv_u8m1)(tbl_h0, tmp, vl), vl); + + tmp = RV(vand_vx_u8m1)(src1, 0xf, vl); + *rl = RV(vxor_vv_u8m1)(*rl, RV(vrgather_vv_u8m1)(tbl_l2, tmp, vl), vl); + *rh = RV(vxor_vv_u8m1)(*rh, RV(vrgather_vv_u8m1)(tbl_h2, tmp, vl), vl); + + tmp = RV(vsrl_vx_u8m1)(src0, 4, vl); + *rl = RV(vxor_vv_u8m1)(*rl, RV(vrgather_vv_u8m1)(tbl_l1, tmp, vl), vl); + *rh = RV(vxor_vv_u8m1)(*rh, RV(vrgather_vv_u8m1)(tbl_h1, tmp, vl), vl); + + tmp = RV(vsrl_vx_u8m1)(src1, 4, vl); + *rl = RV(vxor_vv_u8m1)(*rl, RV(vrgather_vv_u8m1)(tbl_l3, tmp, vl), vl); + *rh = RV(vxor_vv_u8m1)(*rh, RV(vrgather_vv_u8m1)(tbl_h3, tmp, vl), vl); +} + + +static HEDLEY_ALWAYS_INLINE void gf16_shuffle_muladd_x_128_rvv( + const void *HEDLEY_RESTRICT scratch, + uint8_t *HEDLEY_RESTRICT _dst, const unsigned srcScale, GF16_MULADD_MULTI_SRCLIST, size_t len, + const uint16_t *HEDLEY_RESTRICT coefficients, const int doPrefetch, const char* _pf +) { + GF16_MULADD_MULTI_SRC_UNUSED(3); + + vuint8m1_t poly_l = RV(vle8_v_u8m1)((const uint8_t*)scratch, 16); + + vuint8m1_t tbl_Ah0, tbl_Ah1, tbl_Ah2, tbl_Ah3, tbl_Al0, tbl_Al1, tbl_Al2, tbl_Al3; + vuint8m1_t tbl_Bh0, tbl_Bh1, tbl_Bh2, tbl_Bh3, tbl_Bl0, tbl_Bl1, tbl_Bl2, tbl_Bl3; + vuint8m1_t tbl_Ch0, tbl_Ch1, tbl_Ch2, 
tbl_Ch3, tbl_Cl0, tbl_Cl1, tbl_Cl2, tbl_Cl3; + // TODO: support calcing multiple tables together + #define CALC_TABLE(n, t) \ + if(srcCount >= n) \ + gf16_shuffle_128_rvv_calc_table( \ + poly_l, coefficients[n], \ + &tbl_##t##l0, &tbl_##t##l1, &tbl_##t##l2, &tbl_##t##l3, &tbl_##t##h0, &tbl_##t##h1, &tbl_##t##h2, &tbl_##t##h3 \ + ) + CALC_TABLE(0, A); + CALC_TABLE(1, B); + CALC_TABLE(2, C); + #undef CALC_TABLE + + size_t vl = RV(vsetvlmax_e8m1)(); + for(intptr_t ptr = -(intptr_t)len; ptr; ptr += vl*2) { + // TODO: does RISC-V have prefetch instructions? + UNUSED(doPrefetch); UNUSED(_pf); + + vuint8m1_t rl, rh; + _vlseg2e8(&rl, &rh, _dst+ptr, vl*2); + + vuint8m1_t in0, in1; + _vlseg2e8(&in0, &in1, _src1+ptr*srcScale, vl*2); + + gf16_shuffle_128_rvv_round(vl, in0, in1, &rl, &rh, tbl_Al0, tbl_Al1, tbl_Al2, tbl_Al3, tbl_Ah0, tbl_Ah1, tbl_Ah2, tbl_Ah3); + if(srcCount > 1) { + _vlseg2e8(&in0, &in1, _src2+ptr*srcScale, vl*2); + gf16_shuffle_128_rvv_round(vl, in0, in1, &rl, &rh, tbl_Bl0, tbl_Bl1, tbl_Bl2, tbl_Bl3, tbl_Bh0, tbl_Bh1, tbl_Bh2, tbl_Bh3); + } + if(srcCount > 2) { + _vlseg2e8(&in0, &in1, _src3+ptr*srcScale, vl*2); + gf16_shuffle_128_rvv_round(vl, in0, in1, &rl, &rh, tbl_Cl0, tbl_Cl1, tbl_Cl2, tbl_Cl3, tbl_Ch0, tbl_Ch1, tbl_Ch2, tbl_Ch3); + } + + _vsseg2e8(_dst+ptr, rl, rh, vl*2); + } +} + +#endif /*defined(__RVV_LE)*/ + + + + +void gf16_shuffle_mul_128_rvv(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t val, void *HEDLEY_RESTRICT mutScratch) { + UNUSED(mutScratch); +#if defined(__RVV_LE) + vuint8m1_t poly_l = RV(vle8_v_u8m1)((const uint8_t*)scratch, 16); + vuint8m1_t tbl_h0, tbl_h1, tbl_h2, tbl_h3, tbl_l0, tbl_l1, tbl_l2, tbl_l3; + gf16_shuffle_128_rvv_calc_table(poly_l, val, &tbl_l0, &tbl_l1, &tbl_l2, &tbl_l3, &tbl_h0, &tbl_h1, &tbl_h2, &tbl_h3); + + + const uint8_t* _src = (const uint8_t*)src + len; + uint8_t* _dst = (uint8_t*)dst + len; + size_t vl = RV(vsetvlmax_e8m1)(); + + for(intptr_t 
ptr = -(intptr_t)len; ptr; ptr += vl*2) { + vuint8m1_t in0, in1; + _vlseg2e8(&in0, &in1, _src+ptr, vl*2); + + vuint8m1_t tmp = RV(vand_vx_u8m1)(in0, 0xf, vl); + vuint8m1_t rl = RV(vrgather_vv_u8m1)(tbl_l0, tmp, vl); + vuint8m1_t rh = RV(vrgather_vv_u8m1)(tbl_h0, tmp, vl); + + tmp = RV(vand_vx_u8m1)(in1, 0xf, vl); + rl = RV(vxor_vv_u8m1)(rl, RV(vrgather_vv_u8m1)(tbl_l2, tmp, vl), vl); + rh = RV(vxor_vv_u8m1)(rh, RV(vrgather_vv_u8m1)(tbl_h2, tmp, vl), vl); + + tmp = RV(vsrl_vx_u8m1)(in0, 4, vl); + rl = RV(vxor_vv_u8m1)(rl, RV(vrgather_vv_u8m1)(tbl_l1, tmp, vl), vl); + rh = RV(vxor_vv_u8m1)(rh, RV(vrgather_vv_u8m1)(tbl_h1, tmp, vl), vl); + + tmp = RV(vsrl_vx_u8m1)(in1, 4, vl); + rl = RV(vxor_vv_u8m1)(rl, RV(vrgather_vv_u8m1)(tbl_l3, tmp, vl), vl); + rh = RV(vxor_vv_u8m1)(rh, RV(vrgather_vv_u8m1)(tbl_h3, tmp, vl), vl); + + _vsseg2e8(_dst+ptr, rl, rh, vl*2); + } +#else + UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(val); +#endif +} + +void gf16_shuffle_muladd_128_rvv(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t val, void *HEDLEY_RESTRICT mutScratch) { + UNUSED(mutScratch); +#if defined(__RVV_LE) + gf16_muladd_single(scratch, gf16_shuffle_muladd_x_128_rvv, dst, src, len, val); +#else + UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(val); +#endif +} + + +#if defined(__RVV_LE) +GF16_MULADD_MULTI_FUNCS(gf16_shuffle, _128_rvv, gf16_shuffle_muladd_x_128_rvv, 3, RV(vsetvlmax_e8m1)()*2, 0, (void)0) +#else +GF16_MULADD_MULTI_FUNCS_STUB(gf16_shuffle, _128_rvv) +#endif + + + +#ifdef __RVV_LE +static HEDLEY_ALWAYS_INLINE void gf16_prepare_block_rvv(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src) { + size_t vl = RV(vsetvlmax_e8m2)(); + RV(vse8_v_u8m2)((uint8_t*)dst, RV(vle8_v_u8m2)((const uint8_t*)src, vl), vl); +} +// final block +static HEDLEY_ALWAYS_INLINE void gf16_prepare_blocku_rvv(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t remaining) { 
+ // current intrinsics don't seem to support tail-undisturbed policy, so zero explicitly for now + size_t vl = RV(vsetvlmax_e8m2)(); + RV(vse8_v_u8m2)((uint8_t*)dst, RV(vmv_v_x_u8m2)(0, vl), vl); + vl = RV(vsetvl_e8m2)(remaining); + RV(vse8_v_u8m2)((uint8_t*)dst, RV(vle8_v_u8m2)((const uint8_t*)src, vl), vl); +} +static HEDLEY_ALWAYS_INLINE void gf16_finish_blocku_rvv(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t remaining) { + size_t vl = RV(vsetvl_e8m2)(remaining); + RV(vse8_v_u8m2)((uint8_t*)dst, RV(vle8_v_u8m2)((const uint8_t*)src, vl), vl); +} + +static HEDLEY_ALWAYS_INLINE void gf16_checksum_prepare_rvv(void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT checksum, const size_t blockLen, gf16_transform_block_rst prepareBlock) { + int16_t tmp[blockLen/2]; + memset(tmp, 0, blockLen); + RV(vse16_v_i16m1)(tmp, *(vint16m1_t*)checksum, RV(vsetvlmax_e16m1)()); + + prepareBlock(dst, tmp); +} + +#include "gf16_checksum_rvv.h" + +// TODO: should align be width of the vector, instead of 16? 
+GF_PREPARE_PACKED_FUNCS(gf16_shuffle, _rvv, RV(vsetvlmax_e8m1)()*2, gf16_prepare_block_rvv, gf16_prepare_blocku_rvv, 3, (void)0, vuint16m1_t checksum = RV(vmv_v_x_u16m1)(0, RV(vsetvlmax_e16m1)()), gf16_checksum_block_rvv, gf16_checksum_blocku_rvv, gf16_checksum_exp_rvv, gf16_checksum_prepare_rvv, 16) +GF_FINISH_PACKED_FUNCS(gf16_shuffle, _rvv, RV(vsetvlmax_e8m1)()*2, gf16_prepare_block_rvv, gf16_finish_blocku_rvv, 1, (void)0, gf16_checksum_block_rvv, gf16_checksum_blocku_rvv, gf16_checksum_exp_rvv, NULL, 16) +#else +GF_PREPARE_PACKED_FUNCS_STUB(gf16_shuffle, _rvv) +GF_FINISH_PACKED_FUNCS_STUB(gf16_shuffle, _rvv) +#endif + + + + + +int gf16_rvv_get_size() { +#ifdef __RVV_LE + return RV(vsetvlmax_e8m1)(); +#else + return 0; +#endif +} + +void* gf16_shuffle_init_128_rvv(int polynomial) { +#ifdef __RVV_LE + uint8_t* ret; + if((polynomial | 0x1f) != 0x1101f) return NULL; + ALIGN_ALLOC(ret, 16, 16); + for(int i=0; i<16; i++) { + int p = 0; + if(i & 8) p ^= polynomial << 3; + if(i & 4) p ^= polynomial << 2; + if(i & 2) p ^= polynomial << 1; + if(i & 1) p ^= polynomial << 0; + + ret[i] = p & 0xff; + } + return ret; +#else + UNUSED(polynomial); + return NULL; +#endif +} + diff --git a/gf16/gf16mul.cpp b/gf16/gf16mul.cpp index 0d48bf73..1094ced4 100644 --- a/gf16/gf16mul.cpp +++ b/gf16/gf16mul.cpp @@ -210,6 +210,17 @@ struct CpuCap { } }; #endif +#ifdef __riscv +# include "../src/cpuid.h" + +struct CpuCap { + bool hasVector; + CpuCap(bool detect) : hasVector(true) { + if(!detect) return; + hasVector = CPU_HAS_VECTOR; + } +}; +#endif Galois16MethodInfo Galois16Mul::info(Galois16Methods _method) { @@ -309,6 +320,13 @@ Galois16MethodInfo Galois16Mul::info(Galois16Methods _method) { _info.idealInputMultiple = 4; break; + case GF16_SHUFFLE_128_RVV: + _info.alignment = 16; // I guess this is good enough... 
+ _info.cksumSize = gf16_rvv_get_size(); + _info.stride = _info.cksumSize*2; + _info.idealInputMultiple = 3; + break; + case GF16_CLMUL_SVE2: _info.alignment = 16; _info.cksumSize = gf16_sve_get_size(); @@ -436,6 +454,7 @@ Galois16MethodInfo Galois16Mul::info(Galois16Methods _method) { case GF16_SHUFFLE_NEON: case GF16_SHUFFLE_128_SVE: // may need smaller chunks for larger vector size case GF16_SHUFFLE_128_SVE2: + case GF16_SHUFFLE_128_RVV: _info.idealChunkSize = 16*1024; break; case GF16_SHUFFLE_AVX2: @@ -846,6 +865,29 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { copy_cksum_check = &gf16_cksum_copy_check_sve; break; + case GF16_SHUFFLE_128_RVV: + scratch = gf16_shuffle_init_128_rvv(GF16_POLYNOMIAL); + METHOD_REQUIRES(gf16_available_rvv) + + _mul = &gf16_shuffle_mul_128_rvv; + _mul_add = &gf16_shuffle_muladd_128_rvv; + _mul_add_multi = &gf16_shuffle_muladd_multi_128_rvv; + _mul_add_multi_stridepf = &gf16_shuffle_muladd_multi_stridepf_128_rvv; + _mul_add_multi_packed = &gf16_shuffle_muladd_multi_packed_128_rvv; + //_mul_add_multi_packpf = &gf16_shuffle_muladd_multi_packpf_128_rvv; + add_multi = &gf_add_multi_rvv; + add_multi_packed = &gf_add_multi_packed_v2i3_rvv; + add_multi_packpf = &gf_add_multi_packpf_v2i3_rvv; + prepare_packed = &gf16_shuffle_prepare_packed_rvv; + prepare_packed_cksum = &gf16_shuffle_prepare_packed_cksum_rvv; + prepare_partial_packsum = &gf16_shuffle_prepare_partial_packsum_rvv; + finish_packed = &gf16_shuffle_finish_packed_rvv; + finish_packed_cksum = &gf16_shuffle_finish_packed_cksum_rvv; + finish_partial_packsum = &gf16_shuffle_finish_partial_packsum_rvv; + copy_cksum = &gf16_cksum_copy_rvv; + copy_cksum_check = &gf16_cksum_copy_check_rvv; + break; + case GF16_AFFINE_AVX512: scratch = gf16_affine_init_avx2(GF16_POLYNOMIAL); METHOD_REQUIRES(gf16_affine_available_avx512 && gf16_shuffle_available_avx512) @@ -1332,7 +1374,11 @@ Galois16Methods Galois16Mul::default_method(size_t regionSizeHint, unsigned inpu # endif ? 
GF16_CLMUL_NEON : GF16_SHUFFLE_NEON; #endif - +#ifdef __riscv_ + const CpuCap caps(true); + if(caps.hasVector && gf16_available_rvv && gf16_rvv_get_size() >= 16) + return GF16_SHUFFLE_128_RVV; +#endif // lookup vs lookup3: latter seems to be slightly faster than former in most cases (SKX, Silvermont, Zen1, Rpi3 (arm64; arm32 faster muladd, slower mul)), sometimes slightly slower (Haswell, IvB?, Piledriver) // but test w/ multi-region lh-lookup & fat table before preferring it @@ -1410,6 +1456,11 @@ std::vector Galois16Mul::availableMethods(bool checkCpuid) { ret.push_back(GF16_SHUFFLE_512_SVE2); } #endif +#ifdef __riscv + const CpuCap caps(checkCpuid); + if(gf16_available_rvv && caps.hasVector && gf16_rvv_get_size() >= 16) + ret.push_back(GF16_SHUFFLE_128_RVV); +#endif return ret; } diff --git a/gf16/gf16mul.h b/gf16/gf16mul.h index b1e91b7b..8d47a8ff 100644 --- a/gf16/gf16mul.h +++ b/gf16/gf16mul.h @@ -45,6 +45,7 @@ enum Galois16Methods { GF16_SHUFFLE_128_SVE2, GF16_SHUFFLE2X_128_SVE2, GF16_SHUFFLE_512_SVE2, + GF16_SHUFFLE_128_RVV, GF16_SHUFFLE_SSSE3, GF16_SHUFFLE_AVX, GF16_SHUFFLE_AVX2, @@ -76,6 +77,7 @@ static const char* Galois16MethodsText[] = { "Shuffle-128 (SVE2)", "Shuffle2x-128 (SVE2)", "Shuffle-512 (SVE2)", + "Shuffle-128 (RVV)", "Shuffle (SSSE3)", "Shuffle (AVX)", "Shuffle (AVX2)", diff --git a/gf16/gf_add.h b/gf16/gf_add.h index ee964fb8..8a746f8a 100644 --- a/gf16/gf_add.h +++ b/gf16/gf_add.h @@ -8,6 +8,7 @@ void gf_add_multi_avx512(unsigned regions, size_t offset, void *HEDLEY_RESTRICT void gf_add_multi_neon(unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len); void gf_add_multi_sve(unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len); void gf_add_multi_sve2(unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len); +void gf_add_multi_rvv(unsigned regions, size_t offset, void 
*HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len); void gf_add_multi_packed_v1i2_sse2(unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len); @@ -39,6 +40,7 @@ void gf_add_multi_packed_v1i6_sve2(unsigned packRegions, unsigned regions, void void gf_add_multi_packed_v2i3_sve2(unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len); void gf_add_multi_packed_v2i4_sve2(unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len); void gf_add_multi_packed_v2i8_sve2(unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len); +void gf_add_multi_packed_v2i3_rvv(unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len); void gf_add_multi_packpf_v1i2_sse2(unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut); void gf_add_multi_packpf_v1i6_sse2(unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut); @@ -69,6 +71,7 @@ void gf_add_multi_packpf_v1i6_sve2(unsigned packRegions, unsigned regions, void void gf_add_multi_packpf_v2i3_sve2(unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut); void gf_add_multi_packpf_v2i4_sve2(unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut); void gf_add_multi_packpf_v2i8_sve2(unsigned packRegions, unsigned 
regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut); +void gf_add_multi_packpf_v2i3_rvv(unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut); void gf_add_multi_packed_generic(unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len); diff --git a/gf16/gf_add_rvv.c b/gf16/gf_add_rvv.c new file mode 100644 index 00000000..347de417 --- /dev/null +++ b/gf16/gf_add_rvv.c @@ -0,0 +1,83 @@ +#include "gf16_rvv_common.h" +#include "gf16_muladd_multi.h" + +#ifdef __riscv_vector + +static HEDLEY_ALWAYS_INLINE void gf_add_x_rvv( + const void *HEDLEY_RESTRICT scratch, uint8_t *HEDLEY_RESTRICT _dst, const unsigned srcScale, + GF16_MULADD_MULTI_SRCLIST, size_t len, + const uint16_t *HEDLEY_RESTRICT coefficients, + const int doPrefetch, const char* _pf +) { + ASSUME(len > 0); + + GF16_MULADD_MULTI_SRC_UNUSED(18); + UNUSED(coefficients); + + unsigned vecStride = (unsigned)((uintptr_t)scratch); // abuse this otherwise unused variable + + if(vecStride == 2) { // only support a vecStride of 2 for now (may eventually support 1 for CLMul) + size_t vl = RV(vsetvlmax_e8m2)(); + for(intptr_t ptr = -(intptr_t)len; ptr; ptr += vl) { + vuint8m2_t data = RV(vle8_v_u8m2)(_dst+ptr, vl); + + #define XOR_LOAD(n) \ + if(srcCount >= n) \ + data = RV(vxor_vv_u8m2)(data, RV(vle8_v_u8m2)(_src##n+ptr*srcScale, vl), vl) + XOR_LOAD(1); + XOR_LOAD(2); + XOR_LOAD(3); + XOR_LOAD(4); + XOR_LOAD(5); + XOR_LOAD(6); + XOR_LOAD(7); + XOR_LOAD(8); + XOR_LOAD(9); + XOR_LOAD(10); + XOR_LOAD(11); + XOR_LOAD(12); + XOR_LOAD(13); + XOR_LOAD(14); + XOR_LOAD(15); + XOR_LOAD(16); + XOR_LOAD(17); + XOR_LOAD(18); + #undef XOR_LOAD + + RV(vse8_v_u8m2)(_dst+ptr, data, vl); + + UNUSED(doPrefetch); UNUSED(_pf); + } + } +} +#endif 
+ +void gf_add_multi_rvv(unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len) { +#ifdef __riscv_vector + gf16_muladd_multi((void*)2, &gf_add_x_rvv, 4, regions, offset, dst, src, len, NULL); +#else + UNUSED(regions); UNUSED(offset); UNUSED(dst); UNUSED(src); UNUSED(len); +#endif +} + +#ifdef __riscv_vector +# define PACKED_FUNC(vs, il, it) \ +void gf_add_multi_packed_v##vs##i##il##_rvv(unsigned packedRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len) { \ + gf16_muladd_multi_packed((void*)vs, &gf_add_x_rvv, il, it, packedRegions, regions, dst, src, len, RV(vsetvlmax_e8m1)()*vs, NULL); \ +} \ +void gf_add_multi_packpf_v##vs##i##il##_rvv(unsigned packedRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut) { \ + gf16_muladd_multi_packpf((void*)vs, &gf_add_x_rvv, il, it, packedRegions, regions, dst, src, len, RV(vsetvlmax_e8m1)()*vs, NULL, vs>1, prefetchIn, prefetchOut); \ +} +#else +# define PACKED_FUNC(vs, il, it) \ +void gf_add_multi_packed_v##vs##i##il##_rvv(unsigned packedRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len) { \ + UNUSED(packedRegions); UNUSED(regions); UNUSED(dst); UNUSED(src); UNUSED(len); \ +}\ +void gf_add_multi_packpf_v##vs##i##il##_rvv(unsigned packedRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut) { \ + UNUSED(packedRegions); UNUSED(regions); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(prefetchIn); UNUSED(prefetchOut); \ +} +#endif + +PACKED_FUNC(2, 3, 12) + +#undef PACKED_FUNC diff --git a/help.txt b/help.txt index 66674bbd..6e0d28b9 100644 --- a/help.txt +++ b/help.txt @@ -295,6 +295,8 @@ CPU Tuning Options: shuffle2x128-sve2: half width 
variant of shuffle-neon (requires SVE width >= 256 bits) shuffle512-sve2: SVE2 variant of shuffle-vbmi (requires SVE width >= 512 bits) clmul-sve2: SVE2 variant of clmul-neon + RISC-V only choices: + shuffle128-rvv: RISC-V Vector variant of shuffle128-sve2 Default is auto-detected. --loop-tile-size Target size used for loop tiling optimisation. Default is 0 (auto-detected) diff --git a/lib/par2.js b/lib/par2.js index 5fa7203c..c45f3ecf 100644 --- a/lib/par2.js +++ b/lib/par2.js @@ -884,6 +884,7 @@ PAR2Chunked.prototype = { var GF_METHODS = [ '' /*default*/, 'lookup', 'lookup-sse', '3p_lookup', 'shuffle-neon', 'shuffle128-sve', 'shuffle128-sve2', 'shuffle2x128-sve2', 'shuffle512-sve2', + 'shuffle128-rvv', 'shuffle-sse', 'shuffle-avx', 'shuffle-avx2', 'shuffle-avx512', 'shuffle-vbmi', 'shuffle2x-avx2', 'shuffle2x-avx512', 'xor-sse', 'xorjit-sse', 'xorjit-avx2', 'xorjit-avx512', diff --git a/src/cpuid.h b/src/cpuid.h index af6b58df..05201e37 100644 --- a/src/cpuid.h +++ b/src/cpuid.h @@ -125,4 +125,32 @@ static unsigned long getauxval(unsigned long cap) { #endif +#ifdef __riscv +# if defined(__has_include) +# if __has_include() +# include +# ifdef __FreeBSD__ +static unsigned long getauxval(unsigned long cap) { + unsigned long ret; + elf_aux_info(cap, &ret, sizeof(ret)); + return ret; +} +# endif +# if __has_include() +# include +# endif +# endif +# endif + +# ifndef CPU_HAS_VECTOR +# define CPU_HAS_VECTOR false + +# if defined(AT_HWCAP) +# undef CPU_HAS_VECTOR +# define CPU_HAS_VECTOR (getauxval(AT_HWCAP) & (1 << ('V'-'A'))) +# endif +# endif + +#endif + #endif /* PP_CPUID_H */ From 07ead19ca15036a6d52f621b07091f9884081fc7 Mon Sep 17 00:00:00 2001 From: animetosho Date: Sun, 13 Aug 2023 20:35:42 +1000 Subject: [PATCH 50/91] Fixes for last commit --- binding.gyp | 2 +- gf16/gf_add_rvv.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/binding.gyp b/binding.gyp index 42d1e9f1..89065506 100644 --- a/binding.gyp +++ b/binding.gyp @@ -37,7 +37,7 @@ { 
"target_name": "parpar_gf", "dependencies": [ - "parpar_gf_c", "gf16", "gf16_generic", "gf16_sse2", "gf16_ssse3", "gf16_avx", "gf16_avx2", "gf16_avx512", "gf16_vbmi", "gf16_gfni", "gf16_gfni_avx2", "gf16_gfni_avx512", "gf16_neon", "gf16_sve", "gf16_sve2", + "parpar_gf_c", "gf16", "gf16_generic", "gf16_sse2", "gf16_ssse3", "gf16_avx", "gf16_avx2", "gf16_avx512", "gf16_vbmi", "gf16_gfni", "gf16_gfni_avx2", "gf16_gfni_avx512", "gf16_neon", "gf16_sve", "gf16_sve2", "gf16_rvv", "hasher", "hasher_sse2", "hasher_clmul", "hasher_xop", "hasher_bmi1", "hasher_avx2", "hasher_avx512", "hasher_avx512vl", "hasher_armcrc", "hasher_neon", "hasher_neoncrc", "hasher_sve2" ], "sources": ["src/gf.cc", "gf16/controller.cpp", "gf16/controller_cpu.cpp", "gf16/controller_ocl.cpp", "gf16/controller_ocl_init.cpp"], diff --git a/gf16/gf_add_rvv.c b/gf16/gf_add_rvv.c index 347de417..f52a237e 100644 --- a/gf16/gf_add_rvv.c +++ b/gf16/gf_add_rvv.c @@ -72,7 +72,7 @@ void gf_add_multi_packpf_v##vs##i##il##_rvv(unsigned packedRegions, unsigned reg # define PACKED_FUNC(vs, il, it) \ void gf_add_multi_packed_v##vs##i##il##_rvv(unsigned packedRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len) { \ UNUSED(packedRegions); UNUSED(regions); UNUSED(dst); UNUSED(src); UNUSED(len); \ -}\ +} \ void gf_add_multi_packpf_v##vs##i##il##_rvv(unsigned packedRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut) { \ UNUSED(packedRegions); UNUSED(regions); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(prefetchIn); UNUSED(prefetchOut); \ } From e7d471a3176f2e7a1ac4d58786d853dbc01002ba Mon Sep 17 00:00:00 2001 From: animetosho Date: Sun, 13 Aug 2023 20:46:41 +1000 Subject: [PATCH 51/91] Display chosen kernel for whenever a kernel is selected --- bin/parpar.js | 8 ++- gf16/gf16pmul.cpp | 22 +++++++ gf16/gf16pmul.h | 11 ++++ gf16/gfmat_inv.cpp | 12 
+++- gf16/gfmat_inv.h | 4 ++ hasher/hasher.cpp | 139 +++++++++++++++++++++++++++++++++++-------- hasher/hasher.h | 6 ++ hasher/hasher_impl.h | 11 ++++ lib/par2.js | 6 ++ lib/par2gen.js | 3 + src/gf.cc | 2 + 11 files changed, 197 insertions(+), 27 deletions(-) diff --git a/bin/parpar.js b/bin/parpar.js index a2d3c18f..f134bf5b 100755 --- a/bin/parpar.js +++ b/bin/parpar.js @@ -732,6 +732,7 @@ var inputFiles = argv._; var sizeDisp = function(val) { return cliFormat('1', friendlySize(val)); }; + var hash_methods = g.hash_methods(); if(argv.json) { print_json('processing_info', { input_size: g.totalSize, @@ -753,7 +754,9 @@ var inputFiles = argv._; recovery_offset: rf.recoveryOffset, size: rf.totalSize }; - }) + }), + hash_input_method: hash_methods[0], + hash_recovery_method: hash_methods[1] }); } else { if(g.opts.sliceSize > 1024*1048576) { @@ -771,6 +774,9 @@ var inputFiles = argv._; process.stderr.write('Input pass(es) : ' + cliFormat('1', g.chunks * g.passes) + ', processing ' + pluralDisp(g.slicesPerPass, '* ' + sizeDisp(g._chunkSize) + ' chunk') + ' per pass\n'); } process.stderr.write('Read buffer size : ' + sizeDisp(g.readSize) + ' * max ' + pluralDisp(g.opts.readBuffers, 'buffer') + '\n'); + process.stderr.write('Hash method : ' + cliFormat('1', hash_methods[0]) + ' (input)' + (g.opts.recoverySlices ? 
+ ', ' + cliFormat('1', hash_methods[1]) + ' (recovery)' + : '') + '\n'); } } if(argv.progress != 'none') { diff --git a/gf16/gf16pmul.cpp b/gf16/gf16pmul.cpp index 5e1f7504..22e31381 100644 --- a/gf16/gf16pmul.cpp +++ b/gf16/gf16pmul.cpp @@ -2,11 +2,13 @@ #include "../src/cpuid.h" Gf16PMulFunc gf16pmul = nullptr; +Galois16PointMulMethods gf16pmul_method = GF16PMUL_NONE; size_t gf16pmul_alignment = 1; size_t gf16pmul_blocklen = 1; void setup_pmul() { gf16pmul = nullptr; + gf16pmul_method = GF16PMUL_NONE; gf16pmul_alignment = 1; gf16pmul_blocklen = 1; @@ -40,21 +42,25 @@ void setup_pmul() { if(gf16pmul_available_vpclgfni) { gf16pmul = &gf16pmul_vpclgfni; + gf16pmul_method = GF16PMUL_VPCLMUL_GFNI; gf16pmul_alignment = 32; gf16pmul_blocklen = 64; } else if(gf16pmul_available_vpclmul) { gf16pmul = &gf16pmul_vpclmul; + gf16pmul_method = GF16PMUL_VPCLMUL; gf16pmul_alignment = 32; gf16pmul_blocklen = 32; } else if(gf16pmul_available_avx2) { gf16pmul = &gf16pmul_avx2; + gf16pmul_method = GF16PMUL_AVX2; gf16pmul_alignment = 32; gf16pmul_blocklen = 32; } else if(gf16pmul_available_sse) { gf16pmul = &gf16pmul_sse; + gf16pmul_method = GF16PMUL_PCLMUL; gf16pmul_alignment = 16; gf16pmul_blocklen = 16; } @@ -66,13 +72,29 @@ void setup_pmul() { if(gf16pmul_available_sve2) { gf16pmul = &gf16pmul_sve2; + gf16pmul_method = GF16PMUL_SVE2; gf16pmul_alignment = gf16pmul_sve2_width(); gf16pmul_blocklen = gf16pmul_alignment*2; } else if(gf16pmul_available_neon) { gf16pmul = &gf16pmul_neon; + gf16pmul_method = GF16PMUL_NEON; gf16pmul_alignment = 16; gf16pmul_blocklen = 32; } #endif } + +const char* gf16pmul_methodName() { + const char* names[] = { + "None (exponentiate)", + "PCLMUL", + "AVX2", + "VPCLMUL", + "VPCLMUL+GFNI", + "NEON", + "SVE2" + }; + + return names[(int)gf16pmul_method]; +} diff --git a/gf16/gf16pmul.h b/gf16/gf16pmul.h index 7ef94ded..c740bc03 100644 --- a/gf16/gf16pmul.h +++ b/gf16/gf16pmul.h @@ -4,11 +4,22 @@ #include "../src/hedley.h" #include +enum 
Galois16PointMulMethods { + GF16PMUL_NONE, + GF16PMUL_PCLMUL, + GF16PMUL_AVX2, + GF16PMUL_VPCLMUL, + GF16PMUL_VPCLMUL_GFNI, + GF16PMUL_NEON, + GF16PMUL_SVE2 +}; + // TODO: consider multi-dest typedef void(*Gf16PMulFunc)(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len); extern Gf16PMulFunc gf16pmul; extern size_t gf16pmul_alignment; extern size_t gf16pmul_blocklen; +const char* gf16pmul_methodName(); void setup_pmul(); diff --git a/gf16/gfmat_inv.cpp b/gf16/gfmat_inv.cpp index 26b4055b..0697b788 100644 --- a/gf16/gfmat_inv.cpp +++ b/gf16/gfmat_inv.cpp @@ -567,7 +567,11 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va unsigned matWidth = (unsigned)inputValid.size() * sizeof(uint16_t); - Galois16RecMatrixComputeState state(Galois16Mul::default_method(matWidth, (unsigned)inputValid.size(), (unsigned)inputValid.size(), true)); + if(regionMethod == GF16_AUTO) { + regionMethod = Galois16Mul::default_method(matWidth, numRec, numRec, true); + } + + Galois16RecMatrixComputeState state((Galois16Methods)regionMethod); state.validCount = validCount; const auto gfInfo = state.gf.info(); state.pfFactor = gfInfo.prefetchDownscale; @@ -681,11 +685,17 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va return true; } +const char* Galois16RecMatrix::getPointMulMethodName() const { + return gf16pmul_methodName(); +} + Galois16RecMatrix::Galois16RecMatrix() : mat(nullptr) { numThreads = hardware_concurrency(); numRec = 0; numStripes = 0; stripeWidth = 0; + + regionMethod = (int)GF16_AUTO; } Galois16RecMatrix::~Galois16RecMatrix() { diff --git a/gf16/gfmat_inv.h b/gf16/gfmat_inv.h index ce53d7fe..0e18a096 100644 --- a/gf16/gfmat_inv.h +++ b/gf16/gfmat_inv.h @@ -41,6 +41,10 @@ class Galois16RecMatrix { unsigned stripe = inIdx / sw; return mat[stripe * numRec*sw + recIdx * sw + (inIdx % sw)]; } + + // these should only be queried after Compute has started (i.e. 
from the progressCb, or after it returns) + /*Galois16Methods*/ int regionMethod; + const char* getPointMulMethodName() const; }; #endif diff --git a/hasher/hasher.cpp b/hasher/hasher.cpp index cff2ac4a..85e727f0 100644 --- a/hasher/hasher.cpp +++ b/hasher/hasher.cpp @@ -3,8 +3,11 @@ #include IHasherInput*(*HasherInput_Create)() = NULL; +HasherInputMethods HasherInput_Method = INHASH_SCALAR; uint32_t(*MD5CRC_Calc)(const void*, size_t, size_t, void*) = NULL; +MD5CRCMethods MD5CRC_Method = MD5CRCMETH_SCALAR; uint32_t(*CRC32_Calc)(const void*, size_t) = NULL; +MD5CRCMethods CRC32_Method = MD5CRCMETH_SCALAR; struct _CpuCap { #ifdef PLATFORM_X86 bool hasSSE2, hasXOP, hasBMI1, hasAVX2, hasAVX512F, hasAVX512VLBW; @@ -22,8 +25,11 @@ void setup_hasher() { if(HasherInput_Create) return; HasherInput_Create = &HasherInput_Scalar::create; + HasherInput_Method = INHASH_SCALAR; MD5CRC_Calc = &MD5CRC_Calc_Scalar; + MD5CRC_Method = MD5CRCMETH_SCALAR; CRC32_Calc = &CRC32_Calc_Slice4; + CRC32_Method = MD5CRCMETH_SCALAR; struct _CpuCap CpuCap; (void)CpuCap; @@ -73,45 +79,67 @@ void setup_hasher() { _cpuid(cpuInfo, 0x80000001); CpuCap.hasXOP = hasAVX && (cpuInfo[2] & 0x800); - if(CpuCap.hasAVX512VLBW && hasClMul && !isVecRotSlow && HasherInput_AVX512::isAvailable) + if(CpuCap.hasAVX512VLBW && hasClMul && !isVecRotSlow && HasherInput_AVX512::isAvailable) { HasherInput_Create = &HasherInput_AVX512::create; + HasherInput_Method = INHASH_AVX512; + } // SSE seems to be faster than scalar on Zen1/2, not Zen3; BMI > SSE on Zen1, unknown on Zen2 else if(hasClMul && !isSmallCore && HasherInput_ClMulScalar::isAvailable) { // Gracemont: SSE > scalar, but SSE ~= BMI - if(CpuCap.hasBMI1 && HasherInput_BMI1::isAvailable) + if(CpuCap.hasBMI1 && HasherInput_BMI1::isAvailable) { HasherInput_Create = &HasherInput_BMI1::create; - else + HasherInput_Method = INHASH_BMI1; + } else { HasherInput_Create = &HasherInput_ClMulScalar::create; - } else if(hasClMul && isSmallCore && 
HasherInput_ClMulSSE::isAvailable) + HasherInput_Method = INHASH_CRC; + } + } else if(hasClMul && isSmallCore && HasherInput_ClMulSSE::isAvailable) { HasherInput_Create = &HasherInput_ClMulSSE::create; - else if(CpuCap.hasSSE2 && isSmallCore && HasherInput_SSE::isAvailable) // TODO: CPU w/o ClMul might all be small enough + HasherInput_Method = INHASH_SIMD_CRC; + } + else if(CpuCap.hasSSE2 && isSmallCore && HasherInput_SSE::isAvailable) { // TODO: CPU w/o ClMul might all be small enough HasherInput_Create = &HasherInput_SSE::create; + HasherInput_Method = INHASH_SIMD; + } if(CpuCap.hasAVX512VLBW && !isVecRotSlow && MD5Single_isAvailable_AVX512) { MD5Single::_update = &MD5Single_update_AVX512; MD5Single::_updateZero = &MD5Single_updateZero_AVX512; + MD5Single::method = MD5CRCMETH_AVX512; } else if(isLEASlow && hasClMul && MD5Single_isAvailable_NoLEA) { MD5Single::_update = &MD5Single_update_NoLEA; MD5Single::_updateZero = &MD5Single_updateZero_NoLEA; + MD5Single::method = MD5CRCMETH_NOLEA; } // for some reason, single MD5 BMI1 seems to be slower on most cores, except Jaguar... 
unsure why else if(CpuCap.hasBMI1 && isSmallCore && MD5Single_isAvailable_BMI1) { MD5Single::_update = &MD5Single_update_BMI1; MD5Single::_updateZero = &MD5Single_updateZero_BMI1; + MD5Single::method = MD5CRCMETH_BMI1; } - if(CpuCap.hasAVX512VLBW && hasClMul && !isVecRotSlow && MD5CRC_isAvailable_AVX512) + if(CpuCap.hasAVX512VLBW && hasClMul && !isVecRotSlow && MD5CRC_isAvailable_AVX512) { MD5CRC_Calc = &MD5CRC_Calc_AVX512; - else if(isLEASlow && hasClMul && MD5CRC_isAvailable_NoLEA) + MD5CRC_Method = MD5CRCMETH_AVX512; + } + else if(isLEASlow && hasClMul && MD5CRC_isAvailable_NoLEA) { MD5CRC_Calc = &MD5CRC_Calc_NoLEA; - else if(CpuCap.hasBMI1 && hasClMul && isSmallCore && MD5CRC_isAvailable_BMI1) + MD5CRC_Method = MD5CRCMETH_NOLEA; + } + else if(CpuCap.hasBMI1 && hasClMul && isSmallCore && MD5CRC_isAvailable_BMI1) { MD5CRC_Calc = &MD5CRC_Calc_BMI1; - else if(hasClMul && MD5CRC_isAvailable_ClMul) + MD5CRC_Method = MD5CRCMETH_BMI1; + } + else if(hasClMul && MD5CRC_isAvailable_ClMul) { MD5CRC_Calc = &MD5CRC_Calc_ClMul; + MD5CRC_Method = MD5CRCMETH_PCLMUL; + } - if(hasClMul && CRC32_isAvailable_ClMul) + if(hasClMul && CRC32_isAvailable_ClMul) { CRC32_Calc = &CRC32_Calc_ClMul; + CRC32_Method = MD5CRCMETH_PCLMUL; + } #endif #ifdef PLATFORM_ARM @@ -120,19 +148,28 @@ void setup_hasher() { CpuCap.hasNEON = CPU_HAS_NEON; CpuCap.hasSVE2 = CPU_HAS_SVE2; - if(hasCRC && HasherInput_ARMCRC::isAvailable) // TODO: fast core only + if(hasCRC && HasherInput_ARMCRC::isAvailable) { // TODO: fast core only HasherInput_Create = &HasherInput_ARMCRC::create; + HasherInput_Method = INHASH_CRC; + } else if(CpuCap.hasNEON) { // TODO: slow core only - if(hasCRC && HasherInput_NEONCRC::isAvailable) + if(hasCRC && HasherInput_NEONCRC::isAvailable) { HasherInput_Create = &HasherInput_NEONCRC::create; - else if(HasherInput_NEON::isAvailable) + HasherInput_Method = INHASH_SIMD_CRC; + } else if(HasherInput_NEON::isAvailable) { HasherInput_Create = &HasherInput_NEON::create; + HasherInput_Method = 
INHASH_SIMD; + } } - if(hasCRC && MD5CRC_isAvailable_ARMCRC) + if(hasCRC && MD5CRC_isAvailable_ARMCRC) { MD5CRC_Calc = &MD5CRC_Calc_ARMCRC; - if(hasCRC && CRC32_isAvailable_ARMCRC) + MD5CRC_Method = MD5CRCMETH_ARMCRC; + } + if(hasCRC && CRC32_isAvailable_ARMCRC) { CRC32_Calc = &CRC32_Calc_ARMCRC; + CRC32_Method = MD5CRCMETH_ARMCRC; + } #endif @@ -155,24 +192,25 @@ void setup_hasher() { } bool set_hasherInput(HasherInputMethods method) { -#define SET_HASHER(x) { \ +#define SET_HASHER(h, x) if(method == h) { \ if(!x::isAvailable) return false; \ HasherInput_Create = &x::create; \ + HasherInput_Method = h; \ return true; \ } - if(method == INHASH_SCALAR) SET_HASHER(HasherInput_Scalar) + SET_HASHER(INHASH_SCALAR, HasherInput_Scalar) #ifdef PLATFORM_X86 - if(method == INHASH_SIMD) SET_HASHER(HasherInput_SSE) - if(method == INHASH_CRC) SET_HASHER(HasherInput_ClMulScalar) - if(method == INHASH_SIMD_CRC) SET_HASHER(HasherInput_ClMulSSE) - if(method == INHASH_BMI1) SET_HASHER(HasherInput_BMI1) - if(method == INHASH_AVX512) SET_HASHER(HasherInput_AVX512) + SET_HASHER(INHASH_SIMD, HasherInput_SSE) + SET_HASHER(INHASH_CRC, HasherInput_ClMulScalar) + SET_HASHER(INHASH_SIMD_CRC, HasherInput_ClMulSSE) + SET_HASHER(INHASH_BMI1, HasherInput_BMI1) + SET_HASHER(INHASH_AVX512, HasherInput_AVX512) #endif #ifdef PLATFORM_ARM - if(method == INHASH_SIMD) SET_HASHER(HasherInput_NEON) - if(method == INHASH_CRC) SET_HASHER(HasherInput_ARMCRC) - if(method == INHASH_SIMD_CRC) SET_HASHER(HasherInput_NEONCRC) + SET_HASHER(INHASH_SIMD, HasherInput_NEON) + SET_HASHER(INHASH_CRC, HasherInput_ARMCRC) + SET_HASHER(INHASH_SIMD_CRC, HasherInput_NEONCRC) #endif #undef SET_HASHER return false; @@ -367,6 +405,7 @@ void MD5Multi::get(void* md5s) { void(*MD5Single::_update)(uint32_t*, const void*, size_t) = &MD5Single_update_Scalar; void(*MD5Single::_updateZero)(uint32_t*, size_t) = &MD5Single_updateZero_Scalar; +MD5CRCMethods MD5Single::method = MD5CRCMETH_SCALAR; const size_t MD5_BLOCKSIZE = 64; void 
MD5Single::update(const void* data, size_t len) { uint_fast8_t buffered = dataLen & (MD5_BLOCKSIZE-1); @@ -415,3 +454,53 @@ void MD5Single::end(void* md5) { md5_final_block(md5State, tmp, dataLen, 0); memcpy(md5, md5State, 16); } + + +const char* hasherInput_methodName() { + const char* names[] = { + "Scalar + Slice4", +#ifdef PLATFORM_X86 + "SSE2 + Slice4", + "Scalar + PCLMUL", + "SSE2 + PCLMUL", +#elif defined(PLATFORM_ARM) + "NEON + Slice4", + "Scalar + ARMv8-CRC32", + "NEON + ARMv8-CRC32", +#else + "SIMD + Slice4", + "Scalar + CRC", + "SIMD + CRC", +#endif + "BMI1 + PCLMUL", + "AVX512" + }; + + return names[(int)HasherInput_Method]; +} +const char* hasherMD5Multi_methodName() { + const char* names[] = { + "Scalar", + "SSE2", + "AVX2", + "XOP", + "AVX512F", + "AVX512VL", + "NEON", + "SVE2" + }; + + return names[(int)HasherMD5Multi_level]; +} +const char* md5crc_methodName(MD5CRCMethods m) { + const char* names[] = { + "Scalar", + "BMI1", + "NoLEA", + "AVX512", + "ARMv8-CRC32", + "PCLMUL" + }; + + return names[(int)m]; +} diff --git a/hasher/hasher.h b/hasher/hasher.h index 801cee1c..a52c0bc7 100644 --- a/hasher/hasher.h +++ b/hasher/hasher.h @@ -30,6 +30,9 @@ bool set_hasherInput(HasherInputMethods method); void set_hasherMD5MultiLevel(MD5MultiLevels level); extern IHasherInput*(*HasherInput_Create)(); +const char* hasherInput_methodName(); +const char* hasherMD5Multi_methodName(); + class MD5Multi { std::vector ctx; std::vector lastCtxData; @@ -58,6 +61,9 @@ class MD5Multi { // single hash instances extern uint32_t(*CRC32_Calc)(const void*, size_t); +extern MD5CRCMethods CRC32_Method; extern uint32_t(*MD5CRC_Calc)(const void*, size_t, size_t, void*); +extern MD5CRCMethods MD5CRC_Method; +const char* md5crc_methodName(MD5CRCMethods m); #endif /* __HASHER_H */ diff --git a/hasher/hasher_impl.h b/hasher/hasher_impl.h index 75236e22..d908f4fb 100644 --- a/hasher/hasher_impl.h +++ b/hasher/hasher_impl.h @@ -5,6 +5,16 @@ #include "../src/platform.h" #include +enum 
MD5CRCMethods { + MD5CRCMETH_SCALAR, + // MD5 + MD5CRCMETH_BMI1, + MD5CRCMETH_NOLEA, + MD5CRCMETH_AVX512, + // CRC32 + MD5CRCMETH_ARMCRC, + MD5CRCMETH_PCLMUL +}; class MD5Single { public: @@ -16,6 +26,7 @@ class MD5Single { // private, set by setup_hasher static void(*_update)(uint32_t*, const void*, size_t); static void(*_updateZero)(uint32_t*, size_t); + static MD5CRCMethods method; // public, read-only // public interface void reset() { diff --git a/lib/par2.js b/lib/par2.js index c45f3ecf..5a26b685 100644 --- a/lib/par2.js +++ b/lib/par2.js @@ -946,6 +946,12 @@ module.exports = { set_outhash_method: function(method) { return binding.set_HasherOutput(getMethodNum(OUTHASH_METHODS, method)); }, + get_inhash_methodDesc: function() { + return binding.hasherInput_method; + }, + get_outhash_methodDesc: function() { + return binding.hasherOutput_method; + }, _extend: Object.assign || function(to) { for(var i=1; i Date: Mon, 14 Aug 2023 10:32:20 +1000 Subject: [PATCH 52/91] Remove support for non power-of-two SVE widths ARM has removed support for such vector sizes --- gf16/gf16mul.cpp | 7 +++++++ gf16/gf16mul.h | 10 ---------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/gf16/gf16mul.cpp b/gf16/gf16mul.cpp index 1094ced4..72500339 100644 --- a/gf16/gf16mul.cpp +++ b/gf16/gf16mul.cpp @@ -207,6 +207,13 @@ struct CpuCap { hasNEON = CPU_HAS_NEON; hasSVE = CPU_HAS_SVE; hasSVE2 = CPU_HAS_SVE2; + if(hasSVE) { + size_t sz = gf16_sve_get_size(); + if(sz & (sz-1)) { // we don't support non-pow2 vector widths + hasSVE = false; + hasSVE2 = false; + } + } } }; #endif diff --git a/gf16/gf16mul.h b/gf16/gf16mul.h index 8d47a8ff..5a3910b8 100644 --- a/gf16/gf16mul.h +++ b/gf16/gf16mul.h @@ -189,20 +189,10 @@ class Galois16Mul { static Galois16MethodInfo info(Galois16Methods _method); inline HEDLEY_CONST bool isMultipleOfStride(size_t len) const { -#if defined(_M_ARM64) || defined(__aarch64__) - // SVE can have non-power-of-2 strides - if(HEDLEY_UNLIKELY((_info.stride 
& (_info.stride-1)) != 0)) // ...but most of the time, expect stride to be a power of 2 - return (len % _info.stride) == 0; -#endif return (len & (_info.stride-1)) == 0; } inline HEDLEY_CONST size_t alignToStride(size_t len) const { size_t alignMask = _info.stride-1; -#if defined(_M_ARM64) || defined(__aarch64__) - if(HEDLEY_UNLIKELY((_info.stride & (_info.stride-1)) != 0)) { - return ((len + alignMask) / _info.stride) * _info.stride; - } -#endif return (len + alignMask) & ~alignMask; } From 9c7db32591558f885ec4ed320c2d722ca0b4b62f Mon Sep 17 00:00:00 2001 From: animetosho Date: Mon, 14 Aug 2023 11:22:01 +1000 Subject: [PATCH 53/91] Detect RVV intrinsic version --- gf16/gf16_checksum_rvv.h | 14 +++++++++++--- gf16/gf16_rvv_common.h | 7 ++++++- gf16/gf16_shuffle128_rvv.c | 25 +++++++++++++++---------- 3 files changed, 32 insertions(+), 14 deletions(-) diff --git a/gf16/gf16_checksum_rvv.h b/gf16/gf16_checksum_rvv.h index 615109b9..7ea94ba2 100644 --- a/gf16/gf16_checksum_rvv.h +++ b/gf16/gf16_checksum_rvv.h @@ -37,11 +37,14 @@ static HEDLEY_ALWAYS_INLINE void gf16_checksum_blocku_rvv(const void *HEDLEY_RES while(amount) { size_t vl = RV(vsetvl_e8m1)(amount); - // intrinsics lack tail-undisturbed, so emulate it +#if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 13000 + v8 = RV(vxor_vv_i8m1_tu)(v8, v8, RV(vle8_v_i8m1)(_src, vl), vl); +#else + // emulate tail-undisturbed vint8m1_t tmp = RV(vmv_v_x_i8m1)(0, vlmax); memcpy(&tmp, _src, vl); v8 = RV(vxor_vv_i8m1)(v8, tmp, vlmax); - //v8 = RV(vxor_vv_i8m1)(v8, RV(vle8_v_i8m1)(_src, vl), vl); +#endif amount -= vl; _src += vl; } @@ -58,7 +61,12 @@ static HEDLEY_ALWAYS_INLINE void gf16_checksum_exp_rvv(void *HEDLEY_RESTRICT che for(int i=0; i<15; i++) { res = gf16_vec_mul2_rvv(res); coeff = RV(vadd_vv_i16m1)(coeff, coeff, vl); - res = RV(vxor_vv_i16m1_m)( +#if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 13000 + res = RV(vxor_vv_i16m1_mu) +#else + res = RV(vxor_vv_i16m1_m) +#endif + ( 
RV(vmslt_vx_i16m1_b16)(coeff, 0, vl), res, res, _checksum, vl diff --git a/gf16/gf16_rvv_common.h b/gf16/gf16_rvv_common.h index 7bc43769..da608806 100644 --- a/gf16/gf16_rvv_common.h +++ b/gf16/gf16_rvv_common.h @@ -23,7 +23,12 @@ static HEDLEY_ALWAYS_INLINE vint16m1_t gf16_vec_mul2_rvv(vint16m1_t v) { size_t vl = RV(vsetvlmax_e16m1)(); vbool16_t maskPoly = RV(vmslt_vx_i16m1_b16)(v, 0, vl); v = RV(vadd_vv_i16m1)(v, v, vl); - return RV(vxor_vx_i16m1_m)( +#if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 13000 + return RV(vxor_vx_i16m1_mu) +#else + return RV(vxor_vx_i16m1_m) +#endif + ( maskPoly, v, v, GF16_POLYNOMIAL & 0xffff, diff --git a/gf16/gf16_shuffle128_rvv.c b/gf16/gf16_shuffle128_rvv.c index ba23a1a9..494fc4d0 100644 --- a/gf16/gf16_shuffle128_rvv.c +++ b/gf16/gf16_shuffle128_rvv.c @@ -9,12 +9,7 @@ int gf16_available_rvv = 0; #include "gf16_muladd_multi.h" #if defined(__RVV_LE) -// TODO: detect intrinsics version -# if 1 -// intrinsics v0.11.x (up to at least GCC 13 / Clang 16) -# define _vlseg2e8 RV(vlseg2e8_v_u8m1) -# define _vsseg2e8 RV(vsseg2e8_v_u8m1) -# else +# if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 12000 // intrinsics v0.12.x static HEDLEY_ALWAYS_INLINE void _vlseg2e8(vuint8m1_t* v0, vuint8m1_t* v1, const uint8_t* src, size_t vl) { vuint8m1x2_t d = RV(vlseg2e8_v_u8m1x2)(src, vl); @@ -27,6 +22,10 @@ static HEDLEY_ALWAYS_INLINE void _vsseg2e8(uint8_t* dst, vuint8m1_t v0, vuint8m1 d = RV(vset_v_u8m1_u8m1x2)(d, 1, v1); RV(vsseg2e8_v_u8m1x2)(dst, d, vl); } +# else +// intrinsics v0.11.x (up to at least GCC 13 / Clang 16) +# define _vlseg2e8 RV(vlseg2e8_v_u8m1) +# define _vsseg2e8 RV(vsseg2e8_v_u8m1) # endif static HEDLEY_ALWAYS_INLINE void gf16_shuffle_128_rvv_calc_table(vuint8m1_t poly_l, uint16_t val, @@ -217,11 +216,17 @@ static HEDLEY_ALWAYS_INLINE void gf16_prepare_block_rvv(void *HEDLEY_RESTRICT ds } // final block static HEDLEY_ALWAYS_INLINE void gf16_prepare_blocku_rvv(void *HEDLEY_RESTRICT dst, const void 
*HEDLEY_RESTRICT src, size_t remaining) { - // current intrinsics don't seem to support tail-undisturbed policy, so zero explicitly for now - size_t vl = RV(vsetvlmax_e8m2)(); - RV(vse8_v_u8m2)((uint8_t*)dst, RV(vmv_v_x_u8m2)(0, vl), vl); - vl = RV(vsetvl_e8m2)(remaining); + size_t vlmax = RV(vsetvlmax_e8m2)(); + vuint8m1_t v = RV(vmv_v_x_u8m2)(0, vlmax); + size_t vl = RV(vsetvl_e8m2)(remaining); +#if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 13000 + v = RV(vle8_v_u8m2_tu)(v, (const uint8_t*)src, vl); + RV(vse8_v_u8m2)((uint8_t*)dst, v, vlmax); +#else + // tail-undisturbed not supported, so zero explicitly as a workaround + RV(vse8_v_u8m2)((uint8_t*)dst, v, vlmax); RV(vse8_v_u8m2)((uint8_t*)dst, RV(vle8_v_u8m2)((const uint8_t*)src, vl), vl); +#endif } static HEDLEY_ALWAYS_INLINE void gf16_finish_blocku_rvv(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t remaining) { size_t vl = RV(vsetvl_e8m2)(remaining); From 6d0d71f3af4104b6193da70e176ba2978aa567b5 Mon Sep 17 00:00:00 2001 From: animetosho Date: Mon, 14 Aug 2023 11:50:46 +1000 Subject: [PATCH 54/91] Suppress GCC 12's warnings for some AVX512 intrinsics --- hasher/hasher_avx512.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/hasher/hasher_avx512.cpp b/hasher/hasher_avx512.cpp index 9b66747d..8c6678f1 100644 --- a/hasher/hasher_avx512.cpp +++ b/hasher/hasher_avx512.cpp @@ -1,5 +1,14 @@ +// suppress warning spam in GCC 12.0-12.2 (caused by some AVX512 intrinsics) +#include "../src/hedley.h" +#if HEDLEY_GCC_VERSION_CHECK(12,0,0) && !HEDLEY_GCC_VERSION_CHECK(12,3,0) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wuninitialized" +# pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif + #include "../src/platform.h" + #define MD5Multi MD5Multi_AVX512 #define _FNMD5mb(f) f##_avx512 #define _FNMD5mb2(f) f##_avx512 From 60090b950038769466f521221586ed9b09299253 Mon Sep 17 00:00:00 2001 From: animetosho Date: Mon, 14 Aug 2023 15:12:00 +1000 Subject: 
[PATCH 55/91] Create NEON-SHA3 variant of CLMul and move M1 optimisation there Also add ability to bypass getauxval checks, for testing --- binding.gyp | 36 ++++++- gf16/gf16_clmul.h | 17 +++- gf16/gf16_clmul_neon.c | 193 ++---------------------------------- gf16/gf16_clmul_neon.h | 4 +- gf16/gf16_clmul_neon_base.h | 172 ++++++++++++++++++++++++++++++++ gf16/gf16_clmul_sha3.c | 47 +++++++++ gf16/gf16mul.cpp | 40 ++++++++ gf16/gf16mul.h | 2 + gf16/gf16pmul_neon.c | 6 ++ help.txt | 1 + lib/par2.js | 2 +- src/cpuid.h | 105 ++++++++++++-------- 12 files changed, 388 insertions(+), 237 deletions(-) create mode 100644 gf16/gf16_clmul_neon_base.h create mode 100644 gf16/gf16_clmul_sha3.c diff --git a/binding.gyp b/binding.gyp index 89065506..86757b38 100644 --- a/binding.gyp +++ b/binding.gyp @@ -37,7 +37,7 @@ { "target_name": "parpar_gf", "dependencies": [ - "parpar_gf_c", "gf16", "gf16_generic", "gf16_sse2", "gf16_ssse3", "gf16_avx", "gf16_avx2", "gf16_avx512", "gf16_vbmi", "gf16_gfni", "gf16_gfni_avx2", "gf16_gfni_avx512", "gf16_neon", "gf16_sve", "gf16_sve2", "gf16_rvv", + "parpar_gf_c", "gf16", "gf16_generic", "gf16_sse2", "gf16_ssse3", "gf16_avx", "gf16_avx2", "gf16_avx512", "gf16_vbmi", "gf16_gfni", "gf16_gfni_avx2", "gf16_gfni_avx512", "gf16_neon", "gf16_sha3", "gf16_sve", "gf16_sve2", "gf16_rvv", "hasher", "hasher_sse2", "hasher_clmul", "hasher_xop", "hasher_bmi1", "hasher_avx2", "hasher_avx512", "hasher_avx512vl", "hasher_armcrc", "hasher_neon", "hasher_neoncrc", "hasher_sve2" ], "sources": ["src/gf.cc", "gf16/controller.cpp", "gf16/controller_cpu.cpp", "gf16/controller_ocl.cpp", "gf16/controller_ocl_init.cpp"], @@ -794,6 +794,40 @@ }] ] }, + { + "target_name": "gf16_sha3", + "type": "static_library", + "defines": ["NDEBUG"], + "sources": [ + "gf16/gf16_clmul_sha3.c" + ], + "cflags": ["-Wno-unused-function", "-std=c99"], + "xcode_settings": { + "OTHER_CFLAGS": ["-Wno-unused-function"], + "OTHER_CFLAGS!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", 
"-fno-strict-aliasing"] + }, + "cflags!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"], + "msvs_settings": {"VCCLCompilerTool": {"BufferSecurityCheck": "false"}}, + "conditions": [ + ['target_arch=="arm64" and OS!="win"', { + "variables": {"supports_sha3%": "/dev/null || true)"}, + "conditions": [ + ['supports_sha3!=""', { + "cflags!": ["-march=native"], + "cxxflags!": ["-march=native"], + "cflags": ["-march=armv8.2-a+sha3"], + "cxxflags": ["-march=armv8.2-a+sha3"], + "xcode_settings": { + "OTHER_CFLAGS!": ["-march=native"], + "OTHER_CXXFLAGS!": ["-march=native"], + "OTHER_CFLAGS": ["-march=armv8.2-a+sha3"], + "OTHER_CXXFLAGS": ["-march=armv8.2-a+sha3"], + } + }] + ] + }] + ] + }, { "target_name": "gf16_sve", "type": "static_library", diff --git a/gf16/gf16_clmul.h b/gf16/gf16_clmul.h index d8f189c2..23845a6c 100644 --- a/gf16/gf16_clmul.h +++ b/gf16/gf16_clmul.h @@ -7,18 +7,27 @@ void gf16_clmul_muladd_multi_packed_##v(const void *HEDLEY_RESTRICT scratch, unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); \ void gf16_clmul_muladd_multi_packpf_##v(const void *HEDLEY_RESTRICT scratch, unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut); \ void gf16_clmul_mul_##v(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ - void gf16_clmul_muladd_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ - void gf16_clmul_prepare_packed_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, 
size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); \ - void gf16_clmul_prepare_packed_cksum_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); \ - void gf16_clmul_prepare_partial_packsum_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen, size_t partOffset, size_t partLen) + void gf16_clmul_muladd_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) // this is the same as the shuffle version, so re-use that //int gf16_clmul_finish_packed_cksum_neon(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen); //int gf16_clmul_finish_partial_packsum_neon(void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen, size_t partOffset, size_t partLen); +FUNCS(neon); +FUNCS(sha3); +FUNCS(sve2); + +#undef FUNCS + +#define FUNCS(v) \ + void gf16_clmul_prepare_packed_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); \ + void gf16_clmul_prepare_packed_cksum_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); \ + void gf16_clmul_prepare_partial_packsum_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen, size_t partOffset, size_t partLen) + FUNCS(neon); FUNCS(sve2); #undef FUNCS int gf16_clmul_init_arm(int polynomial); +extern int gf16_available_neon_sha3; diff 
--git a/gf16/gf16_clmul_neon.c b/gf16/gf16_clmul_neon.c index 95a89cf2..5784a715 100644 --- a/gf16/gf16_clmul_neon.c +++ b/gf16/gf16_clmul_neon.c @@ -1,203 +1,24 @@ -#include "gf16_clmul_neon.h" -#include "gf16_muladd_multi.h" +#include "gf16_neon_common.h" // TODO: for any multiplicand byte that's 0 (e.g. for coeff < 256), can shortcut a bunch of stuff, but may not be worth the effort #if defined(__ARM_NEON) -// NOTE: we avoid EOR3 in pmacl* - only chip which supports NEON-SHA3 without SVE2, are the Apple chips and Neoverse V1; the former has PMULL+EOR fusion, which is better than EOR3 -#if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__)) && defined(__APPLE__) -// Apple M1 supports fusing PMULL+EOR, so ensure these are paired -static HEDLEY_ALWAYS_INLINE poly16x8_t pmacl_low(poly16x8_t sum, poly8x16_t a, poly8x16_t b) { - poly16x8_t result; - __asm__ ("pmull %0.8h,%1.8b,%2.8b\n" - "eor %0.16b,%0.16b,%3.16b\n" - : "=&w"(result) - : "w"(a), "w"(b), "w"(sum) - : /* No clobbers */); - return result; -} -static HEDLEY_ALWAYS_INLINE poly16x8_t pmacl_high(poly16x8_t sum, poly8x16_t a, poly8x16_t b) { - poly16x8_t result; - __asm__ ("pmull2 %0.8h,%1.16b,%2.16b\n" - "eor %0.16b,%0.16b,%3.16b\n" - : "=&w"(result) - : "w"(a), "w"(b), "w"(sum) - : /* No clobbers */); - return result; -} -#else static HEDLEY_ALWAYS_INLINE poly16x8_t veorq_p16(poly16x8_t a, poly16x8_t b) { return vreinterpretq_p16_u16(veorq_u16(vreinterpretq_u16_p16(a), vreinterpretq_u16_p16(b))); } -# define pmacl_low(sum, a, b) veorq_p16(sum, pmull_low(a, b)) -# define pmacl_high(sum, a, b) veorq_p16(sum, pmull_high(a, b)) -#endif - -static HEDLEY_ALWAYS_INLINE void gf16_clmul_neon_round1(const void* src, poly16x8_t* low1, poly16x8_t* low2, poly16x8_t* mid1, poly16x8_t* mid2, poly16x8_t* high1, poly16x8_t* high2, const coeff_t* coeff) { - poly8x16x2_t data = vld2q_p8((const poly8_t*)src); - *low1 = pmull_low(data.val[0], coeff[0]); - *low2 = pmull_high(data.val[0], coeff[0]); - poly8x16_t mid 
= veorq_p8(data.val[0], data.val[1]); - *mid1 = pmull_low(mid, coeff[2]); - *mid2 = pmull_high(mid, coeff[2]); - *high1 = pmull_low(data.val[1], coeff[1]); - *high2 = pmull_high(data.val[1], coeff[1]); - - // TODO: try idea of forcing an EOR via asm volatile - -/* Alternative approach for AArch64, which only needs one register per region at the expense of 2 additional instructions; unfortunately compilers won't heed our aim - // the `midCoeff` approach can also work with AArch32 - coeff_t swapCoeff = vextq_p8(coeff[0], coeff[0], 8); - coeff_t midCoeff = veorq_p8(coeff[0], swapCoeff); - - *low1 = pmull_low(data.val[0], coeff[0]); - *low2 = pmull_high(data.val[0], swapCoeff); - poly8x16_t mid = veorq_p8(data.val[0], data.val[1]); - *mid1 = pmull_low(mid, midCoeff); - *mid2 = pmull_high(mid, midCoeff); - *high1 = pmull_low(data.val[1], swapCoeff); - *high2 = pmull_high(data.val[1], coeff[0]); -*/ -} - -static HEDLEY_ALWAYS_INLINE void gf16_clmul_neon_round(const void* src, poly16x8_t* low1, poly16x8_t* low2, poly16x8_t* mid1, poly16x8_t* mid2, poly16x8_t* high1, poly16x8_t* high2, const coeff_t* coeff) { - poly8x16x2_t data = vld2q_p8((const poly8_t*)src); - *low1 = pmacl_low(*low1, data.val[0], coeff[0]); - *low2 = pmacl_high(*low2, data.val[0], coeff[0]); - poly8x16_t mid = veorq_p8(data.val[0], data.val[1]); - *mid1 = pmacl_low(*mid1, mid, coeff[2]); - *mid2 = pmacl_high(*mid2, mid, coeff[2]); - *high1 = pmacl_low(*high1, data.val[1], coeff[1]); - *high2 = pmacl_high(*high2, data.val[1], coeff[1]); -} +#define pmacl_low(sum, a, b) veorq_p16(sum, pmull_low(a, b)) +#define pmacl_high(sum, a, b) veorq_p16(sum, pmull_high(a, b)) +#define _AVAILABLE 1 -#ifdef __aarch64__ -# define CLMUL_NUM_REGIONS 8 -#else -# define CLMUL_NUM_REGIONS 3 -#endif -#define CLMUL_COEFF_PER_REGION 3 - -static HEDLEY_ALWAYS_INLINE void gf16_clmul_muladd_x_neon( - const void *HEDLEY_RESTRICT scratch, - uint8_t *HEDLEY_RESTRICT _dst, const unsigned srcScale, GF16_MULADD_MULTI_SRCLIST, size_t 
len, - const uint16_t *HEDLEY_RESTRICT coefficients, const int doPrefetch, const char* _pf -) { - GF16_MULADD_MULTI_SRC_UNUSED(CLMUL_NUM_REGIONS); - UNUSED(scratch); - - coeff_t coeff[CLMUL_COEFF_PER_REGION*CLMUL_NUM_REGIONS]; - for(int src=0; src> 8; - coeff[src*CLMUL_COEFF_PER_REGION +0] = coeff_fn(vdup, n_p8)(lo); - coeff[src*CLMUL_COEFF_PER_REGION +1] = coeff_fn(vdup, n_p8)(hi); - coeff[src*CLMUL_COEFF_PER_REGION +2] = coeff_fn(veor, p8)(coeff[src*CLMUL_COEFF_PER_REGION +0], coeff[src*CLMUL_COEFF_PER_REGION +1]); - // if we want to have one register per region (AArch64), at the expense of 2 extra instructions per region - //coeff[src] = vcombine_p8(vdup_n_p8(lo), vdup_n_p8(hi)); - } - - poly16x8_t low1, low2, mid1, mid2, high1, high2; - #define DO_PROCESS \ - gf16_clmul_neon_round1(_src1+ptr*srcScale, &low1, &low2, &mid1, &mid2, &high1, &high2, coeff + 0); \ - if(srcCount > 1) \ - gf16_clmul_neon_round(_src2+ptr*srcScale, &low1, &low2, &mid1, &mid2, &high1, &high2, coeff + CLMUL_COEFF_PER_REGION*1); \ - if(srcCount > 2) \ - gf16_clmul_neon_round(_src3+ptr*srcScale, &low1, &low2, &mid1, &mid2, &high1, &high2, coeff + CLMUL_COEFF_PER_REGION*2); \ - if(srcCount > 3) \ - gf16_clmul_neon_round(_src4+ptr*srcScale, &low1, &low2, &mid1, &mid2, &high1, &high2, coeff + CLMUL_COEFF_PER_REGION*3); \ - if(srcCount > 4) \ - gf16_clmul_neon_round(_src5+ptr*srcScale, &low1, &low2, &mid1, &mid2, &high1, &high2, coeff + CLMUL_COEFF_PER_REGION*4); \ - if(srcCount > 5) \ - gf16_clmul_neon_round(_src6+ptr*srcScale, &low1, &low2, &mid1, &mid2, &high1, &high2, coeff + CLMUL_COEFF_PER_REGION*5); \ - if(srcCount > 6) \ - gf16_clmul_neon_round(_src7+ptr*srcScale, &low1, &low2, &mid1, &mid2, &high1, &high2, coeff + CLMUL_COEFF_PER_REGION*6); \ - if(srcCount > 7) \ - gf16_clmul_neon_round(_src8+ptr*srcScale, &low1, &low2, &mid1, &mid2, &high1, &high2, coeff + CLMUL_COEFF_PER_REGION*7); \ - \ - gf16_clmul_neon_reduction(&low1, low2, mid1, mid2, &high1, high2); \ - \ - uint8x16x2_t vb = 
vld2q_u8(_dst+ptr); \ - vb.val[0] = veorq_u8(vreinterpretq_u8_p16(low1), vb.val[0]); \ - vb.val[1] = veorq_u8(vreinterpretq_u8_p16(high1), vb.val[1]); \ - vst2q_u8(_dst+ptr, vb) - - if(doPrefetch) { - intptr_t ptr = -(intptr_t)len; - if(doPrefetch == 1) - PREFETCH_MEM(_pf+ptr, 1); - if(doPrefetch == 2) - PREFETCH_MEM(_pf+ptr, 0); - while(ptr & (CACHELINE_SIZE-1)) { - DO_PROCESS; - ptr += sizeof(uint8x16_t)*2; - } - while(ptr) { - if(doPrefetch == 1) - PREFETCH_MEM(_pf+ptr, 1); - if(doPrefetch == 2) - PREFETCH_MEM(_pf+ptr, 0); - - for(size_t iter=0; iter<(CACHELINE_SIZE/(sizeof(uint8x16_t)*2)); iter++) { - DO_PROCESS; - ptr += sizeof(uint8x16_t)*2; - } - } - } else { - for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(uint8x16_t)*2) { - DO_PROCESS; - } - } - #undef DO_PROCESS -} #endif /*defined(__ARM_NEON)*/ - -void gf16_clmul_mul_neon(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t val, void *HEDLEY_RESTRICT mutScratch) { - UNUSED(mutScratch); UNUSED(scratch); -#if defined(__ARM_NEON) - - coeff_t coeff[3]; - coeff[0] = coeff_fn(vdup, n_p8)(val & 0xff); - coeff[1] = coeff_fn(vdup, n_p8)(val >> 8); - coeff[2] = coeff_fn(veor, p8)(coeff[0], coeff[1]); - - uint8_t* _src = (uint8_t*)src + len; - uint8_t* _dst = (uint8_t*)dst + len; - poly16x8_t low1, low2, mid1, mid2, high1, high2; - for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(uint8x16_t)*2) { - gf16_clmul_neon_round1(_src+ptr, &low1, &low2, &mid1, &mid2, &high1, &high2, coeff); - gf16_clmul_neon_reduction(&low1, low2, mid1, mid2, &high1, high2); - uint8x16x2_t out; - out.val[0] = vreinterpretq_u8_p16(low1); - out.val[1] = vreinterpretq_u8_p16(high1); - vst2q_u8(_dst+ptr, out); - } -#else - UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(val); -#endif -} - - -void gf16_clmul_muladd_neon(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t val, void *HEDLEY_RESTRICT mutScratch) { - UNUSED(mutScratch); -#if 
defined(__ARM_NEON) - gf16_muladd_single(scratch, &gf16_clmul_muladd_x_neon, dst, src, len, val); -#else - UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(val); -#endif -} - - -#if defined(__ARM_NEON) -GF16_MULADD_MULTI_FUNCS(gf16_clmul, _neon, gf16_clmul_muladd_x_neon, CLMUL_NUM_REGIONS, sizeof(uint8x16_t)*2, 0, (void)0) -#else -GF16_MULADD_MULTI_FUNCS_STUB(gf16_clmul, _neon) -#endif +#define _FNSUFFIX _neon +#include "gf16_clmul_neon_base.h" +#undef _FNSUFFIX #if defined(__ARM_NEON) diff --git a/gf16/gf16_clmul_neon.h b/gf16/gf16_clmul_neon.h index a638101f..a0fa0dc2 100644 --- a/gf16/gf16_clmul_neon.h +++ b/gf16/gf16_clmul_neon.h @@ -1,6 +1,6 @@ #include "gf16_neon_common.h" -#if defined(__ARM_NEON) +#if defined(_AVAILABLE) // `vaddq_p8` and co seems to be missing from some compilers (like GCC), so define our own variant static HEDLEY_ALWAYS_INLINE poly8x16_t veorq_p8(poly8x16_t a, poly8x16_t b) { @@ -42,9 +42,11 @@ typedef poly8x8_t coeff_t; # define coeff_fn(f1, f2) f1##_##f2 #endif +#ifndef eor3q_u8 static HEDLEY_ALWAYS_INLINE uint8x16_t eor3q_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) { return veorq_u8(a, veorq_u8(b, c)); } +#endif static HEDLEY_ALWAYS_INLINE void gf16_clmul_neon_reduction(poly16x8_t* low1, poly16x8_t low2, poly16x8_t mid1, poly16x8_t mid2, poly16x8_t* high1, poly16x8_t high2) { // put data in proper form diff --git a/gf16/gf16_clmul_neon_base.h b/gf16/gf16_clmul_neon_base.h new file mode 100644 index 00000000..59613c71 --- /dev/null +++ b/gf16/gf16_clmul_neon_base.h @@ -0,0 +1,172 @@ + +#include "gf16_clmul_neon.h" +#include "gf16_muladd_multi.h" + +// TODO: for any multiplicand byte that's 0 (e.g. 
for coeff < 256), can shortcut a bunch of stuff, but may not be worth the effort + +#if defined(_AVAILABLE) + + +static HEDLEY_ALWAYS_INLINE void gf16_clmul_neon_round1(const void* src, poly16x8_t* low1, poly16x8_t* low2, poly16x8_t* mid1, poly16x8_t* mid2, poly16x8_t* high1, poly16x8_t* high2, const coeff_t* coeff) { + poly8x16x2_t data = vld2q_p8((const poly8_t*)src); + *low1 = pmull_low(data.val[0], coeff[0]); + *low2 = pmull_high(data.val[0], coeff[0]); + poly8x16_t mid = veorq_p8(data.val[0], data.val[1]); + *mid1 = pmull_low(mid, coeff[2]); + *mid2 = pmull_high(mid, coeff[2]); + *high1 = pmull_low(data.val[1], coeff[1]); + *high2 = pmull_high(data.val[1], coeff[1]); + + // TODO: try idea of forcing an EOR via asm volatile + +/* Alternative approach for AArch64, which only needs one register per region at the expense of 2 additional instructions; unfortunately compilers won't heed our aim + // the `midCoeff` approach can also work with AArch32 + coeff_t swapCoeff = vextq_p8(coeff[0], coeff[0], 8); + coeff_t midCoeff = veorq_p8(coeff[0], swapCoeff); + + *low1 = pmull_low(data.val[0], coeff[0]); + *low2 = pmull_high(data.val[0], swapCoeff); + poly8x16_t mid = veorq_p8(data.val[0], data.val[1]); + *mid1 = pmull_low(mid, midCoeff); + *mid2 = pmull_high(mid, midCoeff); + *high1 = pmull_low(data.val[1], swapCoeff); + *high2 = pmull_high(data.val[1], coeff[0]); +*/ +} + +static HEDLEY_ALWAYS_INLINE void gf16_clmul_neon_round(const void* src, poly16x8_t* low1, poly16x8_t* low2, poly16x8_t* mid1, poly16x8_t* mid2, poly16x8_t* high1, poly16x8_t* high2, const coeff_t* coeff) { + poly8x16x2_t data = vld2q_p8((const poly8_t*)src); + *low1 = pmacl_low(*low1, data.val[0], coeff[0]); + *low2 = pmacl_high(*low2, data.val[0], coeff[0]); + poly8x16_t mid = veorq_p8(data.val[0], data.val[1]); + *mid1 = pmacl_low(*mid1, mid, coeff[2]); + *mid2 = pmacl_high(*mid2, mid, coeff[2]); + *high1 = pmacl_low(*high1, data.val[1], coeff[1]); + *high2 = pmacl_high(*high2, data.val[1], 
coeff[1]); +} + + +#ifdef __aarch64__ +# define CLMUL_NUM_REGIONS 8 +#else +# define CLMUL_NUM_REGIONS 3 +#endif +#define CLMUL_COEFF_PER_REGION 3 + +static HEDLEY_ALWAYS_INLINE void _FN(gf16_clmul_muladd_x)( + const void *HEDLEY_RESTRICT scratch, + uint8_t *HEDLEY_RESTRICT _dst, const unsigned srcScale, GF16_MULADD_MULTI_SRCLIST, size_t len, + const uint16_t *HEDLEY_RESTRICT coefficients, const int doPrefetch, const char* _pf +) { + GF16_MULADD_MULTI_SRC_UNUSED(CLMUL_NUM_REGIONS); + UNUSED(scratch); + + coeff_t coeff[CLMUL_COEFF_PER_REGION*CLMUL_NUM_REGIONS]; + for(int src=0; src> 8; + coeff[src*CLMUL_COEFF_PER_REGION +0] = coeff_fn(vdup, n_p8)(lo); + coeff[src*CLMUL_COEFF_PER_REGION +1] = coeff_fn(vdup, n_p8)(hi); + coeff[src*CLMUL_COEFF_PER_REGION +2] = coeff_fn(veor, p8)(coeff[src*CLMUL_COEFF_PER_REGION +0], coeff[src*CLMUL_COEFF_PER_REGION +1]); + // if we want to have one register per region (AArch64), at the expense of 2 extra instructions per region + //coeff[src] = vcombine_p8(vdup_n_p8(lo), vdup_n_p8(hi)); + } + + poly16x8_t low1, low2, mid1, mid2, high1, high2; + #define DO_PROCESS \ + gf16_clmul_neon_round1(_src1+ptr*srcScale, &low1, &low2, &mid1, &mid2, &high1, &high2, coeff + 0); \ + if(srcCount > 1) \ + gf16_clmul_neon_round(_src2+ptr*srcScale, &low1, &low2, &mid1, &mid2, &high1, &high2, coeff + CLMUL_COEFF_PER_REGION*1); \ + if(srcCount > 2) \ + gf16_clmul_neon_round(_src3+ptr*srcScale, &low1, &low2, &mid1, &mid2, &high1, &high2, coeff + CLMUL_COEFF_PER_REGION*2); \ + if(srcCount > 3) \ + gf16_clmul_neon_round(_src4+ptr*srcScale, &low1, &low2, &mid1, &mid2, &high1, &high2, coeff + CLMUL_COEFF_PER_REGION*3); \ + if(srcCount > 4) \ + gf16_clmul_neon_round(_src5+ptr*srcScale, &low1, &low2, &mid1, &mid2, &high1, &high2, coeff + CLMUL_COEFF_PER_REGION*4); \ + if(srcCount > 5) \ + gf16_clmul_neon_round(_src6+ptr*srcScale, &low1, &low2, &mid1, &mid2, &high1, &high2, coeff + CLMUL_COEFF_PER_REGION*5); \ + if(srcCount > 6) \ + 
gf16_clmul_neon_round(_src7+ptr*srcScale, &low1, &low2, &mid1, &mid2, &high1, &high2, coeff + CLMUL_COEFF_PER_REGION*6); \ + if(srcCount > 7) \ + gf16_clmul_neon_round(_src8+ptr*srcScale, &low1, &low2, &mid1, &mid2, &high1, &high2, coeff + CLMUL_COEFF_PER_REGION*7); \ + \ + gf16_clmul_neon_reduction(&low1, low2, mid1, mid2, &high1, high2); \ + \ + uint8x16x2_t vb = vld2q_u8(_dst+ptr); \ + vb.val[0] = veorq_u8(vreinterpretq_u8_p16(low1), vb.val[0]); \ + vb.val[1] = veorq_u8(vreinterpretq_u8_p16(high1), vb.val[1]); \ + vst2q_u8(_dst+ptr, vb) + + if(doPrefetch) { + intptr_t ptr = -(intptr_t)len; + if(doPrefetch == 1) + PREFETCH_MEM(_pf+ptr, 1); + if(doPrefetch == 2) + PREFETCH_MEM(_pf+ptr, 0); + while(ptr & (CACHELINE_SIZE-1)) { + DO_PROCESS; + ptr += sizeof(uint8x16_t)*2; + } + while(ptr) { + if(doPrefetch == 1) + PREFETCH_MEM(_pf+ptr, 1); + if(doPrefetch == 2) + PREFETCH_MEM(_pf+ptr, 0); + + for(size_t iter=0; iter<(CACHELINE_SIZE/(sizeof(uint8x16_t)*2)); iter++) { + DO_PROCESS; + ptr += sizeof(uint8x16_t)*2; + } + } + } else { + for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(uint8x16_t)*2) { + DO_PROCESS; + } + } + #undef DO_PROCESS +} +#endif /*defined(_AVAILABLE)*/ + + + +void _FN(gf16_clmul_mul)(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t val, void *HEDLEY_RESTRICT mutScratch) { + UNUSED(mutScratch); UNUSED(scratch); +#if defined(_AVAILABLE) + + coeff_t coeff[3]; + coeff[0] = coeff_fn(vdup, n_p8)(val & 0xff); + coeff[1] = coeff_fn(vdup, n_p8)(val >> 8); + coeff[2] = coeff_fn(veor, p8)(coeff[0], coeff[1]); + + uint8_t* _src = (uint8_t*)src + len; + uint8_t* _dst = (uint8_t*)dst + len; + poly16x8_t low1, low2, mid1, mid2, high1, high2; + for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(uint8x16_t)*2) { + gf16_clmul_neon_round1(_src+ptr, &low1, &low2, &mid1, &mid2, &high1, &high2, coeff); + gf16_clmul_neon_reduction(&low1, low2, mid1, mid2, &high1, high2); + uint8x16x2_t out; + out.val[0] = 
vreinterpretq_u8_p16(low1); + out.val[1] = vreinterpretq_u8_p16(high1); + vst2q_u8(_dst+ptr, out); + } +#else + UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(val); +#endif +} + + +void _FN(gf16_clmul_muladd)(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t val, void *HEDLEY_RESTRICT mutScratch) { + UNUSED(mutScratch); +#if defined(_AVAILABLE) + gf16_muladd_single(scratch, &_FN(gf16_clmul_muladd_x), dst, src, len, val); +#else + UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(val); +#endif +} + + +#if defined(_AVAILABLE) +GF16_MULADD_MULTI_FUNCS(gf16_clmul, _FNSUFFIX, _FN(gf16_clmul_muladd_x), CLMUL_NUM_REGIONS, sizeof(uint8x16_t)*2, 0, (void)0) +#else +GF16_MULADD_MULTI_FUNCS_STUB(gf16_clmul, _FNSUFFIX) +#endif diff --git a/gf16/gf16_clmul_sha3.c b/gf16/gf16_clmul_sha3.c new file mode 100644 index 00000000..d9424a3f --- /dev/null +++ b/gf16/gf16_clmul_sha3.c @@ -0,0 +1,47 @@ + +// this CLMul variant is optimised for Apple M1 + +#include "gf16_neon_common.h" + +#if defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA3) +int gf16_available_neon_sha3 = 1; + +// NOTE: we avoid EOR3 in pmacl* - only chip which supports NEON-SHA3 without SVE2, are the Apple chips and Neoverse V1; the former has PMULL+EOR fusion, which is better than EOR3 +#if defined(__GNUC__) || defined(__clang__) +// Apple M1 supports fusing PMULL+EOR, so ensure these are paired +static HEDLEY_ALWAYS_INLINE poly16x8_t pmacl_low(poly16x8_t sum, poly8x16_t a, poly8x16_t b) { + poly16x8_t result; + __asm__ ("pmull %0.8h,%1.8b,%2.8b\n" + "eor %0.16b,%0.16b,%3.16b\n" + : "=&w"(result) + : "w"(a), "w"(b), "w"(sum) + : /* No clobbers */); + return result; +} +static HEDLEY_ALWAYS_INLINE poly16x8_t pmacl_high(poly16x8_t sum, poly8x16_t a, poly8x16_t b) { + poly16x8_t result; + __asm__ ("pmull2 %0.8h,%1.16b,%2.16b\n" + "eor %0.16b,%0.16b,%3.16b\n" + : "=&w"(result) + : "w"(a), "w"(b), "w"(sum) + : /* No clobbers */); + return 
result; +} +#else +static HEDLEY_ALWAYS_INLINE poly16x8_t veorq_p16(poly16x8_t a, poly16x8_t b) { + return vreinterpretq_p16_u16(veorq_u16(vreinterpretq_u16_p16(a), vreinterpretq_u16_p16(b))); +} +# define pmacl_low(sum, a, b) veorq_p16(sum, pmull_low(a, b)) +# define pmacl_high(sum, a, b) veorq_p16(sum, pmull_high(a, b)) +#endif + +#define _AVAILABLE 1 +#define eor3q_u8 veor3q_u8 + +#else +int gf16_available_neon_sha3 = 0; +#endif /*defined(__ARM_NEON)*/ + +#define _FNSUFFIX _sha3 +#include "gf16_clmul_neon_base.h" +#undef _FNSUFFIX diff --git a/gf16/gf16mul.cpp b/gf16/gf16mul.cpp index 72500339..1aa04723 100644 --- a/gf16/gf16mul.cpp +++ b/gf16/gf16mul.cpp @@ -200,11 +200,13 @@ struct CpuCap { struct CpuCap { bool hasNEON; + bool hasSHA3; bool hasSVE; bool hasSVE2; CpuCap(bool detect) : hasNEON(true), hasSVE(true), hasSVE2(true) { if(!detect) return; hasNEON = CPU_HAS_NEON; + hasSHA3 = CPU_HAS_NEON_SHA3; hasSVE = CPU_HAS_SVE; hasSVE2 = CPU_HAS_SVE2; if(hasSVE) { @@ -295,6 +297,7 @@ Galois16MethodInfo Galois16Mul::info(Galois16Methods _method) { break; case GF16_CLMUL_NEON: + case GF16_CLMUL_SHA3: _info.alignment = 32; // presumably double-loads work best when aligned to 32 instead of 16? 
_info.stride = 32; _info.cksumSize = 16; @@ -483,6 +486,7 @@ Galois16MethodInfo Galois16Mul::info(Galois16Methods _method) { _info.idealChunkSize = 4*1024; break; case GF16_CLMUL_NEON: // faster init than Shuffle, and usually faster + case GF16_CLMUL_SHA3: case GF16_CLMUL_SVE2: // may want smaller chunk size for wider vectors case GF16_AFFINE_GFNI: case GF16_AFFINE2X_GFNI: @@ -756,6 +760,35 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { copy_cksum_check = &gf16_cksum_copy_check_neon; } break; + case GF16_CLMUL_SHA3: { + int available = gf16_clmul_init_arm(GF16_POLYNOMIAL); + METHOD_REQUIRES(gf16_available_neon_sha3 && available) + + scratch = gf16_shuffle_init_arm(GF16_POLYNOMIAL); + if(scratch) { + _mul = &gf16_shuffle_mul_neon; + _mul_add = &gf16_shuffle_muladd_neon; + } else { + _mul = &gf16_clmul_mul_sha3; + _mul_add = &gf16_clmul_muladd_sha3; + } + _mul_add_multi = &gf16_clmul_muladd_multi_sha3; + _mul_add_multi_stridepf = &gf16_clmul_muladd_multi_stridepf_sha3; + _mul_add_multi_packed = &gf16_clmul_muladd_multi_packed_sha3; + add_multi = &gf_add_multi_neon; + add_multi_packed = &gf_add_multi_packed_clmul_neon; + add_multi_packpf = &gf_add_multi_packpf_clmul_neon; + _mul_add_multi_packpf = &gf16_clmul_muladd_multi_packpf_sha3; + prepare_packed = &gf16_clmul_prepare_packed_neon; + prepare_packed_cksum = &gf16_clmul_prepare_packed_cksum_neon; + prepare_partial_packsum = &gf16_clmul_prepare_partial_packsum_neon; + finish_packed = &gf16_shuffle_finish_packed_neon; + finish_packed_cksum = &gf16_shuffle_finish_packed_cksum_neon; // re-use shuffle routine + finish_partial_packsum = &gf16_shuffle_finish_partial_packsum_neon; + copy_cksum = &gf16_cksum_copy_neon; + copy_cksum_check = &gf16_cksum_copy_check_neon; + } break; + case GF16_SHUFFLE_128_SVE: METHOD_REQUIRES(gf16_available_sve) @@ -1372,6 +1405,10 @@ Galois16Methods Galois16Mul::default_method(size_t regionSizeHint, unsigned inpu } if(caps.hasSVE && gf16_sve_get_size() > 16) return 
GF16_SHUFFLE_128_SVE; +# ifdef __aarch64__ + if(gf16_available_neon_sha3 && caps.hasSHA3) + return inputs > 3 ? GF16_CLMUL_SHA3 : GF16_SHUFFLE_NEON; +# endif if(gf16_available_neon && caps.hasNEON) return # ifdef __aarch64__ @@ -1452,6 +1489,9 @@ std::vector Galois16Mul::availableMethods(bool checkCpuid) { ret.push_back(GF16_SHUFFLE_NEON); ret.push_back(GF16_CLMUL_NEON); } + if(gf16_available_neon_sha3 && caps.hasSHA3) { + ret.push_back(GF16_CLMUL_SHA3); + } if(gf16_available_sve && caps.hasSVE) ret.push_back(GF16_SHUFFLE_128_SVE); if(gf16_available_sve2 && caps.hasSVE2) { diff --git a/gf16/gf16mul.h b/gf16/gf16mul.h index 5a3910b8..5979d93a 100644 --- a/gf16/gf16mul.h +++ b/gf16/gf16mul.h @@ -64,6 +64,7 @@ enum Galois16Methods { GF16_AFFINE2X_AVX2, GF16_AFFINE2X_AVX512, GF16_CLMUL_NEON, + GF16_CLMUL_SHA3, GF16_CLMUL_SVE2 // TODO: consider non-transforming shuffle/affine }; @@ -96,6 +97,7 @@ static const char* Galois16MethodsText[] = { "Affine2x (GFNI+AVX2)", "Affine2x (GFNI+AVX512)", "CLMul (NEON)", + "CLMul (SHA3)", "CLMul (SVE2)" }; diff --git a/gf16/gf16pmul_neon.c b/gf16/gf16pmul_neon.c index c23cc3c4..52c68655 100644 --- a/gf16/gf16pmul_neon.c +++ b/gf16/gf16pmul_neon.c @@ -1,7 +1,13 @@ #include "gf16_global.h" + +#ifdef __ARM_NEON +# define _AVAILABLE +#endif #include "gf16_clmul_neon.h" #ifdef __ARM_NEON +# undef _AVAILABLE + int gf16pmul_available_neon = 1; void gf16pmul_neon(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { diff --git a/help.txt b/help.txt index 6e0d28b9..82ddb81e 100644 --- a/help.txt +++ b/help.txt @@ -294,6 +294,7 @@ CPU Tuning Options: shuffle128-sve2: SVE2 variant of shuffle-neon shuffle2x128-sve2: half width variant of shuffle-neon (requires SVE width >= 256 bits) shuffle512-sve2: SVE2 variant of shuffle-vbmi (requires SVE width >= 512 bits) + clmul-sha3: NEON-SHA3 variant of clmul-neon clmul-sve2: SVE2 variant of clmul-neon RISC-V only choices: shuffle128-rvv: RISC-V Vector variant of shuffle128-sve2 diff 
--git a/lib/par2.js b/lib/par2.js index 5a26b685..20ef302f 100644 --- a/lib/par2.js +++ b/lib/par2.js @@ -890,7 +890,7 @@ var GF_METHODS = [ 'xor-sse', 'xorjit-sse', 'xorjit-avx2', 'xorjit-avx512', 'affine-sse', 'affine-avx2', 'affine-avx512', 'affine2x-sse', 'affine2x-avx2', 'affine2x-avx512', - 'clmul-neon', 'clmul-sve2' + 'clmul-neon', 'clmul-sha3', 'clmul-sve2' ]; var GFOCL_METHODS = [ '' /*default*/, 'lookup', 'lookup_half', 'lookup_nc', 'lookup_half_nc', diff --git a/src/cpuid.h b/src/cpuid.h index 05201e37..56c95761 100644 --- a/src/cpuid.h +++ b/src/cpuid.h @@ -67,53 +67,67 @@ static unsigned long getauxval(unsigned long cap) { # endif # endif -# define CPU_HAS_NEON false -# define CPU_HAS_ARMCRC false -# define CPU_HAS_SVE false -# define CPU_HAS_SVE2 false - -# if defined(AT_HWCAP) -# undef CPU_HAS_NEON -# ifdef __aarch64__ -# define CPU_HAS_NEON (getauxval(AT_HWCAP) & HWCAP_ASIMD) -# if defined(HWCAP_SVE) -# undef CPU_HAS_SVE -# define CPU_HAS_SVE (getauxval(AT_HWCAP) & HWCAP_SVE) + +# ifdef PARPAR_SKIP_AUX_CHECK +# define CPU_HAS_NEON true +# define CPU_HAS_ARMCRC true +# define CPU_HAS_NEON_SHA3 true +# define CPU_HAS_SVE true +# define CPU_HAS_SVE2 true +# else +# define CPU_HAS_NEON false +# define CPU_HAS_ARMCRC false +# define CPU_HAS_NEON_SHA3 false +# define CPU_HAS_SVE false +# define CPU_HAS_SVE2 false + +# if defined(AT_HWCAP) +# undef CPU_HAS_NEON +# ifdef __aarch64__ +# define CPU_HAS_NEON (getauxval(AT_HWCAP) & HWCAP_ASIMD) +# if defined(HWCAP_SHA3) +# undef CPU_HAS_NEON_SHA3 +# define CPU_HAS_NEON_SHA3 (getauxval(AT_HWCAP) & HWCAP_SHA3) +# endif +# if defined(HWCAP_SVE) +# undef CPU_HAS_SVE +# define CPU_HAS_SVE (getauxval(AT_HWCAP) & HWCAP_SVE) +# endif +# if defined(AT_HWCAP2) && defined(HWCAP2_SVE2) +# undef CPU_HAS_SVE2 +# define CPU_HAS_SVE2 (getauxval(AT_HWCAP2) & HWCAP2_SVE2) +# endif +# else +# define CPU_HAS_NEON (getauxval(AT_HWCAP) & HWCAP_NEON) # endif -# if defined(AT_HWCAP2) && defined(HWCAP2_SVE2) -# undef CPU_HAS_SVE2 -# 
define CPU_HAS_SVE2 (getauxval(AT_HWCAP2) & HWCAP2_SVE2) +# if defined(AT_HWCAP2) && defined(HWCAP2_CRC32) +# undef CPU_HAS_ARMCRC +# define CPU_HAS_ARMCRC (getauxval(AT_HWCAP2) & HWCAP2_CRC32) +# elif defined(HWCAP_CRC32) +# undef CPU_HAS_ARMCRC +# define CPU_HAS_ARMCRC (getauxval(AT_HWCAP) & HWCAP_CRC32) # endif -# else -# define CPU_HAS_NEON (getauxval(AT_HWCAP) & HWCAP_NEON) -# endif -# if defined(AT_HWCAP2) && defined(HWCAP2_CRC32) +# elif defined(ANDROID_CPU_FAMILY_ARM) +# undef CPU_HAS_NEON # undef CPU_HAS_ARMCRC -# define CPU_HAS_ARMCRC (getauxval(AT_HWCAP2) & HWCAP2_CRC32) -# elif defined(HWCAP_CRC32) +# ifdef __aarch64__ +# define CPU_HAS_NEON (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_ASIMD) +# define CPU_HAS_ARMCRC (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_CRC32) +# else +# define CPU_HAS_NEON (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) +# define CPU_HAS_ARMCRC (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_CRC32) +# endif +# elif defined(_WIN32) +# undef CPU_HAS_NEON # undef CPU_HAS_ARMCRC -# define CPU_HAS_ARMCRC (getauxval(AT_HWCAP) & HWCAP_CRC32) -# endif -# elif defined(ANDROID_CPU_FAMILY_ARM) -# undef CPU_HAS_NEON -# undef CPU_HAS_ARMCRC -# ifdef __aarch64__ -# define CPU_HAS_NEON (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_ASIMD) -# define CPU_HAS_ARMCRC (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_CRC32) -# else -# define CPU_HAS_NEON (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) -# define CPU_HAS_ARMCRC (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_CRC32) -# endif -# elif defined(_WIN32) -# undef CPU_HAS_NEON -# undef CPU_HAS_ARMCRC -# define CPU_HAS_NEON (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE)) -# define CPU_HAS_ARMCRC (IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE)) -# elif defined(__APPLE__) -# undef CPU_HAS_NEON -# undef CPU_HAS_ARMCRC -# define CPU_HAS_NEON (cpuHasFeature("hw.optional.neon")) -# define CPU_HAS_ARMCRC 
(cpuHasFeature("hw.optional.armv8_crc32")) +# define CPU_HAS_NEON (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE)) +# define CPU_HAS_ARMCRC (IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE)) +# elif defined(__APPLE__) +# undef CPU_HAS_NEON +# undef CPU_HAS_ARMCRC +# define CPU_HAS_NEON (cpuHasFeature("hw.optional.neon")) +# define CPU_HAS_ARMCRC (cpuHasFeature("hw.optional.armv8_crc32")) +# define CPU_HAS_NEON_SHA3 (cpuHasFeature("hw.optional.armv8_2_sha3")) static inline bool cpuHasFeature(const char* feature) { int supported = 0; size_t len = sizeof(supported); @@ -121,6 +135,7 @@ static unsigned long getauxval(unsigned long cap) { return (bool)supported; return false; } +# endif # endif #endif @@ -142,7 +157,9 @@ static unsigned long getauxval(unsigned long cap) { # endif # endif -# ifndef CPU_HAS_VECTOR +# ifdef PARPAR_SKIP_AUX_CHECK +# define CPU_HAS_VECTOR true +# else # define CPU_HAS_VECTOR false # if defined(AT_HWCAP) From fb6863c8dff5974b6fc43126c677d826417e481e Mon Sep 17 00:00:00 2001 From: animetosho Date: Mon, 14 Aug 2023 15:25:32 +1000 Subject: [PATCH 56/91] RVV bugfix --- gf16/gf16_shuffle128_rvv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gf16/gf16_shuffle128_rvv.c b/gf16/gf16_shuffle128_rvv.c index 494fc4d0..433b2faa 100644 --- a/gf16/gf16_shuffle128_rvv.c +++ b/gf16/gf16_shuffle128_rvv.c @@ -217,7 +217,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_prepare_block_rvv(void *HEDLEY_RESTRICT ds // final block static HEDLEY_ALWAYS_INLINE void gf16_prepare_blocku_rvv(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t remaining) { size_t vlmax = RV(vsetvlmax_e8m2)(); - vuint8m1_t v = RV(vmv_v_x_u8m2)(0, vlmax); + vuint8m2_t v = RV(vmv_v_x_u8m2)(0, vlmax); size_t vl = RV(vsetvl_e8m2)(remaining); #if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 13000 v = RV(vle8_v_u8m2_tu)(v, (const uint8_t*)src, vl); From 867e173ef7d9e31c4c336d91e1632935bb15cd97 Mon Sep 17 00:00:00 2001 From: 
animetosho Date: Wed, 16 Aug 2023 10:32:50 +1000 Subject: [PATCH 57/91] Add ability to list available hasher methods + ability to override single MD5/CRC kernel --- gf16/gf16pmul.cpp | 6 +- gf16/gf16pmul.h | 6 +- hasher/hasher.cpp | 426 ++++++++++++++++++++++++++++------------------ hasher/hasher.h | 35 +++- 4 files changed, 296 insertions(+), 177 deletions(-) diff --git a/gf16/gf16pmul.cpp b/gf16/gf16pmul.cpp index 22e31381..b5962993 100644 --- a/gf16/gf16pmul.cpp +++ b/gf16/gf16pmul.cpp @@ -85,9 +85,9 @@ void setup_pmul() { #endif } -const char* gf16pmul_methodName() { +const char* gf16pmul_methodName(Galois16PointMulMethods method) { const char* names[] = { - "None (exponentiate)", + "None", "PCLMUL", "AVX2", "VPCLMUL", @@ -96,5 +96,5 @@ const char* gf16pmul_methodName() { "SVE2" }; - return names[(int)gf16pmul_method]; + return names[(int)method]; } diff --git a/gf16/gf16pmul.h b/gf16/gf16pmul.h index c740bc03..d88460fb 100644 --- a/gf16/gf16pmul.h +++ b/gf16/gf16pmul.h @@ -17,9 +17,13 @@ enum Galois16PointMulMethods { // TODO: consider multi-dest typedef void(*Gf16PMulFunc)(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len); extern Gf16PMulFunc gf16pmul; +extern Galois16PointMulMethods gf16pmul_method; extern size_t gf16pmul_alignment; extern size_t gf16pmul_blocklen; -const char* gf16pmul_methodName(); +const char* gf16pmul_methodName(Galois16PointMulMethods method); +inline const char* gf16pmul_methodName() { + return gf16pmul_methodName(gf16pmul_method); +} void setup_pmul(); diff --git a/hasher/hasher.cpp b/hasher/hasher.cpp index 85e727f0..ced8b76e 100644 --- a/hasher/hasher.cpp +++ b/hasher/hasher.cpp @@ -8,14 +8,67 @@ uint32_t(*MD5CRC_Calc)(const void*, size_t, size_t, void*) = NULL; MD5CRCMethods MD5CRC_Method = MD5CRCMETH_SCALAR; uint32_t(*CRC32_Calc)(const void*, size_t) = NULL; MD5CRCMethods CRC32_Method = MD5CRCMETH_SCALAR; -struct _CpuCap { +struct CpuCap { #ifdef PLATFORM_X86 - bool hasSSE2, hasXOP, hasBMI1, hasAVX2, 
hasAVX512F, hasAVX512VLBW; - _CpuCap() : hasSSE2(false), hasXOP(false), hasBMI1(false), hasAVX2(false), hasAVX512F(false), hasAVX512VLBW(false) {} + bool hasSSE2, hasClMul, hasXOP, hasBMI1, hasAVX2, hasAVX512F, hasAVX512VLBW; + bool isSmallCore, isLEASlow, isVecRotSlow; + CpuCap(bool detect) : + hasSSE2(true), hasClMul(true), hasXOP(true), hasBMI1(true), hasAVX2(true), hasAVX512F(true), hasAVX512VLBW(true), + isSmallCore(false), isLEASlow(false), isVecRotSlow(false) + { + if(!detect) return; + + bool hasAVX = false; + + int cpuInfo[4]; + int cpuInfoX[4]; + int family, model; + _cpuid(cpuInfo, 1); + hasSSE2 = (cpuInfo[3] & 0x4000000); + hasClMul = ((cpuInfo[2] & 0x80202) == 0x80202); // SSE4.1 + SSSE3 + CLMUL + + family = ((cpuInfo[0]>>8) & 0xf) + ((cpuInfo[0]>>16) & 0xff0); + model = ((cpuInfo[0]>>4) & 0xf) + ((cpuInfo[0]>>12) & 0xf0); + + // TODO: check perf on small cores + if(family == 6) { + isSmallCore = CPU_MODEL_IS_BNL_SLM(model); + // Intel Sandy Bridge to Skylake has slow 3-component LEA + isLEASlow = (model == 0x2A || model == 0x2D || model == 0x3A || model == 0x3C || model == 0x3D || model == 0x3E || model == 0x3F || model == 0x45 || model == 0x46 || model == 0x47 || model == 0x4E || model == 0x4F || model == 0x55 || model == 0x56 || model == 0x5E || model == 0x66 || model == 0x67 || model == 0x8E || model == 0x9E || model == 0xA5 || model == 0xA6); + } else { + isSmallCore = CPU_FAMMDL_IS_AMDCAT(family, model); + } + + isVecRotSlow = (family == 0xaf); // vector rotate has 2 cycle latency on Zen4 + +#if !defined(_MSC_VER) || _MSC_VER >= 1600 + _cpuidX(cpuInfoX, 7, 0); + if((cpuInfo[2] & 0x1C000000) == 0x1C000000) { // has AVX + OSXSAVE + XSAVE + int xcr = _GET_XCR() & 0xff; + if((xcr & 6) == 6) { // AVX enabled + hasAVX = true; + hasBMI1 = hasAVX && (cpuInfoX[1] & 0x08); + hasAVX2 = cpuInfoX[1] & 0x20; + if((xcr & 0xE0) == 0xE0) { + hasAVX512F = ((cpuInfoX[1] & 0x10000) == 0x10000); + hasAVX512VLBW = ((cpuInfoX[1] & 0xC0010100) == 0xC0010100); // 
AVX512VL + AVX512BW + AVX512F + BMI2 + } + } + } +#endif + + _cpuid(cpuInfo, 0x80000001); + hasXOP = hasAVX && (cpuInfo[2] & 0x800); + } #endif #ifdef PLATFORM_ARM - bool hasNEON, hasSVE2; - _CpuCap() : hasNEON(false), hasSVE2(false) {} + bool hasCRC, hasNEON, hasSVE2; + CpuCap(bool detect) : hasCRC(true), hasNEON(true), hasSVE2(true) { + if(!detect) return; + hasCRC = CPU_HAS_ARMCRC; + hasNEON = CPU_HAS_NEON; + hasSVE2 = CPU_HAS_SVE2; + } #endif }; @@ -24,168 +77,67 @@ MD5MultiLevels HasherMD5Multi_level; void setup_hasher() { if(HasherInput_Create) return; - HasherInput_Create = &HasherInput_Scalar::create; - HasherInput_Method = INHASH_SCALAR; - MD5CRC_Calc = &MD5CRC_Calc_Scalar; - MD5CRC_Method = MD5CRCMETH_SCALAR; - CRC32_Calc = &CRC32_Calc_Slice4; - CRC32_Method = MD5CRCMETH_SCALAR; - - struct _CpuCap CpuCap; - (void)CpuCap; + set_hasherInput(INHASH_SCALAR); + set_hasherMD5CRC(MD5CRCMETH_SCALAR); - // CPU detection #ifdef PLATFORM_X86 - bool hasClMul = false, hasAVX = false; - bool isSmallCore = false, isLEASlow = false, isVecRotSlow = false; - - int cpuInfo[4]; - int cpuInfoX[4]; - int family, model; - _cpuid(cpuInfo, 1); - CpuCap.hasSSE2 = (cpuInfo[3] & 0x4000000); - hasClMul = ((cpuInfo[2] & 0x80202) == 0x80202); // SSE4.1 + SSSE3 + CLMUL - - family = ((cpuInfo[0]>>8) & 0xf) + ((cpuInfo[0]>>16) & 0xff0); - model = ((cpuInfo[0]>>4) & 0xf) + ((cpuInfo[0]>>12) & 0xf0); - - // TODO: check perf on small cores - if(family == 6) { - isSmallCore = CPU_MODEL_IS_BNL_SLM(model); - // Intel Sandy Bridge to Skylake has slow 3-component LEA - isLEASlow = (model == 0x2A || model == 0x2D || model == 0x3A || model == 0x3C || model == 0x3D || model == 0x3E || model == 0x3F || model == 0x45 || model == 0x46 || model == 0x47 || model == 0x4E || model == 0x4F || model == 0x55 || model == 0x56 || model == 0x5E || model == 0x66 || model == 0x67 || model == 0x8E || model == 0x9E || model == 0xA5 || model == 0xA6); - } else { - isSmallCore = CPU_FAMMDL_IS_AMDCAT(family, model); - 
} - - isVecRotSlow = (family == 0xaf); // vector rotate has 2 cycle latency on Zen4 + struct CpuCap caps(true); -#if !defined(_MSC_VER) || _MSC_VER >= 1600 - _cpuidX(cpuInfoX, 7, 0); - if((cpuInfo[2] & 0x1C000000) == 0x1C000000) { // has AVX + OSXSAVE + XSAVE - int xcr = _GET_XCR() & 0xff; - if((xcr & 6) == 6) { // AVX enabled - hasAVX = true; - CpuCap.hasBMI1 = hasAVX && (cpuInfoX[1] & 0x08); - CpuCap.hasAVX2 = cpuInfoX[1] & 0x20; - if((xcr & 0xE0) == 0xE0) { - CpuCap.hasAVX512F = ((cpuInfoX[1] & 0x10000) == 0x10000); - CpuCap.hasAVX512VLBW = ((cpuInfoX[1] & 0xC0010100) == 0xC0010100); // AVX512VL + AVX512BW + AVX512F + BMI2 - } - } - } -#endif - - _cpuid(cpuInfo, 0x80000001); - CpuCap.hasXOP = hasAVX && (cpuInfo[2] & 0x800); - - if(CpuCap.hasAVX512VLBW && hasClMul && !isVecRotSlow && HasherInput_AVX512::isAvailable) { - HasherInput_Create = &HasherInput_AVX512::create; - HasherInput_Method = INHASH_AVX512; - } + if(caps.hasAVX512VLBW && caps.hasClMul && !caps.isVecRotSlow && HasherInput_AVX512::isAvailable) + set_hasherInput(INHASH_AVX512); // SSE seems to be faster than scalar on Zen1/2, not Zen3; BMI > SSE on Zen1, unknown on Zen2 - else if(hasClMul && !isSmallCore && HasherInput_ClMulScalar::isAvailable) { + else if(caps.hasClMul && !caps.isSmallCore && HasherInput_ClMulScalar::isAvailable) { // Gracemont: SSE > scalar, but SSE ~= BMI - if(CpuCap.hasBMI1 && HasherInput_BMI1::isAvailable) { - HasherInput_Create = &HasherInput_BMI1::create; - HasherInput_Method = INHASH_BMI1; - } else { - HasherInput_Create = &HasherInput_ClMulScalar::create; - HasherInput_Method = INHASH_CRC; - } - } else if(hasClMul && isSmallCore && HasherInput_ClMulSSE::isAvailable) { - HasherInput_Create = &HasherInput_ClMulSSE::create; - HasherInput_Method = INHASH_SIMD_CRC; - } - else if(CpuCap.hasSSE2 && isSmallCore && HasherInput_SSE::isAvailable) { // TODO: CPU w/o ClMul might all be small enough - HasherInput_Create = &HasherInput_SSE::create; - HasherInput_Method = INHASH_SIMD; - } + 
if(caps.hasBMI1 && HasherInput_BMI1::isAvailable) + set_hasherInput(INHASH_BMI1); + else + set_hasherInput(INHASH_CRC); + } else if(caps.hasClMul && caps.isSmallCore && HasherInput_ClMulSSE::isAvailable) + set_hasherInput(INHASH_SIMD_CRC); + else if(caps.hasSSE2 && caps.isSmallCore && HasherInput_SSE::isAvailable) // TODO: CPU w/o ClMul might all be small enough + set_hasherInput(INHASH_SIMD); - if(CpuCap.hasAVX512VLBW && !isVecRotSlow && MD5Single_isAvailable_AVX512) { - MD5Single::_update = &MD5Single_update_AVX512; - MD5Single::_updateZero = &MD5Single_updateZero_AVX512; - MD5Single::method = MD5CRCMETH_AVX512; - } - else if(isLEASlow && hasClMul && MD5Single_isAvailable_NoLEA) { - MD5Single::_update = &MD5Single_update_NoLEA; - MD5Single::_updateZero = &MD5Single_updateZero_NoLEA; - MD5Single::method = MD5CRCMETH_NOLEA; - } + if(caps.hasAVX512VLBW && caps.hasClMul && !caps.isVecRotSlow && MD5CRC_isAvailable_AVX512) + set_hasherMD5CRC(MD5CRCMETH_AVX512); + else if(caps.isLEASlow && caps.hasClMul && MD5CRC_isAvailable_NoLEA) + set_hasherMD5CRC(MD5CRCMETH_NOLEA); // for some reason, single MD5 BMI1 seems to be slower on most cores, except Jaguar... 
unsure why - else if(CpuCap.hasBMI1 && isSmallCore && MD5Single_isAvailable_BMI1) { - MD5Single::_update = &MD5Single_update_BMI1; - MD5Single::_updateZero = &MD5Single_updateZero_BMI1; - MD5Single::method = MD5CRCMETH_BMI1; - } - - if(CpuCap.hasAVX512VLBW && hasClMul && !isVecRotSlow && MD5CRC_isAvailable_AVX512) { - MD5CRC_Calc = &MD5CRC_Calc_AVX512; - MD5CRC_Method = MD5CRCMETH_AVX512; - } - else if(isLEASlow && hasClMul && MD5CRC_isAvailable_NoLEA) { - MD5CRC_Calc = &MD5CRC_Calc_NoLEA; - MD5CRC_Method = MD5CRCMETH_NOLEA; - } - else if(CpuCap.hasBMI1 && hasClMul && isSmallCore && MD5CRC_isAvailable_BMI1) { - MD5CRC_Calc = &MD5CRC_Calc_BMI1; - MD5CRC_Method = MD5CRCMETH_BMI1; - } - else if(hasClMul && MD5CRC_isAvailable_ClMul) { - MD5CRC_Calc = &MD5CRC_Calc_ClMul; - MD5CRC_Method = MD5CRCMETH_PCLMUL; - } - - if(hasClMul && CRC32_isAvailable_ClMul) { - CRC32_Calc = &CRC32_Calc_ClMul; - CRC32_Method = MD5CRCMETH_PCLMUL; - } + else if(caps.hasBMI1 && caps.hasClMul && caps.isSmallCore && MD5CRC_isAvailable_BMI1) + set_hasherMD5CRC(MD5CRCMETH_BMI1); + else if(caps.hasClMul && MD5CRC_isAvailable_ClMul) + set_hasherMD5CRC(MD5CRCMETH_PCLMUL); #endif #ifdef PLATFORM_ARM - bool hasCRC = CPU_HAS_ARMCRC; + struct CpuCap caps(true); - CpuCap.hasNEON = CPU_HAS_NEON; - CpuCap.hasSVE2 = CPU_HAS_SVE2; - - if(hasCRC && HasherInput_ARMCRC::isAvailable) { // TODO: fast core only - HasherInput_Create = &HasherInput_ARMCRC::create; - HasherInput_Method = INHASH_CRC; - } - else if(CpuCap.hasNEON) { // TODO: slow core only - if(hasCRC && HasherInput_NEONCRC::isAvailable) { - HasherInput_Create = &HasherInput_NEONCRC::create; - HasherInput_Method = INHASH_SIMD_CRC; - } else if(HasherInput_NEON::isAvailable) { - HasherInput_Create = &HasherInput_NEON::create; - HasherInput_Method = INHASH_SIMD; - } + if(caps.hasCRC && HasherInput_ARMCRC::isAvailable) // TODO: fast core only + set_hasherInput(INHASH_CRC); + else if(caps.hasNEON) { // TODO: slow core only + if(caps.hasCRC && 
HasherInput_NEONCRC::isAvailable) + set_hasherInput(INHASH_SIMD_CRC); + else if(HasherInput_NEON::isAvailable) + set_hasherInput(INHASH_SIMD); } - if(hasCRC && MD5CRC_isAvailable_ARMCRC) { - MD5CRC_Calc = &MD5CRC_Calc_ARMCRC; - MD5CRC_Method = MD5CRCMETH_ARMCRC; - } - if(hasCRC && CRC32_isAvailable_ARMCRC) { - CRC32_Calc = &CRC32_Calc_ARMCRC; - CRC32_Method = MD5CRCMETH_ARMCRC; - } + if(caps.hasCRC && MD5CRC_isAvailable_ARMCRC) + set_hasherMD5CRC(MD5CRCMETH_ARMCRC); #endif // note that this logic assumes that if a compiler can compile for more advanced ISAs, it supports simpler ones as well #ifdef PLATFORM_X86 - if(CpuCap.hasAVX512VLBW && MD5Multi_AVX512_128::isAvailable) HasherMD5Multi_level = MD5MULT_AVX512VL; - else if(CpuCap.hasAVX512F && MD5Multi_AVX512::isAvailable) HasherMD5Multi_level = MD5MULT_AVX512F; - else if(CpuCap.hasXOP && MD5Multi_XOP::isAvailable) HasherMD5Multi_level = MD5MULT_XOP; // for the only CPU with AVX2 + XOP (Excavator) I imagine XOP works better than AVX2, due to half rate AVX - else if(CpuCap.hasAVX2 && MD5Multi_AVX2::isAvailable) HasherMD5Multi_level = MD5MULT_AVX2; - else if(CpuCap.hasSSE2 && MD5Multi_SSE::isAvailable) HasherMD5Multi_level = MD5MULT_SSE; + if(caps.hasAVX512VLBW && MD5Multi_AVX512_256::isAvailable) HasherMD5Multi_level = MD5MULT_AVX512VL; + else if(caps.hasAVX512F && MD5Multi_AVX512::isAvailable) HasherMD5Multi_level = MD5MULT_AVX512F; + else if(caps.hasXOP && MD5Multi_XOP::isAvailable) HasherMD5Multi_level = MD5MULT_XOP; // for the only CPU with AVX2 + XOP (Excavator) I imagine XOP works better than AVX2, due to half rate AVX + else if(caps.hasAVX2 && MD5Multi_AVX2::isAvailable) HasherMD5Multi_level = MD5MULT_AVX2; + else if(caps.hasSSE2 && MD5Multi_SSE::isAvailable) HasherMD5Multi_level = MD5MULT_SSE; else #endif #ifdef PLATFORM_ARM // TODO: if SVE2 width = 128b, prefer NEON? 
- if(CpuCap.hasSVE2 && MD5Multi_SVE2::isAvailable) HasherMD5Multi_level = MD5MULT_SVE2; - else if(CpuCap.hasNEON && MD5Multi_NEON::isAvailable) HasherMD5Multi_level = MD5MULT_NEON; + if(caps.hasSVE2 && MD5Multi_SVE2::isAvailable) HasherMD5Multi_level = MD5MULT_SVE2; + else if(caps.hasNEON && MD5Multi_NEON::isAvailable) HasherMD5Multi_level = MD5MULT_NEON; else #endif HasherMD5Multi_level = MD5MULT_SCALAR; @@ -216,16 +168,76 @@ bool set_hasherInput(HasherInputMethods method) { return false; } +bool set_hasherMD5CRC(MD5CRCMethods method) { +#define SET_HASHER(h, x, hMd5, hCrc) case h: { \ + if(!MD5CRC_isAvailable_##x) return false; \ + MD5CRC_Calc = &MD5CRC_Calc_##x; \ + MD5CRC_Method = h; \ + MD5Single::method = hMd5; \ + CRC32_Method = hCrc; \ + break; \ + } + + switch(method) { + SET_HASHER(MD5CRCMETH_SCALAR, Scalar, MD5CRCMETH_SCALAR, MD5CRCMETH_SCALAR) +#ifdef PLATFORM_X86 + SET_HASHER(MD5CRCMETH_BMI1, BMI1, MD5CRCMETH_BMI1, MD5CRCMETH_PCLMUL) + SET_HASHER(MD5CRCMETH_NOLEA, NoLEA, MD5CRCMETH_NOLEA, MD5CRCMETH_PCLMUL) + SET_HASHER(MD5CRCMETH_AVX512, AVX512, MD5CRCMETH_AVX512, MD5CRCMETH_PCLMUL) + SET_HASHER(MD5CRCMETH_PCLMUL, ClMul, MD5CRCMETH_SCALAR, MD5CRCMETH_PCLMUL) +#endif +#ifdef PLATFORM_ARM + SET_HASHER(MD5CRCMETH_ARMCRC, ARMCRC, MD5CRCMETH_SCALAR, MD5CRCMETH_ARMCRC) +#endif + default: return false; + } +#undef SET_HASHER + + switch(MD5Single::method) { + case MD5CRCMETH_AVX512: + MD5Single::_update = &MD5Single_update_AVX512; + MD5Single::_updateZero = &MD5Single_updateZero_AVX512; + break; + case MD5CRCMETH_NOLEA: + MD5Single::_update = &MD5Single_update_NoLEA; + MD5Single::_updateZero = &MD5Single_updateZero_NoLEA; + break; + case MD5CRCMETH_BMI1: + MD5Single::_update = &MD5Single_update_BMI1; + MD5Single::_updateZero = &MD5Single_updateZero_BMI1; + break; + case MD5CRCMETH_SCALAR: + MD5Single::_update = &MD5Single_update_Scalar; + MD5Single::_updateZero = &MD5Single_updateZero_Scalar; + break; + default: return false; // shouldn't happen + } + 
switch(CRC32_Method) { + case MD5CRCMETH_PCLMUL: + CRC32_Calc = &CRC32_Calc_ClMul; + break; + case MD5CRCMETH_ARMCRC: + CRC32_Calc = &CRC32_Calc_ARMCRC; + break; + case MD5CRCMETH_SCALAR: + CRC32_Calc = &CRC32_Calc_Slice4; + break; + default: return false; // shouldn't happen + } + + return true; +} + void set_hasherMD5MultiLevel(MD5MultiLevels level) { #define SET_LEVEL(h, l) \ if(h::isAvailable) { \ HasherMD5Multi_level = l; \ - break; \ + return; \ } switch(level) { #ifdef PLATFORM_X86 case MD5MULT_AVX512VL: - SET_LEVEL(MD5Multi_AVX512_128, MD5MULT_AVX512VL) + SET_LEVEL(MD5Multi_AVX512_256, MD5MULT_AVX512VL) // fallthrough case MD5MULT_AVX512F: SET_LEVEL(MD5Multi_AVX512, MD5MULT_AVX512F) @@ -456,29 +468,29 @@ void MD5Single::end(void* md5) { } -const char* hasherInput_methodName() { +const char* hasherInput_methodName(HasherInputMethods m) { const char* names[] = { - "Scalar + Slice4", + "Scalar+Generic", #ifdef PLATFORM_X86 - "SSE2 + Slice4", - "Scalar + PCLMUL", - "SSE2 + PCLMUL", + "SSE2+Generic", + "Scalar+PCLMUL", + "SSE2+PCLMUL", #elif defined(PLATFORM_ARM) - "NEON + Slice4", - "Scalar + ARMv8-CRC32", - "NEON + ARMv8-CRC32", + "NEON+Generic", + "Scalar+ARMCRC", + "NEON+ARMCRC", #else - "SIMD + Slice4", - "Scalar + CRC", - "SIMD + CRC", + "SIMD+Generic", + "Scalar+CRC", + "SIMD+CRC", #endif - "BMI1 + PCLMUL", + "BMI1+PCLMUL", "AVX512" }; - return names[(int)HasherInput_Method]; + return names[(int)m]; } -const char* hasherMD5Multi_methodName() { +const char* hasherMD5Multi_methodName(MD5MultiLevels l) { const char* names[] = { "Scalar", "SSE2", @@ -490,17 +502,103 @@ const char* hasherMD5Multi_methodName() { "SVE2" }; - return names[(int)HasherMD5Multi_level]; + return names[(int)l]; } const char* md5crc_methodName(MD5CRCMethods m) { const char* names[] = { - "Scalar", + "Generic", // or Slice4 for CRC "BMI1", "NoLEA", "AVX512", - "ARMv8-CRC32", + "ARMCRC", "PCLMUL" }; return names[(int)m]; } + + + +std::vector hasherInput_availableMethods(bool checkCpuid) 
{ + std::vector ret; + ret.push_back(INHASH_SCALAR); + +#ifdef PLATFORM_X86 + const CpuCap caps(checkCpuid); + if(caps.hasClMul) { + if(caps.hasAVX512VLBW && HasherInput_AVX512::isAvailable) + ret.push_back(INHASH_AVX512); + if(caps.hasBMI1 && HasherInput_BMI1::isAvailable) + ret.push_back(INHASH_BMI1); + if(HasherInput_ClMulSSE::isAvailable) + ret.push_back(INHASH_SIMD_CRC); + if(HasherInput_ClMulScalar::isAvailable) + ret.push_back(INHASH_CRC); + } + if(caps.hasSSE2 && HasherInput_SSE::isAvailable) + ret.push_back(INHASH_SIMD); +#endif +#ifdef PLATFORM_ARM + const CpuCap caps(checkCpuid); + if(caps.hasCRC && HasherInput_ARMCRC::isAvailable) + ret.push_back(INHASH_CRC); + if(caps.hasNEON && HasherInput_NEON::isAvailable) + ret.push_back(INHASH_SIMD); + if(caps.hasCRC && caps.hasNEON && HasherInput_NEONCRC::isAvailable) + ret.push_back(INHASH_SIMD_CRC); +#endif + + return ret; +} +std::vector hasherMD5CRC_availableMethods(bool checkCpuid) { + std::vector ret; + ret.push_back(MD5CRCMETH_SCALAR); + +#ifdef PLATFORM_X86 + const CpuCap caps(checkCpuid); + if(caps.hasClMul) { + if(caps.hasAVX512VLBW && MD5CRC_isAvailable_AVX512) + ret.push_back(MD5CRCMETH_AVX512); + if(MD5CRC_isAvailable_NoLEA) + ret.push_back(MD5CRCMETH_NOLEA); + if(caps.hasBMI1 && MD5CRC_isAvailable_BMI1) + ret.push_back(MD5CRCMETH_BMI1); + if(MD5CRC_isAvailable_ClMul) + ret.push_back(MD5CRCMETH_PCLMUL); + } +#endif +#ifdef PLATFORM_ARM + const CpuCap caps(checkCpuid); + if(caps.hasCRC && MD5CRC_isAvailable_ARMCRC) + ret.push_back(MD5CRCMETH_ARMCRC); +#endif + + return ret; +} +std::vector hasherMD5Multi_availableMethods(bool checkCpuid) { + std::vector ret; + ret.push_back(MD5MULT_SCALAR); + +#ifdef PLATFORM_X86 + const CpuCap caps(checkCpuid); + if(caps.hasAVX512VLBW && MD5Multi_AVX512_256::isAvailable) + ret.push_back(MD5MULT_AVX512VL); + if(caps.hasAVX512F && MD5Multi_AVX512::isAvailable) + ret.push_back(MD5MULT_AVX512F); + if(caps.hasXOP && MD5Multi_XOP::isAvailable) + ret.push_back(MD5MULT_XOP); 
+ if(caps.hasAVX2 && MD5Multi_AVX2::isAvailable) + ret.push_back(MD5MULT_AVX2); + if(caps.hasSSE2 && MD5Multi_SSE::isAvailable) + ret.push_back(MD5MULT_SSE); +#endif +#ifdef PLATFORM_ARM + const CpuCap caps(checkCpuid); + if(caps.hasSVE2 && MD5Multi_SVE2::isAvailable) + ret.push_back(MD5MULT_SVE2); + if(caps.hasNEON && MD5Multi_NEON::isAvailable) + ret.push_back(MD5MULT_NEON); +#endif + + return ret; +} diff --git a/hasher/hasher.h b/hasher/hasher.h index a52c0bc7..3eea2564 100644 --- a/hasher/hasher.h +++ b/hasher/hasher.h @@ -25,13 +25,37 @@ enum MD5MultiLevels { MD5MULT_SVE2 }; +// single hash instances +extern uint32_t(*CRC32_Calc)(const void*, size_t); +extern MD5CRCMethods CRC32_Method; +extern uint32_t(*MD5CRC_Calc)(const void*, size_t, size_t, void*); +extern MD5CRCMethods MD5CRC_Method; + + void setup_hasher(); bool set_hasherInput(HasherInputMethods method); +bool set_hasherMD5CRC(MD5CRCMethods method); void set_hasherMD5MultiLevel(MD5MultiLevels level); extern IHasherInput*(*HasherInput_Create)(); +extern HasherInputMethods HasherInput_Method; +extern MD5MultiLevels HasherMD5Multi_level; + +const char* hasherInput_methodName(HasherInputMethods m); +const char* md5crc_methodName(MD5CRCMethods m); +const char* hasherMD5Multi_methodName(MD5MultiLevels l); +inline const char* hasherInput_methodName() { + return hasherInput_methodName(HasherInput_Method); +} +inline const char* md5crc_methodName() { + return md5crc_methodName(MD5CRC_Method); +} +inline const char* hasherMD5Multi_methodName() { + return hasherMD5Multi_methodName(HasherMD5Multi_level); +} -const char* hasherInput_methodName(); -const char* hasherMD5Multi_methodName(); +std::vector hasherInput_availableMethods(bool checkCpuid); +std::vector hasherMD5CRC_availableMethods(bool checkCpuid); +std::vector hasherMD5Multi_availableMethods(bool checkCpuid); class MD5Multi { std::vector ctx; @@ -59,11 +83,4 @@ class MD5Multi { }; -// single hash instances -extern uint32_t(*CRC32_Calc)(const void*, 
size_t); -extern MD5CRCMethods CRC32_Method; -extern uint32_t(*MD5CRC_Calc)(const void*, size_t, size_t, void*); -extern MD5CRCMethods MD5CRC_Method; -const char* md5crc_methodName(MD5CRCMethods m); - #endif /* __HASHER_H */ From 583afb5bc26f3edae01eb0c7d730bf239190266e Mon Sep 17 00:00:00 2001 From: animetosho Date: Thu, 17 Aug 2023 14:26:57 +1000 Subject: [PATCH 58/91] Hasher fixes --- hasher/hasher.cpp | 2 ++ hasher/hasher_base.h | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/hasher/hasher.cpp b/hasher/hasher.cpp index ced8b76e..121d303f 100644 --- a/hasher/hasher.cpp +++ b/hasher/hasher.cpp @@ -31,6 +31,7 @@ struct CpuCap { model = ((cpuInfo[0]>>4) & 0xf) + ((cpuInfo[0]>>12) & 0xf0); // TODO: check perf on small cores + isLEASlow = false; if(family == 6) { isSmallCore = CPU_MODEL_IS_BNL_SLM(model); // Intel Sandy Bridge to Skylake has slow 3-component LEA @@ -41,6 +42,7 @@ struct CpuCap { isVecRotSlow = (family == 0xaf); // vector rotate has 2 cycle latency on Zen4 + hasAVX = false; hasBMI1 = false; hasAVX2 = false; hasAVX512F = false; hasAVX512VLBW = false; #if !defined(_MSC_VER) || _MSC_VER >= 1600 _cpuidX(cpuInfoX, 7, 0); if((cpuInfo[2] & 0x1C000000) == 0x1C000000) { // has AVX + OSXSAVE + XSAVE diff --git a/hasher/hasher_base.h b/hasher/hasher_base.h index 56eb7f1c..9464f463 100644 --- a/hasher/hasher_base.h +++ b/hasher/hasher_base.h @@ -56,7 +56,7 @@ const bool MD5CRC(isAvailable) = true; uint32_t MD5CRC(Calc)(const void* data, size_t length, size_t zeroPad, void* md5) { uint32_t md5State[4] = {0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476}; #ifdef PLATFORM_X86 - char crcState[64]; // ClMul uses 4x16B state, others use 4B + ALIGN_TO(16, char crcState[64]); // ClMul uses 4x16B state, others use 4B #else char crcState[4]; #endif @@ -293,7 +293,7 @@ void MD5Multi::reset() { const bool CRC32Impl(CRC32_isAvailable) = true; uint32_t CRC32Impl(CRC32_Calc)(const void* data, size_t len) { #ifdef PLATFORM_X86 - char crcState[64]; // ClMul 
uses 4x16B state, others use 4B + ALIGN_TO(16, char crcState[64]); // ClMul uses 4x16B state, others use 4B #else char crcState[4]; #endif From 88b3f2086ba66704abe703883dd24e09f672e00a Mon Sep 17 00:00:00 2001 From: animetosho Date: Sun, 20 Aug 2023 10:22:22 +1000 Subject: [PATCH 59/91] Fix compile on AArch32 + EOL fix --- gf16/gf16pmul_neon.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/gf16/gf16pmul_neon.c b/gf16/gf16pmul_neon.c index 52c68655..d2dc6398 100644 --- a/gf16/gf16pmul_neon.c +++ b/gf16/gf16pmul_neon.c @@ -20,14 +20,20 @@ void gf16pmul_neon(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2 for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(uint8x16_t)*2) { poly8x16x2_t data1 = vld2q_p8(_src1+ptr); poly8x16x2_t data2 = vld2q_p8(_src2+ptr); - poly16x8_t low1 = pmull_low(data1.val[0], data2.val[0]); - poly16x8_t low2 = pmull_high(data1.val[0], data2.val[0]); + poly16x8_t low1 = vmull_p8(vget_low_p8(data1.val[0]), vget_low_p8(data2.val[0])); poly8x16_t dataMid1 = veorq_p8(data1.val[0], data1.val[1]); poly8x16_t dataMid2 = veorq_p8(data2.val[0], data2.val[1]); - poly16x8_t mid1 = pmull_low(dataMid1, dataMid2); + poly16x8_t mid1 = vmull_p8(vget_low_p8(dataMid1), vget_low_p8(dataMid2)); + poly16x8_t high1 = vmull_p8(vget_low_p8(data1.val[1]), vget_low_p8(data2.val[1])); +#ifdef __aarch64__ + poly16x8_t low2 = pmull_high(data1.val[0], data2.val[0]); poly16x8_t mid2 = pmull_high(dataMid1, dataMid2); - poly16x8_t high1 = pmull_low(data1.val[1], data2.val[1]); poly16x8_t high2 = pmull_high(data1.val[1], data2.val[1]); +#else + poly16x8_t low2 = vmull_p8(vget_high_p8(data1.val[0]), vget_high_p8(data2.val[0])); + poly16x8_t mid2 = vmull_p8(vget_high_p8(dataMid1), vget_high_p8(dataMid2)); + poly16x8_t high2 = vmull_p8(vget_high_p8(data1.val[1]), vget_high_p8(data2.val[1])); +#endif gf16_clmul_neon_reduction(&low1, low2, mid1, mid2, &high1, high2); uint8x16x2_t out; From 626482eda5233ea557c71ff383dc3824eddcc79b Mon 
Sep 17 00:00:00 2001 From: animetosho Date: Sun, 20 Aug 2023 17:25:11 +1000 Subject: [PATCH 60/91] Check for presence of GC in RVV (since it's compiled that way) --- gf16/gf16mul.cpp | 2 +- src/cpuid.h | 4 ++++ src/platform.h | 5 +++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/gf16/gf16mul.cpp b/gf16/gf16mul.cpp index 1aa04723..136e6829 100644 --- a/gf16/gf16mul.cpp +++ b/gf16/gf16mul.cpp @@ -226,7 +226,7 @@ struct CpuCap { bool hasVector; CpuCap(bool detect) : hasVector(true) { if(!detect) return; - hasVector = CPU_HAS_VECTOR; + hasVector = CPU_HAS_VECTOR && CPU_HAS_GC; } }; #endif diff --git a/src/cpuid.h b/src/cpuid.h index 56c95761..6d7c1c11 100644 --- a/src/cpuid.h +++ b/src/cpuid.h @@ -158,11 +158,15 @@ static unsigned long getauxval(unsigned long cap) { # endif # ifdef PARPAR_SKIP_AUX_CHECK +# define CPU_HAS_GC true # define CPU_HAS_VECTOR true # else +# define CPU_HAS_GC false # define CPU_HAS_VECTOR false # if defined(AT_HWCAP) +# undef CPU_HAS_GC +# define CPU_HAS_GC ((getauxval(AT_HWCAP) & 4397) == 4397) // 4397 = IMAFDC; TODO: how to detect Z* features of 'G'? 
# undef CPU_HAS_VECTOR # define CPU_HAS_VECTOR (getauxval(AT_HWCAP) & (1 << ('V'-'A'))) # endif diff --git a/src/platform.h b/src/platform.h index 11d7f6d6..3ee0d37d 100644 --- a/src/platform.h +++ b/src/platform.h @@ -203,6 +203,11 @@ HEDLEY_WARNING("GFNI disabled on GCC < 10 due to incorrect GF2P8AFFINEQB operand # endif #endif +#if defined(__riscv_vector) && defined(HEDLEY_GCC_VERSION) && !HEDLEY_GCC_VERSION_CHECK(13,0,0) +// GCC added RVV intrinsics in GCC13 +# undef __riscv_vector +#endif + // Some environments lack ARM headers, so try to check for these #ifdef __has_include # if defined(__ARM_FEATURE_SVE) && !__has_include() From c9f864af95df817d0e1f04f296b92775b622018d Mon Sep 17 00:00:00 2001 From: animetosho Date: Sun, 20 Aug 2023 18:48:34 +1000 Subject: [PATCH 61/91] Suppress build/UBSan warnings + upgrade Hedley --- gf16/gf16_xor_avx512.c | 4 +- gf16/gf16_xor_common.h | 2 +- gf16/gf16mul.cpp | 1 + hasher/hasher.cpp | 3 + hasher/md5x2-x86-asm.h | 4 +- src/hedley.h | 1001 ++++++++++++++++++++++++++++++---------- 6 files changed, 778 insertions(+), 237 deletions(-) diff --git a/gf16/gf16_xor_avx512.c b/gf16/gf16_xor_avx512.c index 694f5ef5..d87d79e1 100644 --- a/gf16/gf16_xor_avx512.c +++ b/gf16/gf16_xor_avx512.c @@ -851,7 +851,7 @@ void gf16_xor_jit_muladd_multi_avx512(const void *HEDLEY_RESTRICT scratch, unsig /* cmp/jcc */ write64(jitptr, 0x800FC03948 | (AX <<16) | (CX <<19) | ((uint64_t)JL <<32)); if(info->jitOptStrat == GF16_XOR_JIT_STRAT_COPYNT || info->jitOptStrat == GF16_XOR_JIT_STRAT_COPY) { - write32(jitptr +5, (int32_t)((jitTemp - (jitdst - (uint8_t*)jit->w)) - jitptr -9)); + write32(jitptr +5, (int32_t)(((intptr_t)jitTemp - (jitdst - (uint8_t*)jit->w)) - (intptr_t)jitptr -9)); jitptr[9] = 0xC3; /* ret */ /* memcpy to destination */ if(info->jitOptStrat == GF16_XOR_JIT_STRAT_COPYNT) { @@ -957,7 +957,7 @@ void gf16_xor_jit_muladd_multi_packed_avx512(const void *HEDLEY_RESTRICT scratch /* cmp/jcc */ write64(jitptr, 0x800FC03948 | (AX <<16) | (CX 
<<19) | ((uint64_t)JL <<32)); if(info->jitOptStrat == GF16_XOR_JIT_STRAT_COPYNT || info->jitOptStrat == GF16_XOR_JIT_STRAT_COPY) { - write32(jitptr +5, (int32_t)((jitTemp - (jitdst - (uint8_t*)jit->w)) - jitptr -9)); + write32(jitptr +5, (int32_t)(((intptr_t)jitTemp - (jitdst - (uint8_t*)jit->w)) - (intptr_t)jitptr -9)); jitptr[9] = 0xC3; /* ret */ /* memcpy to destination */ if(info->jitOptStrat == GF16_XOR_JIT_STRAT_COPYNT) { diff --git a/gf16/gf16_xor_common.h b/gf16/gf16_xor_common.h index 21b7ea80..7260172d 100644 --- a/gf16/gf16_xor_common.h +++ b/gf16/gf16_xor_common.h @@ -150,7 +150,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_xorjit_write_jit(const void *HEDLEY_RESTRI jitptr = jitTemp; jitptr = writeFunc(info, jitptr, coefficient, mode, prefetch); - write32(jitptr, (int32_t)(jitTemp - copyOffset - jitptr -4)); + write32(jitptr, (int32_t)((intptr_t)jitTemp - copyOffset - (intptr_t)jitptr -4)); jitptr[4] = 0xC3; /* ret */ jitptr += 5; diff --git a/gf16/gf16mul.cpp b/gf16/gf16mul.cpp index 136e6829..142aba3d 100644 --- a/gf16/gf16mul.cpp +++ b/gf16/gf16mul.cpp @@ -1430,6 +1430,7 @@ Galois16Methods Galois16Mul::default_method(size_t regionSizeHint, unsigned inpu } std::vector Galois16Mul::availableMethods(bool checkCpuid) { + UNUSED(checkCpuid); std::vector ret; ret.push_back(GF16_LOOKUP); if(gf16_lookup3_stride()) diff --git a/hasher/hasher.cpp b/hasher/hasher.cpp index 121d303f..ce6e3f40 100644 --- a/hasher/hasher.cpp +++ b/hasher/hasher.cpp @@ -522,6 +522,7 @@ const char* md5crc_methodName(MD5CRCMethods m) { std::vector hasherInput_availableMethods(bool checkCpuid) { + (void)checkCpuid; std::vector ret; ret.push_back(INHASH_SCALAR); @@ -553,6 +554,7 @@ std::vector hasherInput_availableMethods(bool checkCpuid) { return ret; } std::vector hasherMD5CRC_availableMethods(bool checkCpuid) { + (void)checkCpuid; std::vector ret; ret.push_back(MD5CRCMETH_SCALAR); @@ -578,6 +580,7 @@ std::vector hasherMD5CRC_availableMethods(bool checkCpuid) { return ret; } std::vector 
hasherMD5Multi_availableMethods(bool checkCpuid) { + (void)checkCpuid; std::vector ret; ret.push_back(MD5MULT_SCALAR); diff --git a/hasher/md5x2-x86-asm.h b/hasher/md5x2-x86-asm.h index 64322d2f..325938a7 100644 --- a/hasher/md5x2-x86-asm.h +++ b/hasher/md5x2-x86-asm.h @@ -195,8 +195,8 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_x2_scalar(uint32_t* state, co ROUND_I(B1, C1, D1, A1, B2, C2, D2, A2, "%[i0_1]", "%[i1_1]", k3, 21) \ : ASM_PARAMS(i2, i3)); - A1 += _data[0][0]; - A2 += _data[1][0]; + A1 += read32(_data[0]); + A2 += read32(_data[1]); RF4( 1, 2, 3, 4, -0x28955b88, -0x173848aa, 0x242070db, -0x3e423112) RF4( 5, 6, 7, 8, -0x0a83f051, 0x4787c62a, -0x57cfb9ed, -0x02b96aff) diff --git a/src/hedley.h b/src/hedley.h index d20c2297..8a713e67 100644 --- a/src/hedley.h +++ b/src/hedley.h @@ -10,11 +10,11 @@ * SPDX-License-Identifier: CC0-1.0 */ -#if !defined(HEDLEY_VERSION) || (HEDLEY_VERSION < 9) +#if !defined(HEDLEY_VERSION) || (HEDLEY_VERSION < 15) #if defined(HEDLEY_VERSION) # undef HEDLEY_VERSION #endif -#define HEDLEY_VERSION 9 +#define HEDLEY_VERSION 15 #if defined(HEDLEY_STRINGIFY_EX) # undef HEDLEY_STRINGIFY_EX @@ -36,6 +36,16 @@ #endif #define HEDLEY_CONCAT(a,b) HEDLEY_CONCAT_EX(a,b) +#if defined(HEDLEY_CONCAT3_EX) +# undef HEDLEY_CONCAT3_EX +#endif +#define HEDLEY_CONCAT3_EX(a,b,c) a##b##c + +#if defined(HEDLEY_CONCAT3) +# undef HEDLEY_CONCAT3 +#endif +#define HEDLEY_CONCAT3(a,b,c) HEDLEY_CONCAT3_EX(a,b,c) + #if defined(HEDLEY_VERSION_ENCODE) # undef HEDLEY_VERSION_ENCODE #endif @@ -77,18 +87,18 @@ #if defined(HEDLEY_MSVC_VERSION) # undef HEDLEY_MSVC_VERSION #endif -#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 140000000) +#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 140000000) && !defined(__ICL) # define HEDLEY_MSVC_VERSION HEDLEY_VERSION_ENCODE(_MSC_FULL_VER / 10000000, (_MSC_FULL_VER % 10000000) / 100000, (_MSC_FULL_VER % 100000) / 100) -#elif defined(_MSC_FULL_VER) +#elif defined(_MSC_FULL_VER) && !defined(__ICL) # define 
HEDLEY_MSVC_VERSION HEDLEY_VERSION_ENCODE(_MSC_FULL_VER / 1000000, (_MSC_FULL_VER % 1000000) / 10000, (_MSC_FULL_VER % 10000) / 10) -#elif defined(_MSC_VER) +#elif defined(_MSC_VER) && !defined(__ICL) # define HEDLEY_MSVC_VERSION HEDLEY_VERSION_ENCODE(_MSC_VER / 100, _MSC_VER % 100, 0) #endif #if defined(HEDLEY_MSVC_VERSION_CHECK) # undef HEDLEY_MSVC_VERSION_CHECK #endif -#if !defined(_MSC_VER) +#if !defined(HEDLEY_MSVC_VERSION) # define HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (0) #elif defined(_MSC_VER) && (_MSC_VER >= 1400) # define HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_FULL_VER >= ((major * 10000000) + (minor * 100000) + (patch))) @@ -101,9 +111,9 @@ #if defined(HEDLEY_INTEL_VERSION) # undef HEDLEY_INTEL_VERSION #endif -#if defined(__INTEL_COMPILER) && defined(__INTEL_COMPILER_UPDATE) +#if defined(__INTEL_COMPILER) && defined(__INTEL_COMPILER_UPDATE) && !defined(__ICL) # define HEDLEY_INTEL_VERSION HEDLEY_VERSION_ENCODE(__INTEL_COMPILER / 100, __INTEL_COMPILER % 100, __INTEL_COMPILER_UPDATE) -#elif defined(__INTEL_COMPILER) +#elif defined(__INTEL_COMPILER) && !defined(__ICL) # define HEDLEY_INTEL_VERSION HEDLEY_VERSION_ENCODE(__INTEL_COMPILER / 100, __INTEL_COMPILER % 100, 0) #endif @@ -116,6 +126,22 @@ # define HEDLEY_INTEL_VERSION_CHECK(major,minor,patch) (0) #endif +#if defined(HEDLEY_INTEL_CL_VERSION) +# undef HEDLEY_INTEL_CL_VERSION +#endif +#if defined(__INTEL_COMPILER) && defined(__INTEL_COMPILER_UPDATE) && defined(__ICL) +# define HEDLEY_INTEL_CL_VERSION HEDLEY_VERSION_ENCODE(__INTEL_COMPILER, __INTEL_COMPILER_UPDATE, 0) +#endif + +#if defined(HEDLEY_INTEL_CL_VERSION_CHECK) +# undef HEDLEY_INTEL_CL_VERSION_CHECK +#endif +#if defined(HEDLEY_INTEL_CL_VERSION) +# define HEDLEY_INTEL_CL_VERSION_CHECK(major,minor,patch) (HEDLEY_INTEL_CL_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +# define HEDLEY_INTEL_CL_VERSION_CHECK(major,minor,patch) (0) +#endif + #if defined(HEDLEY_PGI_VERSION) # undef HEDLEY_PGI_VERSION #endif @@ 
-211,8 +237,16 @@ #if defined(HEDLEY_TI_VERSION) # undef HEDLEY_TI_VERSION #endif -#if defined(__TI_COMPILER_VERSION__) -# define HEDLEY_TI_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#if \ + defined(__TI_COMPILER_VERSION__) && \ + ( \ + defined(__TMS470__) || defined(__TI_ARM__) || \ + defined(__MSP430__) || \ + defined(__TMS320C2000__) \ + ) +# if (__TI_COMPILER_VERSION__ >= 16000000) +# define HEDLEY_TI_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +# endif #endif #if defined(HEDLEY_TI_VERSION_CHECK) @@ -224,6 +258,102 @@ # define HEDLEY_TI_VERSION_CHECK(major,minor,patch) (0) #endif +#if defined(HEDLEY_TI_CL2000_VERSION) +# undef HEDLEY_TI_CL2000_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && defined(__TMS320C2000__) +# define HEDLEY_TI_CL2000_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(HEDLEY_TI_CL2000_VERSION_CHECK) +# undef HEDLEY_TI_CL2000_VERSION_CHECK +#endif +#if defined(HEDLEY_TI_CL2000_VERSION) +# define HEDLEY_TI_CL2000_VERSION_CHECK(major,minor,patch) (HEDLEY_TI_CL2000_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +# define HEDLEY_TI_CL2000_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(HEDLEY_TI_CL430_VERSION) +# undef HEDLEY_TI_CL430_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && defined(__MSP430__) +# define HEDLEY_TI_CL430_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(HEDLEY_TI_CL430_VERSION_CHECK) +# undef HEDLEY_TI_CL430_VERSION_CHECK +#endif +#if defined(HEDLEY_TI_CL430_VERSION) +# define HEDLEY_TI_CL430_VERSION_CHECK(major,minor,patch) (HEDLEY_TI_CL430_VERSION >= 
HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +# define HEDLEY_TI_CL430_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(HEDLEY_TI_ARMCL_VERSION) +# undef HEDLEY_TI_ARMCL_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && (defined(__TMS470__) || defined(__TI_ARM__)) +# define HEDLEY_TI_ARMCL_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(HEDLEY_TI_ARMCL_VERSION_CHECK) +# undef HEDLEY_TI_ARMCL_VERSION_CHECK +#endif +#if defined(HEDLEY_TI_ARMCL_VERSION) +# define HEDLEY_TI_ARMCL_VERSION_CHECK(major,minor,patch) (HEDLEY_TI_ARMCL_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +# define HEDLEY_TI_ARMCL_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(HEDLEY_TI_CL6X_VERSION) +# undef HEDLEY_TI_CL6X_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && defined(__TMS320C6X__) +# define HEDLEY_TI_CL6X_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(HEDLEY_TI_CL6X_VERSION_CHECK) +# undef HEDLEY_TI_CL6X_VERSION_CHECK +#endif +#if defined(HEDLEY_TI_CL6X_VERSION) +# define HEDLEY_TI_CL6X_VERSION_CHECK(major,minor,patch) (HEDLEY_TI_CL6X_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +# define HEDLEY_TI_CL6X_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(HEDLEY_TI_CL7X_VERSION) +# undef HEDLEY_TI_CL7X_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && defined(__C7000__) +# define HEDLEY_TI_CL7X_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(HEDLEY_TI_CL7X_VERSION_CHECK) +# undef HEDLEY_TI_CL7X_VERSION_CHECK +#endif +#if defined(HEDLEY_TI_CL7X_VERSION) +# define HEDLEY_TI_CL7X_VERSION_CHECK(major,minor,patch) (HEDLEY_TI_CL7X_VERSION >= 
HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +# define HEDLEY_TI_CL7X_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(HEDLEY_TI_CLPRU_VERSION) +# undef HEDLEY_TI_CLPRU_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && defined(__PRU__) +# define HEDLEY_TI_CLPRU_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(HEDLEY_TI_CLPRU_VERSION_CHECK) +# undef HEDLEY_TI_CLPRU_VERSION_CHECK +#endif +#if defined(HEDLEY_TI_CLPRU_VERSION) +# define HEDLEY_TI_CLPRU_VERSION_CHECK(major,minor,patch) (HEDLEY_TI_CLPRU_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +# define HEDLEY_TI_CLPRU_VERSION_CHECK(major,minor,patch) (0) +#endif + #if defined(HEDLEY_CRAY_VERSION) # undef HEDLEY_CRAY_VERSION #endif @@ -251,7 +381,7 @@ # if __VER__ > 1000 # define HEDLEY_IAR_VERSION HEDLEY_VERSION_ENCODE((__VER__ / 1000000), ((__VER__ / 1000) % 1000), (__VER__ % 1000)) # else -# define HEDLEY_IAR_VERSION HEDLEY_VERSION_ENCODE(VER / 100, __VER__ % 100, 0) +# define HEDLEY_IAR_VERSION HEDLEY_VERSION_ENCODE(__VER__ / 100, __VER__ % 100, 0) # endif #endif @@ -328,6 +458,22 @@ # define HEDLEY_PELLES_VERSION_CHECK(major,minor,patch) (0) #endif +#if defined(HEDLEY_MCST_LCC_VERSION) +# undef HEDLEY_MCST_LCC_VERSION +#endif +#if defined(__LCC__) && defined(__LCC_MINOR__) +# define HEDLEY_MCST_LCC_VERSION HEDLEY_VERSION_ENCODE(__LCC__ / 100, __LCC__ % 100, __LCC_MINOR__) +#endif + +#if defined(HEDLEY_MCST_LCC_VERSION_CHECK) +# undef HEDLEY_MCST_LCC_VERSION_CHECK +#endif +#if defined(HEDLEY_MCST_LCC_VERSION) +# define HEDLEY_MCST_LCC_VERSION_CHECK(major,minor,patch) (HEDLEY_MCST_LCC_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +# define HEDLEY_MCST_LCC_VERSION_CHECK(major,minor,patch) (0) +#endif + #if defined(HEDLEY_GCC_VERSION) # undef HEDLEY_GCC_VERSION #endif @@ -337,8 +483,16 @@ !defined(HEDLEY_INTEL_VERSION) && \ 
!defined(HEDLEY_PGI_VERSION) && \ !defined(HEDLEY_ARM_VERSION) && \ + !defined(HEDLEY_CRAY_VERSION) && \ !defined(HEDLEY_TI_VERSION) && \ - !defined(__COMPCERT__) + !defined(HEDLEY_TI_ARMCL_VERSION) && \ + !defined(HEDLEY_TI_CL430_VERSION) && \ + !defined(HEDLEY_TI_CL2000_VERSION) && \ + !defined(HEDLEY_TI_CL6X_VERSION) && \ + !defined(HEDLEY_TI_CL7X_VERSION) && \ + !defined(HEDLEY_TI_CLPRU_VERSION) && \ + !defined(__COMPCERT__) && \ + !defined(HEDLEY_MCST_LCC_VERSION) # define HEDLEY_GCC_VERSION HEDLEY_GNUC_VERSION #endif @@ -354,7 +508,11 @@ #if defined(HEDLEY_HAS_ATTRIBUTE) # undef HEDLEY_HAS_ATTRIBUTE #endif -#if defined(__has_attribute) +#if \ + defined(__has_attribute) && \ + ( \ + (!defined(HEDLEY_IAR_VERSION) || HEDLEY_IAR_VERSION_CHECK(8,5,9)) \ + ) # define HEDLEY_HAS_ATTRIBUTE(attribute) __has_attribute(attribute) #else # define HEDLEY_HAS_ATTRIBUTE(attribute) (0) @@ -364,7 +522,7 @@ # undef HEDLEY_GNUC_HAS_ATTRIBUTE #endif #if defined(__has_attribute) -# define HEDLEY_GNUC_HAS_ATTRIBUTE(attribute,major,minor,patch) __has_attribute(attribute) +# define HEDLEY_GNUC_HAS_ATTRIBUTE(attribute,major,minor,patch) HEDLEY_HAS_ATTRIBUTE(attribute) #else # define HEDLEY_GNUC_HAS_ATTRIBUTE(attribute,major,minor,patch) HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) #endif @@ -373,7 +531,7 @@ # undef HEDLEY_GCC_HAS_ATTRIBUTE #endif #if defined(__has_attribute) -# define HEDLEY_GCC_HAS_ATTRIBUTE(attribute,major,minor,patch) __has_attribute(attribute) +# define HEDLEY_GCC_HAS_ATTRIBUTE(attribute,major,minor,patch) HEDLEY_HAS_ATTRIBUTE(attribute) #else # define HEDLEY_GCC_HAS_ATTRIBUTE(attribute,major,minor,patch) HEDLEY_GCC_VERSION_CHECK(major,minor,patch) #endif @@ -381,12 +539,30 @@ #if defined(HEDLEY_HAS_CPP_ATTRIBUTE) # undef HEDLEY_HAS_CPP_ATTRIBUTE #endif -#if defined(__has_cpp_attribute) && defined(__cplusplus) +#if \ + defined(__has_cpp_attribute) && \ + defined(__cplusplus) && \ + (!defined(HEDLEY_SUNPRO_VERSION) || HEDLEY_SUNPRO_VERSION_CHECK(5,15,0)) # define 
HEDLEY_HAS_CPP_ATTRIBUTE(attribute) __has_cpp_attribute(attribute) #else # define HEDLEY_HAS_CPP_ATTRIBUTE(attribute) (0) #endif +#if defined(HEDLEY_HAS_CPP_ATTRIBUTE_NS) +# undef HEDLEY_HAS_CPP_ATTRIBUTE_NS +#endif +#if !defined(__cplusplus) || !defined(__has_cpp_attribute) +# define HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) (0) +#elif \ + !defined(HEDLEY_PGI_VERSION) && \ + !defined(HEDLEY_IAR_VERSION) && \ + (!defined(HEDLEY_SUNPRO_VERSION) || HEDLEY_SUNPRO_VERSION_CHECK(5,15,0)) && \ + (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) +# define HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) HEDLEY_HAS_CPP_ATTRIBUTE(ns::attribute) +#else +# define HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) (0) +#endif + #if defined(HEDLEY_GNUC_HAS_CPP_ATTRIBUTE) # undef HEDLEY_GNUC_HAS_CPP_ATTRIBUTE #endif @@ -548,7 +724,13 @@ HEDLEY_IAR_VERSION_CHECK(8,0,0) || \ HEDLEY_PGI_VERSION_CHECK(18,4,0) || \ HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_TI_VERSION_CHECK(6,0,0) || \ + HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(4,7,0) || \ + HEDLEY_TI_CL430_VERSION_CHECK(2,0,1) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,1,0) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7,0,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ HEDLEY_CRAY_VERSION_CHECK(5,0,0) || \ HEDLEY_TINYC_VERSION_CHECK(0,9,17) || \ HEDLEY_SUNPRO_VERSION_CHECK(8,0,0) || \ @@ -575,13 +757,21 @@ #elif HEDLEY_GCC_VERSION_CHECK(4,6,0) # define HEDLEY_DIAGNOSTIC_PUSH _Pragma("GCC diagnostic push") # define HEDLEY_DIAGNOSTIC_POP _Pragma("GCC diagnostic pop") -#elif HEDLEY_MSVC_VERSION_CHECK(15,0,0) +#elif \ + HEDLEY_MSVC_VERSION_CHECK(15,0,0) || \ + HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) # define HEDLEY_DIAGNOSTIC_PUSH __pragma(warning(push)) # define HEDLEY_DIAGNOSTIC_POP __pragma(warning(pop)) #elif HEDLEY_ARM_VERSION_CHECK(5,6,0) # define HEDLEY_DIAGNOSTIC_PUSH _Pragma("push") # define HEDLEY_DIAGNOSTIC_POP _Pragma("pop") -#elif 
HEDLEY_TI_VERSION_CHECK(8,1,0) +#elif \ + HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4,4,0) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(8,1,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) # define HEDLEY_DIAGNOSTIC_PUSH _Pragma("diag_push") # define HEDLEY_DIAGNOSTIC_POP _Pragma("diag_pop") #elif HEDLEY_PELLES_VERSION_CHECK(2,90,0) @@ -592,6 +782,102 @@ # define HEDLEY_DIAGNOSTIC_POP #endif +/* HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_ is for + HEDLEY INTERNAL USE ONLY. API subject to change without notice. */ +#if defined(HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_) +# undef HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_ +#endif +#if defined(__cplusplus) +# if HEDLEY_HAS_WARNING("-Wc++98-compat") +# if HEDLEY_HAS_WARNING("-Wc++17-extensions") +# if HEDLEY_HAS_WARNING("-Wc++1z-extensions") +# define HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \ + HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \ + _Pragma("clang diagnostic ignored \"-Wc++17-extensions\"") \ + _Pragma("clang diagnostic ignored \"-Wc++1z-extensions\"") \ + xpr \ + HEDLEY_DIAGNOSTIC_POP +# else +# define HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \ + HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \ + _Pragma("clang diagnostic ignored \"-Wc++17-extensions\"") \ + xpr \ + HEDLEY_DIAGNOSTIC_POP +# endif +# else +# define HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \ + HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \ + xpr \ + HEDLEY_DIAGNOSTIC_POP +# endif +# endif +#endif +#if !defined(HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_) +# define HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(x) x +#endif + +#if defined(HEDLEY_CONST_CAST) +# undef HEDLEY_CONST_CAST +#endif +#if defined(__cplusplus) +# define HEDLEY_CONST_CAST(T, expr) (const_cast(expr)) +#elif \ + 
HEDLEY_HAS_WARNING("-Wcast-qual") || \ + HEDLEY_GCC_VERSION_CHECK(4,6,0) || \ + HEDLEY_INTEL_VERSION_CHECK(13,0,0) +# define HEDLEY_CONST_CAST(T, expr) (__extension__ ({ \ + HEDLEY_DIAGNOSTIC_PUSH \ + HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL \ + ((T) (expr)); \ + HEDLEY_DIAGNOSTIC_POP \ + })) +#else +# define HEDLEY_CONST_CAST(T, expr) ((T) (expr)) +#endif + +#if defined(HEDLEY_REINTERPRET_CAST) +# undef HEDLEY_REINTERPRET_CAST +#endif +#if defined(__cplusplus) +# define HEDLEY_REINTERPRET_CAST(T, expr) (reinterpret_cast(expr)) +#else +# define HEDLEY_REINTERPRET_CAST(T, expr) ((T) (expr)) +#endif + +#if defined(HEDLEY_STATIC_CAST) +# undef HEDLEY_STATIC_CAST +#endif +#if defined(__cplusplus) +# define HEDLEY_STATIC_CAST(T, expr) (static_cast(expr)) +#else +# define HEDLEY_STATIC_CAST(T, expr) ((T) (expr)) +#endif + +#if defined(HEDLEY_CPP_CAST) +# undef HEDLEY_CPP_CAST +#endif +#if defined(__cplusplus) +# if HEDLEY_HAS_WARNING("-Wold-style-cast") +# define HEDLEY_CPP_CAST(T, expr) \ + HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wold-style-cast\"") \ + ((T) (expr)) \ + HEDLEY_DIAGNOSTIC_POP +# elif HEDLEY_IAR_VERSION_CHECK(8,3,0) +# define HEDLEY_CPP_CAST(T, expr) \ + HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("diag_suppress=Pe137") \ + HEDLEY_DIAGNOSTIC_POP +# else +# define HEDLEY_CPP_CAST(T, expr) ((T) (expr)) +# endif +#else +# define HEDLEY_CPP_CAST(T, expr) (expr) +#endif + #if defined(HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED) # undef HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED #endif @@ -599,13 +885,30 @@ # define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") #elif HEDLEY_INTEL_VERSION_CHECK(13,0,0) # define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("warning(disable:1478 1786)") +#elif HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED __pragma(warning(disable:1478 1786)) +#elif HEDLEY_PGI_VERSION_CHECK(20,7,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED 
_Pragma("diag_suppress 1215,1216,1444,1445") #elif HEDLEY_PGI_VERSION_CHECK(17,10,0) # define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1215,1444") #elif HEDLEY_GCC_VERSION_CHECK(4,3,0) # define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") #elif HEDLEY_MSVC_VERSION_CHECK(15,0,0) # define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED __pragma(warning(disable:4996)) -#elif HEDLEY_TI_VERSION_CHECK(8,0,0) +#elif HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) +# define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1215,1444") +#elif \ + HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) # define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1291,1718") #elif HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) && !defined(__cplusplus) # define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("error_messages(off,E_DEPRECATED_ATT,E_DEPRECATED_ATT_MESS)") @@ -626,20 +929,62 @@ # define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("clang diagnostic ignored \"-Wunknown-pragmas\"") #elif HEDLEY_INTEL_VERSION_CHECK(13,0,0) # define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("warning(disable:161)") +#elif HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS __pragma(warning(disable:161)) #elif HEDLEY_PGI_VERSION_CHECK(17,10,0) # define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS 
_Pragma("diag_suppress 1675") #elif HEDLEY_GCC_VERSION_CHECK(4,3,0) # define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("GCC diagnostic ignored \"-Wunknown-pragmas\"") #elif HEDLEY_MSVC_VERSION_CHECK(15,0,0) # define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS __pragma(warning(disable:4068)) -#elif HEDLEY_TI_VERSION_CHECK(8,0,0) +#elif \ + HEDLEY_TI_VERSION_CHECK(16,9,0) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,3,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 163") +#elif HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) # define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 163") #elif HEDLEY_IAR_VERSION_CHECK(8,0,0) # define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress=Pe161") +#elif HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 161") #else # define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS #endif +#if defined(HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES) +# undef HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES +#endif +#if HEDLEY_HAS_WARNING("-Wunknown-attributes") +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("clang diagnostic ignored \"-Wunknown-attributes\"") +#elif HEDLEY_GCC_VERSION_CHECK(4,6,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") +#elif HEDLEY_INTEL_VERSION_CHECK(17,0,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("warning(disable:1292)") +#elif HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES __pragma(warning(disable:1292)) +#elif HEDLEY_MSVC_VERSION_CHECK(19,0,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES __pragma(warning(disable:5030)) +#elif HEDLEY_PGI_VERSION_CHECK(20,7,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES 
_Pragma("diag_suppress 1097,1098") +#elif HEDLEY_PGI_VERSION_CHECK(17,10,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1097") +#elif HEDLEY_SUNPRO_VERSION_CHECK(5,14,0) && defined(__cplusplus) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("error_messages(off,attrskipunsup)") +#elif \ + HEDLEY_TI_VERSION_CHECK(18,1,0) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(8,3,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1173") +#elif HEDLEY_IAR_VERSION_CHECK(8,0,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress=Pe1097") +#elif HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1097") +#else +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES +#endif + #if defined(HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL) # undef HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL #endif @@ -653,40 +998,74 @@ # define HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL #endif +#if defined(HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION) +# undef HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION +#endif +#if HEDLEY_HAS_WARNING("-Wunused-function") +# define HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION _Pragma("clang diagnostic ignored \"-Wunused-function\"") +#elif HEDLEY_GCC_VERSION_CHECK(3,4,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION _Pragma("GCC diagnostic ignored \"-Wunused-function\"") +#elif HEDLEY_MSVC_VERSION_CHECK(1,0,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION __pragma(warning(disable:4505)) +#elif HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION _Pragma("diag_suppress 3142") +#else +# define HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION +#endif + #if defined(HEDLEY_DEPRECATED) # undef HEDLEY_DEPRECATED #endif #if defined(HEDLEY_DEPRECATED_FOR) # undef HEDLEY_DEPRECATED_FOR #endif -#if defined(__cplusplus) && (__cplusplus >= 201402L) -# 
define HEDLEY_DEPRECATED(since) [[deprecated("Since " #since)]] -# define HEDLEY_DEPRECATED_FOR(since, replacement) [[deprecated("Since " #since "; use " #replacement)]] +#if \ + HEDLEY_MSVC_VERSION_CHECK(14,0,0) || \ + HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) +# define HEDLEY_DEPRECATED(since) __declspec(deprecated("Since " # since)) +# define HEDLEY_DEPRECATED_FOR(since, replacement) __declspec(deprecated("Since " #since "; use " #replacement)) #elif \ - HEDLEY_HAS_EXTENSION(attribute_deprecated_with_message) || \ + (HEDLEY_HAS_EXTENSION(attribute_deprecated_with_message) && !defined(HEDLEY_IAR_VERSION)) || \ HEDLEY_GCC_VERSION_CHECK(4,5,0) || \ HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ HEDLEY_ARM_VERSION_CHECK(5,6,0) || \ HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) || \ HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ - HEDLEY_TI_VERSION_CHECK(8,3,0) + HEDLEY_TI_VERSION_CHECK(18,1,0) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(18,1,0) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(8,3,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,3,0) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) # define HEDLEY_DEPRECATED(since) __attribute__((__deprecated__("Since " #since))) # define HEDLEY_DEPRECATED_FOR(since, replacement) __attribute__((__deprecated__("Since " #since "; use " #replacement))) +#elif defined(__cplusplus) && (__cplusplus >= 201402L) +# define HEDLEY_DEPRECATED(since) HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[deprecated("Since " #since)]]) +# define HEDLEY_DEPRECATED_FOR(since, replacement) HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[deprecated("Since " #since "; use " #replacement)]]) #elif \ HEDLEY_HAS_ATTRIBUTE(deprecated) || \ HEDLEY_GCC_VERSION_CHECK(3,1,0) || \ HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_TI_VERSION_CHECK(8,0,0) || \ - (HEDLEY_TI_VERSION_CHECK(7,3,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) + HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + 
HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) || \ + HEDLEY_IAR_VERSION_CHECK(8,10,0) # define HEDLEY_DEPRECATED(since) __attribute__((__deprecated__)) # define HEDLEY_DEPRECATED_FOR(since, replacement) __attribute__((__deprecated__)) -#elif HEDLEY_MSVC_VERSION_CHECK(14,0,0) -# define HEDLEY_DEPRECATED(since) __declspec(deprecated("Since " # since)) -# define HEDLEY_DEPRECATED_FOR(since, replacement) __declspec(deprecated("Since " #since "; use " #replacement)) #elif \ HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \ - HEDLEY_PELLES_VERSION_CHECK(6,50,0) -# define HEDLEY_DEPRECATED(since) _declspec(deprecated) + HEDLEY_PELLES_VERSION_CHECK(6,50,0) || \ + HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) +# define HEDLEY_DEPRECATED(since) __declspec(deprecated) # define HEDLEY_DEPRECATED_FOR(since, replacement) __declspec(deprecated) #elif HEDLEY_IAR_VERSION_CHECK(8,0,0) # define HEDLEY_DEPRECATED(since) _Pragma("deprecated") @@ -702,7 +1081,8 @@ #if \ HEDLEY_HAS_ATTRIBUTE(warning) || \ HEDLEY_GCC_VERSION_CHECK(4,3,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) + HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) # define HEDLEY_UNAVAILABLE(available_since) __attribute__((__warning__("Not available until " #available_since))) #else # define HEDLEY_UNAVAILABLE(available_since) @@ -711,21 +1091,41 @@ #if defined(HEDLEY_WARN_UNUSED_RESULT) # undef HEDLEY_WARN_UNUSED_RESULT #endif -#if defined(__cplusplus) && (__cplusplus >= 201703L) -# define 
HEDLEY_WARN_UNUSED_RESULT [[nodiscard]] -#elif \ +#if defined(HEDLEY_WARN_UNUSED_RESULT_MSG) +# undef HEDLEY_WARN_UNUSED_RESULT_MSG +#endif +#if \ HEDLEY_HAS_ATTRIBUTE(warn_unused_result) || \ HEDLEY_GCC_VERSION_CHECK(3,4,0) || \ HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_TI_VERSION_CHECK(8,0,0) || \ - (HEDLEY_TI_VERSION_CHECK(7,3,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ (HEDLEY_SUNPRO_VERSION_CHECK(5,15,0) && defined(__cplusplus)) || \ - HEDLEY_PGI_VERSION_CHECK(17,10,0) + HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) # define HEDLEY_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__)) +# define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) __attribute__((__warn_unused_result__)) +#elif (HEDLEY_HAS_CPP_ATTRIBUTE(nodiscard) >= 201907L) +# define HEDLEY_WARN_UNUSED_RESULT HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]]) +# define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard(msg)]]) +#elif HEDLEY_HAS_CPP_ATTRIBUTE(nodiscard) +# define HEDLEY_WARN_UNUSED_RESULT HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]]) +# define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]]) #elif defined(_Check_return_) /* SAL */ # define HEDLEY_WARN_UNUSED_RESULT _Check_return_ +# define 
HEDLEY_WARN_UNUSED_RESULT_MSG(msg) _Check_return_ #else # define HEDLEY_WARN_UNUSED_RESULT +# define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) #endif #if defined(HEDLEY_SENTINEL) @@ -735,7 +1135,8 @@ HEDLEY_HAS_ATTRIBUTE(sentinel) || \ HEDLEY_GCC_VERSION_CHECK(4,0,0) || \ HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_ARM_VERSION_CHECK(5,4,0) + HEDLEY_ARM_VERSION_CHECK(5,4,0) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) # define HEDLEY_SENTINEL(position) __attribute__((__sentinel__(position))) #else # define HEDLEY_SENTINEL(position) @@ -746,24 +1147,40 @@ #endif #if HEDLEY_IAR_VERSION_CHECK(8,0,0) # define HEDLEY_NO_RETURN __noreturn -#elif HEDLEY_INTEL_VERSION_CHECK(13,0,0) +#elif \ + HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) # define HEDLEY_NO_RETURN __attribute__((__noreturn__)) #elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L # define HEDLEY_NO_RETURN _Noreturn #elif defined(__cplusplus) && (__cplusplus >= 201103L) -# define HEDLEY_NO_RETURN [[noreturn]] +# define HEDLEY_NO_RETURN HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[noreturn]]) #elif \ HEDLEY_HAS_ATTRIBUTE(noreturn) || \ HEDLEY_GCC_VERSION_CHECK(3,2,0) || \ HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ - HEDLEY_TI_VERSION_CHECK(18,0,0) || \ - (HEDLEY_TI_VERSION_CHECK(17,3,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) + HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + 
HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + HEDLEY_IAR_VERSION_CHECK(8,10,0) # define HEDLEY_NO_RETURN __attribute__((__noreturn__)) -#elif HEDLEY_MSVC_VERSION_CHECK(13,10,0) +#elif HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) +# define HEDLEY_NO_RETURN _Pragma("does_not_return") +#elif \ + HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \ + HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) # define HEDLEY_NO_RETURN __declspec(noreturn) -#elif HEDLEY_TI_VERSION_CHECK(6,0,0) && defined(__cplusplus) +#elif HEDLEY_TI_CL6X_VERSION_CHECK(6,0,0) && defined(__cplusplus) # define HEDLEY_NO_RETURN _Pragma("FUNC_NEVER_RETURNS;") #elif HEDLEY_COMPCERT_VERSION_CHECK(3,2,0) # define HEDLEY_NO_RETURN __attribute((noreturn)) @@ -773,67 +1190,82 @@ # define HEDLEY_NO_RETURN #endif +#if defined(HEDLEY_NO_ESCAPE) +# undef HEDLEY_NO_ESCAPE +#endif +#if HEDLEY_HAS_ATTRIBUTE(noescape) +# define HEDLEY_NO_ESCAPE __attribute__((__noescape__)) +#else +# define HEDLEY_NO_ESCAPE +#endif + #if defined(HEDLEY_UNREACHABLE) # undef HEDLEY_UNREACHABLE #endif #if defined(HEDLEY_UNREACHABLE_RETURN) # undef HEDLEY_UNREACHABLE_RETURN #endif -#if \ - (HEDLEY_HAS_BUILTIN(__builtin_unreachable) && (!defined(HEDLEY_ARM_VERSION))) || \ - HEDLEY_GCC_VERSION_CHECK(4,5,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_IBM_VERSION_CHECK(13,1,5) -# define HEDLEY_UNREACHABLE() __builtin_unreachable() -#elif HEDLEY_MSVC_VERSION_CHECK(13,10,0) -# define HEDLEY_UNREACHABLE() __assume(0) -#elif HEDLEY_TI_VERSION_CHECK(6,0,0) -# if defined(__cplusplus) -# define HEDLEY_UNREACHABLE() std::_nassert(0) -# else -# define HEDLEY_UNREACHABLE() _nassert(0) -# endif -# define HEDLEY_UNREACHABLE_RETURN(value) return value -#elif defined(EXIT_FAILURE) -# define HEDLEY_UNREACHABLE() abort() -#else -# define HEDLEY_UNREACHABLE() -# define HEDLEY_UNREACHABLE_RETURN(value) return value -#endif -#if !defined(HEDLEY_UNREACHABLE_RETURN) -# define HEDLEY_UNREACHABLE_RETURN(value) HEDLEY_UNREACHABLE() -#endif - 
#if defined(HEDLEY_ASSUME) # undef HEDLEY_ASSUME #endif #if \ HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) + HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) # define HEDLEY_ASSUME(expr) __assume(expr) #elif HEDLEY_HAS_BUILTIN(__builtin_assume) # define HEDLEY_ASSUME(expr) __builtin_assume(expr) -#elif HEDLEY_TI_VERSION_CHECK(6,0,0) +#elif \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0) # if defined(__cplusplus) # define HEDLEY_ASSUME(expr) std::_nassert(expr) # else # define HEDLEY_ASSUME(expr) _nassert(expr) # endif -#elif \ - (HEDLEY_HAS_BUILTIN(__builtin_unreachable) && !defined(HEDLEY_ARM_VERSION)) || \ +#endif +#if \ + (HEDLEY_HAS_BUILTIN(__builtin_unreachable) && (!defined(HEDLEY_ARM_VERSION))) || \ HEDLEY_GCC_VERSION_CHECK(4,5,0) || \ + HEDLEY_PGI_VERSION_CHECK(18,10,0) || \ HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_IBM_VERSION_CHECK(13,1,5) -# define HEDLEY_ASSUME(expr) ((void) ((expr) ? 1 : (__builtin_unreachable(), 1))) + HEDLEY_IBM_VERSION_CHECK(13,1,5) || \ + HEDLEY_CRAY_VERSION_CHECK(10,0,0) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) +# define HEDLEY_UNREACHABLE() __builtin_unreachable() +#elif defined(HEDLEY_ASSUME) +# define HEDLEY_UNREACHABLE() HEDLEY_ASSUME(0) +#endif +#if !defined(HEDLEY_ASSUME) +# if defined(HEDLEY_UNREACHABLE) +# define HEDLEY_ASSUME(expr) HEDLEY_STATIC_CAST(void, ((expr) ? 
1 : (HEDLEY_UNREACHABLE(), 1))) +# else +# define HEDLEY_ASSUME(expr) HEDLEY_STATIC_CAST(void, expr) +# endif +#endif +#if defined(HEDLEY_UNREACHABLE) +# if \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0) +# define HEDLEY_UNREACHABLE_RETURN(value) return (HEDLEY_STATIC_CAST(void, HEDLEY_ASSUME(0)), (value)) +# else +# define HEDLEY_UNREACHABLE_RETURN(value) HEDLEY_UNREACHABLE() +# endif #else -# define HEDLEY_ASSUME(expr) ((void) (expr)) +# define HEDLEY_UNREACHABLE_RETURN(value) return (value) +#endif +#if !defined(HEDLEY_UNREACHABLE) +# define HEDLEY_UNREACHABLE() HEDLEY_ASSUME(0) #endif - HEDLEY_DIAGNOSTIC_PUSH -#if \ - HEDLEY_HAS_WARNING("-Wvariadic-macros") || \ - HEDLEY_GCC_VERSION_CHECK(4,0,0) +#if HEDLEY_HAS_WARNING("-Wpedantic") +# pragma clang diagnostic ignored "-Wpedantic" +#endif +#if HEDLEY_HAS_WARNING("-Wc++98-compat-pedantic") && defined(__cplusplus) +# pragma clang diagnostic ignored "-Wc++98-compat-pedantic" +#endif +#if HEDLEY_GCC_HAS_WARNING("-Wvariadic-macros",4,0,0) # if defined(__clang__) # pragma clang diagnostic ignored "-Wvariadic-macros" # elif defined(HEDLEY_GCC_VERSION) @@ -867,8 +1299,18 @@ HEDLEY_DIAGNOSTIC_POP HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ HEDLEY_ARM_VERSION_CHECK(5,6,0) || \ HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ - HEDLEY_TI_VERSION_CHECK(8,0,0) || \ - (HEDLEY_TI_VERSION_CHECK(7,3,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) + HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + 
HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) # define HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(__printf__, string_idx, first_to_check))) #elif HEDLEY_PELLES_VERSION_CHECK(6,0,0) # define HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __declspec(vaformat(printf,string_idx,first_to_check)) @@ -881,7 +1323,7 @@ HEDLEY_DIAGNOSTIC_POP #endif #if defined(__cplusplus) # if __cplusplus >= 201103L -# define HEDLEY_CONSTEXPR constexpr +# define HEDLEY_CONSTEXPR HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(constexpr) # endif #endif #if !defined(HEDLEY_CONSTEXPR) @@ -901,44 +1343,50 @@ HEDLEY_DIAGNOSTIC_POP # undef HEDLEY_UNPREDICTABLE #endif #if HEDLEY_HAS_BUILTIN(__builtin_unpredictable) -# define HEDLEY_UNPREDICTABLE(expr) __builtin_unpredictable(!!(expr)) +# define HEDLEY_UNPREDICTABLE(expr) __builtin_unpredictable((expr)) #endif #if \ - HEDLEY_HAS_BUILTIN(__builtin_expect_with_probability) || \ - HEDLEY_GCC_VERSION_CHECK(9,0,0) -# define HEDLEY_PREDICT(expr, value, probability) __builtin_expect_with_probability(expr, value, probability) -# define HEDLEY_PREDICT_TRUE(expr, probability) __builtin_expect_with_probability(!!(expr), 1, probability) -# define HEDLEY_PREDICT_FALSE(expr, probability) __builtin_expect_with_probability(!!(expr), 0, probability) -# define HEDLEY_LIKELY(expr) __builtin_expect(!!(expr), 1) -# define HEDLEY_UNLIKELY(expr) __builtin_expect(!!(expr), 0) -# if !defined(HEDLEY_BUILTIN_UNPREDICTABLE) -# define HEDLEY_BUILTIN_UNPREDICTABLE(expr) __builtin_expect_with_probability(!!(expr), 1, 0.5) -# endif + (HEDLEY_HAS_BUILTIN(__builtin_expect_with_probability) && !defined(HEDLEY_PGI_VERSION)) || \ + HEDLEY_GCC_VERSION_CHECK(9,0,0) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) +# define HEDLEY_PREDICT(expr, value, probability) __builtin_expect_with_probability( (expr), (value), (probability)) +# define HEDLEY_PREDICT_TRUE(expr, probability) 
__builtin_expect_with_probability(!!(expr), 1 , (probability)) +# define HEDLEY_PREDICT_FALSE(expr, probability) __builtin_expect_with_probability(!!(expr), 0 , (probability)) +# define HEDLEY_LIKELY(expr) __builtin_expect (!!(expr), 1 ) +# define HEDLEY_UNLIKELY(expr) __builtin_expect (!!(expr), 0 ) #elif \ - HEDLEY_HAS_BUILTIN(__builtin_expect) || \ + (HEDLEY_HAS_BUILTIN(__builtin_expect) && !defined(HEDLEY_INTEL_CL_VERSION)) || \ HEDLEY_GCC_VERSION_CHECK(3,0,0) || \ HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ (HEDLEY_SUNPRO_VERSION_CHECK(5,15,0) && defined(__cplusplus)) || \ HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ - HEDLEY_TI_VERSION_CHECK(6,1,0) || \ - HEDLEY_TINYC_VERSION_CHECK(0,9,27) + HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(4,7,0) || \ + HEDLEY_TI_CL430_VERSION_CHECK(3,1,0) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,1,0) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + HEDLEY_TINYC_VERSION_CHECK(0,9,27) || \ + HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) # define HEDLEY_PREDICT(expr, expected, probability) \ - (((probability) >= 0.9) ? __builtin_expect(!!(expr), (expected)) : (((void) (expected)), !!(expr))) + (((probability) >= 0.9) ? __builtin_expect((expr), (expected)) : (HEDLEY_STATIC_CAST(void, expected), (expr))) # define HEDLEY_PREDICT_TRUE(expr, probability) \ (__extension__ ({ \ - HEDLEY_CONSTEXPR double hedley_probability_ = (probability); \ + double hedley_probability_ = (probability); \ ((hedley_probability_ >= 0.9) ? __builtin_expect(!!(expr), 1) : ((hedley_probability_ <= 0.1) ? __builtin_expect(!!(expr), 0) : !!(expr))); \ })) # define HEDLEY_PREDICT_FALSE(expr, probability) \ (__extension__ ({ \ - HEDLEY_CONSTEXPR double hedley_probability_ = (probability); \ + double hedley_probability_ = (probability); \ ((hedley_probability_ >= 0.9) ? 
__builtin_expect(!!(expr), 0) : ((hedley_probability_ <= 0.1) ? __builtin_expect(!!(expr), 1) : !!(expr))); \ })) # define HEDLEY_LIKELY(expr) __builtin_expect(!!(expr), 1) # define HEDLEY_UNLIKELY(expr) __builtin_expect(!!(expr), 0) #else -# define HEDLEY_PREDICT(expr, expected, probability) (((void) (expected)), !!(expr)) +# define HEDLEY_PREDICT(expr, expected, probability) (HEDLEY_STATIC_CAST(void, expected), (expr)) # define HEDLEY_PREDICT_TRUE(expr, probability) (!!(expr)) # define HEDLEY_PREDICT_FALSE(expr, probability) (!!(expr)) # define HEDLEY_LIKELY(expr) (!!(expr)) @@ -958,10 +1406,24 @@ HEDLEY_DIAGNOSTIC_POP HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ HEDLEY_IBM_VERSION_CHECK(12,1,0) || \ - HEDLEY_TI_VERSION_CHECK(8,0,0) || \ - (HEDLEY_TI_VERSION_CHECK(7,3,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) + HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) # define HEDLEY_MALLOC __attribute__((__malloc__)) -#elif HEDLEY_MSVC_VERSION_CHECK(14, 0, 0) +#elif HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) +# define HEDLEY_MALLOC _Pragma("returns_new_memory") +#elif \ + HEDLEY_MSVC_VERSION_CHECK(14,0,0) || \ + HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) # define HEDLEY_MALLOC __declspec(restrict) #else # define HEDLEY_MALLOC @@ -977,11 +1439,28 @@ HEDLEY_DIAGNOSTIC_POP HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ 
HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ - HEDLEY_TI_VERSION_CHECK(8,0,0) || \ - (HEDLEY_TI_VERSION_CHECK(7,3,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_PGI_VERSION_CHECK(17,10,0) + HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) # define HEDLEY_PURE __attribute__((__pure__)) -#elif HEDLEY_TI_VERSION_CHECK(6,0,0) && defined(__cplusplus) +#elif HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) +# define HEDLEY_PURE _Pragma("does_not_write_global_data") +#elif defined(__cplusplus) && \ + ( \ + HEDLEY_TI_CL430_VERSION_CHECK(2,0,1) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) \ + ) # define HEDLEY_PURE _Pragma("FUNC_IS_PURE;") #else # define HEDLEY_PURE @@ -997,10 +1476,23 @@ HEDLEY_DIAGNOSTIC_POP HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ - HEDLEY_TI_VERSION_CHECK(8,0,0) || \ - (HEDLEY_TI_VERSION_CHECK(7,3,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_PGI_VERSION_CHECK(17,10,0) + HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + 
HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) # define HEDLEY_CONST __attribute__((__const__)) +#elif \ + HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) +# define HEDLEY_CONST _Pragma("no_side_effect") #else # define HEDLEY_CONST HEDLEY_PURE #endif @@ -1014,13 +1506,18 @@ HEDLEY_DIAGNOSTIC_POP HEDLEY_GCC_VERSION_CHECK(3,1,0) || \ HEDLEY_MSVC_VERSION_CHECK(14,0,0) || \ HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) || \ HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ - HEDLEY_TI_VERSION_CHECK(8,0,0) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,2,4) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(8,1,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ (HEDLEY_SUNPRO_VERSION_CHECK(5,14,0) && defined(__cplusplus)) || \ HEDLEY_IAR_VERSION_CHECK(8,0,0) || \ - defined(__clang__) + defined(__clang__) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) # define HEDLEY_RESTRICT __restrict #elif HEDLEY_SUNPRO_VERSION_CHECK(5,3,0) && !defined(__cplusplus) # define HEDLEY_RESTRICT _Restrict @@ -1041,8 +1538,15 @@ HEDLEY_DIAGNOSTIC_POP # define HEDLEY_INLINE __inline__ #elif \ HEDLEY_MSVC_VERSION_CHECK(12,0,0) || \ + HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) || \ HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_TI_VERSION_CHECK(8,0,0) + HEDLEY_TI_ARMCL_VERSION_CHECK(5,1,0) || \ + HEDLEY_TI_CL430_VERSION_CHECK(3,1,0) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + 
HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) # define HEDLEY_INLINE __inline #else # define HEDLEY_INLINE @@ -1058,12 +1562,33 @@ HEDLEY_DIAGNOSTIC_POP HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ - HEDLEY_TI_VERSION_CHECK(8,0,0) || \ - (HEDLEY_TI_VERSION_CHECK(7,3,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) + HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) || \ + HEDLEY_IAR_VERSION_CHECK(8,10,0) # define HEDLEY_ALWAYS_INLINE __attribute__((__always_inline__)) HEDLEY_INLINE -#elif HEDLEY_MSVC_VERSION_CHECK(12,0,0) +#elif \ + HEDLEY_MSVC_VERSION_CHECK(12,0,0) || \ + HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) # define HEDLEY_ALWAYS_INLINE __forceinline -#elif HEDLEY_TI_VERSION_CHECK(7,0,0) && defined(__cplusplus) +#elif defined(__cplusplus) && \ + ( \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) \ + ) # define HEDLEY_ALWAYS_INLINE _Pragma("FUNC_ALWAYS_INLINE;") #elif HEDLEY_IAR_VERSION_CHECK(8,0,0) # define HEDLEY_ALWAYS_INLINE _Pragma("inline=forced") @@ -1081,14 +1606,27 @@ HEDLEY_DIAGNOSTIC_POP 
HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ - HEDLEY_TI_VERSION_CHECK(8,0,0) || \ - (HEDLEY_TI_VERSION_CHECK(7,3,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) + HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) || \ + HEDLEY_IAR_VERSION_CHECK(8,10,0) # define HEDLEY_NEVER_INLINE __attribute__((__noinline__)) -#elif HEDLEY_MSVC_VERSION_CHECK(13,10,0) +#elif \ + HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \ + HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) # define HEDLEY_NEVER_INLINE __declspec(noinline) #elif HEDLEY_PGI_VERSION_CHECK(10,2,0) # define HEDLEY_NEVER_INLINE _Pragma("noinline") -#elif HEDLEY_TI_VERSION_CHECK(6,0,0) && defined(__cplusplus) +#elif HEDLEY_TI_CL6X_VERSION_CHECK(6,0,0) && defined(__cplusplus) # define HEDLEY_NEVER_INLINE _Pragma("FUNC_CANNOT_INLINE;") #elif HEDLEY_IAR_VERSION_CHECK(8,0,0) # define HEDLEY_NEVER_INLINE _Pragma("inline=never") @@ -1121,8 +1659,14 @@ HEDLEY_DIAGNOSTIC_POP HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ HEDLEY_IBM_VERSION_CHECK(13,1,0) || \ - HEDLEY_TI_VERSION_CHECK(8,0,0) || \ - (HEDLEY_TI_VERSION_CHECK(7,3,0) && defined(__TI_EABI__) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) + ( \ + defined(__TI_EABI__) && \ + ( \ + (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + 
HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) \ + ) \ + ) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) # define HEDLEY_PRIVATE __attribute__((__visibility__("hidden"))) # define HEDLEY_PUBLIC __attribute__((__visibility__("default"))) # else @@ -1138,10 +1682,12 @@ HEDLEY_DIAGNOSTIC_POP #if \ HEDLEY_HAS_ATTRIBUTE(nothrow) || \ HEDLEY_GCC_VERSION_CHECK(3,3,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) + HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) # define HEDLEY_NO_THROW __attribute__((__nothrow__)) #elif \ HEDLEY_MSVC_VERSION_CHECK(13,1,0) || \ + HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) || \ HEDLEY_ARM_VERSION_CHECK(4,1,0) # define HEDLEY_NO_THROW __declspec(nothrow) #else @@ -1149,30 +1695,21 @@ HEDLEY_DIAGNOSTIC_POP #endif #if defined(HEDLEY_FALL_THROUGH) -# undef HEDLEY_FALL_THROUGH +# undef HEDLEY_FALL_THROUGH #endif #if \ - defined(__cplusplus) && \ - (!defined(HEDLEY_SUNPRO_VERSION) || HEDLEY_SUNPRO_VERSION_CHECK(5,15,0)) && \ - !defined(HEDLEY_PGI_VERSION) -# if \ - (__cplusplus >= 201703L) || \ - ((__cplusplus >= 201103L) && HEDLEY_HAS_CPP_ATTRIBUTE(fallthrough)) -# define HEDLEY_FALL_THROUGH [[fallthrough]] -# elif (__cplusplus >= 201103L) && HEDLEY_HAS_CPP_ATTRIBUTE(clang::fallthrough) -# define HEDLEY_FALL_THROUGH [[clang::fallthrough]] -# elif (__cplusplus >= 201103L) && HEDLEY_GCC_VERSION_CHECK(7,0,0) -# define HEDLEY_FALL_THROUGH [[gnu::fallthrough]] -# endif -#endif -#if !defined(HEDLEY_FALL_THROUGH) -# if HEDLEY_GNUC_HAS_ATTRIBUTE(fallthrough,7,0,0) && !defined(HEDLEY_PGI_VERSION) -# define HEDLEY_FALL_THROUGH __attribute__((__fallthrough__)) -# elif defined(__fallthrough) /* SAL */ -# define HEDLEY_FALL_THROUGH __fallthrough -# else -# define HEDLEY_FALL_THROUGH -# endif + HEDLEY_HAS_ATTRIBUTE(fallthrough) || \ + HEDLEY_GCC_VERSION_CHECK(7,0,0) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) +# define HEDLEY_FALL_THROUGH __attribute__((__fallthrough__)) +#elif HEDLEY_HAS_CPP_ATTRIBUTE_NS(clang,fallthrough) +# define 
HEDLEY_FALL_THROUGH HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[clang::fallthrough]]) +#elif HEDLEY_HAS_CPP_ATTRIBUTE(fallthrough) +# define HEDLEY_FALL_THROUGH HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[fallthrough]]) +#elif defined(__fallthrough) /* SAL */ +# define HEDLEY_FALL_THROUGH __fallthrough +#else +# define HEDLEY_FALL_THROUGH #endif #if defined(HEDLEY_RETURNS_NON_NULL) @@ -1180,7 +1717,8 @@ HEDLEY_DIAGNOSTIC_POP #endif #if \ HEDLEY_HAS_ATTRIBUTE(returns_nonnull) || \ - HEDLEY_GCC_VERSION_CHECK(4,9,0) + HEDLEY_GCC_VERSION_CHECK(4,9,0) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) # define HEDLEY_RETURNS_NON_NULL __attribute__((__returns_nonnull__)) #elif defined(_Ret_notnull_) /* SAL */ # define HEDLEY_RETURNS_NON_NULL _Ret_notnull_ @@ -1208,12 +1746,11 @@ HEDLEY_DIAGNOSTIC_POP #if defined(HEDLEY_REQUIRE_CONSTEXPR) # undef HEDLEY_REQUIRE_CONSTEXPR #endif -/* Note the double-underscore. For internal use only; no API - * guarantees! */ -#if defined(HEDLEY__IS_CONSTEXPR) -# undef HEDLEY__IS_CONSTEXPR +/* HEDLEY_IS_CONSTEXPR_ is for + HEDLEY INTERNAL USE ONLY. API subject to change without notice. 
*/ +#if defined(HEDLEY_IS_CONSTEXPR_) +# undef HEDLEY_IS_CONSTEXPR_ #endif - #if \ HEDLEY_HAS_BUILTIN(__builtin_constant_p) || \ HEDLEY_GCC_VERSION_CHECK(3,4,0) || \ @@ -1221,9 +1758,10 @@ HEDLEY_DIAGNOSTIC_POP HEDLEY_TINYC_VERSION_CHECK(0,9,19) || \ HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ HEDLEY_IBM_VERSION_CHECK(13,1,0) || \ - HEDLEY_TI_VERSION_CHECK(6,1,0) || \ - HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) || \ - HEDLEY_CRAY_VERSION_CHECK(8,1,0) + HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \ + (HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) && !defined(__cplusplus)) || \ + HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) # define HEDLEY_IS_CONSTANT(expr) __builtin_constant_p(expr) #endif #if !defined(__cplusplus) @@ -1236,31 +1774,40 @@ HEDLEY_DIAGNOSTIC_POP HEDLEY_ARM_VERSION_CHECK(5,4,0) || \ HEDLEY_TINYC_VERSION_CHECK(0,9,24) # if defined(__INTPTR_TYPE__) -# define HEDLEY__IS_CONSTEXPR(expr) __builtin_types_compatible_p(__typeof__((1 ? (void*) ((__INTPTR_TYPE__) ((expr) * 0)) : (int*) 0)), int*) +# define HEDLEY_IS_CONSTEXPR_(expr) __builtin_types_compatible_p(__typeof__((1 ? (void*) ((__INTPTR_TYPE__) ((expr) * 0)) : (int*) 0)), int*) # else # include -# define HEDLEY__IS_CONSTEXPR(expr) __builtin_types_compatible_p(__typeof__((1 ? (void*) ((intptr_t) ((expr) * 0)) : (int*) 0)), int*) +# define HEDLEY_IS_CONSTEXPR_(expr) __builtin_types_compatible_p(__typeof__((1 ? 
(void*) ((intptr_t) ((expr) * 0)) : (int*) 0)), int*) # endif # elif \ - (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && !defined(HEDLEY_SUNPRO_VERSION) && !defined(HEDLEY_PGI_VERSION)) || \ - HEDLEY_HAS_EXTENSION(c_generic_selections) || \ + ( \ + defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \ + !defined(HEDLEY_SUNPRO_VERSION) && \ + !defined(HEDLEY_PGI_VERSION) && \ + !defined(HEDLEY_IAR_VERSION)) || \ + (HEDLEY_HAS_EXTENSION(c_generic_selections) && !defined(HEDLEY_IAR_VERSION)) || \ HEDLEY_GCC_VERSION_CHECK(4,9,0) || \ HEDLEY_INTEL_VERSION_CHECK(17,0,0) || \ HEDLEY_IBM_VERSION_CHECK(12,1,0) || \ HEDLEY_ARM_VERSION_CHECK(5,3,0) # if defined(__INTPTR_TYPE__) -# define HEDLEY__IS_CONSTEXPR(expr) _Generic((1 ? (void*) ((__INTPTR_TYPE__) ((expr) * 0)) : (int*) 0), int*: 1, void*: 0) +# define HEDLEY_IS_CONSTEXPR_(expr) _Generic((1 ? (void*) ((__INTPTR_TYPE__) ((expr) * 0)) : (int*) 0), int*: 1, void*: 0) # else # include -# define HEDLEY__IS_CONSTEXPR(expr) _Generic((1 ? (void*) ((intptr_t) * 0) : (int*) 0), int*: 1, void*: 0) +# define HEDLEY_IS_CONSTEXPR_(expr) _Generic((1 ? (void*) ((intptr_t) * 0) : (int*) 0), int*: 1, void*: 0) # endif # elif \ defined(HEDLEY_GCC_VERSION) || \ defined(HEDLEY_INTEL_VERSION) || \ defined(HEDLEY_TINYC_VERSION) || \ - defined(HEDLEY_TI_VERSION) || \ + defined(HEDLEY_TI_ARMCL_VERSION) || \ + HEDLEY_TI_CL430_VERSION_CHECK(18,12,0) || \ + defined(HEDLEY_TI_CL2000_VERSION) || \ + defined(HEDLEY_TI_CL6X_VERSION) || \ + defined(HEDLEY_TI_CL7X_VERSION) || \ + defined(HEDLEY_TI_CLPRU_VERSION) || \ defined(__clang__) -# define HEDLEY__IS_CONSTEXPR(expr) ( \ +# define HEDLEY_IS_CONSTEXPR_(expr) ( \ sizeof(void) != \ sizeof(*( \ 1 ? 
\ @@ -1271,11 +1818,11 @@ HEDLEY_DIAGNOSTIC_POP ) # endif #endif -#if defined(HEDLEY__IS_CONSTEXPR) +#if defined(HEDLEY_IS_CONSTEXPR_) # if !defined(HEDLEY_IS_CONSTANT) -# define HEDLEY_IS_CONSTANT(expr) HEDLEY__IS_CONSTEXPR(expr) +# define HEDLEY_IS_CONSTANT(expr) HEDLEY_IS_CONSTEXPR_(expr) # endif -# define HEDLEY_REQUIRE_CONSTEXPR(expr) (HEDLEY__IS_CONSTEXPR(expr) ? (expr) : (-1)) +# define HEDLEY_REQUIRE_CONSTEXPR(expr) (HEDLEY_IS_CONSTEXPR_(expr) ? (expr) : (-1)) #else # if !defined(HEDLEY_IS_CONSTANT) # define HEDLEY_IS_CONSTANT(expr) (0) @@ -1308,67 +1855,36 @@ HEDLEY_DIAGNOSTIC_POP #if \ !defined(__cplusplus) && ( \ (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || \ - HEDLEY_HAS_FEATURE(c_static_assert) || \ + (HEDLEY_HAS_FEATURE(c_static_assert) && !defined(HEDLEY_INTEL_CL_VERSION)) || \ HEDLEY_GCC_VERSION_CHECK(6,0,0) || \ HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ defined(_Static_assert) \ ) # define HEDLEY_STATIC_ASSERT(expr, message) _Static_assert(expr, message) #elif \ - (defined(__cplusplus) && (__cplusplus >= 201703L)) || \ + (defined(__cplusplus) && (__cplusplus >= 201103L)) || \ HEDLEY_MSVC_VERSION_CHECK(16,0,0) || \ - (defined(__cplusplus) && HEDLEY_TI_VERSION_CHECK(8,3,0)) -# define HEDLEY_STATIC_ASSERT(expr, message) static_assert(expr, message) -#elif defined(__cplusplus) && (__cplusplus >= 201103L) -# define HEDLEY_STATIC_ASSERT(expr, message) static_assert(expr) + HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) +# define HEDLEY_STATIC_ASSERT(expr, message) HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(static_assert(expr, message)) #else # define HEDLEY_STATIC_ASSERT(expr, message) #endif -#if defined(HEDLEY_CONST_CAST) -# undef HEDLEY_CONST_CAST -#endif -#if defined(__cplusplus) -# define HEDLEY_CONST_CAST(T, expr) (const_cast(expr)) -#elif \ - HEDLEY_HAS_WARNING("-Wcast-qual") || \ - HEDLEY_GCC_VERSION_CHECK(4,6,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) -# define HEDLEY_CONST_CAST(T, expr) (__extension__ ({ \ - HEDLEY_DIAGNOSTIC_PUSH 
\ - HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL \ - ((T) (expr)); \ - HEDLEY_DIAGNOSTIC_POP \ - })) -#else -# define HEDLEY_CONST_CAST(T, expr) ((T) (expr)) -#endif - -#if defined(HEDLEY_REINTERPRET_CAST) -# undef HEDLEY_REINTERPRET_CAST +#if defined(HEDLEY_NULL) +# undef HEDLEY_NULL #endif #if defined(__cplusplus) -# define HEDLEY_REINTERPRET_CAST(T, expr) (reinterpret_cast(expr)) -#else -# define HEDLEY_REINTERPRET_CAST(T, expr) (*((T*) &(expr))) -#endif - -#if defined(HEDLEY_STATIC_CAST) -# undef HEDLEY_STATIC_CAST -#endif -#if defined(__cplusplus) -# define HEDLEY_STATIC_CAST(T, expr) (static_cast(expr)) -#else -# define HEDLEY_STATIC_CAST(T, expr) ((T) (expr)) -#endif - -#if defined(HEDLEY_CPP_CAST) -# undef HEDLEY_CPP_CAST -#endif -#if defined(__cplusplus) -# define HEDLEY_CPP_CAST(T, expr) static_cast(expr) +# if __cplusplus >= 201103L +# define HEDLEY_NULL HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(nullptr) +# elif defined(NULL) +# define HEDLEY_NULL NULL +# else +# define HEDLEY_NULL HEDLEY_STATIC_CAST(void*, 0) +# endif +#elif defined(NULL) +# define HEDLEY_NULL NULL #else -# define HEDLEY_CPP_CAST(T, expr) (expr) +# define HEDLEY_NULL ((void*) 0) #endif #if defined(HEDLEY_MESSAGE) @@ -1405,41 +1921,51 @@ HEDLEY_DIAGNOSTIC_POP HEDLEY_DIAGNOSTIC_POP #elif \ HEDLEY_GCC_VERSION_CHECK(4,8,0) || \ - HEDLEY_PGI_VERSION_CHECK(18,4,0) + HEDLEY_PGI_VERSION_CHECK(18,4,0) || \ + HEDLEY_INTEL_VERSION_CHECK(13,0,0) # define HEDLEY_WARNING(msg) HEDLEY_PRAGMA(GCC warning msg) -#elif HEDLEY_MSVC_VERSION_CHECK(15,0,0) +#elif \ + HEDLEY_MSVC_VERSION_CHECK(15,0,0) || \ + HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) # define HEDLEY_WARNING(msg) HEDLEY_PRAGMA(message(msg)) #else # define HEDLEY_WARNING(msg) HEDLEY_MESSAGE(msg) #endif +#if defined(HEDLEY_REQUIRE) +# undef HEDLEY_REQUIRE +#endif #if defined(HEDLEY_REQUIRE_MSG) # undef HEDLEY_REQUIRE_MSG #endif #if HEDLEY_HAS_ATTRIBUTE(diagnose_if) # if HEDLEY_HAS_WARNING("-Wgcc-compat") -# define HEDLEY_REQUIRE_MSG(expr, msg) \ - 
HEDLEY_DIAGNOSTIC_PUSH \ - _Pragma("clang diagnostic ignored \"-Wgcc-compat\"") \ - __attribute__((__diagnose_if__(!(expr), msg, "error"))) \ - HEDLEY_DIAGNOSTIC_POP +# define HEDLEY_REQUIRE(expr) \ + HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wgcc-compat\"") \ + __attribute__((diagnose_if(!(expr), #expr, "error"))) \ + HEDLEY_DIAGNOSTIC_POP +# define HEDLEY_REQUIRE_MSG(expr,msg) \ + HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wgcc-compat\"") \ + __attribute__((diagnose_if(!(expr), msg, "error"))) \ + HEDLEY_DIAGNOSTIC_POP # else -# define HEDLEY_REQUIRE_MSG(expr, msg) __attribute__((__diagnose_if__(!(expr), msg, "error"))) +# define HEDLEY_REQUIRE(expr) __attribute__((diagnose_if(!(expr), #expr, "error"))) +# define HEDLEY_REQUIRE_MSG(expr,msg) __attribute__((diagnose_if(!(expr), msg, "error"))) # endif #else -# define HEDLEY_REQUIRE_MSG(expr, msg) +# define HEDLEY_REQUIRE(expr) +# define HEDLEY_REQUIRE_MSG(expr,msg) #endif -#if defined(HEDLEY_REQUIRE) -# undef HEDLEY_REQUIRE -#endif -#define HEDLEY_REQUIRE(expr) HEDLEY_REQUIRE_MSG(expr, #expr) - #if defined(HEDLEY_FLAGS) # undef HEDLEY_FLAGS #endif -#if HEDLEY_HAS_ATTRIBUTE(flag_enum) +#if HEDLEY_HAS_ATTRIBUTE(flag_enum) && (!defined(__cplusplus) || HEDLEY_HAS_WARNING("-Wbitfield-enum-conversion")) # define HEDLEY_FLAGS __attribute__((__flag_enum__)) +#else +# define HEDLEY_FLAGS #endif #if defined(HEDLEY_FLAGS_CAST) @@ -1456,6 +1982,17 @@ HEDLEY_DIAGNOSTIC_POP # define HEDLEY_FLAGS_CAST(T, expr) HEDLEY_STATIC_CAST(T, expr) #endif +#if defined(HEDLEY_EMPTY_BASES) +# undef HEDLEY_EMPTY_BASES +#endif +#if \ + (HEDLEY_MSVC_VERSION_CHECK(19,0,23918) && !HEDLEY_MSVC_VERSION_CHECK(20,0,0)) || \ + HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) +# define HEDLEY_EMPTY_BASES __declspec(empty_bases) +#else +# define HEDLEY_EMPTY_BASES +#endif + /* Remaining macros are deprecated. 
*/ #if defined(HEDLEY_GCC_NOT_CLANG_VERSION_CHECK) From ca8bd0f1c91c110d7dd68b68a43a7480a58710ee Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 22 Aug 2023 15:25:27 +1000 Subject: [PATCH 62/91] Tweak C flags --- binding.gyp | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/binding.gyp b/binding.gyp index 86757b38..cfdf1265 100644 --- a/binding.gyp +++ b/binding.gyp @@ -21,7 +21,7 @@ ] }] ], - "cflags": ["-std=c99", "-D_POSIX_C_SOURCE=200112L", "-D_DARWIN_C_SOURCE", "-D_GNU_SOURCE"], + "cflags_c": ["-std=c99", "-D_POSIX_C_SOURCE=200112L", "-D_DARWIN_C_SOURCE", "-D_GNU_SOURCE"], "cxxflags": ["-std=c++11"], "msvs_settings": {"VCCLCompilerTool": {"Optimization": "MaxSpeed"}}, "configurations": {"Release": { @@ -46,8 +46,8 @@ "cxxflags!": ["-fno-exceptions"], "cflags_cc!": ["-fno-exceptions"], "defines": ["USE_LIBUV"], - "cflags": ["-fexceptions", "-std=c++11"], - "cxxflags": ["-fexceptions"], + "cflags": ["-fexceptions"], + "cxxflags": ["-fexceptions", "-std=c++11"], "cflags_cc": ["-fexceptions"], "xcode_settings": { "OTHER_CFLAGS!": ["-fno-exceptions"], @@ -64,7 +64,7 @@ "defines": ["NDEBUG", "PARPAR_LIBDL_SUPPORT"], "sources": ["gf16/opencl-include/cl.c", "gf16/gfmat_coeff.c"], "include_dirs": ["gf16/opencl-include"], - "cflags": ["-Wno-unused-function", "-std=gnu99"], + "cflags": ["-Wno-unused-function", "-std=c99"], "cflags!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"], "xcode_settings": { "OTHER_CFLAGS": ["-Wno-unused-function"], @@ -77,6 +77,7 @@ "defines": ["NDEBUG"], "sources": ["hasher/hasher.cpp", "hasher/hasher_scalar.cpp"], "dependencies": ["hasher_c"], + "cxxflags": ["-std=c++11"], "cflags!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"], "cxxflags!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"], "xcode_settings": { @@ -90,7 +91,7 @@ "type": "static_library", "defines": ["NDEBUG"], "sources": ["hasher/crc_zeropad.c", "hasher/md5-final.c"], - 
"cflags": ["-Wno-unused-function", "-std=gnu99"], + "cflags": ["-Wno-unused-function", "-std=c99"], "cflags!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"], "xcode_settings": { "OTHER_CFLAGS!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"], @@ -444,7 +445,7 @@ "gf16/gf_add_generic.c", "gf16/gf16_cksum_generic.c" ], - "cflags": ["-Wno-unused-function", "-std=gnu99"], + "cflags": ["-Wno-unused-function", "-std=c99"], "xcode_settings": { "OTHER_CFLAGS": ["-Wno-unused-function"], "OTHER_CFLAGS!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"], @@ -772,7 +773,7 @@ "gf16/gf_add_neon.c", "gf16/gf16_cksum_neon.c" ], - "cflags": ["-Wno-unused-function", "-std=gnu99"], + "cflags": ["-Wno-unused-function", "-std=c99"], "xcode_settings": { "OTHER_CFLAGS": ["-Wno-unused-function"], "OTHER_CFLAGS!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"] @@ -837,7 +838,7 @@ "gf16/gf_add_sve.c", "gf16/gf16_cksum_sve.c" ], - "cflags": ["-Wno-unused-function", "-std=gnu99"], + "cflags": ["-Wno-unused-function", "-std=c99"], "xcode_settings": { "OTHER_CFLAGS": ["-Wno-unused-function"], "OTHER_CFLAGS!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"] @@ -875,7 +876,7 @@ "gf16/gf16_clmul_sve2.c", "gf16/gf_add_sve2.c" ], - "cflags": ["-Wno-unused-function", "-std=gnu99"], + "cflags": ["-Wno-unused-function", "-std=c99"], "xcode_settings": { "OTHER_CFLAGS": ["-Wno-unused-function"], "OTHER_CFLAGS!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"] From 3888616089db3d86692bba837087f69694d1566e Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 22 Aug 2023 16:31:13 +1000 Subject: [PATCH 63/91] Add gf16/hasher test suite --- .github/workflows/test.yml | 243 ++++++++ test/gf16/CMakeLists.txt | 314 ++++++++++ test/gf16/p2c-inv/galois.cpp | 28 + test/gf16/p2c-inv/galois.h | 317 ++++++++++ test/gf16/p2c-inv/reedsolomon.cpp | 253 ++++++++ test/gf16/p2c-inv/reedsolomon.h | 
45 ++ test/gf16/test-ctrl.cpp | 378 ++++++++++++ test/gf16/test-inv.cpp | 188 ++++++ test/gf16/test-pmul.cpp | 106 ++++ test/gf16/test.cpp | 956 ++++++++++++++++++++++++++++++ test/gf16/test.h | 115 ++++ test/hasher/CMakeLists.txt | 154 +++++ test/hasher/test.cpp | 312 ++++++++++ 13 files changed, 3409 insertions(+) create mode 100644 .github/workflows/test.yml create mode 100644 test/gf16/CMakeLists.txt create mode 100644 test/gf16/p2c-inv/galois.cpp create mode 100644 test/gf16/p2c-inv/galois.h create mode 100644 test/gf16/p2c-inv/reedsolomon.cpp create mode 100644 test/gf16/p2c-inv/reedsolomon.h create mode 100644 test/gf16/test-ctrl.cpp create mode 100644 test/gf16/test-inv.cpp create mode 100644 test/gf16/test-pmul.cpp create mode 100644 test/gf16/test.cpp create mode 100644 test/gf16/test.h create mode 100644 test/hasher/CMakeLists.txt create mode 100644 test/hasher/test.cpp diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 00000000..3bb6d5d1 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,243 @@ +name: Test GF16/Hasher +on: + workflow_dispatch: + +jobs: + test-win-x86: + strategy: + fail-fast: false + matrix: + config: [Debug, Release] + compiler: ['v141', 'v142', 'v143', 'ClangCL'] + arch: ['Win32', 'x64'] + name: Test VS ${{ matrix.compiler }} ${{ matrix.arch }} (${{ matrix.config }}) + runs-on: windows-latest + steps: + - uses: ilammy/setup-nasm@v1 + - uses: petarpetrovt/setup-sde@v2.1 + - uses: actions/checkout@v3 + - run: | + mkdir test\gf16\build + cmake -B test\gf16\build -S test\gf16 -G "Visual Studio 16 2019" -T ${{ matrix.compiler }} -A ${{ matrix.arch }} + cmake --build test\gf16\build --config ${{ matrix.config }} + + mkdir test\hasher\build + cmake -B test\hasher\build -S test\hasher -G "Visual Studio 16 2019" -T ${{ matrix.compiler }} -A ${{ matrix.arch }} + cmake --build test\hasher\build --config ${{ matrix.config }} + - run: sde -icx -- test\gf16\build\${{ matrix.config }}\test.exe + - 
run: sde -icx -- test\gf16\build\${{ matrix.config }}\test-pmul.exe + - run: sde -icx -- test\gf16\build\${{ matrix.config }}\test-ctrl.exe -f + if: ${{ matrix.config == 'Release' && (matrix.compiler == 'ClangCL' || matrix.compiler == 'v143') }} + - run: sde -icx -- test\gf16\build\${{ matrix.config }}\test-inv.exe -f + if: ${{ matrix.config == 'Release' && (matrix.compiler == 'ClangCL' || matrix.compiler == 'v143') }} + - run: sde -icx -- test\hasher\build\${{ matrix.config }}\test.exe + + # test SSE2-only to see if CPUID checking works + - run: | + sde -p4 -- test\gf16\build\${{ matrix.config }}\test.exe + sde -p4 -- test\gf16\build\${{ matrix.config }}\test-pmul.exe + sde -p4 -- test\hasher\build\${{ matrix.config }}\test.exe + if: ${{ matrix.config == 'Release' && matrix.arch == 'x64' && matrix.compiler == 'ClangCL' }} + + + # test building only + test-win-arm: + strategy: + fail-fast: false + matrix: + compiler: ['v142', 'v143', 'ClangCL'] + arch: ['ARM', 'ARM64'] + name: Test VS ${{ matrix.compiler }} ${{ matrix.arch }} + runs-on: windows-latest + steps: + - uses: actions/checkout@v3 + - run: | + mkdir test\gf16\build + cmake -B test\gf16\build -S test\gf16 -G "Visual Studio 16 2019" -T ${{ matrix.compiler }} -A ${{ matrix.arch }} + cmake --build test\gf16\build --config Debug + + mkdir test\hasher\build + cmake -B test\hasher\build -S test\hasher -G "Visual Studio 16 2019" -T ${{ matrix.compiler }} -A ${{ matrix.arch }} + cmake --build test\hasher\build --config Debug + + # TODO: test mingw + # https://github.com/msys2/setup-msys2 + # https://www.msys2.org/docs/ci/ + # TODO: test libuv, OpenCL + + test-linux-gcc: + strategy: + fail-fast: false + matrix: + config: [Debug, Release] + # GCC 8 available in 20.04 + cc_ver: ['9','12'] + t: + # qemu x86 doesn't support AVX, so we use Intel SDE instead + - {arch: 'i386', target: 'i686-linux-gnu', libc: 'i386', emu: '$SDE_PATH/sde -icl --'} + - {arch: 'amd64', target: 'x86-64-linux-gnu', libc: 'amd64', emu: 
'$SDE_PATH/sde64 -icl --'} + #- {arch: 'amd64', target: 'x86-64-linux-gnux32', libc: 'x32', emu: 'qemu-x86_64-static -cpu max'} + # TODO: how to test x32? + - {arch: 'aarch64', target: 'aarch64-linux-gnu', libc: 'arm64', emu: 'qemu-aarch64-static -L /usr/aarch64-linux-gnu -cpu max,sve-max-vq=4'} + - {arch: 'arm', target: 'arm-linux-gnueabihf', libc: 'armhf', emu: 'qemu-arm-static -L /usr/arm-linux-gnueabihf -cpu max'} + # RVV unavailable in Ubuntu 22.04's qemu + # TODO: consider using newer qemu + #- {arch: 'riscv64', target: 'riscv64-linux-gnu', libc: 'riscv64', emu: 'qemu-riscv64-static -L /usr/riscv64-linux-gnu -cpu rv64,v=true,vlen=512,elen=64,vext_spec=v1.0,zba=true,zbb=true,zbc=true'} + - {arch: 'ppc64', target: 'powerpc64-linux-gnu', libc: 'ppc64', emu: 'qemu-ppc64-static -L /usr/powerpc64-linux-gnu'} + name: Test Ubuntu GCC ${{ matrix.cc_ver }} ${{ matrix.t.arch }} (${{ matrix.config }}) + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v3 + - run: apt update + - uses: petarpetrovt/setup-sde@v2.1 + if: ${{ matrix.t.arch == 'amd64' || matrix.t.arch == 'i386' }} + - run: apt install -y qemu-user-static + if: ${{ matrix.t.arch != 'amd64' && matrix.t.arch != 'i386' }} + - run: | + apt install -y g++-${{ matrix.cc_ver }}-${{ matrix.t.target }} + echo "CC=${{ matrix.t.target }}-gcc-${{ matrix.cc_ver }}" >> $GITHUB_ENV + echo "CXX=${{ matrix.t.target }}-g++-${{ matrix.cc_ver }}" >> $GITHUB_ENV + if: ${{ matrix.t.arch != 'amd64' }} + - run: | + apt install -y g++-${{ matrix.cc_ver }} + echo "CC=gcc-${{ matrix.cc_ver }}" >> $GITHUB_ENV + echo "CXX=g++-${{ matrix.cc_ver }}" >> $GITHUB_ENV + if: ${{ matrix.t.arch == 'amd64' }} + - run: | + mkdir test/gf16/build + cmake -Btest/gf16/build -Stest/gf16 -DSKIP_AUX=1 -DCMAKE_BUILD_TYPE=${{ matrix.config }} \ + -DCMAKE_SYSTEM_PROCESSOR=${{ matrix.t.arch }} \ + -DCMAKE_SYSTEM_NAME=Linux \ + -DCMAKE_FIND_ROOT_PATH=/usr/${{ matrix.t.target }} \ + -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \ + 
-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \ + -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=ONLY + cmake --build test/gf16/build + + mkdir test/hasher/build + cmake -Btest/hasher/build -Stest/hasher -DSKIP_AUX=1 -DCMAKE_BUILD_TYPE=${{ matrix.config }} \ + -DCMAKE_SYSTEM_PROCESSOR=${{ matrix.t.arch }} \ + -DCMAKE_SYSTEM_NAME=Linux \ + -DCMAKE_FIND_ROOT_PATH=/usr/${{ matrix.t.target }} \ + -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \ + -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \ + -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=ONLY + cmake --build test/hasher/build + - run: ${{ matrix.t.emu }} test/gf16/build/test + - run: ${{ matrix.t.emu }} test/gf16/build/test-pmul + - run: ${{ matrix.t.emu }} test/gf16/build/test-ctrl -f + if: ${{ matrix.config == 'Release' && matrix.cc_ver == '12' }} + - run: ${{ matrix.t.emu }} test/gf16/build/test-inv -f + if: ${{ matrix.config == 'Release' && matrix.cc_ver == '12' }} + - run: ${{ matrix.t.emu }} test/hasher/build/test + + test-linux-clang: + strategy: + fail-fast: false + matrix: + config: ['Debug', 'Release'] + # Clang 6 available in 20.04 + cc_ver: ['11','15'] + t: + - {arch: 'i386', target: 'i686-linux-gnu', cl_target: 'x86-linux-gnu', libc: 'i386', emu: '$SDE_PATH/sde -icl --'} + - {arch: 'amd64', target: 'x86-64-linux-gnu', cl_target: 'x86_64-linux-gnu', libc: 'amd64', emu: '$SDE_PATH/sde64 -icl --'} + #- {arch: 'amd64', target: 'x86-64-linux-gnux32', cl_target: 'x86-64-linux-gnux32', libc: 'x32', emu: 'qemu- -cpu max'} + # TODO: how to test x32? 
+ - {arch: 'aarch64', target: 'aarch64-linux-gnu', cl_target: 'aarch64-linux-gnu', libc: 'arm64', emu: 'qemu-aarch64-static -L /usr/aarch64-linux-gnu -cpu max,sve-max-vq=4'} + - {arch: 'arm', target: 'arm-linux-gnueabihf', cl_target: 'armv7a-linux-gnueabihf', libc: 'armhf', emu: 'qemu-arm-static -L /usr/arm-linux-gnueabihf -cpu max'} + # TODO: can't test ARM BE/RISCV32 without available libc + #- {arch: 'aarch64be', target: 'aarch64_be-linux-gnu', cl_target: 'aarch64_be-linux-gnu', libc: 'arm64be', emu: 'qemu-aarch64_be-static -L /usr/aarch64_be-linux-gnu -cpu max,sve-max-vq=4'} + #- {arch: 'arm_be', target: 'armeb-linux-gnu', cl_target: 'armebv7a-linux-gnu', libc: 'armeb', emu: 'qemu-armeb-static -L /usr/armeb-linux-gnu -cpu max'} + #- {arch: 'riscv32', target: 'riscv32-linux-gnu', cl_target: 'riscv32-linux-gnu', libc: 'riscv32', emu: 'qemu-riscv32-static -L /usr/riscv32-linux-gnu -cpu rv32,v=true,vlen=512,elen=64,vext_spec=v1.0,zba=true,zbb=true,zbc=true'} + # RVV unavailable in Ubuntu 22.04's qemu + #- {arch: 'riscv64', target: 'riscv64-linux-gnu', cl_target: 'riscv64-linux-gnu', libc: 'riscv64', emu: 'qemu-riscv64-static -L /usr/riscv64-linux-gnu -cpu rv64,v=true,vlen=512,elen=64,vext_spec=v1.0,zba=true,zbb=true,zbc=true'} + - {arch: 'ppc64', target: 'powerpc64-linux-gnu', cl_target: 'ppc64-linux-gnu', libc: 'ppc64', emu: 'qemu-ppc64-static -L /usr/powerpc64-linux-gnu'} + name: Test Ubuntu Clang ${{ matrix.cc_ver }} ${{ matrix.t.arch }} (${{ matrix.config }}) + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v3 + - run: apt update && apt install -y clang-${{ matrix.cc_ver }} + - uses: petarpetrovt/setup-sde@v2.1 + if: ${{ matrix.t.arch == 'amd64' || matrix.t.arch == 'i386' }} + - run: apt install -y qemu-user-static + if: ${{ matrix.t.arch != 'amd64' && matrix.t.arch != 'i386' }} + - run: apt install -y binutils-${{ matrix.t.target }} libgcc-12-dev-${{ matrix.t.libc }}-cross libstdc++-12-dev-${{ matrix.t.libc }}-cross + if: ${{ matrix.t.arch != 
'amd64' }} + - run: echo "SANITIZE=-DENABLE_SANITIZE=1" >> $GITHUB_ENV + if: ${{ matrix.config == 'Release' && matrix.t.arch == 'amd64' }} + - run: | + mkdir test/gf16/build + cmake -Btest/gf16/build -Stest/gf16 -DSKIP_AUX=1 -DCMAKE_BUILD_TYPE=${{ matrix.config }} $SANITIZE \ + -DCMAKE_C_COMPILER=clang-${{ matrix.cc_ver }} \ + -DCMAKE_CXX_COMPILER=clang++-${{ matrix.cc_ver }} \ + -DCMAKE_C_COMPILER_TARGET=${{ matrix.t.cl_target }} \ + -DCMAKE_CXX_COMPILER_TARGET=${{ matrix.t.cl_target }} \ + -DCMAKE_SYSTEM_PROCESSOR=${{ matrix.t.arch }} \ + -DCMAKE_SYSTEM_NAME=Linux \ + -DCMAKE_FIND_ROOT_PATH="/usr/${{ matrix.t.target }};/usr/lib/llvm-${{ matrix.cc_ver }}/lib/clang/" \ + -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \ + -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \ + -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=ONLY \ + -DCMAKE_C_STANDARD_INCLUDE_DIRECTORIES=/usr/${{ matrix.t.target }}/include \ + -DCMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES=/usr/${{ matrix.t.target }}/include \ + -DCMAKE_EXE_LINKER_FLAGS=-fuse-ld=/usr/bin/${{ matrix.t.target }}-ld + cmake --build test/gf16/build + + mkdir test/hasher/build + cmake -Btest/hasher/build -Stest/hasher -DSKIP_AUX=1 -DCMAKE_BUILD_TYPE=${{ matrix.config }} $SANITIZE \ + -DCMAKE_C_COMPILER=clang-${{ matrix.cc_ver }} \ + -DCMAKE_CXX_COMPILER=clang++-${{ matrix.cc_ver }} \ + -DCMAKE_C_COMPILER_TARGET=${{ matrix.t.cl_target }} \ + -DCMAKE_CXX_COMPILER_TARGET=${{ matrix.t.cl_target }} \ + -DCMAKE_SYSTEM_PROCESSOR=${{ matrix.t.arch }} \ + -DCMAKE_SYSTEM_NAME=Linux \ + -DCMAKE_FIND_ROOT_PATH="/usr/${{ matrix.t.target }};/usr/lib/llvm-${{ matrix.cc_ver }}/lib/clang/" \ + -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \ + -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \ + -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=ONLY \ + -DCMAKE_C_STANDARD_INCLUDE_DIRECTORIES=/usr/${{ matrix.t.target }}/include \ + -DCMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES=/usr/${{ matrix.t.target }}/include \ + -DCMAKE_EXE_LINKER_FLAGS=-fuse-ld=/usr/bin/${{ matrix.t.target }}-ld + cmake --build 
test/hasher/build + - run: ${{ matrix.t.emu }} test/gf16/build/test + - run: ${{ matrix.t.emu }} test/gf16/build/test-pmul + - run: ${{ matrix.t.emu }} test/gf16/build/test-ctrl -f + if: ${{ matrix.config == 'Release' && matrix.cc_ver == '15' }} + - run: ${{ matrix.t.emu }} test/gf16/build/test-inv -f + if: ${{ matrix.config == 'Release' && matrix.cc_ver == '15' }} + - run: ${{ matrix.t.emu }} test/hasher/build/test + + + test-mac-x86: + strategy: + fail-fast: false + matrix: + config: ['Debug', 'Release'] + compiler: + - {cc: 'gcc-12', cxx: 'g++-12'} + - {cc: 'clang', cxx: 'clang++'} + name: Test MacOS ${{ matrix.compiler.cc }} (${{ matrix.config }}) + runs-on: macos-latest + steps: + - uses: actions/checkout@v3 + - run: | + mkdir test/gf16/build + cmake -Btest/gf16/build -Stest/gf16 -DCMAKE_BUILD_TYPE=${{ matrix.config }} -DCMAKE_C_COMPILER=${{ matrix.compiler.cc }} -DCMAKE_CXX_COMPILER=${{ matrix.compiler.cxx }} + cmake --build test/gf16/build + + mkdir test/hasher/build + cmake -Btest/hasher/build -Stest/hasher -DCMAKE_BUILD_TYPE=${{ matrix.config }} -DCMAKE_C_COMPILER=${{ matrix.compiler.cc }} -DCMAKE_CXX_COMPILER=${{ matrix.compiler.cxx }} + cmake --build test/hasher/build + - run: test/gf16/build/test + - run: test/gf16/build/test-pmul + - run: test/gf16/build/test-ctrl -f + if: ${{ matrix.config == 'Release' && matrix.compiler.cc == 'clang' }} + - run: test/gf16/build/test-inv -f + if: ${{ matrix.config == 'Release' && matrix.compiler.cc == 'clang' }} + - run: test/hasher/build/test + + # TODO: test building on Mac ARM64? might not be necessary, given we build it in par2cmdline-turbo + + # TODO: BSD? 
+ # https://github.com/marketplace/actions/freebsd-vm + # https://github.com/vmactions diff --git a/test/gf16/CMakeLists.txt b/test/gf16/CMakeLists.txt new file mode 100644 index 00000000..37f7b578 --- /dev/null +++ b/test/gf16/CMakeLists.txt @@ -0,0 +1,314 @@ +cmake_minimum_required(VERSION 2.8.9...3.22) +project(gf16_test) + +option(USE_LIBUV "Use libuv interface with callbacks, instead of C++11 threading + futures" OFF) +option(ENABLE_OCL "Enable OpenCL" OFF) +option(SKIP_AUX "Bypass getauxval checks (for testing purposes)" OFF) +option(ENABLE_SANITIZE "Enable sanitizers" OFF) + +include(CheckCXXCompilerFlag) +include(CheckIncludeFileCXX) +include(CheckCXXSymbolExists) + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_C_STANDARD 99) + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Debug) +endif() +if(NOT TARGET_ARCH) + if(CMAKE_GENERATOR_PLATFORM) + set(TARGET_ARCH ${CMAKE_GENERATOR_PLATFORM}) + else() + set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR}) + endif() +endif() + +message("Building for ${TARGET_ARCH}") +if (${TARGET_ARCH} MATCHES "i386|i686|x86|x86_64|x64|amd64|AMD64|win32|Win32") + set(IS_X86 TRUE) + if(${TARGET_ARCH} MATCHES "x86_64|x64|amd64|AMD64") + set(IS_X64 TRUE) + endif() +endif() +if (${TARGET_ARCH} MATCHES "arm|ARM|aarch64|arm64|ARM64") + set(IS_ARM TRUE) +endif() +if (${TARGET_ARCH} MATCHES "riscv64|rv64") + set(IS_RISCV64 TRUE) +endif() +if (${TARGET_ARCH} MATCHES "riscv32|rv32") + set(IS_RISCV32 TRUE) +endif() + +if(ENABLE_OCL) + add_compile_definitions(ENABLE_OCL=1) + add_compile_definitions(PARPAR_LIBDL_SUPPORT=1) +endif() +if(USE_LIBUV) + add_compile_definitions(USE_LIBUV=1) +endif() +if(SKIP_AUX) + add_compile_definitions(PARPAR_SKIP_AUX_CHECK=1) +endif() + +set(GF16_DIR ../../gf16) +set(SRC_DIR ../../src) +set(GF16_C_SOURCES + ${GF16_DIR}/gf_add_avx2.c + ${GF16_DIR}/gf_add_avx512.c + ${GF16_DIR}/gf_add_generic.c + ${GF16_DIR}/gf_add_neon.c + ${GF16_DIR}/gf_add_rvv.c + ${GF16_DIR}/gf_add_sse2.c + ${GF16_DIR}/gf_add_sve.c + 
${GF16_DIR}/gf_add_sve2.c + ${GF16_DIR}/gf16_affine_avx2.c + ${GF16_DIR}/gf16_affine_avx512.c + ${GF16_DIR}/gf16_affine_gfni.c + ${GF16_DIR}/gf16_cksum_avx2.c + ${GF16_DIR}/gf16_cksum_avx512.c + ${GF16_DIR}/gf16_cksum_generic.c + ${GF16_DIR}/gf16_cksum_neon.c + ${GF16_DIR}/gf16_cksum_rvv.c + ${GF16_DIR}/gf16_cksum_sse2.c + ${GF16_DIR}/gf16_cksum_sve.c + ${GF16_DIR}/gf16_clmul_neon.c + ${GF16_DIR}/gf16_clmul_sha3.c + ${GF16_DIR}/gf16_clmul_sve2.c + ${GF16_DIR}/gf16_lookup.c + ${GF16_DIR}/gf16_lookup_sse2.c + ${GF16_DIR}/gf16_shuffle_avx.c + ${GF16_DIR}/gf16_shuffle_avx2.c + ${GF16_DIR}/gf16_shuffle_avx512.c + ${GF16_DIR}/gf16_shuffle_neon.c + ${GF16_DIR}/gf16_shuffle_ssse3.c + ${GF16_DIR}/gf16_shuffle_vbmi.c + ${GF16_DIR}/gf16_shuffle2x128_sve2.c + ${GF16_DIR}/gf16_shuffle128_rvv.c + ${GF16_DIR}/gf16_shuffle128_sve.c + ${GF16_DIR}/gf16_shuffle128_sve2.c + ${GF16_DIR}/gf16_shuffle512_sve2.c + ${GF16_DIR}/gf16_xor_avx2.c + ${GF16_DIR}/gf16_xor_avx512.c + ${GF16_DIR}/gf16_xor_sse2.c + ${GF16_DIR}/gfmat_coeff.c + + ${GF16_DIR}/opencl-include/cl.c + ${SRC_DIR}/platform_warnings.c + + + ${GF16_DIR}/gf16pmul_avx2.c + ${GF16_DIR}/gf16pmul_neon.c + ${GF16_DIR}/gf16pmul_sse.c + ${GF16_DIR}/gf16pmul_sve2.c + ${GF16_DIR}/gf16pmul_vpclgfni.c + ${GF16_DIR}/gf16pmul_vpclmul.c +) + +if(MSVC AND IS_X64) + ENABLE_LANGUAGE(ASM_MASM) + set(GF16_C_SOURCES ${GF16_C_SOURCES} ${GF16_DIR}/xor_jit_stub_masm64.asm) +endif() + +set(GF16_CPP_SOURCES + ${GF16_DIR}/controller.cpp + ${GF16_DIR}/controller_cpu.cpp + ${GF16_DIR}/controller_ocl.cpp + ${GF16_DIR}/controller_ocl_init.cpp + ${GF16_DIR}/gf16mul.cpp + + + ${GF16_DIR}/gf16pmul.cpp + ${GF16_DIR}/gfmat_inv.cpp +) + +include_directories(${GF16_DIR}/opencl-include ${GF16_DIR}) + +if(MSVC) + set(RELEASE_COMPILE_FLAGS /GS- /Gy /sdl- /Oy /Oi) + set(RELEASE_LINK_FLAGS /OPT:REF /OPT:ICF) + add_compile_options(/W2 "$<$>:${RELEASE_COMPILE_FLAGS}>") + add_link_options("$<$>:${RELEASE_LINK_FLAGS}>") +else() + # TODO: consider -Werror + 
add_compile_options(-Wall -Wextra -Wno-unused-function) + if(${CMAKE_BUILD_TYPE} MATCHES "Debug") + add_compile_options(-ggdb) + else() + if(NOT ENABLE_SANITIZE) + add_compile_options(-fomit-frame-pointer) + endif() + endif() + + if(ENABLE_SANITIZE) + set(SANITIZE_OPTS -fsanitize=address -fsanitize=bool,builtin,bounds,enum,float-cast-overflow,function,integer-divide-by-zero,nonnull-attribute,null,object-size,return,returns-nonnull-attribute,shift,signed-integer-overflow,unreachable,vla-bound) + # -fsanitize=pointer-overflow causes compilation of shuffle_avx512 to freeze on clang10 + # -fsanitize=memory requires instrumented libraries, so not useful + add_compile_options(-fno-omit-frame-pointer ${SANITIZE_OPTS}) + add_link_options(${SANITIZE_OPTS}) + endif() + + #if(ENABLE_OCL) + # add_compile_options(-fexceptions) + #else() + # add_compile_options(-fno-exceptions) + #endif() +endif() + +add_compile_definitions(PARPAR_INVERT_SUPPORT=1) +add_library(gf16_c STATIC ${GF16_C_SOURCES}) +add_library(gf16_ctl STATIC ${GF16_CPP_SOURCES}) +target_link_libraries(gf16_ctl gf16_c) + +if(NOT MSVC) + if(NOT ENABLE_SANITIZE) + target_compile_options(gf16_ctl PRIVATE -fno-rtti) + endif() + target_compile_definitions(gf16_c PRIVATE _POSIX_C_SOURCE=200112L) + target_compile_definitions(gf16_c PRIVATE _DARWIN_C_SOURCE=) + target_compile_definitions(gf16_c PRIVATE _GNU_SOURCE=) + + if(ENABLE_SANITIZE) + # not supported on all platforms? 
+ #target_compile_options(gf16_ctl PRIVATE -fsanitize=thread) + endif() +endif() + +if(MSVC) + if(IS_X86) + set_source_files_properties(${GF16_DIR}/gf_add_avx2.c PROPERTIES COMPILE_OPTIONS /arch:AVX2) + set_source_files_properties(${GF16_DIR}/gf_add_avx512.c PROPERTIES COMPILE_OPTIONS /arch:AVX512) + set_source_files_properties(${GF16_DIR}/gf16_affine_avx2.c PROPERTIES COMPILE_OPTIONS /arch:AVX2) + set_source_files_properties(${GF16_DIR}/gf16_affine_avx512.c PROPERTIES COMPILE_OPTIONS /arch:AVX512) + set_source_files_properties(${GF16_DIR}/gf16_cksum_avx2.c PROPERTIES COMPILE_OPTIONS /arch:AVX2) + set_source_files_properties(${GF16_DIR}/gf16_cksum_avx512.c PROPERTIES COMPILE_OPTIONS /arch:AVX512) + set_source_files_properties(${GF16_DIR}/gf16_shuffle_avx.c PROPERTIES COMPILE_OPTIONS /arch:AVX) + set_source_files_properties(${GF16_DIR}/gf16_shuffle_avx2.c PROPERTIES COMPILE_OPTIONS /arch:AVX2) + set_source_files_properties(${GF16_DIR}/gf16_shuffle_avx512.c PROPERTIES COMPILE_OPTIONS /arch:AVX512) + set_source_files_properties(${GF16_DIR}/gf16_shuffle_vbmi.c PROPERTIES COMPILE_OPTIONS /arch:AVX512) + set_source_files_properties(${GF16_DIR}/gf16_xor_avx2.c PROPERTIES COMPILE_OPTIONS /arch:AVX2) + set_source_files_properties(${GF16_DIR}/gf16_xor_avx512.c PROPERTIES COMPILE_OPTIONS /arch:AVX512) + set_source_files_properties(${GF16_DIR}/gf16pmul_avx2.c PROPERTIES COMPILE_OPTIONS /arch:AVX2) + set_source_files_properties(${GF16_DIR}/gf16pmul_vpclgfni.c PROPERTIES COMPILE_OPTIONS /arch:AVX2) + set_source_files_properties(${GF16_DIR}/gf16pmul_vpclmul.c PROPERTIES COMPILE_OPTIONS /arch:AVX2) + endif() +endif() +if(NOT MSVC OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + if(IS_X86) + set_source_files_properties(${GF16_DIR}/gf_add_avx2.c PROPERTIES COMPILE_OPTIONS -mavx2) + set_source_files_properties(${GF16_DIR}/gf_add_avx512.c PROPERTIES COMPILE_OPTIONS "-mavx512vl;-mavx512bw") + set_source_files_properties(${GF16_DIR}/gf_add_sse2.c PROPERTIES COMPILE_OPTIONS -msse2) + 
set_source_files_properties(${GF16_DIR}/gf16_cksum_avx2.c PROPERTIES COMPILE_OPTIONS -mavx2) + set_source_files_properties(${GF16_DIR}/gf16_cksum_avx512.c PROPERTIES COMPILE_OPTIONS "-mavx512vl;-mavx512bw") + set_source_files_properties(${GF16_DIR}/gf16_cksum_sse2.c PROPERTIES COMPILE_OPTIONS -msse2) + set_source_files_properties(${GF16_DIR}/gf16_lookup_sse2.c PROPERTIES COMPILE_OPTIONS -msse2) + set_source_files_properties(${GF16_DIR}/gf16_shuffle_avx.c PROPERTIES COMPILE_OPTIONS -mavx) + set_source_files_properties(${GF16_DIR}/gf16_shuffle_avx2.c PROPERTIES COMPILE_OPTIONS -mavx2) + set_source_files_properties(${GF16_DIR}/gf16_shuffle_avx512.c PROPERTIES COMPILE_OPTIONS "-mavx512vl;-mavx512bw") + set_source_files_properties(${GF16_DIR}/gf16_shuffle_ssse3.c PROPERTIES COMPILE_OPTIONS -mssse3) + set_source_files_properties(${GF16_DIR}/gf16_xor_avx2.c PROPERTIES COMPILE_OPTIONS -mavx2) + set_source_files_properties(${GF16_DIR}/gf16_xor_avx512.c PROPERTIES COMPILE_OPTIONS "-mavx512vl;-mavx512bw") + set_source_files_properties(${GF16_DIR}/gf16_xor_sse2.c PROPERTIES COMPILE_OPTIONS -msse2) + set_source_files_properties(${GF16_DIR}/gf16pmul_avx2.c PROPERTIES COMPILE_OPTIONS "-mavx2;-mpclmul") + set_source_files_properties(${GF16_DIR}/gf16pmul_sse.c PROPERTIES COMPILE_OPTIONS "-msse4.1;-mpclmul") + + CHECK_CXX_COMPILER_FLAG("-mavx512vl -mavx512bw -mavx512vbmi" COMPILER_SUPPORTS_VBMI) + if(COMPILER_SUPPORTS_VBMI) + set_source_files_properties(${GF16_DIR}/gf16_shuffle_vbmi.c PROPERTIES COMPILE_OPTIONS "-mavx512vl;-mavx512bw;-mavx512vbmi") + endif() + CHECK_CXX_COMPILER_FLAG("-mgfni" COMPILER_SUPPORTS_GFNI) + if(COMPILER_SUPPORTS_GFNI) + set_source_files_properties(${GF16_DIR}/gf16_affine_avx2.c PROPERTIES COMPILE_OPTIONS "-mavx2;-mgfni") + set_source_files_properties(${GF16_DIR}/gf16_affine_avx512.c PROPERTIES COMPILE_OPTIONS "-mavx512vl;-mavx512bw;-mgfni") + set_source_files_properties(${GF16_DIR}/gf16_affine_gfni.c PROPERTIES COMPILE_OPTIONS "-mssse3;-mgfni") + + 
set_source_files_properties(${SRC_DIR}/platform_warnings.c PROPERTIES COMPILE_OPTIONS "-mavx2;-mgfni") + endif() + + CHECK_CXX_COMPILER_FLAG("-mvpclmulqdq" COMPILER_SUPPORTS_VPCLMULQDQ) + if(COMPILER_SUPPORTS_VPCLMULQDQ) + set_source_files_properties(${GF16_DIR}/gf16pmul_vpclmul.c PROPERTIES COMPILE_OPTIONS "-mavx2;-mvpclmulqdq") + endif() + if(COMPILER_SUPPORTS_VPCLMULQDQ AND COMPILER_SUPPORTS_GFNI) + set_source_files_properties(${GF16_DIR}/gf16pmul_vpclgfni.c PROPERTIES COMPILE_OPTIONS "-mavx2;-mvpclmulqdq;-mgfni") + endif() + endif() + + if(IS_ARM AND NOT APPLE) # M1 Macs don't seem to need these ARM options + CHECK_CXX_COMPILER_FLAG("-mfpu=neon -march=armv7-a" COMPILER_SUPPORTS_ARM32_NEON) + if(COMPILER_SUPPORTS_ARM32_NEON) + set_source_files_properties(${GF16_DIR}/gf_add_neon.c PROPERTIES COMPILE_OPTIONS "-mfpu=neon;-march=armv7-a") + set_source_files_properties(${GF16_DIR}/gf16_cksum_neon.c PROPERTIES COMPILE_OPTIONS "-mfpu=neon;-march=armv7-a") + set_source_files_properties(${GF16_DIR}/gf16_clmul_neon.c PROPERTIES COMPILE_OPTIONS "-mfpu=neon;-march=armv7-a") + set_source_files_properties(${GF16_DIR}/gf16_shuffle_neon.c PROPERTIES COMPILE_OPTIONS "-mfpu=neon;-march=armv7-a") + set_source_files_properties(${GF16_DIR}/gf16pmul_neon.c PROPERTIES COMPILE_OPTIONS "-mfpu=neon;-march=armv7-a") + endif() + CHECK_CXX_COMPILER_FLAG("-march=armv8.2-a+sha3" COMPILER_SUPPORTS_SHA3) + if(COMPILER_SUPPORTS_SHA3) + set_source_files_properties(${GF16_DIR}/gf16_clmul_sha3.c PROPERTIES COMPILE_OPTIONS -march=armv8.2-a+sha3) + endif() + + CHECK_CXX_COMPILER_FLAG("-march=armv8-a+sve" COMPILER_SUPPORTS_SVE) + if(COMPILER_SUPPORTS_SVE) + set_source_files_properties(${GF16_DIR}/gf_add_sve.c PROPERTIES COMPILE_OPTIONS -march=armv8-a+sve) + set_source_files_properties(${GF16_DIR}/gf16_cksum_sve.c PROPERTIES COMPILE_OPTIONS -march=armv8-a+sve) + set_source_files_properties(${GF16_DIR}/gf16_shuffle128_sve.c PROPERTIES COMPILE_OPTIONS -march=armv8-a+sve) + endif() + + 
CHECK_CXX_COMPILER_FLAG("-march=armv8-a+sve2" COMPILER_SUPPORTS_SVE2) + if(COMPILER_SUPPORTS_SVE2) + set_source_files_properties(${GF16_DIR}/gf_add_sve2.c PROPERTIES COMPILE_OPTIONS -march=armv8-a+sve2) + set_source_files_properties(${GF16_DIR}/gf16_clmul_sve2.c PROPERTIES COMPILE_OPTIONS -march=armv8-a+sve2) + set_source_files_properties(${GF16_DIR}/gf16_shuffle2x128_sve2.c PROPERTIES COMPILE_OPTIONS -march=armv8-a+sve2) + set_source_files_properties(${GF16_DIR}/gf16_shuffle128_sve2.c PROPERTIES COMPILE_OPTIONS -march=armv8-a+sve2) + set_source_files_properties(${GF16_DIR}/gf16_shuffle512_sve2.c PROPERTIES COMPILE_OPTIONS -march=armv8-a+sve2) + set_source_files_properties(${GF16_DIR}/gf16pmul_sve2.c PROPERTIES COMPILE_OPTIONS -march=armv8-a+sve2) + endif() + endif() + + if(IS_RISCV64) + CHECK_CXX_COMPILER_FLAG("-march=rv64gcv" COMPILER_SUPPORTS_RVV) + if(COMPILER_SUPPORTS_RVV) + set_source_files_properties(${GF16_DIR}/gf_add_rvv.c PROPERTIES COMPILE_OPTIONS -march=rv64gcv) + set_source_files_properties(${GF16_DIR}/gf16_cksum_rvv.c PROPERTIES COMPILE_OPTIONS -march=rv64gcv) + set_source_files_properties(${GF16_DIR}/gf16_shuffle128_rvv.c PROPERTIES COMPILE_OPTIONS -march=rv64gcv) + endif() + endif() + if(IS_RISCV32) + CHECK_CXX_COMPILER_FLAG("-march=rv32gcv" COMPILER_SUPPORTS_RVV) + if(COMPILER_SUPPORTS_RVV) + set_source_files_properties(${GF16_DIR}/gf_add_rvv.c PROPERTIES COMPILE_OPTIONS -march=rv32gcv) + set_source_files_properties(${GF16_DIR}/gf16_cksum_rvv.c PROPERTIES COMPILE_OPTIONS -march=rv32gcv) + set_source_files_properties(${GF16_DIR}/gf16_shuffle128_rvv.c PROPERTIES COMPILE_OPTIONS -march=rv32gcv) + endif() + endif() +endif() + + + + +# binaries +set(TEST_DIR .) 
+add_executable(test ${TEST_DIR}/test.cpp) +target_link_libraries(test gf16_ctl) +add_executable(test-ctrl ${TEST_DIR}/test-ctrl.cpp) +target_link_libraries(test-ctrl gf16_ctl) +add_executable(test-inv ${TEST_DIR}/test-inv.cpp ${TEST_DIR}/p2c-inv/reedsolomon.cpp) +target_link_libraries(test-inv gf16_ctl) +add_executable(test-pmul ${TEST_DIR}/test-pmul.cpp) +target_link_libraries(test-pmul gf16_ctl) + +if(NOT MSVC) + target_link_libraries(test-ctrl -pthread) + target_link_libraries(test-inv -pthread) + + if(ENABLE_OCL) + target_link_libraries(test-ctrl dl) + endif() +endif() + +if(USE_LIBUV) + target_link_libraries(test-ctrl uv) + target_link_libraries(test-inv uv) +endif() diff --git a/test/gf16/p2c-inv/galois.cpp b/test/gf16/p2c-inv/galois.cpp new file mode 100644 index 00000000..a1a39030 --- /dev/null +++ b/test/gf16/p2c-inv/galois.cpp @@ -0,0 +1,28 @@ +// This file is part of par2cmdline (a PAR 2.0 compatible file verification and +// repair tool). See http://parchive.sourceforge.net for details of PAR 2.0. +// +// Copyright (c) 2003 Peter Brian Clements +// +// par2cmdline is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// par2cmdline is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +#include "libpar2internal.h" + +#ifdef _MSC_VER +#ifdef _DEBUG +#undef THIS_FILE +static char THIS_FILE[]=__FILE__; +#define new DEBUG_NEW +#endif +#endif diff --git a/test/gf16/p2c-inv/galois.h b/test/gf16/p2c-inv/galois.h new file mode 100644 index 00000000..7671c98b --- /dev/null +++ b/test/gf16/p2c-inv/galois.h @@ -0,0 +1,317 @@ +// This file is part of par2cmdline (a PAR 2.0 compatible file verification and +// repair tool). See http://parchive.sourceforge.net for details of PAR 2.0. +// +// Copyright (c) 2003 Peter Brian Clements +// +// par2cmdline is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// par2cmdline is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +#ifndef __GALOIS_H__ +#define __GALOIS_H__ + +#include + +template class GaloisTable; +template class Galois; + +template class GaloisLongMultiplyTable; + +// This source file defines the Galois object for carrying out +// arithmetic in GF(2^16) using the generator 0x1100B. 
+ +// Also defined are the GaloisTable object (which contains log and +// anti log tables for use in multiplication and division), and +// the GaloisLongMultiplyTable object (which contains tables for +// carrying out multiplation of 16-bit galois numbers 8 bits at a time). + +template +class GaloisTable +{ +public: + typedef valuetype ValueType; + + GaloisTable(void); + + enum + { + Bits = bits, + Count = 1< +class Galois +{ +public: + typedef valuetype ValueType; + + // Basic constructors + Galois(void) {}; + Galois(ValueType v); + + // Copy and assignment + Galois(const Galois &right) {value = right.value;} + Galois& operator = (const Galois &right) { value = right.value; return *this;} + + // Addition + Galois operator + (const Galois &right) const { return (value ^ right.value); } + Galois& operator += (const Galois &right) { value ^= right.value; return *this;} + + // Subtraction + Galois operator - (const Galois &right) const { return (value ^ right.value); } + Galois& operator -= (const Galois &right) { value ^= right.value; return *this;} + + // Multiplication + Galois operator * (const Galois &right) const; + Galois& operator *= (const Galois &right); + + // Division + Galois operator / (const Galois &right) const; + Galois& operator /= (const Galois &right); + + // Power + Galois pow(unsigned int right) const; + Galois operator ^ (unsigned int right) const; + Galois& operator ^= (unsigned int right); + + // Cast to value and value access + operator ValueType(void) const {return value;} + ValueType Value(void) const {return value;} + + // Direct log and antilog + ValueType Log(void) const; + ValueType ALog(void) const; + + enum + { + Bits = GaloisTable::Bits, + Count = GaloisTable::Count, + Limit = GaloisTable::Limit, + }; + +protected: + ValueType value; + + static GaloisTable table; +}; + +#ifdef LONGMULTIPLY +template +class GaloisLongMultiplyTable +{ +public: + GaloisLongMultiplyTable(void); + + typedef g G; + + enum + { + Bytes = ((G::Bits + 7) >> 
3), + Count = ((Bytes * (Bytes+1)) / 2), + }; + + G tables[Count * 256 * 256]; +}; +#endif + +// Construct the log and antilog tables from the generator + +template +inline GaloisTable::GaloisTable(void) +{ + u32 b = 1; + + for (u32 l=0; l +GaloisTable Galois::table; + + +template +inline Galois::Galois(typename Galois::ValueType v) +{ + value = v; +} + +template +inline Galois Galois::operator * (const Galois &right) const +{ + if (value == 0 || right.value == 0) return 0; + unsigned int sum = table.log[value] + table.log[right.value]; + if (sum >= Limit) + { + return table.antilog[sum-Limit]; + } + else + { + return table.antilog[sum]; + } +} + +template +inline Galois& Galois::operator *= (const Galois &right) +{ + if (value == 0 || right.value == 0) + { + value = 0; + } + else + { + unsigned int sum = table.log[value] + table.log[right.value]; + if (sum >= Limit) + { + value = table.antilog[sum-Limit]; + } + else + { + value = table.antilog[sum]; + } + } + + return *this; +} + +template +inline Galois Galois::operator / (const Galois &right) const +{ + if (value == 0) return 0; + + assert(right.value != 0); + if (right.value == 0) {return 0;} // Division by 0! + + int sum = table.log[value] - table.log[right.value]; + if (sum < 0) + { + return table.antilog[sum+Limit]; + } + else + { + return table.antilog[sum]; + } +} + +template +inline Galois& Galois::operator /= (const Galois &right) +{ + if (value == 0) return *this; + + assert(right.value != 0); + if (right.value == 0) {return *this;} // Division by 0! 
+ + int sum = table.log[value] - table.log[right.value]; + if (sum < 0) + { + value = table.antilog[sum+Limit]; + } + else + { + value = table.antilog[sum]; + } + + return *this; +} + +template +inline Galois Galois::pow(unsigned int right) const +{ + if (right == 0) return 1; + if (value == 0) return 0; + + unsigned int sum = table.log[value] * right; + + sum = (sum >> Bits) + (sum & Limit); + if (sum >= Limit) + { + return table.antilog[sum-Limit]; + } + else + { + return table.antilog[sum]; + } +} + +template +inline Galois Galois::operator ^ (unsigned int right) const +{ + if (right == 0) return 1; + if (value == 0) return 0; + + unsigned int sum = table.log[value] * right; + + sum = (sum >> Bits) + (sum & Limit); + if (sum >= Limit) + { + return table.antilog[sum-Limit]; + } + else + { + return table.antilog[sum]; + } +} + +template +inline Galois& Galois::operator ^= (unsigned int right) +{ + if (right == 0) {value = 1; return *this;} + if (value == 0) return *this; + + unsigned int sum = table.log[value] * right; + + sum = (sum >> Bits) + (sum & Limit); + if (sum >= Limit) + { + value = table.antilog[sum-Limit]; + } + else + { + value = table.antilog[sum]; + } + + return *this; +} + +template +inline valuetype Galois::Log(void) const +{ + return table.log[value]; +} + +template +inline valuetype Galois::ALog(void) const +{ + return table.antilog[value]; +} + +typedef Galois<16,0x1100B,u16> Galois16; + +#endif // __GALOIS_H__ diff --git a/test/gf16/p2c-inv/reedsolomon.cpp b/test/gf16/p2c-inv/reedsolomon.cpp new file mode 100644 index 00000000..17e41dec --- /dev/null +++ b/test/gf16/p2c-inv/reedsolomon.cpp @@ -0,0 +1,253 @@ +#include "reedsolomon.h" +#include +using namespace std; + + +static u32 gcd(u32 a, u32 b) +{ + if (a && b) + { + while (a && b) + { + if (a>b) + { + a = a%b; + } + else + { + b = b%a; + } + } + + return a+b; + } + else + { + return 0; + } +} + + +inline bool ReedSolomon_GaussElim(unsigned int rows, unsigned int leftcols, Galois16 
*leftmatrix, Galois16 *rightmatrix, unsigned int datamissing) +{ + // Because the matrices being operated on are Vandermonde matrices + // they are guaranteed not to be singular. + + // Additionally, because Galois arithmetic is being used, all calculations + // involve exact values with no loss of precision. It is therefore + // not necessary to carry out any row or column swapping. + + // Solve one row at a time + + // For each row in the matrix + for (unsigned int row=0; row &present, vector outputrows, Galois16*& leftmatrix) +{ + // SetInput + u32 inputcount = (u32)present.size(); + + u32* datapresentindex = new u32[inputcount]; + u32* datamissingindex = new u32[inputcount]; + Galois16::ValueType* database = new Galois16::ValueType[inputcount]; + u32 datapresent = 0, datamissing = 0; + + unsigned int logbase = 0; + + for (unsigned int index=0; index= Galois16::Limit) + { + return false; + } + Galois16::ValueType base = Galois16(logbase++).ALog(); + + database[index] = base; + } + + + + + // Compute + u32 outcount = datamissing; + u32 incount = datapresent + datamissing; + + if (datamissing > outputrows.size()) return false; + if (outcount == 0) + { + return false; + } + + // Allocate the left hand matrix + + leftmatrix = new Galois16[outcount * incount]; + for (unsigned int index=0; index < outcount * incount; index++) + leftmatrix[index] = 0; + + // Allocate the right hand matrix only if we are recovering + + Galois16 *rightmatrix = 0; + if (datamissing > 0) + { + rightmatrix = new Galois16[outcount * outcount]; + for (unsigned int index=0; index < outcount * outcount; index++) + rightmatrix[index] = 0; + } + + // Fill in the two matrices: + + vector::const_iterator outputrow = outputrows.begin(); + + // One row for each present recovery block that will be used for a missing data block + for (unsigned int row=0; rowpresent) + { + outputrow++; + } + u16 exponent = outputrow->exponent; + + // One column for each present data block + for (unsigned int col=0; col 
0) + { + // One column for each missing data block + for (unsigned int col=0; col 0) + { + // Perform Gaussian Elimination and then delete the right matrix (which + // will no longer be required). + bool success = ReedSolomon_GaussElim(outcount, incount, leftmatrix, rightmatrix, datamissing); + delete [] rightmatrix; + return success; + } + + return true; +} + +// Use Gaussian Elimination to solve the matrices + diff --git a/test/gf16/p2c-inv/reedsolomon.h b/test/gf16/p2c-inv/reedsolomon.h new file mode 100644 index 00000000..ba4aaab7 --- /dev/null +++ b/test/gf16/p2c-inv/reedsolomon.h @@ -0,0 +1,45 @@ +// This file is part of par2cmdline (a PAR 2.0 compatible file verification and +// repair tool). See http://parchive.sourceforge.net for details of PAR 2.0. +// +// Copyright (c) 2003 Peter Brian Clements +// Copyright (c) 2019 Michael D. Nahas +// +// par2cmdline is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// par2cmdline is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +#ifndef __REEDSOLOMON_H__ +#define __REEDSOLOMON_H__ + +#include +#include +typedef uint16_t u16; +typedef uint32_t u32; + +#include "galois.h" + +class RSOutputRow +{ +public: + RSOutputRow(void) {}; + RSOutputRow(bool _present, u16 _exponent) : present(_present), exponent(_exponent) {} + +public: + bool present; + u16 exponent; +}; + + +bool ReedSolomon_Compute(const std::vector &present, std::vector outputrows, Galois16*& leftmatrix); + +#endif // __REEDSOLOMON_H__ diff --git a/test/gf16/test-ctrl.cpp b/test/gf16/test-ctrl.cpp new file mode 100644 index 00000000..cd9d9193 --- /dev/null +++ b/test/gf16/test-ctrl.cpp @@ -0,0 +1,378 @@ +#if defined(_MSC_VER) && !defined(NDEBUG) +#define _CRTDBG_MAP_ALLOC +#include +#include +#endif + +#define NOMINMAX + +#include "controller.h" +#include "controller_cpu.h" +#include "controller_ocl.h" +#include "gfmat_coeff.h" +#include +#include +#include +#include +#include +#include +#include "test.h" + +const int MAX_TEST_REGIONS = 20; +const int MAX_TEST_OUTPUTS = 20; +const int REGION_SIZE = 20000; + + + +// globals +uint16_t* src[MAX_TEST_REGIONS]; +uint16_t* dst[MAX_TEST_OUTPUTS]; +uint16_t* ref[MAX_TEST_OUTPUTS]; +uint16_t inputIndicies[MAX_TEST_REGIONS]; +uint16_t outputIndicies[MAX_TEST_OUTPUTS*2]; +#ifdef USE_LIBUV +uv_loop_t *loop; +#endif + + +struct testProps { + size_t sliceSize, lastSliceSize; + unsigned numInputs, numOutputs; + Galois16Methods cpuMethod; + int cpuThreads; + Galois16OCLMethods oclMethod; + bool useCpu, useOcl; + + void print(const char* label) const { + std::cout << label << "(" << numInputs << "x" << numOutputs << ", sliceSize " << sliceSize << ", lastSliceSize " << lastSliceSize; + if(useCpu && !useOcl) + std::cout << ", method " << PAR2ProcCPU::info(cpuMethod).name << ", threads " 
<< cpuThreads; + if(!useCpu && useOcl) + std::cout << ", method " << PAR2ProcOCL::methodToText(oclMethod); + std::cout << ")"; + } +}; + +static void run_test(struct testProps test IF_LIBUV(, std::function cb)) { + auto* par2 = new PAR2Proc(); + PAR2ProcCPU* par2cpu = nullptr; + PAR2ProcOCL* par2ocl = nullptr; + + if(test.useCpu && test.useOcl && test.sliceSize < 3) + test.useOcl = false; // not enable space to split + if(test.useCpu) par2cpu = new PAR2ProcCPU(IF_LIBUV(loop)); + if(test.useOcl) par2ocl = new PAR2ProcOCL(IF_LIBUV(loop)); + // note the above needs to be allocated before this lambda, so that it captures the allocated values as opposed to nullptr + + auto endCb = [=]() { + std::shared_ptr doneCount(new unsigned(0)); + for(unsigned outputNum=0; outputNum>= 8; +#else + src[region][regionSize/2] &= 0xff; +#endif + regionSize++; + } + regionSize /= 2; + int printFrom = loc; + if(loc > regionSize) printFrom = 0; + size_t printTo = std::min((int)regionSize, printFrom+32); + std::cout << "Input " << region << ":" << std::endl; + print_mem_region(src[region], printFrom, printTo); + + + uint16_t coeff = gfmat_coeff(inputIndicies[region], outputIndicies[outputNum]); + std::cout << "Input " << region << " (*" << coeff << "):" << std::endl; + // since we're exiting, just edit in-place + for(unsigned iidx=printFrom; iidxdeinit(deinitCb); +#else + par2->deinit(); + deinitCb(); +#endif + } + }; +#ifdef USE_LIBUV + par2->getOutput(outputNum, buffer, outputCb); +#else + outputCb(par2->getOutput(outputNum, buffer).get()); +#endif + } + }; + + std::shared_ptr input(new unsigned(0)); + auto addInputCb = [=](unsigned) { + if(*input >= test.numInputs) return; + // TODO: make last chunk smaller + while(1) { + IF_NOT_LIBUV(par2->waitForAdd()); + auto added = par2->addInput(src[*input], *input == test.numInputs-1 ? 
test.lastSliceSize : test.sliceSize, inputIndicies[*input], false IF_LIBUV(, nullptr)); +#ifdef USE_LIBUV + if(!added) break; +#else + (void)added; +#endif + if(++(*input) == test.numInputs) { +#ifdef USE_LIBUV + par2->endInput(endCb); +#else + par2->endInput().get(); + endCb(); +#endif + break; + } + } + }; + + std::vector par2backends; + if(test.useCpu && test.useOcl) { + // split between the two evenly + // TODO: test different splits + size_t half = test.sliceSize >> 1; + half += half&1; + par2backends.push_back({par2ocl, 0, half}); + par2backends.push_back({par2cpu, half, test.sliceSize-half}); + } else if(test.useCpu) { + par2backends.push_back({par2cpu, 0, test.sliceSize}); + } else { + par2backends.push_back({par2ocl, 0, test.sliceSize}); + } + + par2->init(test.sliceSize, par2backends IF_LIBUV(, addInputCb)); + if(par2cpu) par2cpu->init(test.cpuMethod); + if(test.cpuThreads) par2cpu->setNumThreads(test.cpuThreads); + if(par2ocl) par2ocl->init(test.oclMethod); + if(!par2->setRecoverySlices(test.numOutputs, outputIndicies)) { + std::cout << "Init failed" << std::endl; + exit(1); + } + + // generate reference + for(unsigned output=0; output outputSizeTests{1, 15, 16}; // must be less than MAX_TEST_OUTPUTS + gf16_generate_log_tables(); + gfmat_init(); + + if(useOcl) { + if(PAR2ProcOCL::load_runtime()) { + std::cerr << "OpenCL load failed" << std::endl; + return 1; + } + } + + // generate source regions + srand(0x01020304); + for(unsigned i=0; i tests; + const std::vector sliceSizes{2, REGION_SIZE-2, REGION_SIZE}; + for(size_t sliceSize : sliceSizes) { + + std::vector lastSliceSizes{1, 2}; + if(sliceSize > 2) { + lastSliceSizes.push_back(sliceSize-1); + lastSliceSizes.push_back(sliceSize); + } + for(const auto& lastSliceSize : lastSliceSizes) { + if(lastSliceSize < 1) continue; + + for(unsigned numOutputs : outputSizeTests) { + const std::vector inputSizes{1, 15, 16}; // must be less than MAX_TEST_REGIONS + for(const auto& numRegions : inputSizes) { + 
if(numRegions == 1 && lastSliceSize != sliceSize) continue; // pointless test + if(lastSliceSize != sliceSize && lastSliceSize != 1 && (numRegions > 15 || numOutputs > 2)) + continue; // don't bother testing every lastSliceSize against all input/output region combinations (only test partial and full) + + + if(useCpu && useOcl) { + tests.push({ + sliceSize, lastSliceSize, numRegions, numOutputs, GF16_AUTO, 0, GF16OCL_AUTO, useCpu, useOcl + }); + } else if(useCpu) { + const std::vector methods = skipMethods ? std::vector{GF16_AUTO} : PAR2ProcCPU::availableMethods(); + const std::vector threadTests{1, 2, 23}; + for(auto threads : threadTests) { + for(const auto& method : methods) { + tests.push({ + sliceSize, lastSliceSize, numRegions, numOutputs, method, threads, GF16OCL_AUTO, useCpu, useOcl + }); + } + } + } else { + const std::vector methods = skipMethods ? std::vector{GF16OCL_AUTO} : PAR2ProcOCL::availableMethods(); + for(const auto& method : methods) { + tests.push({ + sliceSize, lastSliceSize, numRegions, numOutputs, GF16_AUTO, 0, method, useCpu, useOcl + }); + } + } + + } + } + } + } + + std::function testRunner; + testRunner = [=, &tests, &testRunner]() -> bool { + if(tests.empty()) return false; +#ifndef USE_LIBUV + (void)testRunner; +#endif + + auto test = tests.front(); + tests.pop(); + if(verbose) { + test.print("Test "); + std::cout << std::endl; + } + run_test(test IF_LIBUV(, testRunner)); + return true; + }; + +#ifdef USE_LIBUV + testRunner(); + uv_run(loop, UV_RUN_DEFAULT); + uv_loop_close(loop); + delete loop; +#else + while(testRunner()); +#endif + + + for(int i=0; i +#include +#include + +static bool p2c_invert(std::vector inputValid, std::vector recovery, Galois16*& leftmatrix) { + // get reference from par2cmdline + std::vector outputrows; + for(uint16_t r : recovery) + outputrows.push_back(RSOutputRow(true, r)); + return ReedSolomon_Compute(inputValid, outputrows, leftmatrix); +} + +static void compare_invert(const Galois16RecMatrix& mat, 
Galois16* leftmatrix, std::vector inputValid, std::vector recovery) { + unsigned validCount = std::count(inputValid.begin(), inputValid.end(), true); + unsigned invalidCount = inputValid.size()-validCount; + + if(recovery.size() != invalidCount) abort(); + + // compare + for(unsigned outRow = 0; outRow < invalidCount; outRow++) + for(unsigned inCol = 0; inCol < inputValid.size(); inCol++) { + if(leftmatrix[outRow * inputValid.size() + inCol] != mat.GetFactor(inCol, outRow)) + abort(); + } +} + +static void do_test(std::vector inputValid, std::vector recovery, Galois16Methods method) { + std::sort(recovery.begin(), recovery.end()); + + // get reference from par2cmdline + Galois16* leftmatrix = nullptr; + bool canInvert = p2c_invert(inputValid, recovery, leftmatrix); + + // do inversion + unsigned validCount = std::count(inputValid.begin(), inputValid.end(), true); + Galois16RecMatrix mat; + mat.regionMethod = (int)method; + if(mat.Compute(inputValid, validCount, recovery) != canInvert) abort(); + if(canInvert) { + compare_invert(mat, leftmatrix, inputValid, recovery); + } + if(leftmatrix) + delete[] leftmatrix; +} + +static void show_help() { + std::cout << "test-inv [-v] [-f]" << std::endl; + exit(0); +} + +int main(int argc, char** argv) { + bool verbose = false; + bool fast = false; // faster test: only test default method + fewer iterations + + for(int i=1; i methods = fast ? 
std::vector{GF16_AUTO} : Galois16Mul::availableMethods(true); + + for(auto method : methods) { + // one block only + do_test(std::vector{false}, std::vector{0}, method); + do_test(std::vector{false}, std::vector{1}, method); + do_test(std::vector{false}, std::vector{65534}, method); + // first block is bad + do_test(std::vector{false, true}, std::vector{0}, method); + // 3/4 bad blocks, just enough recovery + do_test(std::vector{false, false, true, false}, std::vector{0,1,2}, method); + // all bad blocks, insufficient recovery + do_test(std::vector{false, false, false, false}, std::vector{0,1,5}, method); + // all bad blocks, sufficient recovery + do_test(std::vector{false, false, false, false}, std::vector{1,5,8,100}, method); + // PAR2 flaw (can't invert matrix) [https://sourceforge.net/p/parchive/mailman/parchive-devel/thread/202374635.20040218104317%40pbclements.co.uk/] + std::vector flawedInput(6555, true); + flawedInput[0] = false; + flawedInput[6554] = false; + do_test(flawedInput, std::vector{0,5}, method); + // invertible + do_test(flawedInput, std::vector{0,6}, method); + + // PAR2 flaw, but invertible by discarding a bad recovery + { + Galois16RecMatrix mat; + std::vector recovery{0,5,6}; + mat.regionMethod = (int)method; + + unsigned validCount = std::count(flawedInput.begin(), flawedInput.end(), true); + if(!mat.Compute(flawedInput, validCount, recovery)) abort(); + if(recovery.size() != 2) abort(); + if(!((recovery.at(0) == 0 || recovery.at(0) == 5) && recovery.at(1) == 6)) abort(); + + Galois16* leftmatrix = nullptr; + bool canInvert = p2c_invert(flawedInput, recovery, leftmatrix); + if(!canInvert) abort(); + + compare_invert(mat, leftmatrix, flawedInput, recovery); + delete[] leftmatrix; + } + + // a few more tests to check multi-region multiplies work + do_test(std::vector{false, false, false, false, false}, std::vector{0,3,5,17,65534}, method); + do_test(std::vector{false, false, false, false, false, false}, std::vector{0,1,2,3,32768,65534}, 
method); + do_test(std::vector{false, false, false, false, false, false, false}, std::vector{0,1,2,3,4,5,6}, method); + do_test(std::vector{true, false, false, false, false, false, false, false, false}, std::vector{0,1,2,3,5,6,7,8}, method); + } + + + + std::cout << "Random tests..." << std::endl; + + std::mt19937 rnd; + rnd.seed(0x01020304); + std::vector recIdx(65535); + for(int i=0; i<65535; i++) recIdx[i] = i; + + const std::vector inputSizeTests{2, 100, 1234, 32768}; + for(uint16_t iSize : inputSizeTests) { + std::vector inputValid(iSize); + std::vector validProb{0.1f, 0.5f, 0.9f}; + if(iSize == 32768) { + validProb.clear(); + validProb.push_back(0.01f); // otherwise would be too slow + } + + for(int round=0; round<(iSize>100?(fast?1:2):10); round++) { + for(float pValid : validProb) { + uint16_t invalidCount = 0; + // generate distribution + for(int v=0; v pValid; + invalidCount += inputValid[v] ? 0 : 1; + } + if(invalidCount < 1) continue; + + + // num outputs = num failures + std::shuffle(recIdx.begin(), recIdx.end(), rnd); + std::vector recovery(recIdx.begin(), recIdx.begin() + invalidCount); + std::sort(recovery.begin(), recovery.end()); + + // get reference from par2cmdline + Galois16* leftmatrix = nullptr; + bool canInvert = p2c_invert(inputValid, recovery, leftmatrix); + + for(auto method : methods) { + if(verbose) std::cout << " " << iSize << "x" << invalidCount << " [" << (pValid*100) << "% validity] (" << Galois16Mul::methodToText(method) << ")" << std::endl; + + + recovery = std::vector(recIdx.begin(), recIdx.begin() + invalidCount); + std::sort(recovery.begin(), recovery.end()); + + // do inversion + Galois16RecMatrix mat; + mat.regionMethod = (int)method; + if(mat.Compute(inputValid, iSize-invalidCount, recovery) != canInvert) abort(); + if(canInvert) { + compare_invert(mat, leftmatrix, inputValid, recovery); + } + } + + if(leftmatrix) + delete[] leftmatrix; + } + } + } + + gfmat_free(); + std::cout << "Tests passed" << std::endl; + + return 0; 
+} \ No newline at end of file diff --git a/test/gf16/test-pmul.cpp b/test/gf16/test-pmul.cpp new file mode 100644 index 00000000..159ebbfa --- /dev/null +++ b/test/gf16/test-pmul.cpp @@ -0,0 +1,106 @@ + +#include "gf16pmul.h" +#include "test.h" + +const int MAX_TEST_REGIONS = 20; +// earlier GCC doesn't like `const int` used for alignment statements, so use a define instead +#define REGION_ALIGNMENT 4096 +const int REGION_SIZE = MAX_TEST_REGIONS * 1024; // largest stride = 1024 bytes from Xor512 + +struct TestFunc { + Galois16PointMulMethods id; + Gf16PMulFunc fn; + unsigned blocklen; +}; +static void show_help() { + std::cout << "test-pmul [-v]" << std::endl; + exit(0); +} + +int main(int argc, char** argv) { + bool verbose = false; + int seeds[] = {0x01020304, 0x50607080 }; + + for(int i=1; i funcs; + if(gf16pmul_available_sse) + funcs.push_back({ + GF16PMUL_PCLMUL, &gf16pmul_sse, 16 + }); + if(gf16pmul_available_avx2) + funcs.push_back({ + GF16PMUL_AVX2, &gf16pmul_avx2, 32 + }); + if(gf16pmul_available_vpclmul) + funcs.push_back({ + GF16PMUL_VPCLMUL, &gf16pmul_vpclmul, 32 + }); + if(gf16pmul_available_vpclgfni) + funcs.push_back({ + GF16PMUL_VPCLMUL_GFNI, &gf16pmul_vpclgfni, 64 + }); + if(gf16pmul_available_neon) + funcs.push_back({ + GF16PMUL_NEON, &gf16pmul_neon, 32 + }); + if(gf16pmul_available_sve2) + funcs.push_back({ + GF16PMUL_SVE2, &gf16pmul_sve2, gf16pmul_sve2_width()*2 + }); + + for(int seed : seeds) { + // generate source regions + ref + srand(seed); + for(size_t i=0; i methods = Galois16Mul::availableMethods(true); + std::vector gf; + std::vector gfScratch; + for(auto method : methods) { + gf.emplace_back(method); + } + gfScratch.reserve(methods.size()); + for(const auto& g : gf) { + gfScratch.push_back(g.mutScratch_alloc()); + } + + bool testAllFuncs = true; + bool testCksum = false, testPrep = false, testMul = false, testAdd = false, testPow = false, testWord = false; + for(int i=1; i outputSizeTests{1, 2, 15, 16, 17}; // must be less than 
MAX_TEST_OUTPUTS + + // allocate src/tmp regions + uint16_t* src; + uint16_t* tmp, * tmp2; + ALIGN_ALLOC(src, REGION_SIZE*MAX_TEST_REGIONS, REGION_ALIGNMENT); + ALIGN_ALLOC(tmp, REGION_SIZE*MAX_TEST_REGIONS, REGION_ALIGNMENT); + ALIGN_ALLOC(tmp2, REGION_SIZE*MAX_TEST_REGIONS, REGION_ALIGNMENT); + uint16_t* dst; + uint16_t* ref; + const unsigned allocOutputs = MAX_TEST_OUTPUTS > MAX_PACK_REGIONS ? MAX_TEST_OUTPUTS : MAX_PACK_REGIONS; + ALIGN_ALLOC(dst, (REGION_SIZE+MAX_MISALIGN*2)*allocOutputs, REGION_ALIGNMENT); + ALIGN_ALLOC(ref, REGION_SIZE*allocOutputs, REGION_ALIGNMENT); + if(!src || !tmp || !dst || !ref) { + std::cout << "Failed to allocate memory" << std::endl; + return 2; + } + + uint16_t* srcM[MAX_TEST_REGIONS]; + uint16_t* tmpM[MAX_TEST_REGIONS]; + for(size_t i=0; i regionSizes{g.info().stride, g.info().stride-1, REGION_SIZE, REGION_SIZE-1, REGION_SIZE+1}; + for(unsigned regionSize : regionSizes) { + if(verbose) std::cout << " " << g.info().name << ": regionSize=" << regionSize << std::endl; + memset(tmp, seed&0xff, REGION_SIZE*2); + memset(dst, seed&0xff, REGION_SIZE*2); + g.copy_cksum(tmp, src, regionSize, regionSize); + unsigned totalSize = regionSize + g.info().cksumSize; + if(memcmp(dst, (char*)tmp+totalSize, REGION_SIZE*2 - totalSize)) { + std::cout << "Cksum copy checksum wrote too much data: " << g.info().name << " (regionSize=" << regionSize << ")" << std::endl; + return 1; + } + if(!g.copy_cksum_check(dst, tmp, regionSize)) { + std::cout << "Cksum copy checksum failure: " << g.info().name << " (regionSize=" << regionSize << ")" << std::endl; + std::cout << "Checksum:" << std::endl; + print_mem_region((uint16_t*)((uintptr_t)tmp + regionSize), 0, g.info().cksumSize/2); + if(regionSize <= g.info().stride*2) { + std::cout << "Data:" << std::endl; + print_mem_region(src, 0, (regionSize+1)/2); + } + return 1; + } + if(memcmp(dst, src, regionSize)) { + std::cout << "Cksum copy data failure: " << g.info().name << " (regionSize=" << regionSize << ")" << 
std::endl; + display_mem_diff(src, dst, regionSize/2); + return 1; + } + // check that it detects failure + tmp[0] ^= 0x1111; + if(g.copy_cksum_check(dst, tmp, regionSize)) { + std::cout << "Cksum copy failed to detect checksum error: " << g.info().name << " (regionSize=" << regionSize << ")" << std::endl; + std::cout << "Checksum:" << std::endl; + print_mem_region((uint16_t*)((uintptr_t)tmp + regionSize), 0, g.info().cksumSize/2); + return 1; + } + + + // test with add + const std::vector lastRegionSizes{1, 2, REGION_SIZE/2-1, REGION_SIZE/2, REGION_SIZE/2+1, regionSize-1, regionSize}; + for(auto lastRegionSize : lastRegionSizes) { + if(lastRegionSize > regionSize) continue; + g.copy_cksum(tmp2, srcM[0], regionSize, regionSize); + g.copy_cksum(tmp, srcM[1], lastRegionSize, regionSize); + unsigned addSize = regionSize + g.info().stride; + while(addSize % g.info().stride) + addSize++; + g.mul_add(tmp2, tmp, addSize, 1, gfScratch[gi]); + if(!g.copy_cksum_check(dst, tmp2, regionSize)) { + std::cout << "Cksum copy checksum (with add) failure: " << g.info().name << " (regionSize=" << regionSize << ", lastRegionSize=" << lastRegionSize << ")" << std::endl; + return 1; + } + // the zeroed section of the second region should be the same + if(memcmp((char*)dst + lastRegionSize, (char*)src + lastRegionSize, regionSize - lastRegionSize)) { + std::cout << "Cksum copy data (with add) failure: " << g.info().name << " (regionSize=" << regionSize << ", lastRegionSize=" << lastRegionSize << ")" << std::endl; + display_mem_diff(src + lastRegionSize/2, dst + lastRegionSize/2, (regionSize-lastRegionSize+1)/2); + return 1; + } + } + } + } + } + + // test prepare/finish + if(testPrep) { + std::cout << "Testing prepare/finish..." 
<< std::endl; + for(const auto& g : gf) { + if(!g.needPrepare()) continue; + //const unsigned regionSize = rounddown_to(REGION_SIZE, g.info().stride); + const unsigned regionSize = MAX_TEST_REGIONS * g.info().stride; + if(verbose) std::cout << " " << g.info().name << std::endl; + memset(dst, seed&0xff, REGION_SIZE); // scramble, to ensure we're actually doing something + g.prepare(dst, src, regionSize); + g.finish(dst, regionSize); + if(memcmp(dst, src, regionSize)) { + std::cout << "Prepare/finish failure: " << g.info().name << std::endl; + display_mem_diff(src, dst, regionSize/2); + return 1; + } + // test prepare not aligned to stride + for(int offset = -(int)g.info().stride+1; offset < 0; offset++) { + memset(dst, seed&0xff, REGION_SIZE); // fill with non-zero to test zero-fill + g.prepare(dst, src, regionSize + offset); + g.finish(dst, regionSize); + if(memcmp(dst, src, regionSize + offset)) { + std::cout << "Prepare/finish misaligned (" << offset << ") failure: " << g.info().name << std::endl; + display_mem_diff(src, dst, regionSize/2); + return 1; + } + if(memcmp((uint8_t*)dst + regionSize + offset, zeroes, -offset)) { + std::cout << "Prepare/finish misaligned zero-fill (" << offset << ") failure: " << g.info().name << std::endl; + print_mem_region(dst, (regionSize-g.info().stride)>>1, regionSize>>1); + return 1; + } + } + // test in-situ prepare + memcpy(dst, src, regionSize); + g.prepare(dst, dst, regionSize); + g.finish(dst, regionSize); + if(memcmp(dst, src, regionSize)) { + std::cout << "Prepare/finish in-situ failure: " << g.info().name << std::endl; + display_mem_diff(src, dst, regionSize/2); + return 1; + } + } + + // test prepare packed + accumulate + std::cout << "Testing prepare packed..." 
<< std::endl; + for(unsigned gi = 0; gi < gf.size(); gi++) { + const auto& g = gf[gi]; + + const unsigned stride = g.info().stride; + //const unsigned regionSize = rounddown_to(REGION_SIZE, stride); + const unsigned regionSize = MAX_TEST_REGIONS * g.info().stride; + const std::vector srcLenOffsets{0, 1, 2, 3, stride, stride+1, regionSize/2, regionSize/2+1, regionSize/2+stride, regionSize-stride, regionSize-1}; + for(const auto& srcLenOffset : srcLenOffsets) { + size_t srcLen = regionSize - srcLenOffset; + for(const auto& srcLenLastOffset : srcLenOffsets) { + size_t srcLenLast = regionSize - srcLenLastOffset; + if(srcLenLast > srcLen) continue; + + const std::vector chunkLenOffsets{-(int)stride, 0, (int)stride, (int)stride*2, (int)rounddown_to(regionSize/2, (int)stride), (int)rounddown_to(regionSize/2, (int)stride)+(int)stride, (int)roundup_to(regionSize/3, (int)stride), (int)(regionSize-stride)}; + for(const auto& chunkLenOffset : chunkLenOffsets) { + size_t chunkLen = regionSize - chunkLenOffset; + for(unsigned inputPackSize = 1; inputPackSize <= MAX_PACK_REGIONS; inputPackSize++) { + if(inputPackSize == 1 && srcLenLast != srcLen) continue; // pointless test + + if(verbose) std::cout << " " << g.info().name << ": srcLen=" << srcLen << ", srcLenLast=" << srcLenLast << ", chunkLen=" << chunkLen << ", inputPackSize=" << inputPackSize << std::endl; + + // generate reference + memset(ref, 0, REGION_SIZE); + for(unsigned inputNum = 0; inputNum < inputPackSize; inputNum++) { + size_t len = (inputNum == inputPackSize-1) ? srcLenLast : srcLen; + for(size_t i=0; i= 0) { + memset(tmp, seed&0xff, REGION_SIZE*MAX_PACK_REGIONS); // scramble, to ensure we're actually doing something + memset(dst, 0, REGION_SIZE); + + // pack input + for(unsigned inputNum = 0; inputNum < inputPackSize; inputNum++) { + size_t len = (inputNum == inputPackSize-1) ? 
srcLenLast : srcLen; + g.prepare_packed(tmp, srcM[inputNum], len, regionSize, inputPackSize, inputNum, chunkLen); + } + // compute output + for(size_t sliceOffset=0; sliceOffset < regionSize; sliceOffset += chunkLen) { + size_t len = chunkLen; + if(regionSize - sliceOffset < len) + len = roundup_to(regionSize - sliceOffset, stride); + g.add_multi_packed(inputPackSize, inputPackSize, (uint8_t*)dst + sliceOffset, (uint8_t*)tmp + sliceOffset*inputPackSize, len); + } + g.finish(dst, regionSize); + + // test result + if(memcmp(dst, ref, regionSize)) { + std::cout << "Prepare packed failure: " << g.info().name << ": srcLen=" << srcLen << ", srcLenLast=" << srcLenLast << ", chunkLen=" << chunkLen << ", inputPackSize=" << inputPackSize << std::endl; + display_mem_diff(ref, dst, regionSize/2); + return 1; + } + } + + + // test again using checksumming variant + const size_t regionSizeWithCksum = regionSize+stride; + memset(tmp, seed&0xff, regionSizeWithCksum*MAX_PACK_REGIONS); + memset(dst, (seed>>8)&0xff, REGION_SIZE); + + for(unsigned inputNum = 0; inputNum < inputPackSize; inputNum++) { + size_t len = (inputNum == inputPackSize-1) ? srcLenLast : srcLen; + g.prepare_packed_cksum(tmp, srcM[inputNum], len, regionSize, inputPackSize, inputNum, chunkLen); + } + // check that the partial prepare matches against full prepare + const std::vector lastPartLens{0, (int)stride, (int)stride*2, -(int)stride}; + for(const int lastPartLen : lastPartLens) if(srcLenLast >= (unsigned)abs(lastPartLen)) { + memset(tmp2, seed&0xff, regionSizeWithCksum*MAX_PACK_REGIONS); + for(unsigned inputNum = 0; inputNum < inputPackSize; inputNum++) { + size_t len = (inputNum == inputPackSize-1) ? 
srcLenLast : srcLen; + size_t first, last; + if(lastPartLen < 0) { + first = -lastPartLen; + } else { + first = len-lastPartLen; + if(first % stride && lastPartLen) // align to stride if this is the first part + first += stride - (first % stride); + } + if(first > len) first = len; + last = len-first; + g.prepare_partial_packsum(tmp2, srcM[inputNum], len, regionSize, inputPackSize, inputNum, chunkLen, 0, first); + if(last) + g.prepare_partial_packsum(tmp2, (char*)(srcM[inputNum]) + first, len, regionSize, inputPackSize, inputNum, chunkLen, len-last, last); + } + if(memcmp(tmp2, tmp, regionSizeWithCksum*MAX_PACK_REGIONS)) { + std::cout << "Prepare packed-cksum differs from partial version: " << g.info().name << ": srcLen=" << srcLen << ", srcLenLast=" << srcLenLast << ", chunkLen=" << chunkLen << ", inputPackSize=" << inputPackSize << ", lastPartLen=" << lastPartLen << std::endl; + display_mem_diff(tmp, tmp2, (regionSizeWithCksum*MAX_PACK_REGIONS)/2); + return 1; + } + } + memset(tmp2, 0, regionSizeWithCksum); + + for(size_t sliceOffset=0; sliceOffset < regionSizeWithCksum; sliceOffset += chunkLen) { + size_t len = chunkLen; + if(regionSizeWithCksum - sliceOffset < len) + len = roundup_to(regionSizeWithCksum - sliceOffset, stride); + g.add_multi_packed(inputPackSize, inputPackSize, (uint8_t*)tmp2 + sliceOffset, (uint8_t*)tmp + sliceOffset*inputPackSize, len); + } + int checksumResult = g.finish_packed_cksum(dst, tmp2, regionSize, 1, 0, regionSizeWithCksum); + if(memcmp(dst, ref, regionSize)) { + std::cout << "Prepare packed-cksum failure: " << g.info().name << ": srcLen=" << srcLen << ", srcLenLast=" << srcLenLast << ", chunkLen=" << chunkLen << ", inputPackSize=" << inputPackSize << std::endl; + display_mem_diff(ref, dst, regionSize/2); + return 1; + } + if(!checksumResult) { + std::cout << "Prepare/finish packed checksum failure: " << g.info().name << ": srcLen=" << srcLen << ", srcLenLast=" << srcLenLast << ", chunkLen=" << chunkLen << ", inputPackSize=" << 
inputPackSize << std::endl; + return 1; + } + } + } + } + } + } + + std::cout << "Testing finish packed..." << std::endl; + { + uint16_t coeffs[MAX_PACK_REGIONS]; // used for finish-cksum + for(auto& coeff : coeffs) + coeff = rand() & 0xffff; + + for(unsigned gi = 0; gi < gf.size(); gi++) { + const auto& g = gf[gi]; + + const unsigned stride = g.info().stride; + //const unsigned alignedRegionSize = rounddown_to(REGION_SIZE, stride); + const unsigned alignedRegionSize = MAX_TEST_REGIONS * g.info().stride; + + const std::vector srcLenOffsets{0, 2, stride-2}; + for(const auto& srcLenOffset : srcLenOffsets) { + size_t srcLen = alignedRegionSize - srcLenOffset; + + const std::vector chunkLenOffsets{-(int)stride, 0, (int)stride, (int)stride*2, (int)rounddown_to(alignedRegionSize/2, (int)stride), (int)rounddown_to(alignedRegionSize/2, (int)stride)+(int)stride, (int)roundup_to(alignedRegionSize/3, (int)stride), (int)(alignedRegionSize-stride)}; + for(const auto& chunkLenOffset : chunkLenOffsets) { + size_t chunkLen = alignedRegionSize - chunkLenOffset; + for(unsigned numOutputs = 1; numOutputs <= MAX_PACK_REGIONS; numOutputs++) { + if(verbose) std::cout << " " << g.info().name << ": srcLen=" << srcLen << ", chunkLen=" << chunkLen << ", numOutputs=" << numOutputs << std::endl; + + if(chunkLenOffset >= 0) { + memset(dst, seed&0xff, REGION_SIZE*MAX_PACK_REGIONS); // scramble, to ensure we're actually doing something + + // pack input + // TODO: if there's output interleaving, this won't work :( + for(unsigned outputNum = 0; outputNum < numOutputs; outputNum++) { + unsigned chunk = 0; + for(size_t pos = 0; pos < srcLen; pos += chunkLen) { + size_t len = srcLen - pos; + if(len > chunkLen) len = chunkLen; + g.prepare(tmp + (chunk*numOutputs*chunkLen + outputNum*roundup_to(len, stride))/2, srcM[outputNum] + pos/2, len); + ++chunk; + } + } + /* + for(unsigned outputNum = 0; outputNum < numOutputs; outputNum++) { + g.prepare_packed(tmp, srcM[outputNum], srcLen, alignedRegionSize, 
numOutputs, outputNum, chunkLen); + } + // TODO: need to fix the below + for(unsigned outputNum = 0; outputNum < numOutputs; outputNum++) { + g.mul_add_multi_packed(numOutputs, numOutputs, tmp2, tmp, chunkLen, <0s>, gfScratch[gi]); + } + */ + // unpack output + for(unsigned misalign = 0; misalign < MAX_MISALIGN; misalign++) { + for(unsigned outputNum = 0; outputNum < numOutputs; outputNum++) { + // because dstM is region aligned and aliased, we need to hack around the fact that misalignment overflows the regions + uint8_t* outputDst = (uint8_t*)dstM[outputNum] + misalign + misalign * outputNum*2; + uint16_t* odPre = (uint16_t*)(outputDst - misalign); + uint16_t* odPost = (uint16_t*)(outputDst + srcLen); + memcpy(odPre, guard_magic, misalign); + memcpy(odPost, guard_magic, misalign); + g.finish_packed(outputDst, tmp, srcLen, numOutputs, outputNum, chunkLen); + + // test result + if(memcmp(outputDst, srcM[outputNum], srcLen)) { + std::cout << "Packed finish failure: " << g.info().name << ", output " << outputNum << ": srcLen=" << srcLen << ", chunkLen=" << chunkLen << ", numOutputs=" << numOutputs << std::endl; + display_mem_diff(srcM[outputNum], (uint16_t*)outputDst, (alignedRegionSize*numOutputs)/2); + return 1; + } + if(memcmp(odPre, guard_magic, misalign)) { + std::cout << "Packed finish pre-guard bytes corrupted: " << g.info().name << ", output " << outputNum << ": srcLen=" << srcLen << ", chunkLen=" << chunkLen << ", numOutputs=" << numOutputs << ", misalign=" << misalign << std::endl; + print_mem_region(odPre, 0, (misalign+1)/2); + return 1; + } + if(memcmp(odPost, guard_magic, misalign)) { + std::cout << "Packed finish post-guard bytes corrupted: " << g.info().name << ", output " << outputNum << ": srcLen=" << srcLen << ", chunkLen=" << chunkLen << ", numOutputs=" << numOutputs << ", misalign=" << misalign << std::endl; + print_mem_region(odPost, 0, (misalign+1)/2); + return 1; + } + } + } + } + + // test finish with checksum + const size_t 
regionSizeWithCksum = alignedRegionSize+stride; + memset(tmp, seed&0xff, regionSizeWithCksum*numOutputs); + memset(dst, seed&0xff, REGION_SIZE*numOutputs); + + g.prepare_packed_cksum(tmp2, src, srcLen, alignedRegionSize, 1, 0, chunkLen); + for(unsigned outputNum = 0; outputNum < numOutputs; outputNum++) { + for(size_t sliceOffset=0; sliceOffset < regionSizeWithCksum; sliceOffset += chunkLen) { + size_t len = chunkLen; + if(regionSizeWithCksum - sliceOffset < len) + len = roundup_to(regionSizeWithCksum - sliceOffset, stride); + //g.mul((uint8_t*)tmp + outputNum*len + sliceOffset*numOutputs, (uint8_t*)tmp2 + sliceOffset, len, coeffs[outputNum], gfScratch[gi]); + + uint8_t* tmpPtr = (uint8_t*)tmp + outputNum*len + sliceOffset*numOutputs; + memset(tmpPtr, 0, len); + g.mul_add_multi_packed(1, 1, tmpPtr, (uint8_t*)tmp2 + sliceOffset, len, coeffs + outputNum, gfScratch[gi]); + } + } + for(unsigned misalign = 0; misalign < MAX_MISALIGN; misalign++) { + for(unsigned outputNum = 0; outputNum < numOutputs; outputNum++) { + uint8_t* outputDst = (uint8_t*)dstM[outputNum] + misalign + misalign * outputNum*2; + uint16_t* odPre = (uint16_t*)(outputDst - misalign); + uint16_t* odPost = (uint16_t*)(outputDst + srcLen); + memcpy(odPre, guard_magic, misalign); + memcpy(odPost, guard_magic, misalign); + + // compute reference + for(size_t i=0; i firstLens{srcLen, 0, stride, stride*2}; + if(srcLen % stride) { + size_t srcLenAligned = srcLen - (srcLen % stride); + firstLens.push_back(srcLenAligned); + firstLens.push_back(srcLenAligned - stride); + } else + firstLens.push_back(srcLen - stride); + for(size_t firstLen : firstLens) { + int checksumResult; + if(firstLen == srcLen) + checksumResult = g.finish_packed_cksum(outputDst, tmp, srcLen, numOutputs, outputNum, chunkLen); + else { + memcpy(tmp2, tmp, regionSizeWithCksum*numOutputs); + if(firstLen) + g.finish_partial_packsum(outputDst, tmp2, srcLen, numOutputs, outputNum, chunkLen, 0, firstLen); + checksumResult = 
g.finish_partial_packsum(outputDst+firstLen, tmp2, srcLen, numOutputs, outputNum, chunkLen, firstLen, srcLen-firstLen); + } + if(memcmp(outputDst, ref, srcLen)) { + std::cout << "Packed finish-cksum failure: " << g.info().name << ", output " << outputNum << ": srcLen=" << srcLen << ", chunkLen=" << chunkLen << ", numOutputs=" << numOutputs << ", firstLen=" << firstLen << std::endl; + display_mem_diff(ref, (uint16_t*)outputDst, srcLen/2); + return 1; + } + if(memcmp(odPre, guard_magic, misalign)) { + std::cout << "Packed finish pre-guard bytes corrupted: " << g.info().name << ", output " << outputNum << ": srcLen=" << srcLen << ", chunkLen=" << chunkLen << ", numOutputs=" << numOutputs << ", misalign=" << misalign << ", firstLen=" << firstLen << std::endl; + print_mem_region(odPre, 0, (misalign+1)/2); + return 1; + } + if(memcmp(odPost, guard_magic, misalign)) { + std::cout << "Packed finish post-guard bytes corrupted: " << g.info().name << ", output " << outputNum << ": srcLen=" << srcLen << ", chunkLen=" << chunkLen << ", numOutputs=" << numOutputs << ", misalign=" << misalign << ", firstLen=" << firstLen << std::endl; + print_mem_region(odPost, 0, (misalign+1)/2); + return 1; + } + if(!checksumResult) { + std::cout << "Prepare/finish packed checksum failure: " << g.info().name << ", output " << outputNum << ": srcLen=" << srcLen << ", chunkLen=" << chunkLen << ", numOutputs=" << numOutputs << ", misalign=" << misalign << ", firstLen=" << firstLen << std::endl; + return 1; + } + } + } + } + } + } + } + } + } + } + + // test mul/mul_add + if(testMul) { + std::cout << "Testing mul/muladd..." << std::endl; + for(int test=0; test<(fastMul ? 
256 : 65536); test++) { + int coeff = test; + if(fastMul && test > 1) + coeff = rand() & 0xffff; + // compute mul reference + for(size_t i=0; i= MAX_TEST_REGIONS) break; + + // packed muladd_multi + g.prepare(dst, src2, regionSize); + for(unsigned region = 0; region < maxRegions; region++) + g.prepare_packed(tmp, srcM[region], regionSize, regionSize, maxRegions+blankRegions, region, regionSize); + g.mul_add_multi_packed(maxRegions+blankRegions, maxRegions, dst, tmp, regionSize, coeffs, gfScratch[gi]); + g.finish(dst, regionSize); + if(memcmp(dst, ref, regionSize)) { + std::cout << "Mul_add_multi_packed (" << maxRegions << "+" << blankRegions << ") failure: " << g.info().name << std::endl; + display_mem_diff(ref, dst, regionSize/2); + return 1; + } + + // packed muladd_multi with prefetch + // can't really test prefetch functionality, so just test it like above + g.prepare(dst, src2, regionSize); + for(unsigned region = 0; region < maxRegions; region++) + g.prepare_packed(tmp, srcM[region], regionSize, regionSize, maxRegions+blankRegions, region, regionSize); + g.mul_add_multi_packpf(maxRegions+blankRegions, maxRegions, dst, tmp, regionSize, coeffs, gfScratch[gi], tmp, tmp2 /*prefetch - any memory will do*/); + g.finish(dst, regionSize); + if(memcmp(dst, ref, regionSize)) { + std::cout << "Mul_add_multi_packpf (" << maxRegions << "+" << blankRegions << ") failure: " << g.info().name << std::endl; + display_mem_diff(ref, dst, regionSize/2); + return 1; + } + } + } + } + } + } + + + // test multi_add + if(testAdd) { + std::cout << "Testing multi add..." 
<< std::endl; + for(unsigned maxRegions=1; maxRegions= MAX_TEST_REGIONS) break; + + g.prepare(dst, src2, regionSize); + for(unsigned region = 0; region < maxRegions; region++) + g.prepare_packed(tmp, srcM[region], regionSize, regionSize, maxRegions+blankRegions, region, regionSize); + g.add_multi_packed(maxRegions+blankRegions, maxRegions, dst, tmp, regionSize); + g.finish(dst, regionSize); + if(memcmp(dst, ref, regionSize)) { + std::cout << "Add_multi_packed (" << maxRegions << "+" << blankRegions << ") failure: " << g.info().name << std::endl; + display_mem_diff(ref, dst, regionSize/2); + return 1; + } + + // packed add_multi with prefetch + // can't really test prefetch functionality, so just test it like above + g.prepare(dst, src2, regionSize); + for(unsigned region = 0; region < maxRegions; region++) + g.prepare_packed(tmp, srcM[region], regionSize, regionSize, maxRegions+blankRegions, region, regionSize); + g.add_multi_packpf(maxRegions+blankRegions, maxRegions, dst, tmp, regionSize, tmp, tmp2 /*prefetch - any memory will do*/); + g.finish(dst, regionSize); + if(memcmp(dst, ref, regionSize)) { + std::cout << "Add_multi_packpf (" << maxRegions << "+" << blankRegions << ") failure: " << g.info().name << std::endl; + display_mem_diff(ref, dst, regionSize/2); + return 1; + } + } + } + } + } + + + if(testPow) { + std::cout << "Testing pow..." << std::endl; + for(int outputs : outputSizeTests) { + for(int test=0; test<(fastMul ? 
256 : 65536); test++) { + int coeff = test; + if(fastMul && test > 1) + coeff = rand() & 0xffff; + + // compute pow reference + for(int output=0, curCoeff=coeff; output < outputs; output++, curCoeff = gf16_mul(curCoeff, coeff)) { + for(size_t i=0; i 1) + coeff = rand() & 0xffff; + + // compute pow reference + for(int output=0, curCoeff=coeff; output < outputs; output++, curCoeff = gf16_mul(curCoeff, coeff)) { + for(size_t i=0; i +#include +#include +#include +#include +#include + +#ifdef _MSC_VER +# define ALIGN_TO(a, v) __declspec(align(a)) v +#else +# define ALIGN_TO(a, v) v __attribute__((aligned(a))) +#endif + +#include +#if defined(_MSC_VER) || defined(__MINGW32__) || defined(__MINGW64__) + // MSVC doesn't support C11 aligned_alloc: https://stackoverflow.com/a/62963007 + #define ALIGN_ALLOC(buf, len, align) *(void**)&(buf) = _aligned_malloc((len), align) + #define ALIGN_FREE _aligned_free +#elif defined(_ISOC11_SOURCE) + // C11 method + // len needs to be a multiple of alignment, although it sometimes works if it isn't... 
+ #define ALIGN_ALLOC(buf, len, align) *(void**)&(buf) = aligned_alloc(align, ((len) + (align)-1) & ~((align)-1)) + #define ALIGN_FREE free +#elif defined(__cplusplus) && __cplusplus >= 201700 + // C++17 method + #include + #define ALIGN_ALLOC(buf, len, align) *(void**)&(buf) = std::aligned_alloc(align, ((len) + (align)-1) & ~((align)-1)) + #define ALIGN_FREE free +#else + #define ALIGN_ALLOC(buf, len, align) if(posix_memalign((void**)&(buf), align, (len))) (buf) = NULL + #define ALIGN_FREE free +#endif + +#ifdef _MSC_VER +# ifndef __BYTE_ORDER__ +# define __BYTE_ORDER__ 1234 +# endif +# ifndef __ORDER_BIG_ENDIAN__ +# define __ORDER_BIG_ENDIAN__ 4321 +# endif +#endif + + +const uint8_t guard_magic[] = { 0xdb, 0xef, 0x55, 0xf4 }; + +static inline size_t roundup_to(size_t n, size_t rounding) { + return ((n + rounding-1) / rounding) * rounding; +} +static inline size_t rounddown_to(size_t n, size_t rounding) { + return (n / rounding) * rounding; +} + +static uint16_t gf16_log[65536]; +static uint16_t gf16_antilog[65536]; +static void gf16_generate_log_tables(int polynomial = 0x1100b) { + int n = 1; + memset(gf16_log, 0, sizeof(gf16_log)); + for(int i=0; i<65535; i++) { + gf16_log[n] = i; + gf16_antilog[i] = n; + n <<= 1; + if(n > 0xffff) n ^= polynomial; + } + gf16_antilog[65535] = gf16_antilog[0]; +} +static inline uint16_t gf16_mul(uint16_t a, uint16_t b) { + if(a == 0 || b == 0) return 0; + int log_prod = (int)gf16_log[a] + (int)gf16_log[b]; + return gf16_antilog[(log_prod >> 16) + (log_prod & 0xffff)]; +} + +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +static inline uint16_t gf16_mul_le(uint16_t src, uint16_t coeff) { + uint16_t r = gf16_mul((src>>8) | ((src&0xff)<<8), coeff); + return (r >> 8) | ((r & 0xff) << 8); +} +#else +# define gf16_mul_le gf16_mul +#endif + +static int find_mem_diff(const uint16_t* a, const uint16_t* b, int n) { + for(int i=0; i n) to = n; + + printf("Expected:\n"); + print_mem_region(a, from, to); + printf("Actual:\n"); + 
print_mem_region(b, from, to); + return from; +} diff --git a/test/hasher/CMakeLists.txt b/test/hasher/CMakeLists.txt new file mode 100644 index 00000000..507dd536 --- /dev/null +++ b/test/hasher/CMakeLists.txt @@ -0,0 +1,154 @@ +cmake_minimum_required(VERSION 2.8.9...3.22) +project(hasher_test) + +option(SKIP_AUX "Bypass getauxval checks (for testing purposes)" OFF) + +include(CheckCXXCompilerFlag) +include(CheckIncludeFileCXX) +include(CheckCXXSymbolExists) + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_C_STANDARD 99) + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Debug) +endif() +if(NOT TARGET_ARCH) + if(CMAKE_GENERATOR_PLATFORM) + set(TARGET_ARCH ${CMAKE_GENERATOR_PLATFORM}) + else() + set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR}) + endif() +endif() + +message("Building for ${TARGET_ARCH}") +if (${TARGET_ARCH} MATCHES "i386|i686|x86|x86_64|x64|amd64|AMD64|win32|Win32") + set(IS_X86 TRUE) + if(${TARGET_ARCH} MATCHES "x86_64|x64|amd64|AMD64") + set(IS_X64 TRUE) + endif() +endif() +if (${TARGET_ARCH} MATCHES "arm|ARM|aarch64|arm64|ARM64") + set(IS_ARM TRUE) +endif() +if (${TARGET_ARCH} MATCHES "riscv64|rv64") + set(IS_RISCV64 TRUE) +endif() +if (${TARGET_ARCH} MATCHES "riscv32|rv32") + set(IS_RISCV32 TRUE) +endif() + +if(SKIP_AUX) + add_compile_definitions(PARPAR_SKIP_AUX_CHECK=1) +endif() + +set(HASHER_DIR ../../hasher) +set(SRC_DIR ../../src) +set(HASHER_C_SOURCES + ${HASHER_DIR}/crc_zeropad.c + ${HASHER_DIR}/md5-final.c +) + +set(HASHER_CPP_SOURCES + ${HASHER_DIR}/hasher.cpp + ${HASHER_DIR}/hasher_armcrc.cpp + ${HASHER_DIR}/hasher_avx2.cpp + ${HASHER_DIR}/hasher_avx512.cpp + ${HASHER_DIR}/hasher_avx512vl.cpp + ${HASHER_DIR}/hasher_bmi1.cpp + ${HASHER_DIR}/hasher_clmul.cpp + ${HASHER_DIR}/hasher_neon.cpp + ${HASHER_DIR}/hasher_neoncrc.cpp + ${HASHER_DIR}/hasher_scalar.cpp + ${HASHER_DIR}/hasher_sse.cpp + ${HASHER_DIR}/hasher_sve2.cpp + ${HASHER_DIR}/hasher_xop.cpp +) + +include_directories(${HASHER_DIR}) + +if(MSVC) + set(RELEASE_COMPILE_FLAGS /GS- /Gy /sdl- /Oy /Oi) 
+ set(RELEASE_LINK_FLAGS /OPT:REF /OPT:ICF) + add_compile_options(/W2 "$<$>:${RELEASE_COMPILE_FLAGS}>") + add_link_options("$<$>:${RELEASE_LINK_FLAGS}>") +else() + add_compile_options(-Wall -Wextra -Wno-unused-function) + if(${CMAKE_BUILD_TYPE} MATCHES "Debug") + add_compile_options(-ggdb) + else() + if(NOT ENABLE_SANITIZE) + add_compile_options(-fomit-frame-pointer) + endif() + endif() + + if(ENABLE_SANITIZE) + set(SANITIZE_OPTS -fsanitize=address -fsanitize=undefined) + add_compile_options(-fno-omit-frame-pointer ${SANITIZE_OPTS}) + add_link_options(${SANITIZE_OPTS}) + endif() +endif() + +add_compile_definitions(PARPAR_INVERT_SUPPORT=1) +add_library(hasher_c STATIC ${HASHER_C_SOURCES}) +add_library(hasher STATIC ${HASHER_CPP_SOURCES}) +target_link_libraries(hasher hasher_c) + +if(NOT MSVC) + if(ENABLE_SANITIZE) + target_compile_options(hasher PRIVATE -fno-exceptions) + else() + target_compile_options(hasher PRIVATE -fno-rtti -fno-exceptions) + endif() + target_compile_definitions(hasher_c PRIVATE _POSIX_C_SOURCE=200112L) + target_compile_definitions(hasher_c PRIVATE _DARWIN_C_SOURCE=) + target_compile_definitions(hasher_c PRIVATE _GNU_SOURCE=) +endif() + +if(MSVC) + if(IS_X86) + set_source_files_properties(${HASHER_DIR}/hasher_avx2.cpp PROPERTIES COMPILE_OPTIONS /arch:AVX2) + set_source_files_properties(${HASHER_DIR}/hasher_avx512.cpp PROPERTIES COMPILE_OPTIONS /arch:AVX512) + set_source_files_properties(${HASHER_DIR}/hasher_avx512vl.cpp PROPERTIES COMPILE_OPTIONS /arch:AVX512) + set_source_files_properties(${HASHER_DIR}/hasher_bmi1.cpp PROPERTIES COMPILE_OPTIONS /arch:AVX) + set_source_files_properties(${HASHER_DIR}/hasher_xop.cpp PROPERTIES COMPILE_OPTIONS /arch:AVX) + endif() +endif() +if(NOT MSVC OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + if(IS_X86) + set_source_files_properties(${HASHER_DIR}/hasher_avx2.cpp PROPERTIES COMPILE_OPTIONS -mavx2) + set_source_files_properties(${HASHER_DIR}/hasher_avx512.cpp PROPERTIES COMPILE_OPTIONS "-mavx512f") + 
set_source_files_properties(${HASHER_DIR}/hasher_avx512vl.cpp PROPERTIES COMPILE_OPTIONS "-mavx512vl;-mavx512bw;-mbmi2;-mpclmul") + set_source_files_properties(${HASHER_DIR}/hasher_bmi1.cpp PROPERTIES COMPILE_OPTIONS "-mpclmul;-mavx;-mbmi") + set_source_files_properties(${HASHER_DIR}/hasher_clmul.cpp PROPERTIES COMPILE_OPTIONS "-mpclmul;-msse4.1") + set_source_files_properties(${HASHER_DIR}/hasher_sse.cpp PROPERTIES COMPILE_OPTIONS -msse2) + set_source_files_properties(${HASHER_DIR}/hasher_xop.cpp PROPERTIES COMPILE_OPTIONS "-mxop;-mavx") + endif() + + if(IS_ARM AND NOT APPLE) # M1 Macs don't seem to need these ARM options + CHECK_CXX_COMPILER_FLAG("-mfpu=neon -march=armv7-a" COMPILER_SUPPORTS_ARM32_NEON) + if(COMPILER_SUPPORTS_ARM32_NEON) + set_source_files_properties(${HASHER_DIR}/hasher_neon.cpp PROPERTIES COMPILE_OPTIONS "-mfpu=neon;-march=armv7-a") + set_source_files_properties(${HASHER_DIR}/hasher_neoncrc.cpp PROPERTIES COMPILE_OPTIONS "-mfpu=neon;-march=armv8-a+crc") + set_source_files_properties(${HASHER_DIR}/hasher_armcrc.cpp PROPERTIES COMPILE_OPTIONS "-mfpu=fp-armv8;-march=armv8-a+crc") + else() + CHECK_CXX_COMPILER_FLAG("-march=armv8-a+crc" COMPILER_SUPPORTS_ARM_CRC) + if(COMPILER_SUPPORTS_ARM_CRC) + set_source_files_properties(${HASHER_DIR}/hasher_neoncrc.cpp PROPERTIES COMPILE_OPTIONS -march=armv8-a+crc) + set_source_files_properties(${HASHER_DIR}/hasher_armcrc.cpp PROPERTIES COMPILE_OPTIONS -march=armv8-a+crc) + endif() + endif() + CHECK_CXX_COMPILER_FLAG("-march=armv8-a+sve2" COMPILER_SUPPORTS_SVE2) + if(COMPILER_SUPPORTS_SVE2) + set_source_files_properties(${HASHER_DIR}/hasher_sve2.cpp PROPERTIES COMPILE_OPTIONS -march=armv8-a+sve2) + endif() + endif() +endif() + + + + +# binaries +set(TEST_DIR .) 
+add_executable(test ${TEST_DIR}/test.cpp) +target_link_libraries(test hasher) diff --git a/test/hasher/test.cpp b/test/hasher/test.cpp new file mode 100644 index 00000000..57203fd6 --- /dev/null +++ b/test/hasher/test.cpp @@ -0,0 +1,312 @@ +#include +#include +#include +#include +#include +#include +#include +#include "hasher.h" + +typedef char md5hash[16]; // add null byte for convenience + +typedef void(*MD5SingleUpdate_t)(uint32_t*, const void*, size_t); +typedef uint32_t(*CRC32_Calc_t)(const void*, size_t); +typedef uint32_t(*MD5CRC_Calc_t)(const void*, size_t, size_t, void*); + +uint32_t readUint32LE(uint8_t* p) { + return (*p) | (p[1] << 8) | (p[2] << 16) | (p[3] << 24); +} +void writeUint32LE(uint8_t* p, uint32_t v) { + p[0] = v & 0xff; + p[1] = (v >> 8) & 0xff; + p[2] = (v >> 16) & 0xff; + p[3] = (v >> 24) & 0xff; +} + +// TODO: test MD5Single::updateZero + +bool do_tests(IHasherInput* hasher, MD5SingleUpdate_t md5sgl, MD5CRC_Calc_t md5crcImpl, CRC32_Calc_t crc32impl) { + md5hash md5; + uint8_t md5crc[20]; + MD5Single md5hasher, md5extract; + if(md5sgl) md5hasher._update = md5sgl; + #define MD5_ACTION(act) if(hasher) hasher->act; if(md5sgl) md5hasher.act + #define DO_MD5CRC(data, zp) \ + if(md5crcImpl) { \ + uint32_t c = md5crcImpl(data, sizeof(data)-1, zp, md5crc); \ + writeUint32LE(md5crc+16, c); \ + } + #define ADD_DATA(data, zpMd5Crc) \ + MD5_ACTION(update(data, sizeof(data)-1)); \ + DO_MD5CRC(data, zpMd5Crc) + #define CHECK_BLOCK(zp, xMd5, xCrc, t) \ + if(md5crcImpl) { \ + if(memcmp(md5crc, xMd5, 16)) { printf("md5crc-md5 (" t "): "); return true; } \ + if(readUint32LE(md5crc+16) != xCrc) { printf("md5crc-crc (" t ") [%x <> %x]: ", readUint32LE(md5crc+16), xCrc); return true; } \ + } \ + if(hasher) { \ + hasher->getBlock(md5crc, zp); \ + if(memcmp(md5crc, xMd5, 16)) { printf("getBlock-md5 (" t "): "); return true; } \ + if(readUint32LE(md5crc+16) != xCrc) { printf("getBlock-crc (" t ") [%x <> %x]: ", readUint32LE(md5crc+16), xCrc); return true; } \ + 
} + #define CHECK_END(xMd5, t) \ + if(hasher) { \ + hasher->extractFileMD5(md5extract); \ + md5extract.end(md5); \ + if(memcmp(md5, xMd5, 16)) { printf("input-extract (" t "): "); return true; } \ + hasher->end(md5); \ + if(memcmp(md5, xMd5, 16)) { printf("input-end (" t "): "); return true; } \ + } \ + if(md5sgl) { \ + md5hasher.end(md5); \ + if(memcmp(md5, xMd5, 16)) { printf("single (" t "): "); return true; } \ + } + #define CHECK_CRC(data, xCrc, t) \ + if(crc32impl) { \ + if(crc32impl(data, sizeof(data)-1) != xCrc) { printf("crc (" t "): "); return true; } \ + } + // test blank + DO_MD5CRC("", 0) + CHECK_BLOCK(0, "\xd4\x1d\x8c\xd9\x8f\0\xb2\x04\xe9\x80\x09\x98\xec\xf8\x42\x7e", 0, "blank") + CHECK_END("\xd4\x1d\x8c\xd9\x8f\0\xb2\x04\xe9\x80\x09\x98\xec\xf8\x42\x7e", "blank") + CHECK_CRC("", 0, "blank") + + // zero padding tests + MD5_ACTION(reset()); + DO_MD5CRC("", 1) + CHECK_BLOCK(1, "\x93\xb8\x85\xad\xfe\x0d\xa0\x89\xcd\xf6\x34\x90\x4f\xd5\x9f\x71", 0xd202ef8d, "blank + 1 zero") + DO_MD5CRC("", 4) + CHECK_BLOCK(4, "\xf1\xd3\xff\x84\x43\x29\x77\x32\x86\x2d\xf2\x1d\xc4\xe5\x72\x62", 0x2144df1c, "blank + 4 zeroes") + DO_MD5CRC("", 55) + CHECK_BLOCK(55, "\xc9\xea\x33\x14\xb9\x1c\x9f\xd4\xe3\x8f\x94\x32\x06\x4f\xd1\xf2", 0x113bc241, "blank + 55 zeroes") + DO_MD5CRC("", 56) + CHECK_BLOCK(56, "\xe3\xc4\xdd\x21\xa9\x17\x1f\xd3\x9d\x20\x8e\xfa\x09\xbf\x78\x83", 0xd3c8a549, "blank + 56 zeroes") + DO_MD5CRC("", 57) + CHECK_BLOCK(57, "\xab\x9d\x8e\xf2\xff\xa9\x14\x5d\x6c\x32\x5c\xef\xa4\x1d\x5d\x4e", 0xddd1de1c, "blank + 57 zeroes") + DO_MD5CRC("", 63) + CHECK_BLOCK(63, "\x65\xce\xcf\xb9\x80\xd7\x2f\xde\x57\xd1\x75\xd6\xec\x1c\x3f\x64", 0xe8aadae4, "blank + 63 zeroes") + DO_MD5CRC("", 64) + CHECK_BLOCK(64, "\x3b\x5d\x3c\x7d\x20\x7e\x37\xdc\xee\xed\xd3\x01\xe3\x5e\x2e\x58", 0x758d6336, "blank + 64 zeroes") + DO_MD5CRC("", 65) + CHECK_BLOCK(65, "\x1e\xf5\xe8\x29\x30\x3a\x13\x9c\xe9\x67\x44\x0e\x0c\xdc\xa1\x0c", 0x1dcdf777, "blank + 65 zeroes") + DO_MD5CRC("", 128) + 
CHECK_BLOCK(128, "\xf0\x9f\x35\xa5\x63\x78\x39\x45\x8e\x46\x2e\x63\x50\xec\xbc\xe4", 0xc2a8fa9d, "blank + 128 zeroes") + + ADD_DATA("a", 0) + CHECK_BLOCK(0, "\x0c\xc1\x75\xb9\xc0\xf1\xb6\xa8\x31\xc3\x99\xe2\x69\x77\x26\x61", 0xe8b7be43, "single byte") + CHECK_END("\x0c\xc1\x75\xb9\xc0\xf1\xb6\xa8\x31\xc3\x99\xe2\x69\x77\x26\x61", "single byte") + CHECK_CRC("a", 0xe8b7be43, "single byte") + + MD5_ACTION(reset()); + MD5_ACTION(update("ab", 2)); + DO_MD5CRC("ab", 1) + CHECK_BLOCK(1, "\x5d\x36\xfe\x0e\x22\x1c\x3f\xd9\x7c\x6b\x87\xa4\x6c\x9f\xaf\x43", 0xe19f7120, "two bytes + 1 zero") + MD5_ACTION(update("cdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ012", 53)); + DO_MD5CRC("cdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ012", 3) + CHECK_BLOCK(3, "\x39\xf1\xb4\x77\xb1\x7a\x07\xb1\xa4\x73\xef\xe9\x2c\x28\xc1\x1f", 0x13c041ef, "53 bytes + 3 zeroes") + CHECK_END("\x3d\x37\x3b\x8c\xd6\xfd\x06\x9d\x31\x3c\xdc\x3f\x38\xa1\x89\x63", "55 bytes") + + MD5_ACTION(reset()); + ADD_DATA("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123", 7); + CHECK_BLOCK(7, "\x85\xea\x2f\x1f\xb8\x4a\x41\x48\x6b\xfe\xc6\x74\x69\x65\x7f\xae", 0x776f469a, "56 bytes + 7 zeroes") + CHECK_END("\xd4\x3e\x61\xe9\xb5\xf8\xc9\xd2\x2c\x4d\xc5\xdb\x6e\x6d\xf7\x75", "56 bytes") + + MD5_ACTION(reset()); + ADD_DATA("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-", 1); + CHECK_BLOCK(1, "\xe2\x11\x99\xfd\x5d\x1c\xc7\xe4\x20\xd5\xd2\xec\xd6\xa2\x62\xb3", 0xc65ef97b, "63 bytes + 1 zero") + DO_MD5CRC("", 0) + CHECK_BLOCK(0, "\xd4\x1d\x8c\xd9\x8f\0\xb2\x04\xe9\x80\x09\x98\xec\xf8\x42\x7e", 0, "2nd block blank") + CHECK_END("\xce\x3a\x13\xcb\x6c\x59\xe1\xda\xd8\xa1\x70\xec\xd5\x0f\x0c\xe8", "63 bytes") + + MD5_ACTION(reset()); + ADD_DATA("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_", 1); + CHECK_BLOCK(1, "\x70\x4f\x4c\x47\x80\xc9\x02\x56\x4a\x7b\xcc\xe6\x6a\x6d\x03\x3a", 0x2830585b, "64 bytes + 1 zero") + 
CHECK_END("\x2a\x37\x87\xf9\x92\x07\xe3\x6b\x2c\xb2\xc3\x40\x68\x92\xde\xf0", "64 bytes") + + MD5_ACTION(reset()); + ADD_DATA("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_=", 0); + CHECK_BLOCK(0, "\x77\xf8\x6b\xd2\x20\x76\xca\x4e\x99\x0f\xc7\xba\x77\x78\x11\x13", 0x7058144a, "65 bytes") + CHECK_END("\x77\xf8\x6b\xd2\x20\x76\xca\x4e\x99\x0f\xc7\xba\x77\x78\x11\x13", "65 bytes") + CHECK_CRC("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_=", 0x7058144a, "65 bytes") + + MD5_ACTION(reset()); + ADD_DATA("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-", 2); + CHECK_BLOCK(2, "\x93\xc9\x82\x8d\x41\x99\xd6\xb6\xfa\xee\x9b\xe5\xef\xfd\xd9\xee", 0x151319c0, "63 bytes + 2 zeroes") + MD5_ACTION(update("_a", 2)); + MD5_ACTION(update("b", 1)); + MD5_ACTION(update("cdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_", 62)); + CHECK_END("\x9b\x27\x94\x27\xd4\x81\xc9\xc9\xc7\x1d\x9a\xcb\x4f\xc9\xe9\x9a", "128 bytes") + + MD5_ACTION(reset()); + ADD_DATA("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-", 0); + CHECK_BLOCK(0, "\xce\x3a\x13\xcb\x6c\x59\xe1\xda\xd8\xa1\x70\xec\xd5\x0f\x0c\xe8", 0x5d4ab91c, "63 bytes") + CHECK_CRC("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-", 0x5d4ab91c, "63 bytes") + MD5_ACTION(update("_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_", 65)); + CHECK_END("\x9b\x27\x94\x27\xd4\x81\xc9\xc9\xc7\x1d\x9a\xcb\x4f\xc9\xe9\x9a", "128 bytes (2)") + + MD5_ACTION(reset()); + ADD_DATA("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_", 0); + CHECK_BLOCK(0, "\x9b\x27\x94\x27\xd4\x81\xc9\xc9\xc7\x1d\x9a\xcb\x4f\xc9\xe9\x9a", 0xcf479cf1, "128 bytes") + CHECK_CRC("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_", 0xcf479cf1, "128 bytes") + 
CHECK_END("\x9b\x27\x94\x27\xd4\x81\xc9\xc9\xc7\x1d\x9a\xcb\x4f\xc9\xe9\x9a", "128 bytes (single update)") + + // test block slipping case + MD5_ACTION(reset()); + ADD_DATA("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-", 0); + CHECK_BLOCK(0, "\xce\x3a\x13\xcb\x6c\x59\xe1\xda\xd8\xa1\x70\xec\xd5\x0f\x0c\xe8", 0x5d4ab91c, "63 bytes (1)") + ADD_DATA("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-", 0); + CHECK_BLOCK(0, "\xce\x3a\x13\xcb\x6c\x59\xe1\xda\xd8\xa1\x70\xec\xd5\x0f\x0c\xe8", 0x5d4ab91c, "63 bytes (2)") + CHECK_END("\xb7\x8f\x77\xf2\x49\xd1\x1b\xab\x5f\xcd\x04\xc3\x34\x85\xde\x56", "2x63 bytes") + + + // TODO: need more tests with mismatched block/file offsets + + // long tests + MD5_ACTION(reset()); + DO_MD5CRC("", 10000) + CHECK_BLOCK(10000, "\xb8\x5d\x6f\xb9\xef\x42\x60\xdc\xf1\xce\x0a\x1b\x0b\xff\x80\xd3", 0x4d3bca2e, "10000 zeroes") + // randomish mix + uint8_t stuff[8128]; // == (1+127)*63.5 + for(unsigned c=0; creset(); + hasher->update(src, len); + hasher->end(); + +#ifdef _MSC_VER + md5hash results[MAX_REGIONS]; +#else + md5hash results[regions]; +#endif + hasher->get(results); + for(int i=0; iget1(i, result); + if(memcmp(expected[i], result, 16)) { + return true; + } + if(memcmp(results[i], result, 16)) { + return true; + } + } + + // test multi-part update + if(len > 1) { + int firstChunk = len >= 64 ? 64-1 : 1; + const void* src2[MAX_REGIONS]; + for(int i=0; ireset(); + hasher->update(src, firstChunk); + hasher->update(src2, len - firstChunk); + hasher->end(); + hasher->get(results); + + for(int i=0; iget1(i, result); + if(memcmp(expected[i], result, 16)) { + return true; + } + if(memcmp(results[i], result, 16)) { + return true; + } + } + } + + return false; +} + + +int main(void) { + #define ERROR(s) { std::cout << s << std::endl; return 1; } + + std::cout << "Testing individual hashers..." 
<< std::endl; + auto singleHashers = hasherMD5CRC_availableMethods(true); + for(auto hId : singleHashers) { + set_hasherMD5CRC(hId); + std::cout << " " << md5crc_methodName(); + if(do_tests(nullptr, MD5Single::_update, MD5CRC_Calc, CRC32_Calc)) ERROR(" - FAILED"); + std::cout << std::endl; + } + + std::cout << "Testing input hashers..." << std::endl; + auto inputHashers = hasherInput_availableMethods(true); + for(auto hId : inputHashers) { + set_hasherInput(hId); + std::cout << " " << hasherInput_methodName(); + auto hasher = HasherInput_Create(); + if(do_tests(hasher, nullptr, nullptr, nullptr)) ERROR(" - FAILED"); + hasher->destroy(); + std::cout << std::endl; + } + + set_hasherInput(inputHashers[0]); + IHasherInput* hiScalar = HasherInput_Create(); + + srand(0x12345678); + // test multi-buffer + // (this assumes the input hasher works) + char data[MAX_REGIONS][128]; + const void* dataPtr[MAX_REGIONS]; + md5hash ref[MAX_REGIONS]; + for(int i=0; ireset(); + hiScalar->update(dataPtr[region], size); + hiScalar->end(ref[region]); + } + + auto hasher = new MD5Multi(numRegions); + + if(do_mb_tests(hasher, ref, dataPtr, size, numRegions)) + ERROR(" - FAILED: regions=" << numRegions << "; size=" << size); + delete hasher; + } + } + + hiScalar->destroy(); + + std::cout << "All tests passed" << std::endl; + return 0; +} From 64c684fa3a5546424d15cf15a7501b8fa1cf6507 Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 22 Aug 2023 16:49:35 +1000 Subject: [PATCH 64/91] Python2.7 fix for Windows builds --- .github/workflows/build-dev-win64.yml | 2 +- .github/workflows/build.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-dev-win64.yml b/.github/workflows/build-dev-win64.yml index f82cd12d..1fdeffe3 100644 --- a/.github/workflows/build-dev-win64.yml +++ b/.github/workflows/build-dev-win64.yml @@ -11,7 +11,7 @@ jobs: BUILD_LOGLEVEL: verbose steps: - uses: ilammy/setup-nasm@v1 - - uses: actions/setup-python@v4 + - uses: 
MatteoH2O1999/setup-python@v1 id: py with: python-version: '2.7' diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 84c55000..f17d667a 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -15,7 +15,7 @@ jobs: BUILD_LOGLEVEL: verbose steps: - uses: ilammy/setup-nasm@v1 - - uses: actions/setup-python@v4 + - uses: MatteoH2O1999/setup-python@v1 id: py with: python-version: '2.7' From cb3c15af3be625326fea10f7f4af237857e1b101 Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 22 Aug 2023 17:12:47 +1000 Subject: [PATCH 65/91] Test workflow typo fix --- .github/workflows/test.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 3bb6d5d1..eaa55095 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -27,9 +27,9 @@ jobs: - run: sde -icx -- test\gf16\build\${{ matrix.config }}\test.exe - run: sde -icx -- test\gf16\build\${{ matrix.config }}\test-pmul.exe - run: sde -icx -- test\gf16\build\${{ matrix.config }}\test-ctrl.exe -f - if: ${{ matrix.config == 'Release' && (matric.compiler == 'ClangCL' || matric.compiler == 'v143') }} + if: ${{ matrix.config == 'Release' && (matrix.compiler == 'ClangCL' || matrix.compiler == 'v143') }} - run: sde -icx -- test\gf16\build\${{ matrix.config }}\test-inv.exe -f - if: ${{ matrix.config == 'Release' && (matric.compiler == 'ClangCL' || matric.compiler == 'v143') }} + if: ${{ matrix.config == 'Release' && (matrix.compiler == 'ClangCL' || matrix.compiler == 'v143') }} - run: sde -icx -- test\hasher\build\${{ matrix.config }}\test.exe # test SSE2-only to see if CPUID checking works @@ -126,9 +126,9 @@ jobs: - run: ${{ matrix.t.emu }} test/gf16/build/test - run: ${{ matrix.t.emu }} test/gf16/build/test-pmul - run: ${{ matrix.t.emu }} test/gf16/build/test-ctrl -f - if: ${{ matrix.config == 'Release' && matric.cc_ver == '12' }} + if: ${{ matrix.config == 'Release' && matrix.cc_ver 
== '12' }} - run: ${{ matrix.t.emu }} test/gf16/build/test-inv -f - if: ${{ matrix.config == 'Release' && matric.cc_ver == '12' }} + if: ${{ matrix.config == 'Release' && matrix.cc_ver == '12' }} - run: ${{ matrix.t.emu }} test/hasher/build/test test-linux-clang: @@ -202,9 +202,9 @@ jobs: - run: ${{ matrix.t.emu }} test/gf16/build/test - run: ${{ matrix.t.emu }} test/gf16/build/test-pmul - run: ${{ matrix.t.emu }} test/gf16/build/test-ctrl -f - if: ${{ matrix.config == 'Release' && matric.cc_ver == '15' }} + if: ${{ matrix.config == 'Release' && matrix.cc_ver == '15' }} - run: ${{ matrix.t.emu }} test/gf16/build/test-inv -f - if: ${{ matrix.config == 'Release' && matric.cc_ver == '15' }} + if: ${{ matrix.config == 'Release' && matrix.cc_ver == '15' }} - run: ${{ matrix.t.emu }} test/hasher/build/test From 4732f5f9d08d72ed1b6e633583007321bf90e0ea Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 22 Aug 2023 17:49:33 +1000 Subject: [PATCH 66/91] Test workflow fix --- .github/workflows/test.yml | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index eaa55095..77c5d681 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -11,18 +11,18 @@ jobs: compiler: ['v141', 'v142', 'v143', 'ClangCL'] arch: ['Win32', 'x64'] name: Test VS ${{ matrix.compiler }} ${{ matrix.arch }} (${{ matrix.config }}) - runs-on: windows-latest + runs-on: windows-2022 steps: - uses: ilammy/setup-nasm@v1 - uses: petarpetrovt/setup-sde@v2.1 - uses: actions/checkout@v3 - run: | mkdir test\gf16\build - cmake -B test\gf16\build -S test\gf16 -G "Visual Studio 16 2019" -T ${{ matrix.compiler }} -A ${{ matrix.arch }} + cmake -B test\gf16\build -S test\gf16 -G "Visual Studio 17 2022" -T ${{ matrix.compiler }} -A ${{ matrix.arch }} cmake --build test\gf16\build --config ${{ matrix.config }} mkdir test\hasher\build - cmake -B test\hasher\build -S test\hasher -G "Visual Studio 16 2019" 
-T ${{ matrix.compiler }} -A ${{ matrix.arch }} + cmake -B test\hasher\build -S test\hasher -G "Visual Studio 17 2022" -T ${{ matrix.compiler }} -A ${{ matrix.arch }} cmake --build test\hasher\build --config ${{ matrix.config }} - run: sde -icx -- test\gf16\build\${{ matrix.config }}\test.exe - run: sde -icx -- test\gf16\build\${{ matrix.config }}\test-pmul.exe @@ -48,16 +48,16 @@ jobs: compiler: ['v142', 'v143', 'ClangCL'] arch: ['ARM', 'ARM64'] name: Test VS ${{ matrix.compiler }} ${{ matrix.arch }} - runs-on: windows-latest + runs-on: windows-2022 steps: - uses: actions/checkout@v3 - run: | mkdir test\gf16\build - cmake -B test\gf16\build -S test\gf16 -G "Visual Studio 16 2019" -T ${{ matrix.compiler }} -A ${{ matrix.arch }} + cmake -B test\gf16\build -S test\gf16 -G "Visual Studio 17 2022" -T ${{ matrix.compiler }} -A ${{ matrix.arch }} cmake --build test\gf16\build --config Debug mkdir test\hasher\build - cmake -B test\hasher\build -S test\hasher -G "Visual Studio 16 2019" -T ${{ matrix.compiler }} -A ${{ matrix.arch }} + cmake -B test\hasher\build -S test\hasher -G "Visual Studio 17 2022" -T ${{ matrix.compiler }} -A ${{ matrix.arch }} cmake --build test\hasher\build --config Debug # TODO: test mingw @@ -88,18 +88,18 @@ jobs: runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v3 - - run: apt update + - run: sudo apt update - uses: petarpetrovt/setup-sde@v2.1 if: ${{ matrix.t.arch == 'amd64' || matrix.t.arch == 'i386' }} - - run: apt install -y qemu-user-static + - run: sudo apt install -y qemu-user-static if: ${{ matrix.t.arch != 'amd64' && matrix.t.arch != 'i386' }} - run: | - apt install -y g++-${{ matrix.cc_ver }}-${{ matrix.t.target }} + sudo apt install -y g++-${{ matrix.cc_ver }}-${{ matrix.t.target }} echo "CC=${{ matrix.t.target }}-gcc-${{ matrix.cc_ver }}" >> $GITHUB_ENV echo "CXX=${{ matrix.t.target }}-g++-${{ matrix.cc_ver }}" >> $GITHUB_ENV if: ${{ matrix.t.arch != 'amd64' }} - run: | - apt install -y g++-${{ matrix.cc_ver }} + sudo apt install 
-y g++-${{ matrix.cc_ver }} echo "CC=gcc-${{ matrix.cc_ver }}" >> $GITHUB_ENV echo "CXX=g++-${{ matrix.cc_ver }}" >> $GITHUB_ENV if: ${{ matrix.t.arch == 'amd64' }} @@ -156,12 +156,12 @@ jobs: runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v3 - - run: apt update && apt install -y clang-${{ matrix.cc_ver }} + - run: sudo apt update && sudo apt install -y clang-${{ matrix.cc_ver }} - uses: petarpetrovt/setup-sde@v2.1 if: ${{ matrix.t.arch == 'amd64' || matrix.t.arch == 'i386' }} - - run: apt install -y qemu-user-static + - run: sudo apt install -y qemu-user-static if: ${{ matrix.t.arch != 'amd64' && matrix.t.arch != 'i386' }} - - run: apt install -y binutils-${{ matrix.t.target }} libgcc-12-dev-${{ matrix.t.libc }}-cross libstdc++-12-dev-${{ matrix.t.libc }}-cross + - run: sudo apt install -y binutils-${{ matrix.t.target }} libgcc-12-dev-${{ matrix.t.libc }}-cross libstdc++-12-dev-${{ matrix.t.libc }}-cross if: ${{ matrix.t.arch != 'amd64' }} - run: echo "SANITIZE=-DENABLE_SANITIZE=1" >> $GITHUB_ENV if: ${{ matrix.config == 'Release' && matrix.t.arch == 'amd64' }} From 4145211c99e50261566add0c431c50df37e919a1 Mon Sep 17 00:00:00 2001 From: animetosho Date: Wed, 23 Aug 2023 23:41:27 +1000 Subject: [PATCH 67/91] Test workflow fix --- .github/workflows/test.yml | 44 ++++++++++++++++++++++---------------- test/gf16/test-inv.cpp | 3 ++- 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 77c5d681..358a48ff 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -8,7 +8,7 @@ jobs: fail-fast: false matrix: config: [Debug, Release] - compiler: ['v141', 'v142', 'v143', 'ClangCL'] + compiler: ['v141', 'v143', 'ClangCL'] arch: ['Win32', 'x64'] name: Test VS ${{ matrix.compiler }} ${{ matrix.arch }} (${{ matrix.config }}) runs-on: windows-2022 @@ -24,19 +24,19 @@ jobs: mkdir test\hasher\build cmake -B test\hasher\build -S test\hasher -G "Visual Studio 17 2022" -T ${{ 
matrix.compiler }} -A ${{ matrix.arch }} cmake --build test\hasher\build --config ${{ matrix.config }} - - run: sde -icx -- test\gf16\build\${{ matrix.config }}\test.exe - - run: sde -icx -- test\gf16\build\${{ matrix.config }}\test-pmul.exe - - run: sde -icx -- test\gf16\build\${{ matrix.config }}\test-ctrl.exe -f + - run: $SDE_PATH\sde -icx -- test\gf16\build\${{ matrix.config }}\test.exe + - run: $SDE_PATH\sde -icx -- test\gf16\build\${{ matrix.config }}\test-pmul.exe + - run: $SDE_PATH\sde -icx -- test\gf16\build\${{ matrix.config }}\test-ctrl.exe -f if: ${{ matrix.config == 'Release' && (matrix.compiler == 'ClangCL' || matrix.compiler == 'v143') }} - - run: sde -icx -- test\gf16\build\${{ matrix.config }}\test-inv.exe -f + - run: $SDE_PATH\sde -icx -- test\gf16\build\${{ matrix.config }}\test-inv.exe -f if: ${{ matrix.config == 'Release' && (matrix.compiler == 'ClangCL' || matrix.compiler == 'v143') }} - - run: sde -icx -- test\hasher\build\${{ matrix.config }}\test.exe + - run: $SDE_PATH\sde -icx -- test\hasher\build\${{ matrix.config }}\test.exe # test SSE2-only to see if CPUID checking works - run: | - sde -p4 -- test\gf16\build\${{ matrix.config }}\test.exe - sde -p4 -- test\gf16\build\${{ matrix.config }}\test-pmul.exe - sde -p4 -- test\hasher\build\${{ matrix.config }}\test.exe + $SDE_PATH\sde -p4 -- test\gf16\build\${{ matrix.config }}\test.exe + $SDE_PATH\sde -p4 -- test\gf16\build\${{ matrix.config }}\test-pmul.exe + $SDE_PATH\sde -p4 -- test\hasher\build\${{ matrix.config }}\test.exe if: ${{ matrix.config == 'Release' && matrix.arch == 'x64' && matrix.compiler == 'ClangCL' }} @@ -47,6 +47,9 @@ jobs: matrix: compiler: ['v142', 'v143', 'ClangCL'] arch: ['ARM', 'ARM64'] + exclude: + - compiler: ClangCL + arch: ARM name: Test VS ${{ matrix.compiler }} ${{ matrix.arch }} runs-on: windows-2022 steps: @@ -74,8 +77,8 @@ jobs: cc_ver: ['9','12'] t: # qemu x86 doesn't support AVX, so we use Intel SDE instead - - {arch: 'i386', target: 'i686-linux-gnu', libc: 
'i386', emu: '$SDE_PATH/sde -icl --'} - - {arch: 'amd64', target: 'x86-64-linux-gnu', libc: 'amd64', emu: '$SDE_PATH/sde64 -icl --'} + - {arch: 'i386', target: 'i686-linux-gnu', libc: 'i386', emu: '$SDE_PATH/sde -icx --'} + - {arch: 'amd64', target: 'x86-64-linux-gnu', libc: 'amd64', emu: '$SDE_PATH/sde64 -icx --'} #- {arch: 'amd64', target: 'x86-64-linux-gnux32', libc: 'x32', emu: 'qemu-x86_64-static -cpu max'} # TODO: how to test x32? - {arch: 'aarch64', target: 'aarch64-linux-gnu', libc: 'arm64', emu: 'qemu-aarch64-static -L /usr/aarch64-linux-gnu -cpu max,sve-max-vq=4'} @@ -139,8 +142,8 @@ jobs: # Clang 6 available in 20.04 cc_ver: ['11','15'] t: - - {arch: 'i386', target: 'i686-linux-gnu', cl_target: 'x86-linux-gnu', libc: 'i386', emu: '$SDE_PATH/sde -icl --'} - - {arch: 'amd64', target: 'x86-64-linux-gnu', cl_target: 'x86_64-linux-gnu', libc: 'amd64', emu: '$SDE_PATH/sde64 -icl --'} + - {arch: 'i386', target: 'i686-linux-gnu', cl_target: 'i386-linux-gnu', libc: 'i386', emu: '$SDE_PATH/sde -icx --'} + - {arch: 'amd64', target: 'x86-64-linux-gnu', cl_target: 'x86_64-linux-gnu', libc: 'amd64', emu: '$SDE_PATH/sde64 -icx --'} #- {arch: 'amd64', target: 'x86-64-linux-gnux32', cl_target: 'x86-64-linux-gnux32', libc: 'x32', emu: 'qemu- -cpu max'} # TODO: how to test x32? 
- {arch: 'aarch64', target: 'aarch64-linux-gnu', cl_target: 'aarch64-linux-gnu', libc: 'arm64', emu: 'qemu-aarch64-static -L /usr/aarch64-linux-gnu -cpu max,sve-max-vq=4'} @@ -166,6 +169,9 @@ jobs: - run: echo "SANITIZE=-DENABLE_SANITIZE=1" >> $GITHUB_ENV if: ${{ matrix.config == 'Release' && matrix.t.arch == 'amd64' }} - run: | + if [ '${{ matrix.t.arch }}' != 'amd64' ]; then + LINKER_FLAG=-DCMAKE_EXE_LINKER_FLAGS=-fuse-ld=/usr/bin/${{ matrix.t.target }}-ld + fi mkdir test/gf16/build cmake -Btest/gf16/build -Stest/gf16 -DSKIP_AUX=1 -DCMAKE_BUILD_TYPE=${{ matrix.config }} $SANITIZE \ -DCMAKE_C_COMPILER=clang-${{ matrix.cc_ver }} \ @@ -178,9 +184,9 @@ jobs: -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \ -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \ -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=ONLY \ - -DCMAKE_C_STANDARD_INCLUDE_DIRECTORIES=/usr/${{ matrix.t.target }}/include \ - -DCMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES=/usr/${{ matrix.t.target }}/include \ - -DCMAKE_EXE_LINKER_FLAGS=-fuse-ld=/usr/bin/${{ matrix.t.target }}-ld + -DCMAKE_C_STANDARD_INCLUDE_DIRECTORIES="/usr/${{ matrix.t.target }}/include;`ls -d /usr/${{ matrix.t.target }}/include/c++/*|head -n1`/${{ matrix.t.target }}" \ + -DCMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES="/usr/${{ matrix.t.target }}/include;`ls -d /usr/${{ matrix.t.target }}/include/c++/*|head -n1`/${{ matrix.t.target }}" \ + $LINKER_FLAG cmake --build test/gf16/build mkdir test/hasher/build @@ -195,9 +201,9 @@ jobs: -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \ -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \ -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=ONLY \ - -DCMAKE_C_STANDARD_INCLUDE_DIRECTORIES=/usr/${{ matrix.t.target }}/include \ - -DCMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES=/usr/${{ matrix.t.target }}/include \ - -DCMAKE_EXE_LINKER_FLAGS=-fuse-ld=/usr/bin/${{ matrix.t.target }}-ld + -DCMAKE_C_STANDARD_INCLUDE_DIRECTORIES="/usr/${{ matrix.t.target }}/include;`ls -d /usr/${{ matrix.t.target }}/include/c++/*|head -n1`/${{ matrix.t.target }}" \ + 
-DCMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES="/usr/${{ matrix.t.target }}/include;`ls -d /usr/${{ matrix.t.target }}/include/c++/*|head -n1`/${{ matrix.t.target }}" \ + $LINKER_FLAG cmake --build test/hasher/build - run: ${{ matrix.t.emu }} test/gf16/build/test - run: ${{ matrix.t.emu }} test/gf16/build/test-pmul diff --git a/test/gf16/test-inv.cpp b/test/gf16/test-inv.cpp index a42c2c27..58e3fef5 100644 --- a/test/gf16/test-inv.cpp +++ b/test/gf16/test-inv.cpp @@ -21,11 +21,12 @@ static void compare_invert(const Galois16RecMatrix& mat, Galois16* leftmatrix, s if(recovery.size() != invalidCount) abort(); // compare - for(unsigned outRow = 0; outRow < invalidCount; outRow++) + for(unsigned outRow = 0; outRow < invalidCount; outRow++) { for(unsigned inCol = 0; inCol < inputValid.size(); inCol++) { if(leftmatrix[outRow * inputValid.size() + inCol] != mat.GetFactor(inCol, outRow)) abort(); } + } } static void do_test(std::vector inputValid, std::vector recovery, Galois16Methods method) { From d0b72690c76d9e10843f5503c998afcd3a1271d0 Mon Sep 17 00:00:00 2001 From: animetosho Date: Thu, 24 Aug 2023 11:15:54 +1000 Subject: [PATCH 68/91] Disable SVE on Clang<12 Produces suspect code --- src/platform.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/platform.h b/src/platform.h index 3ee0d37d..d4e8ebf2 100644 --- a/src/platform.h +++ b/src/platform.h @@ -203,6 +203,16 @@ HEDLEY_WARNING("GFNI disabled on GCC < 10 due to incorrect GF2P8AFFINEQB operand # endif #endif +#if defined(__ARM_FEATURE_SVE) && defined(__clang__) && __clang_major__<12 +// Clang < 12 has issues with SVE +# ifdef __ARM_FEATURE_SVE +# undef __ARM_FEATURE_SVE +# endif +# ifdef __ARM_FEATURE_SVE2 +# undef __ARM_FEATURE_SVE2 +# endif +#endif + #if defined(__riscv_vector) && defined(HEDLEY_GCC_VERSION) && !HEDLEY_GCC_VERSION_CHECK(13,0,0) // GCC added RVV intrinsics in GCC13 # undef __riscv_vector From 0d852f8eb2c5e23dca11aae5ae9d5a323e81192b Mon Sep 17 00:00:00 2001 From: animetosho Date: Thu, 
24 Aug 2023 15:57:58 +1000 Subject: [PATCH 69/91] Try to fix 'out of registers' errors for MD5 ASM --- hasher/md5x2-sse.h | 2 +- hasher/md5x2-x86-asm.h | 73 ++++++++++++++++++++++++------------------ 2 files changed, 42 insertions(+), 33 deletions(-) diff --git a/hasher/md5x2-sse.h b/hasher/md5x2-sse.h index c77e4ac1..c860262f 100644 --- a/hasher/md5x2-sse.h +++ b/hasher/md5x2-sse.h @@ -1,5 +1,5 @@ -#if defined(__GNUC__) || defined(__clang__) +#if (defined(__GNUC__) || defined(__clang__)) && defined(__OPTIMIZE__) # define MD5_USE_ASM # include "md5x2-sse-asm.h" #endif diff --git a/hasher/md5x2-x86-asm.h b/hasher/md5x2-x86-asm.h index 325938a7..d6c61926 100644 --- a/hasher/md5x2-x86-asm.h +++ b/hasher/md5x2-x86-asm.h @@ -155,45 +155,52 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_x2_scalar(uint32_t* state, co #else # define ASM_PARAMS_ONES #endif -#define ASM_PARAMS(i0, i1) \ +#define ASM_PARAMS(in) \ [A1]"+&r"(A1), [B1]"+&r"(B1), [C1]"+&r"(C1), [D1]"+&r"(D1), \ [A2]"+&r"(A2), [B2]"+&r"(B2), [C2]"+&r"(C2), [D2]"+&r"(D2), \ [TMP1]"=&r"(tmp1), [TMP2]"=&r"(tmp2) \ -: [i0_0]"m"(_data[0][i0]), [i0_1]"m"(_data[0][i1]), \ - [i1_0]"m"(_data[1][i0]), [i1_1]"m"(_data[1][i1]) ASM_PARAMS_ONES \ +: [i0]"m"(_data[0][in]), [i1]"m"(_data[1][in]) ASM_PARAMS_ONES \ : #define RF4(i0, i1, i2, i3, k0, k1, k2, k3) __asm__( \ - ROUND_F(A1, B1, C1, D1, A2, B2, C2, D2, "%[i0_0]", "%[i1_0]", k0, 7) \ - ROUND_F(D1, A1, B1, C1, D2, A2, B2, C2, "%[i0_1]", "%[i1_1]", k1, 12) \ -: ASM_PARAMS(i0, i1)); __asm__( \ - ROUND_F(C1, D1, A1, B1, C2, D2, A2, B2, "%[i0_0]", "%[i1_0]", k2, 17) \ - ROUND_F(B1, C1, D1, A1, B2, C2, D2, A2, "%[i0_1]", "%[i1_1]", k3, 22) \ -: ASM_PARAMS(i2, i3)); + ROUND_F(A1, B1, C1, D1, A2, B2, C2, D2, "%[i0]", "%[i1]", k0, 7) \ +: ASM_PARAMS(i0)); __asm__( \ + ROUND_F(D1, A1, B1, C1, D2, A2, B2, C2, "%[i0]", "%[i1]", k1, 12) \ +: ASM_PARAMS(i1)); __asm__( \ + ROUND_F(C1, D1, A1, B1, C2, D2, A2, B2, "%[i0]", "%[i1]", k2, 17) \ +: ASM_PARAMS(i2)); __asm__( \ + 
ROUND_F(B1, C1, D1, A1, B2, C2, D2, A2, "%[i0]", "%[i1]", k3, 22) \ +: ASM_PARAMS(i3)); #define RG4(i0, i1, i2, i3, k0, k1, k2, k3) __asm__( \ - ROUND_G(A1, B1, C1, D1, A2, B2, C2, D2, "%[i0_0]", "%[i1_0]", k0, 5) \ - ROUND_G(D1, A1, B1, C1, D2, A2, B2, C2, "%[i0_1]", "%[i1_1]", k1, 9) \ -: ASM_PARAMS(i0, i1)); __asm__( \ - ROUND_G(C1, D1, A1, B1, C2, D2, A2, B2, "%[i0_0]", "%[i1_0]", k2, 14) \ - ROUND_G(B1, C1, D1, A1, B2, C2, D2, A2, "%[i0_1]", "%[i1_1]", k3, 20) \ -: ASM_PARAMS(i2, i3)); + ROUND_G(A1, B1, C1, D1, A2, B2, C2, D2, "%[i0]", "%[i1]", k0, 5) \ +: ASM_PARAMS(i0)); __asm__( \ + ROUND_G(D1, A1, B1, C1, D2, A2, B2, C2, "%[i0]", "%[i1]", k1, 9) \ +: ASM_PARAMS(i1)); __asm__( \ + ROUND_G(C1, D1, A1, B1, C2, D2, A2, B2, "%[i0]", "%[i1]", k2, 14) \ +: ASM_PARAMS(i2)); __asm__( \ + ROUND_G(B1, C1, D1, A1, B2, C2, D2, A2, "%[i0]", "%[i1]", k3, 20) \ +: ASM_PARAMS(i3)); #define RH4(i0, i1, i2, i3, k0, k1, k2, k3) __asm__( \ - ROUND_H(A1, B1, C1, D1, A2, B2, C2, D2, "%[i0_0]", "%[i1_0]", k0, 4) \ - ROUND_H(D1, A1, B1, C1, D2, A2, B2, C2, "%[i0_1]", "%[i1_1]", k1, 11) \ -: ASM_PARAMS(i0, i1)); __asm__( \ - ROUND_H(C1, D1, A1, B1, C2, D2, A2, B2, "%[i0_0]", "%[i1_0]", k2, 16) \ - ROUND_H(B1, C1, D1, A1, B2, C2, D2, A2, "%[i0_1]", "%[i1_1]", k3, 23) \ -: ASM_PARAMS(i2, i3)); + ROUND_H(A1, B1, C1, D1, A2, B2, C2, D2, "%[i0]", "%[i1]", k0, 4) \ +: ASM_PARAMS(i0)); __asm__( \ + ROUND_H(D1, A1, B1, C1, D2, A2, B2, C2, "%[i0]", "%[i1]", k1, 11) \ +: ASM_PARAMS(i1)); __asm__( \ + ROUND_H(C1, D1, A1, B1, C2, D2, A2, B2, "%[i0]", "%[i1]", k2, 16) \ +: ASM_PARAMS(i2)); __asm__( \ + ROUND_H(B1, C1, D1, A1, B2, C2, D2, A2, "%[i0]", "%[i1]", k3, 23) \ +: ASM_PARAMS(i3)); #define RI4(i0, i1, i2, i3, k0, k1, k2, k3) __asm__( \ - ROUND_I(A1, B1, C1, D1, A2, B2, C2, D2, "%[i0_0]", "%[i1_0]", k0, 6) \ - ROUND_I(D1, A1, B1, C1, D2, A2, B2, C2, "%[i0_1]", "%[i1_1]", k1, 10) \ -: ASM_PARAMS(i0, i1)); __asm__( \ - ROUND_I(C1, D1, A1, B1, C2, D2, A2, B2, "%[i0_0]", "%[i1_0]", k2, 15) \ 
- ROUND_I(B1, C1, D1, A1, B2, C2, D2, A2, "%[i0_1]", "%[i1_1]", k3, 21) \ -: ASM_PARAMS(i2, i3)); + ROUND_I(A1, B1, C1, D1, A2, B2, C2, D2, "%[i0]", "%[i1]", k0, 6) \ +: ASM_PARAMS(i0)); __asm__( \ + ROUND_I(D1, A1, B1, C1, D2, A2, B2, C2, "%[i0]", "%[i1]", k1, 10) \ +: ASM_PARAMS(i1)); __asm__( \ + ROUND_I(C1, D1, A1, B1, C2, D2, A2, B2, "%[i0]", "%[i1]", k2, 15) \ +: ASM_PARAMS(i2)); __asm__( \ + ROUND_I(B1, C1, D1, A1, B2, C2, D2, A2, "%[i0]", "%[i1]", k3, 21) \ +: ASM_PARAMS(i3)); A1 += read32(_data[0]); A2 += read32(_data[1]); @@ -218,12 +225,14 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_x2_scalar(uint32_t* state, co RI4(15, 6, 13, 4, 0x6fa87e4f, -0x01d31920, -0x5cfebcec, 0x4e0811a1) __asm__( - ROUND_I(A1, B1, C1, D1, A2, B2, C2, D2, "%[i0_0]", "%[i1_0]", -0x08ac817e, 6) - ROUND_I(D1, A1, B1, C1, D2, A2, B2, C2, "%[i0_1]", "%[i1_1]", -0x42c50dcb, 10) - : ASM_PARAMS(11, 2)); __asm__( - ROUND_I(C1, D1, A1, B1, C2, D2, A2, B2, "%[i0_0]", "%[i1_0]", 0x2ad7d2bb, 15) + ROUND_I(A1, B1, C1, D1, A2, B2, C2, D2, "%[i0]", "%[i1]", -0x08ac817e, 6) + : ASM_PARAMS(11)); __asm__( + ROUND_I(D1, A1, B1, C1, D2, A2, B2, C2, "%[i0]", "%[i1]", -0x42c50dcb, 10) + : ASM_PARAMS(2)); __asm__( + ROUND_I(C1, D1, A1, B1, C2, D2, A2, B2, "%[i0]", "%[i1]", 0x2ad7d2bb, 15) + : ASM_PARAMS(9)); __asm__( ROUND_I_LAST(B1, C1, D1, A1, B2, C2, D2, A2, -0x14792c6f, 21) - : ASM_PARAMS(9, 0)); + : ASM_PARAMS(0)); state[0] += A1; state[1] += B1; state[2] += C1; From b4858c9c2e26d081670e08f56bc23a73b63da66d Mon Sep 17 00:00:00 2001 From: animetosho Date: Thu, 24 Aug 2023 16:13:57 +1000 Subject: [PATCH 70/91] Windows workflow fixes --- .github/workflows/build-dev-win64.yml | 2 +- .github/workflows/build.yml | 2 +- .github/workflows/test.yml | 16 ++++++++-------- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/build-dev-win64.yml b/.github/workflows/build-dev-win64.yml index 1fdeffe3..6e7a2a2a 100644 --- a/.github/workflows/build-dev-win64.yml +++ 
b/.github/workflows/build-dev-win64.yml @@ -20,7 +20,7 @@ jobs: - run: (cd nexe && npm install --production) - run: (cd nexe && node build) - run: nexe\parpar.exe --version - - run: nexe\parpar.exe -r1 -s1M -onexe\test.par2 nexe\parpar.exe + - run: nexe\parpar.exe -r1 -s1M -o nexe\test.par2 nexe\parpar.exe - run: move nexe\parpar.exe parpar.exe && 7z a -t7z -mx=9 parpar.7z parpar.exe - uses: actions/upload-artifact@v3 with: diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index f17d667a..2c5c36d5 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -29,7 +29,7 @@ jobs: - run: (cd nexe && npm install --production) - run: (cd nexe && node build) - run: nexe\parpar.exe --version - - run: nexe\parpar.exe -r1 -s1M -onexe\test.par2 nexe\parpar.exe + - run: nexe\parpar.exe -r1 -s1M -o nexe\test.par2 nexe\parpar.exe - run: move nexe\parpar.exe parpar.exe && 7z a -t7z -mx=9 parpar.7z parpar.exe - uses: actions/upload-release-asset@v1 env: diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 358a48ff..57b0bf61 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -24,19 +24,19 @@ jobs: mkdir test\hasher\build cmake -B test\hasher\build -S test\hasher -G "Visual Studio 17 2022" -T ${{ matrix.compiler }} -A ${{ matrix.arch }} cmake --build test\hasher\build --config ${{ matrix.config }} - - run: $SDE_PATH\sde -icx -- test\gf16\build\${{ matrix.config }}\test.exe - - run: $SDE_PATH\sde -icx -- test\gf16\build\${{ matrix.config }}\test-pmul.exe - - run: $SDE_PATH\sde -icx -- test\gf16\build\${{ matrix.config }}\test-ctrl.exe -f + - run: "%SDE_PATH%\sde -icx -- test\gf16\build\${{ matrix.config }}\test.exe" + - run: "%SDE_PATH%\sde -icx -- test\gf16\build\${{ matrix.config }}\test-pmul.exe" + - run: "%SDE_PATH%\sde -icx -- test\gf16\build\${{ matrix.config }}\test-ctrl.exe -f" if: ${{ matrix.config == 'Release' && (matrix.compiler == 'ClangCL' || matrix.compiler == 'v143') }} - - run: 
$SDE_PATH\sde -icx -- test\gf16\build\${{ matrix.config }}\test-inv.exe -f + - run: "%SDE_PATH%\sde -icx -- test\gf16\build\${{ matrix.config }}\test-inv.exe -f" if: ${{ matrix.config == 'Release' && (matrix.compiler == 'ClangCL' || matrix.compiler == 'v143') }} - - run: $SDE_PATH\sde -icx -- test\hasher\build\${{ matrix.config }}\test.exe + - run: "%SDE_PATH%\sde -icx -- test\hasher\build\${{ matrix.config }}\test.exe" # test SSE2-only to see if CPUID checking works - run: | - $SDE_PATH\sde -p4 -- test\gf16\build\${{ matrix.config }}\test.exe - $SDE_PATH\sde -p4 -- test\gf16\build\${{ matrix.config }}\test-pmul.exe - $SDE_PATH\sde -p4 -- test\hasher\build\${{ matrix.config }}\test.exe + %SDE_PATH%\sde -p4 -- test\gf16\build\${{ matrix.config }}\test.exe + %SDE_PATH%\sde -p4 -- test\gf16\build\${{ matrix.config }}\test-pmul.exe + %SDE_PATH%\sde -p4 -- test\hasher\build\${{ matrix.config }}\test.exe if: ${{ matrix.config == 'Release' && matrix.arch == 'x64' && matrix.compiler == 'ClangCL' }} From 0b8bc40bc0c61035324f8a3c31c7ff2dce9e4465 Mon Sep 17 00:00:00 2001 From: animetosho Date: Thu, 24 Aug 2023 17:18:15 +1000 Subject: [PATCH 71/91] Fix matrix inverse on big endian --- gf16/gf16mul.h | 7 ++++--- gf16/gfmat_inv.cpp | 6 +++--- gf16/gfmat_inv.h | 5 ++++- hasher/crc_arm.h | 18 ------------------ src/platform.h | 15 +++++++++++++++ test/gf16/test.cpp | 8 ++++++++ 6 files changed, 34 insertions(+), 25 deletions(-) diff --git a/gf16/gf16mul.h b/gf16/gf16mul.h index 5979d93a..c8b0e722 100644 --- a/gf16/gf16mul.h +++ b/gf16/gf16mul.h @@ -134,9 +134,10 @@ class Galois16Mul { static void _finish_none(void *HEDLEY_RESTRICT, size_t) {} static void _prepare_packed_none(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); static uint16_t _replace_word(void* data, size_t index, uint16_t newValue) { - uint16_t* p = (uint16_t*)data + index; - uint16_t oldValue = *p; - *p = 
newValue; + uint8_t* p = (uint8_t*)data + index*2; + uint16_t oldValue = p[0] | (p[1]<<8); + p[0] = newValue & 0xff; + p[1] = newValue>>8; return oldValue; } diff --git a/gf16/gfmat_inv.cpp b/gf16/gfmat_inv.cpp index 0697b788..ad01643f 100644 --- a/gf16/gfmat_inv.cpp +++ b/gf16/gfmat_inv.cpp @@ -467,7 +467,7 @@ void Galois16RecMatrix::Construct(const std::vector& inputValid, unsigned if(recovery.at(0) == 0) { // first recovery having exponent 0 is a common case for(unsigned stripe=0; stripe& inputValid, unsigned for(loopcond) { \ uint16_t exp = recovery.at(rec); \ for(unsigned i=0; i& inputValid, unsigned unsigned targetCol = inputValid.at(input) ? validCol++ : missingCol++; \ targetCol = (targetCol/sw16)*sw16*numRec + (targetCol%sw16); \ for(loopcond) { \ - mat[rec * sw16 + targetCol] = gfmat_coeff_from_log(inputLog, recovery.at(rec)); \ + mat[rec * sw16 + targetCol] = _LE16(gfmat_coeff_from_log(inputLog, recovery.at(rec))); \ } \ } \ assert(validCol == validCount) diff --git a/gf16/gfmat_inv.h b/gf16/gfmat_inv.h index 0e18a096..681b0216 100644 --- a/gf16/gfmat_inv.h +++ b/gf16/gfmat_inv.h @@ -6,8 +6,10 @@ #include "../src/stdint.h" #ifdef PARPAR_INVERT_SUPPORT +#include "../src/platform.h" const unsigned PP_INVERT_MAX_MULTI_ROWS = 6; // process up to 6 rows in a multi-mul call + class Galois16Mul; class Galois16RecMatrixWorker; struct Galois16RecMatrixComputeState; @@ -39,13 +41,14 @@ class Galois16RecMatrix { // TODO: check if numStripes==1? consider optimising division? unsigned sw = stripeWidth/sizeof(uint16_t); unsigned stripe = inIdx / sw; - return mat[stripe * numRec*sw + recIdx * sw + (inIdx % sw)]; + return _LE16(mat[stripe * numRec*sw + recIdx * sw + (inIdx % sw)]); } // these should only be queried after Compute has started (i.e. 
from the progressCb, or after it returns) /*Galois16Methods*/ int regionMethod; const char* getPointMulMethodName() const; }; + #endif #endif diff --git a/hasher/crc_arm.h b/hasher/crc_arm.h index 7c84ed7c..938e1a29 100644 --- a/hasher/crc_arm.h +++ b/hasher/crc_arm.h @@ -18,21 +18,6 @@ static HEDLEY_ALWAYS_INLINE void crc_init_arm(void* crc) { memset(crc, 0xff, sizeof(uint32_t)); } -#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ -# ifdef __GNUC__ -# define _LE16 __builtin_bswap16 -# define _LE32 __builtin_bswap32 -# define _LE64 __builtin_bswap64 -# else -// currently not supported -# error No endian swap intrinsic defined -# endif -#else -# define _LE16(x) (x) -# define _LE32(x) (x) -# define _LE64(x) (x) -#endif - static HEDLEY_ALWAYS_INLINE void crc_process_block_arm(void* HEDLEY_RESTRICT crc, const void* HEDLEY_RESTRICT src) { uint32_t* _crc = (uint32_t*)crc; #ifdef __aarch64__ @@ -74,6 +59,3 @@ static HEDLEY_ALWAYS_INLINE uint32_t crc_finish_arm(void* HEDLEY_RESTRICT state, return ~crc; } -#undef _LE16 -#undef _LE32 -#undef _LE64 diff --git a/src/platform.h b/src/platform.h index d4e8ebf2..0d57a3bf 100644 --- a/src/platform.h +++ b/src/platform.h @@ -44,6 +44,21 @@ # endif #endif +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +# ifdef __GNUC__ +# define _LE16 __builtin_bswap16 +# define _LE32 __builtin_bswap32 +# define _LE64 __builtin_bswap64 +# else +// currently not supported +# error No endian swap intrinsic defined +# endif +#else +# define _LE16(x) (x) +# define _LE32(x) (x) +# define _LE64(x) (x) +#endif + # ifdef _M_ARM64 #define __ARM_NEON 1 #define __aarch64__ 1 diff --git a/test/gf16/test.cpp b/test/gf16/test.cpp index e90067ac..244a0ad7 100644 --- a/test/gf16/test.cpp +++ b/test/gf16/test.cpp @@ -920,7 +920,15 @@ int main(int argc, char** argv) { memcpy(dst, src, regionSize); for(unsigned i=0; i>8) | ((w&0xff) <<8); + w = g.replace_word(dst, i, w); + w = (w>>8) | ((w&0xff) <<8); + tmp[i] = w; +#else tmp[i] = g.replace_word(dst, i, src2[i]); +#endif } 
if(g.needPrepare()) g.finish(dst, regionSize); From 8fd18a0ad602e889c78227d17327a656b77d0ac7 Mon Sep 17 00:00:00 2001 From: animetosho Date: Thu, 24 Aug 2023 17:19:04 +1000 Subject: [PATCH 72/91] Add error messages to test-inv failures --- test/gf16/test-inv.cpp | 36 +++++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/test/gf16/test-inv.cpp b/test/gf16/test-inv.cpp index 58e3fef5..f3b0d024 100644 --- a/test/gf16/test-inv.cpp +++ b/test/gf16/test-inv.cpp @@ -18,13 +18,20 @@ static void compare_invert(const Galois16RecMatrix& mat, Galois16* leftmatrix, s unsigned validCount = std::count(inputValid.begin(), inputValid.end(), true); unsigned invalidCount = inputValid.size()-validCount; - if(recovery.size() != invalidCount) abort(); + if(recovery.size() != invalidCount) { + std::cout << "Count mismatch: " << recovery.size() << "!=" << invalidCount << std::endl; + abort(); + } // compare for(unsigned outRow = 0; outRow < invalidCount; outRow++) { for(unsigned inCol = 0; inCol < inputValid.size(); inCol++) { - if(leftmatrix[outRow * inputValid.size() + inCol] != mat.GetFactor(inCol, outRow)) + auto expected = leftmatrix[outRow * inputValid.size() + inCol]; + auto actual = mat.GetFactor(inCol, outRow); + if(expected != actual) { + std::cout << "Value mismatch at " << outRow << "x" << inCol << ": " << expected << "!=" << actual << std::endl; abort(); + } } } } @@ -40,7 +47,10 @@ static void do_test(std::vector inputValid, std::vector recovery unsigned validCount = std::count(inputValid.begin(), inputValid.end(), true); Galois16RecMatrix mat; mat.regionMethod = (int)method; - if(mat.Compute(inputValid, validCount, recovery) != canInvert) abort(); + if(mat.Compute(inputValid, validCount, recovery) != canInvert) { + std::cout << "Inversion success mismatch" << std::endl; + abort(); + } if(canInvert) { compare_invert(mat, leftmatrix, inputValid, recovery); } @@ -102,9 +112,18 @@ int main(int argc, char** argv) { mat.regionMethod = 
(int)method; unsigned validCount = std::count(flawedInput.begin(), flawedInput.end(), true); - if(!mat.Compute(flawedInput, validCount, recovery)) abort(); - if(recovery.size() != 2) abort(); - if(!((recovery.at(0) == 0 || recovery.at(0) == 5) && recovery.at(1) == 6)) abort(); + if(!mat.Compute(flawedInput, validCount, recovery)) { + std::cout << "Failed to invert PAR2 flaw" << std::endl; + abort(); + } + if(recovery.size() != 2) { + std::cout << "Recovery size mismatch: 2 != " << recovery.size() << std::endl; + abort(); + } + if(!((recovery.at(0) == 0 || recovery.at(0) == 5) && recovery.at(1) == 6)) { + std::cout << "Recovery exponent incorrect" << std::endl; + abort(); + } Galois16* leftmatrix = nullptr; bool canInvert = p2c_invert(flawedInput, recovery, leftmatrix); @@ -170,7 +189,10 @@ int main(int argc, char** argv) { // do inversion Galois16RecMatrix mat; mat.regionMethod = (int)method; - if(mat.Compute(inputValid, iSize-invalidCount, recovery) != canInvert) abort(); + if(mat.Compute(inputValid, iSize-invalidCount, recovery) != canInvert) { + std::cout << "Inversion success mismatch" << std::endl; + abort(); + } if(canInvert) { compare_invert(mat, leftmatrix, inputValid, recovery); } From abd07c8a2f1ae337f088e9a7113facdba1434bd8 Mon Sep 17 00:00:00 2001 From: animetosho Date: Thu, 24 Aug 2023 17:39:10 +1000 Subject: [PATCH 73/91] Move some CLI logic into util file + add unclean exit detector Ref #51 --- bin/parpar.js | 42 +++++++++++++++------------------ cli/util.js | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 23 deletions(-) create mode 100644 cli/util.js diff --git a/bin/parpar.js b/bin/parpar.js index f134bf5b..ed3094ac 100755 --- a/bin/parpar.js +++ b/bin/parpar.js @@ -3,6 +3,7 @@ "use strict"; var ParPar = require('../lib/parpar.js'); +var cliUtil = require('../cli/util'); var cliFormat = process.stderr.isTTY ? 
function(code, msg) { return '\x1b[' + code + 'm' + msg + '\x1b[0m'; } : function(code, msg) { return msg; }; @@ -19,15 +20,6 @@ var print_json = function(type, obj) { }; var arg_parser = require('../lib/arg_parser.js'); -var friendlySize = function(s) { - var units = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB']; - for(var i=0; i 1024*1048576) { // par2j has 1GB slice size limit hard-coded; 32-bit version supports 1GB slices // some 32-bit applications seem to have issues with 1GB slices as well (phpar2 v1.4 win32 seems to have trouble with 854M slices, 848M works in the test I did) - process.stderr.write(cliFormat('33', 'Warning') + ': selected slice size (' + friendlySize(g.opts.sliceSize) + ') is larger than 1GB, which is beyond what a number of PAR2 clients support. Consider increasing the number of slices or reducing the slice size so that it is under 1GB\n'); + process.stderr.write(cliFormat('33', 'Warning') + ': selected slice size (' + cliUtil.friendlySize(g.opts.sliceSize) + ') is larger than 1GB, which is beyond what a number of PAR2 clients support. 
Consider increasing the number of slices or reducing the slice size so that it is under 1GB\n'); } else if(g.opts.sliceSize > 100*1000000 && g.totalSize <= 32768*100*1000000) { // we also check whether 100MB slices are viable by checking the input size - essentially there's a max of 32768 slices, so at 100MB, max size would be 3051.76GB - process.stderr.write(cliFormat('33', 'Warning') + ': selected slice size (' + friendlySize(g.opts.sliceSize) + ') may be too large to be compatible with QuickPar\n'); + process.stderr.write(cliFormat('33', 'Warning') + ': selected slice size (' + cliUtil.friendlySize(g.opts.sliceSize) + ') may be too large to be compatible with QuickPar\n'); } process.stderr.write('Input data : ' + sizeDisp(g.totalSize) + ' (' + pluralDisp(g.inputSlices, 'slice') + ' from ' + pluralDisp(info.length, 'file') + ')\n'); @@ -921,6 +906,17 @@ var inputFiles = argv._; else process.stderr.write('\nProcessing time : ' + cliFormat('1', timeTaken + ' s') + '\n'); } + + setTimeout(function() { + if(!argv.quiet) { + process.stderr.write('Process did not terminate cleanly'); + var handles = cliUtil.activeHandleCounts(); + if(handles) + process.stderr.write('; active handles: ' + cliUtil.activeHandlesStr(handles[0])); + process.stderr.write('\n'); + } + process.exit(); + }, 5000).unref(); }); }); diff --git a/cli/util.js b/cli/util.js new file mode 100644 index 00000000..37267312 --- /dev/null +++ b/cli/util.js @@ -0,0 +1,64 @@ +"use strict"; + +module.exports = { + decimalPoint: ('' + 1.1).replace(/1/g, ''), + + friendlySize: function(s) { + var units = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB']; + for(var i=0; i l) return s; + return module.exports.repeatChar((c || ' '), l-s.length) + s; + }, + rpad: function(s, l, c) { + if(s.length > l) return s; + return s + module.exports.repeatChar((c || ' '), l-s.length); + }, + activeHandleCounts: function() { + if(!process._getActiveHandles && !process.getActiveResourcesInfo) + return null; + var hTypes = {}; + 
var ah; + if(process._getActiveHandles) { // undocumented function, but seems to always work + ah = process._getActiveHandles().filter(function(h) { + // exclude stdout/stderr from count + return !h.constructor || h.constructor.name != 'WriteStream' || (h.fd != 1 && h.fd != 2); + }); + ah.forEach(function(h) { + var cn = (h.constructor ? h.constructor.name : 0) || 'unknown'; + if(cn in hTypes) + hTypes[cn]++; + else + hTypes[cn] = 1; + }); + } else { + process.getActiveResourcesInfo().forEach(function(h) { + if(h in hTypes) + hTypes[h]++; + else + hTypes[h] = 1; + }); + // TODO: is there any way to exclude stdout/stderr? + } + return [hTypes, ah]; + }, + activeHandlesStr: function(hTypes) { + var handleStr = ''; + for(var hn in hTypes) { + handleStr += ', ' + hn + (hTypes[hn] > 1 ? ' (' + hTypes[hn] + ')' : ''); + } + return handleStr.substring(2); + } + +}; From 87b969edb47e84b89453abcaaabb028dfb634684 Mon Sep 17 00:00:00 2001 From: animetosho Date: Thu, 24 Aug 2023 20:39:17 +1000 Subject: [PATCH 74/91] Test workflow fix --- .github/workflows/test.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 57b0bf61..904762d5 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -24,19 +24,19 @@ jobs: mkdir test\hasher\build cmake -B test\hasher\build -S test\hasher -G "Visual Studio 17 2022" -T ${{ matrix.compiler }} -A ${{ matrix.arch }} cmake --build test\hasher\build --config ${{ matrix.config }} - - run: "%SDE_PATH%\sde -icx -- test\gf16\build\${{ matrix.config }}\test.exe" - - run: "%SDE_PATH%\sde -icx -- test\gf16\build\${{ matrix.config }}\test-pmul.exe" - - run: "%SDE_PATH%\sde -icx -- test\gf16\build\${{ matrix.config }}\test-ctrl.exe -f" + - run: "%SDE_PATH%\\sde -icx -- test\\gf16\\build\\${{ matrix.config }}\\test.exe" + - run: "%SDE_PATH%\\sde -icx -- test\\gf16\\build\\${{ matrix.config }}\\test-pmul.exe" + - run: "%SDE_PATH%\\sde -icx -- 
test\\gf16\\build\\${{ matrix.config }}\\test-ctrl.exe -f" if: ${{ matrix.config == 'Release' && (matrix.compiler == 'ClangCL' || matrix.compiler == 'v143') }} - - run: "%SDE_PATH%\sde -icx -- test\gf16\build\${{ matrix.config }}\test-inv.exe -f" + - run: "%SDE_PATH%\\sde -icx -- test\\gf16\\build\\${{ matrix.config }}\\test-inv.exe -f" if: ${{ matrix.config == 'Release' && (matrix.compiler == 'ClangCL' || matrix.compiler == 'v143') }} - - run: "%SDE_PATH%\sde -icx -- test\hasher\build\${{ matrix.config }}\test.exe" + - run: "%SDE_PATH%\\sde -icx -- test\hasher\\build\\${{ matrix.config }}\\test.exe" # test SSE2-only to see if CPUID checking works - run: | - %SDE_PATH%\sde -p4 -- test\gf16\build\${{ matrix.config }}\test.exe - %SDE_PATH%\sde -p4 -- test\gf16\build\${{ matrix.config }}\test-pmul.exe - %SDE_PATH%\sde -p4 -- test\hasher\build\${{ matrix.config }}\test.exe + %SDE_PATH%\\sde -p4 -- test\\gf16\\build\\${{ matrix.config }}\\test.exe + %SDE_PATH%\\sde -p4 -- test\\gf16\\build\\${{ matrix.config }}\test-pmul.exe + %SDE_PATH%\\sde -p4 -- test\\hasher\\build\\${{ matrix.config }}\\test.exe if: ${{ matrix.config == 'Release' && matrix.arch == 'x64' && matrix.compiler == 'ClangCL' }} From a10998930b6ab77d24c1fc2a41f9f0d268449af0 Mon Sep 17 00:00:00 2001 From: animetosho Date: Thu, 24 Aug 2023 20:40:40 +1000 Subject: [PATCH 75/91] Test workflow fix --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 904762d5..cb302ea8 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -30,7 +30,7 @@ jobs: if: ${{ matrix.config == 'Release' && (matrix.compiler == 'ClangCL' || matrix.compiler == 'v143') }} - run: "%SDE_PATH%\\sde -icx -- test\\gf16\\build\\${{ matrix.config }}\\test-inv.exe -f" if: ${{ matrix.config == 'Release' && (matrix.compiler == 'ClangCL' || matrix.compiler == 'v143') }} - - run: "%SDE_PATH%\\sde -icx -- test\hasher\\build\\${{ 
matrix.config }}\\test.exe" + - run: "%SDE_PATH%\\sde -icx -- test\\hasher\\build\\${{ matrix.config }}\\test.exe" # test SSE2-only to see if CPUID checking works - run: | From dd000ca97abc3f72b96ad75ff31b20efc6e8a84c Mon Sep 17 00:00:00 2001 From: animetosho Date: Sat, 26 Aug 2023 20:45:01 +1000 Subject: [PATCH 76/91] Fix leak in par2cmdline inversion test --- test/gf16/p2c-inv/reedsolomon.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/gf16/p2c-inv/reedsolomon.cpp b/test/gf16/p2c-inv/reedsolomon.cpp index 17e41dec..3f86b287 100644 --- a/test/gf16/p2c-inv/reedsolomon.cpp +++ b/test/gf16/p2c-inv/reedsolomon.cpp @@ -135,9 +135,9 @@ bool ReedSolomon_Compute(const vector &present, vector output // SetInput u32 inputcount = (u32)present.size(); - u32* datapresentindex = new u32[inputcount]; - u32* datamissingindex = new u32[inputcount]; - Galois16::ValueType* database = new Galois16::ValueType[inputcount]; + vector datapresentindex(inputcount); + vector datamissingindex(inputcount); + vector database(inputcount); u32 datapresent = 0, datamissing = 0; unsigned int logbase = 0; From e391ad4319f096295974236bfe5c0b9b90e268cb Mon Sep 17 00:00:00 2001 From: animetosho Date: Sat, 26 Aug 2023 20:45:26 +1000 Subject: [PATCH 77/91] Fixes to test workflow --- .github/workflows/test.yml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index cb302ea8..b75387b3 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -24,19 +24,19 @@ jobs: mkdir test\hasher\build cmake -B test\hasher\build -S test\hasher -G "Visual Studio 17 2022" -T ${{ matrix.compiler }} -A ${{ matrix.arch }} cmake --build test\hasher\build --config ${{ matrix.config }} - - run: "%SDE_PATH%\\sde -icx -- test\\gf16\\build\\${{ matrix.config }}\\test.exe" - - run: "%SDE_PATH%\\sde -icx -- test\\gf16\\build\\${{ matrix.config }}\\test-pmul.exe" - - run: "%SDE_PATH%\\sde -icx 
-- test\\gf16\\build\\${{ matrix.config }}\\test-ctrl.exe -f" + - run: "$env:SDE_PATH\\sde -icx -- test\\gf16\\build\\${{ matrix.config }}\\test.exe" + - run: "$env:SDE_PATH\\sde -icx -- test\\gf16\\build\\${{ matrix.config }}\\test-pmul.exe" + - run: "$env:SDE_PATH\\sde -icx -- test\\gf16\\build\\${{ matrix.config }}\\test-ctrl.exe -f" if: ${{ matrix.config == 'Release' && (matrix.compiler == 'ClangCL' || matrix.compiler == 'v143') }} - - run: "%SDE_PATH%\\sde -icx -- test\\gf16\\build\\${{ matrix.config }}\\test-inv.exe -f" + - run: "$env:SDE_PATH\\sde -icx -- test\\gf16\\build\\${{ matrix.config }}\\test-inv.exe -f" if: ${{ matrix.config == 'Release' && (matrix.compiler == 'ClangCL' || matrix.compiler == 'v143') }} - - run: "%SDE_PATH%\\sde -icx -- test\\hasher\\build\\${{ matrix.config }}\\test.exe" + - run: "$env:SDE_PATH\\sde -icx -- test\\hasher\\build\\${{ matrix.config }}\\test.exe" # test SSE2-only to see if CPUID checking works - run: | - %SDE_PATH%\\sde -p4 -- test\\gf16\\build\\${{ matrix.config }}\\test.exe - %SDE_PATH%\\sde -p4 -- test\\gf16\\build\\${{ matrix.config }}\test-pmul.exe - %SDE_PATH%\\sde -p4 -- test\\hasher\\build\\${{ matrix.config }}\\test.exe + $env:SDE_PATH\sde -p4 -- test\gf16\build\${{ matrix.config }}\test.exe + $env:SDE_PATH\sde -p4 -- test\gf16\build\${{ matrix.config }}\test-pmul.exe + $env:SDE_PATH\sde -p4 -- test\hasher\build\${{ matrix.config }}\test.exe if: ${{ matrix.config == 'Release' && matrix.arch == 'x64' && matrix.compiler == 'ClangCL' }} @@ -167,7 +167,7 @@ jobs: - run: sudo apt install -y binutils-${{ matrix.t.target }} libgcc-12-dev-${{ matrix.t.libc }}-cross libstdc++-12-dev-${{ matrix.t.libc }}-cross if: ${{ matrix.t.arch != 'amd64' }} - run: echo "SANITIZE=-DENABLE_SANITIZE=1" >> $GITHUB_ENV - if: ${{ matrix.config == 'Release' && matrix.t.arch == 'amd64' }} + if: ${{ matrix.config == 'Release' && matrix.t.arch == 'amd64' && matrix.cc_ver == '15' }} # SDE+ASAN problematic with Clang 11 - run: | if [ '${{ 
matrix.t.arch }}' != 'amd64' ]; then LINKER_FLAG=-DCMAKE_EXE_LINKER_FLAGS=-fuse-ld=/usr/bin/${{ matrix.t.target }}-ld From 5901d7addfe7cf4c90a2747eab2aa43f09641e9d Mon Sep 17 00:00:00 2001 From: animetosho Date: Sun, 27 Aug 2023 16:51:12 +1000 Subject: [PATCH 78/91] Updates to build workflow --- .github/workflows/build.yml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 2c5c36d5..21444993 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -22,7 +22,7 @@ jobs: - uses: actions/checkout@v3 - name: Get release id: get_release - uses: bruceadams/get-release@v1.2.3 + uses: bruceadams/get-release@v1.3.2 env: GITHUB_TOKEN: ${{ github.token }} - run: npm install --production --ignore-scripts @@ -31,7 +31,7 @@ jobs: - run: nexe\parpar.exe --version - run: nexe\parpar.exe -r1 -s1M -o nexe\test.par2 nexe\parpar.exe - run: move nexe\parpar.exe parpar.exe && 7z a -t7z -mx=9 parpar.7z parpar.exe - - uses: actions/upload-release-asset@v1 + - uses: sekwah41/upload-release-assets@v1 env: GITHUB_TOKEN: ${{ github.token }} with: @@ -63,7 +63,7 @@ jobs: python-version: '3.9' # workaround "cannot import name 'Mapping' from 'collections'" error - name: Get release id: get_release - uses: bruceadams/get-release@v1.2.3 + uses: bruceadams/get-release@v1.3.2 env: GITHUB_TOKEN: ${{ github.token }} - run: | @@ -78,7 +78,7 @@ jobs: - run: nexe/parpar --version - run: nexe/parpar -r1 -s1M -onexe/test.par2 nexe/parpar - run: xz -9e --x86 --lzma2 nexe/parpar -c > parpar.xz - - uses: actions/upload-release-asset@v1 + - uses: sekwah41/upload-release-assets@v1 env: GITHUB_TOKEN: ${{ github.token }} with: @@ -113,7 +113,7 @@ jobs: arch: arm64 - name: Get release id: get_release - uses: bruceadams/get-release@v1.2.3 + uses: bruceadams/get-release@v1.3.2 env: GITHUB_TOKEN: ${{ github.token }} - run: npm install --production --ignore-scripts @@ -125,7 +125,7 @@ jobs: 
CC_host: cc CXX_host: c++ - run: xz -9e --lzma2 nexe/parpar -c > parpar.xz - - uses: actions/upload-release-asset@v1 + - uses: sekwah41/upload-release-assets@v1 env: GITHUB_TOKEN: ${{ github.token }} with: @@ -166,7 +166,7 @@ jobs: # packages: "libstdc++-$(c++ -dumpversion)-dev:i386 libc6-dev:i386" # - name: Get release # id: get_release -# uses: bruceadams/get-release@v1.2.3 +# uses: bruceadams/get-release@v1.3.2 # env: # GITHUB_TOKEN: ${{ github.token }} # - run: npm install --production --ignore-scripts @@ -178,7 +178,7 @@ jobs: # CC_host: cc # CXX_host: c++ # - run: xz -9e --arm --lzma2 nexe/parpar -c > parpar.xz -# - uses: actions/upload-release-asset@v1 +# - uses: sekwah41/upload-release-assets@v1 # env: # GITHUB_TOKEN: ${{ github.token }} # with: @@ -197,7 +197,7 @@ jobs: - uses: actions/checkout@v3 - name: Get release id: get_release - uses: bruceadams/get-release@v1.2.3 + uses: bruceadams/get-release@v1.3.2 env: GITHUB_TOKEN: ${{ github.token }} - run: npm install --production --ignore-scripts @@ -206,7 +206,7 @@ jobs: - run: nexe/parpar --version - run: nexe/parpar -r1 -s1M -onexe/test.par2 nexe/parpar - run: xz -9e --x86 --lzma2 nexe/parpar -c > parpar.xz - - uses: actions/upload-release-asset@v1 + - uses: sekwah41/upload-release-assets@v1 env: GITHUB_TOKEN: ${{ github.token }} with: From edcb353ba76f4913f662fa5f91ad9ee4ac03c6f1 Mon Sep 17 00:00:00 2001 From: animetosho Date: Mon, 28 Aug 2023 22:11:21 +1000 Subject: [PATCH 79/91] Add MSYS/POCL/FreeBSD tests --- .github/workflows/test.yml | 84 ++++++++++++++++++++++++++++++++++---- 1 file changed, 76 insertions(+), 8 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index b75387b3..83645a83 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -62,11 +62,47 @@ jobs: mkdir test\hasher\build cmake -B test\hasher\build -S test\hasher -G "Visual Studio 17 2022" -T ${{ matrix.compiler }} -A ${{ matrix.arch }} cmake --build test\hasher\build --config Debug - 
- # TODO: test mingw - # https://github.com/msys2/setup-msys2 - # https://www.msys2.org/docs/ci/ - # TODO: test libuv, OpenCL + + test-msys: + runs-on: windows-latest + strategy: + fail-fast: false + matrix: + sys: [mingw32, ucrt64, clang64] + compiler: + - {cc: gcc, cxx: g++} + - {cc: clang, cxx: clang++} + #- { sys: mingw32, env: i686 } + #- { sys: ucrt64, env: ucrt-x86_64 } + #- { sys: clang64, env: clang-x86_64 } + name: Test MSYS ${{matrix.sys}} ${{matrix.compiler.cc}} + defaults: + run: + shell: msys2 {0} + steps: + #- uses: petarpetrovt/setup-sde@v2.1 + - uses: msys2/setup-msys2@v2 + with: + msystem: ${{matrix.sys}} + #update: true + install: cmake ${{matrix.compiler.cc}} make git + - uses: actions/checkout@v3 + - run: | + mkdir test/gf16/build + cmake -B test/gf16/build -S test/gf16 -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=${{matrix.compiler.cc}} -DCMAKE_CXX_COMPILER=${{matrix.compiler.cxx}} + cmake --build test/gf16/build + + mkdir test/hasher/build + cmake -B test/hasher/build -S test/hasher -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=${{matrix.compiler.cc}} -DCMAKE_CXX_COMPILER=${{matrix.compiler.cxx}} + cmake --build test/hasher/build + # TODO: test under SDE (needs static linking, or calling SDE from within MSYS) + - run: test/gf16/build/test + - run: test/gf16/build/test-pmul + - run: test/gf16/build/test-ctrl -f + - run: test/gf16/build/test-inv -f + - run: test/hasher/build/test + + # TODO: test libuv test-linux-gcc: strategy: @@ -213,6 +249,17 @@ jobs: if: ${{ matrix.config == 'Release' && matrix.cc_ver == '15' }} - run: ${{ matrix.t.emu }} test/hasher/build/test + test-linux-pocl: + name: Test POCL (OpenCL) + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v3 + - run: sudo apt update && sudo apt install -y pocl-opencl-icd + - run: | + mkdir test/gf16/build + cmake -Btest/gf16/build -Stest/gf16 -DCMAKE_BUILD_TYPE=Release -DENABLE_OCL=1 + cmake --build test/gf16/build + - run: test/gf16/build/test-ctrl -pg test-mac-x86: strategy: @@ 
-244,6 +291,27 @@ jobs: # TODO: test building on Mac ARM64? might not be necessary, given we build it in par2cmdline-turbo - # TODO: BSD? - # https://github.com/marketplace/actions/freebsd-vm - # https://github.com/vmactions + test-fbsd-x86: + runs-on: macos-12 + name: Test FreeBSD amd64 + steps: + - uses: actions/checkout@v3 + - id: fbsd_test + uses: vmactions/freebsd-vm@v0 + with: + usesh: true + prepare: pkg install -y cmake lang/gcc gmake + run: | + mkdir test/gf16/build + cmake -Btest/gf16/build -Stest/gf16 -DCMAKE_BUILD_TYPE=Release + cmake --build test/gf16/build + + mkdir test/hasher/build + cmake -Btest/hasher/build -Stest/hasher -DCMAKE_BUILD_TYPE=Release + cmake --build test/hasher/build + + test/gf16/build/test + test/gf16/build/test-pmul + test/gf16/build/test-ctrl -f + test/gf16/build/test-inv -f + test/hasher/build/test From a54eb75fbe38442ee6c56881d296483c3494cd91 Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 29 Aug 2023 11:53:17 +1000 Subject: [PATCH 80/91] Fix possible leak in matrix inversion --- gf16/gfmat_inv.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/gf16/gfmat_inv.cpp b/gf16/gfmat_inv.cpp index ad01643f..0c85ed70 100644 --- a/gf16/gfmat_inv.cpp +++ b/gf16/gfmat_inv.cpp @@ -625,6 +625,9 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va if(rowGroupSize < rowMultiple*2) rowGroupSize = rowMultiple*2; if(rowGroupSize > numRec) rowGroupSize = numRec; + std::vector stateCoeff(rowGroupSize*rowGroupSize); + state.coeff = stateCoeff.data(); + invert_loop: { // loop, in the unlikely case we hit the PAR2 un-invertability flaw; TODO: is there a faster way than just retrying? if(numRec > recovery.size()) { // not enough recovery if(_numThreads <= 1) @@ -658,14 +661,12 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va } \ } // max out at 6 groups (registers + cache assoc?) 
- state.coeff = new uint16_t[rowGroupSize*rowGroupSize]; INVERT_GROUP(6) INVERT_GROUP(5) INVERT_GROUP(4) INVERT_GROUP(3) INVERT_GROUP(2) INVERT_GROUP(1) - delete[] state.coeff; #undef INVERT_GROUP // post transform From ff6e0af925dfa882c1d0e4294e25e070d099a2db Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 29 Aug 2023 12:23:11 +1000 Subject: [PATCH 81/91] Build/test fixes/tweaks --- .github/workflows/test.yml | 27 ++++++++++------------ binding.gyp | 10 +++++++- test/gf16/CMakeLists.txt | 47 +++++++++++++++++++++++++++++--------- test/hasher/CMakeLists.txt | 15 +++++++++++- 4 files changed, 71 insertions(+), 28 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 83645a83..da70268a 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -24,19 +24,19 @@ jobs: mkdir test\hasher\build cmake -B test\hasher\build -S test\hasher -G "Visual Studio 17 2022" -T ${{ matrix.compiler }} -A ${{ matrix.arch }} cmake --build test\hasher\build --config ${{ matrix.config }} - - run: "$env:SDE_PATH\\sde -icx -- test\\gf16\\build\\${{ matrix.config }}\\test.exe" - - run: "$env:SDE_PATH\\sde -icx -- test\\gf16\\build\\${{ matrix.config }}\\test-pmul.exe" - - run: "$env:SDE_PATH\\sde -icx -- test\\gf16\\build\\${{ matrix.config }}\\test-ctrl.exe -f" + - run: $env:SDE_PATH/sde -icx -- test\gf16\build\${{ matrix.config }}\test.exe + - run: $env:SDE_PATH/sde -icx -- test\gf16\build\${{ matrix.config }}\test-pmul.exe + - run: $env:SDE_PATH/sde -icx -- test\gf16\build\${{ matrix.config }}\test-ctrl.exe -f if: ${{ matrix.config == 'Release' && (matrix.compiler == 'ClangCL' || matrix.compiler == 'v143') }} - - run: "$env:SDE_PATH\\sde -icx -- test\\gf16\\build\\${{ matrix.config }}\\test-inv.exe -f" + - run: $env:SDE_PATH/sde -icx -- test\gf16\build\${{ matrix.config }}\test-inv.exe -f if: ${{ matrix.config == 'Release' && (matrix.compiler == 'ClangCL' || matrix.compiler == 'v143') }} - - run: "$env:SDE_PATH\\sde -icx -- 
test\\hasher\\build\\${{ matrix.config }}\\test.exe" + - run: $env:SDE_PATH/sde -icx -- test\hasher\build\${{ matrix.config }}\test.exe # test SSE2-only to see if CPUID checking works - run: | - $env:SDE_PATH\sde -p4 -- test\gf16\build\${{ matrix.config }}\test.exe - $env:SDE_PATH\sde -p4 -- test\gf16\build\${{ matrix.config }}\test-pmul.exe - $env:SDE_PATH\sde -p4 -- test\hasher\build\${{ matrix.config }}\test.exe + $env:SDE_PATH/sde -p4 -- test\gf16\build\${{ matrix.config }}\test.exe + $env:SDE_PATH/sde -p4 -- test\gf16\build\${{ matrix.config }}\test-pmul.exe + $env:SDE_PATH/sde -p4 -- test\hasher\build\${{ matrix.config }}\test.exe if: ${{ matrix.config == 'Release' && matrix.arch == 'x64' && matrix.compiler == 'ClangCL' }} @@ -70,11 +70,8 @@ jobs: matrix: sys: [mingw32, ucrt64, clang64] compiler: - - {cc: gcc, cxx: g++} - - {cc: clang, cxx: clang++} - #- { sys: mingw32, env: i686 } - #- { sys: ucrt64, env: ucrt-x86_64 } - #- { sys: clang64, env: clang-x86_64 } + - {cc: gcc, cxx: g++, cc_extra: ""} + - {cc: clang, cxx: clang++, cc_extra: gcc} name: Test MSYS ${{matrix.sys}} ${{matrix.compiler.cc}} defaults: run: @@ -85,7 +82,7 @@ jobs: with: msystem: ${{matrix.sys}} #update: true - install: cmake ${{matrix.compiler.cc}} make git + install: cmake ${{matrix.compiler.cc}} ${{matrix.compiler.cc_extra}} make git - uses: actions/checkout@v3 - run: | mkdir test/gf16/build @@ -299,7 +296,7 @@ jobs: - id: fbsd_test uses: vmactions/freebsd-vm@v0 with: - usesh: true + copyback: false prepare: pkg install -y cmake lang/gcc gmake run: | mkdir test/gf16/build diff --git a/binding.gyp b/binding.gyp index cfdf1265..84b47dac 100644 --- a/binding.gyp +++ b/binding.gyp @@ -19,9 +19,17 @@ } }] ] + }], + ['OS!="win"', { + "variables": {"missing_memalign%": "/dev/null || echo failed)"}, + "conditions": [ + ['missing_memalign!=""', { + "cflags_c": ["-D_POSIX_C_SOURCE=200112L"], + }] + ] }] ], - "cflags_c": ["-std=c99", "-D_POSIX_C_SOURCE=200112L", "-D_DARWIN_C_SOURCE", 
"-D_GNU_SOURCE"], + "cflags_c": ["-std=c99", "-D_DARWIN_C_SOURCE", "-D_GNU_SOURCE", "-D_DEFAULT_SOURCE"], "cxxflags": ["-std=c++11"], "msvs_settings": {"VCCLCompilerTool": {"Optimization": "MaxSpeed"}}, "configurations": {"Release": { diff --git a/test/gf16/CMakeLists.txt b/test/gf16/CMakeLists.txt index 37f7b578..49b716ae 100644 --- a/test/gf16/CMakeLists.txt +++ b/test/gf16/CMakeLists.txt @@ -116,11 +116,6 @@ set(GF16_CPP_SOURCES ${GF16_DIR}/controller_cpu.cpp ${GF16_DIR}/controller_ocl.cpp ${GF16_DIR}/controller_ocl_init.cpp - ${GF16_DIR}/gf16mul.cpp - - - ${GF16_DIR}/gf16pmul.cpp - ${GF16_DIR}/gfmat_inv.cpp ) include_directories(${GF16_DIR}/opencl-include ${GF16_DIR}) @@ -142,11 +137,23 @@ else() endif() if(ENABLE_SANITIZE) - set(SANITIZE_OPTS -fsanitize=address -fsanitize=bool,builtin,bounds,enum,float-cast-overflow,function,integer-divide-by-zero,nonnull-attribute,null,object-size,return,returns-nonnull-attribute,shift,signed-integer-overflow,unreachable,vla-bound) + set(SANITIZE_OPTS -fsanitize=address -fsanitize=bool,builtin,bounds,enum,float-cast-overflow,function,integer-divide-by-zero,nonnull-attribute,null,object-size,return,returns-nonnull-attribute,shift,signed-integer-overflow,unreachable,vla-bound -fno-sanitize-recover=all) # -fsanitize=pointer-overflow causes compilation of shuffle_avx512 to freeze on clang10 # -fsanitize=memory requires instrumented libraries, so not useful add_compile_options(-fno-omit-frame-pointer ${SANITIZE_OPTS}) add_link_options(${SANITIZE_OPTS}) + + #include(CheckLinkerFlag) + #check_linker_flag(C -static-libasan HAS_LIBASAN) # GCC + #check_linker_flag(C -static-libsan HAS_LIBSAN) # Clang + CHECK_CXX_COMPILER_FLAG(-static-libasan HAS_LIBASAN) + CHECK_CXX_COMPILER_FLAG(-static-libsan HAS_LIBSAN) + if(HAS_LIBASAN) + add_link_options(-static-libasan) + endif() + if(HAS_LIBSAN) + add_link_options(-static-libsan) + endif() endif() #if(ENABLE_OCL) @@ -158,20 +165,38 @@ endif() add_compile_definitions(PARPAR_INVERT_SUPPORT=1) 
add_library(gf16_c STATIC ${GF16_C_SOURCES}) +add_library(gf16_base STATIC ${GF16_DIR}/gf16mul.cpp) +add_library(gf16_pmul STATIC ${GF16_DIR}/gf16pmul.cpp) +add_library(gf16_inv STATIC ${GF16_DIR}/gfmat_inv.cpp) add_library(gf16_ctl STATIC ${GF16_CPP_SOURCES}) -target_link_libraries(gf16_ctl gf16_c) +target_link_libraries(gf16_base gf16_c) +target_link_libraries(gf16_pmul gf16_c) +target_link_libraries(gf16_inv gf16_base gf16_pmul) +target_link_libraries(gf16_ctl gf16_base) if(NOT MSVC) if(NOT ENABLE_SANITIZE) + target_compile_options(gf16_base PRIVATE -fno-rtti -fno-exceptions) + target_compile_options(gf16_pmul PRIVATE -fno-rtti -fno-exceptions) + target_compile_options(gf16_inv PRIVATE -fno-rtti -fno-exceptions) target_compile_options(gf16_ctl PRIVATE -fno-rtti) endif() - target_compile_definitions(gf16_c PRIVATE _POSIX_C_SOURCE=200112L) + + # posix_memalign may require _POSIX_C_SOURCE, but doing that on FreeBSD causes MAP_ANON* to disappear + # try to work around this by checking if posix_memalign exists without the define + include(CheckSymbolExists) + check_symbol_exists(posix_memalign "stdlib.h" HAVE_MEMALIGN) + if(NOT HAVE_MEMALIGN) + target_compile_definitions(gf16_c PRIVATE _POSIX_C_SOURCE=200112L) + endif() target_compile_definitions(gf16_c PRIVATE _DARWIN_C_SOURCE=) target_compile_definitions(gf16_c PRIVATE _GNU_SOURCE=) + target_compile_definitions(gf16_c PRIVATE _DEFAULT_SOURCE=) if(ENABLE_SANITIZE) # not supported on all platforms? #target_compile_options(gf16_ctl PRIVATE -fsanitize=thread) + #target_compile_options(gf16_inv PRIVATE -fsanitize=thread) endif() endif() @@ -291,13 +316,13 @@ endif() # binaries set(TEST_DIR .) 
add_executable(test ${TEST_DIR}/test.cpp) -target_link_libraries(test gf16_ctl) +target_link_libraries(test gf16_base) add_executable(test-ctrl ${TEST_DIR}/test-ctrl.cpp) target_link_libraries(test-ctrl gf16_ctl) add_executable(test-inv ${TEST_DIR}/test-inv.cpp ${TEST_DIR}/p2c-inv/reedsolomon.cpp) -target_link_libraries(test-inv gf16_ctl) +target_link_libraries(test-inv gf16_inv) add_executable(test-pmul ${TEST_DIR}/test-pmul.cpp) -target_link_libraries(test-pmul gf16_ctl) +target_link_libraries(test-pmul gf16_pmul) if(NOT MSVC) target_link_libraries(test-ctrl -pthread) diff --git a/test/hasher/CMakeLists.txt b/test/hasher/CMakeLists.txt index 507dd536..6c0efa82 100644 --- a/test/hasher/CMakeLists.txt +++ b/test/hasher/CMakeLists.txt @@ -83,9 +83,21 @@ else() endif() if(ENABLE_SANITIZE) - set(SANITIZE_OPTS -fsanitize=address -fsanitize=undefined) + set(SANITIZE_OPTS -fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all) add_compile_options(-fno-omit-frame-pointer ${SANITIZE_OPTS}) add_link_options(${SANITIZE_OPTS}) + + #include(CheckLinkerFlag) + #check_linker_flag(C -static-libasan HAS_LIBASAN) # GCC + #check_linker_flag(C -static-libsan HAS_LIBSAN) # Clang + CHECK_CXX_COMPILER_FLAG(-static-libasan HAS_LIBASAN) + CHECK_CXX_COMPILER_FLAG(-static-libsan HAS_LIBSAN) + if(HAS_LIBASAN) + add_link_options(-static-libasan) + endif() + if(HAS_LIBSAN) + add_link_options(-static-libsan) + endif() endif() endif() @@ -103,6 +115,7 @@ if(NOT MSVC) target_compile_definitions(hasher_c PRIVATE _POSIX_C_SOURCE=200112L) target_compile_definitions(hasher_c PRIVATE _DARWIN_C_SOURCE=) target_compile_definitions(hasher_c PRIVATE _GNU_SOURCE=) + target_compile_definitions(hasher_c PRIVATE _DEFAULT_SOURCE=) endif() if(MSVC) From e9b6d3344908c727651e7cb0c733a10d3f247cb7 Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 29 Aug 2023 12:31:32 +1000 Subject: [PATCH 82/91] Avoid CpuCap name conflict --- gf16/gf16mul.cpp | 33 ++++++++++++++------------------- hasher/hasher.cpp | 
22 +++++++++++----------- 2 files changed, 25 insertions(+), 30 deletions(-) diff --git a/gf16/gf16mul.cpp b/gf16/gf16mul.cpp index 142aba3d..34ad4ba3 100644 --- a/gf16/gf16mul.cpp +++ b/gf16/gf16mul.cpp @@ -13,21 +13,20 @@ extern "C" { } // CPUID stuff -#include "../src/platform.h" +#include "../src/cpuid.h" #ifdef PLATFORM_X86 -# include "../src/cpuid.h" # ifdef __APPLE__ # include # include # endif # include "x86_jit.h" -struct CpuCap { +struct GF16CpuCap { bool hasSSE2, hasSSSE3, hasAVX, hasAVX2, hasAVX512VLBW, hasAVX512VBMI, hasGFNI; size_t propPrefShuffleThresh; bool propFastJit, propHT; bool canMemWX, isEmulated; int jitOptStrat; - CpuCap(bool detect) : + GF16CpuCap(bool detect) : hasSSE2(true), hasSSSE3(true), hasAVX(true), @@ -196,14 +195,12 @@ struct CpuCap { }; #endif #ifdef PLATFORM_ARM -# include "../src/cpuid.h" - -struct CpuCap { +struct GF16CpuCap { bool hasNEON; bool hasSHA3; bool hasSVE; bool hasSVE2; - CpuCap(bool detect) : hasNEON(true), hasSVE(true), hasSVE2(true) { + GF16CpuCap(bool detect) : hasNEON(true), hasSVE(true), hasSVE2(true) { if(!detect) return; hasNEON = CPU_HAS_NEON; hasSHA3 = CPU_HAS_NEON_SHA3; @@ -220,11 +217,9 @@ struct CpuCap { }; #endif #ifdef __riscv -# include "../src/cpuid.h" - -struct CpuCap { +struct GF16CpuCap { bool hasVector; - CpuCap(bool detect) : hasVector(true) { + GF16CpuCap(bool detect) : hasVector(true) { if(!detect) return; hasVector = CPU_HAS_VECTOR && CPU_HAS_GC; } @@ -1117,7 +1112,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { case GF16_XOR_JIT_SSE2: case GF16_XOR_SSE2: { #ifdef PLATFORM_X86 - int jitOptStrat = CpuCap(true).jitOptStrat; + int jitOptStrat = GF16CpuCap(true).jitOptStrat; switch(method) { case GF16_XOR_JIT_SSE2: @@ -1359,7 +1354,7 @@ Galois16Methods Galois16Mul::default_method(size_t regionSizeHint, unsigned inpu (void)forInvert; #ifdef PLATFORM_X86 - const CpuCap caps(true); + const GF16CpuCap caps(true); if(caps.hasGFNI) { if(gf16_affine_available_avx512 && caps.hasAVX512VLBW) 
return GF16_AFFINE_AVX512; @@ -1397,7 +1392,7 @@ Galois16Methods Galois16Mul::default_method(size_t regionSizeHint, unsigned inpu return GF16_XOR_SSE2; #endif #ifdef PLATFORM_ARM - const CpuCap caps(true); + const GF16CpuCap caps(true); if(caps.hasSVE2) { if(gf16_sve_get_size() >= 64) return GF16_SHUFFLE_512_SVE2; @@ -1419,7 +1414,7 @@ Galois16Methods Galois16Mul::default_method(size_t regionSizeHint, unsigned inpu ? GF16_CLMUL_NEON : GF16_SHUFFLE_NEON; #endif #ifdef __riscv_ - const CpuCap caps(true); + const GF16CpuCap caps(true); if(caps.hasVector && gf16_available_rvv && gf16_rvv_get_size() >= 16) return GF16_SHUFFLE_128_RVV; #endif @@ -1437,7 +1432,7 @@ std::vector Galois16Mul::availableMethods(bool checkCpuid) { ret.push_back(GF16_LOOKUP3); #ifdef PLATFORM_X86 - const CpuCap caps(checkCpuid); + const GF16CpuCap caps(checkCpuid); if(gf16_shuffle_available_ssse3 && caps.hasSSSE3) ret.push_back(GF16_SHUFFLE_SSSE3); if(gf16_shuffle_available_avx && caps.hasAVX) @@ -1485,7 +1480,7 @@ std::vector Galois16Mul::availableMethods(bool checkCpuid) { } #endif #ifdef PLATFORM_ARM - const CpuCap caps(checkCpuid); + const GF16CpuCap caps(checkCpuid); if(gf16_available_neon && caps.hasNEON) { ret.push_back(GF16_SHUFFLE_NEON); ret.push_back(GF16_CLMUL_NEON); @@ -1505,7 +1500,7 @@ std::vector Galois16Mul::availableMethods(bool checkCpuid) { } #endif #ifdef __riscv - const CpuCap caps(checkCpuid); + const GF16CpuCap caps(checkCpuid); if(gf16_available_rvv && caps.hasVector && gf16_rvv_get_size() >= 16) ret.push_back(GF16_SHUFFLE_128_RVV); #endif diff --git a/hasher/hasher.cpp b/hasher/hasher.cpp index ce6e3f40..f3d23d59 100644 --- a/hasher/hasher.cpp +++ b/hasher/hasher.cpp @@ -8,11 +8,11 @@ uint32_t(*MD5CRC_Calc)(const void*, size_t, size_t, void*) = NULL; MD5CRCMethods MD5CRC_Method = MD5CRCMETH_SCALAR; uint32_t(*CRC32_Calc)(const void*, size_t) = NULL; MD5CRCMethods CRC32_Method = MD5CRCMETH_SCALAR; -struct CpuCap { +struct HasherCpuCap { #ifdef PLATFORM_X86 bool hasSSE2, 
hasClMul, hasXOP, hasBMI1, hasAVX2, hasAVX512F, hasAVX512VLBW; bool isSmallCore, isLEASlow, isVecRotSlow; - CpuCap(bool detect) : + HasherCpuCap(bool detect) : hasSSE2(true), hasClMul(true), hasXOP(true), hasBMI1(true), hasAVX2(true), hasAVX512F(true), hasAVX512VLBW(true), isSmallCore(false), isLEASlow(false), isVecRotSlow(false) { @@ -65,7 +65,7 @@ struct CpuCap { #endif #ifdef PLATFORM_ARM bool hasCRC, hasNEON, hasSVE2; - CpuCap(bool detect) : hasCRC(true), hasNEON(true), hasSVE2(true) { + HasherCpuCap(bool detect) : hasCRC(true), hasNEON(true), hasSVE2(true) { if(!detect) return; hasCRC = CPU_HAS_ARMCRC; hasNEON = CPU_HAS_NEON; @@ -83,7 +83,7 @@ void setup_hasher() { set_hasherMD5CRC(MD5CRCMETH_SCALAR); #ifdef PLATFORM_X86 - struct CpuCap caps(true); + struct HasherCpuCap caps(true); if(caps.hasAVX512VLBW && caps.hasClMul && !caps.isVecRotSlow && HasherInput_AVX512::isAvailable) set_hasherInput(INHASH_AVX512); @@ -111,7 +111,7 @@ void setup_hasher() { #endif #ifdef PLATFORM_ARM - struct CpuCap caps(true); + struct HasherCpuCap caps(true); if(caps.hasCRC && HasherInput_ARMCRC::isAvailable) // TODO: fast core only set_hasherInput(INHASH_CRC); @@ -527,7 +527,7 @@ std::vector hasherInput_availableMethods(bool checkCpuid) { ret.push_back(INHASH_SCALAR); #ifdef PLATFORM_X86 - const CpuCap caps(checkCpuid); + const HasherCpuCap caps(checkCpuid); if(caps.hasClMul) { if(caps.hasAVX512VLBW && HasherInput_AVX512::isAvailable) ret.push_back(INHASH_AVX512); @@ -542,7 +542,7 @@ std::vector hasherInput_availableMethods(bool checkCpuid) { ret.push_back(INHASH_SIMD); #endif #ifdef PLATFORM_ARM - const CpuCap caps(checkCpuid); + const HasherCpuCap caps(checkCpuid); if(caps.hasCRC && HasherInput_ARMCRC::isAvailable) ret.push_back(INHASH_CRC); if(caps.hasNEON && HasherInput_NEON::isAvailable) @@ -559,7 +559,7 @@ std::vector hasherMD5CRC_availableMethods(bool checkCpuid) { ret.push_back(MD5CRCMETH_SCALAR); #ifdef PLATFORM_X86 - const CpuCap caps(checkCpuid); + const HasherCpuCap 
caps(checkCpuid); if(caps.hasClMul) { if(caps.hasAVX512VLBW && MD5CRC_isAvailable_AVX512) ret.push_back(MD5CRCMETH_AVX512); @@ -572,7 +572,7 @@ std::vector hasherMD5CRC_availableMethods(bool checkCpuid) { } #endif #ifdef PLATFORM_ARM - const CpuCap caps(checkCpuid); + const HasherCpuCap caps(checkCpuid); if(caps.hasCRC && MD5CRC_isAvailable_ARMCRC) ret.push_back(MD5CRCMETH_ARMCRC); #endif @@ -585,7 +585,7 @@ std::vector hasherMD5Multi_availableMethods(bool checkCpuid) { ret.push_back(MD5MULT_SCALAR); #ifdef PLATFORM_X86 - const CpuCap caps(checkCpuid); + const HasherCpuCap caps(checkCpuid); if(caps.hasAVX512VLBW && MD5Multi_AVX512_256::isAvailable) ret.push_back(MD5MULT_AVX512VL); if(caps.hasAVX512F && MD5Multi_AVX512::isAvailable) @@ -598,7 +598,7 @@ std::vector hasherMD5Multi_availableMethods(bool checkCpuid) { ret.push_back(MD5MULT_SSE); #endif #ifdef PLATFORM_ARM - const CpuCap caps(checkCpuid); + const HasherCpuCap caps(checkCpuid); if(caps.hasSVE2 && MD5Multi_SVE2::isAvailable) ret.push_back(MD5MULT_SVE2); if(caps.hasNEON && MD5Multi_NEON::isAvailable) From 2757f9caf61c7a5a00c860e6d66aeaea460ad841 Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 29 Aug 2023 17:07:00 +1000 Subject: [PATCH 83/91] Fix UBSan warnings --- .github/workflows/test.yml | 2 +- gf16/gf16_affine_avx10.h | 16 ++++++++-------- gf16/gf16_muladd_multi.h | 15 +++++++++++---- gf16/gf16_shuffle_avx2.c | 14 +++++++------- gf16/gf16_xor_avx2.c | 16 ++++++++-------- gf16/gf16_xor_avx512.c | 16 ++++++++-------- gf16/gf16_xor_sse2.c | 26 +++++++++++++------------- hasher/md5mb-sse.h | 6 +++--- test/gf16/CMakeLists.txt | 3 +-- 9 files changed, 60 insertions(+), 54 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index da70268a..543b284e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -38,7 +38,7 @@ jobs: $env:SDE_PATH/sde -p4 -- test\gf16\build\${{ matrix.config }}\test-pmul.exe $env:SDE_PATH/sde -p4 -- test\hasher\build\${{ 
matrix.config }}\test.exe if: ${{ matrix.config == 'Release' && matrix.arch == 'x64' && matrix.compiler == 'ClangCL' }} - + # TODO: XOP tests for hasher? # test building only test-win-arm: diff --git a/gf16/gf16_affine_avx10.h b/gf16/gf16_affine_avx10.h index f2e9de1a..54ff86c7 100644 --- a/gf16/gf16_affine_avx10.h +++ b/gf16/gf16_affine_avx10.h @@ -199,10 +199,10 @@ GF16_MULADD_MULTI_FUNCS_STUB(gf16_affine, _FNSUFFIX) #ifdef _AVAILABLE -static HEDLEY_ALWAYS_INLINE void _FN(gf16_affine2x_muladd_2round)(const int srcCountOffs, const void* _src1, const void* _src2, _mword* result, _mword* swapped, _mword matNorm1, _mword matSwap1, _mword matNorm2, _mword matSwap2) { +static HEDLEY_ALWAYS_INLINE void _FN(gf16_affine2x_muladd_2round)(const int srcCountOffs, const uint8_t* _src1, const uint8_t* _src2, intptr_t srcOffset, _mword* result, _mword* swapped, _mword matNorm1, _mword matSwap1, _mword matNorm2, _mword matSwap2) { if(srcCountOffs < 0) return; - _mword data1 = _MMI(load)(_src1); + _mword data1 = _MMI(load)((const _mword*)(_src1 + srcOffset)); if(srcCountOffs == 0) { *result = _MMI(xor)( *result, @@ -214,7 +214,7 @@ static HEDLEY_ALWAYS_INLINE void _FN(gf16_affine2x_muladd_2round)(const int srcC ); } else { // if(srcCountOffs > 0) - _mword data2 = _MMI(load)(_src2); + _mword data2 = _MMI(load)((const _mword*)(_src2 + srcOffset)); *result = _MM(ternarylogic_epi32)( *result, _MM(gf2p8affine_epi64_epi8)(data1, matNorm1, 0), @@ -391,11 +391,11 @@ static HEDLEY_ALWAYS_INLINE void _FN(gf16_affine2x_muladd_x)( ); } - _FN(gf16_affine2x_muladd_2round)(srcCount - 4, _src4 + ptr*srcScale, _src5 + ptr*srcScale, &result, &swapped, matNormD, matSwapD, matNormE, matSwapE); - _FN(gf16_affine2x_muladd_2round)(srcCount - 6, _src6 + ptr*srcScale, _src7 + ptr*srcScale, &result, &swapped, matNormF, matSwapF, matNormG, matSwapG); - _FN(gf16_affine2x_muladd_2round)(srcCount - 8, _src8 + ptr*srcScale, _src9 + ptr*srcScale, &result, &swapped, matNormH, matSwapH, matNormI, matSwapI); - 
_FN(gf16_affine2x_muladd_2round)(srcCount - 10, _src10 + ptr*srcScale, _src11 + ptr*srcScale, &result, &swapped, matNormJ, matSwapJ, matNormK, matSwapK); - _FN(gf16_affine2x_muladd_2round)(srcCount - 12, _src12 + ptr*srcScale, _src13 + ptr*srcScale, &result, &swapped, matNormL, matSwapL, matNormM, matSwapM); + _FN(gf16_affine2x_muladd_2round)(srcCount - 4, _src4, _src5, ptr*srcScale, &result, &swapped, matNormD, matSwapD, matNormE, matSwapE); + _FN(gf16_affine2x_muladd_2round)(srcCount - 6, _src6, _src7, ptr*srcScale, &result, &swapped, matNormF, matSwapF, matNormG, matSwapG); + _FN(gf16_affine2x_muladd_2round)(srcCount - 8, _src8, _src9, ptr*srcScale, &result, &swapped, matNormH, matSwapH, matNormI, matSwapI); + _FN(gf16_affine2x_muladd_2round)(srcCount - 10, _src10, _src11, ptr*srcScale, &result, &swapped, matNormJ, matSwapJ, matNormK, matSwapK); + _FN(gf16_affine2x_muladd_2round)(srcCount - 12, _src12, _src13, ptr*srcScale, &result, &swapped, matNormL, matSwapL, matNormM, matSwapM); result = _MM(ternarylogic_epi32)( result, diff --git a/gf16/gf16_muladd_multi.h b/gf16/gf16_muladd_multi.h index f445dbb3..fc10c97e 100644 --- a/gf16/gf16_muladd_multi.h +++ b/gf16/gf16_muladd_multi.h @@ -78,6 +78,12 @@ typedef void (*const fMuladdPF) const int doPrefetch, const char* _pf ); +// suppress UBSan warning about adding to a NULL pointer; `coefficients` can be NULL from gf_add*, but it's never used there, and it's annoying to have to check and branch on these +#if defined(__clang__) +# define IGNORE_NULL_ADD __attribute__((no_sanitize("pointer-overflow"))) +#else +# define IGNORE_NULL_ADD +#endif static HEDLEY_ALWAYS_INLINE void gf16_muladd_single(const void *HEDLEY_RESTRICT scratch, fMuladdPF muladd_pf, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, uint16_t val) { muladd_pf( @@ -107,7 +113,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_muladd_prefetch_single(const void *HEDLEY_ #define REMAINING_CASES CASE(17); CASE(16); CASE(15); CASE(14); CASE(13); 
CASE(12); CASE(11); CASE(10); CASE( 9); CASE( 8); CASE( 7); CASE( 6); CASE( 5); CASE( 4); CASE( 3); CASE( 2); CASE( 1) -static HEDLEY_ALWAYS_INLINE void gf16_muladd_multi(const void *HEDLEY_RESTRICT scratch, fMuladdPF muladd_pf, const unsigned interleave, unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients) { +static HEDLEY_ALWAYS_INLINE void gf16_muladd_multi(const void *HEDLEY_RESTRICT scratch, fMuladdPF muladd_pf, const unsigned interleave, unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients) IGNORE_NULL_ADD { uint8_t* _dst = (uint8_t*)dst + offset + len; #define _SRC(limit, n) limit > n ? (const uint8_t*)src[region+n] + offset + len : NULL @@ -148,7 +154,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_muladd_multi(const void *HEDLEY_RESTRICT s #undef _SRC } -static HEDLEY_ALWAYS_INLINE void gf16_muladd_multi_stridepf(const void *HEDLEY_RESTRICT scratch, fMuladdPF muladd_pf, const unsigned interleave, unsigned regions, size_t srcStride, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, const unsigned pfFactor, const void* HEDLEY_RESTRICT prefetch) { +static HEDLEY_ALWAYS_INLINE void gf16_muladd_multi_stridepf(const void *HEDLEY_RESTRICT scratch, fMuladdPF muladd_pf, const unsigned interleave, unsigned regions, size_t srcStride, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, const unsigned pfFactor, const void* HEDLEY_RESTRICT prefetch) IGNORE_NULL_ADD { uint8_t* _dst = (uint8_t*)dst + len; uint8_t* srcEnd = (uint8_t*)src + len; @@ -232,7 +238,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_muladd_multi_stridepf(const void *HEDLEY_R #undef _SRC } -static HEDLEY_ALWAYS_INLINE void gf16_muladd_multi_packed(const void *HEDLEY_RESTRICT 
scratch, fMuladdPF muladd_pf, const unsigned interleave, unsigned regionsPerCall, unsigned inputPackSize, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, size_t blockLen, const uint16_t *HEDLEY_RESTRICT coefficients) { +static HEDLEY_ALWAYS_INLINE void gf16_muladd_multi_packed(const void *HEDLEY_RESTRICT scratch, fMuladdPF muladd_pf, const unsigned interleave, unsigned regionsPerCall, unsigned inputPackSize, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, size_t blockLen, const uint16_t *HEDLEY_RESTRICT coefficients) IGNORE_NULL_ADD { ASSUME(regions <= inputPackSize); uint8_t* _dst = (uint8_t*)dst + len; @@ -351,7 +357,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_muladd_multi_packed(const void *HEDLEY_RES # define MM_HINT_WT1 _MM_HINT_ET1 #endif -static HEDLEY_ALWAYS_INLINE void gf16_muladd_multi_packpf(const void *HEDLEY_RESTRICT scratch, fMuladdPF muladd_pf, const unsigned interleave, unsigned regionsPerCall, unsigned inputPackSize, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, size_t blockLen, const uint16_t *HEDLEY_RESTRICT coefficients, const unsigned pfFactor, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut) { +static HEDLEY_ALWAYS_INLINE void gf16_muladd_multi_packpf(const void *HEDLEY_RESTRICT scratch, fMuladdPF muladd_pf, const unsigned interleave, unsigned regionsPerCall, unsigned inputPackSize, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, size_t blockLen, const uint16_t *HEDLEY_RESTRICT coefficients, const unsigned pfFactor, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut) IGNORE_NULL_ADD { ASSUME(regions <= inputPackSize); uint8_t* _dst = (uint8_t*)dst + len; @@ -615,3 +621,4 @@ static HEDLEY_ALWAYS_INLINE void gf16_muladd_multi_packpf(const void *HEDLEY_RES } #undef REMAINING_CASES +#undef IGNORE_NULL_ADD diff --git 
a/gf16/gf16_shuffle_avx2.c b/gf16/gf16_shuffle_avx2.c index 28a7991e..5899d4c7 100644 --- a/gf16/gf16_shuffle_avx2.c +++ b/gf16/gf16_shuffle_avx2.c @@ -17,8 +17,8 @@ #include "gf16_muladd_multi.h" #if defined(_AVAILABLE) -static HEDLEY_ALWAYS_INLINE void gf16_shuffle2x_muladd_round_avx2(__m256i* _dst, const int srcCount, __m256i* _src1, __m256i* _src2, __m256i shufNormLoA, __m256i shufNormLoB, __m256i shufNormHiA, __m256i shufNormHiB, __m256i shufSwapLoA, __m256i shufSwapLoB, __m256i shufSwapHiA, __m256i shufSwapHiB) { - __m256i data = _mm256_load_si256(_src1); +static HEDLEY_ALWAYS_INLINE void gf16_shuffle2x_muladd_round_avx2(__m256i* _dst, const int srcCount, const uint8_t* _src1, const uint8_t* _src2, intptr_t srcOffset, __m256i shufNormLoA, __m256i shufNormLoB, __m256i shufNormHiA, __m256i shufNormHiB, __m256i shufSwapLoA, __m256i shufSwapLoB, __m256i shufSwapHiA, __m256i shufSwapHiB) { + __m256i data = _mm256_load_si256((const __m256i*)(_src1 + srcOffset)); __m256i mask = _mm256_set1_epi8(0x0f); __m256i ti = _mm256_and_si256(mask, data); @@ -32,7 +32,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_shuffle2x_muladd_round_avx2(__m256i* _dst, result = _mm256_xor_si256(result, _mm256_load_si256(_dst)); if(srcCount > 1) { - data = _mm256_load_si256(_src2); + data = _mm256_load_si256((const __m256i*)(_src2 + srcOffset)); ti = _mm256_and_si256(mask, data); result = _mm256_xor_si256(_mm256_shuffle_epi8(shufNormLoB, ti), result); @@ -117,7 +117,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_shuffle2x_muladd_x_avx2(const void *HEDLEY intptr_t ptr = -(intptr_t)len; if(len & (sizeof(__m256i)*2-1)) { // number of loop iterations isn't even, so do one iteration to make it even gf16_shuffle2x_muladd_round_avx2( - (__m256i*)(_dst+ptr), srcCount, (__m256i*)(_src1+ptr*srcScale), (__m256i*)(_src2+ptr*srcScale), + (__m256i*)(_dst+ptr), srcCount, _src1, _src2, ptr*srcScale, shufNormLoA, shufNormLoB, shufNormHiA, shufNormHiB, shufSwapLoA, shufSwapLoB, shufSwapHiA, shufSwapHiB ); 
if(doPrefetch == 1) @@ -128,12 +128,12 @@ static HEDLEY_ALWAYS_INLINE void gf16_shuffle2x_muladd_x_avx2(const void *HEDLEY } while(ptr) { gf16_shuffle2x_muladd_round_avx2( - (__m256i*)(_dst+ptr), srcCount, (__m256i*)(_src1+ptr*srcScale), (__m256i*)(_src2+ptr*srcScale), + (__m256i*)(_dst+ptr), srcCount, _src1, _src2, ptr*srcScale, shufNormLoA, shufNormLoB, shufNormHiA, shufNormHiB, shufSwapLoA, shufSwapLoB, shufSwapHiA, shufSwapHiB ); ptr += sizeof(__m256i); gf16_shuffle2x_muladd_round_avx2( - (__m256i*)(_dst+ptr), srcCount, (__m256i*)(_src1+ptr*srcScale), (__m256i*)(_src2+ptr*srcScale), + (__m256i*)(_dst+ptr), srcCount, _src1, _src2, ptr*srcScale, shufNormLoA, shufNormLoB, shufNormHiA, shufNormHiB, shufSwapLoA, shufSwapLoB, shufSwapHiA, shufSwapHiB ); @@ -146,7 +146,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_shuffle2x_muladd_x_avx2(const void *HEDLEY } else { for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(__m256i)) { gf16_shuffle2x_muladd_round_avx2( - (__m256i*)(_dst+ptr), srcCount, (__m256i*)(_src1+ptr*srcScale), (__m256i*)(_src2+ptr*srcScale), + (__m256i*)(_dst+ptr), srcCount, _src1, _src2, ptr*srcScale, shufNormLoA, shufNormLoB, shufNormHiA, shufNormHiB, shufSwapLoA, shufSwapLoB, shufSwapHiA, shufSwapHiB ); } diff --git a/gf16/gf16_xor_avx2.c b/gf16/gf16_xor_avx2.c index 219b05ae..019af819 100644 --- a/gf16/gf16_xor_avx2.c +++ b/gf16/gf16_xor_avx2.c @@ -477,22 +477,22 @@ static HEDLEY_ALWAYS_INLINE __m256i gf16_xor_finish_extract_bits(__m256i src) { static HEDLEY_ALWAYS_INLINE void gf16_xor_finish_extract_bits_store(uint32_t* dst, __m256i src) { __m256i srcShifted = _mm256_add_epi8(src, src); __m256i lane = _mm256_inserti128_si256(srcShifted, _mm256_castsi256_si128(src), 1); - dst[3] = _mm256_movemask_epi8(lane); + write32(dst+3, _mm256_movemask_epi8(lane)); lane = _mm256_slli_epi16(lane, 2); - dst[2] = _mm256_movemask_epi8(lane); + write32(dst+2, _mm256_movemask_epi8(lane)); lane = _mm256_slli_epi16(lane, 2); - dst[1] = _mm256_movemask_epi8(lane); + 
write32(dst+1, _mm256_movemask_epi8(lane)); lane = _mm256_slli_epi16(lane, 2); - dst[0] = _mm256_movemask_epi8(lane); + write32(dst+0, _mm256_movemask_epi8(lane)); lane = _mm256_permute2x128_si256(srcShifted, src, 0x31); - dst[7] = _mm256_movemask_epi8(lane); + write32(dst+7, _mm256_movemask_epi8(lane)); lane = _mm256_slli_epi16(lane, 2); - dst[6] = _mm256_movemask_epi8(lane); + write32(dst+6, _mm256_movemask_epi8(lane)); lane = _mm256_slli_epi16(lane, 2); - dst[5] = _mm256_movemask_epi8(lane); + write32(dst+5, _mm256_movemask_epi8(lane)); lane = _mm256_slli_epi16(lane, 2); - dst[4] = _mm256_movemask_epi8(lane); + write32(dst+4, _mm256_movemask_epi8(lane)); } #define LOAD_HALVES(a, b, upper) \ diff --git a/gf16/gf16_xor_avx512.c b/gf16/gf16_xor_avx512.c index d87d79e1..6fea216d 100644 --- a/gf16/gf16_xor_avx512.c +++ b/gf16/gf16_xor_avx512.c @@ -1024,20 +1024,20 @@ static HEDLEY_ALWAYS_INLINE void gf16_xor_finish_bit_extract(uint64_t* dst, __m5 0x10101010, 0x10101010, 0x10101010, 0x10101010 ); __m512i lane = _mm512_shuffle_i32x4(src, src, _MM_SHUFFLE(0,0,0,0)); - dst[0] = _mm512_test_epi8_mask(lane, lo_nibble_test); - dst[1] = _mm512_test_epi8_mask(lane, hi_nibble_test); + write64(dst+0, _mm512_test_epi8_mask(lane, lo_nibble_test)); + write64(dst+1, _mm512_test_epi8_mask(lane, hi_nibble_test)); lane = _mm512_shuffle_i32x4(src, src, _MM_SHUFFLE(1,1,1,1)); - dst[32 +0] = _mm512_test_epi8_mask(lane, lo_nibble_test); - dst[32 +1] = _mm512_test_epi8_mask(lane, hi_nibble_test); + write64(dst+32 +0, _mm512_test_epi8_mask(lane, lo_nibble_test)); + write64(dst+32 +1, _mm512_test_epi8_mask(lane, hi_nibble_test)); lane = _mm512_shuffle_i32x4(src, src, _MM_SHUFFLE(2,2,2,2)); - dst[64 +0] = _mm512_test_epi8_mask(lane, lo_nibble_test); - dst[64 +1] = _mm512_test_epi8_mask(lane, hi_nibble_test); + write64(dst+64 +0, _mm512_test_epi8_mask(lane, lo_nibble_test)); + write64(dst+64 +1, _mm512_test_epi8_mask(lane, hi_nibble_test)); lane = _mm512_shuffle_i32x4(src, src, 
_MM_SHUFFLE(3,3,3,3)); - dst[96 +0] = _mm512_test_epi8_mask(lane, lo_nibble_test); - dst[96 +1] = _mm512_test_epi8_mask(lane, hi_nibble_test); + write64(dst+96 +0, _mm512_test_epi8_mask(lane, lo_nibble_test)); + write64(dst+96 +1, _mm512_test_epi8_mask(lane, hi_nibble_test)); } static HEDLEY_ALWAYS_INLINE void _gf16_xor_finish_copy_block_avx512(void* dst, const void* src) { diff --git a/gf16/gf16_xor_sse2.c b/gf16/gf16_xor_sse2.c index fe68fbbf..c5dcb948 100644 --- a/gf16/gf16_xor_sse2.c +++ b/gf16/gf16_xor_sse2.c @@ -271,7 +271,7 @@ static HEDLEY_ALWAYS_INLINE void STOREU_XMM(void* dest, __m128i xmm) { #define CMOV(c, d, s) if(c) (d) = (s) #endif -static HEDLEY_ALWAYS_INLINE uint_fast16_t xor_jit_bitpair3(uint8_t* dest, uint_fast32_t mask, __m128i* tCode, uint16_t* tInfo, intptr_t* posC, unsigned long* movC, uint_fast8_t isR64) { +static HEDLEY_ALWAYS_INLINE uint_fast16_t xor_jit_bitpair3(uint8_t* dest, uint_fast32_t mask, __m128i* tCode, uint16_t* tInfo, intptr_t* posC, long* movC, uint_fast8_t isR64) { uint_fast16_t info = tInfo[mask>>1]; intptr_t pC = info >> 12; @@ -281,12 +281,12 @@ static HEDLEY_ALWAYS_INLINE uint_fast16_t xor_jit_bitpair3(uint8_t* dest, uint_f // handle conditional move for common mask (since it's always done) CMOV(*movC, *posC, pC+isR64); *posC -= info & 0xF; - *movC &= -(pC == 0); + *movC &= -(long)(pC == 0); return info; } -static HEDLEY_ALWAYS_INLINE uint_fast16_t xor_jit_bitpair3_noxor(uint8_t* dest, uint_fast16_t info, intptr_t* pos1, unsigned long* mov1, intptr_t* pos2, unsigned long* mov2, int isR64) { +static HEDLEY_ALWAYS_INLINE uint_fast16_t xor_jit_bitpair3_noxor(uint8_t* dest, uint_fast16_t info, intptr_t* pos1, long* mov1, intptr_t* pos2, long* mov2, int isR64) { UNUSED(dest); uintptr_t p1 = (info >> 4) & 0xF; uintptr_t p2 = (info >> 8) & 0xF; @@ -294,12 +294,12 @@ static HEDLEY_ALWAYS_INLINE uint_fast16_t xor_jit_bitpair3_noxor(uint8_t* dest, CMOV(*mov2, *pos2, p2+isR64); *pos1 -= info & 0xF; *pos2 -= info & 0xF; - *mov1 &= 
-(p1 == 0); - *mov2 &= -(p2 == 0); + *mov1 &= -(long)(p1 == 0); + *mov2 &= -(long)(p2 == 0); return info & 0xF; } -static HEDLEY_ALWAYS_INLINE uint_fast16_t xor_jit_bitpair3_nc_noxor(uint8_t* dest, uint_fast16_t info, intptr_t* pos1, unsigned long* mov1, intptr_t* pos2, unsigned long* mov2, int isR64) { +static HEDLEY_ALWAYS_INLINE uint_fast16_t xor_jit_bitpair3_nc_noxor(uint8_t* dest, uint_fast16_t info, intptr_t* pos1, long* mov1, intptr_t* pos2, long* mov2, int isR64) { UNUSED(dest); uintptr_t p1 = (info >> 8) & 0xF; uintptr_t p2 = info >> 12; @@ -307,8 +307,8 @@ static HEDLEY_ALWAYS_INLINE uint_fast16_t xor_jit_bitpair3_nc_noxor(uint8_t* des CMOV(*mov2, *pos2, p2+isR64); *pos1 -= info & 0xF; *pos2 -= info & 0xF; - *mov1 &= -(p1 == 0); - *mov2 &= -(p2 == 0); + *mov1 &= -(long)(p1 == 0); + *mov2 &= -(long)(p2 == 0); return info & 0xF; } #undef CMOV @@ -499,7 +499,7 @@ static inline void* xor_write_jit_sse(const struct gf16_xor_scratch *HEDLEY_REST for(bit=0; bit<8; bit++) { int destOffs = (bit<<5)-128; int destOffs2 = destOffs+16; - unsigned long movC = 0xFF; + long movC = 0xFF; intptr_t posC = 0; uint_fast32_t mask = lumask[bit]; _LD_APS(0, DX, destOffs); @@ -572,8 +572,8 @@ static inline void* xor_write_jit_sse(const struct gf16_xor_scratch *HEDLEY_REST for(bit=0; bit<8; bit++) { int destOffs = (bit<<5)-128; int destOffs2 = destOffs+16; - unsigned long mov1 = 0xFF, mov2 = 0xFF, - movC = 0xFF; + long mov1 = 0xFF, mov2 = 0xFF, + movC = 0xFF; intptr_t pos1 = 0, pos2 = 0, posC = 0; uint_fast32_t mask = lumask[bit]; @@ -1012,10 +1012,10 @@ void gf16_xor_muladd_sse2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_REST srcDQh = _mm_unpackhi_epi64(srcQ0d, srcQ8d) #define EXTRACT_BITS(target, srcVec) \ - (target)[7] = _mm_movemask_epi8(srcVec); \ + write16((target) + 7, _mm_movemask_epi8(srcVec)); \ for(int i=6; i>=0; i--) { \ srcVec = _mm_add_epi8(srcVec, srcVec); \ - (target)[i] = _mm_movemask_epi8(srcVec); \ + write16((target) + i, _mm_movemask_epi8(srcVec)); \ } 
void gf16_xor_finish_block_sse2(void *HEDLEY_RESTRICT dst) { uint16_t* _dst = (uint16_t*)dst; diff --git a/hasher/md5mb-sse.h b/hasher/md5mb-sse.h index 81e2503a..3eb26329 100644 --- a/hasher/md5mb-sse.h +++ b/hasher/md5mb-sse.h @@ -59,7 +59,7 @@ static HEDLEY_ALWAYS_INLINE void md5_extract_mb_sse(void* dst, void* state, int idx) { - HEDLEY_ASSUME(idx < md5mb_regions_sse); + HEDLEY_ASSUME(idx >= 0 && idx < md5mb_regions_sse*2); // 2 = md5mb_interleave __m128i* state_ = (__m128i*)state + (idx & 4); __m128i tmp1 = _mm_unpacklo_epi32(state_[0], state_[1]); __m128i tmp2 = _mm_unpackhi_epi32(state_[0], state_[1]); @@ -269,7 +269,7 @@ static HEDLEY_ALWAYS_INLINE void md5_extract_all_mb_sse(void* dst, void* state, static HEDLEY_ALWAYS_INLINE void md5_extract_mb_avx2(void* dst, void* state, int idx) { - HEDLEY_ASSUME(idx < md5mb_regions_avx2); + HEDLEY_ASSUME(idx >= 0 && idx < md5mb_regions_avx2*2); __m256i* state_ = (__m256i*)state + ((idx & 8) >> 1); __m256i tmpAB0 = _mm256_unpacklo_epi32(state_[0], state_[1]); __m256i tmpAB2 = _mm256_unpackhi_epi32(state_[0], state_[1]); @@ -477,7 +477,7 @@ static HEDLEY_ALWAYS_INLINE void md5_extract_all_mb_avx2(void* dst, void* state, #undef LOAD16 static HEDLEY_ALWAYS_INLINE void md5_extract_mb_avx512(void* dst, void* state, int idx) { - HEDLEY_ASSUME(idx < md5mb_regions_avx512); + HEDLEY_ASSUME(idx >= 0 && idx < md5mb_regions_avx512*2); __m512i* state_ = (__m512i*)state + ((idx & 16) >> 2); __m512i tmpAB0 = _mm512_unpacklo_epi32(state_[0], state_[1]); __m512i tmpAB2 = _mm512_unpackhi_epi32(state_[0], state_[1]); diff --git a/test/gf16/CMakeLists.txt b/test/gf16/CMakeLists.txt index 49b716ae..66af243b 100644 --- a/test/gf16/CMakeLists.txt +++ b/test/gf16/CMakeLists.txt @@ -137,8 +137,7 @@ else() endif() if(ENABLE_SANITIZE) - set(SANITIZE_OPTS -fsanitize=address 
-fsanitize=bool,builtin,bounds,enum,float-cast-overflow,function,integer-divide-by-zero,nonnull-attribute,null,object-size,return,returns-nonnull-attribute,shift,signed-integer-overflow,unreachable,vla-bound -fno-sanitize-recover=all) - # -fsanitize=pointer-overflow causes compilation of shuffle_avx512 to freeze on clang10 + set(SANITIZE_OPTS -fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all) # -fsanitize=memory requires instrumented libraries, so not useful add_compile_options(-fno-omit-frame-pointer ${SANITIZE_OPTS}) add_link_options(${SANITIZE_OPTS}) From 91b8c39f3771b7a9819c402ceac3cf7f52e95d44 Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 29 Aug 2023 20:14:18 +1000 Subject: [PATCH 84/91] Test workflow fixes --- .github/workflows/test.yml | 31 +++++++++++++++++-------------- gf16/threadqueue.h | 4 +++- gf16/x86_jit.h | 4 +++- src/cpuid.h | 4 +++- 4 files changed, 26 insertions(+), 17 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 543b284e..8357eec4 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -24,19 +24,19 @@ jobs: mkdir test\hasher\build cmake -B test\hasher\build -S test\hasher -G "Visual Studio 17 2022" -T ${{ matrix.compiler }} -A ${{ matrix.arch }} cmake --build test\hasher\build --config ${{ matrix.config }} - - run: $env:SDE_PATH/sde -icx -- test\gf16\build\${{ matrix.config }}\test.exe - - run: $env:SDE_PATH/sde -icx -- test\gf16\build\${{ matrix.config }}\test-pmul.exe - - run: $env:SDE_PATH/sde -icx -- test\gf16\build\${{ matrix.config }}\test-ctrl.exe -f + - run: Invoke-Expression "$env:SDE_PATH/sde -icx -- test/gf16/build/${{ matrix.config }}/test.exe" + - run: Invoke-Expression "$env:SDE_PATH/sde -icx -- test/gf16/build/${{ matrix.config }}/test-pmul.exe" + - run: Invoke-Expression "$env:SDE_PATH/sde -icx -- test/gf16/build/${{ matrix.config }}/test-ctrl.exe -f" if: ${{ matrix.config == 'Release' && (matrix.compiler == 'ClangCL' || matrix.compiler == 
'v143') }} - - run: $env:SDE_PATH/sde -icx -- test\gf16\build\${{ matrix.config }}\test-inv.exe -f + - run: Invoke-Expression "$env:SDE_PATH/sde -icx -- test/gf16/build/${{ matrix.config }}/test-inv.exe -f" if: ${{ matrix.config == 'Release' && (matrix.compiler == 'ClangCL' || matrix.compiler == 'v143') }} - - run: $env:SDE_PATH/sde -icx -- test\hasher\build\${{ matrix.config }}\test.exe + - run: Invoke-Expression "$env:SDE_PATH/sde -icx -- test/hasher/build/${{ matrix.config }}/test.exe" # test SSE2-only to see if CPUID checking works - run: | - $env:SDE_PATH/sde -p4 -- test\gf16\build\${{ matrix.config }}\test.exe - $env:SDE_PATH/sde -p4 -- test\gf16\build\${{ matrix.config }}\test-pmul.exe - $env:SDE_PATH/sde -p4 -- test\hasher\build\${{ matrix.config }}\test.exe + Invoke-Expression "$env:SDE_PATH/sde -p4 -- test/gf16/build/${{ matrix.config }}/test.exe" + Invoke-Expression "$env:SDE_PATH/sde -p4 -- test/gf16/build/${{ matrix.config }}/test-pmul.exe" + Invoke-Expression "$env:SDE_PATH/sde -p4 -- test/hasher/build/${{ matrix.config }}/test.exe" if: ${{ matrix.config == 'Release' && matrix.arch == 'x64' && matrix.compiler == 'ClangCL' }} # TODO: XOP tests for hasher? 
@@ -68,11 +68,14 @@ jobs: strategy: fail-fast: false matrix: - sys: [mingw32, ucrt64, clang64] + sys: + - { sys: mingw32, env: i686 } + - { sys: ucrt64, env: ucrt-x86_64 } + - { sys: clang64, env: clang-x86_64 } compiler: - - {cc: gcc, cxx: g++, cc_extra: ""} - - {cc: clang, cxx: clang++, cc_extra: gcc} - name: Test MSYS ${{matrix.sys}} ${{matrix.compiler.cc}} + - {cc: gcc, cxx: g++} + - {cc: clang, cxx: clang++} + name: Test MSYS ${{matrix.sys.sys}} ${{matrix.compiler.cc}} defaults: run: shell: msys2 {0} @@ -80,9 +83,9 @@ jobs: #- uses: petarpetrovt/setup-sde@v2.1 - uses: msys2/setup-msys2@v2 with: - msystem: ${{matrix.sys}} + msystem: ${{matrix.sys.sys}} #update: true - install: cmake ${{matrix.compiler.cc}} ${{matrix.compiler.cc_extra}} make git + install: cmake mingw-w64-${{matrix.sys.env}}-${{matrix.compiler.cc}} make git - uses: actions/checkout@v3 - run: | mkdir test/gf16/build diff --git a/gf16/threadqueue.h b/gf16/threadqueue.h index 3db53cb3..0b3c23f8 100644 --- a/gf16/threadqueue.h +++ b/gf16/threadqueue.h @@ -201,7 +201,9 @@ typedef std::function&)> thread_cb_t; #if defined(_WINDOWS) || defined(__WINDOWS__) || defined(_WIN32) || defined(_WIN64) -# define NOMINMAX +# ifndef NOMINMAX +# define NOMINMAX +# endif # define WIN32_LEAN_AND_MEAN # include #else diff --git a/gf16/x86_jit.h b/gf16/x86_jit.h index 2597d9d8..08636a94 100644 --- a/gf16/x86_jit.h +++ b/gf16/x86_jit.h @@ -702,7 +702,9 @@ typedef struct { } jit_wx_pair; #if defined(_WINDOWS) || defined(__WINDOWS__) || defined(_WIN32) || defined(_WIN64) -# define NOMINMAX +# ifndef NOMINMAX +# define NOMINMAX +# endif # include static HEDLEY_ALWAYS_INLINE jit_wx_pair* jit_alloc(size_t len) { void* mem = VirtualAlloc(NULL, len, MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE); diff --git a/src/cpuid.h b/src/cpuid.h index 6d7c1c11..c44cabd3 100644 --- a/src/cpuid.h +++ b/src/cpuid.h @@ -46,7 +46,9 @@ # include # elif defined(_WIN32) # define WIN32_LEAN_AND_MEAN -# define NOMINMAX +# ifndef NOMINMAX +# 
define NOMINMAX +# endif # include # elif defined(__APPLE__) # include From 7b6fb6660a492a5fbbdbea9ea46bc5aba801d7bf Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 29 Aug 2023 20:36:46 +1000 Subject: [PATCH 85/91] Add full test workflow --- .github/workflows/test-full.yml | 36 ++++++++ test/cached-cmpref-fast.json | 1 + test/par-compare.js | 149 ++++++++++++++++++-------------- 3 files changed, 120 insertions(+), 66 deletions(-) create mode 100644 .github/workflows/test-full.yml create mode 100644 test/cached-cmpref-fast.json diff --git a/.github/workflows/test-full.yml b/.github/workflows/test-full.yml new file mode 100644 index 00000000..b936c7cc --- /dev/null +++ b/.github/workflows/test-full.yml @@ -0,0 +1,36 @@ +name: Run PAR2 Create Tests +on: + workflow_dispatch: + push: + +jobs: + test-node: + strategy: + fail-fast: false + matrix: + include: + - version: '0.10.40' + flags: '' + python2: true + - version: '4.9.1' + flags: '' + python2: true + - version: '12.22.12' + flags: '--trace-warnings' + python2: false + - version: '20.5.1' + flags: '--pending-deprecation --throw-deprecation --trace-warnings' + python2: false + name: Test on Node v${{ matrix.version }} + runs-on: ubuntu-latest + steps: + - uses: MatteoH2O1999/setup-python@v1 + with: + python-version: '2.7' + if: ${{ matrix.python2 }} + - uses: actions/checkout@v3 + - uses: actions/setup-node@v3 + with: + node-version: ${{ matrix.version }} + - run: (npm install --production + - run: node ${{ matrix.flags }} test/par-compare.js -f diff --git a/test/cached-cmpref-fast.json b/test/cached-cmpref-fast.json new file mode 100644 index 00000000..3b0ac589 --- /dev/null +++ b/test/cached-cmpref-fast.json @@ -0,0 +1 @@ +{"0":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"384329a66d8557f9c35b05e2d391b2db","len":262152},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"2e968c2b42a5e148ee1da7556645ec19","len":262152},"recovery2":{"type":"PAR 
2.0\u0000RecvSlic","md5":"f9cbb890066d9b19df7fb43cdec29f89","len":262152},"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"ee74b923884c67a0f9c75f8db2a4946a","len":262152},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"c95cc5d5456cc427eb3f05cab6e7712d","len":262152},"recovery5":{"type":"PAR 2.0\u0000RecvSlic","md5":"b789836bb5fda2ad650ed492693c24d0","len":262152},"recovery6":{"type":"PAR 2.0\u0000RecvSlic","md5":"30c5398aa3faf8b4306d4010dde7f34a","len":262152},"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"3652d2c728cf4e5cf0305e05237bc8b4","len":262152},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"8c51f2e4e80ac1f825bc98ce1ee50137","len":262152},"main":{"type":"PAR 2.0\u0000Main","md5":"a1ab3aa1dd29953af5f118e683a1ebef","len":92},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"527d15c6ae89dd498b68d283cb13a04d","len":262152},"recovery10":{"type":"PAR 2.0\u0000RecvSlic","md5":"5a841a77e7876ad22c0860da5f6ed754","len":262152},"recovery11":{"type":"PAR 2.0\u0000RecvSlic","md5":"2d942f333b87e8242d5addbfc309c9c0","len":262152},"recovery12":{"type":"PAR 2.0\u0000RecvSlic","md5":"f8c66e0c1496a0ab21c6af961c8a8a73","len":262152},"recovery13":{"type":"PAR 2.0\u0000RecvSlic","md5":"3419f14c6492702df0b75e1eac8a0e07","len":262152},"recovery14":{"type":"PAR 2.0\u0000RecvSlic","md5":"50b3d0e1f17909a0d32b1a9c5a983ce2","len":262152},"recovery15":{"type":"PAR 2.0\u0000RecvSlic","md5":"ab9251ff754860ecff97ecb45d113171","len":262152},"recovery16":{"type":"PAR 2.0\u0000RecvSlic","md5":"eda7ed48e29ee51e543eae412afa3e38","len":262152},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"ba3580d34dbf456def2775e5f47fe32f","len":132},"recovery17":{"type":"PAR 2.0\u0000RecvSlic","md5":"fe34f3c978e36050cc0324ebd10f2113","len":262152},"recovery18":{"type":"PAR 2.0\u0000RecvSlic","md5":"770f8e4351f416d01b4e5905e6182114","len":262152},"recovery19":{"type":"PAR 
2.0\u0000RecvSlic","md5":"3ffc5b30c2ae5f68d97db47093f093b7","len":262152},"recovery20":{"type":"PAR 2.0\u0000RecvSlic","md5":"203bf0ad82cefd3d9240a50883226500","len":262152},"recovery21":{"type":"PAR 2.0\u0000RecvSlic","md5":"7083ddb527d8f284a27b965dbb3f71ce","len":262152},"recovery22":{"type":"PAR 2.0\u0000RecvSlic","md5":"8972d12d517f515153e4217593e38cbd","len":262152},"recovery23":{"type":"PAR 2.0\u0000RecvSlic","md5":"378c7d54aa0e47c1be08062302c2d615","len":262152},"recovery24":{"type":"PAR 2.0\u0000RecvSlic","md5":"d3b09495cfc309312879658b0e636063","len":262152},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"1883c8c7fa2b7da5978e4b5199f6e168","len":5220},"recovery25":{"type":"PAR 2.0\u0000RecvSlic","md5":"2c57c1edbd2ecdb08710d02cd8b708c6","len":262152},"recovery26":{"type":"PAR 2.0\u0000RecvSlic","md5":"8dea37ee27350c5d4e14f60749484d58","len":262152},"recovery27":{"type":"PAR 2.0\u0000RecvSlic","md5":"b0dee7d46d4c23673394b693f27f3766","len":262152},"recovery28":{"type":"PAR 2.0\u0000RecvSlic","md5":"c62c595a6e2990882d7cadff631c56de","len":262152},"recovery29":{"type":"PAR 2.0\u0000RecvSlic","md5":"08eff6bf1aed164dab1b5144c93a339e","len":262152},"recovery30":{"type":"PAR 2.0\u0000RecvSlic","md5":"c9e1ea86f490cb436315b2a5dfe27700","len":262152},"recovery31":{"type":"PAR 2.0\u0000RecvSlic","md5":"62c45fbbc54d5ba2aab1e4f219639e16","len":262152},"recovery32":{"type":"PAR 2.0\u0000RecvSlic","md5":"76d12aa5f889db92da1b4d90cb70ec28","len":262152},"recovery33":{"type":"PAR 2.0\u0000RecvSlic","md5":"cbb84f9b1dd02e5e6cce7d385aefc47e","len":262152},"recovery34":{"type":"PAR 2.0\u0000RecvSlic","md5":"65ea8fae55f8e2b93ffca5cfa59417b3","len":262152},"recovery35":{"type":"PAR 2.0\u0000RecvSlic","md5":"308bd5740cd38138b61a6a659ab3489d","len":262152},"recovery36":{"type":"PAR 2.0\u0000RecvSlic","md5":"725f7e7221786949fb166a17cea3d4fe","len":262152},"recovery37":{"type":"PAR 
2.0\u0000RecvSlic","md5":"94b230fc6b49f004a0b35fca8321953f","len":262152},"recovery38":{"type":"PAR 2.0\u0000RecvSlic","md5":"37d8119968567d0a7e7f04b0e0e10d7e","len":262152},"recovery39":{"type":"PAR 2.0\u0000RecvSlic","md5":"381fdb3aa50e50cbfc3692633c89c847","len":262152},"recovery40":{"type":"PAR 2.0\u0000RecvSlic","md5":"a9dfadf596069976c88a1943099e0284","len":262152},"recovery41":{"type":"PAR 2.0\u0000RecvSlic","md5":"2968cf0b54fc2d2f28777d67a6cb74ba","len":262152},"recovery42":{"type":"PAR 2.0\u0000RecvSlic","md5":"59682b0bd874f1f6f2dc8f8a37d065f7","len":262152},"recovery43":{"type":"PAR 2.0\u0000RecvSlic","md5":"f4abc2e61efa29e839e8c54a8597b3cd","len":262152},"recovery44":{"type":"PAR 2.0\u0000RecvSlic","md5":"d827bb49770c8dd9397e1dc3e435f7b4","len":262152},"recovery45":{"type":"PAR 2.0\u0000RecvSlic","md5":"572d530fcddea95b69daa2c5c49ca59e","len":262152},"recovery46":{"type":"PAR 2.0\u0000RecvSlic","md5":"2eea5c67990da60d4af649369ee57a4e","len":262152},"recovery47":{"type":"PAR 2.0\u0000RecvSlic","md5":"fbdde1ef3f31e9b49e200583f3ea3162","len":262152},"recovery48":{"type":"PAR 2.0\u0000RecvSlic","md5":"c6d04af4d5bddebd010e2c9087835ad6","len":262152},"recovery49":{"type":"PAR 2.0\u0000RecvSlic","md5":"67d807315ac9bdf41ff67f27a1b9e9da","len":262152},"recovery50":{"type":"PAR 2.0\u0000RecvSlic","md5":"cc9a11df7a97fb9a0169f63150aff0d2","len":262152},"recovery51":{"type":"PAR 2.0\u0000RecvSlic","md5":"8fa64d50c2e72c70d47c527a412c6ebc","len":262152},"recovery52":{"type":"PAR 2.0\u0000RecvSlic","md5":"0b18bcf9f7017a7a8a261cf34c570817","len":262152},"recovery53":{"type":"PAR 2.0\u0000RecvSlic","md5":"5dac9d8f1e5754d818a724841712d518","len":262152},"recovery54":{"type":"PAR 2.0\u0000RecvSlic","md5":"99395b4ea95dfb1e30421850562fccae","len":262152},"recovery55":{"type":"PAR 2.0\u0000RecvSlic","md5":"e2470652540238a6d4e4ee4a0f41b288","len":262152},"recovery56":{"type":"PAR 2.0\u0000RecvSlic","md5":"7578ea0996471412fa31d42b4b6793fc","len":262152},"recovery57":{"type":"PAR 
2.0\u0000RecvSlic","md5":"c324e0207bf219380c8498200d187e09","len":262152},"recovery58":{"type":"PAR 2.0\u0000RecvSlic","md5":"0f5237508b02edaeea3b7566aaab1c8f","len":262152},"recovery59":{"type":"PAR 2.0\u0000RecvSlic","md5":"45b6aa9de72aa9ecb63b77a4c6469f08","len":262152},"recovery60":{"type":"PAR 2.0\u0000RecvSlic","md5":"d171092245e7736d740668d4bf88b69d","len":262152},"recovery61":{"type":"PAR 2.0\u0000RecvSlic","md5":"0a55521c05a72864671e695a953d7f63","len":262152},"recovery62":{"type":"PAR 2.0\u0000RecvSlic","md5":"352e09046f010c26bcebf1caf6a31008","len":262152},"recovery63":{"type":"PAR 2.0\u0000RecvSlic","md5":"1a88541f965eea59e425f5a0ee54e7ea","len":262152},"recovery64":{"type":"PAR 2.0\u0000RecvSlic","md5":"16de857133d3118e86a0ea8ec2b32108","len":262152},"recovery65":{"type":"PAR 2.0\u0000RecvSlic","md5":"7e86600a79c8e4255d6b16051997160c","len":262152},"recovery66":{"type":"PAR 2.0\u0000RecvSlic","md5":"685f9d66019a07b7663c1cc3333d27ab","len":262152},"recovery67":{"type":"PAR 2.0\u0000RecvSlic","md5":"bd358825cd83587525a2707e3d966696","len":262152},"recovery68":{"type":"PAR 2.0\u0000RecvSlic","md5":"98db58ba135f4006057cef5ad69029f8","len":262152},"recovery69":{"type":"PAR 2.0\u0000RecvSlic","md5":"4ee1489fa618a21a3b3366e0c8322165","len":262152},"recovery70":{"type":"PAR 2.0\u0000RecvSlic","md5":"d236c0a9ed4b8e5915bc8c265bb06aca","len":262152},"recovery71":{"type":"PAR 2.0\u0000RecvSlic","md5":"b9d1304bd26aaefa615f00737a864779","len":262152},"recovery72":{"type":"PAR 2.0\u0000RecvSlic","md5":"d8b0767636b21e45d207cc97617666a3","len":262152},"recovery73":{"type":"PAR 2.0\u0000RecvSlic","md5":"e104cc83447db74e477a6dc172e4a44b","len":262152},"recovery74":{"type":"PAR 2.0\u0000RecvSlic","md5":"47a8035c958e238f18d8365c27d4ed34","len":262152},"recovery75":{"type":"PAR 2.0\u0000RecvSlic","md5":"b1d0bd7ef446e2cfcbbc882324f6a64b","len":262152},"recovery76":{"type":"PAR 2.0\u0000RecvSlic","md5":"d66d76a5e5ea3aaeb7693bfd4d79c38b","len":262152},"recovery77":{"type":"PAR 
2.0\u0000RecvSlic","md5":"92f12b3316b91c0e478b71db38f662b3","len":262152},"recovery78":{"type":"PAR 2.0\u0000RecvSlic","md5":"19640a449419963e7035979c5a420833","len":262152},"recovery79":{"type":"PAR 2.0\u0000RecvSlic","md5":"0a6755283ec0bea1915a63ddff53f349","len":262152},"recovery80":{"type":"PAR 2.0\u0000RecvSlic","md5":"ca91dc007eb44c1f55f889cc7375748a","len":262152},"recovery81":{"type":"PAR 2.0\u0000RecvSlic","md5":"592ea92cac06a328f6b7b00153ce343a","len":262152},"recovery82":{"type":"PAR 2.0\u0000RecvSlic","md5":"5154fae148e8c21e507215d740ce275c","len":262152},"recovery83":{"type":"PAR 2.0\u0000RecvSlic","md5":"79a4101d6c828627657c6c1dd0b775e0","len":262152},"recovery84":{"type":"PAR 2.0\u0000RecvSlic","md5":"98b70a8f1dda25aaf4cd6490e7d75a71","len":262152},"recovery85":{"type":"PAR 2.0\u0000RecvSlic","md5":"6f04b148eadd8c49cd620f50cc4d254d","len":262152},"recovery86":{"type":"PAR 2.0\u0000RecvSlic","md5":"182191f40c369d41af2d331056975cbd","len":262152},"recovery87":{"type":"PAR 2.0\u0000RecvSlic","md5":"eb514ff59b226f072aed7f351ef62890","len":262152},"recovery88":{"type":"PAR 2.0\u0000RecvSlic","md5":"cfe974ad0e408adcae5c4359e9333167","len":262152},"recovery89":{"type":"PAR 2.0\u0000RecvSlic","md5":"19550d9807d635fa0c76430a90ef9b2d","len":262152},"recovery90":{"type":"PAR 2.0\u0000RecvSlic","md5":"d8b0bed1ee4a5791b1bf6b339096cb5f","len":262152},"recovery91":{"type":"PAR 2.0\u0000RecvSlic","md5":"a5d9e00c46253847f07171849d6eadc7","len":262152},"recovery92":{"type":"PAR 2.0\u0000RecvSlic","md5":"82d7d59990475c76797adafe0dc40696","len":262152},"recovery93":{"type":"PAR 2.0\u0000RecvSlic","md5":"097f1e8dd4ccb3592b8da5723ef59cd4","len":262152},"recovery94":{"type":"PAR 2.0\u0000RecvSlic","md5":"ded7762317423bd6ba8c1c70ee003895","len":262152},"recovery95":{"type":"PAR 2.0\u0000RecvSlic","md5":"b20168baef01265ca59af7749a004f19","len":262152},"recovery96":{"type":"PAR 2.0\u0000RecvSlic","md5":"f2a61d2c37afd4e32380c5f974c718c4","len":262152},"recovery97":{"type":"PAR 
2.0\u0000RecvSlic","md5":"c505c8d454e6bd6805c19983c4b3becb","len":262152},"recovery98":{"type":"PAR 2.0\u0000RecvSlic","md5":"451a64afafa5742e04a006595e3b76ed","len":262152},"recovery99":{"type":"PAR 2.0\u0000RecvSlic","md5":"27122f17a65d56f09a5a99ec15023906","len":262152},"recovery100":{"type":"PAR 2.0\u0000RecvSlic","md5":"dd77e86b9c6a708f0fdef896794e1416","len":262152},"recovery101":{"type":"PAR 2.0\u0000RecvSlic","md5":"4b66fb1a91751f8fa60a26d01266773b","len":262152},"recovery102":{"type":"PAR 2.0\u0000RecvSlic","md5":"17396f15a240e560fd0abbf793cff777","len":262152},"recovery103":{"type":"PAR 2.0\u0000RecvSlic","md5":"db3e3eefb31e2e13de7f6b792d08fab6","len":262152},"recovery104":{"type":"PAR 2.0\u0000RecvSlic","md5":"96c045e794657f81be4f550931d09f86","len":262152},"recovery105":{"type":"PAR 2.0\u0000RecvSlic","md5":"8b2d795b329310ade27c930c0c9ac3a5","len":262152},"recovery106":{"type":"PAR 2.0\u0000RecvSlic","md5":"5429af73f35f0c3aec714c0c2bb6fd63","len":262152},"recovery107":{"type":"PAR 2.0\u0000RecvSlic","md5":"ba08b07cbfaa853ad84d02e16862b0c4","len":262152},"recovery108":{"type":"PAR 2.0\u0000RecvSlic","md5":"73a6530cfe5bba58e1fec23eabeb8e6f","len":262152},"recovery109":{"type":"PAR 2.0\u0000RecvSlic","md5":"01eb64dc438d15b3d37ad07a751ba19e","len":262152},"recovery110":{"type":"PAR 2.0\u0000RecvSlic","md5":"055ff87be7d79d6a503ec16287cd7e1d","len":262152},"recovery111":{"type":"PAR 2.0\u0000RecvSlic","md5":"f1d7485cfe6bceb4eaba44d23362a168","len":262152},"recovery112":{"type":"PAR 2.0\u0000RecvSlic","md5":"60f1e6f7d1317bf376fb6c3fd6470db0","len":262152},"recovery113":{"type":"PAR 2.0\u0000RecvSlic","md5":"371f5d440f6fb38a8dbd1bb08426d734","len":262152},"recovery114":{"type":"PAR 2.0\u0000RecvSlic","md5":"1088a949951fc9d698682d724e67dd7b","len":262152},"recovery115":{"type":"PAR 2.0\u0000RecvSlic","md5":"e4a1947699b7877aca19f2e9ba66d3a6","len":262152},"recovery116":{"type":"PAR 
2.0\u0000RecvSlic","md5":"39aecd2559c82e37e66328bcd8ece41d","len":262152},"recovery117":{"type":"PAR 2.0\u0000RecvSlic","md5":"764fab8b1ebda52512b3bbeefb9a9bc2","len":262152},"recovery118":{"type":"PAR 2.0\u0000RecvSlic","md5":"b9be02bf929a30beec27ada23f59b6a3","len":262152},"recovery119":{"type":"PAR 2.0\u0000RecvSlic","md5":"03c4ba2e7b40c211a91300091cdfaea1","len":262152},"recovery120":{"type":"PAR 2.0\u0000RecvSlic","md5":"7f161539917d7aed2bdcfc6a97bf1243","len":262152},"recovery121":{"type":"PAR 2.0\u0000RecvSlic","md5":"06cb8f2d83a03ad41d991c95cd93df59","len":262152},"recovery122":{"type":"PAR 2.0\u0000RecvSlic","md5":"03c36c76eeaf4c7c3d669133896b673e","len":262152},"recovery123":{"type":"PAR 2.0\u0000RecvSlic","md5":"2701161fade73a745dfe8fc06d097aa5","len":262152},"recovery124":{"type":"PAR 2.0\u0000RecvSlic","md5":"02579b843bcf0750cf7595c84c4a1b2c","len":262152},"recovery125":{"type":"PAR 2.0\u0000RecvSlic","md5":"8af83a50b1e0c00dc811d24090466dc6","len":262152},"recovery126":{"type":"PAR 2.0\u0000RecvSlic","md5":"5fbcd8d0454c6674bd236ccfab72f784","len":262152},"recovery127":{"type":"PAR 2.0\u0000RecvSlic","md5":"2aae76e0ee91e98cfcad8b6edf64b30a","len":262152},"recovery128":{"type":"PAR 2.0\u0000RecvSlic","md5":"3c80e0a3b4aced780158b2cda147025b","len":262152},"recovery129":{"type":"PAR 2.0\u0000RecvSlic","md5":"0d714844585e05d9b8ecca9b3ce144e2","len":262152},"recovery130":{"type":"PAR 2.0\u0000RecvSlic","md5":"3ff70a539d48bf46bddc4a4d604ef122","len":262152},"recovery131":{"type":"PAR 2.0\u0000RecvSlic","md5":"ac9e11b7553248cc9cab4ffa4b0eadc0","len":262152},"recovery132":{"type":"PAR 2.0\u0000RecvSlic","md5":"a338dfa5fb3dfda5d0844ccf33b7a0a0","len":262152},"recovery133":{"type":"PAR 2.0\u0000RecvSlic","md5":"55816e85a627b359f1ce6abcf07a80f8","len":262152},"recovery134":{"type":"PAR 2.0\u0000RecvSlic","md5":"62989b2c6a48c883b7153f8841c57fbf","len":262152},"recovery135":{"type":"PAR 
2.0\u0000RecvSlic","md5":"0d714d1f7c84c36bba5211a441322f1f","len":262152},"recovery136":{"type":"PAR 2.0\u0000RecvSlic","md5":"fe16f8e7183642779c2fda0bdd2e69c7","len":262152},"recovery137":{"type":"PAR 2.0\u0000RecvSlic","md5":"959e3ed520135276ce507bfb974170e5","len":262152},"recovery138":{"type":"PAR 2.0\u0000RecvSlic","md5":"38755e49834c707a18ebc78153dd309f","len":262152},"recovery139":{"type":"PAR 2.0\u0000RecvSlic","md5":"861c385818cc82e20538517a860ac822","len":262152},"recovery140":{"type":"PAR 2.0\u0000RecvSlic","md5":"573401fb45c72080a5fbe59575daf1f0","len":262152},"recovery141":{"type":"PAR 2.0\u0000RecvSlic","md5":"cbfa6d65eeffb234b4b35bb0101813c2","len":262152},"recovery142":{"type":"PAR 2.0\u0000RecvSlic","md5":"a915d1e365b2f6667b561f15074c246f","len":262152},"recovery143":{"type":"PAR 2.0\u0000RecvSlic","md5":"aaa5d236c4b0ab8bca3837357a676828","len":262152},"recovery144":{"type":"PAR 2.0\u0000RecvSlic","md5":"a5e825e51919a1526298b2a63a67cd18","len":262152},"recovery145":{"type":"PAR 2.0\u0000RecvSlic","md5":"935be6ea3de1a2b7662868af15db66ae","len":262152},"recovery146":{"type":"PAR 2.0\u0000RecvSlic","md5":"20927e4c32ca839d9bc2e32dda3b88bb","len":262152},"recovery147":{"type":"PAR 2.0\u0000RecvSlic","md5":"734b9f415296856d8a9935673a6359ce","len":262152},"recovery148":{"type":"PAR 2.0\u0000RecvSlic","md5":"00df8edd547ee5f396ff09c1b0c8a979","len":262152},"recovery149":{"type":"PAR 2.0\u0000RecvSlic","md5":"2c7d4748b05c963294d008bcd4d9abd4","len":262152},"recovery150":{"type":"PAR 2.0\u0000RecvSlic","md5":"136eefb57c6163ba9ee4361bc30a946b","len":262152},"recovery151":{"type":"PAR 2.0\u0000RecvSlic","md5":"eb45e084b5f5d3c1659825791ebe9dcf","len":262152},"recovery152":{"type":"PAR 2.0\u0000RecvSlic","md5":"b1167f13fd2ab1a49de63f0c23d97b30","len":262152},"recovery153":{"type":"PAR 2.0\u0000RecvSlic","md5":"ec60aca69736e5c01c0bc0a4a5fc20a7","len":262152},"recovery154":{"type":"PAR 
2.0\u0000RecvSlic","md5":"1eeecfb1e63674b87f0d47736f77cf8a","len":262152},"recovery155":{"type":"PAR 2.0\u0000RecvSlic","md5":"d24ea57a9b206855ce59c28a29a609b5","len":262152},"recovery156":{"type":"PAR 2.0\u0000RecvSlic","md5":"adaad933db3226d3778da61397198f17","len":262152},"recovery157":{"type":"PAR 2.0\u0000RecvSlic","md5":"36e8c301bcbcb253546bb3672400f0f2","len":262152},"recovery158":{"type":"PAR 2.0\u0000RecvSlic","md5":"acb88c3d8b4676e5b2a1d07240ecbf3a","len":262152},"recovery159":{"type":"PAR 2.0\u0000RecvSlic","md5":"6ec97320f986ffc6dc8884d762784e64","len":262152},"recovery160":{"type":"PAR 2.0\u0000RecvSlic","md5":"ba205e752f5743e7fb56f91159f11638","len":262152},"recovery161":{"type":"PAR 2.0\u0000RecvSlic","md5":"ee0bc353e44cb3a0a6674bebb8f2b02f","len":262152},"recovery162":{"type":"PAR 2.0\u0000RecvSlic","md5":"4ffb47c96191de05ee92d65d17e6d1b5","len":262152},"recovery163":{"type":"PAR 2.0\u0000RecvSlic","md5":"cab383c52fd7edf43c813a02b0e9f715","len":262152},"recovery164":{"type":"PAR 2.0\u0000RecvSlic","md5":"7579d93610965f3fa9d5380b8a05d179","len":262152},"recovery165":{"type":"PAR 2.0\u0000RecvSlic","md5":"79ebb790e0f5b3878e8efa9e31a1ec2f","len":262152},"recovery166":{"type":"PAR 2.0\u0000RecvSlic","md5":"d98d9e9fcaeab7644965287fdfa14927","len":262152},"recovery167":{"type":"PAR 2.0\u0000RecvSlic","md5":"5c4d552661fe8a972e9a3451829f467f","len":262152},"recovery168":{"type":"PAR 2.0\u0000RecvSlic","md5":"e48d561f1fbc700b8c676ee59c4a9f5d","len":262152},"recovery169":{"type":"PAR 2.0\u0000RecvSlic","md5":"585373bf1021202ac1a871dd1325e797","len":262152},"recovery170":{"type":"PAR 2.0\u0000RecvSlic","md5":"ab371b66e23bf6b2f143a97162857c7f","len":262152},"recovery171":{"type":"PAR 2.0\u0000RecvSlic","md5":"e73956ff54bc05bd6f269456952de3f1","len":262152},"recovery172":{"type":"PAR 2.0\u0000RecvSlic","md5":"1908334591c7199fb59474e189631053","len":262152},"recovery173":{"type":"PAR 
2.0\u0000RecvSlic","md5":"c2fe26625667e1d867eb435a7b542044","len":262152},"recovery174":{"type":"PAR 2.0\u0000RecvSlic","md5":"54a87b2c409465195efcac62cf90c0d0","len":262152},"recovery175":{"type":"PAR 2.0\u0000RecvSlic","md5":"b067362a194e73030224dcb460b406f5","len":262152},"recovery176":{"type":"PAR 2.0\u0000RecvSlic","md5":"ce671aa8082cea7d459d1f23b43ed566","len":262152},"recovery177":{"type":"PAR 2.0\u0000RecvSlic","md5":"37770253e086905a8273fa7681e120eb","len":262152},"recovery178":{"type":"PAR 2.0\u0000RecvSlic","md5":"891a00e3973ede2824a25a87243e8847","len":262152},"recovery179":{"type":"PAR 2.0\u0000RecvSlic","md5":"44cb5fa5db70c22cb3d382bf4fe76924","len":262152},"recovery180":{"type":"PAR 2.0\u0000RecvSlic","md5":"27ca455202140b26fb48af7db5c559eb","len":262152},"recovery181":{"type":"PAR 2.0\u0000RecvSlic","md5":"7ac3a13daccd4e334fb2cb4ba8806931","len":262152},"recovery182":{"type":"PAR 2.0\u0000RecvSlic","md5":"6d218c46b445edd8b584b1079aea762c","len":262152},"recovery183":{"type":"PAR 2.0\u0000RecvSlic","md5":"c288a78496f8e466bc2ca2555ab57651","len":262152},"recovery184":{"type":"PAR 2.0\u0000RecvSlic","md5":"2e2d494f19d7e12728b2813959f363b9","len":262152},"recovery185":{"type":"PAR 2.0\u0000RecvSlic","md5":"7ac64828b943e917b0b327ba4e3e1d7a","len":262152},"recovery186":{"type":"PAR 2.0\u0000RecvSlic","md5":"64149d71b23f3b48601a1f66d38d2ce9","len":262152},"recovery187":{"type":"PAR 2.0\u0000RecvSlic","md5":"cb81ff2511a7b5d0cd9f4c88ebef6f4b","len":262152},"recovery188":{"type":"PAR 2.0\u0000RecvSlic","md5":"6fc5b1e6e008764933bb831afe224ea7","len":262152},"recovery189":{"type":"PAR 2.0\u0000RecvSlic","md5":"7ad66c7aeb321475b766456499f641ea","len":262152},"recovery190":{"type":"PAR 2.0\u0000RecvSlic","md5":"15a3793c1c67f7a9cf718d331d34f41d","len":262152},"recovery191":{"type":"PAR 2.0\u0000RecvSlic","md5":"40d0e37fbc8aa71fb21b18da1b7d025c","len":262152},"recovery192":{"type":"PAR 
2.0\u0000RecvSlic","md5":"c650dffc875af3e180761d3eef8994c9","len":262152},"recovery193":{"type":"PAR 2.0\u0000RecvSlic","md5":"4c22cdc5d78c408671a55c724affe633","len":262152},"recovery194":{"type":"PAR 2.0\u0000RecvSlic","md5":"80e4eca09aeed7823945af8434d4ccd8","len":262152},"recovery195":{"type":"PAR 2.0\u0000RecvSlic","md5":"3fe2b6e3f848a643fd8e77a05d795486","len":262152},"recovery196":{"type":"PAR 2.0\u0000RecvSlic","md5":"9ab3653b65ae244124b97817498673f0","len":262152},"recovery197":{"type":"PAR 2.0\u0000RecvSlic","md5":"3499ed0cd0b8e789b4622ebc71c49e1b","len":262152},"recovery198":{"type":"PAR 2.0\u0000RecvSlic","md5":"853ea439016d879ca104b2ea5621c320","len":262152},"recovery199":{"type":"PAR 2.0\u0000RecvSlic","md5":"99af3d84da4967f583714a44b8c5129b","len":262152},"creator":{"type":"PAR 2.0\u0000Creator","md5":"75978a963bad01ec4845ee37a32b8523","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"a1ab3aa1dd29953af5f118e683a1ebef","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"ba3580d34dbf456def2775e5f47fe32f","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"1883c8c7fa2b7da5978e4b5199f6e168","len":5220},"creator":{"type":"PAR 2.0\u0000Creator","md5":"75978a963bad01ec4845ee37a32b8523","len":104}}],"1":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"6281dd7c3c3938b3e6185a0477e6e287","len":65608},"main":{"type":"PAR 2.0\u0000Main","md5":"0b675e25887343203a39a3e2c8d1ed28","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"67c7299fe12d06356026566cc7084417","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"1ae47841abe78267f3850e465307d555","len":20560},"creator":{"type":"PAR 2.0\u0000Creator","md5":"498e695c1f6fd9d63f95c51ad93d14b0","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"0b675e25887343203a39a3e2c8d1ed28","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 
2.0\u0000FileDesc","md5":"67c7299fe12d06356026566cc7084417","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"1ae47841abe78267f3850e465307d555","len":20560},"creator":{"type":"PAR 2.0\u0000Creator","md5":"498e695c1f6fd9d63f95c51ad93d14b0","len":104}}],"2":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"80ee3e17eb31ff1aeb2b4b1299f54110","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"8818ce3d940fd2ee906eb7372a9d8706","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"5d6e51774704c147c0eb2b21d63f8d6c","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"f90490a6bce853105ed49210ed38c6de","len":1360},"creator":{"type":"PAR 2.0\u0000Creator","md5":"1762e1d053a8628c6311f1cfa802c0f4","len":104}},{"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"47d39160c9dd187d9602b395a9960adc","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"8818ce3d940fd2ee906eb7372a9d8706","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"5d6e51774704c147c0eb2b21d63f8d6c","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"f90490a6bce853105ed49210ed38c6de","len":1360},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"b2fce1dc1ba509aae7225e828a735ca3","len":1048644},"creator":{"type":"PAR 2.0\u0000Creator","md5":"1762e1d053a8628c6311f1cfa802c0f4","len":104}},{"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"93c8615d044e46d375da63c6e5c6999e","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"8818ce3d940fd2ee906eb7372a9d8706","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"5d6e51774704c147c0eb2b21d63f8d6c","len":132},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"fcef287602419071bdf23c06a665b7dd","len":1048644},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"f90490a6bce853105ed49210ed38c6de","len":1360},"recovery5":{"type":"PAR 
2.0\u0000RecvSlic","md5":"73901dbf207c0a3ce23b257df0c90f1c","len":1048644},"recovery6":{"type":"PAR 2.0\u0000RecvSlic","md5":"4e455419ba916f25c94ac35c7fdf339b","len":1048644},"creator":{"type":"PAR 2.0\u0000Creator","md5":"1762e1d053a8628c6311f1cfa802c0f4","len":104}},{"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"30b4e3c59fa3f4dfce5440d6494f2eed","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"8818ce3d940fd2ee906eb7372a9d8706","len":92},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"7868a53c236c0bdb9cd3a2f2a167766b","len":1048644},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"5d6e51774704c147c0eb2b21d63f8d6c","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"f90490a6bce853105ed49210ed38c6de","len":1360},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"5081c501ed7ef9de8960c89020e87192","len":1048644},"recovery10":{"type":"PAR 2.0\u0000RecvSlic","md5":"3984f758117ba91e1a6f4a4a0371d017","len":1048644},"recovery11":{"type":"PAR 2.0\u0000RecvSlic","md5":"12b685ecaae6d8aa9e6a993fdfd07e0c","len":1048644},"recovery12":{"type":"PAR 2.0\u0000RecvSlic","md5":"f50c1e1f38f49140b7a0601896740fa1","len":1048644},"recovery13":{"type":"PAR 2.0\u0000RecvSlic","md5":"c6f795b7c34ab97e65dd3bca71698c5c","len":1048644},"recovery14":{"type":"PAR 2.0\u0000RecvSlic","md5":"96ec24936af60d8ae873945cf38b9091","len":1048644},"creator":{"type":"PAR 2.0\u0000Creator","md5":"1762e1d053a8628c6311f1cfa802c0f4","len":104}},{"recovery15":{"type":"PAR 2.0\u0000RecvSlic","md5":"73a8d86b450e2da2d84a606e4bf97079","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"8818ce3d940fd2ee906eb7372a9d8706","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"5d6e51774704c147c0eb2b21d63f8d6c","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"f90490a6bce853105ed49210ed38c6de","len":1360},"recovery16":{"type":"PAR 
2.0\u0000RecvSlic","md5":"c867457ea6e5d2a1cf07091126ab98eb","len":1048644},"creator":{"type":"PAR 2.0\u0000Creator","md5":"1762e1d053a8628c6311f1cfa802c0f4","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"8818ce3d940fd2ee906eb7372a9d8706","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"5d6e51774704c147c0eb2b21d63f8d6c","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"f90490a6bce853105ed49210ed38c6de","len":1360},"creator":{"type":"PAR 2.0\u0000Creator","md5":"1762e1d053a8628c6311f1cfa802c0f4","len":104}}],"3":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"74520873b88f74d1de0e6b8f5c54006f","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"52818a8855a4f9452bf7505f73351c79","len":124},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"ac1164d1c1d45c3e959976aa8d9c82ed","len":1048644},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"69beb9ebafc6b9d17ebd20d73291f900","len":132},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"09a680944b9e3751e0397455e7f0561b","len":1048644},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"fce642a756dfbd6298ced47a6364adf3","len":132},"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"cd4b284c1cfb31f1f9dc4d2cb3b46f74","len":1048644},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"33eccf584ac19db13bb5c10e6ee0bac1","len":132},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"f86624cb3556a6c9ad0f97831984fd4b","len":1048644},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"cdd697b1320c885fa1b315c0e1913670","len":1360},"recovery5":{"type":"PAR 2.0\u0000RecvSlic","md5":"0282fb1156b97afeb15889463fe6fb38","len":1048644},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"532e5c58590e5efb9f6e36fb3dbaaeaa","len":100},"recovery6":{"type":"PAR 
2.0\u0000RecvSlic","md5":"f4366cec10c3c406ecf39967a3488d00","len":1048644},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"f1e08faaba02c5163b2ae24f2c4d84e1","len":100},"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"debbeee2d37ef4bb964a68adfda5a836","len":1048644},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"1ca06454a70ccef99e374792f85ccb8c","len":1048644},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"f38a39a88bfcf792866e55a8103347d7","len":1048644},"recovery10":{"type":"PAR 2.0\u0000RecvSlic","md5":"962eb1b20273dc63ac3e1afa4995ab10","len":1048644},"recovery11":{"type":"PAR 2.0\u0000RecvSlic","md5":"58f929cac57b0a76a830ea74a0eac1c4","len":1048644},"recovery12":{"type":"PAR 2.0\u0000RecvSlic","md5":"8b6c986373a5068a2afc6857243f2b7f","len":1048644},"recovery13":{"type":"PAR 2.0\u0000RecvSlic","md5":"22bcda081f15c8d4ae9297dfe0ac4f5e","len":1048644},"recovery14":{"type":"PAR 2.0\u0000RecvSlic","md5":"ee302032bc01cd26256dbf9e984d5f6f","len":1048644},"recovery15":{"type":"PAR 2.0\u0000RecvSlic","md5":"084d81c51717cf4b55eb2510bfab58eb","len":1048644},"recovery16":{"type":"PAR 2.0\u0000RecvSlic","md5":"ac7539f5e66f2a7c029b596fa2ef20fa","len":1048644},"recovery17":{"type":"PAR 2.0\u0000RecvSlic","md5":"749080994e1077637af1a102f5734400","len":1048644},"recovery18":{"type":"PAR 2.0\u0000RecvSlic","md5":"b66e22198f11e1eb81c03fb83ac9f243","len":1048644},"recovery19":{"type":"PAR 2.0\u0000RecvSlic","md5":"6b2a3aaf0a6af110d986adc5dcbbb4d9","len":1048644},"recovery20":{"type":"PAR 2.0\u0000RecvSlic","md5":"b4c20ebe89a5409958eeab01b7fdc6d7","len":1048644},"recovery21":{"type":"PAR 2.0\u0000RecvSlic","md5":"41d9af908d53898507506f5bff21a546","len":1048644},"recovery22":{"type":"PAR 2.0\u0000RecvSlic","md5":"693981d690733efd35c8594f2b88afe3","len":1048644},"recovery23":{"type":"PAR 2.0\u0000RecvSlic","md5":"9b5b5257da59294665c41eb6ef8d6d72","len":1048644},"recovery24":{"type":"PAR 
2.0\u0000RecvSlic","md5":"bd20bc56ef956ab4ae3bccead46ff165","len":1048644},"recovery25":{"type":"PAR 2.0\u0000RecvSlic","md5":"6156ade2b63b682f8719e01cd98802c9","len":1048644},"recovery26":{"type":"PAR 2.0\u0000RecvSlic","md5":"ff7336302eea1d8c43a55084ddfbbf3b","len":1048644},"recovery27":{"type":"PAR 2.0\u0000RecvSlic","md5":"5d0621c20b187eced33a3de5d14fa1b0","len":1048644},"recovery28":{"type":"PAR 2.0\u0000RecvSlic","md5":"9c492e6e9078f3d5221d0de69a638ac2","len":1048644},"recovery29":{"type":"PAR 2.0\u0000RecvSlic","md5":"e4b0c879fe80608a1ff5f460330ecb48","len":1048644},"recovery30":{"type":"PAR 2.0\u0000RecvSlic","md5":"63516826cd3746a78cd797f5a849bdb4","len":1048644},"recovery31":{"type":"PAR 2.0\u0000RecvSlic","md5":"35cce3895f113abc206b169fcee3dd0b","len":1048644},"recovery32":{"type":"PAR 2.0\u0000RecvSlic","md5":"97f39c926596f591f608439ec2d6e50b","len":1048644},"recovery33":{"type":"PAR 2.0\u0000RecvSlic","md5":"4e6dcf153973245f9f41f2f6d68dcf3e","len":1048644},"recovery34":{"type":"PAR 2.0\u0000RecvSlic","md5":"ff89baf219572f14f59a7f08a193c155","len":1048644},"recovery35":{"type":"PAR 2.0\u0000RecvSlic","md5":"d72a195ba2006e3d70e3d970d9710daa","len":1048644},"recovery36":{"type":"PAR 2.0\u0000RecvSlic","md5":"7461a500af7f1e129bc9d347a60c0a62","len":1048644},"recovery37":{"type":"PAR 2.0\u0000RecvSlic","md5":"dbb3c6a53d9d9ca3e152ecb21f5cefe9","len":1048644},"recovery38":{"type":"PAR 2.0\u0000RecvSlic","md5":"89b986d2b3aa9550a1d32f046436cc23","len":1048644},"recovery39":{"type":"PAR 2.0\u0000RecvSlic","md5":"0f0454501952dac6bc2d1242629bd829","len":1048644},"creator":{"type":"PAR 2.0\u0000Creator","md5":"fa5bf7092a30b550e90dbb486dc0445a","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"52818a8855a4f9452bf7505f73351c79","len":124},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"69beb9ebafc6b9d17ebd20d73291f900","len":132},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 
2.0\u0000FileDesc","md5":"fce642a756dfbd6298ced47a6364adf3","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"33eccf584ac19db13bb5c10e6ee0bac1","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"cdd697b1320c885fa1b315c0e1913670","len":1360},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"532e5c58590e5efb9f6e36fb3dbaaeaa","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"f1e08faaba02c5163b2ae24f2c4d84e1","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"fa5bf7092a30b550e90dbb486dc0445a","len":104}}],"4":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"edd1ade340f0f983fbfbcf844d801cda","len":2097304},"main":{"type":"PAR 2.0\u0000Main","md5":"09dfad62ef619decf35f32c2bf0a1522","len":124},"desc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000FileDesc","md5":"b10d9aba7502ed7811bb25f22caa8e89","len":132},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"a245487ad0d095cfd6add75baa138e1f","len":132},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"dd6685f87867e9c3b0990dd40ba8fc1d","len":2097304},"desc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000FileDesc","md5":"7c84d68381da44721063e3c93bae8301","len":132},"ifsc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000IFSC","md5":"49d38647c2861e08ba2cb852dfe4062b","len":100},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"eb959263385ec801a008a8adb7bc86ec","len":100},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"6fe15849c49662577aea77449e41e12e","len":2097304},"ifsc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000IFSC","md5":"e35965c8be6de77d07426b0d067b3626","len":220},"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"720cf3d8b68b0312ba6972859c9adac5","len":2097304},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"4588d83af606eb1601a1b5ff510b7879","len":2097304},"recovery5":{"type":"PAR 
2.0\u0000RecvSlic","md5":"2f302e2675b82a7e510046d3053d9919","len":2097304},"recovery6":{"type":"PAR 2.0\u0000RecvSlic","md5":"dfe8e1351ed2ac4376d0309aa01301b1","len":2097304},"creator":{"type":"PAR 2.0\u0000Creator","md5":"f00c87d06d608c53d16e6cbe3735a766","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"09dfad62ef619decf35f32c2bf0a1522","len":124},"desc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000FileDesc","md5":"b10d9aba7502ed7811bb25f22caa8e89","len":132},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"a245487ad0d095cfd6add75baa138e1f","len":132},"desc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000FileDesc","md5":"7c84d68381da44721063e3c93bae8301","len":132},"ifsc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000IFSC","md5":"49d38647c2861e08ba2cb852dfe4062b","len":100},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"eb959263385ec801a008a8adb7bc86ec","len":100},"ifsc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000IFSC","md5":"e35965c8be6de77d07426b0d067b3626","len":220},"creator":{"type":"PAR 2.0\u0000Creator","md5":"f00c87d06d608c53d16e6cbe3735a766","len":104}}],"5":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"227b1235272f1640a2f3f1f5ac13109f","len":4194372},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"4b43e42ea2ec0617637ab5e1cfb98c0d","len":4194372},"main":{"type":"PAR 2.0\u0000Main","md5":"76a2c8053eca00b765ada9fdcde12245","len":92},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"612f84ace10d12ed08b83ccdd9d66511","len":4194372},"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"f3595e2157d1f7b0864a88e07c1f5442","len":4194372},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"49439ce8a8009014fd812470983aedfe","len":132},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"e1ca7d5e9064e21e1777d44aacf4fb30","len":4194372},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 
2.0\u0000IFSC","md5":"3da3ef1c9cde24fce3e996add1d608a9","len":400},"recovery5":{"type":"PAR 2.0\u0000RecvSlic","md5":"c1a5cf7498aaafb919379feccf77ad64","len":4194372},"recovery6":{"type":"PAR 2.0\u0000RecvSlic","md5":"e1de02388438296cf7a44cdef5c38856","len":4194372},"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"1283885f4c351acdab8eef7df7258942","len":4194372},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"0f9d71ae23872e39f122c02e3f403cbc","len":4194372},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"662220713f645187c50d0f950bafc747","len":4194372},"recovery10":{"type":"PAR 2.0\u0000RecvSlic","md5":"63bafc1c7d16744ec4904235621d3be1","len":4194372},"recovery11":{"type":"PAR 2.0\u0000RecvSlic","md5":"d67b2fdd14e40510d595897300ff8889","len":4194372},"recovery12":{"type":"PAR 2.0\u0000RecvSlic","md5":"50ef19310cad87061283fa3ce62188a3","len":4194372},"recovery13":{"type":"PAR 2.0\u0000RecvSlic","md5":"9b79e5fac9942dc229eef88d1b65c15c","len":4194372},"recovery14":{"type":"PAR 2.0\u0000RecvSlic","md5":"dfc8f3cf562f919f39e681a0207afa00","len":4194372},"recovery15":{"type":"PAR 2.0\u0000RecvSlic","md5":"80e536d4be26737e0f2fb2941c652b9b","len":4194372},"recovery16":{"type":"PAR 2.0\u0000RecvSlic","md5":"e9afe78529cbfaffceba5e2601f13279","len":4194372},"recovery17":{"type":"PAR 2.0\u0000RecvSlic","md5":"1240e4446fc41a8805e0b328ff3fef7c","len":4194372},"recovery18":{"type":"PAR 2.0\u0000RecvSlic","md5":"7d6562c315684126e67926609d6ea1c7","len":4194372},"recovery19":{"type":"PAR 2.0\u0000RecvSlic","md5":"ddeef17620d1515a91bb6d195e627438","len":4194372},"recovery20":{"type":"PAR 2.0\u0000RecvSlic","md5":"e03cdfc92253706ac7392426134ce613","len":4194372},"recovery21":{"type":"PAR 2.0\u0000RecvSlic","md5":"05d9de37cb8619790dabc4686a6465ed","len":4194372},"recovery22":{"type":"PAR 2.0\u0000RecvSlic","md5":"ddd9a65e990a88b69ba76360900162bd","len":4194372},"recovery23":{"type":"PAR 
2.0\u0000RecvSlic","md5":"79a1dac3c4b04b02bd63db1169606f96","len":4194372},"creator":{"type":"PAR 2.0\u0000Creator","md5":"4b6de922b9d6c30abe117f55842b19ef","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"76a2c8053eca00b765ada9fdcde12245","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"49439ce8a8009014fd812470983aedfe","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"3da3ef1c9cde24fce3e996add1d608a9","len":400},"creator":{"type":"PAR 2.0\u0000Creator","md5":"4b6de922b9d6c30abe117f55842b19ef","len":104}}],"6":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"74520873b88f74d1de0e6b8f5c54006f","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"52818a8855a4f9452bf7505f73351c79","len":124},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"ac1164d1c1d45c3e959976aa8d9c82ed","len":1048644},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"69beb9ebafc6b9d17ebd20d73291f900","len":132},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"09a680944b9e3751e0397455e7f0561b","len":1048644},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"fce642a756dfbd6298ced47a6364adf3","len":132},"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"cd4b284c1cfb31f1f9dc4d2cb3b46f74","len":1048644},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"33eccf584ac19db13bb5c10e6ee0bac1","len":132},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"f86624cb3556a6c9ad0f97831984fd4b","len":1048644},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"cdd697b1320c885fa1b315c0e1913670","len":1360},"recovery5":{"type":"PAR 2.0\u0000RecvSlic","md5":"0282fb1156b97afeb15889463fe6fb38","len":1048644},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"532e5c58590e5efb9f6e36fb3dbaaeaa","len":100},"recovery6":{"type":"PAR 
2.0\u0000RecvSlic","md5":"f4366cec10c3c406ecf39967a3488d00","len":1048644},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"f1e08faaba02c5163b2ae24f2c4d84e1","len":100},"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"debbeee2d37ef4bb964a68adfda5a836","len":1048644},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"1ca06454a70ccef99e374792f85ccb8c","len":1048644},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"f38a39a88bfcf792866e55a8103347d7","len":1048644},"recovery10":{"type":"PAR 2.0\u0000RecvSlic","md5":"962eb1b20273dc63ac3e1afa4995ab10","len":1048644},"recovery11":{"type":"PAR 2.0\u0000RecvSlic","md5":"58f929cac57b0a76a830ea74a0eac1c4","len":1048644},"recovery12":{"type":"PAR 2.0\u0000RecvSlic","md5":"8b6c986373a5068a2afc6857243f2b7f","len":1048644},"recovery13":{"type":"PAR 2.0\u0000RecvSlic","md5":"22bcda081f15c8d4ae9297dfe0ac4f5e","len":1048644},"recovery14":{"type":"PAR 2.0\u0000RecvSlic","md5":"ee302032bc01cd26256dbf9e984d5f6f","len":1048644},"recovery15":{"type":"PAR 2.0\u0000RecvSlic","md5":"084d81c51717cf4b55eb2510bfab58eb","len":1048644},"recovery16":{"type":"PAR 2.0\u0000RecvSlic","md5":"ac7539f5e66f2a7c029b596fa2ef20fa","len":1048644},"recovery17":{"type":"PAR 2.0\u0000RecvSlic","md5":"749080994e1077637af1a102f5734400","len":1048644},"recovery18":{"type":"PAR 2.0\u0000RecvSlic","md5":"b66e22198f11e1eb81c03fb83ac9f243","len":1048644},"recovery19":{"type":"PAR 2.0\u0000RecvSlic","md5":"6b2a3aaf0a6af110d986adc5dcbbb4d9","len":1048644},"recovery20":{"type":"PAR 2.0\u0000RecvSlic","md5":"b4c20ebe89a5409958eeab01b7fdc6d7","len":1048644},"recovery21":{"type":"PAR 2.0\u0000RecvSlic","md5":"41d9af908d53898507506f5bff21a546","len":1048644},"recovery22":{"type":"PAR 2.0\u0000RecvSlic","md5":"693981d690733efd35c8594f2b88afe3","len":1048644},"recovery23":{"type":"PAR 2.0\u0000RecvSlic","md5":"9b5b5257da59294665c41eb6ef8d6d72","len":1048644},"recovery24":{"type":"PAR 
2.0\u0000RecvSlic","md5":"bd20bc56ef956ab4ae3bccead46ff165","len":1048644},"recovery25":{"type":"PAR 2.0\u0000RecvSlic","md5":"6156ade2b63b682f8719e01cd98802c9","len":1048644},"recovery26":{"type":"PAR 2.0\u0000RecvSlic","md5":"ff7336302eea1d8c43a55084ddfbbf3b","len":1048644},"recovery27":{"type":"PAR 2.0\u0000RecvSlic","md5":"5d0621c20b187eced33a3de5d14fa1b0","len":1048644},"recovery28":{"type":"PAR 2.0\u0000RecvSlic","md5":"9c492e6e9078f3d5221d0de69a638ac2","len":1048644},"recovery29":{"type":"PAR 2.0\u0000RecvSlic","md5":"e4b0c879fe80608a1ff5f460330ecb48","len":1048644},"recovery30":{"type":"PAR 2.0\u0000RecvSlic","md5":"63516826cd3746a78cd797f5a849bdb4","len":1048644},"recovery31":{"type":"PAR 2.0\u0000RecvSlic","md5":"35cce3895f113abc206b169fcee3dd0b","len":1048644},"recovery32":{"type":"PAR 2.0\u0000RecvSlic","md5":"97f39c926596f591f608439ec2d6e50b","len":1048644},"recovery33":{"type":"PAR 2.0\u0000RecvSlic","md5":"4e6dcf153973245f9f41f2f6d68dcf3e","len":1048644},"recovery34":{"type":"PAR 2.0\u0000RecvSlic","md5":"ff89baf219572f14f59a7f08a193c155","len":1048644},"recovery35":{"type":"PAR 2.0\u0000RecvSlic","md5":"d72a195ba2006e3d70e3d970d9710daa","len":1048644},"recovery36":{"type":"PAR 2.0\u0000RecvSlic","md5":"7461a500af7f1e129bc9d347a60c0a62","len":1048644},"recovery37":{"type":"PAR 2.0\u0000RecvSlic","md5":"dbb3c6a53d9d9ca3e152ecb21f5cefe9","len":1048644},"recovery38":{"type":"PAR 2.0\u0000RecvSlic","md5":"89b986d2b3aa9550a1d32f046436cc23","len":1048644},"recovery39":{"type":"PAR 2.0\u0000RecvSlic","md5":"0f0454501952dac6bc2d1242629bd829","len":1048644},"creator":{"type":"PAR 2.0\u0000Creator","md5":"fa5bf7092a30b550e90dbb486dc0445a","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"52818a8855a4f9452bf7505f73351c79","len":124},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"69beb9ebafc6b9d17ebd20d73291f900","len":132},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 
2.0\u0000FileDesc","md5":"fce642a756dfbd6298ced47a6364adf3","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"33eccf584ac19db13bb5c10e6ee0bac1","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"cdd697b1320c885fa1b315c0e1913670","len":1360},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"532e5c58590e5efb9f6e36fb3dbaaeaa","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"f1e08faaba02c5163b2ae24f2c4d84e1","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"fa5bf7092a30b550e90dbb486dc0445a","len":104}}],"7":[{"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"13d75e3ff2871a1f3554d0810d758a9b","len":12292},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"0d61059e69c4520710a1bf6157339902","len":12292},"main":{"type":"PAR 2.0\u0000Main","md5":"b546f34896688623139361b73f7466da","len":140},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"ad306d05c1f843b71793db9705949994","len":12292},"recovery10":{"type":"PAR 2.0\u0000RecvSlic","md5":"9df88ae2f95bb7ac4d7756cff03be42e","len":12292},"desc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000FileDesc","md5":"bdf7fd79dc9be3242911d615013b4675","len":132},"recovery11":{"type":"PAR 2.0\u0000RecvSlic","md5":"019ef2edd1afe0d9e3a4fae44fcc4dcb","len":12292},"recovery12":{"type":"PAR 2.0\u0000RecvSlic","md5":"9a296997ecfb7742bd8adcb4408f3b1e","len":12292},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"1074e6bd114064d4010cba8850279936","len":132},"recovery13":{"type":"PAR 2.0\u0000RecvSlic","md5":"396efc5068f49b51ff1c78438defd032","len":12292},"recovery14":{"type":"PAR 2.0\u0000RecvSlic","md5":"e29e2769b588fdd6073366f0c69ba755","len":12292},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"6c37323d51a7cfcc68cd519a949ed0b9","len":132},"recovery15":{"type":"PAR 
2.0\u0000RecvSlic","md5":"59175e7ac2d1d2364a469e6213bae609","len":12292},"desc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000FileDesc","md5":"10df8b27966f711989172bc5bb351909","len":132},"recovery16":{"type":"PAR 2.0\u0000RecvSlic","md5":"423e090521d2e6e445f3ab47a591f170","len":12292},"recovery17":{"type":"PAR 2.0\u0000RecvSlic","md5":"2ce148ae06221e99da47f28ef4fa0c2e","len":12292},"ifsc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000IFSC","md5":"939568408c85444683327985736473f1","len":200},"recovery18":{"type":"PAR 2.0\u0000RecvSlic","md5":"61c196fcab5efb2423e912493deb174f","len":12292},"recovery19":{"type":"PAR 2.0\u0000RecvSlic","md5":"2a95543e8ad6e15bd03ee56837995b56","len":12292},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"4b33cc66f96237779dac4a2920e5c36b","len":100},"recovery20":{"type":"PAR 2.0\u0000RecvSlic","md5":"3b48bff16a3a8eeebaef276c142aa66b","len":12292},"recovery21":{"type":"PAR 2.0\u0000RecvSlic","md5":"6018b4d628e4267080dfbfbaa11f8482","len":12292},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"7900826aa0cdcd35f5b1b6d309285106","len":100},"recovery22":{"type":"PAR 2.0\u0000RecvSlic","md5":"6355f957b5299352ceb471cc7aaab9de","len":12292},"recovery23":{"type":"PAR 2.0\u0000RecvSlic","md5":"b0f14355deaf0bae05b3a601788f952c","len":12292},"ifsc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000IFSC","md5":"89550dc87a9e0098a8035d643c7c1fa9","len":22400},"recovery24":{"type":"PAR 2.0\u0000RecvSlic","md5":"4a9553081f2bcb2568cc3d49e817c12d","len":12292},"recovery25":{"type":"PAR 2.0\u0000RecvSlic","md5":"64c64c2c2aac4fa2cf1dc5abe9e79e40","len":12292},"recovery26":{"type":"PAR 2.0\u0000RecvSlic","md5":"11e3ecabd3e549dc7bd276d0ef06e4f2","len":12292},"recovery27":{"type":"PAR 2.0\u0000RecvSlic","md5":"045ba81e58ecde2ae2e5e10d6ff52deb","len":12292},"recovery28":{"type":"PAR 2.0\u0000RecvSlic","md5":"7c2a0122a173e9c24288c6fdff476f75","len":12292},"recovery29":{"type":"PAR 
2.0\u0000RecvSlic","md5":"7fada7e5fd40793968feb8708724e76f","len":12292},"recovery30":{"type":"PAR 2.0\u0000RecvSlic","md5":"6051d8fff3ba7d1a9ef0b89dc3c099fe","len":12292},"recovery31":{"type":"PAR 2.0\u0000RecvSlic","md5":"a20b372c84071403679473808447e1d1","len":12292},"recovery32":{"type":"PAR 2.0\u0000RecvSlic","md5":"a073c7ab64609655101b3f9d43f83217","len":12292},"recovery33":{"type":"PAR 2.0\u0000RecvSlic","md5":"ef752e17c8404a77522e886979da238e","len":12292},"recovery34":{"type":"PAR 2.0\u0000RecvSlic","md5":"9489fae5a303376fcf21bdeca134a01e","len":12292},"recovery35":{"type":"PAR 2.0\u0000RecvSlic","md5":"cd47dc979bcdc24667a1743097d6d57b","len":12292},"recovery36":{"type":"PAR 2.0\u0000RecvSlic","md5":"6576e487f28903ad92ac1bb9afde1cfa","len":12292},"recovery37":{"type":"PAR 2.0\u0000RecvSlic","md5":"a77c05ca3f1390283d1fbb1c0332a0d0","len":12292},"recovery38":{"type":"PAR 2.0\u0000RecvSlic","md5":"3923f44f59d0ba6e84ce1d30a44962b7","len":12292},"recovery39":{"type":"PAR 2.0\u0000RecvSlic","md5":"d956db1617f11f33b3663c81e560bae8","len":12292},"recovery40":{"type":"PAR 2.0\u0000RecvSlic","md5":"740ded4b4c20fe1f6799961a9f4a863e","len":12292},"recovery41":{"type":"PAR 2.0\u0000RecvSlic","md5":"90781f2934736ba360ceb99d95652fde","len":12292},"recovery42":{"type":"PAR 2.0\u0000RecvSlic","md5":"9a4bdc4e888708e7639a073817e96be3","len":12292},"recovery43":{"type":"PAR 2.0\u0000RecvSlic","md5":"ad671f3f22846043eee50812b7841d19","len":12292},"recovery44":{"type":"PAR 2.0\u0000RecvSlic","md5":"370998274b1b0348d66f79f98f277707","len":12292},"recovery45":{"type":"PAR 2.0\u0000RecvSlic","md5":"c8491de0561f52f26a01dd5b1fc0a680","len":12292},"recovery46":{"type":"PAR 2.0\u0000RecvSlic","md5":"9108adfad172584acad344c253d6407c","len":12292},"recovery47":{"type":"PAR 2.0\u0000RecvSlic","md5":"64d94e06c9c8076fb98148977aabada7","len":12292},"recovery48":{"type":"PAR 2.0\u0000RecvSlic","md5":"934a2da041551ca688d4cd65201de9de","len":12292},"recovery49":{"type":"PAR 
2.0\u0000RecvSlic","md5":"47ad1b8f9e972ff43bc55d8a7c7c8b82","len":12292},"recovery50":{"type":"PAR 2.0\u0000RecvSlic","md5":"dff77ec66b2bb8226d40075e7dafd65d","len":12292},"recovery51":{"type":"PAR 2.0\u0000RecvSlic","md5":"947c5b6bded6872b18f8b0eafbb75a82","len":12292},"recovery52":{"type":"PAR 2.0\u0000RecvSlic","md5":"9dc6162f9bf3e41821ff1aa49b02fd7c","len":12292},"recovery53":{"type":"PAR 2.0\u0000RecvSlic","md5":"0936daced4a42c3bdd97bcf7bed560be","len":12292},"recovery54":{"type":"PAR 2.0\u0000RecvSlic","md5":"3d3733e28add7241ce6c160afc499514","len":12292},"recovery55":{"type":"PAR 2.0\u0000RecvSlic","md5":"3e457b7d118e652d35420859f0c2cb0c","len":12292},"recovery56":{"type":"PAR 2.0\u0000RecvSlic","md5":"89751e1de0aea82ecff902443fc29a65","len":12292},"recovery57":{"type":"PAR 2.0\u0000RecvSlic","md5":"0bcb7ff73a2b4185a25634b41fd282db","len":12292},"recovery58":{"type":"PAR 2.0\u0000RecvSlic","md5":"0051a97153a602a5da921ce4cc39ab8c","len":12292},"recovery59":{"type":"PAR 2.0\u0000RecvSlic","md5":"c33f2d2904d814872fe43dcb1ba83fa6","len":12292},"recovery60":{"type":"PAR 2.0\u0000RecvSlic","md5":"44d12ca49b89389205fa867b31ef821f","len":12292},"recovery61":{"type":"PAR 2.0\u0000RecvSlic","md5":"908fa6ddb6e46fbc79cd7e8f36fdc91c","len":12292},"recovery62":{"type":"PAR 2.0\u0000RecvSlic","md5":"e01263d2374aca3358fafd4f1e31dd50","len":12292},"recovery63":{"type":"PAR 2.0\u0000RecvSlic","md5":"d87a878179763b3b553850bcc3f37463","len":12292},"recovery64":{"type":"PAR 2.0\u0000RecvSlic","md5":"4748999b7a65523b2c40bdba325fa0d3","len":12292},"recovery65":{"type":"PAR 2.0\u0000RecvSlic","md5":"abf213d33e195ca39e86262a57f5a335","len":12292},"recovery66":{"type":"PAR 2.0\u0000RecvSlic","md5":"ba9e9db844838a23d3a845cc180aa7a9","len":12292},"recovery67":{"type":"PAR 2.0\u0000RecvSlic","md5":"4598550a4ee7bb79427d2baa85ed5341","len":12292},"recovery68":{"type":"PAR 2.0\u0000RecvSlic","md5":"3aeb560438ea23e78d2d990b70579d84","len":12292},"recovery69":{"type":"PAR 
2.0\u0000RecvSlic","md5":"b21c0f8558b48322af337d9502ff7410","len":12292},"recovery70":{"type":"PAR 2.0\u0000RecvSlic","md5":"f965345c3ebc04d23a8a7343d3f4573a","len":12292},"recovery71":{"type":"PAR 2.0\u0000RecvSlic","md5":"1ad9c5792fcc6ed1d0d6889b8067ebc1","len":12292},"recovery72":{"type":"PAR 2.0\u0000RecvSlic","md5":"0b73c50d6e875d041d47d371bebdeaff","len":12292},"recovery73":{"type":"PAR 2.0\u0000RecvSlic","md5":"d3bbd4599eaf7e264910b462fa59ee48","len":12292},"recovery74":{"type":"PAR 2.0\u0000RecvSlic","md5":"052c9e79b7f175a837a46923f0a80ee2","len":12292},"recovery75":{"type":"PAR 2.0\u0000RecvSlic","md5":"be0f6bfabf1bf6ba6508237f216475a1","len":12292},"recovery76":{"type":"PAR 2.0\u0000RecvSlic","md5":"ea169208817c6d7fe336c901bf4a5ea0","len":12292},"recovery77":{"type":"PAR 2.0\u0000RecvSlic","md5":"625407da280dfbfc7cacc8fb4f8bb265","len":12292},"recovery78":{"type":"PAR 2.0\u0000RecvSlic","md5":"1c32d547d7d52d43cebc4d996c262111","len":12292},"recovery79":{"type":"PAR 2.0\u0000RecvSlic","md5":"cedc3a8750dfef198ad10d9dccefe339","len":12292},"recovery80":{"type":"PAR 2.0\u0000RecvSlic","md5":"ec4ef290224420846245b9375b814f8b","len":12292},"recovery81":{"type":"PAR 2.0\u0000RecvSlic","md5":"4bad80c7f5a56d2fc87937642b18aa0c","len":12292},"recovery82":{"type":"PAR 2.0\u0000RecvSlic","md5":"c4b0db1068256aa8c1e07f98a2d16339","len":12292},"recovery83":{"type":"PAR 2.0\u0000RecvSlic","md5":"375911da8153e31766960340b1626520","len":12292},"recovery84":{"type":"PAR 2.0\u0000RecvSlic","md5":"38b8edbd15145ef6126699807dc73da5","len":12292},"recovery85":{"type":"PAR 2.0\u0000RecvSlic","md5":"6e66ffe7118eb850b326d9a3daba5d97","len":12292},"recovery86":{"type":"PAR 2.0\u0000RecvSlic","md5":"d12c98c5352c1db7a19cd928513e66c3","len":12292},"recovery87":{"type":"PAR 2.0\u0000RecvSlic","md5":"ef8dc2866ac48b1831afacf9e0750164","len":12292},"recovery88":{"type":"PAR 2.0\u0000RecvSlic","md5":"bd23bf25bed444e4bdc22b9fc65288bd","len":12292},"recovery89":{"type":"PAR 
2.0\u0000RecvSlic","md5":"a82f0d4f7663ac83d47ca10ac3eac1a7","len":12292},"recovery90":{"type":"PAR 2.0\u0000RecvSlic","md5":"340d4ae8bbe9f403176a4a001ecede09","len":12292},"recovery91":{"type":"PAR 2.0\u0000RecvSlic","md5":"1465427d8499f1cc00977b1bde040381","len":12292},"recovery92":{"type":"PAR 2.0\u0000RecvSlic","md5":"961b476e9df27b019b129aa37e9b4c87","len":12292},"recovery93":{"type":"PAR 2.0\u0000RecvSlic","md5":"e4743689d9571c1ecc2c4341a8400e40","len":12292},"recovery94":{"type":"PAR 2.0\u0000RecvSlic","md5":"4f6d7fead6d5b60a7fdbe453abd63fa5","len":12292},"recovery95":{"type":"PAR 2.0\u0000RecvSlic","md5":"b56ed13c80fed786dfaec38591a7f849","len":12292},"recovery96":{"type":"PAR 2.0\u0000RecvSlic","md5":"3dffd1a0424af8dae4cf74d936aff7ea","len":12292},"recovery97":{"type":"PAR 2.0\u0000RecvSlic","md5":"e8b1e226311ae44d16bcc0f56c68c1a3","len":12292},"recovery98":{"type":"PAR 2.0\u0000RecvSlic","md5":"d3fc2ca676e4c262dd57e61cf68c5361","len":12292},"recovery99":{"type":"PAR 2.0\u0000RecvSlic","md5":"42b3a719315c551d3c91ee822c2da7ed","len":12292},"recovery100":{"type":"PAR 2.0\u0000RecvSlic","md5":"928fc76675423931cbabc3c07b638294","len":12292},"recovery101":{"type":"PAR 2.0\u0000RecvSlic","md5":"e3928b4db0b5b11d3cafba8c20f77376","len":12292},"recovery102":{"type":"PAR 2.0\u0000RecvSlic","md5":"69092ea4065e6a6cbec1bf3841321cc4","len":12292},"recovery103":{"type":"PAR 2.0\u0000RecvSlic","md5":"011dcea0d6d2d576f76d0a06f7f745b8","len":12292},"recovery104":{"type":"PAR 2.0\u0000RecvSlic","md5":"e48dc08c95b0176c34116600eb14961c","len":12292},"recovery105":{"type":"PAR 2.0\u0000RecvSlic","md5":"51ecf96adae6cab2ddb04d94f1e31d78","len":12292},"recovery106":{"type":"PAR 2.0\u0000RecvSlic","md5":"8fa7ffc715bf00106a5d7bc6d0da3ab2","len":12292},"recovery107":{"type":"PAR 2.0\u0000RecvSlic","md5":"c4cf770599853908d4cea098a1298970","len":12292},"recovery108":{"type":"PAR 2.0\u0000RecvSlic","md5":"ac084faa95146eeed6664cc3e029e6ce","len":12292},"recovery109":{"type":"PAR 
2.0\u0000RecvSlic","md5":"52f195fc483df0404a6c742750023de0","len":12292},"recovery110":{"type":"PAR 2.0\u0000RecvSlic","md5":"61ce493a2aa951375af43e9d1ca79d4f","len":12292},"recovery111":{"type":"PAR 2.0\u0000RecvSlic","md5":"fefb4a742be24ac3731afe307f5b82ba","len":12292},"recovery112":{"type":"PAR 2.0\u0000RecvSlic","md5":"9c4e137dade4e9642b809f09dc050eaa","len":12292},"recovery113":{"type":"PAR 2.0\u0000RecvSlic","md5":"af7592f4713e63de24ff2a591dd4fe26","len":12292},"recovery114":{"type":"PAR 2.0\u0000RecvSlic","md5":"c9b995c9fe0601e5e333f734fb387047","len":12292},"recovery115":{"type":"PAR 2.0\u0000RecvSlic","md5":"7f0bc8e940bfc1ee4a8088af4d36ea80","len":12292},"recovery116":{"type":"PAR 2.0\u0000RecvSlic","md5":"1a3fef678dfb898d2a6a2fc349ab8885","len":12292},"recovery117":{"type":"PAR 2.0\u0000RecvSlic","md5":"e4c5d8d29385549ee22e2e3a796ecbce","len":12292},"recovery118":{"type":"PAR 2.0\u0000RecvSlic","md5":"a94ce91cec61f2add2f780412f6688dd","len":12292},"recovery119":{"type":"PAR 2.0\u0000RecvSlic","md5":"45117210792cfdca35871f0625f29f50","len":12292},"creator":{"type":"PAR 2.0\u0000Creator","md5":"a1304668f393486b4bf99741bcc8c0e4","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"b546f34896688623139361b73f7466da","len":140},"desc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000FileDesc","md5":"bdf7fd79dc9be3242911d615013b4675","len":132},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"1074e6bd114064d4010cba8850279936","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"6c37323d51a7cfcc68cd519a949ed0b9","len":132},"desc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000FileDesc","md5":"10df8b27966f711989172bc5bb351909","len":132},"ifsc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000IFSC","md5":"939568408c85444683327985736473f1","len":200},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 
2.0\u0000IFSC","md5":"4b33cc66f96237779dac4a2920e5c36b","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"7900826aa0cdcd35f5b1b6d309285106","len":100},"ifsc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000IFSC","md5":"89550dc87a9e0098a8035d643c7c1fa9","len":22400},"creator":{"type":"PAR 2.0\u0000Creator","md5":"a1304668f393486b4bf99741bcc8c0e4","len":104}}],"8":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"8f490677c7370c81405b5dc720de5259","len":76},"main":{"type":"PAR 2.0\u0000Main","md5":"e3dcbbad791c824b08deb1c570891096","len":108},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"b00de7d8184b9282d2fab59a09f4317a","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"3f2544814d4dae3cd3bc0c89602e85f9","len":132},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"d451ff5d82856339d2b7e616128d51e5","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"5425f856110771270c05e595c89172fb","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"af1d8653ae55de4ace006367e6339891","len":104}},{"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"ab4b40f2f8c79be1b72fc3c297034e18","len":76},"main":{"type":"PAR 2.0\u0000Main","md5":"e3dcbbad791c824b08deb1c570891096","len":108},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"b00de7d8184b9282d2fab59a09f4317a","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"3f2544814d4dae3cd3bc0c89602e85f9","len":132},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"d451ff5d82856339d2b7e616128d51e5","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"5425f856110771270c05e595c89172fb","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"af1d8653ae55de4ace006367e6339891","len":104}},{"main":{"type":"PAR 
2.0\u0000Main","md5":"e3dcbbad791c824b08deb1c570891096","len":108},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"b00de7d8184b9282d2fab59a09f4317a","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"3f2544814d4dae3cd3bc0c89602e85f9","len":132},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"d451ff5d82856339d2b7e616128d51e5","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"5425f856110771270c05e595c89172fb","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"af1d8653ae55de4ace006367e6339891","len":104}}],"9":[{"main":{"type":"PAR 2.0\u0000Main","md5":"d2e9f5f81e8780b703db895c249cbd68","len":92},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"4a2d665a1e6879cd1b9d04025a7a5f80","len":132},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"59a78d4061c80c5b686bc85f1ae871e1","len":120},"creator":{"type":"PAR 2.0\u0000Creator","md5":"aef6213477d5a93d8932e106b971214e","len":104}}],"10":[{"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"2b7cd021389fdb5ed3791087cab67848","len":16777284},"main":{"type":"PAR 2.0\u0000Main","md5":"c3e44363bd4b65f4942581572936dff4","len":124},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"c295b42879c2d2a3ee4843a3bb0f5ee3","len":132},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"13b6852f097a72fc7cc04761a10a480c","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"1ad880202a49781b87e335d349930f26","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"910c17cd5227f95a8f2c837c237ed405","len":160},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"8cbc4d1dad7ea1f63678571057c323a1","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 
2.0\u0000IFSC","md5":"f5a9392a75d31501b020918c799fab52","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"a19208469e9ef2c68f2ce37e0cb36780","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"c3e44363bd4b65f4942581572936dff4","len":124},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"c295b42879c2d2a3ee4843a3bb0f5ee3","len":132},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"13b6852f097a72fc7cc04761a10a480c","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"1ad880202a49781b87e335d349930f26","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"910c17cd5227f95a8f2c837c237ed405","len":160},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"8cbc4d1dad7ea1f63678571057c323a1","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"f5a9392a75d31501b020918c799fab52","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"a19208469e9ef2c68f2ce37e0cb36780","len":104}}],"11":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"5ae3fa27bb30388ee1999b3e5d295b1e","len":1048644},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"76549588cac2b8d3f6ffb53ef2e5e5d5","len":1048644},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"46a19207e25504383c733859fa23a5d3","len":1048644},"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"e39bd46ac1405579ea58ad6990a52e5f","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"3796a7821994867eb08d0b9ecfb1f022","len":92},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"3aa647a57f56be00b66830ebeed8d766","len":1048644},"recovery5":{"type":"PAR 2.0\u0000RecvSlic","md5":"8e3df5e94c2602a027a43df9a79340fa","len":1048644},"recovery6":{"type":"PAR 2.0\u0000RecvSlic","md5":"97253b6d0f8d827181326470066333e4","len":1048644},"desc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 
2.0\u0000FileDesc","md5":"7388491d9d881d220d94b5a26306cbdc","len":132},"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"5d3dc575cd28880a5218b9c5d0762efb","len":1048644},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"696a55821a69dc48cde7e7b7dbfe18e7","len":1048644},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"4f8aa5bf1983752b0122f91c6dc46dae","len":1048644},"ifsc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000IFSC","md5":"dc3363b470db43fc863d728a48decd34","len":340},"recovery10":{"type":"PAR 2.0\u0000RecvSlic","md5":"60ab3ac1c0b22bb2660ec3b1a44a454f","len":1048644},"recovery11":{"type":"PAR 2.0\u0000RecvSlic","md5":"32b0c0ccf4da378145ec2881d06a0104","len":1048644},"recovery12":{"type":"PAR 2.0\u0000RecvSlic","md5":"fe3cf4681ff6dab4fe48baadeefa8b9f","len":1048644},"recovery13":{"type":"PAR 2.0\u0000RecvSlic","md5":"75a0660ce94daa96fefcdb8ecb7ffe2d","len":1048644},"recovery14":{"type":"PAR 2.0\u0000RecvSlic","md5":"e9fc5ad5e0214fb90588695cf1bea1d4","len":1048644},"recovery15":{"type":"PAR 2.0\u0000RecvSlic","md5":"4a7a7c816d43b31ffd1f1ab7ff93cb78","len":1048644},"recovery16":{"type":"PAR 2.0\u0000RecvSlic","md5":"6514fb821fb9d012cedb09a5cdb5ce81","len":1048644},"recovery17":{"type":"PAR 2.0\u0000RecvSlic","md5":"c55c09da6d1858b24764007cc42d01e1","len":1048644},"recovery18":{"type":"PAR 2.0\u0000RecvSlic","md5":"5ee9880f40b8062e5748abe8b5fb2579","len":1048644},"recovery19":{"type":"PAR 2.0\u0000RecvSlic","md5":"329cb0d4712fc629414fbb2aa1c15454","len":1048644},"recovery20":{"type":"PAR 2.0\u0000RecvSlic","md5":"e8b03860153ecb1c0b15b785422cf36c","len":1048644},"recovery21":{"type":"PAR 2.0\u0000RecvSlic","md5":"b4ffc6740802ea6274134e905fe8ec1b","len":1048644},"recovery22":{"type":"PAR 2.0\u0000RecvSlic","md5":"36c944effcf8db4f5098fdef4d7f9fb7","len":1048644},"recovery23":{"type":"PAR 2.0\u0000RecvSlic","md5":"33dddc46138a869d45473597bd8a0885","len":1048644},"recovery24":{"type":"PAR 
2.0\u0000RecvSlic","md5":"52e988fed07f5c02b94695f129ffee05","len":1048644},"recovery25":{"type":"PAR 2.0\u0000RecvSlic","md5":"860ce0c08f14146f6545f8b455388457","len":1048644},"recovery26":{"type":"PAR 2.0\u0000RecvSlic","md5":"02fd2a64a0440a35e571383c3d0a62ef","len":1048644},"recovery27":{"type":"PAR 2.0\u0000RecvSlic","md5":"557a895ab8b60101fc681cd5908d4bfe","len":1048644},"recovery28":{"type":"PAR 2.0\u0000RecvSlic","md5":"2045f82f13b16f36b30986653111655d","len":1048644},"recovery29":{"type":"PAR 2.0\u0000RecvSlic","md5":"ce6fad602d7745fc6b87ce0cf9adf74b","len":1048644},"recovery30":{"type":"PAR 2.0\u0000RecvSlic","md5":"f916de512a753ad776ee91d1cd0c5e88","len":1048644},"recovery31":{"type":"PAR 2.0\u0000RecvSlic","md5":"5f9b5225d53a5df264e98d57ef41488f","len":1048644},"recovery32":{"type":"PAR 2.0\u0000RecvSlic","md5":"b35b8b7c45908e260ff729d0a927ed93","len":1048644},"recovery33":{"type":"PAR 2.0\u0000RecvSlic","md5":"f976fa312351c40ff2c6fb8022aabf6d","len":1048644},"recovery34":{"type":"PAR 2.0\u0000RecvSlic","md5":"ef96b2f79e199d7bb037fe0b05056211","len":1048644},"recovery35":{"type":"PAR 2.0\u0000RecvSlic","md5":"96647c04ece1d25fb0f6b1eca3ae2219","len":1048644},"recovery36":{"type":"PAR 2.0\u0000RecvSlic","md5":"7bf3716a84aed5b1321b19e18a445eb4","len":1048644},"recovery37":{"type":"PAR 2.0\u0000RecvSlic","md5":"257500d3e77cf74b1efa67606d4ec1d0","len":1048644},"recovery38":{"type":"PAR 2.0\u0000RecvSlic","md5":"696781c6156f397df6106e2d51819aa4","len":1048644},"recovery39":{"type":"PAR 2.0\u0000RecvSlic","md5":"9046f4b89e3a13e3947e785eae760ea7","len":1048644},"recovery40":{"type":"PAR 2.0\u0000RecvSlic","md5":"7fad2133a7b1672ed31694f89fe8b1c1","len":1048644},"recovery41":{"type":"PAR 2.0\u0000RecvSlic","md5":"285897a69aa8e07e6f4f82025f25455f","len":1048644},"recovery42":{"type":"PAR 2.0\u0000RecvSlic","md5":"424f74e6c405f183cb635524ae086d36","len":1048644},"recovery43":{"type":"PAR 
2.0\u0000RecvSlic","md5":"d7285bfef36a2314e8d5cc2323a91625","len":1048644},"recovery44":{"type":"PAR 2.0\u0000RecvSlic","md5":"bf8285ec53c4c06737ba3f1ffd792760","len":1048644},"recovery45":{"type":"PAR 2.0\u0000RecvSlic","md5":"ef6616466b9fefebfaddfadcf25b8fa9","len":1048644},"recovery46":{"type":"PAR 2.0\u0000RecvSlic","md5":"3b64f6fc3816ff762953d256f6be5c09","len":1048644},"recovery47":{"type":"PAR 2.0\u0000RecvSlic","md5":"4a6690f87f54eb8769c236e5972f4817","len":1048644},"recovery48":{"type":"PAR 2.0\u0000RecvSlic","md5":"9fcdc6307e18029a7306c24d6e23a798","len":1048644},"recovery49":{"type":"PAR 2.0\u0000RecvSlic","md5":"291516ea56111c0400a6a12f676e5a53","len":1048644},"recovery50":{"type":"PAR 2.0\u0000RecvSlic","md5":"c87b641d60c32ea5d2883509304abc60","len":1048644},"recovery51":{"type":"PAR 2.0\u0000RecvSlic","md5":"e8a1d99820b3030161654ce66e732522","len":1048644},"recovery52":{"type":"PAR 2.0\u0000RecvSlic","md5":"9b7465b4e0aa16ed2531a4a10297c4d3","len":1048644},"recovery53":{"type":"PAR 2.0\u0000RecvSlic","md5":"6a0f4220cc76d6ab4aa7d537c1f7c77b","len":1048644},"recovery54":{"type":"PAR 2.0\u0000RecvSlic","md5":"9143733d0076ab08719a05ca13f1f422","len":1048644},"recovery55":{"type":"PAR 2.0\u0000RecvSlic","md5":"5266852e8cce0de780bce15fb1a608dd","len":1048644},"recovery56":{"type":"PAR 2.0\u0000RecvSlic","md5":"19a85bb67233bbe31319d61afa482c25","len":1048644},"recovery57":{"type":"PAR 2.0\u0000RecvSlic","md5":"777e26b540a27ea9480817bad4ba5eb1","len":1048644},"recovery58":{"type":"PAR 2.0\u0000RecvSlic","md5":"a6559b6d4dc1280cbcf748e968b4dbae","len":1048644},"recovery59":{"type":"PAR 2.0\u0000RecvSlic","md5":"29a393bd18579d5ae612a73b5ad5471f","len":1048644},"recovery60":{"type":"PAR 2.0\u0000RecvSlic","md5":"0c61aafcedeeb70a5677bac1f64ace26","len":1048644},"recovery61":{"type":"PAR 2.0\u0000RecvSlic","md5":"18c227628a2b46e865649fbe7ed1c9ed","len":1048644},"recovery62":{"type":"PAR 
2.0\u0000RecvSlic","md5":"4dea15252f8943bc37ccfd8dede9ca4d","len":1048644},"recovery63":{"type":"PAR 2.0\u0000RecvSlic","md5":"a437635c37da22690a51bcca8af94066","len":1048644},"creator":{"type":"PAR 2.0\u0000Creator","md5":"79b7e4a62fce4bbc39cb1d35a3b28150","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"3796a7821994867eb08d0b9ecfb1f022","len":92},"desc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000FileDesc","md5":"7388491d9d881d220d94b5a26306cbdc","len":132},"ifsc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000IFSC","md5":"dc3363b470db43fc863d728a48decd34","len":340},"creator":{"type":"PAR 2.0\u0000Creator","md5":"79b7e4a62fce4bbc39cb1d35a3b28150","len":104}}],"14":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"bdb6fb8f2a0d2a5902cabcde8b859035","len":4294967364},"main":{"type":"PAR 2.0\u0000Main","md5":"854e212b116ec286bc7a0254029f405c","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"43a52a9af71e0d95624092ff681fbdd6","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"0af69ba8da0e6e6ac51e55f8d36c4b10","len":100},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"0b2721e854623f19ede417796298e409","len":4294967364},"creator":{"type":"PAR 2.0\u0000Creator","md5":"f85ecf559c229c58166c2b3a837817d5","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"854e212b116ec286bc7a0254029f405c","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"43a52a9af71e0d95624092ff681fbdd6","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"0af69ba8da0e6e6ac51e55f8d36c4b10","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"f85ecf559c229c58166c2b3a837817d5","len":104}}],"18":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"5c59db66d27af0dd33c5c76649c80596","len":268435528},"main":{"type":"PAR 2.0\u0000Main","md5":"d4aaedb226e0b58bad779c6228832593","len":92},"desc84612f1ae1cb84f1b796e424861c3d46":{"type":"PAR 
2.0\u0000FileDesc","md5":"413399233805cfe58ec209a1bd76bd06","len":136},"ifsc84612f1ae1cb84f1b796e424861c3d46":{"type":"PAR 2.0\u0000IFSC","md5":"cd00ea091d2134ba1a49b9f0683b3367","len":260},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"b997d60937956d94164d0d4afbb9efe3","len":268435528},"creator":{"type":"PAR 2.0\u0000Creator","md5":"66b1394753c32007408e5bfd2d973ec4","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"d4aaedb226e0b58bad779c6228832593","len":92},"desc84612f1ae1cb84f1b796e424861c3d46":{"type":"PAR 2.0\u0000FileDesc","md5":"413399233805cfe58ec209a1bd76bd06","len":136},"ifsc84612f1ae1cb84f1b796e424861c3d46":{"type":"PAR 2.0\u0000IFSC","md5":"cd00ea091d2134ba1a49b9f0683b3367","len":260},"creator":{"type":"PAR 2.0\u0000Creator","md5":"66b1394753c32007408e5bfd2d973ec4","len":104}}],"20":[{"main":{"type":"PAR 2.0\u0000Main","md5":"8818ce3d940fd2ee906eb7372a9d8706","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"5d6e51774704c147c0eb2b21d63f8d6c","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"f90490a6bce853105ed49210ed38c6de","len":1360},"creator":{"type":"PAR 2.0\u0000Creator","md5":"1762e1d053a8628c6311f1cfa802c0f4","len":104}}],"21":[{"main":{"type":"PAR 2.0\u0000Main","md5":"979bf683edbeb3c67b54eefe72ca796e","len":92},"desc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000FileDesc","md5":"572fd34269714193e5755b1c9b4e3dcd","len":132},"ifsc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000IFSC","md5":"787536bb15eb4279f56212f01d1fdc1d","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"0769f180278adcffd6dcdfbaa0987882","len":104}}],"22":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"dc21985927d59404c3a10a1523e53e0c","len":262212},"main":{"type":"PAR 2.0\u0000Main","md5":"819d570e0ad997c9e3852a8df6bb59a8","len":140},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 
2.0\u0000FileDesc","md5":"bbacaf20c3142b7a9fcd00fe2653e19e","len":132},"desc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000FileDesc","md5":"8ec3f9507a26ff1d780cbac99bc97fd8","len":132},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"2042cbf8d665fba64349918e12425f02","len":262212},"desc84612f1ae1cb84f1b796e424861c3d46":{"type":"PAR 2.0\u0000FileDesc","md5":"7c287f4caeaee8170bee952ec02c6248","len":136},"desc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000FileDesc","md5":"a2c6b03ef481ef57b2e7d70dc964ee13","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"e670a9d5df380732a1518e3f5be7422f","len":5200},"ifsc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000IFSC","md5":"d3baa6921143e1a203ac14a502ab6c8a","len":100},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"010700d8dce400b258d57b93189f6261","len":262212},"ifsc84612f1ae1cb84f1b796e424861c3d46":{"type":"PAR 2.0\u0000IFSC","md5":"35caeff6d05c414457ba3fa8973417bd","len":176080},"ifsc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000IFSC","md5":"ded26a35cec9f7aafdaf3f2fc53d5378","len":1120},"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"a0871e7345520f1e61b2a99292b01a18","len":262212},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"6e9c8c7d20fe75165ee4f0812ebfd434","len":262212},"recovery5":{"type":"PAR 2.0\u0000RecvSlic","md5":"f6b02899978c2d52177f754cb1708bc1","len":262212},"recovery6":{"type":"PAR 2.0\u0000RecvSlic","md5":"55ec6d9b1af64f6fa6ceda6fa43fd722","len":262212},"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"1b50a45fdef6a9878bcf7de3e56cc23a","len":262212},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"36d262d667af95827a870ca0379516fb","len":262212},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"d0a7a6cecea8434b3de7934ecb982e53","len":262212},"creator":{"type":"PAR 2.0\u0000Creator","md5":"8cb6790dad8ccd9f59af46152131e65a","len":104}},{"main":{"type":"PAR 
2.0\u0000Main","md5":"819d570e0ad997c9e3852a8df6bb59a8","len":140},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"bbacaf20c3142b7a9fcd00fe2653e19e","len":132},"desc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000FileDesc","md5":"8ec3f9507a26ff1d780cbac99bc97fd8","len":132},"desc84612f1ae1cb84f1b796e424861c3d46":{"type":"PAR 2.0\u0000FileDesc","md5":"7c287f4caeaee8170bee952ec02c6248","len":136},"desc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000FileDesc","md5":"a2c6b03ef481ef57b2e7d70dc964ee13","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"e670a9d5df380732a1518e3f5be7422f","len":5200},"ifsc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000IFSC","md5":"d3baa6921143e1a203ac14a502ab6c8a","len":100},"ifsc84612f1ae1cb84f1b796e424861c3d46":{"type":"PAR 2.0\u0000IFSC","md5":"35caeff6d05c414457ba3fa8973417bd","len":176080},"ifsc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000IFSC","md5":"ded26a35cec9f7aafdaf3f2fc53d5378","len":1120},"creator":{"type":"PAR 2.0\u0000Creator","md5":"8cb6790dad8ccd9f59af46152131e65a","len":104}}]} \ No newline at end of file diff --git a/test/par-compare.js b/test/par-compare.js index 60832de8..79c2ab72 100644 --- a/test/par-compare.js +++ b/test/par-compare.js @@ -13,8 +13,11 @@ var exeParpar = '../bin/parpar'; var exePar2 = 'par2'; var skipFileCreate = true; // skip creating test files if they already exist (speeds up repeated failing tests, but existing files aren't checked) +var pruneCache = false; // prune unused keys from cached results +var fastTest = process.argv.slice(2).indexOf('-f') > -1; + var fs = require('fs'); var crypto = require('crypto'); @@ -311,7 +314,8 @@ function writeRndFile(name, size) { } writeRndFile('test64m.bin', 64*1048576); writeRndFile('test2200m.bin', 2200*1048576); -writeRndFile('test4100m.bin', 4100*1048576); // >4GB to test 32-bit overflows +if(!fastTest) + writeRndFile('test4100m.bin', 4100*1048576); // >4GB 
to test 32-bit overflows // we don't test 0 byte files - different implementations seem to treat it differently: // - par2cmdline: skips all 0 byte files @@ -326,14 +330,15 @@ writeRndFile('test13m.bin', 13631477); var cachedResults = {}; +var setCacheKeys = {}; var sourceFiles = {}; - +var cacheFileName = fastTest ? 'cached-cmpref-fast.json' : 'cached-cmpref.json'; try { - cachedResults = require(tmpDir + 'cached-cmpref.json'); + cachedResults = require(tmpDir + cacheFileName); } catch(x) { try { // try current folder as well, since I tend to stick it there - cachedResults = require('./cached-cmpref.json'); + cachedResults = require('./' + cacheFileName); } catch(x) { cachedResults = {}; } @@ -456,15 +461,6 @@ var allTests = [ cacheKey: '22' }, - // issue #6 - { - in: [tmpDir + 'test64m.bin'], - blockSize: 40000, - blocks: 10000, - singleFile: true, - cacheKey: '12' - }, - // no recovery test { in: [tmpDir + 'test64m.bin'], @@ -481,15 +477,7 @@ var allTests = [ cacheKey: '21' }, - // 2x large block size test - { - in: [tmpDir + 'test64m.bin'], - blockSize: 2048*1048576 - 1024-68, - blocks: 1, - memory: is64bPlatform ? 
2560*1048576 : 1536*1048576, - singleFile: true, - cacheKey: '13' - }, + // large block size test { in: [tmpDir + 'test64m.bin'], blockSize: 4294967296, // 4GB, should exceed node's limit @@ -499,30 +487,6 @@ var allTests = [ cacheKey: '14' }, - // 2x large input file test - { - in: [tmpDir + 'test4100m.bin'], - blockSize: 1048576, - blocks: 64, - singleFile: true, - cacheKey: '15' - }, - { - in: [tmpDir + 'test2200m.bin', tmpDir + 'test1b.bin'], - blockSize: 768000, - blocks: 2800, - singleFile: true, - cacheKey: '16' - }, - - { // max number of blocks test - in: [tmpDir + 'test64m.bin'], - blockSize: 2048, - blocks: 32768, // max allowed by par2cmdline; TODO: test w/ 65535 - singleFile: true, - cacheKey: '17' - }, - { // skewed slice size to test chunk miscalculation bug in: [tmpDir + 'test2200m.bin'], blockSize: 256*1048576 + 4, @@ -531,25 +495,70 @@ var allTests = [ cacheKey: '18' }, - { // slice > 4GB (generally unsupported, but can be made via par2cmdline with some trickery) - in: [tmpDir + 'test4100m.bin'], - inBlocks: 1, // 4100MB slice - blocks: 2, - singleFile: true, - cacheKey: '19' - }, - ]; -if(is64bPlatform) { - allTests.push({ // recovery > 4GB in memory [https://github.com/animetosho/par2cmdline-turbo/issues/7] - in: [tmpDir + 'test4100m.bin'], - blockSize: 100*1048576, - blocks: 41, - singleFile: true, - memory: 8192*1048576, - cacheKey: '23', - readSize: '100M' - }); +if(!fastTest) { + allTests.push( + // issue #6 + { + in: [tmpDir + 'test64m.bin'], + blockSize: 40000, + blocks: 10000, + singleFile: true, + cacheKey: '12' + }, + // large block+mem test + { + in: [tmpDir + 'test64m.bin'], + blockSize: 2048*1048576 - 1024-68, + blocks: 1, + memory: is64bPlatform ? 
2560*1048576 : 1536*1048576, + singleFile: true, + cacheKey: '13' + }, + + // 2x large input file test + { + in: [tmpDir + 'test4100m.bin'], + blockSize: 1048576, + blocks: 64, + singleFile: true, + cacheKey: '15' + }, + { + in: [tmpDir + 'test2200m.bin', tmpDir + 'test1b.bin'], + blockSize: 768000, + blocks: 2800, + singleFile: true, + cacheKey: '16' + }, + + { // max number of blocks test + in: [tmpDir + 'test64m.bin'], + blockSize: 2048, + blocks: 32768, // max allowed by par2cmdline; TODO: test w/ 65535 + singleFile: true, + cacheKey: '17' + }, + + { // slice > 4GB (generally unsupported, but can be made via par2cmdline with some trickery) + in: [tmpDir + 'test4100m.bin'], + inBlocks: 1, // 4100MB slice + blocks: 2, + singleFile: true, + cacheKey: '19' + }, + ); + if(is64bPlatform) { + allTests.push({ // recovery > 4GB in memory [https://github.com/animetosho/par2cmdline-turbo/issues/7] + in: [tmpDir + 'test4100m.bin'], + blockSize: 100*1048576, + blocks: 41, + singleFile: true, + memory: 8192*1048576, + cacheKey: '23', + readSize: '100M' + }); + } } @@ -635,6 +644,7 @@ async.timesSeries(allTests.length, function(testNum, cb) { } return ret; }))); + setCacheKeys[test.cacheKey] = 1; delOutput(); cb(); @@ -650,12 +660,19 @@ async.timesSeries(allTests.length, function(testNum, cb) { fs.unlinkSync(tmpDir + 'test65k.bin'); fs.unlinkSync(tmpDir + 'test13m.bin'); fs.unlinkSync(tmpDir + 'test2200m.bin'); - fs.unlinkSync(tmpDir + 'test4100m.bin'); + if(!fastTest) + fs.unlinkSync(tmpDir + 'test4100m.bin'); } if(!err) { + if(pruneCache) { + for(var k in cachedResults) + if(!(k in setCacheKeys)) + delete cachedResults[k]; + } + try { - fs.writeFileSync(tmpDir + 'cached-cmpref.json', JSON.stringify(cachedResults)); + fs.writeFileSync(tmpDir + cacheFileName, JSON.stringify(cachedResults)); } catch(x) { console.log(x); } From 63b1a40778a38ed80e799dbbc579bc65491c31f7 Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 29 Aug 2023 20:43:00 +1000 Subject: [PATCH 86/91] Fix test 
workflow --- .github/workflows/test-full.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-full.yml b/.github/workflows/test-full.yml index b936c7cc..d573e406 100644 --- a/.github/workflows/test-full.yml +++ b/.github/workflows/test-full.yml @@ -1,7 +1,7 @@ name: Run PAR2 Create Tests on: workflow_dispatch: - push: + #push: jobs: test-node: @@ -32,5 +32,6 @@ jobs: - uses: actions/setup-node@v3 with: node-version: ${{ matrix.version }} - - run: (npm install --production + - run: npm install --production + - run: node-gyp rebuild - run: node ${{ matrix.flags }} test/par-compare.js -f From afe543d48f7d77d51e348fb71eb8c83590f4a9b3 Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 29 Aug 2023 21:30:02 +1000 Subject: [PATCH 87/91] More build fixes --- .github/workflows/test-full.yml | 2 +- .github/workflows/test.yml | 17 +++++++++-------- gf16/threadqueue.h | 2 +- src/cpuid.h | 4 ++-- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/.github/workflows/test-full.yml b/.github/workflows/test-full.yml index d573e406..2dc57ad1 100644 --- a/.github/workflows/test-full.yml +++ b/.github/workflows/test-full.yml @@ -1,7 +1,7 @@ name: Run PAR2 Create Tests on: workflow_dispatch: - #push: + push: jobs: test-node: diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 8357eec4..13356699 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -24,19 +24,20 @@ jobs: mkdir test\hasher\build cmake -B test\hasher\build -S test\hasher -G "Visual Studio 17 2022" -T ${{ matrix.compiler }} -A ${{ matrix.arch }} cmake --build test\hasher\build --config ${{ matrix.config }} - - run: Invoke-Expression "$env:SDE_PATH/sde -icx -- test/gf16/build/${{ matrix.config }}/test.exe" - - run: Invoke-Expression "$env:SDE_PATH/sde -icx -- test/gf16/build/${{ matrix.config }}/test-pmul.exe" - - run: Invoke-Expression "$env:SDE_PATH/sde -icx -- test/gf16/build/${{ matrix.config }}/test-ctrl.exe -f" + - 
run: dir $env:SDE_PATH + - run: Invoke-Expression "$env:SDE_PATH/sde.exe -icx -- test/gf16/build/${{ matrix.config }}/test.exe" + - run: Invoke-Expression "$env:SDE_PATH/sde.exe -icx -- test/gf16/build/${{ matrix.config }}/test-pmul.exe" + - run: Invoke-Expression "$env:SDE_PATH/sde.exe -icx -- test/gf16/build/${{ matrix.config }}/test-ctrl.exe -f" if: ${{ matrix.config == 'Release' && (matrix.compiler == 'ClangCL' || matrix.compiler == 'v143') }} - - run: Invoke-Expression "$env:SDE_PATH/sde -icx -- test/gf16/build/${{ matrix.config }}/test-inv.exe -f" + - run: Invoke-Expression "$env:SDE_PATH/sde.exe -icx -- test/gf16/build/${{ matrix.config }}/test-inv.exe -f" if: ${{ matrix.config == 'Release' && (matrix.compiler == 'ClangCL' || matrix.compiler == 'v143') }} - - run: Invoke-Expression "$env:SDE_PATH/sde -icx -- test/hasher/build/${{ matrix.config }}/test.exe" + - run: Invoke-Expression "$env:SDE_PATH/sde.exe -icx -- test/hasher/build/${{ matrix.config }}/test.exe" # test SSE2-only to see if CPUID checking works - run: | - Invoke-Expression "$env:SDE_PATH/sde -p4 -- test/gf16/build/${{ matrix.config }}/test.exe" - Invoke-Expression "$env:SDE_PATH/sde -p4 -- test/gf16/build/${{ matrix.config }}/test-pmul.exe" - Invoke-Expression "$env:SDE_PATH/sde -p4 -- test/hasher/build/${{ matrix.config }}/test.exe" + Invoke-Expression "$env:SDE_PATH/sde.exe -p4 -- test/gf16/build/${{ matrix.config }}/test.exe" + Invoke-Expression "$env:SDE_PATH/sde.exe -p4 -- test/gf16/build/${{ matrix.config }}/test-pmul.exe" + Invoke-Expression "$env:SDE_PATH/sde.exe -p4 -- test/hasher/build/${{ matrix.config }}/test.exe" if: ${{ matrix.config == 'Release' && matrix.arch == 'x64' && matrix.compiler == 'ClangCL' }} # TODO: XOP tests for hasher? 
diff --git a/gf16/threadqueue.h b/gf16/threadqueue.h index 0b3c23f8..a203d050 100644 --- a/gf16/threadqueue.h +++ b/gf16/threadqueue.h @@ -280,7 +280,7 @@ class MessageThread { #if defined(_WINDOWS) || defined(__WINDOWS__) || defined(_WIN32) || defined(_WIN64) HMODULE h = GetModuleHandleA("kernelbase.dll"); if(h) { - HRESULT(__stdcall *fnSetTD)(HANDLE, PCWSTR) = (HRESULT(__stdcall *)(HANDLE, PCWSTR))GetProcAddress(h, "SetThreadDescription"); + HRESULT(__stdcall *fnSetTD)(HANDLE, PCWSTR) = (HRESULT(__stdcall *)(HANDLE, PCWSTR))((void*)GetProcAddress(h, "SetThreadDescription")); if(fnSetTD) { wchar_t nameUCS2[17]; //assert(strlen(self->name) <= 16); // always hard-coded string, plus Linux limits it to 16 chars, so shouldn't ever overflow diff --git a/src/cpuid.h b/src/cpuid.h index c44cabd3..2fa6cf38 100644 --- a/src/cpuid.h +++ b/src/cpuid.h @@ -16,8 +16,8 @@ # else #include /* GCC seems to support this, I assume everyone else does too? */ - #define _cpuid(ar, eax) __cpuid(eax, ar[0], ar[1], ar[2], ar[3]) - #define _cpuidX(ar, eax, ecx) __cpuid_count(eax, ecx, ar[0], ar[1], ar[2], ar[3]) + #define _cpuid(ar, eax) __cpuid(eax, (ar)[0], (ar)[1], (ar)[2], (ar)[3]) + #define _cpuidX(ar, eax, ecx) __cpuid_count(eax, ecx, (ar)[0], (ar)[1], (ar)[2], (ar)[3]) static inline int _GET_XCR() { int xcr0; From ce88e2825843c90a31bf2feccbb2d744aafeabd1 Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 29 Aug 2023 21:54:35 +1000 Subject: [PATCH 88/91] Test workflow fixes --- .github/workflows/test-full.yml | 1 - .github/workflows/test.yml | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-full.yml b/.github/workflows/test-full.yml index 2dc57ad1..90b17d37 100644 --- a/.github/workflows/test-full.yml +++ b/.github/workflows/test-full.yml @@ -33,5 +33,4 @@ jobs: with: node-version: ${{ matrix.version }} - run: npm install --production - - run: node-gyp rebuild - run: node ${{ matrix.flags }} test/par-compare.js -f diff --git 
a/.github/workflows/test.yml b/.github/workflows/test.yml index 13356699..2e1c3534 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -15,6 +15,7 @@ jobs: steps: - uses: ilammy/setup-nasm@v1 - uses: petarpetrovt/setup-sde@v2.1 + sdeVersion: 8.69.1 - uses: actions/checkout@v3 - run: | mkdir test\gf16\build From 72d5382c25896a4615abb56ab16b70b342a0c3ff Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 29 Aug 2023 22:09:54 +1000 Subject: [PATCH 89/91] Test workflow fixes --- .github/workflows/test-full.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-full.yml b/.github/workflows/test-full.yml index 90b17d37..9474aa31 100644 --- a/.github/workflows/test-full.yml +++ b/.github/workflows/test-full.yml @@ -33,4 +33,4 @@ jobs: with: node-version: ${{ matrix.version }} - run: npm install --production - - run: node ${{ matrix.flags }} test/par-compare.js -f + - run: (cd test && node ${{ matrix.flags }} par-compare.js -f) From d56cc9989a7eaf52658e18b4140995a4a97c6edc Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 29 Aug 2023 22:49:24 +1000 Subject: [PATCH 90/91] Test workflow fixes --- .github/workflows/test-full.yml | 2 +- test/cached-cmpref-fast.json | 2 +- test/par-compare.js | 11 +++++------ 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/.github/workflows/test-full.yml b/.github/workflows/test-full.yml index 9474aa31..562917d9 100644 --- a/.github/workflows/test-full.yml +++ b/.github/workflows/test-full.yml @@ -19,7 +19,7 @@ jobs: flags: '--trace-warnings' python2: false - version: '20.5.1' - flags: '--pending-deprecation --throw-deprecation --trace-warnings' + flags: '--pending-deprecation --throw-deprecation --trace-warnings --openssl-legacy-provider' python2: false name: Test on Node v${{ matrix.version }} runs-on: ubuntu-latest diff --git a/test/cached-cmpref-fast.json b/test/cached-cmpref-fast.json index 3b0ac589..c6232270 100644 --- a/test/cached-cmpref-fast.json +++ 
b/test/cached-cmpref-fast.json @@ -1 +1 @@ -{"0":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"384329a66d8557f9c35b05e2d391b2db","len":262152},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"2e968c2b42a5e148ee1da7556645ec19","len":262152},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"f9cbb890066d9b19df7fb43cdec29f89","len":262152},"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"ee74b923884c67a0f9c75f8db2a4946a","len":262152},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"c95cc5d5456cc427eb3f05cab6e7712d","len":262152},"recovery5":{"type":"PAR 2.0\u0000RecvSlic","md5":"b789836bb5fda2ad650ed492693c24d0","len":262152},"recovery6":{"type":"PAR 2.0\u0000RecvSlic","md5":"30c5398aa3faf8b4306d4010dde7f34a","len":262152},"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"3652d2c728cf4e5cf0305e05237bc8b4","len":262152},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"8c51f2e4e80ac1f825bc98ce1ee50137","len":262152},"main":{"type":"PAR 2.0\u0000Main","md5":"a1ab3aa1dd29953af5f118e683a1ebef","len":92},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"527d15c6ae89dd498b68d283cb13a04d","len":262152},"recovery10":{"type":"PAR 2.0\u0000RecvSlic","md5":"5a841a77e7876ad22c0860da5f6ed754","len":262152},"recovery11":{"type":"PAR 2.0\u0000RecvSlic","md5":"2d942f333b87e8242d5addbfc309c9c0","len":262152},"recovery12":{"type":"PAR 2.0\u0000RecvSlic","md5":"f8c66e0c1496a0ab21c6af961c8a8a73","len":262152},"recovery13":{"type":"PAR 2.0\u0000RecvSlic","md5":"3419f14c6492702df0b75e1eac8a0e07","len":262152},"recovery14":{"type":"PAR 2.0\u0000RecvSlic","md5":"50b3d0e1f17909a0d32b1a9c5a983ce2","len":262152},"recovery15":{"type":"PAR 2.0\u0000RecvSlic","md5":"ab9251ff754860ecff97ecb45d113171","len":262152},"recovery16":{"type":"PAR 2.0\u0000RecvSlic","md5":"eda7ed48e29ee51e543eae412afa3e38","len":262152},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"ba3580d34dbf456def2775e5f47fe32f","len":132},"recovery17":{"type":"PAR 
2.0\u0000RecvSlic","md5":"fe34f3c978e36050cc0324ebd10f2113","len":262152},"recovery18":{"type":"PAR 2.0\u0000RecvSlic","md5":"770f8e4351f416d01b4e5905e6182114","len":262152},"recovery19":{"type":"PAR 2.0\u0000RecvSlic","md5":"3ffc5b30c2ae5f68d97db47093f093b7","len":262152},"recovery20":{"type":"PAR 2.0\u0000RecvSlic","md5":"203bf0ad82cefd3d9240a50883226500","len":262152},"recovery21":{"type":"PAR 2.0\u0000RecvSlic","md5":"7083ddb527d8f284a27b965dbb3f71ce","len":262152},"recovery22":{"type":"PAR 2.0\u0000RecvSlic","md5":"8972d12d517f515153e4217593e38cbd","len":262152},"recovery23":{"type":"PAR 2.0\u0000RecvSlic","md5":"378c7d54aa0e47c1be08062302c2d615","len":262152},"recovery24":{"type":"PAR 2.0\u0000RecvSlic","md5":"d3b09495cfc309312879658b0e636063","len":262152},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"1883c8c7fa2b7da5978e4b5199f6e168","len":5220},"recovery25":{"type":"PAR 2.0\u0000RecvSlic","md5":"2c57c1edbd2ecdb08710d02cd8b708c6","len":262152},"recovery26":{"type":"PAR 2.0\u0000RecvSlic","md5":"8dea37ee27350c5d4e14f60749484d58","len":262152},"recovery27":{"type":"PAR 2.0\u0000RecvSlic","md5":"b0dee7d46d4c23673394b693f27f3766","len":262152},"recovery28":{"type":"PAR 2.0\u0000RecvSlic","md5":"c62c595a6e2990882d7cadff631c56de","len":262152},"recovery29":{"type":"PAR 2.0\u0000RecvSlic","md5":"08eff6bf1aed164dab1b5144c93a339e","len":262152},"recovery30":{"type":"PAR 2.0\u0000RecvSlic","md5":"c9e1ea86f490cb436315b2a5dfe27700","len":262152},"recovery31":{"type":"PAR 2.0\u0000RecvSlic","md5":"62c45fbbc54d5ba2aab1e4f219639e16","len":262152},"recovery32":{"type":"PAR 2.0\u0000RecvSlic","md5":"76d12aa5f889db92da1b4d90cb70ec28","len":262152},"recovery33":{"type":"PAR 2.0\u0000RecvSlic","md5":"cbb84f9b1dd02e5e6cce7d385aefc47e","len":262152},"recovery34":{"type":"PAR 2.0\u0000RecvSlic","md5":"65ea8fae55f8e2b93ffca5cfa59417b3","len":262152},"recovery35":{"type":"PAR 
2.0\u0000RecvSlic","md5":"308bd5740cd38138b61a6a659ab3489d","len":262152},"recovery36":{"type":"PAR 2.0\u0000RecvSlic","md5":"725f7e7221786949fb166a17cea3d4fe","len":262152},"recovery37":{"type":"PAR 2.0\u0000RecvSlic","md5":"94b230fc6b49f004a0b35fca8321953f","len":262152},"recovery38":{"type":"PAR 2.0\u0000RecvSlic","md5":"37d8119968567d0a7e7f04b0e0e10d7e","len":262152},"recovery39":{"type":"PAR 2.0\u0000RecvSlic","md5":"381fdb3aa50e50cbfc3692633c89c847","len":262152},"recovery40":{"type":"PAR 2.0\u0000RecvSlic","md5":"a9dfadf596069976c88a1943099e0284","len":262152},"recovery41":{"type":"PAR 2.0\u0000RecvSlic","md5":"2968cf0b54fc2d2f28777d67a6cb74ba","len":262152},"recovery42":{"type":"PAR 2.0\u0000RecvSlic","md5":"59682b0bd874f1f6f2dc8f8a37d065f7","len":262152},"recovery43":{"type":"PAR 2.0\u0000RecvSlic","md5":"f4abc2e61efa29e839e8c54a8597b3cd","len":262152},"recovery44":{"type":"PAR 2.0\u0000RecvSlic","md5":"d827bb49770c8dd9397e1dc3e435f7b4","len":262152},"recovery45":{"type":"PAR 2.0\u0000RecvSlic","md5":"572d530fcddea95b69daa2c5c49ca59e","len":262152},"recovery46":{"type":"PAR 2.0\u0000RecvSlic","md5":"2eea5c67990da60d4af649369ee57a4e","len":262152},"recovery47":{"type":"PAR 2.0\u0000RecvSlic","md5":"fbdde1ef3f31e9b49e200583f3ea3162","len":262152},"recovery48":{"type":"PAR 2.0\u0000RecvSlic","md5":"c6d04af4d5bddebd010e2c9087835ad6","len":262152},"recovery49":{"type":"PAR 2.0\u0000RecvSlic","md5":"67d807315ac9bdf41ff67f27a1b9e9da","len":262152},"recovery50":{"type":"PAR 2.0\u0000RecvSlic","md5":"cc9a11df7a97fb9a0169f63150aff0d2","len":262152},"recovery51":{"type":"PAR 2.0\u0000RecvSlic","md5":"8fa64d50c2e72c70d47c527a412c6ebc","len":262152},"recovery52":{"type":"PAR 2.0\u0000RecvSlic","md5":"0b18bcf9f7017a7a8a261cf34c570817","len":262152},"recovery53":{"type":"PAR 2.0\u0000RecvSlic","md5":"5dac9d8f1e5754d818a724841712d518","len":262152},"recovery54":{"type":"PAR 2.0\u0000RecvSlic","md5":"99395b4ea95dfb1e30421850562fccae","len":262152},"recovery55":{"type":"PAR 
2.0\u0000RecvSlic","md5":"e2470652540238a6d4e4ee4a0f41b288","len":262152},"recovery56":{"type":"PAR 2.0\u0000RecvSlic","md5":"7578ea0996471412fa31d42b4b6793fc","len":262152},"recovery57":{"type":"PAR 2.0\u0000RecvSlic","md5":"c324e0207bf219380c8498200d187e09","len":262152},"recovery58":{"type":"PAR 2.0\u0000RecvSlic","md5":"0f5237508b02edaeea3b7566aaab1c8f","len":262152},"recovery59":{"type":"PAR 2.0\u0000RecvSlic","md5":"45b6aa9de72aa9ecb63b77a4c6469f08","len":262152},"recovery60":{"type":"PAR 2.0\u0000RecvSlic","md5":"d171092245e7736d740668d4bf88b69d","len":262152},"recovery61":{"type":"PAR 2.0\u0000RecvSlic","md5":"0a55521c05a72864671e695a953d7f63","len":262152},"recovery62":{"type":"PAR 2.0\u0000RecvSlic","md5":"352e09046f010c26bcebf1caf6a31008","len":262152},"recovery63":{"type":"PAR 2.0\u0000RecvSlic","md5":"1a88541f965eea59e425f5a0ee54e7ea","len":262152},"recovery64":{"type":"PAR 2.0\u0000RecvSlic","md5":"16de857133d3118e86a0ea8ec2b32108","len":262152},"recovery65":{"type":"PAR 2.0\u0000RecvSlic","md5":"7e86600a79c8e4255d6b16051997160c","len":262152},"recovery66":{"type":"PAR 2.0\u0000RecvSlic","md5":"685f9d66019a07b7663c1cc3333d27ab","len":262152},"recovery67":{"type":"PAR 2.0\u0000RecvSlic","md5":"bd358825cd83587525a2707e3d966696","len":262152},"recovery68":{"type":"PAR 2.0\u0000RecvSlic","md5":"98db58ba135f4006057cef5ad69029f8","len":262152},"recovery69":{"type":"PAR 2.0\u0000RecvSlic","md5":"4ee1489fa618a21a3b3366e0c8322165","len":262152},"recovery70":{"type":"PAR 2.0\u0000RecvSlic","md5":"d236c0a9ed4b8e5915bc8c265bb06aca","len":262152},"recovery71":{"type":"PAR 2.0\u0000RecvSlic","md5":"b9d1304bd26aaefa615f00737a864779","len":262152},"recovery72":{"type":"PAR 2.0\u0000RecvSlic","md5":"d8b0767636b21e45d207cc97617666a3","len":262152},"recovery73":{"type":"PAR 2.0\u0000RecvSlic","md5":"e104cc83447db74e477a6dc172e4a44b","len":262152},"recovery74":{"type":"PAR 2.0\u0000RecvSlic","md5":"47a8035c958e238f18d8365c27d4ed34","len":262152},"recovery75":{"type":"PAR 
2.0\u0000RecvSlic","md5":"b1d0bd7ef446e2cfcbbc882324f6a64b","len":262152},"recovery76":{"type":"PAR 2.0\u0000RecvSlic","md5":"d66d76a5e5ea3aaeb7693bfd4d79c38b","len":262152},"recovery77":{"type":"PAR 2.0\u0000RecvSlic","md5":"92f12b3316b91c0e478b71db38f662b3","len":262152},"recovery78":{"type":"PAR 2.0\u0000RecvSlic","md5":"19640a449419963e7035979c5a420833","len":262152},"recovery79":{"type":"PAR 2.0\u0000RecvSlic","md5":"0a6755283ec0bea1915a63ddff53f349","len":262152},"recovery80":{"type":"PAR 2.0\u0000RecvSlic","md5":"ca91dc007eb44c1f55f889cc7375748a","len":262152},"recovery81":{"type":"PAR 2.0\u0000RecvSlic","md5":"592ea92cac06a328f6b7b00153ce343a","len":262152},"recovery82":{"type":"PAR 2.0\u0000RecvSlic","md5":"5154fae148e8c21e507215d740ce275c","len":262152},"recovery83":{"type":"PAR 2.0\u0000RecvSlic","md5":"79a4101d6c828627657c6c1dd0b775e0","len":262152},"recovery84":{"type":"PAR 2.0\u0000RecvSlic","md5":"98b70a8f1dda25aaf4cd6490e7d75a71","len":262152},"recovery85":{"type":"PAR 2.0\u0000RecvSlic","md5":"6f04b148eadd8c49cd620f50cc4d254d","len":262152},"recovery86":{"type":"PAR 2.0\u0000RecvSlic","md5":"182191f40c369d41af2d331056975cbd","len":262152},"recovery87":{"type":"PAR 2.0\u0000RecvSlic","md5":"eb514ff59b226f072aed7f351ef62890","len":262152},"recovery88":{"type":"PAR 2.0\u0000RecvSlic","md5":"cfe974ad0e408adcae5c4359e9333167","len":262152},"recovery89":{"type":"PAR 2.0\u0000RecvSlic","md5":"19550d9807d635fa0c76430a90ef9b2d","len":262152},"recovery90":{"type":"PAR 2.0\u0000RecvSlic","md5":"d8b0bed1ee4a5791b1bf6b339096cb5f","len":262152},"recovery91":{"type":"PAR 2.0\u0000RecvSlic","md5":"a5d9e00c46253847f07171849d6eadc7","len":262152},"recovery92":{"type":"PAR 2.0\u0000RecvSlic","md5":"82d7d59990475c76797adafe0dc40696","len":262152},"recovery93":{"type":"PAR 2.0\u0000RecvSlic","md5":"097f1e8dd4ccb3592b8da5723ef59cd4","len":262152},"recovery94":{"type":"PAR 2.0\u0000RecvSlic","md5":"ded7762317423bd6ba8c1c70ee003895","len":262152},"recovery95":{"type":"PAR 
2.0\u0000RecvSlic","md5":"b20168baef01265ca59af7749a004f19","len":262152},"recovery96":{"type":"PAR 2.0\u0000RecvSlic","md5":"f2a61d2c37afd4e32380c5f974c718c4","len":262152},"recovery97":{"type":"PAR 2.0\u0000RecvSlic","md5":"c505c8d454e6bd6805c19983c4b3becb","len":262152},"recovery98":{"type":"PAR 2.0\u0000RecvSlic","md5":"451a64afafa5742e04a006595e3b76ed","len":262152},"recovery99":{"type":"PAR 2.0\u0000RecvSlic","md5":"27122f17a65d56f09a5a99ec15023906","len":262152},"recovery100":{"type":"PAR 2.0\u0000RecvSlic","md5":"dd77e86b9c6a708f0fdef896794e1416","len":262152},"recovery101":{"type":"PAR 2.0\u0000RecvSlic","md5":"4b66fb1a91751f8fa60a26d01266773b","len":262152},"recovery102":{"type":"PAR 2.0\u0000RecvSlic","md5":"17396f15a240e560fd0abbf793cff777","len":262152},"recovery103":{"type":"PAR 2.0\u0000RecvSlic","md5":"db3e3eefb31e2e13de7f6b792d08fab6","len":262152},"recovery104":{"type":"PAR 2.0\u0000RecvSlic","md5":"96c045e794657f81be4f550931d09f86","len":262152},"recovery105":{"type":"PAR 2.0\u0000RecvSlic","md5":"8b2d795b329310ade27c930c0c9ac3a5","len":262152},"recovery106":{"type":"PAR 2.0\u0000RecvSlic","md5":"5429af73f35f0c3aec714c0c2bb6fd63","len":262152},"recovery107":{"type":"PAR 2.0\u0000RecvSlic","md5":"ba08b07cbfaa853ad84d02e16862b0c4","len":262152},"recovery108":{"type":"PAR 2.0\u0000RecvSlic","md5":"73a6530cfe5bba58e1fec23eabeb8e6f","len":262152},"recovery109":{"type":"PAR 2.0\u0000RecvSlic","md5":"01eb64dc438d15b3d37ad07a751ba19e","len":262152},"recovery110":{"type":"PAR 2.0\u0000RecvSlic","md5":"055ff87be7d79d6a503ec16287cd7e1d","len":262152},"recovery111":{"type":"PAR 2.0\u0000RecvSlic","md5":"f1d7485cfe6bceb4eaba44d23362a168","len":262152},"recovery112":{"type":"PAR 2.0\u0000RecvSlic","md5":"60f1e6f7d1317bf376fb6c3fd6470db0","len":262152},"recovery113":{"type":"PAR 2.0\u0000RecvSlic","md5":"371f5d440f6fb38a8dbd1bb08426d734","len":262152},"recovery114":{"type":"PAR 
2.0\u0000RecvSlic","md5":"1088a949951fc9d698682d724e67dd7b","len":262152},"recovery115":{"type":"PAR 2.0\u0000RecvSlic","md5":"e4a1947699b7877aca19f2e9ba66d3a6","len":262152},"recovery116":{"type":"PAR 2.0\u0000RecvSlic","md5":"39aecd2559c82e37e66328bcd8ece41d","len":262152},"recovery117":{"type":"PAR 2.0\u0000RecvSlic","md5":"764fab8b1ebda52512b3bbeefb9a9bc2","len":262152},"recovery118":{"type":"PAR 2.0\u0000RecvSlic","md5":"b9be02bf929a30beec27ada23f59b6a3","len":262152},"recovery119":{"type":"PAR 2.0\u0000RecvSlic","md5":"03c4ba2e7b40c211a91300091cdfaea1","len":262152},"recovery120":{"type":"PAR 2.0\u0000RecvSlic","md5":"7f161539917d7aed2bdcfc6a97bf1243","len":262152},"recovery121":{"type":"PAR 2.0\u0000RecvSlic","md5":"06cb8f2d83a03ad41d991c95cd93df59","len":262152},"recovery122":{"type":"PAR 2.0\u0000RecvSlic","md5":"03c36c76eeaf4c7c3d669133896b673e","len":262152},"recovery123":{"type":"PAR 2.0\u0000RecvSlic","md5":"2701161fade73a745dfe8fc06d097aa5","len":262152},"recovery124":{"type":"PAR 2.0\u0000RecvSlic","md5":"02579b843bcf0750cf7595c84c4a1b2c","len":262152},"recovery125":{"type":"PAR 2.0\u0000RecvSlic","md5":"8af83a50b1e0c00dc811d24090466dc6","len":262152},"recovery126":{"type":"PAR 2.0\u0000RecvSlic","md5":"5fbcd8d0454c6674bd236ccfab72f784","len":262152},"recovery127":{"type":"PAR 2.0\u0000RecvSlic","md5":"2aae76e0ee91e98cfcad8b6edf64b30a","len":262152},"recovery128":{"type":"PAR 2.0\u0000RecvSlic","md5":"3c80e0a3b4aced780158b2cda147025b","len":262152},"recovery129":{"type":"PAR 2.0\u0000RecvSlic","md5":"0d714844585e05d9b8ecca9b3ce144e2","len":262152},"recovery130":{"type":"PAR 2.0\u0000RecvSlic","md5":"3ff70a539d48bf46bddc4a4d604ef122","len":262152},"recovery131":{"type":"PAR 2.0\u0000RecvSlic","md5":"ac9e11b7553248cc9cab4ffa4b0eadc0","len":262152},"recovery132":{"type":"PAR 2.0\u0000RecvSlic","md5":"a338dfa5fb3dfda5d0844ccf33b7a0a0","len":262152},"recovery133":{"type":"PAR 
2.0\u0000RecvSlic","md5":"55816e85a627b359f1ce6abcf07a80f8","len":262152},"recovery134":{"type":"PAR 2.0\u0000RecvSlic","md5":"62989b2c6a48c883b7153f8841c57fbf","len":262152},"recovery135":{"type":"PAR 2.0\u0000RecvSlic","md5":"0d714d1f7c84c36bba5211a441322f1f","len":262152},"recovery136":{"type":"PAR 2.0\u0000RecvSlic","md5":"fe16f8e7183642779c2fda0bdd2e69c7","len":262152},"recovery137":{"type":"PAR 2.0\u0000RecvSlic","md5":"959e3ed520135276ce507bfb974170e5","len":262152},"recovery138":{"type":"PAR 2.0\u0000RecvSlic","md5":"38755e49834c707a18ebc78153dd309f","len":262152},"recovery139":{"type":"PAR 2.0\u0000RecvSlic","md5":"861c385818cc82e20538517a860ac822","len":262152},"recovery140":{"type":"PAR 2.0\u0000RecvSlic","md5":"573401fb45c72080a5fbe59575daf1f0","len":262152},"recovery141":{"type":"PAR 2.0\u0000RecvSlic","md5":"cbfa6d65eeffb234b4b35bb0101813c2","len":262152},"recovery142":{"type":"PAR 2.0\u0000RecvSlic","md5":"a915d1e365b2f6667b561f15074c246f","len":262152},"recovery143":{"type":"PAR 2.0\u0000RecvSlic","md5":"aaa5d236c4b0ab8bca3837357a676828","len":262152},"recovery144":{"type":"PAR 2.0\u0000RecvSlic","md5":"a5e825e51919a1526298b2a63a67cd18","len":262152},"recovery145":{"type":"PAR 2.0\u0000RecvSlic","md5":"935be6ea3de1a2b7662868af15db66ae","len":262152},"recovery146":{"type":"PAR 2.0\u0000RecvSlic","md5":"20927e4c32ca839d9bc2e32dda3b88bb","len":262152},"recovery147":{"type":"PAR 2.0\u0000RecvSlic","md5":"734b9f415296856d8a9935673a6359ce","len":262152},"recovery148":{"type":"PAR 2.0\u0000RecvSlic","md5":"00df8edd547ee5f396ff09c1b0c8a979","len":262152},"recovery149":{"type":"PAR 2.0\u0000RecvSlic","md5":"2c7d4748b05c963294d008bcd4d9abd4","len":262152},"recovery150":{"type":"PAR 2.0\u0000RecvSlic","md5":"136eefb57c6163ba9ee4361bc30a946b","len":262152},"recovery151":{"type":"PAR 2.0\u0000RecvSlic","md5":"eb45e084b5f5d3c1659825791ebe9dcf","len":262152},"recovery152":{"type":"PAR 
2.0\u0000RecvSlic","md5":"b1167f13fd2ab1a49de63f0c23d97b30","len":262152},"recovery153":{"type":"PAR 2.0\u0000RecvSlic","md5":"ec60aca69736e5c01c0bc0a4a5fc20a7","len":262152},"recovery154":{"type":"PAR 2.0\u0000RecvSlic","md5":"1eeecfb1e63674b87f0d47736f77cf8a","len":262152},"recovery155":{"type":"PAR 2.0\u0000RecvSlic","md5":"d24ea57a9b206855ce59c28a29a609b5","len":262152},"recovery156":{"type":"PAR 2.0\u0000RecvSlic","md5":"adaad933db3226d3778da61397198f17","len":262152},"recovery157":{"type":"PAR 2.0\u0000RecvSlic","md5":"36e8c301bcbcb253546bb3672400f0f2","len":262152},"recovery158":{"type":"PAR 2.0\u0000RecvSlic","md5":"acb88c3d8b4676e5b2a1d07240ecbf3a","len":262152},"recovery159":{"type":"PAR 2.0\u0000RecvSlic","md5":"6ec97320f986ffc6dc8884d762784e64","len":262152},"recovery160":{"type":"PAR 2.0\u0000RecvSlic","md5":"ba205e752f5743e7fb56f91159f11638","len":262152},"recovery161":{"type":"PAR 2.0\u0000RecvSlic","md5":"ee0bc353e44cb3a0a6674bebb8f2b02f","len":262152},"recovery162":{"type":"PAR 2.0\u0000RecvSlic","md5":"4ffb47c96191de05ee92d65d17e6d1b5","len":262152},"recovery163":{"type":"PAR 2.0\u0000RecvSlic","md5":"cab383c52fd7edf43c813a02b0e9f715","len":262152},"recovery164":{"type":"PAR 2.0\u0000RecvSlic","md5":"7579d93610965f3fa9d5380b8a05d179","len":262152},"recovery165":{"type":"PAR 2.0\u0000RecvSlic","md5":"79ebb790e0f5b3878e8efa9e31a1ec2f","len":262152},"recovery166":{"type":"PAR 2.0\u0000RecvSlic","md5":"d98d9e9fcaeab7644965287fdfa14927","len":262152},"recovery167":{"type":"PAR 2.0\u0000RecvSlic","md5":"5c4d552661fe8a972e9a3451829f467f","len":262152},"recovery168":{"type":"PAR 2.0\u0000RecvSlic","md5":"e48d561f1fbc700b8c676ee59c4a9f5d","len":262152},"recovery169":{"type":"PAR 2.0\u0000RecvSlic","md5":"585373bf1021202ac1a871dd1325e797","len":262152},"recovery170":{"type":"PAR 2.0\u0000RecvSlic","md5":"ab371b66e23bf6b2f143a97162857c7f","len":262152},"recovery171":{"type":"PAR 
2.0\u0000RecvSlic","md5":"e73956ff54bc05bd6f269456952de3f1","len":262152},"recovery172":{"type":"PAR 2.0\u0000RecvSlic","md5":"1908334591c7199fb59474e189631053","len":262152},"recovery173":{"type":"PAR 2.0\u0000RecvSlic","md5":"c2fe26625667e1d867eb435a7b542044","len":262152},"recovery174":{"type":"PAR 2.0\u0000RecvSlic","md5":"54a87b2c409465195efcac62cf90c0d0","len":262152},"recovery175":{"type":"PAR 2.0\u0000RecvSlic","md5":"b067362a194e73030224dcb460b406f5","len":262152},"recovery176":{"type":"PAR 2.0\u0000RecvSlic","md5":"ce671aa8082cea7d459d1f23b43ed566","len":262152},"recovery177":{"type":"PAR 2.0\u0000RecvSlic","md5":"37770253e086905a8273fa7681e120eb","len":262152},"recovery178":{"type":"PAR 2.0\u0000RecvSlic","md5":"891a00e3973ede2824a25a87243e8847","len":262152},"recovery179":{"type":"PAR 2.0\u0000RecvSlic","md5":"44cb5fa5db70c22cb3d382bf4fe76924","len":262152},"recovery180":{"type":"PAR 2.0\u0000RecvSlic","md5":"27ca455202140b26fb48af7db5c559eb","len":262152},"recovery181":{"type":"PAR 2.0\u0000RecvSlic","md5":"7ac3a13daccd4e334fb2cb4ba8806931","len":262152},"recovery182":{"type":"PAR 2.0\u0000RecvSlic","md5":"6d218c46b445edd8b584b1079aea762c","len":262152},"recovery183":{"type":"PAR 2.0\u0000RecvSlic","md5":"c288a78496f8e466bc2ca2555ab57651","len":262152},"recovery184":{"type":"PAR 2.0\u0000RecvSlic","md5":"2e2d494f19d7e12728b2813959f363b9","len":262152},"recovery185":{"type":"PAR 2.0\u0000RecvSlic","md5":"7ac64828b943e917b0b327ba4e3e1d7a","len":262152},"recovery186":{"type":"PAR 2.0\u0000RecvSlic","md5":"64149d71b23f3b48601a1f66d38d2ce9","len":262152},"recovery187":{"type":"PAR 2.0\u0000RecvSlic","md5":"cb81ff2511a7b5d0cd9f4c88ebef6f4b","len":262152},"recovery188":{"type":"PAR 2.0\u0000RecvSlic","md5":"6fc5b1e6e008764933bb831afe224ea7","len":262152},"recovery189":{"type":"PAR 2.0\u0000RecvSlic","md5":"7ad66c7aeb321475b766456499f641ea","len":262152},"recovery190":{"type":"PAR 
2.0\u0000RecvSlic","md5":"15a3793c1c67f7a9cf718d331d34f41d","len":262152},"recovery191":{"type":"PAR 2.0\u0000RecvSlic","md5":"40d0e37fbc8aa71fb21b18da1b7d025c","len":262152},"recovery192":{"type":"PAR 2.0\u0000RecvSlic","md5":"c650dffc875af3e180761d3eef8994c9","len":262152},"recovery193":{"type":"PAR 2.0\u0000RecvSlic","md5":"4c22cdc5d78c408671a55c724affe633","len":262152},"recovery194":{"type":"PAR 2.0\u0000RecvSlic","md5":"80e4eca09aeed7823945af8434d4ccd8","len":262152},"recovery195":{"type":"PAR 2.0\u0000RecvSlic","md5":"3fe2b6e3f848a643fd8e77a05d795486","len":262152},"recovery196":{"type":"PAR 2.0\u0000RecvSlic","md5":"9ab3653b65ae244124b97817498673f0","len":262152},"recovery197":{"type":"PAR 2.0\u0000RecvSlic","md5":"3499ed0cd0b8e789b4622ebc71c49e1b","len":262152},"recovery198":{"type":"PAR 2.0\u0000RecvSlic","md5":"853ea439016d879ca104b2ea5621c320","len":262152},"recovery199":{"type":"PAR 2.0\u0000RecvSlic","md5":"99af3d84da4967f583714a44b8c5129b","len":262152},"creator":{"type":"PAR 2.0\u0000Creator","md5":"75978a963bad01ec4845ee37a32b8523","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"a1ab3aa1dd29953af5f118e683a1ebef","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"ba3580d34dbf456def2775e5f47fe32f","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"1883c8c7fa2b7da5978e4b5199f6e168","len":5220},"creator":{"type":"PAR 2.0\u0000Creator","md5":"75978a963bad01ec4845ee37a32b8523","len":104}}],"1":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"6281dd7c3c3938b3e6185a0477e6e287","len":65608},"main":{"type":"PAR 2.0\u0000Main","md5":"0b675e25887343203a39a3e2c8d1ed28","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"67c7299fe12d06356026566cc7084417","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"1ae47841abe78267f3850e465307d555","len":20560},"creator":{"type":"PAR 
2.0\u0000Creator","md5":"498e695c1f6fd9d63f95c51ad93d14b0","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"0b675e25887343203a39a3e2c8d1ed28","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"67c7299fe12d06356026566cc7084417","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"1ae47841abe78267f3850e465307d555","len":20560},"creator":{"type":"PAR 2.0\u0000Creator","md5":"498e695c1f6fd9d63f95c51ad93d14b0","len":104}}],"2":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"80ee3e17eb31ff1aeb2b4b1299f54110","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"8818ce3d940fd2ee906eb7372a9d8706","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"5d6e51774704c147c0eb2b21d63f8d6c","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"f90490a6bce853105ed49210ed38c6de","len":1360},"creator":{"type":"PAR 2.0\u0000Creator","md5":"1762e1d053a8628c6311f1cfa802c0f4","len":104}},{"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"47d39160c9dd187d9602b395a9960adc","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"8818ce3d940fd2ee906eb7372a9d8706","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"5d6e51774704c147c0eb2b21d63f8d6c","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"f90490a6bce853105ed49210ed38c6de","len":1360},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"b2fce1dc1ba509aae7225e828a735ca3","len":1048644},"creator":{"type":"PAR 2.0\u0000Creator","md5":"1762e1d053a8628c6311f1cfa802c0f4","len":104}},{"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"93c8615d044e46d375da63c6e5c6999e","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"8818ce3d940fd2ee906eb7372a9d8706","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"5d6e51774704c147c0eb2b21d63f8d6c","len":132},"recovery4":{"type":"PAR 
2.0\u0000RecvSlic","md5":"fcef287602419071bdf23c06a665b7dd","len":1048644},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"f90490a6bce853105ed49210ed38c6de","len":1360},"recovery5":{"type":"PAR 2.0\u0000RecvSlic","md5":"73901dbf207c0a3ce23b257df0c90f1c","len":1048644},"recovery6":{"type":"PAR 2.0\u0000RecvSlic","md5":"4e455419ba916f25c94ac35c7fdf339b","len":1048644},"creator":{"type":"PAR 2.0\u0000Creator","md5":"1762e1d053a8628c6311f1cfa802c0f4","len":104}},{"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"30b4e3c59fa3f4dfce5440d6494f2eed","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"8818ce3d940fd2ee906eb7372a9d8706","len":92},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"7868a53c236c0bdb9cd3a2f2a167766b","len":1048644},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"5d6e51774704c147c0eb2b21d63f8d6c","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"f90490a6bce853105ed49210ed38c6de","len":1360},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"5081c501ed7ef9de8960c89020e87192","len":1048644},"recovery10":{"type":"PAR 2.0\u0000RecvSlic","md5":"3984f758117ba91e1a6f4a4a0371d017","len":1048644},"recovery11":{"type":"PAR 2.0\u0000RecvSlic","md5":"12b685ecaae6d8aa9e6a993fdfd07e0c","len":1048644},"recovery12":{"type":"PAR 2.0\u0000RecvSlic","md5":"f50c1e1f38f49140b7a0601896740fa1","len":1048644},"recovery13":{"type":"PAR 2.0\u0000RecvSlic","md5":"c6f795b7c34ab97e65dd3bca71698c5c","len":1048644},"recovery14":{"type":"PAR 2.0\u0000RecvSlic","md5":"96ec24936af60d8ae873945cf38b9091","len":1048644},"creator":{"type":"PAR 2.0\u0000Creator","md5":"1762e1d053a8628c6311f1cfa802c0f4","len":104}},{"recovery15":{"type":"PAR 2.0\u0000RecvSlic","md5":"73a8d86b450e2da2d84a606e4bf97079","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"8818ce3d940fd2ee906eb7372a9d8706","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 
2.0\u0000FileDesc","md5":"5d6e51774704c147c0eb2b21d63f8d6c","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"f90490a6bce853105ed49210ed38c6de","len":1360},"recovery16":{"type":"PAR 2.0\u0000RecvSlic","md5":"c867457ea6e5d2a1cf07091126ab98eb","len":1048644},"creator":{"type":"PAR 2.0\u0000Creator","md5":"1762e1d053a8628c6311f1cfa802c0f4","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"8818ce3d940fd2ee906eb7372a9d8706","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"5d6e51774704c147c0eb2b21d63f8d6c","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"f90490a6bce853105ed49210ed38c6de","len":1360},"creator":{"type":"PAR 2.0\u0000Creator","md5":"1762e1d053a8628c6311f1cfa802c0f4","len":104}}],"3":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"74520873b88f74d1de0e6b8f5c54006f","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"52818a8855a4f9452bf7505f73351c79","len":124},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"ac1164d1c1d45c3e959976aa8d9c82ed","len":1048644},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"69beb9ebafc6b9d17ebd20d73291f900","len":132},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"09a680944b9e3751e0397455e7f0561b","len":1048644},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"fce642a756dfbd6298ced47a6364adf3","len":132},"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"cd4b284c1cfb31f1f9dc4d2cb3b46f74","len":1048644},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"33eccf584ac19db13bb5c10e6ee0bac1","len":132},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"f86624cb3556a6c9ad0f97831984fd4b","len":1048644},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"cdd697b1320c885fa1b315c0e1913670","len":1360},"recovery5":{"type":"PAR 
2.0\u0000RecvSlic","md5":"0282fb1156b97afeb15889463fe6fb38","len":1048644},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"532e5c58590e5efb9f6e36fb3dbaaeaa","len":100},"recovery6":{"type":"PAR 2.0\u0000RecvSlic","md5":"f4366cec10c3c406ecf39967a3488d00","len":1048644},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"f1e08faaba02c5163b2ae24f2c4d84e1","len":100},"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"debbeee2d37ef4bb964a68adfda5a836","len":1048644},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"1ca06454a70ccef99e374792f85ccb8c","len":1048644},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"f38a39a88bfcf792866e55a8103347d7","len":1048644},"recovery10":{"type":"PAR 2.0\u0000RecvSlic","md5":"962eb1b20273dc63ac3e1afa4995ab10","len":1048644},"recovery11":{"type":"PAR 2.0\u0000RecvSlic","md5":"58f929cac57b0a76a830ea74a0eac1c4","len":1048644},"recovery12":{"type":"PAR 2.0\u0000RecvSlic","md5":"8b6c986373a5068a2afc6857243f2b7f","len":1048644},"recovery13":{"type":"PAR 2.0\u0000RecvSlic","md5":"22bcda081f15c8d4ae9297dfe0ac4f5e","len":1048644},"recovery14":{"type":"PAR 2.0\u0000RecvSlic","md5":"ee302032bc01cd26256dbf9e984d5f6f","len":1048644},"recovery15":{"type":"PAR 2.0\u0000RecvSlic","md5":"084d81c51717cf4b55eb2510bfab58eb","len":1048644},"recovery16":{"type":"PAR 2.0\u0000RecvSlic","md5":"ac7539f5e66f2a7c029b596fa2ef20fa","len":1048644},"recovery17":{"type":"PAR 2.0\u0000RecvSlic","md5":"749080994e1077637af1a102f5734400","len":1048644},"recovery18":{"type":"PAR 2.0\u0000RecvSlic","md5":"b66e22198f11e1eb81c03fb83ac9f243","len":1048644},"recovery19":{"type":"PAR 2.0\u0000RecvSlic","md5":"6b2a3aaf0a6af110d986adc5dcbbb4d9","len":1048644},"recovery20":{"type":"PAR 2.0\u0000RecvSlic","md5":"b4c20ebe89a5409958eeab01b7fdc6d7","len":1048644},"recovery21":{"type":"PAR 2.0\u0000RecvSlic","md5":"41d9af908d53898507506f5bff21a546","len":1048644},"recovery22":{"type":"PAR 
2.0\u0000RecvSlic","md5":"693981d690733efd35c8594f2b88afe3","len":1048644},"recovery23":{"type":"PAR 2.0\u0000RecvSlic","md5":"9b5b5257da59294665c41eb6ef8d6d72","len":1048644},"recovery24":{"type":"PAR 2.0\u0000RecvSlic","md5":"bd20bc56ef956ab4ae3bccead46ff165","len":1048644},"recovery25":{"type":"PAR 2.0\u0000RecvSlic","md5":"6156ade2b63b682f8719e01cd98802c9","len":1048644},"recovery26":{"type":"PAR 2.0\u0000RecvSlic","md5":"ff7336302eea1d8c43a55084ddfbbf3b","len":1048644},"recovery27":{"type":"PAR 2.0\u0000RecvSlic","md5":"5d0621c20b187eced33a3de5d14fa1b0","len":1048644},"recovery28":{"type":"PAR 2.0\u0000RecvSlic","md5":"9c492e6e9078f3d5221d0de69a638ac2","len":1048644},"recovery29":{"type":"PAR 2.0\u0000RecvSlic","md5":"e4b0c879fe80608a1ff5f460330ecb48","len":1048644},"recovery30":{"type":"PAR 2.0\u0000RecvSlic","md5":"63516826cd3746a78cd797f5a849bdb4","len":1048644},"recovery31":{"type":"PAR 2.0\u0000RecvSlic","md5":"35cce3895f113abc206b169fcee3dd0b","len":1048644},"recovery32":{"type":"PAR 2.0\u0000RecvSlic","md5":"97f39c926596f591f608439ec2d6e50b","len":1048644},"recovery33":{"type":"PAR 2.0\u0000RecvSlic","md5":"4e6dcf153973245f9f41f2f6d68dcf3e","len":1048644},"recovery34":{"type":"PAR 2.0\u0000RecvSlic","md5":"ff89baf219572f14f59a7f08a193c155","len":1048644},"recovery35":{"type":"PAR 2.0\u0000RecvSlic","md5":"d72a195ba2006e3d70e3d970d9710daa","len":1048644},"recovery36":{"type":"PAR 2.0\u0000RecvSlic","md5":"7461a500af7f1e129bc9d347a60c0a62","len":1048644},"recovery37":{"type":"PAR 2.0\u0000RecvSlic","md5":"dbb3c6a53d9d9ca3e152ecb21f5cefe9","len":1048644},"recovery38":{"type":"PAR 2.0\u0000RecvSlic","md5":"89b986d2b3aa9550a1d32f046436cc23","len":1048644},"recovery39":{"type":"PAR 2.0\u0000RecvSlic","md5":"0f0454501952dac6bc2d1242629bd829","len":1048644},"creator":{"type":"PAR 2.0\u0000Creator","md5":"fa5bf7092a30b550e90dbb486dc0445a","len":104}},{"main":{"type":"PAR 
2.0\u0000Main","md5":"52818a8855a4f9452bf7505f73351c79","len":124},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"69beb9ebafc6b9d17ebd20d73291f900","len":132},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"fce642a756dfbd6298ced47a6364adf3","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"33eccf584ac19db13bb5c10e6ee0bac1","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"cdd697b1320c885fa1b315c0e1913670","len":1360},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"532e5c58590e5efb9f6e36fb3dbaaeaa","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"f1e08faaba02c5163b2ae24f2c4d84e1","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"fa5bf7092a30b550e90dbb486dc0445a","len":104}}],"4":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"edd1ade340f0f983fbfbcf844d801cda","len":2097304},"main":{"type":"PAR 2.0\u0000Main","md5":"09dfad62ef619decf35f32c2bf0a1522","len":124},"desc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000FileDesc","md5":"b10d9aba7502ed7811bb25f22caa8e89","len":132},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"a245487ad0d095cfd6add75baa138e1f","len":132},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"dd6685f87867e9c3b0990dd40ba8fc1d","len":2097304},"desc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000FileDesc","md5":"7c84d68381da44721063e3c93bae8301","len":132},"ifsc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000IFSC","md5":"49d38647c2861e08ba2cb852dfe4062b","len":100},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"eb959263385ec801a008a8adb7bc86ec","len":100},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"6fe15849c49662577aea77449e41e12e","len":2097304},"ifsc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 
2.0\u0000IFSC","md5":"e35965c8be6de77d07426b0d067b3626","len":220},"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"720cf3d8b68b0312ba6972859c9adac5","len":2097304},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"4588d83af606eb1601a1b5ff510b7879","len":2097304},"recovery5":{"type":"PAR 2.0\u0000RecvSlic","md5":"2f302e2675b82a7e510046d3053d9919","len":2097304},"recovery6":{"type":"PAR 2.0\u0000RecvSlic","md5":"dfe8e1351ed2ac4376d0309aa01301b1","len":2097304},"creator":{"type":"PAR 2.0\u0000Creator","md5":"f00c87d06d608c53d16e6cbe3735a766","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"09dfad62ef619decf35f32c2bf0a1522","len":124},"desc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000FileDesc","md5":"b10d9aba7502ed7811bb25f22caa8e89","len":132},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"a245487ad0d095cfd6add75baa138e1f","len":132},"desc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000FileDesc","md5":"7c84d68381da44721063e3c93bae8301","len":132},"ifsc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000IFSC","md5":"49d38647c2861e08ba2cb852dfe4062b","len":100},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"eb959263385ec801a008a8adb7bc86ec","len":100},"ifsc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000IFSC","md5":"e35965c8be6de77d07426b0d067b3626","len":220},"creator":{"type":"PAR 2.0\u0000Creator","md5":"f00c87d06d608c53d16e6cbe3735a766","len":104}}],"5":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"227b1235272f1640a2f3f1f5ac13109f","len":4194372},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"4b43e42ea2ec0617637ab5e1cfb98c0d","len":4194372},"main":{"type":"PAR 2.0\u0000Main","md5":"76a2c8053eca00b765ada9fdcde12245","len":92},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"612f84ace10d12ed08b83ccdd9d66511","len":4194372},"recovery3":{"type":"PAR 
2.0\u0000RecvSlic","md5":"f3595e2157d1f7b0864a88e07c1f5442","len":4194372},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"49439ce8a8009014fd812470983aedfe","len":132},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"e1ca7d5e9064e21e1777d44aacf4fb30","len":4194372},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"3da3ef1c9cde24fce3e996add1d608a9","len":400},"recovery5":{"type":"PAR 2.0\u0000RecvSlic","md5":"c1a5cf7498aaafb919379feccf77ad64","len":4194372},"recovery6":{"type":"PAR 2.0\u0000RecvSlic","md5":"e1de02388438296cf7a44cdef5c38856","len":4194372},"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"1283885f4c351acdab8eef7df7258942","len":4194372},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"0f9d71ae23872e39f122c02e3f403cbc","len":4194372},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"662220713f645187c50d0f950bafc747","len":4194372},"recovery10":{"type":"PAR 2.0\u0000RecvSlic","md5":"63bafc1c7d16744ec4904235621d3be1","len":4194372},"recovery11":{"type":"PAR 2.0\u0000RecvSlic","md5":"d67b2fdd14e40510d595897300ff8889","len":4194372},"recovery12":{"type":"PAR 2.0\u0000RecvSlic","md5":"50ef19310cad87061283fa3ce62188a3","len":4194372},"recovery13":{"type":"PAR 2.0\u0000RecvSlic","md5":"9b79e5fac9942dc229eef88d1b65c15c","len":4194372},"recovery14":{"type":"PAR 2.0\u0000RecvSlic","md5":"dfc8f3cf562f919f39e681a0207afa00","len":4194372},"recovery15":{"type":"PAR 2.0\u0000RecvSlic","md5":"80e536d4be26737e0f2fb2941c652b9b","len":4194372},"recovery16":{"type":"PAR 2.0\u0000RecvSlic","md5":"e9afe78529cbfaffceba5e2601f13279","len":4194372},"recovery17":{"type":"PAR 2.0\u0000RecvSlic","md5":"1240e4446fc41a8805e0b328ff3fef7c","len":4194372},"recovery18":{"type":"PAR 2.0\u0000RecvSlic","md5":"7d6562c315684126e67926609d6ea1c7","len":4194372},"recovery19":{"type":"PAR 2.0\u0000RecvSlic","md5":"ddeef17620d1515a91bb6d195e627438","len":4194372},"recovery20":{"type":"PAR 
2.0\u0000RecvSlic","md5":"e03cdfc92253706ac7392426134ce613","len":4194372},"recovery21":{"type":"PAR 2.0\u0000RecvSlic","md5":"05d9de37cb8619790dabc4686a6465ed","len":4194372},"recovery22":{"type":"PAR 2.0\u0000RecvSlic","md5":"ddd9a65e990a88b69ba76360900162bd","len":4194372},"recovery23":{"type":"PAR 2.0\u0000RecvSlic","md5":"79a1dac3c4b04b02bd63db1169606f96","len":4194372},"creator":{"type":"PAR 2.0\u0000Creator","md5":"4b6de922b9d6c30abe117f55842b19ef","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"76a2c8053eca00b765ada9fdcde12245","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"49439ce8a8009014fd812470983aedfe","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"3da3ef1c9cde24fce3e996add1d608a9","len":400},"creator":{"type":"PAR 2.0\u0000Creator","md5":"4b6de922b9d6c30abe117f55842b19ef","len":104}}],"6":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"74520873b88f74d1de0e6b8f5c54006f","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"52818a8855a4f9452bf7505f73351c79","len":124},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"ac1164d1c1d45c3e959976aa8d9c82ed","len":1048644},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"69beb9ebafc6b9d17ebd20d73291f900","len":132},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"09a680944b9e3751e0397455e7f0561b","len":1048644},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"fce642a756dfbd6298ced47a6364adf3","len":132},"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"cd4b284c1cfb31f1f9dc4d2cb3b46f74","len":1048644},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"33eccf584ac19db13bb5c10e6ee0bac1","len":132},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"f86624cb3556a6c9ad0f97831984fd4b","len":1048644},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 
2.0\u0000IFSC","md5":"cdd697b1320c885fa1b315c0e1913670","len":1360},"recovery5":{"type":"PAR 2.0\u0000RecvSlic","md5":"0282fb1156b97afeb15889463fe6fb38","len":1048644},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"532e5c58590e5efb9f6e36fb3dbaaeaa","len":100},"recovery6":{"type":"PAR 2.0\u0000RecvSlic","md5":"f4366cec10c3c406ecf39967a3488d00","len":1048644},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"f1e08faaba02c5163b2ae24f2c4d84e1","len":100},"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"debbeee2d37ef4bb964a68adfda5a836","len":1048644},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"1ca06454a70ccef99e374792f85ccb8c","len":1048644},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"f38a39a88bfcf792866e55a8103347d7","len":1048644},"recovery10":{"type":"PAR 2.0\u0000RecvSlic","md5":"962eb1b20273dc63ac3e1afa4995ab10","len":1048644},"recovery11":{"type":"PAR 2.0\u0000RecvSlic","md5":"58f929cac57b0a76a830ea74a0eac1c4","len":1048644},"recovery12":{"type":"PAR 2.0\u0000RecvSlic","md5":"8b6c986373a5068a2afc6857243f2b7f","len":1048644},"recovery13":{"type":"PAR 2.0\u0000RecvSlic","md5":"22bcda081f15c8d4ae9297dfe0ac4f5e","len":1048644},"recovery14":{"type":"PAR 2.0\u0000RecvSlic","md5":"ee302032bc01cd26256dbf9e984d5f6f","len":1048644},"recovery15":{"type":"PAR 2.0\u0000RecvSlic","md5":"084d81c51717cf4b55eb2510bfab58eb","len":1048644},"recovery16":{"type":"PAR 2.0\u0000RecvSlic","md5":"ac7539f5e66f2a7c029b596fa2ef20fa","len":1048644},"recovery17":{"type":"PAR 2.0\u0000RecvSlic","md5":"749080994e1077637af1a102f5734400","len":1048644},"recovery18":{"type":"PAR 2.0\u0000RecvSlic","md5":"b66e22198f11e1eb81c03fb83ac9f243","len":1048644},"recovery19":{"type":"PAR 2.0\u0000RecvSlic","md5":"6b2a3aaf0a6af110d986adc5dcbbb4d9","len":1048644},"recovery20":{"type":"PAR 2.0\u0000RecvSlic","md5":"b4c20ebe89a5409958eeab01b7fdc6d7","len":1048644},"recovery21":{"type":"PAR 
2.0\u0000RecvSlic","md5":"41d9af908d53898507506f5bff21a546","len":1048644},"recovery22":{"type":"PAR 2.0\u0000RecvSlic","md5":"693981d690733efd35c8594f2b88afe3","len":1048644},"recovery23":{"type":"PAR 2.0\u0000RecvSlic","md5":"9b5b5257da59294665c41eb6ef8d6d72","len":1048644},"recovery24":{"type":"PAR 2.0\u0000RecvSlic","md5":"bd20bc56ef956ab4ae3bccead46ff165","len":1048644},"recovery25":{"type":"PAR 2.0\u0000RecvSlic","md5":"6156ade2b63b682f8719e01cd98802c9","len":1048644},"recovery26":{"type":"PAR 2.0\u0000RecvSlic","md5":"ff7336302eea1d8c43a55084ddfbbf3b","len":1048644},"recovery27":{"type":"PAR 2.0\u0000RecvSlic","md5":"5d0621c20b187eced33a3de5d14fa1b0","len":1048644},"recovery28":{"type":"PAR 2.0\u0000RecvSlic","md5":"9c492e6e9078f3d5221d0de69a638ac2","len":1048644},"recovery29":{"type":"PAR 2.0\u0000RecvSlic","md5":"e4b0c879fe80608a1ff5f460330ecb48","len":1048644},"recovery30":{"type":"PAR 2.0\u0000RecvSlic","md5":"63516826cd3746a78cd797f5a849bdb4","len":1048644},"recovery31":{"type":"PAR 2.0\u0000RecvSlic","md5":"35cce3895f113abc206b169fcee3dd0b","len":1048644},"recovery32":{"type":"PAR 2.0\u0000RecvSlic","md5":"97f39c926596f591f608439ec2d6e50b","len":1048644},"recovery33":{"type":"PAR 2.0\u0000RecvSlic","md5":"4e6dcf153973245f9f41f2f6d68dcf3e","len":1048644},"recovery34":{"type":"PAR 2.0\u0000RecvSlic","md5":"ff89baf219572f14f59a7f08a193c155","len":1048644},"recovery35":{"type":"PAR 2.0\u0000RecvSlic","md5":"d72a195ba2006e3d70e3d970d9710daa","len":1048644},"recovery36":{"type":"PAR 2.0\u0000RecvSlic","md5":"7461a500af7f1e129bc9d347a60c0a62","len":1048644},"recovery37":{"type":"PAR 2.0\u0000RecvSlic","md5":"dbb3c6a53d9d9ca3e152ecb21f5cefe9","len":1048644},"recovery38":{"type":"PAR 2.0\u0000RecvSlic","md5":"89b986d2b3aa9550a1d32f046436cc23","len":1048644},"recovery39":{"type":"PAR 2.0\u0000RecvSlic","md5":"0f0454501952dac6bc2d1242629bd829","len":1048644},"creator":{"type":"PAR 
2.0\u0000Creator","md5":"fa5bf7092a30b550e90dbb486dc0445a","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"52818a8855a4f9452bf7505f73351c79","len":124},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"69beb9ebafc6b9d17ebd20d73291f900","len":132},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"fce642a756dfbd6298ced47a6364adf3","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"33eccf584ac19db13bb5c10e6ee0bac1","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"cdd697b1320c885fa1b315c0e1913670","len":1360},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"532e5c58590e5efb9f6e36fb3dbaaeaa","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"f1e08faaba02c5163b2ae24f2c4d84e1","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"fa5bf7092a30b550e90dbb486dc0445a","len":104}}],"7":[{"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"13d75e3ff2871a1f3554d0810d758a9b","len":12292},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"0d61059e69c4520710a1bf6157339902","len":12292},"main":{"type":"PAR 2.0\u0000Main","md5":"b546f34896688623139361b73f7466da","len":140},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"ad306d05c1f843b71793db9705949994","len":12292},"recovery10":{"type":"PAR 2.0\u0000RecvSlic","md5":"9df88ae2f95bb7ac4d7756cff03be42e","len":12292},"desc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000FileDesc","md5":"bdf7fd79dc9be3242911d615013b4675","len":132},"recovery11":{"type":"PAR 2.0\u0000RecvSlic","md5":"019ef2edd1afe0d9e3a4fae44fcc4dcb","len":12292},"recovery12":{"type":"PAR 2.0\u0000RecvSlic","md5":"9a296997ecfb7742bd8adcb4408f3b1e","len":12292},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"1074e6bd114064d4010cba8850279936","len":132},"recovery13":{"type":"PAR 
2.0\u0000RecvSlic","md5":"396efc5068f49b51ff1c78438defd032","len":12292},"recovery14":{"type":"PAR 2.0\u0000RecvSlic","md5":"e29e2769b588fdd6073366f0c69ba755","len":12292},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"6c37323d51a7cfcc68cd519a949ed0b9","len":132},"recovery15":{"type":"PAR 2.0\u0000RecvSlic","md5":"59175e7ac2d1d2364a469e6213bae609","len":12292},"desc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000FileDesc","md5":"10df8b27966f711989172bc5bb351909","len":132},"recovery16":{"type":"PAR 2.0\u0000RecvSlic","md5":"423e090521d2e6e445f3ab47a591f170","len":12292},"recovery17":{"type":"PAR 2.0\u0000RecvSlic","md5":"2ce148ae06221e99da47f28ef4fa0c2e","len":12292},"ifsc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000IFSC","md5":"939568408c85444683327985736473f1","len":200},"recovery18":{"type":"PAR 2.0\u0000RecvSlic","md5":"61c196fcab5efb2423e912493deb174f","len":12292},"recovery19":{"type":"PAR 2.0\u0000RecvSlic","md5":"2a95543e8ad6e15bd03ee56837995b56","len":12292},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"4b33cc66f96237779dac4a2920e5c36b","len":100},"recovery20":{"type":"PAR 2.0\u0000RecvSlic","md5":"3b48bff16a3a8eeebaef276c142aa66b","len":12292},"recovery21":{"type":"PAR 2.0\u0000RecvSlic","md5":"6018b4d628e4267080dfbfbaa11f8482","len":12292},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"7900826aa0cdcd35f5b1b6d309285106","len":100},"recovery22":{"type":"PAR 2.0\u0000RecvSlic","md5":"6355f957b5299352ceb471cc7aaab9de","len":12292},"recovery23":{"type":"PAR 2.0\u0000RecvSlic","md5":"b0f14355deaf0bae05b3a601788f952c","len":12292},"ifsc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000IFSC","md5":"89550dc87a9e0098a8035d643c7c1fa9","len":22400},"recovery24":{"type":"PAR 2.0\u0000RecvSlic","md5":"4a9553081f2bcb2568cc3d49e817c12d","len":12292},"recovery25":{"type":"PAR 
2.0\u0000RecvSlic","md5":"64c64c2c2aac4fa2cf1dc5abe9e79e40","len":12292},"recovery26":{"type":"PAR 2.0\u0000RecvSlic","md5":"11e3ecabd3e549dc7bd276d0ef06e4f2","len":12292},"recovery27":{"type":"PAR 2.0\u0000RecvSlic","md5":"045ba81e58ecde2ae2e5e10d6ff52deb","len":12292},"recovery28":{"type":"PAR 2.0\u0000RecvSlic","md5":"7c2a0122a173e9c24288c6fdff476f75","len":12292},"recovery29":{"type":"PAR 2.0\u0000RecvSlic","md5":"7fada7e5fd40793968feb8708724e76f","len":12292},"recovery30":{"type":"PAR 2.0\u0000RecvSlic","md5":"6051d8fff3ba7d1a9ef0b89dc3c099fe","len":12292},"recovery31":{"type":"PAR 2.0\u0000RecvSlic","md5":"a20b372c84071403679473808447e1d1","len":12292},"recovery32":{"type":"PAR 2.0\u0000RecvSlic","md5":"a073c7ab64609655101b3f9d43f83217","len":12292},"recovery33":{"type":"PAR 2.0\u0000RecvSlic","md5":"ef752e17c8404a77522e886979da238e","len":12292},"recovery34":{"type":"PAR 2.0\u0000RecvSlic","md5":"9489fae5a303376fcf21bdeca134a01e","len":12292},"recovery35":{"type":"PAR 2.0\u0000RecvSlic","md5":"cd47dc979bcdc24667a1743097d6d57b","len":12292},"recovery36":{"type":"PAR 2.0\u0000RecvSlic","md5":"6576e487f28903ad92ac1bb9afde1cfa","len":12292},"recovery37":{"type":"PAR 2.0\u0000RecvSlic","md5":"a77c05ca3f1390283d1fbb1c0332a0d0","len":12292},"recovery38":{"type":"PAR 2.0\u0000RecvSlic","md5":"3923f44f59d0ba6e84ce1d30a44962b7","len":12292},"recovery39":{"type":"PAR 2.0\u0000RecvSlic","md5":"d956db1617f11f33b3663c81e560bae8","len":12292},"recovery40":{"type":"PAR 2.0\u0000RecvSlic","md5":"740ded4b4c20fe1f6799961a9f4a863e","len":12292},"recovery41":{"type":"PAR 2.0\u0000RecvSlic","md5":"90781f2934736ba360ceb99d95652fde","len":12292},"recovery42":{"type":"PAR 2.0\u0000RecvSlic","md5":"9a4bdc4e888708e7639a073817e96be3","len":12292},"recovery43":{"type":"PAR 2.0\u0000RecvSlic","md5":"ad671f3f22846043eee50812b7841d19","len":12292},"recovery44":{"type":"PAR 2.0\u0000RecvSlic","md5":"370998274b1b0348d66f79f98f277707","len":12292},"recovery45":{"type":"PAR 
2.0\u0000RecvSlic","md5":"c8491de0561f52f26a01dd5b1fc0a680","len":12292},"recovery46":{"type":"PAR 2.0\u0000RecvSlic","md5":"9108adfad172584acad344c253d6407c","len":12292},"recovery47":{"type":"PAR 2.0\u0000RecvSlic","md5":"64d94e06c9c8076fb98148977aabada7","len":12292},"recovery48":{"type":"PAR 2.0\u0000RecvSlic","md5":"934a2da041551ca688d4cd65201de9de","len":12292},"recovery49":{"type":"PAR 2.0\u0000RecvSlic","md5":"47ad1b8f9e972ff43bc55d8a7c7c8b82","len":12292},"recovery50":{"type":"PAR 2.0\u0000RecvSlic","md5":"dff77ec66b2bb8226d40075e7dafd65d","len":12292},"recovery51":{"type":"PAR 2.0\u0000RecvSlic","md5":"947c5b6bded6872b18f8b0eafbb75a82","len":12292},"recovery52":{"type":"PAR 2.0\u0000RecvSlic","md5":"9dc6162f9bf3e41821ff1aa49b02fd7c","len":12292},"recovery53":{"type":"PAR 2.0\u0000RecvSlic","md5":"0936daced4a42c3bdd97bcf7bed560be","len":12292},"recovery54":{"type":"PAR 2.0\u0000RecvSlic","md5":"3d3733e28add7241ce6c160afc499514","len":12292},"recovery55":{"type":"PAR 2.0\u0000RecvSlic","md5":"3e457b7d118e652d35420859f0c2cb0c","len":12292},"recovery56":{"type":"PAR 2.0\u0000RecvSlic","md5":"89751e1de0aea82ecff902443fc29a65","len":12292},"recovery57":{"type":"PAR 2.0\u0000RecvSlic","md5":"0bcb7ff73a2b4185a25634b41fd282db","len":12292},"recovery58":{"type":"PAR 2.0\u0000RecvSlic","md5":"0051a97153a602a5da921ce4cc39ab8c","len":12292},"recovery59":{"type":"PAR 2.0\u0000RecvSlic","md5":"c33f2d2904d814872fe43dcb1ba83fa6","len":12292},"recovery60":{"type":"PAR 2.0\u0000RecvSlic","md5":"44d12ca49b89389205fa867b31ef821f","len":12292},"recovery61":{"type":"PAR 2.0\u0000RecvSlic","md5":"908fa6ddb6e46fbc79cd7e8f36fdc91c","len":12292},"recovery62":{"type":"PAR 2.0\u0000RecvSlic","md5":"e01263d2374aca3358fafd4f1e31dd50","len":12292},"recovery63":{"type":"PAR 2.0\u0000RecvSlic","md5":"d87a878179763b3b553850bcc3f37463","len":12292},"recovery64":{"type":"PAR 2.0\u0000RecvSlic","md5":"4748999b7a65523b2c40bdba325fa0d3","len":12292},"recovery65":{"type":"PAR 
2.0\u0000RecvSlic","md5":"abf213d33e195ca39e86262a57f5a335","len":12292},"recovery66":{"type":"PAR 2.0\u0000RecvSlic","md5":"ba9e9db844838a23d3a845cc180aa7a9","len":12292},"recovery67":{"type":"PAR 2.0\u0000RecvSlic","md5":"4598550a4ee7bb79427d2baa85ed5341","len":12292},"recovery68":{"type":"PAR 2.0\u0000RecvSlic","md5":"3aeb560438ea23e78d2d990b70579d84","len":12292},"recovery69":{"type":"PAR 2.0\u0000RecvSlic","md5":"b21c0f8558b48322af337d9502ff7410","len":12292},"recovery70":{"type":"PAR 2.0\u0000RecvSlic","md5":"f965345c3ebc04d23a8a7343d3f4573a","len":12292},"recovery71":{"type":"PAR 2.0\u0000RecvSlic","md5":"1ad9c5792fcc6ed1d0d6889b8067ebc1","len":12292},"recovery72":{"type":"PAR 2.0\u0000RecvSlic","md5":"0b73c50d6e875d041d47d371bebdeaff","len":12292},"recovery73":{"type":"PAR 2.0\u0000RecvSlic","md5":"d3bbd4599eaf7e264910b462fa59ee48","len":12292},"recovery74":{"type":"PAR 2.0\u0000RecvSlic","md5":"052c9e79b7f175a837a46923f0a80ee2","len":12292},"recovery75":{"type":"PAR 2.0\u0000RecvSlic","md5":"be0f6bfabf1bf6ba6508237f216475a1","len":12292},"recovery76":{"type":"PAR 2.0\u0000RecvSlic","md5":"ea169208817c6d7fe336c901bf4a5ea0","len":12292},"recovery77":{"type":"PAR 2.0\u0000RecvSlic","md5":"625407da280dfbfc7cacc8fb4f8bb265","len":12292},"recovery78":{"type":"PAR 2.0\u0000RecvSlic","md5":"1c32d547d7d52d43cebc4d996c262111","len":12292},"recovery79":{"type":"PAR 2.0\u0000RecvSlic","md5":"cedc3a8750dfef198ad10d9dccefe339","len":12292},"recovery80":{"type":"PAR 2.0\u0000RecvSlic","md5":"ec4ef290224420846245b9375b814f8b","len":12292},"recovery81":{"type":"PAR 2.0\u0000RecvSlic","md5":"4bad80c7f5a56d2fc87937642b18aa0c","len":12292},"recovery82":{"type":"PAR 2.0\u0000RecvSlic","md5":"c4b0db1068256aa8c1e07f98a2d16339","len":12292},"recovery83":{"type":"PAR 2.0\u0000RecvSlic","md5":"375911da8153e31766960340b1626520","len":12292},"recovery84":{"type":"PAR 2.0\u0000RecvSlic","md5":"38b8edbd15145ef6126699807dc73da5","len":12292},"recovery85":{"type":"PAR 
2.0\u0000RecvSlic","md5":"6e66ffe7118eb850b326d9a3daba5d97","len":12292},"recovery86":{"type":"PAR 2.0\u0000RecvSlic","md5":"d12c98c5352c1db7a19cd928513e66c3","len":12292},"recovery87":{"type":"PAR 2.0\u0000RecvSlic","md5":"ef8dc2866ac48b1831afacf9e0750164","len":12292},"recovery88":{"type":"PAR 2.0\u0000RecvSlic","md5":"bd23bf25bed444e4bdc22b9fc65288bd","len":12292},"recovery89":{"type":"PAR 2.0\u0000RecvSlic","md5":"a82f0d4f7663ac83d47ca10ac3eac1a7","len":12292},"recovery90":{"type":"PAR 2.0\u0000RecvSlic","md5":"340d4ae8bbe9f403176a4a001ecede09","len":12292},"recovery91":{"type":"PAR 2.0\u0000RecvSlic","md5":"1465427d8499f1cc00977b1bde040381","len":12292},"recovery92":{"type":"PAR 2.0\u0000RecvSlic","md5":"961b476e9df27b019b129aa37e9b4c87","len":12292},"recovery93":{"type":"PAR 2.0\u0000RecvSlic","md5":"e4743689d9571c1ecc2c4341a8400e40","len":12292},"recovery94":{"type":"PAR 2.0\u0000RecvSlic","md5":"4f6d7fead6d5b60a7fdbe453abd63fa5","len":12292},"recovery95":{"type":"PAR 2.0\u0000RecvSlic","md5":"b56ed13c80fed786dfaec38591a7f849","len":12292},"recovery96":{"type":"PAR 2.0\u0000RecvSlic","md5":"3dffd1a0424af8dae4cf74d936aff7ea","len":12292},"recovery97":{"type":"PAR 2.0\u0000RecvSlic","md5":"e8b1e226311ae44d16bcc0f56c68c1a3","len":12292},"recovery98":{"type":"PAR 2.0\u0000RecvSlic","md5":"d3fc2ca676e4c262dd57e61cf68c5361","len":12292},"recovery99":{"type":"PAR 2.0\u0000RecvSlic","md5":"42b3a719315c551d3c91ee822c2da7ed","len":12292},"recovery100":{"type":"PAR 2.0\u0000RecvSlic","md5":"928fc76675423931cbabc3c07b638294","len":12292},"recovery101":{"type":"PAR 2.0\u0000RecvSlic","md5":"e3928b4db0b5b11d3cafba8c20f77376","len":12292},"recovery102":{"type":"PAR 2.0\u0000RecvSlic","md5":"69092ea4065e6a6cbec1bf3841321cc4","len":12292},"recovery103":{"type":"PAR 2.0\u0000RecvSlic","md5":"011dcea0d6d2d576f76d0a06f7f745b8","len":12292},"recovery104":{"type":"PAR 2.0\u0000RecvSlic","md5":"e48dc08c95b0176c34116600eb14961c","len":12292},"recovery105":{"type":"PAR 
2.0\u0000RecvSlic","md5":"51ecf96adae6cab2ddb04d94f1e31d78","len":12292},"recovery106":{"type":"PAR 2.0\u0000RecvSlic","md5":"8fa7ffc715bf00106a5d7bc6d0da3ab2","len":12292},"recovery107":{"type":"PAR 2.0\u0000RecvSlic","md5":"c4cf770599853908d4cea098a1298970","len":12292},"recovery108":{"type":"PAR 2.0\u0000RecvSlic","md5":"ac084faa95146eeed6664cc3e029e6ce","len":12292},"recovery109":{"type":"PAR 2.0\u0000RecvSlic","md5":"52f195fc483df0404a6c742750023de0","len":12292},"recovery110":{"type":"PAR 2.0\u0000RecvSlic","md5":"61ce493a2aa951375af43e9d1ca79d4f","len":12292},"recovery111":{"type":"PAR 2.0\u0000RecvSlic","md5":"fefb4a742be24ac3731afe307f5b82ba","len":12292},"recovery112":{"type":"PAR 2.0\u0000RecvSlic","md5":"9c4e137dade4e9642b809f09dc050eaa","len":12292},"recovery113":{"type":"PAR 2.0\u0000RecvSlic","md5":"af7592f4713e63de24ff2a591dd4fe26","len":12292},"recovery114":{"type":"PAR 2.0\u0000RecvSlic","md5":"c9b995c9fe0601e5e333f734fb387047","len":12292},"recovery115":{"type":"PAR 2.0\u0000RecvSlic","md5":"7f0bc8e940bfc1ee4a8088af4d36ea80","len":12292},"recovery116":{"type":"PAR 2.0\u0000RecvSlic","md5":"1a3fef678dfb898d2a6a2fc349ab8885","len":12292},"recovery117":{"type":"PAR 2.0\u0000RecvSlic","md5":"e4c5d8d29385549ee22e2e3a796ecbce","len":12292},"recovery118":{"type":"PAR 2.0\u0000RecvSlic","md5":"a94ce91cec61f2add2f780412f6688dd","len":12292},"recovery119":{"type":"PAR 2.0\u0000RecvSlic","md5":"45117210792cfdca35871f0625f29f50","len":12292},"creator":{"type":"PAR 2.0\u0000Creator","md5":"a1304668f393486b4bf99741bcc8c0e4","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"b546f34896688623139361b73f7466da","len":140},"desc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000FileDesc","md5":"bdf7fd79dc9be3242911d615013b4675","len":132},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"1074e6bd114064d4010cba8850279936","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 
2.0\u0000FileDesc","md5":"6c37323d51a7cfcc68cd519a949ed0b9","len":132},"desc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000FileDesc","md5":"10df8b27966f711989172bc5bb351909","len":132},"ifsc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000IFSC","md5":"939568408c85444683327985736473f1","len":200},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"4b33cc66f96237779dac4a2920e5c36b","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"7900826aa0cdcd35f5b1b6d309285106","len":100},"ifsc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000IFSC","md5":"89550dc87a9e0098a8035d643c7c1fa9","len":22400},"creator":{"type":"PAR 2.0\u0000Creator","md5":"a1304668f393486b4bf99741bcc8c0e4","len":104}}],"8":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"8f490677c7370c81405b5dc720de5259","len":76},"main":{"type":"PAR 2.0\u0000Main","md5":"e3dcbbad791c824b08deb1c570891096","len":108},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"b00de7d8184b9282d2fab59a09f4317a","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"3f2544814d4dae3cd3bc0c89602e85f9","len":132},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"d451ff5d82856339d2b7e616128d51e5","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"5425f856110771270c05e595c89172fb","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"af1d8653ae55de4ace006367e6339891","len":104}},{"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"ab4b40f2f8c79be1b72fc3c297034e18","len":76},"main":{"type":"PAR 2.0\u0000Main","md5":"e3dcbbad791c824b08deb1c570891096","len":108},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"b00de7d8184b9282d2fab59a09f4317a","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 
2.0\u0000FileDesc","md5":"3f2544814d4dae3cd3bc0c89602e85f9","len":132},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"d451ff5d82856339d2b7e616128d51e5","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"5425f856110771270c05e595c89172fb","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"af1d8653ae55de4ace006367e6339891","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"e3dcbbad791c824b08deb1c570891096","len":108},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"b00de7d8184b9282d2fab59a09f4317a","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"3f2544814d4dae3cd3bc0c89602e85f9","len":132},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"d451ff5d82856339d2b7e616128d51e5","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"5425f856110771270c05e595c89172fb","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"af1d8653ae55de4ace006367e6339891","len":104}}],"9":[{"main":{"type":"PAR 2.0\u0000Main","md5":"d2e9f5f81e8780b703db895c249cbd68","len":92},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"4a2d665a1e6879cd1b9d04025a7a5f80","len":132},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"59a78d4061c80c5b686bc85f1ae871e1","len":120},"creator":{"type":"PAR 2.0\u0000Creator","md5":"aef6213477d5a93d8932e106b971214e","len":104}}],"10":[{"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"2b7cd021389fdb5ed3791087cab67848","len":16777284},"main":{"type":"PAR 2.0\u0000Main","md5":"c3e44363bd4b65f4942581572936dff4","len":124},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"c295b42879c2d2a3ee4843a3bb0f5ee3","len":132},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"13b6852f097a72fc7cc04761a10a480c","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 
2.0\u0000FileDesc","md5":"1ad880202a49781b87e335d349930f26","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"910c17cd5227f95a8f2c837c237ed405","len":160},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"8cbc4d1dad7ea1f63678571057c323a1","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"f5a9392a75d31501b020918c799fab52","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"a19208469e9ef2c68f2ce37e0cb36780","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"c3e44363bd4b65f4942581572936dff4","len":124},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"c295b42879c2d2a3ee4843a3bb0f5ee3","len":132},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"13b6852f097a72fc7cc04761a10a480c","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"1ad880202a49781b87e335d349930f26","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"910c17cd5227f95a8f2c837c237ed405","len":160},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"8cbc4d1dad7ea1f63678571057c323a1","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"f5a9392a75d31501b020918c799fab52","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"a19208469e9ef2c68f2ce37e0cb36780","len":104}}],"11":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"5ae3fa27bb30388ee1999b3e5d295b1e","len":1048644},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"76549588cac2b8d3f6ffb53ef2e5e5d5","len":1048644},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"46a19207e25504383c733859fa23a5d3","len":1048644},"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"e39bd46ac1405579ea58ad6990a52e5f","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"3796a7821994867eb08d0b9ecfb1f022","len":92},"recovery4":{"type":"PAR 
2.0\u0000RecvSlic","md5":"3aa647a57f56be00b66830ebeed8d766","len":1048644},"recovery5":{"type":"PAR 2.0\u0000RecvSlic","md5":"8e3df5e94c2602a027a43df9a79340fa","len":1048644},"recovery6":{"type":"PAR 2.0\u0000RecvSlic","md5":"97253b6d0f8d827181326470066333e4","len":1048644},"desc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000FileDesc","md5":"7388491d9d881d220d94b5a26306cbdc","len":132},"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"5d3dc575cd28880a5218b9c5d0762efb","len":1048644},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"696a55821a69dc48cde7e7b7dbfe18e7","len":1048644},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"4f8aa5bf1983752b0122f91c6dc46dae","len":1048644},"ifsc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000IFSC","md5":"dc3363b470db43fc863d728a48decd34","len":340},"recovery10":{"type":"PAR 2.0\u0000RecvSlic","md5":"60ab3ac1c0b22bb2660ec3b1a44a454f","len":1048644},"recovery11":{"type":"PAR 2.0\u0000RecvSlic","md5":"32b0c0ccf4da378145ec2881d06a0104","len":1048644},"recovery12":{"type":"PAR 2.0\u0000RecvSlic","md5":"fe3cf4681ff6dab4fe48baadeefa8b9f","len":1048644},"recovery13":{"type":"PAR 2.0\u0000RecvSlic","md5":"75a0660ce94daa96fefcdb8ecb7ffe2d","len":1048644},"recovery14":{"type":"PAR 2.0\u0000RecvSlic","md5":"e9fc5ad5e0214fb90588695cf1bea1d4","len":1048644},"recovery15":{"type":"PAR 2.0\u0000RecvSlic","md5":"4a7a7c816d43b31ffd1f1ab7ff93cb78","len":1048644},"recovery16":{"type":"PAR 2.0\u0000RecvSlic","md5":"6514fb821fb9d012cedb09a5cdb5ce81","len":1048644},"recovery17":{"type":"PAR 2.0\u0000RecvSlic","md5":"c55c09da6d1858b24764007cc42d01e1","len":1048644},"recovery18":{"type":"PAR 2.0\u0000RecvSlic","md5":"5ee9880f40b8062e5748abe8b5fb2579","len":1048644},"recovery19":{"type":"PAR 2.0\u0000RecvSlic","md5":"329cb0d4712fc629414fbb2aa1c15454","len":1048644},"recovery20":{"type":"PAR 2.0\u0000RecvSlic","md5":"e8b03860153ecb1c0b15b785422cf36c","len":1048644},"recovery21":{"type":"PAR 
2.0\u0000RecvSlic","md5":"b4ffc6740802ea6274134e905fe8ec1b","len":1048644},"recovery22":{"type":"PAR 2.0\u0000RecvSlic","md5":"36c944effcf8db4f5098fdef4d7f9fb7","len":1048644},"recovery23":{"type":"PAR 2.0\u0000RecvSlic","md5":"33dddc46138a869d45473597bd8a0885","len":1048644},"recovery24":{"type":"PAR 2.0\u0000RecvSlic","md5":"52e988fed07f5c02b94695f129ffee05","len":1048644},"recovery25":{"type":"PAR 2.0\u0000RecvSlic","md5":"860ce0c08f14146f6545f8b455388457","len":1048644},"recovery26":{"type":"PAR 2.0\u0000RecvSlic","md5":"02fd2a64a0440a35e571383c3d0a62ef","len":1048644},"recovery27":{"type":"PAR 2.0\u0000RecvSlic","md5":"557a895ab8b60101fc681cd5908d4bfe","len":1048644},"recovery28":{"type":"PAR 2.0\u0000RecvSlic","md5":"2045f82f13b16f36b30986653111655d","len":1048644},"recovery29":{"type":"PAR 2.0\u0000RecvSlic","md5":"ce6fad602d7745fc6b87ce0cf9adf74b","len":1048644},"recovery30":{"type":"PAR 2.0\u0000RecvSlic","md5":"f916de512a753ad776ee91d1cd0c5e88","len":1048644},"recovery31":{"type":"PAR 2.0\u0000RecvSlic","md5":"5f9b5225d53a5df264e98d57ef41488f","len":1048644},"recovery32":{"type":"PAR 2.0\u0000RecvSlic","md5":"b35b8b7c45908e260ff729d0a927ed93","len":1048644},"recovery33":{"type":"PAR 2.0\u0000RecvSlic","md5":"f976fa312351c40ff2c6fb8022aabf6d","len":1048644},"recovery34":{"type":"PAR 2.0\u0000RecvSlic","md5":"ef96b2f79e199d7bb037fe0b05056211","len":1048644},"recovery35":{"type":"PAR 2.0\u0000RecvSlic","md5":"96647c04ece1d25fb0f6b1eca3ae2219","len":1048644},"recovery36":{"type":"PAR 2.0\u0000RecvSlic","md5":"7bf3716a84aed5b1321b19e18a445eb4","len":1048644},"recovery37":{"type":"PAR 2.0\u0000RecvSlic","md5":"257500d3e77cf74b1efa67606d4ec1d0","len":1048644},"recovery38":{"type":"PAR 2.0\u0000RecvSlic","md5":"696781c6156f397df6106e2d51819aa4","len":1048644},"recovery39":{"type":"PAR 2.0\u0000RecvSlic","md5":"9046f4b89e3a13e3947e785eae760ea7","len":1048644},"recovery40":{"type":"PAR 
2.0\u0000RecvSlic","md5":"7fad2133a7b1672ed31694f89fe8b1c1","len":1048644},"recovery41":{"type":"PAR 2.0\u0000RecvSlic","md5":"285897a69aa8e07e6f4f82025f25455f","len":1048644},"recovery42":{"type":"PAR 2.0\u0000RecvSlic","md5":"424f74e6c405f183cb635524ae086d36","len":1048644},"recovery43":{"type":"PAR 2.0\u0000RecvSlic","md5":"d7285bfef36a2314e8d5cc2323a91625","len":1048644},"recovery44":{"type":"PAR 2.0\u0000RecvSlic","md5":"bf8285ec53c4c06737ba3f1ffd792760","len":1048644},"recovery45":{"type":"PAR 2.0\u0000RecvSlic","md5":"ef6616466b9fefebfaddfadcf25b8fa9","len":1048644},"recovery46":{"type":"PAR 2.0\u0000RecvSlic","md5":"3b64f6fc3816ff762953d256f6be5c09","len":1048644},"recovery47":{"type":"PAR 2.0\u0000RecvSlic","md5":"4a6690f87f54eb8769c236e5972f4817","len":1048644},"recovery48":{"type":"PAR 2.0\u0000RecvSlic","md5":"9fcdc6307e18029a7306c24d6e23a798","len":1048644},"recovery49":{"type":"PAR 2.0\u0000RecvSlic","md5":"291516ea56111c0400a6a12f676e5a53","len":1048644},"recovery50":{"type":"PAR 2.0\u0000RecvSlic","md5":"c87b641d60c32ea5d2883509304abc60","len":1048644},"recovery51":{"type":"PAR 2.0\u0000RecvSlic","md5":"e8a1d99820b3030161654ce66e732522","len":1048644},"recovery52":{"type":"PAR 2.0\u0000RecvSlic","md5":"9b7465b4e0aa16ed2531a4a10297c4d3","len":1048644},"recovery53":{"type":"PAR 2.0\u0000RecvSlic","md5":"6a0f4220cc76d6ab4aa7d537c1f7c77b","len":1048644},"recovery54":{"type":"PAR 2.0\u0000RecvSlic","md5":"9143733d0076ab08719a05ca13f1f422","len":1048644},"recovery55":{"type":"PAR 2.0\u0000RecvSlic","md5":"5266852e8cce0de780bce15fb1a608dd","len":1048644},"recovery56":{"type":"PAR 2.0\u0000RecvSlic","md5":"19a85bb67233bbe31319d61afa482c25","len":1048644},"recovery57":{"type":"PAR 2.0\u0000RecvSlic","md5":"777e26b540a27ea9480817bad4ba5eb1","len":1048644},"recovery58":{"type":"PAR 2.0\u0000RecvSlic","md5":"a6559b6d4dc1280cbcf748e968b4dbae","len":1048644},"recovery59":{"type":"PAR 
2.0\u0000RecvSlic","md5":"29a393bd18579d5ae612a73b5ad5471f","len":1048644},"recovery60":{"type":"PAR 2.0\u0000RecvSlic","md5":"0c61aafcedeeb70a5677bac1f64ace26","len":1048644},"recovery61":{"type":"PAR 2.0\u0000RecvSlic","md5":"18c227628a2b46e865649fbe7ed1c9ed","len":1048644},"recovery62":{"type":"PAR 2.0\u0000RecvSlic","md5":"4dea15252f8943bc37ccfd8dede9ca4d","len":1048644},"recovery63":{"type":"PAR 2.0\u0000RecvSlic","md5":"a437635c37da22690a51bcca8af94066","len":1048644},"creator":{"type":"PAR 2.0\u0000Creator","md5":"79b7e4a62fce4bbc39cb1d35a3b28150","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"3796a7821994867eb08d0b9ecfb1f022","len":92},"desc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000FileDesc","md5":"7388491d9d881d220d94b5a26306cbdc","len":132},"ifsc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000IFSC","md5":"dc3363b470db43fc863d728a48decd34","len":340},"creator":{"type":"PAR 2.0\u0000Creator","md5":"79b7e4a62fce4bbc39cb1d35a3b28150","len":104}}],"14":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"bdb6fb8f2a0d2a5902cabcde8b859035","len":4294967364},"main":{"type":"PAR 2.0\u0000Main","md5":"854e212b116ec286bc7a0254029f405c","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"43a52a9af71e0d95624092ff681fbdd6","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"0af69ba8da0e6e6ac51e55f8d36c4b10","len":100},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"0b2721e854623f19ede417796298e409","len":4294967364},"creator":{"type":"PAR 2.0\u0000Creator","md5":"f85ecf559c229c58166c2b3a837817d5","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"854e212b116ec286bc7a0254029f405c","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"43a52a9af71e0d95624092ff681fbdd6","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"0af69ba8da0e6e6ac51e55f8d36c4b10","len":100},"creator":{"type":"PAR 
2.0\u0000Creator","md5":"f85ecf559c229c58166c2b3a837817d5","len":104}}],"18":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"5c59db66d27af0dd33c5c76649c80596","len":268435528},"main":{"type":"PAR 2.0\u0000Main","md5":"d4aaedb226e0b58bad779c6228832593","len":92},"desc84612f1ae1cb84f1b796e424861c3d46":{"type":"PAR 2.0\u0000FileDesc","md5":"413399233805cfe58ec209a1bd76bd06","len":136},"ifsc84612f1ae1cb84f1b796e424861c3d46":{"type":"PAR 2.0\u0000IFSC","md5":"cd00ea091d2134ba1a49b9f0683b3367","len":260},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"b997d60937956d94164d0d4afbb9efe3","len":268435528},"creator":{"type":"PAR 2.0\u0000Creator","md5":"66b1394753c32007408e5bfd2d973ec4","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"d4aaedb226e0b58bad779c6228832593","len":92},"desc84612f1ae1cb84f1b796e424861c3d46":{"type":"PAR 2.0\u0000FileDesc","md5":"413399233805cfe58ec209a1bd76bd06","len":136},"ifsc84612f1ae1cb84f1b796e424861c3d46":{"type":"PAR 2.0\u0000IFSC","md5":"cd00ea091d2134ba1a49b9f0683b3367","len":260},"creator":{"type":"PAR 2.0\u0000Creator","md5":"66b1394753c32007408e5bfd2d973ec4","len":104}}],"20":[{"main":{"type":"PAR 2.0\u0000Main","md5":"8818ce3d940fd2ee906eb7372a9d8706","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"5d6e51774704c147c0eb2b21d63f8d6c","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"f90490a6bce853105ed49210ed38c6de","len":1360},"creator":{"type":"PAR 2.0\u0000Creator","md5":"1762e1d053a8628c6311f1cfa802c0f4","len":104}}],"21":[{"main":{"type":"PAR 2.0\u0000Main","md5":"979bf683edbeb3c67b54eefe72ca796e","len":92},"desc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000FileDesc","md5":"572fd34269714193e5755b1c9b4e3dcd","len":132},"ifsc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000IFSC","md5":"787536bb15eb4279f56212f01d1fdc1d","len":100},"creator":{"type":"PAR 
2.0\u0000Creator","md5":"0769f180278adcffd6dcdfbaa0987882","len":104}}],"22":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"dc21985927d59404c3a10a1523e53e0c","len":262212},"main":{"type":"PAR 2.0\u0000Main","md5":"819d570e0ad997c9e3852a8df6bb59a8","len":140},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"bbacaf20c3142b7a9fcd00fe2653e19e","len":132},"desc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000FileDesc","md5":"8ec3f9507a26ff1d780cbac99bc97fd8","len":132},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"2042cbf8d665fba64349918e12425f02","len":262212},"desc84612f1ae1cb84f1b796e424861c3d46":{"type":"PAR 2.0\u0000FileDesc","md5":"7c287f4caeaee8170bee952ec02c6248","len":136},"desc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000FileDesc","md5":"a2c6b03ef481ef57b2e7d70dc964ee13","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"e670a9d5df380732a1518e3f5be7422f","len":5200},"ifsc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000IFSC","md5":"d3baa6921143e1a203ac14a502ab6c8a","len":100},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"010700d8dce400b258d57b93189f6261","len":262212},"ifsc84612f1ae1cb84f1b796e424861c3d46":{"type":"PAR 2.0\u0000IFSC","md5":"35caeff6d05c414457ba3fa8973417bd","len":176080},"ifsc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000IFSC","md5":"ded26a35cec9f7aafdaf3f2fc53d5378","len":1120},"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"a0871e7345520f1e61b2a99292b01a18","len":262212},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"6e9c8c7d20fe75165ee4f0812ebfd434","len":262212},"recovery5":{"type":"PAR 2.0\u0000RecvSlic","md5":"f6b02899978c2d52177f754cb1708bc1","len":262212},"recovery6":{"type":"PAR 2.0\u0000RecvSlic","md5":"55ec6d9b1af64f6fa6ceda6fa43fd722","len":262212},"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"1b50a45fdef6a9878bcf7de3e56cc23a","len":262212},"recovery8":{"type":"PAR 
2.0\u0000RecvSlic","md5":"36d262d667af95827a870ca0379516fb","len":262212},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"d0a7a6cecea8434b3de7934ecb982e53","len":262212},"creator":{"type":"PAR 2.0\u0000Creator","md5":"8cb6790dad8ccd9f59af46152131e65a","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"819d570e0ad997c9e3852a8df6bb59a8","len":140},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"bbacaf20c3142b7a9fcd00fe2653e19e","len":132},"desc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000FileDesc","md5":"8ec3f9507a26ff1d780cbac99bc97fd8","len":132},"desc84612f1ae1cb84f1b796e424861c3d46":{"type":"PAR 2.0\u0000FileDesc","md5":"7c287f4caeaee8170bee952ec02c6248","len":136},"desc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000FileDesc","md5":"a2c6b03ef481ef57b2e7d70dc964ee13","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"e670a9d5df380732a1518e3f5be7422f","len":5200},"ifsc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000IFSC","md5":"d3baa6921143e1a203ac14a502ab6c8a","len":100},"ifsc84612f1ae1cb84f1b796e424861c3d46":{"type":"PAR 2.0\u0000IFSC","md5":"35caeff6d05c414457ba3fa8973417bd","len":176080},"ifsc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000IFSC","md5":"ded26a35cec9f7aafdaf3f2fc53d5378","len":1120},"creator":{"type":"PAR 2.0\u0000Creator","md5":"8cb6790dad8ccd9f59af46152131e65a","len":104}}]} \ No newline at end of file +{"0":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"03970ede6d760d2bf5d6cb33b38b008c","len":262152},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"a09b1093677a95f91a6e12f30c96d465","len":262152},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"f3172fabbb103c06cf765dac0ab61938","len":262152},"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"4f9744132ce0f40d202a35620f660cf4","len":262152},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"afde5ce9c99744bf99890c2c907be1a0","len":262152},"recovery5":{"type":"PAR 
2.0\u0000RecvSlic","md5":"0383fdc3bf825deed285ad41e72dad0f","len":262152},"recovery6":{"type":"PAR 2.0\u0000RecvSlic","md5":"49861fef773046c995ce1b470a408786","len":262152},"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"8c7b9b0852558fbf8671ecb317087352","len":262152},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"698f229cbf8b133ae4a8e7291ef1ece7","len":262152},"main":{"type":"PAR 2.0\u0000Main","md5":"89c5482dbd2f87c50e63dd1d6fe57acb","len":92},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"7d5a13921d239a4c41c1a717b8cfff80","len":262152},"recovery10":{"type":"PAR 2.0\u0000RecvSlic","md5":"2169e55a9542206a54e61a1bdc497b81","len":262152},"recovery11":{"type":"PAR 2.0\u0000RecvSlic","md5":"9ac590d0f6eb4acf2e4722b1e0886f9d","len":262152},"recovery12":{"type":"PAR 2.0\u0000RecvSlic","md5":"0f04c18cb08ae75f14a1bfcdd93fef33","len":262152},"recovery13":{"type":"PAR 2.0\u0000RecvSlic","md5":"9520fde4a0a015846a26b27e6fa8ffc9","len":262152},"recovery14":{"type":"PAR 2.0\u0000RecvSlic","md5":"cd75fe1a450ec13d5deb291ec7a6c06e","len":262152},"recovery15":{"type":"PAR 2.0\u0000RecvSlic","md5":"0df5e975fd4d26d4645efcf277b97864","len":262152},"recovery16":{"type":"PAR 2.0\u0000RecvSlic","md5":"da0633552876e6952b343e9e1475a07d","len":262152},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"52fb5aaf6b778813e2457d51678ebe56","len":132},"recovery17":{"type":"PAR 2.0\u0000RecvSlic","md5":"c8a5773035c82e3919e1fa630397f4e3","len":262152},"recovery18":{"type":"PAR 2.0\u0000RecvSlic","md5":"887d8999ab30bda143c7c44d5db8b2ef","len":262152},"recovery19":{"type":"PAR 2.0\u0000RecvSlic","md5":"042606927322c90d5a47d3db0940d20f","len":262152},"recovery20":{"type":"PAR 2.0\u0000RecvSlic","md5":"4a1cf4b6cbf163175568c2ba6cd59e54","len":262152},"recovery21":{"type":"PAR 2.0\u0000RecvSlic","md5":"aab5ff9e96bf7166715848629553e1d7","len":262152},"recovery22":{"type":"PAR 
2.0\u0000RecvSlic","md5":"99dee1834fe4bbf362867cdc1f2657cc","len":262152},"recovery23":{"type":"PAR 2.0\u0000RecvSlic","md5":"4aad3e6f408fdb71d8da0ae89cb8a6d8","len":262152},"recovery24":{"type":"PAR 2.0\u0000RecvSlic","md5":"aa0cfff6648596cc55d43cdf7edcc811","len":262152},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000IFSC","md5":"cfcc0ca48bb20a25e91d1b117b1ab518","len":5220},"recovery25":{"type":"PAR 2.0\u0000RecvSlic","md5":"f0da77095c905c3f12473d95b228a833","len":262152},"recovery26":{"type":"PAR 2.0\u0000RecvSlic","md5":"e995cb583e47d8062b8a570ae3d89890","len":262152},"recovery27":{"type":"PAR 2.0\u0000RecvSlic","md5":"0d95da53406f7f05b3df9fd772fddbb2","len":262152},"recovery28":{"type":"PAR 2.0\u0000RecvSlic","md5":"081a934272ce2b045588c3e8315dab85","len":262152},"recovery29":{"type":"PAR 2.0\u0000RecvSlic","md5":"210a791f5f1abb3c766a9afb868046ff","len":262152},"recovery30":{"type":"PAR 2.0\u0000RecvSlic","md5":"a054993798a060f9158bb6ee5429ba39","len":262152},"recovery31":{"type":"PAR 2.0\u0000RecvSlic","md5":"ea9bdbfd00fb4d92a5ad7e12806a07e3","len":262152},"recovery32":{"type":"PAR 2.0\u0000RecvSlic","md5":"71095456144a043238aac78b9404cf31","len":262152},"recovery33":{"type":"PAR 2.0\u0000RecvSlic","md5":"161d6a70a1f9660ddaee08b2f8ff07c5","len":262152},"recovery34":{"type":"PAR 2.0\u0000RecvSlic","md5":"5fbc1fbe505af3935a6626704443f95f","len":262152},"recovery35":{"type":"PAR 2.0\u0000RecvSlic","md5":"b137effeecf7508303909cc1f4c4744a","len":262152},"recovery36":{"type":"PAR 2.0\u0000RecvSlic","md5":"4bd21a197729254883c39639ab1d1d2c","len":262152},"recovery37":{"type":"PAR 2.0\u0000RecvSlic","md5":"421dab830bd4008cb6b2fbae4b4acac4","len":262152},"recovery38":{"type":"PAR 2.0\u0000RecvSlic","md5":"a414ec0c642cbd9696617549ac233fcf","len":262152},"recovery39":{"type":"PAR 2.0\u0000RecvSlic","md5":"aa1231c5f3b62b97aaabab8360687d69","len":262152},"recovery40":{"type":"PAR 
2.0\u0000RecvSlic","md5":"75ec845540c1623cb23d923ff1599c62","len":262152},"recovery41":{"type":"PAR 2.0\u0000RecvSlic","md5":"125cf5ecbca7b2cec06c2aeb34bb0c45","len":262152},"recovery42":{"type":"PAR 2.0\u0000RecvSlic","md5":"83bcfdab942c36d87c16a86854b7126a","len":262152},"recovery43":{"type":"PAR 2.0\u0000RecvSlic","md5":"05edd2b3d1b0956ee0a746c3ac4e89f2","len":262152},"recovery44":{"type":"PAR 2.0\u0000RecvSlic","md5":"5e298933bc064994adafa95427d9cc16","len":262152},"recovery45":{"type":"PAR 2.0\u0000RecvSlic","md5":"eb9dbce2b3cedfde6ff13bda272c9492","len":262152},"recovery46":{"type":"PAR 2.0\u0000RecvSlic","md5":"777fa36408c9baff7d53124a155e9b70","len":262152},"recovery47":{"type":"PAR 2.0\u0000RecvSlic","md5":"0d51af8d47e332b8b7588bc6b41cb90f","len":262152},"recovery48":{"type":"PAR 2.0\u0000RecvSlic","md5":"c304b5026423dae46d152ea39fdcba2e","len":262152},"recovery49":{"type":"PAR 2.0\u0000RecvSlic","md5":"a3b6c7d58ae173aa8b5e427e52ba5ccb","len":262152},"recovery50":{"type":"PAR 2.0\u0000RecvSlic","md5":"3a79c70ba528b971d431549a6ab780b7","len":262152},"recovery51":{"type":"PAR 2.0\u0000RecvSlic","md5":"343fb9b3384b5e33f1ee51361a5998c4","len":262152},"recovery52":{"type":"PAR 2.0\u0000RecvSlic","md5":"d8074380558ea76ffd244dfc0c7f43a3","len":262152},"recovery53":{"type":"PAR 2.0\u0000RecvSlic","md5":"275a845bd65e643230f8fd898e5011b7","len":262152},"recovery54":{"type":"PAR 2.0\u0000RecvSlic","md5":"fef14352bee0d22ffa74386828f3ccbd","len":262152},"recovery55":{"type":"PAR 2.0\u0000RecvSlic","md5":"82f3994148d2892b8f1b8843cf335295","len":262152},"recovery56":{"type":"PAR 2.0\u0000RecvSlic","md5":"cb516d90ba09d6075ac74892b50618db","len":262152},"recovery57":{"type":"PAR 2.0\u0000RecvSlic","md5":"d5ecab451b59a28d40239111294a39af","len":262152},"recovery58":{"type":"PAR 2.0\u0000RecvSlic","md5":"b20f9e576ffdb89fab5e192e80ee7a33","len":262152},"recovery59":{"type":"PAR 2.0\u0000RecvSlic","md5":"1287eef5baaff56af11ca84e8d3c1748","len":262152},"recovery60":{"type":"PAR 
2.0\u0000RecvSlic","md5":"54687f28deea74ad577c42aff9f76e78","len":262152},"recovery61":{"type":"PAR 2.0\u0000RecvSlic","md5":"8cb4a241f39bc369936757cf033a1eb5","len":262152},"recovery62":{"type":"PAR 2.0\u0000RecvSlic","md5":"bba478514161683f5822e4b58cb8454e","len":262152},"recovery63":{"type":"PAR 2.0\u0000RecvSlic","md5":"16893c1739ef5d4261f1cf5c309f7b79","len":262152},"recovery64":{"type":"PAR 2.0\u0000RecvSlic","md5":"0735e832cbfeeebd6f9c96be8b23cf51","len":262152},"recovery65":{"type":"PAR 2.0\u0000RecvSlic","md5":"70509929a1adbc65f16109b5bf09f63e","len":262152},"recovery66":{"type":"PAR 2.0\u0000RecvSlic","md5":"fa0e8cb3bf86b47dc92058df73c7d004","len":262152},"recovery67":{"type":"PAR 2.0\u0000RecvSlic","md5":"331fc5d6ca5bfdd47656af08be93b0fd","len":262152},"recovery68":{"type":"PAR 2.0\u0000RecvSlic","md5":"12c947862c6378a86c6e6b06d5498589","len":262152},"recovery69":{"type":"PAR 2.0\u0000RecvSlic","md5":"cfb0553db13af90a358a7d170570bdfa","len":262152},"recovery70":{"type":"PAR 2.0\u0000RecvSlic","md5":"6a4c6e95258bd09d23607a9410943f39","len":262152},"recovery71":{"type":"PAR 2.0\u0000RecvSlic","md5":"3d92e9d78ad8e50271f5bfa55948743c","len":262152},"recovery72":{"type":"PAR 2.0\u0000RecvSlic","md5":"d123c3ec100da92fe45cd2a5af2ec7fc","len":262152},"recovery73":{"type":"PAR 2.0\u0000RecvSlic","md5":"039f147b63cc0777d6e1d805fdafbee2","len":262152},"recovery74":{"type":"PAR 2.0\u0000RecvSlic","md5":"1be0fc177323345fd62a7c22f2a94853","len":262152},"recovery75":{"type":"PAR 2.0\u0000RecvSlic","md5":"bc493ce8d6f38631383bf2979bc01067","len":262152},"recovery76":{"type":"PAR 2.0\u0000RecvSlic","md5":"b2961018c462f5981cef07267176f04c","len":262152},"recovery77":{"type":"PAR 2.0\u0000RecvSlic","md5":"3f87e937dea4e4664a872645746db8ba","len":262152},"recovery78":{"type":"PAR 2.0\u0000RecvSlic","md5":"8de28f1862f9a086d04a9aa960b43a12","len":262152},"recovery79":{"type":"PAR 2.0\u0000RecvSlic","md5":"1f44c18c155bdbdd2341bc31e37082ee","len":262152},"recovery80":{"type":"PAR 
2.0\u0000RecvSlic","md5":"e2fd62569e0cbb5b572880819f5f201f","len":262152},"recovery81":{"type":"PAR 2.0\u0000RecvSlic","md5":"347a0aa5b8f4447262aeeaa00f53bce4","len":262152},"recovery82":{"type":"PAR 2.0\u0000RecvSlic","md5":"24031e3e370225bfea2d86072b79cf26","len":262152},"recovery83":{"type":"PAR 2.0\u0000RecvSlic","md5":"1e177641661d08ec6f0a42cd4b41cc15","len":262152},"recovery84":{"type":"PAR 2.0\u0000RecvSlic","md5":"988b050937b5ae4910a31432fefc875e","len":262152},"recovery85":{"type":"PAR 2.0\u0000RecvSlic","md5":"6dda853f4ccc46c65262e9af10246ea4","len":262152},"recovery86":{"type":"PAR 2.0\u0000RecvSlic","md5":"8badc2fa06aa5d56409a90ff28335101","len":262152},"recovery87":{"type":"PAR 2.0\u0000RecvSlic","md5":"7a1b2d483182384822640f2d1c8c624a","len":262152},"recovery88":{"type":"PAR 2.0\u0000RecvSlic","md5":"63f2bfc85026efe8a791208a7d8c3089","len":262152},"recovery89":{"type":"PAR 2.0\u0000RecvSlic","md5":"106bdd07f0dc559e30784a840ce2059d","len":262152},"recovery90":{"type":"PAR 2.0\u0000RecvSlic","md5":"663360b546f956b7aa0a1487b6cf55ae","len":262152},"recovery91":{"type":"PAR 2.0\u0000RecvSlic","md5":"f80076528b533e0f05d16c09091138f7","len":262152},"recovery92":{"type":"PAR 2.0\u0000RecvSlic","md5":"3f0b031ac855ae6194a0d226ab404f4d","len":262152},"recovery93":{"type":"PAR 2.0\u0000RecvSlic","md5":"f97921024f81af02f9ee5d4de57482a9","len":262152},"recovery94":{"type":"PAR 2.0\u0000RecvSlic","md5":"e0d93d56835eeaa7be21514d3649b750","len":262152},"recovery95":{"type":"PAR 2.0\u0000RecvSlic","md5":"3ac9c411a5f7a1c4a340f9963d8e9600","len":262152},"recovery96":{"type":"PAR 2.0\u0000RecvSlic","md5":"ce73db440330f0a2bb0376af3c4993cc","len":262152},"recovery97":{"type":"PAR 2.0\u0000RecvSlic","md5":"286de36d1b78d7b80d31b6f17a648185","len":262152},"recovery98":{"type":"PAR 2.0\u0000RecvSlic","md5":"48451352ab94a5022853ce548c665da8","len":262152},"recovery99":{"type":"PAR 
2.0\u0000RecvSlic","md5":"a15cfb01e9080e979c370c258d71d784","len":262152},"recovery100":{"type":"PAR 2.0\u0000RecvSlic","md5":"90bc078b3558a063e4b3d863036c6e30","len":262152},"recovery101":{"type":"PAR 2.0\u0000RecvSlic","md5":"1d6aa5f278b70abac0173874216eca34","len":262152},"recovery102":{"type":"PAR 2.0\u0000RecvSlic","md5":"bdbdd9d8d8b6e554960b88fd458b6a45","len":262152},"recovery103":{"type":"PAR 2.0\u0000RecvSlic","md5":"4ec684e6bab962b228409487a0d4984c","len":262152},"recovery104":{"type":"PAR 2.0\u0000RecvSlic","md5":"2e462ac7f98db990ea3ed78239df0f1c","len":262152},"recovery105":{"type":"PAR 2.0\u0000RecvSlic","md5":"7bcd383088e1c74c0e403aef0892b806","len":262152},"recovery106":{"type":"PAR 2.0\u0000RecvSlic","md5":"fa220540df29315fb7dbd3681b094310","len":262152},"recovery107":{"type":"PAR 2.0\u0000RecvSlic","md5":"c5be37a007252a75765caab04115a9f8","len":262152},"recovery108":{"type":"PAR 2.0\u0000RecvSlic","md5":"92e1cc5b4b87395214afe2fa361422e6","len":262152},"recovery109":{"type":"PAR 2.0\u0000RecvSlic","md5":"2d60d115f1f2bf3ff2ca6a351b72bbd7","len":262152},"recovery110":{"type":"PAR 2.0\u0000RecvSlic","md5":"cf6b7a3892e01ca2909ccd1462bd4c20","len":262152},"recovery111":{"type":"PAR 2.0\u0000RecvSlic","md5":"5c6decd7cec310b857010c848077aef9","len":262152},"recovery112":{"type":"PAR 2.0\u0000RecvSlic","md5":"a43027f82a38b9c78e5fc3c2f00731f6","len":262152},"recovery113":{"type":"PAR 2.0\u0000RecvSlic","md5":"91c953ee943d0d89ca50ee73053305dc","len":262152},"recovery114":{"type":"PAR 2.0\u0000RecvSlic","md5":"1b4e8c8065d7c190d54f51cdde021058","len":262152},"recovery115":{"type":"PAR 2.0\u0000RecvSlic","md5":"8d6675acb4ebd061ac9598fd01dae7b4","len":262152},"recovery116":{"type":"PAR 2.0\u0000RecvSlic","md5":"78524ff0643436fd95d771664da8b12f","len":262152},"recovery117":{"type":"PAR 2.0\u0000RecvSlic","md5":"9f688409071efcdb9ea196eec65be8b3","len":262152},"recovery118":{"type":"PAR 
2.0\u0000RecvSlic","md5":"a69818a17a03aa13375cc10a07f616d2","len":262152},"recovery119":{"type":"PAR 2.0\u0000RecvSlic","md5":"30f3355ccf75fc559138cc706c9c04d5","len":262152},"recovery120":{"type":"PAR 2.0\u0000RecvSlic","md5":"4dfcbcb0851fb544530c4c4099fd053d","len":262152},"recovery121":{"type":"PAR 2.0\u0000RecvSlic","md5":"c7a364ecbceb8c9491dffe835e09f7bb","len":262152},"recovery122":{"type":"PAR 2.0\u0000RecvSlic","md5":"abaa58c8e90a572bf76e49a7e4845646","len":262152},"recovery123":{"type":"PAR 2.0\u0000RecvSlic","md5":"d3e9332163a0bec102eec6bb74569474","len":262152},"recovery124":{"type":"PAR 2.0\u0000RecvSlic","md5":"48a5bec73284a3e683cb1e88f6b4dff1","len":262152},"recovery125":{"type":"PAR 2.0\u0000RecvSlic","md5":"b751ff190f59c1d807d088ca5f9d544e","len":262152},"recovery126":{"type":"PAR 2.0\u0000RecvSlic","md5":"efb5922f8842d49d836e54b3c230f47d","len":262152},"recovery127":{"type":"PAR 2.0\u0000RecvSlic","md5":"9a6a008bd479eba9f3bf9ac723f9d3a6","len":262152},"recovery128":{"type":"PAR 2.0\u0000RecvSlic","md5":"872f97d2b9da3c002a036e9142c7f481","len":262152},"recovery129":{"type":"PAR 2.0\u0000RecvSlic","md5":"e005e9f222f3061cfc5c79cfccb2799a","len":262152},"recovery130":{"type":"PAR 2.0\u0000RecvSlic","md5":"aeb4683c1a61c30756d52c15b4a25a19","len":262152},"recovery131":{"type":"PAR 2.0\u0000RecvSlic","md5":"72aa837c40a87a4ce9c2db41b36d17f8","len":262152},"recovery132":{"type":"PAR 2.0\u0000RecvSlic","md5":"ac87c0cecb67c4ef0befe5c7193d71d2","len":262152},"recovery133":{"type":"PAR 2.0\u0000RecvSlic","md5":"fcaad629446fdf03a295e2fc729fa956","len":262152},"recovery134":{"type":"PAR 2.0\u0000RecvSlic","md5":"1b2dbd073bf9a16c19d980ccb3c3cf36","len":262152},"recovery135":{"type":"PAR 2.0\u0000RecvSlic","md5":"e1075aef3398e5021a8acbe3196e019d","len":262152},"recovery136":{"type":"PAR 2.0\u0000RecvSlic","md5":"ef67403febd7eb60a1220317bad008a8","len":262152},"recovery137":{"type":"PAR 
2.0\u0000RecvSlic","md5":"9541bc64807293499de6abbd1da9d633","len":262152},"recovery138":{"type":"PAR 2.0\u0000RecvSlic","md5":"ca40c2c9aed94b57fa53fab122abc95c","len":262152},"recovery139":{"type":"PAR 2.0\u0000RecvSlic","md5":"4ef6a42a467cdea12815b5c150a5f743","len":262152},"recovery140":{"type":"PAR 2.0\u0000RecvSlic","md5":"e0f5ce4dc6e0cadc5f3dddca5b93c005","len":262152},"recovery141":{"type":"PAR 2.0\u0000RecvSlic","md5":"8b5c43d4b1a0c46990f3f333a0898054","len":262152},"recovery142":{"type":"PAR 2.0\u0000RecvSlic","md5":"d5eed530038d77dac6eab507dcf47fa2","len":262152},"recovery143":{"type":"PAR 2.0\u0000RecvSlic","md5":"529c34fd816ede1899e40f536e0c8203","len":262152},"recovery144":{"type":"PAR 2.0\u0000RecvSlic","md5":"8a6cb458f1f508547f2a9184855c79dc","len":262152},"recovery145":{"type":"PAR 2.0\u0000RecvSlic","md5":"7be5eac2be255b4adcdfcc3d58b622bf","len":262152},"recovery146":{"type":"PAR 2.0\u0000RecvSlic","md5":"f8799c463c2efda9dbdb4ddc9bddcf54","len":262152},"recovery147":{"type":"PAR 2.0\u0000RecvSlic","md5":"80140c6dddb23f99f49a231f0f710b64","len":262152},"recovery148":{"type":"PAR 2.0\u0000RecvSlic","md5":"19bc507b8ec8c8260ea526036b381547","len":262152},"recovery149":{"type":"PAR 2.0\u0000RecvSlic","md5":"6214b5e6747bceb3b135b1ab9b91c6bc","len":262152},"recovery150":{"type":"PAR 2.0\u0000RecvSlic","md5":"e63a5aaa79fb2c561c3e89983fe217b7","len":262152},"recovery151":{"type":"PAR 2.0\u0000RecvSlic","md5":"3d50287fe67bbab640739a3b9f85f978","len":262152},"recovery152":{"type":"PAR 2.0\u0000RecvSlic","md5":"7a9be033a85fafaa8122980e4ddb4947","len":262152},"recovery153":{"type":"PAR 2.0\u0000RecvSlic","md5":"a4a4267103497dc3e7061219c0342a71","len":262152},"recovery154":{"type":"PAR 2.0\u0000RecvSlic","md5":"ae8fe2ccaf48ddf415d154cb74f27961","len":262152},"recovery155":{"type":"PAR 2.0\u0000RecvSlic","md5":"29ee8f8377b984f3d91c787092777399","len":262152},"recovery156":{"type":"PAR 
2.0\u0000RecvSlic","md5":"98d05ea372a553c0dfa1f15fb91860b1","len":262152},"recovery157":{"type":"PAR 2.0\u0000RecvSlic","md5":"96189de63a41a05d863fa42f3badf79b","len":262152},"recovery158":{"type":"PAR 2.0\u0000RecvSlic","md5":"df1f2f861783d35897b02b43afe62db3","len":262152},"recovery159":{"type":"PAR 2.0\u0000RecvSlic","md5":"1e97fd5e5fc588e3424bccaba08ff3a9","len":262152},"recovery160":{"type":"PAR 2.0\u0000RecvSlic","md5":"894e9bae976951d893f88faf3b539be3","len":262152},"recovery161":{"type":"PAR 2.0\u0000RecvSlic","md5":"a6debea18ff12b7c152cdf8539a8b84c","len":262152},"recovery162":{"type":"PAR 2.0\u0000RecvSlic","md5":"446cb123074c7435ed1a883ffb578e62","len":262152},"recovery163":{"type":"PAR 2.0\u0000RecvSlic","md5":"af617de168be8b15a2e30aa5b11a630d","len":262152},"recovery164":{"type":"PAR 2.0\u0000RecvSlic","md5":"caeaeda3c01970907014b4cb6be2e0e9","len":262152},"recovery165":{"type":"PAR 2.0\u0000RecvSlic","md5":"b72d7c84ee2f6480e66c5451c305361f","len":262152},"recovery166":{"type":"PAR 2.0\u0000RecvSlic","md5":"bddcb7c296fa98f0f93c6233b5971cab","len":262152},"recovery167":{"type":"PAR 2.0\u0000RecvSlic","md5":"c5c6024a222934cde4a08ad92cfb7c4e","len":262152},"recovery168":{"type":"PAR 2.0\u0000RecvSlic","md5":"ea2a639a62ca0356d05eb81c5f1e51e3","len":262152},"recovery169":{"type":"PAR 2.0\u0000RecvSlic","md5":"cb81b5ea915c74803315125ef6b35082","len":262152},"recovery170":{"type":"PAR 2.0\u0000RecvSlic","md5":"4f860ad42df81289117e7599b54c7eb7","len":262152},"recovery171":{"type":"PAR 2.0\u0000RecvSlic","md5":"a28214640f8b91fb23b8ac7c31781808","len":262152},"recovery172":{"type":"PAR 2.0\u0000RecvSlic","md5":"0ce2938fd11eb52f82458e81375c5a87","len":262152},"recovery173":{"type":"PAR 2.0\u0000RecvSlic","md5":"9aafee82dd491549c94c1d094898dd4a","len":262152},"recovery174":{"type":"PAR 2.0\u0000RecvSlic","md5":"12852b57d924c0caf9f1a0380431434b","len":262152},"recovery175":{"type":"PAR 
2.0\u0000RecvSlic","md5":"39e2197c88713c38df03b7ed1bcdf3f7","len":262152},"recovery176":{"type":"PAR 2.0\u0000RecvSlic","md5":"c61db631d2776f27c18fc2afe3a5da22","len":262152},"recovery177":{"type":"PAR 2.0\u0000RecvSlic","md5":"4e92440d523b464790094b6edf7d9ebf","len":262152},"recovery178":{"type":"PAR 2.0\u0000RecvSlic","md5":"dd7203dab76dcf651a159fffea1da80f","len":262152},"recovery179":{"type":"PAR 2.0\u0000RecvSlic","md5":"2c3ddd4864fbc7f9102326252c5155d4","len":262152},"recovery180":{"type":"PAR 2.0\u0000RecvSlic","md5":"a586d5f39701672f9c030fd24c8fcb76","len":262152},"recovery181":{"type":"PAR 2.0\u0000RecvSlic","md5":"9904b5bdaaf4a4b9592bd00f981dbb3b","len":262152},"recovery182":{"type":"PAR 2.0\u0000RecvSlic","md5":"8ffb4c94544b1b24582e29539ad3005a","len":262152},"recovery183":{"type":"PAR 2.0\u0000RecvSlic","md5":"7287c960ac6b695db428449fbca96281","len":262152},"recovery184":{"type":"PAR 2.0\u0000RecvSlic","md5":"883a3c3f3d0f9df49abf107c3847af28","len":262152},"recovery185":{"type":"PAR 2.0\u0000RecvSlic","md5":"93e9c487429c1befc4bbd9ab268593da","len":262152},"recovery186":{"type":"PAR 2.0\u0000RecvSlic","md5":"7ccbaa2bb58dc2a99f9975526ef8bc80","len":262152},"recovery187":{"type":"PAR 2.0\u0000RecvSlic","md5":"f745e05a999b05e38c55e4376581c488","len":262152},"recovery188":{"type":"PAR 2.0\u0000RecvSlic","md5":"bdc31d49a5d632f5a63f254bfa941581","len":262152},"recovery189":{"type":"PAR 2.0\u0000RecvSlic","md5":"7521bc07a1963e4e752c9b9762cf1886","len":262152},"recovery190":{"type":"PAR 2.0\u0000RecvSlic","md5":"0273545e7fd5bec03732379f53affe0c","len":262152},"recovery191":{"type":"PAR 2.0\u0000RecvSlic","md5":"4b0135fd6e027d865fb67b993c8b25c0","len":262152},"recovery192":{"type":"PAR 2.0\u0000RecvSlic","md5":"9cf0b02b61ff390c9075abc5b0b07940","len":262152},"recovery193":{"type":"PAR 2.0\u0000RecvSlic","md5":"c993b05b5f88490514cf00a7379c027c","len":262152},"recovery194":{"type":"PAR 
2.0\u0000RecvSlic","md5":"0854447904b8ba72f87e4bca81ba7c36","len":262152},"recovery195":{"type":"PAR 2.0\u0000RecvSlic","md5":"41b20d0d135d0d9e1e69f075d7415511","len":262152},"recovery196":{"type":"PAR 2.0\u0000RecvSlic","md5":"9b36c343aeef12aa5aebef859afb7939","len":262152},"recovery197":{"type":"PAR 2.0\u0000RecvSlic","md5":"566559dbe69de697c566139016380d23","len":262152},"recovery198":{"type":"PAR 2.0\u0000RecvSlic","md5":"37e47ff9f55e0cc4f4c2a219a92b5ca2","len":262152},"recovery199":{"type":"PAR 2.0\u0000RecvSlic","md5":"2395c640cf1ea0a5a607c8b191045299","len":262152},"creator":{"type":"PAR 2.0\u0000Creator","md5":"89ad6e4b397f77c2695f641c56d1b889","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"89c5482dbd2f87c50e63dd1d6fe57acb","len":92},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"52fb5aaf6b778813e2457d51678ebe56","len":132},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000IFSC","md5":"cfcc0ca48bb20a25e91d1b117b1ab518","len":5220},"creator":{"type":"PAR 2.0\u0000Creator","md5":"89ad6e4b397f77c2695f641c56d1b889","len":104}}],"1":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"2ac05d18f3edb8fbf441a9f3a6e4020f","len":65608},"main":{"type":"PAR 2.0\u0000Main","md5":"588dc7bd3ba772f311d29a7969658d81","len":92},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"7615919203c5f0ba8accba4877e1aa55","len":132},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000IFSC","md5":"762e98460db2f630a54c36d8e3a15807","len":20560},"creator":{"type":"PAR 2.0\u0000Creator","md5":"8e109e9880f96796975db1e8977b2890","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"588dc7bd3ba772f311d29a7969658d81","len":92},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"7615919203c5f0ba8accba4877e1aa55","len":132},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000IFSC","md5":"762e98460db2f630a54c36d8e3a15807","len":20560},"creator":{"type":"PAR 
2.0\u0000Creator","md5":"8e109e9880f96796975db1e8977b2890","len":104}}],"2":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"c4e872922149a549d977d81640ceab55","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"4c1bdd66e1fc7f4e74160196fefb3e02","len":92},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"e0835cf9781a1c2f70fa240cb1abf172","len":132},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000IFSC","md5":"cbba72beebff6608daac2e379c2acdf2","len":1360},"creator":{"type":"PAR 2.0\u0000Creator","md5":"e5bacb1af1948e08af87b41e4f067ff7","len":104}},{"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"b85a4b084b9d52d116f333a1d5b19e19","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"4c1bdd66e1fc7f4e74160196fefb3e02","len":92},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"e0835cf9781a1c2f70fa240cb1abf172","len":132},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000IFSC","md5":"cbba72beebff6608daac2e379c2acdf2","len":1360},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"59adbd88d4c1bc4319b425a546cf89d8","len":1048644},"creator":{"type":"PAR 2.0\u0000Creator","md5":"e5bacb1af1948e08af87b41e4f067ff7","len":104}},{"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"1814cf6eb1e1ace8332be23cd15eefd1","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"4c1bdd66e1fc7f4e74160196fefb3e02","len":92},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"e0835cf9781a1c2f70fa240cb1abf172","len":132},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"9ffcbdd906b9ff145708f777748d03b5","len":1048644},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000IFSC","md5":"cbba72beebff6608daac2e379c2acdf2","len":1360},"recovery5":{"type":"PAR 2.0\u0000RecvSlic","md5":"dd8421495e090fd5c060556086b84c32","len":1048644},"recovery6":{"type":"PAR 2.0\u0000RecvSlic","md5":"76ec256a0f5572fcd0fc51ad0276877f","len":1048644},"creator":{"type":"PAR 
2.0\u0000Creator","md5":"e5bacb1af1948e08af87b41e4f067ff7","len":104}},{"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"4d1bb22d29b6cb751b1fa24f5a815fb1","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"4c1bdd66e1fc7f4e74160196fefb3e02","len":92},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"ea7f112eb738f044695834348c2e0cfa","len":1048644},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"e0835cf9781a1c2f70fa240cb1abf172","len":132},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000IFSC","md5":"cbba72beebff6608daac2e379c2acdf2","len":1360},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"ea31249d845a9a3303e2a0d706a9dff0","len":1048644},"recovery10":{"type":"PAR 2.0\u0000RecvSlic","md5":"399718d48d5c703f63d4f8c3d61fd6c7","len":1048644},"recovery11":{"type":"PAR 2.0\u0000RecvSlic","md5":"f507373d364c93b47de1f01eb44d3d61","len":1048644},"recovery12":{"type":"PAR 2.0\u0000RecvSlic","md5":"a834bfee7edcedd0328f0b89ed2332b8","len":1048644},"recovery13":{"type":"PAR 2.0\u0000RecvSlic","md5":"12f17a64a1486c7b076ae9053a07457a","len":1048644},"recovery14":{"type":"PAR 2.0\u0000RecvSlic","md5":"41459752a93ac1f4f6686d4fb1db904d","len":1048644},"creator":{"type":"PAR 2.0\u0000Creator","md5":"e5bacb1af1948e08af87b41e4f067ff7","len":104}},{"recovery15":{"type":"PAR 2.0\u0000RecvSlic","md5":"b977f212e7d954a4ff9e679fa3240a50","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"4c1bdd66e1fc7f4e74160196fefb3e02","len":92},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"e0835cf9781a1c2f70fa240cb1abf172","len":132},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000IFSC","md5":"cbba72beebff6608daac2e379c2acdf2","len":1360},"recovery16":{"type":"PAR 2.0\u0000RecvSlic","md5":"742f7601e141c6853d6ac6c7f7b9d0ab","len":1048644},"creator":{"type":"PAR 2.0\u0000Creator","md5":"e5bacb1af1948e08af87b41e4f067ff7","len":104}},{"main":{"type":"PAR 
2.0\u0000Main","md5":"4c1bdd66e1fc7f4e74160196fefb3e02","len":92},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"e0835cf9781a1c2f70fa240cb1abf172","len":132},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000IFSC","md5":"cbba72beebff6608daac2e379c2acdf2","len":1360},"creator":{"type":"PAR 2.0\u0000Creator","md5":"e5bacb1af1948e08af87b41e4f067ff7","len":104}}],"3":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"5ec1b43947a219587684fbf415f9b610","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"fee51ffcec529f55044d9012eec50f04","len":124},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"f6a13ffd2f75ce6c75ac4028baaf39ca","len":1048644},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"a0c91ed55a96152ad69b4392d466fe5d","len":132},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"70205e891ea5cf723375e8b710a10678","len":1048644},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"3d1587e90e9ef06582e1ab54f2c4e730","len":132},"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"2436f22b6d754e6ea2c20fefa2600cc5","len":1048644},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"77aa638c222a72c7910c80366ba93fa4","len":132},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"bc1f6c6d31a8e0ac1da75822406a8183","len":1048644},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000IFSC","md5":"9740a7da4885f568a5ecb92f6c40b542","len":1360},"recovery5":{"type":"PAR 2.0\u0000RecvSlic","md5":"2c9f5e668b354ba002563567c415410a","len":1048644},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"dae559f4ad022bce1bd7e565562d27fb","len":100},"recovery6":{"type":"PAR 2.0\u0000RecvSlic","md5":"4c36084e21c5aa8043a9af0bbe2f40b1","len":1048644},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"a33898e46cecf57b80666d9a75e2ca98","len":100},"recovery7":{"type":"PAR 
2.0\u0000RecvSlic","md5":"0e0a6b21b13b40d970bbf3d68cef8749","len":1048644},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"923493bb8645068ce59031656a6a81be","len":1048644},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"24f3affd0a3c433979e520fe164af3a2","len":1048644},"recovery10":{"type":"PAR 2.0\u0000RecvSlic","md5":"2007e274e35d6592233a68a944ed564d","len":1048644},"recovery11":{"type":"PAR 2.0\u0000RecvSlic","md5":"4dad701e5314dbf13b703a0f2bf1b462","len":1048644},"recovery12":{"type":"PAR 2.0\u0000RecvSlic","md5":"0a7398c38a1927b36ec9fc4974dc0c6a","len":1048644},"recovery13":{"type":"PAR 2.0\u0000RecvSlic","md5":"ee398a2258c8e389ba78b782cb0bcc67","len":1048644},"recovery14":{"type":"PAR 2.0\u0000RecvSlic","md5":"b0f51d8bdeb8d22405eb473efae46343","len":1048644},"recovery15":{"type":"PAR 2.0\u0000RecvSlic","md5":"b276b7929c15f1ae6cf6d40e8376d4e1","len":1048644},"recovery16":{"type":"PAR 2.0\u0000RecvSlic","md5":"075682015f58a65d763dbc9f0ee513e2","len":1048644},"recovery17":{"type":"PAR 2.0\u0000RecvSlic","md5":"ce527a46c4740a6061dfdc4341b75ef8","len":1048644},"recovery18":{"type":"PAR 2.0\u0000RecvSlic","md5":"6cae9d1f1b7a5fe33e797d2f06a7fbf9","len":1048644},"recovery19":{"type":"PAR 2.0\u0000RecvSlic","md5":"518201534e884cd9aca3371613be161f","len":1048644},"recovery20":{"type":"PAR 2.0\u0000RecvSlic","md5":"468de55acbe2330f7d410c208c9d74c7","len":1048644},"recovery21":{"type":"PAR 2.0\u0000RecvSlic","md5":"11c5e20152902ec61007121531fcd640","len":1048644},"recovery22":{"type":"PAR 2.0\u0000RecvSlic","md5":"d98a1d2fadc180e15253cb38e1a63cb8","len":1048644},"recovery23":{"type":"PAR 2.0\u0000RecvSlic","md5":"23d0dabf638bbd3296eff194897c5d7c","len":1048644},"recovery24":{"type":"PAR 2.0\u0000RecvSlic","md5":"fdaca43a4141a1d49ba22108b8811f31","len":1048644},"recovery25":{"type":"PAR 2.0\u0000RecvSlic","md5":"57851eaa8bca48269261f4b6298e2e4f","len":1048644},"recovery26":{"type":"PAR 
2.0\u0000RecvSlic","md5":"b8be0336b8787be4fa86041f9d38ca6d","len":1048644},"recovery27":{"type":"PAR 2.0\u0000RecvSlic","md5":"312b9e44906da959cf77f49e04f52095","len":1048644},"recovery28":{"type":"PAR 2.0\u0000RecvSlic","md5":"d268da0938bdcd1210126898b124188e","len":1048644},"recovery29":{"type":"PAR 2.0\u0000RecvSlic","md5":"5b66e1272d272cefd8608856755ffb19","len":1048644},"recovery30":{"type":"PAR 2.0\u0000RecvSlic","md5":"b5f1a350dd4c18d3c9d8680d084abaea","len":1048644},"recovery31":{"type":"PAR 2.0\u0000RecvSlic","md5":"b4e0b201fb86d09bbe9cf86903c17a82","len":1048644},"recovery32":{"type":"PAR 2.0\u0000RecvSlic","md5":"a022d11b27ad0ca6beccb0e9aa1579b3","len":1048644},"recovery33":{"type":"PAR 2.0\u0000RecvSlic","md5":"0de6017fd8d80d67e780eb679b9318f1","len":1048644},"recovery34":{"type":"PAR 2.0\u0000RecvSlic","md5":"0e52bc14774ca31fa04edd8c5e9cd66f","len":1048644},"recovery35":{"type":"PAR 2.0\u0000RecvSlic","md5":"50218f80d454d5c8e3c61698a5c500fb","len":1048644},"recovery36":{"type":"PAR 2.0\u0000RecvSlic","md5":"e85705a19cca00706effad30d2e75c07","len":1048644},"recovery37":{"type":"PAR 2.0\u0000RecvSlic","md5":"e3c10df5b6c5154300f69f7027b48b62","len":1048644},"recovery38":{"type":"PAR 2.0\u0000RecvSlic","md5":"e7a98e885bb4f2393f8c1b560f2a56ab","len":1048644},"recovery39":{"type":"PAR 2.0\u0000RecvSlic","md5":"602321ff1c2be2949f99be571fc7e6f1","len":1048644},"creator":{"type":"PAR 2.0\u0000Creator","md5":"ee0f6c5886993df8609c98d12aacc444","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"fee51ffcec529f55044d9012eec50f04","len":124},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"a0c91ed55a96152ad69b4392d466fe5d","len":132},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"3d1587e90e9ef06582e1ab54f2c4e730","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"77aa638c222a72c7910c80366ba93fa4","len":132},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 
2.0\u0000IFSC","md5":"9740a7da4885f568a5ecb92f6c40b542","len":1360},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"dae559f4ad022bce1bd7e565562d27fb","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"a33898e46cecf57b80666d9a75e2ca98","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"ee0f6c5886993df8609c98d12aacc444","len":104}}],"4":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"1f43fb52f1e4d733c3d2791ff276d1c8","len":2097304},"main":{"type":"PAR 2.0\u0000Main","md5":"15e29c28b0c942b71bf8d72c41f417e4","len":124},"desc76d7f11d6b1621aaecd3d74b1442be16":{"type":"PAR 2.0\u0000FileDesc","md5":"2b695fbbe4cffbd85107376fdebe4755","len":132},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"94c27b82ae09326b1d706f0a697bf363","len":132},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"ec874e384853036e7db4c8f9e5daf295","len":2097304},"desc148b12e16fb3ba135f591cbad669b476":{"type":"PAR 2.0\u0000FileDesc","md5":"253b5bbe92cfb3c63dad4803d0eac1c8","len":132},"ifsc76d7f11d6b1621aaecd3d74b1442be16":{"type":"PAR 2.0\u0000IFSC","md5":"bd4dedc8e22c169bb6d1e7a5e88e1c5a","len":100},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"ebf38ffca9179af5a839caed9afc70a2","len":100},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"0ae9e4a6f98a778a295426b6a0b18fce","len":2097304},"ifsc148b12e16fb3ba135f591cbad669b476":{"type":"PAR 2.0\u0000IFSC","md5":"a3909310e9fa86d03b262e342c37eb46","len":220},"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"025c93f5177f838bf02e3a62082b3cf9","len":2097304},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"8897558cd15484c5302aa09ebc257785","len":2097304},"recovery5":{"type":"PAR 2.0\u0000RecvSlic","md5":"dcce7fba8e1775f5c94cc63f9031ea9a","len":2097304},"recovery6":{"type":"PAR 2.0\u0000RecvSlic","md5":"ab2f333b9017c251b68841d2c8ae5e32","len":2097304},"creator":{"type":"PAR 
2.0\u0000Creator","md5":"7bb60d3f320fb55987f5df188a263775","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"15e29c28b0c942b71bf8d72c41f417e4","len":124},"desc76d7f11d6b1621aaecd3d74b1442be16":{"type":"PAR 2.0\u0000FileDesc","md5":"2b695fbbe4cffbd85107376fdebe4755","len":132},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"94c27b82ae09326b1d706f0a697bf363","len":132},"desc148b12e16fb3ba135f591cbad669b476":{"type":"PAR 2.0\u0000FileDesc","md5":"253b5bbe92cfb3c63dad4803d0eac1c8","len":132},"ifsc76d7f11d6b1621aaecd3d74b1442be16":{"type":"PAR 2.0\u0000IFSC","md5":"bd4dedc8e22c169bb6d1e7a5e88e1c5a","len":100},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"ebf38ffca9179af5a839caed9afc70a2","len":100},"ifsc148b12e16fb3ba135f591cbad669b476":{"type":"PAR 2.0\u0000IFSC","md5":"a3909310e9fa86d03b262e342c37eb46","len":220},"creator":{"type":"PAR 2.0\u0000Creator","md5":"7bb60d3f320fb55987f5df188a263775","len":104}}],"5":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"4e9db1f3183bfee308f5b0916d01bf00","len":4194372},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"df2509789f3dc55df1987f2f92bb3fb7","len":4194372},"main":{"type":"PAR 2.0\u0000Main","md5":"dda74ad77c703c3b6bd3238d9609f4e6","len":92},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"5561f26c5da0d8aa979e4d20f219c2fc","len":4194372},"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"1f4d785c8c4191bb867e53d6153f1a9b","len":4194372},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"e826d9027b81b9c644a4846cc492e075","len":132},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"1e992b76a71d37375b85f658b0219484","len":4194372},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000IFSC","md5":"1e2014a0ae283f00fe92273afb399c27","len":400},"recovery5":{"type":"PAR 2.0\u0000RecvSlic","md5":"7acd330e30a81a3b06466db8d173cbbd","len":4194372},"recovery6":{"type":"PAR 
2.0\u0000RecvSlic","md5":"8c573ac66702d2f291fb274aa0807f60","len":4194372},"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"e20112cc349e06f57cf69a4a4641d92d","len":4194372},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"88e4c154c27d77d92e628957cb75aa16","len":4194372},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"88f0286f334757303fd9e07740173ac5","len":4194372},"recovery10":{"type":"PAR 2.0\u0000RecvSlic","md5":"97eea69ad75b5e84584002f9bec4f3be","len":4194372},"recovery11":{"type":"PAR 2.0\u0000RecvSlic","md5":"a7ce99ec586dfaaec53f5c23bd6f11c7","len":4194372},"recovery12":{"type":"PAR 2.0\u0000RecvSlic","md5":"baf2a0ffaf51aa6c6c79fb9ccbadfec4","len":4194372},"recovery13":{"type":"PAR 2.0\u0000RecvSlic","md5":"a557d8b87d5fe7c931f0f842c6bf0442","len":4194372},"recovery14":{"type":"PAR 2.0\u0000RecvSlic","md5":"8847c3bde0e6e789f862fb4bcb554554","len":4194372},"recovery15":{"type":"PAR 2.0\u0000RecvSlic","md5":"cac77778f5f9468c4ac9d08c065bcbc5","len":4194372},"recovery16":{"type":"PAR 2.0\u0000RecvSlic","md5":"178ad22283c6e76c44831fa5b25f13af","len":4194372},"recovery17":{"type":"PAR 2.0\u0000RecvSlic","md5":"6776395d84a158b5b25fa69a7b9463fc","len":4194372},"recovery18":{"type":"PAR 2.0\u0000RecvSlic","md5":"290f0106b684c7b0efcc4bc7b86ecd0a","len":4194372},"recovery19":{"type":"PAR 2.0\u0000RecvSlic","md5":"9a5f230aa2dbf521be5ec054a8400452","len":4194372},"recovery20":{"type":"PAR 2.0\u0000RecvSlic","md5":"2470711c92d349f1163b397692d7b330","len":4194372},"recovery21":{"type":"PAR 2.0\u0000RecvSlic","md5":"8103d5f2ff25dd792bdbb177a3fe46ae","len":4194372},"recovery22":{"type":"PAR 2.0\u0000RecvSlic","md5":"f52ffd659931f247f43ffdd0f11caf93","len":4194372},"recovery23":{"type":"PAR 2.0\u0000RecvSlic","md5":"e8270285b6c635e60c4b4dee42bf05c1","len":4194372},"creator":{"type":"PAR 2.0\u0000Creator","md5":"16cbf752046052fdc30197e209799d64","len":104}},{"main":{"type":"PAR 
2.0\u0000Main","md5":"dda74ad77c703c3b6bd3238d9609f4e6","len":92},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"e826d9027b81b9c644a4846cc492e075","len":132},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000IFSC","md5":"1e2014a0ae283f00fe92273afb399c27","len":400},"creator":{"type":"PAR 2.0\u0000Creator","md5":"16cbf752046052fdc30197e209799d64","len":104}}],"6":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"5ec1b43947a219587684fbf415f9b610","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"fee51ffcec529f55044d9012eec50f04","len":124},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"f6a13ffd2f75ce6c75ac4028baaf39ca","len":1048644},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"a0c91ed55a96152ad69b4392d466fe5d","len":132},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"70205e891ea5cf723375e8b710a10678","len":1048644},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"3d1587e90e9ef06582e1ab54f2c4e730","len":132},"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"2436f22b6d754e6ea2c20fefa2600cc5","len":1048644},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"77aa638c222a72c7910c80366ba93fa4","len":132},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"bc1f6c6d31a8e0ac1da75822406a8183","len":1048644},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000IFSC","md5":"9740a7da4885f568a5ecb92f6c40b542","len":1360},"recovery5":{"type":"PAR 2.0\u0000RecvSlic","md5":"2c9f5e668b354ba002563567c415410a","len":1048644},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"dae559f4ad022bce1bd7e565562d27fb","len":100},"recovery6":{"type":"PAR 2.0\u0000RecvSlic","md5":"4c36084e21c5aa8043a9af0bbe2f40b1","len":1048644},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"a33898e46cecf57b80666d9a75e2ca98","len":100},"recovery7":{"type":"PAR 
2.0\u0000RecvSlic","md5":"0e0a6b21b13b40d970bbf3d68cef8749","len":1048644},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"923493bb8645068ce59031656a6a81be","len":1048644},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"24f3affd0a3c433979e520fe164af3a2","len":1048644},"recovery10":{"type":"PAR 2.0\u0000RecvSlic","md5":"2007e274e35d6592233a68a944ed564d","len":1048644},"recovery11":{"type":"PAR 2.0\u0000RecvSlic","md5":"4dad701e5314dbf13b703a0f2bf1b462","len":1048644},"recovery12":{"type":"PAR 2.0\u0000RecvSlic","md5":"0a7398c38a1927b36ec9fc4974dc0c6a","len":1048644},"recovery13":{"type":"PAR 2.0\u0000RecvSlic","md5":"ee398a2258c8e389ba78b782cb0bcc67","len":1048644},"recovery14":{"type":"PAR 2.0\u0000RecvSlic","md5":"b0f51d8bdeb8d22405eb473efae46343","len":1048644},"recovery15":{"type":"PAR 2.0\u0000RecvSlic","md5":"b276b7929c15f1ae6cf6d40e8376d4e1","len":1048644},"recovery16":{"type":"PAR 2.0\u0000RecvSlic","md5":"075682015f58a65d763dbc9f0ee513e2","len":1048644},"recovery17":{"type":"PAR 2.0\u0000RecvSlic","md5":"ce527a46c4740a6061dfdc4341b75ef8","len":1048644},"recovery18":{"type":"PAR 2.0\u0000RecvSlic","md5":"6cae9d1f1b7a5fe33e797d2f06a7fbf9","len":1048644},"recovery19":{"type":"PAR 2.0\u0000RecvSlic","md5":"518201534e884cd9aca3371613be161f","len":1048644},"recovery20":{"type":"PAR 2.0\u0000RecvSlic","md5":"468de55acbe2330f7d410c208c9d74c7","len":1048644},"recovery21":{"type":"PAR 2.0\u0000RecvSlic","md5":"11c5e20152902ec61007121531fcd640","len":1048644},"recovery22":{"type":"PAR 2.0\u0000RecvSlic","md5":"d98a1d2fadc180e15253cb38e1a63cb8","len":1048644},"recovery23":{"type":"PAR 2.0\u0000RecvSlic","md5":"23d0dabf638bbd3296eff194897c5d7c","len":1048644},"recovery24":{"type":"PAR 2.0\u0000RecvSlic","md5":"fdaca43a4141a1d49ba22108b8811f31","len":1048644},"recovery25":{"type":"PAR 2.0\u0000RecvSlic","md5":"57851eaa8bca48269261f4b6298e2e4f","len":1048644},"recovery26":{"type":"PAR 
2.0\u0000RecvSlic","md5":"b8be0336b8787be4fa86041f9d38ca6d","len":1048644},"recovery27":{"type":"PAR 2.0\u0000RecvSlic","md5":"312b9e44906da959cf77f49e04f52095","len":1048644},"recovery28":{"type":"PAR 2.0\u0000RecvSlic","md5":"d268da0938bdcd1210126898b124188e","len":1048644},"recovery29":{"type":"PAR 2.0\u0000RecvSlic","md5":"5b66e1272d272cefd8608856755ffb19","len":1048644},"recovery30":{"type":"PAR 2.0\u0000RecvSlic","md5":"b5f1a350dd4c18d3c9d8680d084abaea","len":1048644},"recovery31":{"type":"PAR 2.0\u0000RecvSlic","md5":"b4e0b201fb86d09bbe9cf86903c17a82","len":1048644},"recovery32":{"type":"PAR 2.0\u0000RecvSlic","md5":"a022d11b27ad0ca6beccb0e9aa1579b3","len":1048644},"recovery33":{"type":"PAR 2.0\u0000RecvSlic","md5":"0de6017fd8d80d67e780eb679b9318f1","len":1048644},"recovery34":{"type":"PAR 2.0\u0000RecvSlic","md5":"0e52bc14774ca31fa04edd8c5e9cd66f","len":1048644},"recovery35":{"type":"PAR 2.0\u0000RecvSlic","md5":"50218f80d454d5c8e3c61698a5c500fb","len":1048644},"recovery36":{"type":"PAR 2.0\u0000RecvSlic","md5":"e85705a19cca00706effad30d2e75c07","len":1048644},"recovery37":{"type":"PAR 2.0\u0000RecvSlic","md5":"e3c10df5b6c5154300f69f7027b48b62","len":1048644},"recovery38":{"type":"PAR 2.0\u0000RecvSlic","md5":"e7a98e885bb4f2393f8c1b560f2a56ab","len":1048644},"recovery39":{"type":"PAR 2.0\u0000RecvSlic","md5":"602321ff1c2be2949f99be571fc7e6f1","len":1048644},"creator":{"type":"PAR 2.0\u0000Creator","md5":"ee0f6c5886993df8609c98d12aacc444","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"fee51ffcec529f55044d9012eec50f04","len":124},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"a0c91ed55a96152ad69b4392d466fe5d","len":132},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"3d1587e90e9ef06582e1ab54f2c4e730","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"77aa638c222a72c7910c80366ba93fa4","len":132},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 
2.0\u0000IFSC","md5":"9740a7da4885f568a5ecb92f6c40b542","len":1360},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"dae559f4ad022bce1bd7e565562d27fb","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"a33898e46cecf57b80666d9a75e2ca98","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"ee0f6c5886993df8609c98d12aacc444","len":104}}],"7":[{"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"0b005924c173f828f388d03710c8bfcf","len":12292},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"10ee04a4d28a19d6c0e88d58515e1ef1","len":12292},"main":{"type":"PAR 2.0\u0000Main","md5":"5e7183ec1456a6f15e072fed0f78b9ca","len":140},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"c6c047f94fcc5a2d6d0dbafe7e633c39","len":12292},"recovery10":{"type":"PAR 2.0\u0000RecvSlic","md5":"a03a9d24991dfd875d3ec30272366de5","len":12292},"desc76d7f11d6b1621aaecd3d74b1442be16":{"type":"PAR 2.0\u0000FileDesc","md5":"a2f96fdc0a29fc04ca17e31c20e6fc3f","len":132},"recovery11":{"type":"PAR 2.0\u0000RecvSlic","md5":"1369390495de64f6640e5b8e7be37f40","len":12292},"recovery12":{"type":"PAR 2.0\u0000RecvSlic","md5":"4622df73a1e90846931c16c2209768c9","len":12292},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"c4ee087ce65d3beb7eeeaa194ab7bef8","len":132},"recovery13":{"type":"PAR 2.0\u0000RecvSlic","md5":"6434dffa3a3e653035ec03b1e371d564","len":12292},"recovery14":{"type":"PAR 2.0\u0000RecvSlic","md5":"f5df0e311c0a7cd67535d75dee491d87","len":12292},"desc148b12e16fb3ba135f591cbad669b476":{"type":"PAR 2.0\u0000FileDesc","md5":"f11c6620a21f01950727de8dad1687f6","len":132},"recovery15":{"type":"PAR 2.0\u0000RecvSlic","md5":"efb7ce56dcc860ae904aeaeae59a2ece","len":12292},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"dff92dc77555b6085e02423f106c4fd5","len":132},"recovery16":{"type":"PAR 2.0\u0000RecvSlic","md5":"19a8eea83a8907a69f069f455550d359","len":12292},"recovery17":{"type":"PAR 
2.0\u0000RecvSlic","md5":"1a8e498c90635320fb4c3a88a0189665","len":12292},"ifsc76d7f11d6b1621aaecd3d74b1442be16":{"type":"PAR 2.0\u0000IFSC","md5":"f6aa8895439563d682fb9fe2df0e1c9b","len":200},"recovery18":{"type":"PAR 2.0\u0000RecvSlic","md5":"4a2b82f1404d9f2c83883ef6a43e6a81","len":12292},"recovery19":{"type":"PAR 2.0\u0000RecvSlic","md5":"d409a0d5436982476d0a3b9bf19e3c8d","len":12292},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"1ffce628da93d13e375591ff67bc654b","len":100},"recovery20":{"type":"PAR 2.0\u0000RecvSlic","md5":"b46ab703bca1431991ca000043f84ea2","len":12292},"recovery21":{"type":"PAR 2.0\u0000RecvSlic","md5":"21cb7408d0ba065287cd8e4053d61063","len":12292},"ifsc148b12e16fb3ba135f591cbad669b476":{"type":"PAR 2.0\u0000IFSC","md5":"0933688bb6983474179824e77f29b250","len":22400},"recovery22":{"type":"PAR 2.0\u0000RecvSlic","md5":"6b94f8141601721b6f240d521fc22a56","len":12292},"recovery23":{"type":"PAR 2.0\u0000RecvSlic","md5":"6a4508896659f168b3e1fef517fb636e","len":12292},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"21252c20b9aed4d71890b7d5854ccdb2","len":100},"recovery24":{"type":"PAR 2.0\u0000RecvSlic","md5":"05880fb4f3e515af747090399210bbed","len":12292},"recovery25":{"type":"PAR 2.0\u0000RecvSlic","md5":"efacb826b39b78dd72e73ea4554edda1","len":12292},"recovery26":{"type":"PAR 2.0\u0000RecvSlic","md5":"623f9c56d059a1e578f4625322980c5e","len":12292},"recovery27":{"type":"PAR 2.0\u0000RecvSlic","md5":"aa90ff8cd4a601e4b4cf20e61b35d083","len":12292},"recovery28":{"type":"PAR 2.0\u0000RecvSlic","md5":"fb4c6a48ffe06cd2d77a61f1bee63667","len":12292},"recovery29":{"type":"PAR 2.0\u0000RecvSlic","md5":"375cd39fdab776888186847c949f43f0","len":12292},"recovery30":{"type":"PAR 2.0\u0000RecvSlic","md5":"52006dc0cdbd3b368237437c602aac6c","len":12292},"recovery31":{"type":"PAR 2.0\u0000RecvSlic","md5":"e150767a0391b4fda00804f29eff96b7","len":12292},"recovery32":{"type":"PAR 
2.0\u0000RecvSlic","md5":"4509170ce60b05ed4a7acc5d585db77e","len":12292},"recovery33":{"type":"PAR 2.0\u0000RecvSlic","md5":"0e593087e7c9fae63066b8731a1f0ee1","len":12292},"recovery34":{"type":"PAR 2.0\u0000RecvSlic","md5":"b1a80cefac3d5afd7fe58a1b344339ae","len":12292},"recovery35":{"type":"PAR 2.0\u0000RecvSlic","md5":"d9065479d517927ec148b2519a09b08c","len":12292},"recovery36":{"type":"PAR 2.0\u0000RecvSlic","md5":"82b9dcd7ccd15be338307666bd270064","len":12292},"recovery37":{"type":"PAR 2.0\u0000RecvSlic","md5":"8ac4426e3cb2cff66b57ec356202890a","len":12292},"recovery38":{"type":"PAR 2.0\u0000RecvSlic","md5":"d652a19d4c7dcfff936c634aa089fba8","len":12292},"recovery39":{"type":"PAR 2.0\u0000RecvSlic","md5":"5b05c818bc77bda7fa2e7f51798b977c","len":12292},"recovery40":{"type":"PAR 2.0\u0000RecvSlic","md5":"3217f0a91b66e3602c91723346a00d8f","len":12292},"recovery41":{"type":"PAR 2.0\u0000RecvSlic","md5":"6046ec1511827d2425f3ce4cdcabd937","len":12292},"recovery42":{"type":"PAR 2.0\u0000RecvSlic","md5":"53a6a420f0b16ada5b75efddd4ebe79c","len":12292},"recovery43":{"type":"PAR 2.0\u0000RecvSlic","md5":"914df5e88258380e9efab604addaf6ca","len":12292},"recovery44":{"type":"PAR 2.0\u0000RecvSlic","md5":"9a326bf24356ab8cb9585783e8344cff","len":12292},"recovery45":{"type":"PAR 2.0\u0000RecvSlic","md5":"db20b5be20b70a83a3f8cafceaa4c330","len":12292},"recovery46":{"type":"PAR 2.0\u0000RecvSlic","md5":"f028d797702c7bff3e83458be486d555","len":12292},"recovery47":{"type":"PAR 2.0\u0000RecvSlic","md5":"5ee1623a1d7aa875f656b6c4775bb8f9","len":12292},"recovery48":{"type":"PAR 2.0\u0000RecvSlic","md5":"ef1528e5ed4152c56b89a82bd3828dde","len":12292},"recovery49":{"type":"PAR 2.0\u0000RecvSlic","md5":"01b9506a11bf8eec8a0ff39231c6bd0c","len":12292},"recovery50":{"type":"PAR 2.0\u0000RecvSlic","md5":"f3ed5f3d7303078d06e3db993b095be1","len":12292},"recovery51":{"type":"PAR 2.0\u0000RecvSlic","md5":"85d7f8cc42b4db916247c4ae3d2131b3","len":12292},"recovery52":{"type":"PAR 
2.0\u0000RecvSlic","md5":"85a61ece2caede278f5843acb51dde24","len":12292},"recovery53":{"type":"PAR 2.0\u0000RecvSlic","md5":"7298e77277cd28ea5d03ecb7a8bf4e4b","len":12292},"recovery54":{"type":"PAR 2.0\u0000RecvSlic","md5":"cdaffa6e46f13946f6be00cbae471a80","len":12292},"recovery55":{"type":"PAR 2.0\u0000RecvSlic","md5":"a3dd9157d4fca3de8e070c3c5cb9d40a","len":12292},"recovery56":{"type":"PAR 2.0\u0000RecvSlic","md5":"432d6bd89e4ed25d63ca7d7977503d45","len":12292},"recovery57":{"type":"PAR 2.0\u0000RecvSlic","md5":"fd5c47a8c4e7571b90d1c7f26a498226","len":12292},"recovery58":{"type":"PAR 2.0\u0000RecvSlic","md5":"4450fd57ec66606c74bc11073f4cc585","len":12292},"recovery59":{"type":"PAR 2.0\u0000RecvSlic","md5":"c0f77d7ff4bd61b8ab8f76db2d97d95c","len":12292},"recovery60":{"type":"PAR 2.0\u0000RecvSlic","md5":"bea6fe996d8e248a3765402e7d4d8d8a","len":12292},"recovery61":{"type":"PAR 2.0\u0000RecvSlic","md5":"04d07195645d6a762d2073bdcd47222b","len":12292},"recovery62":{"type":"PAR 2.0\u0000RecvSlic","md5":"5311502407c1f6897d2d3fce2a95a64e","len":12292},"recovery63":{"type":"PAR 2.0\u0000RecvSlic","md5":"a18929e0350f8e2fb17116074acc713d","len":12292},"recovery64":{"type":"PAR 2.0\u0000RecvSlic","md5":"ae658203da57104cd100410480b1136f","len":12292},"recovery65":{"type":"PAR 2.0\u0000RecvSlic","md5":"550aa49e56358209671fde8c7d344bd2","len":12292},"recovery66":{"type":"PAR 2.0\u0000RecvSlic","md5":"ee727c6ba693c86d5d8c8fbdcc440f6d","len":12292},"recovery67":{"type":"PAR 2.0\u0000RecvSlic","md5":"d996d134917f84d902dc9c82af8c733b","len":12292},"recovery68":{"type":"PAR 2.0\u0000RecvSlic","md5":"643ad681de2963723f4456020b065f6a","len":12292},"recovery69":{"type":"PAR 2.0\u0000RecvSlic","md5":"2fea3ec825ac290c913f438be3a01ff8","len":12292},"recovery70":{"type":"PAR 2.0\u0000RecvSlic","md5":"aefdfd2895d3e4a0fbd94988e6bfa868","len":12292},"recovery71":{"type":"PAR 2.0\u0000RecvSlic","md5":"3154e06a28cc37addd261c3c6de5a3af","len":12292},"recovery72":{"type":"PAR 
2.0\u0000RecvSlic","md5":"2a85acb9643df46c1627cc1a8b1bad59","len":12292},"recovery73":{"type":"PAR 2.0\u0000RecvSlic","md5":"20d3d32dea37223ccd90bb2d7b08c2cf","len":12292},"recovery74":{"type":"PAR 2.0\u0000RecvSlic","md5":"f2411605c2db3293b7a2d2d62494e958","len":12292},"recovery75":{"type":"PAR 2.0\u0000RecvSlic","md5":"3c890fbfb07da5b73a041d7322144348","len":12292},"recovery76":{"type":"PAR 2.0\u0000RecvSlic","md5":"852c9e4f9c86e2233998598df7ab330c","len":12292},"recovery77":{"type":"PAR 2.0\u0000RecvSlic","md5":"7737ca9add281e4f3ab131d174401010","len":12292},"recovery78":{"type":"PAR 2.0\u0000RecvSlic","md5":"7226ad8c693edc67a61f42d8ebb7a96a","len":12292},"recovery79":{"type":"PAR 2.0\u0000RecvSlic","md5":"91f51723204f5bf8544e89bd0bd30c13","len":12292},"recovery80":{"type":"PAR 2.0\u0000RecvSlic","md5":"d6a172d61a9a686a31985e1ae45e3f13","len":12292},"recovery81":{"type":"PAR 2.0\u0000RecvSlic","md5":"3454ea5ce87263b0d63af8d70af18988","len":12292},"recovery82":{"type":"PAR 2.0\u0000RecvSlic","md5":"870191c25f0f1aa43e69e5b7ebbf71ff","len":12292},"recovery83":{"type":"PAR 2.0\u0000RecvSlic","md5":"08cd7f192ee853ab5fef5d8930a06ede","len":12292},"recovery84":{"type":"PAR 2.0\u0000RecvSlic","md5":"a651ca28bf96b5ee3ffdd6e58db168f7","len":12292},"recovery85":{"type":"PAR 2.0\u0000RecvSlic","md5":"802c3adbf700cdc78b7d20ac503a3f04","len":12292},"recovery86":{"type":"PAR 2.0\u0000RecvSlic","md5":"0071f0ac02f83574e307b83a6c34c2c0","len":12292},"recovery87":{"type":"PAR 2.0\u0000RecvSlic","md5":"1a5656df4b21663954a9ef6a93895a64","len":12292},"recovery88":{"type":"PAR 2.0\u0000RecvSlic","md5":"317a089ebb628a02e47626fabb66c757","len":12292},"recovery89":{"type":"PAR 2.0\u0000RecvSlic","md5":"15c639f76ca5bf5bad1cebd95e575c41","len":12292},"recovery90":{"type":"PAR 2.0\u0000RecvSlic","md5":"682bc4e6f85516b926ac797bef70810c","len":12292},"recovery91":{"type":"PAR 2.0\u0000RecvSlic","md5":"7864ca2aa50b2bd02fad16497cf611bc","len":12292},"recovery92":{"type":"PAR 
2.0\u0000RecvSlic","md5":"a781789f8de0cf9698b353ff9f316e8d","len":12292},"recovery93":{"type":"PAR 2.0\u0000RecvSlic","md5":"c184199949b8cf8a9c7ab1edee2ed8bc","len":12292},"recovery94":{"type":"PAR 2.0\u0000RecvSlic","md5":"3808131bb9c9e8a9265bc2a4fc3e4630","len":12292},"recovery95":{"type":"PAR 2.0\u0000RecvSlic","md5":"8fd9bdcc8c75efc129366221c931486d","len":12292},"recovery96":{"type":"PAR 2.0\u0000RecvSlic","md5":"b2b69333769ed73d2725830c5f7c4fa2","len":12292},"recovery97":{"type":"PAR 2.0\u0000RecvSlic","md5":"d126a7d127450fbc98bf683e4e6a9607","len":12292},"recovery98":{"type":"PAR 2.0\u0000RecvSlic","md5":"5df32336acf571e882b0a67c0cafd8ea","len":12292},"recovery99":{"type":"PAR 2.0\u0000RecvSlic","md5":"6a1cd80fe0422faa74deb2040ad8512d","len":12292},"recovery100":{"type":"PAR 2.0\u0000RecvSlic","md5":"2552769b53f92e48a2543d1f2b3136b4","len":12292},"recovery101":{"type":"PAR 2.0\u0000RecvSlic","md5":"c6ae6ae2dd2d59a93cdc83ec8e1a62f8","len":12292},"recovery102":{"type":"PAR 2.0\u0000RecvSlic","md5":"fae1d02640a88a5b4850932f209107db","len":12292},"recovery103":{"type":"PAR 2.0\u0000RecvSlic","md5":"bef5f91e22020b07dba8aa3745868e12","len":12292},"recovery104":{"type":"PAR 2.0\u0000RecvSlic","md5":"315adc56c4394db656591b6d57ec644b","len":12292},"recovery105":{"type":"PAR 2.0\u0000RecvSlic","md5":"c3fa310ab903ce125624e0fe1e4ca288","len":12292},"recovery106":{"type":"PAR 2.0\u0000RecvSlic","md5":"5dabc0bf499aacfbd85306b0e66d835f","len":12292},"recovery107":{"type":"PAR 2.0\u0000RecvSlic","md5":"15c7ec5674b88d1e3a7738b2c4b82699","len":12292},"recovery108":{"type":"PAR 2.0\u0000RecvSlic","md5":"4ec8e38dcc549f86442d8598c7810114","len":12292},"recovery109":{"type":"PAR 2.0\u0000RecvSlic","md5":"a3c4c001b720eb41feb7fbb5c96c76c0","len":12292},"recovery110":{"type":"PAR 2.0\u0000RecvSlic","md5":"975ac885338b2b4cc1d448f75c84cddc","len":12292},"recovery111":{"type":"PAR 2.0\u0000RecvSlic","md5":"92d2c61159fdb633f95d57bc41a5930f","len":12292},"recovery112":{"type":"PAR 
2.0\u0000RecvSlic","md5":"57aa0f0b879ad7659452a0e5b1a99922","len":12292},"recovery113":{"type":"PAR 2.0\u0000RecvSlic","md5":"92eb3c634d9da1396cf3f14c49724a9f","len":12292},"recovery114":{"type":"PAR 2.0\u0000RecvSlic","md5":"5eb6738f6b97fbca8d990c125cabdbee","len":12292},"recovery115":{"type":"PAR 2.0\u0000RecvSlic","md5":"14cb0db32c4c2ab5c0b111063d7f78fd","len":12292},"recovery116":{"type":"PAR 2.0\u0000RecvSlic","md5":"1f1bfb1b31c59b76501e3c5bf0c4ac6e","len":12292},"recovery117":{"type":"PAR 2.0\u0000RecvSlic","md5":"11dce209195d56a10cf47d1fa358bd17","len":12292},"recovery118":{"type":"PAR 2.0\u0000RecvSlic","md5":"be401876f735ef36bc79be2624cf55e8","len":12292},"recovery119":{"type":"PAR 2.0\u0000RecvSlic","md5":"ac561be858d730b458a89417fc8cd472","len":12292},"creator":{"type":"PAR 2.0\u0000Creator","md5":"33e747ebfbcf20640dc2c135668fedad","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"5e7183ec1456a6f15e072fed0f78b9ca","len":140},"desc76d7f11d6b1621aaecd3d74b1442be16":{"type":"PAR 2.0\u0000FileDesc","md5":"a2f96fdc0a29fc04ca17e31c20e6fc3f","len":132},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"c4ee087ce65d3beb7eeeaa194ab7bef8","len":132},"desc148b12e16fb3ba135f591cbad669b476":{"type":"PAR 2.0\u0000FileDesc","md5":"f11c6620a21f01950727de8dad1687f6","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"dff92dc77555b6085e02423f106c4fd5","len":132},"ifsc76d7f11d6b1621aaecd3d74b1442be16":{"type":"PAR 2.0\u0000IFSC","md5":"f6aa8895439563d682fb9fe2df0e1c9b","len":200},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"1ffce628da93d13e375591ff67bc654b","len":100},"ifsc148b12e16fb3ba135f591cbad669b476":{"type":"PAR 2.0\u0000IFSC","md5":"0933688bb6983474179824e77f29b250","len":22400},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"21252c20b9aed4d71890b7d5854ccdb2","len":100},"creator":{"type":"PAR 
2.0\u0000Creator","md5":"33e747ebfbcf20640dc2c135668fedad","len":104}}],"8":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"8f490677c7370c81405b5dc720de5259","len":76},"main":{"type":"PAR 2.0\u0000Main","md5":"e3dcbbad791c824b08deb1c570891096","len":108},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"b00de7d8184b9282d2fab59a09f4317a","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"3f2544814d4dae3cd3bc0c89602e85f9","len":132},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"d451ff5d82856339d2b7e616128d51e5","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"5425f856110771270c05e595c89172fb","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"af1d8653ae55de4ace006367e6339891","len":104}},{"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"ab4b40f2f8c79be1b72fc3c297034e18","len":76},"main":{"type":"PAR 2.0\u0000Main","md5":"e3dcbbad791c824b08deb1c570891096","len":108},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"b00de7d8184b9282d2fab59a09f4317a","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"3f2544814d4dae3cd3bc0c89602e85f9","len":132},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"d451ff5d82856339d2b7e616128d51e5","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"5425f856110771270c05e595c89172fb","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"af1d8653ae55de4ace006367e6339891","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"e3dcbbad791c824b08deb1c570891096","len":108},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"b00de7d8184b9282d2fab59a09f4317a","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"3f2544814d4dae3cd3bc0c89602e85f9","len":132},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 
2.0\u0000IFSC","md5":"d451ff5d82856339d2b7e616128d51e5","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"5425f856110771270c05e595c89172fb","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"af1d8653ae55de4ace006367e6339891","len":104}}],"9":[{"main":{"type":"PAR 2.0\u0000Main","md5":"d2e9f5f81e8780b703db895c249cbd68","len":92},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"4a2d665a1e6879cd1b9d04025a7a5f80","len":132},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"59a78d4061c80c5b686bc85f1ae871e1","len":120},"creator":{"type":"PAR 2.0\u0000Creator","md5":"aef6213477d5a93d8932e106b971214e","len":104}}],"10":[{"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"c98eac33ebcacab03a93c097875db85f","len":16777284},"main":{"type":"PAR 2.0\u0000Main","md5":"fa4e9795952daa58ec0c79f3bb486daa","len":124},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"1c0f2b78a1f547f026c817dcb3f78cfa","len":132},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"292f79fd2bede637da2b4a771c3e43de","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"46018ec469e1cdd39cffa769c532d229","len":132},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000IFSC","md5":"3e007505ad34d9349ff9ec322bb276b1","len":160},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"bc61781d8a0b4d7835b0b7af7bc5a0b3","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"d2aea9e97389c23fb277e75ebae5789f","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"67e37e25db5bd8526d39916e2af81195","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"fa4e9795952daa58ec0c79f3bb486daa","len":124},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"1c0f2b78a1f547f026c817dcb3f78cfa","len":132},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 
2.0\u0000FileDesc","md5":"292f79fd2bede637da2b4a771c3e43de","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"46018ec469e1cdd39cffa769c532d229","len":132},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000IFSC","md5":"3e007505ad34d9349ff9ec322bb276b1","len":160},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"bc61781d8a0b4d7835b0b7af7bc5a0b3","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"d2aea9e97389c23fb277e75ebae5789f","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"67e37e25db5bd8526d39916e2af81195","len":104}}],"11":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"27492ff0236d2b9f708c20c9037bb477","len":1048644},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"ca610c37276b236032385de6cfe31ad1","len":1048644},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"681a77c9df99be565a1d80059faa7849","len":1048644},"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"869049b0db80bd673dc2e216b1bb558c","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"2014ba94366054b731ce6b7a09be65bc","len":92},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"e63821530a0ee31edbbeb1ea7b34d6eb","len":1048644},"recovery5":{"type":"PAR 2.0\u0000RecvSlic","md5":"40943184042218658f4b5f2c42865457","len":1048644},"recovery6":{"type":"PAR 2.0\u0000RecvSlic","md5":"d4a6a758a335e861e4653dbd042bb0e9","len":1048644},"desc148b12e16fb3ba135f591cbad669b476":{"type":"PAR 2.0\u0000FileDesc","md5":"f1590ab293fde02149689a0c428cf36a","len":132},"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"ebfa4c4d38ea7ba2b730b09b05125a9f","len":1048644},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"d8acf1a902681c73f2410df6f973ff19","len":1048644},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"05e99361f97158fdfb083898b7b1fee4","len":1048644},"ifsc148b12e16fb3ba135f591cbad669b476":{"type":"PAR 
2.0\u0000IFSC","md5":"3fcb592bb781bd4f2dacafa15e64c282","len":340},"recovery10":{"type":"PAR 2.0\u0000RecvSlic","md5":"64705b161c0f222f3979de574c5292e5","len":1048644},"recovery11":{"type":"PAR 2.0\u0000RecvSlic","md5":"6661d09aa0869100e4c7ac009d732f22","len":1048644},"recovery12":{"type":"PAR 2.0\u0000RecvSlic","md5":"662fcae4ec9e8f216ed800402a86a19f","len":1048644},"recovery13":{"type":"PAR 2.0\u0000RecvSlic","md5":"1aeb37755b286c499e72f16d6b2997e3","len":1048644},"recovery14":{"type":"PAR 2.0\u0000RecvSlic","md5":"c1a04b4cb237cf7d2fcf96f65fa3707a","len":1048644},"recovery15":{"type":"PAR 2.0\u0000RecvSlic","md5":"c502f0e383a4913adf41bddc47188fa5","len":1048644},"recovery16":{"type":"PAR 2.0\u0000RecvSlic","md5":"f79753d9573717cb313ad28d49b58957","len":1048644},"recovery17":{"type":"PAR 2.0\u0000RecvSlic","md5":"6718a475be7d48d4505a816e607ac0cf","len":1048644},"recovery18":{"type":"PAR 2.0\u0000RecvSlic","md5":"b5e08b73c0a116a186200ceb5eee12e1","len":1048644},"recovery19":{"type":"PAR 2.0\u0000RecvSlic","md5":"26f49122ec0c65aaeee91f02fe3b298a","len":1048644},"recovery20":{"type":"PAR 2.0\u0000RecvSlic","md5":"f5cc151476af228e309246f1135a868c","len":1048644},"recovery21":{"type":"PAR 2.0\u0000RecvSlic","md5":"287dd184771362431860f2e79abf6b71","len":1048644},"recovery22":{"type":"PAR 2.0\u0000RecvSlic","md5":"15103320ebc4a1d287809b3f550cf466","len":1048644},"recovery23":{"type":"PAR 2.0\u0000RecvSlic","md5":"8a1b9c9e29c02f11e7efe347238dce56","len":1048644},"recovery24":{"type":"PAR 2.0\u0000RecvSlic","md5":"5bc916db93b8499680444eaa8922c71a","len":1048644},"recovery25":{"type":"PAR 2.0\u0000RecvSlic","md5":"81e31a0b3c5f6e52ebef05ac7ebf05ef","len":1048644},"recovery26":{"type":"PAR 2.0\u0000RecvSlic","md5":"894270cb9860bf959b94503998c2aa45","len":1048644},"recovery27":{"type":"PAR 2.0\u0000RecvSlic","md5":"ecd365739fa43e31fb3b8ad4a0c702aa","len":1048644},"recovery28":{"type":"PAR 
2.0\u0000RecvSlic","md5":"e9839ce30516291840c500c62c54c7f3","len":1048644},"recovery29":{"type":"PAR 2.0\u0000RecvSlic","md5":"6e779c716007d2e2e5c13206b61ed242","len":1048644},"recovery30":{"type":"PAR 2.0\u0000RecvSlic","md5":"0b7edfa25e88e2d693acbc5c4ee7e3ab","len":1048644},"recovery31":{"type":"PAR 2.0\u0000RecvSlic","md5":"1ec91f3d8dbb68e15ee288967d4bda70","len":1048644},"recovery32":{"type":"PAR 2.0\u0000RecvSlic","md5":"069d8f9fc22a257a3087528f5f0d3e4b","len":1048644},"recovery33":{"type":"PAR 2.0\u0000RecvSlic","md5":"7ff092dc292066a6f16ebff3abcddaec","len":1048644},"recovery34":{"type":"PAR 2.0\u0000RecvSlic","md5":"e8396bbe2e1c7cd808a0c19b9e44366d","len":1048644},"recovery35":{"type":"PAR 2.0\u0000RecvSlic","md5":"49c529ecd0b1fa399ae7ab878560c92c","len":1048644},"recovery36":{"type":"PAR 2.0\u0000RecvSlic","md5":"8bebdbd3d808c72bb1dff32e304797da","len":1048644},"recovery37":{"type":"PAR 2.0\u0000RecvSlic","md5":"b5ba7fc114a2f51d4ca31eab75df5967","len":1048644},"recovery38":{"type":"PAR 2.0\u0000RecvSlic","md5":"90f2adae0315250cd0b3e2e061e145eb","len":1048644},"recovery39":{"type":"PAR 2.0\u0000RecvSlic","md5":"36fec8721806963a50db398bfe5d3530","len":1048644},"recovery40":{"type":"PAR 2.0\u0000RecvSlic","md5":"f7037497d34f5aaf8c80f390c3162a34","len":1048644},"recovery41":{"type":"PAR 2.0\u0000RecvSlic","md5":"c68e28d6e559c377c2d641d465e04c74","len":1048644},"recovery42":{"type":"PAR 2.0\u0000RecvSlic","md5":"c1c7deb85b51c16ff84a3b8b0f153db2","len":1048644},"recovery43":{"type":"PAR 2.0\u0000RecvSlic","md5":"9c9437f2aaa7eefb38cb9d2d3b5250da","len":1048644},"recovery44":{"type":"PAR 2.0\u0000RecvSlic","md5":"d33f3bc5820245947f80cfbbc9a5650a","len":1048644},"recovery45":{"type":"PAR 2.0\u0000RecvSlic","md5":"f989930092b5e09335b0d1744f5cd020","len":1048644},"recovery46":{"type":"PAR 2.0\u0000RecvSlic","md5":"abcf091241c99df80cf38d263f233b98","len":1048644},"recovery47":{"type":"PAR 
2.0\u0000RecvSlic","md5":"5eb7e2ab745961fb460c6313224164db","len":1048644},"recovery48":{"type":"PAR 2.0\u0000RecvSlic","md5":"f6e1903b08958717ec3f45fb30a6ea18","len":1048644},"recovery49":{"type":"PAR 2.0\u0000RecvSlic","md5":"f7eed30e7e18813e48e9ef3072d1bf06","len":1048644},"recovery50":{"type":"PAR 2.0\u0000RecvSlic","md5":"b6f879be0f4dc7a2a69945b5078b2e2d","len":1048644},"recovery51":{"type":"PAR 2.0\u0000RecvSlic","md5":"15a4aab4c0452968a8ea76e360fab4e4","len":1048644},"recovery52":{"type":"PAR 2.0\u0000RecvSlic","md5":"6427967242a9cb6e6f87e77d9afe8c76","len":1048644},"recovery53":{"type":"PAR 2.0\u0000RecvSlic","md5":"8bdf763300c24964bc063a06c92c0311","len":1048644},"recovery54":{"type":"PAR 2.0\u0000RecvSlic","md5":"e53b551ec383f9a2e5a0869838c5cb62","len":1048644},"recovery55":{"type":"PAR 2.0\u0000RecvSlic","md5":"12f2b1472523b981fddeafa8587fb023","len":1048644},"recovery56":{"type":"PAR 2.0\u0000RecvSlic","md5":"01cae744d4718c396afd6dd2a66ea3cb","len":1048644},"recovery57":{"type":"PAR 2.0\u0000RecvSlic","md5":"7fc330e37834451f80325895fc0c05c2","len":1048644},"recovery58":{"type":"PAR 2.0\u0000RecvSlic","md5":"15000e384e5cc3035964779684a8a63d","len":1048644},"recovery59":{"type":"PAR 2.0\u0000RecvSlic","md5":"4b5584869ec45296ac2ec2ba5020dcbf","len":1048644},"recovery60":{"type":"PAR 2.0\u0000RecvSlic","md5":"21e6bd8d4d894064564b667206c4339b","len":1048644},"recovery61":{"type":"PAR 2.0\u0000RecvSlic","md5":"27b4816b7409ab074398958dc2b30d05","len":1048644},"recovery62":{"type":"PAR 2.0\u0000RecvSlic","md5":"9418ecd6483c380158ebe517c994d0f2","len":1048644},"recovery63":{"type":"PAR 2.0\u0000RecvSlic","md5":"373aeeefecdc31b6b003e673d7ce15c3","len":1048644},"creator":{"type":"PAR 2.0\u0000Creator","md5":"721763935dac2e9935321d25fad86412","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"2014ba94366054b731ce6b7a09be65bc","len":92},"desc148b12e16fb3ba135f591cbad669b476":{"type":"PAR 
2.0\u0000FileDesc","md5":"f1590ab293fde02149689a0c428cf36a","len":132},"ifsc148b12e16fb3ba135f591cbad669b476":{"type":"PAR 2.0\u0000IFSC","md5":"3fcb592bb781bd4f2dacafa15e64c282","len":340},"creator":{"type":"PAR 2.0\u0000Creator","md5":"721763935dac2e9935321d25fad86412","len":104}}],"14":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"2b7b3d05808018d22a6e51d7b4541f31","len":4294967364},"main":{"type":"PAR 2.0\u0000Main","md5":"9e8e86abd4093d642d36088b69fa93c0","len":92},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"bf520a3d185331665df4cefc0d8b2099","len":132},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000IFSC","md5":"6c39df8daf2eec6742268a7ba4d70a68","len":100},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"22f4dae24dcee3451e6d66f05fa8ea04","len":4294967364},"creator":{"type":"PAR 2.0\u0000Creator","md5":"7d88aad2e7641f57678a0c84af8f2ebf","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"9e8e86abd4093d642d36088b69fa93c0","len":92},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"bf520a3d185331665df4cefc0d8b2099","len":132},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000IFSC","md5":"6c39df8daf2eec6742268a7ba4d70a68","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"7d88aad2e7641f57678a0c84af8f2ebf","len":104}}],"18":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"60ca3542a3b114933c184af989c66e52","len":268435528},"main":{"type":"PAR 2.0\u0000Main","md5":"afdda0f6f4c81efa247b9ca7fe8a48dd","len":92},"desc8d1dfea8582c9e7fe47516f0d4b7ccc6":{"type":"PAR 2.0\u0000FileDesc","md5":"15badf8997726cd74a70383d12ee7df4","len":136},"ifsc8d1dfea8582c9e7fe47516f0d4b7ccc6":{"type":"PAR 2.0\u0000IFSC","md5":"3e75633d27da2cc7de064a08c3bc0e89","len":260},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"c3ece8b87332cb09588c073ca9d54df8","len":268435528},"creator":{"type":"PAR 2.0\u0000Creator","md5":"0711905a8aa594280a3f4826abc74dad","len":104}},{"main":{"type":"PAR 
2.0\u0000Main","md5":"afdda0f6f4c81efa247b9ca7fe8a48dd","len":92},"desc8d1dfea8582c9e7fe47516f0d4b7ccc6":{"type":"PAR 2.0\u0000FileDesc","md5":"15badf8997726cd74a70383d12ee7df4","len":136},"ifsc8d1dfea8582c9e7fe47516f0d4b7ccc6":{"type":"PAR 2.0\u0000IFSC","md5":"3e75633d27da2cc7de064a08c3bc0e89","len":260},"creator":{"type":"PAR 2.0\u0000Creator","md5":"0711905a8aa594280a3f4826abc74dad","len":104}}],"20":[{"main":{"type":"PAR 2.0\u0000Main","md5":"4c1bdd66e1fc7f4e74160196fefb3e02","len":92},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"e0835cf9781a1c2f70fa240cb1abf172","len":132},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000IFSC","md5":"cbba72beebff6608daac2e379c2acdf2","len":1360},"creator":{"type":"PAR 2.0\u0000Creator","md5":"e5bacb1af1948e08af87b41e4f067ff7","len":104}}],"21":[{"main":{"type":"PAR 2.0\u0000Main","md5":"ac75848fa5f4490a90eac71937d2ae59","len":92},"desc76d7f11d6b1621aaecd3d74b1442be16":{"type":"PAR 2.0\u0000FileDesc","md5":"0251d0bfed34934dff4d8ba49f6e7fdd","len":132},"ifsc76d7f11d6b1621aaecd3d74b1442be16":{"type":"PAR 2.0\u0000IFSC","md5":"fffa78966785f1d47dfe63bb2b5aaf6f","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"94709bacceb3c6434a9cf90e822d7792","len":104}}],"22":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"81b7b225c9636e2e948798bad4297bc3","len":262212},"main":{"type":"PAR 2.0\u0000Main","md5":"cbff33656e8183081812de0c17dadf53","len":140},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"8eab9660d91a0b7d093c50a66b9355f9","len":132},"desc76d7f11d6b1621aaecd3d74b1442be16":{"type":"PAR 2.0\u0000FileDesc","md5":"b0455e6f6d54c2789a9e80f7d33e24cf","len":132},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"4afc8cfaccb54ea53674eb7130b8d975","len":262212},"desc148b12e16fb3ba135f591cbad669b476":{"type":"PAR 2.0\u0000FileDesc","md5":"102ee7c2b1dd5c778b422ca005f70768","len":132},"desc8d1dfea8582c9e7fe47516f0d4b7ccc6":{"type":"PAR 
2.0\u0000FileDesc","md5":"4de6d7fd893ecbaea7974ff5d7fdd92c","len":136},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000IFSC","md5":"7b643fb29da65e845878166ef9c16481","len":5200},"ifsc76d7f11d6b1621aaecd3d74b1442be16":{"type":"PAR 2.0\u0000IFSC","md5":"1a68f1cf8dbb9bc94e967868cf528f7f","len":100},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"3559df428b57c8df3f10d97519ca272e","len":262212},"ifsc148b12e16fb3ba135f591cbad669b476":{"type":"PAR 2.0\u0000IFSC","md5":"a49dd85cdc26defa02f1c21662cabc3b","len":1120},"ifsc8d1dfea8582c9e7fe47516f0d4b7ccc6":{"type":"PAR 2.0\u0000IFSC","md5":"1212c757606d123ca2e279fb303034cf","len":176080},"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"ad1f197f813b2a966ab39e324249b936","len":262212},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"43216e38f48a7d4a2f3041d6cb4edc3c","len":262212},"recovery5":{"type":"PAR 2.0\u0000RecvSlic","md5":"28d7d71e06c5fb7b049a4d21397bd017","len":262212},"recovery6":{"type":"PAR 2.0\u0000RecvSlic","md5":"421e1eb75721816b7017047ee0a5df55","len":262212},"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"9d855f5994aac01a34e38281f20c0e85","len":262212},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"a4090de0a584011e0f2edf591989cca5","len":262212},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"aeb7833d06dfded9b2c799bb7b705c0a","len":262212},"creator":{"type":"PAR 2.0\u0000Creator","md5":"c2132720a90fa23abd935f59c5ec96ab","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"cbff33656e8183081812de0c17dadf53","len":140},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"8eab9660d91a0b7d093c50a66b9355f9","len":132},"desc76d7f11d6b1621aaecd3d74b1442be16":{"type":"PAR 2.0\u0000FileDesc","md5":"b0455e6f6d54c2789a9e80f7d33e24cf","len":132},"desc148b12e16fb3ba135f591cbad669b476":{"type":"PAR 2.0\u0000FileDesc","md5":"102ee7c2b1dd5c778b422ca005f70768","len":132},"desc8d1dfea8582c9e7fe47516f0d4b7ccc6":{"type":"PAR 
2.0\u0000FileDesc","md5":"4de6d7fd893ecbaea7974ff5d7fdd92c","len":136},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000IFSC","md5":"7b643fb29da65e845878166ef9c16481","len":5200},"ifsc76d7f11d6b1621aaecd3d74b1442be16":{"type":"PAR 2.0\u0000IFSC","md5":"1a68f1cf8dbb9bc94e967868cf528f7f","len":100},"ifsc148b12e16fb3ba135f591cbad669b476":{"type":"PAR 2.0\u0000IFSC","md5":"a49dd85cdc26defa02f1c21662cabc3b","len":1120},"ifsc8d1dfea8582c9e7fe47516f0d4b7ccc6":{"type":"PAR 2.0\u0000IFSC","md5":"1212c757606d123ca2e279fb303034cf","len":176080},"creator":{"type":"PAR 2.0\u0000Creator","md5":"c2132720a90fa23abd935f59c5ec96ab","len":104}}]} \ No newline at end of file diff --git a/test/par-compare.js b/test/par-compare.js index 79c2ab72..e3b2de35 100644 --- a/test/par-compare.js +++ b/test/par-compare.js @@ -194,7 +194,7 @@ function compare_files(file1, file2) { //console.log('Packet mismatch for ' + k, file1[k], file2[k]); var err = new Error('Packet mismatch for ' + k); //err.pkts = [file1[k], file2[k]]; - console.log("Packet dump:", file1[k], file2[k]); + console.log("Packet dump (expected/actual):", file1[k], file2[k]); throw err; } } @@ -314,8 +314,6 @@ function writeRndFile(name, size) { } writeRndFile('test64m.bin', 64*1048576); writeRndFile('test2200m.bin', 2200*1048576); -if(!fastTest) - writeRndFile('test4100m.bin', 4100*1048576); // >4GB to test 32-bit overflows // we don't test 0 byte files - different implementations seem to treat it differently: // - par2cmdline: skips all 0 byte files @@ -328,6 +326,8 @@ fs.writeFileSync(tmpDir + 'test8b.bin', '01234567'); writeRndFile('test65k.bin', 65521); writeRndFile('test13m.bin', 13631477); +if(!fastTest) // ensure this is last to make input files consistent between fast/slow tests + writeRndFile('test4100m.bin', 4100*1048576); // >4GB to test 32-bit overflows var cachedResults = {}; var setCacheKeys = {}; @@ -493,8 +493,7 @@ var allTests = [ blocks: 2, singleFile: true, cacheKey: '18' - }, - + } ]; 
if(!fastTest) { allTests.push( @@ -546,7 +545,7 @@ if(!fastTest) { blocks: 2, singleFile: true, cacheKey: '19' - }, + } ); if(is64bPlatform) { allTests.push({ // recovery > 4GB in memory [https://github.com/animetosho/par2cmdline-turbo/issues/7] From 403737ced71140e123ca1ccc71f62ae08b477284 Mon Sep 17 00:00:00 2001 From: animetosho Date: Wed, 30 Aug 2023 20:48:00 +1000 Subject: [PATCH 91/91] Test workflow fixes --- .github/workflows/test-full.yml | 2 +- .github/workflows/test.yml | 3 ++- test/par-compare.js | 8 ++++++-- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test-full.yml b/.github/workflows/test-full.yml index 562917d9..1a47a8f4 100644 --- a/.github/workflows/test-full.yml +++ b/.github/workflows/test-full.yml @@ -9,7 +9,7 @@ jobs: fail-fast: false matrix: include: - - version: '0.10.40' + - version: '0.10.48' flags: '' python2: true - version: '4.9.1' diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 2e1c3534..548ea67d 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -15,7 +15,8 @@ jobs: steps: - uses: ilammy/setup-nasm@v1 - uses: petarpetrovt/setup-sde@v2.1 - sdeVersion: 8.69.1 + with: + sdeVersion: 8.69.1 - uses: actions/checkout@v3 - run: | mkdir test\gf16\build diff --git a/test/par-compare.js b/test/par-compare.js index e3b2de35..0f149562 100644 --- a/test/par-compare.js +++ b/test/par-compare.js @@ -299,13 +299,17 @@ console.log('Creating random input files...'); function writeRndFile(name, size) { if(skipFileCreate && fs.existsSync(tmpDir + name)) return; var fd = fs.openSync(tmpDir + name, 'w'); - var rand = require('crypto').createCipheriv('rc4', 'my_incredibly_strong_password' + name, ''); + var rand = crypto.createCipheriv('rc4', 'my_incredibly_strong_password' + name, ''); rand.setAutoPadding(false); var nullBuf = allocBuffer(1024*16); nullBuf.fill(0); var written = 0; while(written < size) { - var b = bufferSlice.call(rand.update(nullBuf), 0, 
Math.min(1024*16, size-written)); + var b = rand.update(nullBuf); + if(b.subarray) + b = bufferSlice.call(b, 0, Math.min(1024*16, size-written)); + else // on Node v0.10.x, rand is a SlowBuffer, so calling Buffer.slice on it won't work + b = b.slice(0, Math.min(1024*16, size-written)); fsWriteSync(fd, b); written += b.length; }