From c356a663e068a66ecf451cab6fa4e1b1156a3ec8 Mon Sep 17 00:00:00 2001 From: Chris <34682781+monkins1010@users.noreply.github.com> Date: Sat, 8 Dec 2018 08:19:40 +0000 Subject: [PATCH] included @pallas1 optimistions --- verus/haraka.h | 10 +- verus/verus.cu | 540 ++++++++++++++++++++++++++++++++------------ verus/verusscan.cpp | 60 ++--- verus/verusscan.cu | 50 ++-- 4 files changed, 469 insertions(+), 191 deletions(-) diff --git a/verus/haraka.h b/verus/haraka.h index 739964a220..d47308ab7b 100644 --- a/verus/haraka.h +++ b/verus/haraka.h @@ -1,4 +1,4 @@ -/* +/* Plain C implementation of the Haraka256 and Haraka512 permutations. */ @@ -32,4 +32,10 @@ void haraka512_port_zero(unsigned char *out, const unsigned char *in); void haraka256_port(unsigned char *out, const unsigned char *in); /* Implementation of Haraka-256 using sk.seed constants */ -void haraka256_sk(unsigned char *out, const unsigned char *in); \ No newline at end of file +void haraka256_sk(unsigned char *out, const unsigned char *in); + +void aesenc(unsigned char *s, const unsigned char *rk); + +void unpacklo32(unsigned char *t, unsigned char *a, unsigned char *b); + +void unpackhi32(unsigned char *t, unsigned char *a, unsigned char *b); diff --git a/verus/verus.cu b/verus/verus.cu index 6cc6591713..bd9e5e653e 100644 --- a/verus/verus.cu +++ b/verus/verus.cu @@ -1,39 +1,36 @@ - #include - #include -__device__ uint32_t sbox[64] = -{ 0x7b777c63, 0xc56f6bf2, 0x2b670130, 0x76abd7fe, 0x7dc982ca, 0xf04759fa, 0xafa2d4ad, 0xc072a49c, 0x2693fdb7, 0xccf73f36, 0xf1e5a534, 0x1531d871, 0xc323c704, 0x9a059618, 0xe2801207, 0x75b227eb, 0x1a2c8309, 0xa05a6e1b, 0xb3d63b52, 0x842fe329, 0xed00d153, 0x5bb1fc20, 0x39becb6a, 0xcf584c4a, 0xfbaaefd0, 0x85334d43, 0x7f02f945, 0xa89f3c50, 0x8f40a351, 0xf5389d92, 0x21dab6bc, 0xd2f3ff10, 0xec130ccd, 0x1744975f, 0x3d7ea7c4, 0x73195d64, 0xdc4f8160, 0x88902a22, 0x14b8ee46, 0xdb0b5ede, 0x0a3a32e0, 0x5c240649, 0x62acd3c2, 0x79e49591, 0x6d37c8e7, 0xa94ed58d, 0xeaf4566c, 0x08ae7a65, 0x2e2578ba, 0xc6b4a61c, 0x1f74dde8, 0x8a8bbd4b, 0x66b53e70, 0x0ef60348, 0xb9573561, 0x9e1dc186, 0x1198f8e1, 0x948ed969, 0xe9871e9b, 0xdf2855ce, 0x0d89a18c, 0x6842e6bf, 0x0f2d9941, 0x16bb54b0 }; -#define XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b)) +__device__ const uint32_t sbox[] = { + 0x7b777c63, 0xc56f6bf2, 0x2b670130, 0x76abd7fe, 0x7dc982ca, 0xf04759fa, 0xafa2d4ad, 0xc072a49c, 0x2693fdb7, 0xccf73f36, 0xf1e5a534, 0x1531d871, 0xc323c704, 0x9a059618, 0xe2801207, 0x75b227eb, 0x1a2c8309, 0xa05a6e1b, 0xb3d63b52, 0x842fe329, 0xed00d153, 0x5bb1fc20, 0x39becb6a, 0xcf584c4a, 0xfbaaefd0, 0x85334d43, 0x7f02f945, 0xa89f3c50, 0x8f40a351, 0xf5389d92, 0x21dab6bc, 0xd2f3ff10, 0xec130ccd, 0x1744975f, 0x3d7ea7c4, 0x73195d64, 0xdc4f8160, 0x88902a22, 0x14b8ee46, 0xdb0b5ede, 0x0a3a32e0, 0x5c240649, 0x62acd3c2, 0x79e49591, 0x6d37c8e7, 0xa94ed58d, 0xeaf4566c, 0x08ae7a65, 0x2e2578ba, 0xc6b4a61c, 0x1f74dde8, 0x8a8bbd4b, 0x66b53e70, 0x0ef60348, 0xb9573561, 0x9e1dc186, 0x1198f8e1, 0x948ed969, 0xe9871e9b, 0xdf2855ce, 0x0d89a18c, 0x6842e6bf, 0x0f2d9941, 0x16bb54b0, + 0x7b777c63, 0xc56f6bf2, 0x2b670130, 0x76abd7fe, 0x7dc982ca, 0xf04759fa, 0xafa2d4ad, 0xc072a49c, 0x2693fdb7, 0xccf73f36, 0xf1e5a534, 0x1531d871, 0xc323c704, 0x9a059618, 0xe2801207, 0x75b227eb, 0x1a2c8309, 0xa05a6e1b, 0xb3d63b52, 0x842fe329, 0xed00d153, 0x5bb1fc20, 0x39becb6a, 0xcf584c4a, 0xfbaaefd0, 0x85334d43, 0x7f02f945, 0xa89f3c50, 0x8f40a351, 0xf5389d92, 0x21dab6bc, 0xd2f3ff10, 0xec130ccd, 0x1744975f, 0x3d7ea7c4, 0x73195d64, 0xdc4f8160, 0x88902a22, 0x14b8ee46, 0xdb0b5ede, 0x0a3a32e0, 0x5c240649, 0x62acd3c2, 0x79e49591, 0x6d37c8e7, 0xa94ed58d, 0xeaf4566c, 0x08ae7a65, 0x2e2578ba, 0xc6b4a61c, 0x1f74dde8, 0x8a8bbd4b, 0x66b53e70, 0x0ef60348, 0xb9573561, 0x9e1dc186, 0x1198f8e1, 0x948ed969, 0xe9871e9b, 0xdf2855ce, 0x0d89a18c, 0x6842e6bf, 0x0f2d9941, 0x16bb54b0, + 0x7b777c63, 0xc56f6bf2, 0x2b670130, 0x76abd7fe, 0x7dc982ca, 0xf04759fa, 0xafa2d4ad, 0xc072a49c, 0x2693fdb7, 0xccf73f36, 0xf1e5a534, 0x1531d871, 0xc323c704, 0x9a059618, 0xe2801207, 0x75b227eb, 0x1a2c8309, 0xa05a6e1b, 0xb3d63b52, 0x842fe329, 0xed00d153, 0x5bb1fc20, 0x39becb6a, 0xcf584c4a, 0xfbaaefd0, 0x85334d43, 0x7f02f945, 0xa89f3c50, 0x8f40a351, 0xf5389d92, 0x21dab6bc, 0xd2f3ff10, 0xec130ccd, 0x1744975f, 0x3d7ea7c4, 0x73195d64, 0xdc4f8160, 0x88902a22, 0x14b8ee46, 0xdb0b5ede, 0x0a3a32e0, 0x5c240649, 0x62acd3c2, 0x79e49591, 0x6d37c8e7, 0xa94ed58d, 0xeaf4566c, 0x08ae7a65, 0x2e2578ba, 0xc6b4a61c, 0x1f74dde8, 0x8a8bbd4b, 0x66b53e70, 0x0ef60348, 0xb9573561, 0x9e1dc186, 0x1198f8e1, 0x948ed969, 0xe9871e9b, 0xdf2855ce, 0x0d89a18c, 0x6842e6bf, 0x0f2d9941, 0x16bb54b0, + 0x7b777c63, 0xc56f6bf2, 0x2b670130, 0x76abd7fe, 0x7dc982ca, 0xf04759fa, 0xafa2d4ad, 0xc072a49c, 0x2693fdb7, 0xccf73f36, 0xf1e5a534, 0x1531d871, 0xc323c704, 0x9a059618, 0xe2801207, 0x75b227eb, 0x1a2c8309, 0xa05a6e1b, 0xb3d63b52, 0x842fe329, 0xed00d153, 0x5bb1fc20, 0x39becb6a, 0xcf584c4a, 0xfbaaefd0, 0x85334d43, 0x7f02f945, 0xa89f3c50, 0x8f40a351, 0xf5389d92, 0x21dab6bc, 0xd2f3ff10, 0xec130ccd, 0x1744975f, 0x3d7ea7c4, 0x73195d64, 0xdc4f8160, 0x88902a22, 0x14b8ee46, 0xdb0b5ede, 0x0a3a32e0, 0x5c240649, 0x62acd3c2, 0x79e49591, 0x6d37c8e7, 0xa94ed58d, 0xeaf4566c, 0x08ae7a65, 0x2e2578ba, 0xc6b4a61c, 0x1f74dde8, 0x8a8bbd4b, 0x66b53e70, 0x0ef60348, 0xb9573561, 0x9e1dc186, 0x1198f8e1, 0x948ed969, 0xe9871e9b, 0xdf2855ce, 0x0d89a18c, 0x6842e6bf, 0x0f2d9941, 0x16bb54b0 +}; +//#define XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b)) +#define XT(x) (((x) << 1) ^ (((x) >> 7) ? 0x1b : 0)) __global__ void verus_gpu_hash(uint32_t threads, uint32_t startNonce, uint32_t *resNonce); -__device__ void haraka512_perm(unsigned char *out, unsigned char *in); static uint32_t *d_nonces[MAX_GPUS]; -__device__ __constant__ uint8_t blockhash_half[128]; -__device__ __constant__ uint32_t ptarget[8]; - -__device__ void memcpy_decker(unsigned char *dst, unsigned char *src, int len) { - int i; - for (i = 0; i< len; i++) { dst[i] = src[i]; } -} +__device__ __constant__ uint8_t blockhash_half[32]; +__device__ __constant__ uint32_t ptarget; __host__ void verus_init(int thr_id) { -CUDA_SAFE_CALL(cudaMalloc(&d_nonces[thr_id], 2 * sizeof(uint32_t))); -}; + CUDA_SAFE_CALL(cudaMalloc(&d_nonces[thr_id], 2 * sizeof(uint32_t))); +} void verus_setBlock(uint8_t *blockf, uint32_t *pTargetIn) { -CUDA_SAFE_CALL(cudaMemcpyToSymbol(ptarget, (void**)pTargetIn, 8 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); -CUDA_SAFE_CALL(cudaMemcpyToSymbol(blockhash_half, (void**)blockf, 64 * sizeof(uint8_t), 0, cudaMemcpyHostToDevice)); -}; + CUDA_SAFE_CALL(cudaMemcpyToSymbol(ptarget, (void**)&pTargetIn[7], sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(blockhash_half, (void**)blockf, 32, 0, cudaMemcpyHostToDevice)); +} + __host__ void verus_hash(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces) { cudaMemset(d_nonces[thr_id], 0xff, 2 * sizeof(uint32_t)); const uint32_t threadsperblock = 256; - dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); @@ -42,155 +39,414 @@ void verus_hash(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *res cudaMemcpy(resNonces, d_nonces[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost); //memcpy(resNonces, h_nonces[thr_id], NBN * sizeof(uint32_t)); -}; +} +// Simulate _mm_aesenc_si128 instructions from AESNI +__device__ __forceinline__ void aesenc(unsigned char *s, uint32_t *sharedMemory1) +{ + uint32_t t, u; + uint32_t v[4][4]; -//__constant__ static const + v[0][0] = ((uint8_t*)&sharedMemory1[0])[s[ 0]]; + v[3][1] = ((uint8_t*)&sharedMemory1[0])[s[ 1]]; + v[2][2] = ((uint8_t*)&sharedMemory1[0])[s[ 2]]; + v[1][3] = ((uint8_t*)&sharedMemory1[0])[s[ 3]]; + v[1][0] = ((uint8_t*)&sharedMemory1[0])[s[ 4]]; + v[0][1] = ((uint8_t*)&sharedMemory1[0])[s[ 5]]; + v[3][2] = ((uint8_t*)&sharedMemory1[0])[s[ 6]]; + v[2][3] = ((uint8_t*)&sharedMemory1[0])[s[ 7]]; + v[2][0] = ((uint8_t*)&sharedMemory1[0])[s[ 8]]; + v[1][1] = ((uint8_t*)&sharedMemory1[0])[s[ 9]]; + v[0][2] = ((uint8_t*)&sharedMemory1[0])[s[10]]; + v[3][3] = ((uint8_t*)&sharedMemory1[0])[s[11]]; + v[3][0] = ((uint8_t*)&sharedMemory1[0])[s[12]]; + v[2][1] = ((uint8_t*)&sharedMemory1[0])[s[13]]; + v[1][2] = ((uint8_t*)&sharedMemory1[0])[s[14]]; + v[0][3] = ((uint8_t*)&sharedMemory1[0])[s[15]]; -// Simulate _mm_aesenc_si128 instructions from AESNI -__device__ void aesenc(unsigned char *s, volatile uint32_t *sharedMemory1) + t = v[0][0]; + u = v[0][0] ^ v[0][1] ^ v[0][2] ^ v[0][3]; + v[0][0] = v[0][0] ^ u ^ XT(v[0][0] ^ v[0][1]); + v[0][1] = v[0][1] ^ u ^ XT(v[0][1] ^ v[0][2]); + v[0][2] = v[0][2] ^ u ^ XT(v[0][2] ^ v[0][3]); + v[0][3] = v[0][3] ^ u ^ XT(v[0][3] ^ t); + t = v[1][0]; + u = v[1][0] ^ v[1][1] ^ v[1][2] ^ v[1][3]; + v[1][0] = v[1][0] ^ u ^ XT(v[1][0] ^ v[1][1]); + v[1][1] = v[1][1] ^ u ^ XT(v[1][1] ^ v[1][2]); + v[1][2] = v[1][2] ^ u ^ XT(v[1][2] ^ v[1][3]); + v[1][3] = v[1][3] ^ u ^ XT(v[1][3] ^ t); + t = v[2][0]; + u = v[2][0] ^ v[2][1] ^ v[2][2] ^ v[2][3]; + v[2][0] = v[2][0] ^ u ^ XT(v[2][0] ^ v[2][1]); + v[2][1] = v[2][1] ^ u ^ XT(v[2][1] ^ v[2][2]); + v[2][2] = v[2][2] ^ u ^ XT(v[2][2] ^ v[2][3]); + v[2][3] = v[2][3] ^ u ^ XT(v[2][3] ^ t); + t = v[3][0]; + u = v[3][0] ^ v[3][1] ^ v[3][2] ^ v[3][3]; + v[3][0] = v[3][0] ^ u ^ XT(v[3][0] ^ v[3][1]); + v[3][1] = v[3][1] ^ u ^ XT(v[3][1] ^ v[3][2]); + v[3][2] = v[3][2] ^ u ^ XT(v[3][2] ^ v[3][3]); + v[3][3] = v[3][3] ^ u ^ XT(v[3][3] ^ t); + + s[0] = v[0][0]; + s[1] = v[0][1]; + s[2] = v[0][2]; + s[3] = v[0][3]; + s[4] = v[1][0]; + s[5] = v[1][1]; + s[6] = v[1][2]; + s[7] = v[1][3]; + s[8] = v[2][0]; + s[9] = v[2][1]; + s[10] = v[2][2]; + s[11] = v[2][3]; + s[12] = v[3][0]; + s[13] = v[3][1]; + s[14] = v[3][2]; + s[15] = v[3][3]; +} + +__device__ __forceinline__ void aesenc_n1(unsigned char *s, uint32_t *sharedMemory1) { uint32_t t, u; - register uint32_t v[4][4]; - -v[0][0] = ((uint8_t*)&sharedMemory1[0])[s[0]]; -v[3][1] = ((uint8_t*)&sharedMemory1[0])[s[1]]; -v[2][2] = ((uint8_t*)&sharedMemory1[0])[s[2]]; -v[1][3] = ((uint8_t*)&sharedMemory1[0])[s[3]]; -v[1][0] = ((uint8_t*)&sharedMemory1[0])[s[4]]; -v[0][1] = ((uint8_t*)&sharedMemory1[0])[s[5]]; -v[3][2] = ((uint8_t*)&sharedMemory1[0])[s[6]]; -v[2][3] = ((uint8_t*)&sharedMemory1[0])[s[7]]; -v[2][0] = ((uint8_t*)&sharedMemory1[0])[s[8]]; -v[1][1] = ((uint8_t*)&sharedMemory1[0])[s[9]]; -v[0][2] = ((uint8_t*)&sharedMemory1[0])[s[10]]; -v[3][3] = ((uint8_t*)&sharedMemory1[0])[s[11]]; -v[3][0] = ((uint8_t*)&sharedMemory1[0])[s[12]]; -v[2][1] = ((uint8_t*)&sharedMemory1[0])[s[13]]; -v[1][2] = ((uint8_t*)&sharedMemory1[0])[s[14]]; -v[0][3] = ((uint8_t*)&sharedMemory1[0])[s[15]]; - -t = v[0][0]; -u = v[0][0] ^ v[0][1] ^ v[0][2] ^ v[0][3]; -v[0][0] = v[0][0] ^ u ^ XT(v[0][0] ^ v[0][1]); -v[0][1] = v[0][1] ^ u ^ XT(v[0][1] ^ v[0][2]); -v[0][2] = v[0][2] ^ u ^ XT(v[0][2] ^ v[0][3]); -v[0][3] = v[0][3] ^ u ^ XT(v[0][3] ^ t); -t = v[1][0]; -u = v[1][0] ^ v[1][1] ^ v[1][2] ^ v[1][3]; -v[1][0] = v[1][0] ^ u ^ XT(v[1][0] ^ v[1][1]); -v[1][1] = v[1][1] ^ u ^ XT(v[1][1] ^ v[1][2]); -v[1][2] = v[1][2] ^ u ^ XT(v[1][2] ^ v[1][3]); -v[1][3] = v[1][3] ^ u ^ XT(v[1][3] ^ t); -t = v[2][0]; -u = v[2][0] ^ v[2][1] ^ v[2][2] ^ v[2][3]; -v[2][0] = v[2][0] ^ u ^ XT(v[2][0] ^ v[2][1]); -v[2][1] = v[2][1] ^ u ^ XT(v[2][1] ^ v[2][2]); -v[2][2] = v[2][2] ^ u ^ XT(v[2][2] ^ v[2][3]); -v[2][3] = v[2][3] ^ u ^ XT(v[2][3] ^ t); -t = v[3][0]; -u = v[3][0] ^ v[3][1] ^ v[3][2] ^ v[3][3]; -v[3][0] = v[3][0] ^ u ^ XT(v[3][0] ^ v[3][1]); -v[3][1] = v[3][1] ^ u ^ XT(v[3][1] ^ v[3][2]); -v[3][2] = v[3][2] ^ u ^ XT(v[3][2] ^ v[3][3]); -v[3][3] = v[3][3] ^ u ^ XT(v[3][3] ^ t); - - s[0] = v[0][0]; -s[1] = v[0][1]; -s[2] = v[0][2]; -s[3] = v[0][3]; -s[4] = v[1][0]; -s[5] = v[1][1]; -s[6] = v[1][2]; -s[7] = v[1][3]; -s[8] = v[2][0]; -s[9] = v[2][1]; -s[10] = v[2][2]; -s[11] = v[2][3]; -s[12] = v[3][0]; -s[13] = v[3][1]; -s[14] = v[3][2]; -s[15] = v[3][3]; + uint32_t v[4][4]; + + v[0][0] = ((uint8_t*)&sharedMemory1[0])[s[ 0]]; + v[3][1] = ((uint8_t*)&sharedMemory1[0])[s[ 1]]; + v[2][2] = ((uint8_t*)&sharedMemory1[0])[s[ 2]]; + v[1][3] = ((uint8_t*)&sharedMemory1[0])[s[ 3]]; + v[1][0] = 0x63; + v[0][1] = 0x63; + v[3][2] = 0x63; + v[2][3] = 0x63; + v[2][0] = 0x63; + v[1][1] = 0x63; + v[0][2] = 0x63; + v[3][3] = 0x63; + v[3][0] = 0x63; + v[2][1] = 0x63; + v[1][2] = 0x63; + v[0][3] = 0x63; + t = v[0][0]; + u = v[0][0] ^ v[0][1] ^ v[0][2] ^ v[0][3]; + v[0][0] = v[0][0] ^ u ^ XT(v[0][0] ^ v[0][1]); + v[0][1] = v[0][1] ^ u ^ XT(v[0][1] ^ v[0][2]); + v[0][2] = v[0][2] ^ u ^ XT(v[0][2] ^ v[0][3]); + v[0][3] = v[0][3] ^ u ^ XT(v[0][3] ^ t); + t = v[1][0]; + u = v[1][0] ^ v[1][1] ^ v[1][2] ^ v[1][3]; + v[1][0] = v[1][0] ^ u ^ XT(v[1][0] ^ v[1][1]); + v[1][1] = v[1][1] ^ u ^ XT(v[1][1] ^ v[1][2]); + v[1][2] = v[1][2] ^ u ^ XT(v[1][2] ^ v[1][3]); + v[1][3] = v[1][3] ^ u ^ XT(v[1][3] ^ t); + t = v[2][0]; + u = v[2][0] ^ v[2][1] ^ v[2][2] ^ v[2][3]; + v[2][0] = v[2][0] ^ u ^ XT(v[2][0] ^ v[2][1]); + v[2][1] = v[2][1] ^ u ^ XT(v[2][1] ^ v[2][2]); + v[2][2] = v[2][2] ^ u ^ XT(v[2][2] ^ v[2][3]); + v[2][3] = v[2][3] ^ u ^ XT(v[2][3] ^ t); + t = v[3][0]; + u = v[3][0] ^ v[3][1] ^ v[3][2] ^ v[3][3]; + v[3][0] = v[3][0] ^ u ^ XT(v[3][0] ^ v[3][1]); + v[3][1] = v[3][1] ^ u ^ XT(v[3][1] ^ v[3][2]); + v[3][2] = v[3][2] ^ u ^ XT(v[3][2] ^ v[3][3]); + v[3][3] = v[3][3] ^ u ^ XT(v[3][3] ^ t); + + s[0] = v[0][0]; + s[1] = v[0][1]; + s[2] = v[0][2]; + s[3] = v[0][3]; + s[4] = v[1][0]; + s[5] = v[1][1]; + s[6] = v[1][2]; + s[7] = v[1][3]; + s[8] = v[2][0]; + s[9] = v[2][1]; + s[10] = v[2][2]; + s[11] = v[2][3]; + s[12] = v[3][0]; + s[13] = v[3][1]; + s[14] = v[3][2]; + s[15] = v[3][3]; } +__device__ __forceinline__ void aesenc_s1(unsigned char *s, uint32_t *sharedMemory1) +{ + uint32_t t, u; + uint32_t v[4][4]; + + v[0][0] = ((uint8_t*)&sharedMemory1[0])[s[ 0]]; + v[3][1] = ((uint8_t*)&sharedMemory1[0])[s[ 1]]; + v[2][2] = ((uint8_t*)&sharedMemory1[0])[s[ 2]]; + v[1][3] = ((uint8_t*)&sharedMemory1[0])[s[ 3]]; + v[1][0] = ((uint8_t*)&sharedMemory1[0])[s[ 4]]; + v[0][1] = ((uint8_t*)&sharedMemory1[0])[s[ 5]]; + v[3][2] = ((uint8_t*)&sharedMemory1[0])[s[ 6]]; + v[2][3] = ((uint8_t*)&sharedMemory1[0])[s[ 7]]; + v[2][0] = ((uint8_t*)&sharedMemory1[0])[s[ 8]]; + v[1][1] = ((uint8_t*)&sharedMemory1[0])[s[ 9]]; + v[0][2] = ((uint8_t*)&sharedMemory1[0])[s[10]]; + v[3][3] = ((uint8_t*)&sharedMemory1[0])[s[11]]; + v[3][0] = 0x0f; + v[2][1] = 0x0f; + v[1][2] = 0x0f; + v[0][3] = 0x0f; + + t = v[0][0]; + u = v[0][0] ^ v[0][1] ^ v[0][2] ^ v[0][3]; + v[0][0] = v[0][0] ^ u ^ XT(v[0][0] ^ v[0][1]); + v[0][1] = v[0][1] ^ u ^ XT(v[0][1] ^ v[0][2]); + v[0][2] = v[0][2] ^ u ^ XT(v[0][2] ^ v[0][3]); + v[0][3] = v[0][3] ^ u ^ XT(v[0][3] ^ t); + t = v[1][0]; + u = v[1][0] ^ v[1][1] ^ v[1][2] ^ v[1][3]; + v[1][0] = v[1][0] ^ u ^ XT(v[1][0] ^ v[1][1]); + v[1][1] = v[1][1] ^ u ^ XT(v[1][1] ^ v[1][2]); + v[1][2] = v[1][2] ^ u ^ XT(v[1][2] ^ v[1][3]); + v[1][3] = v[1][3] ^ u ^ XT(v[1][3] ^ t); + t = v[2][0]; + u = v[2][0] ^ v[2][1] ^ v[2][2] ^ v[2][3]; + v[2][0] = v[2][0] ^ u ^ XT(v[2][0] ^ v[2][1]); + v[2][1] = v[2][1] ^ u ^ XT(v[2][1] ^ v[2][2]); + v[2][2] = v[2][2] ^ u ^ XT(v[2][2] ^ v[2][3]); + v[2][3] = v[2][3] ^ u ^ XT(v[2][3] ^ t); + t = v[3][0]; + u = v[3][0] ^ v[3][1] ^ v[3][2] ^ v[3][3]; + v[3][0] = v[3][0] ^ u ^ XT(v[3][0] ^ v[3][1]); + v[3][1] = v[3][1] ^ u ^ XT(v[3][1] ^ v[3][2]); + v[3][2] = v[3][2] ^ u ^ XT(v[3][2] ^ v[3][3]); + v[3][3] = v[3][3] ^ u ^ XT(v[3][3] ^ t); + + s[0] = v[0][0]; + s[1] = v[0][1]; + s[2] = v[0][2]; + s[3] = v[0][3]; + s[4] = v[1][0]; + s[5] = v[1][1]; + s[6] = v[1][2]; + s[7] = v[1][3]; + s[8] = v[2][0]; + s[9] = v[2][1]; + s[10] = v[2][2]; + s[11] = v[2][3]; + s[12] = v[3][0]; + s[13] = v[3][1]; + s[14] = v[3][2]; + s[15] = v[3][3]; +} + + +__device__ __forceinline__ void aesenc_s2(unsigned char *s, uint32_t *sharedMemory1) +{ + uint32_t t, u; + uint32_t v[4][4]; + + v[0][0] = ((uint8_t*)&sharedMemory1[0])[s[ 0]]; + v[3][1] = ((uint8_t*)&sharedMemory1[0])[s[ 1]]; + v[2][2] = ((uint8_t*)&sharedMemory1[0])[s[ 2]]; + v[1][3] = ((uint8_t*)&sharedMemory1[0])[s[ 3]]; + v[1][0] = ((uint8_t*)&sharedMemory1[0])[s[ 4]]; + v[0][1] = ((uint8_t*)&sharedMemory1[0])[s[ 5]]; + v[3][2] = ((uint8_t*)&sharedMemory1[0])[s[ 6]]; + v[2][3] = ((uint8_t*)&sharedMemory1[0])[s[ 7]]; + v[2][0] = 0x0f; + v[1][1] = 0x0f; + v[0][2] = 0x0f; + v[3][3] = 0x0f; + v[3][0] = ((uint8_t*)&sharedMemory1[0])[s[12]]; + v[2][1] = ((uint8_t*)&sharedMemory1[0])[s[13]]; + v[1][2] = ((uint8_t*)&sharedMemory1[0])[s[14]]; + v[0][3] = ((uint8_t*)&sharedMemory1[0])[s[15]]; + + t = v[0][0]; + u = v[0][0] ^ v[0][1] ^ v[0][2] ^ v[0][3]; + v[0][0] = v[0][0] ^ u ^ XT(v[0][0] ^ v[0][1]); + v[0][1] = v[0][1] ^ u ^ XT(v[0][1] ^ v[0][2]); + v[0][2] = v[0][2] ^ u ^ XT(v[0][2] ^ v[0][3]); + v[0][3] = v[0][3] ^ u ^ XT(v[0][3] ^ t); + t = v[1][0]; + u = v[1][0] ^ v[1][1] ^ v[1][2] ^ v[1][3]; + v[1][0] = v[1][0] ^ u ^ XT(v[1][0] ^ v[1][1]); + v[1][1] = v[1][1] ^ u ^ XT(v[1][1] ^ v[1][2]); + v[1][2] = v[1][2] ^ u ^ XT(v[1][2] ^ v[1][3]); + v[1][3] = v[1][3] ^ u ^ XT(v[1][3] ^ t); + t = v[2][0]; + u = v[2][0] ^ v[2][1] ^ v[2][2] ^ v[2][3]; + v[2][0] = v[2][0] ^ u ^ XT(v[2][0] ^ v[2][1]); + v[2][1] = v[2][1] ^ u ^ XT(v[2][1] ^ v[2][2]); + v[2][2] = v[2][2] ^ u ^ XT(v[2][2] ^ v[2][3]); + v[2][3] = v[2][3] ^ u ^ XT(v[2][3] ^ t); + t = v[3][0]; + u = v[3][0] ^ v[3][1] ^ v[3][2] ^ v[3][3]; + v[3][0] = v[3][0] ^ u ^ XT(v[3][0] ^ v[3][1]); + v[3][1] = v[3][1] ^ u ^ XT(v[3][1] ^ v[3][2]); + v[3][2] = v[3][2] ^ u ^ XT(v[3][2] ^ v[3][3]); + v[3][3] = v[3][3] ^ u ^ XT(v[3][3] ^ t); + + s[0] = v[0][0]; + s[1] = v[0][1]; + s[2] = v[0][2]; + s[3] = v[0][3]; + s[4] = v[1][0]; + s[5] = v[1][1]; + s[6] = v[1][2]; + s[7] = v[1][3]; + s[8] = v[2][0]; + s[9] = v[2][1]; + s[10] = v[2][2]; + s[11] = v[2][3]; + s[12] = v[3][0]; + s[13] = v[3][1]; + s[14] = v[3][2]; + s[15] = v[3][3]; +} + + // Simulate _mm_unpacklo_epi32 -__device__ void unpacklo32(unsigned char *t, unsigned char *a, unsigned char *b) +__device__ __forceinline__ void unpacklo32(unsigned char *t, unsigned char *a, unsigned char *b) { - unsigned char tmp[16]; - memcpy_decker(tmp, a, 4); - memcpy_decker(tmp + 4, b, 4); - memcpy_decker(tmp + 8, a + 4, 4); - memcpy_decker(tmp + 12, b + 4, 4); - memcpy_decker(t, tmp, 16); + uint32_t* t32 = (uint32_t*)t; + uint32_t* a32 = (uint32_t*)a; + uint32_t* b32 = (uint32_t*)b; + t32[0] = a32[0]; + t32[2] = a32[1]; + t32[1] = b32[0]; + t32[3] = b32[1]; } // Simulate _mm_unpackhi_epi32 -__device__ void unpackhi32(unsigned char *t, unsigned char *a, unsigned char *b) +__device__ __forceinline__ void unpackhi32(unsigned char *t, unsigned char *a, unsigned char *b) { - unsigned char tmp[16]; - memcpy_decker(tmp, a + 8, 4); - memcpy_decker(tmp + 4, b + 8, 4); - memcpy_decker(tmp + 8, a + 12, 4); - memcpy_decker(tmp + 12, b + 12, 4); - memcpy_decker(t, tmp, 16); + uint32_t* t32 = (uint32_t*)t; + uint32_t* a32 = (uint32_t*)a; + uint32_t* b32 = (uint32_t*)b; + t32[0] = a32[2]; + t32[1] = b32[2]; + t32[2] = a32[3]; + t32[3] = b32[3]; +} +// Simulate _mm_unpacklo_epi32 +__device__ __forceinline__ void unpacklo32s(unsigned char *a, unsigned char *b) +{ + uint32_t* a32 = (uint32_t*)a; + uint32_t* b32 = (uint32_t*)b; + a32[2] = a32[1]; + a32[1] = b32[0]; + a32[3] = b32[1]; } -__global__ __launch_bounds__(256, 1) +__global__ __launch_bounds__(256, 2) void verus_gpu_hash(uint32_t threads, uint32_t startNonce, uint32_t *resNonce) { uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; - int i, j; + int i, j; unsigned char s[64], tmp[16]; - __shared__ volatile uint32_t sharedMemory1[64]; - if (threadIdx.x < 64) - sharedMemory1[threadIdx.x] = sbox[threadIdx.x];// for (i = 0; i < 64; ++i) - - - uint32_t nounce = startNonce + thread; - unsigned char in[64]; - - uint64_t blockhash[4]; - memcpy(s, blockhash_half, 32); - memset(s + 32, 0x0, 32); - ((uint32_t *)&s)[8] = nounce; - memcpy(in +48, s + 48, 8); - //memcpy_decker(s, in, 64); - #pragma unroll 5 - for (i = 0; i < 5; ++i) { - // aes round(s) - __syncthreads(); - for (j = 0; j < 2; ++j) { - - aesenc(s, sharedMemory1); - aesenc(s + 16, sharedMemory1); - aesenc(s + 32, sharedMemory1); - aesenc(s + 48, sharedMemory1); - } - unpacklo32(tmp, s, s + 16); - unpackhi32(s, s, s + 16); - unpacklo32(s + 16, s + 32, s + 48); - unpackhi32(s + 32, s + 32, s + 48); - unpacklo32(s + 48, s, s + 32); - unpackhi32(s, s, s + 32); - unpackhi32(s + 32, s + 16, tmp); - unpacklo32(s + 16, s + 16, tmp); + uint32_t nounce = startNonce + thread; - } - for (i = 48; i < 56; i++) { - s[i] = s[i] ^ in[i]; - } + __shared__ uint32_t sharedMemory1[256]; + //if (threadIdx.x < 64) + sharedMemory1[threadIdx.x] = __ldg(&sbox[threadIdx.x]); - - - + #pragma unroll + for (int i = 0; i < 16; i++) s[i] = blockhash_half[i]; + #pragma unroll + for (int i = 0; i < 16; i++) tmp[i] = blockhash_half[i + 16]; + ((uint32_t *)&s)[8] = nounce; + __syncthreads(); - if (((uint64_t*)&s[48])[0] < ((uint64_t*)&ptarget)[3]) { resNonce[0] = nounce; } - -}; + aesenc_n1(s + 32, sharedMemory1); + aesenc(s + 32, sharedMemory1); + + //unpacklo32(tmp, s, s + 16); + //unpackhi32(s, s, s + 16); + + //unpacklo32(s + 16, s + 32, s + 48); + ((uint32_t*)s)[ 4] = ((uint32_t*)s)[ 8]; + ((uint32_t*)s)[ 6] = ((uint32_t*)s)[ 9]; + + //unpackhi32(s + 32, s + 32, s + 48); + ((uint32_t*)s)[ 8] = ((uint32_t*)s)[10]; + ((uint32_t*)s)[10] = ((uint32_t*)s)[11]; + unpacklo32(s + 48, s, s + 32); + unpackhi32(s, s, s + 32); + unpackhi32(s + 32, s + 16, tmp); + unpacklo32s(s + 16, tmp); + aesenc_s1(s, sharedMemory1); + aesenc_s2(s + 16, sharedMemory1); + aesenc_s2(s + 32, sharedMemory1); + aesenc_s1(s + 48, sharedMemory1); + aesenc(s, sharedMemory1); + aesenc(s + 16, sharedMemory1); + aesenc(s + 32, sharedMemory1); + aesenc(s + 48, sharedMemory1); + unpacklo32(tmp, s, s + 16); + unpackhi32(s, s, s + 16); + unpacklo32(s + 16, s + 32, s + 48); + unpackhi32(s + 32, s + 32, s + 48); + unpacklo32(s + 48, s, s + 32); + unpackhi32(s, s, s + 32); + unpackhi32(s + 32, s + 16, tmp); + unpacklo32s(s + 16, tmp); + + #pragma nounroll + for (i = 2; i < 3; ++i) { + #pragma unroll + for (j = 0; j < 2; ++j) { + aesenc(s, sharedMemory1); + aesenc(s + 16, sharedMemory1); + aesenc(s + 32, sharedMemory1); + aesenc(s + 48, sharedMemory1); + } + unpacklo32(tmp, s, s + 16); + unpackhi32(s, s, s + 16); + unpacklo32(s + 16, s + 32, s + 48); + unpackhi32(s + 32, s + 32, s + 48); + unpacklo32(s + 48, s, s + 32); + unpackhi32(s, s, s + 32); + unpackhi32(s + 32, s + 16, tmp); + unpacklo32s(s + 16, tmp); + } + + aesenc(s, sharedMemory1); + aesenc(s + 16, sharedMemory1); + aesenc(s + 32, sharedMemory1); + aesenc(s + 48, sharedMemory1); + aesenc(s, sharedMemory1); + aesenc(s + 16, sharedMemory1); + aesenc(s + 32, sharedMemory1); + aesenc(s + 48, sharedMemory1); + unpacklo32(tmp, s, s + 16); + unpackhi32(s, s, s + 16); + unpacklo32(s + 16, s + 32, s + 48); + unpackhi32(s + 32, s + 32, s + 48); + unpacklo32(s + 48, s, s + 32); + unpackhi32(s, s, s + 32); + unpackhi32(s + 32, s + 16, tmp); + unpacklo32s(s + 16, tmp); + + aesenc(s, sharedMemory1); + aesenc(s + 16, sharedMemory1); + aesenc(s + 32, sharedMemory1); + aesenc(s + 48, sharedMemory1); + aesenc(s, sharedMemory1); + aesenc(s + 16, sharedMemory1); + aesenc(s + 32, sharedMemory1); + aesenc(s + 48, sharedMemory1); + unpackhi32(s, s, s + 16); + unpackhi32(s + 32, s + 32, s + 48); + unpacklo32(s + 48, s, s + 32); + + //if (((uint64_t*)&s[48])[0] < ((uint64_t*)&ptarget)[3]) resNonce[0] = nounce; + if (((uint32_t*)&s[52])[0] <= ptarget) { + uint32_t tmp = atomicExch(&resNonce[0], nounce); + if (tmp != UINT32_MAX) resNonce[1] = tmp; + } +} diff --git a/verus/verusscan.cpp b/verus/verusscan.cpp index 62e8df0011..24eb9853d1 100644 --- a/verus/verusscan.cpp +++ b/verus/verusscan.cpp @@ -43,6 +43,7 @@ extern void verus_init(int thr_id); #define htobe32(x) swab32(x) #endif + extern "C" void VerusHashHalf(void *result, const void *data, size_t len) { unsigned char buf[128]; @@ -85,6 +86,7 @@ extern "C" void VerusHashHalf(void *result, const void *data, size_t len) memcpy(result, bufPtr, 32); }; + static void cb_hashdone(int thr_id) { if (!valid_sols[thr_id]) valid_sols[thr_id] = -1; } @@ -94,6 +96,7 @@ static bool cb_cancel(int thr_id) { return work_restart[thr_id].restart; } + extern "C" int scanhash_verus(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done) { uint32_t _ALIGN(64) endiandata[35]; @@ -104,7 +107,7 @@ extern "C" int scanhash_verus(int thr_id, struct work *work, uint32_t max_nonce, struct timeval tv_start, tv_end, diff; double secs, solps; - uint8_t blockhash_half[64]; + uint8_t blockhash_half[64], blockhash_pre[64], tmp[16]; uint32_t nonce_buf = 0; unsigned char block_41970[] = { 0xfd, 0x40, 0x05 }; // solution @@ -136,15 +139,23 @@ extern "C" int scanhash_verus(int thr_id, struct work *work, uint32_t max_nonce, VerusHashHalf(blockhash_half, full_data, 1487); // full VerusHash without last iteration - gettimeofday(&tv_start, NULL); //get millisecond timer val for cal of h + work->valid_nonces = 0; - verus_setBlock(blockhash_half, work->target); //set data to gpu kernel + memcpy(blockhash_pre, blockhash_half, 64); + const unsigned char rk[16] = { 0 }; + aesenc(blockhash_pre, rk); + aesenc(blockhash_pre + 16, rk); + aesenc(blockhash_pre, rk); + aesenc(blockhash_pre + 16, rk); + unpacklo32(tmp, blockhash_pre, blockhash_pre + 16); + unpackhi32(blockhash_pre, blockhash_pre, blockhash_pre + 16); + memcpy(blockhash_pre + 16, tmp, 16); + verus_setBlock(blockhash_pre, work->target); //set data to gpu kernel do { - *hashes_done = nonce_buf + throughput; //*hashes_done = mainnonce; //printf("firstnoncef= %08x, maxnonce = %08x,throughput = %08x\n",first_nonce,max_nonce, throughput); @@ -162,47 +173,44 @@ extern "C" int scanhash_verus(int thr_id, struct work *work, uint32_t max_nonce, memset(blockhash_half + 32, 0x0, 32); memcpy(blockhash_half + 32, full_data + 1486 - 14, 15); //printf("blockhash half\n"); - for (int i = 0; i < 32; i++) printf("", blockhash_half[i]); - //printf("\n"); + //for (int i = 0; i < 32; i++) printf("", blockhash_half[i]); + //printf("\n"); haraka512_port_zero((unsigned char*)vhash, (unsigned char*)blockhash_half); //printf("full hash \n"); - for (int i = 0; i < 32; i++) printf("", ((uint8_t*)(&vhash))[i]); + //for (int i = 0; i < 32; i++) printf("", ((uint8_t*)(&vhash))[i]); //printf("\n"); - if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) + if (vhash[7] <= Htarg) // && fulltest(vhash, ptarget)) { + if (fulltest(vhash, ptarget)) + { + work->valid_nonces++; - work->valid_nonces++; + memcpy(work->data, endiandata, 140); + int nonce = work->valid_nonces - 1; + memcpy(work->extra, sol_data, 1347); + bn_store_hash_target_ratio(vhash, work->target, work, nonce); - memcpy(work->data, endiandata, 140); - int nonce = work->valid_nonces - 1; - memcpy(work->extra, sol_data, 1347); - bn_store_hash_target_ratio(vhash, work->target, work, nonce); + work->nonces[work->valid_nonces - 1] = endiandata[NONCE_OFT]; + } - work->nonces[work->valid_nonces - 1] = endiandata[NONCE_OFT]; pdata[NONCE_OFT] = endiandata[NONCE_OFT] + 1; - - - goto out; + if (work->valid_nonces > 0) goto out; + } + else { + gpulog(LOG_ERR, thr_id, "Invalid nonce"); } } - if ((uint64_t)throughput + (uint64_t)nonce_buf >= (uint64_t)max_nonce) { - - break; - } + if ((uint64_t)throughput + (uint64_t)nonce_buf >= (uint64_t)max_nonce) break; nonce_buf += throughput; } while (!work_restart[thr_id].restart); out: - gettimeofday(&tv_end, NULL); - timeval_subtract(&diff, &tv_end, &tv_start); - secs = (1.0 * diff.tv_sec) + (0.000001 * diff.tv_usec); - solps = (double)nonce_buf / secs; - gpulog(LOG_INFO, thr_id, "%d k/hashes in %.2f s (%.2f MH/s)", nonce_buf / 1000, secs, solps / 1000000); + // H/s diff --git a/verus/verusscan.cu b/verus/verusscan.cu index 7070067fa7..58f272b245 100644 --- a/verus/verusscan.cu +++ b/verus/verusscan.cu @@ -44,7 +44,6 @@ extern void verus_init(int thr_id); #endif - extern "C" void VerusHashHalf(void *result, const void *data, size_t len) { unsigned char buf[128]; @@ -87,6 +86,7 @@ extern "C" void VerusHashHalf(void *result, const void *data, size_t len) memcpy(result, bufPtr, 32); }; + static void cb_hashdone(int thr_id) { if (!valid_sols[thr_id]) valid_sols[thr_id] = -1; } @@ -107,7 +107,7 @@ extern "C" int scanhash_verus(int thr_id, struct work *work, uint32_t max_nonce, struct timeval tv_start, tv_end, diff; double secs, solps; - uint8_t blockhash_half[64]; + uint8_t blockhash_half[64], blockhash_pre[64], tmp[16]; uint32_t nonce_buf = 0; unsigned char block_41970[] = { 0xfd, 0x40, 0x05 }; // solution @@ -143,11 +143,19 @@ extern "C" int scanhash_verus(int thr_id, struct work *work, uint32_t max_nonce, work->valid_nonces = 0; - verus_setBlock(blockhash_half, work->target); //set data to gpu kernel + memcpy(blockhash_pre, blockhash_half, 64); + const unsigned char rk[16] = {0}; + aesenc(blockhash_pre, rk); + aesenc(blockhash_pre + 16, rk); + aesenc(blockhash_pre, rk); + aesenc(blockhash_pre + 16, rk); + unpacklo32(tmp, blockhash_pre, blockhash_pre + 16); + unpackhi32(blockhash_pre, blockhash_pre, blockhash_pre + 16); + memcpy(blockhash_pre + 16, tmp, 16); + verus_setBlock(blockhash_pre, work->target); //set data to gpu kernel do { - *hashes_done = nonce_buf + throughput; //*hashes_done = mainnonce; //printf("firstnoncef= %08x, maxnonce = %08x,throughput = %08x\n",first_nonce,max_nonce, throughput); @@ -165,36 +173,36 @@ extern "C" int scanhash_verus(int thr_id, struct work *work, uint32_t max_nonce, memset(blockhash_half + 32, 0x0, 32); memcpy(blockhash_half + 32, full_data + 1486 - 14, 15); //printf("blockhash half\n"); - for (int i = 0; i < 32; i++) printf("", blockhash_half[i]); - //printf("\n"); + //for (int i = 0; i < 32; i++) printf("", blockhash_half[i]); + //printf("\n"); haraka512_port_zero((unsigned char*)vhash, (unsigned char*)blockhash_half); //printf("full hash \n"); - for (int i = 0; i < 32; i++) printf("", ((uint8_t*)(&vhash))[i]); + //for (int i = 0; i < 32; i++) printf("", ((uint8_t*)(&vhash))[i]); //printf("\n"); - if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) + if (vhash[7] <= Htarg) // && fulltest(vhash, ptarget)) { + if (fulltest(vhash, ptarget)) + { + work->valid_nonces++; - work->valid_nonces++; + memcpy(work->data, endiandata, 140); + int nonce = work->valid_nonces - 1; + memcpy(work->extra, sol_data, 1347); + bn_store_hash_target_ratio(vhash, work->target, work, nonce); - memcpy(work->data, endiandata, 140); - int nonce = work->valid_nonces - 1; - memcpy(work->extra, sol_data, 1347); - bn_store_hash_target_ratio(vhash, work->target, work, nonce); + work->nonces[work->valid_nonces - 1] = endiandata[NONCE_OFT]; + } - work->nonces[work->valid_nonces - 1] = endiandata[NONCE_OFT]; pdata[NONCE_OFT] = endiandata[NONCE_OFT] + 1; - - - goto out; + if (work->valid_nonces > 0) goto out; + } else { + gpulog(LOG_ERR, thr_id, "Invalid nonce"); } } - if ((uint64_t)throughput + (uint64_t)nonce_buf >= (uint64_t)max_nonce) { - - break; - } + if ((uint64_t)throughput + (uint64_t)nonce_buf >= (uint64_t)max_nonce) break; nonce_buf += throughput; } while (!work_restart[thr_id].restart);