diff --git a/Makefile.am b/Makefile.am
index 816d835dbd..d7c27b590b 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -80,6 +80,9 @@ nvcc_FLAGS += $(JANSSON_INCLUDES) --ptxas-options="-v"
blake32.o: blake32.cu
$(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $<
+heavy/cuda_hefty1.o: heavy/cuda_hefty1.cu
+ $(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<
+
keccak/cuda_keccak256.o: keccak/cuda_keccak256.cu
$(NVCC) $(nvcc_FLAGS) --maxrregcount=92 -o $@ -c $<
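
The new rule mirrors the existing blake32/keccak entries: the Hefty1 translation unit gets its own NVCC invocation so that --maxrregcount=80 caps register usage for that file only. For comparison only (not used by this patch), CUDA also offers a per-kernel cap via the __launch_bounds__ qualifier; the values below are purely illustrative:

    #include <stdint.h>

    // Illustrative per-kernel alternative to the per-file --maxrregcount flag:
    // the compiler limits register use so at least 2 blocks of 256 threads can
    // be resident per multiprocessor. The numbers are hypothetical.
    __global__ void __launch_bounds__(256, 2)
    launch_bounds_demo(int threads, uint32_t *out)
    {
        int thread = blockDim.x * blockIdx.x + threadIdx.x;
        if (thread < threads)
            out[thread] = (uint32_t) thread;
    }
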
diff --git a/blake32.cu b/blake32.cu
index 75e656a4bc..307d1ac37a 100644
--- a/blake32.cu
+++ b/blake32.cu
@@ -303,7 +303,11 @@ void blake256_gpu_hash_16(const uint32_t threads, const uint32_t startNonce, uin
ending[3] = nonce; /* our tested value */
blake256_compress(h, ending, 640, rounds);
-
+#if 0
+ if (trace) {
+ printf("blake hash[6][7]: %08x %08x\n", h[6], h[7]);
+ }
+#endif
//if (h[7] == 0 && high64 <= highTarget) {
if (h[7] == 0) {
#if NBN == 2
@@ -318,14 +322,14 @@ void blake256_gpu_hash_16(const uint32_t threads, const uint32_t startNonce, uin
#else
resNonce[0] = nonce;
#endif
- if (trace) {
#ifdef _DEBUG
+ if (trace) {
uint64_t high64 = ((uint64_t*)h)[3];
printf("gpu: %16llx\n", high64);
printf("gpu: %08x.%08x\n", h[7], h[6]);
printf("tgt: %16llx\n", highTarget);
-#endif
}
+#endif
}
}
}
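
The hunks above keep the per-nonce trace output but make the runtime trace check itself conditional on a debug build, so release kernels contain no device printf at all. A minimal sketch of that combined guard (the trace flag and hash words stand in for the ones used in blake32.cu):

    #include <stdio.h>
    #include <stdint.h>

    __global__ void trace_guard_demo(const uint32_t *h, bool trace)
    {
    #ifdef _DEBUG
        // Compiled in only for debug builds, and still skipped at runtime
        // unless tracing was requested.
        if (trace)
            printf("gpu: %08x.%08x\n", h[7], h[6]);
    #endif
    }
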
diff --git a/ccminer.vcxproj b/ccminer.vcxproj
index e4ee1dadfa..73929122a9 100644
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@@ -173,7 +173,7 @@
80
true
false
- compute_30,sm_30;compute_50,sm_50
+ compute_50,sm_50
--ptxas-options="-O2" %(AdditionalOptions)
@@ -306,12 +306,7 @@
-
-
-
-
-
-
+
@@ -358,6 +353,7 @@
+ 80
diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters
index a50b1faf69..c2bf0602ba 100644
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@@ -43,9 +43,6 @@
{c3222908-22ba-4586-a637-6363f455b06d}
-
- {3281db48-f394-49ea-a1ef-6ebd09828d50}
-
{f3ed23a2-8ce7-41a5-b051-6da56047dc35}
@@ -293,23 +290,8 @@
Header Files\sph
-
- Header Files\CUDA\heavy
-
-
- Header Files\CUDA\heavy
-
-
- Header Files\CUDA\heavy
-
-
- Header Files\CUDA\heavy
-
-
- Header Files\CUDA\heavy
-
-
- Header Files\CUDA\heavy
+
+ Header Files\CUDA
Header Files\CUDA
@@ -539,4 +521,4 @@
Source Files\CUDA\x11
-
+
\ No newline at end of file
diff --git a/heavy/cuda_blake512.cu b/heavy/cuda_blake512.cu
index fe58bc09a1..b177514c25 100644
--- a/heavy/cuda_blake512.cu
+++ b/heavy/cuda_blake512.cu
@@ -3,11 +3,11 @@
#include "cuda_helper.h"
-// globaler Speicher für alle HeftyHashes aller Threads
-extern uint32_t *d_heftyHashes[8];
-extern uint32_t *d_nonceVector[8];
+// globaler Speicher für alle HeftyHashes aller Threads
+extern uint32_t *heavy_heftyHashes[8];
+extern uint32_t *heavy_nonceVector[8];
-// globaler Speicher für unsere Ergebnisse
+// globaler Speicher für unsere Ergebnisse
uint32_t *d_hash5output[8];
// die Message (112 bzw. 116 Bytes) mit Padding zur Berechnung auf der GPU
@@ -53,13 +53,13 @@ __constant__ uint64_t c_u512[16];
const uint64_t host_u512[16] =
{
- 0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL,
+ 0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL,
0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL,
- 0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL,
+ 0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL,
0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL,
- 0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL,
+ 0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL,
0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL,
- 0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL,
+ 0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL,
0x0801f2e2858efc16ULL, 0x636920d871574e69ULL
};
@@ -123,7 +123,7 @@ template <int BLOCKSIZE> __global__ void blake512_gpu_hash(int threads, uint32_t
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
- // bestimme den aktuellen Zähler
+ // bestimme den aktuellen Zähler
//uint32_t nounce = startNounce + thread;
uint32_t nounce = nonceVector[thread];
@@ -141,10 +141,10 @@ template <int BLOCKSIZE> __global__ void blake512_gpu_hash(int threads, uint32_t
h[6] = 0x1f83d9abfb41bd6bULL;
h[7] = 0x5be0cd19137e2179ULL;
- // 128 Byte für die Message
+ // 128 Byte für die Message
uint64_t buf[16];
- // Message für die erste Runde in Register holen
+ // Message für die erste Runde in Register holen
#pragma unroll 16
for (int i=0; i < 16; ++i) buf[i] = c_PaddedMessage[i];
@@ -154,7 +154,7 @@ template <int BLOCKSIZE> __global__ void blake512_gpu_hash(int threads, uint32_t
uint32_t *hefty = heftyHashes + 8 * hashPosition;
if (BLOCKSIZE == 84) {
// den thread-spezifischen Hefty1 hash einsetzen
- // aufwändig, weil das nicht mit uint64_t Wörtern aligned ist.
+ // aufwändig, weil das nicht mit uint64_t Wörtern aligned ist.
buf[10] = REPLACE_HIWORD(buf[10], hefty[0]);
buf[11] = REPLACE_LOWORD(buf[11], hefty[1]);
buf[11] = REPLACE_HIWORD(buf[11], hefty[2]);
@@ -173,14 +173,14 @@ template <int BLOCKSIZE> __global__ void blake512_gpu_hash(int threads, uint32_t
// erste Runde
blake512_compress( h, buf, 0, c_sigma, c_u512 );
-
-
+
+
// zweite Runde
#pragma unroll 15
for (int i=0; i < 15; ++i) buf[i] = c_SecondRound[i];
buf[15] = SWAP64(8*(BLOCKSIZE+32)); // Blocksize in Bits einsetzen
blake512_compress( h, buf, 1, c_sigma, c_u512 );
-
+
// Hash rauslassen
uint64_t *outHash = (uint64_t *)outputHash + 8 * hashPosition;
#pragma unroll 8
@@ -210,8 +210,8 @@ __host__ void blake512_cpu_init(int thr_id, int threads)
sizeof(host_SecondRound),
0, cudaMemcpyHostToDevice);
- // Speicher für alle Ergebnisse belegen
- cudaMalloc(&d_hash5output[thr_id], 16 * sizeof(uint32_t) * threads);
+ // Speicher für alle Ergebnisse belegen
+ CUDA_SAFE_CALL(cudaMalloc(&d_hash5output[thr_id], 16 * sizeof(uint32_t) * threads));
}
static int BLOCKSIZE = 84;
@@ -222,14 +222,14 @@ __host__ void blake512_cpu_setBlock(void *pdata, int len)
{
unsigned char PaddedMessage[128];
if (len == 84) {
- // Message mit Padding für erste Runde bereitstellen
+ // Message mit Padding für erste Runde bereitstellen
memcpy(PaddedMessage, pdata, 84);
- memset(PaddedMessage+84, 0, 32); // leeres Hefty Hash einfüllen
+ memset(PaddedMessage+84, 0, 32); // leeres Hefty Hash einfüllen
memset(PaddedMessage+116, 0, 12);
PaddedMessage[116] = 0x80;
} else if (len == 80) {
memcpy(PaddedMessage, pdata, 80);
- memset(PaddedMessage+80, 0, 32); // leeres Hefty Hash einfüllen
+ memset(PaddedMessage+80, 0, 32); // leeres Hefty Hash einfüllen
memset(PaddedMessage+112, 0, 16);
PaddedMessage[112] = 0x80;
}
@@ -246,11 +246,11 @@ __host__ void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
- // Größe des dynamischen Shared Memory Bereichs
+ // Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
if (BLOCKSIZE == 80)
- blake512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash5output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
+ blake512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash5output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
else if (BLOCKSIZE == 84)
- blake512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash5output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
+ blake512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash5output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
}
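
cudaMalloc and several cudaMemcpy calls in this patch are now wrapped in CUDA_SAFE_CALL. The macro itself comes from cuda_helper.h and is not part of this diff; a minimal sketch of such a wrapper, assuming report-and-abort behaviour, looks like:

    #include <cstdio>
    #include <cstdlib>
    #include <cuda_runtime.h>

    // Hypothetical minimal form; the real macro in cuda_helper.h may report
    // or abort differently.
    #define CUDA_SAFE_CALL(call) do {                                   \
        cudaError_t err = (call);                                       \
        if (err != cudaSuccess) {                                       \
            fprintf(stderr, "CUDA error '%s' at %s:%d\n",               \
                    cudaGetErrorString(err), __FILE__, __LINE__);       \
            exit(EXIT_FAILURE);                                         \
        }                                                               \
    } while (0)
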
diff --git a/heavy/cuda_blake512.h b/heavy/cuda_blake512.h
deleted file mode 100644
index 7e24973348..0000000000
--- a/heavy/cuda_blake512.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef _CUDA_BLAKE512_H
-#define _CUDA_BLAKE512_H
-
-void blake512_cpu_init(int thr_id, int threads);
-void blake512_cpu_setBlock(void *pdata, int len);
-void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce);
-#endif
diff --git a/heavy/cuda_combine.cu b/heavy/cuda_combine.cu
index 329c83163c..b0b2ead8d6 100644
--- a/heavy/cuda_combine.cu
+++ b/heavy/cuda_combine.cu
@@ -1,16 +1,19 @@
-#include "cuda_helper.h"
+#include
-// globaler Speicher für unsere Ergebnisse
-uint32_t *d_hashoutput[8];
+#include "cuda_helper.h"
+// globaler Speicher für unsere Ergebnisse
+static uint32_t *d_hashoutput[8];
extern uint32_t *d_hash2output[8];
extern uint32_t *d_hash3output[8];
extern uint32_t *d_hash4output[8];
extern uint32_t *d_hash5output[8];
-extern uint32_t *d_nonceVector[8];
+
+extern uint32_t *heavy_nonceVector[8];
/* Combines top 64-bits from each hash into a single hash */
-static void __device__ combine_hashes(uint32_t *out, uint32_t *hash1, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4)
+__device__
+static void combine_hashes(uint32_t *out, uint32_t *hash1, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4)
{
uint32_t lout[8]; // Combining in Registern machen
@@ -98,7 +101,8 @@ static void __device__ combine_hashes(uint32_t *out, uint32_t *hash1, uint32_t *
out[i] = lout[i];
}
-__global__ void combine_gpu_hash(int threads, uint32_t startNounce, uint32_t *out, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4, uint32_t *hash5, uint32_t *nonceVector)
+__global__
+void combine_gpu_hash(int threads, uint32_t startNounce, uint32_t *out, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4, uint32_t *hash5, uint32_t *nonceVector)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
@@ -116,13 +120,14 @@ __global__ void combine_gpu_hash(int threads, uint32_t startNounce, uint32_t *ou
}
}
-// Setup-Funktionen
-__host__ void combine_cpu_init(int thr_id, int threads)
+__host__
+void combine_cpu_init(int thr_id, int threads)
{
- // Speicher für alle Ergebnisse belegen
- cudaMalloc(&d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads);
+ // Speicher für alle Ergebnisse belegen
+ CUDA_SAFE_CALL(cudaMalloc(&d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads));
}
+__host__
void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *hash)
{
// diese Kopien sind optional, da die Hashes jetzt bereits auf der GPU liegen sollten
@@ -133,11 +138,8 @@ void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *h
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
- // Größe des dynamischen Shared Memory Bereichs
- size_t shared_size = 0;
-
- combine_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_hashoutput[thr_id], d_hash2output[thr_id], d_hash3output[thr_id], d_hash4output[thr_id], d_hash5output[thr_id], d_nonceVector[thr_id]);
+ combine_gpu_hash <<<grid, block>>> (threads, startNounce, d_hashoutput[thr_id], d_hash2output[thr_id], d_hash3output[thr_id], d_hash4output[thr_id], d_hash5output[thr_id], heavy_nonceVector[thr_id]);
- // da die Hash Auswertung noch auf der CPU erfolgt, müssen die Ergebnisse auf jeden Fall zum Host kopiert werden
- cudaMemcpy(hash, d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads, cudaMemcpyDeviceToHost);
+ // da die Hash Auswertung noch auf der CPU erfolgt, müssen die Ergebnisse auf jeden Fall zum Host kopiert werden
+ CUDA_SAFE_CALL(cudaMemcpy(hash, d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads, cudaMemcpyDeviceToHost));
}
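
All the *_cpu_hash() wrappers in this patch size their launches the same way: round the work-item count up to whole blocks and let the kernel mask off the padding threads. A small self-contained sketch of that convention (128 threads per block is illustrative; the per-kernel values vary):

    #include <cuda_runtime.h>
    #include <stdint.h>

    __global__ void pad_demo(int threads, uint32_t *out)
    {
        int thread = blockDim.x * blockIdx.x + threadIdx.x;
        if (thread < threads)          // mask off the round-up padding
            out[thread] = (uint32_t) thread;
    }

    void pad_demo_launch(int threads, uint32_t *d_out)
    {
        const int threadsperblock = 128;                               // illustrative
        dim3 grid((threads + threadsperblock - 1) / threadsperblock); // ceil division
        dim3 block(threadsperblock);
        pad_demo<<<grid, block>>>(threads, d_out);
    }
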
diff --git a/heavy/cuda_combine.h b/heavy/cuda_combine.h
deleted file mode 100644
index 5bb5832d19..0000000000
--- a/heavy/cuda_combine.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef _CUDA_COMBINE_H
-#define _CUDA_COMBINE_H
-
-void combine_cpu_init(int thr_id, int threads);
-void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *hash);
-
-#endif
diff --git a/heavy/cuda_groestl512.cu b/heavy/cuda_groestl512.cu
index 764b07228a..889002a173 100644
--- a/heavy/cuda_groestl512.cu
+++ b/heavy/cuda_groestl512.cu
@@ -3,11 +3,11 @@
#include "cuda_helper.h"
-// globaler Speicher für alle HeftyHashes aller Threads
-extern uint32_t *d_heftyHashes[8];
-extern uint32_t *d_nonceVector[8];
+// globaler Speicher für alle HeftyHashes aller Threads
+extern uint32_t *heavy_heftyHashes[8];
+extern uint32_t *heavy_nonceVector[8];
-// globaler Speicher für unsere Ergebnisse
+// globaler Speicher für unsere Ergebnisse
uint32_t *d_hash4output[8];
__constant__ uint32_t groestl_gpu_state[32];
@@ -603,22 +603,22 @@ __device__ void groestl512_perm_P(uint32_t *a)
#pragma unroll 16
for(int k=0;k<32;k+=2)
{
- t[k + 0] = T0up( B32_0(a[k & 0x1f]) ) ^
- T1up( B32_1(a[(k + 2) & 0x1f]) ) ^
- T2up( B32_2(a[(k + 4) & 0x1f]) ) ^
- T3up( B32_3(a[(k + 6) & 0x1f]) ) ^
- T0dn( B32_0(a[(k + 9) & 0x1f]) ) ^
- T1dn( B32_1(a[(k + 11) & 0x1f]) ) ^
- T2dn( B32_2(a[(k + 13) & 0x1f]) ) ^
+ t[k + 0] = T0up( B32_0(a[k & 0x1f]) ) ^
+ T1up( B32_1(a[(k + 2) & 0x1f]) ) ^
+ T2up( B32_2(a[(k + 4) & 0x1f]) ) ^
+ T3up( B32_3(a[(k + 6) & 0x1f]) ) ^
+ T0dn( B32_0(a[(k + 9) & 0x1f]) ) ^
+ T1dn( B32_1(a[(k + 11) & 0x1f]) ) ^
+ T2dn( B32_2(a[(k + 13) & 0x1f]) ) ^
T3dn( B32_3(a[(k + 23) & 0x1f]) );
- t[k + 1] = T0dn( B32_0(a[k & 0x1f]) ) ^
- T1dn( B32_1(a[(k + 2) & 0x1f]) ) ^
- T2dn( B32_2(a[(k + 4) & 0x1f]) ) ^
- T3dn( B32_3(a[(k + 6) & 0x1f]) ) ^
- T0up( B32_0(a[(k + 9) & 0x1f]) ) ^
- T1up( B32_1(a[(k + 11) & 0x1f]) ) ^
- T2up( B32_2(a[(k + 13) & 0x1f]) ) ^
+ t[k + 1] = T0dn( B32_0(a[k & 0x1f]) ) ^
+ T1dn( B32_1(a[(k + 2) & 0x1f]) ) ^
+ T2dn( B32_2(a[(k + 4) & 0x1f]) ) ^
+ T3dn( B32_3(a[(k + 6) & 0x1f]) ) ^
+ T0up( B32_0(a[(k + 9) & 0x1f]) ) ^
+ T1up( B32_1(a[(k + 11) & 0x1f]) ) ^
+ T2up( B32_2(a[(k + 13) & 0x1f]) ) ^
T3up( B32_3(a[(k + 23) & 0x1f]) );
}
#pragma unroll 32
@@ -645,22 +645,22 @@ __device__ void groestl512_perm_Q(uint32_t *a)
#pragma unroll 16
for(int k=0;k<32;k+=2)
{
- t[k + 0] = T0up( B32_0(a[(k + 2) & 0x1f]) ) ^
- T1up( B32_1(a[(k + 6) & 0x1f]) ) ^
- T2up( B32_2(a[(k + 10) & 0x1f]) ) ^
- T3up( B32_3(a[(k + 22) & 0x1f]) ) ^
- T0dn( B32_0(a[(k + 1) & 0x1f]) ) ^
- T1dn( B32_1(a[(k + 5) & 0x1f]) ) ^
- T2dn( B32_2(a[(k + 9) & 0x1f]) ) ^
+ t[k + 0] = T0up( B32_0(a[(k + 2) & 0x1f]) ) ^
+ T1up( B32_1(a[(k + 6) & 0x1f]) ) ^
+ T2up( B32_2(a[(k + 10) & 0x1f]) ) ^
+ T3up( B32_3(a[(k + 22) & 0x1f]) ) ^
+ T0dn( B32_0(a[(k + 1) & 0x1f]) ) ^
+ T1dn( B32_1(a[(k + 5) & 0x1f]) ) ^
+ T2dn( B32_2(a[(k + 9) & 0x1f]) ) ^
T3dn( B32_3(a[(k + 13) & 0x1f]) );
- t[k + 1] = T0dn( B32_0(a[(k + 2) & 0x1f]) ) ^
- T1dn( B32_1(a[(k + 6) & 0x1f]) ) ^
- T2dn( B32_2(a[(k + 10) & 0x1f]) ) ^
- T3dn( B32_3(a[(k + 22) & 0x1f]) ) ^
- T0up( B32_0(a[(k + 1) & 0x1f]) ) ^
- T1up( B32_1(a[(k + 5) & 0x1f]) ) ^
- T2up( B32_2(a[(k + 9) & 0x1f]) ) ^
+ t[k + 1] = T0dn( B32_0(a[(k + 2) & 0x1f]) ) ^
+ T1dn( B32_1(a[(k + 6) & 0x1f]) ) ^
+ T2dn( B32_2(a[(k + 10) & 0x1f]) ) ^
+ T3dn( B32_3(a[(k + 22) & 0x1f]) ) ^
+ T0up( B32_0(a[(k + 1) & 0x1f]) ) ^
+ T1up( B32_1(a[(k + 5) & 0x1f]) ) ^
+ T2up( B32_2(a[(k + 9) & 0x1f]) ) ^
T3up( B32_3(a[(k + 13) & 0x1f]) );
}
#pragma unroll 32
@@ -677,7 +677,7 @@ template <int BLOCKSIZE> __global__ void groestl512_gpu_hash(int threads, uint32
uint32_t message[32];
uint32_t state[32];
- // lese message ein & verknüpfe diese mit dem hash1 von hefty1
+ // lese message ein & verknüpfe diese mit dem hash1 von hefty1
// lese den state ein
#pragma unroll 32
@@ -700,7 +700,7 @@ template <int BLOCKSIZE> __global__ void groestl512_gpu_hash(int threads, uint32
#pragma unroll 8
for (int k=0; k<8; ++k)
message[BLOCKSIZE/4+k] = heftyHash[k];
-
+
uint32_t g[32];
#pragma unroll 32
for(int u=0;u<32;u++)
@@ -709,7 +709,7 @@ template <int BLOCKSIZE> __global__ void groestl512_gpu_hash(int threads, uint32
// Perm
groestl512_perm_P(g);
groestl512_perm_Q(message);
-
+
#pragma unroll 32
for(int u=0;u<32;u++)
{
@@ -753,7 +753,7 @@ __host__ void groestl512_cpu_init(int thr_id, int threads)
texDef(t3up, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
texDef(t3dn, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);
- // Speicher für alle Ergebnisse belegen
+ // Speicher für alle Ergebnisse belegen
cudaMalloc(&d_hash4output[thr_id], 16 * sizeof(uint32_t) * threads);
}
@@ -778,31 +778,27 @@ __host__ void groestl512_cpu_setBlock(void *data, int len)
msgBlock[28] = 0x80;
msgBlock[31] = 0x01000000;
}
- // groestl512 braucht hierfür keinen CPU-Code (die einzige Runde wird
- // auf der GPU ausgeführt)
+ // groestl512 braucht hierfür keinen CPU-Code (die einzige Runde wird
+ // auf der GPU ausgeführt)
- // setze register
+ // setze register
uint32_t groestl_state_init[32];
memset(groestl_state_init, 0, sizeof(uint32_t) * 32);
groestl_state_init[31] = 0x20000;
// state speichern
- cudaMemcpyToSymbol( groestl_gpu_state,
- groestl_state_init,
- 128);
+ cudaMemcpyToSymbol(groestl_gpu_state, groestl_state_init, 128);
// Blockheader setzen (korrekte Nonce und Hefty Hash fehlen da drin noch)
- cudaMemcpyToSymbol( groestl_gpu_msg,
- msgBlock,
- 128);
+ cudaMemcpyToSymbol(groestl_gpu_msg, msgBlock, 128);
BLOCKSIZE = len;
}
__host__ void groestl512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy)
{
// Hefty1 Hashes kopieren (eigentlich nur zum debuggen)
- if (copy)
- cudaMemcpy( d_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice );
+ if (copy)
+ CUDA_SAFE_CALL(cudaMemcpy(heavy_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice));
}
__host__ void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
@@ -813,11 +809,11 @@ __host__ void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
- // Größe des dynamischen Shared Memory Bereichs
+ // Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
if (BLOCKSIZE == 84)
- groestl512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash4output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
+ groestl512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash4output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
else if (BLOCKSIZE == 80)
- groestl512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash4output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
+ groestl512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash4output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
}
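
groestl512_gpu_hash, like the other heavy kernels, is templated on the block header length so that the 80- versus 84-byte handling is fixed at compile time, and the host wrapper selects the matching instantiation. A trimmed sketch of the pattern with placeholder names and body:

    #include <cuda_runtime.h>
    #include <stdint.h>

    template <int BLOCKSIZE>                 // 80 or 84, fixed at compile time
    __global__ void demo_gpu_hash(int threads, uint32_t startNounce, uint32_t *out)
    {
        int thread = blockDim.x * blockIdx.x + threadIdx.x;
        if (thread < threads) {
            // BLOCKSIZE-dependent code compiles into two separate kernels,
            // so there is no per-thread branch on the header length.
            out[thread] = startNounce + thread + BLOCKSIZE;
        }
    }

    void demo_cpu_hash(int threads, uint32_t startNounce, uint32_t *d_out,
                       dim3 grid, dim3 block, int blocksize)
    {
        if (blocksize == 84)
            demo_gpu_hash<84><<<grid, block>>>(threads, startNounce, d_out);
        else if (blocksize == 80)
            demo_gpu_hash<80><<<grid, block>>>(threads, startNounce, d_out);
    }
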
diff --git a/heavy/cuda_groestl512.h b/heavy/cuda_groestl512.h
deleted file mode 100644
index 0cdc13b809..0000000000
--- a/heavy/cuda_groestl512.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifndef _CUDA_GROESTL512_H
-#define _CUDA_GROESTL512_H
-
-void groestl512_cpu_init(int thr_id, int threads);
-void groestl512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy);
-void groestl512_cpu_setBlock(void *data, int len);
-void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce);
-
-#endif
\ No newline at end of file
diff --git a/heavy/cuda_hefty1.cu b/heavy/cuda_hefty1.cu
index 6d2b324782..0ca31057ce 100644
--- a/heavy/cuda_hefty1.cu
+++ b/heavy/cuda_hefty1.cu
@@ -1,12 +1,14 @@
#include
#include
+#include "miner.h"
+
#include "cuda_helper.h"
#define USE_SHARED 1
-// globaler Speicher für alle HeftyHashes aller Threads
-uint32_t *d_heftyHashes[8];
+// globaler Speicher für alle HeftyHashes aller Threads
+uint32_t *heavy_heftyHashes[8];
/* Hash-Tabellen */
__constant__ uint32_t hefty_gpu_constantTable[64];
@@ -30,7 +32,7 @@ uint32_t hefty_cpu_hashTable[] = {
0x9b05688cUL,
0x1f83d9abUL,
0x5be0cd19UL };
-
+
uint32_t hefty_cpu_constantTable[] = {
0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL,
0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL,
@@ -50,11 +52,16 @@ uint32_t hefty_cpu_constantTable[] = {
0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL
};
-//#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
-static __host__ __device__ uint32_t S(uint32_t x, int n)
+#if 0
+#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
+#else
+__host__ __device__
+static uint32_t S(uint32_t x, int n)
{
return (((x) >> (n)) | ((x) << (32 - (n))));
}
+#endif
+
#define R(x, n) ((x) >> (n))
#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
#define Maj(x, y, z) ((x & (y | z)) | (y & z))
@@ -67,7 +74,9 @@ static __host__ __device__ uint32_t S(uint32_t x, int n)
// uint8_t
#define smoosh4(x) ( ((x)>>4) ^ ((x) & 0x0F) )
-__host__ __forceinline__ __device__ uint8_t smoosh2(uint32_t x)
+
+__host__ __forceinline__ __device__
+uint8_t smoosh2(uint32_t x)
{
uint16_t w = (x >> 16) ^ (x & 0xffff);
uint8_t n = smoosh4( (uint8_t)( (w >> 8) ^ (w & 0xFF) ) );
@@ -77,13 +86,14 @@ __host__ __forceinline__ __device__ uint8_t smoosh2(uint32_t x)
#define smoosh4Quad(x) ( (((x)>>4) ^ (x)) & 0x0F0F0F0F )
#define getByte(x,y) ( ((x) >> (y)) & 0xFF )
-__host__ __forceinline__ __device__ void Mangle(uint32_t *inp)
+__host__ __forceinline__ __device__
+void Mangle(uint32_t *inp)
{
uint32_t r = smoosh4Quad(inp[0]);
uint32_t inp0org;
uint32_t tmp0Mask, tmp1Mask;
uint32_t in1, in2, isAddition;
- uint32_t tmp;
+ int32_t tmp;
uint8_t b;
inp[1] = inp[1] ^ S(inp[0], getByte(r, 24));
@@ -92,24 +102,24 @@ __host__ __forceinline__ __device__ void Mangle(uint32_t *inp)
tmp = smoosh2(inp[1]);
b = getByte(r,tmp);
inp0org = S(inp[0], b);
- tmp0Mask = -((tmp >> 3)&1); // Bit 3 an Position 0
- tmp1Mask = -((tmp >> 4)&1); // Bit 4 an Position 0
-
- in1 = (inp[2] & ~inp0org) |
+ tmp0Mask = (uint32_t) -((tmp >> 3) & 1); // Bit 3 an Position 0
+ tmp1Mask = (uint32_t) -((tmp >> 4) & 1); // Bit 4 an Position 0
+
+ in1 = (inp[2] & ~inp0org) |
(tmp1Mask & ~inp[2] & inp0org) |
(~tmp0Mask & ~inp[2] & inp0org);
in2 = inp[2] += ~inp0org;
isAddition = ~tmp0Mask & tmp1Mask;
inp[2] = isAddition ? in2 : in1;
-
+
r += 0x01010101;
tmp = smoosh2(inp[1] ^ inp[2]);
b = getByte(r,tmp);
inp0org = S(inp[0], b);
- tmp0Mask = -((tmp >> 3)&1); // Bit 3 an Position 0
- tmp1Mask = -((tmp >> 4)&1); // Bit 4 an Position 0
+ tmp0Mask = (uint32_t) -((tmp >> 3) & 1); // Bit 3 an Position 0
+ tmp1Mask = (uint32_t) -((tmp >> 4) & 1); // Bit 4 an Position 0
- in1 = (inp[3] & ~inp0org) |
+ in1 = (inp[3] & ~inp0org) |
(tmp1Mask & ~inp[3] & inp0org) |
(~tmp0Mask & ~inp[3] & inp0org);
in2 = inp[3] += ~inp0org;
@@ -119,20 +129,23 @@ __host__ __forceinline__ __device__ void Mangle(uint32_t *inp)
inp[0] ^= (inp[1] ^ inp[2]) + inp[3];
}
-__host__ __forceinline__ __device__ void Absorb(uint32_t *inp, uint32_t x)
+__host__ __forceinline__ __device__
+void Absorb(uint32_t *inp, uint32_t x)
{
inp[0] ^= x;
Mangle(inp);
}
-__host__ __forceinline__ __device__ uint32_t Squeeze(uint32_t *inp)
+__host__ __forceinline__ __device__
+uint32_t Squeeze(uint32_t *inp)
{
uint32_t y = inp[0];
Mangle(inp);
return y;
}
-__host__ __forceinline__ __device__ uint32_t Br(uint32_t *sponge, uint32_t x)
+__host__ __forceinline__ __device__
+uint32_t Br(uint32_t *sponge, uint32_t x)
{
uint32_t r = Squeeze(sponge);
uint32_t t = ((r >> 8) & 0x1F);
@@ -146,11 +159,12 @@ __host__ __forceinline__ __device__ uint32_t Br(uint32_t *sponge, uint32_t x)
return retVal;
}
-__forceinline__ __device__ void hefty_gpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge)
+__device__ __forceinline__
+void hefty_gpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge)
{
uint32_t tmpBr;
- uint32_t brG = Br(sponge, regs[6]);
+ uint32_t brG = Br(sponge, regs[6]);
uint32_t brF = Br(sponge, regs[5]);
uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K;
uint32_t brE = Br(sponge, regs[4]);
@@ -169,11 +183,12 @@ __forceinline__ __device__ void hefty_gpu_round(uint32_t *regs, uint32_t W, uint
regs[4] += tmpBr;
}
-__host__ void hefty_cpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge)
+__host__
+void hefty_cpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge)
{
uint32_t tmpBr;
- uint32_t brG = Br(sponge, regs[6]);
+ uint32_t brG = Br(sponge, regs[6]);
uint32_t brF = Br(sponge, regs[5]);
uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K;
uint32_t brE = Br(sponge, regs[4]);
@@ -191,11 +206,11 @@ __host__ void hefty_cpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *
regs[4] += tmpBr;
}
-// Die Hash-Funktion
-__global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHash)
+__global__
+void hefty_gpu_hash(int threads, uint32_t startNounce, uint32_t *outputHash)
{
- #if USE_SHARED
- extern __shared__ char heftytab[];
+#if USE_SHARED
+ extern __shared__ unsigned char heftytab[];
if(threadIdx.x < 64)
{
*((uint32_t*)heftytab + threadIdx.x) = hefty_gpu_constantTable[threadIdx.x];
@@ -207,9 +222,9 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
- // bestimme den aktuellen Zähler
+ // bestimme den aktuellen Zähler
uint32_t nounce = startNounce + thread;
-
+
// jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory
// reduktion von 256 byte auf 128 byte
uint32_t W1[16];
@@ -219,7 +234,7 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa
uint32_t regs[8];
uint32_t hash[8];
uint32_t sponge[4];
-
+
#pragma unroll 4
for(int k=0; k < 4; k++)
sponge[k] = hefty_gpu_sponge[k];
@@ -231,7 +246,7 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa
regs[k] = hefty_gpu_register[k];
hash[k] = regs[k];
}
-
+
//memcpy(W, &hefty_gpu_blockHeader[0], sizeof(uint32_t) * 16); // verbleibende 20 bytes aus Block 2 plus padding
#pragma unroll 16
for(int k=0;k<16;k++)
@@ -252,7 +267,7 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa
}
// Progress W2 (Bytes 64...127) then W3 (Bytes 128...191) ...
-
+
#pragma unroll 3
for(int k=0;k<3;k++)
{
@@ -279,7 +294,7 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa
for(int j=0;j<16;j++)
W1[j] = W2[j];
}
-
+
#pragma unroll 8
for(int k=0;k<8;k++)
hash[k] += regs[k];
@@ -290,27 +305,28 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa
}
}
-// Setup-Funktionen
-__host__ void hefty_cpu_init(int thr_id, int threads)
+__host__
+void hefty_cpu_init(int thr_id, int threads)
{
cudaSetDevice(device_map[thr_id]);
// Kopiere die Hash-Tabellen in den GPU-Speicher
- cudaMemcpyToSymbol( hefty_gpu_constantTable,
+ cudaMemcpyToSymbol( hefty_gpu_constantTable,
hefty_cpu_constantTable,
sizeof(uint32_t) * 64 );
- // Speicher für alle Hefty1 hashes belegen
- cudaMalloc(&d_heftyHashes[thr_id], 8 * sizeof(uint32_t) * threads);
+ // Speicher für alle Hefty1 hashes belegen
+ CUDA_SAFE_CALL(cudaMalloc(&heavy_heftyHashes[thr_id], 8 * sizeof(uint32_t) * threads));
}
-__host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len)
+__host__
+void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len)
// data muss 80/84-Byte haben!
{
// Nachricht expandieren und setzen
uint32_t msgBlock[32];
- memset(msgBlock, 0, sizeof(uint32_t) * 32);
+ memset(msgBlock, 0, sizeof(msgBlock));
memcpy(&msgBlock[0], data, len);
if (len == 84) {
msgBlock[21] |= 0x80;
@@ -319,17 +335,17 @@ __host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len)
msgBlock[20] |= 0x80;
msgBlock[31] = 640; // bitlen
}
-
+
for(int i=0;i<31;i++) // Byteorder drehen
msgBlock[i] = SWAB32(msgBlock[i]);
- // die erste Runde wird auf der CPU durchgeführt, da diese für
+ // die erste Runde wird auf der CPU durchgeführt, da diese für
// alle Threads gleich ist. Der Hash wird dann an die Threads
- // übergeben
+ // übergeben
// Erstelle expandierten Block W
- uint32_t W[64];
- memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16);
+ uint32_t W[64];
+ memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16);
for(int j=16;j<64;j++)
W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16];
@@ -344,7 +360,7 @@ __host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len)
{
regs[k] = hefty_cpu_hashTable[k];
hash[k] = regs[k];
- }
+ }
// 1. Runde
for(int j=0;j<16;j++)
@@ -366,39 +382,30 @@ __host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len)
hash[k] += regs[k];
// sponge speichern
-
- cudaMemcpyToSymbol( hefty_gpu_sponge,
- sponge,
- sizeof(uint32_t) * 4 );
+ cudaMemcpyToSymbol(hefty_gpu_sponge, sponge, 16);
// hash speichern
- cudaMemcpyToSymbol( hefty_gpu_register,
- hash,
- sizeof(uint32_t) * 8 );
-
+ cudaMemcpyToSymbol(hefty_gpu_register, hash, 32);
// Blockheader setzen (korrekte Nonce fehlt da drin noch)
- cudaMemcpyToSymbol( hefty_gpu_blockHeader,
- &msgBlock[16],
- 64);
+ CUDA_SAFE_CALL(cudaMemcpyToSymbol(hefty_gpu_blockHeader, &msgBlock[16], 64));
}
-__host__ void hefty_cpu_hash(int thr_id, int threads, int startNounce)
+__host__
+void hefty_cpu_hash(int thr_id, int threads, int startNounce)
{
- // Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern,
- // alle anderen mit 512 Threads.
- int threadsperblock = (device_sm[device_map[thr_id]] >= 300) ? 768 : 512;
+ int threadsperblock = 256;
// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
- // Größe des dynamischen Shared Memory Bereichs
- #if USE_SHARED
- size_t shared_size = 8 * 64 * sizeof(uint32_t);
+ // Größe des dynamischen Shared Memory Bereichs
+#if USE_SHARED
+ int shared_size = 8 * 64 * sizeof(uint32_t);
#else
- size_t shared_size = 0;
+ int shared_size = 0;
#endif
- hefty_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, (void*)d_heftyHashes[thr_id]);
+ hefty_gpu_hash <<< grid, block, shared_size >>> (threads, startNounce, heavy_heftyHashes[thr_id]);
// Strategisches Sleep Kommando zur Senkung der CPU Last
MyStreamSynchronize(NULL, 0, thr_id);
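
With USE_SHARED enabled, hefty_gpu_hash stages the 64-entry constant table into dynamically sized shared memory: the array is declared extern __shared__, its size comes from the third launch parameter (8 * 64 * sizeof(uint32_t) above), and the first 64 threads of each block perform the load. A reduced sketch; the __syncthreads() barrier is assumed, since the corresponding line is outside this hunk:

    #include <cuda_runtime.h>
    #include <stdint.h>

    __constant__ uint32_t demo_constantTable[64];

    __global__ void shared_demo(int threads, uint32_t *out)
    {
        extern __shared__ unsigned char heftytab[];   // sized by the launch parameter
        if (threadIdx.x < 64)
            ((uint32_t *)heftytab)[threadIdx.x] = demo_constantTable[threadIdx.x];
        __syncthreads();                              // assumed barrier before first use

        int thread = blockDim.x * blockIdx.x + threadIdx.x;
        if (thread < threads)
            out[thread] = ((uint32_t *)heftytab)[thread & 63];
    }

    void shared_demo_launch(int threads, uint32_t *d_out, dim3 grid, dim3 block)
    {
        int shared_size = 8 * 64 * sizeof(uint32_t);  // matches hefty_cpu_hash()
        shared_demo<<<grid, block, shared_size>>>(threads, d_out);
    }
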
diff --git a/heavy/cuda_hefty1.h b/heavy/cuda_hefty1.h
deleted file mode 100644
index 17b196c836..0000000000
--- a/heavy/cuda_hefty1.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef _CUDA_HEFTY1_H
-#define _CUDA_HEFTY1_H
-
-void hefty_cpu_hash(int thr_id, int threads, int startNounce);
-void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len);
-void hefty_cpu_init(int thr_id, int threads);
-
-#endif
\ No newline at end of file
diff --git a/heavy/cuda_keccak512.cu b/heavy/cuda_keccak512.cu
index 94aadf9562..8c96b66e91 100644
--- a/heavy/cuda_keccak512.cu
+++ b/heavy/cuda_keccak512.cu
@@ -3,11 +3,11 @@
#include "cuda_helper.h"
-// globaler Speicher für alle HeftyHashes aller Threads
-extern uint32_t *d_heftyHashes[8];
-extern uint32_t *d_nonceVector[8];
+// globaler Speicher für alle HeftyHashes aller Threads
+extern uint32_t *heavy_heftyHashes[8];
+extern uint32_t *heavy_nonceVector[8];
-// globaler Speicher für unsere Ergebnisse
+// globaler Speicher für unsere Ergebnisse
uint32_t *d_hash3output[8];
extern uint32_t *d_hash4output[8];
extern uint32_t *d_hash5output[8];
@@ -15,13 +15,11 @@ extern uint32_t *d_hash5output[8];
// der Keccak512 State nach der ersten Runde (72 Bytes)
__constant__ uint64_t c_State[25];
-// die Message (72 Bytes) für die zweite Runde auf der GPU
+// die Message (72 Bytes) für die zweite Runde auf der GPU
__constant__ uint32_t c_PaddedMessage2[18]; // 44 bytes of remaining message (Nonce at offset 4) plus padding
// ---------------------------- BEGIN CUDA keccak512 functions ------------------------------------
-#include "cuda_helper.h"
-
#define U32TO64_LE(p) \
(((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32))
@@ -144,7 +142,7 @@ template <int BLOCKSIZE> __global__ void keccak512_gpu_hash(int threads, uint32_
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
- // bestimme den aktuellen Zähler
+ // bestimme den aktuellen Zähler
//uint32_t nounce = startNounce + thread;
uint32_t nounce = nonceVector[thread];
@@ -156,7 +154,7 @@ template <int BLOCKSIZE> __global__ void keccak512_gpu_hash(int threads, uint32_
#pragma unroll 25
for (int i=0; i < 25; ++i)
keccak_gpu_state[i] = c_State[i];
-
+
// Message2 in den Puffer holen
uint32_t msgBlock[18];
mycpy72(msgBlock, c_PaddedMessage2);
@@ -167,7 +165,7 @@ template <int BLOCKSIZE> __global__ void keccak512_gpu_hash(int threads, uint32_
// den individuellen Hefty1 Hash einsetzen
mycpy32(&msgBlock[(BLOCKSIZE-72)/sizeof(uint32_t)], &heftyHashes[8 * hashPosition]);
- // den Block einmal gut durchschütteln
+ // den Block einmal gut durchschütteln
keccak_block(keccak_gpu_state, msgBlock, c_keccak_round_constants);
// das Hash erzeugen
@@ -187,8 +185,8 @@ template <int BLOCKSIZE> __global__ void keccak512_gpu_hash(int threads, uint32_
// ---------------------------- END CUDA keccak512 functions ------------------------------------
-// Setup-Funktionen
-__host__ void keccak512_cpu_init(int thr_id, int threads)
+__host__
+void keccak512_cpu_init(int thr_id, int threads)
{
// Kopiere die Hash-Tabellen in den GPU-Speicher
cudaMemcpyToSymbol( c_keccak_round_constants,
@@ -196,7 +194,7 @@ __host__ void keccak512_cpu_init(int thr_id, int threads)
sizeof(host_keccak_round_constants),
0, cudaMemcpyHostToDevice);
- // Speicher für alle Ergebnisse belegen
+ // Speicher für alle Ergebnisse belegen
cudaMalloc(&d_hash3output[thr_id], 16 * sizeof(uint32_t) * threads);
}
@@ -212,23 +210,24 @@ __host__ void keccak512_cpu_init(int thr_id, int threads)
static int BLOCKSIZE = 84;
-__host__ void keccak512_cpu_setBlock(void *data, int len)
+__host__
+void keccak512_cpu_setBlock(void *data, int len)
// data muss 80 oder 84-Byte haben!
// heftyHash hat 32-Byte
{
// CH
- // state init
+ // state init
uint64_t keccak_cpu_state[25];
memset(keccak_cpu_state, 0, sizeof(keccak_cpu_state));
- // erste Runde
+ // erste Runde
keccak_block((uint64_t*)&keccak_cpu_state, (const uint32_t*)data, host_keccak_round_constants);
// state kopieren
cudaMemcpyToSymbol( c_State, keccak_cpu_state, 25*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
- // keccak hat 72-Byte blöcke, d.h. in unserem Fall zwei Blöcke
- // zu jeweils
+ // keccak hat 72-Byte blöcke, d.h. in unserem Fall zwei Blöcke
+ // zu jeweils
uint32_t msgBlock[18];
memset(msgBlock, 0, 18 * sizeof(uint32_t));
@@ -238,29 +237,31 @@ __host__ void keccak512_cpu_setBlock(void *data, int len)
else if (len == 80)
memcpy(&msgBlock[0], &((uint8_t*)data)[72], 8);
- // Nachricht abschließen
+ // Nachricht abschließen
if (len == 84)
msgBlock[11] = 0x01;
else if (len == 80)
msgBlock[10] = 0x01;
msgBlock[17] = 0x80000000;
-
- // Message 2 ins Constant Memory kopieren (die variable Nonce und
+
+ // Message 2 ins Constant Memory kopieren (die variable Nonce und
// der Hefty1 Anteil muss aber auf der GPU erst noch ersetzt werden)
cudaMemcpyToSymbol( c_PaddedMessage2, msgBlock, 18*sizeof(uint32_t), 0, cudaMemcpyHostToDevice );
BLOCKSIZE = len;
}
-
-__host__ void keccak512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy)
+__host__
+void keccak512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy)
{
// Hefty1 Hashes kopieren
- if (copy) cudaMemcpy( d_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice );
+ if (copy)
+ CUDA_SAFE_CALL(cudaMemcpy(heavy_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice));
//else cudaThreadSynchronize();
}
-__host__ void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
+__host__
+void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
{
const int threadsperblock = 128;
@@ -268,11 +269,11 @@ __host__ void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
- // Größe des dynamischen Shared Memory Bereichs
+ // Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
if (BLOCKSIZE==84)
- keccak512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash3output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
+ keccak512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash3output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
else if (BLOCKSIZE==80)
- keccak512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash3output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
+ keccak512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash3output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
}
diff --git a/heavy/cuda_keccak512.h b/heavy/cuda_keccak512.h
deleted file mode 100644
index 1182447573..0000000000
--- a/heavy/cuda_keccak512.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifndef _CUDA_KECCAK512_H
-#define _CUDA_KECCAK512_H
-
-void keccak512_cpu_init(int thr_id, int threads);
-void keccak512_cpu_setBlock(void *data, int len);
-void keccak512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy);
-void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce);
-
-#endif
diff --git a/heavy/cuda_sha256.cu b/heavy/cuda_sha256.cu
index 043422b136..3b63b76e4e 100644
--- a/heavy/cuda_sha256.cu
+++ b/heavy/cuda_sha256.cu
@@ -3,11 +3,11 @@
#include "cuda_helper.h"
-// globaler Speicher für alle HeftyHashes aller Threads
-extern uint32_t *d_heftyHashes[8];
-extern uint32_t *d_nonceVector[8];
+// globaler Speicher für alle HeftyHashes aller Threads
+extern uint32_t *heavy_heftyHashes[8];
+extern uint32_t *heavy_nonceVector[8];
-// globaler Speicher für unsere Ergebnisse
+// globaler Speicher für unsere Ergebnisse
uint32_t *d_hash2output[8];
@@ -47,10 +47,10 @@ template <int BLOCKSIZE> __global__ void sha256_gpu_hash(int threads, uint32_t s
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
- // bestimme den aktuellen Zähler
+ // bestimme den aktuellen Zähler
uint32_t nounce = startNounce + thread;
nonceVector[thread] = nounce;
-
+
// jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory
uint32_t W1[16];
uint32_t W2[16];
@@ -66,10 +66,10 @@ template <int BLOCKSIZE> __global__ void sha256_gpu_hash(int threads, uint32_t s
regs[k] = sha256_gpu_register[k];
hash[k] = regs[k];
}
-
+
// 2. Runde
- //memcpy(W, &sha256_gpu_blockHeader[0], sizeof(uint32_t) * 16); // TODO: aufsplitten in zwei Teilblöcke
- //memcpy(&W[5], &heftyHashes[8 * (blockDim.x * blockIdx.x + threadIdx.x)], sizeof(uint32_t) * 8); // den richtigen Hefty1 Hash holen
+ //memcpy(W, &sha256_gpu_blockHeader[0], sizeof(uint32_t) * 16); // TODO: aufsplitten in zwei Teilblöcke
+ //memcpy(&W[5], &heftyHashes[8 * (blockDim.x * blockIdx.x + threadIdx.x)], sizeof(uint32_t) * 8); // den richtigen Hefty1 Hash holen
#pragma unroll 16
for(int k=0;k<16;k++)
W1[k] = sha256_gpu_blockHeader[k];
@@ -90,7 +90,7 @@ template <int BLOCKSIZE> __global__ void sha256_gpu_hash(int threads, uint32_t s
uint32_t T1, T2;
T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_gpu_constantTable[j] + W1[j];
T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
-
+
#pragma unroll 7
for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
regs[0] = T1 + T2;
@@ -121,7 +121,7 @@ template <int BLOCKSIZE> __global__ void sha256_gpu_hash(int threads, uint32_t s
uint32_t T1, T2;
T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_gpu_constantTable[j + 16 * (k+1)] + W2[j];
T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
-
+
#pragma unroll 7
for (int l=6; l >= 0; l--) regs[l+1] = regs[l];
regs[0] = T1 + T2;
@@ -136,14 +136,14 @@ template <int BLOCKSIZE> __global__ void sha256_gpu_hash(int threads, uint32_t s
/*
for(int j=16;j<64;j++)
W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16];
-
+
#pragma unroll 64
for(int j=0;j<64;j++)
{
uint32_t T1, T2;
T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_gpu_constantTable[j] + W[j];
T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
-
+
#pragma unroll 7
for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
regs[0] = T1 + T2;
@@ -168,7 +168,7 @@ __host__ void sha256_cpu_init(int thr_id, int threads)
sha256_cpu_constantTable,
sizeof(uint32_t) * 64 );
- // Speicher für alle Ergebnisse belegen
+ // Speicher für alle Ergebnisse belegen
cudaMalloc(&d_hash2output[thr_id], 8 * sizeof(uint32_t) * threads);
}
@@ -184,25 +184,25 @@ __host__ void sha256_cpu_setBlock(void *data, int len)
memset(msgBlock, 0, sizeof(uint32_t) * 32);
memcpy(&msgBlock[0], data, len);
if (len == 84) {
- memset(&msgBlock[21], 0, 32); // vorläufig Nullen anstatt der Hefty1 Hashes einfüllen
+ memset(&msgBlock[21], 0, 32); // vorläufig Nullen anstatt der Hefty1 Hashes einfüllen
msgBlock[29] |= 0x80;
msgBlock[31] = 928; // bitlen
} else if (len == 80) {
- memset(&msgBlock[20], 0, 32); // vorläufig Nullen anstatt der Hefty1 Hashes einfüllen
+ memset(&msgBlock[20], 0, 32); // vorläufig Nullen anstatt der Hefty1 Hashes einfüllen
msgBlock[28] |= 0x80;
msgBlock[31] = 896; // bitlen
}
-
+
for(int i=0;i<31;i++) // Byteorder drehen
msgBlock[i] = SWAB32(msgBlock[i]);
- // die erste Runde wird auf der CPU durchgeführt, da diese für
+ // die erste Runde wird auf der CPU durchgeführt, da diese für
// alle Threads gleich ist. Der Hash wird dann an die Threads
- // übergeben
+ // übergeben
uint32_t W[64];
// Erstelle expandierten Block W
- memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16);
+ memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16);
for(int j=16;j<64;j++)
W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16];
@@ -223,7 +223,7 @@ __host__ void sha256_cpu_setBlock(void *data, int len)
uint32_t T1, T2;
T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_cpu_constantTable[j] + W[j];
T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
-
+
//#pragma unroll 7
for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
// sollte mal noch durch memmov ersetzt werden!
@@ -251,7 +251,8 @@ __host__ void sha256_cpu_setBlock(void *data, int len)
__host__ void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy)
{
// Hefty1 Hashes kopieren
- if (copy) cudaMemcpy( d_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice );
+ if (copy)
+ CUDA_SAFE_CALL(cudaMemcpy(heavy_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice));
//else cudaThreadSynchronize();
}
@@ -263,12 +264,12 @@ __host__ void sha256_cpu_hash(int thr_id, int threads, int startNounce)
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
- // Größe des dynamischen Shared Memory Bereichs
+ // Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
if (BLOCKSIZE == 84)
- sha256_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash2output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
+ sha256_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash2output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
else if (BLOCKSIZE == 80) {
- sha256_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash2output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
+ sha256_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash2output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
}
}
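
sha256_cpu_setBlock (and hefty_cpu_setBlock above) flips the byte order of the message words with SWAB32 before expanding the first block on the CPU. SWAB32 is provided by cuda_helper.h and not shown in this patch; a plain sketch of the 32-bit byte swap it is assumed to perform:

    #include <stdint.h>

    // Assumed behaviour of SWAB32: swap the four bytes of a 32-bit word.
    static inline uint32_t swab32_sketch(uint32_t x)
    {
        return  (x << 24)               |
               ((x <<  8) & 0x00ff0000) |
               ((x >>  8) & 0x0000ff00) |
                (x >> 24);
    }
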
diff --git a/heavy/cuda_sha256.h b/heavy/cuda_sha256.h
deleted file mode 100644
index 03385d125a..0000000000
--- a/heavy/cuda_sha256.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef _CUDA_SHA256_H
-#define _CUDA_SHA256_H
-
-void sha256_cpu_init(int thr_id, int threads);
-void sha256_cpu_setBlock(void *data, int len);
-void sha256_cpu_hash(int thr_id, int threads, int startNounce);
-void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy);
-#endif
diff --git a/heavy/heavy.cu b/heavy/heavy.cu
index f3891e8467..eb9a03eb5d 100644
--- a/heavy/heavy.cu
+++ b/heavy/heavy.cu
@@ -1,35 +1,19 @@
#include
-#include
-#include
-
-#include