Skip to content
This repository has been archived by the owner on Apr 8, 2024. It is now read-only.

Fixed lookup buffer size calculation for Vulkan implementation. #99

Open
wants to merge 10 commits into
base: develop
Choose a base branch
from
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,7 @@ Usage:
--pow-diff or -d <0-256> count of leading zero bits in target D value [default - 16]
--srand-seed or -ss <unsigned int> set srand seed value for POW test: 0 - use zero id/seed [default], -1 - use random value
--solution-idx or -si <unsigned int> set solution index for POW test: index will be compared to be the found solution for Pow [default - unset]
-N <scrypt N> set scrypt parameter N [default - 512]
```

## Mixing CUDA and Vulkan
Expand Down
22 changes: 7 additions & 15 deletions src/vulkan/driver-vulkan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ static _vulkanState *initVulkan(struct cgpu_info *cgpu, char *name, size_t nameS
{
_vulkanState *state = (_vulkanState *)calloc(1, sizeof(_vulkanState));

uint32_t scrypt_mem = 128 * cgpu->r;
uint32_t scrypt_mem = 128 * cgpu->r * cgpu->N;

uint32_t computeQueueFamilyIndex = getComputeQueueFamilyIndex(cgpu->driver_id);
if (computeQueueFamilyIndex < 0) {
Expand All @@ -135,32 +135,24 @@ static _vulkanState *initVulkan(struct cgpu_info *cgpu, char *name, size_t nameS
gVulkan.vkFreeMemory(state->vkDevice, tmpMem, NULL);

cgpu->work_size = 64;

applog(LOG_NOTICE, "GPU %d: selecting lookup gap of 4", cgpu->driver_id);
cgpu->lookup_gap = 4;

unsigned int bsize = 1024;
size_t ipt = (bsize / cgpu->lookup_gap + (bsize % cgpu->lookup_gap > 0));
size_t ipt = scrypt_mem / cgpu->lookup_gap;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you please add comments that document the logic behind the buffer/memory size calculations in this function and explain what the variables mean (for example, what "ipt" stands for)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Scrypt memory size in bytes: scrypt_mem = 128 * r * N
A lookup gap reduces memory usage at the cost of extra computation. With a lookup gap, the memory used per thread is:
ipt = scrypt_mem / lookup_gap
and the maximum number of concurrent threads is: max concurrent threads = allocated_buffer_size / ipt


if (!cgpu->buffer_size) {
unsigned int base_alloc = (int)(cgpu->gpu_max_alloc * 88 / 100 / 1024 / 1024 / 8) * 8 * 1024 * 1024;
cgpu->thread_concurrency = (uint32_t)(base_alloc / scrypt_mem / ipt);
unsigned int base_alloc = (int)(cgpu->gpu_max_alloc * 92 / 100 / 1024 / 1024 / 8) * 8 * 1024 * 1024;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why use 92% of max memory?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A value of 88% is generally accepted, but it is a legacy of cards with 4 GB of memory or less. Using 92% improves performance and is acceptable as long as there are no memory allocation errors. In fact, for cards with more than 4 GB of memory, a value of 100% is acceptable.

cgpu->thread_concurrency = (uint32_t)(base_alloc / ipt);
cgpu->buffer_size = base_alloc / 1024 / 1024;
applog(LOG_DEBUG, "88%% Max Allocation: %u", base_alloc);
applog(LOG_DEBUG, "92%% Max Allocation: %u", base_alloc);
applog(LOG_NOTICE, "GPU %d: selecting buffer_size of %zu", cgpu->driver_id, cgpu->buffer_size);
}

if (cgpu->buffer_size) {
// use the buffer-size to overwrite the thread-concurrency
cgpu->thread_concurrency = (int)((cgpu->buffer_size * 1024 * 1024) / ipt / scrypt_mem);
}

cgpu->thread_concurrency = min(cgpu->thread_concurrency, /*cgpu->work_size*/ 32 * 1024);
cgpu->thread_concurrency = min(cgpu->thread_concurrency, 32 * 1024);
uint32_t chunkSize = copy_only ? (cgpu->thread_concurrency * 32) : ((cgpu->thread_concurrency * hash_len_bits + 7) / 8);

applog(LOG_DEBUG, "GPU %d: setting thread_concurrency to %d based on buffer size %d and lookup gap %d", cgpu->driver_id, (int)(cgpu->thread_concurrency), (int)(cgpu->buffer_size), (int)(cgpu->lookup_gap));

state->bufSize = alignBuffer(scrypt_mem * ipt * cgpu->thread_concurrency, state->alignment);
state->bufSize = alignBuffer(ipt * cgpu->thread_concurrency, state->alignment);
state->memConstantSize = alignBuffer(sizeof(AlgorithmConstants), state->alignment);
state->memParamsSize = alignBuffer(sizeof(AlgorithmParams), state->alignment);
state->memInputSize = alignBuffer(PREIMAGE_SIZE, state->alignment);
Expand Down
4 changes: 4 additions & 0 deletions src/vulkan/gen/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
include_directories(${CMAKE_CURRENT_BINARY_DIR})

# When building with MSVC, replace the Debug-configuration C++ flags with an
# optimized set: /MD (release DLL runtime), /Zi (debug info), /O2 and /Ob1
# (optimization and inline expansion), /DNDEBUG (defines NDEBUG).
# NOTE(review): this overwrites the default Debug flags rather than appending
# to them; presumably intended so code built from this directory runs at
# release speed in Debug builds. Confirm the intent and the scope.
if (MSVC)
set (CMAKE_CXX_FLAGS_DEBUG "/MD /Zi /O2 /Ob1 /DNDEBUG")
endif()

# A custom command and target to turn the Vulkan kernel into a byte array header

add_custom_command(
Expand Down
13 changes: 2 additions & 11 deletions src/vulkan/gen/scrypt-chacha.comp
Original file line number Diff line number Diff line change
Expand Up @@ -413,8 +413,7 @@ main()
uvec4 X[8];
const uint lid = gl_LocalInvocationID.x;
const uint gid = gl_GlobalInvocationID.x;
uint Nfactor = 0;
uint tmp = N >> 1;
uint tmp;
uvec2 nonce;

scrypt_hmac_state hmac_pw, work;
Expand All @@ -432,14 +431,6 @@ main()
nonce.x = global_work_offset.x + gid;
}

/* Determine the Nfactor */
while ((tmp & 1) == 0) {
tmp >>= 1;
Nfactor++;
}

const uint effective_concurrency = (concurrent_threads << 9) >> Nfactor;

password[0] = buffer0[0];
password[1] = buffer0[1];
password[2] = buffer0[2];
Expand Down Expand Up @@ -555,7 +546,7 @@ main()
}
//////////////////////////////////////////////////
/* 2: X = ROMix(X) */
scrypt_ROMix(X, N, gid, effective_concurrency);
scrypt_ROMix(X, N, gid, concurrent_threads);

/* 3: Out = PBKDF2(password, X) */
//////////////////////////////////////////////////
Expand Down
18 changes: 9 additions & 9 deletions src/vulkan/vulkan-helpers.c
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ void vulkan_library_shutdown()
int getComputeQueueFamilyIndex(uint32_t index)
{
if (index >= gPhysicalDeviceCount) {
applog(LOG_ERR, "Card index %u not found\n", index);
applog(LOG_ERR, "Card index %u not found", index);
return -1;
}
uint32_t queueFamilyPropertiesCount = 0;
Expand Down Expand Up @@ -210,7 +210,7 @@ VkDeviceMemory allocateGPUMemory(int index, VkDevice vkDevice, const VkDeviceSi

VkResult ret = (memoryTypeIndex == VK_MAX_MEMORY_TYPES ? VK_ERROR_OUT_OF_HOST_MEMORY : VK_SUCCESS);
if (ret != VK_SUCCESS) {
applog(LOG_ERR, "Cannot allocated %u kB GPU memory type for GPU index %u\n", (unsigned)(memorySize / 1024), index);
applog(LOG_ERR, "Cannot allocated %u kB GPU memory type for GPU index %u", (unsigned)(memorySize / 1024), index);
return NULL;
}

Expand Down Expand Up @@ -345,7 +345,7 @@ VkPipeline loadShaderFromFile(VkDevice vkDevice, VkPipelineLayout pipelineLayout

FILE *fp = fopen(spirv_file_name, "rb");
if (fp == NULL) {
applog(LOG_ERR, "SPIR-V program %s not found\n", spirv_file_name);
applog(LOG_ERR, "SPIR-V program %s not found", spirv_file_name);
return NULL;
}
fseek(fp, 0, SEEK_END);
Expand All @@ -357,7 +357,7 @@ VkPipeline loadShaderFromFile(VkDevice vkDevice, VkPipelineLayout pipelineLayout
size_t read_size = fread(shader, sizeof(char), shader_size, fp);
if (read_size != shader_size) {
free(shader);
applog(LOG_ERR, "Failed to read shader %s!\n", spirv_file_name);
applog(LOG_ERR, "Failed to read shader %s!", spirv_file_name);
return NULL;
}

Expand Down Expand Up @@ -410,15 +410,15 @@ static uint32_t * getShader(uint32_t workSize, uint32_t labelSize, uint32_t *sha
if (vulkan_shaders_vault_header[0] == labelSize) {
uint32_t *shader = (uint32_t*)calloc(1, vulkan_shaders_vault_header[1]);
if (NULL == shader) {
applog(LOG_ERR, "Failed to allocate shader %u:%u %u\n", workSize, labelSize, vulkan_shaders_vault_header[1]);
applog(LOG_ERR, "Failed to allocate shader %u:%u %u", workSize, labelSize, vulkan_shaders_vault_header[1]);
return NULL;
}
applog(LOG_INFO, "64:%03u %u -> %u\n", vulkan_shaders_vault_header[0], vulkan_shaders_vault_header[2], vulkan_shaders_vault_header[1]);
applog(LOG_INFO, "64:%03u %u -> %u", vulkan_shaders_vault_header[0], vulkan_shaders_vault_header[2], vulkan_shaders_vault_header[1]);
*shader_size = vulkan_shaders_vault_header[1];
uint8_t *src = vulkan_shaders_vault + vulkan_shaders_vault_header[3];
uLongf shaderSize = vulkan_shaders_vault_header[1];
if (Z_OK != uncompress((uint8_t*)shader, &shaderSize, src, vulkan_shaders_vault_header[2])) {
applog(LOG_ERR, "Failed to uncompress shader %u:%u\n", workSize, labelSize);
applog(LOG_ERR, "Failed to uncompress shader %u:%u", workSize, labelSize);
free(shader);
return NULL;
}
Expand All @@ -437,11 +437,11 @@ VkPipeline loadShader(VkDevice vkDevice, VkPipelineLayout pipelineLayout, VkShad
uint32_t *shader = getShader(workSize, labelSize, &shader_size);

if (NULL == shader) {
applog(LOG_ERR, "SPIR-V program %d:%d not found\n", workSize, labelSize);
applog(LOG_ERR, "SPIR-V program %d:%d not found", workSize, labelSize);
return NULL;
}

applog(LOG_INFO, "SPIR-V program %u:%u %u bytes\n", workSize, labelSize, shader_size);
applog(LOG_INFO, "SPIR-V program %u:%u %u bytes", workSize, labelSize, shader_size);

VkShaderModuleCreateInfo shaderModuleCreateInfo = {
VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
Expand Down
1 change: 1 addition & 0 deletions test/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,5 @@ Benchmarking and Testing
--pow-diff or -d <0-256> count of leading zero bits in target D value [default - 16]
--srand-seed or -ss <unsigned int> set srand seed value for POW test: 0 - use zero id/seed [default], -1 - use random value
--solution-idx or -si <unsigned int> set solution index for POW test: index will be compared to be the found solution for Pow [default - unset]
-N <scrypt N> set scrypt parameter N [default - 512]
```
38 changes: 26 additions & 12 deletions test/test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ static uint8_t s_salt[32] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

static const uint8_t zeros[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };

static uint32_t scryptN = 512;

/* find binary substring */
void * memstr(const void *src, size_t length, const uint8_t *token, int token_length)
{
Expand Down Expand Up @@ -117,7 +119,7 @@ void do_benchmark(int aLabelSize, int aLabelsCount)
{
uint64_t hashes_computed;
uint64_t hashes_per_sec;
int status = scryptPositions(providers[i].id, id, 0, aLabelsCount - 1, aLabelSize, salt, SPACEMESH_API_COMPUTE_LEAFS, out, 512, 1, 1, NULL, NULL, &hashes_computed, &hashes_per_sec);
int status = scryptPositions(providers[i].id, id, 0, aLabelsCount - 1, aLabelSize, salt, SPACEMESH_API_COMPUTE_LEAFS, out, scryptN, 1, 1, NULL, NULL, &hashes_computed, &hashes_per_sec);
printf("%s: status %d, %u hashes, %u h/s\n", providers[i].model, status, (uint32_t)hashes_computed, (uint32_t)hashes_per_sec);
}
}
Expand Down Expand Up @@ -167,8 +169,8 @@ void do_test(int aLabelSize, int aLabelsCount, int aReferenceProvider, bool aPri
uint8_t D[32] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
referenceLabels = out + i * labelsBufferAlignedSize;
memset(referenceLabels, 0, labelsBufferSize);
scryptPositions(providers[i].id, id, 0, referenceLabelsCount - 1, aLabelSize, salt, SPACEMESH_API_COMPUTE_LEAFS, referenceLabels, 512, 1, 1, D, &idx_solution, &hashes_computed, &hashes_per_sec);
printf("%s: %u hashes, %u h/s\n", providers[i].model, (uint32_t)hashes_computed, (uint32_t)hashes_per_sec);
int status = scryptPositions(providers[i].id, id, 0, referenceLabelsCount - 1, aLabelSize, salt, SPACEMESH_API_COMPUTE_LEAFS, referenceLabels, scryptN, 1, 1, D, &idx_solution, &hashes_computed, &hashes_per_sec);
printf("%s: %u hashes, %u h/s, status: %d\n", providers[i].model, (uint32_t)hashes_computed, (uint32_t)hashes_per_sec, status);
aReferenceProvider = i;
checkOutput = true;
break;
Expand All @@ -180,8 +182,8 @@ void do_test(int aLabelSize, int aLabelsCount, int aReferenceProvider, bool aPri
uint8_t D[32] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
referenceLabels = out + aReferenceProvider * labelsBufferAlignedSize;
memset(referenceLabels, 0, labelsBufferSize);
scryptPositions(providers[aReferenceProvider].id, id, 0, referenceLabelsCount - 1, aLabelSize, salt, SPACEMESH_API_COMPUTE_LEAFS, referenceLabels, 512, 1, 1, D, &idx_solution, &hashes_computed, &hashes_per_sec);
printf("%s: %u hashes, %u h/s\n", providers[aReferenceProvider].model, (uint32_t)hashes_computed, (uint32_t)hashes_per_sec);
scryptPositions(providers[aReferenceProvider].id, id, 0, referenceLabelsCount - 1, aLabelSize, salt, SPACEMESH_API_COMPUTE_LEAFS, referenceLabels, scryptN, 1, 1, D, &idx_solution, &hashes_computed, &hashes_per_sec);
int status = printf("%s: %u hashes, %u h/s, status %d\n", providers[aReferenceProvider].model, (uint32_t)hashes_computed, (uint32_t)hashes_per_sec, status);
checkOutput = true;
}

Expand All @@ -191,7 +193,7 @@ void do_test(int aLabelSize, int aLabelsCount, int aReferenceProvider, bool aPri
uint8_t D[32] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
uint8_t *labels = out + i * labelsBufferAlignedSize;
memset(labels, 0, labelsBufferSize);
scryptPositions(providers[i].id, id, 0, aLabelsCount - 1, aLabelSize, salt, SPACEMESH_API_COMPUTE_LEAFS, labels, 512, 1, 1, D, &idx_solution, &hashes_computed, &hashes_per_sec);
scryptPositions(providers[i].id, id, 0, aLabelsCount - 1, aLabelSize, salt, SPACEMESH_API_COMPUTE_LEAFS, labels, scryptN, 1, 1, D, &idx_solution, &hashes_computed, &hashes_per_sec);
printf("%s: %u hashes, %u h/s\n", providers[i].model, (uint32_t)hashes_computed, (uint32_t)hashes_per_sec);
if (memstr(labels, labelsBufferSize, zeros, 8)) {
printf("ZEROS result\n");
Expand Down Expand Up @@ -309,7 +311,7 @@ void test_core(int aLabelsCount, unsigned aDiff, unsigned aSeed, int labelSize)

if (idx_solution == -1ull) {
printf("Compute labels and look for a pow solution... Iteration: %d\n", j);
int status = scryptPositions(providers[i].id, id, idx, idx + labels_per_iter - 1, labelSize, salt, SPACEMESH_API_COMPUTE_LEAFS | SPACEMESH_API_COMPUTE_POW, out, 512, 1, 1, D, &idx_solution, &hashes_computed, &hashes_per_sec);
int status = scryptPositions(providers[i].id, id, idx, idx + labels_per_iter - 1, labelSize, salt, SPACEMESH_API_COMPUTE_LEAFS | SPACEMESH_API_COMPUTE_POW, out, scryptN, 1, 1, D, &idx_solution, &hashes_computed, &hashes_per_sec);

if (status != SPACEMESH_API_ERROR_NONE && status != SPACEMESH_API_POW_SOLUTION_FOUND) {
printf("Compute error: %u\n", status);
Expand All @@ -331,7 +333,7 @@ void test_core(int aLabelsCount, unsigned aDiff, unsigned aSeed, int labelSize)
printf("Compute labels only... Iteration: %d\n", j);

uint64_t idx_temp = -1;
int status = scryptPositions(providers[i].id, id, idx, idx + labels_per_iter - 1, labelSize, salt, SPACEMESH_API_COMPUTE_LEAFS, out, 512, 1, 1, D, &idx_temp, &hashes_computed, &hashes_per_sec);
int status = scryptPositions(providers[i].id, id, idx, idx + labels_per_iter - 1, labelSize, salt, SPACEMESH_API_COMPUTE_LEAFS, out, scryptN, 1, 1, D, &idx_temp, &hashes_computed, &hashes_per_sec);

if (status != SPACEMESH_API_ERROR_NONE && status != SPACEMESH_API_POW_SOLUTION_FOUND) {
printf("Compute returned an error: %u", status);
Expand All @@ -351,7 +353,7 @@ void test_core(int aLabelsCount, unsigned aDiff, unsigned aSeed, int labelSize)

printf("Calling pow compute...\n");

int status = scryptPositions(providers[i].id, id, idx, idx + labels_per_iter - 1, labelSize, salt, SPACEMESH_API_COMPUTE_POW, out, 512, 1, 1, D, &idx_solution, &hashes_computed, &hashes_per_sec);
int status = scryptPositions(providers[i].id, id, idx, idx + labels_per_iter - 1, labelSize, salt, SPACEMESH_API_COMPUTE_POW, out, scryptN, 1, 1, D, &idx_solution, &hashes_computed, &hashes_per_sec);

printf("Compute pow only at index: %llu. hashes computed: %llu (%llu h/s)\n", idx, hashes_computed, hashes_per_sec);

Expand Down Expand Up @@ -379,7 +381,7 @@ void test_core(int aLabelsCount, unsigned aDiff, unsigned aSeed, int labelSize)

// compute 256 hash at solution index:
uint8_t hash[32];
scryptPositions(cpu_id, id, idx_solution, idx_solution, 256, salt, SPACEMESH_API_COMPUTE_LEAFS, hash, 512, 1, 1, NULL, NULL, &hashes_computed, &hashes_per_sec);
scryptPositions(cpu_id, id, idx_solution, idx_solution, 256, salt, SPACEMESH_API_COMPUTE_LEAFS, hash, scryptN, 1, 1, NULL, NULL, &hashes_computed, &hashes_per_sec);

printf("D: ");
print_hex32(D);
Expand Down Expand Up @@ -455,14 +457,14 @@ int do_test_pow(uint64_t aStartPos, int aLabelsCount, unsigned aDiff, unsigned a
uint64_t hashes_computed;
uint64_t hashes_per_sec;
printf("%s: ", providers[i].model);
int status = scryptPositions(providers[i].id, s_id, aStartPos, aStartPos + aLabelsCount - 1, 8, s_salt, SPACEMESH_API_COMPUTE_POW, NULL, 512, 1, 1, D, &idx_solution, &hashes_computed, &hashes_per_sec);
int status = scryptPositions(providers[i].id, s_id, aStartPos, aStartPos + aLabelsCount - 1, 8, s_salt, SPACEMESH_API_COMPUTE_POW, NULL, scryptN, 1, 1, D, &idx_solution, &hashes_computed, &hashes_per_sec);
switch (status) {
case SPACEMESH_API_POW_SOLUTION_FOUND:
printf("%u hashes, %u h/s, solution at %u\n", (uint32_t)hashes_computed, (uint32_t)hashes_per_sec, (uint32_t)idx_solution);
if (-1 != cpu_id) {
uint8_t hash[32];
memset(hash, 0, sizeof(hash));
scryptPositions(cpu_id, s_id, idx_solution, idx_solution, 256, s_salt, SPACEMESH_API_COMPUTE_LEAFS, hash, 512, 1, 1, NULL, NULL, &hashes_computed, &hashes_per_sec);
scryptPositions(cpu_id, s_id, idx_solution, idx_solution, 256, s_salt, SPACEMESH_API_COMPUTE_LEAFS, hash, scryptN, 1, 1, NULL, NULL, &hashes_computed, &hashes_per_sec);
printf("id: ");
print_hex32(s_id);
printf("\n");
Expand Down Expand Up @@ -891,6 +893,18 @@ int main(int argc, char **argv)
solutionIdx = strtoull(argv[i], NULL, 10);
}
}
else if (0 == strcmp(argv[i], "-N")) {
i++;
if (i < argc) {
scryptN = strtoul(argv[i], NULL, 10);
}
}
else if (0 == strcmp(argv[i], "--srand-seed") || 0 == strcmp(argv[i], "-ss")) {
i++;
if (i < argc) {
srand_seed = strtoul(argv[i], NULL, 10);
}
}
else if (0 == strcmp(argv[i], "-id")) {
i++;
if (i < argc) {
Expand Down