From 6b9d8e6d542b0d8be4388657297379cb34695400 Mon Sep 17 00:00:00 2001 From: Johan Gustafsson Date: Mon, 21 May 2018 00:23:31 +0200 Subject: [PATCH] Rework initialization so devices can be initialized in parallel. --- Dispatcher.cpp | 54 ++++++++++++++++++++++++++++++++++++++++---------- Dispatcher.hpp | 16 +++++++++------ 2 files changed, 53 insertions(+), 17 deletions(-) diff --git a/Dispatcher.cpp b/Dispatcher.cpp index 81bb4ca..c19a4d3 100755 --- a/Dispatcher.cpp +++ b/Dispatcher.cpp @@ -48,7 +48,7 @@ static void printResult(cl_ulong4 seed, cl_ulong round, result r, cl_uchar score const std::string strPublic = toHex(r.foundHash, 20); // Print - std::cout << "Time: " << std::setw(5) << seconds << "s Score: " << std::setw(2) << (int) score << " Private: 0x" << strPrivate << " Public: 0x" << strPublic << std::endl; + std::cout << " Time: " << std::setw(5) << seconds << "s Score: " << std::setw(2) << (int) score << " Private: 0x" << strPrivate << " Public: 0x" << strPublic << std::endl; } Dispatcher::OpenCLException::OpenCLException(const std::string s, const cl_int res) : @@ -106,10 +106,11 @@ Dispatcher::Device::Device(Dispatcher & parent, cl_context & clContext, cl_progr m_memResult(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, 40), m_memData1(clContext, m_clQueue, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, 20), m_memData2(clContext, m_clQueue, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, 20), - m_speed(PROFANITY_SPEEDSAMPLES), m_clSeed(createSeed()), - m_seeded(false), - m_round(0) + m_round(0), + m_speed(PROFANITY_SPEEDSAMPLES), + m_sizeInitialized(0), + m_eventFinished(NULL) { } @@ -140,7 +141,13 @@ void Dispatcher::run() { m_countRunning = m_vDevices.size(); timeStart = std::chrono::steady_clock::now(); - std::cout << "Running" << std::endl; + std::cout << "Running..." 
<< std::endl; + std::cout << " Always verify that a private key generated by this program corresponds to the" << std::endl; + std::cout << " public key printed by importing it to a wallet of your choice. This program" << std::endl; + std::cout << " like any software might contain bugs and it does by design cut corners to" << std::endl; + std::cout << " improve overall performance." << std::endl; + std::cout << std::endl; + for (auto it = m_vDevices.begin(); it != m_vDevices.end(); ++it) { dispatch(*(*it)); } @@ -157,6 +164,7 @@ void Dispatcher::init() { std::cout << " initialization time (and memory footprint) I suggest lowering the" << std::endl; std::cout << " inverse-multiple first. You can do this via the -I switch. Do note that" << std::endl; std::cout << " this might negatively impact your performance." << std::endl; + std::cout << std::endl; const auto deviceCount = m_vDevices.size(); @@ -164,18 +172,22 @@ void Dispatcher::init() { for (size_t i = 0; i < deviceCount; ++i) { pInitEvents[i] = clCreateUserEvent(m_clContext, NULL); - init(*m_vDevices[i], pInitEvents[i]); + m_vDevices[i]->m_eventFinished = pInitEvents[i]; + initBegin(*m_vDevices[i]); } clWaitForEvents(deviceCount, pInitEvents); for (size_t i = 0; i < deviceCount; ++i) { + m_vDevices[i]->m_eventFinished = NULL; clReleaseEvent(pInitEvents[i]); } delete[] pInitEvents; + + std::cout << std::endl; } -void Dispatcher::init(Device & d, cl_event & event) { +void Dispatcher::initBegin(Device & d) { // Set mode data for (auto i = 0; i < 20; ++i) { d.m_memData1[i] = m_mode.data1[i]; @@ -214,10 +226,28 @@ void Dispatcher::init(Device & d, cl_event & event) { CLMemory::setKernelArg(d.m_kernelScore, 4, d.m_clScoreMax); // Updated in handleResult() // Seed device - enqueueKernelDevice(d, d.m_kernelBegin, m_size, true); + initContinue(d); +} - // Mark completion - clSetUserEventStatus(event, CL_COMPLETE); +void Dispatcher::initContinue(Device & d) { + size_t sizeLeft = m_size - d.m_sizeInitialized; + + if 
(sizeLeft) { + cl_event event; + const size_t sizeRun = std::min(sizeLeft, m_worksizeMax); + const auto resEnqueue = clEnqueueNDRangeKernel(d.m_clQueue, d.m_kernelBegin, 1, &d.m_sizeInitialized, &sizeRun, NULL, 0, NULL, &event); + OpenCLException::throwIfError("kernel queueing failed during initilization", resEnqueue); + + const auto resCallback = clSetEventCallback(event, CL_COMPLETE, staticCallback, &d); + OpenCLException::throwIfError("failed to set custom callback during initialization", resCallback); + + d.m_sizeInitialized += sizeRun; + } else { + // Printing one whole string at once helps in avoiding garbled output when executed in parallell + const std::string strOutput = " GPU" + toString(d.m_index) + " initialized"; + std::cout << strOutput << std::endl; + clSetUserEventStatus(d.m_eventFinished, CL_COMPLETE); + } } void Dispatcher::enqueueKernel(cl_command_queue & clQueue, cl_kernel & clKernel, size_t worksizeGlobal, const size_t worksizeLocal, const bool bOneAtATime = false) { @@ -302,7 +332,9 @@ void Dispatcher::onEvent(cl_event event, cl_int status, Device & d) { if (status != CL_COMPLETE) { std::cout << "Dispatcher::onEvent - Got bad status: " << status << std::endl; } - else { + else if (d.m_eventFinished != NULL) { + initContinue(d); + } else { handleResult(d); bool bDispatch = true; diff --git a/Dispatcher.hpp b/Dispatcher.hpp index 3cb562f..43a58b0 100755 --- a/Dispatcher.hpp +++ b/Dispatcher.hpp @@ -51,20 +51,22 @@ class Dispatcher { CLMemory m_memPrecomp; CLMemory m_memPoints; CLMemory m_memInverse; - CLMemory m_memResult; // Data parameters used in some modes CLMemory m_memData1; CLMemory m_memData2; + // Seed and round information + cl_ulong4 m_clSeed; + cl_ulong m_round; + // Speed sampling SpeedSample m_speed; - cl_ulong4 m_clSeed; - bool m_seeded; - - cl_ulong m_round; + // Initialization + size_t m_sizeInitialized; + cl_event m_eventFinished; }; public: @@ -76,7 +78,9 @@ class Dispatcher { private: void init(); - void init(Device & d, 
cl_event & event); + void initBegin(Device & d); + void initContinue(Device & d); + void dispatch(Device & d); void enqueueKernel(cl_command_queue & clQueue, cl_kernel & clKernel, size_t worksizeGlobal, const size_t worksizeLocal, const bool bSynchronous); void enqueueKernelDevice(Device & d, cl_kernel & clKernel, size_t worksizeGlobal, const bool bSynchronous);