Skip to content
This repository has been archived by the owner on Sep 15, 2022. It is now read-only.

Commit

Permalink
Rework initialization so devices can be initialized in parallel.
Browse files Browse the repository at this point in the history
  • Loading branch information
Johan Gustafsson committed May 20, 2018
1 parent d8b5f01 commit 6b9d8e6
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 17 deletions.
54 changes: 43 additions & 11 deletions Dispatcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ static void printResult(cl_ulong4 seed, cl_ulong round, result r, cl_uchar score
const std::string strPublic = toHex(r.foundHash, 20);

// Print
std::cout << "Time: " << std::setw(5) << seconds << "s Score: " << std::setw(2) << (int) score << " Private: 0x" << strPrivate << " Public: 0x" << strPublic << std::endl;
std::cout << " Time: " << std::setw(5) << seconds << "s Score: " << std::setw(2) << (int) score << " Private: 0x" << strPrivate << " Public: 0x" << strPublic << std::endl;
}

Dispatcher::OpenCLException::OpenCLException(const std::string s, const cl_int res) :
Expand Down Expand Up @@ -106,10 +106,11 @@ Dispatcher::Device::Device(Dispatcher & parent, cl_context & clContext, cl_progr
m_memResult(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, 40),
m_memData1(clContext, m_clQueue, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, 20),
m_memData2(clContext, m_clQueue, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, 20),
m_speed(PROFANITY_SPEEDSAMPLES),
m_clSeed(createSeed()),
m_seeded(false),
m_round(0)
m_round(0),
m_speed(PROFANITY_SPEEDSAMPLES),
m_sizeInitialized(0),
m_eventFinished(NULL)
{

}
Expand Down Expand Up @@ -140,7 +141,13 @@ void Dispatcher::run() {
m_countRunning = m_vDevices.size();
timeStart = std::chrono::steady_clock::now();

std::cout << "Running" << std::endl;
std::cout << "Running..." << std::endl;
std::cout << " Always verify that a private key generated by this program corresponds to the" << std::endl;
std::cout << " public key printed by importing it to a wallet of your choice. This program" << std::endl;
std::cout << " like any software might contain bugs and it does by design cut corners to" << std::endl;
std::cout << " improve overall performance." << std::endl;
std::cout << std::endl;

for (auto it = m_vDevices.begin(); it != m_vDevices.end(); ++it) {
dispatch(*(*it));
}
Expand All @@ -157,25 +164,30 @@ void Dispatcher::init() {
std::cout << " initialization time (and memory footprint) I suggest lowering the" << std::endl;
std::cout << " inverse-multiple first. You can do this via the -I switch. Do note that" << std::endl;
std::cout << " this might negatively impact your performance." << std::endl;
std::cout << std::endl;

const auto deviceCount = m_vDevices.size();

cl_event * const pInitEvents = new cl_event[deviceCount];

for (size_t i = 0; i < deviceCount; ++i) {
pInitEvents[i] = clCreateUserEvent(m_clContext, NULL);
init(*m_vDevices[i], pInitEvents[i]);
m_vDevices[i]->m_eventFinished = pInitEvents[i];
initBegin(*m_vDevices[i]);
}

clWaitForEvents(deviceCount, pInitEvents);
for (size_t i = 0; i < deviceCount; ++i) {
m_vDevices[i]->m_eventFinished = NULL;
clReleaseEvent(pInitEvents[i]);
}

delete[] pInitEvents;

std::cout << std::endl;
}

void Dispatcher::init(Device & d, cl_event & event) {
void Dispatcher::initBegin(Device & d) {
// Set mode data
for (auto i = 0; i < 20; ++i) {
d.m_memData1[i] = m_mode.data1[i];
Expand Down Expand Up @@ -214,10 +226,28 @@ void Dispatcher::init(Device & d, cl_event & event) {
CLMemory<cl_uchar>::setKernelArg(d.m_kernelScore, 4, d.m_clScoreMax); // Updated in handleResult()

// Seed device
enqueueKernelDevice(d, d.m_kernelBegin, m_size, true);
initContinue(d);
}

// Mark completion
clSetUserEventStatus(event, CL_COMPLETE);
/**
 * Advances the chunked initialization of a single device by one step.
 *
 * While part of the m_size work items is still uninitialized, the next chunk
 * (at most m_worksizeMax items) of d.m_kernelBegin is enqueued starting at
 * offset d.m_sizeInitialized, and staticCallback is registered on the
 * resulting event with &d as user data. Once nothing is left, a status line
 * is printed and d.m_eventFinished is marked CL_COMPLETE.
 *
 * This function is re-entered from Dispatcher::onEvent while
 * d.m_eventFinished is non-NULL, i.e. potentially from an OpenCL callback.
 *
 * @param d Device whose initialization is advanced.
 *
 * Errors reported by the OpenCL calls are passed to
 * OpenCLException::throwIfError.
 */
void Dispatcher::initContinue(Device & d) {
	const size_t sizeLeft = m_size - d.m_sizeInitialized;

	if (sizeLeft) {
		cl_event event;

		// Snapshot the offset so the progress counter can be advanced before the
		// completion callback gets any chance to run.
		const size_t sizeOffset = d.m_sizeInitialized;
		const size_t sizeRun = std::min(sizeLeft, m_worksizeMax);
		const auto resEnqueue = clEnqueueNDRangeKernel(d.m_clQueue, d.m_kernelBegin, 1, &sizeOffset, &sizeRun, NULL, 0, NULL, &event);
		OpenCLException::throwIfError("kernel queueing failed during initilization", resEnqueue);

		// Must happen BEFORE the callback is registered: the callback may fire
		// asynchronously (even before clSetEventCallback returns) and re-enter this
		// function, which would otherwise read a stale offset and enqueue the very
		// same range a second time.
		d.m_sizeInitialized += sizeRun;

		// Commands enqueued from within an event callback are not required to begin
		// executing until the queue is flushed, and nothing else flushes this queue
		// while the host waits on the user event. Flush explicitly to avoid a stall.
		const auto resFlush = clFlush(d.m_clQueue);
		OpenCLException::throwIfError("failed to flush command queue during initialization", resFlush);

		// NOTE(review): ownership of `event` is presumably taken over by
		// staticCallback (not visible here) - confirm it releases the event.
		const auto resCallback = clSetEventCallback(event, CL_COMPLETE, staticCallback, &d);
		OpenCLException::throwIfError("failed to set custom callback during initialization", resCallback);
	} else {
		// Printing one whole string at once helps in avoiding garbled output when executed in parallel
		const std::string strOutput = "  GPU" + toString(d.m_index) + " initialized";
		std::cout << strOutput << std::endl;
		clSetUserEventStatus(d.m_eventFinished, CL_COMPLETE);
	}
}

void Dispatcher::enqueueKernel(cl_command_queue & clQueue, cl_kernel & clKernel, size_t worksizeGlobal, const size_t worksizeLocal, const bool bOneAtATime = false) {
Expand Down Expand Up @@ -302,7 +332,9 @@ void Dispatcher::onEvent(cl_event event, cl_int status, Device & d) {
if (status != CL_COMPLETE) {
std::cout << "Dispatcher::onEvent - Got bad status: " << status << std::endl;
}
else {
else if (d.m_eventFinished != NULL) {
initContinue(d);
} else {
handleResult(d);

bool bDispatch = true;
Expand Down
16 changes: 10 additions & 6 deletions Dispatcher.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,20 +51,22 @@ class Dispatcher {
CLMemory<point> m_memPrecomp;
CLMemory<point> m_memPoints;
CLMemory<mp_number> m_memInverse;

CLMemory<result> m_memResult;

// Data parameters used in some modes
CLMemory<cl_uchar> m_memData1;
CLMemory<cl_uchar> m_memData2;

// Seed and round information
cl_ulong4 m_clSeed;
cl_ulong m_round;

// Speed sampling
SpeedSample m_speed;

cl_ulong4 m_clSeed;
bool m_seeded;

cl_ulong m_round;
// Initialization
size_t m_sizeInitialized;
cl_event m_eventFinished;
};

public:
Expand All @@ -76,7 +78,9 @@ class Dispatcher {

private:
void init();
void init(Device & d, cl_event & event);
void initBegin(Device & d);
void initContinue(Device & d);

void dispatch(Device & d);
void enqueueKernel(cl_command_queue & clQueue, cl_kernel & clKernel, size_t worksizeGlobal, const size_t worksizeLocal, const bool bSynchronous);
void enqueueKernelDevice(Device & d, cl_kernel & clKernel, size_t worksizeGlobal, const bool bSynchronous);
Expand Down

0 comments on commit 6b9d8e6

Please sign in to comment.