Rework initialization so devices can be initialized in parallel.

johguse · May 20, 2018 · 6b9d8e6 · 6b9d8e6
1 parent d8b5f01
commit 6b9d8e6
Show file tree

Hide file tree

Showing 2 changed files with 53 additions and 17 deletions.
diff --git a/Dispatcher.cpp b/Dispatcher.cpp
@@ -48,7 +48,7 @@ static void printResult(cl_ulong4 seed, cl_ulong round, result r, cl_uchar score
 	const std::string strPublic = toHex(r.foundHash, 20);
 
 	// Print
-	std::cout << "Time: " << std::setw(5) << seconds << "s Score: " << std::setw(2) << (int) score << " Private: 0x" << strPrivate << " Public: 0x" << strPublic << std::endl;
+	std::cout << "  Time: " << std::setw(5) << seconds << "s Score: " << std::setw(2) << (int) score << " Private: 0x" << strPrivate << " Public: 0x" << strPublic << std::endl;
 }
 
 Dispatcher::OpenCLException::OpenCLException(const std::string s, const cl_int res) :
@@ -106,10 +106,11 @@ Dispatcher::Device::Device(Dispatcher & parent, cl_context & clContext, cl_progr
 	m_memResult(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, 40),
 	m_memData1(clContext, m_clQueue, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, 20),
 	m_memData2(clContext, m_clQueue, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, 20),
-	m_speed(PROFANITY_SPEEDSAMPLES),
 	m_clSeed(createSeed()),
-	m_seeded(false),
-	m_round(0)
+	m_round(0),
+	m_speed(PROFANITY_SPEEDSAMPLES),
+	m_sizeInitialized(0),
+	m_eventFinished(NULL)
 {
 
 }
@@ -140,7 +141,13 @@ void Dispatcher::run() {
 	m_countRunning = m_vDevices.size();
 	timeStart = std::chrono::steady_clock::now();
 
-	std::cout << "Running" << std::endl;
+	std::cout << "Running..." << std::endl;
+	std::cout << "  Always verify that a private key generated by this program corresponds to the" << std::endl;
+	std::cout << "  public key printed by importing it to a wallet of your choice. This program" << std::endl;
+	std::cout << "  like any software might contain bugs and it does by design cut corners to" << std::endl;
+	std::cout << "  improve overall performance." << std::endl;
+	std::cout << std::endl;
+
 	for (auto it = m_vDevices.begin(); it != m_vDevices.end(); ++it) {
 		dispatch(*(*it));
 	}
@@ -157,25 +164,30 @@ void Dispatcher::init() {
 	std::cout << "  initialization time (and memory footprint) I suggest lowering the" << std::endl;
 	std::cout << "  inverse-multiple first. You can do this via the -I switch. Do note that" << std::endl;
 	std::cout << "  this might negatively impact your performance." << std::endl;
+	std::cout << std::endl;
 
 	const auto deviceCount = m_vDevices.size();
 
 	cl_event * const pInitEvents = new cl_event[deviceCount];
 
 	for (size_t i = 0; i < deviceCount; ++i) {
 		pInitEvents[i] = clCreateUserEvent(m_clContext, NULL);
-		init(*m_vDevices[i], pInitEvents[i]);
+		m_vDevices[i]->m_eventFinished = pInitEvents[i];
+		initBegin(*m_vDevices[i]);
 	}
 
 	clWaitForEvents(deviceCount, pInitEvents);
 	for (size_t i = 0; i < deviceCount; ++i) {
+		m_vDevices[i]->m_eventFinished = NULL;
 		clReleaseEvent(pInitEvents[i]);
 	}
 
 	delete[] pInitEvents;
+
+	std::cout << std::endl;
 }
 
-void Dispatcher::init(Device & d, cl_event & event) {
+void Dispatcher::initBegin(Device & d) {
 	// Set mode data
 	for (auto i = 0; i < 20; ++i) {
 		d.m_memData1[i] = m_mode.data1[i];
@@ -214,10 +226,28 @@ void Dispatcher::init(Device & d, cl_event & event) {
 	CLMemory<cl_uchar>::setKernelArg(d.m_kernelScore, 4, d.m_clScoreMax); // Updated in handleResult()
 
 	// Seed device
-	enqueueKernelDevice(d, d.m_kernelBegin, m_size, true);
+	initContinue(d);
+}
 
-	// Mark completion
-	clSetUserEventStatus(event, CL_COMPLETE);
+// Enqueues the next chunk of the begin kernel on device d, or signals that
+// initialization of d has finished.
+//
+// Each call covers at most m_worksizeMax work items, starting at global offset
+// d.m_sizeInitialized. A completion callback (staticCallback, with &d as user
+// data) is attached to the chunk's event; onEvent() calls this function again
+// for as long as d.m_eventFinished is set, so chunks are chained until m_size
+// work items have been covered. When nothing is left, a status line is printed
+// and d.m_eventFinished is set to CL_COMPLETE.
+//
+// Errors from enqueueing, callback registration or flushing are reported
+// through OpenCLException::throwIfError.
+void Dispatcher::initContinue(Device & d) {
+	const size_t sizeLeft = m_size - d.m_sizeInitialized;
+
+	if (sizeLeft) {
+		cl_event event;
+		// Snapshot the offset so the progress counter can be advanced before the callback is armed
+		const size_t sizeOffset = d.m_sizeInitialized;
+		const size_t sizeRun = std::min(sizeLeft, m_worksizeMax);
+		const auto resEnqueue = clEnqueueNDRangeKernel(d.m_clQueue, d.m_kernelBegin, 1, &sizeOffset, &sizeRun, NULL, 0, NULL, &event);
+		OpenCLException::throwIfError("kernel queueing failed during initilization", resEnqueue);
+
+		// Advance progress BEFORE registering the callback. The callback may fire (possibly on
+		// another thread) as soon as it is registered and re-enters this function; it must not
+		// observe the old offset or the same range would be enqueued twice.
+		// NOTE(review): no lock is visible here - confirm whether additional synchronization is needed.
+		d.m_sizeInitialized += sizeRun;
+
+		const auto resCallback = clSetEventCallback(event, CL_COMPLETE, staticCallback, &d);
+		OpenCLException::throwIfError("failed to set custom callback during initialization", resCallback);
+
+		// The caller only waits on user events, which does not submit this queue. Flush explicitly
+		// so the enqueued chunk is guaranteed to be issued to the device.
+		const auto resFlush = clFlush(d.m_clQueue);
+		OpenCLException::throwIfError("failed to flush queue during initialization", resFlush);
+	} else {
+		// Printing one whole string at once helps in avoiding garbled output when executed in parallel
+		const std::string strOutput = "  GPU" + toString(d.m_index) + " initialized";
+		std::cout << strOutput << std::endl;
+		clSetUserEventStatus(d.m_eventFinished, CL_COMPLETE);
+	}
 }
 
 void Dispatcher::enqueueKernel(cl_command_queue & clQueue, cl_kernel & clKernel, size_t worksizeGlobal, const size_t worksizeLocal, const bool bOneAtATime = false) {
@@ -302,7 +332,9 @@ void Dispatcher::onEvent(cl_event event, cl_int status, Device & d) {
 	if (status != CL_COMPLETE) {
 		std::cout << "Dispatcher::onEvent - Got bad status: " << status << std::endl;
 	}
-	else {
+	else if (d.m_eventFinished != NULL) {
+		initContinue(d);
+	} else {
 		handleResult(d);
 
 		bool bDispatch = true;

diff --git a/Dispatcher.hpp b/Dispatcher.hpp
@@ -51,20 +51,22 @@ class Dispatcher {
 			CLMemory<point> m_memPrecomp;
 			CLMemory<point> m_memPoints;
 			CLMemory<mp_number> m_memInverse;
-
 			CLMemory<result> m_memResult;
 
 			// Data parameters used in some modes
 			CLMemory<cl_uchar> m_memData1;
 			CLMemory<cl_uchar> m_memData2;
 
+			// Seed and round information
+			cl_ulong4 m_clSeed;
+			cl_ulong m_round;
+
 			// Speed sampling
 			SpeedSample m_speed;
 
-			cl_ulong4 m_clSeed;
-			bool m_seeded;
-
-			cl_ulong m_round;
+			// Initialization
+			size_t m_sizeInitialized;
+			cl_event m_eventFinished;
 		};
 
 	public:
@@ -76,7 +78,9 @@ class Dispatcher {
 
 	private:
 		void init();
-		void init(Device & d, cl_event & event);
+		void initBegin(Device & d);
+		void initContinue(Device & d);
+
 		void dispatch(Device & d);
 		void enqueueKernel(cl_command_queue & clQueue, cl_kernel & clKernel, size_t worksizeGlobal, const size_t worksizeLocal, const bool bSynchronous);
 		void enqueueKernelDevice(Device & d, cl_kernel & clKernel, size_t worksizeGlobal, const bool bSynchronous);