From e46719dc95be1a402871a2ae3981a4b881e9da9e Mon Sep 17 00:00:00 2001 From: Johan Gustafsson Date: Mon, 12 Mar 2018 22:53:21 +0100 Subject: [PATCH] Improve printed GPU index and fallback to automatic local worksize on error. --- Dispatcher.cpp | 42 +++++++++++++++++++++++++++++++----------- Dispatcher.hpp | 11 ++++++++--- profanity.cpp | 6 +++++- 3 files changed, 44 insertions(+), 15 deletions(-) diff --git a/Dispatcher.cpp b/Dispatcher.cpp index cff98c3..e8ef08b 100755 --- a/Dispatcher.cpp +++ b/Dispatcher.cpp @@ -59,7 +59,8 @@ static void printResult(cl_ulong4 seed, result r, const std::chrono::time_point< } Dispatcher::OpenCLException::OpenCLException(const std::string s, const cl_int res) : - std::runtime_error( s + " (res = " + toString(res) + ")") + std::runtime_error( s + " (res = " + toString(res) + ")"), + m_res(res) { } @@ -80,8 +81,9 @@ cl_kernel Dispatcher::Device::createKernel(cl_program & clProgram, const std::st return ret == NULL ? throw std::runtime_error("failed to create kernel") : ret; } -Dispatcher::Device::Device(Dispatcher & parent, cl_context & clContext, cl_program & clProgram, cl_device_id clDeviceId, const size_t worksizeLocal) : +Dispatcher::Device::Device(Dispatcher & parent, cl_context & clContext, cl_program & clProgram, cl_device_id clDeviceId, const size_t worksizeLocal, const size_t index) : m_parent(parent), + m_index(index), m_clDeviceId(clDeviceId), m_worksizeLocal(worksizeLocal), m_clScoreMax(0), @@ -117,8 +119,8 @@ Dispatcher::~Dispatcher() { } -void Dispatcher::addDevice(cl_device_id clDeviceId, const size_t worksizeLocal) { - Device * pDevice = new Device(*this, m_clContext, m_clProgram, clDeviceId, worksizeLocal); +void Dispatcher::addDevice(cl_device_id clDeviceId, const size_t worksizeLocal, const size_t index) { + Device * pDevice = new Device(*this, m_clContext, m_clProgram, clDeviceId, worksizeLocal, index); m_lDevices.push_back(pDevice); init(*pDevice); } @@ -194,7 +196,8 @@ void Dispatcher::enqueueKernel(cl_command_queue & clQueue, cl_kernel & clKernel, size_t worksizeOffset = 0; while (worksizeGlobal) { const size_t worksizeRun = std::min(worksizeGlobal, worksizeMax); - const auto res = clEnqueueNDRangeKernel(clQueue, clKernel, 1, &worksizeOffset, &worksizeRun, &worksizeLocal, 0, NULL, NULL); + const size_t * const pWorksizeLocal = (worksizeLocal == 0 ? NULL : &worksizeLocal); + const auto res = clEnqueueNDRangeKernel(clQueue, clKernel, 1, &worksizeOffset, &worksizeRun, pWorksizeLocal, 0, NULL, NULL); OpenCLException::throwIfError("kernel queueing failed", res); worksizeGlobal -= worksizeRun; @@ -202,20 +205,37 @@ void Dispatcher::enqueueKernel(cl_command_queue & clQueue, cl_kernel & clKernel, } } +void Dispatcher::enqueueKernelDevice(Device & d, cl_kernel & clKernel, size_t worksizeGlobal) { + try { + enqueueKernel(d.m_clQueue, clKernel, worksizeGlobal, d.m_worksizeLocal); + } + catch ( OpenCLException & e ) { + // If local work size is invalid, abandon it and let implementation decide + if ((e.m_res == CL_INVALID_WORK_GROUP_SIZE || e.m_res == CL_INVALID_WORK_ITEM_SIZE) && d.m_worksizeLocal != 0) { + std::cout << std::endl << "warning: local work size abandoned on GPU" << d.m_index << std::endl; + d.m_worksizeLocal = 0; + enqueueKernel(d.m_clQueue, clKernel, worksizeGlobal, d.m_worksizeLocal); + } + else { + throw; + } + } +} + void Dispatcher::dispatch(Device & d) { // Write new seed randomizeSeed(d); CLMemory::setKernelArg(d.m_kernelBegin, 4, d.m_clSeed); - enqueueKernel(d.m_clQueue, d.m_kernelBegin, 1, d.m_worksizeLocal); + enqueueKernelDevice(d, d.m_kernelBegin, 1); for (auto i = 1; i < PROFANITY_PASSES + 1; ++i) { - enqueueKernel(d.m_clQueue, d.m_kernelInversePre, g_worksizes[i], d.m_worksizeLocal); - enqueueKernel(d.m_clQueue, d.m_kernelInverse, g_worksizes[i] / 255, d.m_worksizeLocal); - enqueueKernel(d.m_clQueue, d.m_kernelInversePost, g_worksizes[i], d.m_worksizeLocal); + enqueueKernelDevice(d, d.m_kernelInversePre, g_worksizes[i]); + enqueueKernelDevice(d, d.m_kernelInverse, g_worksizes[i] / 255); + enqueueKernelDevice(d, d.m_kernelInversePost, g_worksizes[i]); } - enqueueKernel(d.m_clQueue, d.m_kernelEnd, g_worksizes[PROFANITY_PASSES], d.m_worksizeLocal); + enqueueKernelDevice(d, d.m_kernelEnd, g_worksizes[PROFANITY_PASSES]); cl_event event; d.m_memResult.read(false, &event); @@ -297,7 +317,7 @@ void Dispatcher::printSpeed() { for (auto & e : m_lDevices) { const auto curSpeed = e->m_speed.getSpeed(); speedTotal += curSpeed; - strGPUs += " GPU" + toString(i) + ": " + formatSpeed(curSpeed); + strGPUs += " GPU" + toString(e->m_index) + ": " + formatSpeed(curSpeed); ++i; } diff --git a/Dispatcher.hpp b/Dispatcher.hpp index aa10cfa..15e4ebd 100755 --- a/Dispatcher.hpp +++ b/Dispatcher.hpp @@ -22,19 +22,22 @@ class Dispatcher { OpenCLException(const std::string s, const cl_int res); static void throwIfError(const std::string s, const cl_int res); + + const cl_int m_res; }; struct Device { static cl_command_queue createQueue(cl_context & clContext, cl_device_id & clDeviceId); static cl_kernel createKernel(cl_program & clProgram, const std::string s); - Device(Dispatcher & parent, cl_context & clContext, cl_program & clProgram, cl_device_id clDeviceId, const size_t worksizeLocal); + Device(Dispatcher & parent, cl_context & clContext, cl_program & clProgram, cl_device_id clDeviceId, const size_t worksizeLocal, const size_t index); ~Device(); Dispatcher & m_parent; + const size_t m_index; cl_device_id m_clDeviceId; - const size_t m_worksizeLocal; + size_t m_worksizeLocal; cl_uchar m_clScoreMax; cl_command_queue m_clQueue; @@ -67,13 +70,15 @@ class Dispatcher { Dispatcher(cl_context & clContext, cl_program & clProgram, const Mode mode, const size_t worksizeMax, const cl_uchar clScoreQuit = 0); ~Dispatcher(); - void addDevice(cl_device_id clDeviceId, const size_t worksizeLocal); + void addDevice(cl_device_id clDeviceId, const size_t worksizeLocal, const size_t index); void run(); private: void init(Device & d); void dispatch(Device & d); void enqueueKernel(cl_command_queue & clQueue, cl_kernel & clKernel, size_t worksizeGlobal, const size_t worksizeLocal); + void enqueueKernelDevice(Device & d, cl_kernel & clKernel, size_t worksizeGlobal); + void handleResult(Device & d); void randomizeSeed(Device & d); diff --git a/profanity.cpp b/profanity.cpp index 745a888..22a3b98 100755 --- a/profanity.cpp +++ b/profanity.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -193,6 +194,8 @@ int main(int argc, char * * argv) { std::vector vFoundDevices = getAllDevices(); std::vector vDevices; + std::map mDeviceIndex; + std::vector vDeviceBinary; std::vector vDeviceBinarySize; cl_int errorCode; @@ -223,6 +226,7 @@ int main(int argc, char * * argv) { std::cout << "\tGPU" << i << ": " << strName << ", " << globalMemSize << " bytes available, " << computeUnits << " compute units (precompiled = " << (precompiled ? "yes" : "no") << ")" << std::endl; vDevices.push_back(vFoundDevices[i]); + mDeviceIndex[vFoundDevices[i]] = i; } if (vDevices.empty()) { @@ -303,7 +307,7 @@ int main(int argc, char * * argv) { Dispatcher d(clContext, clProgram, mode, worksizeMax, 0); for (auto & i : vDevices) { - d.addDevice(i, worksizeLocal); + d.addDevice(i, worksizeLocal, mDeviceIndex[i]); } d.run();