Skip to content
This repository has been archived by the owner on Sep 15, 2022. It is now read-only.

Commit

Permalink
Improve printed GPU index and fallback to automatic local worksize on error.
Browse files Browse the repository at this point in the history
  • Loading branch information
Johan Gustafsson committed Mar 12, 2018
1 parent 89ed68c commit e46719d
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 15 deletions.
42 changes: 31 additions & 11 deletions Dispatcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,8 @@ static void printResult(cl_ulong4 seed, result r, const std::chrono::time_point<
}

Dispatcher::OpenCLException::OpenCLException(const std::string s, const cl_int res) :
std::runtime_error( s + " (res = " + toString(res) + ")")
std::runtime_error( s + " (res = " + toString(res) + ")"),
m_res(res)
{

}
Expand All @@ -80,8 +81,9 @@ cl_kernel Dispatcher::Device::createKernel(cl_program & clProgram, const std::st
return ret == NULL ? throw std::runtime_error("failed to create kernel") : ret;
}

Dispatcher::Device::Device(Dispatcher & parent, cl_context & clContext, cl_program & clProgram, cl_device_id clDeviceId, const size_t worksizeLocal) :
Dispatcher::Device::Device(Dispatcher & parent, cl_context & clContext, cl_program & clProgram, cl_device_id clDeviceId, const size_t worksizeLocal, const size_t index) :
m_parent(parent),
m_index(index),
m_clDeviceId(clDeviceId),
m_worksizeLocal(worksizeLocal),
m_clScoreMax(0),
Expand Down Expand Up @@ -117,8 +119,8 @@ Dispatcher::~Dispatcher() {

}

void Dispatcher::addDevice(cl_device_id clDeviceId, const size_t worksizeLocal) {
Device * pDevice = new Device(*this, m_clContext, m_clProgram, clDeviceId, worksizeLocal);
void Dispatcher::addDevice(cl_device_id clDeviceId, const size_t worksizeLocal, const size_t index) {
Device * pDevice = new Device(*this, m_clContext, m_clProgram, clDeviceId, worksizeLocal, index);
m_lDevices.push_back(pDevice);
init(*pDevice);
}
Expand Down Expand Up @@ -194,28 +196,46 @@ void Dispatcher::enqueueKernel(cl_command_queue & clQueue, cl_kernel & clKernel,
size_t worksizeOffset = 0;
while (worksizeGlobal) {
const size_t worksizeRun = std::min(worksizeGlobal, worksizeMax);
const auto res = clEnqueueNDRangeKernel(clQueue, clKernel, 1, &worksizeOffset, &worksizeRun, &worksizeLocal, 0, NULL, NULL);
const size_t * const pWorksizeLocal = (worksizeLocal == 0 ? NULL : &worksizeLocal);
const auto res = clEnqueueNDRangeKernel(clQueue, clKernel, 1, &worksizeOffset, &worksizeRun, pWorksizeLocal, 0, NULL, NULL);
OpenCLException::throwIfError("kernel queueing failed", res);

worksizeGlobal -= worksizeRun;
worksizeOffset += worksizeRun;
}
}

// Enqueues clKernel on device d using the device's configured local work
// size. If the OpenCL runtime rejects that size, the device permanently
// falls back to an implementation-chosen local work size and the enqueue
// is retried once; any other OpenCL error is re-thrown to the caller.
void Dispatcher::enqueueKernelDevice(Device & d, cl_kernel & clKernel, size_t worksizeGlobal) {
	try {
		enqueueKernel(d.m_clQueue, clKernel, worksizeGlobal, d.m_worksizeLocal);
	}
	catch (const OpenCLException & e) {
		// If local work size is invalid, abandon it and let the implementation
		// decide (m_worksizeLocal == 0 makes enqueueKernel pass NULL for the
		// local size in clEnqueueNDRangeKernel).
		if ((e.m_res == CL_INVALID_WORK_GROUP_SIZE || e.m_res == CL_INVALID_WORK_ITEM_SIZE) && d.m_worksizeLocal != 0) {
			std::cout << std::endl << "warning: local work size abandoned on GPU" << d.m_index << std::endl;
			d.m_worksizeLocal = 0;
			enqueueKernel(d.m_clQueue, clKernel, worksizeGlobal, d.m_worksizeLocal);
		}
		else {
			throw;
		}
	}
}

void Dispatcher::dispatch(Device & d) {
// Write new seed
randomizeSeed(d);
CLMemory<cl_ulong4>::setKernelArg(d.m_kernelBegin, 4, d.m_clSeed);

enqueueKernel(d.m_clQueue, d.m_kernelBegin, 1, d.m_worksizeLocal);
enqueueKernelDevice(d, d.m_kernelBegin, 1);

for (auto i = 1; i < PROFANITY_PASSES + 1; ++i) {
enqueueKernel(d.m_clQueue, d.m_kernelInversePre, g_worksizes[i], d.m_worksizeLocal);
enqueueKernel(d.m_clQueue, d.m_kernelInverse, g_worksizes[i] / 255, d.m_worksizeLocal);
enqueueKernel(d.m_clQueue, d.m_kernelInversePost, g_worksizes[i], d.m_worksizeLocal);
enqueueKernelDevice(d, d.m_kernelInversePre, g_worksizes[i]);
enqueueKernelDevice(d, d.m_kernelInverse, g_worksizes[i] / 255);
enqueueKernelDevice(d, d.m_kernelInversePost, g_worksizes[i]);
}

enqueueKernel(d.m_clQueue, d.m_kernelEnd, g_worksizes[PROFANITY_PASSES], d.m_worksizeLocal);
enqueueKernelDevice(d, d.m_kernelEnd, g_worksizes[PROFANITY_PASSES]);

cl_event event;
d.m_memResult.read(false, &event);
Expand Down Expand Up @@ -297,7 +317,7 @@ void Dispatcher::printSpeed() {
for (auto & e : m_lDevices) {
const auto curSpeed = e->m_speed.getSpeed();
speedTotal += curSpeed;
strGPUs += " GPU" + toString(i) + ": " + formatSpeed(curSpeed);
strGPUs += " GPU" + toString(e->m_index) + ": " + formatSpeed(curSpeed);
++i;
}

Expand Down
11 changes: 8 additions & 3 deletions Dispatcher.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,19 +22,22 @@ class Dispatcher {
OpenCLException(const std::string s, const cl_int res);

static void throwIfError(const std::string s, const cl_int res);

const cl_int m_res;
};

struct Device {
static cl_command_queue createQueue(cl_context & clContext, cl_device_id & clDeviceId);
static cl_kernel createKernel(cl_program & clProgram, const std::string s);

Device(Dispatcher & parent, cl_context & clContext, cl_program & clProgram, cl_device_id clDeviceId, const size_t worksizeLocal);
Device(Dispatcher & parent, cl_context & clContext, cl_program & clProgram, cl_device_id clDeviceId, const size_t worksizeLocal, const size_t index);
~Device();

Dispatcher & m_parent;
const size_t m_index;

cl_device_id m_clDeviceId;
const size_t m_worksizeLocal;
size_t m_worksizeLocal;
cl_uchar m_clScoreMax;
cl_command_queue m_clQueue;

Expand Down Expand Up @@ -67,13 +70,15 @@ class Dispatcher {
Dispatcher(cl_context & clContext, cl_program & clProgram, const Mode mode, const size_t worksizeMax, const cl_uchar clScoreQuit = 0);
~Dispatcher();

void addDevice(cl_device_id clDeviceId, const size_t worksizeLocal);
void addDevice(cl_device_id clDeviceId, const size_t worksizeLocal, const size_t index);
void run();

private:
void init(Device & d);
void dispatch(Device & d);
void enqueueKernel(cl_command_queue & clQueue, cl_kernel & clKernel, size_t worksizeGlobal, const size_t worksizeLocal);
void enqueueKernelDevice(Device & d, cl_kernel & clKernel, size_t worksizeGlobal);

void handleResult(Device & d);
void randomizeSeed(Device & d);

Expand Down
6 changes: 5 additions & 1 deletion profanity.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include <cstdlib>
#include <cstdio>
#include <vector>
#include <map>
#include <set>

#include <CL/cl.h>
Expand Down Expand Up @@ -193,6 +194,8 @@ int main(int argc, char * * argv) {

std::vector<cl_device_id> vFoundDevices = getAllDevices();
std::vector<cl_device_id> vDevices;
std::map<cl_device_id, size_t> mDeviceIndex;

std::vector<std::string> vDeviceBinary;
std::vector<size_t> vDeviceBinarySize;
cl_int errorCode;
Expand Down Expand Up @@ -223,6 +226,7 @@ int main(int argc, char * * argv) {

std::cout << "\tGPU" << i << ": " << strName << ", " << globalMemSize << " bytes available, " << computeUnits << " compute units (precompiled = " << (precompiled ? "yes" : "no") << ")" << std::endl;
vDevices.push_back(vFoundDevices[i]);
mDeviceIndex[vFoundDevices[i]] = i;
}

if (vDevices.empty()) {
Expand Down Expand Up @@ -303,7 +307,7 @@ int main(int argc, char * * argv) {

Dispatcher d(clContext, clProgram, mode, worksizeMax, 0);
for (auto & i : vDevices) {
d.addDevice(i, worksizeLocal);
d.addDevice(i, worksizeLocal, mDeviceIndex[i]);
}

d.run();
Expand Down

0 comments on commit e46719d

Please sign in to comment.