Unverified Commit 8528d8eb authored by Peter Eastman's avatar Peter Eastman Committed by GitHub
Browse files

Profiling of OpenCL kernels (#3954)

* Profiling of OpenCL kernels

* Minor improvements to profiling
parent 58ee361f
......@@ -666,6 +666,7 @@ public:
void flushQueue();
private:
OpenCLPlatform::PlatformData& platformData;
void printProfilingEvents();
int deviceIndex;
int platformIndex;
int contextIndex;
......@@ -708,6 +709,9 @@ private:
std::map<std::string, double> energyParamDerivWorkspace;
std::vector<cl::Memory*> autoclearBuffers;
std::vector<int> autoclearBufferSizes;
std::vector<cl::Event> profilingEvents;
std::vector<std::string> profilingKernelNames;
cl_ulong profileStartTime;
OpenCLIntegrationUtilities* integration;
OpenCLExpressionUtilities* expression;
OpenCLBondedUtilities* bonded;
......
......@@ -52,6 +52,10 @@
using namespace OpenMM;
using namespace std;
// Uncomment the following line to enable profiling of all kernel launches. The results are written
// to stdout in the JSON format used by https://ui.perfetto.dev.
//#define ENABLE_PROFILING
#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
#endif
......@@ -78,7 +82,7 @@ static bool isSupported(cl::Platform platform) {
}
OpenCLContext::OpenCLContext(const System& system, int platformIndex, int deviceIndex, const string& precision, OpenCLPlatform::PlatformData& platformData, OpenCLContext* originalContext) :
ComputeContext(system), platformData(platformData), numForceBuffers(0), hasAssignedPosqCharges(false),
ComputeContext(system), platformData(platformData), numForceBuffers(0), hasAssignedPosqCharges(false), profileStartTime(0),
integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), pinnedBuffer(NULL) {
if (precision == "single") {
useDoublePrecision = false;
......@@ -293,7 +297,12 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
cl_context_properties cprops[] = {CL_CONTEXT_PLATFORM, (cl_context_properties) platforms[bestPlatform](), 0};
if (originalContext == NULL) {
context = cl::Context(contextDevices, cprops, errorCallback);
#ifdef ENABLE_PROFILING
defaultQueue = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE);
printf("[ ");
#else
defaultQueue = cl::CommandQueue(context, device);
#endif
}
else {
context = originalContext->context;
......@@ -495,6 +504,10 @@ OpenCLContext::~OpenCLContext() {
delete bonded;
if (nonbonded != NULL)
delete nonbonded;
#ifdef ENABLE_PROFILING
printProfilingEvents();
printf(" ]\n");
#endif
}
void OpenCLContext::initialize() {
......@@ -675,7 +688,16 @@ void OpenCLContext::executeKernel(cl::Kernel& kernel, int workUnits, int blockSi
blockSize = ThreadBlockSize;
int size = std::min((workUnits+blockSize-1)/blockSize, numThreadBlocks)*blockSize;
try {
#ifdef ENABLE_PROFILING
cl::Event event;
currentQueue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(size), cl::NDRange(blockSize), NULL, &event);
profilingEvents.push_back(event);
profilingKernelNames.push_back(kernel.getInfo<CL_KERNEL_FUNCTION_NAME>());
if (profilingEvents.size() >= 500)
printProfilingEvents();
#else
currentQueue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(size), cl::NDRange(blockSize));
#endif
}
catch (cl::Error err) {
stringstream str;
......@@ -684,6 +706,24 @@ void OpenCLContext::executeKernel(cl::Kernel& kernel, int workUnits, int blockSi
}
}
/**
 * Write all pending profiling events to stdout and discard them.
 *
 * Each recorded kernel launch is emitted as one "complete" event ("ph":"X")
 * of the JSON trace format read by https://ui.perfetto.dev.  The surrounding
 * "[ " and " ]" are not printed here, only the comma separated event objects.
 *
 * The start time of the first event ever printed is stored in profileStartTime
 * and used as the origin for every later timestamp.  A value of 0 in
 * profileStartTime therefore doubles as the "nothing printed yet" marker that
 * decides whether a separating comma is needed.
 *
 * This blocks until every pending event has completed, because profiling
 * information can only be queried for finished commands.
 */
void OpenCLContext::printProfilingEvents() {
    for (size_t i = 0; i < profilingEvents.size(); i++) {
        // Take a reference: copying a cl::Event only to query it is unnecessary.
        cl::Event& event = profilingEvents[i];
        event.wait();
        cl_ulong start, end;
        event.getProfilingInfo(CL_PROFILING_COMMAND_START, &start);
        event.getProfilingInfo(CL_PROFILING_COMMAND_END, &end);
        if (profileStartTime == 0)
            profileStartTime = start;
        else
            printf(",\n");
        // OpenCL reports profiling counters in nanoseconds; the trace format wants
        // microseconds.  Fixed-point output with three decimals keeps full nanosecond
        // resolution however long the run lasts.  A "%g" style format keeps only six
        // significant digits, which silently rounds timestamps to whole microseconds
        // (and worse) once the trace is more than about a second long.
        printf("{ \"pid\":1, \"tid\":1, \"ts\":%.3f, \"dur\":%.3f, \"ph\":\"X\", \"name\":\"%s\" }",
                0.001*(start-profileStartTime), 0.001*(end-start), profilingKernelNames[i].c_str());
    }
    // Both vectors are indexed in parallel, so they must always be cleared together.
    profilingEvents.clear();
    profilingKernelNames.clear();
}
int OpenCLContext::computeThreadBlockSize(double memory) const {
int maxShared = device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
// On some implementations, more local memory gets used than we calculate by
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment