Unverified Commit 8528d8eb authored by Peter Eastman's avatar Peter Eastman Committed by GitHub
Browse files

Profiling of OpenCL kernels (#3954)

* Profiling of OpenCL kernels

* Minor improvements to profiling
parent 58ee361f
...@@ -666,6 +666,7 @@ public: ...@@ -666,6 +666,7 @@ public:
void flushQueue(); void flushQueue();
private: private:
OpenCLPlatform::PlatformData& platformData; OpenCLPlatform::PlatformData& platformData;
void printProfilingEvents();
int deviceIndex; int deviceIndex;
int platformIndex; int platformIndex;
int contextIndex; int contextIndex;
...@@ -708,6 +709,9 @@ private: ...@@ -708,6 +709,9 @@ private:
std::map<std::string, double> energyParamDerivWorkspace; std::map<std::string, double> energyParamDerivWorkspace;
std::vector<cl::Memory*> autoclearBuffers; std::vector<cl::Memory*> autoclearBuffers;
std::vector<int> autoclearBufferSizes; std::vector<int> autoclearBufferSizes;
std::vector<cl::Event> profilingEvents;
std::vector<std::string> profilingKernelNames;
cl_ulong profileStartTime;
OpenCLIntegrationUtilities* integration; OpenCLIntegrationUtilities* integration;
OpenCLExpressionUtilities* expression; OpenCLExpressionUtilities* expression;
OpenCLBondedUtilities* bonded; OpenCLBondedUtilities* bonded;
......
...@@ -52,6 +52,10 @@ ...@@ -52,6 +52,10 @@
using namespace OpenMM; using namespace OpenMM;
using namespace std; using namespace std;
// Uncomment the following line to enable profiling of all kernel launches. The results are written
// to stdout in the JSON format used by https://ui.perfetto.dev.
//#define ENABLE_PROFILING
#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV #ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000 #define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
#endif #endif
...@@ -78,7 +82,7 @@ static bool isSupported(cl::Platform platform) { ...@@ -78,7 +82,7 @@ static bool isSupported(cl::Platform platform) {
} }
OpenCLContext::OpenCLContext(const System& system, int platformIndex, int deviceIndex, const string& precision, OpenCLPlatform::PlatformData& platformData, OpenCLContext* originalContext) : OpenCLContext::OpenCLContext(const System& system, int platformIndex, int deviceIndex, const string& precision, OpenCLPlatform::PlatformData& platformData, OpenCLContext* originalContext) :
ComputeContext(system), platformData(platformData), numForceBuffers(0), hasAssignedPosqCharges(false), ComputeContext(system), platformData(platformData), numForceBuffers(0), hasAssignedPosqCharges(false), profileStartTime(0),
integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), pinnedBuffer(NULL) { integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), pinnedBuffer(NULL) {
if (precision == "single") { if (precision == "single") {
useDoublePrecision = false; useDoublePrecision = false;
...@@ -293,7 +297,12 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device ...@@ -293,7 +297,12 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
cl_context_properties cprops[] = {CL_CONTEXT_PLATFORM, (cl_context_properties) platforms[bestPlatform](), 0}; cl_context_properties cprops[] = {CL_CONTEXT_PLATFORM, (cl_context_properties) platforms[bestPlatform](), 0};
if (originalContext == NULL) { if (originalContext == NULL) {
context = cl::Context(contextDevices, cprops, errorCallback); context = cl::Context(contextDevices, cprops, errorCallback);
#ifdef ENABLE_PROFILING
defaultQueue = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE);
printf("[ ");
#else
defaultQueue = cl::CommandQueue(context, device); defaultQueue = cl::CommandQueue(context, device);
#endif
} }
else { else {
context = originalContext->context; context = originalContext->context;
...@@ -495,6 +504,10 @@ OpenCLContext::~OpenCLContext() { ...@@ -495,6 +504,10 @@ OpenCLContext::~OpenCLContext() {
delete bonded; delete bonded;
if (nonbonded != NULL) if (nonbonded != NULL)
delete nonbonded; delete nonbonded;
#ifdef ENABLE_PROFILING
printProfilingEvents();
printf(" ]\n");
#endif
} }
void OpenCLContext::initialize() { void OpenCLContext::initialize() {
...@@ -675,7 +688,16 @@ void OpenCLContext::executeKernel(cl::Kernel& kernel, int workUnits, int blockSi ...@@ -675,7 +688,16 @@ void OpenCLContext::executeKernel(cl::Kernel& kernel, int workUnits, int blockSi
blockSize = ThreadBlockSize; blockSize = ThreadBlockSize;
int size = std::min((workUnits+blockSize-1)/blockSize, numThreadBlocks)*blockSize; int size = std::min((workUnits+blockSize-1)/blockSize, numThreadBlocks)*blockSize;
try { try {
#ifdef ENABLE_PROFILING
cl::Event event;
currentQueue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(size), cl::NDRange(blockSize), NULL, &event);
profilingEvents.push_back(event);
profilingKernelNames.push_back(kernel.getInfo<CL_KERNEL_FUNCTION_NAME>());
if (profilingEvents.size() >= 500)
printProfilingEvents();
#else
currentQueue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(size), cl::NDRange(blockSize)); currentQueue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(size), cl::NDRange(blockSize));
#endif
} }
catch (cl::Error err) { catch (cl::Error err) {
stringstream str; stringstream str;
...@@ -684,6 +706,24 @@ void OpenCLContext::executeKernel(cl::Kernel& kernel, int workUnits, int blockSi ...@@ -684,6 +706,24 @@ void OpenCLContext::executeKernel(cl::Kernel& kernel, int workUnits, int blockSi
} }
} }
/**
 * Write one trace record to stdout for every kernel launch recorded in
 * profilingEvents, then empty the list of pending events.
 *
 * Each record is a complete ("ph":"X") event in the JSON trace format read by
 * https://ui.perfetto.dev.  The "ts" and "dur" fields are in microseconds; the
 * OpenCL profiling counters are reported in nanoseconds, hence the 0.001 factor.
 * Timestamps are relative to the start of the first event ever printed by this
 * context, which is remembered in profileStartTime.
 *
 * This blocks until every pending event has completed.
 */
void OpenCLContext::printProfilingEvents() {
    for (size_t i = 0; i < profilingEvents.size(); i++) {
        // Take a reference: copying a cl::Event is not needed just to query it.
        cl::Event& event = profilingEvents[i];
        event.wait();
        // Zero-initialize so a failed query can never print indeterminate values.
        cl_ulong start = 0, end = 0;
        event.getProfilingInfo(CL_PROFILING_COMMAND_START, &start);
        event.getProfilingInfo(CL_PROFILING_COMMAND_END, &end);
        // profileStartTime is still 0 only for the very first record, which is also
        // the only record that must not be preceded by a separating comma.
        if (profileStartTime == 0)
            profileStartTime = start;
        else
            printf(",\n");
        // Fixed-point with three decimals keeps the full nanosecond resolution no matter
        // how long the trace runs.  A "%g"-style format only keeps six significant digits,
        // which rounds timestamps to 10 us or worse once the trace is longer than a second.
        printf("{ \"pid\":1, \"tid\":1, \"ts\":%.3f, \"dur\":%.3f, \"ph\":\"X\", \"name\":\"%s\" }",
                0.001*(start-profileStartTime), 0.001*(end-start), profilingKernelNames[i].c_str());
    }
    profilingEvents.clear();
    profilingKernelNames.clear();
}
int OpenCLContext::computeThreadBlockSize(double memory) const { int OpenCLContext::computeThreadBlockSize(double memory) const {
int maxShared = device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>(); int maxShared = device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
// On some implementations, more local memory gets used than we calculate by // On some implementations, more local memory gets used than we calculate by
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment