Profiling of OpenCL kernels (#3954)

* Profiling of OpenCL kernels
* Minor improvements to profiling

Profiling of OpenCL kernels (#3954)
* Profiling of OpenCL kernels
* Minor improvements to profiling
8528d8eb · Peter Eastman · GitHub · 58ee361f · 8528d8eb · 8528d8eb
Unverified Commit 8528d8eb authored Feb 09, 2023 by Peter Eastman Committed by GitHub Feb 09, 2023
Hide whitespace changes
Inline Side-by-side

Showing with 45 additions and 1 deletion

platforms/opencl/include/OpenCLContext.h platforms/opencl/include/OpenCLContext.h +4 -0

platforms/opencl/src/OpenCLContext.cpp platforms/opencl/src/OpenCLContext.cpp +41 -1

No files found.
--- a/platforms/opencl/include/OpenCLContext.h
+++ b/platforms/opencl/include/OpenCLContext.h
@@ -666,6 +666,7 @@ public:
    void flushQueue();
 private:
    OpenCLPlatform::PlatformData& platformData;
+    void printProfilingEvents();
    int deviceIndex;
    int platformIndex;
    int contextIndex;
@@ -708,6 +709,9 @@ private:
    std::map<std::string, double> energyParamDerivWorkspace;
    std::vector<cl::Memory*> autoclearBuffers;
    std::vector<int> autoclearBufferSizes;
+    std::vector<cl::Event> profilingEvents;
+    std::vector<std::string> profilingKernelNames;
+    cl_ulong profileStartTime;
    OpenCLIntegrationUtilities* integration;
    OpenCLExpressionUtilities* expression;
    OpenCLBondedUtilities* bonded;

--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -52,6 +52,10 @@
 using namespace OpenMM;
 using namespace std;

+// Uncomment the following line to enable profiling of all kernel launches.  The results are written
+// to stdout in the JSON format used by https://ui.perfetto.dev.
+//#define ENABLE_PROFILING
+
 #ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
  #define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
 #endif
@@ -78,7 +82,7 @@ static bool isSupported(cl::Platform platform) {
 }

 OpenCLContext::OpenCLContext(const System& system, int platformIndex, int deviceIndex, const string& precision, OpenCLPlatform::PlatformData& platformData, OpenCLContext* originalContext) :
-        ComputeContext(system), platformData(platformData), numForceBuffers(0), hasAssignedPosqCharges(false),
+        ComputeContext(system), platformData(platformData), numForceBuffers(0), hasAssignedPosqCharges(false), profileStartTime(0),
        integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), pinnedBuffer(NULL) {
    if (precision == "single") {
        useDoublePrecision = false;
@@ -293,7 +297,12 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
        cl_context_properties cprops[] = {CL_CONTEXT_PLATFORM, (cl_context_properties) platforms[bestPlatform](), 0};
        if (originalContext == NULL) {
            context = cl::Context(contextDevices, cprops, errorCallback);
+#ifdef ENABLE_PROFILING
+            defaultQueue = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE);
+            printf("[ ");
+#else
            defaultQueue = cl::CommandQueue(context, device);
+#endif
        }
        else {
            context = originalContext->context;
@@ -495,6 +504,10 @@ OpenCLContext::~OpenCLContext() {
        delete bonded;
    if (nonbonded != NULL)
        delete nonbonded;
+#ifdef ENABLE_PROFILING
+    printProfilingEvents();
+    printf(" ]\n");
+#endif
 }

 void OpenCLContext::initialize() {
@@ -675,7 +688,16 @@ void OpenCLContext::executeKernel(cl::Kernel& kernel, int workUnits, int blockSi
        blockSize = ThreadBlockSize;
    int size = std::min((workUnits+blockSize-1)/blockSize, numThreadBlocks)*blockSize;
    try {
+#ifdef ENABLE_PROFILING
+    cl::Event event;
+    currentQueue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(size), cl::NDRange(blockSize), NULL, &event);
+    profilingEvents.push_back(event);
+    profilingKernelNames.push_back(kernel.getInfo<CL_KERNEL_FUNCTION_NAME>());
+    if (profilingEvents.size() >= 500)
+        printProfilingEvents();
+#else
        currentQueue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(size), cl::NDRange(blockSize));
+#endif
    }
    catch (cl::Error err) {
        stringstream str;
@@ -684,6 +706,24 @@ void OpenCLContext::executeKernel(cl::Kernel& kernel, int workUnits, int blockSi
    }
 }

+/**
+ * Write every queued profiling event to stdout as one JSON trace record, then
+ * empty the queue.
+ *
+ * Each record is a "complete" event (ph "X") whose "ts" and "dur" fields are
+ * expressed in microseconds: the OpenCL profiling counters are documented as
+ * nanosecond values, and they are scaled by 0.001 here.  "ts" is measured
+ * relative to the start of the first event this context ever printed.
+ *
+ * Records are separated by ",\n" but no enclosing brackets are written here;
+ * the caller is expected to emit the surrounding "[ " and " ]".
+ *
+ * This call blocks until every queued event has finished executing.
+ */
+void OpenCLContext::printProfilingEvents() {
+    for (size_t i = 0; i < profilingEvents.size(); i++) {
+        // Bind by reference: copying a cl::Event only adds a retain/release pair.
+        cl::Event& event = profilingEvents[i];
+        event.wait();
+        cl_ulong start, end;
+        event.getProfilingInfo(CL_PROFILING_COMMAND_START, &start);
+        event.getProfilingInfo(CL_PROFILING_COMMAND_END, &end);
+        // profileStartTime doubles as the "nothing printed yet" flag: it is 0 until
+        // the first record is written, and only later records need a separator.
+        if (profileStartTime == 0)
+            profileStartTime = start;
+        else
+            printf(",\n");
+        // Fixed-point output with three decimals keeps full nanosecond resolution.
+        // A 6-significant-digit %g would round timestamps to 10 us or worse once
+        // the trace is longer than about one second.
+        printf("{ \"pid\":1, \"tid\":1, \"ts\":%.3f, \"dur\":%.3f, \"ph\":\"X\", \"name\":\"%s\" }",
+                0.001*(start-profileStartTime), 0.001*(end-start), profilingKernelNames[i].c_str());
+    }
+    profilingEvents.clear();
+    profilingKernelNames.clear();
+}
+
 int OpenCLContext::computeThreadBlockSize(double memory) const {
    int maxShared = device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
    // On some implementations, more local memory gets used than we calculate by