Unified interface for queues (#4913)

* Unified interface for queues
* Simplified stream handling in CudaFFT3D
* HIP implementation of ComputeQueue

Unified interface for queues (#4913)
* Unified interface for queues
* Simplified stream handling in CudaFFT3D
* HIP implementation of ComputeQueue
dd320bcf · Peter Eastman · GitHub · baf7942c · dd320bcf · dd320bcf
Unverified Commit dd320bcf authored Apr 28, 2025 by Peter Eastman Committed by GitHub Apr 28, 2025
5 changed files
--- a/platforms/opencl/include/OpenCLQueue.h
+++ b/platforms/opencl/include/OpenCLQueue.h
+#ifndef OPENMM_OPENCLQUEUE_H_
+#define OPENMM_OPENCLQUEUE_H_
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2025 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "openmm/common/ComputeQueue.h"
+#include "opencl.hpp"
+
+namespace OpenMM {
+
+/**
+ * This is the OpenCL implementation of the ComputeQueue interface.  It wraps a cl::CommandQueue.
+ */
+
+class OpenCLQueue : public ComputeQueueImpl {
+public:
+    /**
+     * Create an OpenCLQueue that wraps a cl::CommandQueue.
+     *
+     * @param queue    the command queue to wrap.  It is taken by value and a copy
+     *                 of it is stored in this object.
+     */
+    OpenCLQueue(cl::CommandQueue queue) : queue(queue) {
+    }
+    /**
+     * Get the cl::CommandQueue.
+     *
+     * This does not modify the OpenCLQueue, so it is declared const and may be
+     * called through a const reference or pointer.
+     *
+     * @return a copy of the wrapped cl::CommandQueue
+     */
+    cl::CommandQueue getQueue() const {
+        return queue;
+    }
+private:
+    cl::CommandQueue queue;
+};
+
+} // namespace OpenMM
+
+#endif /*OPENMM_OPENCLQUEUE_H_*/
--- a/platforms/opencl/src/OpenCLArray.cpp
+++ b/platforms/opencl/src/OpenCLArray.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2012-2022 Stanford University and the Authors.      *
+ * Portions copyright (c) 2012-2025 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -26,6 +26,7 @@

 #include "OpenCLArray.h"
 #include "OpenCLContext.h"
+#include "OpenCLQueue.h"
 #include <iostream>
 #include <sstream>
 #include <vector>
@@ -96,13 +97,17 @@ ComputeContext& OpenCLArray::getContext() {
    return *context;
 }

+// Get the cl::CommandQueue wrapped by the context's current queue.  Throws an
+// OpenMMException if the current queue is missing or is not an OpenCLQueue.
+cl::CommandQueue OpenCLArray::getQueue() const {
+    // Bind by const reference so the queue handle stays alive while the raw pointer is used,
+    // whether getCurrentQueue() returns a reference or a temporary.
+    const auto& current = context->getCurrentQueue();
+    OpenCLQueue* queue = dynamic_cast<OpenCLQueue*>(current.get());
+    // dynamic_cast yields NULL for a null pointer or a different implementation type;
+    // report that instead of dereferencing it.
+    if (queue == NULL)
+        throw OpenMMException("OpenCLArray: the context's current queue is not an OpenCLQueue");
+    return queue->getQueue();
+}
+
 void OpenCLArray::uploadSubArray(const void* data, int offset, int elements, bool blocking) {
    if (buffer == NULL)
        throw OpenMMException("OpenCLArray has not been initialized");
    if (offset < 0 || offset+elements > getSize())
        throw OpenMMException("uploadSubArray: data exceeds range of array");
    try {
-        context->getQueue().enqueueWriteBuffer(*buffer, blocking ? CL_TRUE : CL_FALSE, offset*elementSize, elements*elementSize, data);
+        getQueue().enqueueWriteBuffer(*buffer, blocking ? CL_TRUE : CL_FALSE, offset*elementSize, elements*elementSize, data);
    }
    catch (cl::Error err) {
        std::stringstream str;
@@ -115,7 +120,7 @@ void OpenCLArray::download(void* data, bool blocking) const {
    if (buffer == NULL)
        throw OpenMMException("OpenCLArray has not been initialized");
    try {
-        context->getQueue().enqueueReadBuffer(*buffer, blocking ? CL_TRUE : CL_FALSE, 0, size*elementSize, data);
+        getQueue().enqueueReadBuffer(*buffer, blocking ? CL_TRUE : CL_FALSE, 0, size*elementSize, data);
    }
    catch (cl::Error err) {
        std::stringstream str;
@@ -131,7 +136,7 @@ void OpenCLArray::copyTo(ArrayInterface& dest) const {
        throw OpenMMException("Error copying array "+name+" to "+dest.getName()+": The destination array does not match the size of the array");
    OpenCLArray& clDest = context->unwrap(dest);
    try {
-        context->getQueue().enqueueCopyBuffer(*buffer, clDest.getDeviceBuffer(), 0, 0, size*elementSize);
+        getQueue().enqueueCopyBuffer(*buffer, clDest.getDeviceBuffer(), 0, 0, size*elementSize);
    }
    catch (cl::Error err) {
        std::stringstream str;

--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -37,6 +37,7 @@
 #include "OpenCLKernelSources.h"
 #include "OpenCLNonbondedUtilities.h"
 #include "OpenCLProgram.h"
+#include "OpenCLQueue.h"
 #include "openmm/common/ComputeArray.h"
 #include "openmm/MonteCarloFlexibleBarostat.h"
 #include "openmm/Platform.h"
@@ -302,10 +303,10 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
        if (originalContext == NULL) {
            context = cl::Context(contextDevices, cprops, errorCallback);
 #ifdef ENABLE_PROFILING
-            defaultQueue = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE);
+            defaultQueue = shared_ptr<ComputeQueueImpl>(new OpenCLQueue(cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE)));
            printf("[ ");
 #else
-            defaultQueue = cl::CommandQueue(context, device);
+            defaultQueue = shared_ptr<ComputeQueueImpl>(new OpenCLQueue(cl::CommandQueue(context, device)));
 #endif
        }
        else {
@@ -559,7 +560,7 @@ void OpenCLContext::initialize() {
            energyBufferSize*energyBuffer.getElementSize()),
            (int) longForceBuffer.getSize()*longForceBuffer.getElementSize());
    pinnedBuffer = new cl::Buffer(context, CL_MEM_ALLOC_HOST_PTR, bufferBytes);
-    pinnedMemory = currentQueue.enqueueMapBuffer(*pinnedBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, bufferBytes);
+    pinnedMemory = getQueue().enqueueMapBuffer(*pinnedBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, bufferBytes);
    for (int i = 0; i < numAtoms; i++) {
        double mass = system.getParticleMass(i);
        if (useDoublePrecision || useMixedPrecision)
@@ -670,16 +671,12 @@ double& OpenCLContext::getEnergyWorkspace() {
    return platformData.contextEnergy[contextIndex];
 }

-cl::CommandQueue& OpenCLContext::getQueue() {
-    return currentQueue;
+ComputeQueue OpenCLContext::createQueue() {
+    // Build a new command queue on this context's device, then hand it back wrapped
+    // in an OpenCLQueue behind the platform-independent ComputeQueue handle.
+    cl::CommandQueue newQueue(context, device);
+    return shared_ptr<ComputeQueueImpl>(new OpenCLQueue(newQueue));
 }

-void OpenCLContext::setQueue(cl::CommandQueue& queue) {
-    currentQueue = queue;
-}
-
-void OpenCLContext::restoreDefaultQueue() {
-    currentQueue = defaultQueue;
+cl::CommandQueue OpenCLContext::getQueue() {
+    // The current queue is expected to be an OpenCLQueue.  Check the cast anyway so that
+    // a missing queue or one of another type raises an exception instead of dereferencing
+    // a null pointer.
+    OpenCLQueue* queue = dynamic_cast<OpenCLQueue*>(currentQueue.get());
+    if (queue == NULL)
+        throw OpenMMException("OpenCLContext: the current queue is not an OpenCLQueue");
+    return queue->getQueue();
 }

 OpenCLArray* OpenCLContext::createArray() {
@@ -714,13 +711,13 @@ void OpenCLContext::executeKernel(cl::Kernel& kernel, int workUnits, int blockSi
    try {
 #ifdef ENABLE_PROFILING
    cl::Event event;
-    currentQueue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(size), cl::NDRange(blockSize), NULL, &event);
+    getQueue().enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(size), cl::NDRange(blockSize), NULL, &event);
    profilingEvents.push_back(event);
    profilingKernelNames.push_back(kernel.getInfo<CL_KERNEL_FUNCTION_NAME>());
    if (profilingEvents.size() >= 500)
        printProfilingEvents();
 #else
-        currentQueue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(size), cl::NDRange(blockSize));
+        getQueue().enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(size), cl::NDRange(blockSize));
 #endif
    }
    catch (cl::Error err) {

--- a/platforms/opencl/src/OpenCLEvent.cpp
+++ b/platforms/opencl/src/OpenCLEvent.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2019 Stanford University and the Authors.           *
+ * Portions copyright (c) 2019-2025 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -25,6 +25,7 @@
 * -------------------------------------------------------------------------- */

 #include "OpenCLEvent.h"
+#include "OpenCLQueue.h"

 using namespace OpenMM;

@@ -32,7 +33,7 @@ OpenCLEvent::OpenCLEvent(OpenCLContext& context) : context(context) {
 }

 void OpenCLEvent::enqueue() {
-    context.getQueue().enqueueMarkerWithWaitList(NULL, &event);
+    // Record a marker on the context's current queue.  OpenCLContext::getQueue() already
+    // unwraps the current queue to a cl::CommandQueue, so use it here rather than
+    // repeating the cast.
+    context.getQueue().enqueueMarkerWithWaitList(NULL, &event);
 }

 void OpenCLEvent::wait() {

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -35,6 +35,7 @@
 #include "OpenCLIntegrationUtilities.h"
 #include "OpenCLNonbondedUtilities.h"
 #include "OpenCLKernelSources.h"
+#include "OpenCLQueue.h"
 #include "SimTKOpenMMRealType.h"
 #include "SimTKOpenMMUtilities.h"
 #include <algorithm>
@@ -222,18 +223,18 @@ private:

 class OpenCLCalcNonbondedForceKernel::SyncQueuePreComputation : public OpenCLContext::ForcePreComputation {
+    // Pre-computation step that makes the stored queue wait for the work enqueued so far
+    // on the context's current queue, whenever the configured force group is evaluated.
 public:
-    SyncQueuePreComputation(OpenCLContext& cl, cl::CommandQueue queue, int forceGroup) : cl(cl), queue(queue), forceGroup(forceGroup) {
+    SyncQueuePreComputation(OpenCLContext& cl, ComputeQueue queue, int forceGroup) : cl(cl), queue(queue), forceGroup(forceGroup) {
     }
     void computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
         if ((groups&(1<<forceGroup)) != 0) {
+            // Unwrap the stored queue before enqueuing anything, so that a null queue or one
+            // of another type raises an exception instead of dereferencing a null pointer.
+            OpenCLQueue* target = dynamic_cast<OpenCLQueue*>(queue.get());
+            if (target == NULL)
+                throw OpenMMException("SyncQueuePreComputation: the queue is not an OpenCLQueue");
             vector<cl::Event> events(1);
             cl.getQueue().enqueueMarkerWithWaitList(NULL, &events[0]);
-            queue.enqueueBarrierWithWaitList(&events);
+            // Make the stored queue wait on the marker just recorded on the current queue.
+            target->getQueue().enqueueBarrierWithWaitList(&events);
         }
     }
 private:
    OpenCLContext& cl;
-    cl::CommandQueue queue;
+    ComputeQueue queue;
     int forceGroup;
 };

@@ -523,7 +524,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
                usePmeQueue = (!cl.getPlatformData().disablePmeStream && !cl.getPlatformData().useCpuPme && isNvidia);
                if (usePmeQueue) {
                    pmeDefines["USE_PME_STREAM"] = "1";
-                    pmeQueue = cl::CommandQueue(cl.getContext(), cl.getDevice());
+                    pmeQueue = cl.createQueue();
                    int recipForceGroup = force.getReciprocalSpaceForceGroup();
                    if (recipForceGroup < 0)
                        recipForceGroup = force.getForceGroup();
@@ -941,7 +942,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
        if (usePmeQueue) {
            vector<cl::Event> events(1);
            cl.getQueue().enqueueMarkerWithWaitList(NULL, &events[0]);
-            pmeQueue.enqueueBarrierWithWaitList(&events);
+            dynamic_cast<OpenCLQueue*>(pmeQueue.get())->getQueue().enqueueBarrierWithWaitList(&events);
        }
        if (hasOffsets) {
            // The Ewald self energy was computed in the kernel.
@@ -979,7 +980,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
    }
    if (pmeGrid1.isInitialized() && includeReciprocal) {
        if (usePmeQueue && !includeEnergy)
-            cl.setQueue(pmeQueue);
+            cl.setCurrentQueue(pmeQueue);
        
        // Invert the periodic box vectors.
        
@@ -1131,7 +1132,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
                cl.executeKernel(pmeDispersionInterpolateForceKernel, cl.getNumAtoms());
        }
        if (usePmeQueue) {
-            pmeQueue.enqueueMarkerWithWaitList(NULL, &pmeSyncEvent);
+            dynamic_cast<OpenCLQueue*>(pmeQueue.get())->getQueue().enqueueMarkerWithWaitList(NULL, &pmeSyncEvent);
            cl.restoreDefaultQueue();
        }
    }