Unified interface for queues (#4913)

* Unified interface for queues
* Simplified stream handling in CudaFFT3D
* HIP implementation of ComputeQueue

Unified interface for queues (#4913)
* Unified interface for queues
* Simplified stream handling in CudaFFT3D
* HIP implementation of ComputeQueue
dd320bcf · Peter Eastman · GitHub · baf7942c · dd320bcf · dd320bcf
Unverified Commit dd320bcf authored Apr 28, 2025 by Peter Eastman Committed by GitHub Apr 28, 2025
20 changed files
--- a/platforms/common/include/openmm/common/ComputeContext.h
+++ b/platforms/common/include/openmm/common/ComputeContext.h
@@ -36,6 +36,7 @@
 #include "openmm/common/ComputeEvent.h"
 #include "openmm/common/ComputeForceInfo.h"
 #include "openmm/common/ComputeProgram.h"
+#include "openmm/common/ComputeQueue.h"
 #include "openmm/common/ComputeVectorTypes.h"
 #include "openmm/common/FFT3D.h"
 #include "openmm/common/IntegrationUtilities.h"
@@ -143,6 +144,22 @@ public:
     * multiple devices.
     */
    virtual double& getEnergyWorkspace() = 0;
+    /**
+     * Create a new ComputeQueue for use with this context.
+     */
+    virtual ComputeQueue createQueue() = 0;
+    /**
+     * Get the ComputeQueue currently being used for execution.
+     */
+    ComputeQueue getCurrentQueue();
+    /**
+     * Set the ComputeQueue to use for execution.
+     */
+    void setCurrentQueue(ComputeQueue queue);
+    /**
+     * Reset the context to using the default queue for execution.
+     */
+    void restoreDefaultQueue();
    /**
     * Construct an uninitialized array of the appropriate class for this platform.  The returned
     * value should be created on the heap with the "new" operator.
@@ -560,6 +577,7 @@ protected:
    int numAtoms, paddedNumAtoms, computeForceCount, stepsSinceReorder;
    long long stepCount;
    bool forceNextReorder, atomsWereReordered, forcesValid;
+    ComputeQueue defaultQueue, currentQueue;
    std::vector<ComputeForceInfo*> forces;
    std::vector<Molecule> molecules;
    std::vector<MoleculeGroup> moleculeGroups;

--- a/platforms/common/include/openmm/common/ComputeQueue.h
+++ b/platforms/common/include/openmm/common/ComputeQueue.h
+#ifndef OPENMM_COMPUTEQUEUE_H_
+#define OPENMM_COMPUTEQUEUE_H_
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2025 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "openmm/common/windowsExportCommon.h"
+#include <memory>
+namespace OpenMM {
+/**
+ * Base class for objects representing a queue on which kernels can be executed.  Instances
+ * are platform-specific subclasses, obtained by calling createQueue() on a ComputeContext.
+ * Passing one to the ComputeContext's setCurrentQueue() method makes it the queue used
+ * for execution; restoreDefaultQueue() switches back to the default queue.
+ *
+ * Code should normally hold a ComputeQueue rather than naming this class directly.
+ * ComputeQueue is an alias for std::shared_ptr<ComputeQueueImpl>, so it can be copied and
+ * passed around like a value and the underlying object is released automatically.
+ */
+class OPENMM_EXPORT_COMMON ComputeQueueImpl {
+public:
+    virtual ~ComputeQueueImpl() = default;
+};
+using ComputeQueue = std::shared_ptr<ComputeQueueImpl>;
+} // namespace OpenMM
+#endif /*OPENMM_COMPUTEQUEUE_H_*/
--- a/platforms/common/src/ComputeContext.cpp
+++ b/platforms/common/src/ComputeContext.cpp
@@ -52,6 +52,18 @@ ComputeContext::ComputeContext(const System& system) : system(system), time(0.0)
 ComputeContext::~ComputeContext() {
 }
+// Return the queue currently being used for execution.
+ComputeQueue ComputeContext::getCurrentQueue() {
+    return currentQueue;
+}
+// Make the given queue the one used for execution.  The argument arrives by value, so
+// swapping hands it to currentQueue directly; the previously current queue is released
+// when the argument goes out of scope.
+void ComputeContext::setCurrentQueue(ComputeQueue queue) {
+    currentQueue.swap(queue);
+}
+// Go back to executing on the context's default queue.
+void ComputeContext::restoreDefaultQueue() {
+    setCurrentQueue(defaultQueue);
+}
 void ComputeContext::addForce(ComputeForceInfo* force) {
    forces.push_back(force);
 }

--- a/platforms/cuda/include/CudaContext.h
+++ b/platforms/cuda/include/CudaContext.h
@@ -46,6 +46,7 @@
 #include "CudaIntegrationUtilities.h"
 #include "CudaNonbondedUtilities.h"
 #include "CudaPlatform.h"
+#include "CudaQueue.h"
 #include "openmm/OpenMMException.h"
 #include "openmm/common/ComputeContext.h"
 #include "openmm/Kernel.h"
@@ -159,17 +160,13 @@ public:
     */
    double& getEnergyWorkspace();
    /**
-     * Get the stream currently being used for execution.
+     * Create a new ComputeQueue for use with this context.
-     */
-    CUstream getCurrentStream();
-    /**
-     * Set the stream to use for execution.
     */
-    void setCurrentStream(CUstream stream);
+    ComputeQueue createQueue();
    /**
-     * Reset the context to using the default stream for execution.
+     * Get the stream currently being used for execution.
     */
-    void restoreDefaultStream();
+    CUstream getCurrentStream();
    /**
     * Construct an uninitialized array of the appropriate class for this platform.  The returned
     * value should be created on the heap with the "new" operator.
@@ -587,7 +584,6 @@ private:
    std::map<std::string, std::string> compilationDefines;
    CUcontext context;
    CUdevice device;
-    CUstream currentStream;
    CUfunction clearBufferKernel;
    CUfunction clearTwoBuffersKernel;
    CUfunction clearThreeBuffersKernel;

--- a/platforms/cuda/include/CudaFFT3D.h
+++ b/platforms/cuda/include/CudaFFT3D.h
@@ -63,10 +63,6 @@ public:
     */
    CudaFFT3D(CudaContext& context, int xsize, int ysize, int zsize, bool realToComplex=false);
    ~CudaFFT3D();
-    /**
-     * Set the stream to perform the FFT on.
-     */
-    void setStream(CUstream stream);
    /**
     * Perform a Fourier transform.  The transform cannot be done in-place: the input and output
     * arrays must be different.  Also, the input array is used as workspace, so its contents

--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -185,7 +185,7 @@ private:
    CudaSort* sort;
    Kernel cpuPme;
    PmeIO* pmeio;
-    CUstream pmeStream;
+    ComputeQueue pmeQueue;
    CUevent pmeSyncEvent, paramsSyncEvent;
    CudaFFT3D* fft;
    CudaFFT3D* dispersionFft;

--- a/platforms/cuda/include/CudaQueue.h
+++ b/platforms/cuda/include/CudaQueue.h
+#ifndef OPENMM_CUDAQUEUE_H_
+#define OPENMM_CUDAQUEUE_H_
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2025 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "openmm/common/ComputeQueue.h"
+#include <cuda.h>
+namespace OpenMM {
+/**
+ * This is the CUDA implementation of the ComputeQueue interface.  It wraps a CUstream.
+ *
+ * A CudaQueue either creates its own stream (default constructor), which it destroys
+ * when it is deleted, or wraps a stream created elsewhere (CUstream constructor), which
+ * it leaves alone.
+ */
+class CudaQueue : public ComputeQueueImpl {
+public:
+    /**
+     * Create a CudaQueue that wraps an existing CUstream.  The stream is not destroyed
+     * when the CudaQueue is deleted.
+     */
+    CudaQueue(CUstream stream);
+    /**
+     * Create a CudaQueue that creates a new CUstream.  The stream is destroyed when the
+     * CudaQueue is deleted.  An OpenMMException is thrown if the stream cannot be created.
+     */
+    CudaQueue();
+    ~CudaQueue();
+    /**
+     * Copying is disabled: a copy of a CudaQueue that created its own stream would
+     * destroy that stream a second time.  Share instances through ComputeQueue instead.
+     */
+    CudaQueue(const CudaQueue&) = delete;
+    CudaQueue& operator=(const CudaQueue&) = delete;
+    /**
+     * Get the CUstream.
+     */
+    CUstream getStream() const {
+        return stream;
+    }
+private:
+    CUstream stream;
+    // True only when this object created the stream and must destroy it.
+    bool initialized;
+};
+} // namespace OpenMM
+#endif /*OPENMM_CUDAQUEUE_H_*/
--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -84,7 +84,7 @@ const int CudaContext::TileSize = sizeof(tileflags)*8;
 bool CudaContext::hasInitializedCuda = false;
 CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& tempDir, CudaPlatform::PlatformData& platformData,
-        CudaContext* originalContext) : ComputeContext(system), currentStream(0), platformData(platformData), contextIsValid(false), hasAssignedPosqCharges(false),
+        CudaContext* originalContext) : ComputeContext(system), platformData(platformData), contextIsValid(false), hasAssignedPosqCharges(false),
        pinnedBuffer(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), useBlockingSync(useBlockingSync) {
    int cudaDriverVersion;
    cuDriverGetVersion(&cudaDriverVersion);
@@ -200,6 +200,8 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
            CHECK_RESULT(cuCtxEnablePeerAccess(platformData.contexts[0]->getContext(), 0));
        }
    }
+    defaultQueue = shared_ptr<ComputeQueueImpl>(new CudaQueue(0));
+    currentQueue = defaultQueue;
    numAtoms = system.getNumParticles();
    paddedNumAtoms = TileSize*((numAtoms+TileSize-1)/TileSize);
    numAtomBlocks = (paddedNumAtoms+(TileSize-1))/TileSize;
@@ -649,16 +651,12 @@ double& CudaContext::getEnergyWorkspace() {
    return platformData.contextEnergy[contextIndex];
 }
-CUstream CudaContext::getCurrentStream() {
+ComputeQueue CudaContext::createQueue() {
-    return currentStream;
+    return shared_ptr<ComputeQueueImpl>(new CudaQueue());
-}
-void CudaContext::setCurrentStream(CUstream stream) {
-    currentStream = stream;
 }
-void CudaContext::restoreDefaultStream() {
+CUstream CudaContext::getCurrentStream() {
-    setCurrentStream(0);
+    return dynamic_cast<CudaQueue*>(currentQueue.get())->getStream();
 }
 CudaArray* CudaContext::createArray() {
@@ -697,7 +695,7 @@ void CudaContext::executeKernel(CUfunction kernel, void** arguments, int threads
    if (blockSize == -1)
        blockSize = ThreadBlockSize;
    int gridSize = std::min((threads+blockSize-1)/blockSize, numThreadBlocks);
-    CUresult result = cuLaunchKernel(kernel, gridSize, 1, 1, blockSize, 1, 1, sharedSize, currentStream, arguments, NULL);
+    CUresult result = cuLaunchKernel(kernel, gridSize, 1, 1, blockSize, 1, 1, sharedSize, getCurrentStream(), arguments, NULL);
    if (result != CUDA_SUCCESS) {
        stringstream str;
        str<<"Error invoking kernel: "<<getErrorString(result)<<" ("<<result<<")";

--- a/platforms/cuda/src/CudaFFT3D.cpp
+++ b/platforms/cuda/src/CudaFFT3D.cpp
@@ -64,16 +64,12 @@ CudaFFT3D::~CudaFFT3D() {
    }
 }
-void CudaFFT3D::setStream(CUstream stream) {
-    cufftSetStream(fftForward, stream);
-    cufftSetStream(fftBackward, stream);
-}
 void CudaFFT3D::execFFT(ArrayInterface& in, ArrayInterface& out, bool forward) {
    CUdeviceptr in2 = context.unwrap(in).getDevicePointer();
    CUdeviceptr out2 = context.unwrap(out).getDevicePointer();
    cufftResult result;
    if (forward) {
+        cufftSetStream(fftForward, context.getCurrentStream());
        if (realToComplex) {
            if (context.getUseDoublePrecision())
                result = cufftExecD2Z(fftForward, (double*) in2, (double2*) out2);
@@ -88,6 +84,7 @@ void CudaFFT3D::execFFT(ArrayInterface& in, ArrayInterface& out, bool forward) {
        }
    }
    else {
+        cufftSetStream(fftBackward, context.getCurrentStream());
        if (realToComplex) {
            if (context.getUseDoublePrecision())
                result = cufftExecZ2D(fftBackward, (double2*) in2, (double*) out2);

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -207,17 +207,17 @@ private:
 class CudaCalcNonbondedForceKernel::SyncStreamPreComputation : public CudaContext::ForcePreComputation {
 public:
-    SyncStreamPreComputation(CudaContext& cu, CUstream stream, CUevent event, int forceGroup) : cu(cu), stream(stream), event(event), forceGroup(forceGroup) {
+    SyncStreamPreComputation(CudaContext& cu, ComputeQueue queue, CUevent event, int forceGroup) : cu(cu), queue(queue), event(event), forceGroup(forceGroup) {
    }
    void computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
        if ((groups&(1<<forceGroup)) != 0) {
            cuEventRecord(event, cu.getCurrentStream());
-            cuStreamWaitEvent(stream, event, 0);
+            cuStreamWaitEvent(dynamic_cast<CudaQueue*>(queue.get())->getStream(), event, 0);
        }
    }
 private:
    CudaContext& cu;
-    CUstream stream;
+    ComputeQueue queue;
    CUevent event;
    int forceGroup;
 };
@@ -256,8 +256,6 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
        delete dispersionFft;
    if (pmeio != NULL)
        delete pmeio;
-    if (hasInitializedFFT && usePmeStream)
-        cuStreamDestroy(pmeStream);
 }
 void CudaCalcNonbondedForceKernel::initialize(const System& system, const NonbondedForce& force) {
@@ -544,16 +542,13 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
                // Prepare for doing PME on its own stream.
                if (usePmeStream) {
-                    cuStreamCreate(&pmeStream, CU_STREAM_NON_BLOCKING);
+                    pmeQueue = cu.createQueue();
-                    fft->setStream(pmeStream);
-                    if (doLJPME)
-                        dispersionFft->setStream(pmeStream);
                    CHECK_RESULT(cuEventCreate(&pmeSyncEvent, cu.getEventFlags()), "Error creating event for NonbondedForce");
                    CHECK_RESULT(cuEventCreate(&paramsSyncEvent, cu.getEventFlags()), "Error creating event for NonbondedForce");
                    int recipForceGroup = force.getReciprocalSpaceForceGroup();
                    if (recipForceGroup < 0)
                        recipForceGroup = force.getForceGroup();
-                    cu.addPreComputation(new SyncStreamPreComputation(cu, pmeStream, pmeSyncEvent, recipForceGroup));
+                    cu.addPreComputation(new SyncStreamPreComputation(cu, pmeQueue, pmeSyncEvent, recipForceGroup));
                    cu.addPostComputation(new SyncStreamPostComputation(cu, pmeSyncEvent, cu.getKernel(module, "addEnergy"), pmeEnergyBuffer, recipForceGroup));
                }
                hasInitializedFFT = true;
@@ -857,7 +852,7 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
        }
        if (usePmeStream) {
            cuEventRecord(paramsSyncEvent, cu.getCurrentStream());
-            cuStreamWaitEvent(pmeStream, paramsSyncEvent, 0);
+            cuStreamWaitEvent(dynamic_cast<CudaQueue*>(pmeQueue.get())->getStream(), paramsSyncEvent, 0);
        }
        if (hasOffsets) {
            // The Ewald self energy was computed in the kernel.
@@ -893,7 +888,7 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
    }
    if (pmeGrid1.isInitialized() && includeReciprocal) {
        if (usePmeStream)
-            cu.setCurrentStream(pmeStream);
+            cu.setCurrentQueue(pmeQueue);
        // Invert the periodic box vectors.
@@ -1015,8 +1010,8 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
            cu.executeKernel(pmeInterpolateDispersionForceKernel, interpolateArgs, cu.getNumAtoms(), 128);
        }
        if (usePmeStream) {
-            cuEventRecord(pmeSyncEvent, pmeStream);
+            cuEventRecord(pmeSyncEvent, dynamic_cast<CudaQueue*>(pmeQueue.get())->getStream());
-            cu.restoreDefaultStream();
+            cu.restoreDefaultQueue();
        }
    }

--- a/platforms/cuda/src/CudaQueue.cpp
+++ b/platforms/cuda/src/CudaQueue.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2025 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "CudaQueue.h"
+#include "CudaContext.h"
+#include "openmm/OpenMMException.h"
+using namespace OpenMM;
+// Wrap a stream that was created elsewhere.  'initialized' stays false, so the
+// destructor leaves the wrapped stream alone.
+CudaQueue::CudaQueue(CUstream stream) : stream(stream), initialized(false) {
+}
+// Create a new non-blocking stream.  'initialized' is set only once creation has
+// succeeded, which is what tells the destructor to destroy the stream.
+CudaQueue::CudaQueue() : initialized(false) {
+    CUresult status = cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING);
+    initialized = (status == CUDA_SUCCESS);
+    if (!initialized)
+        throw OpenMMException("Error creating CUDA stream: "+CudaContext::getErrorString(status));
+}
+// Destroy the stream only if this object created it.
+CudaQueue::~CudaQueue() {
+    if (!initialized)
+        return;
+    cuStreamDestroy(stream);
+}
--- a/platforms/hip/include/HipContext.h
+++ b/platforms/hip/include/HipContext.h
@@ -162,17 +162,13 @@ public:
     */
    double& getEnergyWorkspace();
    /**
-     * Get the stream currently being used for execution.
+     * Create a new ComputeQueue for use with this context.
-     */
-    hipStream_t getCurrentStream();
-    /**
-     * Set the stream to use for execution.
     */
-    void setCurrentStream(hipStream_t stream);
+    ComputeQueue createQueue();
    /**
-     * Reset the context to using the default stream for execution.
+     * Get the stream currently being used for execution.
     */
-    void restoreDefaultStream();
+    hipStream_t getCurrentStream();
    /**
     * Construct an uninitialized array of the appropriate class for this platform.  The returned
     * value should be created on the heap with the "new" operator.
@@ -632,8 +628,6 @@ private:
    std::map<std::string, std::string> compilationDefines;
    std::vector<hipModule_t> loadedModules;
    hipDevice_t device;
-    hipStream_t currentStream;
-    hipStream_t defaultStream;
    hipFunction_t clearBufferKernel;
    hipFunction_t clearTwoBuffersKernel;
    hipFunction_t clearThreeBuffersKernel;

--- a/platforms/hip/include/HipKernels.h
+++ b/platforms/hip/include/HipKernels.h
@@ -186,7 +186,7 @@ private:
    HipSort* sort;
    Kernel cpuPme;
    PmeIO* pmeio;
-    hipStream_t pmeStream;
+    ComputeQueue pmeQueue;
    hipEvent_t pmeSyncEvent, paramsSyncEvent;
    HipFFT3D* fft;
    HipFFT3D* dispersionFft;

--- a/platforms/hip/include/HipQueue.h
+++ b/platforms/hip/include/HipQueue.h
+#ifndef OPENMM_HIPQUEUE_H_
+#define OPENMM_HIPQUEUE_H_
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2025 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "openmm/common/ComputeQueue.h"
+#include <hip/hip_runtime.h>
+namespace OpenMM {
+/**
+ * This is the HIP implementation of the ComputeQueue interface.  It wraps a hipStream_t.
+ */
+class HipQueue : public ComputeQueueImpl {
+public:
+    /**
+     * Create a HipQueue that wraps an existing hipStream_t.
+     */
+    HipQueue(hipStream_t stream);
+    /**
+     * Create a HipQueue that creates a new hipStream_t.
+     */
+    HipQueue();
+    ~HipQueue();
+    /**
+     * Copying is disabled.  NOTE(review): presumably the destructor destroys a stream
+     * created by the default constructor, as CudaQueue does, in which case a copy would
+     * destroy it twice -- confirm against HipQueue.cpp.  Share instances through
+     * ComputeQueue instead.
+     */
+    HipQueue(const HipQueue&) = delete;
+    HipQueue& operator=(const HipQueue&) = delete;
+    /**
+     * Get the hipStream_t.
+     */
+    hipStream_t getStream() const {
+        return stream;
+    }
+private:
+    hipStream_t stream;
+    // NOTE(review): presumably true only when this object created the stream -- confirm
+    // against HipQueue.cpp.
+    bool initialized;
+};
+} // namespace OpenMM
+#endif /*OPENMM_HIPQUEUE_H_*/
--- a/platforms/hip/src/HipContext.cpp
+++ b/platforms/hip/src/HipContext.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2024 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2025 Stanford University and the Authors.      *
 * Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc.              *
 * Authors: Peter Eastman, Nicholas Curtis                                    *
 * Contributors:                                                              *
@@ -32,12 +32,13 @@
 #include "HipArray.h"
 #include "HipBondedUtilities.h"
 #include "HipEvent.h"
+#include "HipFFT3D.h"
 #include "HipIntegrationUtilities.h"
 #include "HipKernels.h"
 #include "HipKernelSources.h"
 #include "HipNonbondedUtilities.h"
 #include "HipProgram.h"
-#include "HipFFT3D.h"
+#include "HipQueue.h"
 #include "openmm/common/ComputeArray.h"
 #include "openmm/common/ContextSelector.h"
 #include "SHA1.h"
@@ -86,7 +87,7 @@ bool HipContext::hasInitializedHip = false;
 HipContext::HipContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& tempDir, HipPlatform::PlatformData& platformData,
-        HipContext* originalContext) : ComputeContext(system), currentStream(0), defaultStream(0), platformData(platformData), contextIsValid(false), hasAssignedPosqCharges(false),
+        HipContext* originalContext) : ComputeContext(system), platformData(platformData), contextIsValid(false), hasAssignedPosqCharges(false),
        pinnedBuffer(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL),
        useBlockingSync(useBlockingSync), supportsHardwareFloatGlobalAtomicAdd(false) {
    if (!hasInitializedHip) {
@@ -149,15 +150,15 @@ HipContext::HipContext(const System& system, int deviceIndex, bool useBlockingSy
            else
                throw OpenMMException("No compatible HIP device is available");
        }
-        CHECK_RESULT(hipStreamCreateWithFlags(&defaultStream, hipStreamNonBlocking));
+        defaultQueue = shared_ptr<ComputeQueueImpl>(new HipQueue());
    }
    else {
        isLinkedContext = true;
        this->deviceIndex = originalContext->deviceIndex;
        this->device = originalContext->device;
-        defaultStream = originalContext->defaultStream;
+        defaultQueue = originalContext->defaultQueue;
    }
-    currentStream = defaultStream;
+    currentQueue = defaultQueue;
    hipDeviceProp_t props;
    CHECK_RESULT(hipGetDeviceProperties(&props, device));
@@ -373,8 +374,6 @@ HipContext::~HipContext() {
        delete nonbonded;
    for (auto module : loadedModules)
        hipModuleUnload(module);
-    if (!isLinkedContext)
-        hipStreamDestroy(defaultStream);
    popAsCurrent();
    contextIsValid = false;
 }
@@ -676,16 +675,12 @@ double& HipContext::getEnergyWorkspace() {
    return platformData.contextEnergy[contextIndex];
 }
-hipStream_t HipContext::getCurrentStream() {
+ComputeQueue HipContext::createQueue() {
-    return currentStream;
+    return shared_ptr<ComputeQueueImpl>(new HipQueue());
-}
-void HipContext::setCurrentStream(hipStream_t stream) {
-    currentStream = stream;
 }
-void HipContext::restoreDefaultStream() {
+hipStream_t HipContext::getCurrentStream() {
-    currentStream = defaultStream;
+    return dynamic_cast<HipQueue*>(currentQueue.get())->getStream();
 }
 HipArray* HipContext::createArray() {
@@ -729,7 +724,7 @@ void HipContext::executeKernel(hipFunction_t kernel, void** arguments, int threa
    if (blockSize == -1)
        blockSize = ThreadBlockSize;
    int gridSize = std::min((threads+blockSize-1)/blockSize, numThreadBlocks);
-    hipError_t result = hipModuleLaunchKernel(kernel, gridSize, 1, 1, blockSize, 1, 1, sharedSize, currentStream, arguments, NULL);
+    hipError_t result = hipModuleLaunchKernel(kernel, gridSize, 1, 1, blockSize, 1, 1, sharedSize, getCurrentStream(), arguments, NULL);
    if (result != hipSuccess) {
        stringstream str;
        str<<"Error invoking kernel: "<<getErrorString(result)<<" ("<<result<<")";
@@ -741,7 +736,7 @@ void HipContext::executeKernelFlat(hipFunction_t kernel, void** arguments, int t
    if (blockSize == -1)
        blockSize = ThreadBlockSize;
    int gridSize = (threads+blockSize-1)/blockSize;
-    hipError_t result = hipModuleLaunchKernel(kernel, gridSize, 1, 1, blockSize, 1, 1, sharedSize, currentStream, arguments, NULL);
+    hipError_t result = hipModuleLaunchKernel(kernel, gridSize, 1, 1, blockSize, 1, 1, sharedSize, getCurrentStream(), arguments, NULL);
    if (result != hipSuccess) {
        stringstream str;
        str<<"Error invoking kernel: "<<getErrorString(result)<<" ("<<result<<")";

--- a/platforms/hip/src/HipKernels.cpp
+++ b/platforms/hip/src/HipKernels.cpp
@@ -37,6 +37,7 @@
 #include "HipIntegrationUtilities.h"
 #include "HipNonbondedUtilities.h"
 #include "HipKernelSources.h"
+#include "HipQueue.h"
 #include "SimTKOpenMMRealType.h"
 #include "SimTKOpenMMUtilities.h"
 #include <algorithm>
@@ -208,17 +209,17 @@ private:
 class HipCalcNonbondedForceKernel::SyncStreamPreComputation : public HipContext::ForcePreComputation {
 public:
-    SyncStreamPreComputation(HipContext& cu, hipStream_t stream, hipEvent_t event, int forceGroup) : cu(cu), stream(stream), event(event), forceGroup(forceGroup) {
+    SyncStreamPreComputation(HipContext& cu, ComputeQueue queue, hipEvent_t event, int forceGroup) : cu(cu), queue(queue), event(event), forceGroup(forceGroup) {
    }
    void computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
        if ((groups&(1<<forceGroup)) != 0) {
            hipEventRecord(event, cu.getCurrentStream());
-            hipStreamWaitEvent(stream, event, 0);
+            hipStreamWaitEvent(dynamic_cast<HipQueue*>(queue.get())->getStream(), event, 0);
        }
    }
 private:
    HipContext& cu;
-    hipStream_t stream;
+    ComputeQueue queue;
    hipEvent_t event;
    int forceGroup;
 };
@@ -259,7 +260,6 @@ HipCalcNonbondedForceKernel::~HipCalcNonbondedForceKernel() {
        delete pmeio;
    if (hasInitializedFFT) {
        if (usePmeStream) {
-            hipStreamDestroy(pmeStream);
            hipEventDestroy(pmeSyncEvent);
            hipEventDestroy(paramsSyncEvent);
        }
@@ -542,17 +542,16 @@ void HipCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
                // Prepare for doing PME on its own stream.
                if (usePmeStream) {
-                    CHECK_RESULT(hipStreamCreateWithFlags(&pmeStream, hipStreamNonBlocking), "Error creating stream for NonbondedForce");
+                    pmeQueue = cu.createQueue();
                    CHECK_RESULT(hipEventCreateWithFlags(&pmeSyncEvent, cu.getEventFlags()), "Error creating event for NonbondedForce");
                    CHECK_RESULT(hipEventCreateWithFlags(&paramsSyncEvent, cu.getEventFlags()), "Error creating event for NonbondedForce");
                    int recipForceGroup = force.getReciprocalSpaceForceGroup();
                    if (recipForceGroup < 0)
                        recipForceGroup = force.getForceGroup();
-                    cu.addPreComputation(new SyncStreamPreComputation(cu, pmeStream, pmeSyncEvent, recipForceGroup));
+                    cu.addPreComputation(new SyncStreamPreComputation(cu, pmeQueue, pmeSyncEvent, recipForceGroup));
                    cu.addPostComputation(new SyncStreamPostComputation(cu, pmeSyncEvent, cu.getKernel(module, "addEnergy"), pmeEnergyBuffer, recipForceGroup));
                }
-                hipStream_t fftStream = usePmeStream ? pmeStream : cu.getCurrentStream();
                fft = cu.createFFT(gridSizeX, gridSizeY, gridSizeZ, true);
                if (doLJPME)
                    dispersionFft = cu.createFFT(dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ, true);
@@ -857,7 +856,7 @@ double HipCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
        }
        if (usePmeStream) {
            hipEventRecord(paramsSyncEvent, cu.getCurrentStream());
-            hipStreamWaitEvent(pmeStream, paramsSyncEvent, 0);
+            hipStreamWaitEvent(dynamic_cast<HipQueue*>(pmeQueue.get())->getStream(), paramsSyncEvent, 0);
        }
        if (hasOffsets) {
            // The Ewald self energy was computed in the kernel.
@@ -893,7 +892,7 @@ double HipCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
    }
    if (pmeGrid1.isInitialized() && includeReciprocal) {
        if (usePmeStream)
-            cu.setCurrentStream(pmeStream);
+            cu.setCurrentQueue(pmeQueue);
        // Invert the periodic box vectors.
@@ -1015,8 +1014,8 @@ double HipCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
            cu.executeKernelFlat(pmeInterpolateDispersionForceKernel, interpolateArgs, cu.getNumAtoms(), 128);
        }
        if (usePmeStream) {
-            hipEventRecord(pmeSyncEvent, pmeStream);
+            hipEventRecord(pmeSyncEvent, dynamic_cast<HipQueue*>(pmeQueue.get())->getStream());
-            cu.restoreDefaultStream();
+            cu.restoreDefaultQueue();
        }
    }

--- a/platforms/hip/src/HipQueue.cpp
+++ b/platforms/hip/src/HipQueue.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2025 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "HipQueue.h"
+#include "HipContext.h"
+#include "openmm/OpenMMException.h"
+using namespace OpenMM;
+HipQueue::HipQueue(hipStream_t stream) : stream(stream), initialized(false) {  // wraps a caller-supplied stream; initialized stays false, so the destructor will not destroy it
+}
+HipQueue::HipQueue() : initialized(false) {  // creates a new stream for this queue; throws OpenMMException if creation fails
+    hipError_t result = hipStreamCreateWithFlags(&stream, hipStreamNonBlocking);  // the stream is requested with the hipStreamNonBlocking flag
+    if (result != hipSuccess)
+        throw OpenMMException("Error creating HIP stream: "+HipContext::getErrorString(result));  // initialized is still false at this point
+    initialized = true;  // set only after successful creation; the destructor destroys the stream when this is true
+}
+HipQueue::~HipQueue() {  // destroys the stream only if the default constructor created it
+    if (initialized)
+        hipStreamDestroy(stream);  // the returned hipError_t is not checked
+}
--- a/platforms/opencl/include/OpenCLArray.h
+++ b/platforms/opencl/include/OpenCLArray.h
@@ -187,6 +187,10 @@ public:
     * Get the context this array belongs to.
     */
    ComputeContext& getContext();
+    /**
+     * Get the queue in which to perform transfers.
+     */
+    cl::CommandQueue getQueue() const;
    /**
     * Get the OpenCL Buffer object.
     */

--- a/platforms/opencl/include/OpenCLContext.h
+++ b/platforms/opencl/include/OpenCLContext.h
@@ -203,17 +203,13 @@ public:
     */
    double& getEnergyWorkspace();
    /**
-     * Get the cl::CommandQueue currently being used for execution.
+     * Create a new ComputeQueue for use with this context.
-     */
-    cl::CommandQueue& getQueue();
-    /**
-     * Set the cl::ComandQueue to use for execution.
     */
-    void setQueue(cl::CommandQueue& queue);
+    ComputeQueue createQueue();
-    /**
+    /**
-     * Reset the context to using the default queue for execution.
+     * Get the cl::CommandQueue currently being used for execution.
     */
-    void restoreDefaultQueue();
+    cl::CommandQueue getQueue();
    /**
     * Construct an uninitialized array of the appropriate class for this platform.  The returned
     * value should be created on the heap with the "new" operator.
@@ -706,7 +702,6 @@ private:
    std::map<std::string, std::string> compilationDefines;
    cl::Context context;
    cl::Device device;
-    cl::CommandQueue defaultQueue, currentQueue;
    cl::Kernel clearBufferKernel;
    cl::Kernel clearTwoBuffersKernel;
    cl::Kernel clearThreeBuffersKernel;

--- a/platforms/opencl/include/OpenCLKernels.h
+++ b/platforms/opencl/include/OpenCLKernels.h
@@ -185,7 +185,7 @@ private:
    OpenCLArray pmeEnergyBuffer;
    OpenCLArray chargeBuffer;
    OpenCLSort* sort;
-    cl::CommandQueue pmeQueue;
+    ComputeQueue pmeQueue;
    cl::Event pmeSyncEvent;
    OpenCLFFT3D* fft;
    OpenCLFFT3D* dispersionFft;