Port changes from the main repository

Use cuCtxPushCurrent() and cuCtxPopCurrent() for selecting CUDA context https://github.com/openmm/openmm/pull/3258 Fixed uninitialized memory access https://github.com/openmm/openmm/issues/3392 https://github.com/openmm/openmm/pull/3399 Fixed potential invalid memory access See https://github.com/openmm/openmm/pull/3428 Improved temperature reporting for Drude particles https://github.com/openmm/openmm/pull/3486 https://github.com/openmm/openmm/commit/a5e42f5 Fixed race condition with multiple GPUs https://github.com/openmm/openmm/commit/6fb1c8a41edff980862750bc086f6a204eb50941 Use blocking sync when creating events https://github.com/openmm/openmm/commit/fe21d5ee4f14673a4ea38b7244991772a64ceec2 Very minor optimizations https://github.com/openmm/openmm/commit/109f6b2535da4e0c0dd88007d6ca06b4add2ce81 Use PocketFFT https://github.com/openmm/openmm/commit/1dac981a63300a2a53a7925f570995914f7163ed Improved logic for deciding when to reorder atoms https://github.com/openmm/openmm/commit/48664a1f1a4490a4dabc277757545ac070e7b898 Ensure valid atom order after loading a checkpoint https://github.com/openmm/openmm/commit/a056d5a3754e193105409afa12c9f0c9a2d972a2 Improve performance running on multiple GPUs https://github.com/openmm/openmm/commit/0c82c2647de98da5c6dab7bf7a7b8b19705aadc0 Fixed errors when running on multiple GPUs https://github.com/openmm/openmm/commit/ed9df876d43c037c08d4762721e73e5caae086d9 Optimized reducing energy https://github.com/openmm/openmm/commit/2975f44

Port changes from the main repository
Use cuCtxPushCurrent() and cuCtxPopCurrent() for selecting CUDA context https://github.com/openmm/openmm/pull/3258 Fixed uninitialized memory access https://github.com/openmm/openmm/issues/3392 https://github.com/openmm/openmm/pull/3399 Fixed potential invalid memory access See https://github.com/openmm/openmm/pull/3428 Improved temperature reporting for Drude particles https://github.com/openmm/openmm/pull/3486 https://github.com/openmm/openmm/commit/a5e42f5 Fixed race condition with multiple GPUs https://github.com/openmm/openmm/commit/6fb1c8a41edff980862750bc086f6a204eb50941 Use blocking sync when creating events https://github.com/openmm/openmm/commit/fe21d5ee4f14673a4ea38b7244991772a64ceec2 Very minor optimizations https://github.com/openmm/openmm/commit/109f6b2535da4e0c0dd88007d6ca06b4add2ce81 Use PocketFFT https://github.com/openmm/openmm/commit/1dac981a63300a2a53a7925f570995914f7163ed Improved logic for deciding when to reorder atoms https://github.com/openmm/openmm/commit/48664a1f1a4490a4dabc277757545ac070e7b898 Ensure valid atom order after loading a checkpoint https://github.com/openmm/openmm/commit/a056d5a3754e193105409afa12c9f0c9a2d972a2 Improve performance running on multiple GPUs https://github.com/openmm/openmm/commit/0c82c2647de98da5c6dab7bf7a7b8b19705aadc0 Fixed errors when running on multiple GPUs https://github.com/openmm/openmm/commit/ed9df876d43c037c08d4762721e73e5caae086d9 Optimized reducing energy https://github.com/openmm/openmm/commit/2975f44
ecc2d258 · Anton Gorenko · f717ed89 · ecc2d258 · ecc2d258 · ecc2d258
Unverified Commit ecc2d258 authored Aug 25, 2024 by Anton Gorenko
20 changed files
--- a/platforms/hip/include/HipArray.h
+++ b/platforms/hip/include/HipArray.h
@@ -9,8 +9,8 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2019 Stanford University and the Authors.      *
- * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Portions copyright (c) 2009-2022 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020-2022 Advanced Micro Devices, Inc.              *
 * Authors: Peter Eastman, Nicholas Curtis                                    *
 * Contributors:                                                              *
 *                                                                            *
@@ -56,7 +56,7 @@ public:
     * @param name              the name of the array
     */
    template <class T>
-    static HipArray* create(HipContext& context, int size, const std::string& name) {
+    static HipArray* create(HipContext& context, size_t size, const std::string& name) {
        return new HipArray(context, size, sizeof(T), name);
    }
    /**
@@ -72,7 +72,7 @@ public:
     * @param elementSize       the size of each element in bytes
     * @param name              the name of the array
     */
-    HipArray(HipContext& context, int size, int elementSize, const std::string& name);
+    HipArray(HipContext& context, size_t size, int elementSize, const std::string& name);
    ~HipArray();
    /**
     * Initialize this object.
@@ -82,7 +82,7 @@ public:
     * @param elementSize       the size of each element in bytes
     * @param name              the name of the array
     */
-    void initialize(ComputeContext& context, int size, int elementSize, const std::string& name);
+    void initialize(ComputeContext& context, size_t size, int elementSize, const std::string& name);
    /**
     * Initialize this object.  The template argument is the data type of each array element.
     *
@@ -91,13 +91,13 @@ public:
     * @param name              the name of the array
     */
    template <class T>
-    void initialize(ComputeContext& context, int size, const std::string& name) {
+    void initialize(ComputeContext& context, size_t size, const std::string& name) {
        initialize(context, size, sizeof(T), name);
    }
    /**
     * Recreate the internal storage to have a different size.
     */
-    void resize(int size);
+    void resize(size_t size);
    /**
     * Get whether this array has been initialized.
     */
@@ -107,7 +107,7 @@ public:
    /**
     * Get the number of elements in the array.
     */
-    int getSize() const {
+    size_t getSize() const {
        return size;
    }
    /**
@@ -183,7 +183,8 @@ public:
 private:
    HipContext* context;
    hipDeviceptr_t pointer;
-    int size, elementSize;
+    size_t size;
+    int elementSize;
    bool ownsMemory;
    std::string name;
 };

--- a/platforms/hip/include/HipContext.h
+++ b/platforms/hip/include/HipContext.h
@@ -39,6 +39,7 @@


 #include <map>
+#include <stack>
 #include <string>
 #include <utility>
 #define __CL_ENABLE_EXCEPTIONS
@@ -99,10 +100,20 @@ public:
        return contextIsValid;
    }
    /**
-     * Set the hipCtx_t associated with this object to be the current context.  If the context is not
+     * Set the device associated with this object to be the current device.  If the context is not
     * valid, this returns without doing anything.
     */
    void setAsCurrent();
+    /**
+     * Push the device associated with this object to be the current device.  If the context is not
+     * valid, this returns without doing anything.
+     */
+    void pushAsCurrent();
+    /**
+     * Pop the device associated with this object off the stack of contexts.  If the context is not
+     * valid, this returns without doing anything.
+     */
+    void popAsCurrent();
    /**
     * Get the hipDevice_t associated with this object.
     */
@@ -582,6 +593,10 @@ public:
     * expense of reduced simulation performance.
     */
    void flushQueue();
+    /**
+     * Get the flags that should be used when creating hipEvent_t objects.
+     */
+    unsigned int getEventFlags();
    /**
     * Get the flags that should be used when allocating pinned host memory.
     */
@@ -610,6 +625,7 @@ private:
    std::map<std::string, std::string> compilationDefines;
    std::vector<hipModule_t> loadedModules;
    hipDevice_t device;
+    std::stack<hipDevice_t> outerScopeDevices;
    hipStream_t currentStream;
    hipFunction_t clearBufferKernel;
    hipFunction_t clearTwoBuffersKernel;

--- a/platforms/hip/include/HipKernels.h
+++ b/platforms/hip/include/HipKernels.h
@@ -9,8 +9,8 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2019 Stanford University and the Authors.      *
- * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Portions copyright (c) 2008-2022 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020-2022 Advanced Micro Devices, Inc.              *
 * Authors: Peter Eastman, Nicholas Curtis                                    *
 * Contributors:                                                              *
 *                                                                            *
@@ -109,6 +109,18 @@ public:
     * @param context    the context in which to execute this kernel
     */
    void setTime(ContextImpl& context, double time);
+    /**
+     * Get the current step count
+     *
+     * @param context    the context in which to execute this kernel
+     */
+    long long getStepCount(const ContextImpl& context) const;
+    /**
+     * Set the current step count
+     *
+     * @param context    the context in which to execute this kernel
+     */
+    void setStepCount(const ContextImpl& context, long long count);
    /**
     * Get the positions of all particles.
     *
@@ -133,6 +145,15 @@ public:
     * @param velocities  a vector containg the particle velocities
     */
    void setVelocities(ContextImpl& context, const std::vector<Vec3>& velocities);
+    /**
+     * Compute velocities, shifted in time to account for a leapfrog integrator.  The shift
+     * is based on the most recently computed forces.
+     * 
+     * @param context     the context in which to execute this kernel
+     * @param timeShift   the amount by which to shift the velocities in time
+     * @param velocities  the shifted velocities are returned in this
+     */
+    void computeShiftedVelocities(ContextImpl& context, double timeShift, std::vector<Vec3>& velocities);
    /**
     * Get the current forces on all particles.
     *

--- a/platforms/hip/include/HipNonbondedUtilities.h
+++ b/platforms/hip/include/HipNonbondedUtilities.h
@@ -9,8 +9,8 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2019 Stanford University and the Authors.      *
- * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Portions copyright (c) 2009-2022 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020-2022 Advanced Micro Devices, Inc.              *
 * Authors: Peter Eastman, Nicholas Curtis                                    *
 * Contributors:                                                              *
 *                                                                            *
@@ -339,7 +339,7 @@ private:
    HipArray rebuildNeighborList;
    HipSort* blockSorter;
    hipEvent_t downloadCountEvent;
-    int* pinnedCountBuffer;
+    unsigned int* pinnedCountBuffer;
    std::vector<void*> forceArgs, findBlockBoundsArgs, sortBoxDataArgs, findInteractingBlocksArgs;
    std::vector<std::vector<int> > atomExclusions;
    std::vector<ParameterInfo> parameters;
@@ -349,8 +349,9 @@ private:
    std::map<int, std::string> groupKernelSource;
    double lastCutoff;
    bool useCutoff, usePeriodic, anyExclusions, usePadding, forceRebuildNeighborList, canUsePairList;
-    int startTileIndex, startBlockIndex, numBlocks, maxTiles, maxSinglePairs, numTilesInBatch, maxExclusions;
+    int startTileIndex, startBlockIndex, numBlocks, numTilesInBatch, maxExclusions;
    int numForceThreadBlocks, forceThreadBlockSize, findInteractingBlocksThreadBlockSize, numAtoms, groupFlags;
+    unsigned int maxTiles, maxSinglePairs, tilesAfterReorder;
    long long numTiles;
    std::string kernelSource;
 };

--- a/platforms/hip/include/HipParallelKernels.h
+++ b/platforms/hip/include/HipParallelKernels.h
@@ -91,7 +91,9 @@ private:
    long long* pinnedForceBuffer;
    hipFunction_t sumKernel;
    hipEvent_t event;
-    hipStream_t peerCopyStream;
+    std::vector<hipEvent_t> peerCopyEvent;
+    std::vector<hipEvent_t> peerCopyEventLocal;
+    std::vector<hipStream_t> peerCopyStream;
 };

 /**

--- a/platforms/hip/include/HipPlatform.h
+++ b/platforms/hip/include/HipPlatform.h
@@ -126,8 +126,8 @@ public:
    std::vector<HipContext*> contexts;
    std::vector<double> contextEnergy;
    bool hasInitializedContexts, removeCM, peerAccessSupported, useCpuPme, disablePmeStream, deterministicForces;
-    int cmMotionFrequency;
-    int stepCount, computeForceCount;
+    int cmMotionFrequency, computeForceCount;
+    long long stepCount;
    double time;
    std::map<std::string, std::string> propertyValues;
    ThreadPool threads;

--- a/platforms/hip/src/HipArray.cpp
+++ b/platforms/hip/src/HipArray.cpp
@@ -6,8 +6,8 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2012-2019 Stanford University and the Authors.      *
- * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Portions copyright (c) 2012-2022 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020-2022 Advanced Micro Devices, Inc.              *
 * Authors: Peter Eastman, Nicholas Curtis                                    *
 * Contributors:                                                              *
 *                                                                            *
@@ -27,6 +27,7 @@

 #include "HipArray.h"
 #include "HipContext.h"
+#include "openmm/common/ContextSelector.h"
 #include <iostream>
 #include <sstream>
 #include <vector>
@@ -36,13 +37,13 @@ using namespace OpenMM;
 HipArray::HipArray() : pointer(0), ownsMemory(false) {
 }

-HipArray::HipArray(HipContext& context, int size, int elementSize, const std::string& name) : pointer(0) {
+HipArray::HipArray(HipContext& context, size_t size, int elementSize, const std::string& name) : pointer(0) {
    initialize(context, size, elementSize, name);
 }

 HipArray::~HipArray() {
    if (pointer != 0 && ownsMemory && context->getContextIsValid()) {
-        context->setAsCurrent();
+        ContextSelector selector(*context);
        hipError_t result = hipFree(pointer);
        if (result != hipSuccess) {
            std::stringstream str;
@@ -52,7 +53,7 @@ HipArray::~HipArray() {
    }
 }

-void HipArray::initialize(ComputeContext& context, int size, int elementSize, const std::string& name) {
+void HipArray::initialize(ComputeContext& context, size_t size, int elementSize, const std::string& name) {
    if (this->pointer != 0)
        throw OpenMMException("HipArray has already been initialized");
    this->context = &dynamic_cast<HipContext&>(context);
@@ -60,6 +61,7 @@ void HipArray::initialize(ComputeContext& context, int size, int elementSize, co
    this->elementSize = elementSize;
    this->name = name;
    ownsMemory = true;
+    ContextSelector selector(*this->context);
    hipError_t result = hipMalloc(&pointer, size*elementSize);
    if (result != hipSuccess) {
        std::stringstream str;
@@ -68,11 +70,12 @@ void HipArray::initialize(ComputeContext& context, int size, int elementSize, co
    }
 }

-void HipArray::resize(int size) {
+void HipArray::resize(size_t size) {
    if (pointer == 0)
        throw OpenMMException("HipArray has not been initialized");
    if (!ownsMemory)
        throw OpenMMException("Cannot resize an array that does not own its storage");
+    ContextSelector selector(*context);
    hipError_t result = hipFree(pointer);
    if (result != hipSuccess) {
        std::stringstream str;

--- a/platforms/hip/src/HipContext.cpp
+++ b/platforms/hip/src/HipContext.cpp
@@ -6,8 +6,8 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2019 Stanford University and the Authors.      *
- * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Portions copyright (c) 2009-2023 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc.              *
 * Authors: Peter Eastman, Nicholas Curtis                                    *
 * Contributors:                                                              *
 *                                                                            *
@@ -39,6 +39,7 @@
 #include "HipProgram.h"
 #include "HipFFT3D.h"
 #include "openmm/common/ComputeArray.h"
+#include "openmm/common/ContextSelector.h"
 #include "SHA1.h"
 #include "openmm/Platform.h"
 #include "openmm/System.h"
@@ -184,13 +185,15 @@ HipContext::HipContext(const System& system, int deviceIndex, bool useBlockingSy
    }

    contextIsValid = true;
+    ContextSelector selector(*this);
    if (contextIndex > 0) {
        int canAccess;
        CHECK_RESULT(hipDeviceCanAccessPeer(&canAccess, getDevice(), platformData.contexts[0]->getDevice()));
        if (canAccess) {
-            platformData.contexts[0]->setAsCurrent();
+            {
+                ContextSelector selector2(*platformData.contexts[0]);
                CHECK_RESULT(hipDeviceEnablePeerAccess(getDevice(), 0));
-            setAsCurrent();
+            }
            CHECK_RESULT(hipDeviceEnablePeerAccess(platformData.contexts[0]->getDevice(), 0));
        }
    }
@@ -345,7 +348,7 @@ HipContext::HipContext(const System& system, int deviceIndex, bool useBlockingSy
 }

 HipContext::~HipContext() {
-    setAsCurrent();
+    pushAsCurrent();
    for (auto force : forces)
        delete force;
    for (auto listener : reorderListeners)
@@ -366,28 +369,29 @@ HipContext::~HipContext() {
        delete nonbonded;
    for (auto module : loadedModules)
        hipModuleUnload(module);
+    popAsCurrent();
    contextIsValid = false;
 }

 void HipContext::initialize() {
-    hipSetDevice(device);
+    ContextSelector selector(*this);
    string errorMessage = "Error initializing Context";
    int numEnergyBuffers = max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers());
    if (useDoublePrecision) {
        energyBuffer.initialize<double>(*this, numEnergyBuffers, "energyBuffer");
-        energySum.initialize<double>(*this, 1, "energySum");
+        energySum.initialize<double>(*this, multiprocessors, "energySum");
        int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
        CHECK_RESULT(hipHostMalloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), getHostMallocFlags()));
    }
    else if (useMixedPrecision) {
        energyBuffer.initialize<double>(*this, numEnergyBuffers, "energyBuffer");
-        energySum.initialize<double>(*this, 1, "energySum");
+        energySum.initialize<double>(*this, multiprocessors, "energySum");
        int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
        CHECK_RESULT(hipHostMalloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), getHostMallocFlags()));
    }
    else {
        energyBuffer.initialize<float>(*this, numEnergyBuffers, "energyBuffer");
-        energySum.initialize<float>(*this, 1, "energySum");
+        energySum.initialize<float>(*this, multiprocessors, "energySum");
        int pinnedBufferSize = max(paddedNumAtoms*6, numEnergyBuffers);
        CHECK_RESULT(hipHostMalloc(&pinnedBuffer, pinnedBufferSize*sizeof(float), getHostMallocFlags()));
    }
@@ -423,6 +427,29 @@ void HipContext::setAsCurrent() {
        hipSetDevice(device);
 }

+void HipContext::pushAsCurrent() {
+    if (contextIsValid) {
+        // Emulate cuCtxPushCurrent's behavior
+        hipDevice_t outerScopeDevice;
+        hipGetDevice(&outerScopeDevice);
+        outerScopeDevices.push(outerScopeDevice);
+        if (device != outerScopeDevice) {
+            hipSetDevice(device);
+        }
+    }
+}
+
+void HipContext::popAsCurrent() {
+    if (contextIsValid) {
+        // Emulate cuCtxPopCurrent's behavior
+        hipDevice_t outerScopeDevice = outerScopeDevices.top();
+        outerScopeDevices.pop();
+        if (outerScopeDevice != device) {
+            hipSetDevice(outerScopeDevice);
+        }
+    }
+}
+
 string HipContext::getTempFileName() const {
    stringstream tempFileName;
    tempFileName << tempDir;
@@ -784,12 +811,18 @@ double HipContext::reduceEnergy() {
    int bufferSize = energyBuffer.getSize();
    int workGroupSize = getMaxThreadBlockSize();
    void* args[] = {&energyBuffer.getDevicePointer(), &energySum.getDevicePointer(), &bufferSize, &workGroupSize};
-    executeKernel(reduceEnergyKernel, args, workGroupSize, workGroupSize, workGroupSize*energyBuffer.getElementSize());
+    executeKernel(reduceEnergyKernel, args, workGroupSize*energySum.getSize(), workGroupSize, workGroupSize*energyBuffer.getElementSize());
    energySum.download(pinnedBuffer);
-    if (getUseDoublePrecision() || getUseMixedPrecision())
-        return *((double*) pinnedBuffer);
-    else
-        return *((float*) pinnedBuffer);
+    double result = 0;
+    if (getUseDoublePrecision() || getUseMixedPrecision()) {
+        for (int i = 0; i < energySum.getSize(); i++)
+            result += ((double*) pinnedBuffer)[i];
+    }
+    else {
+        for (int i = 0; i < energySum.getSize(); i++)
+            result += ((float*) pinnedBuffer)[i];
+    }
+    return result;
 }

 void HipContext::setCharges(const vector<double>& charges) {
@@ -850,6 +883,13 @@ vector<int> HipContext::getDevicePrecedence() {
    return precedence;
 }

+unsigned int HipContext::getEventFlags() {
+    unsigned int flags = hipEventDisableTiming;
+    if (useBlockingSync)
+        flags += hipEventBlockingSync;
+    return flags;
+}
+
 unsigned int HipContext::getHostMallocFlags() {
 #ifdef WIN32
    return hipHostMallocDefault;

--- a/platforms/hip/src/HipEvent.cpp
+++ b/platforms/hip/src/HipEvent.cpp
@@ -31,7 +31,7 @@
 using namespace OpenMM;

 HipEvent::HipEvent(HipContext& context) : context(context), eventCreated(false) {
-    hipError_t result = hipEventCreateWithFlags(&event, hipEventDisableTiming);
+    hipError_t result = hipEventCreateWithFlags(&event, context.getEventFlags());
    if (result != hipSuccess)
        throw OpenMMException("Error creating HIP event:"+HipContext::getErrorString(result));
    eventCreated = true;

--- a/platforms/hip/src/HipIntegrationUtilities.cpp
+++ b/platforms/hip/src/HipIntegrationUtilities.cpp
@@ -6,8 +6,8 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2019 Stanford University and the Authors.      *
- * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Portions copyright (c) 2009-2021 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020-2021 Advanced Micro Devices, Inc.              *
 * Authors: Peter Eastman, Nicholas Curtis                                    *
 * Contributors:                                                              *
 *                                                                            *
@@ -27,6 +27,7 @@

 #include "HipIntegrationUtilities.h"
 #include "HipContext.h"
+#include "openmm/common/ContextSelector.h"

 using namespace OpenMM;
 using namespace std;
@@ -41,13 +42,13 @@ using namespace std;

 HipIntegrationUtilities::HipIntegrationUtilities(HipContext& context, const System& system) : IntegrationUtilities(context, system),
        ccmaConvergedMemory(NULL) {
-        CHECK_RESULT2(hipEventCreateWithFlags(&ccmaEvent, hipEventDisableTiming), "Error creating event for CCMA");
+        CHECK_RESULT2(hipEventCreateWithFlags(&ccmaEvent, context.getEventFlags()), "Error creating event for CCMA");
        CHECK_RESULT2(hipHostMalloc((void**) &ccmaConvergedMemory, sizeof(int), context.getHostMallocFlags()), "Error allocating pinned memory");
        CHECK_RESULT2(hipHostGetDevicePointer(&ccmaConvergedDeviceMemory, ccmaConvergedMemory, 0), "Error getting device address for pinned memory");
 }

 HipIntegrationUtilities::~HipIntegrationUtilities() {
-    context.setAsCurrent();
+    ContextSelector selector(context);
    if (ccmaConvergedMemory != NULL) {
        hipHostFree(ccmaConvergedMemory);
        hipEventDestroy(ccmaEvent);
@@ -67,6 +68,7 @@ HipArray& HipIntegrationUtilities::getStepSize() {
 }

 void HipIntegrationUtilities::applyConstraintsImpl(bool constrainVelocities, double tol) {
+    ContextSelector selector(context);
    ComputeKernel settleKernel, shakeKernel, ccmaForceKernel;
    if (constrainVelocities) {
        settleKernel = settleVelKernel;
@@ -132,6 +134,7 @@ void HipIntegrationUtilities::applyConstraintsImpl(bool constrainVelocities, dou
 }

 void HipIntegrationUtilities::distributeForcesFromVirtualSites() {
+    ContextSelector selector(context);
    if (numVsites > 0) {
        vsiteForceKernel->setArg(2, context.getLongForceBuffer());
        vsiteForceKernel->execute(numVsites);

--- a/platforms/hip/src/HipKernels.cpp
+++ b/platforms/hip/src/HipKernels.cpp
@@ -6,8 +6,8 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2019 Stanford University and the Authors.      *
- * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Portions copyright (c) 2008-2022 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020-2022 Advanced Micro Devices, Inc.              *
 * Authors: Peter Eastman, Nicholas Curtis                                    *
 * Contributors:                                                              *
 *                                                                            *
@@ -30,6 +30,7 @@
 #include "openmm/Context.h"
 #include "openmm/internal/ContextImpl.h"
 #include "openmm/internal/NonbondedForceImpl.h"
+#include "openmm/common/ContextSelector.h"
 #include "CommonKernelSources.h"
 #include "HipBondedUtilities.h"
 #include "HipExpressionUtilities.h"
@@ -59,7 +60,7 @@ void HipCalcForcesAndEnergyKernel::initialize(const System& system) {

 void HipCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForces, bool includeEnergy, int groups) {
    cu.setForcesValid(true);
-    cu.setAsCurrent();
+    ContextSelector selector(cu);
    cu.clearAutoclearBuffers();
    for (auto computation : cu.getPreComputations())
        computation->computeForceAndEnergy(includeForces, includeEnergy, groups);
@@ -72,7 +73,7 @@ void HipCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool i
 }

 double HipCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForces, bool includeEnergy, int groups, bool& valid) {
-    cu.setAsCurrent();
+    ContextSelector selector(cu);
    cu.getBondedUtilities().computeInteractions(groups);
    cu.getNonbondedUtilities().computeInteractions(groups, includeForces, includeEnergy);
    double sum = 0.0;
@@ -99,8 +100,18 @@ void HipUpdateStateDataKernel::setTime(ContextImpl& context, double time) {
        ctx->setTime(time);
 }

+long long HipUpdateStateDataKernel::getStepCount(const ContextImpl& context) const {
+    return cu.getStepCount();
+}
+
+void HipUpdateStateDataKernel::setStepCount(const ContextImpl& context, long long count) {
+    vector<HipContext*>& contexts = cu.getPlatformData().contexts;
+    for (auto ctx : contexts)
+        ctx->setStepCount(count);
+}
+
 void HipUpdateStateDataKernel::getPositions(ContextImpl& context, vector<Vec3>& positions) {
-    cu.setAsCurrent();
+    ContextSelector selector(cu);
    int numParticles = context.getSystem().getNumParticles();
    positions.resize(numParticles);
    vector<float4> posCorrection;
@@ -161,7 +172,7 @@ void HipUpdateStateDataKernel::getPositions(ContextImpl& context, vector<Vec3>&
 }

 void HipUpdateStateDataKernel::setPositions(ContextImpl& context, const vector<Vec3>& positions) {
-    cu.setAsCurrent();
+    ContextSelector selector(cu);
    const vector<int>& order = cu.getAtomIndex();
    int numParticles = context.getSystem().getNumParticles();
    if (cu.getUseDoublePrecision()) {
@@ -212,7 +223,7 @@ void HipUpdateStateDataKernel::setPositions(ContextImpl& context, const vector<V
 }

 void HipUpdateStateDataKernel::getVelocities(ContextImpl& context, vector<Vec3>& velocities) {
-    cu.setAsCurrent();
+    ContextSelector selector(cu);
    const vector<int>& order = cu.getAtomIndex();
    int numParticles = context.getSystem().getNumParticles();
    velocities.resize(numParticles);
@@ -237,7 +248,7 @@ void HipUpdateStateDataKernel::getVelocities(ContextImpl& context, vector<Vec3>&
 }

 void HipUpdateStateDataKernel::setVelocities(ContextImpl& context, const vector<Vec3>& velocities) {
-    cu.setAsCurrent();
+    ContextSelector selector(cu);
    const vector<int>& order = cu.getAtomIndex();
    int numParticles = context.getSystem().getNumParticles();
    if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
@@ -270,8 +281,12 @@ void HipUpdateStateDataKernel::setVelocities(ContextImpl& context, const vector<
    }
 }

+void HipUpdateStateDataKernel::computeShiftedVelocities(ContextImpl& context, double timeShift, vector<Vec3>& velocities) {
+    cu.getIntegrationUtilities().computeShiftedVelocities(timeShift, velocities);
+}
+
 void HipUpdateStateDataKernel::getForces(ContextImpl& context, vector<Vec3>& forces) {
-    cu.setAsCurrent();
+    ContextSelector selector(cu);
    long long* force = (long long*) cu.getPinnedBuffer();
    cu.getForce().download(force);
    const vector<int>& order = cu.getAtomIndex();
@@ -284,6 +299,7 @@ void HipUpdateStateDataKernel::getForces(ContextImpl& context, vector<Vec3>& for
 }

 void HipUpdateStateDataKernel::getEnergyParameterDerivatives(ContextImpl& context, map<string, double>& derivs) {
+    ContextSelector selector(cu);
    const vector<string>& paramDerivNames = cu.getEnergyParamDerivNames();
    int numDerivs = paramDerivNames.size();
    if (numDerivs == 0)
@@ -337,15 +353,15 @@ void HipUpdateStateDataKernel::setPeriodicBoxVectors(ContextImpl& context, const
 }

 void HipUpdateStateDataKernel::createCheckpoint(ContextImpl& context, ostream& stream) {
-    cu.setAsCurrent();
+    ContextSelector selector(cu);
    int version = 3;
    stream.write((char*) &version, sizeof(int));
    int precision = (cu.getUseDoublePrecision() ? 2 : cu.getUseMixedPrecision() ? 1 : 0);
    stream.write((char*) &precision, sizeof(int));
    double time = cu.getTime();
    stream.write((char*) &time, sizeof(double));
-    int stepCount = cu.getStepCount();
-    stream.write((char*) &stepCount, sizeof(int));
+    long long stepCount = cu.getStepCount();
+    stream.write((char*) &stepCount, sizeof(long long));
    int stepsSinceReorder = cu.getStepsSinceReorder();
    stream.write((char*) &stepsSinceReorder, sizeof(int));
    char* buffer = (char*) cu.getPinnedBuffer();
@@ -367,7 +383,7 @@ void HipUpdateStateDataKernel::createCheckpoint(ContextImpl& context, ostream& s
 }

 void HipUpdateStateDataKernel::loadCheckpoint(ContextImpl& context, istream& stream) {
-    cu.setAsCurrent();
+    ContextSelector selector(cu);
    int version;
    stream.read((char*) &version, sizeof(int));
    if (version != 3)
@@ -379,8 +395,9 @@ void HipUpdateStateDataKernel::loadCheckpoint(ContextImpl& context, istream& str
        throw OpenMMException("Checkpoint was created with a different numeric precision");
    double time;
    stream.read((char*) &time, sizeof(double));
-    int stepCount, stepsSinceReorder;
-    stream.read((char*) &stepCount, sizeof(int));
+    long long stepCount;
+    stream.read((char*) &stepCount, sizeof(long long));
+    int stepsSinceReorder;
    stream.read((char*) &stepsSinceReorder, sizeof(int));
    vector<HipContext*>& contexts = cu.getPlatformData().contexts;
    for (auto ctx : contexts) {
@@ -408,6 +425,7 @@ void HipUpdateStateDataKernel::loadCheckpoint(ContextImpl& context, istream& str
    SimTKOpenMMUtilities::loadCheckpoint(stream);
    for (auto listener : cu.getReorderListeners())
        listener->execute();
+    cu.validateAtomOrder();
 }

 class HipCalcNonbondedForceKernel::ForceInfo : public HipForceInfo {
@@ -448,7 +466,7 @@ public:
        forceTemp.initialize<float4>(cu, cu.getNumAtoms(), "PmeForce");
    }
    float* getPosq() {
-        cu.setAsCurrent();
+        ContextSelector selector(cu);
        cu.getPosq().download(posq);
        return (float*) &posq[0];
    }
@@ -532,7 +550,7 @@ private:
 };

 HipCalcNonbondedForceKernel::~HipCalcNonbondedForceKernel() {
-    cu.setAsCurrent();
+    ContextSelector selector(cu);
    if (sort != NULL)
        delete sort;
    if (fft != NULL)
@@ -551,7 +569,7 @@ HipCalcNonbondedForceKernel::~HipCalcNonbondedForceKernel() {
 }

 void HipCalcNonbondedForceKernel::initialize(const System& system, const NonbondedForce& force) {
-    cu.setAsCurrent();
+    ContextSelector selector(cu);
    int forceIndex;
    for (forceIndex = 0; forceIndex < system.getNumForces() && &system.getForce(forceIndex) != &force; ++forceIndex)
        ;
@@ -650,8 +668,14 @@ void HipCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
    hasOffsets = (force.getNumParticleParameterOffsets() > 0 || force.getNumExceptionParameterOffsets() > 0);
    if (hasOffsets)
        paramsDefines["HAS_OFFSETS"] = "1";
+    if (force.getNumParticleParameterOffsets() > 0)
+        paramsDefines["HAS_PARTICLE_OFFSETS"] = "1";
+    if (force.getNumExceptionParameterOffsets() > 0)
+        paramsDefines["HAS_EXCEPTION_OFFSETS"] = "1";
    if (usePosqCharges)
        paramsDefines["USE_POSQ_CHARGES"] = "1";
+    if (doLJPME)
+        paramsDefines["INCLUDE_LJPME_EXCEPTIONS"] = "1";
    if (nonbondedMethod == Ewald) {
        // Compute the Ewald parameters.

@@ -703,8 +727,16 @@ void HipCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
        defines["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI));
        defines["USE_EWALD"] = "1";
        defines["DO_LJPME"] = doLJPME ? "1" : "0";
-        if (doLJPME)
+        if (doLJPME) {
            defines["EWALD_DISPERSION_ALPHA"] = cu.doubleToString(dispersionAlpha);
+            double invRCut6 = pow(force.getCutoffDistance(), -6);
+            double dalphaR = dispersionAlpha * force.getCutoffDistance();
+            double dar2 = dalphaR*dalphaR;
+            double dar4 = dar2*dar2;
+            double multShift6 = -invRCut6*(1.0 - exp(-dar2) * (1.0 + dar2 + 0.5*dar4));
+            defines["INVCUT6"] = cu.doubleToString(invRCut6);
+            defines["MULTSHIFT6"] = cu.doubleToString(multShift6);
+        }
        if (cu.getContextIndex() == 0) {
            paramsDefines["INCLUDE_EWALD"] = "1";
            paramsDefines["EWALD_SELF_ENERGY_SCALE"] = cu.doubleToString(ONE_4PI_EPS0*alpha/sqrt(M_PI));
@@ -766,13 +798,6 @@ void HipCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
                    pmeDefines["RECIP_EXP_FACTOR"] = cu.doubleToString(M_PI*M_PI/(dispersionAlpha*dispersionAlpha));
                    pmeDefines["USE_LJPME"] = "1";
                    pmeDefines["CHARGE_FROM_SIGEPS"] = "1";
-                    double invRCut6 = pow(force.getCutoffDistance(), -6);
-                    double dalphaR = dispersionAlpha * force.getCutoffDistance();
-                    double dar2 = dalphaR*dalphaR;
-                    double dar4 = dar2*dar2;
-                    double multShift6 = -invRCut6*(1.0 - exp(-dar2) * (1.0 + dar2 + 0.5*dar4));
-                    defines["INVCUT6"] = cu.doubleToString(invRCut6);
-                    defines["MULTSHIFT6"] = cu.doubleToString(multShift6);
                    module = cu.createModule(HipKernelSources::vectorOps+CommonKernelSources::pme, pmeDefines);
                    pmeDispersionFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
                    pmeDispersionGridIndexKernel = cu.getKernel(module, "findAtomGridIndex");
@@ -813,8 +838,8 @@ void HipCalcNonbondedForceKernel::initialize(const System& system, const Nonbond

                if (usePmeStream) {
                    CHECK_RESULT(hipStreamCreateWithFlags(&pmeStream, hipStreamNonBlocking), "Error creating stream for NonbondedForce");
-                    CHECK_RESULT(hipEventCreateWithFlags(&pmeSyncEvent, hipEventDisableTiming), "Error creating event for NonbondedForce");
-                    CHECK_RESULT(hipEventCreateWithFlags(&paramsSyncEvent, hipEventDisableTiming), "Error creating event for NonbondedForce");
+                    CHECK_RESULT(hipEventCreateWithFlags(&pmeSyncEvent, cu.getEventFlags()), "Error creating event for NonbondedForce");
+                    CHECK_RESULT(hipEventCreateWithFlags(&paramsSyncEvent, cu.getEventFlags()), "Error creating event for NonbondedForce");
                    int recipForceGroup = force.getReciprocalSpaceForceGroup();
                    if (recipForceGroup < 0)
                        recipForceGroup = force.getForceGroup();
@@ -939,6 +964,7 @@ void HipCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
            replacements["USE_PERIODIC"] = force.getExceptionsUsePeriodicBoundaryConditions() ? "1" : "0";
            if (doLJPME)
                replacements["EWALD_DISPERSION_ALPHA"] = cu.doubleToString(dispersionAlpha);
+            if (force.getIncludeDirectSpace())
                cu.getBondedUtilities().addInteraction(atoms, cu.replaceStrings(CommonKernelSources::pmeExclusions, replacements), force.getForceGroup());
        }
    }
@@ -959,7 +985,7 @@ void HipCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
        replacements["CHARGE1"] = prefix+"charge1";
        replacements["CHARGE2"] = prefix+"charge2";
    }
-    if (hasCoulomb)
+    if (hasCoulomb && !usePosqCharges)
        cu.getNonbondedUtilities().addParameter(HipNonbondedUtilities::ParameterInfo(prefix+"charge", "real", 1, charges.getElementSize(), charges.getDevicePointer()));
    sigmaEpsilon.initialize<float2>(cu, cu.getPaddedNumAtoms(), "sigmaEpsilon");
    if (hasLJ) {
@@ -968,6 +994,7 @@ void HipCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
        cu.getNonbondedUtilities().addParameter(HipNonbondedUtilities::ParameterInfo(prefix+"sigmaEpsilon", "float", 2, sizeof(float2), sigmaEpsilon.getDevicePointer()));
    }
    source = cu.replaceStrings(source, replacements);
+    if (force.getIncludeDirectSpace())
        cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup(), true);

    // Initialize the exceptions.
@@ -993,13 +1020,14 @@ void HipCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
        map<string, string> replacements;
        replacements["APPLY_PERIODIC"] = (usePeriodic && force.getExceptionsUsePeriodicBoundaryConditions() ? "1" : "0");
        replacements["PARAMS"] = cu.getBondedUtilities().addArgument(exceptionParams.getDevicePointer(), "float4");
+        if (force.getIncludeDirectSpace())
            cu.getBondedUtilities().addInteraction(atoms, cu.replaceStrings(CommonKernelSources::nonbondedExceptions, replacements), force.getForceGroup());
    }

    // Initialize parameter offsets.

    vector<vector<float4> > particleOffsetVec(force.getNumParticles());
-    vector<vector<float4> > exceptionOffsetVec(force.getNumExceptions());
+    vector<vector<float4> > exceptionOffsetVec(numExceptions);
    for (int i = 0; i < force.getNumParticleParameterOffsets(); i++) {
        string param;
        int particle;
@@ -1020,6 +1048,9 @@ void HipCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
        int exception;
        double charge, sigma, epsilon;
        force.getExceptionParameterOffset(i, param, exception, charge, sigma, epsilon);
+        int index = exceptionIndex[exception];
+        if (index < startIndex || index >= endIndex)
+            continue;
        auto paramPos = find(paramNames.begin(), paramNames.end(), param);
        int paramIndex;
        if (paramPos == paramNames.end()) {
@@ -1028,13 +1059,11 @@ void HipCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
        }
        else
            paramIndex = paramPos-paramNames.begin();
-        exceptionOffsetVec[exceptionIndex[exception]].push_back(make_float4(charge, sigma, epsilon, paramIndex));
+        exceptionOffsetVec[index-startIndex].push_back(make_float4(charge, sigma, epsilon, paramIndex));
    }
    paramValues.resize(paramNames.size(), 0.0);
    particleParamOffsets.initialize<float4>(cu, max(force.getNumParticleParameterOffsets(), 1), "particleParamOffsets");
-    exceptionParamOffsets.initialize<float4>(cu, max(force.getNumExceptionParameterOffsets(), 1), "exceptionParamOffsets");
    particleOffsetIndices.initialize<int>(cu, cu.getPaddedNumAtoms()+1, "particleOffsetIndices");
-    exceptionOffsetIndices.initialize<int>(cu, force.getNumExceptions()+1, "exceptionOffsetIndices");
    vector<int> particleOffsetIndicesVec, exceptionOffsetIndicesVec;
    vector<float4> p, e;
    for (int i = 0; i < particleOffsetVec.size(); i++) {
@@ -1054,7 +1083,9 @@ void HipCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
        particleParamOffsets.upload(p);
        particleOffsetIndices.upload(particleOffsetIndicesVec);
    }
-    if (force.getNumExceptionParameterOffsets() > 0) {
+    exceptionParamOffsets.initialize<float4>(cu, max((int) e.size(), 1), "exceptionParamOffsets");
+    exceptionOffsetIndices.initialize<int>(cu, exceptionOffsetIndicesVec.size(), "exceptionOffsetIndices");
+    if (e.size() > 0) {
        exceptionParamOffsets.upload(e);
        exceptionOffsetIndices.upload(exceptionOffsetIndicesVec);
    }
@@ -1075,6 +1106,7 @@ void HipCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
 double HipCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal) {
    // Update particle and exception parameters.

+    ContextSelector selector(cu);
    bool paramChanged = false;
    for (int i = 0; i < paramNames.size(); i++) {
        double value = context.getParameter(paramNames[i]);
@@ -1089,7 +1121,7 @@ double HipCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
    }
    double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0);
    if (recomputeParams || hasOffsets) {
-        bool computeSelfEnergy = (includeEnergy && includeReciprocal);
+        int computeSelfEnergy = (includeEnergy && includeReciprocal);
        int numAtoms = cu.getPaddedNumAtoms();
        vector<void*> paramsArgs = {&cu.getEnergyBuffer().getDevicePointer(), &computeSelfEnergy, &globalParams.getDevicePointer(), &numAtoms,
                &baseParticleParams.getDevicePointer(), &cu.getPosq().getDevicePointer(), &charges.getDevicePointer(), &sigmaEpsilon.getDevicePointer(),
@@ -1258,7 +1290,7 @@ double HipCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
 void HipCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context, const NonbondedForce& force) {
    // Make sure the new parameters are acceptable.

-    cu.setAsCurrent();
+    ContextSelector selector(cu);
    if (force.getNumParticles() != cu.getNumAtoms())
        throw OpenMMException("updateParametersInContext: The number of particles has changed");
    if (!hasCoulomb || !hasLJ) {
@@ -1271,20 +1303,28 @@ void HipCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context,
                throw OpenMMException("updateParametersInContext: The nonbonded force kernel does not include Lennard-Jones interactions, because all epsilons were originally 0");
        }
    }
+    set<int> exceptionsWithOffsets;
+    for (int i = 0; i < force.getNumExceptionParameterOffsets(); i++) {
+        string param;
+        int exception;
+        double charge, sigma, epsilon;
+        force.getExceptionParameterOffset(i, param, exception, charge, sigma, epsilon);
+        exceptionsWithOffsets.insert(exception);
+    }
    vector<int> exceptions;
    for (int i = 0; i < force.getNumExceptions(); i++) {
        int particle1, particle2;
        double chargeProd, sigma, epsilon;
        force.getExceptionParameters(i, particle1, particle2, chargeProd, sigma, epsilon);
-        if (exceptionAtoms.size() > exceptions.size() && make_pair(particle1, particle2) == exceptionAtoms[exceptions.size()])
+        if (chargeProd != 0.0 || epsilon != 0.0 || exceptionsWithOffsets.find(i) != exceptionsWithOffsets.end())
            exceptions.push_back(i);
-        else if (chargeProd != 0.0 || epsilon != 0.0)
-            throw OpenMMException("updateParametersInContext: The set of non-excluded exceptions has changed");
    }
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*exceptions.size()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*exceptions.size()/numContexts;
    int numExceptions = endIndex-startIndex;
+    if (numExceptions != exceptionAtoms.size())
+        throw OpenMMException("updateParametersInContext: The set of non-excluded exceptions has changed");

    // Record the per-particle parameters.

@@ -1300,11 +1340,13 @@ void HipCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context,
    // Record the exceptions.

    if (numExceptions > 0) {
-        vector<vector<int> > atoms(numExceptions, vector<int>(2));
        vector<float4> baseExceptionParamsVec(numExceptions);
        for (int i = 0; i < numExceptions; i++) {
+            int particle1, particle2;
            double chargeProd, sigma, epsilon;
-            force.getExceptionParameters(exceptions[startIndex+i], atoms[i][0], atoms[i][1], chargeProd, sigma, epsilon);
+            force.getExceptionParameters(exceptions[startIndex+i], particle1, particle2, chargeProd, sigma, epsilon);
+            if (make_pair(particle1, particle2) != exceptionAtoms[i])
+                throw OpenMMException("updateParametersInContext: The set of non-excluded exceptions has changed");
            baseExceptionParamsVec[i] = make_float4(chargeProd, sigma, epsilon, 0);
        }
        baseExceptionParams.upload(baseExceptionParamsVec);

--- a/platforms/hip/src/HipNonbondedUtilities.cpp
+++ b/platforms/hip/src/HipNonbondedUtilities.cpp
@@ -6,8 +6,8 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2018 Stanford University and the Authors.      *
- * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Portions copyright (c) 2009-2022 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020-2022 Advanced Micro Devices, Inc.              *
 * Authors: Peter Eastman, Nicholas Curtis                                    *
 * Contributors:                                                              *
 *                                                                            *
@@ -439,6 +439,10 @@ void HipNonbondedUtilities::computeInteractions(int forceGroups, bool includeFor
 bool HipNonbondedUtilities::updateNeighborListSize() {
    if (!useCutoff)
        return false;
+    if (context.getStepsSinceReorder() == 0)
+        tilesAfterReorder = pinnedCountBuffer[0];
+    else if (context.getStepsSinceReorder() > 25 && pinnedCountBuffer[0] > 1.1*tilesAfterReorder)
+        context.forceReorder();
    if (pinnedCountBuffer[0] <= maxTiles && pinnedCountBuffer[1] <= maxSinglePairs)
        return false;

@@ -446,12 +450,13 @@ bool HipNonbondedUtilities::updateNeighborListSize() {
    // this from happening in the future.

    if (pinnedCountBuffer[0] > maxTiles) {
-        maxTiles = (int) (1.2*pinnedCountBuffer[0]);
-        int totalTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2;
+        maxTiles = (unsigned int) (1.2*pinnedCountBuffer[0]);
+        unsigned int numBlocks = context.getNumAtomBlocks();
+        int totalTiles = numBlocks*(numBlocks+1)/2;
        if (maxTiles > totalTiles)
            maxTiles = totalTiles;
        interactingTiles.resize(maxTiles);
-        interactingAtoms.resize(HipContext::TileSize*maxTiles);
+        interactingAtoms.resize(HipContext::TileSize*(size_t) maxTiles);
        if (forceArgs.size() > 0)
            forceArgs[7] = &interactingTiles.getDevicePointer();
        findInteractingBlocksArgs[6] = &interactingTiles.getDevicePointer();

--- a/platforms/hip/src/HipParallelKernels.cpp
+++ b/platforms/hip/src/HipParallelKernels.cpp
@@ -6,8 +6,8 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2011-2019 Stanford University and the Authors.      *
- * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Portions copyright (c) 2011-2021 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020-2021 Advanced Micro Devices, Inc.              *
 * Authors: Peter Eastman, Nicholas Curtis                                    *
 * Contributors:                                                              *
 *                                                                            *
@@ -27,6 +27,7 @@

 #include "HipParallelKernels.h"
 #include "HipKernelSources.h"
+#include "openmm/common/ContextSelector.h"

 using namespace OpenMM;
 using namespace std;
@@ -70,7 +71,7 @@ public:
    void execute() {
        // Copy coordinates over to this device and execute the kernel.

-        cu.setAsCurrent();
+        ContextSelector selector(cu);
        if (cu.getContextIndex() > 0) {
            hipStreamWaitEvent(cu.getCurrentStream(), event, 0);
            if (!cu.getPlatformData().peerAccessSupported)
@@ -94,13 +95,16 @@ private:
 class HipParallelCalcForcesAndEnergyKernel::FinishComputationTask : public HipContext::WorkTask {
 public:
    FinishComputationTask(ContextImpl& context, HipContext& cu, HipCalcForcesAndEnergyKernel& kernel,
-            bool includeForce, bool includeEnergy, int groups, double& energy, long long& completionTime, long long* pinnedMemory, HipArray& contextForces, bool& valid, int2& interactionCount) :
+            bool includeForce, bool includeEnergy, int groups, double& energy, long long& completionTime, long long* pinnedMemory, HipArray& contextForces,
+            bool& valid, int2& interactionCount, hipStream_t stream, hipEvent_t event, hipEvent_t localEvent) :
            context(context), cu(cu), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), energy(energy),
-            completionTime(completionTime), pinnedMemory(pinnedMemory), contextForces(contextForces), valid(valid), interactionCount(interactionCount) {
+            completionTime(completionTime), pinnedMemory(pinnedMemory), contextForces(contextForces), valid(valid), interactionCount(interactionCount),
+            stream(stream), event(event), localEvent(localEvent) {
    }
    void execute() {
        // Execute the kernel, then download forces.

+        ContextSelector selector(cu);
        energy += kernel.finishComputation(context, includeForce, includeEnergy, groups, valid);
        if (cu.getComputeForceCount() < 200) {
            // Record timing information for load balancing.  Since this takes time, only do it at the start of the simulation.
@@ -110,13 +114,16 @@ public:
        }
        if (includeForce) {
            if (cu.getContextIndex() > 0) {
+                hipEventRecord(localEvent, cu.getCurrentStream());
+                hipStreamWaitEvent(stream, localEvent, 0);
                int numAtoms = cu.getPaddedNumAtoms();
                if (cu.getPlatformData().peerAccessSupported) {
                    int numBytes = numAtoms*3*sizeof(long long);
                    int offset = (cu.getContextIndex()-1)*numBytes;
                    HipContext& context0 = *cu.getPlatformData().contexts[0];
-                    CHECK_RESULT(hipMemcpy(static_cast<char*>(contextForces.getDevicePointer())+offset,
-                                           cu.getForce().getDevicePointer(), numBytes, hipMemcpyDeviceToDevice), "Error copying forces");
+                    CHECK_RESULT(hipMemcpyAsync(static_cast<char*>(contextForces.getDevicePointer())+offset,
+                                           cu.getForce().getDevicePointer(), numBytes, hipMemcpyDeviceToDevice, stream), "Error copying forces");
+                    hipEventRecord(event, stream);
                }
                else
                    cu.getForce().download(&pinnedMemory[(cu.getContextIndex()-1)*numAtoms*3]);
@@ -140,6 +147,9 @@ private:
    HipArray& contextForces;
    bool& valid;
    int2& interactionCount;
+    hipStream_t stream;
+    hipEvent_t event;
+    hipEvent_t localEvent;
 };

 HipParallelCalcForcesAndEnergyKernel::HipParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, HipPlatform::PlatformData& data) :
@@ -150,20 +160,25 @@ HipParallelCalcForcesAndEnergyKernel::HipParallelCalcForcesAndEnergyKernel(strin
 }

 HipParallelCalcForcesAndEnergyKernel::~HipParallelCalcForcesAndEnergyKernel() {
-    data.contexts[0]->setAsCurrent();
+    ContextSelector selector(*data.contexts[0]);
    if (pinnedPositionBuffer != NULL)
        hipHostFree(pinnedPositionBuffer);
    if (pinnedForceBuffer != NULL)
        hipHostFree(pinnedForceBuffer);
    hipEventDestroy(event);
-    hipStreamDestroy(peerCopyStream);
+    for (int i = 0; i < peerCopyEvent.size(); i++)
+        hipEventDestroy(peerCopyEvent[i]);
+    for (int i = 0; i < peerCopyEventLocal.size(); i++)
+        hipEventDestroy(peerCopyEventLocal[i]);
+    for (int i = 0; i < peerCopyStream.size(); i++)
+        hipStreamDestroy(peerCopyStream[i]);
    if (interactionCounts != NULL)
        hipHostFree(interactionCounts);
 }

 void HipParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
    HipContext& cu = *data.contexts[0];
-    cu.setAsCurrent();
+    ContextSelector selector(cu);
    hipModule_t module = cu.createModule(HipKernelSources::parallel);
    sumKernel = cu.getKernel(module, "sumForces");
    int numContexts = data.contexts.size();
@@ -171,14 +186,25 @@ void HipParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
        getKernel(i).initialize(system);
    for (int i = 0; i < numContexts; i++)
        contextNonbondedFractions[i] = 1/(double) numContexts;
-    CHECK_RESULT(hipEventCreateWithFlags(&event, 0), "Error creating event");
-    CHECK_RESULT(hipStreamCreateWithFlags(&peerCopyStream, hipStreamNonBlocking), "Error creating stream");
+    CHECK_RESULT(hipEventCreateWithFlags(&event, cu.getEventFlags()), "Error creating event");
+    peerCopyEvent.resize(numContexts);
+    peerCopyEventLocal.resize(numContexts);
+    peerCopyStream.resize(numContexts);
+    for (int i = 0; i < numContexts; i++) {
+        CHECK_RESULT(hipEventCreateWithFlags(&peerCopyEvent[i], cu.getEventFlags()), "Error creating event");
+        CHECK_RESULT(hipStreamCreateWithFlags(&peerCopyStream[i], hipStreamNonBlocking), "Error creating stream");
+    }
+    for (int i = 0; i < numContexts; i++) {
+        HipContext& cuLocal = *data.contexts[i];
+        ContextSelector selectorLocal(cuLocal);
+        CHECK_RESULT(hipEventCreateWithFlags(&peerCopyEventLocal[i], cu.getEventFlags()), "Error creating event");
+    }
    CHECK_RESULT(hipHostMalloc((void**) &interactionCounts, numContexts*sizeof(int2), 0), "Error creating interaction counts buffer");
 }

 void HipParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
    HipContext& cu = *data.contexts[0];
-    cu.setAsCurrent();
+    ContextSelector selector(cu);
    if (!contextForces.isInitialized()) {
        contextForces.initialize<long long>(cu, 3*(data.contexts.size()-1)*cu.getPaddedNumAtoms(), "contextForces");
        CHECK_RESULT(hipHostMalloc((void**) &pinnedForceBuffer, 3*(data.contexts.size()-1)*cu.getPaddedNumAtoms()*sizeof(long long), hipHostMallocPortable), "Error allocating pinned memory");
@@ -194,36 +220,44 @@ void HipParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context
    else {
        int numBytes = cu.getPosq().getSize()*cu.getPosq().getElementSize();
        hipEventRecord(event, cu.getCurrentStream());
-        hipStreamWaitEvent(peerCopyStream, event, 0);
-        for (int i = 1; i < (int) data.contexts.size(); i++)
+        for (int i = 1; i < (int) data.contexts.size(); i++) {
+            hipStreamWaitEvent(peerCopyStream[i], event, 0);
            CHECK_RESULT(hipMemcpyAsync(
                data.contexts[i]->getPosq().getDevicePointer(),
                cu.getPosq().getDevicePointer(), numBytes,
-                hipMemcpyDeviceToDevice, peerCopyStream), "Error copying positions");
-        hipEventRecord(event, peerCopyStream);
+                hipMemcpyDeviceToDevice, peerCopyStream[i]), "Error copying positions");
+            hipEventRecord(peerCopyEvent[i], peerCopyStream[i]);
+        }
    }
    for (int i = 0; i < (int) data.contexts.size(); i++) {
        data.contextEnergy[i] = 0.0;
        HipContext& cu = *data.contexts[i];
        ComputeContext::WorkThread& thread = cu.getWorkThread();
-        thread.addTask(new BeginComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, pinnedPositionBuffer, event, interactionCounts[i]));
+        hipEvent_t waitEvent = (cu.getPlatformData().peerAccessSupported ? peerCopyEvent[i] : event);
+        thread.addTask(new BeginComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, pinnedPositionBuffer, waitEvent, interactionCounts[i]));
    }
+    data.syncContexts();
 }

 double HipParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups, bool& valid) {
    for (int i = 0; i < (int) data.contexts.size(); i++) {
        HipContext& cu = *data.contexts[i];
        ComputeContext::WorkThread& thread = cu.getWorkThread();
-        thread.addTask(new FinishComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, data.contextEnergy[i], completionTimes[i], pinnedForceBuffer, contextForces, valid, interactionCounts[i]));
+        thread.addTask(new FinishComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, data.contextEnergy[i], completionTimes[i],
+                pinnedForceBuffer, contextForces, valid, interactionCounts[i], peerCopyStream[i], peerCopyEvent[i], peerCopyEventLocal[i]));
    }
    data.syncContexts();
+    HipContext& cu = *data.contexts[0];
+    ContextSelector selector(cu);
+    if (cu.getPlatformData().peerAccessSupported)
+        for (int i = 1; i < data.contexts.size(); i++)
+            hipStreamWaitEvent(cu.getCurrentStream(), peerCopyEvent[i], 0);
    double energy = 0.0;
    for (int i = 0; i < (int) data.contextEnergy.size(); i++)
        energy += data.contextEnergy[i];
    if (includeForce && valid) {
        // Sum the forces from all devices.

-        HipContext& cu = *data.contexts[0];
        if (!cu.getPlatformData().peerAccessSupported)
            contextForces.upload(pinnedForceBuffer, false);
        int bufferSize = 3*cu.getPaddedNumAtoms();

--- a/platforms/hip/src/kernels/utilities.hip
+++ b/platforms/hip/src/kernels/utilities.hip
@@ -80,7 +80,7 @@ __global__ void reduceEnergy(const mixed* __restrict__ energyBuffer, mixed* __re
    extern __shared__ mixed tempBuffer[];
    const unsigned int thread = threadIdx.x;
    mixed sum = 0;
-    for (unsigned int index = thread; index < bufferSize; index += blockDim.x)
+    for (unsigned int index = blockDim.x*blockIdx.x+threadIdx.x; index < bufferSize; index += blockDim.x*gridDim.x)
        sum += energyBuffer[index];
    tempBuffer[thread] = sum;
    for (int i = 1; i < workGroupSize; i *= 2) {
@@ -89,7 +89,7 @@ __global__ void reduceEnergy(const mixed* __restrict__ energyBuffer, mixed* __re
            tempBuffer[thread] += tempBuffer[thread+i];
    }
    if (thread == 0)
-        *result = tempBuffer[0];
+        result[blockIdx.x] = tempBuffer[0];
 }

 /**

--- a/platforms/hip/tests/HipTests.h
+++ b/platforms/hip/tests/HipTests.h
@@ -41,4 +41,6 @@ OpenMM::HipPlatform platform;
 void initializeTests(int argc, char* argv[]) {
    if (argc > 1)
        platform.setPropertyDefaultValue("Precision", std::string(argv[1]));
+    if (argc > 2)
+        platform.setPropertyDefaultValue("DeviceIndex", std::string(argv[2]));
 }
--- a/platforms/hip/tests/TestHipFFT3D.cpp
+++ b/platforms/hip/tests/TestHipFFT3D.cpp
@@ -6,8 +6,8 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2011-2016 Stanford University and the Authors.      *
- * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Portions copyright (c) 2011-2022 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020-2022 Advanced Micro Devices, Inc.              *
 * Authors: Peter Eastman, Nicholas Curtis                                    *
 * Contributors:                                                              *
 *                                                                            *
@@ -38,12 +38,16 @@
 #include "HipArray.h"
 #include "HipContext.h"
 #include "HipFFT3D.h"
-#include "fftpack.h"
 #include "sfmt/SFMT.h"
 #include "openmm/System.h"
+#include <complex>
 #include <iostream>
 #include <cmath>
 #include <set>
+#ifdef _MSC_VER
+  #define POCKETFFT_NO_VECTORS
+#endif
+#include "pocketfft_hdronly.h"

 using namespace OpenMM;
 using namespace std;
@@ -67,19 +71,19 @@ void testTransform(bool realToComplex, int xsize, int ysize, int zsize, double e
    OpenMM_SFMT::SFMT sfmt;
    init_gen_rand(0, sfmt);
    vector<Real2> original(xsize*ysize*zsize);
-    vector<t_complex> reference(original.size());
+    vector<complex<double>> reference(original.size());
    for (int i = 0; i < (int) original.size(); i++) {
        Real2 value;
        value.x = (float) genrand_real2(sfmt);
        value.y = (float) genrand_real2(sfmt);
        original[i] = value;
-        reference[i] = t_complex(value.x, value.y);
+        reference[i] = complex<double>(value.x, value.y);
    }
    for (int i = 0; i < (int) reference.size(); i++) {
        if (realToComplex)
-            reference[i] = t_complex(i%2 == 0 ? original[i/2].x : original[i/2].y, 0);
+            reference[i] = complex<double>(i%2 == 0 ? original[i/2].x : original[i/2].y, 0);
        else
-            reference[i] = t_complex(original[i].x, original[i].y);
+            reference[i] = complex<double>(original[i].x, original[i].y);
    }
    HipArray grid1(context, original.size(), sizeof(Real2), "grid1");
    HipArray grid2(context, original.size(), sizeof(Real2), "grid2");
@@ -91,19 +95,21 @@ void testTransform(bool realToComplex, int xsize, int ysize, int zsize, double e
    fft.execFFT(true);
    vector<Real2> result;
    grid2.download(result);
-    fftpack_t plan;
-    fftpack_init_3d(&plan, xsize, ysize, zsize);
-    fftpack_exec_3d(plan, FFTPACK_FORWARD, &reference[0], &reference[0]);
+    vector<size_t> shape = {(size_t) xsize, (size_t) ysize, (size_t) zsize};
+    vector<size_t> axes = {0, 1, 2};
+    vector<ptrdiff_t> stride = {(ptrdiff_t) (ysize*zsize*sizeof(complex<double>)),
+                                (ptrdiff_t) (zsize*sizeof(complex<double>)),
+                                (ptrdiff_t) sizeof(complex<double>)};
+    pocketfft::c2c(shape, stride, stride, axes, true, reference.data(), reference.data(), 1.0);
    int outputZSize = (realToComplex ? zsize/2+1 : zsize);
    for (int x = 0; x < xsize; x++)
        for (int y = 0; y < ysize; y++)
            for (int z = 0; z < outputZSize; z++) {
                int index1 = x*ysize*zsize + y*zsize + z;
                int index2 = x*ysize*outputZSize + y*outputZSize + z;
-                ASSERT_EQUAL_TOL(reference[index1].re, result[index2].x, 1e-3 * eps);
-                ASSERT_EQUAL_TOL(reference[index1].im, result[index2].y, 1e-3 * eps);
+                ASSERT_EQUAL_TOL(reference[index1].real(), result[index2].x, 1e-3 * eps);
+                ASSERT_EQUAL_TOL(reference[index1].imag(), result[index2].y, 1e-3 * eps);
            }
-    fftpack_destroy(plan);

    // Perform a backward transform and see if we get the original values.


--- a/platforms/hip/tests/TestHipNonbondedForce.cpp
+++ b/platforms/hip/tests/TestHipNonbondedForce.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2015 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2021 Stanford University and the Authors.      *
 * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
 * Authors: Peter Eastman, Nicholas Curtis                                    *
 * Contributors:                                                              *
@@ -50,12 +50,18 @@ void testParallelComputation(NonbondedForce::NonbondedMethod method) {
    vector<Vec3> positions(numParticles);
    for (int i = 0; i < numParticles; i++)
        positions[i] = Vec3(5*genrand_real2(sfmt), 5*genrand_real2(sfmt), 5*genrand_real2(sfmt));
+    force->addGlobalParameter("scale", 0.5);
    for (int i = 0; i < numParticles; ++i)
        for (int j = 0; j < i; ++j) {
            Vec3 delta = positions[i]-positions[j];
-            if (delta.dot(delta) < 0.1)
+            if (delta.dot(delta) < 0.1) {
                force->addException(i, j, 0, 1, 0);
            }
+            else if (delta.dot(delta) < 0.2) {
+                int index = force->addException(i, j, 0.5, 1, 1.0);
+                force->addExceptionParameterOffset("scale", index, 0.5, 0.4, 0.3);
+            }
+        }
    
    // Create two contexts, one with a single device and one with two devices.

@@ -179,6 +185,7 @@ void runPlatformTests() {
    testParallelComputation(NonbondedForce::NoCutoff);
    testParallelComputation(NonbondedForce::Ewald);
    testParallelComputation(NonbondedForce::PME);
+    testParallelComputation(NonbondedForce::LJPME);
    testReordering();
    testDeterministicForces();
    if (canRunHugeTest())

--- a/platforms/hip/tests/TestHipRandom.cpp
+++ b/platforms/hip/tests/TestHipRandom.cpp
@@ -60,6 +60,7 @@ void testGaussian() {
            platform.getPropertyDefaultValue(HipPlatform::HipDisablePmeStream()), "false", 1, NULL);
    HipContext& context = *platformData.contexts[0];
    context.initialize();
+    context.setAsCurrent();
    context.getIntegrationUtilities().initRandomNumberGenerator(0);
    HipArray& random = context.getIntegrationUtilities().getRandom();
    context.getIntegrationUtilities().prepareRandomNumbers(random.getSize());

--- a/platforms/hip/tests/TestHipSort.cpp
+++ b/platforms/hip/tests/TestHipSort.cpp
@@ -70,6 +70,7 @@ void verifySorting(vector<float> array, bool uniform) {
            platform.getPropertyDefaultValue(HipPlatform::HipDisablePmeStream()), "false", 1, NULL);
    HipContext& context = *platformData.contexts[0];
    context.initialize();
+    context.setAsCurrent();
    HipArray data(context, array.size(), 4, "sortData");
    data.upload(array);
    HipSort sort(context, new SortTrait(), array.size(), uniform);

--- a/plugins/amoeba/platforms/hip/src/AmoebaHipKernels.cpp
+++ b/plugins/amoeba/platforms/hip/src/AmoebaHipKernels.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2020 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2021 Stanford University and the Authors.      *
 * Portions copyright (c) 2021 Advanced Micro Devices, Inc.                   *
 * Authors: Peter Eastman, Mark Friedrichs                                    *
 * Contributors:                                                              *
@@ -29,6 +29,7 @@
  #define _USE_MATH_DEFINES // Needed to get M_PI
 #endif
 #include "AmoebaHipKernels.h"
+#include "openmm/common/ContextSelector.h"
 #include "openmm/internal/ContextImpl.h"
 #include "openmm/internal/AmoebaGeneralizedKirkwoodForceImpl.h"
 #include "openmm/internal/AmoebaMultipoleForceImpl.h"
@@ -56,7 +57,7 @@ using namespace std;
 * -------------------------------------------------------------------------- */

 HipCalcAmoebaMultipoleForceKernel::~HipCalcAmoebaMultipoleForceKernel() {
-    cc.setAsCurrent();
+    ContextSelector selector(cc);
    if (fft != NULL)
        delete fft;
 }
@@ -64,6 +65,7 @@ HipCalcAmoebaMultipoleForceKernel::~HipCalcAmoebaMultipoleForceKernel() {
 void HipCalcAmoebaMultipoleForceKernel::initialize(const System& system, const AmoebaMultipoleForce& force) {
    CommonCalcAmoebaMultipoleForceKernel::initialize(system, force);
    if (usePME) {
+        ContextSelector selector(cc);
        HipArray& grid1 = cu.unwrap(pmeGrid1);
        HipArray& grid2 = cu.unwrap(pmeGrid2);
        fft = cu.createFFT(gridSizeX, gridSizeY, gridSizeZ, false, cu.getCurrentStream(), grid1, grid2);
@@ -79,7 +81,7 @@ void HipCalcAmoebaMultipoleForceKernel::computeFFT(bool forward) {
 * -------------------------------------------------------------------------- */

 HipCalcHippoNonbondedForceKernel::~HipCalcHippoNonbondedForceKernel() {
-    cc.setAsCurrent();
+    ContextSelector selector(cc);
    if (sort != NULL)
        delete sort;
    if (fft != NULL)
@@ -91,6 +93,7 @@ HipCalcHippoNonbondedForceKernel::~HipCalcHippoNonbondedForceKernel() {
 void HipCalcHippoNonbondedForceKernel::initialize(const System& system, const HippoNonbondedForce& force) {
    CommonCalcHippoNonbondedForceKernel::initialize(system, force);
    if (usePME) {
+        ContextSelector selector(cc);
        sort = new HipSort(cu, new SortTrait(), cc.getNumAtoms());
        HipArray& grid1 = cu.unwrap(pmeGrid1);
        HipArray& grid2 = cu.unwrap(pmeGrid2);