Merge pull request #4632 from ex-rzr/make-hip-standard-platform

HIP platform

Merge pull request #4632 from ex-rzr/make-hip-standard-platform
HIP platform
3b8df952 · Peter Eastman · GitHub · 5ce6a85d · 28fb2918 · 3b8df952
Unverified Commit 3b8df952 authored Sep 05, 2024 by Peter Eastman Committed by GitHub Sep 05, 2024
20 changed files
--- a/platforms/hip/include/HipParameterSet.h
+++ b/platforms/hip/include/HipParameterSet.h
+#ifndef OPENMM_HIPPARAMETERSET_H_
+#define OPENMM_HIPPARAMETERSET_H_
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009-2019 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "HipContext.h"
+#include "HipNonbondedUtilities.h"
+#include "openmm/common/ComputeParameterSet.h"
+
+namespace OpenMM {
+
+class HipNonbondedUtilities;
+
+/**
+ * This class exists for backward compatibility.  For most purposes you can use
+ * ComputeParameterSet directly instead.
+ */
+
+class OPENMM_EXPORT_COMMON HipParameterSet : public ComputeParameterSet {
+public:
+    /**
+     * Create a HipParameterSet.
+     *
+     * @param context          the context for which to create the parameter set
+     * @param numParameters    the number of parameters for each object
+     * @param numObjects       the number of objects to store parameter values for
+     * @param name             the name of the parameter set
+     * @param bufferPerParameter  if true, a separate buffer is created for each parameter.  If false
+     *                            (the default), multiple parameters may be combined into a single buffer.
+     * @param useDoublePrecision  whether values should be stored as double precision (true) or
+     *                            single precision (false, the default)
+     */
+    HipParameterSet(HipContext& context, int numParameters, int numObjects, const std::string& name, bool bufferPerParameter=false, bool useDoublePrecision=false);
+    /**
+     * Get a set of HipNonbondedUtilities::ParameterInfo objects which describe the Buffers
+     * containing the data.
+     *
+     * The returned reference refers to this object's own private vector and is not const,
+     * so callers can modify the vector through it.
+     */
+    std::vector<HipNonbondedUtilities::ParameterInfo>& getBuffers() {
+        return buffers;
+    }
+private:
+    std::vector<HipNonbondedUtilities::ParameterInfo> buffers; // exposed by reference through getBuffers()
+};
+
+} // namespace OpenMM
+
+#endif /*OPENMM_HIPPARAMETERSET_H_*/
--- a/platforms/hip/include/HipPlatform.h
+++ b/platforms/hip/include/HipPlatform.h
+#ifndef OPENMM_HIPPLATFORM_H_
+#define OPENMM_HIPPLATFORM_H_
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "openmm/Platform.h"
+#include "openmm/System.h"
+#include "openmm/internal/ThreadPool.h"
+#include "openmm/common/windowsExportCommon.h"
+
+namespace OpenMM {
+
+class HipContext;
+
+/**
+ * This Platform subclass uses HIP implementations of the OpenMM kernels.
+ */
+
+class OPENMM_EXPORT_COMMON HipPlatform : public Platform {
+public:
+    class PlatformData; // nested class; its definition follows this class
+    HipPlatform();
+    const std::string& getName() const { // the name reported by this platform is always "HIP"
+        static const std::string name = "HIP";
+        return name;
+    }
+    double getSpeed() const; // NOTE(review): this and the methods below presumably override Platform virtuals - base class not visible here
+    bool supportsDoublePrecision() const;
+    const std::string& getPropertyValue(const Context& context, const std::string& property) const;
+    void setPropertyValue(Context& context, const std::string& property, const std::string& value) const;
+    void contextCreated(ContextImpl& context, const std::map<std::string, std::string>& properties) const;
+    void linkedContextCreated(ContextImpl& context, ContextImpl& originalContext) const;
+    void contextDestroyed(ContextImpl& context) const;
+    /**
+     * This is the name of the parameter for selecting which HIP device or devices to use.
+     * The key string is "DeviceIndex".
+     */
+    static const std::string& HipDeviceIndex() {
+        static const std::string key = "DeviceIndex";
+        return key;
+    }
+    /**
+     * This is the name of the parameter that reports the HIP device or devices being used.
+     * The key string is "DeviceName".
+     */
+    static const std::string& HipDeviceName() {
+        static const std::string key = "DeviceName";
+        return key;
+    }
+    /**
+     * This is the name of the parameter for selecting whether HIP should sync or spin loop while waiting for results.
+     * The key string is "UseBlockingSync".
+     */
+    static const std::string& HipUseBlockingSync() {
+        static const std::string key = "UseBlockingSync";
+        return key;
+    }
+    /**
+     * This is the name of the parameter for selecting what numerical precision to use.
+     * The key string is "Precision".
+     */
+    static const std::string& HipPrecision() {
+        static const std::string key = "Precision";
+        return key;
+    }
+    /**
+     * This is the name of the parameter for selecting whether to use the CPU based PME calculation.
+     * The key string is "UseCpuPme".
+     */
+    static const std::string& HipUseCpuPme() {
+        static const std::string key = "UseCpuPme";
+        return key;
+    }
+    /**
+     * This is the name of the parameter for specifying the path to the directory for creating temporary files.
+     * The key string is "TempDirectory".
+     */
+    static const std::string& HipTempDirectory() {
+        static const std::string key = "TempDirectory";
+        return key;
+    }
+    /**
+     * This is the name of the parameter for selecting whether to disable use of a separate stream for PME.
+     * The key string is "DisablePmeStream".
+     */
+    static const std::string& HipDisablePmeStream() {
+        static const std::string key = "DisablePmeStream";
+        return key;
+    }
+    /**
+     * This is the name of the parameter for requesting that force computations be fully deterministic.
+     * The key string is "DeterministicForces".
+     */
+    static const std::string& HipDeterministicForces() {
+        static const std::string key = "DeterministicForces";
+        return key;
+    }
+};
+
+class OPENMM_EXPORT_COMMON HipPlatform::PlatformData { // NOTE(review): member order determines layout and initialization order - do not reorder casually
+public:
+    PlatformData(ContextImpl* context, const System& system, const std::string& deviceIndexProperty, const std::string& blockingProperty, const std::string& precisionProperty,
+            const std::string& cpuPmeProperty, const std::string& tempProperty,
+            const std::string& pmeStreamProperty, const std::string& deterministicForcesProperty, int numThreads, ContextImpl* originalContext); // the *Property arguments are passed as strings; how they are parsed is not visible in this header
+    ~PlatformData();
+    void initializeContexts(const System& system);
+    void syncContexts();
+    ContextImpl* context; // raw pointer; ownership is not visible from this header
+    std::vector<HipContext*> contexts; // the HipContext constructor uses contexts.size() as the index of the context being created
+    std::vector<double> contextEnergy; // NOTE(review): presumably one entry per element of contexts - confirm in the implementation
+    bool hasInitializedContexts, removeCM, peerAccessSupported, useCpuPme, disablePmeStream, deterministicForces;
+    int cmMotionFrequency, computeForceCount;
+    long long stepCount;
+    double time;
+    std::map<std::string, std::string> propertyValues; // NOTE(review): presumably keyed by the property names exposed by HipPlatform - confirm
+    ThreadPool threads;
+};
+
+} // namespace OpenMM
+
+#endif /*OPENMM_HIPPLATFORM_H_*/
--- a/platforms/hip/include/HipProgram.h
+++ b/platforms/hip/include/HipProgram.h
+#ifndef OPENMM_HIPPROGRAM_H_
+#define OPENMM_HIPPROGRAM_H_
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2019 Stanford University and the Authors.           *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "openmm/common/ComputeProgram.h"
+#include "HipContext.h"
+
+namespace OpenMM {
+
+/**
+ * This is the HIP implementation of the ComputeProgramImpl interface.
+ */
+
+class HipProgram : public ComputeProgramImpl {
+public:
+    /**
+     * Create a new HipProgram.
+     *
+     * @param context      the context this program belongs to
+     * @param module       the compiled module
+     */
+    HipProgram(HipContext& context, hipModule_t module);
+    /**
+     * Create a ComputeKernel for one of the kernels in this program.
+     *
+     * @param name    the name of the kernel to get
+     * @return a ComputeKernel for the named kernel
+     */
+    ComputeKernel createKernel(const std::string& name);
+private:
+    HipContext& context; // stored as a reference, so the context must outlive this program
+    hipModule_t module; // NOTE(review): whether this class unloads the module is not visible here (no destructor is declared)
+};
+
+} // namespace OpenMM
+
+#endif /*OPENMM_HIPPROGRAM_H_*/
--- a/platforms/hip/include/HipSort.h
+++ b/platforms/hip/include/HipSort.h
+#ifndef __OPENMM_HIPSORT_H__
+#define __OPENMM_HIPSORT_H__
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2010-2018 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc.              *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "HipArray.h"
+#include "openmm/common/windowsExportCommon.h"
+#include "HipContext.h"
+
+namespace OpenMM {
+
+/**
+ * This class sorts arrays of values.  It supports any type of values, not just scalars,
+ * so long as an appropriate sorting key can be defined by which to sort them.
+ *
+ * The sorting behavior is specified by a "trait" class that defines the type of data to
+ * sort and the key for sorting it.  Here is an example of a trait class for
+ * sorting floats:
+ *
+ * class FloatTrait : public HipSort::SortTrait {
+ *     int getDataSize() const {return 4;}
+ *     int getKeySize() const {return 4;}
+ *     const char* getDataType() const {return "float";}
+ *     const char* getKeyType() const {return "float";}
+ *     const char* getMinKey() const {return "-3.40282e+38f";}
+ *     const char* getMaxKey() const {return "3.40282e+38f";}
+ *     const char* getMaxValue() const {return "3.40282e+38f";}
+ *     const char* getSortKey() const {return "value";}
+ * };
+ *
+ * The algorithm used is a bucket sort, followed by a bitonic sort within each bucket
+ * (in local memory when possible, in global memory otherwise).  This is similar to
+ * the algorithm described in
+ *
+ * Shifu Chen, Jing Qin, Yongming Xie, Junping Zhao, and Pheng-Ann Heng.  "An Efficient
+ * Sorting Algorithm with CUDA"  Journal of the Chinese Institute of Engineers, 32(7),
+ * pp. 915-921 (2009)
+ *
+ * but with many modifications and simplifications.  In particular, this algorithm
+ * involves much less communication between host and device, which is critical to get
+ * good performance with the array sizes we typically work with (10,000 to 100,000
+ * elements).
+ */
+
+class OPENMM_EXPORT_COMMON HipSort {
+public:
+    class SortTrait; // defined after this class
+    /**
+     * Create a HipSort object for sorting data of a particular type.
+     *
+     * @param context    the context in which to perform calculations
+     * @param trait      a SortTrait defining the type of data to sort.  It should have been allocated
+     *                   on the heap with the "new" operator.  This object takes over ownership of it,
+     *                   and deletes it when the HipSort is deleted.
+     * @param length     the length of the arrays this object will be used to sort.
+     * @param uniform    whether the input data is expected to follow a uniform or nonuniform
+     *                   distribution.  This argument is used only as a hint.
+     *
+     * NOTE(review): this class declares a destructor but no copy constructor or copy
+     * assignment operator.  Because it takes ownership of the trait, copying a HipSort
+     * looks unsafe (both copies would hold the same trait pointer) - confirm that no
+     * caller copies HipSort objects.
+     */
+    HipSort(HipContext& context, SortTrait* trait, unsigned int length, bool uniform=true);
+    ~HipSort();
+    /**
+     * Sort an array.
+     *
+     * @param data    the array to sort
+     */
+    void sort(HipArray& data);
+private:
+    HipContext& context; // stored as a reference, so the context must outlive this object
+    SortTrait* trait; // owned by this object (see the constructor documentation)
+    HipArray counters;
+    HipArray dataRange;
+    HipArray bucketOfElement;
+    HipArray offsetInBucket;
+    HipArray bucketOffset;
+    HipArray buckets;
+    hipFunction_t shortListKernel, shortList2Kernel, computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel;
+    unsigned int dataLength, rangeKernelBlocks, rangeKernelSize, positionsKernelSize, sortKernelSize;
+    bool isShortList, uniform;
+};
+
+/**
+ * A subclass of SortTrait defines the type of the values to sort, and the key by which
+ * to sort them.  Every method is pure virtual, so a subclass must implement all of them.
+ * Types, limits and the key expression are returned as C strings (for example "float"
+ * or "-3.40282e+38f" in the FloatTrait example in the HipSort documentation).
+ */
+class HipSort::SortTrait {
+public:
+    virtual ~SortTrait() {
+    }
+    /**
+     * Get the size of each data value in bytes.
+     */
+    virtual int getDataSize() const = 0;
+    /**
+     * Get the size of each key value in bytes.
+     */
+    virtual int getKeySize() const = 0;
+    /**
+     * Get the data type of the values to sort, as a type name string.
+     */
+    virtual const char* getDataType() const = 0;
+    /**
+     * Get the data type of the sorting key, as a type name string.
+     */
+    virtual const char* getKeyType() const = 0;
+    /**
+     * Get the minimum value a key can take, written as a source-code literal.
+     */
+    virtual const char* getMinKey() const = 0;
+    /**
+     * Get the maximum value a key can take, written as a source-code literal.
+     */
+    virtual const char* getMaxKey() const = 0;
+    /**
+     * Get a value whose key is guaranteed to equal getMaxKey().
+     */
+    virtual const char* getMaxValue() const = 0;
+    /**
+     * Get the HIP code to select the key from the data value.
+     */
+    virtual const char* getSortKey() const = 0;
+};
+
+
+} // namespace OpenMM
+
+#endif // __OPENMM_HIPSORT_H__
--- a/platforms/hip/src/HipArray.cpp
+++ b/platforms/hip/src/HipArray.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2012-2022 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc.              *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "HipArray.h"
+#include "HipContext.h"
+#include "openmm/common/ContextSelector.h"
+#include <iostream>
+#include <sstream>
+#include <vector>
+
+using namespace OpenMM;
+
+// Create an uninitialized array: it has no device pointer and owns no memory
+// until initialize() is called.
+HipArray::HipArray()
+        : pointer(nullptr), ownsMemory(false) {
+}
+
+// Create an array and allocate its device memory right away by delegating to initialize().
+HipArray::HipArray(HipContext& context, size_t size, int elementSize, const std::string& name)
+        : pointer(nullptr) {
+    initialize(context, size, elementSize, name);
+}
+
+/**
+ * Release the device memory held by this array.
+ *
+ * Nothing is freed if the array was never initialized, does not own its memory, or
+ * its context is no longer valid.
+ *
+ * A failure of hipFree() is reported on stderr instead of being thrown.  An exception
+ * leaving a destructor while the stack is already unwinding calls std::terminate(),
+ * and since C++11 destructors are implicitly noexcept unless declared otherwise, so
+ * throwing here would abort the program rather than report the error.
+ */
+HipArray::~HipArray() {
+    // The order of these checks matters: context is only dereferenced once we know
+    // the array was initialized and owns its memory.
+    if (pointer == 0 || !ownsMemory || !context->getContextIsValid())
+        return;
+    ContextSelector selector(*context);
+    hipError_t result = hipFree(pointer);
+    if (result != hipSuccess)
+        std::cerr<<"Error deleting array "<<name<<": "<<HipContext::getErrorString(result)<<" ("<<result<<")"<<std::endl;
+}
+
+/**
+ * Allocate device memory for this array and record its metadata.
+ *
+ * @param context      the context to allocate in; it is dynamic_cast to HipContext
+ * @param size         the number of elements
+ * @param elementSize  the size of each element in bytes
+ * @param name         the name used for this array in error messages
+ * @throws OpenMMException if the array is already initialized or hipMalloc() fails
+ */
+void HipArray::initialize(ComputeContext& context, size_t size, int elementSize, const std::string& name) {
+    if (pointer != 0)
+        throw OpenMMException("HipArray has already been initialized");
+    HipContext& hipContext = dynamic_cast<HipContext&>(context);
+    this->context = &hipContext;
+    this->size = size;
+    this->elementSize = elementSize;
+    this->name = name;
+    ownsMemory = true;
+    ContextSelector selector(hipContext);
+    const hipError_t status = hipMalloc(&pointer, size*elementSize);
+    if (status == hipSuccess)
+        return;
+    std::stringstream message;
+    message<<"Error creating array "<<name<<": "<<HipContext::getErrorString(status)<<" ("<<status<<")";
+    throw OpenMMException(message.str());
+}
+
+/**
+ * Change the number of elements by freeing the current allocation and allocating a
+ * new one through initialize().  The old contents are not copied.
+ *
+ * @param size  the new number of elements
+ * @throws OpenMMException if the array is uninitialized, does not own its storage,
+ *         or the free/allocate calls fail
+ */
+void HipArray::resize(size_t size) {
+    if (pointer == 0)
+        throw OpenMMException("HipArray has not been initialized");
+    if (!ownsMemory)
+        throw OpenMMException("Cannot resize an array that does not own its storage");
+    ContextSelector selector(*context);
+    const hipError_t status = hipFree(pointer);
+    if (status != hipSuccess) {
+        std::stringstream message;
+        message<<"Error deleting array "<<name<<": "<<HipContext::getErrorString(status)<<" ("<<status<<")";
+        throw OpenMMException(message.str());
+    }
+    // initialize() refuses to run while pointer is non-null, so clear it first.
+    pointer = nullptr;
+    initialize(*context, size, elementSize, name);
+}
+
+// Get the context this array was initialized with.
+ComputeContext& HipArray::getContext() {
+    HipContext& owner = *context;
+    return owner;
+}
+
+/**
+ * Copy values from host memory into a contiguous range of this array.
+ *
+ * @param data      host buffer holding elements*elementSize bytes to upload
+ * @param offset    index of the first array element to overwrite
+ * @param elements  the number of elements to copy
+ * @param blocking  if true, wait on the context's current stream before returning
+ * @throws OpenMMException if the array is uninitialized, the range is invalid, or a HIP call fails
+ */
+void HipArray::uploadSubArray(const void* data, int offset, int elements, bool blocking) {
+    if (pointer == 0)
+        throw OpenMMException("HipArray has not been initialized");
+    // Reject negative counts explicitly: a negative element count could otherwise pass
+    // the range check (when offset+elements >= 0) and turn into a huge byte count.
+    // The sum is formed in size_t so it cannot overflow int.
+    if (offset < 0 || elements < 0 || static_cast<size_t>(offset)+static_cast<size_t>(elements) > getSize())
+        throw OpenMMException("uploadSubArray: data exceeds range of array");
+    // Compute byte quantities in size_t; int arithmetic overflows for arrays beyond 2 GiB.
+    const size_t byteOffset = static_cast<size_t>(offset)*elementSize;
+    const size_t byteCount = static_cast<size_t>(elements)*elementSize;
+    hipError_t result;
+    result = hipMemcpyAsync(reinterpret_cast<char*>(pointer)+byteOffset, const_cast<void*>(data), byteCount, hipMemcpyHostToDevice, context->getCurrentStream());
+    if (blocking && result == hipSuccess)
+        result = hipStreamSynchronize(context->getCurrentStream());
+    if (result != hipSuccess) {
+        std::stringstream str;
+        str<<"Error uploading array "<<name<<": "<<HipContext::getErrorString(result)<<" ("<<result<<")";
+        throw OpenMMException(str.str());
+    }
+}
+
+/**
+ * Copy the entire contents of this array from the device into host memory.
+ *
+ * @param data      host buffer that receives size*elementSize bytes
+ * @param blocking  if true, wait on the context's current stream before returning
+ * @throws OpenMMException if the array is uninitialized or a HIP call fails
+ */
+void HipArray::download(void* data, bool blocking) const {
+    if (pointer == 0)
+        throw OpenMMException("HipArray has not been initialized");
+    hipError_t status = hipMemcpyAsync(data, pointer, size*elementSize, hipMemcpyDeviceToHost, context->getCurrentStream());
+    if (status == hipSuccess && blocking)
+        status = hipStreamSynchronize(context->getCurrentStream());
+    if (status == hipSuccess)
+        return;
+    std::stringstream message;
+    message<<"Error downloading array "<<name<<": "<<HipContext::getErrorString(status)<<" ("<<status<<")";
+    throw OpenMMException(message.str());
+}
+
+/**
+ * Copy the contents of this array into another device array.  The copy is queued on
+ * the context's current stream; this method does not synchronize the stream.
+ *
+ * @param dest  the destination array; its size and element size must match this array's
+ * @throws OpenMMException if the array is uninitialized, the sizes differ, or the copy fails
+ */
+void HipArray::copyTo(ArrayInterface& dest) const {
+    if (pointer == 0)
+        throw OpenMMException("HipArray has not been initialized");
+    const bool sameShape = (dest.getSize() == size && dest.getElementSize() == elementSize);
+    if (!sameShape)
+        throw OpenMMException("Error copying array "+name+" to "+dest.getName()+": The destination array does not match the size of the array");
+    HipArray& target = context->unwrap(dest);
+    const hipError_t status = hipMemcpyAsync(target.getDevicePointer(), pointer, size*elementSize, hipMemcpyDeviceToDevice, context->getCurrentStream());
+    if (status == hipSuccess)
+        return;
+    std::stringstream message;
+    message<<"Error copying array "<<name<<" to "<<dest.getName()<<": "<<HipContext::getErrorString(status)<<" ("<<status<<")";
+    throw OpenMMException(message.str());
+}
--- a/platforms/hip/src/HipContext.cpp
+++ b/platforms/hip/src/HipContext.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009-2024 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc.              *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#ifdef WIN32
+  #define _USE_MATH_DEFINES // Needed to get M_PI
+#endif
+#include "HipContext.h"
+#include "HipArray.h"
+#include "HipBondedUtilities.h"
+#include "HipEvent.h"
+#include "HipIntegrationUtilities.h"
+#include "HipKernels.h"
+#include "HipKernelSources.h"
+#include "HipNonbondedUtilities.h"
+#include "HipProgram.h"
+#include "HipFFT3D.h"
+#include "openmm/common/ComputeArray.h"
+#include "openmm/common/ContextSelector.h"
+#include "SHA1.h"
+#include "openmm/MonteCarloFlexibleBarostat.h"
+#include "openmm/Platform.h"
+#include "openmm/System.h"
+#include "openmm/VirtualSite.h"
+#include "HipExpressionUtilities.h"
+#include "openmm/internal/ContextImpl.h"
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <set>
+#include <sstream>
+#include <stack>
+#include <thread>
+#include <typeinfo>
+#include <sys/stat.h>
+#include <hip/hiprtc.h>
+
+
+#define CHECK_RESULT(result) CHECK_RESULT2(result, errorMessage);
+#define CHECK_RESULT2(result, prefix) \
+    if (result != hipSuccess) { \
+        std::stringstream m; \
+        m<<prefix<<": "<<getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
+        throw OpenMMException(m.str());\
+    }
+
+#define HIPRTC_CHECK_RESULT(result, prefix) \
+    if (result != HIPRTC_SUCCESS) { \
+        stringstream m; \
+        m<<prefix<<": "<<hiprtcGetErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
+        throw OpenMMException(m.str());\
+    }
+
+using namespace OpenMM;
+using namespace std;
+
+const int HipContext::ThreadBlockSize = 64; // NOTE(review): presumably the default number of threads per block for kernel launches - confirm against the launch code
+const int HipContext::TileSize = 32; // NOTE(review): presumably the atom tile width used by the nonbonded kernels - confirm
+bool HipContext::hasInitializedHip = false; // set to true by the HipContext constructor after hipInit(0) succeeds, so hipInit runs only once
+
+
+HipContext::HipContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& tempDir, HipPlatform::PlatformData& platformData,
+        HipContext* originalContext) : ComputeContext(system), currentStream(0), defaultStream(0), platformData(platformData), contextIsValid(false), hasAssignedPosqCharges(false),
+        pinnedBuffer(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL),
+        useBlockingSync(useBlockingSync), supportsHardwareFloatGlobalAtomicAdd(false) { // Selects a device and stream, derives launch parameters, allocates per-atom arrays, builds the utility kernels and fills in the default compilation defines.
+    if (!hasInitializedHip) { // Only the first context to get here calls hipInit().
+        CHECK_RESULT2(hipInit(0), "Error initializing HIP");
+        hasInitializedHip = true;
+    }
+    if (precision == "single") { // Translate the precision name into the two precision flags; any other name is rejected below.
+        useDoublePrecision = false;
+        useMixedPrecision = false;
+    }
+    else if (precision == "mixed") {
+        useDoublePrecision = false;
+        useMixedPrecision = true;
+    }
+    else if (precision == "double") {
+        useDoublePrecision = true;
+        useMixedPrecision = false;
+    }
+    else
+        throw OpenMMException("Illegal value for Precision: "+precision);
+    char* cacheVariable = getenv("OPENMM_CACHE_DIR"); // When set, OPENMM_CACHE_DIR replaces tempDir as the location of the compiled-kernel cache.
+    cacheDir = (cacheVariable == NULL ? tempDir : string(cacheVariable));
+#ifdef WIN32
+    this->tempDir = tempDir+"\\";
+    cacheDir = cacheDir+"\\";
+#else
+    this->tempDir = tempDir+"/";
+    cacheDir = cacheDir+"/";
+#endif
+    contextIndex = platformData.contexts.size(); // Number of contexts already registered in platformData at this point.
+    string errorMessage = "Error initializing Context";
+    if (originalContext == NULL) { // Standalone context: choose a device and create our own stream.
+        isLinkedContext = false;
+        int numDevices;
+        CHECK_RESULT(hipGetDeviceCount(&numDevices));
+        if (deviceIndex < -1 || deviceIndex >= numDevices)
+            throw OpenMMException("Illegal value for DeviceIndex: "+intToString(deviceIndex));
+        // A deviceIndex of -1 means "choose automatically": candidates come from getDevicePrecedence(); otherwise only the requested device is tried.
+        vector<int> devicePrecedence;
+        if (deviceIndex == -1) {
+            devicePrecedence = getDevicePrecedence();
+        } else {
+            devicePrecedence.push_back(deviceIndex);
+        }
+        // Use the first candidate for which hipSetDevice() succeeds; -1 marks "none found yet".
+        this->deviceIndex = -1;
+        for (int i = 0; i < static_cast<int>(devicePrecedence.size()); i++) {
+            int trialDeviceIndex = devicePrecedence[i];
+            CHECK_RESULT(hipDeviceGet(&device, trialDeviceIndex));
+            // try setting device
+            if (hipSetDevice(device) == hipSuccess) {
+                this->deviceIndex = trialDeviceIndex;
+                break;
+            }
+
+        }
+        if (this->deviceIndex == -1) {
+            if (deviceIndex != -1)
+                throw OpenMMException("The requested HIP device could not be loaded");
+            else
+                throw OpenMMException("No compatible HIP device is available");
+        }
+        CHECK_RESULT(hipStreamCreateWithFlags(&defaultStream, hipStreamNonBlocking));
+    }
+    else {
+        isLinkedContext = true; // Linked context: reuse the device and default stream of originalContext (the destructor then leaves the stream alone).
+        this->deviceIndex = originalContext->deviceIndex;
+        this->device = originalContext->device;
+        defaultStream = originalContext->defaultStream;
+    }
+    currentStream = defaultStream;
+
+    hipDeviceProp_t props;
+    CHECK_RESULT(hipGetDeviceProperties(&props, device));
+
+    // set device properties
+    this->simdWidth = props.warpSize;
+    this->sharedMemPerBlock = props.sharedMemPerBlock;
+
+    gpuArchitecture = props.gcnArchName;
+    // HIP-TODO: find a good value here
+    int numThreadBlocksPerComputeUnit = 6;
+
+    // GPUs starting from CDNA1 and RDNA3 support atomic add for floats (global_atomic_add_f32),
+    // which can be used in PME. Older GPUs use fixed point charge spreading instead.
+    this->supportsHardwareFloatGlobalAtomicAdd = true;
+    if (gpuArchitecture.find("gfx900") == 0 ||
+        gpuArchitecture.find("gfx906") == 0 ||
+        gpuArchitecture.find("gfx10") == 0) { // find(...) == 0 is a "starts with" test on the architecture name.
+        this->supportsHardwareFloatGlobalAtomicAdd = false;
+    }
+
+    contextIsValid = true; // From here on setAsCurrent()/pushAsCurrent()/popAsCurrent() actually switch devices.
+    ContextSelector selector(*this);
+    if (contextIndex > 0) { // Not the first context: try to enable peer access in both directions with context 0.
+        int canAccess;
+        CHECK_RESULT(hipDeviceCanAccessPeer(&canAccess, getDevice(), platformData.contexts[0]->getDevice()));
+        if (canAccess) {
+            {
+                ContextSelector selector2(*platformData.contexts[0]);
+                hipError_t result = hipDeviceEnablePeerAccess(getDevice(), 0);
+                if (result != hipErrorPeerAccessAlreadyEnabled) { // Peer access that is already enabled is not treated as an error.
+                    CHECK_RESULT(result);
+                }
+            }
+            hipError_t result = hipDeviceEnablePeerAccess(platformData.contexts[0]->getDevice(), 0);
+            if (result != hipErrorPeerAccessAlreadyEnabled) {
+                CHECK_RESULT(result);
+            }
+        }
+    }
+    numAtoms = system.getNumParticles();
+    paddedNumAtoms = TileSize*((numAtoms+TileSize-1)/TileSize); // Round the atom count up to a multiple of TileSize.
+    numAtomBlocks = (paddedNumAtoms+(TileSize-1))/TileSize;
+    CHECK_RESULT(hipDeviceGetAttribute(&multiprocessors, hipDeviceAttributeMultiprocessorCount, device));
+    // For RDNA GPUs hipDeviceAttributeMultiprocessorCount means WGP (work-group processors, two compute units), not CUs.
+    if (simdWidth == 32)
+        multiprocessors *= 2;
+    numThreadBlocks = numThreadBlocksPerComputeUnit*multiprocessors; // Upper bound on the grid size used by executeKernel().
+
+    compilationDefines["USE_HIP"] = "1";
+    if (simdWidth == 32)
+        compilationDefines["AMD_RDNA"] = "1";
+    if (useDoublePrecision) { // Double precision: positions and velocities are both double4.
+        posq.initialize<double4>(*this, paddedNumAtoms, "posq");
+        velm.initialize<double4>(*this, paddedNumAtoms, "velm");
+        compilationDefines["USE_DOUBLE_PRECISION"] = "1";
+        compilationDefines["make_real2"] = "make_double2";
+        compilationDefines["make_real3"] = "make_double3";
+        compilationDefines["make_real4"] = "make_double4";
+        compilationDefines["make_mixed2"] = "make_double2";
+        compilationDefines["make_mixed3"] = "make_double3";
+        compilationDefines["make_mixed4"] = "make_double4";
+    }
+    else if (useMixedPrecision) { // Mixed precision: float4 positions plus a float4 correction array, double4 velocities.
+        posq.initialize<float4>(*this, paddedNumAtoms, "posq");
+        posqCorrection.initialize<float4>(*this, paddedNumAtoms, "posqCorrection");
+        velm.initialize<double4>(*this, paddedNumAtoms, "velm");
+        compilationDefines["USE_MIXED_PRECISION"] = "1";
+        compilationDefines["make_real2"] = "make_float2";
+        compilationDefines["make_real3"] = "make_float3";
+        compilationDefines["make_real4"] = "make_float4";
+        compilationDefines["make_mixed2"] = "make_double2";
+        compilationDefines["make_mixed3"] = "make_double3";
+        compilationDefines["make_mixed4"] = "make_double4";
+    }
+    else { // Single precision: everything is float4.
+        posq.initialize<float4>(*this, paddedNumAtoms, "posq");
+        velm.initialize<float4>(*this, paddedNumAtoms, "velm");
+        compilationDefines["make_real2"] = "make_float2";
+        compilationDefines["make_real3"] = "make_float3";
+        compilationDefines["make_real4"] = "make_float4";
+        compilationDefines["make_mixed2"] = "make_float2";
+        compilationDefines["make_mixed3"] = "make_float3";
+        compilationDefines["make_mixed4"] = "make_float4";
+    }
+    force.initialize<long long>(*this, paddedNumAtoms*3, "force"); // Three long long values per padded atom.
+    posCellOffsets.resize(paddedNumAtoms, mm_int4(0, 0, 0, 0));
+    atomIndexDevice.initialize<int>(*this, paddedNumAtoms, "atomIndex");
+    atomIndex.resize(paddedNumAtoms);
+    for (int i = 0; i < paddedNumAtoms; ++i) // Start from the identity ordering.
+        atomIndex[i] = i;
+    atomIndexDevice.upload(atomIndex);
+
+    // Create utility kernels that are used in multiple places.
+
+    hipModule_t utilities = createModule(HipKernelSources::vectorOps+HipKernelSources::utilities);
+    clearBufferKernel = getKernel(utilities, "clearBuffer");
+    clearTwoBuffersKernel = getKernel(utilities, "clearTwoBuffers");
+    clearThreeBuffersKernel = getKernel(utilities, "clearThreeBuffers");
+    clearFourBuffersKernel = getKernel(utilities, "clearFourBuffers");
+    clearFiveBuffersKernel = getKernel(utilities, "clearFiveBuffers");
+    clearSixBuffersKernel = getKernel(utilities, "clearSixBuffers");
+    reduceEnergyKernel = getKernel(utilities, "reduceEnergy");
+    setChargesKernel = getKernel(utilities, "setCharges");
+
+    // Set defines based on the requested precision.
+
+    compilationDefines["SQRT"] = useDoublePrecision ? "sqrt" : "__fsqrt_rn"; // These defines are added after the utilities module above was compiled, so only later modules see them.
+    compilationDefines["RSQRT"] = useDoublePrecision ? "rsqrt" : "__frsqrt_rn";
+    compilationDefines["RECIP(x)"] = useDoublePrecision ? "(1.0/(x))" : "(1.0f/(x))";
+    compilationDefines["EXP"] = useDoublePrecision ? "exp" : "__expf";
+    compilationDefines["LOG"] = useDoublePrecision ? "log" : "__logf";
+    compilationDefines["POW"] = useDoublePrecision ? "pow" : "powf";
+    compilationDefines["COS"] = useDoublePrecision ? "cos" : "cosf";
+    compilationDefines["SIN"] = useDoublePrecision ? "sin" : "sinf";
+    compilationDefines["TAN"] = useDoublePrecision ? "tan" : "tanf";
+    compilationDefines["ACOS"] = useDoublePrecision ? "acos" : "acosf";
+    compilationDefines["ASIN"] = useDoublePrecision ? "asin" : "asinf";
+    compilationDefines["ATAN"] = useDoublePrecision ? "atan" : "atanf";
+    compilationDefines["ERF"] = useDoublePrecision ? "erf" : "erff";
+    compilationDefines["ERFC"] = useDoublePrecision ? "erfc" : "erfcf";
+
+    // Set defines for applying periodic boundary conditions.
+
+    Vec3 boxVectors[3];
+    system.getDefaultPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
+    boxIsTriclinic = (boxVectors[0][1] != 0.0 || boxVectors[0][2] != 0.0 ||
+                      boxVectors[1][0] != 0.0 || boxVectors[1][2] != 0.0 ||
+                      boxVectors[2][0] != 0.0 || boxVectors[2][1] != 0.0); // Any non-zero off-diagonal component makes the default box triclinic.
+    for (int i = 0; i < system.getNumForces(); i++) // A MonteCarloFlexibleBarostat in the System forces the triclinic code path even for a rectangular default box.
+        if (dynamic_cast<const MonteCarloFlexibleBarostat*>(&system.getForce(i)) != NULL)
+            boxIsTriclinic = true;
+    if (boxIsTriclinic) { // Triclinic macros: wrap along z, then y, then x, subtracting whole box vectors.
+        compilationDefines["APPLY_PERIODIC_TO_DELTA(delta)"] =
+            "{"
+            "real scale3 = floor(delta.z*invPeriodicBoxSize.z+0.5f); \\\n"
+            "delta.x -= scale3*periodicBoxVecZ.x; \\\n"
+            "delta.y -= scale3*periodicBoxVecZ.y; \\\n"
+            "delta.z -= scale3*periodicBoxVecZ.z; \\\n"
+            "real scale2 = floor(delta.y*invPeriodicBoxSize.y+0.5f); \\\n"
+            "delta.x -= scale2*periodicBoxVecY.x; \\\n"
+            "delta.y -= scale2*periodicBoxVecY.y; \\\n"
+            "real scale1 = floor(delta.x*invPeriodicBoxSize.x+0.5f); \\\n"
+            "delta.x -= scale1*periodicBoxVecX.x;}";
+        compilationDefines["APPLY_PERIODIC_TO_POS(pos)"] =
+            "{"
+            "real scale3 = floor(pos.z*invPeriodicBoxSize.z); \\\n"
+            "pos.x -= scale3*periodicBoxVecZ.x; \\\n"
+            "pos.y -= scale3*periodicBoxVecZ.y; \\\n"
+            "pos.z -= scale3*periodicBoxVecZ.z; \\\n"
+            "real scale2 = floor(pos.y*invPeriodicBoxSize.y); \\\n"
+            "pos.x -= scale2*periodicBoxVecY.x; \\\n"
+            "pos.y -= scale2*periodicBoxVecY.y; \\\n"
+            "real scale1 = floor(pos.x*invPeriodicBoxSize.x); \\\n"
+            "pos.x -= scale1*periodicBoxVecX.x;}";
+        compilationDefines["APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)"] =
+            "{"
+            "real scale3 = floor((pos.z-center.z)*invPeriodicBoxSize.z+0.5f); \\\n"
+            "pos.x -= scale3*periodicBoxVecZ.x; \\\n"
+            "pos.y -= scale3*periodicBoxVecZ.y; \\\n"
+            "pos.z -= scale3*periodicBoxVecZ.z; \\\n"
+            "real scale2 = floor((pos.y-center.y)*invPeriodicBoxSize.y+0.5f); \\\n"
+            "pos.x -= scale2*periodicBoxVecY.x; \\\n"
+            "pos.y -= scale2*periodicBoxVecY.y; \\\n"
+            "real scale1 = floor((pos.x-center.x)*invPeriodicBoxSize.x+0.5f); \\\n"
+            "pos.x -= scale1*periodicBoxVecX.x;}";
+    }
+    else { // Rectangular macros: each axis is wrapped independently using periodicBoxSize.
+        compilationDefines["APPLY_PERIODIC_TO_DELTA(delta)"] =
+            "{"
+            "delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; \\\n"
+            "delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; \\\n"
+            "delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;}";
+        compilationDefines["APPLY_PERIODIC_TO_POS(pos)"] =
+            "{"
+            "pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x; \\\n"
+            "pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y; \\\n"
+            "pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;}";
+        compilationDefines["APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)"] =
+            "{"
+            "pos.x -= floor((pos.x-center.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; \\\n"
+            "pos.y -= floor((pos.y-center.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; \\\n"
+            "pos.z -= floor((pos.z-center.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;}";
+    }
+
+    // Create utilities objects.
+
+    bonded = new HipBondedUtilities(*this); // These four objects are deleted by the destructor.
+    nonbonded = new HipNonbondedUtilities(*this);
+    integration = new HipIntegrationUtilities(*this, system);
+    expression = new HipExpressionUtilities(*this);
+}
+
+/**
+ * Release everything owned by this context: registered forces, reorder listeners and
+ * pre/post computations, the pinned host buffer, the utilities objects, every loaded
+ * module and, unless this is a linked context, the default stream.  The context's device
+ * is made current for the duration of the cleanup.
+ */
+HipContext::~HipContext() {
+    pushAsCurrent();
+
+    // Objects registered with the context are owned by it.
+    for (auto it = forces.begin(); it != forces.end(); ++it)
+        delete *it;
+    for (auto it = reorderListeners.begin(); it != reorderListeners.end(); ++it)
+        delete *it;
+    for (auto it = preComputations.begin(); it != preComputations.end(); ++it)
+        delete *it;
+    for (auto it = postComputations.begin(); it != postComputations.end(); ++it)
+        delete *it;
+
+    if (pinnedBuffer != NULL)
+        hipHostFree(pinnedBuffer);
+
+    // Deleting a null pointer is a no-op, so no explicit checks are needed here.
+    delete integration;
+    delete expression;
+    delete bonded;
+    delete nonbonded;
+
+    for (auto it = loadedModules.begin(); it != loadedModules.end(); ++it)
+        hipModuleUnload(*it);
+
+    // A linked context borrowed its stream from the original context and must not destroy it.
+    if (!isLinkedContext)
+        hipStreamDestroy(defaultStream);
+
+    popAsCurrent();
+    contextIsValid = false;
+}
+
+/**
+ * Finish setting up the context once all forces have been registered: allocate the energy
+ * buffers and the pinned host staging buffer, upload the initial contents of velm
+ * (zero velocity, inverse mass in the w component), register the buffers that are cleared
+ * automatically, and initialize the bonded and nonbonded utilities.
+ *
+ * @throws OpenMMException if the pinned host buffer cannot be allocated
+ */
+void HipContext::initialize() {
+    ContextSelector selector(*this);
+    string errorMessage = "Error initializing Context";
+    int numEnergyBuffers = max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers());
+    // Double and mixed precision both accumulate energies and store velocities as doubles.
+    const bool useDoubleStorage = (useDoublePrecision || useMixedPrecision);
+    if (useDoubleStorage) {
+        energyBuffer.initialize<double>(*this, numEnergyBuffers, "energyBuffer");
+        energySum.initialize<double>(*this, multiprocessors, "energySum");
+        int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
+        CHECK_RESULT(hipHostMalloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), getHostMallocFlags()));
+    }
+    else {
+        energyBuffer.initialize<float>(*this, numEnergyBuffers, "energyBuffer");
+        energySum.initialize<float>(*this, multiprocessors, "energySum");
+        int pinnedBufferSize = max(paddedNumAtoms*6, numEnergyBuffers);
+        CHECK_RESULT(hipHostMalloc(&pinnedBuffer, pinnedBufferSize*sizeof(float), getHostMallocFlags()));
+    }
+    // velm was created with paddedNumAtoms elements, so fill every one of them.  The padding
+    // entries beyond numAtoms are set to zero instead of being left as uninitialized host memory.
+    for (int i = 0; i < paddedNumAtoms; i++) {
+        double inverseMass = 0.0;
+        if (i < numAtoms) {
+            double mass = system.getParticleMass(i);
+            inverseMass = (mass == 0.0 ? 0.0 : 1.0/mass);
+        }
+        if (useDoubleStorage)
+            ((double4*) pinnedBuffer)[i] = make_double4(0.0, 0.0, 0.0, inverseMass);
+        else
+            ((float4*) pinnedBuffer)[i] = make_float4(0.0f, 0.0f, 0.0f, (float) inverseMass);
+    }
+    velm.upload(pinnedBuffer);
+    bonded->initialize(system);
+    addAutoclearBuffer(force.getDevicePointer(), force.getSize()*force.getElementSize());
+    addAutoclearBuffer(energyBuffer.getDevicePointer(), energyBuffer.getSize()*energyBuffer.getElementSize());
+    int numEnergyParamDerivs = energyParamDerivNames.size();
+    if (numEnergyParamDerivs > 0) {
+        // One block of numEnergyBuffers values per registered parameter derivative.
+        if (useDoubleStorage)
+            energyParamDerivBuffer.initialize<double>(*this, numEnergyParamDerivs*numEnergyBuffers, "energyParamDerivBuffer");
+        else
+            energyParamDerivBuffer.initialize<float>(*this, numEnergyParamDerivs*numEnergyBuffers, "energyParamDerivBuffer");
+        addAutoclearBuffer(energyParamDerivBuffer);
+    }
+    findMoleculeGroups();
+    nonbonded->initialize(system);
+}
+
+/**
+ * Forward to the platform data's initializeContexts(), passing this context's System.
+ */
+void HipContext::initializeContexts() {
+    this->getPlatformData().initializeContexts(this->system);
+}
+
+/**
+ * Make this context's device the current HIP device.  Does nothing while the context is
+ * not valid.
+ */
+void HipContext::setAsCurrent() {
+    if (!contextIsValid)
+        return;
+    hipSetDevice(device);
+}
+
+thread_local std::stack<hipDevice_t> outerScopeDevices; // Per-thread record of the device that was current before each pushAsCurrent(); popAsCurrent() restores them in LIFO order.
+
+/**
+ * Make this context's device current, remembering the previously current device on a
+ * per-thread stack so that popAsCurrent() can restore it.  Does nothing while the context
+ * is not valid.
+ */
+void HipContext::pushAsCurrent() {
+    if (contextIsValid) {
+        // Emulate cuCtxPushCurrent's behavior because hipCtxPushCurrent is deprecated
+        hipDevice_t outerScopeDevice = device;
+        const bool outerDeviceKnown = (hipGetDevice(&outerScopeDevice) == hipSuccess);
+        if (!outerDeviceKnown) {
+            // The current device could not be queried.  Record our own device so that the
+            // matching popAsCurrent() does not try to restore an indeterminate value.
+            outerScopeDevice = device;
+        }
+        outerScopeDevices.push(outerScopeDevice);
+        if (!outerDeviceKnown || device != outerScopeDevice) {
+            hipSetDevice(device);
+        }
+    }
+}
+
+/**
+ * Undo the most recent pushAsCurrent() on this thread by restoring the device that was
+ * current before it.  Does nothing while the context is not valid, or when this thread has
+ * no recorded device to restore (for example if the matching push happened while the
+ * context was not valid).
+ */
+void HipContext::popAsCurrent() {
+    if (contextIsValid && !outerScopeDevices.empty()) {
+        // Emulate cuCtxPopCurrent's behavior because hipCtxPopCurrent is deprecated
+        hipDevice_t outerScopeDevice = outerScopeDevices.top();
+        outerScopeDevices.pop();
+        if (outerScopeDevice != device) {
+            hipSetDevice(outerScopeDevice);
+        }
+    }
+}
+
+/**
+ * Build a temporary file name inside tempDir.  The name embeds the address of this context
+ * and the id of the calling thread to avoid collisions.
+ */
+string HipContext::getTempFileName() const {
+    stringstream name;
+    name << tempDir << "openmmTempKernel" << this << "_" << std::this_thread::get_id();
+    return name.str();
+}
+
+/**
+ * Compute the SHA-1 digest of a string.
+ *
+ * @param src  the text to hash
+ * @return the 20 byte digest written as 40 hexadecimal characters
+ */
+string HipContext::getHash(const string& src) const {
+    CSHA1 digest;
+    digest.Update(reinterpret_cast<const UINT_8*>(src.c_str()), src.size());
+    digest.Final();
+    UINT_8 bytes[20];
+    digest.GetHash(bytes);
+
+    // Write each byte as exactly two hexadecimal digits.
+    stringstream hexText;
+    hexText.flags(ios::hex);
+    for (UINT_8 byte : bytes)
+        hexText << setw(2) << setfill('0') << static_cast<int>(byte);
+    return hexText.str();
+}
+
+/**
+ * Get the path of the cache file for a piece of kernel source.  The name is derived from
+ * a hash of the source combined with the GPU architecture name.
+ */
+string HipContext::getCacheFileName(const string& src) const {
+    const string key = getHash(src + gpuArchitecture);
+    return cacheDir + "openmm-hip-" + key;
+}
+
+/**
+ * Compile a module from source without any extra preprocessor definitions.
+ */
+hipModule_t HipContext::createModule(const string source) {
+    const map<string, string> noDefines;
+    return createModule(source, noDefines);
+}
+
+/**
+ * Compile kernel source into a HIP module, or load it from the on-disk cache when a code
+ * object for exactly the same generated source and GPU architecture already exists.
+ *
+ * The generated source consists of the compile options (as a comment), the context-wide
+ * compilationDefines (except those overridden by `defines`), the real/mixed typedefs for the
+ * current precision, the common and intrinsics sources, `defines`, and finally `source`.
+ * Setting the environment variable OPENMM_SAVE_TEMPS to "1" writes the generated source to a
+ * file (optionally prefixed by OPENMM_SAVE_TEMPS_PREFIX) and prints the compile log.
+ *
+ * @param source   the kernel source code
+ * @param defines  preprocessor definitions (name, value) to apply to this module only
+ * @return the loaded module; it is also recorded in loadedModules so the destructor unloads it
+ * @throws OpenMMException if compilation fails or the module cannot be loaded
+ */
+hipModule_t HipContext::createModule(const string source, const map<string, string>& defines) {
+    const char* saveTempsEnv = getenv("OPENMM_SAVE_TEMPS");
+    const bool saveTemps = saveTempsEnv != nullptr && string(saveTempsEnv) == "1";
+
+    int runtimeVersion;
+    CHECK_RESULT2(hipRuntimeGetVersion(&runtimeVersion), "Error getting HIP runtime version");
+
+    string options = "-O3 -ffast-math -munsafe-fp-atomics -Wall -Wno-hip-only";
+    options += " --offload-arch=" + gpuArchitecture;
+    if (gpuArchitecture.find("gfx90a") == 0 ||
+        gpuArchitecture.find("gfx94") == 0) {
+        // HIP-TODO: Remove it when the compiler does a better job
+        // Disable SLP vectorization as it may generate unoptimal packed math instructions on
+        // >=MI200 (gfx90a, gfx942): more v_mov, higher register usage etc.
+        options += " -fno-slp-vectorize";
+    }
+    if (getMaxThreadBlockSize() < 1024) {
+        options += " --gpu-max-threads-per-block=" + std::to_string(getMaxThreadBlockSize());
+    }
+    if (runtimeVersion < 60140092) {
+        // Workaround for operator* defined for complex types (typedefs for float2, double2) in
+        // ROCm 6.0 headers. This issue has been fixed in 6.1. hipRTC includes amd_hip_complex.h
+        // by default, we fool the include guard into thinking the header is already included.
+        options += " -D HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMPLEX_H";
+    }
+    stringstream src;
+    src << "// Compilation Options: " << options << endl << endl;
+    src << "// HIP Runtime Version: " << runtimeVersion << endl << endl;
+    for (auto& pair : compilationDefines) {
+        // Query defines to avoid duplicate variables
+        if (defines.find(pair.first) == defines.end()) {
+            src << "#define " << pair.first;
+            if (!pair.second.empty())
+                src << " " << pair.second;
+            src << endl;
+        }
+    }
+    if (!compilationDefines.empty())
+        src << endl;
+
+    if (useDoublePrecision) {
+        src << "typedef double real;\n";
+        src << "typedef double2 real2;\n";
+        src << "typedef double3 real3;\n";
+        src << "typedef double4 real4;\n";
+    }
+    else {
+        src << "typedef float real;\n";
+        src << "typedef float2 real2;\n";
+        src << "typedef float3 real3;\n";
+        src << "typedef float4 real4;\n";
+    }
+    if (useDoublePrecision || useMixedPrecision) {
+        src << "typedef double mixed;\n";
+        src << "typedef double2 mixed2;\n";
+        src << "typedef double3 mixed3;\n";
+        src << "typedef double4 mixed4;\n";
+    }
+    else {
+        src << "typedef float mixed;\n";
+        src << "typedef float2 mixed2;\n";
+        src << "typedef float3 mixed3;\n";
+        src << "typedef float4 mixed4;\n";
+    }
+    src << "typedef unsigned int tileflags;\n";
+    src << HipKernelSources::common << endl;
+    for (auto& pair : defines) {
+        src << "#define " << pair.first;
+        if (!pair.second.empty())
+            src << " " << pair.second;
+        src << endl;
+    }
+    if (!defines.empty())
+        src << endl;
+    src << HipKernelSources::intrinsics << endl;
+    src << source << endl;
+
+    // See whether we already have a compiled code object for this kernel cached.
+
+    string cacheFile = getCacheFileName(src.str());
+    hipModule_t module;
+    if (hipModuleLoad(&module, cacheFile.c_str()) == hipSuccess) {
+        loadedModules.push_back(module);
+        return module;
+    }
+
+    // Select names for the various temporary files.
+
+    if (saveTemps) {
+        stringstream tempFileName;
+        const char* saveTempsPrefixEnv = getenv("OPENMM_SAVE_TEMPS_PREFIX");
+        if (saveTempsPrefixEnv) {
+            tempFileName << saveTempsPrefixEnv;
+        }
+        tempFileName << getHash(src.str());
+
+        options += " --save-temps";
+
+        string inputFile = (tempFileName.str()+".hip");
+        std::cout << "Source code: " << inputFile << std::endl;
+        std::cout << "Compile options: " << options << std::endl;
+        ofstream out(inputFile.c_str());
+        out << src.str();
+        out.close();
+    }
+
+    // Split the command line options into an array of options.
+
+    stringstream flagsStream(options);
+    string flag;
+    vector<string> splitFlags;
+    while (flagsStream >> flag)
+        splitFlags.push_back(flag);
+    int numOptions = splitFlags.size();
+    vector<const char*> optionsVec(numOptions);
+    for (int i = 0; i < numOptions; i++)
+        optionsVec[i] = splitFlags[i].c_str();
+
+    // Compile the program to CO.
+
+    hiprtcProgram program;
+    HIPRTC_CHECK_RESULT(hiprtcCreateProgram(&program, src.str().c_str(), NULL, 0, NULL, NULL), "Error creating program");
+    vector<char> code;
+    try {
+        hiprtcResult result = hiprtcCompileProgram(program, optionsVec.size(), optionsVec.data());
+        if (result != HIPRTC_SUCCESS || saveTemps) {
+            size_t logSize = 0;
+            hiprtcGetProgramLogSize(program, &logSize);
+            std::string log(logSize, '\0');
+            if (logSize > 0) {
+                hiprtcGetProgramLog(program, &log[0]);
+                if (saveTemps) {
+                    std::cout << "Log: " << log << std::endl;
+                }
+            }
+            if (result != HIPRTC_SUCCESS) {
+                throw OpenMMException("Error compiling program: "+log);
+            }
+        }
+        size_t codeSize = 0;
+        HIPRTC_CHECK_RESULT(hiprtcGetCodeSize(program, &codeSize), "Error getting compiled code size");
+        code.resize(codeSize);
+        HIPRTC_CHECK_RESULT(hiprtcGetCode(program, code.data()), "Error getting compiled code");
+    }
+    catch (...) {
+        hiprtcDestroyProgram(&program);
+        throw;
+    }
+
+    // The program is no longer needed.  It is destroyed here, outside the try block, so that
+    // a failure further down cannot cause it to be destroyed a second time.
+
+    hiprtcDestroyProgram(&program);
+
+    // If possible, write the CO out to a cache file for later use.
+
+    try {
+        ofstream out(cacheFile.c_str(), ios::out | ios::binary);
+        out.write(code.data(), code.size());
+        out.close();
+    }
+    catch (...) {
+        // An error occurred.  Possibly we don't have permission to write to the temp directory.
+        // Ignore.
+    }
+    CHECK_RESULT2(hipModuleLoadDataEx(&module, code.data(), 0, NULL, NULL), "Error loading HIP module");
+    loadedModules.push_back(module);
+    return module;
+}
+
+/**
+ * Look up a kernel by name in a module.
+ *
+ * @param module  the module to search
+ * @param name    the name of the kernel function
+ * @return the kernel function handle
+ * @throws OpenMMException if hipModuleGetFunction() does not succeed
+ */
+hipFunction_t HipContext::getKernel(hipModule_t& module, const string& name) {
+    hipFunction_t function;
+    const hipError_t status = hipModuleGetFunction(&function, module, name.c_str());
+    if (status == hipSuccess)
+        return function;
+    std::stringstream message;
+    message<<"Error creating kernel "<<name<<": "<<getErrorString(status)<<" ("<<status<<")";
+    throw OpenMMException(message.str());
+}
+
+/**
+ * Get every context stored in the platform data, as base-class pointers and in the same order.
+ */
+vector<ComputeContext*> HipContext::getAllContexts() {
+    return vector<ComputeContext*>(platformData.contexts.begin(), platformData.contexts.end());
+}
+
+/**
+ * Get the stream on which this context currently launches kernels.
+ */
+hipStream_t HipContext::getCurrentStream() {
+    return this->currentStream;
+}
+
+/**
+ * Set the stream on which this context launches kernels.
+ */
+void HipContext::setCurrentStream(hipStream_t stream) {
+    this->currentStream = stream;
+}
+
+/**
+ * Switch back to the context's default stream.
+ */
+void HipContext::restoreDefaultStream() {
+    this->currentStream = this->defaultStream;
+}
+
+/**
+ * Create a new, default-constructed HipArray.  The caller owns the returned object.
+ */
+HipArray* HipContext::createArray() {
+    HipArray* array = new HipArray();
+    return array;
+}
+
+/**
+ * Create a new HipEvent bound to this context.
+ */
+ComputeEvent HipContext::createEvent() {
+    shared_ptr<ComputeEventImpl> event(new HipEvent(*this));
+    return event;
+}
+
+/**
+ * Create a HipFFT3D for this context with the given grid dimensions, transform type,
+ * stream and input/output arrays.  The caller owns the returned object.
+ */
+HipFFT3D* HipContext::createFFT(int xsize, int ysize, int zsize, bool realToComplex, hipStream_t stream, HipArray& in, HipArray& out) {
+    HipFFT3D* fft = new HipFFT3D(*this, xsize, ysize, zsize, realToComplex, stream, in, out);
+    return fft;
+}
+
+/**
+ * Forward to HipFFT3D::findLegalDimension() and return its result for `minimum`.
+ */
+int HipContext::findLegalFFTDimension(int minimum) {
+    const int dimension = HipFFT3D::findLegalDimension(minimum);
+    return dimension;
+}
+
+/**
+ * Compile source code (with the vectorOps source prepended) into a module and wrap it
+ * in a HipProgram.
+ *
+ * @param source   the kernel source code
+ * @param defines  preprocessor definitions (name, value) to apply while compiling
+ */
+ComputeProgram HipContext::compileProgram(const std::string source, const std::map<std::string, std::string>& defines) {
+    const std::string fullSource = HipKernelSources::vectorOps+source;
+    hipModule_t module = createModule(fullSource, defines);
+    shared_ptr<ComputeProgramImpl> program(new HipProgram(*this, module));
+    return program;
+}
+
+/**
+ * Convert an ArrayInterface into the HipArray behind it.  If the argument is a
+ * ComputeArray wrapper, the array it wraps is used instead.
+ *
+ * @throws OpenMMException if the underlying array is not a HipArray
+ */
+HipArray& HipContext::unwrap(ArrayInterface& array) const {
+    HipArray* hipArray = NULL;
+    if (ComputeArray* wrapper = dynamic_cast<ComputeArray*>(&array))
+        hipArray = dynamic_cast<HipArray*>(&wrapper->getArray());
+    else
+        hipArray = dynamic_cast<HipArray*>(&array);
+    if (hipArray != NULL)
+        return *hipArray;
+    throw OpenMMException("Array argument is not an HipArray");
+}
+
+/**
+ * Get the text describing a HIP error code, as reported by hipGetErrorName().
+ */
+std::string HipContext::getErrorString(hipError_t result) {
+    const char* name = hipGetErrorName(result);
+    return string(name);
+}
+
+/**
+ * Launch a kernel on the current stream.  The grid has one block per blockSize threads,
+ * but never more than numThreadBlocks blocks.
+ *
+ * @param kernel      the kernel to launch
+ * @param arguments   array of pointers to the kernel arguments
+ * @param threads     the number of threads requested
+ * @param blockSize   threads per block, or -1 to use ThreadBlockSize
+ * @param sharedSize  bytes of dynamic shared memory
+ * @throws OpenMMException if the launch does not succeed
+ */
+void HipContext::executeKernel(hipFunction_t kernel, void** arguments, int threads, int blockSize, unsigned int sharedSize) {
+    const int threadsPerBlock = (blockSize == -1 ? ThreadBlockSize : blockSize);
+    const int blocksNeeded = (threads+threadsPerBlock-1)/threadsPerBlock;
+    const int gridSize = std::min(blocksNeeded, numThreadBlocks);
+    const hipError_t status = hipModuleLaunchKernel(kernel, gridSize, 1, 1, threadsPerBlock, 1, 1, sharedSize, currentStream, arguments, NULL);
+    if (status == hipSuccess)
+        return;
+    stringstream message;
+    message<<"Error invoking kernel: "<<getErrorString(status)<<" ("<<status<<")";
+    throw OpenMMException(message.str());
+}
+
+/**
+ * Launch a kernel on the current stream with enough blocks to cover every requested
+ * thread.  Unlike executeKernel(), the grid size is not capped at numThreadBlocks.
+ *
+ * @param kernel      the kernel to launch
+ * @param arguments   array of pointers to the kernel arguments
+ * @param threads     the number of threads requested
+ * @param blockSize   threads per block, or -1 to use ThreadBlockSize
+ * @param sharedSize  bytes of dynamic shared memory
+ * @throws OpenMMException if the launch does not succeed
+ */
+void HipContext::executeKernelFlat(hipFunction_t kernel, void** arguments, int threads, int blockSize, unsigned int sharedSize) {
+    const int threadsPerBlock = (blockSize == -1 ? ThreadBlockSize : blockSize);
+    const int gridSize = (threads+threadsPerBlock-1)/threadsPerBlock;
+    const hipError_t status = hipModuleLaunchKernel(kernel, gridSize, 1, 1, threadsPerBlock, 1, 1, sharedSize, currentStream, arguments, NULL);
+    if (status == hipSuccess)
+        return;
+    stringstream message;
+    message<<"Error invoking kernel: "<<getErrorString(status)<<" ("<<status<<")";
+    throw OpenMMException(message.str());
+}
+
+/**
+ * Choose a thread block size for a kernel that needs a given amount of shared memory per
+ * thread.
+ *
+ * @param memory  shared memory required per thread
+ * @return a multiple of simdWidth; the size stops growing once adding another simdWidth
+ *         would reach or exceed the number of threads that fit in sharedMemPerBlock
+ * @throws OpenMMException if fewer than ThreadBlockSize threads fit
+ */
+int HipContext::computeThreadBlockSize(double memory) const {
+    const int sharedLimit = this->sharedMemPerBlock;
+    const int threadLimit = (int) (sharedLimit/memory);
+    if (threadLimit < HipContext::ThreadBlockSize) {
+        throw OpenMMException("Too much shared memory requested!");
+    }
+    const int step = this->simdWidth;
+    int blockSize = step;
+    for (; blockSize+step < threadLimit; blockSize += step)
+        ;
+    return blockSize;
+}
+
+/**
+ * Clear the device memory of an array (its full size in bytes) via the pointer-based overload.
+ */
+void HipContext::clearBuffer(ArrayInterface& array) {
+    HipArray& target = unwrap(array);
+    clearBuffer(target.getDevicePointer(), array.getSize()*array.getElementSize());
+}
+
+/**
+ * Launch the clearBuffer kernel on a region of device memory.
+ *
+ * @param memory  the device memory to clear
+ * @param size    the size of the region in bytes; it is processed as size/4 four-byte words
+ */
+void HipContext::clearBuffer(hipDeviceptr_t memory, int size) {
+    int numWords = size/4;
+    void* kernelArgs[] = {&memory, &numWords};
+    const int blockSize = 4 * this->simdWidth;
+    executeKernel(clearBufferKernel, kernelArgs, numWords, blockSize);
+}
+
+/**
+ * Register an array (its full size in bytes) as an autoclear buffer, one of those
+ * processed by clearAutoclearBuffers().
+ */
+void HipContext::addAutoclearBuffer(ArrayInterface& array) {
+    HipArray& target = unwrap(array);
+    addAutoclearBuffer(target.getDevicePointer(), array.getSize()*array.getElementSize());
+}
+
+/**
+ * Register a region of device memory as an autoclear buffer, one of those processed by
+ * clearAutoclearBuffers().
+ *
+ * @param memory  the device memory to clear
+ * @param size    the size of the region in bytes; it is recorded as size/4 four-byte words
+ */
+void HipContext::addAutoclearBuffer(hipDeviceptr_t memory, int size) {
+    const int numWords = size/4;
+    autoclearBuffers.push_back(memory);
+    autoclearBufferSizes.push_back(numWords);
+}
+
+/**
+ * Clear every buffer registered with addAutoclearBuffer().  Buffers are processed in groups
+ * of up to six per kernel launch; a single leftover buffer goes through clearBuffer().
+ */
+void HipContext::clearAutoclearBuffers() {
+    const int preferredTBSize = this->simdWidth * 4;
+    const int total = autoclearBufferSizes.size();
+    int base = 0;
+
+    // Launch one kernel that clears `count` consecutive buffers starting at index `base`.
+    // The arguments alternate (pointer, size in words), and the thread count is the size
+    // of the largest buffer in the group.
+    auto clearGroup = [&](hipFunction_t kernel, int count) {
+        void* args[12];
+        auto largest = autoclearBufferSizes[base];
+        for (int i = 0; i < count; i++) {
+            args[2*i] = &autoclearBuffers[base+i];
+            args[2*i+1] = &autoclearBufferSizes[base+i];
+            largest = max(largest, autoclearBufferSizes[base+i]);
+        }
+        executeKernel(kernel, args, largest, preferredTBSize);
+    };
+
+    for (; total-base >= 6; base += 6)
+        clearGroup(clearSixBuffersKernel, 6);
+    switch (total-base) {
+        case 5:
+            clearGroup(clearFiveBuffersKernel, 5);
+            break;
+        case 4:
+            clearGroup(clearFourBuffersKernel, 4);
+            break;
+        case 3:
+            clearGroup(clearThreeBuffersKernel, 3);
+            break;
+        case 2:
+            clearGroup(clearTwoBuffersKernel, 2);
+            break;
+        case 1:
+            // Sizes are stored in words, but clearBuffer() expects bytes.
+            clearBuffer(autoclearBuffers[base], autoclearBufferSizes[base]*4);
+            break;
+        default:
+            break;
+    }
+}
+
+/**
+ * Sum the energy buffer.  The reduceEnergy kernel writes partial sums into energySum,
+ * which is then downloaded and added up on the host.
+ *
+ * @return the sum of the downloaded energySum values
+ */
+double HipContext::reduceEnergy() {
+    int bufferSize = energyBuffer.getSize();
+    int workGroupSize = getMaxThreadBlockSize();
+    void* args[] = {&energyBuffer.getDevicePointer(), &energySum.getDevicePointer(), &bufferSize, &workGroupSize};
+    executeKernel(reduceEnergyKernel, args, workGroupSize*energySum.getSize(), workGroupSize, workGroupSize*energyBuffer.getElementSize());
+    energySum.download(pinnedBuffer);
+
+    // The partial sums are doubles in double and mixed precision, floats otherwise.
+    const bool sumsAreDouble = (getUseDoublePrecision() || getUseMixedPrecision());
+    double total = 0;
+    for (int i = 0; i < energySum.getSize(); i++) {
+        if (sumsAreDouble)
+            total += ((double*) pinnedBuffer)[i];
+        else
+            total += ((float*) pinnedBuffer)[i];
+    }
+    return total;
+}
+
+/**
+ * Upload per-atom charges and run setChargesKernel to apply them.
+ *
+ * The charge buffer is created on first use, with double or float elements depending
+ * on useDoublePrecision.
+ *
+ * @param charges  the charges to upload; the first numAtoms entries are read
+ */
+void HipContext::setCharges(const vector<double>& charges) {
+    if (!chargeBuffer.isInitialized()) {
+        const size_t elementSize = useDoublePrecision ? sizeof(double) : sizeof(float);
+        chargeBuffer.initialize(*this, numAtoms, elementSize, "chargeBuffer");
+    }
+
+    // Take exactly numAtoms values so the uploaded vector matches the buffer length.
+    vector<double> hostCharges(charges.begin(), charges.begin()+numAtoms);
+    chargeBuffer.upload(hostCharges, true);
+
+    void* kernelArgs[] = {&chargeBuffer.getDevicePointer(), &posq.getDevicePointer(), &atomIndexDevice.getDevicePointer(), &numAtoms};
+    executeKernel(setChargesKernel, kernelArgs, numAtoms);
+}
+
+/**
+ * Claim the posq charges flag.
+ *
+ * @return true for the first call (the flag was not yet set), false for every later call
+ */
+bool HipContext::requestPosqCharges() {
+    if (hasAssignedPosqCharges)
+        return false;
+    hasAssignedPosqCharges = true;
+    return true;
+}
+
+/**
+ * Register a parameter name in energyParamDerivNames.
+ *
+ * @param param  the parameter name; ignored if it is already in the list
+ */
+void HipContext::addEnergyParameterDerivative(const string& param) {
+    // Nothing to do if this parameter is already registered.
+
+    for (const string& existing : energyParamDerivNames) {
+        if (existing == param)
+            return;
+    }
+    energyParamDerivNames.push_back(param);
+}
+
+/**
+ * Call hipStreamSynchronize() on the stream returned by getCurrentStream().
+ *
+ * NOTE(review): the status returned by hipStreamSynchronize() is discarded here - confirm
+ * that ignoring a failure is intended.
+ */
+void HipContext::flushQueue() {
+    hipStream_t stream = getCurrentStream();
+    hipStreamSynchronize(stream);
+}
+
+/**
+ * Rank the available HIP devices from most to least preferred.
+ *
+ * Each device is scored as clockRate*multiProcessorCount. Devices are ordered by
+ * descending score, and devices with equal scores are ordered by ascending index.
+ *
+ * @return the device indices in order of preference
+ * @throws OpenMMException (through CHECK_RESULT) if the devices cannot be queried
+ */
+vector<int> HipContext::getDevicePrecedence() {
+    int numDevices = 0;
+    hipDeviceProp_t thisDevice;
+    string errorMessage = "Error initializing Context";
+    // The index is stored negated so that, after a descending sort, lower indices come first among ties.
+    vector<pair<long long, int> > devices;
+
+    CHECK_RESULT(hipGetDeviceCount(&numDevices));
+    for (int i = 0; i < numDevices; i++) {
+        CHECK_RESULT(hipGetDeviceProperties(&thisDevice, i));
+        // Compute the score in 64 bits: the product of the two int fields can exceed
+        // the range of a 32-bit int for fast devices with many multiprocessors.
+        long long speed = static_cast<long long>(thisDevice.clockRate)*thisDevice.multiProcessorCount;
+        devices.push_back(std::make_pair(speed, -i));
+    }
+
+    // sort first by speed (higher is better), and finally device index (lower is better)
+    std::sort(devices.rbegin(), devices.rend());
+
+    vector<int> precedence;
+    precedence.reserve(devices.size());
+    for (int i = 0; i < static_cast<int>(devices.size()); i++) {
+        precedence.push_back(-devices[i].second);
+    }
+
+    return precedence;
+}
+
+/**
+ * Get the flags to pass when creating HIP events.
+ *
+ * @return hipEventDisableTiming
+ */
+unsigned int HipContext::getEventFlags() {
+    const unsigned int eventFlags = hipEventDisableTiming;
+    return eventFlags;
+}
+
+/**
+ * Get the flags to pass to hipHostMalloc().
+ *
+ * @return hipHostMallocDefault when WIN32 is defined, hipHostMallocNumaUser otherwise
+ */
+unsigned int HipContext::getHostMallocFlags() {
+#ifndef WIN32
+    return hipHostMallocNumaUser;
+#else
+    return hipHostMallocDefault;
+#endif
+}
--- a/platforms/hip/src/HipEvent.cpp
+++ b/platforms/hip/src/HipEvent.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2019 Stanford University and the Authors.           *
+ * Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc.              *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "HipEvent.h"
+#include "openmm/OpenMMException.h"
+
+using namespace OpenMM;
+
+/**
+ * Create a HIP event using the flags reported by the context.
+ *
+ * @param context  the context this event belongs to
+ * @throws OpenMMException if the event cannot be created
+ */
+HipEvent::HipEvent(HipContext& context) : context(context), eventCreated(false) {
+    const hipError_t status = hipEventCreateWithFlags(&event, context.getEventFlags());
+    eventCreated = (status == hipSuccess);
+    if (!eventCreated)
+        throw OpenMMException("Error creating HIP event:"+HipContext::getErrorString(status));
+}
+
+/**
+ * Destroy the HIP event, if one was created.
+ */
+HipEvent::~HipEvent() {
+    if (!eventCreated)
+        return;
+    hipEventDestroy(event);
+}
+
+/**
+ * Record this event on the context's current stream.
+ *
+ * @throws OpenMMException if hipEventRecord() reports an error
+ */
+void HipEvent::enqueue() {
+    // Report failures the same way the constructor does instead of silently dropping the status.
+    hipError_t result = hipEventRecord(event, context.getCurrentStream());
+    if (result != hipSuccess)
+        throw OpenMMException("Error recording HIP event: "+HipContext::getErrorString(result));
+}
+
+/**
+ * Block the calling host thread until hipEventSynchronize() returns for this event.
+ *
+ * @throws OpenMMException if hipEventSynchronize() reports an error
+ */
+void HipEvent::wait() {
+    // Report failures the same way the constructor does instead of silently dropping the status.
+    hipError_t result = hipEventSynchronize(event);
+    if (result != hipSuccess)
+        throw OpenMMException("Error synchronizing on HIP event: "+HipContext::getErrorString(result));
+}
--- a/platforms/hip/src/HipFFT3D.cpp
+++ b/platforms/hip/src/HipFFT3D.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009-2015 Stanford University and the Authors.      *
+ * Portions copyright (c) 2021 Advanced Micro Devices, Inc.                   *
+ * Authors:                                                                   *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "HipFFT3D.h"
+#include "HipContext.h"
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <iterator>
+
+using namespace OpenMM;
+using namespace std;
+
+/**
+ * Set up a 3D VkFFT plan for a grid of xsize x ysize x zsize values.
+ *
+ * The compiled VkFFT application is cached on disk under a name derived from the HIP
+ * runtime version, the VkFFT version, the grid size, the transform type and the
+ * precision. If a cache file can be opened it is loaded; otherwise the application is
+ * built and then saved to a new cache file.
+ *
+ * @param context        the context the transform runs in
+ * @param xsize          grid size along x (slowest varying dimension in memory)
+ * @param ysize          grid size along y
+ * @param zsize          grid size along z (fastest varying dimension in memory)
+ * @param realToComplex  true for a real-to-complex transform
+ * @param stream         the stream VkFFT should submit its work to
+ * @param in             the array whose device pointer is used as the input buffer
+ * @param out            the array whose device pointer is used as the output buffer
+ * @throws OpenMMException if initializeVkFFT() fails
+ */
+HipFFT3D::HipFFT3D(HipContext& context, int xsize, int ysize, int zsize, bool realToComplex, hipStream_t stream, HipArray& in, HipArray& out) :
+        context(context), stream(stream) {
+
+    deviceIndex = context.getDeviceIndex();
+    inputBuffer = in.getDevicePointer();
+    outputBuffer = out.getDevicePointer();
+    size_t valueSize = context.getUseDoublePrecision() ? sizeof(double) : sizeof(float);
+    // Lead each product with the size_t operand so the whole multiplication is carried
+    // out in size_t; multiplying the three int dimensions first could overflow int.
+    inputBufferSize = valueSize * zsize * ysize * xsize;
+    if (realToComplex) {
+        outputBufferSize = valueSize * 2 * (zsize/2 + 1) * ysize * xsize;
+    }
+    else {
+        outputBufferSize = valueSize * zsize * ysize * xsize;
+    }
+
+    VkFFTConfiguration configuration = {};
+    configuration.performR2C = realToComplex;
+    configuration.device = &deviceIndex;
+    configuration.num_streams = 1;
+    configuration.stream = &this->stream;
+    configuration.doublePrecision = context.getUseDoublePrecision();
+
+    configuration.FFTdim = 3;
+    configuration.size[0] = zsize;
+    configuration.size[1] = ysize;
+    configuration.size[2] = xsize;
+
+    configuration.inverseReturnToInputBuffer = true;
+    configuration.isInputFormatted = true;
+    configuration.inputBufferSize = &inputBufferSize;
+    configuration.inputBuffer = &inputBuffer;
+    configuration.inputBufferStride[0] = zsize;
+    configuration.inputBufferStride[1] = configuration.inputBufferStride[0] * ysize;
+    configuration.inputBufferStride[2] = configuration.inputBufferStride[1] * xsize;
+
+    configuration.bufferSize = &outputBufferSize;
+    configuration.buffer = &outputBuffer;
+    configuration.bufferStride[0] = realToComplex ? (zsize/2 + 1) : zsize;
+    configuration.bufferStride[1] = configuration.bufferStride[0] * ysize;
+    configuration.bufferStride[2] = configuration.bufferStride[1] * xsize;
+
+    // Combine all parameters into a unique key
+    stringstream info;
+    // Start from a fixed value so the key is well defined even if the version query fails.
+    int runtimeVersion = 0;
+    (void)hipRuntimeGetVersion(&runtimeVersion);
+    info << runtimeVersion;
+    info << " " << VkFFTGetVersion();
+    info << " " << xsize << " " << ysize << " " << zsize;
+    info << " " << realToComplex << " " << context.getUseDoublePrecision();
+
+    string cacheFile = context.getCacheFileName(info.str());
+
+    bool hasCache = false;
+    // Must stay alive until initializeVkFFT() returns: the configuration points into it.
+    vector<char> cacheContent;
+
+    ifstream cache(cacheFile.c_str(), ios::in | ios::binary);
+    if (cache.is_open()) {
+        cacheContent.insert(cacheContent.begin(), istreambuf_iterator<char>(cache), istreambuf_iterator<char>());
+        cache.close();
+        hasCache = true;
+        // There is an existing cache, load VkFFT kernels from it
+        configuration.loadApplicationFromString = 1;
+        configuration.loadApplicationString = cacheContent.data();
+    }
+    else {
+        // There is no existing cache, request saving
+        configuration.saveApplicationToString = 1;
+    }
+
+    app = new VkFFTApplication();
+    VkFFTResult fftResult = initializeVkFFT(app, configuration);
+    if (fftResult != VKFFT_SUCCESS) {
+        // The destructor does not run when the constructor throws, so free the allocation here.
+        // NOTE(review): assumes initializeVkFFT() releases its own partial state on failure,
+        // so deleteVkFFT() is deliberately not called - confirm for the VkFFT version in use.
+        delete app;
+        app = NULL;
+        throw OpenMMException("Error executing initializeVkFFT: "+context.intToString(fftResult));
+    }
+
+    if (!hasCache) {
+        // There is no existing cache, create it
+        string outputFile = context.getTempFileName() + ".vkfftcache";
+        try {
+            // Write to a temporary file first and rename it to the cache name only after
+            // the write succeeded.
+            ofstream out(outputFile.c_str(), ios::out | ios::binary);
+            out.write(reinterpret_cast<char*>(app->saveApplicationString), size_t(app->applicationStringSize));
+            out.close();
+            // Do not leave the temporary file behind if writing or renaming failed.
+            if (out.fail() || rename(outputFile.c_str(), cacheFile.c_str()) != 0)
+                remove(outputFile.c_str());
+        }
+        catch (...) {
+            // An error occurred.  Possibly we don't have permission to write to the temp directory.
+        }
+    }
+}
+
+/**
+ * Pass the VkFFT application to deleteVkFFT(), then free the application object itself.
+ */
+HipFFT3D::~HipFFT3D() {
+    VkFFTApplication* owned = app;
+    deleteVkFFT(owned);
+    delete owned;
+}
+
+/**
+ * Run the transform through VkFFTAppend().
+ *
+ * @param forward  true passes -1 to VkFFTAppend(), false passes 1
+ * @throws OpenMMException if VkFFTAppend() does not return VKFFT_SUCCESS
+ */
+void HipFFT3D::execFFT(bool forward) {
+    const int direction = forward ? -1 : 1;
+    const VkFFTResult status = VkFFTAppend(app, direction, NULL);
+    if (status == VKFFT_SUCCESS)
+        return;
+    throw OpenMMException("Error executing VkFFTAppend: "+context.intToString(status));
+}
+
+/**
+ * Find the smallest grid dimension that is at least `minimum` and whose only prime
+ * factors are 2, 3, 5, 7, 11 and 13.
+ *
+ * @param minimum  the smallest acceptable dimension
+ * @return the first such value >= minimum, or 1 if minimum is less than 1
+ */
+int HipFFT3D::findLegalDimension(int minimum) {
+    if (minimum < 1)
+        return 1;
+    // VkFFT supports prime factors up to 13
+    const int primes[] = {2, 3, 5, 7, 11, 13};
+    for (int candidate = minimum; ; candidate++) {
+        // Strip every supported prime factor; a legal size reduces to exactly 1.
+
+        int remainder = candidate;
+        for (int prime : primes) {
+            while (remainder > 1 && remainder%prime == 0)
+                remainder /= prime;
+        }
+        if (remainder == 1)
+            return candidate;
+    }
+}
--- a/platforms/hip/src/HipIntegrationUtilities.cpp
+++ b/platforms/hip/src/HipIntegrationUtilities.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009-2021 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc.              *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "HipIntegrationUtilities.h"
+#include "HipContext.h"
+#include "openmm/common/ContextSelector.h"
+
+using namespace OpenMM;
+using namespace std;
+
+#define CHECK_RESULT(result) CHECK_RESULT2(result, errorMessage);
+#define CHECK_RESULT2(result, prefix) \
+    if (result != hipSuccess) { \
+        std::stringstream m; \
+        m<<prefix<<": "<<dynamic_cast<HipContext&>(context).getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
+        throw OpenMMException(m.str());\
+    }
+
+/**
+ * Create the HIP integration utilities for a context and system.
+ *
+ * In addition to the base class setup, this creates the event and the pinned host
+ * integer (with its device address) that the CCMA convergence check uses.
+ *
+ * @param context  the context to create the utilities for
+ * @param system   the system being simulated
+ * @throws OpenMMException if any of the HIP resources cannot be created
+ */
+HipIntegrationUtilities::HipIntegrationUtilities(HipContext& context, const System& system) : IntegrationUtilities(context, system),
+        ccmaConvergedMemory(NULL) {
+    CHECK_RESULT2(hipEventCreateWithFlags(&ccmaEvent, context.getEventFlags()), "Error creating event for CCMA");
+    try {
+        CHECK_RESULT2(hipHostMalloc((void**) &ccmaConvergedMemory, sizeof(int), context.getHostMallocFlags()), "Error allocating pinned memory");
+        CHECK_RESULT2(hipHostGetDevicePointer(&ccmaConvergedDeviceMemory, ccmaConvergedMemory, 0), "Error getting device address for pinned memory");
+    }
+    catch (...) {
+        // The destructor does not run when the constructor throws, so release whatever
+        // was already acquired before passing the exception on.
+        if (ccmaConvergedMemory != NULL) {
+            (void) hipHostFree(ccmaConvergedMemory);
+            ccmaConvergedMemory = NULL;
+        }
+        (void) hipEventDestroy(ccmaEvent);
+        throw;
+    }
+}
+
+/**
+ * Free the pinned convergence flag and destroy the CCMA event, if they were created.
+ */
+HipIntegrationUtilities::~HipIntegrationUtilities() {
+    ContextSelector selector(context);
+    if (ccmaConvergedMemory == NULL)
+        return;
+    hipHostFree(ccmaConvergedMemory);
+    hipEventDestroy(ccmaEvent);
+}
+
+/**
+ * Get posDelta as a HipArray.
+ */
+HipArray& HipIntegrationUtilities::getPosDelta() {
+    HipContext& hipContext = dynamic_cast<HipContext&>(context);
+    return hipContext.unwrap(posDelta);
+}
+
+/**
+ * Get random as a HipArray.
+ */
+HipArray& HipIntegrationUtilities::getRandom() {
+    HipContext& hipContext = dynamic_cast<HipContext&>(context);
+    return hipContext.unwrap(random);
+}
+
+/**
+ * Get stepSize as a HipArray.
+ */
+HipArray& HipIntegrationUtilities::getStepSize() {
+    HipContext& hipContext = dynamic_cast<HipContext&>(context);
+    return hipContext.unwrap(stepSize);
+}
+
+/**
+ * Run the SETTLE, SHAKE and CCMA constraint kernels, for whichever of them have
+ * constraint data initialized.
+ *
+ * @param constrainVelocities  true to use the velocity versions of the kernels,
+ *                             false to use the position versions
+ * @param tol                  the constraint tolerance; passed to the kernels as a double in
+ *                             double or mixed precision mode and as a float otherwise
+ * @throws OpenMMException if recording or synchronizing on the CCMA event fails
+ */
+void HipIntegrationUtilities::applyConstraintsImpl(bool constrainVelocities, double tol) {
+    ContextSelector selector(context);
+
+    // Default to the position kernels and switch to the velocity ones on request.
+    ComputeKernel settleKernel = settlePosKernel;
+    ComputeKernel shakeKernel = shakePosKernel;
+    ComputeKernel ccmaForceKernel = ccmaPosForceKernel;
+    if (constrainVelocities) {
+        settleKernel = settleVelKernel;
+        shakeKernel = shakeVelKernel;
+        ccmaForceKernel = ccmaVelForceKernel;
+    }
+
+    // Pass the tolerance to a kernel with the type matching the precision mode.
+    auto setTolerance = [&](ComputeKernel& kernel, int index) {
+        if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
+            kernel->setArg(index, tol);
+        else
+            kernel->setArg(index, (float) tol);
+    };
+
+    if (settleAtoms.isInitialized()) {
+        setTolerance(settleKernel, 1);
+        settleKernel->execute(settleAtoms.getSize());
+    }
+    if (shakeAtoms.isInitialized()) {
+        setTolerance(shakeKernel, 1);
+        shakeKernel->execute(shakeAtoms.getSize());
+    }
+    if (!ccmaConstraintAtoms.isInitialized())
+        return;
+
+    if (ccmaConstraintAtoms.getSize() <= 1024) {
+        // Use the version of CCMA that runs in a single kernel with one workgroup.
+        ccmaFullKernel->setArg(0, (int) constrainVelocities);
+        setTolerance(ccmaFullKernel, 14);
+        ccmaFullKernel->execute(128, 128);
+        return;
+    }
+
+    // Otherwise iterate with separate kernels, checking the convergence flag that the
+    // device writes to pinned host memory.
+    ccmaForceKernel->setArg(6, ccmaConvergedDeviceMemory);
+    setTolerance(ccmaForceKernel, 7);
+    ccmaDirectionsKernel->execute(ccmaConstraintAtoms.getSize());
+    const int checkInterval = 4;
+    ccmaConvergedMemory[0] = 0;
+    ccmaUpdateKernel->setArg(4, constrainVelocities ? context.getVelm() : posDelta);
+    for (int iteration = 0; iteration < 150; iteration++) {
+        // Only every checkInterval-th iteration waits on the event and reads the flag.
+        const bool checkConvergence = ((iteration+1)%checkInterval == 0);
+        ccmaForceKernel->setArg(8, iteration);
+        ccmaForceKernel->execute(ccmaConstraintAtoms.getSize());
+        if (checkConvergence) {
+            CHECK_RESULT2(hipEventRecord(ccmaEvent, dynamic_cast<HipContext&>(context).getCurrentStream()), "Error recording event for CCMA");
+        }
+        ccmaMultiplyKernel->setArg(5, iteration);
+        ccmaMultiplyKernel->execute(ccmaConstraintAtoms.getSize());
+        ccmaUpdateKernel->setArg(9, iteration);
+        ccmaUpdateKernel->execute(context.getNumAtoms());
+        if (checkConvergence) {
+            CHECK_RESULT2(hipEventSynchronize(ccmaEvent), "Error synchronizing on event for CCMA");
+            if (ccmaConvergedMemory[0])
+                break;
+        }
+    }
+}
+
+/**
+ * Run vsiteForceKernel once per virtual site stage, from the last stage down to the first.
+ */
+void HipIntegrationUtilities::distributeForcesFromVirtualSites() {
+    ContextSelector selector(context);
+    int stage = numVsiteStages;
+    while (stage-- > 0) {
+        // Here stage runs numVsiteStages-1, ..., 0.
+        vsiteForceKernel->setArg(2, context.getLongForceBuffer());
+        vsiteForceKernel->setArg(15, stage);
+        vsiteForceKernel->execute(numVsites);
+    }
+}
--- a/platforms/hip/src/HipKernel.cpp
+++ b/platforms/hip/src/HipKernel.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2019 Stanford University and the Authors.           *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "HipKernel.h"
+#include "openmm/common/ComputeArray.h"
+#include "openmm/internal/AssertionUtilities.h"
+#include <cstring>
+#include <vector>
+
+using namespace OpenMM;
+using namespace std;
+
+/**
+ * Wrap a compiled HIP function as a kernel object.
+ *
+ * @param context  the context the kernel belongs to
+ * @param kernel   the HIP function to launch
+ * @param name     the kernel's name
+ */
+HipKernel::HipKernel(HipContext& context, hipFunction_t kernel, const string& name)
+        : context(context),
+          kernel(kernel),
+          name(name) {
+    // Nothing else to do: arguments are registered afterwards through the add*/set* methods.
+}
+
+/**
+ * Get the name this kernel was created with.
+ */
+string HipKernel::getName() const {
+    return string(name);
+}
+
+/**
+ * Query the HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK attribute of this kernel.
+ *
+ * @return the value reported by hipFuncGetAttribute()
+ * @throws OpenMMException if the query fails
+ */
+int HipKernel::getMaxBlockSize() const {
+    int maxThreads = 0;
+    const hipError_t status = hipFuncGetAttribute(&maxThreads, HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel);
+    if (status == hipSuccess)
+        return maxThreads;
+    throw OpenMMException("Error querying max thread block size: "+context.getErrorString(status));
+}
+
+/**
+ * Launch the kernel with the arguments registered so far.
+ *
+ * For each argument slot the launch receives either the address of the array's device
+ * pointer (array arguments) or the address of the stored primitive value.
+ *
+ * @param threads    passed to HipContext::executeKernel() as the thread count
+ * @param blockSize  passed to HipContext::executeKernel() as the block size
+ */
+void HipKernel::execute(int threads, int blockSize) {
+    const int argCount = arrayArgs.size();
+    argPointers.resize(argCount);
+    for (int slot = 0; slot < argCount; slot++) {
+        // A NULL array entry marks the slot as holding a primitive value.
+        if (arrayArgs[slot] == NULL)
+            argPointers[slot] = &primitiveArgs[slot];
+        else
+            argPointers[slot] = &arrayArgs[slot]->getDevicePointer();
+    }
+    context.executeKernel(kernel, argPointers.data(), threads, blockSize);
+}
+
+/**
+ * Append a new argument slot and bind an array to it.
+ *
+ * @param value  the array to pass as the new last argument
+ */
+void HipKernel::addArrayArg(ArrayInterface& value) {
+    // The new slot takes the next free index, i.e. the current argument count.
+    const int newSlot = arrayArgs.size();
+    addEmptyArg();
+    setArrayArg(newSlot, value);
+}
+
+/**
+ * Append a new argument slot and store a primitive value in it.
+ *
+ * @param value  pointer to the value to copy
+ * @param size   size of the value in bytes
+ */
+void HipKernel::addPrimitiveArg(const void* value, int size) {
+    // The new slot takes the next free index, i.e. the current argument count.
+    const int newSlot = arrayArgs.size();
+    addEmptyArg();
+    setPrimitiveArg(newSlot, value, size);
+}
+
+/**
+ * Append an unset argument slot: a NULL array entry plus a zeroed double4 placeholder.
+ */
+void HipKernel::addEmptyArg() {
+    // Both vectors grow together so that a slot index is valid in each of them.
+    arrayArgs.push_back(NULL);
+    primitiveArgs.push_back(make_double4(0, 0, 0, 0));
+}
+
+/**
+ * Bind an array to an existing argument slot.
+ *
+ * @param index  the slot to set; checked with ASSERT_VALID_INDEX
+ * @param value  the array to pass for that argument
+ */
+void HipKernel::setArrayArg(int index, ArrayInterface& value) {
+    ASSERT_VALID_INDEX(index, arrayArgs);
+    HipArray& unwrapped = context.unwrap(value);
+    arrayArgs[index] = &unwrapped;
+}
+
+/**
+ * Store a primitive value in an existing argument slot.
+ *
+ * @param index  the slot to set; checked with ASSERT_VALID_INDEX
+ * @param value  pointer to the value to copy
+ * @param size   size of the value in bytes; must not exceed sizeof(double4)
+ * @throws OpenMMException if the value does not fit into a slot
+ */
+void HipKernel::setPrimitiveArg(int index, const void* value, int size) {
+    ASSERT_VALID_INDEX(index, primitiveArgs);
+    if (size <= sizeof(double4)) {
+        memcpy(&primitiveArgs[index], value, size);
+        // Clearing the array entry marks this slot as a primitive argument.
+        arrayArgs[index] = NULL;
+        return;
+    }
+    throw OpenMMException("Unsupported value type for kernel argument");
+}
--- a/platforms/hip/src/HipKernelFactory.cpp
+++ b/platforms/hip/src/HipKernelFactory.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2024 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "HipKernelFactory.h"
+#include "HipKernels.h"
+#include "HipParallelKernels.h"
+#include "HipPlatform.h"
+#include "openmm/common/CommonKernels.h"
+#include "openmm/internal/ContextImpl.h"
+#include "openmm/OpenMMException.h"
+
+using namespace OpenMM;
+
+KernelImpl* HipKernelFactory::createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const {
+    HipPlatform::PlatformData& data = *static_cast<HipPlatform::PlatformData*>(context.getPlatformData());
+    if (data.contexts.size() > 1) {
+        // We are running in parallel on multiple devices, so we may want to create a parallel kernel.
+
+        if (name == CalcForcesAndEnergyKernel::Name())
+            return new HipParallelCalcForcesAndEnergyKernel(name, platform, data);
+        if (name == CalcHarmonicBondForceKernel::Name())
+            return new HipParallelCalcHarmonicBondForceKernel(name, platform, data, context.getSystem());
+        if (name == CalcCustomBondForceKernel::Name())
+            return new HipParallelCalcCustomBondForceKernel(name, platform, data, context.getSystem());
+        if (name == CalcHarmonicAngleForceKernel::Name())
+            return new HipParallelCalcHarmonicAngleForceKernel(name, platform, data, context.getSystem());
+        if (name == CalcCustomAngleForceKernel::Name())
+            return new HipParallelCalcCustomAngleForceKernel(name, platform, data, context.getSystem());
+        if (name == CalcPeriodicTorsionForceKernel::Name())
+            return new HipParallelCalcPeriodicTorsionForceKernel(name, platform, data, context.getSystem());
+        if (name == CalcRBTorsionForceKernel::Name())
+            return new HipParallelCalcRBTorsionForceKernel(name, platform, data, context.getSystem());
+        if (name == CalcCMAPTorsionForceKernel::Name())
+            return new HipParallelCalcCMAPTorsionForceKernel(name, platform, data, context.getSystem());
+        if (name == CalcCustomTorsionForceKernel::Name())
+            return new HipParallelCalcCustomTorsionForceKernel(name, platform, data, context.getSystem());
+        if (name == CalcNonbondedForceKernel::Name())
+            return new HipParallelCalcNonbondedForceKernel(name, platform, data, context.getSystem());
+        if (name == CalcCustomNonbondedForceKernel::Name())
+            return new HipParallelCalcCustomNonbondedForceKernel(name, platform, data, context.getSystem());
+        if (name == CalcCustomExternalForceKernel::Name())
+            return new HipParallelCalcCustomExternalForceKernel(name, platform, data, context.getSystem());
+        if (name == CalcCustomHbondForceKernel::Name())
+            return new HipParallelCalcCustomHbondForceKernel(name, platform, data, context.getSystem());
+        if (name == CalcCustomCompoundBondForceKernel::Name())
+            return new HipParallelCalcCustomCompoundBondForceKernel(name, platform, data, context.getSystem());
+    }
+    HipContext& cu = *data.contexts[0];
+    if (name == CalcForcesAndEnergyKernel::Name())
+        return new HipCalcForcesAndEnergyKernel(name, platform, cu);
+    if (name == UpdateStateDataKernel::Name())
+        return new CommonUpdateStateDataKernel(name, platform, cu);
+    if (name == ApplyConstraintsKernel::Name())
+        return new CommonApplyConstraintsKernel(name, platform, cu);
+    if (name == VirtualSitesKernel::Name())
+        return new CommonVirtualSitesKernel(name, platform, cu);
+    if (name == CalcHarmonicBondForceKernel::Name())
+        return new CommonCalcHarmonicBondForceKernel(name, platform, cu, context.getSystem());
+    if (name == CalcCustomBondForceKernel::Name())
+        return new CommonCalcCustomBondForceKernel(name, platform, cu, context.getSystem());
+    if (name == CalcHarmonicAngleForceKernel::Name())
+        return new CommonCalcHarmonicAngleForceKernel(name, platform, cu, context.getSystem());
+    if (name == CalcCustomAngleForceKernel::Name())
+        return new CommonCalcCustomAngleForceKernel(name, platform, cu, context.getSystem());
+    if (name == CalcPeriodicTorsionForceKernel::Name())
+        return new CommonCalcPeriodicTorsionForceKernel(name, platform, cu, context.getSystem());
+    if (name == CalcRBTorsionForceKernel::Name())
+        return new CommonCalcRBTorsionForceKernel(name, platform, cu, context.getSystem());
+    if (name == CalcCMAPTorsionForceKernel::Name())
+        return new CommonCalcCMAPTorsionForceKernel(name, platform, cu, context.getSystem());
+    if (name == CalcCustomTorsionForceKernel::Name())
+        return new CommonCalcCustomTorsionForceKernel(name, platform, cu, context.getSystem());
+    if (name == CalcNonbondedForceKernel::Name())
+        return new HipCalcNonbondedForceKernel(name, platform, cu, context.getSystem());
+    if (name == CalcCustomNonbondedForceKernel::Name())
+        return new CommonCalcCustomNonbondedForceKernel(name, platform, cu, context.getSystem());
+    if (name == CalcGBSAOBCForceKernel::Name())
+        return new CommonCalcGBSAOBCForceKernel(name, platform, cu);
+    if (name == CalcCustomGBForceKernel::Name())
+        return new CommonCalcCustomGBForceKernel(name, platform, cu, context.getSystem());
+    if (name == CalcCustomExternalForceKernel::Name())
+        return new CommonCalcCustomExternalForceKernel(name, platform, cu, context.getSystem());
+    if (name == CalcCustomHbondForceKernel::Name())
+        return new CommonCalcCustomHbondForceKernel(name, platform, cu, context.getSystem());
+    if (name == CalcCustomCentroidBondForceKernel::Name())
+        return new CommonCalcCustomCentroidBondForceKernel(name, platform, cu, context.getSystem());
+    if (name == CalcCustomCompoundBondForceKernel::Name())
+        return new CommonCalcCustomCompoundBondForceKernel(name, platform, cu, context.getSystem());
+    if (name == CalcCustomCVForceKernel::Name())
+        return new HipCalcCustomCVForceKernel(name, platform, cu);
+    if (name == CalcCustomCPPForceKernel::Name())
+        return new CommonCalcCustomCPPForceKernel(name, platform, context, cu);
+    if (name == CalcRMSDForceKernel::Name())
+        return new CommonCalcRMSDForceKernel(name, platform, cu);
+    if (name == CalcCustomManyParticleForceKernel::Name())
+        return new CommonCalcCustomManyParticleForceKernel(name, platform, cu, context.getSystem());
+    if (name == CalcGayBerneForceKernel::Name())
+        return new CommonCalcGayBerneForceKernel(name, platform, cu);
+    if (name == IntegrateVerletStepKernel::Name())
+        return new CommonIntegrateVerletStepKernel(name, platform, cu);
+    if (name == IntegrateLangevinMiddleStepKernel::Name())
+        return new CommonIntegrateLangevinMiddleStepKernel(name, platform, cu);
+    if (name == IntegrateBrownianStepKernel::Name())
+        return new CommonIntegrateBrownianStepKernel(name, platform, cu);
+    if (name == IntegrateVariableVerletStepKernel::Name())
+        return new CommonIntegrateVariableVerletStepKernel(name, platform, cu);
+    if (name == IntegrateVariableLangevinStepKernel::Name())
+        return new CommonIntegrateVariableLangevinStepKernel(name, platform, cu);
+    if (name == IntegrateCustomStepKernel::Name())
+        return new CommonIntegrateCustomStepKernel(name, platform, cu);
+    if (name == ApplyAndersenThermostatKernel::Name())
+        return new CommonApplyAndersenThermostatKernel(name, platform, cu);
+    if (name == IntegrateNoseHooverStepKernel::Name())
+        return new CommonIntegrateNoseHooverStepKernel(name, platform, cu);
+    if (name == ApplyMonteCarloBarostatKernel::Name())
+        return new CommonApplyMonteCarloBarostatKernel(name, platform, cu);
+    if (name == RemoveCMMotionKernel::Name())
+        return new CommonRemoveCMMotionKernel(name, platform, cu);
+    if (name == CalcATMForceKernel::Name() )
+        return new HipCalcATMForceKernel(name, platform, cu);
+    throw OpenMMException((std::string("Tried to create kernel with illegal kernel name '")+name+"'").c_str());
+}
--- a/platforms/hip/src/HipKernelSources.cpp.in
+++ b/platforms/hip/src/HipKernelSources.cpp.in
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2012 Stanford University and the Authors.           *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "HipKernelSources.h"
+
+using namespace OpenMM;
+using namespace std;
--- a/platforms/hip/src/HipKernelSources.h.in
+++ b/platforms/hip/src/HipKernelSources.h.in
+#ifndef OPENMM_HIPKERNELSOURCES_H_
+#define OPENMM_HIPKERNELSOURCES_H_
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2010-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "openmm/common/windowsExportCommon.h"
+#include <string>
+
+namespace OpenMM {
+
+/**
+ * This class is a central holding place for the source code of HIP kernels.
+ * The CMake build script inserts declarations into it based on the .hip files in the
+ * kernels subfolder.
+ *
+ * This header is a template: the KERNEL_FILE_DECLARATIONS placeholder in the class
+ * body is the spot where those generated declarations are substituted.  Callers refer
+ * to the resulting members by qualified name, e.g. HipKernelSources::vectorOps.
+ */
+
+class OPENMM_EXPORT_COMMON HipKernelSources {
+public:
+@KERNEL_FILE_DECLARATIONS@
+};
+
+} // namespace OpenMM
+
+#endif /*OPENMM_HIPKERNELSOURCES_H_*/
--- a/platforms/hip/src/HipKernels.cpp
+++ b/platforms/hip/src/HipKernels.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2022 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020-2022 Advanced Micro Devices, Inc.              *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "HipKernels.h"
+#include "HipForceInfo.h"
+#include "openmm/Context.h"
+#include "openmm/internal/ContextImpl.h"
+#include "openmm/internal/NonbondedForceImpl.h"
+#include "openmm/common/ContextSelector.h"
+#include "CommonKernelSources.h"
+#include "HipBondedUtilities.h"
+#include "HipExpressionUtilities.h"
+#include "HipIntegrationUtilities.h"
+#include "HipNonbondedUtilities.h"
+#include "HipKernelSources.h"
+#include "SimTKOpenMMRealType.h"
+#include "SimTKOpenMMUtilities.h"
+#include <algorithm>
+#include <cmath>
+#include <iterator>
+#include <set>
+#include <assert.h>
+
+using namespace OpenMM;
+using namespace std;
+
+#define CHECK_RESULT(result, prefix) \
+    if (result != hipSuccess) { \
+        std::stringstream m; \
+        m<<prefix<<": "<<HipContext::getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
+        throw OpenMMException(m.str());\
+    }
+
+void HipCalcForcesAndEnergyKernel::initialize(const System& system) {
+    // Intentionally empty: this kernel performs no per-System setup.
+}
+
+/**
+ * Start a force/energy evaluation: reset per-evaluation state, run the registered
+ * pre-computations, prepare the nonbonded utilities and zero the parameter-derivative
+ * workspace.
+ *
+ * @param context        the context whose parameters define the derivative entries
+ * @param includeForces  forwarded to each pre-computation
+ * @param includeEnergy  forwarded to each pre-computation
+ * @param groups         bit flags of the force groups being evaluated
+ */
+void HipCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForces, bool includeEnergy, int groups) {
+    // Mark the forces as valid; finishComputation() reports back if this flag gets cleared.
+    cu.setForcesValid(true);
+    ContextSelector selector(cu);
+    cu.clearAutoclearBuffers();
+    for (auto preComputation : cu.getPreComputations())
+        preComputation->computeForceAndEnergy(includeForces, includeEnergy, groups);
+
+    HipNonbondedUtilities& nonbonded = cu.getNonbondedUtilities();
+    cu.setComputeForceCount(cu.getComputeForceCount()+1);
+    nonbonded.prepareInteractions(groups);
+
+    // Every context parameter gets a derivative slot that starts at zero.
+    map<string, double>& paramDerivs = cu.getEnergyParamDerivWorkspace();
+    for (auto& entry : context.getParameters())
+        paramDerivs[entry.first] = 0;
+}
+
+/**
+ * Finish a force/energy evaluation: run the bonded and nonbonded interactions, then the
+ * registered post-computations, distribute forces from virtual sites and, if requested,
+ * add the reduced energy.
+ *
+ * @param context        unused by this implementation
+ * @param includeForces  forwarded to the nonbonded utilities and post-computations
+ * @param includeEnergy  forwarded likewise; also controls whether the energy is reduced
+ * @param groups         bit flags of the force groups being evaluated
+ * @param valid          set to false when the context reports invalid forces; otherwise untouched
+ * @return the sum of the post-computation results plus, if includeEnergy, the reduced energy
+ */
+double HipCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForces, bool includeEnergy, int groups, bool& valid) {
+    ContextSelector selector(cu);
+    cu.getBondedUtilities().computeInteractions(groups);
+    cu.getNonbondedUtilities().computeInteractions(groups, includeForces, includeEnergy);
+
+    double energy = 0.0;
+    for (auto postComputation : cu.getPostComputations())
+        energy += postComputation->computeForceAndEnergy(includeForces, includeEnergy, groups);
+
+    cu.getIntegrationUtilities().distributeForcesFromVirtualSites();
+    if (includeEnergy)
+        energy += cu.reduceEnergy();
+
+    // This only ever downgrades the flag; it is never set back to true here.
+    if (!cu.getForcesValid())
+        valid = false;
+    return energy;
+}
+
+/**
+ * Describes a NonbondedForce in terms of particle and group equivalence: particles are
+ * compared by their (charge, sigma, epsilon) parameters and each exception is exposed as
+ * a two-particle group compared by its (chargeProd, sigma, epsilon) parameters.
+ */
+class HipCalcNonbondedForceKernel::ForceInfo : public HipForceInfo {
+public:
+    ForceInfo(const NonbondedForce& force) : force(force) {
+    }
+    /** Two particles are identical when all three per-particle parameters match exactly. */
+    bool areParticlesIdentical(int particle1, int particle2) {
+        double q1, sig1, eps1;
+        double q2, sig2, eps2;
+        force.getParticleParameters(particle1, q1, sig1, eps1);
+        force.getParticleParameters(particle2, q2, sig2, eps2);
+        if (q1 != q2)
+            return false;
+        if (sig1 != sig2)
+            return false;
+        return eps1 == eps2;
+    }
+    /** One group per exception. */
+    int getNumParticleGroups() {
+        return force.getNumExceptions();
+    }
+    /** Fill particles with the two particle indices of exception number index. */
+    void getParticlesInGroup(int index, vector<int>& particles) {
+        int first, second;
+        double chargeProd, sigma, epsilon;
+        force.getExceptionParameters(index, first, second, chargeProd, sigma, epsilon);
+        particles = {first, second};
+    }
+    /** Two exceptions are identical when their parameters match; the particle indices are ignored. */
+    bool areGroupsIdentical(int group1, int group2) {
+        int ignoredA, ignoredB;
+        double qq1, sig1, eps1;
+        double qq2, sig2, eps2;
+        force.getExceptionParameters(group1, ignoredA, ignoredB, qq1, sig1, eps1);
+        force.getExceptionParameters(group2, ignoredA, ignoredB, qq2, sig2, eps2);
+        if (qq1 != qq2)
+            return false;
+        if (sig1 != sig2)
+            return false;
+        return eps1 == eps2;
+    }
+private:
+    const NonbondedForce& force;
+};
+
+/**
+ * IO adapter handed to a CalcPmeReciprocalForceKernel: it supplies the positions/charges
+ * downloaded from the HipContext and adds the forces it receives back into the context's
+ * force buffer through the addForces kernel.
+ */
+class HipCalcNonbondedForceKernel::PmeIO : public CalcPmeReciprocalForceKernel::IO {
+public:
+    PmeIO(HipContext& cu, hipFunction_t addForcesKernel) : cu(cu), addForcesKernel(addForcesKernel) {
+        // Staging buffer with one float4 per atom for the forces passed to setForce().
+        forceTemp.initialize<float4>(cu, cu.getNumAtoms(), "PmeForce");
+    }
+    /** Download posq into host memory and return it viewed as a flat float array. */
+    float* getPosq() {
+        ContextSelector selector(cu);
+        cu.getPosq().download(posq);
+        return reinterpret_cast<float*>(posq.data());
+    }
+    /** Upload the given forces to the staging buffer and launch the kernel that adds them to the context forces. */
+    void setForce(float* force) {
+        forceTemp.upload(force);
+        auto& stagedForcePtr = forceTemp.getDevicePointer();
+        auto& contextForcePtr = cu.getForce().getDevicePointer();
+        void* kernelArgs[] = {&stagedForcePtr, &contextForcePtr};
+        cu.executeKernel(addForcesKernel, kernelArgs, cu.getNumAtoms());
+    }
+private:
+    HipContext& cu;
+    vector<float4> posq;
+    HipArray forceTemp;
+    hipFunction_t addForcesKernel;
+};
+
+/**
+ * Pre-computation that starts the reciprocal-space calculation on the wrapped
+ * CalcPmeReciprocalForceKernel at the beginning of each force evaluation.
+ */
+class HipCalcNonbondedForceKernel::PmePreComputation : public HipContext::ForcePreComputation {
+public:
+    PmePreComputation(HipContext& cu, Kernel& pme, CalcPmeReciprocalForceKernel::IO& io) : cu(cu), pme(pme), io(io) {
+    }
+    void computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
+        // Only the box edge lengths are forwarded: the three vectors are built as a
+        // diagonal (rectangular) box from the context's periodic box size.
+        const auto boxSize = cu.getPeriodicBoxSize();
+        Vec3 boxVectors[3] = {
+            Vec3(boxSize.x, 0, 0),
+            Vec3(0, boxSize.y, 0),
+            Vec3(0, 0, boxSize.z)
+        };
+        auto& reciprocalKernel = pme.getAs<CalcPmeReciprocalForceKernel>();
+        reciprocalKernel.beginComputation(io, boxVectors, includeEnergy);
+    }
+private:
+    HipContext& cu;
+    Kernel pme;
+    CalcPmeReciprocalForceKernel::IO& io;
+};
+
+/**
+ * Post-computation that completes the reciprocal-space calculation started by
+ * PmePreComputation and returns whatever the wrapped kernel's finishComputation() yields.
+ */
+class HipCalcNonbondedForceKernel::PmePostComputation : public HipContext::ForcePostComputation {
+public:
+    PmePostComputation(Kernel& pme, CalcPmeReciprocalForceKernel::IO& io) : pme(pme), io(io) {
+    }
+    double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
+        auto& reciprocalKernel = pme.getAs<CalcPmeReciprocalForceKernel>();
+        return reciprocalKernel.finishComputation(io);
+    }
+private:
+    Kernel pme;
+    CalcPmeReciprocalForceKernel::IO& io;
+};
+
+/**
+ * Pre-computation that makes the given stream wait for the work queued so far on the
+ * context's current stream, whenever the associated force group is being evaluated.
+ */
+class HipCalcNonbondedForceKernel::SyncStreamPreComputation : public HipContext::ForcePreComputation {
+public:
+    SyncStreamPreComputation(HipContext& cu, hipStream_t stream, hipEvent_t event, int forceGroup)
+            : cu(cu), stream(stream), event(event), forceGroup(forceGroup) {
+    }
+    void computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
+        // Nothing to synchronize when this force group is not part of the evaluation.
+        const bool groupRequested = ((groups&(1<<forceGroup)) != 0);
+        if (!groupRequested)
+            return;
+        // Record on the current stream, then make the other stream wait on that event.
+        hipEventRecord(event, cu.getCurrentStream());
+        hipStreamWaitEvent(stream, event, 0);
+    }
+private:
+    HipContext& cu;
+    hipStream_t stream;
+    hipEvent_t event;
+    int forceGroup;
+};
+
+/**
+ * Post-computation that makes the context's current stream wait on the given event and,
+ * when energy is requested, launches the kernel that adds the PME energy buffer into the
+ * context's energy buffer.  It always returns 0.
+ */
+class HipCalcNonbondedForceKernel::SyncStreamPostComputation : public HipContext::ForcePostComputation {
+public:
+    SyncStreamPostComputation(HipContext& cu, hipEvent_t event, hipFunction_t addEnergyKernel, HipArray& pmeEnergyBuffer, int forceGroup)
+            : cu(cu), event(event), addEnergyKernel(addEnergyKernel), pmeEnergyBuffer(pmeEnergyBuffer), forceGroup(forceGroup) {
+    }
+    double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
+        // Skip everything when this force group is not part of the evaluation.
+        const bool groupRequested = ((groups&(1<<forceGroup)) != 0);
+        if (!groupRequested)
+            return 0.0;
+        hipStreamWaitEvent(cu.getCurrentStream(), event, 0);
+        if (!includeEnergy)
+            return 0.0;
+        int bufferSize = pmeEnergyBuffer.getSize();
+        void* kernelArgs[] = {&pmeEnergyBuffer.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(), &bufferSize};
+        cu.executeKernel(addEnergyKernel, kernelArgs, bufferSize);
+        return 0.0;
+    }
+private:
+    HipContext& cu;
+    hipEvent_t event;
+    hipFunction_t addEnergyKernel;
+    HipArray& pmeEnergyBuffer;
+    int forceGroup;
+};
+
+/**
+ * Release the helper objects owned by this kernel and, if the FFT state was initialized
+ * with a separate PME stream in use, destroy that stream and its two events.
+ */
+HipCalcNonbondedForceKernel::~HipCalcNonbondedForceKernel() {
+    ContextSelector selector(cu);
+    // delete on a null pointer is a no-op, so no explicit null checks are required.
+    delete sort;
+    delete fft;
+    delete dispersionFft;
+    delete pmeio;
+    if (hasInitializedFFT && usePmeStream) {
+        hipStreamDestroy(pmeStream);
+        hipEventDestroy(pmeSyncEvent);
+        hipEventDestroy(paramsSyncEvent);
+    }
+}
+
+void HipCalcNonbondedForceKernel::initialize(const System& system, const NonbondedForce& force) {
+    ContextSelector selector(cu);
+    int forceIndex;
+    for (forceIndex = 0; forceIndex < system.getNumForces() && &system.getForce(forceIndex) != &force; ++forceIndex)
+        ;
+    string prefix = "nonbonded"+cu.intToString(forceIndex)+"_";
+
+    // Identify which exceptions are 1-4 interactions.
+
+    set<int> exceptionsWithOffsets;
+    for (int i = 0; i < force.getNumExceptionParameterOffsets(); i++) {
+        string param;
+        int exception;
+        double charge, sigma, epsilon;
+        force.getExceptionParameterOffset(i, param, exception, charge, sigma, epsilon);
+        exceptionsWithOffsets.insert(exception);
+    }
+    vector<pair<int, int> > exclusions;
+    vector<int> exceptions;
+    map<int, int> exceptionIndex;
+    for (int i = 0; i < force.getNumExceptions(); i++) {
+        int particle1, particle2;
+        double chargeProd, sigma, epsilon;
+        force.getExceptionParameters(i, particle1, particle2, chargeProd, sigma, epsilon);
+        exclusions.push_back(pair<int, int>(particle1, particle2));
+        if (chargeProd != 0.0 || epsilon != 0.0 || exceptionsWithOffsets.find(i) != exceptionsWithOffsets.end()) {
+            exceptionIndex[i] = exceptions.size();
+            exceptions.push_back(i);
+        }
+    }
+
+    // Initialize nonbonded interactions.
+
+    int numParticles = force.getNumParticles();
+    vector<float4> baseParticleParamVec(cu.getPaddedNumAtoms(), make_float4(0, 0, 0, 0));
+    vector<vector<int> > exclusionList(numParticles);
+    hasCoulomb = false;
+    hasLJ = false;
+    for (int i = 0; i < numParticles; i++) {
+        double charge, sigma, epsilon;
+        force.getParticleParameters(i, charge, sigma, epsilon);
+        baseParticleParamVec[i] = make_float4(charge, sigma, epsilon, 0);
+        exclusionList[i].push_back(i);
+        if (charge != 0.0)
+            hasCoulomb = true;
+        if (epsilon != 0.0)
+            hasLJ = true;
+    }
+    for (int i = 0; i < force.getNumParticleParameterOffsets(); i++) {
+        string param;
+        int particle;
+        double charge, sigma, epsilon;
+        force.getParticleParameterOffset(i, param, particle, charge, sigma, epsilon);
+        if (charge != 0.0)
+            hasCoulomb = true;
+        if (epsilon != 0.0)
+            hasLJ = true;
+    }
+    for (auto exclusion : exclusions) {
+        exclusionList[exclusion.first].push_back(exclusion.second);
+        exclusionList[exclusion.second].push_back(exclusion.first);
+    }
+    nonbondedMethod = CalcNonbondedForceKernel::NonbondedMethod(force.getNonbondedMethod());
+    bool useCutoff = (nonbondedMethod != NoCutoff);
+    bool usePeriodic = (nonbondedMethod != NoCutoff && nonbondedMethod != CutoffNonPeriodic);
+    doLJPME = (nonbondedMethod == LJPME && hasLJ);
+    usePosqCharges = hasCoulomb ? cu.requestPosqCharges() : false;
+
+    map<string, string> defines;
+    defines["HAS_COULOMB"] = (hasCoulomb ? "1" : "0");
+    defines["HAS_LENNARD_JONES"] = (hasLJ ? "1" : "0");
+    defines["USE_LJ_SWITCH"] = (useCutoff && force.getUseSwitchingFunction() ? "1" : "0");
+    if (useCutoff) {
+        // Compute the reaction field constants.
+
+        double reactionFieldK = pow(force.getCutoffDistance(), -3.0)*(force.getReactionFieldDielectric()-1.0)/(2.0*force.getReactionFieldDielectric()+1.0);
+        double reactionFieldC = (1.0 / force.getCutoffDistance())*(3.0*force.getReactionFieldDielectric())/(2.0*force.getReactionFieldDielectric()+1.0);
+        defines["REACTION_FIELD_K"] = cu.doubleToString(reactionFieldK);
+        defines["REACTION_FIELD_C"] = cu.doubleToString(reactionFieldC);
+
+        // Compute the switching coefficients.
+
+        if (force.getUseSwitchingFunction()) {
+            defines["LJ_SWITCH_CUTOFF"] = cu.doubleToString(force.getSwitchingDistance());
+            defines["LJ_SWITCH_C3"] = cu.doubleToString(10/pow(force.getSwitchingDistance()-force.getCutoffDistance(), 3.0));
+            defines["LJ_SWITCH_C4"] = cu.doubleToString(15/pow(force.getSwitchingDistance()-force.getCutoffDistance(), 4.0));
+            defines["LJ_SWITCH_C5"] = cu.doubleToString(6/pow(force.getSwitchingDistance()-force.getCutoffDistance(), 5.0));
+        }
+    }
+    if (force.getUseDispersionCorrection() && cu.getContextIndex() == 0 && !doLJPME)
+        dispersionCoefficient = NonbondedForceImpl::calcDispersionCorrection(system, force);
+    else
+        dispersionCoefficient = 0.0;
+    alpha = 0;
+    ewaldSelfEnergy = 0.0;
+    map<string, string> paramsDefines;
+    paramsDefines["ONE_4PI_EPS0"] = cu.doubleToString(ONE_4PI_EPS0);
+    hasOffsets = (force.getNumParticleParameterOffsets() > 0 || force.getNumExceptionParameterOffsets() > 0);
+    if (hasOffsets)
+        paramsDefines["HAS_OFFSETS"] = "1";
+    if (force.getNumParticleParameterOffsets() > 0)
+        paramsDefines["HAS_PARTICLE_OFFSETS"] = "1";
+    if (force.getNumExceptionParameterOffsets() > 0)
+        paramsDefines["HAS_EXCEPTION_OFFSETS"] = "1";
+    if (usePosqCharges)
+        paramsDefines["USE_POSQ_CHARGES"] = "1";
+    if (doLJPME)
+        paramsDefines["INCLUDE_LJPME_EXCEPTIONS"] = "1";
+    if (nonbondedMethod == Ewald) {
+        // Compute the Ewald parameters.
+
+        int kmaxx, kmaxy, kmaxz;
+        NonbondedForceImpl::calcEwaldParameters(system, force, alpha, kmaxx, kmaxy, kmaxz);
+        defines["EWALD_ALPHA"] = cu.doubleToString(alpha);
+        defines["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI));
+        defines["USE_EWALD"] = "1";
+        if (cu.getContextIndex() == 0) {
+            paramsDefines["INCLUDE_EWALD"] = "1";
+            paramsDefines["EWALD_SELF_ENERGY_SCALE"] = cu.doubleToString(ONE_4PI_EPS0*alpha/sqrt(M_PI));
+            for (int i = 0; i < numParticles; i++)
+                ewaldSelfEnergy -= baseParticleParamVec[i].x*baseParticleParamVec[i].x*ONE_4PI_EPS0*alpha/sqrt(M_PI);
+
+            // Create the reciprocal space kernels.
+
+            map<string, string> replacements;
+            replacements["NUM_ATOMS"] = cu.intToString(numParticles);
+            replacements["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
+            replacements["KMAX_X"] = cu.intToString(kmaxx);
+            replacements["KMAX_Y"] = cu.intToString(kmaxy);
+            replacements["KMAX_Z"] = cu.intToString(kmaxz);
+            replacements["EXP_COEFFICIENT"] = cu.doubleToString(-1.0/(4.0*alpha*alpha));
+            replacements["ONE_4PI_EPS0"] = cu.doubleToString(ONE_4PI_EPS0);
+            replacements["M_PI"] = cu.doubleToString(M_PI);
+            hipModule_t module = cu.createModule(HipKernelSources::vectorOps+CommonKernelSources::ewald, replacements);
+            ewaldSumsKernel = cu.getKernel(module, "calculateEwaldCosSinSums");
+            ewaldForcesKernel = cu.getKernel(module, "calculateEwaldForces");
+            int elementSize = (cu.getUseDoublePrecision() ? sizeof(double2) : sizeof(float2));
+            cosSinSums.initialize(cu, (2*kmaxx-1)*(2*kmaxy-1)*(2*kmaxz-1), elementSize, "cosSinSums");
+        }
+    }
+    else if (((nonbondedMethod == PME || nonbondedMethod == LJPME) && hasCoulomb) || doLJPME) {
+        // Compute the PME parameters.
+
+        NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSizeX, gridSizeY, gridSizeZ, false);
+        gridSizeX = cu.findLegalFFTDimension(gridSizeX);
+        gridSizeY = cu.findLegalFFTDimension(gridSizeY);
+        gridSizeZ = cu.findLegalFFTDimension(gridSizeZ);
+        if (doLJPME) {
+            NonbondedForceImpl::calcPMEParameters(system, force, dispersionAlpha, dispersionGridSizeX,
+                                                  dispersionGridSizeY, dispersionGridSizeZ, true);
+            dispersionGridSizeX = cu.findLegalFFTDimension(dispersionGridSizeX);
+            dispersionGridSizeY = cu.findLegalFFTDimension(dispersionGridSizeY);
+            dispersionGridSizeZ = cu.findLegalFFTDimension(dispersionGridSizeZ);
+        }
+
+        defines["EWALD_ALPHA"] = cu.doubleToString(alpha);
+        defines["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI));
+        defines["USE_EWALD"] = "1";
+        defines["DO_LJPME"] = doLJPME ? "1" : "0";
+        if (doLJPME) {
+            defines["EWALD_DISPERSION_ALPHA"] = cu.doubleToString(dispersionAlpha);
+            double invRCut6 = pow(force.getCutoffDistance(), -6);
+            double dalphaR = dispersionAlpha * force.getCutoffDistance();
+            double dar2 = dalphaR*dalphaR;
+            double dar4 = dar2*dar2;
+            double multShift6 = -invRCut6*(1.0 - exp(-dar2) * (1.0 + dar2 + 0.5*dar4));
+            defines["INVCUT6"] = cu.doubleToString(invRCut6);
+            defines["MULTSHIFT6"] = cu.doubleToString(multShift6);
+        }
+        if (cu.getContextIndex() == 0) {
+            paramsDefines["INCLUDE_EWALD"] = "1";
+            paramsDefines["EWALD_SELF_ENERGY_SCALE"] = cu.doubleToString(ONE_4PI_EPS0*alpha/sqrt(M_PI));
+            for (int i = 0; i < numParticles; i++)
+                ewaldSelfEnergy -= baseParticleParamVec[i].x*baseParticleParamVec[i].x*ONE_4PI_EPS0*alpha/sqrt(M_PI);
+            if (doLJPME) {
+                paramsDefines["INCLUDE_LJPME"] = "1";
+                paramsDefines["LJPME_SELF_ENERGY_SCALE"] = cu.doubleToString(pow(dispersionAlpha, 6)/3.0);
+                for (int i = 0; i < numParticles; i++)
+                    ewaldSelfEnergy += baseParticleParamVec[i].z*pow(baseParticleParamVec[i].y*dispersionAlpha, 6)/3.0;
+            }
+            usePmeStream = (!cu.getPlatformData().disablePmeStream && !cu.getPlatformData().useCpuPme);
+            map<string, string> pmeDefines;
+            pmeDefines["PME_ORDER"] = cu.intToString(PmeOrder);
+            pmeDefines["NUM_ATOMS"] = cu.intToString(numParticles);
+            pmeDefines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
+            pmeDefines["RECIP_EXP_FACTOR"] = cu.doubleToString(M_PI*M_PI/(alpha*alpha));
+            pmeDefines["GRID_SIZE_X"] = cu.intToString(gridSizeX);
+            pmeDefines["GRID_SIZE_Y"] = cu.intToString(gridSizeY);
+            pmeDefines["GRID_SIZE_Z"] = cu.intToString(gridSizeZ);
+            pmeDefines["EPSILON_FACTOR"] = cu.doubleToString(sqrt(ONE_4PI_EPS0));
+            pmeDefines["M_PI"] = cu.doubleToString(M_PI);
+            if (cu.getUseDoublePrecision() || !cu.getSupportsHardwareFloatGlobalAtomicAdd() || cu.getPlatformData().deterministicForces)
+                pmeDefines["USE_FIXED_POINT_CHARGE_SPREADING"] = "1";
+            if (usePmeStream)
+                pmeDefines["USE_PME_STREAM"] = "1";
+            map<string, string> replacements;
+            replacements["CHARGE"] = (usePosqCharges ? "pos.w" : "charges[atom]");
+            hipModule_t module = cu.createModule(HipKernelSources::vectorOps+cu.replaceStrings(CommonKernelSources::pme, replacements), pmeDefines);
+            if (cu.getPlatformData().useCpuPme && !doLJPME && usePosqCharges) {
+                // Create the CPU PME kernel.
+
+                try {
+                    cpuPme = getPlatform().createKernel(CalcPmeReciprocalForceKernel::Name(), *cu.getPlatformData().context);
+                    cpuPme.getAs<CalcPmeReciprocalForceKernel>().initialize(gridSizeX, gridSizeY, gridSizeZ, numParticles, alpha, cu.getPlatformData().deterministicForces);
+                    hipFunction_t addForcesKernel = cu.getKernel(module, "addForces");
+                    pmeio = new PmeIO(cu, addForcesKernel);
+                    cu.addPreComputation(new PmePreComputation(cu, cpuPme, *pmeio));
+                    cu.addPostComputation(new PmePostComputation(cpuPme, *pmeio));
+                }
+                catch (OpenMMException& ex) {
+                    // The CPU PME plugin isn't available.
+                }
+            }
+            if (pmeio == NULL) {
+                pmeGridIndexKernel = cu.getKernel(module, "findAtomGridIndex");
+                pmeSpreadChargeKernel = cu.getKernel(module, "gridSpreadCharge");
+                pmeConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
+                pmeInterpolateForceKernel = cu.getKernel(module, "gridInterpolateForce");
+                pmeEvalEnergyKernel = cu.getKernel(module, "gridEvaluateEnergy");
+                pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
+                hipFuncSetCacheConfig(pmeSpreadChargeKernel, hipFuncCachePreferShared);
+                hipFuncSetCacheConfig(pmeInterpolateForceKernel, hipFuncCachePreferL1);
+                if (doLJPME) {
+                    pmeDefines["EWALD_ALPHA"] = cu.doubleToString(dispersionAlpha);
+                    pmeDefines["GRID_SIZE_X"] = cu.intToString(dispersionGridSizeX);
+                    pmeDefines["GRID_SIZE_Y"] = cu.intToString(dispersionGridSizeY);
+                    pmeDefines["GRID_SIZE_Z"] = cu.intToString(dispersionGridSizeZ);
+                    pmeDefines["RECIP_EXP_FACTOR"] = cu.doubleToString(M_PI*M_PI/(dispersionAlpha*dispersionAlpha));
+                    pmeDefines["USE_LJPME"] = "1";
+                    pmeDefines["CHARGE_FROM_SIGEPS"] = "1";
+                    module = cu.createModule(HipKernelSources::vectorOps+CommonKernelSources::pme, pmeDefines);
+                    pmeDispersionFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
+                    pmeDispersionGridIndexKernel = cu.getKernel(module, "findAtomGridIndex");
+                    pmeDispersionSpreadChargeKernel = cu.getKernel(module, "gridSpreadCharge");
+                    pmeDispersionConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
+                    pmeEvalDispersionEnergyKernel = cu.getKernel(module, "gridEvaluateEnergy");
+                    pmeInterpolateDispersionForceKernel = cu.getKernel(module, "gridInterpolateForce");
+                    hipFuncSetCacheConfig(pmeDispersionSpreadChargeKernel, hipFuncCachePreferL1);
+                }
+
+                // Create required data structures.
+
+                int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
+                int roundedZSize = PmeOrder*(int) ceil(gridSizeZ/(double) PmeOrder);
+                int gridElements = gridSizeX*gridSizeY*roundedZSize;
+                if (doLJPME) {
+                    roundedZSize = PmeOrder*(int) ceil(dispersionGridSizeZ/(double) PmeOrder);
+                    gridElements = max(gridElements, dispersionGridSizeX*dispersionGridSizeY*roundedZSize);
+                }
+                pmeGrid1.initialize(cu, gridElements, 2*elementSize, "pmeGrid1");
+                pmeGrid2.initialize(cu, gridElements, 2*elementSize, "pmeGrid2");
+                cu.addAutoclearBuffer(pmeGrid2);
+                pmeBsplineModuliX.initialize(cu, gridSizeX, elementSize, "pmeBsplineModuliX");
+                pmeBsplineModuliY.initialize(cu, gridSizeY, elementSize, "pmeBsplineModuliY");
+                pmeBsplineModuliZ.initialize(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
+                if (doLJPME) {
+                    pmeDispersionBsplineModuliX.initialize(cu, dispersionGridSizeX, elementSize, "pmeDispersionBsplineModuliX");
+                    pmeDispersionBsplineModuliY.initialize(cu, dispersionGridSizeY, elementSize, "pmeDispersionBsplineModuliY");
+                    pmeDispersionBsplineModuliZ.initialize(cu, dispersionGridSizeZ, elementSize, "pmeDispersionBsplineModuliZ");
+                }
+                pmeAtomGridIndex.initialize<int2>(cu, numParticles, "pmeAtomGridIndex");
+                int energyElementSize = (cu.getUseDoublePrecision() || cu.getUseMixedPrecision() ? sizeof(double) : sizeof(float));
+                pmeEnergyBuffer.initialize(cu, cu.getNumThreadBlocks()*HipContext::ThreadBlockSize, energyElementSize, "pmeEnergyBuffer");
+                cu.clearBuffer(pmeEnergyBuffer);
+                sort = new HipSort(cu, new SortTrait(), cu.getNumAtoms());
+
+                // Prepare for doing PME on its own stream.
+
+                if (usePmeStream) {
+                    CHECK_RESULT(hipStreamCreateWithFlags(&pmeStream, hipStreamNonBlocking), "Error creating stream for NonbondedForce");
+                    CHECK_RESULT(hipEventCreateWithFlags(&pmeSyncEvent, cu.getEventFlags()), "Error creating event for NonbondedForce");
+                    CHECK_RESULT(hipEventCreateWithFlags(&paramsSyncEvent, cu.getEventFlags()), "Error creating event for NonbondedForce");
+                    int recipForceGroup = force.getReciprocalSpaceForceGroup();
+                    if (recipForceGroup < 0)
+                        recipForceGroup = force.getForceGroup();
+                    cu.addPreComputation(new SyncStreamPreComputation(cu, pmeStream, pmeSyncEvent, recipForceGroup));
+                    cu.addPostComputation(new SyncStreamPostComputation(cu, pmeSyncEvent, cu.getKernel(module, "addEnergy"), pmeEnergyBuffer, recipForceGroup));
+                }
+
+                hipStream_t fftStream = usePmeStream ? pmeStream : cu.getCurrentStream();
+                fft = cu.createFFT(gridSizeX, gridSizeY, gridSizeZ, true, fftStream, pmeGrid1, pmeGrid2);
+                if (doLJPME)
+                    dispersionFft = cu.createFFT(dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ, true, fftStream, pmeGrid1, pmeGrid2);
+                hasInitializedFFT = true;
+
+                // Initialize the b-spline moduli.
+
+                for (int grid = 0; grid < 2; grid++) {
+                    int xsize, ysize, zsize;
+                    HipArray *xmoduli, *ymoduli, *zmoduli;
+                    if (grid == 0) {
+                        xsize = gridSizeX;
+                        ysize = gridSizeY;
+                        zsize = gridSizeZ;
+                        xmoduli = &pmeBsplineModuliX;
+                        ymoduli = &pmeBsplineModuliY;
+                        zmoduli = &pmeBsplineModuliZ;
+                    }
+                    else {
+                        if (!doLJPME)
+                            continue;
+                        xsize = dispersionGridSizeX;
+                        ysize = dispersionGridSizeY;
+                        zsize = dispersionGridSizeZ;
+                        xmoduli = &pmeDispersionBsplineModuliX;
+                        ymoduli = &pmeDispersionBsplineModuliY;
+                        zmoduli = &pmeDispersionBsplineModuliZ;
+                    }
+                    int maxSize = max(max(xsize, ysize), zsize);
+                    vector<double> data(PmeOrder);
+                    vector<double> ddata(PmeOrder);
+                    vector<double> bsplines_data(maxSize);
+                    data[PmeOrder-1] = 0.0;
+                    data[1] = 0.0;
+                    data[0] = 1.0;
+                    for (int i = 3; i < PmeOrder; i++) {
+                        double div = 1.0/(i-1.0);
+                        data[i-1] = 0.0;
+                        for (int j = 1; j < (i-1); j++)
+                            data[i-j-1] = div*(j*data[i-j-2]+(i-j)*data[i-j-1]);
+                        data[0] = div*data[0];
+                    }
+
+                    // Differentiate.
+
+                    ddata[0] = -data[0];
+                    for (int i = 1; i < PmeOrder; i++)
+                        ddata[i] = data[i-1]-data[i];
+                    double div = 1.0/(PmeOrder-1);
+                    data[PmeOrder-1] = 0.0;
+                    for (int i = 1; i < (PmeOrder-1); i++)
+                        data[PmeOrder-i-1] = div*(i*data[PmeOrder-i-2]+(PmeOrder-i)*data[PmeOrder-i-1]);
+                    data[0] = div*data[0];
+                    for (int i = 0; i < maxSize; i++)
+                        bsplines_data[i] = 0.0;
+                    for (int i = 1; i <= PmeOrder; i++)
+                        bsplines_data[i] = data[i-1];
+
+                    // Evaluate the actual bspline moduli for X/Y/Z.
+
+                    for (int dim = 0; dim < 3; dim++) {
+                        int ndata = (dim == 0 ? xsize : dim == 1 ? ysize : zsize);
+                        vector<double> moduli(ndata);
+                        for (int i = 0; i < ndata; i++) {
+                            double sc = 0.0;
+                            double ss = 0.0;
+                            for (int j = 0; j < ndata; j++) {
+                                double arg = (2.0*M_PI*i*j)/ndata;
+                                sc += bsplines_data[j]*cos(arg);
+                                ss += bsplines_data[j]*sin(arg);
+                            }
+                            moduli[i] = sc*sc+ss*ss;
+                        }
+                        for (int i = 0; i < ndata; i++)
+                            if (moduli[i] < 1.0e-7)
+                                moduli[i] = (moduli[(i-1+ndata)%ndata]+moduli[(i+1)%ndata])*0.5;
+                        if (dim == 0)
+                            xmoduli->upload(moduli, true);
+                        else if (dim == 1)
+                            ymoduli->upload(moduli, true);
+                        else
+                            zmoduli->upload(moduli, true);
+                    }
+                }
+            }
+        }
+    }
+
+    // Add code to subtract off the reciprocal part of excluded interactions.
+
+    if ((nonbondedMethod == Ewald || nonbondedMethod == PME || nonbondedMethod == LJPME) && pmeio == NULL) {
+        int numContexts = cu.getPlatformData().contexts.size();
+        int startIndex = cu.getContextIndex()*force.getNumExceptions()/numContexts;
+        int endIndex = (cu.getContextIndex()+1)*force.getNumExceptions()/numContexts;
+        int numExclusions = endIndex-startIndex;
+        if (numExclusions > 0) {
+            paramsDefines["HAS_EXCLUSIONS"] = "1";
+            vector<vector<int> > atoms(numExclusions, vector<int>(2));
+            exclusionAtoms.initialize<int2>(cu, numExclusions, "exclusionAtoms");
+            exclusionParams.initialize<float4>(cu, numExclusions, "exclusionParams");
+            vector<int2> exclusionAtomsVec(numExclusions);
+            for (int i = 0; i < numExclusions; i++) {
+                int j = i+startIndex;
+                exclusionAtomsVec[i] = make_int2(exclusions[j].first, exclusions[j].second);
+                atoms[i][0] = exclusions[j].first;
+                atoms[i][1] = exclusions[j].second;
+            }
+            exclusionAtoms.upload(exclusionAtomsVec);
+            map<string, string> replacements;
+            replacements["PARAMS"] = cu.getBondedUtilities().addArgument(exclusionParams, "float4");
+            replacements["EWALD_ALPHA"] = cu.doubleToString(alpha);
+            replacements["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI));
+            replacements["DO_LJPME"] = doLJPME ? "1" : "0";
+            replacements["USE_PERIODIC"] = force.getExceptionsUsePeriodicBoundaryConditions() ? "1" : "0";
+            if (doLJPME)
+                replacements["EWALD_DISPERSION_ALPHA"] = cu.doubleToString(dispersionAlpha);
+            if (force.getIncludeDirectSpace())
+                cu.getBondedUtilities().addInteraction(atoms, cu.replaceStrings(CommonKernelSources::pmeExclusions, replacements), force.getForceGroup());
+        }
+    }
+
+    // Add the interaction to the default nonbonded kernel.
+
+    string source = cu.replaceStrings(CommonKernelSources::coulombLennardJones, defines);
+    charges.initialize(cu, cu.getPaddedNumAtoms(), cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float), "charges");
+    baseParticleParams.initialize<float4>(cu, cu.getPaddedNumAtoms(), "baseParticleParams");
+    baseParticleParams.upload(baseParticleParamVec);
+    map<string, string> replacements;
+    replacements["ONE_4PI_EPS0"] = cu.doubleToString(ONE_4PI_EPS0);
+    if (usePosqCharges) {
+        replacements["CHARGE1"] = "posq1.w";
+        replacements["CHARGE2"] = "posq2.w";
+    }
+    else {
+        replacements["CHARGE1"] = prefix+"charge1";
+        replacements["CHARGE2"] = prefix+"charge2";
+    }
+    if (hasCoulomb && !usePosqCharges)
+        cu.getNonbondedUtilities().addParameter(HipNonbondedUtilities::ParameterInfo(prefix+"charge", "real", 1, charges.getElementSize(), charges.getDevicePointer()));
+    sigmaEpsilon.initialize<float2>(cu, cu.getPaddedNumAtoms(), "sigmaEpsilon");
+    if (hasLJ) {
+        replacements["SIGMA_EPSILON1"] = prefix+"sigmaEpsilon1";
+        replacements["SIGMA_EPSILON2"] = prefix+"sigmaEpsilon2";
+        cu.getNonbondedUtilities().addParameter(HipNonbondedUtilities::ParameterInfo(prefix+"sigmaEpsilon", "float", 2, sizeof(float2), sigmaEpsilon.getDevicePointer()));
+    }
+    source = cu.replaceStrings(source, replacements);
+    if (force.getIncludeDirectSpace())
+        cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup(), numParticles > 3000, true);
+
+    // Initialize the exceptions.
+
+    int numContexts = cu.getPlatformData().contexts.size();
+    int startIndex = cu.getContextIndex()*exceptions.size()/numContexts;
+    int endIndex = (cu.getContextIndex()+1)*exceptions.size()/numContexts;
+    int numExceptions = endIndex-startIndex;
+    if (numExceptions > 0) {
+        paramsDefines["HAS_EXCEPTIONS"] = "1";
+        exceptionAtoms.resize(numExceptions);
+        vector<vector<int> > atoms(numExceptions, vector<int>(2));
+        exceptionParams.initialize<float4>(cu, numExceptions, "exceptionParams");
+        baseExceptionParams.initialize<float4>(cu, numExceptions, "baseExceptionParams");
+        vector<float4> baseExceptionParamsVec(numExceptions);
+        for (int i = 0; i < numExceptions; i++) {
+            double chargeProd, sigma, epsilon;
+            force.getExceptionParameters(exceptions[startIndex+i], atoms[i][0], atoms[i][1], chargeProd, sigma, epsilon);
+            baseExceptionParamsVec[i] = make_float4(chargeProd, sigma, epsilon, 0);
+            exceptionAtoms[i] = make_pair(atoms[i][0], atoms[i][1]);
+        }
+        baseExceptionParams.upload(baseExceptionParamsVec);
+        map<string, string> replacements;
+        replacements["APPLY_PERIODIC"] = (usePeriodic && force.getExceptionsUsePeriodicBoundaryConditions() ? "1" : "0");
+        replacements["PARAMS"] = cu.getBondedUtilities().addArgument(exceptionParams, "float4");
+        if (force.getIncludeDirectSpace())
+            cu.getBondedUtilities().addInteraction(atoms, cu.replaceStrings(CommonKernelSources::nonbondedExceptions, replacements), force.getForceGroup());
+    }
+
+    // Initialize parameter offsets.
+
+    vector<vector<float4> > particleOffsetVec(force.getNumParticles());
+    vector<vector<float4> > exceptionOffsetVec(numExceptions);
+    for (int i = 0; i < force.getNumParticleParameterOffsets(); i++) {
+        string param;
+        int particle;
+        double charge, sigma, epsilon;
+        force.getParticleParameterOffset(i, param, particle, charge, sigma, epsilon);
+        auto paramPos = find(paramNames.begin(), paramNames.end(), param);
+        int paramIndex;
+        if (paramPos == paramNames.end()) {
+            paramIndex = paramNames.size();
+            paramNames.push_back(param);
+        }
+        else
+            paramIndex = paramPos-paramNames.begin();
+        particleOffsetVec[particle].push_back(make_float4(charge, sigma, epsilon, paramIndex));
+    }
+    for (int i = 0; i < force.getNumExceptionParameterOffsets(); i++) {
+        string param;
+        int exception;
+        double charge, sigma, epsilon;
+        force.getExceptionParameterOffset(i, param, exception, charge, sigma, epsilon);
+        int index = exceptionIndex[exception];
+        if (index < startIndex || index >= endIndex)
+            continue;
+        auto paramPos = find(paramNames.begin(), paramNames.end(), param);
+        int paramIndex;
+        if (paramPos == paramNames.end()) {
+            paramIndex = paramNames.size();
+            paramNames.push_back(param);
+        }
+        else
+            paramIndex = paramPos-paramNames.begin();
+        exceptionOffsetVec[index-startIndex].push_back(make_float4(charge, sigma, epsilon, paramIndex));
+    }
+    paramValues.resize(paramNames.size(), 0.0);
+    particleParamOffsets.initialize<float4>(cu, max(force.getNumParticleParameterOffsets(), 1), "particleParamOffsets");
+    particleOffsetIndices.initialize<int>(cu, cu.getPaddedNumAtoms()+1, "particleOffsetIndices");
+    vector<int> particleOffsetIndicesVec, exceptionOffsetIndicesVec;
+    vector<float4> p, e;
+    for (int i = 0; i < particleOffsetVec.size(); i++) {
+        particleOffsetIndicesVec.push_back(p.size());
+        for (int j = 0; j < particleOffsetVec[i].size(); j++)
+            p.push_back(particleOffsetVec[i][j]);
+    }
+    while (particleOffsetIndicesVec.size() < particleOffsetIndices.getSize())
+        particleOffsetIndicesVec.push_back(p.size());
+    for (int i = 0; i < exceptionOffsetVec.size(); i++) {
+        exceptionOffsetIndicesVec.push_back(e.size());
+        for (int j = 0; j < exceptionOffsetVec[i].size(); j++)
+            e.push_back(exceptionOffsetVec[i][j]);
+    }
+    exceptionOffsetIndicesVec.push_back(e.size());
+    if (force.getNumParticleParameterOffsets() > 0) {
+        particleParamOffsets.upload(p);
+        particleOffsetIndices.upload(particleOffsetIndicesVec);
+    }
+    exceptionParamOffsets.initialize<float4>(cu, max((int) e.size(), 1), "exceptionParamOffsets");
+    exceptionOffsetIndices.initialize<int>(cu, exceptionOffsetIndicesVec.size(), "exceptionOffsetIndices");
+    if (e.size() > 0) {
+        exceptionParamOffsets.upload(e);
+        exceptionOffsetIndices.upload(exceptionOffsetIndicesVec);
+    }
+    globalParams.initialize(cu, max((int) paramValues.size(), 1), cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float), "globalParams");
+    if (paramValues.size() > 0)
+        globalParams.upload(paramValues, true);
+    recomputeParams = true;
+
+    // Initialize the kernel for updating parameters.
+
+    hipModule_t module = cu.createModule(CommonKernelSources::nonbondedParameters, paramsDefines);
+    computeParamsKernel = cu.getKernel(module, "computeParameters");
+    computeExclusionParamsKernel = cu.getKernel(module, "computeExclusionParameters");
+    info = new ForceInfo(force);
+    cu.addForce(info);
+}
+
+double HipCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal) {
+    // Refreshes the device-side parameters when needed, launches the Ewald/PME reciprocal-space kernels, and returns the energy terms that are accumulated on the host (Ewald self energy and dispersion correction). includeForces is not read anywhere in this function.
+    // Update particle and exception parameters: start by pulling the current value of every global parameter from the Context.
+    ContextSelector selector(cu);
+    bool paramChanged = false;
+    for (int i = 0; i < paramNames.size(); i++) {
+        double value = context.getParameter(paramNames[i]);
+        if (value != paramValues[i]) {
+            paramValues[i] = value;; // NOTE(review): the doubled semicolon is just an empty statement.
+            paramChanged = true;
+        }
+    }
+    if (paramChanged) { // At least one global parameter differs from the cached copy, so re-upload all of them.
+        recomputeParams = true;
+        globalParams.upload(paramValues, true);
+    }
+    double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0); // Host-side energy contribution that is returned to the caller.
+    if (recomputeParams || hasOffsets) { // With hasOffsets set this branch runs on every call, not only when recomputeParams is set.
+        int computeSelfEnergy = (includeEnergy && includeReciprocal);
+        int numAtoms = cu.getPaddedNumAtoms();
+        vector<void*> paramsArgs = {&cu.getEnergyBuffer().getDevicePointer(), &computeSelfEnergy, &globalParams.getDevicePointer(), &numAtoms,
+                &baseParticleParams.getDevicePointer(), &cu.getPosq().getDevicePointer(), &charges.getDevicePointer(), &sigmaEpsilon.getDevicePointer(),
+                &particleParamOffsets.getDevicePointer(), &particleOffsetIndices.getDevicePointer()};
+        int numExceptions; // Declared outside the if so that the address pushed into paramsArgs is still valid at the launch below.
+        if (exceptionParams.isInitialized()) { // The exception arguments are appended only when exceptionParams has been initialized.
+            numExceptions = exceptionParams.getSize();
+            paramsArgs.push_back(&numExceptions);
+            paramsArgs.push_back(&baseExceptionParams.getDevicePointer());
+            paramsArgs.push_back(&exceptionParams.getDevicePointer());
+            paramsArgs.push_back(&exceptionParamOffsets.getDevicePointer());
+            paramsArgs.push_back(&exceptionOffsetIndices.getDevicePointer());
+        }
+        cu.executeKernel(computeParamsKernel, &paramsArgs[0], cu.getPaddedNumAtoms());
+        if (exclusionParams.isInitialized()) { // Launched after computeParamsKernel; it is passed the charges and sigmaEpsilon arrays that kernel was also given.
+            int numExclusions = exclusionParams.getSize();
+            vector<void*> exclusionParamsArgs = {&cu.getPosq().getDevicePointer(), &charges.getDevicePointer(), &sigmaEpsilon.getDevicePointer(),
+                    &numExclusions, &exclusionAtoms.getDevicePointer(), &exclusionParams.getDevicePointer()};
+            cu.executeKernel(computeExclusionParamsKernel, &exclusionParamsArgs[0], numExclusions);
+        }
+        if (usePmeStream) { // Make pmeStream wait for the parameter kernels queued on the current stream. NOTE(review): the return codes of these two HIP calls are not checked.
+            hipEventRecord(paramsSyncEvent, cu.getCurrentStream());
+            hipStreamWaitEvent(pmeStream, paramsSyncEvent, 0);
+        }
+        if (hasOffsets)
+            energy = 0.0; // The Ewald self energy was computed in the kernel.
+        recomputeParams = false;
+    }
+
+    // Do reciprocal space calculations.
+
+    if (cosSinSums.isInitialized() && includeReciprocal) { // Taken when cosSinSums was allocated (presumably the plain Ewald method; confirm in initialize()).
+        void* sumsArgs[] = {&cu.getEnergyBuffer().getDevicePointer(), &cu.getPosq().getDevicePointer(), &cosSinSums.getDevicePointer(), cu.getPeriodicBoxSizePointer()};
+        cu.executeKernel(ewaldSumsKernel, sumsArgs, cosSinSums.getSize());
+        void* forcesArgs[] = {&cu.getForce().getDevicePointer(), &cu.getPosq().getDevicePointer(), &cosSinSums.getDevicePointer(), cu.getPeriodicBoxSizePointer()};
+        cu.executeKernel(ewaldForcesKernel, forcesArgs, cu.getNumAtoms());
+    }
+    if (pmeGrid1.isInitialized() && includeReciprocal) { // Taken when the PME grids were allocated.
+        if (usePmeStream)
+            cu.setCurrentStream(pmeStream); // Presumably routes the launches below to pmeStream until restoreDefaultStream() is called; confirm in HipContext.
+
+        // Invert the periodic box vectors.
+        // Only the diagonal entries enter the determinant and the upper-right reciprocal entries are set to zero. NOTE(review): this presumably relies on the box vectors being lower triangular; confirm.
+        Vec3 boxVectors[3];
+        cu.getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
+        double determinant = boxVectors[0][0]*boxVectors[1][1]*boxVectors[2][2];
+        double scale = 1.0/determinant;
+        double4 recipBoxVectors[3];
+        recipBoxVectors[0] = make_double4(boxVectors[1][1]*boxVectors[2][2]*scale, 0, 0, 0);
+        recipBoxVectors[1] = make_double4(-boxVectors[1][0]*boxVectors[2][2]*scale, boxVectors[0][0]*boxVectors[2][2]*scale, 0, 0);
+        recipBoxVectors[2] = make_double4((boxVectors[1][0]*boxVectors[2][1]-boxVectors[1][1]*boxVectors[2][0])*scale, -boxVectors[0][0]*boxVectors[2][1]*scale, boxVectors[0][0]*boxVectors[1][1]*scale, 0);
+        float4 recipBoxVectorsFloat[3];
+        void* recipBoxVectorPointer[3]; // Points at either the double4 or the float4 copies, chosen by the precision check below.
+        if (cu.getUseDoublePrecision()) {
+            recipBoxVectorPointer[0] = &recipBoxVectors[0];
+            recipBoxVectorPointer[1] = &recipBoxVectors[1];
+            recipBoxVectorPointer[2] = &recipBoxVectors[2];
+        }
+        else { // Not double precision: pass float4 copies of the reciprocal vectors instead.
+            recipBoxVectorsFloat[0] = make_float4((float) recipBoxVectors[0].x, 0, 0, 0);
+            recipBoxVectorsFloat[1] = make_float4((float) recipBoxVectors[1].x, (float) recipBoxVectors[1].y, 0, 0);
+            recipBoxVectorsFloat[2] = make_float4((float) recipBoxVectors[2].x, (float) recipBoxVectors[2].y, (float) recipBoxVectors[2].z, 0);
+            recipBoxVectorPointer[0] = &recipBoxVectorsFloat[0];
+            recipBoxVectorPointer[1] = &recipBoxVectorsFloat[1];
+            recipBoxVectorPointer[2] = &recipBoxVectorsFloat[2];
+        }
+
+        // Execute the reciprocal space kernels.
+
+        if (hasCoulomb) { // Electrostatic PME: grid index, sort, spread, FFT, optional energy, convolution, FFT, force interpolation.
+            void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
+                    cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
+                    recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
+            cu.executeKernelFlat(pmeGridIndexKernel, gridIndexArgs, cu.getNumAtoms());
+
+            sort->sort(pmeAtomGridIndex);
+
+            void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &pmeGrid2.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
+                    cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
+                    recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
+                    &charges.getDevicePointer()};
+            cu.executeKernelFlat(pmeSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128);
+
+            void* finishSpreadArgs[] = {&pmeGrid2.getDevicePointer(), &pmeGrid1.getDevicePointer()};
+            cu.executeKernelFlat(pmeFinishSpreadChargeKernel, finishSpreadArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
+
+            fft->execFFT(true); // Presumably the forward transform; confirm against the FFT class.
+
+            if (includeEnergy) { // With a separate PME stream the energy goes to pmeEnergyBuffer instead of the main energy buffer.
+                void* computeEnergyArgs[] = {&pmeGrid2.getDevicePointer(), usePmeStream ? &pmeEnergyBuffer.getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(),
+                        &pmeBsplineModuliX.getDevicePointer(), &pmeBsplineModuliY.getDevicePointer(), &pmeBsplineModuliZ.getDevicePointer(),
+                        recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
+                cu.executeKernel(pmeEvalEnergyKernel, computeEnergyArgs, gridSizeX*gridSizeY*gridSizeZ);
+            }
+
+            void* convolutionArgs[] = {&pmeGrid2.getDevicePointer(), &pmeBsplineModuliX.getDevicePointer(),
+                    &pmeBsplineModuliY.getDevicePointer(), &pmeBsplineModuliZ.getDevicePointer(),
+                    recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
+            cu.executeKernel(pmeConvolutionKernel, convolutionArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
+
+            fft->execFFT(false); // Presumably the inverse transform; confirm against the FFT class.
+
+            void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &pmeGrid1.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
+                    cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
+                    recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
+                    &charges.getDevicePointer()};
+            cu.executeKernel(pmeInterpolateForceKernel, interpolateArgs, cu.getNumAtoms(), 128);
+        }
+
+        if (doLJPME && hasLJ) { // Dispersion PME: same pipeline on the dispersion grid, fed with sigmaEpsilon instead of charges.
+            if (!hasCoulomb) { // The Coulomb branch above did not run, so the grid indices have not been computed and sorted yet.
+                void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
+                        cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
+                        recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
+                cu.executeKernelFlat(pmeDispersionGridIndexKernel, gridIndexArgs, cu.getNumAtoms());
+
+                sort->sort(pmeAtomGridIndex);
+                cu.clearBuffer(pmeEnergyBuffer);
+            }
+
+            cu.clearBuffer(pmeGrid2); // pmeGrid2 is shared with the Coulomb pass, so it is cleared before spreading onto it again.
+            void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &pmeGrid2.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
+                    cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
+                    recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
+                    &sigmaEpsilon.getDevicePointer()};
+            cu.executeKernelFlat(pmeDispersionSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128);
+
+            void* finishSpreadArgs[] = {&pmeGrid2.getDevicePointer(), &pmeGrid1.getDevicePointer()};
+            cu.executeKernelFlat(pmeDispersionFinishSpreadChargeKernel, finishSpreadArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
+
+            dispersionFft->execFFT(true); // Presumably the forward transform; confirm against the FFT class.
+
+            if (includeEnergy) { // Same energy-buffer selection as in the Coulomb branch.
+                void* computeEnergyArgs[] = {&pmeGrid2.getDevicePointer(), usePmeStream ? &pmeEnergyBuffer.getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(),
+                        &pmeDispersionBsplineModuliX.getDevicePointer(), &pmeDispersionBsplineModuliY.getDevicePointer(), &pmeDispersionBsplineModuliZ.getDevicePointer(),
+                        recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
+                cu.executeKernel(pmeEvalDispersionEnergyKernel, computeEnergyArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ);
+            }
+
+            void* convolutionArgs[] = {&pmeGrid2.getDevicePointer(), &pmeDispersionBsplineModuliX.getDevicePointer(),
+                    &pmeDispersionBsplineModuliY.getDevicePointer(), &pmeDispersionBsplineModuliZ.getDevicePointer(),
+                    recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
+            cu.executeKernel(pmeDispersionConvolutionKernel, convolutionArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
+
+            dispersionFft->execFFT(false); // Presumably the inverse transform; confirm against the FFT class.
+
+            void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &pmeGrid1.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
+                    cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
+                    recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
+                    &sigmaEpsilon.getDevicePointer()};
+            cu.executeKernel(pmeInterpolateDispersionForceKernel, interpolateArgs, cu.getNumAtoms(), 128);
+        }
+        if (usePmeStream) { // Mark the end of the PME work on pmeStream, then switch back to the default stream. NOTE(review): the hipEventRecord result is not checked.
+            hipEventRecord(pmeSyncEvent, pmeStream);
+            cu.restoreDefaultStream();
+        }
+    }
+
+    if (dispersionCoefficient != 0.0 && includeDirect) { // Dispersion correction: the coefficient divided by the product of the three periodic box sizes.
+        double4 boxSize = cu.getPeriodicBoxSize();
+        energy += dispersionCoefficient/(boxSize.x*boxSize.y*boxSize.z);
+    }
+    return energy;
+}
+
+void HipCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context, const NonbondedForce& force) {
+    // Re-reads the base per-particle and per-exception parameters from `force`, uploads them to the
+    // device, and recomputes the host-side Ewald self energy and dispersion coefficient.
+    // Throws OpenMMException if the number of particles changed, if a charge (or epsilon) is nonzero
+    // although the kernel was built without Coulomb (or Lennard-Jones) interactions, or if the
+    // exceptions handled by this context no longer match the atom pairs stored in exceptionAtoms.
+
+    // Make sure the new parameters are acceptable.
+
+    ContextSelector selector(cu);
+    if (force.getNumParticles() != cu.getNumAtoms())
+        throw OpenMMException("updateParametersInContext: The number of particles has changed");
+    if (!hasCoulomb || !hasLJ) {
+        for (int i = 0; i < force.getNumParticles(); i++) {
+            double charge, sigma, epsilon;
+            force.getParticleParameters(i, charge, sigma, epsilon);
+            if (!hasCoulomb && charge != 0.0)
+                throw OpenMMException("updateParametersInContext: The nonbonded force kernel does not include Coulomb interactions, because all charges were originally 0");
+            if (!hasLJ && epsilon != 0.0)
+                throw OpenMMException("updateParametersInContext: The nonbonded force kernel does not include Lennard-Jones interactions, because all epsilons were originally 0");
+        }
+    }
+
+    // An exception is kept if it has a nonzero chargeProd or epsilon, or if a parameter offset targets it.
+
+    set<int> exceptionsWithOffsets;
+    for (int i = 0; i < force.getNumExceptionParameterOffsets(); i++) {
+        string param;
+        int exception;
+        double charge, sigma, epsilon;
+        force.getExceptionParameterOffset(i, param, exception, charge, sigma, epsilon);
+        exceptionsWithOffsets.insert(exception);
+    }
+    vector<int> exceptions;
+    for (int i = 0; i < force.getNumExceptions(); i++) {
+        int particle1, particle2;
+        double chargeProd, sigma, epsilon;
+        force.getExceptionParameters(i, particle1, particle2, chargeProd, sigma, epsilon);
+        if (chargeProd != 0.0 || epsilon != 0.0 || exceptionsWithOffsets.find(i) != exceptionsWithOffsets.end())
+            exceptions.push_back(i);
+    }
+
+    // This context handles the slice [startIndex, endIndex) of the kept exceptions.
+
+    int numContexts = cu.getPlatformData().contexts.size();
+    int startIndex = cu.getContextIndex()*exceptions.size()/numContexts;
+    int endIndex = (cu.getContextIndex()+1)*exceptions.size()/numContexts;
+    int numExceptions = endIndex-startIndex;
+    if (numExceptions != exceptionAtoms.size())
+        throw OpenMMException("updateParametersInContext: The set of non-excluded exceptions has changed");
+
+    // Record the per-particle parameters. Entries beyond getNumParticles() (padding) stay zero.
+
+    vector<float4> baseParticleParamVec(cu.getPaddedNumAtoms(), make_float4(0, 0, 0, 0));
+    for (int i = 0; i < force.getNumParticles(); i++) {
+        double charge, sigma, epsilon;
+        force.getParticleParameters(i, charge, sigma, epsilon);
+        baseParticleParamVec[i] = make_float4(charge, sigma, epsilon, 0);
+    }
+    baseParticleParams.upload(baseParticleParamVec);
+
+    // Record the exceptions. The atom pair of each one must match what exceptionAtoms already holds.
+
+    if (numExceptions > 0) {
+        vector<float4> baseExceptionParamsVec(numExceptions);
+        for (int i = 0; i < numExceptions; i++) {
+            int particle1, particle2;
+            double chargeProd, sigma, epsilon;
+            force.getExceptionParameters(exceptions[startIndex+i], particle1, particle2, chargeProd, sigma, epsilon);
+            if (make_pair(particle1, particle2) != exceptionAtoms[i])
+                throw OpenMMException("updateParametersInContext: The set of non-excluded exceptions has changed");
+            baseExceptionParamsVec[i] = make_float4(chargeProd, sigma, epsilon, 0);
+        }
+        baseExceptionParams.upload(baseExceptionParamsVec);
+    }
+
+    // Compute other values. The self energy is only accumulated on context 0.
+
+    ewaldSelfEnergy = 0.0;
+    if (nonbondedMethod == Ewald || nonbondedMethod == PME || nonbondedMethod == LJPME) {
+        if (cu.getContextIndex() == 0) {
+            for (int i = 0; i < force.getNumParticles(); i++) {
+                ewaldSelfEnergy -= baseParticleParamVec[i].x*baseParticleParamVec[i].x*ONE_4PI_EPS0*alpha/sqrt(M_PI);
+                if (doLJPME)
+                    ewaldSelfEnergy += baseParticleParamVec[i].z*pow(baseParticleParamVec[i].y*dispersionAlpha, 6)/3.0;
+            }
+        }
+    }
+    if (force.getUseDispersionCorrection() && cu.getContextIndex() == 0 && (nonbondedMethod == CutoffPeriodic || nonbondedMethod == Ewald || nonbondedMethod == PME))
+        dispersionCoefficient = NonbondedForceImpl::calcDispersionCorrection(context.getSystem(), force);
+    cu.invalidateMolecules();
+
+    // Setting this flag makes the next execute() relaunch the parameter kernels.
+
+    recomputeParams = true;
+}
+
+/**
+ * Report the PME parameters that are in use for this kernel.
+ *
+ * @param alpha  receives the alpha value
+ * @param nx     receives the grid size along x
+ * @param ny     receives the grid size along y
+ * @param nz     receives the grid size along z
+ * @throws OpenMMException if the nonbonded method is not PME
+ */
+void HipCalcNonbondedForceKernel::getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
+    if (nonbondedMethod != PME)
+        throw OpenMMException("getPMEParametersInContext: This Context is not using PME");
+    if (!cu.getPlatformData().useCpuPme) {
+        // Nothing is delegated to the CPU PME kernel, so report the values stored on this object.
+        alpha = this->alpha;
+        nx = gridSizeX;
+        ny = gridSizeY;
+        nz = gridSizeZ;
+        return;
+    }
+    // The CPU PME kernel is in use, so ask it for the parameters.
+    cpuPme.getAs<CalcPmeReciprocalForceKernel>().getPMEParameters(alpha, nx, ny, nz);
+}
+
+/**
+ * Report the parameters that are in use for the dispersion (LJ) PME calculation.
+ *
+ * @param alpha  receives the dispersion alpha value
+ * @param nx     receives the dispersion grid size along x
+ * @param ny     receives the dispersion grid size along y
+ * @param nz     receives the dispersion grid size along z
+ * @throws OpenMMException if LJPME is not in use, or if the platform is configured to use the
+ *         CPU PME implementation (which does not support LJPME)
+ */
+void HipCalcNonbondedForceKernel::getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
+    // The messages name the LJPME query explicitly so that a caller who is using plain PME is not
+    // told, misleadingly, that the Context "is not using PME".
+    if (!doLJPME)
+        throw OpenMMException("getLJPMEParametersInContext: This Context is not using LJPME");
+    if (cu.getPlatformData().useCpuPme)
+        throw OpenMMException("getLJPMEParametersInContext: CPUPME has not been implemented for LJPME yet.");
+    alpha = this->dispersionAlpha;
+    nx = dispersionGridSizeX;
+    ny = dispersionGridSizeY;
+    nz = dispersionGridSizeZ;
+}
--- a/platforms/hip/src/HipNonbondedUtilities.cpp
+++ b/platforms/hip/src/HipNonbondedUtilities.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009-2023 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc.              *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "openmm/OpenMMException.h"
+#include "HipNonbondedUtilities.h"
+#include "HipArray.h"
+#include "HipContext.h"
+#include "HipKernelSources.h"
+#include "HipExpressionUtilities.h"
+#include "HipSort.h"
+#include <algorithm>
+#include <map>
+#include <set>
+#include <utility>
+
+using namespace OpenMM;
+using namespace std;
+
+// Evaluate a HIP call and throw an OpenMMException describing the failure if it did not return
+// hipSuccess.  The argument is evaluated exactly once: its value is captured in a local so that
+// building the error message does not re-execute the call.  A string named "errorMessage" and
+// an object named "context" providing getErrorString() must be in scope where the macro is used.
+#define CHECK_RESULT(result) \
+    { \
+        const hipError_t checkedResult = (result); \
+        if (checkedResult != hipSuccess) { \
+            std::stringstream m; \
+            m<<errorMessage<<": "<<context.getErrorString(checkedResult)<<" ("<<checkedResult<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
+            throw OpenMMException(m.str());\
+        } \
+    }
+
+
+/**
+ * Sort trait used with HipSort to sort atom blocks.  Both the data and the key are described to
+ * the sorter as "unsigned int", and each value serves as its own sort key.
+ */
+class HipNonbondedUtilities::BlockSortTrait : public HipSort::SortTrait {
+public:
+    BlockSortTrait() {
+    }
+    // Sizes, in bytes, of one data element and of one key.
+    int getDataSize() const {
+        return sizeof(int);
+    }
+    int getKeySize() const {
+        return sizeof(int);
+    }
+    // Type names for the data elements and the keys.
+    const char* getDataType() const {
+        return "unsigned int";
+    }
+    const char* getKeyType() const {
+        return "unsigned int";
+    }
+    // Smallest and largest key, and the largest data value.
+    const char* getMinKey() const {
+        return "0";
+    }
+    const char* getMaxKey() const {
+        return "0xFFFFFFFFu";
+    }
+    const char* getMaxValue() const {
+        return "0xFFFFFFFFu";
+    }
+    // Expression that yields the key for an element named "value".
+    const char* getSortKey() const {
+        return "value";
+    }
+};
+
+/**
+ * Create the nonbonded utilities object for a context.  This allocates the event and the pinned
+ * host buffer that are later used to read the interaction counts back from the device, and picks
+ * the launch sizes for the force and neighbor list kernels.
+ *
+ * @param context  the context this object belongs to
+ * @throws OpenMMException if the event or the pinned buffer cannot be created
+ */
+HipNonbondedUtilities::HipNonbondedUtilities(HipContext& context) : context(context), useCutoff(false), usePeriodic(false), useNeighborList(false), anyExclusions(false), usePadding(true),
+        blockSorter(nullptr), pinnedCountBuffer(nullptr), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0), canUsePairList(true), tilesAfterReorder(0) {
+    // Create the event and the pinned buffer (two unsigned ints) used for downloading counts.
+
+    string errorMessage = "Error initializing nonbonded utilities";
+    CHECK_RESULT(hipEventCreateWithFlags(&downloadCountEvent, context.getEventFlags()));
+    CHECK_RESULT(hipHostMalloc((void**) &pinnedCountBuffer, 2*sizeof(unsigned int), context.getHostMallocFlags()));
+
+    // Decide how many thread blocks to use.
+
+    forceThreadBlockSize = 64;
+    numForceThreadBlocks = 5*4*context.getMultiprocessors();
+    findInteractingBlocksThreadBlockSize = context.getSIMDWidth();
+
+    // When building the neighbor list, we can optionally use large blocks (32 * warpSize atoms) to
+    // accelerate the process.  This makes building the neighbor list faster, but it prevents
+    // us from sorting atom blocks by size, which leads to a slightly less efficient neighbor
+    // list.  We guess based on system size which will be faster.
+
+    const int largeBlockAtomThreshold = 90000;
+    useLargeBlocks = (context.getNumAtoms() > largeBlockAtomThreshold);
+    setKernelSource(HipKernelSources::nonbonded);
+}
+
+/**
+ * Release the block sorter, the pinned host buffer and the download event.
+ */
+HipNonbondedUtilities::~HipNonbondedUtilities() {
+    delete blockSorter; // deleting a null pointer does nothing, so no check is needed
+    if (pinnedCountBuffer != nullptr)
+        hipHostFree(pinnedCountBuffer);
+    hipEventDestroy(downloadCountEvent);
+}
+
+/**
+ * Register a nonbonded interaction that does not support the pair list.  All arguments are
+ * forwarded unchanged to the overload that takes an additional supportsPairList flag.
+ */
+void HipNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup, bool usesNeighborList) {
+    const bool supportsPairList = false;
+    addInteraction(usesCutoff, usesPeriodic, usesExclusions, cutoffDistance, exclusionList, kernel, forceGroup, usesNeighborList, supportsPairList);
+}
+
+/**
+ * Register a nonbonded interaction.
+ *
+ * @param usesCutoff        whether the interaction uses a cutoff
+ * @param usesPeriodic      whether the interaction uses periodic boundary conditions
+ * @param usesExclusions    whether exclusionList should be registered through requestExclusions()
+ * @param cutoffDistance    the cutoff distance recorded for forceGroup
+ * @param exclusionList     for each atom, the atoms excluded from interacting with it
+ * @param kernel            source code for the interaction; may be empty, in which case no source is added
+ * @param forceGroup        the force group the interaction belongs to
+ * @param usesNeighborList  whether the interaction wants a neighbor list (only honored with a cutoff)
+ * @param supportsPairList  whether the interaction supports the pair list
+ * @throws OpenMMException if this interaction disagrees with earlier ones about the cutoff,
+ *         periodicity, or the cutoff distance of its force group
+ */
+void HipNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup, bool usesNeighborList, bool supportsPairList) {
+    // Once at least one interaction has been registered, later ones have to be consistent with it.
+
+    if (!groupCutoff.empty()) {
+        if (usesCutoff != useCutoff)
+            throw OpenMMException("All Forces must agree on whether to use a cutoff");
+        if (usesPeriodic != usePeriodic)
+            throw OpenMMException("All Forces must agree on whether to use periodic boundary conditions");
+        if (usesCutoff) {
+            auto previous = groupCutoff.find(forceGroup);
+            if (previous != groupCutoff.end() && previous->second != cutoffDistance)
+                throw OpenMMException("All Forces in a single force group must use the same cutoff distance");
+        }
+    }
+    if (usesExclusions)
+        requestExclusions(exclusionList);
+
+    // Record the settings.
+
+    useCutoff = usesCutoff;
+    usePeriodic = usesPeriodic;
+    if (usesNeighborList && useCutoff)
+        useNeighborList = true;
+    groupCutoff[forceGroup] = cutoffDistance;
+    groupFlags |= 1<<forceGroup;
+    if (!supportsPairList)
+        canUsePairList = false;
+    if (kernel.empty())
+        return;
+
+    // Append the source to the code for this force group, renaming the cutoff symbols so that
+    // they refer to this particular group.
+
+    string& groupSource = groupKernelSource[forceGroup];
+    const string groupName = context.intToString(forceGroup);
+    map<string, string> replacements;
+    replacements["CUTOFF"] = "CUTOFF_"+groupName;
+    replacements["CUTOFF_SQUARED"] = "CUTOFF_"+groupName+"_SQUARED";
+    groupSource += context.replaceStrings(kernel, replacements)+"\n";
+}
+
+/**
+ * Add a per-atom parameter described by a ComputeParameterInfo.  The description is converted to
+ * a ParameterInfo that refers to the device pointer of the underlying array.
+ */
+void HipNonbondedUtilities::addParameter(ComputeParameterInfo parameter) {
+    ParameterInfo converted(parameter.getName(), parameter.getComponentType(), parameter.getNumComponents(),
+            parameter.getSize(), context.unwrap(parameter.getArray()).getDevicePointer(), parameter.isConstant());
+    parameters.push_back(converted);
+}
+
+/**
+ * Add a parameter, appending a copy of it to the list of parameters.
+ */
+void HipNonbondedUtilities::addParameter(const ParameterInfo& parameter) {
+    parameters.insert(parameters.end(), parameter);
+}
+
+/**
+ * Add a kernel argument described by a ComputeParameterInfo.  The description is converted to a
+ * ParameterInfo that refers to the device pointer of the underlying array.
+ */
+void HipNonbondedUtilities::addArgument(ComputeParameterInfo parameter) {
+    ParameterInfo converted(parameter.getName(), parameter.getComponentType(), parameter.getNumComponents(),
+            parameter.getSize(), context.unwrap(parameter.getArray()).getDevicePointer(), parameter.isConstant());
+    arguments.push_back(converted);
+}
+
+/**
+ * Add a kernel argument, appending a copy of it to the list of arguments.
+ */
+void HipNonbondedUtilities::addArgument(const ParameterInfo& parameter) {
+    arguments.insert(arguments.end(), parameter);
+}
+
+/**
+ * Register a parameter with respect to which the derivative of the energy is wanted.
+ *
+ * @param param  the name of the parameter
+ * @return the string "energyParamDeriv" followed by the index of the parameter in the list
+ *         kept by this object
+ */
+string HipNonbondedUtilities::addEnergyParameterDerivative(const string& param) {
+    // Find the parameter in the list, appending it if it is not there yet.  The index is taken
+    // before the list is modified.
+
+    auto existing = std::find(energyParameterDerivatives.begin(), energyParameterDerivatives.end(), param);
+    const int index = (int) std::distance(energyParameterDerivatives.begin(), existing);
+    if (existing == energyParameterDerivatives.end())
+        energyParameterDerivatives.push_back(param);
+    context.addEnergyParameterDerivative(param);
+    return "energyParamDeriv"+context.intToString(index);
+}
+
+/**
+ * Request that a set of exclusions be applied.  The first request is recorded; every later
+ * request has to describe the same exclusions as the recorded one.
+ *
+ * @param exclusionList  for each atom, the atoms excluded from interacting with it
+ * @throws OpenMMException if exclusions were already recorded and exclusionList differs from them
+ */
+void HipNonbondedUtilities::requestExclusions(const vector<vector<int> >& exclusionList) {
+    if (!anyExclusions) {
+        atomExclusions = exclusionList;
+        anyExclusions = true;
+        return;
+    }
+
+    // Compare atom by atom: the lists must have the same length, and every requested exclusion
+    // must be among the recorded ones.
+
+    bool matches = (exclusionList.size() == atomExclusions.size());
+    for (size_t atom = 0; matches && atom < exclusionList.size(); atom++) {
+        const vector<int>& requested = exclusionList[atom];
+        const vector<int>& recorded = atomExclusions[atom];
+        if (requested.size() != recorded.size())
+            matches = false;
+        const set<int> expectedExclusions(recorded.begin(), recorded.end());
+        for (int excluded : requested)
+            if (expectedExclusions.count(excluded) == 0)
+                matches = false;
+    }
+    if (!matches)
+        throw OpenMMException("All Forces must have identical exceptions");
+}
+
+/**
+ * Ordering for tiles: compare by the y component first and use the x component to break ties.
+ */
+static bool compareInt2(int2 a, int2 b) {
+    if (a.y != b.y)
+        return (a.y < b.y);
+    return (a.x < b.x);
+}
+
+/**
+ * Ordering for tiles on devices with SIMD width greater than tile size.  Diagonal tiles (x == y)
+ * are put before off-diagonal ones to reduce thread divergence; within each of the two classes
+ * tiles are ordered by y, then by x.
+ */
+static bool compareInt2LargeSIMD(int2 a, int2 b) {
+    const bool aIsDiagonal = (a.x == a.y);
+    const bool bIsDiagonal = (b.x == b.y);
+    if (aIsDiagonal != bIsDiagonal)
+        return aIsDiagonal; // exactly one is diagonal, and that one goes first
+    if (aIsDiagonal)
+        return (a.x < b.x);
+    if (a.y != b.y)
+        return (a.y < b.y);
+    return (a.x < b.x);
+}
+
+/**
+ * Build the data needed before interactions can be computed: the tiles and flags describing the
+ * exclusions, the arrays for the neighbor list (only when a cutoff is in use), and the argument
+ * lists that are passed to the kernels.
+ *
+ * @param system  not referenced by this function
+ */
+void HipNonbondedUtilities::initialize(const System& system) {
+    string errorMessage = "Error initializing nonbonded utilities";
+    if (atomExclusions.empty()) {
+        // No exclusions were specifically requested, so just mark every atom as not interacting with itself.
+
+        atomExclusions.resize(context.getNumAtoms());
+        for (int atom = 0; atom < (int) atomExclusions.size(); atom++)
+            atomExclusions[atom].push_back(atom);
+    }
+
+    // Create the list of tiles.  Each context is given an equal fraction of the blocks and tiles.
+
+    numAtoms = context.getNumAtoms();
+    int numAtomBlocks = context.getNumAtomBlocks();
+    int numContexts = context.getPlatformData().contexts.size();
+    setAtomBlockRange(context.getContextIndex()/(double) numContexts, (context.getContextIndex()+1)/(double) numContexts);
+
+    // Build a list of tiles that contain exclusions.  A tile is stored as (larger block index,
+    // smaller block index).
+
+    set<pair<int, int> > tilesWithExclusions;
+    for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) {
+        const int x = atom1/HipContext::TileSize;
+        for (int atom2 : atomExclusions[atom1]) {
+            const int y = atom2/HipContext::TileSize;
+            tilesWithExclusions.insert(make_pair(max(x, y), min(x, y)));
+        }
+    }
+    vector<int2> exclusionTilesVec;
+    for (const pair<int, int>& tile : tilesWithExclusions)
+        exclusionTilesVec.push_back(make_int2(tile.first, tile.second));
+
+    // The ordering that puts diagonal tiles first is only used when the SIMD width exceeds 32
+    // and a neighbor list is in use.
+
+    bool (*tileOrder)(int2, int2) = compareInt2LargeSIMD;
+    if (context.getSIMDWidth() <= 32 || !useNeighborList)
+        tileOrder = compareInt2;
+    sort(exclusionTilesVec.begin(), exclusionTilesVec.end(), tileOrder);
+    exclusionTiles.initialize<int2>(context, exclusionTilesVec.size(), "exclusionTiles");
+    exclusionTiles.upload(exclusionTilesVec);
+
+    // Remember the position of every tile in the sorted list.
+
+    map<pair<int, int>, int> exclusionTileMap;
+    for (int i = 0; i < (int) exclusionTilesVec.size(); i++) {
+        const int2 tile = exclusionTilesVec[i];
+        exclusionTileMap[make_pair(tile.x, tile.y)] = i;
+    }
+
+    // For every block, list the blocks it shares a tile with exclusions with, and flatten the
+    // lists into an index array plus an array of row offsets.
+
+    vector<vector<int> > exclusionBlocksForBlock(numAtomBlocks);
+    for (const pair<int, int>& tile : tilesWithExclusions) {
+        exclusionBlocksForBlock[tile.first].push_back(tile.second);
+        if (tile.first != tile.second)
+            exclusionBlocksForBlock[tile.second].push_back(tile.first);
+    }
+    vector<unsigned int> exclusionRowIndicesVec(numAtomBlocks+1, 0);
+    vector<unsigned int> exclusionIndicesVec;
+    for (int block = 0; block < numAtomBlocks; block++) {
+        const vector<int>& partners = exclusionBlocksForBlock[block];
+        exclusionIndicesVec.insert(exclusionIndicesVec.end(), partners.begin(), partners.end());
+        exclusionRowIndicesVec[block+1] = exclusionIndicesVec.size();
+    }
+    maxExclusions = 0;
+    for (const vector<int>& partners : exclusionBlocksForBlock)
+        if ((int) partners.size() > maxExclusions)
+            maxExclusions = (int) partners.size();
+    exclusionIndices.initialize<unsigned int>(context, exclusionIndicesVec.size(), "exclusionIndices");
+    exclusionRowIndices.initialize<unsigned int>(context, exclusionRowIndicesVec.size(), "exclusionRowIndices");
+    exclusionIndices.upload(exclusionIndicesVec);
+    exclusionRowIndices.upload(exclusionRowIndicesVec);
+
+    // Record the exclusion data.  Every tile gets TileSize flag words that start out with all
+    // bits set; the bit belonging to an excluded pair of atoms is then cleared.
+
+    exclusions.initialize<tileflags>(context, tilesWithExclusions.size()*HipContext::TileSize, "exclusions");
+    tileflags allFlags = (tileflags) -1;
+    vector<tileflags> exclusionVec(exclusions.getSize(), allFlags);
+    for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) {
+        const int x = atom1/HipContext::TileSize;
+        const int offset1 = atom1-x*HipContext::TileSize;
+        for (int atom2 : atomExclusions[atom1]) {
+            const int y = atom2/HipContext::TileSize;
+            const int offset2 = atom2-y*HipContext::TileSize;
+            if (x > y) {
+                int index = exclusionTileMap[make_pair(x, y)]*HipContext::TileSize;
+                exclusionVec[index+offset1] &= allFlags-(1<<offset2);
+            }
+            else {
+                int index = exclusionTileMap[make_pair(y, x)]*HipContext::TileSize;
+                exclusionVec[index+offset2] &= allFlags-(1<<offset1);
+            }
+        }
+    }
+    atomExclusions.clear(); // We won't use this again, so free the memory it used
+    exclusions.upload(exclusionVec);
+
+    // Create data structures for the neighbor list.
+
+    if (useCutoff) {
+        // Select a size for the arrays that hold the neighbor list.  We have to make a fairly
+        // arbitrary guess, but if this turns out to be too small we'll increase it later.
+
+        maxTiles = 20*numAtomBlocks;
+        if (maxTiles > numTiles)
+            maxTiles = numTiles;
+        if (maxTiles < 1)
+            maxTiles = 1;
+        maxSinglePairs = 5*numAtoms;
+        // HIP-TODO: This may require tuning
+        numTilesInBatch = numAtomBlocks < 2000 ? 4 : 1;
+        interactingTiles.initialize<int>(context, maxTiles, "interactingTiles");
+        interactingAtoms.initialize<int>(context, HipContext::TileSize*maxTiles, "interactingAtoms");
+        interactionCount.initialize<unsigned int>(context, 2, "interactionCount");
+        singlePairs.initialize<int2>(context, maxSinglePairs, "singlePairs");
+
+        // The arrays below hold real values, so their element size depends on the precision.
+
+        int elementSize = (context.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
+        blockCenter.initialize(context, numAtomBlocks, 4*elementSize, "blockCenter");
+        blockBoundingBox.initialize(context, numAtomBlocks, 4*elementSize, "blockBoundingBox");
+        sortedBlocks.initialize<unsigned int>(context, numAtomBlocks, "sortedBlocks");
+        sortedBlockCenter.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockCenter");
+        sortedBlockBoundingBox.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockBoundingBox");
+        blockSizeRange.initialize(context, 2, elementSize, "blockSizeRange");
+        largeBlockCenter.initialize(context, numAtomBlocks, 4*elementSize, "largeBlockCenter");
+        largeBlockBoundingBox.initialize(context, numAtomBlocks*4, elementSize, "largeBlockBoundingBox");
+        oldPositions.initialize(context, numAtoms, 4*elementSize, "oldPositions");
+        rebuildNeighborList.initialize<int>(context, 1, "rebuildNeighborList");
+        blockSorter = new HipSort(context, new BlockSortTrait(), numAtomBlocks, false);
+
+        // Start with all counters and the rebuild flag at zero.
+
+        vector<unsigned int> count(2, 0);
+        interactionCount.upload(count);
+        rebuildNeighborList.upload(&count[0]);
+        if (context.getUseDoublePrecision()) {
+            blockSizeRange.upload(vector<double>{1e38, 0});
+        } else {
+            blockSizeRange.upload(vector<float>{1e38, 0});
+        }
+    }
+
+    // Record arguments for kernels.  updateNeighborListSize() refers to some of the entries of
+    // forceArgs and findInteractingBlocksArgs by position, so the order must not be changed
+    // without updating it as well.
+
+    forceArgs.push_back(&context.getForce().getDevicePointer());
+    forceArgs.push_back(&context.getEnergyBuffer().getDevicePointer());
+    forceArgs.push_back(&context.getPosq().getDevicePointer());
+    forceArgs.push_back(&exclusions.getDevicePointer());
+    forceArgs.push_back(&exclusionTiles.getDevicePointer());
+    forceArgs.push_back(&startTileIndex);
+    forceArgs.push_back(&numTiles);
+    if (useCutoff) {
+        forceArgs.push_back(&interactingTiles.getDevicePointer());
+        forceArgs.push_back(&interactionCount.getDevicePointer());
+        forceArgs.push_back(context.getPeriodicBoxSizePointer());
+        forceArgs.push_back(context.getInvPeriodicBoxSizePointer());
+        forceArgs.push_back(context.getPeriodicBoxVecXPointer());
+        forceArgs.push_back(context.getPeriodicBoxVecYPointer());
+        forceArgs.push_back(context.getPeriodicBoxVecZPointer());
+        forceArgs.push_back(&maxTiles);
+        forceArgs.push_back(&blockCenter.getDevicePointer());
+        forceArgs.push_back(&blockBoundingBox.getDevicePointer());
+        forceArgs.push_back(&interactingAtoms.getDevicePointer());
+        forceArgs.push_back(&maxSinglePairs);
+        forceArgs.push_back(&singlePairs.getDevicePointer());
+    }
+    for (ParameterInfo& param : parameters)
+        forceArgs.push_back(&param.getMemory());
+    for (ParameterInfo& arg : arguments)
+        forceArgs.push_back(&arg.getMemory());
+    if (energyParameterDerivatives.size() > 0)
+        forceArgs.push_back(&context.getEnergyParamDerivBuffer().getDevicePointer());
+    if (useCutoff) {
+        // Arguments for the findBlockBounds kernel.
+
+        findBlockBoundsArgs.push_back(&numAtoms);
+        findBlockBoundsArgs.push_back(context.getPeriodicBoxSizePointer());
+        findBlockBoundsArgs.push_back(context.getInvPeriodicBoxSizePointer());
+        findBlockBoundsArgs.push_back(context.getPeriodicBoxVecXPointer());
+        findBlockBoundsArgs.push_back(context.getPeriodicBoxVecYPointer());
+        findBlockBoundsArgs.push_back(context.getPeriodicBoxVecZPointer());
+        findBlockBoundsArgs.push_back(&context.getPosq().getDevicePointer());
+        findBlockBoundsArgs.push_back(&blockCenter.getDevicePointer());
+        findBlockBoundsArgs.push_back(&blockBoundingBox.getDevicePointer());
+        findBlockBoundsArgs.push_back(&rebuildNeighborList.getDevicePointer());
+        findBlockBoundsArgs.push_back(&blockSizeRange.getDevicePointer());
+
+        // Arguments for the computeSortKeys kernel.
+
+        computeSortKeysArgs.push_back(&blockBoundingBox.getDevicePointer());
+        computeSortKeysArgs.push_back(&sortedBlocks.getDevicePointer());
+        computeSortKeysArgs.push_back(&blockSizeRange.getDevicePointer());
+
+        // Arguments for the sortBoxData kernel.
+
+        sortBoxDataArgs.push_back(&sortedBlocks.getDevicePointer());
+        sortBoxDataArgs.push_back(&blockCenter.getDevicePointer());
+        sortBoxDataArgs.push_back(&blockBoundingBox.getDevicePointer());
+        sortBoxDataArgs.push_back(&sortedBlockCenter.getDevicePointer());
+        sortBoxDataArgs.push_back(&sortedBlockBoundingBox.getDevicePointer());
+        if (useLargeBlocks) {
+            sortBoxDataArgs.push_back(&largeBlockCenter.getDevicePointer());
+            sortBoxDataArgs.push_back(&largeBlockBoundingBox.getDevicePointer());
+            sortBoxDataArgs.push_back(context.getPeriodicBoxSizePointer());
+            sortBoxDataArgs.push_back(context.getInvPeriodicBoxSizePointer());
+            sortBoxDataArgs.push_back(context.getPeriodicBoxVecXPointer());
+            sortBoxDataArgs.push_back(context.getPeriodicBoxVecYPointer());
+            sortBoxDataArgs.push_back(context.getPeriodicBoxVecZPointer());
+        }
+        sortBoxDataArgs.push_back(&context.getPosq().getDevicePointer());
+        sortBoxDataArgs.push_back(&oldPositions.getDevicePointer());
+        sortBoxDataArgs.push_back(&interactionCount.getDevicePointer());
+        sortBoxDataArgs.push_back(&rebuildNeighborList.getDevicePointer());
+        sortBoxDataArgs.push_back(&forceRebuildNeighborList);
+        sortBoxDataArgs.push_back(&blockSizeRange.getDevicePointer());
+
+        // Arguments for the findBlocksWithInteractions kernel.
+
+        findInteractingBlocksArgs.push_back(context.getPeriodicBoxSizePointer());
+        findInteractingBlocksArgs.push_back(context.getInvPeriodicBoxSizePointer());
+        findInteractingBlocksArgs.push_back(context.getPeriodicBoxVecXPointer());
+        findInteractingBlocksArgs.push_back(context.getPeriodicBoxVecYPointer());
+        findInteractingBlocksArgs.push_back(context.getPeriodicBoxVecZPointer());
+        findInteractingBlocksArgs.push_back(&interactionCount.getDevicePointer());
+        findInteractingBlocksArgs.push_back(&interactingTiles.getDevicePointer());
+        findInteractingBlocksArgs.push_back(&interactingAtoms.getDevicePointer());
+        findInteractingBlocksArgs.push_back(&singlePairs.getDevicePointer());
+        findInteractingBlocksArgs.push_back(&context.getPosq().getDevicePointer());
+        findInteractingBlocksArgs.push_back(&maxTiles);
+        findInteractingBlocksArgs.push_back(&maxSinglePairs);
+        findInteractingBlocksArgs.push_back(&startBlockIndex);
+        findInteractingBlocksArgs.push_back(&numBlocks);
+        findInteractingBlocksArgs.push_back(&sortedBlocks.getDevicePointer());
+        findInteractingBlocksArgs.push_back(&sortedBlockCenter.getDevicePointer());
+        findInteractingBlocksArgs.push_back(&sortedBlockBoundingBox.getDevicePointer());
+        if (useLargeBlocks) {
+            findInteractingBlocksArgs.push_back(&largeBlockCenter.getDevicePointer());
+            findInteractingBlocksArgs.push_back(&largeBlockBoundingBox.getDevicePointer());
+        }
+        findInteractingBlocksArgs.push_back(&exclusionIndices.getDevicePointer());
+        findInteractingBlocksArgs.push_back(&exclusionRowIndices.getDevicePointer());
+        findInteractingBlocksArgs.push_back(&oldPositions.getDevicePointer());
+        findInteractingBlocksArgs.push_back(&rebuildNeighborList.getDevicePointer());
+
+        // Arguments for the copyInteractionCounts kernel.
+
+        copyInteractionCountsArgs.push_back(&interactionCount.getDevicePointer());
+        copyInteractionCountsArgs.push_back(&pinnedCountBuffer);
+    }
+}
+
+/**
+ * Get the largest cutoff distance recorded for any force group, or 0 if none is larger than that.
+ */
+double HipNonbondedUtilities::getMaxCutoffDistance() {
+    double largest = 0.0;
+    for (const auto& group : groupCutoff)
+        if (largest < group.second)
+            largest = group.second;
+    return largest;
+}
+
+/**
+ * Get the cutoff distance including padding.  When padding is enabled, 8% of the cutoff is
+ * added to it; otherwise nothing is added.
+ *
+ * @param cutoff  the cutoff distance without padding
+ * @return the padded cutoff distance
+ */
+double HipNonbondedUtilities::padCutoff(double cutoff) {
+    double padding = 0.0;
+    if (usePadding)
+        padding = 0.08*cutoff;
+    return cutoff+padding;
+}
+
+/**
+ * Prepare for computing interactions: create the kernels for the requested combination of force
+ * groups if they do not exist yet, check the size of the periodic box, and rebuild the neighbor
+ * list if one is in use.
+ *
+ * @param forceGroups  flags specifying the force groups to include
+ * @throws OpenMMException if a periodic cutoff is in use and a box dimension is smaller than
+ *         1.999999 times the cutoff
+ */
+void HipNonbondedUtilities::prepareInteractions(int forceGroups) {
+    if ((forceGroups&groupFlags) == 0)
+        return;
+    if (groupKernels.count(forceGroups) == 0)
+        createKernelsForGroups(forceGroups);
+    KernelSet& kernels = groupKernels[forceGroups];
+    if (useCutoff && usePeriodic) {
+        const double4 box = context.getPeriodicBoxSize();
+        const double minAllowedSize = 1.999999*kernels.cutoffDistance;
+        const bool boxTooSmall = (box.x < minAllowedSize || box.y < minAllowedSize || box.z < minAllowedSize);
+        if (boxTooSmall)
+            throw OpenMMException("The periodic box size has decreased to less than twice the nonbonded cutoff.");
+    }
+    if (!useNeighborList || numTiles == 0)
+        return;
+
+    // Compute the neighbor list.  A change of the cutoff forces a rebuild.  The rebuild flag is
+    // only reset after the kernels that take it as an argument have been launched.
+
+    if (lastCutoff != kernels.cutoffDistance)
+        forceRebuildNeighborList = true;
+    context.executeKernelFlat(kernels.findBlockBoundsKernel, &findBlockBoundsArgs[0], context.getPaddedNumAtoms(), context.getSIMDWidth());
+    context.executeKernelFlat(kernels.computeSortKeysKernel, &computeSortKeysArgs[0], context.getNumAtomBlocks());
+    blockSorter->sort(sortedBlocks);
+    context.executeKernelFlat(kernels.sortBoxDataKernel, &sortBoxDataArgs[0], context.getNumAtoms(), 64);
+    context.executeKernelFlat(kernels.findInteractingBlocksKernel, &findInteractingBlocksArgs[0], context.getNumAtomBlocks() * context.getSIMDWidth() * numTilesInBatch, findInteractingBlocksThreadBlockSize);
+    forceRebuildNeighborList = false;
+    lastCutoff = kernels.cutoffDistance;
+
+    // Copy the interaction counts to the pinned buffer, and record an event so that
+    // computeInteractions() can wait for the copy to finish.
+
+    context.executeKernelFlat(kernels.copyInteractionCountsKernel, &copyInteractionCountsArgs[0], 1, 1);
+    hipEventRecord(downloadCountEvent, context.getCurrentStream());
+}
+
+/**
+ * Launch the kernel that computes the nonbonded interactions, creating it on first use, and
+ * afterwards check whether the neighbor list arrays were large enough.
+ *
+ * @param forceGroups    flags specifying the force groups to include
+ * @param includeForces  whether forces are wanted
+ * @param includeEnergy  whether the energy is wanted
+ */
+void HipNonbondedUtilities::computeInteractions(int forceGroups, bool includeForces, bool includeEnergy) {
+    if ((forceGroups&groupFlags) == 0)
+        return;
+    KernelSet& kernels = groupKernels[forceGroups];
+    if (kernels.hasForces) {
+        // Pick the kernel variant matching what was asked for.  The energy-only kernel is used
+        // whenever forces are not requested.
+
+        hipFunction_t* selected = &kernels.energyKernel;
+        if (includeForces)
+            selected = (includeEnergy ? &kernels.forceEnergyKernel : &kernels.forceKernel);
+        if (*selected == NULL)
+            *selected = createInteractionKernel(kernels.source, parameters, arguments, true, true, forceGroups, includeForces, includeEnergy);
+        context.executeKernelFlat(*selected, &forceArgs[0], numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
+    }
+    if (useNeighborList && numTiles > 0) {
+        // Wait until the interaction counts have arrived in the pinned buffer before reading them.
+
+        hipEventSynchronize(downloadCountEvent);
+        updateNeighborListSize();
+    }
+}
+
+/**
+ * Check the interaction counts in the pinned buffer against the sizes of the neighbor list
+ * arrays, and enlarge the arrays if the counts did not fit.  This also requests a reordering of
+ * the atoms when the number of tiles has grown by more than 10% since the last reorder.
+ *
+ * @return true if an array was resized (the neighbor list is then rebuilt on the next step and
+ *         the forces are marked as invalid), false otherwise
+ */
+bool HipNonbondedUtilities::updateNeighborListSize() {
+    if (!useCutoff)
+        return false;
+    if (context.getStepsSinceReorder() == 0 || tilesAfterReorder == 0)
+        tilesAfterReorder = pinnedCountBuffer[0];
+    else if (context.getStepsSinceReorder() > 25 && pinnedCountBuffer[0] > 1.1*tilesAfterReorder)
+        context.forceReorder();
+    if (pinnedCountBuffer[0] <= maxTiles && pinnedCountBuffer[1] <= maxSinglePairs)
+        return false;
+
+    // The most recent timestep had too many interactions to fit in the arrays.  Make the arrays bigger to prevent
+    // this from happening in the future.
+
+    if (pinnedCountBuffer[0] > maxTiles) {
+        maxTiles = (unsigned int) (1.2*pinnedCountBuffer[0]);
+
+        // Never allocate more than the total number of tiles.  The total is computed in 64 bit
+        // arithmetic, as in setAtomBlockRange(), because blocks*(blocks+1) overflows 32 bits
+        // for very large systems.
+
+        const long long totalBlocks = context.getNumAtomBlocks();
+        const long long totalTiles = totalBlocks*(totalBlocks+1)/2;
+        if (maxTiles > totalTiles)
+            maxTiles = (unsigned int) totalTiles;
+        interactingTiles.resize(maxTiles);
+        interactingAtoms.resize(HipContext::TileSize*(size_t) maxTiles);
+
+        // Resizing may have changed the device pointers, so refresh the entries of the argument
+        // lists that refer to them.  The indices are the positions at which initialize() added
+        // interactingTiles and interactingAtoms.
+
+        if (forceArgs.size() > 0)
+            forceArgs[7] = &interactingTiles.getDevicePointer();
+        findInteractingBlocksArgs[6] = &interactingTiles.getDevicePointer();
+        if (forceArgs.size() > 0)
+            forceArgs[17] = &interactingAtoms.getDevicePointer();
+        findInteractingBlocksArgs[7] = &interactingAtoms.getDevicePointer();
+    }
+    if (pinnedCountBuffer[1] > maxSinglePairs) {
+        maxSinglePairs = (unsigned int) (1.2*pinnedCountBuffer[1]);
+        singlePairs.resize(maxSinglePairs);
+
+        // As above, for the position at which initialize() added singlePairs.
+
+        if (forceArgs.size() > 0)
+            forceArgs[19] = &singlePairs.getDevicePointer();
+        findInteractingBlocksArgs[8] = &singlePairs.getDevicePointer();
+    }
+    forceRebuildNeighborList = true;
+    context.setForcesValid(false);
+    return true;
+}
+
+/**
+ * Set whether padding is added to the cutoff distance (see padCutoff()).
+ *
+ * @param padding  true to add padding, false to use the cutoff as given
+ */
+void HipNonbondedUtilities::setUsePadding(bool padding) {
+    this->usePadding = padding;
+}
+
+/**
+ * Set the range of atom blocks and tiles this object is responsible for, given as fractions of
+ * the total.  This also forces the neighbor list to be rebuilt.
+ *
+ * @param startFraction  the fraction of the blocks and tiles at which the range starts
+ * @param endFraction    the fraction of the blocks and tiles at which the range ends
+ */
+void HipNonbondedUtilities::setAtomBlockRange(double startFraction, double endFraction) {
+    const int blockCount = context.getNumAtomBlocks();
+
+    // The number of tiles is computed in 64 bit arithmetic.
+
+    const long long totalTiles = blockCount*((long long) blockCount+1)/2;
+    startBlockIndex = (int) (startFraction*blockCount);
+    numBlocks = (int) (endFraction*blockCount)-startBlockIndex;
+    startTileIndex = (int) (startFraction*totalTiles);
+    numTiles = (long long) (endFraction*totalTiles)-startTileIndex;
+    forceRebuildNeighborList = true;
+}
+
+/**
+ * Create the set of kernels for a combination of force groups and store it in groupKernels.
+ * The interaction kernels themselves are left unset here; only the kernels for building the
+ * neighbor list are compiled, and only when a cutoff is in use.
+ *
+ * @param groups  flags specifying the force groups to include
+ */
+void HipNonbondedUtilities::createKernelsForGroups(int groups) {
+    // Combine the source code of the selected groups, and find their largest cutoff.
+
+    KernelSet kernels;
+    double cutoff = 0.0;
+    string source;
+    for (int group = 0; group < 32; group++) {
+        if ((groups&(1<<group)) == 0)
+            continue;
+        cutoff = max(cutoff, groupCutoff[group]);
+        source += groupKernelSource[group];
+    }
+    kernels.hasForces = (source.size() > 0);
+    kernels.cutoffDistance = cutoff;
+    kernels.source = source;
+    kernels.forceKernel = kernels.energyKernel = kernels.forceEnergyKernel = NULL;
+    if (useCutoff) {
+        const double paddedCutoff = padCutoff(cutoff);
+        map<string, string> defines;
+        defines["TILE_SIZE"] = context.intToString(HipContext::TileSize);
+        defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
+        defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms());
+        defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
+        defines["PADDING"] = context.doubleToString(paddedCutoff-cutoff);
+        defines["PADDED_CUTOFF"] = context.doubleToString(paddedCutoff);
+        defines["PADDED_CUTOFF_SQUARED"] = context.doubleToString(paddedCutoff*paddedCutoff);
+        defines["NUM_TILES_WITH_EXCLUSIONS"] = context.intToString(exclusionTiles.getSize());
+        if (usePeriodic)
+            defines["USE_PERIODIC"] = "1";
+        if (context.getBoxIsTriclinic())
+            defines["TRICLINIC"] = "1";
+        if (useLargeBlocks)
+            defines["USE_LARGE_BLOCKS"] = "1";
+        defines["MAX_EXCLUSIONS"] = context.intToString(maxExclusions);
+
+        // Choose the value of MAX_BITS_FOR_PAIRS.  It stays 0 when the pair list cannot be used.
+        // Otherwise it is 4 in double precision and for fewer than 100000 atoms; for larger
+        // systems it is 2 only on devices with SIMD width of at most 32 (RDNA) and fewer than
+        // 500000 atoms, and 0 in all remaining cases (including large systems on CDNA).
+
+        int maxBits = 0;
+        if (canUsePairList) {
+            if (context.getUseDoublePrecision())
+                maxBits = 4;
+            else if (context.getSIMDWidth() > 32)
+                maxBits = (context.getNumAtoms() < 100000 ? 4 : 0);
+            else if (context.getNumAtoms() < 100000)
+                maxBits = 4;
+            else if (context.getNumAtoms() < 500000)
+                maxBits = 2;
+        }
+        defines["MAX_BITS_FOR_PAIRS"] = context.intToString(maxBits);
+        defines["NUM_TILES_IN_BATCH"] = context.intToString(numTilesInBatch);
+        defines["GROUP_SIZE"] = context.intToString(findInteractingBlocksThreadBlockSize);
+
+        // Find the smallest shift (at least 1) for which 2^shift exceeds the number of blocks.
+
+        int binShift = 1;
+        while (1<<binShift <= context.getNumAtomBlocks())
+            binShift++;
+        defines["BIN_SHIFT"] = context.intToString(binShift);
+        defines["BLOCK_INDEX_MASK"] = context.intToString((1<<binShift)-1);
+
+        // Compile the module and look up the kernels used for building the neighbor list.
+
+        hipModule_t interactingBlocksProgram = context.createModule(HipKernelSources::vectorOps+HipKernelSources::findInteractingBlocks, defines);
+        kernels.findBlockBoundsKernel = context.getKernel(interactingBlocksProgram, "findBlockBounds");
+        kernels.computeSortKeysKernel = context.getKernel(interactingBlocksProgram, "computeSortKeys");
+        kernels.sortBoxDataKernel = context.getKernel(interactingBlocksProgram, "sortBoxData");
+        kernels.findInteractingBlocksKernel = context.getKernel(interactingBlocksProgram, "findBlocksWithInteractions");
+        kernels.copyInteractionCountsKernel = context.getKernel(interactingBlocksProgram, "copyInteractionCounts");
+    }
+    groupKernels[groups] = kernels;
+}
+
+/**
+ * Generate and compile the "computeNonbonded" kernel for an interaction.
+ *
+ * The stored kernel source template is specialized by substituting generated code
+ * fragments (argument lists, parameter loads, warp shuffles, derivative bookkeeping)
+ * for its placeholders, and by handing a set of preprocessor defines to the module compiler.
+ *
+ * @param source         code substituted for the COMPUTE_INTERACTION placeholder
+ * @param params         per-atom parameters; each is passed to the kernel as "global_<name>"
+ * @param arguments      additional buffers passed to the kernel under their own names
+ * @param useExclusions  if true, USE_EXCLUSIONS is defined
+ * @param isSymmetric    if true, USE_SYMMETRIC is defined
+ * @param groups         bit flags selecting which entries of groupCutoff produce CUTOFF_<i> defines
+ * @param includeForces  if true, INCLUDE_FORCES is defined
+ * @param includeEnergy  if true, INCLUDE_ENERGY is defined
+ * @return the "computeNonbonded" kernel from the newly created module
+ */
+hipFunction_t HipNonbondedUtilities::createInteractionKernel(const string& source, vector<ParameterInfo>& params, vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric, int groups, bool includeForces, bool includeEnergy) {
+    const string components[] = {"x", "y", "z", "w"};
+    map<string, string> replacements;
+    replacements["COMPUTE_INTERACTION"] = source;
+
+    // Extra kernel arguments: the per-atom parameters, then the additional arguments,
+    // then (if any derivatives were requested) the derivative accumulation buffer.
+
+    stringstream argList;
+    for (const ParameterInfo& param : params)
+        argList << ", " << (param.isConstant() ? "const " : "") << param.getType() << "* __restrict__ global_" << param.getName();
+    for (const ParameterInfo& arg : arguments)
+        argList << ", " << (arg.isConstant() ? "const " : "") << arg.getType() << "* __restrict__ " << arg.getName();
+    if (energyParameterDerivatives.size() > 0)
+        argList << ", mixed* __restrict__ energyParamDerivs";
+    replacements["PARAMETER_ARGUMENTS"] = argList.str();
+
+    // Code that loads the parameters of atom1 from global memory.
+
+    stringstream loadAtom1;
+    for (const ParameterInfo& param : params)
+        loadAtom1 << param.getType() << " " << param.getName() << "1 = global_" << param.getName() << "[atom1];\n";
+    replacements["LOAD_ATOM1_PARAMETERS"] = loadAtom1.str();
+
+    // Part 1. Code for on diagonal exclusion tiles: broadcast lane j's data with SHFL.
+
+    stringstream broadcast;
+    for (const string& component : components)
+        broadcast << "posq2." << component << " = SHFL(shflPosq." << component << ", j);\n";
+    for (const ParameterInfo& param : params) {
+        const int numComponents = param.getNumComponents();
+        broadcast << param.getType() << " shfl" << param.getName() << ";\n";
+        if (numComponents == 1)
+            broadcast << "shfl" << param.getName() << "=SHFL(" << param.getName() << "1,j);\n";
+        else
+            for (int c = 0; c < numComponents; c++)
+                broadcast << "shfl" << param.getName() << "." << components[c] << "=SHFL(" << param.getName() << "1." << components[c] << ",j);\n";
+    }
+    replacements["BROADCAST_WARP_DATA"] = broadcast.str();
+
+    // Part 2. Code for off-diagonal exclusions, and neighborlist tiles.  Every fragment
+    // here has one entry per parameter, so they are all built in a single pass.
+
+    stringstream declareLocal, loadLocal, loadAtom2, clearLocal, rotate;
+    rotate << "shflPosq = warpRotateLeft<TILE_SIZE>(shflPosq);\n";
+    rotate << "shflForce = warpRotateLeft<TILE_SIZE>(shflForce);\n";
+    for (const ParameterInfo& param : params) {
+        const string shflName = "shfl"+param.getName();
+        declareLocal << param.getType() << " " << shflName << ";\n";
+        loadLocal << shflName << " = global_" << param.getName() << "[j];\n";
+        loadAtom2 << param.getType() << " " << param.getName() << "2 = " << shflName << ";\n";
+        clearLocal << shflName << " = ";
+        if (param.getNumComponents() == 1)
+            clearLocal << "0;\n";
+        else
+            clearLocal << "make_" << param.getType() << "(0);\n";
+        rotate << shflName << "=warpRotateLeft<TILE_SIZE>(" << shflName << ");\n";
+    }
+    replacements["DECLARE_LOCAL_PARAMETERS"] = declareLocal.str();
+    replacements["LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"] = loadLocal.str();
+    replacements["LOAD_ATOM2_PARAMETERS"] = loadAtom2.str();
+    replacements["CLEAR_LOCAL_PARAMETERS"] = clearLocal.str();
+    replacements["SHUFFLE_WARP_DATA"] = rotate.str();
+
+    // Energy parameter derivatives: one local accumulator per derivative of this
+    // interaction, saved at the index the name has in the context-wide name list.
+
+    stringstream initDerivs, saveDerivs;
+    const vector<string>& contextDerivNames = context.getEnergyParamDerivNames();
+    int numDerivs = contextDerivNames.size();
+    for (int i = 0; i < energyParameterDerivatives.size(); i++) {
+        initDerivs << "mixed energyParamDeriv" << i << " = 0;\n";
+        for (int index = 0; index < numDerivs; index++) {
+            if (contextDerivNames[index] != energyParameterDerivatives[i])
+                continue;
+            saveDerivs << "energyParamDerivs[GLOBAL_ID*" << numDerivs << "+" << index << "] += energyParamDeriv" << i << ";\n";
+        }
+    }
+    replacements["INIT_DERIVATIVES"] = initDerivs.str();
+    replacements["SAVE_DERIVATIVES"] = saveDerivs.str();
+
+    // Preprocessor defines describing the configuration.
+
+    map<string, string> defines;
+    if (useCutoff)
+        defines["USE_CUTOFF"] = "1";
+    if (usePeriodic)
+        defines["USE_PERIODIC"] = "1";
+    if (useExclusions)
+        defines["USE_EXCLUSIONS"] = "1";
+    if (isSymmetric)
+        defines["USE_SYMMETRIC"] = "1";
+    if (useNeighborList)
+        defines["USE_NEIGHBOR_LIST"] = "1";
+    defines["ENABLE_SHUFFLE"] = "1"; // Used only in hippoNonbonded.cc
+    if (includeForces)
+        defines["INCLUDE_FORCES"] = "1";
+    if (includeEnergy)
+        defines["INCLUDE_ENERGY"] = "1";
+    defines["THREAD_BLOCK_SIZE"] = context.intToString(forceThreadBlockSize);
+
+    // Per-group cutoffs for every group selected in the bit mask, plus the largest of them.
+
+    double maxCutoff = 0.0;
+    for (int i = 0; i < 32; i++) {
+        if ((groups&(1<<i)) == 0)
+            continue;
+        const double cutoff = groupCutoff[i];
+        if (maxCutoff < cutoff)
+            maxCutoff = cutoff;
+        const string cutoffName = "CUTOFF_"+context.intToString(i);
+        defines[cutoffName+"_SQUARED"] = context.doubleToString(cutoff*cutoff);
+        defines[cutoffName] = context.doubleToString(cutoff);
+    }
+    defines["MAX_CUTOFF"] = context.doubleToString(maxCutoff);
+    defines["MAX_CUTOFF_SQUARED"] = context.doubleToString(maxCutoff*maxCutoff);
+    defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms());
+    defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
+    defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
+    defines["TILE_SIZE"] = context.intToString(HipContext::TileSize);
+
+    // Split the exclusion tiles evenly between the contexts; this context handles
+    // the range [FIRST_EXCLUSION_TILE, LAST_EXCLUSION_TILE).
+
+    const int numExclusionTiles = exclusionTiles.getSize();
+    const int numContexts = context.getPlatformData().contexts.size();
+    const int contextIndex = context.getContextIndex();
+    defines["NUM_TILES_WITH_EXCLUSIONS"] = context.intToString(numExclusionTiles);
+    defines["FIRST_EXCLUSION_TILE"] = context.intToString(contextIndex*numExclusionTiles/numContexts);
+    defines["LAST_EXCLUSION_TILE"] = context.intToString((contextIndex+1)*numExclusionTiles/numContexts);
+
+    hipModule_t program = context.createModule(HipKernelSources::vectorOps+context.replaceStrings(kernelSource, replacements), defines);
+    return context.getKernel(program, "computeNonbonded");
+}
+
+/**
+ * Replace the kernel source template that createInteractionKernel() specializes.
+ *
+ * @param source  the new kernel source template
+ */
+void HipNonbondedUtilities::setKernelSource(const string& source) {
+    kernelSource.assign(source);
+}
--- a/platforms/hip/src/HipParallelKernels.cpp
+++ b/platforms/hip/src/HipParallelKernels.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2011-2021 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020-2021 Advanced Micro Devices, Inc.              *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "HipParallelKernels.h"
+#include "HipKernelSources.h"
+#include "openmm/common/ContextSelector.h"
+
+using namespace OpenMM;
+using namespace std;
+
+
+/**
+ * Throw an OpenMMException if a HIP call did not return hipSuccess.
+ *
+ * The result expression is evaluated exactly once, so a failing call is not issued
+ * again while the error message is being built.  The body is wrapped in
+ * do { } while (0) so the macro behaves as a single statement, including under an
+ * unbraced if/else.  A HipContext named "cu" must be in scope where the macro is used.
+ */
+#define CHECK_RESULT(result, prefix) \
+do { \
+    const auto checkResultStatus = (result); \
+    if (checkResultStatus != hipSuccess) { \
+        std::stringstream m; \
+        m<<prefix<<": "<<cu.getErrorString(checkResultStatus)<<" ("<<checkResultStatus<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
+        throw OpenMMException(m.str());\
+    } \
+} while (0)
+
+/**
+ * Get the current clock time, measured in microseconds.
+ *
+ * The two implementations count from different epochs (1-1-1601 on Windows, the
+ * gettimeofday() epoch elsewhere), so only differences between values returned by
+ * the same build are meaningful.
+ *
+ * @return the current wall-clock time in microseconds
+ */
+#ifdef _MSC_VER
+    #include <Windows.h>
+    static long long getTime() {
+        FILETIME ft;
+        GetSystemTimeAsFileTime(&ft); // 100-nanoseconds since 1-1-1601
+        ULARGE_INTEGER result;
+        result.LowPart = ft.dwLowDateTime;
+        result.HighPart = ft.dwHighDateTime;
+        return result.QuadPart/10;
+    }
+#else
+    #include <sys/time.h>
+    static long long getTime() {
+        struct timeval tod;
+        gettimeofday(&tod, 0);
+        // Widen to long long before multiplying, so the product cannot overflow on
+        // platforms where time_t is only 32 bits wide.
+        return 1000000LL*tod.tv_sec+tod.tv_usec;
+    }
+#endif
+
+/**
+ * Work task that starts the force/energy computation for one context.  For every
+ * context except the first it first makes the context's stream wait on the supplied
+ * event and, when peer access is not supported, uploads positions from pinned memory.
+ */
+class HipParallelCalcForcesAndEnergyKernel::BeginComputationTask : public HipContext::WorkTask {
+    ContextImpl& context;
+    HipContext& cu;
+    HipCalcForcesAndEnergyKernel& kernel;
+    bool includeForce, includeEnergy;
+    int groups;
+    void* pinnedMemory;
+    hipEvent_t event;
+public:
+    BeginComputationTask(ContextImpl& context, HipContext& cu, HipCalcForcesAndEnergyKernel& kernel, bool includeForce, bool includeEnergy,
+            int groups, void* pinnedMemory, hipEvent_t event) :
+            context(context), cu(cu), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy),
+            groups(groups), pinnedMemory(pinnedMemory), event(event) {
+    }
+    void execute() {
+        ContextSelector selector(cu);
+        const bool isSecondaryContext = (cu.getContextIndex() > 0);
+        if (isSecondaryContext) {
+            // Positions come from the first context: wait until they are available,
+            // then copy them in from host memory if they were not copied device to device.
+
+            hipStreamWaitEvent(cu.getCurrentStream(), event, 0);
+            const bool needsUpload = !cu.getPlatformData().peerAccessSupported;
+            if (needsUpload)
+                cu.getPosq().upload(pinnedMemory, false);
+        }
+        kernel.beginComputation(context, includeForce, includeEnergy, groups);
+    }
+};
+
+/**
+ * Work task that finishes the force/energy computation for one context.  It adds the
+ * context's energy to the supplied accumulator, records a completion time during the
+ * first 200 force computations, and, for every context except the first, transfers the
+ * context's forces either into contextForces (peer access) or into pinned host memory.
+ */
+class HipParallelCalcForcesAndEnergyKernel::FinishComputationTask : public HipContext::WorkTask {
+public:
+    FinishComputationTask(ContextImpl& context, HipContext& cu, HipCalcForcesAndEnergyKernel& kernel,
+            bool includeForce, bool includeEnergy, int groups, double& energy, long long& completionTime, long long* pinnedMemory, HipArray& contextForces,
+            bool& valid, hipStream_t stream, hipEvent_t event, hipEvent_t localEvent) :
+            context(context), cu(cu), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), energy(energy),
+            completionTime(completionTime), pinnedMemory(pinnedMemory), contextForces(contextForces), valid(valid),
+            stream(stream), event(event), localEvent(localEvent) {
+    }
+    void execute() {
+        // Execute the kernel, then download forces.
+
+        ContextSelector selector(cu);
+        energy += kernel.finishComputation(context, includeForce, includeEnergy, groups, valid);
+        if (cu.getComputeForceCount() < 200) {
+            // Record timing information for load balancing.  Since this takes time, only do it at the start of the simulation.
+
+            CHECK_RESULT(hipStreamSynchronize(cu.getCurrentStream()), "Error synchronizing HIP context");
+            completionTime = getTime();
+        }
+        if (includeForce) {
+            if (cu.getContextIndex() > 0) {
+                // Make the copy stream wait until the forces on this context's stream are complete.
+
+                hipEventRecord(localEvent, cu.getCurrentStream());
+                hipStreamWaitEvent(stream, localEvent, 0);
+
+                // Element counts and byte offsets are computed in size_t.  With a large
+                // system spread over several contexts they can exceed the range of a 32 bit int.
+
+                const size_t numAtoms = cu.getPaddedNumAtoms();
+                const size_t bufferIndex = cu.getContextIndex()-1;
+                if (cu.getPlatformData().peerAccessSupported) {
+                    const size_t numBytes = numAtoms*3*sizeof(long long);
+                    const size_t offset = bufferIndex*numBytes;
+                    CHECK_RESULT(hipMemcpyAsync(static_cast<char*>(contextForces.getDevicePointer())+offset,
+                                           cu.getForce().getDevicePointer(), numBytes, hipMemcpyDeviceToDevice, stream), "Error copying forces");
+                    hipEventRecord(event, stream);
+                }
+                else
+                    cu.getForce().download(&pinnedMemory[bufferIndex*numAtoms*3]);
+            }
+        }
+    }
+private:
+    ContextImpl& context;
+    HipContext& cu;
+    HipCalcForcesAndEnergyKernel& kernel;
+    bool includeForce, includeEnergy;
+    int groups;
+    double& energy;
+    long long& completionTime;
+    long long* pinnedMemory;
+    HipArray& contextForces;
+    bool& valid;
+    hipStream_t stream;
+    hipEvent_t event;
+    hipEvent_t localEvent;
+};
+
+/**
+ * Create the parallel kernel, with one HipCalcForcesAndEnergyKernel for each context
+ * in the platform data.  The pinned host buffers start out unallocated.
+ */
+HipParallelCalcForcesAndEnergyKernel::HipParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, HipPlatform::PlatformData& data) :
+        CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextNonbondedFractions(data.contexts.size()),
+        pinnedPositionBuffer(NULL), pinnedForceBuffer(NULL) {
+    for (auto& contextPtr : data.contexts)
+        kernels.push_back(Kernel(new HipCalcForcesAndEnergyKernel(name, platform, *contextPtr)));
+}
+
+/**
+ * Release the pinned host buffers (if they were allocated) and destroy the events
+ * and streams owned by this kernel, with the first context selected.
+ */
+HipParallelCalcForcesAndEnergyKernel::~HipParallelCalcForcesAndEnergyKernel() {
+    ContextSelector selector(*data.contexts[0]);
+    if (pinnedPositionBuffer != NULL)
+        hipHostFree(pinnedPositionBuffer);
+    if (pinnedForceBuffer != NULL)
+        hipHostFree(pinnedForceBuffer);
+    hipEventDestroy(event);
+    for (auto copyEvent : peerCopyEvent)
+        hipEventDestroy(copyEvent);
+    for (auto localEvent : peerCopyEventLocal)
+        hipEventDestroy(localEvent);
+    for (auto copyStream : peerCopyStream)
+        hipStreamDestroy(copyStream);
+}
+
+/**
+ * Initialize the kernel: load the "sumForces" kernel on the first context, initialize
+ * every per-context kernel, give each context an equal share of the nonbonded work,
+ * and create the events and streams used to copy data between contexts.
+ *
+ * @param system  the System being simulated
+ */
+void HipParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
+    HipContext& cu = *data.contexts[0];
+    ContextSelector selector(cu);
+    sumKernel = cu.getKernel(cu.createModule(HipKernelSources::parallel), "sumForces");
+
+    const int numContexts = data.contexts.size();
+    for (int index = 0; index < numContexts; ++index)
+        getKernel(index).initialize(system);
+    const double evenShare = 1/(double) numContexts;
+    for (int index = 0; index < numContexts; ++index)
+        contextNonbondedFractions[index] = evenShare;
+
+    CHECK_RESULT(hipEventCreateWithFlags(&event, cu.getEventFlags()), "Error creating event");
+
+    // Each context gets its own copy stream and pair of events, created while that context is selected.
+
+    peerCopyEvent.resize(numContexts);
+    peerCopyEventLocal.resize(numContexts);
+    peerCopyStream.resize(numContexts);
+    for (int index = 0; index < numContexts; ++index) {
+        ContextSelector selectorLocal(*data.contexts[index]);
+        CHECK_RESULT(hipEventCreateWithFlags(&peerCopyEvent[index], cu.getEventFlags()), "Error creating event");
+        CHECK_RESULT(hipStreamCreateWithFlags(&peerCopyStream[index], hipStreamNonBlocking), "Error creating stream");
+        CHECK_RESULT(hipEventCreateWithFlags(&peerCopyEventLocal[index], cu.getEventFlags()), "Error creating event");
+    }
+}
+
+/**
+ * Begin a force/energy computation: make the first context's positions available to
+ * every other context, then queue a BeginComputationTask on each context's work thread
+ * and wait for the contexts to synchronize.
+ *
+ * @param context        the context in which to execute this kernel
+ * @param includeForce   true if forces should be computed
+ * @param includeEnergy  true if potential energy should be computed
+ * @param groups         the set of force groups to include
+ */
+void HipParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
+    HipContext& cu = *data.contexts[0];
+    ContextSelector selector(cu);
+    const bool peerAccess = cu.getPlatformData().peerAccessSupported;
+    const int numContexts = (int) data.contexts.size();
+
+    // On first use, allocate the buffer holding the other contexts' forces and,
+    // when peer access is unavailable, the pinned host staging buffers.
+
+    if (!contextForces.isInitialized()) {
+        const auto numForceElements = 3*(data.contexts.size()-1)*cu.getPaddedNumAtoms();
+        contextForces.initialize<long long>(cu, numForceElements, "contextForces");
+        if (!peerAccess) {
+            CHECK_RESULT(hipHostMalloc((void**) &pinnedForceBuffer, numForceElements*sizeof(long long), hipHostMallocPortable), "Error allocating pinned memory");
+            CHECK_RESULT(hipHostMalloc(&pinnedPositionBuffer, cu.getPaddedNumAtoms()*(cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4)), hipHostMallocPortable), "Error allocating pinned memory");
+        }
+    }
+
+    // Copy coordinates over to each device and execute the kernel.
+
+    if (peerAccess) {
+        int numBytes = cu.getPosq().getSize()*cu.getPosq().getElementSize();
+        hipEventRecord(event, cu.getCurrentStream());
+        for (int index = 1; index < numContexts; index++) {
+            hipStreamWaitEvent(peerCopyStream[index], event, 0);
+            CHECK_RESULT(hipMemcpyAsync(data.contexts[index]->getPosq().getDevicePointer(), cu.getPosq().getDevicePointer(),
+                numBytes, hipMemcpyDeviceToDevice, peerCopyStream[index]), "Error copying positions");
+            hipEventRecord(peerCopyEvent[index], peerCopyStream[index]);
+        }
+    }
+    else {
+        cu.getPosq().download(pinnedPositionBuffer, false);
+        hipEventRecord(event, cu.getCurrentStream());
+    }
+    for (int index = 0; index < numContexts; index++) {
+        data.contextEnergy[index] = 0.0;
+        HipContext& target = *data.contexts[index];
+        ComputeContext::WorkThread& worker = target.getWorkThread();
+        hipEvent_t waitEvent = (target.getPlatformData().peerAccessSupported ? peerCopyEvent[index] : event);
+        worker.addTask(new BeginComputationTask(context, target, getKernel(index), includeForce, includeEnergy, groups, pinnedPositionBuffer, waitEvent));
+    }
+    data.syncContexts();
+}
+
+/**
+ * Finish a force/energy computation: queue a FinishComputationTask on each context's
+ * work thread, wait for the contexts, total the per-context energies, sum the forces
+ * onto the first context and, during the first 200 force computations, rebalance the
+ * nonbonded work between the contexts.
+ *
+ * @param context        the context in which to execute this kernel
+ * @param includeForce   true if forces should be computed
+ * @param includeEnergy  true if potential energy should be computed
+ * @param groups         the set of force groups to include
+ * @param valid          passed by reference to every per-context task; forces are only summed if it is true afterwards
+ * @return the sum of the energies accumulated by all contexts
+ */
+double HipParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups, bool& valid) {
+    const int numContexts = (int) data.contexts.size();
+    for (int index = 0; index < numContexts; index++) {
+        HipContext& target = *data.contexts[index];
+        ComputeContext::WorkThread& worker = target.getWorkThread();
+        worker.addTask(new FinishComputationTask(context, target, getKernel(index), includeForce, includeEnergy, groups, data.contextEnergy[index],
+                completionTimes[index], pinnedForceBuffer, contextForces, valid, peerCopyStream[index], peerCopyEvent[index], peerCopyEventLocal[index]));
+    }
+    data.syncContexts();
+
+    HipContext& cu = *data.contexts[0];
+    ContextSelector selector(cu);
+    const bool peerAccess = cu.getPlatformData().peerAccessSupported;
+    if (peerAccess) {
+        // The first context's stream must wait for every device to device force copy.
+
+        for (int index = 1; index < data.contexts.size(); index++)
+            hipStreamWaitEvent(cu.getCurrentStream(), peerCopyEvent[index], 0);
+    }
+    double totalEnergy = 0.0;
+    for (int index = 0; index < (int) data.contextEnergy.size(); index++)
+        totalEnergy += data.contextEnergy[index];
+    if (!(includeForce && valid))
+        return totalEnergy;
+
+    // Sum the forces from all devices.
+
+    if (!peerAccess)
+        contextForces.upload(pinnedForceBuffer, false);
+    int bufferSize = 3*cu.getPaddedNumAtoms();
+    int numBuffers = data.contexts.size()-1;
+    void* args[] = {&cu.getForce().getDevicePointer(), &contextForces.getDevicePointer(), &bufferSize, &numBuffers};
+    cu.executeKernel(sumKernel, args, bufferSize);
+
+    // Balance work between the contexts by transferring a little nonbonded work from the context that
+    // finished last to the one that finished first.
+
+    if (cu.getComputeForceCount() < 200) {
+        const double eps = 0.001;
+        int fastest = 0, slowest = 0;
+        for (int index = 0; index < (int) completionTimes.size(); index++) {
+            if (completionTimes[index] < completionTimes[fastest])
+                fastest = index;
+            if (contextNonbondedFractions[slowest] < eps || completionTimes[index] > completionTimes[slowest])
+                slowest = index;
+        }
+        const double step = (cu.getComputeForceCount() < 100 ? 0.01 : 0.001);
+        const double fractionToTransfer = min(step, contextNonbondedFractions[slowest]);
+        contextNonbondedFractions[fastest] += fractionToTransfer;
+        contextNonbondedFractions[slowest] -= fractionToTransfer;
+
+        // Convert the fractions into consecutive ranges that cover [0, 1].
+
+        double rangeStart = 0.0;
+        for (int index = 0; index < (int) contextNonbondedFractions.size(); index++) {
+            const bool isLast = (index == contextNonbondedFractions.size()-1);
+            const double rangeEnd = (isLast ? 1.0 : rangeStart+contextNonbondedFractions[index]); // Avoid roundoff error
+            data.contexts[index]->getNonbondedUtilities().setAtomBlockRange(rangeStart, rangeEnd);
+            rangeStart = rangeEnd;
+        }
+    }
+    return totalEnergy;
+}
+
+/**
+ * Work task that executes one context's harmonic bond kernel and adds the value it
+ * returns to the referenced energy accumulator.
+ */
+class HipParallelCalcHarmonicBondForceKernel::Task : public HipContext::WorkTask {
+    ContextImpl& context;
+    CommonCalcHarmonicBondForceKernel& kernel;
+    bool includeForce, includeEnergy;
+    double& energy;
+public:
+    Task(ContextImpl& context, CommonCalcHarmonicBondForceKernel& kernel, bool includeForce, bool includeEnergy, double& energy) :
+            context(context), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
+    }
+    void execute() {
+        const double contribution = kernel.execute(context, includeForce, includeEnergy);
+        energy += contribution;
+    }
+};
+
+/**
+ * Create the parallel kernel, with one CommonCalcHarmonicBondForceKernel for each
+ * context in the platform data.
+ */
+HipParallelCalcHarmonicBondForceKernel::HipParallelCalcHarmonicBondForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
+        CalcHarmonicBondForceKernel(name, platform), data(data) {
+    for (auto& contextPtr : data.contexts)
+        kernels.push_back(Kernel(new CommonCalcHarmonicBondForceKernel(name, platform, *contextPtr, system)));
+}
+
+/**
+ * Initialize every per-context kernel with the same system and force.
+ */
+void HipParallelCalcHarmonicBondForceKernel::initialize(const System& system, const HarmonicBondForce& force) {
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index) {
+        getKernel(index).initialize(system, force);
+    }
+}
+
+/**
+ * Queue one Task per context on that context's work thread.  Each task adds its
+ * energy to data.contextEnergy, so nothing is returned directly from here.
+ *
+ * @return always 0.0
+ */
+double HipParallelCalcHarmonicBondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    const int numContexts = (int) data.contexts.size();
+    for (int index = 0; index < numContexts; ++index) {
+        ComputeContext::WorkThread& worker = data.contexts[index]->getWorkThread();
+        worker.addTask(new Task(context, getKernel(index), includeForces, includeEnergy, data.contextEnergy[index]));
+    }
+    return 0.0;
+}
+
+/**
+ * Forward the parameter update to every per-context kernel.
+ */
+void HipParallelCalcHarmonicBondForceKernel::copyParametersToContext(ContextImpl& context, const HarmonicBondForce& force) {
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index) {
+        getKernel(index).copyParametersToContext(context, force);
+    }
+}
+
+/**
+ * Work task that executes one context's custom bond kernel and adds the value it
+ * returns to the referenced energy accumulator.
+ */
+class HipParallelCalcCustomBondForceKernel::Task : public HipContext::WorkTask {
+    ContextImpl& context;
+    CommonCalcCustomBondForceKernel& kernel;
+    bool includeForce, includeEnergy;
+    double& energy;
+public:
+    Task(ContextImpl& context, CommonCalcCustomBondForceKernel& kernel, bool includeForce, bool includeEnergy, double& energy) :
+            context(context), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
+    }
+    void execute() {
+        const double contribution = kernel.execute(context, includeForce, includeEnergy);
+        energy += contribution;
+    }
+};
+
+/**
+ * Create the parallel kernel, with one CommonCalcCustomBondForceKernel for each
+ * context in the platform data.
+ */
+HipParallelCalcCustomBondForceKernel::HipParallelCalcCustomBondForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
+        CalcCustomBondForceKernel(name, platform), data(data) {
+    for (auto& contextPtr : data.contexts)
+        kernels.push_back(Kernel(new CommonCalcCustomBondForceKernel(name, platform, *contextPtr, system)));
+}
+
+/**
+ * Initialize every per-context kernel with the same system and force.
+ */
+void HipParallelCalcCustomBondForceKernel::initialize(const System& system, const CustomBondForce& force) {
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index) {
+        getKernel(index).initialize(system, force);
+    }
+}
+
+/**
+ * Queue one Task per context on that context's work thread.  Each task adds its
+ * energy to data.contextEnergy, so nothing is returned directly from here.
+ *
+ * @return always 0.0
+ */
+double HipParallelCalcCustomBondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    const int numContexts = (int) data.contexts.size();
+    for (int index = 0; index < numContexts; ++index) {
+        ComputeContext::WorkThread& worker = data.contexts[index]->getWorkThread();
+        worker.addTask(new Task(context, getKernel(index), includeForces, includeEnergy, data.contextEnergy[index]));
+    }
+    return 0.0;
+}
+
+/**
+ * Forward the parameter update to every per-context kernel.
+ */
+void HipParallelCalcCustomBondForceKernel::copyParametersToContext(ContextImpl& context, const CustomBondForce& force) {
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index) {
+        getKernel(index).copyParametersToContext(context, force);
+    }
+}
+
+/**
+ * Work task that executes one context's harmonic angle kernel and adds the value it
+ * returns to the referenced energy accumulator.
+ */
+class HipParallelCalcHarmonicAngleForceKernel::Task : public HipContext::WorkTask {
+    ContextImpl& context;
+    CommonCalcHarmonicAngleForceKernel& kernel;
+    bool includeForce, includeEnergy;
+    double& energy;
+public:
+    Task(ContextImpl& context, CommonCalcHarmonicAngleForceKernel& kernel, bool includeForce, bool includeEnergy, double& energy) :
+            context(context), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
+    }
+    void execute() {
+        const double contribution = kernel.execute(context, includeForce, includeEnergy);
+        energy += contribution;
+    }
+};
+
+/**
+ * Create the parallel kernel, with one CommonCalcHarmonicAngleForceKernel for each
+ * context in the platform data.
+ */
+HipParallelCalcHarmonicAngleForceKernel::HipParallelCalcHarmonicAngleForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
+        CalcHarmonicAngleForceKernel(name, platform), data(data) {
+    for (auto& contextPtr : data.contexts)
+        kernels.push_back(Kernel(new CommonCalcHarmonicAngleForceKernel(name, platform, *contextPtr, system)));
+}
+
+/**
+ * Initialize every per-context kernel with the same system and force.
+ */
+void HipParallelCalcHarmonicAngleForceKernel::initialize(const System& system, const HarmonicAngleForce& force) {
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index) {
+        getKernel(index).initialize(system, force);
+    }
+}
+
+/**
+ * Queue one Task per context on that context's work thread.  Each task adds its
+ * energy to data.contextEnergy, so nothing is returned directly from here.
+ *
+ * @return always 0.0
+ */
+double HipParallelCalcHarmonicAngleForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    const int numContexts = (int) data.contexts.size();
+    for (int index = 0; index < numContexts; ++index) {
+        ComputeContext::WorkThread& worker = data.contexts[index]->getWorkThread();
+        worker.addTask(new Task(context, getKernel(index), includeForces, includeEnergy, data.contextEnergy[index]));
+    }
+    return 0.0;
+}
+
+/**
+ * Forward the parameter update to every per-context kernel.
+ */
+void HipParallelCalcHarmonicAngleForceKernel::copyParametersToContext(ContextImpl& context, const HarmonicAngleForce& force) {
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index) {
+        getKernel(index).copyParametersToContext(context, force);
+    }
+}
+
+/**
+ * Work task that executes one context's custom angle kernel and adds the value it
+ * returns to the referenced energy accumulator.
+ */
+class HipParallelCalcCustomAngleForceKernel::Task : public HipContext::WorkTask {
+    ContextImpl& context;
+    CommonCalcCustomAngleForceKernel& kernel;
+    bool includeForce, includeEnergy;
+    double& energy;
+public:
+    Task(ContextImpl& context, CommonCalcCustomAngleForceKernel& kernel, bool includeForce, bool includeEnergy, double& energy) :
+            context(context), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
+    }
+    void execute() {
+        const double contribution = kernel.execute(context, includeForce, includeEnergy);
+        energy += contribution;
+    }
+};
+
+/**
+ * Create the parallel kernel, with one CommonCalcCustomAngleForceKernel for each
+ * context in the platform data.
+ */
+HipParallelCalcCustomAngleForceKernel::HipParallelCalcCustomAngleForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
+        CalcCustomAngleForceKernel(name, platform), data(data) {
+    for (auto& contextPtr : data.contexts)
+        kernels.push_back(Kernel(new CommonCalcCustomAngleForceKernel(name, platform, *contextPtr, system)));
+}
+
+/**
+ * Initialize every per-context kernel with the same system and force.
+ */
+void HipParallelCalcCustomAngleForceKernel::initialize(const System& system, const CustomAngleForce& force) {
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index) {
+        getKernel(index).initialize(system, force);
+    }
+}
+
+/**
+ * Queue one Task per context on that context's work thread.  Each task adds its
+ * energy to data.contextEnergy, so nothing is returned directly from here.
+ *
+ * @return always 0.0
+ */
+double HipParallelCalcCustomAngleForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    const int numContexts = (int) data.contexts.size();
+    for (int index = 0; index < numContexts; ++index) {
+        ComputeContext::WorkThread& worker = data.contexts[index]->getWorkThread();
+        worker.addTask(new Task(context, getKernel(index), includeForces, includeEnergy, data.contextEnergy[index]));
+    }
+    return 0.0;
+}
+
+/**
+ * Forward the parameter update to every per-context kernel.
+ */
+void HipParallelCalcCustomAngleForceKernel::copyParametersToContext(ContextImpl& context, const CustomAngleForce& force) {
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index) {
+        getKernel(index).copyParametersToContext(context, force);
+    }
+}
+
+/**
+ * Work task that executes one context's periodic torsion kernel and adds the value it
+ * returns to the referenced energy accumulator.
+ */
+class HipParallelCalcPeriodicTorsionForceKernel::Task : public HipContext::WorkTask {
+    ContextImpl& context;
+    CommonCalcPeriodicTorsionForceKernel& kernel;
+    bool includeForce, includeEnergy;
+    double& energy;
+public:
+    Task(ContextImpl& context, CommonCalcPeriodicTorsionForceKernel& kernel, bool includeForce, bool includeEnergy, double& energy) :
+            context(context), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
+    }
+    void execute() {
+        const double contribution = kernel.execute(context, includeForce, includeEnergy);
+        energy += contribution;
+    }
+};
+
+/**
+ * Create the parallel kernel, with one CommonCalcPeriodicTorsionForceKernel for each
+ * context in the platform data.
+ */
+HipParallelCalcPeriodicTorsionForceKernel::HipParallelCalcPeriodicTorsionForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
+        CalcPeriodicTorsionForceKernel(name, platform), data(data) {
+    for (auto& contextPtr : data.contexts)
+        kernels.push_back(Kernel(new CommonCalcPeriodicTorsionForceKernel(name, platform, *contextPtr, system)));
+}
+
+/**
+ * Initialize every per-context kernel with the same system and force.
+ */
+void HipParallelCalcPeriodicTorsionForceKernel::initialize(const System& system, const PeriodicTorsionForce& force) {
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index) {
+        getKernel(index).initialize(system, force);
+    }
+}
+
+/**
+ * Queue one Task per context on that context's work thread.  Each task adds its
+ * energy to data.contextEnergy, so nothing is returned directly from here.
+ *
+ * @return always 0.0
+ */
+double HipParallelCalcPeriodicTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    const int numContexts = (int) data.contexts.size();
+    for (int index = 0; index < numContexts; ++index) {
+        ComputeContext::WorkThread& worker = data.contexts[index]->getWorkThread();
+        worker.addTask(new Task(context, getKernel(index), includeForces, includeEnergy, data.contextEnergy[index]));
+    }
+    return 0.0;
+}
+
+/**
+ * Forward updated PeriodicTorsionForce parameters to every per-device kernel.
+ */
+void HipParallelCalcPeriodicTorsionForceKernel::copyParametersToContext(ContextImpl& context, const PeriodicTorsionForce& force) {
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index)
+        getKernel(index).copyParametersToContext(context, force);
+}
+
+/**
+ * Work item that runs one per-device CommonCalcRBTorsionForceKernel and adds
+ * the value it returns to the energy accumulator supplied at construction.
+ */
+class HipParallelCalcRBTorsionForceKernel::Task : public HipContext::WorkTask {
+    ContextImpl& context;
+    CommonCalcRBTorsionForceKernel& kernel;
+    bool includeForce, includeEnergy;
+    double& energy;
+public:
+    Task(ContextImpl& ctx, CommonCalcRBTorsionForceKernel& deviceKernel, bool wantForce,
+            bool wantEnergy, double& energyOut) :
+            context(ctx), kernel(deviceKernel), includeForce(wantForce),
+            includeEnergy(wantEnergy), energy(energyOut) {
+    }
+    void execute() {
+        // Accumulate rather than overwrite: the target is shared with other tasks' results.
+        const double result = kernel.execute(context, includeForce, includeEnergy);
+        energy += result;
+    }
+};
+
+/**
+ * Create one CommonCalcRBTorsionForceKernel for each HipContext held by the platform data.
+ */
+HipParallelCalcRBTorsionForceKernel::HipParallelCalcRBTorsionForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
+        CalcRBTorsionForceKernel(name, platform), data(data) {
+    for (HipContext* deviceContext : data.contexts) {
+        Kernel deviceKernel(new CommonCalcRBTorsionForceKernel(name, platform, *deviceContext, system));
+        kernels.push_back(deviceKernel);
+    }
+}
+
+/**
+ * Initialize every per-device kernel with the same System and RBTorsionForce.
+ */
+void HipParallelCalcRBTorsionForceKernel::initialize(const System& system, const RBTorsionForce& force) {
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index)
+        getKernel(index).initialize(system, force);
+}
+
+/**
+ * Queue the calculation on the work thread of every device context.
+ *
+ * Each queued Task adds its kernel's result to data.contextEnergy[i]; this
+ * method itself always returns 0.0.
+ */
+double HipParallelCalcRBTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    const int numContexts = (int) data.contexts.size();
+    for (int index = 0; index < numContexts; ++index) {
+        ComputeContext::WorkThread& worker = data.contexts[index]->getWorkThread();
+        worker.addTask(new Task(context, getKernel(index), includeForces, includeEnergy, data.contextEnergy[index]));
+    }
+    return 0.0;
+}
+
+/**
+ * Forward updated RBTorsionForce parameters to every per-device kernel.
+ */
+void HipParallelCalcRBTorsionForceKernel::copyParametersToContext(ContextImpl& context, const RBTorsionForce& force) {
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index)
+        getKernel(index).copyParametersToContext(context, force);
+}
+
+/**
+ * Work item that runs one per-device CommonCalcCMAPTorsionForceKernel and adds
+ * the value it returns to the energy accumulator supplied at construction.
+ */
+class HipParallelCalcCMAPTorsionForceKernel::Task : public HipContext::WorkTask {
+    ContextImpl& context;
+    CommonCalcCMAPTorsionForceKernel& kernel;
+    bool includeForce, includeEnergy;
+    double& energy;
+public:
+    Task(ContextImpl& ctx, CommonCalcCMAPTorsionForceKernel& deviceKernel, bool wantForce,
+            bool wantEnergy, double& energyOut) :
+            context(ctx), kernel(deviceKernel), includeForce(wantForce),
+            includeEnergy(wantEnergy), energy(energyOut) {
+    }
+    void execute() {
+        // Accumulate rather than overwrite: the target is shared with other tasks' results.
+        const double result = kernel.execute(context, includeForce, includeEnergy);
+        energy += result;
+    }
+};
+
+/**
+ * Create one CommonCalcCMAPTorsionForceKernel for each HipContext held by the platform data.
+ */
+HipParallelCalcCMAPTorsionForceKernel::HipParallelCalcCMAPTorsionForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
+        CalcCMAPTorsionForceKernel(name, platform), data(data) {
+    for (HipContext* deviceContext : data.contexts) {
+        Kernel deviceKernel(new CommonCalcCMAPTorsionForceKernel(name, platform, *deviceContext, system));
+        kernels.push_back(deviceKernel);
+    }
+}
+
+/**
+ * Initialize every per-device kernel with the same System and CMAPTorsionForce.
+ */
+void HipParallelCalcCMAPTorsionForceKernel::initialize(const System& system, const CMAPTorsionForce& force) {
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index)
+        getKernel(index).initialize(system, force);
+}
+
+/**
+ * Queue the calculation on the work thread of every device context.
+ *
+ * Each queued Task adds its kernel's result to data.contextEnergy[i]; this
+ * method itself always returns 0.0.
+ */
+double HipParallelCalcCMAPTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    const int numContexts = (int) data.contexts.size();
+    for (int index = 0; index < numContexts; ++index) {
+        ComputeContext::WorkThread& worker = data.contexts[index]->getWorkThread();
+        worker.addTask(new Task(context, getKernel(index), includeForces, includeEnergy, data.contextEnergy[index]));
+    }
+    return 0.0;
+}
+
+/**
+ * Forward updated CMAPTorsionForce parameters to every per-device kernel.
+ */
+void HipParallelCalcCMAPTorsionForceKernel::copyParametersToContext(ContextImpl& context, const CMAPTorsionForce& force) {
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index)
+        getKernel(index).copyParametersToContext(context, force);
+}
+
+/**
+ * Work item that runs one per-device CommonCalcCustomTorsionForceKernel and adds
+ * the value it returns to the energy accumulator supplied at construction.
+ */
+class HipParallelCalcCustomTorsionForceKernel::Task : public HipContext::WorkTask {
+    ContextImpl& context;
+    CommonCalcCustomTorsionForceKernel& kernel;
+    bool includeForce, includeEnergy;
+    double& energy;
+public:
+    Task(ContextImpl& ctx, CommonCalcCustomTorsionForceKernel& deviceKernel, bool wantForce,
+            bool wantEnergy, double& energyOut) :
+            context(ctx), kernel(deviceKernel), includeForce(wantForce),
+            includeEnergy(wantEnergy), energy(energyOut) {
+    }
+    void execute() {
+        // Accumulate rather than overwrite: the target is shared with other tasks' results.
+        const double result = kernel.execute(context, includeForce, includeEnergy);
+        energy += result;
+    }
+};
+
+/**
+ * Create one CommonCalcCustomTorsionForceKernel for each HipContext held by the platform data.
+ */
+HipParallelCalcCustomTorsionForceKernel::HipParallelCalcCustomTorsionForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
+        CalcCustomTorsionForceKernel(name, platform), data(data) {
+    for (HipContext* deviceContext : data.contexts) {
+        Kernel deviceKernel(new CommonCalcCustomTorsionForceKernel(name, platform, *deviceContext, system));
+        kernels.push_back(deviceKernel);
+    }
+}
+
+/**
+ * Initialize every per-device kernel with the same System and CustomTorsionForce.
+ */
+void HipParallelCalcCustomTorsionForceKernel::initialize(const System& system, const CustomTorsionForce& force) {
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index)
+        getKernel(index).initialize(system, force);
+}
+
+/**
+ * Queue the calculation on the work thread of every device context.
+ *
+ * Each queued Task adds its kernel's result to data.contextEnergy[i]; this
+ * method itself always returns 0.0.
+ */
+double HipParallelCalcCustomTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    const int numContexts = (int) data.contexts.size();
+    for (int index = 0; index < numContexts; ++index) {
+        ComputeContext::WorkThread& worker = data.contexts[index]->getWorkThread();
+        worker.addTask(new Task(context, getKernel(index), includeForces, includeEnergy, data.contextEnergy[index]));
+    }
+    return 0.0;
+}
+
+/**
+ * Forward updated CustomTorsionForce parameters to every per-device kernel.
+ */
+void HipParallelCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& context, const CustomTorsionForce& force) {
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index)
+        getKernel(index).copyParametersToContext(context, force);
+}
+
+/**
+ * Work item that runs one per-device HipCalcNonbondedForceKernel, passing through
+ * the direct/reciprocal flags, and adds the value it returns to the energy
+ * accumulator supplied at construction.
+ */
+class HipParallelCalcNonbondedForceKernel::Task : public HipContext::WorkTask {
+    ContextImpl& context;
+    HipCalcNonbondedForceKernel& kernel;
+    bool includeForce, includeEnergy, includeDirect, includeReciprocal;
+    double& energy;
+public:
+    Task(ContextImpl& ctx, HipCalcNonbondedForceKernel& deviceKernel, bool wantForce,
+            bool wantEnergy, bool wantDirect, bool wantReciprocal, double& energyOut) :
+            context(ctx), kernel(deviceKernel), includeForce(wantForce), includeEnergy(wantEnergy),
+            includeDirect(wantDirect), includeReciprocal(wantReciprocal), energy(energyOut) {
+    }
+    void execute() {
+        // Accumulate rather than overwrite: the target is shared with other tasks' results.
+        const double result = kernel.execute(context, includeForce, includeEnergy, includeDirect, includeReciprocal);
+        energy += result;
+    }
+};
+
+/**
+ * Create one HipCalcNonbondedForceKernel for each HipContext held by the platform data.
+ */
+HipParallelCalcNonbondedForceKernel::HipParallelCalcNonbondedForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
+        CalcNonbondedForceKernel(name, platform), data(data) {
+    for (HipContext* deviceContext : data.contexts) {
+        Kernel deviceKernel(new HipCalcNonbondedForceKernel(name, platform, *deviceContext, system));
+        kernels.push_back(deviceKernel);
+    }
+}
+
+/**
+ * Initialize every per-device kernel with the same System and NonbondedForce.
+ */
+void HipParallelCalcNonbondedForceKernel::initialize(const System& system, const NonbondedForce& force) {
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index)
+        getKernel(index).initialize(system, force);
+}
+
+/**
+ * Queue the calculation on the work thread of every device context.
+ *
+ * Each queued Task adds its kernel's result to data.contextEnergy[i]; this
+ * method itself always returns 0.0.
+ */
+double HipParallelCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal) {
+    const int numContexts = (int) data.contexts.size();
+    for (int index = 0; index < numContexts; ++index) {
+        ComputeContext::WorkThread& worker = data.contexts[index]->getWorkThread();
+        worker.addTask(new Task(context, getKernel(index), includeForces, includeEnergy, includeDirect, includeReciprocal, data.contextEnergy[index]));
+    }
+    return 0.0;
+}
+
+/**
+ * Forward updated NonbondedForce parameters to every per-device kernel.
+ */
+void HipParallelCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context, const NonbondedForce& force) {
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index)
+        getKernel(index).copyParametersToContext(context, force);
+}
+
+/**
+ * Report the PME parameters by asking the first per-device kernel (kernels[0]).
+ */
+void HipParallelCalcNonbondedForceKernel::getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
+    const HipCalcNonbondedForceKernel& firstKernel = dynamic_cast<const HipCalcNonbondedForceKernel&>(kernels[0].getImpl());
+    firstKernel.getPMEParameters(alpha, nx, ny, nz);
+}
+
+/**
+ * Report the LJPME parameters by asking the first per-device kernel (kernels[0]).
+ */
+void HipParallelCalcNonbondedForceKernel::getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
+    const HipCalcNonbondedForceKernel& firstKernel = dynamic_cast<const HipCalcNonbondedForceKernel&>(kernels[0].getImpl());
+    firstKernel.getLJPMEParameters(alpha, nx, ny, nz);
+}
+
+/**
+ * Work item that runs one per-device CommonCalcCustomNonbondedForceKernel and adds
+ * the value it returns to the energy accumulator supplied at construction.
+ */
+class HipParallelCalcCustomNonbondedForceKernel::Task : public HipContext::WorkTask {
+    ContextImpl& context;
+    CommonCalcCustomNonbondedForceKernel& kernel;
+    bool includeForce, includeEnergy;
+    double& energy;
+public:
+    Task(ContextImpl& ctx, CommonCalcCustomNonbondedForceKernel& deviceKernel, bool wantForce,
+            bool wantEnergy, double& energyOut) :
+            context(ctx), kernel(deviceKernel), includeForce(wantForce),
+            includeEnergy(wantEnergy), energy(energyOut) {
+    }
+    void execute() {
+        // Accumulate rather than overwrite: the target is shared with other tasks' results.
+        const double result = kernel.execute(context, includeForce, includeEnergy);
+        energy += result;
+    }
+};
+
+/**
+ * Create one CommonCalcCustomNonbondedForceKernel for each HipContext held by the platform data.
+ */
+HipParallelCalcCustomNonbondedForceKernel::HipParallelCalcCustomNonbondedForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
+        CalcCustomNonbondedForceKernel(name, platform), data(data) {
+    for (HipContext* deviceContext : data.contexts) {
+        Kernel deviceKernel(new CommonCalcCustomNonbondedForceKernel(name, platform, *deviceContext, system));
+        kernels.push_back(deviceKernel);
+    }
+}
+
+/**
+ * Initialize every per-device kernel with the same System and CustomNonbondedForce.
+ */
+void HipParallelCalcCustomNonbondedForceKernel::initialize(const System& system, const CustomNonbondedForce& force) {
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index)
+        getKernel(index).initialize(system, force);
+}
+
+/**
+ * Queue the calculation on the work thread of every device context.
+ *
+ * Each queued Task adds its kernel's result to data.contextEnergy[i]; this
+ * method itself always returns 0.0.
+ */
+double HipParallelCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    const int numContexts = (int) data.contexts.size();
+    for (int index = 0; index < numContexts; ++index) {
+        ComputeContext::WorkThread& worker = data.contexts[index]->getWorkThread();
+        worker.addTask(new Task(context, getKernel(index), includeForces, includeEnergy, data.contextEnergy[index]));
+    }
+    return 0.0;
+}
+
+/**
+ * Forward updated CustomNonbondedForce parameters to every per-device kernel.
+ */
+void HipParallelCalcCustomNonbondedForceKernel::copyParametersToContext(ContextImpl& context, const CustomNonbondedForce& force) {
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index)
+        getKernel(index).copyParametersToContext(context, force);
+}
+
+/**
+ * Work item that runs one per-device CommonCalcCustomExternalForceKernel and adds
+ * the value it returns to the energy accumulator supplied at construction.
+ */
+class HipParallelCalcCustomExternalForceKernel::Task : public HipContext::WorkTask {
+    ContextImpl& context;
+    CommonCalcCustomExternalForceKernel& kernel;
+    bool includeForce, includeEnergy;
+    double& energy;
+public:
+    Task(ContextImpl& ctx, CommonCalcCustomExternalForceKernel& deviceKernel, bool wantForce,
+            bool wantEnergy, double& energyOut) :
+            context(ctx), kernel(deviceKernel), includeForce(wantForce),
+            includeEnergy(wantEnergy), energy(energyOut) {
+    }
+    void execute() {
+        // Accumulate rather than overwrite: the target is shared with other tasks' results.
+        const double result = kernel.execute(context, includeForce, includeEnergy);
+        energy += result;
+    }
+};
+
+/**
+ * Create one CommonCalcCustomExternalForceKernel for each HipContext held by the platform data.
+ */
+HipParallelCalcCustomExternalForceKernel::HipParallelCalcCustomExternalForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
+        CalcCustomExternalForceKernel(name, platform), data(data) {
+    for (HipContext* deviceContext : data.contexts) {
+        Kernel deviceKernel(new CommonCalcCustomExternalForceKernel(name, platform, *deviceContext, system));
+        kernels.push_back(deviceKernel);
+    }
+}
+
+/**
+ * Initialize every per-device kernel with the same System and CustomExternalForce.
+ */
+void HipParallelCalcCustomExternalForceKernel::initialize(const System& system, const CustomExternalForce& force) {
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index)
+        getKernel(index).initialize(system, force);
+}
+
+/**
+ * Queue the calculation on the work thread of every device context.
+ *
+ * Each queued Task adds its kernel's result to data.contextEnergy[i]; this
+ * method itself always returns 0.0.
+ */
+double HipParallelCalcCustomExternalForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    const int numContexts = (int) data.contexts.size();
+    for (int index = 0; index < numContexts; ++index) {
+        ComputeContext::WorkThread& worker = data.contexts[index]->getWorkThread();
+        worker.addTask(new Task(context, getKernel(index), includeForces, includeEnergy, data.contextEnergy[index]));
+    }
+    return 0.0;
+}
+
+/**
+ * Forward updated CustomExternalForce parameters to every per-device kernel.
+ */
+void HipParallelCalcCustomExternalForceKernel::copyParametersToContext(ContextImpl& context, const CustomExternalForce& force) {
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index)
+        getKernel(index).copyParametersToContext(context, force);
+}
+
+/**
+ * Work item that runs one per-device CommonCalcCustomHbondForceKernel and adds
+ * the value it returns to the energy accumulator supplied at construction.
+ */
+class HipParallelCalcCustomHbondForceKernel::Task : public HipContext::WorkTask {
+    ContextImpl& context;
+    CommonCalcCustomHbondForceKernel& kernel;
+    bool includeForce, includeEnergy;
+    double& energy;
+public:
+    Task(ContextImpl& ctx, CommonCalcCustomHbondForceKernel& deviceKernel, bool wantForce,
+            bool wantEnergy, double& energyOut) :
+            context(ctx), kernel(deviceKernel), includeForce(wantForce),
+            includeEnergy(wantEnergy), energy(energyOut) {
+    }
+    void execute() {
+        // Accumulate rather than overwrite: the target is shared with other tasks' results.
+        const double result = kernel.execute(context, includeForce, includeEnergy);
+        energy += result;
+    }
+};
+
+/**
+ * Create one CommonCalcCustomHbondForceKernel for each HipContext held by the platform data.
+ */
+HipParallelCalcCustomHbondForceKernel::HipParallelCalcCustomHbondForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
+        CalcCustomHbondForceKernel(name, platform), data(data) {
+    for (HipContext* deviceContext : data.contexts) {
+        Kernel deviceKernel(new CommonCalcCustomHbondForceKernel(name, platform, *deviceContext, system));
+        kernels.push_back(deviceKernel);
+    }
+}
+
+/**
+ * Initialize every per-device kernel with the same System and CustomHbondForce.
+ */
+void HipParallelCalcCustomHbondForceKernel::initialize(const System& system, const CustomHbondForce& force) {
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index)
+        getKernel(index).initialize(system, force);
+}
+
+/**
+ * Queue the calculation on the work thread of every device context.
+ *
+ * Each queued Task adds its kernel's result to data.contextEnergy[i]; this
+ * method itself always returns 0.0.
+ */
+double HipParallelCalcCustomHbondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    const int numContexts = (int) data.contexts.size();
+    for (int index = 0; index < numContexts; ++index) {
+        ComputeContext::WorkThread& worker = data.contexts[index]->getWorkThread();
+        worker.addTask(new Task(context, getKernel(index), includeForces, includeEnergy, data.contextEnergy[index]));
+    }
+    return 0.0;
+}
+
+/**
+ * Forward updated CustomHbondForce parameters to every per-device kernel.
+ */
+void HipParallelCalcCustomHbondForceKernel::copyParametersToContext(ContextImpl& context, const CustomHbondForce& force) {
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index)
+        getKernel(index).copyParametersToContext(context, force);
+}
+
+/**
+ * Work item that runs one per-device CommonCalcCustomCompoundBondForceKernel and adds
+ * the value it returns to the energy accumulator supplied at construction.
+ */
+class HipParallelCalcCustomCompoundBondForceKernel::Task : public HipContext::WorkTask {
+    ContextImpl& context;
+    CommonCalcCustomCompoundBondForceKernel& kernel;
+    bool includeForce, includeEnergy;
+    double& energy;
+public:
+    Task(ContextImpl& ctx, CommonCalcCustomCompoundBondForceKernel& deviceKernel, bool wantForce,
+            bool wantEnergy, double& energyOut) :
+            context(ctx), kernel(deviceKernel), includeForce(wantForce),
+            includeEnergy(wantEnergy), energy(energyOut) {
+    }
+    void execute() {
+        // Accumulate rather than overwrite: the target is shared with other tasks' results.
+        const double result = kernel.execute(context, includeForce, includeEnergy);
+        energy += result;
+    }
+};
+
+/**
+ * Create one CommonCalcCustomCompoundBondForceKernel for each HipContext held by the platform data.
+ */
+HipParallelCalcCustomCompoundBondForceKernel::HipParallelCalcCustomCompoundBondForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
+        CalcCustomCompoundBondForceKernel(name, platform), data(data) {
+    for (HipContext* deviceContext : data.contexts) {
+        Kernel deviceKernel(new CommonCalcCustomCompoundBondForceKernel(name, platform, *deviceContext, system));
+        kernels.push_back(deviceKernel);
+    }
+}
+
+/**
+ * Initialize every per-device kernel with the same System and CustomCompoundBondForce.
+ */
+void HipParallelCalcCustomCompoundBondForceKernel::initialize(const System& system, const CustomCompoundBondForce& force) {
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index)
+        getKernel(index).initialize(system, force);
+}
+
+/**
+ * Queue the calculation on the work thread of every device context.
+ *
+ * Each queued Task adds its kernel's result to data.contextEnergy[i]; this
+ * method itself always returns 0.0.
+ */
+double HipParallelCalcCustomCompoundBondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    const int numContexts = (int) data.contexts.size();
+    for (int index = 0; index < numContexts; ++index) {
+        ComputeContext::WorkThread& worker = data.contexts[index]->getWorkThread();
+        worker.addTask(new Task(context, getKernel(index), includeForces, includeEnergy, data.contextEnergy[index]));
+    }
+    return 0.0;
+}
+
+/**
+ * Forward updated CustomCompoundBondForce parameters to every per-device kernel.
+ */
+void HipParallelCalcCustomCompoundBondForceKernel::copyParametersToContext(ContextImpl& context, const CustomCompoundBondForce& force) {
+    const int numKernels = (int) kernels.size();
+    for (int index = 0; index < numKernels; ++index)
+        getKernel(index).copyParametersToContext(context, force);
+}
--- a/platforms/hip/src/HipParameterSet.cpp
+++ b/platforms/hip/src/HipParameterSet.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009-2019 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "HipParameterSet.h"
+
+using namespace OpenMM;
+using namespace std;
+
+/**
+ * Construct the parameter set through ComputeParameterSet, then record a
+ * HipNonbondedUtilities::ParameterInfo (name, component type, component count,
+ * size and device pointer) for each parameter array in `buffers`.
+ */
+HipParameterSet::HipParameterSet(HipContext& context, int numParameters, int numObjects, const string& name, bool bufferPerParameter, bool useDoublePrecision) :
+            ComputeParameterSet(context, numParameters, numObjects, name, bufferPerParameter, useDoublePrecision) {
+    for (auto& info : getParameterInfos()) {
+        HipNonbondedUtilities::ParameterInfo bufferInfo(info.getName(), info.getComponentType(), info.getNumComponents(),
+                info.getSize(), context.unwrap(info.getArray()).getDevicePointer());
+        buffers.push_back(bufferInfo);
+    }
+}
--- a/platforms/hip/src/HipPlatform.cpp
+++ b/platforms/hip/src/HipPlatform.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2024 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "HipContext.h"
+#include "HipExpressionUtilities.h"
+#include "HipPlatform.h"
+#include "HipKernelFactory.h"
+#include "HipKernels.h"
+#include "openmm/Context.h"
+#include "openmm/System.h"
+#include "openmm/internal/ContextImpl.h"
+#include "openmm/internal/hardware.h"
+#include <algorithm>
+#include <cctype>
+#include <sstream>
+#include <cstdio>
+#ifdef _MSC_VER
+    #include <Windows.h>
+#endif
+using namespace OpenMM;
+using namespace std;
+
+#define CHECK_RESULT(result, prefix) \
+    if (result != hipSuccess) { \
+        std::stringstream m; \
+        m<<prefix<<": "<<HipContext::getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
+        throw OpenMMException(m.str());\
+    }
+
+
+#ifdef OPENMM_COMMON_BUILDING_STATIC_LIBRARY
+// Static-library build: registration is exposed under the HIP-specific name registerHipPlatform().
+extern "C" void registerHipPlatform() {
+    HipPlatform* hipPlatform = new HipPlatform();
+    Platform::registerPlatform(hipPlatform);
+}
+#else
+// Otherwise the exported entry point is the generic registerPlatforms().
+extern "C" OPENMM_EXPORT_COMMON void registerPlatforms() {
+    HipPlatform* hipPlatform = new HipPlatform();
+    Platform::registerPlatform(hipPlatform);
+}
+#endif
+
+/**
+ * Set up the HIP platform: record the replacements for deprecated property
+ * names, register a single HipKernelFactory for every supported kernel, list
+ * the platform properties and assign their default values.
+ */
+HipPlatform::HipPlatform() {
+    // Map each deprecated "Hip..." property name to its current name.
+    deprecatedPropertyReplacements["HipDeviceIndex"] = HipDeviceIndex();
+    deprecatedPropertyReplacements["HipDeviceName"] = HipDeviceName();
+    deprecatedPropertyReplacements["HipUseBlockingSync"] = HipUseBlockingSync();
+    deprecatedPropertyReplacements["HipPrecision"] = HipPrecision();
+    deprecatedPropertyReplacements["HipUseCpuPme"] = HipUseCpuPme();
+    deprecatedPropertyReplacements["HipTempDirectory"] = HipTempDirectory();
+    deprecatedPropertyReplacements["HipDisablePmeStream"] = HipDisablePmeStream();
+    deprecatedPropertyReplacements["HipDeterministicForces"] = HipDeterministicForces();
+
+    // The same factory instance is registered for every kernel name.
+    HipKernelFactory* factory = new HipKernelFactory();
+    registerKernelFactory(CalcForcesAndEnergyKernel::Name(), factory);
+    registerKernelFactory(UpdateStateDataKernel::Name(), factory);
+    registerKernelFactory(ApplyConstraintsKernel::Name(), factory);
+    registerKernelFactory(VirtualSitesKernel::Name(), factory);
+    registerKernelFactory(CalcHarmonicBondForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomBondForceKernel::Name(), factory);
+    registerKernelFactory(CalcHarmonicAngleForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomAngleForceKernel::Name(), factory);
+    registerKernelFactory(CalcPeriodicTorsionForceKernel::Name(), factory);
+    registerKernelFactory(CalcRBTorsionForceKernel::Name(), factory);
+    registerKernelFactory(CalcCMAPTorsionForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomTorsionForceKernel::Name(), factory);
+    registerKernelFactory(CalcNonbondedForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomNonbondedForceKernel::Name(), factory);
+    registerKernelFactory(CalcGBSAOBCForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomGBForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomExternalForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomHbondForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomCentroidBondForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomCompoundBondForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomCPPForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomCVForceKernel::Name(), factory);
+    registerKernelFactory(CalcRMSDForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomManyParticleForceKernel::Name(), factory);
+    registerKernelFactory(CalcGayBerneForceKernel::Name(), factory);
+    registerKernelFactory(IntegrateVerletStepKernel::Name(), factory);
+    registerKernelFactory(IntegrateNoseHooverStepKernel::Name(), factory);
+    registerKernelFactory(IntegrateLangevinMiddleStepKernel::Name(), factory);
+    registerKernelFactory(IntegrateBrownianStepKernel::Name(), factory);
+    registerKernelFactory(IntegrateVariableVerletStepKernel::Name(), factory);
+    registerKernelFactory(IntegrateVariableLangevinStepKernel::Name(), factory);
+    registerKernelFactory(IntegrateCustomStepKernel::Name(), factory);
+    registerKernelFactory(ApplyAndersenThermostatKernel::Name(), factory);
+    registerKernelFactory(ApplyMonteCarloBarostatKernel::Name(), factory);
+    registerKernelFactory(RemoveCMMotionKernel::Name(), factory);
+    registerKernelFactory(CalcATMForceKernel::Name(), factory);
+
+    // Properties this platform exposes.
+    platformProperties.push_back(HipDeviceIndex());
+    platformProperties.push_back(HipDeviceName());
+    platformProperties.push_back(HipUseBlockingSync());
+    platformProperties.push_back(HipPrecision());
+    platformProperties.push_back(HipUseCpuPme());
+    platformProperties.push_back(HipTempDirectory());
+    platformProperties.push_back(HipDisablePmeStream());
+    platformProperties.push_back(HipDeterministicForces());
+
+    // Default values for those properties.
+    setPropertyDefaultValue(HipDeviceIndex(), "");
+    setPropertyDefaultValue(HipDeviceName(), "");
+    setPropertyDefaultValue(HipUseBlockingSync(), "true");
+    setPropertyDefaultValue(HipPrecision(), "single");
+    setPropertyDefaultValue(HipUseCpuPme(), "false");
+    setPropertyDefaultValue(HipDisablePmeStream(), "false");
+    setPropertyDefaultValue(HipDeterministicForces(), "false");
+#ifdef _MSC_VER
+    // getenv() returns NULL for an unset variable, and building a std::string from
+    // NULL is undefined behavior, so check before use: try TEMP, then TMP, and fall
+    // back to the current directory if neither is set.
+    const char* tmpdir = getenv("TEMP");
+    if (tmpdir == NULL)
+        tmpdir = getenv("TMP");
+    string tmp = (tmpdir == NULL ? string(".") : string(tmpdir));
+    setPropertyDefaultValue(HipTempDirectory(), tmp);
+#else
+    // Use TMPDIR when it is set, otherwise the C library's P_tmpdir.
+    const char* tmpdir = getenv("TMPDIR");
+    string tmp = (tmpdir == NULL ? string(P_tmpdir) : string(tmpdir));
+    setPropertyDefaultValue(HipTempDirectory(), tmp);
+#endif
+}
+
+/**
+ * Return the platform's speed estimate: 40 when hipGetDeviceCount() reports
+ * hipErrorNoDevice, otherwise 100.
+ */
+double HipPlatform::getSpeed() const {
+    // Report a lower speed when there are no HIP devices in the system,
+    // so that the OpenCL platform can be selected as the default instead.
+    int deviceCount;
+    const bool noDevices = (hipGetDeviceCount(&deviceCount) == hipErrorNoDevice);
+    if (noDevices)
+        return 40;
+    return 100;
+}
+
+// Always reports that double precision is supported.
+bool HipPlatform::supportsDoublePrecision() const {
+    return true;
+}
+
+/**
+ * Look up the value of a property for a context.
+ *
+ * A deprecated property name is first translated to its replacement. If the
+ * context's PlatformData holds a value under that name it is returned; otherwise
+ * the request (with the name as originally given) is passed to
+ * Platform::getPropertyValue().
+ */
+const string& HipPlatform::getPropertyValue(const Context& context, const string& property) const {
+    const PlatformData* data = reinterpret_cast<const PlatformData*>(getContextImpl(context).getPlatformData());
+    string currentName = property;
+    auto replacement = deprecatedPropertyReplacements.find(property);
+    if (replacement != deprecatedPropertyReplacements.end())
+        currentName = replacement->second;
+    auto stored = data->propertyValues.find(currentName);
+    if (stored == data->propertyValues.end())
+        return Platform::getPropertyValue(context, property);
+    return stored->second;
+}
+
+// No-op: the body is empty, so the supplied property value is ignored.
+void HipPlatform::setPropertyValue(Context& context, const string& property, const string& value) const {
+}
+
+/**
+ * Build the PlatformData for a newly created context.
+ *
+ * Each property is taken from `properties` when present and from the platform
+ * default otherwise. The blocking-sync, precision, CPU-PME, PME-stream and
+ * deterministic-forces values are lower-cased; the device index and temp
+ * directory are passed through unchanged. CPU PME is forced to "false" when the
+ * CalcPmeReciprocalForceKernel is not supported. The thread count defaults to
+ * getNumProcessors() and can be overridden through OPENMM_CPU_THREADS.
+ */
+void HipPlatform::contextCreated(ContextImpl& context, const map<string, string>& properties) const {
+    // Caller-supplied value if there is one, otherwise the platform default.
+    auto lookup = [&](const string& name) -> string {
+        map<string, string>::const_iterator entry = properties.find(name);
+        return (entry == properties.end() ? getPropertyDefaultValue(name) : entry->second);
+    };
+    // tolower() has undefined behavior for negative char values, so each
+    // character is converted to unsigned char before the call.
+    auto toLower = [](string& text) {
+        transform(text.begin(), text.end(), text.begin(), [](unsigned char c) { return (char) ::tolower(c); });
+    };
+    const string devicePropValue = lookup(HipDeviceIndex());
+    string blockingPropValue = lookup(HipUseBlockingSync());
+    string precisionPropValue = lookup(HipPrecision());
+    string cpuPmePropValue = lookup(HipUseCpuPme());
+    const string tempPropValue = lookup(HipTempDirectory());
+    string pmeStreamPropValue = lookup(HipDisablePmeStream());
+    string deterministicForcesValue = lookup(HipDeterministicForces());
+    toLower(blockingPropValue);
+    toLower(precisionPropValue);
+    toLower(cpuPmePropValue);
+    toLower(pmeStreamPropValue);
+    toLower(deterministicForcesValue);
+    // CPU PME can only be used if a reciprocal-space PME kernel is available.
+    vector<string> pmeKernelName;
+    pmeKernelName.push_back(CalcPmeReciprocalForceKernel::Name());
+    if (!supportsKernels(pmeKernelName))
+        cpuPmePropValue = "false";
+    int threads = getNumProcessors();
+    const char* threadsEnv = getenv("OPENMM_CPU_THREADS");
+    if (threadsEnv != NULL)
+        stringstream(threadsEnv) >> threads;
+    context.setPlatformData(new PlatformData(&context, context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, cpuPmePropValue, tempPropValue,
+            pmeStreamPropValue, deterministicForcesValue, threads, NULL));
+}
+
+/**
+ * Build the PlatformData for a context linked to an existing one. Every property
+ * value and the thread count are read from the original context, which is also
+ * passed on to the new PlatformData.
+ */
+void HipPlatform::linkedContextCreated(ContextImpl& context, ContextImpl& originalContext) const {
+    Platform& sourcePlatform = originalContext.getPlatform();
+    const Context& owner = originalContext.getOwner();
+    const string devicePropValue = sourcePlatform.getPropertyValue(owner, HipDeviceIndex());
+    const string blockingPropValue = sourcePlatform.getPropertyValue(owner, HipUseBlockingSync());
+    const string precisionPropValue = sourcePlatform.getPropertyValue(owner, HipPrecision());
+    const string cpuPmePropValue = sourcePlatform.getPropertyValue(owner, HipUseCpuPme());
+    const string tempPropValue = sourcePlatform.getPropertyValue(owner, HipTempDirectory());
+    const string pmeStreamPropValue = sourcePlatform.getPropertyValue(owner, HipDisablePmeStream());
+    const string deterministicForcesValue = sourcePlatform.getPropertyValue(owner, HipDeterministicForces());
+    // Reuse the thread count of the original context's thread pool.
+    PlatformData* sourceData = reinterpret_cast<PlatformData*>(originalContext.getPlatformData());
+    const int threads = sourceData->threads.getNumThreads();
+    PlatformData* linkedData = new PlatformData(&context, context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue,
+            cpuPmePropValue, tempPropValue, pmeStreamPropValue, deterministicForcesValue, threads, &originalContext);
+    context.setPlatformData(linkedData);
+}
+
+/**
+ * Free the PlatformData object that was attached to a context which is being
+ * destroyed.
+ *
+ * @param context  the context being destroyed
+ */
+void HipPlatform::contextDestroyed(ContextImpl& context) const {
+    delete reinterpret_cast<PlatformData*>(context.getPlatformData());
+}
+
+/**
+ * Build the platform data for one Context: create a HipContext for every
+ * requested device, record the resolved property values, and find out whether
+ * peer-to-peer access is available between the devices.
+ *
+ * @param context                      the ContextImpl this data belongs to
+ * @param system                       the System passed to every HipContext
+ * @param deviceIndexProperty          device indices separated by commas and/or spaces; empty
+ *                                     entries are ignored, and if no index is given a single
+ *                                     HipContext is created with device index -1
+ * @param blockingProperty             "true" to pass blocking = true to every HipContext
+ * @param precisionProperty            precision string, forwarded to every HipContext
+ * @param cpuPmeProperty               "true" to request CPU PME; only honored when the first
+ *                                     context does not use double precision
+ * @param tempProperty                 temporary directory, forwarded to every HipContext
+ * @param pmeStreamProperty            "true" to set disablePmeStream
+ * @param deterministicForcesProperty  "true" to set deterministicForces
+ * @param numThreads                   number of threads used to initialize the thread pool member
+ * @param originalContext              context this one is linked to, or NULL; when given, its
+ *                                     HipContexts are handed to the new HipContexts
+ * @throws whatever HipContext construction or the device name query throws; every HipContext
+ *         created so far is deleted before the exception propagates
+ */
+HipPlatform::PlatformData::PlatformData(ContextImpl* context, const System& system, const string& deviceIndexProperty, const string& blockingProperty, const string& precisionProperty,
+            const string& cpuPmeProperty, const string& tempProperty, const string& pmeStreamProperty,
+            const string& deterministicForcesProperty, int numThreads, ContextImpl* originalContext) :
+                context(context), removeCM(false), stepCount(0), computeForceCount(0), time(0.0), hasInitializedContexts(false),
+                threads(numThreads) {
+    bool blocking = (blockingProperty == "true");
+
+    // Split the device list on commas and spaces.  Consecutive separators (e.g. "0, 1")
+    // produce empty tokens, which are skipped when the contexts are created.
+
+    vector<string> devices;
+    size_t searchPos = 0, nextPos;
+    while ((nextPos = deviceIndexProperty.find_first_of(", ", searchPos)) != string::npos) {
+        devices.push_back(deviceIndexProperty.substr(searchPos, nextPos-searchPos));
+        searchPos = nextPos+1;
+    }
+    devices.push_back(deviceIndexProperty.substr(searchPos));
+    PlatformData* originalData = NULL;
+    if (originalContext != NULL)
+        originalData = reinterpret_cast<PlatformData*>(originalContext->getPlatformData());
+    stringstream deviceIndexList, deviceNameList;
+    try {
+        for (int i = 0; i < (int) devices.size(); i++) {
+            if (devices[i].length() > 0) {
+                int requestedIndex;
+                stringstream(devices[i]) >> requestedIndex;
+
+                // Pair the new context with the original context at the same position.  The
+                // position is the number of contexts created so far, not the token index,
+                // because empty tokens do not produce a context.
+
+                HipContext* linkedContext = (originalData == NULL ? NULL : originalData->contexts[contexts.size()]);
+                contexts.push_back(new HipContext(system, requestedIndex, blocking, precisionProperty, tempProperty, *this, linkedContext));
+            }
+        }
+        if (contexts.size() == 0)
+            contexts.push_back(new HipContext(system, -1, blocking, precisionProperty, tempProperty, *this, (originalData == NULL ? NULL : originalData->contexts[0])));
+
+        // Build the comma separated lists of device indices and names actually in use.
+        // This is done inside the try block so the contexts are released if the name
+        // query reports an error.
+
+        for (int i = 0; i < (int) contexts.size(); i++) {
+            if (i > 0) {
+                deviceIndexList << ',';
+                deviceNameList << ',';
+            }
+            deviceIndexList << contexts[i]->getDeviceIndex();
+            char name[1000];
+            CHECK_RESULT(hipDeviceGetName(name, 1000, contexts[i]->getDevice()), "Error querying device name");
+            deviceNameList << name;
+        }
+    }
+    catch (...) {
+        // If an exception was thrown, do our best to clean up memory.
+
+        for (int i = 0; i < (int) contexts.size(); i++)
+            delete contexts[i];
+        throw;
+    }
+
+    // Record the property values that are actually in effect.
+
+    useCpuPme = (cpuPmeProperty == "true" && !contexts[0]->getUseDoublePrecision());
+    disablePmeStream = (pmeStreamProperty == "true");
+    deterministicForces = (deterministicForcesProperty == "true");
+    propertyValues[HipPlatform::HipDeviceIndex()] = deviceIndexList.str();
+    propertyValues[HipPlatform::HipDeviceName()] = deviceNameList.str();
+    propertyValues[HipPlatform::HipUseBlockingSync()] = blocking ? "true" : "false";
+    propertyValues[HipPlatform::HipPrecision()] = precisionProperty;
+    propertyValues[HipPlatform::HipUseCpuPme()] = useCpuPme ? "true" : "false";
+    propertyValues[HipPlatform::HipTempDirectory()] = tempProperty;
+    propertyValues[HipPlatform::HipDisablePmeStream()] = disablePmeStream ? "true" : "false";
+    propertyValues[HipPlatform::HipDeterministicForces()] = deterministicForces ? "true" : "false";
+    contextEnergy.resize(contexts.size());
+
+    // Determine whether peer-to-peer copying is supported: every additional device must
+    // be able to access the first device.  A failed query is treated as "not supported"
+    // rather than reading an unset flag.
+
+    peerAccessSupported = true;
+    for (int i = 1; i < (int) contexts.size(); i++) {
+        int canAccess = 0;
+        if (hipDeviceCanAccessPeer(&canAccess, contexts[i]->getDevice(), contexts[0]->getDevice()) != hipSuccess || !canAccess) {
+            peerAccessSupported = false;
+            break;
+        }
+    }
+}
+
+/**
+ * Delete every HipContext owned by this object.
+ */
+HipPlatform::PlatformData::~PlatformData() {
+    for (auto* ownedContext : contexts)
+        delete ownedContext;
+}
+
+/**
+ * Call initialize() on every HipContext, once.  Later calls do nothing.
+ *
+ * @param system  not used by this implementation
+ */
+void HipPlatform::PlatformData::initializeContexts(const System& system) {
+    if (!hasInitializedContexts) {
+        for (auto* hipContext : contexts)
+            hipContext->initialize();
+        hasInitializedContexts = true;
+    }
+}
+
+/**
+ * Flush the work thread of every HipContext, in order.
+ */
+void HipPlatform::PlatformData::syncContexts() {
+    for (auto* hipContext : contexts)
+        hipContext->getWorkThread().flush();
+}
--- a/platforms/hip/src/HipProgram.cpp
+++ b/platforms/hip/src/HipProgram.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2019 Stanford University and the Authors.           *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "HipProgram.h"
+#include "HipKernel.h"
+
+using namespace OpenMM;
+using namespace std;
+
+/**
+ * Wrap a compiled HIP module.
+ *
+ * @param context  the context the module belongs to
+ * @param module   the module whose kernels createKernel() looks up
+ */
+HipProgram::HipProgram(HipContext& context, hipModule_t module)
+        : context(context),
+          module(module) {
+}
+
+/**
+ * Look up a kernel in this program's module and wrap it in a HipKernel.
+ *
+ * @param name  the name of the kernel function inside the module
+ * @return a shared pointer owning the newly created HipKernel
+ */
+ComputeKernel HipProgram::createKernel(const string& name) {
+    return shared_ptr<ComputeKernelImpl>(
+            new HipKernel(context, context.getKernel(module, name.c_str()), name));
+}
--- a/platforms/hip/src/HipSort.cpp
+++ b/platforms/hip/src/HipSort.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2010-2021 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc.              *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "HipSort.h"
+#include "HipKernelSources.h"
+#include <algorithm>
+#include <map>
+
+using namespace OpenMM;
+using namespace std;
+
+/**
+ * Prepare everything needed to sort arrays of a fixed length: compile the sort
+ * kernels specialized for the given trait, choose the launch sizes, and
+ * allocate the workspace arrays.
+ *
+ * @param context  the context in which the sort is performed
+ * @param trait    describes the data type, key type and key extraction; this
+ *                 object deletes it in its destructor
+ * @param length   the number of elements in the arrays that will be sorted
+ * @param uniform  selects the "assignElementsToBuckets" kernel when true and
+ *                 "assignElementsToBuckets2" otherwise, and is passed to the
+ *                 kernel source as the UNIFORM replacement
+ * @throws OpenMMException if the device's shared memory size cannot be queried
+ */
+HipSort::HipSort(HipContext& context, SortTrait* trait, unsigned int length, bool uniform) :
+        context(context), trait(trait), dataLength(length), uniform(uniform) {
+    // Create kernels.
+
+    map<string, string> replacements;
+    replacements["DATA_TYPE"] = trait->getDataType();
+    replacements["KEY_TYPE"] =  trait->getKeyType();
+    replacements["SORT_KEY"] = trait->getSortKey();
+    replacements["MIN_KEY"] = trait->getMinKey();
+    replacements["MAX_KEY"] = trait->getMaxKey();
+    replacements["MAX_VALUE"] = trait->getMaxValue();
+    replacements["UNIFORM"] = (uniform ? "1" : "0");
+    hipModule_t module = context.createModule(context.replaceStrings(HipKernelSources::sort, replacements));
+    shortListKernel = context.getKernel(module, "sortShortList");
+    shortList2Kernel = context.getKernel(module, "sortShortList2");
+    computeRangeKernel = context.getKernel(module, "computeRange");
+    assignElementsKernel = context.getKernel(module, uniform ? "assignElementsToBuckets" : "assignElementsToBuckets2");
+    computeBucketPositionsKernel = context.getKernel(module, "computeBucketPositions");
+    copyToBucketsKernel = context.getKernel(module, "copyDataToBuckets");
+    sortBucketsKernel = context.getKernel(module, "sortBuckets");
+
+    // Work out the work group sizes for various kernels.  The shared memory size drives
+    // every size below, so a failed query must not be allowed to leave it unset.
+
+    int maxSharedMem = 0;
+    if (hipDeviceGetAttribute(&maxSharedMem, hipDeviceAttributeMaxSharedMemoryPerBlock, context.getDevice()) != hipSuccess)
+        throw OpenMMException("Error querying the maximum shared memory per block");
+    int maxLocalBuffer = (maxSharedMem/trait->getDataSize())/2;
+    int maxShortList = min(1024, max(maxLocalBuffer, HipContext::ThreadBlockSize*context.getNumThreadBlocks()));
+    isShortList = (length <= maxShortList);
+    sortKernelSize = 256;
+    rangeKernelSize = 256;
+    if (rangeKernelSize > length)
+        rangeKernelSize = length;
+    // A zero-length sort would otherwise leave rangeKernelSize at 0 and divide by zero below.
+    if (rangeKernelSize < 1)
+        rangeKernelSize = 1;
+    rangeKernelBlocks = (length + rangeKernelSize - 1) / rangeKernelSize;
+    if (sortKernelSize > maxLocalBuffer)
+        sortKernelSize = maxLocalBuffer;
+    unsigned int targetBucketSize = uniform ? sortKernelSize/2 : sortKernelSize/8;
+    unsigned int numBuckets = length/targetBucketSize;
+    if (numBuckets < 1)
+        numBuckets = 1;
+    // computeBucketPositions is executed as a single work group so larger block size is faster.
+    positionsKernelSize = 1024;
+    if (positionsKernelSize > numBuckets)
+        positionsKernelSize = numBuckets;
+
+    // Create workspace arrays.  The bucket bookkeeping arrays are only needed by the
+    // multi-kernel path, so they are skipped for short lists.
+
+    if (!isShortList) {
+        counters.initialize<unsigned int>(context, 1, "counters");
+        unsigned int zero = 0;
+        counters.upload(&zero);
+        dataRange.initialize(context, 2*rangeKernelBlocks, trait->getKeySize(), "sortDataRange");
+        bucketOffset.initialize<uint1>(context, numBuckets, "bucketOffset");
+        bucketOfElement.initialize<uint1>(context, length, "bucketOfElement");
+        offsetInBucket.initialize<uint1>(context, length, "offsetInBucket");
+    }
+    buckets.initialize(context, length, trait->getDataSize(), "buckets");
+}
+
+/**
+ * Release the SortTrait that was handed to the constructor.
+ */
+HipSort::~HipSort()
+{
+    delete trait;
+}
+
+/**
+ * Sort an array in place on the device.
+ *
+ * @param data  the array to sort; its size and element size must match the
+ *              length and trait this object was created with
+ * @throws OpenMMException if the array's size or element size does not match
+ */
+void HipSort::sort(HipArray& data) {
+    const bool sizeMatches = (data.getSize() == dataLength && data.getElementSize() == trait->getDataSize());
+    if (!sizeMatches)
+        throw OpenMMException("HipSort called with different data size");
+    if (data.getSize() == 0)
+        return;
+
+    auto& dataPtr = data.getDevicePointer();
+
+    if (isShortList) {
+        // Short lists are sorted by a single kernel launch.  Which of the two short-list
+        // kernels is used depends on whether the length exceeds ThreadBlockSize*numThreadBlocks.
+
+        const bool exceedsThreadCount = !(dataLength <= HipContext::ThreadBlockSize*context.getNumThreadBlocks());
+        if (exceedsThreadCount) {
+            void* shortArgs[] = {&dataPtr, &dataLength};
+            context.executeKernel(shortListKernel, shortArgs, sortKernelSize, sortKernelSize, dataLength*trait->getDataSize());
+            return;
+        }
+
+        // This kernel writes its result into the buckets array, which is then copied back.
+
+        auto& bucketsPtr = buckets.getDevicePointer();
+        void* shortArgs[] = {&dataPtr, &bucketsPtr, &dataLength};
+        context.executeKernel(shortList2Kernel, shortArgs, dataLength, HipContext::ThreadBlockSize, HipContext::ThreadBlockSize*trait->getKeySize());
+        buckets.copyTo(data);
+        return;
+    }
+
+    // General case: a sequence of kernels that distribute the data into buckets and
+    // then sort every bucket.
+
+    auto& bucketsPtr = buckets.getDevicePointer();
+    auto& rangePtr = dataRange.getDevicePointer();
+    auto& bucketOffsetPtr = bucketOffset.getDevicePointer();
+    auto& bucketOfElementPtr = bucketOfElement.getDevicePointer();
+    auto& offsetInBucketPtr = offsetInBucket.getDevicePointer();
+    auto& countersPtr = counters.getDevicePointer();
+    unsigned int numBuckets = bucketOffset.getSize();
+
+    // Step 1: compute the range of data values.
+
+    void* rangeArgs[] = {&dataPtr, &dataLength, &rangePtr, &numBuckets, &bucketOffsetPtr, &countersPtr};
+    context.executeKernel(computeRangeKernel, rangeArgs, rangeKernelBlocks*rangeKernelSize, rangeKernelSize, 2*rangeKernelSize*trait->getKeySize());
+
+    // Step 2: assign array elements to buckets.
+
+    void* assignArgs[] = {&dataPtr, &dataLength, &numBuckets, &rangePtr,
+            &bucketOffsetPtr, &bucketOfElementPtr, &offsetInBucketPtr};
+    context.executeKernel(assignElementsKernel, assignArgs, data.getSize(), 128);
+
+    // Step 3: compute the position of each bucket.
+
+    void* positionArgs[] = {&numBuckets, &bucketOffsetPtr, &countersPtr};
+    context.executeKernel(computeBucketPositionsKernel, positionArgs, positionsKernelSize, positionsKernelSize, positionsKernelSize*sizeof(int));
+
+    // Step 4: copy the data into the buckets.
+
+    void* copyArgs[] = {&dataPtr, &bucketsPtr, &dataLength, &bucketOffsetPtr,
+            &bucketOfElementPtr, &offsetInBucketPtr};
+    context.executeKernel(copyToBucketsKernel, copyArgs, data.getSize());
+
+    // Step 5: sort each bucket.
+
+    void* bucketSortArgs[] = {&dataPtr, &bucketsPtr, &bucketOffsetPtr};
+    context.executeKernelFlat(sortBucketsKernel, bucketSortArgs, numBuckets*sortKernelSize, sortKernelSize, sortKernelSize*trait->getDataSize());
+}