Add hipification of CUDA platform

Port changes in CUDA backend to HIP Fix a warning about arithmetic operations on void* in HipArray::uploadSubArray Fix "Error Initializing context ROCm 5.3.0" https://github.com/StreamHPC/openmm-hip/issues/3 hipDeviceSetCacheConfig returns hipErrorNotSupported on 5.3 Co-authored-by: Nick Curtis <nicholas.curtis@amd.com>

Add hipification of CUDA platform
Port changes in CUDA backend to HIP Fix a warning about arithmetic operations on void* in HipArray::uploadSubArray Fix "Error Initializing context ROCm 5.3.0" https://github.com/StreamHPC/openmm-hip/issues/3 hipDeviceSetCacheConfig returns hipErrorNotSupported on 5.3 Co-authored-by: Nick Curtis <nicholas.curtis@amd.com>
89d2ff0e · Anton Gorenko · 8defca2d · 89d2ff0e · 89d2ff0e · 89d2ff0e
Unverified Commit 89d2ff0e authored Aug 25, 2024 by Anton Gorenko
20 changed files
--- a/platforms/hip/src/HipBondedUtilities.cpp
+++ b/platforms/hip/src/HipBondedUtilities.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2011-2019 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "HipBondedUtilities.h"
+#include "HipContext.h"
+#include "HipExpressionUtilities.h"
+#include "HipKernelSources.h"
+#include "openmm/OpenMMException.h"
+#include "HipNonbondedUtilities.h"
+#include <iostream>
+using namespace OpenMM;
+using namespace std;
+HipBondedUtilities::HipBondedUtilities(HipContext& context) : context(context), numForceBuffers(0), maxBonds(0), allGroups(0), hasInitializedKernels(false) {
+}
+void HipBondedUtilities::addInteraction(const vector<vector<int> >& atoms, const string& source, int group) {
+    if (atoms.size() > 0) {
+        forceAtoms.push_back(atoms);
+        forceSource.push_back(source);
+        forceGroup.push_back(group);
+        allGroups |= 1<<group;
+    }
+}
+string HipBondedUtilities::addArgument(hipDeviceptr_t data, const string& type) {
+    arguments.push_back(data);
+    argTypes.push_back(type);
+    return "customArg"+context.intToString(arguments.size());
+}
+string HipBondedUtilities::addArgument(ArrayInterface& data, const string& type) {
+    return addArgument(context.unwrap(data).getDevicePointer(), type);
+}
+string HipBondedUtilities::addEnergyParameterDerivative(const string& param) {
+    // See if the parameter has already been added.
+    int index;
+    for (index = 0; index < energyParameterDerivatives.size(); index++)
+        if (param == energyParameterDerivatives[index])
+            break;
+    if (index == energyParameterDerivatives.size())
+        energyParameterDerivatives.push_back(param);
+    context.addEnergyParameterDerivative(param);
+    return string("energyParamDeriv")+context.intToString(index);
+}
+void HipBondedUtilities::addPrefixCode(const string& source) {
+    for (int i = 0; i < (int) prefixCode.size(); i++)
+        if (prefixCode[i] == source)
+            return;
+    prefixCode.push_back(source);
+}
+void HipBondedUtilities::initialize(const System& system) {
+    int numForces = forceAtoms.size();
+    hasInteractions = (numForces > 0);
+    if (!hasInteractions)
+        return;
+    // Build the lists of atom indices.
+    atomIndices.resize(numForces);
+    for (int i = 0; i < numForces; i++) {
+        int numBonds = forceAtoms[i].size();
+        int numAtoms = forceAtoms[i][0].size();
+        int numArrays = (numAtoms+3)/4;
+        int startAtom = 0;
+        atomIndices[i].resize(numArrays);
+        for (int j = 0; j < numArrays; j++) {
+            int width = min(numAtoms-startAtom, 4);
+            int paddedWidth = (width == 3 ? 4 : width);
+            vector<unsigned int> indexVec(paddedWidth*numBonds);
+            for (int bond = 0; bond < numBonds; bond++) {
+                for (int atom = 0; atom < width; atom++)
+                    indexVec[bond*paddedWidth+atom] = forceAtoms[i][bond][startAtom+atom];
+            }
+            atomIndices[i][j].initialize(context, numBonds, 4*paddedWidth, "bondedIndices");
+            atomIndices[i][j].upload(&indexVec[0]);
+            startAtom += width;
+        }
+    }
+    // Create the kernel.
+    stringstream s;
+    s<<HipKernelSources::vectorOps;
+    for (int i = 0; i < (int) prefixCode.size(); i++)
+        s<<prefixCode[i];
+    s<<"extern \"C\" __global__ void computeBondedForces(unsigned long long* __restrict__ forceBuffer, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, int groups, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ";
+    for (int force = 0; force < numForces; force++) {
+        for (int i = 0; i < (int) atomIndices[force].size(); i++) {
+            int indexWidth = atomIndices[force][i].getElementSize()/4;
+            string indexType = "uint"+context.intToString(indexWidth);
+            s<<", const "<<indexType<<"* __restrict__ atomIndices"<<force<<"_"<<i;
+        }
+    }
+    for (int i = 0; i < (int) arguments.size(); i++)
+        s<<", "<<argTypes[i]<<"* customArg"<<(i+1);
+    if (energyParameterDerivatives.size() > 0)
+        s<<", mixed* __restrict__ energyParamDerivs";
+    s<<") {\n";
+    s<<"mixed energy = 0;\n";
+    for (int i = 0; i < energyParameterDerivatives.size(); i++)
+        s<<"mixed energyParamDeriv"<<i<<" = 0;\n";
+    for (int force = 0; force < numForces; force++)
+        s<<createForceSource(force, forceAtoms[force].size(), forceAtoms[force][0].size(), forceGroup[force], forceSource[force]);
+    s<<"energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;\n";
+    const vector<string>& allParamDerivNames = context.getEnergyParamDerivNames();
+    int numDerivs = allParamDerivNames.size();
+    for (int i = 0; i < energyParameterDerivatives.size(); i++)
+        for (int index = 0; index < numDerivs; index++)
+            if (allParamDerivNames[index] == energyParameterDerivatives[i])
+                s<<"energyParamDerivs[(blockIdx.x*blockDim.x+threadIdx.x)*"<<numDerivs<<"+"<<index<<"] += energyParamDeriv"<<i<<";\n";
+    s<<"}\n";
+    map<string, string> defines;
+    defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
+    hipModule_t module = context.createModule(s.str(), defines);
+    kernel = context.getKernel(module, "computeBondedForces");
+    forceAtoms.clear();
+    forceSource.clear();
+}
+string HipBondedUtilities::createForceSource(int forceIndex, int numBonds, int numAtoms, int group, const string& computeForce) {
+    maxBonds = max(maxBonds, numBonds);
+    string suffix[] = {".x", ".y", ".z", ".w"};
+    stringstream s;
+    s<<"if ((groups&"<<(1<<group)<<") != 0)\n";
+    s<<"for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < "<<numBonds<<"; index += blockDim.x*gridDim.x) {\n";
+    int startAtom = 0;
+    for (int i = 0; i < (int) atomIndices[forceIndex].size(); i++) {
+        int indexWidth = atomIndices[forceIndex][i].getElementSize()/4;
+        string indexType = "uint"+context.intToString(indexWidth);
+        s<<"    "<<indexType<<" atoms"<<i<<" = atomIndices"<<forceIndex<<"_"<<i<<"[index];\n";
+        int atomsToLoad = min(indexWidth, numAtoms-startAtom);
+        for (int j = 0; j < atomsToLoad; j++) {
+            s<<"    unsigned int atom"<<(startAtom+j+1)<<" = atoms"<<i<<suffix[j]<<";\n";
+            s<<"    real4 pos"<<(startAtom+j+1)<<" = posq[atom"<<(startAtom+j+1)<<"];\n";
+        }
+        startAtom += indexWidth;
+    }
+    s<<computeForce<<"\n";
+    for (int i = 0; i < numAtoms; i++) {
+        s<<"    atomicAdd(&forceBuffer[atom"<<(i+1)<<"], static_cast<unsigned long long>((long long) (force"<<(i+1)<<".x*0x100000000)));\n";
+        s<<"    atomicAdd(&forceBuffer[atom"<<(i+1)<<"+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force"<<(i+1)<<".y*0x100000000)));\n";
+        s<<"    atomicAdd(&forceBuffer[atom"<<(i+1)<<"+PADDED_NUM_ATOMS*2], static_cast<unsigned long long>((long long) (force"<<(i+1)<<".z*0x100000000)));\n";
+        s<<"    __threadfence_block();\n";
+    }
+    s<<"}\n";
+    return s.str();
+}
+void HipBondedUtilities::computeInteractions(int groups) {
+    if ((groups&allGroups) == 0)
+        return;
+    if (!hasInitializedKernels) {
+        hasInitializedKernels = true;
+        kernelArgs.push_back(&context.getForce().getDevicePointer());
+        kernelArgs.push_back(&context.getEnergyBuffer().getDevicePointer());
+        kernelArgs.push_back(&context.getPosq().getDevicePointer());
+        kernelArgs.push_back(NULL);
+        kernelArgs.push_back(context.getPeriodicBoxSizePointer());
+        kernelArgs.push_back(context.getInvPeriodicBoxSizePointer());
+        kernelArgs.push_back(context.getPeriodicBoxVecXPointer());
+        kernelArgs.push_back(context.getPeriodicBoxVecYPointer());
+        kernelArgs.push_back(context.getPeriodicBoxVecZPointer());
+        for (int i = 0; i < (int) atomIndices.size(); i++)
+            for (int j = 0; j < (int) atomIndices[i].size(); j++)
+                kernelArgs.push_back(&atomIndices[i][j].getDevicePointer());
+        for (int i = 0; i < (int) arguments.size(); i++)
+            kernelArgs.push_back(&arguments[i]);
+        if (energyParameterDerivatives.size() > 0)
+            kernelArgs.push_back(&context.getEnergyParamDerivBuffer().getDevicePointer());
+    }
+    if (!hasInteractions)
+        return;
+    kernelArgs[3] = &groups;
+    context.executeKernel(kernel, &kernelArgs[0], maxBonds);
+}
--- a/platforms/hip/src/HipContext.cpp
+++ b/platforms/hip/src/HipContext.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009-2019 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#ifdef WIN32
+  #error "Windows unsupported for HIP platform"
+#endif
+#include <cmath>
+#include "HipContext.h"
+#include "HipArray.h"
+#include "HipBondedUtilities.h"
+#include "HipEvent.h"
+#include "HipIntegrationUtilities.h"
+#include "HipKernels.h"
+#include "HipKernelSources.h"
+#include "HipNonbondedUtilities.h"
+#include "HipProgram.h"
+#include "openmm/common/ComputeArray.h"
+#include "SHA1.h"
+#include "openmm/Platform.h"
+#include "openmm/System.h"
+#include "openmm/VirtualSite.h"
+#include "HipExpressionUtilities.h"
+#include "openmm/internal/ContextImpl.h"
+#include <algorithm>
+#include <cstdlib>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <set>
+#include <sstream>
+#include <typeinfo>
+#include <sys/stat.h>
+#include <unistd.h>
+#define CHECK_RESULT(result) CHECK_RESULT2(result, errorMessage);
+#define CHECK_RESULT2(result, prefix) \
+    if (result != hipSuccess) { \
+        std::stringstream m; \
+        m<<prefix<<": "<<getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
+        throw OpenMMException(m.str());\
+    }
+using namespace OpenMM;
+using namespace std;
+const int HipContext::ThreadBlockSize = 64;
+const int HipContext::TileSize = sizeof(tileflags)*8;
+static_assert(HipContext::ThreadBlockSize == HipContext::TileSize);
+bool HipContext::hasInitializedHip = false;
+HipContext::HipContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler,
+        const string& tempDir, const std::string& hostCompiler, HipPlatform::PlatformData& platformData, HipContext* originalContext) : ComputeContext(system), currentStream(0),
+        platformData(platformData), contextIsValid(false), hasAssignedPosqCharges(false),
+        hasCompilerKernel(false), isHipccAvailable(false), pinnedBuffer(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL) {
+    // Determine what compiler to use.
+    this->compiler = "\""+compiler+"\"";
+    if (platformData.context != NULL) {
+        try {
+            compilerKernel = platformData.context->getPlatform().createKernel(HipCompilerKernel::Name(), *platformData.context);
+            hasCompilerKernel = true;
+        }
+        catch (...) {
+            // The runtime compiler plugin isn't available.
+        }
+    }
+    string testCompilerCommand = this->compiler+" --version > /dev/null 2> /dev/null";
+    int res = std::system(testCompilerCommand.c_str());
+    struct stat info;
+    isHipccAvailable = (res == 0 && stat(tempDir.c_str(), &info) == 0);
+    if (!hasInitializedHip) {
+        CHECK_RESULT2(hipInit(0), "Error initializing HIP");
+        hasInitializedHip = true;
+    }
+    if (precision == "single") {
+        useDoublePrecision = false;
+        useMixedPrecision = false;
+    }
+    else if (precision == "mixed") {
+        useDoublePrecision = false;
+        useMixedPrecision = true;
+    }
+    else if (precision == "double") {
+        useDoublePrecision = true;
+        useMixedPrecision = false;
+    }
+    else
+        throw OpenMMException("Illegal value for Precision: "+precision);
+    char* cacheVariable = getenv("OPENMM_CACHE_DIR");
+    cacheDir = (cacheVariable == NULL ? tempDir : string(cacheVariable));
+    this->tempDir = tempDir+"/";
+    cacheDir = cacheDir+"/";
+    contextIndex = platformData.contexts.size();
+    string errorMessage = "Error initializing Context";
+    if (originalContext == NULL) {
+        isLinkedContext = false;
+        int numDevices;
+        CHECK_RESULT(hipGetDeviceCount(&numDevices));
+        if (deviceIndex < -1 || deviceIndex >= numDevices)
+            throw OpenMMException("Illegal value for DeviceIndex: "+intToString(deviceIndex));
+        vector<int> devicePrecedence;
+        if (deviceIndex == -1) {
+            devicePrecedence = getDevicePrecedence();
+        } else {
+            devicePrecedence.push_back(deviceIndex);
+        }
+        this->deviceIndex = -1;
+        for (int i = 0; i < static_cast<int>(devicePrecedence.size()); i++) {
+            int trialDeviceIndex = devicePrecedence[i];
+            CHECK_RESULT(hipDeviceGet(&device, trialDeviceIndex));
+            defaultOptimizationOptions = "-ffast-math -Wall";
+            // try setting device
+            if (hipSetDevice(device) == hipSuccess) {
+                // and set flags
+                unsigned int flags = hipDeviceMapHost;
+                if (useBlockingSync)
+                    flags += hipDeviceScheduleBlockingSync;
+                else
+                    flags += hipDeviceScheduleSpin;
+                if (hipSetDeviceFlags(flags) == hipSuccess) {
+                    this->deviceIndex = trialDeviceIndex;
+                    break;
+                }
+            }
+        }
+        if (this->deviceIndex == -1) {
+            if (deviceIndex != -1)
+                throw OpenMMException("The requested HIP device could not be loaded");
+            else
+                throw OpenMMException("No compatible HIP device is available");
+        }
+    }
+    else {
+        isLinkedContext = true;
+        this->deviceIndex = originalContext->deviceIndex;
+        this->device = originalContext->device;
+    }
+    hipDeviceProp_t props;
+    CHECK_RESULT(hipGetDeviceProperties(&props, device));
+    // set device properties
+    this->simdWidth = props.warpSize;
+    this->sharedMemPerBlock = props.sharedMemPerBlock;
+    this->hasGlobalInt64Atomics = props.arch.hasGlobalInt64Atomics;
+    this->hasDoubles = props.arch.hasDoubles;
+    // HIP-TODO: remove hasWarpShuffle==false paths completely?
+    this->hasWarpShuffle = true;
+    gpuArchitecture = props.gcnArchName;
+    // HIP-TODO: find a good value here
+    int numThreadBlocksPerComputeUnit = 6;
+    contextIsValid = true;
+    if (contextIndex > 0) {
+        int canAccess;
+        CHECK_RESULT(hipDeviceCanAccessPeer(&canAccess, getDevice(), platformData.contexts[0]->getDevice()));
+        if (canAccess) {
+            platformData.contexts[0]->setAsCurrent();
+            CHECK_RESULT(hipDeviceEnablePeerAccess(getDevice(), 0));
+            setAsCurrent();
+            CHECK_RESULT(hipDeviceEnablePeerAccess(platformData.contexts[0]->getDevice(), 0));
+        }
+    }
+    numAtoms = system.getNumParticles();
+    paddedNumAtoms = TileSize*((numAtoms+TileSize-1)/TileSize);
+    numAtomBlocks = (paddedNumAtoms+(TileSize-1))/TileSize;
+    int multiprocessors;
+    CHECK_RESULT(hipDeviceGetAttribute(&multiprocessors, hipDeviceAttributeMultiprocessorCount, device));
+    numThreadBlocks = numThreadBlocksPerComputeUnit*multiprocessors;
+    // GCN hardware is more like CUDA-8, w/ no independent forward progress
+    // or *_sync primatives
+    compilationDefines["SYNC_WARPS"] = "";
+    compilationDefines["SHFL(var, srcLane)"] = "__shfl(var, srcLane);";
+    // Important: the predicate for ballot is defined as an integer, hence
+    // it is important that we convert the variable field to a true/false value
+    // before running ballot, such that we do not discard the top half when
+    // running on a long int
+    compilationDefines["BALLOT(var)"] = "__ballot((var) != 0);";
+    if (useDoublePrecision) {
+        posq.initialize<double4>(*this, paddedNumAtoms, "posq");
+        velm.initialize<double4>(*this, paddedNumAtoms, "velm");
+        compilationDefines["USE_DOUBLE_PRECISION"] = "1";
+        compilationDefines["make_real2"] = "make_double2";
+        compilationDefines["make_real3"] = "make_double3";
+        compilationDefines["make_real4"] = "make_double4";
+        compilationDefines["make_mixed2"] = "make_double2";
+        compilationDefines["make_mixed3"] = "make_double3";
+        compilationDefines["make_mixed4"] = "make_double4";
+    }
+    else if (useMixedPrecision) {
+        posq.initialize<float4>(*this, paddedNumAtoms, "posq");
+        posqCorrection.initialize<float4>(*this, paddedNumAtoms, "posqCorrection");
+        velm.initialize<double4>(*this, paddedNumAtoms, "velm");
+        compilationDefines["USE_MIXED_PRECISION"] = "1";
+        compilationDefines["make_real2"] = "make_float2";
+        compilationDefines["make_real3"] = "make_float3";
+        compilationDefines["make_real4"] = "make_float4";
+        compilationDefines["make_mixed2"] = "make_double2";
+        compilationDefines["make_mixed3"] = "make_double3";
+        compilationDefines["make_mixed4"] = "make_double4";
+    }
+    else {
+        posq.initialize<float4>(*this, paddedNumAtoms, "posq");
+        velm.initialize<float4>(*this, paddedNumAtoms, "velm");
+        compilationDefines["make_real2"] = "make_float2";
+        compilationDefines["make_real3"] = "make_float3";
+        compilationDefines["make_real4"] = "make_float4";
+        compilationDefines["make_mixed2"] = "make_float2";
+        compilationDefines["make_mixed3"] = "make_float3";
+        compilationDefines["make_mixed4"] = "make_float4";
+    }
+    force.initialize<long long>(*this, paddedNumAtoms*3, "force");
+    posCellOffsets.resize(paddedNumAtoms, mm_int4(0, 0, 0, 0));
+    atomIndexDevice.initialize<int>(*this, paddedNumAtoms, "atomIndex");
+    atomIndex.resize(paddedNumAtoms);
+    for (int i = 0; i < paddedNumAtoms; ++i)
+        atomIndex[i] = i;
+    atomIndexDevice.upload(atomIndex);
+    // Create utility kernels that are used in multiple places.
+    hipModule_t utilities = createModule(HipKernelSources::vectorOps+HipKernelSources::utilities);
+    clearBufferKernel = getKernel(utilities, "clearBuffer");
+    clearTwoBuffersKernel = getKernel(utilities, "clearTwoBuffers");
+    clearThreeBuffersKernel = getKernel(utilities, "clearThreeBuffers");
+    clearFourBuffersKernel = getKernel(utilities, "clearFourBuffers");
+    clearFiveBuffersKernel = getKernel(utilities, "clearFiveBuffers");
+    clearSixBuffersKernel = getKernel(utilities, "clearSixBuffers");
+    reduceEnergyKernel = getKernel(utilities, "reduceEnergy");
+    setChargesKernel = getKernel(utilities, "setCharges");
+    // Set defines based on the requested precision.
+    compilationDefines["SQRT"] = useDoublePrecision ? "sqrt" : "sqrtf";
+    compilationDefines["RSQRT"] = useDoublePrecision ? "rsqrt" : "rsqrtf";
+    compilationDefines["RECIP"] = useDoublePrecision ? "1.0/" : "1.0f/";
+    compilationDefines["EXP"] = useDoublePrecision ? "exp" : "expf";
+    compilationDefines["LOG"] = useDoublePrecision ? "log" : "logf";
+    compilationDefines["POW"] = useDoublePrecision ? "pow" : "powf";
+    compilationDefines["COS"] = useDoublePrecision ? "cos" : "cosf";
+    compilationDefines["SIN"] = useDoublePrecision ? "sin" : "sinf";
+    compilationDefines["TAN"] = useDoublePrecision ? "tan" : "tanf";
+    compilationDefines["ACOS"] = useDoublePrecision ? "acos" : "acosf";
+    compilationDefines["ASIN"] = useDoublePrecision ? "asin" : "asinf";
+    compilationDefines["ATAN"] = useDoublePrecision ? "atan" : "atanf";
+    compilationDefines["ERF"] = useDoublePrecision ? "erf" : "erff";
+    compilationDefines["ERFC"] = useDoublePrecision ? "erfc" : "erfcf";
+    // Set defines for applying periodic boundary conditions.
+    Vec3 boxVectors[3];
+    system.getDefaultPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
+    boxIsTriclinic = (boxVectors[0][1] != 0.0 || boxVectors[0][2] != 0.0 ||
+                      boxVectors[1][0] != 0.0 || boxVectors[1][2] != 0.0 ||
+                      boxVectors[2][0] != 0.0 || boxVectors[2][1] != 0.0);
+    if (boxIsTriclinic) {
+        compilationDefines["APPLY_PERIODIC_TO_DELTA(delta)"] =
+            "{"
+            "real scale3 = floor(delta.z*invPeriodicBoxSize.z+0.5f); \\\n"
+            "delta.x -= scale3*periodicBoxVecZ.x; \\\n"
+            "delta.y -= scale3*periodicBoxVecZ.y; \\\n"
+            "delta.z -= scale3*periodicBoxVecZ.z; \\\n"
+            "real scale2 = floor(delta.y*invPeriodicBoxSize.y+0.5f); \\\n"
+            "delta.x -= scale2*periodicBoxVecY.x; \\\n"
+            "delta.y -= scale2*periodicBoxVecY.y; \\\n"
+            "real scale1 = floor(delta.x*invPeriodicBoxSize.x+0.5f); \\\n"
+            "delta.x -= scale1*periodicBoxVecX.x;}";
+        compilationDefines["APPLY_PERIODIC_TO_POS(pos)"] =
+            "{"
+            "real scale3 = floor(pos.z*invPeriodicBoxSize.z); \\\n"
+            "pos.x -= scale3*periodicBoxVecZ.x; \\\n"
+            "pos.y -= scale3*periodicBoxVecZ.y; \\\n"
+            "pos.z -= scale3*periodicBoxVecZ.z; \\\n"
+            "real scale2 = floor(pos.y*invPeriodicBoxSize.y); \\\n"
+            "pos.x -= scale2*periodicBoxVecY.x; \\\n"
+            "pos.y -= scale2*periodicBoxVecY.y; \\\n"
+            "real scale1 = floor(pos.x*invPeriodicBoxSize.x); \\\n"
+            "pos.x -= scale1*periodicBoxVecX.x;}";
+        compilationDefines["APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)"] =
+            "{"
+            "real scale3 = floor((pos.z-center.z)*invPeriodicBoxSize.z+0.5f); \\\n"
+            "pos.x -= scale3*periodicBoxVecZ.x; \\\n"
+            "pos.y -= scale3*periodicBoxVecZ.y; \\\n"
+            "pos.z -= scale3*periodicBoxVecZ.z; \\\n"
+            "real scale2 = floor((pos.y-center.y)*invPeriodicBoxSize.y+0.5f); \\\n"
+            "pos.x -= scale2*periodicBoxVecY.x; \\\n"
+            "pos.y -= scale2*periodicBoxVecY.y; \\\n"
+            "real scale1 = floor((pos.x-center.x)*invPeriodicBoxSize.x+0.5f); \\\n"
+            "pos.x -= scale1*periodicBoxVecX.x;}";
+    }
+    else {
+        compilationDefines["APPLY_PERIODIC_TO_DELTA(delta)"] =
+            "{"
+            "delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; \\\n"
+            "delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; \\\n"
+            "delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;}";
+        compilationDefines["APPLY_PERIODIC_TO_POS(pos)"] =
+            "{"
+            "pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x; \\\n"
+            "pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y; \\\n"
+            "pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;}";
+        compilationDefines["APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)"] =
+            "{"
+            "pos.x -= floor((pos.x-center.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; \\\n"
+            "pos.y -= floor((pos.y-center.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; \\\n"
+            "pos.z -= floor((pos.z-center.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;}";
+    }
+    // Create utilities objects.
+    bonded = new HipBondedUtilities(*this);
+    nonbonded = new HipNonbondedUtilities(*this);
+    integration = new HipIntegrationUtilities(*this, system);
+    expression = new HipExpressionUtilities(*this);
+}
+HipContext::~HipContext() {
+    setAsCurrent();
+    for (auto force : forces)
+        delete force;
+    for (auto listener : reorderListeners)
+        delete listener;
+    for (auto computation : preComputations)
+        delete computation;
+    for (auto computation : postComputations)
+        delete computation;
+    if (pinnedBuffer != NULL)
+        hipHostFree(pinnedBuffer);
+    if (integration != NULL)
+        delete integration;
+    if (expression != NULL)
+        delete expression;
+    if (bonded != NULL)
+        delete bonded;
+    if (nonbonded != NULL)
+        delete nonbonded;
+    contextIsValid = false;
+}
+void HipContext::initialize() {
+    hipSetDevice(device);
+    string errorMessage = "Error initializing Context";
+    int numEnergyBuffers = max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers());
+    if (useDoublePrecision) {
+        energyBuffer.initialize<double>(*this, numEnergyBuffers, "energyBuffer");
+        energySum.initialize<double>(*this, 1, "energySum");
+        int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
+        CHECK_RESULT(hipHostMalloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
+    }
+    else if (useMixedPrecision) {
+        energyBuffer.initialize<double>(*this, numEnergyBuffers, "energyBuffer");
+        energySum.initialize<double>(*this, 1, "energySum");
+        int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
+        CHECK_RESULT(hipHostMalloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
+    }
+    else {
+        energyBuffer.initialize<float>(*this, numEnergyBuffers, "energyBuffer");
+        energySum.initialize<float>(*this, 1, "energySum");
+        int pinnedBufferSize = max(paddedNumAtoms*6, numEnergyBuffers);
+        CHECK_RESULT(hipHostMalloc(&pinnedBuffer, pinnedBufferSize*sizeof(float), 0));
+    }
+    for (int i = 0; i < numAtoms; i++) {
+        double mass = system.getParticleMass(i);
+        if (useDoublePrecision || useMixedPrecision)
+            ((double4*) pinnedBuffer)[i] = make_double4(0.0, 0.0, 0.0, mass == 0.0 ? 0.0 : 1.0/mass);
+        else
+            ((float4*) pinnedBuffer)[i] = make_float4(0.0f, 0.0f, 0.0f, mass == 0.0 ? 0.0f : (float) (1.0/mass));
+    }
+    velm.upload(pinnedBuffer);
+    bonded->initialize(system);
+    addAutoclearBuffer(force.getDevicePointer(), force.getSize()*force.getElementSize());
+    addAutoclearBuffer(energyBuffer.getDevicePointer(), energyBuffer.getSize()*energyBuffer.getElementSize());
+    int numEnergyParamDerivs = energyParamDerivNames.size();
+    if (numEnergyParamDerivs > 0) {
+        if (useDoublePrecision || useMixedPrecision)
+            energyParamDerivBuffer.initialize<double>(*this, numEnergyParamDerivs*numEnergyBuffers, "energyParamDerivBuffer");
+        else
+            energyParamDerivBuffer.initialize<float>(*this, numEnergyParamDerivs*numEnergyBuffers, "energyParamDerivBuffer");
+        addAutoclearBuffer(energyParamDerivBuffer);
+    }
+    findMoleculeGroups();
+    nonbonded->initialize(system);
+}
+void HipContext::initializeContexts() {
+    getPlatformData().initializeContexts(system);
+}
+void HipContext::setAsCurrent() {
+    if (contextIsValid)
+        hipSetDevice(device);
+}
+hipModule_t HipContext::createModule(const string source, const char* optimizationFlags) {
+    return createModule(source, map<string, string>(), optimizationFlags);
+}
+hipModule_t HipContext::createModule(const string source, const map<string, string>& defines, const char* optimizationFlags) {
+    static_assert(8*sizeof(void*) == HipContext::TileSize);
+    string bits = intToString(8*sizeof(void*));
+    string options = (optimizationFlags == NULL ? defaultOptimizationOptions : string(optimizationFlags));
+    stringstream src;
+    if (!options.empty())
+        src << "// Compilation Options: " << options << endl << endl;
+    for (auto& pair : compilationDefines) {
+        // Query defines to avoid duplicate variables
+        if (defines.find(pair.first) == defines.end()) {
+            src << "#define " << pair.first;
+            if (!pair.second.empty())
+                src << " " << pair.second;
+            src << endl;
+        }
+    }
+    if (!compilationDefines.empty())
+        src << endl;
+    // include the main header for built-in variables (threadIdx etc.) and functions
+    src << "#include \"hip/hip_runtime.h\"\n";
+    // include the vector types
+    src << "#include \"hip/hip_vector_types.h\"\n";
+    if (useDoublePrecision) {
+        src << "typedef double real;\n";
+        src << "typedef double2 real2;\n";
+        src << "typedef double3 real3;\n";
+        src << "typedef double4 real4;\n";
+    }
+    else {
+        src << "typedef float real;\n";
+        src << "typedef float2 real2;\n";
+        src << "typedef float3 real3;\n";
+        src << "typedef float4 real4;\n";
+    }
+    if (useDoublePrecision || useMixedPrecision) {
+        src << "typedef double mixed;\n";
+        src << "typedef double2 mixed2;\n";
+        src << "typedef double3 mixed3;\n";
+        src << "typedef double4 mixed4;\n";
+    }
+    else {
+        src << "typedef float mixed;\n";
+        src << "typedef float2 mixed2;\n";
+        src << "typedef float3 mixed3;\n";
+        src << "typedef float4 mixed4;\n";
+    }
+    src << "typedef unsigned long tileflags;\n";
+    src << "static_assert(sizeof(tileflags)*8==" << HipContext::TileSize << ",\"tileflags size does not match TILE_SIZE\");\n";
+    src << HipKernelSources::common << endl;
+    for (auto& pair : defines) {
+        src << "#define " << pair.first;
+        if (!pair.second.empty())
+            src << " " << pair.second;
+        src << endl;
+    }
+    if (!defines.empty())
+        src << endl;
+    src << source << endl;
+    // See whether we already have PTX for this kernel cached.
+    CSHA1 sha1;
+    sha1.Update((const UINT_8*) src.str().c_str(), src.str().size());
+    sha1.Final();
+    UINT_8 hash[20];
+    sha1.GetHash(hash);
+    stringstream cacheFile;
+    cacheFile << cacheDir;
+    cacheFile.flags(ios::hex);
+    for (int i = 0; i < 20; i++)
+        cacheFile << setw(2) << setfill('0') << (int) hash[i];
+    cacheFile << '_' << gpuArchitecture << '_' << bits;
+    hipModule_t module;
+    if (hipModuleLoad(&module, cacheFile.str().c_str()) == hipSuccess)
+        return module;
+    // Select names for the various temporary files.
+    stringstream tempFileName;
+    tempFileName << "openmmTempKernel" << this; // Include a pointer to this context as part of the filename to avoid collisions.
+    tempFileName << "_" << getpid();
+    string inputFile = (tempDir+tempFileName.str()+".hip.cpp");
+    string outputFile = (tempDir+tempFileName.str()+".hsaco");
+    string logFile = (tempDir+tempFileName.str()+".log");
+    int res = 0;
+    // If the runtime compiler plugin is available, use it.
+    if (hasCompilerKernel) {
+        string ptx = compilerKernel.getAs<HipCompilerKernel>().createModule(src.str(), options, *this);
+        // If possible, write the PTX out to a temporary file so we can cache it for later use.
+        bool wroteCache = false;
+        try {
+            ofstream out(outputFile.c_str());
+            out << ptx;
+            out.close();
+            if (!out.fail())
+                wroteCache = true;
+        }
+        catch (...) {
+            // Ignore.
+        }
+        if (!wroteCache) {
+            // An error occurred.  Possibly we don't have permission to write to the temp directory.  Just try to load the module directly.
+            CHECK_RESULT2(hipModuleLoadDataEx(&module, &ptx[0], 0, NULL, NULL), "Error loading HIP module");
+            return module;
+        }
+    }
+    else {
+        // Write out the source to a temporary file.
+        ofstream out(inputFile.c_str());
+        out << src.str();
+        out.close();
+        string command = compiler + " --genco --amdgpu-target=" + gpuArchitecture + " " + options + " -o \""+outputFile+"\" " + " \""+inputFile+"\" 2> \""+logFile+"\"";
+        res = std::system(command.c_str());
+    }
+    try {
+        if (res != 0) {
+            // Load the error log.
+            stringstream error;
+            error << "Error launching HIP compiler: " << res;
+            ifstream log(logFile.c_str());
+            if (log.is_open()) {
+                string line;
+                while (!log.eof()) {
+                    getline(log, line);
+                    error << '\n' << line;
+                }
+                log.close();
+            }
+            throw OpenMMException(error.str());
+        }
+        hipError_t result = hipModuleLoad(&module, outputFile.c_str());
+        if (result != hipSuccess) {
+            std::stringstream m;
+            m<<"Error loading HIP module: "<<getErrorString(result)<<" ("<<result<<")";
+            throw OpenMMException(m.str());
+        }
+        const char* save = getenv("OPENMM_SAVE_TEMPS");
+        bool savetemps = save != nullptr;
+        if (!savetemps) {
+            remove(inputFile.c_str());
+            remove(logFile.c_str());
+        }
+        if (rename(outputFile.c_str(), cacheFile.str().c_str()) != 0 && !savetemps)
+            remove(outputFile.c_str());
+        return module;
+    }
+    catch (...) {
+        const char* save = getenv("OPENMM_SAVE_TEMPS");
+        bool savetemps = save != nullptr;
+        if (!savetemps) {
+            remove(inputFile.c_str());
+            remove(outputFile.c_str());
+            remove(logFile.c_str());
+        }
+        throw;
+    }
+}
+hipFunction_t HipContext::getKernel(hipModule_t& module, const string& name) {
+    hipFunction_t function;
+    hipError_t result = hipModuleGetFunction(&function, module, name.c_str());
+    if (result != hipSuccess) {
+        std::stringstream m;
+        m<<"Error creating kernel "<<name<<": "<<getErrorString(result)<<" ("<<result<<")";
+        throw OpenMMException(m.str());
+    }
+    return function;
+}
+hipStream_t HipContext::getCurrentStream() {
+    return currentStream;
+}
+void HipContext::setCurrentStream(hipStream_t stream) {
+    currentStream = stream;
+}
+void HipContext::restoreDefaultStream() {
+    setCurrentStream(0);
+}
+HipArray* HipContext::createArray() {
+    return new HipArray();
+}
+ComputeEvent HipContext::createEvent() {
+    return shared_ptr<ComputeEventImpl>(new HipEvent(*this));
+}
+ComputeProgram HipContext::compileProgram(const std::string source, const std::map<std::string, std::string>& defines) {
+    hipModule_t module = createModule(HipKernelSources::vectorOps+source, defines);
+    return shared_ptr<ComputeProgramImpl>(new HipProgram(*this, module));
+}
+HipArray& HipContext::unwrap(ArrayInterface& array) const {
+    HipArray* cuarray;
+    ComputeArray* wrapper = dynamic_cast<ComputeArray*>(&array);
+    if (wrapper != NULL)
+        cuarray = dynamic_cast<HipArray*>(&wrapper->getArray());
+    else
+        cuarray = dynamic_cast<HipArray*>(&array);
+    if (cuarray == NULL)
+        throw OpenMMException("Array argument is not an HipArray");
+    return *cuarray;
+}
+std::string HipContext::getErrorString(hipError_t result) {
+    return string(hipGetErrorName(result));
+}
+void HipContext::executeKernel(hipFunction_t kernel, void** arguments, int threads, int blockSize, unsigned int sharedSize) {
+    if (blockSize == -1)
+        blockSize = ThreadBlockSize;
+    int gridSize = std::min((threads+blockSize-1)/blockSize, numThreadBlocks);
+    hipError_t result = hipModuleLaunchKernel(kernel, gridSize, 1, 1, blockSize, 1, 1, sharedSize, currentStream, arguments, NULL);
+    if (result != hipSuccess) {
+        stringstream str;
+        str<<"Error invoking kernel: "<<getErrorString(result)<<" ("<<result<<")";
+        throw OpenMMException(str.str());
+    }
+}
+int HipContext::computeThreadBlockSize(double memory) const {
+    int maxShared = this->sharedMemPerBlock;
+    int max = (int) (maxShared/memory);
+    if (max < HipContext::ThreadBlockSize) {
+        throw OpenMMException("Too much shared memory requested!");
+    }
+    int threads = this->simdWidth;
+    while (threads+this->simdWidth < max)
+        threads += this->simdWidth;
+    return threads;
+}
+void HipContext::clearBuffer(ArrayInterface& array) {
+    clearBuffer(unwrap(array).getDevicePointer(), array.getSize()*array.getElementSize());
+}
+void HipContext::clearBuffer(hipDeviceptr_t memory, int size) {
+    int words = size/4;
+    void* args[] = {&memory, &words};
+    executeKernel(clearBufferKernel, args, words, 4 * this->simdWidth);
+}
+void HipContext::addAutoclearBuffer(ArrayInterface& array) {
+    addAutoclearBuffer(unwrap(array).getDevicePointer(), array.getSize()*array.getElementSize());
+}
+void HipContext::addAutoclearBuffer(hipDeviceptr_t memory, int size) {
+    autoclearBuffers.push_back(memory);
+    autoclearBufferSizes.push_back(size/4);
+}
+void HipContext::clearAutoclearBuffers() {
+    int preferredTBSize = this->simdWidth * 4;
+    int base = 0;
+    int total = autoclearBufferSizes.size();
+    while (total-base >= 6) {
+        void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base],
+                        &autoclearBuffers[base+1], &autoclearBufferSizes[base+1],
+                        &autoclearBuffers[base+2], &autoclearBufferSizes[base+2],
+                        &autoclearBuffers[base+3], &autoclearBufferSizes[base+3],
+                        &autoclearBuffers[base+4], &autoclearBufferSizes[base+4],
+                        &autoclearBuffers[base+5], &autoclearBufferSizes[base+5]};
+        executeKernel(clearSixBuffersKernel, args, max(max(max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), autoclearBufferSizes[base+4]), autoclearBufferSizes[base+5]), preferredTBSize);
+        base += 6;
+    }
+    if (total-base == 5) {
+        void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base],
+                        &autoclearBuffers[base+1], &autoclearBufferSizes[base+1],
+                        &autoclearBuffers[base+2], &autoclearBufferSizes[base+2],
+                        &autoclearBuffers[base+3], &autoclearBufferSizes[base+3],
+                        &autoclearBuffers[base+4], &autoclearBufferSizes[base+4]};
+        executeKernel(clearFiveBuffersKernel, args, max(max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), autoclearBufferSizes[base+4]), preferredTBSize);
+    }
+    else if (total-base == 4) {
+        void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base],
+                        &autoclearBuffers[base+1], &autoclearBufferSizes[base+1],
+                        &autoclearBuffers[base+2], &autoclearBufferSizes[base+2],
+                        &autoclearBuffers[base+3], &autoclearBufferSizes[base+3]};
+        executeKernel(clearFourBuffersKernel, args, max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), preferredTBSize);
+    }
+    else if (total-base == 3) {
+        void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base],
+                        &autoclearBuffers[base+1], &autoclearBufferSizes[base+1],
+                        &autoclearBuffers[base+2], &autoclearBufferSizes[base+2]};
+        executeKernel(clearThreeBuffersKernel, args, max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), preferredTBSize);
+    }
+    else if (total-base == 2) {
+        void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base],
+                        &autoclearBuffers[base+1], &autoclearBufferSizes[base+1]};
+        executeKernel(clearTwoBuffersKernel, args, max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), preferredTBSize);
+    }
+    else if (total-base == 1) {
+        clearBuffer(autoclearBuffers[base], autoclearBufferSizes[base]*4);
+    }
+}
+double HipContext::reduceEnergy() {
+    int bufferSize = energyBuffer.getSize();
+    int workGroupSize  = 512;
+    void* args[] = {&energyBuffer.getDevicePointer(), &energySum.getDevicePointer(), &bufferSize, &workGroupSize};
+    executeKernel(reduceEnergyKernel, args, workGroupSize, workGroupSize, workGroupSize*energyBuffer.getElementSize());
+    energySum.download(pinnedBuffer);
+    if (getUseDoublePrecision() || getUseMixedPrecision())
+        return *((double*) pinnedBuffer);
+    else
+        return *((float*) pinnedBuffer);
+}
+void HipContext::setCharges(const vector<double>& charges) {
+    if (!chargeBuffer.isInitialized())
+        chargeBuffer.initialize(*this, numAtoms, useDoublePrecision ? sizeof(double) : sizeof(float), "chargeBuffer");
+    vector<double> c(numAtoms);
+    for (int i = 0; i < numAtoms; i++)
+        c[i] = charges[i];
+    chargeBuffer.upload(c, true);
+    void* args[] = {&chargeBuffer.getDevicePointer(), &posq.getDevicePointer(), &atomIndexDevice.getDevicePointer(), &numAtoms};
+    executeKernel(setChargesKernel, args, numAtoms);
+}
+bool HipContext::requestPosqCharges() {
+    bool allow = !hasAssignedPosqCharges;
+    hasAssignedPosqCharges = true;
+    return allow;
+}
+void HipContext::addEnergyParameterDerivative(const string& param) {
+    // See if this parameter has already been registered.
+    for (int i = 0; i < energyParamDerivNames.size(); i++)
+        if (param == energyParamDerivNames[i])
+            return;
+    energyParamDerivNames.push_back(param);
+}
+void HipContext::flushQueue() {
+    hipStreamSynchronize(getCurrentStream());
+}
+vector<int> HipContext::getDevicePrecedence() {
+    int numDevices;
+    hipDeviceProp_t thisDevice;
+    string errorMessage = "Error initializing Context";
+    vector<pair<int, int> > devices;
+    CHECK_RESULT(hipGetDeviceCount(&numDevices));
+    for (int i = 0; i < numDevices; i++) {
+        CHECK_RESULT(hipGetDeviceProperties(&thisDevice, i));
+        int clock, multiprocessors, speed;
+        clock = thisDevice.clockRate;
+        multiprocessors = thisDevice.multiProcessorCount;
+        speed = clock*multiprocessors;
+        devices.push_back(std::make_pair(speed, -i));
+    }
+    // sort first by speed (higher is better), and finally device index (lower is better)
+    std::sort(devices.begin(), devices.end());
+    std::reverse(devices.begin(), devices.end());
+    vector<int> precedence;
+    for (int i = 0; i < static_cast<int>(devices.size()); i++) {
+        precedence.push_back(-devices[i].second);
+    }
+    return precedence;
+}
--- a/platforms/hip/src/HipEvent.cpp
+++ b/platforms/hip/src/HipEvent.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2019 Stanford University and the Authors.           *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "HipEvent.h"
+#include "openmm/OpenMMException.h"
+using namespace OpenMM;
+HipEvent::HipEvent(HipContext& context) : context(context), eventCreated(false) {
+    hipError_t result = hipEventCreateWithFlags(&event, hipEventDisableTiming);
+    if (result != hipSuccess)
+        throw OpenMMException("Error creating HIP event:"+HipContext::getErrorString(result));
+    eventCreated = true;
+}
+HipEvent::~HipEvent() {
+    if (eventCreated)
+        hipEventDestroy(event);
+}
+void HipEvent::enqueue() {
+    hipEventRecord(event, 0);
+}
+void HipEvent::wait() {
+    hipEventSynchronize(event);
+}
--- a/platforms/hip/src/HipFFT3D.cpp
+++ b/platforms/hip/src/HipFFT3D.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009-2015 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "HipFFT3D.h"
+#include "HipContext.h"
+#include "HipKernelSources.h"
+#include "SimTKOpenMMRealType.h"
+#include <map>
+#include <sstream>
+#include <string>
+using namespace OpenMM;
+using namespace std;
+HipFFT3D::HipFFT3D(HipContext& context, int xsize, int ysize, int zsize, bool realToComplex) :
+        context(context), xsize(xsize), ysize(ysize), zsize(zsize) {
+    packRealAsComplex = false;
+    int packedXSize = xsize;
+    int packedYSize = ysize;
+    int packedZSize = zsize;
+    if (realToComplex) {
+        // If any axis size is even, we can pack the real values into a complex grid that is only half as large.
+        // Look for an appropriate axis.
+        packRealAsComplex = true;
+        int packedAxis, bufferSize;
+        if (xsize%2 == 0) {
+            packedAxis = 0;
+            packedXSize /= 2;
+            bufferSize = packedXSize;
+        }
+        else if (ysize%2 == 0) {
+            packedAxis = 1;
+            packedYSize /= 2;
+            bufferSize = packedYSize;
+        }
+        else if (zsize%2 == 0) {
+            packedAxis = 2;
+            packedZSize /= 2;
+            bufferSize = packedZSize;
+        }
+        else
+            packRealAsComplex = false;
+        if (packRealAsComplex) {
+            // Build the kernels for packing and unpacking the data.
+            map<string, string> defines;
+            defines["XSIZE"] = context.intToString(xsize);
+            defines["YSIZE"] = context.intToString(ysize);
+            defines["ZSIZE"] = context.intToString(zsize);
+            defines["PACKED_AXIS"] = context.intToString(packedAxis);
+            defines["PACKED_XSIZE"] = context.intToString(packedXSize);
+            defines["PACKED_YSIZE"] = context.intToString(packedYSize);
+            defines["PACKED_ZSIZE"] = context.intToString(packedZSize);
+            defines["M_PI"] = context.doubleToString(M_PI);
+            hipModule_t module = context.createModule(HipKernelSources::vectorOps+HipKernelSources::fftR2C, defines);
+            packForwardKernel = context.getKernel(module, "packForwardData");
+            unpackForwardKernel = context.getKernel(module, "unpackForwardData");
+            packBackwardKernel = context.getKernel(module, "packBackwardData");
+            unpackBackwardKernel = context.getKernel(module, "unpackBackwardData");
+        }
+    }
+    bool inputIsReal = (realToComplex && !packRealAsComplex);
+    zkernel = createKernel(packedXSize, packedYSize, packedZSize, zthreads, 0, true, inputIsReal);
+    xkernel = createKernel(packedYSize, packedZSize, packedXSize, xthreads, 1, true, inputIsReal);
+    ykernel = createKernel(packedZSize, packedXSize, packedYSize, ythreads, 2, true, inputIsReal);
+    invzkernel = createKernel(packedXSize, packedYSize, packedZSize, zthreads, 0, false, inputIsReal);
+    invxkernel = createKernel(packedYSize, packedZSize, packedXSize, xthreads, 1, false, inputIsReal);
+    invykernel = createKernel(packedZSize, packedXSize, packedYSize, ythreads, 2, false, inputIsReal);
+}
+void HipFFT3D::execFFT(HipArray& in, HipArray& out, bool forward) {
+    hipFunction_t kernel1 = (forward ? zkernel : invzkernel);
+    hipFunction_t kernel2 = (forward ? xkernel : invxkernel);
+    hipFunction_t kernel3 = (forward ? ykernel : invykernel);
+    void* args1[] = {&in.getDevicePointer(), &out.getDevicePointer()};
+    void* args2[] = {&out.getDevicePointer(), &in.getDevicePointer()};
+    if (packRealAsComplex) {
+        hipFunction_t packKernel = (forward ? packForwardKernel : packBackwardKernel);
+        hipFunction_t unpackKernel = (forward ? unpackForwardKernel : unpackBackwardKernel);
+        int gridSize = xsize*ysize*zsize/2;
+        // Pack the data into a half sized grid.
+        context.executeKernel(packKernel, args1, gridSize, 128);
+        // Perform the FFT.
+        context.executeKernel(kernel1, args2, gridSize, zthreads);
+        context.executeKernel(kernel2, args1, gridSize, xthreads);
+        context.executeKernel(kernel3, args2, gridSize, ythreads);
+        // Unpack the data.
+        context.executeKernel(unpackKernel, args1, gridSize, 128);
+    }
+    else {
+        context.executeKernel(kernel1, args1, xsize*ysize*zsize, zthreads);
+        context.executeKernel(kernel2, args2, xsize*ysize*zsize, xthreads);
+        context.executeKernel(kernel3, args1, xsize*ysize*zsize, ythreads);
+    }
+}
+int HipFFT3D::findLegalDimension(int minimum) {
+    if (minimum < 1)
+        return 1;
+    while (true) {
+        // Attempt to factor the current value.
+        int unfactored = minimum;
+        for (int factor = 2; factor < 8; factor++) {
+            while (unfactored > 1 && unfactored%factor == 0)
+                unfactored /= factor;
+        }
+        if (unfactored == 1)
+            return minimum;
+        minimum++;
+    }
+}
+static int getSmallestRadix(int size) {
+    int minRadix = 1;
+    int unfactored = size;
+    while (unfactored%7 == 0) {
+        minRadix = 7;
+        unfactored /= 7;
+    }
+    while (unfactored%5 == 0) {
+        minRadix = 5;
+        unfactored /= 5;
+    }
+    while (unfactored%4 == 0) {
+        minRadix = 4;
+        unfactored /= 4;
+    }
+    while (unfactored%3 == 0) {
+        minRadix = 3;
+        unfactored /= 3;
+    }
+    while (unfactored%2 == 0) {
+        minRadix = 2;
+        unfactored /= 2;
+    }
+    return minRadix;
+}
+hipFunction_t HipFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads, int axis, bool forward, bool inputIsReal) {
+    int maxThreads = (context.getUseDoublePrecision() ? 128 : 256);
+//    while (maxThreads > 128 && maxThreads-64 >= zsize)
+//        maxThreads -= 64;
+    int threadsPerBlock = zsize/getSmallestRadix(zsize);
+    stringstream source;
+    int blocksPerGroup = max(1, maxThreads/threadsPerBlock);
+    int stage = 0;
+    int L = zsize;
+    int m = 1;
+    // Factor zsize, generating an appropriate block of code for each factor.
+    while (L > 1) {
+        int input = stage%2;
+        int output = 1-input;
+        int radix;
+        if (L%7 == 0)
+            radix = 7;
+        else if (L%5 == 0)
+            radix = 5;
+        else if (L%4 == 0)
+            radix = 4;
+        else if (L%3 == 0)
+            radix = 3;
+        else if (L%2 == 0)
+            radix = 2;
+        else
+            throw OpenMMException("Illegal size for FFT: "+context.intToString(zsize));
+        source<<"{\n";
+        L = L/radix;
+        source<<"// Pass "<<(stage+1)<<" (radix "<<radix<<")\n";
+        if (L*m < threadsPerBlock)
+            source<<"if (threadIdx.x < "<<(blocksPerGroup*L*m)<<") {\n";
+        else
+            source<<"{\n";
+        source<<"int block = threadIdx.x/"<<(L*m)<<";\n";
+        source<<"int i = threadIdx.x-block*"<<(L*m)<<";\n";
+        source<<"int base = i+block*"<<zsize<<";\n";
+        source<<"int j = i/"<<m<<";\n";
+        if (radix == 7) {
+            source<<"real2 c0 = data"<<input<<"[base];\n";
+            source<<"real2 c1 = data"<<input<<"[base+"<<(L*m)<<"];\n";
+            source<<"real2 c2 = data"<<input<<"[base+"<<(2*L*m)<<"];\n";
+            source<<"real2 c3 = data"<<input<<"[base+"<<(3*L*m)<<"];\n";
+            source<<"real2 c4 = data"<<input<<"[base+"<<(4*L*m)<<"];\n";
+            source<<"real2 c5 = data"<<input<<"[base+"<<(5*L*m)<<"];\n";
+            source<<"real2 c6 = data"<<input<<"[base+"<<(6*L*m)<<"];\n";
+            source<<"real2 d0 = c1+c6;\n";
+            source<<"real2 d1 = c1-c6;\n";
+            source<<"real2 d2 = c2+c5;\n";
+            source<<"real2 d3 = c2-c5;\n";
+            source<<"real2 d4 = c4+c3;\n";
+            source<<"real2 d5 = c4-c3;\n";
+            source<<"real2 d6 = d2+d0;\n";
+            source<<"real2 d7 = d5+d3;\n";
+            source<<"real2 b0 = c0+d6+d4;\n";
+            source<<"real2 b1 = "<<context.doubleToString((cos(2*M_PI/7)+cos(4*M_PI/7)+cos(6*M_PI/7))/3-1)<<"*(d6+d4);\n";
+            source<<"real2 b2 = "<<context.doubleToString((2*cos(2*M_PI/7)-cos(4*M_PI/7)-cos(6*M_PI/7))/3)<<"*(d0-d4);\n";
+            source<<"real2 b3 = "<<context.doubleToString((cos(2*M_PI/7)-2*cos(4*M_PI/7)+cos(6*M_PI/7))/3)<<"*(d4-d2);\n";
+            source<<"real2 b4 = "<<context.doubleToString((cos(2*M_PI/7)+cos(4*M_PI/7)-2*cos(6*M_PI/7))/3)<<"*(d2-d0);\n";
+            source<<"real2 b5 = -(SIGN)*"<<context.doubleToString((sin(2*M_PI/7)+sin(4*M_PI/7)-sin(6*M_PI/7))/3)<<"*(d7+d1);\n";
+            source<<"real2 b6 = -(SIGN)*"<<context.doubleToString((2*sin(2*M_PI/7)-sin(4*M_PI/7)+sin(6*M_PI/7))/3)<<"*(d1-d5);\n";
+            source<<"real2 b7 = -(SIGN)*"<<context.doubleToString((sin(2*M_PI/7)-2*sin(4*M_PI/7)-sin(6*M_PI/7))/3)<<"*(d5-d3);\n";
+            source<<"real2 b8 = -(SIGN)*"<<context.doubleToString((sin(2*M_PI/7)+sin(4*M_PI/7)+2*sin(6*M_PI/7))/3)<<"*(d3-d1);\n";
+            source<<"real2 t0 = b0+b1;\n";
+            source<<"real2 t1 = b2+b3;\n";
+            source<<"real2 t2 = b4-b3;\n";
+            source<<"real2 t3 = -b2-b4;\n";
+            source<<"real2 t4 = b6+b7;\n";
+            source<<"real2 t5 = b8-b7;\n";
+            source<<"real2 t6 = -b8-b6;\n";
+            source<<"real2 t7 = t0+t1;\n";
+            source<<"real2 t8 = t0+t2;\n";
+            source<<"real2 t9 = t0+t3;\n";
+            source<<"real2 t10 = make_real2(t4.y+b5.y, -(t4.x+b5.x));\n";
+            source<<"real2 t11 = make_real2(t5.y+b5.y, -(t5.x+b5.x));\n";
+            source<<"real2 t12 = make_real2(t6.y+b5.y, -(t6.x+b5.x));\n";
+            source<<"data"<<output<<"[base+6*j*"<<m<<"] = b0;\n";
+            source<<"data"<<output<<"[base+(6*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(7*L)<<"], t7-t10);\n";
+            source<<"data"<<output<<"[base+(6*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(7*L)<<"], t9-t12);\n";
+            source<<"data"<<output<<"[base+(6*j+3)*"<<m<<"] = multiplyComplex(w[j*"<<(3*zsize)<<"/"<<(7*L)<<"], t8+t11);\n";
+            source<<"data"<<output<<"[base+(6*j+4)*"<<m<<"] = multiplyComplex(w[j*"<<(4*zsize)<<"/"<<(7*L)<<"], t8-t11);\n";
+            source<<"data"<<output<<"[base+(6*j+5)*"<<m<<"] = multiplyComplex(w[j*"<<(5*zsize)<<"/"<<(7*L)<<"], t9+t12);\n";
+            source<<"data"<<output<<"[base+(6*j+6)*"<<m<<"] = multiplyComplex(w[j*"<<(6*zsize)<<"/"<<(7*L)<<"], t7+t10);\n";
+        }
+        else if (radix == 5) {
+            source<<"real2 c0 = data"<<input<<"[base];\n";
+            source<<"real2 c1 = data"<<input<<"[base+"<<(L*m)<<"];\n";
+            source<<"real2 c2 = data"<<input<<"[base+"<<(2*L*m)<<"];\n";
+            source<<"real2 c3 = data"<<input<<"[base+"<<(3*L*m)<<"];\n";
+            source<<"real2 c4 = data"<<input<<"[base+"<<(4*L*m)<<"];\n";
+            source<<"real2 d0 = c1+c4;\n";
+            source<<"real2 d1 = c2+c3;\n";
+            source<<"real2 d2 = "<<context.doubleToString(sin(0.4*M_PI))<<"*(c1-c4);\n";
+            source<<"real2 d3 = "<<context.doubleToString(sin(0.4*M_PI))<<"*(c2-c3);\n";
+            source<<"real2 d4 = d0+d1;\n";
+            source<<"real2 d5 = "<<context.doubleToString(0.25*sqrt(5.0))<<"*(d0-d1);\n";
+            source<<"real2 d6 = c0-0.25f*d4;\n";
+            source<<"real2 d7 = d6+d5;\n";
+            source<<"real2 d8 = d6-d5;\n";
+            string coeff = context.doubleToString(sin(0.2*M_PI)/sin(0.4*M_PI));
+            source<<"real2 d9 = (SIGN)*make_real2(d2.y+"<<coeff<<"*d3.y, -d2.x-"<<coeff<<"*d3.x);\n";
+            source<<"real2 d10 = (SIGN)*make_real2("<<coeff<<"*d2.y-d3.y, d3.x-"<<coeff<<"*d2.x);\n";
+            source<<"data"<<output<<"[base+4*j*"<<m<<"] = c0+d4;\n";
+            source<<"data"<<output<<"[base+(4*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(5*L)<<"], d7+d9);\n";
+            source<<"data"<<output<<"[base+(4*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(5*L)<<"], d8+d10);\n";
+            source<<"data"<<output<<"[base+(4*j+3)*"<<m<<"] = multiplyComplex(w[j*"<<(3*zsize)<<"/"<<(5*L)<<"], d8-d10);\n";
+            source<<"data"<<output<<"[base+(4*j+4)*"<<m<<"] = multiplyComplex(w[j*"<<(4*zsize)<<"/"<<(5*L)<<"], d7-d9);\n";
+        }
+        else if (radix == 4) {
+            source<<"real2 c0 = data"<<input<<"[base];\n";
+            source<<"real2 c1 = data"<<input<<"[base+"<<(L*m)<<"];\n";
+            source<<"real2 c2 = data"<<input<<"[base+"<<(2*L*m)<<"];\n";
+            source<<"real2 c3 = data"<<input<<"[base+"<<(3*L*m)<<"];\n";
+            source<<"real2 d0 = c0+c2;\n";
+            source<<"real2 d1 = c0-c2;\n";
+            source<<"real2 d2 = c1+c3;\n";
+            source<<"real2 d3 = (SIGN)*make_real2(c1.y-c3.y, c3.x-c1.x);\n";
+            source<<"data"<<output<<"[base+3*j*"<<m<<"] = d0+d2;\n";
+            source<<"data"<<output<<"[base+(3*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(4*L)<<"], d1+d3);\n";
+            source<<"data"<<output<<"[base+(3*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(4*L)<<"], d0-d2);\n";
+            source<<"data"<<output<<"[base+(3*j+3)*"<<m<<"] = multiplyComplex(w[j*"<<(3*zsize)<<"/"<<(4*L)<<"], d1-d3);\n";
+        }
+        else if (radix == 3) {
+            source<<"real2 c0 = data"<<input<<"[base];\n";
+            source<<"real2 c1 = data"<<input<<"[base+"<<(L*m)<<"];\n";
+            source<<"real2 c2 = data"<<input<<"[base+"<<(2*L*m)<<"];\n";
+            source<<"real2 d0 = c1+c2;\n";
+            source<<"real2 d1 = c0-0.5f*d0;\n";
+            source<<"real2 d2 = (SIGN)*"<<context.doubleToString(sin(M_PI/3.0))<<"*make_real2(c1.y-c2.y, c2.x-c1.x);\n";
+            source<<"data"<<output<<"[base+2*j*"<<m<<"] = c0+d0;\n";
+            source<<"data"<<output<<"[base+(2*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(3*L)<<"], d1+d2);\n";
+            source<<"data"<<output<<"[base+(2*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(3*L)<<"], d1-d2);\n";
+        }
+        else if (radix == 2) {
+            source<<"real2 c0 = data"<<input<<"[base];\n";
+            source<<"real2 c1 = data"<<input<<"[base+"<<(L*m)<<"];\n";
+            source<<"data"<<output<<"[base+j*"<<m<<"] = c0+c1;\n";
+            source<<"data"<<output<<"[base+(j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(2*L)<<"], c0-c1);\n";
+        }
+        source<<"}\n";
+        m = m*radix;
+        source<<"__syncthreads();\n";
+        source<<"}\n";
+        ++stage;
+    }
+    // Create the kernel.
+    bool outputIsReal = (inputIsReal && axis == 2 && !forward);
+    bool outputIsPacked = (inputIsReal && axis == 2 && forward);
+    string outputSuffix = (outputIsReal ? ".x" : "");
+    if (outputIsPacked)
+        source<<"if (index < XSIZE*YSIZE && x < XSIZE/2+1)\n";
+    else
+        source<<"if (index < XSIZE*YSIZE)\n";
+    source<<"for (int i = threadIdx.x-block*THREADS_PER_BLOCK; i < ZSIZE; i += THREADS_PER_BLOCK)\n";
+    if (outputIsPacked)
+        source<<"out[y*(ZSIZE*(XSIZE/2+1))+i*(XSIZE/2+1)+x] = data"<<(stage%2)<<"[i+block*ZSIZE]"<<outputSuffix<<";\n";
+    else
+            source<<"out[y*(ZSIZE*XSIZE)+i*XSIZE+x] = data"<<(stage%2)<<"[i+block*ZSIZE]"<<outputSuffix<<";\n";
+    map<string, string> replacements;
+    replacements["XSIZE"] = context.intToString(xsize);
+    replacements["YSIZE"] = context.intToString(ysize);
+    replacements["ZSIZE"] = context.intToString(zsize);
+    replacements["BLOCKS_PER_GROUP"] = context.intToString(blocksPerGroup);
+    replacements["THREADS_PER_BLOCK"] = context.intToString(threadsPerBlock);
+    replacements["M_PI"] = context.doubleToString(M_PI);
+    replacements["COMPUTE_FFT"] = source.str();
+    replacements["SIGN"] = (forward ? "1" : "-1");
+    replacements["INPUT_TYPE"] = (inputIsReal && axis == 0 && forward ? "real" : "real2");
+    replacements["OUTPUT_TYPE"] = (outputIsReal ? "real" : "real2");
+    replacements["INPUT_IS_REAL"] = (inputIsReal && axis == 0 && forward ? "1" : "0");
+    replacements["INPUT_IS_PACKED"] = (inputIsReal && axis == 0 && !forward ? "1" : "0");
+    replacements["OUTPUT_IS_PACKED"] = (outputIsPacked ? "1" : "0");
+    hipModule_t module = context.createModule(HipKernelSources::vectorOps+context.replaceStrings(HipKernelSources::fft, replacements));
+    hipFunction_t kernel = context.getKernel(module, "execFFT");
+    threads = blocksPerGroup*threadsPerBlock;
+    return kernel;
+}
--- a/platforms/hip/src/HipIntegrationUtilities.cpp
+++ b/platforms/hip/src/HipIntegrationUtilities.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009-2019 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "HipIntegrationUtilities.h"
+#include "HipContext.h"
+using namespace OpenMM;
+using namespace std;
+#define CHECK_RESULT(result) CHECK_RESULT2(result, errorMessage);
+#define CHECK_RESULT2(result, prefix) \
+    if (result != hipSuccess) { \
+        std::stringstream m; \
+        m<<prefix<<": "<<dynamic_cast<HipContext&>(context).getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
+        throw OpenMMException(m.str());\
+    }
+HipIntegrationUtilities::HipIntegrationUtilities(HipContext& context, const System& system) : IntegrationUtilities(context, system),
+        ccmaConvergedMemory(NULL) {
+        CHECK_RESULT2(hipEventCreateWithFlags(&ccmaEvent, hipEventDisableTiming), "Error creating event for CCMA");
+        CHECK_RESULT2(hipHostMalloc((void**) &ccmaConvergedMemory, sizeof(int), hipHostMallocMapped), "Error allocating pinned memory");
+        CHECK_RESULT2(hipHostGetDevicePointer(&ccmaConvergedDeviceMemory, ccmaConvergedMemory, 0), "Error getting device address for pinned memory");
+}
+HipIntegrationUtilities::~HipIntegrationUtilities() {
+    context.setAsCurrent();
+    if (ccmaConvergedMemory != NULL) {
+        hipHostFree(ccmaConvergedMemory);
+        hipEventDestroy(ccmaEvent);
+    }
+}
+HipArray& HipIntegrationUtilities::getPosDelta() {
+    return dynamic_cast<HipContext&>(context).unwrap(posDelta);
+}
+HipArray& HipIntegrationUtilities::getRandom() {
+    return dynamic_cast<HipContext&>(context).unwrap(random);
+}
+HipArray& HipIntegrationUtilities::getStepSize() {
+    return dynamic_cast<HipContext&>(context).unwrap(stepSize);
+}
+void HipIntegrationUtilities::applyConstraintsImpl(bool constrainVelocities, double tol) {
+    ComputeKernel settleKernel, shakeKernel, ccmaForceKernel;
+    if (constrainVelocities) {
+        settleKernel = settleVelKernel;
+        shakeKernel = shakeVelKernel;
+        ccmaForceKernel = ccmaVelForceKernel;
+    }
+    else {
+        settleKernel = settlePosKernel;
+        shakeKernel = shakePosKernel;
+        ccmaForceKernel = ccmaPosForceKernel;
+    }
+    if (settleAtoms.isInitialized()) {
+        if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
+            settleKernel->setArg(1, tol);
+        else
+            settleKernel->setArg(1, (float) tol);
+        settleKernel->execute(settleAtoms.getSize());
+    }
+    if (shakeAtoms.isInitialized()) {
+        if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
+            shakeKernel->setArg(1, tol);
+        else
+            shakeKernel->setArg(1, (float) tol);
+        shakeKernel->execute(shakeAtoms.getSize());
+    }
+    if (ccmaConstraintAtoms.isInitialized()) {
+        if (ccmaConstraintAtoms.getSize() <= 1024) {
+            // Use the version of CCMA that runs in a single kernel with one workgroup.
+            ccmaFullKernel->setArg(0, (int) constrainVelocities);
+            if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
+                ccmaFullKernel->setArg(14, tol);
+            else
+                ccmaFullKernel->setArg(14, (float) tol);
+            ccmaFullKernel->execute(128, 128);
+        }
+        else {
+            ccmaForceKernel->setArg(6, ccmaConvergedDeviceMemory);
+            if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
+                ccmaForceKernel->setArg(7, tol);
+            else
+                ccmaForceKernel->setArg(7, (float) tol);
+            ccmaDirectionsKernel->execute(ccmaConstraintAtoms.getSize());
+            const int checkInterval = 4;
+            ccmaConvergedMemory[0] = 0;
+            ccmaUpdateKernel->setArg(4, constrainVelocities ? context.getVelm() : posDelta);
+            for (int i = 0; i < 150; i++) {
+                ccmaForceKernel->setArg(8, i);
+                ccmaForceKernel->execute(ccmaConstraintAtoms.getSize());
+                if ((i+1)%checkInterval == 0)
+                    CHECK_RESULT2(hipEventRecord(ccmaEvent, 0), "Error recording event for CCMA");
+                ccmaMultiplyKernel->setArg(5, i);
+                ccmaMultiplyKernel->execute(ccmaConstraintAtoms.getSize());
+                ccmaUpdateKernel->setArg(9, i);
+                ccmaUpdateKernel->execute(context.getNumAtoms());
+                if ((i+1)%checkInterval == 0) {
+                    CHECK_RESULT2(hipEventSynchronize(ccmaEvent), "Error synchronizing on event for CCMA");
+                    if (ccmaConvergedMemory[0])
+                        break;
+                }
+            }
+        }
+    }
+}
+void HipIntegrationUtilities::distributeForcesFromVirtualSites() {
+    if (numVsites > 0) {
+        vsiteForceKernel->setArg(2, context.getLongForceBuffer());
+        vsiteForceKernel->execute(numVsites);
+    }
+}
--- a/platforms/hip/src/HipKernel.cpp
+++ b/platforms/hip/src/HipKernel.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2019 Stanford University and the Authors.           *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "HipKernel.h"
+#include "openmm/common/ComputeArray.h"
+#include "openmm/internal/AssertionUtilities.h"
+#include <cstring>
+#include <vector>
+using namespace OpenMM;
+using namespace std;
+HipKernel::HipKernel(HipContext& context, hipFunction_t kernel, const string& name) : context(context), kernel(kernel), name(name) {
+}
+string HipKernel::getName() const {
+    return name;
+}
+int HipKernel::getMaxBlockSize() const {
+    int size;
+    hipError_t result = hipFuncGetAttribute(&size, HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel);
+    if (result != hipSuccess)
+        throw OpenMMException("Error querying max thread block size: "+context.getErrorString(result));
+    return size;
+}
+void HipKernel::execute(int threads, int blockSize) {
+    int numArgs = arrayArgs.size();
+    argPointers.resize(numArgs);
+    for (int i = 0; i < numArgs; i++) {
+        if (arrayArgs[i] != NULL)
+            argPointers[i] = &arrayArgs[i]->getDevicePointer();
+        else
+            argPointers[i] = &primitiveArgs[i];
+    }
+    context.executeKernel(kernel, argPointers.data(), threads, blockSize);
+}
+void HipKernel::addArrayArg(ArrayInterface& value) {
+    int index = arrayArgs.size();
+    addEmptyArg();
+    setArrayArg(index, value);
+}
+void HipKernel::addPrimitiveArg(const void* value, int size) {
+    int index = arrayArgs.size();
+    addEmptyArg();
+    setPrimitiveArg(index, value, size);
+}
+void HipKernel::addEmptyArg() {
+    primitiveArgs.push_back(make_double4(0, 0, 0, 0));
+    arrayArgs.push_back(NULL);
+}
+void HipKernel::setArrayArg(int index, ArrayInterface& value) {
+    ASSERT_VALID_INDEX(index, arrayArgs);
+    arrayArgs[index] = &context.unwrap(value);
+}
+void HipKernel::setPrimitiveArg(int index, const void* value, int size) {
+    ASSERT_VALID_INDEX(index, primitiveArgs);
+    if (size > sizeof(double4))
+        throw OpenMMException("Unsupported value type for kernel argument");
+    memcpy(&primitiveArgs[index], value, size);
+    arrayArgs[index] = NULL;
+}
--- a/platforms/hip/src/HipKernelFactory.cpp
+++ b/platforms/hip/src/HipKernelFactory.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2019 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "HipKernelFactory.h"
+#include "HipKernels.h"
+#include "HipParallelKernels.h"
+#include "HipPlatform.h"
+#include "openmm/common/CommonKernels.h"
+#include "openmm/internal/ContextImpl.h"
+#include "openmm/OpenMMException.h"
+using namespace OpenMM;
+KernelImpl* HipKernelFactory::createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const {
+    HipPlatform::PlatformData& data = *static_cast<HipPlatform::PlatformData*>(context.getPlatformData());
+    if (data.contexts.size() > 1) {
+        // We are running in parallel on multiple devices, so we may want to create a parallel kernel.
+        if (name == CalcForcesAndEnergyKernel::Name())
+            return new HipParallelCalcForcesAndEnergyKernel(name, platform, data);
+        if (name == CalcHarmonicBondForceKernel::Name())
+            return new HipParallelCalcHarmonicBondForceKernel(name, platform, data, context.getSystem());
+        if (name == CalcCustomBondForceKernel::Name())
+            return new HipParallelCalcCustomBondForceKernel(name, platform, data, context.getSystem());
+        if (name == CalcHarmonicAngleForceKernel::Name())
+            return new HipParallelCalcHarmonicAngleForceKernel(name, platform, data, context.getSystem());
+        if (name == CalcCustomAngleForceKernel::Name())
+            return new HipParallelCalcCustomAngleForceKernel(name, platform, data, context.getSystem());
+        if (name == CalcPeriodicTorsionForceKernel::Name())
+            return new HipParallelCalcPeriodicTorsionForceKernel(name, platform, data, context.getSystem());
+        if (name == CalcRBTorsionForceKernel::Name())
+            return new HipParallelCalcRBTorsionForceKernel(name, platform, data, context.getSystem());
+        if (name == CalcCMAPTorsionForceKernel::Name())
+            return new HipParallelCalcCMAPTorsionForceKernel(name, platform, data, context.getSystem());
+        if (name == CalcCustomTorsionForceKernel::Name())
+            return new HipParallelCalcCustomTorsionForceKernel(name, platform, data, context.getSystem());
+        if (name == CalcNonbondedForceKernel::Name())
+            return new HipParallelCalcNonbondedForceKernel(name, platform, data, context.getSystem());
+        if (name == CalcCustomNonbondedForceKernel::Name())
+            return new HipParallelCalcCustomNonbondedForceKernel(name, platform, data, context.getSystem());
+        if (name == CalcCustomExternalForceKernel::Name())
+            return new HipParallelCalcCustomExternalForceKernel(name, platform, data, context.getSystem());
+        if (name == CalcCustomHbondForceKernel::Name())
+            return new HipParallelCalcCustomHbondForceKernel(name, platform, data, context.getSystem());
+        if (name == CalcCustomCompoundBondForceKernel::Name())
+            return new HipParallelCalcCustomCompoundBondForceKernel(name, platform, data, context.getSystem());
+    }
+    HipContext& cu = *data.contexts[0];
+    if (name == CalcForcesAndEnergyKernel::Name())
+        return new HipCalcForcesAndEnergyKernel(name, platform, cu);
+    if (name == UpdateStateDataKernel::Name())
+        return new HipUpdateStateDataKernel(name, platform, cu);
+    if (name == ApplyConstraintsKernel::Name())
+        return new CommonApplyConstraintsKernel(name, platform, cu);
+    if (name == VirtualSitesKernel::Name())
+        return new CommonVirtualSitesKernel(name, platform, cu);
+    if (name == CalcHarmonicBondForceKernel::Name())
+        return new CommonCalcHarmonicBondForceKernel(name, platform, cu, context.getSystem());
+    if (name == CalcCustomBondForceKernel::Name())
+        return new CommonCalcCustomBondForceKernel(name, platform, cu, context.getSystem());
+    if (name == CalcHarmonicAngleForceKernel::Name())
+        return new CommonCalcHarmonicAngleForceKernel(name, platform, cu, context.getSystem());
+    if (name == CalcCustomAngleForceKernel::Name())
+        return new CommonCalcCustomAngleForceKernel(name, platform, cu, context.getSystem());
+    if (name == CalcPeriodicTorsionForceKernel::Name())
+        return new CommonCalcPeriodicTorsionForceKernel(name, platform, cu, context.getSystem());
+    if (name == CalcRBTorsionForceKernel::Name())
+        return new CommonCalcRBTorsionForceKernel(name, platform, cu, context.getSystem());
+    if (name == CalcCMAPTorsionForceKernel::Name())
+        return new CommonCalcCMAPTorsionForceKernel(name, platform, cu, context.getSystem());
+    if (name == CalcCustomTorsionForceKernel::Name())
+        return new CommonCalcCustomTorsionForceKernel(name, platform, cu, context.getSystem());
+    if (name == CalcNonbondedForceKernel::Name())
+        return new HipCalcNonbondedForceKernel(name, platform, cu, context.getSystem());
+    if (name == CalcCustomNonbondedForceKernel::Name())
+        return new CommonCalcCustomNonbondedForceKernel(name, platform, cu, context.getSystem());
+    if (name == CalcGBSAOBCForceKernel::Name())
+        return new CommonCalcGBSAOBCForceKernel(name, platform, cu);
+    if (name == CalcCustomGBForceKernel::Name())
+        return new CommonCalcCustomGBForceKernel(name, platform, cu, context.getSystem());
+    if (name == CalcCustomExternalForceKernel::Name())
+        return new CommonCalcCustomExternalForceKernel(name, platform, cu, context.getSystem());
+    if (name == CalcCustomHbondForceKernel::Name())
+        return new CommonCalcCustomHbondForceKernel(name, platform, cu, context.getSystem());
+    if (name == CalcCustomCentroidBondForceKernel::Name())
+        return new CommonCalcCustomCentroidBondForceKernel(name, platform, cu, context.getSystem());
+    if (name == CalcCustomCompoundBondForceKernel::Name())
+        return new CommonCalcCustomCompoundBondForceKernel(name, platform, cu, context.getSystem());
+    if (name == CalcCustomCVForceKernel::Name())
+        return new HipCalcCustomCVForceKernel(name, platform, cu);
+    if (name == CalcRMSDForceKernel::Name())
+        return new CommonCalcRMSDForceKernel(name, platform, cu);
+    if (name == CalcCustomManyParticleForceKernel::Name())
+        return new CommonCalcCustomManyParticleForceKernel(name, platform, cu, context.getSystem());
+    if (name == CalcGayBerneForceKernel::Name())
+        return new CommonCalcGayBerneForceKernel(name, platform, cu);
+    if (name == IntegrateVerletStepKernel::Name())
+        return new CommonIntegrateVerletStepKernel(name, platform, cu);
+    if (name == IntegrateLangevinStepKernel::Name())
+        return new CommonIntegrateLangevinStepKernel(name, platform, cu);
+    if (name == IntegrateLangevinMiddleStepKernel::Name())
+        return new CommonIntegrateLangevinMiddleStepKernel(name, platform, cu);
+    if (name == IntegrateBrownianStepKernel::Name())
+        return new CommonIntegrateBrownianStepKernel(name, platform, cu);
+    if (name == IntegrateVariableVerletStepKernel::Name())
+        return new CommonIntegrateVariableVerletStepKernel(name, platform, cu);
+    if (name == IntegrateVariableLangevinStepKernel::Name())
+        return new CommonIntegrateVariableLangevinStepKernel(name, platform, cu);
+    if (name == IntegrateCustomStepKernel::Name())
+        return new CommonIntegrateCustomStepKernel(name, platform, cu);
+    if (name == ApplyAndersenThermostatKernel::Name())
+        return new CommonApplyAndersenThermostatKernel(name, platform, cu);
+    if (name == IntegrateNoseHooverStepKernel::Name())
+        return new CommonIntegrateNoseHooverStepKernel(name, platform, cu);
+    if (name == ApplyMonteCarloBarostatKernel::Name())
+        return new CommonApplyMonteCarloBarostatKernel(name, platform, cu);
+    if (name == RemoveCMMotionKernel::Name())
+        return new CommonRemoveCMMotionKernel(name, platform, cu);
+    throw OpenMMException((std::string("Tried to create kernel with illegal kernel name '")+name+"'").c_str());
+}
--- a/platforms/hip/src/HipKernelSources.cpp.in
+++ b/platforms/hip/src/HipKernelSources.cpp.in
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2012 Stanford University and the Authors.           *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "HipKernelSources.h"
+using namespace OpenMM;
+using namespace std;
--- a/platforms/hip/src/HipKernelSources.h.in
+++ b/platforms/hip/src/HipKernelSources.h.in
+#ifndef OPENMM_HIPKERNELSOURCES_H_
+#define OPENMM_HIPKERNELSOURCES_H_
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2010-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#ifdef _MSC_VER
+#error "Windows unsupported for HIP platform"
+#endif
+#include "openmm/common/windowsExportCommon.h"
+#include <string>
+namespace OpenMM {
+/**
+ * This class is a central holding place for the source code of HIP kernels.
+ * The CMake build script inserts declarations into it based on the .hip files in the
+ * kernels subfolder.
+ */
+class OPENMM_EXPORT_COMMON HipKernelSources {
+public:
+@KERNEL_FILE_DECLARATIONS@
+};
+} // namespace OpenMM
+#endif /*OPENMM_HIPKERNELSOURCES_H_*/
--- a/platforms/hip/src/HipKernels.cpp
+++ b/platforms/hip/src/HipKernels.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2019 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "HipKernels.h"
+#include "HipForceInfo.h"
+#include "openmm/Context.h"
+#include "openmm/internal/ContextImpl.h"
+#include "openmm/internal/NonbondedForceImpl.h"
+#include "CommonKernelSources.h"
+#include "HipBondedUtilities.h"
+#include "HipExpressionUtilities.h"
+#include "HipIntegrationUtilities.h"
+#include "HipNonbondedUtilities.h"
+#include "HipKernelSources.h"
+#include "SimTKOpenMMRealType.h"
+#include "SimTKOpenMMUtilities.h"
+#include <algorithm>
+#include <cmath>
+#include <iterator>
+#include <set>
+#include <assert.h>
+using namespace OpenMM;
+using namespace std;
+#define CHECK_RESULT(result, prefix) \
+    if (result != hipSuccess) { \
+        std::stringstream m; \
+        m<<prefix<<": "<<HipContext::getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
+        throw OpenMMException(m.str());\
+    }
+void HipCalcForcesAndEnergyKernel::initialize(const System& system) {
+}
+void HipCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForces, bool includeEnergy, int groups) {
+    cu.setForcesValid(true);
+    cu.setAsCurrent();
+    cu.clearAutoclearBuffers();
+    for (auto computation : cu.getPreComputations())
+        computation->computeForceAndEnergy(includeForces, includeEnergy, groups);
+    HipNonbondedUtilities& nb = cu.getNonbondedUtilities();
+    cu.setComputeForceCount(cu.getComputeForceCount()+1);
+    nb.prepareInteractions(groups);
+    map<string, double>& derivs = cu.getEnergyParamDerivWorkspace();
+    for (auto& param : context.getParameters())
+        derivs[param.first] = 0;
+}
+double HipCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForces, bool includeEnergy, int groups, bool& valid) {
+    cu.setAsCurrent();
+    cu.getBondedUtilities().computeInteractions(groups);
+    cu.getNonbondedUtilities().computeInteractions(groups, includeForces, includeEnergy);
+    double sum = 0.0;
+    for (auto computation : cu.getPostComputations())
+        sum += computation->computeForceAndEnergy(includeForces, includeEnergy, groups);
+    cu.getIntegrationUtilities().distributeForcesFromVirtualSites();
+    if (includeEnergy)
+        sum += cu.reduceEnergy();
+    if (!cu.getForcesValid())
+        valid = false;
+    return sum;
+}
+void HipUpdateStateDataKernel::initialize(const System& system) {
+}
+double HipUpdateStateDataKernel::getTime(const ContextImpl& context) const {
+    return cu.getTime();
+}
+void HipUpdateStateDataKernel::setTime(ContextImpl& context, double time) {
+    vector<HipContext*>& contexts = cu.getPlatformData().contexts;
+    for (auto ctx : contexts)
+        ctx->setTime(time);
+}
+void HipUpdateStateDataKernel::getPositions(ContextImpl& context, vector<Vec3>& positions) {
+    cu.setAsCurrent();
+    int numParticles = context.getSystem().getNumParticles();
+    positions.resize(numParticles);
+    vector<float4> posCorrection;
+    if (cu.getUseDoublePrecision()) {
+        double4* posq = (double4*) cu.getPinnedBuffer();
+        cu.getPosq().download(posq);
+    }
+    else if (cu.getUseMixedPrecision()) {
+        float4* posq = (float4*) cu.getPinnedBuffer();
+        cu.getPosq().download(posq, false);
+        posCorrection.resize(numParticles);
+        cu.getPosqCorrection().download(posCorrection);
+    }
+    else {
+        float4* posq = (float4*) cu.getPinnedBuffer();
+        cu.getPosq().download(posq);
+    }
+    // Filling in the output array is done in parallel for speed.
+    cu.getPlatformData().threads.execute([&] (ThreadPool& threads, int threadIndex) {
+        // Compute the position of each particle to return to the user.  This is done in parallel for speed.
+        const vector<int>& order = cu.getAtomIndex();
+        int numParticles = cu.getNumAtoms();
+        Vec3 boxVectors[3];
+        cu.getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
+        int numThreads = threads.getNumThreads();
+        int start = threadIndex*numParticles/numThreads;
+        int end = (threadIndex+1)*numParticles/numThreads;
+        if (cu.getUseDoublePrecision()) {
+            double4* posq = (double4*) cu.getPinnedBuffer();
+            for (int i = start; i < end; ++i) {
+                double4 pos = posq[i];
+                mm_int4 offset = cu.getPosCellOffsets()[i];
+                positions[order[i]] = Vec3(pos.x, pos.y, pos.z)-boxVectors[0]*offset.x-boxVectors[1]*offset.y-boxVectors[2]*offset.z;
+            }
+        }
+        else if (cu.getUseMixedPrecision()) {
+            float4* posq = (float4*) cu.getPinnedBuffer();
+            for (int i = start; i < end; ++i) {
+                float4 pos1 = posq[i];
+                float4 pos2 = posCorrection[i];
+                mm_int4 offset = cu.getPosCellOffsets()[i];
+                positions[order[i]] = Vec3((double)pos1.x+(double)pos2.x, (double)pos1.y+(double)pos2.y, (double)pos1.z+(double)pos2.z)-boxVectors[0]*offset.x-boxVectors[1]*offset.y-boxVectors[2]*offset.z;
+            }
+        }
+        else {
+            float4* posq = (float4*) cu.getPinnedBuffer();
+            for (int i = start; i < end; ++i) {
+                float4 pos = posq[i];
+                mm_int4 offset = cu.getPosCellOffsets()[i];
+                positions[order[i]] = Vec3(pos.x, pos.y, pos.z)-boxVectors[0]*offset.x-boxVectors[1]*offset.y-boxVectors[2]*offset.z;
+            }
+        }
+    });
+    cu.getPlatformData().threads.waitForThreads();
+}
+void HipUpdateStateDataKernel::setPositions(ContextImpl& context, const vector<Vec3>& positions) {
+    cu.setAsCurrent();
+    const vector<int>& order = cu.getAtomIndex();
+    int numParticles = context.getSystem().getNumParticles();
+    if (cu.getUseDoublePrecision()) {
+        double4* posq = (double4*) cu.getPinnedBuffer();
+        cu.getPosq().download(posq);
+        for (int i = 0; i < numParticles; ++i) {
+            double4& pos = posq[i];
+            const Vec3& p = positions[order[i]];
+            pos.x = p[0];
+            pos.y = p[1];
+            pos.z = p[2];
+        }
+        for (int i = numParticles; i < cu.getPaddedNumAtoms(); i++)
+            posq[i] = make_double4(0.0, 0.0, 0.0, 0.0);
+        cu.getPosq().upload(posq);
+    }
+    else {
+        float4* posq = (float4*) cu.getPinnedBuffer();
+        cu.getPosq().download(posq);
+        for (int i = 0; i < numParticles; ++i) {
+            float4& pos = posq[i];
+            const Vec3& p = positions[order[i]];
+            pos.x = (float) p[0];
+            pos.y = (float) p[1];
+            pos.z = (float) p[2];
+        }
+        for (int i = numParticles; i < cu.getPaddedNumAtoms(); i++)
+            posq[i] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+        cu.getPosq().upload(posq);
+    }
+    if (cu.getUseMixedPrecision()) {
+        float4* posCorrection = (float4*) cu.getPinnedBuffer();
+        for (int i = 0; i < numParticles; ++i) {
+            float4& c = posCorrection[i];
+            const Vec3& p = positions[order[i]];
+            c.x = (float) (p[0]-(float)p[0]);
+            c.y = (float) (p[1]-(float)p[1]);
+            c.z = (float) (p[2]-(float)p[2]);
+            c.w = 0;
+        }
+        for (int i = numParticles; i < cu.getPaddedNumAtoms(); i++)
+            posCorrection[i] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+        cu.getPosqCorrection().upload(posCorrection);
+    }
+    for (auto& offset : cu.getPosCellOffsets())
+        offset = mm_int4(0, 0, 0, 0);
+    cu.reorderAtoms();
+}
+void HipUpdateStateDataKernel::getVelocities(ContextImpl& context, vector<Vec3>& velocities) {
+    cu.setAsCurrent();
+    const vector<int>& order = cu.getAtomIndex();
+    int numParticles = context.getSystem().getNumParticles();
+    velocities.resize(numParticles);
+    if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
+        double4* velm = (double4*) cu.getPinnedBuffer();
+        cu.getVelm().download(velm);
+        for (int i = 0; i < numParticles; ++i) {
+            double4 vel = velm[i];
+            mm_int4 offset = cu.getPosCellOffsets()[i];
+            velocities[order[i]] = Vec3(vel.x, vel.y, vel.z);
+        }
+    }
+    else {
+        float4* velm = (float4*) cu.getPinnedBuffer();
+        cu.getVelm().download(velm);
+        for (int i = 0; i < numParticles; ++i) {
+            float4 vel = velm[i];
+            mm_int4 offset = cu.getPosCellOffsets()[i];
+            velocities[order[i]] = Vec3(vel.x, vel.y, vel.z);
+        }
+    }
+}
+void HipUpdateStateDataKernel::setVelocities(ContextImpl& context, const vector<Vec3>& velocities) {
+    cu.setAsCurrent();
+    const vector<int>& order = cu.getAtomIndex();
+    int numParticles = context.getSystem().getNumParticles();
+    if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
+        double4* velm = (double4*) cu.getPinnedBuffer();
+        cu.getVelm().download(velm);
+        for (int i = 0; i < numParticles; ++i) {
+            double4& vel = velm[i];
+            const Vec3& p = velocities[order[i]];
+            vel.x = p[0];
+            vel.y = p[1];
+            vel.z = p[2];
+        }
+        for (int i = numParticles; i < cu.getPaddedNumAtoms(); i++)
+            velm[i] = make_double4(0.0, 0.0, 0.0, 0.0);
+        cu.getVelm().upload(velm);
+    }
+    else {
+        float4* velm = (float4*) cu.getPinnedBuffer();
+        cu.getVelm().download(velm);
+        for (int i = 0; i < numParticles; ++i) {
+            float4& vel = velm[i];
+            const Vec3& p = velocities[order[i]];
+            vel.x = p[0];
+            vel.y = p[1];
+            vel.z = p[2];
+        }
+        for (int i = numParticles; i < cu.getPaddedNumAtoms(); i++)
+            velm[i] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+        cu.getVelm().upload(velm);
+    }
+}
+void HipUpdateStateDataKernel::getForces(ContextImpl& context, vector<Vec3>& forces) {
+    cu.setAsCurrent();
+    long long* force = (long long*) cu.getPinnedBuffer();
+    cu.getForce().download(force);
+    const vector<int>& order = cu.getAtomIndex();
+    int numParticles = context.getSystem().getNumParticles();
+    int paddedNumParticles = cu.getPaddedNumAtoms();
+    forces.resize(numParticles);
+    double scale = 1.0/(double) 0x100000000LL;
+    for (int i = 0; i < numParticles; ++i)
+        forces[order[i]] = Vec3(scale*force[i], scale*force[i+paddedNumParticles], scale*force[i+paddedNumParticles*2]);
+}
+void HipUpdateStateDataKernel::getEnergyParameterDerivatives(ContextImpl& context, map<string, double>& derivs) {
+    const vector<string>& paramDerivNames = cu.getEnergyParamDerivNames();
+    int numDerivs = paramDerivNames.size();
+    if (numDerivs == 0)
+        return;
+    derivs = cu.getEnergyParamDerivWorkspace();
+    HipArray& derivArray = cu.getEnergyParamDerivBuffer();
+    if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
+        vector<double> derivBuffers;
+        derivArray.download(derivBuffers);
+        for (int i = numDerivs; i < derivArray.getSize(); i += numDerivs)
+            for (int j = 0; j < numDerivs; j++)
+                derivBuffers[j] += derivBuffers[i+j];
+        for (int i = 0; i < numDerivs; i++)
+            derivs[paramDerivNames[i]] += derivBuffers[i];
+    }
+    else {
+        vector<float> derivBuffers;
+        derivArray.download(derivBuffers);
+        for (int i = numDerivs; i < derivArray.getSize(); i += numDerivs)
+            for (int j = 0; j < numDerivs; j++)
+                derivBuffers[j] += derivBuffers[i+j];
+        for (int i = 0; i < numDerivs; i++)
+            derivs[paramDerivNames[i]] += derivBuffers[i];
+    }
+}
+void HipUpdateStateDataKernel::getPeriodicBoxVectors(ContextImpl& context, Vec3& a, Vec3& b, Vec3& c) const {
+    cu.getPeriodicBoxVectors(a, b, c);
+}
+void HipUpdateStateDataKernel::setPeriodicBoxVectors(ContextImpl& context, const Vec3& a, const Vec3& b, const Vec3& c) {
+    vector<HipContext*>& contexts = cu.getPlatformData().contexts;
+    // If any particles have been wrapped to the first periodic box, we need to unwrap them
+    // to avoid changing their positions.
+    vector<Vec3> positions;
+    for (auto& offset : cu.getPosCellOffsets()) {
+        if (offset.x != 0 || offset.y != 0 || offset.z != 0) {
+            getPositions(context, positions);
+            break;
+        }
+    }
+    // Update the vectors.
+    for (auto ctx : contexts)
+        ctx->setPeriodicBoxVectors(a, b, c);
+    if (positions.size() > 0)
+        setPositions(context, positions);
+}
+void HipUpdateStateDataKernel::createCheckpoint(ContextImpl& context, ostream& stream) {
+    cu.setAsCurrent();
+    int version = 3;
+    stream.write((char*) &version, sizeof(int));
+    int precision = (cu.getUseDoublePrecision() ? 2 : cu.getUseMixedPrecision() ? 1 : 0);
+    stream.write((char*) &precision, sizeof(int));
+    double time = cu.getTime();
+    stream.write((char*) &time, sizeof(double));
+    int stepCount = cu.getStepCount();
+    stream.write((char*) &stepCount, sizeof(int));
+    int stepsSinceReorder = cu.getStepsSinceReorder();
+    stream.write((char*) &stepsSinceReorder, sizeof(int));
+    char* buffer = (char*) cu.getPinnedBuffer();
+    cu.getPosq().download(buffer);
+    stream.write(buffer, cu.getPosq().getSize()*cu.getPosq().getElementSize());
+    if (cu.getUseMixedPrecision()) {
+        cu.getPosqCorrection().download(buffer);
+        stream.write(buffer, cu.getPosqCorrection().getSize()*cu.getPosqCorrection().getElementSize());
+    }
+    cu.getVelm().download(buffer);
+    stream.write(buffer, cu.getVelm().getSize()*cu.getVelm().getElementSize());
+    stream.write((char*) &cu.getAtomIndex()[0], sizeof(int)*cu.getAtomIndex().size());
+    stream.write((char*) &cu.getPosCellOffsets()[0], sizeof(int4)*cu.getPosCellOffsets().size());
+    Vec3 boxVectors[3];
+    cu.getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
+    stream.write((char*) boxVectors, 3*sizeof(Vec3));
+    cu.getIntegrationUtilities().createCheckpoint(stream);
+    SimTKOpenMMUtilities::createCheckpoint(stream);
+}
+void HipUpdateStateDataKernel::loadCheckpoint(ContextImpl& context, istream& stream) {
+    cu.setAsCurrent();
+    int version;
+    stream.read((char*) &version, sizeof(int));
+    if (version != 3)
+        throw OpenMMException("Checkpoint was created with a different version of OpenMM");
+    int precision;
+    stream.read((char*) &precision, sizeof(int));
+    int expectedPrecision = (cu.getUseDoublePrecision() ? 2 : cu.getUseMixedPrecision() ? 1 : 0);
+    if (precision != expectedPrecision)
+        throw OpenMMException("Checkpoint was created with a different numeric precision");
+    double time;
+    stream.read((char*) &time, sizeof(double));
+    int stepCount, stepsSinceReorder;
+    stream.read((char*) &stepCount, sizeof(int));
+    stream.read((char*) &stepsSinceReorder, sizeof(int));
+    vector<HipContext*>& contexts = cu.getPlatformData().contexts;
+    for (auto ctx : contexts) {
+        ctx->setTime(time);
+        ctx->setStepCount(stepCount);
+        ctx->setStepsSinceReorder(stepsSinceReorder);
+    }
+    char* buffer = (char*) cu.getPinnedBuffer();
+    stream.read(buffer, cu.getPosq().getSize()*cu.getPosq().getElementSize());
+    cu.getPosq().upload(buffer);
+    if (cu.getUseMixedPrecision()) {
+        stream.read(buffer, cu.getPosqCorrection().getSize()*cu.getPosqCorrection().getElementSize());
+        cu.getPosqCorrection().upload(buffer);
+    }
+    stream.read(buffer, cu.getVelm().getSize()*cu.getVelm().getElementSize());
+    cu.getVelm().upload(buffer);
+    stream.read((char*) &cu.getAtomIndex()[0], sizeof(int)*cu.getAtomIndex().size());
+    cu.getAtomIndexArray().upload(cu.getAtomIndex());
+    stream.read((char*) &cu.getPosCellOffsets()[0], sizeof(int4)*cu.getPosCellOffsets().size());
+    Vec3 boxVectors[3];
+    stream.read((char*) &boxVectors, 3*sizeof(Vec3));
+    for (auto ctx : contexts)
+        ctx->setPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
+    cu.getIntegrationUtilities().loadCheckpoint(stream);
+    SimTKOpenMMUtilities::loadCheckpoint(stream);
+    for (auto listener : cu.getReorderListeners())
+        listener->execute();
+}
+class HipCalcNonbondedForceKernel::ForceInfo : public HipForceInfo {
+public:
+    ForceInfo(const NonbondedForce& force) : force(force) {
+    }
+    bool areParticlesIdentical(int particle1, int particle2) {
+        double charge1, charge2, sigma1, sigma2, epsilon1, epsilon2;
+        force.getParticleParameters(particle1, charge1, sigma1, epsilon1);
+        force.getParticleParameters(particle2, charge2, sigma2, epsilon2);
+        return (charge1 == charge2 && sigma1 == sigma2 && epsilon1 == epsilon2);
+    }
+    int getNumParticleGroups() {
+        return force.getNumExceptions();
+    }
+    void getParticlesInGroup(int index, vector<int>& particles) {
+        int particle1, particle2;
+        double chargeProd, sigma, epsilon;
+        force.getExceptionParameters(index, particle1, particle2, chargeProd, sigma, epsilon);
+        particles.resize(2);
+        particles[0] = particle1;
+        particles[1] = particle2;
+    }
+    bool areGroupsIdentical(int group1, int group2) {
+        int particle1, particle2;
+        double chargeProd1, chargeProd2, sigma1, sigma2, epsilon1, epsilon2;
+        force.getExceptionParameters(group1, particle1, particle2, chargeProd1, sigma1, epsilon1);
+        force.getExceptionParameters(group2, particle1, particle2, chargeProd2, sigma2, epsilon2);
+        return (chargeProd1 == chargeProd2 && sigma1 == sigma2 && epsilon1 == epsilon2);
+    }
+private:
+    const NonbondedForce& force;
+};
+class HipCalcNonbondedForceKernel::PmeIO : public CalcPmeReciprocalForceKernel::IO {
+public:
+    PmeIO(HipContext& cu, hipFunction_t addForcesKernel) : cu(cu), addForcesKernel(addForcesKernel) {
+        forceTemp.initialize<float4>(cu, cu.getNumAtoms(), "PmeForce");
+    }
+    float* getPosq() {
+        cu.setAsCurrent();
+        cu.getPosq().download(posq);
+        return (float*) &posq[0];
+    }
+    void setForce(float* force) {
+        forceTemp.upload(force);
+        void* args[] = {&forceTemp.getDevicePointer(), &cu.getForce().getDevicePointer()};
+        cu.executeKernel(addForcesKernel, args, cu.getNumAtoms());
+    }
+private:
+    HipContext& cu;
+    vector<float4> posq;
+    HipArray forceTemp;
+    hipFunction_t addForcesKernel;
+};
+class HipCalcNonbondedForceKernel::PmePreComputation : public HipContext::ForcePreComputation {
+public:
+    PmePreComputation(HipContext& cu, Kernel& pme, CalcPmeReciprocalForceKernel::IO& io) : cu(cu), pme(pme), io(io) {
+    }
+    void computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
+        Vec3 boxVectors[3] = {Vec3(cu.getPeriodicBoxSize().x, 0, 0), Vec3(0, cu.getPeriodicBoxSize().y, 0), Vec3(0, 0, cu.getPeriodicBoxSize().z)};
+        pme.getAs<CalcPmeReciprocalForceKernel>().beginComputation(io, boxVectors, includeEnergy);
+    }
+private:
+    HipContext& cu;
+    Kernel pme;
+    CalcPmeReciprocalForceKernel::IO& io;
+};
+class HipCalcNonbondedForceKernel::PmePostComputation : public HipContext::ForcePostComputation {
+public:
+    PmePostComputation(Kernel& pme, CalcPmeReciprocalForceKernel::IO& io) : pme(pme), io(io) {
+    }
+    double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
+        return pme.getAs<CalcPmeReciprocalForceKernel>().finishComputation(io);
+    }
+private:
+    Kernel pme;
+    CalcPmeReciprocalForceKernel::IO& io;
+};
+class HipCalcNonbondedForceKernel::SyncStreamPreComputation : public HipContext::ForcePreComputation {
+public:
+    SyncStreamPreComputation(HipContext& cu, hipStream_t stream, hipEvent_t event, int forceGroup) : cu(cu), stream(stream), event(event), forceGroup(forceGroup) {
+    }
+    void computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
+        if ((groups&(1<<forceGroup)) != 0) {
+            hipEventRecord(event, cu.getCurrentStream());
+            hipStreamWaitEvent(stream, event, 0);
+        }
+    }
+private:
+    HipContext& cu;
+    hipStream_t stream;
+    hipEvent_t event;
+    int forceGroup;
+};
+class HipCalcNonbondedForceKernel::SyncStreamPostComputation : public HipContext::ForcePostComputation {
+public:
+    SyncStreamPostComputation(HipContext& cu, hipEvent_t event, hipFunction_t addEnergyKernel, HipArray& pmeEnergyBuffer, int forceGroup) : cu(cu), event(event),
+            addEnergyKernel(addEnergyKernel), pmeEnergyBuffer(pmeEnergyBuffer), forceGroup(forceGroup) {
+    }
+    double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
+        if ((groups&(1<<forceGroup)) != 0) {
+            hipStreamWaitEvent(cu.getCurrentStream(), event, 0);
+            if (includeEnergy) {
+                int bufferSize = pmeEnergyBuffer.getSize();
+                void* args[] = {&pmeEnergyBuffer.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(), &bufferSize};
+                cu.executeKernel(addEnergyKernel, args, bufferSize);
+            }
+        }
+        return 0.0;
+    }
+private:
+    HipContext& cu;
+    hipEvent_t event;
+    hipFunction_t addEnergyKernel;
+    HipArray& pmeEnergyBuffer;
+    int forceGroup;
+};
+HipCalcNonbondedForceKernel::~HipCalcNonbondedForceKernel() {
+    cu.setAsCurrent();
+    if (sort != NULL)
+        delete sort;
+    if (fft != NULL)
+        delete fft;
+    if (dispersionFft != NULL)
+        delete dispersionFft;
+    if (pmeio != NULL)
+        delete pmeio;
+    if (hasInitializedFFT) {
+        if (useHipFFT) {
+            hipfftDestroy(fftForward);
+            hipfftDestroy(fftBackward);
+            if (doLJPME) {
+                hipfftDestroy(dispersionFftForward);
+                hipfftDestroy(dispersionFftBackward);
+            }
+        }
+        if (usePmeStream) {
+            hipStreamDestroy(pmeStream);
+            hipEventDestroy(pmeSyncEvent);
+            hipEventDestroy(paramsSyncEvent);
+        }
+    }
+}
+void HipCalcNonbondedForceKernel::initialize(const System& system, const NonbondedForce& force) {
+    cu.setAsCurrent();
+    int forceIndex;
+    for (forceIndex = 0; forceIndex < system.getNumForces() && &system.getForce(forceIndex) != &force; ++forceIndex)
+        ;
+    string prefix = "nonbonded"+cu.intToString(forceIndex)+"_";
+    // Identify which exceptions are 1-4 interactions.
+    set<int> exceptionsWithOffsets;
+    for (int i = 0; i < force.getNumExceptionParameterOffsets(); i++) {
+        string param;
+        int exception;
+        double charge, sigma, epsilon;
+        force.getExceptionParameterOffset(i, param, exception, charge, sigma, epsilon);
+        exceptionsWithOffsets.insert(exception);
+    }
+    vector<pair<int, int> > exclusions;
+    vector<int> exceptions;
+    map<int, int> exceptionIndex;
+    for (int i = 0; i < force.getNumExceptions(); i++) {
+        int particle1, particle2;
+        double chargeProd, sigma, epsilon;
+        force.getExceptionParameters(i, particle1, particle2, chargeProd, sigma, epsilon);
+        exclusions.push_back(pair<int, int>(particle1, particle2));
+        if (chargeProd != 0.0 || epsilon != 0.0 || exceptionsWithOffsets.find(i) != exceptionsWithOffsets.end()) {
+            exceptionIndex[i] = exceptions.size();
+            exceptions.push_back(i);
+        }
+    }
+    // Initialize nonbonded interactions.
+    int numParticles = force.getNumParticles();
+    vector<float4> baseParticleParamVec(cu.getPaddedNumAtoms(), make_float4(0, 0, 0, 0));
+    vector<vector<int> > exclusionList(numParticles);
+    hasCoulomb = false;
+    hasLJ = false;
+    for (int i = 0; i < numParticles; i++) {
+        double charge, sigma, epsilon;
+        force.getParticleParameters(i, charge, sigma, epsilon);
+        baseParticleParamVec[i] = make_float4(charge, sigma, epsilon, 0);
+        exclusionList[i].push_back(i);
+        if (charge != 0.0)
+            hasCoulomb = true;
+        if (epsilon != 0.0)
+            hasLJ = true;
+    }
+    for (int i = 0; i < force.getNumParticleParameterOffsets(); i++) {
+        string param;
+        int particle;
+        double charge, sigma, epsilon;
+        force.getParticleParameterOffset(i, param, particle, charge, sigma, epsilon);
+        if (charge != 0.0)
+            hasCoulomb = true;
+        if (epsilon != 0.0)
+            hasLJ = true;
+    }
+    for (auto exclusion : exclusions) {
+        exclusionList[exclusion.first].push_back(exclusion.second);
+        exclusionList[exclusion.second].push_back(exclusion.first);
+    }
+    nonbondedMethod = CalcNonbondedForceKernel::NonbondedMethod(force.getNonbondedMethod());
+    bool useCutoff = (nonbondedMethod != NoCutoff);
+    bool usePeriodic = (nonbondedMethod != NoCutoff && nonbondedMethod != CutoffNonPeriodic);
+    doLJPME = (nonbondedMethod == LJPME && hasLJ);
+    usePosqCharges = hasCoulomb ? cu.requestPosqCharges() : false;
+    map<string, string> defines;
+    defines["HAS_COULOMB"] = (hasCoulomb ? "1" : "0");
+    defines["HAS_LENNARD_JONES"] = (hasLJ ? "1" : "0");
+    defines["USE_LJ_SWITCH"] = (useCutoff && force.getUseSwitchingFunction() ? "1" : "0");
+    if (useCutoff) {
+        // Compute the reaction field constants.
+        double reactionFieldK = pow(force.getCutoffDistance(), -3.0)*(force.getReactionFieldDielectric()-1.0)/(2.0*force.getReactionFieldDielectric()+1.0);
+        double reactionFieldC = (1.0 / force.getCutoffDistance())*(3.0*force.getReactionFieldDielectric())/(2.0*force.getReactionFieldDielectric()+1.0);
+        defines["REACTION_FIELD_K"] = cu.doubleToString(reactionFieldK);
+        defines["REACTION_FIELD_C"] = cu.doubleToString(reactionFieldC);
+        // Compute the switching coefficients.
+        if (force.getUseSwitchingFunction()) {
+            defines["LJ_SWITCH_CUTOFF"] = cu.doubleToString(force.getSwitchingDistance());
+            defines["LJ_SWITCH_C3"] = cu.doubleToString(10/pow(force.getSwitchingDistance()-force.getCutoffDistance(), 3.0));
+            defines["LJ_SWITCH_C4"] = cu.doubleToString(15/pow(force.getSwitchingDistance()-force.getCutoffDistance(), 4.0));
+            defines["LJ_SWITCH_C5"] = cu.doubleToString(6/pow(force.getSwitchingDistance()-force.getCutoffDistance(), 5.0));
+        }
+    }
+    if (force.getUseDispersionCorrection() && cu.getContextIndex() == 0 && !doLJPME)
+        dispersionCoefficient = NonbondedForceImpl::calcDispersionCorrection(system, force);
+    else
+        dispersionCoefficient = 0.0;
+    alpha = 0;
+    ewaldSelfEnergy = 0.0;
+    map<string, string> paramsDefines;
+    paramsDefines["ONE_4PI_EPS0"] = cu.doubleToString(ONE_4PI_EPS0);
+    hasOffsets = (force.getNumParticleParameterOffsets() > 0 || force.getNumExceptionParameterOffsets() > 0);
+    if (hasOffsets)
+        paramsDefines["HAS_OFFSETS"] = "1";
+    if (usePosqCharges)
+        paramsDefines["USE_POSQ_CHARGES"] = "1";
+    if (nonbondedMethod == Ewald) {
+        // Compute the Ewald parameters.
+        int kmaxx, kmaxy, kmaxz;
+        NonbondedForceImpl::calcEwaldParameters(system, force, alpha, kmaxx, kmaxy, kmaxz);
+        defines["EWALD_ALPHA"] = cu.doubleToString(alpha);
+        defines["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI));
+        defines["USE_EWALD"] = "1";
+        if (cu.getContextIndex() == 0) {
+            paramsDefines["INCLUDE_EWALD"] = "1";
+            paramsDefines["EWALD_SELF_ENERGY_SCALE"] = cu.doubleToString(ONE_4PI_EPS0*alpha/sqrt(M_PI));
+            for (int i = 0; i < numParticles; i++)
+                ewaldSelfEnergy -= baseParticleParamVec[i].x*baseParticleParamVec[i].x*ONE_4PI_EPS0*alpha/sqrt(M_PI);
+            // Create the reciprocal space kernels.
+            map<string, string> replacements;
+            replacements["NUM_ATOMS"] = cu.intToString(numParticles);
+            replacements["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
+            replacements["KMAX_X"] = cu.intToString(kmaxx);
+            replacements["KMAX_Y"] = cu.intToString(kmaxy);
+            replacements["KMAX_Z"] = cu.intToString(kmaxz);
+            replacements["EXP_COEFFICIENT"] = cu.doubleToString(-1.0/(4.0*alpha*alpha));
+            replacements["ONE_4PI_EPS0"] = cu.doubleToString(ONE_4PI_EPS0);
+            replacements["M_PI"] = cu.doubleToString(M_PI);
+            hipModule_t module = cu.createModule(HipKernelSources::vectorOps+CommonKernelSources::ewald, replacements);
+            ewaldSumsKernel = cu.getKernel(module, "calculateEwaldCosSinSums");
+            ewaldForcesKernel = cu.getKernel(module, "calculateEwaldForces");
+            int elementSize = (cu.getUseDoublePrecision() ? sizeof(double2) : sizeof(float2));
+            cosSinSums.initialize(cu, (2*kmaxx-1)*(2*kmaxy-1)*(2*kmaxz-1), elementSize, "cosSinSums");
+        }
+    }
+    else if (((nonbondedMethod == PME || nonbondedMethod == LJPME) && hasCoulomb) || doLJPME) {
+        // Compute the PME parameters.
+        NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSizeX, gridSizeY, gridSizeZ, false);
+        gridSizeX = HipFFT3D::findLegalDimension(gridSizeX);
+        gridSizeY = HipFFT3D::findLegalDimension(gridSizeY);
+        gridSizeZ = HipFFT3D::findLegalDimension(gridSizeZ);
+        if (doLJPME) {
+            NonbondedForceImpl::calcPMEParameters(system, force, dispersionAlpha, dispersionGridSizeX,
+                                                  dispersionGridSizeY, dispersionGridSizeZ, true);
+            dispersionGridSizeX = HipFFT3D::findLegalDimension(dispersionGridSizeX);
+            dispersionGridSizeY = HipFFT3D::findLegalDimension(dispersionGridSizeY);
+            dispersionGridSizeZ = HipFFT3D::findLegalDimension(dispersionGridSizeZ);
+        }
+        defines["EWALD_ALPHA"] = cu.doubleToString(alpha);
+        defines["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI));
+        defines["USE_EWALD"] = "1";
+        defines["DO_LJPME"] = doLJPME ? "1" : "0";
+        if (doLJPME)
+            defines["EWALD_DISPERSION_ALPHA"] = cu.doubleToString(dispersionAlpha);
+        if (cu.getContextIndex() == 0) {
+            paramsDefines["INCLUDE_EWALD"] = "1";
+            paramsDefines["EWALD_SELF_ENERGY_SCALE"] = cu.doubleToString(ONE_4PI_EPS0*alpha/sqrt(M_PI));
+            for (int i = 0; i < numParticles; i++)
+                ewaldSelfEnergy -= baseParticleParamVec[i].x*baseParticleParamVec[i].x*ONE_4PI_EPS0*alpha/sqrt(M_PI);
+            if (doLJPME) {
+                paramsDefines["INCLUDE_LJPME"] = "1";
+                paramsDefines["LJPME_SELF_ENERGY_SCALE"] = cu.doubleToString(pow(dispersionAlpha, 6)/3.0);
+                for (int i = 0; i < numParticles; i++)
+                    ewaldSelfEnergy += baseParticleParamVec[i].z*pow(baseParticleParamVec[i].y*dispersionAlpha, 6)/3.0;
+            }
+            char deviceName[100];
+            hipDeviceGetName(deviceName, 100, cu.getDevice());
+            usePmeStream = (!cu.getPlatformData().disablePmeStream && !cu.getPlatformData().useCpuPme && string(deviceName) != "GeForce GTX 980"); // Using a separate stream is slower on GTX 980
+            map<string, string> pmeDefines;
+            pmeDefines["PME_ORDER"] = cu.intToString(PmeOrder);
+            pmeDefines["NUM_ATOMS"] = cu.intToString(numParticles);
+            pmeDefines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
+            pmeDefines["RECIP_EXP_FACTOR"] = cu.doubleToString(M_PI*M_PI/(alpha*alpha));
+            pmeDefines["GRID_SIZE_X"] = cu.intToString(gridSizeX);
+            pmeDefines["GRID_SIZE_Y"] = cu.intToString(gridSizeY);
+            pmeDefines["GRID_SIZE_Z"] = cu.intToString(gridSizeZ);
+            pmeDefines["EPSILON_FACTOR"] = cu.doubleToString(sqrt(ONE_4PI_EPS0));
+            pmeDefines["M_PI"] = cu.doubleToString(M_PI);
+            if (cu.getUseDoublePrecision() || cu.getPlatformData().deterministicForces)
+                pmeDefines["USE_FIXED_POINT_CHARGE_SPREADING"] = "1";
+            if (usePmeStream)
+                pmeDefines["USE_PME_STREAM"] = "1";
+            map<string, string> replacements;
+            replacements["CHARGE"] = (usePosqCharges ? "pos.w" : "charges[atom]");
+            hipModule_t module = cu.createModule(HipKernelSources::vectorOps+cu.replaceStrings(CommonKernelSources::pme, replacements), pmeDefines);
+            if (cu.getPlatformData().useCpuPme && !doLJPME && usePosqCharges) {
+                // Create the CPU PME kernel.
+                try {
+                    cpuPme = getPlatform().createKernel(CalcPmeReciprocalForceKernel::Name(), *cu.getPlatformData().context);
+                    cpuPme.getAs<CalcPmeReciprocalForceKernel>().initialize(gridSizeX, gridSizeY, gridSizeZ, numParticles, alpha, cu.getPlatformData().deterministicForces);
+                    hipFunction_t addForcesKernel = cu.getKernel(module, "addForces");
+                    pmeio = new PmeIO(cu, addForcesKernel);
+                    cu.addPreComputation(new PmePreComputation(cu, cpuPme, *pmeio));
+                    cu.addPostComputation(new PmePostComputation(cpuPme, *pmeio));
+                }
+                catch (OpenMMException& ex) {
+                    // The CPU PME plugin isn't available.
+                }
+            }
+            if (pmeio == NULL) {
+                pmeGridIndexKernel = cu.getKernel(module, "findAtomGridIndex");
+                pmeSpreadChargeKernel = cu.getKernel(module, "gridSpreadCharge");
+                pmeConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
+                pmeInterpolateForceKernel = cu.getKernel(module, "gridInterpolateForce");
+                pmeEvalEnergyKernel = cu.getKernel(module, "gridEvaluateEnergy");
+                pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
+                hipFuncSetCacheConfig(pmeSpreadChargeKernel, hipFuncCachePreferShared);
+                hipFuncSetCacheConfig(pmeInterpolateForceKernel, hipFuncCachePreferL1);
+                if (doLJPME) {
+                    pmeDefines["EWALD_ALPHA"] = cu.doubleToString(dispersionAlpha);
+                    pmeDefines["GRID_SIZE_X"] = cu.intToString(dispersionGridSizeX);
+                    pmeDefines["GRID_SIZE_Y"] = cu.intToString(dispersionGridSizeY);
+                    pmeDefines["GRID_SIZE_Z"] = cu.intToString(dispersionGridSizeZ);
+                    pmeDefines["RECIP_EXP_FACTOR"] = cu.doubleToString(M_PI*M_PI/(dispersionAlpha*dispersionAlpha));
+                    pmeDefines["USE_LJPME"] = "1";
+                    pmeDefines["CHARGE_FROM_SIGEPS"] = "1";
+                    if (cu.getUseDoublePrecision() || cu.getPlatformData().deterministicForces)
+                        pmeDefines["USE_FIXED_POINT_CHARGE_SPREADING"] = "1";
+                    double invRCut6 = pow(force.getCutoffDistance(), -6);
+                    double dalphaR = dispersionAlpha * force.getCutoffDistance();
+                    double dar2 = dalphaR*dalphaR;
+                    double dar4 = dar2*dar2;
+                    double multShift6 = -invRCut6*(1.0 - exp(-dar2) * (1.0 + dar2 + 0.5*dar4));
+                    defines["INVCUT6"] = cu.doubleToString(invRCut6);
+                    defines["MULTSHIFT6"] = cu.doubleToString(multShift6);
+                    module = cu.createModule(HipKernelSources::vectorOps+CommonKernelSources::pme, pmeDefines);
+                    pmeDispersionFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
+                    pmeDispersionGridIndexKernel = cu.getKernel(module, "findAtomGridIndex");
+                    pmeDispersionSpreadChargeKernel = cu.getKernel(module, "gridSpreadCharge");
+                    pmeDispersionConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
+                    pmeEvalDispersionEnergyKernel = cu.getKernel(module, "gridEvaluateEnergy");
+                    pmeInterpolateDispersionForceKernel = cu.getKernel(module, "gridInterpolateForce");
+                    hipFuncSetCacheConfig(pmeDispersionSpreadChargeKernel, hipFuncCachePreferL1);
+                }
+                // Create required data structures.
+                int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
+                int roundedZSize = PmeOrder*(int) ceil(gridSizeZ/(double) PmeOrder);
+                int gridElements = gridSizeX*gridSizeY*roundedZSize;
+                if (doLJPME) {
+                    roundedZSize = PmeOrder*(int) ceil(dispersionGridSizeZ/(double) PmeOrder);
+                    gridElements = max(gridElements, dispersionGridSizeX*dispersionGridSizeY*roundedZSize);
+                }
+                pmeGrid1.initialize(cu, gridElements, 2*elementSize, "pmeGrid1");
+                pmeGrid2.initialize(cu, gridElements, 2*elementSize, "pmeGrid2");
+                cu.addAutoclearBuffer(pmeGrid2);
+                pmeBsplineModuliX.initialize(cu, gridSizeX, elementSize, "pmeBsplineModuliX");
+                pmeBsplineModuliY.initialize(cu, gridSizeY, elementSize, "pmeBsplineModuliY");
+                pmeBsplineModuliZ.initialize(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
+                if (doLJPME) {
+                    pmeDispersionBsplineModuliX.initialize(cu, dispersionGridSizeX, elementSize, "pmeDispersionBsplineModuliX");
+                    pmeDispersionBsplineModuliY.initialize(cu, dispersionGridSizeY, elementSize, "pmeDispersionBsplineModuliY");
+                    pmeDispersionBsplineModuliZ.initialize(cu, dispersionGridSizeZ, elementSize, "pmeDispersionBsplineModuliZ");
+                }
+                pmeAtomGridIndex.initialize<int2>(cu, numParticles, "pmeAtomGridIndex");
+                int energyElementSize = (cu.getUseDoublePrecision() || cu.getUseMixedPrecision() ? sizeof(double) : sizeof(float));
+                pmeEnergyBuffer.initialize(cu, cu.getNumThreadBlocks()*HipContext::ThreadBlockSize, energyElementSize, "pmeEnergyBuffer");
+                cu.clearBuffer(pmeEnergyBuffer);
+                sort = new HipSort(cu, new SortTrait(), cu.getNumAtoms());
+                int cufftVersion;
+                hipfftGetVersion(&cufftVersion);
+                useHipFFT = (cufftVersion >= 7050); // There was a critical bug in version 7.0
+                if (useHipFFT) {
+                    hipfftResult result = hipfftPlan3d(&fftForward, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? HIPFFT_D2Z : HIPFFT_R2C);
+                    if (result != HIPFFT_SUCCESS)
+                        throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
+                    result = hipfftPlan3d(&fftBackward, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? HIPFFT_Z2D : HIPFFT_C2R);
+                    if (result != HIPFFT_SUCCESS)
+                        throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
+                    if (doLJPME) {
+                        result = hipfftPlan3d(&dispersionFftForward, dispersionGridSizeX, dispersionGridSizeY,
+                                                dispersionGridSizeZ, cu.getUseDoublePrecision() ? HIPFFT_D2Z : HIPFFT_R2C);
+                        if (result != HIPFFT_SUCCESS)
+                            throw OpenMMException("Error initializing disperison FFT: "+cu.intToString(result));
+                        result = hipfftPlan3d(&dispersionFftBackward, dispersionGridSizeX, dispersionGridSizeY,
+                                             dispersionGridSizeZ, cu.getUseDoublePrecision() ? HIPFFT_Z2D : HIPFFT_C2R);
+                        if (result != HIPFFT_SUCCESS)
+                            throw OpenMMException("Error initializing disperison FFT: "+cu.intToString(result));
+                    }
+                }
+                else {
+                    fft = new HipFFT3D(cu, gridSizeX, gridSizeY, gridSizeZ, true);
+                    if (doLJPME)
+                        dispersionFft = new HipFFT3D(cu, dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ, true);
+                }
+                // Prepare for doing PME on its own stream.
+                if (usePmeStream) {
+                    hipStreamCreateWithFlags(&pmeStream, hipStreamNonBlocking);
+                    if (useHipFFT) {
+                        hipfftSetStream(fftForward, pmeStream);
+                        hipfftSetStream(fftBackward, pmeStream);
+                        if (doLJPME) {
+                            hipfftSetStream(dispersionFftForward, pmeStream);
+                            hipfftSetStream(dispersionFftBackward, pmeStream);
+                        }
+                    }
+                    CHECK_RESULT(hipEventCreateWithFlags(&pmeSyncEvent, hipEventDisableTiming), "Error creating event for NonbondedForce");
+                    CHECK_RESULT(hipEventCreateWithFlags(&paramsSyncEvent, hipEventDisableTiming), "Error creating event for NonbondedForce");
+                    int recipForceGroup = force.getReciprocalSpaceForceGroup();
+                    if (recipForceGroup < 0)
+                        recipForceGroup = force.getForceGroup();
+                    cu.addPreComputation(new SyncStreamPreComputation(cu, pmeStream, pmeSyncEvent, recipForceGroup));
+                    cu.addPostComputation(new SyncStreamPostComputation(cu, pmeSyncEvent, cu.getKernel(module, "addEnergy"), pmeEnergyBuffer, recipForceGroup));
+                }
+                hasInitializedFFT = true;
+                // Initialize the b-spline moduli.
+                for (int grid = 0; grid < 2; grid++) {
+                    int xsize, ysize, zsize;
+                    HipArray *xmoduli, *ymoduli, *zmoduli;
+                    if (grid == 0) {
+                        xsize = gridSizeX;
+                        ysize = gridSizeY;
+                        zsize = gridSizeZ;
+                        xmoduli = &pmeBsplineModuliX;
+                        ymoduli = &pmeBsplineModuliY;
+                        zmoduli = &pmeBsplineModuliZ;
+                    }
+                    else {
+                        if (!doLJPME)
+                            continue;
+                        xsize = dispersionGridSizeX;
+                        ysize = dispersionGridSizeY;
+                        zsize = dispersionGridSizeZ;
+                        xmoduli = &pmeDispersionBsplineModuliX;
+                        ymoduli = &pmeDispersionBsplineModuliY;
+                        zmoduli = &pmeDispersionBsplineModuliZ;
+                    }
+                    int maxSize = max(max(xsize, ysize), zsize);
+                    vector<double> data(PmeOrder);
+                    vector<double> ddata(PmeOrder);
+                    vector<double> bsplines_data(maxSize);
+                    data[PmeOrder-1] = 0.0;
+                    data[1] = 0.0;
+                    data[0] = 1.0;
+                    for (int i = 3; i < PmeOrder; i++) {
+                        double div = 1.0/(i-1.0);
+                        data[i-1] = 0.0;
+                        for (int j = 1; j < (i-1); j++)
+                            data[i-j-1] = div*(j*data[i-j-2]+(i-j)*data[i-j-1]);
+                        data[0] = div*data[0];
+                    }
+                    // Differentiate.
+                    ddata[0] = -data[0];
+                    for (int i = 1; i < PmeOrder; i++)
+                        ddata[i] = data[i-1]-data[i];
+                    double div = 1.0/(PmeOrder-1);
+                    data[PmeOrder-1] = 0.0;
+                    for (int i = 1; i < (PmeOrder-1); i++)
+                        data[PmeOrder-i-1] = div*(i*data[PmeOrder-i-2]+(PmeOrder-i)*data[PmeOrder-i-1]);
+                    data[0] = div*data[0];
+                    for (int i = 0; i < maxSize; i++)
+                        bsplines_data[i] = 0.0;
+                    for (int i = 1; i <= PmeOrder; i++)
+                        bsplines_data[i] = data[i-1];
+                    // Evaluate the actual bspline moduli for X/Y/Z.
+                    for (int dim = 0; dim < 3; dim++) {
+                        int ndata = (dim == 0 ? xsize : dim == 1 ? ysize : zsize);
+                        vector<double> moduli(ndata);
+                        for (int i = 0; i < ndata; i++) {
+                            double sc = 0.0;
+                            double ss = 0.0;
+                            for (int j = 0; j < ndata; j++) {
+                                double arg = (2.0*M_PI*i*j)/ndata;
+                                sc += bsplines_data[j]*cos(arg);
+                                ss += bsplines_data[j]*sin(arg);
+                            }
+                            moduli[i] = sc*sc+ss*ss;
+                        }
+                        for (int i = 0; i < ndata; i++)
+                            if (moduli[i] < 1.0e-7)
+                                moduli[i] = (moduli[(i-1+ndata)%ndata]+moduli[(i+1)%ndata])*0.5;
+                        if (dim == 0)
+                            xmoduli->upload(moduli, true);
+                        else if (dim == 1)
+                            ymoduli->upload(moduli, true);
+                        else
+                            zmoduli->upload(moduli, true);
+                    }
+                }
+            }
+        }
+    }
+    // Add code to subtract off the reciprocal part of excluded interactions.
+    if ((nonbondedMethod == Ewald || nonbondedMethod == PME || nonbondedMethod == LJPME) && pmeio == NULL) {
+        int numContexts = cu.getPlatformData().contexts.size();
+        int startIndex = cu.getContextIndex()*force.getNumExceptions()/numContexts;
+        int endIndex = (cu.getContextIndex()+1)*force.getNumExceptions()/numContexts;
+        int numExclusions = endIndex-startIndex;
+        if (numExclusions > 0) {
+            paramsDefines["HAS_EXCLUSIONS"] = "1";
+            vector<vector<int> > atoms(numExclusions, vector<int>(2));
+            exclusionAtoms.initialize<int2>(cu, numExclusions, "exclusionAtoms");
+            exclusionParams.initialize<float4>(cu, numExclusions, "exclusionParams");
+            vector<int2> exclusionAtomsVec(numExclusions);
+            for (int i = 0; i < numExclusions; i++) {
+                int j = i+startIndex;
+                exclusionAtomsVec[i] = make_int2(exclusions[j].first, exclusions[j].second);
+                atoms[i][0] = exclusions[j].first;
+                atoms[i][1] = exclusions[j].second;
+            }
+            exclusionAtoms.upload(exclusionAtomsVec);
+            map<string, string> replacements;
+            replacements["PARAMS"] = cu.getBondedUtilities().addArgument(exclusionParams.getDevicePointer(), "float4");
+            replacements["EWALD_ALPHA"] = cu.doubleToString(alpha);
+            replacements["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI));
+            replacements["DO_LJPME"] = doLJPME ? "1" : "0";
+            replacements["USE_PERIODIC"] = force.getExceptionsUsePeriodicBoundaryConditions() ? "1" : "0";
+            if (doLJPME)
+                replacements["EWALD_DISPERSION_ALPHA"] = cu.doubleToString(dispersionAlpha);
+            cu.getBondedUtilities().addInteraction(atoms, cu.replaceStrings(CommonKernelSources::pmeExclusions, replacements), force.getForceGroup());
+        }
+    }
+    // Add the interaction to the default nonbonded kernel.
+    string source = cu.replaceStrings(CommonKernelSources::coulombLennardJones, defines);
+    charges.initialize(cu, cu.getPaddedNumAtoms(), cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float), "charges");
+    baseParticleParams.initialize<float4>(cu, cu.getPaddedNumAtoms(), "baseParticleParams");
+    baseParticleParams.upload(baseParticleParamVec);
+    map<string, string> replacements;
+    replacements["ONE_4PI_EPS0"] = cu.doubleToString(ONE_4PI_EPS0);
+    if (usePosqCharges) {
+        replacements["CHARGE1"] = "posq1.w";
+        replacements["CHARGE2"] = "posq2.w";
+    }
+    else {
+        replacements["CHARGE1"] = prefix+"charge1";
+        replacements["CHARGE2"] = prefix+"charge2";
+    }
+    if (hasCoulomb)
+        cu.getNonbondedUtilities().addParameter(HipNonbondedUtilities::ParameterInfo(prefix+"charge", "real", 1, charges.getElementSize(), charges.getDevicePointer()));
+    sigmaEpsilon.initialize<float2>(cu, cu.getPaddedNumAtoms(), "sigmaEpsilon");
+    if (hasLJ) {
+        replacements["SIGMA_EPSILON1"] = prefix+"sigmaEpsilon1";
+        replacements["SIGMA_EPSILON2"] = prefix+"sigmaEpsilon2";
+        cu.getNonbondedUtilities().addParameter(HipNonbondedUtilities::ParameterInfo(prefix+"sigmaEpsilon", "float", 2, sizeof(float2), sigmaEpsilon.getDevicePointer()));
+    }
+    source = cu.replaceStrings(source, replacements);
+    cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup(), true);
+    // Initialize the exceptions.
+    int numContexts = cu.getPlatformData().contexts.size();
+    int startIndex = cu.getContextIndex()*exceptions.size()/numContexts;
+    int endIndex = (cu.getContextIndex()+1)*exceptions.size()/numContexts;
+    int numExceptions = endIndex-startIndex;
+    if (numExceptions > 0) {
+        paramsDefines["HAS_EXCEPTIONS"] = "1";
+        exceptionAtoms.resize(numExceptions);
+        vector<vector<int> > atoms(numExceptions, vector<int>(2));
+        exceptionParams.initialize<float4>(cu, numExceptions, "exceptionParams");
+        baseExceptionParams.initialize<float4>(cu, numExceptions, "baseExceptionParams");
+        vector<float4> baseExceptionParamsVec(numExceptions);
+        for (int i = 0; i < numExceptions; i++) {
+            double chargeProd, sigma, epsilon;
+            force.getExceptionParameters(exceptions[startIndex+i], atoms[i][0], atoms[i][1], chargeProd, sigma, epsilon);
+            baseExceptionParamsVec[i] = make_float4(chargeProd, sigma, epsilon, 0);
+            exceptionAtoms[i] = make_pair(atoms[i][0], atoms[i][1]);
+        }
+        baseExceptionParams.upload(baseExceptionParamsVec);
+        map<string, string> replacements;
+        replacements["APPLY_PERIODIC"] = (usePeriodic && force.getExceptionsUsePeriodicBoundaryConditions() ? "1" : "0");
+        replacements["PARAMS"] = cu.getBondedUtilities().addArgument(exceptionParams.getDevicePointer(), "float4");
+        cu.getBondedUtilities().addInteraction(atoms, cu.replaceStrings(CommonKernelSources::nonbondedExceptions, replacements), force.getForceGroup());
+    }
+    // Initialize parameter offsets.
+    vector<vector<float4> > particleOffsetVec(force.getNumParticles());
+    vector<vector<float4> > exceptionOffsetVec(force.getNumExceptions());
+    for (int i = 0; i < force.getNumParticleParameterOffsets(); i++) {
+        string param;
+        int particle;
+        double charge, sigma, epsilon;
+        force.getParticleParameterOffset(i, param, particle, charge, sigma, epsilon);
+        auto paramPos = find(paramNames.begin(), paramNames.end(), param);
+        int paramIndex;
+        if (paramPos == paramNames.end()) {
+            paramIndex = paramNames.size();
+            paramNames.push_back(param);
+        }
+        else
+            paramIndex = paramPos-paramNames.begin();
+        particleOffsetVec[particle].push_back(make_float4(charge, sigma, epsilon, paramIndex));
+    }
+    for (int i = 0; i < force.getNumExceptionParameterOffsets(); i++) {
+        string param;
+        int exception;
+        double charge, sigma, epsilon;
+        force.getExceptionParameterOffset(i, param, exception, charge, sigma, epsilon);
+        auto paramPos = find(paramNames.begin(), paramNames.end(), param);
+        int paramIndex;
+        if (paramPos == paramNames.end()) {
+            paramIndex = paramNames.size();
+            paramNames.push_back(param);
+        }
+        else
+            paramIndex = paramPos-paramNames.begin();
+        exceptionOffsetVec[exceptionIndex[exception]].push_back(make_float4(charge, sigma, epsilon, paramIndex));
+    }
+    paramValues.resize(paramNames.size(), 0.0);
+    particleParamOffsets.initialize<float4>(cu, max(force.getNumParticleParameterOffsets(), 1), "particleParamOffsets");
+    exceptionParamOffsets.initialize<float4>(cu, max(force.getNumExceptionParameterOffsets(), 1), "exceptionParamOffsets");
+    particleOffsetIndices.initialize<int>(cu, cu.getPaddedNumAtoms()+1, "particleOffsetIndices");
+    exceptionOffsetIndices.initialize<int>(cu, force.getNumExceptions()+1, "exceptionOffsetIndices");
+    vector<int> particleOffsetIndicesVec, exceptionOffsetIndicesVec;
+    vector<float4> p, e;
+    for (int i = 0; i < particleOffsetVec.size(); i++) {
+        particleOffsetIndicesVec.push_back(p.size());
+        for (int j = 0; j < particleOffsetVec[i].size(); j++)
+            p.push_back(particleOffsetVec[i][j]);
+    }
+    while (particleOffsetIndicesVec.size() < particleOffsetIndices.getSize())
+        particleOffsetIndicesVec.push_back(p.size());
+    for (int i = 0; i < exceptionOffsetVec.size(); i++) {
+        exceptionOffsetIndicesVec.push_back(e.size());
+        for (int j = 0; j < exceptionOffsetVec[i].size(); j++)
+            e.push_back(exceptionOffsetVec[i][j]);
+    }
+    exceptionOffsetIndicesVec.push_back(e.size());
+    if (force.getNumParticleParameterOffsets() > 0) {
+        particleParamOffsets.upload(p);
+        particleOffsetIndices.upload(particleOffsetIndicesVec);
+    }
+    if (force.getNumExceptionParameterOffsets() > 0) {
+        exceptionParamOffsets.upload(e);
+        exceptionOffsetIndices.upload(exceptionOffsetIndicesVec);
+    }
+    globalParams.initialize(cu, max((int) paramValues.size(), 1), cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float), "globalParams");
+    if (paramValues.size() > 0)
+        globalParams.upload(paramValues, true);
+    recomputeParams = true;
+    // Initialize the kernel for updating parameters.
+    hipModule_t module = cu.createModule(CommonKernelSources::nonbondedParameters, paramsDefines);
+    computeParamsKernel = cu.getKernel(module, "computeParameters");
+    computeExclusionParamsKernel = cu.getKernel(module, "computeExclusionParameters");
+    info = new ForceInfo(force);
+    cu.addForce(info);
+}
+double HipCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal) {
+    // Update particle and exception parameters.
+    bool paramChanged = false;
+    for (int i = 0; i < paramNames.size(); i++) {
+        double value = context.getParameter(paramNames[i]);
+        if (value != paramValues[i]) {
+            paramValues[i] = value;;
+            paramChanged = true;
+        }
+    }
+    if (paramChanged) {
+        recomputeParams = true;
+        globalParams.upload(paramValues, true);
+    }
+    double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0);
+    if (recomputeParams || hasOffsets) {
+        bool computeSelfEnergy = (includeEnergy && includeReciprocal);
+        int numAtoms = cu.getPaddedNumAtoms();
+        vector<void*> paramsArgs = {&cu.getEnergyBuffer().getDevicePointer(), &computeSelfEnergy, &globalParams.getDevicePointer(), &numAtoms,
+                &baseParticleParams.getDevicePointer(), &cu.getPosq().getDevicePointer(), &charges.getDevicePointer(), &sigmaEpsilon.getDevicePointer(),
+                &particleParamOffsets.getDevicePointer(), &particleOffsetIndices.getDevicePointer()};
+        int numExceptions;
+        if (exceptionParams.isInitialized()) {
+            numExceptions = exceptionParams.getSize();
+            paramsArgs.push_back(&numExceptions);
+            paramsArgs.push_back(&baseExceptionParams.getDevicePointer());
+            paramsArgs.push_back(&exceptionParams.getDevicePointer());
+            paramsArgs.push_back(&exceptionParamOffsets.getDevicePointer());
+            paramsArgs.push_back(&exceptionOffsetIndices.getDevicePointer());
+        }
+        cu.executeKernel(computeParamsKernel, &paramsArgs[0], cu.getPaddedNumAtoms());
+        if (exclusionParams.isInitialized()) {
+            int numExclusions = exclusionParams.getSize();
+            vector<void*> exclusionParamsArgs = {&cu.getPosq().getDevicePointer(), &charges.getDevicePointer(), &sigmaEpsilon.getDevicePointer(),
+                    &numExclusions, &exclusionAtoms.getDevicePointer(), &exclusionParams.getDevicePointer()};
+            cu.executeKernel(computeExclusionParamsKernel, &exclusionParamsArgs[0], numExclusions);
+        }
+        if (usePmeStream) {
+            hipEventRecord(paramsSyncEvent, cu.getCurrentStream());
+            hipStreamWaitEvent(pmeStream, paramsSyncEvent, 0);
+        }
+        if (hasOffsets)
+            energy = 0.0; // The Ewald self energy was computed in the kernel.
+        recomputeParams = false;
+    }
+    // Do reciprocal space calculations.
+    if (cosSinSums.isInitialized() && includeReciprocal) {
+        void* sumsArgs[] = {&cu.getEnergyBuffer().getDevicePointer(), &cu.getPosq().getDevicePointer(), &cosSinSums.getDevicePointer(), cu.getPeriodicBoxSizePointer()};
+        cu.executeKernel(ewaldSumsKernel, sumsArgs, cosSinSums.getSize());
+        void* forcesArgs[] = {&cu.getForce().getDevicePointer(), &cu.getPosq().getDevicePointer(), &cosSinSums.getDevicePointer(), cu.getPeriodicBoxSizePointer()};
+        cu.executeKernel(ewaldForcesKernel, forcesArgs, cu.getNumAtoms());
+    }
+    if (pmeGrid1.isInitialized() && includeReciprocal) {
+        if (usePmeStream)
+            cu.setCurrentStream(pmeStream);
+        // Invert the periodic box vectors.
+        Vec3 boxVectors[3];
+        cu.getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
+        double determinant = boxVectors[0][0]*boxVectors[1][1]*boxVectors[2][2];
+        double scale = 1.0/determinant;
+        double4 recipBoxVectors[3];
+        recipBoxVectors[0] = make_double4(boxVectors[1][1]*boxVectors[2][2]*scale, 0, 0, 0);
+        recipBoxVectors[1] = make_double4(-boxVectors[1][0]*boxVectors[2][2]*scale, boxVectors[0][0]*boxVectors[2][2]*scale, 0, 0);
+        recipBoxVectors[2] = make_double4((boxVectors[1][0]*boxVectors[2][1]-boxVectors[1][1]*boxVectors[2][0])*scale, -boxVectors[0][0]*boxVectors[2][1]*scale, boxVectors[0][0]*boxVectors[1][1]*scale, 0);
+        float4 recipBoxVectorsFloat[3];
+        void* recipBoxVectorPointer[3];
+        if (cu.getUseDoublePrecision()) {
+            recipBoxVectorPointer[0] = &recipBoxVectors[0];
+            recipBoxVectorPointer[1] = &recipBoxVectors[1];
+            recipBoxVectorPointer[2] = &recipBoxVectors[2];
+        }
+        else {
+            recipBoxVectorsFloat[0] = make_float4((float) recipBoxVectors[0].x, 0, 0, 0);
+            recipBoxVectorsFloat[1] = make_float4((float) recipBoxVectors[1].x, (float) recipBoxVectors[1].y, 0, 0);
+            recipBoxVectorsFloat[2] = make_float4((float) recipBoxVectors[2].x, (float) recipBoxVectors[2].y, (float) recipBoxVectors[2].z, 0);
+            recipBoxVectorPointer[0] = &recipBoxVectorsFloat[0];
+            recipBoxVectorPointer[1] = &recipBoxVectorsFloat[1];
+            recipBoxVectorPointer[2] = &recipBoxVectorsFloat[2];
+        }
+        // Execute the reciprocal space kernels.
+        if (hasCoulomb) {
+            void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
+                    cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
+                    recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
+            cu.executeKernel(pmeGridIndexKernel, gridIndexArgs, cu.getNumAtoms());
+            sort->sort(pmeAtomGridIndex);
+            void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &pmeGrid2.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
+                    cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
+                    recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
+                    &charges.getDevicePointer()};
+            cu.executeKernel(pmeSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128);
+            void* finishSpreadArgs[] = {&pmeGrid2.getDevicePointer(), &pmeGrid1.getDevicePointer()};
+            cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
+            if (useHipFFT) {
+                if (cu.getUseDoublePrecision()) {
+                    hipfftResult result = hipfftExecD2Z(fftForward, (double*) pmeGrid1.getDevicePointer(), (double2*) pmeGrid2.getDevicePointer());
+                    if (result != HIPFFT_SUCCESS)
+                        throw OpenMMException("Error executing FFT: "+cu.intToString(result));
+                } else {
+                    hipfftResult result = hipfftExecR2C(fftForward, (float*) pmeGrid1.getDevicePointer(), (float2*) pmeGrid2.getDevicePointer());
+                    if (result != HIPFFT_SUCCESS)
+                        throw OpenMMException("Error executing FFT: "+cu.intToString(result));
+                }
+            }
+            else {
+                fft->execFFT(pmeGrid1, pmeGrid2, true);
+            }
+            if (includeEnergy) {
+                void* computeEnergyArgs[] = {&pmeGrid2.getDevicePointer(), usePmeStream ? &pmeEnergyBuffer.getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(),
+                        &pmeBsplineModuliX.getDevicePointer(), &pmeBsplineModuliY.getDevicePointer(), &pmeBsplineModuliZ.getDevicePointer(),
+                        recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
+                cu.executeKernel(pmeEvalEnergyKernel, computeEnergyArgs, gridSizeX*gridSizeY*gridSizeZ);
+            }
+            void* convolutionArgs[] = {&pmeGrid2.getDevicePointer(), &pmeBsplineModuliX.getDevicePointer(),
+                    &pmeBsplineModuliY.getDevicePointer(), &pmeBsplineModuliZ.getDevicePointer(),
+                    recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
+            cu.executeKernel(pmeConvolutionKernel, convolutionArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
+            if (useHipFFT) {
+                if (cu.getUseDoublePrecision()) {
+                    hipfftResult result = hipfftExecZ2D(fftBackward, (double2*) pmeGrid2.getDevicePointer(), (double*) pmeGrid1.getDevicePointer());
+                    if (result != HIPFFT_SUCCESS)
+                        throw OpenMMException("Error executing FFT: "+cu.intToString(result));
+                } else {
+                    hipfftResult result = hipfftExecC2R(fftBackward, (float2*) pmeGrid2.getDevicePointer(), (float*)  pmeGrid1.getDevicePointer());
+                    if (result != HIPFFT_SUCCESS)
+                        throw OpenMMException("Error executing FFT: "+cu.intToString(result));
+                }
+            }
+            else {
+                fft->execFFT(pmeGrid2, pmeGrid1, false);
+            }
+            void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &pmeGrid1.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
+                    cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
+                    recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
+                    &charges.getDevicePointer()};
+            cu.executeKernel(pmeInterpolateForceKernel, interpolateArgs, cu.getNumAtoms(), 128);
+        }
+        if (doLJPME && hasLJ) {
+            if (!hasCoulomb) {
+                void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
+                        cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
+                        recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
+                cu.executeKernel(pmeDispersionGridIndexKernel, gridIndexArgs, cu.getNumAtoms());
+                sort->sort(pmeAtomGridIndex);
+                cu.clearBuffer(pmeEnergyBuffer);
+            }
+            cu.clearBuffer(pmeGrid2);
+            void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &pmeGrid2.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
+                    cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
+                    recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
+                    &sigmaEpsilon.getDevicePointer()};
+            cu.executeKernel(pmeDispersionSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128);
+            void* finishSpreadArgs[] = {&pmeGrid2.getDevicePointer(), &pmeGrid1.getDevicePointer()};
+            cu.executeKernel(pmeDispersionFinishSpreadChargeKernel, finishSpreadArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
+            if (useHipFFT) {
+                if (cu.getUseDoublePrecision()) {
+                    hipfftResult result = hipfftExecD2Z(dispersionFftForward, (double*) pmeGrid1.getDevicePointer(), (double2*) pmeGrid2.getDevicePointer());
+                    if (result != HIPFFT_SUCCESS)
+                        throw OpenMMException("Error executing FFT: "+cu.intToString(result));
+                } else {
+                    hipfftResult result = hipfftExecR2C(dispersionFftForward, (float*) pmeGrid1.getDevicePointer(), (float2*) pmeGrid2.getDevicePointer());
+                    if (result != HIPFFT_SUCCESS)
+                        throw OpenMMException("Error executing FFT: "+cu.intToString(result));
+                }
+            }
+            else {
+                dispersionFft->execFFT(pmeGrid1, pmeGrid2, true);
+            }
+            if (includeEnergy) {
+                void* computeEnergyArgs[] = {&pmeGrid2.getDevicePointer(), usePmeStream ? &pmeEnergyBuffer.getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(),
+                        &pmeDispersionBsplineModuliX.getDevicePointer(), &pmeDispersionBsplineModuliY.getDevicePointer(), &pmeDispersionBsplineModuliZ.getDevicePointer(),
+                        recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
+                cu.executeKernel(pmeEvalDispersionEnergyKernel, computeEnergyArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ);
+            }
+            void* convolutionArgs[] = {&pmeGrid2.getDevicePointer(), &pmeDispersionBsplineModuliX.getDevicePointer(),
+                    &pmeDispersionBsplineModuliY.getDevicePointer(), &pmeDispersionBsplineModuliZ.getDevicePointer(),
+                    recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
+            cu.executeKernel(pmeDispersionConvolutionKernel, convolutionArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
+            if (useHipFFT) {
+                if (cu.getUseDoublePrecision()) {
+                    hipfftResult result = hipfftExecZ2D(dispersionFftBackward, (double2*) pmeGrid2.getDevicePointer(), (double*) pmeGrid1.getDevicePointer());
+                    if (result != HIPFFT_SUCCESS)
+                        throw OpenMMException("Error executing FFT: "+cu.intToString(result));
+                } else {
+                    hipfftResult result = hipfftExecC2R(dispersionFftBackward, (float2*) pmeGrid2.getDevicePointer(), (float*)  pmeGrid1.getDevicePointer());
+                    if (result != HIPFFT_SUCCESS)
+                        throw OpenMMException("Error executing FFT: "+cu.intToString(result));
+                }
+            }
+            else {
+                dispersionFft->execFFT(pmeGrid2, pmeGrid1, false);
+            }
+            void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &pmeGrid1.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
+                    cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
+                    recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
+                    &sigmaEpsilon.getDevicePointer()};
+            cu.executeKernel(pmeInterpolateDispersionForceKernel, interpolateArgs, cu.getNumAtoms(), 128);
+        }
+        if (usePmeStream) {
+            hipEventRecord(pmeSyncEvent, pmeStream);
+            cu.restoreDefaultStream();
+        }
+    }
+    if (dispersionCoefficient != 0.0 && includeDirect) {
+        double4 boxSize = cu.getPeriodicBoxSize();
+        energy += dispersionCoefficient/(boxSize.x*boxSize.y*boxSize.z);
+    }
+    return energy;
+}
+void HipCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context, const NonbondedForce& force) {
+    // Make sure the new parameters are acceptable.
+    cu.setAsCurrent();
+    if (force.getNumParticles() != cu.getNumAtoms())
+        throw OpenMMException("updateParametersInContext: The number of particles has changed");
+    if (!hasCoulomb || !hasLJ) {
+        for (int i = 0; i < force.getNumParticles(); i++) {
+            double charge, sigma, epsilon;
+            force.getParticleParameters(i, charge, sigma, epsilon);
+            if (!hasCoulomb && charge != 0.0)
+                throw OpenMMException("updateParametersInContext: The nonbonded force kernel does not include Coulomb interactions, because all charges were originally 0");
+            if (!hasLJ && epsilon != 0.0)
+                throw OpenMMException("updateParametersInContext: The nonbonded force kernel does not include Lennard-Jones interactions, because all epsilons were originally 0");
+        }
+    }
+    vector<int> exceptions;
+    for (int i = 0; i < force.getNumExceptions(); i++) {
+        int particle1, particle2;
+        double chargeProd, sigma, epsilon;
+        force.getExceptionParameters(i, particle1, particle2, chargeProd, sigma, epsilon);
+        if (exceptionAtoms.size() > exceptions.size() && make_pair(particle1, particle2) == exceptionAtoms[exceptions.size()])
+            exceptions.push_back(i);
+        else if (chargeProd != 0.0 || epsilon != 0.0)
+            throw OpenMMException("updateParametersInContext: The set of non-excluded exceptions has changed");
+    }
+    int numContexts = cu.getPlatformData().contexts.size();
+    int startIndex = cu.getContextIndex()*exceptions.size()/numContexts;
+    int endIndex = (cu.getContextIndex()+1)*exceptions.size()/numContexts;
+    int numExceptions = endIndex-startIndex;
+    // Record the per-particle parameters.
+    vector<float4> baseParticleParamVec(cu.getPaddedNumAtoms(), make_float4(0, 0, 0, 0));
+    const vector<int>& order = cu.getAtomIndex();
+    for (int i = 0; i < force.getNumParticles(); i++) {
+        double charge, sigma, epsilon;
+        force.getParticleParameters(i, charge, sigma, epsilon);
+        baseParticleParamVec[i] = make_float4(charge, sigma, epsilon, 0);
+    }
+    baseParticleParams.upload(baseParticleParamVec);
+    // Record the exceptions.
+    if (numExceptions > 0) {
+        vector<vector<int> > atoms(numExceptions, vector<int>(2));
+        vector<float4> baseExceptionParamsVec(numExceptions);
+        for (int i = 0; i < numExceptions; i++) {
+            double chargeProd, sigma, epsilon;
+            force.getExceptionParameters(exceptions[startIndex+i], atoms[i][0], atoms[i][1], chargeProd, sigma, epsilon);
+            baseExceptionParamsVec[i] = make_float4(chargeProd, sigma, epsilon, 0);
+        }
+        baseExceptionParams.upload(baseExceptionParamsVec);
+    }
+    // Compute other values.
+    ewaldSelfEnergy = 0.0;
+    if (nonbondedMethod == Ewald || nonbondedMethod == PME || nonbondedMethod == LJPME) {
+        if (cu.getContextIndex() == 0) {
+            for (int i = 0; i < force.getNumParticles(); i++) {
+                ewaldSelfEnergy -= baseParticleParamVec[i].x*baseParticleParamVec[i].x*ONE_4PI_EPS0*alpha/sqrt(M_PI);
+                if (doLJPME)
+                    ewaldSelfEnergy += baseParticleParamVec[i].z*pow(baseParticleParamVec[i].y*dispersionAlpha, 6)/3.0;
+            }
+        }
+    }
+    if (force.getUseDispersionCorrection() && cu.getContextIndex() == 0 && (nonbondedMethod == CutoffPeriodic || nonbondedMethod == Ewald || nonbondedMethod == PME))
+        dispersionCoefficient = NonbondedForceImpl::calcDispersionCorrection(context.getSystem(), force);
+    cu.invalidateMolecules();
+    recomputeParams = true;
+}
+void HipCalcNonbondedForceKernel::getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
+    if (nonbondedMethod != PME)
+        throw OpenMMException("getPMEParametersInContext: This Context is not using PME");
+    if (cu.getPlatformData().useCpuPme)
+        cpuPme.getAs<CalcPmeReciprocalForceKernel>().getPMEParameters(alpha, nx, ny, nz);
+    else {
+        alpha = this->alpha;
+        nx = gridSizeX;
+        ny = gridSizeY;
+        nz = gridSizeZ;
+    }
+}
+void HipCalcNonbondedForceKernel::getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
+    if (!doLJPME)
+        throw OpenMMException("getPMEParametersInContext: This Context is not using PME");
+    if (cu.getPlatformData().useCpuPme)
+        //cpuPme.getAs<CalcPmeReciprocalForceKernel>().getLJPMEParameters(alpha, nx, ny, nz);
+        throw OpenMMException("getPMEParametersInContext: CPUPME has not been implemented for LJPME yet.");
+    else {
+        alpha = this->dispersionAlpha;
+        nx = dispersionGridSizeX;
+        ny = dispersionGridSizeY;
+        nz = dispersionGridSizeZ;
+    }
+}
--- a/platforms/hip/src/HipNonbondedUtilities.cpp
+++ b/platforms/hip/src/HipNonbondedUtilities.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009-2018 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "openmm/OpenMMException.h"
+#include "HipNonbondedUtilities.h"
+#include "HipArray.h"
+#include "HipContext.h"
+#include "HipKernelSources.h"
+#include "HipExpressionUtilities.h"
+#include "HipSort.h"
+#include <algorithm>
+#include <map>
+#include <set>
+#include <utility>
+using namespace OpenMM;
+using namespace std;
+#define CHECK_RESULT(result) \
+    if (result != hipSuccess) { \
+        std::stringstream m; \
+        m<<errorMessage<<": "<<context.getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
+        throw OpenMMException(m.str());\
+    }
+class HipNonbondedUtilities::BlockSortTrait : public HipSort::SortTrait {
+public:
+    BlockSortTrait(bool useDouble) : useDouble(useDouble) {
+    }
+    int getDataSize() const {return useDouble ? sizeof(double2) : sizeof(float2);}
+    int getKeySize() const {return useDouble ? sizeof(double) : sizeof(float);}
+    const char* getDataType() const {return "real2";}
+    const char* getKeyType() const {return "real";}
+    const char* getMinKey() const {return "-3.40282e+38f";}
+    const char* getMaxKey() const {return "3.40282e+38f";}
+    const char* getMaxValue() const {return "make_real2(3.40282e+38f, 3.40282e+38f)";}
+    const char* getSortKey() const {return "value.x";}
+private:
+    bool useDouble;
+};
+HipNonbondedUtilities::HipNonbondedUtilities(HipContext& context) : context(context), useCutoff(false), usePeriodic(false), anyExclusions(false), usePadding(true),
+        blockSorter(NULL), pinnedCountBuffer(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0), canUsePairList(true) {
+    // Decide how many thread blocks to use.
+    string errorMessage = "Error initializing nonbonded utilities";
+    int multiprocessors;
+    CHECK_RESULT(hipDeviceGetAttribute(&multiprocessors, hipDeviceAttributeMultiprocessorCount, context.getDevice()));
+    CHECK_RESULT(hipEventCreateWithFlags(&downloadCountEvent, 0));
+    CHECK_RESULT(hipHostMalloc((void**) &pinnedCountBuffer, 2*sizeof(int), hipHostMallocPortable));
+    numForceThreadBlocks = 4*multiprocessors;
+    forceThreadBlockSize = (256);
+    setKernelSource(HipKernelSources::nonbonded);
+}
+HipNonbondedUtilities::~HipNonbondedUtilities() {
+    if (blockSorter != NULL)
+        delete blockSorter;
+    if (pinnedCountBuffer != NULL)
+        hipHostFree(pinnedCountBuffer);
+    hipEventDestroy(downloadCountEvent);
+}
+void HipNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup) {
+    addInteraction(usesCutoff, usesPeriodic, usesExclusions, cutoffDistance, exclusionList, kernel, forceGroup, false);
+}
+void HipNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup, bool supportsPairList) {
+    if (groupCutoff.size() > 0) {
+        if (usesCutoff != useCutoff)
+            throw OpenMMException("All Forces must agree on whether to use a cutoff");
+        if (usesPeriodic != usePeriodic)
+            throw OpenMMException("All Forces must agree on whether to use periodic boundary conditions");
+        if (usesCutoff && groupCutoff.find(forceGroup) != groupCutoff.end() && groupCutoff[forceGroup] != cutoffDistance)
+            throw OpenMMException("All Forces in a single force group must use the same cutoff distance");
+    }
+    if (usesExclusions)
+        requestExclusions(exclusionList);
+    useCutoff = usesCutoff;
+    usePeriodic = usesPeriodic;
+    groupCutoff[forceGroup] = cutoffDistance;
+    groupFlags |= 1<<forceGroup;
+    canUsePairList &= supportsPairList;
+    if (kernel.size() > 0) {
+        if (groupKernelSource.find(forceGroup) == groupKernelSource.end())
+            groupKernelSource[forceGroup] = "";
+        map<string, string> replacements;
+        replacements["CUTOFF"] = "CUTOFF_"+context.intToString(forceGroup);
+        replacements["CUTOFF_SQUARED"] = "CUTOFF_"+context.intToString(forceGroup)+"_SQUARED";
+        groupKernelSource[forceGroup] += context.replaceStrings(kernel, replacements)+"\n";
+    }
+}
+void HipNonbondedUtilities::addParameter(ComputeParameterInfo parameter) {
+    parameters.push_back(ParameterInfo(parameter.getName(), parameter.getComponentType(), parameter.getNumComponents(),
+            parameter.getSize(), context.unwrap(parameter.getArray()).getDevicePointer(), parameter.isConstant()));
+}
+void HipNonbondedUtilities::addParameter(const ParameterInfo& parameter) {
+    parameters.push_back(parameter);
+}
+void HipNonbondedUtilities::addArgument(ComputeParameterInfo parameter) {
+    arguments.push_back(ParameterInfo(parameter.getName(), parameter.getComponentType(), parameter.getNumComponents(),
+            parameter.getSize(), context.unwrap(parameter.getArray()).getDevicePointer(), parameter.isConstant()));
+}
+void HipNonbondedUtilities::addArgument(const ParameterInfo& parameter) {
+    arguments.push_back(parameter);
+}
+string HipNonbondedUtilities::addEnergyParameterDerivative(const string& param) {
+    // See if the parameter has already been added.
+    int index;
+    for (index = 0; index < energyParameterDerivatives.size(); index++)
+        if (param == energyParameterDerivatives[index])
+            break;
+    if (index == energyParameterDerivatives.size())
+        energyParameterDerivatives.push_back(param);
+    context.addEnergyParameterDerivative(param);
+    return string("energyParamDeriv")+context.intToString(index);
+}
+void HipNonbondedUtilities::requestExclusions(const vector<vector<int> >& exclusionList) {
+    if (anyExclusions) {
+        bool sameExclusions = (exclusionList.size() == atomExclusions.size());
+        for (int i = 0; i < (int) exclusionList.size() && sameExclusions; i++) {
+             if (exclusionList[i].size() != atomExclusions[i].size())
+                 sameExclusions = false;
+            set<int> expectedExclusions;
+            expectedExclusions.insert(atomExclusions[i].begin(), atomExclusions[i].end());
+            for (int j = 0; j < (int) exclusionList[i].size(); j++)
+                if (expectedExclusions.find(exclusionList[i][j]) == expectedExclusions.end())
+                     sameExclusions = false;
+        }
+        if (!sameExclusions)
+            throw OpenMMException("All Forces must have identical exceptions");
+    }
+    else {
+        atomExclusions = exclusionList;
+        anyExclusions = true;
+    }
+}
+static bool compareInt2(int2 a, int2 b) {
+    return ((a.y < b.y) || (a.y == b.y && a.x < b.x));
+}
+void HipNonbondedUtilities::initialize(const System& system) {
+    string errorMessage = "Error initializing nonbonded utilities";
+    if (atomExclusions.size() == 0) {
+        // No exclusions were specifically requested, so just mark every atom as not interacting with itself.
+        atomExclusions.resize(context.getNumAtoms());
+        for (int i = 0; i < (int) atomExclusions.size(); i++)
+            atomExclusions[i].push_back(i);
+    }
+    // Create the list of tiles.
+    numAtoms = context.getNumAtoms();
+    int numAtomBlocks = context.getNumAtomBlocks();
+    int numContexts = context.getPlatformData().contexts.size();
+    setAtomBlockRange(context.getContextIndex()/(double) numContexts, (context.getContextIndex()+1)/(double) numContexts);
+    // Build a list of tiles that contain exclusions.
+    set<pair<int, int> > tilesWithExclusions;
+    for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) {
+        int x = atom1/HipContext::TileSize;
+        for (int j = 0; j < (int) atomExclusions[atom1].size(); ++j) {
+            int atom2 = atomExclusions[atom1][j];
+            int y = atom2/HipContext::TileSize;
+            tilesWithExclusions.insert(make_pair(max(x, y), min(x, y)));
+        }
+    }
+    vector<int2> exclusionTilesVec;
+    for (set<pair<int, int> >::const_iterator iter = tilesWithExclusions.begin(); iter != tilesWithExclusions.end(); ++iter)
+        exclusionTilesVec.push_back(make_int2(iter->first, iter->second));
+    sort(exclusionTilesVec.begin(), exclusionTilesVec.end(), compareInt2);
+    exclusionTiles.initialize<int2>(context, exclusionTilesVec.size(), "exclusionTiles");
+    exclusionTiles.upload(exclusionTilesVec);
+    map<pair<int, int>, int> exclusionTileMap;
+    for (int i = 0; i < (int) exclusionTilesVec.size(); i++) {
+        int2 tile = exclusionTilesVec[i];
+        exclusionTileMap[make_pair(tile.x, tile.y)] = i;
+    }
+    vector<vector<int> > exclusionBlocksForBlock(numAtomBlocks);
+    for (set<pair<int, int> >::const_iterator iter = tilesWithExclusions.begin(); iter != tilesWithExclusions.end(); ++iter) {
+        exclusionBlocksForBlock[iter->first].push_back(iter->second);
+        if (iter->first != iter->second)
+            exclusionBlocksForBlock[iter->second].push_back(iter->first);
+    }
+    vector<unsigned int> exclusionRowIndicesVec(numAtomBlocks+1, 0);
+    vector<unsigned int> exclusionIndicesVec;
+    for (int i = 0; i < numAtomBlocks; i++) {
+        exclusionIndicesVec.insert(exclusionIndicesVec.end(), exclusionBlocksForBlock[i].begin(), exclusionBlocksForBlock[i].end());
+        exclusionRowIndicesVec[i+1] = exclusionIndicesVec.size();
+    }
+    maxExclusions = 0;
+    for (int i = 0; i < (int) exclusionBlocksForBlock.size(); i++)
+        maxExclusions = (maxExclusions > exclusionBlocksForBlock[i].size() ? maxExclusions : exclusionBlocksForBlock[i].size());
+    exclusionIndices.initialize<unsigned int>(context, exclusionIndicesVec.size(), "exclusionIndices");
+    exclusionRowIndices.initialize<unsigned int>(context, exclusionRowIndicesVec.size(), "exclusionRowIndices");
+    exclusionIndices.upload(exclusionIndicesVec);
+    exclusionRowIndices.upload(exclusionRowIndicesVec);
+    // Record the exclusion data.
+    exclusions.initialize<tileflags>(context, tilesWithExclusions.size()*HipContext::TileSize, "exclusions");
+    tileflags allFlags = static_cast<tileflags>(-1);
+    vector<tileflags> exclusionVec(exclusions.getSize(), allFlags);
+    for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) {
+        int x = atom1/HipContext::TileSize;
+        int offset1 = atom1-x*HipContext::TileSize;
+        for (int j = 0; j < (int) atomExclusions[atom1].size(); ++j) {
+            int atom2 = atomExclusions[atom1][j];
+            int y = atom2/HipContext::TileSize;
+            int offset2 = atom2-y*HipContext::TileSize;
+            if (x > y) {
+                int index = exclusionTileMap[make_pair(x, y)]*HipContext::TileSize;
+                exclusionVec[index+offset1] &= allFlags-(static_cast<tileflags>(1)<<offset2);
+            }
+            else {
+                int index = exclusionTileMap[make_pair(y, x)]*HipContext::TileSize;
+                exclusionVec[index+offset2] &= allFlags-(static_cast<tileflags>(1)<<offset1);
+            }
+        }
+    }
+    atomExclusions.clear(); // We won't use this again, so free the memory it used
+    exclusions.upload(exclusionVec);
+    // Create data structures for the neighbor list.
+    if (useCutoff) {
+        // Select a size for the arrays that hold the neighbor list.  We have to make a fairly
+        // arbitrary guess, but if this turns out to be too small we'll increase it later.
+        maxTiles = 20*numAtomBlocks;
+        if (maxTiles > numTiles)
+            maxTiles = numTiles;
+        if (maxTiles < 1)
+            maxTiles = 1;
+        maxSinglePairs = 5*numAtoms;
+        interactingTiles.initialize<int>(context, maxTiles, "interactingTiles");
+        interactingAtoms.initialize<int>(context, HipContext::TileSize*maxTiles, "interactingAtoms");
+        interactionCount.initialize<unsigned int>(context, 2, "interactionCount");
+        singlePairs.initialize<int2>(context, maxSinglePairs, "singlePairs");
+        int elementSize = (context.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
+        blockCenter.initialize(context, numAtomBlocks, 4*elementSize, "blockCenter");
+        blockBoundingBox.initialize(context, numAtomBlocks, 4*elementSize, "blockBoundingBox");
+        sortedBlocks.initialize(context, numAtomBlocks, 2*elementSize, "sortedBlocks");
+        sortedBlockCenter.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockCenter");
+        sortedBlockBoundingBox.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockBoundingBox");
+        oldPositions.initialize(context, numAtoms, 4*elementSize, "oldPositions");
+        rebuildNeighborList.initialize<int>(context, 1, "rebuildNeighborList");
+        blockSorter = new HipSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks);
+        vector<unsigned int> count(2, 0);
+        interactionCount.upload(count);
+        rebuildNeighborList.upload(&count[0]);
+    }
+    // Record arguments for kernels.
+    forceArgs.push_back(&context.getForce().getDevicePointer());
+    forceArgs.push_back(&context.getEnergyBuffer().getDevicePointer());
+    forceArgs.push_back(&context.getPosq().getDevicePointer());
+    forceArgs.push_back(&exclusions.getDevicePointer());
+    forceArgs.push_back(&exclusionTiles.getDevicePointer());
+    forceArgs.push_back(&startTileIndex);
+    forceArgs.push_back(&numTiles);
+    if (useCutoff) {
+        forceArgs.push_back(&interactingTiles.getDevicePointer());
+        forceArgs.push_back(&interactionCount.getDevicePointer());
+        forceArgs.push_back(context.getPeriodicBoxSizePointer());
+        forceArgs.push_back(context.getInvPeriodicBoxSizePointer());
+        forceArgs.push_back(context.getPeriodicBoxVecXPointer());
+        forceArgs.push_back(context.getPeriodicBoxVecYPointer());
+        forceArgs.push_back(context.getPeriodicBoxVecZPointer());
+        forceArgs.push_back(&maxTiles);
+        forceArgs.push_back(&blockCenter.getDevicePointer());
+        forceArgs.push_back(&blockBoundingBox.getDevicePointer());
+        forceArgs.push_back(&interactingAtoms.getDevicePointer());
+        forceArgs.push_back(&maxSinglePairs);
+        forceArgs.push_back(&singlePairs.getDevicePointer());
+    }
+    for (int i = 0; i < (int) parameters.size(); i++)
+        forceArgs.push_back(&parameters[i].getMemory());
+    for (ParameterInfo& arg : arguments)
+        forceArgs.push_back(&arg.getMemory());
+    if (energyParameterDerivatives.size() > 0)
+        forceArgs.push_back(&context.getEnergyParamDerivBuffer().getDevicePointer());
+    if (useCutoff) {
+        findBlockBoundsArgs.push_back(&numAtoms);
+        findBlockBoundsArgs.push_back(context.getPeriodicBoxSizePointer());
+        findBlockBoundsArgs.push_back(context.getInvPeriodicBoxSizePointer());
+        findBlockBoundsArgs.push_back(context.getPeriodicBoxVecXPointer());
+        findBlockBoundsArgs.push_back(context.getPeriodicBoxVecYPointer());
+        findBlockBoundsArgs.push_back(context.getPeriodicBoxVecZPointer());
+        findBlockBoundsArgs.push_back(&context.getPosq().getDevicePointer());
+        findBlockBoundsArgs.push_back(&blockCenter.getDevicePointer());
+        findBlockBoundsArgs.push_back(&blockBoundingBox.getDevicePointer());
+        findBlockBoundsArgs.push_back(&rebuildNeighborList.getDevicePointer());
+        findBlockBoundsArgs.push_back(&sortedBlocks.getDevicePointer());
+        sortBoxDataArgs.push_back(&sortedBlocks.getDevicePointer());
+        sortBoxDataArgs.push_back(&blockCenter.getDevicePointer());
+        sortBoxDataArgs.push_back(&blockBoundingBox.getDevicePointer());
+        sortBoxDataArgs.push_back(&sortedBlockCenter.getDevicePointer());
+        sortBoxDataArgs.push_back(&sortedBlockBoundingBox.getDevicePointer());
+        sortBoxDataArgs.push_back(&context.getPosq().getDevicePointer());
+        sortBoxDataArgs.push_back(&oldPositions.getDevicePointer());
+        sortBoxDataArgs.push_back(&interactionCount.getDevicePointer());
+        sortBoxDataArgs.push_back(&rebuildNeighborList.getDevicePointer());
+        sortBoxDataArgs.push_back(&forceRebuildNeighborList);
+        findInteractingBlocksArgs.push_back(context.getPeriodicBoxSizePointer());
+        findInteractingBlocksArgs.push_back(context.getInvPeriodicBoxSizePointer());
+        findInteractingBlocksArgs.push_back(context.getPeriodicBoxVecXPointer());
+        findInteractingBlocksArgs.push_back(context.getPeriodicBoxVecYPointer());
+        findInteractingBlocksArgs.push_back(context.getPeriodicBoxVecZPointer());
+        findInteractingBlocksArgs.push_back(&interactionCount.getDevicePointer());
+        findInteractingBlocksArgs.push_back(&interactingTiles.getDevicePointer());
+        findInteractingBlocksArgs.push_back(&interactingAtoms.getDevicePointer());
+        findInteractingBlocksArgs.push_back(&singlePairs.getDevicePointer());
+        findInteractingBlocksArgs.push_back(&context.getPosq().getDevicePointer());
+        findInteractingBlocksArgs.push_back(&maxTiles);
+        findInteractingBlocksArgs.push_back(&maxSinglePairs);
+        findInteractingBlocksArgs.push_back(&startBlockIndex);
+        findInteractingBlocksArgs.push_back(&numBlocks);
+        findInteractingBlocksArgs.push_back(&sortedBlocks.getDevicePointer());
+        findInteractingBlocksArgs.push_back(&sortedBlockCenter.getDevicePointer());
+        findInteractingBlocksArgs.push_back(&sortedBlockBoundingBox.getDevicePointer());
+        findInteractingBlocksArgs.push_back(&exclusionIndices.getDevicePointer());
+        findInteractingBlocksArgs.push_back(&exclusionRowIndices.getDevicePointer());
+        findInteractingBlocksArgs.push_back(&oldPositions.getDevicePointer());
+        findInteractingBlocksArgs.push_back(&rebuildNeighborList.getDevicePointer());
+    }
+}
+double HipNonbondedUtilities::getMaxCutoffDistance() {
+    double cutoff = 0.0;
+    for (map<int, double>::const_iterator iter = groupCutoff.begin(); iter != groupCutoff.end(); ++iter)
+        cutoff = max(cutoff, iter->second);
+    return cutoff;
+}
+double HipNonbondedUtilities::padCutoff(double cutoff) {
+    double padding = (usePadding ? 0.08*cutoff : 0.0);
+    return cutoff+padding;
+}
+void HipNonbondedUtilities::prepareInteractions(int forceGroups) {
+    if ((forceGroups&groupFlags) == 0)
+        return;
+    if (groupKernels.find(forceGroups) == groupKernels.end())
+        createKernelsForGroups(forceGroups);
+    if (!useCutoff)
+        return;
+    if (numTiles == 0)
+        return;
+    KernelSet& kernels = groupKernels[forceGroups];
+    if (usePeriodic) {
+        double4 box = context.getPeriodicBoxSize();
+        double minAllowedSize = 1.999999*kernels.cutoffDistance;
+        if (box.x < minAllowedSize || box.y < minAllowedSize || box.z < minAllowedSize)
+            throw OpenMMException("The periodic box size has decreased to less than twice the nonbonded cutoff.");
+    }
+    // Compute the neighbor list.
+    if (lastCutoff != kernels.cutoffDistance)
+        forceRebuildNeighborList = true;
+    context.executeKernel(kernels.findBlockBoundsKernel, &findBlockBoundsArgs[0], context.getNumAtoms());
+    blockSorter->sort(sortedBlocks);
+    context.executeKernel(kernels.sortBoxDataKernel, &sortBoxDataArgs[0], context.getNumAtoms());
+    context.executeKernel(kernels.findInteractingBlocksKernel, &findInteractingBlocksArgs[0], context.getNumAtoms(), 256);
+    forceRebuildNeighborList = false;
+    lastCutoff = kernels.cutoffDistance;
+    interactionCount.download(pinnedCountBuffer, false);
+    hipEventRecord(downloadCountEvent, context.getCurrentStream());
+}
+void HipNonbondedUtilities::computeInteractions(int forceGroups, bool includeForces, bool includeEnergy) {
+    if ((forceGroups&groupFlags) == 0)
+        return;
+    KernelSet& kernels = groupKernels[forceGroups];
+    if (kernels.hasForces) {
+        hipFunction_t& kernel = (includeForces ? (includeEnergy ? kernels.forceEnergyKernel : kernels.forceKernel) : kernels.energyKernel);
+        if (kernel == NULL)
+            kernel = createInteractionKernel(kernels.source, parameters, arguments, true, true, forceGroups, includeForces, includeEnergy);
+        context.executeKernel(kernel, &forceArgs[0], numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
+    }
+    if (useCutoff && numTiles > 0) {
+        hipEventSynchronize(downloadCountEvent);
+        updateNeighborListSize();
+    }
+}
+bool HipNonbondedUtilities::updateNeighborListSize() {
+    if (!useCutoff)
+        return false;
+    if (pinnedCountBuffer[0] <= maxTiles && pinnedCountBuffer[1] <= maxSinglePairs)
+        return false;
+    // The most recent timestep had too many interactions to fit in the arrays.  Make the arrays bigger to prevent
+    // this from happening in the future.
+    if (pinnedCountBuffer[0] > maxTiles) {
+        maxTiles = (int) (1.2*pinnedCountBuffer[0]);
+        int totalTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2;
+        if (maxTiles > totalTiles)
+            maxTiles = totalTiles;
+        interactingTiles.resize(maxTiles);
+        interactingAtoms.resize(HipContext::TileSize*maxTiles);
+        if (forceArgs.size() > 0)
+            forceArgs[7] = &interactingTiles.getDevicePointer();
+        findInteractingBlocksArgs[6] = &interactingTiles.getDevicePointer();
+        if (forceArgs.size() > 0)
+            forceArgs[17] = &interactingAtoms.getDevicePointer();
+        findInteractingBlocksArgs[7] = &interactingAtoms.getDevicePointer();
+    }
+    if (pinnedCountBuffer[1] > maxSinglePairs) {
+        maxSinglePairs = (int) (1.2*pinnedCountBuffer[1]);
+        singlePairs.resize(maxSinglePairs);
+        if (forceArgs.size() > 0)
+            forceArgs[19] = &singlePairs.getDevicePointer();
+        findInteractingBlocksArgs[8] = &singlePairs.getDevicePointer();
+    }
+    forceRebuildNeighborList = true;
+    context.setForcesValid(false);
+    return true;
+}
+void HipNonbondedUtilities::setUsePadding(bool padding) {
+    usePadding = padding;
+}
+void HipNonbondedUtilities::setAtomBlockRange(double startFraction, double endFraction) {
+    int numAtomBlocks = context.getNumAtomBlocks();
+    startBlockIndex = (int) (startFraction*numAtomBlocks);
+    numBlocks = (int) (endFraction*numAtomBlocks)-startBlockIndex;
+    long long totalTiles = context.getNumAtomBlocks()*((long long)context.getNumAtomBlocks()+1)/2;
+    startTileIndex = (int) (startFraction*totalTiles);
+    numTiles = (long long) (endFraction*totalTiles)-startTileIndex;
+    forceRebuildNeighborList = true;
+}
+void HipNonbondedUtilities::createKernelsForGroups(int groups) {
+    KernelSet kernels;
+    double cutoff = 0.0;
+    string source;
+    for (int i = 0; i < 32; i++) {
+        if ((groups&(1<<i)) != 0) {
+            cutoff = max(cutoff, groupCutoff[i]);
+            source += groupKernelSource[i];
+        }
+    }
+    kernels.hasForces = (source.size() > 0);
+    kernels.cutoffDistance = cutoff;
+    kernels.source = source;
+    kernels.forceKernel = kernels.energyKernel = kernels.forceEnergyKernel = NULL;
+    if (useCutoff) {
+        double paddedCutoff = padCutoff(cutoff);
+        map<string, string> defines;
+        defines["TILE_SIZE"] = context.intToString(HipContext::TileSize);
+        defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
+        defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms());
+        defines["PADDING"] = context.doubleToString(paddedCutoff-cutoff);
+        defines["PADDED_CUTOFF"] = context.doubleToString(paddedCutoff);
+        defines["PADDED_CUTOFF_SQUARED"] = context.doubleToString(paddedCutoff*paddedCutoff);
+        defines["NUM_TILES_WITH_EXCLUSIONS"] = context.intToString(exclusionTiles.getSize());
+        if (usePeriodic)
+            defines["USE_PERIODIC"] = "1";
+        if (context.getBoxIsTriclinic())
+            defines["TRICLINIC"] = "1";
+        defines["MAX_EXCLUSIONS"] = context.intToString(maxExclusions);
+        defines["MAX_BITS_FOR_PAIRS"] = (canUsePairList ? "2" : "0");
+        hipModule_t interactingBlocksProgram = context.createModule(HipKernelSources::vectorOps+HipKernelSources::findInteractingBlocks, defines);
+        kernels.findBlockBoundsKernel = context.getKernel(interactingBlocksProgram, "findBlockBounds");
+        kernels.sortBoxDataKernel = context.getKernel(interactingBlocksProgram, "sortBoxData");
+        kernels.findInteractingBlocksKernel = context.getKernel(interactingBlocksProgram, "findBlocksWithInteractions");
+    }
+    groupKernels[groups] = kernels;
+}
+hipFunction_t HipNonbondedUtilities::createInteractionKernel(const string& source, vector<ParameterInfo>& params, vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric, int groups, bool includeForces, bool includeEnergy) {
+    map<string, string> replacements;
+    replacements["COMPUTE_INTERACTION"] = source;
+    const string suffixes[] = {"x", "y", "z", "w"};
+    stringstream localData;
+    int localDataSize = 0;
+    for (const ParameterInfo& param : params) {
+        if (param.getNumComponents() == 1)
+            localData<<param.getType()<<" "<<param.getName()<<";\n";
+        else {
+            for (int j = 0; j < param.getNumComponents(); ++j)
+                localData<<param.getComponentType()<<" "<<param.getName()<<"_"<<suffixes[j]<<";\n";
+        }
+        localDataSize += param.getSize();
+    }
+    replacements["ATOM_PARAMETER_DATA"] = localData.str();
+    stringstream args;
+    for (const ParameterInfo& param : params) {
+        args << ", ";
+        if (param.isConstant())
+            args << "const ";
+        args << param.getType();
+        args << "* __restrict__ global_";
+        args << param.getName();
+    }
+    for (const ParameterInfo& arg : arguments) {
+        args << ", ";
+        if (arg.isConstant())
+            args << "const ";
+        args << arg.getType();
+        args << "* __restrict__ ";
+        args << arg.getName();
+    }
+    if (energyParameterDerivatives.size() > 0)
+        args << ", mixed* __restrict__ energyParamDerivs";
+    replacements["PARAMETER_ARGUMENTS"] = args.str();
+    stringstream load1;
+    for (const ParameterInfo& param : params) {
+        load1 << param.getType();
+        load1 << " ";
+        load1 << param.getName();
+        load1 << "1 = global_";
+        load1 << param.getName();
+        load1 << "[atom1];\n";
+    }
+    replacements["LOAD_ATOM1_PARAMETERS"] = load1.str();
+    bool useShuffle = context.getSupportsWarpShuffle();
+    // Part 1. Defines for on diagonal exclusion tiles
+    stringstream loadLocal1;
+    if(useShuffle) {
+        // not needed if using shuffles as we can directly fetch from register
+    } else {
+        for (const ParameterInfo& param : params) {
+            if (param.getNumComponents() == 1)
+                loadLocal1<<"localData[LOCAL_ID]."<<param.getName()<<" = "<<param.getName()<<"1;\n";
+            else {
+                for (int j = 0; j < param.getNumComponents(); ++j)
+                    loadLocal1<<"localData[LOCAL_ID]."<<param.getName()<<"_"<<suffixes[j]<<" = "<<param.getName()<<"1."<<suffixes[j]<<";\n";
+            }
+        }
+    }
+    replacements["LOAD_LOCAL_PARAMETERS_FROM_1"] = loadLocal1.str();
+    stringstream broadcastWarpData;
+    if (useShuffle) {
+        broadcastWarpData << "posq2.x = real_shfl(shflPosq.x, j);\n";
+        broadcastWarpData << "posq2.y = real_shfl(shflPosq.y, j);\n";
+        broadcastWarpData << "posq2.z = real_shfl(shflPosq.z, j);\n";
+        broadcastWarpData << "posq2.w = real_shfl(shflPosq.w, j);\n";
+        for (const ParameterInfo& param : params) {
+            broadcastWarpData << param.getType() << " shfl" << param.getName() << ";\n";
+            for (int j = 0; j < param.getNumComponents(); j++) {
+                if (param.getNumComponents() == 1)
+                    broadcastWarpData << "shfl" << param.getName() << "=real_shfl(" << param.getName() <<"1,j);\n";
+                else
+                    broadcastWarpData << "shfl" << param.getName()+"."+suffixes[j] << "=real_shfl(" << param.getName()+"1."+suffixes[j] <<",j);\n";
+            }
+        }
+    } else {
+        // not used if not shuffling
+    }
+    replacements["BROADCAST_WARP_DATA"] = broadcastWarpData.str();
+    // Part 2. Defines for off-diagonal exclusions, and neighborlist tiles.
+    stringstream declareLocal2;
+    if (useShuffle) {
+        for (const ParameterInfo& param : params)
+            declareLocal2<<param.getType()<<" shfl"<<param.getName()<<";\n";
+    }
+    else {
+        // not used if using shared memory
+    }
+    replacements["DECLARE_LOCAL_PARAMETERS"] = declareLocal2.str();
+    stringstream loadLocal2;
+    if (useShuffle) {
+        for (const ParameterInfo& param : params)
+            loadLocal2<<"shfl"<<param.getName()<<" = global_"<<param.getName()<<"[j];\n";
+    }
+    else {
+        for (const ParameterInfo& param : params) {
+            if (param.getNumComponents() == 1)
+                loadLocal2<<"localData[LOCAL_ID]."<<param.getName()<<" = global_"<<param.getName()<<"[j];\n";
+            else {
+                loadLocal2<<param.getType()<<" temp_"<<param.getName()<<" = global_"<<param.getName()<<"[j];\n";
+                for (int j = 0; j < param.getNumComponents(); ++j)
+                    loadLocal2<<"localData[LOCAL_ID]."<<param.getName()<<"_"<<suffixes[j]<<" = temp_"<<param.getName()<<"."<<suffixes[j]<<";\n";
+            }
+        }
+    }
+    replacements["LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"] = loadLocal2.str();
+    stringstream load2j;
+    if (useShuffle) {
+        for (const ParameterInfo& param : params)
+            load2j<<param.getType()<<" "<<param.getName()<<"2 = shfl"<<param.getName()<<";\n";
+    }
+    else {
+        for (const ParameterInfo& param : params) {
+            if (param.getNumComponents() == 1)
+                load2j<<param.getType()<<" "<<param.getName()<<"2 = localData[atom2]."<<param.getName()<<";\n";
+            else {
+                load2j<<param.getType()<<" "<<param.getName()<<"2 = make_"<<param.getType()<<"(";
+                for (int j = 0; j < param.getNumComponents(); ++j) {
+                    if (j > 0)
+                        load2j<<", ";
+                    load2j<<"localData[atom2]."<<param.getName()<<"_"<<suffixes[j];
+                }
+                load2j<<");\n";
+            }
+        }
+    }
+    replacements["LOAD_ATOM2_PARAMETERS"] = load2j.str();
+    stringstream clearLocal;
+    for (const ParameterInfo& param : params) {
+        if (useShuffle)
+            clearLocal<<"shfl";
+        else
+            clearLocal<<"localData[atom2].";
+        clearLocal<<param.getName()<<" = ";
+        if (param.getNumComponents() == 1)
+            clearLocal<<"0;\n";
+        else
+            clearLocal<<"make_"<<param.getType()<<"(0);\n";
+    }
+    replacements["CLEAR_LOCAL_PARAMETERS"] = clearLocal.str();
+    stringstream initDerivs;
+    for (int i = 0; i < energyParameterDerivatives.size(); i++)
+        initDerivs<<"mixed energyParamDeriv"<<i<<" = 0;\n";
+    replacements["INIT_DERIVATIVES"] = initDerivs.str();
+    stringstream saveDerivs;
+    const vector<string>& allParamDerivNames = context.getEnergyParamDerivNames();
+    int numDerivs = allParamDerivNames.size();
+    for (int i = 0; i < energyParameterDerivatives.size(); i++)
+        for (int index = 0; index < numDerivs; index++)
+            if (allParamDerivNames[index] == energyParameterDerivatives[i])
+                saveDerivs<<"energyParamDerivs[GLOBAL_ID*"<<numDerivs<<"+"<<index<<"] += energyParamDeriv"<<i<<";\n";
+    replacements["SAVE_DERIVATIVES"] = saveDerivs.str();
+    stringstream shuffleWarpData;
+    if(useShuffle) {
+        shuffleWarpData << "shflPosq.x = real_shfl(shflPosq.x, tgx+1);\n";
+        shuffleWarpData << "shflPosq.y = real_shfl(shflPosq.y, tgx+1);\n";
+        shuffleWarpData << "shflPosq.z = real_shfl(shflPosq.z, tgx+1);\n";
+        shuffleWarpData << "shflPosq.w = real_shfl(shflPosq.w, tgx+1);\n";
+        shuffleWarpData << "shflForce.x = real_shfl(shflForce.x, tgx+1);\n";
+        shuffleWarpData << "shflForce.y = real_shfl(shflForce.y, tgx+1);\n";
+        shuffleWarpData << "shflForce.z = real_shfl(shflForce.z, tgx+1);\n";
+        for (const ParameterInfo& param : params) {
+            if (param.getNumComponents() == 1)
+                shuffleWarpData<<"shfl"<<param.getName()<<"=real_shfl(shfl"<<param.getName()<<", tgx+1);\n";
+            else {
+                for (int j = 0; j < param.getNumComponents(); j++) {
+                    // looks something like shflsigmaEpsilon.x = real_shfl(shflsigmaEpsilon.x,tgx+1);
+                    shuffleWarpData<<"shfl"<<param.getName()
+                        <<"."<<suffixes[j]<<"=real_shfl(shfl"
+                        <<param.getName()<<"."<<suffixes[j]
+                        <<", tgx+1);\n";
+                }
+            }
+        }
+    } else {
+        // not used otherwise
+    }
+    replacements["SHUFFLE_WARP_DATA"] = shuffleWarpData.str();
+    map<string, string> defines;
+    if (useCutoff)
+        defines["USE_CUTOFF"] = "1";
+    if (usePeriodic)
+        defines["USE_PERIODIC"] = "1";
+    if (useExclusions)
+        defines["USE_EXCLUSIONS"] = "1";
+    if (isSymmetric)
+        defines["USE_SYMMETRIC"] = "1";
+    if (useShuffle)
+        defines["ENABLE_SHUFFLE"] = "1";
+    if (includeForces)
+        defines["INCLUDE_FORCES"] = "1";
+    if (includeEnergy)
+        defines["INCLUDE_ENERGY"] = "1";
+    defines["THREAD_BLOCK_SIZE"] = context.intToString(forceThreadBlockSize);
+    double maxCutoff = 0.0;
+    for (int i = 0; i < 32; i++) {
+        if ((groups&(1<<i)) != 0) {
+            double cutoff = groupCutoff[i];
+            maxCutoff = max(maxCutoff, cutoff);
+            defines["CUTOFF_"+context.intToString(i)+"_SQUARED"] = context.doubleToString(cutoff*cutoff);
+            defines["CUTOFF_"+context.intToString(i)] = context.doubleToString(cutoff);
+        }
+    }
+    defines["MAX_CUTOFF"] = context.doubleToString(maxCutoff);
+    defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms());
+    defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
+    defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
+    defines["TILE_SIZE"] = context.intToString(HipContext::TileSize);
+    int numExclusionTiles = exclusionTiles.getSize();
+    defines["NUM_TILES_WITH_EXCLUSIONS"] = context.intToString(numExclusionTiles);
+    int numContexts = context.getPlatformData().contexts.size();
+    int startExclusionIndex = context.getContextIndex()*numExclusionTiles/numContexts;
+    int endExclusionIndex = (context.getContextIndex()+1)*numExclusionTiles/numContexts;
+    defines["FIRST_EXCLUSION_TILE"] = context.intToString(startExclusionIndex);
+    defines["LAST_EXCLUSION_TILE"] = context.intToString(endExclusionIndex);
+    if ((localDataSize/4)%2 == 0 && !context.getUseDoublePrecision())
+        defines["PARAMETER_SIZE_IS_EVEN"] = "1";
+    hipModule_t program = context.createModule(HipKernelSources::vectorOps+context.replaceStrings(kernelSource, replacements), defines);
+    hipFunction_t kernel = context.getKernel(program, "computeNonbonded");
+    return kernel;
+}
+void HipNonbondedUtilities::setKernelSource(const string& source) {
+    kernelSource = source;
+}
--- a/platforms/hip/src/HipParallelKernels.cpp
+++ b/platforms/hip/src/HipParallelKernels.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2011-2019 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "HipParallelKernels.h"
+#include "HipKernelSources.h"
+using namespace OpenMM;
+using namespace std;
+#define CHECK_RESULT(result, prefix) \
+if (result != hipSuccess) { \
+    std::stringstream m; \
+    m<<prefix<<": "<<cu.getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
+    throw OpenMMException(m.str());\
+}
+/**
+ * Get the current clock time, measured in microseconds.
+ */
+#ifdef _MSC_VER
+    #error "Windows unsupported for HIP platform"
+    #include <Windows.h>
+    static long long getTime() {
+        FILETIME ft;
+        GetSystemTimeAsFileTime(&ft); // 100-nanoseconds since 1-1-1601
+        ULARGE_INTEGER result;
+        result.LowPart = ft.dwLowDateTime;
+        result.HighPart = ft.dwHighDateTime;
+        return result.QuadPart/10;
+    }
+#else
+    #include <sys/time.h>
+    static long long getTime() {
+        struct timeval tod;
+        gettimeofday(&tod, 0);
+        return 1000000*tod.tv_sec+tod.tv_usec;
+    }
+#endif
+class HipParallelCalcForcesAndEnergyKernel::BeginComputationTask : public HipContext::WorkTask {
+public:
+    BeginComputationTask(ContextImpl& context, HipContext& cu, HipCalcForcesAndEnergyKernel& kernel,
+            bool includeForce, bool includeEnergy, int groups, void* pinnedMemory, hipEvent_t event, int2& interactionCount) : context(context), cu(cu), kernel(kernel),
+            includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), pinnedMemory(pinnedMemory), event(event), interactionCount(interactionCount) {
+    }
+    void execute() {
+        // Copy coordinates over to this device and execute the kernel.
+        cu.setAsCurrent();
+        if (cu.getContextIndex() > 0) {
+            hipStreamWaitEvent(cu.getCurrentStream(), event, 0);
+            if (!cu.getPlatformData().peerAccessSupported)
+                cu.getPosq().upload(pinnedMemory, false);
+        }
+        kernel.beginComputation(context, includeForce, includeEnergy, groups);
+        if (cu.getNonbondedUtilities().getUsePeriodic())
+            cu.getNonbondedUtilities().getInteractionCount().download(&interactionCount, false);
+    }
+private:
+    ContextImpl& context;
+    HipContext& cu;
+    HipCalcForcesAndEnergyKernel& kernel;
+    bool includeForce, includeEnergy;
+    int groups;
+    void* pinnedMemory;
+    hipEvent_t event;
+    int2& interactionCount;
+};
+class HipParallelCalcForcesAndEnergyKernel::FinishComputationTask : public HipContext::WorkTask {
+public:
+    FinishComputationTask(ContextImpl& context, HipContext& cu, HipCalcForcesAndEnergyKernel& kernel,
+            bool includeForce, bool includeEnergy, int groups, double& energy, long long& completionTime, long long* pinnedMemory, HipArray& contextForces, bool& valid, int2& interactionCount) :
+            context(context), cu(cu), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), energy(energy),
+            completionTime(completionTime), pinnedMemory(pinnedMemory), contextForces(contextForces), valid(valid), interactionCount(interactionCount) {
+    }
+    void execute() {
+        // Execute the kernel, then download forces.
+        energy += kernel.finishComputation(context, includeForce, includeEnergy, groups, valid);
+        if (cu.getComputeForceCount() < 200) {
+            // Record timing information for load balancing.  Since this takes time, only do it at the start of the simulation.
+            CHECK_RESULT(hipStreamSynchronize(cu.getCurrentStream()), "Error synchronizing HIP context");
+            completionTime = getTime();
+        }
+        if (includeForce) {
+            if (cu.getContextIndex() > 0) {
+                int numAtoms = cu.getPaddedNumAtoms();
+                if (cu.getPlatformData().peerAccessSupported) {
+                    int numBytes = numAtoms*3*sizeof(long long);
+                    int offset = (cu.getContextIndex()-1)*numBytes;
+                    HipContext& context0 = *cu.getPlatformData().contexts[0];
+                    CHECK_RESULT(hipMemcpy(static_cast<char*>(contextForces.getDevicePointer())+offset,
+                                           cu.getForce().getDevicePointer(), numBytes, hipMemcpyDeviceToDevice), "Error copying forces");
+                }
+                else
+                    cu.getForce().download(&pinnedMemory[(cu.getContextIndex()-1)*numAtoms*3]);
+            }
+        }
+        if (cu.getNonbondedUtilities().getUsePeriodic() && (interactionCount.x > cu.getNonbondedUtilities().getInteractingTiles().getSize() ||
+                interactionCount.y > cu.getNonbondedUtilities().getSinglePairs().getSize())) {
+            valid = false;
+            cu.getNonbondedUtilities().updateNeighborListSize();
+        }
+    }
+private:
+    ContextImpl& context;
+    HipContext& cu;
+    HipCalcForcesAndEnergyKernel& kernel;
+    bool includeForce, includeEnergy;
+    int groups;
+    double& energy;
+    long long& completionTime;
+    long long* pinnedMemory;
+    HipArray& contextForces;
+    bool& valid;
+    int2& interactionCount;
+};
+HipParallelCalcForcesAndEnergyKernel::HipParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, HipPlatform::PlatformData& data) :
+        CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextNonbondedFractions(data.contexts.size()),
+        interactionCounts(NULL), pinnedPositionBuffer(NULL), pinnedForceBuffer(NULL) {
+    for (int i = 0; i < (int) data.contexts.size(); i++)
+        kernels.push_back(Kernel(new HipCalcForcesAndEnergyKernel(name, platform, *data.contexts[i])));
+}
+HipParallelCalcForcesAndEnergyKernel::~HipParallelCalcForcesAndEnergyKernel() {
+    data.contexts[0]->setAsCurrent();
+    if (pinnedPositionBuffer != NULL)
+        hipHostFree(pinnedPositionBuffer);
+    if (pinnedForceBuffer != NULL)
+        hipHostFree(pinnedForceBuffer);
+    hipEventDestroy(event);
+    hipStreamDestroy(peerCopyStream);
+    if (interactionCounts != NULL)
+        hipHostFree(interactionCounts);
+}
+void HipParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
+    HipContext& cu = *data.contexts[0];
+    cu.setAsCurrent();
+    hipModule_t module = cu.createModule(HipKernelSources::parallel);
+    sumKernel = cu.getKernel(module, "sumForces");
+    int numContexts = data.contexts.size();
+    for (int i = 0; i < numContexts; i++)
+        getKernel(i).initialize(system);
+    for (int i = 0; i < numContexts; i++)
+        contextNonbondedFractions[i] = 1/(double) numContexts;
+    CHECK_RESULT(hipEventCreateWithFlags(&event, 0), "Error creating event");
+    CHECK_RESULT(hipStreamCreateWithFlags(&peerCopyStream, hipStreamNonBlocking), "Error creating stream");
+    CHECK_RESULT(hipHostMalloc((void**) &interactionCounts, numContexts*sizeof(int2), 0), "Error creating interaction counts buffer");
+}
+void HipParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
+    HipContext& cu = *data.contexts[0];
+    cu.setAsCurrent();
+    if (!contextForces.isInitialized()) {
+        contextForces.initialize<long long>(cu, 3*(data.contexts.size()-1)*cu.getPaddedNumAtoms(), "contextForces");
+        CHECK_RESULT(hipHostMalloc((void**) &pinnedForceBuffer, 3*(data.contexts.size()-1)*cu.getPaddedNumAtoms()*sizeof(long long), hipHostMallocPortable), "Error allocating pinned memory");
+        CHECK_RESULT(hipHostMalloc(&pinnedPositionBuffer, cu.getPaddedNumAtoms()*(cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4)), hipHostMallocPortable), "Error allocating pinned memory");
+    }
+    // Copy coordinates over to each device and execute the kernel.
+    if (!cu.getPlatformData().peerAccessSupported) {
+        cu.getPosq().download(pinnedPositionBuffer, false);
+        hipEventRecord(event, cu.getCurrentStream());
+    }
+    else {
+        int numBytes = cu.getPosq().getSize()*cu.getPosq().getElementSize();
+        hipEventRecord(event, cu.getCurrentStream());
+        hipStreamWaitEvent(peerCopyStream, event, 0);
+        for (int i = 1; i < (int) data.contexts.size(); i++)
+            CHECK_RESULT(hipMemcpyAsync(
+                data.contexts[i]->getPosq().getDevicePointer(),
+                cu.getPosq().getDevicePointer(), numBytes,
+                hipMemcpyDeviceToDevice, peerCopyStream), "Error copying positions");
+        hipEventRecord(event, peerCopyStream);
+    }
+    for (int i = 0; i < (int) data.contexts.size(); i++) {
+        data.contextEnergy[i] = 0.0;
+        HipContext& cu = *data.contexts[i];
+        ComputeContext::WorkThread& thread = cu.getWorkThread();
+        thread.addTask(new BeginComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, pinnedPositionBuffer, event, interactionCounts[i]));
+    }
+}
+double HipParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups, bool& valid) {
+    for (int i = 0; i < (int) data.contexts.size(); i++) {
+        HipContext& cu = *data.contexts[i];
+        ComputeContext::WorkThread& thread = cu.getWorkThread();
+        thread.addTask(new FinishComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, data.contextEnergy[i], completionTimes[i], pinnedForceBuffer, contextForces, valid, interactionCounts[i]));
+    }
+    data.syncContexts();
+    double energy = 0.0;
+    for (int i = 0; i < (int) data.contextEnergy.size(); i++)
+        energy += data.contextEnergy[i];
+    if (includeForce && valid) {
+        // Sum the forces from all devices.
+        HipContext& cu = *data.contexts[0];
+        if (!cu.getPlatformData().peerAccessSupported)
+            contextForces.upload(pinnedForceBuffer, false);
+        int bufferSize = 3*cu.getPaddedNumAtoms();
+        int numBuffers = data.contexts.size()-1;
+        void* args[] = {&cu.getForce().getDevicePointer(), &contextForces.getDevicePointer(), &bufferSize, &numBuffers};
+        cu.executeKernel(sumKernel, args, bufferSize);
+        // Balance work between the contexts by transferring a little nonbonded work from the context that
+        // finished last to the one that finished first.
+        if (cu.getComputeForceCount() < 200) {
+            int firstIndex = 0, lastIndex = 0;
+            for (int i = 0; i < (int) completionTimes.size(); i++) {
+                if (completionTimes[i] < completionTimes[firstIndex])
+                    firstIndex = i;
+                if (completionTimes[i] > completionTimes[lastIndex])
+                    lastIndex = i;
+            }
+            double fractionToTransfer = min(0.01, contextNonbondedFractions[lastIndex]);
+            contextNonbondedFractions[firstIndex] += fractionToTransfer;
+            contextNonbondedFractions[lastIndex] -= fractionToTransfer;
+            double startFraction = 0.0;
+            for (int i = 0; i < (int) contextNonbondedFractions.size(); i++) {
+                double endFraction = startFraction+contextNonbondedFractions[i];
+                if (i == contextNonbondedFractions.size()-1)
+                    endFraction = 1.0; // Avoid roundoff error
+                data.contexts[i]->getNonbondedUtilities().setAtomBlockRange(startFraction, endFraction);
+                startFraction = endFraction;
+            }
+	}
+    }
+    return energy;
+}
+class HipParallelCalcHarmonicBondForceKernel::Task : public HipContext::WorkTask {
+public:
+    Task(ContextImpl& context, CommonCalcHarmonicBondForceKernel& kernel, bool includeForce,
+            bool includeEnergy, double& energy) : context(context), kernel(kernel),
+            includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
+    }
+    void execute() {
+        energy += kernel.execute(context, includeForce, includeEnergy);
+    }
+private:
+    ContextImpl& context;
+    CommonCalcHarmonicBondForceKernel& kernel;
+    bool includeForce, includeEnergy;
+    double& energy;
+};
+HipParallelCalcHarmonicBondForceKernel::HipParallelCalcHarmonicBondForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
+        CalcHarmonicBondForceKernel(name, platform), data(data) {
+    for (int i = 0; i < (int) data.contexts.size(); i++)
+        kernels.push_back(Kernel(new CommonCalcHarmonicBondForceKernel(name, platform, *data.contexts[i], system)));
+}
+void HipParallelCalcHarmonicBondForceKernel::initialize(const System& system, const HarmonicBondForce& force) {
+    for (int i = 0; i < (int) kernels.size(); i++)
+        getKernel(i).initialize(system, force);
+}
+double HipParallelCalcHarmonicBondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    for (int i = 0; i < (int) data.contexts.size(); i++) {
+        HipContext& cu = *data.contexts[i];
+        ComputeContext::WorkThread& thread = cu.getWorkThread();
+        thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
+    }
+    return 0.0;
+}
+void HipParallelCalcHarmonicBondForceKernel::copyParametersToContext(ContextImpl& context, const HarmonicBondForce& force) {
+    for (int i = 0; i < (int) kernels.size(); i++)
+        getKernel(i).copyParametersToContext(context, force);
+}
+class HipParallelCalcCustomBondForceKernel::Task : public HipContext::WorkTask {
+public:
+    Task(ContextImpl& context, CommonCalcCustomBondForceKernel& kernel, bool includeForce,
+            bool includeEnergy, double& energy) : context(context), kernel(kernel),
+            includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
+    }
+    void execute() {
+        energy += kernel.execute(context, includeForce, includeEnergy);
+    }
+private:
+    ContextImpl& context;
+    CommonCalcCustomBondForceKernel& kernel;
+    bool includeForce, includeEnergy;
+    double& energy;
+};
+HipParallelCalcCustomBondForceKernel::HipParallelCalcCustomBondForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
+        CalcCustomBondForceKernel(name, platform), data(data) {
+    for (int i = 0; i < (int) data.contexts.size(); i++)
+        kernels.push_back(Kernel(new CommonCalcCustomBondForceKernel(name, platform, *data.contexts[i], system)));
+}
+void HipParallelCalcCustomBondForceKernel::initialize(const System& system, const CustomBondForce& force) {
+    for (int i = 0; i < (int) kernels.size(); i++)
+        getKernel(i).initialize(system, force);
+}
+double HipParallelCalcCustomBondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    for (int i = 0; i < (int) data.contexts.size(); i++) {
+        HipContext& cu = *data.contexts[i];
+        ComputeContext::WorkThread& thread = cu.getWorkThread();
+        thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
+    }
+    return 0.0;
+}
+void HipParallelCalcCustomBondForceKernel::copyParametersToContext(ContextImpl& context, const CustomBondForce& force) {
+    for (int i = 0; i < (int) kernels.size(); i++)
+        getKernel(i).copyParametersToContext(context, force);
+}
+class HipParallelCalcHarmonicAngleForceKernel::Task : public HipContext::WorkTask {
+public:
+    Task(ContextImpl& context, CommonCalcHarmonicAngleForceKernel& kernel, bool includeForce,
+            bool includeEnergy, double& energy) : context(context), kernel(kernel),
+            includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
+    }
+    void execute() {
+        energy += kernel.execute(context, includeForce, includeEnergy);
+    }
+private:
+    ContextImpl& context;
+    CommonCalcHarmonicAngleForceKernel& kernel;
+    bool includeForce, includeEnergy;
+    double& energy;
+};
+HipParallelCalcHarmonicAngleForceKernel::HipParallelCalcHarmonicAngleForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
+        CalcHarmonicAngleForceKernel(name, platform), data(data) {
+    for (int i = 0; i < (int) data.contexts.size(); i++)
+        kernels.push_back(Kernel(new CommonCalcHarmonicAngleForceKernel(name, platform, *data.contexts[i], system)));
+}
+void HipParallelCalcHarmonicAngleForceKernel::initialize(const System& system, const HarmonicAngleForce& force) {
+    for (int i = 0; i < (int) kernels.size(); i++)
+        getKernel(i).initialize(system, force);
+}
+double HipParallelCalcHarmonicAngleForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    for (int i = 0; i < (int) data.contexts.size(); i++) {
+        HipContext& cu = *data.contexts[i];
+        ComputeContext::WorkThread& thread = cu.getWorkThread();
+        thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
+    }
+    return 0.0;
+}
+void HipParallelCalcHarmonicAngleForceKernel::copyParametersToContext(ContextImpl& context, const HarmonicAngleForce& force) {
+    for (int i = 0; i < (int) kernels.size(); i++)
+        getKernel(i).copyParametersToContext(context, force);
+}
+class HipParallelCalcCustomAngleForceKernel::Task : public HipContext::WorkTask {
+public:
+    Task(ContextImpl& context, CommonCalcCustomAngleForceKernel& kernel, bool includeForce,
+            bool includeEnergy, double& energy) : context(context), kernel(kernel),
+            includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
+    }
+    void execute() {
+        energy += kernel.execute(context, includeForce, includeEnergy);
+    }
+private:
+    ContextImpl& context;
+    CommonCalcCustomAngleForceKernel& kernel;
+    bool includeForce, includeEnergy;
+    double& energy;
+};
+HipParallelCalcCustomAngleForceKernel::HipParallelCalcCustomAngleForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
+        CalcCustomAngleForceKernel(name, platform), data(data) {
+    for (int i = 0; i < (int) data.contexts.size(); i++)
+        kernels.push_back(Kernel(new CommonCalcCustomAngleForceKernel(name, platform, *data.contexts[i], system)));
+}
+void HipParallelCalcCustomAngleForceKernel::initialize(const System& system, const CustomAngleForce& force) {
+    for (int i = 0; i < (int) kernels.size(); i++)
+        getKernel(i).initialize(system, force);
+}
+double HipParallelCalcCustomAngleForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    for (int i = 0; i < (int) data.contexts.size(); i++) {
+        HipContext& cu = *data.contexts[i];
+        ComputeContext::WorkThread& thread = cu.getWorkThread();
+        thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
+    }
+    return 0.0;
+}
+void HipParallelCalcCustomAngleForceKernel::copyParametersToContext(ContextImpl& context, const CustomAngleForce& force) {
+    for (int i = 0; i < (int) kernels.size(); i++)
+        getKernel(i).copyParametersToContext(context, force);
+}
+class HipParallelCalcPeriodicTorsionForceKernel::Task : public HipContext::WorkTask {
+public:
+    Task(ContextImpl& context, CommonCalcPeriodicTorsionForceKernel& kernel, bool includeForce,
+            bool includeEnergy, double& energy) : context(context), kernel(kernel),
+            includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
+    }
+    void execute() {
+        energy += kernel.execute(context, includeForce, includeEnergy);
+    }
+private:
+    ContextImpl& context;
+    CommonCalcPeriodicTorsionForceKernel& kernel;
+    bool includeForce, includeEnergy;
+    double& energy;
+};
+HipParallelCalcPeriodicTorsionForceKernel::HipParallelCalcPeriodicTorsionForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
+        CalcPeriodicTorsionForceKernel(name, platform), data(data) {
+    for (int i = 0; i < (int) data.contexts.size(); i++)
+        kernels.push_back(Kernel(new CommonCalcPeriodicTorsionForceKernel(name, platform, *data.contexts[i], system)));
+}
+void HipParallelCalcPeriodicTorsionForceKernel::initialize(const System& system, const PeriodicTorsionForce& force) {
+    for (int i = 0; i < (int) kernels.size(); i++)
+        getKernel(i).initialize(system, force);
+}
+double HipParallelCalcPeriodicTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    for (int i = 0; i < (int) data.contexts.size(); i++) {
+        HipContext& cu = *data.contexts[i];
+        ComputeContext::WorkThread& thread = cu.getWorkThread();
+        thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
+    }
+    return 0.0;
+}
+void HipParallelCalcPeriodicTorsionForceKernel::copyParametersToContext(ContextImpl& context, const PeriodicTorsionForce& force) {
+    for (int i = 0; i < (int) kernels.size(); i++)
+        getKernel(i).copyParametersToContext(context, force);
+}
+class HipParallelCalcRBTorsionForceKernel::Task : public HipContext::WorkTask {
+public:
+    Task(ContextImpl& context, CommonCalcRBTorsionForceKernel& kernel, bool includeForce,
+            bool includeEnergy, double& energy) : context(context), kernel(kernel),
+            includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
+    }
+    void execute() {
+        energy += kernel.execute(context, includeForce, includeEnergy);
+    }
+private:
+    ContextImpl& context;
+    CommonCalcRBTorsionForceKernel& kernel;
+    bool includeForce, includeEnergy;
+    double& energy;
+};
+HipParallelCalcRBTorsionForceKernel::HipParallelCalcRBTorsionForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
+        CalcRBTorsionForceKernel(name, platform), data(data) {
+    for (int i = 0; i < (int) data.contexts.size(); i++)
+        kernels.push_back(Kernel(new CommonCalcRBTorsionForceKernel(name, platform, *data.contexts[i], system)));
+}
+void HipParallelCalcRBTorsionForceKernel::initialize(const System& system, const RBTorsionForce& force) {
+    for (int i = 0; i < (int) kernels.size(); i++)
+        getKernel(i).initialize(system, force);
+}
+double HipParallelCalcRBTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    for (int i = 0; i < (int) data.contexts.size(); i++) {
+        HipContext& cu = *data.contexts[i];
+        ComputeContext::WorkThread& thread = cu.getWorkThread();
+        thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
+    }
+    return 0.0;
+}
+void HipParallelCalcRBTorsionForceKernel::copyParametersToContext(ContextImpl& context, const RBTorsionForce& force) {
+    for (int i = 0; i < (int) kernels.size(); i++)
+        getKernel(i).copyParametersToContext(context, force);
+}
+class HipParallelCalcCMAPTorsionForceKernel::Task : public HipContext::WorkTask {
+public:
+    Task(ContextImpl& context, CommonCalcCMAPTorsionForceKernel& kernel, bool includeForce,
+            bool includeEnergy, double& energy) : context(context), kernel(kernel),
+            includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
+    }
+    void execute() {
+        energy += kernel.execute(context, includeForce, includeEnergy);
+    }
+private:
+    ContextImpl& context;
+    CommonCalcCMAPTorsionForceKernel& kernel;
+    bool includeForce, includeEnergy;
+    double& energy;
+};
+HipParallelCalcCMAPTorsionForceKernel::HipParallelCalcCMAPTorsionForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
+        CalcCMAPTorsionForceKernel(name, platform), data(data) {
+    for (int i = 0; i < (int) data.contexts.size(); i++)
+        kernels.push_back(Kernel(new CommonCalcCMAPTorsionForceKernel(name, platform, *data.contexts[i], system)));
+}
+void HipParallelCalcCMAPTorsionForceKernel::initialize(const System& system, const CMAPTorsionForce& force) {
+    for (int i = 0; i < (int) kernels.size(); i++)
+        getKernel(i).initialize(system, force);
+}
+double HipParallelCalcCMAPTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    for (int i = 0; i < (int) data.contexts.size(); i++) {
+        HipContext& cu = *data.contexts[i];
+        ComputeContext::WorkThread& thread = cu.getWorkThread();
+        thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
+    }
+    return 0.0;
+}
+void HipParallelCalcCMAPTorsionForceKernel::copyParametersToContext(ContextImpl& context, const CMAPTorsionForce& force) {
+    for (int i = 0; i < (int) kernels.size(); i++)
+        getKernel(i).copyParametersToContext(context, force);
+}
+class HipParallelCalcCustomTorsionForceKernel::Task : public HipContext::WorkTask {
+public:
+    Task(ContextImpl& context, CommonCalcCustomTorsionForceKernel& kernel, bool includeForce,
+            bool includeEnergy, double& energy) : context(context), kernel(kernel),
+            includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
+    }
+    void execute() {
+        energy += kernel.execute(context, includeForce, includeEnergy);
+    }
+private:
+    ContextImpl& context;
+    CommonCalcCustomTorsionForceKernel& kernel;
+    bool includeForce, includeEnergy;
+    double& energy;
+};
+HipParallelCalcCustomTorsionForceKernel::HipParallelCalcCustomTorsionForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
+        CalcCustomTorsionForceKernel(name, platform), data(data) {
+    for (int i = 0; i < (int) data.contexts.size(); i++)
+        kernels.push_back(Kernel(new CommonCalcCustomTorsionForceKernel(name, platform, *data.contexts[i], system)));
+}
+void HipParallelCalcCustomTorsionForceKernel::initialize(const System& system, const CustomTorsionForce& force) {
+    for (int i = 0; i < (int) kernels.size(); i++)
+        getKernel(i).initialize(system, force);
+}
+double HipParallelCalcCustomTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    for (int i = 0; i < (int) data.contexts.size(); i++) {
+        HipContext& cu = *data.contexts[i];
+        ComputeContext::WorkThread& thread = cu.getWorkThread();
+        thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
+    }
+    return 0.0;
+}
+void HipParallelCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& context, const CustomTorsionForce& force) {
+    for (int i = 0; i < (int) kernels.size(); i++)
+        getKernel(i).copyParametersToContext(context, force);
+}
+class HipParallelCalcNonbondedForceKernel::Task : public HipContext::WorkTask {
+public:
+    Task(ContextImpl& context, HipCalcNonbondedForceKernel& kernel, bool includeForce,
+            bool includeEnergy, bool includeDirect, bool includeReciprocal, double& energy) : context(context), kernel(kernel),
+            includeForce(includeForce), includeEnergy(includeEnergy), includeDirect(includeDirect), includeReciprocal(includeReciprocal), energy(energy) {
+    }
+    void execute() {
+        energy += kernel.execute(context, includeForce, includeEnergy, includeDirect, includeReciprocal);
+    }
+private:
+    ContextImpl& context;
+    HipCalcNonbondedForceKernel& kernel;
+    bool includeForce, includeEnergy, includeDirect, includeReciprocal;
+    double& energy;
+};
+HipParallelCalcNonbondedForceKernel::HipParallelCalcNonbondedForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
+        CalcNonbondedForceKernel(name, platform), data(data) {
+    for (int i = 0; i < (int) data.contexts.size(); i++)
+        kernels.push_back(Kernel(new HipCalcNonbondedForceKernel(name, platform, *data.contexts[i], system)));
+}
+void HipParallelCalcNonbondedForceKernel::initialize(const System& system, const NonbondedForce& force) {
+    for (int i = 0; i < (int) kernels.size(); i++)
+        getKernel(i).initialize(system, force);
+}
+double HipParallelCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal) {
+    for (int i = 0; i < (int) data.contexts.size(); i++) {
+        HipContext& cu = *data.contexts[i];
+        ComputeContext::WorkThread& thread = cu.getWorkThread();
+        thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, includeDirect, includeReciprocal, data.contextEnergy[i]));
+    }
+    return 0.0;
+}
+void HipParallelCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context, const NonbondedForce& force) {
+    for (int i = 0; i < (int) kernels.size(); i++)
+        getKernel(i).copyParametersToContext(context, force);
+}
+void HipParallelCalcNonbondedForceKernel::getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
+    dynamic_cast<const HipCalcNonbondedForceKernel&>(kernels[0].getImpl()).getPMEParameters(alpha, nx, ny, nz);
+}
+void HipParallelCalcNonbondedForceKernel::getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
+    dynamic_cast<const HipCalcNonbondedForceKernel&>(kernels[0].getImpl()).getLJPMEParameters(alpha, nx, ny, nz);
+}
+class HipParallelCalcCustomNonbondedForceKernel::Task : public HipContext::WorkTask {
+public:
+    Task(ContextImpl& context, CommonCalcCustomNonbondedForceKernel& kernel, bool includeForce,
+            bool includeEnergy, double& energy) : context(context), kernel(kernel),
+            includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
+    }
+    void execute() {
+        energy += kernel.execute(context, includeForce, includeEnergy);
+    }
+private:
+    ContextImpl& context;
+    CommonCalcCustomNonbondedForceKernel& kernel;
+    bool includeForce, includeEnergy;
+    double& energy;
+};
+HipParallelCalcCustomNonbondedForceKernel::HipParallelCalcCustomNonbondedForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
+        CalcCustomNonbondedForceKernel(name, platform), data(data) {
+    for (int i = 0; i < (int) data.contexts.size(); i++)
+        kernels.push_back(Kernel(new CommonCalcCustomNonbondedForceKernel(name, platform, *data.contexts[i], system)));
+}
+void HipParallelCalcCustomNonbondedForceKernel::initialize(const System& system, const CustomNonbondedForce& force) {
+    for (int i = 0; i < (int) kernels.size(); i++)
+        getKernel(i).initialize(system, force);
+}
+double HipParallelCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    for (int i = 0; i < (int) data.contexts.size(); i++) {
+        HipContext& cu = *data.contexts[i];
+        ComputeContext::WorkThread& thread = cu.getWorkThread();
+        thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
+    }
+    return 0.0;
+}
+void HipParallelCalcCustomNonbondedForceKernel::copyParametersToContext(ContextImpl& context, const CustomNonbondedForce& force) {
+    for (int i = 0; i < (int) kernels.size(); i++)
+        getKernel(i).copyParametersToContext(context, force);
+}
+class HipParallelCalcCustomExternalForceKernel::Task : public HipContext::WorkTask {
+public:
+    Task(ContextImpl& context, CommonCalcCustomExternalForceKernel& kernel, bool includeForce,
+            bool includeEnergy, double& energy) : context(context), kernel(kernel),
+            includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
+    }
+    void execute() {
+        energy += kernel.execute(context, includeForce, includeEnergy);
+    }
+private:
+    ContextImpl& context;
+    CommonCalcCustomExternalForceKernel& kernel;
+    bool includeForce, includeEnergy;
+    double& energy;
+};
+HipParallelCalcCustomExternalForceKernel::HipParallelCalcCustomExternalForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
+        CalcCustomExternalForceKernel(name, platform), data(data) {
+    for (int i = 0; i < (int) data.contexts.size(); i++)
+        kernels.push_back(Kernel(new CommonCalcCustomExternalForceKernel(name, platform, *data.contexts[i], system)));
+}
+void HipParallelCalcCustomExternalForceKernel::initialize(const System& system, const CustomExternalForce& force) {
+    for (int i = 0; i < (int) kernels.size(); i++)
+        getKernel(i).initialize(system, force);
+}
+double HipParallelCalcCustomExternalForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    for (int i = 0; i < (int) data.contexts.size(); i++) {
+        HipContext& cu = *data.contexts[i];
+        ComputeContext::WorkThread& thread = cu.getWorkThread();
+        thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
+    }
+    return 0.0;
+}
+void HipParallelCalcCustomExternalForceKernel::copyParametersToContext(ContextImpl& context, const CustomExternalForce& force) {
+    for (int i = 0; i < (int) kernels.size(); i++)
+        getKernel(i).copyParametersToContext(context, force);
+}
+class HipParallelCalcCustomHbondForceKernel::Task : public HipContext::WorkTask {
+public:
+    Task(ContextImpl& context, CommonCalcCustomHbondForceKernel& kernel, bool includeForce,
+            bool includeEnergy, double& energy) : context(context), kernel(kernel),
+            includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
+    }
+    void execute() {
+        energy += kernel.execute(context, includeForce, includeEnergy);
+    }
+private:
+    ContextImpl& context;
+    CommonCalcCustomHbondForceKernel& kernel;
+    bool includeForce, includeEnergy;
+    double& energy;
+};
+HipParallelCalcCustomHbondForceKernel::HipParallelCalcCustomHbondForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
+        CalcCustomHbondForceKernel(name, platform), data(data) {
+    for (int i = 0; i < (int) data.contexts.size(); i++)
+        kernels.push_back(Kernel(new CommonCalcCustomHbondForceKernel(name, platform, *data.contexts[i], system)));
+}
+void HipParallelCalcCustomHbondForceKernel::initialize(const System& system, const CustomHbondForce& force) {
+    for (int i = 0; i < (int) kernels.size(); i++)
+        getKernel(i).initialize(system, force);
+}
+double HipParallelCalcCustomHbondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    for (int i = 0; i < (int) data.contexts.size(); i++) {
+        HipContext& cu = *data.contexts[i];
+        ComputeContext::WorkThread& thread = cu.getWorkThread();
+        thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
+    }
+    return 0.0;
+}
+void HipParallelCalcCustomHbondForceKernel::copyParametersToContext(ContextImpl& context, const CustomHbondForce& force) {
+    for (int i = 0; i < (int) kernels.size(); i++)
+        getKernel(i).copyParametersToContext(context, force);
+}
+class HipParallelCalcCustomCompoundBondForceKernel::Task : public HipContext::WorkTask {
+public:
+    Task(ContextImpl& context, CommonCalcCustomCompoundBondForceKernel& kernel, bool includeForce,
+            bool includeEnergy, double& energy) : context(context), kernel(kernel),
+            includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
+    }
+    void execute() {
+        energy += kernel.execute(context, includeForce, includeEnergy);
+    }
+private:
+    ContextImpl& context;
+    CommonCalcCustomCompoundBondForceKernel& kernel;
+    bool includeForce, includeEnergy;
+    double& energy;
+};
+HipParallelCalcCustomCompoundBondForceKernel::HipParallelCalcCustomCompoundBondForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
+        CalcCustomCompoundBondForceKernel(name, platform), data(data) {
+    for (int i = 0; i < (int) data.contexts.size(); i++)
+        kernels.push_back(Kernel(new CommonCalcCustomCompoundBondForceKernel(name, platform, *data.contexts[i], system)));
+}
+void HipParallelCalcCustomCompoundBondForceKernel::initialize(const System& system, const CustomCompoundBondForce& force) {
+    for (int i = 0; i < (int) kernels.size(); i++)
+        getKernel(i).initialize(system, force);
+}
+double HipParallelCalcCustomCompoundBondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    for (int i = 0; i < (int) data.contexts.size(); i++) {
+        HipContext& cu = *data.contexts[i];
+        ComputeContext::WorkThread& thread = cu.getWorkThread();
+        thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
+    }
+    return 0.0;
+}
+void HipParallelCalcCustomCompoundBondForceKernel::copyParametersToContext(ContextImpl& context, const CustomCompoundBondForce& force) {
+    for (int i = 0; i < (int) kernels.size(); i++)
+        getKernel(i).copyParametersToContext(context, force);
+}
--- a/platforms/hip/src/HipParameterSet.cpp
+++ b/platforms/hip/src/HipParameterSet.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009-2019 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "HipParameterSet.h"
+using namespace OpenMM;
+using namespace std;
+HipParameterSet::HipParameterSet(HipContext& context, int numParameters, int numObjects, const string& name, bool bufferPerParameter, bool useDoublePrecision) :
+            ComputeParameterSet(context, numParameters, numObjects, name, bufferPerParameter, useDoublePrecision) {
+    for (auto& info : getParameterInfos())
+        buffers.push_back(HipNonbondedUtilities::ParameterInfo(info.getName(), info.getComponentType(), info.getNumComponents(), info.getSize(), context.unwrap(info.getArray()).getDevicePointer()));
+}
--- a/platforms/hip/src/HipPlatform.cpp
+++ b/platforms/hip/src/HipPlatform.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2019 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "HipContext.h"
+#include "HipExpressionUtilities.h"
+#include "HipPlatform.h"
+#include "HipKernelFactory.h"
+#include "HipKernels.h"
+#include "openmm/Context.h"
+#include "openmm/System.h"
+#include "openmm/internal/ContextImpl.h"
+#include "openmm/internal/hardware.h"
+#include <algorithm>
+#include <cctype>
+#include <sstream>
+#include <cstdio>
+#ifdef _MSC_VER
+    #error "Windows unsupported for HIP platform"
+#endif
+using namespace OpenMM;
+using namespace std;
+#define CHECK_RESULT(result, prefix) \
+    if (result != hipSuccess) { \
+        std::stringstream m; \
+        m<<prefix<<": "<<HipContext::getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
+        throw OpenMMException(m.str());\
+    }
+#ifdef OPENMM_COMMON_BUILDING_STATIC_LIBRARY
+extern "C" void registerHipPlatform() {
+    Platform::registerPlatform(new HipPlatform());
+}
+#else
+extern "C" OPENMM_EXPORT_COMMON void registerPlatforms() {
+    Platform::registerPlatform(new HipPlatform());
+}
+#endif
+HipPlatform::HipPlatform() {
+    deprecatedPropertyReplacements["HipDeviceIndex"] = HipDeviceIndex();
+    deprecatedPropertyReplacements["HipDeviceName"] = HipDeviceName();
+    deprecatedPropertyReplacements["HipUseBlockingSync"] = HipUseBlockingSync();
+    deprecatedPropertyReplacements["HipPrecision"] = HipPrecision();
+    deprecatedPropertyReplacements["HipUseCpuPme"] = HipUseCpuPme();
+    deprecatedPropertyReplacements["HipTempDirectory"] = HipTempDirectory();
+    deprecatedPropertyReplacements["HipDisablePmeStream"] = HipDisablePmeStream();
+    deprecatedPropertyReplacements["HipDeterministicForces"] = HipDeterministicForces();
+    HipKernelFactory* factory = new HipKernelFactory();
+    registerKernelFactory(CalcForcesAndEnergyKernel::Name(), factory);
+    registerKernelFactory(UpdateStateDataKernel::Name(), factory);
+    registerKernelFactory(ApplyConstraintsKernel::Name(), factory);
+    registerKernelFactory(VirtualSitesKernel::Name(), factory);
+    registerKernelFactory(CalcHarmonicBondForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomBondForceKernel::Name(), factory);
+    registerKernelFactory(CalcHarmonicAngleForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomAngleForceKernel::Name(), factory);
+    registerKernelFactory(CalcPeriodicTorsionForceKernel::Name(), factory);
+    registerKernelFactory(CalcRBTorsionForceKernel::Name(), factory);
+    registerKernelFactory(CalcCMAPTorsionForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomTorsionForceKernel::Name(), factory);
+    registerKernelFactory(CalcNonbondedForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomNonbondedForceKernel::Name(), factory);
+    registerKernelFactory(CalcGBSAOBCForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomGBForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomExternalForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomHbondForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomCentroidBondForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomCompoundBondForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomCVForceKernel::Name(), factory);
+    registerKernelFactory(CalcRMSDForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomManyParticleForceKernel::Name(), factory);
+    registerKernelFactory(CalcGayBerneForceKernel::Name(), factory);
+    registerKernelFactory(IntegrateVerletStepKernel::Name(), factory);
+    registerKernelFactory(IntegrateNoseHooverStepKernel::Name(), factory);
+    registerKernelFactory(IntegrateLangevinStepKernel::Name(), factory);
+    registerKernelFactory(IntegrateLangevinMiddleStepKernel::Name(), factory);
+    registerKernelFactory(IntegrateBrownianStepKernel::Name(), factory);
+    registerKernelFactory(IntegrateVariableVerletStepKernel::Name(), factory);
+    registerKernelFactory(IntegrateVariableLangevinStepKernel::Name(), factory);
+    registerKernelFactory(IntegrateCustomStepKernel::Name(), factory);
+    registerKernelFactory(ApplyAndersenThermostatKernel::Name(), factory);
+    registerKernelFactory(ApplyMonteCarloBarostatKernel::Name(), factory);
+    registerKernelFactory(RemoveCMMotionKernel::Name(), factory);
+    platformProperties.push_back(HipDeviceIndex());
+    platformProperties.push_back(HipDeviceName());
+    platformProperties.push_back(HipUseBlockingSync());
+    platformProperties.push_back(HipPrecision());
+    platformProperties.push_back(HipUseCpuPme());
+    platformProperties.push_back(HipCompiler());
+    platformProperties.push_back(HipTempDirectory());
+    platformProperties.push_back(HipHostCompiler());
+    platformProperties.push_back(HipDisablePmeStream());
+    platformProperties.push_back(HipDeterministicForces());
+    setPropertyDefaultValue(HipDeviceIndex(), "");
+    setPropertyDefaultValue(HipDeviceName(), "");
+    setPropertyDefaultValue(HipUseBlockingSync(), "true");
+    setPropertyDefaultValue(HipPrecision(), "single");
+    setPropertyDefaultValue(HipUseCpuPme(), "false");
+    setPropertyDefaultValue(HipDisablePmeStream(), "false");
+    setPropertyDefaultValue(HipDeterministicForces(), "false");
+    char* compiler = getenv("OPENMM_HIP_COMPILER");
+    char* rocm_path = getenv("ROCM_PATH");
+    string hipcc;
+    if (rocm_path != NULL) {
+        hipcc = string(rocm_path) + "/bin/hipcc";
+    } else if (compiler != NULL) {
+        hipcc = compiler;
+    } else {
+        hipcc = "/opt/rocm/bin/hipcc";
+    }
+    setPropertyDefaultValue(HipCompiler(), hipcc);
+    char* tmpdir = getenv("TMPDIR");
+    string tmp = (tmpdir == NULL ? string(P_tmpdir) : string(tmpdir));
+    setPropertyDefaultValue(HipTempDirectory(), tmp);
+    char* hostCompiler = getenv("HIP_HOST_COMPILER");
+    setPropertyDefaultValue(HipHostCompiler(), (hostCompiler == NULL ? "" : string(hostCompiler)));
+}
+double HipPlatform::getSpeed() const {
+    return 100;
+}
+bool HipPlatform::supportsDoublePrecision() const {
+    return true;
+}
+const string& HipPlatform::getPropertyValue(const Context& context, const string& property) const {
+    const ContextImpl& impl = getContextImpl(context);
+    const PlatformData* data = reinterpret_cast<const PlatformData*>(impl.getPlatformData());
+    string propertyName = property;
+    if (deprecatedPropertyReplacements.find(property) != deprecatedPropertyReplacements.end())
+        propertyName = deprecatedPropertyReplacements.find(property)->second;
+    map<string, string>::const_iterator value = data->propertyValues.find(propertyName);
+    if (value != data->propertyValues.end())
+        return value->second;
+    return Platform::getPropertyValue(context, property);
+}
+void HipPlatform::setPropertyValue(Context& context, const string& property, const string& value) const {
+}
+void HipPlatform::contextCreated(ContextImpl& context, const map<string, string>& properties) const {
+    const string& devicePropValue = (properties.find(HipDeviceIndex()) == properties.end() ?
+            getPropertyDefaultValue(HipDeviceIndex()) : properties.find(HipDeviceIndex())->second);
+    string blockingPropValue = (properties.find(HipUseBlockingSync()) == properties.end() ?
+            getPropertyDefaultValue(HipUseBlockingSync()) : properties.find(HipUseBlockingSync())->second);
+    string precisionPropValue = (properties.find(HipPrecision()) == properties.end() ?
+            getPropertyDefaultValue(HipPrecision()) : properties.find(HipPrecision())->second);
+    string cpuPmePropValue = (properties.find(HipUseCpuPme()) == properties.end() ?
+            getPropertyDefaultValue(HipUseCpuPme()) : properties.find(HipUseCpuPme())->second);
+    const string& compilerPropValue = (properties.find(HipCompiler()) == properties.end() ?
+            getPropertyDefaultValue(HipCompiler()) : properties.find(HipCompiler())->second);
+    const string& tempPropValue = (properties.find(HipTempDirectory()) == properties.end() ?
+            getPropertyDefaultValue(HipTempDirectory()) : properties.find(HipTempDirectory())->second);
+    const string& hostCompilerPropValue = (properties.find(HipHostCompiler()) == properties.end() ?
+            getPropertyDefaultValue(HipHostCompiler()) : properties.find(HipHostCompiler())->second);
+    string pmeStreamPropValue = (properties.find(HipDisablePmeStream()) == properties.end() ?
+            getPropertyDefaultValue(HipDisablePmeStream()) : properties.find(HipDisablePmeStream())->second);
+    string deterministicForcesValue = (properties.find(HipDeterministicForces()) == properties.end() ?
+            getPropertyDefaultValue(HipDeterministicForces()) : properties.find(HipDeterministicForces())->second);
+    transform(blockingPropValue.begin(), blockingPropValue.end(), blockingPropValue.begin(), ::tolower);
+    transform(precisionPropValue.begin(), precisionPropValue.end(), precisionPropValue.begin(), ::tolower);
+    transform(cpuPmePropValue.begin(), cpuPmePropValue.end(), cpuPmePropValue.begin(), ::tolower);
+    transform(pmeStreamPropValue.begin(), pmeStreamPropValue.end(), pmeStreamPropValue.begin(), ::tolower);
+    transform(deterministicForcesValue.begin(), deterministicForcesValue.end(), deterministicForcesValue.begin(), ::tolower);
+    vector<string> pmeKernelName;
+    pmeKernelName.push_back(CalcPmeReciprocalForceKernel::Name());
+    if (!supportsKernels(pmeKernelName))
+        cpuPmePropValue = "false";
+    int threads = getNumProcessors();
+    char* threadsEnv = getenv("OPENMM_CPU_THREADS");
+    if (threadsEnv != NULL)
+        stringstream(threadsEnv) >> threads;
+    context.setPlatformData(new PlatformData(&context, context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, cpuPmePropValue, compilerPropValue, tempPropValue,
+            hostCompilerPropValue, pmeStreamPropValue, deterministicForcesValue, threads, NULL));
+}
+void HipPlatform::linkedContextCreated(ContextImpl& context, ContextImpl& originalContext) const {
+    Platform& platform = originalContext.getPlatform();
+    string devicePropValue = platform.getPropertyValue(originalContext.getOwner(), HipDeviceIndex());
+    string blockingPropValue = platform.getPropertyValue(originalContext.getOwner(), HipUseBlockingSync());
+    string precisionPropValue = platform.getPropertyValue(originalContext.getOwner(), HipPrecision());
+    string cpuPmePropValue = platform.getPropertyValue(originalContext.getOwner(), HipUseCpuPme());
+    string compilerPropValue = platform.getPropertyValue(originalContext.getOwner(), HipCompiler());
+    string tempPropValue = platform.getPropertyValue(originalContext.getOwner(), HipTempDirectory());
+    string hostCompilerPropValue = platform.getPropertyValue(originalContext.getOwner(), HipHostCompiler());
+    string pmeStreamPropValue = platform.getPropertyValue(originalContext.getOwner(), HipDisablePmeStream());
+    string deterministicForcesValue = platform.getPropertyValue(originalContext.getOwner(), HipDeterministicForces());
+    int threads = reinterpret_cast<PlatformData*>(originalContext.getPlatformData())->threads.getNumThreads();
+    context.setPlatformData(new PlatformData(&context, context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, cpuPmePropValue, compilerPropValue, tempPropValue,
+            hostCompilerPropValue, pmeStreamPropValue, deterministicForcesValue, threads, &originalContext));
+}
+void HipPlatform::contextDestroyed(ContextImpl& context) const {
+    PlatformData* data = reinterpret_cast<PlatformData*>(context.getPlatformData());
+    delete data;
+}
+HipPlatform::PlatformData::PlatformData(ContextImpl* context, const System& system, const string& deviceIndexProperty, const string& blockingProperty, const string& precisionProperty,
+            const string& cpuPmeProperty, const string& compilerProperty, const string& tempProperty, const string& hostCompilerProperty, const string& pmeStreamProperty,
+            const string& deterministicForcesProperty, int numThreads, ContextImpl* originalContext) :
+                context(context), removeCM(false), stepCount(0), computeForceCount(0), time(0.0), hasInitializedContexts(false), threads(numThreads) {
+    bool blocking = (blockingProperty == "true");
+    vector<string> devices;
+    size_t searchPos = 0, nextPos;
+    while ((nextPos = deviceIndexProperty.find_first_of(", ", searchPos)) != string::npos) {
+        devices.push_back(deviceIndexProperty.substr(searchPos, nextPos-searchPos));
+        searchPos = nextPos+1;
+    }
+    devices.push_back(deviceIndexProperty.substr(searchPos));
+    PlatformData* originalData = NULL;
+    if (originalContext != NULL)
+        originalData = reinterpret_cast<PlatformData*>(originalContext->getPlatformData());
+    try {
+        for (int i = 0; i < (int) devices.size(); i++) {
+            if (devices[i].length() > 0) {
+                int deviceIndex;
+                stringstream(devices[i]) >> deviceIndex;
+                contexts.push_back(new HipContext(system, deviceIndex, blocking, precisionProperty, compilerProperty, tempProperty, hostCompilerProperty, *this, (originalData == NULL ? NULL : originalData->contexts[i])));
+            }
+        }
+        if (contexts.size() == 0)
+            contexts.push_back(new HipContext(system, -1, blocking, precisionProperty, compilerProperty, tempProperty, hostCompilerProperty, *this, (originalData == NULL ? NULL : originalData->contexts[0])));
+    }
+    catch (...) {
+        // If an exception was thrown, do our best to clean up memory.
+        for (int i = 0; i < (int) contexts.size(); i++)
+            delete contexts[i];
+        throw;
+    }
+    stringstream deviceIndex, deviceName;
+    for (int i = 0; i < (int) contexts.size(); i++) {
+        if (i > 0) {
+            deviceIndex << ',';
+            deviceName << ',';
+        }
+        deviceIndex << contexts[i]->getDeviceIndex();
+        char name[1000];
+        CHECK_RESULT(hipDeviceGetName(name, 1000, contexts[i]->getDevice()), "Error querying device name");
+        deviceName << name;
+    }
+    useCpuPme = (cpuPmeProperty == "true" && !contexts[0]->getUseDoublePrecision());
+    disablePmeStream = (pmeStreamProperty == "true");
+    deterministicForces = (deterministicForcesProperty == "true");
+    propertyValues[HipPlatform::HipDeviceIndex()] = deviceIndex.str();
+    propertyValues[HipPlatform::HipDeviceName()] = deviceName.str();
+    propertyValues[HipPlatform::HipUseBlockingSync()] = blocking ? "true" : "false";
+    propertyValues[HipPlatform::HipPrecision()] = precisionProperty;
+    propertyValues[HipPlatform::HipUseCpuPme()] = useCpuPme ? "true" : "false";
+    propertyValues[HipPlatform::HipCompiler()] = compilerProperty;
+    propertyValues[HipPlatform::HipTempDirectory()] = tempProperty;
+    propertyValues[HipPlatform::HipHostCompiler()] = hostCompilerProperty;
+    propertyValues[HipPlatform::HipDisablePmeStream()] = disablePmeStream ? "true" : "false";
+    propertyValues[HipPlatform::HipDeterministicForces()] = deterministicForces ? "true" : "false";
+    contextEnergy.resize(contexts.size());
+    // Determine whether peer-to-peer copying is supported, and enable it if so.
+    peerAccessSupported = true;
+    for (int i = 1; i < contexts.size(); i++) {
+        int canAccess;
+        hipDeviceCanAccessPeer(&canAccess, contexts[i]->getDevice(), contexts[0]->getDevice());
+        if (!canAccess) {
+            peerAccessSupported = false;
+            break;
+        }
+    }
+}
+HipPlatform::PlatformData::~PlatformData() {
+    for (int i = 0; i < (int) contexts.size(); i++)
+        delete contexts[i];
+}
+void HipPlatform::PlatformData::initializeContexts(const System& system) {
+    if (hasInitializedContexts)
+        return;
+    for (int i = 0; i < (int) contexts.size(); i++)
+        contexts[i]->initialize();
+    hasInitializedContexts = true;
+}
+void HipPlatform::PlatformData::syncContexts() {
+    for (int i = 0; i < (int) contexts.size(); i++)
+        contexts[i]->getWorkThread().flush();
+}
--- a/platforms/hip/src/HipProgram.cpp
+++ b/platforms/hip/src/HipProgram.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2019 Stanford University and the Authors.           *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "HipProgram.h"
+#include "HipKernel.h"
+using namespace OpenMM;
+using namespace std;
+HipProgram::HipProgram(HipContext& context, hipModule_t module) : context(context), module(module) {
+}
+ComputeKernel HipProgram::createKernel(const string& name) {
+    hipFunction_t kernel = context.getKernel(module, name.c_str());
+    return shared_ptr<ComputeKernelImpl>(new HipKernel(context, kernel, name));
+}
--- a/platforms/hip/src/HipSort.cpp
+++ b/platforms/hip/src/HipSort.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2010-2018 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "HipSort.h"
+#include "HipKernelSources.h"
+#include <algorithm>
+#include <map>
+using namespace OpenMM;
+using namespace std;
+HipSort::HipSort(HipContext& context, SortTrait* trait, unsigned int length) : context(context), trait(trait), dataLength(length) {
+    // Create kernels.
+    map<string, string> replacements;
+    replacements["DATA_TYPE"] = trait->getDataType();
+    replacements["KEY_TYPE"] =  trait->getKeyType();
+    replacements["SORT_KEY"] = trait->getSortKey();
+    replacements["MIN_KEY"] = trait->getMinKey();
+    replacements["MAX_KEY"] = trait->getMaxKey();
+    replacements["MAX_VALUE"] = trait->getMaxValue();
+    hipModule_t module = context.createModule(context.replaceStrings(HipKernelSources::sort, replacements));
+    shortListKernel = context.getKernel(module, "sortShortList");
+    shortList2Kernel = context.getKernel(module, "sortShortList2");
+    computeRangeKernel = context.getKernel(module, "computeRange");
+    assignElementsKernel = context.getKernel(module, "assignElementsToBuckets");
+    computeBucketPositionsKernel = context.getKernel(module, "computeBucketPositions");
+    copyToBucketsKernel = context.getKernel(module, "copyDataToBuckets");
+    sortBucketsKernel = context.getKernel(module, "sortBuckets");
+    // Work out the work group sizes for various kernels.
+    int maxBlockSize;
+    hipDeviceGetAttribute(&maxBlockSize, hipDeviceAttributeMaxBlockDimX, context.getDevice());
+    int maxSharedMem;
+    hipDeviceGetAttribute(&maxSharedMem, hipDeviceAttributeMaxSharedMemoryPerBlock, context.getDevice());
+    int maxLocalBuffer = (maxSharedMem/trait->getDataSize())/2;
+    int maxShortList = min(3000, max(maxLocalBuffer, HipContext::ThreadBlockSize*context.getNumThreadBlocks()));
+    isShortList = (length <= maxShortList);
+    for (rangeKernelSize = 1; rangeKernelSize*2 <= maxBlockSize; rangeKernelSize *= 2)
+        ;
+    positionsKernelSize = rangeKernelSize;
+    sortKernelSize = (isShortList ? rangeKernelSize/2 : rangeKernelSize/4);
+    if (rangeKernelSize > length)
+        rangeKernelSize = length;
+    if (sortKernelSize > maxLocalBuffer)
+        sortKernelSize = maxLocalBuffer;
+    unsigned int targetBucketSize = sortKernelSize/2;
+    unsigned int numBuckets = length/targetBucketSize;
+    if (numBuckets < 1)
+        numBuckets = 1;
+    if (positionsKernelSize > numBuckets)
+        positionsKernelSize = numBuckets;
+    // Create workspace arrays.
+    if (!isShortList) {
+        dataRange.initialize(context, 2, trait->getKeySize(), "sortDataRange");
+        bucketOffset.initialize<uint1>(context, numBuckets, "bucketOffset");
+        bucketOfElement.initialize<uint1>(context, length, "bucketOfElement");
+        offsetInBucket.initialize<uint1>(context, length, "offsetInBucket");
+    }
+    buckets.initialize(context, length, trait->getDataSize(), "buckets");
+}
+HipSort::~HipSort() {
+    delete trait;
+}
+void HipSort::sort(HipArray& data) {
+    if (data.getSize() != dataLength || data.getElementSize() != trait->getDataSize())
+        throw OpenMMException("HipSort called with different data size");
+    if (data.getSize() == 0)
+        return;
+    if (isShortList) {
+        // We can use a simpler sort kernel that does the entire operation in one kernel.
+        if (dataLength <= HipContext::ThreadBlockSize*context.getNumThreadBlocks()) {
+            void* sortArgs[] = {&data.getDevicePointer(), &buckets.getDevicePointer(), &dataLength};
+            context.executeKernel(shortList2Kernel, sortArgs, dataLength);
+            buckets.copyTo(data);
+        }
+        else {
+            void* sortArgs[] = {&data.getDevicePointer(), &dataLength};
+            context.executeKernel(shortListKernel, sortArgs, sortKernelSize, sortKernelSize, dataLength*trait->getDataSize());
+        }
+    }
+    else {
+        // Compute the range of data values.
+        unsigned int numBuckets = bucketOffset.getSize();
+        void* rangeArgs[] = {&data.getDevicePointer(), &dataLength, &dataRange.getDevicePointer(), &numBuckets, &bucketOffset.getDevicePointer()};
+        context.executeKernel(computeRangeKernel, rangeArgs, rangeKernelSize, rangeKernelSize, 2*rangeKernelSize*trait->getKeySize());
+        // Assign array elements to buckets.
+        void* elementsArgs[] = {&data.getDevicePointer(), &dataLength, &numBuckets, &dataRange.getDevicePointer(),
+                &bucketOffset.getDevicePointer(), &bucketOfElement.getDevicePointer(), &offsetInBucket.getDevicePointer()};
+        context.executeKernel(assignElementsKernel, elementsArgs, data.getSize(), 128);
+        // Compute the position of each bucket.
+        void* computeArgs[] = {&numBuckets, &bucketOffset.getDevicePointer()};
+        context.executeKernel(computeBucketPositionsKernel, computeArgs, positionsKernelSize, positionsKernelSize, positionsKernelSize*sizeof(int));
+        // Copy the data into the buckets.
+        void* copyArgs[] = {&data.getDevicePointer(), &buckets.getDevicePointer(), &dataLength, &bucketOffset.getDevicePointer(),
+                &bucketOfElement.getDevicePointer(), &offsetInBucket.getDevicePointer()};
+        context.executeKernel(copyToBucketsKernel, copyArgs, data.getSize());
+        // Sort each bucket.
+        void* sortArgs[] = {&data.getDevicePointer(), &buckets.getDevicePointer(), &numBuckets, &bucketOffset.getDevicePointer()};
+        context.executeKernel(sortBucketsKernel, sortArgs, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize, sortKernelSize*trait->getDataSize());
+    }
+}
--- a/platforms/hip/src/kernels/common.hip
+++ b/platforms/hip/src/kernels/common.hip
+/**
+ * This file contains HIP definitions for the macros and functions needed for the
+ * common compute framework.
+ */
+#define KERNEL extern "C" __global__
+#define DEVICE __device__
+#define LOCAL __shared__
+#define LOCAL_ARG
+#define GLOBAL
+#define RESTRICT __restrict__
+#define LOCAL_ID threadIdx.x
+#define LOCAL_SIZE blockDim.x
+#define GLOBAL_ID (blockIdx.x*blockDim.x+threadIdx.x)
+#define GLOBAL_SIZE (blockDim.x*gridDim.x)
+#define GROUP_ID blockIdx.x
+#define NUM_GROUPS gridDim.x
+#define SYNC_THREADS __syncthreads();
+#define MEM_FENCE __threadfence_block();
+#define ATOMIC_ADD(dest, value) atomicAdd(dest, value)
+typedef long long mm_long;
+typedef unsigned long long mm_ulong;
+#define SUPPORTS_64_BIT_ATOMICS __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__
+#define SUPPORTS_DOUBLE_PRECISION __HIP_ARCH_HAS_DOUBLES__
--- a/platforms/hip/src/kernels/fft.hip
+++ b/platforms/hip/src/kernels/fft.hip
+static __inline__ __device__ real2 multiplyComplex(real2 c1, real2 c2) {
+    return make_real2(c1.x*c2.x-c1.y*c2.y, c1.x*c2.y+c1.y*c2.x);
+}
+/**
+ * Load a value from the half-complex grid produces by a real-to-complex transform.
+ */
+static __inline__ __device__ real2 loadComplexValue(const real2* __restrict__ in, int x, int y, int z) {
+    const int inputZSize = ZSIZE/2+1;
+    if (z < inputZSize)
+        return in[x*YSIZE*inputZSize+y*inputZSize+z];
+    int xp = (x == 0 ? 0 : XSIZE-x);
+    int yp = (y == 0 ? 0 : YSIZE-y);
+    real2 value = in[xp*YSIZE*inputZSize+yp*inputZSize+(ZSIZE-z)];
+    return make_real2(value.x, -value.y);
+}
+/**
+ * Perform a 1D FFT on each row along one axis.
+ */
+extern "C" __global__ void execFFT(const INPUT_TYPE* __restrict__ in, OUTPUT_TYPE* __restrict__ out) {
+    __shared__ real2 w[ZSIZE];
+    __shared__ real2 data0[BLOCKS_PER_GROUP*ZSIZE];
+    __shared__ real2 data1[BLOCKS_PER_GROUP*ZSIZE];
+    for (int i = threadIdx.x; i < ZSIZE; i += blockDim.x)
+        w[i] = make_real2(cos(-(SIGN)*i*2*M_PI/ZSIZE), sin(-(SIGN)*i*2*M_PI/ZSIZE));
+    __syncthreads();
+    const int block = threadIdx.x/THREADS_PER_BLOCK;
+    for (int baseIndex = blockIdx.x*BLOCKS_PER_GROUP; baseIndex < XSIZE*YSIZE; baseIndex += gridDim.x*BLOCKS_PER_GROUP) {
+        int index = baseIndex+block;
+        int x = index/YSIZE;
+        int y = index-x*YSIZE;
+#if OUTPUT_IS_PACKED
+        if (x < XSIZE/2+1) {
+#endif
+        if (index < XSIZE*YSIZE)
+            for (int i = threadIdx.x-block*THREADS_PER_BLOCK; i < ZSIZE; i += THREADS_PER_BLOCK)
+    #if INPUT_IS_REAL
+                data0[i+block*ZSIZE] = make_real2(in[x*(YSIZE*ZSIZE)+y*ZSIZE+i], 0);
+    #elif INPUT_IS_PACKED
+                data0[i+block*ZSIZE] = loadComplexValue(in, x, y, i);
+    #else
+                data0[i+block*ZSIZE] = in[x*(YSIZE*ZSIZE)+y*ZSIZE+i];
+    #endif
+#if OUTPUT_IS_PACKED
+        }
+#endif
+        __syncthreads();
+        COMPUTE_FFT
+    }
+}
--- a/platforms/hip/src/kernels/fftR2C.hip
+++ b/platforms/hip/src/kernels/fftR2C.hip
+/**
+ * Combine the two halves of a real grid into a complex grid that is half as large.
+ */
+extern "C" __global__ void packForwardData(const real* __restrict__ in, real2* __restrict__ out) {
+    const int gridSize = PACKED_XSIZE*PACKED_YSIZE*PACKED_ZSIZE;
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) {
+        int x = index/(PACKED_YSIZE*PACKED_ZSIZE);
+        int remainder = index-x*(PACKED_YSIZE*PACKED_ZSIZE);
+        int y = remainder/PACKED_ZSIZE;
+        int z = remainder-y*PACKED_ZSIZE;
+#if PACKED_AXIS == 0
+        real2 value = make_real2(in[2*x*YSIZE*ZSIZE+y*ZSIZE+z], in[(2*x+1)*YSIZE*ZSIZE+y*ZSIZE+z]);
+#elif PACKED_AXIS == 1
+        real2 value = make_real2(in[x*YSIZE*ZSIZE+2*y*ZSIZE+z], in[x*YSIZE*ZSIZE+(2*y+1)*ZSIZE+z]);
+#else
+        real2 value = make_real2(in[x*YSIZE*ZSIZE+y*ZSIZE+2*z], in[x*YSIZE*ZSIZE+y*ZSIZE+(2*z+1)]);
+#endif
+        out[index] = value;
+    }
+}
+/**
+ * Split the transformed data back into a full sized, symmetric grid.
+ */
+extern "C" __global__ void unpackForwardData(const real2* __restrict__ in, real2* __restrict__ out) {
+    // Compute the phase factors.
+#if PACKED_AXIS == 0
+    __shared__ real2 w[PACKED_XSIZE];
+    for (int i = threadIdx.x; i < PACKED_XSIZE; i += blockDim.x)
+        w[i] = make_real2(sin(i*2*M_PI/XSIZE), cos(i*2*M_PI/XSIZE));
+#elif PACKED_AXIS == 1
+    __shared__ real2 w[PACKED_YSIZE];
+    for (int i = threadIdx.x; i < PACKED_YSIZE; i += blockDim.x)
+        w[i] = make_real2(sin(i*2*M_PI/YSIZE), cos(i*2*M_PI/YSIZE));
+#else
+    __shared__ real2 w[PACKED_ZSIZE];
+    for (int i = threadIdx.x; i < PACKED_ZSIZE; i += blockDim.x)
+        w[i] = make_real2(sin(i*2*M_PI/ZSIZE), cos(i*2*M_PI/ZSIZE));
+#endif
+    __syncthreads();
+    // Transform the data.
+    const int gridSize = PACKED_XSIZE*PACKED_YSIZE*PACKED_ZSIZE;
+    const int outputZSize = ZSIZE/2+1;
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) {
+        int x = index/(PACKED_YSIZE*PACKED_ZSIZE);
+        int remainder = index-x*(PACKED_YSIZE*PACKED_ZSIZE);
+        int y = remainder/PACKED_ZSIZE;
+        int z = remainder-y*PACKED_ZSIZE;
+        int xp = (x == 0 ? 0 : PACKED_XSIZE-x);
+        int yp = (y == 0 ? 0 : PACKED_YSIZE-y);
+        int zp = (z == 0 ? 0 : PACKED_ZSIZE-z);
+        real2 z1 = in[x*PACKED_YSIZE*PACKED_ZSIZE+y*PACKED_ZSIZE+z];
+        real2 z2 = in[xp*PACKED_YSIZE*PACKED_ZSIZE+yp*PACKED_ZSIZE+zp];
+#if PACKED_AXIS == 0
+        real2 wfac = w[x];
+#elif PACKED_AXIS == 1
+        real2 wfac = w[y];
+#else
+        real2 wfac = w[z];
+#endif
+        real2 output = make_real2((z1.x+z2.x - wfac.x*(z1.x-z2.x) + wfac.y*(z1.y+z2.y))/2, (z1.y-z2.y - wfac.y*(z1.x-z2.x) - wfac.x*(z1.y+z2.y))/2);
+        if (z < outputZSize)
+            out[x*YSIZE*outputZSize+y*outputZSize+z] = output;
+        xp = (x == 0 ? 0 : XSIZE-x);
+        yp = (y == 0 ? 0 : YSIZE-y);
+        zp = (z == 0 ? 0 : ZSIZE-z);
+        if (zp < outputZSize) {
+#if PACKED_AXIS == 0
+            if (x == 0)
+                out[PACKED_XSIZE*YSIZE*outputZSize+yp*outputZSize+zp] = make_real2((z1.x-z1.y+z2.x-z2.y)/2, (-z1.x-z1.y+z2.x+z2.y)/2);
+#elif PACKED_AXIS == 1
+            if (y == 0)
+                out[xp*YSIZE*outputZSize+PACKED_YSIZE*outputZSize+zp] = make_real2((z1.x-z1.y+z2.x-z2.y)/2, (-z1.x-z1.y+z2.x+z2.y)/2);
+#else
+            if (z == 0)
+                out[xp*YSIZE*outputZSize+yp*outputZSize+PACKED_ZSIZE] = make_real2((z1.x-z1.y+z2.x-z2.y)/2, (-z1.x-z1.y+z2.x+z2.y)/2);
+#endif
+            else
+                out[xp*YSIZE*outputZSize+yp*outputZSize+zp] = make_real2(output.x, -output.y);
+        }
+    }
+}
+/**
+ * Load a value from the half-complex grid produced by a real-to-complex transform.
+ */
+static __inline__ __device__ real2 loadComplexValue(const real2* __restrict__ in, int x, int y, int z) {
+    const int inputZSize = ZSIZE/2+1;
+    if (z < inputZSize)
+        return in[x*YSIZE*inputZSize+y*inputZSize+z];
+    int xp = (x == 0 ? 0 : XSIZE-x);
+    int yp = (y == 0 ? 0 : YSIZE-y);
+    real2 value = in[xp*YSIZE*inputZSize+yp*inputZSize+(ZSIZE-z)];
+    return make_real2(value.x, -value.y);
+}
+/**
+ * Repack the symmetric complex grid into one half as large in preparation for doing an inverse complex-to-real transform.
+ */
+extern "C" __global__ void packBackwardData(const real2* __restrict__ in, real2* __restrict__ out) {
+    // Compute the phase factors.
+#if PACKED_AXIS == 0
+    __shared__ real2 w[PACKED_XSIZE];
+    for (int i = threadIdx.x; i < PACKED_XSIZE; i += blockDim.x)
+        w[i] = make_real2(cos(i*2*M_PI/XSIZE), sin(i*2*M_PI/XSIZE));
+#elif PACKED_AXIS == 1
+    __shared__ real2 w[PACKED_YSIZE];
+    for (int i = threadIdx.x; i < PACKED_YSIZE; i += blockDim.x)
+        w[i] = make_real2(cos(i*2*M_PI/YSIZE), sin(i*2*M_PI/YSIZE));
+#else
+    __shared__ real2 w[PACKED_ZSIZE];
+    for (int i = threadIdx.x; i < PACKED_ZSIZE; i += blockDim.x)
+        w[i] = make_real2(cos(i*2*M_PI/ZSIZE), sin(i*2*M_PI/ZSIZE));
+#endif
+    __syncthreads();
+    // Transform the data.
+    const int gridSize = PACKED_XSIZE*PACKED_YSIZE*PACKED_ZSIZE;
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) {
+        int x = index/(PACKED_YSIZE*PACKED_ZSIZE);
+        int remainder = index-x*(PACKED_YSIZE*PACKED_ZSIZE);
+        int y = remainder/PACKED_ZSIZE;
+        int z = remainder-y*PACKED_ZSIZE;
+        int xp = (x == 0 ? 0 : PACKED_XSIZE-x);
+        int yp = (y == 0 ? 0 : PACKED_YSIZE-y);
+        int zp = (z == 0 ? 0 : PACKED_ZSIZE-z);
+        real2 z1 = loadComplexValue(in, x, y, z);
+#if PACKED_AXIS == 0
+        real2 wfac = w[x];
+        real2 z2 = loadComplexValue(in, PACKED_XSIZE-x, yp, zp);
+#elif PACKED_AXIS == 1
+        real2 wfac = w[y];
+        real2 z2 = loadComplexValue(in, xp, PACKED_YSIZE-y, zp);
+#else
+        real2 wfac = w[z];
+        real2 z2 = loadComplexValue(in, xp, yp, PACKED_ZSIZE-z);
+#endif
+        real2 even = make_real2((z1.x+z2.x)/2, (z1.y-z2.y)/2);
+        real2 odd = make_real2((z1.x-z2.x)/2, (z1.y+z2.y)/2);
+        odd = make_real2(odd.x*wfac.x-odd.y*wfac.y, odd.y*wfac.x+odd.x*wfac.y);
+        out[x*PACKED_YSIZE*PACKED_ZSIZE+y*PACKED_ZSIZE+z] = make_real2(even.x-odd.y, even.y+odd.x);
+    }
+}
+/**
+ * Split the data back into a full sized, real grid after an inverse transform.
+ */
+extern "C" __global__ void unpackBackwardData(const real2* __restrict__ in, real* __restrict__ out) {
+    const int gridSize = PACKED_XSIZE*PACKED_YSIZE*PACKED_ZSIZE;
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) {
+        int x = index/(PACKED_YSIZE*PACKED_ZSIZE);
+        int remainder = index-x*(PACKED_YSIZE*PACKED_ZSIZE);
+        int y = remainder/PACKED_ZSIZE;
+        int z = remainder-y*PACKED_ZSIZE;
+        real2 value = 2*in[index];
+#if PACKED_AXIS == 0
+        out[2*x*YSIZE*ZSIZE+y*ZSIZE+z] = value.x;
+        out[(2*x+1)*YSIZE*ZSIZE+y*ZSIZE+z] = value.y;
+#elif PACKED_AXIS == 1
+        out[x*YSIZE*ZSIZE+2*y*ZSIZE+z] = value.x;
+        out[x*YSIZE*ZSIZE+(2*y+1)*ZSIZE+z] = value.y;
+#else
+        out[x*YSIZE*ZSIZE+y*ZSIZE+2*z] = value.x;
+        out[x*YSIZE*ZSIZE+y*ZSIZE+(2*z+1)] = value.y;
+#endif
+    }
+}
--- a/platforms/hip/src/kernels/findInteractingBlocks.hip
+++ b/platforms/hip/src/kernels/findInteractingBlocks.hip
+#define GROUP_SIZE 256
+#define BUFFER_SIZE 256
+__device__ inline int __TILEFLAGS_FFS(unsigned long long int x) {
+    return __ffsll(x);
+}
+__device__ inline int __TILEFLAGS_CLZ(unsigned long long int x) {
+    return __clzll(x);
+}
+__device__ inline int __TILEFLAGS_POPC(unsigned long long int x) {
+    return __popcll(x);
+}
+/**
+ * Find a bounding box for the atoms in each block.
+ */
+extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
+        const real4* __restrict__ posq, real4* __restrict__ blockCenter, real4* __restrict__ blockBoundingBox, int* __restrict__ rebuildNeighborList,
+        real2* __restrict__ sortedBlocks) {
+    int index = blockIdx.x*blockDim.x+threadIdx.x;
+    int base = index*TILE_SIZE;
+    while (base < numAtoms) {
+        real4 pos = posq[base];
+#ifdef USE_PERIODIC
+        APPLY_PERIODIC_TO_POS(pos)
+#endif
+        real4 minPos = pos;
+        real4 maxPos = pos;
+        int last = min(base+TILE_SIZE, numAtoms);
+        for (int i = base+1; i < last; i++) {
+            pos = posq[i];
+#ifdef USE_PERIODIC
+            real4 center = 0.5f*(maxPos+minPos);
+            APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)
+#endif
+            minPos = make_real4(min(minPos.x,pos.x), min(minPos.y,pos.y), min(minPos.z,pos.z), 0);
+            maxPos = make_real4(max(maxPos.x,pos.x), max(maxPos.y,pos.y), max(maxPos.z,pos.z), 0);
+        }
+        real4 blockSize = 0.5f*(maxPos-minPos);
+        real4 center = 0.5f*(maxPos+minPos);
+        center.w = 0;
+        for (int i = base; i < last; i++) {
+            pos = posq[i];
+            real4 delta = posq[i]-center;
+#ifdef USE_PERIODIC
+            APPLY_PERIODIC_TO_DELTA(delta)
+#endif
+            center.w = max(center.w, delta.x*delta.x+delta.y*delta.y+delta.z*delta.z);
+        }
+        center.w = sqrt(center.w);
+        blockBoundingBox[index] = blockSize;
+        blockCenter[index] = center;
+        sortedBlocks[index] = make_real2(blockSize.x+blockSize.y+blockSize.z, index);
+        index += blockDim.x*gridDim.x;
+        base = index*TILE_SIZE;
+    }
+    if (blockIdx.x == 0 && threadIdx.x == 0)
+        rebuildNeighborList[0] = 0;
+}
+/**
+ * Sort the data about bounding boxes so it can be accessed more efficiently in the next kernel.
+ */
+extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, const real4* __restrict__ blockCenter,
+        const real4* __restrict__ blockBoundingBox, real4* __restrict__ sortedBlockCenter,
+        real4* __restrict__ sortedBlockBoundingBox, const real4* __restrict__ posq, const real4* __restrict__ oldPositions,
+        unsigned int* __restrict__ interactionCount, int* __restrict__ rebuildNeighborList, bool forceRebuild) {
+    for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_BLOCKS; i += blockDim.x*gridDim.x) {
+        int index = (int) sortedBlock[i].y;
+        sortedBlockCenter[i] = blockCenter[index];
+        sortedBlockBoundingBox[i] = blockBoundingBox[index];
+    }
+    // Also check whether any atom has moved enough so that we really need to rebuild the neighbor list.
+    bool rebuild = forceRebuild;
+    for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
+        real4 delta = oldPositions[i]-posq[i];
+        if (delta.x*delta.x + delta.y*delta.y + delta.z*delta.z > 0.25f*PADDING*PADDING)
+            rebuild = true;
+    }
+    if (rebuild) {
+        rebuildNeighborList[0] = 1;
+        interactionCount[0] = 0;
+        interactionCount[1] = 0;
+    }
+}
+__device__ int saveSinglePairs(int x, int* atoms, tileflags* flags, int length, unsigned int maxSinglePairs, unsigned int* singlePairCount, int2* singlePairs, int* sumBuffer, volatile int& pairStartIndex) {
+    // Record interactions that should be computed as single pairs rather than in blocks.
+    const int indexInWarp = threadIdx.x%warpSize;
+    int sum = 0;
+    for (int i = indexInWarp; i < length; i += warpSize) {
+        int count = __TILEFLAGS_POPC(flags[i]);
+        sum += (count <= MAX_BITS_FOR_PAIRS ? count : 0);
+    }
+    for (int i = 1; i < warpSize; i *= 2) {
+        int n = SHFL(sum, indexInWarp - i);
+        if (indexInWarp >= i)
+            sum += n;
+    }
+    if (indexInWarp == warpSize - 1)
+        pairStartIndex = atomicAdd(singlePairCount,(unsigned int) sum);
+    SYNC_WARPS;
+    int prevSum = SHFL(sum, indexInWarp - 1);
+    int pairIndex = pairStartIndex + (indexInWarp > 0 ? prevSum : 0);
+    for (int i = indexInWarp; i < length; i += warpSize) {
+        int count = __TILEFLAGS_POPC(flags[i]);
+        if (count <= MAX_BITS_FOR_PAIRS && pairIndex+count <= maxSinglePairs) {
+            tileflags f = flags[i];
+            while (f != 0) {
+                singlePairs[pairIndex] = make_int2(atoms[i], x*TILE_SIZE+__TILEFLAGS_FFS(f)-1);
+                f &= f-1;
+                pairIndex++;
+            }
+        }
+    }
+    // Compact the remaining interactions.
+    const tileflags warpMask = (static_cast<tileflags>(1)<<indexInWarp)-1;
+    int numCompacted = 0;
+    for (int start = 0; start < length; start += warpSize) {
+        int i = start+indexInWarp;
+        int atom = atoms[i];
+        tileflags flag = flags[i];
+        bool include = (i < length && __TILEFLAGS_POPC(flags[i]) > MAX_BITS_FOR_PAIRS);
+        tileflags includeFlags = BALLOT(include);
+        if (include) {
+            int index = numCompacted+__TILEFLAGS_POPC(includeFlags&warpMask);
+            atoms[index] = atom;
+            flags[index] = flag;
+        }
+        numCompacted += __TILEFLAGS_POPC(includeFlags);
+    }
+    return numCompacted;
+}
+/**
+ * Compare the bounding boxes for each pair of atom blocks (comprised of warpSize atoms each), forming a tile. If the two
+ * atom blocks are sufficiently far apart, mark them as non-interacting. There are two stages in the algorithm.
+ *
+ * STAGE 1:
+ *
+ * A coarse grained atom block against interacting atom block neighbour list is constructed.
+ *
+ * Each warp first loads in some block X of interest. Each thread within the warp then loads
+ * in a different atom block Y. If Y has exclusions with X, then Y is not processed.  If the bounding boxes
+ * of the two atom blocks are within the cutoff distance, then the two atom blocks are considered to be
+ * interacting and Y is added to the buffer for X.
+ *
+ * STAGE 2:
+ *
+ * A fine grained atom block against interacting atoms neighbour list is constructed.
+ *
+ * The warp loops over atom blocks Y that were found to (possibly) interact with atom block X.  Each thread
+ * in the warp loops over the warpSize atoms in X and compares their positions to one particular atom from block Y.
+ * If it finds one closer than the cutoff distance, the atom is added to the list of atoms interacting with block X.
+ * This continues until the buffer fills up, at which point the results are written to global memory.
+ *
+ * [in] periodicBoxSize        - size of the rectangular periodic box
+ * [in] invPeriodicBoxSize     - inverse of the periodic box
+ * [in] blockCenter            - the center of each bounding box
+ * [in] blockBoundingBox       - bounding box of each atom block
+ * [out] interactionCount      - total number of tiles that have interactions
+ * [out] interactingTiles      - set of blocks that have interactions
+ * [out] interactingAtoms      - a list of atoms that interact with each atom block
+ * [in] posq                   - x,y,z coordinates of each atom and charge q
+ * [in] maxTiles               - maximum number of tiles to process, used for multi-GPUs
+ * [in] startBlockIndex        - first block to process, used for multi-GPUs,
+ * [in] numBlocks              - total number of atom blocks
+ * [in] sortedBlocks           - a sorted list of atom blocks based on volume
+ * [in] sortedBlockCenter      - sorted centers, duplicated for fast access to avoid indexing
+ * [in] sortedBlockBoundingBox - sorted bounding boxes, duplicated for fast access
+ * [in] exclusionIndices       - maps into exclusionRowIndices with the starting position for a given atom
+ * [in] exclusionRowIndices    - stores the a continuous list of exclusions
+ *           eg: block 0 is excluded from atom 3,5,6
+ *               block 1 is excluded from atom 3,4
+ *               block 2 is excluded from atom 1,3,5,6
+ *              exclusionIndices[0][3][5][8]
+ *           exclusionRowIndices[3][5][6][3][4][1][3][5][6]
+ *                         index 0  1  2  3  4  5  6  7  8
+ * [out] oldPos                - stores the positions of the atoms in which this neighbourlist was built on
+ *                             - this is used to decide when to rebuild a neighbourlist
+ * [in] rebuildNeighbourList   - whether or not to execute this kernel
+ *
+ */
+extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
+        unsigned int* __restrict__ interactionCount, int* __restrict__ interactingTiles, unsigned int* __restrict__ interactingAtoms,
+        int2* __restrict__ singlePairs, const real4* __restrict__ posq, unsigned int maxTiles, unsigned int maxSinglePairs,
+        unsigned int startBlockIndex, unsigned int numBlocks, real2* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter,
+        const real4* __restrict__ sortedBlockBoundingBox, const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
+        real4* __restrict__ oldPositions, const int* __restrict__ rebuildNeighborList) {
+    if (rebuildNeighborList[0] == 0)
+        return; // The neighbor list doesn't need to be rebuilt.
+    const int indexInWarp = threadIdx.x%warpSize;
+    const int warpStart = threadIdx.x-indexInWarp;
+    const int totalWarps = blockDim.x*gridDim.x/warpSize;
+    const int warpIndex = (blockIdx.x*blockDim.x+threadIdx.x)/warpSize;
+    const tileflags warpMask = (static_cast<tileflags>(1)<<indexInWarp)-1;
+    __shared__ int workgroupBuffer[BUFFER_SIZE*(GROUP_SIZE/warpSize)];
+    __shared__ tileflags workgroupFlagsBuffer[BUFFER_SIZE*(GROUP_SIZE/warpSize)];
+    __shared__ int warpExclusions[MAX_EXCLUSIONS*(GROUP_SIZE/warpSize)];
+    __shared__ real4 posBuffer[GROUP_SIZE];
+    __shared__ volatile int workgroupTileIndex[GROUP_SIZE/warpSize];
+    __shared__ int worksgroupPairStartIndex[GROUP_SIZE/warpSize];
+    int* sumBuffer = (int*) posBuffer; // Reuse the same buffer to save memory
+    int* buffer = workgroupBuffer+BUFFER_SIZE*(warpStart/warpSize);
+    tileflags* flagsBuffer = workgroupFlagsBuffer+BUFFER_SIZE*(warpStart/warpSize);
+    int* exclusionsForX = warpExclusions+MAX_EXCLUSIONS*(warpStart/warpSize);
+    volatile int& tileStartIndex = workgroupTileIndex[warpStart/warpSize];
+    volatile int& pairStartIndex = worksgroupPairStartIndex[warpStart/warpSize];
+    // Loop over blocks.
+    for (int block1 = startBlockIndex+warpIndex; block1 < startBlockIndex+numBlocks; block1 += totalWarps) {
+        // Load data for this block.  Note that all threads in a warp are processing the same block.
+        real2 sortedKey = sortedBlocks[block1];
+        int x = (int) sortedKey.y;
+        real4 blockCenterX = sortedBlockCenter[block1];
+        real4 blockSizeX = sortedBlockBoundingBox[block1];
+        int neighborsInBuffer = 0;
+        real4 pos1 = posq[x*TILE_SIZE+indexInWarp];
+#ifdef USE_PERIODIC
+        const bool singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= PADDED_CUTOFF &&
+                                         0.5f*periodicBoxSize.y-blockSizeX.y >= PADDED_CUTOFF &&
+                                         0.5f*periodicBoxSize.z-blockSizeX.z >= PADDED_CUTOFF);
+        if (singlePeriodicCopy) {
+            // The box is small enough that we can just translate all the atoms into a single periodic
+            // box, then skip having to apply periodic boundary conditions later.
+            APPLY_PERIODIC_TO_POS_WITH_CENTER(pos1, blockCenterX)
+        }
+#endif
+        pos1.w = 0.5f * (pos1.x * pos1.x + pos1.y * pos1.y + pos1.z * pos1.z);
+        posBuffer[threadIdx.x] = pos1;
+        // Load exclusion data for block x.
+        const int exclusionStart = exclusionRowIndices[x];
+        const int exclusionEnd = exclusionRowIndices[x+1];
+        const int numExclusions = exclusionEnd-exclusionStart;
+        for (int j = indexInWarp; j < numExclusions; j += warpSize)
+            exclusionsForX[j] = exclusionIndices[exclusionStart+j];
+        if (MAX_EXCLUSIONS > warpSize)
+            __syncthreads();
+        // Loop over atom blocks to search for neighbors.  The threads in a warp compare block1 against warpSize
+        // other blocks in parallel.
+        for (int block2Base = block1+1; block2Base < NUM_BLOCKS; block2Base += warpSize) {
+            int block2 = block2Base+indexInWarp;
+            bool includeBlock2 = (block2 < NUM_BLOCKS);
+            bool forceInclude = false;
+            if (includeBlock2) {
+                real4 blockCenterY = sortedBlockCenter[block2];
+                real4 blockSizeY = sortedBlockBoundingBox[block2];
+                real4 blockDelta = blockCenterX-blockCenterY;
+#ifdef USE_PERIODIC
+                APPLY_PERIODIC_TO_DELTA(blockDelta)
+#endif
+                includeBlock2 &= (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < (PADDED_CUTOFF+blockCenterX.w+blockCenterY.w)*(PADDED_CUTOFF+blockCenterX.w+blockCenterY.w));
+                blockDelta.x = max(0.0f, fabs(blockDelta.x)-blockSizeX.x-blockSizeY.x);
+                blockDelta.y = max(0.0f, fabs(blockDelta.y)-blockSizeX.y-blockSizeY.y);
+                blockDelta.z = max(0.0f, fabs(blockDelta.z)-blockSizeX.z-blockSizeY.z);
+                includeBlock2 &= (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < PADDED_CUTOFF_SQUARED);
+#ifdef TRICLINIC
+                // The calculation to find the nearest periodic copy is only guaranteed to work if the nearest copy is less than half a box width away.
+                // If there's any possibility we might have missed it, do a detailed check.
+                if (periodicBoxSize.z/2-blockSizeX.z-blockSizeY.z < PADDED_CUTOFF || periodicBoxSize.y/2-blockSizeX.y-blockSizeY.y < PADDED_CUTOFF)
+                    includeBlock2 = forceInclude = true;
+#endif
+                if (includeBlock2) {
+                    int y = (int) sortedBlocks[block2].y;
+                    for (int k = 0; k < numExclusions; k++)
+                        includeBlock2 &= (exclusionsForX[k] != y);
+                }
+            }
+            // Loop over any blocks we identified as potentially containing neighbors.
+            tileflags includeBlockFlags = BALLOT(includeBlock2);
+            tileflags forceIncludeFlags = BALLOT(forceInclude);
+            while (includeBlockFlags != 0) {
+                int i = __TILEFLAGS_FFS(includeBlockFlags)-1;
+                includeBlockFlags &= includeBlockFlags-1;
+                forceInclude = (forceIncludeFlags>>i) & 1;
+                int y = (int) sortedBlocks[block2Base+i].y;
+                // Check each atom in block Y for interactions.
+                int atom2 = y*TILE_SIZE+indexInWarp;
+                real4 pos2 = posq[atom2];
+#ifdef USE_PERIODIC
+                if (singlePeriodicCopy) {
+                    APPLY_PERIODIC_TO_POS_WITH_CENTER(pos2, blockCenterX)
+                }
+#endif
+                pos2.w = 0.5f * (pos2.x * pos2.x + pos2.y * pos2.y + pos2.z * pos2.z);
+                real4 blockCenterY = sortedBlockCenter[block2Base+i];
+                real3 atomDelta = trimTo3(posBuffer[warpStart+indexInWarp])-trimTo3(blockCenterY);
+#ifdef USE_PERIODIC
+                APPLY_PERIODIC_TO_DELTA(atomDelta)
+#endif
+                tileflags atomFlags = BALLOT(forceInclude || atomDelta.x*atomDelta.x+atomDelta.y*atomDelta.y+atomDelta.z*atomDelta.z < (PADDED_CUTOFF+blockCenterY.w)*(PADDED_CUTOFF+blockCenterY.w));
+                tileflags interacts = 0;
+                if (atom2 < NUM_ATOMS && atomFlags != 0) {
+                    int first = __TILEFLAGS_FFS(atomFlags)-1;
+                    int last = warpSize-__TILEFLAGS_CLZ(atomFlags);
+#ifdef USE_PERIODIC
+                    if (!singlePeriodicCopy) {
+                        for (int j = first; j < last; j++) {
+                            real3 delta = trimTo3(pos2)-trimTo3(posBuffer[warpStart+j]);
+                            APPLY_PERIODIC_TO_DELTA(delta)
+                            interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED ? static_cast<tileflags>(1)<<j : 0);
+                        }
+                    }
+                    else {
+#endif
+                        #pragma unroll
+                        for (int j = 0; j < warpSize; j++) {
+                            real4 posj = posBuffer[warpStart+j];
+                            real halfDist2 = posj.w + pos2.w - posj.x*pos2.x - posj.y*pos2.y - posj.z*pos2.z;
+                            interacts |= (halfDist2 < 0.5f * PADDED_CUTOFF_SQUARED ? static_cast<tileflags>(1)<<j : 0);
+                        }
+#ifdef USE_PERIODIC
+                    }
+#endif
+                }
+                // Add any interacting atoms to the buffer.
+                tileflags includeAtomFlags = BALLOT(interacts);
+                if (interacts) {
+                    int index = neighborsInBuffer+__TILEFLAGS_POPC(includeAtomFlags&warpMask);
+                    buffer[index] = atom2;
+                    flagsBuffer[index] = interacts;
+                }
+                neighborsInBuffer += __TILEFLAGS_POPC(includeAtomFlags);
+                if (neighborsInBuffer > BUFFER_SIZE-TILE_SIZE) {
+                    // Store the new tiles to memory.
+#if MAX_BITS_FOR_PAIRS > 0
+                    neighborsInBuffer = saveSinglePairs(x, buffer, flagsBuffer, neighborsInBuffer, maxSinglePairs, &interactionCount[1], singlePairs, sumBuffer+warpStart, pairStartIndex);
+#endif
+                    int tilesToStore = neighborsInBuffer/TILE_SIZE;
+                    if (tilesToStore > 0) {
+                        if (indexInWarp == 0)
+                            tileStartIndex = atomicAdd(&interactionCount[0], tilesToStore);
+                        int newTileStartIndex = tileStartIndex;
+                        if (newTileStartIndex+tilesToStore <= maxTiles) {
+                            if (indexInWarp < tilesToStore)
+                                interactingTiles[newTileStartIndex+indexInWarp] = x;
+                            for (int j = 0; j < tilesToStore; j++)
+                                interactingAtoms[(newTileStartIndex+j)*TILE_SIZE+indexInWarp] = buffer[indexInWarp+j*TILE_SIZE];
+                        }
+                        buffer[indexInWarp] = buffer[indexInWarp+TILE_SIZE*tilesToStore];
+                        neighborsInBuffer -= TILE_SIZE*tilesToStore;
+                    }
+                }
+            }
+        }
+        // If we have a partially filled buffer,  store it to memory.
+#if MAX_BITS_FOR_PAIRS > 0
+        if (neighborsInBuffer > warpSize)
+            neighborsInBuffer = saveSinglePairs(x, buffer, flagsBuffer, neighborsInBuffer, maxSinglePairs, &interactionCount[1], singlePairs, sumBuffer+warpStart, pairStartIndex);
+#endif
+        if (neighborsInBuffer > 0) {
+            int tilesToStore = (neighborsInBuffer+TILE_SIZE-1)/TILE_SIZE;
+            if (indexInWarp == 0)
+                tileStartIndex = atomicAdd(&interactionCount[0], tilesToStore);
+            int newTileStartIndex = tileStartIndex;
+            if (newTileStartIndex+tilesToStore <= maxTiles) {
+                if (indexInWarp < tilesToStore)
+                    interactingTiles[newTileStartIndex+indexInWarp] = x;
+                for (int j = 0; j < tilesToStore; j++)
+                    interactingAtoms[(newTileStartIndex+j)*TILE_SIZE+indexInWarp] = (indexInWarp+j*TILE_SIZE < neighborsInBuffer ? buffer[indexInWarp+j*TILE_SIZE] : NUM_ATOMS);
+            }
+        }
+    }
+    // Record the positions the neighbor list is based on.
+    for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x)
+        oldPositions[i] = posq[i];
+}