Add hipification of CUDA platform

Port changes in CUDA backend to HIP. Fix a warning about arithmetic operations on void* in HipArray::uploadSubArray. Fix "Error Initializing context ROCm 5.3.0" (https://github.com/StreamHPC/openmm-hip/issues/3): hipDeviceSetCacheConfig returns hipErrorNotSupported on 5.3. Co-authored-by: Nick Curtis <nicholas.curtis@amd.com>

Add hipification of CUDA platform
Port changes in CUDA backend to HIP. Fix a warning about arithmetic operations on void* in HipArray::uploadSubArray. Fix "Error Initializing context ROCm 5.3.0" (https://github.com/StreamHPC/openmm-hip/issues/3): hipDeviceSetCacheConfig returns hipErrorNotSupported on 5.3. Co-authored-by: Nick Curtis <nicholas.curtis@amd.com>
89d2ff0e · Anton Gorenko · 8defca2d · 89d2ff0e · 89d2ff0e · 89d2ff0e
Unverified Commit 89d2ff0e authored Aug 25, 2024 by Anton Gorenko
20 changed files
--- a/platforms/hip/src/kernels/nonbonded.hip
+++ b/platforms/hip/src/kernels/nonbonded.hip
+// Per-thread staging record used only when ENABLE_SHUFFLE is not defined: the kernel
+// below declares a __shared__ array of these (one entry per thread of the block) and
+// reads/writes it where the shuffle build keeps the same data in registers.
+#ifndef ENABLE_SHUFFLE
+typedef struct {
+    // Position of the atom (filled from posq.x/.y/.z).
+    real x, y, z;
+    // Filled from posq.w.
+    real q;
+    // Force accumulated for this atom while a tile is being processed.
+    real fx, fy, fz;
+    // Extra per-atom fields; the macro is defined outside this file.
+    ATOM_PARAMETER_DATA
+#ifndef PARAMETER_SIZE_IS_EVEN
+    // NOTE(review): presumably pads the record so its size in reals is even - confirm.
+    real padding;
+#endif
+} AtomData;
+#endif
+
+#ifdef ENABLE_SHUFFLE
+// Alias: code that refers to real_shfl resolves to the SHFL macro (defined elsewhere).
+#define real_shfl SHFL
+#endif
+
+/**
+ * Compute nonbonded interactions. The kernel is separated into three parts:
+ * tiles with exclusions, tiles without exclusions, and (with a cutoff) single
+ * atom pairs that are not part of any tile. It relies heavily on
+ * implicit warp-level synchronization. A tile is defined by two atom blocks
+ * each of warpsize. Each warp computes a range of tiles.
+ *
+ * Tiles with exclusions compute the entire set of interactions across
+ * atom blocks, equal to warpsize*warpsize. In order to avoid access conflicts
+ * the forces are computed and accumulated diagonally in the manner shown below
+ * where, suppose
+ *
+ * [a-h] comprise atom block 1, [i-p] comprise atom block 2
+ *
+ * 1 denotes the first set of calculations within the warp
+ * 2 denotes the second set of calculations within the warp
+ * ... etc.
+ *
+ *        threads
+ *     0 1 2 3 4 5 6 7
+ *         atom1
+ * L    a b c d e f g h
+ * o  i 1 2 3 4 5 6 7 8
+ * c  j 8 1 2 3 4 5 6 7
+ * a  k 7 8 1 2 3 4 5 6
+ * l  l 6 7 8 1 2 3 4 5
+ * D  m 5 6 7 8 1 2 3 4
+ * a  n 4 5 6 7 8 1 2 3
+ * t  o 3 4 5 6 7 8 1 2
+ * a  p 2 3 4 5 6 7 8 1
+ *
+ * Tiles without exclusions read off directly from the neighbourlist interactingAtoms
+ * and follow the same force accumulation method. If there are more interactingTiles
+ * than the size of the neighbourlist initially allocated, this kernel returns early;
+ * the neighbourlist is then rebuilt and the full tileset is computed. This should
+ * happen on the first step, and very rarely afterwards.
+ *
+ * On diagonal exclusion tiles use __shfl to broadcast. For all other types of tiles __shfl
+ * is used to pass around the forces, positions, and parameters when computing the forces.
+ * When ENABLE_SHUFFLE is not defined, a __shared__ AtomData array is used instead.
+ *
+ * Forces are accumulated with atomicAdd as 64-bit fixed point values (scaled by 0x100000000).
+ *
+ * [out]forceBuffers    - forces on each atom to eventually be accumulated
+ * [out]energyBuffer    - energyBuffer to eventually be accumulated
+ * [in]posq             - x,y,z,charge
+ * [in]exclusions       - 1024-bit flags denoting atom-atom exclusions for each tile
+ * [in]exclusionTiles   - x,y denotes the indices of tiles that have an exclusion
+ * [in]startTileIndex   - index into first tile to be processed
+ * [in]numTileIndices   - number of tiles this context is responsible for processing
+ * [in]int tiles        - the atom block for each tile
+ * [in]interactionCount - element 0 is the total number of tiles that have an interaction,
+ *                      - element 1 is the number of single pairs
+ * [in]maxTiles         - stores the size of the neighbourlist in case it needs
+ *                      - to be expanded
+ * [in]periodicBoxSize  - size of the Periodic Box, last dimension (w) not used
+ * [in]invPeriodicBox   - inverse of the periodicBoxSize, pre-computed for speed
+ * [in]periodicBoxVecX/Y/Z - not referenced directly in this kernel body
+ *                      - NOTE(review): presumably consumed by the APPLY_PERIODIC_* macros - confirm
+ * [in]blockCenter      - the center of each block in euclidean coordinates
+ * [in]blockSize        - size of the each block, radiating from the center
+ *                      - x is half the distance of total length
+ *                      - y is half the distance of total width
+ *                      - z is half the distance of total height
+ *                      - w is not used
+ * [in]interactingAtoms - a list of interactions within a given tile
+ * [in]maxSinglePairs   - capacity of singlePairs; if interactionCount[1] exceeds it
+ *                      - the third loop is skipped by returning early
+ * [in]singlePairs      - x,y are the two atom indices of each single pair
+ *
+ */
+extern "C" __global__ void computeNonbonded(
+        unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const tileflags* __restrict__ exclusions,
+        const int2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned long long numTileIndices
+#ifdef USE_CUTOFF
+        , const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
+        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
+        const real4* __restrict__ blockSize, const unsigned int* __restrict__ interactingAtoms, unsigned int maxSinglePairs,
+        const int2* __restrict__ singlePairs
+#endif
+        PARAMETER_ARGUMENTS) {
+    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
+    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; // global warpIndex
+    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // index within the warp
+    const unsigned int tbx = threadIdx.x - tgx;           // block warpIndex
+    mixed energy = 0;
+    INIT_DERIVATIVES
+    // use shared memory if the device cannot shuffle
+#ifndef ENABLE_SHUFFLE
+    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
+#endif
+
+    // First loop: process tiles that contain exclusions.
+
+    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
+    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
+    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
+        const int2 tileIndices = exclusionTiles[pos];
+        const unsigned int x = tileIndices.x;
+        const unsigned int y = tileIndices.y;
+        real3 force = make_real3(0);
+        unsigned int atom1 = x*TILE_SIZE + tgx;
+        real4 posq1 = posq[atom1];
+        LOAD_ATOM1_PARAMETERS
+#ifdef USE_EXCLUSIONS
+        tileflags excl = exclusions[pos*TILE_SIZE+tgx];
+#endif
+        const bool hasExclusions = true;
+        if (x == y) {
+            // This tile is on the diagonal.
+#ifdef ENABLE_SHUFFLE
+            real4 shflPosq = posq1;
+#else
+            localData[threadIdx.x].x = posq1.x;
+            localData[threadIdx.x].y = posq1.y;
+            localData[threadIdx.x].z = posq1.z;
+            localData[threadIdx.x].q = posq1.w;
+            LOAD_LOCAL_PARAMETERS_FROM_1
+#endif
+
+            // we do not need to fetch parameters from global since this is a symmetric tile
+            // instead we can broadcast the values using shuffle
+            for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                int atom2 = tbx+j;
+                real4 posq2;
+#ifdef ENABLE_SHUFFLE
+                BROADCAST_WARP_DATA
+#else
+                posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
+#endif
+                real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+#ifdef USE_PERIODIC
+                APPLY_PERIODIC_TO_DELTA(delta)
+#endif
+                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                real invR = RSQRT(r2);
+                real r = r2*invR;
+                LOAD_ATOM2_PARAMETERS
+                // From here on atom2 is the global atom index rather than the local slot.
+                atom2 = y*TILE_SIZE+j;
+#ifdef USE_SYMMETRIC
+                real dEdR = 0.0f;
+#else
+                real3 dEdR1 = make_real3(0);
+                real3 dEdR2 = make_real3(0);
+#endif
+#ifdef USE_EXCLUSIONS
+                bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 1));
+#endif
+                real tempEnergy = 0.0f;
+                // Every pair of a diagonal tile is visited twice (once from each atom),
+                // so each visit contributes half of the energy.
+                const real interactionScale = 0.5f;
+                COMPUTE_INTERACTION
+                energy += 0.5f*tempEnergy;
+#ifdef INCLUDE_FORCES
+#ifdef USE_SYMMETRIC
+                force.x -= delta.x*dEdR;
+                force.y -= delta.y*dEdR;
+                force.z -= delta.z*dEdR;
+#else
+                force.x -= dEdR1.x;
+                force.y -= dEdR1.y;
+                force.z -= dEdR1.z;
+#endif
+#endif
+#ifdef USE_EXCLUSIONS
+                excl >>= 1;
+#endif
+            }
+        }
+        else {
+            // This is an off-diagonal tile.
+            unsigned int j = y*TILE_SIZE + tgx;
+            real4 shflPosq = posq[j];
+#ifdef ENABLE_SHUFFLE
+            real3 shflForce;
+            shflForce.x = 0.0f;
+            shflForce.y = 0.0f;
+            shflForce.z = 0.0f;
+#else
+            localData[threadIdx.x].x = shflPosq.x;
+            localData[threadIdx.x].y = shflPosq.y;
+            localData[threadIdx.x].z = shflPosq.z;
+            localData[threadIdx.x].q = shflPosq.w;
+            localData[threadIdx.x].fx = 0.0f;
+            localData[threadIdx.x].fy = 0.0f;
+            localData[threadIdx.x].fz = 0.0f;
+#endif
+            DECLARE_LOCAL_PARAMETERS
+            LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
+#ifdef USE_EXCLUSIONS
+            // Rotate the flags so that bit 0 corresponds to the first partner this
+            // thread visits (tj starts at tgx below).
+            excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
+#endif
+            unsigned int tj = tgx;
+            for (j = 0; j < TILE_SIZE; j++) {
+                int atom2 = tbx+tj;
+#ifdef ENABLE_SHUFFLE
+                real4 posq2 = shflPosq;
+#else
+                real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
+#endif
+                real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+#ifdef USE_PERIODIC
+                APPLY_PERIODIC_TO_DELTA(delta)
+#endif
+                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                real invR = RSQRT(r2);
+                real r = r2*invR;
+                LOAD_ATOM2_PARAMETERS
+                atom2 = y*TILE_SIZE+tj;
+#ifdef USE_SYMMETRIC
+                real dEdR = 0.0f;
+#else
+                real3 dEdR1 = make_real3(0);
+                real3 dEdR2 = make_real3(0);
+#endif
+#ifdef USE_EXCLUSIONS
+                bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 1));
+#endif
+                real tempEnergy = 0.0f;
+                const real interactionScale = 1.0f;
+                COMPUTE_INTERACTION
+                energy += tempEnergy;
+#ifdef INCLUDE_FORCES
+#ifdef USE_SYMMETRIC
+                delta *= dEdR;
+                force.x -= delta.x;
+                force.y -= delta.y;
+                force.z -= delta.z;
+#ifdef ENABLE_SHUFFLE
+                shflForce.x += delta.x;
+                shflForce.y += delta.y;
+                shflForce.z += delta.z;
+
+#else
+                localData[tbx+tj].fx += delta.x;
+                localData[tbx+tj].fy += delta.y;
+                localData[tbx+tj].fz += delta.z;
+#endif
+#else // !USE_SYMMETRIC
+                force.x -= dEdR1.x;
+                force.y -= dEdR1.y;
+                force.z -= dEdR1.z;
+#ifdef ENABLE_SHUFFLE
+                shflForce.x += dEdR2.x;
+                shflForce.y += dEdR2.y;
+                shflForce.z += dEdR2.z;
+#else
+                localData[tbx+tj].fx += dEdR2.x;
+                localData[tbx+tj].fy += dEdR2.y;
+                localData[tbx+tj].fz += dEdR2.z;
+#endif
+#endif // end USE_SYMMETRIC
+#endif
+#ifdef ENABLE_SHUFFLE
+                SHUFFLE_WARP_DATA
+#endif
+#ifdef USE_EXCLUSIONS
+                excl >>= 1;
+#endif
+                // cycles the indices
+                // 0 1 2 3 4 5 6 7 -> 1 2 3 4 5 6 7 0
+                tj = (tj + 1) & (TILE_SIZE - 1);
+            }
+            const unsigned int offset = y*TILE_SIZE + tgx;
+            // write results for off diagonal tiles
+#ifdef INCLUDE_FORCES
+#ifdef ENABLE_SHUFFLE
+            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (shflForce.x*0x100000000)));
+            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (shflForce.y*0x100000000)));
+            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (shflForce.z*0x100000000)));
+#else
+            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
+            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
+            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
+#endif
+#endif
+        }
+        // Write results for on and off diagonal tiles
+#ifdef INCLUDE_FORCES
+        const unsigned int offset = x*TILE_SIZE + tgx;
+        atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
+        atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
+        atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
+#endif
+    }
+
+    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
+    // of them (no cutoff).
+
+#ifdef USE_CUTOFF
+    const unsigned int numTiles = interactionCount[0];
+    if (numTiles > maxTiles)
+        return; // There wasn't enough memory for the neighbor list.
+    int pos = (int) (warp*(long long)numTiles/totalWarps);
+    int end = (int) ((warp+1)*(long long)numTiles/totalWarps);
+#else
+    int pos = (int) (startTileIndex+warp*numTileIndices/totalWarps);
+    int end = (int) (startTileIndex+(warp+1)*numTileIndices/totalWarps);
+#endif
+    int skipBase = 0;
+    int currentSkipIndex = tbx;
+    // atomIndices can probably be shuffled as well
+    // but it probably wouldn't make things any faster
+    __shared__ int atomIndices[THREAD_BLOCK_SIZE];
+    __shared__ volatile int skipTiles[THREAD_BLOCK_SIZE];
+    skipTiles[threadIdx.x] = -1;
+
+    while (pos < end) {
+        const bool hasExclusions = false;
+        real3 force = make_real3(0);
+        bool includeTile = true;
+
+        // Extract the coordinates of this tile.
+        int x, y;
+        bool singlePeriodicCopy = false;
+#ifdef USE_CUTOFF
+        x = tiles[pos];
+        real4 blockSizeX = blockSize[x];
+        singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= MAX_CUTOFF &&
+                              0.5f*periodicBoxSize.y-blockSizeX.y >= MAX_CUTOFF &&
+                              0.5f*periodicBoxSize.z-blockSizeX.z >= MAX_CUTOFF);
+#else
+        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+            y += (x < y ? -1 : 1);
+            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+        }
+
+        // Skip over tiles that have exclusions, since they were already processed.
+
+        while (skipTiles[tbx+TILE_SIZE-1] < pos) {
+            if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
+                int2 tile = exclusionTiles[skipBase+tgx];
+                skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
+            }
+            else
+                skipTiles[threadIdx.x] = end;
+            skipBase += TILE_SIZE;
+            currentSkipIndex = tbx;
+        }
+        while (skipTiles[currentSkipIndex] < pos)
+            currentSkipIndex++;
+        includeTile = (skipTiles[currentSkipIndex] != pos);
+#endif
+        if (includeTile) {
+            unsigned int atom1 = x*TILE_SIZE + tgx;
+            // Load atom data for this tile.
+            real4 posq1 = posq[atom1];
+            LOAD_ATOM1_PARAMETERS
+            //const unsigned int localAtomIndex = threadIdx.x;
+#ifdef USE_CUTOFF
+            unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
+#else
+            unsigned int j = y*TILE_SIZE + tgx;
+#endif
+            atomIndices[threadIdx.x] = j;
+#ifdef ENABLE_SHUFFLE
+            DECLARE_LOCAL_PARAMETERS
+            real4 shflPosq;
+            real3 shflForce;
+            shflForce.x = 0.0f;
+            shflForce.y = 0.0f;
+            shflForce.z = 0.0f;
+#endif
+            if (j < PADDED_NUM_ATOMS) {
+                // Load position of atom j from global memory
+#ifdef ENABLE_SHUFFLE
+                shflPosq = posq[j];
+#else
+                localData[threadIdx.x].x = posq[j].x;
+                localData[threadIdx.x].y = posq[j].y;
+                localData[threadIdx.x].z = posq[j].z;
+                localData[threadIdx.x].q = posq[j].w;
+                localData[threadIdx.x].fx = 0.0f;
+                localData[threadIdx.x].fy = 0.0f;
+                localData[threadIdx.x].fz = 0.0f;
+#endif
+                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
+            }
+            else {
+                // Index is past the padded atom range: use a zeroed placeholder.
+#ifdef ENABLE_SHUFFLE
+                shflPosq = make_real4(0, 0, 0, 0);
+#else
+                localData[threadIdx.x].x = 0;
+                localData[threadIdx.x].y = 0;
+                localData[threadIdx.x].z = 0;
+#endif
+                CLEAR_LOCAL_PARAMETERS
+            }
+#ifdef USE_PERIODIC
+            if (singlePeriodicCopy) {
+                // The box is small enough that we can just translate all the atoms into a single periodic
+                // box, then skip having to apply periodic boundary conditions later.
+                real4 blockCenterX = blockCenter[x];
+                APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
+#ifdef ENABLE_SHUFFLE
+                APPLY_PERIODIC_TO_POS_WITH_CENTER(shflPosq, blockCenterX)
+#else
+                APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[threadIdx.x], blockCenterX)
+#endif
+                unsigned int tj = tgx;
+                for (j = 0; j < TILE_SIZE; j++) {
+                    int atom2 = tbx+tj;
+#ifdef ENABLE_SHUFFLE
+                    real4 posq2 = shflPosq;
+#else
+                    real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
+#endif
+                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                    real invR = RSQRT(r2);
+                    real r = r2*invR;
+                    LOAD_ATOM2_PARAMETERS
+                    atom2 = atomIndices[tbx+tj];
+#ifdef USE_SYMMETRIC
+                    real dEdR = 0.0f;
+#else
+                    real3 dEdR1 = make_real3(0);
+                    real3 dEdR2 = make_real3(0);
+#endif
+#ifdef USE_EXCLUSIONS
+                    bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS);
+#endif
+                    real tempEnergy = 0.0f;
+                    const real interactionScale = 1.0f;
+                    COMPUTE_INTERACTION
+                    energy += tempEnergy;
+#ifdef INCLUDE_FORCES
+#ifdef USE_SYMMETRIC
+                    delta *= dEdR;
+                    force.x -= delta.x;
+                    force.y -= delta.y;
+                    force.z -= delta.z;
+#ifdef ENABLE_SHUFFLE
+                    shflForce.x += delta.x;
+                    shflForce.y += delta.y;
+                    shflForce.z += delta.z;
+
+#else
+                    localData[tbx+tj].fx += delta.x;
+                    localData[tbx+tj].fy += delta.y;
+                    localData[tbx+tj].fz += delta.z;
+#endif
+#else // !USE_SYMMETRIC
+                    force.x -= dEdR1.x;
+                    force.y -= dEdR1.y;
+                    force.z -= dEdR1.z;
+#ifdef ENABLE_SHUFFLE
+                    shflForce.x += dEdR2.x;
+                    shflForce.y += dEdR2.y;
+                    shflForce.z += dEdR2.z;
+#else
+                    localData[tbx+tj].fx += dEdR2.x;
+                    localData[tbx+tj].fy += dEdR2.y;
+                    localData[tbx+tj].fz += dEdR2.z;
+#endif
+#endif // end USE_SYMMETRIC
+#endif
+#ifdef ENABLE_SHUFFLE
+                    SHUFFLE_WARP_DATA
+#endif
+                    tj = (tj + 1) & (TILE_SIZE - 1);
+                }
+            }
+            else
+#endif
+            {
+                // We need to apply periodic boundary conditions separately for each interaction.
+                unsigned int tj = tgx;
+                for (j = 0; j < TILE_SIZE; j++) {
+                    int atom2 = tbx+tj;
+#ifdef ENABLE_SHUFFLE
+                    real4 posq2 = shflPosq;
+#else
+                    real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
+#endif
+                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+#ifdef USE_PERIODIC
+                    APPLY_PERIODIC_TO_DELTA(delta)
+#endif
+                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                    real invR = RSQRT(r2);
+                    real r = r2*invR;
+                    LOAD_ATOM2_PARAMETERS
+                    atom2 = atomIndices[tbx+tj];
+#ifdef USE_SYMMETRIC
+                    real dEdR = 0.0f;
+#else
+                    real3 dEdR1 = make_real3(0);
+                    real3 dEdR2 = make_real3(0);
+#endif
+#ifdef USE_EXCLUSIONS
+                    bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS);
+#endif
+                    real tempEnergy = 0.0f;
+                    const real interactionScale = 1.0f;
+                    COMPUTE_INTERACTION
+                    energy += tempEnergy;
+#ifdef INCLUDE_FORCES
+#ifdef USE_SYMMETRIC
+                    delta *= dEdR;
+                    force.x -= delta.x;
+                    force.y -= delta.y;
+                    force.z -= delta.z;
+#ifdef ENABLE_SHUFFLE
+                    shflForce.x += delta.x;
+                    shflForce.y += delta.y;
+                    shflForce.z += delta.z;
+
+#else
+                    localData[tbx+tj].fx += delta.x;
+                    localData[tbx+tj].fy += delta.y;
+                    localData[tbx+tj].fz += delta.z;
+#endif
+#else // !USE_SYMMETRIC
+                    force.x -= dEdR1.x;
+                    force.y -= dEdR1.y;
+                    force.z -= dEdR1.z;
+#ifdef ENABLE_SHUFFLE
+                    shflForce.x += dEdR2.x;
+                    shflForce.y += dEdR2.y;
+                    shflForce.z += dEdR2.z;
+#else
+                    localData[tbx+tj].fx += dEdR2.x;
+                    localData[tbx+tj].fy += dEdR2.y;
+                    localData[tbx+tj].fz += dEdR2.z;
+#endif
+#endif // end USE_SYMMETRIC
+#endif
+#ifdef ENABLE_SHUFFLE
+                    SHUFFLE_WARP_DATA
+#endif
+                    tj = (tj + 1) & (TILE_SIZE - 1);
+                }
+            }
+
+            // Write results.
+#ifdef INCLUDE_FORCES
+            atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
+            atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
+            atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
+#ifdef USE_CUTOFF
+            unsigned int atom2 = atomIndices[threadIdx.x];
+#else
+            unsigned int atom2 = y*TILE_SIZE + tgx;
+#endif
+            if (atom2 < PADDED_NUM_ATOMS) {
+#ifdef ENABLE_SHUFFLE
+                atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (shflForce.x*0x100000000)));
+                atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (shflForce.y*0x100000000)));
+                atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (shflForce.z*0x100000000)));
+#else
+                atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
+                atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
+                atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
+#endif
+            }
+#endif
+        }
+        pos++;
+    }
+
+    // Third loop: single pairs that aren't part of a tile.
+
+    // Guarded with #ifdef, like every other USE_CUTOFF test in this kernel, so that the
+    // cutoff-only parameters used below exist exactly when this loop is compiled.
+#ifdef USE_CUTOFF
+    const unsigned int numPairs = interactionCount[1];
+    if (numPairs > maxSinglePairs)
+        return; // There wasn't enough memory for the neighbor list.
+    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < numPairs; i += blockDim.x*gridDim.x) {
+        int2 pair = singlePairs[i];
+        int atom1 = pair.x;
+        int atom2 = pair.y;
+        real4 posq1 = posq[atom1];
+        real4 posq2 = posq[atom2];
+        LOAD_ATOM1_PARAMETERS
+        int j = atom2;
+        // NOTE(review): atom2 is temporarily set to this thread's index, presumably because
+        // the parameter macros below index per-thread local storage by atom2 - confirm.
+        atom2 = threadIdx.x;
+        DECLARE_LOCAL_PARAMETERS
+        LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
+        LOAD_ATOM2_PARAMETERS
+        // Restore the global index of the second atom.
+        atom2 = pair.y;
+        real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+#ifdef USE_PERIODIC
+        APPLY_PERIODIC_TO_DELTA(delta)
+#endif
+        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+        real invR = RSQRT(r2);
+        real r = r2*invR;
+#ifdef USE_SYMMETRIC
+        real dEdR = 0.0f;
+#else
+        real3 dEdR1 = make_real3(0);
+        real3 dEdR2 = make_real3(0);
+#endif
+        bool hasExclusions = false;
+        bool isExcluded = false;
+        real tempEnergy = 0.0f;
+        const real interactionScale = 1.0f;
+        COMPUTE_INTERACTION
+        energy += tempEnergy;
+#ifdef INCLUDE_FORCES
+#ifdef USE_SYMMETRIC
+        real3 dEdR1 = delta*dEdR;
+        real3 dEdR2 = -dEdR1;
+#endif
+        atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (-dEdR1.x*0x100000000)));
+        atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-dEdR1.y*0x100000000)));
+        atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-dEdR1.z*0x100000000)));
+        atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (-dEdR2.x*0x100000000)));
+        atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-dEdR2.y*0x100000000)));
+        atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-dEdR2.z*0x100000000)));
+#endif
+    }
+#endif
+#ifdef INCLUDE_ENERGY
+    // Each thread adds its accumulated energy to its own slot.
+    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
+#endif
+    SAVE_DERIVATIVES
+}
--- a/platforms/hip/src/kernels/parallel.hip
+++ b/platforms/hip/src/kernels/parallel.hip
+/**
+ * Add the per-context force buffers into the main force array.
+ *
+ * buffer holds numBuffers consecutive segments of bufferSize elements each; element k of
+ * every segment is added to force[k].  Threads walk the bufferSize elements with a
+ * grid-wide stride, so any launch configuration covers the whole array.
+ */
+
+extern "C" __global__ void sumForces(long long* __restrict__ force, long long* __restrict__ buffer, int bufferSize, int numBuffers) {
+    const unsigned int gridStride = blockDim.x*gridDim.x;
+    const int endOfBuffers = bufferSize*numBuffers;
+    for (int slot = blockDim.x*blockIdx.x+threadIdx.x; slot < bufferSize; slot += gridStride) {
+        long long total = force[slot];
+        // Visit the same slot in each consecutive segment of buffer.
+        int source = slot;
+        while (source < endOfBuffers) {
+            total += buffer[source];
+            source += bufferSize;
+        }
+        force[slot] = total;
+    }
+}
--- a/platforms/hip/src/kernels/sort.hip
+++ b/platforms/hip/src/kernels/sort.hip
+/**
+ * Return the sort key of an element by evaluating the SORT_KEY macro.
+ * NOTE(review): SORT_KEY is defined outside this file; presumably it is an expression
+ * written in terms of the parameter name `value` - confirm.
+ */
+__device__ KEY_TYPE getValue(DATA_TYPE value) {
+    const KEY_TYPE sortKeyOfValue = SORT_KEY;
+    return sortKeyOfValue;
+}
+
+extern "C" {
+
+/**
+ * Sort a short list in place.  The whole list is staged in the dynamically sized shared
+ * buffer, so it must fit there, and the kernel is meant to run as a single thread block.
+ *
+ * data   - the elements to sort; overwritten with the sorted result
+ * length - number of elements in data
+ */
+__global__ void sortShortList(DATA_TYPE* __restrict__ data, unsigned int length) {
+    // Stage the input in shared memory.
+
+    HIP_DYNAMIC_SHARED( DATA_TYPE, dataBuffer)
+    for (int slot = threadIdx.x; slot < length; slot += blockDim.x)
+        dataBuffer[slot] = data[slot];
+    __syncthreads();
+
+    // Bitonic sort: span is the size of the sequences being merged, distance is the
+    // gap between the two elements compared in the current pass.
+
+    for (unsigned int span = 2; span < 2*length; span *= 2) {
+        for (unsigned int distance = span/2; distance > 0; distance /= 2) {
+            for (unsigned int i = threadIdx.x; i < length; i += blockDim.x) {
+                const int partner = i^distance;
+                // Only the lower index of each pair does the work, and only when the
+                // partner actually exists.
+                if (partner <= i || partner >= length)
+                    continue;
+                DATA_TYPE first = dataBuffer[i];
+                DATA_TYPE second = dataBuffer[partner];
+                bool ascending = ((i&span) == 0);
+                for (unsigned int mask = span*2; mask < 2*length; mask *= 2) {
+                    if ((i&mask) == 0)
+                        ascending = !ascending;
+                }
+                const KEY_TYPE firstKey = getValue(first);
+                const KEY_TYPE secondKey = getValue(second);
+                const bool outOfOrder = (ascending ? firstKey > secondKey : secondKey > firstKey);
+                if (outOfOrder) {
+                    dataBuffer[i] = second;
+                    dataBuffer[partner] = first;
+                }
+            }
+            // All exchanges of this pass must be visible before the next pass starts.
+            __syncthreads();
+        }
+    }
+
+    // Copy the sorted elements back to global memory.
+
+    for (int slot = threadIdx.x; slot < length; slot += blockDim.x)
+        data[slot] = dataBuffer[slot];
+}
+
+/**
+ * Alternate short-list sort.  Each thread owns one input element and scans the entire
+ * list to count how many elements must precede it; that count is the element's position
+ * in the output.  Ties are broken by original index.  This does more work
+ * per thread than sortShortList but parallelizes much better.
+ *
+ * NOTE(review): the shared staging buffer holds 64 elements, so this presumably requires
+ * blockDim.x <= 64 - confirm against the launch code.
+ */
+__global__ void sortShortList2(const DATA_TYPE* __restrict__ dataIn, DATA_TYPE* __restrict__ dataOut, unsigned int length) {
+    __shared__ DATA_TYPE dataBuffer[64];
+    const int self = blockDim.x*blockIdx.x+threadIdx.x;
+    const bool inRange = (self < length);
+    // Threads past the end still take part in the loads and barriers below, so give
+    // them a valid element to work with; they never write a result.
+    DATA_TYPE element = dataIn[inRange ? self : 0];
+    const KEY_TYPE ownKey = getValue(element);
+    int rank = 0;
+    for (int chunkStart = 0; chunkStart < length; chunkStart += blockDim.x) {
+        const int chunkCount = min(static_cast<int>(blockDim.x), static_cast<int>(length-chunkStart));
+        // Wait until everyone has finished reading the previous chunk before overwriting it.
+        __syncthreads();
+        if (threadIdx.x < chunkCount)
+            dataBuffer[threadIdx.x] = dataIn[chunkStart+threadIdx.x];
+        __syncthreads();
+        for (int i = 0; i < chunkCount; i++) {
+            const KEY_TYPE candidate = getValue(dataBuffer[i]);
+            const bool precedes = (candidate < ownKey || (candidate == ownKey && chunkStart+i < self));
+            if (precedes)
+                rank++;
+        }
+    }
+    if (inRange)
+        dataOut[rank] = element;
+}
+
+/**
+ * Find the smallest and largest key in the array to be sorted and store them in
+ * range[0] and range[1].  Also zeroes the numBuckets bucket counters for the kernel
+ * that runs next.  This kernel is executed as a single work group.
+ *
+ * The dynamic shared buffer is used as two arrays of blockDim.x keys each
+ * (minimums followed by maximums).
+ */
+__global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int length, KEY_TYPE* __restrict__ range,
+        unsigned int numBuckets, unsigned int* __restrict__ bucketOffset) {
+    HIP_DYNAMIC_SHARED( KEY_TYPE, minBuffer)
+    KEY_TYPE* maxBuffer = minBuffer+blockDim.x;
+
+    // Every thread first finds the range of its own subset of the elements.
+
+    KEY_TYPE localMin = MAX_KEY;
+    KEY_TYPE localMax = MIN_KEY;
+    for (unsigned int index = threadIdx.x; index < length; index += blockDim.x) {
+        KEY_TYPE key = getValue(data[index]);
+        localMin = min(localMin, key);
+        localMax = max(localMax, key);
+    }
+
+    // Combine the per-thread results pairwise in shared memory.
+
+    minBuffer[threadIdx.x] = localMin;
+    maxBuffer[threadIdx.x] = localMax;
+    __syncthreads();
+    for (unsigned int step = 1; step < blockDim.x; step *= 2) {
+        const unsigned int partner = threadIdx.x+step;
+        if (partner < blockDim.x && threadIdx.x%(2*step) == 0) {
+            minBuffer[threadIdx.x] = min(minBuffer[threadIdx.x], minBuffer[partner]);
+            maxBuffer[threadIdx.x] = max(maxBuffer[threadIdx.x], maxBuffer[partner]);
+        }
+        __syncthreads();
+    }
+
+    // Slot 0 now holds the overall result; a single thread publishes it.
+
+    if (threadIdx.x == 0) {
+        range[0] = minBuffer[0];
+        range[1] = maxBuffer[0];
+    }
+
+    // Reset the bucket counters so the next kernel starts from zero.
+
+    for (unsigned int bucket = threadIdx.x; bucket < numBuckets; bucket += blockDim.x)
+        bucketOffset[bucket] = 0;
+}
+
+/**
+ * Assign elements to buckets.
+ *
+ * The key range [range[0], range[1]] is divided into numBuckets intervals of equal
+ * width.  For every element this kernel records the bucket it falls into
+ * (bucketOfElement) and its slot within that bucket (offsetInBucket), obtained by
+ * atomically incrementing the bucket's counter in bucketOffset.
+ */
+__global__ void assignElementsToBuckets(const DATA_TYPE* __restrict__ data, unsigned int length, unsigned int numBuckets, const KEY_TYPE* __restrict__ range,
+        unsigned int* __restrict__ bucketOffset, unsigned int* __restrict__ bucketOfElement, unsigned int* __restrict__ offsetInBucket) {
+    float minValue = (float) (range[0]);
+    float maxValue = (float) (range[1]);
+    float bucketWidth = (maxValue-minValue)/numBuckets;
+
+    // If all keys are equal (or the width underflows to zero in single precision) the
+    // division below would produce NaN or infinity, and converting that to an unsigned
+    // integer is undefined behaviour.  In that case put every element in bucket 0, which
+    // keeps the bucket assignment monotonic in the key.
+    const bool hasWidth = (bucketWidth > 0.0f);
+    for (unsigned int index = blockDim.x*blockIdx.x+threadIdx.x; index < length; index += blockDim.x*gridDim.x) {
+        float key = (float) getValue(data[index]);
+        unsigned int bucketIndex = 0;
+        if (hasWidth)
+            bucketIndex = min((unsigned int) ((key-minValue)/bucketWidth), numBuckets-1);
+        offsetInBucket[index] = atomicAdd(&bucketOffset[bucketIndex], 1);
+        bucketOfElement[index] = bucketIndex;
+    }
+}
+
+/**
+ * Replace the per-bucket element counts in bucketOffset with their inclusive running
+ * total, so that bucketOffset[i] becomes the end position of bucket i.  This kernel
+ * is executed as a single work group.
+ */
+__global__ void computeBucketPositions(unsigned int numBuckets, unsigned int* __restrict__ bucketOffset) {
+    HIP_DYNAMIC_SHARED( unsigned int, posBuffer)
+    const unsigned int tid = threadIdx.x;
+    unsigned int runningTotal = 0;
+    for (unsigned int chunkStart = 0; chunkStart < numBuckets; chunkStart += blockDim.x) {
+        // Stage one chunk of bucket sizes in local memory, padding the tail with zeros.
+        // The barrier before the store keeps the previous chunk's total readable until
+        // every thread has consumed it.
+
+        const unsigned int bucket = chunkStart+tid;
+        const bool inRange = (bucket < numBuckets);
+        __syncthreads();
+        if (inRange)
+            posBuffer[tid] = bucketOffset[bucket];
+        else
+            posBuffer[tid] = 0;
+        __syncthreads();
+
+        // Inclusive prefix sum over the chunk: read, barrier, accumulate, barrier.
+
+        for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) {
+            unsigned int partial = 0;
+            if (tid >= stride)
+                partial = posBuffer[tid-stride];
+            __syncthreads();
+            posBuffer[tid] += partial;
+            __syncthreads();
+        }
+
+        // Store the positions, shifted by the total of all earlier chunks.
+
+        if (inRange)
+            bucketOffset[bucket] = posBuffer[tid]+runningTotal;
+        runningTotal += posBuffer[blockDim.x-1];
+    }
+}
+
+/**
+ * Scatter the input elements into the bucket array.  Each element is written to the
+ * start position of its bucket plus the slot it was given within that bucket.
+ */
+__global__ void copyDataToBuckets(const DATA_TYPE* __restrict__ data, DATA_TYPE* __restrict__ buckets, unsigned int length, const unsigned int* __restrict__ bucketOffset, const unsigned int* __restrict__ bucketOfElement, const unsigned int* __restrict__ offsetInBucket) {
+    const unsigned int stride = blockDim.x*gridDim.x;
+    for (unsigned int i = blockDim.x*blockIdx.x+threadIdx.x; i < length; i += stride) {
+        const DATA_TYPE item = data[i];
+        const unsigned int bucket = bucketOfElement[i];
+
+        // Bucket 0 starts at position 0; any other bucket starts where the previous one ends.
+        unsigned int bucketStart = 0;
+        if (bucket != 0)
+            bucketStart = bucketOffset[bucket-1];
+        buckets[bucketStart+offsetInBucket[i]] = item;
+    }
+}
+
+/**
+ * Sort the data in each bucket.
+ *
+ * Each thread block processes buckets blockIdx.x, blockIdx.x+gridDim.x, ...  Bucket i
+ * occupies positions [bucketOffset[i-1], bucketOffset[i]) (starting at 0 for the first
+ * bucket).  The sorted contents of every bucket are written to the same positions of
+ * the output array `data`.  Buckets that fit in one block are sorted in local memory;
+ * larger ones are sorted in place in the output array.
+ */
+__global__ void sortBuckets(DATA_TYPE* __restrict__ data, const DATA_TYPE* __restrict__ buckets, unsigned int numBuckets, const unsigned int* __restrict__ bucketOffset) {
+    HIP_DYNAMIC_SHARED( DATA_TYPE, dataBuffer)
+    for (unsigned int index = blockIdx.x; index < numBuckets; index += gridDim.x) {
+        unsigned int startIndex = (index == 0 ? 0 : bucketOffset[index-1]);
+        unsigned int endIndex = bucketOffset[index];
+        unsigned int length = endIndex-startIndex;
+        // length depends only on the bucket, so all threads of the block take the same
+        // branch and reach the same barriers.
+        if (length <= blockDim.x) {
+            // Load the data into local memory.  Unused slots are padded with MAX_VALUE.
+
+            if (threadIdx.x < length)
+                dataBuffer[threadIdx.x] = buckets[startIndex+threadIdx.x];
+            else
+                dataBuffer[threadIdx.x] = MAX_VALUE;
+            __syncthreads();
+
+            // Perform a bitonic sort in local memory.
+            // NOTE(review): the k/j progression covers exactly blockDim.x slots only if
+            // blockDim.x is a power of two -- confirm the host always launches it that way.
+
+            for (unsigned int k = 2; k <= blockDim.x; k *= 2) {
+                for (unsigned int j = k/2; j > 0; j /= 2) {
+                    int ixj = threadIdx.x^j;
+                    // Only the lower-indexed thread of each pair does the compare/exchange.
+                    if (ixj > threadIdx.x) {
+                        DATA_TYPE value1 = dataBuffer[threadIdx.x];
+                        DATA_TYPE value2 = dataBuffer[ixj];
+                        bool ascending = (threadIdx.x&k) == 0;
+                        KEY_TYPE lowKey = (ascending ? getValue(value1) : getValue(value2));
+                        KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
+                        if (lowKey > highKey) {
+                            dataBuffer[threadIdx.x] = value2;
+                            dataBuffer[ixj] = value1;
+                        }
+                    }
+                    __syncthreads();
+                }
+            }
+
+            // Write the data to the sorted array.
+
+            if (threadIdx.x < length)
+                data[startIndex+threadIdx.x] = dataBuffer[threadIdx.x];
+        }
+        else {
+            // Copy the bucket data over to the output array.
+
+            for (unsigned int i = threadIdx.x; i < length; i += blockDim.x)
+                data[startIndex+i] = buckets[startIndex+i];
+            // Make the copied elements visible to the whole block before sorting.
+            __threadfence_block();
+            __syncthreads();
+
+            // Perform a bitonic sort in global memory.  Each thread handles the
+            // elements i = threadIdx.x, threadIdx.x+blockDim.x, ... of the bucket.
+
+            for (unsigned int k = 2; k < 2*length; k *= 2) {
+                for (unsigned int j = k/2; j > 0; j /= 2) {
+                    for (unsigned int i = threadIdx.x; i < length; i += blockDim.x) {
+                        int ixj = i^j;
+                        // Partners at or beyond the end of the bucket are skipped.
+                        if (ixj > i && ixj < length) {
+                            DATA_TYPE value1 = data[startIndex+i];
+                            DATA_TYPE value2 = data[startIndex+ixj];
+                            bool ascending = ((i&k) == 0);
+                            // The direction is flipped once for every higher power-of-two
+                            // mask below 2*length whose bit is clear in i.
+                            // NOTE(review): presumably this is what makes the sort correct
+                            // for lengths that are not a power of two -- confirm.
+                            for (unsigned int mask = k*2; mask < 2*length; mask *= 2)
+                                ascending = ((i&mask) == 0 ? !ascending : ascending);
+                            KEY_TYPE lowKey  = (ascending ? getValue(value1) : getValue(value2));
+                            KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
+                            if (lowKey > highKey) {
+                                data[startIndex+i] = value2;
+                                data[startIndex+ixj] = value1;
+                            }
+                        }
+                    }
+                    // Finish this compare/exchange pass before the next one reads the data.
+                    __threadfence_block();
+                    __syncthreads();
+                }
+            }
+        }
+    }
+}
+
+}
--- a/platforms/hip/src/kernels/utilities.hip
+++ b/platforms/hip/src/kernels/utilities.hip
+extern "C" {
+
+/**
+ * Set the first `size` ints of a buffer to zero.  This is the shared helper called
+ * by the clear*Buffer(s) kernels below.
+ */
+__device__ void clearSingleBuffer(int* __restrict__ buffer, int size) {
+    const int numVectors = size/4;
+    int4* vectors = (int4*) buffer;
+
+    // Bulk of the buffer: four ints are cleared per store, with the work spread over
+    // every thread of the launch.
+    for (int i = blockDim.x*blockIdx.x+threadIdx.x; i < numVectors; i += blockDim.x*gridDim.x)
+        vectors[i] = make_int4(0);
+
+    // The up-to-three ints left over when size is not a multiple of four are cleared
+    // by the first thread of the launch alone.
+    if (blockDim.x*blockIdx.x+threadIdx.x == 0) {
+        for (int i = numVectors*4; i < size; i++)
+            buffer[i] = 0;
+    }
+}
+
+/**
+ * Fill a buffer with 0.
+ *
+ * @param buffer  the buffer to clear
+ * @param size    the number of int elements to clear
+ */
+__global__ void clearBuffer(int* __restrict__ buffer, int size) {
+    clearSingleBuffer(buffer, size);
+}
+
+/**
+ * Fill two buffers with 0.  Each sizeN is the number of int elements to clear in
+ * the corresponding bufferN; the buffers are cleared one after another.
+ */
+__global__ void clearTwoBuffers(int* __restrict__ buffer1, int size1, int* __restrict__ buffer2, int size2) {
+    clearSingleBuffer(buffer1, size1);
+    clearSingleBuffer(buffer2, size2);
+}
+
+/**
+ * Fill three buffers with 0.  Each sizeN is the number of int elements to clear in
+ * the corresponding bufferN; the buffers are cleared one after another.
+ */
+__global__ void clearThreeBuffers(int* __restrict__ buffer1, int size1, int* __restrict__ buffer2, int size2, int* __restrict__ buffer3, int size3) {
+    clearSingleBuffer(buffer1, size1);
+    clearSingleBuffer(buffer2, size2);
+    clearSingleBuffer(buffer3, size3);
+}
+
+/**
+ * Fill four buffers with 0.  Each sizeN is the number of int elements to clear in
+ * the corresponding bufferN; the buffers are cleared one after another.
+ */
+__global__ void clearFourBuffers(int* __restrict__ buffer1, int size1, int* __restrict__ buffer2, int size2, int* __restrict__ buffer3, int size3, int* __restrict__ buffer4, int size4) {
+    clearSingleBuffer(buffer1, size1);
+    clearSingleBuffer(buffer2, size2);
+    clearSingleBuffer(buffer3, size3);
+    clearSingleBuffer(buffer4, size4);
+}
+
+/**
+ * Fill five buffers with 0.  Each sizeN is the number of int elements to clear in
+ * the corresponding bufferN; the buffers are cleared one after another.
+ */
+__global__ void clearFiveBuffers(int* __restrict__ buffer1, int size1, int* __restrict__ buffer2, int size2, int* __restrict__ buffer3, int size3, int* __restrict__ buffer4, int size4, int* __restrict__ buffer5, int size5) {
+    clearSingleBuffer(buffer1, size1);
+    clearSingleBuffer(buffer2, size2);
+    clearSingleBuffer(buffer3, size3);
+    clearSingleBuffer(buffer4, size4);
+    clearSingleBuffer(buffer5, size5);
+}
+
+/**
+ * Fill six buffers with 0.  Each sizeN is the number of int elements to clear in
+ * the corresponding bufferN; the buffers are cleared one after another.
+ */
+__global__ void clearSixBuffers(int* __restrict__ buffer1, int size1, int* __restrict__ buffer2, int size2, int* __restrict__ buffer3, int size3, int* __restrict__ buffer4, int size4, int* __restrict__ buffer5, int size5, int* __restrict__ buffer6, int size6) {
+    clearSingleBuffer(buffer1, size1);
+    clearSingleBuffer(buffer2, size2);
+    clearSingleBuffer(buffer3, size3);
+    clearSingleBuffer(buffer4, size4);
+    clearSingleBuffer(buffer5, size5);
+    clearSingleBuffer(buffer6, size6);
+}
+
+/**
+ * Add up the bufferSize entries of the energy buffer and store the total in *result.
+ * Only threadIdx.x is used to split the work, and the reduction runs over the first
+ * workGroupSize slots of the temporary buffer.
+ */
+__global__ void reduceEnergy(const mixed* __restrict__ energyBuffer, mixed* __restrict__ result, int bufferSize, int workGroupSize) {
+    HIP_DYNAMIC_SHARED( mixed, tempBuffer)
+    const unsigned int tid = threadIdx.x;
+
+    // Each thread accumulates its own strided slice of the buffer.
+    mixed partial = 0;
+    for (unsigned int i = tid; i < bufferSize; i += blockDim.x)
+        partial += energyBuffer[i];
+    tempBuffer[tid] = partial;
+
+    // Pairwise reduction; the barrier at the top of each pass makes the previous
+    // pass's results visible before they are read.
+    for (int stride = 1; stride < workGroupSize; stride *= 2) {
+        __syncthreads();
+        const bool active = (tid%(stride*2) == 0 && tid+stride < workGroupSize);
+        if (active)
+            tempBuffer[tid] += tempBuffer[tid+stride];
+    }
+    if (tid == 0)
+        result[0] = tempBuffer[0];
+}
+
+/**
+ * Copy the atomic charges into the w component of the posq array.  Entry i of posq
+ * receives the charge of atom atomOrder[i].
+ */
+__global__ void setCharges(real* __restrict__ charges, real4* __restrict__ posq, int* __restrict__ atomOrder, int numAtoms) {
+    int i = blockDim.x*blockIdx.x+threadIdx.x;
+    while (i < numAtoms) {
+        const int atom = atomOrder[i];
+        posq[i].w = charges[atom];
+        i += blockDim.x*gridDim.x;
+    }
+}
+}
--- a/platforms/hip/src/kernels/vectorOps.hip
+++ b/platforms/hip/src/kernels/vectorOps.hip
+/**
+ * This file defines vector operations to simplify code elsewhere.
+ */
+
+// Versions of make_x() that take a single value and set all components to that.
+// Each overload simply forwards to the multi-argument make_x() with the value
+// repeated once per component.
+
+// Two-component int vector with both components equal to a.
+inline __device__ int2 make_int2(int a) {
+    return make_int2(a, a);
+}
+
+// Three-component int vector with all components equal to a.
+inline __device__ int3 make_int3(int a) {
+    return make_int3(a, a, a);
+}
+
+// Four-component int vector with all components equal to a.
+inline __device__ int4 make_int4(int a) {
+    return make_int4(a, a, a, a);
+}
+
+// Two-component float vector with both components equal to a.
+inline __device__ float2 make_float2(float a) {
+    return make_float2(a, a);
+}
+
+// Three-component float vector with all components equal to a.
+inline __device__ float3 make_float3(float a) {
+    return make_float3(a, a, a);
+}
+
+// Four-component float vector with all components equal to a.
+inline __device__ float4 make_float4(float a) {
+    return make_float4(a, a, a, a);
+}
+
+// Two-component double vector with both components equal to a.
+inline __device__ double2 make_double2(double a) {
+    return make_double2(a, a);
+}
+
+// Three-component double vector with all components equal to a.
+inline __device__ double3 make_double3(double a) {
+    return make_double3(a, a, a);
+}
+
+// Four-component double vector with all components equal to a.
+inline __device__ double4 make_double4(double a) {
+    return make_double4(a, a, a, a);
+}
+
+// Component-wise product of a vector and a scalar, with the scalar on either side.
+// The vector argument is received by value, so it is scaled in place and returned.
+
+inline __device__ int2 operator*(int2 a, int b) {
+    a.x *= b;
+    a.y *= b;
+    return a;
+}
+
+inline __device__ int3 operator*(int3 a, int b) {
+    a.x *= b;
+    a.y *= b;
+    a.z *= b;
+    return a;
+}
+
+inline __device__ int4 operator*(int4 a, int b) {
+    a.x *= b;
+    a.y *= b;
+    a.z *= b;
+    a.w *= b;
+    return a;
+}
+
+inline __device__ int2 operator*(int a, int2 b) {
+    b.x = a*b.x;
+    b.y = a*b.y;
+    return b;
+}
+
+inline __device__ int3 operator*(int a, int3 b) {
+    b.x = a*b.x;
+    b.y = a*b.y;
+    b.z = a*b.z;
+    return b;
+}
+
+inline __device__ int4 operator*(int a, int4 b) {
+    b.x = a*b.x;
+    b.y = a*b.y;
+    b.z = a*b.z;
+    b.w = a*b.w;
+    return b;
+}
+
+inline __device__ float2 operator*(float2 a, float b) {
+    a.x *= b;
+    a.y *= b;
+    return a;
+}
+
+inline __device__ float3 operator*(float3 a, float b) {
+    a.x *= b;
+    a.y *= b;
+    a.z *= b;
+    return a;
+}
+
+inline __device__ float4 operator*(float4 a, float b) {
+    a.x *= b;
+    a.y *= b;
+    a.z *= b;
+    a.w *= b;
+    return a;
+}
+
+inline __device__ float2 operator*(float a, float2 b) {
+    b.x = a*b.x;
+    b.y = a*b.y;
+    return b;
+}
+
+inline __device__ float3 operator*(float a, float3 b) {
+    b.x = a*b.x;
+    b.y = a*b.y;
+    b.z = a*b.z;
+    return b;
+}
+
+inline __device__ float4 operator*(float a, float4 b) {
+    b.x = a*b.x;
+    b.y = a*b.y;
+    b.z = a*b.z;
+    b.w = a*b.w;
+    return b;
+}
+
+inline __device__ double2 operator*(double2 a, double b) {
+    a.x *= b;
+    a.y *= b;
+    return a;
+}
+
+inline __device__ double3 operator*(double3 a, double b) {
+    a.x *= b;
+    a.y *= b;
+    a.z *= b;
+    return a;
+}
+
+inline __device__ double4 operator*(double4 a, double b) {
+    a.x *= b;
+    a.y *= b;
+    a.z *= b;
+    a.w *= b;
+    return a;
+}
+
+inline __device__ double2 operator*(double a, double2 b) {
+    b.x = a*b.x;
+    b.y = a*b.y;
+    return b;
+}
+
+inline __device__ double3 operator*(double a, double3 b) {
+    b.x = a*b.x;
+    b.y = a*b.y;
+    b.z = a*b.z;
+    return b;
+}
+
+inline __device__ double4 operator*(double a, double4 b) {
+    b.x = a*b.x;
+    b.y = a*b.y;
+    b.z = a*b.z;
+    b.w = a*b.w;
+    return b;
+}
+
+// Quotient of a vector and a scalar.  Integer vectors are divided component by
+// component; floating-point vectors are multiplied by the reciprocal of the scalar.
+
+inline __device__ int2 operator/(int2 a, int b) {
+    a.x /= b;
+    a.y /= b;
+    return a;
+}
+
+inline __device__ int3 operator/(int3 a, int b) {
+    a.x /= b;
+    a.y /= b;
+    a.z /= b;
+    return a;
+}
+
+inline __device__ int4 operator/(int4 a, int b) {
+    a.x /= b;
+    a.y /= b;
+    a.z /= b;
+    a.w /= b;
+    return a;
+}
+
+inline __device__ float2 operator/(float2 a, float b) {
+    return a*(1.0f/b);
+}
+
+inline __device__ float3 operator/(float3 a, float b) {
+    return a*(1.0f/b);
+}
+
+inline __device__ float4 operator/(float4 a, float b) {
+    return a*(1.0f/b);
+}
+
+inline __device__ double2 operator/(double2 a, double b) {
+    return a*(1.0/b);
+}
+
+inline __device__ double3 operator/(double3 a, double b) {
+    return a*(1.0/b);
+}
+
+inline __device__ double4 operator/(double4 a, double b) {
+    return a*(1.0/b);
+}
+
+// In-place scaling: multiply every component of the vector by a constant.
+
+inline __device__ void operator*=(int2& a, int b) {
+    a.x = a.x*b;
+    a.y = a.y*b;
+}
+
+inline __device__ void operator*=(int3& a, int b) {
+    a.x = a.x*b;
+    a.y = a.y*b;
+    a.z = a.z*b;
+}
+
+inline __device__ void operator*=(int4& a, int b) {
+    a.x = a.x*b;
+    a.y = a.y*b;
+    a.z = a.z*b;
+    a.w = a.w*b;
+}
+
+inline __device__ void operator*=(float2& a, float b) {
+    a.x = a.x*b;
+    a.y = a.y*b;
+}
+
+inline __device__ void operator*=(float3& a, float b) {
+    a.x = a.x*b;
+    a.y = a.y*b;
+    a.z = a.z*b;
+}
+
+inline __device__ void operator*=(float4& a, float b) {
+    a.x = a.x*b;
+    a.y = a.y*b;
+    a.z = a.z*b;
+    a.w = a.w*b;
+}
+
+inline __device__ void operator*=(double2& a, double b) {
+    a.x = a.x*b;
+    a.y = a.y*b;
+}
+
+inline __device__ void operator*=(double3& a, double b) {
+    a.x = a.x*b;
+    a.y = a.y*b;
+    a.z = a.z*b;
+}
+
+inline __device__ void operator*=(double4& a, double b) {
+    a.x = a.x*b;
+    a.y = a.y*b;
+    a.z = a.z*b;
+    a.w = a.w*b;
+}
+
+// Dot product
+
+// Sum of the products of the x, y and z components of a and b (single precision).
+inline __device__ float dot(float3 a, float3 b) {
+    return a.x*b.x+a.y*b.y+a.z*b.z;
+}
+
+// Sum of the products of the x, y and z components of a and b (double precision).
+inline __device__ double dot(double3 a, double3 b) {
+    return a.x*b.x+a.y*b.y+a.z*b.z;
+}
+
+// Cross product
+
+// Cross product a x b of two 3-component float vectors.
+inline __device__ float3 cross(float3 a, float3 b) {
+    return make_float3(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x);
+}
+
+// Cross product of the x, y, z parts of two float4 vectors.  The w components of the
+// inputs are not read, and the w component of the result is set to 0.
+inline __device__ float4 cross(float4 a, float4 b) {
+    return make_float4(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x, 0.0f);
+}
+
+// Cross product a x b of two 3-component double vectors.
+inline __device__ double3 cross(double3 a, double3 b) {
+    return make_double3(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x);
+}
+
+// Cross product of the x, y, z parts of two double4 vectors.  The w components of the
+// inputs are not read, and the w component of the result is set to 0.
+inline __device__ double4 cross(double4 a, double4 b) {
+    return make_double4(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x, 0.0);
+}
+
+// Normalize a vector
+//
+// Each overload scales the vector by the reciprocal square root of the sum of the
+// squares of all its components (including w for the 4-component versions).
+// No check is made for a zero-length input.
+
+inline __device__ float2 normalize(float2 a) {
+    return a*rsqrtf(a.x*a.x+a.y*a.y);
+}
+
+inline __device__ float3 normalize(float3 a) {
+    return a*rsqrtf(a.x*a.x+a.y*a.y+a.z*a.z);
+}
+
+inline __device__ float4 normalize(float4 a) {
+    return a*rsqrtf(a.x*a.x+a.y*a.y+a.z*a.z+a.w*a.w);
+}
+
+inline __device__ double2 normalize(double2 a) {
+    return a*rsqrt(a.x*a.x+a.y*a.y);
+}
+
+inline __device__ double3 normalize(double3 a) {
+    return a*rsqrt(a.x*a.x+a.y*a.y+a.z*a.z);
+}
+
+inline __device__ double4 normalize(double4 a) {
+    return a*rsqrt(a.x*a.x+a.y*a.y+a.z*a.z+a.w*a.w);
+}
+
+// Strip off the fourth component of a vector.
+//
+// Each overload returns a 3-component vector of the same element type holding the
+// x, y and z components of v; the w component is dropped.
+
+inline __device__ short3 trimTo3(short4 v) {
+    return make_short3(v.x, v.y, v.z);
+}
+
+inline __device__ int3 trimTo3(int4 v) {
+    return make_int3(v.x, v.y, v.z);
+}
+
+inline __device__ float3 trimTo3(float4 v) {
+    return make_float3(v.x, v.y, v.z);
+}
+
+inline __device__ double3 trimTo3(double4 v) {
+    return make_double3(v.x, v.y, v.z);
+}
--- a/platforms/hip/staticTarget/CMakeLists.txt
+++ b/platforms/hip/staticTarget/CMakeLists.txt
+#
+# Build the static library variant of the HIP platform.
+#
+
+# Header and library search paths for HIP.
+include_directories(${MMHIP_INCLUDE_DIRS})
+link_directories(${MMHIP_LINK_DIRS})
+
+# Regenerate the encoded kernel sources whenever one of the .hip kernel files changes.
+file(GLOB HIP_KERNELS ${KERNEL_SOURCE_DIR}/kernels/*.hip)
+add_custom_command(OUTPUT ${KERNELS_CPP} ${KERNELS_H}
+    COMMAND ${CMAKE_COMMAND}
+    ARGS -D KERNEL_SOURCE_DIR=${KERNEL_SOURCE_DIR} -D KERNELS_CPP=${KERNELS_CPP} -D KERNELS_H=${KERNELS_H} -D KERNEL_SOURCE_CLASS=${KERNEL_SOURCE_CLASS} -D KERNEL_FILE_EXTENSION=hip -P ${CMAKE_SOURCE_DIR}/cmake_modules/EncodeKernelFiles.cmake
+    DEPENDS ${HIP_KERNELS}
+)
+
+# Compile the sources with the HIP flags and mark the generated kernel files as such.
+set_source_files_properties(${SOURCE_FILES} PROPERTIES COMPILE_FLAGS "${HIPCXXFLAGS}")
+set_source_files_properties(${KERNELS_CPP} ${KERNELS_H} PROPERTIES GENERATED TRUE)
+add_library(${STATIC_TARGET} STATIC ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
+
+target_link_libraries(${STATIC_TARGET} ${OPENMM_LIBRARY_NAME} ${MMHIP_LIBS} ${PTHREADS_LIB_STATIC})
+set_target_properties(${STATIC_TARGET} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_COMMON_BUILDING_STATIC_LIBRARY")
+
+# NOTE(review): the Apple branch links against a "HIP" framework and passes the compile
+# flags as link flags; this looks carried over from the CUDA build -- confirm it is wanted.
+if(APPLE)
+    set_target_properties(${STATIC_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS} -F/Library/Frameworks -framework HIP")
+else()
+    set_target_properties(${STATIC_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}")
+endif()
+
+install_targets(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${STATIC_TARGET})
--- a/platforms/hip/tests/CMakeLists.txt
+++ b/platforms/hip/tests/CMakeLists.txt
+#
+# HIP platform test programs
+#
+
+enable_testing()
+
+# Header and library search paths for HIP.
+include_directories(${MMHIP_INCLUDE_DIRS})
+link_directories(${MMHIP_LINK_DIRS})
+
+set(OPENMM_BUILD_HIP_DOUBLE_PRECISION_TESTS TRUE CACHE BOOL "Whether to build double precision versions of HIP test cases")
+
+# Serialization support is switched off; swap the two lines below to enable it.
+set(INCLUDE_SERIALIZATION FALSE)
+#set(INCLUDE_SERIALIZATION TRUE)
+
+if(INCLUDE_SERIALIZATION)
+    include_directories(${OPENMM_DIR}/serialization/include)
+    set(SHARED_OPENMM_SERIALIZATION "OpenMMSerialization")
+endif()
+
+# Every source file matching "*Test*.cpp" becomes its own test executable.
+file(GLOB TEST_PROGS "*Test*.cpp")
+foreach(TEST_PROG ${TEST_PROGS})
+    get_filename_component(TEST_ROOT ${TEST_PROG} NAME_WE)
+
+    # Link with shared library
+    add_executable(${TEST_ROOT} ${TEST_PROG})
+    target_link_libraries(${TEST_ROOT} ${SHARED_TARGET})
+    if(APPLE)
+        set_target_properties(${TEST_ROOT} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS} -F/Library/Frameworks -framework HIP" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} ${HIPCXXFLAGS}")
+    else()
+        set_target_properties(${TEST_ROOT} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} ${HIPCXXFLAGS}")
+    endif()
+
+    # Each executable is registered once per precision mode, passed as its first argument.
+    add_test(${TEST_ROOT}Single ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT} single)
+    if(OPENMM_BUILD_HIP_DOUBLE_PRECISION_TESTS)
+        add_test(${TEST_ROOT}Mixed ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT} mixed)
+        add_test(${TEST_ROOT}Double ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT} double)
+    endif()
+
+endforeach()
--- a/platforms/hip/tests/HipTests.h
+++ b/platforms/hip/tests/HipTests.h
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2015-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#ifdef WIN32
+  #define _USE_MATH_DEFINES // Needed to get M_PI
+#endif
+#include "HipPlatform.h"
+#include <string>
+
+// Global HipPlatform instance; initializeTests() below configures it.
+OpenMM::HipPlatform platform;
+
+/**
+ * Apply the command line to the test platform.  When at least one argument is
+ * given, argv[1] becomes the default value of the platform's "Precision" property;
+ * otherwise the platform is left untouched.
+ */
+void initializeTests(int argc, char* argv[]) {
+    if (argc <= 1)
+        return;
+    const std::string precision(argv[1]);
+    platform.setPropertyDefaultValue("Precision", precision);
+}
--- a/platforms/hip/tests/TestHipAndersenThermostat.cpp
+++ b/platforms/hip/tests/TestHipAndersenThermostat.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2015 Stanford University and the Authors.           *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "HipTests.h"
+#include "TestAndersenThermostat.h"
+
+// No HIP-specific test cases are defined for this suite; the body is empty.
+// NOTE(review): presumably called by the shared driver in TestAndersenThermostat.h -- confirm.
+void runPlatformTests() {
+}
--- a/platforms/hip/tests/TestHipBrownianIntegrator.cpp
+++ b/platforms/hip/tests/TestHipBrownianIntegrator.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2015 Stanford University and the Authors.           *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "HipTests.h"
+#include "TestBrownianIntegrator.h"
+
+// No HIP-specific test cases are defined for this suite; the body is empty.
+// NOTE(review): presumably called by the shared driver in TestBrownianIntegrator.h -- confirm.
+void runPlatformTests() {
+}
--- a/platforms/hip/tests/TestHipCMAPTorsionForce.cpp
+++ b/platforms/hip/tests/TestHipCMAPTorsionForce.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2015 Stanford University and the Authors.           *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "HipTests.h"
+#include "TestCMAPTorsionForce.h"
+
+// No HIP-specific test cases are defined for this suite; the body is empty.
+// NOTE(review): presumably called by the shared driver in TestCMAPTorsionForce.h -- confirm.
+void runPlatformTests() {
+}
--- a/platforms/hip/tests/TestHipCMMotionRemover.cpp
+++ b/platforms/hip/tests/TestHipCMMotionRemover.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2015 Stanford University and the Authors.           *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "HipTests.h"
+#include "TestCMMotionRemover.h"
+
+void runPlatformTests() {  // Empty: this file defines no HIP-specific test cases.
+}  // NOTE(review): presumably invoked by the test driver that comes in through the headers above -- confirm.
--- a/platforms/hip/tests/TestHipCheckpoints.cpp
+++ b/platforms/hip/tests/TestHipCheckpoints.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2012-2015 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "HipTests.h"
+#include "TestCheckpoints.h"
+
+/**
+ * Exercise checkpoint creation and loading on the HIP platform. A randomly
+ * placed periodic system containing an AndersenThermostat and a NonbondedForce
+ * is simulated, checkpointed, advanced, perturbed and then restored. The
+ * restored state and the trajectory that follows it are compared (through
+ * compareStates) with what was recorded before the restore. The sequence is
+ * run on a default context and on a context whose device index property lists
+ * the same device twice; finally the first checkpoint is loaded into a newly
+ * created context.
+ */
+void testCheckpoint() {
+    const int numParticles = 100;
+    const double boxSize = 5.0;
+    const double doubledBox = 2*boxSize;
+    const double temperature = 200.0;
+    const double minSeparation = 0.1;
+    const int stateTypes = State::Positions | State::Velocities | State::Parameters;
+    const ios_base::openmode streamMode = ios_base::out | ios_base::in | ios_base::binary;
+
+    // Assemble the system: a thermostat followed by periodic cutoff nonbonded interactions.
+
+    System system;
+    system.addForce(new AndersenThermostat(0.0, 100.0));
+    NonbondedForce* nonbonded = new NonbondedForce();
+    nonbonded->setNonbondedMethod(NonbondedForce::CutoffPeriodic);
+    system.addForce(nonbonded);
+
+    // Add the particles, redrawing each random position until it is no closer
+    // than minSeparation to any position chosen earlier.
+
+    vector<Vec3> positions(numParticles);
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+    for (int atom = 0; atom < numParticles; atom++) {
+        system.addParticle(1.0);
+        const double charge = (atom%2 == 0 ? 0.1 : -0.1);
+        nonbonded->addParticle(charge, 0.2, 0.1);
+        bool clash = true;
+        while (clash) {
+            clash = false;
+            positions[atom] = Vec3(boxSize*genrand_real2(sfmt), boxSize*genrand_real2(sfmt), boxSize*genrand_real2(sfmt));
+            for (int prev = 0; prev < atom; prev++) {
+                const Vec3 delta = positions[atom]-positions[prev];
+                if (sqrt(delta.dot(delta)) < minSeparation)
+                    clash = true;
+            }
+        }
+    }
+
+    // First pass: a context created with the default properties.
+
+    VerletIntegrator baseIntegrator(0.001);
+    Context baseContext(system, baseIntegrator, platform);
+    baseContext.setPositions(positions);
+    baseContext.setPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
+    baseContext.setParameter(AndersenThermostat::Temperature(), temperature);
+
+    // Simulate briefly, then capture both the state and a checkpoint.
+
+    baseIntegrator.step(100);
+    State savedState = baseContext.getState(stateTypes);
+    stringstream stream1(streamMode);
+    baseContext.createCheckpoint(stream1);
+
+    // Advance a few more steps and remember where the trajectory ends up.
+
+    baseIntegrator.step(10);
+    State continuedState = baseContext.getState(stateTypes);
+
+    // Disturb the box and the thermostat temperature, then restore the
+    // checkpoint and compare with the state captured alongside it.
+
+    baseContext.setPeriodicBoxVectors(Vec3(doubledBox, 0, 0), Vec3(0, doubledBox, 0), Vec3(0, 0, doubledBox));
+    baseContext.setParameter(AndersenThermostat::Temperature(), temperature+10);
+    baseContext.loadCheckpoint(stream1);
+    State restoredState = baseContext.getState(stateTypes);
+    compareStates(savedState, restoredState);
+
+    // Replaying the same number of steps is compared with the earlier continuation.
+
+    baseIntegrator.step(10);
+    State replayedState = baseContext.getState(stateTypes);
+    compareStates(continuedState, replayedState);
+
+    // Second pass: a context that uses multiple devices (the first context's
+    // device index, listed twice).
+
+    const string deviceIndex = platform.getPropertyValue(baseContext, HipPlatform::HipDeviceIndex());
+    map<string, string> multiProps;
+    multiProps[HipPlatform::HipDeviceIndex()] = deviceIndex+","+deviceIndex;
+    VerletIntegrator multiIntegrator(0.001);
+    Context multiContext(system, multiIntegrator, platform, multiProps);
+    multiContext.setPositions(positions);
+    multiContext.setPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
+    multiContext.setParameter(AndersenThermostat::Temperature(), temperature);
+
+    // Same save / continue / perturb / restore / replay sequence as above.
+
+    multiIntegrator.step(100);
+    State multiSaved = multiContext.getState(stateTypes);
+    stringstream stream2(streamMode);
+    multiContext.createCheckpoint(stream2);
+    multiIntegrator.step(10);
+    State multiContinued = multiContext.getState(stateTypes);
+    multiContext.setPeriodicBoxVectors(Vec3(doubledBox, 0, 0), Vec3(0, doubledBox, 0), Vec3(0, 0, doubledBox));
+    multiContext.setParameter(AndersenThermostat::Temperature(), temperature+10);
+    multiContext.loadCheckpoint(stream2);
+    State multiRestored = multiContext.getState(stateTypes);
+    compareStates(multiSaved, multiRestored);
+    multiIntegrator.step(10);
+    State multiReplayed = multiContext.getState(stateTypes);
+    compareStates(multiContinued, multiReplayed);
+
+    // Finally, load the first checkpoint into a different Context. The stream
+    // has already been read once, so move its read position back to the start.
+
+    VerletIntegrator freshIntegrator(0.001);
+    Context freshContext(system, freshIntegrator, platform);
+    stream1.seekg(0, stream1.beg);
+    freshContext.loadCheckpoint(stream1);
+    State transferredState = freshContext.getState(stateTypes | State::Energy);
+    compareStates(savedState, transferredState);
+}
+
+void runPlatformTests() {  // Runs the HIP-specific test defined in this file.
+    testCheckpoint();  // Checkpoint create/load checks on a default context and on one listing the same device twice.
+}  // NOTE(review): presumably invoked by the test driver that comes in through the headers above -- confirm.
--- a/platforms/hip/tests/TestHipCompoundIntegrator.cpp
+++ b/platforms/hip/tests/TestHipCompoundIntegrator.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2015 Stanford University and the Authors.           *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "HipTests.h"
+#include "TestCompoundIntegrator.h"
+
+void runPlatformTests() {  // Empty: this file defines no HIP-specific test cases.
+}  // NOTE(review): presumably invoked by the test driver that comes in through the headers above -- confirm.
--- a/platforms/hip/tests/TestHipCustomAngleForce.cpp
+++ b/platforms/hip/tests/TestHipCustomAngleForce.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2015 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "HipTests.h"
+#include "TestCustomAngleForce.h"
+
+/**
+ * Evaluate a CustomAngleForce on two contexts, one created with default
+ * properties and one whose HipDeviceIndex property lists the first context's
+ * device twice, and require the energies and per-particle forces to agree to
+ * within 1e-5.
+ */
+void testParallelComputation() {
+    const int numParticles = 200;
+
+    // Particles sit on a zigzag line: x advances by one, y alternates 0/1.
+
+    System system;
+    vector<Vec3> positions(numParticles);
+    for (int p = 0; p < numParticles; p++) {
+        system.addParticle(1.0);
+        positions[p] = Vec3(p, p%2, 0);
+    }
+
+    // One angle term for every run of three consecutive particles.
+
+    CustomAngleForce* force = new CustomAngleForce("(theta-1.1)^2");
+    const vector<double> noParams;
+    for (int first = 0; first+2 < numParticles; first++)
+        force->addAngle(first, first+1, first+2, noParams);
+    system.addForce(force);
+
+    // Reference evaluation with default properties.
+
+    VerletIntegrator integrator1(0.01);
+    Context context1(system, integrator1, platform);
+    context1.setPositions(positions);
+    State state1 = context1.getState(State::Forces | State::Energy);
+
+    // Second evaluation with the same device index repeated.
+
+    const string deviceIndex = platform.getPropertyValue(context1, HipPlatform::HipDeviceIndex());
+    map<string, string> props;
+    props[HipPlatform::HipDeviceIndex()] = deviceIndex+","+deviceIndex;
+    VerletIntegrator integrator2(0.01);
+    Context context2(system, integrator2, platform, props);
+    context2.setPositions(positions);
+    State state2 = context2.getState(State::Forces | State::Energy);
+
+    // Both evaluations must agree.
+
+    ASSERT_EQUAL_TOL(state1.getPotentialEnergy(), state2.getPotentialEnergy(), 1e-5);
+    const vector<Vec3>& expectedForces = state1.getForces();
+    const vector<Vec3>& foundForces = state2.getForces();
+    for (int p = 0; p < numParticles; p++)
+        ASSERT_EQUAL_VEC(expectedForces[p], foundForces[p], 1e-5);
+}
+
+void runPlatformTests() {  // Runs the HIP-specific test defined in this file.
+    testParallelComputation();  // Compares a default context with one whose device index is listed twice.
+}  // NOTE(review): presumably invoked by the test driver that comes in through the headers above -- confirm.
--- a/platforms/hip/tests/TestHipCustomBondForce.cpp
+++ b/platforms/hip/tests/TestHipCustomBondForce.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2015 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "HipTests.h"
+#include "TestCustomBondForce.h"
+
+/**
+ * Evaluate a CustomBondForce on two contexts, one created with default
+ * properties and one whose HipDeviceIndex property lists the first context's
+ * device twice, and require the energies and per-particle forces to agree to
+ * within 1e-5.
+ */
+void testParallelComputation() {
+    const int numParticles = 200;
+
+    // Particles are spaced one unit apart along the x axis.
+
+    System system;
+    vector<Vec3> positions(numParticles);
+    for (int p = 0; p < numParticles; p++) {
+        system.addParticle(1.0);
+        positions[p] = Vec3(p, 0, 0);
+    }
+
+    // Bond every particle to its successor.
+
+    CustomBondForce* force = new CustomBondForce("(r-1.1)^2");
+    const vector<double> noParams;
+    for (int first = 0; first+1 < numParticles; first++)
+        force->addBond(first, first+1, noParams);
+    system.addForce(force);
+
+    // Reference evaluation with default properties.
+
+    VerletIntegrator integrator1(0.01);
+    Context context1(system, integrator1, platform);
+    context1.setPositions(positions);
+    State state1 = context1.getState(State::Forces | State::Energy);
+
+    // Second evaluation with the same device index repeated.
+
+    const string deviceIndex = platform.getPropertyValue(context1, HipPlatform::HipDeviceIndex());
+    map<string, string> props;
+    props[HipPlatform::HipDeviceIndex()] = deviceIndex+","+deviceIndex;
+    VerletIntegrator integrator2(0.01);
+    Context context2(system, integrator2, platform, props);
+    context2.setPositions(positions);
+    State state2 = context2.getState(State::Forces | State::Energy);
+
+    // Both evaluations must agree.
+
+    ASSERT_EQUAL_TOL(state1.getPotentialEnergy(), state2.getPotentialEnergy(), 1e-5);
+    const vector<Vec3>& expectedForces = state1.getForces();
+    const vector<Vec3>& foundForces = state2.getForces();
+    for (int p = 0; p < numParticles; p++)
+        ASSERT_EQUAL_VEC(expectedForces[p], foundForces[p], 1e-5);
+}
+
+void runPlatformTests() {  // Runs the HIP-specific test defined in this file.
+    testParallelComputation();  // Compares a default context with one whose device index is listed twice.
+}  // NOTE(review): presumably invoked by the test driver that comes in through the headers above -- confirm.
--- a/platforms/hip/tests/TestHipCustomCVForce.cpp
+++ b/platforms/hip/tests/TestHipCustomCVForce.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2017 Stanford University and the Authors.           *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "HipTests.h"
+#include "TestCustomCVForce.h"
+
+void runPlatformTests() {  // Empty: this file defines no HIP-specific test cases.
+}  // NOTE(review): presumably invoked by the test driver that comes in through the headers above -- confirm.
--- a/platforms/hip/tests/TestHipCustomCentroidBondForce.cpp
+++ b/platforms/hip/tests/TestHipCustomCentroidBondForce.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2015 Stanford University and the Authors.           *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "HipTests.h"
+#include "TestCustomCentroidBondForce.h"
+
+void runPlatformTests() {  // Empty: this file defines no HIP-specific test cases.
+}  // NOTE(review): presumably invoked by the test driver that comes in through the headers above -- confirm.
--- a/platforms/hip/tests/TestHipCustomCompoundBondForce.cpp
+++ b/platforms/hip/tests/TestHipCustomCompoundBondForce.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2012-2015 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "HipTests.h"
+#include "TestCustomCompoundBondForce.h"
+
+/**
+ * Evaluate a two-particle CustomCompoundBondForce on two contexts, one created
+ * with default properties and one whose HipDeviceIndex property lists the
+ * first context's device twice, and require the energies and per-particle
+ * forces to agree to within 1e-5.
+ */
+void testParallelComputation() {
+    const int numParticles = 200;
+
+    // Particles are spaced one unit apart along the x axis.
+
+    System system;
+    vector<Vec3> positions(numParticles);
+    for (int p = 0; p < numParticles; p++) {
+        system.addParticle(1.0);
+        positions[p] = Vec3(p, 0, 0);
+    }
+
+    // Add one compound bond per pair of neighbouring particles.
+
+    CustomCompoundBondForce* force = new CustomCompoundBondForce(2, "(distance(p1,p2)-1.1)^2");
+    vector<int> pair(2);
+    const vector<double> noParams;
+    for (int first = 0; first+1 < numParticles; first++) {
+        pair[0] = first;
+        pair[1] = first+1;
+        force->addBond(pair, noParams);
+    }
+    system.addForce(force);
+
+    // Reference evaluation with default properties.
+
+    VerletIntegrator integrator1(0.01);
+    Context context1(system, integrator1, platform);
+    context1.setPositions(positions);
+    State state1 = context1.getState(State::Forces | State::Energy);
+
+    // Second evaluation with the same device index repeated.
+
+    const string deviceIndex = platform.getPropertyValue(context1, HipPlatform::HipDeviceIndex());
+    map<string, string> props;
+    props[HipPlatform::HipDeviceIndex()] = deviceIndex+","+deviceIndex;
+    VerletIntegrator integrator2(0.01);
+    Context context2(system, integrator2, platform, props);
+    context2.setPositions(positions);
+    State state2 = context2.getState(State::Forces | State::Energy);
+
+    // Both evaluations must agree.
+
+    ASSERT_EQUAL_TOL(state1.getPotentialEnergy(), state2.getPotentialEnergy(), 1e-5);
+    const vector<Vec3>& expectedForces = state1.getForces();
+    const vector<Vec3>& foundForces = state2.getForces();
+    for (int p = 0; p < numParticles; p++)
+        ASSERT_EQUAL_VEC(expectedForces[p], foundForces[p], 1e-5);
+}
+
+void runPlatformTests() {  // Runs the HIP-specific test defined in this file.
+    testParallelComputation();  // Compares a default context with one whose device index is listed twice.
+}  // NOTE(review): presumably invoked by the test driver that comes in through the headers above -- confirm.
--- a/platforms/hip/tests/TestHipCustomExternalForce.cpp
+++ b/platforms/hip/tests/TestHipCustomExternalForce.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2015 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "HipTests.h"
+#include "TestCustomExternalForce.h"
+#include "sfmt/SFMT.h"
+
+/**
+ * Check that a context whose HipDeviceIndex property lists the same device
+ * index twice (comma separated) yields the same potential energy and forces
+ * as a context created without any extra properties.
+ */
+void testParallelComputation() {
+    const int numParticles = 200;
+
+    // Every particle has mass 1.0 and is subject to the same parameterless
+    // external potential x^2+y^2+z^2.
+    System system;
+    CustomExternalForce* force = new CustomExternalForce("x^2+y^2+z^2");
+    const vector<double> params;
+    for (int particle = 0; particle < numParticles; ++particle) {
+        system.addParticle(1.0);
+        force->addParticle(particle, params);
+    }
+    system.addForce(force);
+
+    // Coordinates come from a generator initialised with 0, each scaled by 5.
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+    vector<Vec3> positions(numParticles);
+    for (Vec3& pos : positions)
+        pos = Vec3(5*genrand_real2(sfmt), 5*genrand_real2(sfmt), 5*genrand_real2(sfmt));
+
+    // Reference evaluation: context built with no additional properties.
+    VerletIntegrator referenceIntegrator(0.01);
+    Context referenceContext(system, referenceIntegrator, platform);
+    referenceContext.setPositions(positions);
+    State referenceState = referenceContext.getState(State::Forces | State::Energy);
+
+    // Second evaluation: reuse the reference context's device index, listed twice.
+    const string deviceIndex = platform.getPropertyValue(referenceContext, HipPlatform::HipDeviceIndex());
+    map<string, string> props;
+    props[HipPlatform::HipDeviceIndex()] = deviceIndex+","+deviceIndex;
+    VerletIntegrator repeatedIntegrator(0.01);
+    Context repeatedContext(system, repeatedIntegrator, platform, props);
+    repeatedContext.setPositions(positions);
+    State repeatedState = repeatedContext.getState(State::Forces | State::Energy);
+
+    // Both evaluations must agree on the energy and on each per-particle force.
+    const double referenceEnergy = referenceState.getPotentialEnergy();
+    const double repeatedEnergy = repeatedState.getPotentialEnergy();
+    ASSERT_EQUAL_TOL(referenceEnergy, repeatedEnergy, 1e-5);
+    const vector<Vec3>& referenceForces = referenceState.getForces();
+    const vector<Vec3>& repeatedForces = repeatedState.getForces();
+    for (int particle = 0; particle < numParticles; ++particle) {
+        ASSERT_EQUAL_VEC(referenceForces[particle], repeatedForces[particle], 1e-5);
+    }
+}
+
+void runPlatformTests() { // Runs the tests defined in this file; presumably called by the shared driver in TestCustomExternalForce.h -- confirm.
+    testParallelComputation(); // Compares a default context against one given the same device index twice.
+}