Merge pull request #4632 from ex-rzr/make-hip-standard-platform

HIP platform

Merge pull request #4632 from ex-rzr/make-hip-standard-platform
HIP platform
3b8df952 · Peter Eastman · GitHub · 5ce6a85d · 28fb2918 · 3b8df952
Unverified Commit 3b8df952 authored Sep 05, 2024 by Peter Eastman Committed by GitHub Sep 05, 2024
20 changed files
--- a/platforms/hip/src/kernels/common.hip
+++ b/platforms/hip/src/kernels/common.hip
+/**
+ * This file contains HIP definitions for the macros and functions needed for the
+ * common compute framework.
+ */
+
+#define KERNEL extern "C" __global__
+#define DEVICE __device__
+#define LOCAL __shared__
+#define LOCAL_ARG
+#define GLOBAL
+#define RESTRICT __restrict__
+#define LOCAL_ID threadIdx.x
+#define LOCAL_SIZE blockDim.x
+#define GLOBAL_ID (blockIdx.x*blockDim.x+threadIdx.x)
+#define GLOBAL_SIZE (blockDim.x*gridDim.x)
+#define GROUP_ID blockIdx.x
+#define NUM_GROUPS gridDim.x
+#define SYNC_THREADS __syncthreads();
+#define SYNC_WARPS {__builtin_amdgcn_wave_barrier(); __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "wavefront");}
+#define MEM_FENCE __threadfence_block();
+#define ATOMIC_ADD(dest, value) atomicAdd(dest, value)
+
+typedef long long mm_long;
+typedef unsigned long long mm_ulong;
+
+#define SUPPORTS_DOUBLE_PRECISION 1
+
+#define LAUNCH_BOUNDS_EXACT(WORK_GROUP_SIZE, WAVES_PER_EU) \
+    __attribute__((amdgpu_flat_work_group_size(WORK_GROUP_SIZE, WORK_GROUP_SIZE), amdgpu_waves_per_eu(WAVES_PER_EU, WAVES_PER_EU)))
+
+#ifdef USE_DOUBLE_PRECISION
+
+__device__ inline long long realToFixedPoint(double x) {
+    return static_cast<long long>(x * 0x100000000);
+}
+
+#else
+
+__device__ inline long long realToFixedPoint(float x) {
+    // Faster way to calculate static_cast<long long>(x * 0x100000000) with exactly the same
+    // results but less instructions.
+    float integral = truncf(x);
+    float fractional = (x - integral) * 0x100000000;
+    unsigned int integral_u32 = static_cast<int>(integral);
+    unsigned int fractional_u32 = static_cast<unsigned int>(fabsf(fractional));
+    // A negative real number (with non-zero fractional) needs rounding-down x for integral and
+    // changing fractional's sign. However, -1 is used as a threshold instead of 0 because, when
+    // fractional is in (-1; 0], fractional_u32 is 0 and the number is considered an integer.
+    bool isNegReal = fractional <= -1.0f;
+    return (static_cast<unsigned long long>(isNegReal ? integral_u32 - 1 : integral_u32) << 32) |
+            static_cast<unsigned long long>(isNegReal ? 0 - fractional_u32 : fractional_u32);
+}
+
+#endif
--- a/platforms/hip/src/kernels/findInteractingBlocks.hip
+++ b/platforms/hip/src/kernels/findInteractingBlocks.hip
+#define BUFFER_SIZE 256
+
+#if defined(AMD_RDNA)
+
+typedef unsigned int warpflags;
+
+__device__ inline int warpPopc(warpflags x) {
+    return __popc(x);
+}
+
+#else
+
+typedef unsigned long long warpflags;
+
+__device__ inline int warpPopc(warpflags x) {
+    return __popcll(x);
+}
+
+#endif
+
+struct alignas(sizeof(__half) * 4) BoundingBox {
+    __device__ BoundingBox(real3 f) {
+        // Round up so we'll err on the side of making the box a little too large.
+        // This ensures interactions will never be missed.
+        v[0] = __float2half_ru((float) f.x);
+        v[1] = __float2half_ru((float) f.y);
+        v[2] = __float2half_ru((float) f.z);
+    }
+    __device__ real3 toReal3() const {
+        return make_real3(__half2float(v[0]), __half2float(v[1]), __half2float(v[2]));
+    }
+private:
+    __half v[3];
+};
+
+/**
+ * Find a bounding box for the atoms in each block.
+ */
+extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
+        const real4* __restrict__ posq, real4* __restrict__ blockCenter, real4* __restrict__ blockBoundingBox, int* __restrict__ rebuildNeighborList,
+        real2* __restrict__ blockSizeRange) {
+    const int indexInTile = threadIdx.x%TILE_SIZE;
+    const int index = warpSize == TILE_SIZE ? blockIdx.x : (blockIdx.x*(warpSize/TILE_SIZE) + threadIdx.x/TILE_SIZE);
+    const int base = index * TILE_SIZE;
+    if (base >= numAtoms)
+        return;
+
+    real4 tPos = posq[base + indexInTile < numAtoms ? base + indexInTile : 0];
+#ifdef USE_PERIODIC
+    real4 pos;
+    pos.x = SHFL(tPos.x, 0);
+    pos.y = SHFL(tPos.y, 0);
+    pos.z = SHFL(tPos.z, 0);
+    APPLY_PERIODIC_TO_POS(pos)
+
+    real4 minPos = pos;
+    real4 maxPos = pos;
+
+    for (int i = 1; i < TILE_SIZE; i++) {
+        pos.x = SHFL(tPos.x, i);
+        pos.y = SHFL(tPos.y, i);
+        pos.z = SHFL(tPos.z, i);
+        real4 center = 0.5f*(maxPos+minPos);
+        APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)
+        minPos = make_real4(min(minPos.x,pos.x), min(minPos.y,pos.y), min(minPos.z,pos.z), 0);
+        maxPos = make_real4(max(maxPos.x,pos.x), max(maxPos.y,pos.y), max(maxPos.z,pos.z), 0);
+    }
+#else
+    real4 minPos = tPos;
+    real4 maxPos = tPos;
+
+    for (int i = TILE_SIZE >> 1; i > 0; i >>= 1) {
+        real4 tpos1, tpos2;
+        tpos1.x = __shfl_down(minPos.x, i, TILE_SIZE);
+        tpos1.y = __shfl_down(minPos.y, i, TILE_SIZE);
+        tpos1.z = __shfl_down(minPos.z, i, TILE_SIZE);
+        tpos2.x = __shfl_down(maxPos.x, i, TILE_SIZE);
+        tpos2.y = __shfl_down(maxPos.y, i, TILE_SIZE);
+        tpos2.z = __shfl_down(maxPos.z, i, TILE_SIZE);
+
+        minPos.x = min(minPos.x, tpos1.x);
+        minPos.y = min(minPos.y, tpos1.y);
+        minPos.z = min(minPos.z, tpos1.z);
+        maxPos.x = max(maxPos.x, tpos2.x);
+        maxPos.y = max(maxPos.y, tpos2.y);
+        maxPos.z = max(maxPos.z, tpos2.z);
+    }
+
+    minPos.x = SHFL(minPos.x, 0);
+    minPos.y = SHFL(minPos.y, 0);
+    minPos.z = SHFL(minPos.z, 0);
+    maxPos.x = SHFL(maxPos.x, 0);
+    maxPos.y = SHFL(maxPos.y, 0);
+    maxPos.z = SHFL(maxPos.z, 0);
+#endif
+
+    real4 blockSize = 0.5f*(maxPos-minPos);
+    real4 center = 0.5f*(maxPos+minPos);
+    center.w = 0;
+    real4 delta = tPos - center;
+#ifdef USE_PERIODIC
+    APPLY_PERIODIC_TO_DELTA(delta)
+#endif
+    real tdelta = delta.x*delta.x+delta.y*delta.y+delta.z*delta.z;
+    real tcenter = max(center.w, tdelta);
+    for (int i = TILE_SIZE >> 1; i > 0; i >>= 1) {
+        real t = __shfl_down(tcenter, i, TILE_SIZE);
+        tcenter = max(tcenter, t);
+    }
+
+    if (indexInTile == 0) {
+        center.w = SQRT(tcenter);
+        blockBoundingBox[index] = blockSize;
+        blockCenter[index] = center;
+
+        // Record the range of sizes.
+        real totalSize = blockSize.x+blockSize.y+blockSize.z;
+        atomicMin(&blockSizeRange->x, totalSize);
+        atomicMax(&blockSizeRange->y, totalSize);
+    }
+    if (blockIdx.x == 0 && threadIdx.x == 0)
+        rebuildNeighborList[0] = 0;
+}
+
+extern "C" __global__ void computeSortKeys(const real4* __restrict__ blockBoundingBox, unsigned int* __restrict__ sortedBlocks, const real2* __restrict__ blockSizeRange/*, int numSizes*/) {
+    // Sort keys store the bin in the high order part and the block in the low
+    // order part.
+
+    real2 sizeRange = make_real2(LOG(blockSizeRange->x), LOG(blockSizeRange->y));
+    int numSizeBins = 20;
+    real scale = numSizeBins/(sizeRange.y-sizeRange.x);
+    unsigned int i = threadIdx.x+blockIdx.x*blockDim.x;
+    if (i < NUM_BLOCKS) {
+        real4 box = blockBoundingBox[i];
+        real size = LOG(box.x+box.y+box.z);
+        int bin = (size-sizeRange.x)*scale;
+        bin = max(0, min(bin, numSizeBins-1));
+        sortedBlocks[i] = (((unsigned int) bin)<<BIN_SHIFT) + i;
+    }
+}
+
+/**
+ * Sort the data about bounding boxes so it can be accessed more efficiently in the next kernel.
+ */
+extern "C" __global__ void sortBoxData(const unsigned int* __restrict__ sortedBlocks, const real4* __restrict__ blockCenter,
+        const real4* __restrict__ blockBoundingBox, real4* __restrict__ sortedBlockCenter, BoundingBox* __restrict__ sortedBlockBoundingBox,
+#ifdef USE_LARGE_BLOCKS
+        real4* __restrict__ largeBlockCenter, BoundingBox* __restrict__ largeBlockBoundingBox, real4 periodicBoxSize,
+        real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
+#endif
+        const real4* __restrict__ posq, const real4* __restrict__ oldPositions,
+        unsigned int* __restrict__ interactionCount, int* __restrict__ rebuildNeighborList, bool forceRebuild,
+        real2* __restrict__ blockSizeRange) {
+    int i = threadIdx.x+blockIdx.x*blockDim.x;
+    if (i < NUM_BLOCKS) {
+        unsigned int index = sortedBlocks[i] & BLOCK_INDEX_MASK;
+        sortedBlockCenter[i] = blockCenter[index];
+        sortedBlockBoundingBox[i] = BoundingBox(trimTo3(blockBoundingBox[index]));
+
+#ifdef USE_LARGE_BLOCKS
+        // Compute the sizes of large blocks (composed of warpSize regular blocks) starting from each block.
+
+        real4 minPos = blockCenter[index]-blockBoundingBox[index];
+        real4 maxPos = blockCenter[index]+blockBoundingBox[index];
+        int last = min(i+warpSize, NUM_BLOCKS);
+        for (int j = i+1; j < last; j++) {
+            unsigned int index2 = sortedBlocks[j] & BLOCK_INDEX_MASK;
+            real4 blockPos = blockCenter[index2];
+            real4 width = blockBoundingBox[index2];
+#ifdef USE_PERIODIC
+            real4 center = 0.5f*(maxPos+minPos);
+            APPLY_PERIODIC_TO_POS_WITH_CENTER(blockPos, center)
+#endif
+            minPos = make_real4(min(minPos.x, blockPos.x-width.x), min(minPos.y, blockPos.y-width.y), min(minPos.z, blockPos.z-width.z), 0);
+            maxPos = make_real4(max(maxPos.x, blockPos.x+width.x), max(maxPos.y, blockPos.y+width.y), max(maxPos.z, blockPos.z+width.z), 0);
+        }
+        largeBlockCenter[i] = 0.5f*(maxPos+minPos);
+        largeBlockBoundingBox[i] = BoundingBox(trimTo3(0.5f*(maxPos-minPos)));
+#endif
+    }
+
+    // Initialize min and max for the atomic reduction in findBlockBounds (for the next step)
+    if (i == 0) {
+        blockSizeRange->x = 1e38;
+        blockSizeRange->y = 0;
+    }
+
+    // Also check whether any atom has moved enough so that we really need to rebuild the neighbor list.
+
+    bool rebuild = forceRebuild;
+    if (i < NUM_ATOMS) {
+        real4 delta = oldPositions[i]-posq[i];
+        if (delta.x*delta.x + delta.y*delta.y + delta.z*delta.z > 0.25f*PADDING*PADDING)
+            rebuild = true;
+    }
+
+    if (rebuild) {
+        rebuildNeighborList[0] = 1;
+        interactionCount[0] = 0;
+        interactionCount[1] = 0;
+    }
+}
+
+#if MAX_BITS_FOR_PAIRS > 0
+
+__device__ inline
+void collectInteractions(unsigned int& interacts, float d, int bit) {
+    interacts |= (__float_as_uint(d) >> 31) << bit;
+}
+
+__device__ inline
+void collectInteractions(unsigned int& interacts, double d, int bit) {
+    interacts |= ((unsigned int)__double2hiint(d) >> 31) << bit;
+}
+
+#else
+
+// Simplified version that does not collect individual bits (they are not needed without single pairs),
+// only sets the flag that there are any interactions.
+
+__device__ inline
+void collectInteractions(unsigned int& interacts, float d, int) {
+    interacts |= __float_as_uint(d) & (1 << 31);
+}
+
+__device__ inline
+void collectInteractions(unsigned int& interacts, double d, int) {
+    interacts |= (unsigned int)__double2hiint(d) & (1 << 31);
+}
+
+#endif
+
+#if !defined(USE_DOUBLE_PRECISION) && __has_builtin(__builtin_amdgcn_mfma_f32_4x4x1f32)
+
+#define USE_MFMA
+
+using vfloat = __attribute__((__vector_size__(4 * sizeof(float)))) float;
+
+template<int BlockId>
+inline __device__
+void mfma4x4(const float4& pos1, const float4& pos2, const vfloat& c, unsigned int& interacts) {
+    vfloat d;
+    d = __builtin_amdgcn_mfma_f32_4x4x1f32(pos1.x, -pos2.x, c, 4, BlockId, 0);
+    d = __builtin_amdgcn_mfma_f32_4x4x1f32(pos1.y, -pos2.y, d, 4, BlockId, 0);
+    d = __builtin_amdgcn_mfma_f32_4x4x1f32(pos1.z, -pos2.z, d, 4, BlockId, 0);
+    d = __builtin_amdgcn_mfma_f32_4x4x1f32(pos1.w,    1.0f, d, 4, BlockId, 0);
+    #pragma unroll
+    for (int i = 0; i < 4; i++) {
+        collectInteractions(interacts, d[i], BlockId * 4 + i);
+    }
+}
+
+#endif
+
+/**
+ * Compare the bounding boxes for each pair of atom blocks (comprised of TILE_SIZE atoms each), forming a tile. If the two
+ * atom blocks are sufficiently far apart, mark them as non-interacting. There are two stages in the algorithm.
+ *
+ * STAGE 1:
+ *
+ * A coarse grained atom block against interacting atom block neighbour list is constructed.
+ *
+ * Each warp first loads in some block X of interest. Each thread within the warp then loads
+ * in a different atom block Y. If Y has exclusions with X, then Y is not processed.  If the bounding boxes
+ * of the two atom blocks are within the cutoff distance, then the two atom blocks are considered to be
+ * interacting and Y is added to the buffer for X.
+ *
+ * STAGE 2:
+ *
+ * A fine grained atom block against interacting atoms neighbour list is constructed.
+ *
+ * The warp loops over atom blocks Y that were found to (possibly) interact with atom block X.  Each thread
+ * in the warp loops over the TILE_SIZE atoms in X and compares their positions to one particular atom from block Y.
+ * If it finds one closer than the cutoff distance, the atom is added to the list of atoms interacting with block X.
+ * This continues until the buffer fills up, at which point the results are written to global memory.
+ *
+ * [in] periodicBoxSize        - size of the rectangular periodic box
+ * [in] invPeriodicBoxSize     - inverse of the periodic box
+ * [in] blockCenter            - the center of each bounding box
+ * [in] blockBoundingBox       - bounding box of each atom block
+ * [out] interactionCount      - total number of tiles that have interactions
+ * [out] interactingTiles      - set of blocks that have interactions
+ * [out] interactingAtoms      - a list of atoms that interact with each atom block
+ * [in] posq                   - x,y,z coordinates of each atom and charge q
+ * [in] maxTiles               - maximum number of tiles to process, used for multi-GPUs
+ * [in] startBlockIndex        - first block to process, used for multi-GPUs,
+ * [in] numBlocks              - total number of atom blocks
+ * [in] sortedBlocks           - a sorted list of atom blocks based on volume
+ * [in] sortedBlockCenter      - sorted centers, duplicated for fast access to avoid indexing
+ * [in] sortedBlockBoundingBox - sorted bounding boxes, duplicated for fast access
+ * [in] exclusionIndices       - maps into exclusionRowIndices with the starting position for a given atom
+ * [in] exclusionRowIndices    - stores the a continuous list of exclusions
+ *           eg: block 0 is excluded from atom 3,5,6
+ *               block 1 is excluded from atom 3,4
+ *               block 2 is excluded from atom 1,3,5,6
+ *              exclusionIndices[0][3][5][8]
+ *           exclusionRowIndices[3][5][6][3][4][1][3][5][6]
+ *                         index 0  1  2  3  4  5  6  7  8
+ * [out] oldPos                - stores the positions of the atoms in which this neighbourlist was built on
+ *                             - this is used to decide when to rebuild a neighbourlist
+ * [in] rebuildNeighbourList   - whether or not to execute this kernel
+ *
+ */
+extern "C" __global__ __launch_bounds__(GROUP_SIZE) void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
+        unsigned int* __restrict__ interactionCount, int* __restrict__ interactingTiles, unsigned int* __restrict__ interactingAtoms,
+        int2* __restrict__ singlePairs, const real4* __restrict__ posq, unsigned int maxTiles, unsigned int maxSinglePairs,
+        unsigned int startBlockIndex, unsigned int numBlocks, const unsigned int* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter,
+        const BoundingBox* __restrict__ sortedBlockBoundingBox,
+#ifdef USE_LARGE_BLOCKS
+        const real4* __restrict__ largeBlockCenter, const BoundingBox* __restrict__ largeBlockBoundingBox,
+#endif
+        const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
+        real4* __restrict__ oldPositions, const int* __restrict__ rebuildNeighborList) {
+
+    if (rebuildNeighborList[0] == 0)
+        return; // The neighbor list doesn't need to be rebuilt.
+
+    constexpr int tilesPerWarp = warpSize/TILE_SIZE;
+    constexpr int warpsPerBlock = GROUP_SIZE/warpSize;
+    const int indexInWarp = threadIdx.x%warpSize;
+    const int indexInTile = threadIdx.x%TILE_SIZE;
+    const int tileInWarp = tilesPerWarp == 1 ? 0 : (threadIdx.x/TILE_SIZE)%tilesPerWarp;
+    const int warpInBlock = warpsPerBlock == 1 ? 0 : threadIdx.x/warpSize;
+    const int warpIndex = blockIdx.x*warpsPerBlock + (warpsPerBlock == 1 ? 0 : warpInBlock);
+    const warpflags warpMask = (static_cast<warpflags>(1)<<indexInWarp)-1;
+
+    __shared__ int workgroupBuffer[BUFFER_SIZE*warpsPerBlock];
+    __shared__ real4 workgroupPosBuffer[TILE_SIZE*warpsPerBlock];
+    __shared__ int workgroupExclusions[MAX_EXCLUSIONS*warpsPerBlock];
+    __shared__ int workgroupBlock2Buffer[(warpSize+1)*warpsPerBlock];
+
+    int* buffer = workgroupBuffer+BUFFER_SIZE*warpInBlock;
+    real4* posBuffer = workgroupPosBuffer+TILE_SIZE*warpInBlock;
+    int* exclusionsForX = workgroupExclusions+MAX_EXCLUSIONS*warpInBlock;
+    int* block2Buffer = workgroupBlock2Buffer+(warpSize+1)*warpInBlock;
+
+    // Loop over blocks.
+
+    int block1 = startBlockIndex+warpIndex/NUM_TILES_IN_BATCH;
+    if (block1 < startBlockIndex+numBlocks) {
+        // Load data for this block.  Note that all threads in a warp are processing the same block.
+
+        int x = sortedBlocks[block1] & BLOCK_INDEX_MASK;
+        real4 blockCenterX = sortedBlockCenter[block1];
+        real3 blockSizeX = sortedBlockBoundingBox[block1].toReal3();
+        int neighborsInBuffer = 0;
+        real4 pos1 = posq[x*TILE_SIZE+indexInTile];
+#ifdef USE_PERIODIC
+        const bool singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= PADDED_CUTOFF &&
+                                         0.5f*periodicBoxSize.y-blockSizeX.y >= PADDED_CUTOFF &&
+                                         0.5f*periodicBoxSize.z-blockSizeX.z >= PADDED_CUTOFF);
+        if (singlePeriodicCopy) {
+            // The box is small enough that we can just translate all the atoms into a single periodic
+            // box, then skip having to apply periodic boundary conditions later.
+
+            APPLY_PERIODIC_TO_POS_WITH_CENTER(pos1, blockCenterX)
+        }
+#endif
+        pos1.w = 0.5f * (pos1.x * pos1.x + pos1.y * pos1.y + pos1.z * pos1.z);
+        if (tileInWarp == 0) {
+            posBuffer[indexInTile] = pos1;
+        }
+
+        // Load exclusion data for block x.
+
+        const int exclusionStart = exclusionRowIndices[x];
+        const int exclusionEnd = exclusionRowIndices[x+1];
+        const int numExclusions = exclusionEnd-exclusionStart;
+        #pragma unroll 1
+        for (int j = indexInWarp; j < numExclusions; j += warpSize)
+            exclusionsForX[j] = exclusionIndices[exclusionStart+j];
+
+        // Loop over atom blocks to search for neighbors.  The threads in a warp compare block1 against warpSize
+        // other blocks in parallel.
+        // For small systems multiple warps (NUM_TILES_IN_BATCH = 4, 2...) process one block1 reducing the overall
+        // duration of the kernel because first blocks block1 have to process more block2 blocks so most of compute
+        // units are idle at the end of the kernel (the kernel works on the upper triangle of
+        // the NUM_BLOCKS x NUM_BLOCKS matrix).
+
+#ifdef USE_LARGE_BLOCKS
+        warpflags largeBlockFlags = 0;
+        int loadedLargeBlocks = 0;
+#endif
+        int block2Count = 0;
+        // Load blocks from addresses aligned by warpSize for faster loading from sortedBlockCenter and sortedBlockBoundingBox.
+        for (int block2Base = ((block1+1)/warpSize + warpIndex%NUM_TILES_IN_BATCH)*warpSize; block2Base < NUM_BLOCKS; block2Base += warpSize*NUM_TILES_IN_BATCH) {
+            // Last iteration cannot be skipped (on CDNA where tilesPerWarp == 2)
+            const bool lastIteration = block2Base + warpSize*NUM_TILES_IN_BATCH >= NUM_BLOCKS;
+#ifdef USE_LARGE_BLOCKS
+            if (loadedLargeBlocks == 0) {
+                // Check the next set of large blocks.
+
+                int largeBlockIndex = block2Base + warpSize*NUM_TILES_IN_BATCH*indexInWarp;
+                bool includeLargeBlock = false;
+                if (largeBlockIndex < NUM_BLOCKS) {
+                    real4 largeCenter = largeBlockCenter[largeBlockIndex];
+                    real3 largeSize = largeBlockBoundingBox[largeBlockIndex].toReal3();
+                    real4 blockDelta = blockCenterX-largeCenter;
+#ifdef USE_PERIODIC
+                    APPLY_PERIODIC_TO_DELTA(blockDelta)
+#endif
+                    blockDelta.x = max(0.0f, fabs(blockDelta.x)-blockSizeX.x-largeSize.x);
+                    blockDelta.y = max(0.0f, fabs(blockDelta.y)-blockSizeX.y-largeSize.y);
+                    blockDelta.z = max(0.0f, fabs(blockDelta.z)-blockSizeX.z-largeSize.z);
+                    includeLargeBlock = (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < PADDED_CUTOFF_SQUARED);
+#ifdef TRICLINIC
+                    // The calculation to find the nearest periodic copy is only guaranteed to work if the nearest copy is less than half a box width away.
+                    // If there's any possibility we might have missed it, do a detailed check.
+
+                    if (periodicBoxSize.z/2-blockSizeX.z-largeSize.z < PADDED_CUTOFF || periodicBoxSize.y/2-blockSizeX.y-largeSize.y < PADDED_CUTOFF)
+                        includeLargeBlock = true;
+#endif
+                }
+                largeBlockFlags = __ballot(includeLargeBlock);
+                loadedLargeBlocks = warpSize;
+            }
+            loadedLargeBlocks--;
+            if ((largeBlockFlags&1) == 0 && !lastIteration) {
+                // None of the next warpSize blocks interact with block 1.
+
+                largeBlockFlags >>= 1;
+                continue;
+            }
+            largeBlockFlags >>= 1;
+#endif
+            int block2 = block2Base+indexInWarp;
+            bool includeBlock2 = (block1 < block2 && block2 < NUM_BLOCKS);
+            block2 = includeBlock2 ? block2 : block1;
+            bool forceInclude = false;
+            real4 blockCenterY = sortedBlockCenter[block2];
+            real4 blockDelta = blockCenterX-blockCenterY;
+#ifdef USE_PERIODIC
+            APPLY_PERIODIC_TO_DELTA(blockDelta)
+#endif
+            includeBlock2 &= (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < (PADDED_CUTOFF+blockCenterX.w+blockCenterY.w)*(PADDED_CUTOFF+blockCenterX.w+blockCenterY.w));
+#ifndef TRICLINIC
+            if (!lastIteration && __ballot(includeBlock2) == 0)
+                continue;
+#endif
+            real3 blockSizeY = sortedBlockBoundingBox[block2].toReal3();
+            blockDelta.x = max(0.0f, fabs(blockDelta.x)-blockSizeX.x-blockSizeY.x);
+            blockDelta.y = max(0.0f, fabs(blockDelta.y)-blockSizeX.y-blockSizeY.y);
+            blockDelta.z = max(0.0f, fabs(blockDelta.z)-blockSizeX.z-blockSizeY.z);
+            includeBlock2 &= (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < PADDED_CUTOFF_SQUARED);
+#ifdef TRICLINIC
+            // The calculation to find the nearest periodic copy is only guaranteed to work if the nearest copy is less than half a box width away.
+            // If there's any possibility we might have missed it, do a detailed check.
+
+            if (periodicBoxSize.z/2-blockSizeX.z-blockSizeY.z < PADDED_CUTOFF || periodicBoxSize.y/2-blockSizeX.y-blockSizeY.y < PADDED_CUTOFF)
+                includeBlock2 = forceInclude = true;
+#endif
+
+            // Collect any blocks we identified as potentially containing neighbors.
+
+            warpflags includeBlockFlags = __ballot(includeBlock2);
+            if (includeBlock2) {
+                int index = block2Count + warpPopc(includeBlockFlags&warpMask);
+                block2Buffer[index] = (block2 << 1) | (forceInclude ? 1 : 0);
+            }
+            block2Count += warpPopc(includeBlockFlags);
+
+            // Loop over the collected candidates (each warp processes 2 blocks on CDNA or 1 block on RDNA).
+            // Process even number of blocks on CDNA so both half-warps have work to do (except for
+            // the last iteration of the for-block2Base loop when the left-over must be processed).
+
+            const int block2ToProcess = lastIteration ? block2Count : block2Count/tilesPerWarp*tilesPerWarp;
+            for (int block2Index = 0; block2Index < block2ToProcess; block2Index += tilesPerWarp) {
+                bool includeBlock2 = block2Index + tileInWarp < block2Count;
+                const int b = block2Buffer[min(block2Index + tileInWarp, block2Count - 1)];
+                const bool forceInclude = b & 1;
+                const int block2 = b >> 1;
+                int y = sortedBlocks[block2] & BLOCK_INDEX_MASK;
+
+                #pragma unroll 1
+                for (int k = indexInTile; k < numExclusions; k += TILE_SIZE)
+                    includeBlock2 &= (exclusionsForX[k] != y);
+                includeBlock2 = BALLOT(!includeBlock2) == 0;
+
+                // Check each atom in block Y for interactions.
+
+                int atom2 = y*TILE_SIZE+indexInTile;
+                real4 pos2 = posq[atom2];
+#ifdef USE_PERIODIC
+                if (singlePeriodicCopy) {
+                    APPLY_PERIODIC_TO_POS_WITH_CENTER(pos2, blockCenterX)
+                }
+#endif
+                pos2.w = 0.5f * (pos2.x * pos2.x + pos2.y * pos2.y + pos2.z * pos2.z);
+                real4 blockCenterY = sortedBlockCenter[block2];
+                real3 atomDelta = trimTo3(pos1)-trimTo3(blockCenterY);
+#ifdef USE_PERIODIC
+                APPLY_PERIODIC_TO_DELTA(atomDelta)
+#endif
+                tileflags atomFlags = BALLOT(forceInclude || atomDelta.x*atomDelta.x+atomDelta.y*atomDelta.y+atomDelta.z*atomDelta.z < (PADDED_CUTOFF+blockCenterY.w)*(PADDED_CUTOFF+blockCenterY.w));
+                tileflags interacts = 0;
+                // The condition `posj.w + pos2.w - posj.x*pos2.x - posj.y*pos2.y - posj.z*pos2.z < 0.5f * PADDED_CUTOFF_SQUARED` is expressed as
+                // `posj.x*pos2.x - posj.y*pos2.y - posj.z*pos2.z - posj.w - 0.5f * PADDED_CUTOFF_SQUARED - pos2.w` and computed using fma
+                // (it saves 1 instruction).
+                // Sign bit is used directly instead of `halfDist2 < 0.5f * PADDED_CUTOFF_SQUARED ? 1<<j : 0`.
+#ifdef USE_PERIODIC
+                if (!singlePeriodicCopy) {
+                    while (atomFlags) {
+                        int j = __ffs(atomFlags)-1;
+                        atomFlags = atomFlags ^ (static_cast<tileflags>(1) << j);
+                        real3 delta = trimTo3(pos2)-trimTo3(posBuffer[j]);
+                        APPLY_PERIODIC_TO_DELTA(delta)
+                        real d = delta.x*delta.x+delta.y*delta.y+delta.z*delta.z - PADDED_CUTOFF_SQUARED;
+                        collectInteractions(interacts, d, j);
+                    }
+                }
+                else {
+#endif
+                    const real lim = 0.5f * PADDED_CUTOFF_SQUARED - pos2.w;
+#if defined(USE_MFMA)
+                    const vfloat c = { -lim, -lim, -lim, -lim };
+                    mfma4x4<0>(pos1, pos2, c, interacts);
+                    mfma4x4<1>(pos1, pos2, c, interacts);
+                    mfma4x4<2>(pos1, pos2, c, interacts);
+                    mfma4x4<3>(pos1, pos2, c, interacts);
+                    mfma4x4<4>(pos1, pos2, c, interacts);
+                    mfma4x4<5>(pos1, pos2, c, interacts);
+                    mfma4x4<6>(pos1, pos2, c, interacts);
+                    mfma4x4<7>(pos1, pos2, c, interacts);
+#else
+                    while (atomFlags) {
+                        int j = __ffs(atomFlags)-1;
+                        atomFlags = atomFlags ^ (static_cast<tileflags>(1) << j);
+                        real4 posj = posBuffer[j];
+                        real d = fma(-posj.x, pos2.x, fma(-posj.y, pos2.y, fma(-posj.z, pos2.z, posj.w - lim)));
+                        collectInteractions(interacts, d, j);
+                    }
+#endif
+#ifdef USE_PERIODIC
+                }
+#endif
+                if (atom2 >= NUM_ATOMS || !includeBlock2) {
+                    interacts = 0;
+                }
+
+#if MAX_BITS_FOR_PAIRS > 0
+                const unsigned int interactCount = __popc(interacts);
+
+                // Record interactions that should be computed as single pairs rather than in blocks.
+                const bool storeAsSinglePair = interactCount > 0 && interactCount <= MAX_BITS_FOR_PAIRS;
+                if (__ballot(storeAsSinglePair)) {
+                    unsigned int sum = 0;
+                    unsigned int prevSum = 0;
+                    for (int i = 1; i <= MAX_BITS_FOR_PAIRS; i++) {
+                        warpflags b = __ballot(interactCount == i);
+                        sum += warpPopc(b) * i;
+                        prevSum += warpPopc(b&warpMask) * i;
+                    }
+                    unsigned int pairStartIndex = 0;
+                    if (indexInWarp == 0)
+                        pairStartIndex = atomicAdd(&interactionCount[1], sum);
+                    pairStartIndex = __shfl(pairStartIndex, 0);
+                    unsigned int pairIndex = pairStartIndex + prevSum;
+                    if (storeAsSinglePair && pairIndex+interactCount <= maxSinglePairs) {
+                        while (interacts != 0) {
+                            int j = __ffs(interacts)-1;
+                            singlePairs[pairIndex] = make_int2(atom2, x*TILE_SIZE+j);
+                            interacts = interacts ^ (static_cast<tileflags>(1) << j);
+                            pairIndex++;
+                        }
+                    }
+                }
+#else
+                const unsigned int interactCount = interacts;
+#endif
+
+                // Add any interacting atoms to the buffer.
+
+                warpflags includeAtomFlags = __ballot(interactCount > MAX_BITS_FOR_PAIRS);
+                if (interactCount > MAX_BITS_FOR_PAIRS) {
+                    int index = neighborsInBuffer+warpPopc(includeAtomFlags&warpMask);
+                    buffer[index] = atom2;
+                }
+                neighborsInBuffer += warpPopc(includeAtomFlags);
+                if (neighborsInBuffer > BUFFER_SIZE-warpSize) {
+                    // Store the new tiles to memory.
+
+                    unsigned int tilesToStore = neighborsInBuffer/warpSize*tilesPerWarp;
+                    unsigned int tileStartIndex = 0;
+                    if (indexInWarp == 0)
+                        tileStartIndex = atomicAdd(&interactionCount[0], tilesToStore);
+                    unsigned int newTileStartIndex = __shfl(tileStartIndex, 0);
+                    if (newTileStartIndex+tilesToStore <= maxTiles) {
+                        if (indexInWarp < tilesToStore)
+                            interactingTiles[newTileStartIndex+indexInWarp] = x;
+                        for (int j = 0; j < tilesToStore/tilesPerWarp; j++)
+                            interactingAtoms[newTileStartIndex*TILE_SIZE+j*warpSize+indexInWarp] = buffer[j*warpSize+indexInWarp];
+                    }
+                    if (indexInWarp+TILE_SIZE*tilesToStore < BUFFER_SIZE)
+                        buffer[indexInWarp] = buffer[indexInWarp+TILE_SIZE*tilesToStore];
+                    neighborsInBuffer -= TILE_SIZE*tilesToStore;
+                }
+            }
+
+            // Move not processed blocks to the head of block2Buffer.
+            if (indexInWarp < block2Count - block2ToProcess)
+                block2Buffer[indexInWarp] = block2Buffer[block2ToProcess + indexInWarp];
+            block2Count = block2Count - block2ToProcess;
+        }
+
+        // If we have a partially filled buffer,  store it to memory.
+
+        if (neighborsInBuffer > 0) {
+            unsigned int tilesToStore = (neighborsInBuffer+TILE_SIZE-1)/TILE_SIZE;
+            unsigned int tileStartIndex = 0;
+            if (indexInWarp == 0)
+                tileStartIndex = atomicAdd(&interactionCount[0], tilesToStore);
+            unsigned int newTileStartIndex = __shfl(tileStartIndex, 0);
+            if (newTileStartIndex+tilesToStore <= maxTiles) {
+                if (indexInWarp < tilesToStore)
+                    interactingTiles[newTileStartIndex+indexInWarp] = x;
+                for (int j = 0; j <= tilesToStore/tilesPerWarp; j++) {
+                    if (j*warpSize+indexInWarp < tilesToStore*TILE_SIZE)
+                        interactingAtoms[newTileStartIndex*TILE_SIZE+j*warpSize+indexInWarp] = (j*warpSize+indexInWarp < neighborsInBuffer ? buffer[j*warpSize+indexInWarp] : PADDED_NUM_ATOMS);
+                }
+            }
+        }
+    }
+
+    // Record the positions the neighbor list is based on.
+
+    int i = threadIdx.x+blockIdx.x*GROUP_SIZE;
+    if (i < NUM_ATOMS) {
+        oldPositions[i] = posq[i];
+    }
+}
+
+extern "C" __global__ void copyInteractionCounts(const unsigned int* __restrict__ interactionCount,
+        unsigned int* __restrict__ pinnedInteractionCount) {
+    pinnedInteractionCount[0] = interactionCount[0];
+    pinnedInteractionCount[1] = interactionCount[1];
+}
--- a/platforms/hip/src/kernels/intrinsics.hip
+++ b/platforms/hip/src/kernels/intrinsics.hip
+/**
+ * This file contains the device function for using cross-lane operations (ballot and shuffle)
+ */
+
+#if defined(TILE_SIZE)
+#if !defined(AMD_RDNA)
+// Two subwarps per warp
+#define SHFL(var, srcLane) __shfl(var, (srcLane) & (TILE_SIZE - 1), TILE_SIZE)
+#define BALLOT(var) (unsigned int)(__ballot(var) >> (threadIdx.x & ((64 - 1) ^ (TILE_SIZE - 1))))
+#else
+#define SHFL(var, srcLane) __shfl(var, srcLane)
+#define BALLOT(var) __ballot(var)
+#endif
+#endif
+
+template<class T>
+static __inline__ __device__
+T warpShuffle(const T& input, const int src_lane) {
+    static_assert(sizeof(T) % sizeof(int) == 0, "incorrect type size");
+    constexpr int words_no = sizeof(T) / sizeof(int);
+
+    T output;
+    #pragma unroll
+    for(int i = 0; i < words_no; i++) {
+        int word;
+        __builtin_memcpy(&word, reinterpret_cast<const char*>(&input) + i * sizeof(int), sizeof(int));
+        word = __builtin_amdgcn_ds_bpermute(src_lane << 2, word);
+        __builtin_memcpy(reinterpret_cast<char*>(&output) + i * sizeof(int), &word, sizeof(int));
+    }
+
+    return output;
+}
+
+template<int Subwarp, class T>
+static __inline__ __device__
+typename std::enable_if<(Subwarp == warpSize), T>::type
+warpRotateLeft(const T& input) {
+    return warpShuffle(input, threadIdx.x + 1);
+}
+
+template<int Subwarp, class T>
+static __inline__ __device__
+typename std::enable_if<!(Subwarp == warpSize), T>::type
+warpRotateLeft(const T& input) {
+    return warpShuffle(input, ((threadIdx.x + 1) & (Subwarp - 1)) | (threadIdx.x & ~(Subwarp - 1)));
+}
--- a/platforms/hip/src/kernels/nonbonded.hip
+++ b/platforms/hip/src/kernels/nonbonded.hip
+/**
+ * Compute nonbonded interactions. The kernel is separated into two parts,
+ * tiles with exclusions and tiles without exclusions. It relies heavily on
+ * implicit warp-level synchronization. A tile is defined by two atom blocks
+ * each of warpsize. Each warp computes a range of tiles.
+ *
+ * Tiles with exclusions compute the entire set of interactions across
+ * atom blocks, equal to warpsize*warpsize. In order to avoid access conflicts
+ * the forces are computed and accumulated diagonally in the manner shown below
+ * where, suppose
+ *
+ * [a-h] comprise atom block 1, [i-p] comprise atom block 2
+ *
+ * 1 denotes the first set of calculations within the warp
+ * 2 denotes the second set of calculations within the warp
+ * ... etc.
+ *
+ *        threads
+ *     0 1 2 3 4 5 6 7
+ *         atom1
+ * L    a b c d e f g h
+ * o  i 1 2 3 4 5 6 7 8
+ * c  j 8 1 2 3 4 5 6 7
+ * a  k 7 8 1 2 3 4 5 6
+ * l  l 6 7 8 1 2 3 4 5
+ * D  m 5 6 7 8 1 2 3 4
+ * a  n 4 5 6 7 8 1 2 3
+ * t  o 3 4 5 6 7 8 1 2
+ * a  p 2 3 4 5 6 7 8 1
+ *
+ * Tiles without exclusions read off directly from the neighbourlist interactingAtoms
+ * and follows the same force accumulation method. If more there are more interactingTiles
+ * than the size of the neighbourlist initially allocated, the neighbourlist is rebuilt
+ * and the full tileset is computed. This should happen on the first step, and very rarely
+ * afterwards.
+ *
+ * On diagonal exclusion tiles use __shfl to broadcast. For all other types of tiles __shfl
+ * is used to pass around the forces, positions, and parameters when computing the forces.
+ *
+ * [out]forceBuffers    - forces on each atom to eventually be accumulated
+ * [out]energyBuffer    - energyBuffer to eventually be accumulated
+ * [in]posq             - x,y,z,charge
+ * [in]exclusions       - 1024-bit flags denoting atom-atom exclusions for each tile
+ * [in]exclusionTiles   - x,y denotes the indices of tiles that have an exclusion
+ * [in]startTileIndex   - index into first tile to be processed
+ * [in]numTileIndices   - number of tiles this context is responsible for processing
+ * [in]int tiles        - the atom block for each tile
+ * [in]interactionCount - total number of tiles that have an interaction
+ * [in]maxTiles         - stores the size of the neighbourlist in case it needs
+ *                      - to be expanded
+ * [in]periodicBoxSize  - size of the Periodic Box, last dimension (w) not used
+ * [in]invPeriodicBox   - inverse of the periodicBoxSize, pre-computed for speed
+ * [in]blockCenter      - the center of each block in euclidean coordinates
+ * [in]blockSize        - size of the each block, radiating from the center
+ *                      - x is half the distance of total length
+ *                      - y is half the distance of total width
+ *                      - z is half the distance of total height
+ *                      - w is not used
+ * [in]interactingAtoms - a list of interactions within a given tile
+ *
+ */
+extern "C" __launch_bounds__(THREAD_BLOCK_SIZE) __global__ void computeNonbonded(
+        unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const tileflags* __restrict__ exclusions,
+        const int2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned long long numTileIndices
+#ifdef USE_CUTOFF
+        , const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
+        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
+        const real4* __restrict__ blockSize, const unsigned int* __restrict__ interactingAtoms, unsigned int maxSinglePairs,
+        const int2* __restrict__ singlePairs
+#endif
+        PARAMETER_ARGUMENTS) {
+    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
+    const unsigned int warp = blockIdx.x*(THREAD_BLOCK_SIZE/TILE_SIZE) + threadIdx.x/TILE_SIZE; // global warpIndex
+    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // index within the warp
+    const unsigned int tbx = threadIdx.x - tgx;           // block warpIndex
+    mixed energy = 0;
+    INIT_DERIVATIVES
+
+    // First loop: process tiles that contain exclusions.
+
+    for (int pos = FIRST_EXCLUSION_TILE+warp; pos < LAST_EXCLUSION_TILE; pos+=totalWarps) {
+        const int2 tileIndices = exclusionTiles[pos];
+        const unsigned int x = tileIndices.x;
+        const unsigned int y = tileIndices.y;
+        real3 force = make_real3(0);
+        unsigned int atom1 = x*TILE_SIZE + tgx;
+        real4 posq1 = posq[atom1];
+        LOAD_ATOM1_PARAMETERS
+#ifdef USE_EXCLUSIONS
+        tileflags excl = exclusions[pos*TILE_SIZE+tgx];
+#endif
+        if (x == y) {
+            // This tile is on the diagonal.
+            real4 shflPosq = posq1;
+
+            // we do not need to fetch parameters from global since this is a symmetric tile
+            // instead we can broadcast the values using shuffle
+            #pragma unroll 2
+            for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                real4 posq2;
+                BROADCAST_WARP_DATA
+                real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+#ifdef USE_PERIODIC
+                APPLY_PERIODIC_TO_DELTA(delta)
+#endif
+                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+                if (r2 < MAX_CUTOFF_SQUARED) {
+#endif
+                    real invR = RSQRT(r2);
+                    real r = r2*invR;
+                    LOAD_ATOM2_PARAMETERS
+                    int atom2 = y*TILE_SIZE+j;
+#ifdef USE_SYMMETRIC
+                    real dEdR = 0.0f;
+#else
+                    real3 dEdR1 = make_real3(0);
+                    real3 dEdR2 = make_real3(0);
+#endif
+#ifdef USE_EXCLUSIONS
+                    bool isExcluded = !(atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && (excl & 1));
+#endif
+                    real tempEnergy = 0.0f;
+                    const real interactionScale = 0.5f;
+                    COMPUTE_INTERACTION
+#ifdef INCLUDE_ENERGY
+                    energy += 0.5f*tempEnergy;
+#endif
+#ifdef INCLUDE_FORCES
+#ifdef USE_SYMMETRIC
+                    force.x -= delta.x*dEdR;
+                    force.y -= delta.y*dEdR;
+                    force.z -= delta.z*dEdR;
+#else
+                    force.x -= dEdR1.x;
+                    force.y -= dEdR1.y;
+                    force.z -= dEdR1.z;
+#endif
+#endif
+#ifdef USE_CUTOFF
+                }
+#endif
+#ifdef USE_EXCLUSIONS
+                excl >>= 1;
+#endif
+            }
+        }
+        else {
+            // This is an off-diagonal tile.
+            unsigned int j = y*TILE_SIZE + tgx;
+            int atomIndex = j;
+            real4 shflPosq = posq[j];
+            real3 shflForce;
+            shflForce.x = 0.0f;
+            shflForce.y = 0.0f;
+            shflForce.z = 0.0f;
+            DECLARE_LOCAL_PARAMETERS
+            LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
+#ifdef USE_EXCLUSIONS
+            excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
+#endif
+            #pragma unroll 2
+            for (j = 0; j < TILE_SIZE; j++) {
+                real4 posq2 = shflPosq;
+                real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+#ifdef USE_PERIODIC
+                APPLY_PERIODIC_TO_DELTA(delta)
+#endif
+                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+                if (r2 < MAX_CUTOFF_SQUARED) {
+#endif
+                    real invR = RSQRT(r2);
+                    real r = r2*invR;
+                    LOAD_ATOM2_PARAMETERS
+                    int atom2 = atomIndex;
+#ifdef USE_SYMMETRIC
+                    real dEdR = 0.0f;
+#else
+                    real3 dEdR1 = make_real3(0);
+                    real3 dEdR2 = make_real3(0);
+#endif
+#ifdef USE_EXCLUSIONS
+                    bool isExcluded = !(atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && (excl & 1));
+#endif
+                    real tempEnergy = 0.0f;
+                    const real interactionScale = 1.0f;
+                    COMPUTE_INTERACTION
+#ifdef INCLUDE_ENERGY
+                    energy += tempEnergy;
+#endif
+#ifdef INCLUDE_FORCES
+#ifdef USE_SYMMETRIC
+                    delta *= dEdR;
+                    force.x -= delta.x;
+                    force.y -= delta.y;
+                    force.z -= delta.z;
+                    shflForce.x += delta.x;
+                    shflForce.y += delta.y;
+                    shflForce.z += delta.z;
+#else // !USE_SYMMETRIC
+                    force.x -= dEdR1.x;
+                    force.y -= dEdR1.y;
+                    force.z -= dEdR1.z;
+                    shflForce.x += dEdR2.x;
+                    shflForce.y += dEdR2.y;
+                    shflForce.z += dEdR2.z;
+#endif // end USE_SYMMETRIC
+#endif
+#ifdef USE_CUTOFF
+                }
+#endif
+#ifdef USE_EXCLUSIONS
+                excl >>= 1;
+#endif
+                SHUFFLE_WARP_DATA
+                atomIndex = warpRotateLeft<TILE_SIZE>(atomIndex);
+            }
+            const unsigned int offset = atomIndex;
+            // write results for off diagonal tiles
+#ifdef INCLUDE_FORCES
+            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>(realToFixedPoint(shflForce.x)));
+            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(shflForce.y)));
+            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(shflForce.z)));
+#endif
+        }
+        // Write results for on and off diagonal tiles
+#ifdef INCLUDE_FORCES
+        const unsigned int offset = x*TILE_SIZE + tgx;
+        atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>(realToFixedPoint(force.x)));
+        atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(force.y)));
+        atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(force.z)));
+#endif
+    }
+
+    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
+    // of them (no cutoff).
+
+#ifdef USE_NEIGHBOR_LIST
+    const unsigned int numTiles = interactionCount[0];
+    if (numTiles > maxTiles)
+        return; // There wasn't enough memory for the neighbor list.
+    for (unsigned int pos0 = warp; pos0 < LAST_EXCLUSION_TILE+numTiles; pos0+=totalWarps) {
+        // Skip warps that may be still busy in the first loop
+        if (pos0 < LAST_EXCLUSION_TILE) {
+            continue;
+        }
+        const unsigned int pos = pos0-LAST_EXCLUSION_TILE;
+#else
+    int pos = (int) (startTileIndex+warp*numTileIndices/totalWarps);
+    int end = (int) (startTileIndex+(warp+1)*numTileIndices/totalWarps);
+    int skipBase = 0;
+    int currentSkipIndex = tbx;
+    int skipTiles;
+    skipTiles = -1;
+    for (; pos < end; pos++) {
+#endif
+        real3 force = make_real3(0);
+        bool includeTile = true;
+
+        // Extract the coordinates of this tile.
+        int x, y;
+        bool singlePeriodicCopy = false;
+#ifdef USE_NEIGHBOR_LIST
+        x = tiles[pos];
+        real4 blockSizeX = blockSize[x];
+        singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= MAX_CUTOFF &&
+                              0.5f*periodicBoxSize.y-blockSizeX.y >= MAX_CUTOFF &&
+                              0.5f*periodicBoxSize.z-blockSizeX.z >= MAX_CUTOFF);
+#else
+        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+            y += (x < y ? -1 : 1);
+            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+        }
+
+        // Skip over tiles that have exclusions, since they were already processed.
+
+        while (SHFL(skipTiles, TILE_SIZE-1) < pos) {
+            if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
+                int2 tile = exclusionTiles[skipBase+tgx];
+                skipTiles = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
+            }
+            else
+                skipTiles = end;
+            skipBase += TILE_SIZE;
+            currentSkipIndex = 0;
+        }
+        while (SHFL(skipTiles, currentSkipIndex) < pos)
+            currentSkipIndex++;
+        includeTile = (SHFL(skipTiles, currentSkipIndex) != pos);
+#endif
+        if (includeTile) {
+            unsigned int atom1 = x*TILE_SIZE + tgx;
+            // Load atom data for this tile.
+            real4 posq1 = posq[atom1];
+            LOAD_ATOM1_PARAMETERS
+#ifdef USE_NEIGHBOR_LIST
+            unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
+#else
+            unsigned int j = y*TILE_SIZE + tgx;
+#endif
+            int atomIndex = j;
+            DECLARE_LOCAL_PARAMETERS
+            real4 shflPosq;
+            real3 shflForce;
+            shflForce.x = 0.0f;
+            shflForce.y = 0.0f;
+            shflForce.z = 0.0f;
+            if (j < PADDED_NUM_ATOMS) {
+                // Load position of atom j from from global memory
+                shflPosq = posq[j];
+                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
+            }
+            else {
+                shflPosq = make_real4(0, 0, 0, 0);
+                CLEAR_LOCAL_PARAMETERS
+            }
+#ifdef USE_PERIODIC
+            if (singlePeriodicCopy) {
+                // The box is small enough that we can just translate all the atoms into a single periodic
+                // box, then skip having to apply periodic boundary conditions later.
+                real4 blockCenterX = blockCenter[x];
+                APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
+                APPLY_PERIODIC_TO_POS_WITH_CENTER(shflPosq, blockCenterX)
+                #pragma unroll 2
+                for (j = 0; j < TILE_SIZE; j++) {
+                    real4 posq2 = shflPosq;
+                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+                    if (r2 < MAX_CUTOFF_SQUARED) {
+#endif
+                        real invR = RSQRT(r2);
+                        real r = r2*invR;
+                        LOAD_ATOM2_PARAMETERS
+                        int atom2 = atomIndex;
+#ifdef USE_SYMMETRIC
+                        real dEdR = 0.0f;
+#else
+                        real3 dEdR1 = make_real3(0);
+                        real3 dEdR2 = make_real3(0);
+#endif
+#ifdef USE_EXCLUSIONS
+                        bool isExcluded = !(atom1 < NUM_ATOMS && atom2 < NUM_ATOMS);
+#endif
+                        real tempEnergy = 0.0f;
+                        const real interactionScale = 1.0f;
+                        COMPUTE_INTERACTION
+#ifdef INCLUDE_ENERGY
+                        energy += tempEnergy;
+#endif
+#ifdef INCLUDE_FORCES
+#ifdef USE_SYMMETRIC
+                        delta *= dEdR;
+                        force.x -= delta.x;
+                        force.y -= delta.y;
+                        force.z -= delta.z;
+                        shflForce.x += delta.x;
+                        shflForce.y += delta.y;
+                        shflForce.z += delta.z;
+#else // !USE_SYMMETRIC
+                        force.x -= dEdR1.x;
+                        force.y -= dEdR1.y;
+                        force.z -= dEdR1.z;
+                        shflForce.x += dEdR2.x;
+                        shflForce.y += dEdR2.y;
+                        shflForce.z += dEdR2.z;
+#endif // end USE_SYMMETRIC
+#endif
+#ifdef USE_CUTOFF
+                    }
+#endif
+                    SHUFFLE_WARP_DATA
+                    atomIndex = warpRotateLeft<TILE_SIZE>(atomIndex);
+                }
+            }
+            else
+#endif
+            {
+                // We need to apply periodic boundary conditions separately for each interaction.
+                #pragma unroll 2
+                for (j = 0; j < TILE_SIZE; j++) {
+                    real4 posq2 = shflPosq;
+                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+#ifdef USE_PERIODIC
+                    APPLY_PERIODIC_TO_DELTA(delta)
+#endif
+                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+                    if (r2 < MAX_CUTOFF_SQUARED) {
+#endif
+                        real invR = RSQRT(r2);
+                        real r = r2*invR;
+                        LOAD_ATOM2_PARAMETERS
+                        int atom2 = atomIndex;
+#ifdef USE_SYMMETRIC
+                        real dEdR = 0.0f;
+#else
+                        real3 dEdR1 = make_real3(0);
+                        real3 dEdR2 = make_real3(0);
+#endif
+#ifdef USE_EXCLUSIONS
+                        bool isExcluded = !(atom1 < NUM_ATOMS && atom2 < NUM_ATOMS);
+#endif
+                        real tempEnergy = 0.0f;
+                        const real interactionScale = 1.0f;
+                        COMPUTE_INTERACTION
+#ifdef INCLUDE_ENERGY
+                        energy += tempEnergy;
+#endif
+#ifdef INCLUDE_FORCES
+#ifdef USE_SYMMETRIC
+                        delta *= dEdR;
+                        force.x -= delta.x;
+                        force.y -= delta.y;
+                        force.z -= delta.z;
+                        shflForce.x += delta.x;
+                        shflForce.y += delta.y;
+                        shflForce.z += delta.z;
+#else // !USE_SYMMETRIC
+                        force.x -= dEdR1.x;
+                        force.y -= dEdR1.y;
+                        force.z -= dEdR1.z;
+                        shflForce.x += dEdR2.x;
+                        shflForce.y += dEdR2.y;
+                        shflForce.z += dEdR2.z;
+#endif // end USE_SYMMETRIC
+#endif
+#ifdef USE_CUTOFF
+                    }
+#endif
+                    SHUFFLE_WARP_DATA
+                    atomIndex = warpRotateLeft<TILE_SIZE>(atomIndex);
+                }
+            }
+
+            // Write results.
+#ifdef INCLUDE_FORCES
+            atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>(realToFixedPoint(force.x)));
+            atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(force.y)));
+            atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(force.z)));
+            unsigned int atom2 = atomIndex;
+            if (atom2 < PADDED_NUM_ATOMS) {
+                atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>(realToFixedPoint(shflForce.x)));
+                atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(shflForce.y)));
+                atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(shflForce.z)));
+            }
+#endif
+        }
+    }
+
+    // Third loop: single pairs that aren't part of a tile.
+
+#if USE_NEIGHBOR_LIST
+    const unsigned int numPairs = interactionCount[1];
+    if (numPairs > maxSinglePairs)
+        return; // There wasn't enough memory for the neighbor list.
+    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < numPairs; i += blockDim.x*gridDim.x) {
+        int2 pair = singlePairs[i];
+        int atom1 = pair.x;
+        int atom2 = pair.y;
+        real4 posq1 = posq[atom1];
+        real4 posq2 = posq[atom2];
+        real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+#ifdef USE_PERIODIC
+        APPLY_PERIODIC_TO_DELTA(delta)
+#endif
+        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+        if (r2 < MAX_CUTOFF_SQUARED) {
+            LOAD_ATOM1_PARAMETERS
+            int j = atom2;
+            DECLARE_LOCAL_PARAMETERS
+            LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
+            LOAD_ATOM2_PARAMETERS
+            real invR = RSQRT(r2);
+            real r = r2*invR;
+#ifdef USE_SYMMETRIC
+            real dEdR = 0.0f;
+#else
+            real3 dEdR1 = make_real3(0);
+            real3 dEdR2 = make_real3(0);
+#endif
+            bool isExcluded = false;
+            real tempEnergy = 0.0f;
+            const real interactionScale = 1.0f;
+            COMPUTE_INTERACTION
+#ifdef INCLUDE_ENERGY
+            energy += tempEnergy;
+#endif
+#ifdef INCLUDE_FORCES
+#ifdef USE_SYMMETRIC
+            real3 dEdR1 = delta*dEdR;
+            real3 dEdR2 = -dEdR1;
+#endif
+            atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>(realToFixedPoint(-dEdR1.x)));
+            atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(-dEdR1.y)));
+            atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(-dEdR1.z)));
+            atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>(realToFixedPoint(-dEdR2.x)));
+            atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(-dEdR2.y)));
+            atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(-dEdR2.z)));
+#endif
+        }
+    }
+#endif
+#ifdef INCLUDE_ENERGY
+    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
+#endif
+    SAVE_DERIVATIVES
+}
--- a/platforms/hip/src/kernels/parallel.hip
+++ b/platforms/hip/src/kernels/parallel.hip
+/**
+ * Sum the forces computed by different contexts.
+ */
+
+extern "C" __global__ void sumForces(long long* __restrict__ force, long long* __restrict__ buffer, int bufferSize, int numBuffers) {
+    int totalSize = bufferSize*numBuffers;
+    for (int index = blockDim.x*blockIdx.x+threadIdx.x; index < bufferSize; index += blockDim.x*gridDim.x) {
+        long long sum = force[index];
+        for (int i = index; i < totalSize; i += bufferSize)
+            sum += buffer[i];
+        force[index] = sum;
+    }
+}
--- a/platforms/hip/src/kernels/sort.hip
+++ b/platforms/hip/src/kernels/sort.hip
+__device__ KEY_TYPE getValue(DATA_TYPE value) {
+    return SORT_KEY;
+}
+
+extern "C" {
+
+/**
+ * Sort a list that is short enough to entirely fit in local memory.  This is executed as
+ * a single thread block.
+ */
+__global__ void sortShortList(DATA_TYPE* __restrict__ data, unsigned int length) {
+    // Load the data into local memory.
+
+    extern __shared__ DATA_TYPE dataBuffer[];
+    for (int index = threadIdx.x; index < length; index += blockDim.x)
+        dataBuffer[index] = data[index];
+    __syncthreads();
+
+    // Perform a bitonic sort in local memory.
+
+    unsigned int lenghtNextPow2 = length <= 2 ? length : (1 << (32 - __clz(length - 1)));
+
+    for (unsigned int k = 2; k <= lenghtNextPow2; k *= 2) {
+        for (unsigned int j = k/2; j > 0; j /= 2) {
+            for (unsigned int i = threadIdx.x; i < length; i += blockDim.x) {
+                int ixj = i^j;
+                if (ixj > i && ixj < length) {
+                    DATA_TYPE value1 = dataBuffer[i];
+                    DATA_TYPE value2 = dataBuffer[ixj];
+                    bool ascending = (__popc(~i & (lenghtNextPow2 - k)) & 1) == 0;
+                    KEY_TYPE lowKey  = (ascending ? getValue(value1) : getValue(value2));
+                    KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
+                    if (lowKey > highKey) {
+                        dataBuffer[i] = value2;
+                        dataBuffer[ixj] = value1;
+                    }
+                }
+            }
+            __syncthreads();
+        }
+    }
+
+    // Write the data back to global memory.
+
+    for (int index = threadIdx.x; index < length; index += blockDim.x)
+        data[index] = dataBuffer[index];
+}
+
+/**
+ * An alternate kernel for sorting short lists.  In this version every thread does a full
+ * scan through the data to select the destination for one element.  This involves more
+ * work, but also parallelizes much better.
+ */
+__global__ void sortShortList2(const DATA_TYPE* __restrict__ dataIn, DATA_TYPE* __restrict__ dataOut, unsigned int length) {
+    extern __shared__ KEY_TYPE keyBuffer[];
+    int globalId = blockDim.x*blockIdx.x+threadIdx.x;
+    DATA_TYPE value = dataIn[globalId < length ? globalId : 0];
+    KEY_TYPE key = getValue(value);
+    int count = 0;
+    for (int blockStart = 0; blockStart < length; blockStart += blockDim.x) {
+        int numInBlock = min(static_cast<int>(blockDim.x), static_cast<int>(length-blockStart));
+        __syncthreads();
+        if (threadIdx.x < numInBlock)
+            keyBuffer[threadIdx.x] = getValue(dataIn[blockStart+threadIdx.x]);
+        __syncthreads();
+        for (int i = 0; i < numInBlock; i++) {
+            KEY_TYPE otherKey = keyBuffer[i];
+            count += (otherKey < key) | ((otherKey == key) & (blockStart+i < globalId));
+        }
+    }
+    if (globalId < length)
+        dataOut[count] = value;
+}
+
+inline __device__ void reduceMinMax(KEY_TYPE minimum, KEY_TYPE maximum, KEY_TYPE* minBuffer, KEY_TYPE* maxBuffer,
+        KEY_TYPE* minResult, KEY_TYPE* maxResult) {
+    minBuffer[threadIdx.x] = minimum;
+    maxBuffer[threadIdx.x] = maximum;
+    __syncthreads();
+    for (unsigned int step = 1; step < blockDim.x; step *= 2) {
+        if ((threadIdx.x+step < blockDim.x) & ((threadIdx.x&(2*step-1)) == 0)) {
+            minBuffer[threadIdx.x] = min(minBuffer[threadIdx.x], minBuffer[threadIdx.x+step]);
+            maxBuffer[threadIdx.x] = max(maxBuffer[threadIdx.x], maxBuffer[threadIdx.x+step]);
+        }
+        __syncthreads();
+    }
+    if (threadIdx.x == 0) {
+        *minResult = minBuffer[0];
+        *maxResult = maxBuffer[0];
+    }
+}
+
+/**
+ * Calculate the minimum and maximum value in the array to be sorted.
+ */
+__global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int length, KEY_TYPE* __restrict__ range,
+        unsigned int numBuckets, unsigned int* __restrict__ bucketOffset, unsigned int* __restrict__ counters) {
+#if UNIFORM
+    extern __shared__ KEY_TYPE minBuffer[];
+    KEY_TYPE* maxBuffer = minBuffer+blockDim.x;
+    KEY_TYPE minimum = MAX_KEY;
+    KEY_TYPE maximum = MIN_KEY;
+
+    __shared__ bool isLastFinishedBlock;
+    if (threadIdx.x == 0) {
+        isLastFinishedBlock = false;
+    }
+
+    // Each thread calculates the range of a subset of values.
+
+    for (unsigned int index = blockDim.x*blockIdx.x+threadIdx.x; index < length; index += blockDim.x*gridDim.x) {
+        KEY_TYPE value = getValue(data[index]);
+        minimum = min(minimum, value);
+        maximum = max(maximum, value);
+    }
+
+    // Now reduce them and save partial results
+
+    reduceMinMax(minimum, maximum, minBuffer, maxBuffer, &range[blockIdx.x * 2 + 0], &range[blockIdx.x * 2 + 1]);
+    __threadfence();
+    if (threadIdx.x == 0) {
+        isLastFinishedBlock = atomicAdd(&counters[0], 1) + 1 == gridDim.x;
+    }
+    __syncthreads();
+
+    // The last block reduce partial results
+
+    if (isLastFinishedBlock) {
+        for (unsigned int index = threadIdx.x; index < gridDim.x; index += blockDim.x) {
+            minimum = min(minimum, range[index * 2 + 0]);
+            maximum = max(maximum, range[index * 2 + 1]);
+        }
+        reduceMinMax(minimum, maximum, minBuffer, maxBuffer, &range[0], &range[1]);
+    }
+#endif
+
+    // Clear the bucket counters in preparation for the next kernel.
+
+    for (unsigned int index = blockDim.x*blockIdx.x+threadIdx.x; index < numBuckets; index += blockDim.x*gridDim.x)
+        bucketOffset[index] = 0;
+}
+
+/**
+ * Assign elements to buckets.  This version is optimized for uniformly distributed data.
+ */
+__global__ void assignElementsToBuckets(const DATA_TYPE* __restrict__ data, unsigned int length, unsigned int numBuckets, const KEY_TYPE* __restrict__ range,
+        unsigned int* __restrict__ bucketOffset, unsigned int* __restrict__ bucketOfElement, unsigned int* __restrict__ offsetInBucket) {
+    float minValue = (float) (range[0]);
+    float maxValue = (float) (range[1]);
+    float bucketWidth = (maxValue-minValue)/numBuckets;
+    for (unsigned int index = blockDim.x*blockIdx.x+threadIdx.x; index < length; index += blockDim.x*gridDim.x) {
+        float key = (float) getValue(data[index]);
+        unsigned int bucketIndex = min((unsigned int) ((key-minValue)/bucketWidth), numBuckets-1);
+        offsetInBucket[index] = atomicAdd(&bucketOffset[bucketIndex], 1);
+        bucketOfElement[index] = bucketIndex;
+    }
+}
+
+
+/**
+ * Assign elements to buckets.  This version is optimized for non-uniformly distributed data.
+ */
+__global__ void assignElementsToBuckets2(const DATA_TYPE* __restrict__ data, unsigned int length, unsigned int numBuckets, const KEY_TYPE* __restrict__ range,
+        unsigned int* __restrict__ bucketOffset, unsigned int* __restrict__ bucketOfElement, unsigned int* __restrict__ offsetInBucket) {
+    // Load 64 datapoints and sort them to get an estimate of the data distribution.
+
+    __shared__ KEY_TYPE elements[64];
+    if (threadIdx.x < 64) {
+        int index = (int) (threadIdx.x*length/64.0);
+        elements[threadIdx.x] = getValue(data[index]);
+    }
+    __syncthreads();
+    for (unsigned int k = 2; k <= 64; k *= 2) {
+        for (unsigned int j = k/2; j > 0; j /= 2) {
+            if (threadIdx.x < 64) {
+                int ixj = threadIdx.x^j;
+                if (ixj > threadIdx.x) {
+                    KEY_TYPE value1 = elements[threadIdx.x];
+                    KEY_TYPE value2 = elements[ixj];
+                    bool ascending = (threadIdx.x&k) == 0;
+                    KEY_TYPE lowKey = (ascending ? value1 : value2);
+                    KEY_TYPE highKey = (ascending ? value2 : value1);
+                    if (lowKey > highKey) {
+                        elements[threadIdx.x] = value2;
+                        elements[ixj] = value1;
+                    }
+                }
+            }
+            __syncthreads();
+        }
+    }
+
+    // Create a function composed of linear segments mapping data values to bucket indices.
+
+    __shared__ float segmentLowerBound[9];
+    __shared__ float segmentBaseIndex[9];
+    __shared__ float segmentIndexScale[9];
+    if (threadIdx.x == 0) {
+        segmentLowerBound[0] = elements[0]-0.2f*(elements[5]-elements[0]);
+        segmentLowerBound[1] = elements[5];
+        segmentLowerBound[2] = elements[10];
+        segmentLowerBound[3] = elements[20];
+        segmentLowerBound[4] = elements[30];
+        segmentLowerBound[5] = elements[40];
+        segmentLowerBound[6] = elements[50];
+        segmentLowerBound[7] = elements[60];
+        segmentLowerBound[8] = elements[63]+0.2f*(elements[63]-elements[58]);
+        segmentBaseIndex[0] = numBuckets/16;
+        segmentBaseIndex[1] = 3*numBuckets/16;
+        segmentBaseIndex[2] = 5*numBuckets/16;
+        segmentBaseIndex[3] = 7*numBuckets/16;
+        segmentBaseIndex[4] = 9*numBuckets/16;
+        segmentBaseIndex[5] = 11*numBuckets/16;
+        segmentBaseIndex[6] = 13*numBuckets/16;
+        segmentBaseIndex[7] = 15*numBuckets/16;
+        segmentBaseIndex[8] = numBuckets;
+        for (int i = 0; i < 8; i++)
+            if (segmentLowerBound[i+1] == segmentLowerBound[i])
+                segmentIndexScale[i] = 0;
+            else
+                segmentIndexScale[i] = (segmentBaseIndex[i+1]-segmentBaseIndex[i])/(segmentLowerBound[i+1]-segmentLowerBound[i]);
+    }
+    __syncthreads();
+
+    // Assign elements to buckets.
+
+    for (unsigned int index = blockDim.x*blockIdx.x+threadIdx.x; index < length; index += blockDim.x*gridDim.x) {
+        float key = (float) getValue(data[index]);
+        int segment;
+        for (segment = 0; segment < 7 && key > segmentLowerBound[segment+1]; segment++)
+            ;
+        unsigned int bucketIndex = segmentBaseIndex[segment]+(key-segmentLowerBound[segment])*segmentIndexScale[segment];
+        bucketIndex = min(max(0, bucketIndex), numBuckets-1);
+        offsetInBucket[index] = atomicAdd(&bucketOffset[bucketIndex], 1);
+        bucketOfElement[index] = bucketIndex;
+    }
+}
+
+/**
+ * Sum the bucket sizes to compute the start position of each bucket.  This kernel
+ * is executed as a single work group.
+ */
+__global__ __launch_bounds__(1024) void computeBucketPositions(unsigned int numBuckets, unsigned int* __restrict__ bucketOffset, unsigned int* __restrict__ counters) {
+    extern __shared__ unsigned int posBuffer[];
+    unsigned int globalOffset = 0;
+    for (unsigned int startBucket = 0; startBucket < numBuckets; startBucket += blockDim.x) {
+        // Load the bucket sizes into local memory.
+
+        unsigned int globalIndex = startBucket+threadIdx.x;
+        __syncthreads();
+        posBuffer[threadIdx.x] = (globalIndex < numBuckets ? bucketOffset[globalIndex] : 0);
+        __syncthreads();
+
+        // Perform a parallel prefix sum.
+
+        for (unsigned int step = 1; step < blockDim.x; step *= 2) {
+            unsigned int add = (threadIdx.x >= step ? posBuffer[threadIdx.x-step] : 0);
+            __syncthreads();
+            posBuffer[threadIdx.x] += add;
+            __syncthreads();
+        }
+
+        // Write the results back to global memory.
+
+        if (globalIndex < numBuckets)
+            bucketOffset[globalIndex] = posBuffer[threadIdx.x]+globalOffset;
+        globalOffset += posBuffer[blockDim.x-1];
+    }
+    if (threadIdx.x == 0) {
+        counters[0] = 0;
+    }
+}
+
+/**
+ * Copy the input data into the buckets for sorting.
+ */
+__global__ void copyDataToBuckets(const DATA_TYPE* __restrict__ data, DATA_TYPE* __restrict__ buckets, unsigned int length, const unsigned int* __restrict__ bucketOffset, const unsigned int* __restrict__ bucketOfElement, const unsigned int* __restrict__ offsetInBucket) {
+    for (unsigned int index = blockDim.x*blockIdx.x+threadIdx.x; index < length; index += blockDim.x*gridDim.x) {
+        DATA_TYPE element = data[index];
+        unsigned int bucketIndex = bucketOfElement[index];
+        unsigned int offset = (bucketIndex == 0 ? 0 : bucketOffset[bucketIndex-1]);
+        buckets[offset+offsetInBucket[index]] = element;
+    }
+}
+
+/**
+ * Sort the data in each bucket.
+ */
+__global__ void sortBuckets(DATA_TYPE* __restrict__ data, const DATA_TYPE* __restrict__ buckets, const unsigned int* __restrict__ bucketOffset) {
+    extern __shared__ DATA_TYPE dataBuffer[];
+    unsigned int index = blockIdx.x;
+    unsigned int startIndex = (index == 0 ? 0 : bucketOffset[index-1]);
+    unsigned int endIndex = bucketOffset[index];
+    unsigned int length = endIndex-startIndex;
+    unsigned int lenghtNextPow2 = length <= 2 ? length : (1 << (32 - __clz(length - 1)));
+    if (length <= blockDim.x) {
+        // Load the data into local memory.
+
+        if (threadIdx.x < length)
+            dataBuffer[threadIdx.x] = buckets[startIndex+threadIdx.x];
+        else if (threadIdx.x < lenghtNextPow2)
+            dataBuffer[threadIdx.x] = MAX_VALUE;
+        __syncthreads();
+
+        // Perform a bitonic sort in local memory.
+
+        for (unsigned int k = 2; k <= lenghtNextPow2; k *= 2) {
+            for (unsigned int j = k/2; j > 0; j /= 2) {
+                int ixj = threadIdx.x^j;
+                if (threadIdx.x < lenghtNextPow2 && ixj > threadIdx.x) {
+                    DATA_TYPE value1 = dataBuffer[threadIdx.x];
+                    DATA_TYPE value2 = dataBuffer[ixj];
+                    bool ascending = (threadIdx.x&k) == 0;
+                    KEY_TYPE lowKey = (ascending ? getValue(value1) : getValue(value2));
+                    KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
+                    if (lowKey > highKey) {
+                        dataBuffer[threadIdx.x] = value2;
+                        dataBuffer[ixj] = value1;
+                    }
+                }
+                __syncthreads();
+            }
+        }
+
+        // Write the data to the sorted array.
+
+        if (threadIdx.x < length)
+            data[startIndex+threadIdx.x] = dataBuffer[threadIdx.x];
+    }
+    else {
+        // Copy the bucket data over to the output array.
+
+        for (unsigned int i = threadIdx.x; i < length; i += blockDim.x)
+            data[startIndex+i] = buckets[startIndex+i];
+        __syncthreads();
+
+        // Perform a bitonic sort in global memory.
+
+        for (unsigned int k = 2; k <= lenghtNextPow2; k *= 2) {
+            for (unsigned int j = k/2; j > 0; j /= 2) {
+                for (unsigned int i = threadIdx.x; i < length; i += blockDim.x) {
+                    int ixj = i^j;
+                    if (ixj > i && ixj < length) {
+                        DATA_TYPE value1 = data[startIndex+i];
+                        DATA_TYPE value2 = data[startIndex+ixj];
+                        bool ascending = (__popc(~i & (lenghtNextPow2 - k)) & 1) == 0;
+                        KEY_TYPE lowKey  = (ascending ? getValue(value1) : getValue(value2));
+                        KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
+                        if (lowKey > highKey) {
+                            data[startIndex+i] = value2;
+                            data[startIndex+ixj] = value1;
+                        }
+                    }
+                }
+                __syncthreads();
+            }
+        }
+    }
+}
+
+}
--- a/platforms/hip/src/kernels/utilities.hip
+++ b/platforms/hip/src/kernels/utilities.hip
+extern "C" {
+
+/**
+ * This is called by the various functions below to clear a buffer.
+ */
+__device__ void clearSingleBuffer(int* __restrict__ buffer, int size) {
+    int index = blockDim.x*blockIdx.x+threadIdx.x;
+    int4* buffer4 = (int4*) buffer;
+    int sizeDiv4 = size/4;
+    while (index < sizeDiv4) {
+        buffer4[index] = make_int4(0);
+        index += blockDim.x*gridDim.x;
+    }
+    if (blockDim.x*blockIdx.x+threadIdx.x == 0)
+        for (int i = sizeDiv4*4; i < size; i++)
+            buffer[i] = 0;
+}
+
+/**
+ * Fill a buffer with 0.
+ */
+__global__ void clearBuffer(int* __restrict__ buffer, int size) {
+    clearSingleBuffer(buffer, size);
+}
+
+/**
+ * Fill two buffers with 0.
+ */
+__global__ void clearTwoBuffers(int* __restrict__ buffer1, int size1, int* __restrict__ buffer2, int size2) {
+    clearSingleBuffer(buffer1, size1);
+    clearSingleBuffer(buffer2, size2);
+}
+
+/**
+ * Fill three buffers with 0.
+ */
+__global__ void clearThreeBuffers(int* __restrict__ buffer1, int size1, int* __restrict__ buffer2, int size2, int* __restrict__ buffer3, int size3) {
+    clearSingleBuffer(buffer1, size1);
+    clearSingleBuffer(buffer2, size2);
+    clearSingleBuffer(buffer3, size3);
+}
+
+/**
+ * Fill four buffers with 0.
+ */
+__global__ void clearFourBuffers(int* __restrict__ buffer1, int size1, int* __restrict__ buffer2, int size2, int* __restrict__ buffer3, int size3, int* __restrict__ buffer4, int size4) {
+    clearSingleBuffer(buffer1, size1);
+    clearSingleBuffer(buffer2, size2);
+    clearSingleBuffer(buffer3, size3);
+    clearSingleBuffer(buffer4, size4);
+}
+
+/**
+ * Fill five buffers with 0.
+ */
+__global__ void clearFiveBuffers(int* __restrict__ buffer1, int size1, int* __restrict__ buffer2, int size2, int* __restrict__ buffer3, int size3, int* __restrict__ buffer4, int size4, int* __restrict__ buffer5, int size5) {
+    clearSingleBuffer(buffer1, size1);
+    clearSingleBuffer(buffer2, size2);
+    clearSingleBuffer(buffer3, size3);
+    clearSingleBuffer(buffer4, size4);
+    clearSingleBuffer(buffer5, size5);
+}
+
+/**
+ * Fill six buffers with 0.
+ */
+__global__ void clearSixBuffers(int* __restrict__ buffer1, int size1, int* __restrict__ buffer2, int size2, int* __restrict__ buffer3, int size3, int* __restrict__ buffer4, int size4, int* __restrict__ buffer5, int size5, int* __restrict__ buffer6, int size6) {
+    clearSingleBuffer(buffer1, size1);
+    clearSingleBuffer(buffer2, size2);
+    clearSingleBuffer(buffer3, size3);
+    clearSingleBuffer(buffer4, size4);
+    clearSingleBuffer(buffer5, size5);
+    clearSingleBuffer(buffer6, size6);
+}
+
+/**
+ * Sum the energy buffer.
+ */
+__global__ void reduceEnergy(const mixed* __restrict__ energyBuffer, mixed* __restrict__ result, int bufferSize, int workGroupSize) {
+    extern __shared__ mixed tempBuffer[];
+    const unsigned int thread = threadIdx.x;
+    mixed sum = 0;
+    for (unsigned int index = blockDim.x*blockIdx.x+threadIdx.x; index < bufferSize; index += blockDim.x*gridDim.x)
+        sum += energyBuffer[index];
+    tempBuffer[thread] = sum;
+    for (int i = 1; i < workGroupSize; i *= 2) {
+        __syncthreads();
+        if (thread%(i*2) == 0 && thread+i < workGroupSize)
+            tempBuffer[thread] += tempBuffer[thread+i];
+    }
+    if (thread == 0)
+        result[blockIdx.x] = tempBuffer[0];
+}
+
+/**
+ * Record the atomic charges into the posq array.
+ */
+__global__ void setCharges(real* __restrict__ charges, real4* __restrict__ posq, int* __restrict__ atomOrder, int numAtoms) {
+    for (int i = blockDim.x*blockIdx.x+threadIdx.x; i < numAtoms; i += blockDim.x*gridDim.x)
+        posq[i].w = charges[atomOrder[i]];
+}
+}
--- a/platforms/hip/src/kernels/vectorOps.hip
+++ b/platforms/hip/src/kernels/vectorOps.hip
+/**
+ * This file defines vector operations to simplify code elsewhere.
+ */
+
+// Versions of make_x() that take a single value and set all components to that.
+
+inline __device__ int2 make_int2(int a) {
+    return make_int2(a, a);
+}
+
+inline __device__ int3 make_int3(int a) {
+    return make_int3(a, a, a);
+}
+
+inline __device__ int4 make_int4(int a) {
+    return make_int4(a, a, a, a);
+}
+
+inline __device__ float2 make_float2(float a) {
+    return make_float2(a, a);
+}
+
+inline __device__ float3 make_float3(float a) {
+    return make_float3(a, a, a);
+}
+
+inline __device__ float4 make_float4(float a) {
+    return make_float4(a, a, a, a);
+}
+
+inline __device__ double2 make_double2(double a) {
+    return make_double2(a, a);
+}
+
+inline __device__ double3 make_double3(double a) {
+    return make_double3(a, a, a);
+}
+
+inline __device__ double4 make_double4(double a) {
+    return make_double4(a, a, a, a);
+}
+
+// Multiply a vector by a constant.
+
+inline __device__ int2 operator*(int2 a, int b) {
+    return make_int2(a.x*b, a.y*b);
+}
+
+inline __device__ int3 operator*(int3 a, int b) {
+    return make_int3(a.x*b, a.y*b, a.z*b);
+}
+
+inline __device__ int4 operator*(int4 a, int b) {
+    return make_int4(a.x*b, a.y*b, a.z*b, a.w*b);
+}
+
+inline __device__ int2 operator*(int a, int2 b) {
+    return make_int2(a*b.x, a*b.y);
+}
+
+inline __device__ int3 operator*(int a, int3 b) {
+    return make_int3(a*b.x, a*b.y, a*b.z);
+}
+
+inline __device__ int4 operator*(int a, int4 b) {
+    return make_int4(a*b.x, a*b.y, a*b.z, a*b.w);
+}
+
+inline __device__ float2 operator*(float2 a, float b) {
+    return make_float2(a.x*b, a.y*b);
+}
+
+inline __device__ float3 operator*(float3 a, float b) {
+    return make_float3(a.x*b, a.y*b, a.z*b);
+}
+
+inline __device__ float4 operator*(float4 a, float b) {
+    return make_float4(a.x*b, a.y*b, a.z*b, a.w*b);
+}
+
+inline __device__ float2 operator*(float a, float2 b) {
+    return make_float2(a*b.x, a*b.y);
+}
+
+inline __device__ float3 operator*(float a, float3 b) {
+    return make_float3(a*b.x, a*b.y, a*b.z);
+}
+
+inline __device__ float4 operator*(float a, float4 b) {
+    return make_float4(a*b.x, a*b.y, a*b.z, a*b.w);
+}
+
+inline __device__ double2 operator*(double2 a, double b) {
+    return make_double2(a.x*b, a.y*b);
+}
+
+inline __device__ double3 operator*(double3 a, double b) {
+    return make_double3(a.x*b, a.y*b, a.z*b);
+}
+
+inline __device__ double4 operator*(double4 a, double b) {
+    return make_double4(a.x*b, a.y*b, a.z*b, a.w*b);
+}
+
+inline __device__ double2 operator*(double a, double2 b) {
+    return make_double2(a*b.x, a*b.y);
+}
+
+inline __device__ double3 operator*(double a, double3 b) {
+    return make_double3(a*b.x, a*b.y, a*b.z);
+}
+
+inline __device__ double4 operator*(double a, double4 b) {
+    return make_double4(a*b.x, a*b.y, a*b.z, a*b.w);
+}
+
+// Divide a vector by a constant.
+
+inline __device__ int2 operator/(int2 a, int b) {
+    return make_int2(a.x/b, a.y/b);
+}
+
+inline __device__ int3 operator/(int3 a, int b) {
+    return make_int3(a.x/b, a.y/b, a.z/b);
+}
+
+inline __device__ int4 operator/(int4 a, int b) {
+    return make_int4(a.x/b, a.y/b, a.z/b, a.w/b);
+}
+
+inline __device__ float2 operator/(float2 a, float b) {
+    float scale = 1.0f/b;
+    return a*scale;
+}
+
+inline __device__ float3 operator/(float3 a, float b) {
+    float scale = 1.0f/b;
+    return a*scale;
+}
+
+inline __device__ float4 operator/(float4 a, float b) {
+    float scale = 1.0f/b;
+    return a*scale;
+}
+
+inline __device__ double2 operator/(double2 a, double b) {
+    double scale = 1.0/b;
+    return a*scale;
+}
+
+inline __device__ double3 operator/(double3 a, double b) {
+    double scale = 1.0/b;
+    return a*scale;
+}
+
+inline __device__ double4 operator/(double4 a, double b) {
+    double scale = 1.0/b;
+    return a*scale;
+}
+
+// *= operator (multiply vector by constant)
+
+inline __device__ void operator*=(int2& a, int b) {
+    a.x *= b; a.y *= b;
+}
+
+inline __device__ void operator*=(int3& a, int b) {
+    a.x *= b; a.y *= b; a.z *= b;
+}
+
+inline __device__ void operator*=(int4& a, int b) {
+    a.x *= b; a.y *= b; a.z *= b; a.w *= b;
+}
+
+inline __device__ void operator*=(float2& a, float b) {
+    a.x *= b; a.y *= b;
+}
+
+inline __device__ void operator*=(float3& a, float b) {
+    a.x *= b; a.y *= b; a.z *= b;
+}
+
+inline __device__ void operator*=(float4& a, float b) {
+    a.x *= b; a.y *= b; a.z *= b; a.w *= b;
+}
+
+inline __device__ void operator*=(double2& a, double b) {
+    a.x *= b; a.y *= b;
+}
+
+inline __device__ void operator*=(double3& a, double b) {
+    a.x *= b; a.y *= b; a.z *= b;
+}
+
+inline __device__ void operator*=(double4& a, double b) {
+    a.x *= b; a.y *= b; a.z *= b; a.w *= b;
+}
+
+// Dot product
+
+inline __device__ float dot(float3 a, float3 b) {
+    return a.x*b.x+a.y*b.y+a.z*b.z;
+}
+
+inline __device__ double dot(double3 a, double3 b) {
+    return a.x*b.x+a.y*b.y+a.z*b.z;
+}
+
+// Cross product
+
+inline __device__ float3 cross(float3 a, float3 b) {
+    return make_float3(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x);
+}
+
+inline __device__ float4 cross(float4 a, float4 b) {
+    return make_float4(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x, 0.0f);
+}
+
+inline __device__ double3 cross(double3 a, double3 b) {
+    return make_double3(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x);
+}
+
+inline __device__ double4 cross(double4 a, double4 b) {
+    return make_double4(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x, 0.0);
+}
+
+// Normalize a vector
+
+inline __device__ float2 normalize(float2 a) {
+    return a*rsqrtf(a.x*a.x+a.y*a.y);
+}
+
+inline __device__ float3 normalize(float3 a) {
+    return a*rsqrtf(a.x*a.x+a.y*a.y+a.z*a.z);
+}
+
+inline __device__ float4 normalize(float4 a) {
+    return a*rsqrtf(a.x*a.x+a.y*a.y+a.z*a.z+a.w*a.w);
+}
+
+inline __device__ double2 normalize(double2 a) {
+    return a*rsqrt(a.x*a.x+a.y*a.y);
+}
+
+inline __device__ double3 normalize(double3 a) {
+    return a*rsqrt(a.x*a.x+a.y*a.y+a.z*a.z);
+}
+
+inline __device__ double4 normalize(double4 a) {
+    return a*rsqrt(a.x*a.x+a.y*a.y+a.z*a.z+a.w*a.w);
+}
+
+// Strip off the fourth component of a vector.
+
+inline __device__ short3 trimTo3(short4 v) {
+    return make_short3(v.x, v.y, v.z);
+}
+
+inline __device__ int3 trimTo3(int4 v) {
+    return make_int3(v.x, v.y, v.z);
+}
+
+inline __device__ float3 trimTo3(float4 v) {
+    return make_float3(v.x, v.y, v.z);
+}
+
+inline __device__ double3 trimTo3(double4 v) {
+    return make_double3(v.x, v.y, v.z);
+}
--- a/platforms/hip/tests/CMakeLists.txt
+++ b/platforms/hip/tests/CMakeLists.txt
+#
+# Testing
+#
+
+ENABLE_TESTING()
+
+SET(OPENMM_BUILD_HIP_DOUBLE_PRECISION_TESTS TRUE CACHE BOOL "Whether to build double precision versions of HIP test cases")
+
+SET( INCLUDE_SERIALIZATION FALSE )
+#SET( INCLUDE_SERIALIZATION TRUE )
+
+IF( INCLUDE_SERIALIZATION )
+    INCLUDE_DIRECTORIES(${OPENMM_DIR}/serialization/include)
+    SET( SHARED_OPENMM_SERIALIZATION "OpenMMSerialization" )
+ENDIF( INCLUDE_SERIALIZATION )
+
+# Automatically create tests using files named "Test*.cpp"
+FILE(GLOB TEST_PROGS "*Test*.cpp")
+FOREACH(TEST_PROG ${TEST_PROGS})
+    GET_FILENAME_COMPONENT(TEST_ROOT ${TEST_PROG} NAME_WE)
+
+    # Link with shared library
+    ADD_EXECUTABLE(${TEST_ROOT} ${TEST_PROG})
+    TARGET_LINK_LIBRARIES(${TEST_ROOT} ${SHARED_TARGET})
+    SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
+
+    ADD_TEST(${TEST_ROOT}Single ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT} single)
+    IF(OPENMM_BUILD_HIP_DOUBLE_PRECISION_TESTS)
+        ADD_TEST(${TEST_ROOT}Mixed ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT} mixed)
+        ADD_TEST(${TEST_ROOT}Double ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT} double)
+    ENDIF(OPENMM_BUILD_HIP_DOUBLE_PRECISION_TESTS)
+
+ENDFOREACH(TEST_PROG ${TEST_PROGS})
--- a/platforms/hip/tests/HipTests.h
+++ b/platforms/hip/tests/HipTests.h
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2015-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#ifdef WIN32
+  #define _USE_MATH_DEFINES // Needed to get M_PI
+#endif
+#include "HipPlatform.h"
+#include <string>
+
+OpenMM::HipPlatform platform;
+
+void initializeTests(int argc, char* argv[]) {
+    if (argc > 1)
+        platform.setPropertyDefaultValue("Precision", std::string(argv[1]));
+    if (argc > 2)
+        platform.setPropertyDefaultValue("DeviceIndex", std::string(argv[2]));
+}
--- a/platforms/hip/tests/TestHipATMForce.cpp
+++ b/platforms/hip/tests/TestHipATMForce.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2023 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "HipTests.h"
+#include "TestATMForce.h"
+
+void runPlatformTests() {
+}
--- a/platforms/hip/tests/TestHipAndersenThermostat.cpp
+++ b/platforms/hip/tests/TestHipAndersenThermostat.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2015 Stanford University and the Authors.           *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "HipTests.h"
+#include "TestAndersenThermostat.h"
+
+void runPlatformTests() {
+}
--- a/platforms/hip/tests/TestHipBrownianIntegrator.cpp
+++ b/platforms/hip/tests/TestHipBrownianIntegrator.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2015 Stanford University and the Authors.           *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "HipTests.h"
+#include "TestBrownianIntegrator.h"
+
+void runPlatformTests() {
+}
--- a/platforms/hip/tests/TestHipCMAPTorsionForce.cpp
+++ b/platforms/hip/tests/TestHipCMAPTorsionForce.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2015 Stanford University and the Authors.           *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "HipTests.h"
+#include "TestCMAPTorsionForce.h"
+
+void runPlatformTests() {
+}
--- a/platforms/hip/tests/TestHipCMMotionRemover.cpp
+++ b/platforms/hip/tests/TestHipCMMotionRemover.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2015 Stanford University and the Authors.           *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "HipTests.h"
+#include "TestCMMotionRemover.h"
+
+void runPlatformTests() {
+}
--- a/platforms/hip/tests/TestHipCheckpoints.cpp
+++ b/platforms/hip/tests/TestHipCheckpoints.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2012-2015 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "HipTests.h"
+#include "TestCheckpoints.h"
+
+void testCheckpoint() {
+    const int numParticles = 100;
+    const double boxSize = 5.0;
+    const double temperature = 200.0;
+    System system;
+    system.addForce(new AndersenThermostat(0.0, 100.0));
+    NonbondedForce* nonbonded = new NonbondedForce();
+    system.addForce(nonbonded);
+    nonbonded->setNonbondedMethod(NonbondedForce::CutoffPeriodic);
+    vector<Vec3> positions(numParticles);
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+    for (int i = 0; i < numParticles; i++) {
+        system.addParticle(1.0);
+        nonbonded->addParticle(i%2 == 0 ? 0.1 : -0.1, 0.2, 0.1);
+        bool clash;
+        do {
+            clash = false;
+            positions[i] = Vec3(boxSize*genrand_real2(sfmt), boxSize*genrand_real2(sfmt), boxSize*genrand_real2(sfmt));
+            for (int j = 0; j < i; j++) {
+                Vec3 delta = positions[i]-positions[j];
+                if (sqrt(delta.dot(delta)) < 0.1)
+                    clash = true;
+            }
+        } while (clash);
+    }
+    VerletIntegrator integrator(0.001);
+    Context context(system, integrator, platform);
+    context.setPositions(positions);
+    context.setPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
+    context.setParameter(AndersenThermostat::Temperature(), temperature);
+
+    // Run for a little while.
+
+    integrator.step(100);
+
+    // Record the current state and make a checkpoint.
+
+    State s1 = context.getState(State::Positions | State::Velocities | State::Parameters);
+    stringstream stream1(ios_base::out | ios_base::in | ios_base::binary);
+    context.createCheckpoint(stream1);
+
+    // Continue the simulation for a few more steps and record the state again.
+
+    integrator.step(10);
+    State s2 = context.getState(State::Positions | State::Velocities | State::Parameters);
+
+    // Restore from the checkpoint and see if everything gets restored correctly.
+
+    context.setPeriodicBoxVectors(Vec3(2*boxSize, 0, 0), Vec3(0, 2*boxSize, 0), Vec3(0, 0, 2*boxSize));
+    context.setParameter(AndersenThermostat::Temperature(), temperature+10);
+    context.loadCheckpoint(stream1);
+    State s3 = context.getState(State::Positions | State::Velocities | State::Parameters);
+    compareStates(s1, s3);
+
+    // Now simulate from there and see if the trajectory is identical.
+
+    integrator.step(10);
+    State s4 = context.getState(State::Positions | State::Velocities | State::Parameters);
+    compareStates(s2, s4);
+
+    // Create a new Context that uses multiple devices.
+
+    string deviceIndex = platform.getPropertyValue(context, HipPlatform::HipDeviceIndex());
+    map<string, string> props;
+    props[HipPlatform::HipDeviceIndex()] = deviceIndex+","+deviceIndex;
+    VerletIntegrator integrator2(0.001);
+    Context context2(system, integrator2, platform, props);
+    context2.setPositions(positions);
+    context2.setPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
+    context2.setParameter(AndersenThermostat::Temperature(), temperature);
+
+    // Now repeat all of the above tests with it.
+
+    integrator2.step(100);
+    State s5 = context2.getState(State::Positions | State::Velocities | State::Parameters);
+    stringstream stream2(ios_base::out | ios_base::in | ios_base::binary);
+    context2.createCheckpoint(stream2);
+    integrator2.step(10);
+    State s6 = context2.getState(State::Positions | State::Velocities | State::Parameters);
+    context2.setPeriodicBoxVectors(Vec3(2*boxSize, 0, 0), Vec3(0, 2*boxSize, 0), Vec3(0, 0, 2*boxSize));
+    context2.setParameter(AndersenThermostat::Temperature(), temperature+10);
+    context2.loadCheckpoint(stream2);
+    State s7 = context2.getState(State::Positions | State::Velocities | State::Parameters);
+    compareStates(s5, s7);
+    integrator2.step(10);
+    State s8 = context2.getState(State::Positions | State::Velocities | State::Parameters);
+    compareStates(s6, s8);
+
+    // See if a checkpoint created from one Context can be loaded into a different one.
+
+    VerletIntegrator integrator3(0.001);
+    Context context3(system, integrator3, platform);
+    stream1.seekg(0, stream1.beg);
+    context3.loadCheckpoint(stream1);
+    State s9 = context3.getState(State::Positions | State::Velocities | State::Parameters | State::Energy);
+    compareStates(s1, s9);
+}
+
+void runPlatformTests() {
+    testCheckpoint();
+}
--- a/platforms/hip/tests/TestHipCompoundIntegrator.cpp
+++ b/platforms/hip/tests/TestHipCompoundIntegrator.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2015 Stanford University and the Authors.           *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "HipTests.h"
+#include "TestCompoundIntegrator.h"
+
+void runPlatformTests() {
+}
--- a/platforms/hip/tests/TestHipCustomAngleForce.cpp
+++ b/platforms/hip/tests/TestHipCustomAngleForce.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2015 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "HipTests.h"
+#include "TestCustomAngleForce.h"
+
+void testParallelComputation() {
+    System system;
+    const int numParticles = 200;
+    for (int i = 0; i < numParticles; i++)
+        system.addParticle(1.0);
+    CustomAngleForce* force = new CustomAngleForce("(theta-1.1)^2");
+    vector<double> params;
+    for (int i = 2; i < numParticles; i++)
+        force->addAngle(i-2, i-1, i, params);
+    system.addForce(force);
+    vector<Vec3> positions(numParticles);
+    for (int i = 0; i < numParticles; i++)
+        positions[i] = Vec3(i, i%2, 0);
+    VerletIntegrator integrator1(0.01);
+    Context context1(system, integrator1, platform);
+    context1.setPositions(positions);
+    State state1 = context1.getState(State::Forces | State::Energy);
+    VerletIntegrator integrator2(0.01);
+    string deviceIndex = platform.getPropertyValue(context1, HipPlatform::HipDeviceIndex());
+    map<string, string> props;
+    props[HipPlatform::HipDeviceIndex()] = deviceIndex+","+deviceIndex;
+    Context context2(system, integrator2, platform, props);
+    context2.setPositions(positions);
+    State state2 = context2.getState(State::Forces | State::Energy);
+    ASSERT_EQUAL_TOL(state1.getPotentialEnergy(), state2.getPotentialEnergy(), 1e-5);
+    for (int i = 0; i < numParticles; i++)
+        ASSERT_EQUAL_VEC(state1.getForces()[i], state2.getForces()[i], 1e-5);
+}
+
+void runPlatformTests() {
+    testParallelComputation();
+}
--- a/platforms/hip/tests/TestHipCustomBondForce.cpp
+++ b/platforms/hip/tests/TestHipCustomBondForce.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2015 Stanford University and the Authors.      *
+ * Portions copyright (c) 2020 Advanced Micro Devices, Inc.                   *
+ * Authors: Peter Eastman, Nicholas Curtis                                    *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "HipTests.h"
+#include "TestCustomBondForce.h"
+
+void testParallelComputation() {
+    System system;
+    const int numParticles = 200;
+    for (int i = 0; i < numParticles; i++)
+        system.addParticle(1.0);
+    CustomBondForce* force = new CustomBondForce(("(r-1.1)^2"));
+    vector<double> params;
+    for (int i = 1; i < numParticles; i++)
+        force->addBond(i-1, i, params);
+    system.addForce(force);
+    vector<Vec3> positions(numParticles);
+    for (int i = 0; i < numParticles; i++)
+        positions[i] = Vec3(i, 0, 0);
+    VerletIntegrator integrator1(0.01);
+    Context context1(system, integrator1, platform);
+    context1.setPositions(positions);
+    State state1 = context1.getState(State::Forces | State::Energy);
+    VerletIntegrator integrator2(0.01);
+    string deviceIndex = platform.getPropertyValue(context1, HipPlatform::HipDeviceIndex());
+    map<string, string> props;
+    props[HipPlatform::HipDeviceIndex()] = deviceIndex+","+deviceIndex;
+    Context context2(system, integrator2, platform, props);
+    context2.setPositions(positions);
+    State state2 = context2.getState(State::Forces | State::Energy);
+    ASSERT_EQUAL_TOL(state1.getPotentialEnergy(), state2.getPotentialEnergy(), 1e-5);
+    for (int i = 0; i < numParticles; i++)
+        ASSERT_EQUAL_VEC(state1.getForces()[i], state2.getForces()[i], 1e-5);
+}
+
+void runPlatformTests() {
+    testParallelComputation();
+}
--- a/platforms/hip/tests/TestHipCustomCPPForce.cpp
+++ b/platforms/hip/tests/TestHipCustomCPPForce.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2023 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "HipTests.h"
+#include "TestCustomCPPForce.h"
+
+void runPlatformTests() {
+}