Merge pull request #2051 from peastman/syncerror

Fixed incorrect synchronization on Volta

Merge pull request #2051 from peastman/syncerror
Fixed incorrect synchronization on Volta
72def6fb · peastman · GitHub · c9eb7fb5 · f1bbb8e7 · 72def6fb
Unverified Commit 72def6fb authored Apr 26, 2018 by peastman Committed by GitHub Apr 26, 2018
2 changed files
--- a/platforms/cuda/src/kernels/customNonbondedGroups.cu
+++ b/platforms/cuda/src/kernels/customNonbondedGroups.cu
@@ -8,9 +8,22 @@ typedef struct {
 #endif
 } AtomData;
+/**
+ * Find the maximum of a value across all threads in a warp, and return that to
+ * every thread.  This is only needed on Volta and later.  On earlier architectures,
+ * we can just return the value that was passed in.
+ */
+__device__ int reduceMax(int val) {
+#if __CUDA_ARCH__ >= 700
+    for (int mask = 16; mask > 0; mask /= 2) 
+        val = max(val, __shfl_xor(val, mask));
+#endif
+    return val;
+}
 extern "C" __global__ void computeInteractionGroups(
        unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const int4* __restrict__ groupData,
-        int* __restrict__ numGroupTiles, bool useNeighborList,
+        const int* __restrict__ numGroupTiles, bool useNeighborList,
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
        PARAMETER_ARGUMENTS) {
    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
@@ -43,7 +56,10 @@ extern "C" __global__ void computeInteractionGroups(
        localData[threadIdx.x].fy = 0.0f;
        localData[threadIdx.x].fz = 0.0f;
        int tj = tgx;
-        for (int j = rangeStart; j < rangeEnd; j++) {
+        int rangeStop = rangeStart + reduceMax(rangeEnd-rangeStart);
+        SYNC_WARPS;
+        for (int j = rangeStart; j < rangeStop; j++) {
+            if (j < rangeEnd) {
                bool isExcluded = (((exclusions>>tj)&1) == 0);
                int localIndex = tbx+tj;
                posq2 = make_real4(localData[localIndex].x, localData[localIndex].y, localData[localIndex].z, localData[localIndex].q);
@@ -75,6 +91,8 @@ extern "C" __global__ void computeInteractionGroups(
 #endif
                tj = (tj == rangeEnd-1 ? rangeStart : tj+1);
            }
+            SYNC_WARPS;
+        }
        if (exclusions != 0) {
            atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
            atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
@@ -83,6 +101,7 @@ extern "C" __global__ void computeInteractionGroups(
        atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
        atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
        atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
+        SYNC_WARPS;
    }
    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
    SAVE_DERIVATIVES
@@ -134,9 +153,11 @@ extern "C" __global__  void buildNeighborList(int* __restrict__ rebuildNeighborL
        if (tgx == 0)
            anyInteraction[local_warp] = false;
        int tj = tgx;
+        int rangeStop = rangeStart + reduceMax(rangeEnd-rangeStart);
+        SYNC_WARPS;
+        for (int j = rangeStart; j < rangeStop && !anyInteraction[local_warp]; j++) {
            SYNC_WARPS;
-        for (int j = rangeStart; j < rangeEnd && !anyInteraction[local_warp]; j++) {
+            if (j < rangeEnd && tj < rangeEnd) {
-            if (tj < rangeEnd) {
                bool isExcluded = (((exclusions>>tj)&1) == 0);
                int localIndex = tbx+tj;
                real3 delta = make_real3(localPos[localIndex].x-posq1.x, localPos[localIndex].y-posq1.y, localPos[localIndex].z-posq1.z);

--- a/platforms/opencl/src/kernels/customNonbondedGroups.cl
+++ b/platforms/opencl/src/kernels/customNonbondedGroups.cl
@@ -12,6 +12,22 @@ typedef struct {
 #endif
 } AtomData;
+/**
+ * Find the maximum of a value across all threads in a warp, and return that to
+ * every thread.
+ */
+int reduceMax(int val, __local int* temp) {
+    int indexInWarp = get_local_id(0)%32;
+    temp[get_local_id(0)] = val;
+    SYNC_WARPS;
+    for (int offset = 16; offset > 0; offset /= 2) {
+        if (offset < indexInWarp)
+            temp[get_local_id(0)] = max(temp[get_local_id(0)], temp[get_local_id(0)+offset]);
+        SYNC_WARPS;
+    }
+    return temp[get_local_id(0)-indexInWarp];
+}
 /**
 * This function is used on devices that don't support 64 bit atomics.  Multiple threads within
 * a single tile might have computed forces on the same atom.  This loops over them and makes sure
@@ -53,6 +69,7 @@ __kernel void computeInteractionGroups(
    mixed energy = 0;
    INIT_DERIVATIVES
    __local AtomData localData[LOCAL_MEMORY_SIZE];
+    __local int reductionBuffer[LOCAL_MEMORY_SIZE];
    const unsigned int startTile = (useNeighborList ? warp*numGroupTiles[0]/totalWarps : FIRST_TILE+warp*(LAST_TILE-FIRST_TILE)/totalWarps);
    const unsigned int endTile = (useNeighborList ? (warp+1)*numGroupTiles[0]/totalWarps : FIRST_TILE+(warp+1)*(LAST_TILE-FIRST_TILE)/totalWarps);
@@ -76,9 +93,10 @@ __kernel void computeInteractionGroups(
        localData[get_local_id(0)].fy = 0.0f;
        localData[get_local_id(0)].fz = 0.0f;
        int tj = tgx;
+        int rangeStop = rangeStart + reduceMax(rangeEnd-rangeStart, reductionBuffer);
        SYNC_WARPS;
-        for (int j = rangeStart; j < rangeEnd; j++) {
+        for (int j = rangeStart; j < rangeStop; j++) {
-            if (tj < rangeEnd) {
+            if (j < rangeEnd) {
                bool isExcluded = (((exclusions>>tj)&1) == 0);
                int localIndex = tbx+tj;
                posq2 = (real4) (localData[localIndex].x, localData[localIndex].y, localData[localIndex].z, localData[localIndex].q);
@@ -161,6 +179,7 @@ __kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __gl
    __local real4 localPos[LOCAL_MEMORY_SIZE];
    __local volatile bool anyInteraction[WARPS_IN_BLOCK];
    __local volatile int tileIndex[WARPS_IN_BLOCK];
+    __local int reductionBuffer[LOCAL_MEMORY_SIZE];
    const unsigned int startTile = warp*NUM_TILES/totalWarps;
    const unsigned int endTile = (warp+1)*NUM_TILES/totalWarps;
@@ -176,9 +195,11 @@ __kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __gl
        if (tgx == 0)
            anyInteraction[local_warp] = false;
        int tj = tgx;
+        int rangeStop = rangeStart + reduceMax(rangeEnd-rangeStart, reductionBuffer);
+        SYNC_WARPS;
+        for (int j = rangeStart; j < rangeStop && !anyInteraction[local_warp]; j++) {
            SYNC_WARPS;
-        for (int j = rangeStart; j < rangeEnd && !anyInteraction[local_warp]; j++) {
+            if (j < rangeEnd) {
-            if (tj < rangeEnd) {
                bool isExcluded = (((exclusions>>tj)&1) == 0);
                int localIndex = tbx+tj;
                real4 delta = (real4) (localPos[localIndex].xyz - posq1.xyz, 0);