Merge pull request #7 from proteneer/master

Nonbonded.cu now uses shuffle intrinsic

Merge pull request #7 from proteneer/master
Nonbonded.cu now uses shuffle intrinsic
08f1039f · peastman · 7521da4a · 8e6c1180 · 08f1039f · 08f1039f
Commit 08f1039f authored May 23, 2013 by peastman
Showing with 304 additions and 96 deletions

platforms/cuda/src/CudaNonbondedUtilities.cpp platforms/cuda/src/CudaNonbondedUtilities.cpp +129 -37

platforms/cuda/src/kernels/nonbonded.cu platforms/cuda/src/kernels/nonbonded.cu +175 -59

No files found.
--- a/platforms/cuda/src/CudaNonbondedUtilities.cpp
+++ b/platforms/cuda/src/CudaNonbondedUtilities.cpp
@@ -416,6 +416,12 @@ void CudaNonbondedUtilities::setAtomBlockRange(double startFraction, double endF
 }
 CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, vector<ParameterInfo>& params, vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric) {
+    map<string, string> defines;
+    if (context.getComputeCapability() >= 3.0 && !context.getUseDoublePrecision()) {
+        defines["ENABLE_SHUFFLE"] = "1";
+    }
    map<string, string> replacements;
    replacements["COMPUTE_INTERACTION"] = source;
    const string suffixes[] = {"x", "y", "z", "w"};
@@ -445,40 +451,100 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
        args << arguments[i].getName();
    }
    replacements["PARAMETER_ARGUMENTS"] = args.str();
+    stringstream load1;
+    for (int i = 0; i < (int) params.size(); i++) {
+        load1 << params[i].getType();
+        load1 << " ";
+        load1 << params[i].getName();
+        load1 << "1 = global_";
+        load1 << params[i].getName();
+        load1 << "[atom1];\n";
+    }
+    replacements["LOAD_ATOM1_PARAMETERS"] = load1.str();
+    bool useShuffle;
+    if(defines.find("ENABLE_SHUFFLE") != defines.end()) {
+        useShuffle = true;
+    } else {
+        useShuffle = false;
+    }
+    // Part 1. Defines for on diagonal exclusion tiles
    stringstream loadLocal1;
+    if(useShuffle) {
+        // not needed if using shuffles as we can directly fetch from register
+    } else {
        for (int i = 0; i < (int) params.size(); i++) {
            if (params[i].getNumComponents() == 1) {
-            loadLocal1<<"localData[localAtomIndex]."<<params[i].getName()<<" = "<<params[i].getName()<<"1;\n";
+                loadLocal1<<"localData[threadIdx.x]."<<params[i].getName()<<" = "<<params[i].getName()<<"1;\n";
            }
            else {
                for (int j = 0; j < params[i].getNumComponents(); ++j)
-                loadLocal1<<"localData[localAtomIndex]."<<params[i].getName()<<"_"<<suffixes[j]<<" = "<<params[i].getName()<<"1."<<suffixes[j]<<";\n";
+                    loadLocal1<<"localData[threadIdx.x]."<<params[i].getName()<<"_"<<suffixes[j]<<" = "<<params[i].getName()<<"1."<<suffixes[j]<<";\n";
+            }
        }
    }
    replacements["LOAD_LOCAL_PARAMETERS_FROM_1"] = loadLocal1.str();
+    stringstream broadcastWarpData;
+    if(useShuffle) {
+        broadcastWarpData << "posq2.x = real_shfl(shflPosq.x, j);\n";
+        broadcastWarpData << "posq2.y = real_shfl(shflPosq.y, j);\n";
+        broadcastWarpData << "posq2.z = real_shfl(shflPosq.z, j);\n";
+        broadcastWarpData << "posq2.w = real_shfl(shflPosq.w, j);\n";
+        for(int i=0; i< (int) params.size();i++) {
+            broadcastWarpData << params[i].getType() << " shfl" << params[i].getName() << ";\n";
+            for(int j=0; j < params[i].getNumComponents(); j++) {
+                string name;
+                if (params[i].getNumComponents() == 1) {
+                    broadcastWarpData << "shfl" << params[i].getName() << "=real_shfl(" << params[i].getName() <<"1,j);\n";
+                } else {
+                    broadcastWarpData << "shfl" << params[i].getName()+"."+suffixes[j] << "=real_shfl(" << params[i].getName()+"1."+suffixes[j] <<",j);\n";
+                }
+            }
+        }
+    } else {
+        // not used if not shuffling
+    }
+    replacements["BROADCAST_WARP_DATA"] = broadcastWarpData.str();
+    // Part 2. Defines for off-diagonal exclusions, and neighborlist tiles. 
+    stringstream declareLocal2;
+    if(useShuffle) {
+        for(int i=0; i< (int) params.size(); i++) {
+            declareLocal2<<params[i].getType()<<" shfl"<<params[i].getName()<<";\n";
+        }
+    } else {
+        // not used if using shared memory
+    }
+    replacements["DECLARE_LOCAL_PARAMETERS"] = declareLocal2.str();
    stringstream loadLocal2;
+    if(useShuffle) {
+        for(int i=0; i< (int) params.size(); i++) {
+            loadLocal2<<"shfl"<<params[i].getName()<<" = global_"<<params[i].getName()<<"[j];\n";
+        }
+    } else {
        for (int i = 0; i < (int) params.size(); i++) {
            if (params[i].getNumComponents() == 1) {
-            loadLocal2<<"localData[localAtomIndex]."<<params[i].getName()<<" = global_"<<params[i].getName()<<"[j];\n";
+                loadLocal2<<"localData[threadIdx.x]."<<params[i].getName()<<" = global_"<<params[i].getName()<<"[j];\n";
            }
            else {
                loadLocal2<<params[i].getType()<<" temp_"<<params[i].getName()<<" = global_"<<params[i].getName()<<"[j];\n";
                for (int j = 0; j < params[i].getNumComponents(); ++j)
-                loadLocal2<<"localData[localAtomIndex]."<<params[i].getName()<<"_"<<suffixes[j]<<" = temp_"<<params[i].getName()<<"."<<suffixes[j]<<";\n";
+                    loadLocal2<<"localData[threadIdx.x]."<<params[i].getName()<<"_"<<suffixes[j]<<" = temp_"<<params[i].getName()<<"."<<suffixes[j]<<";\n";
            }
        }
-    replacements["LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"] = loadLocal2.str();
-    stringstream load1;
-    for (int i = 0; i < (int) params.size(); i++) {
-        load1 << params[i].getType();
-        load1 << " ";
-        load1 << params[i].getName();
-        load1 << "1 = global_";
-        load1 << params[i].getName();
-        load1 << "[atom1];\n";
    }
-    replacements["LOAD_ATOM1_PARAMETERS"] = load1.str();
+    replacements["LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"] = loadLocal2.str();
    stringstream load2j;
+    if(useShuffle) {
+        for(int i = 0; i < (int) params.size(); i++)
+            load2j<<params[i].getType()<<" "<<params[i].getName()<<"2 = shfl"<<params[i].getName()<<";\n";
+    } else {
        for (int i = 0; i < (int) params.size(); i++) {
            if (params[i].getNumComponents() == 1) {
                load2j<<params[i].getType()<<" "<<params[i].getName()<<"2 = localData[atom2]."<<params[i].getName()<<";\n";
@@ -493,8 +559,36 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
                load2j<<");\n";
            }
        }
+    }
    replacements["LOAD_ATOM2_PARAMETERS"] = load2j.str();
-    map<string, string> defines;
+    stringstream shuffleWarpData;
+    if(useShuffle) {
+        shuffleWarpData << "shflPosq.x = real_shfl(shflPosq.x, tgx+1);\n";
+        shuffleWarpData << "shflPosq.y = real_shfl(shflPosq.y, tgx+1);\n";
+        shuffleWarpData << "shflPosq.z = real_shfl(shflPosq.z, tgx+1);\n";
+        shuffleWarpData << "shflPosq.w = real_shfl(shflPosq.w, tgx+1);\n";
+        shuffleWarpData << "shflForce.x = real_shfl(shflForce.x, tgx+1);\n";
+        shuffleWarpData << "shflForce.y = real_shfl(shflForce.y, tgx+1);\n";
+        shuffleWarpData << "shflForce.z = real_shfl(shflForce.z, tgx+1);\n";
+        for(int i=0; i < (int) params.size(); i++) {
+            if(params[i].getNumComponents() == 1) {
+                shuffleWarpData<<"shfl"<<params[i].getName()<<"=real_shfl(shfl"<<params[i].getName()<<", tgx+1);\n";
+            } else {
+                for(int j=0;j<params[i].getNumComponents();j++) {
+                    // looks something like shflsigmaEpsilon.x = real_shfl(shflsigmaEpsilon.x,tgx+1);
+                    shuffleWarpData<<"shfl"<<params[i].getName()
+                        <<"."<<suffixes[j]<<"=real_shfl(shfl"
+                        <<params[i].getName()<<"."<<suffixes[j]
+                        <<", tgx+1);\n";
+                }
+            }
+        }
+    } else {
+        // not used otherwise
+    }
+    replacements["SHUFFLE_WARP_DATA"] = shuffleWarpData.str();
    if (useCutoff)
        defines["USE_CUTOFF"] = "1";
    if (usePeriodic)
@@ -519,8 +613,6 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
    defines["LAST_EXCLUSION_TILE"] = context.intToString(endExclusionIndex);
    if ((localDataSize/4)%2 == 0 && !context.getUseDoublePrecision())
        defines["PARAMETER_SIZE_IS_EVEN"] = "1";
-    if (context.getComputeCapability() >= 3.0 && !context.getUseDoublePrecision())
-        defines["ENABLE_SHUFFLE"] = "1";
    CUmodule program = context.createModule(CudaKernelSources::vectorOps+context.replaceStrings(CudaKernelSources::nonbonded, replacements), defines);
    CUfunction kernel = context.getKernel(program, "computeNonbonded");

--- a/platforms/cuda/src/kernels/nonbonded.cu
+++ b/platforms/cuda/src/kernels/nonbonded.cu
 #define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
+#ifndef ENABLE_SHUFFLE
 typedef struct {
    real x, y, z;
    real q;
@@ -9,6 +10,20 @@ typedef struct {
    real padding;
 #endif
 } AtomData;
+#endif
+//support for 64 bit shuffles
+static __inline__ __device__ float real_shfl(float var, int srcLane) {
+    return __shfl(var, srcLane);
+}
+static __inline__ __device__ double real_shfl(double var, int srcLane) {
+    int hi, lo;
+    asm volatile("mov.b64 { %0, %1 }, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
+    hi = __shfl(hi, srcLane);
+    lo = __shfl(lo, srcLane);
+    return __hiloint2double( hi, lo );
+}
 /**
 * Compute nonbonded interactions. The kernel is separated into two parts,
@@ -40,12 +55,15 @@ typedef struct {
 * t  o 3 4 5 6 7 8 1 2
 * a  p 2 3 4 5 6 7 8 1
 *
- * TODO: Implement shuffle as opposed to using nonbonded. 
- *
 * Tiles without exclusions read off directly from the neighbourlist interactingAtoms
 * and follows the same force accumulation method. If more there are more interactingTiles
 * than the size of the neighbourlist initially allocated, the neighbourlist is rebuilt
- * and the full tileset.
+ * and the full tileset is computed. This should happen on the first step, and very rarely 
+ * afterwards.
+ *
+ * On CUDA devices that support the shuffle intrinsic, on diagonal exclusion tiles use
+ * __shfl to broadcast. For all other types of tiles __shfl is used to pass around the 
+ * forces, positions, and parameters when computing the forces. 
 *
 * [out]forceBuffers    - forces on each atom to eventually be accumulated
 * [out]energyBuffer    - energyBuffer to eventually be accumulated
@@ -83,10 +101,11 @@ extern "C" __global__ void computeNonbonded(
    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // index within the warp
    const unsigned int tbx = threadIdx.x - tgx;           // block warpIndex
    real energy = 0.0f;
+    // used shared memory if the device cannot shuffle
+#ifndef ENABLE_SHUFFLE
    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
+#endif
    // First loop: process tiles that contain exclusions.
    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
@@ -103,16 +122,26 @@ extern "C" __global__ void computeNonbonded(
        const bool hasExclusions = true;
        if (x == y) {
            // This tile is on the diagonal.
+#ifdef ENABLE_SHUFFLE
-            const unsigned int localAtomIndex = threadIdx.x;
+            real4 shflPosq = posq1;
-            localData[localAtomIndex].x = posq1.x;
+#else
-            localData[localAtomIndex].y = posq1.y;
+            localData[threadIdx.x].x = posq1.x;
-            localData[localAtomIndex].z = posq1.z;
+            localData[threadIdx.x].y = posq1.y;
-            localData[localAtomIndex].q = posq1.w;
+            localData[threadIdx.x].z = posq1.z;
+            localData[threadIdx.x].q = posq1.w;
            LOAD_LOCAL_PARAMETERS_FROM_1
+#endif
+            // we do not need to fetch parameters from global since this is a symmetric tile
+            // instead we can broadcast the values using shuffle
            for (unsigned int j = 0; j < TILE_SIZE; j++) {
                int atom2 = tbx+j;
-                real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
+                real4 posq2;
+#ifdef ENABLE_SHUFFLE
+                BROADCAST_WARP_DATA
+#else   
+                posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
+#endif
                real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
 #ifdef USE_PERIODIC
                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
@@ -152,25 +181,35 @@ extern "C" __global__ void computeNonbonded(
        }
        else {
            // This is an off-diagonal tile.
-            const unsigned int localAtomIndex = threadIdx.x;
            unsigned int j = y*TILE_SIZE + tgx;
-            real4 tempPosq = posq[j];
+            real4 shflPosq = posq[j];
-            localData[localAtomIndex].x = tempPosq.x;
+#ifdef ENABLE_SHUFFLE
-            localData[localAtomIndex].y = tempPosq.y;
+            real3 shflForce;
-            localData[localAtomIndex].z = tempPosq.z;
+            shflForce.x = 0.0f;
-            localData[localAtomIndex].q = tempPosq.w;
+            shflForce.y = 0.0f;
+            shflForce.z = 0.0f;
+#else
+            localData[threadIdx.x].x = shflPosq.x;
+            localData[threadIdx.x].y = shflPosq.y;
+            localData[threadIdx.x].z = shflPosq.z;
+            localData[threadIdx.x].q = shflPosq.w;
+            localData[threadIdx.x].fx = 0.0f;
+            localData[threadIdx.x].fy = 0.0f;
+            localData[threadIdx.x].fz = 0.0f;
+#endif
+            DECLARE_LOCAL_PARAMETERS
            LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
-            localData[localAtomIndex].fx = 0.0f;
-            localData[localAtomIndex].fy = 0.0f;
-            localData[localAtomIndex].fz = 0.0f;
 #ifdef USE_EXCLUSIONS
            excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
 #endif
            unsigned int tj = tgx;
            for (j = 0; j < TILE_SIZE; j++) {
                int atom2 = tbx+tj;
+#ifdef ENABLE_SHUFFLE
+                real4 posq2 = shflPosq;
+#else
                real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
+#endif
                real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
 #ifdef USE_PERIODIC
                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
@@ -202,46 +241,64 @@ extern "C" __global__ void computeNonbonded(
                    force.x -= delta.x;
                    force.y -= delta.y;
                    force.z -= delta.z;
+#ifdef ENABLE_SHUFFLE
+                    shflForce.x += delta.x;
+                    shflForce.y += delta.y;
+                    shflForce.z += delta.z;
+#else
                    localData[tbx+tj].fx += delta.x;
                    localData[tbx+tj].fy += delta.y;
                    localData[tbx+tj].fz += delta.z;
-#else
+#endif
+#else // !USE_SYMMETRIC
                    force.x -= dEdR1.x;
                    force.y -= dEdR1.y;
                    force.z -= dEdR1.z;
+#ifdef ENABLE_SHUFFLE
+                    shflForce.x += dEdR2.x;
+                    shflForce.y += dEdR2.y;
+                    shflForce.z += dEdR2.z;
+#else
                    localData[tbx+tj].fx += dEdR2.x;
                    localData[tbx+tj].fy += dEdR2.y;
                    localData[tbx+tj].fz += dEdR2.z;
 #endif 
+#endif // end USE_SYMMETRIC
 #ifdef USE_CUTOFF
                }
 #endif
 #ifdef USE_EXCLUSIONS
                excl >>= 1;
+#endif
+#ifdef ENABLE_SHUFFLE
+                SHUFFLE_WARP_DATA
 #endif
                // cycles the indices
                // 0 1 2 3 4 5 6 7 -> 1 2 3 4 5 6 7 0
                tj = (tj + 1) & (TILE_SIZE - 1);
            }
-        }
+            const unsigned int offset = y*TILE_SIZE + tgx;
+            // write results for off diagonal tiles
-        // Write results.
+#ifdef ENABLE_SHUFFLE
+            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (shflForce.x*0x100000000)));
-        unsigned int offset = x*TILE_SIZE + tgx;
+            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (shflForce.y*0x100000000)));
-        atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
+            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (shflForce.z*0x100000000)));
-        atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
+#else
-        atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
-        if (x != y) {
-            offset = y*TILE_SIZE + tgx;
            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
+#endif
        }
+        // Write results for on and off diagonal tiles
+        const unsigned int offset = x*TILE_SIZE + tgx;
+        atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
+        atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
+        atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
    }
    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
    // of them (no cutoff).
 #ifdef USE_CUTOFF
    const unsigned int numTiles = interactionCount[0];
    int pos = (numTiles > maxTiles ? startTileIndex+warp*numTileIndices/totalWarps : warp*numTiles/totalWarps);
@@ -253,6 +310,8 @@ extern "C" __global__ void computeNonbonded(
 #endif
    int skipBase = 0;
    int currentSkipIndex = tbx;
+    // atomIndices can probably be shuffled as well
+    // but it probably wouldn't make things any faster
    __shared__ int atomIndices[THREAD_BLOCK_SIZE];
    __shared__ volatile int skipTiles[THREAD_BLOCK_SIZE];
    skipTiles[threadIdx.x] = -1;
@@ -263,7 +322,6 @@ extern "C" __global__ void computeNonbonded(
        bool includeTile = true;
        // Extract the coordinates of this tile.
        unsigned int x, y;
        bool singlePeriodicCopy = false;
 #ifdef USE_CUTOFF
@@ -303,46 +361,64 @@ extern "C" __global__ void computeNonbonded(
        }
        if (includeTile) {
            unsigned int atom1 = x*TILE_SIZE + tgx;
            // Load atom data for this tile.
            real4 posq1 = posq[atom1];
            LOAD_ATOM1_PARAMETERS
-            const unsigned int localAtomIndex = threadIdx.x;
+            //const unsigned int localAtomIndex = threadIdx.x;
 #ifdef USE_CUTOFF
            unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
 #else
            unsigned int j = y*TILE_SIZE + tgx;
 #endif
            atomIndices[threadIdx.x] = j;
+#ifdef ENABLE_SHUFFLE
+            DECLARE_LOCAL_PARAMETERS
+            real4 shflPosq;
+            real3 shflForce;
+            shflForce.x = 0.0f;
+            shflForce.y = 0.0f;
+            shflForce.z = 0.0f;
+#endif
            if (j < PADDED_NUM_ATOMS) {
                // Load position of atom j from from global memory
-                real4 tempPosq = posq[j];
+#ifdef ENABLE_SHUFFLE
-                localData[localAtomIndex].x = tempPosq.x;
+                shflPosq = posq[j];
-                localData[localAtomIndex].y = tempPosq.y;
+#else
-                localData[localAtomIndex].z = tempPosq.z;
+                localData[threadIdx.x].x = posq[j].x;
-                localData[localAtomIndex].q = tempPosq.w;
+                localData[threadIdx.x].y = posq[j].y;
+                localData[threadIdx.x].z = posq[j].z;
+                localData[threadIdx.x].q = posq[j].w;
+                localData[threadIdx.x].fx = 0.0f;
+                localData[threadIdx.x].fy = 0.0f;
+                localData[threadIdx.x].fz = 0.0f;
+#endif                
                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
-                localData[localAtomIndex].fx = 0.0f;
-                localData[localAtomIndex].fy = 0.0f;
-                localData[localAtomIndex].fz = 0.0f;
            }
 #ifdef USE_PERIODIC
            if (singlePeriodicCopy) {
                // The box is small enough that we can just translate all the atoms into a single periodic
                // box, then skip having to apply periodic boundary conditions later.
                real4 blockCenterX = blockCenter[x];
                posq1.x -= floor((posq1.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                posq1.y -= floor((posq1.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                posq1.z -= floor((posq1.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
-                localData[localAtomIndex].x -= floor((localData[localAtomIndex].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+#ifdef ENABLE_SHUFFLE
-                localData[localAtomIndex].y -= floor((localData[localAtomIndex].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                shflPosq.x -= floor((shflPosq.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                localData[localAtomIndex].z -= floor((localData[localAtomIndex].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                shflPosq.y -= floor((shflPosq.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                shflPosq.z -= floor((shflPosq.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#else
+                localData[threadIdx.x].x -= floor((localData[threadIdx.x].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                localData[threadIdx.x].y -= floor((localData[threadIdx.x].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                localData[threadIdx.x].z -= floor((localData[threadIdx.x].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
                unsigned int tj = tgx;
                for (j = 0; j < TILE_SIZE; j++) {
                    int atom2 = tbx+tj;
+#ifdef ENABLE_SHUFFLE
+                    real4 posq2 = shflPosq; 
+#else
                    real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
+#endif
                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                    if (r2 < CUTOFF_SQUARED) {
@@ -367,18 +443,34 @@ extern "C" __global__ void computeNonbonded(
                        force.x -= delta.x;
                        force.y -= delta.y;
                        force.z -= delta.z;
+#ifdef ENABLE_SHUFFLE
+                        shflForce.x += delta.x;
+                        shflForce.y += delta.y;
+                        shflForce.z += delta.z;
+#else
                        localData[tbx+tj].fx += delta.x;
                        localData[tbx+tj].fy += delta.y;
                        localData[tbx+tj].fz += delta.z;
-#else
+#endif
+#else // !USE_SYMMETRIC
                        force.x -= dEdR1.x;
                        force.y -= dEdR1.y;
                        force.z -= dEdR1.z;
+#ifdef ENABLE_SHUFFLE
+                        shflForce.x += dEdR2.x;
+                        shflForce.y += dEdR2.y;
+                        shflForce.z += dEdR2.z;
+#else
                        localData[tbx+tj].fx += dEdR2.x;
                        localData[tbx+tj].fy += dEdR2.y;
                        localData[tbx+tj].fz += dEdR2.z;
 #endif 
+#endif // end USE_SYMMETRIC
                    }
+#ifdef ENABLE_SHUFFLE
+                    SHUFFLE_WARP_DATA
+#endif
                    tj = (tj + 1) & (TILE_SIZE - 1);
                }
            }
@@ -386,11 +478,14 @@ extern "C" __global__ void computeNonbonded(
 #endif
            {
                // We need to apply periodic boundary conditions separately for each interaction.
                unsigned int tj = tgx;
                for (j = 0; j < TILE_SIZE; j++) {
                    int atom2 = tbx+tj;
+#ifdef ENABLE_SHUFFLE
+                    real4 posq2 = shflPosq;
+#else
                    real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
+#endif
                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
 #ifdef USE_PERIODIC
                    delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
@@ -422,26 +517,41 @@ extern "C" __global__ void computeNonbonded(
                        force.x -= delta.x;
                        force.y -= delta.y;
                        force.z -= delta.z;
+#ifdef ENABLE_SHUFFLE
+                        shflForce.x += delta.x;
+                        shflForce.y += delta.y;
+                        shflForce.z += delta.z;
+#else
                        localData[tbx+tj].fx += delta.x;
                        localData[tbx+tj].fy += delta.y;
                        localData[tbx+tj].fz += delta.z;
-#else
+#endif
+#else // !USE_SYMMETRIC
                        force.x -= dEdR1.x;
                        force.y -= dEdR1.y;
                        force.z -= dEdR1.z;
+#ifdef ENABLE_SHUFFLE
+                        shflForce.x += dEdR2.x;
+                        shflForce.y += dEdR2.y;
+                        shflForce.z += dEdR2.z;
+#else
                        localData[tbx+tj].fx += dEdR2.x;
                        localData[tbx+tj].fy += dEdR2.y;
                        localData[tbx+tj].fz += dEdR2.z;
 #endif 
+#endif // end USE_SYMMETRIC
 #ifdef USE_CUTOFF
                    }
+#endif
+#ifdef ENABLE_SHUFFLE
+                    SHUFFLE_WARP_DATA
 #endif
                    tj = (tj + 1) & (TILE_SIZE - 1);
                }
            }
            // Write results.
            atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
            atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
            atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
@@ -451,9 +561,15 @@ extern "C" __global__ void computeNonbonded(
            unsigned int atom2 = y*TILE_SIZE + tgx;
 #endif
            if (atom2 < PADDED_NUM_ATOMS) {
+#ifdef ENABLE_SHUFFLE
+                atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (shflForce.x*0x100000000)));
+                atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (shflForce.y*0x100000000)));
+                atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (shflForce.z*0x100000000)));
+#else
                atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
                atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
                atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
+#endif
            }
        }
        pos++;