Merge pull request #9 from peastman/master

Bug fixes to use of shuffle

Merge pull request #9 from peastman/master
Bug fixes to use of shuffle
6943ef5b · Yutong Zhao · 178aa003 · 8a8873c6 · 6943ef5b · 6943ef5b
Commit 6943ef5b authored May 24, 2013 by Yutong Zhao
Showing with 44 additions and 50 deletions

platforms/cuda/src/CudaNonbondedUtilities.cpp platforms/cuda/src/CudaNonbondedUtilities.cpp +4 -12

platforms/cuda/src/kernels/nonbonded.cu platforms/cuda/src/kernels/nonbonded.cu +40 -38

No files found.
--- a/platforms/cuda/src/CudaNonbondedUtilities.cpp
+++ b/platforms/cuda/src/CudaNonbondedUtilities.cpp
@@ -416,12 +416,6 @@ void CudaNonbondedUtilities::setAtomBlockRange(double startFraction, double endF
 }

 CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, vector<ParameterInfo>& params, vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric) {
-    
-    map<string, string> defines;
-    if (context.getComputeCapability() >= 3.0 && !context.getUseDoublePrecision()) {
-        defines["ENABLE_SHUFFLE"] = "1";
-    }
-
    map<string, string> replacements;
    replacements["COMPUTE_INTERACTION"] = source;
    const string suffixes[] = {"x", "y", "z", "w"};
@@ -463,12 +457,7 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
    }
    replacements["LOAD_ATOM1_PARAMETERS"] = load1.str();

-    bool useShuffle;
-    if(defines.find("ENABLE_SHUFFLE") != defines.end()) {
-        useShuffle = true;
-    } else {
-        useShuffle = false;
-    }
+    bool useShuffle = (context.getComputeCapability() >= 3.0);

    // Part 1. Defines for on diagonal exclusion tiles
    stringstream loadLocal1;
@@ -589,6 +578,7 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
    }
    replacements["SHUFFLE_WARP_DATA"] = shuffleWarpData.str();

+    map<string, string> defines;
    if (useCutoff)
        defines["USE_CUTOFF"] = "1";
    if (usePeriodic)
@@ -597,6 +587,8 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
        defines["USE_EXCLUSIONS"] = "1";
    if (isSymmetric)
        defines["USE_SYMMETRIC"] = "1";
+    if (useShuffle)
+        defines["ENABLE_SHUFFLE"] = "1";
    defines["THREAD_BLOCK_SIZE"] = context.intToString(forceThreadBlockSize);
    defines["CUTOFF_SQUARED"] = context.doubleToString(cutoff*cutoff);
    defines["CUTOFF"] = context.doubleToString(cutoff);

--- a/platforms/cuda/src/kernels/nonbonded.cu
+++ b/platforms/cuda/src/kernels/nonbonded.cu
@@ -12,18 +12,20 @@ typedef struct {
 } AtomData;
 #endif

+#ifdef ENABLE_SHUFFLE
 //support for 64 bit shuffles
 static __inline__ __device__ float real_shfl(float var, int srcLane) {
    return __shfl(var, srcLane);
 }

 static __inline__ __device__ double real_shfl(double var, int srcLane) {
-    int hi, lo;
-    asm volatile("mov.b64 { %0, %1 }, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
-    hi = __shfl(hi, srcLane);
-    lo = __shfl(lo, srcLane);
+    int hi, lo;
+    asm volatile("mov.b64 { %0, %1 }, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
+    hi = __shfl(hi, srcLane);
+    lo = __shfl(lo, srcLane);
    return __hiloint2double( hi, lo );
 }
+#endif

 /**
 * Compute nonbonded interactions. The kernel is separated into two parts,
@@ -32,27 +34,27 @@ static __inline__ __device__ double real_shfl(double var, int srcLane) {
 * each of warpsize. Each warp computes a range of tiles.
 * 
 * Tiles with exclusions compute the entire set of interactions across
- * atom blocks, equal to warpsize*warpsize. In order to avoid access conflicts 
- * the forces are computed and accumulated diagonally in the manner shown below
- * where, suppose
- *
- * [a-h] comprise atom block 1, [i-p] comprise atom block 2
- *
- * 1 denotes the first set of calculations within the warp
- * 2 denotes the second set of calculations within the warp
- * ... etc.
- * 
- *        threads
- *     0 1 2 3 4 5 6 7
- *         atom1 
- * L    a b c d e f g h 
- * o  i 1 2 3 4 5 6 7 8
- * c  j 8 1 2 3 4 5 6 7
- * a  k 7 8 1 2 3 4 5 6
- * l  l 6 7 8 1 2 3 4 5
- * D  m 5 6 7 8 1 2 3 4 
- * a  n 4 5 6 7 8 1 2 3
- * t  o 3 4 5 6 7 8 1 2
+ * atom blocks, equal to warpsize*warpsize. In order to avoid access conflicts 
+ * the forces are computed and accumulated diagonally in the manner shown below
+ * where, suppose
+ *
+ * [a-h] comprise atom block 1, [i-p] comprise atom block 2
+ *
+ * 1 denotes the first set of calculations within the warp
+ * 2 denotes the second set of calculations within the warp
+ * ... etc.
+ * 
+ *        threads
+ *     0 1 2 3 4 5 6 7
+ *         atom1 
+ * L    a b c d e f g h 
+ * o  i 1 2 3 4 5 6 7 8
+ * c  j 8 1 2 3 4 5 6 7
+ * a  k 7 8 1 2 3 4 5 6
+ * l  l 6 7 8 1 2 3 4 5
+ * D  m 5 6 7 8 1 2 3 4 
+ * a  n 4 5 6 7 8 1 2 3
+ * t  o 3 4 5 6 7 8 1 2
 * a  p 2 3 4 5 6 7 8 1
 *
 * Tiles without exclusions read off directly from the neighbourlist interactingAtoms
@@ -242,8 +244,8 @@ extern "C" __global__ void computeNonbonded(
                    force.y -= delta.y;
                    force.z -= delta.z;
 #ifdef ENABLE_SHUFFLE
-                    shflForce.x += delta.x;
-                    shflForce.y += delta.y;
+                    shflForce.x += delta.x;
+                    shflForce.y += delta.y;
                    shflForce.z += delta.z;

 #else
@@ -256,8 +258,8 @@ extern "C" __global__ void computeNonbonded(
                    force.y -= dEdR1.y;
                    force.z -= dEdR1.z;
 #ifdef ENABLE_SHUFFLE
-                    shflForce.x += dEdR2.x;
-                    shflForce.y += dEdR2.y;
+                    shflForce.x += dEdR2.x;
+                    shflForce.y += dEdR2.y;
                    shflForce.z += dEdR2.z;
 #else
                    localData[tbx+tj].fx += dEdR2.x;
@@ -419,7 +421,7 @@ extern "C" __global__ void computeNonbonded(
 #else
                    real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
 #endif
-                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                    if (r2 < CUTOFF_SQUARED) {
                        real invR = RSQRT(r2);
@@ -444,8 +446,8 @@ extern "C" __global__ void computeNonbonded(
                        force.y -= delta.y;
                        force.z -= delta.z;
 #ifdef ENABLE_SHUFFLE
-                        shflForce.x += delta.x;
-                        shflForce.y += delta.y;
+                        shflForce.x += delta.x;
+                        shflForce.y += delta.y;
                        shflForce.z += delta.z;

 #else
@@ -458,8 +460,8 @@ extern "C" __global__ void computeNonbonded(
                        force.y -= dEdR1.y;
                        force.z -= dEdR1.z;
 #ifdef ENABLE_SHUFFLE
-                        shflForce.x += dEdR2.x;
-                        shflForce.y += dEdR2.y;
+                        shflForce.x += dEdR2.x;
+                        shflForce.y += dEdR2.y;
                        shflForce.z += dEdR2.z;
 #else
                        localData[tbx+tj].fx += dEdR2.x;
@@ -518,8 +520,8 @@ extern "C" __global__ void computeNonbonded(
                        force.y -= delta.y;
                        force.z -= delta.z;
 #ifdef ENABLE_SHUFFLE
-                        shflForce.x += delta.x;
-                        shflForce.y += delta.y;
+                        shflForce.x += delta.x;
+                        shflForce.y += delta.y;
                        shflForce.z += delta.z;

 #else
@@ -532,8 +534,8 @@ extern "C" __global__ void computeNonbonded(
                        force.y -= dEdR1.y;
                        force.z -= dEdR1.z;
 #ifdef ENABLE_SHUFFLE
-                        shflForce.x += dEdR2.x;
-                        shflForce.y += dEdR2.y;
+                        shflForce.x += dEdR2.x;
+                        shflForce.y += dEdR2.y;
                        shflForce.z += dEdR2.z;
 #else
                        localData[tbx+tj].fx += dEdR2.x;