Fixed bug on pre-Kepler GPUs. Also simplified code a little bit.

33df8402 · peastman · 178aa003 · 33df8402 · 33df8402
Commit 33df8402 authored May 24, 2013 by peastman
Showing with 44 additions and 50 deletions

platforms/cuda/src/CudaNonbondedUtilities.cpp platforms/cuda/src/CudaNonbondedUtilities.cpp +4 -12

platforms/cuda/src/kernels/nonbonded.cu platforms/cuda/src/kernels/nonbonded.cu +40 -38

No files found.
--- a/platforms/cuda/src/CudaNonbondedUtilities.cpp
+++ b/platforms/cuda/src/CudaNonbondedUtilities.cpp
@@ -416,12 +416,6 @@ void CudaNonbondedUtilities::setAtomBlockRange(double startFraction, double endF
 }
 CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, vector<ParameterInfo>& params, vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric) {
-    map<string, string> defines;
-    if (context.getComputeCapability() >= 3.0 && !context.getUseDoublePrecision()) {
-        defines["ENABLE_SHUFFLE"] = "1";
-    }
    map<string, string> replacements;
    replacements["COMPUTE_INTERACTION"] = source;
    const string suffixes[] = {"x", "y", "z", "w"};
@@ -463,12 +457,7 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
    }
    replacements["LOAD_ATOM1_PARAMETERS"] = load1.str();
-    bool useShuffle;
+    bool useShuffle = (context.getComputeCapability() >= 3.0 && !context.getUseDoublePrecision());
-    if(defines.find("ENABLE_SHUFFLE") != defines.end()) {
-        useShuffle = true;
-    } else {
-        useShuffle = false;
-    }
    // Part 1. Defines for on diagonal exclusion tiles
    stringstream loadLocal1;
@@ -589,6 +578,7 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
    }
    replacements["SHUFFLE_WARP_DATA"] = shuffleWarpData.str();
+    map<string, string> defines;
    if (useCutoff)
        defines["USE_CUTOFF"] = "1";
    if (usePeriodic)
@@ -597,6 +587,8 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
        defines["USE_EXCLUSIONS"] = "1";
    if (isSymmetric)
        defines["USE_SYMMETRIC"] = "1";
+    if (useShuffle)
+        defines["ENABLE_SHUFFLE"] = "1";
    defines["THREAD_BLOCK_SIZE"] = context.intToString(forceThreadBlockSize);
    defines["CUTOFF_SQUARED"] = context.doubleToString(cutoff*cutoff);
    defines["CUTOFF"] = context.doubleToString(cutoff);

--- a/platforms/cuda/src/kernels/nonbonded.cu
+++ b/platforms/cuda/src/kernels/nonbonded.cu
@@ -12,18 +12,20 @@ typedef struct {
 } AtomData;
 #endif
+#ifdef ENABLE_SHUFFLE
 //support for 64 bit shuffles
 static __inline__ __device__ float real_shfl(float var, int srcLane) {
    return __shfl(var, srcLane);
 }
 static __inline__ __device__ double real_shfl(double var, int srcLane) {
    int hi, lo;
    asm volatile("mov.b64 { %0, %1 }, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
    hi = __shfl(hi, srcLane);
    lo = __shfl(lo, srcLane);
    return __hiloint2double( hi, lo );
 }
+#endif
 /**
 * Compute nonbonded interactions. The kernel is separated into two parts,
@@ -32,27 +34,27 @@ static __inline__ __device__ double real_shfl(double var, int srcLane) {
 * each of warpsize. Each warp computes a range of tiles.
 * 
 * Tiles with exclusions compute the entire set of interactions across
 * atom blocks, equal to warpsize*warpsize. In order to avoid access conflicts 
 * the forces are computed and accumulated diagonally in the manner shown below
 * where, suppose
 *
 * [a-h] comprise atom block 1, [i-p] comprise atom block 2
 *
 * 1 denotes the first set of calculations within the warp
 * 2 denotes the second set of calculations within the warp
 * ... etc.
 * 
 *        threads
 *     0 1 2 3 4 5 6 7
 *         atom1 
 * L    a b c d e f g h 
 * o  i 1 2 3 4 5 6 7 8
 * c  j 8 1 2 3 4 5 6 7
 * a  k 7 8 1 2 3 4 5 6
 * l  l 6 7 8 1 2 3 4 5
 * D  m 5 6 7 8 1 2 3 4 
 * a  n 4 5 6 7 8 1 2 3
 * t  o 3 4 5 6 7 8 1 2
 * a  p 2 3 4 5 6 7 8 1
 *
 * Tiles without exclusions read off directly from the neighbourlist interactingAtoms
@@ -242,8 +244,8 @@ extern "C" __global__ void computeNonbonded(
                    force.y -= delta.y;
                    force.z -= delta.z;
 #ifdef ENABLE_SHUFFLE
                    shflForce.x += delta.x;
                    shflForce.y += delta.y;
                    shflForce.z += delta.z;
 #else
@@ -256,8 +258,8 @@ extern "C" __global__ void computeNonbonded(
                    force.y -= dEdR1.y;
                    force.z -= dEdR1.z;
 #ifdef ENABLE_SHUFFLE
                    shflForce.x += dEdR2.x;
                    shflForce.y += dEdR2.y;
                    shflForce.z += dEdR2.z;
 #else
                    localData[tbx+tj].fx += dEdR2.x;
@@ -419,7 +421,7 @@ extern "C" __global__ void computeNonbonded(
 #else
                    real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
 #endif
                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                    if (r2 < CUTOFF_SQUARED) {
                        real invR = RSQRT(r2);
@@ -444,8 +446,8 @@ extern "C" __global__ void computeNonbonded(
                        force.y -= delta.y;
                        force.z -= delta.z;
 #ifdef ENABLE_SHUFFLE
                        shflForce.x += delta.x;
                        shflForce.y += delta.y;
                        shflForce.z += delta.z;
 #else
@@ -458,8 +460,8 @@ extern "C" __global__ void computeNonbonded(
                        force.y -= dEdR1.y;
                        force.z -= dEdR1.z;
 #ifdef ENABLE_SHUFFLE
                        shflForce.x += dEdR2.x;
                        shflForce.y += dEdR2.y;
                        shflForce.z += dEdR2.z;
 #else
                        localData[tbx+tj].fx += dEdR2.x;
@@ -518,8 +520,8 @@ extern "C" __global__ void computeNonbonded(
                        force.y -= delta.y;
                        force.z -= delta.z;
 #ifdef ENABLE_SHUFFLE
                        shflForce.x += delta.x;
                        shflForce.y += delta.y;
                        shflForce.z += delta.z;
 #else
@@ -532,8 +534,8 @@ extern "C" __global__ void computeNonbonded(
                        force.y -= dEdR1.y;
                        force.z -= dEdR1.z;
 #ifdef ENABLE_SHUFFLE
                        shflForce.x += dEdR2.x;
                        shflForce.y += dEdR2.y;
                        shflForce.z += dEdR2.z;
 #else
                        localData[tbx+tj].fx += dEdR2.x;