Commit 6943ef5b authored by Yutong Zhao's avatar Yutong Zhao
Browse files

Merge pull request #9 from peastman/master

Bug fixes to use of shuffle
parents 178aa003 8a8873c6
......@@ -416,12 +416,6 @@ void CudaNonbondedUtilities::setAtomBlockRange(double startFraction, double endF
}
CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, vector<ParameterInfo>& params, vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric) {
map<string, string> defines;
if (context.getComputeCapability() >= 3.0 && !context.getUseDoublePrecision()) {
defines["ENABLE_SHUFFLE"] = "1";
}
map<string, string> replacements;
replacements["COMPUTE_INTERACTION"] = source;
const string suffixes[] = {"x", "y", "z", "w"};
......@@ -463,12 +457,7 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
}
replacements["LOAD_ATOM1_PARAMETERS"] = load1.str();
bool useShuffle;
if(defines.find("ENABLE_SHUFFLE") != defines.end()) {
useShuffle = true;
} else {
useShuffle = false;
}
bool useShuffle = (context.getComputeCapability() >= 3.0);
// Part 1. Defines for on diagonal exclusion tiles
stringstream loadLocal1;
......@@ -589,6 +578,7 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
}
replacements["SHUFFLE_WARP_DATA"] = shuffleWarpData.str();
map<string, string> defines;
if (useCutoff)
defines["USE_CUTOFF"] = "1";
if (usePeriodic)
......@@ -597,6 +587,8 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
defines["USE_EXCLUSIONS"] = "1";
if (isSymmetric)
defines["USE_SYMMETRIC"] = "1";
if (useShuffle)
defines["ENABLE_SHUFFLE"] = "1";
defines["THREAD_BLOCK_SIZE"] = context.intToString(forceThreadBlockSize);
defines["CUTOFF_SQUARED"] = context.doubleToString(cutoff*cutoff);
defines["CUTOFF"] = context.doubleToString(cutoff);
......
......@@ -12,18 +12,20 @@ typedef struct {
} AtomData;
#endif
#ifdef ENABLE_SHUFFLE
//support for 64 bit shuffles
static __inline__ __device__ float real_shfl(float var, int srcLane) {
return __shfl(var, srcLane);
}
static __inline__ __device__ double real_shfl(double var, int srcLane) {
int hi, lo;
asm volatile("mov.b64 { %0, %1 }, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
hi = __shfl(hi, srcLane);
lo = __shfl(lo, srcLane);
int hi, lo;
asm volatile("mov.b64 { %0, %1 }, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
hi = __shfl(hi, srcLane);
lo = __shfl(lo, srcLane);
return __hiloint2double( hi, lo );
}
#endif
/**
* Compute nonbonded interactions. The kernel is separated into two parts,
......@@ -32,27 +34,27 @@ static __inline__ __device__ double real_shfl(double var, int srcLane) {
* each of warpsize. Each warp computes a range of tiles.
*
* Tiles with exclusions compute the entire set of interactions across
* atom blocks, equal to warpsize*warpsize. In order to avoid access conflicts
* the forces are computed and accumulated diagonally in the manner shown below
* where, suppose
*
* [a-h] comprise atom block 1, [i-p] comprise atom block 2
*
* 1 denotes the first set of calculations within the warp
* 2 denotes the second set of calculations within the warp
* ... etc.
*
* threads
* 0 1 2 3 4 5 6 7
* atom1
* L a b c d e f g h
* o i 1 2 3 4 5 6 7 8
* c j 8 1 2 3 4 5 6 7
* a k 7 8 1 2 3 4 5 6
* l l 6 7 8 1 2 3 4 5
* D m 5 6 7 8 1 2 3 4
* a n 4 5 6 7 8 1 2 3
* t o 3 4 5 6 7 8 1 2
* atom blocks, equal to warpsize*warpsize. In order to avoid access conflicts
* the forces are computed and accumulated diagonally in the manner shown below
* where, suppose
*
* [a-h] comprise atom block 1, [i-p] comprise atom block 2
*
* 1 denotes the first set of calculations within the warp
* 2 denotes the second set of calculations within the warp
* ... etc.
*
* threads
* 0 1 2 3 4 5 6 7
* atom1
* L a b c d e f g h
* o i 1 2 3 4 5 6 7 8
* c j 8 1 2 3 4 5 6 7
* a k 7 8 1 2 3 4 5 6
* l l 6 7 8 1 2 3 4 5
* D m 5 6 7 8 1 2 3 4
* a n 4 5 6 7 8 1 2 3
* t o 3 4 5 6 7 8 1 2
* a p 2 3 4 5 6 7 8 1
*
* Tiles without exclusions read off directly from the neighbourlist interactingAtoms
......@@ -242,8 +244,8 @@ extern "C" __global__ void computeNonbonded(
force.y -= delta.y;
force.z -= delta.z;
#ifdef ENABLE_SHUFFLE
shflForce.x += delta.x;
shflForce.y += delta.y;
shflForce.x += delta.x;
shflForce.y += delta.y;
shflForce.z += delta.z;
#else
......@@ -256,8 +258,8 @@ extern "C" __global__ void computeNonbonded(
force.y -= dEdR1.y;
force.z -= dEdR1.z;
#ifdef ENABLE_SHUFFLE
shflForce.x += dEdR2.x;
shflForce.y += dEdR2.y;
shflForce.x += dEdR2.x;
shflForce.y += dEdR2.y;
shflForce.z += dEdR2.z;
#else
localData[tbx+tj].fx += dEdR2.x;
......@@ -419,7 +421,7 @@ extern "C" __global__ void computeNonbonded(
#else
real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
#endif
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
if (r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2);
......@@ -444,8 +446,8 @@ extern "C" __global__ void computeNonbonded(
force.y -= delta.y;
force.z -= delta.z;
#ifdef ENABLE_SHUFFLE
shflForce.x += delta.x;
shflForce.y += delta.y;
shflForce.x += delta.x;
shflForce.y += delta.y;
shflForce.z += delta.z;
#else
......@@ -458,8 +460,8 @@ extern "C" __global__ void computeNonbonded(
force.y -= dEdR1.y;
force.z -= dEdR1.z;
#ifdef ENABLE_SHUFFLE
shflForce.x += dEdR2.x;
shflForce.y += dEdR2.y;
shflForce.x += dEdR2.x;
shflForce.y += dEdR2.y;
shflForce.z += dEdR2.z;
#else
localData[tbx+tj].fx += dEdR2.x;
......@@ -518,8 +520,8 @@ extern "C" __global__ void computeNonbonded(
force.y -= delta.y;
force.z -= delta.z;
#ifdef ENABLE_SHUFFLE
shflForce.x += delta.x;
shflForce.y += delta.y;
shflForce.x += delta.x;
shflForce.y += delta.y;
shflForce.z += delta.z;
#else
......@@ -532,8 +534,8 @@ extern "C" __global__ void computeNonbonded(
force.y -= dEdR1.y;
force.z -= dEdR1.z;
#ifdef ENABLE_SHUFFLE
shflForce.x += dEdR2.x;
shflForce.y += dEdR2.y;
shflForce.x += dEdR2.x;
shflForce.y += dEdR2.y;
shflForce.z += dEdR2.z;
#else
localData[tbx+tj].fx += dEdR2.x;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment