Commit 6943ef5b authored by Yutong Zhao's avatar Yutong Zhao
Browse files

Merge pull request #9 from peastman/master

Bug fixes to use of shuffle
parents 178aa003 8a8873c6
...@@ -416,12 +416,6 @@ void CudaNonbondedUtilities::setAtomBlockRange(double startFraction, double endF ...@@ -416,12 +416,6 @@ void CudaNonbondedUtilities::setAtomBlockRange(double startFraction, double endF
} }
CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, vector<ParameterInfo>& params, vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric) { CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, vector<ParameterInfo>& params, vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric) {
map<string, string> defines;
if (context.getComputeCapability() >= 3.0 && !context.getUseDoublePrecision()) {
defines["ENABLE_SHUFFLE"] = "1";
}
map<string, string> replacements; map<string, string> replacements;
replacements["COMPUTE_INTERACTION"] = source; replacements["COMPUTE_INTERACTION"] = source;
const string suffixes[] = {"x", "y", "z", "w"}; const string suffixes[] = {"x", "y", "z", "w"};
...@@ -463,12 +457,7 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, ...@@ -463,12 +457,7 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
} }
replacements["LOAD_ATOM1_PARAMETERS"] = load1.str(); replacements["LOAD_ATOM1_PARAMETERS"] = load1.str();
bool useShuffle; bool useShuffle = (context.getComputeCapability() >= 3.0);
if(defines.find("ENABLE_SHUFFLE") != defines.end()) {
useShuffle = true;
} else {
useShuffle = false;
}
// Part 1. Defines for on diagonal exclusion tiles // Part 1. Defines for on diagonal exclusion tiles
stringstream loadLocal1; stringstream loadLocal1;
...@@ -589,6 +578,7 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, ...@@ -589,6 +578,7 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
} }
replacements["SHUFFLE_WARP_DATA"] = shuffleWarpData.str(); replacements["SHUFFLE_WARP_DATA"] = shuffleWarpData.str();
map<string, string> defines;
if (useCutoff) if (useCutoff)
defines["USE_CUTOFF"] = "1"; defines["USE_CUTOFF"] = "1";
if (usePeriodic) if (usePeriodic)
...@@ -597,6 +587,8 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, ...@@ -597,6 +587,8 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
defines["USE_EXCLUSIONS"] = "1"; defines["USE_EXCLUSIONS"] = "1";
if (isSymmetric) if (isSymmetric)
defines["USE_SYMMETRIC"] = "1"; defines["USE_SYMMETRIC"] = "1";
if (useShuffle)
defines["ENABLE_SHUFFLE"] = "1";
defines["THREAD_BLOCK_SIZE"] = context.intToString(forceThreadBlockSize); defines["THREAD_BLOCK_SIZE"] = context.intToString(forceThreadBlockSize);
defines["CUTOFF_SQUARED"] = context.doubleToString(cutoff*cutoff); defines["CUTOFF_SQUARED"] = context.doubleToString(cutoff*cutoff);
defines["CUTOFF"] = context.doubleToString(cutoff); defines["CUTOFF"] = context.doubleToString(cutoff);
......
...@@ -12,18 +12,20 @@ typedef struct { ...@@ -12,18 +12,20 @@ typedef struct {
} AtomData; } AtomData;
#endif #endif
#ifdef ENABLE_SHUFFLE
//support for 64 bit shuffles //support for 64 bit shuffles
static __inline__ __device__ float real_shfl(float var, int srcLane) { static __inline__ __device__ float real_shfl(float var, int srcLane) {
return __shfl(var, srcLane); return __shfl(var, srcLane);
} }
static __inline__ __device__ double real_shfl(double var, int srcLane) { static __inline__ __device__ double real_shfl(double var, int srcLane) {
int hi, lo; int hi, lo;
asm volatile("mov.b64 { %0, %1 }, %2;" : "=r"(lo), "=r"(hi) : "d"(var)); asm volatile("mov.b64 { %0, %1 }, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
hi = __shfl(hi, srcLane); hi = __shfl(hi, srcLane);
lo = __shfl(lo, srcLane); lo = __shfl(lo, srcLane);
return __hiloint2double( hi, lo ); return __hiloint2double( hi, lo );
} }
#endif
/** /**
* Compute nonbonded interactions. The kernel is separated into two parts, * Compute nonbonded interactions. The kernel is separated into two parts,
...@@ -32,27 +34,27 @@ static __inline__ __device__ double real_shfl(double var, int srcLane) { ...@@ -32,27 +34,27 @@ static __inline__ __device__ double real_shfl(double var, int srcLane) {
* each of warpsize. Each warp computes a range of tiles. * each of warpsize. Each warp computes a range of tiles.
* *
* Tiles with exclusions compute the entire set of interactions across * Tiles with exclusions compute the entire set of interactions across
* atom blocks, equal to warpsize*warpsize. In order to avoid access conflicts * atom blocks, equal to warpsize*warpsize. In order to avoid access conflicts
* the forces are computed and accumulated diagonally in the manner shown below * the forces are computed and accumulated diagonally in the manner shown below
* where, suppose * where, suppose
* *
* [a-h] comprise atom block 1, [i-p] comprise atom block 2 * [a-h] comprise atom block 1, [i-p] comprise atom block 2
* *
* 1 denotes the first set of calculations within the warp * 1 denotes the first set of calculations within the warp
* 2 denotes the second set of calculations within the warp * 2 denotes the second set of calculations within the warp
* ... etc. * ... etc.
* *
* threads * threads
* 0 1 2 3 4 5 6 7 * 0 1 2 3 4 5 6 7
* atom1 * atom1
* L a b c d e f g h * L a b c d e f g h
* o i 1 2 3 4 5 6 7 8 * o i 1 2 3 4 5 6 7 8
* c j 8 1 2 3 4 5 6 7 * c j 8 1 2 3 4 5 6 7
* a k 7 8 1 2 3 4 5 6 * a k 7 8 1 2 3 4 5 6
* l l 6 7 8 1 2 3 4 5 * l l 6 7 8 1 2 3 4 5
* D m 5 6 7 8 1 2 3 4 * D m 5 6 7 8 1 2 3 4
* a n 4 5 6 7 8 1 2 3 * a n 4 5 6 7 8 1 2 3
* t o 3 4 5 6 7 8 1 2 * t o 3 4 5 6 7 8 1 2
* a p 2 3 4 5 6 7 8 1 * a p 2 3 4 5 6 7 8 1
* *
* Tiles without exclusions read off directly from the neighbourlist interactingAtoms * Tiles without exclusions read off directly from the neighbourlist interactingAtoms
...@@ -242,8 +244,8 @@ extern "C" __global__ void computeNonbonded( ...@@ -242,8 +244,8 @@ extern "C" __global__ void computeNonbonded(
force.y -= delta.y; force.y -= delta.y;
force.z -= delta.z; force.z -= delta.z;
#ifdef ENABLE_SHUFFLE #ifdef ENABLE_SHUFFLE
shflForce.x += delta.x; shflForce.x += delta.x;
shflForce.y += delta.y; shflForce.y += delta.y;
shflForce.z += delta.z; shflForce.z += delta.z;
#else #else
...@@ -256,8 +258,8 @@ extern "C" __global__ void computeNonbonded( ...@@ -256,8 +258,8 @@ extern "C" __global__ void computeNonbonded(
force.y -= dEdR1.y; force.y -= dEdR1.y;
force.z -= dEdR1.z; force.z -= dEdR1.z;
#ifdef ENABLE_SHUFFLE #ifdef ENABLE_SHUFFLE
shflForce.x += dEdR2.x; shflForce.x += dEdR2.x;
shflForce.y += dEdR2.y; shflForce.y += dEdR2.y;
shflForce.z += dEdR2.z; shflForce.z += dEdR2.z;
#else #else
localData[tbx+tj].fx += dEdR2.x; localData[tbx+tj].fx += dEdR2.x;
...@@ -419,7 +421,7 @@ extern "C" __global__ void computeNonbonded( ...@@ -419,7 +421,7 @@ extern "C" __global__ void computeNonbonded(
#else #else
real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q); real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
#endif #endif
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z); real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
if (r2 < CUTOFF_SQUARED) { if (r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2); real invR = RSQRT(r2);
...@@ -444,8 +446,8 @@ extern "C" __global__ void computeNonbonded( ...@@ -444,8 +446,8 @@ extern "C" __global__ void computeNonbonded(
force.y -= delta.y; force.y -= delta.y;
force.z -= delta.z; force.z -= delta.z;
#ifdef ENABLE_SHUFFLE #ifdef ENABLE_SHUFFLE
shflForce.x += delta.x; shflForce.x += delta.x;
shflForce.y += delta.y; shflForce.y += delta.y;
shflForce.z += delta.z; shflForce.z += delta.z;
#else #else
...@@ -458,8 +460,8 @@ extern "C" __global__ void computeNonbonded( ...@@ -458,8 +460,8 @@ extern "C" __global__ void computeNonbonded(
force.y -= dEdR1.y; force.y -= dEdR1.y;
force.z -= dEdR1.z; force.z -= dEdR1.z;
#ifdef ENABLE_SHUFFLE #ifdef ENABLE_SHUFFLE
shflForce.x += dEdR2.x; shflForce.x += dEdR2.x;
shflForce.y += dEdR2.y; shflForce.y += dEdR2.y;
shflForce.z += dEdR2.z; shflForce.z += dEdR2.z;
#else #else
localData[tbx+tj].fx += dEdR2.x; localData[tbx+tj].fx += dEdR2.x;
...@@ -518,8 +520,8 @@ extern "C" __global__ void computeNonbonded( ...@@ -518,8 +520,8 @@ extern "C" __global__ void computeNonbonded(
force.y -= delta.y; force.y -= delta.y;
force.z -= delta.z; force.z -= delta.z;
#ifdef ENABLE_SHUFFLE #ifdef ENABLE_SHUFFLE
shflForce.x += delta.x; shflForce.x += delta.x;
shflForce.y += delta.y; shflForce.y += delta.y;
shflForce.z += delta.z; shflForce.z += delta.z;
#else #else
...@@ -532,8 +534,8 @@ extern "C" __global__ void computeNonbonded( ...@@ -532,8 +534,8 @@ extern "C" __global__ void computeNonbonded(
force.y -= dEdR1.y; force.y -= dEdR1.y;
force.z -= dEdR1.z; force.z -= dEdR1.z;
#ifdef ENABLE_SHUFFLE #ifdef ENABLE_SHUFFLE
shflForce.x += dEdR2.x; shflForce.x += dEdR2.x;
shflForce.y += dEdR2.y; shflForce.y += dEdR2.y;
shflForce.z += dEdR2.z; shflForce.z += dEdR2.z;
#else #else
localData[tbx+tj].fx += dEdR2.x; localData[tbx+tj].fx += dEdR2.x;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment