Unverified Commit c7af17c8 authored by Peter Eastman's avatar Peter Eastman Committed by GitHub
Browse files

Removed code for CUDA devices without shuffle (#3528)

parent ae1b02dc
...@@ -556,95 +556,38 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, ...@@ -556,95 +556,38 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
} }
replacements["LOAD_ATOM1_PARAMETERS"] = load1.str(); replacements["LOAD_ATOM1_PARAMETERS"] = load1.str();
int cudaVersion;
cuDriverGetVersion(&cudaVersion);
bool useShuffle = (context.getComputeCapability() >= 3.0 && cudaVersion >= 5050);
// Part 1. Defines for on diagonal exclusion tiles // Part 1. Defines for on diagonal exclusion tiles
stringstream loadLocal1;
if(useShuffle) {
// not needed if using shuffles as we can directly fetch from register
} else {
for (const ParameterInfo& param : params) {
if (param.getNumComponents() == 1)
loadLocal1<<"localData[LOCAL_ID]."<<param.getName()<<" = "<<param.getName()<<"1;\n";
else {
for (int j = 0; j < param.getNumComponents(); ++j)
loadLocal1<<"localData[LOCAL_ID]."<<param.getName()<<"_"<<suffixes[j]<<" = "<<param.getName()<<"1."<<suffixes[j]<<";\n";
}
}
}
replacements["LOAD_LOCAL_PARAMETERS_FROM_1"] = loadLocal1.str();
stringstream broadcastWarpData; stringstream broadcastWarpData;
if (useShuffle) { broadcastWarpData << "posq2.x = real_shfl(shflPosq.x, j);\n";
broadcastWarpData << "posq2.x = real_shfl(shflPosq.x, j);\n"; broadcastWarpData << "posq2.y = real_shfl(shflPosq.y, j);\n";
broadcastWarpData << "posq2.y = real_shfl(shflPosq.y, j);\n"; broadcastWarpData << "posq2.z = real_shfl(shflPosq.z, j);\n";
broadcastWarpData << "posq2.z = real_shfl(shflPosq.z, j);\n"; broadcastWarpData << "posq2.w = real_shfl(shflPosq.w, j);\n";
broadcastWarpData << "posq2.w = real_shfl(shflPosq.w, j);\n"; for (const ParameterInfo& param : params) {
for (const ParameterInfo& param : params) { broadcastWarpData << param.getType() << " shfl" << param.getName() << ";\n";
broadcastWarpData << param.getType() << " shfl" << param.getName() << ";\n"; for (int j = 0; j < param.getNumComponents(); j++) {
for (int j = 0; j < param.getNumComponents(); j++) { if (param.getNumComponents() == 1)
if (param.getNumComponents() == 1) broadcastWarpData << "shfl" << param.getName() << "=real_shfl(" << param.getName() <<"1,j);\n";
broadcastWarpData << "shfl" << param.getName() << "=real_shfl(" << param.getName() <<"1,j);\n"; else
else broadcastWarpData << "shfl" << param.getName()+"."+suffixes[j] << "=real_shfl(" << param.getName()+"1."+suffixes[j] <<",j);\n";
broadcastWarpData << "shfl" << param.getName()+"."+suffixes[j] << "=real_shfl(" << param.getName()+"1."+suffixes[j] <<",j);\n";
}
} }
} else {
// not used if not shuffling
} }
replacements["BROADCAST_WARP_DATA"] = broadcastWarpData.str(); replacements["BROADCAST_WARP_DATA"] = broadcastWarpData.str();
// Part 2. Defines for off-diagonal exclusions, and neighborlist tiles. // Part 2. Defines for off-diagonal exclusions, and neighborlist tiles.
stringstream declareLocal2; stringstream declareLocal2;
if (useShuffle) { for (const ParameterInfo& param : params)
for (const ParameterInfo& param : params) declareLocal2<<param.getType()<<" shfl"<<param.getName()<<";\n";
declareLocal2<<param.getType()<<" shfl"<<param.getName()<<";\n";
}
else {
// not used if using shared memory
}
replacements["DECLARE_LOCAL_PARAMETERS"] = declareLocal2.str(); replacements["DECLARE_LOCAL_PARAMETERS"] = declareLocal2.str();
stringstream loadLocal2; stringstream loadLocal2;
if (useShuffle) { for (const ParameterInfo& param : params)
for (const ParameterInfo& param : params) loadLocal2<<"shfl"<<param.getName()<<" = global_"<<param.getName()<<"[j];\n";
loadLocal2<<"shfl"<<param.getName()<<" = global_"<<param.getName()<<"[j];\n";
}
else {
for (const ParameterInfo& param : params) {
if (param.getNumComponents() == 1)
loadLocal2<<"localData[LOCAL_ID]."<<param.getName()<<" = global_"<<param.getName()<<"[j];\n";
else {
loadLocal2<<param.getType()<<" temp_"<<param.getName()<<" = global_"<<param.getName()<<"[j];\n";
for (int j = 0; j < param.getNumComponents(); ++j)
loadLocal2<<"localData[LOCAL_ID]."<<param.getName()<<"_"<<suffixes[j]<<" = temp_"<<param.getName()<<"."<<suffixes[j]<<";\n";
}
}
}
replacements["LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"] = loadLocal2.str(); replacements["LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"] = loadLocal2.str();
stringstream load2j; stringstream load2j;
if (useShuffle) { for (const ParameterInfo& param : params)
for (const ParameterInfo& param : params) load2j<<param.getType()<<" "<<param.getName()<<"2 = shfl"<<param.getName()<<";\n";
load2j<<param.getType()<<" "<<param.getName()<<"2 = shfl"<<param.getName()<<";\n";
}
else {
for (const ParameterInfo& param : params) {
if (param.getNumComponents() == 1)
load2j<<param.getType()<<" "<<param.getName()<<"2 = localData[atom2]."<<param.getName()<<";\n";
else {
load2j<<param.getType()<<" "<<param.getName()<<"2 = make_"<<param.getType()<<"(";
for (int j = 0; j < param.getNumComponents(); ++j) {
if (j > 0)
load2j<<", ";
load2j<<"localData[atom2]."<<param.getName()<<"_"<<suffixes[j];
}
load2j<<");\n";
}
}
}
replacements["LOAD_ATOM2_PARAMETERS"] = load2j.str(); replacements["LOAD_ATOM2_PARAMETERS"] = load2j.str();
stringstream load2g; stringstream load2g;
...@@ -654,11 +597,7 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, ...@@ -654,11 +597,7 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
stringstream clearLocal; stringstream clearLocal;
for (const ParameterInfo& param : params) { for (const ParameterInfo& param : params) {
if (useShuffle) clearLocal<<"shfl"<<param.getName()<<" = ";
clearLocal<<"shfl";
else
clearLocal<<"localData[atom2].";
clearLocal<<param.getName()<<" = ";
if (param.getNumComponents() == 1) if (param.getNumComponents() == 1)
clearLocal<<"0;\n"; clearLocal<<"0;\n";
else else
...@@ -680,29 +619,25 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, ...@@ -680,29 +619,25 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
replacements["SAVE_DERIVATIVES"] = saveDerivs.str(); replacements["SAVE_DERIVATIVES"] = saveDerivs.str();
stringstream shuffleWarpData; stringstream shuffleWarpData;
if(useShuffle) { shuffleWarpData << "shflPosq.x = real_shfl(shflPosq.x, tgx+1);\n";
shuffleWarpData << "shflPosq.x = real_shfl(shflPosq.x, tgx+1);\n"; shuffleWarpData << "shflPosq.y = real_shfl(shflPosq.y, tgx+1);\n";
shuffleWarpData << "shflPosq.y = real_shfl(shflPosq.y, tgx+1);\n"; shuffleWarpData << "shflPosq.z = real_shfl(shflPosq.z, tgx+1);\n";
shuffleWarpData << "shflPosq.z = real_shfl(shflPosq.z, tgx+1);\n"; shuffleWarpData << "shflPosq.w = real_shfl(shflPosq.w, tgx+1);\n";
shuffleWarpData << "shflPosq.w = real_shfl(shflPosq.w, tgx+1);\n"; shuffleWarpData << "shflForce.x = real_shfl(shflForce.x, tgx+1);\n";
shuffleWarpData << "shflForce.x = real_shfl(shflForce.x, tgx+1);\n"; shuffleWarpData << "shflForce.y = real_shfl(shflForce.y, tgx+1);\n";
shuffleWarpData << "shflForce.y = real_shfl(shflForce.y, tgx+1);\n"; shuffleWarpData << "shflForce.z = real_shfl(shflForce.z, tgx+1);\n";
shuffleWarpData << "shflForce.z = real_shfl(shflForce.z, tgx+1);\n"; for (const ParameterInfo& param : params) {
for (const ParameterInfo& param : params) { if (param.getNumComponents() == 1)
if (param.getNumComponents() == 1) shuffleWarpData<<"shfl"<<param.getName()<<"=real_shfl(shfl"<<param.getName()<<", tgx+1);\n";
shuffleWarpData<<"shfl"<<param.getName()<<"=real_shfl(shfl"<<param.getName()<<", tgx+1);\n"; else {
else { for (int j = 0; j < param.getNumComponents(); j++) {
for (int j = 0; j < param.getNumComponents(); j++) { // looks something like shflsigmaEpsilon.x = real_shfl(shflsigmaEpsilon.x,tgx+1);
// looks something like shflsigmaEpsilon.x = real_shfl(shflsigmaEpsilon.x,tgx+1); shuffleWarpData<<"shfl"<<param.getName()
shuffleWarpData<<"shfl"<<param.getName() <<"."<<suffixes[j]<<"=real_shfl(shfl"
<<"."<<suffixes[j]<<"=real_shfl(shfl" <<param.getName()<<"."<<suffixes[j]
<<param.getName()<<"."<<suffixes[j] <<", tgx+1);\n";
<<", tgx+1);\n";
}
} }
} }
} else {
// not used otherwise
} }
replacements["SHUFFLE_WARP_DATA"] = shuffleWarpData.str(); replacements["SHUFFLE_WARP_DATA"] = shuffleWarpData.str();
...@@ -715,8 +650,7 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, ...@@ -715,8 +650,7 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
defines["USE_EXCLUSIONS"] = "1"; defines["USE_EXCLUSIONS"] = "1";
if (isSymmetric) if (isSymmetric)
defines["USE_SYMMETRIC"] = "1"; defines["USE_SYMMETRIC"] = "1";
if (useShuffle) defines["ENABLE_SHUFFLE"] = "1";
defines["ENABLE_SHUFFLE"] = "1";
if (includeForces) if (includeForces)
defines["INCLUDE_FORCES"] = "1"; defines["INCLUDE_FORCES"] = "1";
if (includeEnergy) if (includeEnergy)
......
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE) #define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
#ifndef ENABLE_SHUFFLE
typedef struct {
real x, y, z;
real q;
real fx, fy, fz;
ATOM_PARAMETER_DATA
#ifndef PARAMETER_SIZE_IS_EVEN
real padding;
#endif
} AtomData;
#endif
#ifdef ENABLE_SHUFFLE
//support for 64 bit shuffles //support for 64 bit shuffles
static __inline__ __device__ float real_shfl(float var, int srcLane) { static __inline__ __device__ float real_shfl(float var, int srcLane) {
return SHFL(var, srcLane); return SHFL(var, srcLane);
...@@ -39,7 +26,6 @@ static __inline__ __device__ long long real_shfl(long long var, int srcLane) { ...@@ -39,7 +26,6 @@ static __inline__ __device__ long long real_shfl(long long var, int srcLane) {
int2 fuse; fuse.x = lo; fuse.y = hi; int2 fuse; fuse.x = lo; fuse.y = hi;
return *reinterpret_cast<long long*>(&fuse); return *reinterpret_cast<long long*>(&fuse);
} }
#endif
/** /**
* Save the force on a single atom. * Save the force on a single atom.
...@@ -131,10 +117,6 @@ extern "C" __global__ void computeNonbonded( ...@@ -131,10 +117,6 @@ extern "C" __global__ void computeNonbonded(
const unsigned int tbx = threadIdx.x - tgx; // block warpIndex const unsigned int tbx = threadIdx.x - tgx; // block warpIndex
mixed energy = 0; mixed energy = 0;
INIT_DERIVATIVES INIT_DERIVATIVES
// used shared memory if the device cannot shuffle
#ifndef ENABLE_SHUFFLE
__shared__ AtomData localData[THREAD_BLOCK_SIZE];
#endif
// First loop: process tiles that contain exclusions. // First loop: process tiles that contain exclusions.
...@@ -154,26 +136,14 @@ extern "C" __global__ void computeNonbonded( ...@@ -154,26 +136,14 @@ extern "C" __global__ void computeNonbonded(
const bool hasExclusions = true; const bool hasExclusions = true;
if (x == y) { if (x == y) {
// This tile is on the diagonal. // This tile is on the diagonal.
#ifdef ENABLE_SHUFFLE
real4 shflPosq = posq1; real4 shflPosq = posq1;
#else
localData[threadIdx.x].x = posq1.x;
localData[threadIdx.x].y = posq1.y;
localData[threadIdx.x].z = posq1.z;
localData[threadIdx.x].q = posq1.w;
LOAD_LOCAL_PARAMETERS_FROM_1
#endif
// we do not need to fetch parameters from global since this is a symmetric tile // we do not need to fetch parameters from global since this is a symmetric tile
// instead we can broadcast the values using shuffle // instead we can broadcast the values using shuffle
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+j; int atom2 = tbx+j;
real4 posq2; real4 posq2;
#ifdef ENABLE_SHUFFLE
BROADCAST_WARP_DATA BROADCAST_WARP_DATA
#else
posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
#endif
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z); real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
...@@ -216,20 +186,10 @@ extern "C" __global__ void computeNonbonded( ...@@ -216,20 +186,10 @@ extern "C" __global__ void computeNonbonded(
// This is an off-diagonal tile. // This is an off-diagonal tile.
unsigned int j = y*TILE_SIZE + tgx; unsigned int j = y*TILE_SIZE + tgx;
real4 shflPosq = posq[j]; real4 shflPosq = posq[j];
#ifdef ENABLE_SHUFFLE
real3 shflForce; real3 shflForce;
shflForce.x = 0.0f; shflForce.x = 0.0f;
shflForce.y = 0.0f; shflForce.y = 0.0f;
shflForce.z = 0.0f; shflForce.z = 0.0f;
#else
localData[threadIdx.x].x = shflPosq.x;
localData[threadIdx.x].y = shflPosq.y;
localData[threadIdx.x].z = shflPosq.z;
localData[threadIdx.x].q = shflPosq.w;
localData[threadIdx.x].fx = 0.0f;
localData[threadIdx.x].fy = 0.0f;
localData[threadIdx.x].fz = 0.0f;
#endif
DECLARE_LOCAL_PARAMETERS DECLARE_LOCAL_PARAMETERS
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
...@@ -238,11 +198,7 @@ extern "C" __global__ void computeNonbonded( ...@@ -238,11 +198,7 @@ extern "C" __global__ void computeNonbonded(
unsigned int tj = tgx; unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) { for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj; int atom2 = tbx+tj;
#ifdef ENABLE_SHUFFLE
real4 posq2 = shflPosq; real4 posq2 = shflPosq;
#else
real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
#endif
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z); real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
...@@ -271,34 +227,19 @@ extern "C" __global__ void computeNonbonded( ...@@ -271,34 +227,19 @@ extern "C" __global__ void computeNonbonded(
force.x -= delta.x; force.x -= delta.x;
force.y -= delta.y; force.y -= delta.y;
force.z -= delta.z; force.z -= delta.z;
#ifdef ENABLE_SHUFFLE
shflForce.x += delta.x; shflForce.x += delta.x;
shflForce.y += delta.y; shflForce.y += delta.y;
shflForce.z += delta.z; shflForce.z += delta.z;
#else
localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z;
#endif
#else // !USE_SYMMETRIC #else // !USE_SYMMETRIC
force.x -= dEdR1.x; force.x -= dEdR1.x;
force.y -= dEdR1.y; force.y -= dEdR1.y;
force.z -= dEdR1.z; force.z -= dEdR1.z;
#ifdef ENABLE_SHUFFLE
shflForce.x += dEdR2.x; shflForce.x += dEdR2.x;
shflForce.y += dEdR2.y; shflForce.y += dEdR2.y;
shflForce.z += dEdR2.z; shflForce.z += dEdR2.z;
#else
localData[tbx+tj].fx += dEdR2.x;
localData[tbx+tj].fy += dEdR2.y;
localData[tbx+tj].fz += dEdR2.z;
#endif
#endif // end USE_SYMMETRIC #endif // end USE_SYMMETRIC
#endif #endif
#ifdef ENABLE_SHUFFLE
SHUFFLE_WARP_DATA SHUFFLE_WARP_DATA
#endif
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
excl >>= 1; excl >>= 1;
#endif #endif
...@@ -309,15 +250,9 @@ extern "C" __global__ void computeNonbonded( ...@@ -309,15 +250,9 @@ extern "C" __global__ void computeNonbonded(
const unsigned int offset = y*TILE_SIZE + tgx; const unsigned int offset = y*TILE_SIZE + tgx;
// write results for off diagonal tiles // write results for off diagonal tiles
#ifdef INCLUDE_FORCES #ifdef INCLUDE_FORCES
#ifdef ENABLE_SHUFFLE
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>(realToFixedPoint(shflForce.x))); atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>(realToFixedPoint(shflForce.x)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(shflForce.y))); atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(shflForce.y)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(shflForce.z))); atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(shflForce.z)));
#else
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>(realToFixedPoint(localData[threadIdx.x].fx)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(localData[threadIdx.x].fy)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(localData[threadIdx.x].fz)));
#endif
#endif #endif
} }
// Write results for on and off diagonal tiles // Write results for on and off diagonal tiles
...@@ -399,37 +334,19 @@ extern "C" __global__ void computeNonbonded( ...@@ -399,37 +334,19 @@ extern "C" __global__ void computeNonbonded(
unsigned int j = y*TILE_SIZE + tgx; unsigned int j = y*TILE_SIZE + tgx;
#endif #endif
atomIndices[threadIdx.x] = j; atomIndices[threadIdx.x] = j;
#ifdef ENABLE_SHUFFLE
DECLARE_LOCAL_PARAMETERS DECLARE_LOCAL_PARAMETERS
real4 shflPosq; real4 shflPosq;
real3 shflForce; real3 shflForce;
shflForce.x = 0.0f; shflForce.x = 0.0f;
shflForce.y = 0.0f; shflForce.y = 0.0f;
shflForce.z = 0.0f; shflForce.z = 0.0f;
#endif
if (j < PADDED_NUM_ATOMS) { if (j < PADDED_NUM_ATOMS) {
// Load position of atom j from from global memory // Load position of atom j from from global memory
#ifdef ENABLE_SHUFFLE
shflPosq = posq[j]; shflPosq = posq[j];
#else
localData[threadIdx.x].x = posq[j].x;
localData[threadIdx.x].y = posq[j].y;
localData[threadIdx.x].z = posq[j].z;
localData[threadIdx.x].q = posq[j].w;
localData[threadIdx.x].fx = 0.0f;
localData[threadIdx.x].fy = 0.0f;
localData[threadIdx.x].fz = 0.0f;
#endif
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
} }
else { else {
#ifdef ENABLE_SHUFFLE
shflPosq = make_real4(0, 0, 0, 0); shflPosq = make_real4(0, 0, 0, 0);
#else
localData[threadIdx.x].x = 0;
localData[threadIdx.x].y = 0;
localData[threadIdx.x].z = 0;
#endif
CLEAR_LOCAL_PARAMETERS CLEAR_LOCAL_PARAMETERS
} }
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
...@@ -438,19 +355,11 @@ extern "C" __global__ void computeNonbonded( ...@@ -438,19 +355,11 @@ extern "C" __global__ void computeNonbonded(
// box, then skip having to apply periodic boundary conditions later. // box, then skip having to apply periodic boundary conditions later.
real4 blockCenterX = blockCenter[x]; real4 blockCenterX = blockCenter[x];
APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX) APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
#ifdef ENABLE_SHUFFLE
APPLY_PERIODIC_TO_POS_WITH_CENTER(shflPosq, blockCenterX) APPLY_PERIODIC_TO_POS_WITH_CENTER(shflPosq, blockCenterX)
#else
APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[threadIdx.x], blockCenterX)
#endif
unsigned int tj = tgx; unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) { for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj; int atom2 = tbx+tj;
#ifdef ENABLE_SHUFFLE
real4 posq2 = shflPosq; real4 posq2 = shflPosq;
#else
real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
#endif
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z); real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real invR = RSQRT(r2); real invR = RSQRT(r2);
...@@ -476,34 +385,19 @@ extern "C" __global__ void computeNonbonded( ...@@ -476,34 +385,19 @@ extern "C" __global__ void computeNonbonded(
force.x -= delta.x; force.x -= delta.x;
force.y -= delta.y; force.y -= delta.y;
force.z -= delta.z; force.z -= delta.z;
#ifdef ENABLE_SHUFFLE
shflForce.x += delta.x; shflForce.x += delta.x;
shflForce.y += delta.y; shflForce.y += delta.y;
shflForce.z += delta.z; shflForce.z += delta.z;
#else
localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z;
#endif
#else // !USE_SYMMETRIC #else // !USE_SYMMETRIC
force.x -= dEdR1.x; force.x -= dEdR1.x;
force.y -= dEdR1.y; force.y -= dEdR1.y;
force.z -= dEdR1.z; force.z -= dEdR1.z;
#ifdef ENABLE_SHUFFLE
shflForce.x += dEdR2.x; shflForce.x += dEdR2.x;
shflForce.y += dEdR2.y; shflForce.y += dEdR2.y;
shflForce.z += dEdR2.z; shflForce.z += dEdR2.z;
#else
localData[tbx+tj].fx += dEdR2.x;
localData[tbx+tj].fy += dEdR2.y;
localData[tbx+tj].fz += dEdR2.z;
#endif
#endif // end USE_SYMMETRIC #endif // end USE_SYMMETRIC
#endif #endif
#ifdef ENABLE_SHUFFLE
SHUFFLE_WARP_DATA SHUFFLE_WARP_DATA
#endif
tj = (tj + 1) & (TILE_SIZE - 1); tj = (tj + 1) & (TILE_SIZE - 1);
} }
} }
...@@ -514,11 +408,7 @@ extern "C" __global__ void computeNonbonded( ...@@ -514,11 +408,7 @@ extern "C" __global__ void computeNonbonded(
unsigned int tj = tgx; unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) { for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj; int atom2 = tbx+tj;
#ifdef ENABLE_SHUFFLE
real4 posq2 = shflPosq; real4 posq2 = shflPosq;
#else
real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
#endif
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z); real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
...@@ -547,34 +437,19 @@ extern "C" __global__ void computeNonbonded( ...@@ -547,34 +437,19 @@ extern "C" __global__ void computeNonbonded(
force.x -= delta.x; force.x -= delta.x;
force.y -= delta.y; force.y -= delta.y;
force.z -= delta.z; force.z -= delta.z;
#ifdef ENABLE_SHUFFLE
shflForce.x += delta.x; shflForce.x += delta.x;
shflForce.y += delta.y; shflForce.y += delta.y;
shflForce.z += delta.z; shflForce.z += delta.z;
#else
localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z;
#endif
#else // !USE_SYMMETRIC #else // !USE_SYMMETRIC
force.x -= dEdR1.x; force.x -= dEdR1.x;
force.y -= dEdR1.y; force.y -= dEdR1.y;
force.z -= dEdR1.z; force.z -= dEdR1.z;
#ifdef ENABLE_SHUFFLE
shflForce.x += dEdR2.x; shflForce.x += dEdR2.x;
shflForce.y += dEdR2.y; shflForce.y += dEdR2.y;
shflForce.z += dEdR2.z; shflForce.z += dEdR2.z;
#else
localData[tbx+tj].fx += dEdR2.x;
localData[tbx+tj].fy += dEdR2.y;
localData[tbx+tj].fz += dEdR2.z;
#endif
#endif // end USE_SYMMETRIC #endif // end USE_SYMMETRIC
#endif #endif
#ifdef ENABLE_SHUFFLE
SHUFFLE_WARP_DATA SHUFFLE_WARP_DATA
#endif
tj = (tj + 1) & (TILE_SIZE - 1); tj = (tj + 1) & (TILE_SIZE - 1);
} }
} }
...@@ -590,15 +465,9 @@ extern "C" __global__ void computeNonbonded( ...@@ -590,15 +465,9 @@ extern "C" __global__ void computeNonbonded(
unsigned int atom2 = y*TILE_SIZE + tgx; unsigned int atom2 = y*TILE_SIZE + tgx;
#endif #endif
if (atom2 < PADDED_NUM_ATOMS) { if (atom2 < PADDED_NUM_ATOMS) {
#ifdef ENABLE_SHUFFLE
atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>(realToFixedPoint(shflForce.x))); atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>(realToFixedPoint(shflForce.x)));
atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(shflForce.y))); atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(shflForce.y)));
atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(shflForce.z))); atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(shflForce.z)));
#else
atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>(realToFixedPoint(localData[threadIdx.x].fx)));
atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(localData[threadIdx.x].fy)));
atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(localData[threadIdx.x].fz)));
#endif
} }
#endif #endif
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment