Commit 08f1039f authored by peastman's avatar peastman
Browse files

Merge pull request #7 from proteneer/master

Nonbonded.cu now uses shuffle intrinsic
parents 7521da4a 8e6c1180
...@@ -416,6 +416,12 @@ void CudaNonbondedUtilities::setAtomBlockRange(double startFraction, double endF ...@@ -416,6 +416,12 @@ void CudaNonbondedUtilities::setAtomBlockRange(double startFraction, double endF
} }
CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, vector<ParameterInfo>& params, vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric) { CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, vector<ParameterInfo>& params, vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric) {
map<string, string> defines;
if (context.getComputeCapability() >= 3.0 && !context.getUseDoublePrecision()) {
defines["ENABLE_SHUFFLE"] = "1";
}
map<string, string> replacements; map<string, string> replacements;
replacements["COMPUTE_INTERACTION"] = source; replacements["COMPUTE_INTERACTION"] = source;
const string suffixes[] = {"x", "y", "z", "w"}; const string suffixes[] = {"x", "y", "z", "w"};
...@@ -445,40 +451,100 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, ...@@ -445,40 +451,100 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
args << arguments[i].getName(); args << arguments[i].getName();
} }
replacements["PARAMETER_ARGUMENTS"] = args.str(); replacements["PARAMETER_ARGUMENTS"] = args.str();
stringstream load1;
for (int i = 0; i < (int) params.size(); i++) {
load1 << params[i].getType();
load1 << " ";
load1 << params[i].getName();
load1 << "1 = global_";
load1 << params[i].getName();
load1 << "[atom1];\n";
}
replacements["LOAD_ATOM1_PARAMETERS"] = load1.str();
bool useShuffle;
if(defines.find("ENABLE_SHUFFLE") != defines.end()) {
useShuffle = true;
} else {
useShuffle = false;
}
// Part 1. Defines for on diagonal exclusion tiles
stringstream loadLocal1; stringstream loadLocal1;
if(useShuffle) {
// not needed if using shuffles as we can directly fetch from register
} else {
for (int i = 0; i < (int) params.size(); i++) { for (int i = 0; i < (int) params.size(); i++) {
if (params[i].getNumComponents() == 1) { if (params[i].getNumComponents() == 1) {
loadLocal1<<"localData[localAtomIndex]."<<params[i].getName()<<" = "<<params[i].getName()<<"1;\n"; loadLocal1<<"localData[threadIdx.x]."<<params[i].getName()<<" = "<<params[i].getName()<<"1;\n";
} }
else { else {
for (int j = 0; j < params[i].getNumComponents(); ++j) for (int j = 0; j < params[i].getNumComponents(); ++j)
loadLocal1<<"localData[localAtomIndex]."<<params[i].getName()<<"_"<<suffixes[j]<<" = "<<params[i].getName()<<"1."<<suffixes[j]<<";\n"; loadLocal1<<"localData[threadIdx.x]."<<params[i].getName()<<"_"<<suffixes[j]<<" = "<<params[i].getName()<<"1."<<suffixes[j]<<";\n";
}
} }
} }
replacements["LOAD_LOCAL_PARAMETERS_FROM_1"] = loadLocal1.str(); replacements["LOAD_LOCAL_PARAMETERS_FROM_1"] = loadLocal1.str();
stringstream broadcastWarpData;
if(useShuffle) {
broadcastWarpData << "posq2.x = real_shfl(shflPosq.x, j);\n";
broadcastWarpData << "posq2.y = real_shfl(shflPosq.y, j);\n";
broadcastWarpData << "posq2.z = real_shfl(shflPosq.z, j);\n";
broadcastWarpData << "posq2.w = real_shfl(shflPosq.w, j);\n";
for(int i=0; i< (int) params.size();i++) {
broadcastWarpData << params[i].getType() << " shfl" << params[i].getName() << ";\n";
for(int j=0; j < params[i].getNumComponents(); j++) {
string name;
if (params[i].getNumComponents() == 1) {
broadcastWarpData << "shfl" << params[i].getName() << "=real_shfl(" << params[i].getName() <<"1,j);\n";
} else {
broadcastWarpData << "shfl" << params[i].getName()+"."+suffixes[j] << "=real_shfl(" << params[i].getName()+"1."+suffixes[j] <<",j);\n";
}
}
}
} else {
// not used if not shuffling
}
replacements["BROADCAST_WARP_DATA"] = broadcastWarpData.str();
// Part 2. Defines for off-diagonal exclusions, and neighborlist tiles.
stringstream declareLocal2;
if(useShuffle) {
for(int i=0; i< (int) params.size(); i++) {
declareLocal2<<params[i].getType()<<" shfl"<<params[i].getName()<<";\n";
}
} else {
// not used if using shared memory
}
replacements["DECLARE_LOCAL_PARAMETERS"] = declareLocal2.str();
stringstream loadLocal2; stringstream loadLocal2;
if(useShuffle) {
for(int i=0; i< (int) params.size(); i++) {
loadLocal2<<"shfl"<<params[i].getName()<<" = global_"<<params[i].getName()<<"[j];\n";
}
} else {
for (int i = 0; i < (int) params.size(); i++) { for (int i = 0; i < (int) params.size(); i++) {
if (params[i].getNumComponents() == 1) { if (params[i].getNumComponents() == 1) {
loadLocal2<<"localData[localAtomIndex]."<<params[i].getName()<<" = global_"<<params[i].getName()<<"[j];\n"; loadLocal2<<"localData[threadIdx.x]."<<params[i].getName()<<" = global_"<<params[i].getName()<<"[j];\n";
} }
else { else {
loadLocal2<<params[i].getType()<<" temp_"<<params[i].getName()<<" = global_"<<params[i].getName()<<"[j];\n"; loadLocal2<<params[i].getType()<<" temp_"<<params[i].getName()<<" = global_"<<params[i].getName()<<"[j];\n";
for (int j = 0; j < params[i].getNumComponents(); ++j) for (int j = 0; j < params[i].getNumComponents(); ++j)
loadLocal2<<"localData[localAtomIndex]."<<params[i].getName()<<"_"<<suffixes[j]<<" = temp_"<<params[i].getName()<<"."<<suffixes[j]<<";\n"; loadLocal2<<"localData[threadIdx.x]."<<params[i].getName()<<"_"<<suffixes[j]<<" = temp_"<<params[i].getName()<<"."<<suffixes[j]<<";\n";
} }
} }
replacements["LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"] = loadLocal2.str();
stringstream load1;
for (int i = 0; i < (int) params.size(); i++) {
load1 << params[i].getType();
load1 << " ";
load1 << params[i].getName();
load1 << "1 = global_";
load1 << params[i].getName();
load1 << "[atom1];\n";
} }
replacements["LOAD_ATOM1_PARAMETERS"] = load1.str(); replacements["LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"] = loadLocal2.str();
stringstream load2j; stringstream load2j;
if(useShuffle) {
for(int i = 0; i < (int) params.size(); i++)
load2j<<params[i].getType()<<" "<<params[i].getName()<<"2 = shfl"<<params[i].getName()<<";\n";
} else {
for (int i = 0; i < (int) params.size(); i++) { for (int i = 0; i < (int) params.size(); i++) {
if (params[i].getNumComponents() == 1) { if (params[i].getNumComponents() == 1) {
load2j<<params[i].getType()<<" "<<params[i].getName()<<"2 = localData[atom2]."<<params[i].getName()<<";\n"; load2j<<params[i].getType()<<" "<<params[i].getName()<<"2 = localData[atom2]."<<params[i].getName()<<";\n";
...@@ -493,8 +559,36 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, ...@@ -493,8 +559,36 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
load2j<<");\n"; load2j<<");\n";
} }
} }
}
replacements["LOAD_ATOM2_PARAMETERS"] = load2j.str(); replacements["LOAD_ATOM2_PARAMETERS"] = load2j.str();
map<string, string> defines;
stringstream shuffleWarpData;
if(useShuffle) {
shuffleWarpData << "shflPosq.x = real_shfl(shflPosq.x, tgx+1);\n";
shuffleWarpData << "shflPosq.y = real_shfl(shflPosq.y, tgx+1);\n";
shuffleWarpData << "shflPosq.z = real_shfl(shflPosq.z, tgx+1);\n";
shuffleWarpData << "shflPosq.w = real_shfl(shflPosq.w, tgx+1);\n";
shuffleWarpData << "shflForce.x = real_shfl(shflForce.x, tgx+1);\n";
shuffleWarpData << "shflForce.y = real_shfl(shflForce.y, tgx+1);\n";
shuffleWarpData << "shflForce.z = real_shfl(shflForce.z, tgx+1);\n";
for(int i=0; i < (int) params.size(); i++) {
if(params[i].getNumComponents() == 1) {
shuffleWarpData<<"shfl"<<params[i].getName()<<"=real_shfl(shfl"<<params[i].getName()<<", tgx+1);\n";
} else {
for(int j=0;j<params[i].getNumComponents();j++) {
// looks something like shflsigmaEpsilon.x = real_shfl(shflsigmaEpsilon.x,tgx+1);
shuffleWarpData<<"shfl"<<params[i].getName()
<<"."<<suffixes[j]<<"=real_shfl(shfl"
<<params[i].getName()<<"."<<suffixes[j]
<<", tgx+1);\n";
}
}
}
} else {
// not used otherwise
}
replacements["SHUFFLE_WARP_DATA"] = shuffleWarpData.str();
if (useCutoff) if (useCutoff)
defines["USE_CUTOFF"] = "1"; defines["USE_CUTOFF"] = "1";
if (usePeriodic) if (usePeriodic)
...@@ -519,8 +613,6 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, ...@@ -519,8 +613,6 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
defines["LAST_EXCLUSION_TILE"] = context.intToString(endExclusionIndex); defines["LAST_EXCLUSION_TILE"] = context.intToString(endExclusionIndex);
if ((localDataSize/4)%2 == 0 && !context.getUseDoublePrecision()) if ((localDataSize/4)%2 == 0 && !context.getUseDoublePrecision())
defines["PARAMETER_SIZE_IS_EVEN"] = "1"; defines["PARAMETER_SIZE_IS_EVEN"] = "1";
if (context.getComputeCapability() >= 3.0 && !context.getUseDoublePrecision())
defines["ENABLE_SHUFFLE"] = "1";
CUmodule program = context.createModule(CudaKernelSources::vectorOps+context.replaceStrings(CudaKernelSources::nonbonded, replacements), defines); CUmodule program = context.createModule(CudaKernelSources::vectorOps+context.replaceStrings(CudaKernelSources::nonbonded, replacements), defines);
CUfunction kernel = context.getKernel(program, "computeNonbonded"); CUfunction kernel = context.getKernel(program, "computeNonbonded");
......
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE) #define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
#ifndef ENABLE_SHUFFLE
typedef struct { typedef struct {
real x, y, z; real x, y, z;
real q; real q;
...@@ -9,6 +10,20 @@ typedef struct { ...@@ -9,6 +10,20 @@ typedef struct {
real padding; real padding;
#endif #endif
} AtomData; } AtomData;
#endif
//support for 64 bit shuffles
static __inline__ __device__ float real_shfl(float var, int srcLane) {
return __shfl(var, srcLane);
}
static __inline__ __device__ double real_shfl(double var, int srcLane) {
int hi, lo;
asm volatile("mov.b64 { %0, %1 }, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
hi = __shfl(hi, srcLane);
lo = __shfl(lo, srcLane);
return __hiloint2double( hi, lo );
}
/** /**
* Compute nonbonded interactions. The kernel is separated into two parts, * Compute nonbonded interactions. The kernel is separated into two parts,
...@@ -40,12 +55,15 @@ typedef struct { ...@@ -40,12 +55,15 @@ typedef struct {
* t o 3 4 5 6 7 8 1 2 * t o 3 4 5 6 7 8 1 2
* a p 2 3 4 5 6 7 8 1 * a p 2 3 4 5 6 7 8 1
* *
* TODO: Implement shuffle as opposed to using nonbonded.
*
* Tiles without exclusions read off directly from the neighbourlist interactingAtoms * Tiles without exclusions read off directly from the neighbourlist interactingAtoms
* and follows the same force accumulation method. If more there are more interactingTiles * and follows the same force accumulation method. If more there are more interactingTiles
* than the size of the neighbourlist initially allocated, the neighbourlist is rebuilt * than the size of the neighbourlist initially allocated, the neighbourlist is rebuilt
* and the full tileset. * and the full tileset is computed. This should happen on the first step, and very rarely
* afterwards.
*
* On CUDA devices that support the shuffle intrinsic, on diagonal exclusion tiles use
* __shfl to broadcast. For all other types of tiles __shfl is used to pass around the
* forces, positions, and parameters when computing the forces.
* *
* [out]forceBuffers - forces on each atom to eventually be accumulated * [out]forceBuffers - forces on each atom to eventually be accumulated
* [out]energyBuffer - energyBuffer to eventually be accumulated * [out]energyBuffer - energyBuffer to eventually be accumulated
...@@ -83,10 +101,11 @@ extern "C" __global__ void computeNonbonded( ...@@ -83,10 +101,11 @@ extern "C" __global__ void computeNonbonded(
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // index within the warp const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // index within the warp
const unsigned int tbx = threadIdx.x - tgx; // block warpIndex const unsigned int tbx = threadIdx.x - tgx; // block warpIndex
real energy = 0.0f; real energy = 0.0f;
// used shared memory if the device cannot shuffle
#ifndef ENABLE_SHUFFLE
__shared__ AtomData localData[THREAD_BLOCK_SIZE]; __shared__ AtomData localData[THREAD_BLOCK_SIZE];
#endif
// First loop: process tiles that contain exclusions. // First loop: process tiles that contain exclusions.
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps; const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps; const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) { for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
...@@ -103,16 +122,26 @@ extern "C" __global__ void computeNonbonded( ...@@ -103,16 +122,26 @@ extern "C" __global__ void computeNonbonded(
const bool hasExclusions = true; const bool hasExclusions = true;
if (x == y) { if (x == y) {
// This tile is on the diagonal. // This tile is on the diagonal.
#ifdef ENABLE_SHUFFLE
const unsigned int localAtomIndex = threadIdx.x; real4 shflPosq = posq1;
localData[localAtomIndex].x = posq1.x; #else
localData[localAtomIndex].y = posq1.y; localData[threadIdx.x].x = posq1.x;
localData[localAtomIndex].z = posq1.z; localData[threadIdx.x].y = posq1.y;
localData[localAtomIndex].q = posq1.w; localData[threadIdx.x].z = posq1.z;
localData[threadIdx.x].q = posq1.w;
LOAD_LOCAL_PARAMETERS_FROM_1 LOAD_LOCAL_PARAMETERS_FROM_1
#endif
// we do not need to fetch parameters from global since this is a symmetric tile
// instead we can broadcast the values using shuffle
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+j; int atom2 = tbx+j;
real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q); real4 posq2;
#ifdef ENABLE_SHUFFLE
BROADCAST_WARP_DATA
#else
posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
#endif
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z); real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
...@@ -152,25 +181,35 @@ extern "C" __global__ void computeNonbonded( ...@@ -152,25 +181,35 @@ extern "C" __global__ void computeNonbonded(
} }
else { else {
// This is an off-diagonal tile. // This is an off-diagonal tile.
const unsigned int localAtomIndex = threadIdx.x;
unsigned int j = y*TILE_SIZE + tgx; unsigned int j = y*TILE_SIZE + tgx;
real4 tempPosq = posq[j]; real4 shflPosq = posq[j];
localData[localAtomIndex].x = tempPosq.x; #ifdef ENABLE_SHUFFLE
localData[localAtomIndex].y = tempPosq.y; real3 shflForce;
localData[localAtomIndex].z = tempPosq.z; shflForce.x = 0.0f;
localData[localAtomIndex].q = tempPosq.w; shflForce.y = 0.0f;
shflForce.z = 0.0f;
#else
localData[threadIdx.x].x = shflPosq.x;
localData[threadIdx.x].y = shflPosq.y;
localData[threadIdx.x].z = shflPosq.z;
localData[threadIdx.x].q = shflPosq.w;
localData[threadIdx.x].fx = 0.0f;
localData[threadIdx.x].fy = 0.0f;
localData[threadIdx.x].fz = 0.0f;
#endif
DECLARE_LOCAL_PARAMETERS
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData[localAtomIndex].fx = 0.0f;
localData[localAtomIndex].fy = 0.0f;
localData[localAtomIndex].fz = 0.0f;
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx)); excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
#endif #endif
unsigned int tj = tgx; unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) { for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj; int atom2 = tbx+tj;
#ifdef ENABLE_SHUFFLE
real4 posq2 = shflPosq;
#else
real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q); real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
#endif
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z); real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
...@@ -202,46 +241,64 @@ extern "C" __global__ void computeNonbonded( ...@@ -202,46 +241,64 @@ extern "C" __global__ void computeNonbonded(
force.x -= delta.x; force.x -= delta.x;
force.y -= delta.y; force.y -= delta.y;
force.z -= delta.z; force.z -= delta.z;
#ifdef ENABLE_SHUFFLE
shflForce.x += delta.x;
shflForce.y += delta.y;
shflForce.z += delta.z;
#else
localData[tbx+tj].fx += delta.x; localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y; localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z; localData[tbx+tj].fz += delta.z;
#else #endif
#else // !USE_SYMMETRIC
force.x -= dEdR1.x; force.x -= dEdR1.x;
force.y -= dEdR1.y; force.y -= dEdR1.y;
force.z -= dEdR1.z; force.z -= dEdR1.z;
#ifdef ENABLE_SHUFFLE
shflForce.x += dEdR2.x;
shflForce.y += dEdR2.y;
shflForce.z += dEdR2.z;
#else
localData[tbx+tj].fx += dEdR2.x; localData[tbx+tj].fx += dEdR2.x;
localData[tbx+tj].fy += dEdR2.y; localData[tbx+tj].fy += dEdR2.y;
localData[tbx+tj].fz += dEdR2.z; localData[tbx+tj].fz += dEdR2.z;
#endif #endif
#endif // end USE_SYMMETRIC
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
} }
#endif #endif
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
excl >>= 1; excl >>= 1;
#endif
#ifdef ENABLE_SHUFFLE
SHUFFLE_WARP_DATA
#endif #endif
// cycles the indices // cycles the indices
// 0 1 2 3 4 5 6 7 -> 1 2 3 4 5 6 7 0 // 0 1 2 3 4 5 6 7 -> 1 2 3 4 5 6 7 0
tj = (tj + 1) & (TILE_SIZE - 1); tj = (tj + 1) & (TILE_SIZE - 1);
} }
} const unsigned int offset = y*TILE_SIZE + tgx;
// write results for off diagonal tiles
// Write results. #ifdef ENABLE_SHUFFLE
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (shflForce.x*0x100000000)));
unsigned int offset = x*TILE_SIZE + tgx; atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (shflForce.y*0x100000000)));
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0x100000000))); atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (shflForce.z*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000))); #else
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
if (x != y) {
offset = y*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000))); atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000))); atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000))); atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
#endif
} }
// Write results for on and off diagonal tiles
const unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
} }
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff). // of them (no cutoff).
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
const unsigned int numTiles = interactionCount[0]; const unsigned int numTiles = interactionCount[0];
int pos = (numTiles > maxTiles ? startTileIndex+warp*numTileIndices/totalWarps : warp*numTiles/totalWarps); int pos = (numTiles > maxTiles ? startTileIndex+warp*numTileIndices/totalWarps : warp*numTiles/totalWarps);
...@@ -253,6 +310,8 @@ extern "C" __global__ void computeNonbonded( ...@@ -253,6 +310,8 @@ extern "C" __global__ void computeNonbonded(
#endif #endif
int skipBase = 0; int skipBase = 0;
int currentSkipIndex = tbx; int currentSkipIndex = tbx;
// atomIndices can probably be shuffled as well
// but it probably wouldn't make things any faster
__shared__ int atomIndices[THREAD_BLOCK_SIZE]; __shared__ int atomIndices[THREAD_BLOCK_SIZE];
__shared__ volatile int skipTiles[THREAD_BLOCK_SIZE]; __shared__ volatile int skipTiles[THREAD_BLOCK_SIZE];
skipTiles[threadIdx.x] = -1; skipTiles[threadIdx.x] = -1;
...@@ -263,7 +322,6 @@ extern "C" __global__ void computeNonbonded( ...@@ -263,7 +322,6 @@ extern "C" __global__ void computeNonbonded(
bool includeTile = true; bool includeTile = true;
// Extract the coordinates of this tile. // Extract the coordinates of this tile.
unsigned int x, y; unsigned int x, y;
bool singlePeriodicCopy = false; bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
...@@ -303,46 +361,64 @@ extern "C" __global__ void computeNonbonded( ...@@ -303,46 +361,64 @@ extern "C" __global__ void computeNonbonded(
} }
if (includeTile) { if (includeTile) {
unsigned int atom1 = x*TILE_SIZE + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
// Load atom data for this tile. // Load atom data for this tile.
real4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
const unsigned int localAtomIndex = threadIdx.x; //const unsigned int localAtomIndex = threadIdx.x;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx); unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
#else #else
unsigned int j = y*TILE_SIZE + tgx; unsigned int j = y*TILE_SIZE + tgx;
#endif #endif
atomIndices[threadIdx.x] = j; atomIndices[threadIdx.x] = j;
#ifdef ENABLE_SHUFFLE
DECLARE_LOCAL_PARAMETERS
real4 shflPosq;
real3 shflForce;
shflForce.x = 0.0f;
shflForce.y = 0.0f;
shflForce.z = 0.0f;
#endif
if (j < PADDED_NUM_ATOMS) { if (j < PADDED_NUM_ATOMS) {
// Load position of atom j from from global memory // Load position of atom j from from global memory
real4 tempPosq = posq[j]; #ifdef ENABLE_SHUFFLE
localData[localAtomIndex].x = tempPosq.x; shflPosq = posq[j];
localData[localAtomIndex].y = tempPosq.y; #else
localData[localAtomIndex].z = tempPosq.z; localData[threadIdx.x].x = posq[j].x;
localData[localAtomIndex].q = tempPosq.w; localData[threadIdx.x].y = posq[j].y;
localData[threadIdx.x].z = posq[j].z;
localData[threadIdx.x].q = posq[j].w;
localData[threadIdx.x].fx = 0.0f;
localData[threadIdx.x].fy = 0.0f;
localData[threadIdx.x].fz = 0.0f;
#endif
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData[localAtomIndex].fx = 0.0f;
localData[localAtomIndex].fy = 0.0f;
localData[localAtomIndex].fz = 0.0f;
} }
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
if (singlePeriodicCopy) { if (singlePeriodicCopy) {
// The box is small enough that we can just translate all the atoms into a single periodic // The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later. // box, then skip having to apply periodic boundary conditions later.
real4 blockCenterX = blockCenter[x]; real4 blockCenterX = blockCenter[x];
posq1.x -= floor((posq1.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; posq1.x -= floor((posq1.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
posq1.y -= floor((posq1.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; posq1.y -= floor((posq1.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
posq1.z -= floor((posq1.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; posq1.z -= floor((posq1.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
localData[localAtomIndex].x -= floor((localData[localAtomIndex].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; #ifdef ENABLE_SHUFFLE
localData[localAtomIndex].y -= floor((localData[localAtomIndex].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; shflPosq.x -= floor((shflPosq.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
localData[localAtomIndex].z -= floor((localData[localAtomIndex].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; shflPosq.y -= floor((shflPosq.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
shflPosq.z -= floor((shflPosq.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#else
localData[threadIdx.x].x -= floor((localData[threadIdx.x].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
localData[threadIdx.x].y -= floor((localData[threadIdx.x].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
localData[threadIdx.x].z -= floor((localData[threadIdx.x].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
unsigned int tj = tgx; unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) { for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj; int atom2 = tbx+tj;
#ifdef ENABLE_SHUFFLE
real4 posq2 = shflPosq;
#else
real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q); real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
#endif
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z); real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
if (r2 < CUTOFF_SQUARED) { if (r2 < CUTOFF_SQUARED) {
...@@ -367,18 +443,34 @@ extern "C" __global__ void computeNonbonded( ...@@ -367,18 +443,34 @@ extern "C" __global__ void computeNonbonded(
force.x -= delta.x; force.x -= delta.x;
force.y -= delta.y; force.y -= delta.y;
force.z -= delta.z; force.z -= delta.z;
#ifdef ENABLE_SHUFFLE
shflForce.x += delta.x;
shflForce.y += delta.y;
shflForce.z += delta.z;
#else
localData[tbx+tj].fx += delta.x; localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y; localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z; localData[tbx+tj].fz += delta.z;
#else #endif
#else // !USE_SYMMETRIC
force.x -= dEdR1.x; force.x -= dEdR1.x;
force.y -= dEdR1.y; force.y -= dEdR1.y;
force.z -= dEdR1.z; force.z -= dEdR1.z;
#ifdef ENABLE_SHUFFLE
shflForce.x += dEdR2.x;
shflForce.y += dEdR2.y;
shflForce.z += dEdR2.z;
#else
localData[tbx+tj].fx += dEdR2.x; localData[tbx+tj].fx += dEdR2.x;
localData[tbx+tj].fy += dEdR2.y; localData[tbx+tj].fy += dEdR2.y;
localData[tbx+tj].fz += dEdR2.z; localData[tbx+tj].fz += dEdR2.z;
#endif #endif
#endif // end USE_SYMMETRIC
} }
#ifdef ENABLE_SHUFFLE
SHUFFLE_WARP_DATA
#endif
tj = (tj + 1) & (TILE_SIZE - 1); tj = (tj + 1) & (TILE_SIZE - 1);
} }
} }
...@@ -386,11 +478,14 @@ extern "C" __global__ void computeNonbonded( ...@@ -386,11 +478,14 @@ extern "C" __global__ void computeNonbonded(
#endif #endif
{ {
// We need to apply periodic boundary conditions separately for each interaction. // We need to apply periodic boundary conditions separately for each interaction.
unsigned int tj = tgx; unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) { for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj; int atom2 = tbx+tj;
#ifdef ENABLE_SHUFFLE
real4 posq2 = shflPosq;
#else
real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q); real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
#endif
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z); real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
...@@ -422,26 +517,41 @@ extern "C" __global__ void computeNonbonded( ...@@ -422,26 +517,41 @@ extern "C" __global__ void computeNonbonded(
force.x -= delta.x; force.x -= delta.x;
force.y -= delta.y; force.y -= delta.y;
force.z -= delta.z; force.z -= delta.z;
#ifdef ENABLE_SHUFFLE
shflForce.x += delta.x;
shflForce.y += delta.y;
shflForce.z += delta.z;
#else
localData[tbx+tj].fx += delta.x; localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y; localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z; localData[tbx+tj].fz += delta.z;
#else #endif
#else // !USE_SYMMETRIC
force.x -= dEdR1.x; force.x -= dEdR1.x;
force.y -= dEdR1.y; force.y -= dEdR1.y;
force.z -= dEdR1.z; force.z -= dEdR1.z;
#ifdef ENABLE_SHUFFLE
shflForce.x += dEdR2.x;
shflForce.y += dEdR2.y;
shflForce.z += dEdR2.z;
#else
localData[tbx+tj].fx += dEdR2.x; localData[tbx+tj].fx += dEdR2.x;
localData[tbx+tj].fy += dEdR2.y; localData[tbx+tj].fy += dEdR2.y;
localData[tbx+tj].fz += dEdR2.z; localData[tbx+tj].fz += dEdR2.z;
#endif #endif
#endif // end USE_SYMMETRIC
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
} }
#endif
#ifdef ENABLE_SHUFFLE
SHUFFLE_WARP_DATA
#endif #endif
tj = (tj + 1) & (TILE_SIZE - 1); tj = (tj + 1) & (TILE_SIZE - 1);
} }
} }
// Write results. // Write results.
atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (force.x*0x100000000))); atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000))); atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000))); atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
...@@ -451,9 +561,15 @@ extern "C" __global__ void computeNonbonded( ...@@ -451,9 +561,15 @@ extern "C" __global__ void computeNonbonded(
unsigned int atom2 = y*TILE_SIZE + tgx; unsigned int atom2 = y*TILE_SIZE + tgx;
#endif #endif
if (atom2 < PADDED_NUM_ATOMS) { if (atom2 < PADDED_NUM_ATOMS) {
#ifdef ENABLE_SHUFFLE
atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (shflForce.x*0x100000000)));
atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (shflForce.y*0x100000000)));
atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (shflForce.z*0x100000000)));
#else
atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000))); atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000))); atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000))); atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
#endif
} }
} }
pos++; pos++;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment