Unverified Commit e581f42b authored by Peter Eastman's avatar Peter Eastman Committed by GitHub
Browse files

Minor optimizations to computing single pairs (#3494)

* Minor optimizations to computing single pairs

* Adjusted MAX_BITS_FOR_PAIRS on Ampere
parent 3b934387
......@@ -499,7 +499,7 @@ void CudaNonbondedUtilities::createKernelsForGroups(int groups) {
if (context.getBoxIsTriclinic())
defines["TRICLINIC"] = "1";
defines["MAX_EXCLUSIONS"] = context.intToString(maxExclusions);
defines["MAX_BITS_FOR_PAIRS"] = (canUsePairList ? (context.getComputeCapability() < 8.0 ? "2" : "4") : "0");
defines["MAX_BITS_FOR_PAIRS"] = (canUsePairList ? (context.getComputeCapability() < 8.0 ? "2" : "3") : "0");
CUmodule interactingBlocksProgram = context.createModule(CudaKernelSources::vectorOps+CudaKernelSources::findInteractingBlocks, defines);
kernels.findBlockBoundsKernel = context.getKernel(interactingBlocksProgram, "findBlockBounds");
kernels.sortBoxDataKernel = context.getKernel(interactingBlocksProgram, "sortBoxData");
......@@ -646,6 +646,11 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
}
}
replacements["LOAD_ATOM2_PARAMETERS"] = load2j.str();
stringstream load2g;
for (const ParameterInfo& param : params)
load2g<<param.getType()<<" "<<param.getName()<<"2 = global_"<<param.getName()<<"[atom2];\n";
replacements["LOAD_ATOM2_PARAMETERS_FROM_GLOBAL"] = load2g.str();
stringstream clearLocal;
for (const ParameterInfo& param : params) {
......
......@@ -41,6 +41,18 @@ static __inline__ __device__ long long real_shfl(long long var, int srcLane) {
}
#endif
/**
* Save the force on a single atom.
*/
__device__ void saveSingleForce(int atom, real3 force, unsigned long long* forceBuffers) {
if (force.x != 0)
atomicAdd(&forceBuffers[atom], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
if (force.y != 0)
atomicAdd(&forceBuffers[atom+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
if (force.z != 0)
atomicAdd(&forceBuffers[atom+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
}
/**
* Compute nonbonded interactions. The kernel is separated into two parts,
* tiles with exclusions and tiles without exclusions. It relies heavily on
......@@ -329,14 +341,14 @@ extern "C" __global__ void computeNonbonded(
#else
int pos = (int) (startTileIndex+warp*numTileIndices/totalWarps);
int end = (int) (startTileIndex+(warp+1)*numTileIndices/totalWarps);
#endif
int skipBase = 0;
int currentSkipIndex = tbx;
__shared__ volatile int skipTiles[THREAD_BLOCK_SIZE];
skipTiles[threadIdx.x] = -1;
#endif
// atomIndices can probably be shuffled as well
// but it probably wouldn't make things any faster
__shared__ int atomIndices[THREAD_BLOCK_SIZE];
__shared__ volatile int skipTiles[THREAD_BLOCK_SIZE];
skipTiles[threadIdx.x] = -1;
while (pos < end) {
const bool hasExclusions = false;
......@@ -381,7 +393,6 @@ extern "C" __global__ void computeNonbonded(
// Load atom data for this tile.
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
//const unsigned int localAtomIndex = threadIdx.x;
#ifdef USE_CUTOFF
unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
#else
......@@ -607,12 +618,7 @@ extern "C" __global__ void computeNonbonded(
real4 posq1 = posq[atom1];
real4 posq2 = posq[atom2];
LOAD_ATOM1_PARAMETERS
int j = atom2;
atom2 = threadIdx.x;
DECLARE_LOCAL_PARAMETERS
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
LOAD_ATOM2_PARAMETERS
atom2 = pair.y;
LOAD_ATOM2_PARAMETERS_FROM_GLOBAL
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
......@@ -637,12 +643,8 @@ extern "C" __global__ void computeNonbonded(
real3 dEdR1 = delta*dEdR;
real3 dEdR2 = -dEdR1;
#endif
atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (-dEdR1.x*0x100000000)));
atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-dEdR1.y*0x100000000)));
atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-dEdR1.z*0x100000000)));
atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (-dEdR2.x*0x100000000)));
atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-dEdR2.y*0x100000000)));
atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-dEdR2.z*0x100000000)));
saveSingleForce(atom1, -dEdR1, forceBuffers);
saveSingleForce(atom2, -dEdR2, forceBuffers);
#endif
}
#endif
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment