Commit 6d82a920 authored by Peter Eastman's avatar Peter Eastman
Browse files

Limit thread block size to avoid exceeding available shared memory

parent ff7f1035
......@@ -493,6 +493,19 @@ void CudaContext::executeKernel(CUfunction kernel, void** arguments, int threads
}
}
int CudaContext::computeThreadBlockSize(double memory, bool preferShared) const {
int maxShared = 16*1024;
if (computeCapability >= 2.0 && preferShared)
maxShared = 48*1024;
int max = (int) (maxShared/memory);
if (max < 64)
return 32;
int threads = 64;
while (threads+64 < max)
threads += 64;
return threads;
}
void CudaContext::clearBuffer(CudaArray& array) {
clearBuffer(array.getDevicePointer(), array.getSize()*array.getElementSize());
}
......
......@@ -226,6 +226,14 @@ public:
* @param sharedSize the amount of dynamic shared memory to allocated for the kernel, in bytes
*/
void executeKernel(CUfunction kernel, void** arguments, int workUnits, int blockSize = -1, unsigned int sharedSize = 0);
/**
* Compute the largest thread block size that can be used for a kernel that requires a particular amount of
* shared memory per thread.
*
* @param memory the number of bytes of shared memory per thread
* @param preferShared whether the kernel is set to prefer shared memory over cache
*/
int computeThreadBlockSize(double memory, bool preferShared=true) const;
/**
* Set all elements of an array to 0.
*/
......
......@@ -408,6 +408,7 @@ void CudaCalcAmoebaStretchBendForceKernel::initialize(const System& system, cons
}
double CudaCalcAmoebaStretchBendForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
return 0.0;
}
/* -------------------------------------------------------------------------- *
......@@ -841,10 +842,15 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
// Create the kernels.
bool useShuffle = (cu.getComputeCapability() >= 3.0 && !cu.getUseDoublePrecision());
double fixedThreadMemory = 19*elementSize+2*sizeof(float)+3*sizeof(int)/(double) cu.TileSize;
double inducedThreadMemory = 15*elementSize+2*sizeof(float);
double electrostaticsThreadMemory = 0;
if (useShuffle)
fixedThreadMemory += 3*elementSize;
map<string, string> defines;
defines["NUM_ATOMS"] = cu.intToString(numMultipoles);
defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
defines["THREAD_BLOCK_SIZE"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize());
defines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks());
defines["ENERGY_SCALE_FACTOR"] = cu.doubleToString(138.9354558456/innerDielectric);
if (force.getPolarizationType() == AmoebaMultipoleForce::Direct)
......@@ -881,15 +887,22 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
defines["GK_FC"] = cu.doubleToString(1*(1-solventDielectric)/(0+1*solventDielectric));
defines["GK_FD"] = cu.doubleToString(2*(1-solventDielectric)/(1+2*solventDielectric));
defines["GK_FQ"] = cu.doubleToString(3*(1-solventDielectric)/(2+3*solventDielectric));
fixedThreadMemory += 4*elementSize;
inducedThreadMemory += 13*elementSize;
}
int maxThreads = cu.getNonbondedUtilities().getForceThreadBlockSize();
fixedFieldThreads = min(maxThreads, cu.computeThreadBlockSize(fixedThreadMemory));
inducedFieldThreads = min(maxThreads, cu.computeThreadBlockSize(inducedThreadMemory));
CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipoles, defines);
computeMomentsKernel = cu.getKernel(module, "computeLabFrameMoments");
recordInducedDipolesKernel = cu.getKernel(module, "recordInducedDipoles");
mapTorqueKernel = cu.getKernel(module, "mapTorqueToForce");
computePotentialKernel = cu.getKernel(module, "computePotentialAtPoints");
defines["THREAD_BLOCK_SIZE"] = cu.intToString(fixedFieldThreads);
module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipoleFixedField, defines);
computeFixedFieldKernel = cu.getKernel(module, "computeFixedField");
if (maxInducedIterations > 0) {
defines["THREAD_BLOCK_SIZE"] = cu.intToString(inducedFieldThreads);
module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipoleInducedField, defines);
computeInducedFieldKernel = cu.getKernel(module, "computeInducedField");
updateInducedFieldKernel = cu.getKernel(module, "updateInducedFieldBySOR");
......@@ -901,6 +914,9 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
electrostaticsSource << CudaAmoebaKernelSources::pmeElectrostaticPairForce;
electrostaticsSource << "#define APPLY_SCALE\n";
electrostaticsSource << CudaAmoebaKernelSources::pmeElectrostaticPairForce;
electrostaticsThreadMemory = 24*elementSize+3*sizeof(float)+3*sizeof(int)/(double) cu.TileSize;
if (useShuffle)
electrostaticsThreadMemory += 3*elementSize;
}
else {
electrostaticsSource << CudaKernelSources::vectorOps;
......@@ -913,7 +929,14 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
electrostaticsSource << "#undef T1\n";
electrostaticsSource << "#define T3\n";
electrostaticsSource << CudaAmoebaKernelSources::electrostaticPairForce;
}
electrostaticsThreadMemory = 21*elementSize+2*sizeof(float)+3*sizeof(int)/(double) cu.TileSize;
if (useShuffle)
electrostaticsThreadMemory += 3*elementSize;
if (gk != NULL)
electrostaticsThreadMemory += 4*elementSize;
}
electrostaticsThreads = min(maxThreads, cu.computeThreadBlockSize(electrostaticsThreadMemory));
defines["THREAD_BLOCK_SIZE"] = cu.intToString(electrostaticsThreads);
module = cu.createModule(electrostaticsSource.str(), defines);
electrostaticsKernel = cu.getKernel(module, "computeElectrostatics");
......@@ -1151,7 +1174,6 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
int startTileIndex = nb.getStartTileIndex();
int numTileIndices = nb.getNumTiles();
int numForceThreadBlocks = nb.getNumForceThreadBlocks();
int forceThreadBlockSize = nb.getForceThreadBlockSize();
int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
void* npt = NULL;
if (pmeGrid == NULL) {
......@@ -1162,7 +1184,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
&nb.getExclusionIndices().getDevicePointer(), &nb.getExclusionRowIndices().getDevicePointer(),
&covalentFlags->getDevicePointer(), &polarizationGroupFlags->getDevicePointer(), &startTileIndex, &numTileIndices,
&labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &dampingAndThole->getDevicePointer()};
cu.executeKernel(computeFixedFieldKernel, computeFixedFieldArgs, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
cu.executeKernel(computeFixedFieldKernel, computeFixedFieldArgs, numForceThreadBlocks*fixedFieldThreads, fixedFieldThreads);
void* recordInducedDipolesArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(),
&inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &polarizability->getDevicePointer()};
cu.executeKernel(recordInducedDipolesKernel, recordInducedDipolesArgs, cu.getNumAtoms());
......@@ -1174,7 +1196,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
&covalentFlags->getDevicePointer(), &polarizationGroupFlags->getDevicePointer(), &startTileIndex, &numTileIndices,
&gkKernel->getBornRadii()->getDevicePointer(), &gkKernel->getField()->getDevicePointer(),
&labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &dampingAndThole->getDevicePointer()};
cu.executeKernel(computeFixedFieldKernel, computeFixedFieldArgs, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
cu.executeKernel(computeFixedFieldKernel, computeFixedFieldArgs, numForceThreadBlocks*fixedFieldThreads, fixedFieldThreads);
void* recordInducedDipolesArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(),
&gkKernel->getField()->getDevicePointer(), &gkKernel->getInducedDipoles()->getDevicePointer(),
&gkKernel->getInducedDipolesPolar()->getDevicePointer(), &inducedDipole->getDevicePointer(),
......@@ -1192,7 +1214,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
void* computeInducedFieldArgs[] = {&inducedField->getDevicePointer(), &inducedFieldPolar->getDevicePointer(), &cu.getPosq().getDevicePointer(),
&inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &startTileIndex, &numTileIndices,
&dampingAndThole->getDevicePointer()};
cu.executeKernel(computeInducedFieldKernel, computeInducedFieldArgs, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
cu.executeKernel(computeInducedFieldKernel, computeInducedFieldArgs, numForceThreadBlocks*inducedFieldThreads, inducedFieldThreads);
}
else {
cu.clearBuffer(*gkKernel->getInducedField());
......@@ -1202,7 +1224,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
&gkKernel->getInducedField()->getDevicePointer(), &gkKernel->getInducedFieldPolar()->getDevicePointer(),
&gkKernel->getInducedDipoles()->getDevicePointer(), &gkKernel->getInducedDipolesPolar()->getDevicePointer(),
&gkKernel->getBornRadii()->getDevicePointer(), &dampingAndThole->getDevicePointer()};
cu.executeKernel(computeInducedFieldKernel, computeInducedFieldArgs, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
cu.executeKernel(computeInducedFieldKernel, computeInducedFieldArgs, numForceThreadBlocks*inducedFieldThreads, inducedFieldThreads);
void* updateInducedGkFieldArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(),
&gkKernel->getField()->getDevicePointer(), &gkKernel->getInducedField()->getDevicePointer(),
&gkKernel->getInducedFieldPolar()->getDevicePointer(), &gkKernel->getInducedDipoles()->getDevicePointer(),
......@@ -1230,7 +1252,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
&covalentFlags->getDevicePointer(), &polarizationGroupFlags->getDevicePointer(), &startTileIndex, &numTileIndices,
&labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &inducedDipole->getDevicePointer(),
&inducedDipolePolar->getDevicePointer(), &dampingAndThole->getDevicePointer()};
cu.executeKernel(electrostaticsKernel, electrostaticsArgs, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
cu.executeKernel(electrostaticsKernel, electrostaticsArgs, numForceThreadBlocks*electrostaticsThreads, electrostaticsThreads);
if (gkKernel != NULL)
gkKernel->finishComputation(*torque, *labFrameDipoles, *labFrameQuadrupoles, *inducedDipole, *inducedDipolePolar, *dampingAndThole, *covalentFlags, *polarizationGroupFlags);
}
......@@ -1280,7 +1302,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
&nb.getInteractingTiles().getDevicePointer(), &nb.getInteractionCount().getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), &maxTiles, &nb.getInteractionFlags().getDevicePointer(),
&labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &dampingAndThole->getDevicePointer()};
cu.executeKernel(computeFixedFieldKernel, computeFixedFieldArgs, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
cu.executeKernel(computeFixedFieldKernel, computeFixedFieldArgs, numForceThreadBlocks*fixedFieldThreads, fixedFieldThreads);
void* recordInducedDipolesArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(),
&inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &polarizability->getDevicePointer()};
cu.executeKernel(recordInducedDipolesKernel, recordInducedDipolesArgs, cu.getNumAtoms());
......@@ -1316,7 +1338,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
&nb.getInteractingTiles().getDevicePointer(), &nb.getInteractionCount().getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), &maxTiles, &nb.getInteractionFlags().getDevicePointer(),
&dampingAndThole->getDevicePointer()};
cu.executeKernel(computeInducedFieldKernel, computeInducedFieldArgs, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
cu.executeKernel(computeInducedFieldKernel, computeInducedFieldArgs, numForceThreadBlocks*inducedFieldThreads, inducedFieldThreads);
cu.executeKernel(pmeSpreadInducedDipolesKernel, pmeSpreadInducedDipolesArgs, cu.getNumAtoms());
if (cu.getUseDoublePrecision())
cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD);
......@@ -1354,7 +1376,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer(), &maxTiles, &nb.getInteractionFlags().getDevicePointer(),
&labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &inducedDipole->getDevicePointer(),
&inducedDipolePolar->getDevicePointer(), &dampingAndThole->getDevicePointer()};
cu.executeKernel(electrostaticsKernel, electrostaticsArgs, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
cu.executeKernel(electrostaticsKernel, electrostaticsArgs, numForceThreadBlocks*electrostaticsThreads, electrostaticsThreads);
void* pmeInducedForceArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &torque->getDevicePointer(),
&cu.getEnergyBuffer().getDevicePointer(), &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(),
&inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &pmePhi->getDevicePointer(), &pmePhid->getDevicePointer(),
......
......@@ -323,6 +323,7 @@ private:
void initializeScaleFactors();
template <class T, class T4, class M4> void computeSystemMultipoleMoments(ContextImpl& context, std::vector<double>& outputMultipoleMoments);
int numMultipoles, maxInducedIterations;
int fixedFieldThreads, inducedFieldThreads, electrostaticsThreads;
double inducedEpsilon;
bool hasInitializedScaleFactors, hasInitializedFFT;
CudaContext& cu;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment