Commit 6d82a920 authored by Peter Eastman's avatar Peter Eastman
Browse files

Limit thread block size to avoid exceeding available shared memory

parent ff7f1035
...@@ -493,6 +493,19 @@ void CudaContext::executeKernel(CUfunction kernel, void** arguments, int threads ...@@ -493,6 +493,19 @@ void CudaContext::executeKernel(CUfunction kernel, void** arguments, int threads
} }
} }
int CudaContext::computeThreadBlockSize(double memory, bool preferShared) const {
int maxShared = 16*1024;
if (computeCapability >= 2.0 && preferShared)
maxShared = 48*1024;
int max = (int) (maxShared/memory);
if (max < 64)
return 32;
int threads = 64;
while (threads+64 < max)
threads += 64;
return threads;
}
void CudaContext::clearBuffer(CudaArray& array) { void CudaContext::clearBuffer(CudaArray& array) {
clearBuffer(array.getDevicePointer(), array.getSize()*array.getElementSize()); clearBuffer(array.getDevicePointer(), array.getSize()*array.getElementSize());
} }
......
...@@ -226,6 +226,14 @@ public: ...@@ -226,6 +226,14 @@ public:
* @param sharedSize the amount of dynamic shared memory to allocated for the kernel, in bytes * @param sharedSize the amount of dynamic shared memory to allocated for the kernel, in bytes
*/ */
void executeKernel(CUfunction kernel, void** arguments, int workUnits, int blockSize = -1, unsigned int sharedSize = 0); void executeKernel(CUfunction kernel, void** arguments, int workUnits, int blockSize = -1, unsigned int sharedSize = 0);
/**
* Compute the largest thread block size that can be used for a kernel that requires a particular amount of
* shared memory per thread.
*
* @param memory the number of bytes of shared memory per thread
* @param preferShared whether the kernel is set to prefer shared memory over cache
*/
int computeThreadBlockSize(double memory, bool preferShared=true) const;
/** /**
* Set all elements of an array to 0. * Set all elements of an array to 0.
*/ */
......
...@@ -408,6 +408,7 @@ void CudaCalcAmoebaStretchBendForceKernel::initialize(const System& system, cons ...@@ -408,6 +408,7 @@ void CudaCalcAmoebaStretchBendForceKernel::initialize(const System& system, cons
} }
double CudaCalcAmoebaStretchBendForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) { double CudaCalcAmoebaStretchBendForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
return 0.0;
} }
/* -------------------------------------------------------------------------- * /* -------------------------------------------------------------------------- *
...@@ -841,10 +842,15 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const ...@@ -841,10 +842,15 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
// Create the kernels. // Create the kernels.
bool useShuffle = (cu.getComputeCapability() >= 3.0 && !cu.getUseDoublePrecision());
double fixedThreadMemory = 19*elementSize+2*sizeof(float)+3*sizeof(int)/(double) cu.TileSize;
double inducedThreadMemory = 15*elementSize+2*sizeof(float);
double electrostaticsThreadMemory = 0;
if (useShuffle)
fixedThreadMemory += 3*elementSize;
map<string, string> defines; map<string, string> defines;
defines["NUM_ATOMS"] = cu.intToString(numMultipoles); defines["NUM_ATOMS"] = cu.intToString(numMultipoles);
defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms()); defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
defines["THREAD_BLOCK_SIZE"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize());
defines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks()); defines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks());
defines["ENERGY_SCALE_FACTOR"] = cu.doubleToString(138.9354558456/innerDielectric); defines["ENERGY_SCALE_FACTOR"] = cu.doubleToString(138.9354558456/innerDielectric);
if (force.getPolarizationType() == AmoebaMultipoleForce::Direct) if (force.getPolarizationType() == AmoebaMultipoleForce::Direct)
...@@ -881,15 +887,22 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const ...@@ -881,15 +887,22 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
defines["GK_FC"] = cu.doubleToString(1*(1-solventDielectric)/(0+1*solventDielectric)); defines["GK_FC"] = cu.doubleToString(1*(1-solventDielectric)/(0+1*solventDielectric));
defines["GK_FD"] = cu.doubleToString(2*(1-solventDielectric)/(1+2*solventDielectric)); defines["GK_FD"] = cu.doubleToString(2*(1-solventDielectric)/(1+2*solventDielectric));
defines["GK_FQ"] = cu.doubleToString(3*(1-solventDielectric)/(2+3*solventDielectric)); defines["GK_FQ"] = cu.doubleToString(3*(1-solventDielectric)/(2+3*solventDielectric));
fixedThreadMemory += 4*elementSize;
inducedThreadMemory += 13*elementSize;
} }
int maxThreads = cu.getNonbondedUtilities().getForceThreadBlockSize();
fixedFieldThreads = min(maxThreads, cu.computeThreadBlockSize(fixedThreadMemory));
inducedFieldThreads = min(maxThreads, cu.computeThreadBlockSize(inducedThreadMemory));
CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipoles, defines); CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipoles, defines);
computeMomentsKernel = cu.getKernel(module, "computeLabFrameMoments"); computeMomentsKernel = cu.getKernel(module, "computeLabFrameMoments");
recordInducedDipolesKernel = cu.getKernel(module, "recordInducedDipoles"); recordInducedDipolesKernel = cu.getKernel(module, "recordInducedDipoles");
mapTorqueKernel = cu.getKernel(module, "mapTorqueToForce"); mapTorqueKernel = cu.getKernel(module, "mapTorqueToForce");
computePotentialKernel = cu.getKernel(module, "computePotentialAtPoints"); computePotentialKernel = cu.getKernel(module, "computePotentialAtPoints");
defines["THREAD_BLOCK_SIZE"] = cu.intToString(fixedFieldThreads);
module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipoleFixedField, defines); module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipoleFixedField, defines);
computeFixedFieldKernel = cu.getKernel(module, "computeFixedField"); computeFixedFieldKernel = cu.getKernel(module, "computeFixedField");
if (maxInducedIterations > 0) { if (maxInducedIterations > 0) {
defines["THREAD_BLOCK_SIZE"] = cu.intToString(inducedFieldThreads);
module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipoleInducedField, defines); module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipoleInducedField, defines);
computeInducedFieldKernel = cu.getKernel(module, "computeInducedField"); computeInducedFieldKernel = cu.getKernel(module, "computeInducedField");
updateInducedFieldKernel = cu.getKernel(module, "updateInducedFieldBySOR"); updateInducedFieldKernel = cu.getKernel(module, "updateInducedFieldBySOR");
...@@ -901,6 +914,9 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const ...@@ -901,6 +914,9 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
electrostaticsSource << CudaAmoebaKernelSources::pmeElectrostaticPairForce; electrostaticsSource << CudaAmoebaKernelSources::pmeElectrostaticPairForce;
electrostaticsSource << "#define APPLY_SCALE\n"; electrostaticsSource << "#define APPLY_SCALE\n";
electrostaticsSource << CudaAmoebaKernelSources::pmeElectrostaticPairForce; electrostaticsSource << CudaAmoebaKernelSources::pmeElectrostaticPairForce;
electrostaticsThreadMemory = 24*elementSize+3*sizeof(float)+3*sizeof(int)/(double) cu.TileSize;
if (useShuffle)
electrostaticsThreadMemory += 3*elementSize;
} }
else { else {
electrostaticsSource << CudaKernelSources::vectorOps; electrostaticsSource << CudaKernelSources::vectorOps;
...@@ -913,7 +929,14 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const ...@@ -913,7 +929,14 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
electrostaticsSource << "#undef T1\n"; electrostaticsSource << "#undef T1\n";
electrostaticsSource << "#define T3\n"; electrostaticsSource << "#define T3\n";
electrostaticsSource << CudaAmoebaKernelSources::electrostaticPairForce; electrostaticsSource << CudaAmoebaKernelSources::electrostaticPairForce;
} electrostaticsThreadMemory = 21*elementSize+2*sizeof(float)+3*sizeof(int)/(double) cu.TileSize;
if (useShuffle)
electrostaticsThreadMemory += 3*elementSize;
if (gk != NULL)
electrostaticsThreadMemory += 4*elementSize;
}
electrostaticsThreads = min(maxThreads, cu.computeThreadBlockSize(electrostaticsThreadMemory));
defines["THREAD_BLOCK_SIZE"] = cu.intToString(electrostaticsThreads);
module = cu.createModule(electrostaticsSource.str(), defines); module = cu.createModule(electrostaticsSource.str(), defines);
electrostaticsKernel = cu.getKernel(module, "computeElectrostatics"); electrostaticsKernel = cu.getKernel(module, "computeElectrostatics");
...@@ -1151,7 +1174,6 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1151,7 +1174,6 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
int startTileIndex = nb.getStartTileIndex(); int startTileIndex = nb.getStartTileIndex();
int numTileIndices = nb.getNumTiles(); int numTileIndices = nb.getNumTiles();
int numForceThreadBlocks = nb.getNumForceThreadBlocks(); int numForceThreadBlocks = nb.getNumForceThreadBlocks();
int forceThreadBlockSize = nb.getForceThreadBlockSize();
int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float)); int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
void* npt = NULL; void* npt = NULL;
if (pmeGrid == NULL) { if (pmeGrid == NULL) {
...@@ -1162,7 +1184,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1162,7 +1184,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
&nb.getExclusionIndices().getDevicePointer(), &nb.getExclusionRowIndices().getDevicePointer(), &nb.getExclusionIndices().getDevicePointer(), &nb.getExclusionRowIndices().getDevicePointer(),
&covalentFlags->getDevicePointer(), &polarizationGroupFlags->getDevicePointer(), &startTileIndex, &numTileIndices, &covalentFlags->getDevicePointer(), &polarizationGroupFlags->getDevicePointer(), &startTileIndex, &numTileIndices,
&labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &dampingAndThole->getDevicePointer()}; &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &dampingAndThole->getDevicePointer()};
cu.executeKernel(computeFixedFieldKernel, computeFixedFieldArgs, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize); cu.executeKernel(computeFixedFieldKernel, computeFixedFieldArgs, numForceThreadBlocks*fixedFieldThreads, fixedFieldThreads);
void* recordInducedDipolesArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(), void* recordInducedDipolesArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(),
&inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &polarizability->getDevicePointer()}; &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &polarizability->getDevicePointer()};
cu.executeKernel(recordInducedDipolesKernel, recordInducedDipolesArgs, cu.getNumAtoms()); cu.executeKernel(recordInducedDipolesKernel, recordInducedDipolesArgs, cu.getNumAtoms());
...@@ -1174,7 +1196,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1174,7 +1196,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
&covalentFlags->getDevicePointer(), &polarizationGroupFlags->getDevicePointer(), &startTileIndex, &numTileIndices, &covalentFlags->getDevicePointer(), &polarizationGroupFlags->getDevicePointer(), &startTileIndex, &numTileIndices,
&gkKernel->getBornRadii()->getDevicePointer(), &gkKernel->getField()->getDevicePointer(), &gkKernel->getBornRadii()->getDevicePointer(), &gkKernel->getField()->getDevicePointer(),
&labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &dampingAndThole->getDevicePointer()}; &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &dampingAndThole->getDevicePointer()};
cu.executeKernel(computeFixedFieldKernel, computeFixedFieldArgs, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize); cu.executeKernel(computeFixedFieldKernel, computeFixedFieldArgs, numForceThreadBlocks*fixedFieldThreads, fixedFieldThreads);
void* recordInducedDipolesArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(), void* recordInducedDipolesArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(),
&gkKernel->getField()->getDevicePointer(), &gkKernel->getInducedDipoles()->getDevicePointer(), &gkKernel->getField()->getDevicePointer(), &gkKernel->getInducedDipoles()->getDevicePointer(),
&gkKernel->getInducedDipolesPolar()->getDevicePointer(), &inducedDipole->getDevicePointer(), &gkKernel->getInducedDipolesPolar()->getDevicePointer(), &inducedDipole->getDevicePointer(),
...@@ -1192,7 +1214,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1192,7 +1214,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
void* computeInducedFieldArgs[] = {&inducedField->getDevicePointer(), &inducedFieldPolar->getDevicePointer(), &cu.getPosq().getDevicePointer(), void* computeInducedFieldArgs[] = {&inducedField->getDevicePointer(), &inducedFieldPolar->getDevicePointer(), &cu.getPosq().getDevicePointer(),
&inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &startTileIndex, &numTileIndices, &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &startTileIndex, &numTileIndices,
&dampingAndThole->getDevicePointer()}; &dampingAndThole->getDevicePointer()};
cu.executeKernel(computeInducedFieldKernel, computeInducedFieldArgs, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize); cu.executeKernel(computeInducedFieldKernel, computeInducedFieldArgs, numForceThreadBlocks*inducedFieldThreads, inducedFieldThreads);
} }
else { else {
cu.clearBuffer(*gkKernel->getInducedField()); cu.clearBuffer(*gkKernel->getInducedField());
...@@ -1202,7 +1224,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1202,7 +1224,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
&gkKernel->getInducedField()->getDevicePointer(), &gkKernel->getInducedFieldPolar()->getDevicePointer(), &gkKernel->getInducedField()->getDevicePointer(), &gkKernel->getInducedFieldPolar()->getDevicePointer(),
&gkKernel->getInducedDipoles()->getDevicePointer(), &gkKernel->getInducedDipolesPolar()->getDevicePointer(), &gkKernel->getInducedDipoles()->getDevicePointer(), &gkKernel->getInducedDipolesPolar()->getDevicePointer(),
&gkKernel->getBornRadii()->getDevicePointer(), &dampingAndThole->getDevicePointer()}; &gkKernel->getBornRadii()->getDevicePointer(), &dampingAndThole->getDevicePointer()};
cu.executeKernel(computeInducedFieldKernel, computeInducedFieldArgs, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize); cu.executeKernel(computeInducedFieldKernel, computeInducedFieldArgs, numForceThreadBlocks*inducedFieldThreads, inducedFieldThreads);
void* updateInducedGkFieldArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(), void* updateInducedGkFieldArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(),
&gkKernel->getField()->getDevicePointer(), &gkKernel->getInducedField()->getDevicePointer(), &gkKernel->getField()->getDevicePointer(), &gkKernel->getInducedField()->getDevicePointer(),
&gkKernel->getInducedFieldPolar()->getDevicePointer(), &gkKernel->getInducedDipoles()->getDevicePointer(), &gkKernel->getInducedFieldPolar()->getDevicePointer(), &gkKernel->getInducedDipoles()->getDevicePointer(),
...@@ -1230,7 +1252,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1230,7 +1252,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
&covalentFlags->getDevicePointer(), &polarizationGroupFlags->getDevicePointer(), &startTileIndex, &numTileIndices, &covalentFlags->getDevicePointer(), &polarizationGroupFlags->getDevicePointer(), &startTileIndex, &numTileIndices,
&labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &inducedDipole->getDevicePointer(), &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &inducedDipole->getDevicePointer(),
&inducedDipolePolar->getDevicePointer(), &dampingAndThole->getDevicePointer()}; &inducedDipolePolar->getDevicePointer(), &dampingAndThole->getDevicePointer()};
cu.executeKernel(electrostaticsKernel, electrostaticsArgs, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize); cu.executeKernel(electrostaticsKernel, electrostaticsArgs, numForceThreadBlocks*electrostaticsThreads, electrostaticsThreads);
if (gkKernel != NULL) if (gkKernel != NULL)
gkKernel->finishComputation(*torque, *labFrameDipoles, *labFrameQuadrupoles, *inducedDipole, *inducedDipolePolar, *dampingAndThole, *covalentFlags, *polarizationGroupFlags); gkKernel->finishComputation(*torque, *labFrameDipoles, *labFrameQuadrupoles, *inducedDipole, *inducedDipolePolar, *dampingAndThole, *covalentFlags, *polarizationGroupFlags);
} }
...@@ -1280,7 +1302,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1280,7 +1302,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
&nb.getInteractingTiles().getDevicePointer(), &nb.getInteractionCount().getDevicePointer(), cu.getPeriodicBoxSizePointer(), &nb.getInteractingTiles().getDevicePointer(), &nb.getInteractionCount().getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), &maxTiles, &nb.getInteractionFlags().getDevicePointer(), cu.getInvPeriodicBoxSizePointer(), &maxTiles, &nb.getInteractionFlags().getDevicePointer(),
&labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &dampingAndThole->getDevicePointer()}; &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &dampingAndThole->getDevicePointer()};
cu.executeKernel(computeFixedFieldKernel, computeFixedFieldArgs, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize); cu.executeKernel(computeFixedFieldKernel, computeFixedFieldArgs, numForceThreadBlocks*fixedFieldThreads, fixedFieldThreads);
void* recordInducedDipolesArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(), void* recordInducedDipolesArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(),
&inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &polarizability->getDevicePointer()}; &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &polarizability->getDevicePointer()};
cu.executeKernel(recordInducedDipolesKernel, recordInducedDipolesArgs, cu.getNumAtoms()); cu.executeKernel(recordInducedDipolesKernel, recordInducedDipolesArgs, cu.getNumAtoms());
...@@ -1316,7 +1338,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1316,7 +1338,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
&nb.getInteractingTiles().getDevicePointer(), &nb.getInteractionCount().getDevicePointer(), cu.getPeriodicBoxSizePointer(), &nb.getInteractingTiles().getDevicePointer(), &nb.getInteractionCount().getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), &maxTiles, &nb.getInteractionFlags().getDevicePointer(), cu.getInvPeriodicBoxSizePointer(), &maxTiles, &nb.getInteractionFlags().getDevicePointer(),
&dampingAndThole->getDevicePointer()}; &dampingAndThole->getDevicePointer()};
cu.executeKernel(computeInducedFieldKernel, computeInducedFieldArgs, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize); cu.executeKernel(computeInducedFieldKernel, computeInducedFieldArgs, numForceThreadBlocks*inducedFieldThreads, inducedFieldThreads);
cu.executeKernel(pmeSpreadInducedDipolesKernel, pmeSpreadInducedDipolesArgs, cu.getNumAtoms()); cu.executeKernel(pmeSpreadInducedDipolesKernel, pmeSpreadInducedDipolesArgs, cu.getNumAtoms());
if (cu.getUseDoublePrecision()) if (cu.getUseDoublePrecision())
cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD); cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD);
...@@ -1354,7 +1376,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1354,7 +1376,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer(), &maxTiles, &nb.getInteractionFlags().getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer(), &maxTiles, &nb.getInteractionFlags().getDevicePointer(),
&labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &inducedDipole->getDevicePointer(), &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &inducedDipole->getDevicePointer(),
&inducedDipolePolar->getDevicePointer(), &dampingAndThole->getDevicePointer()}; &inducedDipolePolar->getDevicePointer(), &dampingAndThole->getDevicePointer()};
cu.executeKernel(electrostaticsKernel, electrostaticsArgs, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize); cu.executeKernel(electrostaticsKernel, electrostaticsArgs, numForceThreadBlocks*electrostaticsThreads, electrostaticsThreads);
void* pmeInducedForceArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &torque->getDevicePointer(), void* pmeInducedForceArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &torque->getDevicePointer(),
&cu.getEnergyBuffer().getDevicePointer(), &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(), &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(),
&inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &pmePhi->getDevicePointer(), &pmePhid->getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &pmePhi->getDevicePointer(), &pmePhid->getDevicePointer(),
......
...@@ -323,6 +323,7 @@ private: ...@@ -323,6 +323,7 @@ private:
void initializeScaleFactors(); void initializeScaleFactors();
template <class T, class T4, class M4> void computeSystemMultipoleMoments(ContextImpl& context, std::vector<double>& outputMultipoleMoments); template <class T, class T4, class M4> void computeSystemMultipoleMoments(ContextImpl& context, std::vector<double>& outputMultipoleMoments);
int numMultipoles, maxInducedIterations; int numMultipoles, maxInducedIterations;
int fixedFieldThreads, inducedFieldThreads, electrostaticsThreads;
double inducedEpsilon; double inducedEpsilon;
bool hasInitializedScaleFactors, hasInitializedFFT; bool hasInitializedScaleFactors, hasInitializedFFT;
CudaContext& cu; CudaContext& cu;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment