Commit 20e4b551 authored by one
Browse files

Tune HIP PME kernel launch block sizes

Use explicit 128-thread block launches for selected HIP PME kernels that
benefit from larger blocks.  Keep the platform default block size unchanged,
and leave small-system grid indexing and charge spreading on the existing
default launch configuration.

The heuristic uses 128-thread launches for finishSpreadCharge on HIP regardless
of system size, and uses 128-thread launches for findAtomGridIndex and
gridSpreadCharge only for larger systems (at least 2000 atom blocks).  The same
settings are applied to the corresponding kernels in both the Coulomb PME and
LJPME dispersion paths, while interpolation and energy evaluation remain
unchanged.
parent 4d20b76e
...@@ -2,5 +2,11 @@ ...@@ -2,5 +2,11 @@
ADD_SUBDIRECTORY(cpp-examples) ADD_SUBDIRECTORY(cpp-examples)
ADD_SUBDIRECTORY(python-examples) ADD_SUBDIRECTORY(python-examples)
INSTALL(DIRECTORY benchmarks DESTINATION examples) INSTALL(DIRECTORY benchmarks DESTINATION examples
PATTERN log EXCLUDE
PATTERN cache EXCLUDE
PATTERN save-temps EXCLUDE
PATTERN "*.db" EXCLUDE
PATTERN "*.pftrace" EXCLUDE
)
INSTALL(DIRECTORY extras DESTINATION examples) INSTALL(DIRECTORY extras DESTINATION examples)
...@@ -47,7 +47,8 @@ public: ...@@ -47,7 +47,8 @@ public:
CommonCalcNonbondedForceKernel(std::string name, const Platform& platform, ComputeContext& cc, const System& system) : CalcNonbondedForceKernel(name, platform), CommonCalcNonbondedForceKernel(std::string name, const Platform& platform, ComputeContext& cc, const System& system) : CalcNonbondedForceKernel(name, platform),
hasInitializedKernel(false), cc(cc), pmeio(NULL), stepsToSort(0), dispersionStepsToSort(0), hasInitializedKernel(false), cc(cc), pmeio(NULL), stepsToSort(0), dispersionStepsToSort(0),
usePmeDispersionWave64LdsSpread(false), pmeDispersionSpreadWaveSize(0), pmeDispersionSpreadBlockSize(0), usePmeDispersionWave64LdsSpread(false), pmeDispersionSpreadWaveSize(0), pmeDispersionSpreadBlockSize(0),
pmeDispersionAtomsPerWave(0), pmeDispersionAtomsPerBlock(0) { pmeDispersionAtomsPerWave(0), pmeDispersionAtomsPerBlock(0), pmeGridIndexBlockSize(-1),
pmeSpreadChargeBlockSize(-1), pmeFinishSpreadChargeBlockSize(-1) {
} }
~CommonCalcNonbondedForceKernel(); ~CommonCalcNonbondedForceKernel();
/** /**
...@@ -174,6 +175,7 @@ private: ...@@ -174,6 +175,7 @@ private:
bool usePmeQueue, deviceIsCpu, useFixedPointChargeSpreading, useCpuPme; bool usePmeQueue, deviceIsCpu, useFixedPointChargeSpreading, useCpuPme;
bool usePmeDispersionWave64LdsSpread; bool usePmeDispersionWave64LdsSpread;
int pmeDispersionSpreadWaveSize, pmeDispersionSpreadBlockSize, pmeDispersionAtomsPerWave, pmeDispersionAtomsPerBlock; int pmeDispersionSpreadWaveSize, pmeDispersionSpreadBlockSize, pmeDispersionAtomsPerWave, pmeDispersionAtomsPerBlock;
int pmeGridIndexBlockSize, pmeSpreadChargeBlockSize, pmeFinishSpreadChargeBlockSize;
bool hasCoulomb, hasLJ, doLJPME, usePosqCharges, recomputeParams, hasOffsets; bool hasCoulomb, hasLJ, doLJPME, usePosqCharges, recomputeParams, hasOffsets;
NonbondedMethod nonbondedMethod; NonbondedMethod nonbondedMethod;
static const int PmeOrder = 5; static const int PmeOrder = 5;
......
...@@ -287,12 +287,17 @@ void CommonCalcNonbondedForceKernel::commonInitialize(const System& system, cons ...@@ -287,12 +287,17 @@ void CommonCalcNonbondedForceKernel::commonInitialize(const System& system, cons
bool usePeriodic = (nonbondedMethod != NoCutoff && nonbondedMethod != CutoffNonPeriodic); bool usePeriodic = (nonbondedMethod != NoCutoff && nonbondedMethod != CutoffNonPeriodic);
doLJPME = (nonbondedMethod == LJPME && hasLJ); doLJPME = (nonbondedMethod == LJPME && hasLJ);
usePosqCharges = hasCoulomb ? cc.requestPosqCharges() : false; usePosqCharges = hasCoulomb ? cc.requestPosqCharges() : false;
bool isHip = (getPlatform().getName() == "HIP");
bool useLargeHipPmeBlocks = (isHip && cc.getNumAtomBlocks() >= 2000);
pmeGridIndexBlockSize = useLargeHipPmeBlocks ? 128 : -1;
pmeSpreadChargeBlockSize = useLargeHipPmeBlocks ? 128 : -1;
pmeFinishSpreadChargeBlockSize = isHip ? 128 : -1;
pmeDispersionSpreadWaveSize = 64; pmeDispersionSpreadWaveSize = 64;
pmeDispersionSpreadBlockSize = 256; pmeDispersionSpreadBlockSize = 256;
pmeDispersionAtomsPerWave = pmeDispersionSpreadWaveSize/PmeOrder; pmeDispersionAtomsPerWave = pmeDispersionSpreadWaveSize/PmeOrder;
pmeDispersionAtomsPerBlock = (pmeDispersionSpreadBlockSize/pmeDispersionSpreadWaveSize)*pmeDispersionAtomsPerWave; pmeDispersionAtomsPerBlock = (pmeDispersionSpreadBlockSize/pmeDispersionSpreadWaveSize)*pmeDispersionAtomsPerWave;
// The LDS spread path assumes wave64 execution and is only used for HIP LJ-PME fixed point spreading. // The LDS spread path assumes wave64 execution and is only used for HIP LJ-PME fixed point spreading.
usePmeDispersionWave64LdsSpread = (getPlatform().getName() == "HIP" && usePmeDispersionWave64LdsSpread = (isHip &&
doLJPME && useFixedPointChargeSpreading && PmeOrder == 5 && doLJPME && useFixedPointChargeSpreading && PmeOrder == 5 &&
cc.getSIMDWidth() == pmeDispersionSpreadWaveSize && cc.getSIMDWidth() == pmeDispersionSpreadWaveSize &&
cc.getMaxThreadBlockSize() >= pmeDispersionSpreadBlockSize); cc.getMaxThreadBlockSize() >= pmeDispersionSpreadBlockSize);
...@@ -994,7 +999,7 @@ double CommonCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -994,7 +999,7 @@ double CommonCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeGridIndexKernel->setArg(8, recipBoxVectorsFloat[1]); pmeGridIndexKernel->setArg(8, recipBoxVectorsFloat[1]);
pmeGridIndexKernel->setArg(9, recipBoxVectorsFloat[2]); pmeGridIndexKernel->setArg(9, recipBoxVectorsFloat[2]);
} }
pmeGridIndexKernel->execute(cc.getNumAtoms()); pmeGridIndexKernel->execute(cc.getNumAtoms(), pmeGridIndexBlockSize);
sort->sort(pmeAtomGridIndex); sort->sort(pmeAtomGridIndex);
stepsToSort = 3; stepsToSort = 3;
} }
...@@ -1011,9 +1016,9 @@ double CommonCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -1011,9 +1016,9 @@ double CommonCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeSpreadChargeKernel->setArg(8, recipBoxVectorsFloat[1]); pmeSpreadChargeKernel->setArg(8, recipBoxVectorsFloat[1]);
pmeSpreadChargeKernel->setArg(9, recipBoxVectorsFloat[2]); pmeSpreadChargeKernel->setArg(9, recipBoxVectorsFloat[2]);
} }
pmeSpreadChargeKernel->execute(cc.getNumAtoms()); pmeSpreadChargeKernel->execute(cc.getNumAtoms(), pmeSpreadChargeBlockSize);
if (useFixedPointChargeSpreading) if (useFixedPointChargeSpreading)
pmeFinishSpreadChargeKernel->execute(gridSizeX*gridSizeY*gridSizeZ); pmeFinishSpreadChargeKernel->execute(gridSizeX*gridSizeY*gridSizeZ, pmeFinishSpreadChargeBlockSize);
fft->execFFT(pmeGrid1, pmeGrid2, true); fft->execFFT(pmeGrid1, pmeGrid2, true);
if (cc.getUseDoublePrecision()) { if (cc.getUseDoublePrecision()) {
pmeConvolutionKernel->setArg<mm_double4>(4, recipBoxVectors[0]); pmeConvolutionKernel->setArg<mm_double4>(4, recipBoxVectors[0]);
...@@ -1065,7 +1070,7 @@ double CommonCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -1065,7 +1070,7 @@ double CommonCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeDispersionGridIndexKernel->setArg(8, recipBoxVectorsFloat[1]); pmeDispersionGridIndexKernel->setArg(8, recipBoxVectorsFloat[1]);
pmeDispersionGridIndexKernel->setArg(9, recipBoxVectorsFloat[2]); pmeDispersionGridIndexKernel->setArg(9, recipBoxVectorsFloat[2]);
} }
pmeDispersionGridIndexKernel->execute(cc.getNumAtoms()); pmeDispersionGridIndexKernel->execute(cc.getNumAtoms(), pmeGridIndexBlockSize);
sort->sort(pmeDispersionAtomGridIndex); sort->sort(pmeDispersionAtomGridIndex);
dispersionStepsToSort = 3; dispersionStepsToSort = 3;
} }
...@@ -1091,9 +1096,9 @@ double CommonCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -1091,9 +1096,9 @@ double CommonCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeDispersionSpreadChargeKernel->execute(workSize, pmeDispersionSpreadBlockSize); pmeDispersionSpreadChargeKernel->execute(workSize, pmeDispersionSpreadBlockSize);
} }
else else
pmeDispersionSpreadChargeKernel->execute(cc.getNumAtoms()); pmeDispersionSpreadChargeKernel->execute(cc.getNumAtoms(), pmeSpreadChargeBlockSize);
if (useFixedPointChargeSpreading) if (useFixedPointChargeSpreading)
pmeDispersionFinishSpreadChargeKernel->execute(dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ); pmeDispersionFinishSpreadChargeKernel->execute(dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, pmeFinishSpreadChargeBlockSize);
dispersionFft->execFFT(pmeGrid1, pmeGrid2, true); dispersionFft->execFFT(pmeGrid1, pmeGrid2, true);
if (cc.getUseDoublePrecision()) { if (cc.getUseDoublePrecision()) {
pmeDispersionConvolutionKernel->setArg(4, recipBoxVectors[0]); pmeDispersionConvolutionKernel->setArg(4, recipBoxVectors[0]);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment