"vscode:/vscode.git/clone" did not exist on "7abe516005f6c2b301b4e99b28c6503263f8c223"
Commit 20e4b551 authored by one
Browse files

Tune HIP PME kernel launch block sizes

Use explicit 128-thread block launches for selected HIP PME kernels that
benefit from larger blocks.  Keep the platform default block size unchanged,
and leave small-system grid indexing and charge spreading on the existing
default launch configuration.

The heuristic applies 128-thread launches to finishSpreadCharge whenever the
platform is HIP, and uses 128-thread launches for findAtomGridIndex and
gridSpreadCharge only for larger HIP systems (at least 2000 atom blocks).
The same block-size settings are applied to the corresponding Coulomb PME and
LJPME dispersion launches, while interpolation and energy evaluation remain
unchanged.
parent 4d20b76e
......@@ -2,5 +2,11 @@
ADD_SUBDIRECTORY(cpp-examples)
ADD_SUBDIRECTORY(python-examples)
INSTALL(DIRECTORY benchmarks DESTINATION examples)
INSTALL(DIRECTORY benchmarks DESTINATION examples
PATTERN log EXCLUDE
PATTERN cache EXCLUDE
PATTERN save-temps EXCLUDE
PATTERN "*.db" EXCLUDE
PATTERN "*.pftrace" EXCLUDE
)
INSTALL(DIRECTORY extras DESTINATION examples)
......@@ -47,7 +47,8 @@ public:
CommonCalcNonbondedForceKernel(std::string name, const Platform& platform, ComputeContext& cc, const System& system) : CalcNonbondedForceKernel(name, platform),
hasInitializedKernel(false), cc(cc), pmeio(NULL), stepsToSort(0), dispersionStepsToSort(0),
usePmeDispersionWave64LdsSpread(false), pmeDispersionSpreadWaveSize(0), pmeDispersionSpreadBlockSize(0),
pmeDispersionAtomsPerWave(0), pmeDispersionAtomsPerBlock(0) {
pmeDispersionAtomsPerWave(0), pmeDispersionAtomsPerBlock(0), pmeGridIndexBlockSize(-1),
pmeSpreadChargeBlockSize(-1), pmeFinishSpreadChargeBlockSize(-1) {
}
~CommonCalcNonbondedForceKernel();
/**
......@@ -174,6 +175,7 @@ private:
bool usePmeQueue, deviceIsCpu, useFixedPointChargeSpreading, useCpuPme;
bool usePmeDispersionWave64LdsSpread;
int pmeDispersionSpreadWaveSize, pmeDispersionSpreadBlockSize, pmeDispersionAtomsPerWave, pmeDispersionAtomsPerBlock;
int pmeGridIndexBlockSize, pmeSpreadChargeBlockSize, pmeFinishSpreadChargeBlockSize;
bool hasCoulomb, hasLJ, doLJPME, usePosqCharges, recomputeParams, hasOffsets;
NonbondedMethod nonbondedMethod;
static const int PmeOrder = 5;
......
......@@ -287,12 +287,17 @@ void CommonCalcNonbondedForceKernel::commonInitialize(const System& system, cons
bool usePeriodic = (nonbondedMethod != NoCutoff && nonbondedMethod != CutoffNonPeriodic);
doLJPME = (nonbondedMethod == LJPME && hasLJ);
usePosqCharges = hasCoulomb ? cc.requestPosqCharges() : false;
bool isHip = (getPlatform().getName() == "HIP");
bool useLargeHipPmeBlocks = (isHip && cc.getNumAtomBlocks() >= 2000);
pmeGridIndexBlockSize = useLargeHipPmeBlocks ? 128 : -1;
pmeSpreadChargeBlockSize = useLargeHipPmeBlocks ? 128 : -1;
pmeFinishSpreadChargeBlockSize = isHip ? 128 : -1;
pmeDispersionSpreadWaveSize = 64;
pmeDispersionSpreadBlockSize = 256;
pmeDispersionAtomsPerWave = pmeDispersionSpreadWaveSize/PmeOrder;
pmeDispersionAtomsPerBlock = (pmeDispersionSpreadBlockSize/pmeDispersionSpreadWaveSize)*pmeDispersionAtomsPerWave;
// The LDS spread path assumes wave64 execution and is only used for HIP LJ-PME fixed point spreading.
usePmeDispersionWave64LdsSpread = (getPlatform().getName() == "HIP" &&
usePmeDispersionWave64LdsSpread = (isHip &&
doLJPME && useFixedPointChargeSpreading && PmeOrder == 5 &&
cc.getSIMDWidth() == pmeDispersionSpreadWaveSize &&
cc.getMaxThreadBlockSize() >= pmeDispersionSpreadBlockSize);
......@@ -994,7 +999,7 @@ double CommonCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeGridIndexKernel->setArg(8, recipBoxVectorsFloat[1]);
pmeGridIndexKernel->setArg(9, recipBoxVectorsFloat[2]);
}
pmeGridIndexKernel->execute(cc.getNumAtoms());
pmeGridIndexKernel->execute(cc.getNumAtoms(), pmeGridIndexBlockSize);
sort->sort(pmeAtomGridIndex);
stepsToSort = 3;
}
......@@ -1011,9 +1016,9 @@ double CommonCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeSpreadChargeKernel->setArg(8, recipBoxVectorsFloat[1]);
pmeSpreadChargeKernel->setArg(9, recipBoxVectorsFloat[2]);
}
pmeSpreadChargeKernel->execute(cc.getNumAtoms());
pmeSpreadChargeKernel->execute(cc.getNumAtoms(), pmeSpreadChargeBlockSize);
if (useFixedPointChargeSpreading)
pmeFinishSpreadChargeKernel->execute(gridSizeX*gridSizeY*gridSizeZ);
pmeFinishSpreadChargeKernel->execute(gridSizeX*gridSizeY*gridSizeZ, pmeFinishSpreadChargeBlockSize);
fft->execFFT(pmeGrid1, pmeGrid2, true);
if (cc.getUseDoublePrecision()) {
pmeConvolutionKernel->setArg<mm_double4>(4, recipBoxVectors[0]);
......@@ -1065,7 +1070,7 @@ double CommonCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeDispersionGridIndexKernel->setArg(8, recipBoxVectorsFloat[1]);
pmeDispersionGridIndexKernel->setArg(9, recipBoxVectorsFloat[2]);
}
pmeDispersionGridIndexKernel->execute(cc.getNumAtoms());
pmeDispersionGridIndexKernel->execute(cc.getNumAtoms(), pmeGridIndexBlockSize);
sort->sort(pmeDispersionAtomGridIndex);
dispersionStepsToSort = 3;
}
......@@ -1091,9 +1096,9 @@ double CommonCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeDispersionSpreadChargeKernel->execute(workSize, pmeDispersionSpreadBlockSize);
}
else
pmeDispersionSpreadChargeKernel->execute(cc.getNumAtoms());
pmeDispersionSpreadChargeKernel->execute(cc.getNumAtoms(), pmeSpreadChargeBlockSize);
if (useFixedPointChargeSpreading)
pmeDispersionFinishSpreadChargeKernel->execute(dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ);
pmeDispersionFinishSpreadChargeKernel->execute(dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, pmeFinishSpreadChargeBlockSize);
dispersionFft->execFFT(pmeGrid1, pmeGrid2, true);
if (cc.getUseDoublePrecision()) {
pmeDispersionConvolutionKernel->setArg(4, recipBoxVectors[0]);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment