Commit 4e7070c2 authored by one's avatar one
Browse files

Add wave64 LDS spreading in HIP LJ-PME

parent 939ecf28
...@@ -45,7 +45,9 @@ namespace OpenMM { ...@@ -45,7 +45,9 @@ namespace OpenMM {
class CommonCalcNonbondedForceKernel : public CalcNonbondedForceKernel { class CommonCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
public: public:
CommonCalcNonbondedForceKernel(std::string name, const Platform& platform, ComputeContext& cc, const System& system) : CalcNonbondedForceKernel(name, platform), CommonCalcNonbondedForceKernel(std::string name, const Platform& platform, ComputeContext& cc, const System& system) : CalcNonbondedForceKernel(name, platform),
hasInitializedKernel(false), cc(cc), pmeio(NULL), stepsToSort(0), dispersionStepsToSort(0) { hasInitializedKernel(false), cc(cc), pmeio(NULL), stepsToSort(0), dispersionStepsToSort(0),
usePmeDispersionWave64LdsSpread(false), pmeDispersionSpreadWaveSize(0), pmeDispersionSpreadBlockSize(0),
pmeDispersionAtomsPerWave(0), pmeDispersionAtomsPerBlock(0) {
} }
~CommonCalcNonbondedForceKernel(); ~CommonCalcNonbondedForceKernel();
/** /**
...@@ -170,6 +172,8 @@ private: ...@@ -170,6 +172,8 @@ private:
int stepsToSort; int stepsToSort;
int dispersionStepsToSort; int dispersionStepsToSort;
bool usePmeQueue, deviceIsCpu, useFixedPointChargeSpreading, useCpuPme; bool usePmeQueue, deviceIsCpu, useFixedPointChargeSpreading, useCpuPme;
bool usePmeDispersionWave64LdsSpread;
int pmeDispersionSpreadWaveSize, pmeDispersionSpreadBlockSize, pmeDispersionAtomsPerWave, pmeDispersionAtomsPerBlock;
bool hasCoulomb, hasLJ, doLJPME, usePosqCharges, recomputeParams, hasOffsets; bool hasCoulomb, hasLJ, doLJPME, usePosqCharges, recomputeParams, hasOffsets;
NonbondedMethod nonbondedMethod; NonbondedMethod nonbondedMethod;
static const int PmeOrder = 5; static const int PmeOrder = 5;
......
...@@ -287,6 +287,15 @@ void CommonCalcNonbondedForceKernel::commonInitialize(const System& system, cons ...@@ -287,6 +287,15 @@ void CommonCalcNonbondedForceKernel::commonInitialize(const System& system, cons
bool usePeriodic = (nonbondedMethod != NoCutoff && nonbondedMethod != CutoffNonPeriodic); bool usePeriodic = (nonbondedMethod != NoCutoff && nonbondedMethod != CutoffNonPeriodic);
doLJPME = (nonbondedMethod == LJPME && hasLJ); doLJPME = (nonbondedMethod == LJPME && hasLJ);
usePosqCharges = hasCoulomb ? cc.requestPosqCharges() : false; usePosqCharges = hasCoulomb ? cc.requestPosqCharges() : false;
pmeDispersionSpreadWaveSize = 64;
pmeDispersionSpreadBlockSize = 256;
pmeDispersionAtomsPerWave = pmeDispersionSpreadWaveSize/PmeOrder;
pmeDispersionAtomsPerBlock = (pmeDispersionSpreadBlockSize/pmeDispersionSpreadWaveSize)*pmeDispersionAtomsPerWave;
// The LDS spread path assumes wave64 execution and is only used for HIP LJ-PME fixed point spreading.
usePmeDispersionWave64LdsSpread = (getPlatform().getName() == "HIP" &&
doLJPME && useFixedPointChargeSpreading && PmeOrder == 5 &&
cc.getSIMDWidth() == pmeDispersionSpreadWaveSize &&
cc.getMaxThreadBlockSize() >= pmeDispersionSpreadBlockSize);
map<string, string> defines; map<string, string> defines;
defines["HAS_COULOMB"] = (hasCoulomb ? "1" : "0"); defines["HAS_COULOMB"] = (hasCoulomb ? "1" : "0");
defines["HAS_LENNARD_JONES"] = (hasLJ ? "1" : "0"); defines["HAS_LENNARD_JONES"] = (hasLJ ? "1" : "0");
...@@ -833,9 +842,16 @@ double CommonCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -833,9 +842,16 @@ double CommonCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeDefines["RECIP_EXP_FACTOR"] = cc.doubleToString(M_PI*M_PI/(dispersionAlpha*dispersionAlpha)); pmeDefines["RECIP_EXP_FACTOR"] = cc.doubleToString(M_PI*M_PI/(dispersionAlpha*dispersionAlpha));
pmeDefines["USE_LJPME"] = "1"; pmeDefines["USE_LJPME"] = "1";
pmeDefines["CHARGE_FROM_SIGEPS"] = "1"; pmeDefines["CHARGE_FROM_SIGEPS"] = "1";
if (usePmeDispersionWave64LdsSpread) {
pmeDefines["PME_USE_WAVE64_LDS_SPREAD"] = "1";
pmeDefines["PME_SPREAD_WAVE_SIZE"] = cc.intToString(pmeDispersionSpreadWaveSize);
pmeDefines["PME_SPREAD_BLOCK_SIZE"] = cc.intToString(pmeDispersionSpreadBlockSize);
pmeDefines["PME_SPREAD_ATOMS_PER_WAVE"] = cc.intToString(pmeDispersionAtomsPerWave);
pmeDefines["PME_SPREAD_ATOMS_PER_BLOCK"] = cc.intToString(pmeDispersionAtomsPerBlock);
}
program = cc.compileProgram(CommonKernelSources::pme, pmeDefines); program = cc.compileProgram(CommonKernelSources::pme, pmeDefines);
pmeDispersionGridIndexKernel = program->createKernel("findAtomGridIndex"); pmeDispersionGridIndexKernel = program->createKernel("findAtomGridIndex");
pmeDispersionSpreadChargeKernel = program->createKernel("gridSpreadCharge"); pmeDispersionSpreadChargeKernel = program->createKernel(usePmeDispersionWave64LdsSpread ? "gridSpreadChargeWave64Lds" : "gridSpreadCharge");
pmeDispersionConvolutionKernel = program->createKernel("reciprocalConvolution"); pmeDispersionConvolutionKernel = program->createKernel("reciprocalConvolution");
pmeDispersionEvalEnergyKernel = program->createKernel("gridEvaluateEnergy"); pmeDispersionEvalEnergyKernel = program->createKernel("gridEvaluateEnergy");
pmeDispersionInterpolateForceKernel = program->createKernel("gridInterpolateForce"); pmeDispersionInterpolateForceKernel = program->createKernel("gridInterpolateForce");
...@@ -1070,9 +1086,14 @@ double CommonCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -1070,9 +1086,14 @@ double CommonCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeDispersionSpreadChargeKernel->setArg(8, recipBoxVectorsFloat[1]); pmeDispersionSpreadChargeKernel->setArg(8, recipBoxVectorsFloat[1]);
pmeDispersionSpreadChargeKernel->setArg(9, recipBoxVectorsFloat[2]); pmeDispersionSpreadChargeKernel->setArg(9, recipBoxVectorsFloat[2]);
} }
pmeDispersionSpreadChargeKernel->execute(cc.getNumAtoms()); if (usePmeDispersionWave64LdsSpread) {
const int workSize = ((cc.getNumAtoms()+pmeDispersionAtomsPerBlock-1)/pmeDispersionAtomsPerBlock)*pmeDispersionSpreadBlockSize;
pmeDispersionSpreadChargeKernel->execute(workSize, pmeDispersionSpreadBlockSize);
}
else
pmeDispersionSpreadChargeKernel->execute(cc.getNumAtoms());
if (useFixedPointChargeSpreading) if (useFixedPointChargeSpreading)
pmeDispersionFinishSpreadChargeKernel->execute(gridSizeX*gridSizeY*gridSizeZ); pmeDispersionFinishSpreadChargeKernel->execute(dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ);
dispersionFft->execFFT(pmeGrid1, pmeGrid2, true); dispersionFft->execFFT(pmeGrid1, pmeGrid2, true);
if (cc.getUseDoublePrecision()) { if (cc.getUseDoublePrecision()) {
pmeDispersionConvolutionKernel->setArg(4, recipBoxVectors[0]); pmeDispersionConvolutionKernel->setArg(4, recipBoxVectors[0]);
...@@ -1093,8 +1114,8 @@ double CommonCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -1093,8 +1114,8 @@ double CommonCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
if (!hasCoulomb) if (!hasCoulomb)
cc.clearBuffer(pmeEnergyBuffer); cc.clearBuffer(pmeEnergyBuffer);
if (includeEnergy) if (includeEnergy)
pmeDispersionEvalEnergyKernel->execute(gridSizeX*gridSizeY*gridSizeZ); pmeDispersionEvalEnergyKernel->execute(dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ);
pmeDispersionConvolutionKernel->execute(gridSizeX*gridSizeY*gridSizeZ); pmeDispersionConvolutionKernel->execute(dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ);
dispersionFft->execFFT(pmeGrid2, pmeGrid1, false); dispersionFft->execFFT(pmeGrid2, pmeGrid1, false);
setPeriodicBoxArgs(cc, pmeDispersionInterpolateForceKernel, 3); setPeriodicBoxArgs(cc, pmeDispersionInterpolateForceKernel, 3);
if (cc.getUseDoublePrecision()) { if (cc.getUseDoublePrecision()) {
......
...@@ -120,6 +120,131 @@ KERNEL void gridSpreadCharge(GLOBAL const real4* RESTRICT posq, ...@@ -120,6 +120,131 @@ KERNEL void gridSpreadCharge(GLOBAL const real4* RESTRICT posq,
} }
} }
#if defined(USE_HIP) && defined(CHARGE_FROM_SIGEPS) && PME_ORDER == 5 && defined(PME_USE_WAVE64_LDS_SPREAD)
// HIP LJ-PME dispersion spread variant for wave64 hardware. It keeps the generic
// PME spread algorithm unchanged, but maps one atom across PME_ORDER wavefront
// lanes so the five z-slices can be accumulated in parallel after one lane
// computes the atom's charge, grid index, and B-spline coefficients.
KERNEL void gridSpreadChargeWave64Lds(GLOBAL const real4* RESTRICT posq,
#ifdef USE_FIXED_POINT_CHARGE_SPREADING
GLOBAL mm_ulong* RESTRICT pmeGrid,
#else
GLOBAL real* RESTRICT pmeGrid,
#endif
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ, GLOBAL const int2* RESTRICT pmeAtomGridIndex,
GLOBAL const float2* RESTRICT sigmaEpsilon
) {
real3 data[PME_ORDER];
const real scale = RECIP((real) (PME_ORDER-1));
// B-spline coefficients are stored as [localAtom][order] so all wavefront
// lanes for the same atom read contiguous LDS entries during the spread phase.
LOCAL real sharedDataX[PME_SPREAD_BLOCK_SIZE];
LOCAL real sharedDataY[PME_SPREAD_BLOCK_SIZE];
LOCAL real sharedDataZ[PME_SPREAD_BLOCK_SIZE];
LOCAL real sharedCharge[PME_SPREAD_ATOMS_PER_BLOCK];
LOCAL int sharedGridX[PME_SPREAD_ATOMS_PER_BLOCK];
LOCAL int sharedGridY[PME_SPREAD_ATOMS_PER_BLOCK];
LOCAL int sharedGridZ[PME_SPREAD_ATOMS_PER_BLOCK];
// Each wavefront handles floor(PME_SPREAD_WAVE_SIZE/PME_ORDER) atoms. For
// PME_ORDER=5, lanes 0..4 spread one atom, lanes 5..9 spread the next, and
// the remaining lanes in the wavefront are inactive.
const int waveLane = LOCAL_ID & (PME_SPREAD_WAVE_SIZE-1);
const int wave = LOCAL_ID/PME_SPREAD_WAVE_SIZE;
int atomLane = waveLane-(waveLane/PME_ORDER)*PME_ORDER;
int atomInWave = waveLane/PME_ORDER;
int localAtom = wave*PME_SPREAD_ATOMS_PER_WAVE+atomInWave;
const int atomBlockStride = (GLOBAL_SIZE/PME_SPREAD_BLOCK_SIZE)*PME_SPREAD_ATOMS_PER_BLOCK;
for (int atomBlockStart = GROUP_ID*PME_SPREAD_ATOMS_PER_BLOCK; atomBlockStart < NUM_ATOMS; atomBlockStart += atomBlockStride) {
int atomIndex = atomBlockStart+localAtom;
bool isActive = (waveLane < PME_SPREAD_ATOMS_PER_WAVE*PME_ORDER && atomIndex < NUM_ATOMS);
if (isActive && atomLane == 0) {
int atom = pmeAtomGridIndex[atomIndex].x;
real4 pos = posq[atom];
const float2 sigEps = sigmaEpsilon[atom];
const real charge = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
APPLY_PERIODIC_TO_POS(pos)
real3 t = make_real3(pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x,
pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y,
pos.z*recipBoxVecZ.z);
t.x = (t.x-floor(t.x))*GRID_SIZE_X;
t.y = (t.y-floor(t.y))*GRID_SIZE_Y;
t.z = (t.z-floor(t.z))*GRID_SIZE_Z;
int3 gridIndex = make_int3(((int) t.x) % GRID_SIZE_X,
((int) t.y) % GRID_SIZE_Y,
((int) t.z) % GRID_SIZE_Z);
sharedCharge[localAtom] = charge;
sharedGridX[localAtom] = gridIndex.x;
sharedGridY[localAtom] = gridIndex.y;
sharedGridZ[localAtom] = gridIndex.z;
real3 dr = make_real3(t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z);
data[PME_ORDER-1] = make_real3(0);
data[1] = dr;
data[0] = make_real3(1)-dr;
for (int j = 3; j < PME_ORDER; j++) {
real div = RECIP((real) (j-1));
data[j-1] = div*dr*data[j-2];
for (int k = 1; k < (j-1); k++)
data[j-k-1] = div*((dr+make_real3(k))*data[j-k-2] + (make_real3(j-k)-dr)*data[j-k-1]);
data[0] = div*(make_real3(1)-dr)*data[0];
}
data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
for (int j = 1; j < (PME_ORDER-1); j++)
data[PME_ORDER-j-1] = scale*((dr+make_real3(j))*data[PME_ORDER-j-2] + (make_real3(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
data[0] = scale*(make_real3(1)-dr)*data[0];
for (int j = 0; j < PME_ORDER; j++) {
int offset = localAtom*PME_ORDER+j;
sharedDataX[offset] = data[j].x;
sharedDataY[offset] = data[j].y;
sharedDataZ[offset] = data[j].z;
}
}
// All consumers of an atom's LDS data are in the same wavefront as the
// producing lane, so a wave-level barrier is sufficient here.
SYNC_WARPS
if (isActive) {
real charge = sharedCharge[localAtom];
if (charge != 0) {
int gridX = sharedGridX[localAtom];
int gridY = sharedGridY[localAtom];
int gridZ = sharedGridZ[localAtom];
int zindex = gridZ+atomLane;
zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
real dz = sharedDataZ[localAtom*PME_ORDER+atomLane]*charge;
for (int ix = 0; ix < PME_ORDER; ix++) {
int xbase = gridX+ix;
xbase -= (xbase >= GRID_SIZE_X ? GRID_SIZE_X : 0);
xbase = xbase*GRID_SIZE_Y*GRID_SIZE_Z;
real dzdx = dz*sharedDataX[localAtom*PME_ORDER+ix];
for (int iy = 0; iy < PME_ORDER; iy++) {
int ybase = gridY+iy;
ybase -= (ybase >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
ybase = ybase*GRID_SIZE_Z;
int index = xbase + ybase + zindex;
real add = dzdx*sharedDataY[localAtom*PME_ORDER+iy];
if (fabs(add) > 2.3e-10f) { // Smallest value representable in 64 bit fixed point
#ifdef USE_FIXED_POINT_CHARGE_SPREADING
ATOMIC_ADD(&pmeGrid[index], (mm_ulong) realToFixedPoint(add));
#if defined(__GFX12__) && defined(USE_HIP)
// Workaround for rare cases when few values of pmeGrid are very large and
// incorrect. The cause is unknown. Why this workaround or other irrelevant
// changes like printf help is also unknown.
asm volatile("s_wait_storecnt 0x0");
#endif
#else
ATOMIC_ADD(&pmeGrid[index], add);
#endif
}
}
}
}
}
SYNC_WARPS
}
}
#endif
#ifdef USE_FIXED_POINT_CHARGE_SPREADING #ifdef USE_FIXED_POINT_CHARGE_SPREADING
KERNEL void finishSpreadCharge( KERNEL void finishSpreadCharge(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment