Unverified Commit 8ea42950 authored by Anton Gorenko's avatar Anton Gorenko Committed by GitHub
Browse files

Optimize PME spread charge kernel (#4633)

* PME_ORDER threads process one atom;
* PME_ORDER threads access consecutive addresses;
* No need to permute z indices with zindexTable;
* finishSpreadCharge is needed only with fixed point charge spreading;
parent edb74aec
......@@ -20,9 +20,6 @@ KERNEL void findAtomGridIndex(GLOBAL const real4* RESTRICT posq, GLOBAL int2* RE
}
}
#if defined(USE_HIP) && !defined(AMD_RDNA)
LAUNCH_BOUNDS_EXACT(128, 1)
#endif
KERNEL void gridSpreadCharge(GLOBAL const real4* RESTRICT posq,
#ifdef USE_FIXED_POINT_CHARGE_SPREADING
GLOBAL mm_ulong* RESTRICT pmeGrid,
......@@ -37,31 +34,13 @@ KERNEL void gridSpreadCharge(GLOBAL const real4* RESTRICT posq,
GLOBAL const real* RESTRICT charges
#endif
) {
// HIP-TODO: Workaround for RDNA, remove it when the compiler issue is fixed
#if defined(USE_HIP)
(void)GLOBAL_ID;
#endif
// To improve memory efficiency, we divide indices along the z axis into
// PME_ORDER blocks, where the data for each block is stored together. We
// can ensure that all threads write to the same block at the same time,
// which leads to better coalescing of writes.
LOCAL int zindexTable[GRID_SIZE_Z+PME_ORDER];
int blockSize = (int) ceil(GRID_SIZE_Z/(real) PME_ORDER);
for (int i = LOCAL_ID; i < GRID_SIZE_Z+PME_ORDER; i += LOCAL_SIZE) {
int zindex = i % GRID_SIZE_Z;
int block = zindex % PME_ORDER;
zindexTable[i] = zindex/PME_ORDER + block*GRID_SIZE_X*GRID_SIZE_Y*blockSize;
}
SYNC_THREADS;
// Process the atoms in spatially sorted order. This improves efficiency when writing
// the grid values.
// the grid values. PME_ORDER threads process one atom.
real3 data[PME_ORDER];
const real scale = RECIP((real) (PME_ORDER-1));
for (int i = GLOBAL_ID; i < NUM_ATOMS; i += GLOBAL_SIZE) {
int atom = pmeAtomGridIndex[i].x;
for (int i = GLOBAL_ID; i < NUM_ATOMS*PME_ORDER; i += GLOBAL_SIZE) {
int atom = pmeAtomGridIndex[i/PME_ORDER].x;
real4 pos = posq[atom];
#ifdef CHARGE_FROM_SIGEPS
const float2 sigEps = sigmaEpsilon[atom];
......@@ -101,71 +80,52 @@ KERNEL void gridSpreadCharge(GLOBAL const real4* RESTRICT posq,
data[PME_ORDER-j-1] = scale*((dr+make_real3(j))*data[PME_ORDER-j-2] + (make_real3(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
data[0] = scale*(make_real3(1)-dr)*data[0];
// Spread the charge from this atom onto each grid point.
// Spread the charge from this atom onto each grid point. PME_ORDER threads access
// consecutive addresses.
int izoffset = (PME_ORDER-(gridIndex.z%PME_ORDER)) % PME_ORDER;
int iz = i%PME_ORDER;
int zindex = gridIndex.z+iz;
zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
real dz = 0;
for (int i = 0; i < PME_ORDER; i++) {
dz = i == iz ? data[i].z : dz;
}
dz *= charge;
for (int ix = 0; ix < PME_ORDER; ix++) {
int xbase = gridIndex.x+ix;
xbase -= (xbase >= GRID_SIZE_X ? GRID_SIZE_X : 0);
xbase = xbase*GRID_SIZE_Y;
real dx = charge*data[ix].x;
xbase = xbase*GRID_SIZE_Y*GRID_SIZE_Z;
real dzdx = dz*data[ix].x;
for (int iy = 0; iy < PME_ORDER; iy++) {
int ybase = gridIndex.y+iy;
ybase -= (ybase >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
ybase = (xbase+ybase)*blockSize;
real dxdy = dx*data[iy].y;
for (int i = 0; i < PME_ORDER; i++) {
int iz = (i+izoffset) % PME_ORDER;
int zindex = gridIndex.z+iz;
int index = ybase + zindexTable[zindex];
real add = dxdy*data[iz].z;
ybase = ybase*GRID_SIZE_Z;
int index = xbase + ybase + zindex;
real add = dzdx*data[iy].y;
#ifdef USE_FIXED_POINT_CHARGE_SPREADING
ATOMIC_ADD(&pmeGrid[index], (mm_ulong) realToFixedPoint(add));
ATOMIC_ADD(&pmeGrid[index], (mm_ulong) realToFixedPoint(add));
#else
ATOMIC_ADD(&pmeGrid[index], add);
ATOMIC_ADD(&pmeGrid[index], add);
#endif
}
}
}
}
}
KERNEL void finishSpreadCharge(
#ifdef USE_FIXED_POINT_CHARGE_SPREADING
KERNEL void finishSpreadCharge(
GLOBAL const mm_long* RESTRICT grid1,
#else
GLOBAL const real* RESTRICT grid1,
#endif
GLOBAL real* RESTRICT grid2) {
// HIP-TODO: Workaround for RDNA, remove it when the compiler issue is fixed
#if defined(USE_HIP)
(void)GLOBAL_ID;
#endif
// During charge spreading, we shuffled the order of indices along the z
// axis to make memory access more efficient. We now need to unshuffle
// them. If the values were accumulated as fixed point, we also need to
// convert them to floating point.
LOCAL int zindexTable[GRID_SIZE_Z];
int blockSize = (int) ceil(GRID_SIZE_Z/(real) PME_ORDER);
for (int i = LOCAL_ID; i < GRID_SIZE_Z; i += LOCAL_SIZE) {
int block = i % PME_ORDER;
zindexTable[i] = i/PME_ORDER + block*GRID_SIZE_X*GRID_SIZE_Y*blockSize;
}
SYNC_THREADS;
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
real scale = 1/(real) 0x100000000;
for (int index = GLOBAL_ID; index < gridSize; index += GLOBAL_SIZE) {
int zindex = index%GRID_SIZE_Z;
int loadIndex = zindexTable[zindex] + blockSize*(int) (index/GRID_SIZE_Z);
#ifdef USE_FIXED_POINT_CHARGE_SPREADING
grid2[index] = scale*grid1[loadIndex];
#else
grid2[index] = grid1[loadIndex];
#endif
grid2[index] = scale*grid1[index];
}
}
#endif
KERNEL void reciprocalConvolution(GLOBAL real2* RESTRICT pmeGrid, GLOBAL const real* RESTRICT pmeBsplineModuliX,
GLOBAL const real* RESTRICT pmeBsplineModuliY, GLOBAL const real* RESTRICT pmeBsplineModuliZ,
real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
......@@ -267,7 +227,7 @@ KERNEL void gridEvaluateEnergy(GLOBAL real2* RESTRICT pmeGrid, GLOBAL mixed* RES
kx = ((kx == 0) ? kx : GRID_SIZE_X-kx);
ky = ((ky == 0) ? ky : GRID_SIZE_Y-ky);
kz = GRID_SIZE_Z-kz;
}
}
int indexInHalfComplexGrid = kz + ky*(GRID_SIZE_Z/2+1)+kx*(GRID_SIZE_Y*(GRID_SIZE_Z/2+1));
real2 grid = pmeGrid[indexInHalfComplexGrid];
#ifndef USE_LJPME
......@@ -282,9 +242,6 @@ KERNEL void gridEvaluateEnergy(GLOBAL real2* RESTRICT pmeGrid, GLOBAL mixed* RES
#endif
}
#if defined(USE_HIP) && !defined(AMD_RDNA) && !defined(USE_DOUBLE_PRECISION)
LAUNCH_BOUNDS_EXACT(128, 1)
#endif
KERNEL void gridInterpolateForce(GLOBAL const real4* RESTRICT posq, GLOBAL mm_ulong* RESTRICT forceBuffers, GLOBAL const real* RESTRICT pmeGrid,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ, GLOBAL const int2* RESTRICT pmeAtomGridIndex,
......@@ -297,10 +254,10 @@ KERNEL void gridInterpolateForce(GLOBAL const real4* RESTRICT posq, GLOBAL mm_ul
real3 data[PME_ORDER];
real3 ddata[PME_ORDER];
const real scale = RECIP((real) (PME_ORDER-1));
// Process the atoms in spatially sorted order. This improves cache performance when loading
// the grid values.
for (int i = GLOBAL_ID; i < NUM_ATOMS; i += GLOBAL_SIZE) {
int atom = pmeAtomGridIndex[i].x;
real3 force = make_real3(0);
......@@ -315,6 +272,14 @@ KERNEL void gridInterpolateForce(GLOBAL const real4* RESTRICT posq, GLOBAL mm_ul
int3 gridIndex = make_int3(((int) t.x) % GRID_SIZE_X,
((int) t.y) % GRID_SIZE_Y,
((int) t.z) % GRID_SIZE_Z);
#ifdef CHARGE_FROM_SIGEPS
const float2 sigEps = sigmaEpsilon[atom];
real q = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
#else
real q = CHARGE*EPSILON_FACTOR;
#endif
if (q == 0)
continue;
// Since we need the full set of thetas, it's faster to compute them here than load them
// from global memory.
......@@ -346,14 +311,14 @@ KERNEL void gridInterpolateForce(GLOBAL const real4* RESTRICT posq, GLOBAL mm_ul
xbase = xbase*GRID_SIZE_Y*GRID_SIZE_Z;
real dx = data[ix].x;
real ddx = ddata[ix].x;
for (int iy = 0; iy < PME_ORDER; iy++) {
int ybase = gridIndex.y+iy;
ybase -= (ybase >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
ybase = xbase + ybase*GRID_SIZE_Z;
real dy = data[iy].y;
real ddy = ddata[iy].y;
for (int iz = 0; iz < PME_ORDER; iz++) {
int zindex = gridIndex.z+iz;
zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
......@@ -365,12 +330,6 @@ KERNEL void gridInterpolateForce(GLOBAL const real4* RESTRICT posq, GLOBAL mm_ul
}
}
}
#ifdef CHARGE_FROM_SIGEPS
const float2 sigEps = sigmaEpsilon[atom];
real q = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
#else
real q = CHARGE*EPSILON_FACTOR;
#endif
real forceX = -q*(force.x*GRID_SIZE_X*recipBoxVecX.x);
real forceY = -q*(force.x*GRID_SIZE_X*recipBoxVecY.x+force.y*GRID_SIZE_Y*recipBoxVecY.y);
real forceZ = -q*(force.x*GRID_SIZE_X*recipBoxVecZ.x+force.y*GRID_SIZE_Y*recipBoxVecZ.y+force.z*GRID_SIZE_Z*recipBoxVecZ.z);
......
......@@ -89,7 +89,7 @@ private:
class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
public:
CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcNonbondedForceKernel(name, platform),
cu(cu), hasInitializedFFT(false), sort(NULL), dispersionFft(NULL), fft(NULL), pmeio(NULL), usePmeStream(false) {
cu(cu), hasInitializedFFT(false), sort(NULL), dispersionFft(NULL), fft(NULL), pmeio(NULL), useFixedPointChargeSpreading(false), usePmeStream(false) {
}
~CudaCalcNonbondedForceKernel();
/**
......@@ -215,7 +215,7 @@ private:
int interpolateForceThreads;
int gridSizeX, gridSizeY, gridSizeZ;
int dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ;
bool hasCoulomb, hasLJ, usePmeStream, useCudaFFT, doLJPME, usePosqCharges, recomputeParams, hasOffsets;
bool hasCoulomb, hasLJ, useFixedPointChargeSpreading, usePmeStream, useCudaFFT, doLJPME, usePosqCharges, recomputeParams, hasOffsets;
NonbondedMethod nonbondedMethod;
static const int PmeOrder = 5;
};
......@@ -244,4 +244,3 @@ public:
} // namespace OpenMM
#endif /*OPENMM_CUDAKERNELS_H_*/
......@@ -427,7 +427,8 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
pmeDefines["GRID_SIZE_Z"] = cu.intToString(gridSizeZ);
pmeDefines["EPSILON_FACTOR"] = cu.doubleToString(sqrt(ONE_4PI_EPS0));
pmeDefines["M_PI"] = cu.doubleToString(M_PI);
if (cu.getUseDoublePrecision() || cu.getPlatformData().deterministicForces)
useFixedPointChargeSpreading = cu.getUseDoublePrecision() || cu.getPlatformData().deterministicForces;
if (useFixedPointChargeSpreading)
pmeDefines["USE_FIXED_POINT_CHARGE_SPREADING"] = "1";
if (usePmeStream)
pmeDefines["USE_PME_STREAM"] = "1";
......@@ -455,7 +456,8 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
pmeConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
pmeInterpolateForceKernel = cu.getKernel(module, "gridInterpolateForce");
pmeEvalEnergyKernel = cu.getKernel(module, "gridEvaluateEnergy");
pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
if (useFixedPointChargeSpreading)
pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
cuFuncSetCacheConfig(pmeSpreadChargeKernel, CU_FUNC_CACHE_PREFER_SHARED);
cuFuncSetCacheConfig(pmeInterpolateForceKernel, CU_FUNC_CACHE_PREFER_L1);
if (doLJPME) {
......@@ -466,10 +468,9 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
pmeDefines["RECIP_EXP_FACTOR"] = cu.doubleToString(M_PI*M_PI/(dispersionAlpha*dispersionAlpha));
pmeDefines["USE_LJPME"] = "1";
pmeDefines["CHARGE_FROM_SIGEPS"] = "1";
if (cu.getUseDoublePrecision() || cu.getPlatformData().deterministicForces)
pmeDefines["USE_FIXED_POINT_CHARGE_SPREADING"] = "1";
module = cu.createModule(CudaKernelSources::vectorOps+CommonKernelSources::pme, pmeDefines);
pmeDispersionFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
if (useFixedPointChargeSpreading)
pmeDispersionFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
pmeDispersionGridIndexKernel = cu.getKernel(module, "findAtomGridIndex");
pmeDispersionSpreadChargeKernel = cu.getKernel(module, "gridSpreadCharge");
pmeDispersionConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
......@@ -481,15 +482,16 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
// Create required data structures.
int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
int roundedZSize = PmeOrder*(int) ceil(gridSizeZ/(double) PmeOrder);
int gridElements = gridSizeX*gridSizeY*roundedZSize;
int gridElements = gridSizeX*gridSizeY*gridSizeZ;
if (doLJPME) {
roundedZSize = PmeOrder*(int) ceil(dispersionGridSizeZ/(double) PmeOrder);
gridElements = max(gridElements, dispersionGridSizeX*dispersionGridSizeY*roundedZSize);
gridElements = max(gridElements, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ);
}
pmeGrid1.initialize(cu, gridElements, 2*elementSize, "pmeGrid1");
pmeGrid2.initialize(cu, gridElements, 2*elementSize, "pmeGrid2");
cu.addAutoclearBuffer(pmeGrid2);
if (useFixedPointChargeSpreading)
cu.addAutoclearBuffer(pmeGrid2);
else
cu.addAutoclearBuffer(pmeGrid1);
pmeBsplineModuliX.initialize(cu, gridSizeX, elementSize, "pmeBsplineModuliX");
pmeBsplineModuliY.initialize(cu, gridSizeY, elementSize, "pmeBsplineModuliY");
pmeBsplineModuliZ.initialize(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
......@@ -890,6 +892,10 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
// Execute the reciprocal space kernels.
// With fixed point charge spreading, finishSpreadCharge kernel is not needed as
// gridSpreadCharge can write directly to pmeGrid1.
CudaArray& pmeSpreadDstGrid = useFixedPointChargeSpreading ? pmeGrid2 : pmeGrid1;
if (hasCoulomb) {
void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
......@@ -898,14 +904,16 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
sort->sort(pmeAtomGridIndex);
void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &pmeGrid2.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &pmeSpreadDstGrid.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&charges.getDevicePointer()};
cu.executeKernel(pmeSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128);
cu.executeKernel(pmeSpreadChargeKernel, spreadArgs, PmeOrder*cu.getNumAtoms(), 128);
void* finishSpreadArgs[] = {&pmeGrid2.getDevicePointer(), &pmeGrid1.getDevicePointer()};
cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
if (useFixedPointChargeSpreading) {
void* finishSpreadArgs[] = {&pmeGrid2.getDevicePointer(), &pmeGrid1.getDevicePointer()};
cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
}
if (useCudaFFT) {
if (cu.getUseDoublePrecision()) {
......@@ -967,15 +975,17 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
cu.clearBuffer(pmeEnergyBuffer);
}
cu.clearBuffer(pmeGrid2);
void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &pmeGrid2.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.clearBuffer(pmeSpreadDstGrid);
void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &pmeSpreadDstGrid.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&sigmaEpsilon.getDevicePointer()};
cu.executeKernel(pmeDispersionSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128);
cu.executeKernel(pmeDispersionSpreadChargeKernel, spreadArgs, PmeOrder*cu.getNumAtoms(), 128);
void* finishSpreadArgs[] = {&pmeGrid2.getDevicePointer(), &pmeGrid1.getDevicePointer()};
cu.executeKernel(pmeDispersionFinishSpreadChargeKernel, finishSpreadArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
if (useFixedPointChargeSpreading) {
void* finishSpreadArgs[] = {&pmeGrid2.getDevicePointer(), &pmeGrid1.getDevicePointer()};
cu.executeKernel(pmeDispersionFinishSpreadChargeKernel, finishSpreadArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
}
if (useCudaFFT) {
if (cu.getUseDoublePrecision()) {
......
......@@ -89,7 +89,7 @@ private:
class HipCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
public:
HipCalcNonbondedForceKernel(std::string name, const Platform& platform, HipContext& cu, const System& system) : CalcNonbondedForceKernel(name, platform),
cu(cu), hasInitializedFFT(false), sort(NULL), dispersionFft(NULL), fft(NULL), pmeio(NULL), usePmeStream(false) {
cu(cu), hasInitializedFFT(false), sort(NULL), dispersionFft(NULL), fft(NULL), pmeio(NULL), useFixedPointChargeSpreading(false), usePmeStream(false) {
}
~HipCalcNonbondedForceKernel();
/**
......@@ -211,7 +211,7 @@ private:
int interpolateForceThreads;
int gridSizeX, gridSizeY, gridSizeZ;
int dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ;
bool hasCoulomb, hasLJ, usePmeStream, doLJPME, usePosqCharges, recomputeParams, hasOffsets;
bool hasCoulomb, hasLJ, useFixedPointChargeSpreading, usePmeStream, doLJPME, usePosqCharges, recomputeParams, hasOffsets;
NonbondedMethod nonbondedMethod;
static const int PmeOrder = 5;
};
......
......@@ -418,7 +418,8 @@ void HipCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
pmeDefines["GRID_SIZE_Z"] = cu.intToString(gridSizeZ);
pmeDefines["EPSILON_FACTOR"] = cu.doubleToString(sqrt(ONE_4PI_EPS0));
pmeDefines["M_PI"] = cu.doubleToString(M_PI);
if (cu.getUseDoublePrecision() || !cu.getSupportsHardwareFloatGlobalAtomicAdd() || cu.getPlatformData().deterministicForces)
useFixedPointChargeSpreading = cu.getUseDoublePrecision() || !cu.getSupportsHardwareFloatGlobalAtomicAdd() || cu.getPlatformData().deterministicForces;
if (useFixedPointChargeSpreading)
pmeDefines["USE_FIXED_POINT_CHARGE_SPREADING"] = "1";
if (usePmeStream)
pmeDefines["USE_PME_STREAM"] = "1";
......@@ -446,9 +447,8 @@ void HipCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
pmeConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
pmeInterpolateForceKernel = cu.getKernel(module, "gridInterpolateForce");
pmeEvalEnergyKernel = cu.getKernel(module, "gridEvaluateEnergy");
pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
hipFuncSetCacheConfig(pmeSpreadChargeKernel, hipFuncCachePreferShared);
hipFuncSetCacheConfig(pmeInterpolateForceKernel, hipFuncCachePreferL1);
if (useFixedPointChargeSpreading)
pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
if (doLJPME) {
pmeDefines["EWALD_ALPHA"] = cu.doubleToString(dispersionAlpha);
pmeDefines["GRID_SIZE_X"] = cu.intToString(dispersionGridSizeX);
......@@ -458,27 +458,28 @@ void HipCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
pmeDefines["USE_LJPME"] = "1";
pmeDefines["CHARGE_FROM_SIGEPS"] = "1";
module = cu.createModule(HipKernelSources::vectorOps+CommonKernelSources::pme, pmeDefines);
pmeDispersionFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
if (useFixedPointChargeSpreading)
pmeDispersionFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
pmeDispersionGridIndexKernel = cu.getKernel(module, "findAtomGridIndex");
pmeDispersionSpreadChargeKernel = cu.getKernel(module, "gridSpreadCharge");
pmeDispersionConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
pmeEvalDispersionEnergyKernel = cu.getKernel(module, "gridEvaluateEnergy");
pmeInterpolateDispersionForceKernel = cu.getKernel(module, "gridInterpolateForce");
hipFuncSetCacheConfig(pmeDispersionSpreadChargeKernel, hipFuncCachePreferL1);
}
// Create required data structures.
int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
int roundedZSize = PmeOrder*(int) ceil(gridSizeZ/(double) PmeOrder);
int gridElements = gridSizeX*gridSizeY*roundedZSize;
int gridElements = gridSizeX*gridSizeY*gridSizeZ;
if (doLJPME) {
roundedZSize = PmeOrder*(int) ceil(dispersionGridSizeZ/(double) PmeOrder);
gridElements = max(gridElements, dispersionGridSizeX*dispersionGridSizeY*roundedZSize);
gridElements = max(gridElements, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ);
}
pmeGrid1.initialize(cu, gridElements, 2*elementSize, "pmeGrid1");
pmeGrid2.initialize(cu, gridElements, 2*elementSize, "pmeGrid2");
cu.addAutoclearBuffer(pmeGrid2);
if (useFixedPointChargeSpreading)
cu.addAutoclearBuffer(pmeGrid2);
else
cu.addAutoclearBuffer(pmeGrid1);
pmeBsplineModuliX.initialize(cu, gridSizeX, elementSize, "pmeBsplineModuliX");
pmeBsplineModuliY.initialize(cu, gridSizeY, elementSize, "pmeBsplineModuliY");
pmeBsplineModuliZ.initialize(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
......@@ -850,6 +851,10 @@ double HipCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
// Execute the reciprocal space kernels.
// With fixed point charge spreading, finishSpreadCharge kernel is not needed as
// gridSpreadCharge can write directly to pmeGrid1.
HipArray& pmeSpreadDstGrid = useFixedPointChargeSpreading ? pmeGrid2 : pmeGrid1;
if (hasCoulomb) {
void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
......@@ -858,14 +863,16 @@ double HipCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
sort->sort(pmeAtomGridIndex);
void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &pmeGrid2.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &pmeSpreadDstGrid.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&charges.getDevicePointer()};
cu.executeKernelFlat(pmeSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128);
cu.executeKernelFlat(pmeSpreadChargeKernel, spreadArgs, PmeOrder*cu.getNumAtoms(), 128);
void* finishSpreadArgs[] = {&pmeGrid2.getDevicePointer(), &pmeGrid1.getDevicePointer()};
cu.executeKernelFlat(pmeFinishSpreadChargeKernel, finishSpreadArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
if (useFixedPointChargeSpreading) {
void* finishSpreadArgs[] = {&pmeGrid2.getDevicePointer(), &pmeGrid1.getDevicePointer()};
cu.executeKernelFlat(pmeFinishSpreadChargeKernel, finishSpreadArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
}
fft->execFFT(true);
......@@ -879,7 +886,7 @@ double HipCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
void* convolutionArgs[] = {&pmeGrid2.getDevicePointer(), &pmeBsplineModuliX.getDevicePointer(),
&pmeBsplineModuliY.getDevicePointer(), &pmeBsplineModuliZ.getDevicePointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeConvolutionKernel, convolutionArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
cu.executeKernelFlat(pmeConvolutionKernel, convolutionArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
fft->execFFT(false);
......@@ -887,7 +894,7 @@ double HipCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&charges.getDevicePointer()};
cu.executeKernel(pmeInterpolateForceKernel, interpolateArgs, cu.getNumAtoms(), 128);
cu.executeKernelFlat(pmeInterpolateForceKernel, interpolateArgs, cu.getNumAtoms(), 128);
}
if (doLJPME && hasLJ) {
......@@ -901,15 +908,17 @@ double HipCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
cu.clearBuffer(pmeEnergyBuffer);
}
cu.clearBuffer(pmeGrid2);
void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &pmeGrid2.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.clearBuffer(pmeSpreadDstGrid);
void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &pmeSpreadDstGrid.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&sigmaEpsilon.getDevicePointer()};
cu.executeKernelFlat(pmeDispersionSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128);
cu.executeKernelFlat(pmeDispersionSpreadChargeKernel, spreadArgs, PmeOrder*cu.getNumAtoms(), 128);
void* finishSpreadArgs[] = {&pmeGrid2.getDevicePointer(), &pmeGrid1.getDevicePointer()};
cu.executeKernelFlat(pmeDispersionFinishSpreadChargeKernel, finishSpreadArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
if (useFixedPointChargeSpreading) {
void* finishSpreadArgs[] = {&pmeGrid2.getDevicePointer(), &pmeGrid1.getDevicePointer()};
cu.executeKernelFlat(pmeDispersionFinishSpreadChargeKernel, finishSpreadArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
}
dispersionFft->execFFT(true);
......@@ -923,7 +932,7 @@ double HipCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
void* convolutionArgs[] = {&pmeGrid2.getDevicePointer(), &pmeDispersionBsplineModuliX.getDevicePointer(),
&pmeDispersionBsplineModuliY.getDevicePointer(), &pmeDispersionBsplineModuliZ.getDevicePointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeDispersionConvolutionKernel, convolutionArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
cu.executeKernelFlat(pmeDispersionConvolutionKernel, convolutionArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
dispersionFft->execFFT(false);
......@@ -931,7 +940,7 @@ double HipCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&sigmaEpsilon.getDevicePointer()};
cu.executeKernel(pmeInterpolateDispersionForceKernel, interpolateArgs, cu.getNumAtoms(), 128);
cu.executeKernelFlat(pmeInterpolateDispersionForceKernel, interpolateArgs, cu.getNumAtoms(), 128);
}
if (usePmeStream) {
hipEventRecord(pmeSyncEvent, pmeStream);
......
......@@ -25,9 +25,6 @@ typedef unsigned long long mm_ulong;
#define SUPPORTS_DOUBLE_PRECISION 1
#define LAUNCH_BOUNDS_EXACT(WORK_GROUP_SIZE, WAVES_PER_EU) \
__attribute__((amdgpu_flat_work_group_size(WORK_GROUP_SIZE, WORK_GROUP_SIZE), amdgpu_waves_per_eu(WAVES_PER_EU, WAVES_PER_EU)))
#ifdef USE_DOUBLE_PRECISION
__device__ inline long long realToFixedPoint(double x) {
......
......@@ -449,11 +449,9 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
// Create required data structures.
int elementSize = (cl.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
int roundedZSize = PmeOrder*(int) ceil(gridSizeZ/(double) PmeOrder);
int gridElements = gridSizeX*gridSizeY*roundedZSize;
int gridElements = gridSizeX*gridSizeY*gridSizeZ;
if (doLJPME) {
roundedZSize = PmeOrder*(int) ceil(dispersionGridSizeZ/(double) PmeOrder);
gridElements = max(gridElements, dispersionGridSizeX*dispersionGridSizeY*roundedZSize);
gridElements = max(gridElements, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ);
}
pmeGrid1.initialize(cl, gridElements, 2*elementSize, "pmeGrid1");
pmeGrid2.initialize(cl, gridElements, 2*elementSize, "pmeGrid2");
......
......@@ -2604,10 +2604,8 @@ void CommonCalcHippoNonbondedForceKernel::initialize(const System& system, const
// Create required data structures.
int roundedZSize = PmeOrder*(int) ceil(gridSizeZ/(double) PmeOrder);
int gridElements = gridSizeX*gridSizeY*roundedZSize;
roundedZSize = PmeOrder*(int) ceil(dispersionGridSizeZ/(double) PmeOrder);
gridElements = max(gridElements, dispersionGridSizeX*dispersionGridSizeY*roundedZSize);
int gridElements = gridSizeX*gridSizeY*gridSizeZ;
gridElements = max(gridElements, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ);
pmeGrid1.initialize(cc, gridElements, elementSize, "pmeGrid1");
pmeGrid2.initialize(cc, gridElements, 2*elementSize, "pmeGrid2");
if (useFixedPointChargeSpreading()) {
......@@ -2785,9 +2783,11 @@ void CommonCalcHippoNonbondedForceKernel::initialize(const System& system, const
pmeDefines["CHARGE"] = "charges[atom]";
pmeDefines["USE_LJPME"] = "1";
program = cc.compileProgram(CommonKernelSources::pme, pmeDefines);
dpmeFinishSpreadChargeKernel = program->createKernel("finishSpreadCharge");
dpmeFinishSpreadChargeKernel->addArg(pmeGrid2);
dpmeFinishSpreadChargeKernel->addArg(pmeGrid1);
if (useFixedPointChargeSpreading()) {
dpmeFinishSpreadChargeKernel = program->createKernel("finishSpreadCharge");
dpmeFinishSpreadChargeKernel->addArg(pmeGrid2);
dpmeFinishSpreadChargeKernel->addArg(pmeGrid1);
}
dpmeGridIndexKernel = program->createKernel("findAtomGridIndex");
dpmeGridIndexKernel->addArg(cc.getPosq());
dpmeGridIndexKernel->addArg(pmeAtomGridIndex);
......@@ -2795,7 +2795,10 @@ void CommonCalcHippoNonbondedForceKernel::initialize(const System& system, const
dpmeGridIndexKernel->addArg();
dpmeSpreadChargeKernel = program->createKernel("gridSpreadCharge");
dpmeSpreadChargeKernel->addArg(cc.getPosq());
dpmeSpreadChargeKernel->addArg(pmeGrid2);
if (useFixedPointChargeSpreading())
dpmeSpreadChargeKernel->addArg(pmeGrid2);
else
dpmeSpreadChargeKernel->addArg(pmeGrid1);
for (int i = 0; i < 8; i++)
dpmeSpreadChargeKernel->addArg();
dpmeSpreadChargeKernel->addArg(pmeAtomGridIndex);
......@@ -3266,9 +3269,13 @@ double CommonCalcHippoNonbondedForceKernel::execute(ContextImpl& context, bool i
dpmeGridIndexKernel->execute(cc.getNumAtoms());
sortGridIndex();
cc.clearBuffer(pmeGrid2);
dpmeSpreadChargeKernel->execute(cc.getNumAtoms(), 128);
dpmeFinishSpreadChargeKernel->execute(dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
if (useFixedPointChargeSpreading())
cc.clearBuffer(pmeGrid2);
else
cc.clearBuffer(pmeGrid1);
dpmeSpreadChargeKernel->execute(PmeOrder*cc.getNumAtoms(), 128);
if (useFixedPointChargeSpreading())
dpmeFinishSpreadChargeKernel->execute(dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
computeFFT(true, true);
if (includeEnergy)
dpmeEvalEnergyKernel->execute(dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ);
......@@ -3553,4 +3560,4 @@ void CommonCalcHippoNonbondedForceKernel::getDPMEParameters(double& alpha, int&
nx = dispersionGridSizeX;
ny = dispersionGridSizeY;
nz = dispersionGridSizeZ;
}
\ No newline at end of file
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment