Unverified Commit 8ea42950 authored by Anton Gorenko's avatar Anton Gorenko Committed by GitHub
Browse files

Optimize PME spread charge kernel (#4633)

* PME_ORDER threads process one atom;
* PME_ORDER threads access consecutive addresses;
* No need to permute z indices with zindexTable;
* finishSpreadCharge is needed only with fixed point charge spreading;
parent edb74aec
...@@ -20,9 +20,6 @@ KERNEL void findAtomGridIndex(GLOBAL const real4* RESTRICT posq, GLOBAL int2* RE ...@@ -20,9 +20,6 @@ KERNEL void findAtomGridIndex(GLOBAL const real4* RESTRICT posq, GLOBAL int2* RE
} }
} }
#if defined(USE_HIP) && !defined(AMD_RDNA)
LAUNCH_BOUNDS_EXACT(128, 1)
#endif
KERNEL void gridSpreadCharge(GLOBAL const real4* RESTRICT posq, KERNEL void gridSpreadCharge(GLOBAL const real4* RESTRICT posq,
#ifdef USE_FIXED_POINT_CHARGE_SPREADING #ifdef USE_FIXED_POINT_CHARGE_SPREADING
GLOBAL mm_ulong* RESTRICT pmeGrid, GLOBAL mm_ulong* RESTRICT pmeGrid,
...@@ -37,31 +34,13 @@ KERNEL void gridSpreadCharge(GLOBAL const real4* RESTRICT posq, ...@@ -37,31 +34,13 @@ KERNEL void gridSpreadCharge(GLOBAL const real4* RESTRICT posq,
GLOBAL const real* RESTRICT charges GLOBAL const real* RESTRICT charges
#endif #endif
) { ) {
// HIP-TODO: Workaround for RDNA, remove it when the compiler issue is fixed
#if defined(USE_HIP)
(void)GLOBAL_ID;
#endif
// To improve memory efficiency, we divide indices along the z axis into
// PME_ORDER blocks, where the data for each block is stored together. We
// can ensure that all threads write to the same block at the same time,
// which leads to better coalescing of writes.
LOCAL int zindexTable[GRID_SIZE_Z+PME_ORDER];
int blockSize = (int) ceil(GRID_SIZE_Z/(real) PME_ORDER);
for (int i = LOCAL_ID; i < GRID_SIZE_Z+PME_ORDER; i += LOCAL_SIZE) {
int zindex = i % GRID_SIZE_Z;
int block = zindex % PME_ORDER;
zindexTable[i] = zindex/PME_ORDER + block*GRID_SIZE_X*GRID_SIZE_Y*blockSize;
}
SYNC_THREADS;
// Process the atoms in spatially sorted order. This improves efficiency when writing // Process the atoms in spatially sorted order. This improves efficiency when writing
// the grid values. // the grid values. PME_ORDER threads process one atom.
real3 data[PME_ORDER]; real3 data[PME_ORDER];
const real scale = RECIP((real) (PME_ORDER-1)); const real scale = RECIP((real) (PME_ORDER-1));
for (int i = GLOBAL_ID; i < NUM_ATOMS; i += GLOBAL_SIZE) { for (int i = GLOBAL_ID; i < NUM_ATOMS*PME_ORDER; i += GLOBAL_SIZE) {
int atom = pmeAtomGridIndex[i].x; int atom = pmeAtomGridIndex[i/PME_ORDER].x;
real4 pos = posq[atom]; real4 pos = posq[atom];
#ifdef CHARGE_FROM_SIGEPS #ifdef CHARGE_FROM_SIGEPS
const float2 sigEps = sigmaEpsilon[atom]; const float2 sigEps = sigmaEpsilon[atom];
...@@ -101,71 +80,52 @@ KERNEL void gridSpreadCharge(GLOBAL const real4* RESTRICT posq, ...@@ -101,71 +80,52 @@ KERNEL void gridSpreadCharge(GLOBAL const real4* RESTRICT posq,
data[PME_ORDER-j-1] = scale*((dr+make_real3(j))*data[PME_ORDER-j-2] + (make_real3(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]); data[PME_ORDER-j-1] = scale*((dr+make_real3(j))*data[PME_ORDER-j-2] + (make_real3(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
data[0] = scale*(make_real3(1)-dr)*data[0]; data[0] = scale*(make_real3(1)-dr)*data[0];
// Spread the charge from this atom onto each grid point. // Spread the charge from this atom onto each grid point. PME_ORDER threads access
// consecutive addresses.
int izoffset = (PME_ORDER-(gridIndex.z%PME_ORDER)) % PME_ORDER; int iz = i%PME_ORDER;
int zindex = gridIndex.z+iz;
zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
real dz = 0;
for (int i = 0; i < PME_ORDER; i++) {
dz = i == iz ? data[i].z : dz;
}
dz *= charge;
for (int ix = 0; ix < PME_ORDER; ix++) { for (int ix = 0; ix < PME_ORDER; ix++) {
int xbase = gridIndex.x+ix; int xbase = gridIndex.x+ix;
xbase -= (xbase >= GRID_SIZE_X ? GRID_SIZE_X : 0); xbase -= (xbase >= GRID_SIZE_X ? GRID_SIZE_X : 0);
xbase = xbase*GRID_SIZE_Y; xbase = xbase*GRID_SIZE_Y*GRID_SIZE_Z;
real dx = charge*data[ix].x; real dzdx = dz*data[ix].x;
for (int iy = 0; iy < PME_ORDER; iy++) { for (int iy = 0; iy < PME_ORDER; iy++) {
int ybase = gridIndex.y+iy; int ybase = gridIndex.y+iy;
ybase -= (ybase >= GRID_SIZE_Y ? GRID_SIZE_Y : 0); ybase -= (ybase >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
ybase = (xbase+ybase)*blockSize; ybase = ybase*GRID_SIZE_Z;
real dxdy = dx*data[iy].y; int index = xbase + ybase + zindex;
for (int i = 0; i < PME_ORDER; i++) { real add = dzdx*data[iy].y;
int iz = (i+izoffset) % PME_ORDER;
int zindex = gridIndex.z+iz;
int index = ybase + zindexTable[zindex];
real add = dxdy*data[iz].z;
#ifdef USE_FIXED_POINT_CHARGE_SPREADING #ifdef USE_FIXED_POINT_CHARGE_SPREADING
ATOMIC_ADD(&pmeGrid[index], (mm_ulong) realToFixedPoint(add)); ATOMIC_ADD(&pmeGrid[index], (mm_ulong) realToFixedPoint(add));
#else #else
ATOMIC_ADD(&pmeGrid[index], add); ATOMIC_ADD(&pmeGrid[index], add);
#endif #endif
}
} }
} }
} }
} }
KERNEL void finishSpreadCharge(
#ifdef USE_FIXED_POINT_CHARGE_SPREADING #ifdef USE_FIXED_POINT_CHARGE_SPREADING
KERNEL void finishSpreadCharge(
GLOBAL const mm_long* RESTRICT grid1, GLOBAL const mm_long* RESTRICT grid1,
#else
GLOBAL const real* RESTRICT grid1,
#endif
GLOBAL real* RESTRICT grid2) { GLOBAL real* RESTRICT grid2) {
// HIP-TODO: Workaround for RDNA, remove it when the compiler issue is fixed
#if defined(USE_HIP)
(void)GLOBAL_ID;
#endif
// During charge spreading, we shuffled the order of indices along the z
// axis to make memory access more efficient. We now need to unshuffle
// them. If the values were accumulated as fixed point, we also need to
// convert them to floating point.
LOCAL int zindexTable[GRID_SIZE_Z];
int blockSize = (int) ceil(GRID_SIZE_Z/(real) PME_ORDER);
for (int i = LOCAL_ID; i < GRID_SIZE_Z; i += LOCAL_SIZE) {
int block = i % PME_ORDER;
zindexTable[i] = i/PME_ORDER + block*GRID_SIZE_X*GRID_SIZE_Y*blockSize;
}
SYNC_THREADS;
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z; const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
real scale = 1/(real) 0x100000000; real scale = 1/(real) 0x100000000;
for (int index = GLOBAL_ID; index < gridSize; index += GLOBAL_SIZE) { for (int index = GLOBAL_ID; index < gridSize; index += GLOBAL_SIZE) {
int zindex = index%GRID_SIZE_Z; grid2[index] = scale*grid1[index];
int loadIndex = zindexTable[zindex] + blockSize*(int) (index/GRID_SIZE_Z);
#ifdef USE_FIXED_POINT_CHARGE_SPREADING
grid2[index] = scale*grid1[loadIndex];
#else
grid2[index] = grid1[loadIndex];
#endif
} }
} }
#endif
KERNEL void reciprocalConvolution(GLOBAL real2* RESTRICT pmeGrid, GLOBAL const real* RESTRICT pmeBsplineModuliX, KERNEL void reciprocalConvolution(GLOBAL real2* RESTRICT pmeGrid, GLOBAL const real* RESTRICT pmeBsplineModuliX,
GLOBAL const real* RESTRICT pmeBsplineModuliY, GLOBAL const real* RESTRICT pmeBsplineModuliZ, GLOBAL const real* RESTRICT pmeBsplineModuliY, GLOBAL const real* RESTRICT pmeBsplineModuliZ,
real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) { real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
...@@ -267,7 +227,7 @@ KERNEL void gridEvaluateEnergy(GLOBAL real2* RESTRICT pmeGrid, GLOBAL mixed* RES ...@@ -267,7 +227,7 @@ KERNEL void gridEvaluateEnergy(GLOBAL real2* RESTRICT pmeGrid, GLOBAL mixed* RES
kx = ((kx == 0) ? kx : GRID_SIZE_X-kx); kx = ((kx == 0) ? kx : GRID_SIZE_X-kx);
ky = ((ky == 0) ? ky : GRID_SIZE_Y-ky); ky = ((ky == 0) ? ky : GRID_SIZE_Y-ky);
kz = GRID_SIZE_Z-kz; kz = GRID_SIZE_Z-kz;
} }
int indexInHalfComplexGrid = kz + ky*(GRID_SIZE_Z/2+1)+kx*(GRID_SIZE_Y*(GRID_SIZE_Z/2+1)); int indexInHalfComplexGrid = kz + ky*(GRID_SIZE_Z/2+1)+kx*(GRID_SIZE_Y*(GRID_SIZE_Z/2+1));
real2 grid = pmeGrid[indexInHalfComplexGrid]; real2 grid = pmeGrid[indexInHalfComplexGrid];
#ifndef USE_LJPME #ifndef USE_LJPME
...@@ -282,9 +242,6 @@ KERNEL void gridEvaluateEnergy(GLOBAL real2* RESTRICT pmeGrid, GLOBAL mixed* RES ...@@ -282,9 +242,6 @@ KERNEL void gridEvaluateEnergy(GLOBAL real2* RESTRICT pmeGrid, GLOBAL mixed* RES
#endif #endif
} }
#if defined(USE_HIP) && !defined(AMD_RDNA) && !defined(USE_DOUBLE_PRECISION)
LAUNCH_BOUNDS_EXACT(128, 1)
#endif
KERNEL void gridInterpolateForce(GLOBAL const real4* RESTRICT posq, GLOBAL mm_ulong* RESTRICT forceBuffers, GLOBAL const real* RESTRICT pmeGrid, KERNEL void gridInterpolateForce(GLOBAL const real4* RESTRICT posq, GLOBAL mm_ulong* RESTRICT forceBuffers, GLOBAL const real* RESTRICT pmeGrid,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ, GLOBAL const int2* RESTRICT pmeAtomGridIndex, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ, GLOBAL const int2* RESTRICT pmeAtomGridIndex,
...@@ -297,10 +254,10 @@ KERNEL void gridInterpolateForce(GLOBAL const real4* RESTRICT posq, GLOBAL mm_ul ...@@ -297,10 +254,10 @@ KERNEL void gridInterpolateForce(GLOBAL const real4* RESTRICT posq, GLOBAL mm_ul
real3 data[PME_ORDER]; real3 data[PME_ORDER];
real3 ddata[PME_ORDER]; real3 ddata[PME_ORDER];
const real scale = RECIP((real) (PME_ORDER-1)); const real scale = RECIP((real) (PME_ORDER-1));
// Process the atoms in spatially sorted order. This improves cache performance when loading // Process the atoms in spatially sorted order. This improves cache performance when loading
// the grid values. // the grid values.
for (int i = GLOBAL_ID; i < NUM_ATOMS; i += GLOBAL_SIZE) { for (int i = GLOBAL_ID; i < NUM_ATOMS; i += GLOBAL_SIZE) {
int atom = pmeAtomGridIndex[i].x; int atom = pmeAtomGridIndex[i].x;
real3 force = make_real3(0); real3 force = make_real3(0);
...@@ -315,6 +272,14 @@ KERNEL void gridInterpolateForce(GLOBAL const real4* RESTRICT posq, GLOBAL mm_ul ...@@ -315,6 +272,14 @@ KERNEL void gridInterpolateForce(GLOBAL const real4* RESTRICT posq, GLOBAL mm_ul
int3 gridIndex = make_int3(((int) t.x) % GRID_SIZE_X, int3 gridIndex = make_int3(((int) t.x) % GRID_SIZE_X,
((int) t.y) % GRID_SIZE_Y, ((int) t.y) % GRID_SIZE_Y,
((int) t.z) % GRID_SIZE_Z); ((int) t.z) % GRID_SIZE_Z);
#ifdef CHARGE_FROM_SIGEPS
const float2 sigEps = sigmaEpsilon[atom];
real q = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
#else
real q = CHARGE*EPSILON_FACTOR;
#endif
if (q == 0)
continue;
// Since we need the full set of thetas, it's faster to compute them here than load them // Since we need the full set of thetas, it's faster to compute them here than load them
// from global memory. // from global memory.
...@@ -346,14 +311,14 @@ KERNEL void gridInterpolateForce(GLOBAL const real4* RESTRICT posq, GLOBAL mm_ul ...@@ -346,14 +311,14 @@ KERNEL void gridInterpolateForce(GLOBAL const real4* RESTRICT posq, GLOBAL mm_ul
xbase = xbase*GRID_SIZE_Y*GRID_SIZE_Z; xbase = xbase*GRID_SIZE_Y*GRID_SIZE_Z;
real dx = data[ix].x; real dx = data[ix].x;
real ddx = ddata[ix].x; real ddx = ddata[ix].x;
for (int iy = 0; iy < PME_ORDER; iy++) { for (int iy = 0; iy < PME_ORDER; iy++) {
int ybase = gridIndex.y+iy; int ybase = gridIndex.y+iy;
ybase -= (ybase >= GRID_SIZE_Y ? GRID_SIZE_Y : 0); ybase -= (ybase >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
ybase = xbase + ybase*GRID_SIZE_Z; ybase = xbase + ybase*GRID_SIZE_Z;
real dy = data[iy].y; real dy = data[iy].y;
real ddy = ddata[iy].y; real ddy = ddata[iy].y;
for (int iz = 0; iz < PME_ORDER; iz++) { for (int iz = 0; iz < PME_ORDER; iz++) {
int zindex = gridIndex.z+iz; int zindex = gridIndex.z+iz;
zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0); zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
...@@ -365,12 +330,6 @@ KERNEL void gridInterpolateForce(GLOBAL const real4* RESTRICT posq, GLOBAL mm_ul ...@@ -365,12 +330,6 @@ KERNEL void gridInterpolateForce(GLOBAL const real4* RESTRICT posq, GLOBAL mm_ul
} }
} }
} }
#ifdef CHARGE_FROM_SIGEPS
const float2 sigEps = sigmaEpsilon[atom];
real q = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
#else
real q = CHARGE*EPSILON_FACTOR;
#endif
real forceX = -q*(force.x*GRID_SIZE_X*recipBoxVecX.x); real forceX = -q*(force.x*GRID_SIZE_X*recipBoxVecX.x);
real forceY = -q*(force.x*GRID_SIZE_X*recipBoxVecY.x+force.y*GRID_SIZE_Y*recipBoxVecY.y); real forceY = -q*(force.x*GRID_SIZE_X*recipBoxVecY.x+force.y*GRID_SIZE_Y*recipBoxVecY.y);
real forceZ = -q*(force.x*GRID_SIZE_X*recipBoxVecZ.x+force.y*GRID_SIZE_Y*recipBoxVecZ.y+force.z*GRID_SIZE_Z*recipBoxVecZ.z); real forceZ = -q*(force.x*GRID_SIZE_X*recipBoxVecZ.x+force.y*GRID_SIZE_Y*recipBoxVecZ.y+force.z*GRID_SIZE_Z*recipBoxVecZ.z);
......
...@@ -89,7 +89,7 @@ private: ...@@ -89,7 +89,7 @@ private:
class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel { class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
public: public:
CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcNonbondedForceKernel(name, platform), CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcNonbondedForceKernel(name, platform),
cu(cu), hasInitializedFFT(false), sort(NULL), dispersionFft(NULL), fft(NULL), pmeio(NULL), usePmeStream(false) { cu(cu), hasInitializedFFT(false), sort(NULL), dispersionFft(NULL), fft(NULL), pmeio(NULL), useFixedPointChargeSpreading(false), usePmeStream(false) {
} }
~CudaCalcNonbondedForceKernel(); ~CudaCalcNonbondedForceKernel();
/** /**
...@@ -215,7 +215,7 @@ private: ...@@ -215,7 +215,7 @@ private:
int interpolateForceThreads; int interpolateForceThreads;
int gridSizeX, gridSizeY, gridSizeZ; int gridSizeX, gridSizeY, gridSizeZ;
int dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ; int dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ;
bool hasCoulomb, hasLJ, usePmeStream, useCudaFFT, doLJPME, usePosqCharges, recomputeParams, hasOffsets; bool hasCoulomb, hasLJ, useFixedPointChargeSpreading, usePmeStream, useCudaFFT, doLJPME, usePosqCharges, recomputeParams, hasOffsets;
NonbondedMethod nonbondedMethod; NonbondedMethod nonbondedMethod;
static const int PmeOrder = 5; static const int PmeOrder = 5;
}; };
...@@ -244,4 +244,3 @@ public: ...@@ -244,4 +244,3 @@ public:
} // namespace OpenMM } // namespace OpenMM
#endif /*OPENMM_CUDAKERNELS_H_*/ #endif /*OPENMM_CUDAKERNELS_H_*/
...@@ -427,7 +427,8 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -427,7 +427,8 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
pmeDefines["GRID_SIZE_Z"] = cu.intToString(gridSizeZ); pmeDefines["GRID_SIZE_Z"] = cu.intToString(gridSizeZ);
pmeDefines["EPSILON_FACTOR"] = cu.doubleToString(sqrt(ONE_4PI_EPS0)); pmeDefines["EPSILON_FACTOR"] = cu.doubleToString(sqrt(ONE_4PI_EPS0));
pmeDefines["M_PI"] = cu.doubleToString(M_PI); pmeDefines["M_PI"] = cu.doubleToString(M_PI);
if (cu.getUseDoublePrecision() || cu.getPlatformData().deterministicForces) useFixedPointChargeSpreading = cu.getUseDoublePrecision() || cu.getPlatformData().deterministicForces;
if (useFixedPointChargeSpreading)
pmeDefines["USE_FIXED_POINT_CHARGE_SPREADING"] = "1"; pmeDefines["USE_FIXED_POINT_CHARGE_SPREADING"] = "1";
if (usePmeStream) if (usePmeStream)
pmeDefines["USE_PME_STREAM"] = "1"; pmeDefines["USE_PME_STREAM"] = "1";
...@@ -455,7 +456,8 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -455,7 +456,8 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
pmeConvolutionKernel = cu.getKernel(module, "reciprocalConvolution"); pmeConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
pmeInterpolateForceKernel = cu.getKernel(module, "gridInterpolateForce"); pmeInterpolateForceKernel = cu.getKernel(module, "gridInterpolateForce");
pmeEvalEnergyKernel = cu.getKernel(module, "gridEvaluateEnergy"); pmeEvalEnergyKernel = cu.getKernel(module, "gridEvaluateEnergy");
pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge"); if (useFixedPointChargeSpreading)
pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
cuFuncSetCacheConfig(pmeSpreadChargeKernel, CU_FUNC_CACHE_PREFER_SHARED); cuFuncSetCacheConfig(pmeSpreadChargeKernel, CU_FUNC_CACHE_PREFER_SHARED);
cuFuncSetCacheConfig(pmeInterpolateForceKernel, CU_FUNC_CACHE_PREFER_L1); cuFuncSetCacheConfig(pmeInterpolateForceKernel, CU_FUNC_CACHE_PREFER_L1);
if (doLJPME) { if (doLJPME) {
...@@ -466,10 +468,9 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -466,10 +468,9 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
pmeDefines["RECIP_EXP_FACTOR"] = cu.doubleToString(M_PI*M_PI/(dispersionAlpha*dispersionAlpha)); pmeDefines["RECIP_EXP_FACTOR"] = cu.doubleToString(M_PI*M_PI/(dispersionAlpha*dispersionAlpha));
pmeDefines["USE_LJPME"] = "1"; pmeDefines["USE_LJPME"] = "1";
pmeDefines["CHARGE_FROM_SIGEPS"] = "1"; pmeDefines["CHARGE_FROM_SIGEPS"] = "1";
if (cu.getUseDoublePrecision() || cu.getPlatformData().deterministicForces)
pmeDefines["USE_FIXED_POINT_CHARGE_SPREADING"] = "1";
module = cu.createModule(CudaKernelSources::vectorOps+CommonKernelSources::pme, pmeDefines); module = cu.createModule(CudaKernelSources::vectorOps+CommonKernelSources::pme, pmeDefines);
pmeDispersionFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge"); if (useFixedPointChargeSpreading)
pmeDispersionFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
pmeDispersionGridIndexKernel = cu.getKernel(module, "findAtomGridIndex"); pmeDispersionGridIndexKernel = cu.getKernel(module, "findAtomGridIndex");
pmeDispersionSpreadChargeKernel = cu.getKernel(module, "gridSpreadCharge"); pmeDispersionSpreadChargeKernel = cu.getKernel(module, "gridSpreadCharge");
pmeDispersionConvolutionKernel = cu.getKernel(module, "reciprocalConvolution"); pmeDispersionConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
...@@ -481,15 +482,16 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -481,15 +482,16 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
// Create required data structures. // Create required data structures.
int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float)); int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
int roundedZSize = PmeOrder*(int) ceil(gridSizeZ/(double) PmeOrder); int gridElements = gridSizeX*gridSizeY*gridSizeZ;
int gridElements = gridSizeX*gridSizeY*roundedZSize;
if (doLJPME) { if (doLJPME) {
roundedZSize = PmeOrder*(int) ceil(dispersionGridSizeZ/(double) PmeOrder); gridElements = max(gridElements, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ);
gridElements = max(gridElements, dispersionGridSizeX*dispersionGridSizeY*roundedZSize);
} }
pmeGrid1.initialize(cu, gridElements, 2*elementSize, "pmeGrid1"); pmeGrid1.initialize(cu, gridElements, 2*elementSize, "pmeGrid1");
pmeGrid2.initialize(cu, gridElements, 2*elementSize, "pmeGrid2"); pmeGrid2.initialize(cu, gridElements, 2*elementSize, "pmeGrid2");
cu.addAutoclearBuffer(pmeGrid2); if (useFixedPointChargeSpreading)
cu.addAutoclearBuffer(pmeGrid2);
else
cu.addAutoclearBuffer(pmeGrid1);
pmeBsplineModuliX.initialize(cu, gridSizeX, elementSize, "pmeBsplineModuliX"); pmeBsplineModuliX.initialize(cu, gridSizeX, elementSize, "pmeBsplineModuliX");
pmeBsplineModuliY.initialize(cu, gridSizeY, elementSize, "pmeBsplineModuliY"); pmeBsplineModuliY.initialize(cu, gridSizeY, elementSize, "pmeBsplineModuliY");
pmeBsplineModuliZ.initialize(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ"); pmeBsplineModuliZ.initialize(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
...@@ -890,6 +892,10 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF ...@@ -890,6 +892,10 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
// Execute the reciprocal space kernels. // Execute the reciprocal space kernels.
// With fixed point charge spreading, finishSpreadCharge kernel is not needed as
// gridSpreadCharge can write directly to pmeGrid1.
CudaArray& pmeSpreadDstGrid = useFixedPointChargeSpreading ? pmeGrid2 : pmeGrid1;
if (hasCoulomb) { if (hasCoulomb) {
void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex.getDevicePointer(), cu.getPeriodicBoxSizePointer(), void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(), cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
...@@ -898,14 +904,16 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF ...@@ -898,14 +904,16 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
sort->sort(pmeAtomGridIndex); sort->sort(pmeAtomGridIndex);
void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &pmeGrid2.getDevicePointer(), cu.getPeriodicBoxSizePointer(), void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &pmeSpreadDstGrid.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(), cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&charges.getDevicePointer()}; &charges.getDevicePointer()};
cu.executeKernel(pmeSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128); cu.executeKernel(pmeSpreadChargeKernel, spreadArgs, PmeOrder*cu.getNumAtoms(), 128);
void* finishSpreadArgs[] = {&pmeGrid2.getDevicePointer(), &pmeGrid1.getDevicePointer()}; if (useFixedPointChargeSpreading) {
cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, gridSizeX*gridSizeY*gridSizeZ, 256); void* finishSpreadArgs[] = {&pmeGrid2.getDevicePointer(), &pmeGrid1.getDevicePointer()};
cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
}
if (useCudaFFT) { if (useCudaFFT) {
if (cu.getUseDoublePrecision()) { if (cu.getUseDoublePrecision()) {
...@@ -967,15 +975,17 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF ...@@ -967,15 +975,17 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
cu.clearBuffer(pmeEnergyBuffer); cu.clearBuffer(pmeEnergyBuffer);
} }
cu.clearBuffer(pmeGrid2); cu.clearBuffer(pmeSpreadDstGrid);
void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &pmeGrid2.getDevicePointer(), cu.getPeriodicBoxSizePointer(), void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &pmeSpreadDstGrid.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(), cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&sigmaEpsilon.getDevicePointer()}; &sigmaEpsilon.getDevicePointer()};
cu.executeKernel(pmeDispersionSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128); cu.executeKernel(pmeDispersionSpreadChargeKernel, spreadArgs, PmeOrder*cu.getNumAtoms(), 128);
void* finishSpreadArgs[] = {&pmeGrid2.getDevicePointer(), &pmeGrid1.getDevicePointer()}; if (useFixedPointChargeSpreading) {
cu.executeKernel(pmeDispersionFinishSpreadChargeKernel, finishSpreadArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256); void* finishSpreadArgs[] = {&pmeGrid2.getDevicePointer(), &pmeGrid1.getDevicePointer()};
cu.executeKernel(pmeDispersionFinishSpreadChargeKernel, finishSpreadArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
}
if (useCudaFFT) { if (useCudaFFT) {
if (cu.getUseDoublePrecision()) { if (cu.getUseDoublePrecision()) {
......
...@@ -89,7 +89,7 @@ private: ...@@ -89,7 +89,7 @@ private:
class HipCalcNonbondedForceKernel : public CalcNonbondedForceKernel { class HipCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
public: public:
HipCalcNonbondedForceKernel(std::string name, const Platform& platform, HipContext& cu, const System& system) : CalcNonbondedForceKernel(name, platform), HipCalcNonbondedForceKernel(std::string name, const Platform& platform, HipContext& cu, const System& system) : CalcNonbondedForceKernel(name, platform),
cu(cu), hasInitializedFFT(false), sort(NULL), dispersionFft(NULL), fft(NULL), pmeio(NULL), usePmeStream(false) { cu(cu), hasInitializedFFT(false), sort(NULL), dispersionFft(NULL), fft(NULL), pmeio(NULL), useFixedPointChargeSpreading(false), usePmeStream(false) {
} }
~HipCalcNonbondedForceKernel(); ~HipCalcNonbondedForceKernel();
/** /**
...@@ -211,7 +211,7 @@ private: ...@@ -211,7 +211,7 @@ private:
int interpolateForceThreads; int interpolateForceThreads;
int gridSizeX, gridSizeY, gridSizeZ; int gridSizeX, gridSizeY, gridSizeZ;
int dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ; int dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ;
bool hasCoulomb, hasLJ, usePmeStream, doLJPME, usePosqCharges, recomputeParams, hasOffsets; bool hasCoulomb, hasLJ, useFixedPointChargeSpreading, usePmeStream, doLJPME, usePosqCharges, recomputeParams, hasOffsets;
NonbondedMethod nonbondedMethod; NonbondedMethod nonbondedMethod;
static const int PmeOrder = 5; static const int PmeOrder = 5;
}; };
......
...@@ -418,7 +418,8 @@ void HipCalcNonbondedForceKernel::initialize(const System& system, const Nonbond ...@@ -418,7 +418,8 @@ void HipCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
pmeDefines["GRID_SIZE_Z"] = cu.intToString(gridSizeZ); pmeDefines["GRID_SIZE_Z"] = cu.intToString(gridSizeZ);
pmeDefines["EPSILON_FACTOR"] = cu.doubleToString(sqrt(ONE_4PI_EPS0)); pmeDefines["EPSILON_FACTOR"] = cu.doubleToString(sqrt(ONE_4PI_EPS0));
pmeDefines["M_PI"] = cu.doubleToString(M_PI); pmeDefines["M_PI"] = cu.doubleToString(M_PI);
if (cu.getUseDoublePrecision() || !cu.getSupportsHardwareFloatGlobalAtomicAdd() || cu.getPlatformData().deterministicForces) useFixedPointChargeSpreading = cu.getUseDoublePrecision() || !cu.getSupportsHardwareFloatGlobalAtomicAdd() || cu.getPlatformData().deterministicForces;
if (useFixedPointChargeSpreading)
pmeDefines["USE_FIXED_POINT_CHARGE_SPREADING"] = "1"; pmeDefines["USE_FIXED_POINT_CHARGE_SPREADING"] = "1";
if (usePmeStream) if (usePmeStream)
pmeDefines["USE_PME_STREAM"] = "1"; pmeDefines["USE_PME_STREAM"] = "1";
...@@ -446,9 +447,8 @@ void HipCalcNonbondedForceKernel::initialize(const System& system, const Nonbond ...@@ -446,9 +447,8 @@ void HipCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
pmeConvolutionKernel = cu.getKernel(module, "reciprocalConvolution"); pmeConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
pmeInterpolateForceKernel = cu.getKernel(module, "gridInterpolateForce"); pmeInterpolateForceKernel = cu.getKernel(module, "gridInterpolateForce");
pmeEvalEnergyKernel = cu.getKernel(module, "gridEvaluateEnergy"); pmeEvalEnergyKernel = cu.getKernel(module, "gridEvaluateEnergy");
pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge"); if (useFixedPointChargeSpreading)
hipFuncSetCacheConfig(pmeSpreadChargeKernel, hipFuncCachePreferShared); pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
hipFuncSetCacheConfig(pmeInterpolateForceKernel, hipFuncCachePreferL1);
if (doLJPME) { if (doLJPME) {
pmeDefines["EWALD_ALPHA"] = cu.doubleToString(dispersionAlpha); pmeDefines["EWALD_ALPHA"] = cu.doubleToString(dispersionAlpha);
pmeDefines["GRID_SIZE_X"] = cu.intToString(dispersionGridSizeX); pmeDefines["GRID_SIZE_X"] = cu.intToString(dispersionGridSizeX);
...@@ -458,27 +458,28 @@ void HipCalcNonbondedForceKernel::initialize(const System& system, const Nonbond ...@@ -458,27 +458,28 @@ void HipCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
pmeDefines["USE_LJPME"] = "1"; pmeDefines["USE_LJPME"] = "1";
pmeDefines["CHARGE_FROM_SIGEPS"] = "1"; pmeDefines["CHARGE_FROM_SIGEPS"] = "1";
module = cu.createModule(HipKernelSources::vectorOps+CommonKernelSources::pme, pmeDefines); module = cu.createModule(HipKernelSources::vectorOps+CommonKernelSources::pme, pmeDefines);
pmeDispersionFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge"); if (useFixedPointChargeSpreading)
pmeDispersionFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
pmeDispersionGridIndexKernel = cu.getKernel(module, "findAtomGridIndex"); pmeDispersionGridIndexKernel = cu.getKernel(module, "findAtomGridIndex");
pmeDispersionSpreadChargeKernel = cu.getKernel(module, "gridSpreadCharge"); pmeDispersionSpreadChargeKernel = cu.getKernel(module, "gridSpreadCharge");
pmeDispersionConvolutionKernel = cu.getKernel(module, "reciprocalConvolution"); pmeDispersionConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
pmeEvalDispersionEnergyKernel = cu.getKernel(module, "gridEvaluateEnergy"); pmeEvalDispersionEnergyKernel = cu.getKernel(module, "gridEvaluateEnergy");
pmeInterpolateDispersionForceKernel = cu.getKernel(module, "gridInterpolateForce"); pmeInterpolateDispersionForceKernel = cu.getKernel(module, "gridInterpolateForce");
hipFuncSetCacheConfig(pmeDispersionSpreadChargeKernel, hipFuncCachePreferL1);
} }
// Create required data structures. // Create required data structures.
int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float)); int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
int roundedZSize = PmeOrder*(int) ceil(gridSizeZ/(double) PmeOrder); int gridElements = gridSizeX*gridSizeY*gridSizeZ;
int gridElements = gridSizeX*gridSizeY*roundedZSize;
if (doLJPME) { if (doLJPME) {
roundedZSize = PmeOrder*(int) ceil(dispersionGridSizeZ/(double) PmeOrder); gridElements = max(gridElements, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ);
gridElements = max(gridElements, dispersionGridSizeX*dispersionGridSizeY*roundedZSize);
} }
pmeGrid1.initialize(cu, gridElements, 2*elementSize, "pmeGrid1"); pmeGrid1.initialize(cu, gridElements, 2*elementSize, "pmeGrid1");
pmeGrid2.initialize(cu, gridElements, 2*elementSize, "pmeGrid2"); pmeGrid2.initialize(cu, gridElements, 2*elementSize, "pmeGrid2");
cu.addAutoclearBuffer(pmeGrid2); if (useFixedPointChargeSpreading)
cu.addAutoclearBuffer(pmeGrid2);
else
cu.addAutoclearBuffer(pmeGrid1);
pmeBsplineModuliX.initialize(cu, gridSizeX, elementSize, "pmeBsplineModuliX"); pmeBsplineModuliX.initialize(cu, gridSizeX, elementSize, "pmeBsplineModuliX");
pmeBsplineModuliY.initialize(cu, gridSizeY, elementSize, "pmeBsplineModuliY"); pmeBsplineModuliY.initialize(cu, gridSizeY, elementSize, "pmeBsplineModuliY");
pmeBsplineModuliZ.initialize(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ"); pmeBsplineModuliZ.initialize(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
...@@ -850,6 +851,10 @@ double HipCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo ...@@ -850,6 +851,10 @@ double HipCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
// Execute the reciprocal space kernels. // Execute the reciprocal space kernels.
// With fixed point charge spreading, finishSpreadCharge kernel is not needed as
// gridSpreadCharge can write directly to pmeGrid1.
HipArray& pmeSpreadDstGrid = useFixedPointChargeSpreading ? pmeGrid2 : pmeGrid1;
if (hasCoulomb) { if (hasCoulomb) {
void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex.getDevicePointer(), cu.getPeriodicBoxSizePointer(), void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(), cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
...@@ -858,14 +863,16 @@ double HipCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo ...@@ -858,14 +863,16 @@ double HipCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
sort->sort(pmeAtomGridIndex); sort->sort(pmeAtomGridIndex);
void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &pmeGrid2.getDevicePointer(), cu.getPeriodicBoxSizePointer(), void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &pmeSpreadDstGrid.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(), cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&charges.getDevicePointer()}; &charges.getDevicePointer()};
cu.executeKernelFlat(pmeSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128); cu.executeKernelFlat(pmeSpreadChargeKernel, spreadArgs, PmeOrder*cu.getNumAtoms(), 128);
void* finishSpreadArgs[] = {&pmeGrid2.getDevicePointer(), &pmeGrid1.getDevicePointer()}; if (useFixedPointChargeSpreading) {
cu.executeKernelFlat(pmeFinishSpreadChargeKernel, finishSpreadArgs, gridSizeX*gridSizeY*gridSizeZ, 256); void* finishSpreadArgs[] = {&pmeGrid2.getDevicePointer(), &pmeGrid1.getDevicePointer()};
cu.executeKernelFlat(pmeFinishSpreadChargeKernel, finishSpreadArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
}
fft->execFFT(true); fft->execFFT(true);
...@@ -879,7 +886,7 @@ double HipCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo ...@@ -879,7 +886,7 @@ double HipCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
void* convolutionArgs[] = {&pmeGrid2.getDevicePointer(), &pmeBsplineModuliX.getDevicePointer(), void* convolutionArgs[] = {&pmeGrid2.getDevicePointer(), &pmeBsplineModuliX.getDevicePointer(),
&pmeBsplineModuliY.getDevicePointer(), &pmeBsplineModuliZ.getDevicePointer(), &pmeBsplineModuliY.getDevicePointer(), &pmeBsplineModuliZ.getDevicePointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]}; recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeConvolutionKernel, convolutionArgs, gridSizeX*gridSizeY*gridSizeZ, 256); cu.executeKernelFlat(pmeConvolutionKernel, convolutionArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
fft->execFFT(false); fft->execFFT(false);
...@@ -887,7 +894,7 @@ double HipCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo ...@@ -887,7 +894,7 @@ double HipCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(), cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&charges.getDevicePointer()}; &charges.getDevicePointer()};
cu.executeKernel(pmeInterpolateForceKernel, interpolateArgs, cu.getNumAtoms(), 128); cu.executeKernelFlat(pmeInterpolateForceKernel, interpolateArgs, cu.getNumAtoms(), 128);
} }
if (doLJPME && hasLJ) { if (doLJPME && hasLJ) {
...@@ -901,15 +908,17 @@ double HipCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo ...@@ -901,15 +908,17 @@ double HipCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
cu.clearBuffer(pmeEnergyBuffer); cu.clearBuffer(pmeEnergyBuffer);
} }
cu.clearBuffer(pmeGrid2); cu.clearBuffer(pmeSpreadDstGrid);
void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &pmeGrid2.getDevicePointer(), cu.getPeriodicBoxSizePointer(), void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &pmeSpreadDstGrid.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(), cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&sigmaEpsilon.getDevicePointer()}; &sigmaEpsilon.getDevicePointer()};
cu.executeKernelFlat(pmeDispersionSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128); cu.executeKernelFlat(pmeDispersionSpreadChargeKernel, spreadArgs, PmeOrder*cu.getNumAtoms(), 128);
void* finishSpreadArgs[] = {&pmeGrid2.getDevicePointer(), &pmeGrid1.getDevicePointer()}; if (useFixedPointChargeSpreading) {
cu.executeKernelFlat(pmeDispersionFinishSpreadChargeKernel, finishSpreadArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256); void* finishSpreadArgs[] = {&pmeGrid2.getDevicePointer(), &pmeGrid1.getDevicePointer()};
cu.executeKernelFlat(pmeDispersionFinishSpreadChargeKernel, finishSpreadArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
}
dispersionFft->execFFT(true); dispersionFft->execFFT(true);
...@@ -923,7 +932,7 @@ double HipCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo ...@@ -923,7 +932,7 @@ double HipCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
void* convolutionArgs[] = {&pmeGrid2.getDevicePointer(), &pmeDispersionBsplineModuliX.getDevicePointer(), void* convolutionArgs[] = {&pmeGrid2.getDevicePointer(), &pmeDispersionBsplineModuliX.getDevicePointer(),
&pmeDispersionBsplineModuliY.getDevicePointer(), &pmeDispersionBsplineModuliZ.getDevicePointer(), &pmeDispersionBsplineModuliY.getDevicePointer(), &pmeDispersionBsplineModuliZ.getDevicePointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]}; recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeDispersionConvolutionKernel, convolutionArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256); cu.executeKernelFlat(pmeDispersionConvolutionKernel, convolutionArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
dispersionFft->execFFT(false); dispersionFft->execFFT(false);
...@@ -931,7 +940,7 @@ double HipCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo ...@@ -931,7 +940,7 @@ double HipCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(), cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&sigmaEpsilon.getDevicePointer()}; &sigmaEpsilon.getDevicePointer()};
cu.executeKernel(pmeInterpolateDispersionForceKernel, interpolateArgs, cu.getNumAtoms(), 128); cu.executeKernelFlat(pmeInterpolateDispersionForceKernel, interpolateArgs, cu.getNumAtoms(), 128);
} }
if (usePmeStream) { if (usePmeStream) {
hipEventRecord(pmeSyncEvent, pmeStream); hipEventRecord(pmeSyncEvent, pmeStream);
......
...@@ -25,9 +25,6 @@ typedef unsigned long long mm_ulong; ...@@ -25,9 +25,6 @@ typedef unsigned long long mm_ulong;
#define SUPPORTS_DOUBLE_PRECISION 1 #define SUPPORTS_DOUBLE_PRECISION 1
#define LAUNCH_BOUNDS_EXACT(WORK_GROUP_SIZE, WAVES_PER_EU) \
__attribute__((amdgpu_flat_work_group_size(WORK_GROUP_SIZE, WORK_GROUP_SIZE), amdgpu_waves_per_eu(WAVES_PER_EU, WAVES_PER_EU)))
#ifdef USE_DOUBLE_PRECISION #ifdef USE_DOUBLE_PRECISION
__device__ inline long long realToFixedPoint(double x) { __device__ inline long long realToFixedPoint(double x) {
......
...@@ -449,11 +449,9 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb ...@@ -449,11 +449,9 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
// Create required data structures. // Create required data structures.
int elementSize = (cl.getUseDoublePrecision() ? sizeof(double) : sizeof(float)); int elementSize = (cl.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
int roundedZSize = PmeOrder*(int) ceil(gridSizeZ/(double) PmeOrder); int gridElements = gridSizeX*gridSizeY*gridSizeZ;
int gridElements = gridSizeX*gridSizeY*roundedZSize;
if (doLJPME) { if (doLJPME) {
roundedZSize = PmeOrder*(int) ceil(dispersionGridSizeZ/(double) PmeOrder); gridElements = max(gridElements, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ);
gridElements = max(gridElements, dispersionGridSizeX*dispersionGridSizeY*roundedZSize);
} }
pmeGrid1.initialize(cl, gridElements, 2*elementSize, "pmeGrid1"); pmeGrid1.initialize(cl, gridElements, 2*elementSize, "pmeGrid1");
pmeGrid2.initialize(cl, gridElements, 2*elementSize, "pmeGrid2"); pmeGrid2.initialize(cl, gridElements, 2*elementSize, "pmeGrid2");
......
...@@ -2604,10 +2604,8 @@ void CommonCalcHippoNonbondedForceKernel::initialize(const System& system, const ...@@ -2604,10 +2604,8 @@ void CommonCalcHippoNonbondedForceKernel::initialize(const System& system, const
// Create required data structures. // Create required data structures.
int roundedZSize = PmeOrder*(int) ceil(gridSizeZ/(double) PmeOrder); int gridElements = gridSizeX*gridSizeY*gridSizeZ;
int gridElements = gridSizeX*gridSizeY*roundedZSize; gridElements = max(gridElements, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ);
roundedZSize = PmeOrder*(int) ceil(dispersionGridSizeZ/(double) PmeOrder);
gridElements = max(gridElements, dispersionGridSizeX*dispersionGridSizeY*roundedZSize);
pmeGrid1.initialize(cc, gridElements, elementSize, "pmeGrid1"); pmeGrid1.initialize(cc, gridElements, elementSize, "pmeGrid1");
pmeGrid2.initialize(cc, gridElements, 2*elementSize, "pmeGrid2"); pmeGrid2.initialize(cc, gridElements, 2*elementSize, "pmeGrid2");
if (useFixedPointChargeSpreading()) { if (useFixedPointChargeSpreading()) {
...@@ -2785,9 +2783,11 @@ void CommonCalcHippoNonbondedForceKernel::initialize(const System& system, const ...@@ -2785,9 +2783,11 @@ void CommonCalcHippoNonbondedForceKernel::initialize(const System& system, const
pmeDefines["CHARGE"] = "charges[atom]"; pmeDefines["CHARGE"] = "charges[atom]";
pmeDefines["USE_LJPME"] = "1"; pmeDefines["USE_LJPME"] = "1";
program = cc.compileProgram(CommonKernelSources::pme, pmeDefines); program = cc.compileProgram(CommonKernelSources::pme, pmeDefines);
dpmeFinishSpreadChargeKernel = program->createKernel("finishSpreadCharge"); if (useFixedPointChargeSpreading()) {
dpmeFinishSpreadChargeKernel->addArg(pmeGrid2); dpmeFinishSpreadChargeKernel = program->createKernel("finishSpreadCharge");
dpmeFinishSpreadChargeKernel->addArg(pmeGrid1); dpmeFinishSpreadChargeKernel->addArg(pmeGrid2);
dpmeFinishSpreadChargeKernel->addArg(pmeGrid1);
}
dpmeGridIndexKernel = program->createKernel("findAtomGridIndex"); dpmeGridIndexKernel = program->createKernel("findAtomGridIndex");
dpmeGridIndexKernel->addArg(cc.getPosq()); dpmeGridIndexKernel->addArg(cc.getPosq());
dpmeGridIndexKernel->addArg(pmeAtomGridIndex); dpmeGridIndexKernel->addArg(pmeAtomGridIndex);
...@@ -2795,7 +2795,10 @@ void CommonCalcHippoNonbondedForceKernel::initialize(const System& system, const ...@@ -2795,7 +2795,10 @@ void CommonCalcHippoNonbondedForceKernel::initialize(const System& system, const
dpmeGridIndexKernel->addArg(); dpmeGridIndexKernel->addArg();
dpmeSpreadChargeKernel = program->createKernel("gridSpreadCharge"); dpmeSpreadChargeKernel = program->createKernel("gridSpreadCharge");
dpmeSpreadChargeKernel->addArg(cc.getPosq()); dpmeSpreadChargeKernel->addArg(cc.getPosq());
dpmeSpreadChargeKernel->addArg(pmeGrid2); if (useFixedPointChargeSpreading())
dpmeSpreadChargeKernel->addArg(pmeGrid2);
else
dpmeSpreadChargeKernel->addArg(pmeGrid1);
for (int i = 0; i < 8; i++) for (int i = 0; i < 8; i++)
dpmeSpreadChargeKernel->addArg(); dpmeSpreadChargeKernel->addArg();
dpmeSpreadChargeKernel->addArg(pmeAtomGridIndex); dpmeSpreadChargeKernel->addArg(pmeAtomGridIndex);
...@@ -3266,9 +3269,13 @@ double CommonCalcHippoNonbondedForceKernel::execute(ContextImpl& context, bool i ...@@ -3266,9 +3269,13 @@ double CommonCalcHippoNonbondedForceKernel::execute(ContextImpl& context, bool i
dpmeGridIndexKernel->execute(cc.getNumAtoms()); dpmeGridIndexKernel->execute(cc.getNumAtoms());
sortGridIndex(); sortGridIndex();
cc.clearBuffer(pmeGrid2); if (useFixedPointChargeSpreading())
dpmeSpreadChargeKernel->execute(cc.getNumAtoms(), 128); cc.clearBuffer(pmeGrid2);
dpmeFinishSpreadChargeKernel->execute(dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256); else
cc.clearBuffer(pmeGrid1);
dpmeSpreadChargeKernel->execute(PmeOrder*cc.getNumAtoms(), 128);
if (useFixedPointChargeSpreading())
dpmeFinishSpreadChargeKernel->execute(dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
computeFFT(true, true); computeFFT(true, true);
if (includeEnergy) if (includeEnergy)
dpmeEvalEnergyKernel->execute(dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ); dpmeEvalEnergyKernel->execute(dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ);
...@@ -3553,4 +3560,4 @@ void CommonCalcHippoNonbondedForceKernel::getDPMEParameters(double& alpha, int& ...@@ -3553,4 +3560,4 @@ void CommonCalcHippoNonbondedForceKernel::getDPMEParameters(double& alpha, int&
nx = dispersionGridSizeX; nx = dispersionGridSizeX;
ny = dispersionGridSizeY; ny = dispersionGridSizeY;
nz = dispersionGridSizeZ; nz = dispersionGridSizeZ;
} }
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment