Commit 7b66ba19 authored by Andy Simmonett's avatar Andy Simmonett
Browse files

Added initial CUDA LJPME code, and modified cpupme plugin to handle LJPME as a standalone method.

parent 94cb8614
......@@ -1345,6 +1345,57 @@ public:
};
/**
* This kernel performs the dispersion reciprocal space calculation for LJPME. In most cases, this
* calculation is done directly by CalcNonbondedForceKernel so this kernel is unneeded.
* In some cases it may want to outsource the work to a different kernel. In particular,
* GPU based platforms sometimes use a CPU based implementation provided by a separate
* plugin.
*/
class CalcDispersionPmeReciprocalForceKernel : public KernelImpl {
public:
class IO;
static std::string Name() {
return "CalcDispersionPmeReciprocalForce";
}
CalcDispersionPmeReciprocalForceKernel(std::string name, const Platform& platform) : KernelImpl(name, platform) {
}
/**
* Initialize the kernel.
*
* @param gridx the x size of the PME grid
* @param gridy the y size of the PME grid
* @param gridz the z size of the PME grid
* @param numParticles the number of particles in the system
* @param alpha the Ewald blending parameter
*/
virtual void initialize(int gridx, int gridy, int gridz, int numParticles, double alpha) = 0;
/**
* Begin computing the force and energy.
*
* @param io an object that coordinates data transfer
* @param periodicBoxVectors the vectors defining the periodic box (measured in nm)
* @param includeEnergy true if potential energy should be computed
*/
virtual void beginComputation(IO& io, const Vec3* periodicBoxVectors, bool includeEnergy) = 0;
/**
* Finish computing the force and energy.
*
* @param io an object that coordinates data transfer
* @return the potential energy due to the PME reciprocal space interactions
*/
virtual double finishComputation(IO& io) = 0;
/**
* Get the parameters being used for PME.
*
* @param alpha the separation parameter
* @param nx the number of grid points along the X axis
* @param ny the number of grid points along the Y axis
* @param nz the number of grid points along the Z axis
*/
virtual void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const = 0;
};
} // namespace OpenMM
#endif /*OPENMM_KERNELS_H_*/
......@@ -674,8 +674,9 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
if (useOptimizedPme) {
optimizedPme = getPlatform().createKernel(CalcPmeReciprocalForceKernel::Name(), context);
optimizedPme.getAs<CalcPmeReciprocalForceKernel>().initialize(gridSize[0], gridSize[1], gridSize[2], numParticles, ewaldAlpha);
optimizedDispersionPme = getPlatform().createKernel(CalcPmeReciprocalForceKernel::Name(), context);
optimizedDispersionPme.getAs<CalcPmeReciprocalForceKernel>().initialize(dispersionGridSize[0], dispersionGridSize[1], dispersionGridSize[2], numParticles, ewaldDispersionAlpha);
optimizedDispersionPme = getPlatform().createKernel(CalcDispersionPmeReciprocalForceKernel::Name(), context);
optimizedDispersionPme.getAs<CalcDispersionPmeReciprocalForceKernel>().initialize(dispersionGridSize[0], dispersionGridSize[1],
dispersionGridSize[2], numParticles, ewaldDispersionAlpha);
}
}
}
......
......@@ -598,8 +598,10 @@ private:
class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
public:
CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcNonbondedForceKernel(name, platform),
cu(cu), hasInitializedFFT(false), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), directPmeGrid(NULL), reciprocalPmeGrid(NULL),
pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeAtomRange(NULL), pmeAtomGridIndex(NULL), pmeEnergyBuffer(NULL), sort(NULL), fft(NULL), pmeio(NULL) {
cu(cu), hasInitializedFFT(false), sigmaEpsilon(NULL), C6s(NULL), exceptionParams(NULL), cosSinSums(NULL), directPmeGrid(NULL), reciprocalPmeGrid(NULL),
directDispersionPmeGrid(NULL), reciprocalDispersionPmeGrid(NULL),
pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeAtomRange(NULL), pmeAtomGridIndex(NULL), pmeAtomDispersionGridIndex(NULL),
pmeEnergyBuffer(NULL), dispersionPmeEnergyBuffer(NULL), sort(NULL), dispersionFft(NULL), fft(NULL), pmeio(NULL), dispersionPmeio(NULL) {
}
~CudaCalcNonbondedForceKernel();
/**
......@@ -636,6 +638,15 @@ public:
* @param nz the number of grid points along the Z axis
*/
void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
/**
* Get the dispersion parameters being used for the dispersion term in LJPME.
*
* @param alpha the separation parameter
* @param nx the number of grid points along the X axis
* @param ny the number of grid points along the Y axis
* @param nz the number of grid points along the Z axis
*/
void getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
private:
class SortTrait : public CudaSort::SortTrait {
int getDataSize() const {return 8;}
......@@ -655,38 +666,55 @@ private:
CudaContext& cu;
bool hasInitializedFFT;
CudaArray* sigmaEpsilon;
CudaArray* C6s;
CudaArray* exceptionParams;
CudaArray* cosSinSums;
CudaArray* directPmeGrid;
CudaArray* reciprocalPmeGrid;
CudaArray* directDispersionPmeGrid;
CudaArray* reciprocalDispersionPmeGrid;
CudaArray* pmeBsplineModuliX;
CudaArray* pmeBsplineModuliY;
CudaArray* pmeBsplineModuliZ;
CudaArray* pmeAtomRange;
CudaArray* pmeAtomGridIndex;
CudaArray* pmeAtomDispersionGridIndex;
CudaArray* pmeEnergyBuffer;
CudaArray* dispersionPmeEnergyBuffer;
CudaSort* sort;
Kernel cpuPme;
Kernel cpuDispersionPme;
PmeIO* pmeio;
CUstream pmeStream;
CUevent pmeSyncEvent;
PmeIO* dispersionPmeio;
CUstream pmeStream, dispersionPmeStream;
CUevent pmeSyncEvent, dispersionPmeSyncEvent;
CudaFFT3D* fft;
cufftHandle fftForward;
cufftHandle fftBackward;
CudaFFT3D* dispersionFft;
cufftHandle dispersionFftForward;
cufftHandle dispersionFftBackward;
CUfunction ewaldSumsKernel;
CUfunction ewaldForcesKernel;
CUfunction pmeGridIndexKernel;
CUfunction pmeDispersionGridIndexKernel;
CUfunction pmeSpreadChargeKernel;
CUfunction pmeDispersionSpreadChargeKernel;
CUfunction pmeFinishSpreadChargeKernel;
CUfunction pmeDispersionFinishSpreadChargeKernel;
CUfunction pmeEvalEnergyKernel;
CUfunction pmeEvalDispersionEnergyKernel;
CUfunction pmeConvolutionKernel;
CUfunction pmeDispersionConvolutionKernel;
CUfunction pmeInterpolateForceKernel;
CUfunction pmeInterpolateDispersionForceKernel;
std::map<std::string, std::string> pmeDefines;
std::vector<std::pair<int, int> > exceptionAtoms;
double ewaldSelfEnergy, dispersionCoefficient, alpha;
double ewaldSelfEnergy, dispersionSelfEnergy, dispersionCoefficient, alpha, dispersionAlpha;
int interpolateForceThreads;
int gridSizeX, gridSizeY, gridSizeZ;
bool hasCoulomb, hasLJ, usePmeStream, useCudaFFT;
int dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ;
bool hasCoulomb, hasLJ, usePmeStream, useCudaFFT, doLJPME;
NonbondedMethod nonbondedMethod;
static const int PmeOrder = 5;
};
......
......@@ -439,6 +439,15 @@ public:
* @param nz the number of grid points along the Z axis
*/
void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
/**
* Get the dispersion parameters being used for the dispersion term in LJPME.
*
* @param alpha the separation parameter
* @param nx the number of grid points along the X axis
* @param ny the number of grid points along the Y axis
* @param nz the number of grid points along the Z axis
*/
void getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
private:
class Task;
CudaPlatform::PlatformData& data;
......
This diff is collapsed.
......@@ -628,6 +628,10 @@ void CudaParallelCalcNonbondedForceKernel::getPMEParameters(double& alpha, int&
dynamic_cast<const CudaCalcNonbondedForceKernel&>(kernels[0].getImpl()).getPMEParameters(alpha, nx, ny, nz);
}
void CudaParallelCalcNonbondedForceKernel::getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
dynamic_cast<const CudaCalcNonbondedForceKernel&>(kernels[0].getImpl()).getLJPMEParameters(alpha, nx, ny, nz);
}
class CudaParallelCalcCustomNonbondedForceKernel::Task : public CudaContext::WorkTask {
public:
Task(ContextImpl& context, CudaCalcCustomNonbondedForceKernel& kernel, bool includeForce,
......
......@@ -247,6 +247,10 @@ CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& sys
CHECK_RESULT(cuDeviceGetName(name, 1000, contexts[i]->getDevice()), "Error querying device name");
deviceName << name;
}
size_t printfsize;
cuCtxGetLimit(&printfsize, CU_LIMIT_PRINTF_FIFO_SIZE);
cuCtxSetLimit(CU_LIMIT_PRINTF_FIFO_SIZE, 10*printfsize);
useCpuPme = (cpuPmeProperty == "true" && !contexts[0]->getUseDoublePrecision());
disablePmeStream = (pmeStreamProperty == "true");
deterministicForces = (deterministicForcesProperty == "true");
......
......@@ -17,6 +17,25 @@
const real erfcAlphaR = (0.254829592f+(-0.284496736f+(1.421413741f+(-1.453152027f+1.061405429f*t)*t)*t)*t)*t*expAlphaRSqr;
#endif
real tempForce = 0.0f;
#if HAS_LENNARD_JONES
// The multiplicative term to correct for the multiplicative terms that are always
// present in reciprocal space. The real terms have an additive contribution
// added in, but for excluded terms the multiplicative term is just subtracted.
// These factors are needed in both clauses of the needCorrection statement, so
// I declare them up here.
#if DO_LJPME
const real dispersionAlphaR = EWALD_DISPERSION_ALPHA*r;
const real dar2 = dispersionAlphaR*dispersionAlphaR;
const real dar4 = dar2*dar2;
const real dar6 = dar4*dar2;
const real invR2 = invR*invR;
const real expDar2 = EXP(-dar2);
const real c6 = C6s1*C6s2;
const real coef = invR2*invR2*invR2*c6;
const real eprefac = 1.0f + dar2 + 0.5f*dar4;
const real dprefac = eprefac + dar6/6.0f;
#endif
#endif
if (needCorrection) {
// Subtract off the part of this interaction that was included in the reciprocal space contribution.
......@@ -29,6 +48,13 @@
includeInteraction = false;
tempEnergy -= TWO_OVER_SQRT_PI*EWALD_ALPHA*138.935456f*posq1.w*posq2.w;
}
#if HAS_LENNARD_JONES
#if DO_LJPME
// The multiplicative grid term
tempEnergy += coef*(1.0f - expDar2*eprefac);
tempForce += 6.0f*coef*(1.0f - expDar2*dprefac);
#endif
#endif
}
else {
#if HAS_LENNARD_JONES
......@@ -36,7 +62,8 @@
real sig2 = invR*sig;
sig2 *= sig2;
real sig6 = sig2*sig2*sig2;
real epssig6 = sig6*(sigmaEpsilon1.y*sigmaEpsilon2.y);
real eps = sigmaEpsilon1.y*sigmaEpsilon2.y;
real epssig6 = sig6*eps;
tempForce = epssig6*(12.0f*sig6 - 6.0f);
real ljEnergy = epssig6*(sig6 - 1.0f);
#if USE_LJ_SWITCH
......@@ -48,6 +75,22 @@
ljEnergy *= switchValue;
}
#endif
#if DO_LJPME
// The multiplicative grid term
ljEnergy += coef*(1.0f - expDar2*eprefac);
tempForce += 6.0f*coef*(1.0f - expDar2*dprefac);
// The potential shift accounts for the step at the cutoff introduced by the
// transition from additive to multiplicative combintion rules and is only
// needed for the real (not excluded) terms. By addin these terms to ljEnergy
// instead of tempEnergy here, the includeInteraction mask is correctly applied.
sig2 = sig*sig;
sig6 = sig2*sig2*sig2*INVCUT6;
epssig6 = eps*sig6;
// The additive part of the potential shift
ljEnergy += epssig6*(1.0f - sig6);
// The multiplicative part of the potential shift
ljEnergy += MULTSHIFT6*c6;
#endif
tempForce += prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
tempEnergy += includeInteraction ? ljEnergy + prefactor*erfcAlphaR : 0;
#else
......
extern "C" __global__ void findAtomDispersionGridIndex(const real4* __restrict__ posq, int2* __restrict__ pmeAtomGridIndex,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
// Compute the index of the grid point each atom is associated with.
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
real4 pos = posq[i];
APPLY_PERIODIC_TO_POS(pos)
real3 t = make_real3(pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x,
pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y,
pos.z*recipBoxVecZ.z);
t.x = (t.x-floor(t.x))*DISPERSION_GRID_SIZE_X;
t.y = (t.y-floor(t.y))*DISPERSION_GRID_SIZE_Y;
t.z = (t.z-floor(t.z))*DISPERSION_GRID_SIZE_Z;
int3 gridIndex = make_int3(((int) t.x) % DISPERSION_GRID_SIZE_X,
((int) t.y) % DISPERSION_GRID_SIZE_Y,
((int) t.z) % DISPERSION_GRID_SIZE_Z);
pmeAtomGridIndex[i] = make_int2(i, gridIndex.x*DISPERSION_GRID_SIZE_Y*DISPERSION_GRID_SIZE_Z+gridIndex.y*DISPERSION_GRID_SIZE_Z+gridIndex.z);
}
}
extern "C" __global__ void gridSpreadC6(const real4* __restrict__ posq, real* __restrict__ originalPmeGrid,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ, const int2* __restrict__ pmeAtomGridIndex,
const real* __restrict__ C6s) {
real3 data[PME_ORDER];
const real scale = RECIP(PME_ORDER-1);
// Process the atoms in spatially sorted order. This improves efficiency when writing
// the grid values.
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
int atom = pmeAtomGridIndex[i].x;
real4 pos = posq[atom];
APPLY_PERIODIC_TO_POS(pos)
real3 t = make_real3(pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x,
pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y,
pos.z*recipBoxVecZ.z);
t.x = (t.x-floor(t.x))*DISPERSION_GRID_SIZE_X;
t.y = (t.y-floor(t.y))*DISPERSION_GRID_SIZE_Y;
t.z = (t.z-floor(t.z))*DISPERSION_GRID_SIZE_Z;
int3 gridIndex = make_int3(((int) t.x) % DISPERSION_GRID_SIZE_X,
((int) t.y) % DISPERSION_GRID_SIZE_Y,
((int) t.z) % DISPERSION_GRID_SIZE_Z);
// Since we need the full set of thetas, it's faster to compute them here than load them
// from global memory.
real3 dr = make_real3(t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z);
data[PME_ORDER-1] = make_real3(0);
data[1] = dr;
data[0] = make_real3(1)-dr;
for (int j = 3; j < PME_ORDER; j++) {
real div = RECIP(j-1);
data[j-1] = div*dr*data[j-2];
for (int k = 1; k < (j-1); k++)
data[j-k-1] = div*((dr+make_real3(k))*data[j-k-2] + (make_real3(j-k)-dr)*data[j-k-1]);
data[0] = div*(make_real3(1)-dr)*data[0];
}
data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
for (int j = 1; j < (PME_ORDER-1); j++)
data[PME_ORDER-j-1] = scale*((dr+make_real3(j))*data[PME_ORDER-j-2] + (make_real3(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
data[0] = scale*(make_real3(1)-dr)*data[0];
// Spread the charge from this atom onto each grid point.
for (int ix = 0; ix < PME_ORDER; ix++) {
int xbase = gridIndex.x+ix;
xbase -= (xbase >= DISPERSION_GRID_SIZE_X ? DISPERSION_GRID_SIZE_X : 0);
xbase = xbase*DISPERSION_GRID_SIZE_Y*DISPERSION_GRID_SIZE_Z;
real dx = data[ix].x;
for (int iy = 0; iy < PME_ORDER; iy++) {
int ybase = gridIndex.y+iy;
ybase -= (ybase >= DISPERSION_GRID_SIZE_Y ? DISPERSION_GRID_SIZE_Y : 0);
ybase = xbase + ybase*DISPERSION_GRID_SIZE_Z;
real dy = data[iy].y;
for (int iz = 0; iz < PME_ORDER; iz++) {
int zindex = gridIndex.z+iz;
zindex -= (zindex >= DISPERSION_GRID_SIZE_Z ? DISPERSION_GRID_SIZE_Z : 0);
int index = ybase + zindex;
// We need to grab the C6 coefficient from the array
real add = C6s[atom]*dx*dy*data[iz].z;
#ifdef USE_DOUBLE_PRECISION
unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid;
atomicAdd(&ulonglong_p[index], static_cast<unsigned long long>((long long) (add*0x100000000)));
#elif __CUDA_ARCH__ < 200 || defined(USE_DETERMINISTIC_FORCES)
unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid;
int gridIndex = index;
gridIndex = (gridIndex%2 == 0 ? gridIndex/2 : (gridIndex+DISPERSION_GRID_SIZE_X*DISPERSION_GRID_SIZE_Y*DISPERSION_GRID_SIZE_Z)/2);
atomicAdd(&ulonglong_p[gridIndex], static_cast<unsigned long long>((long long) (add*0x100000000)));
#else
atomicAdd(&originalPmeGrid[index], add);
#endif
}
}
}
}
}
extern "C" __global__ void finishSpreadC6(long long* __restrict__ originalPmeGrid) {
real* floatGrid = (real*) originalPmeGrid;
const unsigned int gridSize = DISPERSION_GRID_SIZE_X*DISPERSION_GRID_SIZE_Y*DISPERSION_GRID_SIZE_Z;
real scale = 1.0f/(real) 0x100000000;
#ifdef USE_DOUBLE_PRECISION
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x)
floatGrid[index] = scale*originalPmeGrid[index];
#else
for (int index = 2*(blockIdx.x*blockDim.x+threadIdx.x); index < gridSize; index += 2*blockDim.x*gridDim.x) {
floatGrid[index] = scale*originalPmeGrid[index/2];
if (index+1 < gridSize)
floatGrid[index+1] = scale*originalPmeGrid[(index+gridSize+1)/2];
}
#endif
}
// convolutes the dispersion grid on the halfcomplex_pmeGrid, which is of size NX*NY*(NZ/2+1) as F(Q) is conjugate symmetric
extern "C" __global__ void
reciprocalDispersionConvolution(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__ energyBuffer,
const real* __restrict__ pmeBsplineModuliX, const real* __restrict__ pmeBsplineModuliY, const real* __restrict__ pmeBsplineModuliZ,
real4 periodicBoxSize, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
// R2C stores into a half complex matrix where the last dimension is cut by half
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*(GRID_SIZE_Z/2+1);
const real scaleFactor = -2*M_PI*SQRT(M_PI)*RECIP(6*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
const real alpha = EWALD_DISPERSION_ALPHA;
real bfac = M_PI / alpha;
real fac1 = 2*M_PI*M_PI*M_PI*SQRT(M_PI);
real fac2 = alpha*alpha*alpha;
real fac3 = -2*alpha*M_PI*M_PI;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) {
// real indices
int kx = index/(GRID_SIZE_Y*(GRID_SIZE_Z/2+1));
int remainder = index-kx*GRID_SIZE_Y*(GRID_SIZE_Z/2+1);
int ky = remainder/(GRID_SIZE_Z/2+1);
int kz = remainder-ky*(GRID_SIZE_Z/2+1);
int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
real mhx = mx*recipBoxVecX.x;
real mhy = mx*recipBoxVecY.x+my*recipBoxVecY.y;
real mhz = mx*recipBoxVecZ.x+my*recipBoxVecZ.y+mz*recipBoxVecZ.z;
real bx = pmeBsplineModuliX[kx];
real by = pmeBsplineModuliY[ky];
real bz = pmeBsplineModuliZ[kz];
real denom = scaleFactor/(bx*by*bz);
real2 grid = halfcomplex_pmeGrid[index];
real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
real m = SQRT(m2);
real m3 = m*m2;
real b = bfac*m;
real expfac = -b*b;
real expterm = EXP(expfac);
#if FAST_ERFC
// This approximation for erfc is from Abramowitz and Stegun (1964) p. 299. They cite the following as
// the original source: C. Hastings, Jr., Approximations for Digital Computers (1955). It has a maximum
// error of 1.5e-7. Stolen by ACS from the CUDA platform's AMOEBA plugin.
real t = 1.0f/(1.0f+0.3275911f*b);
real erfcterm = (0.254829592f+(-0.284496736f+(1.421413741f+(-1.453152027f+1.061405429f*t)*t)*t)*t)*t*expterm;
#else
real erfcterm = ERFC(b);
#endif
real eterm = (fac1*erfcterm*m3 + expterm*(fac2 + fac3*m2)) * denom;
halfcomplex_pmeGrid[index] = make_real2(grid.x*eterm, grid.y*eterm);
}
}
extern "C" __global__ void
gridEvaluateDispersionEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__ energyBuffer,
const real* __restrict__ pmeBsplineModuliX, const real* __restrict__ pmeBsplineModuliY, const real* __restrict__ pmeBsplineModuliZ,
real4 periodicBoxSize, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
// R2C stores into a half complex matrix where the last dimension is cut by half
const unsigned int gridSize = DISPERSION_GRID_SIZE_X*DISPERSION_GRID_SIZE_Y*DISPERSION_GRID_SIZE_Z;
const real scaleFactor = -2*M_PI*SQRT(M_PI)*RECIP(6*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
const real alpha = EWALD_DISPERSION_ALPHA;
real bfac = M_PI / alpha;
real fac1 = 2*M_PI*M_PI*M_PI*SQRT(M_PI);
real fac2 = alpha*alpha*alpha;
real fac3 = -2*alpha*M_PI*M_PI;
mixed energy = 0;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) {
// real indices
int kx = index/(DISPERSION_GRID_SIZE_Y*(DISPERSION_GRID_SIZE_Z));
int remainder = index-kx*DISPERSION_GRID_SIZE_Y*(DISPERSION_GRID_SIZE_Z);
int ky = remainder/(DISPERSION_GRID_SIZE_Z);
int kz = remainder-ky*(DISPERSION_GRID_SIZE_Z);
int mx = (kx < (DISPERSION_GRID_SIZE_X+1)/2) ? kx : (kx-DISPERSION_GRID_SIZE_X);
int my = (ky < (DISPERSION_GRID_SIZE_Y+1)/2) ? ky : (ky-DISPERSION_GRID_SIZE_Y);
int mz = (kz < (DISPERSION_GRID_SIZE_Z+1)/2) ? kz : (kz-DISPERSION_GRID_SIZE_Z);
real mhx = mx*recipBoxVecX.x;
real mhy = mx*recipBoxVecY.x+my*recipBoxVecY.y;
real mhz = mx*recipBoxVecZ.x+my*recipBoxVecZ.y+mz*recipBoxVecZ.z;
real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
real bx = pmeBsplineModuliX[kx];
real by = pmeBsplineModuliY[ky];
real bz = pmeBsplineModuliZ[kz];
real denom = scaleFactor/(bx*by*bz);
real m = SQRT(m2);
real m3 = m*m2;
real b = bfac*m;
real expfac = -b*b;
real expterm = EXP(expfac);
#if FAST_ERFC
// This approximation for erfc is from Abramowitz and Stegun (1964) p. 299. They cite the following as
// the original source: C. Hastings, Jr., Approximations for Digital Computers (1955). It has a maximum
// error of 1.5e-7. Stolen by ACS from the CUDA platform's AMOEBA plugin.
real t = 1.0f/(1.0f+0.3275911f*b);
real erfcterm = (0.254829592f+(-0.284496736f+(1.421413741f+(-1.453152027f+1.061405429f*t)*t)*t)*t)*t*expterm;
#else
real erfcterm = ERFC(b);
#endif
real eterm = (fac1*erfcterm*m3 + expterm*(fac2 + fac3*m2)) * denom;
if (kz >= (DISPERSION_GRID_SIZE_Z/2+1)) {
kx = ((kx == 0) ? kx : DISPERSION_GRID_SIZE_X-kx);
ky = ((ky == 0) ? ky : DISPERSION_GRID_SIZE_Y-ky);
kz = DISPERSION_GRID_SIZE_Z-kz;
}
int indexInHalfComplexGrid = kz + ky*(DISPERSION_GRID_SIZE_Z/2+1)+kx*(DISPERSION_GRID_SIZE_Y*(DISPERSION_GRID_SIZE_Z/2+1));
real2 grid = halfcomplex_pmeGrid[indexInHalfComplexGrid];
// N.B. We inlcude the 0,0,0 point for dispersion
energy += eterm*(grid.x*grid.x + grid.y*grid.y);
}
#ifdef USE_PME_STREAM
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] = 0.5f*energy;
#else
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += 0.5f*energy;
#endif
}
extern "C" __global__
void gridInterpolateDispersionForce(const real4* __restrict__ posq, unsigned long long* __restrict__ forceBuffers, const real* __restrict__ originalPmeGrid,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ, const int2* __restrict__ pmeAtomGridIndex, const real* __restrict__ C6s) {
real3 data[PME_ORDER];
real3 ddata[PME_ORDER];
const real scale = RECIP(PME_ORDER-1);
// Process the atoms in spatially sorted order. This improves cache performance when loading
// the grid values.
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
int atom = pmeAtomGridIndex[i].x;
real3 force = make_real3(0);
real4 pos = posq[atom];
APPLY_PERIODIC_TO_POS(pos)
real3 t = make_real3(pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x,
pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y,
pos.z*recipBoxVecZ.z);
t.x = (t.x-floor(t.x))*DISPERSION_GRID_SIZE_X;
t.y = (t.y-floor(t.y))*DISPERSION_GRID_SIZE_Y;
t.z = (t.z-floor(t.z))*DISPERSION_GRID_SIZE_Z;
int3 gridIndex = make_int3(((int) t.x) % DISPERSION_GRID_SIZE_X,
((int) t.y) % DISPERSION_GRID_SIZE_Y,
((int) t.z) % DISPERSION_GRID_SIZE_Z);
// Since we need the full set of thetas, it's faster to compute them here than load them
// from global memory.
real3 dr = make_real3(t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z);
data[PME_ORDER-1] = make_real3(0);
data[1] = dr;
data[0] = make_real3(1)-dr;
for (int j = 3; j < PME_ORDER; j++) {
real div = RECIP(j-1);
data[j-1] = div*dr*data[j-2];
for (int k = 1; k < (j-1); k++)
data[j-k-1] = div*((dr+make_real3(k))*data[j-k-2] + (make_real3(j-k)-dr)*data[j-k-1]);
data[0] = div*(make_real3(1)-dr)*data[0];
}
ddata[0] = -data[0];
for (int j = 1; j < PME_ORDER; j++)
ddata[j] = data[j-1]-data[j];
data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
for (int j = 1; j < (PME_ORDER-1); j++)
data[PME_ORDER-j-1] = scale*((dr+make_real3(j))*data[PME_ORDER-j-2] + (make_real3(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
data[0] = scale*(make_real3(1)-dr)*data[0];
// Compute the force on this atom.
for (int ix = 0; ix < PME_ORDER; ix++) {
int xbase = gridIndex.x+ix;
xbase -= (xbase >= DISPERSION_GRID_SIZE_X ? DISPERSION_GRID_SIZE_X : 0);
xbase = xbase*DISPERSION_GRID_SIZE_Y*DISPERSION_GRID_SIZE_Z;
real dx = data[ix].x;
real ddx = ddata[ix].x;
for (int iy = 0; iy < PME_ORDER; iy++) {
int ybase = gridIndex.y+iy;
ybase -= (ybase >= DISPERSION_GRID_SIZE_Y ? DISPERSION_GRID_SIZE_Y : 0);
ybase = xbase + ybase*DISPERSION_GRID_SIZE_Z;
real dy = data[iy].y;
real ddy = ddata[iy].y;
for (int iz = 0; iz < PME_ORDER; iz++) {
int zindex = gridIndex.z+iz;
zindex -= (zindex >= DISPERSION_GRID_SIZE_Z ? DISPERSION_GRID_SIZE_Z : 0);
int index = ybase + zindex;
real gridvalue = originalPmeGrid[index];
force.x += ddx*dy*data[iz].z*gridvalue;
force.y += dx*ddy*data[iz].z*gridvalue;
force.z += dx*dy*ddata[iz].z*gridvalue;
}
}
}
real q = C6s[atom];
real forceX = -q*(force.x*DISPERSION_GRID_SIZE_X*recipBoxVecX.x);
real forceY = -q*(force.x*DISPERSION_GRID_SIZE_X*recipBoxVecY.x+force.y*DISPERSION_GRID_SIZE_Y*recipBoxVecY.y);
real forceZ = -q*(force.x*DISPERSION_GRID_SIZE_X*recipBoxVecZ.x+force.y*DISPERSION_GRID_SIZE_Y*recipBoxVecZ.y+force.z*DISPERSION_GRID_SIZE_Z*recipBoxVecZ.z);
atomicAdd(&forceBuffers[atom], static_cast<unsigned long long>((long long) (forceX*0x100000000)));
atomicAdd(&forceBuffers[atom+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (forceY*0x100000000)));
atomicAdd(&forceBuffers[atom+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (forceZ*0x100000000)));
}
}
......@@ -2209,12 +2209,13 @@ void OpenCLCalcNonbondedForceKernel::getLJPMEParameters(double& alpha, int& nx,
if (nonbondedMethod != LJPME)
throw OpenMMException("getPMEParametersInContext: This Context is not using PME");
if (cl.getPlatformData().useCpuPme)
cpuPme.getAs<CalcPmeReciprocalForceKernel>().getPMEParameters(alpha, nx, ny, nz);
//cpuPme.getAs<CalcPmeReciprocalForceKernel>().getLJPMEParameters(alpha, nx, ny, nz);
throw OpenMMException("getPMEParametersInContext: CPUPME has not been implemented for LJPME yet.");
else {
alpha = this->alpha;
nx = gridSizeX;
ny = gridSizeY;
nz = gridSizeZ;
alpha = this->dispersionAlpha;
nx = dispersionGridSizeX;
ny = dispersionGridSizeY;
nz = dispersionGridSizeZ;
}
}
......
......@@ -1155,7 +1155,7 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
NonbondedForce nb;
nb.setEwaldErrorTolerance(force.getEwaldErrorTolerance());
nb.setCutoffDistance(force.getCutoffDistance());
NonbondedForceImpl::calcPMEParameters(system, nb, alpha, gridSizeX, gridSizeY, gridSizeZ);
NonbondedForceImpl::calcPMEParameters(system, nb, alpha, gridSizeX, gridSizeY, gridSizeZ, false);
gridSizeX = CudaFFT3D::findLegalDimension(gridSizeX);
gridSizeY = CudaFFT3D::findLegalDimension(gridSizeY);
gridSizeZ = CudaFFT3D::findLegalDimension(gridSizeZ);
......
......@@ -55,5 +55,7 @@ extern "C" OPENMM_EXPORT_PME void registerPlatforms() {
KernelImpl* CpuPmeKernelFactory::createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const {
if (name == CalcPmeReciprocalForceKernel::Name())
return new CpuCalcPmeReciprocalForceKernel(name, platform);
if (name == CalcDispersionPmeReciprocalForceKernel::Name())
return new CpuCalcDispersionPmeReciprocalForceKernel(name, platform);
throw OpenMMException((std::string("Tried to create kernel with illegal kernel name '")+name+"'").c_str());
}
......@@ -48,8 +48,8 @@ using namespace std;
static const int PME_ORDER = 5;
bool CpuCalcPmeReciprocalForceKernel::hasInitializedThreads = false;
int CpuCalcPmeReciprocalForceKernel::numThreads = 0;
bool CpuCalcDispersionPmeReciprocalForceKernel::hasInitializedThreads = false;
int CpuCalcDispersionPmeReciprocalForceKernel::numThreads = 0;
static void spreadCharge(float* posq, float* grid, int gridx, int gridy, int gridz, int numParticles, Vec3* periodicBoxVectors, Vec3* recipBoxVectors, gmx_atomic_t& atomicCounter, const float epsilonFactor) {
float temp[4];
......@@ -590,7 +590,7 @@ void CpuCalcPmeReciprocalForceKernel::runWorkerThread(ThreadPool& threads, int i
int complexSize = gridx*gridy*(gridz/2+1);
int complexStart = std::max(1, ((index*complexSize)/numThreads));
int complexEnd = (((index+1)*complexSize)/numThreads);
const float epsilonFactor = calculationType==Electrostatic ? sqrt(ONE_4PI_EPS0) : 1.0f;
const float epsilonFactor = sqrt(ONE_4PI_EPS0);
spreadCharge(posq, tempGrid[index], gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors, atomicCounter, epsilonFactor);
threads.syncThreads();
int numGrids = tempGrid.size();
......@@ -601,37 +601,16 @@ void CpuCalcPmeReciprocalForceKernel::runWorkerThread(ThreadPool& threads, int i
sum.store(&realGrid[i]);
}
threads.syncThreads();
switch(calculationType){
case Electrostatic:
if (lastBoxVectors[0] != periodicBoxVectors[0] || lastBoxVectors[1] != periodicBoxVectors[1] || lastBoxVectors[2] != periodicBoxVectors[2]) {
computeReciprocalEterm(gridxStart, gridxEnd, gridx, gridy, gridz, recipEterm, alpha, bsplineModuli, periodicBoxVectors, recipBoxVectors);
threads.syncThreads();
}
if (includeEnergy) {
threadEnergy[index] = reciprocalEnergy(gridxStart, gridxEnd, complexGrid, recipEterm, gridx, gridy, gridz, alpha, bsplineModuli, periodicBoxVectors, recipBoxVectors);
threads.syncThreads();
}
reciprocalConvolution(complexStart, complexEnd, complexGrid, recipEterm);
if (lastBoxVectors[0] != periodicBoxVectors[0] || lastBoxVectors[1] != periodicBoxVectors[1] || lastBoxVectors[2] != periodicBoxVectors[2]) {
computeReciprocalEterm(gridxStart, gridxEnd, gridx, gridy, gridz, recipEterm, alpha, bsplineModuli, periodicBoxVectors, recipBoxVectors);
threads.syncThreads();
break;
case Dispersion:
if (lastBoxVectors[0] != periodicBoxVectors[0] || lastBoxVectors[1] != periodicBoxVectors[1] || lastBoxVectors[2] != periodicBoxVectors[2]) {
computeReciprocalDispersionEterm(gridxStart, gridxEnd, gridx, gridy, gridz, recipEterm, alpha, bsplineModuli, periodicBoxVectors, recipBoxVectors);
threads.syncThreads();
}
if (includeEnergy) {
threadEnergy[index] = reciprocalDispersionEnergy(gridxStart, gridxEnd, complexGrid, recipEterm, gridx, gridy, gridz, alpha, bsplineModuli, periodicBoxVectors, recipBoxVectors);
threads.syncThreads();
}
// For dispersion, we include the {0,0,0} term, so the start point needs to be redefined
complexStart = (index*complexSize)/numThreads;
reciprocalConvolution(complexStart, complexEnd, complexGrid, recipEterm);
}
if (includeEnergy) {
threadEnergy[index] = reciprocalEnergy(gridxStart, gridxEnd, complexGrid, recipEterm, gridx, gridy, gridz, alpha, bsplineModuli, periodicBoxVectors, recipBoxVectors);
threads.syncThreads();
break;
default:
throw OpenMMException("Unimplemented convolution type");
}
reciprocalConvolution(complexStart, complexEnd, complexGrid, recipEterm);
threads.syncThreads();
interpolateForces(posq, &force[0], realGrid, gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors, atomicCounter, epsilonFactor);
}
......@@ -702,3 +681,296 @@ int CpuCalcPmeReciprocalForceKernel::findFFTDimension(int minimum, bool isZ) {
minimum++;
}
}
/*
* Everything below here is just a clone of the above, but to handle the dispersion term
* instead of electrostatics.
*/
bool CpuCalcPmeReciprocalForceKernel::hasInitializedThreads = false;
int CpuCalcPmeReciprocalForceKernel::numThreads = 0;
class CpuCalcDispersionPmeReciprocalForceKernel::ComputeTask : public ThreadPool::Task {
public:
ComputeTask(CpuCalcDispersionPmeReciprocalForceKernel& owner) : owner(owner) {
}
void execute(ThreadPool& threads, int threadIndex) {
owner.runWorkerThread(threads, threadIndex);
}
CpuCalcDispersionPmeReciprocalForceKernel& owner;
};
static void* dispersionThreadBody(void* args) {
CpuCalcDispersionPmeReciprocalForceKernel& owner = *reinterpret_cast<CpuCalcDispersionPmeReciprocalForceKernel*>(args);
owner.runMainThread();
return 0;
}
void CpuCalcDispersionPmeReciprocalForceKernel::initialize(int xsize, int ysize, int zsize, int numParticles, double alpha) {
if (!hasInitializedThreads) {
numThreads = getNumProcessors();
char* threadsEnv = getenv("OPENMM_CPU_THREADS");
if (threadsEnv != NULL)
stringstream(threadsEnv) >> numThreads;
fftwf_init_threads();
hasInitializedThreads = true;
}
threadEnergy.resize(numThreads);
gridx = findFFTDimension(xsize, false);
gridy = findFFTDimension(ysize, false);
gridz = findFFTDimension(zsize, true);
this->numParticles = numParticles;
this->alpha = alpha;
force.resize(4*numParticles);
recipEterm.resize(gridx*gridy*gridz);
// Initialize threads.
isFinished = false;
pthread_cond_init(&startCondition, NULL);
pthread_cond_init(&endCondition, NULL);
pthread_mutex_init(&lock, NULL);
pthread_create(&mainThread, NULL, dispersionThreadBody, this);
// Wait until the main thread is up and running.
pthread_mutex_lock(&lock);
while (!isFinished)
pthread_cond_wait(&endCondition, &lock);
pthread_mutex_unlock(&lock);
// Initialize FFTW.
for (int i = 0; i < numThreads; i++)
tempGrid.push_back((float*) fftwf_malloc(sizeof(float)*(gridx*gridy*gridz+3)));
realGrid = tempGrid[0];
complexGrid = (fftwf_complex*) fftwf_malloc(sizeof(fftwf_complex)*gridx*gridy*(gridz/2+1));
fftwf_plan_with_nthreads(numThreads);
forwardFFT = fftwf_plan_dft_r2c_3d(gridx, gridy, gridz, realGrid, complexGrid, FFTW_MEASURE);
backwardFFT = fftwf_plan_dft_c2r_3d(gridx, gridy, gridz, complexGrid, realGrid, FFTW_MEASURE);
hasCreatedPlan = true;
// Initialize the b-spline moduli.
int maxSize = std::max(std::max(gridx, gridy), gridz);
vector<double> data(PME_ORDER);
vector<double> ddata(PME_ORDER);
vector<double> bsplinesData(maxSize);
data[PME_ORDER-1] = 0.0;
data[1] = 0.0;
data[0] = 1.0;
for (int i = 3; i < PME_ORDER; i++) {
double div = 1.0/(i-1.0);
data[i-1] = 0.0;
for (int j = 1; j < (i-1); j++)
data[i-j-1] = div*(j*data[i-j-2]+(i-j)*data[i-j-1]);
data[0] = div*data[0];
}
// Differentiate.
ddata[0] = -data[0];
for (int i = 1; i < PME_ORDER; i++)
ddata[i] = data[i-1]-data[i];
double div = 1.0/(PME_ORDER-1);
data[PME_ORDER-1] = 0.0;
for (int i = 1; i < (PME_ORDER-1); i++)
data[PME_ORDER-i-1] = div*(i*data[PME_ORDER-i-2]+(PME_ORDER-i)*data[PME_ORDER-i-1]);
data[0] = div*data[0];
for (int i = 0; i < maxSize; i++)
bsplinesData[i] = 0.0;
for (int i = 1; i <= PME_ORDER; i++)
bsplinesData[i] = data[i-1];
// Evaluate the actual bspline moduli for X/Y/Z.
bsplineModuli[0].resize(gridx);
bsplineModuli[1].resize(gridy);
bsplineModuli[2].resize(gridz);
for (int dim = 0; dim < 3; dim++) {
int ndata = bsplineModuli[dim].size();
vector<float>& moduli = bsplineModuli[dim];
for (int i = 0; i < ndata; i++) {
double sc = 0.0;
double ss = 0.0;
for (int j = 0; j < ndata; j++) {
double arg = (2.0*M_PI*i*j)/ndata;
sc += bsplinesData[j]*cos(arg);
ss += bsplinesData[j]*sin(arg);
}
moduli[i] = (float) (sc*sc+ss*ss);
}
for (int i = 0; i < ndata; i++)
if (moduli[i] < 1.0e-7f)
moduli[i] = (moduli[i-1]+moduli[i+1])*0.5f;
}
}
CpuCalcDispersionPmeReciprocalForceKernel::~CpuCalcDispersionPmeReciprocalForceKernel() {
isDeleted = true;
pthread_mutex_lock(&lock);
pthread_cond_broadcast(&startCondition);
pthread_mutex_unlock(&lock);
pthread_join(mainThread, NULL);
pthread_mutex_destroy(&lock);
pthread_cond_destroy(&startCondition);
pthread_cond_destroy(&endCondition);
for (int i = 0; i < (int) tempGrid.size(); i++)
fftwf_free(tempGrid[i]);
if (complexGrid != NULL)
fftwf_free(complexGrid);
if (hasCreatedPlan) {
fftwf_destroy_plan(forwardFFT);
fftwf_destroy_plan(backwardFFT);
}
}
void CpuCalcDispersionPmeReciprocalForceKernel::runMainThread() {
// This is the main thread that coordinates all the other ones.
pthread_mutex_lock(&lock);
isFinished = true;
pthread_cond_signal(&endCondition);
ThreadPool threads(numThreads);
while (true) {
// Wait for the signal to start.
pthread_cond_wait(&startCondition, &lock);
if (isDeleted)
break;
posq = io->getPosq();
ComputeTask task(*this);
gmx_atomic_set(&atomicCounter, 0);
threads.execute(task); // Signal threads to perform charge spreading.
threads.waitForThreads();
threads.resumeThreads(); // Signal threads to sum the charge grids.
threads.waitForThreads();
fftwf_execute_dft_r2c(forwardFFT, realGrid, complexGrid);
if (lastBoxVectors[0] != periodicBoxVectors[0] || lastBoxVectors[1] != periodicBoxVectors[1] || lastBoxVectors[2] != periodicBoxVectors[2]) {
threads.resumeThreads(); // Signal threads to compute the reciprocal scale factors.
threads.waitForThreads();
}
if (includeEnergy) {
threads.resumeThreads(); // Signal threads to compute energy.
threads.waitForThreads();
for (int i = 0; i < (int) threadEnergy.size(); i++)
energy += threadEnergy[i];
}
threads.resumeThreads(); // Signal threads to perform reciprocal convolution.
threads.waitForThreads();
fftwf_execute_dft_c2r(backwardFFT, complexGrid, realGrid);
gmx_atomic_set(&atomicCounter, 0);
threads.resumeThreads(); // Signal threads to interpolate forces.
threads.waitForThreads();
isFinished = true;
lastBoxVectors[0] = periodicBoxVectors[0];
lastBoxVectors[1] = periodicBoxVectors[1];
lastBoxVectors[2] = periodicBoxVectors[2];
pthread_cond_signal(&endCondition);
}
pthread_mutex_unlock(&lock);
}
void CpuCalcDispersionPmeReciprocalForceKernel::runWorkerThread(ThreadPool& threads, int index) {
int gridxStart = (index*gridx)/numThreads;
int gridxEnd = ((index+1)*gridx)/numThreads;
int gridSize = (gridx*gridy*gridz+3)/4;
int gridStart = 4*((index*gridSize)/numThreads);
int gridEnd = 4*(((index+1)*gridSize)/numThreads);
int complexSize = gridx*gridy*(gridz/2+1);
int complexStart = std::max(1, ((index*complexSize)/numThreads));
int complexEnd = (((index+1)*complexSize)/numThreads);
const float epsilonFactor = 1.0f;
spreadCharge(posq, tempGrid[index], gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors, atomicCounter, epsilonFactor);
threads.syncThreads();
int numGrids = tempGrid.size();
for (int i = gridStart; i < gridEnd; i += 4) {
fvec4 sum(&realGrid[i]);
for (int j = 1; j < numGrids; j++)
sum += fvec4(&tempGrid[j][i]);
sum.store(&realGrid[i]);
}
threads.syncThreads();
if (lastBoxVectors[0] != periodicBoxVectors[0] || lastBoxVectors[1] != periodicBoxVectors[1] || lastBoxVectors[2] != periodicBoxVectors[2]) {
computeReciprocalDispersionEterm(gridxStart, gridxEnd, gridx, gridy, gridz, recipEterm, alpha, bsplineModuli, periodicBoxVectors, recipBoxVectors);
threads.syncThreads();
}
if (includeEnergy) {
threadEnergy[index] = reciprocalDispersionEnergy(gridxStart, gridxEnd, complexGrid, recipEterm, gridx, gridy, gridz, alpha, bsplineModuli, periodicBoxVectors, recipBoxVectors);
threads.syncThreads();
}
// For dispersion, we include the {0,0,0} term, so the start point needs to be redefined
complexStart = (index*complexSize)/numThreads;
reciprocalConvolution(complexStart, complexEnd, complexGrid, recipEterm);
threads.syncThreads();
interpolateForces(posq, &force[0], realGrid, gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors, atomicCounter, epsilonFactor);
}
void CpuCalcDispersionPmeReciprocalForceKernel::beginComputation(IO& io, const Vec3* periodicBoxVectors, bool includeEnergy) {
this->io = &io;
this->periodicBoxVectors[0] = periodicBoxVectors[0];
this->periodicBoxVectors[1] = periodicBoxVectors[1];
this->periodicBoxVectors[2] = periodicBoxVectors[2];
this->includeEnergy = includeEnergy;
energy = 0.0;
// Invert the box vectors.
double determinant = periodicBoxVectors[0][0]*periodicBoxVectors[1][1]*periodicBoxVectors[2][2];
double scale = 1.0/determinant;
recipBoxVectors[0] = Vec3(periodicBoxVectors[1][1]*periodicBoxVectors[2][2], 0, 0)*scale;
recipBoxVectors[1] = Vec3(-periodicBoxVectors[1][0]*periodicBoxVectors[2][2], periodicBoxVectors[0][0]*periodicBoxVectors[2][2], 0)*scale;
recipBoxVectors[2] = Vec3(periodicBoxVectors[1][0]*periodicBoxVectors[2][1]-periodicBoxVectors[1][1]*periodicBoxVectors[2][0], -periodicBoxVectors[0][0]*periodicBoxVectors[2][1], periodicBoxVectors[0][0]*periodicBoxVectors[1][1])*scale;
// Do the calculation.
pthread_mutex_lock(&lock);
isFinished = false;
pthread_cond_signal(&startCondition);
pthread_mutex_unlock(&lock);
}
double CpuCalcDispersionPmeReciprocalForceKernel::finishComputation(IO& io) {
pthread_mutex_lock(&lock);
while (!isFinished) {
pthread_cond_wait(&endCondition, &lock);
}
pthread_mutex_unlock(&lock);
io.setForce(&force[0]);
return energy;
}
bool CpuCalcDispersionPmeReciprocalForceKernel::isProcessorSupported() {
return isVec4Supported();
}
void CpuCalcDispersionPmeReciprocalForceKernel::getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
alpha = this->alpha;
nx = gridx;
ny = gridy;
nz = gridz;
}
int CpuCalcDispersionPmeReciprocalForceKernel::findFFTDimension(int minimum, bool isZ) {
if (minimum < 1)
return 1;
while (true) {
// Attempt to factor the current value.
if (isZ && minimum%2 == 1) {
// Force the last dimension to be even, since this produces better performance in FFTW.
minimum++;
continue;
}
int unfactored = minimum;
for (int factor = 2; factor < 8; factor++) {
while (unfactored > 1 && unfactored%factor == 0)
unfactored /= factor;
}
if (unfactored == 1 || unfactored == 11 || unfactored == 13)
return minimum;
minimum++;
}
}
......@@ -51,10 +51,8 @@ namespace OpenMM {
class OPENMM_EXPORT_PME CpuCalcPmeReciprocalForceKernel : public CalcPmeReciprocalForceKernel {
public:
enum CalculationType { Electrostatic=0, Dispersion=1 };
CpuCalcPmeReciprocalForceKernel(std::string name, const Platform& platform) : CalcPmeReciprocalForceKernel(name, platform),
hasCreatedPlan(false), isDeleted(false), realGrid(NULL), complexGrid(NULL), calculationType(Electrostatic) {
hasCreatedPlan(false), isDeleted(false), realGrid(NULL), complexGrid(NULL) {
}
/**
* Initialize the kernel.
......@@ -103,11 +101,98 @@ public:
* @param nz the number of grid points along the Z axis
*/
void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
private:
class ComputeTask;
/**
* Select a size for one grid dimension that FFTW can handle efficiently.
*/
int findFFTDimension(int minimum, bool isZ);
static bool hasInitializedThreads;
static int numThreads;
int gridx, gridy, gridz, numParticles;
double alpha;
bool hasCreatedPlan, isFinished, isDeleted;
std::vector<float> force;
std::vector<float> bsplineModuli[3];
std::vector<float> recipEterm;
Vec3 lastBoxVectors[3];
std::vector<float> threadEnergy;
std::vector<float*> tempGrid;
float* realGrid;
fftwf_complex* complexGrid;
fftwf_plan forwardFFT, backwardFFT;
int waitCount;
pthread_cond_t startCondition, endCondition;
pthread_mutex_t lock;
pthread_t mainThread;
// The following variables are used to store information about the calculation currently being performed.
IO* io;
float energy;
float* posq;
Vec3 periodicBoxVectors[3], recipBoxVectors[3];
bool includeEnergy;
gmx_atomic_t atomicCounter;
};
/**
* This is an optimized CPU implementation of CalcDispersionPmeReciprocalForceKernel. It is both
* vectorized (requiring SSE 4.1) and multithreaded. It uses FFTW to perform the FFTs.
*/
class OPENMM_EXPORT_PME CpuCalcDispersionPmeReciprocalForceKernel : public CalcPmeReciprocalForceKernel {
public:
CpuCalcDispersionPmeReciprocalForceKernel(std::string name, const Platform& platform) : CalcPmeReciprocalForceKernel(name, platform),
hasCreatedPlan(false), isDeleted(false), realGrid(NULL), complexGrid(NULL) {
}
/**
* Initialize the kernel.
*
* @param gridx the x size of the PME grid
* @param gridy the y size of the PME grid
* @param gridz the z size of the PME grid
* @param numParticles the number of particles in the system
* @param alpha the Ewald blending parameter
*/
void initialize(int xsize, int ysize, int zsize, int numParticles, double alpha);
~CpuCalcDispersionPmeReciprocalForceKernel();
/**
* Begin computing the force and energy.
*
* @param io an object that coordinates data transfer
* @param periodicBoxVectors the vectors defining the periodic box (measured in nm)
* @param includeEnergy true if potential energy should be computed
*/
void beginComputation(IO& io, const Vec3* periodicBoxVectors, bool includeEnergy);
/**
* Sets the type of reciprocal space computation to perform (Electrostatic or Dispersion).
* @param type The type of computation
* Finish computing the force and energy.
*
* @param io an object that coordinates data transfer
* @return the potential energy due to the PME reciprocal space interactions
*/
void setCalculationType(CalculationType type) { calculationType = type; }
double finishComputation(IO& io);
/**
* This routine contains the code executed by the main thread.
*/
void runMainThread();
/**
* This routine contains the code executed by each worker thread.
*/
void runWorkerThread(ThreadPool& threads, int index);
/**
* Get whether the current CPU supports all features needed by this kernel.
*/
static bool isProcessorSupported();
/**
* Get the parameters being used for PME.
*
* @param alpha the separation parameter
* @param nx the number of grid points along the X axis
* @param ny the number of grid points along the Y axis
* @param nz the number of grid points along the Z axis
*/
void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
private:
class ComputeTask;
/**
......@@ -138,7 +223,6 @@ private:
float* posq;
Vec3 periodicBoxVectors[3], recipBoxVectors[3];
bool includeEnergy;
CalculationType calculationType;
gmx_atomic_t atomicCounter;
};
......
......@@ -523,8 +523,7 @@ void test_water2_dpme_energies_forces_no_exclusions() {
const vector<Vec3>& refforces = state.getForces();
// Optimized CPU calculation
CpuCalcPmeReciprocalForceKernel pme(CalcPmeReciprocalForceKernel::Name(), platform);
pme.setCalculationType(CpuCalcPmeReciprocalForceKernel::Dispersion);
CpuCalcDispersionPmeReciprocalForceKernel pme(CalcPmeReciprocalForceKernel::Name(), platform);
IO io;
double selfEwaldEnergy = 0;
double dalpha6 = pow(dalpha, 6.0);
......
......@@ -1695,8 +1695,10 @@ void test_water125_dpme_vs_long_cutoff_with_exclusions() {
ASSERT_EQUAL_TOL(refenergy, energy, 5E-4);
ASSERT_EQUAL_TOL(gromacs_energy, energy, 5E-5);
// Forces accumulated in single precision are tested to a more permissive criterion; the double
// precision platform can match to 5E-5.
for(int n = 0; n < NATOMS; ++n)
ASSERT_EQUAL_VEC(refforces[n], forces[n], 5E-5);
ASSERT_EQUAL_VEC(refforces[n], forces[n], 1E-4);
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment