Commit a9f65649 authored by Peter Eastman's avatar Peter Eastman
Browse files

Merge branch 'dpme' of https://github.com/andysim/openmm into ljpme

parents 9567ddb3 58b6e3b6
......@@ -555,7 +555,8 @@ public:
CutoffNonPeriodic = 1,
CutoffPeriodic = 2,
Ewald = 3,
PME = 4
PME = 4,
LJPME = 5
};
static std::string Name() {
return "CalcNonbondedForce";
......@@ -589,13 +590,22 @@ public:
virtual void copyParametersToContext(ContextImpl& context, const NonbondedForce& force) = 0;
/**
* Get the parameters being used for PME.
*
*
* @param alpha the separation parameter
* @param nx the number of grid points along the X axis
* @param ny the number of grid points along the Y axis
* @param nz the number of grid points along the Z axis
*/
virtual void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const = 0;
/**
* Get the parameters being used for the dispersion terms in LJPME.
*
* @param alpha the separation parameter
* @param nx the number of grid points along the X axis
* @param ny the number of grid points along the Y axis
* @param nz the number of grid points along the Z axis
*/
virtual void getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const = 0;
};
/**
......@@ -1335,6 +1345,57 @@ public:
};
/**
* This kernel performs the dispersion reciprocal space calculation for LJPME. In most cases, this
* calculation is done directly by CalcNonbondedForceKernel so this kernel is unneeded.
* In some cases it may want to outsource the work to a different kernel. In particular,
* GPU based platforms sometimes use a CPU based implementation provided by a separate
* plugin.
*/
class CalcDispersionPmeReciprocalForceKernel : public KernelImpl {
public:
class IO;
static std::string Name() {
return "CalcDispersionPmeReciprocalForce";
}
CalcDispersionPmeReciprocalForceKernel(std::string name, const Platform& platform) : KernelImpl(name, platform) {
}
/**
* Initialize the kernel.
*
* @param gridx the x size of the PME grid
* @param gridy the y size of the PME grid
* @param gridz the z size of the PME grid
* @param numParticles the number of particles in the system
* @param alpha the Ewald blending parameter
*/
virtual void initialize(int gridx, int gridy, int gridz, int numParticles, double alpha) = 0;
/**
* Begin computing the force and energy.
*
* @param io an object that coordinates data transfer
* @param periodicBoxVectors the vectors defining the periodic box (measured in nm)
* @param includeEnergy true if potential energy should be computed
*/
virtual void beginComputation(IO& io, const Vec3* periodicBoxVectors, bool includeEnergy) = 0;
/**
* Finish computing the force and energy.
*
* @param io an object that coordinates data transfer
* @return the potential energy due to the PME reciprocal space interactions
*/
virtual double finishComputation(IO& io) = 0;
/**
* Get the parameters being used for PME.
*
* @param alpha the separation parameter
* @param nx the number of grid points along the X axis
* @param ny the number of grid points along the Y axis
* @param nz the number of grid points along the Z axis
*/
virtual void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const = 0;
};
} // namespace OpenMM
#endif /*OPENMM_KERNELS_H_*/
......@@ -109,7 +109,12 @@ public:
* Periodic boundary conditions are used, and Particle-Mesh Ewald (PME) summation is used to compute the interaction of each particle
* with all periodic copies of every other particle.
*/
PME = 4
PME = 4,
/**
* Periodic boundary conditions are used, and Particle-Mesh Ewald (PME) summation is used to compute the interaction of each particle
* with all periodic copies of every other particle for both electrostatics and dispersion. No switching is used for either interaction.
*/
LJPME = 5
};
/**
* Create a NonbondedForce.
......@@ -207,6 +212,16 @@ public:
* @param[out] nz the number of grid points along the Z axis
*/
void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
/**
* Get the parameters to use for dispersion term in LJ-PME calculations. If alpha is 0 (the default),
* these parameters are ignored and instead their values are chosen based on the Ewald error tolerance.
*
* @param[out] alpha the separation parameter
* @param[out] nx the number of dispersion grid points along the X axis
* @param[out] ny the number of dispersion grid points along the Y axis
* @param[out] nz the number of dispersion grid points along the Z axis
*/
void getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
/**
* Set the parameters to use for PME calculations. If alpha is 0 (the default), these parameters are
* ignored and instead their values are chosen based on the Ewald error tolerance.
......@@ -217,6 +232,16 @@ public:
* @param nz the number of grid points along the Z axis
*/
void setPMEParameters(double alpha, int nx, int ny, int nz);
/**
* Set the parameters to use for the dispersion term in LJPME calculations. If alpha is 0 (the default),
* these parameters are ignored and instead their values are chosen based on the Ewald error tolerance.
*
* @param alpha the separation parameter
* @param nx the number of grid points along the X axis
* @param ny the number of grid points along the Y axis
* @param nz the number of grid points along the Z axis
*/
void setLJPMEParameters(double alpha, int nx, int ny, int nz);
/**
* Get the parameters being used for PME in a particular Context. Because some platforms have restrictions
* on the allowed grid sizes, the values that are actually used may be slightly different from those
......@@ -230,6 +255,19 @@ public:
* @param[out] nz the number of grid points along the Z axis
*/
void getPMEParametersInContext(const Context& context, double& alpha, int& nx, int& ny, int& nz) const;
/**
* Get the PME parameters being used for the dispersion term for LJPME in a particular Context. Because some
* platforms have restrictions on the allowed grid sizes, the values that are actually used may be slightly different
* from those specified with setPMEParameters(), or the standard values calculated based on the Ewald error tolerance.
* See the manual for details.
*
* @param context the Context for which to get the parameters
* @param[out] alpha the separation parameter
* @param[out] nx the number of grid points along the X axis
* @param[out] ny the number of grid points along the Y axis
* @param[out] nz the number of grid points along the Z axis
*/
void getLJPMEParametersInContext(const Context& context, double& alpha, int& nx, int& ny, int& nz) const;
/**
* Add the nonbonded force parameters for a particle. This should be called once for each particle
* in the System. When it is called for the i'th time, it specifies the parameters for the i'th particle.
......@@ -382,9 +420,9 @@ private:
class ParticleInfo;
class ExceptionInfo;
NonbondedMethod nonbondedMethod;
double cutoffDistance, switchingDistance, rfDielectric, ewaldErrorTol, alpha;
double cutoffDistance, switchingDistance, rfDielectric, ewaldErrorTol, alpha, dalpha;
bool useSwitchingFunction, useDispersionCorrection;
int recipForceGroup, nx, ny, nz;
int recipForceGroup, nx, ny, nz, dnx, dny, dnz;
void addExclusionsToSet(const std::vector<std::set<int> >& bonded12, std::set<int>& exclusions, int baseParticle, int fromParticle, int currentLevel) const;
std::vector<ParticleInfo> particles;
std::vector<ExceptionInfo> exceptions;
......
......@@ -65,6 +65,7 @@ public:
std::vector<std::string> getKernelNames();
void updateParametersInContext(ContextImpl& context);
void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
void getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
/**
* This is a utility routine that calculates the values to use for alpha and kmax when using
* Ewald summation.
......@@ -74,7 +75,7 @@ public:
* This is a utility routine that calculates the values to use for alpha and grid size when using
* Particle Mesh Ewald.
*/
static void calcPMEParameters(const System& system, const NonbondedForce& force, double& alpha, int& xsize, int& ysize, int& zsize);
static void calcPMEParameters(const System& system, const NonbondedForce& force, double& alpha, int& xsize, int& ysize, int& zsize, bool LJ);
/**
* Compute the coefficient which, when divided by the periodic box volume, gives the
* long range dispersion correction to the energy.
......
......@@ -106,6 +106,13 @@ void NonbondedForce::getPMEParameters(double& alpha, int& nx, int& ny, int& nz)
nz = this->nz;
}
void NonbondedForce::getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
alpha = this->dalpha;
nx = this->dnx;
ny = this->dny;
nz = this->dnz;
}
void NonbondedForce::setPMEParameters(double alpha, int nx, int ny, int nz) {
this->alpha = alpha;
this->nx = nx;
......@@ -113,10 +120,21 @@ void NonbondedForce::setPMEParameters(double alpha, int nx, int ny, int nz) {
this->nz = nz;
}
void NonbondedForce::setLJPMEParameters(double alpha, int nx, int ny, int nz) {
this->dalpha = alpha;
this->dnx = nx;
this->dny = ny;
this->dnz = nz;
}
void NonbondedForce::getPMEParametersInContext(const Context& context, double& alpha, int& nx, int& ny, int& nz) const {
dynamic_cast<const NonbondedForceImpl&>(getImplInContext(context)).getPMEParameters(alpha, nx, ny, nz);
}
void NonbondedForce::getLJPMEParametersInContext(const Context& context, double& alpha, int& nx, int& ny, int& nz) const {
dynamic_cast<const NonbondedForceImpl&>(getImplInContext(context)).getLJPMEParameters(alpha, nx, ny, nz);
}
int NonbondedForce::addParticle(double charge, double sigma, double epsilon) {
particles.push_back(ParticleInfo(charge, sigma, epsilon));
return particles.size()-1;
......
......@@ -151,8 +151,12 @@ void NonbondedForceImpl::calcEwaldParameters(const System& system, const Nonbond
kmaxz++;
}
void NonbondedForceImpl::calcPMEParameters(const System& system, const NonbondedForce& force, double& alpha, int& xsize, int& ysize, int& zsize) {
force.getPMEParameters(alpha, xsize, ysize, zsize);
void NonbondedForceImpl::calcPMEParameters(const System& system, const NonbondedForce& force, double& alpha, int& xsize, int& ysize, int& zsize, bool LJ) {
if(LJ) {
force.getLJPMEParameters(alpha, xsize, ysize, zsize);
} else {
force.getPMEParameters(alpha, xsize, ysize, zsize);
}
if (alpha == 0.0) {
Vec3 boxVectors[3];
system.getDefaultPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
......@@ -283,3 +287,7 @@ void NonbondedForceImpl::updateParametersInContext(ContextImpl& context) {
void NonbondedForceImpl::getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
kernel.getAs<CalcNonbondedForceKernel>().getPMEParameters(alpha, nx, ny, nz);
}
void NonbondedForceImpl::getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
kernel.getAs<CalcNonbondedForceKernel>().getLJPMEParameters(alpha, nx, ny, nz);
}
......@@ -251,27 +251,37 @@ public:
void copyParametersToContext(ContextImpl& context, const NonbondedForce& force);
/**
* Get the parameters being used for PME.
*
*
* @param alpha the separation parameter
* @param nx the number of grid points along the X axis
* @param ny the number of grid points along the Y axis
* @param nz the number of grid points along the Z axis
*/
void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
/**
* Get the parameters being used for the dispersion term in LJPME.
*
* @param alpha the separation parameter
* @param nx the number of grid points along the X axis
* @param ny the number of grid points along the Y axis
* @param nz the number of grid points along the Z axis
*/
void getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
private:
class PmeIO;
CpuPlatform::PlatformData& data;
int numParticles, num14;
int **bonded14IndexArray;
double **bonded14ParamArray;
double nonbondedCutoff, switchingDistance, rfDielectric, ewaldAlpha, ewaldSelfEnergy, dispersionCoefficient;
int kmax[3], gridSize[3];
bool useSwitchingFunction, useOptimizedPme, hasInitializedPme;
double nonbondedCutoff, switchingDistance, rfDielectric, ewaldAlpha, ewaldDispersionAlpha, ewaldSelfEnergy, dispersionCoefficient;
int kmax[3], gridSize[3], dispersionGridSize[3];
bool useSwitchingFunction, useOptimizedPme, hasInitializedPme, hasInitializedDispersionPme;
std::vector<std::set<int> > exclusions;
std::vector<std::pair<float, float> > particleParams;
std::vector<float> C6params;
NonbondedMethod nonbondedMethod;
CpuNonbondedForce* nonbonded;
Kernel optimizedPme;
Kernel optimizedPme, optimizedDispersionPme;
CpuBondForce bondForce;
};
......
......@@ -104,16 +104,27 @@ class CpuNonbondedForce {
/**---------------------------------------------------------------------------------------
Set the force to use Particle-Mesh Ewald (PME) summation.
@param alpha the Ewald separation parameter
@param gridSize the dimensions of the mesh
--------------------------------------------------------------------------------------- */
void setUsePME(float alpha, int meshSize[3]);
/**---------------------------------------------------------------------------------------
Set the force to use Particle-Mesh Ewald (PME) summation for dispersion.
@param alpha the Ewald separation parameter
@param gridSize the dimensions of the mesh
--------------------------------------------------------------------------------------- */
void setUseLJPME(float alpha, int meshSize[3]);
/**---------------------------------------------------------------------------------------
Calculate Ewald ixn
......@@ -122,16 +133,17 @@ class CpuNonbondedForce {
@param posq atom coordinates and charges
@param atomCoordinates atom coordinates (in format needed by PME)
@param atomParameters atom parameters (sigma/2, 2*sqrt(epsilon))
@param C6Paramrs C6 parameters for multiplicative representation of dispersion
@param exclusions atom exclusion indices
exclusions[atomIndex] contains the list of exclusions for that atom
@param forces force array (forces added)
@param totalEnergy total energy
--------------------------------------------------------------------------------------- */
void calculateReciprocalIxn(int numberOfAtoms, float* posq, const std::vector<RealVec>& atomCoordinates,
const std::vector<std::pair<float, float> >& atomParameters, const std::vector<std::set<int> >& exclusions,
std::vector<RealVec>& forces, double* totalEnergy) const;
const std::vector<std::pair<float, float> >& atomParameters, const std::vector<float> &C6params,
const std::vector<std::set<int> >& exclusions, std::vector<RealVec>& forces, double* totalEnergy) const;
/**---------------------------------------------------------------------------------------
......@@ -150,7 +162,7 @@ class CpuNonbondedForce {
--------------------------------------------------------------------------------------- */
void calculateDirectIxn(int numberOfAtoms, float* posq, const std::vector<RealVec>& atomCoordinates, const std::vector<std::pair<float, float> >& atomParameters,
const std::vector<std::set<int> >& exclusions, std::vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads);
const std::vector<float>& C6params, const std::vector<std::set<int> >& exclusions, std::vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads);
/**
* This routine contains the code executed by each thread.
......@@ -163,28 +175,32 @@ protected:
bool periodic;
bool triclinic;
bool ewald;
bool pme;
bool tableIsValid;
bool ljpme, pme;
bool tableIsValid, expTableIsValid;
const CpuNeighborList* neighborList;
float recipBoxSize[3];
RealVec periodicBoxVectors[3];
AlignedArray<fvec4> periodicBoxVec4;
float cutoffDistance, switchingDistance;
float krf, crf;
float alphaEwald;
float alphaEwald, alphaDispersionEwald;
int numRx, numRy, numRz;
int meshDim[3];
int meshDim[3], dispersionMeshDim[3];
std::vector<float> erfcTable, ewaldScaleTable;
float ewaldDX, ewaldDXInv, erfcDXInv;
std::vector<float> exptermsTable, dExptermsTable;
float ewaldDX, ewaldDXInv, erfcDXInv, exptermsDX, exptermsDXInv;
std::vector<double> threadEnergy;
// The following variables are used to make information accessible to the individual threads.
int numberOfAtoms;
float* posq;
RealVec const* atomCoordinates;
std::pair<float, float> const* atomParameters;
std::pair<float, float> const* atomParameters;
float const *C6params;
std::set<int> const* exclusions;
std::vector<AlignedArray<float> >* threadForce;
bool includeEnergy;
float inverseRcut6;
float inverseRcut6Expterm;
void* atomicCounter;
static const float TWO_OVER_SQRT_PI;
......@@ -238,10 +254,29 @@ protected:
*/
void tabulateEwaldScaleFactor();
/**
* Create a lookup table for the scale factor used with dispersion PME.
*/
void tabulateExpTerms();
/**
* Compute a fast approximation to erfc(x).
*/
float erfcApprox(float x);
/**
* Compute a fast approximation to (1.0 - EXP(-dar^2) * (1.0 + dar^2 + 0.5*dar^4))
* where dar = (dispersionAlpha * R)
* needed for LJPME energies.
*/
float exptermsApprox(float R);
/**
* Compute a fast approximation to (1.0 - EXP(-dar^2) * (1.0 + dar^2 + 0.5*dar^4 + dar^6/6.0))
* where dar = (dispersionAlpha * R)
* needed for LJPME forces.
*/
float dExptermsApprox(float R);
};
} // namespace OpenMM
......
......@@ -88,11 +88,25 @@ protected:
* Compute a fast approximation to erfc(x).
*/
fvec4 erfcApprox(const fvec4& x);
/**
* Evaluate the scale factor used with Ewald and PME: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
*/
fvec4 ewaldScaleFunction(const fvec4& x);
/**
* Compute a fast approximation to (1.0 - EXP(-dar^2) * (1.0 + dar^2 + 0.5*dar^4))
* where dar = (dispersionAlpha * R)
* needed for LJPME energies.
*/
fvec4 exptermsApprox(const fvec4& R);
/**
* Compute a fast approximation to (1.0 - EXP(-dar^2) * (1.0 + dar^2 + 0.5*dar^4 + dar^6/6.0))
* where dar = (dispersionAlpha * R)
* needed for LJPME forces.
*/
fvec4 dExptermsApprox(const fvec4& R);
};
} // namespace OpenMM
......
......@@ -92,6 +92,21 @@ protected:
* Evaluate the scale factor used with Ewald and PME: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
*/
fvec8 ewaldScaleFunction(const fvec8& x);
/**
* Compute a fast approximation to (1.0 - EXP(-dar^2) * (1.0 + dar^2 + 0.5*dar^4))
* where dar = (dispersionAlpha * R)
* needed for LJPME energies.
*/
fvec8 exptermsApprox(const fvec8& R);
/**
* Compute a fast approximation to (1.0 - EXP(-dar^2) * (1.0 + dar^2 + 0.5*dar^4 + dar^6/6.0))
* where dar = (dispersionAlpha * R)
* needed for LJPME forces.
*/
fvec8 dExptermsApprox(const fvec8& R);
};
} // namespace OpenMM
......
......@@ -50,6 +50,7 @@
#include "lepton/CustomFunction.h"
#include "lepton/Operation.h"
#include "lepton/Parser.h"
#include <iostream>
#include "lepton/ParsedExpression.h"
using namespace OpenMM;
......@@ -528,7 +529,7 @@ CpuNonbondedForce* createCpuNonbondedForceVec4();
CpuNonbondedForce* createCpuNonbondedForceVec8();
CpuCalcNonbondedForceKernel::CpuCalcNonbondedForceKernel(string name, const Platform& platform, CpuPlatform::PlatformData& data) : CalcNonbondedForceKernel(name, platform),
data(data), bonded14IndexArray(NULL), bonded14ParamArray(NULL), hasInitializedPme(false), nonbonded(NULL) {
data(data), bonded14IndexArray(NULL), bonded14ParamArray(NULL), hasInitializedPme(false), hasInitializedDispersionPme(false), nonbonded(NULL) {
if (isVec8Supported())
nonbonded = createCpuNonbondedForceVec8();
else
......@@ -575,12 +576,14 @@ void CpuCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
for (int i = 0; i < num14; i++)
bonded14ParamArray[i] = new double[3];
particleParams.resize(numParticles);
C6params.resize(numParticles);
double sumSquaredCharges = 0.0;
for (int i = 0; i < numParticles; ++i) {
double charge, radius, depth;
force.getParticleParameters(i, charge, radius, depth);
data.posq[4*i+3] = (float) charge;
particleParams[i] = make_pair((float) (0.5*radius), (float) (2.0*sqrt(depth)));
C6params[i] = 8.0*pow(particleParams[i].first, 3.0) * particleParams[i].second;
sumSquaredCharges += charge*charge;
}
......@@ -616,19 +619,35 @@ void CpuCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
}
else if (nonbondedMethod == PME) {
double alpha;
NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSize[0], gridSize[1], gridSize[2]);
NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSize[0], gridSize[1], gridSize[2], false);
ewaldAlpha = alpha;
}
if (nonbondedMethod == Ewald || nonbondedMethod == PME)
else if (nonbondedMethod == LJPME) {
double alpha;
NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSize[0], gridSize[1], gridSize[2], false);
ewaldAlpha = (RealOpenMM) alpha;
NonbondedForceImpl::calcPMEParameters(system, force, alpha, dispersionGridSize[0], dispersionGridSize[1], dispersionGridSize[2], true);
ewaldDispersionAlpha = (RealOpenMM) alpha;
useSwitchingFunction = false;
}
if (nonbondedMethod == Ewald || nonbondedMethod == PME || nonbondedMethod == LJPME) {
ewaldSelfEnergy = -ONE_4PI_EPS0*ewaldAlpha*sumSquaredCharges/sqrt(M_PI);
else
if(nonbondedMethod == LJPME){
for (int atom = 0; atom < numParticles; atom++) {
// Dispersion self term
ewaldSelfEnergy += pow(ewaldDispersionAlpha, 6.0) * C6params[atom]*C6params[atom] / 12.0;
}
}
} else {
ewaldSelfEnergy = 0.0;
}
rfDielectric = force.getReactionFieldDielectric();
if (force.getUseDispersionCorrection())
dispersionCoefficient = NonbondedForceImpl::calcDispersionCorrection(system, force);
else
dispersionCoefficient = 0.0;
data.isPeriodic = (nonbondedMethod == CutoffPeriodic || nonbondedMethod == Ewald || nonbondedMethod == PME);
data.isPeriodic = (nonbondedMethod == CutoffPeriodic || nonbondedMethod == Ewald || nonbondedMethod == PME || nonbondedMethod == LJPME);
}
double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal) {
......@@ -646,6 +665,20 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
optimizedPme.getAs<CalcPmeReciprocalForceKernel>().initialize(gridSize[0], gridSize[1], gridSize[2], numParticles, ewaldAlpha);
}
}
if (nonbondedMethod == LJPME) {
// If available, use the optimized PME implementation.
vector<string> kernelNames;
kernelNames.push_back("CalcPmeReciprocalForce");
useOptimizedPme = getPlatform().supportsKernels(kernelNames);
if (useOptimizedPme) {
optimizedPme = getPlatform().createKernel(CalcPmeReciprocalForceKernel::Name(), context);
optimizedPme.getAs<CalcPmeReciprocalForceKernel>().initialize(gridSize[0], gridSize[1], gridSize[2], numParticles, ewaldAlpha);
optimizedDispersionPme = getPlatform().createKernel(CalcDispersionPmeReciprocalForceKernel::Name(), context);
optimizedDispersionPme.getAs<CalcDispersionPmeReciprocalForceKernel>().initialize(dispersionGridSize[0], dispersionGridSize[1],
dispersionGridSize[2], numParticles, ewaldDispersionAlpha);
}
}
}
AlignedArray<float>& posq = data.posq;
vector<RealVec>& posData = extractPositions(context);
......@@ -654,6 +687,7 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0);
bool ewald = (nonbondedMethod == Ewald);
bool pme = (nonbondedMethod == PME);
bool ljpme = (nonbondedMethod == LJPME);
if (nonbondedMethod != NoCutoff)
nonbonded->setUseCutoff(nonbondedCutoff, *data.neighborList, rfDielectric);
if (data.isPeriodic) {
......@@ -669,9 +703,13 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
nonbonded->setUsePME(ewaldAlpha, gridSize);
if (useSwitchingFunction)
nonbonded->setUseSwitchingFunction(switchingDistance);
if (ljpme){
nonbonded->setUsePME(ewaldAlpha, gridSize);
nonbonded->setUseLJPME(ewaldDispersionAlpha, dispersionGridSize);
}
double nonbondedEnergy = 0;
if (includeDirect)
nonbonded->calculateDirectIxn(numParticles, &posq[0], posData, particleParams, exclusions, data.threadForce, includeEnergy ? &nonbondedEnergy : NULL, data.threads);
nonbonded->calculateDirectIxn(numParticles, &posq[0], posData, particleParams, C6params, exclusions, data.threadForce, includeEnergy ? &nonbondedEnergy : NULL, data.threads);
if (includeReciprocal) {
if (useOptimizedPme) {
PmeIO io(&posq[0], &data.threadForce[0][0], numParticles);
......@@ -680,13 +718,13 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
nonbondedEnergy += optimizedPme.getAs<CalcPmeReciprocalForceKernel>().finishComputation(io);
}
else
nonbonded->calculateReciprocalIxn(numParticles, &posq[0], posData, particleParams, exclusions, forceData, includeEnergy ? &nonbondedEnergy : NULL);
nonbonded->calculateReciprocalIxn(numParticles, &posq[0], posData, particleParams, C6params, exclusions, forceData, includeEnergy ? &nonbondedEnergy : NULL);
}
energy += nonbondedEnergy;
if (includeDirect) {
ReferenceLJCoulomb14 nonbonded14;
bondForce.calculateForce(posData, bonded14ParamArray, forceData, includeEnergy ? &energy : NULL, nonbonded14);
if (data.isPeriodic)
if (data.isPeriodic && nonbondedMethod != LJPME)
energy += dispersionCoefficient/(boxVectors[0][0]*boxVectors[1][1]*boxVectors[2][2]);
}
return energy;
......@@ -739,7 +777,7 @@ void CpuCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context,
}
void CpuCalcNonbondedForceKernel::getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
if (nonbondedMethod != PME)
if (nonbondedMethod != PME && nonbondedMethod != LJPME)
throw OpenMMException("getPMEParametersInContext: This Context is not using PME");
if (useOptimizedPme)
optimizedPme.getAs<const CalcPmeReciprocalForceKernel>().getPMEParameters(alpha, nx, ny, nz);
......@@ -751,6 +789,19 @@ void CpuCalcNonbondedForceKernel::getPMEParameters(double& alpha, int& nx, int&
}
}
void CpuCalcNonbondedForceKernel::getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
if (nonbondedMethod != LJPME)
throw OpenMMException("getPMEParametersInContext: This Context is not using PME");
if (useOptimizedPme)
optimizedDispersionPme.getAs<const CalcPmeReciprocalForceKernel>().getPMEParameters(alpha, nx, ny, nz);
else {
alpha = ewaldDispersionAlpha;
nx = dispersionGridSize[0];
ny = dispersionGridSize[1];
nz = dispersionGridSize[2];
}
}
CpuCalcCustomNonbondedForceKernel::CpuCalcCustomNonbondedForceKernel(string name, const Platform& platform, CpuPlatform::PlatformData& data) :
CalcCustomNonbondedForceKernel(name, platform), data(data), forceCopy(NULL), nonbonded(NULL) {
}
......
......@@ -30,6 +30,7 @@
#include "ReferencePME.h"
#include "openmm/internal/gmx_atomic.h"
#include <algorithm>
#include <iostream>
// In case we're using some primitive version of Visual Studio this will
// make sure that erf() and erfc() are defined.
......@@ -57,7 +58,8 @@ public:
--------------------------------------------------------------------------------------- */
CpuNonbondedForce::CpuNonbondedForce() : cutoff(false), useSwitch(false), periodic(false), ewald(false), pme(false), tableIsValid(false), cutoffDistance(0.0f), alphaEwald(0.0f) {
CpuNonbondedForce::CpuNonbondedForce() : cutoff(false), useSwitch(false), periodic(false), ewald(false), pme(false), ljpme(false), tableIsValid(false), expTableIsValid(false),
cutoffDistance(0.0f), alphaDispersionEwald(0.0f), alphaEwald(0.0f) {
}
CpuNonbondedForce::~CpuNonbondedForce() {
......@@ -78,10 +80,21 @@ void CpuNonbondedForce::setUseCutoff(float distance, const CpuNeighborList& neig
tableIsValid = false;
cutoff = true;
cutoffDistance = distance;
inverseRcut6 = pow(cutoffDistance, -6);
neighborList = &neighbors;
krf = pow(cutoffDistance, -3.0f)*(solventDielectric-1.0)/(2.0*solventDielectric+1.0);
crf = (1.0/cutoffDistance)*(3.0*solventDielectric)/(2.0*solventDielectric+1.0);
}
if(alphaDispersionEwald != 0.0f){
// We set this here, in case setUseCutoff is called after the dispersion alpha is set.
double dalphaR = alphaDispersionEwald*cutoffDistance;
double dar2 = dalphaR * dalphaR;
double dar4 = dar2*dar2;
double dar6 = dar4*dar2;
double expterm = EXP(-dar2);
inverseRcut6Expterm = inverseRcut6*(1.0 - expterm * (1.0 + dar2 + 0.5*dar4));
}
}
/**---------------------------------------------------------------------------------------
......@@ -96,7 +109,7 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
switchingDistance = distance;
}
/**---------------------------------------------------------------------------------------
/**---------------------------------------------------------------------------------------
Set the force to use periodic boundary conditions. This requires that a cutoff has
also been set, and the smallest side of the periodic box is at least twice the cutoff
......@@ -106,7 +119,7 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
--------------------------------------------------------------------------------------- */
void CpuNonbondedForce::setPeriodic(RealVec* periodicBoxVectors) {
void CpuNonbondedForce::setPeriodic(RealVec* periodicBoxVectors) {
assert(cutoff);
assert(periodicBoxVectors[0][0] >= 2.0*cutoffDistance);
......@@ -124,11 +137,11 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
periodicBoxVec4[1] = fvec4(periodicBoxVectors[1][0], periodicBoxVectors[1][1], periodicBoxVectors[1][2], 0);
periodicBoxVec4[2] = fvec4(periodicBoxVectors[2][0], periodicBoxVectors[2][1], periodicBoxVectors[2][2], 0);
triclinic = (periodicBoxVectors[0][1] != 0.0 || periodicBoxVectors[0][2] != 0.0 ||
periodicBoxVectors[1][0] != 0.0 || periodicBoxVectors[1][2] != 0.0 ||
periodicBoxVectors[2][0] != 0.0 || periodicBoxVectors[2][1] != 0.0);
}
periodicBoxVectors[1][0] != 0.0 || periodicBoxVectors[1][2] != 0.0 ||
periodicBoxVectors[2][0] != 0.0 || periodicBoxVectors[2][1] != 0.0);
}
/**---------------------------------------------------------------------------------------
/**---------------------------------------------------------------------------------------
Set the force to use Ewald summation.
......@@ -139,18 +152,18 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
--------------------------------------------------------------------------------------- */
void CpuNonbondedForce::setUseEwald(float alpha, int kmaxx, int kmaxy, int kmaxz) {
if (alpha != alphaEwald)
tableIsValid = false;
alphaEwald = alpha;
numRx = kmaxx;
numRy = kmaxy;
numRz = kmaxz;
ewald = true;
tabulateEwaldScaleFactor();
}
void CpuNonbondedForce::setUseEwald(float alpha, int kmaxx, int kmaxy, int kmaxz) {
if (alpha != alphaEwald)
tableIsValid = false;
alphaEwald = alpha;
numRx = kmaxx;
numRy = kmaxy;
numRz = kmaxz;
ewald = true;
tabulateEwaldScaleFactor();
}
/**---------------------------------------------------------------------------------------
/**---------------------------------------------------------------------------------------
Set the force to use Particle-Mesh Ewald (PME) summation.
......@@ -159,19 +172,49 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
--------------------------------------------------------------------------------------- */
void CpuNonbondedForce::setUsePME(float alpha, int meshSize[3]) {
if (alpha != alphaEwald)
tableIsValid = false;
alphaEwald = alpha;
meshDim[0] = meshSize[0];
meshDim[1] = meshSize[1];
meshDim[2] = meshSize[2];
pme = true;
tabulateEwaldScaleFactor();
}
void CpuNonbondedForce::tabulateEwaldScaleFactor() {
void CpuNonbondedForce::setUsePME(float alpha, int meshSize[3]) {
if (alpha != alphaEwald)
tableIsValid = false;
alphaEwald = alpha;
meshDim[0] = meshSize[0];
meshDim[1] = meshSize[1];
meshDim[2] = meshSize[2];
pme = true;
tabulateEwaldScaleFactor();
}
/**---------------------------------------------------------------------------------------
Set the force to use Particle-Mesh Ewald (PME) summation for dispersion.
@param alpha the Ewald separation parameter
@param gridSize the dimensions of the mesh
--------------------------------------------------------------------------------------- */
void CpuNonbondedForce::setUseLJPME(float alpha, int meshSize[3]) {
if (alpha != alphaDispersionEwald)
expTableIsValid = false;
alphaDispersionEwald = alpha;
dispersionMeshDim[0] = meshSize[0];
dispersionMeshDim[1] = meshSize[1];
dispersionMeshDim[2] = meshSize[2];
ljpme = true;
tabulateExpTerms();
if(cutoffDistance != 0.0f){
// We set this here, in case setUseLJPME is called after the cutoff is set
double dalphaR = alphaDispersionEwald*cutoffDistance;
double dar2 = dalphaR * dalphaR;
double dar4 = dar2*dar2;
double dar6 = dar4*dar2;
double expterm = EXP(-dar2);
inverseRcut6Expterm = inverseRcut6*(1.0 - expterm * (1.0 + dar2 + 0.5*dar4));
}
}
void CpuNonbondedForce::tabulateEwaldScaleFactor() {
if (tableIsValid)
return;
tableIsValid = true;
......@@ -187,10 +230,30 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
ewaldScaleTable[i] = erfcTable[i] + TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR);
}
}
void CpuNonbondedForce::tabulateExpTerms() {
if (expTableIsValid)
return;
expTableIsValid = true;
exptermsDX = cutoffDistance/NUM_TABLE_POINTS;
exptermsDXInv = 1.0f/exptermsDX;
exptermsTable.resize(NUM_TABLE_POINTS+4);
dExptermsTable.resize(NUM_TABLE_POINTS+4);
for (int i = 0; i < NUM_TABLE_POINTS+4; i++) {
double r = i*ewaldDX;
double dalphaR = alphaDispersionEwald*r;
double dar2 = dalphaR * dalphaR;
double dar4 = dar2*dar2;
double dar6 = dar4*dar2;
double expterm = EXP(-dar2);
exptermsTable[i] = (1.0 - expterm * (1.0 + dar2 + 0.5*dar4));
dExptermsTable[i] = (1.0 - expterm * (1.0 + dar2 + 0.5*dar4 + dar6/6.0));
}
}
void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, const vector<RealVec>& atomCoordinates,
const vector<pair<float, float> >& atomParameters, const vector<set<int> >& exclusions,
vector<RealVec>& forces, double* totalEnergy) const {
const vector<pair<float, float> >& atomParameters, const vector<float> &C6params, const vector<set<int> >& exclusions,
vector<RealVec>& forces, double* totalEnergy) const {
typedef std::complex<float> d_complex;
static const float epsilon = 1.0;
......@@ -211,6 +274,29 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, c
if (totalEnergy)
*totalEnergy += recipEnergy;
pme_destroy(pmedata);
if (ljpme) {
// Dispersion reciprocal space terms
pme_init(&pmedata,alphaDispersionEwald,numberOfAtoms,dispersionMeshDim,5,1);
std::vector<RealVec> dpmeforces;
for (int i = 0; i < numberOfAtoms; i++){
charges[i] = (RealOpenMM)C6params[i];
dpmeforces.push_back(RealVec());
}
RealOpenMM recipDispersionEnergy = 0.0;
pme_exec_dpme(pmedata,atomCoordinates,dpmeforces,charges,periodicBoxVectors,&recipDispersionEnergy);
for (int i = 0; i < numberOfAtoms; i++){
forces[i][0] -= 2.0*dpmeforces[i][0];
forces[i][1] -= 2.0*dpmeforces[i][1];
forces[i][2] -= 2.0*dpmeforces[i][2];
}
if (totalEnergy)
*totalEnergy += recipDispersionEnergy;
pme_destroy(pmedata);
}
}
// Ewald method
......@@ -224,7 +310,7 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, c
// setup K-vectors
#define EIR(x, y, z) eir[(x)*numberOfAtoms*3+(y)*3+z]
#define EIR(x, y, z) eir[(x)*numberOfAtoms*3+(y)*3+z]
vector<d_complex> eir(kmax*numberOfAtoms*3);
vector<d_complex> tab_xy(numberOfAtoms);
vector<d_complex> tab_qxyz(numberOfAtoms);
......@@ -232,15 +318,15 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, c
for (int i = 0; (i < numberOfAtoms); i++) {
float* pos = posq+4*i;
for (int m = 0; (m < 3); m++)
EIR(0, i, m) = d_complex(1,0);
EIR(0, i, m) = d_complex(1,0);
for (int m=0; (m<3); m++)
EIR(1, i, m) = d_complex(cos(pos[m]*recipBoxSize[m]),
sin(pos[m]*recipBoxSize[m]));
EIR(1, i, m) = d_complex(cos(pos[m]*recipBoxSize[m]),
sin(pos[m]*recipBoxSize[m]));
for (int j=2; (j<kmax); j++)
for (int m=0; (m<3); m++)
EIR(j, i, m) = EIR(j-1, i, m) * EIR(1, i, m);
for (int m=0; (m<3); m++)
EIR(j, i, m) = EIR(j-1, i, m) * EIR(1, i, m);
}
// calculate reciprocal space energy and forces
......@@ -254,11 +340,11 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, c
float ky = ry * recipBoxSize[1];
if (ry >= 0) {
for (int n = 0; n < numberOfAtoms; n++)
tab_xy[n] = EIR(rx, n, 0) * EIR(ry, n, 1);
tab_xy[n] = EIR(rx, n, 0) * EIR(ry, n, 1);
}
else {
for (int n = 0; n < numberOfAtoms; n++)
tab_xy[n]= EIR(rx, n, 0) * conj (EIR(-ry, n, 1));
tab_xy[n]= EIR(rx, n, 0) * conj (EIR(-ry, n, 1));
}
for (int rz = lowrz; rz < numRz; rz++) {
if (rz >= 0) {
......@@ -301,13 +387,14 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, c
void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const vector<RealVec>& atomCoordinates, const vector<pair<float, float> >& atomParameters,
const vector<set<int> >& exclusions, vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
const vector<float>& C6params, const vector<set<int> >& exclusions, vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
// Record the parameters for the threads.
this->numberOfAtoms = numberOfAtoms;
this->posq = posq;
this->atomCoordinates = &atomCoordinates[0];
this->atomParameters = &atomParameters[0];
this->C6params = &C6params[0];
this->exclusions = &exclusions[0];
this->threadForce = &threadForce;
includeEnergy = (totalEnergy != NULL);
......@@ -319,7 +406,7 @@ void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const
// Signal the threads to start running and wait for them to finish.
ComputeDirectTask task(*this);
threads.execute(task);
threads.execute(task); // ACS calls threadcomputedirect
threads.waitForThreads();
// Signal the threads to subtract the exclusions.
......@@ -350,9 +437,8 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
float* forces = &(*threadForce)[threadIndex][0];
fvec4 boxSize(periodicBoxVectors[0][0], periodicBoxVectors[1][1], periodicBoxVectors[2][2], 0);
fvec4 invBoxSize(recipBoxSize[0], recipBoxSize[1], recipBoxSize[2], 0);
if (ewald || pme) {
if (ewald || pme || ljpme) {
// Compute the interactions from the neighbor list.
while (true) {
int nextBlock = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
if (nextBlock >= neighborList->getNumBlocks())
......@@ -370,7 +456,7 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
break;
int end = min(start+groupSize, numberOfAtoms);
for (int i = start; i < end; i++) {
fvec4 posI((float) atomCoordinates[i][0], (float) atomCoordinates[i][1], (float) atomCoordinates[i][2], 0.0f);
fvec4 posI((float) atomCoordinates[i][0], (float) atomCoordinates[i][1], (float) atomCoordinates[i][2], 0.0f);
float scaledChargeI = (float) (ONE_4PI_EPS0*posq[4*i+3]);
for (set<int>::const_iterator iter = exclusions[i].begin(); iter != exclusions[i].end(); ++iter) {
if (*iter > i) {
......@@ -394,7 +480,18 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
threadEnergy[threadIndex] -= chargeProdOverR*erfAlphaR;
}
else if (includeEnergy)
threadEnergy[threadIndex] -= alphaEwald*TWO_OVER_SQRT_PI*scaledChargeI*posq[4*j+3];
threadEnergy[threadIndex] -= alphaEwald*TWO_OVER_SQRT_PI*scaledChargeI*posq[4*j+3];
if (ljpme) {
float C6ij = C6params[i]*C6params[j];
float inverseR2 = 1.0f/r2;
float emult = C6ij*inverseR2*inverseR2*inverseR2*exptermsApprox(r);
if(includeEnergy)
threadEnergy[threadIndex] += emult;
float dEdR = -6.0f*C6ij*inverseR2*inverseR2*inverseR2*inverseR2*dExptermsApprox(r);
fvec4 result = deltaR*dEdR;
(fvec4(forces+4*i)-result).store(forces+4*i);
(fvec4(forces+4*j)+result).store(forces+4*j);
}
}
}
}
......@@ -444,7 +541,7 @@ void CpuNonbondedForce::calculateOneIxn(int ii, int jj, float* forces, double* t
}
float sig = atomParameters[ii].first + atomParameters[jj].first;
float sig2 = inverseR*sig;
sig2 *= sig2;
sig2 *= sig2;
float sig6 = sig2*sig2*sig2;
float eps = atomParameters[ii].second*atomParameters[jj].second;
......@@ -476,7 +573,7 @@ void CpuNonbondedForce::calculateOneIxn(int ii, int jj, float* forces, double* t
fvec4 result = deltaR*dEdR;
(fvec4(forces+4*ii)+result).store(forces+4*ii);
(fvec4(forces+4*jj)-result).store(forces+4*jj);
}
}
void CpuNonbondedForce::getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& deltaR, float& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
deltaR = posJ-posI;
......@@ -502,3 +599,18 @@ float CpuNonbondedForce::erfcApprox(float x) {
return coeff1*erfcTable[index] + coeff2*erfcTable[index+1];
}
float CpuNonbondedForce::exptermsApprox(float x) {
float x1 = x*exptermsDXInv;
int index = min((int) floor(x1), NUM_TABLE_POINTS);
float coeff2 = x1-index;
float coeff1 = 1.0f-coeff2;
return coeff1*exptermsTable[index] + coeff2*exptermsTable[index+1];
}
float CpuNonbondedForce::dExptermsApprox(float x) {
float x1 = x*exptermsDXInv;
int index = min((int) floor(x1), NUM_TABLE_POINTS);
float coeff2 = x1-index;
float coeff1 = 1.0f-coeff2;
return coeff1*dExptermsTable[index] + coeff2*dExptermsTable[index+1];
}
......@@ -25,6 +25,7 @@
#include "SimTKOpenMMUtilities.h"
#include "CpuNonbondedForceVec4.h"
#include <algorithm>
#include <iostream>
using namespace std;
using namespace OpenMM;
......@@ -213,7 +214,6 @@ void CpuNonbondedForceVec4::calculateBlockIxnImpl(int blockIndex, float* forces,
void CpuNonbondedForceVec4::calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
// Determine whether we need to apply periodic boundary conditions.
PeriodicType periodicType;
fvec4 blockCenter;
if (!periodic) {
......@@ -263,7 +263,6 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxn(int blockIndex, float* forces
template <int PERIODIC_TYPE>
void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter) {
// Load the positions and parameters of the atoms in the block.
const int* blockAtom = &neighborList->getSortedAtoms()[4*blockIndex];
fvec4 blockAtomPosq[4];
fvec4 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
......@@ -278,9 +277,10 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
fvec4 C6s(C6params[blockAtom[0]], C6params[blockAtom[1]], C6params[blockAtom[2]], C6params[blockAtom[3]]);
const bool needPeriodic = (PERIODIC_TYPE == PeriodicPerInteraction || PERIODIC_TYPE == PeriodicTriclinic);
const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
// Loop over neighbors for this block.
const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
......@@ -318,7 +318,8 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
fvec4 sig2 = inverseR*sig;
sig2 *= sig2;
fvec4 sig6 = sig2*sig2*sig2;
fvec4 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
fvec4 eps = blockAtomEpsilon*atomEpsilon;
fvec4 epsSig6 = eps*sig6;
dEdR = epsSig6*(12.0f*sig6 - 6.0f);
energy = epsSig6*(sig6-1.0f);
if (useSwitch) {
......@@ -328,6 +329,17 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
dEdR = switchValue*dEdR - energy*switchDeriv*r;
energy *= switchValue;
}
if (ljpme) {
fvec4 C6ij = C6s*C6params[atom];
fvec4 inverseR2 = inverseR*inverseR;
fvec4 mysig2 = sig*sig;
fvec4 mysig6 = mysig2*mysig2*mysig2;
fvec4 emult = C6ij*inverseR2*inverseR2*inverseR2*exptermsApprox(r);
fvec4 potentialShift = eps*(1.0f-mysig6*inverseRcut6)*mysig6*inverseRcut6 - C6ij*inverseRcut6Expterm;
dEdR += 6.0f*C6ij*inverseR2*inverseR2*inverseR2*dExptermsApprox(r);
energy += emult + potentialShift;
}
}
else {
energy = 0.0f;
......@@ -362,7 +374,7 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
}
// Record the forces on the block atoms.
fvec4 f[4] = {blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f};
transpose(f[0], f[1], f[2], f[3]);
for (int j = 0; j < 4; j++)
......@@ -420,3 +432,30 @@ fvec4 CpuNonbondedForceVec4::ewaldScaleFunction(const fvec4& x) {
transpose(t1, t2, t3, t4);
return coeff1*t1 + coeff2*t2;
}
fvec4 CpuNonbondedForceVec4::exptermsApprox(const fvec4& r) {
fvec4 r1 = r*exptermsDXInv;
ivec4 index = min(floor(r1), NUM_TABLE_POINTS);
fvec4 coeff2 = r1-index;
fvec4 coeff1 = 1.0f-coeff2;
fvec4 t1(&exptermsTable[index[0]]);
fvec4 t2(&exptermsTable[index[1]]);
fvec4 t3(&exptermsTable[index[2]]);
fvec4 t4(&exptermsTable[index[3]]);
transpose(t1, t2, t3, t4);
return coeff1*t1 + coeff2*t2;
}
fvec4 CpuNonbondedForceVec4::dExptermsApprox(const fvec4& r) {
fvec4 r1 = r*exptermsDXInv;
ivec4 index = min(floor(r1), NUM_TABLE_POINTS);
fvec4 coeff2 = r1-index;
fvec4 coeff1 = 1.0f-coeff2;
fvec4 t1(&dExptermsTable[index[0]]);
fvec4 t2(&dExptermsTable[index[1]]);
fvec4 t3(&dExptermsTable[index[2]]);
fvec4 t4(&dExptermsTable[index[3]]);
transpose(t1, t2, t3, t4);
return coeff1*t1 + coeff2*t2;
}
......@@ -27,6 +27,7 @@
#include "openmm/OpenMMException.h"
#include "openmm/internal/hardware.h"
#include <algorithm>
#include <iostream>
using namespace std;
using namespace OpenMM;
......@@ -80,8 +81,7 @@ CpuNonbondedForceVec8::CpuNonbondedForceVec8() {
enum PeriodicType {NoPeriodic, PeriodicPerAtom, PeriodicPerInteraction, PeriodicTriclinic};
void CpuNonbondedForceVec8::calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
// Determine whether we need to apply periodic boundary conditions.
// Determine whether we need to apply periodic boundary conditions.
PeriodicType periodicType;
fvec4 blockCenter;
if (!periodic) {
......@@ -308,6 +308,7 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
blockAtomCharge *= ONE_4PI_EPS0;
fvec8 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first, atomParameters[blockAtom[4]].first, atomParameters[blockAtom[5]].first, atomParameters[blockAtom[6]].first, atomParameters[blockAtom[7]].first);
fvec8 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second, atomParameters[blockAtom[4]].second, atomParameters[blockAtom[5]].second, atomParameters[blockAtom[6]].second, atomParameters[blockAtom[7]].second);
fvec8 C6s(C6params[blockAtom[0]], C6params[blockAtom[1]], C6params[blockAtom[2]], C6params[blockAtom[3]], C6params[blockAtom[4]], C6params[blockAtom[5]], C6params[blockAtom[6]], C6params[blockAtom[7]]);
const bool needPeriodic = (PERIODIC_TYPE == PeriodicPerInteraction || PERIODIC_TYPE == PeriodicTriclinic);
const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
......@@ -348,7 +349,8 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
fvec8 sig2 = inverseR*sig;
sig2 *= sig2;
fvec8 sig6 = sig2*sig2*sig2;
fvec8 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
fvec8 eps = blockAtomEpsilon*atomEpsilon;
fvec8 epsSig6 = eps*sig6;
dEdR = epsSig6*(12.0f*sig6 - 6.0f);
energy = epsSig6*(sig6-1.0f);
if (useSwitch) {
......@@ -358,6 +360,17 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
dEdR = switchValue*dEdR - energy*switchDeriv*r;
energy *= switchValue;
}
if (ljpme) {
fvec8 C6ij = C6s*C6params[atom];
fvec8 inverseR2 = inverseR*inverseR;
fvec8 mysig2 = sig*sig;
fvec8 mysig6 = mysig2*mysig2*mysig2;
fvec8 emult = C6ij*inverseR2*inverseR2*inverseR2*exptermsApprox(r);
fvec8 potentialShift = eps*(1.0f-mysig6*inverseRcut6)*mysig6*inverseRcut6 - C6ij*inverseRcut6Expterm;
dEdR += 6.0f*C6ij*inverseR2*inverseR2*inverseR2*dExptermsApprox(r);
energy += emult + potentialShift;
}
}
else {
energy = 0.0f;
......@@ -464,4 +477,45 @@ fvec8 CpuNonbondedForceVec8::ewaldScaleFunction(const fvec8& x) {
transpose(t1, t2, t3, t4, t5, t6, t7, t8, s1, s2, s3, s4);
return coeff1*s1 + coeff2*s2;
}
fvec8 CpuNonbondedForceVec8::exptermsApprox(const fvec8& r) {
fvec8 r1 = r*exptermsDXInv;
ivec8 index = min(floor(r1), NUM_TABLE_POINTS);
fvec8 coeff2 = r1-index;
fvec8 coeff1 = 1.0f-coeff2;
ivec4 indexLower = index.lowerVec();
ivec4 indexUpper = index.upperVec();
fvec4 t1(&exptermsTable[indexLower[0]]);
fvec4 t2(&exptermsTable[indexLower[1]]);
fvec4 t3(&exptermsTable[indexLower[2]]);
fvec4 t4(&exptermsTable[indexLower[3]]);
fvec4 t5(&exptermsTable[indexUpper[0]]);
fvec4 t6(&exptermsTable[indexUpper[1]]);
fvec4 t7(&exptermsTable[indexUpper[2]]);
fvec4 t8(&exptermsTable[indexUpper[3]]);
fvec8 s1, s2, s3, s4;
transpose(t1, t2, t3, t4, t5, t6, t7, t8, s1, s2, s3, s4);
return coeff1*s1 + coeff2*s2;
}
fvec8 CpuNonbondedForceVec8::dExptermsApprox(const fvec8& r) {
fvec8 r1 = r*exptermsDXInv;
ivec8 index = min(floor(r1), NUM_TABLE_POINTS);
fvec8 coeff2 = r1-index;
fvec8 coeff1 = 1.0f-coeff2;
ivec4 indexLower = index.lowerVec();
ivec4 indexUpper = index.upperVec();
fvec4 t1(&dExptermsTable[indexLower[0]]);
fvec4 t2(&dExptermsTable[indexLower[1]]);
fvec4 t3(&dExptermsTable[indexLower[2]]);
fvec4 t4(&dExptermsTable[indexLower[3]]);
fvec4 t5(&dExptermsTable[indexUpper[0]]);
fvec4 t6(&dExptermsTable[indexUpper[1]]);
fvec4 t7(&dExptermsTable[indexUpper[2]]);
fvec4 t8(&dExptermsTable[indexUpper[3]]);
fvec8 s1, s2, s3, s4;
transpose(t1, t2, t3, t4, t5, t6, t7, t8, s1, s2, s3, s4);
return coeff1*s1 + coeff2*s2;
}
#endif
......@@ -598,8 +598,10 @@ private:
class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
public:
CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcNonbondedForceKernel(name, platform),
cu(cu), hasInitializedFFT(false), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), directPmeGrid(NULL), reciprocalPmeGrid(NULL),
pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeAtomRange(NULL), pmeAtomGridIndex(NULL), pmeEnergyBuffer(NULL), sort(NULL), fft(NULL), pmeio(NULL) {
cu(cu), hasInitializedFFT(false), sigmaEpsilon(NULL), C6s(NULL), exceptionParams(NULL), cosSinSums(NULL), directPmeGrid(NULL), reciprocalPmeGrid(NULL),
directDispersionPmeGrid(NULL), reciprocalDispersionPmeGrid(NULL),
pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeAtomRange(NULL), pmeAtomGridIndex(NULL), pmeAtomDispersionGridIndex(NULL),
pmeEnergyBuffer(NULL), dispersionPmeEnergyBuffer(NULL), sort(NULL), dispersionFft(NULL), fft(NULL), pmeio(NULL), dispersionPmeio(NULL) {
}
~CudaCalcNonbondedForceKernel();
/**
......@@ -636,6 +638,15 @@ public:
* @param nz the number of grid points along the Z axis
*/
void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
/**
* Get the dispersion parameters being used for the dispersion term in LJPME.
*
* @param alpha the separation parameter
* @param nx the number of grid points along the X axis
* @param ny the number of grid points along the Y axis
* @param nz the number of grid points along the Z axis
*/
void getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
private:
class SortTrait : public CudaSort::SortTrait {
int getDataSize() const {return 8;}
......@@ -655,38 +666,55 @@ private:
CudaContext& cu;
bool hasInitializedFFT;
CudaArray* sigmaEpsilon;
CudaArray* C6s;
CudaArray* exceptionParams;
CudaArray* cosSinSums;
CudaArray* directPmeGrid;
CudaArray* reciprocalPmeGrid;
CudaArray* directDispersionPmeGrid;
CudaArray* reciprocalDispersionPmeGrid;
CudaArray* pmeBsplineModuliX;
CudaArray* pmeBsplineModuliY;
CudaArray* pmeBsplineModuliZ;
CudaArray* pmeAtomRange;
CudaArray* pmeAtomGridIndex;
CudaArray* pmeAtomDispersionGridIndex;
CudaArray* pmeEnergyBuffer;
CudaArray* dispersionPmeEnergyBuffer;
CudaSort* sort;
Kernel cpuPme;
Kernel cpuDispersionPme;
PmeIO* pmeio;
CUstream pmeStream;
CUevent pmeSyncEvent;
PmeIO* dispersionPmeio;
CUstream pmeStream, dispersionPmeStream;
CUevent pmeSyncEvent, dispersionPmeSyncEvent;
CudaFFT3D* fft;
cufftHandle fftForward;
cufftHandle fftBackward;
CudaFFT3D* dispersionFft;
cufftHandle dispersionFftForward;
cufftHandle dispersionFftBackward;
CUfunction ewaldSumsKernel;
CUfunction ewaldForcesKernel;
CUfunction pmeGridIndexKernel;
CUfunction pmeDispersionGridIndexKernel;
CUfunction pmeSpreadChargeKernel;
CUfunction pmeDispersionSpreadChargeKernel;
CUfunction pmeFinishSpreadChargeKernel;
CUfunction pmeDispersionFinishSpreadChargeKernel;
CUfunction pmeEvalEnergyKernel;
CUfunction pmeEvalDispersionEnergyKernel;
CUfunction pmeConvolutionKernel;
CUfunction pmeDispersionConvolutionKernel;
CUfunction pmeInterpolateForceKernel;
CUfunction pmeInterpolateDispersionForceKernel;
std::map<std::string, std::string> pmeDefines;
std::vector<std::pair<int, int> > exceptionAtoms;
double ewaldSelfEnergy, dispersionCoefficient, alpha;
double ewaldSelfEnergy, dispersionSelfEnergy, dispersionCoefficient, alpha, dispersionAlpha;
int interpolateForceThreads;
int gridSizeX, gridSizeY, gridSizeZ;
bool hasCoulomb, hasLJ, usePmeStream, useCudaFFT;
int dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ;
bool hasCoulomb, hasLJ, usePmeStream, useCudaFFT, doLJPME;
NonbondedMethod nonbondedMethod;
static const int PmeOrder = 5;
};
......
......@@ -439,6 +439,15 @@ public:
* @param nz the number of grid points along the Z axis
*/
void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
/**
* Get the dispersion parameters being used for the dispersion term in LJPME.
*
* @param alpha the separation parameter
* @param nx the number of grid points along the X axis
* @param ny the number of grid points along the Y axis
* @param nz the number of grid points along the Z axis
*/
void getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
private:
class Task;
CudaPlatform::PlatformData& data;
......
......@@ -1593,14 +1593,20 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
cu.setAsCurrent();
if (sigmaEpsilon != NULL)
delete sigmaEpsilon;
if (C6s != NULL)
delete C6s;
if (exceptionParams != NULL)
delete exceptionParams;
if (cosSinSums != NULL)
delete cosSinSums;
if (directPmeGrid != NULL)
delete directPmeGrid;
if (directDispersionPmeGrid != NULL)
delete directDispersionPmeGrid;
if (reciprocalPmeGrid != NULL)
delete reciprocalPmeGrid;
if (reciprocalDispersionPmeGrid != NULL)
delete reciprocalDispersionPmeGrid;
if (pmeBsplineModuliX != NULL)
delete pmeBsplineModuliX;
if (pmeBsplineModuliY != NULL)
......@@ -1611,14 +1617,20 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
delete pmeAtomRange;
if (pmeAtomGridIndex != NULL)
delete pmeAtomGridIndex;
if (pmeAtomDispersionGridIndex != NULL)
delete pmeAtomDispersionGridIndex;
if (pmeEnergyBuffer != NULL)
delete pmeEnergyBuffer;
if (dispersionPmeEnergyBuffer != NULL)
delete dispersionPmeEnergyBuffer;
if (sort != NULL)
delete sort;
if (fft != NULL)
delete fft;
if (pmeio != NULL)
delete pmeio;
if (dispersionPmeio != NULL)
delete dispersionPmeio;
if (hasInitializedFFT) {
if (useCudaFFT) {
cufftDestroy(fftForward);
......@@ -1627,6 +1639,10 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
if (usePmeStream) {
cuStreamDestroy(pmeStream);
cuEventDestroy(pmeSyncEvent);
if(doLJPME){
cuStreamDestroy(dispersionPmeStream);
cuEventDestroy(dispersionPmeSyncEvent);
}
}
}
}
......@@ -1634,6 +1650,8 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
void CudaCalcNonbondedForceKernel::initialize(const System& system, const NonbondedForce& force) {
cu.setAsCurrent();
nonbondedMethod = CalcNonbondedForceKernel::NonbondedMethod(force.getNonbondedMethod());
// Identify which exceptions are 1-4 interactions.
vector<pair<int, int> > exclusions;
......@@ -1650,14 +1668,29 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
// Initialize nonbonded interactions.
int numParticles = force.getNumParticles();
// Pack the C6 coeffiecient in with sigma and epsilon, in case LJPME is being used. The C6
// coefficients could live in a separate array, but that would hurt cache efficiency for LJPME.
doLJPME = nonbondedMethod == LJPME;
sigmaEpsilon = CudaArray::create<float2>(cu, cu.getPaddedNumAtoms(), "sigmaEpsilon");
if(doLJPME){
if (cu.getUseDoublePrecision())
C6s = CudaArray::create<double>(cu, cu.getPaddedNumAtoms(), "C6s");
else
C6s = CudaArray::create<float>(cu, cu.getPaddedNumAtoms(), "C6s");
}
CudaArray& posq = cu.getPosq();
vector<double4> temp(posq.getSize());
float4* posqf = (float4*) &temp[0];
double4* posqd = (double4*) &temp[0];
// The C6 coefficients for LJPME could be computed from sigma and epsilon, but it
// seems like a good idea to cache them and avoid many recomputations.
vector<double> tmpc6(posq.getSize());
float* c6f = (float*) &tmpc6[0];
double* c6d = (double*) &tmpc6[0];
vector<float2> sigmaEpsilonVector(cu.getPaddedNumAtoms(), make_float2(0, 0));
vector<vector<int> > exclusionList(numParticles);
double sumSquaredCharges = 0.0;
double sumSquaredC6 = 0.0;
hasCoulomb = false;
hasLJ = false;
for (int i = 0; i < numParticles; i++) {
......@@ -1667,7 +1700,17 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
posqd[i] = make_double4(0, 0, 0, charge);
else
posqf[i] = make_float4(0, 0, 0, (float) charge);
sigmaEpsilonVector[i] = make_float2((float) (0.5*sigma), (float) (2.0*sqrt(epsilon)));
double sig = (float) (0.5*sigma);
double eps = (float) (2.0*sqrt(epsilon));
sigmaEpsilonVector[i] = make_float2(sig, eps);
if(doLJPME){
float C6 = (float) 8.0*pow(sig, 3) * eps;
sumSquaredC6 += C6*C6;
if (cu.getUseDoublePrecision())
c6d[i] = 8.0*pow(sig,3)*eps;
else
c6f[i] = 8.0f*pow((float)sig,3)*eps;
}
exclusionList[i].push_back(i);
sumSquaredCharges += charge*charge;
if (charge != 0.0)
......@@ -1681,7 +1724,8 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
}
posq.upload(&temp[0]);
sigmaEpsilon->upload(sigmaEpsilonVector);
nonbondedMethod = CalcNonbondedForceKernel::NonbondedMethod(force.getNonbondedMethod());
if(doLJPME)
C6s->upload(&tmpc6[0]);
bool useCutoff = (nonbondedMethod != NoCutoff);
bool usePeriodic = (nonbondedMethod != NoCutoff && nonbondedMethod != CutoffNonPeriodic);
map<string, string> defines;
......@@ -1705,12 +1749,13 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
defines["LJ_SWITCH_C5"] = cu.doubleToString(6/pow(force.getSwitchingDistance()-force.getCutoffDistance(), 5.0));
}
}
if (force.getUseDispersionCorrection() && cu.getContextIndex() == 0)
if (force.getUseDispersionCorrection() && cu.getContextIndex() == 0 && !doLJPME)
dispersionCoefficient = NonbondedForceImpl::calcDispersionCorrection(system, force);
else
dispersionCoefficient = 0.0;
alpha = 0;
ewaldSelfEnergy = 0.0;
dispersionSelfEnergy = 0.0;
if (nonbondedMethod == Ewald) {
// Compute the Ewald parameters.
......@@ -1740,19 +1785,39 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
cosSinSums = new CudaArray(cu, (2*kmaxx-1)*(2*kmaxy-1)*(2*kmaxz-1), elementSize, "cosSinSums");
}
}
else if (nonbondedMethod == PME) {
else if (nonbondedMethod == PME || nonbondedMethod == LJPME) {
// Compute the PME parameters.
NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSizeX, gridSizeY, gridSizeZ);
//
NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSizeX, gridSizeY, gridSizeZ, false);
gridSizeX = CudaFFT3D::findLegalDimension(gridSizeX);
gridSizeY = CudaFFT3D::findLegalDimension(gridSizeY);
gridSizeZ = CudaFFT3D::findLegalDimension(gridSizeZ);
if(doLJPME){
NonbondedForceImpl::calcPMEParameters(system, force, dispersionAlpha, dispersionGridSizeX,
dispersionGridSizeY, dispersionGridSizeZ, true);
dispersionGridSizeX = CudaFFT3D::findLegalDimension(dispersionGridSizeX);
dispersionGridSizeY = CudaFFT3D::findLegalDimension(dispersionGridSizeY);
dispersionGridSizeZ = CudaFFT3D::findLegalDimension(dispersionGridSizeZ);
}
defines["EWALD_ALPHA"] = cu.doubleToString(alpha);
if(doLJPME) {
defines["EWALD_DISPERSION_ALPHA"] = cu.doubleToString(dispersionAlpha);
double invRCut6 = pow(force.getCutoffDistance(), -6);
double dalphaR = dispersionAlpha * force.getCutoffDistance();
double dar2 = dalphaR*dalphaR;
double dar4 = dar2*dar2;
double multShift6 = -invRCut6*(1.0 - exp(-dar2) * (1.0 + dar2 + 0.5*dar4));
defines["INVCUT6"] = cu.doubleToString(invRCut6);
defines["MULTSHIFT6"] = cu.doubleToString(multShift6);
}
defines["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI));
defines["USE_EWALD"] = "1";
defines["DO_LJPME"] = doLJPME ? "1" : "0";
if (cu.getContextIndex() == 0) {
ewaldSelfEnergy = -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI);
if(doLJPME) dispersionSelfEnergy = pow(dispersionAlpha, 6) * sumSquaredC6 / 12.0;
char deviceName[100];
cuDeviceGetName(deviceName, 100, cu.getDevice());
usePmeStream = (!cu.getPlatformData().disablePmeStream && string(deviceName) != "GeForce GTX 980"); // Using a separate stream is slower on GTX 980
......@@ -1771,7 +1836,19 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
pmeDefines["USE_PME_STREAM"] = "1";
if (cu.getPlatformData().deterministicForces)
pmeDefines["USE_DETERMINISTIC_FORCES"] = "1";
CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::pme, pmeDefines);
if (doLJPME){
pmeDefines["EWALD_DISPERSION_ALPHA"] = cu.doubleToString(dispersionAlpha);
pmeDefines["DISPERSION_GRID_SIZE_X"] = cu.intToString(dispersionGridSizeX);
pmeDefines["DISPERSION_GRID_SIZE_Y"] = cu.intToString(dispersionGridSizeY);
pmeDefines["DISPERSION_GRID_SIZE_Z"] = cu.intToString(dispersionGridSizeZ);
pmeDefines["RECIP_DISPERSION_EXP_FACTOR"] = cu.doubleToString(M_PI*M_PI/(dispersionAlpha*dispersionAlpha));
}
CUmodule module;
if(doLJPME){
module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::pme+CudaKernelSources::ljpme, pmeDefines);
}else{
module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::pme, pmeDefines);
}
if (cu.getPlatformData().useCpuPme) {
// Create the CPU PME kernel.
......@@ -1782,6 +1859,13 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
pmeio = new PmeIO(cu, addForcesKernel);
cu.addPreComputation(new PmePreComputation(cu, cpuPme, *pmeio));
cu.addPostComputation(new PmePostComputation(cpuPme, *pmeio));
if(doLJPME){
cpuDispersionPme = getPlatform().createKernel(CalcDispersionPmeReciprocalForceKernel::Name(), *cu.getPlatformData().context);
cpuDispersionPme.getAs<CalcDispersionPmeReciprocalForceKernel>().initialize(dispersionGridSizeX, dispersionGridSizeY,
dispersionGridSizeZ, numParticles, dispersionAlpha);
cu.addPreComputation(new PmePreComputation(cu, cpuDispersionPme, *pmeio));
cu.addPostComputation(new PmePostComputation(cpuDispersionPme, *pmeio));
}
}
catch (OpenMMException& ex) {
// The CPU PME plugin isn't available.
......@@ -1796,6 +1880,15 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
cuFuncSetCacheConfig(pmeSpreadChargeKernel, CU_FUNC_CACHE_PREFER_L1);
cuFuncSetCacheConfig(pmeInterpolateForceKernel, CU_FUNC_CACHE_PREFER_L1);
if(doLJPME){
pmeDispersionFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadC6");
pmeDispersionGridIndexKernel = cu.getKernel(module, "findAtomDispersionGridIndex");
pmeDispersionSpreadChargeKernel = cu.getKernel(module, "gridSpreadC6");
pmeDispersionConvolutionKernel = cu.getKernel(module, "reciprocalDispersionConvolution");
pmeEvalDispersionEnergyKernel = cu.getKernel(module, "gridEvaluateDispersionEnergy");
pmeInterpolateDispersionForceKernel = cu.getKernel(module, "gridInterpolateDispersionForce");
cuFuncSetCacheConfig(pmeDispersionSpreadChargeKernel, CU_FUNC_CACHE_PREFER_L1);
}
// Create required data structures.
......@@ -1803,14 +1896,28 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
directPmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*gridSizeZ, cu.getComputeCapability() >= 2.0 ? 2*elementSize : 2*sizeof(long long), "originalPmeGrid");
reciprocalPmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*gridSizeZ, 2*elementSize, "reciprocalPmeGrid");
cu.addAutoclearBuffer(*directPmeGrid);
if(doLJPME){
directDispersionPmeGrid = new CudaArray(cu, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ,
cu.getComputeCapability() >= 2.0 ? 2*elementSize : 2*sizeof(long long), "originalDispersionPmeGrid");
cu.addAutoclearBuffer(*directDispersionPmeGrid);
reciprocalDispersionPmeGrid = new CudaArray(cu, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ,
2*elementSize, "reciprocalDispersionPmeGrid");
}
pmeBsplineModuliX = new CudaArray(cu, gridSizeX, elementSize, "pmeBsplineModuliX");
pmeBsplineModuliY = new CudaArray(cu, gridSizeY, elementSize, "pmeBsplineModuliY");
pmeBsplineModuliZ = new CudaArray(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
pmeAtomRange = CudaArray::create<int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
pmeAtomGridIndex = CudaArray::create<int2>(cu, numParticles, "pmeAtomGridIndex");
if(doLJPME)
pmeAtomDispersionGridIndex = CudaArray::create<int2>(cu, numParticles, "pmeAtomDispersionGridIndex");
int energyElementSize = (cu.getUseDoublePrecision() || cu.getUseMixedPrecision() ? sizeof(double) : sizeof(float));
pmeEnergyBuffer = new CudaArray(cu, cu.getNumThreadBlocks()*CudaContext::ThreadBlockSize, energyElementSize, "pmeEnergyBuffer");
cu.clearBuffer(*pmeEnergyBuffer);
if(doLJPME){
dispersionPmeEnergyBuffer = new CudaArray(cu, cu.getNumThreadBlocks()*CudaContext::ThreadBlockSize, energyElementSize,
"dispersionPmeEnergyBuffer");
cu.clearBuffer(*dispersionPmeEnergyBuffer);
}
sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms());
int cufftVersion;
cufftGetVersion(&cufftVersion);
......@@ -1822,12 +1929,24 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
result = cufftPlan3d(&fftBackward, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_Z2D : CUFFT_C2R);
if (result != CUFFT_SUCCESS)
throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
if(doLJPME){
result = cufftPlan3d(&dispersionFftForward, dispersionGridSizeX, dispersionGridSizeY,
dispersionGridSizeZ, cu.getUseDoublePrecision() ? CUFFT_D2Z : CUFFT_R2C);
if (result != CUFFT_SUCCESS)
throw OpenMMException("Error initializing disperison FFT: "+cu.intToString(result));
result = cufftPlan3d(&dispersionFftBackward, dispersionGridSizeX, dispersionGridSizeY,
dispersionGridSizeZ, cu.getUseDoublePrecision() ? CUFFT_Z2D : CUFFT_C2R);
if (result != CUFFT_SUCCESS)
throw OpenMMException("Error initializing disperison FFT: "+cu.intToString(result));
}
}
else
else {
fft = new CudaFFT3D(cu, gridSizeX, gridSizeY, gridSizeZ, true);
dispersionFft = new CudaFFT3D(cu, dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ, true);
}
// Prepare for doing PME on its own stream.
if (usePmeStream) {
cuStreamCreate(&pmeStream, CU_STREAM_NON_BLOCKING);
if (useCudaFFT) {
......@@ -1840,9 +1959,21 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
recipForceGroup = force.getForceGroup();
cu.addPreComputation(new SyncStreamPreComputation(cu, pmeStream, pmeSyncEvent, recipForceGroup));
cu.addPostComputation(new SyncStreamPostComputation(cu, pmeSyncEvent, cu.getKernel(module, "addEnergy"), *pmeEnergyBuffer, recipForceGroup));
if(doLJPME){
cuStreamCreate(&dispersionPmeStream, CU_STREAM_NON_BLOCKING);
// Dispersion terms use yet another stream.
if (useCudaFFT) {
cufftSetStream(dispersionFftForward, dispersionPmeStream);
cufftSetStream(dispersionFftBackward, dispersionPmeStream);
}
CHECK_RESULT(cuEventCreate(&dispersionPmeSyncEvent, CU_EVENT_DISABLE_TIMING),
"Error creating event for Dispersion term of NonbondedForce");
// The force group is the same as the electrostatic PME reciprocal force group.
cu.addPreComputation(new SyncStreamPreComputation(cu, dispersionPmeStream, dispersionPmeSyncEvent, recipForceGroup));
cu.addPostComputation(new SyncStreamPostComputation(cu, dispersionPmeSyncEvent, cu.getKernel(module, "addEnergy"), *dispersionPmeEnergyBuffer, recipForceGroup));
}
}
hasInitializedFFT = true;
// Initialize the b-spline moduli.
int maxSize = max(max(gridSizeX, gridSizeY), gridSizeZ);
......@@ -1916,13 +2047,22 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
}
}
}
// Add the interaction to the default nonbonded kernel.
string source = cu.replaceStrings(CudaKernelSources::coulombLennardJones, defines);
cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup(), true);
if (hasLJ)
cu.getNonbondedUtilities().addParameter(CudaNonbondedUtilities::ParameterInfo("sigmaEpsilon", "float", 2, sizeof(float2), sigmaEpsilon->getDevicePointer()));
string source = cu.replaceStrings(CudaKernelSources::coulombLennardJones, defines);
cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup(), true);
if (hasLJ){
cu.getNonbondedUtilities().addParameter(CudaNonbondedUtilities::ParameterInfo("sigmaEpsilon", "float", 2,
sizeof(float2), sigmaEpsilon->getDevicePointer()));
if(doLJPME){
if (cu.getUseDoublePrecision())
cu.getNonbondedUtilities().addParameter(CudaNonbondedUtilities::ParameterInfo("C6s", "double", 1,
sizeof(double), C6s->getDevicePointer()));
else
cu.getNonbondedUtilities().addParameter(CudaNonbondedUtilities::ParameterInfo("C6s", "float", 1,
sizeof(float), C6s->getDevicePointer()));
}
}
// Initialize the exceptions.
......@@ -1959,9 +2099,9 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
if (directPmeGrid != NULL && includeReciprocal) {
if (usePmeStream)
cu.setCurrentStream(pmeStream);
// Invert the periodic box vectors.
Vec3 boxVectors[3];
cu.getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
double determinant = boxVectors[0][0]*boxVectors[1][1]*boxVectors[2][2];
......@@ -1985,7 +2125,7 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
recipBoxVectorPointer[1] = &recipBoxVectorsFloat[1];
recipBoxVectorPointer[2] = &recipBoxVectorsFloat[2];
}
// Execute the reciprocal space kernels.
void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxSizePointer(),
......@@ -2045,8 +2185,80 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
cuEventRecord(pmeSyncEvent, pmeStream);
cu.restoreDefaultStream();
}
// As written, we check only the Electrostatic grid pointer to get here. We could separate them out, but for
// now we assume that LJPME can only be used if electrostatic PME is also active.
if(doLJPME){
if (usePmeStream)
cu.setCurrentStream(dispersionPmeStream);
void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomDispersionGridIndex->getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeDispersionGridIndexKernel, gridIndexArgs, cu.getNumAtoms());
sort->sort(*pmeAtomDispersionGridIndex);
void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &directDispersionPmeGrid->getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomDispersionGridIndex->getDevicePointer(),
&C6s->getDevicePointer()};
cu.executeKernel(pmeDispersionSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128);
if (cu.getUseDoublePrecision() || cu.getComputeCapability() < 2.0 || cu.getPlatformData().deterministicForces) {
void* finishSpreadArgs[] = {&directDispersionPmeGrid->getDevicePointer()};
cu.executeKernel(pmeDispersionFinishSpreadChargeKernel, finishSpreadArgs, directDispersionPmeGrid->getSize(), 256);
}
if (useCudaFFT) {
if (cu.getUseDoublePrecision())
cufftExecD2Z(dispersionFftForward, (double*) directDispersionPmeGrid->getDevicePointer(), (double2*) reciprocalDispersionPmeGrid->getDevicePointer());
else
cufftExecR2C(dispersionFftForward, (float*) directDispersionPmeGrid->getDevicePointer(), (float2*) reciprocalDispersionPmeGrid->getDevicePointer());
}
else {
fft->execFFT(*directDispersionPmeGrid, *reciprocalDispersionPmeGrid, true);
}
if (includeEnergy) {
void* computeEnergyArgs[] = {&reciprocalDispersionPmeGrid->getDevicePointer(), usePmeStream ? &dispersionPmeEnergyBuffer->getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(),
&pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(), &pmeBsplineModuliZ->getDevicePointer(),
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2],
&C6s->getDevicePointer()};
cu.executeKernel(pmeEvalDispersionEnergyKernel, computeEnergyArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ);
}
void* convolutionArgs[] = {&reciprocalDispersionPmeGrid->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
&pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(), &pmeBsplineModuliZ->getDevicePointer(),
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeDispersionConvolutionKernel, convolutionArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
if (useCudaFFT) {
if (cu.getUseDoublePrecision())
cufftExecZ2D(dispersionFftBackward, (double2*) reciprocalDispersionPmeGrid->getDevicePointer(), (double*) directDispersionPmeGrid->getDevicePointer());
else
cufftExecC2R(dispersionFftBackward, (float2*) reciprocalDispersionPmeGrid->getDevicePointer(), (float*) directDispersionPmeGrid->getDevicePointer());
}
else {
fft->execFFT(*reciprocalDispersionPmeGrid, *directDispersionPmeGrid, false);
}
void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &directDispersionPmeGrid->getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomDispersionGridIndex->getDevicePointer(),
&C6s->getDevicePointer()};
cu.executeKernel(pmeInterpolateDispersionForceKernel, interpolateArgs, cu.getNumAtoms(), 128);
if (usePmeStream) {
cuEventRecord(dispersionPmeSyncEvent, dispersionPmeStream);
cu.restoreDefaultStream();
}
}
}
double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0);
if(includeReciprocal) energy += dispersionSelfEnergy;
if (dispersionCoefficient != 0.0 && includeDirect) {
double4 boxSize = cu.getPeriodicBoxSize();
energy += dispersionCoefficient/(boxSize.x*boxSize.y*boxSize.z);
......@@ -2092,7 +2304,9 @@ void CudaCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context,
float4* posqf = (float4*) cu.getPinnedBuffer();
double4* posqd = (double4*) cu.getPinnedBuffer();
vector<float2> sigmaEpsilonVector(cu.getPaddedNumAtoms(), make_float2(0, 0));
vector<float3> sigmaEpsilonC6Vector(cu.getPaddedNumAtoms(), make_float3(0, 0, 0));
double sumSquaredCharges = 0.0;
double sumSquaredC6 = 0.0;
const vector<int>& order = cu.getAtomIndex();
for (int i = 0; i < force.getNumParticles(); i++) {
int index = order[i];
......@@ -2102,11 +2316,23 @@ void CudaCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context,
posqd[i].w = charge;
else
posqf[i].w = (float) charge;
sigmaEpsilonVector[index] = make_float2((float) (0.5*sigma), (float) (2.0*sqrt(epsilon)));
float sig = (float) (0.5*sigma);
float eps = (float) (2.0*sqrt(epsilon));
if(doLJPME){
float C6 = (float) 8.0*pow(sig, 3) * eps;
sumSquaredC6 += C6*C6;
sigmaEpsilonC6Vector[index] = make_float3(sig, eps, C6);
}else{
sigmaEpsilonVector[index] = make_float2(sig, eps);
}
sumSquaredCharges += charge*charge;
}
posq.upload(cu.getPinnedBuffer());
sigmaEpsilon->upload(sigmaEpsilonVector);
if(doLJPME){
sigmaEpsilon->upload(sigmaEpsilonC6Vector);
}else{
sigmaEpsilon->upload(sigmaEpsilonVector);
}
// Record the exceptions.
......@@ -2125,6 +2351,8 @@ void CudaCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context,
if (nonbondedMethod == Ewald || nonbondedMethod == PME)
ewaldSelfEnergy = (cu.getContextIndex() == 0 ? -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI) : 0.0);
if (nonbondedMethod == LJPME)
dispersionSelfEnergy = (cu.getContextIndex() == 0 ? pow(dispersionAlpha, 6) * sumSquaredC6 / 12.0 : 0);
if (force.getUseDispersionCorrection() && cu.getContextIndex() == 0 && (nonbondedMethod == CutoffPeriodic || nonbondedMethod == Ewald || nonbondedMethod == PME))
dispersionCoefficient = NonbondedForceImpl::calcDispersionCorrection(context.getSystem(), force);
cu.invalidateMolecules();
......@@ -2143,6 +2371,20 @@ void CudaCalcNonbondedForceKernel::getPMEParameters(double& alpha, int& nx, int&
}
}
void CudaCalcNonbondedForceKernel::getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
if (!doLJPME)
throw OpenMMException("getPMEParametersInContext: This Context is not using PME");
if (cu.getPlatformData().useCpuPme)
//cpuPme.getAs<CalcPmeReciprocalForceKernel>().getLJPMEParameters(alpha, nx, ny, nz);
throw OpenMMException("getPMEParametersInContext: CPUPME has not been implemented for LJPME yet.");
else {
alpha = this->dispersionAlpha;
nx = dispersionGridSizeX;
ny = dispersionGridSizeY;
nz = dispersionGridSizeZ;
}
}
class CudaCustomNonbondedForceInfo : public CudaForceInfo {
public:
CudaCustomNonbondedForceInfo(const CustomNonbondedForce& force) : force(force) {
......
......@@ -628,6 +628,10 @@ void CudaParallelCalcNonbondedForceKernel::getPMEParameters(double& alpha, int&
dynamic_cast<const CudaCalcNonbondedForceKernel&>(kernels[0].getImpl()).getPMEParameters(alpha, nx, ny, nz);
}
void CudaParallelCalcNonbondedForceKernel::getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
dynamic_cast<const CudaCalcNonbondedForceKernel&>(kernels[0].getImpl()).getLJPMEParameters(alpha, nx, ny, nz);
}
class CudaParallelCalcCustomNonbondedForceKernel::Task : public CudaContext::WorkTask {
public:
Task(ContextImpl& context, CudaCalcCustomNonbondedForceKernel& kernel, bool includeForce,
......
......@@ -247,6 +247,10 @@ CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& sys
CHECK_RESULT(cuDeviceGetName(name, 1000, contexts[i]->getDevice()), "Error querying device name");
deviceName << name;
}
size_t printfsize;
cuCtxGetLimit(&printfsize, CU_LIMIT_PRINTF_FIFO_SIZE);
cuCtxSetLimit(CU_LIMIT_PRINTF_FIFO_SIZE, 10*printfsize);
useCpuPme = (cpuPmeProperty == "true" && !contexts[0]->getUseDoublePrecision());
disablePmeStream = (pmeStreamProperty == "true");
deterministicForces = (deterministicForcesProperty == "true");
......
......@@ -17,6 +17,25 @@
const real erfcAlphaR = (0.254829592f+(-0.284496736f+(1.421413741f+(-1.453152027f+1.061405429f*t)*t)*t)*t)*t*expAlphaRSqr;
#endif
real tempForce = 0.0f;
#if HAS_LENNARD_JONES
// The multiplicative term to correct for the multiplicative terms that are always
// present in reciprocal space. The real terms have an additive contribution
// added in, but for excluded terms the multiplicative term is just subtracted.
// These factors are needed in both clauses of the needCorrection statement, so
// I declare them up here.
#if DO_LJPME
const real dispersionAlphaR = EWALD_DISPERSION_ALPHA*r;
const real dar2 = dispersionAlphaR*dispersionAlphaR;
const real dar4 = dar2*dar2;
const real dar6 = dar4*dar2;
const real invR2 = invR*invR;
const real expDar2 = EXP(-dar2);
const real c6 = C6s1*C6s2;
const real coef = invR2*invR2*invR2*c6;
const real eprefac = 1.0f + dar2 + 0.5f*dar4;
const real dprefac = eprefac + dar6/6.0f;
#endif
#endif
if (needCorrection) {
// Subtract off the part of this interaction that was included in the reciprocal space contribution.
......@@ -29,6 +48,13 @@
includeInteraction = false;
tempEnergy -= TWO_OVER_SQRT_PI*EWALD_ALPHA*138.935456f*posq1.w*posq2.w;
}
#if HAS_LENNARD_JONES
#if DO_LJPME
// The multiplicative grid term
tempEnergy += coef*(1.0f - expDar2*eprefac);
tempForce += 6.0f*coef*(1.0f - expDar2*dprefac);
#endif
#endif
}
else {
#if HAS_LENNARD_JONES
......@@ -36,7 +62,8 @@
real sig2 = invR*sig;
sig2 *= sig2;
real sig6 = sig2*sig2*sig2;
real epssig6 = sig6*(sigmaEpsilon1.y*sigmaEpsilon2.y);
real eps = sigmaEpsilon1.y*sigmaEpsilon2.y;
real epssig6 = sig6*eps;
tempForce = epssig6*(12.0f*sig6 - 6.0f);
real ljEnergy = epssig6*(sig6 - 1.0f);
#if USE_LJ_SWITCH
......@@ -48,6 +75,22 @@
ljEnergy *= switchValue;
}
#endif
#if DO_LJPME
// The multiplicative grid term
ljEnergy += coef*(1.0f - expDar2*eprefac);
tempForce += 6.0f*coef*(1.0f - expDar2*dprefac);
// The potential shift accounts for the step at the cutoff introduced by the
// transition from additive to multiplicative combintion rules and is only
// needed for the real (not excluded) terms. By addin these terms to ljEnergy
// instead of tempEnergy here, the includeInteraction mask is correctly applied.
sig2 = sig*sig;
sig6 = sig2*sig2*sig2*INVCUT6;
epssig6 = eps*sig6;
// The additive part of the potential shift
ljEnergy += epssig6*(1.0f - sig6);
// The multiplicative part of the potential shift
ljEnergy += MULTSHIFT6*c6;
#endif
tempForce += prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
tempEnergy += includeInteraction ? ljEnergy + prefactor*erfcAlphaR : 0;
#else
......
extern "C" __global__ void findAtomDispersionGridIndex(const real4* __restrict__ posq, int2* __restrict__ pmeAtomGridIndex,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
// Compute the index of the grid point each atom is associated with.
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
real4 pos = posq[i];
APPLY_PERIODIC_TO_POS(pos)
real3 t = make_real3(pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x,
pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y,
pos.z*recipBoxVecZ.z);
t.x = (t.x-floor(t.x))*DISPERSION_GRID_SIZE_X;
t.y = (t.y-floor(t.y))*DISPERSION_GRID_SIZE_Y;
t.z = (t.z-floor(t.z))*DISPERSION_GRID_SIZE_Z;
int3 gridIndex = make_int3(((int) t.x) % DISPERSION_GRID_SIZE_X,
((int) t.y) % DISPERSION_GRID_SIZE_Y,
((int) t.z) % DISPERSION_GRID_SIZE_Z);
pmeAtomGridIndex[i] = make_int2(i, gridIndex.x*DISPERSION_GRID_SIZE_Y*DISPERSION_GRID_SIZE_Z+gridIndex.y*DISPERSION_GRID_SIZE_Z+gridIndex.z);
}
}
extern "C" __global__ void gridSpreadC6(const real4* __restrict__ posq, real* __restrict__ originalPmeGrid,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ, const int2* __restrict__ pmeAtomGridIndex,
const real* __restrict__ C6s) {
real3 data[PME_ORDER];
const real scale = RECIP(PME_ORDER-1);
// Process the atoms in spatially sorted order. This improves efficiency when writing
// the grid values.
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
int atom = pmeAtomGridIndex[i].x;
real4 pos = posq[atom];
APPLY_PERIODIC_TO_POS(pos)
real3 t = make_real3(pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x,
pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y,
pos.z*recipBoxVecZ.z);
t.x = (t.x-floor(t.x))*DISPERSION_GRID_SIZE_X;
t.y = (t.y-floor(t.y))*DISPERSION_GRID_SIZE_Y;
t.z = (t.z-floor(t.z))*DISPERSION_GRID_SIZE_Z;
int3 gridIndex = make_int3(((int) t.x) % DISPERSION_GRID_SIZE_X,
((int) t.y) % DISPERSION_GRID_SIZE_Y,
((int) t.z) % DISPERSION_GRID_SIZE_Z);
// Since we need the full set of thetas, it's faster to compute them here than load them
// from global memory.
real3 dr = make_real3(t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z);
data[PME_ORDER-1] = make_real3(0);
data[1] = dr;
data[0] = make_real3(1)-dr;
for (int j = 3; j < PME_ORDER; j++) {
real div = RECIP(j-1);
data[j-1] = div*dr*data[j-2];
for (int k = 1; k < (j-1); k++)
data[j-k-1] = div*((dr+make_real3(k))*data[j-k-2] + (make_real3(j-k)-dr)*data[j-k-1]);
data[0] = div*(make_real3(1)-dr)*data[0];
}
data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
for (int j = 1; j < (PME_ORDER-1); j++)
data[PME_ORDER-j-1] = scale*((dr+make_real3(j))*data[PME_ORDER-j-2] + (make_real3(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
data[0] = scale*(make_real3(1)-dr)*data[0];
// Spread the charge from this atom onto each grid point.
for (int ix = 0; ix < PME_ORDER; ix++) {
int xbase = gridIndex.x+ix;
xbase -= (xbase >= DISPERSION_GRID_SIZE_X ? DISPERSION_GRID_SIZE_X : 0);
xbase = xbase*DISPERSION_GRID_SIZE_Y*DISPERSION_GRID_SIZE_Z;
real dx = data[ix].x;
for (int iy = 0; iy < PME_ORDER; iy++) {
int ybase = gridIndex.y+iy;
ybase -= (ybase >= DISPERSION_GRID_SIZE_Y ? DISPERSION_GRID_SIZE_Y : 0);
ybase = xbase + ybase*DISPERSION_GRID_SIZE_Z;
real dy = data[iy].y;
for (int iz = 0; iz < PME_ORDER; iz++) {
int zindex = gridIndex.z+iz;
zindex -= (zindex >= DISPERSION_GRID_SIZE_Z ? DISPERSION_GRID_SIZE_Z : 0);
int index = ybase + zindex;
// We need to grab the C6 coefficient from the array
real add = C6s[atom]*dx*dy*data[iz].z;
#ifdef USE_DOUBLE_PRECISION
unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid;
atomicAdd(&ulonglong_p[index], static_cast<unsigned long long>((long long) (add*0x100000000)));
#elif __CUDA_ARCH__ < 200 || defined(USE_DETERMINISTIC_FORCES)
unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid;
int gridIndex = index;
gridIndex = (gridIndex%2 == 0 ? gridIndex/2 : (gridIndex+DISPERSION_GRID_SIZE_X*DISPERSION_GRID_SIZE_Y*DISPERSION_GRID_SIZE_Z)/2);
atomicAdd(&ulonglong_p[gridIndex], static_cast<unsigned long long>((long long) (add*0x100000000)));
#else
atomicAdd(&originalPmeGrid[index], add);
#endif
}
}
}
}
}
extern "C" __global__ void finishSpreadC6(long long* __restrict__ originalPmeGrid) {
real* floatGrid = (real*) originalPmeGrid;
const unsigned int gridSize = DISPERSION_GRID_SIZE_X*DISPERSION_GRID_SIZE_Y*DISPERSION_GRID_SIZE_Z;
real scale = 1.0f/(real) 0x100000000;
#ifdef USE_DOUBLE_PRECISION
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x)
floatGrid[index] = scale*originalPmeGrid[index];
#else
for (int index = 2*(blockIdx.x*blockDim.x+threadIdx.x); index < gridSize; index += 2*blockDim.x*gridDim.x) {
floatGrid[index] = scale*originalPmeGrid[index/2];
if (index+1 < gridSize)
floatGrid[index+1] = scale*originalPmeGrid[(index+gridSize+1)/2];
}
#endif
}
// convolutes the dispersion grid on the halfcomplex_pmeGrid, which is of size NX*NY*(NZ/2+1) as F(Q) is conjugate symmetric
extern "C" __global__ void
reciprocalDispersionConvolution(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__ energyBuffer,
const real* __restrict__ pmeBsplineModuliX, const real* __restrict__ pmeBsplineModuliY, const real* __restrict__ pmeBsplineModuliZ,
real4 periodicBoxSize, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
// R2C stores into a half complex matrix where the last dimension is cut by half
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*(GRID_SIZE_Z/2+1);
const real scaleFactor = -2*M_PI*SQRT(M_PI)*RECIP(6*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
const real alpha = EWALD_DISPERSION_ALPHA;
real bfac = M_PI / alpha;
real fac1 = 2*M_PI*M_PI*M_PI*SQRT(M_PI);
real fac2 = alpha*alpha*alpha;
real fac3 = -2*alpha*M_PI*M_PI;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) {
// real indices
int kx = index/(GRID_SIZE_Y*(GRID_SIZE_Z/2+1));
int remainder = index-kx*GRID_SIZE_Y*(GRID_SIZE_Z/2+1);
int ky = remainder/(GRID_SIZE_Z/2+1);
int kz = remainder-ky*(GRID_SIZE_Z/2+1);
int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
real mhx = mx*recipBoxVecX.x;
real mhy = mx*recipBoxVecY.x+my*recipBoxVecY.y;
real mhz = mx*recipBoxVecZ.x+my*recipBoxVecZ.y+mz*recipBoxVecZ.z;
real bx = pmeBsplineModuliX[kx];
real by = pmeBsplineModuliY[ky];
real bz = pmeBsplineModuliZ[kz];
real denom = scaleFactor/(bx*by*bz);
real2 grid = halfcomplex_pmeGrid[index];
real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
real m = SQRT(m2);
real m3 = m*m2;
real b = bfac*m;
real expfac = -b*b;
real expterm = EXP(expfac);
#if FAST_ERFC
// This approximation for erfc is from Abramowitz and Stegun (1964) p. 299. They cite the following as
// the original source: C. Hastings, Jr., Approximations for Digital Computers (1955). It has a maximum
// error of 1.5e-7. Stolen by ACS from the CUDA platform's AMOEBA plugin.
real t = 1.0f/(1.0f+0.3275911f*b);
real erfcterm = (0.254829592f+(-0.284496736f+(1.421413741f+(-1.453152027f+1.061405429f*t)*t)*t)*t)*t*expterm;
#else
real erfcterm = ERFC(b);
#endif
real eterm = (fac1*erfcterm*m3 + expterm*(fac2 + fac3*m2)) * denom;
halfcomplex_pmeGrid[index] = make_real2(grid.x*eterm, grid.y*eterm);
}
}
extern "C" __global__ void
gridEvaluateDispersionEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__ energyBuffer,
const real* __restrict__ pmeBsplineModuliX, const real* __restrict__ pmeBsplineModuliY, const real* __restrict__ pmeBsplineModuliZ,
real4 periodicBoxSize, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
// R2C stores into a half complex matrix where the last dimension is cut by half
const unsigned int gridSize = DISPERSION_GRID_SIZE_X*DISPERSION_GRID_SIZE_Y*DISPERSION_GRID_SIZE_Z;
const real scaleFactor = -2*M_PI*SQRT(M_PI)*RECIP(6*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
const real alpha = EWALD_DISPERSION_ALPHA;
real bfac = M_PI / alpha;
real fac1 = 2*M_PI*M_PI*M_PI*SQRT(M_PI);
real fac2 = alpha*alpha*alpha;
real fac3 = -2*alpha*M_PI*M_PI;
mixed energy = 0;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) {
// real indices
int kx = index/(DISPERSION_GRID_SIZE_Y*(DISPERSION_GRID_SIZE_Z));
int remainder = index-kx*DISPERSION_GRID_SIZE_Y*(DISPERSION_GRID_SIZE_Z);
int ky = remainder/(DISPERSION_GRID_SIZE_Z);
int kz = remainder-ky*(DISPERSION_GRID_SIZE_Z);
int mx = (kx < (DISPERSION_GRID_SIZE_X+1)/2) ? kx : (kx-DISPERSION_GRID_SIZE_X);
int my = (ky < (DISPERSION_GRID_SIZE_Y+1)/2) ? ky : (ky-DISPERSION_GRID_SIZE_Y);
int mz = (kz < (DISPERSION_GRID_SIZE_Z+1)/2) ? kz : (kz-DISPERSION_GRID_SIZE_Z);
real mhx = mx*recipBoxVecX.x;
real mhy = mx*recipBoxVecY.x+my*recipBoxVecY.y;
real mhz = mx*recipBoxVecZ.x+my*recipBoxVecZ.y+mz*recipBoxVecZ.z;
real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
real bx = pmeBsplineModuliX[kx];
real by = pmeBsplineModuliY[ky];
real bz = pmeBsplineModuliZ[kz];
real denom = scaleFactor/(bx*by*bz);
real m = SQRT(m2);
real m3 = m*m2;
real b = bfac*m;
real expfac = -b*b;
real expterm = EXP(expfac);
#if FAST_ERFC
// This approximation for erfc is from Abramowitz and Stegun (1964) p. 299. They cite the following as
// the original source: C. Hastings, Jr., Approximations for Digital Computers (1955). It has a maximum
// error of 1.5e-7. Stolen by ACS from the CUDA platform's AMOEBA plugin.
real t = 1.0f/(1.0f+0.3275911f*b);
real erfcterm = (0.254829592f+(-0.284496736f+(1.421413741f+(-1.453152027f+1.061405429f*t)*t)*t)*t)*t*expterm;
#else
real erfcterm = ERFC(b);
#endif
real eterm = (fac1*erfcterm*m3 + expterm*(fac2 + fac3*m2)) * denom;
if (kz >= (DISPERSION_GRID_SIZE_Z/2+1)) {
kx = ((kx == 0) ? kx : DISPERSION_GRID_SIZE_X-kx);
ky = ((ky == 0) ? ky : DISPERSION_GRID_SIZE_Y-ky);
kz = DISPERSION_GRID_SIZE_Z-kz;
}
int indexInHalfComplexGrid = kz + ky*(DISPERSION_GRID_SIZE_Z/2+1)+kx*(DISPERSION_GRID_SIZE_Y*(DISPERSION_GRID_SIZE_Z/2+1));
real2 grid = halfcomplex_pmeGrid[indexInHalfComplexGrid];
// N.B. We inlcude the 0,0,0 point for dispersion
energy += eterm*(grid.x*grid.x + grid.y*grid.y);
}
#ifdef USE_PME_STREAM
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] = 0.5f*energy;
#else
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += 0.5f*energy;
#endif
}
extern "C" __global__
void gridInterpolateDispersionForce(const real4* __restrict__ posq, unsigned long long* __restrict__ forceBuffers, const real* __restrict__ originalPmeGrid,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ, const int2* __restrict__ pmeAtomGridIndex, const real* __restrict__ C6s) {
real3 data[PME_ORDER];
real3 ddata[PME_ORDER];
const real scale = RECIP(PME_ORDER-1);
// Process the atoms in spatially sorted order. This improves cache performance when loading
// the grid values.
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
int atom = pmeAtomGridIndex[i].x;
real3 force = make_real3(0);
real4 pos = posq[atom];
APPLY_PERIODIC_TO_POS(pos)
real3 t = make_real3(pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x,
pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y,
pos.z*recipBoxVecZ.z);
t.x = (t.x-floor(t.x))*DISPERSION_GRID_SIZE_X;
t.y = (t.y-floor(t.y))*DISPERSION_GRID_SIZE_Y;
t.z = (t.z-floor(t.z))*DISPERSION_GRID_SIZE_Z;
int3 gridIndex = make_int3(((int) t.x) % DISPERSION_GRID_SIZE_X,
((int) t.y) % DISPERSION_GRID_SIZE_Y,
((int) t.z) % DISPERSION_GRID_SIZE_Z);
// Since we need the full set of thetas, it's faster to compute them here than load them
// from global memory.
real3 dr = make_real3(t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z);
data[PME_ORDER-1] = make_real3(0);
data[1] = dr;
data[0] = make_real3(1)-dr;
for (int j = 3; j < PME_ORDER; j++) {
real div = RECIP(j-1);
data[j-1] = div*dr*data[j-2];
for (int k = 1; k < (j-1); k++)
data[j-k-1] = div*((dr+make_real3(k))*data[j-k-2] + (make_real3(j-k)-dr)*data[j-k-1]);
data[0] = div*(make_real3(1)-dr)*data[0];
}
ddata[0] = -data[0];
for (int j = 1; j < PME_ORDER; j++)
ddata[j] = data[j-1]-data[j];
data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
for (int j = 1; j < (PME_ORDER-1); j++)
data[PME_ORDER-j-1] = scale*((dr+make_real3(j))*data[PME_ORDER-j-2] + (make_real3(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
data[0] = scale*(make_real3(1)-dr)*data[0];
// Compute the force on this atom.
for (int ix = 0; ix < PME_ORDER; ix++) {
int xbase = gridIndex.x+ix;
xbase -= (xbase >= DISPERSION_GRID_SIZE_X ? DISPERSION_GRID_SIZE_X : 0);
xbase = xbase*DISPERSION_GRID_SIZE_Y*DISPERSION_GRID_SIZE_Z;
real dx = data[ix].x;
real ddx = ddata[ix].x;
for (int iy = 0; iy < PME_ORDER; iy++) {
int ybase = gridIndex.y+iy;
ybase -= (ybase >= DISPERSION_GRID_SIZE_Y ? DISPERSION_GRID_SIZE_Y : 0);
ybase = xbase + ybase*DISPERSION_GRID_SIZE_Z;
real dy = data[iy].y;
real ddy = ddata[iy].y;
for (int iz = 0; iz < PME_ORDER; iz++) {
int zindex = gridIndex.z+iz;
zindex -= (zindex >= DISPERSION_GRID_SIZE_Z ? DISPERSION_GRID_SIZE_Z : 0);
int index = ybase + zindex;
real gridvalue = originalPmeGrid[index];
force.x += ddx*dy*data[iz].z*gridvalue;
force.y += dx*ddy*data[iz].z*gridvalue;
force.z += dx*dy*ddata[iz].z*gridvalue;
}
}
}
real q = C6s[atom];
real forceX = -q*(force.x*DISPERSION_GRID_SIZE_X*recipBoxVecX.x);
real forceY = -q*(force.x*DISPERSION_GRID_SIZE_X*recipBoxVecY.x+force.y*DISPERSION_GRID_SIZE_Y*recipBoxVecY.y);
real forceZ = -q*(force.x*DISPERSION_GRID_SIZE_X*recipBoxVecZ.x+force.y*DISPERSION_GRID_SIZE_Y*recipBoxVecZ.y+force.z*DISPERSION_GRID_SIZE_Z*recipBoxVecZ.z);
atomicAdd(&forceBuffers[atom], static_cast<unsigned long long>((long long) (forceX*0x100000000)));
atomicAdd(&forceBuffers[atom+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (forceY*0x100000000)));
atomicAdd(&forceBuffers[atom+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (forceZ*0x100000000)));
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment