Commit dbcc494f authored by Peter Eastman's avatar Peter Eastman
Browse files

Cleanup to CUDA dispersion PME code

parent a9f65649
...@@ -75,7 +75,7 @@ public: ...@@ -75,7 +75,7 @@ public:
* This is a utility routine that calculates the values to use for alpha and grid size when using * This is a utility routine that calculates the values to use for alpha and grid size when using
* Particle Mesh Ewald. * Particle Mesh Ewald.
*/ */
static void calcPMEParameters(const System& system, const NonbondedForce& force, double& alpha, int& xsize, int& ysize, int& zsize, bool LJ); static void calcPMEParameters(const System& system, const NonbondedForce& force, double& alpha, int& xsize, int& ysize, int& zsize, bool lj);
/** /**
* Compute the coefficient which, when divided by the periodic box volume, gives the * Compute the coefficient which, when divided by the periodic box volume, gives the
* long range dispersion correction to the energy. * long range dispersion correction to the energy.
......
...@@ -151,12 +151,11 @@ void NonbondedForceImpl::calcEwaldParameters(const System& system, const Nonbond ...@@ -151,12 +151,11 @@ void NonbondedForceImpl::calcEwaldParameters(const System& system, const Nonbond
kmaxz++; kmaxz++;
} }
void NonbondedForceImpl::calcPMEParameters(const System& system, const NonbondedForce& force, double& alpha, int& xsize, int& ysize, int& zsize, bool LJ) { void NonbondedForceImpl::calcPMEParameters(const System& system, const NonbondedForce& force, double& alpha, int& xsize, int& ysize, int& zsize, bool lj) {
if(LJ) { if (lj)
force.getLJPMEParameters(alpha, xsize, ysize, zsize); force.getLJPMEParameters(alpha, xsize, ysize, zsize);
} else { else
force.getPMEParameters(alpha, xsize, ysize, zsize); force.getPMEParameters(alpha, xsize, ysize, zsize);
}
if (alpha == 0.0) { if (alpha == 0.0) {
Vec3 boxVectors[3]; Vec3 boxVectors[3];
system.getDefaultPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]); system.getDefaultPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
......
...@@ -598,10 +598,9 @@ private: ...@@ -598,10 +598,9 @@ private:
class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel { class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
public: public:
CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcNonbondedForceKernel(name, platform), CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcNonbondedForceKernel(name, platform),
cu(cu), hasInitializedFFT(false), sigmaEpsilon(NULL), C6s(NULL), exceptionParams(NULL), cosSinSums(NULL), directPmeGrid(NULL), reciprocalPmeGrid(NULL), cu(cu), hasInitializedFFT(false), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), directPmeGrid(NULL), reciprocalPmeGrid(NULL),
directDispersionPmeGrid(NULL), reciprocalDispersionPmeGrid(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeAtomRange(NULL), pmeAtomGridIndex(NULL),
pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeAtomRange(NULL), pmeAtomGridIndex(NULL), pmeAtomDispersionGridIndex(NULL), pmeEnergyBuffer(NULL), sort(NULL), dispersionFft(NULL), fft(NULL), pmeio(NULL) {
pmeEnergyBuffer(NULL), dispersionPmeEnergyBuffer(NULL), sort(NULL), dispersionFft(NULL), fft(NULL), pmeio(NULL), dispersionPmeio(NULL) {
} }
~CudaCalcNonbondedForceKernel(); ~CudaCalcNonbondedForceKernel();
/** /**
...@@ -666,28 +665,22 @@ private: ...@@ -666,28 +665,22 @@ private:
CudaContext& cu; CudaContext& cu;
bool hasInitializedFFT; bool hasInitializedFFT;
CudaArray* sigmaEpsilon; CudaArray* sigmaEpsilon;
CudaArray* C6s;
CudaArray* exceptionParams; CudaArray* exceptionParams;
CudaArray* cosSinSums; CudaArray* cosSinSums;
CudaArray* directPmeGrid; CudaArray* directPmeGrid;
CudaArray* reciprocalPmeGrid; CudaArray* reciprocalPmeGrid;
CudaArray* directDispersionPmeGrid;
CudaArray* reciprocalDispersionPmeGrid;
CudaArray* pmeBsplineModuliX; CudaArray* pmeBsplineModuliX;
CudaArray* pmeBsplineModuliY; CudaArray* pmeBsplineModuliY;
CudaArray* pmeBsplineModuliZ; CudaArray* pmeBsplineModuliZ;
CudaArray* pmeAtomRange; CudaArray* pmeAtomRange;
CudaArray* pmeAtomGridIndex; CudaArray* pmeAtomGridIndex;
CudaArray* pmeAtomDispersionGridIndex;
CudaArray* pmeEnergyBuffer; CudaArray* pmeEnergyBuffer;
CudaArray* dispersionPmeEnergyBuffer;
CudaSort* sort; CudaSort* sort;
Kernel cpuPme; Kernel cpuPme;
Kernel cpuDispersionPme; Kernel cpuDispersionPme;
PmeIO* pmeio; PmeIO* pmeio;
PmeIO* dispersionPmeio; CUstream pmeStream;
CUstream pmeStream, dispersionPmeStream; CUevent pmeSyncEvent;
CUevent pmeSyncEvent, dispersionPmeSyncEvent;
CudaFFT3D* fft; CudaFFT3D* fft;
cufftHandle fftForward; cufftHandle fftForward;
cufftHandle fftBackward; cufftHandle fftBackward;
...@@ -710,7 +703,7 @@ private: ...@@ -710,7 +703,7 @@ private:
CUfunction pmeInterpolateDispersionForceKernel; CUfunction pmeInterpolateDispersionForceKernel;
std::map<std::string, std::string> pmeDefines; std::map<std::string, std::string> pmeDefines;
std::vector<std::pair<int, int> > exceptionAtoms; std::vector<std::pair<int, int> > exceptionAtoms;
double ewaldSelfEnergy, dispersionSelfEnergy, dispersionCoefficient, alpha, dispersionAlpha; double ewaldSelfEnergy, dispersionCoefficient, alpha, dispersionAlpha;
int interpolateForceThreads; int interpolateForceThreads;
int gridSizeX, gridSizeY, gridSizeZ; int gridSizeX, gridSizeY, gridSizeZ;
int dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ; int dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ;
......
...@@ -1593,20 +1593,14 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() { ...@@ -1593,20 +1593,14 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
cu.setAsCurrent(); cu.setAsCurrent();
if (sigmaEpsilon != NULL) if (sigmaEpsilon != NULL)
delete sigmaEpsilon; delete sigmaEpsilon;
if (C6s != NULL)
delete C6s;
if (exceptionParams != NULL) if (exceptionParams != NULL)
delete exceptionParams; delete exceptionParams;
if (cosSinSums != NULL) if (cosSinSums != NULL)
delete cosSinSums; delete cosSinSums;
if (directPmeGrid != NULL) if (directPmeGrid != NULL)
delete directPmeGrid; delete directPmeGrid;
if (directDispersionPmeGrid != NULL)
delete directDispersionPmeGrid;
if (reciprocalPmeGrid != NULL) if (reciprocalPmeGrid != NULL)
delete reciprocalPmeGrid; delete reciprocalPmeGrid;
if (reciprocalDispersionPmeGrid != NULL)
delete reciprocalDispersionPmeGrid;
if (pmeBsplineModuliX != NULL) if (pmeBsplineModuliX != NULL)
delete pmeBsplineModuliX; delete pmeBsplineModuliX;
if (pmeBsplineModuliY != NULL) if (pmeBsplineModuliY != NULL)
...@@ -1617,20 +1611,14 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() { ...@@ -1617,20 +1611,14 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
delete pmeAtomRange; delete pmeAtomRange;
if (pmeAtomGridIndex != NULL) if (pmeAtomGridIndex != NULL)
delete pmeAtomGridIndex; delete pmeAtomGridIndex;
if (pmeAtomDispersionGridIndex != NULL)
delete pmeAtomDispersionGridIndex;
if (pmeEnergyBuffer != NULL) if (pmeEnergyBuffer != NULL)
delete pmeEnergyBuffer; delete pmeEnergyBuffer;
if (dispersionPmeEnergyBuffer != NULL)
delete dispersionPmeEnergyBuffer;
if (sort != NULL) if (sort != NULL)
delete sort; delete sort;
if (fft != NULL) if (fft != NULL)
delete fft; delete fft;
if (pmeio != NULL) if (pmeio != NULL)
delete pmeio; delete pmeio;
if (dispersionPmeio != NULL)
delete dispersionPmeio;
if (hasInitializedFFT) { if (hasInitializedFFT) {
if (useCudaFFT) { if (useCudaFFT) {
cufftDestroy(fftForward); cufftDestroy(fftForward);
...@@ -1639,10 +1627,6 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() { ...@@ -1639,10 +1627,6 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
if (usePmeStream) { if (usePmeStream) {
cuStreamDestroy(pmeStream); cuStreamDestroy(pmeStream);
cuEventDestroy(pmeSyncEvent); cuEventDestroy(pmeSyncEvent);
if(doLJPME){
cuStreamDestroy(dispersionPmeStream);
cuEventDestroy(dispersionPmeSyncEvent);
}
} }
} }
} }
...@@ -1668,25 +1652,12 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1668,25 +1652,12 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
// Initialize nonbonded interactions. // Initialize nonbonded interactions.
int numParticles = force.getNumParticles(); int numParticles = force.getNumParticles();
// Pack the C6 coeffiecient in with sigma and epsilon, in case LJPME is being used. The C6 doLJPME = (nonbondedMethod == LJPME);
// coefficients could live in a separate array, but that would hurt cache efficiency for LJPME.
doLJPME = nonbondedMethod == LJPME;
sigmaEpsilon = CudaArray::create<float2>(cu, cu.getPaddedNumAtoms(), "sigmaEpsilon"); sigmaEpsilon = CudaArray::create<float2>(cu, cu.getPaddedNumAtoms(), "sigmaEpsilon");
if(doLJPME){
if (cu.getUseDoublePrecision())
C6s = CudaArray::create<double>(cu, cu.getPaddedNumAtoms(), "C6s");
else
C6s = CudaArray::create<float>(cu, cu.getPaddedNumAtoms(), "C6s");
}
CudaArray& posq = cu.getPosq(); CudaArray& posq = cu.getPosq();
vector<double4> temp(posq.getSize()); vector<double4> temp(posq.getSize());
float4* posqf = (float4*) &temp[0]; float4* posqf = (float4*) &temp[0];
double4* posqd = (double4*) &temp[0]; double4* posqd = (double4*) &temp[0];
// The C6 coefficients for LJPME could be computed from sigma and epsilon, but it
// seems like a good idea to cache them and avoid many recomputations.
vector<double> tmpc6(posq.getSize());
float* c6f = (float*) &tmpc6[0];
double* c6d = (double*) &tmpc6[0];
vector<float2> sigmaEpsilonVector(cu.getPaddedNumAtoms(), make_float2(0, 0)); vector<float2> sigmaEpsilonVector(cu.getPaddedNumAtoms(), make_float2(0, 0));
vector<vector<int> > exclusionList(numParticles); vector<vector<int> > exclusionList(numParticles);
double sumSquaredCharges = 0.0; double sumSquaredCharges = 0.0;
...@@ -1703,16 +1674,10 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1703,16 +1674,10 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
double sig = (float) (0.5*sigma); double sig = (float) (0.5*sigma);
double eps = (float) (2.0*sqrt(epsilon)); double eps = (float) (2.0*sqrt(epsilon));
sigmaEpsilonVector[i] = make_float2(sig, eps); sigmaEpsilonVector[i] = make_float2(sig, eps);
if(doLJPME){
float C6 = (float) 8.0*pow(sig, 3) * eps;
sumSquaredC6 += C6*C6;
if (cu.getUseDoublePrecision())
c6d[i] = 8.0*pow(sig,3)*eps;
else
c6f[i] = 8.0f*pow((float)sig,3)*eps;
}
exclusionList[i].push_back(i); exclusionList[i].push_back(i);
sumSquaredCharges += charge*charge; sumSquaredCharges += charge*charge;
double C6 = 8.0*sig*sig*sig*eps;
sumSquaredC6 += C6*C6;
if (charge != 0.0) if (charge != 0.0)
hasCoulomb = true; hasCoulomb = true;
if (epsilon != 0.0) if (epsilon != 0.0)
...@@ -1724,8 +1689,6 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1724,8 +1689,6 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
} }
posq.upload(&temp[0]); posq.upload(&temp[0]);
sigmaEpsilon->upload(sigmaEpsilonVector); sigmaEpsilon->upload(sigmaEpsilonVector);
if(doLJPME)
C6s->upload(&tmpc6[0]);
bool useCutoff = (nonbondedMethod != NoCutoff); bool useCutoff = (nonbondedMethod != NoCutoff);
bool usePeriodic = (nonbondedMethod != NoCutoff && nonbondedMethod != CutoffNonPeriodic); bool usePeriodic = (nonbondedMethod != NoCutoff && nonbondedMethod != CutoffNonPeriodic);
map<string, string> defines; map<string, string> defines;
...@@ -1755,7 +1718,6 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1755,7 +1718,6 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
dispersionCoefficient = 0.0; dispersionCoefficient = 0.0;
alpha = 0; alpha = 0;
ewaldSelfEnergy = 0.0; ewaldSelfEnergy = 0.0;
dispersionSelfEnergy = 0.0;
if (nonbondedMethod == Ewald) { if (nonbondedMethod == Ewald) {
// Compute the Ewald parameters. // Compute the Ewald parameters.
...@@ -1792,7 +1754,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1792,7 +1754,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
gridSizeX = CudaFFT3D::findLegalDimension(gridSizeX); gridSizeX = CudaFFT3D::findLegalDimension(gridSizeX);
gridSizeY = CudaFFT3D::findLegalDimension(gridSizeY); gridSizeY = CudaFFT3D::findLegalDimension(gridSizeY);
gridSizeZ = CudaFFT3D::findLegalDimension(gridSizeZ); gridSizeZ = CudaFFT3D::findLegalDimension(gridSizeZ);
if(doLJPME){ if (doLJPME) {
NonbondedForceImpl::calcPMEParameters(system, force, dispersionAlpha, dispersionGridSizeX, NonbondedForceImpl::calcPMEParameters(system, force, dispersionAlpha, dispersionGridSizeX,
dispersionGridSizeY, dispersionGridSizeZ, true); dispersionGridSizeY, dispersionGridSizeZ, true);
dispersionGridSizeX = CudaFFT3D::findLegalDimension(dispersionGridSizeX); dispersionGridSizeX = CudaFFT3D::findLegalDimension(dispersionGridSizeX);
...@@ -1801,7 +1763,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1801,7 +1763,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
} }
defines["EWALD_ALPHA"] = cu.doubleToString(alpha); defines["EWALD_ALPHA"] = cu.doubleToString(alpha);
if(doLJPME) { if (doLJPME) {
defines["EWALD_DISPERSION_ALPHA"] = cu.doubleToString(dispersionAlpha); defines["EWALD_DISPERSION_ALPHA"] = cu.doubleToString(dispersionAlpha);
double invRCut6 = pow(force.getCutoffDistance(), -6); double invRCut6 = pow(force.getCutoffDistance(), -6);
double dalphaR = dispersionAlpha * force.getCutoffDistance(); double dalphaR = dispersionAlpha * force.getCutoffDistance();
...@@ -1816,7 +1778,8 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1816,7 +1778,8 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
defines["DO_LJPME"] = doLJPME ? "1" : "0"; defines["DO_LJPME"] = doLJPME ? "1" : "0";
if (cu.getContextIndex() == 0) { if (cu.getContextIndex() == 0) {
ewaldSelfEnergy = -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI); ewaldSelfEnergy = -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI);
if(doLJPME) dispersionSelfEnergy = pow(dispersionAlpha, 6) * sumSquaredC6 / 12.0; if (doLJPME)
ewaldSelfEnergy += pow(dispersionAlpha, 6)*sumSquaredC6/12.0;
char deviceName[100]; char deviceName[100];
cuDeviceGetName(deviceName, 100, cu.getDevice()); cuDeviceGetName(deviceName, 100, cu.getDevice());
...@@ -1836,7 +1799,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1836,7 +1799,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
pmeDefines["USE_PME_STREAM"] = "1"; pmeDefines["USE_PME_STREAM"] = "1";
if (cu.getPlatformData().deterministicForces) if (cu.getPlatformData().deterministicForces)
pmeDefines["USE_DETERMINISTIC_FORCES"] = "1"; pmeDefines["USE_DETERMINISTIC_FORCES"] = "1";
if (doLJPME){ if (doLJPME) {
pmeDefines["EWALD_DISPERSION_ALPHA"] = cu.doubleToString(dispersionAlpha); pmeDefines["EWALD_DISPERSION_ALPHA"] = cu.doubleToString(dispersionAlpha);
pmeDefines["DISPERSION_GRID_SIZE_X"] = cu.intToString(dispersionGridSizeX); pmeDefines["DISPERSION_GRID_SIZE_X"] = cu.intToString(dispersionGridSizeX);
pmeDefines["DISPERSION_GRID_SIZE_Y"] = cu.intToString(dispersionGridSizeY); pmeDefines["DISPERSION_GRID_SIZE_Y"] = cu.intToString(dispersionGridSizeY);
...@@ -1844,11 +1807,10 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1844,11 +1807,10 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
pmeDefines["RECIP_DISPERSION_EXP_FACTOR"] = cu.doubleToString(M_PI*M_PI/(dispersionAlpha*dispersionAlpha)); pmeDefines["RECIP_DISPERSION_EXP_FACTOR"] = cu.doubleToString(M_PI*M_PI/(dispersionAlpha*dispersionAlpha));
} }
CUmodule module; CUmodule module;
if(doLJPME){ if (doLJPME)
module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::pme+CudaKernelSources::ljpme, pmeDefines); module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::pme+CudaKernelSources::ljpme, pmeDefines);
}else{ else
module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::pme, pmeDefines); module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::pme, pmeDefines);
}
if (cu.getPlatformData().useCpuPme) { if (cu.getPlatformData().useCpuPme) {
// Create the CPU PME kernel. // Create the CPU PME kernel.
...@@ -1859,7 +1821,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1859,7 +1821,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
pmeio = new PmeIO(cu, addForcesKernel); pmeio = new PmeIO(cu, addForcesKernel);
cu.addPreComputation(new PmePreComputation(cu, cpuPme, *pmeio)); cu.addPreComputation(new PmePreComputation(cu, cpuPme, *pmeio));
cu.addPostComputation(new PmePostComputation(cpuPme, *pmeio)); cu.addPostComputation(new PmePostComputation(cpuPme, *pmeio));
if(doLJPME){ if (doLJPME) {
cpuDispersionPme = getPlatform().createKernel(CalcDispersionPmeReciprocalForceKernel::Name(), *cu.getPlatformData().context); cpuDispersionPme = getPlatform().createKernel(CalcDispersionPmeReciprocalForceKernel::Name(), *cu.getPlatformData().context);
cpuDispersionPme.getAs<CalcDispersionPmeReciprocalForceKernel>().initialize(dispersionGridSizeX, dispersionGridSizeY, cpuDispersionPme.getAs<CalcDispersionPmeReciprocalForceKernel>().initialize(dispersionGridSizeX, dispersionGridSizeY,
dispersionGridSizeZ, numParticles, dispersionAlpha); dispersionGridSizeZ, numParticles, dispersionAlpha);
...@@ -1880,7 +1842,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1880,7 +1842,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge"); pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
cuFuncSetCacheConfig(pmeSpreadChargeKernel, CU_FUNC_CACHE_PREFER_L1); cuFuncSetCacheConfig(pmeSpreadChargeKernel, CU_FUNC_CACHE_PREFER_L1);
cuFuncSetCacheConfig(pmeInterpolateForceKernel, CU_FUNC_CACHE_PREFER_L1); cuFuncSetCacheConfig(pmeInterpolateForceKernel, CU_FUNC_CACHE_PREFER_L1);
if(doLJPME){ if (doLJPME) {
pmeDispersionFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadC6"); pmeDispersionFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadC6");
pmeDispersionGridIndexKernel = cu.getKernel(module, "findAtomDispersionGridIndex"); pmeDispersionGridIndexKernel = cu.getKernel(module, "findAtomDispersionGridIndex");
pmeDispersionSpreadChargeKernel = cu.getKernel(module, "gridSpreadC6"); pmeDispersionSpreadChargeKernel = cu.getKernel(module, "gridSpreadC6");
...@@ -1893,31 +1855,20 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1893,31 +1855,20 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
// Create required data structures. // Create required data structures.
int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float)); int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
directPmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*gridSizeZ, cu.getComputeCapability() >= 2.0 ? 2*elementSize : 2*sizeof(long long), "originalPmeGrid"); int gridElements = gridSizeX*gridSizeY*gridSizeZ;
reciprocalPmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*gridSizeZ, 2*elementSize, "reciprocalPmeGrid"); if (doLJPME)
gridElements = max(gridElements, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ);
directPmeGrid = new CudaArray(cu, gridElements, cu.getComputeCapability() >= 2.0 ? 2*elementSize : 2*sizeof(long long), "originalPmeGrid");
reciprocalPmeGrid = new CudaArray(cu, gridElements, 2*elementSize, "reciprocalPmeGrid");
cu.addAutoclearBuffer(*directPmeGrid); cu.addAutoclearBuffer(*directPmeGrid);
if(doLJPME){
directDispersionPmeGrid = new CudaArray(cu, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ,
cu.getComputeCapability() >= 2.0 ? 2*elementSize : 2*sizeof(long long), "originalDispersionPmeGrid");
cu.addAutoclearBuffer(*directDispersionPmeGrid);
reciprocalDispersionPmeGrid = new CudaArray(cu, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ,
2*elementSize, "reciprocalDispersionPmeGrid");
}
pmeBsplineModuliX = new CudaArray(cu, gridSizeX, elementSize, "pmeBsplineModuliX"); pmeBsplineModuliX = new CudaArray(cu, gridSizeX, elementSize, "pmeBsplineModuliX");
pmeBsplineModuliY = new CudaArray(cu, gridSizeY, elementSize, "pmeBsplineModuliY"); pmeBsplineModuliY = new CudaArray(cu, gridSizeY, elementSize, "pmeBsplineModuliY");
pmeBsplineModuliZ = new CudaArray(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ"); pmeBsplineModuliZ = new CudaArray(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
pmeAtomRange = CudaArray::create<int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange"); pmeAtomRange = CudaArray::create<int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
pmeAtomGridIndex = CudaArray::create<int2>(cu, numParticles, "pmeAtomGridIndex"); pmeAtomGridIndex = CudaArray::create<int2>(cu, numParticles, "pmeAtomGridIndex");
if(doLJPME)
pmeAtomDispersionGridIndex = CudaArray::create<int2>(cu, numParticles, "pmeAtomDispersionGridIndex");
int energyElementSize = (cu.getUseDoublePrecision() || cu.getUseMixedPrecision() ? sizeof(double) : sizeof(float)); int energyElementSize = (cu.getUseDoublePrecision() || cu.getUseMixedPrecision() ? sizeof(double) : sizeof(float));
pmeEnergyBuffer = new CudaArray(cu, cu.getNumThreadBlocks()*CudaContext::ThreadBlockSize, energyElementSize, "pmeEnergyBuffer"); pmeEnergyBuffer = new CudaArray(cu, cu.getNumThreadBlocks()*CudaContext::ThreadBlockSize, energyElementSize, "pmeEnergyBuffer");
cu.clearBuffer(*pmeEnergyBuffer); cu.clearBuffer(*pmeEnergyBuffer);
if(doLJPME){
dispersionPmeEnergyBuffer = new CudaArray(cu, cu.getNumThreadBlocks()*CudaContext::ThreadBlockSize, energyElementSize,
"dispersionPmeEnergyBuffer");
cu.clearBuffer(*dispersionPmeEnergyBuffer);
}
sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms()); sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms());
int cufftVersion; int cufftVersion;
cufftGetVersion(&cufftVersion); cufftGetVersion(&cufftVersion);
...@@ -1929,7 +1880,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1929,7 +1880,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
result = cufftPlan3d(&fftBackward, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_Z2D : CUFFT_C2R); result = cufftPlan3d(&fftBackward, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_Z2D : CUFFT_C2R);
if (result != CUFFT_SUCCESS) if (result != CUFFT_SUCCESS)
throw OpenMMException("Error initializing FFT: "+cu.intToString(result)); throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
if(doLJPME){ if (doLJPME) {
result = cufftPlan3d(&dispersionFftForward, dispersionGridSizeX, dispersionGridSizeY, result = cufftPlan3d(&dispersionFftForward, dispersionGridSizeX, dispersionGridSizeY,
dispersionGridSizeZ, cu.getUseDoublePrecision() ? CUFFT_D2Z : CUFFT_R2C); dispersionGridSizeZ, cu.getUseDoublePrecision() ? CUFFT_D2Z : CUFFT_R2C);
if (result != CUFFT_SUCCESS) if (result != CUFFT_SUCCESS)
...@@ -1959,19 +1910,6 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1959,19 +1910,6 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
recipForceGroup = force.getForceGroup(); recipForceGroup = force.getForceGroup();
cu.addPreComputation(new SyncStreamPreComputation(cu, pmeStream, pmeSyncEvent, recipForceGroup)); cu.addPreComputation(new SyncStreamPreComputation(cu, pmeStream, pmeSyncEvent, recipForceGroup));
cu.addPostComputation(new SyncStreamPostComputation(cu, pmeSyncEvent, cu.getKernel(module, "addEnergy"), *pmeEnergyBuffer, recipForceGroup)); cu.addPostComputation(new SyncStreamPostComputation(cu, pmeSyncEvent, cu.getKernel(module, "addEnergy"), *pmeEnergyBuffer, recipForceGroup));
if(doLJPME){
cuStreamCreate(&dispersionPmeStream, CU_STREAM_NON_BLOCKING);
// Dispersion terms use yet another stream.
if (useCudaFFT) {
cufftSetStream(dispersionFftForward, dispersionPmeStream);
cufftSetStream(dispersionFftBackward, dispersionPmeStream);
}
CHECK_RESULT(cuEventCreate(&dispersionPmeSyncEvent, CU_EVENT_DISABLE_TIMING),
"Error creating event for Dispersion term of NonbondedForce");
// The force group is the same as the electrostatic PME reciprocal force group.
cu.addPreComputation(new SyncStreamPreComputation(cu, dispersionPmeStream, dispersionPmeSyncEvent, recipForceGroup));
cu.addPostComputation(new SyncStreamPostComputation(cu, dispersionPmeSyncEvent, cu.getKernel(module, "addEnergy"), *dispersionPmeEnergyBuffer, recipForceGroup));
}
} }
hasInitializedFFT = true; hasInitializedFFT = true;
// Initialize the b-spline moduli. // Initialize the b-spline moduli.
...@@ -2051,18 +1989,9 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -2051,18 +1989,9 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
string source = cu.replaceStrings(CudaKernelSources::coulombLennardJones, defines); string source = cu.replaceStrings(CudaKernelSources::coulombLennardJones, defines);
cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup(), true); cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup(), true);
if (hasLJ){ if (hasLJ)
cu.getNonbondedUtilities().addParameter(CudaNonbondedUtilities::ParameterInfo("sigmaEpsilon", "float", 2, cu.getNonbondedUtilities().addParameter(CudaNonbondedUtilities::ParameterInfo("sigmaEpsilon", "float", 2,
sizeof(float2), sigmaEpsilon->getDevicePointer())); sizeof(float2), sigmaEpsilon->getDevicePointer()));
if(doLJPME){
if (cu.getUseDoublePrecision())
cu.getNonbondedUtilities().addParameter(CudaNonbondedUtilities::ParameterInfo("C6s", "double", 1,
sizeof(double), C6s->getDevicePointer()));
else
cu.getNonbondedUtilities().addParameter(CudaNonbondedUtilities::ParameterInfo("C6s", "float", 1,
sizeof(float), C6s->getDevicePointer()));
}
}
// Initialize the exceptions. // Initialize the exceptions.
...@@ -2142,7 +2071,7 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF ...@@ -2142,7 +2071,7 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
if (cu.getUseDoublePrecision() || cu.getComputeCapability() < 2.0 || cu.getPlatformData().deterministicForces) { if (cu.getUseDoublePrecision() || cu.getComputeCapability() < 2.0 || cu.getPlatformData().deterministicForces) {
void* finishSpreadArgs[] = {&directPmeGrid->getDevicePointer()}; void* finishSpreadArgs[] = {&directPmeGrid->getDevicePointer()};
cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, directPmeGrid->getSize(), 256); cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
} }
if (useCudaFFT) { if (useCudaFFT) {
...@@ -2181,84 +2110,74 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF ...@@ -2181,84 +2110,74 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(), cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex->getDevicePointer()}; recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex->getDevicePointer()};
cu.executeKernel(pmeInterpolateForceKernel, interpolateArgs, cu.getNumAtoms(), 128); cu.executeKernel(pmeInterpolateForceKernel, interpolateArgs, cu.getNumAtoms(), 128);
if (usePmeStream) {
cuEventRecord(pmeSyncEvent, pmeStream);
cu.restoreDefaultStream();
}
// As written, we check only the Electrostatic grid pointer to get here. We could separate them out, but for // As written, we check only the Electrostatic grid pointer to get here. We could separate them out, but for
// now we assume that LJPME can only be used if electrostatic PME is also active. // now we assume that LJPME can only be used if electrostatic PME is also active.
if(doLJPME){ if (doLJPME) {
if (usePmeStream) void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.setCurrentStream(dispersionPmeStream);
void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomDispersionGridIndex->getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(), cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]}; recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeDispersionGridIndexKernel, gridIndexArgs, cu.getNumAtoms()); cu.executeKernel(pmeDispersionGridIndexKernel, gridIndexArgs, cu.getNumAtoms());
sort->sort(*pmeAtomDispersionGridIndex); sort->sort(*pmeAtomGridIndex);
void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &directDispersionPmeGrid->getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.clearBuffer(*directPmeGrid);
void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &directPmeGrid->getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(), cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomDispersionGridIndex->getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex->getDevicePointer(),
&C6s->getDevicePointer()}; &sigmaEpsilon->getDevicePointer()};
cu.executeKernel(pmeDispersionSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128); cu.executeKernel(pmeDispersionSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128);
if (cu.getUseDoublePrecision() || cu.getComputeCapability() < 2.0 || cu.getPlatformData().deterministicForces) { if (cu.getUseDoublePrecision() || cu.getComputeCapability() < 2.0 || cu.getPlatformData().deterministicForces) {
void* finishSpreadArgs[] = {&directDispersionPmeGrid->getDevicePointer()}; void* finishSpreadArgs[] = {&directPmeGrid->getDevicePointer()};
cu.executeKernel(pmeDispersionFinishSpreadChargeKernel, finishSpreadArgs, directDispersionPmeGrid->getSize(), 256); cu.executeKernel(pmeDispersionFinishSpreadChargeKernel, finishSpreadArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
} }
if (useCudaFFT) { if (useCudaFFT) {
if (cu.getUseDoublePrecision()) if (cu.getUseDoublePrecision())
cufftExecD2Z(dispersionFftForward, (double*) directDispersionPmeGrid->getDevicePointer(), (double2*) reciprocalDispersionPmeGrid->getDevicePointer()); cufftExecD2Z(dispersionFftForward, (double*) directPmeGrid->getDevicePointer(), (double2*) reciprocalPmeGrid->getDevicePointer());
else else
cufftExecR2C(dispersionFftForward, (float*) directDispersionPmeGrid->getDevicePointer(), (float2*) reciprocalDispersionPmeGrid->getDevicePointer()); cufftExecR2C(dispersionFftForward, (float*) directPmeGrid->getDevicePointer(), (float2*) reciprocalPmeGrid->getDevicePointer());
} }
else { else {
fft->execFFT(*directDispersionPmeGrid, *reciprocalDispersionPmeGrid, true); fft->execFFT(*directPmeGrid, *reciprocalPmeGrid, true);
} }
if (includeEnergy) { if (includeEnergy) {
void* computeEnergyArgs[] = {&reciprocalDispersionPmeGrid->getDevicePointer(), usePmeStream ? &dispersionPmeEnergyBuffer->getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(), void* computeEnergyArgs[] = {&reciprocalPmeGrid->getDevicePointer(), usePmeStream ? &pmeEnergyBuffer->getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(),
&pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(), &pmeBsplineModuliZ->getDevicePointer(), &pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(), &pmeBsplineModuliZ->getDevicePointer(),
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
&C6s->getDevicePointer()};
cu.executeKernel(pmeEvalDispersionEnergyKernel, computeEnergyArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ); cu.executeKernel(pmeEvalDispersionEnergyKernel, computeEnergyArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ);
} }
void* convolutionArgs[] = {&reciprocalDispersionPmeGrid->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(), void* convolutionArgs[] = {&reciprocalPmeGrid->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
&pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(), &pmeBsplineModuliZ->getDevicePointer(), &pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(), &pmeBsplineModuliZ->getDevicePointer(),
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]}; cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeDispersionConvolutionKernel, convolutionArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256); cu.executeKernel(pmeDispersionConvolutionKernel, convolutionArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
if (useCudaFFT) { if (useCudaFFT) {
if (cu.getUseDoublePrecision()) if (cu.getUseDoublePrecision())
cufftExecZ2D(dispersionFftBackward, (double2*) reciprocalDispersionPmeGrid->getDevicePointer(), (double*) directDispersionPmeGrid->getDevicePointer()); cufftExecZ2D(dispersionFftBackward, (double2*) reciprocalPmeGrid->getDevicePointer(), (double*) directPmeGrid->getDevicePointer());
else else
cufftExecC2R(dispersionFftBackward, (float2*) reciprocalDispersionPmeGrid->getDevicePointer(), (float*) directDispersionPmeGrid->getDevicePointer()); cufftExecC2R(dispersionFftBackward, (float2*) reciprocalPmeGrid->getDevicePointer(), (float*) directPmeGrid->getDevicePointer());
} }
else { else {
fft->execFFT(*reciprocalDispersionPmeGrid, *directDispersionPmeGrid, false); fft->execFFT(*reciprocalPmeGrid, *directPmeGrid, false);
} }
void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &directDispersionPmeGrid->getDevicePointer(), cu.getPeriodicBoxSizePointer(), void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &directPmeGrid->getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(), cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomDispersionGridIndex->getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex->getDevicePointer(),
&C6s->getDevicePointer()}; &sigmaEpsilon->getDevicePointer()};
cu.executeKernel(pmeInterpolateDispersionForceKernel, interpolateArgs, cu.getNumAtoms(), 128); cu.executeKernel(pmeInterpolateDispersionForceKernel, interpolateArgs, cu.getNumAtoms(), 128);
}
if (usePmeStream) { if (usePmeStream) {
cuEventRecord(dispersionPmeSyncEvent, dispersionPmeStream); cuEventRecord(pmeSyncEvent, pmeStream);
cu.restoreDefaultStream(); cu.restoreDefaultStream();
} }
} }
}
double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0); double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0);
if(includeReciprocal) energy += dispersionSelfEnergy;
if (dispersionCoefficient != 0.0 && includeDirect) { if (dispersionCoefficient != 0.0 && includeDirect) {
double4 boxSize = cu.getPeriodicBoxSize(); double4 boxSize = cu.getPeriodicBoxSize();
energy += dispersionCoefficient/(boxSize.x*boxSize.y*boxSize.z); energy += dispersionCoefficient/(boxSize.x*boxSize.y*boxSize.z);
...@@ -2304,7 +2223,6 @@ void CudaCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context, ...@@ -2304,7 +2223,6 @@ void CudaCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context,
float4* posqf = (float4*) cu.getPinnedBuffer(); float4* posqf = (float4*) cu.getPinnedBuffer();
double4* posqd = (double4*) cu.getPinnedBuffer(); double4* posqd = (double4*) cu.getPinnedBuffer();
vector<float2> sigmaEpsilonVector(cu.getPaddedNumAtoms(), make_float2(0, 0)); vector<float2> sigmaEpsilonVector(cu.getPaddedNumAtoms(), make_float2(0, 0));
vector<float3> sigmaEpsilonC6Vector(cu.getPaddedNumAtoms(), make_float3(0, 0, 0));
double sumSquaredCharges = 0.0; double sumSquaredCharges = 0.0;
double sumSquaredC6 = 0.0; double sumSquaredC6 = 0.0;
const vector<int>& order = cu.getAtomIndex(); const vector<int>& order = cu.getAtomIndex();
...@@ -2318,21 +2236,13 @@ void CudaCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context, ...@@ -2318,21 +2236,13 @@ void CudaCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context,
posqf[i].w = (float) charge; posqf[i].w = (float) charge;
float sig = (float) (0.5*sigma); float sig = (float) (0.5*sigma);
float eps = (float) (2.0*sqrt(epsilon)); float eps = (float) (2.0*sqrt(epsilon));
if(doLJPME){
float C6 = (float) 8.0*pow(sig, 3) * eps;
sumSquaredC6 += C6*C6;
sigmaEpsilonC6Vector[index] = make_float3(sig, eps, C6);
}else{
sigmaEpsilonVector[index] = make_float2(sig, eps); sigmaEpsilonVector[index] = make_float2(sig, eps);
} double C6 = 8.0*sig*sig*sig*eps;
sumSquaredC6 += C6*C6;
sumSquaredCharges += charge*charge; sumSquaredCharges += charge*charge;
} }
posq.upload(cu.getPinnedBuffer()); posq.upload(cu.getPinnedBuffer());
if(doLJPME){
sigmaEpsilon->upload(sigmaEpsilonC6Vector);
}else{
sigmaEpsilon->upload(sigmaEpsilonVector); sigmaEpsilon->upload(sigmaEpsilonVector);
}
// Record the exceptions. // Record the exceptions.
...@@ -2349,10 +2259,10 @@ void CudaCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context, ...@@ -2349,10 +2259,10 @@ void CudaCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context,
// Compute other values. // Compute other values.
if (nonbondedMethod == Ewald || nonbondedMethod == PME) if (nonbondedMethod == Ewald || nonbondedMethod == PME || nonbondedMethod == LJPME)
ewaldSelfEnergy = (cu.getContextIndex() == 0 ? -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI) : 0.0); ewaldSelfEnergy = (cu.getContextIndex() == 0 ? -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI) : 0.0);
if (nonbondedMethod == LJPME) if (nonbondedMethod == LJPME)
dispersionSelfEnergy = (cu.getContextIndex() == 0 ? pow(dispersionAlpha, 6) * sumSquaredC6 / 12.0 : 0); ewaldSelfEnergy += (cu.getContextIndex() == 0 ? pow(dispersionAlpha, 6)*sumSquaredC6/12.0 : 0);
if (force.getUseDispersionCorrection() && cu.getContextIndex() == 0 && (nonbondedMethod == CutoffPeriodic || nonbondedMethod == Ewald || nonbondedMethod == PME)) if (force.getUseDispersionCorrection() && cu.getContextIndex() == 0 && (nonbondedMethod == CutoffPeriodic || nonbondedMethod == Ewald || nonbondedMethod == PME))
dispersionCoefficient = NonbondedForceImpl::calcDispersionCorrection(context.getSystem(), force); dispersionCoefficient = NonbondedForceImpl::calcDispersionCorrection(context.getSystem(), force);
cu.invalidateMolecules(); cu.invalidateMolecules();
......
...@@ -30,7 +30,8 @@ ...@@ -30,7 +30,8 @@
const real dar6 = dar4*dar2; const real dar6 = dar4*dar2;
const real invR2 = invR*invR; const real invR2 = invR*invR;
const real expDar2 = EXP(-dar2); const real expDar2 = EXP(-dar2);
const real c6 = C6s1*C6s2; const real2 sigExpProd = sigmaEpsilon1*sigmaEpsilon2;
const real c6 = 64*sigExpProd.x*sigExpProd.x*sigExpProd.x*sigExpProd.y;
const real coef = invR2*invR2*invR2*c6; const real coef = invR2*invR2*invR2*c6;
const real eprefac = 1.0f + dar2 + 0.5f*dar4; const real eprefac = 1.0f + dar2 + 0.5f*dar4;
const real dprefac = eprefac + dar6/6.0f; const real dprefac = eprefac + dar6/6.0f;
......
...@@ -22,7 +22,7 @@ extern "C" __global__ void findAtomDispersionGridIndex(const real4* __restrict__ ...@@ -22,7 +22,7 @@ extern "C" __global__ void findAtomDispersionGridIndex(const real4* __restrict__
extern "C" __global__ void gridSpreadC6(const real4* __restrict__ posq, real* __restrict__ originalPmeGrid, extern "C" __global__ void gridSpreadC6(const real4* __restrict__ posq, real* __restrict__ originalPmeGrid,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ, const int2* __restrict__ pmeAtomGridIndex, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ, const int2* __restrict__ pmeAtomGridIndex,
const real* __restrict__ C6s) { const real2* __restrict__ sigmaEpsilon) {
real3 data[PME_ORDER]; real3 data[PME_ORDER];
const real scale = RECIP(PME_ORDER-1); const real scale = RECIP(PME_ORDER-1);
...@@ -64,6 +64,8 @@ extern "C" __global__ void gridSpreadC6(const real4* __restrict__ posq, real* __ ...@@ -64,6 +64,8 @@ extern "C" __global__ void gridSpreadC6(const real4* __restrict__ posq, real* __
// Spread the charge from this atom onto each grid point. // Spread the charge from this atom onto each grid point.
const real2 sigEps = sigmaEpsilon[atom];
const real c6 = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
for (int ix = 0; ix < PME_ORDER; ix++) { for (int ix = 0; ix < PME_ORDER; ix++) {
int xbase = gridIndex.x+ix; int xbase = gridIndex.x+ix;
xbase -= (xbase >= DISPERSION_GRID_SIZE_X ? DISPERSION_GRID_SIZE_X : 0); xbase -= (xbase >= DISPERSION_GRID_SIZE_X ? DISPERSION_GRID_SIZE_X : 0);
...@@ -81,8 +83,7 @@ extern "C" __global__ void gridSpreadC6(const real4* __restrict__ posq, real* __ ...@@ -81,8 +83,7 @@ extern "C" __global__ void gridSpreadC6(const real4* __restrict__ posq, real* __
zindex -= (zindex >= DISPERSION_GRID_SIZE_Z ? DISPERSION_GRID_SIZE_Z : 0); zindex -= (zindex >= DISPERSION_GRID_SIZE_Z ? DISPERSION_GRID_SIZE_Z : 0);
int index = ybase + zindex; int index = ybase + zindex;
// We need to grab the C6 coefficient from the array real add = c6*dx*dy*data[iz].z;
real add = C6s[atom]*dx*dy*data[iz].z;
#ifdef USE_DOUBLE_PRECISION #ifdef USE_DOUBLE_PRECISION
unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid; unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid;
atomicAdd(&ulonglong_p[index], static_cast<unsigned long long>((long long) (add*0x100000000))); atomicAdd(&ulonglong_p[index], static_cast<unsigned long long>((long long) (add*0x100000000)));
...@@ -241,7 +242,7 @@ gridEvaluateDispersionEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __r ...@@ -241,7 +242,7 @@ gridEvaluateDispersionEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __r
extern "C" __global__ extern "C" __global__
void gridInterpolateDispersionForce(const real4* __restrict__ posq, unsigned long long* __restrict__ forceBuffers, const real* __restrict__ originalPmeGrid, void gridInterpolateDispersionForce(const real4* __restrict__ posq, unsigned long long* __restrict__ forceBuffers, const real* __restrict__ originalPmeGrid,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ, const int2* __restrict__ pmeAtomGridIndex, const real* __restrict__ C6s) { real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ, const int2* __restrict__ pmeAtomGridIndex, const real2* __restrict__ sigmaEpsilon) {
real3 data[PME_ORDER]; real3 data[PME_ORDER];
real3 ddata[PME_ORDER]; real3 ddata[PME_ORDER];
const real scale = RECIP(PME_ORDER-1); const real scale = RECIP(PME_ORDER-1);
...@@ -313,7 +314,8 @@ void gridInterpolateDispersionForce(const real4* __restrict__ posq, unsigned lon ...@@ -313,7 +314,8 @@ void gridInterpolateDispersionForce(const real4* __restrict__ posq, unsigned lon
} }
} }
} }
real q = C6s[atom]; const real2 sigEps = sigmaEpsilon[atom];
real q = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
real forceX = -q*(force.x*DISPERSION_GRID_SIZE_X*recipBoxVecX.x); real forceX = -q*(force.x*DISPERSION_GRID_SIZE_X*recipBoxVecX.x);
real forceY = -q*(force.x*DISPERSION_GRID_SIZE_X*recipBoxVecY.x+force.y*DISPERSION_GRID_SIZE_Y*recipBoxVecY.y); real forceY = -q*(force.x*DISPERSION_GRID_SIZE_X*recipBoxVecY.x+force.y*DISPERSION_GRID_SIZE_Y*recipBoxVecY.y);
real forceZ = -q*(force.x*DISPERSION_GRID_SIZE_X*recipBoxVecZ.x+force.y*DISPERSION_GRID_SIZE_Y*recipBoxVecZ.y+force.z*DISPERSION_GRID_SIZE_Z*recipBoxVecZ.z); real forceZ = -q*(force.x*DISPERSION_GRID_SIZE_X*recipBoxVecZ.x+force.y*DISPERSION_GRID_SIZE_Y*recipBoxVecZ.y+force.z*DISPERSION_GRID_SIZE_Z*recipBoxVecZ.z);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment