Merged changes from main branch

cd874b2b · peastman · a783b996 · b84e22ba · cd874b2b · cd874b2b
Commit cd874b2b authored Feb 22, 2017 by peastman
20 changed files
--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -69,8 +69,8 @@ static void CL_CALLBACK errorCallback(const char* errinfo, const void* private_i
 OpenCLContext::OpenCLContext(const System& system, int platformIndex, int deviceIndex, const string& precision, OpenCLPlatform::PlatformData& platformData) :
        system(system), time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), stepsSinceReorder(99999), atomsWereReordered(false), posq(NULL),
-        posqCorrection(NULL), velm(NULL), forceBuffers(NULL), longForceBuffer(NULL), energyBuffer(NULL), energyParamDerivBuffer(NULL), atomIndexDevice(NULL), integration(NULL),
+        posqCorrection(NULL), velm(NULL), forceBuffers(NULL), longForceBuffer(NULL), energyBuffer(NULL), energyParamDerivBuffer(NULL), atomIndexDevice(NULL),
-        expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
+        chargeBuffer(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
    if (precision == "single") {
        useDoublePrecision = false;
        useMixedPrecision = false;
@@ -309,6 +309,7 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
    reduceReal4Kernel = cl::Kernel(utilities, "reduceReal4Buffer");
    if (supports64BitGlobalAtomics)
        reduceForcesKernel = cl::Kernel(utilities, "reduceForces");
+    setChargesKernel = cl::Kernel(utilities, "setCharges");
    // Decide whether native_sqrt(), native_rsqrt(), and native_recip() are sufficiently accurate to use.
@@ -439,6 +440,8 @@ OpenCLContext::~OpenCLContext() {
        delete energyParamDerivBuffer;
    if (atomIndexDevice != NULL)
        delete atomIndexDevice;
+    if (chargeBuffer != NULL)
+        delete chargeBuffer;
    if (integration != NULL)
        delete integration;
    if (expression != NULL)
@@ -747,6 +750,28 @@ void OpenCLContext::reduceBuffer(OpenCLArray& array, int numBuffers) {
    executeKernel(reduceReal4Kernel, bufferSize, 128);
 }
+void OpenCLContext::setCharges(const vector<double>& charges) {
+    if (chargeBuffer == NULL)
+        chargeBuffer = new OpenCLArray(*this, numAtoms, useDoublePrecision ? sizeof(double) : sizeof(float), "chargeBuffer");
+    if (getUseDoublePrecision()) {
+        double* c = (double*) getPinnedBuffer();
+        for (int i = 0; i < charges.size(); i++)
+            c[i] = charges[i];
+        chargeBuffer->upload(c);
+    }
+    else {
+        float* c = (float*) getPinnedBuffer();
+        for (int i = 0; i < charges.size(); i++)
+            c[i] = (float) charges[i];
+        chargeBuffer->upload(c);
+    }
+    setChargesKernel.setArg<cl::Buffer>(0, chargeBuffer->getDeviceBuffer());
+    setChargesKernel.setArg<cl::Buffer>(1, posq->getDeviceBuffer());
+    setChargesKernel.setArg<cl::Buffer>(2, atomIndexDevice->getDeviceBuffer());
+    setChargesKernel.setArg<cl_int>(3, numAtoms);
+    executeKernel(setChargesKernel, numAtoms);
+}
 /**
 * This class ensures that atom reordering doesn't break virtual sites.
 */
@@ -945,9 +970,19 @@ void OpenCLContext::findMoleculeGroups() {
 }
 void OpenCLContext::invalidateMolecules() {
+    for (int i = 0; i < forces.size(); i++)
+        if (invalidateMolecules(forces[i]))
+            return;
+}
+bool OpenCLContext::invalidateMolecules(OpenCLForceInfo* force) {
    if (numAtoms == 0 || nonbonded == NULL || !nonbonded->getUseCutoff())
-        return;
+        return false;
    bool valid = true;
+    int forceIndex = -1;
+    for (int i = 0; i < forces.size(); i++)
+        if (forces[i] == force)
+            forceIndex = i;
    for (int group = 0; valid && group < (int) moleculeGroups.size(); group++) {
        MoleculeGroup& mol = moleculeGroups[group];
        vector<int>& instances = mol.instances;
@@ -962,22 +997,21 @@ void OpenCLContext::invalidateMolecules() {
            Molecule& m2 = molecules[instances[j]];
            int offset2 = offsets[j];
            for (int i = 0; i < (int) atoms.size() && valid; i++) {
-                for (int k = 0; k < (int) forces.size(); k++)
+                if (!force->areParticlesIdentical(atoms[i]+offset1, atoms[i]+offset2))
-                    if (!forces[k]->areParticlesIdentical(atoms[i]+offset1, atoms[i]+offset2))
+                    valid = false;
-                        valid = false;
            }
            // See if the force groups are identical.
-            for (int i = 0; i < (int) forces.size() && valid; i++) {
+            if (valid && forceIndex > -1) {
-                for (int k = 0; k < (int) m1.groups[i].size() && valid; k++)
+                for (int k = 0; k < (int) m1.groups[forceIndex].size() && valid; k++)
-                    if (!forces[i]->areGroupsIdentical(m1.groups[i][k], m2.groups[i][k]))
+                    if (!force->areGroupsIdentical(m1.groups[forceIndex][k], m2.groups[forceIndex][k]))
                        valid = false;
            }
        }
    }
    if (valid)
-        return;
+        return false;
    // The list of which molecules are identical is no longer valid.  We need to restore the
    // atoms to their original order, rebuild the list of identical molecules, and sort them
@@ -1045,6 +1079,7 @@ void OpenCLContext::invalidateMolecules() {
    for (int i = 0; i < (int) reorderListeners.size(); i++)
        reorderListeners[i]->execute();
    reorderAtoms();
+    return true;
 }
 void OpenCLContext::reorderAtoms() {

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
--- a/platforms/opencl/src/OpenCLParallelKernels.cpp
+++ b/platforms/opencl/src/OpenCLParallelKernels.cpp
@@ -583,6 +583,10 @@ void OpenCLParallelCalcNonbondedForceKernel::getPMEParameters(double& alpha, int
    dynamic_cast<const OpenCLCalcNonbondedForceKernel&>(kernels[0].getImpl()).getPMEParameters(alpha, nx, ny, nz);
 }
+void OpenCLParallelCalcNonbondedForceKernel::getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
+    dynamic_cast<const OpenCLCalcNonbondedForceKernel&>(kernels[0].getImpl()).getLJPMEParameters(alpha, nx, ny, nz);
+}
 class OpenCLParallelCalcCustomNonbondedForceKernel::Task : public OpenCLContext::WorkTask {
 public:
    Task(ContextImpl& context, OpenCLCalcCustomNonbondedForceKernel& kernel, bool includeForce,

--- a/platforms/opencl/src/kernels/coulombLennardJones.cl
+++ b/platforms/opencl/src/kernels/coulombLennardJones.cl
@@ -22,6 +22,26 @@
    const real erfcAlphaR = (0.254829592f+(-0.284496736f+(1.421413741f+(-1.453152027f+1.061405429f*t)*t)*t)*t)*t*expAlphaRSqr;
 #endif
    real tempForce = 0;
+#if HAS_LENNARD_JONES
+    // The multiplicative term to correct for the multiplicative terms that are always
+    // present in reciprocal space.  The real terms have an additive contribution
+    // added in, but for excluded terms the multiplicative term is just subtracted.
+    // These factors are needed in both clauses of the needCorrection statement, so
+    // I declare them up here.
+    #if DO_LJPME
+        const real dispersionAlphaR = EWALD_DISPERSION_ALPHA*r;
+        const real dar2 = dispersionAlphaR*dispersionAlphaR;
+        const real dar4 = dar2*dar2;
+        const real dar6 = dar4*dar2;
+        const real invR2 = invR*invR;
+        const real expDar2 = EXP(-dar2);
+        const float2 sigExpProd = sigmaEpsilon1*sigmaEpsilon2;
+        const real c6 = 64*sigExpProd.x*sigExpProd.x*sigExpProd.x*sigExpProd.y;
+        const real coef = invR2*invR2*invR2*c6;
+        const real eprefac = 1.0f + dar2 + 0.5f*dar4;
+        const real dprefac = eprefac + dar6/6.0f;
+    #endif
+#endif
    if (needCorrection) {
        // Subtract off the part of this interaction that was included in the reciprocal space contribution.
@@ -34,6 +54,13 @@
            includeInteraction = false;
            tempEnergy -= TWO_OVER_SQRT_PI*EWALD_ALPHA*138.935456f*posq1.w*posq2.w;
        }
+#if HAS_LENNARD_JONES
+        #if DO_LJPME
+            // The multiplicative grid term
+            tempEnergy += coef*(1.0f - expDar2*eprefac);
+            tempForce += 6.0f*coef*(1.0f - expDar2*dprefac);
+        #endif
+#endif
    }
    else {
 #if HAS_LENNARD_JONES
@@ -41,7 +68,8 @@
        real sig2 = invR*sig;
        sig2 *= sig2;
        real sig6 = sig2*sig2*sig2;
-        real epssig6 = sig6*(sigmaEpsilon1.y*sigmaEpsilon2.y);
+        real eps = sigmaEpsilon1.y*sigmaEpsilon2.y;
+        real epssig6 = sig6*eps;
        tempForce = epssig6*(12.0f*sig6 - 6.0f);
        real ljEnergy = epssig6*(sig6 - 1.0f);
        #if USE_LJ_SWITCH
@@ -53,6 +81,22 @@
            ljEnergy *= switchValue;
        }
        #endif
+#if DO_LJPME
+        // The multiplicative grid term
+        ljEnergy += coef*(1.0f - expDar2*eprefac);
+        tempForce += 6.0f*coef*(1.0f - expDar2*dprefac);
+        // The potential shift accounts for the step at the cutoff introduced by the
+        // transition from additive to multiplicative combintion rules and is only
+        // needed for the real (not excluded) terms.  By addin these terms to ljEnergy
+        // instead of tempEnergy here, the includeInteraction mask is correctly applied.
+        sig2 = sig*sig;
+        sig6 = sig2*sig2*sig2*INVCUT6;
+        epssig6 = eps*sig6;
+        // The additive part of the potential shift
+        ljEnergy += epssig6*(1.0f - sig6);
+        // The multiplicative part of the potential shift
+        ljEnergy += MULTSHIFT6*c6;
+#endif
        tempForce += prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
        tempEnergy += select((real) 0, ljEnergy + prefactor*erfcAlphaR, includeInteraction);
 #else

--- a/platforms/opencl/src/kernels/pme.cl
+++ b/platforms/opencl/src/kernels/pme.cl
--- a/platforms/opencl/src/kernels/utilities.cl
+++ b/platforms/opencl/src/kernels/utilities.cl
@@ -107,3 +107,11 @@ __kernel void determineNativeAccuracy(__global float8* restrict values, int numV
        values[i] = (float8) (v, native_sqrt(v), native_rsqrt(v), native_recip(v), native_exp(v), native_log(v), 0.0f, 0.0f);
    }
 }
+/**
+ * Record the atomic charges into the posq array.
+ */
+__kernel void setCharges(__global real* restrict charges, __global real4* restrict posq, __global int* restrict atomOrder, int numAtoms) {
+    for (int i = get_global_id(0); i < numAtoms; i += get_global_size(0))
+        posq[i].w = charges[atomOrder[i]];
+}
\ No newline at end of file
--- a/platforms/opencl/staticTarget/CMakeLists.txt
+++ b/platforms/opencl/staticTarget/CMakeLists.txt
@@ -14,7 +14,6 @@ SET_SOURCE_FILES_PROPERTIES(${CL_KERNELS_CPP} ${CL_KERNELS_H} PROPERTIES GENERAT
 ADD_LIBRARY(${STATIC_TARGET} STATIC ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
 TARGET_LINK_LIBRARIES(${STATIC_TARGET} ${OPENMM_LIBRARY_NAME}  ${OPENCL_LIBRARIES} ${PTHREADS_LIB_STATIC})
-#-DPTW32_STATIC_LIB only works for the windows pthreads.
+SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_OPENCL_BUILDING_STATIC_LIBRARY")
-SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_OPENCL_BUILDING_STATIC_LIBRARY -DPTW32_STATIC_LIB")
 INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${STATIC_TARGET})
--- a/platforms/opencl/tests/TestOpenCLDispersionPME.cpp
+++ b/platforms/opencl/tests/TestOpenCLDispersionPME.cpp
--- a/platforms/reference/include/ReferenceKernels.h
+++ b/platforms/reference/include/ReferenceKernels.h
@@ -604,12 +604,21 @@ public:
     * @param nz      the number of grid points along the Z axis
     */
    void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
+    /**
+     * Get the dispersion parameters being used for the dispersion term in LJPME.
+     *
+     * @param alpha   the separation parameter
+     * @param nx      the number of grid points along the X axis
+     * @param ny      the number of grid points along the Y axis
+     * @param nz      the number of grid points along the Z axis
+     */
+    void getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
 private:
    int numParticles, num14;
    int **bonded14IndexArray;
    double **particleParamArray, **bonded14ParamArray;
-    double nonbondedCutoff, switchingDistance, rfDielectric, ewaldAlpha, dispersionCoefficient;
+    double nonbondedCutoff, switchingDistance, rfDielectric, ewaldAlpha, ewaldDispersionAlpha, dispersionCoefficient;
-    int kmax[3], gridSize[3];
+    int kmax[3], gridSize[3], dispersionGridSize[3];
    bool useSwitchingFunction;
    std::vector<std::set<int> > exclusions;
    NonbondedMethod nonbondedMethod;

--- a/platforms/reference/include/ReferenceLJCoulombIxn.h
+++ b/platforms/reference/include/ReferenceLJCoulombIxn.h
--- a/platforms/reference/include/ReferencePME.h
+++ b/platforms/reference/include/ReferencePME.h
--- a/platforms/reference/src/ReferenceKernels.cpp
+++ b/platforms/reference/src/ReferenceKernels.cpp
--- a/platforms/reference/src/SimTKReference/ReferenceCCMAAlgorithm.cpp
+++ b/platforms/reference/src/SimTKReference/ReferenceCCMAAlgorithm.cpp
--- a/platforms/reference/src/SimTKReference/ReferenceLJCoulombIxn.cpp
+++ b/platforms/reference/src/SimTKReference/ReferenceLJCoulombIxn.cpp
--- a/platforms/reference/src/SimTKReference/ReferencePME.cpp
+++ b/platforms/reference/src/SimTKReference/ReferencePME.cpp
--- a/platforms/reference/tests/TestReferenceDispersionPME.cpp
+++ b/platforms/reference/tests/TestReferenceDispersionPME.cpp
--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
--- a/plugins/amoeba/platforms/cuda/src/kernels/multipoleElectrostatics.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/multipoleElectrostatics.cu
--- a/plugins/amoeba/platforms/cuda/tests/TestCudaAmoebaExtrapolatedPolarization.cpp
+++ b/plugins/amoeba/platforms/cuda/tests/TestCudaAmoebaExtrapolatedPolarization.cpp
--- a/plugins/amoeba/platforms/reference/src/AmoebaReferenceKernels.cpp
+++ b/plugins/amoeba/platforms/reference/src/AmoebaReferenceKernels.cpp