Merge pull request #2318 from peastman/exclusions

Revised how exclusions are computed with PME

Merge pull request #2318 from peastman/exclusions
Revised how exclusions are computed with PME
0a4867a9 · peastman · GitHub · df52ff73 · de4c9900 · 0a4867a9
Unverified Commit 0a4867a9 authored Jun 11, 2019 by peastman Committed by GitHub Jun 11, 2019
10 changed files
--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -678,6 +678,8 @@ private:
    CudaArray charges;
    CudaArray sigmaEpsilon;
    CudaArray exceptionParams;
+    CudaArray exclusionAtoms;
+    CudaArray exclusionParams;
    CudaArray baseParticleParams;
    CudaArray baseExceptionParams;
    CudaArray particleParamOffsets;
@@ -707,7 +709,7 @@ private:
    CudaFFT3D* dispersionFft;
    cufftHandle dispersionFftForward;
    cufftHandle dispersionFftBackward;
-    CUfunction computeParamsKernel;
+    CUfunction computeParamsKernel, computeExclusionParamsKernel;
    CUfunction ewaldSumsKernel;
    CUfunction ewaldForcesKernel;
    CUfunction pmeGridIndexKernel;

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -1665,6 +1665,8 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
    hasOffsets = (force.getNumParticleParameterOffsets() > 0 || force.getNumExceptionParameterOffsets() > 0);
    if (hasOffsets)
        paramsDefines["HAS_OFFSETS"] = "1";
+    if (usePosqCharges)
+        paramsDefines["USE_POSQ_CHARGES"] = "1";
    if (nonbondedMethod == Ewald) {
        // Compute the Ewald parameters.
@@ -1929,7 +1931,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
                    // Evaluate the actual bspline moduli for X/Y/Z.
-                    for(int dim = 0; dim < 3; dim++) {
+                    for (int dim = 0; dim < 3; dim++) {
                        int ndata = (dim == 0 ? xsize : dim == 1 ? ysize : zsize);
                        vector<double> moduli(ndata);
                        for (int i = 0; i < ndata; i++) {
@@ -1957,6 +1959,37 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
        }
    }
+    // Add code to subtract off the reciprocal part of excluded interactions.
+    if ((nonbondedMethod == Ewald || nonbondedMethod == PME || nonbondedMethod == LJPME) && pmeio == NULL) {
+        int numContexts = cu.getPlatformData().contexts.size();
+        int startIndex = cu.getContextIndex()*force.getNumExceptions()/numContexts;
+        int endIndex = (cu.getContextIndex()+1)*force.getNumExceptions()/numContexts;
+        int numExclusions = endIndex-startIndex;
+        if (numExclusions > 0) {
+            paramsDefines["HAS_EXCLUSIONS"] = "1";
+            vector<vector<int> > atoms(numExclusions, vector<int>(2));
+            exclusionAtoms.initialize<int2>(cu, numExclusions, "exclusionAtoms");
+            exclusionParams.initialize<float4>(cu, numExclusions, "exclusionParams");
+            vector<int2> exclusionAtomsVec(numExclusions);
+            for (int i = 0; i < numExclusions; i++) {
+                int j = i+startIndex;
+                exclusionAtomsVec[i] = make_int2(exclusions[j].first, exclusions[j].second);
+                atoms[i][0] = exclusions[j].first;
+                atoms[i][1] = exclusions[j].second;
+            }
+            exclusionAtoms.upload(exclusionAtomsVec);
+            map<string, string> replacements;
+            replacements["PARAMS"] = cu.getBondedUtilities().addArgument(exclusionParams.getDevicePointer(), "float4");
+            replacements["EWALD_ALPHA"] = cu.doubleToString(alpha);
+            replacements["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI));
+            replacements["DO_LJPME"] = doLJPME ? "1" : "0";
+            if (doLJPME)
+                replacements["EWALD_DISPERSION_ALPHA"] = cu.doubleToString(dispersionAlpha);
+            cu.getBondedUtilities().addInteraction(atoms, cu.replaceStrings(CudaKernelSources::pmeExclusions, replacements), force.getForceGroup());
+        }
+    }
    // Add the interaction to the default nonbonded kernel.
    string source = cu.replaceStrings(CudaKernelSources::coulombLennardJones, defines);
@@ -1967,7 +2000,6 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
    if (usePosqCharges) {
        replacements["CHARGE1"] = "posq1.w";
        replacements["CHARGE2"] = "posq2.w";
-        paramsDefines["USE_POSQ_CHARGES"] = "1";
    }
    else {
        replacements["CHARGE1"] = prefix+"charge1";
@@ -2078,6 +2110,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
    CUmodule module = cu.createModule(CudaKernelSources::nonbondedParameters, paramsDefines);
    computeParamsKernel = cu.getKernel(module, "computeParameters");
+    computeExclusionParamsKernel = cu.getKernel(module, "computeExclusionParameters");
    info = new ForceInfo(force);
    cu.addForce(info);
 }
@@ -2104,8 +2137,9 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
        vector<void*> paramsArgs = {&cu.getEnergyBuffer().getDevicePointer(), &computeSelfEnergy, &globalParams.getDevicePointer(), &numAtoms,
                &baseParticleParams.getDevicePointer(), &cu.getPosq().getDevicePointer(), &charges.getDevicePointer(), &sigmaEpsilon.getDevicePointer(),
                &particleParamOffsets.getDevicePointer(), &particleOffsetIndices.getDevicePointer()};
+        int numExceptions;
        if (exceptionParams.isInitialized()) {
-            int numExceptions = exceptionParams.getSize();
+            numExceptions = exceptionParams.getSize();
            paramsArgs.push_back(&numExceptions);
            paramsArgs.push_back(&baseExceptionParams.getDevicePointer());
            paramsArgs.push_back(&exceptionParams.getDevicePointer());
@@ -2113,6 +2147,12 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
            paramsArgs.push_back(&exceptionOffsetIndices.getDevicePointer());
        }
        cu.executeKernel(computeParamsKernel, &paramsArgs[0], cu.getPaddedNumAtoms());
+        if (exclusionParams.isInitialized()) {
+            int numExclusions = exclusionParams.getSize();
+            vector<void*> exclusionParamsArgs = {&cu.getPosq().getDevicePointer(), &charges.getDevicePointer(), &sigmaEpsilon.getDevicePointer(),
+                    &numExclusions, &exclusionAtoms.getDevicePointer(), &exclusionParams.getDevicePointer()};
+            cu.executeKernel(computeExclusionParamsKernel, &exclusionParamsArgs[0], numExclusions);
+        }
        if (usePmeStream) {
            cuEventRecord(paramsSyncEvent, cu.getCurrentStream());
            cuStreamWaitEvent(pmeStream, paramsSyncEvent, 0);

--- a/platforms/cuda/src/kernels/coulombLennardJones.cu
+++ b/platforms/cuda/src/kernels/coulombLennardJones.cu
 {
 #if USE_EWALD
-    bool needCorrection = hasExclusions && isExcluded && atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS;
+    unsigned int includeInteraction = (!isExcluded && r2 < CUTOFF_SQUARED);
-    unsigned int includeInteraction = ((!isExcluded && r2 < CUTOFF_SQUARED) || needCorrection);
    const real alphaR = EWALD_ALPHA*r;
    const real expAlphaRSqr = EXP(-alphaR*alphaR);
 #if HAS_COULOMB
@@ -22,89 +21,58 @@
 #endif
    real tempForce = 0.0f;
 #if HAS_LENNARD_JONES
-    // The multiplicative term to correct for the multiplicative terms that are always
+    real sig = SIGMA_EPSILON1.x + SIGMA_EPSILON2.x;
-    // present in reciprocal space.  The real terms have an additive contribution
+    real sig2 = invR*sig;
-    // added in, but for excluded terms the multiplicative term is just subtracted.
+    sig2 *= sig2;
-    // These factors are needed in both clauses of the needCorrection statement, so
+    real sig6 = sig2*sig2*sig2;
-    // I declare them up here.
+    real eps = SIGMA_EPSILON1.y*SIGMA_EPSILON2.y;
-    #if DO_LJPME
+    real epssig6 = sig6*eps;
-        const real dispersionAlphaR = EWALD_DISPERSION_ALPHA*r;
+    tempForce = epssig6*(12.0f*sig6 - 6.0f);
-        const real dar2 = dispersionAlphaR*dispersionAlphaR;
+    real ljEnergy = epssig6*(sig6 - 1.0f);
-        const real dar4 = dar2*dar2;
+    #if USE_LJ_SWITCH
-        const real dar6 = dar4*dar2;
+    if (r > LJ_SWITCH_CUTOFF) {
-        const real invR2 = invR*invR;
+        real x = r-LJ_SWITCH_CUTOFF;
-        const real expDar2 = EXP(-dar2);
+        real switchValue = 1+x*x*x*(LJ_SWITCH_C3+x*(LJ_SWITCH_C4+x*LJ_SWITCH_C5));
-        const float2 sigExpProd = SIGMA_EPSILON1*SIGMA_EPSILON2;
+        real switchDeriv = x*x*(3*LJ_SWITCH_C3+x*(4*LJ_SWITCH_C4+x*5*LJ_SWITCH_C5));
-        const real c6 = 64*sigExpProd.x*sigExpProd.x*sigExpProd.x*sigExpProd.y;
+        tempForce = tempForce*switchValue - ljEnergy*switchDeriv*r;
-        const real coef = invR2*invR2*invR2*c6;
+        ljEnergy *= switchValue;
-        const real eprefac = 1.0f + dar2 + 0.5f*dar4;
-        const real dprefac = eprefac + dar6/6.0f;
-    #endif
-#endif
-    if (needCorrection) {
-        // Subtract off the part of this interaction that was included in the reciprocal space contribution.
-#if HAS_COULOMB
-        if (1-erfcAlphaR > 1e-6) {
-            real erfAlphaR = ERF(alphaR); // Our erfc approximation is not accurate enough when r is very small, which happens with Drude particles.
-            tempForce = -prefactor*(erfAlphaR-alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
-            tempEnergy += -prefactor*erfAlphaR;
-        }
-        else {
-            includeInteraction = false;
-            tempEnergy -= TWO_OVER_SQRT_PI*EWALD_ALPHA*138.935456f*CHARGE1*CHARGE2;
-        }
-#endif
-#if HAS_LENNARD_JONES
-        #if DO_LJPME
-            // The multiplicative grid term
-            tempEnergy += coef*(1.0f - expDar2*eprefac);
-            tempForce += 6.0f*coef*(1.0f - expDar2*dprefac);
-        #endif
-#endif
    }
-    else {
+    #endif
-#if HAS_LENNARD_JONES
-        real sig = SIGMA_EPSILON1.x + SIGMA_EPSILON2.x;
-        real sig2 = invR*sig;
-        sig2 *= sig2;
-        real sig6 = sig2*sig2*sig2;
-        real eps = SIGMA_EPSILON1.y*SIGMA_EPSILON2.y;
-        real epssig6 = sig6*eps;
-        tempForce = epssig6*(12.0f*sig6 - 6.0f);
-        real ljEnergy = epssig6*(sig6 - 1.0f);
-        #if USE_LJ_SWITCH
-        if (r > LJ_SWITCH_CUTOFF) {
-            real x = r-LJ_SWITCH_CUTOFF;
-            real switchValue = 1+x*x*x*(LJ_SWITCH_C3+x*(LJ_SWITCH_C4+x*LJ_SWITCH_C5));
-            real switchDeriv = x*x*(3*LJ_SWITCH_C3+x*(4*LJ_SWITCH_C4+x*5*LJ_SWITCH_C5));
-            tempForce = tempForce*switchValue - ljEnergy*switchDeriv*r;
-            ljEnergy *= switchValue;
-        }
-        #endif
 #if DO_LJPME
-        // The multiplicative grid term
+    // The multiplicative term to correct for the multiplicative terms that are always
-        ljEnergy += coef*(1.0f - expDar2*eprefac);
+    // present in reciprocal space.
-        tempForce += 6.0f*coef*(1.0f - expDar2*dprefac);
+    const real dispersionAlphaR = EWALD_DISPERSION_ALPHA*r;
-        // The potential shift accounts for the step at the cutoff introduced by the
+    const real dar2 = dispersionAlphaR*dispersionAlphaR;
-        // transition from additive to multiplicative combintion rules and is only
+    const real dar4 = dar2*dar2;
-        // needed for the real (not excluded) terms.  By addin these terms to ljEnergy
+    const real dar6 = dar4*dar2;
-        // instead of tempEnergy here, the includeInteraction mask is correctly applied.
+    const real invR2 = invR*invR;
-        sig2 = sig*sig;
+    const real expDar2 = EXP(-dar2);
-        sig6 = sig2*sig2*sig2*INVCUT6;
+    const float2 sigExpProd = SIGMA_EPSILON1*SIGMA_EPSILON2;
-        epssig6 = eps*sig6;
+    const real c6 = 64*sigExpProd.x*sigExpProd.x*sigExpProd.x*sigExpProd.y;
-        // The additive part of the potential shift
+    const real coef = invR2*invR2*invR2*c6;
-        ljEnergy += epssig6*(1.0f - sig6);
+    const real eprefac = 1.0f + dar2 + 0.5f*dar4;
-        // The multiplicative part of the potential shift
+    const real dprefac = eprefac + dar6/6.0f;
-        ljEnergy += MULTSHIFT6*c6;
+    // The multiplicative grid term
+    ljEnergy += coef*(1.0f - expDar2*eprefac);
+    tempForce += 6.0f*coef*(1.0f - expDar2*dprefac);
+    // The potential shift accounts for the step at the cutoff introduced by the
+    // transition from additive to multiplicative combintion rules and is only
+    // needed for the real (not excluded) terms.  By addin these terms to ljEnergy
+    // instead of tempEnergy here, the includeInteraction mask is correctly applied.
+    sig2 = sig*sig;
+    sig6 = sig2*sig2*sig2*INVCUT6;
+    epssig6 = eps*sig6;
+    // The additive part of the potential shift
+    ljEnergy += epssig6*(1.0f - sig6);
+    // The multiplicative part of the potential shift
+    ljEnergy += MULTSHIFT6*c6;
 #endif
-        tempForce += prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
+    tempForce += prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
-        tempEnergy += includeInteraction ? ljEnergy + prefactor*erfcAlphaR : 0;
+    tempEnergy += includeInteraction ? ljEnergy + prefactor*erfcAlphaR : 0;
 #else
-        tempForce = prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
+    tempForce = prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
-        tempEnergy += includeInteraction ? prefactor*erfcAlphaR : 0;
+    tempEnergy += includeInteraction ? prefactor*erfcAlphaR : 0;
 #endif
-    }
    dEdR += includeInteraction ? tempForce*invR*invR : 0;
 #else
 #ifdef USE_CUTOFF
@@ -113,7 +81,7 @@
    unsigned int includeInteraction = (!isExcluded);
 #endif
    real tempForce = 0.0f;
-  #if HAS_LENNARD_JONES
+#if HAS_LENNARD_JONES
    real sig = SIGMA_EPSILON1.x + SIGMA_EPSILON2.x;
    real sig2 = invR*sig;
    sig2 *= sig2;
@@ -131,7 +99,7 @@
    }
    #endif
    tempEnergy += ljEnergy;
-  #endif
+#endif
 #if HAS_COULOMB
  #ifdef USE_CUTOFF
    const real prefactor = 138.935456f*CHARGE1*CHARGE2;

--- a/platforms/cuda/src/kernels/nonbondedParameters.cu
+++ b/platforms/cuda/src/kernels/nonbondedParameters.cu
@@ -63,3 +63,28 @@ extern "C" __global__ void computeParameters(mixed* __restrict__ energyBuffer, b
    if (includeSelfEnergy)
        energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
 }
+/**
+ * Compute parameters for subtracting the reciprocal part of excluded interactions.
+ */
+extern "C" __global__ void computeExclusionParameters(real4* __restrict__ posq, real* __restrict__ charge, float2* __restrict__ sigmaEpsilon,
+        int numExclusions, const int2* __restrict__ exclusionAtoms, float4* __restrict__ exclusionParams) {
+    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < numExclusions; i += blockDim.x*gridDim.x) {
+        int2 atoms = exclusionAtoms[i];
+#ifdef USE_POSQ_CHARGES
+        real chargeProd = posq[atoms.x].w*posq[atoms.y].w;
+#else
+        real chargeProd = charge[atoms.x]*charge[atoms.y];
+#endif
+#ifdef INCLUDE_LJPME
+        float2 sigEps1 = sigmaEpsilon[atoms.x];
+        float2 sigEps2 = sigmaEpsilon[atoms.y];
+        float sigma = sigEps1.x*sigEps2.x;
+        float epsilon = sigEps1.y*sigEps2.y;
+#else
+        float sigma = 0;
+        float epsilon = 0;
+#endif
+        exclusionParams[i] = make_float4((float) (138.935456f*chargeProd), sigma, epsilon, 0);
+    }
+}
\ No newline at end of file
--- a/platforms/cuda/src/kernels/pmeExclusions.cu
+++ b/platforms/cuda/src/kernels/pmeExclusions.cu
+const float4 exclusionParams = PARAMS[index];
+real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
+const real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+const real r = SQRT(r2);
+const real invR = RECIP(r);
+const real alphaR = EWALD_ALPHA*r;
+const real expAlphaRSqr = EXP(-alphaR*alphaR);
+real tempForce = 0.0f;
+if (alphaR > 1e-6f) {
+    const real erfAlphaR = ERF(alphaR);
+    const real prefactor = exclusionParams.x*invR;
+    tempForce = -prefactor*(erfAlphaR-alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
+    energy -= prefactor*erfAlphaR;
+}
+else {
+    energy -= TWO_OVER_SQRT_PI*EWALD_ALPHA*exclusionParams.x;
+}
+#if DO_LJPME
+const real dispersionAlphaR = EWALD_DISPERSION_ALPHA*r;
+const real dar2 = dispersionAlphaR*dispersionAlphaR;
+const real dar4 = dar2*dar2;
+const real dar6 = dar4*dar2;
+const real invR2 = invR*invR;
+const real expDar2 = EXP(-dar2);
+const real c6 = 64*exclusionParams.y*exclusionParams.y*exclusionParams.y*exclusionParams.z;
+const real coef = invR2*invR2*invR2*c6;
+const real eprefac = 1.0f + dar2 + 0.5f*dar4;
+const real dprefac = eprefac + dar6/6.0f;
+energy += coef*(1.0f - expDar2*eprefac);
+tempForce += 6.0f*coef*(1.0f - expDar2*dprefac);
+#endif
+if (r > 0)
+    delta *= tempForce*invR*invR;
+real3 force1 = -delta;
+real3 force2 = delta;
--- a/platforms/opencl/include/OpenCLKernels.h
+++ b/platforms/opencl/include/OpenCLKernels.h
@@ -655,6 +655,8 @@ private:
    OpenCLArray charges;
    OpenCLArray sigmaEpsilon;
    OpenCLArray exceptionParams;
+    OpenCLArray exclusionAtoms;
+    OpenCLArray exclusionParams;
    OpenCLArray baseParticleParams;
    OpenCLArray baseExceptionParams;
    OpenCLArray particleParamOffsets;
@@ -683,7 +685,7 @@ private:
    Kernel cpuPme;
    PmeIO* pmeio;
    SyncQueuePostComputation* syncQueue;
-    cl::Kernel computeParamsKernel;
+    cl::Kernel computeParamsKernel, computeExclusionParamsKernel;
    cl::Kernel ewaldSumsKernel;
    cl::Kernel ewaldForcesKernel;
    cl::Kernel pmeAtomRangeKernel;

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -1655,6 +1655,8 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
    hasOffsets = (force.getNumParticleParameterOffsets() > 0 || force.getNumExceptionParameterOffsets() > 0);
    if (hasOffsets)
        paramsDefines["HAS_OFFSETS"] = "1";
+    if (usePosqCharges)
+        paramsDefines["USE_POSQ_CHARGES"] = "1";
    if (nonbondedMethod == Ewald) {
        // Compute the Ewald parameters.
@@ -1853,7 +1855,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
                    // Evaluate the actual bspline moduli for X/Y/Z.
-                    for(int dim = 0; dim < 3; dim++) {
+                    for (int dim = 0; dim < 3; dim++) {
                        int ndata = (dim == 0 ? xsize : dim == 1 ? ysize : zsize);
                        vector<cl_double> moduli(ndata);
                        for (int i = 0; i < ndata; i++) {
@@ -1883,6 +1885,37 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
        }
    }
+    // Add code to subtract off the reciprocal part of excluded interactions.
+    if ((nonbondedMethod == Ewald || nonbondedMethod == PME || nonbondedMethod == LJPME) && pmeio == NULL) {
+        int numContexts = cl.getPlatformData().contexts.size();
+        int startIndex = cl.getContextIndex()*force.getNumExceptions()/numContexts;
+        int endIndex = (cl.getContextIndex()+1)*force.getNumExceptions()/numContexts;
+        int numExclusions = endIndex-startIndex;
+        if (numExclusions > 0) {
+            paramsDefines["HAS_EXCLUSIONS"] = "1";
+            vector<vector<int> > atoms(numExclusions, vector<int>(2));
+            exclusionAtoms.initialize<mm_int2>(cl, numExclusions, "exclusionAtoms");
+            exclusionParams.initialize<mm_float4>(cl, numExclusions, "exclusionParams");
+            vector<mm_int2> exclusionAtomsVec(numExclusions);
+            for (int i = 0; i < numExclusions; i++) {
+                int j = i+startIndex;
+                exclusionAtomsVec[i] = mm_int2(exclusions[j].first, exclusions[j].second);
+                atoms[i][0] = exclusions[j].first;
+                atoms[i][1] = exclusions[j].second;
+            }
+            exclusionAtoms.upload(exclusionAtomsVec);
+            map<string, string> replacements;
+            replacements["PARAMS"] = cl.getBondedUtilities().addArgument(exclusionParams.getDeviceBuffer(), "float4");
+            replacements["EWALD_ALPHA"] = cl.doubleToString(alpha);
+            replacements["TWO_OVER_SQRT_PI"] = cl.doubleToString(2.0/sqrt(M_PI));
+            replacements["DO_LJPME"] = doLJPME ? "1" : "0";
+            if (doLJPME)
+                replacements["EWALD_DISPERSION_ALPHA"] = cl.doubleToString(dispersionAlpha);
+            cl.getBondedUtilities().addInteraction(atoms, cl.replaceStrings(OpenCLKernelSources::pmeExclusions, replacements), force.getForceGroup());
+        }
+    }
    // Add the interaction to the default nonbonded kernel.
    string source = cl.replaceStrings(OpenCLKernelSources::coulombLennardJones, defines);
@@ -1893,7 +1926,6 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
    if (usePosqCharges) {
        replacements["CHARGE1"] = "posq1.w";
        replacements["CHARGE2"] = "posq2.w";
-        paramsDefines["USE_POSQ_CHARGES"] = "1";
    }
    else {
        replacements["CHARGE1"] = prefix+"charge1";
@@ -2004,6 +2036,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
    cl::Program program = cl.createProgram(OpenCLKernelSources::nonbondedParameters, paramsDefines);
    computeParamsKernel = cl::Kernel(program, "computeParameters");
+    computeExclusionParamsKernel = cl::Kernel(program, "computeExclusionParameters");
    info = new ForceInfo(cl.getNonbondedUtilities().getNumForceBuffers(), force);
    cl.addForce(info);
 }
@@ -2012,21 +2045,31 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
    bool deviceIsCpu = (cl.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
    if (!hasInitializedKernel) {
        hasInitializedKernel = true;
-        computeParamsKernel.setArg<cl::Buffer>(0, cl.getEnergyBuffer().getDeviceBuffer());
+        int index = 0;
-        computeParamsKernel.setArg<cl::Buffer>(2, globalParams.getDeviceBuffer());
+        computeParamsKernel.setArg<cl::Buffer>(index++, cl.getEnergyBuffer().getDeviceBuffer());
-        computeParamsKernel.setArg<cl_int>(3, cl.getPaddedNumAtoms());
+        index++;
-        computeParamsKernel.setArg<cl::Buffer>(4, baseParticleParams.getDeviceBuffer());
+        computeParamsKernel.setArg<cl::Buffer>(index++, globalParams.getDeviceBuffer());
-        computeParamsKernel.setArg<cl::Buffer>(5, cl.getPosq().getDeviceBuffer());
+        computeParamsKernel.setArg<cl_int>(index++, cl.getPaddedNumAtoms());
-        computeParamsKernel.setArg<cl::Buffer>(6, charges.getDeviceBuffer());
+        computeParamsKernel.setArg<cl::Buffer>(index++, baseParticleParams.getDeviceBuffer());
-        computeParamsKernel.setArg<cl::Buffer>(7, sigmaEpsilon.getDeviceBuffer());
+        computeParamsKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
-        computeParamsKernel.setArg<cl::Buffer>(8, particleParamOffsets.getDeviceBuffer());
+        computeParamsKernel.setArg<cl::Buffer>(index++, charges.getDeviceBuffer());
-        computeParamsKernel.setArg<cl::Buffer>(9, particleOffsetIndices.getDeviceBuffer());
+        computeParamsKernel.setArg<cl::Buffer>(index++, sigmaEpsilon.getDeviceBuffer());
+        computeParamsKernel.setArg<cl::Buffer>(index++, particleParamOffsets.getDeviceBuffer());
+        computeParamsKernel.setArg<cl::Buffer>(index++, particleOffsetIndices.getDeviceBuffer());
        if (exceptionParams.isInitialized()) {
-            computeParamsKernel.setArg<cl_int>(10, exceptionParams.getSize());
+            computeParamsKernel.setArg<cl_int>(index++, exceptionParams.getSize());
-            computeParamsKernel.setArg<cl::Buffer>(11, baseExceptionParams.getDeviceBuffer());
+            computeParamsKernel.setArg<cl::Buffer>(index++, baseExceptionParams.getDeviceBuffer());
-            computeParamsKernel.setArg<cl::Buffer>(12, exceptionParams.getDeviceBuffer());
+            computeParamsKernel.setArg<cl::Buffer>(index++, exceptionParams.getDeviceBuffer());
-            computeParamsKernel.setArg<cl::Buffer>(13, exceptionParamOffsets.getDeviceBuffer());
+            computeParamsKernel.setArg<cl::Buffer>(index++, exceptionParamOffsets.getDeviceBuffer());
-            computeParamsKernel.setArg<cl::Buffer>(14, exceptionOffsetIndices.getDeviceBuffer());
+            computeParamsKernel.setArg<cl::Buffer>(index++, exceptionOffsetIndices.getDeviceBuffer());
+        }
+        if (exclusionParams.isInitialized()) {
+            computeExclusionParamsKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
+            computeExclusionParamsKernel.setArg<cl::Buffer>(1, charges.getDeviceBuffer());
+            computeExclusionParamsKernel.setArg<cl::Buffer>(2, sigmaEpsilon.getDeviceBuffer());
+            computeExclusionParamsKernel.setArg<cl_int>(3, exclusionParams.getSize());
+            computeExclusionParamsKernel.setArg<cl::Buffer>(4, exclusionAtoms.getDeviceBuffer());
+            computeExclusionParamsKernel.setArg<cl::Buffer>(5, exclusionParams.getDeviceBuffer());
        }
        if (cosSinSums.isInitialized()) {
            ewaldSumsKernel.setArg<cl::Buffer>(0, cl.getEnergyBuffer().getDeviceBuffer());
@@ -2176,6 +2219,8 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
    if (recomputeParams || hasOffsets) {
        computeParamsKernel.setArg<cl_int>(1, includeEnergy && includeReciprocal);
        cl.executeKernel(computeParamsKernel, cl.getPaddedNumAtoms());
+        if (exclusionParams.isInitialized())
+            cl.executeKernel(computeExclusionParamsKernel, exclusionParams.getSize());
        if (usePmeQueue) {
            vector<cl::Event> events(1);
            cl.getQueue().enqueueMarker(&events[0]);
@@ -2350,9 +2395,9 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
                cl.executeKernel(pmeDispersionSpreadChargeKernel, 2*cl.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(), 1);
            }
            else {
-                if (!hasCoulomb)
-                    sort->sort(pmeAtomGridIndex);
                if (cl.getSupports64BitGlobalAtomics()) {
+                    if (!hasCoulomb)
+                        sort->sort(pmeAtomGridIndex);
                    cl.clearBuffer(pmeGrid2);
                    setPeriodicBoxArgs(cl, pmeDispersionSpreadChargeKernel, 5);
                    if (cl.getUseDoublePrecision()) {
@@ -2369,6 +2414,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
                    cl.executeKernel(pmeDispersionFinishSpreadChargeKernel, gridSizeX*gridSizeY*gridSizeZ);
                }
                else {
+                    sort->sort(pmeAtomGridIndex);
                    cl.clearBuffer(pmeGrid1);
                    cl.executeKernel(pmeDispersionAtomRangeKernel, cl.getNumAtoms());
                    setPeriodicBoxSizeArg(cl, pmeDispersionZIndexKernel, 2);

--- a/platforms/opencl/src/kernels/coulombLennardJones.cl
+++ b/platforms/opencl/src/kernels/coulombLennardJones.cl
@@ -5,8 +5,7 @@
    unsigned int includeInteraction;
 #endif
 #if USE_EWALD
-    bool needCorrection = hasExclusions && isExcluded && atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS;
+    includeInteraction = (!isExcluded && r2 < CUTOFF_SQUARED);
-    includeInteraction = ((!isExcluded && r2 < CUTOFF_SQUARED) || needCorrection);
    const real alphaR = EWALD_ALPHA*r;
    const real expAlphaRSqr = EXP(-alphaR*alphaR);
 #if HAS_COULOMB
@@ -27,89 +26,58 @@
 #endif
    real tempForce = 0;
 #if HAS_LENNARD_JONES
-    // The multiplicative term to correct for the multiplicative terms that are always
+    real sig = SIGMA_EPSILON1.x + SIGMA_EPSILON2.x;
-    // present in reciprocal space.  The real terms have an additive contribution
+    real sig2 = invR*sig;
-    // added in, but for excluded terms the multiplicative term is just subtracted.
+    sig2 *= sig2;
-    // These factors are needed in both clauses of the needCorrection statement, so
+    real sig6 = sig2*sig2*sig2;
-    // I declare them up here.
+    real eps = SIGMA_EPSILON1.y*SIGMA_EPSILON2.y;
-    #if DO_LJPME
+    real epssig6 = sig6*eps;
-        const real dispersionAlphaR = EWALD_DISPERSION_ALPHA*r;
+    tempForce = epssig6*(12.0f*sig6 - 6.0f);
-        const real dar2 = dispersionAlphaR*dispersionAlphaR;
+    real ljEnergy = epssig6*(sig6 - 1.0f);
-        const real dar4 = dar2*dar2;
+    #if USE_LJ_SWITCH
-        const real dar6 = dar4*dar2;
+    if (r > LJ_SWITCH_CUTOFF) {
-        const real invR2 = invR*invR;
+        real x = r-LJ_SWITCH_CUTOFF;
-        const real expDar2 = EXP(-dar2);
+        real switchValue = 1+x*x*x*(LJ_SWITCH_C3+x*(LJ_SWITCH_C4+x*LJ_SWITCH_C5));
-        const float2 sigExpProd = SIGMA_EPSILON1*SIGMA_EPSILON2;
+        real switchDeriv = x*x*(3*LJ_SWITCH_C3+x*(4*LJ_SWITCH_C4+x*5*LJ_SWITCH_C5));
-        const real c6 = 64*sigExpProd.x*sigExpProd.x*sigExpProd.x*sigExpProd.y;
+        tempForce = tempForce*switchValue - ljEnergy*switchDeriv*r;
-        const real coef = invR2*invR2*invR2*c6;
+        ljEnergy *= switchValue;
-        const real eprefac = 1.0f + dar2 + 0.5f*dar4;
-        const real dprefac = eprefac + dar6/6.0f;
-    #endif
-#endif
-    if (needCorrection) {
-        // Subtract off the part of this interaction that was included in the reciprocal space contribution.
-#if HAS_COULOMB
-        if (1-erfcAlphaR > 1e-6) {
-            real erfAlphaR = erf(alphaR); // Our erfc approximation is not accurate enough when r is very small, which happens with Drude particles.
-            tempForce = -prefactor*(erfAlphaR-alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
-            tempEnergy += -prefactor*erfAlphaR;
-        }
-        else {
-            includeInteraction = false;
-            tempEnergy -= TWO_OVER_SQRT_PI*EWALD_ALPHA*138.935456f*CHARGE1*CHARGE2;
-        }
-#endif
-#if HAS_LENNARD_JONES
-        #if DO_LJPME
-            // The multiplicative grid term
-            tempEnergy += coef*(1.0f - expDar2*eprefac);
-            tempForce += 6.0f*coef*(1.0f - expDar2*dprefac);
-        #endif
-#endif
    }
-    else {
+    #endif
-#if HAS_LENNARD_JONES
-        real sig = SIGMA_EPSILON1.x + SIGMA_EPSILON2.x;
-        real sig2 = invR*sig;
-        sig2 *= sig2;
-        real sig6 = sig2*sig2*sig2;
-        real eps = SIGMA_EPSILON1.y*SIGMA_EPSILON2.y;
-        real epssig6 = sig6*eps;
-        tempForce = epssig6*(12.0f*sig6 - 6.0f);
-        real ljEnergy = epssig6*(sig6 - 1.0f);
-        #if USE_LJ_SWITCH
-        if (r > LJ_SWITCH_CUTOFF) {
-            real x = r-LJ_SWITCH_CUTOFF;
-            real switchValue = 1+x*x*x*(LJ_SWITCH_C3+x*(LJ_SWITCH_C4+x*LJ_SWITCH_C5));
-            real switchDeriv = x*x*(3*LJ_SWITCH_C3+x*(4*LJ_SWITCH_C4+x*5*LJ_SWITCH_C5));
-            tempForce = tempForce*switchValue - ljEnergy*switchDeriv*r;
-            ljEnergy *= switchValue;
-        }
-        #endif
 #if DO_LJPME
-        // The multiplicative grid term
+    // The multiplicative term to correct for the multiplicative terms that are always
-        ljEnergy += coef*(1.0f - expDar2*eprefac);
+    // present in reciprocal space.
-        tempForce += 6.0f*coef*(1.0f - expDar2*dprefac);
+    const real dispersionAlphaR = EWALD_DISPERSION_ALPHA*r;
-        // The potential shift accounts for the step at the cutoff introduced by the
+    const real dar2 = dispersionAlphaR*dispersionAlphaR;
-        // transition from additive to multiplicative combintion rules and is only
+    const real dar4 = dar2*dar2;
-        // needed for the real (not excluded) terms.  By addin these terms to ljEnergy
+    const real dar6 = dar4*dar2;
-        // instead of tempEnergy here, the includeInteraction mask is correctly applied.
+    const real invR2 = invR*invR;
-        sig2 = sig*sig;
+    const real expDar2 = EXP(-dar2);
-        sig6 = sig2*sig2*sig2*INVCUT6;
+    const float2 sigExpProd = SIGMA_EPSILON1*SIGMA_EPSILON2;
-        epssig6 = eps*sig6;
+    const real c6 = 64*sigExpProd.x*sigExpProd.x*sigExpProd.x*sigExpProd.y;
-        // The additive part of the potential shift
+    const real coef = invR2*invR2*invR2*c6;
-        ljEnergy += epssig6*(1.0f - sig6);
+    const real eprefac = 1.0f + dar2 + 0.5f*dar4;
-        // The multiplicative part of the potential shift
+    const real dprefac = eprefac + dar6/6.0f;
-        ljEnergy += MULTSHIFT6*c6;
+    // The multiplicative grid term
+    ljEnergy += coef*(1.0f - expDar2*eprefac);
+    tempForce += 6.0f*coef*(1.0f - expDar2*dprefac);
+    // The potential shift accounts for the step at the cutoff introduced by the
+    // transition from additive to multiplicative combintion rules and is only
+    // needed for the real (not excluded) terms.  By addin these terms to ljEnergy
+    // instead of tempEnergy here, the includeInteraction mask is correctly applied.
+    sig2 = sig*sig;
+    sig6 = sig2*sig2*sig2*INVCUT6;
+    epssig6 = eps*sig6;
+    // The additive part of the potential shift
+    ljEnergy += epssig6*(1.0f - sig6);
+    // The multiplicative part of the potential shift
+    ljEnergy += MULTSHIFT6*c6;
 #endif
-        tempForce += prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
+    tempForce += prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
-        tempEnergy += select((real) 0, ljEnergy + prefactor*erfcAlphaR, includeInteraction);
+    tempEnergy += select((real) 0, ljEnergy + prefactor*erfcAlphaR, includeInteraction);
 #else
-        tempForce = prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
+    tempForce = prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
-        tempEnergy += select((real) 0, prefactor*erfcAlphaR, includeInteraction);
+    tempEnergy += select((real) 0, prefactor*erfcAlphaR, includeInteraction);
 #endif
-    }
    dEdR += select((real) 0, tempForce*invR*invR, includeInteraction);
 #else
 #ifdef USE_CUTOFF

--- a/platforms/opencl/src/kernels/nonbondedParameters.cl
+++ b/platforms/opencl/src/kernels/nonbondedParameters.cl
@@ -63,3 +63,28 @@ __kernel void computeParameters(__global mixed* restrict energyBuffer, int inclu
    if (includeSelfEnergy)
        energyBuffer[get_global_id(0)] += energy;
 }
+/**
+ * Compute parameters for subtracting the reciprocal part of excluded interactions.
+ */
+__kernel void computeExclusionParameters(__global real4* restrict posq, __global real* restrict charge, __global float2* restrict sigmaEpsilon,
+        int numExclusions, __global const int2* restrict exclusionAtoms, __global float4* restrict exclusionParams) {
+    for (int i = get_global_id(0); i < numExclusions; i += get_global_size(0)) {
+        int2 atoms = exclusionAtoms[i];
+#ifdef USE_POSQ_CHARGES
+        real chargeProd = posq[atoms.x].w*posq[atoms.y].w;
+#else
+        real chargeProd = charge[atoms.x]*charge[atoms.y];
+#endif
+#ifdef INCLUDE_LJPME
+        float2 sigEps1 = sigmaEpsilon[atoms.x];
+        float2 sigEps2 = sigmaEpsilon[atoms.y];
+        float sigma = sigEps1.x*sigEps2.x;
+        float epsilon = sigEps1.y*sigEps2.y;
+#else
+        float sigma = 0;
+        float epsilon = 0;
+#endif
+        exclusionParams[i] = (float4) ((float) (138.935456f*chargeProd), sigma, epsilon, 0);
+    }
+}
--- a/platforms/opencl/src/kernels/pmeExclusions.cl
+++ b/platforms/opencl/src/kernels/pmeExclusions.cl
+const float4 exclusionParams = PARAMS[index];
+real3 delta = (real3) (pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
+const real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+const real r = SQRT(r2);
+const real invR = RECIP(r);
+const real alphaR = EWALD_ALPHA*r;
+const real expAlphaRSqr = EXP(-alphaR*alphaR);
+real tempForce = 0.0f;
+if (alphaR > 1e-6f) {
+    const real erfAlphaR = erf(alphaR);
+    const real prefactor = exclusionParams.x*invR;
+    tempForce = -prefactor*(erfAlphaR-alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
+    energy -= prefactor*erfAlphaR;
+}
+else {
+    energy -= TWO_OVER_SQRT_PI*EWALD_ALPHA*exclusionParams.x;
+}
+#if DO_LJPME
+const real dispersionAlphaR = EWALD_DISPERSION_ALPHA*r;
+const real dar2 = dispersionAlphaR*dispersionAlphaR;
+const real dar4 = dar2*dar2;
+const real dar6 = dar4*dar2;
+const real invR2 = invR*invR;
+const real expDar2 = EXP(-dar2);
+const real c6 = 64*exclusionParams.y*exclusionParams.y*exclusionParams.y*exclusionParams.z;
+const real coef = invR2*invR2*invR2*c6;
+const real eprefac = 1.0f + dar2 + 0.5f*dar4;
+const real dprefac = eprefac + dar6/6.0f;
+energy += coef*(1.0f - expDar2*eprefac);
+tempForce += 6.0f*coef*(1.0f - expDar2*dprefac);
+#endif
+if (r > 0)
+    delta *= tempForce*invR*invR;
+real3 force1 = -delta;
+real3 force2 = delta;