Revised how exclusions are computed with PME

da28ce0e · Peter Eastman · df52ff73 · da28ce0e · da28ce0e · da28ce0e
Commit da28ce0e authored Jun 07, 2019 by Peter Eastman
10 changed files
--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -678,6 +678,8 @@ private:
    CudaArray charges;
    CudaArray sigmaEpsilon;
    CudaArray exceptionParams;
+    CudaArray exclusionAtoms;
+    CudaArray exclusionParams;
    CudaArray baseParticleParams;
    CudaArray baseExceptionParams;
    CudaArray particleParamOffsets;

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -1665,6 +1665,8 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
    hasOffsets = (force.getNumParticleParameterOffsets() > 0 || force.getNumExceptionParameterOffsets() > 0);
    if (hasOffsets)
        paramsDefines["HAS_OFFSETS"] = "1";
+    if (usePosqCharges)
+        paramsDefines["USE_POSQ_CHARGES"] = "1";
    if (nonbondedMethod == Ewald) {
        // Compute the Ewald parameters.

@@ -1929,7 +1931,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon

                    // Evaluate the actual bspline moduli for X/Y/Z.

-                    for(int dim = 0; dim < 3; dim++) {
+                    for (int dim = 0; dim < 3; dim++) {
                        int ndata = (dim == 0 ? xsize : dim == 1 ? ysize : zsize);
                        vector<double> moduli(ndata);
                        for (int i = 0; i < ndata; i++) {
@@ -1957,6 +1959,37 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
        }
    }

+    // Add code to subtract off the reciprocal part of excluded interactions.
+
+    if ((nonbondedMethod == Ewald || nonbondedMethod == PME || nonbondedMethod == LJPME) && pmeio == NULL) {
+        int numContexts = cu.getPlatformData().contexts.size();
+        int startIndex = cu.getContextIndex()*force.getNumExceptions()/numContexts;
+        int endIndex = (cu.getContextIndex()+1)*force.getNumExceptions()/numContexts;
+        int numExclusions = endIndex-startIndex;
+        if (numExclusions > 0) {
+            paramsDefines["HAS_EXCLUSIONS"] = "1";
+            vector<vector<int> > atoms(numExclusions, vector<int>(2));
+            exclusionAtoms.initialize<int2>(cu, numExclusions, "exclusionAtoms");
+            exclusionParams.initialize<float4>(cu, numExclusions, "exclusionParams");
+            vector<int2> exclusionAtomsVec(numExclusions);
+            for (int i = 0; i < numExclusions; i++) {
+                int j = i+startIndex;
+                exclusionAtomsVec[i] = make_int2(exclusions[j].first, exclusions[j].second);
+                atoms[i][0] = exclusions[j].first;
+                atoms[i][1] = exclusions[j].second;
+            }
+            exclusionAtoms.upload(exclusionAtomsVec);
+            map<string, string> replacements;
+            replacements["PARAMS"] = cu.getBondedUtilities().addArgument(exclusionParams.getDevicePointer(), "float4");
+            replacements["EWALD_ALPHA"] = cu.doubleToString(alpha);
+            replacements["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI));
+            replacements["DO_LJPME"] = doLJPME ? "1" : "0";
+            if (doLJPME)
+                replacements["EWALD_DISPERSION_ALPHA"] = cu.doubleToString(dispersionAlpha);
+            cu.getBondedUtilities().addInteraction(atoms, cu.replaceStrings(CudaKernelSources::pmeExclusions, replacements), force.getForceGroup());
+        }
+    }
+
    // Add the interaction to the default nonbonded kernel.

    string source = cu.replaceStrings(CudaKernelSources::coulombLennardJones, defines);
@@ -1967,7 +2000,6 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
    if (usePosqCharges) {
        replacements["CHARGE1"] = "posq1.w";
        replacements["CHARGE2"] = "posq2.w";
-        paramsDefines["USE_POSQ_CHARGES"] = "1";
    }
    else {
        replacements["CHARGE1"] = prefix+"charge1";
@@ -2104,14 +2136,21 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
        vector<void*> paramsArgs = {&cu.getEnergyBuffer().getDevicePointer(), &computeSelfEnergy, &globalParams.getDevicePointer(), &numAtoms,
                &baseParticleParams.getDevicePointer(), &cu.getPosq().getDevicePointer(), &charges.getDevicePointer(), &sigmaEpsilon.getDevicePointer(),
                &particleParamOffsets.getDevicePointer(), &particleOffsetIndices.getDevicePointer()};
+        int numExceptions, numExclusions;
        if (exceptionParams.isInitialized()) {
-            int numExceptions = exceptionParams.getSize();
+            numExceptions = exceptionParams.getSize();
            paramsArgs.push_back(&numExceptions);
            paramsArgs.push_back(&baseExceptionParams.getDevicePointer());
            paramsArgs.push_back(&exceptionParams.getDevicePointer());
            paramsArgs.push_back(&exceptionParamOffsets.getDevicePointer());
            paramsArgs.push_back(&exceptionOffsetIndices.getDevicePointer());
        }
+        if (exclusionParams.isInitialized()) {
+            numExclusions = exclusionParams.getSize();
+            paramsArgs.push_back(&numExclusions);
+            paramsArgs.push_back(&exclusionAtoms.getDevicePointer());
+            paramsArgs.push_back(&exclusionParams.getDevicePointer());
+        }
        cu.executeKernel(computeParamsKernel, &paramsArgs[0], cu.getPaddedNumAtoms());
        if (usePmeStream) {
            cuEventRecord(paramsSyncEvent, cu.getCurrentStream());

--- a/platforms/cuda/src/kernels/coulombLennardJones.cu
+++ b/platforms/cuda/src/kernels/coulombLennardJones.cu
 {
 #if USE_EWALD
-    bool needCorrection = hasExclusions && isExcluded && atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS;
-    unsigned int includeInteraction = ((!isExcluded && r2 < CUTOFF_SQUARED) || needCorrection);
+    unsigned int includeInteraction = (!isExcluded && r2 < CUTOFF_SQUARED);
    const real alphaR = EWALD_ALPHA*r;
    const real expAlphaRSqr = EXP(-alphaR*alphaR);
 #if HAS_COULOMB
@@ -22,89 +21,58 @@
 #endif
    real tempForce = 0.0f;
 #if HAS_LENNARD_JONES
-    // The multiplicative term to correct for the multiplicative terms that are always
-    // present in reciprocal space.  The real terms have an additive contribution
-    // added in, but for excluded terms the multiplicative term is just subtracted.
-    // These factors are needed in both clauses of the needCorrection statement, so
-    // I declare them up here.
-    #if DO_LJPME
-        const real dispersionAlphaR = EWALD_DISPERSION_ALPHA*r;
-        const real dar2 = dispersionAlphaR*dispersionAlphaR;
-        const real dar4 = dar2*dar2;
-        const real dar6 = dar4*dar2;
-        const real invR2 = invR*invR;
-        const real expDar2 = EXP(-dar2);
-        const float2 sigExpProd = SIGMA_EPSILON1*SIGMA_EPSILON2;
-        const real c6 = 64*sigExpProd.x*sigExpProd.x*sigExpProd.x*sigExpProd.y;
-        const real coef = invR2*invR2*invR2*c6;
-        const real eprefac = 1.0f + dar2 + 0.5f*dar4;
-        const real dprefac = eprefac + dar6/6.0f;
-    #endif
-#endif
-    if (needCorrection) {
-        // Subtract off the part of this interaction that was included in the reciprocal space contribution.
-
-#if HAS_COULOMB
-        if (1-erfcAlphaR > 1e-6) {
-            real erfAlphaR = ERF(alphaR); // Our erfc approximation is not accurate enough when r is very small, which happens with Drude particles.
-            tempForce = -prefactor*(erfAlphaR-alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
-            tempEnergy += -prefactor*erfAlphaR;
-        }
-        else {
-            includeInteraction = false;
-            tempEnergy -= TWO_OVER_SQRT_PI*EWALD_ALPHA*138.935456f*CHARGE1*CHARGE2;
-        }
-#endif
-#if HAS_LENNARD_JONES
-        #if DO_LJPME
-            // The multiplicative grid term
-            tempEnergy += coef*(1.0f - expDar2*eprefac);
-            tempForce += 6.0f*coef*(1.0f - expDar2*dprefac);
-        #endif
-#endif
+    real sig = SIGMA_EPSILON1.x + SIGMA_EPSILON2.x;
+    real sig2 = invR*sig;
+    sig2 *= sig2;
+    real sig6 = sig2*sig2*sig2;
+    real eps = SIGMA_EPSILON1.y*SIGMA_EPSILON2.y;
+    real epssig6 = sig6*eps;
+    tempForce = epssig6*(12.0f*sig6 - 6.0f);
+    real ljEnergy = epssig6*(sig6 - 1.0f);
+    #if USE_LJ_SWITCH
+    if (r > LJ_SWITCH_CUTOFF) {
+        real x = r-LJ_SWITCH_CUTOFF;
+        real switchValue = 1+x*x*x*(LJ_SWITCH_C3+x*(LJ_SWITCH_C4+x*LJ_SWITCH_C5));
+        real switchDeriv = x*x*(3*LJ_SWITCH_C3+x*(4*LJ_SWITCH_C4+x*5*LJ_SWITCH_C5));
+        tempForce = tempForce*switchValue - ljEnergy*switchDeriv*r;
+        ljEnergy *= switchValue;
    }
-    else {
-#if HAS_LENNARD_JONES
-        real sig = SIGMA_EPSILON1.x + SIGMA_EPSILON2.x;
-        real sig2 = invR*sig;
-        sig2 *= sig2;
-        real sig6 = sig2*sig2*sig2;
-        real eps = SIGMA_EPSILON1.y*SIGMA_EPSILON2.y;
-        real epssig6 = sig6*eps;
-        tempForce = epssig6*(12.0f*sig6 - 6.0f);
-        real ljEnergy = epssig6*(sig6 - 1.0f);
-        #if USE_LJ_SWITCH
-        if (r > LJ_SWITCH_CUTOFF) {
-            real x = r-LJ_SWITCH_CUTOFF;
-            real switchValue = 1+x*x*x*(LJ_SWITCH_C3+x*(LJ_SWITCH_C4+x*LJ_SWITCH_C5));
-            real switchDeriv = x*x*(3*LJ_SWITCH_C3+x*(4*LJ_SWITCH_C4+x*5*LJ_SWITCH_C5));
-            tempForce = tempForce*switchValue - ljEnergy*switchDeriv*r;
-            ljEnergy *= switchValue;
-        }
-        #endif
+    #endif
 #if DO_LJPME
-        // The multiplicative grid term
-        ljEnergy += coef*(1.0f - expDar2*eprefac);
-        tempForce += 6.0f*coef*(1.0f - expDar2*dprefac);
-        // The potential shift accounts for the step at the cutoff introduced by the
-        // transition from additive to multiplicative combintion rules and is only
-        // needed for the real (not excluded) terms.  By addin these terms to ljEnergy
-        // instead of tempEnergy here, the includeInteraction mask is correctly applied.
-        sig2 = sig*sig;
-        sig6 = sig2*sig2*sig2*INVCUT6;
-        epssig6 = eps*sig6;
-        // The additive part of the potential shift
-        ljEnergy += epssig6*(1.0f - sig6);
-        // The multiplicative part of the potential shift
-        ljEnergy += MULTSHIFT6*c6;
+    // The multiplicative term to correct for the multiplicative terms that are always
+    // present in reciprocal space.
+    const real dispersionAlphaR = EWALD_DISPERSION_ALPHA*r;
+    const real dar2 = dispersionAlphaR*dispersionAlphaR;
+    const real dar4 = dar2*dar2;
+    const real dar6 = dar4*dar2;
+    const real invR2 = invR*invR;
+    const real expDar2 = EXP(-dar2);
+    const float2 sigExpProd = SIGMA_EPSILON1*SIGMA_EPSILON2;
+    const real c6 = 64*sigExpProd.x*sigExpProd.x*sigExpProd.x*sigExpProd.y;
+    const real coef = invR2*invR2*invR2*c6;
+    const real eprefac = 1.0f + dar2 + 0.5f*dar4;
+    const real dprefac = eprefac + dar6/6.0f;
+    // The multiplicative grid term
+    ljEnergy += coef*(1.0f - expDar2*eprefac);
+    tempForce += 6.0f*coef*(1.0f - expDar2*dprefac);
+    // The potential shift accounts for the step at the cutoff introduced by the
+    // transition from additive to multiplicative combintion rules and is only
+    // needed for the real (not excluded) terms.  By addin these terms to ljEnergy
+    // instead of tempEnergy here, the includeInteraction mask is correctly applied.
+    sig2 = sig*sig;
+    sig6 = sig2*sig2*sig2*INVCUT6;
+    epssig6 = eps*sig6;
+    // The additive part of the potential shift
+    ljEnergy += epssig6*(1.0f - sig6);
+    // The multiplicative part of the potential shift
+    ljEnergy += MULTSHIFT6*c6;
 #endif
-        tempForce += prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
-        tempEnergy += includeInteraction ? ljEnergy + prefactor*erfcAlphaR : 0;
+    tempForce += prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
+    tempEnergy += includeInteraction ? ljEnergy + prefactor*erfcAlphaR : 0;
 #else
-        tempForce = prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
-        tempEnergy += includeInteraction ? prefactor*erfcAlphaR : 0;
+    tempForce = prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
+    tempEnergy += includeInteraction ? prefactor*erfcAlphaR : 0;
 #endif
-    }
    dEdR += includeInteraction ? tempForce*invR*invR : 0;
 #else
 #ifdef USE_CUTOFF
@@ -113,7 +81,7 @@
    unsigned int includeInteraction = (!isExcluded);
 #endif
    real tempForce = 0.0f;
-  #if HAS_LENNARD_JONES
+#if HAS_LENNARD_JONES
    real sig = SIGMA_EPSILON1.x + SIGMA_EPSILON2.x;
    real sig2 = invR*sig;
    sig2 *= sig2;
@@ -131,7 +99,7 @@
    }
    #endif
    tempEnergy += ljEnergy;
-  #endif
+#endif
 #if HAS_COULOMB
  #ifdef USE_CUTOFF
    const real prefactor = 138.935456f*CHARGE1*CHARGE2;

--- a/platforms/cuda/src/kernels/nonbondedParameters.cu
+++ b/platforms/cuda/src/kernels/nonbondedParameters.cu
@@ -7,6 +7,9 @@ extern "C" __global__ void computeParameters(mixed* __restrict__ energyBuffer, b
 #ifdef HAS_EXCEPTIONS
        , int numExceptions, const float4* __restrict__ baseExceptionParams, float4* __restrict__ exceptionParams,
        float4* __restrict__ exceptionParamOffsets, int* __restrict__ exceptionOffsetIndices
+#endif
+#ifdef HAS_EXCLUSIONS
+        , int numExclusions, const int2* __restrict__ exclusionAtoms, float4* __restrict__ exclusionParams
 #endif
        ) {
    mixed energy = 0;
@@ -60,6 +63,29 @@ extern "C" __global__ void computeParameters(mixed* __restrict__ energyBuffer, b
        exceptionParams[i] = make_float4((float) (138.935456f*params.x), (float) params.y, (float) (4*params.z), 0);
    }
 #endif
+
+    // Compute parameters for subtracting the reciprocal part of excluded interactions.
+
+#ifdef HAS_EXCLUSIONS
+    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < numExclusions; i += blockDim.x*gridDim.x) {
+        int2 atoms = exclusionAtoms[i];
+#ifdef USE_POSQ_CHARGES
+        real chargeProd = posq[atoms.x].w*posq[atoms.y].w;
+#else
+        real chargeProd = charge[atoms.x]*charge[atoms.y];
+#endif
+#ifdef INCLUDE_LJPME
+        float2 sigEps1 = sigmaEpsilon[atoms.x];
+        float2 sigEps2 = sigmaEpsilon[atoms.y];
+        float sigma = sigEps1.x*sigEps2.x;
+        float epsilon = sigEps1.y*sigEps2.y;
+#else
+        float sigma = 0;
+        float epsilon = 0;
+#endif
+        exclusionParams[i] = make_float4((float) (138.935456f*chargeProd), sigma, epsilon, 0);
+    }
+#endif
    if (includeSelfEnergy)
        energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
 }
--- a/platforms/cuda/src/kernels/pmeExclusions.cu
+++ b/platforms/cuda/src/kernels/pmeExclusions.cu
+const float4 exclusionParams = PARAMS[index];
+real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
+const real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+const real invR = RSQRT(r2);
+const real r = r2*invR;
+const real alphaR = EWALD_ALPHA*r;
+const real expAlphaRSqr = EXP(-alphaR*alphaR);
+const real erfAlphaR = ERF(alphaR);
+real tempForce = 0.0f;
+if (erfAlphaR > 1e-6f) {
+    const real prefactor = exclusionParams.x*invR;
+    tempForce = -prefactor*(erfAlphaR-alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
+    energy -= prefactor*erfAlphaR;
+}
+else {
+    energy -= TWO_OVER_SQRT_PI*EWALD_ALPHA*exclusionParams.x;
+}
+#if DO_LJPME
+const real dispersionAlphaR = EWALD_DISPERSION_ALPHA*r;
+const real dar2 = dispersionAlphaR*dispersionAlphaR;
+const real dar4 = dar2*dar2;
+const real dar6 = dar4*dar2;
+const real invR2 = invR*invR;
+const real expDar2 = EXP(-dar2);
+const real c6 = 64*exclusionParams.y*exclusionParams.y*exclusionParams.y*exclusionParams.z;
+const real coef = invR2*invR2*invR2*c6;
+const real eprefac = 1.0f + dar2 + 0.5f*dar4;
+const real dprefac = eprefac + dar6/6.0f;
+energy += coef*(1.0f - expDar2*eprefac);
+tempForce += 6.0f*coef*(1.0f - expDar2*dprefac);
+#endif
+delta *= tempForce*invR*invR;
+real3 force1 = -delta;
+real3 force2 = delta;
--- a/platforms/opencl/include/OpenCLKernels.h
+++ b/platforms/opencl/include/OpenCLKernels.h
@@ -655,6 +655,8 @@ private:
    OpenCLArray charges;
    OpenCLArray sigmaEpsilon;
    OpenCLArray exceptionParams;
+    OpenCLArray exclusionAtoms;
+    OpenCLArray exclusionParams;
    OpenCLArray baseParticleParams;
    OpenCLArray baseExceptionParams;
    OpenCLArray particleParamOffsets;

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -1655,6 +1655,8 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
    hasOffsets = (force.getNumParticleParameterOffsets() > 0 || force.getNumExceptionParameterOffsets() > 0);
    if (hasOffsets)
        paramsDefines["HAS_OFFSETS"] = "1";
+    if (usePosqCharges)
+        paramsDefines["USE_POSQ_CHARGES"] = "1";
    if (nonbondedMethod == Ewald) {
        // Compute the Ewald parameters.

@@ -1853,7 +1855,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb

                    // Evaluate the actual bspline moduli for X/Y/Z.

-                    for(int dim = 0; dim < 3; dim++) {
+                    for (int dim = 0; dim < 3; dim++) {
                        int ndata = (dim == 0 ? xsize : dim == 1 ? ysize : zsize);
                        vector<cl_double> moduli(ndata);
                        for (int i = 0; i < ndata; i++) {
@@ -1883,6 +1885,37 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
        }
    }

+    // Add code to subtract off the reciprocal part of excluded interactions.
+
+    if ((nonbondedMethod == Ewald || nonbondedMethod == PME || nonbondedMethod == LJPME) && pmeio == NULL) {
+        int numContexts = cl.getPlatformData().contexts.size();
+        int startIndex = cl.getContextIndex()*force.getNumExceptions()/numContexts;
+        int endIndex = (cl.getContextIndex()+1)*force.getNumExceptions()/numContexts;
+        int numExclusions = endIndex-startIndex;
+        if (numExclusions > 0) {
+            paramsDefines["HAS_EXCLUSIONS"] = "1";
+            vector<vector<int> > atoms(numExclusions, vector<int>(2));
+            exclusionAtoms.initialize<mm_int2>(cl, numExclusions, "exclusionAtoms");
+            exclusionParams.initialize<mm_float4>(cl, numExclusions, "exclusionParams");
+            vector<mm_int2> exclusionAtomsVec(numExclusions);
+            for (int i = 0; i < numExclusions; i++) {
+                int j = i+startIndex;
+                exclusionAtomsVec[i] = mm_int2(exclusions[j].first, exclusions[j].second);
+                atoms[i][0] = exclusions[j].first;
+                atoms[i][1] = exclusions[j].second;
+            }
+            exclusionAtoms.upload(exclusionAtomsVec);
+            map<string, string> replacements;
+            replacements["PARAMS"] = cl.getBondedUtilities().addArgument(exclusionParams.getDeviceBuffer(), "float4");
+            replacements["EWALD_ALPHA"] = cl.doubleToString(alpha);
+            replacements["TWO_OVER_SQRT_PI"] = cl.doubleToString(2.0/sqrt(M_PI));
+            replacements["DO_LJPME"] = doLJPME ? "1" : "0";
+            if (doLJPME)
+                replacements["EWALD_DISPERSION_ALPHA"] = cl.doubleToString(dispersionAlpha);
+            cl.getBondedUtilities().addInteraction(atoms, cl.replaceStrings(OpenCLKernelSources::pmeExclusions, replacements), force.getForceGroup());
+        }
+    }
+
    // Add the interaction to the default nonbonded kernel.
    
    string source = cl.replaceStrings(OpenCLKernelSources::coulombLennardJones, defines);
@@ -1893,7 +1926,6 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
    if (usePosqCharges) {
        replacements["CHARGE1"] = "posq1.w";
        replacements["CHARGE2"] = "posq2.w";
-        paramsDefines["USE_POSQ_CHARGES"] = "1";
    }
    else {
        replacements["CHARGE1"] = prefix+"charge1";
@@ -2012,21 +2044,28 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
    bool deviceIsCpu = (cl.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
    if (!hasInitializedKernel) {
        hasInitializedKernel = true;
-        computeParamsKernel.setArg<cl::Buffer>(0, cl.getEnergyBuffer().getDeviceBuffer());
-        computeParamsKernel.setArg<cl::Buffer>(2, globalParams.getDeviceBuffer());
-        computeParamsKernel.setArg<cl_int>(3, cl.getPaddedNumAtoms());
-        computeParamsKernel.setArg<cl::Buffer>(4, baseParticleParams.getDeviceBuffer());
-        computeParamsKernel.setArg<cl::Buffer>(5, cl.getPosq().getDeviceBuffer());
-        computeParamsKernel.setArg<cl::Buffer>(6, charges.getDeviceBuffer());
-        computeParamsKernel.setArg<cl::Buffer>(7, sigmaEpsilon.getDeviceBuffer());
-        computeParamsKernel.setArg<cl::Buffer>(8, particleParamOffsets.getDeviceBuffer());
-        computeParamsKernel.setArg<cl::Buffer>(9, particleOffsetIndices.getDeviceBuffer());
+        int index = 0;
+        computeParamsKernel.setArg<cl::Buffer>(index++, cl.getEnergyBuffer().getDeviceBuffer());
+        index++;
+        computeParamsKernel.setArg<cl::Buffer>(index++, globalParams.getDeviceBuffer());
+        computeParamsKernel.setArg<cl_int>(index++, cl.getPaddedNumAtoms());
+        computeParamsKernel.setArg<cl::Buffer>(index++, baseParticleParams.getDeviceBuffer());
+        computeParamsKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
+        computeParamsKernel.setArg<cl::Buffer>(index++, charges.getDeviceBuffer());
+        computeParamsKernel.setArg<cl::Buffer>(index++, sigmaEpsilon.getDeviceBuffer());
+        computeParamsKernel.setArg<cl::Buffer>(index++, particleParamOffsets.getDeviceBuffer());
+        computeParamsKernel.setArg<cl::Buffer>(index++, particleOffsetIndices.getDeviceBuffer());
        if (exceptionParams.isInitialized()) {
-            computeParamsKernel.setArg<cl_int>(10, exceptionParams.getSize());
-            computeParamsKernel.setArg<cl::Buffer>(11, baseExceptionParams.getDeviceBuffer());
-            computeParamsKernel.setArg<cl::Buffer>(12, exceptionParams.getDeviceBuffer());
-            computeParamsKernel.setArg<cl::Buffer>(13, exceptionParamOffsets.getDeviceBuffer());
-            computeParamsKernel.setArg<cl::Buffer>(14, exceptionOffsetIndices.getDeviceBuffer());
+            computeParamsKernel.setArg<cl_int>(index++, exceptionParams.getSize());
+            computeParamsKernel.setArg<cl::Buffer>(index++, baseExceptionParams.getDeviceBuffer());
+            computeParamsKernel.setArg<cl::Buffer>(index++, exceptionParams.getDeviceBuffer());
+            computeParamsKernel.setArg<cl::Buffer>(index++, exceptionParamOffsets.getDeviceBuffer());
+            computeParamsKernel.setArg<cl::Buffer>(index++, exceptionOffsetIndices.getDeviceBuffer());
+        }
+        if (exclusionParams.isInitialized()) {
+            computeParamsKernel.setArg<cl_int>(index++, exclusionParams.getSize());
+            computeParamsKernel.setArg<cl::Buffer>(index++, exclusionAtoms.getDeviceBuffer());
+            computeParamsKernel.setArg<cl::Buffer>(index++, exclusionParams.getDeviceBuffer());
        }
        if (cosSinSums.isInitialized()) {
            ewaldSumsKernel.setArg<cl::Buffer>(0, cl.getEnergyBuffer().getDeviceBuffer());

--- a/platforms/opencl/src/kernels/coulombLennardJones.cl
+++ b/platforms/opencl/src/kernels/coulombLennardJones.cl
@@ -5,8 +5,7 @@
    unsigned int includeInteraction;
 #endif
 #if USE_EWALD
-    bool needCorrection = hasExclusions && isExcluded && atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS;
-    includeInteraction = ((!isExcluded && r2 < CUTOFF_SQUARED) || needCorrection);
+    includeInteraction = (!isExcluded && r2 < CUTOFF_SQUARED);
    const real alphaR = EWALD_ALPHA*r;
    const real expAlphaRSqr = EXP(-alphaR*alphaR);
 #if HAS_COULOMB
@@ -27,89 +26,58 @@
 #endif
    real tempForce = 0;
 #if HAS_LENNARD_JONES
-    // The multiplicative term to correct for the multiplicative terms that are always
-    // present in reciprocal space.  The real terms have an additive contribution
-    // added in, but for excluded terms the multiplicative term is just subtracted.
-    // These factors are needed in both clauses of the needCorrection statement, so
-    // I declare them up here.
-    #if DO_LJPME
-        const real dispersionAlphaR = EWALD_DISPERSION_ALPHA*r;
-        const real dar2 = dispersionAlphaR*dispersionAlphaR;
-        const real dar4 = dar2*dar2;
-        const real dar6 = dar4*dar2;
-        const real invR2 = invR*invR;
-        const real expDar2 = EXP(-dar2);
-        const float2 sigExpProd = SIGMA_EPSILON1*SIGMA_EPSILON2;
-        const real c6 = 64*sigExpProd.x*sigExpProd.x*sigExpProd.x*sigExpProd.y;
-        const real coef = invR2*invR2*invR2*c6;
-        const real eprefac = 1.0f + dar2 + 0.5f*dar4;
-        const real dprefac = eprefac + dar6/6.0f;
-    #endif
-#endif
-    if (needCorrection) {
-        // Subtract off the part of this interaction that was included in the reciprocal space contribution.
-
-#if HAS_COULOMB
-        if (1-erfcAlphaR > 1e-6) {
-            real erfAlphaR = erf(alphaR); // Our erfc approximation is not accurate enough when r is very small, which happens with Drude particles.
-            tempForce = -prefactor*(erfAlphaR-alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
-            tempEnergy += -prefactor*erfAlphaR;
-        }
-        else {
-            includeInteraction = false;
-            tempEnergy -= TWO_OVER_SQRT_PI*EWALD_ALPHA*138.935456f*CHARGE1*CHARGE2;
-        }
-#endif
-#if HAS_LENNARD_JONES
-        #if DO_LJPME
-            // The multiplicative grid term
-            tempEnergy += coef*(1.0f - expDar2*eprefac);
-            tempForce += 6.0f*coef*(1.0f - expDar2*dprefac);
-        #endif
-#endif
+    real sig = SIGMA_EPSILON1.x + SIGMA_EPSILON2.x;
+    real sig2 = invR*sig;
+    sig2 *= sig2;
+    real sig6 = sig2*sig2*sig2;
+    real eps = SIGMA_EPSILON1.y*SIGMA_EPSILON2.y;
+    real epssig6 = sig6*eps;
+    tempForce = epssig6*(12.0f*sig6 - 6.0f);
+    real ljEnergy = epssig6*(sig6 - 1.0f);
+    #if USE_LJ_SWITCH
+    if (r > LJ_SWITCH_CUTOFF) {
+        real x = r-LJ_SWITCH_CUTOFF;
+        real switchValue = 1+x*x*x*(LJ_SWITCH_C3+x*(LJ_SWITCH_C4+x*LJ_SWITCH_C5));
+        real switchDeriv = x*x*(3*LJ_SWITCH_C3+x*(4*LJ_SWITCH_C4+x*5*LJ_SWITCH_C5));
+        tempForce = tempForce*switchValue - ljEnergy*switchDeriv*r;
+        ljEnergy *= switchValue;
    }
-    else {
-#if HAS_LENNARD_JONES
-        real sig = SIGMA_EPSILON1.x + SIGMA_EPSILON2.x;
-        real sig2 = invR*sig;
-        sig2 *= sig2;
-        real sig6 = sig2*sig2*sig2;
-        real eps = SIGMA_EPSILON1.y*SIGMA_EPSILON2.y;
-        real epssig6 = sig6*eps;
-        tempForce = epssig6*(12.0f*sig6 - 6.0f);
-        real ljEnergy = epssig6*(sig6 - 1.0f);
-        #if USE_LJ_SWITCH
-        if (r > LJ_SWITCH_CUTOFF) {
-            real x = r-LJ_SWITCH_CUTOFF;
-            real switchValue = 1+x*x*x*(LJ_SWITCH_C3+x*(LJ_SWITCH_C4+x*LJ_SWITCH_C5));
-            real switchDeriv = x*x*(3*LJ_SWITCH_C3+x*(4*LJ_SWITCH_C4+x*5*LJ_SWITCH_C5));
-            tempForce = tempForce*switchValue - ljEnergy*switchDeriv*r;
-            ljEnergy *= switchValue;
-        }
-        #endif
+    #endif
 #if DO_LJPME
-        // The multiplicative grid term
-        ljEnergy += coef*(1.0f - expDar2*eprefac);
-        tempForce += 6.0f*coef*(1.0f - expDar2*dprefac);
-        // The potential shift accounts for the step at the cutoff introduced by the
-        // transition from additive to multiplicative combintion rules and is only
-        // needed for the real (not excluded) terms.  By addin these terms to ljEnergy
-        // instead of tempEnergy here, the includeInteraction mask is correctly applied.
-        sig2 = sig*sig;
-        sig6 = sig2*sig2*sig2*INVCUT6;
-        epssig6 = eps*sig6;
-        // The additive part of the potential shift
-        ljEnergy += epssig6*(1.0f - sig6);
-        // The multiplicative part of the potential shift
-        ljEnergy += MULTSHIFT6*c6;
+    // The multiplicative term to correct for the multiplicative terms that are always
+    // present in reciprocal space.
+    const real dispersionAlphaR = EWALD_DISPERSION_ALPHA*r;
+    const real dar2 = dispersionAlphaR*dispersionAlphaR;
+    const real dar4 = dar2*dar2;
+    const real dar6 = dar4*dar2;
+    const real invR2 = invR*invR;
+    const real expDar2 = EXP(-dar2);
+    const float2 sigExpProd = SIGMA_EPSILON1*SIGMA_EPSILON2;
+    const real c6 = 64*sigExpProd.x*sigExpProd.x*sigExpProd.x*sigExpProd.y;
+    const real coef = invR2*invR2*invR2*c6;
+    const real eprefac = 1.0f + dar2 + 0.5f*dar4;
+    const real dprefac = eprefac + dar6/6.0f;
+    // The multiplicative grid term
+    ljEnergy += coef*(1.0f - expDar2*eprefac);
+    tempForce += 6.0f*coef*(1.0f - expDar2*dprefac);
+    // The potential shift accounts for the step at the cutoff introduced by the
+    // transition from additive to multiplicative combintion rules and is only
+    // needed for the real (not excluded) terms.  By addin these terms to ljEnergy
+    // instead of tempEnergy here, the includeInteraction mask is correctly applied.
+    sig2 = sig*sig;
+    sig6 = sig2*sig2*sig2*INVCUT6;
+    epssig6 = eps*sig6;
+    // The additive part of the potential shift
+    ljEnergy += epssig6*(1.0f - sig6);
+    // The multiplicative part of the potential shift
+    ljEnergy += MULTSHIFT6*c6;
 #endif
-        tempForce += prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
-        tempEnergy += select((real) 0, ljEnergy + prefactor*erfcAlphaR, includeInteraction);
+    tempForce += prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
+    tempEnergy += select((real) 0, ljEnergy + prefactor*erfcAlphaR, includeInteraction);
 #else
-        tempForce = prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
-        tempEnergy += select((real) 0, prefactor*erfcAlphaR, includeInteraction);
+    tempForce = prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
+    tempEnergy += select((real) 0, prefactor*erfcAlphaR, includeInteraction);
 #endif
-    }
    dEdR += select((real) 0, tempForce*invR*invR, includeInteraction);
 #else
 #ifdef USE_CUTOFF

--- a/platforms/opencl/src/kernels/nonbondedParameters.cl
+++ b/platforms/opencl/src/kernels/nonbondedParameters.cl
@@ -7,6 +7,9 @@ __kernel void computeParameters(__global mixed* restrict energyBuffer, int inclu
 #ifdef HAS_EXCEPTIONS
        , int numExceptions, __global const float4* restrict baseExceptionParams, __global float4* restrict exceptionParams,
        __global float4* restrict exceptionParamOffsets, __global int* restrict exceptionOffsetIndices
+#endif
+#ifdef HAS_EXCLUSIONS
+        , int numExclusions, __global const int2* restrict exclusionAtoms, __global float4* restrict exclusionParams
 #endif
        ) {
    mixed energy = 0;
@@ -60,6 +63,29 @@ __kernel void computeParameters(__global mixed* restrict energyBuffer, int inclu
        exceptionParams[i] = (float4) ((float) (138.935456f*params.x), (float) params.y, (float) (4*params.z), 0);
    }
 #endif
+
+    // Compute parameters for subtracting the reciprocal part of excluded interactions.
+
+#ifdef HAS_EXCLUSIONS
+    for (int i = get_global_id(0); i < numExclusions; i += get_global_size(0)) {
+        int2 atoms = exclusionAtoms[i];
+#ifdef USE_POSQ_CHARGES
+        real chargeProd = posq[atoms.x].w*posq[atoms.y].w;
+#else
+        real chargeProd = charge[atoms.x]*charge[atoms.y];
+#endif
+#ifdef INCLUDE_LJPME
+        float2 sigEps1 = sigmaEpsilon[atoms.x];
+        float2 sigEps2 = sigmaEpsilon[atoms.y];
+        float sigma = sigEps1.x*sigEps2.x;
+        float epsilon = sigEps1.y*sigEps2.y;
+#else
+        float sigma = 0;
+        float epsilon = 0;
+#endif
+        exclusionParams[i] = (float4) ((float) (138.935456f*chargeProd), sigma, epsilon, 0);
+    }
+#endif
    if (includeSelfEnergy)
        energyBuffer[get_global_id(0)] += energy;
 }
--- a/platforms/opencl/src/kernels/pmeExclusions.cl
+++ b/platforms/opencl/src/kernels/pmeExclusions.cl
+const float4 exclusionParams = PARAMS[index];
+real3 delta = (real3) (pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
+const real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+const real invR = RSQRT(r2);
+const real r = r2*invR;
+const real alphaR = EWALD_ALPHA*r;
+const real expAlphaRSqr = EXP(-alphaR*alphaR);
+const real erfAlphaR = erf(alphaR);
+real tempForce = 0.0f;
+if (erfAlphaR > 1e-6f) {
+    const real prefactor = exclusionParams.x*invR;
+    tempForce = -prefactor*(erfAlphaR-alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
+    energy -= prefactor*erfAlphaR;
+}
+else {
+    energy -= TWO_OVER_SQRT_PI*EWALD_ALPHA*exclusionParams.x;
+}
+#if DO_LJPME
+const real dispersionAlphaR = EWALD_DISPERSION_ALPHA*r;
+const real dar2 = dispersionAlphaR*dispersionAlphaR;
+const real dar4 = dar2*dar2;
+const real dar6 = dar4*dar2;
+const real invR2 = invR*invR;
+const real expDar2 = EXP(-dar2);
+const real c6 = 64*exclusionParams.y*exclusionParams.y*exclusionParams.y*exclusionParams.z;
+const real coef = invR2*invR2*invR2*c6;
+const real eprefac = 1.0f + dar2 + 0.5f*dar4;
+const real dprefac = eprefac + dar6/6.0f;
+energy += coef*(1.0f - expDar2*eprefac);
+tempForce += 6.0f*coef*(1.0f - expDar2*dprefac);
+#endif
+delta *= tempForce*invR*invR;
+real3 force1 = -delta;
+real3 force2 = delta;