Converted more code to common platform (#3073)

* Converted more code to common platform
* Converted more code to common platform

Converted more code to common platform (#3073)
* Converted more code to common platform
* Converted more code to common platform
98d81730 · Peter Eastman · GitHub · 72c70cfe · 98d81730 · 98d81730
Unverified Commit 98d81730 authored Mar 19, 2021 by Peter Eastman Committed by GitHub Mar 19, 2021
14 changed files
--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -413,6 +413,8 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
    compilationDefines["ACOS"] = "acos";
    compilationDefines["ASIN"] = "asin";
    compilationDefines["ATAN"] = "atan";
+    compilationDefines["ERF"] = "erf";
+    compilationDefines["ERFC"] = "erfc";

    // Set defines for applying periodic boundary conditions.


--- a/platforms/opencl/src/OpenCLKernelFactory.cpp
+++ b/platforms/opencl/src/OpenCLKernelFactory.cpp
@@ -72,9 +72,9 @@ KernelImpl* OpenCLKernelFactory::createKernelImpl(std::string name, const Platfo
    if (name == UpdateStateDataKernel::Name())
        return new OpenCLUpdateStateDataKernel(name, platform, cl);
    if (name == ApplyConstraintsKernel::Name())
-        return new OpenCLApplyConstraintsKernel(name, platform, cl);
+        return new CommonApplyConstraintsKernel(name, platform, cl);
    if (name == VirtualSitesKernel::Name())
-        return new OpenCLVirtualSitesKernel(name, platform, cl);
+        return new CommonVirtualSitesKernel(name, platform, cl);
    if (name == CalcHarmonicBondForceKernel::Name())
        return new CommonCalcHarmonicBondForceKernel(name, platform, cl, context.getSystem());
    if (name == CalcCustomBondForceKernel::Name())
@@ -134,7 +134,7 @@ KernelImpl* OpenCLKernelFactory::createKernelImpl(std::string name, const Platfo
    if (name == IntegrateNoseHooverStepKernel::Name())
        return new CommonIntegrateNoseHooverStepKernel(name, platform, cl);
    if (name == ApplyMonteCarloBarostatKernel::Name())
-        return new OpenCLApplyMonteCarloBarostatKernel(name, platform, cl);
+        return new CommonApplyMonteCarloBarostatKernel(name, platform, cl);
    if (name == RemoveCMMotionKernel::Name())
        return new CommonRemoveCMMotionKernel(name, platform, cl);
    throw OpenMMException((std::string("Tried to create kernel with illegal kernel name '")+name+"'").c_str());

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -27,23 +27,14 @@
 #include "OpenCLKernels.h"
 #include "OpenCLForceInfo.h"
 #include "openmm/Context.h"
-#include "openmm/internal/AndersenThermostatImpl.h"
 #include "openmm/internal/ContextImpl.h"
-#include "openmm/internal/CustomCompoundBondForceImpl.h"
-#include "openmm/internal/CustomHbondForceImpl.h"
 #include "openmm/internal/NonbondedForceImpl.h"
-#include "openmm/internal/OSRngSeed.h"
+#include "CommonKernelSources.h"
 #include "OpenCLBondedUtilities.h"
 #include "OpenCLExpressionUtilities.h"
 #include "OpenCLIntegrationUtilities.h"
 #include "OpenCLNonbondedUtilities.h"
 #include "OpenCLKernelSources.h"
-#include "lepton/CustomFunction.h"
-#include "lepton/ExpressionTreeNode.h"
-#include "lepton/Operation.h"
-#include "lepton/Parser.h"
-#include "lepton/ParsedExpression.h"
-#include "ReferenceTabulatedFunction.h"
 #include "SimTKOpenMMRealType.h"
 #include "SimTKOpenMMUtilities.h"
 #include <algorithm>
@@ -54,14 +45,6 @@

 using namespace OpenMM;
 using namespace std;
-using namespace Lepton;
-
-static void setPosqCorrectionArg(OpenCLContext& cl, cl::Kernel& kernel, int index) {
-    if (cl.getUseMixedPrecision())
-        kernel.setArg<cl::Buffer>(index, cl.getPosqCorrection().getDeviceBuffer());
-    else
-        kernel.setArg(index, sizeof(void*), NULL);
-}

 static void setPeriodicBoxSizeArg(OpenCLContext& cl, cl::Kernel& kernel, int index) {
    if (cl.getUseDoublePrecision())
@@ -87,40 +70,6 @@ static void setPeriodicBoxArgs(OpenCLContext& cl, cl::Kernel& kernel, int index)
    }
 }

-static bool isZeroExpression(const Lepton::ParsedExpression& expression) {
-    const Lepton::Operation& op = expression.getRootNode().getOperation();
-    if (op.getId() != Lepton::Operation::CONSTANT)
-        return false;
-    return (dynamic_cast<const Lepton::Operation::Constant&>(op).getValue() == 0.0);
-}
-
-static bool usesVariable(const Lepton::ExpressionTreeNode& node, const string& variable) {
-    const Lepton::Operation& op = node.getOperation();
-    if (op.getId() == Lepton::Operation::VARIABLE && op.getName() == variable)
-        return true;
-    for (auto& child : node.getChildren())
-        if (usesVariable(child, variable))
-            return true;
-    return false;
-}
-
-static bool usesVariable(const Lepton::ParsedExpression& expression, const string& variable) {
-    return usesVariable(expression.getRootNode(), variable);
-}
-
-static pair<ExpressionTreeNode, string> makeVariable(const string& name, const string& value) {
-    return make_pair(ExpressionTreeNode(new Operation::Variable(name)), value);
-}
-
-static void replaceFunctionsInExpression(map<string, CustomFunction*>& functions, ExpressionProgram& expression) {
-    for (int i = 0; i < expression.getNumOperations(); i++) {
-        if (expression.getOperation(i).getId() == Operation::CUSTOM) {
-            const Operation::Custom& op = dynamic_cast<const Operation::Custom&>(expression.getOperation(i));
-            expression.setOperation(i, new Operation::Custom(op.getName(), functions[op.getName()]->clone(), op.getDerivOrder()));
-        }
-    }
-}
-
 void OpenCLCalcForcesAndEnergyKernel::initialize(const System& system) {
 }

@@ -479,38 +428,6 @@ void OpenCLUpdateStateDataKernel::loadCheckpoint(ContextImpl& context, istream&
        listener->execute();
 }

-void OpenCLApplyConstraintsKernel::initialize(const System& system) {
-}
-
-void OpenCLApplyConstraintsKernel::apply(ContextImpl& context, double tol) {
-    if (!hasInitializedKernel) {
-        hasInitializedKernel = true;
-        map<string, string> defines;
-        defines["NUM_ATOMS"] = cl.intToString(cl.getNumAtoms());
-        cl::Program program = cl.createProgram(OpenCLKernelSources::constraints, defines);
-        applyDeltasKernel = cl::Kernel(program, "applyPositionDeltas");
-        applyDeltasKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
-        setPosqCorrectionArg(cl, applyDeltasKernel, 1);
-        applyDeltasKernel.setArg<cl::Buffer>(2, cl.getIntegrationUtilities().getPosDelta().getDeviceBuffer());
-    }
-    OpenCLIntegrationUtilities& integration = cl.getIntegrationUtilities();
-    cl.clearBuffer(integration.getPosDelta());
-    integration.applyConstraints(tol);
-    cl.executeKernel(applyDeltasKernel, cl.getNumAtoms());
-    integration.computeVirtualSites();
-}
-
-void OpenCLApplyConstraintsKernel::applyToVelocities(ContextImpl& context, double tol) {
-    cl.getIntegrationUtilities().applyVelocityConstraints(tol);
-}
-
-void OpenCLVirtualSitesKernel::initialize(const System& system) {
-}
-
-void OpenCLVirtualSitesKernel::computePositions(ContextImpl& context) {
-    cl.getIntegrationUtilities().computeVirtualSites();
-}
-
 class OpenCLCalcNonbondedForceKernel::ForceInfo : public OpenCLForceInfo {
 public:
    ForceInfo(int requiredBuffers, const NonbondedForce& force) : OpenCLForceInfo(requiredBuffers), force(force) {
@@ -767,11 +684,14 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb

            map<string, string> replacements;
            replacements["NUM_ATOMS"] = cl.intToString(numParticles);
+            replacements["PADDED_NUM_ATOMS"] = cl.intToString(cl.getPaddedNumAtoms());
            replacements["KMAX_X"] = cl.intToString(kmaxx);
            replacements["KMAX_Y"] = cl.intToString(kmaxy);
            replacements["KMAX_Z"] = cl.intToString(kmaxz);
            replacements["EXP_COEFFICIENT"] = cl.doubleToString(-1.0/(4.0*alpha*alpha));
-            cl::Program program = cl.createProgram(OpenCLKernelSources::ewald, replacements);
+            replacements["ONE_4PI_EPS0"] = cl.doubleToString(ONE_4PI_EPS0);
+            replacements["M_PI"] = cl.doubleToString(M_PI);
+            cl::Program program = cl.createProgram(CommonKernelSources::ewald, replacements);
            ewaldSumsKernel = cl::Kernel(program, "calculateEwaldCosSinSums");
            ewaldForcesKernel = cl::Kernel(program, "calculateEwaldForces");
            int elementSize = (cl.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2));
@@ -1005,13 +925,13 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
            replacements["USE_PERIODIC"] = force.getExceptionsUsePeriodicBoundaryConditions() ? "1" : "0";
            if (doLJPME)
                replacements["EWALD_DISPERSION_ALPHA"] = cl.doubleToString(dispersionAlpha);
-            cl.getBondedUtilities().addInteraction(atoms, cl.replaceStrings(OpenCLKernelSources::pmeExclusions, replacements), force.getForceGroup());
+            cl.getBondedUtilities().addInteraction(atoms, cl.replaceStrings(CommonKernelSources::pmeExclusions, replacements), force.getForceGroup());
        }
    }

    // Add the interaction to the default nonbonded kernel.
    
-    string source = cl.replaceStrings(OpenCLKernelSources::coulombLennardJones, defines);
+    string source = cl.replaceStrings(CommonKernelSources::coulombLennardJones, defines);
    charges.initialize(cl, cl.getPaddedNumAtoms(), cl.getUseDoublePrecision() ? sizeof(double) : sizeof(float), "charges");
    baseParticleParams.initialize<mm_float4>(cl, cl.getPaddedNumAtoms(), "baseParticleParams");
    baseParticleParams.upload(baseParticleParamVec);
@@ -1059,7 +979,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
        map<string, string> replacements;
        replacements["APPLY_PERIODIC"] = (usePeriodic && force.getExceptionsUsePeriodicBoundaryConditions() ? "1" : "0");
        replacements["PARAMS"] = cl.getBondedUtilities().addArgument(exceptionParams.getDeviceBuffer(), "float4");
-        cl.getBondedUtilities().addInteraction(atoms, cl.replaceStrings(OpenCLKernelSources::nonbondedExceptions, replacements), force.getForceGroup());
+        cl.getBondedUtilities().addInteraction(atoms, cl.replaceStrings(CommonKernelSources::nonbondedExceptions, replacements), force.getForceGroup());
    }
    
    // Initialize parameter offsets.
@@ -1131,7 +1051,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
    
    // Initialize the kernel for updating parameters.
    
-    cl::Program program = cl.createProgram(OpenCLKernelSources::nonbondedParameters, paramsDefines);
+    cl::Program program = cl.createProgram(CommonKernelSources::nonbondedParameters, paramsDefines);
    computeParamsKernel = cl::Kernel(program, "computeParameters");
    computeExclusionParamsKernel = cl::Kernel(program, "computeExclusionParameters");
    info = new ForceInfo(cl.getNonbondedUtilities().getNumForceBuffers(), force);
@@ -1172,7 +1092,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
            ewaldSumsKernel.setArg<cl::Buffer>(0, cl.getEnergyBuffer().getDeviceBuffer());
            ewaldSumsKernel.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer());
            ewaldSumsKernel.setArg<cl::Buffer>(2, cosSinSums.getDeviceBuffer());
-            ewaldForcesKernel.setArg<cl::Buffer>(0, cl.getForceBuffers().getDeviceBuffer());
+            ewaldForcesKernel.setArg<cl::Buffer>(0, cl.getLongForceBuffer().getDeviceBuffer());
            ewaldForcesKernel.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer());
            ewaldForcesKernel.setArg<cl::Buffer>(2, cosSinSums.getDeviceBuffer());
        }
@@ -1332,19 +1252,13 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
    
    if (cosSinSums.isInitialized() && includeReciprocal) {
        mm_double4 boxSize = cl.getPeriodicBoxSizeDouble();
-        mm_double4 recipBoxSize = mm_double4(2*M_PI/boxSize.x, 2*M_PI/boxSize.y, 2*M_PI/boxSize.z, 0.0);
-        double recipCoefficient = ONE_4PI_EPS0*4*M_PI/(boxSize.x*boxSize.y*boxSize.z);
        if (cl.getUseDoublePrecision()) {
-            ewaldSumsKernel.setArg<mm_double4>(3, recipBoxSize);
-            ewaldSumsKernel.setArg<cl_double>(4, recipCoefficient);
-            ewaldForcesKernel.setArg<mm_double4>(3, recipBoxSize);
-            ewaldForcesKernel.setArg<cl_double>(4, recipCoefficient);
+            ewaldSumsKernel.setArg<mm_double4>(3, boxSize);
+            ewaldForcesKernel.setArg<mm_double4>(3, boxSize);
        }
        else {
-            ewaldSumsKernel.setArg<mm_float4>(3, mm_float4((float) recipBoxSize.x, (float) recipBoxSize.y, (float) recipBoxSize.z, 0));
-            ewaldSumsKernel.setArg<cl_float>(4, (cl_float) recipCoefficient);
-            ewaldForcesKernel.setArg<mm_float4>(3, mm_float4((float) recipBoxSize.x, (float) recipBoxSize.y, (float) recipBoxSize.z, 0));
-            ewaldForcesKernel.setArg<cl_float>(4, (cl_float) recipCoefficient);
+            ewaldSumsKernel.setArg<mm_float4>(3, mm_float4((float) boxSize.x, (float) boxSize.y, (float) boxSize.z, 0));
+            ewaldForcesKernel.setArg<mm_float4>(3, mm_float4((float) boxSize.x, (float) boxSize.y, (float) boxSize.z, 0));
        }
        cl.executeKernel(ewaldSumsKernel, cosSinSums.getSize());
        cl.executeKernel(ewaldForcesKernel, cl.getNumAtoms());
@@ -1671,283 +1585,3 @@ void OpenCLCalcNonbondedForceKernel::getLJPMEParameters(double& alpha, int& nx,
        nz = dispersionGridSizeZ;
    }
 }
-
-class OpenCLCalcCustomCVForceKernel::ForceInfo : public OpenCLForceInfo {
-public:
-    ForceInfo(ComputeForceInfo& force) : OpenCLForceInfo(0), force(force) {
-    }
-    bool areParticlesIdentical(int particle1, int particle2) {
-        return force.areParticlesIdentical(particle1, particle2);
-    }
-    int getNumParticleGroups() {
-        return force.getNumParticleGroups();
-    }
-    void getParticlesInGroup(int index, std::vector<int>& particles) {
-        force.getParticlesInGroup(index, particles);
-    }
-    bool areGroupsIdentical(int group1, int group2) {
-        return force.areGroupsIdentical(group1, group2);
-    }
-private:
-    ComputeForceInfo& force;
-};
-
-class OpenCLCalcCustomCVForceKernel::ReorderListener : public OpenCLContext::ReorderListener {
-public:
-    ReorderListener(OpenCLContext& cl, OpenCLArray& invAtomOrder) : cl(cl), invAtomOrder(invAtomOrder) {
-    }
-    void execute() {
-        vector<cl_int> invOrder(cl.getPaddedNumAtoms());
-        const vector<int>& order = cl.getAtomIndex();
-        for (int i = 0; i < order.size(); i++)
-            invOrder[order[i]] = i;
-        invAtomOrder.upload(invOrder);
-    }
-private:
-    OpenCLContext& cl;
-    OpenCLArray& invAtomOrder;
-};
-
-void OpenCLCalcCustomCVForceKernel::initialize(const System& system, const CustomCVForce& force, ContextImpl& innerContext) {
-    int numCVs = force.getNumCollectiveVariables();
-    cl.addForce(new OpenCLForceInfo(1));
-    for (int i = 0; i < force.getNumGlobalParameters(); i++)
-        globalParameterNames.push_back(force.getGlobalParameterName(i));
-    for (int i = 0; i < numCVs; i++)
-        variableNames.push_back(force.getCollectiveVariableName(i));
-    for (int i = 0; i < force.getNumEnergyParameterDerivatives(); i++) {
-        string name = force.getEnergyParameterDerivativeName(i);
-        paramDerivNames.push_back(name);
-        cl.addEnergyParameterDerivative(name);
-    }
-
-    // Create custom functions for the tabulated functions.
-
-    map<string, Lepton::CustomFunction*> functions;
-    for (int i = 0; i < (int) force.getNumTabulatedFunctions(); i++)
-        functions[force.getTabulatedFunctionName(i)] = createReferenceTabulatedFunction(force.getTabulatedFunction(i));
-
-    // Create the expressions.
-
-    Lepton::ParsedExpression energyExpr = Lepton::Parser::parse(force.getEnergyFunction(), functions);
-    energyExpression = energyExpr.createProgram();
-    variableDerivExpressions.clear();
-    for (auto& name : variableNames)
-        variableDerivExpressions.push_back(energyExpr.differentiate(name).optimize().createProgram());
-    paramDerivExpressions.clear();
-    for (auto& name : paramDerivNames)
-        paramDerivExpressions.push_back(energyExpr.differentiate(name).optimize().createProgram());
-
-    // Delete the custom functions.
-
-    for (auto& function : functions)
-        delete function.second;
-
-    // Copy parameter derivatives from the inner context.
-
-    OpenCLContext& cl2 = *reinterpret_cast<OpenCLPlatform::PlatformData*>(innerContext.getPlatformData())->contexts[0];
-    for (auto& param : cl2.getEnergyParamDerivNames())
-        cl.addEnergyParameterDerivative(param);
-    
-    // Create arrays for storing information.
-    
-    int elementSize = (cl.getUseDoublePrecision() || cl.getUseMixedPrecision() ? sizeof(double) : sizeof(float));
-    cvForces.resize(numCVs);
-    for (int i = 0; i < numCVs; i++)
-        cvForces[i].initialize(cl, cl.getNumAtoms(), 4*elementSize, "cvForce");
-    invAtomOrder.initialize<cl_int>(cl, cl.getPaddedNumAtoms(), "invAtomOrder");
-    innerInvAtomOrder.initialize<cl_int>(cl, cl.getPaddedNumAtoms(), "innerInvAtomOrder");
-    
-    // Create the kernels.
-    
-    stringstream args, add;
-    for (int i = 0; i < numCVs; i++) {
-        args << ", __global real4* restrict force" << i << ", real dEdV" << i;
-        add << "f += force" << i << "[i]*dEdV" << i << ";\n";
-    }
-    map<string, string> replacements;
-    replacements["PARAMETER_ARGUMENTS"] = args.str();
-    replacements["ADD_FORCES"] = add.str();
-    cl::Program program = cl.createProgram(cl.replaceStrings(OpenCLKernelSources::customCVForce, replacements));
-    copyStateKernel = cl::Kernel(program, "copyState");
-    copyForcesKernel = cl::Kernel(program, "copyForces");
-    addForcesKernel = cl::Kernel(program, "addForces");
-
-    // This context needs to respect all forces in the inner context when reordering atoms.
-
-    for (auto* info : cl2.getForceInfos())
-        cl.addForce(new ForceInfo(*info));
-}
-
-double OpenCLCalcCustomCVForceKernel::execute(ContextImpl& context, ContextImpl& innerContext, bool includeForces, bool includeEnergy) {
-    copyState(context, innerContext);
-    int numCVs = variableNames.size();
-    int numAtoms = cl.getNumAtoms();
-    OpenCLContext& cl2 = *reinterpret_cast<OpenCLPlatform::PlatformData*>(innerContext.getPlatformData())->contexts[0];
-    vector<double> cvValues;
-    vector<map<string, double> > cvDerivs(numCVs);
-    for (int i = 0; i < numCVs; i++) {
-        cvValues.push_back(innerContext.calcForcesAndEnergy(true, true, 1<<i));
-        copyForcesKernel.setArg<cl::Buffer>(0, cvForces[i].getDeviceBuffer());
-        cl.executeKernel(copyForcesKernel, numAtoms);
-        innerContext.getEnergyParameterDerivatives(cvDerivs[i]);
-    }
-    
-    // Compute the energy and forces.
-    
-    map<string, double> variables;
-    for (auto& name : globalParameterNames)
-        variables[name] = context.getParameter(name);
-    for (int i = 0; i < numCVs; i++)
-        variables[variableNames[i]] = cvValues[i];
-    double energy = energyExpression.evaluate(variables);
-    for (int i = 0; i < numCVs; i++) {
-        double dEdV = variableDerivExpressions[i].evaluate(variables);
-        if (cl.getUseDoublePrecision())
-            addForcesKernel.setArg<cl_double>(2*i+3, dEdV);
-        else
-            addForcesKernel.setArg<cl_float>(2*i+3, dEdV);
-    }
-    cl.executeKernel(addForcesKernel, numAtoms);
-    
-    // Compute the energy parameter derivatives.
-    
-    map<string, double>& energyParamDerivs = cl.getEnergyParamDerivWorkspace();
-    for (int i = 0; i < paramDerivExpressions.size(); i++)
-        energyParamDerivs[paramDerivNames[i]] += paramDerivExpressions[i].evaluate(variables);
-    for (int i = 0; i < numCVs; i++) {
-        double dEdV = variableDerivExpressions[i].evaluate(variables);
-        for (auto& deriv : cvDerivs[i])
-            energyParamDerivs[deriv.first] += dEdV*deriv.second;
-    }
-    return energy;
-}
-
-void OpenCLCalcCustomCVForceKernel::copyState(ContextImpl& context, ContextImpl& innerContext) {
-    int numAtoms = cl.getNumAtoms();
-    OpenCLContext& cl2 = *reinterpret_cast<OpenCLPlatform::PlatformData*>(innerContext.getPlatformData())->contexts[0];
-    if (!hasInitializedKernels) {
-        hasInitializedKernels = true;
-        
-        // Initialize the listeners.
-        
-        ReorderListener* listener1 = new ReorderListener(cl, invAtomOrder);
-        ReorderListener* listener2 = new ReorderListener(cl2, innerInvAtomOrder);
-        cl.addReorderListener(listener1);
-        cl2.addReorderListener(listener2);
-        listener1->execute();
-        listener2->execute();
-        
-        // Initialize the kernels.
-        
-        copyStateKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
-        copyStateKernel.setArg<cl::Buffer>(2, cl.getVelm().getDeviceBuffer());
-        copyStateKernel.setArg<cl::Buffer>(3, cl.getAtomIndexArray().getDeviceBuffer());
-        copyStateKernel.setArg<cl::Buffer>(4, cl2.getPosq().getDeviceBuffer());
-        copyStateKernel.setArg<cl::Buffer>(6, cl2.getVelm().getDeviceBuffer());
-        copyStateKernel.setArg<cl::Buffer>(7, innerInvAtomOrder.getDeviceBuffer());
-        copyStateKernel.setArg<cl_int>(8, numAtoms);
-        if (cl.getUseMixedPrecision()) {
-            copyStateKernel.setArg<cl::Buffer>(1, cl.getPosqCorrection().getDeviceBuffer());
-            copyStateKernel.setArg<cl::Buffer>(5, cl2.getPosqCorrection().getDeviceBuffer());
-        }
-        else {
-            copyStateKernel.setArg(1, sizeof(void*), NULL);
-            copyStateKernel.setArg(5, sizeof(void*), NULL);
-        }
-
-        copyForcesKernel.setArg<cl::Buffer>(1, invAtomOrder.getDeviceBuffer());
-        copyForcesKernel.setArg<cl::Buffer>(2, cl2.getForce().getDeviceBuffer());
-        copyForcesKernel.setArg<cl::Buffer>(3, cl2.getAtomIndexArray().getDeviceBuffer());
-        copyForcesKernel.setArg<cl_int>(4, numAtoms);
-
-        addForcesKernel.setArg<cl::Buffer>(0, cl.getForce().getDeviceBuffer());
-        addForcesKernel.setArg<cl_int>(1, numAtoms);
-        for (int i = 0; i < cvForces.size(); i++)
-            addForcesKernel.setArg<cl::Buffer>(2*i+2, cvForces[i].getDeviceBuffer());
-    }
-    cl.executeKernel(copyStateKernel, numAtoms);
-    Vec3 a, b, c;
-    context.getPeriodicBoxVectors(a, b, c);
-    innerContext.setPeriodicBoxVectors(a, b, c);
-    innerContext.setTime(context.getTime());
-    map<string, double> innerParameters = innerContext.getParameters();
-    for (auto& param : innerParameters)
-        innerContext.setParameter(param.first, context.getParameter(param.first));
-}
-
-void OpenCLCalcCustomCVForceKernel::copyParametersToContext(ContextImpl& context, const CustomCVForce& force) {
-    // Create custom functions for the tabulated functions.
-
-    map<string, CustomFunction*> functions;
-    for (int i = 0; i < (int) force.getNumTabulatedFunctions(); i++)
-        functions[force.getTabulatedFunctionName(i)] = createReferenceTabulatedFunction(force.getTabulatedFunction(i));
-
-    // Replace tabulated functions in the expressions.
-
-    replaceFunctionsInExpression(functions, energyExpression);
-    for (auto& expression : variableDerivExpressions)
-        replaceFunctionsInExpression(functions, expression);
-    for (auto& expression : paramDerivExpressions)
-        replaceFunctionsInExpression(functions, expression);
-
-    // Delete the custom functions.
-
-    for (auto& function : functions)
-        delete function.second;
-}
-
-void OpenCLApplyMonteCarloBarostatKernel::initialize(const System& system, const Force& thermostat) {
-    savedPositions.initialize(cl, cl.getPaddedNumAtoms(), cl.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4), "savedPositions");
-    savedForces.initialize(cl, cl.getPaddedNumAtoms(), cl.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4), "savedForces");
-    cl::Program program = cl.createProgram(OpenCLKernelSources::monteCarloBarostat);
-    kernel = cl::Kernel(program, "scalePositions");
-}
-
-void OpenCLApplyMonteCarloBarostatKernel::scaleCoordinates(ContextImpl& context, double scaleX, double scaleY, double scaleZ) {
-    if (!hasInitializedKernels) {
-        hasInitializedKernels = true;
-
-        // Create the arrays with the molecule definitions.
-
-        vector<vector<int> > molecules = context.getMolecules();
-        numMolecules = molecules.size();
-        moleculeAtoms.initialize<int>(cl, cl.getNumAtoms(), "moleculeAtoms");
-        moleculeStartIndex.initialize<int>(cl, numMolecules+1, "moleculeStartIndex");
-        vector<int> atoms(moleculeAtoms.getSize());
-        vector<int> startIndex(moleculeStartIndex.getSize());
-        int index = 0;
-        for (int i = 0; i < numMolecules; i++) {
-            startIndex[i] = index;
-            for (int molecule : molecules[i])
-                atoms[index++] = molecule;
-        }
-        startIndex[numMolecules] = index;
-        moleculeAtoms.upload(atoms);
-        moleculeStartIndex.upload(startIndex);
-
-        // Initialize the kernel arguments.
-        
-        kernel.setArg<cl_int>(3, numMolecules);
-        kernel.setArg<cl::Buffer>(9, cl.getPosq().getDeviceBuffer());
-        kernel.setArg<cl::Buffer>(10, moleculeAtoms.getDeviceBuffer());
-        kernel.setArg<cl::Buffer>(11, moleculeStartIndex.getDeviceBuffer());
-    }
-    int bytesToCopy = cl.getPosq().getSize()*(cl.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4));
-    cl.getQueue().enqueueCopyBuffer(cl.getPosq().getDeviceBuffer(), savedPositions.getDeviceBuffer(), 0, 0, bytesToCopy);
-    cl.getQueue().enqueueCopyBuffer(cl.getForce().getDeviceBuffer(), savedForces.getDeviceBuffer(), 0, 0, bytesToCopy);
-    kernel.setArg<cl_float>(0, (cl_float) scaleX);
-    kernel.setArg<cl_float>(1, (cl_float) scaleY);
-    kernel.setArg<cl_float>(2, (cl_float) scaleZ);
-    setPeriodicBoxArgs(cl, kernel, 4);
-    cl.executeKernel(kernel, cl.getNumAtoms());
-    for (auto& offset : cl.getPosCellOffsets())
-        offset = mm_int4(0, 0, 0, 0);
-    lastAtomOrder = cl.getAtomIndex();
-}
-
-void OpenCLApplyMonteCarloBarostatKernel::restoreCoordinates(ContextImpl& context) {
-    int bytesToCopy = cl.getPosq().getSize()*(cl.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4));
-    cl.getQueue().enqueueCopyBuffer(savedPositions.getDeviceBuffer(), cl.getPosq().getDeviceBuffer(), 0, 0, bytesToCopy);
-    cl.getQueue().enqueueCopyBuffer(savedForces.getDeviceBuffer(), cl.getForce().getDeviceBuffer(), 0, 0, bytesToCopy);
-}
--- a/platforms/opencl/src/kernels/angleForce.cl
+++ b/platforms/opencl/src/kernels/angleForce.cl
-real4 v0 = pos2-pos1;
-real4 v1 = pos2-pos3;
-#if APPLY_PERIODIC
-APPLY_PERIODIC_TO_DELTA(v0)
-APPLY_PERIODIC_TO_DELTA(v1)
-#endif
-real4 cp = cross(v0, v1);
-real rp = cp.x*cp.x + cp.y*cp.y + cp.z*cp.z;
-rp = max(SQRT(rp), (real) 1.0e-06f);
-real r21 = v0.x*v0.x + v0.y*v0.y + v0.z*v0.z;
-real r23 = v1.x*v1.x + v1.y*v1.y + v1.z*v1.z;
-real dot = v0.x*v1.x + v0.y*v1.y + v0.z*v1.z;
-real cosine = clamp(dot*RSQRT(r21*r23), (real) -1, (real) 1);
-real theta = acos(cosine);
-COMPUTE_FORCE
-real4 force1 = cross(v0, cp)*(dEdAngle/(r21*rp));
-real4 force3 = cross(cp, v1)*(dEdAngle/(r23*rp));
-real4 force2 = -force1-force3;
--- a/platforms/opencl/src/kernels/bondForce.cl
+++ b/platforms/opencl/src/kernels/bondForce.cl
-real4 delta = pos2-pos1;
-#if APPLY_PERIODIC
-APPLY_PERIODIC_TO_DELTA(delta)
-#endif
-real r = SQRT(delta.x*delta.x + delta.y*delta.y + delta.z*delta.z);
-COMPUTE_FORCE
-dEdR = (r > 0.0f) ? (dEdR / r) : 0.0f;
-delta.xyz *= dEdR;
-real4 force1 = delta;
-real4 force2 = -delta;
\ No newline at end of file
--- a/platforms/opencl/src/kernels/constraints.cl
+++ b/platforms/opencl/src/kernels/constraints.cl
-__kernel void applyPositionDeltas(__global real4* restrict posq, __global real4* restrict posqCorrection, __global mixed4* restrict posDelta) {
-    for (unsigned int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
-#ifdef USE_MIXED_PRECISION
-        real4 pos1 = posq[index];
-        real4 pos2 = posqCorrection[index];
-        mixed4 pos = (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
-#else
-        mixed4 pos = posq[index];
-#endif
-        pos.xyz += posDelta[index].xyz;
-#ifdef USE_MIXED_PRECISION
-        posq[index] = (real4) ((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
-        posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
-#else
-        posq[index] = pos;
-#endif
-    }
-}
--- a/platforms/opencl/src/kernels/coulombLennardJones.cl
+++ b/platforms/opencl/src/kernels/coulombLennardJones.cl
-{
-#ifdef USE_DOUBLE_PRECISION
-    unsigned long includeInteraction;
-#else
-    unsigned int includeInteraction;
-#endif
-#if USE_EWALD
-    includeInteraction = (!isExcluded && r2 < CUTOFF_SQUARED);
-    const real alphaR = EWALD_ALPHA*r;
-    const real expAlphaRSqr = EXP(-alphaR*alphaR);
-#if HAS_COULOMB
-    const real prefactor = ONE_4PI_EPS0*CHARGE1*CHARGE2*invR;
-#else
-    const real prefactor = 0.0f;
-#endif
-
-#ifdef USE_DOUBLE_PRECISION
-    const real erfcAlphaR = erfc(alphaR);
-#else
-    // This approximation for erfc is from Abramowitz and Stegun (1964) p. 299.  They cite the following as
-    // the original source: C. Hastings, Jr., Approximations for Digital Computers (1955).  It has a maximum
-    // error of 1.5e-7.
-
-    const real t = RECIP(1.0f+0.3275911f*alphaR);
-    const real erfcAlphaR = (0.254829592f+(-0.284496736f+(1.421413741f+(-1.453152027f+1.061405429f*t)*t)*t)*t)*t*expAlphaRSqr;
-#endif
-    real tempForce = 0;
-#if HAS_LENNARD_JONES
-    real sig = SIGMA_EPSILON1.x + SIGMA_EPSILON2.x;
-    real sig2 = invR*sig;
-    sig2 *= sig2;
-    real sig6 = sig2*sig2*sig2;
-    real eps = SIGMA_EPSILON1.y*SIGMA_EPSILON2.y;
-    real epssig6 = sig6*eps;
-    tempForce = epssig6*(12.0f*sig6 - 6.0f);
-    real ljEnergy = epssig6*(sig6 - 1.0f);
-    #if USE_LJ_SWITCH
-    if (r > LJ_SWITCH_CUTOFF) {
-        real x = r-LJ_SWITCH_CUTOFF;
-        real switchValue = 1+x*x*x*(LJ_SWITCH_C3+x*(LJ_SWITCH_C4+x*LJ_SWITCH_C5));
-        real switchDeriv = x*x*(3*LJ_SWITCH_C3+x*(4*LJ_SWITCH_C4+x*5*LJ_SWITCH_C5));
-        tempForce = tempForce*switchValue - ljEnergy*switchDeriv*r;
-        ljEnergy *= switchValue;
-    }
-    #endif
-#if DO_LJPME
-    // The multiplicative term to correct for the multiplicative terms that are always
-    // present in reciprocal space.
-    const real dispersionAlphaR = EWALD_DISPERSION_ALPHA*r;
-    const real dar2 = dispersionAlphaR*dispersionAlphaR;
-    const real dar4 = dar2*dar2;
-    const real dar6 = dar4*dar2;
-    const real invR2 = invR*invR;
-    const real expDar2 = EXP(-dar2);
-    const float2 sigExpProd = SIGMA_EPSILON1*SIGMA_EPSILON2;
-    const real c6 = 64*sigExpProd.x*sigExpProd.x*sigExpProd.x*sigExpProd.y;
-    const real coef = invR2*invR2*invR2*c6;
-    const real eprefac = 1.0f + dar2 + 0.5f*dar4;
-    const real dprefac = eprefac + dar6/6.0f;
-    // The multiplicative grid term
-    ljEnergy += coef*(1.0f - expDar2*eprefac);
-    tempForce += 6.0f*coef*(1.0f - expDar2*dprefac);
-    // The potential shift accounts for the step at the cutoff introduced by the
-    // transition from additive to multiplicative combintion rules and is only
-    // needed for the real (not excluded) terms.  By addin these terms to ljEnergy
-    // instead of tempEnergy here, the includeInteraction mask is correctly applied.
-    sig2 = sig*sig;
-    sig6 = sig2*sig2*sig2*INVCUT6;
-    epssig6 = eps*sig6;
-    // The additive part of the potential shift
-    ljEnergy += epssig6*(1.0f - sig6);
-    // The multiplicative part of the potential shift
-    ljEnergy += MULTSHIFT6*c6;
-#endif
-    tempForce += prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
-    tempEnergy += select((real) 0, ljEnergy + prefactor*erfcAlphaR, includeInteraction);
-#else
-    tempForce = prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
-    tempEnergy += select((real) 0, prefactor*erfcAlphaR, includeInteraction);
-#endif
-    dEdR += select((real) 0, tempForce*invR*invR, includeInteraction);
-#else
-#ifdef USE_CUTOFF
-    includeInteraction = (!isExcluded && r2 < CUTOFF_SQUARED);
-#else
-    includeInteraction = (!isExcluded);
-#endif
-    real tempForce = 0;
-  #if HAS_LENNARD_JONES
-    real sig = SIGMA_EPSILON1.x + SIGMA_EPSILON2.x;
-    real sig2 = invR*sig;
-    sig2 *= sig2;
-    real sig6 = sig2*sig2*sig2;
-    real epssig6 = sig6*(SIGMA_EPSILON1.y*SIGMA_EPSILON2.y);
-    tempForce = epssig6*(12.0f*sig6 - 6.0f);
-    real ljEnergy = epssig6*(sig6-1);
-    #if USE_LJ_SWITCH
-    if (r > LJ_SWITCH_CUTOFF) {
-        real x = r-LJ_SWITCH_CUTOFF;
-        real switchValue = 1+x*x*x*(LJ_SWITCH_C3+x*(LJ_SWITCH_C4+x*LJ_SWITCH_C5));
-        real switchDeriv = x*x*(3*LJ_SWITCH_C3+x*(4*LJ_SWITCH_C4+x*5*LJ_SWITCH_C5));
-        tempForce = tempForce*switchValue - ljEnergy*switchDeriv*r;
-        ljEnergy *= switchValue;
-    }
-    #endif
-    ljEnergy = select((real) 0, ljEnergy, includeInteraction);
-    tempEnergy += ljEnergy;
-  #endif
-#if HAS_COULOMB
-  #ifdef USE_CUTOFF
-    const real prefactor = ONE_4PI_EPS0*CHARGE1*CHARGE2;
-    tempForce += prefactor*(invR - 2.0f*REACTION_FIELD_K*r2);
-    tempEnergy += select((real) 0, prefactor*(invR + REACTION_FIELD_K*r2 - REACTION_FIELD_C), includeInteraction);
-  #else
-    const real prefactor = ONE_4PI_EPS0*CHARGE1*CHARGE2*invR;
-    tempForce += prefactor;
-    tempEnergy += select((real) 0, prefactor, includeInteraction);
-  #endif
-#endif
-    dEdR += select((real) 0, tempForce*invR*invR, includeInteraction);
-#endif
-}
--- a/platforms/opencl/src/kernels/customCVForce.cl
+++ b/platforms/opencl/src/kernels/customCVForce.cl
-/**
- * Copy the positions and velocities to the inner context.
- *
- * Each work-item starts at get_global_id(0) and steps by get_global_size(0) until
- * numAtoms is reached. Entry i of posq/velm is stored at slot
- * innerInvAtomOrder[atomOrder[i]] of innerPosq/innerVelm. posqCorrection is copied
- * to innerPosqCorrection only when USE_MIXED_PRECISION is defined.
- */
-__kernel void copyState(__global real4* posq, __global real4* posqCorrection, __global mixed4* velm, __global int* restrict atomOrder,
-        __global real4* innerPosq, __global real4* innerPosqCorrection, __global mixed4* innerVelm, __global int* restrict innerInvAtomOrder,
-        int numAtoms) {
-    for (int i = get_global_id(0); i < numAtoms; i += get_global_size(0)) {
-        int index = innerInvAtomOrder[atomOrder[i]]; // Destination slot in the inner arrays.
-        innerPosq[index] = posq[i];
-        innerVelm[index] = velm[i];
-#ifdef USE_MIXED_PRECISION
-        innerPosqCorrection[index] = posqCorrection[i];
-#endif
-    }
-}
-
-/**
- * Copy the forces back to the main context.
- *
- * Entry i of innerForces is stored at slot invAtomOrder[innerAtomOrder[i]] of forces.
- * The destination value is overwritten, not accumulated.
- */
-__kernel void copyForces(__global real4* forces, __global int* restrict invAtomOrder, __global real4* innerForces,
-        __global int* restrict innerAtomOrder, int numAtoms) {
-    for (int i = get_global_id(0); i < numAtoms; i += get_global_size(0)) {
-        int index = invAtomOrder[innerAtomOrder[i]]; // Destination slot in the outer force array.
-        forces[index] = innerForces[i];
-    }
-}
-
-/**
- * Add all the forces from the CVs.
- *
- * For each of the numAtoms entries, forces[i] is loaded into f, ADD_FORCES runs, and
- * f is written back. PARAMETER_ARGUMENTS and ADD_FORCES are not defined in this file;
- * presumably the host code substitutes the extra kernel arguments and the statements
- * that update f before compiling (TODO confirm).
- */
-__kernel void addForces(__global real4* forces, int numAtoms
-    PARAMETER_ARGUMENTS) {
-    for (int i = get_global_id(0); i < numAtoms; i += get_global_size(0)) {
-        real4 f = forces[i];
-        ADD_FORCES
-        forces[i] = f;
-    }
-}
\ No newline at end of file
--- a/platforms/opencl/src/kernels/ewald.cl
+++ b/platforms/opencl/src/kernels/ewald.cl
-real2 multofReal2(real2 a, real2 b) { // Product of a and b treated as complex numbers (x = real part, y = imaginary part).
-    return (real2) (a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x);
-}
-
-/**
- * Precompute the cosine and sine sums which appear in each force term.
- *
- * Each flat index in [0, totalK) is decoded as
- *   index = rx*ksizey*ksizez + (ry+KMAX_Y-1)*ksizez + (rz+KMAX_Z-1)
- * so ry and rz are shifted to run over negative and positive values while rx is
- * not shifted. Indices below (KMAX_Y-1)*ksizez+KMAX_Z are skipped, i.e. the rx = 0
- * entries up to and including (0, 0, 0).
- *
- * For every remaining wave vector the kernel loops over all NUM_ATOMS atoms,
- * accumulates posq.w times the complex phase factor, stores the result in
- * cosSinSum[index], and adds reciprocalCoefficient*ak*|sum|^2 to a per-work-item
- * energy that is finally added to energyBuffer[get_global_id(0)].
- *
- * NOTE(review): totalK uses ksizex = 2*KMAX_X-1, so rx reaches 2*KMAX_X-2 here,
- * whereas calculateEwaldForces only loops rx < KMAX_X -- confirm this is intended.
- */
-
-__kernel void calculateEwaldCosSinSums(__global mixed* restrict energyBuffer, __global const real4* restrict posq, __global real2* restrict cosSinSum, real4 reciprocalPeriodicBoxSize, real reciprocalCoefficient) {
-    const unsigned int ksizex = 2*KMAX_X-1;
-    const unsigned int ksizey = 2*KMAX_Y-1;
-    const unsigned int ksizez = 2*KMAX_Z-1;
-    const unsigned int totalK = ksizex*ksizey*ksizez;
-    unsigned int index = get_global_id(0);
-    mixed energy = 0;
-    while (index < (KMAX_Y-1)*ksizez+KMAX_Z) // Advance past the skipped leading indices, keeping this work-item's stride.
-        index += get_global_size(0);
-    while (index < totalK) {
-        // Find the wave vector (kx, ky, kz) this index corresponds to.
-
-        int rx = index/(ksizey*ksizez);
-        int remainder = index - rx*ksizey*ksizez;
-        int ry = remainder/ksizez;
-        int rz = remainder - ry*ksizez - KMAX_Z + 1; // Uses the still-unshifted ry, then shifts rz into [1-KMAX_Z, KMAX_Z-1].
-        ry += -KMAX_Y + 1; // Shift ry into [1-KMAX_Y, KMAX_Y-1].
-        real kx = rx*reciprocalPeriodicBoxSize.x;
-        real ky = ry*reciprocalPeriodicBoxSize.y;
-        real kz = rz*reciprocalPeriodicBoxSize.z;
-
-        // Compute the sum for this wave vector.
-
-        real2 sum = 0.0f;
-        for (int atom = 0; atom < NUM_ATOMS; atom++) {
-            real4 apos = posq[atom];
-            real phase = apos.x*kx;
-            real2 structureFactor = (real2) (cos(phase), sin(phase));
-            phase = apos.y*ky;
-            structureFactor = multofReal2(structureFactor, (real2) (cos(phase), sin(phase)));
-            phase = apos.z*kz;
-            structureFactor = multofReal2(structureFactor, (real2) (cos(phase), sin(phase)));
-            sum += apos.w*structureFactor; // Weight the phase factor by the w component of posq.
-        }
-        cosSinSum[index] = sum;
-
-        // Compute the contribution to the energy.
-
-        real k2 = kx*kx + ky*ky + kz*kz;
-        real ak = EXP(k2*EXP_COEFFICIENT) / k2;
-        energy += reciprocalCoefficient*ak*(sum.x*sum.x + sum.y*sum.y);
-        index += get_global_size(0);
-    }
-    energyBuffer[get_global_id(0)] += energy;
-}
-
-/**
- * Compute the reciprocal space part of the Ewald force, using the precomputed sums from the
- * previous routine.
- *
- * Each work-item steps over atoms by get_global_size(0). For every atom it loops
- * over rx in [0, KMAX_X), ry up to KMAX_Y-1 and rz up to KMAX_Z-1. The lower bounds
- * lowry/lowrz start at 0 and 1, so the first wave vector visited is (0, 0, 1); they
- * are then reset to 1-KMAX_Y and 1-KMAX_Z so later passes also cover negative ry/rz.
- * The contribution of each wave vector is added to the x, y and z components of
- * forceBuffers[atom]; the w component is written back unchanged.
- */
-
-__kernel void calculateEwaldForces(__global real4* restrict forceBuffers, __global const real4* restrict posq, __global const real2* restrict cosSinSum, real4 reciprocalPeriodicBoxSize, real reciprocalCoefficient) {
-    unsigned int atom = get_global_id(0);
-    while (atom < NUM_ATOMS) {
-        real4 force = forceBuffers[atom];
-        real4 apos = posq[atom];
-
-        // Loop over all wave vectors.
-
-        int lowry = 0;
-        int lowrz = 1;
-        for (int rx = 0; rx < KMAX_X; rx++) {
-            real kx = rx*reciprocalPeriodicBoxSize.x;
-            for (int ry = lowry; ry < KMAX_Y; ry++) {
-                real ky = ry*reciprocalPeriodicBoxSize.y;
-                real phase = apos.x*kx;
-                real2 tab_xy = (real2) (cos(phase), sin(phase)); // Combined x and y phase factor, reused for every rz.
-                phase = apos.y*ky;
-                tab_xy = multofReal2(tab_xy, (real2) (cos(phase), sin(phase)));
-                for (int rz = lowrz; rz < KMAX_Z; rz++) {
-                    real kz = rz*reciprocalPeriodicBoxSize.z;
-
-                    // Compute the force contribution of this wave vector.
-
-                    int index = rx*(KMAX_Y*2-1)*(KMAX_Z*2-1) + (ry+KMAX_Y-1)*(KMAX_Z*2-1) + (rz+KMAX_Z-1); // Flat position of (rx, ry, rz) in cosSinSum.
-                    real k2 = kx*kx + ky*ky + kz*kz;
-                    real ak = EXP(k2*EXP_COEFFICIENT)/k2;
-                    phase = apos.z*kz;
-                    real2 structureFactor = multofReal2(tab_xy, (real2) (cos(phase), sin(phase)));
-                    real2 sum = cosSinSum[index];
-                    real dEdR = 2*reciprocalCoefficient*ak*apos.w*(sum.x*structureFactor.y - sum.y*structureFactor.x);
-                    force.x += dEdR*kx;
-                    force.y += dEdR*ky;
-                    force.z += dEdR*kz;
-                    lowrz = 1 - KMAX_Z; // Later rz loops start from the most negative value.
-                }
-                lowry = 1 - KMAX_Y; // Later ry loops start from the most negative value.
-            }
-        }
-
-        // Record the force on the atom.
-
-        forceBuffers[atom] = force;
-        atom += get_global_size(0);
-    }
-}
--- a/platforms/opencl/src/kernels/monteCarloBarostat.cl
+++ b/platforms/opencl/src/kernels/monteCarloBarostat.cl
-/**
- * Scale the particle positions with each axis independent.
- *
- * Molecule number index owns the atoms moleculeAtoms[first..last-1], where first and
- * last come from consecutive entries of moleculeStartIndex. For each molecule the
- * mean atom position is computed and passed through APPLY_PERIODIC_TO_POS, and then
- * every atom of the molecule is translated by the same vector, so that the mean
- * position ends up at the wrapped center multiplied componentwise by
- * (scaleX, scaleY, scaleZ). Positions inside a molecule are shifted, not scaled.
- * The w component of posq is left unchanged.
- */
-
-__kernel void scalePositions(float scaleX, float scaleY, float scaleZ, int numMolecules, real4 periodicBoxSize, real4 invPeriodicBoxSize,
-        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, __global real4* restrict posq,
-        __global const int* restrict moleculeAtoms, __global const int* restrict moleculeStartIndex) {
-    for (int index = get_global_id(0); index < numMolecules; index += get_global_size(0)) {
-        int first = moleculeStartIndex[index];
-        int last = moleculeStartIndex[index+1];
-        int numAtoms = last-first;
-
-        // Find the center of each molecule.
-
-        real3 center = (real3) 0;
-        for (int atom = first; atom < last; atom++)
-            center += posq[moleculeAtoms[atom]].xyz;
-        center /= (real) numAtoms; // Unweighted mean of the atom positions.
-
-        // Move it into the first periodic box.
-
-        real3 oldCenter = center;
-        APPLY_PERIODIC_TO_POS(center)
-        real3 delta = oldCenter-center;; // The second semicolon is an empty statement.
-        real3 scaleXYZ = (real3) (scaleX, scaleY, scaleZ);
-
-        // Now scale the position of the molecule center.
-
-        delta = center*(scaleXYZ-1)-delta; // oldCenter + delta equals center*scaleXYZ.
-        for (int atom = first; atom < last; atom++) {
-            real4 pos = posq[moleculeAtoms[atom]];
-            pos.xyz += delta.xyz;
-            posq[moleculeAtoms[atom]] = pos;
-        }
-    }
-}
--- a/platforms/opencl/src/kernels/nonbondedExceptions.cl
+++ b/platforms/opencl/src/kernels/nonbondedExceptions.cl
-float4 exceptionParams = PARAMS[index]; // x scales the 1/r term, y is the length in (y/r)^6, z scales the 12-6 term. PARAMS, index, pos1, pos2 and energy come from outside this snippet.
-real4 delta = pos2-pos1;
-#if APPLY_PERIODIC
-APPLY_PERIODIC_TO_DELTA(delta)
-#endif
-real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-real invR = RSQRT(r2);
-real sig2 = invR*exceptionParams.y;
-sig2 *= sig2; // (y/r)^2
-real sig6 = sig2*sig2*sig2; // (y/r)^6
-real dEdR = exceptionParams.z*(12.0f*sig6-6.0f)*sig6;
-real tempEnergy = exceptionParams.z*(sig6-1.0f)*sig6;
-dEdR += exceptionParams.x*invR;
-dEdR *= invR*invR; // Divide by r^2 so that scaling delta below gives the force vector.
-tempEnergy += exceptionParams.x*invR;
-energy += tempEnergy;
-delta.xyz *= dEdR;
-real4 force1 = -delta; // Equal and opposite forces on the two particles.
-real4 force2 = delta;
--- a/platforms/opencl/src/kernels/nonbondedParameters.cl
+++ b/platforms/opencl/src/kernels/nonbondedParameters.cl
-/**
- * Compute the nonbonded parameters for particles and exceptions.
- *
- * Particles: starting from baseParticleParams[i], and when HAS_OFFSETS is defined,
- * every offset j in [particleOffsetIndices[i], particleOffsetIndices[i+1]) adds
- * globalParams[(int) offset.w] times offset.xyz to params.xyz. params.x is then
- * stored in posq[i].w (USE_POSQ_CHARGES) or charge[i], and sigmaEpsilon[i] is set
- * to (0.5*params.y, 2*sqrt(params.z)).
- *
- * Exceptions (only when HAS_EXCEPTIONS is defined): the same offset scheme is
- * applied to baseExceptionParams[i], and exceptionParams[i] is set to
- * (ONE_4PI_EPS0*params.x, params.y, 4*params.z, 0).
- *
- * A self-energy term is accumulated only when HAS_OFFSETS is defined, under
- * INCLUDE_EWALD and/or INCLUDE_LJPME, and it is added to
- * energyBuffer[get_global_id(0)] only when includeSelfEnergy is nonzero.
- */
-__kernel void computeParameters(__global mixed* restrict energyBuffer, int includeSelfEnergy, __global real* restrict globalParams,
-        int numAtoms, __global const float4* restrict baseParticleParams, __global real4* restrict posq, __global real* restrict charge,
-        __global float2* restrict sigmaEpsilon, __global float4* restrict particleParamOffsets, __global int* restrict particleOffsetIndices
-#ifdef HAS_EXCEPTIONS
-        , int numExceptions, __global const float4* restrict baseExceptionParams, __global float4* restrict exceptionParams,
-        __global float4* restrict exceptionParamOffsets, __global int* restrict exceptionOffsetIndices
-#endif
-        ) {
-    mixed energy = 0;
-
-    // Compute particle parameters.
-    
-    for (int i = get_global_id(0); i < numAtoms; i += get_global_size(0)) {
-        float4 params = baseParticleParams[i];
-#ifdef HAS_OFFSETS
-        int start = particleOffsetIndices[i], end = particleOffsetIndices[i+1];
-        for (int j = start; j < end; j++) {
-            float4 offset = particleParamOffsets[j];
-            real value = globalParams[(int) offset.w]; // offset.w holds the index of the global parameter.
-            params.x += value*offset.x;
-            params.y += value*offset.y;
-            params.z += value*offset.z;
-        }
-#endif
-#ifdef USE_POSQ_CHARGES
-        posq[i].w = params.x;
-#else
-        charge[i] = params.x;
-#endif
-        sigmaEpsilon[i] = (float2) (0.5f*params.y, 2*SQRT(params.z));
-#ifdef HAS_OFFSETS
-    #ifdef INCLUDE_EWALD
-        energy -= EWALD_SELF_ENERGY_SCALE*params.x*params.x;
-    #endif
-    #ifdef INCLUDE_LJPME
-        real sig3 = params.y*params.y*params.y;
-        energy += LJPME_SELF_ENERGY_SCALE*sig3*sig3*params.z;
-    #endif
-#endif
-    }
-
-    // Compute exception parameters.
-    
-#ifdef HAS_EXCEPTIONS
-    for (int i = get_global_id(0); i < numExceptions; i += get_global_size(0)) {
-        float4 params = baseExceptionParams[i];
-#ifdef HAS_OFFSETS
-        int start = exceptionOffsetIndices[i], end = exceptionOffsetIndices[i+1];
-        for (int j = start; j < end; j++) {
-            float4 offset = exceptionParamOffsets[j];
-            real value = globalParams[(int) offset.w]; // offset.w holds the index of the global parameter.
-            params.x += value*offset.x;
-            params.y += value*offset.y;
-            params.z += value*offset.z;
-        }
-#endif
-        exceptionParams[i] = (float4) ((float) (ONE_4PI_EPS0*params.x), (float) params.y, (float) (4*params.z), 0);
-    }
-#endif
-    if (includeSelfEnergy)
-        energyBuffer[get_global_id(0)] += energy;
-}
-
-/**
- * Compute parameters for subtracting the reciprocal part of excluded interactions.
- *
- * For each exclusion i, exclusionParams[i] is set to
- * (ONE_4PI_EPS0*q1*q2, sigma, epsilon, 0), where q1 and q2 are read from posq[].w
- * (USE_POSQ_CHARGES) or from charge[]. When INCLUDE_LJPME is defined, sigma and
- * epsilon are the products of the x and y components of the two atoms'
- * sigmaEpsilon entries; otherwise both are zero.
- */
-__kernel void computeExclusionParameters(__global real4* restrict posq, __global real* restrict charge, __global float2* restrict sigmaEpsilon,
-        int numExclusions, __global const int2* restrict exclusionAtoms, __global float4* restrict exclusionParams) {
-    for (int i = get_global_id(0); i < numExclusions; i += get_global_size(0)) {
-        int2 atoms = exclusionAtoms[i]; // Indices of the two atoms in this exclusion.
-#ifdef USE_POSQ_CHARGES
-        real chargeProd = posq[atoms.x].w*posq[atoms.y].w;
-#else
-        real chargeProd = charge[atoms.x]*charge[atoms.y];
-#endif
-#ifdef INCLUDE_LJPME
-        float2 sigEps1 = sigmaEpsilon[atoms.x];
-        float2 sigEps2 = sigmaEpsilon[atoms.y];
-        float sigma = sigEps1.x*sigEps2.x;
-        float epsilon = sigEps1.y*sigEps2.y;
-#else
-        float sigma = 0;
-        float epsilon = 0;
-#endif
-        exclusionParams[i] = (float4) ((float) (ONE_4PI_EPS0*chargeProd), sigma, epsilon, 0);
-    }
-}
--- a/platforms/opencl/src/kernels/pmeExclusions.cl
+++ b/platforms/opencl/src/kernels/pmeExclusions.cl
-const float4 exclusionParams = PARAMS[index]; // x scales the erf(alpha*r)/r term; y and z enter c6 under DO_LJPME. PARAMS, index, pos1, pos2 and energy come from outside this snippet.
-real3 delta = (real3) (pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
-#if USE_PERIODIC
-    APPLY_PERIODIC_TO_DELTA(delta)
-#endif
-const real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-const real r = SQRT(r2);
-const real invR = RECIP(r);
-const real alphaR = EWALD_ALPHA*r;
-const real expAlphaRSqr = EXP(-alphaR*alphaR);
-real tempForce = 0.0f;
-if (alphaR > 1e-6f) {
-    const real erfAlphaR = erf(alphaR);
-    const real prefactor = exclusionParams.x*invR;
-    tempForce = -prefactor*(erfAlphaR-alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
-    energy -= prefactor*erfAlphaR;
-}
-else {
-    energy -= TWO_OVER_SQRT_PI*EWALD_ALPHA*exclusionParams.x; // Small-r branch: no force term; matches the r -> 0 limit of prefactor*erf(alphaR) if TWO_OVER_SQRT_PI is 2/sqrt(pi) (TODO confirm).
-}
-#if DO_LJPME
-const real dispersionAlphaR = EWALD_DISPERSION_ALPHA*r;
-const real dar2 = dispersionAlphaR*dispersionAlphaR;
-const real dar4 = dar2*dar2;
-const real dar6 = dar4*dar2;
-const real invR2 = invR*invR;
-const real expDar2 = EXP(-dar2);
-const real c6 = 64*exclusionParams.y*exclusionParams.y*exclusionParams.y*exclusionParams.z;
-const real coef = invR2*invR2*invR2*c6; // c6/r^6
-const real eprefac = 1.0f + dar2 + 0.5f*dar4;
-const real dprefac = eprefac + dar6/6.0f;
-energy += coef*(1.0f - expDar2*eprefac);
-tempForce += 6.0f*coef*(1.0f - expDar2*dprefac);
-#endif
-if (r > 0) // Leave delta (which is zero here) untouched when the two positions coincide.
-    delta *= tempForce*invR*invR;
-real3 force1 = -delta; // Equal and opposite forces on the two particles.
-real3 force2 = delta;
--- a/platforms/opencl/src/kernels/torsionForce.cl
+++ b/platforms/opencl/src/kernels/torsionForce.cl
-const real PI = 3.14159265358979323846f;
-real4 v0 = (real4) (pos1.xyz-pos2.xyz, 0.0f); // pos1..pos4 come from outside this snippet.
-real4 v1 = (real4) (pos3.xyz-pos2.xyz, 0.0f);
-real4 v2 = (real4) (pos3.xyz-pos4.xyz, 0.0f);
-#if APPLY_PERIODIC
-APPLY_PERIODIC_TO_DELTA(v0)
-APPLY_PERIODIC_TO_DELTA(v1)
-APPLY_PERIODIC_TO_DELTA(v2)
-#endif
-real4 cp0 = cross(v0, v1);
-real4 cp1 = cross(v1, v2);
-real cosangle = dot(normalize(cp0), normalize(cp1)); // Cosine of the angle between the two cross products.
-real theta;
-if (cosangle > 0.99f || cosangle < -0.99f) {
-    // We're close to the singularity in acos(), so take the cross product and use asin() instead.
-
-    real4 cross_prod = cross(cp0, cp1);
-    real scale = dot(cp0, cp0)*dot(cp1, cp1);
-    theta = asin(SQRT(dot(cross_prod, cross_prod)/scale)); // |cp0 x cp1| / (|cp0| |cp1|) is the sine of the angle.
-    if (cosangle < 0)
-        theta = PI-theta;
-}
-else
-   theta = acos(cosangle);
-theta = (dot(v0, cp1) >= 0 ? theta : -theta); // Give the angle a sign based on the direction of v0 relative to cp1.
-COMPUTE_FORCE
-real normCross1 = dot(cp0, cp0); // Squared length of cp0.
-real normSqrBC = dot(v1, v1);
-real normBC = SQRT(normSqrBC);
-real normCross2 = dot(cp1, cp1); // Squared length of cp1.
-real dp = 1.0f/normSqrBC;
-real4 ff = (real4) ((-dEdAngle*normBC)/normCross1, dot(v0, v1)*dp, dot(v2, v1)*dp, (dEdAngle*normBC)/normCross2); // dEdAngle is not declared here; presumably COMPUTE_FORCE defines it (TODO confirm).
-real4 force1 = ff.x*cp0;
-real4 force4 = ff.w*cp1;
-real4 s = ff.y*force1 - ff.z*force4;
-real4 force2 = s-force1;
-real4 force3 = -s-force4; // force1 + force2 + force3 + force4 sums to zero.