Began CUDA implementation of CustomNonbondedForce

c9bf6eee · Peter Eastman · 364130a5 · c9bf6eee · c9bf6eee · c9bf6eee
Commit c9bf6eee authored Aug 06, 2009 by Peter Eastman
14 changed files
--- a/platforms/cuda/include/CudaPlatform.h
+++ b/platforms/cuda/include/CudaPlatform.h
@@ -81,8 +81,8 @@ public:
    _gpuContext* gpu;
    KernelImpl* primaryKernel;
    bool removeCM;
-    bool hasBonds, hasAngles, hasPeriodicTorsions, hasRB, hasNonbonded;
-    int nonbondedMethod;
+    bool hasBonds, hasAngles, hasPeriodicTorsions, hasRB, hasNonbonded, hasCustomNonbonded;
+    int nonbondedMethod, customNonbondedMethod;
    int cmMotionFrequency;
    int stepCount, computeForceCount;
    double time;

--- a/platforms/cuda/src/CudaKernelFactory.cpp
+++ b/platforms/cuda/src/CudaKernelFactory.cpp
@@ -47,6 +47,8 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform
        return new CudaCalcRBTorsionForceKernel(name, platform, data, context.getSystem());
    if (name == CalcNonbondedForceKernel::Name())
        return new CudaCalcNonbondedForceKernel(name, platform, data, context.getSystem());
+    if (name == CalcCustomNonbondedForceKernel::Name())
+        return new CudaCalcCustomNonbondedForceKernel(name, platform, data, context.getSystem());
    if (name == CalcGBSAOBCForceKernel::Name())
        return new CudaCalcGBSAOBCForceKernel(name, platform, data);
    if (name == IntegrateVerletStepKernel::Name())

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -51,15 +51,11 @@ static void calcForces(ContextImpl& context, CudaPlatform::PlatformData& data) {
        kReduceObcGbsaBornForces(gpu);
        kCalculateObcGbsaForces2(gpu);
    }
-    else if (data.hasNonbonded) {
+    else if (data.hasNonbonded)
        kCalculateCDLJForces(gpu);
-    }
+    if (data.hasCustomNonbonded)
+        kCalculateCustomNonbondedForces(gpu, data.hasNonbonded);
    kCalculateLocalForces(gpu);
-/*
-    if (gpu->bIncludeGBSA)
-        kReduceBornSumAndForces(gpu);
-    else
-*/
    kReduceForces(gpu);
 }

@@ -255,16 +251,15 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
    // Identify which exceptions are 1-4 interactions.

    vector<pair<int, int> > exclusions;
-    vector<int> nb14s;
+    vector<int> exceptions;
    for (int i = 0; i < force.getNumExceptions(); i++) {
        int particle1, particle2;
        double chargeProd, sigma, epsilon;
        force.getExceptionParameters(i, particle1, particle2, chargeProd, sigma, epsilon);
        exclusions.push_back(pair<int, int>(particle1, particle2));
        if (chargeProd != 0.0 || epsilon != 0.0)
-            nb14s.push_back(i);
+            exceptions.push_back(i);
    }
-    num14 = nb14s.size();

    // Initialize nonbonded interactions.
    
@@ -329,15 +324,16 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
    // Initialize 1-4 nonbonded interactions.
    
    {
-        vector<int> particle1(num14);
-        vector<int> particle2(num14);
-        vector<float> c6(num14);
-        vector<float> c12(num14);
-        vector<float> q1(num14);
-        vector<float> q2(num14);
-        for (int i = 0; i < num14; i++) {
+        int numExceptions = exceptions.size();
+        vector<int> particle1(numExceptions);
+        vector<int> particle2(numExceptions);
+        vector<float> c6(numExceptions);
+        vector<float> c12(numExceptions);
+        vector<float> q1(numExceptions);
+        vector<float> q2(numExceptions);
+        for (int i = 0; i < numExceptions; i++) {
            double charge, sig, eps;
-            force.getExceptionParameters(nb14s[i], particle1[i], particle2[i], charge, sig, eps);
+            force.getExceptionParameters(exceptions[i], particle1[i], particle2[i], charge, sig, eps);
            c6[i] = (float) (4*eps*pow(sig, 6.0));
            c12[i] = (float) (4*eps*pow(sig, 12.0));
            q1[i] = (float) charge;
@@ -358,6 +354,88 @@ double CudaCalcNonbondedForceKernel::executeEnergy(ContextImpl& context) {
    return 0.0;
 }

+CudaCalcCustomNonbondedForceKernel::~CudaCalcCustomNonbondedForceKernel() {
+}
+
+void CudaCalcCustomNonbondedForceKernel::initialize(const System& system, const CustomNonbondedForce& force) {
+    if (data.primaryKernel == NULL)
+        data.primaryKernel = this;
+    data.hasCustomNonbonded = true;
+    numParticles = force.getNumParticles();
+    _gpuContext* gpu = data.gpu;
+
+    // Identify which exceptions are actual interactions.
+
+    vector<pair<int, int> > exclusions;
+    vector<int> exceptions;
+    {
+        vector<double> parameters;
+        for (int i = 0; i < force.getNumExceptions(); i++) {
+            int particle1, particle2;
+            force.getExceptionParameters(i, particle1, particle2, parameters);
+            exclusions.push_back(pair<int, int>(particle1, particle2));
+            if (parameters.size() > 0)
+                exceptions.push_back(i);
+        }
+    }
+
+    // Initialize nonbonded interactions.
+
+    vector<int> particle(numParticles);
+    vector<vector<double> > parameters(numParticles);
+    vector<vector<int> > exclusionList(numParticles);
+    for (int i = 0; i < numParticles; i++) {
+        force.getParticleParameters(i, parameters[i]);
+        particle[i] = i;
+        exclusionList[i].push_back(i);
+    }
+    for (int i = 0; i < (int)exclusions.size(); i++) {
+        exclusionList[exclusions[i].first].push_back(exclusions[i].second);
+        exclusionList[exclusions[i].second].push_back(exclusions[i].first);
+    }
+    CudaNonbondedMethod method = NO_CUTOFF;
+    if (force.getNonbondedMethod() != CustomNonbondedForce::NoCutoff)
+        method = CUTOFF;
+    if (force.getNonbondedMethod() == CustomNonbondedForce::CutoffPeriodic) {
+        Vec3 boxVectors[3];
+        force.getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
+        gpuSetPeriodicBoxSize(gpu, (float)boxVectors[0][0], (float)boxVectors[1][1], (float)boxVectors[2][2]);
+        method = PERIODIC;
+    }
+    data.customNonbondedMethod = method;
+
+    // Initialize exceptions.
+
+    int numExceptions = exceptions.size();
+    vector<int> exceptionParticle1(numExceptions);
+    vector<int> exceptionParticle2(numExceptions);
+    vector<vector<double> > exceptionParams(numExceptions);
+    for (int i = 0; i < numExceptions; i++)
+        force.getExceptionParameters(exceptions[i], exceptionParticle1[i], exceptionParticle2[i], exceptionParams[i]);
+
+    // Record information for the expressions.
+
+    vector<string> paramNames;
+    vector<string> combiningRules;
+    for (int i = 0; i < force.getNumParameters(); i++) {
+        paramNames.push_back(force.getParameterName(i));
+        combiningRules.push_back(force.getParameterCombiningRule(i));
+    }
+    gpuSetCustomNonbondedParameters(gpu, parameters, exclusionList, exceptionParticle1, exceptionParticle2, exceptionParams, method,
+            (float)force.getCutoffDistance(), force.getEnergyFunction(), combiningRules, paramNames);
+}
+
+void CudaCalcCustomNonbondedForceKernel::executeForces(ContextImpl& context) {
+    if (data.primaryKernel == this)
+        calcForces(context, data);
+}
+
+double CudaCalcCustomNonbondedForceKernel::executeEnergy(ContextImpl& context) {
+    if (data.primaryKernel == this)
+        return calcEnergy(context, system);
+    return 0.0;
+}
+
 CudaCalcGBSAOBCForceKernel::~CudaCalcGBSAOBCForceKernel() {
 }


--- a/platforms/cuda/src/CudaKernels.h
+++ b/platforms/cuda/src/CudaKernels.h
@@ -256,7 +256,41 @@ public:
    double executeEnergy(ContextImpl& context);
 private:
    CudaPlatform::PlatformData& data;
-    int numParticles, num14;
+    int numParticles;
+    System& system;
+};
+
+/**
+ * This kernel is invoked by CustomNonbondedForce to calculate the forces acting on the system.
+ */
+class CudaCalcCustomNonbondedForceKernel : public CalcCustomNonbondedForceKernel {
+public:
+    CudaCalcCustomNonbondedForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system) : CalcCustomNonbondedForceKernel(name, platform), data(data), system(system) {
+    }
+    ~CudaCalcCustomNonbondedForceKernel();
+    /**
+     * Initialize the kernel.
+     *
+     * @param system     the System this kernel will be applied to
+     * @param force      the CustomNonbondedForce this kernel will be used for
+     */
+    void initialize(const System& system, const CustomNonbondedForce& force);
+    /**
+     * Execute the kernel to calculate the forces.
+     *
+     * @param context    the context in which to execute this kernel
+     */
+    void executeForces(ContextImpl& context);
+    /**
+     * Execute the kernel to calculate the energy.
+     *
+     * @param context    the context in which to execute this kernel
+     * @return the potential energy due to the CustomNonbondedForce
+     */
+    double executeEnergy(ContextImpl& context);
+private:
+    CudaPlatform::PlatformData& data;
+    int numParticles;
    System& system;
 };


--- a/platforms/cuda/src/CudaPlatform.cpp
+++ b/platforms/cuda/src/CudaPlatform.cpp
@@ -53,6 +53,7 @@ CudaPlatform::CudaPlatform() {
    registerKernelFactory(CalcPeriodicTorsionForceKernel::Name(), factory);
    registerKernelFactory(CalcRBTorsionForceKernel::Name(), factory);
    registerKernelFactory(CalcNonbondedForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomNonbondedForceKernel::Name(), factory);
    registerKernelFactory(CalcGBSAOBCForceKernel::Name(), factory);
    registerKernelFactory(IntegrateVerletStepKernel::Name(), factory);
    registerKernelFactory(IntegrateLangevinStepKernel::Name(), factory);
@@ -104,8 +105,8 @@ void CudaPlatform::contextDestroyed(ContextImpl& context) const {
    delete data;
 }

-CudaPlatform::PlatformData::PlatformData(_gpuContext* gpu) : gpu(gpu), removeCM(false), nonbondedMethod(0), hasBonds(false), hasAngles(false),
-        hasPeriodicTorsions(false), hasRB(false), hasNonbonded(false), primaryKernel(NULL), stepCount(0), computeForceCount(0), time(0.0) {
+CudaPlatform::PlatformData::PlatformData(_gpuContext* gpu) : gpu(gpu), removeCM(false), nonbondedMethod(0), customNonbondedMethod(0), hasBonds(false), hasAngles(false),
+        hasPeriodicTorsions(false), hasRB(false), hasNonbonded(false), hasCustomNonbonded(false), primaryKernel(NULL), stepCount(0), computeForceCount(0), time(0.0) {
    stringstream device;
    device << gpu->device;
    propertyValues[CudaPlatform::CudaDevice()] = device.str();

--- a/platforms/cuda/src/kernels/cudaKernels.h
+++ b/platforms/cuda/src/kernels/cudaKernels.h
@@ -35,6 +35,7 @@ extern void kGenerateRandoms(gpuContext gpu);
 // Main loop
 extern void kCalculateCDLJObcGbsaForces1(gpuContext gpu);
 extern void kCalculateCDLJForces(gpuContext gpu);
+extern void kCalculateCustomNonbondedForces(gpuContext gpu, bool neighborListValid);
 extern void kReduceObcGbsaBornForces(gpuContext gpu);
 extern void kCalculateObcGbsaForces2(gpuContext gpu);
 extern void kCalculateLocalForces(gpuContext gpu);
@@ -64,6 +65,8 @@ extern void SetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu);
 extern void GetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu);
 extern void SetCalculateCDLJForcesSim(gpuContext gpu);
 extern void GetCalculateCDLJForcesSim(gpuContext gpu);
+extern void SetCalculateCustomNonbondedForcesSim(gpuContext gpu);
+extern void GetCalculateCustomNonbondedForcesSim(gpuContext gpu);
 extern void SetCalculateLocalForcesSim(gpuContext gpu);
 extern void GetCalculateLocalForcesSim(gpuContext gpu);
 extern void SetCalculateObcGbsaBornSumSim(gpuContext gpu);
@@ -88,3 +91,6 @@ extern void SetBrownianUpdateSim(gpuContext gpu);
 extern void GetBrownianUpdateSim(gpuContext gpu);
 extern void SetRandomSim(gpuContext gpu);
 extern void GetRandomSim(gpuContext gpu);
+extern void SetCustomNonbondedForceExpression(const Expression<128>& expression);
+extern void SetCustomNonbondedEnergyExpression(const Expression<128>& expression);
+extern void SetCustomNonbondedCombiningRules(const Expression<64>* expressions);
--- a/platforms/cuda/src/kernels/cudatypes.h
+++ b/platforms/cuda/src/kernels/cudatypes.h
@@ -231,7 +231,7 @@ static const int G8X_RANDOM_THREADS_PER_BLOCK           = 256;
 static const int GT2XX_RANDOM_THREADS_PER_BLOCK         = 384;
 static const int G8X_NONBOND_WORKUNITS_PER_SM           = 220;
 static const int GT2XX_NONBOND_WORKUNITS_PER_SM         = 256;
-
+static const unsigned int MAX_STACK_SIZE = 8;
 enum CudaNonbondedMethod
 {
    NO_CUTOFF,
@@ -240,6 +240,18 @@ enum CudaNonbondedMethod
    EWALD
 };

+enum ExpressionOp {
+    CONSTANT = 0, VARIABLE0, VARIABLE1, VARIABLE2, VARIABLE3, VARIABLE4, VARIABLE5, VARIABLE6, VARIABLE7, VARIABLE8, CUSTOM, ADD, SUBTRACT, MULTIPLY, DIVIDE,
+        POWER, NEGATE, SQRT, EXP, LOG, SIN, COS, SEC, CSC, TAN, COT, ASIN, ACOS, ATAN, SQUARE, CUBE, RECIPROCAL, INCREMENT, DECREMENT
+};
+
+template<int SIZE>
+struct Expression {
+    int op[SIZE];
+    float arg[SIZE];
+    int length, stackSize;
+};
+
 struct cudaGmxSimulation {
    // Constants
    unsigned int    atoms;                          // Number of atoms
@@ -291,7 +303,9 @@ struct cudaGmxSimulation {
    float           bigFloat;                       // Floating point value used as a flag for Shaken atoms 
    float           epsfac;                         // Epsilon factor for CDLJ calculations
    CudaNonbondedMethod nonbondedMethod;            // How to handle nonbonded interactions
-    float           nonbondedCutoffSqr;             // Cutoff distance for CDLJ calculations
+    CudaNonbondedMethod customNonbondedMethod;      // How to handle custom nonbonded interactions
+    float           nonbondedCutoff;                // Cutoff distance for nonbonded interactions
+    float           nonbondedCutoffSqr;             // Square of the cutoff distance for nonbonded interactions
    float           periodicBoxSizeX;               // The X dimension of the periodic box
    float           periodicBoxSizeY;               // The Y dimension of the periodic box
    float           periodicBoxSizeZ;               // The Z dimension of the periodic box
@@ -324,6 +338,10 @@ struct cudaGmxSimulation {
    float           collisionFrequency;             // Collision frequency for Andersen thermostat
    float2*         pObcData;                       // Pointer to fixed Born data
    float2*         pAttr;                          // Pointer to additional atom attributes (sig, eps)
+    float4*         pCustomParams;                  // Pointer to atom parameters for custom nonbonded force
+    int4*           pCustomExceptionID;             // Atom indices for custom nonbonded exceptions
+    float4*         pCustomExceptionParams;         // Parameters for custom nonbonded exceptions
+    unsigned int    customExceptions;               // Number of custom nonbonded exceptions
    float2*         pEwaldCosSinSum;                // Pointer to the cos/sin sums (ewald)
    unsigned int    bonds;                          // Number of bonds
    int4*           pBondID;                        // Bond atom and output buffer IDs

--- a/platforms/cuda/src/kernels/gpu.cpp
+++ b/platforms/cuda/src/kernels/gpu.cpp
@@ -49,8 +49,10 @@ using namespace std;
 #include "hilbert.h"
 #include "openmm/OpenMMException.h"
 #include "quern.h"
+#include "Lepton.h"

 using OpenMM::OpenMMException;
+using Lepton::Operation;

 struct ShakeCluster {
    int centralID;
@@ -130,6 +132,116 @@ static const float BOLTZ                    =    (RGAS / KILO);            // (k

 #define DUMP_PARAMETERS 0

+template <int SIZE>
+static Expression<SIZE> createExpression(const string& expression, const Lepton::ExpressionProgram& program, const vector<string>& variables) {
+    Expression<SIZE> exp;
+    if (program.getNumOperations() > SIZE)
+        throw OpenMMException("Expression contains too many operations: "+expression);
+    exp.length = program.getNumOperations();
+    exp.stackSize = program.getStackSize();
+    for (int i = 0; i < program.getNumOperations(); i++) {
+        const Operation& op = program.getOperation(i);
+        switch (op.getId()) {
+            case Operation::CONSTANT:
+                exp.op[i] = CONSTANT;
+                exp.arg[i] = op.evaluate(NULL, map<string, double>());
+                break;
+            case Operation::VARIABLE:
+                if (variables.size() > 0 && op.getName() == variables[0])
+                    exp.op[i] = VARIABLE0;
+                else if (variables.size() > 1 && op.getName() == variables[1])
+                    exp.op[i] = VARIABLE1;
+                else if (variables.size() > 2 && op.getName() == variables[2])
+                    exp.op[i] = VARIABLE2;
+                else if (variables.size() > 3 && op.getName() == variables[3])
+                    exp.op[i] = VARIABLE3;
+                else if (variables.size() > 4 && op.getName() == variables[4])
+                    exp.op[i] = VARIABLE4;
+                else if (variables.size() > 5 && op.getName() == variables[5])
+                    exp.op[i] = VARIABLE5;
+                else if (variables.size() > 6 && op.getName() == variables[6])
+                    exp.op[i] = VARIABLE6;
+                else if (variables.size() > 7 && op.getName() == variables[7])
+                    exp.op[i] = VARIABLE7;
+                else if (variables.size() > 8 && op.getName() == variables[8])
+                    exp.op[i] = VARIABLE8;
+                else
+                    throw OpenMMException("Unknown variable '"+op.getName()+"' in expression: "+expression);
+                break;
+            case Operation::ADD:
+                exp.op[i] = ADD;
+                break;
+            case Operation::SUBTRACT:
+                exp.op[i] = SUBTRACT;
+                break;
+            case Operation::MULTIPLY:
+                exp.op[i] = MULTIPLY;
+                break;
+            case Operation::DIVIDE:
+                exp.op[i] = DIVIDE;
+                break;
+            case Operation::POWER:
+                exp.op[i] = POWER;
+                break;
+            case Operation::NEGATE:
+                exp.op[i] = NEGATE;
+                break;
+            case Operation::SQRT:
+                exp.op[i] = SQRT;
+                break;
+            case Operation::EXP:
+                exp.op[i] = EXP;
+                break;
+            case Operation::LOG:
+                exp.op[i] = LOG;
+                break;
+            case Operation::SIN:
+                exp.op[i] = SIN;
+                break;
+            case Operation::COS:
+                exp.op[i] = COS;
+                break;
+            case Operation::SEC:
+                exp.op[i] = SEC;
+                break;
+            case Operation::CSC:
+                exp.op[i] = CSC;
+                break;
+            case Operation::TAN:
+                exp.op[i] = TAN;
+                break;
+            case Operation::COT:
+                exp.op[i] = COT;
+                break;
+            case Operation::ASIN:
+                exp.op[i] = ASIN;
+                break;
+            case Operation::ACOS:
+                exp.op[i] = ACOS;
+                break;
+            case Operation::ATAN:
+                exp.op[i] = ATAN;
+                break;
+            case Operation::SQUARE:
+                exp.op[i] = SQUARE;
+                break;
+            case Operation::CUBE:
+                exp.op[i] = CUBE;
+                break;
+            case Operation::RECIPROCAL:
+                exp.op[i] = RECIPROCAL;
+                break;
+            case Operation::INCREMENT:
+                exp.op[i] = INCREMENT;
+                break;
+            case Operation::DECREMENT:
+                exp.op[i] = DECREMENT;
+                break;
+        }
+    }
+    return exp;
+}
+
 extern "C"
 void gpuSetBondParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<float>& length, const vector<float>& k)
 {
@@ -376,14 +488,33 @@ void gpuSetLJ14Parameters(gpuContext gpu, float epsfac, float fudge, const vecto
    psLJ14Parameter->Upload();
 }

+static void setExclusions(gpuContext gpu, const vector<vector<int> >& exclusions) {
+    if (gpu->exclusions.size() > 0) {
+        bool ok = (exclusions.size() == gpu->exclusions.size());
+        for (int i = 0; i < exclusions.size() && ok; i++) {
+            if (exclusions[i].size() != gpu->exclusions[i].size())
+                ok = false;
+            else {
+                for (int j = 0; j < exclusions[i].size(); j++)
+                    if (find(gpu->exclusions[i].begin(), gpu->exclusions[i].end(), exclusions[i][j]) == gpu->exclusions[i].end())
+                        ok = false;
+            }
+        }
+        if (!ok)
+            throw OpenMMException("All nonbonded forces must have identical sets of exceptions");
+    }
+    gpu->exclusions = exclusions;
+}
+
 extern "C"
 void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const vector<int>& atom, const vector<float>& c6, const vector<float>& c12, const vector<float>& q,
        const vector<char>& symbol, const vector<vector<int> >& exclusions, CudaNonbondedMethod method)
 {
-    unsigned int coulombs = atom.size();
+    unsigned int coulombs = c6.size();
    gpu->sim.epsfac = epsfac;
    gpu->sim.nonbondedMethod = method;
-    gpu->exclusions = exclusions;
+    if (coulombs > 0)
+        setExclusions(gpu, exclusions);
    
    for (unsigned int i = 0; i < coulombs; i++)
    {
@@ -419,10 +550,84 @@ void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const vector<int>& at
 extern "C"
 void gpuSetNonbondedCutoff(gpuContext gpu, float cutoffDistance, float solventDielectric)
 {
+    if (gpu->sim.nonbondedCutoff != 0.0f && gpu->sim.nonbondedCutoff != cutoffDistance)
+        throw OpenMMException("All nonbonded forces must use the same cutoff");
+    gpu->sim.nonbondedCutoff = cutoffDistance;
    gpu->sim.nonbondedCutoffSqr = cutoffDistance*cutoffDistance;
    gpu->sim.reactionFieldK = pow(cutoffDistance, -3.0f)*(solventDielectric-1.0f)/(2.0f*solventDielectric+1.0f);
 }

+extern "C"
+void gpuSetCustomNonbondedParameters(gpuContext gpu, const vector<vector<double> >& parameters, const vector<vector<int> >& exclusions,
+            const vector<int>& exceptionAtom1, const vector<int>& exceptionAtom2, const vector<vector<double> >& exceptionParams,
+            CudaNonbondedMethod method, float cutoffDistance, const string& energyExp, const vector<string>& combiningRules, const vector<string>& paramNames)
+{
+    if (gpu->sim.nonbondedCutoff != 0.0f && gpu->sim.nonbondedCutoff != cutoffDistance)
+        throw OpenMMException("All nonbonded forces must use the same cutoff");
+    if (paramNames.size() > 4)
+        throw OpenMMException("CudaPlatform only supports four per-atom parameters for custom nonbonded forces");
+    gpu->sim.nonbondedCutoff = cutoffDistance;
+    gpu->sim.nonbondedCutoffSqr = cutoffDistance*cutoffDistance;
+    gpu->sim.customNonbondedMethod = method;
+    gpu->sim.customExceptions = exceptionAtom1.size();
+    setExclusions(gpu, exclusions);
+    gpu->psCustomParams = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1, "CustomParams");
+    gpu->sim.pCustomParams = gpu->psCustomParams->_pDevData;
+    gpu->psCustomExceptionID = new CUDAStream<int4>(gpu->sim.customExceptions, 1, "CustomExceptionId");
+    gpu->sim.pCustomExceptionID = gpu->psCustomExceptionID->_pDevData;
+    gpu->psCustomExceptionParams = new CUDAStream<float4>(gpu->sim.customExceptions, 1, "CustomExceptionParams");
+    gpu->sim.pCustomExceptionParams = gpu->psCustomExceptionParams->_pDevData;
+    for (int i = 0; i < parameters.size(); i++) {
+        if (parameters[i].size() > 0)
+            (*gpu->psCustomParams)[i].x = parameters[i][0];
+        if (parameters[i].size() > 1)
+            (*gpu->psCustomParams)[i].y = parameters[i][1];
+        if (parameters[i].size() > 2)
+            (*gpu->psCustomParams)[i].z = parameters[i][2];
+        if (parameters[i].size() > 3)
+            (*gpu->psCustomParams)[i].w = parameters[i][3];
+    }
+    for (int i = 0; i < exceptionAtom1.size(); i++) {
+        (*gpu->psCustomExceptionID)[i].x = exceptionAtom1[i];
+        (*gpu->psCustomExceptionID)[i].y = exceptionAtom2[i];
+        (*gpu->psCustomExceptionID)[i].z = gpu->pOutputBufferCounter[exceptionAtom1[i]]++;
+        (*gpu->psCustomExceptionID)[i].w = gpu->pOutputBufferCounter[exceptionAtom2[i]]++;
+        if (exceptionParams[i].size() > 0)
+            (*gpu->psCustomExceptionParams)[i].x = exceptionParams[i][0];
+        if (exceptionParams[i].size() > 1)
+            (*gpu->psCustomExceptionParams)[i].y = exceptionParams[i][1];
+        if (exceptionParams[i].size() > 2)
+            (*gpu->psCustomExceptionParams)[i].z = exceptionParams[i][2];
+        if (exceptionParams[i].size() > 3)
+            (*gpu->psCustomExceptionParams)[i].w = exceptionParams[i][3];
+    }
+    gpu->psCustomParams->Upload();
+    gpu->psCustomExceptionID->Upload();
+    gpu->psCustomExceptionParams->Upload();
+
+    // Create the Expressions.
+
+    vector<string> variables;
+    variables.push_back("r");
+    for (int i = 0; i < paramNames.size(); i++)
+        variables.push_back(paramNames[i]);
+    SetCustomNonbondedEnergyExpression(createExpression<128>(energyExp, Lepton::Parser::parse(energyExp).optimize().createProgram(), variables));
+    SetCustomNonbondedForceExpression(createExpression<128>(energyExp, Lepton::Parser::parse(energyExp).differentiate("r").optimize().createProgram(), variables));
+    Expression<64> paramExpressions[4];
+    vector<string> combiningRuleParams;
+    combiningRuleParams.push_back("");
+    for (int j = 1; j < 3; j++) {
+        for (int i = 0; i < paramNames.size(); i++) {
+            stringstream name;
+            name << paramNames[i] << j;
+            combiningRuleParams.push_back(name.str());
+        }
+    }
+    for (int i = 0; i < paramNames.size(); i++)
+        paramExpressions[i] = createExpression<64>(combiningRules[i], Lepton::Parser::parse(combiningRules[i]).optimize().createProgram(), combiningRuleParams);
+    SetCustomNonbondedCombiningRules(paramExpressions);
+}
+
 extern "C"
 void gpuSetEwaldParameters(gpuContext gpu, float alpha, int kmaxx, int kmaxy, int kmaxz)
 {
@@ -1216,6 +1421,7 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync)
    gpu->sim.surfaceAreaFactor      = surfaceAreaFactor;
    gpu->sim.electricConstant       = electricConstant;
    gpu->sim.nonbondedMethod        = NO_CUTOFF;
+    gpu->sim.nonbondedCutoff        = 0.0f;
    gpu->sim.nonbondedCutoffSqr     = 0.0f;

    gpu->sim.bigFloat               = 99999999.0f;
@@ -1408,6 +1614,11 @@ void gpuShutDown(gpuContext gpu)
    delete gpu->psxVector4;
    delete gpu->psvVector4;
    delete gpu->psSigEps2;
+    if (gpu->psCustomParams != NULL) {
+        delete gpu->psCustomParams;
+        delete gpu->psCustomExceptionID;
+        delete gpu->psCustomExceptionParams;
+    }
    if (gpu->psEwaldCosSinSum != NULL)
        delete gpu->psEwaldCosSinSum;
    delete gpu->psObcData;
@@ -1761,6 +1972,7 @@ int gpuSetConstants(gpuContext gpu)
 {
    SetCalculateCDLJForcesSim(gpu);
    SetCalculateCDLJObcGbsaForces1Sim(gpu);
+    SetCalculateCustomNonbondedForcesSim(gpu);
    SetCalculateLocalForcesSim(gpu);
    SetCalculateObcGbsaBornSumSim(gpu);
    SetCalculateObcGbsaForces2Sim(gpu);

--- a/platforms/cuda/src/kernels/gputypes.h
+++ b/platforms/cuda/src/kernels/gputypes.h
@@ -88,6 +88,9 @@ struct _gpuContext {
    CUDAStream<float4>* psxVector4;
    CUDAStream<float4>* psvVector4;
    CUDAStream<float2>* psSigEps2; 
+    CUDAStream<float4>* psCustomParams;     // Atom parameters for custom nonbonded force
+    CUDAStream<int4>* psCustomExceptionID;  // Atom indices for custom nonbonded exceptions
+    CUDAStream<float4>* psCustomExceptionParams; // Parameters for custom nonbonded exceptions
    CUDAStream<float2>* psEwaldCosSinSum;
    CUDAStream<float2>* psObcData; 
    CUDAStream<float>* psObcChain;
@@ -178,6 +181,12 @@ void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const std::vector<int
 extern "C"
 void gpuSetNonbondedCutoff(gpuContext gpu, float cutoffDistance, float solventDielectric);

+extern "C"
+void gpuSetCustomNonbondedParameters(gpuContext gpu, const std::vector<std::vector<double> >& parameters, const std::vector<std::vector<int> >& exclusions,
+            const std::vector<int>& exceptionAtom1, const std::vector<int>& exceptionAtom2, const std::vector<std::vector<double> >& exceptionParams,
+            CudaNonbondedMethod method, float cutoffDistance, const std::string& energyExp, const std::vector<std::string>& combiningRules,
+            const std::vector<std::string>& paramNames);
+
 extern "C"
 void gpuSetEwaldParameters(gpuContext gpu, float alpha, int kmaxx, int kmaxy, int kmaxz);


--- a/platforms/cuda/src/kernels/kCalculateCustomNonbondedForces.cu
+++ b/platforms/cuda/src/kernels/kCalculateCustomNonbondedForces.cu
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <vector_functions.h>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <fstream>
+using namespace std;
+
+#include "gputypes.h"
+#include "cudatypes.h"
+
+#define UNROLLXX 0
+#define UNROLLXY 0
+
+struct Atom {
+    float x;
+    float y;
+    float z;
+    float4 params;
+    float fx;
+    float fy;
+    float fz;
+};
+
+static __constant__ cudaGmxSimulation cSim;
+static __constant__ Expression<128> forceExp;
+static __constant__ Expression<128> energyExp;
+static __constant__ Expression<64> combiningRules[4];
+
+void SetCalculateCustomNonbondedForcesSim(gpuContext gpu)
+{
+    cudaError_t status;
+    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
+    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
+}
+
+void GetCalculateCustomNonbondedForcesSim(gpuContext gpu)
+{
+    cudaError_t status;
+    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
+    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
+}
+
+void SetCustomNonbondedForceExpression(const Expression<128>& expression)
+{
+    cudaError_t status;
+    status = cudaMemcpyToSymbol(forceExp, &expression, sizeof(forceExp));
+    RTERROR(status, "SetCustomNonbondedForceExpression: cudaMemcpyToSymbol failed");
+}
+
+void SetCustomNonbondedEnergyExpression(const Expression<128>& expression)
+{
+    cudaError_t status;
+    status = cudaMemcpyToSymbol(energyExp, &expression, sizeof(energyExp));
+    RTERROR(status, "SetCustomNonbondedEnergyExpression: cudaMemcpyToSymbol failed");
+}
+
+void SetCustomNonbondedCombiningRules(const Expression<64>* expressions)
+{
+    cudaError_t status;
+    status = cudaMemcpyToSymbol(combiningRules, expressions, sizeof(combiningRules));
+    RTERROR(status, "SetCustomNonbondedCombiningRules: cudaMemcpyToSymbol failed");
+}
+
+template<int SIZE>
+__device__ float kEvaluateExpression_kernel(Expression<SIZE>* expression, float* stack, float var0, float4 vars1, float4 vars2)
+{
+    int stackPointer = -1;
+    for (int i = 0; i < expression->length; i++)
+    {
+        switch (expression->op[i])
+        {
+            case CONSTANT:
+                stack[++stackPointer] = expression->arg[i];
+                break;
+            case VARIABLE0:
+                stack[++stackPointer] = var0;
+                break;
+            case VARIABLE1:
+                stack[++stackPointer] = vars1.x;
+                break;
+//            case VARIABLE2:
+//                stack[++stackPointer] = vars1.y;
+//                break;
+//            case VARIABLE3:
+//                stack[++stackPointer] = vars1.z;
+//                break;
+//            case VARIABLE4:
+//                stack[++stackPointer] = vars1.w;
+//                break;
+            case VARIABLE5:
+                stack[++stackPointer] = vars2.x;
+                break;
+//            case VARIABLE6:
+//                stack[++stackPointer] = vars2.y;
+//                break;
+//            case VARIABLE7:
+//                stack[++stackPointer] = vars2.z;
+//                break;
+//            case VARIABLE8:
+//                stack[++stackPointer] = vars2.w;
+//                break;
+            case ADD:
+            {
+                float temp = stack[stackPointer];
+                stack[stackPointer] = temp+stack[--stackPointer];
+                break;
+            }
+            case SUBTRACT:
+            {
+                float temp = stack[stackPointer];
+                stack[stackPointer] = temp-stack[--stackPointer];
+                break;
+            }
+            case MULTIPLY:
+            {
+                float temp = stack[stackPointer];
+                stack[stackPointer] = temp*stack[--stackPointer];
+                break;
+            }
+            case DIVIDE:
+            {
+                float temp = stack[stackPointer];
+                stack[stackPointer] = temp/stack[--stackPointer];
+                break;
+            }
+            case POWER:
+            {
+                float temp = stack[stackPointer];
+                stack[stackPointer] = pow(temp, stack[--stackPointer]);
+                break;
+            }
+            case NEGATE:
+                stack[stackPointer] = -stack[stackPointer];
+                break;
+            case SQRT:
+                stack[stackPointer] = sqrt(stack[stackPointer]);
+                break;
+            case EXP:
+                stack[stackPointer] = exp(stack[stackPointer]);
+                break;
+            case LOG:
+                stack[stackPointer] = log(stack[stackPointer]);
+                break;
+            case SIN:
+                stack[stackPointer] = sin(stack[stackPointer]);
+                break;
+            case COS:
+                stack[stackPointer] = cos(stack[stackPointer]);
+                break;
+            case SEC:
+                stack[stackPointer] = 1.0f/cos(stack[stackPointer]);
+                break;
+            case CSC:
+                stack[stackPointer] = 1.0f/sin(stack[stackPointer]);
+                break;
+            case TAN:
+                stack[stackPointer] = tan(stack[stackPointer]);
+                break;
+            case COT:
+                stack[stackPointer] = 1.0f/tan(stack[stackPointer]);
+                break;
+            case ASIN:
+                stack[stackPointer] = asin(stack[stackPointer]);
+                break;
+            case ACOS:
+                stack[stackPointer] = acos(stack[stackPointer]);
+                break;
+            case ATAN:
+                stack[stackPointer] = atan(stack[stackPointer]);
+                break;
+            case SQUARE:
+            {
+                float temp = stack[stackPointer];
+                stack[stackPointer] = temp*temp;
+                break;
+            }
+            case CUBE:
+            {
+                float temp = stack[stackPointer];
+                stack[stackPointer] = temp*temp*temp;
+                break;
+            }
+            case RECIPROCAL:
+                stack[stackPointer] = 1.0f/stack[stackPointer];
+                break;
+            case INCREMENT:
+                stack[stackPointer] = stack[stackPointer]+1.0f;
+                break;
+            case DECREMENT:
+                stack[stackPointer] = stack[stackPointer]-1.0f;
+                break;
+        }
+    }
+    return stack[stackPointer];
+}
+
+// Include versions of the kernels for N^2 calculations.
+
+#define METHOD_NAME(a, b) a##N2##b
+#include "kCalculateCustomNonbondedForces.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##N2ByWarp##b
+#include "kCalculateCustomNonbondedForces.h"
+
+// Include versions of the kernels with cutoffs.
+
+#undef METHOD_NAME
+#undef USE_OUTPUT_BUFFER_PER_WARP
+#define USE_CUTOFF
+#define METHOD_NAME(a, b) a##Cutoff##b
+#include "kCalculateCustomNonbondedForces.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##CutoffByWarp##b
+#include "kCalculateCustomNonbondedForces.h"
+
+// Include versions of the kernels with periodic boundary conditions.
+
+#undef METHOD_NAME
+#undef USE_OUTPUT_BUFFER_PER_WARP
+#define USE_PERIODIC
+#define METHOD_NAME(a, b) a##Periodic##b
+#include "kCalculateCustomNonbondedForces.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##PeriodicByWarp##b
+#include "kCalculateCustomNonbondedForces.h"
+
+__global__ void kFindBlockBoundsCutoff_kernel();
+__global__ void kFindBlocksWithInteractionsCutoff_kernel();
+__global__ void kFindInteractionsWithinBlocksCutoff_kernel(unsigned int* workUnit);
+__global__ void kFindBlockBoundsPeriodic_kernel();
+__global__ void kFindBlocksWithInteractionsPeriodic_kernel();
+__global__ void kFindInteractionsWithinBlocksPeriodic_kernel(unsigned int* workUnit);
+
+void kCalculateCustomNonbondedForces(gpuContext gpu, bool neighborListValid)
+{
+//    printf("kCalculateCustomNonbondedCutoffForces\n");
+    CUDPPResult result;
+    switch (gpu->sim.customNonbondedMethod)
+    {
+        case NO_CUTOFF:
+            if (gpu->bOutputBufferPerWarp)
+                kCalculateCustomNonbondedN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        (sizeof(Atom)+MAX_STACK_SIZE*sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
+            else
+                kCalculateCustomNonbondedN2Forces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        (sizeof(Atom)+MAX_STACK_SIZE*sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
+            LAUNCHERROR("kCalculateCustomNonbondedN2Forces");
+            break;
+        case CUTOFF:
+            if (!neighborListValid)
+            {
+                kFindBlockBoundsCutoff_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
+                LAUNCHERROR("kFindBlockBoundsCutoff");
+                kFindBlocksWithInteractionsCutoff_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
+                LAUNCHERROR("kFindBlocksWithInteractionsCutoff");
+                result = cudppCompact(gpu->cudpp, gpu->sim.pInteractingWorkUnit, gpu->sim.pInteractionCount,
+                        gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits);
+                if (result != CUDPP_SUCCESS)
+                {
+                    printf("Error in cudppCompact: %d\n", result);
+                    exit(-1);
+                }
+                kFindInteractionsWithinBlocksCutoff_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+            }
+            if (gpu->bOutputBufferPerWarp)
+                kCalculateCustomNonbondedCutoffByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        (sizeof(Atom)+MAX_STACK_SIZE*sizeof(float)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+            else
+                kCalculateCustomNonbondedCutoffForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        (sizeof(Atom)+MAX_STACK_SIZE*sizeof(float)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+            LAUNCHERROR("kCalculateCustomNonbondedCutoffForces");
+            break;
+        case PERIODIC:
+            if (!neighborListValid)
+            {
+                kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
+                LAUNCHERROR("kFindBlockBoundsPeriodic");
+                kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
+                LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
+                result = cudppCompact(gpu->cudpp, gpu->sim.pInteractingWorkUnit, gpu->sim.pInteractionCount,
+                        gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits);
+                if (result != CUDPP_SUCCESS)
+                {
+                    printf("Error in cudppCompact: %d\n", result);
+                    exit(-1);
+                }
+                kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+            }
+            if (gpu->bOutputBufferPerWarp)
+                kCalculateCustomNonbondedPeriodicByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        (sizeof(Atom)+MAX_STACK_SIZE*sizeof(float)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+            else
+                kCalculateCustomNonbondedPeriodicForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        (sizeof(Atom)+MAX_STACK_SIZE*sizeof(float)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+            LAUNCHERROR("kCalculateCustomNonbondedPeriodicForces");
+            break;
+    }
+}
--- a/platforms/cuda/src/kernels/kCalculateCustomNonbondedForces.h
+++ b/platforms/cuda/src/kernels/kCalculateCustomNonbondedForces.h
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+/**
+ * This file contains the kernels for evalauating nonbonded forces.  It is included
+ * several times in kCalculateCustomNonbondedForces.cu with different #defines to generate
+ * different versions of the kernels.
+ */
+
+__global__ void METHOD_NAME(kCalculateCustomNonbonded, Forces_kernel)(unsigned int* workUnit)
+{
+    extern __shared__ float stack[];
+    Atom* sA = (Atom*) &stack[MAX_STACK_SIZE*blockDim.x];
+    unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
+    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
+    unsigned int numWorkUnits = cSim.pInteractionCount[0];
+    unsigned int pos = warp*numWorkUnits/totalWarps;
+    unsigned int end = (warp+1)*numWorkUnits/totalWarps;
+#ifdef USE_CUTOFF
+    float3* tempBuffer = (float3*) &sA[cSim.nonbond_threads_per_block];
+#endif
+
+    unsigned int lasty = 0xFFFFFFFF;
+    while (pos < end)
+    {
+
+        // Extract cell coordinates from appropriate work unit
+        unsigned int x = workUnit[pos];
+        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
+        bool bExclusionFlag = (x & 0x1);
+        x = (x >> 17) << GRIDBITS;
+        float4      apos;   // Local atom x, y, z, q
+        float3      af;     // Local atom fx, fy, fz
+        unsigned int tgx = threadIdx.x & (GRID - 1);
+        unsigned int tbx = threadIdx.x - tgx;
+        unsigned int tj = tgx;
+        Atom* psA = &sA[tbx];
+        unsigned int i      = x + tgx;
+        apos                = cSim.pPosq[i];
+        float4 params       = make_float4(0,0,0,0);
+        af.x                = 0.0f;
+        af.y                = 0.0f;
+        af.z                = 0.0f;
+        if (x == y) // Handle diagonals uniquely at 50% efficiency
+        {
+            // Read fixed atom data into registers and GRF
+            sA[threadIdx.x].x   = apos.x;
+            sA[threadIdx.x].y   = apos.y;
+            sA[threadIdx.x].z   = apos.z;
+            sA[threadIdx.x].params = params;
+            if (!bExclusionFlag)
+            {
+                for (unsigned int j = 0; j < GRID; j++)
+                {
+                    float4 combinedParams;
+                    float dx        = psA[j].x - apos.x;
+                    float dy        = psA[j].y - apos.y;
+                    float dz        = psA[j].z - apos.z;
+#ifdef USE_PERIODIC
+                    dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                    dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                    dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+#endif
+                    float r         = sqrt(dx*dx + dy*dy + dz*dz);
+                    float invR      = 1.0f/r;
+                    float dEdR      = -kEvaluateExpression_kernel(&forceExp, &stack[MAX_STACK_SIZE*threadIdx.x], r, combinedParams, combinedParams)*invR;
+#ifdef USE_CUTOFF
+                    if (r > cSim.nonbondedCutoff)
+                    {
+                        dEdR = 0.0f;
+                    }
+#endif
+                    dx             *= dEdR;
+                    dy             *= dEdR;
+                    dz             *= dEdR;
+                    af.x           -= dx;
+                    af.y           -= dy;
+                    af.z           -= dz;
+                }
+            }
+            else  // bExclusion
+            {
+                unsigned int xi   = x>>GRIDBITS;
+                unsigned int cell          = xi+xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
+                unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
+                for (unsigned int j = 0; j < GRID; j++)
+                {
+                    float4 combinedParams;
+                    float dx        = psA[j].x - apos.x;
+                    float dy        = psA[j].y - apos.y;
+                    float dz        = psA[j].z - apos.z;
+#ifdef USE_PERIODIC
+                    dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                    dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                    dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+#endif
+                    float r         = sqrt(dx*dx + dy*dy + dz*dz);
+                    float invR      = 1.0f/r;
+                    float dEdR      = -kEvaluateExpression_kernel(&forceExp, &stack[MAX_STACK_SIZE*threadIdx.x], r, combinedParams, combinedParams)*invR;
+#ifdef USE_CUTOFF
+                    if (!(excl & 0x1) || r > cSim.nonbondedCutoff)
+#else
+                    if (!(excl & 0x1))
+#endif
+                    {
+                        dEdR = 0.0f;
+                    }
+                    dx             *= dEdR;
+                    dy             *= dEdR;
+                    dz             *= dEdR;
+                    af.x           -= dx;
+                    af.y           -= dy;
+                    af.z           -= dz;
+                    excl          >>= 1;
+                }
+            }
+
+            // Write results
+            float4 of;
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+            unsigned int offset                          = x + tgx + warp*cSim.stride;
+            of                                  = cSim.pForce4a[offset];
+            of.x                               += af.x;
+            of.y                               += af.y;
+            of.z                               += af.z;
+            cSim.pForce4a[offset]               = of;
+#else
+            of.x                                = af.x;
+            of.y                                = af.y;
+            of.z                                = af.z;
+            of.w                                = 0.0f;
+            unsigned int offset                          = x + tgx + (x >> GRIDBITS) * cSim.stride;
+            cSim.pForce4a[offset]               = of;
+#endif
+        }
+        else        // 100% utilization
+        {
+            // Read fixed atom data into registers and GRF
+            if (lasty != y)
+            {
+                unsigned int j                   = y + tgx;
+                float4 temp             = cSim.pPosq[j];
+                sA[threadIdx.x].x       = temp.x;
+                sA[threadIdx.x].y       = temp.y;
+                sA[threadIdx.x].z       = temp.z;
+                sA[threadIdx.x].params  = make_float4(0,0,0,0);;
+            }
+            sA[threadIdx.x].fx      = 0.0f;
+            sA[threadIdx.x].fy      = 0.0f;
+            sA[threadIdx.x].fz      = 0.0f;
+            if (!bExclusionFlag)
+            {
+#ifdef USE_CUTOFF
+                unsigned int flags = cSim.pInteractionFlag[pos];
+                if (flags == 0)
+                {
+                    // No interactions in this block.
+                }
+                else if (flags == 0xFFFFFFFF)
+#endif
+                {
+                    // Compute all interactions within this block.
+
+                    for (unsigned int j = 0; j < GRID; j++)
+                    {
+                        float4 combinedParams;
+                        float dx        = psA[tj].x - apos.x;
+                        float dy        = psA[tj].y - apos.y;
+                        float dz        = psA[tj].z - apos.z;
+#ifdef USE_PERIODIC
+                        dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                        dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                        dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+#endif
+                        float r         = sqrt(dx*dx + dy*dy + dz*dz);
+                        float invR      = 1.0f/r;
+                        float dEdR      = -kEvaluateExpression_kernel(&forceExp, &stack[MAX_STACK_SIZE*threadIdx.x], r, combinedParams, combinedParams)*invR;
+#ifdef USE_CUTOFF
+                        if (r > cSim.nonbondedCutoff)
+                        {
+                            dEdR = 0.0f;
+                        }
+#endif
+                        dx             *= dEdR;
+                        dy             *= dEdR;
+                        dz             *= dEdR;
+                        af.x           -= dx;
+                        af.y           -= dy;
+                        af.z           -= dz;
+                        psA[tj].fx     += dx;
+                        psA[tj].fy     += dy;
+                        psA[tj].fz     += dz;
+                        tj              = (tj + 1) & (GRID - 1);
+                    }
+                }
+#ifdef USE_CUTOFF
+                else
+                {
+                    // Compute only a subset of the interactions in this block.
+
+                    for (unsigned int j = 0; j < GRID; j++)
+                    {
+                        if ((flags&(1<<j)) != 0)
+                        {
+                            float4 combinedParams;
+                            float dx        = psA[j].x - apos.x;
+                            float dy        = psA[j].y - apos.y;
+                            float dz        = psA[j].z - apos.z;
+#ifdef USE_PERIODIC
+                            dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                            dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                            dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+#endif
+                            float r         = sqrt(dx*dx + dy*dy + dz*dz);
+                            float invR      = 1.0f/r;
+                            float dEdR      = -kEvaluateExpression_kernel(&forceExp, &stack[MAX_STACK_SIZE*threadIdx.x], r, combinedParams, combinedParams)*invR;
+#ifdef USE_CUTOFF
+                            if (r > cSim.nonbondedCutoff)
+                            {
+                                dEdR = 0.0f;
+                            }
+#endif
+                            dx             *= dEdR;
+                            dy             *= dEdR;
+                            dz             *= dEdR;
+                            af.x           -= dx;
+                            af.y           -= dy;
+                            af.z           -= dz;
+                            tempBuffer[threadIdx.x].x = dx;
+                            tempBuffer[threadIdx.x].y = dy;
+                            tempBuffer[threadIdx.x].z = dz;
+
+                            // Sum the forces on atom j.
+
+                            if (tgx % 2 == 0)
+                            {
+                                tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+1].x;
+                                tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+1].y;
+                                tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+1].z;
+                            }
+                            if (tgx % 4 == 0)
+                            {
+                                tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+2].x;
+                                tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+2].y;
+                                tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+2].z;
+                            }
+                            if (tgx % 8 == 0)
+                            {
+                                tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+4].x;
+                                tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+4].y;
+                                tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+4].z;
+                            }
+                            if (tgx % 16 == 0)
+                            {
+                                tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+8].x;
+                                tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+8].y;
+                                tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+8].z;
+                            }
+                            if (tgx == 0)
+                            {
+                                psA[j].fx += tempBuffer[threadIdx.x].x + tempBuffer[threadIdx.x+16].x;
+                                psA[j].fy += tempBuffer[threadIdx.x].y + tempBuffer[threadIdx.x+16].y;
+                                psA[j].fz += tempBuffer[threadIdx.x].z + tempBuffer[threadIdx.x+16].z;
+                            }
+                        }
+                    }
+                }
+#endif
+            }
+            else  // bExclusion
+            {
+                // Read fixed atom data into registers and GRF
+                unsigned int xi   = x>>GRIDBITS;
+                unsigned int yi   = y>>GRIDBITS;
+                unsigned int cell          = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
+                unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
+                excl              = (excl >> tgx) | (excl << (GRID - tgx));
+                for (unsigned int j = 0; j < GRID; j++)
+                {
+                    float4 combinedParams;
+                    float dx        = psA[tj].x - apos.x;
+                    float dy        = psA[tj].y - apos.y;
+                    float dz        = psA[tj].z - apos.z;
+#ifdef USE_PERIODIC
+                    dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                    dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                    dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+#endif
+                    float r         = sqrt(dx*dx + dy*dy + dz*dz);
+                    float invR      = 1.0f/r;
+                    float dEdR      = -kEvaluateExpression_kernel(&forceExp, &stack[MAX_STACK_SIZE*threadIdx.x], r, combinedParams, combinedParams)*invR;
+#ifdef USE_CUTOFF
+                    if (!(excl & 0x1) || r > cSim.nonbondedCutoff)
+#else
+                    if (!(excl & 0x1))
+#endif
+                    {
+                        dEdR = 0.0f;
+                    }
+                    dx             *= dEdR;
+                    dy             *= dEdR;
+                    dz             *= dEdR;
+                    af.x           -= dx;
+                    af.y           -= dy;
+                    af.z           -= dz;
+                    psA[tj].fx     += dx;
+                    psA[tj].fy     += dy;
+                    psA[tj].fz     += dz;
+                    excl          >>= 1;
+                    tj              = (tj + 1) & (GRID - 1);
+                }
+            }
+
+            // Write results
+            float4 of;
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+            unsigned int offset                 = x + tgx + warp*cSim.stride;
+            of                                  = cSim.pForce4a[offset];
+            of.x                               += af.x;
+            of.y                               += af.y;
+            of.z                               += af.z;
+            cSim.pForce4a[offset]               = of;
+            offset                              = y + tgx + warp*cSim.stride;
+            of                                  = cSim.pForce4a[offset];
+            of.x                               += sA[threadIdx.x].fx;
+            of.y                               += sA[threadIdx.x].fy;
+            of.z                               += sA[threadIdx.x].fz;
+            cSim.pForce4a[offset]               = of;
+#else
+            of.x                                = af.x;
+            of.y                                = af.y;
+            of.z                                = af.z;
+            of.w                                = 0.0f;
+            unsigned int offset                 = x + tgx + (y >> GRIDBITS) * cSim.stride;
+            cSim.pForce4a[offset]               = of;
+            of.x                                = sA[threadIdx.x].fx;
+            of.y                                = sA[threadIdx.x].fy;
+            of.z                                = sA[threadIdx.x].fz;
+            offset                              = y + tgx + (x >> GRIDBITS) * cSim.stride;
+            cSim.pForce4a[offset]               = of;
+#endif
+            lasty = y;
+        }
+
+        pos++;
+    }
+}
--- a/platforms/cuda/tests/TestCudaCustomNonbondedForce.cpp
+++ b/platforms/cuda/tests/TestCudaCustomNonbondedForce.cpp
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+/**
+ * This tests all the different force terms in the Cuda implementation of CustomNonbondedForce.
+ */
+
+#include "../../../tests/AssertionUtilities.h"
+#include "openmm/Context.h"
+#include "CudaPlatform.h"
+#include "openmm/CustomNonbondedForce.h"
+#include "openmm/System.h"
+#include "openmm/VerletIntegrator.h"
+#include <iostream>
+#include <vector>
+
+using namespace OpenMM;
+using namespace std;
+
+const double TOL = 1e-5;
+
+void testSimpleExpression() {
+    CudaPlatform platform;
+    System system;
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    VerletIntegrator integrator(0.01);
+    CustomNonbondedForce* forceField = new CustomNonbondedForce("-0.1*r^3");
+    forceField->addParticle(vector<double>());
+    forceField->addParticle(vector<double>());
+    system.addForce(forceField);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(2);
+    positions[0] = Vec3(0, 0, 0);
+    positions[1] = Vec3(2, 0, 0);
+    context.setPositions(positions);
+    State state = context.getState(State::Forces | State::Energy);
+    const vector<Vec3>& forces = state.getForces();
+    double force = 0.1*3*(2*2);
+    ASSERT_EQUAL_VEC(Vec3(-force, 0, 0), forces[0], TOL);
+    ASSERT_EQUAL_VEC(Vec3(force, 0, 0), forces[1], TOL);
+    ASSERT_EQUAL_TOL(-0.1*(2*2*2), state.getPotentialEnergy(), TOL);
+}
+
+void testParameters() {
+    CudaPlatform platform;
+    System system;
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    VerletIntegrator integrator(0.01);
+    CustomNonbondedForce* forceField = new CustomNonbondedForce("scale*a*(r*b)^3");
+    forceField->addParameter("a", "a1*a2");
+    forceField->addParameter("b", "c+b1+b2");
+    forceField->addGlobalParameter("scale", 3.0);
+    forceField->addGlobalParameter("c", -1.0);
+    vector<double> params(2);
+    params[0] = 1.5;
+    params[1] = 2.0;
+    forceField->addParticle(params);
+    params[0] = 2.0;
+    params[1] = 3.0;
+    forceField->addParticle(params);
+    system.addForce(forceField);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(2);
+    positions[0] = Vec3(0, 0, 0);
+    positions[1] = Vec3(2, 0, 0);
+    context.setPositions(positions);
+    context.setParameter("scale", 1.0);
+    context.setParameter("c", 0.0);
+    State state = context.getState(State::Forces | State::Energy);
+    vector<Vec3> forces = state.getForces();
+    double force = -3.0*3*5.0*(10*10);
+    ASSERT_EQUAL_VEC(Vec3(-force, 0, 0), forces[0], TOL);
+    ASSERT_EQUAL_VEC(Vec3(force, 0, 0), forces[1], TOL);
+    ASSERT_EQUAL_TOL(3.0*(10*10*10), state.getPotentialEnergy(), TOL);
+    context.setParameter("scale", 1.5);
+    context.setParameter("c", 1.0);
+    state = context.getState(State::Forces | State::Energy);
+    forces = state.getForces();
+    force = -1.5*3.0*3*6.0*(12*12);
+    ASSERT_EQUAL_VEC(Vec3(-force, 0, 0), forces[0], TOL);
+    ASSERT_EQUAL_VEC(Vec3(force, 0, 0), forces[1], TOL);
+    ASSERT_EQUAL_TOL(1.5*3.0*(12*12*12), state.getPotentialEnergy(), TOL);
+}
+
+void testExceptions() {
+    CudaPlatform platform;
+    System system;
+    VerletIntegrator integrator(0.01);
+    CustomNonbondedForce* nonbonded = new CustomNonbondedForce("a*r");
+    nonbonded->addParameter("a", "a1+a2");
+    vector<double> params(1);
+    vector<Vec3> positions(4);
+    for (int i = 0; i < 4; i++) {
+        system.addParticle(1.0);
+        params[0] = i+1;
+        nonbonded->addParticle(params);
+        positions[i] = Vec3(i, 0, 0);
+    }
+    nonbonded->addException(0, 1, vector<double>());
+    nonbonded->addException(1, 2, vector<double>());
+    nonbonded->addException(2, 3, vector<double>());
+    params[0] = 0.5;
+    nonbonded->addException(0, 2, params);
+    nonbonded->addException(1, 3, params);
+    system.addForce(nonbonded);
+    Context context(system, integrator, platform);
+    context.setPositions(positions);
+    State state = context.getState(State::Forces | State::Energy);
+    const vector<Vec3>& forces = state.getForces();
+    ASSERT_EQUAL_VEC(Vec3(0.5+1+4, 0, 0), forces[0], TOL);
+    ASSERT_EQUAL_VEC(Vec3(0.5, 0, 0), forces[1], TOL);
+    ASSERT_EQUAL_VEC(Vec3(-0.5, 0, 0), forces[2], TOL);
+    ASSERT_EQUAL_VEC(Vec3(-(0.5+1+4), 0, 0), forces[3], TOL);
+    ASSERT_EQUAL_TOL((1+4)*3+0.5*2+0.5*2, state.getPotentialEnergy(), TOL);
+}
+
+void testCutoff() {
+    CudaPlatform platform;
+    System system;
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    VerletIntegrator integrator(0.01);
+    CustomNonbondedForce* forceField = new CustomNonbondedForce("r");
+    forceField->addParticle(vector<double>());
+    forceField->addParticle(vector<double>());
+    forceField->addParticle(vector<double>());
+    forceField->setNonbondedMethod(CustomNonbondedForce::CutoffNonPeriodic);
+    forceField->setCutoffDistance(2.5);
+    system.addForce(forceField);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(3);
+    positions[0] = Vec3(0, 0, 0);
+    positions[1] = Vec3(0, 2, 0);
+    positions[2] = Vec3(0, 3, 0);
+    context.setPositions(positions);
+    State state = context.getState(State::Forces | State::Energy);
+    const vector<Vec3>& forces = state.getForces();
+    ASSERT_EQUAL_VEC(Vec3(0, 1, 0), forces[0], TOL);
+    ASSERT_EQUAL_VEC(Vec3(0, 0, 0), forces[1], TOL);
+    ASSERT_EQUAL_VEC(Vec3(0, -1, 0), forces[2], TOL);
+    ASSERT_EQUAL_TOL(2+1, state.getPotentialEnergy(), TOL);
+}
+
+void testPeriodic() {
+    CudaPlatform platform;
+    System system;
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    VerletIntegrator integrator(0.01);
+    CustomNonbondedForce* forceField = new CustomNonbondedForce("r");
+    forceField->addParticle(vector<double>());
+    forceField->addParticle(vector<double>());
+    forceField->addParticle(vector<double>());
+    forceField->setNonbondedMethod(CustomNonbondedForce::CutoffPeriodic);
+    forceField->setCutoffDistance(2.0);
+    forceField->setPeriodicBoxVectors(Vec3(4, 0, 0), Vec3(0, 4, 0), Vec3(0, 0, 4));
+    system.addForce(forceField);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(3);
+    positions[0] = Vec3(0, 0, 0);
+    positions[1] = Vec3(0, 2.1, 0);
+    positions[2] = Vec3(0, 3, 0);
+    context.setPositions(positions);
+    State state = context.getState(State::Forces | State::Energy);
+    const vector<Vec3>& forces = state.getForces();
+    ASSERT_EQUAL_VEC(Vec3(0, -2, 0), forces[0], TOL);
+    ASSERT_EQUAL_VEC(Vec3(0, 2, 0), forces[1], TOL);
+    ASSERT_EQUAL_VEC(Vec3(0, 0, 0), forces[2], TOL);
+    ASSERT_EQUAL_TOL(1.9+1+0.9, state.getPotentialEnergy(), TOL);
+}
+
+int main() {
+    try {
+        testSimpleExpression();
+//        testParameters();
+//        testExceptions();
+        testCutoff();
+        testPeriodic();
+    }
+    catch(const exception& e) {
+        cout << "exception: " << e.what() << endl;
+        return 1;
+    }
+    cout << "Done" << endl;
+    return 0;
+}
+
--- a/platforms/reference/src/ReferenceKernels.cpp
+++ b/platforms/reference/src/ReferenceKernels.cpp
@@ -476,19 +476,19 @@ double ReferenceCalcNonbondedForceKernel::executeEnergy(ContextImpl& context) {
 ReferenceCalcCustomNonbondedForceKernel::~ReferenceCalcCustomNonbondedForceKernel() {
    disposeRealArray(particleParamArray, numParticles);
    disposeIntArray(exclusionArray, numParticles);
-    disposeIntArray(bonded14IndexArray, num14);
-    disposeRealArray(bonded14ParamArray, num14);
+    disposeIntArray(exceptionIndexArray, numExceptions);
+    disposeRealArray(exceptionParamArray, numExceptions);
    if (neighborList != NULL)
        delete neighborList;
 }

 void ReferenceCalcCustomNonbondedForceKernel::initialize(const System& system, const CustomNonbondedForce& force) {

-    // Identify which exceptions are 1-4 interactions.
+    // Identify which exceptions are actual interactions.

    numParticles = force.getNumParticles();
    exclusions.resize(numParticles);
-    vector<int> nb14s;
+    vector<int> exceptions;
    vector<double> parameters;
    for (int i = 0; i < force.getNumExceptions(); i++) {
        int particle1, particle2;
@@ -496,15 +496,15 @@ void ReferenceCalcCustomNonbondedForceKernel::initialize(const System& system, c
        exclusions[particle1].insert(particle2);
        exclusions[particle2].insert(particle1);
        if (parameters.size() > 0)
-            nb14s.push_back(i);
+            exceptions.push_back(i);
    }

    // Build the arrays.

-    num14 = nb14s.size();
+    numExceptions = exceptions.size();
    int numParameters = force.getNumParameters();
-    bonded14IndexArray = allocateIntArray(num14, 2);
-    bonded14ParamArray = allocateRealArray(num14, numParameters);
+    exceptionIndexArray = allocateIntArray(numExceptions, 2);
+    exceptionParamArray = allocateRealArray(numExceptions, numParameters);
    particleParamArray = allocateRealArray(numParticles, numParameters);
    for (int i = 0; i < numParticles; ++i) {
        force.getParticleParameters(i, parameters);
@@ -520,13 +520,13 @@ void ReferenceCalcCustomNonbondedForceKernel::initialize(const System& system, c
        for (set<int>::const_iterator iter = exclusions[i].begin(); iter != exclusions[i].end(); ++iter)
            exclusionArray[i][++index] = *iter;
    }
-    for (int i = 0; i < num14; ++i) {
+    for (int i = 0; i < numExceptions; ++i) {
        int particle1, particle2;
-        force.getExceptionParameters(nb14s[i], particle1, particle2, parameters);
-        bonded14IndexArray[i][0] = particle1;
-        bonded14IndexArray[i][1] = particle2;
+        force.getExceptionParameters(exceptions[i], particle1, particle2, parameters);
+        exceptionIndexArray[i][0] = particle1;
+        exceptionIndexArray[i][1] = particle2;
        for (int j = 0; j < numParameters; j++)
-            bonded14ParamArray[i][j] = static_cast<RealOpenMM>(parameters[j]);
+            exceptionParamArray[i][j] = static_cast<RealOpenMM>(parameters[j]);
    }
    nonbondedMethod = CalcCustomNonbondedForceKernel::NonbondedMethod(force.getNonbondedMethod());
    nonbondedCutoff = (RealOpenMM) force.getCutoffDistance();
@@ -568,7 +568,7 @@ void ReferenceCalcCustomNonbondedForceKernel::executeForces(ContextImpl& context
    for (int i = 0; i < globalParameterNames.size(); i++)
        globalParameters[globalParameterNames[i]] = context.getParameter(globalParameterNames[i]);
    ixn.calculatePairIxn(numParticles, posData, particleParamArray, exclusionArray, 0, globalParameters, forceData, 0, 0);
-    ixn.calculateExceptionIxn(num14, bonded14IndexArray, posData, bonded14ParamArray, globalParameters, forceData, 0, 0);
+    ixn.calculateExceptionIxn(numExceptions, exceptionIndexArray, posData, exceptionParamArray, globalParameters, forceData, 0, 0);
 }

 double ReferenceCalcCustomNonbondedForceKernel::executeEnergy(ContextImpl& context) {
@@ -587,7 +587,7 @@ double ReferenceCalcCustomNonbondedForceKernel::executeEnergy(ContextImpl& conte
    for (int i = 0; i < globalParameterNames.size(); i++)
        globalParameters[globalParameterNames[i]] = context.getParameter(globalParameterNames[i]);
    ixn.calculatePairIxn(numParticles, posData, particleParamArray, exclusionArray, 0, globalParameters, forceData, 0, &energy);
-    ixn.calculateExceptionIxn(num14, bonded14IndexArray, posData, bonded14ParamArray, globalParameters, forceData, 0, &energy);
+    ixn.calculateExceptionIxn(numExceptions, exceptionIndexArray, posData, exceptionParamArray, globalParameters, forceData, 0, &energy);
    disposeRealArray(forceData, numParticles);
    return energy;
 }

--- a/platforms/reference/src/ReferenceKernels.h
+++ b/platforms/reference/src/ReferenceKernels.h
@@ -304,9 +304,9 @@ public:
     */
    double executeEnergy(ContextImpl& context);
 private:
-    int numParticles, num14;
-    int **exclusionArray, **bonded14IndexArray;
-    RealOpenMM **particleParamArray, **bonded14ParamArray;
+    int numParticles, numExceptions;
+    int **exclusionArray, **exceptionIndexArray;
+    RealOpenMM **particleParamArray, **exceptionParamArray;
    RealOpenMM nonbondedCutoff, periodicBoxSize[3];
    std::vector<std::set<int> > exclusions;
    Lepton::ExpressionProgram energyExpression, forceExpression;