Merge changes from main branch

18295108 · peastman · e6101f68 · 8d7234e5 · 18295108 · 18295108
Commit 18295108 authored Sep 05, 2017 by peastman
20 changed files
--- a/platforms/cuda/include/CudaIntegrationUtilities.h
+++ b/platforms/cuda/include/CudaIntegrationUtilities.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2014 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2017 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -158,8 +158,11 @@ private:
    CudaArray* vsite3AvgWeights;
    CudaArray* vsiteOutOfPlaneAtoms;
    CudaArray* vsiteOutOfPlaneWeights;
+    CudaArray* vsiteLocalCoordsIndex;
    CudaArray* vsiteLocalCoordsAtoms;
-    CudaArray* vsiteLocalCoordsParams;
+    CudaArray* vsiteLocalCoordsWeights;
+    CudaArray* vsiteLocalCoordsPos;
+    CudaArray* vsiteLocalCoordsStartIndex;
    int randomPos;
    int lastSeed, numVsites;
    double2 lastStepSize;

--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -38,6 +38,7 @@
 #include "openmm/internal/CompiledExpressionSet.h"
 #include "openmm/internal/CustomIntegratorUtilities.h"
 #include "lepton/CompiledExpression.h"
+#include "lepton/ExpressionProgram.h"
 #include <cufft.h>
 namespace OpenMM {
@@ -1229,6 +1230,54 @@ private:
    CUevent event;
 };
+/**
+ * This kernel is invoked by CustomCVForce to calculate the forces acting on the system and the energy of the system.
+ */
+class CudaCalcCustomCVForceKernel : public CalcCustomCVForceKernel {
+public:
+    CudaCalcCustomCVForceKernel(std::string name, const Platform& platform, CudaContext& cu) : CalcCustomCVForceKernel(name, platform),
+            cu(cu), hasInitializedListeners(false), invAtomOrder(NULL), innerInvAtomOrder(NULL) {
+    }
+    ~CudaCalcCustomCVForceKernel();
+    /**
+     * Initialize the kernel.
+     *
+     * @param system     the System this kernel will be applied to
+     * @param force      the CustomCVForce this kernel will be used for
+     * @param innerContext   the context created by the CustomCVForce for computing collective variables
+     */
+    void initialize(const System& system, const CustomCVForce& force, ContextImpl& innerContext);
+    /**
+     * Execute the kernel to calculate the forces and/or energy.
+     *
+     * @param context        the context in which to execute this kernel
+     * @param innerContext   the context created by the CustomCVForce for computing collective variables
+     * @param includeForces  true if forces should be calculated
+     * @param includeEnergy  true if the energy should be calculated
+     * @return the potential energy due to the force
+     */
+    double execute(ContextImpl& context, ContextImpl& innerContext, bool includeForces, bool includeEnergy);
+    /**
+     * Copy state information to the inner context.
+     *
+     * @param context        the context in which to execute this kernel
+     * @param innerContext   the context created by the CustomCVForce for computing collective variables
+     */
+    void copyState(ContextImpl& context, ContextImpl& innerContext);
+private:
+    class ReorderListener;
+    CudaContext& cu;
+    bool hasInitializedListeners;
+    Lepton::ExpressionProgram energyExpression;
+    std::vector<std::string> variableNames, paramDerivNames, globalParameterNames;
+    std::vector<Lepton::ExpressionProgram> variableDerivExpressions;
+    std::vector<Lepton::ExpressionProgram> paramDerivExpressions;
+    std::vector<CudaArray*> cvForces;
+    CudaArray* invAtomOrder;
+    CudaArray* innerInvAtomOrder;
+    CUfunction copyStateKernel, copyForcesKernel, addForcesKernel;
+};
 /**
 * This kernel is invoked by VerletIntegrator to take one time step.
 */
@@ -1485,7 +1534,9 @@ private:
    class ReorderListener;
    class GlobalTarget;
    class DerivFunction;
-    std::string createPerDofComputation(const std::string& variable, const Lepton::ParsedExpression& expr, int component, CustomIntegrator& integrator, const std::string& forceName, const std::string& energyName);
+    std::string createPerDofComputation(const std::string& variable, const Lepton::ParsedExpression& expr, int component, CustomIntegrator& integrator,
+        const std::string& forceName, const std::string& energyName, std::vector<const TabulatedFunction*>& functions,
+        std::vector<std::pair<std::string, std::string> >& functionNames);
    void prepareForComputation(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid);
    Lepton::ExpressionTreeNode replaceDerivFunctions(const Lepton::ExpressionTreeNode& node, OpenMM::ContextImpl& context);
    void findExpressionsForDerivs(const Lepton::ExpressionTreeNode& node, std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& variableNodes);
@@ -1495,7 +1546,7 @@ private:
    CudaContext& cu;
    double energy;
    float energyFloat;
-    int numGlobalVariables;
+    int numGlobalVariables, sumWorkGroupSize;
    bool hasInitializedKernels, deviceValuesAreCurrent, deviceGlobalsAreCurrent, modifiesParameters, keNeedsForce, hasAnyConstraints, needsEnergyParamDerivs;
    mutable bool localValuesAreCurrent;
    CudaArray* globalValues;
@@ -1504,6 +1555,8 @@ private:
    CudaArray* uniformRandoms;
    CudaArray* randomSeed;
    CudaArray* perDofEnergyParamDerivs;
+    std::vector<CudaArray*> tabulatedFunctions;
+    std::map<int, double> savedEnergy;
    std::map<int, CudaArray*> savedForces;
    std::set<int> validSavedForces;
    CudaParameterSet* perDofValues;
@@ -1587,7 +1640,7 @@ private:
 class CudaApplyMonteCarloBarostatKernel : public ApplyMonteCarloBarostatKernel {
 public:
    CudaApplyMonteCarloBarostatKernel(std::string name, const Platform& platform, CudaContext& cu) : ApplyMonteCarloBarostatKernel(name, platform), cu(cu),
-            hasInitializedKernels(false), savedPositions(NULL), moleculeAtoms(NULL), moleculeStartIndex(NULL) {
+            hasInitializedKernels(false), savedPositions(NULL), savedForces(NULL), moleculeAtoms(NULL), moleculeStartIndex(NULL) {
    }
    ~CudaApplyMonteCarloBarostatKernel();
    /**
@@ -1622,6 +1675,7 @@ private:
    bool hasInitializedKernels;
    int numMolecules;
    CudaArray* savedPositions;
+    CudaArray* savedForces;
    CudaArray* moleculeAtoms;
    CudaArray* moleculeStartIndex;
    CUfunction kernel;

--- a/platforms/cuda/include/CudaPlatform.h
+++ b/platforms/cuda/include/CudaPlatform.h
@@ -53,6 +53,7 @@ public:
    const std::string& getPropertyValue(const Context& context, const std::string& property) const;
    void setPropertyValue(Context& context, const std::string& property, const std::string& value) const;
    void contextCreated(ContextImpl& context, const std::map<std::string, std::string>& properties) const;
+    void linkedContextCreated(ContextImpl& context, ContextImpl& originalContext) const;
    void contextDestroyed(ContextImpl& context) const;
    /**
     * This is the name of the parameter for selecting which CUDA device or devices to use.
@@ -130,7 +131,7 @@ class OPENMM_EXPORT_CUDA CudaPlatform::PlatformData {
 public:
    PlatformData(ContextImpl* context, const System& system, const std::string& deviceIndexProperty, const std::string& blockingProperty, const std::string& precisionProperty,
            const std::string& cpuPmeProperty, const std::string& compilerProperty, const std::string& tempProperty, const std::string& hostCompilerProperty,
-            const std::string& pmeStreamProperty, const std::string& deterministicForcesProperty, int numThreads);
+            const std::string& pmeStreamProperty, const std::string& deterministicForcesProperty, int numThreads, ContextImpl* originalContext);
    ~PlatformData();
    void initializeContexts(const System& system);
    void syncContexts();

--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -106,9 +106,9 @@ static int executeInWindows(const string &command) {
 #endif
 CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler,
-        const string& tempDir, const std::string& hostCompiler, CudaPlatform::PlatformData& platformData) : system(system), currentStream(0),
+        const string& tempDir, const std::string& hostCompiler, CudaPlatform::PlatformData& platformData, CudaContext* originalContext) : system(system), currentStream(0),
        time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), stepsSinceReorder(99999), contextIsValid(false), atomsWereReordered(false), hasCompilerKernel(false), isNvccAvailable(false),
-        pinnedBuffer(NULL), posq(NULL), posqCorrection(NULL), velm(NULL), force(NULL), energyBuffer(NULL), energyParamDerivBuffer(NULL), atomIndexDevice(NULL), chargeBuffer(NULL),
+        pinnedBuffer(NULL), posq(NULL), posqCorrection(NULL), velm(NULL), force(NULL), energyBuffer(NULL), energySum(NULL), energyParamDerivBuffer(NULL), atomIndexDevice(NULL), chargeBuffer(NULL),
        integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
    // Determine what compiler to use.
@@ -173,40 +173,49 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    cacheDir = cacheDir+"/";
 #endif
    contextIndex = platformData.contexts.size();
-    int numDevices;
    string errorMessage = "Error initializing Context";
-    CHECK_RESULT(cuDeviceGetCount(&numDevices));
+    if (originalContext == NULL) {
-    if (deviceIndex < -1 || deviceIndex >= numDevices)
+        isLinkedContext = false;
-        throw OpenMMException("Illegal value for DeviceIndex: "+intToString(deviceIndex));
+        int numDevices;
+        CHECK_RESULT(cuDeviceGetCount(&numDevices));
-    vector<int> devicePrecedence;
+        if (deviceIndex < -1 || deviceIndex >= numDevices)
-    if (deviceIndex == -1) {
+            throw OpenMMException("Illegal value for DeviceIndex: "+intToString(deviceIndex));
-        devicePrecedence = getDevicePrecedence();
-    } else {
+        vector<int> devicePrecedence;
-        devicePrecedence.push_back(deviceIndex);
+        if (deviceIndex == -1) {
-    }
+            devicePrecedence = getDevicePrecedence();
+        } else {
-    this->deviceIndex = -1;
+            devicePrecedence.push_back(deviceIndex);
-    for (int i = 0; i < static_cast<int>(devicePrecedence.size()); i++) {
+        }
-        int trialDeviceIndex = devicePrecedence[i];
-        CHECK_RESULT(cuDeviceGet(&device, trialDeviceIndex));
-        defaultOptimizationOptions = "--use_fast_math";
-        unsigned int flags = CU_CTX_MAP_HOST;
-        if (useBlockingSync)
-            flags += CU_CTX_SCHED_BLOCKING_SYNC;
-        else
-            flags += CU_CTX_SCHED_SPIN;
-        if (cuCtxCreate(&context, flags, device) == CUDA_SUCCESS) {
+        this->deviceIndex = -1;
-            this->deviceIndex = trialDeviceIndex;
+        for (int i = 0; i < static_cast<int>(devicePrecedence.size()); i++) {
-            break;
+            int trialDeviceIndex = devicePrecedence[i];
+            CHECK_RESULT(cuDeviceGet(&device, trialDeviceIndex));
+            defaultOptimizationOptions = "--use_fast_math";
+            unsigned int flags = CU_CTX_MAP_HOST;
+            if (useBlockingSync)
+                flags += CU_CTX_SCHED_BLOCKING_SYNC;
+            else
+                flags += CU_CTX_SCHED_SPIN;
+            if (cuCtxCreate(&context, flags, device) == CUDA_SUCCESS) {
+                this->deviceIndex = trialDeviceIndex;
+                break;
+            }
        }
+        if (this->deviceIndex == -1)
+            if (deviceIndex != -1)
+                throw OpenMMException("The requested CUDA device could not be loaded");
+            else
+                throw OpenMMException("No compatible CUDA device is available");
+    }
+    else {
+        isLinkedContext = true;
+        context = originalContext->context;
+        this->deviceIndex = originalContext->deviceIndex;
+        this->device = originalContext->device;
    }
-    if (this->deviceIndex == -1)
-        if (deviceIndex != -1)
-            throw OpenMMException("The requested CUDA device could not be loaded");
-        else
-            throw OpenMMException("No compatible CUDA device is available");
    int major, minor;
    CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
@@ -227,6 +236,12 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
            minor = 3;
        }
    }
+    if (major == 7) {
+        // Don't generate Volta-specific code until we've made the changes needed
+        // to support it properly.
+        major = 6;
+        minor = 0;
+    }
    gpuArchitecture = intToString(major)+intToString(minor);
    computeCapability = major+0.1*minor;
@@ -292,6 +307,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    clearFourBuffersKernel = getKernel(utilities, "clearFourBuffers");
    clearFiveBuffersKernel = getKernel(utilities, "clearFiveBuffers");
    clearSixBuffersKernel = getKernel(utilities, "clearSixBuffers");
+    reduceEnergyKernel = getKernel(utilities, "reduceEnergy");
    setChargesKernel = getKernel(utilities, "setCharges");
    // Set defines based on the requested precision.
@@ -405,6 +421,8 @@ CudaContext::~CudaContext() {
        delete force;
    if (energyBuffer != NULL)
        delete energyBuffer;
+    if (energySum != NULL)
+        delete energySum;
    if (energyParamDerivBuffer != NULL)
        delete energyParamDerivBuffer;
    if (atomIndexDevice != NULL)
@@ -422,7 +440,7 @@ CudaContext::~CudaContext() {
    if (thread != NULL)
        delete thread;
    string errorMessage = "Error deleting Context";
-    if (contextIsValid) {
+    if (contextIsValid && !isLinkedContext) {
        cuProfilerStop();
        CHECK_RESULT(cuCtxDestroy(context));
    }
@@ -435,16 +453,19 @@ void CudaContext::initialize() {
    int numEnergyBuffers = max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers());
    if (useDoublePrecision) {
        energyBuffer = CudaArray::create<double>(*this, numEnergyBuffers, "energyBuffer");
+        energySum = CudaArray::create<double>(*this, 1, "energySum");
        int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
    }
    else if (useMixedPrecision) {
        energyBuffer = CudaArray::create<double>(*this, numEnergyBuffers, "energyBuffer");
+        energySum = CudaArray::create<double>(*this, 1, "energySum");
        int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
    }
    else {
        energyBuffer = CudaArray::create<float>(*this, numEnergyBuffers, "energyBuffer");
+        energySum = CudaArray::create<float>(*this, 1, "energySum");
        int pinnedBufferSize = max(paddedNumAtoms*6, numEnergyBuffers);
        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(float), 0));
    }
@@ -864,6 +885,18 @@ void CudaContext::clearAutoclearBuffers() {
    }
 }
+double CudaContext::reduceEnergy() {
+    int bufferSize = energyBuffer->getSize();
+    int workGroupSize  = 512;
+    void* args[] = {&energyBuffer->getDevicePointer(), &energySum->getDevicePointer(), &bufferSize, &workGroupSize};
+    executeKernel(reduceEnergyKernel, args, workGroupSize, workGroupSize, workGroupSize*energyBuffer->getElementSize());
+    energySum->download(pinnedBuffer);
+    if (getUseDoublePrecision() || getUseMixedPrecision())
+        return *((double*) pinnedBuffer);
+    else
+        return *((float*) pinnedBuffer);
+}
 void CudaContext::setCharges(const vector<double>& charges) {
    if (chargeBuffer == NULL)
        chargeBuffer = new CudaArray(*this, numAtoms, useDoublePrecision ? sizeof(double) : sizeof(float), "chargeBuffer");
@@ -1050,9 +1083,16 @@ void CudaContext::findMoleculeGroups() {
            for (int i = 0; i < (int) forces.size() && identical; i++) {
                if (mol.groups[i].size() != mol2.groups[i].size())
                    identical = false;
-                for (int k = 0; k < (int) mol.groups[i].size() && identical; k++)
+                for (int k = 0; k < (int) mol.groups[i].size() && identical; k++) {
                    if (!forces[i]->areGroupsIdentical(mol.groups[i][k], mol2.groups[i][k]))
                        identical = false;
+                    vector<int> p1, p2;
+                    forces[i]->getParticlesInGroup(mol.groups[i][k], p1);
+                    forces[i]->getParticlesInGroup(mol2.groups[i][k], p2);
+                    for (int m = 0; m < p1.size(); m++)
+                        if (p1[m] != p2[m]-atomOffset)
+                            identical = false;
+                }
            }
            if (identical) {
                moleculeInstances[j].push_back(molIndex);

--- a/platforms/cuda/src/CudaIntegrationUtilities.cpp
+++ b/platforms/cuda/src/CudaIntegrationUtilities.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2015 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2017 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -103,7 +103,8 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
        ccmaReducedMass(NULL), ccmaAtomConstraints(NULL), ccmaNumAtomConstraints(NULL), ccmaConstraintMatrixColumn(NULL),
        ccmaConstraintMatrixValue(NULL), ccmaDelta1(NULL), ccmaDelta2(NULL), ccmaConverged(NULL), ccmaConvergedMemory(NULL),
        vsite2AvgAtoms(NULL), vsite2AvgWeights(NULL), vsite3AvgAtoms(NULL), vsite3AvgWeights(NULL),
-        vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL), vsiteLocalCoordsAtoms(NULL), vsiteLocalCoordsParams(NULL) {
+        vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL), vsiteLocalCoordsIndex(NULL), vsiteLocalCoordsAtoms(NULL),
+        vsiteLocalCoordsWeights(NULL), vsiteLocalCoordsPos(NULL), vsiteLocalCoordsStartIndex(NULL) {
    // Create workspace arrays.
    lastStepSize = make_double2(0.0, 0.0);
@@ -454,8 +455,11 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
    vector<double4> vsite3AvgWeightVec;
    vector<int4> vsiteOutOfPlaneAtomVec;
    vector<double4> vsiteOutOfPlaneWeightVec;
-    vector<int4> vsiteLocalCoordsAtomVec;
+    vector<int> vsiteLocalCoordsIndexVec;
-    vector<double> vsiteLocalCoordsParamVec;
+    vector<int> vsiteLocalCoordsAtomVec;
+    vector<int> vsiteLocalCoordsStartVec;
+    vector<double> vsiteLocalCoordsWeightVec;
+    vector<double4> vsiteLocalCoordsPosVec;
    for (int i = 0; i < numAtoms; i++) {
        if (system.isVirtualSite(i)) {
            if (dynamic_cast<const TwoParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
@@ -480,64 +484,72 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
                vsiteOutOfPlaneWeightVec.push_back(make_double4(site.getWeight12(), site.getWeight13(), site.getWeightCross(), 0.0));
            }
            else if (dynamic_cast<const LocalCoordinatesSite*>(&system.getVirtualSite(i)) != NULL) {
-                // An out of plane site.
+                // A local coordinates site.
                const LocalCoordinatesSite& site = dynamic_cast<const LocalCoordinatesSite&>(system.getVirtualSite(i));
-                vsiteLocalCoordsAtomVec.push_back(make_int4(i, site.getParticle(0), site.getParticle(1), site.getParticle(2)));
+                int numParticles = site.getNumParticles();
-                Vec3 origin = site.getOriginWeights();
+                vector<double> origin, x, y;
-                Vec3 x = site.getXWeights();
+                site.getOriginWeights(origin);
-                Vec3 y = site.getYWeights();
+                site.getXWeights(x);
+                site.getYWeights(y);
+                vsiteLocalCoordsIndexVec.push_back(i);
+                vsiteLocalCoordsStartVec.push_back(vsiteLocalCoordsAtomVec.size());
+                for (int j = 0; j < numParticles; j++) {
+                    vsiteLocalCoordsAtomVec.push_back(site.getParticle(j));
+                    vsiteLocalCoordsWeightVec.push_back(origin[j]);
+                    vsiteLocalCoordsWeightVec.push_back(x[j]);
+                    vsiteLocalCoordsWeightVec.push_back(y[j]);
+                }
                Vec3 pos = site.getLocalPosition();
-                vsiteLocalCoordsParamVec.push_back(origin[0]);
+                vsiteLocalCoordsPosVec.push_back(make_double4(pos[0], pos[1], pos[2], 0.0));
-                vsiteLocalCoordsParamVec.push_back(origin[1]);
-                vsiteLocalCoordsParamVec.push_back(origin[2]);
-                vsiteLocalCoordsParamVec.push_back(x[0]);
-                vsiteLocalCoordsParamVec.push_back(x[1]);
-                vsiteLocalCoordsParamVec.push_back(x[2]);
-                vsiteLocalCoordsParamVec.push_back(y[0]);
-                vsiteLocalCoordsParamVec.push_back(y[1]);
-                vsiteLocalCoordsParamVec.push_back(y[2]);
-                vsiteLocalCoordsParamVec.push_back(pos[0]);
-                vsiteLocalCoordsParamVec.push_back(pos[1]);
-                vsiteLocalCoordsParamVec.push_back(pos[2]);
            }
        }
    }
+    vsiteLocalCoordsStartVec.push_back(vsiteLocalCoordsAtomVec.size());
    int num2Avg = vsite2AvgAtomVec.size();
    int num3Avg = vsite3AvgAtomVec.size();
    int numOutOfPlane = vsiteOutOfPlaneAtomVec.size();
-    int numLocalCoords = vsiteLocalCoordsAtomVec.size();
+    int numLocalCoords = vsiteLocalCoordsPosVec.size();
    vsite2AvgAtoms = CudaArray::create<int4>(context, max(1, num2Avg), "vsite2AvgAtoms");
    vsite3AvgAtoms = CudaArray::create<int4>(context, max(1, num3Avg), "vsite3AvgAtoms");
    vsiteOutOfPlaneAtoms = CudaArray::create<int4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneAtoms");
-    vsiteLocalCoordsAtoms = CudaArray::create<int4>(context, max(1, numLocalCoords), "vsiteLocalCoordinatesAtoms");
+    vsiteLocalCoordsIndex = CudaArray::create<int>(context, max(1, (int) vsiteLocalCoordsIndexVec.size()), "vsiteLocalCoordsIndex");
+    vsiteLocalCoordsAtoms = CudaArray::create<int>(context, max(1, (int) vsiteLocalCoordsAtomVec.size()), "vsiteLocalCoordsAtoms");
+    vsiteLocalCoordsStartIndex = CudaArray::create<int>(context, max(1, (int) vsiteLocalCoordsStartVec.size()), "vsiteLocalCoordsStartIndex");
    if (num2Avg > 0)
        vsite2AvgAtoms->upload(vsite2AvgAtomVec);
    if (num3Avg > 0)
        vsite3AvgAtoms->upload(vsite3AvgAtomVec);
    if (numOutOfPlane > 0)
        vsiteOutOfPlaneAtoms->upload(vsiteOutOfPlaneAtomVec);
-    if (numLocalCoords > 0)
+    if (numLocalCoords > 0) {
+        vsiteLocalCoordsIndex->upload(vsiteLocalCoordsIndexVec);
        vsiteLocalCoordsAtoms->upload(vsiteLocalCoordsAtomVec);
+        vsiteLocalCoordsStartIndex->upload(vsiteLocalCoordsStartVec);
+    }
    if (context.getUseDoublePrecision()) {
        vsite2AvgWeights = CudaArray::create<double2>(context, max(1, num2Avg), "vsite2AvgWeights");
        vsite3AvgWeights = CudaArray::create<double4>(context, max(1, num3Avg), "vsite3AvgWeights");
        vsiteOutOfPlaneWeights = CudaArray::create<double4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneWeights");
-        vsiteLocalCoordsParams = CudaArray::create<double>(context, max(1, 12*numLocalCoords), "vsiteLocalCoordinatesParams");
+        vsiteLocalCoordsWeights = CudaArray::create<double>(context, max(1, (int) vsiteLocalCoordsWeightVec.size()), "vsiteLocalCoordsWeights");
+        vsiteLocalCoordsPos = CudaArray::create<double4>(context, max(1, (int) vsiteLocalCoordsPosVec.size()), "vsiteLocalCoordsPos");
        if (num2Avg > 0)
            vsite2AvgWeights->upload(vsite2AvgWeightVec);
        if (num3Avg > 0)
            vsite3AvgWeights->upload(vsite3AvgWeightVec);
        if (numOutOfPlane > 0)
            vsiteOutOfPlaneWeights->upload(vsiteOutOfPlaneWeightVec);
-        if (numLocalCoords > 0)
+        if (numLocalCoords > 0) {
-            vsiteLocalCoordsParams->upload(vsiteLocalCoordsParamVec);
+            vsiteLocalCoordsWeights->upload(vsiteLocalCoordsWeightVec);
+            vsiteLocalCoordsPos->upload(vsiteLocalCoordsPosVec);
+        }
    }
    else {
        vsite2AvgWeights = CudaArray::create<float2>(context, max(1, num2Avg), "vsite2AvgWeights");
        vsite3AvgWeights = CudaArray::create<float4>(context, max(1, num3Avg), "vsite3AvgWeights");
        vsiteOutOfPlaneWeights = CudaArray::create<float4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneWeights");
-        vsiteLocalCoordsParams = CudaArray::create<float>(context, max(1, 12*numLocalCoords), "vsiteLocalCoordinatesParams");
+        vsiteLocalCoordsWeights = CudaArray::create<float>(context, max(1, (int) vsiteLocalCoordsWeightVec.size()), "vsiteLocalCoordsWeights");
+        vsiteLocalCoordsPos = CudaArray::create<float4>(context, max(1, (int) vsiteLocalCoordsPosVec.size()), "vsiteLocalCoordsPos");
        if (num2Avg > 0) {
            vector<float2> floatWeights(num2Avg);
            for (int i = 0; i < num2Avg; i++)
@@ -557,10 +569,14 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
            vsiteOutOfPlaneWeights->upload(floatWeights);
        }
        if (numLocalCoords > 0) {
-            vector<float> floatParams(vsiteLocalCoordsParamVec.size());
+            vector<float> floatWeights(vsiteLocalCoordsWeightVec.size());
-            for (int i = 0; i < (int) vsiteLocalCoordsParamVec.size(); i++)
+            for (int i = 0; i < (int) vsiteLocalCoordsWeightVec.size(); i++)
-                floatParams[i] = (float) vsiteLocalCoordsParamVec[i];
+                floatWeights[i] = (float) vsiteLocalCoordsWeightVec[i];
-            vsiteLocalCoordsParams->upload(floatParams);
+            vsiteLocalCoordsWeights->upload(floatWeights);
+            vector<float4> floatPos(vsiteLocalCoordsPosVec.size());
+            for (int i = 0; i < (int) vsiteLocalCoordsPosVec.size(); i++)
+                floatPos[i] = make_float4((float) vsiteLocalCoordsPosVec[i].x, (float) vsiteLocalCoordsPosVec[i].y, (float) vsiteLocalCoordsPosVec[i].z, 0.0f);
+            vsiteLocalCoordsPos->upload(floatPos);
        }
    }
@@ -644,10 +660,16 @@ CudaIntegrationUtilities::~CudaIntegrationUtilities() {
        delete vsiteOutOfPlaneAtoms;
    if (vsiteOutOfPlaneWeights != NULL)
        delete vsiteOutOfPlaneWeights;
+    if (vsiteLocalCoordsIndex != NULL)
+        delete vsiteLocalCoordsIndex;
    if (vsiteLocalCoordsAtoms != NULL)
        delete vsiteLocalCoordsAtoms;
-    if (vsiteLocalCoordsParams != NULL)
+    if (vsiteLocalCoordsWeights != NULL)
-        delete vsiteLocalCoordsParams;
+        delete vsiteLocalCoordsWeights;
+    if (vsiteLocalCoordsPos != NULL)
+        delete vsiteLocalCoordsPos;
+    if (vsiteLocalCoordsStartIndex != NULL)
+        delete vsiteLocalCoordsStartIndex;
 }
 void CudaIntegrationUtilities::setNextStepSize(double size) {
@@ -747,7 +769,9 @@ void CudaIntegrationUtilities::computeVirtualSites() {
        void* args[] = {&context.getPosq().getDevicePointer(), &posCorrection, &vsite2AvgAtoms->getDevicePointer(), &vsite2AvgWeights->getDevicePointer(),
                &vsite3AvgAtoms->getDevicePointer(), &vsite3AvgWeights->getDevicePointer(),
                &vsiteOutOfPlaneAtoms->getDevicePointer(), &vsiteOutOfPlaneWeights->getDevicePointer(),
-                &vsiteLocalCoordsAtoms->getDevicePointer(), &vsiteLocalCoordsParams->getDevicePointer()};
+                &vsiteLocalCoordsIndex->getDevicePointer(), &vsiteLocalCoordsAtoms->getDevicePointer(),
+                &vsiteLocalCoordsWeights->getDevicePointer(), &vsiteLocalCoordsPos->getDevicePointer(),
+                &vsiteLocalCoordsStartIndex->getDevicePointer()};
        context.executeKernel(vsitePositionKernel, args, numVsites);
    }
 }
@@ -759,7 +783,9 @@ void CudaIntegrationUtilities::distributeForcesFromVirtualSites() {
                &vsite2AvgAtoms->getDevicePointer(), &vsite2AvgWeights->getDevicePointer(),
                &vsite3AvgAtoms->getDevicePointer(), &vsite3AvgWeights->getDevicePointer(),
                &vsiteOutOfPlaneAtoms->getDevicePointer(), &vsiteOutOfPlaneWeights->getDevicePointer(),
-                &vsiteLocalCoordsAtoms->getDevicePointer(), &vsiteLocalCoordsParams->getDevicePointer()};
+                &vsiteLocalCoordsIndex->getDevicePointer(), &vsiteLocalCoordsAtoms->getDevicePointer(),
+                &vsiteLocalCoordsWeights->getDevicePointer(), &vsiteLocalCoordsPos->getDevicePointer(),
+                &vsiteLocalCoordsStartIndex->getDevicePointer()};
        context.executeKernel(vsiteForceKernel, args, numVsites);
    }
 }

--- a/platforms/cuda/src/CudaKernelFactory.cpp
+++ b/platforms/cuda/src/CudaKernelFactory.cpp
@@ -108,6 +108,8 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform
        return new CudaCalcCustomCentroidBondForceKernel(name, platform, cu, context.getSystem());
    if (name == CalcCustomCompoundBondForceKernel::Name())
        return new CudaCalcCustomCompoundBondForceKernel(name, platform, cu, context.getSystem());
+    if (name == CalcCustomCVForceKernel::Name())
+        return new CudaCalcCustomCVForceKernel(name, platform, cu);
    if (name == CalcCustomManyParticleForceKernel::Name())
        return new CudaCalcCustomManyParticleForceKernel(name, platform, cu, context.getSystem());
    if (name == CalcGayBerneForceKernel::Name())

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -48,6 +48,7 @@
 #include "lepton/Operation.h"
 #include "lepton/Parser.h"
 #include "lepton/ParsedExpression.h"
+#include "ReferenceTabulatedFunction.h"
 #include "SimTKOpenMMRealType.h"
 #include "SimTKOpenMMUtilities.h"
 #include <algorithm>
@@ -114,21 +115,8 @@ double CudaCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bo
    for (auto computation : cu.getPostComputations())
        sum += computation->computeForceAndEnergy(includeForces, includeEnergy, groups);
    cu.getIntegrationUtilities().distributeForcesFromVirtualSites();
-    if (includeEnergy) {
+    if (includeEnergy)
-        CudaArray& energyArray = cu.getEnergyBuffer();
+        sum += cu.reduceEnergy();
-        if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
-            double* energy = (double*) cu.getPinnedBuffer();
-            energyArray.download(energy);
-            for (int i = 0; i < energyArray.getSize(); i++)
-                sum += energy[i];
-        }
-        else {
-            float* energy = (float*) cu.getPinnedBuffer();
-            energyArray.download(energy);
-            for (int i = 0; i < energyArray.getSize(); i++)
-                sum += energy[i];
-        }
-    }
    if (!cu.getForcesValid())
        valid = false;
    return sum;
@@ -1805,7 +1793,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
                try {
                    cpuPme = getPlatform().createKernel(CalcPmeReciprocalForceKernel::Name(), *cu.getPlatformData().context);
-                    cpuPme.getAs<CalcPmeReciprocalForceKernel>().initialize(gridSizeX, gridSizeY, gridSizeZ, numParticles, alpha);
+                    cpuPme.getAs<CalcPmeReciprocalForceKernel>().initialize(gridSizeX, gridSizeY, gridSizeZ, numParticles, alpha, cu.getPlatformData().deterministicForces);
                    CUfunction addForcesKernel = cu.getKernel(module, "addForces");
                    pmeio = new PmeIO(cu, addForcesKernel);
                    cu.addPreComputation(new PmePreComputation(cu, cpuPme, *pmeio));
@@ -4457,7 +4445,7 @@ void CudaCalcCustomHbondForceKernel::initialize(const System& system, const Cust
        const vector<int>& atoms = distance.second;
        string deltaName = atomNames[atoms[0]]+atomNames[atoms[1]];
        if (computedDeltas.count(deltaName) == 0) {
-            addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real4 delta"+deltaName+" = delta("+atomNamesLower[atoms[0]]+", "+atomNamesLower[atoms[1]]+");\n");
+            addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real4 delta"+deltaName+" = delta("+atomNamesLower[atoms[0]]+", "+atomNamesLower[atoms[1]]+", periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);\n");
            computedDeltas.insert(deltaName);
        }
        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real r_"+deltaName+" = SQRT(delta"+deltaName+".w);\n");
@@ -4472,11 +4460,11 @@ void CudaCalcCustomHbondForceKernel::initialize(const System& system, const Cust
        string deltaName2 = atomNames[atoms[1]]+atomNames[atoms[2]];
        string angleName = "angle_"+atomNames[atoms[0]]+atomNames[atoms[1]]+atomNames[atoms[2]];
        if (computedDeltas.count(deltaName1) == 0) {
-            addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real4 delta"+deltaName1+" = delta("+atomNamesLower[atoms[1]]+", "+atomNamesLower[atoms[0]]+");\n");
+            addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real4 delta"+deltaName1+" = delta("+atomNamesLower[atoms[1]]+", "+atomNamesLower[atoms[0]]+", periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);\n");
            computedDeltas.insert(deltaName1);
        }
        if (computedDeltas.count(deltaName2) == 0) {
-            addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real4 delta"+deltaName2+" = delta("+atomNamesLower[atoms[1]]+", "+atomNamesLower[atoms[2]]+");\n");
+            addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real4 delta"+deltaName2+" = delta("+atomNamesLower[atoms[1]]+", "+atomNamesLower[atoms[2]]+", periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);\n");
            computedDeltas.insert(deltaName2);
        }
        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real "+angleName+" = computeAngle(delta"+deltaName1+", delta"+deltaName2+");\n");
@@ -4494,15 +4482,15 @@ void CudaCalcCustomHbondForceKernel::initialize(const System& system, const Cust
        string crossName2 = "cross_"+deltaName2+"_"+deltaName3;
        string dihedralName = "dihedral_"+atomNames[atoms[0]]+atomNames[atoms[1]]+atomNames[atoms[2]]+atomNames[atoms[3]];
        if (computedDeltas.count(deltaName1) == 0) {
-            addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real4 delta"+deltaName1+" = delta("+atomNamesLower[atoms[0]]+", "+atomNamesLower[atoms[1]]+");\n");
+            addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real4 delta"+deltaName1+" = delta("+atomNamesLower[atoms[0]]+", "+atomNamesLower[atoms[1]]+", periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);\n");
            computedDeltas.insert(deltaName1);
        }
        if (computedDeltas.count(deltaName2) == 0) {
-            addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real4 delta"+deltaName2+" = delta("+atomNamesLower[atoms[2]]+", "+atomNamesLower[atoms[1]]+");\n");
+            addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real4 delta"+deltaName2+" = delta("+atomNamesLower[atoms[2]]+", "+atomNamesLower[atoms[1]]+", periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);\n");
            computedDeltas.insert(deltaName2);
        }
        if (computedDeltas.count(deltaName3) == 0) {
-            addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real4 delta"+deltaName3+" = delta("+atomNamesLower[atoms[2]]+", "+atomNamesLower[atoms[3]]+");\n");
+            addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real4 delta"+deltaName3+" = delta("+atomNamesLower[atoms[2]]+", "+atomNamesLower[atoms[3]]+", periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);\n");
            computedDeltas.insert(deltaName3);
        }
        addDonorAndAcceptorCode(computeDonor, computeAcceptor, "real4 "+crossName1+" = computeCross(delta"+deltaName1+", delta"+deltaName2+");\n");
@@ -4521,12 +4509,12 @@ void CudaCalcCustomHbondForceKernel::initialize(const System& system, const Cust
    for (int i = 0; i < (int) donorParams->getBuffers().size(); i++) {
        CudaNonbondedUtilities::ParameterInfo& buffer = donorParams->getBuffers()[i];
        extraArgs << ", const "+buffer.getType()+"* __restrict__ donor"+buffer.getName();
-        addDonorAndAcceptorCode(computeDonor, computeAcceptor, buffer.getType()+" donorParams"+cu.intToString(i+1)+" = donor"+buffer.getName()+"[index];\n");
+        addDonorAndAcceptorCode(computeDonor, computeAcceptor, buffer.getType()+" donorParams"+cu.intToString(i+1)+" = donor"+buffer.getName()+"[donorIndex];\n");
    }
    for (int i = 0; i < (int) acceptorParams->getBuffers().size(); i++) {
        CudaNonbondedUtilities::ParameterInfo& buffer = acceptorParams->getBuffers()[i];
        extraArgs << ", const "+buffer.getType()+"* __restrict__ acceptor"+buffer.getName();
-        addDonorAndAcceptorCode(computeDonor, computeAcceptor, buffer.getType()+" acceptorParams"+cu.intToString(i+1)+" = acceptor"+buffer.getName()+"[index];\n");
+        addDonorAndAcceptorCode(computeDonor, computeAcceptor, buffer.getType()+" acceptorParams"+cu.intToString(i+1)+" = acceptor"+buffer.getName()+"[acceptorIndex];\n");
    }
    // Now evaluate the expressions.
@@ -6608,6 +6596,176 @@ void CudaCalcGayBerneForceKernel::sortAtoms() {
    exclusionStartIndex->upload(startIndexVec);
 }
+class CudaCalcCustomCVForceKernel::ReorderListener : public CudaContext::ReorderListener {
+public:
+    ReorderListener(CudaContext& cu, CudaArray& invAtomOrder) : cu(cu), invAtomOrder(invAtomOrder) {
+    }
+    void execute() {
+        vector<int> invOrder(cu.getPaddedNumAtoms());
+        const vector<int>& order = cu.getAtomIndex();
+        for (int i = 0; i < order.size(); i++)
+            invOrder[order[i]] = i;
+        invAtomOrder.upload(invOrder);
+    }
+private:
+    CudaContext& cu;
+    CudaArray& invAtomOrder;
+};
+CudaCalcCustomCVForceKernel::~CudaCalcCustomCVForceKernel() {
+    for (auto force : cvForces)
+        delete force;
+    if (invAtomOrder != NULL)
+        delete invAtomOrder;
+    if (innerInvAtomOrder != NULL)
+        delete innerInvAtomOrder;
+}
+void CudaCalcCustomCVForceKernel::initialize(const System& system, const CustomCVForce& force, ContextImpl& innerContext) {
+    int numCVs = force.getNumCollectiveVariables();
+    for (int i = 0; i < force.getNumGlobalParameters(); i++)
+        globalParameterNames.push_back(force.getGlobalParameterName(i));
+    // Create custom functions for the tabulated functions.
+    map<string, Lepton::CustomFunction*> functions;
+    for (int i = 0; i < (int) force.getNumTabulatedFunctions(); i++)
+        functions[force.getTabulatedFunctionName(i)] = createReferenceTabulatedFunction(force.getTabulatedFunction(i));
+    // Create the expressions.
+    Lepton::ParsedExpression energyExpr = Lepton::Parser::parse(force.getEnergyFunction(), functions);
+    energyExpression = energyExpr.createProgram();
+    for (int i = 0; i < numCVs; i++) {
+        string name = force.getCollectiveVariableName(i);
+        variableNames.push_back(name);
+        variableDerivExpressions.push_back(energyExpr.differentiate(name).optimize().createProgram());
+    }
+    for (int i = 0; i < force.getNumEnergyParameterDerivatives(); i++) {
+        string name = force.getEnergyParameterDerivativeName(i);
+        paramDerivNames.push_back(name);
+        paramDerivExpressions.push_back(energyExpr.differentiate(name).optimize().createProgram());
+        cu.addEnergyParameterDerivative(name);
+    }
+    // Delete the custom functions.
+    for (auto& function : functions)
+        delete function.second;
+    // Copy parameter derivatives from the inner context.
+    CudaContext& cu2 = *reinterpret_cast<CudaPlatform::PlatformData*>(innerContext.getPlatformData())->contexts[0];
+    for (auto& param : cu2.getEnergyParamDerivNames())
+        cu.addEnergyParameterDerivative(param);
+    // Create arrays for storing information.
+    int elementSize = (cu.getUseDoublePrecision() || cu.getUseMixedPrecision() ? sizeof(double) : sizeof(float));
+    for (int i = 0; i < numCVs; i++)
+        cvForces.push_back(CudaArray::create<long long>(cu, 3*cu.getPaddedNumAtoms(), "cvForce"));
+    invAtomOrder = CudaArray::create<int>(cu, cu.getPaddedNumAtoms(), "invAtomOrder");
+    innerInvAtomOrder = CudaArray::create<int>(cu, cu.getPaddedNumAtoms(), "innerInvAtomOrder");
+    // Create the kernels.
+    stringstream args, add;
+    for (int i = 0; i < numCVs; i++) {
+        args << ", long long* __restrict__ force" << i << ", real dEdV" << i;
+        add << "forces[i] += (long long) (force" << i << "[i]*dEdV" << i << ");\n";
+    }
+    map<string, string> replacements;
+    replacements["PARAMETER_ARGUMENTS"] = args.str();
+    replacements["ADD_FORCES"] = add.str();
+    CUmodule module = cu.createModule(cu.replaceStrings(CudaKernelSources::vectorOps+CudaKernelSources::customCVForce, replacements));
+    copyStateKernel = cu.getKernel(module, "copyState");
+    copyForcesKernel = cu.getKernel(module, "copyForces");
+    addForcesKernel = cu.getKernel(module, "addForces");
+}
+double CudaCalcCustomCVForceKernel::execute(ContextImpl& context, ContextImpl& innerContext, bool includeForces, bool includeEnergy) {
+    copyState(context, innerContext);
+    int numCVs = variableNames.size();
+    int numAtoms = cu.getNumAtoms();
+    int paddedNumAtoms = cu.getPaddedNumAtoms();
+    CudaContext& cu2 = *reinterpret_cast<CudaPlatform::PlatformData*>(innerContext.getPlatformData())->contexts[0];
+    vector<double> cvValues;
+    vector<map<string, double> > cvDerivs(numCVs);
+    void* copyForcesArgs[] = {NULL, &invAtomOrder->getDevicePointer(), &cu2.getForce().getDevicePointer(), &cu2.getAtomIndexArray().getDevicePointer(), &numAtoms, &paddedNumAtoms};
+    for (int i = 0; i < numCVs; i++) {
+        cvValues.push_back(innerContext.calcForcesAndEnergy(true, true, 1<<i));
+        copyForcesArgs[0] = &cvForces[i]->getDevicePointer();
+        cu.executeKernel(copyForcesKernel, copyForcesArgs, numAtoms);
+        innerContext.getEnergyParameterDerivatives(cvDerivs[i]);
+    }
+    // Compute the energy and forces.
+    map<string, double> variables;
+    for (auto& name : globalParameterNames)
+        variables[name] = context.getParameter(name);
+    for (int i = 0; i < numCVs; i++)
+        variables[variableNames[i]] = cvValues[i];
+    double energy = energyExpression.evaluate(variables);
+    int bufferSize = cu.getForce().getSize();
+    vector<void*> addForcesArgs;
+    addForcesArgs.push_back(&cu.getForce().getDevicePointer());
+    addForcesArgs.push_back(&bufferSize);
+    vector<double> dEdV(numCVs);
+    vector<float> dEdVFloat(numCVs);
+    for (int i = 0; i < numCVs; i++) {
+        dEdV[i] = variableDerivExpressions[i].evaluate(variables);
+        dEdVFloat[i] = (float) dEdV[i];
+        addForcesArgs.push_back(&cvForces[i]->getDevicePointer());
+        if (cu.getUseDoublePrecision())
+            addForcesArgs.push_back(&dEdV[i]);
+        else
+            addForcesArgs.push_back(&dEdVFloat[i]);
+    }
+    cu.executeKernel(addForcesKernel, &addForcesArgs[0], numAtoms);
+    // Compute the energy parameter derivatives.
+    map<string, double>& energyParamDerivs = cu.getEnergyParamDerivWorkspace();
+    for (int i = 0; i < paramDerivExpressions.size(); i++)
+        energyParamDerivs[paramDerivNames[i]] += paramDerivExpressions[i].evaluate(variables);
+    for (int i = 0; i < numCVs; i++) {
+        double dEdV = variableDerivExpressions[i].evaluate(variables);
+        for (auto& deriv : cvDerivs[i])
+            energyParamDerivs[deriv.first] += dEdV*deriv.second;
+    }
+    return energy;
+}
+void CudaCalcCustomCVForceKernel::copyState(ContextImpl& context, ContextImpl& innerContext) {
+    int numAtoms = cu.getNumAtoms();
+    CudaContext& cu2 = *reinterpret_cast<CudaPlatform::PlatformData*>(innerContext.getPlatformData())->contexts[0];
+    if (!hasInitializedListeners) {
+        hasInitializedListeners = true;
+        // Initialize the listeners.
+        ReorderListener* listener1 = new ReorderListener(cu, *invAtomOrder);
+        ReorderListener* listener2 = new ReorderListener(cu2, *innerInvAtomOrder);
+        cu.addReorderListener(listener1);
+        cu2.addReorderListener(listener2);
+        listener1->execute();
+        listener2->execute();
+    }
+    CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
+    CUdeviceptr posCorrection2 = (cu2.getUseMixedPrecision() ? cu2.getPosqCorrection().getDevicePointer() : 0);
+    void* copyStateArgs[] = {&cu.getPosq().getDevicePointer(), &posCorrection, &cu.getVelm().getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(),
+        &cu2.getPosq().getDevicePointer(), &posCorrection2,& cu2.getVelm().getDevicePointer(), &innerInvAtomOrder->getDevicePointer(), &numAtoms};
+    cu.executeKernel(copyStateKernel, copyStateArgs, numAtoms);
+    Vec3 a, b, c;
+    context.getPeriodicBoxVectors(a, b, c);
+    innerContext.setPeriodicBoxVectors(a, b, c);
+    innerContext.setTime(context.getTime());
+    map<string, double> innerParameters = innerContext.getParameters();
+    for (auto& param : innerParameters)
+        innerContext.setParameter(param.first, context.getParameter(param.first));
+}
 CudaIntegrateVerletStepKernel::~CudaIntegrateVerletStepKernel() {
 }
@@ -7061,6 +7219,8 @@ CudaIntegrateCustomStepKernel::~CudaIntegrateCustomStepKernel() {
        delete perDofEnergyParamDerivs;
    if (perDofValues != NULL)
        delete perDofValues;
+    for (auto function : tabulatedFunctions)
+        delete function;
    for (auto& f : savedForces)
        delete f.second;
 }
@@ -7078,7 +7238,8 @@ void CudaIntegrateCustomStepKernel::initialize(const System& system, const Custo
    SimTKOpenMMUtilities::setRandomNumberSeed(integrator.getRandomNumberSeed());
 }
-string CudaIntegrateCustomStepKernel::createPerDofComputation(const string& variable, const Lepton::ParsedExpression& expr, int component, CustomIntegrator& integrator, const string& forceName, const string& energyName) {
+string CudaIntegrateCustomStepKernel::createPerDofComputation(const string& variable, const Lepton::ParsedExpression& expr, int component, CustomIntegrator& integrator,
+        const string& forceName, const string& energyName, vector<const TabulatedFunction*>& functions, vector<pair<string, string> >& functionNames) {
    const string suffixes[] = {".x", ".y", ".z"};
    string suffix = suffixes[component];
    map<string, Lepton::ParsedExpression> expressions;
@@ -7111,8 +7272,6 @@ string CudaIntegrateCustomStepKernel::createPerDofComputation(const string& vari
        variables[integrator.getPerDofVariableName(i)] = "perDof"+suffix.substr(1)+perDofValues->getParameterSuffix(i);
    for (int i = 0; i < (int) parameterNames.size(); i++)
        variables[parameterNames[i]] = "globals["+cu.intToString(parameterVariableIndex[i])+"]";
-    vector<const TabulatedFunction*> functions;
-    vector<pair<string, string> > functionNames;
    vector<pair<ExpressionTreeNode, string> > variableNodes;
    findExpressionsForDerivs(expr.getRootNode(), variableNodes);
    for (auto& var : variables)
@@ -7144,18 +7303,41 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
        stepTarget.resize(numSteps);
        merged.resize(numSteps, false);
        modifiesParameters = false;
+        sumWorkGroupSize = 512;
        map<string, string> defines;
        defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
        defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
-        defines["WORK_GROUP_SIZE"] = cu.intToString(CudaContext::ThreadBlockSize);
+        defines["WORK_GROUP_SIZE"] = cu.intToString(sumWorkGroupSize);
        defines["SUM_BUFFER_SIZE"] = "0";
+        // Record the tabulated functions.
+        map<string, Lepton::CustomFunction*> functions;
+        vector<pair<string, string> > functionNames;
+        vector<const TabulatedFunction*> functionList;
+        vector<string> tableTypes;
+        for (int i = 0; i < integrator.getNumTabulatedFunctions(); i++) {
+            functionList.push_back(&integrator.getTabulatedFunction(i));
+            string name = integrator.getTabulatedFunctionName(i);
+            string arrayName = "table"+cu.intToString(i);
+            functionNames.push_back(make_pair(name, arrayName));
+            functions[name] = createReferenceTabulatedFunction(integrator.getTabulatedFunction(i));
+            int width;
+            vector<float> f = cu.getExpressionUtilities().computeFunctionCoefficients(integrator.getTabulatedFunction(i), width);
+            tabulatedFunctions.push_back(CudaArray::create<float>(cu, f.size(), "TabulatedFunction"));
+            tabulatedFunctions[tabulatedFunctions.size()-1]->upload(f);
+            if (width == 1)
+                tableTypes.push_back("float");
+            else
+                tableTypes.push_back("float"+cu.intToString(width));
+        }
        // Record information about all the computation steps.
        vector<string> variable(numSteps);
        vector<int> forceGroup;
        vector<vector<Lepton::ParsedExpression> > expression;
-        CustomIntegratorUtilities::analyzeComputations(context, integrator, expression, comparisons, blockEnd, invalidatesForces, needsForces, needsEnergy, computeBothForceAndEnergy, forceGroup);
+        CustomIntegratorUtilities::analyzeComputations(context, integrator, expression, comparisons, blockEnd, invalidatesForces, needsForces, needsEnergy, computeBothForceAndEnergy, forceGroup, functions);
        for (int step = 0; step < numSteps; step++) {
            string expr;
            integrator.getComputationStep(step, stepType[step], variable[step], expr);
@@ -7326,7 +7508,7 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
                    if (numUniform > 0)
                        compute << "float4 uniform = uniformValues[uniformIndex+index];\n";
                    for (int i = 0; i < 3; i++)
-                        compute << createPerDofComputation(stepType[j] == CustomIntegrator::ComputePerDof ? variable[j] : "", expression[j][0], i, integrator, forceName[j], energyName[j]);
+                        compute << createPerDofComputation(stepType[j] == CustomIntegrator::ComputePerDof ? variable[j] : "", expression[j][0], i, integrator, forceName[j], energyName[j], functionList, functionNames);
                    if (variable[j] == "x") {
                        if (storePosAsDelta[j])
                            compute << "posDelta[index] = convertFromDouble4(position-convertToDouble4(loadPos(posq, posqCorrection, index)));\n";
@@ -7357,6 +7539,8 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
                    string valueName = "perDofValues"+cu.intToString(i+1);
                    args << ", " << buffer.getType() << "* __restrict__ " << valueName;
                }
+                for (int i = 0; i < (int) tableTypes.size(); i++)
+                    args << ", const " << tableTypes[i]<< "* __restrict__ table" << i;
                replacements["PARAMETER_ARGUMENTS"] = args.str();
                if (loadPosAsDelta[step])
                    defines["LOAD_POS_AS_DELTA"] = "1";
@@ -7386,6 +7570,8 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
                args1.push_back(&perDofEnergyParamDerivs->getDevicePointer());
                for (auto& buffer : perDofValues->getBuffers())
                    args1.push_back(&buffer.getMemory());
+                for (auto array : tabulatedFunctions)
+                    args1.push_back(&array->getDevicePointer());
                kernelArgs[step].push_back(args1);
                if (stepType[step] == CustomIntegrator::ComputeSum) {
                    // Create a second kernel for this step that sums the values.
@@ -7448,7 +7634,7 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
        }
        Lepton::ParsedExpression keExpression = Lepton::Parser::parse(integrator.getKineticEnergyExpression()).optimize();
        for (int i = 0; i < 3; i++)
-            computeKE << createPerDofComputation("", keExpression, i, integrator, "f", "");
+            computeKE << createPerDofComputation("", keExpression, i, integrator, "f", "", functionList, functionNames);
        map<string, string> replacements;
        replacements["COMPUTE_STEP"] = computeKE.str();
        stringstream args;
@@ -7457,6 +7643,8 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
            string valueName = "perDofValues"+cu.intToString(i+1);
            args << ", " << buffer.getType() << "* __restrict__ " << valueName;
        }
+        for (int i = 0; i < (int) tableTypes.size(); i++)
+            args << ", const " << tableTypes[i]<< "* __restrict__ table" << i;
        replacements["PARAMETER_ARGUMENTS"] = args.str();
        defines["SUM_BUFFER_SIZE"] = cu.intToString(3*numAtoms);
        if (defines.find("LOAD_POS_AS_DELTA") != defines.end())
@@ -7481,6 +7669,8 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
        kineticEnergyArgs.push_back(&perDofEnergyParamDerivs->getDevicePointer());
        for (int i = 0; i < (int) perDofValues->getBuffers().size(); i++)
            kineticEnergyArgs.push_back(&perDofValues->getBuffers()[i].getMemory());
+        for (auto array : tabulatedFunctions)
+            kineticEnergyArgs.push_back(&array->getDevicePointer());
        keNeedsForce = usesVariable(keExpression, "f");
        // Create a second kernel to sum the values.
@@ -7488,6 +7678,11 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
        defines["SUM_BUFFER_SIZE"] = cu.intToString(3*numAtoms);
        module = cu.createModule(CudaKernelSources::customIntegrator, defines);
        sumKineticEnergyKernel = cu.getKernel(module, useDouble ? "computeDoubleSum" : "computeFloatSum");
+        // Delete the custom functions.
+        for (auto& function : functions)
+            delete function.second;
    }
    // Make sure all values (variables, parameters, etc.) are up to date.
@@ -7557,6 +7752,8 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
    CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
    int numAtoms = cu.getNumAtoms();
    int numSteps = integrator.getNumComputations();
+    if (!forcesAreValid)
+        savedEnergy.clear();
    // Loop over computation steps in the integrator and execute them.
@@ -7565,14 +7762,19 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
    CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
    for (int step = 0; step < numSteps; ) {
        int nextStep = step+1;
+        int forceGroups = forceGroupFlags[step];
        int lastForceGroups = context.getLastForceGroups();
-        if ((needsForces[step] || needsEnergy[step]) && (!forcesAreValid || lastForceGroups != forceGroupFlags[step])) {
+        bool haveForces = (!needsForces[step] || (forcesAreValid && lastForceGroups == forceGroups));
-            if (forcesAreValid && savedForces.find(lastForceGroups) != savedForces.end()) {
+        bool haveEnergy = (!needsEnergy[step] || savedEnergy.find(forceGroups) != savedEnergy.end());
-                // The forces are still valid.  We just need a different force group right now.  Save the old
+        if (!haveForces || !haveEnergy) {
-                // forces in case we need them again.
+            if (forcesAreValid) {
+                if (savedForces.find(lastForceGroups) != savedForces.end() && validSavedForces.find(lastForceGroups) == validSavedForces.end()) {
-                cu.getForce().copyTo(*savedForces[lastForceGroups]);
+                    // The forces are still valid.  We just need a different force group right now.  Save the old
-                validSavedForces.insert(lastForceGroups);
+                    // forces in case we need them again.
+                    cu.getForce().copyTo(*savedForces[lastForceGroups]);
+                    validSavedForces.insert(lastForceGroups);
+                }
            }
            else
                validSavedForces.clear();
@@ -7582,15 +7784,16 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
            bool computeForce = (needsForces[step] || computeBothForceAndEnergy[step]);
            bool computeEnergy = (needsEnergy[step] || computeBothForceAndEnergy[step]);
-            if (!computeEnergy && validSavedForces.find(forceGroupFlags[step]) != validSavedForces.end()) {
+            if (!computeEnergy && validSavedForces.find(forceGroups) != validSavedForces.end()) {
                // We can just restore the forces we saved earlier.
-                savedForces[forceGroupFlags[step]]->copyTo(cu.getForce());
+                savedForces[forceGroups]->copyTo(cu.getForce());
+                context.getLastForceGroups() = forceGroups;
            }
            else {
                recordChangedParameters(context);
-                energy = context.calcForcesAndEnergy(computeForce, computeEnergy, forceGroupFlags[step]);
+                energy = context.calcForcesAndEnergy(computeForce, computeEnergy, forceGroups);
-                energyFloat = (float) energy;
+                savedEnergy[forceGroups] = energy;
                if (needsEnergyParamDerivs) {
                    context.getEnergyParameterDerivatives(energyParamDerivs);
                    if (perDofEnergyParamDerivNames.size() > 0) {
@@ -7609,6 +7812,10 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
            }
            forcesAreValid = true;
        }
+        if (needsEnergy[step]) {
+            energy = savedEnergy[forceGroups];
+            energyFloat = (float) energy;
+        }
        if (needsGlobals[step] && !deviceGlobalsAreCurrent) {
            // Upload the global values to the device.
@@ -7620,6 +7827,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
                globalValues->upload(globalValuesFloat);
            }
        }
+        bool stepInvalidatesForces = invalidatesForces[step];
        if (stepType[step] == CustomIntegrator::ComputePerDof && !merged[step]) {
            int randomIndex = integration.prepareRandomNumbers(requiredGaussian[step]);
            kernelArgs[step][0][1] = &posCorrection;
@@ -7646,7 +7854,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
                cu.executeKernel(randomKernel, &randomArgs[0], numAtoms);
            cu.clearBuffer(*sumBuffer);
            cu.executeKernel(kernels[step][0], &kernelArgs[step][0][0], numAtoms, 128);
-            cu.executeKernel(kernels[step][1], &kernelArgs[step][1][0], CudaContext::ThreadBlockSize, CudaContext::ThreadBlockSize);
+            cu.executeKernel(kernels[step][1], &kernelArgs[step][1][0], sumWorkGroupSize, sumWorkGroupSize);
            if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
                double value;
                summedValue->download(&value);
@@ -7660,7 +7868,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
        }
        else if (stepType[step] == CustomIntegrator::UpdateContextState) {
            recordChangedParameters(context);
-            context.updateContextState();
+            stepInvalidatesForces = context.updateContextState();
        }
        else if (stepType[step] == CustomIntegrator::ConstrainPositions) {
            if (hasAnyConstraints) {
@@ -7685,8 +7893,10 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
            if (blockEnd[step] != -1)
                nextStep = blockEnd[step]; // Return to the start of a while block.
        }
-        if (invalidatesForces[step])
+        if (stepInvalidatesForces) {
            forcesAreValid = false;
+            savedEnergy.clear();
+        }
        step = nextStep;
    }
    recordChangedParameters(context);
@@ -7746,7 +7956,7 @@ double CudaIntegrateCustomStepKernel::computeKineticEnergy(ContextImpl& context,
    cu.clearBuffer(*sumBuffer);
    cu.executeKernel(kineticEnergyKernel, &kineticEnergyArgs[0], cu.getNumAtoms());
    void* args[] = {&sumBuffer->getDevicePointer(), &summedValue->getDevicePointer()};
-    cu.executeKernel(sumKineticEnergyKernel, args, CudaContext::ThreadBlockSize, CudaContext::ThreadBlockSize);
+    cu.executeKernel(sumKineticEnergyKernel, args, sumWorkGroupSize, sumWorkGroupSize);
    if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
        double ke;
        summedValue->download(&ke);
@@ -7902,6 +8112,8 @@ CudaApplyMonteCarloBarostatKernel::~CudaApplyMonteCarloBarostatKernel() {
    cu.setAsCurrent();
    if (savedPositions != NULL)
        delete savedPositions;
+    if (savedForces != NULL)
+        delete savedForces;
    if (moleculeAtoms != NULL)
        delete moleculeAtoms;
    if (moleculeStartIndex != NULL)
@@ -7911,6 +8123,7 @@ CudaApplyMonteCarloBarostatKernel::~CudaApplyMonteCarloBarostatKernel() {
 void CudaApplyMonteCarloBarostatKernel::initialize(const System& system, const Force& thermostat) {
    cu.setAsCurrent();
    savedPositions = new CudaArray(cu, cu.getPaddedNumAtoms(), cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4), "savedPositions");
+    savedForces = CudaArray::create<long long>(cu, cu.getPaddedNumAtoms()*3, "savedForces");
    CUmodule module = cu.createModule(CudaKernelSources::monteCarloBarostat);
    kernel = cu.getKernel(module, "scalePositions");
 }
@@ -7948,6 +8161,12 @@ void CudaApplyMonteCarloBarostatKernel::scaleCoordinates(ContextImpl& context, d
        m<<"Error saving positions for MC barostat: "<<cu.getErrorString(result)<<" ("<<result<<")";
        throw OpenMMException(m.str());
    }
+    result = cuMemcpyDtoD(savedForces->getDevicePointer(), cu.getForce().getDevicePointer(), savedForces->getSize()*savedForces->getElementSize());
+    if (result != CUDA_SUCCESS) {
+        std::stringstream m;
+        m<<"Error saving forces for MC barostat: "<<cu.getErrorString(result)<<" ("<<result<<")";
+        throw OpenMMException(m.str());
+    }
    float scalefX = (float) scaleX;
    float scalefY = (float) scaleY;
    float scalefZ = (float) scaleZ;
@@ -7969,6 +8188,12 @@ void CudaApplyMonteCarloBarostatKernel::restoreCoordinates(ContextImpl& context)
        m<<"Error restoring positions for MC barostat: "<<cu.getErrorString(result)<<" ("<<result<<")";
        throw OpenMMException(m.str());
    }
+    result = cuMemcpyDtoD(cu.getForce().getDevicePointer(), savedForces->getDevicePointer(), savedForces->getSize()*savedForces->getElementSize());
+    if (result != CUDA_SUCCESS) {
+        std::stringstream m;
+        m<<"Error restoring forces for MC barostat: "<<cu.getErrorString(result)<<" ("<<result<<")";
+        throw OpenMMException(m.str());
+    }
 }
 CudaRemoveCMMotionKernel::~CudaRemoveCMMotionKernel() {

--- a/platforms/cuda/src/CudaPlatform.cpp
+++ b/platforms/cuda/src/CudaPlatform.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2017 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -91,6 +91,7 @@ CudaPlatform::CudaPlatform() {
    registerKernelFactory(CalcCustomHbondForceKernel::Name(), factory);
    registerKernelFactory(CalcCustomCentroidBondForceKernel::Name(), factory);
    registerKernelFactory(CalcCustomCompoundBondForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomCVForceKernel::Name(), factory);
    registerKernelFactory(CalcCustomManyParticleForceKernel::Name(), factory);
    registerKernelFactory(CalcGayBerneForceKernel::Name(), factory);
    registerKernelFactory(IntegrateVerletStepKernel::Name(), factory);
@@ -198,7 +199,23 @@ void CudaPlatform::contextCreated(ContextImpl& context, const map<string, string
    if (threadsEnv != NULL)
        stringstream(threadsEnv) >> threads;
    context.setPlatformData(new PlatformData(&context, context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, cpuPmePropValue, compilerPropValue, tempPropValue,
-            hostCompilerPropValue, pmeStreamPropValue, deterministicForcesValue, threads));
+            hostCompilerPropValue, pmeStreamPropValue, deterministicForcesValue, threads, NULL));
+}
+void CudaPlatform::linkedContextCreated(ContextImpl& context, ContextImpl& originalContext) const {
+    Platform& platform = originalContext.getPlatform();
+    string devicePropValue = platform.getPropertyValue(originalContext.getOwner(), CudaDeviceIndex());
+    string blockingPropValue = platform.getPropertyValue(originalContext.getOwner(), CudaUseBlockingSync());
+    string precisionPropValue = platform.getPropertyValue(originalContext.getOwner(), CudaPrecision());
+    string cpuPmePropValue = platform.getPropertyValue(originalContext.getOwner(), CudaUseCpuPme());
+    string compilerPropValue = platform.getPropertyValue(originalContext.getOwner(), CudaCompiler());
+    string tempPropValue = platform.getPropertyValue(originalContext.getOwner(), CudaTempDirectory());
+    string hostCompilerPropValue = platform.getPropertyValue(originalContext.getOwner(), CudaHostCompiler());
+    string pmeStreamPropValue = platform.getPropertyValue(originalContext.getOwner(), CudaDisablePmeStream());
+    string deterministicForcesValue = platform.getPropertyValue(originalContext.getOwner(), CudaDeterministicForces());
+    int threads = reinterpret_cast<PlatformData*>(originalContext.getPlatformData())->threads.getNumThreads();
+    context.setPlatformData(new PlatformData(&context, context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, cpuPmePropValue, compilerPropValue, tempPropValue,
+            hostCompilerPropValue, pmeStreamPropValue, deterministicForcesValue, threads, &originalContext));
 }
 void CudaPlatform::contextDestroyed(ContextImpl& context) const {
@@ -208,7 +225,7 @@ void CudaPlatform::contextDestroyed(ContextImpl& context) const {
 CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& system, const string& deviceIndexProperty, const string& blockingProperty, const string& precisionProperty,
            const string& cpuPmeProperty, const string& compilerProperty, const string& tempProperty, const string& hostCompilerProperty, const string& pmeStreamProperty,
-            const string& deterministicForcesProperty, int numThreads) :
+            const string& deterministicForcesProperty, int numThreads, ContextImpl* originalContext) :
                context(context), removeCM(false), stepCount(0), computeForceCount(0), time(0.0), hasInitializedContexts(false), threads(numThreads) {
    bool blocking = (blockingProperty == "true");
    vector<string> devices;
@@ -218,16 +235,19 @@ CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& sys
        searchPos = nextPos+1;
    }
    devices.push_back(deviceIndexProperty.substr(searchPos));
+    PlatformData* originalData = NULL;
+    if (originalContext != NULL)
+        originalData = reinterpret_cast<PlatformData*>(originalContext->getPlatformData());
    try {
        for (int i = 0; i < (int) devices.size(); i++) {
            if (devices[i].length() > 0) {
                int deviceIndex;
                stringstream(devices[i]) >> deviceIndex;
-                contexts.push_back(new CudaContext(system, deviceIndex, blocking, precisionProperty, compilerProperty, tempProperty, hostCompilerProperty, *this));
+                contexts.push_back(new CudaContext(system, deviceIndex, blocking, precisionProperty, compilerProperty, tempProperty, hostCompilerProperty, *this, (originalData == NULL ? NULL : originalData->contexts[i])));
            }
        }
        if (contexts.size() == 0)
-            contexts.push_back(new CudaContext(system, -1, blocking, precisionProperty, compilerProperty, tempProperty, hostCompilerProperty, *this));
+            contexts.push_back(new CudaContext(system, -1, blocking, precisionProperty, compilerProperty, tempProperty, hostCompilerProperty, *this, (originalData == NULL ? NULL : originalData->contexts[0])));
    }
    catch (...) {
        // If an exception was thrown, do our best to clean up memory.

--- a/platforms/cuda/src/kernels/customCVForce.cu
+++ b/platforms/cuda/src/kernels/customCVForce.cu
+/**
+ * Copy the positions and velocities to the inner context.
+ */
+extern "C" __global__ void copyState(real4* posq, real4* posqCorrection, mixed4* velm, int* __restrict__ atomOrder,
+        real4* innerPosq, real4* innerPosqCorrection, mixed4* innerVelm, int* __restrict__ innerInvAtomOrder,
+        int numAtoms) {
+    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < numAtoms; i += blockDim.x*gridDim.x) {
+        int index = innerInvAtomOrder[atomOrder[i]];
+        innerPosq[index] = posq[i];
+        innerVelm[index] = velm[i];
+#ifdef USE_MIXED_PRECISION
+        innerPosqCorrection[index] = posqCorrection[i];
+#endif
+    }
+}
+/**
+ * Copy the forces back to the main context.
+ */
+extern "C" __global__ void copyForces(long long* forces, int* __restrict__ invAtomOrder, long long* innerForces,
+        int* __restrict__ innerAtomOrder, int numAtoms, int paddedNumAtoms) {
+    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < numAtoms; i += blockDim.x*gridDim.x) {
+        int index = invAtomOrder[innerAtomOrder[i]];
+        forces[index] = innerForces[i];
+        forces[index+paddedNumAtoms] = innerForces[i+paddedNumAtoms];
+        forces[index+paddedNumAtoms*2] = innerForces[i+paddedNumAtoms*2];
+    }
+}
+/**
+ * Add all the forces from the CVs.
+ */
+extern "C" __global__ void addForces(long long* forces, int bufferSize
+    PARAMETER_ARGUMENTS) {
+    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < bufferSize; i += blockDim.x*gridDim.x) {
+        ADD_FORCES
+    }
+}
--- a/platforms/cuda/src/kernels/customHbondForce.cu
+++ b/platforms/cuda/src/kernels/customHbondForce.cu
@@ -6,26 +6,17 @@ inline __device__ real3 trim(real4 v) {
 }
 /**
- * This does nothing, and just exists to simply the code generation.
+ * This does nothing, and just exists to simplify the code generation.
 */
 inline __device__ real3 trim(real3 v) {
    return v;
 }
 /**
- * Compute the difference between two vectors, setting the fourth component to the squared magnitude.
+ * Compute the difference between two vectors, optionally taking periodic boundary conditions into account
- */
-inline __device__ real4 delta(real4 vec1, real4 vec2) {
-    real4 result = make_real4(vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0.0f);
-    result.w = result.x*result.x + result.y*result.y + result.z*result.z;
-    return result;
-}
-/**
- * Compute the difference between two vectors, taking periodic boundary conditions into account
 * and setting the fourth component to the squared magnitude.
 */
-inline __device__ real4 deltaPeriodic(real4 vec1, real4 vec2, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
+inline __device__ real4 delta(real4 vec1, real4 vec2, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
    real4 result = make_real4(vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0.0f);
 #ifdef USE_PERIODIC
    APPLY_PERIODIC_TO_DELTA(result)
@@ -95,6 +86,7 @@ extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ f
        for (int acceptorStart = 0; acceptorStart < NUM_ACCEPTORS; acceptorStart += blockDim.x) {
            // Load the next block of acceptors into local memory.
+            __syncthreads();
            int blockSize = min((int) blockDim.x, NUM_ACCEPTORS-acceptorStart);
            if (threadIdx.x < blockSize) {
                int4 atoms2 = acceptorAtoms[acceptorStart+threadIdx.x];
@@ -105,8 +97,8 @@ extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ f
            __syncthreads();
            if (donorIndex < NUM_DONORS) {
                for (int index = 0; index < blockSize; index++) {
-#ifdef USE_EXCLUSIONS
                    int acceptorIndex = acceptorStart+index;
+#ifdef USE_EXCLUSIONS
                    if (acceptorIndex == exclusionIndices.x || acceptorIndex == exclusionIndices.y || acceptorIndex == exclusionIndices.z || acceptorIndex == exclusionIndices.w)
                        continue;
 #endif
@@ -115,7 +107,7 @@ extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ f
                    real4 a1 = posBuffer[3*index];
                    real4 a2 = posBuffer[3*index+1];
                    real4 a3 = posBuffer[3*index+2];
-                    real4 deltaD1A1 = deltaPeriodic(d1, a1, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);
+                    real4 deltaD1A1 = delta(d1, a1, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);
 #ifdef USE_CUTOFF
                    if (deltaD1A1.w < CUTOFF_SQUARED) {
 #endif
@@ -183,6 +175,7 @@ extern "C" __global__ void computeAcceptorForces(unsigned long long* __restrict_
        for (int donorStart = 0; donorStart < NUM_DONORS; donorStart += blockDim.x) {
            // Load the next block of donors into local memory.
+            __syncthreads();
            int blockSize = min((int) blockDim.x, NUM_DONORS-donorStart);
            if (threadIdx.x < blockSize) {
                int4 atoms2 = donorAtoms[donorStart+threadIdx.x];
@@ -193,8 +186,8 @@ extern "C" __global__ void computeAcceptorForces(unsigned long long* __restrict_
            __syncthreads();
            if (acceptorIndex < NUM_ACCEPTORS) {
                for (int index = 0; index < blockSize; index++) {
-#ifdef USE_EXCLUSIONS
                    int donorIndex = donorStart+index;
+#ifdef USE_EXCLUSIONS
                    if (donorIndex == exclusionIndices.x || donorIndex == exclusionIndices.y || donorIndex == exclusionIndices.z || donorIndex == exclusionIndices.w)
                        continue;
 #endif
@@ -203,7 +196,7 @@ extern "C" __global__ void computeAcceptorForces(unsigned long long* __restrict_
                    real4 d1 = posBuffer[3*index];
                    real4 d2 = posBuffer[3*index+1];
                    real4 d3 = posBuffer[3*index+2];
-                    real4 deltaD1A1 = deltaPeriodic(d1, a1, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);
+                    real4 deltaD1A1 = delta(d1, a1, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);
 #ifdef USE_CUTOFF
                    if (deltaD1A1.w < CUTOFF_SQUARED) {
 #endif

--- a/platforms/cuda/src/kernels/customManyParticle.cu
+++ b/platforms/cuda/src/kernels/customManyParticle.cu
@@ -60,6 +60,11 @@ inline __device__ real4 computeCross(real4 vec1, real4 vec2) {
 * Determine whether a particular interaction is in the list of exclusions.
 */
 inline __device__ bool isInteractionExcluded(int atom1, int atom2, const int* __restrict__ exclusions, const int* __restrict__ exclusionStartIndex) {
+    if (atom1 > atom2) {
+        int temp = atom1;
+        atom1 = atom2;
+        atom2 = temp;
+    }
    int first = exclusionStartIndex[atom1];
    int last = exclusionStartIndex[atom1+1];
    for (int i = last-1; i >= first; i--) {

--- a/platforms/cuda/src/kernels/integrationUtilities.cu
+++ b/platforms/cuda/src/kernels/integrationUtilities.cu
@@ -680,7 +680,9 @@ extern "C" __global__ void updateCCMAAtomPositions(const int* __restrict__ numAt
 extern "C" __global__ void computeVirtualSites(real4* __restrict__ posq, real4* __restrict__ posqCorrection, const int4* __restrict__ avg2Atoms, const real2* __restrict__ avg2Weights,
        const int4* __restrict__ avg3Atoms, const real4* __restrict__ avg3Weights,
        const int4* __restrict__ outOfPlaneAtoms, const real4* __restrict__ outOfPlaneWeights,
-        const int4* __restrict__ localCoordsAtoms, const real* __restrict__ localCoordsParams) {
+        const int* __restrict__ localCoordsIndex, const int* __restrict__ localCoordsAtoms,
+        const real* __restrict__ localCoordsWeights, const real4* __restrict__ localCoordsPos,
+        const int* __restrict__ localCoordsStartIndex) {
    // Two particle average sites.
@@ -732,30 +734,31 @@ extern "C" __global__ void computeVirtualSites(real4* __restrict__ posq, real4*
    // Local coordinates sites.
    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_LOCAL_COORDS; index += blockDim.x*gridDim.x) {
-        int4 atoms = localCoordsAtoms[index];
+        int siteAtomIndex = localCoordsIndex[index];
-        const real* params = &localCoordsParams[12*index];
+        int start = localCoordsStartIndex[index];
-        mixed4 pos = loadPos(posq, posqCorrection, atoms.x);
+        int end = localCoordsStartIndex[index+1];
-        mixed4 pos1_4 = loadPos(posq, posqCorrection, atoms.y);
+        mixed3 origin = make_mixed3(0), xdir = make_mixed3(0), ydir = make_mixed3(0);
-        mixed4 pos2_4 = loadPos(posq, posqCorrection, atoms.z);
+        for (int j = start; j < end; j++) {
-        mixed4 pos3_4 = loadPos(posq, posqCorrection, atoms.w);
+            mixed3 pos = trimTo3(loadPos(posq, posqCorrection, localCoordsAtoms[j]));
-        mixed3 pos1 = make_mixed3(pos1_4.x, pos1_4.y, pos1_4.z);
+            origin += pos*localCoordsWeights[3*j];
-        mixed3 pos2 = make_mixed3(pos2_4.x, pos2_4.y, pos2_4.z);
+            xdir += pos*localCoordsWeights[3*j+1];
-        mixed3 pos3 = make_mixed3(pos3_4.x, pos3_4.y, pos3_4.z);
+            ydir += pos*localCoordsWeights[3*j+2];
-        mixed3 originWeights = make_mixed3(params[0], params[1], params[2]);
+        }
-        mixed3 xWeights = make_mixed3(params[3], params[4], params[5]);
-        mixed3 yWeights = make_mixed3(params[6], params[7], params[8]);
-        mixed3 localPosition = make_mixed3(params[9], params[10], params[11]);
-        mixed3 origin = pos1*originWeights.x + pos2*originWeights.y + pos3*originWeights.z;
-        mixed3 xdir = pos1*xWeights.x + pos2*xWeights.y + pos3*xWeights.z;
-        mixed3 ydir = pos1*yWeights.x + pos2*yWeights.y + pos3*yWeights.z;
        mixed3 zdir = cross(xdir, ydir);
-        xdir *= rsqrt(xdir.x*xdir.x+xdir.y*xdir.y+xdir.z*xdir.z);
+        mixed normXdir = sqrt(xdir.x*xdir.x+xdir.y*xdir.y+xdir.z*xdir.z);
-        zdir *= rsqrt(zdir.x*zdir.x+zdir.y*zdir.y+zdir.z*zdir.z);
+        mixed normZdir = sqrt(zdir.x*zdir.x+zdir.y*zdir.y+zdir.z*zdir.z);
+        mixed invNormXdir = (normXdir > 0 ? 1/normXdir : 0);
+        mixed invNormZdir = (normZdir > 0 ? 1/normZdir : 0);
+        xdir *= invNormXdir;
+        zdir *= invNormZdir;
        ydir = cross(zdir, xdir);
+        real4 localPosition_4 = localCoordsPos[index];
+        mixed3 localPosition = make_mixed3(localPosition_4.x, localPosition_4.y, localPosition_4.z);
+        mixed4 pos = loadPos(posq, posqCorrection, siteAtomIndex);
        pos.x = origin.x + xdir.x*localPosition.x + ydir.x*localPosition.y + zdir.x*localPosition.z;
        pos.y = origin.y + xdir.y*localPosition.x + ydir.y*localPosition.y + zdir.y*localPosition.z;
        pos.z = origin.z + xdir.z*localPosition.x + ydir.z*localPosition.y + zdir.z*localPosition.z;
-        storePos(posq, posqCorrection, atoms.x, pos);
+        storePos(posq, posqCorrection, siteAtomIndex, pos);
    }
 }
@@ -778,7 +781,9 @@ extern "C" __global__ void distributeVirtualSiteForces(const real4* __restrict__
        const int4* __restrict__ avg2Atoms, const real2* __restrict__ avg2Weights,
        const int4* __restrict__ avg3Atoms, const real4* __restrict__ avg3Weights,
        const int4* __restrict__ outOfPlaneAtoms, const real4* __restrict__ outOfPlaneWeights,
-        const int4* __restrict__ localCoordsAtoms, const real* __restrict__ localCoordsParams) {
+        const int* __restrict__ localCoordsIndex, const int* __restrict__ localCoordsAtoms,
+        const real* __restrict__ localCoordsWeights, const real4* __restrict__ localCoordsPos,
+        const int* __restrict__ localCoordsStartIndex) {
    // Two particle average sites.
@@ -826,87 +831,56 @@ extern "C" __global__ void distributeVirtualSiteForces(const real4* __restrict__
    // Local coordinates sites.
    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_LOCAL_COORDS; index += blockDim.x*gridDim.x) {
-        int4 atoms = localCoordsAtoms[index];
+        int siteAtomIndex = localCoordsIndex[index];
-        const real* params = &localCoordsParams[12*index];
+        int start = localCoordsStartIndex[index];
-        mixed4 pos = loadPos(posq, posqCorrection, atoms.x);
+        int end = localCoordsStartIndex[index+1];
-        mixed4 pos1_4 = loadPos(posq, posqCorrection, atoms.y);
+        mixed3 origin = make_mixed3(0), xdir = make_mixed3(0), ydir = make_mixed3(0);
-        mixed4 pos2_4 = loadPos(posq, posqCorrection, atoms.z);
+        for (int j = start; j < end; j++) {
-        mixed4 pos3_4 = loadPos(posq, posqCorrection, atoms.w);
+            mixed3 pos = trimTo3(loadPos(posq, posqCorrection, localCoordsAtoms[j]));
-        mixed3 pos1 = make_mixed3(pos1_4.x, pos1_4.y, pos1_4.z);
+            origin += pos*localCoordsWeights[3*j];
-        mixed3 pos2 = make_mixed3(pos2_4.x, pos2_4.y, pos2_4.z);
+            xdir += pos*localCoordsWeights[3*j+1];
-        mixed3 pos3 = make_mixed3(pos3_4.x, pos3_4.y, pos3_4.z);
+            ydir += pos*localCoordsWeights[3*j+2];
-        mixed3 originWeights = make_mixed3(params[0], params[1], params[2]);
+        }
-        mixed3 wx = make_mixed3(params[3], params[4], params[5]);
-        mixed3 wy = make_mixed3(params[6], params[7], params[8]);
-        mixed3 localPosition = make_mixed3(params[9], params[10], params[11]);
-        mixed3 origin = pos1*originWeights.x + pos2*originWeights.y + pos3*originWeights.z;
-        mixed3 xdir = pos1*wx.x + pos2*wx.y + pos3*wx.z;
-        mixed3 ydir = pos1*wy.x + pos2*wy.y + pos3*wy.z;
        mixed3 zdir = cross(xdir, ydir);
-        mixed invNormXdir = rsqrt(xdir.x*xdir.x+xdir.y*xdir.y+xdir.z*xdir.z);
+        mixed normXdir = sqrt(xdir.x*xdir.x+xdir.y*xdir.y+xdir.z*xdir.z);
-        mixed invNormZdir = rsqrt(zdir.x*zdir.x+zdir.y*zdir.y+zdir.z*zdir.z);
+        mixed normZdir = sqrt(zdir.x*zdir.x+zdir.y*zdir.y+zdir.z*zdir.z);
+        mixed invNormXdir = (normXdir > 0 ? 1/normXdir : 0);
+        mixed invNormZdir = (normZdir > 0 ? 1/normZdir : 0);
        mixed3 dx = xdir*invNormXdir;
        mixed3 dz = zdir*invNormZdir;
        mixed3 dy = cross(dz, dx);
+        real4 localPosition_4 = localCoordsPos[index];
+        mixed3 localPosition = make_mixed3(localPosition_4.x, localPosition_4.y, localPosition_4.z);
        // The derivatives for this case are very complicated.  They were computed with SymPy then simplified by hand.
-        mixed t11 = (wx.x*ydir.x-wy.x*xdir.x)*invNormZdir;
+        real3 f = loadForce(siteAtomIndex, force);
-        mixed t12 = (wx.x*ydir.y-wy.x*xdir.y)*invNormZdir;
-        mixed t13 = (wx.x*ydir.z-wy.x*xdir.z)*invNormZdir;
-        mixed t21 = (wx.y*ydir.x-wy.y*xdir.x)*invNormZdir;
-        mixed t22 = (wx.y*ydir.y-wy.y*xdir.y)*invNormZdir;
-        mixed t23 = (wx.y*ydir.z-wy.y*xdir.z)*invNormZdir;
-        mixed t31 = (wx.z*ydir.x-wy.z*xdir.x)*invNormZdir;
-        mixed t32 = (wx.z*ydir.y-wy.z*xdir.y)*invNormZdir;
-        mixed t33 = (wx.z*ydir.z-wy.z*xdir.z)*invNormZdir;
-        mixed sx1 = t13*dz.y-t12*dz.z;
-        mixed sy1 = t11*dz.z-t13*dz.x;
-        mixed sz1 = t12*dz.x-t11*dz.y;
-        mixed sx2 = t23*dz.y-t22*dz.z;
-        mixed sy2 = t21*dz.z-t23*dz.x;
-        mixed sz2 = t22*dz.x-t21*dz.y;
-        mixed sx3 = t33*dz.y-t32*dz.z;
-        mixed sy3 = t31*dz.z-t33*dz.x;
-        mixed sz3 = t32*dz.x-t31*dz.y;
-        mixed3 wxScaled = wx*invNormXdir;
-        real3 f = loadForce(atoms.x, force);
        mixed3 fp1 = localPosition*f.x;
        mixed3 fp2 = localPosition*f.y;
        mixed3 fp3 = localPosition*f.z;
-        real3 f1 = make_real3(0);
+        for (int j = start; j < end; j++) {
-        real3 f2 = make_real3(0);
+            real originWeight = localCoordsWeights[3*j];
-        real3 f3 = make_real3(0);
+            real wx = localCoordsWeights[3*j+1];
-        f1.x += fp1.x*wxScaled.x*(1-dx.x*dx.x) + fp1.z*(dz.x*sx1    ) + fp1.y*((-dx.x*dy.x     )*wxScaled.x + dy.x*sx1 - dx.y*t12 - dx.z*t13) + f.x*originWeights.x;
+            real wy = localCoordsWeights[3*j+2];
-        f1.y += fp1.x*wxScaled.x*( -dx.x*dx.y) + fp1.z*(dz.x*sy1+t13) + fp1.y*((-dx.y*dy.x-dz.z)*wxScaled.x + dy.x*sy1 + dx.y*t11);
+            mixed wxScaled = wx*invNormXdir;
-        f1.z += fp1.x*wxScaled.x*( -dx.x*dx.z) + fp1.z*(dz.x*sz1-t12) + fp1.y*((-dx.z*dy.x+dz.y)*wxScaled.x + dy.x*sz1 + dx.z*t11);
+            mixed t1 = (wx*ydir.x-wy*xdir.x)*invNormZdir;
-        f2.x += fp1.x*wxScaled.y*(1-dx.x*dx.x) + fp1.z*(dz.x*sx2    ) + fp1.y*((-dx.x*dy.x     )*wxScaled.y + dy.x*sx2 - dx.y*t22 - dx.z*t23) + f.x*originWeights.y;
+            mixed t2 = (wx*ydir.y-wy*xdir.y)*invNormZdir;
-        f2.y += fp1.x*wxScaled.y*( -dx.x*dx.y) + fp1.z*(dz.x*sy2+t23) + fp1.y*((-dx.y*dy.x-dz.z)*wxScaled.y + dy.x*sy2 + dx.y*t21);
+            mixed t3 = (wx*ydir.z-wy*xdir.z)*invNormZdir;
-        f2.z += fp1.x*wxScaled.y*( -dx.x*dx.z) + fp1.z*(dz.x*sz2-t22) + fp1.y*((-dx.z*dy.x+dz.y)*wxScaled.y + dy.x*sz2 + dx.z*t21);
+            mixed sx = t3*dz.y-t2*dz.z;
-        f3.x += fp1.x*wxScaled.z*(1-dx.x*dx.x) + fp1.z*(dz.x*sx3    ) + fp1.y*((-dx.x*dy.x     )*wxScaled.z + dy.x*sx3 - dx.y*t32 - dx.z*t33) + f.x*originWeights.z;
+            mixed sy = t1*dz.z-t3*dz.x;
-        f3.y += fp1.x*wxScaled.z*( -dx.x*dx.y) + fp1.z*(dz.x*sy3+t33) + fp1.y*((-dx.y*dy.x-dz.z)*wxScaled.z + dy.x*sy3 + dx.y*t31);
+            mixed sz = t2*dz.x-t1*dz.y;
-        f3.z += fp1.x*wxScaled.z*( -dx.x*dx.z) + fp1.z*(dz.x*sz3-t32) + fp1.y*((-dx.z*dy.x+dz.y)*wxScaled.z + dy.x*sz3 + dx.z*t31);
+            real3 fresult = make_real3(0);
-        f1.x += fp2.x*wxScaled.x*( -dx.y*dx.x) + fp2.z*(dz.y*sx1-t13) - fp2.y*(( dx.x*dy.y-dz.z)*wxScaled.x - dy.y*sx1 - dx.x*t12);
+            fresult.x += fp1.x*wxScaled*(1-dx.x*dx.x) + fp1.z*(dz.x*sx   ) + fp1.y*((-dx.x*dy.x     )*wxScaled + dy.x*sx - dx.y*t2 - dx.z*t3) + f.x*originWeight;
-        f1.y += fp2.x*wxScaled.x*(1-dx.y*dx.y) + fp2.z*(dz.y*sy1    ) - fp2.y*(( dx.y*dy.y     )*wxScaled.x - dy.y*sy1 + dx.x*t11 + dx.z*t13) + f.y*originWeights.x;
+            fresult.y += fp1.x*wxScaled*( -dx.x*dx.y) + fp1.z*(dz.x*sy+t3) + fp1.y*((-dx.y*dy.x-dz.z)*wxScaled + dy.x*sy + dx.y*t1);
-        f1.z += fp2.x*wxScaled.x*( -dx.y*dx.z) + fp2.z*(dz.y*sz1+t11) - fp2.y*(( dx.z*dy.y+dz.x)*wxScaled.x - dy.y*sz1 - dx.z*t12);
+            fresult.z += fp1.x*wxScaled*( -dx.x*dx.z) + fp1.z*(dz.x*sz-t2) + fp1.y*((-dx.z*dy.x+dz.y)*wxScaled + dy.x*sz + dx.z*t1);
-        f2.x += fp2.x*wxScaled.y*( -dx.y*dx.x) + fp2.z*(dz.y*sx2-t23) - fp2.y*(( dx.x*dy.y-dz.z)*wxScaled.y - dy.y*sx2 - dx.x*t22);
+            fresult.x += fp2.x*wxScaled*( -dx.y*dx.x) + fp2.z*(dz.y*sx-t3) - fp2.y*(( dx.x*dy.y-dz.z)*wxScaled - dy.y*sx - dx.x*t2);
-        f2.y += fp2.x*wxScaled.y*(1-dx.y*dx.y) + fp2.z*(dz.y*sy2    ) - fp2.y*(( dx.y*dy.y     )*wxScaled.y - dy.y*sy2 + dx.x*t21 + dx.z*t23) + f.y*originWeights.y;
+            fresult.y += fp2.x*wxScaled*(1-dx.y*dx.y) + fp2.z*(dz.y*sy   ) - fp2.y*(( dx.y*dy.y     )*wxScaled - dy.y*sy + dx.x*t1 + dx.z*t3) + f.y*originWeight;
-        f2.z += fp2.x*wxScaled.y*( -dx.y*dx.z) + fp2.z*(dz.y*sz2+t21) - fp2.y*(( dx.z*dy.y+dz.x)*wxScaled.y - dy.y*sz2 - dx.z*t22);
+            fresult.z += fp2.x*wxScaled*( -dx.y*dx.z) + fp2.z*(dz.y*sz+t1) - fp2.y*(( dx.z*dy.y+dz.x)*wxScaled - dy.y*sz - dx.z*t2);
-        f3.x += fp2.x*wxScaled.z*( -dx.y*dx.x) + fp2.z*(dz.y*sx3-t33) - fp2.y*(( dx.x*dy.y-dz.z)*wxScaled.z - dy.y*sx3 - dx.x*t32);
+            fresult.x += fp3.x*wxScaled*( -dx.z*dx.x) + fp3.z*(dz.z*sx+t2) + fp3.y*((-dx.x*dy.z-dz.y)*wxScaled + dy.z*sx + dx.x*t3);
-        f3.y += fp2.x*wxScaled.z*(1-dx.y*dx.y) + fp2.z*(dz.y*sy3    ) - fp2.y*(( dx.y*dy.y     )*wxScaled.z - dy.y*sy3 + dx.x*t31 + dx.z*t33) + f.y*originWeights.z;
+            fresult.y += fp3.x*wxScaled*( -dx.z*dx.y) + fp3.z*(dz.z*sy-t1) + fp3.y*((-dx.y*dy.z+dz.x)*wxScaled + dy.z*sy + dx.y*t3);
-        f3.z += fp2.x*wxScaled.z*( -dx.y*dx.z) + fp2.z*(dz.y*sz3+t31) - fp2.y*(( dx.z*dy.y+dz.x)*wxScaled.z - dy.y*sz3 - dx.z*t32);
+            fresult.z += fp3.x*wxScaled*(1-dx.z*dx.z) + fp3.z*(dz.z*sz   ) + fp3.y*((-dx.z*dy.z     )*wxScaled + dy.z*sz - dx.x*t1 - dx.y*t2) + f.z*originWeight;
-        f1.x += fp3.x*wxScaled.x*( -dx.z*dx.x) + fp3.z*(dz.z*sx1+t12) + fp3.y*((-dx.x*dy.z-dz.y)*wxScaled.x + dy.z*sx1 + dx.x*t13);
+            addForce(localCoordsAtoms[j], force, fresult);
-        f1.y += fp3.x*wxScaled.x*( -dx.z*dx.y) + fp3.z*(dz.z*sy1-t11) + fp3.y*((-dx.y*dy.z+dz.x)*wxScaled.x + dy.z*sy1 + dx.y*t13);
+        }
-        f1.z += fp3.x*wxScaled.x*(1-dx.z*dx.z) + fp3.z*(dz.z*sz1    ) + fp3.y*((-dx.z*dy.z     )*wxScaled.x + dy.z*sz1 - dx.x*t11 - dx.y*t12) + f.z*originWeights.x;
-        f2.x += fp3.x*wxScaled.y*( -dx.z*dx.x) + fp3.z*(dz.z*sx2+t22) + fp3.y*((-dx.x*dy.z-dz.y)*wxScaled.y + dy.z*sx2 + dx.x*t23);
-        f2.y += fp3.x*wxScaled.y*( -dx.z*dx.y) + fp3.z*(dz.z*sy2-t21) + fp3.y*((-dx.y*dy.z+dz.x)*wxScaled.y + dy.z*sy2 + dx.y*t23);
-        f2.z += fp3.x*wxScaled.y*(1-dx.z*dx.z) + fp3.z*(dz.z*sz2    ) + fp3.y*((-dx.z*dy.z     )*wxScaled.y + dy.z*sz2 - dx.x*t21 - dx.y*t22) + f.z*originWeights.y;
-        f3.x += fp3.x*wxScaled.z*( -dx.z*dx.x) + fp3.z*(dz.z*sx3+t32) + fp3.y*((-dx.x*dy.z-dz.y)*wxScaled.z + dy.z*sx3 + dx.x*t33);
-        f3.y += fp3.x*wxScaled.z*( -dx.z*dx.y) + fp3.z*(dz.z*sy3-t31) + fp3.y*((-dx.y*dy.z+dz.x)*wxScaled.z + dy.z*sy3 + dx.y*t33);
-        f3.z += fp3.x*wxScaled.z*(1-dx.z*dx.z) + fp3.z*(dz.z*sz3    ) + fp3.y*((-dx.z*dy.z     )*wxScaled.z + dy.z*sz3 - dx.x*t31 - dx.y*t32) + f.z*originWeights.z;
-        addForce(atoms.y, force, f1);
-        addForce(atoms.z, force, f2);
-        addForce(atoms.w, force, f3);
    }
 }
@@ -924,4 +898,4 @@ extern "C" __global__ void timeShiftVelocities(mixed4* __restrict__ velm, const
            velm[index] = velocity;
        }
    }
 }
\ No newline at end of file
--- a/platforms/cuda/src/kernels/utilities.cu
+++ b/platforms/cuda/src/kernels/utilities.cu
@@ -73,6 +73,25 @@ __global__ void clearSixBuffers(int* __restrict__ buffer1, int size1, int* __res
    clearSingleBuffer(buffer6, size6);
 }
+/**
+ * Sum the energy buffer.
+ */
+__global__ void reduceEnergy(const mixed* __restrict__ energyBuffer, mixed* __restrict__ result, int bufferSize, int workGroupSize) {
+    extern __shared__ mixed tempBuffer[];
+    const unsigned int thread = threadIdx.x;
+    mixed sum = 0;
+    for (unsigned int index = thread; index < bufferSize; index += blockDim.x)
+        sum += energyBuffer[index];
+    tempBuffer[thread] = sum;
+    for (int i = 1; i < workGroupSize; i *= 2) {
+        __syncthreads();
+        if (thread%(i*2) == 0 && thread+i < workGroupSize)
+            tempBuffer[thread] += tempBuffer[thread+i];
+    }
+    if (thread == 0)
+        *result = tempBuffer[0];
+}
 /**
 * Record the atomic charges into the posq array.
 */

--- a/platforms/cuda/tests/TestCudaCustomCVForce.cpp
+++ b/platforms/cuda/tests/TestCudaCustomCVForce.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2017 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+#include "CudaTests.h"
+#include "TestCustomCVForce.h"
+void runPlatformTests() {
+}
--- a/platforms/cuda/tests/TestCudaFFT3D.cpp
+++ b/platforms/cuda/tests/TestCudaFFT3D.cpp
@@ -56,7 +56,7 @@ void testTransform(bool realToComplex, int xsize, int ysize, int zsize) {
    system.addParticle(0.0);
    CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), "false",
            platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()),
-            platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), "false", 1);
+            platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), "false", 1, NULL);
    CudaContext& context = *platformData.contexts[0];
    context.initialize();
    OpenMM_SFMT::SFMT sfmt;

--- a/platforms/cuda/tests/TestCudaRandom.cpp
+++ b/platforms/cuda/tests/TestCudaRandom.cpp
@@ -56,7 +56,7 @@ void testGaussian() {
        system.addParticle(1.0);
    CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), "false",
            platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()),
-            platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), "false", 1);
+            platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), "false", 1, NULL);
    CudaContext& context = *platformData.contexts[0];
    context.initialize();
    context.getIntegrationUtilities().initRandomNumberGenerator(0);

--- a/platforms/cuda/tests/TestCudaSort.cpp
+++ b/platforms/cuda/tests/TestCudaSort.cpp
@@ -66,7 +66,7 @@ void verifySorting(vector<float> array) {
    system.addParticle(0.0);
    CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), "false",
            platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()),
-            platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), "false", 1);
+            platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), "false", 1, NULL);
    CudaContext& context = *platformData.contexts[0];
    context.initialize();
    CudaArray data(context, array.size(), 4, "sortData");

--- a/platforms/opencl/include/OpenCLContext.h
+++ b/platforms/opencl/include/OpenCLContext.h
@@ -163,7 +163,8 @@ public:
    class ForcePostComputation;
    static const int ThreadBlockSize;
    static const int TileSize;
-    OpenCLContext(const System& system, int platformIndex, int deviceIndex, const std::string& precision, OpenCLPlatform::PlatformData& platformData);
+    OpenCLContext(const System& system, int platformIndex, int deviceIndex, const std::string& precision, OpenCLPlatform::PlatformData& platformData,
+        OpenCLContext* originalContext);
    ~OpenCLContext();
    /**
     * This is called to initialize internal data structures after all Forces in the system
@@ -363,9 +364,13 @@ public:
     */
    void reduceBuffer(OpenCLArray& array, int numBuffers);
    /**
-     * Sum the buffesr containing forces.
+     * Sum the buffers containing forces.
     */
    void reduceForces();
+    /**
+     * Sum the buffer containing energy.
+     */
+    double reduceEnergy();
    /**
     * Get the current simulation time.
     */
@@ -749,6 +754,7 @@ private:
    cl::Kernel clearSixBuffersKernel;
    cl::Kernel reduceReal4Kernel;
    cl::Kernel reduceForcesKernel;
+    cl::Kernel reduceEnergyKernel;
    cl::Kernel setChargesKernel;
    std::vector<OpenCLForceInfo*> forces;
    std::vector<Molecule> molecules;
@@ -763,6 +769,7 @@ private:
    OpenCLArray* forceBuffers;
    OpenCLArray* longForceBuffer;
    OpenCLArray* energyBuffer;
+    OpenCLArray* energySum;
    OpenCLArray* energyParamDerivBuffer;
    OpenCLArray* atomIndexDevice;
    OpenCLArray* chargeBuffer;

--- a/platforms/opencl/include/OpenCLIntegrationUtilities.h
+++ b/platforms/opencl/include/OpenCLIntegrationUtilities.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2014 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2017 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -156,8 +156,11 @@ private:
    OpenCLArray* vsite3AvgWeights;
    OpenCLArray* vsiteOutOfPlaneAtoms;
    OpenCLArray* vsiteOutOfPlaneWeights;
+    OpenCLArray* vsiteLocalCoordsIndex;
    OpenCLArray* vsiteLocalCoordsAtoms;
-    OpenCLArray* vsiteLocalCoordsParams;
+    OpenCLArray* vsiteLocalCoordsWeights;
+    OpenCLArray* vsiteLocalCoordsPos;
+    OpenCLArray* vsiteLocalCoordsStartIndex;
    int randomPos;
    int lastSeed, numVsites;
    bool hasInitializedPosConstraintKernels, hasInitializedVelConstraintKernels, ccmaUseDirectBuffer, hasOverlappingVsites;

--- a/platforms/opencl/include/OpenCLKernels.h
+++ b/platforms/opencl/include/OpenCLKernels.h
@@ -37,6 +37,7 @@
 #include "openmm/internal/CompiledExpressionSet.h"
 #include "openmm/internal/CustomIntegratorUtilities.h"
 #include "lepton/CompiledExpression.h"
+#include "lepton/ExpressionProgram.h"
 #include "openmm/System.h"
 namespace OpenMM {
@@ -1207,6 +1208,54 @@ private:
    cl::Kernel framesKernel, blockBoundsKernel, neighborsKernel, forceKernel, torqueKernel;
 };
+/**
+ * This kernel is invoked by CustomCVForce to calculate the forces acting on the system and the energy of the system.
+ */
+class OpenCLCalcCustomCVForceKernel : public CalcCustomCVForceKernel {
+public:
+    OpenCLCalcCustomCVForceKernel(std::string name, const Platform& platform, OpenCLContext& cl) : CalcCustomCVForceKernel(name, platform),
+            cl(cl), hasInitializedKernels(false), invAtomOrder(NULL), innerInvAtomOrder(NULL) {
+    }
+    ~OpenCLCalcCustomCVForceKernel();
+    /**
+     * Initialize the kernel.
+     *
+     * @param system     the System this kernel will be applied to
+     * @param force      the CustomCVForce this kernel will be used for
+     * @param innerContext   the context created by the CustomCVForce for computing collective variables
+     */
+    void initialize(const System& system, const CustomCVForce& force, ContextImpl& innerContext);
+    /**
+     * Execute the kernel to calculate the forces and/or energy.
+     *
+     * @param context        the context in which to execute this kernel
+     * @param innerContext   the context created by the CustomCVForce for computing collective variables
+     * @param includeForces  true if forces should be calculated
+     * @param includeEnergy  true if the energy should be calculated
+     * @return the potential energy due to the force
+     */
+    double execute(ContextImpl& context, ContextImpl& innerContext, bool includeForces, bool includeEnergy);
+    /**
+     * Copy state information to the inner context.
+     *
+     * @param context        the context in which to execute this kernel
+     * @param innerContext   the context created by the CustomCVForce for computing collective variables
+     */
+    void copyState(ContextImpl& context, ContextImpl& innerContext);
+private:
+    class ReorderListener;
+    OpenCLContext& cl;
+    bool hasInitializedKernels;
+    Lepton::ExpressionProgram energyExpression;
+    std::vector<std::string> variableNames, paramDerivNames, globalParameterNames;
+    std::vector<Lepton::ExpressionProgram> variableDerivExpressions;
+    std::vector<Lepton::ExpressionProgram> paramDerivExpressions;
+    std::vector<OpenCLArray*> cvForces;
+    OpenCLArray* invAtomOrder;
+    OpenCLArray* innerInvAtomOrder;
+    cl::Kernel copyStateKernel, copyForcesKernel, addForcesKernel;
+};
 /**
 * This kernel is invoked by VerletIntegrator to take one time step.
 */
@@ -1472,7 +1521,9 @@ private:
    class ReorderListener;
    class GlobalTarget;
    class DerivFunction;
-    std::string createPerDofComputation(const std::string& variable, const Lepton::ParsedExpression& expr, int component, CustomIntegrator& integrator, const std::string& forceName, const std::string& energyName);
+    std::string createPerDofComputation(const std::string& variable, const Lepton::ParsedExpression& expr, int component, CustomIntegrator& integrator,
+        const std::string& forceName, const std::string& energyName, std::vector<const TabulatedFunction*>& functions,
+        std::vector<std::pair<std::string, std::string> >& functionNames);
    void prepareForComputation(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid);
    Lepton::ExpressionTreeNode replaceDerivFunctions(const Lepton::ExpressionTreeNode& node, OpenMM::ContextImpl& context);
    void findExpressionsForDerivs(const Lepton::ExpressionTreeNode& node, std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& variableNodes);
@@ -1482,7 +1533,7 @@ private:
    OpenCLContext& cl;
    double energy;
    float energyFloat;
-    int numGlobalVariables;
+    int numGlobalVariables, sumWorkGroupSize;
    bool hasInitializedKernels, deviceValuesAreCurrent, deviceGlobalsAreCurrent, modifiesParameters, keNeedsForce, hasAnyConstraints, needsEnergyParamDerivs;
    mutable bool localValuesAreCurrent;
    OpenCLArray* globalValues;
@@ -1491,6 +1542,8 @@ private:
    OpenCLArray* uniformRandoms;
    OpenCLArray* randomSeed;
    OpenCLArray* perDofEnergyParamDerivs;
+    std::vector<OpenCLArray*> tabulatedFunctions;
+    std::map<int, double> savedEnergy;
    std::map<int, OpenCLArray*> savedForces;
    std::set<int> validSavedForces;
    OpenCLParameterSet* perDofValues;
@@ -1573,7 +1626,7 @@ private:
 class OpenCLApplyMonteCarloBarostatKernel : public ApplyMonteCarloBarostatKernel {
 public:
    OpenCLApplyMonteCarloBarostatKernel(std::string name, const Platform& platform, OpenCLContext& cl) : ApplyMonteCarloBarostatKernel(name, platform), cl(cl),
-            hasInitializedKernels(false), savedPositions(NULL), moleculeAtoms(NULL), moleculeStartIndex(NULL) {
+            hasInitializedKernels(false), savedPositions(NULL), savedForces(NULL), moleculeAtoms(NULL), moleculeStartIndex(NULL) {
    }
    ~OpenCLApplyMonteCarloBarostatKernel();
    /**
@@ -1608,6 +1661,7 @@ private:
    bool hasInitializedKernels;
    int numMolecules;
    OpenCLArray* savedPositions;
+    OpenCLArray* savedForces;
    OpenCLArray* moleculeAtoms;
    OpenCLArray* moleculeStartIndex;
    cl::Kernel kernel;