Merge changes from main branch

18295108 · peastman · e6101f68 · 8d7234e5 · 18295108 · 18295108
Commit 18295108 authored Sep 05, 2017 by peastman
20 changed files
--- a/platforms/cuda/include/CudaIntegrationUtilities.h
+++ b/platforms/cuda/include/CudaIntegrationUtilities.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2014 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2017 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -158,8 +158,11 @@ private:
    CudaArray* vsite3AvgWeights;
    CudaArray* vsiteOutOfPlaneAtoms;
    CudaArray* vsiteOutOfPlaneWeights;
+    CudaArray* vsiteLocalCoordsIndex;
    CudaArray* vsiteLocalCoordsAtoms;
-    CudaArray* vsiteLocalCoordsParams;
+    CudaArray* vsiteLocalCoordsWeights;
+    CudaArray* vsiteLocalCoordsPos;
+    CudaArray* vsiteLocalCoordsStartIndex;
    int randomPos;
    int lastSeed, numVsites;
    double2 lastStepSize;

--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -38,6 +38,7 @@
 #include "openmm/internal/CompiledExpressionSet.h"
 #include "openmm/internal/CustomIntegratorUtilities.h"
 #include "lepton/CompiledExpression.h"
+#include "lepton/ExpressionProgram.h"
 #include <cufft.h>
 namespace OpenMM {
@@ -1229,6 +1230,54 @@ private:
    CUevent event;
 };
+/**
+ * This kernel is invoked by CustomCVForce to calculate the forces acting on the system and the energy of the system.
+ */
+class CudaCalcCustomCVForceKernel : public CalcCustomCVForceKernel {
+public:
+    CudaCalcCustomCVForceKernel(std::string name, const Platform& platform, CudaContext& cu) : CalcCustomCVForceKernel(name, platform),
+            cu(cu), hasInitializedListeners(false), invAtomOrder(NULL), innerInvAtomOrder(NULL) {
+    }
+    ~CudaCalcCustomCVForceKernel();
+    /**
+     * Initialize the kernel.
+     *
+     * @param system     the System this kernel will be applied to
+     * @param force      the CustomCVForce this kernel will be used for
+     * @param innerContext   the context created by the CustomCVForce for computing collective variables
+     */
+    void initialize(const System& system, const CustomCVForce& force, ContextImpl& innerContext);
+    /**
+     * Execute the kernel to calculate the forces and/or energy.
+     *
+     * @param context        the context in which to execute this kernel
+     * @param innerContext   the context created by the CustomCVForce for computing collective variables
+     * @param includeForces  true if forces should be calculated
+     * @param includeEnergy  true if the energy should be calculated
+     * @return the potential energy due to the force
+     */
+    double execute(ContextImpl& context, ContextImpl& innerContext, bool includeForces, bool includeEnergy);
+    /**
+     * Copy state information to the inner context.
+     *
+     * @param context        the context in which to execute this kernel
+     * @param innerContext   the context created by the CustomCVForce for computing collective variables
+     */
+    void copyState(ContextImpl& context, ContextImpl& innerContext);
+private:
+    class ReorderListener;
+    CudaContext& cu;
+    bool hasInitializedListeners;
+    Lepton::ExpressionProgram energyExpression;
+    std::vector<std::string> variableNames, paramDerivNames, globalParameterNames;
+    std::vector<Lepton::ExpressionProgram> variableDerivExpressions;
+    std::vector<Lepton::ExpressionProgram> paramDerivExpressions;
+    std::vector<CudaArray*> cvForces;
+    CudaArray* invAtomOrder;
+    CudaArray* innerInvAtomOrder;
+    CUfunction copyStateKernel, copyForcesKernel, addForcesKernel;
+};
 /**
 * This kernel is invoked by VerletIntegrator to take one time step.
 */
@@ -1485,7 +1534,9 @@ private:
    class ReorderListener;
    class GlobalTarget;
    class DerivFunction;
-    std::string createPerDofComputation(const std::string& variable, const Lepton::ParsedExpression& expr, int component, CustomIntegrator& integrator, const std::string& forceName, const std::string& energyName);
+    std::string createPerDofComputation(const std::string& variable, const Lepton::ParsedExpression& expr, int component, CustomIntegrator& integrator,
+        const std::string& forceName, const std::string& energyName, std::vector<const TabulatedFunction*>& functions,
+        std::vector<std::pair<std::string, std::string> >& functionNames);
    void prepareForComputation(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid);
    Lepton::ExpressionTreeNode replaceDerivFunctions(const Lepton::ExpressionTreeNode& node, OpenMM::ContextImpl& context);
    void findExpressionsForDerivs(const Lepton::ExpressionTreeNode& node, std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& variableNodes);
@@ -1495,7 +1546,7 @@ private:
    CudaContext& cu;
    double energy;
    float energyFloat;
-    int numGlobalVariables;
+    int numGlobalVariables, sumWorkGroupSize;
    bool hasInitializedKernels, deviceValuesAreCurrent, deviceGlobalsAreCurrent, modifiesParameters, keNeedsForce, hasAnyConstraints, needsEnergyParamDerivs;
    mutable bool localValuesAreCurrent;
    CudaArray* globalValues;
@@ -1504,6 +1555,8 @@ private:
    CudaArray* uniformRandoms;
    CudaArray* randomSeed;
    CudaArray* perDofEnergyParamDerivs;
+    std::vector<CudaArray*> tabulatedFunctions;
+    std::map<int, double> savedEnergy;
    std::map<int, CudaArray*> savedForces;
    std::set<int> validSavedForces;
    CudaParameterSet* perDofValues;
@@ -1587,7 +1640,7 @@ private:
 class CudaApplyMonteCarloBarostatKernel : public ApplyMonteCarloBarostatKernel {
 public:
    CudaApplyMonteCarloBarostatKernel(std::string name, const Platform& platform, CudaContext& cu) : ApplyMonteCarloBarostatKernel(name, platform), cu(cu),
-            hasInitializedKernels(false), savedPositions(NULL), moleculeAtoms(NULL), moleculeStartIndex(NULL) {
+            hasInitializedKernels(false), savedPositions(NULL), savedForces(NULL), moleculeAtoms(NULL), moleculeStartIndex(NULL) {
    }
    ~CudaApplyMonteCarloBarostatKernel();
    /**
@@ -1622,6 +1675,7 @@ private:
    bool hasInitializedKernels;
    int numMolecules;
    CudaArray* savedPositions;
+    CudaArray* savedForces;
    CudaArray* moleculeAtoms;
    CudaArray* moleculeStartIndex;
    CUfunction kernel;

--- a/platforms/cuda/include/CudaPlatform.h
+++ b/platforms/cuda/include/CudaPlatform.h
@@ -53,6 +53,7 @@ public:
    const std::string& getPropertyValue(const Context& context, const std::string& property) const;
    void setPropertyValue(Context& context, const std::string& property, const std::string& value) const;
    void contextCreated(ContextImpl& context, const std::map<std::string, std::string>& properties) const;
+    void linkedContextCreated(ContextImpl& context, ContextImpl& originalContext) const;
    void contextDestroyed(ContextImpl& context) const;
    /**
     * This is the name of the parameter for selecting which CUDA device or devices to use.
@@ -130,7 +131,7 @@ class OPENMM_EXPORT_CUDA CudaPlatform::PlatformData {
 public:
    PlatformData(ContextImpl* context, const System& system, const std::string& deviceIndexProperty, const std::string& blockingProperty, const std::string& precisionProperty,
            const std::string& cpuPmeProperty, const std::string& compilerProperty, const std::string& tempProperty, const std::string& hostCompilerProperty,
-            const std::string& pmeStreamProperty, const std::string& deterministicForcesProperty, int numThreads);
+            const std::string& pmeStreamProperty, const std::string& deterministicForcesProperty, int numThreads, ContextImpl* originalContext);
    ~PlatformData();
    void initializeContexts(const System& system);
    void syncContexts();

--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -106,9 +106,9 @@ static int executeInWindows(const string &command) {
 #endif
 CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler,
-        const string& tempDir, const std::string& hostCompiler, CudaPlatform::PlatformData& platformData) : system(system), currentStream(0),
+        const string& tempDir, const std::string& hostCompiler, CudaPlatform::PlatformData& platformData, CudaContext* originalContext) : system(system), currentStream(0),
        time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), stepsSinceReorder(99999), contextIsValid(false), atomsWereReordered(false), hasCompilerKernel(false), isNvccAvailable(false),
-        pinnedBuffer(NULL), posq(NULL), posqCorrection(NULL), velm(NULL), force(NULL), energyBuffer(NULL), energyParamDerivBuffer(NULL), atomIndexDevice(NULL), chargeBuffer(NULL),
+        pinnedBuffer(NULL), posq(NULL), posqCorrection(NULL), velm(NULL), force(NULL), energyBuffer(NULL), energySum(NULL), energyParamDerivBuffer(NULL), atomIndexDevice(NULL), chargeBuffer(NULL),
        integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
    // Determine what compiler to use.
@@ -173,40 +173,49 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    cacheDir = cacheDir+"/";
 #endif
    contextIndex = platformData.contexts.size();
-    int numDevices;
    string errorMessage = "Error initializing Context";
-    CHECK_RESULT(cuDeviceGetCount(&numDevices));
+    if (originalContext == NULL) {
-    if (deviceIndex < -1 || deviceIndex >= numDevices)
+        isLinkedContext = false;
-        throw OpenMMException("Illegal value for DeviceIndex: "+intToString(deviceIndex));
+        int numDevices;
+        CHECK_RESULT(cuDeviceGetCount(&numDevices));
-    vector<int> devicePrecedence;
+        if (deviceIndex < -1 || deviceIndex >= numDevices)
-    if (deviceIndex == -1) {
+            throw OpenMMException("Illegal value for DeviceIndex: "+intToString(deviceIndex));
-        devicePrecedence = getDevicePrecedence();
-    } else {
+        vector<int> devicePrecedence;
-        devicePrecedence.push_back(deviceIndex);
+        if (deviceIndex == -1) {
-    }
+            devicePrecedence = getDevicePrecedence();
+        } else {
-    this->deviceIndex = -1;
+            devicePrecedence.push_back(deviceIndex);
-    for (int i = 0; i < static_cast<int>(devicePrecedence.size()); i++) {
+        }
-        int trialDeviceIndex = devicePrecedence[i];
-        CHECK_RESULT(cuDeviceGet(&device, trialDeviceIndex));
-        defaultOptimizationOptions = "--use_fast_math";
-        unsigned int flags = CU_CTX_MAP_HOST;
-        if (useBlockingSync)
-            flags += CU_CTX_SCHED_BLOCKING_SYNC;
-        else
-            flags += CU_CTX_SCHED_SPIN;
-        if (cuCtxCreate(&context, flags, device) == CUDA_SUCCESS) {
+        this->deviceIndex = -1;
-            this->deviceIndex = trialDeviceIndex;
+        for (int i = 0; i < static_cast<int>(devicePrecedence.size()); i++) {
-            break;
+            int trialDeviceIndex = devicePrecedence[i];
+            CHECK_RESULT(cuDeviceGet(&device, trialDeviceIndex));
+            defaultOptimizationOptions = "--use_fast_math";
+            unsigned int flags = CU_CTX_MAP_HOST;
+            if (useBlockingSync)
+                flags += CU_CTX_SCHED_BLOCKING_SYNC;
+            else
+                flags += CU_CTX_SCHED_SPIN;
+            if (cuCtxCreate(&context, flags, device) == CUDA_SUCCESS) {
+                this->deviceIndex = trialDeviceIndex;
+                break;
+            }
        }
+        if (this->deviceIndex == -1)
+            if (deviceIndex != -1)
+                throw OpenMMException("The requested CUDA device could not be loaded");
+            else
+                throw OpenMMException("No compatible CUDA device is available");
+    }
+    else {
+        isLinkedContext = true;
+        context = originalContext->context;
+        this->deviceIndex = originalContext->deviceIndex;
+        this->device = originalContext->device;
    }
-    if (this->deviceIndex == -1)
-        if (deviceIndex != -1)
-            throw OpenMMException("The requested CUDA device could not be loaded");
-        else
-            throw OpenMMException("No compatible CUDA device is available");
    int major, minor;
    CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
@@ -227,6 +236,12 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
            minor = 3;
        }
    }
+    if (major == 7) {
+        // Don't generate Volta-specific code until we've made the changes needed
+        // to support it properly.
+        major = 6;
+        minor = 0;
+    }
    gpuArchitecture = intToString(major)+intToString(minor);
    computeCapability = major+0.1*minor;
@@ -292,6 +307,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    clearFourBuffersKernel = getKernel(utilities, "clearFourBuffers");
    clearFiveBuffersKernel = getKernel(utilities, "clearFiveBuffers");
    clearSixBuffersKernel = getKernel(utilities, "clearSixBuffers");
+    reduceEnergyKernel = getKernel(utilities, "reduceEnergy");
    setChargesKernel = getKernel(utilities, "setCharges");
    // Set defines based on the requested precision.
@@ -405,6 +421,8 @@ CudaContext::~CudaContext() {
        delete force;
    if (energyBuffer != NULL)
        delete energyBuffer;
+    if (energySum != NULL)
+        delete energySum;
    if (energyParamDerivBuffer != NULL)
        delete energyParamDerivBuffer;
    if (atomIndexDevice != NULL)
@@ -422,7 +440,7 @@ CudaContext::~CudaContext() {
    if (thread != NULL)
        delete thread;
    string errorMessage = "Error deleting Context";
-    if (contextIsValid) {
+    if (contextIsValid && !isLinkedContext) {
        cuProfilerStop();
        CHECK_RESULT(cuCtxDestroy(context));
    }
@@ -435,16 +453,19 @@ void CudaContext::initialize() {
    int numEnergyBuffers = max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers());
    if (useDoublePrecision) {
        energyBuffer = CudaArray::create<double>(*this, numEnergyBuffers, "energyBuffer");
+        energySum = CudaArray::create<double>(*this, 1, "energySum");
        int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
    }
    else if (useMixedPrecision) {
        energyBuffer = CudaArray::create<double>(*this, numEnergyBuffers, "energyBuffer");
+        energySum = CudaArray::create<double>(*this, 1, "energySum");
        int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
    }
    else {
        energyBuffer = CudaArray::create<float>(*this, numEnergyBuffers, "energyBuffer");
+        energySum = CudaArray::create<float>(*this, 1, "energySum");
        int pinnedBufferSize = max(paddedNumAtoms*6, numEnergyBuffers);
        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(float), 0));
    }
@@ -864,6 +885,18 @@ void CudaContext::clearAutoclearBuffers() {
    }
 }
+double CudaContext::reduceEnergy() {
+    int bufferSize = energyBuffer->getSize();
+    int workGroupSize  = 512;
+    void* args[] = {&energyBuffer->getDevicePointer(), &energySum->getDevicePointer(), &bufferSize, &workGroupSize};
+    executeKernel(reduceEnergyKernel, args, workGroupSize, workGroupSize, workGroupSize*energyBuffer->getElementSize());
+    energySum->download(pinnedBuffer);
+    if (getUseDoublePrecision() || getUseMixedPrecision())
+        return *((double*) pinnedBuffer);
+    else
+        return *((float*) pinnedBuffer);
+}
 void CudaContext::setCharges(const vector<double>& charges) {
    if (chargeBuffer == NULL)
        chargeBuffer = new CudaArray(*this, numAtoms, useDoublePrecision ? sizeof(double) : sizeof(float), "chargeBuffer");
@@ -1050,9 +1083,16 @@ void CudaContext::findMoleculeGroups() {
            for (int i = 0; i < (int) forces.size() && identical; i++) {
                if (mol.groups[i].size() != mol2.groups[i].size())
                    identical = false;
-                for (int k = 0; k < (int) mol.groups[i].size() && identical; k++)
+                for (int k = 0; k < (int) mol.groups[i].size() && identical; k++) {
                    if (!forces[i]->areGroupsIdentical(mol.groups[i][k], mol2.groups[i][k]))
                        identical = false;
+                    vector<int> p1, p2;
+                    forces[i]->getParticlesInGroup(mol.groups[i][k], p1);
+                    forces[i]->getParticlesInGroup(mol2.groups[i][k], p2);
+                    for (int m = 0; m < p1.size(); m++)
+                        if (p1[m] != p2[m]-atomOffset)
+                            identical = false;
+                }
            }
            if (identical) {
                moleculeInstances[j].push_back(molIndex);

--- a/platforms/cuda/src/CudaIntegrationUtilities.cpp
+++ b/platforms/cuda/src/CudaIntegrationUtilities.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2015 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2017 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -103,7 +103,8 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
        ccmaReducedMass(NULL), ccmaAtomConstraints(NULL), ccmaNumAtomConstraints(NULL), ccmaConstraintMatrixColumn(NULL),
        ccmaConstraintMatrixValue(NULL), ccmaDelta1(NULL), ccmaDelta2(NULL), ccmaConverged(NULL), ccmaConvergedMemory(NULL),
        vsite2AvgAtoms(NULL), vsite2AvgWeights(NULL), vsite3AvgAtoms(NULL), vsite3AvgWeights(NULL),
-        vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL), vsiteLocalCoordsAtoms(NULL), vsiteLocalCoordsParams(NULL) {
+        vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL), vsiteLocalCoordsIndex(NULL), vsiteLocalCoordsAtoms(NULL),
+        vsiteLocalCoordsWeights(NULL), vsiteLocalCoordsPos(NULL), vsiteLocalCoordsStartIndex(NULL) {
    // Create workspace arrays.
    lastStepSize = make_double2(0.0, 0.0);
@@ -454,8 +455,11 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
    vector<double4> vsite3AvgWeightVec;
    vector<int4> vsiteOutOfPlaneAtomVec;
    vector<double4> vsiteOutOfPlaneWeightVec;
-    vector<int4> vsiteLocalCoordsAtomVec;
+    vector<int> vsiteLocalCoordsIndexVec;
-    vector<double> vsiteLocalCoordsParamVec;
+    vector<int> vsiteLocalCoordsAtomVec;
+    vector<int> vsiteLocalCoordsStartVec;
+    vector<double> vsiteLocalCoordsWeightVec;
+    vector<double4> vsiteLocalCoordsPosVec;
    for (int i = 0; i < numAtoms; i++) {
        if (system.isVirtualSite(i)) {
            if (dynamic_cast<const TwoParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
@@ -480,64 +484,72 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
                vsiteOutOfPlaneWeightVec.push_back(make_double4(site.getWeight12(), site.getWeight13(), site.getWeightCross(), 0.0));
            }
            else if (dynamic_cast<const LocalCoordinatesSite*>(&system.getVirtualSite(i)) != NULL) {
-                // An out of plane site.
+                // A local coordinates site.
                const LocalCoordinatesSite& site = dynamic_cast<const LocalCoordinatesSite&>(system.getVirtualSite(i));
-                vsiteLocalCoordsAtomVec.push_back(make_int4(i, site.getParticle(0), site.getParticle(1), site.getParticle(2)));
+                int numParticles = site.getNumParticles();
-                Vec3 origin = site.getOriginWeights();
+                vector<double> origin, x, y;
-                Vec3 x = site.getXWeights();
+                site.getOriginWeights(origin);
-                Vec3 y = site.getYWeights();
+                site.getXWeights(x);
+                site.getYWeights(y);
+                vsiteLocalCoordsIndexVec.push_back(i);
+                vsiteLocalCoordsStartVec.push_back(vsiteLocalCoordsAtomVec.size());
+                for (int j = 0; j < numParticles; j++) {
+                    vsiteLocalCoordsAtomVec.push_back(site.getParticle(j));
+                    vsiteLocalCoordsWeightVec.push_back(origin[j]);
+                    vsiteLocalCoordsWeightVec.push_back(x[j]);
+                    vsiteLocalCoordsWeightVec.push_back(y[j]);
+                }
                Vec3 pos = site.getLocalPosition();
-                vsiteLocalCoordsParamVec.push_back(origin[0]);
+                vsiteLocalCoordsPosVec.push_back(make_double4(pos[0], pos[1], pos[2], 0.0));
-                vsiteLocalCoordsParamVec.push_back(origin[1]);
-                vsiteLocalCoordsParamVec.push_back(origin[2]);
-                vsiteLocalCoordsParamVec.push_back(x[0]);
-                vsiteLocalCoordsParamVec.push_back(x[1]);
-                vsiteLocalCoordsParamVec.push_back(x[2]);
-                vsiteLocalCoordsParamVec.push_back(y[0]);
-                vsiteLocalCoordsParamVec.push_back(y[1]);
-                vsiteLocalCoordsParamVec.push_back(y[2]);
-                vsiteLocalCoordsParamVec.push_back(pos[0]);
-                vsiteLocalCoordsParamVec.push_back(pos[1]);
-                vsiteLocalCoordsParamVec.push_back(pos[2]);
            }
        }
    }
+    vsiteLocalCoordsStartVec.push_back(vsiteLocalCoordsAtomVec.size());
    int num2Avg = vsite2AvgAtomVec.size();
    int num3Avg = vsite3AvgAtomVec.size();
    int numOutOfPlane = vsiteOutOfPlaneAtomVec.size();
-    int numLocalCoords = vsiteLocalCoordsAtomVec.size();
+    int numLocalCoords = vsiteLocalCoordsPosVec.size();
    vsite2AvgAtoms = CudaArray::create<int4>(context, max(1, num2Avg), "vsite2AvgAtoms");
    vsite3AvgAtoms = CudaArray::create<int4>(context, max(1, num3Avg), "vsite3AvgAtoms");
    vsiteOutOfPlaneAtoms = CudaArray::create<int4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneAtoms");
-    vsiteLocalCoordsAtoms = CudaArray::create<int4>(context, max(1, numLocalCoords), "vsiteLocalCoordinatesAtoms");
+    vsiteLocalCoordsIndex = CudaArray::create<int>(context, max(1, (int) vsiteLocalCoordsIndexVec.size()), "vsiteLocalCoordsIndex");
+    vsiteLocalCoordsAtoms = CudaArray::create<int>(context, max(1, (int) vsiteLocalCoordsAtomVec.size()), "vsiteLocalCoordsAtoms");
+    vsiteLocalCoordsStartIndex = CudaArray::create<int>(context, max(1, (int) vsiteLocalCoordsStartVec.size()), "vsiteLocalCoordsStartIndex");
    if (num2Avg > 0)
        vsite2AvgAtoms->upload(vsite2AvgAtomVec);
    if (num3Avg > 0)
        vsite3AvgAtoms->upload(vsite3AvgAtomVec);
    if (numOutOfPlane > 0)
        vsiteOutOfPlaneAtoms->upload(vsiteOutOfPlaneAtomVec);
-    if (numLocalCoords > 0)
+    if (numLocalCoords > 0) {
+        vsiteLocalCoordsIndex->upload(vsiteLocalCoordsIndexVec);
        vsiteLocalCoordsAtoms->upload(vsiteLocalCoordsAtomVec);
+        vsiteLocalCoordsStartIndex->upload(vsiteLocalCoordsStartVec);
+    }
    if (context.getUseDoublePrecision()) {
        vsite2AvgWeights = CudaArray::create<double2>(context, max(1, num2Avg), "vsite2AvgWeights");
        vsite3AvgWeights = CudaArray::create<double4>(context, max(1, num3Avg), "vsite3AvgWeights");
        vsiteOutOfPlaneWeights = CudaArray::create<double4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneWeights");
-        vsiteLocalCoordsParams = CudaArray::create<double>(context, max(1, 12*numLocalCoords), "vsiteLocalCoordinatesParams");
+        vsiteLocalCoordsWeights = CudaArray::create<double>(context, max(1, (int) vsiteLocalCoordsWeightVec.size()), "vsiteLocalCoordsWeights");
+        vsiteLocalCoordsPos = CudaArray::create<double4>(context, max(1, (int) vsiteLocalCoordsPosVec.size()), "vsiteLocalCoordsPos");
        if (num2Avg > 0)
            vsite2AvgWeights->upload(vsite2AvgWeightVec);
        if (num3Avg > 0)
            vsite3AvgWeights->upload(vsite3AvgWeightVec);
        if (numOutOfPlane > 0)
            vsiteOutOfPlaneWeights->upload(vsiteOutOfPlaneWeightVec);
-        if (numLocalCoords > 0)
+        if (numLocalCoords > 0) {
-            vsiteLocalCoordsParams->upload(vsiteLocalCoordsParamVec);
+            vsiteLocalCoordsWeights->upload(vsiteLocalCoordsWeightVec);
+            vsiteLocalCoordsPos->upload(vsiteLocalCoordsPosVec);
+        }
    }
    else {
        vsite2AvgWeights = CudaArray::create<float2>(context, max(1, num2Avg), "vsite2AvgWeights");
        vsite3AvgWeights = CudaArray::create<float4>(context, max(1, num3Avg), "vsite3AvgWeights");
        vsiteOutOfPlaneWeights = CudaArray::create<float4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneWeights");
-        vsiteLocalCoordsParams = CudaArray::create<float>(context, max(1, 12*numLocalCoords), "vsiteLocalCoordinatesParams");
+        vsiteLocalCoordsWeights = CudaArray::create<float>(context, max(1, (int) vsiteLocalCoordsWeightVec.size()), "vsiteLocalCoordsWeights");
+        vsiteLocalCoordsPos = CudaArray::create<float4>(context, max(1, (int) vsiteLocalCoordsPosVec.size()), "vsiteLocalCoordsPos");
        if (num2Avg > 0) {
            vector<float2> floatWeights(num2Avg);
            for (int i = 0; i < num2Avg; i++)
@@ -557,10 +569,14 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
            vsiteOutOfPlaneWeights->upload(floatWeights);
        }
        if (numLocalCoords > 0) {
-            vector<float> floatParams(vsiteLocalCoordsParamVec.size());
+            vector<float> floatWeights(vsiteLocalCoordsWeightVec.size());
-            for (int i = 0; i < (int) vsiteLocalCoordsParamVec.size(); i++)
+            for (int i = 0; i < (int) vsiteLocalCoordsWeightVec.size(); i++)
-                floatParams[i] = (float) vsiteLocalCoordsParamVec[i];
+                floatWeights[i] = (float) vsiteLocalCoordsWeightVec[i];
-            vsiteLocalCoordsParams->upload(floatParams);
+            vsiteLocalCoordsWeights->upload(floatWeights);
+            vector<float4> floatPos(vsiteLocalCoordsPosVec.size());
+            for (int i = 0; i < (int) vsiteLocalCoordsPosVec.size(); i++)
+                floatPos[i] = make_float4((float) vsiteLocalCoordsPosVec[i].x, (float) vsiteLocalCoordsPosVec[i].y, (float) vsiteLocalCoordsPosVec[i].z, 0.0f);
+            vsiteLocalCoordsPos->upload(floatPos);
        }
    }
@@ -644,10 +660,16 @@ CudaIntegrationUtilities::~CudaIntegrationUtilities() {
        delete vsiteOutOfPlaneAtoms;
    if (vsiteOutOfPlaneWeights != NULL)
        delete vsiteOutOfPlaneWeights;
+    if (vsiteLocalCoordsIndex != NULL)
+        delete vsiteLocalCoordsIndex;
    if (vsiteLocalCoordsAtoms != NULL)
        delete vsiteLocalCoordsAtoms;
-    if (vsiteLocalCoordsParams != NULL)
+    if (vsiteLocalCoordsWeights != NULL)
-        delete vsiteLocalCoordsParams;
+        delete vsiteLocalCoordsWeights;
+    if (vsiteLocalCoordsPos != NULL)
+        delete vsiteLocalCoordsPos;
+    if (vsiteLocalCoordsStartIndex != NULL)
+        delete vsiteLocalCoordsStartIndex;
 }
 void CudaIntegrationUtilities::setNextStepSize(double size) {
@@ -747,7 +769,9 @@ void CudaIntegrationUtilities::computeVirtualSites() {
        void* args[] = {&context.getPosq().getDevicePointer(), &posCorrection, &vsite2AvgAtoms->getDevicePointer(), &vsite2AvgWeights->getDevicePointer(),
                &vsite3AvgAtoms->getDevicePointer(), &vsite3AvgWeights->getDevicePointer(),
                &vsiteOutOfPlaneAtoms->getDevicePointer(), &vsiteOutOfPlaneWeights->getDevicePointer(),
-                &vsiteLocalCoordsAtoms->getDevicePointer(), &vsiteLocalCoordsParams->getDevicePointer()};
+                &vsiteLocalCoordsIndex->getDevicePointer(), &vsiteLocalCoordsAtoms->getDevicePointer(),
+                &vsiteLocalCoordsWeights->getDevicePointer(), &vsiteLocalCoordsPos->getDevicePointer(),
+                &vsiteLocalCoordsStartIndex->getDevicePointer()};
        context.executeKernel(vsitePositionKernel, args, numVsites);
    }
 }
@@ -759,7 +783,9 @@ void CudaIntegrationUtilities::distributeForcesFromVirtualSites() {
                &vsite2AvgAtoms->getDevicePointer(), &vsite2AvgWeights->getDevicePointer(),
                &vsite3AvgAtoms->getDevicePointer(), &vsite3AvgWeights->getDevicePointer(),
                &vsiteOutOfPlaneAtoms->getDevicePointer(), &vsiteOutOfPlaneWeights->getDevicePointer(),
-                &vsiteLocalCoordsAtoms->getDevicePointer(), &vsiteLocalCoordsParams->getDevicePointer()};
+                &vsiteLocalCoordsIndex->getDevicePointer(), &vsiteLocalCoordsAtoms->getDevicePointer(),
+                &vsiteLocalCoordsWeights->getDevicePointer(), &vsiteLocalCoordsPos->getDevicePointer(),
+                &vsiteLocalCoordsStartIndex->getDevicePointer()};
        context.executeKernel(vsiteForceKernel, args, numVsites);
    }
 }

--- a/platforms/cuda/src/CudaKernelFactory.cpp
+++ b/platforms/cuda/src/CudaKernelFactory.cpp
@@ -108,6 +108,8 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform
        return new CudaCalcCustomCentroidBondForceKernel(name, platform, cu, context.getSystem());
    if (name == CalcCustomCompoundBondForceKernel::Name())
        return new CudaCalcCustomCompoundBondForceKernel(name, platform, cu, context.getSystem());
+    if (name == CalcCustomCVForceKernel::Name())
+        return new CudaCalcCustomCVForceKernel(name, platform, cu);
    if (name == CalcCustomManyParticleForceKernel::Name())
        return new CudaCalcCustomManyParticleForceKernel(name, platform, cu, context.getSystem());
    if (name == CalcGayBerneForceKernel::Name())

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
--- a/platforms/cuda/src/CudaPlatform.cpp
+++ b/platforms/cuda/src/CudaPlatform.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2017 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -91,6 +91,7 @@ CudaPlatform::CudaPlatform() {
    registerKernelFactory(CalcCustomHbondForceKernel::Name(), factory);
    registerKernelFactory(CalcCustomCentroidBondForceKernel::Name(), factory);
    registerKernelFactory(CalcCustomCompoundBondForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomCVForceKernel::Name(), factory);
    registerKernelFactory(CalcCustomManyParticleForceKernel::Name(), factory);
    registerKernelFactory(CalcGayBerneForceKernel::Name(), factory);
    registerKernelFactory(IntegrateVerletStepKernel::Name(), factory);
@@ -198,7 +199,23 @@ void CudaPlatform::contextCreated(ContextImpl& context, const map<string, string
    if (threadsEnv != NULL)
        stringstream(threadsEnv) >> threads;
    context.setPlatformData(new PlatformData(&context, context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, cpuPmePropValue, compilerPropValue, tempPropValue,
-            hostCompilerPropValue, pmeStreamPropValue, deterministicForcesValue, threads));
+            hostCompilerPropValue, pmeStreamPropValue, deterministicForcesValue, threads, NULL));
+}
+void CudaPlatform::linkedContextCreated(ContextImpl& context, ContextImpl& originalContext) const {
+    Platform& platform = originalContext.getPlatform();
+    string devicePropValue = platform.getPropertyValue(originalContext.getOwner(), CudaDeviceIndex());
+    string blockingPropValue = platform.getPropertyValue(originalContext.getOwner(), CudaUseBlockingSync());
+    string precisionPropValue = platform.getPropertyValue(originalContext.getOwner(), CudaPrecision());
+    string cpuPmePropValue = platform.getPropertyValue(originalContext.getOwner(), CudaUseCpuPme());
+    string compilerPropValue = platform.getPropertyValue(originalContext.getOwner(), CudaCompiler());
+    string tempPropValue = platform.getPropertyValue(originalContext.getOwner(), CudaTempDirectory());
+    string hostCompilerPropValue = platform.getPropertyValue(originalContext.getOwner(), CudaHostCompiler());
+    string pmeStreamPropValue = platform.getPropertyValue(originalContext.getOwner(), CudaDisablePmeStream());
+    string deterministicForcesValue = platform.getPropertyValue(originalContext.getOwner(), CudaDeterministicForces());
+    int threads = reinterpret_cast<PlatformData*>(originalContext.getPlatformData())->threads.getNumThreads();
+    context.setPlatformData(new PlatformData(&context, context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, cpuPmePropValue, compilerPropValue, tempPropValue,
+            hostCompilerPropValue, pmeStreamPropValue, deterministicForcesValue, threads, &originalContext));
 }
 void CudaPlatform::contextDestroyed(ContextImpl& context) const {
@@ -208,7 +225,7 @@ void CudaPlatform::contextDestroyed(ContextImpl& context) const {
 CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& system, const string& deviceIndexProperty, const string& blockingProperty, const string& precisionProperty,
            const string& cpuPmeProperty, const string& compilerProperty, const string& tempProperty, const string& hostCompilerProperty, const string& pmeStreamProperty,
-            const string& deterministicForcesProperty, int numThreads) :
+            const string& deterministicForcesProperty, int numThreads, ContextImpl* originalContext) :
                context(context), removeCM(false), stepCount(0), computeForceCount(0), time(0.0), hasInitializedContexts(false), threads(numThreads) {
    bool blocking = (blockingProperty == "true");
    vector<string> devices;
@@ -218,16 +235,19 @@ CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& sys
        searchPos = nextPos+1;
    }
    devices.push_back(deviceIndexProperty.substr(searchPos));
+    PlatformData* originalData = NULL;
+    if (originalContext != NULL)
+        originalData = reinterpret_cast<PlatformData*>(originalContext->getPlatformData());
    try {
        for (int i = 0; i < (int) devices.size(); i++) {
            if (devices[i].length() > 0) {
                int deviceIndex;
                stringstream(devices[i]) >> deviceIndex;
-                contexts.push_back(new CudaContext(system, deviceIndex, blocking, precisionProperty, compilerProperty, tempProperty, hostCompilerProperty, *this));
+                contexts.push_back(new CudaContext(system, deviceIndex, blocking, precisionProperty, compilerProperty, tempProperty, hostCompilerProperty, *this, (originalData == NULL ? NULL : originalData->contexts[i])));
            }
        }
        if (contexts.size() == 0)
-            contexts.push_back(new CudaContext(system, -1, blocking, precisionProperty, compilerProperty, tempProperty, hostCompilerProperty, *this));
+            contexts.push_back(new CudaContext(system, -1, blocking, precisionProperty, compilerProperty, tempProperty, hostCompilerProperty, *this, (originalData == NULL ? NULL : originalData->contexts[0])));
    }
    catch (...) {
        // If an exception was thrown, do our best to clean up memory.

--- a/platforms/cuda/src/kernels/customCVForce.cu
+++ b/platforms/cuda/src/kernels/customCVForce.cu
+/**
+ * Copy the positions and velocities to the inner context.
+ */
+extern "C" __global__ void copyState(real4* posq, real4* posqCorrection, mixed4* velm, int* __restrict__ atomOrder,
+        real4* innerPosq, real4* innerPosqCorrection, mixed4* innerVelm, int* __restrict__ innerInvAtomOrder,
+        int numAtoms) {
+    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < numAtoms; i += blockDim.x*gridDim.x) {
+        int index = innerInvAtomOrder[atomOrder[i]];
+        innerPosq[index] = posq[i];
+        innerVelm[index] = velm[i];
+#ifdef USE_MIXED_PRECISION
+        innerPosqCorrection[index] = posqCorrection[i];
+#endif
+    }
+}
+/**
+ * Copy the forces back to the main context.
+ */
+extern "C" __global__ void copyForces(long long* forces, int* __restrict__ invAtomOrder, long long* innerForces,
+        int* __restrict__ innerAtomOrder, int numAtoms, int paddedNumAtoms) {
+    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < numAtoms; i += blockDim.x*gridDim.x) {
+        int index = invAtomOrder[innerAtomOrder[i]];
+        forces[index] = innerForces[i];
+        forces[index+paddedNumAtoms] = innerForces[i+paddedNumAtoms];
+        forces[index+paddedNumAtoms*2] = innerForces[i+paddedNumAtoms*2];
+    }
+}
+/**
+ * Add all the forces from the CVs.
+ */
+extern "C" __global__ void addForces(long long* forces, int bufferSize
+    PARAMETER_ARGUMENTS) {
+    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < bufferSize; i += blockDim.x*gridDim.x) {
+        ADD_FORCES
+    }
+}
--- a/platforms/cuda/src/kernels/customHbondForce.cu
+++ b/platforms/cuda/src/kernels/customHbondForce.cu
@@ -6,26 +6,17 @@ inline __device__ real3 trim(real4 v) {
 }
 /**
- * This does nothing, and just exists to simply the code generation.
+ * This does nothing, and just exists to simplify the code generation.
 */
 inline __device__ real3 trim(real3 v) {
    return v;
 }
 /**
- * Compute the difference between two vectors, setting the fourth component to the squared magnitude.
+ * Compute the difference between two vectors, optionally taking periodic boundary conditions into account
- */
-inline __device__ real4 delta(real4 vec1, real4 vec2) {
-    real4 result = make_real4(vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0.0f);
-    result.w = result.x*result.x + result.y*result.y + result.z*result.z;
-    return result;
-}
-/**
- * Compute the difference between two vectors, taking periodic boundary conditions into account
 * and setting the fourth component to the squared magnitude.
 */
-inline __device__ real4 deltaPeriodic(real4 vec1, real4 vec2, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
+inline __device__ real4 delta(real4 vec1, real4 vec2, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
    real4 result = make_real4(vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0.0f);
 #ifdef USE_PERIODIC
    APPLY_PERIODIC_TO_DELTA(result)
@@ -95,6 +86,7 @@ extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ f
        for (int acceptorStart = 0; acceptorStart < NUM_ACCEPTORS; acceptorStart += blockDim.x) {
            // Load the next block of acceptors into local memory.
+            __syncthreads();
            int blockSize = min((int) blockDim.x, NUM_ACCEPTORS-acceptorStart);
            if (threadIdx.x < blockSize) {
                int4 atoms2 = acceptorAtoms[acceptorStart+threadIdx.x];
@@ -105,8 +97,8 @@ extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ f
            __syncthreads();
            if (donorIndex < NUM_DONORS) {
                for (int index = 0; index < blockSize; index++) {
-#ifdef USE_EXCLUSIONS
                    int acceptorIndex = acceptorStart+index;
+#ifdef USE_EXCLUSIONS
                    if (acceptorIndex == exclusionIndices.x || acceptorIndex == exclusionIndices.y || acceptorIndex == exclusionIndices.z || acceptorIndex == exclusionIndices.w)
                        continue;
 #endif
@@ -115,7 +107,7 @@ extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ f
                    real4 a1 = posBuffer[3*index];
                    real4 a2 = posBuffer[3*index+1];
                    real4 a3 = posBuffer[3*index+2];
-                    real4 deltaD1A1 = deltaPeriodic(d1, a1, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);
+                    real4 deltaD1A1 = delta(d1, a1, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);
 #ifdef USE_CUTOFF
                    if (deltaD1A1.w < CUTOFF_SQUARED) {
 #endif
@@ -183,6 +175,7 @@ extern "C" __global__ void computeAcceptorForces(unsigned long long* __restrict_
        for (int donorStart = 0; donorStart < NUM_DONORS; donorStart += blockDim.x) {
            // Load the next block of donors into local memory.
+            __syncthreads();
            int blockSize = min((int) blockDim.x, NUM_DONORS-donorStart);
            if (threadIdx.x < blockSize) {
                int4 atoms2 = donorAtoms[donorStart+threadIdx.x];
@@ -193,8 +186,8 @@ extern "C" __global__ void computeAcceptorForces(unsigned long long* __restrict_
            __syncthreads();
            if (acceptorIndex < NUM_ACCEPTORS) {
                for (int index = 0; index < blockSize; index++) {
-#ifdef USE_EXCLUSIONS
                    int donorIndex = donorStart+index;
+#ifdef USE_EXCLUSIONS
                    if (donorIndex == exclusionIndices.x || donorIndex == exclusionIndices.y || donorIndex == exclusionIndices.z || donorIndex == exclusionIndices.w)
                        continue;
 #endif
@@ -203,7 +196,7 @@ extern "C" __global__ void computeAcceptorForces(unsigned long long* __restrict_
                    real4 d1 = posBuffer[3*index];
                    real4 d2 = posBuffer[3*index+1];
                    real4 d3 = posBuffer[3*index+2];
-                    real4 deltaD1A1 = deltaPeriodic(d1, a1, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);
+                    real4 deltaD1A1 = delta(d1, a1, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);
 #ifdef USE_CUTOFF
                    if (deltaD1A1.w < CUTOFF_SQUARED) {
 #endif

--- a/platforms/cuda/src/kernels/customManyParticle.cu
+++ b/platforms/cuda/src/kernels/customManyParticle.cu
@@ -60,6 +60,11 @@ inline __device__ real4 computeCross(real4 vec1, real4 vec2) {
 * Determine whether a particular interaction is in the list of exclusions.
 */
 inline __device__ bool isInteractionExcluded(int atom1, int atom2, const int* __restrict__ exclusions, const int* __restrict__ exclusionStartIndex) {
+    if (atom1 > atom2) {
+        int temp = atom1;
+        atom1 = atom2;
+        atom2 = temp;
+    }
    int first = exclusionStartIndex[atom1];
    int last = exclusionStartIndex[atom1+1];
    for (int i = last-1; i >= first; i--) {

--- a/platforms/cuda/src/kernels/integrationUtilities.cu
+++ b/platforms/cuda/src/kernels/integrationUtilities.cu
@@ -680,7 +680,9 @@ extern "C" __global__ void updateCCMAAtomPositions(const int* __restrict__ numAt
 extern "C" __global__ void computeVirtualSites(real4* __restrict__ posq, real4* __restrict__ posqCorrection, const int4* __restrict__ avg2Atoms, const real2* __restrict__ avg2Weights,
        const int4* __restrict__ avg3Atoms, const real4* __restrict__ avg3Weights,
        const int4* __restrict__ outOfPlaneAtoms, const real4* __restrict__ outOfPlaneWeights,
-        const int4* __restrict__ localCoordsAtoms, const real* __restrict__ localCoordsParams) {
+        const int* __restrict__ localCoordsIndex, const int* __restrict__ localCoordsAtoms,
+        const real* __restrict__ localCoordsWeights, const real4* __restrict__ localCoordsPos,
+        const int* __restrict__ localCoordsStartIndex) {
    // Two particle average sites.
@@ -732,30 +734,31 @@ extern "C" __global__ void computeVirtualSites(real4* __restrict__ posq, real4*
    // Local coordinates sites.
    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_LOCAL_COORDS; index += blockDim.x*gridDim.x) {
-        int4 atoms = localCoordsAtoms[index];
+        int siteAtomIndex = localCoordsIndex[index];
-        const real* params = &localCoordsParams[12*index];
+        int start = localCoordsStartIndex[index];
-        mixed4 pos = loadPos(posq, posqCorrection, atoms.x);
+        int end = localCoordsStartIndex[index+1];
-        mixed4 pos1_4 = loadPos(posq, posqCorrection, atoms.y);
+        mixed3 origin = make_mixed3(0), xdir = make_mixed3(0), ydir = make_mixed3(0);
-        mixed4 pos2_4 = loadPos(posq, posqCorrection, atoms.z);
+        for (int j = start; j < end; j++) {
-        mixed4 pos3_4 = loadPos(posq, posqCorrection, atoms.w);
+            mixed3 pos = trimTo3(loadPos(posq, posqCorrection, localCoordsAtoms[j]));
-        mixed3 pos1 = make_mixed3(pos1_4.x, pos1_4.y, pos1_4.z);
+            origin += pos*localCoordsWeights[3*j];
-        mixed3 pos2 = make_mixed3(pos2_4.x, pos2_4.y, pos2_4.z);
+            xdir += pos*localCoordsWeights[3*j+1];
-        mixed3 pos3 = make_mixed3(pos3_4.x, pos3_4.y, pos3_4.z);
+            ydir += pos*localCoordsWeights[3*j+2];
-        mixed3 originWeights = make_mixed3(params[0], params[1], params[2]);
+        }
-        mixed3 xWeights = make_mixed3(params[3], params[4], params[5]);
-        mixed3 yWeights = make_mixed3(params[6], params[7], params[8]);
-        mixed3 localPosition = make_mixed3(params[9], params[10], params[11]);
-        mixed3 origin = pos1*originWeights.x + pos2*originWeights.y + pos3*originWeights.z;
-        mixed3 xdir = pos1*xWeights.x + pos2*xWeights.y + pos3*xWeights.z;
-        mixed3 ydir = pos1*yWeights.x + pos2*yWeights.y + pos3*yWeights.z;
        mixed3 zdir = cross(xdir, ydir);
-        xdir *= rsqrt(xdir.x*xdir.x+xdir.y*xdir.y+xdir.z*xdir.z);
+        mixed normXdir = sqrt(xdir.x*xdir.x+xdir.y*xdir.y+xdir.z*xdir.z);
-        zdir *= rsqrt(zdir.x*zdir.x+zdir.y*zdir.y+zdir.z*zdir.z);
+        mixed normZdir = sqrt(zdir.x*zdir.x+zdir.y*zdir.y+zdir.z*zdir.z);
+        mixed invNormXdir = (normXdir > 0 ? 1/normXdir : 0);
+        mixed invNormZdir = (normZdir > 0 ? 1/normZdir : 0);
+        xdir *= invNormXdir;
+        zdir *= invNormZdir;
        ydir = cross(zdir, xdir);
+        real4 localPosition_4 = localCoordsPos[index];
+        mixed3 localPosition = make_mixed3(localPosition_4.x, localPosition_4.y, localPosition_4.z);
+        mixed4 pos = loadPos(posq, posqCorrection, siteAtomIndex);
        pos.x = origin.x + xdir.x*localPosition.x + ydir.x*localPosition.y + zdir.x*localPosition.z;
        pos.y = origin.y + xdir.y*localPosition.x + ydir.y*localPosition.y + zdir.y*localPosition.z;
        pos.z = origin.z + xdir.z*localPosition.x + ydir.z*localPosition.y + zdir.z*localPosition.z;
-        storePos(posq, posqCorrection, atoms.x, pos);
+        storePos(posq, posqCorrection, siteAtomIndex, pos);
    }
 }
@@ -778,7 +781,9 @@ extern "C" __global__ void distributeVirtualSiteForces(const real4* __restrict__
        const int4* __restrict__ avg2Atoms, const real2* __restrict__ avg2Weights,
        const int4* __restrict__ avg3Atoms, const real4* __restrict__ avg3Weights,
        const int4* __restrict__ outOfPlaneAtoms, const real4* __restrict__ outOfPlaneWeights,
-        const int4* __restrict__ localCoordsAtoms, const real* __restrict__ localCoordsParams) {
+        const int* __restrict__ localCoordsIndex, const int* __restrict__ localCoordsAtoms,
+        const real* __restrict__ localCoordsWeights, const real4* __restrict__ localCoordsPos,
+        const int* __restrict__ localCoordsStartIndex) {
    // Two particle average sites.
@@ -826,87 +831,56 @@ extern "C" __global__ void distributeVirtualSiteForces(const real4* __restrict__
    // Local coordinates sites.
    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_LOCAL_COORDS; index += blockDim.x*gridDim.x) {
-        int4 atoms = localCoordsAtoms[index];
+        int siteAtomIndex = localCoordsIndex[index];
-        const real* params = &localCoordsParams[12*index];
+        int start = localCoordsStartIndex[index];
-        mixed4 pos = loadPos(posq, posqCorrection, atoms.x);
+        int end = localCoordsStartIndex[index+1];
-        mixed4 pos1_4 = loadPos(posq, posqCorrection, atoms.y);
+        mixed3 origin = make_mixed3(0), xdir = make_mixed3(0), ydir = make_mixed3(0);
-        mixed4 pos2_4 = loadPos(posq, posqCorrection, atoms.z);
+        for (int j = start; j < end; j++) {
-        mixed4 pos3_4 = loadPos(posq, posqCorrection, atoms.w);
+            mixed3 pos = trimTo3(loadPos(posq, posqCorrection, localCoordsAtoms[j]));
-        mixed3 pos1 = make_mixed3(pos1_4.x, pos1_4.y, pos1_4.z);
+            origin += pos*localCoordsWeights[3*j];
-        mixed3 pos2 = make_mixed3(pos2_4.x, pos2_4.y, pos2_4.z);
+            xdir += pos*localCoordsWeights[3*j+1];
-        mixed3 pos3 = make_mixed3(pos3_4.x, pos3_4.y, pos3_4.z);
+            ydir += pos*localCoordsWeights[3*j+2];
-        mixed3 originWeights = make_mixed3(params[0], params[1], params[2]);
+        }
-        mixed3 wx = make_mixed3(params[3], params[4], params[5]);
-        mixed3 wy = make_mixed3(params[6], params[7], params[8]);
-        mixed3 localPosition = make_mixed3(params[9], params[10], params[11]);
-        mixed3 origin = pos1*originWeights.x + pos2*originWeights.y + pos3*originWeights.z;
-        mixed3 xdir = pos1*wx.x + pos2*wx.y + pos3*wx.z;
-        mixed3 ydir = pos1*wy.x + pos2*wy.y + pos3*wy.z;
        mixed3 zdir = cross(xdir, ydir);
-        mixed invNormXdir = rsqrt(xdir.x*xdir.x+xdir.y*xdir.y+xdir.z*xdir.z);
+        mixed normXdir = sqrt(xdir.x*xdir.x+xdir.y*xdir.y+xdir.z*xdir.z);
-        mixed invNormZdir = rsqrt(zdir.x*zdir.x+zdir.y*zdir.y+zdir.z*zdir.z);
+        mixed normZdir = sqrt(zdir.x*zdir.x+zdir.y*zdir.y+zdir.z*zdir.z);
+        mixed invNormXdir = (normXdir > 0 ? 1/normXdir : 0);
+        mixed invNormZdir = (normZdir > 0 ? 1/normZdir : 0);
        mixed3 dx = xdir*invNormXdir;
        mixed3 dz = zdir*invNormZdir;
        mixed3 dy = cross(dz, dx);
+        real4 localPosition_4 = localCoordsPos[index];
+        mixed3 localPosition = make_mixed3(localPosition_4.x, localPosition_4.y, localPosition_4.z);
        // The derivatives for this case are very complicated.  They were computed with SymPy then simplified by hand.
-        mixed t11 = (wx.x*ydir.x-wy.x*xdir.x)*invNormZdir;
+        real3 f = loadForce(siteAtomIndex, force);
-        mixed t12 = (wx.x*ydir.y-wy.x*xdir.y)*invNormZdir;
-        mixed t13 = (wx.x*ydir.z-wy.x*xdir.z)*invNormZdir;
-        mixed t21 = (wx.y*ydir.x-wy.y*xdir.x)*invNormZdir;
-        mixed t22 = (wx.y*ydir.y-wy.y*xdir.y)*invNormZdir;
-        mixed t23 = (wx.y*ydir.z-wy.y*xdir.z)*invNormZdir;
-        mixed t31 = (wx.z*ydir.x-wy.z*xdir.x)*invNormZdir;
-        mixed t32 = (wx.z*ydir.y-wy.z*xdir.y)*invNormZdir;
-        mixed t33 = (wx.z*ydir.z-wy.z*xdir.z)*invNormZdir;
-        mixed sx1 = t13*dz.y-t12*dz.z;
-        mixed sy1 = t11*dz.z-t13*dz.x;
-        mixed sz1 = t12*dz.x-t11*dz.y;
-        mixed sx2 = t23*dz.y-t22*dz.z;
-        mixed sy2 = t21*dz.z-t23*dz.x;
-        mixed sz2 = t22*dz.x-t21*dz.y;
-        mixed sx3 = t33*dz.y-t32*dz.z;
-        mixed sy3 = t31*dz.z-t33*dz.x;
-        mixed sz3 = t32*dz.x-t31*dz.y;
-        mixed3 wxScaled = wx*invNormXdir;
-        real3 f = loadForce(atoms.x, force);
        mixed3 fp1 = localPosition*f.x;
        mixed3 fp2 = localPosition*f.y;
        mixed3 fp3 = localPosition*f.z;
-        real3 f1 = make_real3(0);
+        for (int j = start; j < end; j++) {
-        real3 f2 = make_real3(0);
+            real originWeight = localCoordsWeights[3*j];
-        real3 f3 = make_real3(0);
+            real wx = localCoordsWeights[3*j+1];
-        f1.x += fp1.x*wxScaled.x*(1-dx.x*dx.x) + fp1.z*(dz.x*sx1    ) + fp1.y*((-dx.x*dy.x     )*wxScaled.x + dy.x*sx1 - dx.y*t12 - dx.z*t13) + f.x*originWeights.x;
+            real wy = localCoordsWeights[3*j+2];
-        f1.y += fp1.x*wxScaled.x*( -dx.x*dx.y) + fp1.z*(dz.x*sy1+t13) + fp1.y*((-dx.y*dy.x-dz.z)*wxScaled.x + dy.x*sy1 + dx.y*t11);
+            mixed wxScaled = wx*invNormXdir;
-        f1.z += fp1.x*wxScaled.x*( -dx.x*dx.z) + fp1.z*(dz.x*sz1-t12) + fp1.y*((-dx.z*dy.x+dz.y)*wxScaled.x + dy.x*sz1 + dx.z*t11);
+            mixed t1 = (wx*ydir.x-wy*xdir.x)*invNormZdir;
-        f2.x += fp1.x*wxScaled.y*(1-dx.x*dx.x) + fp1.z*(dz.x*sx2    ) + fp1.y*((-dx.x*dy.x     )*wxScaled.y + dy.x*sx2 - dx.y*t22 - dx.z*t23) + f.x*originWeights.y;
+            mixed t2 = (wx*ydir.y-wy*xdir.y)*invNormZdir;
-        f2.y += fp1.x*wxScaled.y*( -dx.x*dx.y) + fp1.z*(dz.x*sy2+t23) + fp1.y*((-dx.y*dy.x-dz.z)*wxScaled.y + dy.x*sy2 + dx.y*t21);
+            mixed t3 = (wx*ydir.z-wy*xdir.z)*invNormZdir;
-        f2.z += fp1.x*wxScaled.y*( -dx.x*dx.z) + fp1.z*(dz.x*sz2-t22) + fp1.y*((-dx.z*dy.x+dz.y)*wxScaled.y + dy.x*sz2 + dx.z*t21);
+            mixed sx = t3*dz.y-t2*dz.z;
-        f3.x += fp1.x*wxScaled.z*(1-dx.x*dx.x) + fp1.z*(dz.x*sx3    ) + fp1.y*((-dx.x*dy.x     )*wxScaled.z + dy.x*sx3 - dx.y*t32 - dx.z*t33) + f.x*originWeights.z;
+            mixed sy = t1*dz.z-t3*dz.x;
-        f3.y += fp1.x*wxScaled.z*( -dx.x*dx.y) + fp1.z*(dz.x*sy3+t33) + fp1.y*((-dx.y*dy.x-dz.z)*wxScaled.z + dy.x*sy3 + dx.y*t31);
+            mixed sz = t2*dz.x-t1*dz.y;
-        f3.z += fp1.x*wxScaled.z*( -dx.x*dx.z) + fp1.z*(dz.x*sz3-t32) + fp1.y*((-dx.z*dy.x+dz.y)*wxScaled.z + dy.x*sz3 + dx.z*t31);
+            real3 fresult = make_real3(0);
-        f1.x += fp2.x*wxScaled.x*( -dx.y*dx.x) + fp2.z*(dz.y*sx1-t13) - fp2.y*(( dx.x*dy.y-dz.z)*wxScaled.x - dy.y*sx1 - dx.x*t12);
+            fresult.x += fp1.x*wxScaled*(1-dx.x*dx.x) + fp1.z*(dz.x*sx   ) + fp1.y*((-dx.x*dy.x     )*wxScaled + dy.x*sx - dx.y*t2 - dx.z*t3) + f.x*originWeight;
-        f1.y += fp2.x*wxScaled.x*(1-dx.y*dx.y) + fp2.z*(dz.y*sy1    ) - fp2.y*(( dx.y*dy.y     )*wxScaled.x - dy.y*sy1 + dx.x*t11 + dx.z*t13) + f.y*originWeights.x;
+            fresult.y += fp1.x*wxScaled*( -dx.x*dx.y) + fp1.z*(dz.x*sy+t3) + fp1.y*((-dx.y*dy.x-dz.z)*wxScaled + dy.x*sy + dx.y*t1);
-        f1.z += fp2.x*wxScaled.x*( -dx.y*dx.z) + fp2.z*(dz.y*sz1+t11) - fp2.y*(( dx.z*dy.y+dz.x)*wxScaled.x - dy.y*sz1 - dx.z*t12);
+            fresult.z += fp1.x*wxScaled*( -dx.x*dx.z) + fp1.z*(dz.x*sz-t2) + fp1.y*((-dx.z*dy.x+dz.y)*wxScaled + dy.x*sz + dx.z*t1);
-        f2.x += fp2.x*wxScaled.y*( -dx.y*dx.x) + fp2.z*(dz.y*sx2-t23) - fp2.y*(( dx.x*dy.y-dz.z)*wxScaled.y - dy.y*sx2 - dx.x*t22);
+            fresult.x += fp2.x*wxScaled*( -dx.y*dx.x) + fp2.z*(dz.y*sx-t3) - fp2.y*(( dx.x*dy.y-dz.z)*wxScaled - dy.y*sx - dx.x*t2);
-        f2.y += fp2.x*wxScaled.y*(1-dx.y*dx.y) + fp2.z*(dz.y*sy2    ) - fp2.y*(( dx.y*dy.y     )*wxScaled.y - dy.y*sy2 + dx.x*t21 + dx.z*t23) + f.y*originWeights.y;
+            fresult.y += fp2.x*wxScaled*(1-dx.y*dx.y) + fp2.z*(dz.y*sy   ) - fp2.y*(( dx.y*dy.y     )*wxScaled - dy.y*sy + dx.x*t1 + dx.z*t3) + f.y*originWeight;
-        f2.z += fp2.x*wxScaled.y*( -dx.y*dx.z) + fp2.z*(dz.y*sz2+t21) - fp2.y*(( dx.z*dy.y+dz.x)*wxScaled.y - dy.y*sz2 - dx.z*t22);
+            fresult.z += fp2.x*wxScaled*( -dx.y*dx.z) + fp2.z*(dz.y*sz+t1) - fp2.y*(( dx.z*dy.y+dz.x)*wxScaled - dy.y*sz - dx.z*t2);
-        f3.x += fp2.x*wxScaled.z*( -dx.y*dx.x) + fp2.z*(dz.y*sx3-t33) - fp2.y*(( dx.x*dy.y-dz.z)*wxScaled.z - dy.y*sx3 - dx.x*t32);
+            fresult.x += fp3.x*wxScaled*( -dx.z*dx.x) + fp3.z*(dz.z*sx+t2) + fp3.y*((-dx.x*dy.z-dz.y)*wxScaled + dy.z*sx + dx.x*t3);
-        f3.y += fp2.x*wxScaled.z*(1-dx.y*dx.y) + fp2.z*(dz.y*sy3    ) - fp2.y*(( dx.y*dy.y     )*wxScaled.z - dy.y*sy3 + dx.x*t31 + dx.z*t33) + f.y*originWeights.z;
+            fresult.y += fp3.x*wxScaled*( -dx.z*dx.y) + fp3.z*(dz.z*sy-t1) + fp3.y*((-dx.y*dy.z+dz.x)*wxScaled + dy.z*sy + dx.y*t3);
-        f3.z += fp2.x*wxScaled.z*( -dx.y*dx.z) + fp2.z*(dz.y*sz3+t31) - fp2.y*(( dx.z*dy.y+dz.x)*wxScaled.z - dy.y*sz3 - dx.z*t32);
+            fresult.z += fp3.x*wxScaled*(1-dx.z*dx.z) + fp3.z*(dz.z*sz   ) + fp3.y*((-dx.z*dy.z     )*wxScaled + dy.z*sz - dx.x*t1 - dx.y*t2) + f.z*originWeight;
-        f1.x += fp3.x*wxScaled.x*( -dx.z*dx.x) + fp3.z*(dz.z*sx1+t12) + fp3.y*((-dx.x*dy.z-dz.y)*wxScaled.x + dy.z*sx1 + dx.x*t13);
+            addForce(localCoordsAtoms[j], force, fresult);
-        f1.y += fp3.x*wxScaled.x*( -dx.z*dx.y) + fp3.z*(dz.z*sy1-t11) + fp3.y*((-dx.y*dy.z+dz.x)*wxScaled.x + dy.z*sy1 + dx.y*t13);
+        }
-        f1.z += fp3.x*wxScaled.x*(1-dx.z*dx.z) + fp3.z*(dz.z*sz1    ) + fp3.y*((-dx.z*dy.z     )*wxScaled.x + dy.z*sz1 - dx.x*t11 - dx.y*t12) + f.z*originWeights.x;
-        f2.x += fp3.x*wxScaled.y*( -dx.z*dx.x) + fp3.z*(dz.z*sx2+t22) + fp3.y*((-dx.x*dy.z-dz.y)*wxScaled.y + dy.z*sx2 + dx.x*t23);
-        f2.y += fp3.x*wxScaled.y*( -dx.z*dx.y) + fp3.z*(dz.z*sy2-t21) + fp3.y*((-dx.y*dy.z+dz.x)*wxScaled.y + dy.z*sy2 + dx.y*t23);
-        f2.z += fp3.x*wxScaled.y*(1-dx.z*dx.z) + fp3.z*(dz.z*sz2    ) + fp3.y*((-dx.z*dy.z     )*wxScaled.y + dy.z*sz2 - dx.x*t21 - dx.y*t22) + f.z*originWeights.y;
-        f3.x += fp3.x*wxScaled.z*( -dx.z*dx.x) + fp3.z*(dz.z*sx3+t32) + fp3.y*((-dx.x*dy.z-dz.y)*wxScaled.z + dy.z*sx3 + dx.x*t33);
-        f3.y += fp3.x*wxScaled.z*( -dx.z*dx.y) + fp3.z*(dz.z*sy3-t31) + fp3.y*((-dx.y*dy.z+dz.x)*wxScaled.z + dy.z*sy3 + dx.y*t33);
-        f3.z += fp3.x*wxScaled.z*(1-dx.z*dx.z) + fp3.z*(dz.z*sz3    ) + fp3.y*((-dx.z*dy.z     )*wxScaled.z + dy.z*sz3 - dx.x*t31 - dx.y*t32) + f.z*originWeights.z;
-        addForce(atoms.y, force, f1);
-        addForce(atoms.z, force, f2);
-        addForce(atoms.w, force, f3);
    }
 }
@@ -924,4 +898,4 @@ extern "C" __global__ void timeShiftVelocities(mixed4* __restrict__ velm, const
            velm[index] = velocity;
        }
    }
 }
\ No newline at end of file
--- a/platforms/cuda/src/kernels/utilities.cu
+++ b/platforms/cuda/src/kernels/utilities.cu
@@ -73,6 +73,25 @@ __global__ void clearSixBuffers(int* __restrict__ buffer1, int size1, int* __res
    clearSingleBuffer(buffer6, size6);
 }
+/**
+ * Sum the energy buffer.
+ */
+__global__ void reduceEnergy(const mixed* __restrict__ energyBuffer, mixed* __restrict__ result, int bufferSize, int workGroupSize) {
+    extern __shared__ mixed tempBuffer[];
+    const unsigned int thread = threadIdx.x;
+    mixed sum = 0;
+    for (unsigned int index = thread; index < bufferSize; index += blockDim.x)
+        sum += energyBuffer[index];
+    tempBuffer[thread] = sum;
+    for (int i = 1; i < workGroupSize; i *= 2) {
+        __syncthreads();
+        if (thread%(i*2) == 0 && thread+i < workGroupSize)
+            tempBuffer[thread] += tempBuffer[thread+i];
+    }
+    if (thread == 0)
+        *result = tempBuffer[0];
+}
 /**
 * Record the atomic charges into the posq array.
 */

--- a/platforms/cuda/tests/TestCudaCustomCVForce.cpp
+++ b/platforms/cuda/tests/TestCudaCustomCVForce.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2017 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+#include "CudaTests.h"
+#include "TestCustomCVForce.h"
+void runPlatformTests() {
+}
--- a/platforms/cuda/tests/TestCudaFFT3D.cpp
+++ b/platforms/cuda/tests/TestCudaFFT3D.cpp
@@ -56,7 +56,7 @@ void testTransform(bool realToComplex, int xsize, int ysize, int zsize) {
    system.addParticle(0.0);
    CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), "false",
            platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()),
-            platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), "false", 1);
+            platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), "false", 1, NULL);
    CudaContext& context = *platformData.contexts[0];
    context.initialize();
    OpenMM_SFMT::SFMT sfmt;

--- a/platforms/cuda/tests/TestCudaRandom.cpp
+++ b/platforms/cuda/tests/TestCudaRandom.cpp
@@ -56,7 +56,7 @@ void testGaussian() {
        system.addParticle(1.0);
    CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), "false",
            platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()),
-            platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), "false", 1);
+            platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), "false", 1, NULL);
    CudaContext& context = *platformData.contexts[0];
    context.initialize();
    context.getIntegrationUtilities().initRandomNumberGenerator(0);

--- a/platforms/cuda/tests/TestCudaSort.cpp
+++ b/platforms/cuda/tests/TestCudaSort.cpp
@@ -66,7 +66,7 @@ void verifySorting(vector<float> array) {
    system.addParticle(0.0);
    CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), "false",
            platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()),
-            platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), "false", 1);
+            platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), "false", 1, NULL);
    CudaContext& context = *platformData.contexts[0];
    context.initialize();
    CudaArray data(context, array.size(), 4, "sortData");

--- a/platforms/opencl/include/OpenCLContext.h
+++ b/platforms/opencl/include/OpenCLContext.h
@@ -163,7 +163,8 @@ public:
    class ForcePostComputation;
    static const int ThreadBlockSize;
    static const int TileSize;
-    OpenCLContext(const System& system, int platformIndex, int deviceIndex, const std::string& precision, OpenCLPlatform::PlatformData& platformData);
+    OpenCLContext(const System& system, int platformIndex, int deviceIndex, const std::string& precision, OpenCLPlatform::PlatformData& platformData,
+        OpenCLContext* originalContext);
    ~OpenCLContext();
    /**
     * This is called to initialize internal data structures after all Forces in the system
@@ -363,9 +364,13 @@ public:
     */
    void reduceBuffer(OpenCLArray& array, int numBuffers);
    /**
-     * Sum the buffesr containing forces.
+     * Sum the buffers containing forces.
     */
    void reduceForces();
+    /**
+     * Sum the buffer containing energy.
+     */
+    double reduceEnergy();
    /**
     * Get the current simulation time.
     */
@@ -749,6 +754,7 @@ private:
    cl::Kernel clearSixBuffersKernel;
    cl::Kernel reduceReal4Kernel;
    cl::Kernel reduceForcesKernel;
+    cl::Kernel reduceEnergyKernel;
    cl::Kernel setChargesKernel;
    std::vector<OpenCLForceInfo*> forces;
    std::vector<Molecule> molecules;
@@ -763,6 +769,7 @@ private:
    OpenCLArray* forceBuffers;
    OpenCLArray* longForceBuffer;
    OpenCLArray* energyBuffer;
+    OpenCLArray* energySum;
    OpenCLArray* energyParamDerivBuffer;
    OpenCLArray* atomIndexDevice;
    OpenCLArray* chargeBuffer;

--- a/platforms/opencl/include/OpenCLIntegrationUtilities.h
+++ b/platforms/opencl/include/OpenCLIntegrationUtilities.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2014 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2017 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -156,8 +156,11 @@ private:
    OpenCLArray* vsite3AvgWeights;
    OpenCLArray* vsiteOutOfPlaneAtoms;
    OpenCLArray* vsiteOutOfPlaneWeights;
+    OpenCLArray* vsiteLocalCoordsIndex;
    OpenCLArray* vsiteLocalCoordsAtoms;
-    OpenCLArray* vsiteLocalCoordsParams;
+    OpenCLArray* vsiteLocalCoordsWeights;
+    OpenCLArray* vsiteLocalCoordsPos;
+    OpenCLArray* vsiteLocalCoordsStartIndex;
    int randomPos;
    int lastSeed, numVsites;
    bool hasInitializedPosConstraintKernels, hasInitializedVelConstraintKernels, ccmaUseDirectBuffer, hasOverlappingVsites;

--- a/platforms/opencl/include/OpenCLKernels.h
+++ b/platforms/opencl/include/OpenCLKernels.h
@@ -37,6 +37,7 @@
 #include "openmm/internal/CompiledExpressionSet.h"
 #include "openmm/internal/CustomIntegratorUtilities.h"
 #include "lepton/CompiledExpression.h"
+#include "lepton/ExpressionProgram.h"
 #include "openmm/System.h"
 namespace OpenMM {
@@ -1207,6 +1208,54 @@ private:
    cl::Kernel framesKernel, blockBoundsKernel, neighborsKernel, forceKernel, torqueKernel;
 };
+/**
+ * This kernel is invoked by CustomCVForce to calculate the forces acting on the system and the energy of the system.
+ */
+class OpenCLCalcCustomCVForceKernel : public CalcCustomCVForceKernel {
+public:
+    OpenCLCalcCustomCVForceKernel(std::string name, const Platform& platform, OpenCLContext& cl) : CalcCustomCVForceKernel(name, platform),
+            cl(cl), hasInitializedKernels(false), invAtomOrder(NULL), innerInvAtomOrder(NULL) {
+    }
+    ~OpenCLCalcCustomCVForceKernel();
+    /**
+     * Initialize the kernel.
+     *
+     * @param system     the System this kernel will be applied to
+     * @param force      the CustomCVForce this kernel will be used for
+     * @param innerContext   the context created by the CustomCVForce for computing collective variables
+     */
+    void initialize(const System& system, const CustomCVForce& force, ContextImpl& innerContext);
+    /**
+     * Execute the kernel to calculate the forces and/or energy.
+     *
+     * @param context        the context in which to execute this kernel
+     * @param innerContext   the context created by the CustomCVForce for computing collective variables
+     * @param includeForces  true if forces should be calculated
+     * @param includeEnergy  true if the energy should be calculated
+     * @return the potential energy due to the force
+     */
+    double execute(ContextImpl& context, ContextImpl& innerContext, bool includeForces, bool includeEnergy);
+    /**
+     * Copy state information to the inner context.
+     *
+     * @param context        the context in which to execute this kernel
+     * @param innerContext   the context created by the CustomCVForce for computing collective variables
+     */
+    void copyState(ContextImpl& context, ContextImpl& innerContext);
+private:
+    class ReorderListener;
+    OpenCLContext& cl;
+    bool hasInitializedKernels;
+    Lepton::ExpressionProgram energyExpression;
+    std::vector<std::string> variableNames, paramDerivNames, globalParameterNames;
+    std::vector<Lepton::ExpressionProgram> variableDerivExpressions;
+    std::vector<Lepton::ExpressionProgram> paramDerivExpressions;
+    std::vector<OpenCLArray*> cvForces;
+    OpenCLArray* invAtomOrder;
+    OpenCLArray* innerInvAtomOrder;
+    cl::Kernel copyStateKernel, copyForcesKernel, addForcesKernel;
+};
 /**
 * This kernel is invoked by VerletIntegrator to take one time step.
 */
@@ -1472,7 +1521,9 @@ private:
    class ReorderListener;
    class GlobalTarget;
    class DerivFunction;
-    std::string createPerDofComputation(const std::string& variable, const Lepton::ParsedExpression& expr, int component, CustomIntegrator& integrator, const std::string& forceName, const std::string& energyName);
+    std::string createPerDofComputation(const std::string& variable, const Lepton::ParsedExpression& expr, int component, CustomIntegrator& integrator,
+        const std::string& forceName, const std::string& energyName, std::vector<const TabulatedFunction*>& functions,
+        std::vector<std::pair<std::string, std::string> >& functionNames);
    void prepareForComputation(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid);
    Lepton::ExpressionTreeNode replaceDerivFunctions(const Lepton::ExpressionTreeNode& node, OpenMM::ContextImpl& context);
    void findExpressionsForDerivs(const Lepton::ExpressionTreeNode& node, std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& variableNodes);
@@ -1482,7 +1533,7 @@ private:
    OpenCLContext& cl;
    double energy;
    float energyFloat;
-    int numGlobalVariables;
+    int numGlobalVariables, sumWorkGroupSize;
    bool hasInitializedKernels, deviceValuesAreCurrent, deviceGlobalsAreCurrent, modifiesParameters, keNeedsForce, hasAnyConstraints, needsEnergyParamDerivs;
    mutable bool localValuesAreCurrent;
    OpenCLArray* globalValues;
@@ -1491,6 +1542,8 @@ private:
    OpenCLArray* uniformRandoms;
    OpenCLArray* randomSeed;
    OpenCLArray* perDofEnergyParamDerivs;
+    std::vector<OpenCLArray*> tabulatedFunctions;
+    std::map<int, double> savedEnergy;
    std::map<int, OpenCLArray*> savedForces;
    std::set<int> validSavedForces;
    OpenCLParameterSet* perDofValues;
@@ -1573,7 +1626,7 @@ private:
 class OpenCLApplyMonteCarloBarostatKernel : public ApplyMonteCarloBarostatKernel {
 public:
    OpenCLApplyMonteCarloBarostatKernel(std::string name, const Platform& platform, OpenCLContext& cl) : ApplyMonteCarloBarostatKernel(name, platform), cl(cl),
-            hasInitializedKernels(false), savedPositions(NULL), moleculeAtoms(NULL), moleculeStartIndex(NULL) {
+            hasInitializedKernels(false), savedPositions(NULL), savedForces(NULL), moleculeAtoms(NULL), moleculeStartIndex(NULL) {
    }
    ~OpenCLApplyMonteCarloBarostatKernel();
    /**
@@ -1608,6 +1661,7 @@ private:
    bool hasInitializedKernels;
    int numMolecules;
    OpenCLArray* savedPositions;
+    OpenCLArray* savedForces;
    OpenCLArray* moleculeAtoms;
    OpenCLArray* moleculeStartIndex;
    cl::Kernel kernel;