CUDA implementation of CustomCVForce

f995aceb · Peter Eastman · aee84c82 · f995aceb · f995aceb · f995aceb
Commit f995aceb authored Jun 16, 2017 by Peter Eastman
16 changed files
--- a/platforms/cuda/include/CudaContext.h
+++ b/platforms/cuda/include/CudaContext.h
@@ -77,7 +77,8 @@ public:
    static const int ThreadBlockSize;
    static const int TileSize;
    CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const std::string& precision,
-            const std::string& compiler, const std::string& tempDir, const std::string& hostCompiler, CudaPlatform::PlatformData& platformData);
+            const std::string& compiler, const std::string& tempDir, const std::string& hostCompiler, CudaPlatform::PlatformData& platformData,
+            CudaContext* originalContext);
    ~CudaContext();
    /**
     * This is called to initialize internal data structures after all Forces in the system
@@ -622,6 +623,7 @@ private:
    int numAtomBlocks;
    int numThreadBlocks;
    bool useBlockingSync, useDoublePrecision, useMixedPrecision, contextIsValid, atomsWereReordered, boxIsTriclinic, hasCompilerKernel, isNvccAvailable, forcesValid;
+    bool isLinkedContext;
    std::string compiler, tempDir, cacheDir, gpuArchitecture;
    float4 periodicBoxVecXFloat, periodicBoxVecYFloat, periodicBoxVecZFloat, periodicBoxSizeFloat, invPeriodicBoxSizeFloat;
    double4 periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ, periodicBoxSize, invPeriodicBoxSize;

--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -38,6 +38,7 @@
 #include "openmm/internal/CompiledExpressionSet.h"
 #include "openmm/internal/CustomIntegratorUtilities.h"
 #include "lepton/CompiledExpression.h"
+#include "lepton/ExpressionProgram.h"
 #include <cufft.h>

 namespace OpenMM {
@@ -1229,6 +1230,54 @@ private:
    CUevent event;
 };

+/**
+ * This kernel is invoked by CustomCVForce to calculate the forces acting on the system and the energy of the system.
+ *
+ * Each collective variable is evaluated in a separate inner context. The values are combined
+ * through the force's energy expression, and the per-variable forces are scaled by the derivative
+ * of that expression and accumulated into the force buffer of the CudaContext given to the constructor.
+ */
+class CudaCalcCustomCVForceKernel : public CalcCustomCVForceKernel {
+public:
+    CudaCalcCustomCVForceKernel(std::string name, const Platform& platform, CudaContext& cu) : CalcCustomCVForceKernel(name, platform),
+            cu(cu), hasInitializedListeners(false), invAtomOrder(NULL), innerInvAtomOrder(NULL) {
+    }
+    ~CudaCalcCustomCVForceKernel();
+    /**
+     * Initialize the kernel.
+     *
+     * @param system     the System this kernel will be applied to
+     * @param force      the CustomCVForce this kernel will be used for
+     * @param innerContext   the context created by the CustomCVForce for computing collective variables
+     */
+    void initialize(const System& system, const CustomCVForce& force, ContextImpl& innerContext);
+    /**
+     * Execute the kernel to calculate the forces and/or energy.
+     *
+     * @param context        the context in which to execute this kernel
+     * @param innerContext   the context created by the CustomCVForce for computing collective variables
+     * @param includeForces  true if forces should be calculated
+     * @param includeEnergy  true if the energy should be calculated
+     * @return the potential energy due to the force
+     */
+    double execute(ContextImpl& context, ContextImpl& innerContext, bool includeForces, bool includeEnergy);
+    /**
+     * Copy state information to the inner context.
+     *
+     * @param context        the context in which to execute this kernel
+     * @param innerContext   the context created by the CustomCVForce for computing collective variables
+     */
+    void copyState(ContextImpl& context, ContextImpl& innerContext);
+private:
+    class ReorderListener; // rebuilds an inverse atom-order array for one CudaContext
+    CudaContext& cu; // the CudaContext passed to the constructor (the outer context)
+    bool hasInitializedListeners; // set by the first copyState() call, after the reorder listeners are registered
+    Lepton::ExpressionProgram energyExpression; // compiled form of the force's energy function
+    std::vector<std::string> variableNames, paramDerivNames, globalParameterNames;
+    std::vector<Lepton::ExpressionProgram> variableDerivExpressions; // derivative of the energy with respect to each collective variable
+    std::vector<Lepton::ExpressionProgram> paramDerivExpressions; // derivative of the energy with respect to each name in paramDerivNames
+    std::vector<CudaArray*> cvForces; // one owned buffer of 3*paddedNumAtoms long long values per collective variable
+    CudaArray* invAtomOrder; // owned; inverse of cu's atom index list
+    CudaArray* innerInvAtomOrder; // owned; inverse of the inner context's atom index list
+    CUfunction copyStateKernel, copyForcesKernel, addForcesKernel; // assigned in initialize()
+};
+
 /**
 * This kernel is invoked by VerletIntegrator to take one time step.
 */

--- a/platforms/cuda/include/CudaPlatform.h
+++ b/platforms/cuda/include/CudaPlatform.h
@@ -53,6 +53,7 @@ public:
    const std::string& getPropertyValue(const Context& context, const std::string& property) const;
    void setPropertyValue(Context& context, const std::string& property, const std::string& value) const;
    void contextCreated(ContextImpl& context, const std::map<std::string, std::string>& properties) const;
+    void linkedContextCreated(ContextImpl& context, ContextImpl& originalContext) const;
    void contextDestroyed(ContextImpl& context) const;
    /**
     * This is the name of the parameter for selecting which CUDA device or devices to use.
@@ -130,7 +131,7 @@ class OPENMM_EXPORT_CUDA CudaPlatform::PlatformData {
 public:
    PlatformData(ContextImpl* context, const System& system, const std::string& deviceIndexProperty, const std::string& blockingProperty, const std::string& precisionProperty,
            const std::string& cpuPmeProperty, const std::string& compilerProperty, const std::string& tempProperty, const std::string& hostCompilerProperty,
-            const std::string& pmeStreamProperty, const std::string& deterministicForcesProperty, int numThreads);
+            const std::string& pmeStreamProperty, const std::string& deterministicForcesProperty, int numThreads, ContextImpl* originalContext);
    ~PlatformData();
    void initializeContexts(const System& system);
    void syncContexts();

--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -106,7 +106,7 @@ static int executeInWindows(const string &command) {
 #endif

 CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler,
-        const string& tempDir, const std::string& hostCompiler, CudaPlatform::PlatformData& platformData) : system(system), currentStream(0),
+        const string& tempDir, const std::string& hostCompiler, CudaPlatform::PlatformData& platformData, CudaContext* originalContext) : system(system), currentStream(0),
        time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), stepsSinceReorder(99999), contextIsValid(false), atomsWereReordered(false), hasCompilerKernel(false), isNvccAvailable(false),
        pinnedBuffer(NULL), posq(NULL), posqCorrection(NULL), velm(NULL), force(NULL), energyBuffer(NULL), energyParamDerivBuffer(NULL), atomIndexDevice(NULL), chargeBuffer(NULL),
        integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
@@ -173,40 +173,49 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    cacheDir = cacheDir+"/";
 #endif
    contextIndex = platformData.contexts.size();
-    int numDevices;
    string errorMessage = "Error initializing Context";
-    CHECK_RESULT(cuDeviceGetCount(&numDevices));
-    if (deviceIndex < -1 || deviceIndex >= numDevices)
-        throw OpenMMException("Illegal value for DeviceIndex: "+intToString(deviceIndex));
-
-    vector<int> devicePrecedence;
-    if (deviceIndex == -1) {
-        devicePrecedence = getDevicePrecedence();
-    } else {
-        devicePrecedence.push_back(deviceIndex);
-    }
-
-    this->deviceIndex = -1;
-    for (int i = 0; i < static_cast<int>(devicePrecedence.size()); i++) {
-        int trialDeviceIndex = devicePrecedence[i];
-        CHECK_RESULT(cuDeviceGet(&device, trialDeviceIndex));
-        defaultOptimizationOptions = "--use_fast_math";
-        unsigned int flags = CU_CTX_MAP_HOST;
-        if (useBlockingSync)
-            flags += CU_CTX_SCHED_BLOCKING_SYNC;
-        else
-            flags += CU_CTX_SCHED_SPIN;
+    if (originalContext == NULL) {
+        isLinkedContext = false;
+        int numDevices;
+        CHECK_RESULT(cuDeviceGetCount(&numDevices));
+        if (deviceIndex < -1 || deviceIndex >= numDevices)
+            throw OpenMMException("Illegal value for DeviceIndex: "+intToString(deviceIndex));
+
+        vector<int> devicePrecedence;
+        if (deviceIndex == -1) {
+            devicePrecedence = getDevicePrecedence();
+        } else {
+            devicePrecedence.push_back(deviceIndex);
+        }

-        if (cuCtxCreate(&context, flags, device) == CUDA_SUCCESS) {
-            this->deviceIndex = trialDeviceIndex;
-            break;
+        this->deviceIndex = -1;
+        for (int i = 0; i < static_cast<int>(devicePrecedence.size()); i++) {
+            int trialDeviceIndex = devicePrecedence[i];
+            CHECK_RESULT(cuDeviceGet(&device, trialDeviceIndex));
+            defaultOptimizationOptions = "--use_fast_math";
+            unsigned int flags = CU_CTX_MAP_HOST;
+            if (useBlockingSync)
+                flags += CU_CTX_SCHED_BLOCKING_SYNC;
+            else
+                flags += CU_CTX_SCHED_SPIN;
+
+            if (cuCtxCreate(&context, flags, device) == CUDA_SUCCESS) {
+                this->deviceIndex = trialDeviceIndex;
+                break;
+            }
        }
+        if (this->deviceIndex == -1)
+            if (deviceIndex != -1)
+                throw OpenMMException("The requested CUDA device could not be loaded");
+            else
+                throw OpenMMException("No compatible CUDA device is available");
+    }
+    else {
+        isLinkedContext = true;
+        context = originalContext->context;
+        this->deviceIndex = originalContext->deviceIndex;
+        this->device = originalContext->device;
    }
-    if (this->deviceIndex == -1)
-        if (deviceIndex != -1)
-            throw OpenMMException("The requested CUDA device could not be loaded");
-        else
-            throw OpenMMException("No compatible CUDA device is available");

    int major, minor;
    CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
@@ -428,7 +437,7 @@ CudaContext::~CudaContext() {
    if (thread != NULL)
        delete thread;
    string errorMessage = "Error deleting Context";
-    if (contextIsValid) {
+    if (contextIsValid && !isLinkedContext) {
        cuProfilerStop();
        CHECK_RESULT(cuCtxDestroy(context));
    }

--- a/platforms/cuda/src/CudaKernelFactory.cpp
+++ b/platforms/cuda/src/CudaKernelFactory.cpp
@@ -108,6 +108,8 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform
        return new CudaCalcCustomCentroidBondForceKernel(name, platform, cu, context.getSystem());
    if (name == CalcCustomCompoundBondForceKernel::Name())
        return new CudaCalcCustomCompoundBondForceKernel(name, platform, cu, context.getSystem());
+    if (name == CalcCustomCVForceKernel::Name())
+        return new CudaCalcCustomCVForceKernel(name, platform, cu);
    if (name == CalcCustomManyParticleForceKernel::Name())
        return new CudaCalcCustomManyParticleForceKernel(name, platform, cu, context.getSystem());
    if (name == CalcGayBerneForceKernel::Name())

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -6609,6 +6609,176 @@ void CudaCalcGayBerneForceKernel::sortAtoms() {
    exclusionStartIndex->upload(startIndexVec);
 }

+/**
+ * Listener that rebuilds the inverse of a CudaContext's atom index list and uploads it to a
+ * device array. It is registered through addReorderListener(), so it is presumably invoked
+ * each time that context reorders its atoms.
+ */
+class CudaCalcCustomCVForceKernel::ReorderListener : public CudaContext::ReorderListener {
+public:
+    ReorderListener(CudaContext& cu, CudaArray& invAtomOrder) : cu(cu), invAtomOrder(invAtomOrder) {
+    }
+    void execute() {
+        // The table has one slot per padded atom; slots not referenced by the index list stay 0.
+        vector<int> inverse(cu.getPaddedNumAtoms());
+        const vector<int>& atomIndex = cu.getAtomIndex();
+        int position = 0;
+        for (int originalAtom : atomIndex) {
+            // inverse[a] is the position at which atom a currently appears in the index list.
+            inverse[originalAtom] = position;
+            position++;
+        }
+        invAtomOrder.upload(inverse);
+    }
+private:
+    CudaContext& cu;
+    CudaArray& invAtomOrder;
+};
+
+/**
+ * Release the per-variable force buffers and the two inverse atom-order arrays owned by this kernel.
+ */
+CudaCalcCustomCVForceKernel::~CudaCalcCustomCVForceKernel() {
+    for (size_t i = 0; i < cvForces.size(); i++)
+        delete cvForces[i];
+    // Applying delete to a null pointer does nothing, so the arrays need no explicit NULL test.
+    delete invAtomOrder;
+    delete innerInvAtomOrder;
+}
+
+/**
+ * Initialize the kernel: record the names of global parameters and collective variables,
+ * compile the energy expression and its derivatives, register the energy parameter
+ * derivatives with the CudaContext, allocate the device arrays, and build the GPU kernels.
+ *
+ * @param system         the System this kernel will be applied to (not used by this implementation)
+ * @param force          the CustomCVForce this kernel will be used for
+ * @param innerContext   the context created by the CustomCVForce for computing collective variables
+ */
+void CudaCalcCustomCVForceKernel::initialize(const System& system, const CustomCVForce& force, ContextImpl& innerContext) {
+    int numCVs = force.getNumCollectiveVariables();
+    for (int i = 0; i < force.getNumGlobalParameters(); i++)
+        globalParameterNames.push_back(force.getGlobalParameterName(i));
+
+    // Create custom functions for the tabulated functions.  They are owned by this method
+    // and must be deleted on every exit path, including when parsing throws.
+
+    map<string, Lepton::CustomFunction*> functions;
+    try {
+        for (int i = 0; i < (int) force.getNumTabulatedFunctions(); i++)
+            functions[force.getTabulatedFunctionName(i)] = createReferenceTabulatedFunction(force.getTabulatedFunction(i));
+
+        // Create the expressions.
+
+        Lepton::ParsedExpression energyExpr = Lepton::Parser::parse(force.getEnergyFunction(), functions);
+        energyExpression = energyExpr.createProgram();
+        for (int i = 0; i < numCVs; i++) {
+            string name = force.getCollectiveVariableName(i);
+            variableNames.push_back(name);
+            variableDerivExpressions.push_back(energyExpr.differentiate(name).optimize().createProgram());
+        }
+        for (int i = 0; i < force.getNumEnergyParameterDerivatives(); i++) {
+            string name = force.getEnergyParameterDerivativeName(i);
+            paramDerivNames.push_back(name);
+            paramDerivExpressions.push_back(energyExpr.differentiate(name).optimize().createProgram());
+            cu.addEnergyParameterDerivative(name);
+        }
+    }
+    catch (...) {
+        for (auto& function : functions)
+            delete function.second;
+        throw;
+    }
+
+    // Delete the custom functions.
+
+    for (auto& function : functions)
+        delete function.second;
+
+    // Copy parameter derivatives from the inner context.
+
+    CudaContext& cu2 = *reinterpret_cast<CudaPlatform::PlatformData*>(innerContext.getPlatformData())->contexts[0];
+    for (auto& param : cu2.getEnergyParamDerivNames())
+        cu.addEnergyParameterDerivative(param);
+
+    // Create arrays for storing information.
+    // NOTE(review): the copyForces kernel writes only the first numAtoms entries of each
+    // component of a cvForce buffer, while addForces reads the whole force buffer length.
+    // Confirm that the padding entries are zero-initialized or that their sums are ignored.
+
+    for (int i = 0; i < numCVs; i++)
+        cvForces.push_back(CudaArray::create<long long>(cu, 3*cu.getPaddedNumAtoms(), "cvForce"));
+    invAtomOrder = CudaArray::create<int>(cu, cu.getPaddedNumAtoms(), "invAtomOrder");
+    innerInvAtomOrder = CudaArray::create<int>(cu, cu.getPaddedNumAtoms(), "innerInvAtomOrder");
+
+    // Create the kernels.  The addForces kernel takes one (force buffer, scale factor) pair
+    // of arguments per collective variable, so its source is generated here.
+
+    stringstream args, add;
+    for (int i = 0; i < numCVs; i++) {
+        args << ", long long* __restrict__ force" << i << ", real dEdV" << i;
+        add << "forces[i] += (long long) (force" << i << "[i]*dEdV" << i << ");\n";
+    }
+    map<string, string> replacements;
+    replacements["PARAMETER_ARGUMENTS"] = args.str();
+    replacements["ADD_FORCES"] = add.str();
+    CUmodule module = cu.createModule(cu.replaceStrings(CudaKernelSources::vectorOps+CudaKernelSources::customCVForce, replacements));
+    copyStateKernel = cu.getKernel(module, "copyState");
+    copyForcesKernel = cu.getKernel(module, "copyForces");
+    addForcesKernel = cu.getKernel(module, "addForces");
+}
+
+/**
+ * Evaluate every collective variable in the inner context, then compute the energy, add the
+ * resulting forces to the outer context's force buffer, and accumulate the energy parameter
+ * derivatives.
+ *
+ * @param context        the context in which to execute this kernel
+ * @param innerContext   the context created by the CustomCVForce for computing collective variables
+ * @param includeForces  not consulted here; forces are always computed and added
+ * @param includeEnergy  not consulted here; the energy is always computed
+ * @return the value of the energy expression
+ */
+double CudaCalcCustomCVForceKernel::execute(ContextImpl& context, ContextImpl& innerContext, bool includeForces, bool includeEnergy) {
+    copyState(context, innerContext);
+    int numCVs = variableNames.size();
+    int numAtoms = cu.getNumAtoms();
+    int paddedNumAtoms = cu.getPaddedNumAtoms();
+    CudaContext& cu2 = *reinterpret_cast<CudaPlatform::PlatformData*>(innerContext.getPlatformData())->contexts[0];
+    vector<double> cvValues;
+    vector<map<string, double> > cvDerivs(numCVs);
+    void* copyForcesArgs[] = {NULL, &invAtomOrder->getDevicePointer(), &cu2.getForce().getDevicePointer(), &cu2.getAtomIndexArray().getDevicePointer(), &numAtoms, &paddedNumAtoms};
+    for (int i = 0; i < numCVs; i++) {
+        // Variable i is evaluated by passing the single-bit mask 1<<i as the last argument.
+        cvValues.push_back(innerContext.calcForcesAndEnergy(true, true, 1<<i));
+        copyForcesArgs[0] = &cvForces[i]->getDevicePointer();
+        cu.executeKernel(copyForcesKernel, copyForcesArgs, numAtoms);
+        innerContext.getEnergyParameterDerivatives(cvDerivs[i]);
+    }
+
+    // Compute the energy and forces.
+
+    map<string, double> variables;
+    for (auto& name : globalParameterNames)
+        variables[name] = context.getParameter(name);
+    for (int i = 0; i < numCVs; i++)
+        variables[variableNames[i]] = cvValues[i];
+    double energy = energyExpression.evaluate(variables);
+    int bufferSize = cu.getForce().getSize();
+    vector<void*> addForcesArgs;
+    addForcesArgs.push_back(&cu.getForce().getDevicePointer());
+    addForcesArgs.push_back(&bufferSize);
+    // Both vectors are sized up front so the element addresses stored in addForcesArgs stay valid.
+    vector<double> dEdV(numCVs);
+    vector<float> dEdVFloat(numCVs);
+    const bool passDouble = cu.getUseDoublePrecision();
+    for (int i = 0; i < numCVs; i++) {
+        dEdV[i] = variableDerivExpressions[i].evaluate(variables);
+        dEdVFloat[i] = (float) dEdV[i];
+        addForcesArgs.push_back(&cvForces[i]->getDevicePointer());
+        // The scale factor is passed as double only in double precision, float otherwise.
+        if (passDouble)
+            addForcesArgs.push_back(&dEdV[i]);
+        else
+            addForcesArgs.push_back(&dEdVFloat[i]);
+    }
+    cu.executeKernel(addForcesKernel, &addForcesArgs[0], numAtoms);
+
+    // Compute the energy parameter derivatives.  The dE/dV values computed above are reused
+    // rather than evaluating each derivative expression a second time.
+
+    map<string, double>& energyParamDerivs = cu.getEnergyParamDerivWorkspace();
+    for (size_t i = 0; i < paramDerivExpressions.size(); i++)
+        energyParamDerivs[paramDerivNames[i]] += paramDerivExpressions[i].evaluate(variables);
+    for (int i = 0; i < numCVs; i++) {
+        for (auto& deriv : cvDerivs[i])
+            energyParamDerivs[deriv.first] += dEdV[i]*deriv.second;
+    }
+    return energy;
+}
+
+/**
+ * Copy positions, velocities, periodic box vectors, time, and parameter values from the
+ * outer context to the inner context. On the first call, the reorder listeners that keep
+ * the two inverse atom-order arrays up to date are registered and run once.
+ *
+ * @param context        the context in which to execute this kernel
+ * @param innerContext   the context created by the CustomCVForce for computing collective variables
+ */
+void CudaCalcCustomCVForceKernel::copyState(ContextImpl& context, ContextImpl& innerContext) {
+    int numAtoms = cu.getNumAtoms();
+    CudaPlatform::PlatformData* innerData = reinterpret_cast<CudaPlatform::PlatformData*>(innerContext.getPlatformData());
+    CudaContext& cu2 = *innerData->contexts[0];
+    if (!hasInitializedListeners) {
+        hasInitializedListeners = true;
+
+        // Register one listener per context, then run both so the arrays hold valid data now.
+
+        ReorderListener* outerListener = new ReorderListener(cu, *invAtomOrder);
+        ReorderListener* innerListener = new ReorderListener(cu2, *innerInvAtomOrder);
+        cu.addReorderListener(outerListener);
+        cu2.addReorderListener(innerListener);
+        outerListener->execute();
+        innerListener->execute();
+    }
+
+    // The correction arrays are only passed in mixed precision; otherwise a null device pointer is passed.
+
+    CUdeviceptr posCorrection = 0;
+    if (cu.getUseMixedPrecision())
+        posCorrection = cu.getPosqCorrection().getDevicePointer();
+    CUdeviceptr posCorrection2 = 0;
+    if (cu2.getUseMixedPrecision())
+        posCorrection2 = cu2.getPosqCorrection().getDevicePointer();
+    void* copyStateArgs[] = {
+        &cu.getPosq().getDevicePointer(),
+        &posCorrection,
+        &cu.getVelm().getDevicePointer(),
+        &cu.getAtomIndexArray().getDevicePointer(),
+        &cu2.getPosq().getDevicePointer(),
+        &posCorrection2,
+        &cu2.getVelm().getDevicePointer(),
+        &innerInvAtomOrder->getDevicePointer(),
+        &numAtoms
+    };
+    cu.executeKernel(copyStateKernel, copyStateArgs, numAtoms);
+
+    // Mirror the remaining host-side state.
+
+    Vec3 boxA, boxB, boxC;
+    context.getPeriodicBoxVectors(boxA, boxB, boxC);
+    innerContext.setPeriodicBoxVectors(boxA, boxB, boxC);
+    innerContext.setTime(context.getTime());
+
+    // Iterate over a copy of the inner parameter map, since setParameter() is called inside the loop.
+
+    map<string, double> innerParameters = innerContext.getParameters();
+    for (map<string, double>::const_iterator entry = innerParameters.begin(); entry != innerParameters.end(); ++entry)
+        innerContext.setParameter(entry->first, context.getParameter(entry->first));
+}
+
 CudaIntegrateVerletStepKernel::~CudaIntegrateVerletStepKernel() {
 }


--- a/platforms/cuda/src/CudaPlatform.cpp
+++ b/platforms/cuda/src/CudaPlatform.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2017 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -91,6 +91,7 @@ CudaPlatform::CudaPlatform() {
    registerKernelFactory(CalcCustomHbondForceKernel::Name(), factory);
    registerKernelFactory(CalcCustomCentroidBondForceKernel::Name(), factory);
    registerKernelFactory(CalcCustomCompoundBondForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomCVForceKernel::Name(), factory);
    registerKernelFactory(CalcCustomManyParticleForceKernel::Name(), factory);
    registerKernelFactory(CalcGayBerneForceKernel::Name(), factory);
    registerKernelFactory(IntegrateVerletStepKernel::Name(), factory);
@@ -198,7 +199,23 @@ void CudaPlatform::contextCreated(ContextImpl& context, const map<string, string
    if (threadsEnv != NULL)
        stringstream(threadsEnv) >> threads;
    context.setPlatformData(new PlatformData(&context, context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, cpuPmePropValue, compilerPropValue, tempPropValue,
-            hostCompilerPropValue, pmeStreamPropValue, deterministicForcesValue, threads));
+            hostCompilerPropValue, pmeStreamPropValue, deterministicForcesValue, threads, NULL));
+}
+
+/**
+ * Create the platform data for a context that is linked to an existing one. Every CUDA
+ * property value and the thread count are read from the original context, and the original
+ * context is handed to the PlatformData constructor.
+ */
+void CudaPlatform::linkedContextCreated(ContextImpl& context, ContextImpl& originalContext) const {
+    Platform& platform = originalContext.getPlatform();
+    Context& owner = originalContext.getOwner();
+    string devicePropValue = platform.getPropertyValue(owner, CudaDeviceIndex());
+    string blockingPropValue = platform.getPropertyValue(owner, CudaUseBlockingSync());
+    string precisionPropValue = platform.getPropertyValue(owner, CudaPrecision());
+    string cpuPmePropValue = platform.getPropertyValue(owner, CudaUseCpuPme());
+    string compilerPropValue = platform.getPropertyValue(owner, CudaCompiler());
+    string tempPropValue = platform.getPropertyValue(owner, CudaTempDirectory());
+    string hostCompilerPropValue = platform.getPropertyValue(owner, CudaHostCompiler());
+    string pmeStreamPropValue = platform.getPropertyValue(owner, CudaDisablePmeStream());
+    string deterministicForcesValue = platform.getPropertyValue(owner, CudaDeterministicForces());
+    PlatformData* originalData = reinterpret_cast<PlatformData*>(originalContext.getPlatformData());
+    int threads = originalData->threads.getNumThreads();
+    PlatformData* linkedData = new PlatformData(&context, context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue,
+            cpuPmePropValue, compilerPropValue, tempPropValue, hostCompilerPropValue, pmeStreamPropValue, deterministicForcesValue,
+            threads, &originalContext);
+    context.setPlatformData(linkedData);
 }

 void CudaPlatform::contextDestroyed(ContextImpl& context) const {
@@ -208,7 +225,7 @@ void CudaPlatform::contextDestroyed(ContextImpl& context) const {

 CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& system, const string& deviceIndexProperty, const string& blockingProperty, const string& precisionProperty,
            const string& cpuPmeProperty, const string& compilerProperty, const string& tempProperty, const string& hostCompilerProperty, const string& pmeStreamProperty,
-            const string& deterministicForcesProperty, int numThreads) :
+            const string& deterministicForcesProperty, int numThreads, ContextImpl* originalContext) :
                context(context), removeCM(false), stepCount(0), computeForceCount(0), time(0.0), hasInitializedContexts(false), threads(numThreads) {
    bool blocking = (blockingProperty == "true");
    vector<string> devices;
@@ -218,16 +235,19 @@ CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& sys
        searchPos = nextPos+1;
    }
    devices.push_back(deviceIndexProperty.substr(searchPos));
+    PlatformData* originalData = NULL;
+    if (originalContext != NULL)
+        originalData = reinterpret_cast<PlatformData*>(originalContext->getPlatformData());
    try {
        for (int i = 0; i < (int) devices.size(); i++) {
            if (devices[i].length() > 0) {
                int deviceIndex;
                stringstream(devices[i]) >> deviceIndex;
-                contexts.push_back(new CudaContext(system, deviceIndex, blocking, precisionProperty, compilerProperty, tempProperty, hostCompilerProperty, *this));
+                contexts.push_back(new CudaContext(system, deviceIndex, blocking, precisionProperty, compilerProperty, tempProperty, hostCompilerProperty, *this, (originalData == NULL ? NULL : originalData->contexts[i])));
            }
        }
        if (contexts.size() == 0)
-            contexts.push_back(new CudaContext(system, -1, blocking, precisionProperty, compilerProperty, tempProperty, hostCompilerProperty, *this));
+            contexts.push_back(new CudaContext(system, -1, blocking, precisionProperty, compilerProperty, tempProperty, hostCompilerProperty, *this, (originalData == NULL ? NULL : originalData->contexts[0])));
    }
    catch (...) {
        // If an exception was thrown, do our best to clean up memory.

--- a/platforms/cuda/src/kernels/customCVForce.cu
+++ b/platforms/cuda/src/kernels/customCVForce.cu
+/**
+ * Copy the positions and velocities to the inner context.
+ *
+ * Atom i of the outer arrays is written to slot innerInvAtomOrder[atomOrder[i]] of the inner
+ * arrays. The position correction is copied only when USE_MIXED_PRECISION is defined.
+ */
+extern "C" __global__ void copyState(real4* posq, real4* posqCorrection, mixed4* velm, int* __restrict__ atomOrder,
+        real4* innerPosq, real4* innerPosqCorrection, mixed4* innerVelm, int* __restrict__ innerInvAtomOrder,
+        int numAtoms) {
+    const int stride = blockDim.x*gridDim.x;
+    int source = blockIdx.x*blockDim.x+threadIdx.x;
+    while (source < numAtoms) {
+        const int target = innerInvAtomOrder[atomOrder[source]];
+        innerPosq[target] = posq[source];
+        innerVelm[target] = velm[source];
+#ifdef USE_MIXED_PRECISION
+        innerPosqCorrection[target] = posqCorrection[source];
+#endif
+        source += stride;
+    }
+}
+
+/**
+ * Copy the forces back to the main context.
+ *
+ * Both buffers hold three consecutive components of paddedNumAtoms entries each. Entry i of
+ * the inner buffer is written to slot invAtomOrder[innerAtomOrder[i]] of the outer buffer.
+ * Only the first numAtoms entries of each component are written.
+ */
+extern "C" __global__ void copyForces(long long* forces, int* __restrict__ invAtomOrder, long long* innerForces,
+        int* __restrict__ innerAtomOrder, int numAtoms, int paddedNumAtoms) {
+    const int stride = blockDim.x*gridDim.x;
+    int source = blockIdx.x*blockDim.x+threadIdx.x;
+    while (source < numAtoms) {
+        const int target = invAtomOrder[innerAtomOrder[source]];
+        forces[target] = innerForces[source];
+        forces[target+paddedNumAtoms] = innerForces[source+paddedNumAtoms];
+        forces[target+paddedNumAtoms*2] = innerForces[source+paddedNumAtoms*2];
+        source += stride;
+    }
+}
+
+/**
+ * Add all the forces from the CVs.
+ *
+ * This source is a template: the two upper-case placeholder tokens below are substituted by
+ * the host code before compilation. One supplies an extra (force buffer, scale factor) pair of
+ * arguments per collective variable; the other supplies one accumulation statement per
+ * collective variable. The generated statements refer to the names "forces" and "i", so
+ * neither may be renamed here.
+ */
+extern "C" __global__ void addForces(long long* forces, int bufferSize
+    PARAMETER_ARGUMENTS) {
+    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < bufferSize; i += blockDim.x*gridDim.x) {
+        ADD_FORCES
+    }
+}
--- a/platforms/cuda/tests/TestCudaCustomCVForce.cpp
+++ b/platforms/cuda/tests/TestCudaCustomCVForce.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2017 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "CudaTests.h"
+#include "TestCustomCVForce.h"
+
+void runPlatformTests() { // intentionally empty: this file defines no CUDA-only tests; presumably invoked by TestCustomCVForce.h (confirm)
+}
--- a/platforms/cuda/tests/TestCudaFFT3D.cpp
+++ b/platforms/cuda/tests/TestCudaFFT3D.cpp
@@ -56,7 +56,7 @@ void testTransform(bool realToComplex, int xsize, int ysize, int zsize) {
    system.addParticle(0.0);
    CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), "false",
            platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()),
-            platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), "false", 1);
+            platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), "false", 1, NULL);
    CudaContext& context = *platformData.contexts[0];
    context.initialize();
    OpenMM_SFMT::SFMT sfmt;

--- a/platforms/cuda/tests/TestCudaRandom.cpp
+++ b/platforms/cuda/tests/TestCudaRandom.cpp
@@ -56,7 +56,7 @@ void testGaussian() {
        system.addParticle(1.0);
    CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), "false",
            platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()),
-            platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), "false", 1);
+            platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), "false", 1, NULL);
    CudaContext& context = *platformData.contexts[0];
    context.initialize();
    context.getIntegrationUtilities().initRandomNumberGenerator(0);

--- a/platforms/cuda/tests/TestCudaSort.cpp
+++ b/platforms/cuda/tests/TestCudaSort.cpp
@@ -66,7 +66,7 @@ void verifySorting(vector<float> array) {
    system.addParticle(0.0);
    CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), "false",
            platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()),
-            platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), "false", 1);
+            platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), "false", 1, NULL);
    CudaContext& context = *platformData.contexts[0];
    context.initialize();
    CudaArray data(context, array.size(), 4, "sortData");

--- a/platforms/opencl/src/OpenCLPlatform.cpp
+++ b/platforms/opencl/src/OpenCLPlatform.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2017 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *

--- a/tests/TestCustomCVForce.h
+++ b/tests/TestCustomCVForce.h
@@ -37,8 +37,10 @@
 #include "openmm/CustomBondForce.h"
 #include "openmm/CustomCVForce.h"
 #include "openmm/CustomExternalForce.h"
+#include "openmm/CustomNonbondedForce.h"
 #include "openmm/System.h"
 #include "openmm/VerletIntegrator.h"
+#include "sfmt/SFMT.h"
 #include <iostream>
 #include <vector>

@@ -176,6 +178,50 @@ void testTabulatedFunction() {
    }
 }

+/**
+ * Check that CustomCVForce reports the correct energy and forces in a system that also
+ * contains a nonbonded force and enough particles to trigger atom reordering on the GPU.
+ *
+ * The CV force's energy expression is "2*v2", where v2 is a CustomBondForce ("r+1") with a
+ * single bond between particles 5 and 10.  v1 is registered as a second collective variable
+ * but does not appear in the energy expression.  The CV force is placed in force group 2 and
+ * only that group is queried, so the expected energy is 2*(r+1) and the only nonzero forces
+ * checked are those on particles 5 and 10.
+ */
+void testReordering() {
+    // Create a larger system with a nonbonded force, since that will trigger atom
+    // reordering on the GPU.
+
+    const int numParticles = 100;
+    System system;
+    CustomCVForce* cv = new CustomCVForce("2*v2");
+    CustomNonbondedForce* v1 = new CustomNonbondedForce("r");
+    v1->addPerParticleParameter("a");
+    CustomBondForce* v2 = new CustomBondForce("r+1");
+    v2->addBond(5, 10);
+    cv->addCollectiveVariable("v1", v1);
+    cv->addCollectiveVariable("v2", v2);
+    cv->setForceGroup(2);
+    system.addForce(cv);
+    CustomNonbondedForce* nb = new CustomNonbondedForce("r^2");
+    nb->addPerParticleParameter("a");
+    system.addForce(nb);
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+    vector<Vec3> positions;
+    vector<double> params(1);
+    for (int i = 0; i < numParticles; i++) {
+        system.addParticle(1.0);
+        params[0] = i%2;
+        v1->addParticle(params);
+        params[0] = (i < numParticles/2 ? 2.0 : 3.0);
+        nb->addParticle(params);
+
+        // Draw the three coordinates in separate statements.  The order in which function
+        // arguments are evaluated is unspecified in C++, so calling the generator three
+        // times inside a single Vec3(...) expression could produce different positions
+        // with different compilers even though the seed is fixed.
+        const double x = genrand_real2(sfmt);
+        const double y = genrand_real2(sfmt);
+        const double z = genrand_real2(sfmt);
+        positions.push_back(Vec3(x, y, z)*10);
+    }
+
+    // Make sure it works correctly.
+
+    VerletIntegrator integrator(0.01);
+    Context context(system, integrator, platform);
+    context.setPositions(positions);
+    // Query only force group 2 (the CV force), so the "r^2" nonbonded force is excluded.
+    State state = context.getState(State::Energy | State::Forces, false, 1<<2);
+    Vec3 delta = positions[5]-positions[10];
+    double r = sqrt(delta.dot(delta));
+    ASSERT_EQUAL_TOL(2*(r+1), state.getPotentialEnergy(), 1e-5);
+    // E = 2*(r+1), so the force on particle 5 is -dE/dx5 = -2*delta/r and the force on
+    // particle 10 is equal and opposite.
+    ASSERT_EQUAL_VEC(-delta*2/r, state.getForces()[5], 1e-5);
+    ASSERT_EQUAL_VEC(delta*2/r, state.getForces()[10], 1e-5);
+}
+
 void runPlatformTests();

 int main(int argc, char* argv[]) {
@@ -184,6 +230,7 @@ int main(int argc, char* argv[]) {
        testCVs();
        testEnergyParameterDerivatives();
        testTabulatedFunction();
+        testReordering();
        runPlatformTests();
    }
    catch(const exception& e) {

--- a/wrappers/python/src/swig_doxygen/swigInputBuilder.py
+++ b/wrappers/python/src/swig_doxygen/swigInputBuilder.py
@@ -280,6 +280,11 @@ class SwigInputBuilder:
            self.fOut.write(",\n         OpenMM::%s" % name)
        self.fOut.write(");\n\n")

+        self.fOut.write("%factory(OpenMM::Force& OpenMM::CustomCVForce::getCollectiveVariable")
+        for name in sorted(forceSubclassList):
+            self.fOut.write(",\n         OpenMM::%s" % name)
+        self.fOut.write(");\n\n")
+
        self.fOut.write("%factory(OpenMM::Integrator* OpenMM::Integrator::__copy__")
        for name in sorted(integratorSubclassList):
            self.fOut.write(",\n         OpenMM::%s" % name)

--- a/wrappers/python/src/swig_doxygen/swigInputConfig.py
+++ b/wrappers/python/src/swig_doxygen/swigInputConfig.py
@@ -390,6 +390,8 @@ UNITS = {
 ("CustomTorsionForce", "getPerTorsionParameterName") : (None, ()),
 ("CustomTorsionForce", "getGlobalParameterName") : (None, ()),
 ("CustomTorsionForce", "getTorsionParameters") : (None, ()),
+("CustomCVForce", "getCollectiveVariable") : (None, ()),
+("CustomCVForce", "getInnerContext") : (None, ()),
 ("DrudeForce", "getParticleParameters") : (None, (None, None, None, None, None, 'unit.elementary_charge', 'unit.nanometer**3', None, None)),
 ("DrudeForce", "getNumScreenedPairs") : (None, ()),
 ("DrudeForce", "getScreenedPairParameters") : (None, ()),