Continuing to implement new CUDA platform: constraints, LangevinIntegrator,...

Continuing to implement new CUDA platform: constraints, LangevinIntegrator, BrownianIntegrator, VariableLangevinIntegrator, VariableVerletIntegrator

Continuing to implement new CUDA platform: constraints, LangevinIntegrator,...
Continuing to implement new CUDA platform: constraints, LangevinIntegrator, BrownianIntegrator, VariableLangevinIntegrator, VariableVerletIntegrator
ecbbf442 · Peter Eastman · 03cc3523 · ecbbf442 · ecbbf442 · ecbbf442
Commit ecbbf442 authored Jun 16, 2012 by Peter Eastman
20 changed files
--- a/platforms/cuda2/src/CudaArray.cpp
+++ b/platforms/cuda2/src/CudaArray.cpp
@@ -32,8 +32,8 @@
 using namespace OpenMM;
-CudaArray::CudaArray(int size, int elementSize, const std::string& name) :
+CudaArray::CudaArray(CudaContext& context, int size, int elementSize, const std::string& name) :
-        size(size), elementSize(elementSize), name(name), ownsMemory(true) {
+        context(context), size(size), elementSize(elementSize), name(name), ownsMemory(true) {
    CUresult result = cuMemAlloc(&pointer, size*elementSize);
    if (result != CUDA_SUCCESS) {
        std::stringstream str;
@@ -43,7 +43,7 @@ CudaArray::CudaArray(int size, int elementSize, const std::string& name) :
 }
 CudaArray::~CudaArray() {
-    if (ownsMemory) {
+    if (ownsMemory && context.getContextIsValid()) {
        CUresult result = cuMemFree(pointer);
        if (result != CUDA_SUCCESS) {
            std::stringstream str;

--- a/platforms/cuda2/src/CudaArray.h
+++ b/platforms/cuda2/src/CudaArray.h
@@ -35,6 +35,8 @@
 namespace OpenMM {
+class CudaContext;
 /**
 * This class encapsulates a block of CUDA device memory.  It provides a simplified API
 * for working with it and for copying data to and from device memory.
@@ -46,21 +48,23 @@ public:
     * Create a CudaArray object.  The object is allocated on the heap with the "new" operator.
     * The template argument is the data type of each array element.
     *
+     * @param context           the context for which to create the array
     * @param size              the number of elements in the array
     * @param name              the name of the array
     */
    template <class T>
-    static CudaArray* create(int size, const std::string& name) {
+    static CudaArray* create(CudaContext& context, int size, const std::string& name) {
-        return new CudaArray(size, sizeof(T), name);
+        return new CudaArray(context, size, sizeof(T), name);
    }
    /**
     * Create a CudaArray object.
     *
+     * @param context           the context for which to create the array
     * @param size              the number of elements in the array
     * @param elementSize       the size of each element in bytes
     * @param name              the name of the array
     */
-    CudaArray(int size, int elementSize, const std::string& name);
+    CudaArray(CudaContext& context, int size, int elementSize, const std::string& name);
    ~CudaArray();
    /**
     * Get the number of elements in the array.
@@ -123,6 +127,7 @@ public:
     */
    void download(void* data, bool blocking = true) const;
 private:
+    CudaContext& context;
    CUdeviceptr pointer;
    int size, elementSize;
    bool ownsMemory;

--- a/platforms/cuda2/src/CudaBondedUtilities.cpp
+++ b/platforms/cuda2/src/CudaBondedUtilities.cpp
@@ -86,7 +86,7 @@ void CudaBondedUtilities::initialize(const System& system) {
                for (int atom = 0; atom < width; atom++)
                    indexVec[bond*width+atom] = forceAtoms[i][bond][startAtom+atom];
            }
-            CudaArray* indices = new CudaArray(numBonds, 4*width, "bondedIndices");
+            CudaArray* indices = new CudaArray(context, numBonds, 4*width, "bondedIndices");
            indices->upload(&indexVec[0]);
            atomIndices[i].push_back(indices);
            startAtom += width;

--- a/platforms/cuda2/src/CudaContext.cpp
+++ b/platforms/cuda2/src/CudaContext.cpp
@@ -141,23 +141,23 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    nonbonded = new CudaNonbondedUtilities(*this);
    int numEnergyBuffers = max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers());
    if (useDoublePrecision) {
-        posq = CudaArray::create<double4>(paddedNumAtoms, "posq");
+        posq = CudaArray::create<double4>(*this, paddedNumAtoms, "posq");
-        velm = CudaArray::create<double4>(paddedNumAtoms, "velm");
+        velm = CudaArray::create<double4>(*this, paddedNumAtoms, "velm");
        compilationDefines["USE_DOUBLE_PRECISION"] = "1";
        compilationDefines["make_real2"] = "make_double2";
        compilationDefines["make_real3"] = "make_double3";
        compilationDefines["make_real4"] = "make_double4";
-        energyBuffer = CudaArray::create<double>(numEnergyBuffers, "energyBuffer");
+        energyBuffer = CudaArray::create<double>(*this, numEnergyBuffers, "energyBuffer");
        int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
    }
    else {
-        posq = CudaArray::create<float4>(paddedNumAtoms, "posq");
+        posq = CudaArray::create<float4>(*this, paddedNumAtoms, "posq");
-        velm = CudaArray::create<float4>(paddedNumAtoms, "velm");
+        velm = CudaArray::create<float4>(*this, paddedNumAtoms, "velm");
        compilationDefines["make_real2"] = "make_float2";
        compilationDefines["make_real3"] = "make_float3";
        compilationDefines["make_real4"] = "make_float4";
-        energyBuffer = CudaArray::create<float>(numEnergyBuffers, "energyBuffer");
+        energyBuffer = CudaArray::create<float>(*this, numEnergyBuffers, "energyBuffer");
        int pinnedBufferSize = max(paddedNumAtoms*6, numEnergyBuffers);
        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(float), 0));
    }
@@ -198,7 +198,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
 }
 CudaContext::~CudaContext() {
-    cuCtxSetCurrent(context);
+    setAsCurrent();
    for (int i = 0; i < (int) forces.size(); i++)
        delete forces[i];
    for (int i = 0; i < (int) reorderListeners.size(); i++)
@@ -226,6 +226,7 @@ CudaContext::~CudaContext() {
    string errorMessage = "Error deleting Context";
    if (contextIsValid)
        CHECK_RESULT(cuCtxDestroy(context));
+    contextIsValid = false;
 }
 void CudaContext::initialize() {
@@ -240,10 +241,10 @@ void CudaContext::initialize() {
    }
    velm->upload(pinnedBuffer);
    bonded->initialize(system);
-    force = CudaArray::create<long long>(paddedNumAtoms*3, "force");
+    force = CudaArray::create<long long>(*this, paddedNumAtoms*3, "force");
    addAutoclearBuffer(force->getDevicePointer(), force->getSize()*force->getElementSize());
    addAutoclearBuffer(energyBuffer->getDevicePointer(), energyBuffer->getSize()*energyBuffer->getElementSize());
-    atomIndexDevice = CudaArray::create<int>(paddedNumAtoms, "atomIndex");
+    atomIndexDevice = CudaArray::create<int>(*this, paddedNumAtoms, "atomIndex");
    atomIndex.resize(paddedNumAtoms);
    for (int i = 0; i < paddedNumAtoms; ++i)
        atomIndex[i] = i;
@@ -257,6 +258,11 @@ void CudaContext::addForce(CudaForceInfo* force) {
    forces.push_back(force);
 }
+void CudaContext::setAsCurrent() {
+    if (contextIsValid)
+        cuCtxSetCurrent(context);
+}
 string CudaContext::replaceStrings(const string& input, const std::map<std::string, std::string>& replacements) const {
    string result = input;
    for (map<string, string>::const_iterator iter = replacements.begin(); iter != replacements.end(); iter++) {

--- a/platforms/cuda2/src/CudaContext.h
+++ b/platforms/cuda2/src/CudaContext.h
@@ -88,6 +88,17 @@ public:
    CUcontext getContext() {
        return context;
    }
+    /**
+     * Get whether the CUcontext associated with this object is currently a valid contex.
+     */
+    bool getContextIsValid() const {
+        return contextIsValid;
+    }
+    /**
+     * Set the CUcontext associated with this object to be the current context.  If the context is not
+     * valid, this returns without doing anything.
+     */
+    void setAsCurrent();
    /**
     * Get the CUdevice associated with this object.
     */

--- a/platforms/cuda2/src/CudaIntegrationUtilities.cpp
+++ b/platforms/cuda2/src/CudaIntegrationUtilities.cpp
@@ -101,22 +101,22 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
        ccmaReducedMass(NULL), ccmaAtomConstraints(NULL), ccmaNumAtomConstraints(NULL), ccmaConstraintMatrixColumn(NULL),
        ccmaConstraintMatrixValue(NULL), ccmaDelta1(NULL), ccmaDelta2(NULL), ccmaConverged(NULL),
        ccmaConvergedMemory(NULL), vsite2AvgAtoms(NULL), vsite2AvgWeights(NULL), vsite3AvgAtoms(NULL), vsite3AvgWeights(NULL),
-        vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL), hasInitializedPosConstraintKernels(false), hasInitializedVelConstraintKernels(false) {
+        vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL) {
    // Create workspace arrays.
    if (context.getUseDoublePrecision()) {
-        posDelta = CudaArray::create<double4>(context.getPaddedNumAtoms(), "posDelta");
+        posDelta = CudaArray::create<double4>(context, context.getPaddedNumAtoms(), "posDelta");
        vector<double4> deltas(posDelta->getSize(), make_double4(0.0, 0.0, 0.0, 0.0));
        posDelta->upload(deltas);
-        stepSize = CudaArray::create<double2>(1, "stepSize");
+        stepSize = CudaArray::create<double2>(context, 1, "stepSize");
        vector<double2> step(1, make_double2(0.0f, 0.0f));
        stepSize->upload(step);
    }
    else {
-        posDelta = CudaArray::create<float4>(context.getPaddedNumAtoms(), "posDelta");
+        posDelta = CudaArray::create<float4>(context, context.getPaddedNumAtoms(), "posDelta");
        vector<float4> deltas(posDelta->getSize(), make_float4(0.0, 0.0, 0.0, 0.0));
        posDelta->upload(deltas);
-        stepSize = CudaArray::create<float2>(1, "stepSize");
+        stepSize = CudaArray::create<float2>(context, 1, "stepSize");
        vector<float2> step(1, make_float2(0.0f, 0.0f));
        stepSize->upload(step);
    }
@@ -125,13 +125,13 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
    map<string, string> velocityDefines;
    velocityDefines["CONSTRAIN_VELOCITIES"] = "1";
-//    CUmodule settleModule = context.createModule(CudaKernelSources::settle);
+    CUmodule settleModule = context.createModule(CudaKernelSources::vectorOps+CudaKernelSources::settle);
-//    settlePosKernel = context.getKernel(settleModule, "applySettle");
+    settlePosKernel = context.getKernel(settleModule, "applySettle");
-//    settleVelKernel = context.getKernel(settleModule, "constrainVelocities");
+    settleVelKernel = context.getKernel(settleModule, "constrainVelocities");
-//    CUmodule shakeModule = context.createModule(CudaKernelSources::shakeHydrogens);
+    CUmodule shakeModule = context.createModule(CudaKernelSources::vectorOps+CudaKernelSources::shakeHydrogens);
-//    shakePosKernel = context.getKernel(shakeModule, "applyShakeToHydrogens");
+    shakePosKernel = context.getKernel(shakeModule, "applyShakeToHydrogens");
-//    shakeModule = context.createModule(CudaKernelSources::shakeHydrogens, velocityDefines);
+    shakeModule = context.createModule(CudaKernelSources::vectorOps+CudaKernelSources::shakeHydrogens, velocityDefines);
-//    shakeVelKernel = context.getKernel(shakeModule, "applyShakeToHydrogens");
+    shakeVelKernel = context.getKernel(shakeModule, "applyShakeToHydrogens");
    // Record the set of constraints and how many constraints each atom is involved in.
@@ -210,8 +210,8 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
            isShakeAtom[atom2] = true;
            isShakeAtom[atom3] = true;
        }
-        settleAtoms = CudaArray::create<int4>(atoms.size(), "settleAtoms");
+        settleAtoms = CudaArray::create<int4>(context, atoms.size(), "settleAtoms");
-        settleParams = CudaArray::create<float2>(params.size(), "settleParams");
+        settleParams = CudaArray::create<float2>(context, params.size(), "settleParams");
        settleAtoms->upload(atoms);
        settleParams->upload(params);
    }
@@ -292,8 +292,8 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
                isShakeAtom[cluster.peripheralID[2]] = true;
            ++index;
        }
-        shakeAtoms = CudaArray::create<int4>(atoms.size(), "shakeAtoms");
+        shakeAtoms = CudaArray::create<int4>(context, atoms.size(), "shakeAtoms");
-        shakeParams = CudaArray::create<float4>(params.size(), "shakeParams");
+        shakeParams = CudaArray::create<float4>(context, params.size(), "shakeParams");
        shakeAtoms->upload(atoms);
        shakeParams->upload(params);
    }
@@ -475,36 +475,67 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
        // Record the CCMA data structures.
-        ccmaAtoms = CudaArray::create<int2>(numCCMA, "CcmaAtoms");
+        ccmaAtoms = CudaArray::create<int2>(context, numCCMA, "CcmaAtoms");
-        ccmaDistance = CudaArray::create<float4>(numCCMA, "CcmaDistance");
+        ccmaAtomConstraints = CudaArray::create<int>(context, numAtoms*maxAtomConstraints, "CcmaAtomConstraints");
-        ccmaAtomConstraints = CudaArray::create<int>(numAtoms*maxAtomConstraints, "CcmaAtomConstraints");
+        ccmaNumAtomConstraints = CudaArray::create<int>(context, numAtoms, "CcmaAtomConstraintsIndex");
-        ccmaNumAtomConstraints = CudaArray::create<int>(numAtoms, "CcmaAtomConstraintsIndex");
+        ccmaConverged = CudaArray::create<int>(context, 2, "CcmaConverged");
-        ccmaDelta1 = CudaArray::create<float>(numCCMA, "CcmaDelta1");
-        ccmaDelta2 = CudaArray::create<float>(numCCMA, "CcmaDelta2");
-        ccmaConverged = CudaArray::create<int>(2, "CcmaConverged");
        CHECK_RESULT2(cuMemHostAlloc((void**) &ccmaConvergedMemory, 2*sizeof(int), 0), "Error allocating pinned memory");
-        ccmaReducedMass = CudaArray::create<float>(numCCMA, "CcmaReducedMass");
+        ccmaConstraintMatrixColumn = CudaArray::create<int>(context, numCCMA*maxRowElements, "ConstraintMatrixColumn");
-        ccmaConstraintMatrixColumn = CudaArray::create<int>(numCCMA*maxRowElements, "ConstraintMatrixColumn");
-        ccmaConstraintMatrixValue = CudaArray::create<float>(numCCMA*maxRowElements, "ConstraintMatrixValue");
        vector<int2> atomsVec(ccmaAtoms->getSize());
-        vector<float4> distanceVec(ccmaDistance->getSize());
        vector<int> atomConstraintsVec(ccmaAtomConstraints->getSize());
        vector<int> numAtomConstraintsVec(ccmaNumAtomConstraints->getSize());
-        vector<float> reducedMassVec(ccmaReducedMass->getSize());
        vector<int> constraintMatrixColumnVec(ccmaConstraintMatrixColumn->getSize());
-        vector<float> constraintMatrixValueVec(ccmaConstraintMatrixValue->getSize());
+        if (context.getUseDoublePrecision()) {
-        for (int i = 0; i < numCCMA; i++) {
+            ccmaDistance = CudaArray::create<double4>(context, numCCMA, "CcmaDistance");
-            int index = constraintOrder[i];
+            ccmaDelta1 = CudaArray::create<double>(context, numCCMA, "CcmaDelta1");
-            int c = ccmaConstraints[index];
+            ccmaDelta2 = CudaArray::create<double>(context, numCCMA, "CcmaDelta2");
-            atomsVec[i].x = atom1[c];
+            ccmaReducedMass = CudaArray::create<double>(context, numCCMA, "CcmaReducedMass");
-            atomsVec[i].y = atom2[c];
+            ccmaConstraintMatrixValue = CudaArray::create<double>(context, numCCMA*maxRowElements, "ConstraintMatrixValue");
-            distanceVec[i].w = (float) distance[c];
+            vector<double4> distanceVec(ccmaDistance->getSize());
-            reducedMassVec[i] = (float) (0.5/(1.0/system.getParticleMass(atom1[c])+1.0/system.getParticleMass(atom2[c])));
+            vector<double> reducedMassVec(ccmaReducedMass->getSize());
-            for (unsigned int j = 0; j < matrix[index].size(); j++) {
+            vector<double> constraintMatrixValueVec(ccmaConstraintMatrixValue->getSize());
-                constraintMatrixColumnVec[i+j*numCCMA] = matrix[index][j].first;
+            for (int i = 0; i < numCCMA; i++) {
-                constraintMatrixValueVec[i+j*numCCMA] = (float) matrix[index][j].second;
+                int index = constraintOrder[i];
+                int c = ccmaConstraints[index];
+                atomsVec[i].x = atom1[c];
+                atomsVec[i].y = atom2[c];
+                distanceVec[i].w = distance[c];
+                reducedMassVec[i] = (0.5/(1.0/system.getParticleMass(atom1[c])+1.0/system.getParticleMass(atom2[c])));
+                for (unsigned int j = 0; j < matrix[index].size(); j++) {
+                    constraintMatrixColumnVec[i+j*numCCMA] = matrix[index][j].first;
+                    constraintMatrixValueVec[i+j*numCCMA] = matrix[index][j].second;
+                }
+                constraintMatrixColumnVec[i+matrix[index].size()*numCCMA] = numCCMA;
+            }
+            ccmaDistance->upload(distanceVec);
+            ccmaReducedMass->upload(reducedMassVec);
+            ccmaConstraintMatrixValue->upload(constraintMatrixValueVec);
+        }
+        else {
+            ccmaDistance = CudaArray::create<float4>(context, numCCMA, "CcmaDistance");
+            ccmaDelta1 = CudaArray::create<float>(context, numCCMA, "CcmaDelta1");
+            ccmaDelta2 = CudaArray::create<float>(context, numCCMA, "CcmaDelta2");
+            ccmaReducedMass = CudaArray::create<float>(context, numCCMA, "CcmaReducedMass");
+            ccmaConstraintMatrixValue = CudaArray::create<float>(context, numCCMA*maxRowElements, "ConstraintMatrixValue");
+            vector<float4> distanceVec(ccmaDistance->getSize());
+            vector<float> reducedMassVec(ccmaReducedMass->getSize());
+            vector<float> constraintMatrixValueVec(ccmaConstraintMatrixValue->getSize());
+            for (int i = 0; i < numCCMA; i++) {
+                int index = constraintOrder[i];
+                int c = ccmaConstraints[index];
+                atomsVec[i].x = atom1[c];
+                atomsVec[i].y = atom2[c];
+                distanceVec[i].w = (float) distance[c];
+                reducedMassVec[i] = (float) (0.5/(1.0/system.getParticleMass(atom1[c])+1.0/system.getParticleMass(atom2[c])));
+                for (unsigned int j = 0; j < matrix[index].size(); j++) {
+                    constraintMatrixColumnVec[i+j*numCCMA] = matrix[index][j].first;
+                    constraintMatrixValueVec[i+j*numCCMA] = (float) matrix[index][j].second;
+                }
+                constraintMatrixColumnVec[i+matrix[index].size()*numCCMA] = numCCMA;
            }
-            constraintMatrixColumnVec[i+matrix[index].size()*numCCMA] = numCCMA;
+            ccmaDistance->upload(distanceVec);
+            ccmaReducedMass->upload(reducedMassVec);
+            ccmaConstraintMatrixValue->upload(constraintMatrixValueVec);
        }
        for (unsigned int i = 0; i < atomConstraints.size(); i++) {
            numAtomConstraintsVec[i] = atomConstraints[i].size();
@@ -514,27 +545,25 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
            }
        }
        ccmaAtoms->upload(atomsVec);
-        ccmaDistance->upload(distanceVec);
        ccmaAtomConstraints->upload(atomConstraintsVec);
        ccmaNumAtomConstraints->upload(numAtomConstraintsVec);
-        ccmaReducedMass->upload(reducedMassVec);
        ccmaConstraintMatrixColumn->upload(constraintMatrixColumnVec);
-        ccmaConstraintMatrixValue->upload(constraintMatrixValueVec);
        // Create the CCMA kernels.
        map<string, string> defines;
        defines["NUM_CONSTRAINTS"] = context.intToString(numCCMA);
        defines["NUM_ATOMS"] = context.intToString(numAtoms);
-//        CUmodule ccmaModule = context.createModule(CudaKernelSources::ccma, defines);
+        CUmodule ccmaModule = context.createModule(CudaKernelSources::vectorOps+CudaKernelSources::ccma, defines);
-//        ccmaDirectionsKernel = context.getKernel(ccmaModule, "computeConstraintDirections");
+        ccmaDirectionsKernel = context.getKernel(ccmaModule, "computeConstraintDirections");
-//        ccmaPosForceKernel = context.getKernel(ccmaModule, "computeConstraintForce");
+        ccmaPosForceKernel = context.getKernel(ccmaModule, "computeConstraintForce");
-//        ccmaMultiplyKernel = context.getKernel(ccmaModule, "multiplyByConstraintMatrix");
+        ccmaMultiplyKernel = context.getKernel(ccmaModule, "multiplyByConstraintMatrix");
-//        ccmaPosUpdateKernel = context.getKernel(ccmaModule, "updateAtomPositions");
+        ccmaPosUpdateKernel = context.getKernel(ccmaModule, "updateAtomPositions");
-//        defines["CONSTRAIN_VELOCITIES"] = "1";
+        defines["CONSTRAIN_VELOCITIES"] = "1";
-//        ccmaModule = context.createModule(CudaKernelSources::ccma, defines);
+        ccmaModule = context.createModule(CudaKernelSources::vectorOps+CudaKernelSources::ccma, defines);
-//        ccmaVelForceKernel = context.getKernel(ccmaModule, "computeConstraintForce");
+        ccmaVelForceKernel = context.getKernel(ccmaModule, "computeConstraintForce");
-//        ccmaVelUpdateKernel = context.getKernel(ccmaModule, "updateAtomPositions");
+        ccmaVelUpdateKernel = context.getKernel(ccmaModule, "updateAtomPositions");
+        CHECK_RESULT2(cuEventCreate(&ccmaEvent, CU_EVENT_DISABLE_TIMING), "Error creating event for CCMA");
    }
    // Build the list of virtual sites.
@@ -573,12 +602,12 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
    int num2Avg = vsite2AvgAtomVec.size();
    int num3Avg = vsite3AvgAtomVec.size();
    int numOutOfPlane = vsiteOutOfPlaneAtomVec.size();
-    vsite2AvgAtoms = CudaArray::create<int4>(max(1, num2Avg), "vsite2AvgAtoms");
+    vsite2AvgAtoms = CudaArray::create<int4>(context, max(1, num2Avg), "vsite2AvgAtoms");
-    vsite2AvgWeights = CudaArray::create<float2>(max(1, num2Avg), "vsite2AvgWeights");
+    vsite2AvgWeights = CudaArray::create<float2>(context, max(1, num2Avg), "vsite2AvgWeights");
-    vsite3AvgAtoms = CudaArray::create<int4>(max(1, num3Avg), "vsite3AvgAtoms");
+    vsite3AvgAtoms = CudaArray::create<int4>(context, max(1, num3Avg), "vsite3AvgAtoms");
-    vsite3AvgWeights = CudaArray::create<float4>(max(1, num3Avg), "vsite3AvgWeights");
+    vsite3AvgWeights = CudaArray::create<float4>(context, max(1, num3Avg), "vsite3AvgWeights");
-    vsiteOutOfPlaneAtoms = CudaArray::create<int4>(max(1, numOutOfPlane), "vsiteOutOfPlaneAtoms");
+    vsiteOutOfPlaneAtoms = CudaArray::create<int4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneAtoms");
-    vsiteOutOfPlaneWeights = CudaArray::create<float4>(max(1, numOutOfPlane), "vsiteOutOfPlaneWeights");
+    vsiteOutOfPlaneWeights = CudaArray::create<float4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneWeights");
    if (num2Avg > 0) {
        vsite2AvgAtoms->upload(vsite2AvgAtomVec);
        vsite2AvgWeights->upload(vsite2AvgWeightVec);
@@ -620,6 +649,7 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
 }
 CudaIntegrationUtilities::~CudaIntegrationUtilities() {
+    context.setAsCurrent();
    if (posDelta != NULL)
        delete posDelta;
    if (settleAtoms != NULL)
@@ -681,97 +711,69 @@ void CudaIntegrationUtilities::applyVelocityConstraints(double tol) {
 }
 void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double tol) {
-//    bool hasInitialized;
+    CUfunction settleKernel, shakeKernel, ccmaForceKernel, ccmaUpdateKernel;
-//    CUfunction settleKernel, shakeKernel, ccmaForceKernel, ccmaUpdateKernel;
+    if (constrainVelocities) {
-//    if (constrainVelocities) {
+        settleKernel = settleVelKernel;
-//        hasInitialized = hasInitializedVelConstraintKernels;
+        shakeKernel = shakeVelKernel;
-//        settleKernel = settleVelKernel;
+        ccmaForceKernel = ccmaVelForceKernel;
-//        shakeKernel = shakeVelKernel;
+        ccmaUpdateKernel = ccmaVelUpdateKernel;
-//        ccmaForceKernel = ccmaVelForceKernel;
+    }
-//        ccmaUpdateKernel = ccmaVelUpdateKernel;
+    else {
-//        hasInitializedVelConstraintKernels = true;
+        settleKernel = settlePosKernel;
-//    }
+        shakeKernel = shakePosKernel;
-//    else {
+        ccmaForceKernel = ccmaPosForceKernel;
-//        hasInitialized = hasInitializedPosConstraintKernels;
+        ccmaUpdateKernel = ccmaPosUpdateKernel;
-//        settleKernel = settlePosKernel;
+    }
-//        shakeKernel = shakePosKernel;
+    float floatTol = (float) tol;
-//        ccmaForceKernel = ccmaPosForceKernel;
+    if (settleAtoms != NULL) {
-//        ccmaUpdateKernel = ccmaPosUpdateKernel;
+        int numClusters = settleAtoms->getSize();
-//        hasInitializedPosConstraintKernels = true;
+        void* args[] = {&numClusters, &floatTol, &context.getPosq().getDevicePointer(),
-//    }
+                &posDelta->getDevicePointer(), &context.getVelm().getDevicePointer(),
-//    if (settleAtoms != NULL) {
+                &settleAtoms->getDevicePointer(), &settleParams->getDevicePointer()};
-//        if (!hasInitialized) {
+        context.executeKernel(settleKernel, args, settleAtoms->getSize());
-//            settleKernel.setArg<int>(0, settleAtoms->getSize());
+    }
-//            settleKernel.setArg<cl::Buffer>(2, context.getPosq().getDeviceBuffer());
+    if (shakeAtoms != NULL) {
-//            settleKernel.setArg<cl::Buffer>(3, posDelta->getDeviceBuffer());
+        int numClusters = shakeAtoms->getSize();
-//            settleKernel.setArg<cl::Buffer>(4, context.getVelm().getDeviceBuffer());
+        void* args[] = {&numClusters, &floatTol, &context.getPosq().getDevicePointer(),
-//            settleKernel.setArg<cl::Buffer>(5, settleAtoms->getDeviceBuffer());
+                constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta->getDevicePointer(),
-//            settleKernel.setArg<cl::Buffer>(6, settleParams->getDeviceBuffer());
+                &shakeAtoms->getDevicePointer(), &shakeParams->getDevicePointer()};
-//        }
+        context.executeKernel(shakeKernel, args, shakeAtoms->getSize());
-//        settleKernel.setArg<float>(1, (float) tol);
+    }
-//        context.executeKernel(settleKernel, settleAtoms->getSize());
+    if (ccmaAtoms != NULL) {
-//    }
+        void* directionsArgs[] = {&ccmaAtoms->getDevicePointer(), &ccmaDistance->getDevicePointer(), &context.getPosq().getDevicePointer()};
-//    if (shakeAtoms != NULL) {
+        context.executeKernel(ccmaDirectionsKernel, directionsArgs, ccmaAtoms->getSize());
-//        if (!hasInitialized) {
+        int i;
-//            shakeKernel.setArg<int>(0, shakeAtoms->getSize());
+        void* forceArgs[] = {&ccmaAtoms->getDevicePointer(), &ccmaDistance->getDevicePointer(),
-//            shakeKernel.setArg<cl::Buffer>(2, context.getPosq().getDeviceBuffer());
+                constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta->getDevicePointer(),
-//            shakeKernel.setArg<cl::Buffer>(3, constrainVelocities ? context.getVelm().getDeviceBuffer() : posDelta->getDeviceBuffer());
+                &ccmaReducedMass->getDevicePointer(), &ccmaDelta1->getDevicePointer(), &ccmaConverged->getDevicePointer(),
-//            shakeKernel.setArg<cl::Buffer>(4, shakeAtoms->getDeviceBuffer());
+                &floatTol, &i};
-//            shakeKernel.setArg<cl::Buffer>(5, shakeParams->getDeviceBuffer());
+        void* multiplyArgs[] = {&ccmaDelta1->getDevicePointer(), &ccmaDelta2->getDevicePointer(),
-//        }
+                &ccmaConstraintMatrixColumn->getDevicePointer(), &ccmaConstraintMatrixValue->getDevicePointer(), &ccmaConverged->getDevicePointer(), &i};
-//        shakeKernel.setArg<float>(1, (float) tol);
+        void* updateArgs[] = {&ccmaNumAtomConstraints->getDevicePointer(), &ccmaAtomConstraints->getDevicePointer(), &ccmaDistance->getDevicePointer(),
-//        context.executeKernel(shakeKernel, shakeAtoms->getSize());
+                constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta->getDevicePointer(),
-//    }
+                &context.getVelm().getDevicePointer(), &ccmaDelta1->getDevicePointer(), &ccmaDelta2->getDevicePointer(),
-//    if (ccmaAtoms != NULL) {
+                &ccmaConverged->getDevicePointer(), &i};
-//        if (!hasInitialized) {
+        const int checkInterval = 4;
-//            ccmaDirectionsKernel.setArg<cl::Buffer>(0, ccmaAtoms->getDeviceBuffer());
+        for (i = 0; i < 150; i++) {
-//            ccmaDirectionsKernel.setArg<cl::Buffer>(1, ccmaDistance->getDeviceBuffer());
+            if (i == 0) {
-//            ccmaDirectionsKernel.setArg<cl::Buffer>(2, context.getPosq().getDeviceBuffer());
+                ccmaConvergedMemory[0] = 1;
-//            ccmaForceKernel.setArg<cl::Buffer>(0, ccmaAtoms->getDeviceBuffer());
+                ccmaConvergedMemory[1] = 0;
-//            ccmaForceKernel.setArg<cl::Buffer>(1, ccmaDistance->getDeviceBuffer());
+                cuMemcpyHtoD(ccmaConverged->getDevicePointer(), ccmaConvergedMemory, 2*sizeof(int));
-//            ccmaForceKernel.setArg<cl::Buffer>(2, constrainVelocities ? context.getVelm().getDeviceBuffer() : posDelta->getDeviceBuffer());
+            }
-//            ccmaForceKernel.setArg<cl::Buffer>(3, ccmaReducedMass->getDeviceBuffer());
+            context.executeKernel(ccmaForceKernel, forceArgs, ccmaAtoms->getSize());
-//            ccmaForceKernel.setArg<cl::Buffer>(4, ccmaDelta1->getDeviceBuffer());
+            if ((i+1)%checkInterval == 0) {
-//            ccmaForceKernel.setArg<cl::Buffer>(5, ccmaConverged->getDeviceBuffer());
+                cuMemcpyDtoH(ccmaConvergedMemory, ccmaConverged->getDevicePointer(), 2*sizeof(int));
-//            ccmaMultiplyKernel.setArg<cl::Buffer>(0, ccmaDelta1->getDeviceBuffer());
+                CHECK_RESULT2(cuEventRecord(ccmaEvent, 0), "Error recording event for CCMA");
-//            ccmaMultiplyKernel.setArg<cl::Buffer>(1, ccmaDelta2->getDeviceBuffer());
+            }
-//            ccmaMultiplyKernel.setArg<cl::Buffer>(2, ccmaConstraintMatrixColumn->getDeviceBuffer());
+            context.executeKernel(ccmaMultiplyKernel, multiplyArgs, ccmaAtoms->getSize());
-//            ccmaMultiplyKernel.setArg<cl::Buffer>(3, ccmaConstraintMatrixValue->getDeviceBuffer());
+            context.executeKernel(ccmaUpdateKernel, updateArgs, context.getNumAtoms());
-//            ccmaMultiplyKernel.setArg<cl::Buffer>(4, ccmaConverged->getDeviceBuffer());
+            if ((i+1)%checkInterval == 0) {
-//            ccmaUpdateKernel.setArg<cl::Buffer>(0, ccmaNumAtomConstraints->getDeviceBuffer());
+                CHECK_RESULT2(cuEventSynchronize(ccmaEvent), "Error synchronizing on event for CCMA");
-//            ccmaUpdateKernel.setArg<cl::Buffer>(1, ccmaAtomConstraints->getDeviceBuffer());
+                if (ccmaConvergedMemory[i%2])
-//            ccmaUpdateKernel.setArg<cl::Buffer>(2, ccmaDistance->getDeviceBuffer());
+                    break;
-//            ccmaUpdateKernel.setArg<cl::Buffer>(3, constrainVelocities ? context.getVelm().getDeviceBuffer() : posDelta->getDeviceBuffer());
+            }
-//            ccmaUpdateKernel.setArg<cl::Buffer>(4, context.getVelm().getDeviceBuffer());
+        }
-//            ccmaUpdateKernel.setArg<cl::Buffer>(5, ccmaDelta1->getDeviceBuffer());
+    }
-//            ccmaUpdateKernel.setArg<cl::Buffer>(6, ccmaDelta2->getDeviceBuffer());
-//            ccmaUpdateKernel.setArg<cl::Buffer>(7, ccmaConverged->getDeviceBuffer());
-//        }
-//        ccmaForceKernel.setArg<float>(6, (float) tol);
-//        context.executeKernel(ccmaDirectionsKernel, ccmaAtoms->getSize());
-//        const int checkInterval = 4;
-//        cl::Event event;
-//        for (int i = 0; i < 150; i++) {
-//            ccmaForceKernel.setArg<int>(7, i);
-//            if (i == 0) {
-//                ccmaConvergedMemory[0] = 1;
-//                ccmaConvergedMemory[1] = 0;
-//                context.getQueue().enqueueWriteBuffer(ccmaConverged->getDeviceBuffer(), CL_FALSE, 0, 2*sizeof(int), ccmaConvergedMemory);
-//            }
-//            context.executeKernel(ccmaForceKernel, ccmaAtoms->getSize());
-//            if ((i+1)%checkInterval == 0)
-//                context.getQueue().enqueueReadBuffer(ccmaConverged->getDeviceBuffer(), CL_FALSE, 0, 2*sizeof(int), ccmaConvergedMemory, NULL, &event);
-//            ccmaMultiplyKernel.setArg<int>(5, i);
-//            context.executeKernel(ccmaMultiplyKernel, ccmaAtoms->getSize());
-//            ccmaUpdateKernel.setArg<int>(8, i);
-//            context.executeKernel(ccmaUpdateKernel, context.getNumAtoms());
-//            if ((i+1)%checkInterval == 0) {
-//                event.wait();
-//                if (ccmaConvergedMemory[i%2])
-//                    break;
-//            }
-//        }
-//    }
 }
 void CudaIntegrationUtilities::computeVirtualSites() {
@@ -796,8 +798,8 @@ void CudaIntegrationUtilities::initRandomNumberGenerator(unsigned int randomNumb
    // Create the random number arrays.
    lastSeed = randomNumberSeed;
-    random = CudaArray::create<float4>(32*context.getPaddedNumAtoms(), "random");
+    random = CudaArray::create<float4>(context, 32*context.getPaddedNumAtoms(), "random");
-    randomSeed = CudaArray::create<int4>(context.getNumThreadBlocks()*CudaContext::ThreadBlockSize, "randomSeed");
+    randomSeed = CudaArray::create<int4>(context, context.getNumThreadBlocks()*CudaContext::ThreadBlockSize, "randomSeed");
    randomPos = random->getSize();
    // Use a quick and dirty RNG to pick seeds for the real random number generator.
@@ -826,7 +828,7 @@ int CudaIntegrationUtilities::prepareRandomNumbers(int numValues) {
    }
    if (numValues > random->getSize()) {
        delete random;
-        random = CudaArray::create<float4>(numValues, "random");
+        random = CudaArray::create<float4>(context, numValues, "random");
    }
    int size = random->getSize();
    void* args[] = {&size, &random->getDevicePointer(), &randomSeed->getDevicePointer()};

--- a/platforms/cuda2/src/CudaIntegrationUtilities.h
+++ b/platforms/cuda2/src/CudaIntegrationUtilities.h
@@ -135,6 +135,7 @@ private:
    CudaArray* ccmaDelta2;
    CudaArray* ccmaConverged;
    int* ccmaConvergedMemory;
+    CUevent ccmaEvent;
    CudaArray* vsite2AvgAtoms;
    CudaArray* vsite2AvgWeights;
    CudaArray* vsite3AvgAtoms;
@@ -143,7 +144,6 @@ private:
    CudaArray* vsiteOutOfPlaneWeights;
    int randomPos;
    int lastSeed, numVsites;
-    bool hasInitializedPosConstraintKernels, hasInitializedVelConstraintKernels;
    struct ShakeCluster;
    struct ConstraintOrderer;
 };

--- a/platforms/cuda2/src/CudaKernelFactory.cpp
+++ b/platforms/cuda2/src/CudaKernelFactory.cpp
@@ -108,14 +108,14 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform
        return new CudaCalcCustomCompoundBondForceKernel(name, platform, cu, context.getSystem());
    if (name == IntegrateVerletStepKernel::Name())
        return new CudaIntegrateVerletStepKernel(name, platform, cu);
-//    if (name == IntegrateLangevinStepKernel::Name())
+    if (name == IntegrateLangevinStepKernel::Name())
-//        return new CudaIntegrateLangevinStepKernel(name, platform, cu);
+        return new CudaIntegrateLangevinStepKernel(name, platform, cu);
-//    if (name == IntegrateBrownianStepKernel::Name())
+    if (name == IntegrateBrownianStepKernel::Name())
-//        return new CudaIntegrateBrownianStepKernel(name, platform, cu);
+        return new CudaIntegrateBrownianStepKernel(name, platform, cu);
-//    if (name == IntegrateVariableVerletStepKernel::Name())
+    if (name == IntegrateVariableVerletStepKernel::Name())
-//        return new CudaIntegrateVariableVerletStepKernel(name, platform, cu);
+        return new CudaIntegrateVariableVerletStepKernel(name, platform, cu);
-//    if (name == IntegrateVariableLangevinStepKernel::Name())
+    if (name == IntegrateVariableLangevinStepKernel::Name())
-//        return new CudaIntegrateVariableLangevinStepKernel(name, platform, cu);
+        return new CudaIntegrateVariableLangevinStepKernel(name, platform, cu);
 //    if (name == IntegrateCustomStepKernel::Name())
 //        return new CudaIntegrateCustomStepKernel(name, platform, cu);
 //    if (name == ApplyAndersenThermostatKernel::Name())

--- a/platforms/cuda2/src/CudaKernels.cpp
+++ b/platforms/cuda2/src/CudaKernels.cpp
@@ -82,7 +82,7 @@ void CudaCalcForcesAndEnergyKernel::initialize(const System& system) {
 }
 void CudaCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForces, bool includeEnergy, int groups) {
-    cuCtxSetCurrent(cu.getContext());
+    cu.setAsCurrent();
    CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
    bool includeNonbonded = ((groups&(1<<nb.getForceGroup())) != 0);
    cu.setAtomsWereReordered(false);
@@ -134,7 +134,7 @@ void CudaUpdateStateDataKernel::setTime(ContextImpl& context, double time) {
 }
 void CudaUpdateStateDataKernel::getPositions(ContextImpl& context, vector<Vec3>& positions) {
-    cuCtxSetCurrent(cu.getContext());
+    cu.setAsCurrent();
    const vector<int>& order = cu.getAtomIndex();
    int numParticles = context.getSystem().getNumParticles();
    positions.resize(numParticles);
@@ -160,7 +160,7 @@ void CudaUpdateStateDataKernel::getPositions(ContextImpl& context, vector<Vec3>&
 }
 void CudaUpdateStateDataKernel::setPositions(ContextImpl& context, const vector<Vec3>& positions) {
-    cuCtxSetCurrent(cu.getContext());
+    cu.setAsCurrent();
    const vector<int>& order = cu.getAtomIndex();
    int numParticles = context.getSystem().getNumParticles();
    if (cu.getUseDoublePrecision()) {
@@ -196,7 +196,7 @@ void CudaUpdateStateDataKernel::setPositions(ContextImpl& context, const vector<
 }
 void CudaUpdateStateDataKernel::getVelocities(ContextImpl& context, vector<Vec3>& velocities) {
-    cuCtxSetCurrent(cu.getContext());
+    cu.setAsCurrent();
    const vector<int>& order = cu.getAtomIndex();
    int numParticles = context.getSystem().getNumParticles();
    velocities.resize(numParticles);
@@ -221,7 +221,7 @@ void CudaUpdateStateDataKernel::getVelocities(ContextImpl& context, vector<Vec3>
 }
 void CudaUpdateStateDataKernel::setVelocities(ContextImpl& context, const vector<Vec3>& velocities) {
-    cuCtxSetCurrent(cu.getContext());
+    cu.setAsCurrent();
    const vector<int>& order = cu.getAtomIndex();
    int numParticles = context.getSystem().getNumParticles();
    if (cu.getUseDoublePrecision()) {
@@ -255,7 +255,7 @@ void CudaUpdateStateDataKernel::setVelocities(ContextImpl& context, const vector
 }
 void CudaUpdateStateDataKernel::getForces(ContextImpl& context, vector<Vec3>& forces) {
-    cuCtxSetCurrent(cu.getContext());
+    cu.setAsCurrent();
    long long* force = (long long*) cu.getPinnedBuffer();
    cu.getForce().download(force);
    const vector<int>& order = cu.getAtomIndex();
@@ -281,7 +281,7 @@ void CudaUpdateStateDataKernel::setPeriodicBoxVectors(ContextImpl& context, cons
 }
 void CudaUpdateStateDataKernel::createCheckpoint(ContextImpl& context, ostream& stream) {
-    cuCtxSetCurrent(cu.getContext());
+    cu.setAsCurrent();
 //    int version = 1;
 //    stream.write((char*) &version, sizeof(int));
 //    double time = cu.getTime();
@@ -299,7 +299,7 @@ void CudaUpdateStateDataKernel::createCheckpoint(ContextImpl& context, ostream&
 }
 void CudaUpdateStateDataKernel::loadCheckpoint(ContextImpl& context, istream& stream) {
-    cuCtxSetCurrent(cu.getContext());
+    cu.setAsCurrent();
 //    int version;
 //    stream.read((char*) &version, sizeof(int));
 //    if (version != 1)
@@ -330,20 +330,19 @@ void CudaApplyConstraintsKernel::initialize(const System& system) {
 }
 void CudaApplyConstraintsKernel::apply(ContextImpl& context, double tol) {
-//    if (!hasInitializedKernel) {
+    if (!hasInitializedKernel) {
-//        hasInitializedKernel = true;
+        hasInitializedKernel = true;
-//        map<string, string> defines;
+        map<string, string> defines;
-//        defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
+        defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
-//        CUmodule module = cu.createModule(CudaKernelSources::constraints, defines);
+        CUmodule module = cu.createModule(CudaKernelSources::constraints, defines);
-//        applyDeltasKernel = cu.getKernel(module, "applyPositionDeltas");
+        applyDeltasKernel = cu.getKernel(module, "applyPositionDeltas");
-//        applyDeltasKernel.setArg<cu::Buffer>(0, cu.getPosq().getDevicePointer());
+    }
-//        applyDeltasKernel.setArg<cu::Buffer>(1, cu.getIntegrationUtilities().getPosDelta().getDevicePointer());
+    CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
-//    }
+    cu.clearBuffer(integration.getPosDelta());
-//    CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
+    integration.applyConstraints(tol);
-//    cu.clearBuffer(integration.getPosDelta());
+    void* args[] = {&cu.getPosq().getDevicePointer(), &cu.getIntegrationUtilities().getPosDelta().getDevicePointer()};
-//    integration.applyConstraints(tol);
+    cu.executeKernel(applyDeltasKernel, args, cu.getNumAtoms());
-//    cu.executeKernel(applyDeltasKernel, cu.getNumAtoms());
+    integration.computeVirtualSites();
-//    integration.computeVirtualSites();
 }
 void CudaVirtualSitesKernel::initialize(const System& system) {
@@ -380,13 +379,13 @@ private:
 };
 CudaCalcHarmonicBondForceKernel::~CudaCalcHarmonicBondForceKernel() {
-    cuCtxSetCurrent(cu.getContext());
+    cu.setAsCurrent();
    if (params != NULL)
        delete params;
 }
 void CudaCalcHarmonicBondForceKernel::initialize(const System& system, const HarmonicBondForce& force) {
-    cuCtxSetCurrent(cu.getContext());
+    cu.setAsCurrent();
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*force.getNumBonds()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*force.getNumBonds()/numContexts;
@@ -394,7 +393,7 @@ void CudaCalcHarmonicBondForceKernel::initialize(const System& system, const Har
    if (numBonds == 0)
        return;
    vector<vector<int> > atoms(numBonds, vector<int>(2));
-    params = CudaArray::create<float2>(numBonds, "bondParams");
+    params = CudaArray::create<float2>(cu, numBonds, "bondParams");
    vector<float2> paramVector(numBonds);
    for (int i = 0; i < numBonds; i++) {
        double length, k;
@@ -414,7 +413,7 @@ double CudaCalcHarmonicBondForceKernel::execute(ContextImpl& context, bool inclu
 }
 void CudaCalcHarmonicBondForceKernel::copyParametersToContext(ContextImpl& context, const HarmonicBondForce& force) {
-    cuCtxSetCurrent(cu.getContext());
+    cu.setAsCurrent();
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*force.getNumBonds()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*force.getNumBonds()/numContexts;
@@ -467,7 +466,7 @@ private:
 };
 CudaCalcCustomBondForceKernel::~CudaCalcCustomBondForceKernel() {
-    cuCtxSetCurrent(cu.getContext());
+    cu.setAsCurrent();
    if (params != NULL)
        delete params;
    if (globals != NULL)
@@ -475,7 +474,7 @@ CudaCalcCustomBondForceKernel::~CudaCalcCustomBondForceKernel() {
 }
 void CudaCalcCustomBondForceKernel::initialize(const System& system, const CustomBondForce& force) {
-    cuCtxSetCurrent(cu.getContext());
+    cu.setAsCurrent();
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*force.getNumBonds()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*force.getNumBonds()/numContexts;
@@ -518,7 +517,7 @@ void CudaCalcCustomBondForceKernel::initialize(const System& system, const Custo
        variables[name] = "bondParams"+params->getParameterSuffix(i);
    }
    if (force.getNumGlobalParameters() > 0) {
-        globals = CudaArray::create<float>(force.getNumGlobalParameters(), "customBondGlobals");
+        globals = CudaArray::create<float>(cu, force.getNumGlobalParameters(), "customBondGlobals");
        globals->upload(globalParamValues);
        string argName = cu.getBondedUtilities().addArgument(globals->getDevicePointer(), "float");
        for (int i = 0; i < force.getNumGlobalParameters(); i++) {
@@ -556,7 +555,7 @@ double CudaCalcCustomBondForceKernel::execute(ContextImpl& context, bool include
 }
 void CudaCalcCustomBondForceKernel::copyParametersToContext(ContextImpl& context, const CustomBondForce& force) {
-    cuCtxSetCurrent(cu.getContext());
+    cu.setAsCurrent();
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*force.getNumBonds()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*force.getNumBonds()/numContexts;
@@ -609,13 +608,13 @@ private:
 };
 CudaCalcHarmonicAngleForceKernel::~CudaCalcHarmonicAngleForceKernel() {
-    cuCtxSetCurrent(cu.getContext());
+    cu.setAsCurrent();
    if (params != NULL)
        delete params;
 }
 void CudaCalcHarmonicAngleForceKernel::initialize(const System& system, const HarmonicAngleForce& force) {
-    cuCtxSetCurrent(cu.getContext());
+    cu.setAsCurrent();
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*force.getNumAngles()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*force.getNumAngles()/numContexts;
@@ -623,7 +622,7 @@ void CudaCalcHarmonicAngleForceKernel::initialize(const System& system, const Ha
    if (numAngles == 0)
        return;
    vector<vector<int> > atoms(numAngles, vector<int>(3));
-    params = CudaArray::create<float2>(numAngles, "angleParams");
+    params = CudaArray::create<float2>(cu, numAngles, "angleParams");
    vector<float2> paramVector(numAngles);
    for (int i = 0; i < numAngles; i++) {
        double angle, k;
@@ -644,7 +643,7 @@ double CudaCalcHarmonicAngleForceKernel::execute(ContextImpl& context, bool incl
 }
 void CudaCalcHarmonicAngleForceKernel::copyParametersToContext(ContextImpl& context, const HarmonicAngleForce& force) {
-    cuCtxSetCurrent(cu.getContext());
+    cu.setAsCurrent();
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*force.getNumAngles()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*force.getNumAngles()/numContexts;
@@ -698,7 +697,7 @@ private:
 };
 CudaCalcCustomAngleForceKernel::~CudaCalcCustomAngleForceKernel() {
-    cuCtxSetCurrent(cu.getContext());
+    cu.setAsCurrent();
    if (params != NULL)
        delete params;
    if (globals != NULL)
@@ -706,7 +705,7 @@ CudaCalcCustomAngleForceKernel::~CudaCalcCustomAngleForceKernel() {
 }
 void CudaCalcCustomAngleForceKernel::initialize(const System& system, const CustomAngleForce& force) {
-    cuCtxSetCurrent(cu.getContext());
+    cu.setAsCurrent();
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*force.getNumAngles()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*force.getNumAngles()/numContexts;
@@ -749,7 +748,7 @@ void CudaCalcCustomAngleForceKernel::initialize(const System& system, const Cust
        variables[name] = "angleParams"+params->getParameterSuffix(i);
    }
    if (force.getNumGlobalParameters() > 0) {
-        globals = CudaArray::create<float>(force.getNumGlobalParameters(), "customAngleGlobals");
+        globals = CudaArray::create<float>(cu, force.getNumGlobalParameters(), "customAngleGlobals");
        globals->upload(globalParamValues);
        string argName = cu.getBondedUtilities().addArgument(globals->getDevicePointer(), "float");
        for (int i = 0; i < force.getNumGlobalParameters(); i++) {
@@ -787,7 +786,7 @@ double CudaCalcCustomAngleForceKernel::execute(ContextImpl& context, bool includ
 }
 void CudaCalcCustomAngleForceKernel::copyParametersToContext(ContextImpl& context, const CustomAngleForce& force) {
-    cuCtxSetCurrent(cu.getContext());
+    cu.setAsCurrent();
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*force.getNumAngles()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*force.getNumAngles()/numContexts;
@@ -846,7 +845,7 @@ CudaCalcPeriodicTorsionForceKernel::~CudaCalcPeriodicTorsionForceKernel() {
 }
 void CudaCalcPeriodicTorsionForceKernel::initialize(const System& system, const PeriodicTorsionForce& force) {
-    cuCtxSetCurrent(cu.getContext());
+    cu.setAsCurrent();
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*force.getNumTorsions()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*force.getNumTorsions()/numContexts;
@@ -854,7 +853,7 @@ void CudaCalcPeriodicTorsionForceKernel::initialize(const System& system, const
    if (numTorsions == 0)
        return;
    vector<vector<int> > atoms(numTorsions, vector<int>(4));
-    params = CudaArray::create<float4>(numTorsions, "periodicTorsionParams");
+    params = CudaArray::create<float4>(cu, numTorsions, "periodicTorsionParams");
    vector<float4> paramVector(numTorsions);
    for (int i = 0; i < numTorsions; i++) {
        int periodicity;
@@ -875,7 +874,7 @@ double CudaCalcPeriodicTorsionForceKernel::execute(ContextImpl& context, bool in
 }
 void CudaCalcPeriodicTorsionForceKernel::copyParametersToContext(ContextImpl& context, const PeriodicTorsionForce& force) {
-    cuCtxSetCurrent(cu.getContext());
+    cu.setAsCurrent();
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*force.getNumTorsions()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*force.getNumTorsions()/numContexts;
@@ -934,7 +933,7 @@ CudaCalcRBTorsionForceKernel::~CudaCalcRBTorsionForceKernel() {
 }
 void CudaCalcRBTorsionForceKernel::initialize(const System& system, const RBTorsionForce& force) {
-    cuCtxSetCurrent(cu.getContext());
+    cu.setAsCurrent();
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*force.getNumTorsions()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*force.getNumTorsions()/numContexts;
@@ -942,8 +941,8 @@ void CudaCalcRBTorsionForceKernel::initialize(const System& system, const RBTors
    if (numTorsions == 0)
        return;
    vector<vector<int> > atoms(numTorsions, vector<int>(4));
-    params1 = CudaArray::create<float4>(numTorsions, "rbTorsionParams1");
+    params1 = CudaArray::create<float4>(cu, numTorsions, "rbTorsionParams1");
-    params2 = CudaArray::create<float2>(numTorsions, "rbTorsionParams2");
+    params2 = CudaArray::create<float2>(cu, numTorsions, "rbTorsionParams2");
    vector<float4> paramVector1(numTorsions);
    vector<float2> paramVector2(numTorsions);
    for (int i = 0; i < numTorsions; i++) {
@@ -968,7 +967,7 @@ double CudaCalcRBTorsionForceKernel::execute(ContextImpl& context, bool includeF
 }
 void CudaCalcRBTorsionForceKernel::copyParametersToContext(ContextImpl& context, const RBTorsionForce& force) {
-    cuCtxSetCurrent(cu.getContext());
+    cu.setAsCurrent();
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*force.getNumTorsions()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*force.getNumTorsions()/numContexts;
@@ -1034,7 +1033,7 @@ CudaCalcCMAPTorsionForceKernel::~CudaCalcCMAPTorsionForceKernel() {
 }
 void CudaCalcCMAPTorsionForceKernel::initialize(const System& system, const CMAPTorsionForce& force) {
-    cuCtxSetCurrent(cu.getContext());
+    cu.setAsCurrent();
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*force.getNumTorsions()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*force.getNumTorsions()/numContexts;
@@ -1064,9 +1063,9 @@ void CudaCalcCMAPTorsionForceKernel::initialize(const System& system, const CMAP
    vector<int> torsionMapsVec(numTorsions);
    for (int i = 0; i < numTorsions; i++)
        force.getTorsionParameters(startIndex+i, torsionMapsVec[i], atoms[i][0], atoms[i][1], atoms[i][2], atoms[i][3], atoms[i][4], atoms[i][5], atoms[i][6], atoms[i][7]);
-    coefficients = CudaArray::create<float4>(coeffVec.size(), "cmapTorsionCoefficients");
+    coefficients = CudaArray::create<float4>(cu, coeffVec.size(), "cmapTorsionCoefficients");
-    mapPositions = CudaArray::create<int2>(numMaps, "cmapTorsionMapPositions");
+    mapPositions = CudaArray::create<int2>(cu, numMaps, "cmapTorsionMapPositions");
-    torsionMaps = CudaArray::create<int>(numTorsions, "cmapTorsionMaps");
+    torsionMaps = CudaArray::create<int>(cu, numTorsions, "cmapTorsionMaps");
    coefficients->upload(coeffVec);
    mapPositions->upload(mapPositionsVec);
    torsionMaps->upload(torsionMapsVec);
@@ -1121,7 +1120,7 @@ CudaCalcCustomTorsionForceKernel::~CudaCalcCustomTorsionForceKernel() {
 }
 void CudaCalcCustomTorsionForceKernel::initialize(const System& system, const CustomTorsionForce& force) {
-    cuCtxSetCurrent(cu.getContext());
+    cu.setAsCurrent();
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*force.getNumTorsions()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*force.getNumTorsions()/numContexts;
@@ -1164,7 +1163,7 @@ void CudaCalcCustomTorsionForceKernel::initialize(const System& system, const Cu
        variables[name] = "torsionParams"+params->getParameterSuffix(i);
    }
    if (force.getNumGlobalParameters() > 0) {
-        globals = CudaArray::create<float>(force.getNumGlobalParameters(), "customTorsionGlobals");
+        globals = CudaArray::create<float>(cu, force.getNumGlobalParameters(), "customTorsionGlobals");
        globals->upload(globalParamValues);
        string argName = cu.getBondedUtilities().addArgument(globals->getDevicePointer(), "float");
        for (int i = 0; i < force.getNumGlobalParameters(); i++) {
@@ -1202,7 +1201,7 @@ double CudaCalcCustomTorsionForceKernel::execute(ContextImpl& context, bool incl
 }
 void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& context, const CustomTorsionForce& force) {
-    cuCtxSetCurrent(cu.getContext());
+    cu.setAsCurrent();
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*force.getNumTorsions()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*force.getNumTorsions()/numContexts;
@@ -1260,7 +1259,7 @@ private:
 };
 CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
-    cuCtxSetCurrent(cu.getContext());
+    cu.setAsCurrent();
    if (sigmaEpsilon != NULL)
        delete sigmaEpsilon;
    if (exceptionParams != NULL)
@@ -1310,7 +1309,7 @@ static int findFFTDimension(int minimum) {
 }
 void CudaCalcNonbondedForceKernel::initialize(const System& system, const NonbondedForce& force) {
-    cuCtxSetCurrent(cu.getContext());
+    cu.setAsCurrent();
    // Identify which exceptions are 1-4 interactions.
@@ -1328,7 +1327,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
    // Initialize nonbonded interactions.
    int numParticles = force.getNumParticles();
-    sigmaEpsilon = CudaArray::create<float2>(numParticles, "sigmaEpsilon");
+    sigmaEpsilon = CudaArray::create<float2>(cu, numParticles, "sigmaEpsilon");
    CudaArray& posq = cu.getPosq();
    float4* posqf = (float4*) cu.getPinnedBuffer();
    double4* posqd = (double4*) cu.getPinnedBuffer();
@@ -1400,7 +1399,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
        ewaldSumsKernel = cu.getKernel(module, "calculateEwaldCosSinSums");
        ewaldForcesKernel = cu.getKernel(module, "calculateEwaldForces");
        int elementSize = (cu.getUseDoublePrecision() ? sizeof(double2) : sizeof(float2));
-        cosSinSums = new CudaArray((2*kmaxx-1)*(2*kmaxy-1)*(2*kmaxz-1), elementSize, "cosSinSums");
+        cosSinSums = new CudaArray(cu, (2*kmaxx-1)*(2*kmaxy-1)*(2*kmaxz-1), elementSize, "cosSinSums");
    }
    else if (force.getNonbondedMethod() == NonbondedForce::PME) {
        // Compute the PME parameters.
@@ -1433,14 +1432,14 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
        // Create required data structures.
        int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
-        pmeGrid = new CudaArray(gridSizeX*gridSizeY*gridSizeZ, 2*elementSize, "pmeGrid");
+        pmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*gridSizeZ, 2*elementSize, "pmeGrid");
        cu.addAutoclearBuffer(pmeGrid->getDevicePointer(), pmeGrid->getSize()*sizeof(float2));
-        pmeBsplineModuliX = new CudaArray(gridSizeX, elementSize, "pmeBsplineModuliX");
+        pmeBsplineModuliX = new CudaArray(cu, gridSizeX, elementSize, "pmeBsplineModuliX");
-        pmeBsplineModuliY = new CudaArray(gridSizeY, elementSize, "pmeBsplineModuliY");
+        pmeBsplineModuliY = new CudaArray(cu, gridSizeY, elementSize, "pmeBsplineModuliY");
-        pmeBsplineModuliZ = new CudaArray(gridSizeZ, elementSize, "pmeBsplineModuliZ");
+        pmeBsplineModuliZ = new CudaArray(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
-        pmeBsplineTheta = new CudaArray(PmeOrder*numParticles, 4*elementSize, "pmeBsplineTheta");
+        pmeBsplineTheta = new CudaArray(cu, PmeOrder*numParticles, 4*elementSize, "pmeBsplineTheta");
-        pmeAtomRange = CudaArray::create<int>(gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
+        pmeAtomRange = CudaArray::create<int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
-        pmeAtomGridIndex = CudaArray::create<int2>(numParticles, "pmeAtomGridIndex");
+        pmeAtomGridIndex = CudaArray::create<int2>(cu, numParticles, "pmeAtomGridIndex");
        sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms());
        cufftResult result = cufftPlan3d(&fft, gridSizeX, gridSizeY, gridSizeZ, CUFFT_C2C);
        if (result != CUFFT_SUCCESS)
@@ -1537,7 +1536,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
    if (numExceptions > 0) {
        exceptionAtoms.resize(numExceptions);
        vector<vector<int> > atoms(numExceptions, vector<int>(2));
-        exceptionParams = CudaArray::create<float4>(numExceptions, "exceptionParams");
+        exceptionParams = CudaArray::create<float4>(cu, numExceptions, "exceptionParams");
        vector<float4> exceptionParamsVector(numExceptions);
        for (int i = 0; i < numExceptions; i++) {
            double chargeProd, sigma, epsilon;
@@ -1596,7 +1595,7 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
 void CudaCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context, const NonbondedForce& force) {
    // Make sure the new parameters are acceptable.
-    cuCtxSetCurrent(cu.getContext());
+    cu.setAsCurrent();
    if (force.getNumParticles() != cu.getNumAtoms())
        throw OpenMMException("updateParametersInContext: The number of particles has changed");
    if (!hasCoulomb || !hasLJ) {
@@ -1702,7 +1701,7 @@ void CudaCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context,
 //};
 //
 //CudaCalcCustomNonbondedForceKernel::~CudaCalcCustomNonbondedForceKernel() {
-//    cuCtxSetCurrent(cu.getContext());
+//    cu.setAsCurrent();
 //    if (params != NULL)
 //        delete params;
 //    if (globals != NULL)
@@ -1714,7 +1713,7 @@ void CudaCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context,
 //}
 //
 //void CudaCalcCustomNonbondedForceKernel::initialize(const System& system, const CustomNonbondedForce& force) {
-//    cuCtxSetCurrent(cu.getContext());
+//    cu.setAsCurrent();
 //    int forceIndex;
 //    for (forceIndex = 0; forceIndex < system.getNumForces() && &system.getForce(forceIndex) != &force; ++forceIndex)
 //        ;
@@ -1838,7 +1837,7 @@ void CudaCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context,
 //}
 //
 //void CudaCalcCustomNonbondedForceKernel::copyParametersToContext(ContextImpl& context, const CustomNonbondedForce& force) {
-//    cuCtxSetCurrent(cu.getContext());
+//    cu.setAsCurrent();
 //    int numParticles = force.getNumParticles();
 //    if (numParticles != cu.getNumAtoms())
 //        throw OpenMMException("updateParametersInContext: The number of particles has changed");
@@ -1875,7 +1874,7 @@ void CudaCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context,
 //};
 //
 //CudaCalcGBSAOBCForceKernel::~CudaCalcGBSAOBCForceKernel() {
-//    cuCtxSetCurrent(cu.getContext());
+//    cu.setAsCurrent();
 //    if (params != NULL)
 //        delete params;
 //    if (bornSum != NULL)
@@ -1893,7 +1892,7 @@ void CudaCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context,
 //}
 //
 //void CudaCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOBCForce& force) {
-//    cuCtxSetCurrent(cu.getContext());
+//    cu.setAsCurrent();
 //    if (cu.getPlatformData().contexts.size() > 1)
 //        throw OpenMMException("GBSAOBCForce does not support using multiple CUDA devices");
 //    CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
@@ -2059,7 +2058,7 @@ void CudaCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context,
 //void CudaCalcGBSAOBCForceKernel::copyParametersToContext(ContextImpl& context, const GBSAOBCForce& force) {
 //    // Make sure the new parameters are acceptable.
 //    
-//    cuCtxSetCurrent(cu.getContext());
+//    cu.setAsCurrent();
 //    int numParticles = force.getNumParticles();
 //    if (numParticles != cu.getNumAtoms())
 //        throw OpenMMException("updateParametersInContext: The number of particles has changed");
@@ -2117,7 +2116,7 @@ void CudaCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context,
 //};
 //
 //CudaCalcCustomGBForceKernel::~CudaCalcCustomGBForceKernel() {
-//    cuCtxSetCurrent(cu.getContext());
+//    cu.setAsCurrent();
 //    if (params != NULL)
 //        delete params;
 //    if (computedValues != NULL)
@@ -2139,7 +2138,7 @@ void CudaCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context,
 //}
 //
 //void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomGBForce& force) {
-//    cuCtxSetCurrent(cu.getContext());
+//    cu.setAsCurrent();
 //    if (cu.getPlatformData().contexts.size() > 1)
 //        throw OpenMMException("CustomGBForce does not support using multiple CUDA devices");
 //    bool useExclusionsForValue = false;
@@ -2998,7 +2997,7 @@ void CudaCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context,
 //}
 //
 //void CudaCalcCustomGBForceKernel::copyParametersToContext(ContextImpl& context, const CustomGBForce& force) {
-//    cuCtxSetCurrent(cu.getContext());
+//    cu.setAsCurrent();
 //    int numParticles = force.getNumParticles();
 //    if (numParticles != cu.getNumAtoms())
 //        throw OpenMMException("updateParametersInContext: The number of particles has changed");
@@ -3053,7 +3052,7 @@ private:
 };
 CudaCalcCustomExternalForceKernel::~CudaCalcCustomExternalForceKernel() {
-    cuCtxSetCurrent(cu.getContext());
+    cu.setAsCurrent();
    if (params != NULL)
        delete params;
    if (globals != NULL)
@@ -3061,7 +3060,7 @@ CudaCalcCustomExternalForceKernel::~CudaCalcCustomExternalForceKernel() {
 }
 void CudaCalcCustomExternalForceKernel::initialize(const System& system, const CustomExternalForce& force) {
-    cuCtxSetCurrent(cu.getContext());
+    cu.setAsCurrent();
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*force.getNumParticles()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*force.getNumParticles()/numContexts;
@@ -3110,7 +3109,7 @@ void CudaCalcCustomExternalForceKernel::initialize(const System& system, const C
        variables[name] = "particleParams"+params->getParameterSuffix(i);
    }
    if (force.getNumGlobalParameters() > 0) {
-        globals = CudaArray::create<float>(force.getNumGlobalParameters(), "customExternalGlobals");
+        globals = CudaArray::create<float>(cu, force.getNumGlobalParameters(), "customExternalGlobals");
        globals->upload(globalParamValues);
        string argName = cu.getBondedUtilities().addArgument(globals->getDevicePointer(), "float");
        for (int i = 0; i < force.getNumGlobalParameters(); i++) {
@@ -3148,7 +3147,7 @@ double CudaCalcCustomExternalForceKernel::execute(ContextImpl& context, bool inc
 }
 void CudaCalcCustomExternalForceKernel::copyParametersToContext(ContextImpl& context, const CustomExternalForce& force) {
-    cuCtxSetCurrent(cu.getContext());
+    cu.setAsCurrent();
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*force.getNumParticles()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*force.getNumParticles()/numContexts;
@@ -3250,7 +3249,7 @@ void CudaCalcCustomExternalForceKernel::copyParametersToContext(ContextImpl& con
 //};
 //
 //CudaCalcCustomHbondForceKernel::~CudaCalcCustomHbondForceKernel() {
-//    cuCtxSetCurrent(cu.getContext());
+//    cu.setAsCurrent();
 //    if (donorParams != NULL)
 //        delete donorParams;
 //    if (acceptorParams != NULL)
@@ -3291,7 +3290,7 @@ void CudaCalcCustomExternalForceKernel::copyParametersToContext(ContextImpl& con
 //void CudaCalcCustomHbondForceKernel::initialize(const System& system, const CustomHbondForce& force) {
 //    // Record the lists of donors and acceptors, and the parameters for each one.
 //
-//    cuCtxSetCurrent(cu.getContext());
+//    cu.setAsCurrent();
 //    int numContexts = cu.getPlatformData().contexts.size();
 //    int startIndex = cu.getContextIndex()*force.getNumDonors()/numContexts;
 //    int endIndex = (cu.getContextIndex()+1)*force.getNumDonors()/numContexts;
@@ -3686,7 +3685,7 @@ void CudaCalcCustomExternalForceKernel::copyParametersToContext(ContextImpl& con
 //}
 //
 //void CudaCalcCustomHbondForceKernel::copyParametersToContext(ContextImpl& context, const CustomHbondForce& force) {
-//    cuCtxSetCurrent(cu.getContext());
+//    cu.setAsCurrent();
 //    int numContexts = cu.getPlatformData().contexts.size();
 //    int startIndex = cu.getContextIndex()*force.getNumDonors()/numContexts;
 //    int endIndex = (cu.getContextIndex()+1)*force.getNumDonors()/numContexts;
@@ -3751,7 +3750,7 @@ private:
 };
 CudaCalcCustomCompoundBondForceKernel::~CudaCalcCustomCompoundBondForceKernel() {
-    cuCtxSetCurrent(cu.getContext());
+    cu.setAsCurrent();
    if (params != NULL)
        delete params;
    if (globals != NULL)
@@ -3763,7 +3762,7 @@ CudaCalcCustomCompoundBondForceKernel::~CudaCalcCustomCompoundBondForceKernel()
 }
 void CudaCalcCustomCompoundBondForceKernel::initialize(const System& system, const CustomCompoundBondForce& force) {
-    cuCtxSetCurrent(cu.getContext());
+    cu.setAsCurrent();
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*force.getNumBonds()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*force.getNumBonds()/numContexts;
@@ -3799,7 +3798,7 @@ void CudaCalcCustomCompoundBondForceKernel::initialize(const System& system, con
        functions[name] = &fp;
        tabulatedFunctionParamsVec[i] = make_float4((float) min, (float) max, (float) ((values.size()-1)/(max-min)), (float) values.size()-2);
        vector<float4> f = cu.getExpressionUtilities().computeFunctionCoefficients(values, min, max);
-        CudaArray* array = CudaArray::create<float4>(values.size()-1, "TabulatedFunction");
+        CudaArray* array = CudaArray::create<float4>(cu, values.size()-1, "TabulatedFunction");
        tabulatedFunctions.push_back(array);
        array->upload(f);
        string arrayName = cu.getBondedUtilities().addArgument(array->getDevicePointer(), "float4");
@@ -3807,7 +3806,7 @@ void CudaCalcCustomCompoundBondForceKernel::initialize(const System& system, con
    }
    string functionParamsName;
    if (force.getNumFunctions() > 0) {
-        tabulatedFunctionParams = CudaArray::create<float4>(tabulatedFunctionParamsVec.size(), "tabulatedFunctionParameters");
+        tabulatedFunctionParams = CudaArray::create<float4>(cu, tabulatedFunctionParamsVec.size(), "tabulatedFunctionParameters");
        tabulatedFunctionParams->upload(tabulatedFunctionParamsVec);
        functionParamsName = cu.getBondedUtilities().addArgument(tabulatedFunctionParams->getDevicePointer(), "float4");
    }
@@ -3832,7 +3831,7 @@ void CudaCalcCustomCompoundBondForceKernel::initialize(const System& system, con
        variables[name] = "bondParams"+params->getParameterSuffix(i);
    }
    if (force.getNumGlobalParameters() > 0) {
-        globals = CudaArray::create<float>(force.getNumGlobalParameters(), "customCompoundBondGlobals");
+        globals = CudaArray::create<float>(cu, force.getNumGlobalParameters(), "customCompoundBondGlobals");
        globals->upload(globalParamValues);
        string argName = cu.getBondedUtilities().addArgument(globals->getDevicePointer(), "float");
        for (int i = 0; i < force.getNumGlobalParameters(); i++) {
@@ -4020,7 +4019,7 @@ double CudaCalcCustomCompoundBondForceKernel::execute(ContextImpl& context, bool
 }
 void CudaCalcCustomCompoundBondForceKernel::copyParametersToContext(ContextImpl& context, const CustomCompoundBondForce& force) {
-    cuCtxSetCurrent(cu.getContext());
+    cu.setAsCurrent();
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*force.getNumBonds()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*force.getNumBonds()/numContexts;
@@ -4049,7 +4048,7 @@ CudaIntegrateVerletStepKernel::~CudaIntegrateVerletStepKernel() {
 }
 void CudaIntegrateVerletStepKernel::initialize(const System& system, const VerletIntegrator& integrator) {
-    cuCtxSetCurrent(cu.getContext());
+    cu.setAsCurrent();
    cu.getPlatformData().initializeContexts(system);
    map<string, string> defines;
    defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
@@ -4101,295 +4100,303 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
    cu.setStepCount(cu.getStepCount()+1);
 }
-//CudaIntegrateLangevinStepKernel::~CudaIntegrateLangevinStepKernel() {
+CudaIntegrateLangevinStepKernel::~CudaIntegrateLangevinStepKernel() {
-//    cuCtxSetCurrent(cu.getContext());
+    cu.setAsCurrent();
-//    if (params != NULL)
+    if (params != NULL)
-//        delete params;
+        delete params;
-//}
+}
-//
-//void CudaIntegrateLangevinStepKernel::initialize(const System& system, const LangevinIntegrator& integrator) {
+void CudaIntegrateLangevinStepKernel::initialize(const System& system, const LangevinIntegrator& integrator) {
-//    cuCtxSetCurrent(cu.getContext());
+    cu.setAsCurrent();
-//    cu.getPlatformData().initializeContexts(system);
+    cu.getPlatformData().initializeContexts(system);
-//    cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
+    cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
-//    map<string, string> defines;
+    map<string, string> defines;
-//    defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
+    defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
-//    defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
+    defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
-//    CUmodule module = cu.createModule(CudaKernelSources::langevin, defines, "");
+    CUmodule module = cu.createModule(CudaKernelSources::langevin, defines, "");
-//    kernel1 = cu.getKernel(module, "integrateLangevinPart1");
+    kernel1 = cu.getKernel(module, "integrateLangevinPart1");
-//    kernel2 = cu.getKernel(module, "integrateLangevinPart2");
+    kernel2 = cu.getKernel(module, "integrateLangevinPart2");
-//    params = new CudaArray<cl_float>(cu, 3, "langevinParams");
+    params = new CudaArray(cu, 3, cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float), "langevinParams");
-//    prevStepSize = -1.0;
+    prevStepSize = -1.0;
-//}
+}
-//
-//void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const LangevinIntegrator& integrator) {
+void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const LangevinIntegrator& integrator) {
-//    CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
+    CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
-//    int numAtoms = cu.getNumAtoms();
+    int numAtoms = cu.getNumAtoms();
-//    if (!hasInitializedKernels) {
+    double temperature = integrator.getTemperature();
-//        hasInitializedKernels = true;
+    double friction = integrator.getFriction();
-//        kernel1.setArg<cu::Buffer>(0, cu.getVelm().getDevicePointer());
+    double stepSize = integrator.getStepSize();
-//        kernel1.setArg<cu::Buffer>(1, cu.getForce().getDevicePointer());
+    if (temperature != prevTemp || friction != prevFriction || stepSize != prevStepSize) {
-//        kernel1.setArg<cu::Buffer>(2, integration.getPosDelta().getDevicePointer());
+        // Calculate the integration parameters.
-//        kernel1.setArg<cu::Buffer>(3, params->getDevicePointer());
-//        kernel1.setArg<cu::Buffer>(4, integration.getStepSize().getDevicePointer());
+        double tau = (friction == 0.0 ? 0.0 : 1.0/friction);
-//        kernel1.setArg<cu::Buffer>(5, integration.getRandom().getDevicePointer());
+        double kT = BOLTZ*temperature;
-//        kernel2.setArg<cu::Buffer>(0, cu.getPosq().getDevicePointer());
+        double vscale = exp(-stepSize/tau);
-//        kernel2.setArg<cu::Buffer>(1, integration.getPosDelta().getDevicePointer());
+        double fscale = (1-vscale)*tau;
-//        kernel2.setArg<cu::Buffer>(2, cu.getVelm().getDevicePointer());
+        double noisescale = sqrt(2*kT/tau)*sqrt(0.5*(1-vscale*vscale)*tau);
-//        kernel2.setArg<cu::Buffer>(3, integration.getStepSize().getDevicePointer());
+        if (cu.getUseDoublePrecision()) {
-//    }
+            vector<double> p(params->getSize());
-//    double temperature = integrator.getTemperature();
+            p[0] = vscale;
-//    double friction = integrator.getFriction();
+            p[1] = fscale;
-//    double stepSize = integrator.getStepSize();
+            p[2] = noisescale;
-//    if (temperature != prevTemp || friction != prevFriction || stepSize != prevStepSize) {
+            params->upload(p);
-//        // Calculate the integration parameters.
+            double2 ss = make_double2(0, stepSize);
-//
+            integration.getStepSize().upload(&ss);
-//        double tau = (friction == 0.0 ? 0.0 : 1.0/friction);
+        }
-//        double kT = BOLTZ*temperature;
+        else {
-//        double vscale = exp(-stepSize/tau);
+            vector<float> p(params->getSize());
-//        double fscale = (1-vscale)*tau;
+            p[0] = (float) vscale;
-//        double noisescale = sqrt(2*kT/tau)*sqrt(0.5*(1-vscale*vscale)*tau);
+            p[1] = (float) fscale;
-//        vector<cl_float> p(params->getSize());
+            p[2] = (float) noisescale;
-//        p[0] = (cl_float) vscale;
+            params->upload(p);
-//        p[1] = (cl_float) fscale;
+            float2 ss = make_float2(0, (float) stepSize);
-//        p[2] = (cl_float) noisescale;
+            integration.getStepSize().upload(&ss);
-//        params->upload(p);
+        }
-//        integration.getStepSize()[0].y = (cl_float) stepSize;
+        prevTemp = temperature;
-//        integration.getStepSize().upload();
+        prevFriction = friction;
-//        prevTemp = temperature;
+        prevStepSize = stepSize;
-//        prevFriction = friction;
+    }
-//        prevStepSize = stepSize;
-//    }
+    // Call the first integration kernel.
-//
-//    // Call the first integration kernel.
+    int randomIndex = integration.prepareRandomNumbers(cu.getPaddedNumAtoms());
-//
+    void* args1[] = {&cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer(),
-//    kernel1.setArg<cl_uint>(6, integration.prepareRandomNumbers(cu.getPaddedNumAtoms()));
+            &params->getDevicePointer(), &integration.getStepSize().getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex};
-//    cu.executeKernel(kernel1, numAtoms);
+    cu.executeKernel(kernel1, args1, numAtoms);
-//
-//    // Apply constraints.
+    // Apply constraints.
-//
-//    integration.applyConstraints(integrator.getConstraintTolerance());
+    integration.applyConstraints(integrator.getConstraintTolerance());
-//
-//    // Call the second integration kernel.
+    // Call the second integration kernel.
-//
-//    cu.executeKernel(kernel2, numAtoms);
+    void* args2[] = {&cu.getPosq().getDevicePointer(), &integration.getPosDelta().getDevicePointer(),
-//    integration.computeVirtualSites();
+            &cu.getVelm().getDevicePointer(), &integration.getStepSize().getDevicePointer()};
-//
+    cu.executeKernel(kernel2, args2, numAtoms);
-//    // Update the time and step count.
+    integration.computeVirtualSites();
-//
-//    cu.setTime(cu.getTime()+stepSize);
+    // Update the time and step count.
-//    cu.setStepCount(cu.getStepCount()+1);
-//}
+    cu.setTime(cu.getTime()+stepSize);
-//
+    cu.setStepCount(cu.getStepCount()+1);
-//CudaIntegrateBrownianStepKernel::~CudaIntegrateBrownianStepKernel() {
+}
-//}
-//
+CudaIntegrateBrownianStepKernel::~CudaIntegrateBrownianStepKernel() {
-//void CudaIntegrateBrownianStepKernel::initialize(const System& system, const BrownianIntegrator& integrator) {
+}
-//    cuCtxSetCurrent(cu.getContext());
-//    cu.getPlatformData().initializeContexts(system);
+void CudaIntegrateBrownianStepKernel::initialize(const System& system, const BrownianIntegrator& integrator) {
-//    cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
+    cu.setAsCurrent();
-//    map<string, string> defines;
+    cu.getPlatformData().initializeContexts(system);
-//    defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
+    cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
-//    CUmodule module = cu.createModule(CudaKernelSources::brownian, defines, "");
+    map<string, string> defines;
-//    kernel1 = cu.getKernel(module, "integrateBrownianPart1");
+    defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
-//    kernel2 = cu.getKernel(module, "integrateBrownianPart2");
+    defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
-//    prevStepSize = -1.0;
+    CUmodule module = cu.createModule(CudaKernelSources::brownian, defines, "");
-//}
+    kernel1 = cu.getKernel(module, "integrateBrownianPart1");
-//
+    kernel2 = cu.getKernel(module, "integrateBrownianPart2");
-//void CudaIntegrateBrownianStepKernel::execute(ContextImpl& context, const BrownianIntegrator& integrator) {
+    prevStepSize = -1.0;
-//    CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
+}
-//    int numAtoms = cu.getNumAtoms();
-//    if (!hasInitializedKernels) {
+void CudaIntegrateBrownianStepKernel::execute(ContextImpl& context, const BrownianIntegrator& integrator) {
-//        hasInitializedKernels = true;
+    CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
-//        kernel1.setArg<cu::Buffer>(2, cu.getForce().getDevicePointer());
+    int numAtoms = cu.getNumAtoms();
-//        kernel1.setArg<cu::Buffer>(3, integration.getPosDelta().getDevicePointer());
+    double temperature = integrator.getTemperature();
-//        kernel1.setArg<cu::Buffer>(4, cu.getVelm().getDevicePointer());
+    double friction = integrator.getFriction();
-//        kernel1.setArg<cu::Buffer>(5, integration.getRandom().getDevicePointer());
+    double stepSize = integrator.getStepSize();
-//        kernel2.setArg<cu::Buffer>(1, cu.getPosq().getDevicePointer());
+    double tau = (friction == 0.0 ? 0.0 : 1.0/friction);
-//        kernel2.setArg<cu::Buffer>(2, cu.getVelm().getDevicePointer());
+    double tauDt = tau*stepSize;
-//        kernel2.setArg<cu::Buffer>(3, integration.getPosDelta().getDevicePointer());
+    double noise = sqrt(2.0f*BOLTZ*temperature*stepSize*tau);
-//    }
+    float stepSizeFloat = (float) stepSize;
-//    double temperature = integrator.getTemperature();
+    float tauDtFloat = (float) tauDt;
-//    double friction = integrator.getFriction();
+    float noiseFloat = (float) noise;
-//    double stepSize = integrator.getStepSize();
-//    if (temperature != prevTemp || friction != prevFriction || stepSize != prevStepSize) {
+    // Call the first integration kernel.
-//        double tau = (friction == 0.0 ? 0.0 : 1.0/friction);
-//        kernel1.setArg<cl_float>(0, (cl_float) (tau*stepSize));
+    int randomIndex = integration.prepareRandomNumbers(cu.getPaddedNumAtoms());
-//        kernel1.setArg<cl_float>(1, (cl_float) (sqrt(2.0f*BOLTZ*temperature*stepSize*tau)));
+    void* args1[] = {cu.getUseDoublePrecision() ? (void*) &tauDt : (void*) &tauDtFloat,
-//        kernel2.setArg<cl_float>(0, (cl_float) (1.0/stepSize));
+            cu.getUseDoublePrecision() ? (void*) &noise : (void*) &noiseFloat,
-//        prevTemp = temperature;
+            &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer(),
-//        prevFriction = friction;
+            &cu.getVelm().getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex};
-//        prevStepSize = stepSize;
+    cu.executeKernel(kernel1, args1, numAtoms);
-//    }
-//
+    // Apply constraints.
-//    // Call the first integration kernel.
-//
+    integration.applyConstraints(integrator.getConstraintTolerance());
-//    kernel1.setArg<cl_uint>(6, integration.prepareRandomNumbers(cu.getPaddedNumAtoms()));
-//    cu.executeKernel(kernel1, numAtoms);
+    // Call the second integration kernel.
-//
-//    // Apply constraints.
+    void* args2[] = {cu.getUseDoublePrecision() ? (void*) &stepSize : (void*) &stepSizeFloat,
-//
+            &cu.getPosq().getDevicePointer(), &cu.getVelm().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
-//    integration.applyConstraints(integrator.getConstraintTolerance());
+    cu.executeKernel(kernel2, args2, numAtoms);
-//
+    integration.computeVirtualSites();
-//    // Call the second integration kernel.
-//
+    // Update the time and step count.
-//    cu.executeKernel(kernel2, numAtoms);
-//    integration.computeVirtualSites();
+    cu.setTime(cu.getTime()+stepSize);
-//
+    cu.setStepCount(cu.getStepCount()+1);
-//    // Update the time and step count.
+}
-//
-//    cu.setTime(cu.getTime()+stepSize);
+CudaIntegrateVariableVerletStepKernel::~CudaIntegrateVariableVerletStepKernel() {
-//    cu.setStepCount(cu.getStepCount()+1);
+}
-//}
-//
+void CudaIntegrateVariableVerletStepKernel::initialize(const System& system, const VariableVerletIntegrator& integrator) {
-//CudaIntegrateVariableVerletStepKernel::~CudaIntegrateVariableVerletStepKernel() {
+    cu.setAsCurrent();
-//}
+    cu.getPlatformData().initializeContexts(system);
-//
+    map<string, string> defines;
-//void CudaIntegrateVariableVerletStepKernel::initialize(const System& system, const VariableVerletIntegrator& integrator) {
+    defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
-//    cuCtxSetCurrent(cu.getContext());
+    defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
-//    cu.getPlatformData().initializeContexts(system);
+    CUmodule module = cu.createModule(CudaKernelSources::verlet, defines, "");
-//    CUmodule module = cu.createModule(CudaKernelSources::verlet, "");
+    kernel1 = cu.getKernel(module, "integrateVerletPart1");
-//    kernel1 = cu.getKernel(module, "integrateVerletPart1");
+    kernel2 = cu.getKernel(module, "integrateVerletPart2");
-//    kernel2 = cu.getKernel(module, "integrateVerletPart2");
+    selectSizeKernel = cu.getKernel(module, "selectVerletStepSize");
-//    selectSizeKernel = cu.getKernel(module, "selectVerletStepSize");
+    blockSize = min(256, system.getNumParticles());
-//    blockSize = min(min(256, system.getNumParticles()), (int) cu.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
+}
-//}
-//
+double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, const VariableVerletIntegrator& integrator, double maxTime) {
-//double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, const VariableVerletIntegrator& integrator, double maxTime) {
+    CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
-//    CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
+    int numAtoms = cu.getNumAtoms();
-//    int numAtoms = cu.getNumAtoms();
-//    if (!hasInitializedKernels) {
+    // Select the step size to use.
-//        hasInitializedKernels = true;
-//        kernel1.setArg<cl_int>(0, numAtoms);
+    double maxStepSize = maxTime-cu.getTime();
-//        kernel1.setArg<cu::Buffer>(1, cu.getIntegrationUtilities().getStepSize().getDevicePointer());
+    float maxStepSizeFloat = (float) maxStepSize;
-//        kernel1.setArg<cu::Buffer>(2, cu.getPosq().getDevicePointer());
+    double tol = integrator.getErrorTolerance();
-//        kernel1.setArg<cu::Buffer>(3, cu.getVelm().getDevicePointer());
+    float tolFloat = (float) tol;
-//        kernel1.setArg<cu::Buffer>(4, cu.getForce().getDevicePointer());
+    void* argsSelect[] = {cu.getUseDoublePrecision() ? (void*) &maxStepSize : (void*) &maxStepSizeFloat,
-//        kernel1.setArg<cu::Buffer>(5, integration.getPosDelta().getDevicePointer());
+            cu.getUseDoublePrecision() ? (void*) &tol : (void*) &tolFloat,
-//        kernel2.setArg<cl_int>(0, numAtoms);
+            &cu.getIntegrationUtilities().getStepSize().getDevicePointer(),
-//        kernel2.setArg<cu::Buffer>(1, cu.getIntegrationUtilities().getStepSize().getDevicePointer());
+            &cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer()};
-//        kernel2.setArg<cu::Buffer>(2, cu.getPosq().getDevicePointer());
+    int sharedSize = blockSize*(cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
-//        kernel2.setArg<cu::Buffer>(3, cu.getVelm().getDevicePointer());
+    cu.executeKernel(selectSizeKernel, argsSelect, blockSize, blockSize, sharedSize);
-//        kernel2.setArg<cu::Buffer>(4, integration.getPosDelta().getDevicePointer());
-//        selectSizeKernel.setArg<cl_int>(0, numAtoms);
+    // Call the first integration kernel.
-//        selectSizeKernel.setArg<cu::Buffer>(3, cu.getIntegrationUtilities().getStepSize().getDevicePointer());
-//        selectSizeKernel.setArg<cu::Buffer>(4, cu.getVelm().getDevicePointer());
+    void* args1[] = {&cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(),
-//        selectSizeKernel.setArg<cu::Buffer>(5, cu.getForce().getDevicePointer());
+            &cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
-//        selectSizeKernel.setArg(6, blockSize*sizeof(cl_float), NULL);
+    cu.executeKernel(kernel1, args1, numAtoms);
-//    }
-//
+    // Apply constraints.
-//    // Select the step size to use.
-//
+    integration.applyConstraints(integrator.getConstraintTolerance());
-//    float maxStepSize = (float)(maxTime-cu.getTime());
-//    selectSizeKernel.setArg<cl_float>(1, maxStepSize);
+    // Call the second integration kernel.
-//    selectSizeKernel.setArg<cl_float>(2, (cl_float) integrator.getErrorTolerance());
-//    cu.executeKernel(selectSizeKernel, blockSize, blockSize);
+    void* args2[] = {&cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(),
-//
+            &cu.getVelm().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
-//    // Call the first integration kernel.
+    cu.executeKernel(kernel2, args2, numAtoms);
-//
+    integration.computeVirtualSites();
-//    cu.executeKernel(kernel1, numAtoms);
-//
+    // Update the time and step count.
-//    // Apply constraints.
-//
+    double dt, time;
-//    integration.applyConstraints(integrator.getConstraintTolerance());
+    if (cu.getUseDoublePrecision()) {
-//
+        double2 stepSize;
-//    // Call the second integration kernel.
+        cu.getIntegrationUtilities().getStepSize().download(&stepSize);
-//
+        dt = stepSize.y;
-//    cu.executeKernel(kernel2, numAtoms);
+        time = cu.getTime()+dt;
-//    integration.computeVirtualSites();
+        if (dt == maxStepSize)
-//
+            time = maxTime; // Avoid round-off error
-//    // Update the time and step count.
+    }
-//
+    else {
-//    cu.getIntegrationUtilities().getStepSize().download();
+        float2 stepSize;
-//    double dt = cu.getIntegrationUtilities().getStepSize()[0].y;
+        cu.getIntegrationUtilities().getStepSize().download(&stepSize);
-//    double time = cu.getTime()+dt;
+        dt = stepSize.y;
-//    if (dt == maxStepSize)
+        time = cu.getTime()+dt;
-//        time = maxTime; // Avoid round-off error
+        if (dt == maxStepSizeFloat)
-//    cu.setTime(time);
+            time = maxTime; // Avoid round-off error
-//    cu.setStepCount(cu.getStepCount()+1);
+    }
-//    return dt;
+    cu.setTime(time);
-//}
+    cu.setStepCount(cu.getStepCount()+1);
-//
+    return dt;
-//CudaIntegrateVariableLangevinStepKernel::~CudaIntegrateVariableLangevinStepKernel() {
+}
-//    cuCtxSetCurrent(cu.getContext());
-//    if (params != NULL)
+CudaIntegrateVariableLangevinStepKernel::~CudaIntegrateVariableLangevinStepKernel() {
-//        delete params;
+    cu.setAsCurrent();
-//}
+    if (params != NULL)
-//
+        delete params;
-//void CudaIntegrateVariableLangevinStepKernel::initialize(const System& system, const VariableLangevinIntegrator& integrator) {
+}
-//    cuCtxSetCurrent(cu.getContext());
-//    cu.getPlatformData().initializeContexts(system);
+void CudaIntegrateVariableLangevinStepKernel::initialize(const System& system, const VariableLangevinIntegrator& integrator) {
-//    cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
+    cu.setAsCurrent();
-//    map<string, string> defines;
+    cu.getPlatformData().initializeContexts(system);
-//    defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
+    cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
-//    defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
+    map<string, string> defines;
-//    CUmodule module = cu.createModule(CudaKernelSources::langevin, defines, "");
+    defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
-//    kernel1 = cu.getKernel(module, "integrateLangevinPart1");
+    defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
-//    kernel2 = cu.getKernel(module, "integrateLangevinPart2");
+    CUmodule module = cu.createModule(CudaKernelSources::langevin, defines, "");
-//    selectSizeKernel = cu.getKernel(module, "selectLangevinStepSize");
+    kernel1 = cu.getKernel(module, "integrateLangevinPart1");
-//    params = new CudaArray<cl_float>(cu, 3, "langevinParams");
+    kernel2 = cu.getKernel(module, "integrateLangevinPart2");
-//    blockSize = min(256, system.getNumParticles());
+    selectSizeKernel = cu.getKernel(module, "selectLangevinStepSize");
-//    blockSize = max(blockSize, params->getSize());
+    params = CudaArray::create<float>(cu, 3, "langevinParams");
-//    blockSize = min(blockSize, (int) cu.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
+    blockSize = min(256, system.getNumParticles());
-//}
+    blockSize = max(blockSize, params->getSize());
-//
+}
-//double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, const VariableLangevinIntegrator& integrator, double maxTime) {
-//    CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
+double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, const VariableLangevinIntegrator& integrator, double maxTime) {
-//    int numAtoms = cu.getNumAtoms();
+    CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
-//    if (!hasInitializedKernels) {
+    int numAtoms = cu.getNumAtoms();
-//        hasInitializedKernels = true;
-//        kernel1.setArg<cu::Buffer>(0, cu.getVelm().getDevicePointer());
+    // Select the step size to use.
-//        kernel1.setArg<cu::Buffer>(1, cu.getForce().getDevicePointer());
-//        kernel1.setArg<cu::Buffer>(2, integration.getPosDelta().getDevicePointer());
+    double maxStepSize = maxTime-cu.getTime();
-//        kernel1.setArg<cu::Buffer>(3, params->getDevicePointer());
+    float maxStepSizeFloat = (float) maxStepSize;
-//        kernel1.setArg<cu::Buffer>(4, integration.getStepSize().getDevicePointer());
+    double tol = integrator.getErrorTolerance();
-//        kernel1.setArg<cu::Buffer>(5, integration.getRandom().getDevicePointer());
+    float tolFloat = (float) tol;
-//        kernel2.setArg<cu::Buffer>(0, cu.getPosq().getDevicePointer());
+    double tau = integrator.getFriction() == 0.0 ? 0.0 : 1.0/integrator.getFriction();
-//        kernel2.setArg<cu::Buffer>(1, integration.getPosDelta().getDevicePointer());
+    float tauFloat = (float) tau;
-//        kernel2.setArg<cu::Buffer>(2, cu.getVelm().getDevicePointer());
+    double kT = BOLTZ*integrator.getTemperature();
-//        kernel2.setArg<cu::Buffer>(3, integration.getStepSize().getDevicePointer());
+    float kTFloat = (float) kT;
-//        selectSizeKernel.setArg<cu::Buffer>(4, integration.getStepSize().getDevicePointer());
+    void* argsSelect[] = {cu.getUseDoublePrecision() ? (void*) &maxStepSize : (void*) &maxStepSizeFloat,
-//        selectSizeKernel.setArg<cu::Buffer>(5, cu.getVelm().getDevicePointer());
+            cu.getUseDoublePrecision() ? (void*) &tol : (void*) &tolFloat,
-//        selectSizeKernel.setArg<cu::Buffer>(6, cu.getForce().getDevicePointer());
+            cu.getUseDoublePrecision() ? (void*) &tau : (void*) &tauFloat,
-//        selectSizeKernel.setArg<cu::Buffer>(7, params->getDevicePointer());
+            cu.getUseDoublePrecision() ? (void*) &kT : (void*) &kTFloat,
-//        selectSizeKernel.setArg(8, params->getSize()*sizeof(cl_float), NULL);
+            &cu.getIntegrationUtilities().getStepSize().getDevicePointer(),
-//        selectSizeKernel.setArg(9, blockSize*sizeof(cl_float), NULL);
+            &cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &params->getDevicePointer()};
-//    }
+    int sharedSize = blockSize*(cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
-//
+    cu.executeKernel(selectSizeKernel, argsSelect, blockSize, blockSize, sharedSize);
-//    // Select the step size to use.
-//
+    // Call the first integration kernel.
-//    float maxStepSize = (float)(maxTime-cu.getTime());
-//    selectSizeKernel.setArg<cl_float>(0, maxStepSize);
+    int randomIndex = integration.prepareRandomNumbers(cu.getPaddedNumAtoms());
-//    selectSizeKernel.setArg<cl_float>(1, (cl_float) integrator.getErrorTolerance());
+    void* args1[] = {&cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer(),
-//    selectSizeKernel.setArg<cl_float>(2, (cl_float) (integrator.getFriction() == 0.0 ? 0.0 : 1.0/integrator.getFriction()));
+            &params->getDevicePointer(), &integration.getStepSize().getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex};
-//    selectSizeKernel.setArg<cl_float>(3, (cl_float) (BOLTZ*integrator.getTemperature()));
+    cu.executeKernel(kernel1, args1, numAtoms);
-//    cu.executeKernel(selectSizeKernel, blockSize, blockSize);
-//
+    // Apply constraints.
-//    // Call the first integration kernel.
-//
+    integration.applyConstraints(integrator.getConstraintTolerance());
-//    kernel1.setArg<cl_uint>(6, integration.prepareRandomNumbers(cu.getPaddedNumAtoms()));
-//    cu.executeKernel(kernel1, numAtoms);
+    // Call the second integration kernel.
-//
-//    // Apply constraints.
+    void* args2[] = {&cu.getPosq().getDevicePointer(), &integration.getPosDelta().getDevicePointer(),
-//
+            &cu.getVelm().getDevicePointer(), &integration.getStepSize().getDevicePointer()};
-//    integration.applyConstraints(integrator.getConstraintTolerance());
+    cu.executeKernel(kernel2, args2, numAtoms);
-//
+    integration.computeVirtualSites();
-//    // Call the second integration kernel.
-//
+    // Update the time and step count.
-//    cu.executeKernel(kernel2, numAtoms);
-//    integration.computeVirtualSites();
+    double dt, time;
-//
+    if (cu.getUseDoublePrecision()) {
-//    // Update the time and step count.
+        double2 stepSize;
-//
+        cu.getIntegrationUtilities().getStepSize().download(&stepSize);
-//    cu.getIntegrationUtilities().getStepSize().download();
+        dt = stepSize.y;
-//    double dt = cu.getIntegrationUtilities().getStepSize()[0].y;
+        time = cu.getTime()+dt;
-//    double time = cu.getTime()+dt;
+        if (dt == maxStepSize)
-//    if (dt == maxStepSize)
+            time = maxTime; // Avoid round-off error
-//        time = maxTime; // Avoid round-off error
+    }
-//    cu.setTime(time);
+    else {
-//    cu.setStepCount(cu.getStepCount()+1);
+        float2 stepSize;
-//    return dt;
+        cu.getIntegrationUtilities().getStepSize().download(&stepSize);
-//}
+        dt = stepSize.y;
-//
+        time = cu.getTime()+dt;
+        if (dt == maxStepSizeFloat)
+            time = maxTime; // Avoid round-off error
+    }
+    cu.setTime(time);
+    cu.setStepCount(cu.getStepCount()+1);
+    return dt;
+}
 //class CudaIntegrateCustomStepKernel::ReorderListener : public CudaContext::ReorderListener {
 //public:
 //    ReorderListener(CudaContext& cu, CudaParameterSet& perDofValues, vector<vector<cl_float> >& localPerDofValues, bool& deviceValuesAreCurrent) :
@@ -4433,7 +4440,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
 //};
 //
 //CudaIntegrateCustomStepKernel::~CudaIntegrateCustomStepKernel() {
-//    cuCtxSetCurrent(cu.getContext());
+//    cu.setAsCurrent();
 //    if (globalValues != NULL)
 //        delete globalValues;
 //    if (contextParameterValues != NULL)
@@ -4451,7 +4458,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
 //}
 //
 //void CudaIntegrateCustomStepKernel::initialize(const System& system, const CustomIntegrator& integrator) {
-//    cuCtxSetCurrent(cu.getContext());
+//    cu.setAsCurrent();
 //    cu.getPlatformData().initializeContexts(system);
 //    cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
 //    numGlobalVariables = integrator.getNumGlobalVariables();
@@ -4956,13 +4963,13 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
 //}
 //
 //CudaApplyAndersenThermostatKernel::~CudaApplyAndersenThermostatKernel() {
-//    cuCtxSetCurrent(cu.getContext());
+//    cu.setAsCurrent();
 //    if (atomGroups != NULL)
 //        delete atomGroups;
 //}
 //
 //void CudaApplyAndersenThermostatKernel::initialize(const System& system, const AndersenThermostat& thermostat) {
-//    cuCtxSetCurrent(cu.getContext());
+//    cu.setAsCurrent();
 //    randomSeed = thermostat.getRandomNumberSeed();
 //    map<string, string> defines;
 //    defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
@@ -4997,7 +5004,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
 //}
 //
 //CudaApplyMonteCarloBarostatKernel::~CudaApplyMonteCarloBarostatKernel() {
-//    cuCtxSetCurrent(cu.getContext());
+//    cu.setAsCurrent();
 //    if (savedPositions != NULL)
 //        delete savedPositions;
 //    if (moleculeAtoms != NULL)
@@ -5007,7 +5014,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
 //}
 //
 //void CudaApplyMonteCarloBarostatKernel::initialize(const System& system, const MonteCarloBarostat& thermostat) {
-//    cuCtxSetCurrent(cu.getContext());
+//    cu.setAsCurrent();
 //    savedPositions = new CudaArray<mm_float4>(cu, cu.getPaddedNumAtoms(), "savedPositions");
 //    CUmodule module = cu.createModule(CudaKernelSources::monteCarloBarostat);
 //    kernel = cu.getKernel(module, "scalePositions");
@@ -5056,7 +5063,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
 //}
 void CudaCalcKineticEnergyKernel::initialize(const System& system) {
-    cuCtxSetCurrent(cu.getContext());
+    cu.setAsCurrent();
    int numParticles = system.getNumParticles();
    masses.resize(numParticles);
    for (int i = 0; i < numParticles; ++i)
@@ -5089,13 +5096,13 @@ double CudaCalcKineticEnergyKernel::execute(ContextImpl& context) {
 }
 //CudaRemoveCMMotionKernel::~CudaRemoveCMMotionKernel() {
-//    cuCtxSetCurrent(cu.getContext());
+//    cu.setAsCurrent();
 //    if (cmMomentum != NULL)
 //        delete cmMomentum;
 //}
 //
 //void CudaRemoveCMMotionKernel::initialize(const System& system, const CMMotionRemover& force) {
-//    cuCtxSetCurrent(cu.getContext());
+//    cu.setAsCurrent();
 //    frequency = force.getFrequency();
 //    int numAtoms = cu.getNumAtoms();
 //    cmMomentum = new CudaArray<mm_float4>(cu, (numAtoms+CudaContext::ThreadBlockSize-1)/CudaContext::ThreadBlockSize, "cmMomentum");

--- a/platforms/cuda2/src/CudaKernels.h
+++ b/platforms/cuda2/src/CudaKernels.h
@@ -942,133 +942,126 @@ private:
    CUfunction kernel1, kernel2;
 };
-///**
+/**
-// * This kernel is invoked by LangevinIntegrator to take one time step.
+ * This kernel is invoked by LangevinIntegrator to take one time step.
-// */
+ */
-//class CudaIntegrateLangevinStepKernel : public IntegrateLangevinStepKernel {
+class CudaIntegrateLangevinStepKernel : public IntegrateLangevinStepKernel {
-//public:
+public:
-//    CudaIntegrateLangevinStepKernel(std::string name, const Platform& platform, CudaContext& cu) : IntegrateLangevinStepKernel(name, platform), cu(cu),
+    CudaIntegrateLangevinStepKernel(std::string name, const Platform& platform, CudaContext& cu) : IntegrateLangevinStepKernel(name, platform), cu(cu), params(NULL) {
-//            hasInitializedKernels(false), params(NULL) {
+    }
-//    }
+    ~CudaIntegrateLangevinStepKernel();
-//    ~CudaIntegrateLangevinStepKernel();
+    /**
-//    /**
+     * Initialize the kernel, setting up the particle masses.
-//     * Initialize the kernel, setting up the particle masses.
+     *
-//     *
+     * @param system     the System this kernel will be applied to
-//     * @param system     the System this kernel will be applied to
+     * @param integrator the LangevinIntegrator this kernel will be used for
-//     * @param integrator the LangevinIntegrator this kernel will be used for
+     */
-//     */
+    void initialize(const System& system, const LangevinIntegrator& integrator);
-//    void initialize(const System& system, const LangevinIntegrator& integrator);
+    /**
-//    /**
+     * Execute the kernel.
-//     * Execute the kernel.
+     *
-//     *
+     * @param context    the context in which to execute this kernel
-//     * @param context    the context in which to execute this kernel
+     * @param integrator the LangevinIntegrator this kernel is being used for
-//     * @param integrator the LangevinIntegrator this kernel is being used for
+     */
-//     */
+    void execute(ContextImpl& context, const LangevinIntegrator& integrator);
-//    void execute(ContextImpl& context, const LangevinIntegrator& integrator);
+private:
-//private:
+    CudaContext& cu;
-//    CudaContext& cu;
+    double prevTemp, prevFriction, prevStepSize;
-//    double prevTemp, prevFriction, prevStepSize;
+    CudaArray* params;
-//    bool hasInitializedKernels;
+    CUfunction kernel1, kernel2;
-//    CudaArray<cl_float>* params;
+};
-//    CUfunction kernel1, kernel2;
-//};
+/**
-//
+ * This kernel is invoked by BrownianIntegrator to take one time step.
-///**
+ */
-// * This kernel is invoked by BrownianIntegrator to take one time step.
+class CudaIntegrateBrownianStepKernel : public IntegrateBrownianStepKernel {
-// */
+public:
-//class CudaIntegrateBrownianStepKernel : public IntegrateBrownianStepKernel {
+    CudaIntegrateBrownianStepKernel(std::string name, const Platform& platform, CudaContext& cu) : IntegrateBrownianStepKernel(name, platform), cu(cu) {
-//public:
+    }
-//    CudaIntegrateBrownianStepKernel(std::string name, const Platform& platform, CudaContext& cu) : IntegrateBrownianStepKernel(name, platform), cu(cu),
+    ~CudaIntegrateBrownianStepKernel();
-//            hasInitializedKernels(false), prevTemp(-1), prevFriction(-1), prevStepSize(-1) {
+    /**
-//    }
+     * Initialize the kernel.
-//    ~CudaIntegrateBrownianStepKernel();
+     *
-//    /**
+     * @param system     the System this kernel will be applied to
-//     * Initialize the kernel.
+     * @param integrator the BrownianIntegrator this kernel will be used for
-//     *
+     */
-//     * @param system     the System this kernel will be applied to
+    void initialize(const System& system, const BrownianIntegrator& integrator);
-//     * @param integrator the BrownianIntegrator this kernel will be used for
+    /**
-//     */
+     * Execute the kernel.
-//    void initialize(const System& system, const BrownianIntegrator& integrator);
+     *
-//    /**
+     * @param context    the context in which to execute this kernel
-//     * Execute the kernel.
+     * @param integrator the BrownianIntegrator this kernel is being used for
-//     *
+     */
-//     * @param context    the context in which to execute this kernel
+    void execute(ContextImpl& context, const BrownianIntegrator& integrator);
-//     * @param integrator the BrownianIntegrator this kernel is being used for
+private:
-//     */
+    CudaContext& cu;
-//    void execute(ContextImpl& context, const BrownianIntegrator& integrator);
+    double prevTemp, prevFriction, prevStepSize;
-//private:
+    CUfunction kernel1, kernel2;
-//    CudaContext& cu;
+};
-//    double prevTemp, prevFriction, prevStepSize;
-//    bool hasInitializedKernels;
+/**
-//    CUfunction kernel1, kernel2;
+ * This kernel is invoked by VariableVerletIntegrator to take one time step.
-//};
+ */
-//
+class CudaIntegrateVariableVerletStepKernel : public IntegrateVariableVerletStepKernel {
-///**
+public:
-// * This kernel is invoked by VariableVerletIntegrator to take one time step.
+    CudaIntegrateVariableVerletStepKernel(std::string name, const Platform& platform, CudaContext& cu) : IntegrateVariableVerletStepKernel(name, platform), cu(cu) {
-// */
+    }
-//class CudaIntegrateVariableVerletStepKernel : public IntegrateVariableVerletStepKernel {
+    ~CudaIntegrateVariableVerletStepKernel();
-//public:
+    /**
-//    CudaIntegrateVariableVerletStepKernel(std::string name, const Platform& platform, CudaContext& cu) : IntegrateVariableVerletStepKernel(name, platform), cu(cu),
+     * Initialize the kernel.
-//            hasInitializedKernels(false) {
+     *
-//    }
+     * @param system     the System this kernel will be applied to
-//    ~CudaIntegrateVariableVerletStepKernel();
+     * @param integrator the VerletIntegrator this kernel will be used for
-//    /**
+     */
-//     * Initialize the kernel.
+    void initialize(const System& system, const VariableVerletIntegrator& integrator);
-//     *
+    /**
-//     * @param system     the System this kernel will be applied to
+     * Execute the kernel.
-//     * @param integrator the VerletIntegrator this kernel will be used for
+     *
-//     */
+     * @param context    the context in which to execute this kernel
-//    void initialize(const System& system, const VariableVerletIntegrator& integrator);
+     * @param integrator the VerletIntegrator this kernel is being used for
-//    /**
+     * @param maxTime    the maximum time beyond which the simulation should not be advanced
-//     * Execute the kernel.
+     * @return the size of the step that was taken
-//     *
+     */
-//     * @param context    the context in which to execute this kernel
+    double execute(ContextImpl& context, const VariableVerletIntegrator& integrator, double maxTime);
-//     * @param integrator the VerletIntegrator this kernel is being used for
+private:
-//     * @param maxTime    the maximum time beyond which the simulation should not be advanced
+    CudaContext& cu;
-//     * @return the size of the step that was taken
+    int blockSize;
-//     */
+    CUfunction kernel1, kernel2, selectSizeKernel;
-//    double execute(ContextImpl& context, const VariableVerletIntegrator& integrator, double maxTime);
+};
-//private:
-//    CudaContext& cu;
+/**
-//    bool hasInitializedKernels;
+ * This kernel is invoked by VariableLangevinIntegrator to take one time step.
-//    int blockSize;
+ */
-//    CUfunction kernel1, kernel2, selectSizeKernel;
+class CudaIntegrateVariableLangevinStepKernel : public IntegrateVariableLangevinStepKernel {
-//};
+public:
-//
+    CudaIntegrateVariableLangevinStepKernel(std::string name, const Platform& platform, CudaContext& cu) : IntegrateVariableLangevinStepKernel(name, platform),
-///**
+            cu(cu), params(NULL) {
-// * This kernel is invoked by VariableLangevinIntegrator to take one time step.
+    }
-// */
+    ~CudaIntegrateVariableLangevinStepKernel();
-//class CudaIntegrateVariableLangevinStepKernel : public IntegrateVariableLangevinStepKernel {
+    /**
-//public:
+     * Initialize the kernel, setting up the particle masses.
-//    CudaIntegrateVariableLangevinStepKernel(std::string name, const Platform& platform, CudaContext& cu) : IntegrateVariableLangevinStepKernel(name, platform), cu(cu),
+     *
-//            hasInitializedKernels(false), params(NULL) {
+     * @param system     the System this kernel will be applied to
-//    }
+     * @param integrator the VariableLangevinIntegrator this kernel will be used for
-//    ~CudaIntegrateVariableLangevinStepKernel();
+     */
-//    /**
+    void initialize(const System& system, const VariableLangevinIntegrator& integrator);
-//     * Initialize the kernel, setting up the particle masses.
+    /**
-//     *
+     * Execute the kernel.
-//     * @param system     the System this kernel will be applied to
+     *
-//     * @param integrator the VariableLangevinIntegrator this kernel will be used for
+     * @param context    the context in which to execute this kernel
-//     */
+     * @param integrator the VariableLangevinIntegrator this kernel is being used for
-//    void initialize(const System& system, const VariableLangevinIntegrator& integrator);
+     * @param maxTime    the maximum time beyond which the simulation should not be advanced
-//    /**
+     * @return the size of the step that was taken
-//     * Execute the kernel.
+     */
-//     *
+    double execute(ContextImpl& context, const VariableLangevinIntegrator& integrator, double maxTime);
-//     * @param context    the context in which to execute this kernel
+private:
-//     * @param integrator the VariableLangevinIntegrator this kernel is being used for
+    CudaContext& cu;
-//     * @param maxTime    the maximum time beyond which the simulation should not be advanced
+    int blockSize;
-//     * @return the size of the step that was taken
+    CudaArray* params;
-//     */
+    CUfunction kernel1, kernel2, selectSizeKernel;
-//    double execute(ContextImpl& context, const VariableLangevinIntegrator& integrator, double maxTime);
+    double prevTemp, prevFriction, prevErrorTol;
-//private:
+};
-//    CudaContext& cu;
-//    bool hasInitializedKernels;
-//    int blockSize;
-//    CudaArray<cl_float>* params;
-//    CUfunction kernel1, kernel2, selectSizeKernel;
-//    double prevTemp, prevFriction, prevErrorTol;
-//};
-//
 ///**
 // * This kernel is invoked by CustomIntegrator to take one time step.
 // */

--- a/platforms/cuda2/src/CudaNonbondedUtilities.cpp
+++ b/platforms/cuda2/src/CudaNonbondedUtilities.cpp
@@ -169,14 +169,14 @@ void CudaNonbondedUtilities::initialize(const System& system) {
        exclusionIndicesVec.push_back(iter->second);
    }
    exclusionRowIndicesVec[++currentRow] = exclusionIndicesVec.size();
-    exclusionIndices = CudaArray::create<unsigned int>(exclusionIndicesVec.size(), "exclusionIndices");
+    exclusionIndices = CudaArray::create<unsigned int>(context, exclusionIndicesVec.size(), "exclusionIndices");
-    exclusionRowIndices = CudaArray::create<unsigned int>(exclusionRowIndicesVec.size(), "exclusionRowIndices");
+    exclusionRowIndices = CudaArray::create<unsigned int>(context, exclusionRowIndicesVec.size(), "exclusionRowIndices");
    exclusionIndices->upload(exclusionIndicesVec);
    exclusionRowIndices->upload(exclusionRowIndicesVec);
    // Record the exclusion data.
-    exclusions = CudaArray::create<unsigned int>(tilesWithExclusions.size()*CudaContext::TileSize, "exclusions");
+    exclusions = CudaArray::create<unsigned int>(context, tilesWithExclusions.size()*CudaContext::TileSize, "exclusions");
    vector<unsigned int> exclusionVec(exclusions->getSize());
    for (int i = 0; i < exclusions->getSize(); ++i)
        exclusionVec[i] = 0xFFFFFFFF;
@@ -231,11 +231,11 @@ void CudaNonbondedUtilities::initialize(const System& system) {
            maxTiles = numTiles;
        if (maxTiles < 1)
            maxTiles = 1;
-        interactingTiles = CudaArray::create<ushort2>(maxTiles, "interactingTiles");
+        interactingTiles = CudaArray::create<ushort2>(context, maxTiles, "interactingTiles");
-        interactionFlags = CudaArray::create<unsigned int>(maxTiles, "interactionFlags");
+        interactionFlags = CudaArray::create<unsigned int>(context, maxTiles, "interactionFlags");
-        interactionCount = CudaArray::create<unsigned int>(1, "interactionCount");
+        interactionCount = CudaArray::create<unsigned int>(context, 1, "interactionCount");
-        blockCenter = CudaArray::create<float4>(numAtomBlocks, "blockCenter");
+        blockCenter = CudaArray::create<float4>(context, numAtomBlocks, "blockCenter");
-        blockBoundingBox = CudaArray::create<float4>(numAtomBlocks, "blockBoundingBox");
+        blockBoundingBox = CudaArray::create<float4>(context, numAtomBlocks, "blockBoundingBox");
        CHECK_RESULT(cuMemHostAlloc((void**) &pinnedInteractionCount, sizeof(unsigned int), 0));
        pinnedInteractionCount[0] = 0;
        interactionCount->upload(pinnedInteractionCount);
@@ -330,11 +330,11 @@ void CudaNonbondedUtilities::updateNeighborListSize() {
    if (maxTiles > numTiles)
        maxTiles = numTiles;
    delete interactingTiles;
-    interactingTiles = CudaArray::create<ushort2>(maxTiles, "interactingTiles");
+    interactingTiles = CudaArray::create<ushort2>(context, maxTiles, "interactingTiles");
    forceArgs[8] = &interactingTiles->getDevicePointer();
    findInteractingBlocksArgs[5] = &interactingTiles->getDevicePointer();
    delete interactionFlags;
-    interactionFlags = CudaArray::create<unsigned int>(maxTiles, "interactionFlags");
+    interactionFlags = CudaArray::create<unsigned int>(context, maxTiles, "interactionFlags");
    forceArgs[13] = &interactionFlags->getDevicePointer();
    findInteractingBlocksArgs[6] = &interactionFlags->getDevicePointer();
    findInteractionsWithinBlocksArgs[3] = &interactingTiles->getDevicePointer();

--- a/platforms/cuda2/src/CudaSort.cpp
+++ b/platforms/cuda2/src/CudaSort.cpp
@@ -73,11 +73,11 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
    // Create workspace arrays.
-    dataRange = new CudaArray(2, trait->getKeySize(), "sortDataRange");
+    dataRange = new CudaArray(context, 2, trait->getKeySize(), "sortDataRange");
-    bucketOffset = CudaArray::create<uint1>(numBuckets, "bucketOffset");
+    bucketOffset = CudaArray::create<uint1>(context, numBuckets, "bucketOffset");
-    bucketOfElement = CudaArray::create<uint1>(length, "bucketOfElement");
+    bucketOfElement = CudaArray::create<uint1>(context, length, "bucketOfElement");
-    offsetInBucket = CudaArray::create<uint1>(length, "offsetInBucket");
+    offsetInBucket = CudaArray::create<uint1>(context, length, "offsetInBucket");
-    buckets = new CudaArray(length, trait->getDataSize(), "buckets");
+    buckets = new CudaArray(context, length, trait->getDataSize(), "buckets");
 }
 CudaSort::~CudaSort() {

--- a/platforms/cuda2/src/kernels/brownian.cu
+++ b/platforms/cuda2/src/kernels/brownian.cu
+/**
+ * Perform the first step of Brownian integration.
+ */
+extern "C" __global__ void integrateBrownianPart1(real tauDeltaT, real noiseAmplitude, const long long* __restrict__ force,
+        real4* __restrict__ posDelta, const real4* __restrict__ velm, const float4* __restrict__ random, unsigned int randomIndex) {
+    randomIndex += blockIdx.x*blockDim.x+threadIdx.x;
+    const real fscale = tauDeltaT/(real) 0xFFFFFFFF;
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
+        real invMass = velm[index].w;
+        if (invMass != 0) {
+            posDelta[index].x = fscale*invMass*force[index] + noiseAmplitude*SQRT(invMass)*random[randomIndex].x;
+            posDelta[index].y = fscale*invMass*force[index+PADDED_NUM_ATOMS] + noiseAmplitude*SQRT(invMass)*random[randomIndex].y;
+            posDelta[index].z = fscale*invMass*force[index+PADDED_NUM_ATOMS*2] + noiseAmplitude*SQRT(invMass)*random[randomIndex].z;
+        }
+        randomIndex += blockDim.x*gridDim.x;
+    }
+}
+/**
+ * Perform the second step of Brownian integration.
+ */
+extern "C" __global__ void integrateBrownianPart2(real deltaT, real4* posq, real4* velm, const real4* __restrict__ posDelta) {
+    const real oneOverDeltaT = RECIP(deltaT);
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
+        if (velm[index].w != 0) {
+            real4 delta = posDelta[index];
+            velm[index].x = oneOverDeltaT*delta.x;
+            velm[index].y = oneOverDeltaT*delta.y;
+            velm[index].z = oneOverDeltaT*delta.z;
+            posq[index].x = posq[index].x + delta.x;
+            posq[index].y = posq[index].y + delta.y;
+            posq[index].z = posq[index].z + delta.z;
+        }
+    }
+}
--- a/platforms/cuda2/src/kernels/ccma.cu
+++ b/platforms/cuda2/src/kernels/ccma.cu
+/**
+ * Compute the direction each constraint is pointing in.  This is called once at the beginning of constraint evaluation.
+ */
+extern "C" __global__ void computeConstraintDirections(const int2* __restrict__ constraintAtoms, real4* __restrict__ constraintDistance, const real4* __restrict__ atomPositions) {
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_CONSTRAINTS; index += blockDim.x*gridDim.x) {
+        // Compute the direction for this constraint.
+        int2 atoms = constraintAtoms[index];
+        real4 dir = constraintDistance[index];
+        real4 oldPos1 = atomPositions[atoms.x];
+        real4 oldPos2 = atomPositions[atoms.y];
+        dir.x = oldPos1.x-oldPos2.x;
+        dir.y = oldPos1.y-oldPos2.y;
+        dir.z = oldPos1.z-oldPos2.z;
+        constraintDistance[index] = dir;
+    }
+}
+/**
+ * Compute the force applied by each constraint.
+ */
+extern "C" __global__ void computeConstraintForce(const int2* __restrict__ constraintAtoms, const real4* __restrict__ constraintDistance, const real4* __restrict__ atomPositions,
+        const real* __restrict__ reducedMass, real* __restrict__ delta1, int* __restrict__ converged, float tol, int iteration) {
+    __shared__ int groupConverged;
+    if (converged[1-iteration%2]) {
+        if (blockIdx.x == 0 && threadIdx.x == 0)
+            converged[iteration%2] = 1;
+        return; // The constraint iteration has already converged.
+    }
+    if (threadIdx.x == 0)
+        groupConverged = 1;
+    __syncthreads();
+    real lowerTol = 1.0f-2.0f*tol+tol*tol;
+    real upperTol = 1.0f+2.0f*tol+tol*tol;
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_CONSTRAINTS; index += blockDim.x*gridDim.x) {
+        // Compute the force due to this constraint.
+        int2 atoms = constraintAtoms[index];
+        real4 dir = constraintDistance[index];
+        real4 rp_ij = atomPositions[atoms.x]-atomPositions[atoms.y];
+#ifndef CONSTRAIN_VELOCITIES
+        rp_ij.x += dir.x;
+        rp_ij.y += dir.y;
+        rp_ij.z += dir.z;
+#endif
+        real rrpr = rp_ij.x*dir.x + rp_ij.y*dir.y + rp_ij.z*dir.z;
+        real d_ij2 = dir.x*dir.x + dir.y*dir.y + dir.z*dir.z;
+#ifdef CONSTRAIN_VELOCITIES
+        delta1[index] = -2.0f*reducedMass[index]*rrpr/d_ij2;
+        // See whether it has converged.
+        if (groupConverged && fabs(delta1[index]) > tol) {
+            groupConverged = 0;
+            converged[iteration%2] = 0;
+        }
+#else
+        real rp2 = rp_ij.x*rp_ij.x + rp_ij.y*rp_ij.y + rp_ij.z*rp_ij.z;
+        real dist2 = dir.w*dir.w;
+        real diff = dist2 - rp2;
+        delta1[index] = (rrpr > d_ij2*1e-6f ? reducedMass[index]*diff/rrpr : 0.0f);
+        // See whether it has converged.
+        if (groupConverged && (rp2 < lowerTol*dist2 || rp2 > upperTol*dist2)) {
+            groupConverged = 0;
+            converged[iteration%2] = 0;
+        }
+#endif
+    }
+}
+/**
+ * Multiply the vector of constraint forces by the constraint matrix.
+ */
+extern "C" __global__ void multiplyByConstraintMatrix(const real* __restrict__ delta1, real* __restrict__ delta2, const int* __restrict__ constraintMatrixColumn,
+        const real* __restrict__ constraintMatrixValue, const int* __restrict__ converged, int iteration) {
+    if (converged[iteration%2])
+        return; // The constraint iteration has already converged.
+    // Multiply by the inverse constraint matrix.
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_CONSTRAINTS; index += blockDim.x*gridDim.x) {
+        real sum = 0.0f;
+        for (int i = 0; ; i++) {
+            int element = index+i*NUM_CONSTRAINTS;
+            int column = constraintMatrixColumn[element];
+            if (column >= NUM_CONSTRAINTS)
+                break;
+            sum += delta1[column]*constraintMatrixValue[element];
+        }
+        delta2[index] = sum;
+    }
+}
+/**
+ * Update the atom positions based on constraint forces.
+ */
+extern "C" __global__ void updateAtomPositions(const int* __restrict__ numAtomConstraints, const int* __restrict__ atomConstraints, const real4* __restrict__ constraintDistance,
+        real4* __restrict__ atomPositions, const real4* __restrict__ velm, const real* __restrict__ delta1, const real* __restrict__ delta2, int* __restrict__ converged, int iteration) {
+    if (blockIdx.x == 0 && threadIdx.x == 0)
+        converged[1-iteration%2] = 1;
+    if (converged[iteration%2])
+        return; // The constraint iteration has already converged.
+    real damping = (iteration < 2 ? 0.5f : 1.0f);
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
+        // Compute the new position of this atom.
+        real4 atomPos = atomPositions[index];
+        real invMass = velm[index].w;
+        int num = numAtomConstraints[index];
+        for (int i = 0; i < num; i++) {
+            int constraint = atomConstraints[index+i*NUM_ATOMS];
+            bool forward = (constraint > 0);
+            constraint = (forward ? constraint-1 : -constraint-1);
+            real constraintForce = damping*invMass*delta2[constraint];
+            constraintForce = (forward ? constraintForce : -constraintForce);
+            real4 dir = constraintDistance[constraint];
+            atomPos.x += constraintForce*dir.x;
+            atomPos.y += constraintForce*dir.y;
+            atomPos.z += constraintForce*dir.z;
+        }
+        atomPositions[index] = atomPos;
+    }
+}
--- a/platforms/cuda2/src/kernels/constraints.cu
+++ b/platforms/cuda2/src/kernels/constraints.cu
+extern "C" __global__ void applyPositionDeltas(real4* __restrict__ posq, real4* __restrict__ posDelta) {
+    for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
+        real4 position = posq[index];
+        position.x += posDelta[index].x;
+        position.y += posDelta[index].y;
+        position.z += posDelta[index].z;
+        posq[index] = position;
+    }
+}
--- a/platforms/cuda2/src/kernels/langevin.cu
+++ b/platforms/cuda2/src/kernels/langevin.cu
+enum {VelScale, ForceScale, NoiseScale, MaxParams};
+/**
+ * Perform the first step of Langevin integration.
+ */
+extern "C" __global__ void integrateLangevinPart1(real4* __restrict__ velm, const long long* __restrict__ force, real4* __restrict__ posDelta,
+        const real* __restrict__ paramBuffer, const real2* __restrict__ dt, const float4* __restrict__ random, unsigned int randomIndex) {
+    real vscale = paramBuffer[VelScale];
+    real fscale = paramBuffer[ForceScale]/(real) 0xFFFFFFFF;
+    real noisescale = paramBuffer[NoiseScale];
+    real stepSize = dt[0].y;
+    int index = blockIdx.x*blockDim.x+threadIdx.x;
+    randomIndex += index;
+    while (index < NUM_ATOMS) {
+        real4 velocity = velm[index];
+        if (velocity.w != 0) {
+            real sqrtInvMass = SQRT(velocity.w);
+            velocity.x = vscale*velocity.x + fscale*velocity.w*force[index] + noisescale*sqrtInvMass*random[randomIndex].x;
+            velocity.y = vscale*velocity.y + fscale*velocity.w*force[index+PADDED_NUM_ATOMS] + noisescale*sqrtInvMass*random[randomIndex].y;
+            velocity.z = vscale*velocity.z + fscale*velocity.w*force[index+PADDED_NUM_ATOMS*2] + noisescale*sqrtInvMass*random[randomIndex].z;
+            velm[index] = velocity;
+            posDelta[index] = make_real4(stepSize*velocity.x, stepSize*velocity.y, stepSize*velocity.z, 0);
+        }
+        randomIndex += blockDim.x*gridDim.x;
+        index += blockDim.x*gridDim.x;
+    }
+}
+/**
+ * Perform the second step of Langevin integration.
+ */
+extern "C" __global__ void integrateLangevinPart2(real4* __restrict__ posq, const real4* __restrict__ posDelta, real4* __restrict__ velm, const real2* __restrict__ dt) {
+    double invStepSize = 1.0/dt[0].y;
+    int index = blockIdx.x*blockDim.x+threadIdx.x;
+    while (index < NUM_ATOMS) {
+        real4 vel = velm[index];
+        if (vel.w != 0) {
+            real4 pos = posq[index];
+            real4 delta = posDelta[index];
+            pos.x += delta.x;
+            pos.y += delta.y;
+            pos.z += delta.z;
+            vel.x = (real) invStepSize*delta.x;
+            vel.y = (real) invStepSize*delta.y;
+            vel.z = (real) invStepSize*delta.z;
+            posq[index] = pos;
+            velm[index] = vel;
+        }
+        index += blockDim.x*gridDim.x;
+    }
+}
+/**
+ * Select the step size to use for the next step.
+ */
+extern "C" __global__ void selectLangevinStepSize(real maxStepSize, real errorTol, real tau, real kT, real2* __restrict__ dt,
+        const real4* __restrict__ velm, const long long* __restrict__ force, real* __restrict__ paramBuffer) {
+    // Calculate the error.
+    extern __shared__ real params[];
+    real* error = &params[MaxParams];
+    real err = 0;
+    unsigned int index = threadIdx.x;
+    const real scale = RECIP((real) 0xFFFFFFFF);
+    while (index < NUM_ATOMS) {
+        real3 f = make_real3(scale*force[index], scale*force[index+PADDED_NUM_ATOMS], scale*force[index+PADDED_NUM_ATOMS*2]);
+        real invMass = velm[index].w;
+        err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass;
+        index += blockDim.x*gridDim.x;
+    }
+    error[threadIdx.x] = err;
+    __syncthreads();
+    // Sum the errors from all threads.
+    for (unsigned int offset = 1; offset < blockDim.x; offset *= 2) {
+        if (threadIdx.x+offset < blockDim.x && (threadIdx.x&(2*offset-1)) == 0)
+            error[threadIdx.x] += error[threadIdx.x+offset];
+        __syncthreads();
+    }
+    if (blockIdx.x*blockDim.x+threadIdx.x == 0) {
+        // Select the new step size.
+        real totalError = sqrt(error[0]/(NUM_ATOMS*3));
+        real newStepSize = sqrt(errorTol/totalError);
+        real oldStepSize = dt[0].y;
+        if (oldStepSize > 0.0f)
+            newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase.
+        if (newStepSize > oldStepSize && newStepSize < 1.1f*oldStepSize)
+            newStepSize = oldStepSize; // Keeping dt constant between steps improves the behavior of the integrator.
+        if (newStepSize > maxStepSize)
+            newStepSize = maxStepSize;
+        dt[0].y = newStepSize;
+        // Recalculate the integration parameters.
+        real vscale = exp(-newStepSize/tau);
+        real fscale = (1-vscale)*tau;
+        real noisescale = sqrt(2*kT/tau)*sqrt(0.5f*(1-vscale*vscale)*tau);
+        params[VelScale] = vscale;
+        params[ForceScale] = fscale;
+        params[NoiseScale] = noisescale;
+    }
+    __syncthreads();
+    if (threadIdx.x < MaxParams)
+        paramBuffer[threadIdx.x] = params[threadIdx.x];
+}
--- a/platforms/cuda2/src/kernels/settle.cu
+++ b/platforms/cuda2/src/kernels/settle.cu
+/**
+ * Enforce constraints on SETTLE clusters
+ */
+extern "C" __global__ void applySettle(int numClusters, float tol, const real4* __restrict__ oldPos, real4* __restrict__ posDelta, const real4* __restrict__ velm, const int4* __restrict__ clusterAtoms, const float2* __restrict__ clusterParams) {
+    int index = blockIdx.x*blockDim.x+threadIdx.x;
+    while (index < numClusters) {
+        // Load the data for this cluster.
+        int4 atoms = clusterAtoms[index];
+        float2 params = clusterParams[index];
+        real4 apos0 = oldPos[atoms.x];
+        real4 xp0 = posDelta[atoms.x];
+        real4 apos1 = oldPos[atoms.y];
+        real4 xp1 = posDelta[atoms.y];
+        real4 apos2 = oldPos[atoms.z];
+        real4 xp2 = posDelta[atoms.z];
+        real m0 = RECIP(velm[atoms.x].w);
+        real m1 = RECIP(velm[atoms.y].w);
+        real m2 = RECIP(velm[atoms.z].w);
+        // Apply the SETTLE algorithm.
+        real xb0 = apos1.x-apos0.x;
+        real yb0 = apos1.y-apos0.y;
+        real zb0 = apos1.z-apos0.z;
+        real xc0 = apos2.x-apos0.x;
+        real yc0 = apos2.y-apos0.y;
+        real zc0 = apos2.z-apos0.z;
+        real invTotalMass = RECIP(m0+m1+m2);
+        real xcom = (xp0.x*m0 + (xb0+xp1.x)*m1 + (xc0+xp2.x)*m2) * invTotalMass;
+        real ycom = (xp0.y*m0 + (yb0+xp1.y)*m1 + (yc0+xp2.y)*m2) * invTotalMass;
+        real zcom = (xp0.z*m0 + (zb0+xp1.z)*m1 + (zc0+xp2.z)*m2) * invTotalMass;
+        real xa1 = xp0.x - xcom;
+        real ya1 = xp0.y - ycom;
+        real za1 = xp0.z - zcom;
+        real xb1 = xb0 + xp1.x - xcom;
+        real yb1 = yb0 + xp1.y - ycom;
+        real zb1 = zb0 + xp1.z - zcom;
+        real xc1 = xc0 + xp2.x - xcom;
+        real yc1 = yc0 + xp2.y - ycom;
+        real zc1 = zc0 + xp2.z - zcom;
+        real xaksZd = yb0*zc0 - zb0*yc0;
+        real yaksZd = zb0*xc0 - xb0*zc0;
+        real zaksZd = xb0*yc0 - yb0*xc0;
+        real xaksXd = ya1*zaksZd - za1*yaksZd;
+        real yaksXd = za1*xaksZd - xa1*zaksZd;
+        real zaksXd = xa1*yaksZd - ya1*xaksZd;
+        real xaksYd = yaksZd*zaksXd - zaksZd*yaksXd;
+        real yaksYd = zaksZd*xaksXd - xaksZd*zaksXd;
+        real zaksYd = xaksZd*yaksXd - yaksZd*xaksXd;
+        real axlng = SQRT(xaksXd*xaksXd + yaksXd*yaksXd + zaksXd*zaksXd);
+        real aylng = SQRT(xaksYd*xaksYd + yaksYd*yaksYd + zaksYd*zaksYd);
+        real azlng = SQRT(xaksZd*xaksZd + yaksZd*yaksZd + zaksZd*zaksZd);
+        real trns11 = xaksXd / axlng;
+        real trns21 = yaksXd / axlng;
+        real trns31 = zaksXd / axlng;
+        real trns12 = xaksYd / aylng;
+        real trns22 = yaksYd / aylng;
+        real trns32 = zaksYd / aylng;
+        real trns13 = xaksZd / azlng;
+        real trns23 = yaksZd / azlng;
+        real trns33 = zaksZd / azlng;
+        real xb0d = trns11*xb0 + trns21*yb0 + trns31*zb0;
+        real yb0d = trns12*xb0 + trns22*yb0 + trns32*zb0;
+        real xc0d = trns11*xc0 + trns21*yc0 + trns31*zc0;
+        real yc0d = trns12*xc0 + trns22*yc0 + trns32*zc0;
+        real za1d = trns13*xa1 + trns23*ya1 + trns33*za1;
+        real xb1d = trns11*xb1 + trns21*yb1 + trns31*zb1;
+        real yb1d = trns12*xb1 + trns22*yb1 + trns32*zb1;
+        real zb1d = trns13*xb1 + trns23*yb1 + trns33*zb1;
+        real xc1d = trns11*xc1 + trns21*yc1 + trns31*zc1;
+        real yc1d = trns12*xc1 + trns22*yc1 + trns32*zc1;
+        real zc1d = trns13*xc1 + trns23*yc1 + trns33*zc1;
+        //                                        --- Step2  A2' ---
+        float rc = 0.5f*params.y;
+        float rb = SQRT(params.x*params.x-rc*rc);
+        real ra = rb*(m1+m2)*invTotalMass;
+        rb -= ra;
+        real sinphi = za1d/ra;
+        real cosphi = SQRT(1-sinphi*sinphi);
+        real sinpsi = (zb1d-zc1d) / (2*rc*cosphi);
+        real cospsi = SQRT(1-sinpsi*sinpsi);
+        real ya2d =   ra*cosphi;
+        real xb2d = - rc*cospsi;
+        real yb2d = - rb*cosphi - rc*sinpsi*sinphi;
+        real yc2d = - rb*cosphi + rc*sinpsi*sinphi;
+        real xb2d2 = xb2d*xb2d;
+        real hh2 = 4.0f*xb2d2 + (yb2d-yc2d)*(yb2d-yc2d) + (zb1d-zc1d)*(zb1d-zc1d);
+        real deltx = 2.0f*xb2d + SQRT(4.0f*xb2d2 - hh2 + params.y*params.y);
+        xb2d -= deltx*0.5f;
+        //                                        --- Step3  al,be,ga ---
+        real alpha = (xb2d*(xb0d-xc0d) + yb0d*yb2d + yc0d*yc2d);
+        real beta = (xb2d*(yc0d-yb0d) + xb0d*yb2d + xc0d*yc2d);
+        real gamma = xb0d*yb1d - xb1d*yb0d + xc0d*yc1d - xc1d*yc0d;
+        real al2be2 = alpha*alpha + beta*beta;
+        real sintheta = (alpha*gamma - beta*SQRT(al2be2 - gamma*gamma)) / al2be2;
+        //                                        --- Step4  A3' ---
+        real costheta = SQRT(1-sintheta*sintheta);
+        real xa3d = - ya2d*sintheta;
+        real ya3d =   ya2d*costheta;
+        real za3d = za1d;
+        real xb3d =   xb2d*costheta - yb2d*sintheta;
+        real yb3d =   xb2d*sintheta + yb2d*costheta;
+        real zb3d = zb1d;
+        real xc3d = - xb2d*costheta - yc2d*sintheta;
+        real yc3d = - xb2d*sintheta + yc2d*costheta;
+        real zc3d = zc1d;
+        //                                        --- Step5  A3 ---
+        real xa3 = trns11*xa3d + trns12*ya3d + trns13*za3d;
+        real ya3 = trns21*xa3d + trns22*ya3d + trns23*za3d;
+        real za3 = trns31*xa3d + trns32*ya3d + trns33*za3d;
+        real xb3 = trns11*xb3d + trns12*yb3d + trns13*zb3d;
+        real yb3 = trns21*xb3d + trns22*yb3d + trns23*zb3d;
+        real zb3 = trns31*xb3d + trns32*yb3d + trns33*zb3d;
+        real xc3 = trns11*xc3d + trns12*yc3d + trns13*zc3d;
+        real yc3 = trns21*xc3d + trns22*yc3d + trns23*zc3d;
+        real zc3 = trns31*xc3d + trns32*yc3d + trns33*zc3d;
+        xp0.x = xcom + xa3;
+        xp0.y = ycom + ya3;
+        xp0.z = zcom + za3;
+        xp1.x = xcom + xb3 - xb0;
+        xp1.y = ycom + yb3 - yb0;
+        xp1.z = zcom + zb3 - zb0;
+        xp2.x = xcom + xc3 - xc0;
+        xp2.y = ycom + yc3 - yc0;
+        xp2.z = zcom + zc3 - zc0;
+        // Record the new positions.
+        posDelta[atoms.x] = xp0;
+        posDelta[atoms.y] = xp1;
+        posDelta[atoms.z] = xp2;
+        index += blockDim.x*gridDim.x;
+    }
+}
+/**
+ * Enforce velocity constraints on SETTLE clusters
+ */
+extern "C" __global__ void constrainVelocities(int numClusters, float tol, const real4* __restrict__ oldPos, real4* __restrict__ posDelta, real4* __restrict__ velm, const int4* __restrict__ clusterAtoms, const float2* __restrict__ clusterParams) {
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numClusters; index += blockDim.x*gridDim.x) {
+        // Load the data for this cluster.
+        int4 atoms = clusterAtoms[index];
+        real4 apos0 = oldPos[atoms.x];
+        real4 apos1 = oldPos[atoms.y];
+        real4 apos2 = oldPos[atoms.z];
+        real4 v0 = velm[atoms.x];
+        real4 v1 = velm[atoms.y];
+        real4 v2 = velm[atoms.z];
+        // Compute intermediate quantities: the atom masses, the bond directions, the relative velocities,
+        // and the angle cosines and sines.
+        real mA = RECIP(v0.w);
+        real mB = RECIP(v1.w);
+        real mC = RECIP(v2.w);
+        real3 eAB = make_real3(apos1.x-apos0.x, apos1.y-apos0.y, apos1.z-apos0.z);
+        real3 eBC = make_real3(apos2.x-apos1.x, apos2.y-apos1.y, apos2.z-apos1.z);
+        real3 eCA = make_real3(apos0.x-apos2.x, apos0.y-apos2.y, apos0.z-apos2.z);
+        eAB *= RECIP(SQRT(eAB.x*eAB.x + eAB.y*eAB.y + eAB.z*eAB.z));
+        eBC *= RECIP(SQRT(eBC.x*eBC.x + eBC.y*eBC.y + eBC.z*eBC.z));
+        eCA *= RECIP(SQRT(eCA.x*eCA.x + eCA.y*eCA.y + eCA.z*eCA.z));
+        real vAB = (v1.x-v0.x)*eAB.x + (v1.y-v0.y)*eAB.y + (v1.z-v0.z)*eAB.z;
+        real vBC = (v2.x-v1.x)*eBC.x + (v2.y-v1.y)*eBC.y + (v2.z-v1.z)*eBC.z;
+        real vCA = (v0.x-v2.x)*eCA.x + (v0.y-v2.y)*eCA.y + (v0.z-v2.z)*eCA.z;
+        real cA = -(eAB.x*eCA.x + eAB.y*eCA.y + eAB.z*eCA.z);
+        real cB = -(eAB.x*eBC.x + eAB.y*eBC.y + eAB.z*eBC.z);
+        real cC = -(eBC.x*eCA.x + eBC.y*eCA.y + eBC.z*eCA.z);
+        real s2A = 1-cA*cA;
+        real s2B = 1-cB*cB;
+        real s2C = 1-cC*cC;
+        // Solve the equations.  These are different from those in the SETTLE paper (JCC 13(8), pp. 952-962, 1992), because
+        // in going from equations B1 to B2, they make the assumption that mB=mC (but don't bother to mention they're
+        // making that assumption).  We allow all three atoms to have different masses.
+        real mABCinv = RECIP(mA*mB*mC);
+        real denom = (((s2A*mB+s2B*mA)*mC+(s2A*mB*mB+2*(cA*cB*cC+1)*mA*mB+s2B*mA*mA))*mC+s2C*mA*mB*(mA+mB))*mABCinv;
+        real tab = ((cB*cC*mA-cA*mB-cA*mC)*vCA + (cA*cC*mB-cB*mC-cB*mA)*vBC + (s2C*mA*mA*mB*mB*mABCinv+(mA+mB+mC))*vAB)/denom;
+        real tbc = ((cA*cB*mC-cC*mB-cC*mA)*vCA + (s2A*mB*mB*mC*mC*mABCinv+(mA+mB+mC))*vBC + (cA*cC*mB-cB*mA-cB*mC)*vAB)/denom;
+        real tca = ((s2B*mA*mA*mC*mC*mABCinv+(mA+mB+mC))*vCA + (cA*cB*mC-cC*mB-cC*mA)*vBC + (cB*cC*mA-cA*mB-cA*mC)*vAB)/denom;
+        v0.x += (tab*eAB.x - tca*eCA.x)*v0.w;
+        v0.y += (tab*eAB.y - tca*eCA.y)*v0.w;
+        v0.z += (tab*eAB.z - tca*eCA.z)*v0.w;
+        v1.x += (tbc*eBC.x - tab*eAB.x)*v1.w;
+        v1.y += (tbc*eBC.y - tab*eAB.y)*v1.w;
+        v1.z += (tbc*eBC.z - tab*eAB.z)*v1.w;
+        v2.x += (tca*eCA.x - tbc*eBC.x)*v2.w;
+        v2.y += (tca*eCA.y - tbc*eBC.y)*v2.w;
+        v2.z += (tca*eCA.z - tbc*eBC.z)*v2.w;
+        velm[atoms.x] = v0;
+        velm[atoms.y] = v1;
+        velm[atoms.z] = v2;
+    }
+}
\ No newline at end of file
--- a/platforms/cuda2/src/kernels/shakeHydrogens.cu
+++ b/platforms/cuda2/src/kernels/shakeHydrogens.cu
+/**
+ * Enforce constraints on SHAKE clusters
+ */
+extern "C" __global__ void applyShakeToHydrogens(int numClusters, real tol, const real4* __restrict__ oldPos, real4* __restrict__ posDelta, const int4* __restrict__ clusterAtoms, const float4* __restrict__ clusterParams) {
+    int index = blockIdx.x*blockDim.x+threadIdx.x;
+    while (index < numClusters) {
+        // Load the data for this cluster.
+        int4 atoms = clusterAtoms[index];
+        float4 params = clusterParams[index];
+        real4 pos = oldPos[atoms.x];
+        real4 xpi = posDelta[atoms.x];
+        real4 pos1 = oldPos[atoms.y];
+        real4 xpj1 = posDelta[atoms.y];
+        real4 pos2 = make_real4(0);
+        real4 xpj2 = make_real4(0);
+        real invMassCentral = params.x;
+        real avgMass = params.y;
+        float d2 = params.z;
+        float invMassPeripheral = params.w;
+        if (atoms.z != -1) {
+            pos2 = oldPos[atoms.z];
+            xpj2 = posDelta[atoms.z];
+        }
+        real4 pos3 = make_real4(0);
+        real4 xpj3 = make_real4(0);
+        if (atoms.w != -1) {
+            pos3 = oldPos[atoms.w];
+            xpj3 = posDelta[atoms.w];
+        }
+        // Precompute quantities.
+        real3 rij1 = make_real3(pos.x-pos1.x, pos.y-pos1.y, pos.z-pos1.z);
+        real3 rij2 = make_real3(pos.x-pos2.x, pos.y-pos2.y, pos.z-pos2.z);
+        real3 rij3 = make_real3(pos.x-pos3.x, pos.y-pos3.y, pos.z-pos3.z);
+        real rij1sq = rij1.x*rij1.x + rij1.y*rij1.y + rij1.z*rij1.z;
+        real rij2sq = rij2.x*rij2.x + rij2.y*rij2.y + rij2.z*rij2.z;
+        real rij3sq = rij3.x*rij3.x + rij3.y*rij3.y + rij3.z*rij3.z;
+        real ld1 = d2-rij1sq;
+        real ld2 = d2-rij2sq;
+        real ld3 = d2-rij3sq;
+        // Iterate until convergence.
+        bool converged = false;
+        int iteration = 0;
+        while (iteration < 15 && !converged) {
+            converged = true;
+#ifdef CONSTRAIN_VELOCITIES
+            real3 rpij = make_real3(xpi.x-xpj1.x, xpi.y-xpj1.y, xpi.z-xpj1.z);
+            real rrpr = rpij.x*rij1.x + rpij.y*rij1.y + rpij.z*rij1.z;
+            real delta = -2.0f*avgMass*rrpr/rij1sq;
+            real3 dr = rij1*delta;
+            xpi.x += dr.x*invMassCentral;
+            xpi.y += dr.y*invMassCentral;
+            xpi.z += dr.z*invMassCentral;
+            xpj1.x -= dr.x*invMassPeripheral;
+            xpj1.y -= dr.y*invMassPeripheral;
+            xpj1.z -= dr.z*invMassPeripheral;
+            if (fabs(delta) > tol)
+                converged = false;
+            if (atoms.z != -1) {
+                rpij = make_real3(xpi.x-xpj2.x, xpi.y-xpj2.y, xpi.z-xpj2.z);
+                rrpr = rpij.x*rij2.x + rpij.y*rij2.y + rpij.z*rij2.z;
+                delta = -2.0f*avgMass*rrpr/rij2sq;
+                dr = rij2*delta;
+                xpi.x += dr.x*invMassCentral;
+                xpi.y += dr.y*invMassCentral;
+                xpi.z += dr.z*invMassCentral;
+                xpj2.x -= dr.x*invMassPeripheral;
+                xpj2.y -= dr.y*invMassPeripheral;
+                xpj2.z -= dr.z*invMassPeripheral;
+                if (fabs(delta) > tol)
+                    converged = false;
+            }
+            if (atoms.w != -1) {
+                rpij = make_real3(xpi.x-xpj3.x, xpi.y-xpj3.y, xpi.z-xpj3.z);
+                rrpr = rpij.x*rij3.x + rpij.y*rij3.y + rpij.z*rij3.z;
+                delta = -2.0f*avgMass*rrpr/rij3sq;
+                dr = rij3*delta;
+                xpi.x += dr.x*invMassCentral;
+                xpi.y += dr.y*invMassCentral;
+                xpi.z += dr.z*invMassCentral;
+                xpj3.x -= dr.x*invMassPeripheral;
+                xpj3.y -= dr.y*invMassPeripheral;
+                xpj3.z -= dr.z*invMassPeripheral;
+                if (fabs(delta) > tol)
+                    converged = false;
+            }
+#else
+            real3 rpij = make_real3(xpi.x-xpj1.x, xpi.y-xpj1.y, xpi.z-xpj1.z);
+            real rpsqij = rpij.x*rpij.x + rpij.y*rpij.y + rpij.z*rpij.z;
+            real rrpr = rij1.x*rpij.x + rij1.y*rpij.y + rij1.z*rpij.z;
+            real diff = fabs(ld1-2.0f*rrpr-rpsqij) / (d2*tol);
+            if (diff >= 1.0f) {
+                real acor  = (ld1-2.0f*rrpr-rpsqij)*avgMass / (rrpr+rij1sq);
+                real3 dr = rij1*acor;
+                xpi.x += dr.x*invMassCentral;
+                xpi.y += dr.y*invMassCentral;
+                xpi.z += dr.z*invMassCentral;
+                xpj1.x -= dr.x*invMassPeripheral;
+                xpj1.y -= dr.y*invMassPeripheral;
+                xpj1.z -= dr.z*invMassPeripheral;
+                converged = false;
+            }
+            if (atoms.z != -1) {
+                rpij = make_real3(xpi.x-xpj2.x, xpi.y-xpj2.y, xpi.z-xpj2.z);
+                rpsqij = rpij.x*rpij.x + rpij.y*rpij.y + rpij.z*rpij.z;
+                rrpr = rij2.x*rpij.x + rij2.y*rpij.y + rij2.z*rpij.z;
+                diff = fabs(ld2-2.0f*rrpr-rpsqij) / (d2*tol);
+                if (diff >= 1.0f) {
+                    real acor  = (ld2 - 2.0f*rrpr - rpsqij)*avgMass / (rrpr + rij2sq);
+                    real3 dr = rij2*acor;
+                    xpi.x += dr.x*invMassCentral;
+                    xpi.y += dr.y*invMassCentral;
+                    xpi.z += dr.z*invMassCentral;
+                    xpj2.x -= dr.x*invMassPeripheral;
+                    xpj2.y -= dr.y*invMassPeripheral;
+                    xpj2.z -= dr.z*invMassPeripheral;
+                    converged = false;
+                }
+            }
+            if (atoms.w != -1) {
+                rpij = make_real3(xpi.x-xpj3.x, xpi.y-xpj3.y, xpi.z-xpj3.z);
+                rpsqij = rpij.x*rpij.x + rpij.y*rpij.y + rpij.z*rpij.z;
+                rrpr = rij3.x*rpij.x + rij3.y*rpij.y + rij3.z*rpij.z;
+                diff = fabs(ld3 - 2.0f*rrpr - rpsqij) / (d2*tol);
+                if (diff >= 1.0f) {
+                    real acor  = (ld3-2.0f*rrpr-rpsqij)*avgMass / (rrpr+rij3sq);
+                    real3 dr = rij3*acor;
+                    xpi.x += dr.x*invMassCentral;
+                    xpi.y += dr.y*invMassCentral;
+                    xpi.z += dr.z*invMassCentral;
+                    xpj3.x -= dr.x*invMassPeripheral;
+                    xpj3.y -= dr.y*invMassPeripheral;
+                    xpj3.z -= dr.z*invMassPeripheral;
+                    converged = false;
+                }
+            }
+#endif
+            iteration++;
+        }
+        // Record the new positions.
+        posDelta[atoms.x] = xpi;
+        posDelta[atoms.y] = xpj1;
+        if (atoms.z != -1)
+            posDelta[atoms.z] = xpj2;
+        if (atoms.w != -1)
+            posDelta[atoms.w] = xpj3;
+        index += blockDim.x*gridDim.x;
+    }
+}
--- a/platforms/cuda2/src/kernels/verlet.cu
+++ b/platforms/cuda2/src/kernels/verlet.cu
@@ -51,36 +51,38 @@ extern "C" __global__ void integrateVerletPart2(real2* __restrict__ dt, real4* _
 /**
 * Select the step size to use for the next step.
 */
-//
-//extern "C" __global__ void selectVerletStepSize(real maxStepSize, real errorTol, real2* __restrict__ dt, const real4* __restrict__ velm, const real4* __restrict__ force, __local real* __restrict__ error) {
+extern "C" __global__ void selectVerletStepSize(real maxStepSize, real errorTol, real2* __restrict__ dt, const real4* __restrict__ velm, const long long* __restrict__ force) {
-//    // Calculate the error.
+    // Calculate the error.
-//
-//    real err = 0.0f;
+    extern __shared__ real error[];
-//    for (int index = threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
+    real err = 0.0f;
-//        real4 f = force[index];
+    const real scale = RECIP((real) 0xFFFFFFFF);
-//        real invMass = velm[index].w;
+    for (int index = threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
-//        err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass;
+        real3 f = make_real3(scale*force[index], scale*force[index+PADDED_NUM_ATOMS], scale*force[index+PADDED_NUM_ATOMS*2]);
-//    }
+        real invMass = velm[index].w;
-//    error[threadIdx.x] = err;
+        err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass;
-//    __syncthreads;
+    }
-//
+    error[threadIdx.x] = err;
-//    // Sum the errors from all threads.
+    __syncthreads();
-//
-//    for (unsigned int offset = 1; offset < get_local_size(0); offset *= 2) {
+    // Sum the errors from all threads.
-//        if (threadIdx.x+offset < get_local_size(0) && (threadIdx.x&(2*offset-1)) == 0)
-//            error[threadIdx.x] += error[threadIdx.x+offset];
+    for (unsigned int offset = 1; offset < blockDim.x; offset *= 2) {
-//        __syncthreads;
+        if (threadIdx.x+offset < blockDim.x && (threadIdx.x&(2*offset-1)) == 0)
-//    }
+            error[threadIdx.x] += error[threadIdx.x+offset];
-//    if (threadIdx.x == 0) {
+        __syncthreads();
-//        real totalError = sqrt(error[0]/(NUM_ATOMS*3));
+    }
-//        real newStepSize = sqrt(errorTol/totalError);
+    if (threadIdx.x == 0) {
-//        real oldStepSize = dt[0].y;
+        real totalError = SQRT(error[0]/(NUM_ATOMS*3));
-//        if (oldStepSize > 0.0f)
+        real newStepSize = SQRT(errorTol/totalError);
-//            newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase.
+        real oldStepSize = dt[0].y;
-//        if (newStepSize > oldStepSize && newStepSize < 1.1f*oldStepSize)
+        if (oldStepSize > 0.0f)
-//            newStepSize = oldStepSize; // Keeping dt constant between steps improves the behavior of the integrator.
+            newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase.
-//        if (newStepSize > maxStepSize)
+        if (newStepSize > oldStepSize && newStepSize < 1.1f*oldStepSize)
-//            newStepSize = maxStepSize;
+            newStepSize = oldStepSize; // Keeping dt constant between steps improves the behavior of the integrator.
-//        dt[0].y = newStepSize;
+        if (newStepSize > maxStepSize)
-//    }
+            newStepSize = maxStepSize;
-//}
+        dt[0].y = newStepSize;
+    }
+}
--- a/platforms/cuda2/tests/TestCudaBrownianIntegrator.cpp
+++ b/platforms/cuda2/tests/TestCudaBrownianIntegrator.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+#include "openmm/System.h"
+/**
+ * This tests the CUDA implementation of BrownianIntegrator.
+ */
+#include "openmm/internal/AssertionUtilities.h"
+#include "openmm/Context.h"
+#include "CudaPlatform.h"
+#include "openmm/HarmonicBondForce.h"
+#include "openmm/NonbondedForce.h"
+#include "openmm/System.h"
+#include "openmm/BrownianIntegrator.h"
+#include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
+#include "sfmt/SFMT.h"
+#include <iostream>
+#include <vector>
+using namespace OpenMM;
+using namespace std;
+const double TOL = 1e-5;
+void testSingleBond() {
+    CudaPlatform platform;
+    System system;
+    system.addParticle(2.0);
+    system.addParticle(2.0);
+    double dt = 0.01;
+    BrownianIntegrator integrator(0, 0.1, dt);
+    HarmonicBondForce* forceField = new HarmonicBondForce();
+    forceField->addBond(0, 1, 1.5, 1);
+    system.addForce(forceField);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(2);
+    positions[0] = Vec3(-1, 0, 0);
+    positions[1] = Vec3(1, 0, 0);
+    context.setPositions(positions);
+    // This is simply an overdamped harmonic oscillator, so compare it to the analytical solution.
+    double rate = 2*1.0/(0.1*2.0);
+    for (int i = 0; i < 1000; ++i) {
+        State state = context.getState(State::Positions | State::Velocities);
+        double time = state.getTime();
+        double expectedDist = 1.5+0.5*std::exp(-rate*time);
+        ASSERT_EQUAL_VEC(Vec3(-0.5*expectedDist, 0, 0), state.getPositions()[0], 0.02);
+        ASSERT_EQUAL_VEC(Vec3(0.5*expectedDist, 0, 0), state.getPositions()[1], 0.02);
+        if (i > 0) {
+            double expectedSpeed = -0.5*rate*std::exp(-rate*(time-0.5*dt));
+            ASSERT_EQUAL_VEC(Vec3(-0.5*expectedSpeed, 0, 0), state.getVelocities()[0], 0.11);
+            ASSERT_EQUAL_VEC(Vec3(0.5*expectedSpeed, 0, 0), state.getVelocities()[1], 0.11);
+        }
+        integrator.step(1);
+    }
+}
+void testTemperature() {
+    const int numParticles = 8;
+    const int numBonds = numParticles-1;
+    const double temp = 10.0;
+    CudaPlatform platform;
+    System system;
+    BrownianIntegrator integrator(temp, 2.0, 0.01);
+    HarmonicBondForce* forceField = new HarmonicBondForce();
+    for (int i = 0; i < numParticles; ++i)
+        system.addParticle(2.0);
+    for (int i = 0; i < numBonds; ++i)
+        forceField->addBond(i, i+1, 1.0, 5.0);
+    system.addForce(forceField);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(numParticles);
+    for (int i = 0; i < numParticles; ++i)
+        positions[i] = Vec3(i, 0, 0);
+    context.setPositions(positions);
+    // Let it equilibrate.
+    integrator.step(10000);
+    // Now run it for a while and see if the temperature is correct.
+    double pe = 0.0;
+    const int steps = 50000;
+    for (int i = 0; i < steps; ++i) {
+        State state = context.getState(State::Energy);
+        pe += state.getPotentialEnergy();
+        integrator.step(1);
+    }
+    pe /= steps;
+    double expected = 0.5*numBonds*BOLTZ*temp;
+    ASSERT_USUALLY_EQUAL_TOL(expected, pe, 0.1*expected);
+}
+void testConstraints() {
+    const int numParticles = 8;
+    const int numConstraints = 5;
+    const double temp = 20.0;
+    CudaPlatform platform;
+    System system;
+    BrownianIntegrator integrator(temp, 2.0, 0.001);
+    integrator.setConstraintTolerance(1e-5);
+    NonbondedForce* forceField = new NonbondedForce();
+    for (int i = 0; i < numParticles; ++i) {
+        system.addParticle(10.0);
+        forceField->addParticle((i%2 == 0 ? 0.2 : -0.2), 0.5, 5.0);
+    }
+    system.addConstraint(0, 1, 1.0);
+    system.addConstraint(1, 2, 1.0);
+    system.addConstraint(2, 3, 1.0);
+    system.addConstraint(4, 5, 1.0);
+    system.addConstraint(6, 7, 1.0);
+    system.addForce(forceField);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(numParticles);
+    vector<Vec3> velocities(numParticles);
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+    for (int i = 0; i < numParticles; ++i) {
+        positions[i] = Vec3(i, 0, 0);
+        velocities[i] = Vec3(genrand_real2(sfmt)-0.5, genrand_real2(sfmt)-0.5, genrand_real2(sfmt)-0.5);
+    }
+    context.setPositions(positions);
+    context.setVelocities(velocities);
+    // Simulate it and see whether the constraints remain satisfied.
+    for (int i = 0; i < 1000; ++i) {
+        State state = context.getState(State::Positions);
+        for (int j = 0; j < numConstraints; ++j) {
+            int particle1, particle2;
+            double distance;
+            system.getConstraintParameters(j, particle1, particle2, distance);
+            Vec3 p1 = state.getPositions()[particle1];
+            Vec3 p2 = state.getPositions()[particle2];
+            double dist = std::sqrt((p1[0]-p2[0])*(p1[0]-p2[0])+(p1[1]-p2[1])*(p1[1]-p2[1])+(p1[2]-p2[2])*(p1[2]-p2[2]));
+            ASSERT_EQUAL_TOL(distance, dist, 1e-4);
+        }
+        integrator.step(1);
+    }
+}
+void testRandomSeed() {
+    const int numParticles = 8;
+    const double temp = 100.0;
+    const double collisionFreq = 10.0;
+    CudaPlatform platform;
+    System system;
+    BrownianIntegrator integrator(temp, 2.0, 0.001);
+    NonbondedForce* forceField = new NonbondedForce();
+    for (int i = 0; i < numParticles; ++i) {
+        system.addParticle(2.0);
+        forceField->addParticle((i%2 == 0 ? 1.0 : -1.0), 1.0, 5.0);
+    }
+    system.addForce(forceField);
+    vector<Vec3> positions(numParticles);
+    vector<Vec3> velocities(numParticles);
+    for (int i = 0; i < numParticles; ++i) {
+        positions[i] = Vec3((i%2 == 0 ? 2 : -2), (i%4 < 2 ? 2 : -2), (i < 4 ? 2 : -2));
+        velocities[i] = Vec3(0, 0, 0);
+    }
+    // Try twice with the same random seed.
+    integrator.setRandomNumberSeed(5);
+    Context context(system, integrator, platform);
+    context.setPositions(positions);
+    context.setVelocities(velocities);
+    integrator.step(10);
+    State state1 = context.getState(State::Positions);
+    context.reinitialize();
+    context.setPositions(positions);
+    context.setVelocities(velocities);
+    integrator.step(10);
+    State state2 = context.getState(State::Positions);
+    // Try twice with a different random seed.
+    integrator.setRandomNumberSeed(10);
+    context.reinitialize();
+    context.setPositions(positions);
+    context.setVelocities(velocities);
+    integrator.step(10);
+    State state3 = context.getState(State::Positions);
+    context.reinitialize();
+    context.setPositions(positions);
+    context.setVelocities(velocities);
+    integrator.step(10);
+    State state4 = context.getState(State::Positions);
+    // Compare the results.
+    for (int i = 0; i < numParticles; i++) {
+        for (int j = 0; j < 3; j++) {
+            ASSERT(state1.getPositions()[i][j] == state2.getPositions()[i][j]);
+            ASSERT(state3.getPositions()[i][j] == state4.getPositions()[i][j]);
+            ASSERT(state1.getPositions()[i][j] != state3.getPositions()[i][j]);
+        }
+    }
+}
+int main() {
+    try {
+        testSingleBond();
+        testTemperature();
+        testConstraints();
+        testRandomSeed();
+    }
+    catch(const exception& e) {
+        cout << "exception: " << e.what() << endl;
+        return 1;
+    }
+    cout << "Done" << endl;
+    return 0;
+}