Continuing to implement new CUDA platform: HarmonicBondForce and VerletIntegrator

6e434a02 · Peter Eastman · 1f0ec7b5 · 6e434a02 · 6e434a02 · 6e434a02
Commit 6e434a02 authored Jun 08, 2012 by Peter Eastman
16 changed files
--- a/platforms/cuda2/src/CudaArray.cpp
+++ b/platforms/cuda2/src/CudaArray.cpp
@@ -25,6 +25,7 @@
 * -------------------------------------------------------------------------- */

 #include "CudaArray.h"
+#include "CudaContext.h"
 #include <iostream>
 #include <sstream>
 #include <vector>
@@ -36,7 +37,7 @@ CudaArray::CudaArray(int size, int elementSize, const std::string& name) :
    CUresult result = cuMemAlloc(&pointer, size*elementSize);
    if (result != CUDA_SUCCESS) {
        std::stringstream str;
-        str<<"Error creating array "<<name<<": "<<result;
+        str<<"Error creating array "<<name<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";
        throw OpenMMException(str.str());
    }
 }
@@ -46,7 +47,7 @@ CudaArray::~CudaArray() {
        CUresult result = cuMemFree(pointer);
        if (result != CUDA_SUCCESS) {
            std::stringstream str;
-            str<<"Error deleting array "<<name<<": "<<result;
+            str<<"Error deleting array "<<name<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";
            throw OpenMMException(str.str());
        }
    }
@@ -60,7 +61,7 @@ void CudaArray::upload(void* data, bool blocking) {
        result = cuMemcpyHtoDAsync(pointer, data, size*elementSize, 0);
    if (result != CUDA_SUCCESS) {
        std::stringstream str;
-        str<<"Error uploading array "<<name<<": "<<result;
+        str<<"Error uploading array "<<name<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";
        throw OpenMMException(str.str());
    }
 }
@@ -73,7 +74,7 @@ void CudaArray::download(void* data, bool blocking) const {
        result = cuMemcpyDtoHAsync(data, pointer, size*elementSize, 0);
    if (result != CUDA_SUCCESS) {
        std::stringstream str;
-        str<<"Error downloading array "<<name<<": "<<result;
+        str<<"Error downloading array "<<name<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";
        throw OpenMMException(str.str());
    }
 }
--- a/platforms/cuda2/src/CudaBondedUtilities.cpp
+++ b/platforms/cuda2/src/CudaBondedUtilities.cpp
@@ -26,6 +26,7 @@

 #include "CudaBondedUtilities.h"
 #include "CudaExpressionUtilities.h"
+#include "CudaKernelSources.h"
 #include "openmm/OpenMMException.h"
 #include "CudaNonbondedUtilities.h"
 #include <iostream>
@@ -81,8 +82,8 @@ void CudaBondedUtilities::initialize(const System& system) {
                for (int atom = 0; atom < width; atom++)
                    indexVec[bond*width+atom] = forceAtoms[i][bond][startAtom+atom];
            }
-            CudaArray* indices = CudaArray::create<unsigned int>(indexVec.size(), "bondedIndices");
-            indices->upload(indexVec);
+            CudaArray* indices = new CudaArray(numBonds, 4*width, "bondedIndices");
+            indices->upload(&indexVec[0]);
            atomIndices[i].push_back(indices);
            startAtom += width;
        }
@@ -91,13 +92,14 @@ void CudaBondedUtilities::initialize(const System& system) {
    // Create the kernel.

    stringstream s;
+    s<<CudaKernelSources::vectorOps;
    for (int i = 0; i < (int) prefixCode.size(); i++)
        s<<prefixCode[i];
-    s<<"extern \"C\" __global__ void computeBondedForces(long* __restrict__ forceBuffer, real* __restrict__ energyBuffer, const real4* __restrict__ posq, int groups";
+    s<<"extern \"C\" __global__ void computeBondedForces(unsigned long long* __restrict__ forceBuffer, real* __restrict__ energyBuffer, const real4* __restrict__ posq, int groups";
    for (int force = 0; force < numForces; force++) {
        for (int i = 0; i < (int) atomIndices[force].size(); i++) {
            int indexWidth = atomIndices[force][i]->getElementSize()/4;
-            string indexType = "unsigned int"+(indexWidth == 1 ? "" : context.intToString(indexWidth));
+            string indexType = "uint"+context.intToString(indexWidth);
            s<<", const "<<indexType<<"* __restrict__ atomIndices"<<force<<"_"<<i;
        }
    }
@@ -129,10 +131,10 @@ string CudaBondedUtilities::createForceSource(int forceIndex, int numBonds, int
    for (int i = 0; i < (int) atomIndices[forceIndex].size(); i++) {
        int indexWidth = atomIndices[forceIndex][i]->getElementSize()/4;
        suffix = (indexWidth == 1 ? suffix1 : suffix4);
-        string indexType = "unsigned int"+(indexWidth == 1 ? "" : context.intToString(indexWidth));
+        string indexType = "uint"+context.intToString(indexWidth);
        s<<"    "<<indexType<<" atoms"<<i<<" = atomIndices"<<forceIndex<<"_"<<i<<"[index];\n";
-        s<<"    "<<indexType<<" buffers = bufferIndices"<<forceIndex<<"[index];\n";
-        for (int j = 0; j < indexWidth; j++) {
+        int atomsToLoad = min(indexWidth, numAtoms-startAtom);
+        for (int j = 0; j < atomsToLoad; j++) {
            s<<"    unsigned int atom"<<(startAtom+j+1)<<" = atoms"<<i<<suffix[j]<<";\n";
            s<<"    real4 pos"<<(j+1)<<" = posq[atom"<<(j+1)<<"];\n";
        }
@@ -140,34 +142,28 @@ string CudaBondedUtilities::createForceSource(int forceIndex, int numBonds, int
    }
    s<<computeForce<<"\n";
    for (int i = 0; i < numAtoms; i++) {
-        s<<"    atomicAdd(&forceBuffer[atom"<<(i+1)<<"], (long) (force.x*0xFFFFFFFF));\n";
-        s<<"    atomicAdd(&forceBuffer[atom"<<(i+1)<<"+PADDED_NUM_ATOMS], (long) (force.x*0xFFFFFFFF));\n";
-        s<<"    atomicAdd(&forceBuffer[atom"<<(i+1)<<"+PADDED_NUM_ATOMS*2], (long) (force.x*0xFFFFFFFF));\n";
+        s<<"    atomicAdd(&forceBuffer[atom"<<(i+1)<<"], static_cast<unsigned long long>((long long) (force"<<(i+1)<<".x*0xFFFFFFFF)));\n";
+        s<<"    atomicAdd(&forceBuffer[atom"<<(i+1)<<"+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force"<<(i+1)<<".y*0xFFFFFFFF)));\n";
+        s<<"    atomicAdd(&forceBuffer[atom"<<(i+1)<<"+PADDED_NUM_ATOMS*2], static_cast<unsigned long long>((long long) (force"<<(i+1)<<".z*0xFFFFFFFF)));\n";
+        s<<"    __threadfence_block();\n";
    }
    s<<"}\n";
    return s.str();
 }

 void CudaBondedUtilities::computeInteractions(int groups) {
-//    if (!hasInitializedKernels) {
-//        hasInitializedKernels = true;
-//        for (int i = 0; i < (int) forceSets.size(); i++) {
-//            int index = 0;
-//            cl::Kernel& kernel = kernels[i];
-//            kernel.setArg<cl::Buffer>(index++, context.getForceBuffers().getDeviceBuffer());
-//            kernel.setArg<cl::Buffer>(index++, context.getEnergyBuffer().getDeviceBuffer());
-//            kernel.setArg<cl::Buffer>(index++, context.getPosq().getDeviceBuffer());
-//            index++;
-//            for (int j = 0; j < (int) forceSets[i].size(); j++) {
-//                kernel.setArg<cl::Buffer>(index++, atomIndices[forceSets[i][j]]->getDeviceBuffer());
-//                kernel.setArg<cl::Buffer>(index++, bufferIndices[forceSets[i][j]]->getDeviceBuffer());
-//            }
-//            for (int j = 0; j < (int) arguments.size(); j++)
-//                kernel.setArg<cl::Memory>(index++, *arguments[j]);
-//        }
-//    }
-//    for (int i = 0; i < (int) kernels.size(); i++) {
-//        kernels[i].setArg<cl_int>(3, groups);
-//        context.executeKernel(kernels[i], maxBonds);
-//    }
+    if (!hasInitializedKernels) {
+        hasInitializedKernels = true;
+        kernelArgs.push_back(&context.getForce().getDevicePointer());
+        kernelArgs.push_back(&context.getEnergyBuffer().getDevicePointer());
+        kernelArgs.push_back(&context.getPosq().getDevicePointer());
+        kernelArgs.push_back(NULL);
+        for (int i = 0; i < (int) atomIndices.size(); i++)
+            for (int j = 0; j < (int) atomIndices[i].size(); j++)
+                kernelArgs.push_back(&atomIndices[i][j]->getDevicePointer());
+        for (int i = 0; i < (int) arguments.size(); i++)
+            kernelArgs.push_back(&arguments[i]);
+    }
+    kernelArgs[3] = &groups;
+    context.executeKernel(kernel, &kernelArgs[0], maxBonds);
 }
--- a/platforms/cuda2/src/CudaBondedUtilities.h
+++ b/platforms/cuda2/src/CudaBondedUtilities.h
@@ -59,7 +59,7 @@ namespace OpenMM {
 * <li>The positions of those atoms will have been stored in the real4 variables "pos1", "pos2", ....</li>
 * <li>A real variable called "energy" will exist.  Your code should add the potential energy of the
 * bond to that variable.</li>
- * <li>Your code should define real4 variables called "force1", "force2", ... that contain the force to
+ * <li>Your code should define real3 variables called "force1", "force2", ... that contain the force to
 * apply to each atom.</li>
 * </ol>
 * 
@@ -69,8 +69,8 @@ namespace OpenMM {
 * <tt><pre>
 * real4 delta = pos2-pos1;
 * energy += delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
- * real4 force1 = 2.0f*delta;
- * real4 force2 = -2.0f*delta;
+ * real3 force1 = 2.0f*delta;
+ * real3 force2 = -2.0f*delta;
 * </pre></tt>
 * 
 * Interactions will often depend on parameters or other data.  Call addArgument() to provide the data
@@ -129,6 +129,7 @@ private:
    std::vector<std::string> argTypes;
    std::vector<std::vector<CudaArray*> > atomIndices;
    std::vector<std::string> prefixCode;
+    std::vector<void*> kernelArgs;
    int numForceBuffers, maxBonds;
    bool hasInitializedKernels;
 };

--- a/platforms/cuda2/src/CudaContext.cpp
+++ b/platforms/cuda2/src/CudaContext.cpp
@@ -30,7 +30,7 @@
 #include <cmath>
 #include "CudaContext.h"
 #include "CudaArray.h"
-//#include "CudaBondedUtilities.h"
+#include "CudaBondedUtilities.h"
 #include "CudaForceInfo.h"
 #include "CudaIntegrationUtilities.h"
 #include "CudaKernelSources.h"
@@ -67,8 +67,8 @@ bool CudaContext::hasInitializedCuda = false;
 CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler,
        const string& tempDir, CudaPlatform::PlatformData& platformData) : system(system), compiler(compiler),
        time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), contextIsValid(false), atomsWereReordered(false), pinnedBuffer(NULL), posq(NULL),
-        velm(NULL), /*forceBuffers(NULL), longForceBuffer(NULL), energyBuffer(NULL), atomIndex(NULL),*/ integration(NULL), expression(NULL),
-        /*bonded(NULL), nonbonded(NULL),*/ thread(NULL) {
+        velm(NULL), force(NULL), energyBuffer(NULL), atomIndex(NULL), integration(NULL), expression(NULL),
+        bonded(NULL), /*nonbonded(NULL),*/ thread(NULL) {
    if (!hasInitializedCuda) {
        CHECK_RESULT2(cuInit(0), "Error initializing CUDA");
        hasInitializedCuda = true;
@@ -138,10 +138,9 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
    int numThreadBlocksPerComputeUnit = 6;
    numThreadBlocks = numThreadBlocksPerComputeUnit*multiprocessors;
-//    bonded = new CudaBondedUtilities(*this);
+    bonded = new CudaBondedUtilities(*this);
 //    nonbonded = new CudaNonbondedUtilities(*this);
    if (useDoublePrecision) {
-        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, paddedNumAtoms*sizeof(double4), 0));
        posq = CudaArray::create<double4>(paddedNumAtoms, "posq");
        velm = CudaArray::create<double4>(paddedNumAtoms, "velm");
        compilationDefines["make_real2"] = "make_double2";
@@ -149,7 +148,6 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
        compilationDefines["make_real4"] = "make_double4";
    }
    else {
-        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, paddedNumAtoms*sizeof(float4), 0));
        posq = CudaArray::create<float4>(paddedNumAtoms, "posq");
        velm = CudaArray::create<float4>(paddedNumAtoms, "velm");
        compilationDefines["make_real2"] = "make_float2";
@@ -203,22 +201,16 @@ CudaContext::~CudaContext() {
        delete posq;
    if (velm != NULL)
        delete velm;
-//    if (force != NULL)
-//        delete force;
-//    if (forceBuffers != NULL)
-//        delete forceBuffers;
-//    if (longForceBuffer != NULL)
-//        delete longForceBuffer;
-//    if (energyBuffer != NULL)
-//        delete energyBuffer;
-//    if (atomIndex != NULL)
-//        delete atomIndex;
+    if (force != NULL)
+        delete force;
+    if (energyBuffer != NULL)
+        delete energyBuffer;
    if (integration != NULL)
        delete integration;
    if (expression != NULL)
        delete expression;
-//    if (bonded != NULL)
-//        delete bonded;
+    if (bonded != NULL)
+        delete bonded;
 //    if (nonbonded != NULL)
 //        delete nonbonded;
    if (thread != NULL)
@@ -229,6 +221,17 @@ CudaContext::~CudaContext() {
 }

 void CudaContext::initialize() {
+    string errorMessage = "Error initializing Context";
+    if (useDoublePrecision) {
+        energyBuffer = CudaArray::create<double>(numThreadBlocks*ThreadBlockSize, "energyBuffer");
+        int pinnedBufferSize = max(paddedNumAtoms*4, numThreadBlocks*ThreadBlockSize);
+        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
+    }
+    else {
+        energyBuffer = CudaArray::create<float>(numThreadBlocks*ThreadBlockSize, "energyBuffer");
+        int pinnedBufferSize = max(paddedNumAtoms*6, numThreadBlocks*ThreadBlockSize);
+        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(float), 0));
+    }
    for (int i = 0; i < numAtoms; i++) {
        double mass = system.getParticleMass(i);
        if (useDoublePrecision)
@@ -237,11 +240,10 @@ void CudaContext::initialize() {
            ((float4*) pinnedBuffer)[i] = make_float4(0.0f, 0.0f, 0.0f, mass == 0.0 ? 0.0f : (float) (1.0/mass));
    }
    velm->upload(pinnedBuffer);
-//    bonded->initialize(system);
-    force = CudaArray::create<long3>(paddedNumAtoms, "force");
-    addAutoclearBuffer(force->getDevicePointer(), force->getSize()*6);
-    energyBuffer = CudaArray::create<float>(numThreadBlocks*ThreadBlockSize, "energyBuffer");
-    addAutoclearBuffer(energyBuffer->getDevicePointer(), energyBuffer->getSize());
+    bonded->initialize(system);
+    force = CudaArray::create<long long>(paddedNumAtoms*3, "force");
+    addAutoclearBuffer(force->getDevicePointer(), force->getSize()*force->getElementSize());
+    addAutoclearBuffer(energyBuffer->getDevicePointer(), energyBuffer->getSize()*energyBuffer->getElementSize());
    atomIndexDevice = CudaArray::create<int>(paddedNumAtoms, "atomIndex");
    atomIndex.resize(paddedNumAtoms);
    for (int i = 0; i < paddedNumAtoms; ++i)
@@ -448,82 +450,63 @@ void CudaContext::executeKernel(CUfunction kernel, void** arguments, int threads
 }

 void CudaContext::clearBuffer(CudaArray& array) {
-    clearBuffer(array.getDevicePointer(), array.getSize()*array.getElementSize()/4);
+    clearBuffer(array.getDevicePointer(), array.getSize()*array.getElementSize());
 }

 void CudaContext::clearBuffer(CUdeviceptr memory, int size) {
-    void* args[] = {&memory, &size};
+    int words = size/4;
+    void* args[] = {&memory, &words};
    executeKernel(clearBufferKernel, args, size, 128);
 }

 void CudaContext::addAutoclearBuffer(CUdeviceptr memory, int size) {
    autoclearBuffers.push_back(memory);
-    autoclearBufferSizes.push_back(size);
+    autoclearBufferSizes.push_back(size/4);
 }

-//void CudaContext::clearAutoclearBuffers() {
-//    int base = 0;
-//    int total = autoclearBufferSizes.size();
-//    while (total-base >= 6) {
-//        clearSixBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]);
-//        clearSixBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]);
-//        clearSixBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]);
-//        clearSixBuffersKernel.setArg<cl_int>(3, autoclearBufferSizes[base+1]);
-//        clearSixBuffersKernel.setArg<cl::Memory>(4, *autoclearBuffers[base+2]);
-//        clearSixBuffersKernel.setArg<cl_int>(5, autoclearBufferSizes[base+2]);
-//        clearSixBuffersKernel.setArg<cl::Memory>(6, *autoclearBuffers[base+3]);
-//        clearSixBuffersKernel.setArg<cl_int>(7, autoclearBufferSizes[base+3]);
-//        clearSixBuffersKernel.setArg<cl::Memory>(8, *autoclearBuffers[base+4]);
-//        clearSixBuffersKernel.setArg<cl_int>(9, autoclearBufferSizes[base+4]);
-//        clearSixBuffersKernel.setArg<cl::Memory>(10, *autoclearBuffers[base+5]);
-//        clearSixBuffersKernel.setArg<cl_int>(11, autoclearBufferSizes[base+5]);
-//        executeKernel(clearSixBuffersKernel, max(max(max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), autoclearBufferSizes[base+4]), autoclearBufferSizes[base+5]), 128);
-//        base += 6;
-//    }
-//    if (total-base == 5) {
-//        clearFiveBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]);
-//        clearFiveBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]);
-//        clearFiveBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]);
-//        clearFiveBuffersKernel.setArg<cl_int>(3, autoclearBufferSizes[base+1]);
-//        clearFiveBuffersKernel.setArg<cl::Memory>(4, *autoclearBuffers[base+2]);
-//        clearFiveBuffersKernel.setArg<cl_int>(5, autoclearBufferSizes[base+2]);
-//        clearFiveBuffersKernel.setArg<cl::Memory>(6, *autoclearBuffers[base+3]);
-//        clearFiveBuffersKernel.setArg<cl_int>(7, autoclearBufferSizes[base+3]);
-//        clearFiveBuffersKernel.setArg<cl::Memory>(8, *autoclearBuffers[base+4]);
-//        clearFiveBuffersKernel.setArg<cl_int>(9, autoclearBufferSizes[base+4]);
-//        executeKernel(clearFiveBuffersKernel, max(max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), autoclearBufferSizes[base+4]), 128);
-//    }
-//    else if (total-base == 4) {
-//        clearFourBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]);
-//        clearFourBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]);
-//        clearFourBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]);
-//        clearFourBuffersKernel.setArg<cl_int>(3, autoclearBufferSizes[base+1]);
-//        clearFourBuffersKernel.setArg<cl::Memory>(4, *autoclearBuffers[base+2]);
-//        clearFourBuffersKernel.setArg<cl_int>(5, autoclearBufferSizes[base+2]);
-//        clearFourBuffersKernel.setArg<cl::Memory>(6, *autoclearBuffers[base+3]);
-//        clearFourBuffersKernel.setArg<cl_int>(7, autoclearBufferSizes[base+3]);
-//        executeKernel(clearFourBuffersKernel, max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), 128);
-//    }
-//    else if (total-base == 3) {
-//        clearThreeBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]);
-//        clearThreeBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]);
-//        clearThreeBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]);
-//        clearThreeBuffersKernel.setArg<cl_int>(3, autoclearBufferSizes[base+1]);
-//        clearThreeBuffersKernel.setArg<cl::Memory>(4, *autoclearBuffers[base+2]);
-//        clearThreeBuffersKernel.setArg<cl_int>(5, autoclearBufferSizes[base+2]);
-//        executeKernel(clearThreeBuffersKernel, max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), 128);
-//    }
-//    else if (total-base == 2) {
-//        clearTwoBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]);
-//        clearTwoBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]);
-//        clearTwoBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]);
-//        clearTwoBuffersKernel.setArg<cl_int>(3, autoclearBufferSizes[base+1]);
-//        executeKernel(clearTwoBuffersKernel, max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), 128);
-//    }
-//    else if (total-base == 1) {
-//        clearBuffer(*autoclearBuffers[base], autoclearBufferSizes[base]);
-//    }
-//}
+void CudaContext::clearAutoclearBuffers() {
+    int base = 0;
+    int total = autoclearBufferSizes.size();
+    while (total-base >= 6) {
+        void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base],
+                        &autoclearBuffers[base+1], &autoclearBufferSizes[base+1],
+                        &autoclearBuffers[base+2], &autoclearBufferSizes[base+2],
+                        &autoclearBuffers[base+3], &autoclearBufferSizes[base+3],
+                        &autoclearBuffers[base+4], &autoclearBufferSizes[base+4],
+                        &autoclearBuffers[base+5], &autoclearBufferSizes[base+5]};
+        executeKernel(clearSixBuffersKernel, args, max(max(max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), autoclearBufferSizes[base+4]), autoclearBufferSizes[base+5]), 128);
+        base += 6;
+    }
+    if (total-base == 5) {
+        void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base],
+                        &autoclearBuffers[base+1], &autoclearBufferSizes[base+1],
+                        &autoclearBuffers[base+2], &autoclearBufferSizes[base+2],
+                        &autoclearBuffers[base+3], &autoclearBufferSizes[base+3],
+                        &autoclearBuffers[base+4], &autoclearBufferSizes[base+4]};
+        executeKernel(clearFiveBuffersKernel, args, max(max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), autoclearBufferSizes[base+4]), 128);
+    }
+    else if (total-base == 4) {
+        void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base],
+                        &autoclearBuffers[base+1], &autoclearBufferSizes[base+1],
+                        &autoclearBuffers[base+2], &autoclearBufferSizes[base+2],
+                        &autoclearBuffers[base+3], &autoclearBufferSizes[base+3]};
+        executeKernel(clearFourBuffersKernel, args, max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), 128);
+    }
+    else if (total-base == 3) {
+        void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base],
+                        &autoclearBuffers[base+1], &autoclearBufferSizes[base+1],
+                        &autoclearBuffers[base+2], &autoclearBufferSizes[base+2]};
+        executeKernel(clearThreeBuffersKernel, args, max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), 128);
+    }
+    else if (total-base == 2) {
+        void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base],
+                        &autoclearBuffers[base+1], &autoclearBufferSizes[base+1]};
+        executeKernel(clearTwoBuffersKernel, args, max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), 128);
+    }
+    else if (total-base == 1) {
+        clearBuffer(autoclearBuffers[base], autoclearBufferSizes[base]*4);
+    }
+}

 void CudaContext::tagAtomsInMolecule(int atom, int molecule, vector<int>& atomMolecule, vector<vector<int> >& atomBonds) {
    // Recursively tag atoms as belonging to a particular molecule.
@@ -539,7 +522,7 @@ void CudaContext::tagAtomsInMolecule(int atom, int molecule, vector<int>& atomMo
 */
 class CudaContext::VirtualSiteInfo : public CudaForceInfo {
 public:
-    VirtualSiteInfo(const System& system) : CudaForceInfo(0) {
+    VirtualSiteInfo(const System& system) {
        for (int i = 0; i < system.getNumParticles(); i++) {
            if (system.isVirtualSite(i)) {
                siteTypes.push_back(&typeid(system.getVirtualSite(i)));

--- a/platforms/cuda2/src/CudaContext.h
+++ b/platforms/cuda2/src/CudaContext.h
@@ -125,41 +125,42 @@ public:
        return *velm;
    }
    /**
-     * Get the array which contains the force on each atom (respresented as a long3 in 64 bit fixed point).
+     * Get the array which contains the force on each atom (represented as three long longs in 64 bit fixed point).
     */
    CudaArray& getForce() {
        return *force;
    }
-//    /**
-//     * Get the array which contains the buffers in which forces are computed.
-//     */
-//    CudaArray<mm_float4>& getForceBuffers() {
-//        return *forceBuffers;
-//    }
-//    /**
-//     * Get the array which contains a contribution to each force represented as 64 bit fixed point.
-//     */
-//    CudaArray<cl_long>& getLongForceBuffer() {
-//        return *longForceBuffer;
-//    }
-//    /**
-//     * Get the array which contains the buffer in which energy is computed.
-//     */
-//    CudaArray<cl_float>& getEnergyBuffer() {
-//        return *energyBuffer;
-//    }
-//    /**
-//     * Get the array which contains the index of each atom.
-//     */
-//    CudaArray<cl_int>& getAtomIndex() {
-//        return *atomIndex;
-//    }
-//    /**
-//     * Get the number of cells by which the positions are offset.
-//     */
-//    std::vector<mm_int4>& getPosCellOffsets() {
-//        return posCellOffsets;
-//    }
+    /**
+     * Get the array which contains the buffer in which energy is computed.
+     */
+    CudaArray& getEnergyBuffer() {
+        return *energyBuffer;
+    }
+    /**
+     * Get a pointer to a block of pinned memory that can be used for efficient transfers between host and device.
+     * This is guaranteed to be at least as large as any of the arrays returned by methods of this class.
+     */
+    void* getPinnedBuffer() {
+        return pinnedBuffer;
+    }
+    /**
+     * Get the host-side vector which contains the index of each atom.
+     */
+    const std::vector<int>& getAtomIndex() const {
+        return atomIndex;
+    }
+    /**
+     * Get the array which contains the index of each atom.
+     */
+    CudaArray& getAtomIndexArray() {
+        return *atomIndexDevice;
+    }
+    /**
+     * Get the number of cells by which the positions are offset.
+     */
+    std::vector<int4>& getPosCellOffsets() {
+        return posCellOffsets;
+    }
    /**
     * Replace all occurrences of a list of substrings.
     *
@@ -210,20 +211,20 @@ public:
     * Set all elements of an array to 0.
     *
     * @param memory     the memory to clear
-     * @param size       the number of 4-byte elements in the buffer
+     * @param size       the size of the buffer in bytes
     */
    void clearBuffer(CUdeviceptr memory, int size);
    /**
     * Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
     *
     * @param memory     the memory to clear
-     * @param size       the number of 4-byte elements in the buffer
+     * @param size       the size of the buffer in bytes
     */
    void addAutoclearBuffer(CUdeviceptr memory, int size);
-//    /**
-//     * Clear all buffers that have been registered with addAutoclearBuffer().
-//     */
-//    void clearAutoclearBuffers();
+    /**
+     * Clear all buffers that have been registered with addAutoclearBuffer().
+     */
+    void clearAutoclearBuffers();
    /**
     * Get the current simulation time.
     */
@@ -309,27 +310,27 @@ public:
    /**
     * Convert a CUDA result code to the corresponding string description.
     */
-    std::string getErrorString(CUresult result);
+    static std::string getErrorString(CUresult result);
    
-//    /**
-//     * Get the size of the periodic box.
-//     */
-//    float4 getPeriodicBoxSize() const {
-//        return periodicBoxSize;
-//    }
-//    /**
-//     * Set the size of the periodic box.
-//     */
-//    void setPeriodicBoxSize(double xsize, double ysize, double zsize) {
-//        periodicBoxSize = make_float4((float) xsize, (float) ysize, (float) zsize, 0);
-//        invPeriodicBoxSize = make_float4((float) (1.0/xsize), (float) (1.0/ysize), (float) (1.0/zsize), 0);
-//    }
-//    /**
-//     * Get the inverse of the size of the periodic box.
-//     */
-//    float4 getInvPeriodicBoxSize() const {
-//        return invPeriodicBoxSize;
-//    }
+    /**
+     * Get the size of the periodic box.
+     */
+    double4 getPeriodicBoxSize() const {
+        return periodicBoxSize;
+    }
+    /**
+     * Set the size of the periodic box.
+     */
+    void setPeriodicBoxSize(double xsize, double ysize, double zsize) {
+        periodicBoxSize = make_double4(xsize, ysize, zsize, 0.0);
+        invPeriodicBoxSize = make_double4(1.0/xsize, 1.0/ysize, 1.0/zsize, 0.0);
+    }
+    /**
+     * Get the inverse of the size of the periodic box.
+     */
+    double4 getInvPeriodicBoxSize() const {
+        return invPeriodicBoxSize;
+    }
    /**
     * Get the CudaIntegrationUtilities for this context.
     */
@@ -342,12 +343,12 @@ public:
    CudaExpressionUtilities& getExpressionUtilities() {
        return *expression;
    }
-//    /**
-//     * Get the CudaBondedUtilities for this context.
-//     */
-//    CudaBondedUtilities& getBondedUtilities() {
-//        return *bonded;
-//    }
+    /**
+     * Get the CudaBondedUtilities for this context.
+     */
+    CudaBondedUtilities& getBondedUtilities() {
+        return *bonded;
+    }
 //    /**
 //     * Get the CudaNonbondedUtilities for this context.
 //     */
@@ -428,8 +429,8 @@ private:
    int numThreadBlocks;
    bool useBlockingSync, useDoublePrecision, accumulateInDouble, contextIsValid, atomsWereReordered, moleculesInvalid;
    std::string compiler, tempDir, gpuArchitecture;
-    float4 periodicBoxSize;
-    float4 invPeriodicBoxSize;
+    double4 periodicBoxSize;
+    double4 invPeriodicBoxSize;
    std::string defaultOptimizationOptions;
    std::map<std::string, std::string> compilationDefines;
    CUcontext context;
@@ -456,7 +457,7 @@ private:
    std::vector<ReorderListener*> reorderListeners;
    CudaIntegrationUtilities* integration;
    CudaExpressionUtilities* expression;
-//    CudaBondedUtilities* bonded;
+    CudaBondedUtilities* bonded;
 //    CudaNonbondedUtilities* nonbonded;
    WorkThread* thread;
 };

--- a/platforms/cuda2/src/CudaForceInfo.h
+++ b/platforms/cuda2/src/CudaForceInfo.h
@@ -39,13 +39,7 @@ namespace OpenMM {

 class OPENMM_EXPORT CudaForceInfo {
 public:
-    CudaForceInfo(int requiredForceBuffers) : requiredForceBuffers(requiredForceBuffers) {
-    }
-    /**
-     * Get the number of force buffers this force requires.
-     */
-    int getRequiredForceBuffers() {
-        return requiredForceBuffers;
+    CudaForceInfo() {
    }
    /**
     * Get whether or not two particles have identical force field parameters.
@@ -63,8 +57,6 @@ public:
     * Get whether two particle groups are identical.
     */
    virtual bool areGroupsIdentical(int group1, int group2);
-private:
-    int requiredForceBuffers;
 };

 } // namespace OpenMM

--- a/platforms/cuda2/src/CudaIntegrationUtilities.cpp
+++ b/platforms/cuda2/src/CudaIntegrationUtilities.cpp
@@ -672,15 +672,15 @@ CudaIntegrationUtilities::~CudaIntegrationUtilities() {
        delete vsiteOutOfPlaneWeights;
 }

-//void CudaIntegrationUtilities::applyConstraints(double tol) {
-//    applyConstraints(false, tol);
-//}
-//
-//void CudaIntegrationUtilities::applyVelocityConstraints(double tol) {
-//    applyConstraints(true, tol);
-//}
-//
-//void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double tol) {
+void CudaIntegrationUtilities::applyConstraints(double tol) {
+    applyConstraints(false, tol);
+}
+
+void CudaIntegrationUtilities::applyVelocityConstraints(double tol) {
+    applyConstraints(true, tol);
+}
+
+void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double tol) {
 //    bool hasInitialized;
 //    CUfunction settleKernel, shakeKernel, ccmaForceKernel, ccmaUpdateKernel;
 //    if (constrainVelocities) {
@@ -772,19 +772,19 @@ CudaIntegrationUtilities::~CudaIntegrationUtilities() {
 //            }
 //        }
 //    }
-//}
-//
-//void CudaIntegrationUtilities::computeVirtualSites() {
+}
+
+void CudaIntegrationUtilities::computeVirtualSites() {
 //    if (numVsites > 0)
 //        context.executeKernel(vsitePositionKernel, numVsites);
-//}
-//
-//void CudaIntegrationUtilities::distributeForcesFromVirtualSites() {
+}
+
+void CudaIntegrationUtilities::distributeForcesFromVirtualSites() {
 //    if (numVsites > 0) {
 //        vsiteForceKernel.setArg<cl::Buffer>(1, context.getForce().getDeviceBuffer());
 //        context.executeKernel(vsiteForceKernel, numVsites);
 //    }
-//}
+}

 void CudaIntegrationUtilities::initRandomNumberGenerator(unsigned int randomNumberSeed) {
    if (random != NULL) {

--- a/platforms/cuda2/src/CudaKernelFactory.cpp
+++ b/platforms/cuda2/src/CudaKernelFactory.cpp
@@ -25,6 +25,7 @@
 * -------------------------------------------------------------------------- */

 #include "CudaKernelFactory.h"
+#include "CudaKernels.h"
 //#include "CudaParallelKernels.h"
 #include "CudaPlatform.h"
 #include "openmm/internal/ContextImpl.h"
@@ -66,64 +67,64 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform
 //        if (name == CalcCustomCompoundBondForceKernel::Name())
 //            return new CudaParallelCalcCustomCompoundBondForceKernel(name, platform, data, context.getSystem());
 //    }
-//    CudaContext& cl = *data.contexts[0];
-//    if (name == CalcForcesAndEnergyKernel::Name())
-//        return new CudaCalcForcesAndEnergyKernel(name, platform, cl);
-//    if (name == UpdateStateDataKernel::Name())
-//        return new CudaUpdateStateDataKernel(name, platform, cl);
-//    if (name == ApplyConstraintsKernel::Name())
-//        return new CudaApplyConstraintsKernel(name, platform, cl);
-//    if (name == VirtualSitesKernel::Name())
-//        return new CudaVirtualSitesKernel(name, platform, cl);
-//    if (name == CalcHarmonicBondForceKernel::Name())
-//        return new CudaCalcHarmonicBondForceKernel(name, platform, cl, context.getSystem());
+    CudaContext& cu = *data.contexts[0];
+    if (name == CalcForcesAndEnergyKernel::Name())
+        return new CudaCalcForcesAndEnergyKernel(name, platform, cu);
+    if (name == UpdateStateDataKernel::Name())
+        return new CudaUpdateStateDataKernel(name, platform, cu);
+    if (name == ApplyConstraintsKernel::Name())
+        return new CudaApplyConstraintsKernel(name, platform, cu);
+    if (name == VirtualSitesKernel::Name())
+        return new CudaVirtualSitesKernel(name, platform, cu);
+    if (name == CalcHarmonicBondForceKernel::Name())
+        return new CudaCalcHarmonicBondForceKernel(name, platform, cu, context.getSystem());
 //    if (name == CalcCustomBondForceKernel::Name())
-//        return new CudaCalcCustomBondForceKernel(name, platform, cl, context.getSystem());
+//        return new CudaCalcCustomBondForceKernel(name, platform, cu, context.getSystem());
 //    if (name == CalcHarmonicAngleForceKernel::Name())
-//        return new CudaCalcHarmonicAngleForceKernel(name, platform, cl, context.getSystem());
+//        return new CudaCalcHarmonicAngleForceKernel(name, platform, cu, context.getSystem());
 //    if (name == CalcCustomAngleForceKernel::Name())
-//        return new CudaCalcCustomAngleForceKernel(name, platform, cl, context.getSystem());
+//        return new CudaCalcCustomAngleForceKernel(name, platform, cu, context.getSystem());
 //    if (name == CalcPeriodicTorsionForceKernel::Name())
-//        return new CudaCalcPeriodicTorsionForceKernel(name, platform, cl, context.getSystem());
+//        return new CudaCalcPeriodicTorsionForceKernel(name, platform, cu, context.getSystem());
 //    if (name == CalcRBTorsionForceKernel::Name())
-//        return new CudaCalcRBTorsionForceKernel(name, platform, cl, context.getSystem());
+//        return new CudaCalcRBTorsionForceKernel(name, platform, cu, context.getSystem());
 //    if (name == CalcCMAPTorsionForceKernel::Name())
-//        return new CudaCalcCMAPTorsionForceKernel(name, platform, cl, context.getSystem());
+//        return new CudaCalcCMAPTorsionForceKernel(name, platform, cu, context.getSystem());
 //    if (name == CalcCustomTorsionForceKernel::Name())
-//        return new CudaCalcCustomTorsionForceKernel(name, platform, cl, context.getSystem());
+//        return new CudaCalcCustomTorsionForceKernel(name, platform, cu, context.getSystem());
 //    if (name == CalcNonbondedForceKernel::Name())
-//        return new CudaCalcNonbondedForceKernel(name, platform, cl, context.getSystem());
+//        return new CudaCalcNonbondedForceKernel(name, platform, cu, context.getSystem());
 //    if (name == CalcCustomNonbondedForceKernel::Name())
-//        return new CudaCalcCustomNonbondedForceKernel(name, platform, cl, context.getSystem());
+//        return new CudaCalcCustomNonbondedForceKernel(name, platform, cu, context.getSystem());
 //    if (name == CalcGBSAOBCForceKernel::Name())
-//        return new CudaCalcGBSAOBCForceKernel(name, platform, cl);
+//        return new CudaCalcGBSAOBCForceKernel(name, platform, cu);
 //    if (name == CalcCustomGBForceKernel::Name())
-//        return new CudaCalcCustomGBForceKernel(name, platform, cl, context.getSystem());
+//        return new CudaCalcCustomGBForceKernel(name, platform, cu, context.getSystem());
 //    if (name == CalcCustomExternalForceKernel::Name())
-//        return new CudaCalcCustomExternalForceKernel(name, platform, cl, context.getSystem());
+//        return new CudaCalcCustomExternalForceKernel(name, platform, cu, context.getSystem());
 //    if (name == CalcCustomHbondForceKernel::Name())
-//        return new CudaCalcCustomHbondForceKernel(name, platform, cl, context.getSystem());
+//        return new CudaCalcCustomHbondForceKernel(name, platform, cu, context.getSystem());
 //    if (name == CalcCustomCompoundBondForceKernel::Name())
-//        return new CudaCalcCustomCompoundBondForceKernel(name, platform, cl, context.getSystem());
-//    if (name == IntegrateVerletStepKernel::Name())
-//        return new CudaIntegrateVerletStepKernel(name, platform, cl);
+//        return new CudaCalcCustomCompoundBondForceKernel(name, platform, cu, context.getSystem());
+    if (name == IntegrateVerletStepKernel::Name())
+        return new CudaIntegrateVerletStepKernel(name, platform, cu);
 //    if (name == IntegrateLangevinStepKernel::Name())
-//        return new CudaIntegrateLangevinStepKernel(name, platform, cl);
+//        return new CudaIntegrateLangevinStepKernel(name, platform, cu);
 //    if (name == IntegrateBrownianStepKernel::Name())
-//        return new CudaIntegrateBrownianStepKernel(name, platform, cl);
+//        return new CudaIntegrateBrownianStepKernel(name, platform, cu);
 //    if (name == IntegrateVariableVerletStepKernel::Name())
-//        return new CudaIntegrateVariableVerletStepKernel(name, platform, cl);
+//        return new CudaIntegrateVariableVerletStepKernel(name, platform, cu);
 //    if (name == IntegrateVariableLangevinStepKernel::Name())
-//        return new CudaIntegrateVariableLangevinStepKernel(name, platform, cl);
+//        return new CudaIntegrateVariableLangevinStepKernel(name, platform, cu);
 //    if (name == IntegrateCustomStepKernel::Name())
-//        return new CudaIntegrateCustomStepKernel(name, platform, cl);
+//        return new CudaIntegrateCustomStepKernel(name, platform, cu);
 //    if (name == ApplyAndersenThermostatKernel::Name())
-//        return new CudaApplyAndersenThermostatKernel(name, platform, cl);
+//        return new CudaApplyAndersenThermostatKernel(name, platform, cu);
 //    if (name == ApplyMonteCarloBarostatKernel::Name())
-//        return new CudaApplyMonteCarloBarostatKernel(name, platform, cl);
-//    if (name == CalcKineticEnergyKernel::Name())
-//        return new CudaCalcKineticEnergyKernel(name, platform, cl);
+//        return new CudaApplyMonteCarloBarostatKernel(name, platform, cu);
+    if (name == CalcKineticEnergyKernel::Name())
+        return new CudaCalcKineticEnergyKernel(name, platform, cu);
 //    if (name == RemoveCMMotionKernel::Name())
-//        return new CudaRemoveCMMotionKernel(name, platform, cl);
+//        return new CudaRemoveCMMotionKernel(name, platform, cu);
    throw OpenMMException((std::string("Tried to create kernel with illegal kernel name '")+name+"'").c_str());
 }
--- a/platforms/cuda2/src/CudaKernels.cpp
+++ b/platforms/cuda2/src/CudaKernels.cpp
--- a/platforms/cuda2/src/CudaKernels.h
+++ b/platforms/cuda2/src/CudaKernels.h
@@ -28,236 +28,242 @@
 * -------------------------------------------------------------------------- */

 #include "CudaPlatform.h"
-//#include "CudaArray.h"
+#include "CudaArray.h"
 #include "CudaContext.h"
-//#include "CudaFFT3D.h"
 //#include "CudaParameterSet.h"
-//#include "CudaSort.h"
+#include "CudaSort.h"
 #include "openmm/kernels.h"
 #include "openmm/System.h"

 namespace OpenMM {

-///**
-// * This kernel is invoked at the beginning and end of force and energy computations.  It gives the
-// * Platform a chance to clear buffers and do other initialization at the beginning, and to do any
-// * necessary work at the end to determine the final results.
-// */
-//class CudaCalcForcesAndEnergyKernel : public CalcForcesAndEnergyKernel {
-//public:
-//    CudaCalcForcesAndEnergyKernel(std::string name, const Platform& platform, CudaContext& cl) : CalcForcesAndEnergyKernel(name, platform), cl(cl) {
-//    }
-//    /**
-//     * Initialize the kernel.
-//     *
-//     * @param system     the System this kernel will be applied to
-//     */
-//    void initialize(const System& system);
-//    /**
-//     * This is called at the beginning of each force/energy computation, before calcForcesAndEnergy() has been called on
-//     * any ForceImpl.
-//     *
-//     * @param context       the context in which to execute this kernel
-//     * @param includeForce  true if forces should be computed
-//     * @param includeEnergy true if potential energy should be computed
-//     * @param groups        a set of bit flags for which force groups to include
-//     */
-//    void beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups);
-//    /**
-//     * This is called at the end of each force/energy computation, after calcForcesAndEnergy() has been called on
-//     * every ForceImpl.
-//     *
-//     * @param context       the context in which to execute this kernel
-//     * @param includeForce  true if forces should be computed
-//     * @param includeEnergy true if potential energy should be computed
-//     * @param groups        a set of bit flags for which force groups to include
-//     * @return the potential energy of the system.  This value is added to all values returned by ForceImpls'
-//     * calcForcesAndEnergy() methods.  That is, each force kernel may <i>either</i> return its contribution to the
-//     * energy directly, <i>or</i> add it to an internal buffer so that it will be included here.
-//     */
-//    double finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups);
-//private:
-//   CudaContext& cl;
-//};
-//
-///**
-// * This kernel provides methods for setting and retrieving various state data: time, positions,
-// * velocities, and forces.
-// */
-//class CudaUpdateStateDataKernel : public UpdateStateDataKernel {
-//public:
-//    CudaUpdateStateDataKernel(std::string name, const Platform& platform, CudaContext& cl) : UpdateStateDataKernel(name, platform), cl(cl) {
-//    }
-//    /**
-//     * Initialize the kernel.
-//     *
-//     * @param system     the System this kernel will be applied to
-//     */
-//    void initialize(const System& system);
-//    /**
-//     * Get the current time (in picoseconds).
-//     *
-//     * @param context    the context in which to execute this kernel
-//     */
-//    double getTime(const ContextImpl& context) const;
-//    /**
-//     * Set the current time (in picoseconds).
-//     *
-//     * @param context    the context in which to execute this kernel
-//     */
-//    void setTime(ContextImpl& context, double time);
-//    /**
-//     * Get the positions of all particles.
-//     *
-//     * @param positions  on exit, this contains the particle positions
-//     */
-//    void getPositions(ContextImpl& context, std::vector<Vec3>& positions);
-//    /**
-//     * Set the positions of all particles.
-//     *
-//     * @param positions  a vector containg the particle positions
-//     */
-//    void setPositions(ContextImpl& context, const std::vector<Vec3>& positions);
-//    /**
-//     * Get the velocities of all particles.
-//     *
-//     * @param velocities  on exit, this contains the particle velocities
-//     */
-//    void getVelocities(ContextImpl& context, std::vector<Vec3>& velocities);
-//    /**
-//     * Set the velocities of all particles.
-//     *
-//     * @param velocities  a vector containg the particle velocities
-//     */
-//    void setVelocities(ContextImpl& context, const std::vector<Vec3>& velocities);
-//    /**
-//     * Get the current forces on all particles.
-//     *
-//     * @param forces  on exit, this contains the forces
-//     */
-//    void getForces(ContextImpl& context, std::vector<Vec3>& forces);
-//    /**
-//     * Get the current periodic box vectors.
-//     *
-//     * @param a      on exit, this contains the vector defining the first edge of the periodic box
-//     * @param b      on exit, this contains the vector defining the second edge of the periodic box
-//     * @param c      on exit, this contains the vector defining the third edge of the periodic box
-//     */
-//    void getPeriodicBoxVectors(ContextImpl& context, Vec3& a, Vec3& b, Vec3& c) const;
-//    /**
-//     * Set the current periodic box vectors.
-//     *
-//     * @param a      the vector defining the first edge of the periodic box
-//     * @param b      the vector defining the second edge of the periodic box
-//     * @param c      the vector defining the third edge of the periodic box
-//     */
-//    void setPeriodicBoxVectors(ContextImpl& context, const Vec3& a, const Vec3& b, const Vec3& c) const;
-//    /**
-//     * Create a checkpoint recording the current state of the Context.
-//     * 
-//     * @param stream    an output stream the checkpoint data should be written to
-//     */
-//    void createCheckpoint(ContextImpl& context, std::ostream& stream);
-//    /**
-//     * Load a checkpoint that was written by createCheckpoint().
-//     * 
-//     * @param stream    an input stream the checkpoint data should be read from
-//     */
-//    void loadCheckpoint(ContextImpl& context, std::istream& stream);
-//private:
-//    CudaContext& cl;
-//};
-//
-///**
-// * This kernel modifies the positions of particles to enforce distance constraints.
-// */
-//class CudaApplyConstraintsKernel : public ApplyConstraintsKernel {
-//public:
-//    CudaApplyConstraintsKernel(std::string name, const Platform& platform, CudaContext& cl) : ApplyConstraintsKernel(name, platform),
-//            cl(cl), hasInitializedKernel(false) {
-//    }
-//    /**
-//     * Initialize the kernel.
-//     *
-//     * @param system     the System this kernel will be applied to
-//     */
-//    void initialize(const System& system);
-//    /**
-//     * Update particle positions to enforce constraints.
-//     *
-//     * @param context    the context in which to execute this kernel
-//     * @param tol        the distance tolerance within which constraints must be satisfied.
-//     */
-//    void apply(ContextImpl& context, double tol);
-//private:
-//    CudaContext& cl;
-//    bool hasInitializedKernel;
-//    cl::Kernel applyDeltasKernel;
-//};
-//
-///**
-// * This kernel recomputes the positions of virtual sites.
-// */
-//class CudaVirtualSitesKernel : public VirtualSitesKernel {
-//public:
-//    CudaVirtualSitesKernel(std::string name, const Platform& platform, CudaContext& cl) : VirtualSitesKernel(name, platform), cl(cl) {
-//    }
-//    /**
-//     * Initialize the kernel.
-//     *
-//     * @param system     the System this kernel will be applied to
-//     */
-//    void initialize(const System& system);
-//    /**
-//     * Compute the virtual site locations.
-//     *
-//     * @param context    the context in which to execute this kernel
-//     */
-//    void computePositions(ContextImpl& context);
-//private:
-//    CudaContext& cl;
-//};
-//
-///**
-// * This kernel is invoked by HarmonicBondForce to calculate the forces acting on the system and the energy of the system.
-// */
-//class CudaCalcHarmonicBondForceKernel : public CalcHarmonicBondForceKernel {
-//public:
-//    CudaCalcHarmonicBondForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcHarmonicBondForceKernel(name, platform),
-//            hasInitializedKernel(false), cl(cl), system(system), params(NULL) {
-//    }
-//    ~CudaCalcHarmonicBondForceKernel();
-//    /**
-//     * Initialize the kernel.
-//     *
-//     * @param system     the System this kernel will be applied to
-//     * @param force      the HarmonicBondForce this kernel will be used for
-//     */
-//    void initialize(const System& system, const HarmonicBondForce& force);
-//    /**
-//     * Execute the kernel to calculate the forces and/or energy.
-//     *
-//     * @param context        the context in which to execute this kernel
-//     * @param includeForces  true if forces should be calculated
-//     * @param includeEnergy  true if the energy should be calculated
-//     * @return the potential energy due to the force
-//     */
-//    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-//private:
-//    int numBonds;
-//    bool hasInitializedKernel;
-//    CudaContext& cl;
-//    System& system;
-//    CudaArray<mm_float2>* params;
-//};
-//
+/**
+ * This kernel is invoked at the beginning and end of force and energy computations.  It gives the
+ * Platform a chance to clear buffers and do other initialization at the beginning, and to do any
+ * necessary work at the end to determine the final results.
+ */
+class CudaCalcForcesAndEnergyKernel : public CalcForcesAndEnergyKernel {
+public:
+    CudaCalcForcesAndEnergyKernel(std::string name, const Platform& platform, CudaContext& cu) : CalcForcesAndEnergyKernel(name, platform), cu(cu) {
+    }
+    /**
+     * Initialize the kernel.
+     *
+     * @param system     the System this kernel will be applied to
+     */
+    void initialize(const System& system);
+    /**
+     * This is called at the beginning of each force/energy computation, before calcForcesAndEnergy() has been called on
+     * any ForceImpl.
+     *
+     * @param context       the context in which to execute this kernel
+     * @param includeForce  true if forces should be computed
+     * @param includeEnergy true if potential energy should be computed
+     * @param groups        a set of bit flags for which force groups to include
+     */
+    void beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups);
+    /**
+     * This is called at the end of each force/energy computation, after calcForcesAndEnergy() has been called on
+     * every ForceImpl.
+     *
+     * @param context       the context in which to execute this kernel
+     * @param includeForce  true if forces should be computed
+     * @param includeEnergy true if potential energy should be computed
+     * @param groups        a set of bit flags for which force groups to include
+     * @return the potential energy of the system.  This value is added to all values returned by ForceImpls'
+     * calcForcesAndEnergy() methods.  That is, each force kernel may <i>either</i> return its contribution to the
+     * energy directly, <i>or</i> add it to an internal buffer so that it will be included here.
+     */
+    double finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups);
+private:
+   CudaContext& cu;
+};
+
+/**
+ * This kernel provides methods for setting and retrieving various state data: time, positions,
+ * velocities, and forces.
+ */
+class CudaUpdateStateDataKernel : public UpdateStateDataKernel {
+public:
+    CudaUpdateStateDataKernel(std::string name, const Platform& platform, CudaContext& cu) : UpdateStateDataKernel(name, platform), cu(cu) {
+    }
+    /**
+     * Initialize the kernel.
+     *
+     * @param system     the System this kernel will be applied to
+     */
+    void initialize(const System& system);
+    /**
+     * Get the current time (in picoseconds).
+     *
+     * @param context    the context in which to execute this kernel
+     */
+    double getTime(const ContextImpl& context) const;
+    /**
+     * Set the current time (in picoseconds).
+     *
+     * @param context    the context in which to execute this kernel
+     */
+    void setTime(ContextImpl& context, double time);
+    /**
+     * Get the positions of all particles.
+     *
+     * @param positions  on exit, this contains the particle positions
+     */
+    void getPositions(ContextImpl& context, std::vector<Vec3>& positions);
+    /**
+     * Set the positions of all particles.
+     *
+     * @param positions  a vector containg the particle positions
+     */
+    void setPositions(ContextImpl& context, const std::vector<Vec3>& positions);
+    /**
+     * Get the velocities of all particles.
+     *
+     * @param velocities  on exit, this contains the particle velocities
+     */
+    void getVelocities(ContextImpl& context, std::vector<Vec3>& velocities);
+    /**
+     * Set the velocities of all particles.
+     *
+     * @param velocities  a vector containg the particle velocities
+     */
+    void setVelocities(ContextImpl& context, const std::vector<Vec3>& velocities);
+    /**
+     * Get the current forces on all particles.
+     *
+     * @param forces  on exit, this contains the forces
+     */
+    void getForces(ContextImpl& context, std::vector<Vec3>& forces);
+    /**
+     * Get the current periodic box vectors.
+     *
+     * @param a      on exit, this contains the vector defining the first edge of the periodic box
+     * @param b      on exit, this contains the vector defining the second edge of the periodic box
+     * @param c      on exit, this contains the vector defining the third edge of the periodic box
+     */
+    void getPeriodicBoxVectors(ContextImpl& context, Vec3& a, Vec3& b, Vec3& c) const;
+    /**
+     * Set the current periodic box vectors.
+     *
+     * @param a      the vector defining the first edge of the periodic box
+     * @param b      the vector defining the second edge of the periodic box
+     * @param c      the vector defining the third edge of the periodic box
+     */
+    void setPeriodicBoxVectors(ContextImpl& context, const Vec3& a, const Vec3& b, const Vec3& c) const;
+    /**
+     * Create a checkpoint recording the current state of the Context.
+     * 
+     * @param stream    an output stream the checkpoint data should be written to
+     */
+    void createCheckpoint(ContextImpl& context, std::ostream& stream);
+    /**
+     * Load a checkpoint that was written by createCheckpoint().
+     * 
+     * @param stream    an input stream the checkpoint data should be read from
+     */
+    void loadCheckpoint(ContextImpl& context, std::istream& stream);
+private:
+    CudaContext& cu;
+};
+
+/**
+ * This kernel modifies the positions of particles to enforce distance constraints.
+ */
+class CudaApplyConstraintsKernel : public ApplyConstraintsKernel {
+public:
+    CudaApplyConstraintsKernel(std::string name, const Platform& platform, CudaContext& cu) : ApplyConstraintsKernel(name, platform),
+            cu(cu), hasInitializedKernel(false) {
+    }
+    /**
+     * Initialize the kernel.
+     *
+     * @param system     the System this kernel will be applied to
+     */
+    void initialize(const System& system);
+    /**
+     * Update particle positions to enforce constraints.
+     *
+     * @param context    the context in which to execute this kernel
+     * @param tol        the distance tolerance within which constraints must be satisfied.
+     */
+    void apply(ContextImpl& context, double tol);
+private:
+    CudaContext& cu;
+    bool hasInitializedKernel;
+    CUfunction applyDeltasKernel;
+};
+
+/**
+ * This kernel recomputes the positions of virtual sites.
+ */
+class CudaVirtualSitesKernel : public VirtualSitesKernel {
+public:
+    CudaVirtualSitesKernel(std::string name, const Platform& platform, CudaContext& cu) : VirtualSitesKernel(name, platform), cu(cu) {
+    }
+    /**
+     * Initialize the kernel.
+     *
+     * @param system     the System this kernel will be applied to
+     */
+    void initialize(const System& system);
+    /**
+     * Compute the virtual site locations.
+     *
+     * @param context    the context in which to execute this kernel
+     */
+    void computePositions(ContextImpl& context);
+private:
+    CudaContext& cu;
+};
+
+/**
+ * This kernel is invoked by HarmonicBondForce to calculate the forces acting on the system and the energy of the system.
+ */
+class CudaCalcHarmonicBondForceKernel : public CalcHarmonicBondForceKernel {
+public:
+    CudaCalcHarmonicBondForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcHarmonicBondForceKernel(name, platform),
+            hasInitializedKernel(false), cu(cu), system(system), params(NULL) {
+    }
+    ~CudaCalcHarmonicBondForceKernel();
+    /**
+     * Initialize the kernel.
+     *
+     * @param system     the System this kernel will be applied to
+     * @param force      the HarmonicBondForce this kernel will be used for
+     */
+    void initialize(const System& system, const HarmonicBondForce& force);
+    /**
+     * Execute the kernel to calculate the forces and/or energy.
+     *
+     * @param context        the context in which to execute this kernel
+     * @param includeForces  true if forces should be calculated
+     * @param includeEnergy  true if the energy should be calculated
+     * @return the potential energy due to the force
+     */
+    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+    /**
+     * Copy changed parameters over to a context.
+     *
+     * @param context    the context to copy parameters to
+     * @param force      the HarmonicBondForce to copy the parameters from
+     */
+    void copyParametersToContext(ContextImpl& context, const HarmonicBondForce& force);
+private:
+    int numBonds;
+    bool hasInitializedKernel;
+    CudaContext& cu;
+    System& system;
+    CudaArray* params;
+};
+
 ///**
 // * This kernel is invoked by CustomBondForce to calculate the forces acting on the system and the energy of the system.
 // */
 //class CudaCalcCustomBondForceKernel : public CalcCustomBondForceKernel {
 //public:
-//    CudaCalcCustomBondForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcCustomBondForceKernel(name, platform),
-//            hasInitializedKernel(false), cl(cl), system(system), params(NULL), globals(NULL) {
+//    CudaCalcCustomBondForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcCustomBondForceKernel(name, platform),
+//            hasInitializedKernel(false), cu(cu), system(system), params(NULL), globals(NULL) {
 //    }
 //    ~CudaCalcCustomBondForceKernel();
 //    /**
@@ -276,10 +282,17 @@ namespace OpenMM {
 //     * @return the potential energy due to the force
 //     */
 //    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+//    /**
+//     * Copy changed parameters over to a context.
+//     *
+//     * @param context    the context to copy parameters to
+//     * @param force      the CustomBondForce to copy the parameters from
+//     */
+//    void copyParametersToContext(ContextImpl& context, const CustomBondForce& force);
 //private:
 //    int numBonds;
 //    bool hasInitializedKernel;
-//    CudaContext& cl;
+//    CudaContext& cu;
 //    System& system;
 //    CudaParameterSet* params;
 //    CudaArray<cl_float>* globals;
@@ -292,8 +305,8 @@ namespace OpenMM {
 // */
 //class CudaCalcHarmonicAngleForceKernel : public CalcHarmonicAngleForceKernel {
 //public:
-//    CudaCalcHarmonicAngleForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcHarmonicAngleForceKernel(name, platform),
-//            hasInitializedKernel(false), cl(cl), system(system), params(NULL) {
+//    CudaCalcHarmonicAngleForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcHarmonicAngleForceKernel(name, platform),
+//            hasInitializedKernel(false), cu(cu), system(system), params(NULL) {
 //    }
 //    ~CudaCalcHarmonicAngleForceKernel();
 //    /**
@@ -312,10 +325,17 @@ namespace OpenMM {
 //     * @return the potential energy due to the force
 //     */
 //    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+//    /**
+//     * Copy changed parameters over to a context.
+//     *
+//     * @param context    the context to copy parameters to
+//     * @param force      the HarmonicAngleForce to copy the parameters from
+//     */
+//    void copyParametersToContext(ContextImpl& context, const HarmonicAngleForce& force);
 //private:
 //    int numAngles;
 //    bool hasInitializedKernel;
-//    CudaContext& cl;
+//    CudaContext& cu;
 //    System& system;
 //    CudaArray<mm_float2>* params;
 //};
@@ -325,8 +345,8 @@ namespace OpenMM {
 // */
 //class CudaCalcCustomAngleForceKernel : public CalcCustomAngleForceKernel {
 //public:
-//    CudaCalcCustomAngleForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcCustomAngleForceKernel(name, platform),
-//            hasInitializedKernel(false), cl(cl), system(system), params(NULL), globals(NULL) {
+//    CudaCalcCustomAngleForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcCustomAngleForceKernel(name, platform),
+//            hasInitializedKernel(false), cu(cu), system(system), params(NULL), globals(NULL) {
 //    }
 //    ~CudaCalcCustomAngleForceKernel();
 //    /**
@@ -345,10 +365,17 @@ namespace OpenMM {
 //     * @return the potential energy due to the force
 //     */
 //    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+//    /**
+//     * Copy changed parameters over to a context.
+//     *
+//     * @param context    the context to copy parameters to
+//     * @param force      the CustomAngleForce to copy the parameters from
+//     */
+//    void copyParametersToContext(ContextImpl& context, const CustomAngleForce& force);
 //private:
 //    int numAngles;
 //    bool hasInitializedKernel;
-//    CudaContext& cl;
+//    CudaContext& cu;
 //    System& system;
 //    CudaParameterSet* params;
 //    CudaArray<cl_float>* globals;
@@ -361,8 +388,8 @@ namespace OpenMM {
 // */
 //class CudaCalcPeriodicTorsionForceKernel : public CalcPeriodicTorsionForceKernel {
 //public:
-//    CudaCalcPeriodicTorsionForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcPeriodicTorsionForceKernel(name, platform),
-//            hasInitializedKernel(false), cl(cl), system(system), params(NULL) {
+//    CudaCalcPeriodicTorsionForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcPeriodicTorsionForceKernel(name, platform),
+//            hasInitializedKernel(false), cu(cu), system(system), params(NULL) {
 //    }
 //    ~CudaCalcPeriodicTorsionForceKernel();
 //    /**
@@ -381,10 +408,17 @@ namespace OpenMM {
 //     * @return the potential energy due to the force
 //     */
 //    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+//    /**
+//     * Copy changed parameters over to a context.
+//     *
+//     * @param context    the context to copy parameters to
+//     * @param force      the PeriodicTorsionForce to copy the parameters from
+//     */
+//    void copyParametersToContext(ContextImpl& context, const PeriodicTorsionForce& force);
 //private:
 //    int numTorsions;
 //    bool hasInitializedKernel;
-//    CudaContext& cl;
+//    CudaContext& cu;
 //    System& system;
 //    CudaArray<mm_float4>* params;
 //};
@@ -394,8 +428,8 @@ namespace OpenMM {
 // */
 //class CudaCalcRBTorsionForceKernel : public CalcRBTorsionForceKernel {
 //public:
-//    CudaCalcRBTorsionForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcRBTorsionForceKernel(name, platform),
-//            hasInitializedKernel(false), cl(cl), system(system), params(NULL) {
+//    CudaCalcRBTorsionForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcRBTorsionForceKernel(name, platform),
+//            hasInitializedKernel(false), cu(cu), system(system), params(NULL) {
 //    }
 //    ~CudaCalcRBTorsionForceKernel();
 //    /**
@@ -414,10 +448,17 @@ namespace OpenMM {
 //     * @return the potential energy due to the force
 //     */
 //    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+//    /**
+//     * Copy changed parameters over to a context.
+//     *
+//     * @param context    the context to copy parameters to
+//     * @param force      the RBTorsionForce to copy the parameters from
+//     */
+//    void copyParametersToContext(ContextImpl& context, const RBTorsionForce& force);
 //private:
 //    int numTorsions;
 //    bool hasInitializedKernel;
-//    CudaContext& cl;
+//    CudaContext& cu;
 //    System& system;
 //    CudaArray<mm_float8>* params;
 //};
@@ -427,8 +468,8 @@ namespace OpenMM {
 // */
 //class CudaCalcCMAPTorsionForceKernel : public CalcCMAPTorsionForceKernel {
 //public:
-//    CudaCalcCMAPTorsionForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcCMAPTorsionForceKernel(name, platform),
-//            hasInitializedKernel(false), cl(cl), system(system), coefficients(NULL), mapPositions(NULL), torsionMaps(NULL) {
+//    CudaCalcCMAPTorsionForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcCMAPTorsionForceKernel(name, platform),
+//            hasInitializedKernel(false), cu(cu), system(system), coefficients(NULL), mapPositions(NULL), torsionMaps(NULL) {
 //    }
 //    ~CudaCalcCMAPTorsionForceKernel();
 //    /**
@@ -450,7 +491,7 @@ namespace OpenMM {
 //private:
 //    int numTorsions;
 //    bool hasInitializedKernel;
-//    CudaContext& cl;
+//    CudaContext& cu;
 //    System& system;
 //    CudaArray<mm_float4>* coefficients;
 //    CudaArray<mm_int2>* mapPositions;
@@ -462,8 +503,8 @@ namespace OpenMM {
 // */
 //class CudaCalcCustomTorsionForceKernel : public CalcCustomTorsionForceKernel {
 //public:
-//    CudaCalcCustomTorsionForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcCustomTorsionForceKernel(name, platform),
-//            hasInitializedKernel(false), cl(cl), system(system), params(NULL), globals(NULL) {
+//    CudaCalcCustomTorsionForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcCustomTorsionForceKernel(name, platform),
+//            hasInitializedKernel(false), cu(cu), system(system), params(NULL), globals(NULL) {
 //    }
 //    ~CudaCalcCustomTorsionForceKernel();
 //    /**
@@ -482,10 +523,17 @@ namespace OpenMM {
 //     * @return the potential energy due to the force
 //     */
 //    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+//    /**
+//     * Copy changed parameters over to a context.
+//     *
+//     * @param context    the context to copy parameters to
+//     * @param force      the CustomTorsionForce to copy the parameters from
+//     */
+//    void copyParametersToContext(ContextImpl& context, const CustomTorsionForce& force);
 //private:
 //    int numTorsions;
 //    bool hasInitializedKernel;
-//    CudaContext& cl;
+//    CudaContext& cu;
 //    System& system;
 //    CudaParameterSet* params;
 //    CudaArray<cl_float>* globals;
@@ -498,8 +546,8 @@ namespace OpenMM {
 // */
 //class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
 //public:
-//    CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcNonbondedForceKernel(name, platform),
-//            hasInitializedKernel(false), cl(cl), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), pmeGrid(NULL),
+//    CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcNonbondedForceKernel(name, platform),
+//            hasInitializedKernel(false), cu(cu), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), pmeGrid(NULL),
 //            pmeGrid2(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeBsplineTheta(NULL), pmeBsplineDTheta(NULL),
 //            pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL), fft(NULL) {
 //    }
@@ -540,7 +588,7 @@ namespace OpenMM {
 //        static const char* clMaxValue() {return "(int2) (INT_MAX, INT_MAX)";}
 //        static const char* clSortKey() {return "value.y";}
 //    };
-//    CudaContext& cl;
+//    CudaContext& cu;
 //    bool hasInitializedKernel;
 //    CudaArray<mm_float2>* sigmaEpsilon;
 //    CudaArray<mm_float4>* exceptionParams;
@@ -556,16 +604,16 @@ namespace OpenMM {
 //    CudaArray<mm_int2>* pmeAtomGridIndex;
 //    CudaSort<SortTrait>* sort;
 //    CudaFFT3D* fft;
-//    cl::Kernel ewaldSumsKernel;
-//    cl::Kernel ewaldForcesKernel;
-//    cl::Kernel pmeGridIndexKernel;
-//    cl::Kernel pmeAtomRangeKernel;
-//    cl::Kernel pmeZIndexKernel;
-//    cl::Kernel pmeUpdateBsplinesKernel;
-//    cl::Kernel pmeSpreadChargeKernel;
-//    cl::Kernel pmeFinishSpreadChargeKernel;
-//    cl::Kernel pmeConvolutionKernel;
-//    cl::Kernel pmeInterpolateForceKernel;
+//    CUfunction ewaldSumsKernel;
+//    CUfunction ewaldForcesKernel;
+//    CUfunction pmeGridIndexKernel;
+//    CUfunction pmeAtomRangeKernel;
+//    CUfunction pmeZIndexKernel;
+//    CUfunction pmeUpdateBsplinesKernel;
+//    CUfunction pmeSpreadChargeKernel;
+//    CUfunction pmeFinishSpreadChargeKernel;
+//    CUfunction pmeConvolutionKernel;
+//    CUfunction pmeInterpolateForceKernel;
 //    std::map<std::string, std::string> pmeDefines;
 //    double ewaldSelfEnergy, dispersionCoefficient, alpha;
 //    int interpolateForceThreads;
@@ -578,8 +626,8 @@ namespace OpenMM {
 // */
 //class CudaCalcCustomNonbondedForceKernel : public CalcCustomNonbondedForceKernel {
 //public:
-//    CudaCalcCustomNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcCustomNonbondedForceKernel(name, platform),
-//            hasInitializedKernel(false), cl(cl), params(NULL), globals(NULL), tabulatedFunctionParams(NULL), system(system) {
+//    CudaCalcCustomNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcCustomNonbondedForceKernel(name, platform),
+//            hasInitializedKernel(false), cu(cu), params(NULL), globals(NULL), tabulatedFunctionParams(NULL), system(system) {
 //    }
 //    ~CudaCalcCustomNonbondedForceKernel();
 //    /**
@@ -598,9 +646,16 @@ namespace OpenMM {
 //     * @return the potential energy due to the force
 //     */
 //    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+//    /**
+//     * Copy changed parameters over to a context.
+//     *
+//     * @param context    the context to copy parameters to
+//     * @param force      the CustomNonbondedForce to copy the parameters from
+//     */
+//    void copyParametersToContext(ContextImpl& context, const CustomNonbondedForce& force);
 //private:
 //    bool hasInitializedKernel;
-//    CudaContext& cl;
+//    CudaContext& cu;
 //    CudaParameterSet* params;
 //    CudaArray<cl_float>* globals;
 //    CudaArray<mm_float4>* tabulatedFunctionParams;
@@ -615,7 +670,7 @@ namespace OpenMM {
 // */
 //class CudaCalcGBSAOBCForceKernel : public CalcGBSAOBCForceKernel {
 //public:
-//    CudaCalcGBSAOBCForceKernel(std::string name, const Platform& platform, CudaContext& cl) : CalcGBSAOBCForceKernel(name, platform), cl(cl),
+//    CudaCalcGBSAOBCForceKernel(std::string name, const Platform& platform, CudaContext& cu) : CalcGBSAOBCForceKernel(name, platform), cu(cu),
 //            hasCreatedKernels(false), params(NULL), bornSum(NULL), longBornSum(NULL), bornRadii(NULL), bornForce(NULL),
 //            longBornForce(NULL), obcChain(NULL) {
 //    }
@@ -636,11 +691,18 @@ namespace OpenMM {
 //     * @return the potential energy due to the force
 //     */
 //    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+//    /**
+//     * Copy changed parameters over to a context.
+//     *
+//     * @param context    the context to copy parameters to
+//     * @param force      the GBSAOBCForce to copy the parameters from
+//     */
+//    void copyParametersToContext(ContextImpl& context, const GBSAOBCForce& force);
 //private:
 //    double prefactor;
 //    bool hasCreatedKernels;
 //    int maxTiles;
-//    CudaContext& cl;
+//    CudaContext& cu;
 //    CudaArray<mm_float2>* params;
 //    CudaArray<cl_float>* bornSum;
 //    CudaArray<cl_long>* longBornSum;
@@ -648,10 +710,10 @@ namespace OpenMM {
 //    CudaArray<cl_float>* bornForce;
 //    CudaArray<cl_long>* longBornForce;
 //    CudaArray<cl_float>* obcChain;
-//    cl::Kernel computeBornSumKernel;
-//    cl::Kernel reduceBornSumKernel;
-//    cl::Kernel force1Kernel;
-//    cl::Kernel reduceBornForceKernel;
+//    CUfunction computeBornSumKernel;
+//    CUfunction reduceBornSumKernel;
+//    CUfunction force1Kernel;
+//    CUfunction reduceBornForceKernel;
 //};
 //
 ///**
@@ -659,8 +721,8 @@ namespace OpenMM {
 // */
 //class CudaCalcCustomGBForceKernel : public CalcCustomGBForceKernel {
 //public:
-//    CudaCalcCustomGBForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcCustomGBForceKernel(name, platform),
-//            hasInitializedKernels(false), cl(cl), params(NULL), computedValues(NULL), energyDerivs(NULL), longEnergyDerivs(NULL), globals(NULL),
+//    CudaCalcCustomGBForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcCustomGBForceKernel(name, platform),
+//            hasInitializedKernels(false), cu(cu), params(NULL), computedValues(NULL), energyDerivs(NULL), longEnergyDerivs(NULL), globals(NULL),
 //            valueBuffers(NULL), longValueBuffers(NULL), tabulatedFunctionParams(NULL), system(system) {
 //    }
 //    ~CudaCalcCustomGBForceKernel();
@@ -680,10 +742,17 @@ namespace OpenMM {
 //     * @return the potential energy due to the force
 //     */
 //    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+//    /**
+//     * Copy changed parameters over to a context.
+//     *
+//     * @param context    the context to copy parameters to
+//     * @param force      the CustomGBForce to copy the parameters from
+//     */
+//    void copyParametersToContext(ContextImpl& context, const CustomGBForce& force);
 //private:
 //    bool hasInitializedKernels, needParameterGradient;
 //    int maxTiles, numComputedValues;
-//    CudaContext& cl;
+//    CudaContext& cu;
 //    CudaParameterSet* params;
 //    CudaParameterSet* computedValues;
 //    CudaParameterSet* energyDerivs;
@@ -697,7 +766,7 @@ namespace OpenMM {
 //    std::vector<CudaArray<mm_float4>*> tabulatedFunctions;
 //    std::vector<bool> pairValueUsesParam, pairEnergyUsesParam, pairEnergyUsesValue;
 //    System& system;
-//    cl::Kernel pairValueKernel, perParticleValueKernel, pairEnergyKernel, perParticleEnergyKernel, gradientChainRuleKernel;
+//    CUfunction pairValueKernel, perParticleValueKernel, pairEnergyKernel, perParticleEnergyKernel, gradientChainRuleKernel;
 //};
 //
 ///**
@@ -705,8 +774,8 @@ namespace OpenMM {
 // */
 //class CudaCalcCustomExternalForceKernel : public CalcCustomExternalForceKernel {
 //public:
-//    CudaCalcCustomExternalForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcCustomExternalForceKernel(name, platform),
-//            hasInitializedKernel(false), cl(cl), system(system), params(NULL), globals(NULL) {
+//    CudaCalcCustomExternalForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcCustomExternalForceKernel(name, platform),
+//            hasInitializedKernel(false), cu(cu), system(system), params(NULL), globals(NULL) {
 //    }
 //    ~CudaCalcCustomExternalForceKernel();
 //    /**
@@ -725,10 +794,17 @@ namespace OpenMM {
 //     * @return the potential energy due to the force
 //     */
 //    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+//    /**
+//     * Copy changed parameters over to a context.
+//     *
+//     * @param context    the context to copy parameters to
+//     * @param force      the CustomExternalForce to copy the parameters from
+//     */
+//    void copyParametersToContext(ContextImpl& context, const CustomExternalForce& force);
 //private:
 //    int numParticles;
 //    bool hasInitializedKernel;
-//    CudaContext& cl;
+//    CudaContext& cu;
 //    System& system;
 //    CudaParameterSet* params;
 //    CudaArray<cl_float>* globals;
@@ -741,8 +817,8 @@ namespace OpenMM {
 // */
 //class CudaCalcCustomHbondForceKernel : public CalcCustomHbondForceKernel {
 //public:
-//    CudaCalcCustomHbondForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcCustomHbondForceKernel(name, platform),
-//            hasInitializedKernel(false), cl(cl), donorParams(NULL), acceptorParams(NULL), donors(NULL), acceptors(NULL),
+//    CudaCalcCustomHbondForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcCustomHbondForceKernel(name, platform),
+//            hasInitializedKernel(false), cu(cu), donorParams(NULL), acceptorParams(NULL), donors(NULL), acceptors(NULL),
 //            donorBufferIndices(NULL), acceptorBufferIndices(NULL), globals(NULL), donorExclusions(NULL), acceptorExclusions(NULL),
 //            tabulatedFunctionParams(NULL), system(system) {
 //    }
@@ -763,10 +839,17 @@ namespace OpenMM {
 //     * @return the potential energy due to the force
 //     */
 //    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+//    /**
+//     * Copy changed parameters over to a context.
+//     *
+//     * @param context    the context to copy parameters to
+//     * @param force      the CustomHbondForce to copy the parameters from
+//     */
+//    void copyParametersToContext(ContextImpl& context, const CustomHbondForce& force);
 //private:
 //    int numDonors, numAcceptors;
 //    bool hasInitializedKernel;
-//    CudaContext& cl;
+//    CudaContext& cu;
 //    CudaParameterSet* donorParams;
 //    CudaParameterSet* acceptorParams;
 //    CudaArray<cl_float>* globals;
@@ -781,7 +864,7 @@ namespace OpenMM {
 //    std::vector<cl_float> globalParamValues;
 //    std::vector<CudaArray<mm_float4>*> tabulatedFunctions;
 //    System& system;
-//    cl::Kernel donorKernel, acceptorKernel;
+//    CUfunction donorKernel, acceptorKernel;
 //};
 //
 ///**
@@ -789,8 +872,8 @@ namespace OpenMM {
 // */
 //class CudaCalcCustomCompoundBondForceKernel : public CalcCustomCompoundBondForceKernel {
 //public:
-//    CudaCalcCustomCompoundBondForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcCustomCompoundBondForceKernel(name, platform),
-//            cl(cl), params(NULL), globals(NULL), tabulatedFunctionParams(NULL), system(system) {
+//    CudaCalcCustomCompoundBondForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcCustomCompoundBondForceKernel(name, platform),
+//            cu(cu), params(NULL), globals(NULL), tabulatedFunctionParams(NULL), system(system) {
 //    }
 //    ~CudaCalcCustomCompoundBondForceKernel();
 //    /**
@@ -809,9 +892,17 @@ namespace OpenMM {
 //     * @return the potential energy due to the force
 //     */
 //    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+//    /**
+//     * Copy changed parameters over to a context.
+//     *
+//     * @param context    the context to copy parameters to
+//     * @param force      the CustomCompoundBondForce to copy the parameters from
+//     */
+//    void copyParametersToContext(ContextImpl& context, const CustomCompoundBondForce& force);
+//
 //private:
 //    int numBonds;
-//    CudaContext& cl;
+//    CudaContext& cu;
 //    CudaParameterSet* params;
 //    CudaArray<cl_float>* globals;
 //    CudaArray<mm_float4>* tabulatedFunctionParams;
@@ -820,43 +911,41 @@ namespace OpenMM {
 //    std::vector<CudaArray<mm_float4>*> tabulatedFunctions;
 //    System& system;
 //};
-//
-///**
-// * This kernel is invoked by VerletIntegrator to take one time step.
-// */
-//class CudaIntegrateVerletStepKernel : public IntegrateVerletStepKernel {
-//public:
-//    CudaIntegrateVerletStepKernel(std::string name, const Platform& platform, CudaContext& cl) : IntegrateVerletStepKernel(name, platform), cl(cl),
-//            hasInitializedKernels(false) {
-//    }
-//    ~CudaIntegrateVerletStepKernel();
-//    /**
-//     * Initialize the kernel.
-//     *
-//     * @param system     the System this kernel will be applied to
-//     * @param integrator the VerletIntegrator this kernel will be used for
-//     */
-//    void initialize(const System& system, const VerletIntegrator& integrator);
-//    /**
-//     * Execute the kernel.
-//     *
-//     * @param context    the context in which to execute this kernel
-//     * @param integrator the VerletIntegrator this kernel is being used for
-//     */
-//    void execute(ContextImpl& context, const VerletIntegrator& integrator);
-//private:
-//    CudaContext& cl;
-//    double prevStepSize;
-//    bool hasInitializedKernels;
-//    cl::Kernel kernel1, kernel2;
-//};
-//
+
+/**
+ * This kernel is invoked by VerletIntegrator to take one time step.
+ */
+class CudaIntegrateVerletStepKernel : public IntegrateVerletStepKernel {
+public:
+    CudaIntegrateVerletStepKernel(std::string name, const Platform& platform, CudaContext& cu) : IntegrateVerletStepKernel(name, platform), cu(cu) {
+    }
+    ~CudaIntegrateVerletStepKernel();
+    /**
+     * Initialize the kernel.
+     *
+     * @param system     the System this kernel will be applied to
+     * @param integrator the VerletIntegrator this kernel will be used for
+     */
+    void initialize(const System& system, const VerletIntegrator& integrator);
+    /**
+     * Execute the kernel.
+     *
+     * @param context    the context in which to execute this kernel
+     * @param integrator the VerletIntegrator this kernel is being used for
+     */
+    void execute(ContextImpl& context, const VerletIntegrator& integrator);
+private:
+    CudaContext& cu;
+    double prevStepSize;
+    CUfunction kernel1, kernel2;
+};
+
 ///**
 // * This kernel is invoked by LangevinIntegrator to take one time step.
 // */
 //class CudaIntegrateLangevinStepKernel : public IntegrateLangevinStepKernel {
 //public:
-//    CudaIntegrateLangevinStepKernel(std::string name, const Platform& platform, CudaContext& cl) : IntegrateLangevinStepKernel(name, platform), cl(cl),
+//    CudaIntegrateLangevinStepKernel(std::string name, const Platform& platform, CudaContext& cu) : IntegrateLangevinStepKernel(name, platform), cu(cu),
 //            hasInitializedKernels(false), params(NULL) {
 //    }
 //    ~CudaIntegrateLangevinStepKernel();
@@ -875,11 +964,11 @@ namespace OpenMM {
 //     */
 //    void execute(ContextImpl& context, const LangevinIntegrator& integrator);
 //private:
-//    CudaContext& cl;
+//    CudaContext& cu;
 //    double prevTemp, prevFriction, prevStepSize;
 //    bool hasInitializedKernels;
 //    CudaArray<cl_float>* params;
-//    cl::Kernel kernel1, kernel2;
+//    CUfunction kernel1, kernel2;
 //};
 //
 ///**
@@ -887,7 +976,7 @@ namespace OpenMM {
 // */
 //class CudaIntegrateBrownianStepKernel : public IntegrateBrownianStepKernel {
 //public:
-//    CudaIntegrateBrownianStepKernel(std::string name, const Platform& platform, CudaContext& cl) : IntegrateBrownianStepKernel(name, platform), cl(cl),
+//    CudaIntegrateBrownianStepKernel(std::string name, const Platform& platform, CudaContext& cu) : IntegrateBrownianStepKernel(name, platform), cu(cu),
 //            hasInitializedKernels(false), prevTemp(-1), prevFriction(-1), prevStepSize(-1) {
 //    }
 //    ~CudaIntegrateBrownianStepKernel();
@@ -906,10 +995,10 @@ namespace OpenMM {
 //     */
 //    void execute(ContextImpl& context, const BrownianIntegrator& integrator);
 //private:
-//    CudaContext& cl;
+//    CudaContext& cu;
 //    double prevTemp, prevFriction, prevStepSize;
 //    bool hasInitializedKernels;
-//    cl::Kernel kernel1, kernel2;
+//    CUfunction kernel1, kernel2;
 //};
 //
 ///**
@@ -917,7 +1006,7 @@ namespace OpenMM {
 // */
 //class CudaIntegrateVariableVerletStepKernel : public IntegrateVariableVerletStepKernel {
 //public:
-//    CudaIntegrateVariableVerletStepKernel(std::string name, const Platform& platform, CudaContext& cl) : IntegrateVariableVerletStepKernel(name, platform), cl(cl),
+//    CudaIntegrateVariableVerletStepKernel(std::string name, const Platform& platform, CudaContext& cu) : IntegrateVariableVerletStepKernel(name, platform), cu(cu),
 //            hasInitializedKernels(false) {
 //    }
 //    ~CudaIntegrateVariableVerletStepKernel();
@@ -938,10 +1027,10 @@ namespace OpenMM {
 //     */
 //    double execute(ContextImpl& context, const VariableVerletIntegrator& integrator, double maxTime);
 //private:
-//    CudaContext& cl;
+//    CudaContext& cu;
 //    bool hasInitializedKernels;
 //    int blockSize;
-//    cl::Kernel kernel1, kernel2, selectSizeKernel;
+//    CUfunction kernel1, kernel2, selectSizeKernel;
 //};
 //
 ///**
@@ -949,7 +1038,7 @@ namespace OpenMM {
 // */
 //class CudaIntegrateVariableLangevinStepKernel : public IntegrateVariableLangevinStepKernel {
 //public:
-//    CudaIntegrateVariableLangevinStepKernel(std::string name, const Platform& platform, CudaContext& cl) : IntegrateVariableLangevinStepKernel(name, platform), cl(cl),
+//    CudaIntegrateVariableLangevinStepKernel(std::string name, const Platform& platform, CudaContext& cu) : IntegrateVariableLangevinStepKernel(name, platform), cu(cu),
 //            hasInitializedKernels(false), params(NULL) {
 //    }
 //    ~CudaIntegrateVariableLangevinStepKernel();
@@ -970,11 +1059,11 @@ namespace OpenMM {
 //     */
 //    double execute(ContextImpl& context, const VariableLangevinIntegrator& integrator, double maxTime);
 //private:
-//    CudaContext& cl;
+//    CudaContext& cu;
 //    bool hasInitializedKernels;
 //    int blockSize;
 //    CudaArray<cl_float>* params;
-//    cl::Kernel kernel1, kernel2, selectSizeKernel;
+//    CUfunction kernel1, kernel2, selectSizeKernel;
 //    double prevTemp, prevFriction, prevErrorTol;
 //};
 //
@@ -983,7 +1072,7 @@ namespace OpenMM {
 // */
 //class CudaIntegrateCustomStepKernel : public IntegrateCustomStepKernel {
 //public:
-//    CudaIntegrateCustomStepKernel(std::string name, const Platform& platform, CudaContext& cl) : IntegrateCustomStepKernel(name, platform), cl(cl),
+//    CudaIntegrateCustomStepKernel(std::string name, const Platform& platform, CudaContext& cu) : IntegrateCustomStepKernel(name, platform), cu(cu),
 //            hasInitializedKernels(false), localValuesAreCurrent(false), globalValues(NULL), contextParameterValues(NULL), sumBuffer(NULL), energy(NULL),
 //            uniformRandoms(NULL), randomSeed(NULL), perDofValues(NULL) {
 //    }
@@ -1041,7 +1130,7 @@ namespace OpenMM {
 //    std::string createGlobalComputation(const std::string& variable, const Lepton::ParsedExpression& expr, CustomIntegrator& integrator, const std::string& energyName);
 //    std::string createPerDofComputation(const std::string& variable, const Lepton::ParsedExpression& expr, int component, CustomIntegrator& integrator, const std::string& forceName, const std::string& energyName);
 //    void recordChangedParameters(ContextImpl& context);
-//    CudaContext& cl;
+//    CudaContext& cu;
 //    double prevStepSize;
 //    int numGlobalVariables;
 //    bool hasInitializedKernels, deviceValuesAreCurrent, modifiesParameters;
@@ -1054,8 +1143,8 @@ namespace OpenMM {
 //    CudaArray<mm_int4>* randomSeed;
 //    CudaParameterSet* perDofValues;
 //    mutable std::vector<std::vector<cl_float> > localPerDofValues;
-//    std::vector<std::vector<cl::Kernel> > kernels;
-//    cl::Kernel sumEnergyKernel, randomKernel;
+//    std::vector<std::vector<CUfunction> > kernels;
+//    CUfunction sumEnergyKernel, randomKernel;
 //    std::vector<CustomIntegrator::ComputationType> stepType;
 //    std::vector<bool> needsForces;
 //    std::vector<bool> needsEnergy;
@@ -1072,7 +1161,7 @@ namespace OpenMM {
 // */
 //class CudaApplyAndersenThermostatKernel : public ApplyAndersenThermostatKernel {
 //public:
-//    CudaApplyAndersenThermostatKernel(std::string name, const Platform& platform, CudaContext& cl) : ApplyAndersenThermostatKernel(name, platform), cl(cl),
+//    CudaApplyAndersenThermostatKernel(std::string name, const Platform& platform, CudaContext& cu) : ApplyAndersenThermostatKernel(name, platform), cu(cu),
 //            hasInitializedKernels(false), atomGroups(NULL) {
 //    }
 //    ~CudaApplyAndersenThermostatKernel();
@@ -1090,11 +1179,11 @@ namespace OpenMM {
 //     */
 //    void execute(ContextImpl& context);
 //private:
-//    CudaContext& cl;
+//    CudaContext& cu;
 //    bool hasInitializedKernels;
 //    int randomSeed;
 //    CudaArray<cl_int>* atomGroups;
-//    cl::Kernel kernel;
+//    CUfunction kernel;
 //};
 //
 ///**
@@ -1102,7 +1191,7 @@ namespace OpenMM {
 // */
 //class CudaApplyMonteCarloBarostatKernel : public ApplyMonteCarloBarostatKernel {
 //public:
-//    CudaApplyMonteCarloBarostatKernel(std::string name, const Platform& platform, CudaContext& cl) : ApplyMonteCarloBarostatKernel(name, platform), cl(cl),
+//    CudaApplyMonteCarloBarostatKernel(std::string name, const Platform& platform, CudaContext& cu) : ApplyMonteCarloBarostatKernel(name, platform), cu(cu),
 //            hasInitializedKernels(false), savedPositions(NULL), moleculeAtoms(NULL), moleculeStartIndex(NULL) {
 //    }
 //    ~CudaApplyMonteCarloBarostatKernel();
@@ -1131,45 +1220,45 @@ namespace OpenMM {
 //     */
 //    void restoreCoordinates(ContextImpl& context);
 //private:
-//    CudaContext& cl;
+//    CudaContext& cu;
 //    bool hasInitializedKernels;
 //    int numMolecules;
 //    CudaArray<mm_float4>* savedPositions;
 //    CudaArray<cl_int>* moleculeAtoms;
 //    CudaArray<cl_int>* moleculeStartIndex;
-//    cl::Kernel kernel;
+//    CUfunction kernel;
 //};
-//
-///**
-// * This kernel is invoked to calculate the kinetic energy of the system.
-// */
-//class CudaCalcKineticEnergyKernel : public CalcKineticEnergyKernel {
-//public:
-//    CudaCalcKineticEnergyKernel(std::string name, const Platform& platform, CudaContext& cl) : CalcKineticEnergyKernel(name, platform), cl(cl) {
-//    }
-//    /**
-//     * Initialize the kernel.
-//     *
-//     * @param system     the System this kernel will be applied to
-//     */
-//    void initialize(const System& system);
-//    /**
-//     * Execute the kernel.
-//     *
-//     * @param context    the context in which to execute this kernel
-//     */
-//    double execute(ContextImpl& context);
-//private:
-//    CudaContext& cl;
-//    std::vector<double> masses;
-//};
-//
+
+/**
+ * This kernel is invoked to calculate the kinetic energy of the system.
+ */
+class CudaCalcKineticEnergyKernel : public CalcKineticEnergyKernel {
+public:
+    CudaCalcKineticEnergyKernel(std::string name, const Platform& platform, CudaContext& cu) : CalcKineticEnergyKernel(name, platform), cu(cu) {
+    }
+    /**
+     * Initialize the kernel.
+     *
+     * @param system     the System this kernel will be applied to
+     */
+    void initialize(const System& system);
+    /**
+     * Execute the kernel.
+     *
+     * @param context    the context in which to execute this kernel
+     */
+    double execute(ContextImpl& context);
+private:
+    CudaContext& cu;
+    std::vector<double> masses;
+};
+
 ///**
 // * This kernel is invoked to remove center of mass motion from the system.
 // */
 //class CudaRemoveCMMotionKernel : public RemoveCMMotionKernel {
 //public:
-//    CudaRemoveCMMotionKernel(std::string name, const Platform& platform, CudaContext& cl) : RemoveCMMotionKernel(name, platform), cl(cl), cmMomentum(NULL) {
+//    CudaRemoveCMMotionKernel(std::string name, const Platform& platform, CudaContext& cu) : RemoveCMMotionKernel(name, platform), cu(cu), cmMomentum(NULL) {
 //    }
 //    ~CudaRemoveCMMotionKernel();
 //    /**
@@ -1186,12 +1275,13 @@ namespace OpenMM {
 //     */
 //    void execute(ContextImpl& context);
 //private:
-//    CudaContext& cl;
+//    CudaContext& cu;
 //    int frequency;
 //    CudaArray<mm_float4>* cmMomentum;
-//    cl::Kernel kernel1, kernel2;
+//    CUfunction kernel1, kernel2;
 //};

 } // namespace OpenMM

 #endif /*OPENMM_CUDAKERNELS_H_*/
+
--- a/platforms/cuda2/src/kernels/bondForce.cu
+++ b/platforms/cuda2/src/kernels/bondForce.cu
+real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
+real r = SQRT(delta.x*delta.x + delta.y*delta.y + delta.z*delta.z);
+COMPUTE_FORCE
+dEdR = (r > 0) ? (dEdR / r) : 0;
+delta *= dEdR;
+real3 force1 = delta;
+real3 force2 = -delta;
--- a/platforms/cuda2/src/kernels/harmonicBondForce.cu
+++ b/platforms/cuda2/src/kernels/harmonicBondForce.cu
+float2 bondParams = PARAMS[index];
+real deltaIdeal = r-bondParams.x;
+energy += 0.5f * bondParams.y*deltaIdeal*deltaIdeal;
+real dEdR = bondParams.y * deltaIdeal;
--- a/platforms/cuda2/src/kernels/vectorOps.cu
+++ b/platforms/cuda2/src/kernels/vectorOps.cu
@@ -42,7 +42,7 @@ inline __device__ double4 make_double4(double a) {

 // Negate a vector.

-inline __device__ int2 operator*(int2 a) {
+inline __device__ int2 operator-(int2 a) {
    return make_int2(-a.x, -a.y);
 }

@@ -455,3 +455,41 @@ inline __device__ double3 operator*(double a, double3 b) {
 inline __device__ double4 operator*(double a, double4 b) {
    return make_double4(a*b.x, a*b.y, a*b.z, a*b.w);
 }
+
+// *= operator (multiply vector by constant)
+
+inline __device__ void operator*=(int2& a, int b) {
+    a.x *= b; a.y *= b;
+}
+
+inline __device__ void operator*=(int3& a, int b) {
+    a.x *= b; a.y *= b; a.z *= b;
+}
+
+inline __device__ void operator*=(int4& a, int b) {
+    a.x *= b; a.y *= b; a.z *= b; a.w *= b;
+}
+
+inline __device__ void operator*=(float2& a, float b) {
+    a.x *= b; a.y *= b;
+}
+
+inline __device__ void operator*=(float3& a, float b) {
+    a.x *= b; a.y *= b; a.z *= b;
+}
+
+inline __device__ void operator*=(float4& a, float b) {
+    a.x *= b; a.y *= b; a.z *= b; a.w *= b;
+}
+
+inline __device__ void operator*=(double2& a, double b) {
+    a.x *= b; a.y *= b;
+}
+
+inline __device__ void operator*=(double3& a, double b) {
+    a.x *= b; a.y *= b; a.z *= b;
+}
+
+inline __device__ void operator*=(double4& a, double b) {
+    a.x *= b; a.y *= b; a.z *= b; a.w *= b;
+}
--- a/platforms/cuda2/src/kernels/verlet.cu
+++ b/platforms/cuda2/src/kernels/verlet.cu
+/**
+ * Perform the first step of Verlet integration.
+ */
+
+extern "C" __global__ void integrateVerletPart1(const real2* __restrict__ dt, const real4* __restrict__ posq, real4* __restrict__ velm, const long long* __restrict__ force, real4* __restrict__ posDelta) {
+    const real2 stepSize = dt[0];
+    const real dtPos = stepSize.y;
+    const real dtVel = 0.5f*(stepSize.x+stepSize.y);
+    const real scale = dtVel/(real) 0xFFFFFFFF;
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
+        real4 velocity = velm[index];
+        if (velocity.w != 0.0) {
+            real4 pos = posq[index];
+            velocity.x += scale*force[index]*velocity.w;
+            velocity.y += scale*force[index+PADDED_NUM_ATOMS]*velocity.w;
+            velocity.z += scale*force[index+PADDED_NUM_ATOMS*2]*velocity.w;
+            pos.x = velocity.x*dtPos;
+            pos.y = velocity.y*dtPos;
+            pos.z = velocity.z*dtPos;
+            posDelta[index] = pos;
+            velm[index] = velocity;
+        }
+    }
+}
+
+/**
+ * Perform the second step of Verlet integration.
+ */
+
+extern "C" __global__ void integrateVerletPart2(real2* __restrict__ dt, real4* __restrict__ posq, real4* __restrict__ velm, const real4* __restrict__ posDelta) {
+    real2 stepSize = dt[0];
+    double oneOverDt = 1.0/stepSize.y;
+    int index = blockIdx.x*blockDim.x+threadIdx.x;
+    if (index == 0)
+        dt[0].x = stepSize.y;
+    for (; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
+        real4 velocity = velm[index];
+        if (velocity.w != 0.0) {
+            real4 pos = posq[index];
+            real4 delta = posDelta[index];
+            pos.x += delta.x;
+            pos.y += delta.y;
+            pos.z += delta.z;
+            velocity = make_real4((real) (delta.x*oneOverDt), (real) (delta.y*oneOverDt), (real) (delta.z*oneOverDt), velocity.w);
+            posq[index] = pos;
+            velm[index] = velocity;
+        }
+    }
+}
+
+/**
+ * Select the step size to use for the next step.
+ */
+//
+//extern "C" __global__ void selectVerletStepSize(real maxStepSize, real errorTol, real2* __restrict__ dt, const real4* __restrict__ velm, const real4* __restrict__ force, __local real* __restrict__ error) {
+//    // Calculate the error.
+//
+//    real err = 0.0f;
+//    for (int index = threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
+//        real4 f = force[index];
+//        real invMass = velm[index].w;
+//        err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass;
+//    }
+//    error[threadIdx.x] = err;
+//    __syncthreads;
+//
+//    // Sum the errors from all threads.
+//
+//    for (unsigned int offset = 1; offset < get_local_size(0); offset *= 2) {
+//        if (threadIdx.x+offset < get_local_size(0) && (threadIdx.x&(2*offset-1)) == 0)
+//            error[threadIdx.x] += error[threadIdx.x+offset];
+//        __syncthreads;
+//    }
+//    if (threadIdx.x == 0) {
+//        real totalError = sqrt(error[0]/(NUM_ATOMS*3));
+//        real newStepSize = sqrt(errorTol/totalError);
+//        real oldStepSize = dt[0].y;
+//        if (oldStepSize > 0.0f)
+//            newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase.
+//        if (newStepSize > oldStepSize && newStepSize < 1.1f*oldStepSize)
+//            newStepSize = oldStepSize; // Keeping dt constant between steps improves the behavior of the integrator.
+//        if (newStepSize > maxStepSize)
+//            newStepSize = maxStepSize;
+//        dt[0].y = newStepSize;
+//    }
+//}
--- a/platforms/cuda2/tests/TestCudaHarmonicBondForce.cpp
+++ b/platforms/cuda2/tests/TestCudaHarmonicBondForce.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+/**
+ * This tests the CUDA implementation of HarmonicBondForce.
+ */
+
+#include "openmm/internal/AssertionUtilities.h"
+#include "openmm/Context.h"
+#include "CudaPlatform.h"
+#include "openmm/HarmonicBondForce.h"
+#include "openmm/System.h"
+#include "openmm/VerletIntegrator.h"
+#include <iostream>
+#include <map>
+#include <vector>
+
+using namespace OpenMM;
+using namespace std;
+
+const double TOL = 1e-5;
+
+void testBonds() {
+    CudaPlatform platform;
+    System system;
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    VerletIntegrator integrator(0.01);
+    HarmonicBondForce* forceField = new HarmonicBondForce();
+    forceField->addBond(0, 1, 1.5, 0.8);
+    forceField->addBond(1, 2, 1.2, 0.7);
+    system.addForce(forceField);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(3);
+    positions[0] = Vec3(0, 2, 0);
+    positions[1] = Vec3(0, 0, 0);
+    positions[2] = Vec3(1, 0, 0);
+    context.setPositions(positions);
+    State state = context.getState(State::Forces | State::Energy);
+    {
+        const vector<Vec3>& forces = state.getForces();
+        ASSERT_EQUAL_VEC(Vec3(0, -0.8*0.5, 0), forces[0], TOL);
+        ASSERT_EQUAL_VEC(Vec3(0.7*0.2, 0, 0), forces[2], TOL);
+        ASSERT_EQUAL_VEC(Vec3(-forces[0][0]-forces[2][0], -forces[0][1]-forces[2][1], -forces[0][2]-forces[2][2]), forces[1], TOL);
+        ASSERT_EQUAL_TOL(0.5*0.8*0.5*0.5 + 0.5*0.7*0.2*0.2, state.getPotentialEnergy(), TOL);
+    }
+    
+    // Try changing the bond parameters and make sure it's still correct.
+    
+    forceField->setBondParameters(0, 0, 1, 1.6, 0.9);
+    forceField->setBondParameters(1, 1, 2, 1.3, 0.8);
+    forceField->updateParametersInContext(context);
+    state = context.getState(State::Forces | State::Energy);
+    {
+        const vector<Vec3>& forces = state.getForces();
+        ASSERT_EQUAL_VEC(Vec3(0, -0.9*0.4, 0), forces[0], TOL);
+        ASSERT_EQUAL_VEC(Vec3(0.8*0.3, 0, 0), forces[2], TOL);
+        ASSERT_EQUAL_VEC(Vec3(-forces[0][0]-forces[2][0], -forces[0][1]-forces[2][1], -forces[0][2]-forces[2][2]), forces[1], TOL);
+        ASSERT_EQUAL_TOL(0.5*0.9*0.4*0.4 + 0.5*0.8*0.3*0.3, state.getPotentialEnergy(), TOL);
+    }
+}
+
+void testParallelComputation() {
+    CudaPlatform platform;
+    System system;
+    const int numParticles = 200;
+    for (int i = 0; i < numParticles; i++)
+        system.addParticle(1.0);
+    HarmonicBondForce* force = new HarmonicBondForce();
+    for (int i = 1; i < numParticles; i++)
+        force->addBond(i-1, i, 1.1, i);
+    system.addForce(force);
+    vector<Vec3> positions(numParticles);
+    for (int i = 0; i < numParticles; i++)
+        positions[i] = Vec3(i, 0, 0);
+    VerletIntegrator integrator1(0.01);
+    Context context1(system, integrator1, platform);
+    context1.setPositions(positions);
+    State state1 = context1.getState(State::Forces | State::Energy);
+    VerletIntegrator integrator2(0.01);
+    string deviceIndex = platform.getPropertyValue(context1, CudaPlatform::CudaDeviceIndex());
+    map<string, string> props;
+    props[CudaPlatform::CudaDeviceIndex()] = deviceIndex+","+deviceIndex;
+    Context context2(system, integrator2, platform, props);
+    context2.setPositions(positions);
+    State state2 = context2.getState(State::Forces | State::Energy);
+    ASSERT_EQUAL_TOL(state1.getPotentialEnergy(), state2.getPotentialEnergy(), 1e-5);
+    for (int i = 0; i < numParticles; i++)
+        ASSERT_EQUAL_VEC(state1.getForces()[i], state2.getForces()[i], 1e-5);
+}
+
+int main() {
+    try {
+        testBonds();
+//        testParallelComputation();
+    }
+    catch(const exception& e) {
+        cout << "exception: " << e.what() << endl;
+        return 1;
+    }
+    cout << "Done" << endl;
+    return 0;
+}
+
--- a/platforms/cuda2/tests/TestCudaVerletIntegrator.cpp
+++ b/platforms/cuda2/tests/TestCudaVerletIntegrator.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+/**
+ * This tests the CUDA implementation of VerletIntegrator.
+ */
+
+#include "openmm/internal/AssertionUtilities.h"
+#include "openmm/Context.h"
+#include "CudaPlatform.h"
+#include "openmm/HarmonicBondForce.h"
+#include "openmm/NonbondedForce.h"
+#include "openmm/System.h"
+#include "openmm/VerletIntegrator.h"
+#include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
+#include "sfmt/SFMT.h"
+#include <iostream>
+#include <vector>
+
+using namespace OpenMM;
+using namespace std;
+
+const double TOL = 1e-5;
+
+/**
+ * Compute the energy of a state, taking into account the half step offset between
+ * positions and velocities.
+ */
+
+static double computeEnergy(const State& state, const System& system, double dt) {
+    const vector<Vec3>& v = state.getVelocities();
+    const vector<Vec3>& f = state.getForces();
+    double energy = 0.0;
+    for (int i = 0; i < system.getNumParticles(); i++) {
+        double m = system.getParticleMass(i);
+        Vec3 vel = v[i]+f[i]*(0.5*dt/m);
+        energy += 0.5*m*vel.dot(vel);
+    }
+    return energy+state.getPotentialEnergy();
+}
+
+void testSingleBond() {
+    CudaPlatform platform;
+    System system;
+    system.addParticle(2.0);
+    system.addParticle(2.0);
+    VerletIntegrator integrator(0.01);
+    HarmonicBondForce* forceField = new HarmonicBondForce();
+    forceField->addBond(0, 1, 1.5, 1);
+    system.addForce(forceField);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(2);
+    positions[0] = Vec3(-1, 0, 0);
+    positions[1] = Vec3(1, 0, 0);
+    context.setPositions(positions);
+
+    // This is simply a harmonic oscillator, so compare it to the analytical solution.
+
+    const double freq = 1.0;;
+    State state = context.getState(State::Energy);
+    const double initialEnergy = state.getKineticEnergy()+state.getPotentialEnergy();
+    for (int i = 0; i < 1000; ++i) {
+        state = context.getState(State::Positions | State::Velocities | State::Energy);
+        double time = state.getTime();
+        double expectedDist = 1.5+0.5*std::cos(freq*time);
+        ASSERT_EQUAL_VEC(Vec3(-0.5*expectedDist, 0, 0), state.getPositions()[0], 0.02);
+        ASSERT_EQUAL_VEC(Vec3(0.5*expectedDist, 0, 0), state.getPositions()[1], 0.02);
+        double expectedSpeed = -0.5*freq*std::sin(freq*time);
+        ASSERT_EQUAL_VEC(Vec3(-0.5*expectedSpeed, 0, 0), state.getVelocities()[0], 0.02);
+        ASSERT_EQUAL_VEC(Vec3(0.5*expectedSpeed, 0, 0), state.getVelocities()[1], 0.02);
+        double energy = state.getKineticEnergy()+state.getPotentialEnergy();
+        ASSERT_EQUAL_TOL(initialEnergy, energy, 0.01);
+        integrator.step(1);
+    }
+    ASSERT_EQUAL_TOL(10.0, context.getState(0).getTime(), 1e-5);
+}
+
+void testConstraints() {
+    const int numParticles = 8;
+    const int numConstraints = 5;
+    const double temp = 100.0;
+    CudaPlatform platform;
+    System system;
+    VerletIntegrator integrator(0.001);
+    integrator.setConstraintTolerance(1e-5);
+    NonbondedForce* forceField = new NonbondedForce();
+    for (int i = 0; i < numParticles; ++i) {
+        system.addParticle(10.0);
+        forceField->addParticle((i%2 == 0 ? 0.2 : -0.2), 0.5, 5.0);
+    }
+    system.addConstraint(0, 1, 1.0);
+    system.addConstraint(1, 2, 1.0);
+    system.addConstraint(2, 3, 1.0);
+    system.addConstraint(4, 5, 1.0);
+    system.addConstraint(6, 7, 1.0);
+    system.addForce(forceField);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(numParticles);
+    vector<Vec3> velocities(numParticles);
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+
+    for (int i = 0; i < numParticles; ++i) {
+        positions[i] = Vec3(i/2, (i+1)/2, 0);
+        velocities[i] = Vec3(genrand_real2(sfmt)-0.5, genrand_real2(sfmt)-0.5, genrand_real2(sfmt)-0.5);
+    }
+    context.setPositions(positions);
+    context.setVelocities(velocities);
+
+    // Simulate it and see whether the constraints remain satisfied.
+
+    double initialEnergy = 0.0;
+    for (int i = 0; i < 1000; ++i) {
+        State state = context.getState(State::Positions | State::Energy | State::Velocities | State::Forces);
+        for (int j = 0; j < numConstraints; ++j) {
+            int particle1, particle2;
+            double distance;
+            system.getConstraintParameters(j, particle1, particle2, distance);
+            Vec3 p1 = state.getPositions()[particle1];
+            Vec3 p2 = state.getPositions()[particle2];
+            double dist = std::sqrt((p1[0]-p2[0])*(p1[0]-p2[0])+(p1[1]-p2[1])*(p1[1]-p2[1])+(p1[2]-p2[2])*(p1[2]-p2[2]));
+            ASSERT_EQUAL_TOL(distance, dist, 1e-4);
+        }
+        double energy = computeEnergy(state, system, integrator.getStepSize());
+        if (i == 1)
+            initialEnergy = energy;
+        else if (i > 1)
+            ASSERT_EQUAL_TOL(initialEnergy, energy, 0.01);
+        integrator.step(1);
+    }
+}
+
+void testConstrainedClusters() {
+    const int numParticles = 7;
+    const double temp = 500.0;
+    CudaPlatform platform;
+    System system;
+    VerletIntegrator integrator(0.001);
+    integrator.setConstraintTolerance(1e-5);
+    NonbondedForce* forceField = new NonbondedForce();
+    for (int i = 0; i < numParticles; ++i) {
+        system.addParticle(i > 1 ? 1.0 : 10.0);
+        forceField->addParticle((i%2 == 0 ? 0.2 : -0.2), 0.5, 5.0);
+    }
+    system.addConstraint(0, 1, 1.0);
+    system.addConstraint(0, 2, 1.0);
+    system.addConstraint(0, 3, 1.0);
+    system.addConstraint(0, 4, 1.0);
+    system.addConstraint(1, 5, 1.0);
+    system.addConstraint(1, 6, 1.0);
+    system.addConstraint(2, 3, sqrt(2.0));
+    system.addConstraint(2, 4, sqrt(2.0));
+    system.addConstraint(3, 4, sqrt(2.0));
+    system.addConstraint(5, 6, sqrt(2.0));
+    system.addForce(forceField);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(numParticles);
+    positions[0] = Vec3(0, 0, 0);
+    positions[1] = Vec3(1, 0, 0);
+    positions[2] = Vec3(-1, 0, 0);
+    positions[3] = Vec3(0, 1, 0);
+    positions[4] = Vec3(0, 0, 1);
+    positions[5] = Vec3(2, 0, 0);
+    positions[6] = Vec3(1, 1, 0);
+    vector<Vec3> velocities(numParticles);
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+
+    for (int i = 0; i < numParticles; ++i)
+        velocities[i] = Vec3(genrand_real2(sfmt)-0.5, genrand_real2(sfmt)-0.5, genrand_real2(sfmt)-0.5);
+    context.setPositions(positions);
+    context.setVelocities(velocities);
+
+    // Simulate it and see whether the constraints remain satisfied.
+
+    double initialEnergy = 0.0;
+    for (int i = 0; i < 1000; ++i) {
+        State state = context.getState(State::Positions | State::Energy | State::Velocities | State::Forces);
+        for (int j = 0; j < system.getNumConstraints(); ++j) {
+            int particle1, particle2;
+            double distance;
+            system.getConstraintParameters(j, particle1, particle2, distance);
+            Vec3 p1 = state.getPositions()[particle1];
+            Vec3 p2 = state.getPositions()[particle2];
+            double dist = std::sqrt((p1[0]-p2[0])*(p1[0]-p2[0])+(p1[1]-p2[1])*(p1[1]-p2[1])+(p1[2]-p2[2])*(p1[2]-p2[2]));
+            ASSERT_EQUAL_TOL(distance, dist, 2e-5);
+        }
+        double energy = computeEnergy(state, system, integrator.getStepSize());
+        if (i == 1)
+            initialEnergy = energy;
+        else if (i > 1)
+            ASSERT_EQUAL_TOL(initialEnergy, energy, 0.01);
+        integrator.step(1);
+    }
+}
+
+int main() {
+    try {
+        testSingleBond();
+//        testConstraints();
+//        testConstrainedClusters();
+    }
+    catch(const exception& e) {
+        cout << "exception: " << e.what() << endl;
+        return 1;
+    }
+    cout << "Done" << endl;
+    return 0;
+}
+