Continuing to implement new CUDA platform: HarmonicBondForce and VerletIntegrator

6e434a02 · Peter Eastman · 1f0ec7b5 · 6e434a02 · 6e434a02 · 6e434a02
Commit 6e434a02 authored Jun 08, 2012 by Peter Eastman
16 changed files
--- a/platforms/cuda2/src/CudaArray.cpp
+++ b/platforms/cuda2/src/CudaArray.cpp
@@ -25,6 +25,7 @@
 * -------------------------------------------------------------------------- */
 #include "CudaArray.h"
+#include "CudaContext.h"
 #include <iostream>
 #include <sstream>
 #include <vector>
@@ -36,7 +37,7 @@ CudaArray::CudaArray(int size, int elementSize, const std::string& name) :
    CUresult result = cuMemAlloc(&pointer, size*elementSize);
    if (result != CUDA_SUCCESS) {
        std::stringstream str;
-        str<<"Error creating array "<<name<<": "<<result;
+        str<<"Error creating array "<<name<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";
        throw OpenMMException(str.str());
    }
 }
@@ -46,7 +47,7 @@ CudaArray::~CudaArray() {
        CUresult result = cuMemFree(pointer);
        if (result != CUDA_SUCCESS) {
            std::stringstream str;
-            str<<"Error deleting array "<<name<<": "<<result;
+            str<<"Error deleting array "<<name<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";
            throw OpenMMException(str.str());
        }
    }
@@ -60,7 +61,7 @@ void CudaArray::upload(void* data, bool blocking) {
        result = cuMemcpyHtoDAsync(pointer, data, size*elementSize, 0);
    if (result != CUDA_SUCCESS) {
        std::stringstream str;
-        str<<"Error uploading array "<<name<<": "<<result;
+        str<<"Error uploading array "<<name<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";
        throw OpenMMException(str.str());
    }
 }
@@ -73,7 +74,7 @@ void CudaArray::download(void* data, bool blocking) const {
        result = cuMemcpyDtoHAsync(data, pointer, size*elementSize, 0);
    if (result != CUDA_SUCCESS) {
        std::stringstream str;
-        str<<"Error downloading array "<<name<<": "<<result;
+        str<<"Error downloading array "<<name<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";
        throw OpenMMException(str.str());
    }
 }
--- a/platforms/cuda2/src/CudaBondedUtilities.cpp
+++ b/platforms/cuda2/src/CudaBondedUtilities.cpp
@@ -26,6 +26,7 @@
 #include "CudaBondedUtilities.h"
 #include "CudaExpressionUtilities.h"
+#include "CudaKernelSources.h"
 #include "openmm/OpenMMException.h"
 #include "CudaNonbondedUtilities.h"
 #include <iostream>
@@ -81,8 +82,8 @@ void CudaBondedUtilities::initialize(const System& system) {
                for (int atom = 0; atom < width; atom++)
                    indexVec[bond*width+atom] = forceAtoms[i][bond][startAtom+atom];
            }
-            CudaArray* indices = CudaArray::create<unsigned int>(indexVec.size(), "bondedIndices");
+            CudaArray* indices = new CudaArray(numBonds, 4*width, "bondedIndices");
-            indices->upload(indexVec);
+            indices->upload(&indexVec[0]);
            atomIndices[i].push_back(indices);
            startAtom += width;
        }
@@ -91,13 +92,14 @@ void CudaBondedUtilities::initialize(const System& system) {
    // Create the kernel.
    stringstream s;
+    s<<CudaKernelSources::vectorOps;
    for (int i = 0; i < (int) prefixCode.size(); i++)
        s<<prefixCode[i];
-    s<<"extern \"C\" __global__ void computeBondedForces(long* __restrict__ forceBuffer, real* __restrict__ energyBuffer, const real4* __restrict__ posq, int groups";
+    s<<"extern \"C\" __global__ void computeBondedForces(unsigned long long* __restrict__ forceBuffer, real* __restrict__ energyBuffer, const real4* __restrict__ posq, int groups";
    for (int force = 0; force < numForces; force++) {
        for (int i = 0; i < (int) atomIndices[force].size(); i++) {
            int indexWidth = atomIndices[force][i]->getElementSize()/4;
-            string indexType = "unsigned int"+(indexWidth == 1 ? "" : context.intToString(indexWidth));
+            string indexType = "uint"+context.intToString(indexWidth);
            s<<", const "<<indexType<<"* __restrict__ atomIndices"<<force<<"_"<<i;
        }
    }
@@ -129,10 +131,10 @@ string CudaBondedUtilities::createForceSource(int forceIndex, int numBonds, int
    for (int i = 0; i < (int) atomIndices[forceIndex].size(); i++) {
        int indexWidth = atomIndices[forceIndex][i]->getElementSize()/4;
        suffix = (indexWidth == 1 ? suffix1 : suffix4);
-        string indexType = "unsigned int"+(indexWidth == 1 ? "" : context.intToString(indexWidth));
+        string indexType = "uint"+context.intToString(indexWidth);
        s<<"    "<<indexType<<" atoms"<<i<<" = atomIndices"<<forceIndex<<"_"<<i<<"[index];\n";
-        s<<"    "<<indexType<<" buffers = bufferIndices"<<forceIndex<<"[index];\n";
+        int atomsToLoad = min(indexWidth, numAtoms-startAtom);
-        for (int j = 0; j < indexWidth; j++) {
+        for (int j = 0; j < atomsToLoad; j++) {
            s<<"    unsigned int atom"<<(startAtom+j+1)<<" = atoms"<<i<<suffix[j]<<";\n";
            s<<"    real4 pos"<<(j+1)<<" = posq[atom"<<(j+1)<<"];\n";
        }
@@ -140,34 +142,28 @@ string CudaBondedUtilities::createForceSource(int forceIndex, int numBonds, int
    }
    s<<computeForce<<"\n";
    for (int i = 0; i < numAtoms; i++) {
-        s<<"    atomicAdd(&forceBuffer[atom"<<(i+1)<<"], (long) (force.x*0xFFFFFFFF));\n";
+        s<<"    atomicAdd(&forceBuffer[atom"<<(i+1)<<"], static_cast<unsigned long long>((long long) (force"<<(i+1)<<".x*0xFFFFFFFF)));\n";
-        s<<"    atomicAdd(&forceBuffer[atom"<<(i+1)<<"+PADDED_NUM_ATOMS], (long) (force.x*0xFFFFFFFF));\n";
+        s<<"    atomicAdd(&forceBuffer[atom"<<(i+1)<<"+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force"<<(i+1)<<".y*0xFFFFFFFF)));\n";
-        s<<"    atomicAdd(&forceBuffer[atom"<<(i+1)<<"+PADDED_NUM_ATOMS*2], (long) (force.x*0xFFFFFFFF));\n";
+        s<<"    atomicAdd(&forceBuffer[atom"<<(i+1)<<"+PADDED_NUM_ATOMS*2], static_cast<unsigned long long>((long long) (force"<<(i+1)<<".z*0xFFFFFFFF)));\n";
+        s<<"    __threadfence_block();\n";
    }
    s<<"}\n";
    return s.str();
 }
 void CudaBondedUtilities::computeInteractions(int groups) {
-//    if (!hasInitializedKernels) {
+    if (!hasInitializedKernels) {
-//        hasInitializedKernels = true;
+        hasInitializedKernels = true;
-//        for (int i = 0; i < (int) forceSets.size(); i++) {
+        kernelArgs.push_back(&context.getForce().getDevicePointer());
-//            int index = 0;
+        kernelArgs.push_back(&context.getEnergyBuffer().getDevicePointer());
-//            cl::Kernel& kernel = kernels[i];
+        kernelArgs.push_back(&context.getPosq().getDevicePointer());
-//            kernel.setArg<cl::Buffer>(index++, context.getForceBuffers().getDeviceBuffer());
+        kernelArgs.push_back(NULL);
-//            kernel.setArg<cl::Buffer>(index++, context.getEnergyBuffer().getDeviceBuffer());
+        for (int i = 0; i < (int) atomIndices.size(); i++)
-//            kernel.setArg<cl::Buffer>(index++, context.getPosq().getDeviceBuffer());
+            for (int j = 0; j < (int) atomIndices[i].size(); j++)
-//            index++;
+                kernelArgs.push_back(&atomIndices[i][j]->getDevicePointer());
-//            for (int j = 0; j < (int) forceSets[i].size(); j++) {
+        for (int i = 0; i < (int) arguments.size(); i++)
-//                kernel.setArg<cl::Buffer>(index++, atomIndices[forceSets[i][j]]->getDeviceBuffer());
+            kernelArgs.push_back(&arguments[i]);
-//                kernel.setArg<cl::Buffer>(index++, bufferIndices[forceSets[i][j]]->getDeviceBuffer());
+    }
-//            }
+    kernelArgs[3] = &groups;
-//            for (int j = 0; j < (int) arguments.size(); j++)
+    context.executeKernel(kernel, &kernelArgs[0], maxBonds);
-//                kernel.setArg<cl::Memory>(index++, *arguments[j]);
-//        }
-//    }
-//    for (int i = 0; i < (int) kernels.size(); i++) {
-//        kernels[i].setArg<cl_int>(3, groups);
-//        context.executeKernel(kernels[i], maxBonds);
-//    }
 }
--- a/platforms/cuda2/src/CudaBondedUtilities.h
+++ b/platforms/cuda2/src/CudaBondedUtilities.h
@@ -59,7 +59,7 @@ namespace OpenMM {
 * <li>The positions of those atoms will have been stored in the real4 variables "pos1", "pos2", ....</li>
 * <li>A real variable called "energy" will exist.  Your code should add the potential energy of the
 * bond to that variable.</li>
- * <li>Your code should define real4 variables called "force1", "force2", ... that contain the force to
+ * <li>Your code should define real3 variables called "force1", "force2", ... that contain the force to
 * apply to each atom.</li>
 * </ol>
 * 
@@ -69,8 +69,8 @@ namespace OpenMM {
 * <tt><pre>
 * real4 delta = pos2-pos1;
 * energy += delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
- * real4 force1 = 2.0f*delta;
+ * real3 force1 = 2.0f*delta;
- * real4 force2 = -2.0f*delta;
+ * real3 force2 = -2.0f*delta;
 * </pre></tt>
 * 
 * Interactions will often depend on parameters or other data.  Call addArgument() to provide the data
@@ -129,6 +129,7 @@ private:
    std::vector<std::string> argTypes;
    std::vector<std::vector<CudaArray*> > atomIndices;
    std::vector<std::string> prefixCode;
+    std::vector<void*> kernelArgs;
    int numForceBuffers, maxBonds;
    bool hasInitializedKernels;
 };

--- a/platforms/cuda2/src/CudaContext.cpp
+++ b/platforms/cuda2/src/CudaContext.cpp
@@ -30,7 +30,7 @@
 #include <cmath>
 #include "CudaContext.h"
 #include "CudaArray.h"
-//#include "CudaBondedUtilities.h"
+#include "CudaBondedUtilities.h"
 #include "CudaForceInfo.h"
 #include "CudaIntegrationUtilities.h"
 #include "CudaKernelSources.h"
@@ -67,8 +67,8 @@ bool CudaContext::hasInitializedCuda = false;
 CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler,
        const string& tempDir, CudaPlatform::PlatformData& platformData) : system(system), compiler(compiler),
        time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), contextIsValid(false), atomsWereReordered(false), pinnedBuffer(NULL), posq(NULL),
-        velm(NULL), /*forceBuffers(NULL), longForceBuffer(NULL), energyBuffer(NULL), atomIndex(NULL),*/ integration(NULL), expression(NULL),
+        velm(NULL), force(NULL), energyBuffer(NULL), atomIndex(NULL), integration(NULL), expression(NULL),
-        /*bonded(NULL), nonbonded(NULL),*/ thread(NULL) {
+        bonded(NULL), /*nonbonded(NULL),*/ thread(NULL) {
    if (!hasInitializedCuda) {
        CHECK_RESULT2(cuInit(0), "Error initializing CUDA");
        hasInitializedCuda = true;
@@ -138,10 +138,9 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
    int numThreadBlocksPerComputeUnit = 6;
    numThreadBlocks = numThreadBlocksPerComputeUnit*multiprocessors;
-//    bonded = new CudaBondedUtilities(*this);
+    bonded = new CudaBondedUtilities(*this);
 //    nonbonded = new CudaNonbondedUtilities(*this);
    if (useDoublePrecision) {
-        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, paddedNumAtoms*sizeof(double4), 0));
        posq = CudaArray::create<double4>(paddedNumAtoms, "posq");
        velm = CudaArray::create<double4>(paddedNumAtoms, "velm");
        compilationDefines["make_real2"] = "make_double2";
@@ -149,7 +148,6 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
        compilationDefines["make_real4"] = "make_double4";
    }
    else {
-        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, paddedNumAtoms*sizeof(float4), 0));
        posq = CudaArray::create<float4>(paddedNumAtoms, "posq");
        velm = CudaArray::create<float4>(paddedNumAtoms, "velm");
        compilationDefines["make_real2"] = "make_float2";
@@ -203,22 +201,16 @@ CudaContext::~CudaContext() {
        delete posq;
    if (velm != NULL)
        delete velm;
-//    if (force != NULL)
+    if (force != NULL)
-//        delete force;
+        delete force;
-//    if (forceBuffers != NULL)
+    if (energyBuffer != NULL)
-//        delete forceBuffers;
+        delete energyBuffer;
-//    if (longForceBuffer != NULL)
-//        delete longForceBuffer;
-//    if (energyBuffer != NULL)
-//        delete energyBuffer;
-//    if (atomIndex != NULL)
-//        delete atomIndex;
    if (integration != NULL)
        delete integration;
    if (expression != NULL)
        delete expression;
-//    if (bonded != NULL)
+    if (bonded != NULL)
-//        delete bonded;
+        delete bonded;
 //    if (nonbonded != NULL)
 //        delete nonbonded;
    if (thread != NULL)
@@ -229,6 +221,17 @@ CudaContext::~CudaContext() {
 }
 void CudaContext::initialize() {
+    string errorMessage = "Error initializing Context";
+    if (useDoublePrecision) {
+        energyBuffer = CudaArray::create<double>(numThreadBlocks*ThreadBlockSize, "energyBuffer");
+        int pinnedBufferSize = max(paddedNumAtoms*4, numThreadBlocks*ThreadBlockSize);
+        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
+    }
+    else {
+        energyBuffer = CudaArray::create<float>(numThreadBlocks*ThreadBlockSize, "energyBuffer");
+        int pinnedBufferSize = max(paddedNumAtoms*6, numThreadBlocks*ThreadBlockSize);
+        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(float), 0));
+    }
    for (int i = 0; i < numAtoms; i++) {
        double mass = system.getParticleMass(i);
        if (useDoublePrecision)
@@ -237,11 +240,10 @@ void CudaContext::initialize() {
            ((float4*) pinnedBuffer)[i] = make_float4(0.0f, 0.0f, 0.0f, mass == 0.0 ? 0.0f : (float) (1.0/mass));
    }
    velm->upload(pinnedBuffer);
-//    bonded->initialize(system);
+    bonded->initialize(system);
-    force = CudaArray::create<long3>(paddedNumAtoms, "force");
+    force = CudaArray::create<long long>(paddedNumAtoms*3, "force");
-    addAutoclearBuffer(force->getDevicePointer(), force->getSize()*6);
+    addAutoclearBuffer(force->getDevicePointer(), force->getSize()*force->getElementSize());
-    energyBuffer = CudaArray::create<float>(numThreadBlocks*ThreadBlockSize, "energyBuffer");
+    addAutoclearBuffer(energyBuffer->getDevicePointer(), energyBuffer->getSize()*energyBuffer->getElementSize());
-    addAutoclearBuffer(energyBuffer->getDevicePointer(), energyBuffer->getSize());
    atomIndexDevice = CudaArray::create<int>(paddedNumAtoms, "atomIndex");
    atomIndex.resize(paddedNumAtoms);
    for (int i = 0; i < paddedNumAtoms; ++i)
@@ -448,82 +450,63 @@ void CudaContext::executeKernel(CUfunction kernel, void** arguments, int threads
 }
 void CudaContext::clearBuffer(CudaArray& array) {
-    clearBuffer(array.getDevicePointer(), array.getSize()*array.getElementSize()/4);
+    clearBuffer(array.getDevicePointer(), array.getSize()*array.getElementSize());
 }
 void CudaContext::clearBuffer(CUdeviceptr memory, int size) {
-    void* args[] = {&memory, &size};
+    int words = size/4;
+    void* args[] = {&memory, &words};
    executeKernel(clearBufferKernel, args, size, 128);
 }
 void CudaContext::addAutoclearBuffer(CUdeviceptr memory, int size) {
    autoclearBuffers.push_back(memory);
-    autoclearBufferSizes.push_back(size);
+    autoclearBufferSizes.push_back(size/4);
 }
-//void CudaContext::clearAutoclearBuffers() {
+void CudaContext::clearAutoclearBuffers() {
-//    int base = 0;
+    int base = 0;
-//    int total = autoclearBufferSizes.size();
+    int total = autoclearBufferSizes.size();
-//    while (total-base >= 6) {
+    while (total-base >= 6) {
-//        clearSixBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]);
+        void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base],
-//        clearSixBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]);
+                        &autoclearBuffers[base+1], &autoclearBufferSizes[base+1],
-//        clearSixBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]);
+                        &autoclearBuffers[base+2], &autoclearBufferSizes[base+2],
-//        clearSixBuffersKernel.setArg<cl_int>(3, autoclearBufferSizes[base+1]);
+                        &autoclearBuffers[base+3], &autoclearBufferSizes[base+3],
-//        clearSixBuffersKernel.setArg<cl::Memory>(4, *autoclearBuffers[base+2]);
+                        &autoclearBuffers[base+4], &autoclearBufferSizes[base+4],
-//        clearSixBuffersKernel.setArg<cl_int>(5, autoclearBufferSizes[base+2]);
+                        &autoclearBuffers[base+5], &autoclearBufferSizes[base+5]};
-//        clearSixBuffersKernel.setArg<cl::Memory>(6, *autoclearBuffers[base+3]);
+        executeKernel(clearSixBuffersKernel, args, max(max(max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), autoclearBufferSizes[base+4]), autoclearBufferSizes[base+5]), 128);
-//        clearSixBuffersKernel.setArg<cl_int>(7, autoclearBufferSizes[base+3]);
+        base += 6;
-//        clearSixBuffersKernel.setArg<cl::Memory>(8, *autoclearBuffers[base+4]);
+    }
-//        clearSixBuffersKernel.setArg<cl_int>(9, autoclearBufferSizes[base+4]);
+    if (total-base == 5) {
-//        clearSixBuffersKernel.setArg<cl::Memory>(10, *autoclearBuffers[base+5]);
+        void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base],
-//        clearSixBuffersKernel.setArg<cl_int>(11, autoclearBufferSizes[base+5]);
+                        &autoclearBuffers[base+1], &autoclearBufferSizes[base+1],
-//        executeKernel(clearSixBuffersKernel, max(max(max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), autoclearBufferSizes[base+4]), autoclearBufferSizes[base+5]), 128);
+                        &autoclearBuffers[base+2], &autoclearBufferSizes[base+2],
-//        base += 6;
+                        &autoclearBuffers[base+3], &autoclearBufferSizes[base+3],
-//    }
+                        &autoclearBuffers[base+4], &autoclearBufferSizes[base+4]};
-//    if (total-base == 5) {
+        executeKernel(clearFiveBuffersKernel, args, max(max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), autoclearBufferSizes[base+4]), 128);
-//        clearFiveBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]);
+    }
-//        clearFiveBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]);
+    else if (total-base == 4) {
-//        clearFiveBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]);
+        void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base],
-//        clearFiveBuffersKernel.setArg<cl_int>(3, autoclearBufferSizes[base+1]);
+                        &autoclearBuffers[base+1], &autoclearBufferSizes[base+1],
-//        clearFiveBuffersKernel.setArg<cl::Memory>(4, *autoclearBuffers[base+2]);
+                        &autoclearBuffers[base+2], &autoclearBufferSizes[base+2],
-//        clearFiveBuffersKernel.setArg<cl_int>(5, autoclearBufferSizes[base+2]);
+                        &autoclearBuffers[base+3], &autoclearBufferSizes[base+3]};
-//        clearFiveBuffersKernel.setArg<cl::Memory>(6, *autoclearBuffers[base+3]);
+        executeKernel(clearFourBuffersKernel, args, max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), 128);
-//        clearFiveBuffersKernel.setArg<cl_int>(7, autoclearBufferSizes[base+3]);
+    }
-//        clearFiveBuffersKernel.setArg<cl::Memory>(8, *autoclearBuffers[base+4]);
+    else if (total-base == 3) {
-//        clearFiveBuffersKernel.setArg<cl_int>(9, autoclearBufferSizes[base+4]);
+        void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base],
-//        executeKernel(clearFiveBuffersKernel, max(max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), autoclearBufferSizes[base+4]), 128);
+                        &autoclearBuffers[base+1], &autoclearBufferSizes[base+1],
-//    }
+                        &autoclearBuffers[base+2], &autoclearBufferSizes[base+2]};
-//    else if (total-base == 4) {
+        executeKernel(clearThreeBuffersKernel, args, max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), 128);
-//        clearFourBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]);
+    }
-//        clearFourBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]);
+    else if (total-base == 2) {
-//        clearFourBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]);
+        void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base],
-//        clearFourBuffersKernel.setArg<cl_int>(3, autoclearBufferSizes[base+1]);
+                        &autoclearBuffers[base+1], &autoclearBufferSizes[base+1]};
-//        clearFourBuffersKernel.setArg<cl::Memory>(4, *autoclearBuffers[base+2]);
+        executeKernel(clearTwoBuffersKernel, args, max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), 128);
-//        clearFourBuffersKernel.setArg<cl_int>(5, autoclearBufferSizes[base+2]);
+    }
-//        clearFourBuffersKernel.setArg<cl::Memory>(6, *autoclearBuffers[base+3]);
+    else if (total-base == 1) {
-//        clearFourBuffersKernel.setArg<cl_int>(7, autoclearBufferSizes[base+3]);
+        clearBuffer(autoclearBuffers[base], autoclearBufferSizes[base]*4);
-//        executeKernel(clearFourBuffersKernel, max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), 128);
+    }
-//    }
+}
-//    else if (total-base == 3) {
-//        clearThreeBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]);
-//        clearThreeBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]);
-//        clearThreeBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]);
-//        clearThreeBuffersKernel.setArg<cl_int>(3, autoclearBufferSizes[base+1]);
-//        clearThreeBuffersKernel.setArg<cl::Memory>(4, *autoclearBuffers[base+2]);
-//        clearThreeBuffersKernel.setArg<cl_int>(5, autoclearBufferSizes[base+2]);
-//        executeKernel(clearThreeBuffersKernel, max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), 128);
-//    }
-//    else if (total-base == 2) {
-//        clearTwoBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]);
-//        clearTwoBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]);
-//        clearTwoBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]);
-//        clearTwoBuffersKernel.setArg<cl_int>(3, autoclearBufferSizes[base+1]);
-//        executeKernel(clearTwoBuffersKernel, max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), 128);
-//    }
-//    else if (total-base == 1) {
-//        clearBuffer(*autoclearBuffers[base], autoclearBufferSizes[base]);
-//    }
-//}
 void CudaContext::tagAtomsInMolecule(int atom, int molecule, vector<int>& atomMolecule, vector<vector<int> >& atomBonds) {
    // Recursively tag atoms as belonging to a particular molecule.
@@ -539,7 +522,7 @@ void CudaContext::tagAtomsInMolecule(int atom, int molecule, vector<int>& atomMo
 */
 class CudaContext::VirtualSiteInfo : public CudaForceInfo {
 public:
-    VirtualSiteInfo(const System& system) : CudaForceInfo(0) {
+    VirtualSiteInfo(const System& system) {
        for (int i = 0; i < system.getNumParticles(); i++) {
            if (system.isVirtualSite(i)) {
                siteTypes.push_back(&typeid(system.getVirtualSite(i)));

--- a/platforms/cuda2/src/CudaContext.h
+++ b/platforms/cuda2/src/CudaContext.h
@@ -125,41 +125,42 @@ public:
        return *velm;
    }
    /**
-     * Get the array which contains the force on each atom (respresented as a long3 in 64 bit fixed point).
+     * Get the array which contains the force on each atom (represented as three long longs in 64 bit fixed point).
     */
    CudaArray& getForce() {
        return *force;
    }
-//    /**
+    /**
-//     * Get the array which contains the buffers in which forces are computed.
+     * Get the array which contains the buffer in which energy is computed.
-//     */
+     */
-//    CudaArray<mm_float4>& getForceBuffers() {
+    CudaArray& getEnergyBuffer() {
-//        return *forceBuffers;
+        return *energyBuffer;
-//    }
+    }
-//    /**
+    /**
-//     * Get the array which contains a contribution to each force represented as 64 bit fixed point.
+     * Get a pointer to a block of pinned memory that can be used for efficient transfers between host and device.
-//     */
+     * This is guaranteed to be at least as large as any of the arrays returned by methods of this class.
-//    CudaArray<cl_long>& getLongForceBuffer() {
+     */
-//        return *longForceBuffer;
+    void* getPinnedBuffer() {
-//    }
+        return pinnedBuffer;
-//    /**
+    }
-//     * Get the array which contains the buffer in which energy is computed.
+    /**
-//     */
+     * Get the host-side vector which contains the index of each atom.
-//    CudaArray<cl_float>& getEnergyBuffer() {
+     */
-//        return *energyBuffer;
+    const std::vector<int>& getAtomIndex() const {
-//    }
+        return atomIndex;
-//    /**
+    }
-//     * Get the array which contains the index of each atom.
+    /**
-//     */
+     * Get the array which contains the index of each atom.
-//    CudaArray<cl_int>& getAtomIndex() {
+     */
-//        return *atomIndex;
+    CudaArray& getAtomIndexArray() {
-//    }
+        return *atomIndexDevice;
-//    /**
+    }
-//     * Get the number of cells by which the positions are offset.
+    /**
-//     */
+     * Get the number of cells by which the positions are offset.
-//    std::vector<mm_int4>& getPosCellOffsets() {
+     */
-//        return posCellOffsets;
+    std::vector<int4>& getPosCellOffsets() {
-//    }
+        return posCellOffsets;
+    }
    /**
     * Replace all occurrences of a list of substrings.
     *
@@ -210,20 +211,20 @@ public:
     * Set all elements of an array to 0.
     *
     * @param memory     the memory to clear
-     * @param size       the number of 4-byte elements in the buffer
+     * @param size       the size of the buffer in bytes
     */
    void clearBuffer(CUdeviceptr memory, int size);
    /**
     * Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
     *
     * @param memory     the memory to clear
-     * @param size       the number of 4-byte elements in the buffer
+     * @param size       the size of the buffer in bytes
     */
    void addAutoclearBuffer(CUdeviceptr memory, int size);
-//    /**
+    /**
-//     * Clear all buffers that have been registered with addAutoclearBuffer().
+     * Clear all buffers that have been registered with addAutoclearBuffer().
-//     */
+     */
-//    void clearAutoclearBuffers();
+    void clearAutoclearBuffers();
    /**
     * Get the current simulation time.
     */
@@ -309,27 +310,27 @@ public:
    /**
     * Convert a CUDA result code to the corresponding string description.
     */
-    std::string getErrorString(CUresult result);
+    static std::string getErrorString(CUresult result);
-//    /**
+    /**
-//     * Get the size of the periodic box.
+     * Get the size of the periodic box.
-//     */
+     */
-//    float4 getPeriodicBoxSize() const {
+    double4 getPeriodicBoxSize() const {
-//        return periodicBoxSize;
+        return periodicBoxSize;
-//    }
+    }
-//    /**
+    /**
-//     * Set the size of the periodic box.
+     * Set the size of the periodic box.
-//     */
+     */
-//    void setPeriodicBoxSize(double xsize, double ysize, double zsize) {
+    void setPeriodicBoxSize(double xsize, double ysize, double zsize) {
-//        periodicBoxSize = make_float4((float) xsize, (float) ysize, (float) zsize, 0);
+        periodicBoxSize = make_double4(xsize, ysize, zsize, 0.0);
-//        invPeriodicBoxSize = make_float4((float) (1.0/xsize), (float) (1.0/ysize), (float) (1.0/zsize), 0);
+        invPeriodicBoxSize = make_double4(1.0/xsize, 1.0/ysize, 1.0/zsize, 0.0);
-//    }
+    }
-//    /**
+    /**
-//     * Get the inverse of the size of the periodic box.
+     * Get the inverse of the size of the periodic box.
-//     */
+     */
-//    float4 getInvPeriodicBoxSize() const {
+    double4 getInvPeriodicBoxSize() const {
-//        return invPeriodicBoxSize;
+        return invPeriodicBoxSize;
-//    }
+    }
    /**
     * Get the CudaIntegrationUtilities for this context.
     */
@@ -342,12 +343,12 @@ public:
    CudaExpressionUtilities& getExpressionUtilities() {
        return *expression;
    }
-//    /**
+    /**
-//     * Get the CudaBondedUtilities for this context.
+     * Get the CudaBondedUtilities for this context.
-//     */
+     */
-//    CudaBondedUtilities& getBondedUtilities() {
+    CudaBondedUtilities& getBondedUtilities() {
-//        return *bonded;
+        return *bonded;
-//    }
+    }
 //    /**
 //     * Get the CudaNonbondedUtilities for this context.
 //     */
@@ -428,8 +429,8 @@ private:
    int numThreadBlocks;
    bool useBlockingSync, useDoublePrecision, accumulateInDouble, contextIsValid, atomsWereReordered, moleculesInvalid;
    std::string compiler, tempDir, gpuArchitecture;
-    float4 periodicBoxSize;
+    double4 periodicBoxSize;
-    float4 invPeriodicBoxSize;
+    double4 invPeriodicBoxSize;
    std::string defaultOptimizationOptions;
    std::map<std::string, std::string> compilationDefines;
    CUcontext context;
@@ -456,7 +457,7 @@ private:
    std::vector<ReorderListener*> reorderListeners;
    CudaIntegrationUtilities* integration;
    CudaExpressionUtilities* expression;
-//    CudaBondedUtilities* bonded;
+    CudaBondedUtilities* bonded;
 //    CudaNonbondedUtilities* nonbonded;
    WorkThread* thread;
 };

--- a/platforms/cuda2/src/CudaForceInfo.h
+++ b/platforms/cuda2/src/CudaForceInfo.h
@@ -39,13 +39,7 @@ namespace OpenMM {
 class OPENMM_EXPORT CudaForceInfo {
 public:
-    CudaForceInfo(int requiredForceBuffers) : requiredForceBuffers(requiredForceBuffers) {
+    CudaForceInfo() {
-    }
-    /**
-     * Get the number of force buffers this force requires.
-     */
-    int getRequiredForceBuffers() {
-        return requiredForceBuffers;
    }
    /**
     * Get whether or not two particles have identical force field parameters.
@@ -63,8 +57,6 @@ public:
     * Get whether two particle groups are identical.
     */
    virtual bool areGroupsIdentical(int group1, int group2);
-private:
-    int requiredForceBuffers;
 };
 } // namespace OpenMM

--- a/platforms/cuda2/src/CudaIntegrationUtilities.cpp
+++ b/platforms/cuda2/src/CudaIntegrationUtilities.cpp
@@ -672,15 +672,15 @@ CudaIntegrationUtilities::~CudaIntegrationUtilities() {
        delete vsiteOutOfPlaneWeights;
 }
-//void CudaIntegrationUtilities::applyConstraints(double tol) {
+void CudaIntegrationUtilities::applyConstraints(double tol) {
-//    applyConstraints(false, tol);
+    applyConstraints(false, tol);
-//}
+}
-//
-//void CudaIntegrationUtilities::applyVelocityConstraints(double tol) {
+void CudaIntegrationUtilities::applyVelocityConstraints(double tol) {
-//    applyConstraints(true, tol);
+    applyConstraints(true, tol);
-//}
+}
-//
-//void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double tol) {
+void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double tol) {
 //    bool hasInitialized;
 //    CUfunction settleKernel, shakeKernel, ccmaForceKernel, ccmaUpdateKernel;
 //    if (constrainVelocities) {
@@ -772,19 +772,19 @@ CudaIntegrationUtilities::~CudaIntegrationUtilities() {
 //            }
 //        }
 //    }
-//}
+}
-//
-//void CudaIntegrationUtilities::computeVirtualSites() {
+void CudaIntegrationUtilities::computeVirtualSites() {
 //    if (numVsites > 0)
 //        context.executeKernel(vsitePositionKernel, numVsites);
-//}
+}
-//
-//void CudaIntegrationUtilities::distributeForcesFromVirtualSites() {
+void CudaIntegrationUtilities::distributeForcesFromVirtualSites() {
 //    if (numVsites > 0) {
 //        vsiteForceKernel.setArg<cl::Buffer>(1, context.getForce().getDeviceBuffer());
 //        context.executeKernel(vsiteForceKernel, numVsites);
 //    }
-//}
+}
 void CudaIntegrationUtilities::initRandomNumberGenerator(unsigned int randomNumberSeed) {
    if (random != NULL) {

--- a/platforms/cuda2/src/CudaKernelFactory.cpp
+++ b/platforms/cuda2/src/CudaKernelFactory.cpp
@@ -25,6 +25,7 @@
 * -------------------------------------------------------------------------- */
 #include "CudaKernelFactory.h"
+#include "CudaKernels.h"
 //#include "CudaParallelKernels.h"
 #include "CudaPlatform.h"
 #include "openmm/internal/ContextImpl.h"
@@ -66,64 +67,64 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform
 //        if (name == CalcCustomCompoundBondForceKernel::Name())
 //            return new CudaParallelCalcCustomCompoundBondForceKernel(name, platform, data, context.getSystem());
 //    }
-//    CudaContext& cl = *data.contexts[0];
+    CudaContext& cu = *data.contexts[0];
-//    if (name == CalcForcesAndEnergyKernel::Name())
+    if (name == CalcForcesAndEnergyKernel::Name())
-//        return new CudaCalcForcesAndEnergyKernel(name, platform, cl);
+        return new CudaCalcForcesAndEnergyKernel(name, platform, cu);
-//    if (name == UpdateStateDataKernel::Name())
+    if (name == UpdateStateDataKernel::Name())
-//        return new CudaUpdateStateDataKernel(name, platform, cl);
+        return new CudaUpdateStateDataKernel(name, platform, cu);
-//    if (name == ApplyConstraintsKernel::Name())
+    if (name == ApplyConstraintsKernel::Name())
-//        return new CudaApplyConstraintsKernel(name, platform, cl);
+        return new CudaApplyConstraintsKernel(name, platform, cu);
-//    if (name == VirtualSitesKernel::Name())
+    if (name == VirtualSitesKernel::Name())
-//        return new CudaVirtualSitesKernel(name, platform, cl);
+        return new CudaVirtualSitesKernel(name, platform, cu);
-//    if (name == CalcHarmonicBondForceKernel::Name())
+    if (name == CalcHarmonicBondForceKernel::Name())
-//        return new CudaCalcHarmonicBondForceKernel(name, platform, cl, context.getSystem());
+        return new CudaCalcHarmonicBondForceKernel(name, platform, cu, context.getSystem());
 //    if (name == CalcCustomBondForceKernel::Name())
-//        return new CudaCalcCustomBondForceKernel(name, platform, cl, context.getSystem());
+//        return new CudaCalcCustomBondForceKernel(name, platform, cu, context.getSystem());
 //    if (name == CalcHarmonicAngleForceKernel::Name())
-//        return new CudaCalcHarmonicAngleForceKernel(name, platform, cl, context.getSystem());
+//        return new CudaCalcHarmonicAngleForceKernel(name, platform, cu, context.getSystem());
 //    if (name == CalcCustomAngleForceKernel::Name())
-//        return new CudaCalcCustomAngleForceKernel(name, platform, cl, context.getSystem());
+//        return new CudaCalcCustomAngleForceKernel(name, platform, cu, context.getSystem());
 //    if (name == CalcPeriodicTorsionForceKernel::Name())
-//        return new CudaCalcPeriodicTorsionForceKernel(name, platform, cl, context.getSystem());
+//        return new CudaCalcPeriodicTorsionForceKernel(name, platform, cu, context.getSystem());
 //    if (name == CalcRBTorsionForceKernel::Name())
-//        return new CudaCalcRBTorsionForceKernel(name, platform, cl, context.getSystem());
+//        return new CudaCalcRBTorsionForceKernel(name, platform, cu, context.getSystem());
 //    if (name == CalcCMAPTorsionForceKernel::Name())
-//        return new CudaCalcCMAPTorsionForceKernel(name, platform, cl, context.getSystem());
+//        return new CudaCalcCMAPTorsionForceKernel(name, platform, cu, context.getSystem());
 //    if (name == CalcCustomTorsionForceKernel::Name())
-//        return new CudaCalcCustomTorsionForceKernel(name, platform, cl, context.getSystem());
+//        return new CudaCalcCustomTorsionForceKernel(name, platform, cu, context.getSystem());
 //    if (name == CalcNonbondedForceKernel::Name())
-//        return new CudaCalcNonbondedForceKernel(name, platform, cl, context.getSystem());
+//        return new CudaCalcNonbondedForceKernel(name, platform, cu, context.getSystem());
 //    if (name == CalcCustomNonbondedForceKernel::Name())
-//        return new CudaCalcCustomNonbondedForceKernel(name, platform, cl, context.getSystem());
+//        return new CudaCalcCustomNonbondedForceKernel(name, platform, cu, context.getSystem());
 //    if (name == CalcGBSAOBCForceKernel::Name())
-//        return new CudaCalcGBSAOBCForceKernel(name, platform, cl);
+//        return new CudaCalcGBSAOBCForceKernel(name, platform, cu);
 //    if (name == CalcCustomGBForceKernel::Name())
-//        return new CudaCalcCustomGBForceKernel(name, platform, cl, context.getSystem());
+//        return new CudaCalcCustomGBForceKernel(name, platform, cu, context.getSystem());
 //    if (name == CalcCustomExternalForceKernel::Name())
-//        return new CudaCalcCustomExternalForceKernel(name, platform, cl, context.getSystem());
+//        return new CudaCalcCustomExternalForceKernel(name, platform, cu, context.getSystem());
 //    if (name == CalcCustomHbondForceKernel::Name())
-//        return new CudaCalcCustomHbondForceKernel(name, platform, cl, context.getSystem());
+//        return new CudaCalcCustomHbondForceKernel(name, platform, cu, context.getSystem());
 //    if (name == CalcCustomCompoundBondForceKernel::Name())
-//        return new CudaCalcCustomCompoundBondForceKernel(name, platform, cl, context.getSystem());
+//        return new CudaCalcCustomCompoundBondForceKernel(name, platform, cu, context.getSystem());
-//    if (name == IntegrateVerletStepKernel::Name())
+    if (name == IntegrateVerletStepKernel::Name())
-//        return new CudaIntegrateVerletStepKernel(name, platform, cl);
+        return new CudaIntegrateVerletStepKernel(name, platform, cu);
 //    if (name == IntegrateLangevinStepKernel::Name())
-//        return new CudaIntegrateLangevinStepKernel(name, platform, cl);
+//        return new CudaIntegrateLangevinStepKernel(name, platform, cu);
 //    if (name == IntegrateBrownianStepKernel::Name())
-//        return new CudaIntegrateBrownianStepKernel(name, platform, cl);
+//        return new CudaIntegrateBrownianStepKernel(name, platform, cu);
 //    if (name == IntegrateVariableVerletStepKernel::Name())
-//        return new CudaIntegrateVariableVerletStepKernel(name, platform, cl);
+//        return new CudaIntegrateVariableVerletStepKernel(name, platform, cu);
 //    if (name == IntegrateVariableLangevinStepKernel::Name())
-//        return new CudaIntegrateVariableLangevinStepKernel(name, platform, cl);
+//        return new CudaIntegrateVariableLangevinStepKernel(name, platform, cu);
 //    if (name == IntegrateCustomStepKernel::Name())
-//        return new CudaIntegrateCustomStepKernel(name, platform, cl);
+//        return new CudaIntegrateCustomStepKernel(name, platform, cu);
 //    if (name == ApplyAndersenThermostatKernel::Name())
-//        return new CudaApplyAndersenThermostatKernel(name, platform, cl);
+//        return new CudaApplyAndersenThermostatKernel(name, platform, cu);
 //    if (name == ApplyMonteCarloBarostatKernel::Name())
-//        return new CudaApplyMonteCarloBarostatKernel(name, platform, cl);
+//        return new CudaApplyMonteCarloBarostatKernel(name, platform, cu);
-//    if (name == CalcKineticEnergyKernel::Name())
+    if (name == CalcKineticEnergyKernel::Name())
-//        return new CudaCalcKineticEnergyKernel(name, platform, cl);
+        return new CudaCalcKineticEnergyKernel(name, platform, cu);
 //    if (name == RemoveCMMotionKernel::Name())
-//        return new CudaRemoveCMMotionKernel(name, platform, cl);
+//        return new CudaRemoveCMMotionKernel(name, platform, cu);
    throw OpenMMException((std::string("Tried to create kernel with illegal kernel name '")+name+"'").c_str());
 }
--- a/platforms/cuda2/src/CudaKernels.cpp
+++ b/platforms/cuda2/src/CudaKernels.cpp
--- a/platforms/cuda2/src/CudaKernels.h
+++ b/platforms/cuda2/src/CudaKernels.h
--- a/platforms/cuda2/src/kernels/bondForce.cu
+++ b/platforms/cuda2/src/kernels/bondForce.cu
+real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
+real r = SQRT(delta.x*delta.x + delta.y*delta.y + delta.z*delta.z);
+COMPUTE_FORCE
+dEdR = (r > 0) ? (dEdR / r) : 0;
+delta *= dEdR;
+real3 force1 = delta;
+real3 force2 = -delta;
--- a/platforms/cuda2/src/kernels/harmonicBondForce.cu
+++ b/platforms/cuda2/src/kernels/harmonicBondForce.cu
+float2 bondParams = PARAMS[index];
+real deltaIdeal = r-bondParams.x;
+energy += 0.5f * bondParams.y*deltaIdeal*deltaIdeal;
+real dEdR = bondParams.y * deltaIdeal;
--- a/platforms/cuda2/src/kernels/vectorOps.cu
+++ b/platforms/cuda2/src/kernels/vectorOps.cu
@@ -42,7 +42,7 @@ inline __device__ double4 make_double4(double a) {
 // Negate a vector.
-inline __device__ int2 operator*(int2 a) {
+inline __device__ int2 operator-(int2 a) {
    return make_int2(-a.x, -a.y);
 }
@@ -455,3 +455,41 @@ inline __device__ double3 operator*(double a, double3 b) {
 inline __device__ double4 operator*(double a, double4 b) {
    return make_double4(a*b.x, a*b.y, a*b.z, a*b.w);
 }
+// *= operator (multiply vector by constant)
+inline __device__ void operator*=(int2& a, int b) {
+    a.x *= b; a.y *= b;
+}
+inline __device__ void operator*=(int3& a, int b) {
+    a.x *= b; a.y *= b; a.z *= b;
+}
+inline __device__ void operator*=(int4& a, int b) {
+    a.x *= b; a.y *= b; a.z *= b; a.w *= b;
+}
+inline __device__ void operator*=(float2& a, float b) {
+    a.x *= b; a.y *= b;
+}
+inline __device__ void operator*=(float3& a, float b) {
+    a.x *= b; a.y *= b; a.z *= b;
+}
+inline __device__ void operator*=(float4& a, float b) {
+    a.x *= b; a.y *= b; a.z *= b; a.w *= b;
+}
+inline __device__ void operator*=(double2& a, double b) {
+    a.x *= b; a.y *= b;
+}
+inline __device__ void operator*=(double3& a, double b) {
+    a.x *= b; a.y *= b; a.z *= b;
+}
+inline __device__ void operator*=(double4& a, double b) {
+    a.x *= b; a.y *= b; a.z *= b; a.w *= b;
+}
--- a/platforms/cuda2/src/kernels/verlet.cu
+++ b/platforms/cuda2/src/kernels/verlet.cu
+/**
+ * Perform the first step of Verlet integration.
+ */
+extern "C" __global__ void integrateVerletPart1(const real2* __restrict__ dt, const real4* __restrict__ posq, real4* __restrict__ velm, const long long* __restrict__ force, real4* __restrict__ posDelta) {
+    const real2 stepSize = dt[0];
+    const real dtPos = stepSize.y;
+    const real dtVel = 0.5f*(stepSize.x+stepSize.y);
+    const real scale = dtVel/(real) 0xFFFFFFFF;
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
+        real4 velocity = velm[index];
+        if (velocity.w != 0.0) {
+            real4 pos = posq[index];
+            velocity.x += scale*force[index]*velocity.w;
+            velocity.y += scale*force[index+PADDED_NUM_ATOMS]*velocity.w;
+            velocity.z += scale*force[index+PADDED_NUM_ATOMS*2]*velocity.w;
+            pos.x = velocity.x*dtPos;
+            pos.y = velocity.y*dtPos;
+            pos.z = velocity.z*dtPos;
+            posDelta[index] = pos;
+            velm[index] = velocity;
+        }
+    }
+}
+/**
+ * Perform the second step of Verlet integration.
+ */
+extern "C" __global__ void integrateVerletPart2(real2* __restrict__ dt, real4* __restrict__ posq, real4* __restrict__ velm, const real4* __restrict__ posDelta) {
+    real2 stepSize = dt[0];
+    double oneOverDt = 1.0/stepSize.y;
+    int index = blockIdx.x*blockDim.x+threadIdx.x;
+    if (index == 0)
+        dt[0].x = stepSize.y;
+    for (; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
+        real4 velocity = velm[index];
+        if (velocity.w != 0.0) {
+            real4 pos = posq[index];
+            real4 delta = posDelta[index];
+            pos.x += delta.x;
+            pos.y += delta.y;
+            pos.z += delta.z;
+            velocity = make_real4((real) (delta.x*oneOverDt), (real) (delta.y*oneOverDt), (real) (delta.z*oneOverDt), velocity.w);
+            posq[index] = pos;
+            velm[index] = velocity;
+        }
+    }
+}
+/**
+ * Select the step size to use for the next step.
+ */
+//
+//extern "C" __global__ void selectVerletStepSize(real maxStepSize, real errorTol, real2* __restrict__ dt, const real4* __restrict__ velm, const real4* __restrict__ force, __local real* __restrict__ error) {
+//    // Calculate the error.
+//
+//    real err = 0.0f;
+//    for (int index = threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
+//        real4 f = force[index];
+//        real invMass = velm[index].w;
+//        err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass;
+//    }
+//    error[threadIdx.x] = err;
+//    __syncthreads;
+//
+//    // Sum the errors from all threads.
+//
+//    for (unsigned int offset = 1; offset < get_local_size(0); offset *= 2) {
+//        if (threadIdx.x+offset < get_local_size(0) && (threadIdx.x&(2*offset-1)) == 0)
+//            error[threadIdx.x] += error[threadIdx.x+offset];
+//        __syncthreads;
+//    }
+//    if (threadIdx.x == 0) {
+//        real totalError = sqrt(error[0]/(NUM_ATOMS*3));
+//        real newStepSize = sqrt(errorTol/totalError);
+//        real oldStepSize = dt[0].y;
+//        if (oldStepSize > 0.0f)
+//            newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase.
+//        if (newStepSize > oldStepSize && newStepSize < 1.1f*oldStepSize)
+//            newStepSize = oldStepSize; // Keeping dt constant between steps improves the behavior of the integrator.
+//        if (newStepSize > maxStepSize)
+//            newStepSize = maxStepSize;
+//        dt[0].y = newStepSize;
+//    }
+//}
--- a/platforms/cuda2/tests/TestCudaHarmonicBondForce.cpp
+++ b/platforms/cuda2/tests/TestCudaHarmonicBondForce.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+/**
+ * This tests the CUDA implementation of HarmonicBondForce.
+ */
+#include "openmm/internal/AssertionUtilities.h"
+#include "openmm/Context.h"
+#include "CudaPlatform.h"
+#include "openmm/HarmonicBondForce.h"
+#include "openmm/System.h"
+#include "openmm/VerletIntegrator.h"
+#include <iostream>
+#include <map>
+#include <vector>
+using namespace OpenMM;
+using namespace std;
+const double TOL = 1e-5;
+void testBonds() {
+    CudaPlatform platform;
+    System system;
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    VerletIntegrator integrator(0.01);
+    HarmonicBondForce* forceField = new HarmonicBondForce();
+    forceField->addBond(0, 1, 1.5, 0.8);
+    forceField->addBond(1, 2, 1.2, 0.7);
+    system.addForce(forceField);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(3);
+    positions[0] = Vec3(0, 2, 0);
+    positions[1] = Vec3(0, 0, 0);
+    positions[2] = Vec3(1, 0, 0);
+    context.setPositions(positions);
+    State state = context.getState(State::Forces | State::Energy);
+    {
+        const vector<Vec3>& forces = state.getForces();
+        ASSERT_EQUAL_VEC(Vec3(0, -0.8*0.5, 0), forces[0], TOL);
+        ASSERT_EQUAL_VEC(Vec3(0.7*0.2, 0, 0), forces[2], TOL);
+        ASSERT_EQUAL_VEC(Vec3(-forces[0][0]-forces[2][0], -forces[0][1]-forces[2][1], -forces[0][2]-forces[2][2]), forces[1], TOL);
+        ASSERT_EQUAL_TOL(0.5*0.8*0.5*0.5 + 0.5*0.7*0.2*0.2, state.getPotentialEnergy(), TOL);
+    }
+    // Try changing the bond parameters and make sure it's still correct.
+    forceField->setBondParameters(0, 0, 1, 1.6, 0.9);
+    forceField->setBondParameters(1, 1, 2, 1.3, 0.8);
+    forceField->updateParametersInContext(context);
+    state = context.getState(State::Forces | State::Energy);
+    {
+        const vector<Vec3>& forces = state.getForces();
+        ASSERT_EQUAL_VEC(Vec3(0, -0.9*0.4, 0), forces[0], TOL);
+        ASSERT_EQUAL_VEC(Vec3(0.8*0.3, 0, 0), forces[2], TOL);
+        ASSERT_EQUAL_VEC(Vec3(-forces[0][0]-forces[2][0], -forces[0][1]-forces[2][1], -forces[0][2]-forces[2][2]), forces[1], TOL);
+        ASSERT_EQUAL_TOL(0.5*0.9*0.4*0.4 + 0.5*0.8*0.3*0.3, state.getPotentialEnergy(), TOL);
+    }
+}
+void testParallelComputation() {
+    CudaPlatform platform;
+    System system;
+    const int numParticles = 200;
+    for (int i = 0; i < numParticles; i++)
+        system.addParticle(1.0);
+    HarmonicBondForce* force = new HarmonicBondForce();
+    for (int i = 1; i < numParticles; i++)
+        force->addBond(i-1, i, 1.1, i);
+    system.addForce(force);
+    vector<Vec3> positions(numParticles);
+    for (int i = 0; i < numParticles; i++)
+        positions[i] = Vec3(i, 0, 0);
+    VerletIntegrator integrator1(0.01);
+    Context context1(system, integrator1, platform);
+    context1.setPositions(positions);
+    State state1 = context1.getState(State::Forces | State::Energy);
+    VerletIntegrator integrator2(0.01);
+    string deviceIndex = platform.getPropertyValue(context1, CudaPlatform::CudaDeviceIndex());
+    map<string, string> props;
+    props[CudaPlatform::CudaDeviceIndex()] = deviceIndex+","+deviceIndex;
+    Context context2(system, integrator2, platform, props);
+    context2.setPositions(positions);
+    State state2 = context2.getState(State::Forces | State::Energy);
+    ASSERT_EQUAL_TOL(state1.getPotentialEnergy(), state2.getPotentialEnergy(), 1e-5);
+    for (int i = 0; i < numParticles; i++)
+        ASSERT_EQUAL_VEC(state1.getForces()[i], state2.getForces()[i], 1e-5);
+}
+int main() {
+    try {
+        testBonds();
+//        testParallelComputation();
+    }
+    catch(const exception& e) {
+        cout << "exception: " << e.what() << endl;
+        return 1;
+    }
+    cout << "Done" << endl;
+    return 0;
+}
--- a/platforms/cuda2/tests/TestCudaVerletIntegrator.cpp
+++ b/platforms/cuda2/tests/TestCudaVerletIntegrator.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+/**
+ * This tests the CUDA implementation of VerletIntegrator.
+ */
+#include "openmm/internal/AssertionUtilities.h"
+#include "openmm/Context.h"
+#include "CudaPlatform.h"
+#include "openmm/HarmonicBondForce.h"
+#include "openmm/NonbondedForce.h"
+#include "openmm/System.h"
+#include "openmm/VerletIntegrator.h"
+#include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
+#include "sfmt/SFMT.h"
+#include <iostream>
+#include <vector>
+using namespace OpenMM;
+using namespace std;
+const double TOL = 1e-5;
+/**
+ * Compute the energy of a state, taking into account the half step offset between
+ * positions and velocities.
+ */
+static double computeEnergy(const State& state, const System& system, double dt) {
+    const vector<Vec3>& v = state.getVelocities();
+    const vector<Vec3>& f = state.getForces();
+    double energy = 0.0;
+    for (int i = 0; i < system.getNumParticles(); i++) {
+        double m = system.getParticleMass(i);
+        Vec3 vel = v[i]+f[i]*(0.5*dt/m);
+        energy += 0.5*m*vel.dot(vel);
+    }
+    return energy+state.getPotentialEnergy();
+}
+void testSingleBond() {
+    CudaPlatform platform;
+    System system;
+    system.addParticle(2.0);
+    system.addParticle(2.0);
+    VerletIntegrator integrator(0.01);
+    HarmonicBondForce* forceField = new HarmonicBondForce();
+    forceField->addBond(0, 1, 1.5, 1);
+    system.addForce(forceField);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(2);
+    positions[0] = Vec3(-1, 0, 0);
+    positions[1] = Vec3(1, 0, 0);
+    context.setPositions(positions);
+    // This is simply a harmonic oscillator, so compare it to the analytical solution.
+    const double freq = 1.0;;
+    State state = context.getState(State::Energy);
+    const double initialEnergy = state.getKineticEnergy()+state.getPotentialEnergy();
+    for (int i = 0; i < 1000; ++i) {
+        state = context.getState(State::Positions | State::Velocities | State::Energy);
+        double time = state.getTime();
+        double expectedDist = 1.5+0.5*std::cos(freq*time);
+        ASSERT_EQUAL_VEC(Vec3(-0.5*expectedDist, 0, 0), state.getPositions()[0], 0.02);
+        ASSERT_EQUAL_VEC(Vec3(0.5*expectedDist, 0, 0), state.getPositions()[1], 0.02);
+        double expectedSpeed = -0.5*freq*std::sin(freq*time);
+        ASSERT_EQUAL_VEC(Vec3(-0.5*expectedSpeed, 0, 0), state.getVelocities()[0], 0.02);
+        ASSERT_EQUAL_VEC(Vec3(0.5*expectedSpeed, 0, 0), state.getVelocities()[1], 0.02);
+        double energy = state.getKineticEnergy()+state.getPotentialEnergy();
+        ASSERT_EQUAL_TOL(initialEnergy, energy, 0.01);
+        integrator.step(1);
+    }
+    ASSERT_EQUAL_TOL(10.0, context.getState(0).getTime(), 1e-5);
+}
+void testConstraints() {
+    const int numParticles = 8;
+    const int numConstraints = 5;
+    const double temp = 100.0;
+    CudaPlatform platform;
+    System system;
+    VerletIntegrator integrator(0.001);
+    integrator.setConstraintTolerance(1e-5);
+    NonbondedForce* forceField = new NonbondedForce();
+    for (int i = 0; i < numParticles; ++i) {
+        system.addParticle(10.0);
+        forceField->addParticle((i%2 == 0 ? 0.2 : -0.2), 0.5, 5.0);
+    }
+    system.addConstraint(0, 1, 1.0);
+    system.addConstraint(1, 2, 1.0);
+    system.addConstraint(2, 3, 1.0);
+    system.addConstraint(4, 5, 1.0);
+    system.addConstraint(6, 7, 1.0);
+    system.addForce(forceField);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(numParticles);
+    vector<Vec3> velocities(numParticles);
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+    for (int i = 0; i < numParticles; ++i) {
+        positions[i] = Vec3(i/2, (i+1)/2, 0);
+        velocities[i] = Vec3(genrand_real2(sfmt)-0.5, genrand_real2(sfmt)-0.5, genrand_real2(sfmt)-0.5);
+    }
+    context.setPositions(positions);
+    context.setVelocities(velocities);
+    // Simulate it and see whether the constraints remain satisfied.
+    double initialEnergy = 0.0;
+    for (int i = 0; i < 1000; ++i) {
+        State state = context.getState(State::Positions | State::Energy | State::Velocities | State::Forces);
+        for (int j = 0; j < numConstraints; ++j) {
+            int particle1, particle2;
+            double distance;
+            system.getConstraintParameters(j, particle1, particle2, distance);
+            Vec3 p1 = state.getPositions()[particle1];
+            Vec3 p2 = state.getPositions()[particle2];
+            double dist = std::sqrt((p1[0]-p2[0])*(p1[0]-p2[0])+(p1[1]-p2[1])*(p1[1]-p2[1])+(p1[2]-p2[2])*(p1[2]-p2[2]));
+            ASSERT_EQUAL_TOL(distance, dist, 1e-4);
+        }
+        double energy = computeEnergy(state, system, integrator.getStepSize());
+        if (i == 1)
+            initialEnergy = energy;
+        else if (i > 1)
+            ASSERT_EQUAL_TOL(initialEnergy, energy, 0.01);
+        integrator.step(1);
+    }
+}
+void testConstrainedClusters() {
+    const int numParticles = 7;
+    const double temp = 500.0;
+    CudaPlatform platform;
+    System system;
+    VerletIntegrator integrator(0.001);
+    integrator.setConstraintTolerance(1e-5);
+    NonbondedForce* forceField = new NonbondedForce();
+    for (int i = 0; i < numParticles; ++i) {
+        system.addParticle(i > 1 ? 1.0 : 10.0);
+        forceField->addParticle((i%2 == 0 ? 0.2 : -0.2), 0.5, 5.0);
+    }
+    system.addConstraint(0, 1, 1.0);
+    system.addConstraint(0, 2, 1.0);
+    system.addConstraint(0, 3, 1.0);
+    system.addConstraint(0, 4, 1.0);
+    system.addConstraint(1, 5, 1.0);
+    system.addConstraint(1, 6, 1.0);
+    system.addConstraint(2, 3, sqrt(2.0));
+    system.addConstraint(2, 4, sqrt(2.0));
+    system.addConstraint(3, 4, sqrt(2.0));
+    system.addConstraint(5, 6, sqrt(2.0));
+    system.addForce(forceField);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(numParticles);
+    positions[0] = Vec3(0, 0, 0);
+    positions[1] = Vec3(1, 0, 0);
+    positions[2] = Vec3(-1, 0, 0);
+    positions[3] = Vec3(0, 1, 0);
+    positions[4] = Vec3(0, 0, 1);
+    positions[5] = Vec3(2, 0, 0);
+    positions[6] = Vec3(1, 1, 0);
+    vector<Vec3> velocities(numParticles);
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+    for (int i = 0; i < numParticles; ++i)
+        velocities[i] = Vec3(genrand_real2(sfmt)-0.5, genrand_real2(sfmt)-0.5, genrand_real2(sfmt)-0.5);
+    context.setPositions(positions);
+    context.setVelocities(velocities);
+    // Simulate it and see whether the constraints remain satisfied.
+    double initialEnergy = 0.0;
+    for (int i = 0; i < 1000; ++i) {
+        State state = context.getState(State::Positions | State::Energy | State::Velocities | State::Forces);
+        for (int j = 0; j < system.getNumConstraints(); ++j) {
+            int particle1, particle2;
+            double distance;
+            system.getConstraintParameters(j, particle1, particle2, distance);
+            Vec3 p1 = state.getPositions()[particle1];
+            Vec3 p2 = state.getPositions()[particle2];
+            double dist = std::sqrt((p1[0]-p2[0])*(p1[0]-p2[0])+(p1[1]-p2[1])*(p1[1]-p2[1])+(p1[2]-p2[2])*(p1[2]-p2[2]));
+            ASSERT_EQUAL_TOL(distance, dist, 2e-5);
+        }
+        double energy = computeEnergy(state, system, integrator.getStepSize());
+        if (i == 1)
+            initialEnergy = energy;
+        else if (i > 1)
+            ASSERT_EQUAL_TOL(initialEnergy, energy, 0.01);
+        integrator.step(1);
+    }
+}
+int main() {
+    try {
+        testSingleBond();
+//        testConstraints();
+//        testConstrainedClusters();
+    }
+    catch(const exception& e) {
+        cout << "exception: " << e.what() << endl;
+        return 1;
+    }
+    cout << "Done" << endl;
+    return 0;
+}