Continuing to implement new CUDA platform

1f0ec7b5 · Peter Eastman · 99cebd08 · 1f0ec7b5 · 1f0ec7b5 · 1f0ec7b5
Commit 1f0ec7b5 authored Jun 07, 2012 by Peter Eastman
10 changed files
--- a/platforms/cuda2/src/CudaBondedUtilities.cpp
+++ b/platforms/cuda2/src/CudaBondedUtilities.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2011-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "CudaBondedUtilities.h"
+#include "CudaExpressionUtilities.h"
+#include "openmm/OpenMMException.h"
+#include "CudaNonbondedUtilities.h"
+#include <iostream>
+
+using namespace OpenMM;
+using namespace std;
+
+CudaBondedUtilities::CudaBondedUtilities(CudaContext& context) : context(context), numForceBuffers(0), maxBonds(0), hasInitializedKernels(false) {
+}
+
+CudaBondedUtilities::~CudaBondedUtilities() {
+    for (int i = 0; i < (int) atomIndices.size(); i++)
+        for (int j = 0; j < (int) atomIndices[i].size(); j++)
+            delete atomIndices[i][j];
+}
+
+void CudaBondedUtilities::addInteraction(const vector<vector<int> >& atoms, const string& source, int group) {
+    if (atoms.size() > 0) {
+        forceAtoms.push_back(atoms);
+        forceSource.push_back(source);
+        forceGroup.push_back(group);
+    }
+}
+
+std::string CudaBondedUtilities::addArgument(CUdeviceptr data, const string& type) {
+    arguments.push_back(data);
+    argTypes.push_back(type);
+    return "customArg"+context.intToString(arguments.size());
+}
+
+void CudaBondedUtilities::addPrefixCode(const string& source) {
+    prefixCode.push_back(source);
+}
+
+void CudaBondedUtilities::initialize(const System& system) {
+    int numForces = forceAtoms.size();
+    if (numForces == 0)
+        return;
+    
+    // Build the lists of atom indices.
+    
+    atomIndices.resize(numForces);
+    for (int i = 0; i < numForces; i++) {
+        int numBonds = forceAtoms[i].size();
+        int numAtoms = forceAtoms[i][0].size();
+        int startAtom = 0;
+        while (startAtom < numAtoms) {
+            int width = max(numAtoms-startAtom, 4);
+            if (width == 3)
+                width = 4;
+            vector<unsigned int> indexVec(width*numBonds);
+            for (int bond = 0; bond < numBonds; bond++) {
+                for (int atom = 0; atom < width; atom++)
+                    indexVec[bond*width+atom] = forceAtoms[i][bond][startAtom+atom];
+            }
+            CudaArray* indices = CudaArray::create<unsigned int>(indexVec.size(), "bondedIndices");
+            indices->upload(indexVec);
+            atomIndices[i].push_back(indices);
+            startAtom += width;
+        }
+    }
+
+    // Create the kernel.
+
+    stringstream s;
+    for (int i = 0; i < (int) prefixCode.size(); i++)
+        s<<prefixCode[i];
+    s<<"extern \"C\" __global__ void computeBondedForces(long* __restrict__ forceBuffer, real* __restrict__ energyBuffer, const real4* __restrict__ posq, int groups";
+    for (int force = 0; force < numForces; force++) {
+        for (int i = 0; i < (int) atomIndices[force].size(); i++) {
+            int indexWidth = atomIndices[force][i]->getElementSize()/4;
+            string indexType = "unsigned int"+(indexWidth == 1 ? "" : context.intToString(indexWidth));
+            s<<", const "<<indexType<<"* __restrict__ atomIndices"<<force<<"_"<<i;
+        }
+    }
+    for (int i = 0; i < (int) arguments.size(); i++)
+        s<<", "<<argTypes[i]<<"* customArg"<<(i+1);
+    s<<") {\n";
+    s<<"real energy = 0;\n";
+    for (int force = 0; force < numForces; force++)
+        s<<createForceSource(force, forceAtoms[force].size(), forceAtoms[force][0].size(), forceGroup[force], forceSource[force]);
+    s<<"energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;\n";
+    s<<"}\n";
+    map<string, string> defines;
+    defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
+    CUmodule module = context.createModule(s.str(), defines);
+    kernel = context.getKernel(module, "computeBondedForces");
+    forceAtoms.clear();
+    forceSource.clear();
+}
+
+string CudaBondedUtilities::createForceSource(int forceIndex, int numBonds, int numAtoms, int group, const string& computeForce) {
+    maxBonds = max(maxBonds, numBonds);
+    string suffix1[] = {""};
+    string suffix4[] = {".x", ".y", ".z", ".w"};
+    string* suffix;
+    stringstream s;
+    s<<"if ((groups&"<<(1<<group)<<") != 0)\n";
+    s<<"for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < "<<numBonds<<"; index += blockDim.x*gridDim.x) {\n";
+    int startAtom = 0;
+    for (int i = 0; i < (int) atomIndices[forceIndex].size(); i++) {
+        int indexWidth = atomIndices[forceIndex][i]->getElementSize()/4;
+        suffix = (indexWidth == 1 ? suffix1 : suffix4);
+        string indexType = "unsigned int"+(indexWidth == 1 ? "" : context.intToString(indexWidth));
+        s<<"    "<<indexType<<" atoms"<<i<<" = atomIndices"<<forceIndex<<"_"<<i<<"[index];\n";
+        s<<"    "<<indexType<<" buffers = bufferIndices"<<forceIndex<<"[index];\n";
+        for (int j = 0; j < indexWidth; j++) {
+            s<<"    unsigned int atom"<<(startAtom+j+1)<<" = atoms"<<i<<suffix[j]<<";\n";
+            s<<"    real4 pos"<<(j+1)<<" = posq[atom"<<(j+1)<<"];\n";
+        }
+        startAtom += indexWidth;
+    }
+    s<<computeForce<<"\n";
+    for (int i = 0; i < numAtoms; i++) {
+        s<<"    atomicAdd(&forceBuffer[atom"<<(i+1)<<"], (long) (force.x*0xFFFFFFFF));\n";
+        s<<"    atomicAdd(&forceBuffer[atom"<<(i+1)<<"+PADDED_NUM_ATOMS], (long) (force.x*0xFFFFFFFF));\n";
+        s<<"    atomicAdd(&forceBuffer[atom"<<(i+1)<<"+PADDED_NUM_ATOMS*2], (long) (force.x*0xFFFFFFFF));\n";
+    }
+    s<<"}\n";
+    return s.str();
+}
+
+void CudaBondedUtilities::computeInteractions(int groups) {
+//    if (!hasInitializedKernels) {
+//        hasInitializedKernels = true;
+//        for (int i = 0; i < (int) forceSets.size(); i++) {
+//            int index = 0;
+//            cl::Kernel& kernel = kernels[i];
+//            kernel.setArg<cl::Buffer>(index++, context.getForceBuffers().getDeviceBuffer());
+//            kernel.setArg<cl::Buffer>(index++, context.getEnergyBuffer().getDeviceBuffer());
+//            kernel.setArg<cl::Buffer>(index++, context.getPosq().getDeviceBuffer());
+//            index++;
+//            for (int j = 0; j < (int) forceSets[i].size(); j++) {
+//                kernel.setArg<cl::Buffer>(index++, atomIndices[forceSets[i][j]]->getDeviceBuffer());
+//                kernel.setArg<cl::Buffer>(index++, bufferIndices[forceSets[i][j]]->getDeviceBuffer());
+//            }
+//            for (int j = 0; j < (int) arguments.size(); j++)
+//                kernel.setArg<cl::Memory>(index++, *arguments[j]);
+//        }
+//    }
+//    for (int i = 0; i < (int) kernels.size(); i++) {
+//        kernels[i].setArg<cl_int>(3, groups);
+//        context.executeKernel(kernels[i], maxBonds);
+//    }
+}
--- a/platforms/cuda2/src/CudaBondedUtilities.h
+++ b/platforms/cuda2/src/CudaBondedUtilities.h
+#ifndef OPENMM_CUDABONDEDUTILITIES_H_
+#define OPENMM_CUDABONDEDUTILITIES_H_
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2011-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "CudaArray.h"
+#include "CudaContext.h"
+#include "openmm/System.h"
+#include <string>
+#include <vector>
+
+namespace OpenMM {
+
+/**
+ * This class provides a generic mechanism for evaluating bonded interactions.  You write only
+ * the source code needed to compute one interaction, and this class takes care of creating
+ * and executing a complete kernel that loops over bonds, evaluates each one, and accumulates
+ * the resulting forces and energies.  This offers two advantages.  First, it simplifies the
+ * task of writing a new Force.  Second, it allows multiple forces to be evaluated by a single
+ * kernel, which reduces overhead and improves performance.
+ * 
+ * A "bonded interaction" means an interaction that affects a small, fixed set of particles.
+ * The interaction energy may depend on the positions of only those particles, and the list of
+ * particles forming a "bond" may not change with time.  Examples of bonded interactions
+ * include HarmonicBondForce, HarmonicAngleForce, and PeriodicTorsionForce.
+ * 
+ * To create a bonded interaction, call addInteraction().  You pass to it a block of source
+ * code for evaluating the interaction.  The inputs and outputs for that source code are as
+ * follows:
+ * 
+ * <ol>
+ * <li>The index of the bond being evaluated will have been stored in the unsigned int variable "index".</li>
+ * <li>The indices of the atoms forming that bond will have been stored in the unsigned int variables "atom1",
+ * "atom2", ....</li>
+ * <li>The positions of those atoms will have been stored in the real4 variables "pos1", "pos2", ....</li>
+ * <li>A real variable called "energy" will exist.  Your code should add the potential energy of the
+ * bond to that variable.</li>
+ * <li>Your code should define real4 variables called "force1", "force2", ... that contain the force to
+ * apply to each atom.</li>
+ * </ol>
+ * 
+ * As a simple example, the following source code would be used to implement a pairwise interaction of
+ * the form E=r^2:
+ * 
+ * <tt><pre>
+ * real4 delta = pos2-pos1;
+ * energy += delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+ * real4 force1 = 2.0f*delta;
+ * real4 force2 = -2.0f*delta;
+ * </pre></tt>
+ * 
+ * Interactions will often depend on parameters or other data.  Call addArgument() to provide the data
+ * to this class.  It will be passed to the interaction kernel as an argument, and you can refer to it
+ * from your interaction code.
+ */
+
+class OPENMM_EXPORT CudaBondedUtilities {
+public:
+    CudaBondedUtilities(CudaContext& context);
+    ~CudaBondedUtilities();
+    /**
+     * Add a bonded interaction.
+     *
+     * @param atoms    this should have one entry for each bond, and that entry should contain the list
+     *                 of atoms involved in the bond.  Every entry must have the same number of atoms.
+     * @param source   the code to evaluate the interaction
+     * @param group    the force group in which the interaction should be calculated
+     */
+    void addInteraction(const std::vector<std::vector<int> >& atoms, const std::string& source, int group);
+    /**
+     * Add an argument that should be passed to the interaction kernel.
+     * 
+     * @param data    the device memory containing the data to pass
+     * @param type    the data type contained in the memory (e.g. "float4")
+     * @return the name that will be used for the argument.  Any code you pass to addInteraction() should
+     * refer to it by this name.
+     */
+    std::string addArgument(CUdeviceptr data, const std::string& type);
+    /**
+     * Add some Cuda code that should be included in the program, before the start of the kernel.
+     * This can be used, for example, to define functions that will be called by the kernel.
+     * 
+     * @param source   the code to include
+     */
+    void addPrefixCode(const std::string& source);
+    /**
+     * Initialize this object in preparation for a simulation.
+     */
+    void initialize(const System& system);
+    /**
+     * Compute the bonded interactions.
+     * 
+     * @param groups        a set of bit flags for which force groups to include
+     */
+    void computeInteractions(int groups);
+private:
+    std::string createForceSource(int forceIndex, int numBonds, int numAtoms, int group, const std::string& computeForce);
+    CudaContext& context;
+    CUfunction kernel;
+    std::vector<std::vector<std::vector<int> > > forceAtoms;
+    std::vector<std::vector<int> > indexWidth;
+    std::vector<std::string> forceSource;
+    std::vector<int> forceGroup;
+    std::vector<CUdeviceptr> arguments;
+    std::vector<std::string> argTypes;
+    std::vector<std::vector<CudaArray*> > atomIndices;
+    std::vector<std::string> prefixCode;
+    int numForceBuffers, maxBonds;
+    bool hasInitializedKernels;
+};
+
+} // namespace OpenMM
+
+#endif /*OPENMM_CUDABONDEDUTILITIES_H_*/
--- a/platforms/cuda2/src/CudaContext.cpp
+++ b/platforms/cuda2/src/CudaContext.cpp
@@ -32,7 +32,7 @@
 #include "CudaArray.h"
 //#include "CudaBondedUtilities.h"
 #include "CudaForceInfo.h"
-//#include "CudaIntegrationUtilities.h"
+#include "CudaIntegrationUtilities.h"
 #include "CudaKernelSources.h"
 //#include "CudaNonbondedUtilities.h"
 #include "hilbert.h"
@@ -40,6 +40,7 @@
 #include "openmm/Platform.h"
 #include "openmm/System.h"
 #include "openmm/VirtualSite.h"
+#include "CudaExpressionUtilities.h"
 #include <algorithm>
 #include <cstdlib>
 #include <fstream>
@@ -66,8 +67,8 @@ bool CudaContext::hasInitializedCuda = false;
 CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler,
        const string& tempDir, CudaPlatform::PlatformData& platformData) : system(system), compiler(compiler),
        time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), contextIsValid(false), atomsWereReordered(false), pinnedBuffer(NULL), posq(NULL),
-        velm(NULL), /*forceBuffers(NULL), longForceBuffer(NULL), energyBuffer(NULL), atomIndex(NULL), integration(NULL),
-        bonded(NULL), nonbonded(NULL),*/ thread(NULL) {
+        velm(NULL), /*forceBuffers(NULL), longForceBuffer(NULL), energyBuffer(NULL), atomIndex(NULL),*/ integration(NULL), expression(NULL),
+        /*bonded(NULL), nonbonded(NULL),*/ thread(NULL) {
    if (!hasInitializedCuda) {
        CHECK_RESULT2(cuInit(0), "Error initializing CUDA");
        hasInitializedCuda = true;
@@ -143,11 +144,17 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, paddedNumAtoms*sizeof(double4), 0));
        posq = CudaArray::create<double4>(paddedNumAtoms, "posq");
        velm = CudaArray::create<double4>(paddedNumAtoms, "velm");
+        compilationDefines["make_real2"] = "make_double2";
+        compilationDefines["make_real3"] = "make_double3";
+        compilationDefines["make_real4"] = "make_double4";
    }
    else {
        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, paddedNumAtoms*sizeof(float4), 0));
        posq = CudaArray::create<float4>(paddedNumAtoms, "posq");
        velm = CudaArray::create<float4>(paddedNumAtoms, "velm");
+        compilationDefines["make_real2"] = "make_float2";
+        compilationDefines["make_real3"] = "make_float3";
+        compilationDefines["make_real4"] = "make_float4";
    }
    posCellOffsets.resize(paddedNumAtoms, make_int4(0, 0, 0, 0));

@@ -160,8 +167,6 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    clearFourBuffersKernel = getKernel(utilities, "clearFourBuffers");
    clearFiveBuffersKernel = getKernel(utilities, "clearFiveBuffers");
    clearSixBuffersKernel = getKernel(utilities, "clearSixBuffers");
-    reduceFloat4Kernel = getKernel(utilities, "reduceFloat4Buffer");
-    reduceForcesKernel = getKernel(utilities, "reduceForces");

    // Set defines based on the requested precision.

@@ -170,14 +175,21 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    compilationDefines["RECIP"] = useDoublePrecision ? "1.0/" : "1.0f/";
    compilationDefines["EXP"] = useDoublePrecision ? "exp" : "expf";
    compilationDefines["LOG"] = useDoublePrecision ? "log" : "logf";
+    compilationDefines["COS"] = useDoublePrecision ? "cos" : "cosf";
+    compilationDefines["SIN"] = useDoublePrecision ? "sin" : "sinf";
+    compilationDefines["TAN"] = useDoublePrecision ? "tan" : "tanf";
+    compilationDefines["ACOS"] = useDoublePrecision ? "acos" : "acosf";
+    compilationDefines["ASIN"] = useDoublePrecision ? "asin" : "asinf";
+    compilationDefines["ATAN"] = useDoublePrecision ? "atan" : "atanf";
    
    // Create the work thread used for parallelization when running on multiple devices.
    
    thread = new WorkThread();
-//    
-//    // Create the integration utilities object.
-//    
-//    integration = new CudaIntegrationUtilities(*this, system);
+    
+    // Create utilities objects.
+    
+    integration = new CudaIntegrationUtilities(*this, system);
+    expression = new CudaExpressionUtilities(*this);
 }

 CudaContext::~CudaContext() {
@@ -201,8 +213,10 @@ CudaContext::~CudaContext() {
 //        delete energyBuffer;
 //    if (atomIndex != NULL)
 //        delete atomIndex;
-//    if (integration != NULL)
-//        delete integration;
+    if (integration != NULL)
+        delete integration;
+    if (expression != NULL)
+        delete expression;
 //    if (bonded != NULL)
 //        delete bonded;
 //    if (nonbonded != NULL)
@@ -272,6 +286,18 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
    }
    if (!compilationDefines.empty())
        src << endl;
+    if (useDoublePrecision) {
+        src << "typedef double real;\n";
+        src << "typedef double2 real2;\n";
+        src << "typedef double3 real3;\n";
+        src << "typedef double4 real4;\n";
+    }
+    else {
+        src << "typedef float real;\n";
+        src << "typedef float2 real2;\n";
+        src << "typedef float3 real3;\n";
+        src << "typedef float4 real4;\n";
+    }
    for (map<string, string>::const_iterator iter = defines.begin(); iter != defines.end(); ++iter) {
        src << "#define " << iter->first;
        if (!iter->second.empty())
@@ -498,22 +524,7 @@ void CudaContext::addAutoclearBuffer(CUdeviceptr memory, int size) {
 //        clearBuffer(*autoclearBuffers[base], autoclearBufferSizes[base]);
 //    }
 //}
-//
-//void CudaContext::reduceForces() {
-//    if (supports64BitGlobalAtomics)
-//        executeKernel(reduceForcesKernel, paddedNumAtoms, 128);
-//    else
-//        reduceBuffer(*forceBuffers, numForceBuffers);
-//}
-//
-//void CudaContext::reduceBuffer(CudaArray<mm_float4>& array, int numBuffers) {
-//    int bufferSize = array.getSize()/numBuffers;
-//    reduceFloat4Kernel.setArg<cl::Buffer>(0, array.getDeviceBuffer());
-//    reduceFloat4Kernel.setArg<cl_int>(1, bufferSize);
-//    reduceFloat4Kernel.setArg<cl_int>(2, numBuffers);
-//    executeKernel(reduceFloat4Kernel, bufferSize, 128);
-//}
-//
+
 void CudaContext::tagAtomsInMolecule(int atom, int molecule, vector<int>& atomMolecule, vector<vector<int> >& atomBonds) {
    // Recursively tag atoms as belonging to a particular molecule.


--- a/platforms/cuda2/src/CudaContext.h
+++ b/platforms/cuda2/src/CudaContext.h
@@ -46,6 +46,7 @@ namespace OpenMM {

 class CudaArray;
 class CudaForceInfo;
+class CudaExpressionUtilities;
 class CudaIntegrationUtilities;
 class CudaBondedUtilities;
 class CudaNonbondedUtilities;
@@ -216,25 +217,13 @@ public:
     * Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
     *
     * @param memory     the memory to clear
-     * @param size       the number of float/double elements in the buffer
+     * @param size       the number of 4-byte elements in the buffer
     */
    void addAutoclearBuffer(CUdeviceptr memory, int size);
 //    /**
 //     * Clear all buffers that have been registered with addAutoclearBuffer().
 //     */
 //    void clearAutoclearBuffers();
-//    /**
-//     * Given a collection of buffers packed into an array, sum them and store
-//     * the sum in the first buffer.
-//     *
-//     * @param array       the array containing the buffers to reduce
-//     * @param numBuffers  the number of buffers packed into the array
-//     */
-//    void reduceBuffer(CudaArray<mm_float4>& array, int numBuffers);
-//    /**
-//     * Sum the buffesr containing forces.
-//     */
-//    void reduceForces();
    /**
     * Get the current simulation time.
     */
@@ -341,12 +330,18 @@ public:
 //    float4 getInvPeriodicBoxSize() const {
 //        return invPeriodicBoxSize;
 //    }
-//    /**
-//     * Get the CudaIntegrationUtilities for this context.
-//     */
-//    CudaIntegrationUtilities& getIntegrationUtilities() {
-//        return *integration;
-//    }
+    /**
+     * Get the CudaIntegrationUtilities for this context.
+     */
+    CudaIntegrationUtilities& getIntegrationUtilities() {
+        return *integration;
+    }
+    /**
+     * Get the CudaExpressionUtilities for this context.
+     */
+    CudaExpressionUtilities& getExpressionUtilities() {
+        return *expression;
+    }
 //    /**
 //     * Get the CudaBondedUtilities for this context.
 //     */
@@ -445,8 +440,6 @@ private:
    CUfunction clearFourBuffersKernel;
    CUfunction clearFiveBuffersKernel;
    CUfunction clearSixBuffersKernel;
-    CUfunction reduceFloat4Kernel;
-    CUfunction reduceForcesKernel;
    std::vector<CudaForceInfo*> forces;
    std::vector<Molecule> molecules;
    std::vector<MoleculeGroup> moleculeGroups;
@@ -461,7 +454,8 @@ private:
    std::vector<CUdeviceptr> autoclearBuffers;
    std::vector<int> autoclearBufferSizes;
    std::vector<ReorderListener*> reorderListeners;
-//    CudaIntegrationUtilities* integration;
+    CudaIntegrationUtilities* integration;
+    CudaExpressionUtilities* expression;
 //    CudaBondedUtilities* bonded;
 //    CudaNonbondedUtilities* nonbonded;
    WorkThread* thread;

--- a/platforms/cuda2/src/CudaExpressionUtilities.cpp
+++ b/platforms/cuda2/src/CudaExpressionUtilities.cpp
@@ -33,19 +33,6 @@ using namespace OpenMM;
 using namespace Lepton;
 using namespace std;

-string CudaExpressionUtilities::doubleToString(double value) {
-    stringstream s;
-    s.precision(8);
-    s << scientific << value << "f";
-    return s.str();
-}
-
-string CudaExpressionUtilities::intToString(int value) {
-    stringstream s;
-    s << value;
-    return s.str();
-}
-
 string CudaExpressionUtilities::createExpressions(const map<string, ParsedExpression>& expressions, const map<string, string>& variables,
        const vector<pair<string, string> >& functions, const string& prefix, const string& functionParams, const string& tempType) {
    vector<pair<ExpressionTreeNode, string> > variableNodes;
@@ -75,13 +62,13 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
            return;
    for (int i = 0; i < (int) node.getChildren().size(); i++)
        processExpression(out, node.getChildren()[i], temps, functions, prefix, functionParams, allExpressions, tempType);
-    string name = prefix+intToString(temps.size());
+    string name = prefix+context.intToString(temps.size());
    bool hasRecordedNode = false;
    
    out << tempType << " " << name << " = ";
    switch (node.getOperation().getId()) {
        case Operation::CONSTANT:
-            out << doubleToString(dynamic_cast<const Operation::Constant*>(&node.getOperation())->getValue());
+            out << context.doubleToString(dynamic_cast<const Operation::Constant*>(&node.getOperation())->getValue());
            break;
        case Operation::VARIABLE:
            throw OpenMMException("Unknown variable in expression: "+node.getOperation().getName());
@@ -107,7 +94,7 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
            string valueName = name;
            string derivName = name;
            if (valueNode != NULL && derivNode != NULL) {
-                string name2 = prefix+intToString(temps.size());
+                string name2 = prefix+context.intToString(temps.size());
                out << tempType << " " << name2 << " = 0.0f;\n";
                if (isDeriv) {
                    valueName = name2;
@@ -120,14 +107,14 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
            }
            out << "{\n";
            out << "float4 params = " << functionParams << "[" << i << "];\n";
-            out << "float x = " << getTempName(node.getChildren()[0], temps) << ";\n";
+            out << "real x = " << getTempName(node.getChildren()[0], temps) << ";\n";
            out << "if (x >= params.x && x <= params.y) {\n";
            out << "x = (x-params.x)*params.z;\n";
            out << "int index = (int) (floor(x));\n";
            out << "index = min(index, (int) params.w);\n";
            out << "float4 coeff = " << functions[i].second << "[index];\n";
-            out << "float b = x-index;\n";
-            out << "float a = 1.0f-b;\n";
+            out << "real b = x-index;\n";
+            out << "real a = 1.0f-b;\n";
            if (valueNode != NULL)
                out << valueName << " = a*coeff.x+b*coeff.y+((a*a*a-a)*coeff.z+(b*b*b-b)*coeff.w)/(params.z*params.z);\n";
            if (derivNode != NULL)
@@ -164,7 +151,7 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
            out << "-" << getTempName(node.getChildren()[0], temps);
            break;
        case Operation::SQRT:
-            out << "sqrt(" << getTempName(node.getChildren()[0], temps) << ")";
+            out << "SQRT(" << getTempName(node.getChildren()[0], temps) << ")";
            break;
        case Operation::EXP:
            out << "EXP(" << getTempName(node.getChildren()[0], temps) << ")";
@@ -173,31 +160,31 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
            out << "LOG(" << getTempName(node.getChildren()[0], temps) << ")";
            break;
        case Operation::SIN:
-            out << "sin(" << getTempName(node.getChildren()[0], temps) << ")";
+            out << "SIN(" << getTempName(node.getChildren()[0], temps) << ")";
            break;
        case Operation::COS:
-            out << "cos(" << getTempName(node.getChildren()[0], temps) << ")";
+            out << "COS(" << getTempName(node.getChildren()[0], temps) << ")";
            break;
        case Operation::SEC:
-            out << "1.0f/cos(" << getTempName(node.getChildren()[0], temps) << ")";
+            out << "RECIP(COS(" << getTempName(node.getChildren()[0], temps) << "))";
            break;
        case Operation::CSC:
-            out << "1.0f/sin(" << getTempName(node.getChildren()[0], temps) << ")";
+            out << "RECIP(SIN(" << getTempName(node.getChildren()[0], temps) << "))";
            break;
        case Operation::TAN:
-            out << "tan(" << getTempName(node.getChildren()[0], temps) << ")";
+            out << "TAN(" << getTempName(node.getChildren()[0], temps) << ")";
            break;
        case Operation::COT:
-            out << "1.0f/tan(" << getTempName(node.getChildren()[0], temps) << ")";
+            out << "RECIP(TAN(" << getTempName(node.getChildren()[0], temps) << "))";
            break;
        case Operation::ASIN:
-            out << "asin(" << getTempName(node.getChildren()[0], temps) << ")";
+            out << "ASIN(" << getTempName(node.getChildren()[0], temps) << ")";
            break;
        case Operation::ACOS:
-            out << "acos(" << getTempName(node.getChildren()[0], temps) << ")";
+            out << "ACSO(" << getTempName(node.getChildren()[0], temps) << ")";
            break;
        case Operation::ATAN:
-            out << "atan(" << getTempName(node.getChildren()[0], temps) << ")";
+            out << "ATAN(" << getTempName(node.getChildren()[0], temps) << ")";
            break;
        case Operation::SINH:
            out << "sinh(" << getTempName(node.getChildren()[0], temps) << ")";
@@ -236,10 +223,10 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
            out << "RECIP(" << getTempName(node.getChildren()[0], temps) << ")";
            break;
        case Operation::ADD_CONSTANT:
-            out << doubleToString(dynamic_cast<const Operation::AddConstant*>(&node.getOperation())->getValue()) << "+" << getTempName(node.getChildren()[0], temps);
+            out << context.doubleToString(dynamic_cast<const Operation::AddConstant*>(&node.getOperation())->getValue()) << "+" << getTempName(node.getChildren()[0], temps);
            break;
        case Operation::MULTIPLY_CONSTANT:
-            out << doubleToString(dynamic_cast<const Operation::MultiplyConstant*>(&node.getOperation())->getValue()) << "*" << getTempName(node.getChildren()[0], temps);
+            out << context.doubleToString(dynamic_cast<const Operation::MultiplyConstant*>(&node.getOperation())->getValue()) << "*" << getTempName(node.getChildren()[0], temps);
            break;
        case Operation::POWER_CONSTANT:
        {
@@ -266,14 +253,14 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
                for (map<int, const ExpressionTreeNode*>::const_iterator iter = powers.begin(); iter != powers.end(); ++iter) {
                    if (iter->first != exponent) {
                        exponents.push_back(iter->first >= 0 ? iter->first : -iter->first);
-                        string name2 = prefix+intToString(temps.size());
+                        string name2 = prefix+context.intToString(temps.size());
                        names.push_back(name2);
                        temps.push_back(make_pair(*iter->second, name2));
                        out << tempType << " " << name2 << " = 0.0f;\n";
                    }
                }
                out << "{\n";
-                out << "float multiplier = " << (exponent < 0.0 ? "1.0f/" : "") << getTempName(node.getChildren()[0], temps) << ";\n";
+                out << "real multiplier = " << (exponent < 0.0 ? "RECIP(" : "(") << getTempName(node.getChildren()[0], temps) << ");\n";
                bool done = false;
                while (!done) {
                    done = true;
@@ -295,7 +282,7 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
                out << "}";
            }
            else
-                out << "pow(" << getTempName(node.getChildren()[0], temps) << ", " << doubleToString(exponent) << ")";
+                out << "pow(" << getTempName(node.getChildren()[0], temps) << ", " << context.doubleToString(exponent) << ")";
            break;
        }
        case Operation::MIN:

--- a/platforms/cuda2/src/CudaExpressionUtilities.h
+++ b/platforms/cuda2/src/CudaExpressionUtilities.h
@@ -45,6 +45,8 @@ namespace OpenMM {

 class OPENMM_EXPORT CudaExpressionUtilities {
 public:
+    CudaExpressionUtilities(CudaContext& context) : context(context) {
+    }
    /**
     * Generate the source code for calculating a set of expressions.
     *
@@ -54,10 +56,10 @@ public:
     * @param functions      defines the variable name for each tabulated function that may appear in the expressions
     * @param prefix         a prefix to put in front of temporary variables
     * @param functionParams the variable name containing the parameters for each tabulated function
-     * @param tempType       the type of value to use for temporary variables (defaults to "float")
+     * @param tempType       the type of value to use for temporary variables (defaults to "real")
     */
-    static std::string createExpressions(const std::map<std::string, Lepton::ParsedExpression>& expressions, const std::map<std::string, std::string>& variables,
-            const std::vector<std::pair<std::string, std::string> >& functions, const std::string& prefix, const std::string& functionParams, const std::string& tempType="float");
+    std::string createExpressions(const std::map<std::string, Lepton::ParsedExpression>& expressions, const std::map<std::string, std::string>& variables,
+            const std::vector<std::pair<std::string, std::string> >& functions, const std::string& prefix, const std::string& functionParams, const std::string& tempType="real");
    /**
     * Generate the source code for calculating a set of expressions.
     *
@@ -67,10 +69,10 @@ public:
     * @param functions      defines the variable name for each tabulated function that may appear in the expressions
     * @param prefix         a prefix to put in front of temporary variables
     * @param functionParams the variable name containing the parameters for each tabulated function
-     * @param tempType       the type of value to use for temporary variables (defaults to "float")
+     * @param tempType       the type of value to use for temporary variables (defaults to "real")
     */
-    static std::string createExpressions(const std::map<std::string, Lepton::ParsedExpression>& expressions, const std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& variables,
-            const std::vector<std::pair<std::string, std::string> >& functions, const std::string& prefix, const std::string& functionParams, const std::string& tempType="float");
+    std::string createExpressions(const std::map<std::string, Lepton::ParsedExpression>& expressions, const std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& variables,
+            const std::vector<std::pair<std::string, std::string> >& functions, const std::string& prefix, const std::string& functionParams, const std::string& tempType="real");
    /**
     * Calculate the spline coefficients for a tabulated function that appears in expressions.
     *
@@ -79,26 +81,19 @@ public:
     * @param max            the value of the independent variable corresponding to the last element of values
     * @return the spline coefficients
     */
-    static std::vector<float4> computeFunctionCoefficients(const std::vector<double>& values, double min, double max);
-    /**
-     * Convert a number to a string in a format suitable for including in a kernel.
-     */
-    static std::string doubleToString(double value);
-    /**
-     * Convert a number to a string in a format suitable for including in a kernel.
-     */
-    static std::string intToString(int value);
+    std::vector<float4> computeFunctionCoefficients(const std::vector<double>& values, double min, double max);
    class FunctionPlaceholder;
 private:
-    static void processExpression(std::stringstream& out, const Lepton::ExpressionTreeNode& node,
+    void processExpression(std::stringstream& out, const Lepton::ExpressionTreeNode& node,
            std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& temps,
            const std::vector<std::pair<std::string, std::string> >& functions, const std::string& prefix, const std::string& functionParams,
            const std::vector<Lepton::ParsedExpression>& allExpressions, const std::string& tempType);
-    static std::string getTempName(const Lepton::ExpressionTreeNode& node, const std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& temps);
-    static void findRelatedTabulatedFunctions(const Lepton::ExpressionTreeNode& node, const Lepton::ExpressionTreeNode& searchNode,
+    std::string getTempName(const Lepton::ExpressionTreeNode& node, const std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& temps);
+    void findRelatedTabulatedFunctions(const Lepton::ExpressionTreeNode& node, const Lepton::ExpressionTreeNode& searchNode,
            const Lepton::ExpressionTreeNode*& valueNode, const Lepton::ExpressionTreeNode*& derivNode);
-    static void findRelatedPowers(const Lepton::ExpressionTreeNode& node, const Lepton::ExpressionTreeNode& searchNode,
+    void findRelatedPowers(const Lepton::ExpressionTreeNode& node, const Lepton::ExpressionTreeNode& searchNode,
            std::map<int, const Lepton::ExpressionTreeNode*>& powers);
+    CudaContext& context;
 };

 /**

--- a/platforms/cuda2/src/CudaIntegrationUtilities.cpp
+++ b/platforms/cuda2/src/CudaIntegrationUtilities.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "CudaIntegrationUtilities.h"
+#include "CudaArray.h"
+#include "CudaKernelSources.h"
+#include "openmm/HarmonicAngleForce.h"
+#include "openmm/VirtualSite.h"
+#include "quern.h"
+#include "CudaExpressionUtilities.h"
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+#include <map>
+
+using namespace OpenMM;
+using namespace std;
+
+#define CHECK_RESULT(result) CHECK_RESULT2(result, errorMessage);
+#define CHECK_RESULT2(result, prefix) \
+    if (result != CUDA_SUCCESS) { \
+        std::stringstream m; \
+        m<<prefix<<": "<<context.getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
+        throw OpenMMException(m.str());\
+    }
+
+struct CudaIntegrationUtilities::ShakeCluster {
+    int centralID;
+    int peripheralID[3];
+    int size;
+    bool valid;
+    double distance;
+    double centralInvMass, peripheralInvMass;
+    ShakeCluster() : valid(true) {
+    }
+    ShakeCluster(int centralID, double invMass) : centralID(centralID), centralInvMass(invMass), size(0), valid(true) {
+    }
+    void addAtom(int id, double dist, double invMass) {
+        if (size == 3 || (size > 0 && abs(dist-distance)/distance > 1e-8) || (size > 0 && abs(invMass-peripheralInvMass)/peripheralInvMass > 1e-8))
+            valid = false;
+        else {
+            peripheralID[size++] = id;
+            distance = dist;
+            peripheralInvMass = invMass;
+        }
+    }
+    void markInvalid(map<int, ShakeCluster>& allClusters, vector<bool>& invalidForShake)
+    {
+        valid = false;
+        invalidForShake[centralID] = true;
+        for (int i = 0; i < size; i++) {
+            invalidForShake[peripheralID[i]] = true;
+            map<int, ShakeCluster>::iterator otherCluster = allClusters.find(peripheralID[i]);
+            if (otherCluster != allClusters.end() && otherCluster->second.valid)
+                otherCluster->second.markInvalid(allClusters, invalidForShake);
+        }
+    }
+};
+
+struct CudaIntegrationUtilities::ConstraintOrderer : public binary_function<int, int, bool> {
+    const vector<int>& atom1;
+    const vector<int>& atom2;
+    const vector<int>& constraints;
+    ConstraintOrderer(const vector<int>& atom1, const vector<int>& atom2, const vector<int>& constraints) : atom1(atom1), atom2(atom2), constraints(constraints) {
+    }
+    bool operator()(int x, int y) {
+        int ix = constraints[x];
+        int iy = constraints[y];
+        if (atom1[ix] != atom1[iy])
+            return atom1[ix] < atom1[iy];
+        return atom2[ix] < atom2[iy];
+    }
+};
+
+CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const System& system) : context(context),
+        posDelta(NULL), settleAtoms(NULL), settleParams(NULL), shakeAtoms(NULL), shakeParams(NULL),
+        random(NULL), randomSeed(NULL), randomPos(0), stepSize(NULL), ccmaAtoms(NULL), ccmaDistance(NULL),
+        ccmaReducedMass(NULL), ccmaAtomConstraints(NULL), ccmaNumAtomConstraints(NULL), ccmaConstraintMatrixColumn(NULL),
+        ccmaConstraintMatrixValue(NULL), ccmaDelta1(NULL), ccmaDelta2(NULL), ccmaConverged(NULL),
+        ccmaConvergedMemory(NULL), vsite2AvgAtoms(NULL), vsite2AvgWeights(NULL), vsite3AvgAtoms(NULL), vsite3AvgWeights(NULL),
+        vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL), hasInitializedPosConstraintKernels(false), hasInitializedVelConstraintKernels(false) {
+    // Create workspace arrays.
+
+    if (context.getUseDoublePrecision()) {
+        posDelta = CudaArray::create<double4>(context.getPaddedNumAtoms(), "posDelta");
+        vector<double4> deltas(posDelta->getSize(), make_double4(0.0, 0.0, 0.0, 0.0));
+        posDelta->upload(deltas);
+        stepSize = CudaArray::create<double2>(1, "stepSize");
+        vector<double2> step(1, make_double2(0.0f, 0.0f));
+        stepSize->upload(step);
+    }
+    else {
+        posDelta = CudaArray::create<float4>(context.getPaddedNumAtoms(), "posDelta");
+        vector<float4> deltas(posDelta->getSize(), make_float4(0.0, 0.0, 0.0, 0.0));
+        posDelta->upload(deltas);
+        stepSize = CudaArray::create<float2>(1, "stepSize");
+        vector<float2> step(1, make_float2(0.0f, 0.0f));
+        stepSize->upload(step);
+    }
+
+    // Create kernels for enforcing constraints.
+
+    map<string, string> velocityDefines;
+    velocityDefines["CONSTRAIN_VELOCITIES"] = "1";
+//    CUmodule settleModule = context.createModule(CudaKernelSources::settle);
+//    settlePosKernel = context.getKernel(settleModule, "applySettle");
+//    settleVelKernel = context.getKernel(settleModule, "constrainVelocities");
+//    CUmodule shakeModule = context.createModule(CudaKernelSources::shakeHydrogens);
+//    shakePosKernel = context.getKernel(shakeModule, "applyShakeToHydrogens");
+//    shakeModule = context.createModule(CudaKernelSources::shakeHydrogens, velocityDefines);
+//    shakeVelKernel = context.getKernel(shakeModule, "applyShakeToHydrogens");
+
+    // Record the set of constraints and how many constraints each atom is involved in.
+
+    int numConstraints = system.getNumConstraints();
+    vector<int> atom1(numConstraints);
+    vector<int> atom2(numConstraints);
+    vector<double> distance(numConstraints);
+    vector<int> constraintCount(context.getNumAtoms(), 0);
+    for (int i = 0; i < numConstraints; i++) {
+        system.getConstraintParameters(i, atom1[i], atom2[i], distance[i]);
+        constraintCount[atom1[i]]++;
+        constraintCount[atom2[i]]++;
+    }
+
+    // Identify clusters of three atoms that can be treated with SETTLE.  First, for every
+    // atom that might be part of such a cluster, make a list of the two other atoms it is
+    // connected to.
+
+    int numAtoms = system.getNumParticles();
+    vector<map<int, float> > settleConstraints(numAtoms);
+    for (int i = 0; i < (int)atom1.size(); i++) {
+        if (constraintCount[atom1[i]] == 2 && constraintCount[atom2[i]] == 2) {
+            settleConstraints[atom1[i]][atom2[i]] = (float) distance[i];
+            settleConstraints[atom2[i]][atom1[i]] = (float) distance[i];
+        }
+    }
+
+    // Now remove the ones that don't actually form closed loops of three atoms.
+
+    vector<int> settleClusters;
+    for (int i = 0; i < (int)settleConstraints.size(); i++) {
+        if (settleConstraints[i].size() == 2) {
+            int partner1 = settleConstraints[i].begin()->first;
+            int partner2 = (++settleConstraints[i].begin())->first;
+            if (settleConstraints[partner1].size() != 2 || settleConstraints[partner2].size() != 2 ||
+                    settleConstraints[partner1].find(partner2) == settleConstraints[partner1].end())
+                settleConstraints[i].clear();
+            else if (i < partner1 && i < partner2)
+                settleClusters.push_back(i);
+        }
+        else
+            settleConstraints[i].clear();
+    }
+
+    // Record the SETTLE clusters.
+
+    vector<bool> isShakeAtom(numAtoms, false);
+    if (settleClusters.size() > 0) {
+        vector<int4> atoms;
+        vector<float2> params;
+        for (int i = 0; i < (int) settleClusters.size(); i++) {
+            int atom1 = settleClusters[i];
+            int atom2 = settleConstraints[atom1].begin()->first;
+            int atom3 = (++settleConstraints[atom1].begin())->first;
+            float dist12 = settleConstraints[atom1].find(atom2)->second;
+            float dist13 = settleConstraints[atom1].find(atom3)->second;
+            float dist23 = settleConstraints[atom2].find(atom3)->second;
+            if (dist12 == dist13) {
+                // atom1 is the central atom
+                atoms.push_back(make_int4(atom1, atom2, atom3, 0));
+                params.push_back(make_float2(dist12, dist23));
+            }
+            else if (dist12 == dist23) {
+                // atom2 is the central atom
+                atoms.push_back(make_int4(atom2, atom1, atom3, 0));
+                params.push_back(make_float2(dist12, dist13));
+            }
+            else if (dist13 == dist23) {
+                // atom3 is the central atom
+                atoms.push_back(make_int4(atom3, atom1, atom2, 0));
+                params.push_back(make_float2(dist13, dist12));
+            }
+            else
+                throw OpenMMException("Two of the three distances constrained with SETTLE must be the same.");
+            isShakeAtom[atom1] = true;
+            isShakeAtom[atom2] = true;
+            isShakeAtom[atom3] = true;
+        }
+        settleAtoms = CudaArray::create<int4>(atoms.size(), "settleAtoms");
+        settleParams = CudaArray::create<float2>(params.size(), "settleParams");
+        settleAtoms->upload(atoms);
+        settleParams->upload(params);
+    }
+
+    // Find clusters consisting of a central atom with up to three peripheral atoms.
+
+    map<int, ShakeCluster> clusters;
+    vector<bool> invalidForShake(numAtoms, false);
+    for (int i = 0; i < (int) atom1.size(); i++) {
+        if (isShakeAtom[atom1[i]])
+            continue; // This is being taken care of with SETTLE.
+
+        // Determine which is the central atom.
+
+        bool firstIsCentral;
+        if (constraintCount[atom1[i]] > 1)
+            firstIsCentral = true;
+        else if (constraintCount[atom2[i]] > 1)
+            firstIsCentral = false;
+        else if (atom1[i] < atom2[i])
+            firstIsCentral = true;
+        else
+            firstIsCentral = false;
+        int centralID, peripheralID;
+        if (firstIsCentral) {
+            centralID = atom1[i];
+            peripheralID = atom2[i];
+        }
+        else {
+            centralID = atom2[i];
+            peripheralID = atom1[i];
+        }
+
+        // Add it to the cluster.
+
+        if (clusters.find(centralID) == clusters.end()) {
+            clusters[centralID] = ShakeCluster(centralID, 1.0/system.getParticleMass(centralID));
+        }
+        ShakeCluster& cluster = clusters[centralID];
+        cluster.addAtom(peripheralID, distance[i], 1.0/system.getParticleMass(peripheralID));
+        if (constraintCount[peripheralID] != 1 || invalidForShake[atom1[i]] || invalidForShake[atom2[i]]) {
+            cluster.markInvalid(clusters, invalidForShake);
+            map<int, ShakeCluster>::iterator otherCluster = clusters.find(peripheralID);
+            if (otherCluster != clusters.end() && otherCluster->second.valid)
+                otherCluster->second.markInvalid(clusters, invalidForShake);
+        }
+    }
+    int validShakeClusters = 0;
+    for (map<int, ShakeCluster>::iterator iter = clusters.begin(); iter != clusters.end(); ++iter) {
+        ShakeCluster& cluster = iter->second;
+        if (cluster.valid) {
+            cluster.valid = !invalidForShake[cluster.centralID] && cluster.size == constraintCount[cluster.centralID];
+            for (int i = 0; i < cluster.size; i++)
+                if (invalidForShake[cluster.peripheralID[i]])
+                    cluster.valid = false;
+            if (cluster.valid)
+                ++validShakeClusters;
+        }
+    }
+
+    // Record the SHAKE clusters.
+
+    if (validShakeClusters > 0) {
+        vector<int4> atoms;
+        vector<float4> params;
+        int index = 0;
+        for (map<int, ShakeCluster>::const_iterator iter = clusters.begin(); iter != clusters.end(); ++iter) {
+            const ShakeCluster& cluster = iter->second;
+            if (!cluster.valid)
+                continue;
+            atoms.push_back(make_int4(cluster.centralID, cluster.peripheralID[0], (cluster.size > 1 ? cluster.peripheralID[1] : -1), (cluster.size > 2 ? cluster.peripheralID[2] : -1)));
+            params.push_back(make_float4((float) cluster.centralInvMass, (float) (0.5/(cluster.centralInvMass+cluster.peripheralInvMass)), (float) (cluster.distance*cluster.distance), (float) cluster.peripheralInvMass));
+            isShakeAtom[cluster.centralID] = true;
+            isShakeAtom[cluster.peripheralID[0]] = true;
+            if (cluster.size > 1)
+                isShakeAtom[cluster.peripheralID[1]] = true;
+            if (cluster.size > 2)
+                isShakeAtom[cluster.peripheralID[2]] = true;
+            ++index;
+        }
+        shakeAtoms = CudaArray::create<int4>(atoms.size(), "shakeAtoms");
+        shakeParams = CudaArray::create<float4>(params.size(), "shakeParams");
+        shakeAtoms->upload(atoms);
+        shakeParams->upload(params);
+    }
+
+    // Find connected constraints for CCMA.
+
+    vector<int> ccmaConstraints;
+    for (unsigned i = 0; i < atom1.size(); i++)
+        if (!isShakeAtom[atom1[i]])
+            ccmaConstraints.push_back(i);
+
+    // Record the connections between constraints.
+
+    int numCCMA = (int) ccmaConstraints.size();
+    if (numCCMA > 0) {
+        vector<vector<int> > atomConstraints(context.getNumAtoms());
+        for (int i = 0; i < numCCMA; i++) {
+            atomConstraints[atom1[ccmaConstraints[i]]].push_back(i);
+            atomConstraints[atom2[ccmaConstraints[i]]].push_back(i);
+        }
+        vector<vector<int> > linkedConstraints(numCCMA);
+        for (unsigned atom = 0; atom < atomConstraints.size(); atom++) {
+            for (unsigned i = 0; i < atomConstraints[atom].size(); i++)
+                for (unsigned j = 0; j < i; j++) {
+                    int c1 = atomConstraints[atom][i];
+                    int c2 = atomConstraints[atom][j];
+                    linkedConstraints[c1].push_back(c2);
+                    linkedConstraints[c2].push_back(c1);
+                }
+        }
+        int maxLinks = 0;
+        for (unsigned i = 0; i < linkedConstraints.size(); i++)
+            maxLinks = max(maxLinks, (int) linkedConstraints[i].size());
+        int maxAtomConstraints = 0;
+        for (unsigned i = 0; i < atomConstraints.size(); i++)
+            maxAtomConstraints = max(maxAtomConstraints, (int) atomConstraints[i].size());
+
+        // Compute the constraint coupling matrix
+
+        vector<vector<int> > atomAngles(numAtoms);
+        HarmonicAngleForce const* angleForce = NULL;
+        for (int i = 0; i < system.getNumForces() && angleForce == NULL; i++)
+            angleForce = dynamic_cast<HarmonicAngleForce const*>(&system.getForce(i));
+        if (angleForce != NULL)
+            for (int i = 0; i < angleForce->getNumAngles(); i++) {
+                int particle1, particle2, particle3;
+                double angle, k;
+                angleForce->getAngleParameters(i, particle1, particle2, particle3, angle, k);
+                atomAngles[particle2].push_back(i);
+            }
+        vector<vector<pair<int, double> > > matrix(numCCMA);
+        for (int j = 0; j < numCCMA; j++) {
+            for (int k = 0; k < numCCMA; k++) {
+                if (j == k) {
+                    matrix[j].push_back(pair<int, double>(j, 1.0));
+                    continue;
+                }
+                double scale;
+                int cj = ccmaConstraints[j];
+                int ck = ccmaConstraints[k];
+                int atomj0 = atom1[cj];
+                int atomj1 = atom2[cj];
+                int atomk0 = atom1[ck];
+                int atomk1 = atom2[ck];
+                int atoma, atomb, atomc;
+                double imj0 = 1.0/system.getParticleMass(atomj0);
+                double imj1 = 1.0/system.getParticleMass(atomj1);
+                if (atomj0 == atomk0) {
+                    atoma = atomj1;
+                    atomb = atomj0;
+                    atomc = atomk1;
+                    scale = imj0/(imj0+imj1);
+                }
+                else if (atomj1 == atomk1) {
+                    atoma = atomj0;
+                    atomb = atomj1;
+                    atomc = atomk0;
+                    scale = imj1/(imj0+imj1);
+                }
+                else if (atomj0 == atomk1) {
+                    atoma = atomj1;
+                    atomb = atomj0;
+                    atomc = atomk0;
+                    scale = imj0/(imj0+imj1);
+                }
+                else if (atomj1 == atomk0) {
+                    atoma = atomj0;
+                    atomb = atomj1;
+                    atomc = atomk1;
+                    scale = imj1/(imj0+imj1);
+                }
+                else
+                    continue; // These constraints are not connected.
+
+                // Look for a third constraint forming a triangle with these two.
+
+                bool foundConstraint = false;
+                for (int m = 0; m < numCCMA; m++) {
+                    int other = ccmaConstraints[m];
+                    if ((atom1[other] == atoma && atom2[other] == atomc) || (atom1[other] == atomc && atom2[other] == atoma)) {
+                        double d1 = distance[cj];
+                        double d2 = distance[ck];
+                        double d3 = distance[other];
+                        matrix[j].push_back(pair<int, double>(k, scale*(d1*d1+d2*d2-d3*d3)/(2.0*d1*d2)));
+                        foundConstraint = true;
+                        break;
+                    }
+                }
+                if (!foundConstraint && angleForce != NULL) {
+                    // We didn't find one, so look for an angle force field term.
+
+                    const vector<int>& angleCandidates = atomAngles[atomb];
+                    for (vector<int>::const_iterator iter = angleCandidates.begin(); iter != angleCandidates.end(); iter++) {
+                        int particle1, particle2, particle3;
+                        double angle, ka;
+                        angleForce->getAngleParameters(*iter, particle1, particle2, particle3, angle, ka);
+                        if ((particle1 == atoma && particle3 == atomc) || (particle3 == atoma && particle1 == atomc)) {
+                            matrix[j].push_back(pair<int, double>(k, scale*cos(angle)));
+                            break;
+                        }
+                    }
+                }
+            }
+        }
+
+        // Invert it using QR.
+
+        vector<int> matrixRowStart;
+        vector<int> matrixColIndex;
+        vector<double> matrixValue;
+        for (int i = 0; i < numCCMA; i++) {
+            matrixRowStart.push_back(matrixValue.size());
+            for (int j = 0; j < (int) matrix[i].size(); j++) {
+                pair<int, double> element = matrix[i][j];
+                matrixColIndex.push_back(element.first);
+                matrixValue.push_back(element.second);
+            }
+        }
+        matrixRowStart.push_back(matrixValue.size());
+        int *qRowStart, *qColIndex, *rRowStart, *rColIndex;
+        double *qValue, *rValue;
+        int result = QUERN_compute_qr(numCCMA, numCCMA, &matrixRowStart[0], &matrixColIndex[0], &matrixValue[0], NULL,
+                &qRowStart, &qColIndex, &qValue, &rRowStart, &rColIndex, &rValue);
+        vector<double> rhs(numCCMA);
+        matrix.clear();
+        matrix.resize(numCCMA);
+        for (int i = 0; i < numCCMA; i++) {
+            // Extract column i of the inverse matrix.
+
+            for (int j = 0; j < numCCMA; j++)
+                rhs[j] = (i == j ? 1.0 : 0.0);
+            result = QUERN_multiply_with_q_transpose(numCCMA, qRowStart, qColIndex, qValue, &rhs[0]);
+            result = QUERN_solve_with_r(numCCMA, rRowStart, rColIndex, rValue, &rhs[0], &rhs[0]);
+            for (int j = 0; j < numCCMA; j++) {
+                double value = rhs[j]*distance[ccmaConstraints[i]]/distance[ccmaConstraints[j]];
+                if (abs(value) > 0.1)
+                    matrix[j].push_back(pair<int, double>(i, value));
+            }
+        }
+        QUERN_free_result(qRowStart, qColIndex, qValue);
+        QUERN_free_result(rRowStart, rColIndex, rValue);
+        int maxRowElements = 0;
+        for (unsigned i = 0; i < matrix.size(); i++)
+            maxRowElements = max(maxRowElements, (int) matrix[i].size());
+        maxRowElements++;
+
+        // Sort the constraints.
+
+        vector<int> constraintOrder(numCCMA);
+        for (int i = 0; i < numCCMA; ++i)
+            constraintOrder[i] = i;
+        sort(constraintOrder.begin(), constraintOrder.end(), ConstraintOrderer(atom1, atom2, ccmaConstraints));
+        vector<int> inverseOrder(numCCMA);
+        for (int i = 0; i < numCCMA; ++i)
+            inverseOrder[constraintOrder[i]] = i;
+        for (int i = 0; i < (int)matrix.size(); ++i)
+            for (int j = 0; j < (int)matrix[i].size(); ++j)
+                matrix[i][j].first = inverseOrder[matrix[i][j].first];
+
+        // Record the CCMA data structures.
+
+        ccmaAtoms = CudaArray::create<int2>(numCCMA, "CcmaAtoms");
+        ccmaDistance = CudaArray::create<float4>(numCCMA, "CcmaDistance");
+        ccmaAtomConstraints = CudaArray::create<int>(numAtoms*maxAtomConstraints, "CcmaAtomConstraints");
+        ccmaNumAtomConstraints = CudaArray::create<int>(numAtoms, "CcmaAtomConstraintsIndex");
+        ccmaDelta1 = CudaArray::create<float>(numCCMA, "CcmaDelta1");
+        ccmaDelta2 = CudaArray::create<float>(numCCMA, "CcmaDelta2");
+        ccmaConverged = CudaArray::create<int>(2, "CcmaConverged");
+        CHECK_RESULT2(cuMemHostAlloc((void**) &ccmaConvergedMemory, 2*sizeof(int), 0), "Error allocating pinned memory");
+        ccmaReducedMass = CudaArray::create<float>(numCCMA, "CcmaReducedMass");
+        ccmaConstraintMatrixColumn = CudaArray::create<int>(numCCMA*maxRowElements, "ConstraintMatrixColumn");
+        ccmaConstraintMatrixValue = CudaArray::create<float>(numCCMA*maxRowElements, "ConstraintMatrixValue");
+        vector<int2> atomsVec(ccmaAtoms->getSize());
+        vector<float4> distanceVec(ccmaDistance->getSize());
+        vector<int> atomConstraintsVec(ccmaAtomConstraints->getSize());
+        vector<int> numAtomConstraintsVec(ccmaNumAtomConstraints->getSize());
+        vector<float> reducedMassVec(ccmaReducedMass->getSize());
+        vector<int> constraintMatrixColumnVec(ccmaConstraintMatrixColumn->getSize());
+        vector<float> constraintMatrixValueVec(ccmaConstraintMatrixValue->getSize());
+        for (int i = 0; i < numCCMA; i++) {
+            int index = constraintOrder[i];
+            int c = ccmaConstraints[index];
+            atomsVec[i].x = atom1[c];
+            atomsVec[i].y = atom2[c];
+            distanceVec[i].w = (float) distance[c];
+            reducedMassVec[i] = (float) (0.5/(1.0/system.getParticleMass(atom1[c])+1.0/system.getParticleMass(atom2[c])));
+            for (unsigned int j = 0; j < matrix[index].size(); j++) {
+                constraintMatrixColumnVec[i+j*numCCMA] = matrix[index][j].first;
+                constraintMatrixValueVec[i+j*numCCMA] = (float) matrix[index][j].second;
+            }
+            constraintMatrixColumnVec[i+matrix[index].size()*numCCMA] = numCCMA;
+        }
+        for (unsigned int i = 0; i < atomConstraints.size(); i++) {
+            numAtomConstraintsVec[i] = atomConstraints[i].size();
+            for (unsigned int j = 0; j < atomConstraints[i].size(); j++) {
+                bool forward = (atom1[ccmaConstraints[atomConstraints[i][j]]] == i);
+                atomConstraintsVec[i+j*numAtoms] = (forward ? inverseOrder[atomConstraints[i][j]]+1 : -inverseOrder[atomConstraints[i][j]]-1);
+            }
+        }
+        ccmaAtoms->upload(atomsVec);
+        ccmaDistance->upload(distanceVec);
+        ccmaAtomConstraints->upload(atomConstraintsVec);
+        ccmaNumAtomConstraints->upload(numAtomConstraintsVec);
+        ccmaReducedMass->upload(reducedMassVec);
+        ccmaConstraintMatrixColumn->upload(constraintMatrixColumnVec);
+        ccmaConstraintMatrixValue->upload(constraintMatrixValueVec);
+
+        // Create the CCMA kernels.
+
+        map<string, string> defines;
+        defines["NUM_CONSTRAINTS"] = context.intToString(numCCMA);
+        defines["NUM_ATOMS"] = context.intToString(numAtoms);
+//        CUmodule ccmaModule = context.createModule(CudaKernelSources::ccma, defines);
+//        ccmaDirectionsKernel = context.getKernel(ccmaModule, "computeConstraintDirections");
+//        ccmaPosForceKernel = context.getKernel(ccmaModule, "computeConstraintForce");
+//        ccmaMultiplyKernel = context.getKernel(ccmaModule, "multiplyByConstraintMatrix");
+//        ccmaPosUpdateKernel = context.getKernel(ccmaModule, "updateAtomPositions");
+//        defines["CONSTRAIN_VELOCITIES"] = "1";
+//        ccmaModule = context.createModule(CudaKernelSources::ccma, defines);
+//        ccmaVelForceKernel = context.getKernel(ccmaModule, "computeConstraintForce");
+//        ccmaVelUpdateKernel = context.getKernel(ccmaModule, "updateAtomPositions");
+    }
+    
+    // Build the list of virtual sites.
+    
+    vector<int4> vsite2AvgAtomVec;
+    vector<float2> vsite2AvgWeightVec;
+    vector<int4> vsite3AvgAtomVec;
+    vector<float4> vsite3AvgWeightVec;
+    vector<int4> vsiteOutOfPlaneAtomVec;
+    vector<float4> vsiteOutOfPlaneWeightVec;
+    for (int i = 0; i < numAtoms; i++) {
+        if (system.isVirtualSite(i)) {
+            if (dynamic_cast<const TwoParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
+                // A two particle average.
+                
+                const TwoParticleAverageSite& site = dynamic_cast<const TwoParticleAverageSite&>(system.getVirtualSite(i));
+                vsite2AvgAtomVec.push_back(make_int4(i, site.getParticle(0), site.getParticle(1), 0));
+                vsite2AvgWeightVec.push_back(make_float2((float) site.getWeight(0), (float) site.getWeight(1)));
+            }
+            else if (dynamic_cast<const ThreeParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
+                // A three particle average.
+                
+                const ThreeParticleAverageSite& site = dynamic_cast<const ThreeParticleAverageSite&>(system.getVirtualSite(i));
+                vsite3AvgAtomVec.push_back(make_int4(i, site.getParticle(0), site.getParticle(1), site.getParticle(2)));
+                vsite3AvgWeightVec.push_back(make_float4((float) site.getWeight(0), (float) site.getWeight(1), (float) site.getWeight(2), 0.0f));
+            }
+            else if (dynamic_cast<const OutOfPlaneSite*>(&system.getVirtualSite(i)) != NULL) {
+                // An out of plane site.
+                
+                const OutOfPlaneSite& site = dynamic_cast<const OutOfPlaneSite&>(system.getVirtualSite(i));
+                vsiteOutOfPlaneAtomVec.push_back(make_int4(i, site.getParticle(0), site.getParticle(1), site.getParticle(2)));
+                vsiteOutOfPlaneWeightVec.push_back(make_float4((float) site.getWeight12(), (float) site.getWeight13(), (float) site.getWeightCross(), 0.0f));
+            }
+        }
+    }
+    int num2Avg = vsite2AvgAtomVec.size();
+    int num3Avg = vsite3AvgAtomVec.size();
+    int numOutOfPlane = vsiteOutOfPlaneAtomVec.size();
+    vsite2AvgAtoms = CudaArray::create<int4>(max(1, num2Avg), "vsite2AvgAtoms");
+    vsite2AvgWeights = CudaArray::create<float2>(max(1, num2Avg), "vsite2AvgWeights");
+    vsite3AvgAtoms = CudaArray::create<int4>(max(1, num3Avg), "vsite3AvgAtoms");
+    vsite3AvgWeights = CudaArray::create<float4>(max(1, num3Avg), "vsite3AvgWeights");
+    vsiteOutOfPlaneAtoms = CudaArray::create<int4>(max(1, numOutOfPlane), "vsiteOutOfPlaneAtoms");
+    vsiteOutOfPlaneWeights = CudaArray::create<float4>(max(1, numOutOfPlane), "vsiteOutOfPlaneWeights");
+    if (num2Avg > 0) {
+        vsite2AvgAtoms->upload(vsite2AvgAtomVec);
+        vsite2AvgWeights->upload(vsite2AvgWeightVec);
+    }
+    if (num3Avg > 0) {
+        vsite3AvgAtoms->upload(vsite3AvgAtomVec);
+        vsite3AvgWeights->upload(vsite3AvgWeightVec);
+    }
+    if (numOutOfPlane > 0) {
+        vsiteOutOfPlaneAtoms->upload(vsiteOutOfPlaneAtomVec);
+        vsiteOutOfPlaneWeights->upload(vsiteOutOfPlaneWeightVec);
+    }
+    
+    // Create the kernels for virtual sites.
+
+    map<string, string> defines;
+    defines["NUM_2_AVERAGE"] = context.intToString(num2Avg);
+    defines["NUM_3_AVERAGE"] = context.intToString(num3Avg);
+    defines["NUM_OUT_OF_PLANE"] = context.intToString(numOutOfPlane);
+//    CUmodule vsiteModule = context.createModule(CudaKernelSources::virtualSites, defines);
+//    vsitePositionKernel = context.getKernel(vsiteModule, "computeVirtualSites");
+//    vsitePositionKernel.setArg<cl::Buffer>(0, context.getPosq().getDeviceBuffer());
+//    vsitePositionKernel.setArg<cl::Buffer>(1, vsite2AvgAtoms->getDeviceBuffer());
+//    vsitePositionKernel.setArg<cl::Buffer>(2, vsite2AvgWeights->getDeviceBuffer());
+//    vsitePositionKernel.setArg<cl::Buffer>(3, vsite3AvgAtoms->getDeviceBuffer());
+//    vsitePositionKernel.setArg<cl::Buffer>(4, vsite3AvgWeights->getDeviceBuffer());
+//    vsitePositionKernel.setArg<cl::Buffer>(5, vsiteOutOfPlaneAtoms->getDeviceBuffer());
+//    vsitePositionKernel.setArg<cl::Buffer>(6, vsiteOutOfPlaneWeights->getDeviceBuffer());
+//    vsiteForceKernel = context.getKernel(vsiteModule, "distributeForces");
+//    vsiteForceKernel.setArg<cl::Buffer>(0, context.getPosq().getDeviceBuffer());
+//    // Skip argument 1: the force array hasn't been created yet.
+//    vsiteForceKernel.setArg<cl::Buffer>(2, vsite2AvgAtoms->getDeviceBuffer());
+//    vsiteForceKernel.setArg<cl::Buffer>(3, vsite2AvgWeights->getDeviceBuffer());
+//    vsiteForceKernel.setArg<cl::Buffer>(4, vsite3AvgAtoms->getDeviceBuffer());
+//    vsiteForceKernel.setArg<cl::Buffer>(5, vsite3AvgWeights->getDeviceBuffer());
+//    vsiteForceKernel.setArg<cl::Buffer>(6, vsiteOutOfPlaneAtoms->getDeviceBuffer());
+//    vsiteForceKernel.setArg<cl::Buffer>(7, vsiteOutOfPlaneWeights->getDeviceBuffer());
+    numVsites = num2Avg+num3Avg+numOutOfPlane;
+}
+
+CudaIntegrationUtilities::~CudaIntegrationUtilities() {
+    if (posDelta != NULL)
+        delete posDelta;
+    if (settleAtoms != NULL)
+        delete settleAtoms;
+    if (settleParams != NULL)
+        delete settleParams;
+    if (shakeAtoms != NULL)
+        delete shakeAtoms;
+    if (shakeParams != NULL)
+        delete shakeParams;
+    if (random != NULL)
+        delete random;
+    if (randomSeed != NULL)
+        delete randomSeed;
+    if (stepSize != NULL)
+        delete stepSize;
+    if (ccmaAtoms != NULL)
+        delete ccmaAtoms;
+    if (ccmaDistance != NULL)
+        delete ccmaDistance;
+    if (ccmaReducedMass != NULL)
+        delete ccmaReducedMass;
+    if (ccmaAtomConstraints != NULL)
+        delete ccmaAtomConstraints;
+    if (ccmaNumAtomConstraints != NULL)
+        delete ccmaNumAtomConstraints;
+    if (ccmaConstraintMatrixColumn != NULL)
+        delete ccmaConstraintMatrixColumn;
+    if (ccmaConstraintMatrixValue != NULL)
+        delete ccmaConstraintMatrixValue;
+    if (ccmaDelta1 != NULL)
+        delete ccmaDelta1;
+    if (ccmaDelta2 != NULL)
+        delete ccmaDelta2;
+    if (ccmaConverged != NULL)
+        delete ccmaConverged;
+    if (ccmaConvergedMemory != NULL)
+        cuMemFreeHost(ccmaConvergedMemory);
+    if (vsite2AvgAtoms != NULL)
+        delete vsite2AvgAtoms;
+    if (vsite2AvgWeights != NULL)
+        delete vsite2AvgWeights;
+    if (vsite3AvgAtoms != NULL)
+        delete vsite3AvgAtoms;
+    if (vsite3AvgWeights != NULL)
+        delete vsite3AvgWeights;
+    if (vsiteOutOfPlaneAtoms != NULL)
+        delete vsiteOutOfPlaneAtoms;
+    if (vsiteOutOfPlaneWeights != NULL)
+        delete vsiteOutOfPlaneWeights;
+}
+
+//void CudaIntegrationUtilities::applyConstraints(double tol) {
+//    applyConstraints(false, tol);
+//}
+//
+//void CudaIntegrationUtilities::applyVelocityConstraints(double tol) {
+//    applyConstraints(true, tol);
+//}
+//
+//void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double tol) {
+//    bool hasInitialized;
+//    CUfunction settleKernel, shakeKernel, ccmaForceKernel, ccmaUpdateKernel;
+//    if (constrainVelocities) {
+//        hasInitialized = hasInitializedVelConstraintKernels;
+//        settleKernel = settleVelKernel;
+//        shakeKernel = shakeVelKernel;
+//        ccmaForceKernel = ccmaVelForceKernel;
+//        ccmaUpdateKernel = ccmaVelUpdateKernel;
+//        hasInitializedVelConstraintKernels = true;
+//    }
+//    else {
+//        hasInitialized = hasInitializedPosConstraintKernels;
+//        settleKernel = settlePosKernel;
+//        shakeKernel = shakePosKernel;
+//        ccmaForceKernel = ccmaPosForceKernel;
+//        ccmaUpdateKernel = ccmaPosUpdateKernel;
+//        hasInitializedPosConstraintKernels = true;
+//    }
+//    if (settleAtoms != NULL) {
+//        if (!hasInitialized) {
+//            settleKernel.setArg<int>(0, settleAtoms->getSize());
+//            settleKernel.setArg<cl::Buffer>(2, context.getPosq().getDeviceBuffer());
+//            settleKernel.setArg<cl::Buffer>(3, posDelta->getDeviceBuffer());
+//            settleKernel.setArg<cl::Buffer>(4, context.getVelm().getDeviceBuffer());
+//            settleKernel.setArg<cl::Buffer>(5, settleAtoms->getDeviceBuffer());
+//            settleKernel.setArg<cl::Buffer>(6, settleParams->getDeviceBuffer());
+//        }
+//        settleKernel.setArg<float>(1, (float) tol);
+//        context.executeKernel(settleKernel, settleAtoms->getSize());
+//    }
+//    if (shakeAtoms != NULL) {
+//        if (!hasInitialized) {
+//            shakeKernel.setArg<int>(0, shakeAtoms->getSize());
+//            shakeKernel.setArg<cl::Buffer>(2, context.getPosq().getDeviceBuffer());
+//            shakeKernel.setArg<cl::Buffer>(3, constrainVelocities ? context.getVelm().getDeviceBuffer() : posDelta->getDeviceBuffer());
+//            shakeKernel.setArg<cl::Buffer>(4, shakeAtoms->getDeviceBuffer());
+//            shakeKernel.setArg<cl::Buffer>(5, shakeParams->getDeviceBuffer());
+//        }
+//        shakeKernel.setArg<float>(1, (float) tol);
+//        context.executeKernel(shakeKernel, shakeAtoms->getSize());
+//    }
+//    if (ccmaAtoms != NULL) {
+//        if (!hasInitialized) {
+//            ccmaDirectionsKernel.setArg<cl::Buffer>(0, ccmaAtoms->getDeviceBuffer());
+//            ccmaDirectionsKernel.setArg<cl::Buffer>(1, ccmaDistance->getDeviceBuffer());
+//            ccmaDirectionsKernel.setArg<cl::Buffer>(2, context.getPosq().getDeviceBuffer());
+//            ccmaForceKernel.setArg<cl::Buffer>(0, ccmaAtoms->getDeviceBuffer());
+//            ccmaForceKernel.setArg<cl::Buffer>(1, ccmaDistance->getDeviceBuffer());
+//            ccmaForceKernel.setArg<cl::Buffer>(2, constrainVelocities ? context.getVelm().getDeviceBuffer() : posDelta->getDeviceBuffer());
+//            ccmaForceKernel.setArg<cl::Buffer>(3, ccmaReducedMass->getDeviceBuffer());
+//            ccmaForceKernel.setArg<cl::Buffer>(4, ccmaDelta1->getDeviceBuffer());
+//            ccmaForceKernel.setArg<cl::Buffer>(5, ccmaConverged->getDeviceBuffer());
+//            ccmaMultiplyKernel.setArg<cl::Buffer>(0, ccmaDelta1->getDeviceBuffer());
+//            ccmaMultiplyKernel.setArg<cl::Buffer>(1, ccmaDelta2->getDeviceBuffer());
+//            ccmaMultiplyKernel.setArg<cl::Buffer>(2, ccmaConstraintMatrixColumn->getDeviceBuffer());
+//            ccmaMultiplyKernel.setArg<cl::Buffer>(3, ccmaConstraintMatrixValue->getDeviceBuffer());
+//            ccmaMultiplyKernel.setArg<cl::Buffer>(4, ccmaConverged->getDeviceBuffer());
+//            ccmaUpdateKernel.setArg<cl::Buffer>(0, ccmaNumAtomConstraints->getDeviceBuffer());
+//            ccmaUpdateKernel.setArg<cl::Buffer>(1, ccmaAtomConstraints->getDeviceBuffer());
+//            ccmaUpdateKernel.setArg<cl::Buffer>(2, ccmaDistance->getDeviceBuffer());
+//            ccmaUpdateKernel.setArg<cl::Buffer>(3, constrainVelocities ? context.getVelm().getDeviceBuffer() : posDelta->getDeviceBuffer());
+//            ccmaUpdateKernel.setArg<cl::Buffer>(4, context.getVelm().getDeviceBuffer());
+//            ccmaUpdateKernel.setArg<cl::Buffer>(5, ccmaDelta1->getDeviceBuffer());
+//            ccmaUpdateKernel.setArg<cl::Buffer>(6, ccmaDelta2->getDeviceBuffer());
+//            ccmaUpdateKernel.setArg<cl::Buffer>(7, ccmaConverged->getDeviceBuffer());
+//        }
+//        ccmaForceKernel.setArg<float>(6, (float) tol);
+//        context.executeKernel(ccmaDirectionsKernel, ccmaAtoms->getSize());
+//        const int checkInterval = 4;
+//        cl::Event event;
+//        for (int i = 0; i < 150; i++) {
+//            ccmaForceKernel.setArg<int>(7, i);
+//            if (i == 0) {
+//                ccmaConvergedMemory[0] = 1;
+//                ccmaConvergedMemory[1] = 0;
+//                context.getQueue().enqueueWriteBuffer(ccmaConverged->getDeviceBuffer(), CL_FALSE, 0, 2*sizeof(int), ccmaConvergedMemory);
+//            }
+//            context.executeKernel(ccmaForceKernel, ccmaAtoms->getSize());
+//            if ((i+1)%checkInterval == 0)
+//                context.getQueue().enqueueReadBuffer(ccmaConverged->getDeviceBuffer(), CL_FALSE, 0, 2*sizeof(int), ccmaConvergedMemory, NULL, &event);
+//            ccmaMultiplyKernel.setArg<int>(5, i);
+//            context.executeKernel(ccmaMultiplyKernel, ccmaAtoms->getSize());
+//            ccmaUpdateKernel.setArg<int>(8, i);
+//            context.executeKernel(ccmaUpdateKernel, context.getNumAtoms());
+//            if ((i+1)%checkInterval == 0) {
+//                event.wait();
+//                if (ccmaConvergedMemory[i%2])
+//                    break;
+//            }
+//        }
+//    }
+//}
+//
+//void CudaIntegrationUtilities::computeVirtualSites() {
+//    if (numVsites > 0)
+//        context.executeKernel(vsitePositionKernel, numVsites);
+//}
+//
+//void CudaIntegrationUtilities::distributeForcesFromVirtualSites() {
+//    if (numVsites > 0) {
+//        vsiteForceKernel.setArg<cl::Buffer>(1, context.getForce().getDeviceBuffer());
+//        context.executeKernel(vsiteForceKernel, numVsites);
+//    }
+//}
+
+void CudaIntegrationUtilities::initRandomNumberGenerator(unsigned int randomNumberSeed) {
+    if (random != NULL) {
+        if (randomNumberSeed != lastSeed)
+           throw OpenMMException("CudaIntegrationUtilities::initRandomNumberGenerator(): Requested two different values for the random number seed");
+        return;
+    }
+
+    // Create the random number arrays.
+
+    lastSeed = randomNumberSeed;
+    random = CudaArray::create<float4>(32*context.getPaddedNumAtoms(), "random");
+    randomSeed = CudaArray::create<int4>(context.getNumThreadBlocks()*CudaContext::ThreadBlockSize, "randomSeed");
+    randomPos = random->getSize();
+
+    // Use a quick and dirty RNG to pick seeds for the real random number generator.
+
+    vector<int4> seed(randomSeed->getSize());
+    unsigned int r = randomNumberSeed;
+    for (int i = 0; i < randomSeed->getSize(); i++) {
+        seed[i].x = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
+        seed[i].y = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
+        seed[i].z = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
+        seed[i].w = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
+    }
+    randomSeed->upload(seed);
+
+    // Create the kernel.
+
+    CUmodule randomModule = context.createModule(CudaKernelSources::random);
+    randomKernel = context.getKernel(randomModule, "generateRandomNumbers");
+}
+
+int CudaIntegrationUtilities::prepareRandomNumbers(int numValues) {
+    if (randomPos+numValues <= random->getSize()) {
+        int oldPos = randomPos;
+        randomPos += numValues;
+        return oldPos;
+    }
+    if (numValues > random->getSize()) {
+        delete random;
+        random = CudaArray::create<float4>(numValues, "random");
+    }
+    int size = random->getSize();
+    void* args[] = {&size, &random->getDevicePointer(), &randomSeed->getDevicePointer()};
+    context.executeKernel(randomKernel, args, random->getSize());
+    randomPos = numValues;
+    return 0;
+}
+
+void CudaIntegrationUtilities::createCheckpoint(ostream& stream) {
+    stream.write((char*) &randomPos, sizeof(int));
+    vector<float4> randomVec;
+    random->download(randomVec);
+    stream.write((char*) &randomVec[0], sizeof(float4)*random->getSize());
+    vector<int4> randomSeedVec;
+    randomSeed->download(randomSeedVec);
+    stream.write((char*) &randomSeedVec[0], sizeof(int4)*randomSeed->getSize());
+}
+
+void CudaIntegrationUtilities::loadCheckpoint(istream& stream) {
+    stream.read((char*) &randomPos, sizeof(int));
+    vector<float4> randomVec(random->getSize());
+    stream.read((char*) &randomVec[0], sizeof(float4)*random->getSize());
+    random->upload(randomVec);
+    vector<int4> randomSeedVec(randomSeed->getSize());
+    stream.read((char*) &randomSeedVec[0], sizeof(int4)*randomSeed->getSize());
+    randomSeed->upload(randomSeedVec);
+}
--- a/platforms/cuda2/src/CudaIntegrationUtilities.h
+++ b/platforms/cuda2/src/CudaIntegrationUtilities.h
+#ifndef OPENMM_CUDAINTEGRATIONUTILITIES_H_
+#define OPENMM_CUDAINTEGRATIONUTILITIES_H_
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "openmm/System.h"
+#include "CudaContext.h"
+#include "openmm/internal/windowsExport.h"
+#include <iosfwd>
+
+namespace OpenMM {
+
+/**
+ * This class implements features that are used by many different integrators, including
+ * common workspace arrays, random number generation, and enforcing constraints.
+ */
+
+class OPENMM_EXPORT CudaIntegrationUtilities {
+public:
+    CudaIntegrationUtilities(CudaContext& context, const System& system);
+    ~CudaIntegrationUtilities();
+    /**
+     * Get the array which contains position deltas.
+     */
+    CudaArray& getPosDelta() {
+        return *posDelta;
+    }
+    /**
+     * Get the array which contains random values.  Each element is a float4, whose components
+     * are independent, normally distributed random numbers with mean 0 and variance 1.
+     */
+    CudaArray& getRandom() {
+        return *random;
+    }
+    /**
+     * Get the array which contains the current step size.
+     */
+    CudaArray& getStepSize() {
+        return *stepSize;
+    }
+    /**
+     * Apply constraints to the atom positions.
+     *
+     * @param tol             the constraint tolerance
+     */
+    void applyConstraints(double tol);
+    /**
+     * Apply constraints to the atom velocities.
+     *
+     * @param tol             the constraint tolerance
+     */
+    void applyVelocityConstraints(double tol);
+    /**
+     * Initialize the random number generator.
+     */
+    void initRandomNumberGenerator(unsigned int randomNumberSeed);
+    /**
+     * Ensure that sufficient random numbers are available in the array, and generate new ones if not.
+     *
+     * @param numValues     the number of random float4's that will be required
+     * @return the index in the array at which to start reading
+     */
+    int prepareRandomNumbers(int numValues);
+    /**
+     * Compute the positions of virtual sites.
+     */
+    void computeVirtualSites();
+    /**
+     * Distribute forces from virtual sites to the atoms they are based on.
+     */
+    void distributeForcesFromVirtualSites();
+    /**
+     * Create a checkpoint recording the current state of the random number generator.
+     * 
+     * @param stream    an output stream the checkpoint data should be written to
+     */
+    void createCheckpoint(std::ostream& stream);
+    /**
+     * Load a checkpoint that was written by createCheckpoint().
+     * 
+     * @param stream    an input stream the checkpoint data should be read from
+     */
+    void loadCheckpoint(std::istream& stream);
+private:
+    void applyConstraints(bool constrainVelocities, double tol);
+    CudaContext& context;
+    CUfunction settlePosKernel, settleVelKernel;
+    CUfunction shakePosKernel, shakeVelKernel;
+    CUfunction ccmaDirectionsKernel;
+    CUfunction ccmaPosForceKernel, ccmaVelForceKernel;
+    CUfunction ccmaMultiplyKernel;
+    CUfunction ccmaPosUpdateKernel, ccmaVelUpdateKernel;
+    CUfunction vsitePositionKernel, vsiteForceKernel;
+    CUfunction randomKernel;
+    CudaArray* posDelta;
+    CudaArray* settleAtoms;
+    CudaArray* settleParams;
+    CudaArray* shakeAtoms;
+    CudaArray* shakeParams;
+    CudaArray* random;
+    CudaArray* randomSeed;
+    CudaArray* stepSize;
+    CudaArray* ccmaAtoms;
+    CudaArray* ccmaDistance;
+    CudaArray* ccmaReducedMass;
+    CudaArray* ccmaAtomConstraints;
+    CudaArray* ccmaNumAtomConstraints;
+    CudaArray* ccmaConstraintMatrixColumn;
+    CudaArray* ccmaConstraintMatrixValue;
+    CudaArray* ccmaDelta1;
+    CudaArray* ccmaDelta2;
+    CudaArray* ccmaConverged;
+    int* ccmaConvergedMemory;
+    CudaArray* vsite2AvgAtoms;
+    CudaArray* vsite2AvgWeights;
+    CudaArray* vsite3AvgAtoms;
+    CudaArray* vsite3AvgWeights;
+    CudaArray* vsiteOutOfPlaneAtoms;
+    CudaArray* vsiteOutOfPlaneWeights;
+    int randomPos;
+    int lastSeed, numVsites;
+    bool hasInitializedPosConstraintKernels, hasInitializedVelConstraintKernels;
+    struct ShakeCluster;
+    struct ConstraintOrderer;
+};
+
+} // namespace OpenMM
+
+#endif /*OPENMM_CUDAINTEGRATIONUTILITIES_H_*/
--- a/platforms/cuda2/src/kernels/random.cu
+++ b/platforms/cuda2/src/kernels/random.cu
+/**
+ * Generate random numbers
+ */
+
+extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restrict__ random, uint4* __restrict__ seed) {
+    int index = blockIdx.x*blockDim.x+threadIdx.x;
+    uint4 state = seed[index];
+    unsigned int carry = 0;
+    while (index < numValues) {
+        float4 value;
+
+        // Generate first value.
+
+        state.x = state.x * 69069 + 1;
+        state.y ^= state.y << 13;
+        state.y ^= state.y >> 17;
+        state.y ^= state.y << 5;
+        unsigned int k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
+        unsigned int m = state.w + state.w + state.z + carry;
+        state.z = state.w;
+        state.w = m;
+        carry = k >> 30;
+        float x1 = (float)max(state.x + state.y + state.w, 0x00000001u) / (float)0xffffffff;
+        state.x = state.x * 69069 + 1;
+        state.y ^= state.y << 13;
+        state.y ^= state.y >> 17;
+        state.y ^= state.y << 5;
+        x1 = sqrt(-2.0f * log(x1));
+        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
+        m = state.w + state.w + state.z + carry;
+        state.z = state.w;
+        state.w = m;
+        carry = k >> 30;
+        float x2 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
+        value.x = x1 * cos(2.0f * 3.14159265f * x2);
+
+        // Generate second value.
+
+        state.x = state.x * 69069 + 1;
+        state.y ^= state.y << 13;
+        state.y ^= state.y >> 17;
+        state.y ^= state.y << 5;
+        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
+        m = state.w + state.w + state.z + carry;
+        state.z = state.w;
+        state.w = m;
+        carry = k >> 30;
+        float x3 = (float)max(state.x + state.y + state.w, 0x00000001u) / (float)0xffffffff;
+        state.x = state.x * 69069 + 1;
+        state.y ^= state.y << 13;
+        state.y ^= state.y >> 17;
+        state.y ^= state.y << 5;
+        x3 = sqrt(-2.0f * log(x3));
+        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
+        m = state.w + state.w + state.z + carry;
+        state.z = state.w;
+        state.w = m;
+        carry = k >> 30;
+        float x4 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
+        value.y = x3 * cos(2.0f * 3.14159265f * x4);
+
+        // Generate third value.
+
+        state.x = state.x * 69069 + 1;
+        state.y ^= state.y << 13;
+        state.y ^= state.y >> 17;
+        state.y ^= state.y << 5;
+        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
+        m = state.w + state.w + state.z + carry;
+        state.z = state.w;
+        state.w = m;
+        carry = k >> 30;
+        float x5 = (float)max(state.x + state.y + state.w, 0x00000001u) / (float)0xffffffff;
+        state.x = state.x * 69069 + 1;
+        state.y ^= state.y << 13;
+        state.y ^= state.y >> 17;
+        state.y ^= state.y << 5;
+        x5 = sqrt(-2.0f * log(x5));
+        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
+        m = state.w + state.w + state.z + carry;
+        state.z = state.w;
+        state.w = m;
+        carry = k >> 30;
+        float x6 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
+        value.z = x5 * cos(2.0f * 3.14159265f * x6);
+
+        // Generate fourth value.
+
+        state.x = state.x * 69069 + 1;
+        state.y ^= state.y << 13;
+        state.y ^= state.y >> 17;
+        state.y ^= state.y << 5;
+        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
+        m = state.w + state.w + state.z + carry;
+        state.z = state.w;
+        state.w = m;
+        carry = k >> 30;
+        float x7 = (float)max(state.x + state.y + state.w, 0x00000001u) / (float)0xffffffff;
+        state.x = state.x * 69069 + 1;
+        state.y ^= state.y << 13;
+        state.y ^= state.y >> 17;
+        state.y ^= state.y << 5;
+        x7 = sqrt(-2.0f * log(x7));
+        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
+        m = state.w + state.w + state.z + carry;
+        state.z = state.w;
+        state.w = m;
+        carry = k >> 30;
+        float x8 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
+        value.w = x7 * cos(2.0f * 3.14159265f * x8);
+
+        // Record the values.
+
+        random[index] = value;
+        index += blockDim.x*gridDim.x;
+    }
+    seed[blockIdx.x*blockDim.x+threadIdx.x] = state;
+}
--- a/platforms/cuda2/tests/TestCudaRandom.cpp
+++ b/platforms/cuda2/tests/TestCudaRandom.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+/**
+ * This tests the CUDA implementation of random number generation.
+ */
+
+#include "openmm/internal/AssertionUtilities.h"
+#include "../src/CudaArray.h"
+#include "../src/CudaContext.h"
+#include "../src/CudaIntegrationUtilities.h"
+#include "openmm/System.h"
+#include <iostream>
+
+using namespace OpenMM;
+using namespace std;
+
+void testGaussian() {
+    int numAtoms = 5000;
+    System system;
+    for (int i = 0; i < numAtoms; i++)
+        system.addParticle(1.0);
+    CudaPlatform platform;
+    CudaPlatform::PlatformData platformData(system, "", "true", "single",
+            platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()));
+    CudaContext& context = *platformData.contexts[0];
+    context.initialize();
+    context.getIntegrationUtilities().initRandomNumberGenerator(0);
+    CudaArray& random = context.getIntegrationUtilities().getRandom();
+    context.getIntegrationUtilities().prepareRandomNumbers(random.getSize());
+    const int numValues = random.getSize()*4;
+    vector<float4> values(numValues);
+    random.download(values);
+    float* data = reinterpret_cast<float*>(&values[0]);
+    double mean = 0.0;
+    double var = 0.0;
+    double skew = 0.0;
+    double kurtosis = 0.0;
+    for (int i = 0; i < numValues; i++) {
+        double value = data[i];
+        mean += value;
+        var += value*value;
+        skew += value*value*value;
+        kurtosis += value*value*value*value;
+    }
+    mean /= numValues;
+    var /= numValues;
+    skew /= numValues;
+    kurtosis /= numValues;
+    double c2 = var-mean*mean;
+    double c3 = skew-3*var*mean+2*mean*mean*mean;
+    double c4 = kurtosis-4*skew*mean-3*var*var+12*var*mean*mean-6*mean*mean*mean*mean;
+    ASSERT_EQUAL_TOL(0.0, mean, 3.0/sqrt((double)numValues));
+    ASSERT_EQUAL_TOL(1.0, c2, 3.0/pow(numValues, 1.0/3.0));
+    ASSERT_EQUAL_TOL(0.0, c3, 3.0/pow(numValues, 1.0/4.0));
+    ASSERT_EQUAL_TOL(0.0, c4, 3.0/pow(numValues, 1.0/4.0));
+}
+
+int main() {
+    try {
+        testGaussian();
+    }
+    catch(const exception& e) {
+        cout << "exception: " << e.what() << endl;
+        return 1;
+    }
+    cout << "Done" << endl;
+    return 0;
+}
+