Continuing to implement new CUDA platform

1f0ec7b5 · Peter Eastman · 99cebd08 · 1f0ec7b5 · 1f0ec7b5 · 1f0ec7b5
Commit 1f0ec7b5 authored Jun 07, 2012 by Peter Eastman
10 changed files
--- a/platforms/cuda2/src/CudaBondedUtilities.cpp
+++ b/platforms/cuda2/src/CudaBondedUtilities.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2011-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "CudaBondedUtilities.h"
+#include "CudaExpressionUtilities.h"
+#include "openmm/OpenMMException.h"
+#include "CudaNonbondedUtilities.h"
+#include <iostream>
+using namespace OpenMM;
+using namespace std;
+/**
+ * Create a CudaBondedUtilities bound to the given context.  No interactions,
+ * arguments or prefix code are registered yet.
+ */
+CudaBondedUtilities::CudaBondedUtilities(CudaContext& context) :
+        context(context),
+        numForceBuffers(0),
+        maxBonds(0),
+        hasInitializedKernels(false) {
+}
+/**
+ * Delete every CudaArray stored in atomIndices.
+ */
+CudaBondedUtilities::~CudaBondedUtilities() {
+    for (vector<vector<CudaArray*> >::iterator force = atomIndices.begin(); force != atomIndices.end(); ++force)
+        for (vector<CudaArray*>::iterator array = force->begin(); array != force->end(); ++array)
+            delete *array;
+}
+void CudaBondedUtilities::addInteraction(const vector<vector<int> >& atoms, const string& source, int group) {
+    // An interaction with no bonds is not recorded at all.
+    if (atoms.empty())
+        return;
+    forceAtoms.push_back(atoms);
+    forceSource.push_back(source);
+    forceGroup.push_back(group);
+}
+std::string CudaBondedUtilities::addArgument(CUdeviceptr data, const string& type) {
+    arguments.push_back(data);
+    argTypes.push_back(type);
+    // The name is numbered after the push, so the first argument is "customArg1".
+    const string argNumber = context.intToString(arguments.size());
+    return "customArg"+argNumber;
+}
+void CudaBondedUtilities::addPrefixCode(const string& source) {
+    // Append the fragment to the end of the list of prefix code.
+    prefixCode.insert(prefixCode.end(), source);
+}
+/**
+ * Upload the atom indices of every registered interaction to the device and
+ * build the "computeBondedForces" kernel that evaluates all of them.  Does
+ * nothing if no interaction has been added.
+ *
+ * @param system   not read by this method
+ * @throws OpenMMException if the bonds of one interaction do not all involve
+ *                         the same number of atoms
+ */
+void CudaBondedUtilities::initialize(const System& system) {
+    int numForces = forceAtoms.size();
+    if (numForces == 0)
+        return;
+    // Build the lists of atom indices.
+    atomIndices.resize(numForces);
+    for (int i = 0; i < numForces; i++) {
+        int numBonds = forceAtoms[i].size();
+        int numAtoms = forceAtoms[i][0].size();
+        // The packing below indexes every bond with the atom count of the first one,
+        // so reject inconsistent input instead of reading out of bounds.
+        for (int bond = 0; bond < numBonds; bond++)
+            if ((int) forceAtoms[i][bond].size() != numAtoms)
+                throw OpenMMException("CudaBondedUtilities: all bonds in an interaction must have the same number of atoms");
+        int startAtom = 0;
+        while (startAtom < numAtoms) {
+            // Split the atoms of a bond into groups of at most four indices.  A group
+            // of three is widened to four; the extra slot is left at zero.
+            int width = min(numAtoms-startAtom, 4);
+            if (width == 3)
+                width = 4;
+            vector<unsigned int> indexVec(width*numBonds);
+            for (int bond = 0; bond < numBonds; bond++) {
+                for (int atom = 0; atom < width && startAtom+atom < numAtoms; atom++)
+                    indexVec[bond*width+atom] = forceAtoms[i][bond][startAtom+atom];
+            }
+            // NOTE(review): the array is created with unsigned int elements, so the
+            // getElementSize()/4 computed below presumably always yields 1 - confirm
+            // that the kernel really sees `width` indices per bond.
+            CudaArray* indices = CudaArray::create<unsigned int>(indexVec.size(), "bondedIndices");
+            indices->upload(indexVec);
+            atomIndices[i].push_back(indices);
+            startAtom += width;
+        }
+    }
+    // Create the kernel.
+    stringstream s;
+    for (int i = 0; i < (int) prefixCode.size(); i++)
+        s<<prefixCode[i];
+    s<<"extern \"C\" __global__ void computeBondedForces(long* __restrict__ forceBuffer, real* __restrict__ energyBuffer, const real4* __restrict__ posq, int groups";
+    for (int force = 0; force < numForces; force++) {
+        for (int i = 0; i < (int) atomIndices[force].size(); i++) {
+            int indexWidth = atomIndices[force][i]->getElementSize()/4;
+            // CUDA names its vector types uint2/uint4; "unsigned int4" is not a type.
+            string indexType = (indexWidth == 1 ? string("unsigned int") : "uint"+context.intToString(indexWidth));
+            s<<", const "<<indexType<<"* __restrict__ atomIndices"<<force<<"_"<<i;
+        }
+    }
+    // Custom arguments are named customArg1, customArg2, ... in registration order.
+    for (int i = 0; i < (int) arguments.size(); i++)
+        s<<", "<<argTypes[i]<<"* customArg"<<(i+1);
+    s<<") {\n";
+    s<<"real energy = 0;\n";
+    for (int force = 0; force < numForces; force++)
+        s<<createForceSource(force, forceAtoms[force].size(), forceAtoms[force][0].size(), forceGroup[force], forceSource[force]);
+    s<<"energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;\n";
+    s<<"}\n";
+    map<string, string> defines;
+    defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
+    CUmodule module = context.createModule(s.str(), defines);
+    kernel = context.getKernel(module, "computeBondedForces");
+    // The host-side copies are no longer needed once the kernel has been built.
+    forceAtoms.clear();
+    forceSource.clear();
+}
+/**
+ * Generate the section of the kernel that evaluates one interaction.
+ *
+ * The generated code loops over the bonds, loads each atom index into
+ * "atom1", "atom2", ... and each position into "pos1", "pos2", ..., runs the
+ * caller's code, then adds "force1", "force2", ... to the force buffer.
+ *
+ * @param forceIndex    the index of the interaction within atomIndices
+ * @param numBonds      the number of bonds in the interaction
+ * @param numAtoms      the number of atoms in each bond
+ * @param group         the force group of the interaction
+ * @param computeForce  the code that evaluates a single bond
+ * @return the generated source code
+ */
+string CudaBondedUtilities::createForceSource(int forceIndex, int numBonds, int numAtoms, int group, const string& computeForce) {
+    maxBonds = max(maxBonds, numBonds);
+    string suffix1[] = {""};
+    string suffix4[] = {".x", ".y", ".z", ".w"};
+    string* suffix;
+    stringstream s;
+    // Skip the whole interaction unless its force group was requested.
+    s<<"if ((groups&"<<(1<<group)<<") != 0)\n";
+    s<<"for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < "<<numBonds<<"; index += blockDim.x*gridDim.x) {\n";
+    int startAtom = 0;
+    for (int i = 0; i < (int) atomIndices[forceIndex].size(); i++) {
+        int indexWidth = atomIndices[forceIndex][i]->getElementSize()/4;
+        suffix = (indexWidth == 1 ? suffix1 : suffix4);
+        // CUDA names its vector types uint2/uint4; "unsigned int4" is not a type.
+        string indexType = (indexWidth == 1 ? string("unsigned int") : "uint"+context.intToString(indexWidth));
+        s<<"    "<<indexType<<" atoms"<<i<<" = atomIndices"<<forceIndex<<"_"<<i<<"[index];\n";
+        // Atoms and positions are numbered consecutively across index arrays, and
+        // padding slots beyond numAtoms get no variable.
+        for (int j = 0; j < indexWidth && startAtom+j < numAtoms; j++) {
+            int atomNumber = startAtom+j+1;
+            s<<"    unsigned int atom"<<atomNumber<<" = atoms"<<i<<suffix[j]<<";\n";
+            s<<"    real4 pos"<<atomNumber<<" = posq[atom"<<atomNumber<<"];\n";
+        }
+        startAtom += indexWidth;
+    }
+    s<<computeForce<<"\n";
+    // Accumulate the x, y and z components of each atom's force into three
+    // consecutive sections of the buffer, each PADDED_NUM_ATOMS long.
+    // NOTE(review): CUDA's 64-bit atomicAdd takes unsigned long long*, so adding
+    // to a long* may need a cast - confirm against the kernel signature.
+    for (int i = 0; i < numAtoms; i++) {
+        s<<"    atomicAdd(&forceBuffer[atom"<<(i+1)<<"], (long) (force"<<(i+1)<<".x*0xFFFFFFFF));\n";
+        s<<"    atomicAdd(&forceBuffer[atom"<<(i+1)<<"+PADDED_NUM_ATOMS], (long) (force"<<(i+1)<<".y*0xFFFFFFFF));\n";
+        s<<"    atomicAdd(&forceBuffer[atom"<<(i+1)<<"+PADDED_NUM_ATOMS*2], (long) (force"<<(i+1)<<".z*0xFFFFFFFF));\n";
+    }
+    s<<"}\n";
+    return s.str();
+}
+/**
+ * Compute the bonded interactions.  At present this does nothing: the whole
+ * body is commented out, so the kernel built by initialize() is never launched
+ * from here.
+ *
+ * @param groups   a set of bit flags for which force groups to include (currently unused)
+ */
+void CudaBondedUtilities::computeInteractions(int groups) {
+// The disabled code below is written against the OpenCL C++ API (cl::Kernel,
+// cl::Buffer) and refers to forceSets, kernels and bufferIndices, none of
+// which are members of this class.
+//    if (!hasInitializedKernels) {
+//        hasInitializedKernels = true;
+//        for (int i = 0; i < (int) forceSets.size(); i++) {
+//            int index = 0;
+//            cl::Kernel& kernel = kernels[i];
+//            kernel.setArg<cl::Buffer>(index++, context.getForceBuffers().getDeviceBuffer());
+//            kernel.setArg<cl::Buffer>(index++, context.getEnergyBuffer().getDeviceBuffer());
+//            kernel.setArg<cl::Buffer>(index++, context.getPosq().getDeviceBuffer());
+//            index++;
+//            for (int j = 0; j < (int) forceSets[i].size(); j++) {
+//                kernel.setArg<cl::Buffer>(index++, atomIndices[forceSets[i][j]]->getDeviceBuffer());
+//                kernel.setArg<cl::Buffer>(index++, bufferIndices[forceSets[i][j]]->getDeviceBuffer());
+//            }
+//            for (int j = 0; j < (int) arguments.size(); j++)
+//                kernel.setArg<cl::Memory>(index++, *arguments[j]);
+//        }
+//    }
+//    for (int i = 0; i < (int) kernels.size(); i++) {
+//        kernels[i].setArg<cl_int>(3, groups);
+//        context.executeKernel(kernels[i], maxBonds);
+//    }
+}
--- a/platforms/cuda2/src/CudaBondedUtilities.h
+++ b/platforms/cuda2/src/CudaBondedUtilities.h
+#ifndef OPENMM_CUDABONDEDUTILITIES_H_
+#define OPENMM_CUDABONDEDUTILITIES_H_
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2011-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "CudaArray.h"
+#include "CudaContext.h"
+#include "openmm/System.h"
+#include <string>
+#include <vector>
+namespace OpenMM {
+/**
+ * This class provides a generic mechanism for evaluating bonded interactions.  You write only
+ * the source code needed to compute one interaction, and this class takes care of creating
+ * and executing a complete kernel that loops over bonds, evaluates each one, and accumulates
+ * the resulting forces and energies.  This offers two advantages.  First, it simplifies the
+ * task of writing a new Force.  Second, it allows multiple forces to be evaluated by a single
+ * kernel, which reduces overhead and improves performance.
+ * 
+ * A "bonded interaction" means an interaction that affects a small, fixed set of particles.
+ * The interaction energy may depend on the positions of only those particles, and the list of
+ * particles forming a "bond" may not change with time.  Examples of bonded interactions
+ * include HarmonicBondForce, HarmonicAngleForce, and PeriodicTorsionForce.
+ * 
+ * To create a bonded interaction, call addInteraction().  You pass to it a block of source
+ * code for evaluating the interaction.  The inputs and outputs for that source code are as
+ * follows:
+ * 
+ * <ol>
+ * <li>The index of the bond being evaluated will have been stored in the unsigned int variable "index".</li>
+ * <li>The indices of the atoms forming that bond will have been stored in the unsigned int variables "atom1",
+ * "atom2", ....</li>
+ * <li>The positions of those atoms will have been stored in the real4 variables "pos1", "pos2", ....</li>
+ * <li>A real variable called "energy" will exist.  Your code should add the potential energy of the
+ * bond to that variable.</li>
+ * <li>Your code should define real4 variables called "force1", "force2", ... that contain the force to
+ * apply to each atom.</li>
+ * </ol>
+ * 
+ * As a simple example, the following source code would be used to implement a pairwise interaction of
+ * the form E=r^2:
+ * 
+ * <tt><pre>
+ * real4 delta = pos2-pos1;
+ * energy += delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+ * real4 force1 = 2.0f*delta;
+ * real4 force2 = -2.0f*delta;
+ * </pre></tt>
+ * 
+ * Interactions will often depend on parameters or other data.  Call addArgument() to provide the data
+ * to this class.  It will be passed to the interaction kernel as an argument, and you can refer to it
+ * from your interaction code.
+ */
+class OPENMM_EXPORT CudaBondedUtilities {
+public:
+    /**
+     * Create a CudaBondedUtilities for a context.
+     *
+     * @param context   the context this object generates and compiles its kernel for
+     */
+    CudaBondedUtilities(CudaContext& context);
+    /**
+     * Delete the arrays of atom indices owned by this object.
+     */
+    ~CudaBondedUtilities();
+    /**
+     * Add a bonded interaction.  If atoms is empty, the call is ignored.
+     *
+     * @param atoms    this should have one entry for each bond, and that entry should contain the list
+     *                 of atoms involved in the bond.  Every entry must have the same number of atoms.
+     * @param source   the code to evaluate the interaction
+     * @param group    the force group in which the interaction should be calculated
+     */
+    void addInteraction(const std::vector<std::vector<int> >& atoms, const std::string& source, int group);
+    /**
+     * Add an argument that should be passed to the interaction kernel.
+     *
+     * @param data    the device memory containing the data to pass
+     * @param type    the data type contained in the memory (e.g. "float4")
+     * @return the name that will be used for the argument.  Any code you pass to addInteraction() should
+     * refer to it by this name.
+     */
+    std::string addArgument(CUdeviceptr data, const std::string& type);
+    /**
+     * Add some CUDA code that should be included in the program, before the start of the kernel.
+     * This can be used, for example, to define functions that will be called by the kernel.
+     *
+     * @param source   the code to include
+     */
+    void addPrefixCode(const std::string& source);
+    /**
+     * Initialize this object in preparation for a simulation.
+     *
+     * @param system   the System being simulated
+     */
+    void initialize(const System& system);
+    /**
+     * Compute the bonded interactions.
+     *
+     * @param groups        a set of bit flags for which force groups to include
+     */
+    void computeInteractions(int groups);
+private:
+    /**
+     * Generate the section of the kernel source that evaluates a single interaction.
+     *
+     * @param forceIndex    the index of the interaction
+     * @param numBonds      the number of bonds in the interaction
+     * @param numAtoms      the number of atoms in each bond
+     * @param group         the force group of the interaction
+     * @param computeForce  the code that evaluates one bond
+     * @return the generated source code
+     */
+    std::string createForceSource(int forceIndex, int numBonds, int numAtoms, int group, const std::string& computeForce);
+    // The context passed to the constructor.
+    CudaContext& context;
+    // The compiled kernel; set by initialize().
+    CUfunction kernel;
+    // Atoms of every interaction, indexed as [interaction][bond][atom].
+    std::vector<std::vector<std::vector<int> > > forceAtoms;
+    // NOTE(review): appears to be unused by CudaBondedUtilities.cpp - confirm before relying on it.
+    std::vector<std::vector<int> > indexWidth;
+    // Source code of every interaction, parallel to forceAtoms.
+    std::vector<std::string> forceSource;
+    // Force group of every interaction, parallel to forceAtoms.
+    std::vector<int> forceGroup;
+    // Device memory and element type of every argument registered with addArgument().
+    std::vector<CUdeviceptr> arguments;
+    std::vector<std::string> argTypes;
+    // Device arrays of atom indices for every interaction; deleted by the destructor.
+    std::vector<std::vector<CudaArray*> > atomIndices;
+    // Code fragments registered with addPrefixCode().
+    std::vector<std::string> prefixCode;
+    // numForceBuffers is set to 0 by the constructor; maxBonds is the largest bond count of any interaction.
+    int numForceBuffers, maxBonds;
+    // Set to false by the constructor.
+    bool hasInitializedKernels;
+};
+} // namespace OpenMM
+#endif /*OPENMM_CUDABONDEDUTILITIES_H_*/
--- a/platforms/cuda2/src/CudaContext.cpp
+++ b/platforms/cuda2/src/CudaContext.cpp
@@ -32,7 +32,7 @@
 #include "CudaArray.h"
 //#include "CudaBondedUtilities.h"
 #include "CudaForceInfo.h"
-//#include "CudaIntegrationUtilities.h"
+#include "CudaIntegrationUtilities.h"
 #include "CudaKernelSources.h"
 //#include "CudaNonbondedUtilities.h"
 #include "hilbert.h"
@@ -40,6 +40,7 @@
 #include "openmm/Platform.h"
 #include "openmm/System.h"
 #include "openmm/VirtualSite.h"
+#include "CudaExpressionUtilities.h"
 #include <algorithm>
 #include <cstdlib>
 #include <fstream>
@@ -66,8 +67,8 @@ bool CudaContext::hasInitializedCuda = false;
 CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler,
        const string& tempDir, CudaPlatform::PlatformData& platformData) : system(system), compiler(compiler),
        time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), contextIsValid(false), atomsWereReordered(false), pinnedBuffer(NULL), posq(NULL),
-        velm(NULL), /*forceBuffers(NULL), longForceBuffer(NULL), energyBuffer(NULL), atomIndex(NULL), integration(NULL),
+        velm(NULL), /*forceBuffers(NULL), longForceBuffer(NULL), energyBuffer(NULL), atomIndex(NULL),*/ integration(NULL), expression(NULL),
-        bonded(NULL), nonbonded(NULL),*/ thread(NULL) {
+        /*bonded(NULL), nonbonded(NULL),*/ thread(NULL) {
    if (!hasInitializedCuda) {
        CHECK_RESULT2(cuInit(0), "Error initializing CUDA");
        hasInitializedCuda = true;
@@ -143,11 +144,17 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, paddedNumAtoms*sizeof(double4), 0));
        posq = CudaArray::create<double4>(paddedNumAtoms, "posq");
        velm = CudaArray::create<double4>(paddedNumAtoms, "velm");
+        compilationDefines["make_real2"] = "make_double2";
+        compilationDefines["make_real3"] = "make_double3";
+        compilationDefines["make_real4"] = "make_double4";
    }
    else {
        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, paddedNumAtoms*sizeof(float4), 0));
        posq = CudaArray::create<float4>(paddedNumAtoms, "posq");
        velm = CudaArray::create<float4>(paddedNumAtoms, "velm");
+        compilationDefines["make_real2"] = "make_float2";
+        compilationDefines["make_real3"] = "make_float3";
+        compilationDefines["make_real4"] = "make_float4";
    }
    posCellOffsets.resize(paddedNumAtoms, make_int4(0, 0, 0, 0));
@@ -160,8 +167,6 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    clearFourBuffersKernel = getKernel(utilities, "clearFourBuffers");
    clearFiveBuffersKernel = getKernel(utilities, "clearFiveBuffers");
    clearSixBuffersKernel = getKernel(utilities, "clearSixBuffers");
-    reduceFloat4Kernel = getKernel(utilities, "reduceFloat4Buffer");
-    reduceForcesKernel = getKernel(utilities, "reduceForces");
    // Set defines based on the requested precision.
@@ -170,14 +175,21 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    compilationDefines["RECIP"] = useDoublePrecision ? "1.0/" : "1.0f/";
    compilationDefines["EXP"] = useDoublePrecision ? "exp" : "expf";
    compilationDefines["LOG"] = useDoublePrecision ? "log" : "logf";
+    compilationDefines["COS"] = useDoublePrecision ? "cos" : "cosf";
+    compilationDefines["SIN"] = useDoublePrecision ? "sin" : "sinf";
+    compilationDefines["TAN"] = useDoublePrecision ? "tan" : "tanf";
+    compilationDefines["ACOS"] = useDoublePrecision ? "acos" : "acosf";
+    compilationDefines["ASIN"] = useDoublePrecision ? "asin" : "asinf";
+    compilationDefines["ATAN"] = useDoublePrecision ? "atan" : "atanf";
    // Create the work thread used for parallelization when running on multiple devices.
    thread = new WorkThread();
-//    
-//    // Create the integration utilities object.
+    // Create utilities objects.
-//    
-//    integration = new CudaIntegrationUtilities(*this, system);
+    integration = new CudaIntegrationUtilities(*this, system);
+    expression = new CudaExpressionUtilities(*this);
 }
 CudaContext::~CudaContext() {
@@ -201,8 +213,10 @@ CudaContext::~CudaContext() {
 //        delete energyBuffer;
 //    if (atomIndex != NULL)
 //        delete atomIndex;
-//    if (integration != NULL)
+    if (integration != NULL)
-//        delete integration;
+        delete integration;
+    if (expression != NULL)
+        delete expression;
 //    if (bonded != NULL)
 //        delete bonded;
 //    if (nonbonded != NULL)
@@ -272,6 +286,18 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
    }
    if (!compilationDefines.empty())
        src << endl;
+    if (useDoublePrecision) {
+        src << "typedef double real;\n";
+        src << "typedef double2 real2;\n";
+        src << "typedef double3 real3;\n";
+        src << "typedef double4 real4;\n";
+    }
+    else {
+        src << "typedef float real;\n";
+        src << "typedef float2 real2;\n";
+        src << "typedef float3 real3;\n";
+        src << "typedef float4 real4;\n";
+    }
    for (map<string, string>::const_iterator iter = defines.begin(); iter != defines.end(); ++iter) {
        src << "#define " << iter->first;
        if (!iter->second.empty())
@@ -498,22 +524,7 @@ void CudaContext::addAutoclearBuffer(CUdeviceptr memory, int size) {
 //        clearBuffer(*autoclearBuffers[base], autoclearBufferSizes[base]);
 //    }
 //}
-//
-//void CudaContext::reduceForces() {
-//    if (supports64BitGlobalAtomics)
-//        executeKernel(reduceForcesKernel, paddedNumAtoms, 128);
-//    else
-//        reduceBuffer(*forceBuffers, numForceBuffers);
-//}
-//
-//void CudaContext::reduceBuffer(CudaArray<mm_float4>& array, int numBuffers) {
-//    int bufferSize = array.getSize()/numBuffers;
-//    reduceFloat4Kernel.setArg<cl::Buffer>(0, array.getDeviceBuffer());
-//    reduceFloat4Kernel.setArg<cl_int>(1, bufferSize);
-//    reduceFloat4Kernel.setArg<cl_int>(2, numBuffers);
-//    executeKernel(reduceFloat4Kernel, bufferSize, 128);
-//}
-//
 void CudaContext::tagAtomsInMolecule(int atom, int molecule, vector<int>& atomMolecule, vector<vector<int> >& atomBonds) {
    // Recursively tag atoms as belonging to a particular molecule.

--- a/platforms/cuda2/src/CudaContext.h
+++ b/platforms/cuda2/src/CudaContext.h
@@ -46,6 +46,7 @@ namespace OpenMM {
 class CudaArray;
 class CudaForceInfo;
+class CudaExpressionUtilities;
 class CudaIntegrationUtilities;
 class CudaBondedUtilities;
 class CudaNonbondedUtilities;
@@ -216,25 +217,13 @@ public:
     * Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
     *
     * @param memory     the memory to clear
-     * @param size       the number of float/double elements in the buffer
+     * @param size       the number of 4-byte elements in the buffer
     */
    void addAutoclearBuffer(CUdeviceptr memory, int size);
 //    /**
 //     * Clear all buffers that have been registered with addAutoclearBuffer().
 //     */
 //    void clearAutoclearBuffers();
-//    /**
-//     * Given a collection of buffers packed into an array, sum them and store
-//     * the sum in the first buffer.
-//     *
-//     * @param array       the array containing the buffers to reduce
-//     * @param numBuffers  the number of buffers packed into the array
-//     */
-//    void reduceBuffer(CudaArray<mm_float4>& array, int numBuffers);
-//    /**
-//     * Sum the buffesr containing forces.
-//     */
-//    void reduceForces();
    /**
     * Get the current simulation time.
     */
@@ -341,12 +330,18 @@ public:
 //    float4 getInvPeriodicBoxSize() const {
 //        return invPeriodicBoxSize;
 //    }
-//    /**
+    /**
-//     * Get the CudaIntegrationUtilities for this context.
+     * Get the CudaIntegrationUtilities for this context.
-//     */
+     */
-//    CudaIntegrationUtilities& getIntegrationUtilities() {
+    CudaIntegrationUtilities& getIntegrationUtilities() {
-//        return *integration;
+        return *integration;
-//    }
+    }
+    /**
+     * Get the CudaExpressionUtilities for this context.
+     */
+    CudaExpressionUtilities& getExpressionUtilities() {
+        return *expression;
+    }
 //    /**
 //     * Get the CudaBondedUtilities for this context.
 //     */
@@ -445,8 +440,6 @@ private:
    CUfunction clearFourBuffersKernel;
    CUfunction clearFiveBuffersKernel;
    CUfunction clearSixBuffersKernel;
-    CUfunction reduceFloat4Kernel;
-    CUfunction reduceForcesKernel;
    std::vector<CudaForceInfo*> forces;
    std::vector<Molecule> molecules;
    std::vector<MoleculeGroup> moleculeGroups;
@@ -461,7 +454,8 @@ private:
    std::vector<CUdeviceptr> autoclearBuffers;
    std::vector<int> autoclearBufferSizes;
    std::vector<ReorderListener*> reorderListeners;
-//    CudaIntegrationUtilities* integration;
+    CudaIntegrationUtilities* integration;
+    CudaExpressionUtilities* expression;
 //    CudaBondedUtilities* bonded;
 //    CudaNonbondedUtilities* nonbonded;
    WorkThread* thread;

--- a/platforms/cuda2/src/CudaExpressionUtilities.cpp
+++ b/platforms/cuda2/src/CudaExpressionUtilities.cpp
@@ -33,19 +33,6 @@ using namespace OpenMM;
 using namespace Lepton;
 using namespace std;
-string CudaExpressionUtilities::doubleToString(double value) {
-    stringstream s;
-    s.precision(8);
-    s << scientific << value << "f";
-    return s.str();
-}
-string CudaExpressionUtilities::intToString(int value) {
-    stringstream s;
-    s << value;
-    return s.str();
-}
 string CudaExpressionUtilities::createExpressions(const map<string, ParsedExpression>& expressions, const map<string, string>& variables,
        const vector<pair<string, string> >& functions, const string& prefix, const string& functionParams, const string& tempType) {
    vector<pair<ExpressionTreeNode, string> > variableNodes;
@@ -75,13 +62,13 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
            return;
    for (int i = 0; i < (int) node.getChildren().size(); i++)
        processExpression(out, node.getChildren()[i], temps, functions, prefix, functionParams, allExpressions, tempType);
-    string name = prefix+intToString(temps.size());
+    string name = prefix+context.intToString(temps.size());
    bool hasRecordedNode = false;
    out << tempType << " " << name << " = ";
    switch (node.getOperation().getId()) {
        case Operation::CONSTANT:
-            out << doubleToString(dynamic_cast<const Operation::Constant*>(&node.getOperation())->getValue());
+            out << context.doubleToString(dynamic_cast<const Operation::Constant*>(&node.getOperation())->getValue());
            break;
        case Operation::VARIABLE:
            throw OpenMMException("Unknown variable in expression: "+node.getOperation().getName());
@@ -107,7 +94,7 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
            string valueName = name;
            string derivName = name;
            if (valueNode != NULL && derivNode != NULL) {
-                string name2 = prefix+intToString(temps.size());
+                string name2 = prefix+context.intToString(temps.size());
                out << tempType << " " << name2 << " = 0.0f;\n";
                if (isDeriv) {
                    valueName = name2;
@@ -120,14 +107,14 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
            }
            out << "{\n";
            out << "float4 params = " << functionParams << "[" << i << "];\n";
-            out << "float x = " << getTempName(node.getChildren()[0], temps) << ";\n";
+            out << "real x = " << getTempName(node.getChildren()[0], temps) << ";\n";
            out << "if (x >= params.x && x <= params.y) {\n";
            out << "x = (x-params.x)*params.z;\n";
            out << "int index = (int) (floor(x));\n";
            out << "index = min(index, (int) params.w);\n";
            out << "float4 coeff = " << functions[i].second << "[index];\n";
-            out << "float b = x-index;\n";
+            out << "real b = x-index;\n";
-            out << "float a = 1.0f-b;\n";
+            out << "real a = 1.0f-b;\n";
            if (valueNode != NULL)
                out << valueName << " = a*coeff.x+b*coeff.y+((a*a*a-a)*coeff.z+(b*b*b-b)*coeff.w)/(params.z*params.z);\n";
            if (derivNode != NULL)
@@ -164,7 +151,7 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
            out << "-" << getTempName(node.getChildren()[0], temps);
            break;
        case Operation::SQRT:
-            out << "sqrt(" << getTempName(node.getChildren()[0], temps) << ")";
+            out << "SQRT(" << getTempName(node.getChildren()[0], temps) << ")";
            break;
        case Operation::EXP:
            out << "EXP(" << getTempName(node.getChildren()[0], temps) << ")";
@@ -173,31 +160,31 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
            out << "LOG(" << getTempName(node.getChildren()[0], temps) << ")";
            break;
        case Operation::SIN:
-            out << "sin(" << getTempName(node.getChildren()[0], temps) << ")";
+            out << "SIN(" << getTempName(node.getChildren()[0], temps) << ")";
            break;
        case Operation::COS:
-            out << "cos(" << getTempName(node.getChildren()[0], temps) << ")";
+            out << "COS(" << getTempName(node.getChildren()[0], temps) << ")";
            break;
        case Operation::SEC:
-            out << "1.0f/cos(" << getTempName(node.getChildren()[0], temps) << ")";
+            out << "RECIP(COS(" << getTempName(node.getChildren()[0], temps) << "))";
            break;
        case Operation::CSC:
-            out << "1.0f/sin(" << getTempName(node.getChildren()[0], temps) << ")";
+            out << "RECIP(SIN(" << getTempName(node.getChildren()[0], temps) << "))";
            break;
        case Operation::TAN:
-            out << "tan(" << getTempName(node.getChildren()[0], temps) << ")";
+            out << "TAN(" << getTempName(node.getChildren()[0], temps) << ")";
            break;
        case Operation::COT:
-            out << "1.0f/tan(" << getTempName(node.getChildren()[0], temps) << ")";
+            out << "RECIP(TAN(" << getTempName(node.getChildren()[0], temps) << "))";
            break;
        case Operation::ASIN:
-            out << "asin(" << getTempName(node.getChildren()[0], temps) << ")";
+            out << "ASIN(" << getTempName(node.getChildren()[0], temps) << ")";
            break;
        case Operation::ACOS:
-            out << "acos(" << getTempName(node.getChildren()[0], temps) << ")";
+            out << "ACOS(" << getTempName(node.getChildren()[0], temps) << ")";
            break;
        case Operation::ATAN:
-            out << "atan(" << getTempName(node.getChildren()[0], temps) << ")";
+            out << "ATAN(" << getTempName(node.getChildren()[0], temps) << ")";
            break;
        case Operation::SINH:
            out << "sinh(" << getTempName(node.getChildren()[0], temps) << ")";
@@ -236,10 +223,10 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
            out << "RECIP(" << getTempName(node.getChildren()[0], temps) << ")";
            break;
        case Operation::ADD_CONSTANT:
-            out << doubleToString(dynamic_cast<const Operation::AddConstant*>(&node.getOperation())->getValue()) << "+" << getTempName(node.getChildren()[0], temps);
+            out << context.doubleToString(dynamic_cast<const Operation::AddConstant*>(&node.getOperation())->getValue()) << "+" << getTempName(node.getChildren()[0], temps);
            break;
        case Operation::MULTIPLY_CONSTANT:
-            out << doubleToString(dynamic_cast<const Operation::MultiplyConstant*>(&node.getOperation())->getValue()) << "*" << getTempName(node.getChildren()[0], temps);
+            out << context.doubleToString(dynamic_cast<const Operation::MultiplyConstant*>(&node.getOperation())->getValue()) << "*" << getTempName(node.getChildren()[0], temps);
            break;
        case Operation::POWER_CONSTANT:
        {
@@ -266,14 +253,14 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
                for (map<int, const ExpressionTreeNode*>::const_iterator iter = powers.begin(); iter != powers.end(); ++iter) {
                    if (iter->first != exponent) {
                        exponents.push_back(iter->first >= 0 ? iter->first : -iter->first);
-                        string name2 = prefix+intToString(temps.size());
+                        string name2 = prefix+context.intToString(temps.size());
                        names.push_back(name2);
                        temps.push_back(make_pair(*iter->second, name2));
                        out << tempType << " " << name2 << " = 0.0f;\n";
                    }
                }
                out << "{\n";
-                out << "float multiplier = " << (exponent < 0.0 ? "1.0f/" : "") << getTempName(node.getChildren()[0], temps) << ";\n";
+                out << "real multiplier = " << (exponent < 0.0 ? "RECIP(" : "(") << getTempName(node.getChildren()[0], temps) << ");\n";
                bool done = false;
                while (!done) {
                    done = true;
@@ -295,7 +282,7 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
                out << "}";
            }
            else
-                out << "pow(" << getTempName(node.getChildren()[0], temps) << ", " << doubleToString(exponent) << ")";
+                out << "pow(" << getTempName(node.getChildren()[0], temps) << ", " << context.doubleToString(exponent) << ")";
            break;
        }
        case Operation::MIN:

--- a/platforms/cuda2/src/CudaExpressionUtilities.h
+++ b/platforms/cuda2/src/CudaExpressionUtilities.h
@@ -45,6 +45,8 @@ namespace OpenMM {
 class OPENMM_EXPORT CudaExpressionUtilities {
 public:
+    CudaExpressionUtilities(CudaContext& context) : context(context) {
+    }
    /**
     * Generate the source code for calculating a set of expressions.
     *
@@ -54,10 +56,10 @@ public:
     * @param functions      defines the variable name for each tabulated function that may appear in the expressions
     * @param prefix         a prefix to put in front of temporary variables
     * @param functionParams the variable name containing the parameters for each tabulated function
-     * @param tempType       the type of value to use for temporary variables (defaults to "float")
+     * @param tempType       the type of value to use for temporary variables (defaults to "real")
     */
-    static std::string createExpressions(const std::map<std::string, Lepton::ParsedExpression>& expressions, const std::map<std::string, std::string>& variables,
+    std::string createExpressions(const std::map<std::string, Lepton::ParsedExpression>& expressions, const std::map<std::string, std::string>& variables,
-            const std::vector<std::pair<std::string, std::string> >& functions, const std::string& prefix, const std::string& functionParams, const std::string& tempType="float");
+            const std::vector<std::pair<std::string, std::string> >& functions, const std::string& prefix, const std::string& functionParams, const std::string& tempType="real");
    /**
     * Generate the source code for calculating a set of expressions.
     *
@@ -67,10 +69,10 @@ public:
     * @param functions      defines the variable name for each tabulated function that may appear in the expressions
     * @param prefix         a prefix to put in front of temporary variables
     * @param functionParams the variable name containing the parameters for each tabulated function
-     * @param tempType       the type of value to use for temporary variables (defaults to "float")
+     * @param tempType       the type of value to use for temporary variables (defaults to "real")
     */
-    static std::string createExpressions(const std::map<std::string, Lepton::ParsedExpression>& expressions, const std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& variables,
+    std::string createExpressions(const std::map<std::string, Lepton::ParsedExpression>& expressions, const std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& variables,
-            const std::vector<std::pair<std::string, std::string> >& functions, const std::string& prefix, const std::string& functionParams, const std::string& tempType="float");
+            const std::vector<std::pair<std::string, std::string> >& functions, const std::string& prefix, const std::string& functionParams, const std::string& tempType="real");
    /**
     * Calculate the spline coefficients for a tabulated function that appears in expressions.
     *
@@ -79,26 +81,19 @@ public:
     * @param max            the value of the independent variable corresponding to the last element of values
     * @return the spline coefficients
     */
-    static std::vector<float4> computeFunctionCoefficients(const std::vector<double>& values, double min, double max);
+    std::vector<float4> computeFunctionCoefficients(const std::vector<double>& values, double min, double max);
-    /**
-     * Convert a number to a string in a format suitable for including in a kernel.
-     */
-    static std::string doubleToString(double value);
-    /**
-     * Convert a number to a string in a format suitable for including in a kernel.
-     */
-    static std::string intToString(int value);
    class FunctionPlaceholder;
 private:
-    static void processExpression(std::stringstream& out, const Lepton::ExpressionTreeNode& node,
+    void processExpression(std::stringstream& out, const Lepton::ExpressionTreeNode& node,
            std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& temps,
            const std::vector<std::pair<std::string, std::string> >& functions, const std::string& prefix, const std::string& functionParams,
            const std::vector<Lepton::ParsedExpression>& allExpressions, const std::string& tempType);
-    static std::string getTempName(const Lepton::ExpressionTreeNode& node, const std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& temps);
+    std::string getTempName(const Lepton::ExpressionTreeNode& node, const std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& temps);
-    static void findRelatedTabulatedFunctions(const Lepton::ExpressionTreeNode& node, const Lepton::ExpressionTreeNode& searchNode,
+    void findRelatedTabulatedFunctions(const Lepton::ExpressionTreeNode& node, const Lepton::ExpressionTreeNode& searchNode,
            const Lepton::ExpressionTreeNode*& valueNode, const Lepton::ExpressionTreeNode*& derivNode);
-    static void findRelatedPowers(const Lepton::ExpressionTreeNode& node, const Lepton::ExpressionTreeNode& searchNode,
+    void findRelatedPowers(const Lepton::ExpressionTreeNode& node, const Lepton::ExpressionTreeNode& searchNode,
            std::map<int, const Lepton::ExpressionTreeNode*>& powers);
+    CudaContext& context;
 };
 /**

--- a/platforms/cuda2/src/CudaIntegrationUtilities.cpp
+++ b/platforms/cuda2/src/CudaIntegrationUtilities.cpp
--- a/platforms/cuda2/src/CudaIntegrationUtilities.h
+++ b/platforms/cuda2/src/CudaIntegrationUtilities.h
+#ifndef OPENMM_CUDAINTEGRATIONUTILITIES_H_
+#define OPENMM_CUDAINTEGRATIONUTILITIES_H_
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "openmm/System.h"
+#include "CudaContext.h"
+#include "openmm/internal/windowsExport.h"
+#include <iosfwd>
+namespace OpenMM {
+/**
+ * This class implements features that are used by many different integrators, including
+ * common workspace arrays, random number generation, and enforcing constraints.
+ */
+class OPENMM_EXPORT CudaIntegrationUtilities {
+public:
+    CudaIntegrationUtilities(CudaContext& context, const System& system); // defined outside this header; the class keeps a CudaContext& member (see 'context' below)
+    ~CudaIntegrationUtilities(); // defined outside this header; NOTE(review): presumably releases the CudaArray* members - confirm in the .cpp
+    /**
+     * Get the array which contains position deltas.
+     *
+     * This dereferences the posDelta pointer without a null check, so the pointer must
+     * already refer to a valid array when this is called.
+     */
+    CudaArray& getPosDelta() {
+        return *posDelta;
+    }
+    /**
+     * Get the array which contains random values.  Each element is a float4, whose components
+     * are independent, normally distributed random numbers with mean 0 and variance 1.
+     *
+     * This dereferences the random pointer without a null check, so the pointer must
+     * already refer to a valid array when this is called.
+     */
+    CudaArray& getRandom() {
+        return *random;
+    }
+    /**
+     * Get the array which contains the current step size.
+     *
+     * This dereferences the stepSize pointer without a null check, so the pointer must
+     * already refer to a valid array when this is called.
+     */
+    CudaArray& getStepSize() {
+        return *stepSize;
+    }
+    /**
+     * Apply constraints to the atom positions.
+     *
+     * @param tol             the constraint tolerance
+     */
+    void applyConstraints(double tol);
+    /**
+     * Apply constraints to the atom velocities.
+     *
+     * @param tol             the constraint tolerance
+     */
+    void applyVelocityConstraints(double tol);
+    /**
+     * Initialize the random number generator.
+     *
+     * @param randomNumberSeed    the seed value to initialize the generator with
+     */
+    void initRandomNumberGenerator(unsigned int randomNumberSeed);
+    /**
+     * Ensure that sufficient random numbers are available in the array, and generate new ones if not.
+     *
+     * @param numValues     the number of random float4's that will be required
+     * @return the index in the array at which to start reading
+     */
+    int prepareRandomNumbers(int numValues);
+    /**
+     * Compute the positions of virtual sites.
+     */
+    void computeVirtualSites();
+    /**
+     * Distribute forces from virtual sites to the atoms they are based on.
+     */
+    void distributeForcesFromVirtualSites();
+    /**
+     * Create a checkpoint recording the current state of the random number generator.
+     * 
+     * @param stream    an output stream the checkpoint data should be written to
+     */
+    void createCheckpoint(std::ostream& stream);
+    /**
+     * Load a checkpoint that was written by createCheckpoint().
+     * 
+     * @param stream    an input stream the checkpoint data should be read from
+     */
+    void loadCheckpoint(std::istream& stream);
+private:
+    void applyConstraints(bool constrainVelocities, double tol); // private overload with a velocity/position flag; NOTE(review): presumably the shared implementation behind the two public methods - confirm in the .cpp
+    CudaContext& context; // reference member: must be bound in the constructor and cannot be reseated afterwards
+    CUfunction settlePosKernel, settleVelKernel; // kernel handles, named per constraint algorithm and per position/velocity variant
+    CUfunction shakePosKernel, shakeVelKernel;
+    CUfunction ccmaDirectionsKernel;
+    CUfunction ccmaPosForceKernel, ccmaVelForceKernel;
+    CUfunction ccmaMultiplyKernel;
+    CUfunction ccmaPosUpdateKernel, ccmaVelUpdateKernel;
+    CUfunction vsitePositionKernel, vsiteForceKernel;
+    CUfunction randomKernel;
+    CudaArray* posDelta; // returned by getPosDelta()
+    CudaArray* settleAtoms;
+    CudaArray* settleParams;
+    CudaArray* shakeAtoms;
+    CudaArray* shakeParams;
+    CudaArray* random; // returned by getRandom()
+    CudaArray* randomSeed;
+    CudaArray* stepSize; // returned by getStepSize()
+    CudaArray* ccmaAtoms;
+    CudaArray* ccmaDistance;
+    CudaArray* ccmaReducedMass;
+    CudaArray* ccmaAtomConstraints;
+    CudaArray* ccmaNumAtomConstraints;
+    CudaArray* ccmaConstraintMatrixColumn;
+    CudaArray* ccmaConstraintMatrixValue;
+    CudaArray* ccmaDelta1;
+    CudaArray* ccmaDelta2;
+    CudaArray* ccmaConverged;
+    int* ccmaConvergedMemory; // raw int pointer rather than a CudaArray; NOTE(review): allocation and ownership are not visible in this header - confirm in the .cpp
+    CudaArray* vsite2AvgAtoms;
+    CudaArray* vsite2AvgWeights;
+    CudaArray* vsite3AvgAtoms;
+    CudaArray* vsite3AvgWeights;
+    CudaArray* vsiteOutOfPlaneAtoms;
+    CudaArray* vsiteOutOfPlaneWeights;
+    int randomPos; // NOTE(review): presumably the read position within 'random' that prepareRandomNumbers() hands out - confirm in the .cpp
+    int lastSeed, numVsites; // note: lastSeed is a signed int while initRandomNumberGenerator() takes an unsigned int seed
+    bool hasInitializedPosConstraintKernels, hasInitializedVelConstraintKernels;
+    struct ShakeCluster; // forward declarations only; the definitions are not in this header
+    struct ConstraintOrderer;
+};
+} // namespace OpenMM
+#endif /*OPENMM_CUDAINTEGRATIONUTILITIES_H_*/
--- a/platforms/cuda2/src/kernels/random.cu
+++ b/platforms/cuda2/src/kernels/random.cu
+/**
+ * Generate random numbers
+ *
+ * Each thread loads its own generator state from seed[] at its global thread index, then
+ * walks over random[] in a grid-stride loop (starting at its global index and stepping by
+ * blockDim.x*gridDim.x) until the index reaches numValues.  Every iteration writes one
+ * float4.  Each of the four components is built from two successive generator outputs
+ * u1 and u2 as sqrt(-2*log(u1)) * cos(2*pi*u2), i.e. the cosine form of the Box-Muller
+ * transform.
+ *
+ * The state has three parts: state.x is advanced by a linear congruential step,
+ * state.y by three xor-shift steps, and state.z/state.w by an additive step that also
+ * uses a local carry.  One output is the sum state.x + state.y + state.w.
+ * NOTE(review): this combination resembles Marsaglia's KISS generator - confirm.
+ *
+ * seed[] is read and written at the global thread index regardless of numValues, so it
+ * must hold at least one element per launched thread.  The carry is reset to 0 on every
+ * launch; it is not part of the saved state.
+ *
+ * @param numValues    the number of float4 elements of random[] to fill
+ * @param random       the output array of generated values
+ * @param seed         the per-thread generator state, read at the start and written back at the end
+ */
+extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restrict__ random, uint4* __restrict__ seed) {
+    int index = blockIdx.x*blockDim.x+threadIdx.x; // global thread index
+    uint4 state = seed[index]; // read before any bounds check: seed needs an entry for every launched thread
+    unsigned int carry = 0; // restarts at 0 on every launch; never stored in seed
+    while (index < numValues) {
+        float4 value;
+        // Generate first value.
+        state.x = state.x * 69069 + 1; // state.x: linear congruential step
+        state.y ^= state.y << 13; // state.y: xor-shift steps with shifts 13, 17, 5
+        state.y ^= state.y >> 17;
+        state.y ^= state.y << 5;
+        unsigned int k = (state.z >> 2) + (state.w >> 3) + (carry >> 2); // k is only used to derive the next carry
+        unsigned int m = state.w + state.w + state.z + carry; // m = 2*w + z + carry becomes the new state.w
+        state.z = state.w;
+        state.w = m;
+        carry = k >> 30;
+        float x1 = (float)max(state.x + state.y + state.w, 0x00000001u) / (float)0xffffffff; // max(..., 1) keeps x1 above zero, so log(x1) below stays finite
+        state.x = state.x * 69069 + 1; // the state.x/state.y part of the next draw starts here, before x1 is transformed
+        state.y ^= state.y << 13;
+        state.y ^= state.y >> 17;
+        state.y ^= state.y << 5;
+        x1 = sqrt(-2.0f * log(x1)); // x1 now holds the radius term sqrt(-2*log(u1))
+        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
+        m = state.w + state.w + state.z + carry;
+        state.z = state.w;
+        state.w = m;
+        carry = k >> 30;
+        float x2 = (float)(state.x + state.y + state.w) / (float)0xffffffff; // second draw is not clamped; it is only used as an angle
+        value.x = x1 * cos(2.0f * 3.14159265f * x2); // radius * cos(2*pi*u2)
+        // Generate second value.
+        state.x = state.x * 69069 + 1;
+        state.y ^= state.y << 13;
+        state.y ^= state.y >> 17;
+        state.y ^= state.y << 5;
+        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
+        m = state.w + state.w + state.z + carry;
+        state.z = state.w;
+        state.w = m;
+        carry = k >> 30;
+        float x3 = (float)max(state.x + state.y + state.w, 0x00000001u) / (float)0xffffffff;
+        state.x = state.x * 69069 + 1;
+        state.y ^= state.y << 13;
+        state.y ^= state.y >> 17;
+        state.y ^= state.y << 5;
+        x3 = sqrt(-2.0f * log(x3));
+        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
+        m = state.w + state.w + state.z + carry;
+        state.z = state.w;
+        state.w = m;
+        carry = k >> 30;
+        float x4 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
+        value.y = x3 * cos(2.0f * 3.14159265f * x4);
+        // Generate third value.
+        state.x = state.x * 69069 + 1;
+        state.y ^= state.y << 13;
+        state.y ^= state.y >> 17;
+        state.y ^= state.y << 5;
+        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
+        m = state.w + state.w + state.z + carry;
+        state.z = state.w;
+        state.w = m;
+        carry = k >> 30;
+        float x5 = (float)max(state.x + state.y + state.w, 0x00000001u) / (float)0xffffffff;
+        state.x = state.x * 69069 + 1;
+        state.y ^= state.y << 13;
+        state.y ^= state.y >> 17;
+        state.y ^= state.y << 5;
+        x5 = sqrt(-2.0f * log(x5));
+        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
+        m = state.w + state.w + state.z + carry;
+        state.z = state.w;
+        state.w = m;
+        carry = k >> 30;
+        float x6 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
+        value.z = x5 * cos(2.0f * 3.14159265f * x6);
+        // Generate fourth value.
+        state.x = state.x * 69069 + 1;
+        state.y ^= state.y << 13;
+        state.y ^= state.y >> 17;
+        state.y ^= state.y << 5;
+        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
+        m = state.w + state.w + state.z + carry;
+        state.z = state.w;
+        state.w = m;
+        carry = k >> 30;
+        float x7 = (float)max(state.x + state.y + state.w, 0x00000001u) / (float)0xffffffff;
+        state.x = state.x * 69069 + 1;
+        state.y ^= state.y << 13;
+        state.y ^= state.y >> 17;
+        state.y ^= state.y << 5;
+        x7 = sqrt(-2.0f * log(x7));
+        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
+        m = state.w + state.w + state.z + carry;
+        state.z = state.w;
+        state.w = m;
+        carry = k >> 30;
+        float x8 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
+        value.w = x7 * cos(2.0f * 3.14159265f * x8);
+        // Record the values.
+        random[index] = value;
+        index += blockDim.x*gridDim.x; // step by the total number of launched threads
+    }
+    seed[blockIdx.x*blockDim.x+threadIdx.x] = state; // store the advanced state back into this thread's own slot (index has moved on, so it is recomputed)
+}
--- a/platforms/cuda2/tests/TestCudaRandom.cpp
+++ b/platforms/cuda2/tests/TestCudaRandom.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+/**
+ * This tests the CUDA implementation of random number generation.
+ */
+#include "openmm/internal/AssertionUtilities.h"
+#include "../src/CudaArray.h"
+#include "../src/CudaContext.h"
+#include "../src/CudaIntegrationUtilities.h"
+#include "openmm/System.h"
+#include <iostream>
+using namespace OpenMM;
+using namespace std;
+/**
+ * Download one full array of generated random numbers and check that the sample mean and
+ * the second, third and fourth cumulants agree with a unit normal distribution
+ * (0, 1, 0, 0) within tolerances that shrink as the number of samples grows.
+ */
+void testGaussian() {
+    // Set up a system of unit-mass particles and a context to generate numbers with.
+    const int numAtoms = 5000;
+    System system;
+    for (int atom = 0; atom != numAtoms; ++atom)
+        system.addParticle(1.0);
+    CudaPlatform platform;
+    CudaPlatform::PlatformData platformData(system, "", "true", "single",
+            platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()));
+    CudaContext& context = *platformData.contexts[0];
+    context.initialize();
+
+    // Seed the generator, request a full array of values and copy them to the host.
+    context.getIntegrationUtilities().initRandomNumberGenerator(0);
+    CudaArray& random = context.getIntegrationUtilities().getRandom();
+    context.getIntegrationUtilities().prepareRandomNumbers(random.getSize());
+    const int numValues = random.getSize()*4;
+    vector<float4> values(numValues);
+    random.download(values);
+    float* samples = reinterpret_cast<float*>(&values[0]);
+
+    // Accumulate the first four raw moments, viewing the float4 data as plain floats.
+    double m1 = 0.0, m2 = 0.0, m3 = 0.0, m4 = 0.0;
+    for (const float* p = samples; p != samples+numValues; ++p) {
+        const double v = *p;
+        const double v2 = v*v;
+        const double v3 = v2*v;
+        m1 += v;
+        m2 += v2;
+        m3 += v3;
+        m4 += v3*v;
+    }
+    m1 /= numValues;
+    m2 /= numValues;
+    m3 /= numValues;
+    m4 /= numValues;
+
+    // Convert the raw moments into cumulants and compare against the expected values.
+    double c2 = m2-m1*m1;
+    double c3 = m3-3*m2*m1+2*m1*m1*m1;
+    double c4 = m4-4*m3*m1-3*m2*m2+12*m2*m1*m1-6*m1*m1*m1*m1;
+    ASSERT_EQUAL_TOL(0.0, m1, 3.0/sqrt((double)numValues));
+    ASSERT_EQUAL_TOL(1.0, c2, 3.0/pow(numValues, 1.0/3.0));
+    ASSERT_EQUAL_TOL(0.0, c3, 3.0/pow(numValues, 1.0/4.0));
+    ASSERT_EQUAL_TOL(0.0, c4, 3.0/pow(numValues, 1.0/4.0));
+}
+/**
+ * Run the test.  Prints "Done" and returns 0 on success; if the test throws, prints the
+ * exception message and returns 1.
+ */
+int main() {
+    bool failed = false;
+    try {
+        testGaussian();
+    }
+    catch(const exception& e) {
+        cout << "exception: " << e.what() << endl;
+        failed = true;
+    }
+    if (failed)
+        return 1;
+    cout << "Done" << endl;
+    return 0;
+}