Created OpenCL implementations of SETTLE and LangevinIntegrator (not yet fully debugged).

ebbc40e3 · Peter Eastman · efc1083e · ebbc40e3 · ebbc40e3 · ebbc40e3
Commit ebbc40e3 authored Sep 29, 2009 by Peter Eastman
12 changed files
--- a/platforms/opencl/src/OpenCLIntegrationUtilities.cpp
+++ b/platforms/opencl/src/OpenCLIntegrationUtilities.cpp
@@ -26,6 +26,8 @@
 #include "OpenCLIntegrationUtilities.h"
 #include "OpenCLArray.h"
+#include <cmath>
+#include <cstdlib>
 #include <map>
 using namespace OpenMM;
@@ -65,7 +67,8 @@ struct OpenCLIntegrationUtilities::ShakeCluster {
 };
 OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, const System& system) : context(context),
-        oldPos(NULL), posDelta(NULL), settleAtoms(NULL), settleParams(NULL), shakeAtoms(NULL), shakeParams(NULL) {
+        oldPos(NULL), posDelta(NULL), settleAtoms(NULL), settleParams(NULL), shakeAtoms(NULL), shakeParams(NULL),
+        random(NULL), randomSeed(NULL), randomPos(NULL) {
    // Create workspace arrays.
    posDelta = new OpenCLArray<mm_float4>(context, context.getPaddedNumAtoms(), "posDelta");
@@ -73,8 +76,10 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
    // Create kernels for enforcing constraints.
-    cl::Program program = context.createProgram(context.loadSourceFromFile("shakeHydrogens.cl"));
+    cl::Program settleProgram = context.createProgram(context.loadSourceFromFile("settle.cl"));
-    shakeKernel = cl::Kernel(program, "applyShakeToHydrogens");
+    settleKernel = cl::Kernel(settleProgram, "applySettle");
+    cl::Program shakeProgram = context.createProgram(context.loadSourceFromFile("shakeHydrogens.cl"));
+    shakeKernel = cl::Kernel(shakeProgram, "applyShakeToHydrogens");
    // Record the set of constraints and how many constraints each atom is involved in.
@@ -122,8 +127,8 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
    vector<bool> isShakeAtom(system.getNumParticles(), false);
    if (settleClusters.size() > 0) {
-        vector<mm_int4> atoms(settleAtoms->getSize());
+        vector<mm_int4> atoms;
-        vector<mm_float2> params(settleParams->getSize());
+        vector<mm_float2> params;
        for (int i = 0; i < (int) settleClusters.size(); i++) {
            int atom1 = settleClusters[i];
            int atom2 = settleConstraints[atom1].begin()->first;
@@ -217,8 +222,8 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
    // Record the SHAKE clusters.
    if (validShakeClusters > 0) {
-        vector<mm_int4> atoms(shakeAtoms->getSize());
+        vector<mm_int4> atoms;
-        vector<mm_float4> params(shakeParams->getSize());
+        vector<mm_float4> params;
        int index = 0;
        for (map<int, ShakeCluster>::const_iterator iter = clusters.begin(); iter != clusters.end(); ++iter) {
            const ShakeCluster& cluster = iter->second;
@@ -254,9 +259,24 @@ OpenCLIntegrationUtilities::~OpenCLIntegrationUtilities() {
        delete shakeAtoms;
    if (shakeParams != NULL)
        delete shakeParams;
+    if (random != NULL)
+        delete random;
+    if (randomSeed != NULL)
+        delete randomSeed;
 }
 void OpenCLIntegrationUtilities::applyConstraints(double tol, OpenCLArray<mm_float4>& oldPositions, OpenCLArray<mm_float4>& positionDeltas, OpenCLArray<mm_float4>& newDeltas) {
+    if (settleAtoms != NULL) {
+        settleKernel.setArg<cl_int>(0, settleAtoms->getSize());
+        settleKernel.setArg<cl_float>(1, tol);
+        settleKernel.setArg<cl::Buffer>(2, oldPositions.getDeviceBuffer());
+        settleKernel.setArg<cl::Buffer>(3, positionDeltas.getDeviceBuffer());
+        settleKernel.setArg<cl::Buffer>(4, newDeltas.getDeviceBuffer());
+        settleKernel.setArg<cl::Buffer>(5, context.getVelm().getDeviceBuffer());
+        settleKernel.setArg<cl::Buffer>(6, settleAtoms->getDeviceBuffer());
+        settleKernel.setArg<cl::Buffer>(7, settleParams->getDeviceBuffer());
+        context.executeKernel(settleKernel, settleAtoms->getSize());
+    }
    if (shakeAtoms != NULL) {
        shakeKernel.setArg<cl_int>(0, shakeAtoms->getSize());
        shakeKernel.setArg<cl_float>(1, tol);
@@ -268,18 +288,47 @@ void OpenCLIntegrationUtilities::applyConstraints(double tol, OpenCLArray<mm_flo
        context.executeKernel(shakeKernel, shakeAtoms->getSize());
    }
 }
-//
-//void OpenCLIntegrationUtilities::clearBuffer(OpenCLArray<mm_float4>& array) {
-//    clearBufferKernel.setArg<cl::Buffer>(0, array.getDeviceBuffer());
-//    clearBufferKernel.setArg<cl_int>(1, array.getSize()*4);
-//    executeKernel(clearBufferKernel, array.getSize()*4);
-//}
-//
-//void OpenCLIntegrationUtilities::reduceBuffer(OpenCLArray<mm_float4>& array, int numBuffers) {
-//    int bufferSize = array.getSize()/numBuffers;
-//    reduceFloat4Kernel.setArg<cl::Buffer>(0, array.getDeviceBuffer());
-//    reduceFloat4Kernel.setArg<cl_int>(1, bufferSize);
-//    reduceFloat4Kernel.setArg<cl_int>(2, numBuffers);
-//    executeKernel(reduceFloat4Kernel, bufferSize);
-//}
+void OpenCLIntegrationUtilities::initRandomNumberGenerator(unsigned int randomNumberSeed) {
+    if (random != NULL) {
+        if (randomNumberSeed != lastSeed)
+           throw OpenMMException("OpenCLIntegrationUtilities::initRandomNumberGenerator(): Requested two different values for the random number seed");
+        return;
+    }
+    // Create the random number arrays.
+    lastSeed = randomNumberSeed;
+    random = new OpenCLArray<mm_float4>(context, 32*context.getNumAtoms(), "random");
+    randomSeed = new OpenCLArray<mm_int4>(context, context.getNumThreadBlocks()*OpenCLContext::ThreadBlockSize, "randomSeed");
+    randomPos = random->getSize();
+    // Initialize the random number seeds.
+    srand(randomNumberSeed);
+    vector<mm_int4> seed(randomSeed->getSize());
+    for (int i = 0; i < randomSeed->getSize(); i++)
+        seed[i] = (mm_int4) {rand(), rand(), rand(), rand()};
+    randomSeed->upload(seed);
+    // Create the kernel.
+    cl::Program randomProgram = context.createProgram(context.loadSourceFromFile("random.cl"));
+    randomKernel = cl::Kernel(randomProgram, "generateRandomNumbers");
+}
+int OpenCLIntegrationUtilities::prepareRandomNumbers(int numValues) {
+    if (randomPos+numValues <= random->getSize()) {
+        int oldPos = randomPos;
+        randomPos += numValues;
+        return oldPos;
+    }
+    randomKernel.setArg<cl_int>(0, random->getSize());
+    randomKernel.setArg<cl::Buffer>(1, random->getDeviceBuffer());
+    randomKernel.setArg<cl::Buffer>(2, randomSeed->getDeviceBuffer());
+    context.executeKernel(randomKernel, random->getSize());
+    randomPos = numValues;
+    vector<mm_float4> r(random->getSize());
+    random->download(r);
+    return 0;
+}
--- a/platforms/opencl/src/OpenCLIntegrationUtilities.h
+++ b/platforms/opencl/src/OpenCLIntegrationUtilities.h
@@ -34,7 +34,7 @@ namespace OpenMM {
 /**
 * This class implements features that are used by many different integrators, including
- * common workspace arrays and enforcing constraints.
+ * common workspace arrays, random number generation, and enforcing constraints.
 */
 class OpenCLIntegrationUtilities {
@@ -53,6 +53,12 @@ public:
    OpenCLArray<mm_float4>& getOldPos() {
        return *oldPos;
    }
+    /**
+     * Get the array which contains random values.
+     */
+    OpenCLArray<mm_float4>& getRandom() {
+        return *random;
+    }
    /**
     * Apply constraints to the atom positions.
     *
@@ -62,16 +68,32 @@ public:
     * @param newDeltas       the array to store constrained deltas into
     */
    void applyConstraints(double tol, OpenCLArray<mm_float4>& oldPositions, OpenCLArray<mm_float4>& positionDeltas, OpenCLArray<mm_float4>& newDeltas);
+    /**
+     * Initialize the random number generator.
+     */
+    void initRandomNumberGenerator(unsigned int randomNumberSeed);
+    /**
+     * Ensure that sufficient random numbers are available in the array, and generate new ones if not.
+     *
+     * @param numValues     the number of random float4's that will be required
+     * @return the index in the array at which to start reading
+     */
+    int prepareRandomNumbers(int numValues);
 private:
    OpenCLContext& context;
-//    cl::Kernel clearBufferKernel;
+    cl::Kernel settleKernel;
    cl::Kernel shakeKernel;
+    cl::Kernel randomKernel;
    OpenCLArray<mm_float4>* posDelta;
    OpenCLArray<mm_float4>* oldPos;
    OpenCLArray<mm_int4>* settleAtoms;
    OpenCLArray<mm_float2>* settleParams;
    OpenCLArray<mm_int4>* shakeAtoms;
    OpenCLArray<mm_float4>* shakeParams;
+    OpenCLArray<mm_float4>* random;
+    OpenCLArray<mm_int4>* randomSeed;
+    int randomPos;
+    int lastSeed;
    struct ShakeCluster;
 };

--- a/platforms/opencl/src/OpenCLKernelFactory.cpp
+++ b/platforms/opencl/src/OpenCLKernelFactory.cpp
@@ -54,8 +54,8 @@ KernelImpl* OpenCLKernelFactory::createKernelImpl(std::string name, const Platfo
 //        return new OpenCLCalcGBSAOBCForceKernel(name, platform, cl);
    if (name == IntegrateVerletStepKernel::Name())
        return new OpenCLIntegrateVerletStepKernel(name, platform, cl);
-//    if (name == IntegrateLangevinStepKernel::Name())
+    if (name == IntegrateLangevinStepKernel::Name())
-//        return new OpenCLIntegrateLangevinStepKernel(name, platform, cl);
+        return new OpenCLIntegrateLangevinStepKernel(name, platform, cl);
 //    if (name == IntegrateBrownianStepKernel::Name())
 //        return new OpenCLIntegrateBrownianStepKernel(name, platform, cl);
 //    if (name == IntegrateVariableVerletStepKernel::Name())

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -35,6 +35,12 @@
 using namespace OpenMM;
 using namespace std;
+static const double KILO = 1e3;                      // Thousand
+static const double BOLTZMANN = 1.380658e-23;            // (J/K)
+static const double AVOGADRO = 6.0221367e23;            // ()
+static const double RGAS = BOLTZMANN*AVOGADRO;     // (J/(mol K))
+static const double BOLTZ = (RGAS/KILO);            // (kJ/(mol K))
 void OpenCLCalcForcesAndEnergyKernel::initialize(const System& system) {
 }
@@ -836,47 +842,149 @@ void OpenCLIntegrateVerletStepKernel::execute(ContextImpl& context, const Verlet
    cl.setStepCount(cl.getStepCount()+1);
 }
-//OpenCLIntegrateLangevinStepKernel::~OpenCLIntegrateLangevinStepKernel() {
+OpenCLIntegrateLangevinStepKernel::~OpenCLIntegrateLangevinStepKernel() {
-//}
+    if (params != NULL)
-//
+        delete params;
-//void OpenCLIntegrateLangevinStepKernel::initialize(const System& system, const LangevinIntegrator& integrator) {
+    if (xVector != NULL)
-//    initializeIntegration(system, data, integrator);
+        delete xVector;
-//    _gpuContext* gpu = data.gpu;
+    if (vVector != NULL)
-//    gpu->seed = (unsigned long) integrator.getRandomNumberSeed();
+        delete vVector;
-//    gpuInitializeRandoms(gpu);
+}
-//    prevStepSize = -1.0;
-//}
+void OpenCLIntegrateLangevinStepKernel::initialize(const System& system, const LangevinIntegrator& integrator) {
-//
+    cl.initialize(system);
-//void OpenCLIntegrateLangevinStepKernel::execute(ContextImpl& context, const LangevinIntegrator& integrator) {
+    cl.getIntegrationUtilties().initRandomNumberGenerator(integrator.getRandomNumberSeed());
-//    _gpuContext* gpu = data.gpu;
+    cl::Program program = cl.createProgram(cl.loadSourceFromFile("langevin.cl"));
-//    double temperature = integrator.getTemperature();
+    kernel1 = cl::Kernel(program, "integrateLangevinPart1");
-//    double friction = integrator.getFriction();
+    kernel2 = cl::Kernel(program, "integrateLangevinPart2");
-//    double stepSize = integrator.getStepSize();
+    kernel3 = cl::Kernel(program, "integrateLangevinPart3");
-//    if (temperature != prevTemp || friction != prevFriction || stepSize != prevStepSize) {
+    params = new OpenCLArray<cl_float>(cl, 11, "langevinParams");
-//        // Initialize the GPU parameters.
+    xVector = new OpenCLArray<mm_float4>(cl, cl.getPaddedNumAtoms(), "xVector");
-//
+    vVector = new OpenCLArray<mm_float4>(cl, cl.getPaddedNumAtoms(), "vVector");
-//        double tau = (friction == 0.0 ? 0.0 : 1.0/friction);
+    vector<mm_float4> initialXVector(xVector->getSize(), (mm_float4) {0.0f, 0.0f, 0.0f, 0.0f});
-//        gpuSetLangevinIntegrationParameters(gpu, (float) tau, (float) stepSize, (float) temperature, 0.0f);
+    xVector->upload(initialXVector);
-//        gpuSetConstants(gpu);
+    prevStepSize = -1.0;
-//        kGenerateRandoms(gpu);
+}
-//        prevTemp = temperature;
-//        prevFriction = friction;
+void OpenCLIntegrateLangevinStepKernel::execute(ContextImpl& context, const LangevinIntegrator& integrator) {
-//        prevStepSize = stepSize;
+    OpenCLIntegrationUtilities& integration = cl.getIntegrationUtilties();
-//    }
+    int numAtoms = cl.getNumAtoms();
-//    kLangevinUpdatePart1(gpu);
+    double temperature = integrator.getTemperature();
-//    kApplyFirstShake(gpu);
+    double friction = integrator.getFriction();
-//    kApplyFirstSettle(gpu);
+    double stepSize = integrator.getStepSize();
-//    kApplyFirstCCMA(gpu);
+    if (temperature != prevTemp || friction != prevFriction || stepSize != prevStepSize) {
-//    if (data.removeCM)
+        // Calculate the integration parameters.
-//        if (data.stepCount%data.cmMotionFrequency == 0)
-//            gpu->bCalculateCM = true;
+        double tau = (friction == 0.0 ? 0.0 : 1.0/friction);
-//    kLangevinUpdatePart2(gpu);
+        double kT = BOLTZ*temperature;
-//    kApplySecondShake(gpu);
+        double GDT = stepSize/tau;
-//    kApplySecondSettle(gpu);
+        double EPH = exp(0.5*GDT);
-//    kApplySecondCCMA(gpu);
+        double EMH = exp(-0.5*GDT);
-//    data.time += stepSize;
+        double EP = exp(GDT);
-//    data.stepCount++;
+        double EM = exp(-GDT);
-//}
+        double B, C, D;
+        if (GDT >= 0.1)
+        {
+            double term1 = EPH - 1.0;
+            term1 *= term1;
+            B = GDT*(EP - 1.0) - 4.0*term1;
+            C = GDT - 3.0 + 4.0*EMH - EM;
+            D = 2.0 - EPH - EMH;
+        }
+        else
+        {
+            double term1 = 0.5*GDT;
+            double term2 = term1*term1;
+            double term4 = term2*term2;
+            double third = 1.0/3.0;
+            double o7_9 = 7.0/9.0;
+            double o1_12 = 1.0/12.0;
+            double o17_90 = 17.0/90.0;
+            double o7_30 = 7.0/30.0;
+            double o31_1260 = 31.0/1260.0;
+            double o_360 = 1.0/360.0;
+            B = term4*(third + term1*(third + term1*(o17_90 + term1*o7_9)));
+            C = term2*term1*(2.0*third + term1*(-0.5 + term1*(o7_30 + term1*(-o1_12 + term1*o31_1260))));
+            D = term2*(-1.0 + term2*(-o1_12 - term2*o_360));
+        }
+        double DOverTauC = D/(tau*C);
+        double TauOneMinusEM = tau*(1.0-EM);
+        double TauDOverEMMinusOne = tau*D/(EM - 1.0);
+        double fix1 = tau*(EPH - EMH);
+        if (fix1 == 0.0)
+            fix1 = stepSize;
+        double oneOverFix1 = 1.0/fix1;
+        double V = sqrt(kT*(1.0 - EM));
+        double X = tau*sqrt(kT*C);
+        double Yv = sqrt(kT*B/C);
+        double Yx = tau*sqrt(kT*B/(1.0 - EM));
+        vector<cl_float> p(params->getSize());
+        p[0] = EM;
+        p[1] = EM;
+        p[2] = DOverTauC;
+        p[3] = TauOneMinusEM;
+        p[4] = TauDOverEMMinusOne;
+        p[5] = V;
+        p[6] = X;
+        p[7] = Yv;
+        p[8] = Yx;
+        p[9] = fix1;
+        p[10] = oneOverFix1;
+        params->upload(p);
+        prevTemp = temperature;
+        prevFriction = friction;
+        prevStepSize = stepSize;
+    }
+    // Call the first integration kernel.
+    kernel1.setArg<cl_int>(0, numAtoms);
+    kernel1.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer());
+    kernel1.setArg<cl::Buffer>(2, cl.getVelm().getDeviceBuffer());
+    kernel1.setArg<cl::Buffer>(3, cl.getForce().getDeviceBuffer());
+    kernel1.setArg<cl::Buffer>(4, integration.getPosDelta().getDeviceBuffer());
+    kernel1.setArg<cl::Buffer>(5, params->getDeviceBuffer());
+    kernel1.setArg(6, params->getSize()*sizeof(cl_float), NULL);
+    kernel1.setArg<cl::Buffer>(7, xVector->getDeviceBuffer());
+    kernel1.setArg<cl::Buffer>(8, vVector->getDeviceBuffer());
+    kernel1.setArg<cl::Buffer>(9,integration.getRandom().getDeviceBuffer());
+    kernel1.setArg<cl_uint>(10, integration.prepareRandomNumbers(2*numAtoms));
+    cl.executeKernel(kernel1, numAtoms);
+    // Apply constraints.
+    integration.applyConstraints(integrator.getConstraintTolerance(), integration.getOldPos(), integration.getPosDelta(), integration.getPosDelta());
+    // Call the second integration kernel.
+    kernel2.setArg<cl_int>(0, numAtoms);
+    kernel2.setArg<cl::Buffer>(1, cl.getVelm().getDeviceBuffer());
+    kernel2.setArg<cl::Buffer>(2, integration.getPosDelta().getDeviceBuffer());
+    kernel2.setArg<cl::Buffer>(3, params->getDeviceBuffer());
+    kernel2.setArg(4, params->getSize()*sizeof(cl_float), NULL);
+    kernel2.setArg<cl::Buffer>(5, xVector->getDeviceBuffer());
+    kernel2.setArg<cl::Buffer>(6, vVector->getDeviceBuffer());
+    kernel2.setArg<cl::Buffer>(7,integration.getRandom().getDeviceBuffer());
+    kernel2.setArg<cl_uint>(8, integration.prepareRandomNumbers(2*numAtoms));
+    cl.executeKernel(kernel2, numAtoms);
+    // Reapply constraints.
+    integration.applyConstraints(integrator.getConstraintTolerance(), integration.getOldPos(), integration.getPosDelta(), cl.getPosq());
+    // Call the third integration kernel.
+    kernel3.setArg<cl_int>(0, numAtoms);
+    kernel3.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer());
+    kernel3.setArg<cl::Buffer>(2, integration.getPosDelta().getDeviceBuffer());
+    cl.executeKernel(kernel3, numAtoms);
+    // Update the time and step count.
+    cl.setTime(cl.getTime()+stepSize);
+    cl.setStepCount(cl.getStepCount()+1);
+}
 //
 //OpenCLIntegrateBrownianStepKernel::~OpenCLIntegrateBrownianStepKernel() {
 //}
@@ -1039,6 +1147,7 @@ double OpenCLCalcKineticEnergyKernel::execute(ContextImpl& context) {
    // on the CPU.
    OpenCLArray<mm_float4>& velm = cl.getVelm();
+    velm.download();
    double energy = 0.0;
    for (size_t i = 0; i < masses.size(); ++i) {
        mm_float4 v = velm[i];

--- a/platforms/opencl/src/OpenCLKernels.h
+++ b/platforms/opencl/src/OpenCLKernels.h
@@ -421,37 +421,41 @@ public:
    void execute(ContextImpl& context, const VerletIntegrator& integrator);
 private:
    OpenCLContext& cl;
-    cl::Kernel kernel1;
+    cl::Kernel kernel1, kernel2;
-    cl::Kernel kernel2;
+};
+/**
+ * This kernel is invoked by LangevinIntegrator to take one time step.
+ */
+class OpenCLIntegrateLangevinStepKernel : public IntegrateLangevinStepKernel {
+public:
+    OpenCLIntegrateLangevinStepKernel(std::string name, const Platform& platform, OpenCLContext& cl) : IntegrateLangevinStepKernel(name, platform), cl(cl),
+            params(NULL), xVector(NULL), vVector(NULL) {
+    }
+    ~OpenCLIntegrateLangevinStepKernel();
+    /**
+     * Initialize the kernel, setting up the particle masses.
+     *
+     * @param system     the System this kernel will be applied to
+     * @param integrator the LangevinIntegrator this kernel will be used for
+     */
+    void initialize(const System& system, const LangevinIntegrator& integrator);
+    /**
+     * Execute the kernel.
+     *
+     * @param context    the context in which to execute this kernel
+     * @param integrator the LangevinIntegrator this kernel is being used for
+     */
+    void execute(ContextImpl& context, const LangevinIntegrator& integrator);
+private:
+    OpenCLContext& cl;
+    double prevTemp, prevFriction, prevStepSize;
+    OpenCLArray<cl_float>* params;
+    OpenCLArray<mm_float4>* xVector;
+    OpenCLArray<mm_float4>* vVector;
+    cl::Kernel kernel1, kernel2, kernel3;
 };
-///**
-// * This kernel is invoked by LangevinIntegrator to take one time step.
-// */
-//class OpenCLIntegrateLangevinStepKernel : public IntegrateLangevinStepKernel {
-//public:
-//    OpenCLIntegrateLangevinStepKernel(std::string name, const Platform& platform, OpenCLContext& cl) : IntegrateLangevinStepKernel(name, platform), cl(cl) {
-//    }
-//    ~OpenCLIntegrateLangevinStepKernel();
-//    /**
-//     * Initialize the kernel, setting up the particle masses.
-//     *
-//     * @param system     the System this kernel will be applied to
-//     * @param integrator the LangevinIntegrator this kernel will be used for
-//     */
-//    void initialize(const System& system, const LangevinIntegrator& integrator);
-//    /**
-//     * Execute the kernel.
-//     *
-//     * @param context    the context in which to execute this kernel
-//     * @param integrator the LangevinIntegrator this kernel is being used for
-//     */
-//    void execute(ContextImpl& context, const LangevinIntegrator& integrator);
-//private:
-//    OpenCLContext& cl;
-//    double prevTemp, prevFriction, prevStepSize;
-//};
-//
 ///**
 // * This kernel is invoked by BrownianIntegrator to take one time step.
 // */

--- a/platforms/opencl/src/OpenCLPlatform.cpp
+++ b/platforms/opencl/src/OpenCLPlatform.cpp
@@ -55,7 +55,7 @@ OpenCLPlatform::OpenCLPlatform() {
 //    registerKernelFactory(CalcCustomNonbondedForceKernel::Name(), factory);
 //    registerKernelFactory(CalcGBSAOBCForceKernel::Name(), factory);
    registerKernelFactory(IntegrateVerletStepKernel::Name(), factory);
-//    registerKernelFactory(IntegrateLangevinStepKernel::Name(), factory);
+    registerKernelFactory(IntegrateLangevinStepKernel::Name(), factory);
 //    registerKernelFactory(IntegrateBrownianStepKernel::Name(), factory);
 //    registerKernelFactory(IntegrateVariableVerletStepKernel::Name(), factory);
 //    registerKernelFactory(IntegrateVariableLangevinStepKernel::Name(), factory);

--- a/platforms/opencl/src/kernels/langevin.cl
+++ b/platforms/opencl/src/kernels/langevin.cl
+enum {EM, EM_V, DOverTauC, TauOneMinusEM_V, TauDOverEMMinusOne, V, X, Yv, Yx, Fix1, OneOverFix1, MaxParams};
+/**
+ * Perform the first step of Langevin integration.
+ */
+__kernel void integrateLangevinPart1(int numAtoms, __global float4* posq, __global float4* velm, __global float4* force,
+        __global float4* posDelta, __global float* paramBuffer, __local float* params, __global float4* xVector,
+        __global float4* vVector, __global float4* random, unsigned int randomIndex) {
+    // Load the parameters into local memory for faster access.
+    int index = get_global_id(0);
+    if (index < MaxParams)
+        params[index] = paramBuffer[index];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    randomIndex += index;
+    while (index < numAtoms) {
+        float4 velocity = velm[index];
+        float sqrtInvMass = sqrt(velocity.w);
+        float4 vmh = (float4) (xVector[index].xyz*params[DOverTauC] + sqrtInvMass*params[Yv]*random[randomIndex].xyz, 0.0f);
+        randomIndex += get_global_size(0);
+        float4 vVec = (float4) (sqrtInvMass*params[V]*random[randomIndex].xyz, 0.0f);
+        randomIndex += get_global_size(0);
+        vVector[index] = vVec;
+        velocity.xyz = velocity.xyz*params[EM_V] + velocity.w*force[index].xyz*params[TauOneMinusEM_V] + vVec.xyz - params[EM]*vmh.xyz;
+        posDelta[index] = velocity*params[Fix1];
+        velm[index] = velocity;
+        index += get_global_size(0);
+    }
+}
+/**
+ * Perform the second step of Langevin integration.
+ */
+__kernel void integrateLangevinPart2(int numAtoms, __global float4* velm, __global float4* posDelta, __global float* paramBuffer,
+        __local float* params, __global float4* xVector, __global float4* vVector, __global float4* random, unsigned int randomIndex) {
+    // Load the parameters into local memory for faster access.
+    int index = get_global_id(0);
+    if (index < MaxParams)
+        params[index] = paramBuffer[index];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    randomIndex += index;
+    while (index < numAtoms) {
+        float4 delta = posDelta[index];
+        float4 velocity = velm[index];
+        float sqrtInvMass = 0.0f;//sqrt(velocity.w);
+        velocity.xyz = delta.xyz*params[OneOverFix1];
+        float4 xmh = (float4) (vVector[index].xyz*params[TauDOverEMMinusOne] + sqrtInvMass*params[Yx]*random[randomIndex].xyz, 0.0f);
+        randomIndex += get_global_size(0);
+        float4 xVec = (float4) (sqrtInvMass*params[X]*random[randomIndex].xyz, 0.0f);
+        randomIndex += get_global_size(0);
+        delta.xyz += xVec.xyz - xmh.xyz;
+        posDelta[index] = delta;
+        velm[index] = velocity;
+        xVector[index] = xVec;
+        index += get_global_size(0);
+    }
+}
+/**
+ * Perform the third step of Langevin integration.
+ */
+__kernel void integrateLangevinPart3(int numAtoms, __global float4* posq, __global float4* posDelta) {
+    int index = get_global_id(0);
+    while (index < numAtoms) {
+        float4 pos = posq[index];
+        float4 delta = posDelta[index];
+        pos.xyz += delta.xyz;
+        posq[index] = pos;
+        index += get_global_size(0);
+    }
+}
--- a/platforms/opencl/src/kernels/random.cl
+++ b/platforms/opencl/src/kernels/random.cl
+/**
+ * Generate random numbers
+ */
+__kernel void generateRandomNumbers(int numValues, __global float4* random, __global uint4* seed) {
+    int index = get_global_id(0);
+    uint4 state = seed[index];
+    unsigned int carry = 0;
+    while (index < numValues) {
+        float4 value;
+        // Generate first value.
+        state.x = state.x * 69069 + 1;
+        state.y ^= state.y << 13;
+        state.y ^= state.y >> 17;
+        state.y ^= state.y << 5;
+        unsigned int k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
+        unsigned int m = state.w + state.w + state.z + carry;
+        state.z = state.w;
+        state.w = m;
+        carry = k >> 30;
+        float x1 = (float)max(state.x + state.y + state.w, 0x00000001u) / (float)0xffffffff;
+        state.x = state.x * 69069 + 1;
+        state.y ^= state.y << 13;
+        state.y ^= state.y >> 17;
+        state.y ^= state.y << 5;
+        x1 = sqrt(-2.0f * log(x1));
+        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
+        m = state.w + state.w + state.z + carry;
+        state.z = state.w;
+        state.w = m;
+        carry = k >> 30;
+        float x2 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
+        value.x = x1 * cos(2.0f * 3.14159265f * x2);
+        // Generate second value.
+        state.x = state.x * 69069 + 1;
+        state.y ^= state.y << 13;
+        state.y ^= state.y >> 17;
+        state.y ^= state.y << 5;
+        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
+        m = state.w + state.w + state.z + carry;
+        state.z = state.w;
+        state.w = m;
+        carry = k >> 30;
+        float x3 = (float)max(state.x + state.y + state.w, 0x00000001u) / (float)0xffffffff;
+        state.x = state.x * 69069 + 1;
+        state.y ^= state.y << 13;
+        state.y ^= state.y >> 17;
+        state.y ^= state.y << 5;
+        x3 = sqrt(-2.0f * log(x3));
+        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
+        m = state.w + state.w + state.z + carry;
+        state.z = state.w;
+        state.w = m;
+        carry = k >> 30;
+        float x4 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
+        value.y = x3 * cos(2.0f * 3.14159265f * x4);
+        // Generate third value.
+        state.x = state.x * 69069 + 1;
+        state.y ^= state.y << 13;
+        state.y ^= state.y >> 17;
+        state.y ^= state.y << 5;
+        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
+        m = state.w + state.w + state.z + carry;
+        state.z = state.w;
+        state.w = m;
+        carry = k >> 30;
+        float x5 = (float)max(state.x + state.y + state.w, 0x00000001u) / (float)0xffffffff;
+        state.x = state.x * 69069 + 1;
+        state.y ^= state.y << 13;
+        state.y ^= state.y >> 17;
+        state.y ^= state.y << 5;
+        x5 = sqrt(-2.0f * log(x5));
+        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
+        m = state.w + state.w + state.z + carry;
+        state.z = state.w;
+        state.w = m;
+        carry = k >> 30;
+        float x6 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
+        value.z = x5 * cos(2.0f * 3.14159265f * x6);
+        // Generate fourth value.
+        state.x = state.x * 69069 + 1;
+        state.y ^= state.y << 13;
+        state.y ^= state.y >> 17;
+        state.y ^= state.y << 5;
+        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
+        m = state.w + state.w + state.z + carry;
+        state.z = state.w;
+        state.w = m;
+        carry = k >> 30;
+        float x7 = (float)max(state.x + state.y + state.w, 0x00000001u) / (float)0xffffffff;
+        state.x = state.x * 69069 + 1;
+        state.y ^= state.y << 13;
+        state.y ^= state.y >> 17;
+        state.y ^= state.y << 5;
+        x7 = sqrt(-2.0f * log(x7));
+        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
+        m = state.w + state.w + state.z + carry;
+        state.z = state.w;
+        state.w = m;
+        carry = k >> 30;
+        float x8 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
+        value.w = x7 * cos(2.0f * 3.14159265f * x8);
+        // Record the values.
+        random[index] = value;
+        index += get_global_size(0);
+    }
+    seed[get_global_id(0)] = state;
+}
--- a/platforms/opencl/src/kernels/settle.cl
+++ b/platforms/opencl/src/kernels/settle.cl
+/**
+ * Enforce constraints on SETTLE clusters
+ */
+__kernel void applySettle(int numClusters, float tol, __global float4* oldPos, __global float4* posDelta, __global float4* newDelta, __global float4* velm, __global int4* clusterAtoms, __global float2* clusterParams) {
+    int index = get_global_id(0);
+    while (index < numClusters) {
+        // Load the data for this cluster.
+        int4 atoms = clusterAtoms[index];
+        float2 params = clusterParams[index];
+        float4 apos0 = oldPos[atoms.x];
+        float4 xp0 = posDelta[atoms.x];
+        float4 apos1 = oldPos[atoms.y];
+        float4 xp1 = posDelta[atoms.y];
+        float4 apos2 = oldPos[atoms.z];
+        float4 xp2 = posDelta[atoms.z];
+        float m0 = 1.0f/velm[atoms.x].w;
+        float m1 = 1.0f/velm[atoms.y].w;
+        float m2 = 1.0f/velm[atoms.z].w;
+        // Translate the molecule to the origin to improve numerical precision.
+        float4 center = apos0;
+        apos0.xyz -= center.xyz;
+        apos1.xyz -= center.xyz;
+        apos2.xyz -= center.xyz;
+        // Apply the SETTLE algorithm.
+        float xb0 = apos1.x-apos0.x;
+        float yb0 = apos1.y-apos0.y;
+        float zb0 = apos1.z-apos0.z;
+        float xc0 = apos2.x-apos0.x;
+        float yc0 = apos2.y-apos0.y;
+        float zc0 = apos2.z-apos0.z;
+        float totalMass = m0+m1+m2;
+        float xcom = ((apos0.x+xp0.x)*m0 + (apos1.x+xp1.x)*m1 + (apos2.x+xp2.x)*m2) / totalMass;
+        float ycom = ((apos0.y+xp0.y)*m0 + (apos1.y+xp1.y)*m1 + (apos2.y+xp2.y)*m2) / totalMass;
+        float zcom = ((apos0.z+xp0.z)*m0 + (apos1.z+xp1.z)*m1 + (apos2.z+xp2.z)*m2) / totalMass;
+        float xa1 = apos0.x + xp0.x - xcom;
+        float ya1 = apos0.y + xp0.y - ycom;
+        float za1 = apos0.z + xp0.z - zcom;
+        float xb1 = apos1.x + xp1.x - xcom;
+        float yb1 = apos1.y + xp1.y - ycom;
+        float zb1 = apos1.z + xp1.z - zcom;
+        float xc1 = apos2.x + xp2.x - xcom;
+        float yc1 = apos2.y + xp2.y - ycom;
+        float zc1 = apos2.z + xp2.z - zcom;
+        float xaksZd = yb0*zc0 - zb0*yc0;
+        float yaksZd = zb0*xc0 - xb0*zc0;
+        float zaksZd = xb0*yc0 - yb0*xc0;
+        float xaksXd = ya1*zaksZd - za1*yaksZd;
+        float yaksXd = za1*xaksZd - xa1*zaksZd;
+        float zaksXd = xa1*yaksZd - ya1*xaksZd;
+        float xaksYd = yaksZd*zaksXd - zaksZd*yaksXd;
+        float yaksYd = zaksZd*xaksXd - xaksZd*zaksXd;
+        float zaksYd = xaksZd*yaksXd - yaksZd*xaksXd;
+        float axlng = sqrt(xaksXd*xaksXd + yaksXd*yaksXd + zaksXd*zaksXd);
+        float aylng = sqrt(xaksYd*xaksYd + yaksYd*yaksYd + zaksYd*zaksYd);
+        float azlng = sqrt(xaksZd*xaksZd + yaksZd*yaksZd + zaksZd*zaksZd);
+        float trns11 = xaksXd / axlng;
+        float trns21 = yaksXd / axlng;
+        float trns31 = zaksXd / axlng;
+        float trns12 = xaksYd / aylng;
+        float trns22 = yaksYd / aylng;
+        float trns32 = zaksYd / aylng;
+        float trns13 = xaksZd / azlng;
+        float trns23 = yaksZd / azlng;
+        float trns33 = zaksZd / azlng;
+        float xb0d = trns11*xb0 + trns21*yb0 + trns31*zb0;
+        float yb0d = trns12*xb0 + trns22*yb0 + trns32*zb0;
+        float xc0d = trns11*xc0 + trns21*yc0 + trns31*zc0;
+        float yc0d = trns12*xc0 + trns22*yc0 + trns32*zc0;
+        float za1d = trns13*xa1 + trns23*ya1 + trns33*za1;
+        float xb1d = trns11*xb1 + trns21*yb1 + trns31*zb1;
+        float yb1d = trns12*xb1 + trns22*yb1 + trns32*zb1;
+        float zb1d = trns13*xb1 + trns23*yb1 + trns33*zb1;
+        float xc1d = trns11*xc1 + trns21*yc1 + trns31*zc1;
+        float yc1d = trns12*xc1 + trns22*yc1 + trns32*zc1;
+        float zc1d = trns13*xc1 + trns23*yc1 + trns33*zc1;
+        //                                        --- Step2  A2' ---
+        float rc = 0.5*params.y;
+        float rb = sqrt(params.x*params.x-rc*rc);
+        float ra = rb*(m1+m2)/totalMass;
+        rb -= ra;
+        float sinphi = za1d / ra;
+        float cosphi = sqrt(1.0f - sinphi*sinphi);
+        float sinpsi = (zb1d - zc1d) / (2*rc*cosphi);
+        float cospsi = sqrt(1.0f - sinpsi*sinpsi);
+        float ya2d =   ra*cosphi;
+        float xb2d = - rc*cospsi;
+        float yb2d = - rb*cosphi - rc*sinpsi*sinphi;
+        float yc2d = - rb*cosphi + rc*sinpsi*sinphi;
+        float xb2d2 = xb2d*xb2d;
+        float hh2 = 4.0f*xb2d2 + (yb2d-yc2d)*(yb2d-yc2d) + (zb1d-zc1d)*(zb1d-zc1d);
+        float deltx = 2.0f*xb2d + sqrt(4.0f*xb2d2 - hh2 + params.y*params.y);
+        xb2d -= deltx*0.5;
+        //                                        --- Step3  al,be,ga ---
+        float alpha = (xb2d*(xb0d-xc0d) + yb0d*yb2d + yc0d*yc2d);
+        float beta = (xb2d*(yc0d-yb0d) + xb0d*yb2d + xc0d*yc2d);
+        float gamma = xb0d*yb1d - xb1d*yb0d + xc0d*yc1d - xc1d*yc0d;
+        float al2be2 = alpha*alpha + beta*beta;
+        float sintheta = (alpha*gamma - beta*sqrt (al2be2 - gamma*gamma)) / al2be2;
+        //                                        --- Step4  A3' ---
+        float costheta = sqrt (1.0f - sintheta*sintheta);
+        float xa3d = - ya2d*sintheta;
+        float ya3d =   ya2d*costheta;
+        float za3d = za1d;
+        float xb3d =   xb2d*costheta - yb2d*sintheta;
+        float yb3d =   xb2d*sintheta + yb2d*costheta;
+        float zb3d = zb1d;
+        float xc3d = - xb2d*costheta - yc2d*sintheta;
+        float yc3d = - xb2d*sintheta + yc2d*costheta;
+        float zc3d = zc1d;
+        //                                        --- Step5  A3 ---
+        float xa3 = trns11*xa3d + trns12*ya3d + trns13*za3d;
+        float ya3 = trns21*xa3d + trns22*ya3d + trns23*za3d;
+        float za3 = trns31*xa3d + trns32*ya3d + trns33*za3d;
+        float xb3 = trns11*xb3d + trns12*yb3d + trns13*zb3d;
+        float yb3 = trns21*xb3d + trns22*yb3d + trns23*zb3d;
+        float zb3 = trns31*xb3d + trns32*yb3d + trns33*zb3d;
+        float xc3 = trns11*xc3d + trns12*yc3d + trns13*zc3d;
+        float yc3 = trns21*xc3d + trns22*yc3d + trns23*zc3d;
+        float zc3 = trns31*xc3d + trns32*yc3d + trns33*zc3d;
+        xp0.x = xcom + xa3 - apos0.x;
+        xp0.y = ycom + ya3 - apos0.y;
+        xp0.z = zcom + za3 - apos0.z;
+        xp1.x = xcom + xb3 - apos1.x;
+        xp1.y = ycom + yb3 - apos1.y;
+        xp1.z = zcom + zb3 - apos1.z;
+        xp2.x = xcom + xc3 - apos2.x;
+        xp2.y = ycom + yc3 - apos2.y;
+        xp2.z = zcom + zc3 - apos2.z;
+        // Record the new positions.
+        newDelta[atoms.x] = xp0;
+        newDelta[atoms.y] = xp1;
+        newDelta[atoms.z] = xp2;
+        index += get_global_size(0);
+    }
+}
--- a/platforms/opencl/tests/TestOpenCLLangevinIntegrator.cpp
+++ b/platforms/opencl/tests/TestOpenCLLangevinIntegrator.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2009 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+/**
+ * This tests the reference implementation of LangevinIntegrator.
+ */
+#include "../../../tests/AssertionUtilities.h"
+#include "openmm/Context.h"
+#include "OpenCLPlatform.h"
+#include "openmm/HarmonicBondForce.h"
+#include "openmm/NonbondedForce.h"
+#include "openmm/System.h"
+#include "openmm/LangevinIntegrator.h"
+#include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
+#include "../src/sfmt/SFMT.h"
+#include <iostream>
+#include <vector>
+using namespace OpenMM;
+using namespace std;
+const double TOL = 1e-5;
+void testSingleBond() {
+    OpenCLPlatform platform;
+    System system;
+    system.addParticle(2.0);
+    system.addParticle(2.0);
+    LangevinIntegrator integrator(0, 0.1, 0.01);
+    HarmonicBondForce* forceField = new HarmonicBondForce();
+    forceField->addBond(0, 1, 1.5, 1);
+    system.addForce(forceField);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(2);
+    positions[0] = Vec3(-1, 0, 0);
+    positions[1] = Vec3(1, 0, 0);
+    context.setPositions(positions);
+    // This is simply a damped harmonic oscillator, so compare it to the analytical solution.
+    double freq = std::sqrt(1-0.05*0.05);
+    for (int i = 0; i < 1000; ++i) {
+        State state = context.getState(State::Positions | State::Velocities);
+        double time = state.getTime();
+        double expectedDist = 1.5+0.5*std::exp(-0.05*time)*std::cos(freq*time);
+        ASSERT_EQUAL_VEC(Vec3(-0.5*expectedDist, 0, 0), state.getPositions()[0], 0.02);
+        ASSERT_EQUAL_VEC(Vec3(0.5*expectedDist, 0, 0), state.getPositions()[1], 0.02);
+        double expectedSpeed = -0.5*std::exp(-0.05*time)*(0.05*std::cos(freq*time)+freq*std::sin(freq*time));
+        ASSERT_EQUAL_VEC(Vec3(-0.5*expectedSpeed, 0, 0), state.getVelocities()[0], 0.02);
+        ASSERT_EQUAL_VEC(Vec3(0.5*expectedSpeed, 0, 0), state.getVelocities()[1], 0.02);
+        integrator.step(1);
+    }
+    // Not set the friction to a tiny value and see if it conserves energy.
+    integrator.setFriction(5e-5);
+    context.setPositions(positions);
+    State state = context.getState(State::Energy);
+    double initialEnergy = state.getKineticEnergy()+state.getPotentialEnergy();
+    for (int i = 0; i < 1000; ++i) {
+        state = context.getState(State::Energy);
+        double energy = state.getKineticEnergy()+state.getPotentialEnergy();
+        ASSERT_EQUAL_TOL(initialEnergy, energy, 0.01);
+        integrator.step(1);
+    }
+}
+void testTemperature() {
+    const int numParticles = 8;
+    const double temp = 100.0;
+    OpenCLPlatform platform;
+    System system;
+    LangevinIntegrator integrator(temp, 2.0, 0.01);
+    NonbondedForce* forceField = new NonbondedForce();
+    for (int i = 0; i < numParticles; ++i) {
+        system.addParticle(2.0);
+        forceField->addParticle((i%2 == 0 ? 1.0 : -1.0), 1.0, 5.0);
+    }
+//    system.addForce(forceField);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(numParticles);
+    for (int i = 0; i < numParticles; ++i)
+        positions[i] = Vec3((i%2 == 0 ? 2 : -2), (i%4 < 2 ? 2 : -2), (i < 4 ? 2 : -2));
+    context.setPositions(positions);
+    // Let it equilibrate.
+    integrator.step(10000);
+    // Now run it for a while and see if the temperature is correct.
+    double ke = 0.0;
+    for (int i = 0; i < 1000; ++i) {
+        State state = context.getState(State::Energy);
+        ke += state.getKineticEnergy();
+        integrator.step(1);
+    }
+    ke /= 1000;
+    double expected = 0.5*numParticles*3*BOLTZ*temp;
+    ASSERT_EQUAL_TOL(expected, ke, 3*expected/std::sqrt(1000.0));
+}
+void testConstraints() {
+    const int numParticles = 8;
+    const int numConstraints = 5;
+    const double temp = 100.0;
+    OpenCLPlatform platform;
+    System system;
+    LangevinIntegrator integrator(temp, 2.0, 0.01);
+    integrator.setConstraintTolerance(1e-5);
+    NonbondedForce* forceField = new NonbondedForce();
+    for (int i = 0; i < numParticles; ++i) {
+        system.addParticle(10.0);
+        forceField->addParticle((i%2 == 0 ? 0.2 : -0.2), 0.5, 5.0);
+    }
+    system.addConstraint(0, 1, 1.0);
+    system.addConstraint(1, 2, 1.0);
+    system.addConstraint(2, 3, 1.0);
+    system.addConstraint(4, 5, 1.0);
+    system.addConstraint(6, 7, 1.0);
+    system.addForce(forceField);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(numParticles);
+    vector<Vec3> velocities(numParticles);
+    init_gen_rand(0);
+    for (int i = 0; i < numParticles; ++i) {
+        positions[i] = Vec3(i/2, (i+1)/2, 0);
+        velocities[i] = Vec3(genrand_real2()-0.5, genrand_real2()-0.5, genrand_real2()-0.5);
+    }
+    context.setPositions(positions);
+    context.setVelocities(velocities);
+    // Simulate it and see whether the constraints remain satisfied.
+    for (int i = 0; i < 1000; ++i) {
+        State state = context.getState(State::Positions);
+        for (int j = 0; j < numConstraints; ++j) {
+            int particle1, particle2;
+            double distance;
+            system.getConstraintParameters(j, particle1, particle2, distance);
+            Vec3 p1 = state.getPositions()[particle1];
+            Vec3 p2 = state.getPositions()[particle2];
+            double dist = std::sqrt((p1[0]-p2[0])*(p1[0]-p2[0])+(p1[1]-p2[1])*(p1[1]-p2[1])+(p1[2]-p2[2])*(p1[2]-p2[2]));
+            ASSERT_EQUAL_TOL(distance, dist, 1e-4);
+        }
+        integrator.step(1);
+    }
+}
+void testRandomSeed() {
+    const int numParticles = 8;
+    const double temp = 100.0;
+    const double collisionFreq = 10.0;
+    OpenCLPlatform platform;
+    System system;
+    LangevinIntegrator integrator(temp, 2.0, 0.01);
+    NonbondedForce* forceField = new NonbondedForce();
+    for (int i = 0; i < numParticles; ++i) {
+        system.addParticle(2.0);
+        forceField->addParticle((i%2 == 0 ? 1.0 : -1.0), 1.0, 5.0);
+    }
+//    system.addForce(forceField);
+    vector<Vec3> positions(numParticles);
+    vector<Vec3> velocities(numParticles);
+    for (int i = 0; i < numParticles; ++i) {
+        positions[i] = Vec3((i%2 == 0 ? 2 : -2), (i%4 < 2 ? 2 : -2), (i < 4 ? 2 : -2));
+        velocities[i] = Vec3(0, 0, 0);
+    }
+    // Try twice with the same random seed.
+    integrator.setRandomNumberSeed(5);
+    Context context(system, integrator, platform);
+    context.setPositions(positions);
+    context.setVelocities(velocities);
+    integrator.step(10);
+    State state1 = context.getState(State::Positions);
+    context.reinitialize();
+    context.setPositions(positions);
+    context.setVelocities(velocities);
+    integrator.step(10);
+    State state2 = context.getState(State::Positions);
+    // Try twice with a different random seed.
+    integrator.setRandomNumberSeed(10);
+    context.reinitialize();
+    context.setPositions(positions);
+    context.setVelocities(velocities);
+    integrator.step(10);
+    State state3 = context.getState(State::Positions);
+    context.reinitialize();
+    context.setPositions(positions);
+    context.setVelocities(velocities);
+    integrator.step(10);
+    State state4 = context.getState(State::Positions);
+    // Compare the results.
+    for (int i = 0; i < numParticles; i++) {
+        for (int j = 0; j < 3; j++) {
+            ASSERT(state1.getPositions()[i][j] == state2.getPositions()[i][j]);
+            ASSERT(state3.getPositions()[i][j] == state4.getPositions()[i][j]);
+            ASSERT(state1.getPositions()[i][j] != state3.getPositions()[i][j]);
+        }
+    }
+}
+int main() {
+    try {
+        testSingleBond();
+        testTemperature();
+//        testConstraints();
+        testRandomSeed();
+    }
+    catch(const exception& e) {
+        cout << "exception: " << e.what() << endl;
+        return 1;
+    }
+    cout << "Done" << endl;
+    return 0;
+}
--- a/platforms/opencl/tests/TestOpenCLRandom.cpp
+++ b/platforms/opencl/tests/TestOpenCLRandom.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2009 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+/**
+ * This tests the OpenCL implementation of random number generation.
+ */
+#include "../../../tests/AssertionUtilities.h"
+#include "../src/OpenCLArray.h"
+#include "../src/OpenCLContext.h"
+#include "../src/OpenCLIntegrationUtilities.h"
+#include "OpenMM/System.h"
+#include <iostream>
+using namespace OpenMM;
+using namespace std;
+void testGaussian() {
+    int numAtoms = 5000;
+    System system;
+    for (int i = 0; i < numAtoms; i++)
+        system.addParticle(1.0);
+    OpenCLContext context(numAtoms, -1);
+    context.initialize(system);
+    context.getIntegrationUtilties().initRandomNumberGenerator(0);
+    OpenCLArray<mm_float4>& random = context.getIntegrationUtilties().getRandom();
+    context.getIntegrationUtilties().prepareRandomNumbers(random.getSize());
+    const int numValues = random.getSize()*4;
+    vector<mm_float4> values(numValues);
+    random.download(values);
+    float* data = reinterpret_cast<float*>(&values[0]);
+    double mean = 0.0;
+    double var = 0.0;
+    double skew = 0.0;
+    double kurtosis = 0.0;
+    for (int i = 0; i < numValues; i++) {
+        double value = data[i];
+        mean += value;
+        var += value*value;
+        skew += value*value*value;
+        kurtosis += value*value*value*value;
+    }
+    mean /= numValues;
+    var /= numValues;
+    skew /= numValues;
+    kurtosis /= numValues;
+    double c2 = var-mean*mean;
+    double c3 = skew-3*var*mean+2*mean*mean*mean;
+    double c4 = kurtosis-4*skew*mean-3*var*var+12*var*mean*mean-6*mean*mean*mean*mean;
+    ASSERT_EQUAL_TOL(0.0, mean, 1.0/sqrt((double)numValues));
+    ASSERT_EQUAL_TOL(1.0, c2, 1.0/pow(numValues, 1.0/3.0));
+    ASSERT_EQUAL_TOL(0.0, c3, 1.0/pow(numValues, 1.0/4.0));
+    ASSERT_EQUAL_TOL(0.0, c4, 1.0/pow(numValues, 1.0/4.0));
+}
+int main() {
+    try {
+        testGaussian();
+    }
+    catch(const exception& e) {
+        cout << "exception: " << e.what() << endl;
+        return 1;
+    }
+    cout << "Done" << endl;
+    return 0;
+}
--- a/platforms/opencl/tests/TestOpenCLSettle.cpp
+++ b/platforms/opencl/tests/TestOpenCLSettle.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2009 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+/**
+ * This tests the OpenCL implementation of the SETTLE algorithm.
+ */
+#include "../../../tests/AssertionUtilities.h"
+#include "openmm/Context.h"
+#include "OpenCLPlatform.h"
+#include "openmm/NonbondedForce.h"
+#include "openmm/System.h"
+#include "openmm/VerletIntegrator.h"
+#include "../src/sfmt/SFMT.h"
+#include <iostream>
+#include <vector>
+using namespace OpenMM;
+using namespace std;
+void testConstraints() {
+    const int numMolecules = 10;
+    const int numParticles = numMolecules*3;
+    const int numConstraints = numMolecules*3;
+    const double temp = 100.0;
+    OpenCLPlatform platform;
+    System system;
+    VerletIntegrator integrator(0.001);
+    integrator.setConstraintTolerance(1e-5);
+    NonbondedForce* forceField = new NonbondedForce();
+    for (int i = 0; i < numMolecules; ++i) {
+        system.addParticle(16.0);
+        system.addParticle(1.0);
+        system.addParticle(1.0);
+        forceField->addParticle(-0.82, 0.317, 0.65);
+        forceField->addParticle(0.41, 1.0, 0.0);
+        forceField->addParticle(0.41, 1.0, 0.0);
+        system.addConstraint(i*3, i*3+1, 0.1);
+        system.addConstraint(i*3, i*3+2, 0.1);
+        system.addConstraint(i*3+1, i*3+2, 0.163);
+    }
+//    system.addForce(forceField);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(numParticles);
+    vector<Vec3> velocities(numParticles);
+    init_gen_rand(0);
+    for (int i = 0; i < numMolecules; ++i) {
+        positions[i*3] = Vec3((i%4)*0.4, (i/4)*0.4, 0);
+        positions[i*3+1] = positions[i*3]+Vec3(0.1, 0, 0);
+        positions[i*3+2] = positions[i*3]+Vec3(-0.03333, 0.09428, 0);
+        velocities[i*3] = Vec3(genrand_real2()-0.5, genrand_real2()-0.5, genrand_real2()-0.5);
+        velocities[i*3+1] = Vec3(genrand_real2()-0.5, genrand_real2()-0.5, genrand_real2()-0.5);
+        velocities[i*3+2] = Vec3(genrand_real2()-0.5, genrand_real2()-0.5, genrand_real2()-0.5);
+    }
+    context.setPositions(positions);
+    context.setVelocities(velocities);
+    // Simulate it and see whether the constraints remain satisfied.
+    for (int i = 0; i < 1000; ++i) {
+        integrator.step(1);
+        State state = context.getState(State::Positions | State::Forces);
+        for (int j = 0; j < numConstraints; ++j) {
+            int particle1, particle2;
+            double distance;
+            system.getConstraintParameters(j, particle1, particle2, distance);
+            Vec3 p1 = state.getPositions()[particle1];
+            Vec3 p2 = state.getPositions()[particle2];
+            double dist = std::sqrt((p1[0]-p2[0])*(p1[0]-p2[0])+(p1[1]-p2[1])*(p1[1]-p2[1])+(p1[2]-p2[2])*(p1[2]-p2[2]));
+            ASSERT_EQUAL_TOL(distance, dist, 1e-5);
+        }
+    }
+}
+int main() {
+    try {
+        testConstraints();
+    }
+    catch(const exception& e) {
+        cout << "exception: " << e.what() << endl;
+        return 1;
+    }
+    cout << "Done" << endl;
+    return 0;
+}