Merge pull request #3 from openmm/master

Sync with official repo

Merge pull request #3 from openmm/master
Sync with official repo
8dd60914 · Tristan Croll · GitHub · 3475b790 · 75c1fcb6 · 8dd60914
Unverified Commit 8dd60914 authored Nov 22, 2019 by Tristan Croll Committed by GitHub Nov 22, 2019
20 changed files
--- a/platforms/cuda/src/CudaPlatform.cpp
+++ b/platforms/cuda/src/CudaPlatform.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2017 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2019 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -97,6 +97,7 @@ CudaPlatform::CudaPlatform() {
    registerKernelFactory(CalcGayBerneForceKernel::Name(), factory);
    registerKernelFactory(IntegrateVerletStepKernel::Name(), factory);
    registerKernelFactory(IntegrateLangevinStepKernel::Name(), factory);
+    registerKernelFactory(IntegrateBAOABStepKernel::Name(), factory);
    registerKernelFactory(IntegrateBrownianStepKernel::Name(), factory);
    registerKernelFactory(IntegrateVariableVerletStepKernel::Name(), factory);
    registerKernelFactory(IntegrateVariableLangevinStepKernel::Name(), factory);

--- a/platforms/cuda/src/kernels/baoab.cu
+++ b/platforms/cuda/src/kernels/baoab.cu
+enum {VelScale, NoiseScale};  // Indices of the two entries of the paramBuffer array read by integrateBAOABPart2.
+
+/**
+ * BAOAB stage 1. Every particle with a nonzero velm.w receives a half-step velocity update from
+ * the current force, and the half-step displacement implied by the updated velocity is written
+ * to both posDelta and oldDelta. Particles whose velm.w is zero are left untouched.
+ *
+ * The x, y and z force components of particle i are read from force[i], force[i+paddedNumAtoms]
+ * and force[i+2*paddedNumAtoms]. The step size is taken from dt[0].y.
+ */
+
+extern "C" __global__ void integrateBAOABPart1(int numAtoms, int paddedNumAtoms, mixed4* __restrict__ velm, const long long* __restrict__ force, mixed4* __restrict__ posDelta,
+        mixed4* __restrict__ oldDelta, const mixed2* __restrict__ dt) {
+    const mixed halfStep = 0.5*dt[0].y;
+    // Forces arrive as long long values; the division by 0x100000000 is folded into the scale factor.
+    const mixed forceScale = halfStep/(mixed) 0x100000000;
+    const unsigned int stride = blockDim.x*gridDim.x;
+    int atom = blockIdx.x*blockDim.x+threadIdx.x;
+    while (atom < numAtoms) {
+        mixed4 vel = velm[atom];
+        if (vel.w != 0.0) {
+            // Velocity half step.
+            vel.x += forceScale*vel.w*force[atom];
+            vel.y += forceScale*vel.w*force[atom+paddedNumAtoms];
+            vel.z += forceScale*vel.w*force[atom+paddedNumAtoms*2];
+            velm[atom] = vel;
+
+            // Position half step, recorded as a displacement (w is set to 0).
+            const mixed4 halfMove = make_mixed4(halfStep*vel.x, halfStep*vel.y, halfStep*vel.z, 0);
+            posDelta[atom] = halfMove;
+            oldDelta[atom] = halfMove;
+        }
+        atom += stride;
+    }
+}
+
+/**
+ * BAOAB stage 2. For every particle with a nonzero velm.w:
+ *   1. the difference between posDelta and oldDelta (the change made by the constraint step),
+ *      divided by the half step, is added to the velocity;
+ *   2. the velocity is rescaled by paramBuffer[VelScale] and receives a random contribution of
+ *      paramBuffer[NoiseScale]*SQRT(velm.w) times an entry of the random array (heat bath);
+ *   3. posDelta is added to the stored position;
+ *   4. the next half-step displacement is written to both posDelta and oldDelta.
+ *
+ * When USE_MIXED_PRECISION is defined the position is assembled from posq plus posqCorrection,
+ * updated in mixed precision, and split back into the two arrays. Each thread starts reading
+ * random values at randomIndex plus its global thread index.
+ */
+
+extern "C" __global__ void integrateBAOABPart2(int numAtoms, real4* __restrict__ posq, real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, mixed4* __restrict__ posDelta,
+        mixed4* __restrict__ oldDelta, const mixed* __restrict__ paramBuffer, const mixed2* __restrict__ dt, const float4* __restrict__ random, unsigned int randomIndex) {
+    const mixed velocityScale = paramBuffer[VelScale];
+    const mixed noiseScale = paramBuffer[NoiseScale];
+    const mixed halfStep = 0.5*dt[0].y;
+    const mixed invHalfStep = 1/halfStep;
+    const unsigned int stride = blockDim.x*gridDim.x;
+    int atom = blockIdx.x*blockDim.x+threadIdx.x;
+    randomIndex += atom;
+    for (; atom < numAtoms; atom += stride, randomIndex += stride) {
+        mixed4 vel = velm[atom];
+        if (vel.w == 0.0)
+            continue;
+
+        // Fold the change made to the previous displacement back into the velocity.
+        const mixed4 moved = posDelta[atom];
+        const mixed4 planned = oldDelta[atom];
+        const mixed sqrtInvMass = SQRT(vel.w);
+        vel.x += (moved.x-planned.x)*invHalfStep;
+        vel.y += (moved.y-planned.y)*invHalfStep;
+        vel.z += (moved.z-planned.z)*invHalfStep;
+
+        // Heat bath: damp the velocity and add the random contribution.
+        const float4 noise = random[randomIndex];
+        vel.x = velocityScale*vel.x + noiseScale*sqrtInvMass*noise.x;
+        vel.y = velocityScale*vel.y + noiseScale*sqrtInvMass*noise.y;
+        vel.z = velocityScale*vel.z + noiseScale*sqrtInvMass*noise.z;
+        velm[atom] = vel;
+
+        // Apply the (possibly adjusted) displacement to the stored position.
+#ifdef USE_MIXED_PRECISION
+        const real4 posMain = posq[atom];
+        const real4 posExtra = posqCorrection[atom];
+        mixed4 pos = make_mixed4(posMain.x+(mixed)posExtra.x, posMain.y+(mixed)posExtra.y, posMain.z+(mixed)posExtra.z, posMain.w);
+#else
+        real4 pos = posq[atom];
+#endif
+        pos.x += moved.x;
+        pos.y += moved.y;
+        pos.z += moved.z;
+#ifdef USE_MIXED_PRECISION
+        posq[atom] = make_real4((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
+        posqCorrection[atom] = make_real4(pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
+#else
+        posq[atom] = pos;
+#endif
+
+        // Second position half step, again recorded as a displacement.
+        const mixed4 nextMove = make_mixed4(halfStep*vel.x, halfStep*vel.y, halfStep*vel.z, 0);
+        posDelta[atom] = nextMove;
+        oldDelta[atom] = nextMove;
+    }
+}
+
+/**
+ * BAOAB stage 3. For every particle with a nonzero velm.w, the difference between posDelta and
+ * oldDelta (the change made by the constraint step), divided by the half step, is added to the
+ * velocity, and posDelta is then added to the stored position so that the constrained positions
+ * are in place before forces are computed.
+ *
+ * When USE_MIXED_PRECISION is defined the position is assembled from posq plus posqCorrection,
+ * updated in mixed precision, and split back into the two arrays.
+ */
+
+extern "C" __global__ void integrateBAOABPart3(int numAtoms, real4* __restrict__ posq, real4* __restrict__ posqCorrection, mixed4* __restrict__ velm,
+        mixed4* __restrict__ posDelta, mixed4* __restrict__ oldDelta, const mixed2* __restrict__ dt) {
+    const mixed halfStep = 0.5*dt[0].y;
+    const mixed invHalfStep = 1/halfStep;
+    const unsigned int stride = blockDim.x*gridDim.x;
+    int atom = blockIdx.x*blockDim.x+threadIdx.x;
+    while (atom < numAtoms) {
+        mixed4 vel = velm[atom];
+        if (vel.w != 0.0) {
+            // Fold the change made to the displacement back into the velocity.
+            const mixed4 moved = posDelta[atom];
+            const mixed4 planned = oldDelta[atom];
+            vel.x += (moved.x-planned.x)*invHalfStep;
+            vel.y += (moved.y-planned.y)*invHalfStep;
+            vel.z += (moved.z-planned.z)*invHalfStep;
+            velm[atom] = vel;
+
+            // Apply the displacement to the stored position.
+#ifdef USE_MIXED_PRECISION
+            const real4 posMain = posq[atom];
+            const real4 posExtra = posqCorrection[atom];
+            mixed4 pos = make_mixed4(posMain.x+(mixed)posExtra.x, posMain.y+(mixed)posExtra.y, posMain.z+(mixed)posExtra.z, posMain.w);
+#else
+            real4 pos = posq[atom];
+#endif
+            pos.x += moved.x;
+            pos.y += moved.y;
+            pos.z += moved.z;
+#ifdef USE_MIXED_PRECISION
+            posq[atom] = make_real4((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
+            posqCorrection[atom] = make_real4(pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
+#else
+            posq[atom] = pos;
+#endif
+        }
+        atom += stride;
+    }
+}
+
+/**
+ * BAOAB stage 4: the closing velocity half step. Every particle with a nonzero velm.w has its
+ * velocity advanced by half a step using the current force; nothing else is modified.
+ *
+ * The x, y and z force components of particle i are read from force[i], force[i+paddedNumAtoms]
+ * and force[i+2*paddedNumAtoms]. The step size is taken from dt[0].y.
+ */
+
+extern "C" __global__ void integrateBAOABPart4(int numAtoms, int paddedNumAtoms, mixed4* __restrict__ velm,
+        const long long* __restrict__ force, const mixed2* __restrict__ dt) {
+    const mixed halfStep = 0.5*dt[0].y;
+    // Forces arrive as long long values; the division by 0x100000000 is folded into the scale factor.
+    const mixed forceScale = halfStep/(mixed) 0x100000000;
+    const unsigned int stride = blockDim.x*gridDim.x;
+    for (int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < numAtoms; atom += stride) {
+        mixed4 vel = velm[atom];
+        if (vel.w == 0.0)
+            continue;
+        vel.x += forceScale*vel.w*force[atom];
+        vel.y += forceScale*vel.w*force[atom+paddedNumAtoms];
+        vel.z += forceScale*vel.w*force[atom+paddedNumAtoms*2];
+        velm[atom] = vel;
+    }
+}
--- a/platforms/cuda/src/kernels/findInteractingBlocks.cu
+++ b/platforms/cuda/src/kernels/findInteractingBlocks.cu
@@ -243,6 +243,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
        for (int block2Base = block1+1; block2Base < NUM_BLOCKS; block2Base += 32) {
            int block2 = block2Base+indexInWarp;
            bool includeBlock2 = (block2 < NUM_BLOCKS);
+            bool forceInclude = false;
            if (includeBlock2) {
                real4 blockCenterY = sortedBlockCenter[block2];
                real4 blockSizeY = sortedBlockBoundingBox[block2];
@@ -260,7 +261,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
                // If there's any possibility we might have missed it, do a detailed check.

                if (periodicBoxSize.z/2-blockSizeX.z-blockSizeY.z < PADDED_CUTOFF || periodicBoxSize.y/2-blockSizeX.y-blockSizeY.y < PADDED_CUTOFF)
-                    includeBlock2 = true;
+                    includeBlock2 = forceInclude = true;
 #endif
                if (includeBlock2) {
                    unsigned short y = (unsigned short) sortedBlocks[block2].y;
@@ -272,9 +273,11 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
            // Loop over any blocks we identified as potentially containing neighbors.
            
            int includeBlockFlags = BALLOT(includeBlock2);
+            int forceIncludeFlags = BALLOT(forceInclude);
            while (includeBlockFlags != 0) {
                int i = __ffs(includeBlockFlags)-1;
                includeBlockFlags &= includeBlockFlags-1;
+                forceInclude = (forceIncludeFlags>>i) & 1;
                unsigned short y = (unsigned short) sortedBlocks[block2Base+i].y;

                // Check each atom in block Y for interactions.
@@ -291,7 +294,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
 #ifdef USE_PERIODIC
                APPLY_PERIODIC_TO_DELTA(atomDelta)
 #endif
-                int atomFlags = BALLOT(atomDelta.x*atomDelta.x+atomDelta.y*atomDelta.y+atomDelta.z*atomDelta.z < (PADDED_CUTOFF+blockCenterY.w)*(PADDED_CUTOFF+blockCenterY.w));
+                int atomFlags = BALLOT(forceInclude || atomDelta.x*atomDelta.x+atomDelta.y*atomDelta.y+atomDelta.z*atomDelta.z < (PADDED_CUTOFF+blockCenterY.w)*(PADDED_CUTOFF+blockCenterY.w));
                int interacts = 0;
                if (atom2 < NUM_ATOMS && atomFlags != 0) {
                    int first = __ffs(atomFlags)-1;

--- a/platforms/cuda/tests/TestCudaBAOABLangevinIntegrator.cpp
+++ b/platforms/cuda/tests/TestCudaBAOABLangevinIntegrator.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2019 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "CudaTests.h"
+#include "TestBAOABLangevinIntegrator.h"
+
+void runPlatformTests() {  // Empty: this file defines no CUDA-specific tests. NOTE(review): the test cases presumably come from the included TestBAOABLangevinIntegrator.h - confirm.
+}
--- a/platforms/opencl/include/OpenCLContext.h
+++ b/platforms/opencl/include/OpenCLContext.h
@@ -32,6 +32,18 @@
 #include <string>
 #define __CL_ENABLE_EXCEPTIONS
 #define CL_USE_DEPRECATED_OPENCL_1_1_APIS
+#ifndef CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD  /* Fallback values for the AMD device-info query names below, used only when they are not already defined. */
+  #define CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD 0x4040
+#endif
+#ifndef CL_DEVICE_SIMD_WIDTH_AMD
+  #define CL_DEVICE_SIMD_WIDTH_AMD 0x4041
+#endif
+#ifndef CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD
+  #define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD 0x4042
+#endif
+#ifndef CL_DEVICE_WAVEFRONT_WIDTH_AMD
+  #define CL_DEVICE_WAVEFRONT_WIDTH_AMD 0x4043
+#endif
 #ifdef _MSC_VER
    // Prevent Windows from defining macros that interfere with other code.
    #define NOMINMAX

--- a/platforms/opencl/include/OpenCLKernels.h
+++ b/platforms/opencl/include/OpenCLKernels.h
@@ -1383,6 +1383,47 @@ private:
    cl::Kernel kernel1, kernel2;
 };

+/**
+ * This kernel is invoked by BAOABLangevinIntegrator to take one time step. Each step launches the four integrateBAOABPart1..4 device kernels in order.
+ */
+class OpenCLIntegrateBAOABStepKernel : public IntegrateBAOABStepKernel {
+public:
+    OpenCLIntegrateBAOABStepKernel(std::string name, const Platform& platform, OpenCLContext& cl) : IntegrateBAOABStepKernel(name, platform), cl(cl),
+            hasInitializedKernels(false) {  // kernel arguments are bound later, on the first call to execute()
+    }
+    /**
+     * Initialize the kernel: seed the random number generator, build the BAOAB device kernels and allocate the parameter and oldDelta buffers.
+     * 
+     * @param system     the System this kernel will be applied to
+     * @param integrator the BAOABLangevinIntegrator this kernel will be used for
+     */
+    void initialize(const System& system, const BAOABLangevinIntegrator& integrator);
+    /**
+     * Execute the kernel.
+     * 
+     * @param context    the context in which to execute this kernel
+     * @param integrator the BAOABLangevinIntegrator this kernel is being used for
+     * @param forcesAreValid if the context has been modified since the last time step, this will be
+     *                       false to show that cached forces are invalid and must be recalculated.
+     *                       On exit, this should specify whether the cached forces are valid at the
+     *                       end of the step.
+     */
+    void execute(ContextImpl& context, const BAOABLangevinIntegrator& integrator, bool& forcesAreValid);
+    /**
+     * Compute the kinetic energy.
+     * 
+     * @param context    the context in which to execute this kernel
+     * @param integrator the BAOABLangevinIntegrator this kernel is being used for
+     */
+    double computeKineticEnergy(ContextImpl& context, const BAOABLangevinIntegrator& integrator);
+private:
+    OpenCLContext& cl;
+    double prevTemp, prevFriction, prevStepSize;  // integrator settings in effect when params was last uploaded
+    bool hasInitializedKernels;                   // true once execute() has bound the kernel buffer arguments
+    OpenCLArray params, oldDelta;                 // params: {vscale, noisescale}; oldDelta: one 4-vector per padded atom
+    cl::Kernel kernel1, kernel2, kernel3, kernel4;  // integrateBAOABPart1 .. integrateBAOABPart4
+};
+
 /**
 * This kernel is invoked by BrownianIntegrator to take one time step.
 */
@@ -1584,7 +1625,7 @@ private:
    double energy;
    float energyFloat;
    int numGlobalVariables, sumWorkGroupSize;
-    bool hasInitializedKernels, deviceGlobalsAreCurrent, modifiesParameters, keNeedsForce, hasAnyConstraints, needsEnergyParamDerivs;
+    bool hasInitializedKernels, deviceGlobalsAreCurrent, modifiesParameters, hasAnyConstraints, needsEnergyParamDerivs;
    std::vector<bool> deviceValuesAreCurrent;
    mutable std::vector<bool> localValuesAreCurrent;
    OpenCLArray globalValues;

--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -131,7 +131,6 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
                    // This attribute does not ensure that all queries are supported by the runtime (it may be an older runtime,
                    // or the CPU device) so still have to check for errors.
                    try {
-#ifdef CL_DEVICE_SIMD_WIDTH_AMD
                        processingElementsPerComputeUnit =
                            // AMD GPUs either have a single VLIW SIMD or multiple scalar SIMDs.
                            // The SIMD width is the number of threads the SIMD executes per cycle.
@@ -145,7 +144,6 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
                        // Just in case any of the queries return 0.
                        if (processingElementsPerComputeUnit <= 0)
                            processingElementsPerComputeUnit = 1;
-#endif
                    }
                    catch (cl::Error err) {
                        // Runtime does not support the queries so use default.
@@ -221,7 +219,6 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
                    // This attribute does not ensure that all queries are supported by the runtime so still have to
                    // check for errors.
                    try {
-#ifdef CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD
                        // Must catch cl:Error as will fail if runtime does not support queries.

                        cl_uint simdPerComputeUnit = device.getInfo<CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD>();
@@ -230,12 +227,15 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
                        // If the GPU has multiple SIMDs per compute unit then it is uses the scalar instruction
                        // set instead of the VLIW instruction set. It therefore needs more thread blocks per
                        // compute unit to hide memory latency.
-                        if (simdPerComputeUnit > 1)
-                            numThreadBlocksPerComputeUnit = 4 * simdPerComputeUnit;
+                        if (simdPerComputeUnit > 1) {
+                            if (simdWidth == 32)
+                                numThreadBlocksPerComputeUnit = 6*simdPerComputeUnit; // Navi seems to like more thread blocks than older GPUs
+                            else
+                                numThreadBlocksPerComputeUnit = 4*simdPerComputeUnit;
+                        }

                        // If the queries are supported then must be newer than SDK 2.4.
                        amdPostSdk2_4 = true;
-#endif
                    }
                    catch (cl::Error err) {
                        // Runtime does not support the query so is unlikely to be the newer scalar GPU.
@@ -254,7 +254,7 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
        if (supportsDoublePrecision)
            compilationDefines["SUPPORTS_DOUBLE_PRECISION"] = "";
        if (simdWidth >= 32)
-            compilationDefines["SYNC_WARPS"] = "";
+            compilationDefines["SYNC_WARPS"] = "mem_fence(CLK_LOCAL_MEM_FENCE)";
        else
            compilationDefines["SYNC_WARPS"] = "barrier(CLK_LOCAL_MEM_FENCE)";
        vector<cl::Device> contextDevices;
@@ -729,7 +729,7 @@ void OpenCLContext::clearAutoclearBuffers() {
        executeKernel(clearTwoBuffersKernel, max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), 128);
    }
    else if (total-base == 1) {
-        clearBuffer(*autoclearBuffers[base], autoclearBufferSizes[base]);
+        clearBuffer(*autoclearBuffers[base], autoclearBufferSizes[base]*4);
    }
 }


--- a/platforms/opencl/src/OpenCLKernelFactory.cpp
+++ b/platforms/opencl/src/OpenCLKernelFactory.cpp
@@ -118,6 +118,8 @@ KernelImpl* OpenCLKernelFactory::createKernelImpl(std::string name, const Platfo
        return new OpenCLIntegrateVerletStepKernel(name, platform, cl);
    if (name == IntegrateLangevinStepKernel::Name())
        return new OpenCLIntegrateLangevinStepKernel(name, platform, cl);
+    if (name == IntegrateBAOABStepKernel::Name())
+        return new OpenCLIntegrateBAOABStepKernel(name, platform, cl);
    if (name == IntegrateBrownianStepKernel::Name())
        return new OpenCLIntegrateBrownianStepKernel(name, platform, cl);
    if (name == IntegrateVariableVerletStepKernel::Name())

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -7473,6 +7473,111 @@ double OpenCLIntegrateLangevinStepKernel::computeKineticEnergy(ContextImpl& cont
    return cl.getIntegrationUtilities().computeKineticEnergy(0.5*integrator.getStepSize());
 }

+void OpenCLIntegrateBAOABStepKernel::initialize(const System& system, const BAOABLangevinIntegrator& integrator) {  // One-time setup: RNG seed, program build, buffer allocation.
+    cl.getPlatformData().initializeContexts(system);
+    cl.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
+    map<string, string> defines;  // atom counts are passed to the device program as defines
+    defines["NUM_ATOMS"] = cl.intToString(cl.getNumAtoms());
+    defines["PADDED_NUM_ATOMS"] = cl.intToString(cl.getPaddedNumAtoms());
+    cl::Program program = cl.createProgram(OpenCLKernelSources::baoab, defines, "");
+    kernel1 = cl::Kernel(program, "integrateBAOABPart1");
+    kernel2 = cl::Kernel(program, "integrateBAOABPart2");
+    kernel3 = cl::Kernel(program, "integrateBAOABPart3");
+    kernel4 = cl::Kernel(program, "integrateBAOABPart4");
+    if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {  // double elements in double or mixed precision mode
+        params.initialize<cl_double>(cl, 2, "baoabParams");
+        oldDelta.initialize<mm_double4>(cl, cl.getPaddedNumAtoms(), "oldDelta");
+    }
+    else {  // float elements otherwise
+        params.initialize<cl_float>(cl, 2, "baoabParams");
+        oldDelta.initialize<mm_float4>(cl, cl.getPaddedNumAtoms(), "oldDelta");
+    }
+    prevStepSize = -1.0;  // sentinel: execute() compares the step size against this, so a first call with any other step size uploads params
+}
+
+void OpenCLIntegrateBAOABStepKernel::execute(ContextImpl& context, const BAOABLangevinIntegrator& integrator, bool& forcesAreValid) {  // Advance the simulation by one BAOAB step.
+    OpenCLIntegrationUtilities& integration = cl.getIntegrationUtilities();
+    int numAtoms = cl.getNumAtoms();
+    if (!hasInitializedKernels) {  // bind the buffer arguments once, on the first call
+        hasInitializedKernels = true;
+        kernel1.setArg<cl::Buffer>(0, cl.getVelm().getDeviceBuffer());
+        kernel1.setArg<cl::Buffer>(1, cl.getForce().getDeviceBuffer());
+        kernel1.setArg<cl::Buffer>(2, integration.getPosDelta().getDeviceBuffer());
+        kernel1.setArg<cl::Buffer>(3, oldDelta.getDeviceBuffer());
+        kernel1.setArg<cl::Buffer>(4, integration.getStepSize().getDeviceBuffer());
+        kernel2.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
+        setPosqCorrectionArg(cl, kernel2, 1);
+        kernel2.setArg<cl::Buffer>(2, cl.getVelm().getDeviceBuffer());
+        kernel2.setArg<cl::Buffer>(3, integration.getPosDelta().getDeviceBuffer());
+        kernel2.setArg<cl::Buffer>(4, oldDelta.getDeviceBuffer());
+        kernel2.setArg<cl::Buffer>(5, params.getDeviceBuffer());
+        kernel2.setArg<cl::Buffer>(6, integration.getStepSize().getDeviceBuffer());
+        kernel2.setArg<cl::Buffer>(7, integration.getRandom().getDeviceBuffer());  // argument 8 (randomIndex) is set on every step below
+        kernel3.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
+        setPosqCorrectionArg(cl, kernel3, 1);
+        kernel3.setArg<cl::Buffer>(2, cl.getVelm().getDeviceBuffer());
+        kernel3.setArg<cl::Buffer>(3, integration.getPosDelta().getDeviceBuffer());
+        kernel3.setArg<cl::Buffer>(4, oldDelta.getDeviceBuffer());
+        kernel3.setArg<cl::Buffer>(5, integration.getStepSize().getDeviceBuffer());
+        kernel4.setArg<cl::Buffer>(0, cl.getVelm().getDeviceBuffer());
+        kernel4.setArg<cl::Buffer>(1, cl.getForce().getDeviceBuffer());
+        kernel4.setArg<cl::Buffer>(2, integration.getStepSize().getDeviceBuffer());
+    }
+    if (!forcesAreValid) {  // the first velocity half step needs forces, so recompute them if the caller reports them stale
+        context.calcForcesAndEnergy(true, false);
+        forcesAreValid = true;
+    }
+    double temperature = integrator.getTemperature();
+    double friction = integrator.getFriction();
+    double stepSize = integrator.getStepSize();
+    cl.getIntegrationUtilities().setNextStepSize(stepSize);
+    if (temperature != prevTemp || friction != prevFriction || stepSize != prevStepSize) {  // re-upload only when a setting changed
+        // Calculate the integration parameters.
+
+        double kT = BOLTZ*temperature;
+        double vscale = exp(-stepSize*friction);  // velocity scale factor applied in part 2
+        double noisescale = sqrt(kT*(1-vscale*vscale));  // multiplies the random term in part 2
+        vector<cl_double> p(params.getSize());  // NOTE(review): always cl_double even when params holds cl_float; presumably upload()'s flags request conversion - confirm
+        p[0] = vscale;
+        p[1] = noisescale;
+        params.upload(p, true, true);
+        prevTemp = temperature;
+        prevFriction = friction;
+        prevStepSize = stepSize;
+    }
+
+    // Perform the integration.
+
+    kernel2.setArg<cl_uint>(8, integration.prepareRandomNumbers(cl.getPaddedNumAtoms()));  // starting index into the random buffer for this step
+    cl.executeKernel(kernel1, numAtoms);  // part 1: velocity half step, first position half-step delta
+    integration.applyConstraints(integrator.getConstraintTolerance());
+    cl.executeKernel(kernel2, numAtoms);  // part 2: heat bath, apply the delta, second position half-step delta
+    integration.applyConstraints(integrator.getConstraintTolerance());
+    cl.executeKernel(kernel3, numAtoms);  // part 3: apply the second delta to the positions
+    context.calcForcesAndEnergy(true, false);  // forces at the updated positions
+    cl.executeKernel(kernel4, numAtoms);  // part 4: closing velocity half step
+    integration.applyVelocityConstraints(integrator.getConstraintTolerance());
+    integration.computeVirtualSites();
+
+    // Update the time and step count.
+
+    cl.setTime(cl.getTime()+stepSize);
+    cl.setStepCount(cl.getStepCount()+1);
+    cl.reorderAtoms();
+    if (cl.getAtomsWereReordered())  // report the cached forces as invalid after a reorder
+        forcesAreValid = false;
+    
+    // Reduce UI lag.
+    
+#ifdef WIN32
+    cl.getQueue().flush();
+#endif
+}
+
+double OpenCLIntegrateBAOABStepKernel::computeKineticEnergy(ContextImpl& context, const BAOABLangevinIntegrator& integrator) {  // Delegates to the shared integration utilities.
+    return cl.getIntegrationUtilities().computeKineticEnergy(0.0);  // passes 0.0 where the Langevin kernel above passes 0.5*stepSize; NOTE(review): presumably no time shift is needed for BAOAB - confirm
+}
+
 OpenCLIntegrateBrownianStepKernel::~OpenCLIntegrateBrownianStepKernel() {
 }

@@ -7594,6 +7699,8 @@ double OpenCLIntegrateVariableVerletStepKernel::execute(ContextImpl& context, co
    // Select the step size to use.

    double maxStepSize = maxTime-cl.getTime();
+    if (integrator.getMaximumStepSize() > 0)
+        maxStepSize = min(integrator.getMaximumStepSize(), maxStepSize);
    float maxStepSizeFloat = (float) maxStepSize;
    if (useDouble) {
        selectSizeKernel.setArg<cl_double>(1, maxStepSize);
@@ -7691,6 +7798,8 @@ double OpenCLIntegrateVariableLangevinStepKernel::execute(ContextImpl& context,
    // Select the step size to use.

    double maxStepSize = maxTime-cl.getTime();
+    if (integrator.getMaximumStepSize() > 0)
+        maxStepSize = min(integrator.getMaximumStepSize(), maxStepSize);
    float maxStepSizeFloat = (float) maxStepSize;
    if (useDouble) {
        selectSizeKernel.setArg<cl_double>(0, maxStepSize);
@@ -8255,7 +8364,6 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
            kineticEnergyKernel.setArg<cl::Buffer>(index++, array.getDeviceBuffer());
        for (auto& array : tabulatedFunctions)
            kineticEnergyKernel.setArg<cl::Buffer>(index++, array.getDeviceBuffer());
-        keNeedsForce = usesVariable(keExpression, "f");

        // Create a second kernel to sum the values.

@@ -8518,16 +8626,6 @@ bool OpenCLIntegrateCustomStepKernel::evaluateCondition(int step) {

 double OpenCLIntegrateCustomStepKernel::computeKineticEnergy(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid) {
    prepareForComputation(context, integrator, forcesAreValid);
-    if (keNeedsForce && !forcesAreValid) {
-        // Compute the force.  We want to then mark that forces are valid, which means also computing
-        // potential energy if any steps will expect it to be valid too.
-        
-        bool willNeedEnergy = false;
-        for (int i = 0; i < integrator.getNumComputations(); i++)
-            willNeedEnergy |= needsEnergy[i];
-        energy = context.calcForcesAndEnergy(true, willNeedEnergy, -1);
-        forcesAreValid = true;
-    }
    cl.clearBuffer(sumBuffer);
    kineticEnergyKernel.setArg<cl::Buffer>(8, cl.getIntegrationUtilities().getRandom().getDeviceBuffer());
    kineticEnergyKernel.setArg<cl_uint>(9, 0);

--- a/platforms/opencl/src/OpenCLPlatform.cpp
+++ b/platforms/opencl/src/OpenCLPlatform.cpp
@@ -88,6 +88,7 @@ OpenCLPlatform::OpenCLPlatform() {
    registerKernelFactory(CalcGayBerneForceKernel::Name(), factory);
    registerKernelFactory(IntegrateVerletStepKernel::Name(), factory);
    registerKernelFactory(IntegrateLangevinStepKernel::Name(), factory);
+    registerKernelFactory(IntegrateBAOABStepKernel::Name(), factory);
    registerKernelFactory(IntegrateBrownianStepKernel::Name(), factory);
    registerKernelFactory(IntegrateVariableVerletStepKernel::Name(), factory);
    registerKernelFactory(IntegrateVariableLangevinStepKernel::Name(), factory);
@@ -141,6 +142,17 @@ bool OpenCLPlatform::isPlatformSupported() {
        return false;
 #endif

+    // Make sure at least one OpenCL implementation is installed.
+
+    std::vector<cl::Platform> platforms;
+    try {
+        cl::Platform::get(&platforms);
+        if (platforms.size() == 0)
+            return false;
+    }
+    catch (...) {
+        return false;
+    }
    return true;
 }


--- a/platforms/opencl/src/kernels/baoab.cl
+++ b/platforms/opencl/src/kernels/baoab.cl
+// Indices into the paramBuffer argument of integrateBAOABPart2: the velocity scale factor and the noise scale factor.
+enum {VelScale, NoiseScale};
+
+/**
+ * BAOAB step, part 1.  For every particle whose velm.w (inverse mass) is nonzero, advance the
+ * velocity by half a time step using the current force, then store the resulting half step
+ * displacement in both posDelta and oldDelta.
+ *
+ * Each work-item starts at get_global_id(0) and strides by get_global_size(0), so any
+ * one-dimensional launch size covers all NUM_ATOMS particles.  The step size is read from dt[0].y.
+ */
+
+__kernel void integrateBAOABPart1(__global mixed4* restrict velm, __global const real4* restrict force, __global mixed4* restrict posDelta,
+        __global mixed4* restrict oldDelta, __global const mixed2* restrict dt) {
+    const mixed halfStep = 0.5*dt[0].y;
+    int atom = get_global_id(0);
+    while (atom < NUM_ATOMS) {
+        mixed4 vel = velm[atom];
+        if (vel.w != 0.0) {
+            // Load the force once; the same scale factor applies to all three components.
+            const real4 f = force[atom];
+            const mixed kick = halfStep*vel.w;
+            vel.x += kick*f.x;
+            vel.y += kick*f.y;
+            vel.z += kick*f.z;
+            velm[atom] = vel;
+            const mixed4 halfMove = halfStep*vel;
+            posDelta[atom] = halfMove;
+            oldDelta[atom] = halfMove;
+        }
+        atom += get_global_size(0);
+    }
+}
+
+/**
+ * BAOAB step, part 2.  For every particle whose velm.w (inverse mass) is nonzero:
+ *   1. add (posDelta - oldDelta)/halfdt to the velocity, i.e. the change that was applied to the
+ *      displacement after part 1 stored it (the constraint correction);
+ *   2. apply the heat bath update v = vscale*v + noisescale*sqrt(invMass)*random;
+ *   3. add the displacement in posDelta to the stored position;
+ *   4. store the next half step displacement in both posDelta and oldDelta.
+ *
+ * Each work-item starts at get_global_id(0) and strides by get_global_size(0); randomIndex is
+ * advanced in step with the particle index so each particle reads its own entry of random.
+ */
+
+__kernel void integrateBAOABPart2(__global real4* restrict posq, __global real4* restrict posqCorrection, __global mixed4* restrict velm, __global mixed4* restrict posDelta,
+        __global mixed4* restrict oldDelta, __global const mixed* restrict paramBuffer, __global const mixed2* restrict dt, __global const float4* restrict random, unsigned int randomIndex) {
+    const mixed velScale = paramBuffer[VelScale];
+    const mixed noiseAmplitude = paramBuffer[NoiseScale];
+    const mixed halfStep = 0.5*dt[0].y;
+    const mixed invHalfStep = 1/halfStep;
+    int atom = get_global_id(0);
+    randomIndex += atom;
+    for (; atom < NUM_ATOMS; atom += get_global_size(0), randomIndex += get_global_size(0)) {
+        mixed4 vel = velm[atom];
+        if (vel.w != 0.0) {
+            const mixed4 move = posDelta[atom];
+            const mixed4 planned = oldDelta[atom];
+            const float4 gaussian = random[randomIndex];
+            const mixed sqrtInvMass = SQRT(vel.w);
+            const mixed noise = noiseAmplitude*sqrtInvMass;
+
+            // Velocity correction from the change in displacement, followed by the heat bath update.
+            vel.x += (move.x-planned.x)*invHalfStep;
+            vel.y += (move.y-planned.y)*invHalfStep;
+            vel.z += (move.z-planned.z)*invHalfStep;
+            vel.x = velScale*vel.x + noise*gaussian.x;
+            vel.y = velScale*vel.y + noise*gaussian.y;
+            vel.z = velScale*vel.z + noise*gaussian.z;
+            velm[atom] = vel;
+
+            // Position update.  In mixed precision the position is posq plus posqCorrection.
+#ifdef USE_MIXED_PRECISION
+            const real4 coarse = posq[atom];
+            const real4 fine = posqCorrection[atom];
+            mixed4 pos = (mixed4) (coarse.x+(mixed)fine.x, coarse.y+(mixed)fine.y, coarse.z+(mixed)fine.z, coarse.w);
+            pos.xyz += move.xyz;
+            posq[atom] = convert_real4(pos);
+            posqCorrection[atom] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
+#else
+            real4 pos = posq[atom];
+            pos.xyz += move.xyz;
+            posq[atom] = pos;
+#endif
+
+            // Queue the second half step displacement.
+            const mixed4 nextMove = halfStep*vel;
+            posDelta[atom] = nextMove;
+            oldDelta[atom] = nextMove;
+        }
+    }
+}
+
+/**
+ * BAOAB step, part 3.  For every particle whose velm.w (inverse mass) is nonzero, add
+ * (posDelta - oldDelta)/halfdt to the velocity (the change applied to the displacement since
+ * part 2 stored it, i.e. the constraint correction), then add the displacement in posDelta to the
+ * stored position so that forces can be computed at the new positions.
+ *
+ * Each work-item starts at get_global_id(0) and strides by get_global_size(0), so any
+ * one-dimensional launch size covers all NUM_ATOMS particles.
+ */
+
+__kernel void integrateBAOABPart3(__global real4* restrict posq, __global real4* restrict posqCorrection, __global mixed4* restrict velm,
+         __global mixed4* restrict posDelta, __global mixed4* restrict oldDelta, __global const mixed2* restrict dt) {
+    const mixed halfStep = 0.5*dt[0].y;
+    const mixed invHalfStep = 1/halfStep;
+    int atom = get_global_id(0);
+    while (atom < NUM_ATOMS) {
+        mixed4 vel = velm[atom];
+        if (vel.w != 0.0) {
+            const mixed4 move = posDelta[atom];
+            vel.xyz += (move.xyz-oldDelta[atom].xyz)*invHalfStep;
+            velm[atom] = vel;
+
+            // Position update.  In mixed precision the position is posq plus posqCorrection.
+#ifdef USE_MIXED_PRECISION
+            const real4 coarse = posq[atom];
+            const real4 fine = posqCorrection[atom];
+            mixed4 pos = (mixed4) (coarse.x+(mixed)fine.x, coarse.y+(mixed)fine.y, coarse.z+(mixed)fine.z, coarse.w);
+            pos.xyz += move.xyz;
+            posq[atom] = convert_real4(pos);
+            posqCorrection[atom] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
+#else
+            real4 pos = posq[atom];
+            pos.xyz += move.xyz;
+            posq[atom] = pos;
+#endif
+        }
+        atom += get_global_size(0);
+    }
+}
+
+/**
+ * BAOAB step, part 4.  For every particle whose velm.w (inverse mass) is nonzero, advance the
+ * velocity by half a time step using the current force.
+ *
+ * Each work-item starts at get_global_id(0) and strides by get_global_size(0), so any
+ * one-dimensional launch size covers all NUM_ATOMS particles.  The step size is read from dt[0].y.
+ */
+
+__kernel void integrateBAOABPart4(__global mixed4* restrict velm, __global const real4* restrict force, __global const mixed2* restrict dt) {
+    const mixed halfStep = 0.5*dt[0].y;
+    int atom = get_global_id(0);
+    while (atom < NUM_ATOMS) {
+        mixed4 vel = velm[atom];
+        if (vel.w != 0.0) {
+            // Load the force once; the same scale factor applies to all three components.
+            const real4 f = force[atom];
+            const mixed kick = halfStep*vel.w;
+            vel.x += kick*f.x;
+            vel.y += kick*f.y;
+            vel.z += kick*f.z;
+            velm[atom] = vel;
+        }
+        atom += get_global_size(0);
+    }
+}
--- a/platforms/opencl/tests/TestOpenCLBAOABLangevinIntegrator.cpp
+++ b/platforms/opencl/tests/TestOpenCLBAOABLangevinIntegrator.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2019 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "OpenCLTests.h"
+#include "TestBAOABLangevinIntegrator.h"
+
+void runPlatformTests() {
+    // Intentionally empty: this file defines no tests specific to the OpenCL platform.
+    // NOTE(review): the shared tests presumably come from TestBAOABLangevinIntegrator.h - confirm.
+}
--- a/platforms/reference/include/ReferenceBAOABDynamics.h
+++ b/platforms/reference/include/ReferenceBAOABDynamics.h
+
+/* Portions copyright (c) 2006-2019 Stanford University and Simbios.
+ * Contributors: Pande Group
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __ReferenceBAOABDynamics_H__
+#define __ReferenceBAOABDynamics_H__
+
+#include "ReferenceDynamics.h"
+#include "openmm/internal/ContextImpl.h"
+#include "openmm/internal/windowsExport.h"
+
+namespace OpenMM {
+
+/**
+ * Dynamics object created by ReferenceIntegrateBAOABStepKernel to advance a simulation with the
+ * BAOAB Langevin integrator.  update() is the public entry point; updatePart1(), updatePart2() and
+ * updatePart3() are virtual so that subclasses can replace individual stages.
+ * NOTE(review): the order in which update() invokes the parts is defined in the .cpp file - confirm there.
+ */
+class OPENMM_EXPORT ReferenceBAOABDynamics : public ReferenceDynamics {
+
+   protected:
+
+      // Per-step working state.  NOTE(review): how xPrime and oldx are filled is defined in the
+      // .cpp file; inverseMasses and xPrime share their names with the updatePart* parameters.
+      std::vector<OpenMM::Vec3> xPrime, oldx;
+      std::vector<double> inverseMasses;
+      // Friction coefficient; presumably the value passed to the constructor and returned by getFriction().
+      double friction;
+      
+   public:
+
+      /**---------------------------------------------------------------------------------------
+      
+         Constructor
+
+         @param numberOfAtoms  number of atoms
+         @param deltaT         delta t for dynamics
+         @param friction       friction coefficient
+         @param temperature    temperature
+      
+         --------------------------------------------------------------------------------------- */
+
+       ReferenceBAOABDynamics(int numberOfAtoms, double deltaT, double friction, double temperature);
+
+      /**---------------------------------------------------------------------------------------
+      
+         Destructor
+      
+         --------------------------------------------------------------------------------------- */
+
+       ~ReferenceBAOABDynamics();
+
+      /**---------------------------------------------------------------------------------------
+      
+         Get friction coefficient
+
+         @return the friction coefficient
+      
+         --------------------------------------------------------------------------------------- */
+      
+      double getFriction() const;
+      
+      /**---------------------------------------------------------------------------------------
+      
+         Update
+      
+         @param context             the context this integrator is updating
+         @param atomCoordinates     atom coordinates
+         @param velocities          velocities
+         @param masses              atom masses
+         @param forcesAreValid      whether the current forces are valid or need to be recomputed
+         @param tolerance           the constraint tolerance
+      
+         --------------------------------------------------------------------------------------- */
+     
+      void update(OpenMM::ContextImpl& context, std::vector<OpenMM::Vec3>& atomCoordinates,
+                  std::vector<OpenMM::Vec3>& velocities, std::vector<double>& masses, bool& forcesAreValid, double tolerance);
+     
+      /**---------------------------------------------------------------------------------------
+      
+         First update; based on code in update.c do_update_sd() Gromacs 3.1.4
+      
+         @param numberOfAtoms       number of atoms
+         @param atomCoordinates     atom coordinates
+         @param velocities          velocities
+         @param forces              forces
+         @param inverseMasses       inverse atom masses
+         @param xPrime              xPrime
+      
+         --------------------------------------------------------------------------------------- */
+      
+      virtual void updatePart1(int numberOfAtoms, std::vector<OpenMM::Vec3>& atomCoordinates, std::vector<OpenMM::Vec3>& velocities,
+                       std::vector<OpenMM::Vec3>& forces, std::vector<double>& inverseMasses, std::vector<OpenMM::Vec3>& xPrime);
+      
+      /**---------------------------------------------------------------------------------------
+      
+         Second update
+      
+         @param numberOfAtoms       number of atoms
+         @param atomCoordinates     atom coordinates
+         @param velocities          velocities
+         @param inverseMasses       inverse atom masses
+         @param xPrime              xPrime
+      
+         --------------------------------------------------------------------------------------- */
+      
+      virtual void updatePart2(int numberOfAtoms, std::vector<OpenMM::Vec3>& atomCoordinates, std::vector<OpenMM::Vec3>& velocities,
+                       std::vector<double>& inverseMasses, std::vector<OpenMM::Vec3>& xPrime);
+      
+      /**---------------------------------------------------------------------------------------
+      
+         Third update
+      
+         @param context             the context this integrator is updating
+         @param numberOfAtoms       number of atoms
+         @param atomCoordinates     atom coordinates
+         @param velocities          velocities
+         @param forces              forces
+         @param inverseMasses       inverse atom masses
+         @param xPrime              xPrime
+      
+         --------------------------------------------------------------------------------------- */
+      
+      virtual void updatePart3(OpenMM::ContextImpl& context, int numberOfAtoms, std::vector<OpenMM::Vec3>& atomCoordinates, std::vector<OpenMM::Vec3>& velocities,
+                       std::vector<OpenMM::Vec3>& forces, std::vector<double>& inverseMasses, std::vector<OpenMM::Vec3>& xPrime);
+};
+
+} // namespace OpenMM
+
+#endif // __ReferenceBAOABDynamics_H__
--- a/platforms/reference/include/ReferenceKernels.h
+++ b/platforms/reference/include/ReferenceKernels.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2018 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2019 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -45,6 +45,7 @@ namespace OpenMM {

 class ReferenceObc;
 class ReferenceAndersenThermostat;
+class ReferenceBAOABDynamics;
 class ReferenceCustomBondIxn;
 class ReferenceCustomAngleIxn;
 class ReferenceCustomTorsionIxn;
@@ -1170,6 +1171,47 @@ private:
    double prevTemp, prevFriction, prevStepSize;
 };

+/**
+ * This kernel is invoked by BAOABLangevinIntegrator to take one time step.
+ */
+class ReferenceIntegrateBAOABStepKernel : public IntegrateBAOABStepKernel {
+public:
+    ReferenceIntegrateBAOABStepKernel(std::string name, const Platform& platform, ReferencePlatform::PlatformData& data) : IntegrateBAOABStepKernel(name, platform),
+        data(data), dynamics(0), prevTemp(0.0), prevFriction(0.0), prevStepSize(0.0) {
+    }
+    ~ReferenceIntegrateBAOABStepKernel();
+    /**
+     * Initialize the kernel, recording the particle masses and seeding the random number generator.
+     * 
+     * @param system     the System this kernel will be applied to
+     * @param integrator the BAOABLangevinIntegrator this kernel will be used for
+     */
+    void initialize(const System& system, const BAOABLangevinIntegrator& integrator);
+    /**
+     * Execute the kernel.
+     * 
+     * @param context    the context in which to execute this kernel
+     * @param integrator the BAOABLangevinIntegrator this kernel is being used for
+     * @param forcesAreValid if the context has been modified since the last time step, this will be
+     *                       false to show that cached forces are invalid and must be recalculated.
+     *                       On exit, this should specify whether the cached forces are valid at the
+     *                       end of the step.
+     */
+    void execute(ContextImpl& context, const BAOABLangevinIntegrator& integrator, bool& forcesAreValid);
+    /**
+     * Compute the kinetic energy.
+     * 
+     * @param context    the context in which to execute this kernel
+     * @param integrator the BAOABLangevinIntegrator this kernel is being used for
+     */
+    double computeKineticEnergy(ContextImpl& context, const BAOABLangevinIntegrator& integrator);
+private:
+    ReferencePlatform::PlatformData& data;        // platform data; execute() advances its time and stepCount
+    ReferenceBAOABDynamics* dynamics;             // created on demand by execute(); deleted by the destructor
+    std::vector<double> masses;                   // particle masses copied from the System in initialize()
+    double prevTemp, prevFriction, prevStepSize;  // parameters the current dynamics object was built with
+};
+
 /**
 * This kernel is invoked by BrownianIntegrator to take one time step.
 */

--- a/platforms/reference/include/ReferencePlatform.h
+++ b/platforms/reference/include/ReferencePlatform.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2019 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -35,6 +35,9 @@
 #include "openmm/Platform.h"
 #include "openmm/System.h"
 #include "openmm/internal/windowsExport.h"
+#include "ReferenceConstraints.h"
+#include <map>
+#include <vector>

 namespace OpenMM {

@@ -62,13 +65,13 @@ public:
    ~PlatformData();
    int numParticles, stepCount;
    double time;
-    void* positions;
-    void* velocities;
-    void* forces;
-    void* periodicBoxSize;
-    void* periodicBoxVectors;
-    void* constraints;
-    void* energyParameterDerivatives;
+    std::vector<Vec3>* positions;
+    std::vector<Vec3>* velocities;
+    std::vector<Vec3>* forces;
+    Vec3* periodicBoxSize;
+    Vec3* periodicBoxVectors;
+    ReferenceConstraints* constraints;
+    std::map<std::string, double>* energyParameterDerivatives;
 };
 } // namespace OpenMM


--- a/platforms/reference/include/ReferenceTabulatedFunction.h
+++ b/platforms/reference/include/ReferenceTabulatedFunction.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2014-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2014-2019 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -35,6 +35,7 @@
 #include "openmm/TabulatedFunction.h"
 #include "openmm/internal/windowsExport.h"
 #include "lepton/CustomFunction.h"
+#include <memory>
 #include <vector>

 namespace OpenMM {
@@ -146,6 +147,22 @@ private:
    std::vector<double> values;
 };

+/**
+ * This is a lightweight wrapper around an immutable CustomFunction.  It makes
+ * cloning very inexpensive since nothing needs to be copied except a single
+ * pointer.
+ */
+class OPENMM_EXPORT SharedFunctionWrapper : public Lepton::CustomFunction {
+public:
+    SharedFunctionWrapper(std::shared_ptr<const CustomFunction> pointer);  // takes a shared reference to the function being wrapped
+    int getNumArguments() const;  // NOTE(review): this and the two evaluate methods presumably forward to the wrapped function - confirm in the .cpp
+    double evaluate(const double* arguments) const;
+    double evaluateDerivative(const double* arguments, const int* derivOrder) const;
+    CustomFunction* clone() const;  // per the class comment, only the pointer needs to be copied
+private:
+    std::shared_ptr<const CustomFunction> pointer;  // the wrapped immutable function
+};
+
 } // namespace OpenMM

 #endif /*OPENMM_REFERENCETABULATEDFUNCTION_H_*/
--- a/platforms/reference/src/ReferenceKernelFactory.cpp
+++ b/platforms/reference/src/ReferenceKernelFactory.cpp
@@ -90,6 +90,8 @@ KernelImpl* ReferenceKernelFactory::createKernelImpl(std::string name, const Pla
        return new ReferenceIntegrateVerletStepKernel(name, platform, data);
    if (name == IntegrateLangevinStepKernel::Name())
        return new ReferenceIntegrateLangevinStepKernel(name, platform, data);
+    if (name == IntegrateBAOABStepKernel::Name())
+        return new ReferenceIntegrateBAOABStepKernel(name, platform, data);
    if (name == IntegrateBrownianStepKernel::Name())
        return new ReferenceIntegrateBrownianStepKernel(name, platform, data);
    if (name == IntegrateVariableLangevinStepKernel::Name())

--- a/platforms/reference/src/ReferenceKernels.cpp
+++ b/platforms/reference/src/ReferenceKernels.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2018 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2019 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -33,6 +33,7 @@
 #include "ReferenceObc.h"
 #include "ReferenceAndersenThermostat.h"
 #include "ReferenceAngleBondIxn.h"
+#include "ReferenceBAOABDynamics.h"
 #include "ReferenceBondForce.h"
 #include "ReferenceBrownianDynamics.h"
 #include "ReferenceCCMAAlgorithm.h"
@@ -91,37 +92,37 @@ using namespace std;

 static vector<Vec3>& extractPositions(ContextImpl& context) {
    ReferencePlatform::PlatformData* data = reinterpret_cast<ReferencePlatform::PlatformData*>(context.getPlatformData());
-    return *((vector<Vec3>*) data->positions);
+    return *data->positions;
 }

 static vector<Vec3>& extractVelocities(ContextImpl& context) {
    ReferencePlatform::PlatformData* data = reinterpret_cast<ReferencePlatform::PlatformData*>(context.getPlatformData());
-    return *((vector<Vec3>*) data->velocities);
+    return *data->velocities;
 }

 static vector<Vec3>& extractForces(ContextImpl& context) {
    ReferencePlatform::PlatformData* data = reinterpret_cast<ReferencePlatform::PlatformData*>(context.getPlatformData());
-    return *((vector<Vec3>*) data->forces);
+    return *data->forces;
 }

 static Vec3& extractBoxSize(ContextImpl& context) {
    ReferencePlatform::PlatformData* data = reinterpret_cast<ReferencePlatform::PlatformData*>(context.getPlatformData());
-    return *(Vec3*) data->periodicBoxSize;
+    return *data->periodicBoxSize;
 }

 static Vec3* extractBoxVectors(ContextImpl& context) {
    ReferencePlatform::PlatformData* data = reinterpret_cast<ReferencePlatform::PlatformData*>(context.getPlatformData());
-    return (Vec3*) data->periodicBoxVectors;
+    return data->periodicBoxVectors;
 }

 static ReferenceConstraints& extractConstraints(ContextImpl& context) {
    ReferencePlatform::PlatformData* data = reinterpret_cast<ReferencePlatform::PlatformData*>(context.getPlatformData());
-    return *(ReferenceConstraints*) data->constraints;
+    return *data->constraints;
 }

 static map<string, double>& extractEnergyParameterDerivatives(ContextImpl& context) {
    ReferencePlatform::PlatformData* data = reinterpret_cast<ReferencePlatform::PlatformData*>(context.getPlatformData());
-    return *((map<string, double>*) data->energyParameterDerivatives);
+    return *data->energyParameterDerivatives;
 }

 /**
@@ -2149,6 +2150,49 @@ double ReferenceIntegrateLangevinStepKernel::computeKineticEnergy(ContextImpl& c
    return computeShiftedKineticEnergy(context, masses, 0.5*integrator.getStepSize());
 }

+ReferenceIntegrateBAOABStepKernel::~ReferenceIntegrateBAOABStepKernel() {
+    // Release the dynamics object, if one was ever created; deleting a null pointer is a no-op.
+    delete dynamics;
+}
+
+void ReferenceIntegrateBAOABStepKernel::initialize(const System& system, const BAOABLangevinIntegrator& integrator) {
+    // Copy every particle mass out of the System, then seed the random number generator from the integrator's seed.
+    masses.resize(system.getNumParticles());
+    for (int particle = 0; particle < (int) masses.size(); particle++)
+        masses[particle] = system.getParticleMass(particle);
+    SimTKOpenMMUtilities::setRandomNumberSeed(static_cast<unsigned int>(integrator.getRandomNumberSeed()));
+}
+
+// Take one time step.  The dynamics object is (re)created whenever the integrator's temperature,
+// friction, or step size differs from the values it was last built with.
+void ReferenceIntegrateBAOABStepKernel::execute(ContextImpl& context, const BAOABLangevinIntegrator& integrator, bool& forcesAreValid) {
+    double temperature = integrator.getTemperature();
+    double friction = integrator.getFriction();
+    double stepSize = integrator.getStepSize();
+    vector<Vec3>& posData = extractPositions(context);
+    vector<Vec3>& velData = extractVelocities(context);
+    if (dynamics == 0 || temperature != prevTemp || friction != prevFriction || stepSize != prevStepSize) {
+        // Null the pointer before allocating so that a throwing constructor cannot leave it
+        // dangling (which would make the destructor delete the old object a second time).
+        delete dynamics;
+        dynamics = 0;
+        dynamics = new ReferenceBAOABDynamics(context.getSystem().getNumParticles(),
+                stepSize, friction, temperature);
+        dynamics->setReferenceConstraintAlgorithm(&extractConstraints(context));
+        prevTemp = temperature;
+        prevFriction = friction;
+        prevStepSize = stepSize;
+    }
+    dynamics->update(context, posData, velData, masses, forcesAreValid, integrator.getConstraintTolerance());
+    // Advance the platform's simulation time and step counter.
+    data.time += stepSize;
+    data.stepCount++;
+}
+
+double ReferenceIntegrateBAOABStepKernel::computeKineticEnergy(ContextImpl& context, const BAOABLangevinIntegrator& integrator) {
+    return computeShiftedKineticEnergy(context, masses, 0.0);  // zero time shift; NOTE(review): presumably because BAOAB velocities are not offset in time from positions - confirm
+}
+
 ReferenceIntegrateBrownianStepKernel::~ReferenceIntegrateBrownianStepKernel() {
    if (dynamics)
        delete dynamics;
@@ -2225,6 +2269,8 @@ double ReferenceIntegrateVariableLangevinStepKernel::execute(ContextImpl& contex
        prevErrorTol = errorTol;
    }
    double maxStepSize = maxTime-data.time;
+    if (integrator.getMaximumStepSize() > 0)
+        maxStepSize = min(integrator.getMaximumStepSize(), maxStepSize);
    dynamics->update(context.getSystem(), posData, velData, forceData, masses, maxStepSize, integrator.getConstraintTolerance());
    data.time += dynamics->getDeltaT();
    if (dynamics->getDeltaT() == maxStepSize)
@@ -2264,6 +2310,8 @@ double ReferenceIntegrateVariableVerletStepKernel::execute(ContextImpl& context,
        prevErrorTol = errorTol;
    }
    double maxStepSize = maxTime-data.time;
+    if (integrator.getMaximumStepSize() > 0)
+        maxStepSize = min(integrator.getMaximumStepSize(), maxStepSize);
    dynamics->update(context.getSystem(), posData, velData, forceData, masses, maxStepSize, integrator.getConstraintTolerance());
    data.time += dynamics->getDeltaT();
    if (dynamics->getDeltaT() == maxStepSize)

--- a/platforms/reference/src/ReferencePlatform.cpp
+++ b/platforms/reference/src/ReferencePlatform.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2019 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -30,14 +30,11 @@
 * -------------------------------------------------------------------------- */

 #include "ReferencePlatform.h"
-#include "ReferenceConstraints.h"
 #include "ReferenceKernelFactory.h"
 #include "ReferenceKernels.h"
 #include "openmm/internal/ContextImpl.h"
 #include "SimTKOpenMMRealType.h"
 #include "openmm/Vec3.h"
-#include <map>
-#include <vector>

 using namespace OpenMM;
 using namespace std;
@@ -70,6 +67,7 @@ ReferencePlatform::ReferencePlatform() {
    registerKernelFactory(CalcGayBerneForceKernel::Name(), factory);
    registerKernelFactory(IntegrateVerletStepKernel::Name(), factory);
    registerKernelFactory(IntegrateLangevinStepKernel::Name(), factory);
+    registerKernelFactory(IntegrateBAOABStepKernel::Name(), factory);
    registerKernelFactory(IntegrateBrownianStepKernel::Name(), factory);
    registerKernelFactory(IntegrateVariableLangevinStepKernel::Name(), factory);
    registerKernelFactory(IntegrateVariableVerletStepKernel::Name(), factory);
@@ -107,11 +105,11 @@ ReferencePlatform::PlatformData::PlatformData(const System& system) : time(0.0),
 }

 ReferencePlatform::PlatformData::~PlatformData() {
-    delete (vector<Vec3>*) positions;
-    delete (vector<Vec3>*) velocities;
-    delete (vector<Vec3>*) forces;
-    delete (Vec3*) periodicBoxSize;
-    delete[] (Vec3*) periodicBoxVectors;
-    delete (ReferenceConstraints*) constraints;
-    delete (map<string, double>*) energyParameterDerivatives;
+    delete positions;
+    delete velocities;
+    delete forces;
+    delete periodicBoxSize;
+    delete[] periodicBoxVectors;
+    delete constraints;
+    delete energyParameterDerivatives;
 }
--- a/platforms/reference/src/ReferenceTabulatedFunction.cpp
+++ b/platforms/reference/src/ReferenceTabulatedFunction.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2014-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2014-2019 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -56,19 +56,22 @@ using namespace std;
 using Lepton::CustomFunction;

 extern "C" OPENMM_EXPORT CustomFunction* createReferenceTabulatedFunction(const TabulatedFunction& function) {
+    CustomFunction* fn;
    if (dynamic_cast<const Continuous1DFunction*>(&function) != NULL)
-        return new ReferenceContinuous1DFunction(dynamic_cast<const Continuous1DFunction&>(function));
-    if (dynamic_cast<const Continuous2DFunction*>(&function) != NULL)
-        return new ReferenceContinuous2DFunction(dynamic_cast<const Continuous2DFunction&>(function));
-    if (dynamic_cast<const Continuous3DFunction*>(&function) != NULL)
-        return new ReferenceContinuous3DFunction(dynamic_cast<const Continuous3DFunction&>(function));
-    if (dynamic_cast<const Discrete1DFunction*>(&function) != NULL)
-        return new ReferenceDiscrete1DFunction(dynamic_cast<const Discrete1DFunction&>(function));
-    if (dynamic_cast<const Discrete2DFunction*>(&function) != NULL)
-        return new ReferenceDiscrete2DFunction(dynamic_cast<const Discrete2DFunction&>(function));
-    if (dynamic_cast<const Discrete3DFunction*>(&function) != NULL)
-        return new ReferenceDiscrete3DFunction(dynamic_cast<const Discrete3DFunction&>(function));
-    throw OpenMMException("createReferenceTabulatedFunction: Unknown function type");
+        fn = new ReferenceContinuous1DFunction(dynamic_cast<const Continuous1DFunction&>(function));
+    else if (dynamic_cast<const Continuous2DFunction*>(&function) != NULL)
+        fn = new ReferenceContinuous2DFunction(dynamic_cast<const Continuous2DFunction&>(function));
+    else if (dynamic_cast<const Continuous3DFunction*>(&function) != NULL)
+        fn = new ReferenceContinuous3DFunction(dynamic_cast<const Continuous3DFunction&>(function));
+    else if (dynamic_cast<const Discrete1DFunction*>(&function) != NULL)
+        fn = new ReferenceDiscrete1DFunction(dynamic_cast<const Discrete1DFunction&>(function));
+    else if (dynamic_cast<const Discrete2DFunction*>(&function) != NULL)
+        fn = new ReferenceDiscrete2DFunction(dynamic_cast<const Discrete2DFunction&>(function));
+    else if (dynamic_cast<const Discrete3DFunction*>(&function) != NULL)
+        fn = new ReferenceDiscrete3DFunction(dynamic_cast<const Discrete3DFunction&>(function));
+    else
+        throw OpenMMException("createReferenceTabulatedFunction: Unknown function type");
+    return new SharedFunctionWrapper(shared_ptr<const CustomFunction>(fn));
 }

 ReferenceContinuous1DFunction::ReferenceContinuous1DFunction(const Continuous1DFunction& function) : function(function) {
@@ -298,3 +301,22 @@ double ReferenceDiscrete3DFunction::evaluateDerivative(const double* arguments,
 CustomFunction* ReferenceDiscrete3DFunction::clone() const {
    return new ReferenceDiscrete3DFunction(function);
 }
+
+SharedFunctionWrapper::SharedFunctionWrapper(shared_ptr<const CustomFunction> pointer) : pointer(pointer) {
+}
+
+int SharedFunctionWrapper::getNumArguments() const {
+    return pointer->getNumArguments();
+}
+
+double SharedFunctionWrapper::evaluate(const double* arguments) const {
+    return pointer->evaluate(arguments);
+}
+
+double SharedFunctionWrapper::evaluateDerivative(const double* arguments, const int* derivOrder) const {
+    return pointer->evaluateDerivative(arguments, derivOrder);
+}
+
+CustomFunction* SharedFunctionWrapper::clone() const {
+    return new SharedFunctionWrapper(pointer);
+}