Merge branch 'upstream' into fork

3862202e · Justin MacCallum · e1a4e015 · 73882ac5 · 3862202e · 3862202e
Commit 3862202e authored Jul 12, 2013 by Justin MacCallum
20 changed files
--- a/platforms/opencl/src/OpenCLParallelKernels.h
+++ b/platforms/opencl/src/OpenCLParallelKernels.h
--- a/platforms/opencl/src/OpenCLParameterSet.h
+++ b/platforms/opencl/src/OpenCLParameterSet.h
--- a/platforms/opencl/include/OpenCLPlatform.h
+++ b/platforms/opencl/include/OpenCLPlatform.h
@@ -87,17 +87,25 @@ public:
        static const std::string key = "OpenCLPrecision";
        return key;
    }
+    /**
+     * This is the name of the parameter for selecting whether to use the CPU-based PME calculation.
+     * The platform constructor registers "false" as the default value of this property.
+     *
+     * @return a reference to a function-local static string holding the key "OpenCLUseCpuPme"
+     */
+    static const std::string& OpenCLUseCpuPme() {
+        static const std::string key = "OpenCLUseCpuPme";
+        return key;
+    }
 };
 class OPENMM_EXPORT_OPENCL OpenCLPlatform::PlatformData {
 public:
-    PlatformData(const System& system, const std::string& platformPropValue, const std::string& deviceIndexProperty, const std::string& precisionProperty);
+    PlatformData(const System& system, const std::string& platformPropValue, const std::string& deviceIndexProperty, const std::string& precisionProperty, const std::string& cpuPmeProperty);
    ~PlatformData();
    void initializeContexts(const System& system);
    void syncContexts();
+    ContextImpl* context;
    std::vector<OpenCLContext*> contexts;
    std::vector<double> contextEnergy;
-    bool removeCM;
+    bool removeCM, useCpuPme;
    int cmMotionFrequency;
    int stepCount, computeForceCount;
    double time;

--- a/platforms/opencl/src/OpenCLSort.h
+++ b/platforms/opencl/src/OpenCLSort.h
--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -334,6 +334,10 @@ OpenCLContext::~OpenCLContext() {
        delete forces[i];
    for (int i = 0; i < (int) reorderListeners.size(); i++)
        delete reorderListeners[i];
+    for (int i = 0; i < (int) preComputations.size(); i++)
+        delete preComputations[i];
+    for (int i = 0; i < (int) postComputations.size(); i++)
+        delete postComputations[i];
    if (pinnedBuffer != NULL)
        delete pinnedBuffer;
    if (posq != NULL)
@@ -1106,6 +1110,14 @@ void OpenCLContext::addReorderListener(ReorderListener* listener) {
    reorderListeners.push_back(listener);
 }
+/**
+ * Append a computation to this context's list of pre-computations.
+ * The pointer is stored as-is; ~OpenCLContext deletes every entry of preComputations,
+ * so the caller must not delete it.
+ *
+ * @param computation  the pre-computation to register
+ */
+void OpenCLContext::addPreComputation(ForcePreComputation* computation) {
+    preComputations.push_back(computation);
+}
+/**
+ * Append a computation to this context's list of post-computations.
+ * The pointer is stored as-is; ~OpenCLContext deletes every entry of postComputations,
+ * so the caller must not delete it.
+ *
+ * @param computation  the post-computation to register
+ */
+void OpenCLContext::addPostComputation(ForcePostComputation* computation) {
+    postComputations.push_back(computation);
+}
 struct OpenCLContext::WorkThread::ThreadData {
    ThreadData(std::queue<OpenCLContext::WorkTask*>& tasks, bool& waiting,  bool& finished,
            pthread_mutex_t& queueLock, pthread_cond_t& waitForTaskCondition, pthread_cond_t& queueEmptyCondition) :

--- a/platforms/opencl/src/OpenCLFFT3D.cpp
+++ b/platforms/opencl/src/OpenCLFFT3D.cpp
@@ -27,7 +27,7 @@
 #include "OpenCLFFT3D.h"
 #include "OpenCLExpressionUtilities.h"
 #include "OpenCLKernelSources.h"
-#include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
+#include "SimTKOpenMMRealType.h"
 #include <map>
 #include <sstream>
 #include <string>

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -44,8 +44,8 @@
 #include "lepton/Operation.h"
 #include "lepton/Parser.h"
 #include "lepton/ParsedExpression.h"
-#include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
+#include "SimTKOpenMMRealType.h"
-#include "../src/SimTKUtilities/SimTKOpenMMUtilities.h"
+#include "SimTKOpenMMUtilities.h"
 #include <cmath>
 #include <set>
@@ -104,10 +104,12 @@ void OpenCLCalcForcesAndEnergyKernel::initialize(const System& system) {
 }
+/**
+ * Start a force/energy evaluation: clear the autoclear buffers, run every registered
+ * pre-computation, increment the force-evaluation counter, and prepare the nonbonded
+ * interactions when the nonbonded force group is selected in `groups`.
+ */
 void OpenCLCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForces, bool includeEnergy, int groups) {
+    // NOTE(review): the clear now precedes the pre-computations (it used to follow the counter
+    // update) - presumably so that nothing they write is wiped afterwards; confirm.
+    cl.clearAutoclearBuffers();
+    for (vector<OpenCLContext::ForcePreComputation*>::iterator iter = cl.getPreComputations().begin(); iter != cl.getPreComputations().end(); ++iter)
+        (*iter)->computeForceAndEnergy(includeForces, includeEnergy, groups);
    OpenCLNonbondedUtilities& nb = cl.getNonbondedUtilities();
    bool includeNonbonded = ((groups&(1<<nb.getForceGroup())) != 0);
    cl.setComputeForceCount(cl.getComputeForceCount()+1);
-    cl.clearAutoclearBuffers();
    if (includeNonbonded)
        nb.prepareInteractions();
 }
@@ -117,8 +119,10 @@ double OpenCLCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context,
    if ((groups&(1<<cl.getNonbondedUtilities().getForceGroup())) != 0)
        cl.getNonbondedUtilities().computeInteractions();
    cl.reduceForces();
+    double sum = 0.0;
+    for (vector<OpenCLContext::ForcePostComputation*>::iterator iter = cl.getPostComputations().begin(); iter != cl.getPostComputations().end(); ++iter)
+        sum += (*iter)->computeForceAndEnergy(includeForces, includeEnergy, groups);
    cl.getIntegrationUtilities().distributeForcesFromVirtualSites();
-    double sum = 0.0f;
    if (includeEnergy) {
        OpenCLArray& energyArray = cl.getEnergyBuffer();
        if (cl.getUseDoublePrecision()) {
@@ -1323,6 +1327,58 @@ private:
    const NonbondedForce& force;
 };
+/**
+ * Connects the CPU PME reciprocal-space kernel to the OpenCL context: it gives the kernel a
+ * host-side copy of the posq array and adds the forces the kernel produces into the
+ * context's device force buffer by way of a staging buffer and the "addForces" kernel.
+ */
+class OpenCLCalcNonbondedForceKernel::PmeIO : public CalcPmeReciprocalForceKernel::IO {
+public:
+    /**
+     * @param cl               the context whose posq array is read and whose force buffer is updated
+     * @param addForcesKernel  the kernel used to add the staged forces; its argument 0 is bound
+     *                         here to the staging buffer
+     */
+    PmeIO(OpenCLContext& cl, cl::Kernel addForcesKernel) : cl(cl), forceTemp(NULL), addForcesKernel(addForcesKernel) {
+        // Initializers are listed in declaration order (cl, posq, forceTemp, addForcesKernel).
+        forceTemp = OpenCLArray::create<mm_float4>(cl, cl.getNumAtoms(), "PmeForce");
+        try {
+            this->addForcesKernel.setArg<cl::Buffer>(0, forceTemp->getDeviceBuffer());
+        }
+        catch (...) {
+            // The destructor does not run when a constructor throws, so release the buffer here.
+            delete forceTemp;
+            throw;
+        }
+    }
+    ~PmeIO() {
+        delete forceTemp;
+    }
+    /**
+     * Download the context's posq array into a host-side vector owned by this object.
+     *
+     * @return a pointer to the first element of that vector, viewed as floats; the storage is
+     *         reused by the next call
+     */
+    float* getPosq() {
+        cl.getPosq().download(posq);
+        return (float*) &posq[0];
+    }
+    /**
+     * Upload the given forces to the staging buffer and launch the addForces kernel to add
+     * them to the context's force buffer.
+     *
+     * @param force  host array of forces; presumably four floats per atom, matching the
+     *               mm_float4 staging buffer - TODO confirm against the CPU PME kernel
+     */
+    void setForce(float* force) {
+        forceTemp->upload(force);
+        addForcesKernel.setArg<cl::Buffer>(1, cl.getForce().getDeviceBuffer());
+        cl.executeKernel(addForcesKernel, cl.getNumAtoms());
+    }
+private:
+    // Not copyable: this object owns forceTemp, and a copy would delete it twice.
+    PmeIO(const PmeIO&);
+    PmeIO& operator=(const PmeIO&);
+    OpenCLContext& cl;
+    vector<mm_float4> posq;
+    OpenCLArray* forceTemp;
+    cl::Kernel addForcesKernel;
+};
+/**
+ * Pre-computation that starts the CPU PME reciprocal-space calculation by calling
+ * CalcPmeReciprocalForceKernel::beginComputation() with the context's current periodic box size.
+ */
+class OpenCLCalcNonbondedForceKernel::PmePreComputation : public OpenCLContext::ForcePreComputation {
+public:
+    // Stores its own copy of the Kernel handle; cl and io are held by reference, so both must
+    // outlive this object.
+    PmePreComputation(OpenCLContext& cl, Kernel& pme, CalcPmeReciprocalForceKernel::IO& io) : cl(cl), pme(pme), io(io) {
+    }
+    // NOTE(review): includeForces and groups are not consulted here, so the PME calculation is
+    // started even when the nonbonded force group is excluded from `groups` - confirm this is intended.
+    void computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
+        Vec3 boxSize(cl.getPeriodicBoxSize().x, cl.getPeriodicBoxSize().y, cl.getPeriodicBoxSize().z);
+        pme.getAs<CalcPmeReciprocalForceKernel>().beginComputation(io, boxSize, includeEnergy);
+    }
+private:
+    OpenCLContext& cl;
+    Kernel pme;
+    CalcPmeReciprocalForceKernel::IO& io;
+};
+/**
+ * Post-computation that completes the CPU PME reciprocal-space calculation by calling
+ * CalcPmeReciprocalForceKernel::finishComputation() and returning its result.
+ */
+class OpenCLCalcNonbondedForceKernel::PmePostComputation : public OpenCLContext::ForcePostComputation {
+public:
+    // Stores its own copy of the Kernel handle; io is held by reference and must outlive this object.
+    PmePostComputation(Kernel& pme, CalcPmeReciprocalForceKernel::IO& io) : pme(pme), io(io) {
+    }
+    // Returns whatever finishComputation() returns; OpenCLCalcForcesAndEnergyKernel::finishComputation
+    // adds the value returned by each post-computation to its energy sum.
+    // NOTE(review): includeForces, includeEnergy and groups are all ignored here - confirm that the
+    // value is meant to be added even when the nonbonded force group is excluded.
+    double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
+        return pme.getAs<CalcPmeReciprocalForceKernel>().finishComputation(io);
+    }
+private:
+    Kernel pme;
+    CalcPmeReciprocalForceKernel::IO& io;
+};
 OpenCLCalcNonbondedForceKernel::~OpenCLCalcNonbondedForceKernel() {
    if (sigmaEpsilon != NULL)
        delete sigmaEpsilon;
@@ -1350,6 +1406,8 @@ OpenCLCalcNonbondedForceKernel::~OpenCLCalcNonbondedForceKernel() {
        delete sort;
    if (fft != NULL)
        delete fft;
+    if (pmeio != NULL)
+        delete pmeio;
 }
 void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const NonbondedForce& force) {
@@ -1430,7 +1488,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
    else
        dispersionCoefficient = 0.0;
    alpha = 0;
-    if (force.getNonbondedMethod() == NonbondedForce::Ewald) {
+    if (force.getNonbondedMethod() == NonbondedForce::Ewald && cl.getContextIndex() == 0) {
        // Compute the Ewald parameters.
        int kmaxx, kmaxy, kmaxz;
@@ -1438,7 +1496,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
        defines["EWALD_ALPHA"] = cl.doubleToString(alpha);
        defines["TWO_OVER_SQRT_PI"] = cl.doubleToString(2.0/sqrt(M_PI));
        defines["USE_EWALD"] = "1";
-        ewaldSelfEnergy = (cl.getContextIndex() == 0 ? -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI) : 0.0);
+        ewaldSelfEnergy = -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI);
        // Create the reciprocal space kernels.
@@ -1454,7 +1512,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
        int elementSize = (cl.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2));
        cosSinSums = new OpenCLArray(cl, (2*kmaxx-1)*(2*kmaxy-1)*(2*kmaxz-1), elementSize, "cosSinSums");
    }
-    else if (force.getNonbondedMethod() == NonbondedForce::PME) {
+    else if (force.getNonbondedMethod() == NonbondedForce::PME && cl.getContextIndex() == 0) {
        // Compute the PME parameters.
        int gridSizeX, gridSizeY, gridSizeZ;
@@ -1465,7 +1523,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
        defines["EWALD_ALPHA"] = cl.doubleToString(alpha);
        defines["TWO_OVER_SQRT_PI"] = cl.doubleToString(2.0/sqrt(M_PI));
        defines["USE_EWALD"] = "1";
-        ewaldSelfEnergy = (cl.getContextIndex() == 0 ? -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI) : 0.0);
+        ewaldSelfEnergy = -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI);
        pmeDefines["PME_ORDER"] = cl.intToString(PmeOrder);
        pmeDefines["NUM_ATOMS"] = cl.intToString(numParticles);
        pmeDefines["RECIP_EXP_FACTOR"] = cl.doubleToString(M_PI*M_PI/(alpha*alpha));
@@ -1476,7 +1534,23 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
        bool deviceIsCpu = (cl.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
        if (deviceIsCpu)
            pmeDefines["DEVICE_IS_CPU"] = "1";
+        if (cl.getPlatformData().useCpuPme) {
+            // Create the CPU PME kernel.
+            try {
+                cpuPme = getPlatform().createKernel(CalcPmeReciprocalForceKernel::Name(), *cl.getPlatformData().context);
+                cpuPme.getAs<CalcPmeReciprocalForceKernel>().initialize(gridSizeX, gridSizeY, gridSizeZ, numParticles, alpha);
+                cl::Program program = cl.createProgram(OpenCLKernelSources::pme, pmeDefines);
+                cl::Kernel addForcesKernel = cl::Kernel(program, "addForces");
+                pmeio = new PmeIO(cl, addForcesKernel);
+                cl.addPreComputation(new PmePreComputation(cl, cpuPme, *pmeio));
+                cl.addPostComputation(new PmePostComputation(cpuPme, *pmeio));
+            }
+            catch (OpenMMException& ex) {
+                // The CPU PME plugin isn't available.
+            }
+        }
+        if (pmeio == NULL) {
            // Create required data structures.
            int elementSize = (cl.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
@@ -1565,6 +1639,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
                }
            }
        }
+    }
    else
        ewaldSelfEnergy = 0.0;
@@ -1650,7 +1725,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
            }
       }
    }
-    if (cosSinSums != NULL && cl.getContextIndex() == 0 && includeReciprocal) {
+    if (cosSinSums != NULL && includeReciprocal) {
        mm_double4 boxSize = cl.getPeriodicBoxSizeDouble();
        mm_double4 recipBoxSize = mm_double4(2*M_PI/boxSize.x, 2*M_PI/boxSize.y, 2*M_PI/boxSize.z, 0.0);
        double recipCoefficient = ONE_4PI_EPS0*4*M_PI/(boxSize.x*boxSize.y*boxSize.z);
@@ -1669,7 +1744,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
        cl.executeKernel(ewaldSumsKernel, cosSinSums->getSize());
        cl.executeKernel(ewaldForcesKernel, cl.getNumAtoms());
    }
-    if (pmeGrid != NULL && cl.getContextIndex() == 0 && includeReciprocal) {
+    if (pmeGrid != NULL && includeReciprocal) {
        setPeriodicBoxSizeArg(cl, pmeUpdateBsplinesKernel, 4);
        setInvPeriodicBoxSizeArg(cl, pmeUpdateBsplinesKernel, 5);
        cl.executeKernel(pmeUpdateBsplinesKernel, cl.getNumAtoms());
@@ -4905,24 +4980,6 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
        defines["NUM_ATOMS"] = cl.intToString(cl.getNumAtoms());
        defines["WORK_GROUP_SIZE"] = cl.intToString(OpenCLContext::ThreadBlockSize);
-        // Initialize the random number generator.
-        uniformRandoms = OpenCLArray::create<mm_float4>(cl, cl.getNumAtoms(), "uniformRandoms");
-        randomSeed = OpenCLArray::create<mm_int4>(cl, cl.getNumThreadBlocks()*OpenCLContext::ThreadBlockSize, "randomSeed");
-        vector<mm_int4> seed(randomSeed->getSize());
-        unsigned int r = integrator.getRandomNumberSeed()+1;
-        for (int i = 0; i < randomSeed->getSize(); i++) {
-            seed[i].x = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
-            seed[i].y = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
-            seed[i].z = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
-            seed[i].w = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
-        }
-        randomSeed->upload(seed);
-        cl::Program randomProgram = cl.createProgram(OpenCLKernelSources::customIntegrator, defines);
-        randomKernel = cl::Kernel(randomProgram, "generateRandomNumbers");
-        randomKernel.setArg<cl::Buffer>(0, uniformRandoms->getDeviceBuffer());
-        randomKernel.setArg<cl::Buffer>(1, randomSeed->getDeviceBuffer());
        // Build a list of all variables that affect the forces, so we can tell which
        // steps invalidate them.
@@ -5013,10 +5070,10 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
        for (int step = 1; step < numSteps; step++) {
            if (needsForces[step] || needsEnergy[step])
                continue;
-            if (stepType[step-1] == CustomIntegrator::ComputeGlobal && stepType[step] == CustomIntegrator::ComputeGlobal)
+            if (stepType[step-1] == CustomIntegrator::ComputeGlobal && stepType[step] == CustomIntegrator::ComputeGlobal &&
+                    !usesVariable(expression[step], "uniform") && !usesVariable(expression[step], "gaussian"))
                merged[step] = true;
-            if (stepType[step-1] == CustomIntegrator::ComputePerDof && stepType[step] == CustomIntegrator::ComputePerDof &&
+            if (stepType[step-1] == CustomIntegrator::ComputePerDof && stepType[step] == CustomIntegrator::ComputePerDof)
-                    !usesVariable(expression[step], "uniform"))
                merged[step] = true;
        }
@@ -5035,7 +5092,13 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
                }
                int numGaussian = 0, numUniform = 0;
                for (int j = step; j < numSteps && (j == step || merged[j]); j++) {
+                    numGaussian += numAtoms*usesVariable(expression[j], "gaussian");
+                    numUniform += numAtoms*usesVariable(expression[j], "uniform");
                    compute << "{\n";
+                    if (numGaussian > 0)
+                        compute << "float4 gaussian = gaussianValues[gaussianIndex+index];\n";
+                    if (numUniform > 0)
+                        compute << "float4 uniform = uniformValues[uniformIndex+index];\n";
                    for (int i = 0; i < 3; i++)
                        compute << createPerDofComputation(stepType[j] == CustomIntegrator::ComputePerDof ? variable[j] : "", expression[j], i, integrator, forceName[j], energyName[j]);
                    if (variable[j] == "x") {
@@ -5058,9 +5121,11 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
                            compute << "perDofValues"<<cl.intToString(i+1)<<"[3*index+2] = perDofz"<<cl.intToString(i+1)<<";\n";
                        }
                    }
+                    if (numGaussian > 0)
+                        compute << "gaussianIndex += NUM_ATOMS;\n";
+                    if (numUniform > 0)
+                        compute << "uniformIndex += NUM_ATOMS;\n";
                    compute << "}\n";
-                    numGaussian += numAtoms*usesVariable(expression[j], "gaussian");
-                    numUniform += numAtoms*usesVariable(expression[j], "uniform");
                }
                map<string, string> replacements;
                replacements["COMPUTE_STEP"] = compute.str();
@@ -5090,9 +5155,7 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
                kernel.setArg<cl::Buffer>(index++, globalValues->getDeviceBuffer());
                kernel.setArg<cl::Buffer>(index++, contextParameterValues->getDeviceBuffer());
                kernel.setArg<cl::Buffer>(index++, sumBuffer->getDeviceBuffer());
-                kernel.setArg<cl::Buffer>(index++, integration.getRandom().getDeviceBuffer());
+                index += 3;
-                index++;
-                kernel.setArg<cl::Buffer>(index++, uniformRandoms->getDeviceBuffer());
                kernel.setArg<cl::Buffer>(index++, potentialEnergy->getDeviceBuffer());
                for (int i = 0; i < (int) perDofValues->getBuffers().size(); i++)
                    kernel.setArg<cl::Memory>(index++, perDofValues->getBuffers()[i].getMemory());
@@ -5154,6 +5217,28 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
            }
        }
+        // Initialize the random number generator.
+        int maxUniformRandoms = 1;
+        for (int i = 0; i < (int) requiredUniform.size(); i++)
+            maxUniformRandoms = max(maxUniformRandoms, requiredUniform[i]);
+        uniformRandoms = OpenCLArray::create<mm_float4>(cl, maxUniformRandoms, "uniformRandoms");
+        randomSeed = OpenCLArray::create<mm_int4>(cl, cl.getNumThreadBlocks()*OpenCLContext::ThreadBlockSize, "randomSeed");
+        vector<mm_int4> seed(randomSeed->getSize());
+        unsigned int r = integrator.getRandomNumberSeed()+1;
+        for (int i = 0; i < randomSeed->getSize(); i++) {
+            seed[i].x = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
+            seed[i].y = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
+            seed[i].z = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
+            seed[i].w = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
+        }
+        randomSeed->upload(seed);
+        cl::Program randomProgram = cl.createProgram(OpenCLKernelSources::customIntegrator, defines);
+        randomKernel = cl::Kernel(randomProgram, "generateRandomNumbers");
+        randomKernel.setArg<cl_int>(0, maxUniformRandoms);
+        randomKernel.setArg<cl::Buffer>(1, uniformRandoms->getDeviceBuffer());
+        randomKernel.setArg<cl::Buffer>(2, randomSeed->getDeviceBuffer());
        // Create the kernel for summing the potential energy.
        cl::Program program = cl.createProgram(OpenCLKernelSources::customIntegrator, defines);
@@ -5199,8 +5284,7 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
        kineticEnergyKernel.setArg<cl::Buffer>(index++, globalValues->getDeviceBuffer());
        kineticEnergyKernel.setArg<cl::Buffer>(index++, contextParameterValues->getDeviceBuffer());
        kineticEnergyKernel.setArg<cl::Buffer>(index++, sumBuffer->getDeviceBuffer());
-        kineticEnergyKernel.setArg<cl::Buffer>(index++, integration.getRandom().getDeviceBuffer());
+        index += 2;
-        kineticEnergyKernel.setArg<cl_uint>(index++, 0);
        kineticEnergyKernel.setArg<cl::Buffer>(index++, uniformRandoms->getDeviceBuffer());
        kineticEnergyKernel.setArg<cl::Buffer>(index++, potentialEnergy->getDeviceBuffer());
        for (int i = 0; i < (int) perDofValues->getBuffers().size(); i++)
@@ -5317,6 +5401,8 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
        }
        if (stepType[i] == CustomIntegrator::ComputePerDof && !merged[i]) {
            kernels[i][0].setArg<cl_uint>(10, integration.prepareRandomNumbers(requiredGaussian[i]));
+            kernels[i][0].setArg<cl::Buffer>(9, integration.getRandom().getDeviceBuffer());
+            kernels[i][0].setArg<cl::Buffer>(11, uniformRandoms->getDeviceBuffer());
            if (requiredUniform[i] > 0)
                cl.executeKernel(randomKernel, numAtoms);
            cl.executeKernel(kernels[i][0], numAtoms);
@@ -5328,6 +5414,8 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
        }
        else if (stepType[i] == CustomIntegrator::ComputeSum) {
            kernels[i][0].setArg<cl_uint>(10, integration.prepareRandomNumbers(requiredGaussian[i]));
+            kernels[i][0].setArg<cl::Buffer>(9, integration.getRandom().getDeviceBuffer());
+            kernels[i][0].setArg<cl::Buffer>(11, uniformRandoms->getDeviceBuffer());
            if (requiredUniform[i] > 0)
                cl.executeKernel(randomKernel, numAtoms);
            cl.clearBuffer(*sumBuffer);
@@ -5379,6 +5467,8 @@ double OpenCLIntegrateCustomStepKernel::computeKineticEnergy(ContextImpl& contex
        forcesAreValid = true;
    }
    cl.clearBuffer(*sumBuffer);
+    kineticEnergyKernel.setArg<cl::Buffer>(9, cl.getIntegrationUtilities().getRandom().getDeviceBuffer());
+    kineticEnergyKernel.setArg<cl_uint>(10, 0);
    cl.executeKernel(kineticEnergyKernel, cl.getNumAtoms());
    cl.executeKernel(sumKineticEnergyKernel, OpenCLContext::ThreadBlockSize, OpenCLContext::ThreadBlockSize);
    if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {

--- a/platforms/opencl/src/OpenCLPlatform.cpp
+++ b/platforms/opencl/src/OpenCLPlatform.cpp
@@ -31,6 +31,7 @@
 #include "openmm/internal/ContextImpl.h"
 #include "openmm/Context.h"
 #include "openmm/System.h"
+#include <algorithm>
 #include <sstream>
 using namespace OpenMM;
@@ -78,11 +79,13 @@ OpenCLPlatform::OpenCLPlatform() {
    platformProperties.push_back(OpenCLPlatformIndex());
    platformProperties.push_back(OpenCLPlatformName());
    platformProperties.push_back(OpenCLPrecision());
+    platformProperties.push_back(OpenCLUseCpuPme());
    setPropertyDefaultValue(OpenCLDeviceIndex(), "");
    setPropertyDefaultValue(OpenCLDeviceName(), "");
    setPropertyDefaultValue(OpenCLPlatformIndex(), "");
    setPropertyDefaultValue(OpenCLPlatformName(), "");
    setPropertyDefaultValue(OpenCLPrecision(), "single");
+    setPropertyDefaultValue(OpenCLUseCpuPme(), "false");
 }
 double OpenCLPlatform::getSpeed() const {
@@ -112,7 +115,15 @@ void OpenCLPlatform::contextCreated(ContextImpl& context, const map<string, stri
            getPropertyDefaultValue(OpenCLDeviceIndex()) : properties.find(OpenCLDeviceIndex())->second);
    string precisionPropValue = (properties.find(OpenCLPrecision()) == properties.end() ?
            getPropertyDefaultValue(OpenCLPrecision()) : properties.find(OpenCLPrecision())->second);
-    context.setPlatformData(new PlatformData(context.getSystem(), platformPropValue, devicePropValue, precisionPropValue));
+    string cpuPmePropValue = (properties.find(OpenCLUseCpuPme()) == properties.end() ?
+            getPropertyDefaultValue(OpenCLUseCpuPme()) : properties.find(OpenCLUseCpuPme())->second);
+    transform(precisionPropValue.begin(), precisionPropValue.end(), precisionPropValue.begin(), ::tolower);
+    transform(cpuPmePropValue.begin(), cpuPmePropValue.end(), cpuPmePropValue.begin(), ::tolower);
+    vector<string> pmeKernelName;
+    pmeKernelName.push_back(CalcPmeReciprocalForceKernel::Name());
+    if (!supportsKernels(pmeKernelName))
+        cpuPmePropValue = "false";
+    context.setPlatformData(new PlatformData(context.getSystem(), platformPropValue, devicePropValue, precisionPropValue, cpuPmePropValue));
 }
 void OpenCLPlatform::contextDestroyed(ContextImpl& context) const {
@@ -121,7 +132,7 @@ void OpenCLPlatform::contextDestroyed(ContextImpl& context) const {
 }
 OpenCLPlatform::PlatformData::PlatformData(const System& system, const string& platformPropValue, const string& deviceIndexProperty,
-        const string& precisionProperty) : removeCM(false), stepCount(0), computeForceCount(0), time(0.0)  {
+        const string& precisionProperty, const string& cpuPmeProperty) : removeCM(false), stepCount(0), computeForceCount(0), time(0.0)  {
    int platformIndex = 0;
    if (platformPropValue.length() > 0)
        stringstream(platformPropValue) >> platformIndex;
@@ -150,6 +161,7 @@ OpenCLPlatform::PlatformData::PlatformData(const System& system, const string& p
        deviceIndex << contexts[i]->getDeviceIndex();
        deviceName << contexts[i]->getDevice().getInfo<CL_DEVICE_NAME>();
    }
+    useCpuPme = (cpuPmeProperty == "true" && !contexts[0]->getUseDoublePrecision());
    propertyValues[OpenCLPlatform::OpenCLDeviceIndex()] = deviceIndex.str();
    propertyValues[OpenCLPlatform::OpenCLDeviceName()] = deviceName.str();
    propertyValues[OpenCLPlatform::OpenCLPlatformIndex()] = contexts[0]->intToString(platformIndex);
@@ -157,6 +169,7 @@ OpenCLPlatform::PlatformData::PlatformData(const System& system, const string& p
    cl::Platform::get(&platforms);
    propertyValues[OpenCLPlatform::OpenCLPlatformName()] = platforms[platformIndex].getInfo<CL_PLATFORM_NAME>();
    propertyValues[OpenCLPlatform::OpenCLPrecision()] = precisionProperty;
+    propertyValues[OpenCLPlatform::OpenCLUseCpuPme()] = useCpuPme ? "true" : "false";
    contextEnergy.resize(contexts.size());
 }

--- a/platforms/opencl/src/kernels/customIntegrator.cl
+++ b/platforms/opencl/src/kernels/customIntegrator.cl
@@ -52,10 +52,10 @@ __kernel void applyPositionDeltas(__global real4* restrict posq, __global real4*
    }
 }
-__kernel void generateRandomNumbers(__global float4* restrict random, __global uint4* restrict seed) {
+__kernel void generateRandomNumbers(int numValues, __global float4* restrict random, __global uint4* restrict seed) {
    uint4 state = seed[get_global_id(0)];
    unsigned int carry = 0;
-    for (int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
+    for (int index = get_global_id(0); index < numValues; index += get_global_size(0)) {
        // Generate three uniform random numbers.
        state.x = state.x * 69069 + 1;

--- a/platforms/opencl/src/kernels/customIntegratorPerDof.cl
+++ b/platforms/opencl/src/kernels/customIntegratorPerDof.cl
@@ -26,11 +26,10 @@ void storePos(__global real4* restrict posq, __global real4* restrict posqCorrec
 __kernel void computePerDof(__global real4* restrict posq, __global real4* restrict posqCorrection, __global mixed4* restrict posDelta,
        __global mixed4* restrict velm, __global const real4* restrict force, __global const mixed2* restrict dt, __global const mixed* restrict globals,
        __global const mixed* restrict params, __global mixed* restrict sum, __global const float4* restrict gaussianValues,
-        unsigned int randomIndex, __global const float4* restrict uniformValues, __global const real* restrict energy
+        unsigned int gaussianBaseIndex, __global const float4* restrict uniformValues, __global const real* restrict energy
        PARAMETER_ARGUMENTS) {
    mixed stepSize = dt[0].y;
    int index = get_global_id(0);
-    randomIndex += index;
    while (index < NUM_ATOMS) {
 #ifdef LOAD_POS_AS_DELTA
        mixed4 position = loadPos(posq, posqCorrection, index)+posDelta[index];
@@ -41,11 +40,10 @@ __kernel void computePerDof(__global real4* restrict posq, __global real4* restr
        real4 f = force[index];
        mixed mass = 1/velocity.w;
        if (velocity.w != 0.0) {
-            float4 gaussian = gaussianValues[randomIndex];
+            int gaussianIndex = gaussianBaseIndex;
-            float4 uniform = uniformValues[index];
+            int uniformIndex = 0;
            COMPUTE_STEP
        }
-        randomIndex += get_global_size(0);
        index += get_global_size(0);
    }
 }
--- a/platforms/opencl/src/kernels/pme.cl
+++ b/platforms/opencl/src/kernels/pme.cl
@@ -391,3 +391,8 @@ __kernel void gridInterpolateForce(__global const real4* restrict posq, __global
        forceBuffers[atom] = totalForce;
    }
 }
+/**
+ * Add forces[atom] to forceBuffers[atom] for every atom index below NUM_ATOMS.
+ * NOTE(review): the host side (PmeIO) allocates the source buffer as mm_float4 while this kernel
+ * reads it as real4 - this assumes real is float whenever the kernel is used; confirm.
+ */
+__kernel void addForces(__global const real4* restrict forces, __global real4* restrict forceBuffers) {
+    for (int atom = get_global_id(0); atom < NUM_ATOMS; atom += get_global_size(0))
+        forceBuffers[atom] += forces[atom];
+}
--- a/platforms/opencl/tests/TestOpenCLAndersenThermostat.cpp
+++ b/platforms/opencl/tests/TestOpenCLAndersenThermostat.cpp
@@ -40,7 +40,7 @@
 #include "openmm/NonbondedForce.h"
 #include "openmm/System.h"
 #include "openmm/VerletIntegrator.h"
-#include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
+#include "SimTKOpenMMRealType.h"
 #include "sfmt/SFMT.h"
 #include <iostream>
 #include <vector>

--- a/platforms/opencl/tests/TestOpenCLBrownianIntegrator.cpp
+++ b/platforms/opencl/tests/TestOpenCLBrownianIntegrator.cpp
@@ -43,7 +43,7 @@
 #include "openmm/NonbondedForce.h"
 #include "openmm/System.h"
 #include "openmm/BrownianIntegrator.h"
-#include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
+#include "SimTKOpenMMRealType.h"
 #include "sfmt/SFMT.h"
 #include <iostream>
 #include <vector>

--- a/platforms/opencl/tests/TestOpenCLCMAPTorsionForce.cpp
+++ b/platforms/opencl/tests/TestOpenCLCMAPTorsionForce.cpp
@@ -40,7 +40,7 @@
 #include "openmm/PeriodicTorsionForce.h"
 #include "openmm/System.h"
 #include "openmm/VerletIntegrator.h"
-#include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
+#include "SimTKOpenMMRealType.h"
 #include "sfmt/SFMT.h"
 #include <iostream>
 #include <vector>

--- a/platforms/opencl/tests/TestOpenCLCMMotionRemover.cpp
+++ b/platforms/opencl/tests/TestOpenCLCMMotionRemover.cpp
@@ -42,7 +42,7 @@
 #include "openmm/System.h"
 #include "openmm/LangevinIntegrator.h"
 #include "openmm/VerletIntegrator.h"
-#include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
+#include "SimTKOpenMMRealType.h"
 #include "sfmt/SFMT.h"
 #include <iostream>
 #include <vector>

--- a/platforms/opencl/tests/TestOpenCLCustomBondForce.cpp
+++ b/platforms/opencl/tests/TestOpenCLCustomBondForce.cpp
@@ -39,7 +39,7 @@
 #include "openmm/CustomBondForce.h"
 #include "openmm/System.h"
 #include "openmm/VerletIntegrator.h"
-#include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
+#include "SimTKOpenMMRealType.h"
 #include <iostream>
 #include <vector>

--- a/platforms/opencl/tests/TestOpenCLCustomExternalForce.cpp
+++ b/platforms/opencl/tests/TestOpenCLCustomExternalForce.cpp
@@ -39,7 +39,7 @@
 #include "openmm/CustomExternalForce.h"
 #include "openmm/System.h"
 #include "openmm/VerletIntegrator.h"
-#include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
+#include "SimTKOpenMMRealType.h"
 #include "sfmt/SFMT.h"
 #include <iostream>
 #include <vector>

--- a/platforms/opencl/tests/TestOpenCLCustomIntegrator.cpp
+++ b/platforms/opencl/tests/TestOpenCLCustomIntegrator.cpp
@@ -41,7 +41,7 @@
 #include "openmm/NonbondedForce.h"
 #include "openmm/System.h"
 #include "openmm/CustomIntegrator.h"
-#include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
+#include "SimTKOpenMMRealType.h"
 #include "sfmt/SFMT.h"
 #include <iostream>
 #include <vector>
@@ -651,6 +651,72 @@ void testRespa() {
    }
 }
+/**
+ * Make sure random numbers are computed correctly when steps get merged.
+ *
+ * The integrator has two consecutive steps of each kind (per-DOF uniform, per-DOF gaussian,
+ * global uniform, global gaussian). After every integration step the two values of each pair
+ * must lie in the expected range and must differ from each other.
+ */
+void testMergedRandoms() {
+    const int numParticles = 10;
+    const int numSteps = 10;
+    System system;
+    for (int i = 0; i < numParticles; i++)
+        system.addParticle(1.0);
+    CustomIntegrator integrator(0.1);
+    integrator.addPerDofVariable("dofUniform1", 0);
+    integrator.addPerDofVariable("dofUniform2", 0);
+    integrator.addPerDofVariable("dofGaussian1", 0);
+    integrator.addPerDofVariable("dofGaussian2", 0);
+    integrator.addGlobalVariable("globalUniform1", 0);
+    integrator.addGlobalVariable("globalUniform2", 0);
+    integrator.addGlobalVariable("globalGaussian1", 0);
+    integrator.addGlobalVariable("globalGaussian2", 0);
+    integrator.addComputePerDof("dofUniform1", "uniform");
+    integrator.addComputePerDof("dofUniform2", "uniform");
+    integrator.addComputePerDof("dofGaussian1", "gaussian");
+    integrator.addComputePerDof("dofGaussian2", "gaussian");
+    integrator.addComputeGlobal("globalUniform1", "uniform");
+    integrator.addComputeGlobal("globalUniform2", "uniform");
+    integrator.addComputeGlobal("globalGaussian1", "gaussian");
+    integrator.addComputeGlobal("globalGaussian2", "gaussian");
+    Context context(system, integrator, platform);
+    // See if the random numbers are computed correctly.
+    vector<Vec3> values1, values2;
+    // The step counter is named "step" so the particle loops below do not shadow it.
+    for (int step = 0; step < numSteps; step++) {
+        integrator.step(1);
+        // Per-DOF uniform values: each in [0, 1), and the two variables must not coincide.
+        integrator.getPerDofVariable(0, values1);
+        integrator.getPerDofVariable(1, values2);
+        for (int i = 0; i < numParticles; i++)
+            for (int j = 0; j < 3; j++) {
+                double v1 = values1[i][j];
+                double v2 = values2[i][j];
+                ASSERT(v1 >= 0 && v1 < 1);
+                ASSERT(v2 >= 0 && v2 < 1);
+                ASSERT(v1 != v2);
+            }
+        // Per-DOF gaussian values: checked against a loose [-10, 10) bound.
+        integrator.getPerDofVariable(2, values1);
+        integrator.getPerDofVariable(3, values2);
+        for (int i = 0; i < numParticles; i++)
+            for (int j = 0; j < 3; j++) {
+                double v1 = values1[i][j];
+                double v2 = values2[i][j];
+                ASSERT(v1 >= -10 && v1 < 10);
+                ASSERT(v2 >= -10 && v2 < 10);
+                ASSERT(v1 != v2);
+            }
+        // Global uniform values.
+        double v1 = integrator.getGlobalVariable(0);
+        double v2 = integrator.getGlobalVariable(1);
+        ASSERT(v1 >= 0 && v1 < 1);
+        ASSERT(v2 >= 0 && v2 < 1);
+        ASSERT(v1 != v2);
+        // Global gaussian values.
+        v1 = integrator.getGlobalVariable(2);
+        v2 = integrator.getGlobalVariable(3);
+        ASSERT(v1 >= -10 && v1 < 10);
+        ASSERT(v2 >= -10 && v2 < 10);
+        ASSERT(v1 != v2);
+    }
+}
 int main(int argc, char* argv[]) {
    try {
        if (argc > 1)
@@ -666,6 +732,7 @@ int main(int argc, char* argv[]) {
        testPerDofVariables();
        testForceGroups();
        testRespa();
+        testMergedRandoms();
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;

--- a/platforms/opencl/tests/TestOpenCLEwald.cpp
+++ b/platforms/opencl/tests/TestOpenCLEwald.cpp
@@ -42,7 +42,7 @@
 #include "openmm/LangevinIntegrator.h"
 #include "openmm/VerletIntegrator.h"
 #include "openmm/internal/ContextImpl.h"
-#include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
+#include "SimTKOpenMMRealType.h"
 #include "sfmt/SFMT.h"
 #include <iostream>
 #include <vector>

--- a/platforms/opencl/tests/TestOpenCLFFT.cpp
+++ b/platforms/opencl/tests/TestOpenCLFFT.cpp
@@ -34,11 +34,11 @@
 */
 #include "openmm/internal/AssertionUtilities.h"
-#include "../src/OpenCLArray.h"
+#include "OpenCLArray.h"
-#include "../src/OpenCLContext.h"
+#include "OpenCLContext.h"
-#include "../src/OpenCLFFT3D.h"
+#include "OpenCLFFT3D.h"
-#include "../src/OpenCLSort.h"
+#include "OpenCLSort.h"
-#include "../src/SimTKReference/fftpack.h"
+#include "fftpack.h"
 #include "sfmt/SFMT.h"
 #include "openmm/System.h"
 #include <iostream>
@@ -54,7 +54,7 @@ template <class Real2>
 void testTransform() {
    System system;
    system.addParticle(0.0);
-    OpenCLPlatform::PlatformData platformData(system, "", "", platform.getPropertyDefaultValue("OpenCLPrecision"));
+    OpenCLPlatform::PlatformData platformData(system, "", "", platform.getPropertyDefaultValue("OpenCLPrecision"), "false");
    OpenCLContext& context = *platformData.contexts[0];
    context.initialize();
    OpenMM_SFMT::SFMT sfmt;