Merge pull request #32 from peastman/pme

Created CPU implementation of PME

Merge pull request #32 from peastman/pme
Created CPU implementation of PME
b1d621b3 · Yutong Zhao · cce1ca81 · 794ab9a7 · b1d621b3 · b1d621b3
Commit b1d621b3 authored Jul 01, 2013 by Yutong Zhao
20 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -396,6 +396,20 @@ IF(OPENMM_BUILD_DRUDE_PLUGIN)
   ADD_SUBDIRECTORY(plugins/drude)
 ENDIF(OPENMM_BUILD_DRUDE_PLUGIN)

+# CPU PME plugin
+
+FIND_PACKAGE(FFTW QUIET)
+IF(FFTW_FOUND)
+    SET(OPENMM_BUILD_PME_PLUGIN ON CACHE BOOL "Build CPU PME plugin")
+ELSE(FFTW_FOUND)
+    SET(OPENMM_BUILD_PME_PLUGIN OFF CACHE BOOL "Build CPU PME plugin")
+ENDIF(FFTW_FOUND)
+SET(OPENMM_BUILD_PME_PATH)
+IF(OPENMM_BUILD_PME_PLUGIN)
+   SET(OPENMM_BUILD_PME_PATH ${CMAKE_CURRENT_SOURCE_DIR}/plugins/cpupme)
+   ADD_SUBDIRECTORY(plugins/cpupme)
+ENDIF(OPENMM_BUILD_PME_PLUGIN)
+
 INSTALL_TARGETS(/lib RUNTIME_DIRECTORY /lib ${SHARED_TARGET})
 IF(OPENMM_BUILD_STATIC_LIB)
  INSTALL_TARGETS(/lib RUNTIME_DIRECTORY /lib ${STATIC_TARGET})

--- a/cmake_modules/FindFFTW.cmake
+++ b/cmake_modules/FindFFTW.cmake
+# - Find FFTW
+# Find the native FFTW includes and library
+#
+#  FFTW_INCLUDES        - where to find fftw3.h
+#  FFTW_LIBRARY         - the main FFTW library.
+#  FFTW_THREADS_LIBRARY - the FFTW multithreading support library.
+#  FFTW_FOUND           - True if FFTW found.
+
+if (FFTW_INCLUDES)
+  # Already in cache, be silent
+  set (FFTW_FIND_QUIETLY TRUE)
+endif (FFTW_INCLUDES)
+
+find_path (FFTW_INCLUDES fftw3.h)
+
+find_library (FFTW_LIBRARY NAMES fftw3f)
+find_library (FFTW_THREADS_LIBRARY NAMES fftw3f_threads)
+
+# handle the QUIETLY and REQUIRED arguments and set FFTW_FOUND to TRUE if
+# all listed variables are TRUE
+include (FindPackageHandleStandardArgs)
+find_package_handle_standard_args (FFTW DEFAULT_MSG FFTW_LIBRARY FFTW_INCLUDES)
+find_package_handle_standard_args (FFTW_THREADS DEFAULT_MSG FFTW_THREADS_LIBRARY FFTW_INCLUDES)
+
+mark_as_advanced (FFTW_LIBRARY FFTW_THREADS_LIBRARY FFTW_INCLUDES)
--- a/olla/include/openmm/kernels.h
+++ b/olla/include/openmm/kernels.h
@@ -1152,6 +1152,70 @@ public:
    virtual void execute(ContextImpl& context) = 0;
 };

+/**
+ * This kernel performs the reciprocal space calculation for PME.  In most cases, this
+ * calculation is done directly by CalcNonbondedForceKernel so this kernel is unneeded.
+ * In some cases it may want to outsource the work to a different kernel.  In particular,
+ * GPU based platforms sometimes use a CPU based implementation provided by a separate
+ * plugin.
+ */
+class CalcPmeReciprocalForceKernel : public KernelImpl {
+public:
+    class IO;
+    static std::string Name() {
+        return "CalcPmeReciprocalForce";
+    }
+    CalcPmeReciprocalForceKernel(std::string name, const Platform& platform) : KernelImpl(name, platform) {
+    }
+    /**
+     * Initialize the kernel.
+     * 
+     * @param gridx        the x size of the PME grid
+     * @param gridy        the y size of the PME grid
+     * @param gridz        the z size of the PME grid
+     * @param numParticles the number of particles in the system
+     * @param alpha        the Ewald blending parameter
+     */
+    virtual void initialize(int gridx, int gridy, int gridz, int numParticles, double alpha) = 0;
+    /**
+     * Begin computing the force and energy.
+     * 
+     * @param io               an object that coordinates data transfer
+     * @param periodicBoxSize  the size of the periodic box (measured in nm)
+     * @param includeEnergy    true if potential energy should be computed
+     */
+    virtual void beginComputation(IO& io, Vec3 periodicBoxSize, bool includeEnergy) = 0;
+    /**
+     * Finish computing the force and energy.
+     * 
+     * @param io   an object that coordinates data transfer
+     * @return the potential energy due to the PME reciprocal space interactions
+     */
+    virtual double finishComputation(IO& io) = 0;
+};
+
+/**
+ * Any class that uses CalcPmeReciprocalForceKernel should create an implementation of this
+ * class, then pass it to the kernel to manage communication with it.
+ */
+class CalcPmeReciprocalForceKernel::IO {
+public:
+    /**
+     * Get a pointer to the atom charges and positions.  This array should contain four
+     * elements for each atom: x, y, z, and q in that order.
+     */
+    virtual float* getPosq() = 0;
+    /**
+     * Record the forces calculated by the kernel.
+     * 
+     * @param force    an array containing four elements for each atom.  The first three
+     *                 are the x, y, and z components of the force, while the fourth element
+     *                 should be ignored.
+     */
+    virtual void setForce(float* force) = 0;
+};
+
+
 } // namespace OpenMM

 #endif /*OPENMM_KERNELS_H_*/
--- a/platforms/cuda/include/CudaPlatform.h
+++ b/platforms/cuda/include/CudaPlatform.h
@@ -81,6 +81,13 @@ public:
        static const std::string key = "CudaPrecision";
        return key;
    }
+    /**
+     * This is the name of the parameter for selecting whether to use the CPU based PME calculation.
+     */
+    static const std::string& CudaUseCpuPme() {
+        static const std::string key = "CudaUseCpuPme";
+        return key;
+    }
    /**
     * This is the name of the parameter for specifying the path to the CUDA compiler.
     */
@@ -99,14 +106,15 @@ public:

 class OPENMM_EXPORT_CUDA CudaPlatform::PlatformData {
 public:
-    PlatformData(const System& system, const std::string& deviceIndexProperty, const std::string& blockingProperty, const std::string& precisionProperty,
-            const std::string& compilerProperty, const std::string& tempProperty);
+    PlatformData(ContextImpl* context, const System& system, const std::string& deviceIndexProperty, const std::string& blockingProperty, const std::string& precisionProperty,
+            const std::string& cpuPmeProperty, const std::string& compilerProperty, const std::string& tempProperty);
    ~PlatformData();
    void initializeContexts(const System& system);
    void syncContexts();
+    ContextImpl* context;
    std::vector<CudaContext*> contexts;
    std::vector<double> contextEnergy;
-    bool removeCM, peerAccessSupported;
+    bool removeCM, peerAccessSupported, useCpuPme;
    int cmMotionFrequency;
    int stepCount, computeForceCount;
    double time;

--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -231,6 +231,10 @@ CudaContext::~CudaContext() {
        delete forces[i];
    for (int i = 0; i < (int) reorderListeners.size(); i++)
        delete reorderListeners[i];
+    for (int i = 0; i < (int) preComputations.size(); i++)
+        delete preComputations[i];
+    for (int i = 0; i < (int) postComputations.size(); i++)
+        delete postComputations[i];
    if (pinnedBuffer != NULL)
        cuMemFreeHost(pinnedBuffer);
    if (posq != NULL)
@@ -1102,6 +1106,14 @@ void CudaContext::addReorderListener(ReorderListener* listener) {
    reorderListeners.push_back(listener);
 }

+void CudaContext::addPreComputation(ForcePreComputation* computation) {
+    preComputations.push_back(computation);
+}
+
+void CudaContext::addPostComputation(ForcePostComputation* computation) {
+    postComputations.push_back(computation);
+}
+
 struct CudaContext::WorkThread::ThreadData {
    ThreadData(std::queue<CudaContext::WorkTask*>& tasks, bool& waiting,  bool& finished,
            pthread_mutex_t& queueLock, pthread_cond_t& waitForTaskCondition, pthread_cond_t& queueEmptyCondition) :

--- a/platforms/cuda/src/CudaContext.h
+++ b/platforms/cuda/src/CudaContext.h
@@ -70,6 +70,8 @@ public:
    class WorkTask;
    class WorkThread;
    class ReorderListener;
+    class ForcePreComputation;
+    class ForcePostComputation;
    static const int ThreadBlockSize;
    static const int TileSize;
    CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const std::string& precision,
@@ -454,6 +456,28 @@ public:
    std::vector<ReorderListener*>& getReorderListeners() {
        return reorderListeners;
    }
+    /**
+     * Add a pre-computation that should be called at the very start of force and energy evaluations.
+     * The CudaContext assumes ownership of the object, and deletes it when the context itself is deleted.
+     */
+    void addPreComputation(ForcePreComputation* computation);
+    /**
+     * Get the list of ForcePreComputations.
+     */
+    std::vector<ForcePreComputation*>& getPreComputations() {
+        return preComputations;
+    }
+    /**
+     * Add a post-computation that should be called at the very end of force and energy evaluations.
+     * The CudaContext assumes ownership of the object, and deletes it when the context itself is deleted.
+     */
+    void addPostComputation(ForcePostComputation* computation);
+    /**
+     * Get the list of ForcePostComputations.
+     */
+    std::vector<ForcePostComputation*>& getPostComputations() {
+        return postComputations;
+    }
    /**
     * Mark that the current molecule definitions (and hence the atom order) may be invalid.
     * This should be called whenever force field parameters change.  It will cause the definitions
@@ -519,6 +543,8 @@ private:
    std::vector<CUdeviceptr> autoclearBuffers;
    std::vector<int> autoclearBufferSizes;
    std::vector<ReorderListener*> reorderListeners;
+    std::vector<ForcePreComputation*> preComputations;
+    std::vector<ForcePostComputation*> postComputations;
    CudaIntegrationUtilities* integration;
    CudaExpressionUtilities* expression;
    CudaBondedUtilities* bonded;
@@ -580,7 +606,7 @@ private:

 /**
 * This abstract class defines a function to be executed whenever atoms get reordered.
- * Objects that need to know when reordering happens should create a reorderListener
+ * Objects that need to know when reordering happens should create a ReorderListener
 * and register it by calling addReorderListener().
 */
 class CudaContext::ReorderListener {
@@ -590,6 +616,39 @@ public:
    }
 };

+/**
+ * This abstract class defines a function to be executed at the very beginning of force and
+ * energy evaluation, before any other calculation has been done.  It is useful for operations
+ * that need to be performed at a nonstandard point in the process.  After creating a
+ * ForcePreComputation, register it by calling addForcePreComputation().
+ */
+class CudaContext::ForcePreComputation {
+public:
+    /**
+     * @param includeForce  true if forces should be computed
+     * @param includeEnergy true if potential energy should be computed
+     * @param groups        a set of bit flags for which force groups to include
+     */
+    virtual void computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) = 0;
+};
+
+/**
+ * This abstract class defines a function to be executed at the very end of force and
+ * energy evaluation, after all other calculations have been done.  It is useful for operations
+ * that need to be performed at a nonstandard point in the process.  After creating a
+ * ForcePostComputation, register it by calling addForcePostComputation().
+ */
+class CudaContext::ForcePostComputation {
+public:
+    /**
+     * @param includeForce  true if forces should be computed
+     * @param includeEnergy true if potential energy should be computed
+     * @param groups        a set of bit flags for which force groups to include
+     * @return an optional contribution to add to the potential energy.
+     */
+    virtual double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) = 0;
+};
+
 } // namespace OpenMM

 #endif /*OPENMM_CUDACONTEXT_H_*/
--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -84,10 +84,12 @@ void CudaCalcForcesAndEnergyKernel::initialize(const System& system) {

 void CudaCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForces, bool includeEnergy, int groups) {
    cu.setAsCurrent();
+    cu.clearAutoclearBuffers();
+    for (vector<CudaContext::ForcePreComputation*>::iterator iter = cu.getPreComputations().begin(); iter != cu.getPreComputations().end(); ++iter)
+        (*iter)->computeForceAndEnergy(includeForces, includeEnergy, groups);
    CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
    bool includeNonbonded = ((groups&(1<<nb.getForceGroup())) != 0);
    cu.setComputeForceCount(cu.getComputeForceCount()+1);
-    cu.clearAutoclearBuffers();
    if (includeNonbonded)
        nb.prepareInteractions();
 }
@@ -96,8 +98,10 @@ double CudaCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bo
    cu.getBondedUtilities().computeInteractions(groups);
    if ((groups&(1<<cu.getNonbondedUtilities().getForceGroup())) != 0)
        cu.getNonbondedUtilities().computeInteractions();
-    cu.getIntegrationUtilities().distributeForcesFromVirtualSites();
    double sum = 0.0;
+    for (vector<CudaContext::ForcePostComputation*>::iterator iter = cu.getPostComputations().begin(); iter != cu.getPostComputations().end(); ++iter)
+        sum += (*iter)->computeForceAndEnergy(includeForces, includeEnergy, groups);
+    cu.getIntegrationUtilities().distributeForcesFromVirtualSites();
    if (includeEnergy) {
        CudaArray& energyArray = cu.getEnergyBuffer();
        if (cu.getUseDoublePrecision()) {
@@ -1330,6 +1334,58 @@ private:
    const NonbondedForce& force;
 };

+class CudaCalcNonbondedForceKernel::PmeIO : public CalcPmeReciprocalForceKernel::IO {
+public:
+    PmeIO(CudaContext& cu, CUfunction addForcesKernel) : cu(cu), addForcesKernel(addForcesKernel), forceTemp(NULL) {
+        forceTemp = CudaArray::create<float4>(cu, cu.getNumAtoms(), "PmeForce");
+    }
+    ~PmeIO() {
+        if (forceTemp != NULL)
+            delete forceTemp;
+    }
+    float* getPosq() {
+        cu.setAsCurrent();
+        cu.getPosq().download(posq);
+        return (float*) &posq[0];
+    }
+    void setForce(float* force) {
+        forceTemp->upload(force);
+        void* args[] = {&forceTemp->getDevicePointer(), &cu.getForce().getDevicePointer()};
+        cu.executeKernel(addForcesKernel, args, cu.getNumAtoms());
+    }
+private:
+    CudaContext& cu;
+    vector<float4> posq;
+    CudaArray* forceTemp;
+    CUfunction addForcesKernel;
+};
+
+class CudaCalcNonbondedForceKernel::PmePreComputation : public CudaContext::ForcePreComputation {
+public:
+    PmePreComputation(CudaContext& cu, Kernel& pme, CalcPmeReciprocalForceKernel::IO& io) : cu(cu), pme(pme), io(io) {
+    }
+    void computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
+        Vec3 boxSize(cu.getPeriodicBoxSize().x, cu.getPeriodicBoxSize().y, cu.getPeriodicBoxSize().z);
+        pme.getAs<CalcPmeReciprocalForceKernel>().beginComputation(io, boxSize, includeEnergy);
+    }
+private:
+    CudaContext& cu;
+    Kernel pme;
+    CalcPmeReciprocalForceKernel::IO& io;
+};
+
+class CudaCalcNonbondedForceKernel::PmePostComputation : public CudaContext::ForcePostComputation {
+public:
+    PmePostComputation(Kernel& pme, CalcPmeReciprocalForceKernel::IO& io) : pme(pme), io(io) {
+    }
+    double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
+        return pme.getAs<CalcPmeReciprocalForceKernel>().finishComputation(io);
+    }
+private:
+    Kernel pme;
+    CalcPmeReciprocalForceKernel::IO& io;
+};
+
 CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
    cu.setAsCurrent();
    if (sigmaEpsilon != NULL)
@@ -1354,6 +1410,8 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
        delete pmeAtomGridIndex;
    if (sort != NULL)
        delete sort;
+    if (pmeio != NULL)
+        delete pmeio;
    if (hasInitializedFFT) {
        cufftDestroy(fftForward);
        cufftDestroy(fftBackward);
@@ -1457,7 +1515,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
    else
        dispersionCoefficient = 0.0;
    alpha = 0;
-    if (force.getNonbondedMethod() == NonbondedForce::Ewald) {
+    if (force.getNonbondedMethod() == NonbondedForce::Ewald && cu.getContextIndex() == 0) {
        // Compute the Ewald parameters.

        int kmaxx, kmaxy, kmaxz;
@@ -1465,7 +1523,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
        defines["EWALD_ALPHA"] = cu.doubleToString(alpha);
        defines["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI));
        defines["USE_EWALD"] = "1";
-        ewaldSelfEnergy = (cu.getContextIndex() == 0 ? -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI) : 0.0);
+        ewaldSelfEnergy = -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI);

        // Create the reciprocal space kernels.

@@ -1484,7 +1542,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
        int elementSize = (cu.getUseDoublePrecision() ? sizeof(double2) : sizeof(float2));
        cosSinSums = new CudaArray(cu, (2*kmaxx-1)*(2*kmaxy-1)*(2*kmaxz-1), elementSize, "cosSinSums");
    }
-    else if (force.getNonbondedMethod() == NonbondedForce::PME) {
+    else if (force.getNonbondedMethod() == NonbondedForce::PME && cu.getContextIndex() == 0) {
        // Compute the PME parameters.

        int gridSizeX, gridSizeY, gridSizeZ;
@@ -1497,7 +1555,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
        defines["EWALD_ALPHA"] = cu.doubleToString(alpha);
        defines["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI));
        defines["USE_EWALD"] = "1";
-        ewaldSelfEnergy = (cu.getContextIndex() == 0 ? -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI) : 0.0);
+        ewaldSelfEnergy = -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI);
        pmeDefines["PME_ORDER"] = cu.intToString(PmeOrder);
        pmeDefines["NUM_ATOMS"] = cu.intToString(numParticles);
        pmeDefines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
@@ -1510,6 +1568,22 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
        if (cu.getUseDoublePrecision())
            pmeDefines["USE_DOUBLE_PRECISION"] = "1";
        CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::pme, pmeDefines);
+        if (cu.getPlatformData().useCpuPme) {
+            // Create the CPU PME kernel.
+            
+            try {
+                cpuPme = getPlatform().createKernel(CalcPmeReciprocalForceKernel::Name(), *cu.getPlatformData().context);
+                cpuPme.getAs<CalcPmeReciprocalForceKernel>().initialize(gridSizeX, gridSizeY, gridSizeZ, numParticles, alpha);
+                CUfunction addForcesKernel = cu.getKernel(module, "addForces");
+                pmeio = new PmeIO(cu, addForcesKernel);
+                cu.addPreComputation(new PmePreComputation(cu, cpuPme, *pmeio));
+                cu.addPostComputation(new PmePostComputation(cpuPme, *pmeio));
+            }
+            catch (OpenMMException& ex) {
+                // The CPU PME plugin isn't available.
+            }
+        }
+        if (pmeio == NULL) {
            pmeGridIndexKernel = cu.getKernel(module, "findAtomGridIndex");
            pmeSpreadChargeKernel = cu.getKernel(module, "gridSpreadCharge");
            pmeConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
@@ -1618,6 +1692,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
                }
            }
        }
+    }
    else
        ewaldSelfEnergy = 0.0;

@@ -1654,13 +1729,13 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
 }

 double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal) {
-    if (cosSinSums != NULL && cu.getContextIndex() == 0 && includeReciprocal) {
+    if (cosSinSums != NULL && includeReciprocal) {
        void* sumsArgs[] = {&cu.getEnergyBuffer().getDevicePointer(), &cu.getPosq().getDevicePointer(), &cosSinSums->getDevicePointer(), cu.getPeriodicBoxSizePointer()};
        cu.executeKernel(ewaldSumsKernel, sumsArgs, cosSinSums->getSize());
        void* forcesArgs[] = {&cu.getForce().getDevicePointer(), &cu.getPosq().getDevicePointer(), &cosSinSums->getDevicePointer(), cu.getPeriodicBoxSizePointer()};
        cu.executeKernel(ewaldForcesKernel, forcesArgs, cu.getNumAtoms());
    }
-    if (directPmeGrid != NULL && cu.getContextIndex() == 0 && includeReciprocal) {
+    if (directPmeGrid != NULL && includeReciprocal) {
        void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
        cu.executeKernel(pmeGridIndexKernel, gridIndexArgs, cu.getNumAtoms());


--- a/platforms/cuda/src/CudaKernels.h
+++ b/platforms/cuda/src/CudaKernels.h
@@ -557,7 +557,7 @@ class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
 public:
    CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcNonbondedForceKernel(name, platform),
            cu(cu), hasInitializedFFT(false), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), directPmeGrid(NULL), reciprocalPmeGrid(NULL),
-            pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL),  pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL) {
+            pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL),  pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL), pmeio(NULL) {
    }
    ~CudaCalcNonbondedForceKernel();
    /**
@@ -596,6 +596,9 @@ private:
        const char* getMaxValue() const {return "make_int2(INT_MAX, INT_MAX)";}
        const char* getSortKey() const {return "value.y";}
    };
+    class PmeIO;
+    class PmePreComputation;
+    class PmePostComputation;
    CudaContext& cu;
    bool hasInitializedFFT;
    CudaArray* sigmaEpsilon;
@@ -609,6 +612,8 @@ private:
    CudaArray* pmeAtomRange;
    CudaArray* pmeAtomGridIndex;
    CudaSort* sort;
+    Kernel cpuPme;
+    PmeIO* pmeio;
    cufftHandle fftForward;
    cufftHandle fftBackward;
    CUfunction ewaldSumsKernel;

--- a/platforms/cuda/src/CudaPlatform.cpp
+++ b/platforms/cuda/src/CudaPlatform.cpp
@@ -87,12 +87,14 @@ CudaPlatform::CudaPlatform() {
    platformProperties.push_back(CudaDeviceName());
    platformProperties.push_back(CudaUseBlockingSync());
    platformProperties.push_back(CudaPrecision());
+    platformProperties.push_back(CudaUseCpuPme());
    platformProperties.push_back(CudaCompiler());
    platformProperties.push_back(CudaTempDirectory());
    setPropertyDefaultValue(CudaDeviceIndex(), "");
    setPropertyDefaultValue(CudaDeviceName(), "");
    setPropertyDefaultValue(CudaUseBlockingSync(), "true");
    setPropertyDefaultValue(CudaPrecision(), "single");
+    setPropertyDefaultValue(CudaUseCpuPme(), "false");
 #ifdef _MSC_VER
    char* bindir = getenv("CUDA_BIN_PATH");
    string nvcc = (bindir == NULL ? "nvcc.exe" : string(bindir)+"\\nvcc.exe");
@@ -141,13 +143,20 @@ void CudaPlatform::contextCreated(ContextImpl& context, const map<string, string
            getPropertyDefaultValue(CudaUseBlockingSync()) : properties.find(CudaUseBlockingSync())->second);
    string precisionPropValue = (properties.find(CudaPrecision()) == properties.end() ?
            getPropertyDefaultValue(CudaPrecision()) : properties.find(CudaPrecision())->second);
+    string cpuPmePropValue = (properties.find(CudaUseCpuPme()) == properties.end() ?
+            getPropertyDefaultValue(CudaUseCpuPme()) : properties.find(CudaUseCpuPme())->second);
    const string& compilerPropValue = (properties.find(CudaCompiler()) == properties.end() ?
            getPropertyDefaultValue(CudaCompiler()) : properties.find(CudaCompiler())->second);
    const string& tempPropValue = (properties.find(CudaTempDirectory()) == properties.end() ?
            getPropertyDefaultValue(CudaTempDirectory()) : properties.find(CudaTempDirectory())->second);
    transform(blockingPropValue.begin(), blockingPropValue.end(), blockingPropValue.begin(), ::tolower);
    transform(precisionPropValue.begin(), precisionPropValue.end(), precisionPropValue.begin(), ::tolower);
-    context.setPlatformData(new PlatformData(context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, compilerPropValue, tempPropValue));
+    transform(cpuPmePropValue.begin(), cpuPmePropValue.end(), cpuPmePropValue.begin(), ::tolower);
+    vector<string> pmeKernelName;
+    pmeKernelName.push_back(CalcPmeReciprocalForceKernel::Name());
+    if (!supportsKernels(pmeKernelName))
+        cpuPmePropValue = "false";
+    context.setPlatformData(new PlatformData(&context, context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, cpuPmePropValue, compilerPropValue, tempPropValue));
 }

 void CudaPlatform::contextDestroyed(ContextImpl& context) const {
@@ -155,8 +164,8 @@ void CudaPlatform::contextDestroyed(ContextImpl& context) const {
    delete data;
 }

-CudaPlatform::PlatformData::PlatformData(const System& system, const string& deviceIndexProperty, const string& blockingProperty, const string& precisionProperty,
-            const string& compilerProperty, const string& tempProperty) : removeCM(false), stepCount(0), computeForceCount(0), time(0.0)  {
+CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& system, const string& deviceIndexProperty, const string& blockingProperty, const string& precisionProperty,
+            const string& cpuPmeProperty, const string& compilerProperty, const string& tempProperty) : context(context), removeCM(false), stepCount(0), computeForceCount(0), time(0.0)  {
    bool blocking = (blockingProperty == "true");
    vector<string> devices;
    size_t searchPos = 0, nextPos;
@@ -185,10 +194,12 @@ CudaPlatform::PlatformData::PlatformData(const System& system, const string& dev
        CHECK_RESULT(cuDeviceGetName(name, 1000, contexts[i]->getDevice()), "Error querying device name");
        deviceName << name;
    }
+    useCpuPme = (cpuPmeProperty == "true" && !contexts[0]->getUseDoublePrecision());
    propertyValues[CudaPlatform::CudaDeviceIndex()] = deviceIndex.str();
    propertyValues[CudaPlatform::CudaDeviceName()] = deviceName.str();
    propertyValues[CudaPlatform::CudaUseBlockingSync()] = blocking ? "true" : "false";
    propertyValues[CudaPlatform::CudaPrecision()] = precisionProperty;
+    propertyValues[CudaPlatform::CudaUseCpuPme()] = useCpuPme ? "true" : "false";
    propertyValues[CudaPlatform::CudaCompiler()] = compilerProperty;
    propertyValues[CudaPlatform::CudaTempDirectory()] = tempProperty;
    contextEnergy.resize(contexts.size());

--- a/platforms/cuda/src/kernels/pme.cu
+++ b/platforms/cuda/src/kernels/pme.cu
@@ -271,3 +271,13 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __
        forceBuffers[atom+2*PADDED_NUM_ATOMS] += static_cast<unsigned long long>((long long) (-q*force.z*GRID_SIZE_Z*invPeriodicBoxSize.z*0x100000000));
    }
 }
+
+extern "C" __global__
+void addForces(const real4* __restrict__ forces, unsigned long long* __restrict__ forceBuffers) {
+    for (int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < NUM_ATOMS; atom += blockDim.x*gridDim.x) {
+        real4 f = forces[atom];
+        forceBuffers[atom] += static_cast<unsigned long long>((long long) (f.x*0x100000000));
+        forceBuffers[atom+PADDED_NUM_ATOMS] += static_cast<unsigned long long>((long long) (f.y*0x100000000));
+        forceBuffers[atom+2*PADDED_NUM_ATOMS] += static_cast<unsigned long long>((long long) (f.z*0x100000000));
+    }
+}
--- a/platforms/cuda/tests/TestCudaRandom.cpp
+++ b/platforms/cuda/tests/TestCudaRandom.cpp
@@ -54,7 +54,7 @@ void testGaussian() {
    System system;
    for (int i = 0; i < numAtoms; i++)
        system.addParticle(1.0);
-    CudaPlatform::PlatformData platformData(system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"),
+    CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), "false",
            platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()));
    CudaContext& context = *platformData.contexts[0];
    context.initialize();

--- a/platforms/cuda/tests/TestCudaSort.cpp
+++ b/platforms/cuda/tests/TestCudaSort.cpp
@@ -64,7 +64,7 @@ void verifySorting(vector<float> array) {

    System system;
    system.addParticle(0.0);
-    CudaPlatform::PlatformData platformData(system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"),
+    CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), "false",
            platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()));
    CudaContext& context = *platformData.contexts[0];
    context.initialize();

--- a/platforms/opencl/include/OpenCLPlatform.h
+++ b/platforms/opencl/include/OpenCLPlatform.h
@@ -87,17 +87,25 @@ public:
        static const std::string key = "OpenCLPrecision";
        return key;
    }
+    /**
+     * This is the name of the parameter for selecting whether to use the CPU based PME calculation.
+     */
+    static const std::string& OpenCLUseCpuPme() {
+        static const std::string key = "OpenCLUseCpuPme";
+        return key;
+    }
 };

 class OPENMM_EXPORT_OPENCL OpenCLPlatform::PlatformData {
 public:
-    PlatformData(const System& system, const std::string& platformPropValue, const std::string& deviceIndexProperty, const std::string& precisionProperty);
+    PlatformData(const System& system, const std::string& platformPropValue, const std::string& deviceIndexProperty, const std::string& precisionProperty, const std::string& cpuPmeProperty);
    ~PlatformData();
    void initializeContexts(const System& system);
    void syncContexts();
+    ContextImpl* context;
    std::vector<OpenCLContext*> contexts;
    std::vector<double> contextEnergy;
-    bool removeCM;
+    bool removeCM, useCpuPme;
    int cmMotionFrequency;
    int stepCount, computeForceCount;
    double time;

--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -334,6 +334,10 @@ OpenCLContext::~OpenCLContext() {
        delete forces[i];
    for (int i = 0; i < (int) reorderListeners.size(); i++)
        delete reorderListeners[i];
+    for (int i = 0; i < (int) preComputations.size(); i++)
+        delete preComputations[i];
+    for (int i = 0; i < (int) postComputations.size(); i++)
+        delete postComputations[i];
    if (pinnedBuffer != NULL)
        delete pinnedBuffer;
    if (posq != NULL)
@@ -1106,6 +1110,14 @@ void OpenCLContext::addReorderListener(ReorderListener* listener) {
    reorderListeners.push_back(listener);
 }

+void OpenCLContext::addPreComputation(ForcePreComputation* computation) {
+    preComputations.push_back(computation);
+}
+
+void OpenCLContext::addPostComputation(ForcePostComputation* computation) {
+    postComputations.push_back(computation);
+}
+
 struct OpenCLContext::WorkThread::ThreadData {
    ThreadData(std::queue<OpenCLContext::WorkTask*>& tasks, bool& waiting,  bool& finished,
            pthread_mutex_t& queueLock, pthread_cond_t& waitForTaskCondition, pthread_cond_t& queueEmptyCondition) :

--- a/platforms/opencl/src/OpenCLContext.h
+++ b/platforms/opencl/src/OpenCLContext.h
@@ -158,6 +158,8 @@ public:
    class WorkTask;
    class WorkThread;
    class ReorderListener;
+    class ForcePreComputation;
+    class ForcePostComputation;
    static const int ThreadBlockSize;
    static const int TileSize;
    OpenCLContext(const System& system, int platformIndex, int deviceIndex, const std::string& precision, OpenCLPlatform::PlatformData& platformData);
@@ -554,6 +556,28 @@ public:
    std::vector<ReorderListener*>& getReorderListeners() {
        return reorderListeners;
    }
+    /**
+     * Add a pre-computation that should be called at the very start of force and energy evaluations.
+     * The OpenCLContext assumes ownership of the object, and deletes it when the context itself is deleted.
+     */
+    void addPreComputation(ForcePreComputation* computation);
+    /**
+     * Get the list of ForcePreComputations.
+     */
+    std::vector<ForcePreComputation*>& getPreComputations() {
+        return preComputations;
+    }
+    /**
+     * Add a post-computation that should be called at the very end of force and energy evaluations.
+     * The OpenCLContext assumes ownership of the object, and deletes it when the context itself is deleted.
+     */
+    void addPostComputation(ForcePostComputation* computation);
+    /**
+     * Get the list of ForcePostComputations.
+     */
+    std::vector<ForcePostComputation*>& getPostComputations() {
+        return postComputations;
+    }
    /**
     * Mark that the current molecule definitions (and hence the atom order) may be invalid.
     * This should be called whenever force field parameters change.  It will cause the definitions
@@ -625,6 +649,8 @@ private:
    std::vector<cl::Memory*> autoclearBuffers;
    std::vector<int> autoclearBufferSizes;
    std::vector<ReorderListener*> reorderListeners;
+    std::vector<ForcePreComputation*> preComputations;
+    std::vector<ForcePostComputation*> postComputations;
    OpenCLIntegrationUtilities* integration;
    OpenCLExpressionUtilities* expression;
    OpenCLBondedUtilities* bonded;
@@ -686,7 +712,7 @@ private:

 /**
 * This abstract class defines a function to be executed whenever atoms get reordered.
- * Objects that need to know when reordering happens should create a reorderListener
+ * Objects that need to know when reordering happens should create a ReorderListener
 * and register it by calling addReorderListener().
 */
 class OpenCLContext::ReorderListener {
@@ -696,6 +722,39 @@ public:
    }
 };

+/**
+ * This abstract class defines a function to be executed at the very beginning of force and
+ * energy evaluation, before any other calculation has been done.  It is useful for operations
+ * that need to be performed at a nonstandard point in the process.  After creating a
+ * ForcePreComputation, register it by calling addForcePreComputation().
+ */
+class OpenCLContext::ForcePreComputation {
+public:
+    /**
+     * @param includeForce  true if forces should be computed
+     * @param includeEnergy true if potential energy should be computed
+     * @param groups        a set of bit flags for which force groups to include
+     */
+    virtual void computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) = 0;
+};
+
+/**
+ * This abstract class defines a function to be executed at the very end of force and
+ * energy evaluation, after all other calculations have been done.  It is useful for operations
+ * that need to be performed at a nonstandard point in the process.  After creating a
+ * ForcePostComputation, register it by calling addForcePostComputation().
+ */
+class OpenCLContext::ForcePostComputation {
+public:
+    /**
+     * @param includeForce  true if forces should be computed
+     * @param includeEnergy true if potential energy should be computed
+     * @param groups        a set of bit flags for which force groups to include
+     * @return an optional contribution to add to the potential energy.
+     */
+    virtual double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) = 0;
+};
+
 } // namespace OpenMM

 #endif /*OPENMM_OPENCLCONTEXT_H_*/
--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -104,10 +104,12 @@ void OpenCLCalcForcesAndEnergyKernel::initialize(const System& system) {
 }

 void OpenCLCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForces, bool includeEnergy, int groups) {
+    cl.clearAutoclearBuffers();
+    for (vector<OpenCLContext::ForcePreComputation*>::iterator iter = cl.getPreComputations().begin(); iter != cl.getPreComputations().end(); ++iter)
+        (*iter)->computeForceAndEnergy(includeForces, includeEnergy, groups);
    OpenCLNonbondedUtilities& nb = cl.getNonbondedUtilities();
    bool includeNonbonded = ((groups&(1<<nb.getForceGroup())) != 0);
    cl.setComputeForceCount(cl.getComputeForceCount()+1);
-    cl.clearAutoclearBuffers();
    if (includeNonbonded)
        nb.prepareInteractions();
 }
@@ -117,8 +119,10 @@ double OpenCLCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context,
    if ((groups&(1<<cl.getNonbondedUtilities().getForceGroup())) != 0)
        cl.getNonbondedUtilities().computeInteractions();
    cl.reduceForces();
+    double sum = 0.0;
+    for (vector<OpenCLContext::ForcePostComputation*>::iterator iter = cl.getPostComputations().begin(); iter != cl.getPostComputations().end(); ++iter)
+        sum += (*iter)->computeForceAndEnergy(includeForces, includeEnergy, groups);
    cl.getIntegrationUtilities().distributeForcesFromVirtualSites();
-    double sum = 0.0f;
    if (includeEnergy) {
        OpenCLArray& energyArray = cl.getEnergyBuffer();
        if (cl.getUseDoublePrecision()) {
@@ -1323,6 +1327,58 @@ private:
    const NonbondedForce& force;
 };

+class OpenCLCalcNonbondedForceKernel::PmeIO : public CalcPmeReciprocalForceKernel::IO {
+public:
+    PmeIO(OpenCLContext& cl, cl::Kernel addForcesKernel) : cl(cl), addForcesKernel(addForcesKernel), forceTemp(NULL) {
+        forceTemp = OpenCLArray::create<mm_float4>(cl, cl.getNumAtoms(), "PmeForce");
+        addForcesKernel.setArg<cl::Buffer>(0, forceTemp->getDeviceBuffer());
+    }
+    ~PmeIO() {
+        if (forceTemp != NULL)
+            delete forceTemp;
+    }
+    float* getPosq() {
+        cl.getPosq().download(posq);
+        return (float*) &posq[0];
+    }
+    void setForce(float* force) {
+        forceTemp->upload(force);
+        addForcesKernel.setArg<cl::Buffer>(1, cl.getForce().getDeviceBuffer());
+        cl.executeKernel(addForcesKernel, cl.getNumAtoms());
+    }
+private:
+    OpenCLContext& cl;
+    vector<mm_float4> posq;
+    OpenCLArray* forceTemp;
+    cl::Kernel addForcesKernel;
+};
+
+class OpenCLCalcNonbondedForceKernel::PmePreComputation : public OpenCLContext::ForcePreComputation {
+public:
+    PmePreComputation(OpenCLContext& cl, Kernel& pme, CalcPmeReciprocalForceKernel::IO& io) : cl(cl), pme(pme), io(io) {
+    }
+    void computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
+        Vec3 boxSize(cl.getPeriodicBoxSize().x, cl.getPeriodicBoxSize().y, cl.getPeriodicBoxSize().z);
+        pme.getAs<CalcPmeReciprocalForceKernel>().beginComputation(io, boxSize, includeEnergy);
+    }
+private:
+    OpenCLContext& cl;
+    Kernel pme;
+    CalcPmeReciprocalForceKernel::IO& io;
+};
+
+class OpenCLCalcNonbondedForceKernel::PmePostComputation : public OpenCLContext::ForcePostComputation {
+public:
+    PmePostComputation(Kernel& pme, CalcPmeReciprocalForceKernel::IO& io) : pme(pme), io(io) {
+    }
+    double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
+        return pme.getAs<CalcPmeReciprocalForceKernel>().finishComputation(io);
+    }
+private:
+    Kernel pme;
+    CalcPmeReciprocalForceKernel::IO& io;
+};
+
 OpenCLCalcNonbondedForceKernel::~OpenCLCalcNonbondedForceKernel() {
    if (sigmaEpsilon != NULL)
        delete sigmaEpsilon;
@@ -1350,6 +1406,8 @@ OpenCLCalcNonbondedForceKernel::~OpenCLCalcNonbondedForceKernel() {
        delete sort;
    if (fft != NULL)
        delete fft;
+    if (pmeio != NULL)
+        delete pmeio;
 }

 void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const NonbondedForce& force) {
@@ -1430,7 +1488,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
    else
        dispersionCoefficient = 0.0;
    alpha = 0;
-    if (force.getNonbondedMethod() == NonbondedForce::Ewald) {
+    if (force.getNonbondedMethod() == NonbondedForce::Ewald && cl.getContextIndex() == 0) {
        // Compute the Ewald parameters.

        int kmaxx, kmaxy, kmaxz;
@@ -1438,7 +1496,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
        defines["EWALD_ALPHA"] = cl.doubleToString(alpha);
        defines["TWO_OVER_SQRT_PI"] = cl.doubleToString(2.0/sqrt(M_PI));
        defines["USE_EWALD"] = "1";
-        ewaldSelfEnergy = (cl.getContextIndex() == 0 ? -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI) : 0.0);
+        ewaldSelfEnergy = -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI);

        // Create the reciprocal space kernels.

@@ -1454,7 +1512,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
        int elementSize = (cl.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2));
        cosSinSums = new OpenCLArray(cl, (2*kmaxx-1)*(2*kmaxy-1)*(2*kmaxz-1), elementSize, "cosSinSums");
    }
-    else if (force.getNonbondedMethod() == NonbondedForce::PME) {
+    else if (force.getNonbondedMethod() == NonbondedForce::PME && cl.getContextIndex() == 0) {
        // Compute the PME parameters.

        int gridSizeX, gridSizeY, gridSizeZ;
@@ -1465,7 +1523,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
        defines["EWALD_ALPHA"] = cl.doubleToString(alpha);
        defines["TWO_OVER_SQRT_PI"] = cl.doubleToString(2.0/sqrt(M_PI));
        defines["USE_EWALD"] = "1";
-        ewaldSelfEnergy = (cl.getContextIndex() == 0 ? -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI) : 0.0);
+        ewaldSelfEnergy = -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI);
        pmeDefines["PME_ORDER"] = cl.intToString(PmeOrder);
        pmeDefines["NUM_ATOMS"] = cl.intToString(numParticles);
        pmeDefines["RECIP_EXP_FACTOR"] = cl.doubleToString(M_PI*M_PI/(alpha*alpha));
@@ -1476,7 +1534,23 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
        bool deviceIsCpu = (cl.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
        if (deviceIsCpu)
            pmeDefines["DEVICE_IS_CPU"] = "1";
+        if (cl.getPlatformData().useCpuPme) {
+            // Create the CPU PME kernel.
            
+            try {
+                cpuPme = getPlatform().createKernel(CalcPmeReciprocalForceKernel::Name(), *cl.getPlatformData().context);
+                cpuPme.getAs<CalcPmeReciprocalForceKernel>().initialize(gridSizeX, gridSizeY, gridSizeZ, numParticles, alpha);
+                cl::Program program = cl.createProgram(OpenCLKernelSources::pme, pmeDefines);
+                cl::Kernel addForcesKernel = cl::Kernel(program, "addForces");
+                pmeio = new PmeIO(cl, addForcesKernel);
+                cl.addPreComputation(new PmePreComputation(cl, cpuPme, *pmeio));
+                cl.addPostComputation(new PmePostComputation(cpuPme, *pmeio));
+            }
+            catch (OpenMMException& ex) {
+                // The CPU PME plugin isn't available.
+            }
+        }
+        if (pmeio == NULL) {
            // Create required data structures.

            int elementSize = (cl.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
@@ -1565,6 +1639,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
                }
            }
        }
+    }
    else
        ewaldSelfEnergy = 0.0;

@@ -1650,7 +1725,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
            }
       }
    }
-    if (cosSinSums != NULL && cl.getContextIndex() == 0 && includeReciprocal) {
+    if (cosSinSums != NULL && includeReciprocal) {
        mm_double4 boxSize = cl.getPeriodicBoxSizeDouble();
        mm_double4 recipBoxSize = mm_double4(2*M_PI/boxSize.x, 2*M_PI/boxSize.y, 2*M_PI/boxSize.z, 0.0);
        double recipCoefficient = ONE_4PI_EPS0*4*M_PI/(boxSize.x*boxSize.y*boxSize.z);
@@ -1669,7 +1744,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
        cl.executeKernel(ewaldSumsKernel, cosSinSums->getSize());
        cl.executeKernel(ewaldForcesKernel, cl.getNumAtoms());
    }
-    if (pmeGrid != NULL && cl.getContextIndex() == 0 && includeReciprocal) {
+    if (pmeGrid != NULL && includeReciprocal) {
        setPeriodicBoxSizeArg(cl, pmeUpdateBsplinesKernel, 4);
        setInvPeriodicBoxSizeArg(cl, pmeUpdateBsplinesKernel, 5);
        cl.executeKernel(pmeUpdateBsplinesKernel, cl.getNumAtoms());

--- a/platforms/opencl/src/OpenCLKernels.h
+++ b/platforms/opencl/src/OpenCLKernels.h
@@ -557,7 +557,7 @@ public:
    OpenCLCalcNonbondedForceKernel(std::string name, const Platform& platform, OpenCLContext& cl, const System& system) : CalcNonbondedForceKernel(name, platform),
            hasInitializedKernel(false), cl(cl), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), pmeGrid(NULL),
            pmeGrid2(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeBsplineTheta(NULL),
-            pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL), fft(NULL) {
+            pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL), fft(NULL), pmeio(NULL) {
    }
    ~OpenCLCalcNonbondedForceKernel();
    /**
@@ -596,6 +596,9 @@ private:
        const char* getMaxValue() const {return "(int2) (INT_MAX, INT_MAX)";}
        const char* getSortKey() const {return "value.y";}
    };
+    class PmeIO;
+    class PmePreComputation;
+    class PmePostComputation;
    OpenCLContext& cl;
    bool hasInitializedKernel;
    OpenCLArray* sigmaEpsilon;
@@ -611,6 +614,8 @@ private:
    OpenCLArray* pmeAtomGridIndex;
    OpenCLSort* sort;
    OpenCLFFT3D* fft;
+    Kernel cpuPme;
+    PmeIO* pmeio;
    cl::Kernel ewaldSumsKernel;
    cl::Kernel ewaldForcesKernel;
    cl::Kernel pmeGridIndexKernel;

--- a/platforms/opencl/src/OpenCLPlatform.cpp
+++ b/platforms/opencl/src/OpenCLPlatform.cpp
@@ -31,6 +31,7 @@
 #include "openmm/internal/ContextImpl.h"
 #include "openmm/Context.h"
 #include "openmm/System.h"
+#include <algorithm>
 #include <sstream>

 using namespace OpenMM;
@@ -78,11 +79,13 @@ OpenCLPlatform::OpenCLPlatform() {
    platformProperties.push_back(OpenCLPlatformIndex());
    platformProperties.push_back(OpenCLPlatformName());
    platformProperties.push_back(OpenCLPrecision());
+    platformProperties.push_back(OpenCLUseCpuPme());
    setPropertyDefaultValue(OpenCLDeviceIndex(), "");
    setPropertyDefaultValue(OpenCLDeviceName(), "");
    setPropertyDefaultValue(OpenCLPlatformIndex(), "");
    setPropertyDefaultValue(OpenCLPlatformName(), "");
    setPropertyDefaultValue(OpenCLPrecision(), "single");
+    setPropertyDefaultValue(OpenCLUseCpuPme(), "false");
 }

 double OpenCLPlatform::getSpeed() const {
@@ -112,7 +115,15 @@ void OpenCLPlatform::contextCreated(ContextImpl& context, const map<string, stri
            getPropertyDefaultValue(OpenCLDeviceIndex()) : properties.find(OpenCLDeviceIndex())->second);
    string precisionPropValue = (properties.find(OpenCLPrecision()) == properties.end() ?
            getPropertyDefaultValue(OpenCLPrecision()) : properties.find(OpenCLPrecision())->second);
-    context.setPlatformData(new PlatformData(context.getSystem(), platformPropValue, devicePropValue, precisionPropValue));
+    string cpuPmePropValue = (properties.find(OpenCLUseCpuPme()) == properties.end() ?
+            getPropertyDefaultValue(OpenCLUseCpuPme()) : properties.find(OpenCLUseCpuPme())->second);
+    transform(precisionPropValue.begin(), precisionPropValue.end(), precisionPropValue.begin(), ::tolower);
+    transform(cpuPmePropValue.begin(), cpuPmePropValue.end(), cpuPmePropValue.begin(), ::tolower);
+    vector<string> pmeKernelName;
+    pmeKernelName.push_back(CalcPmeReciprocalForceKernel::Name());
+    if (!supportsKernels(pmeKernelName))
+        cpuPmePropValue = "false";
+    context.setPlatformData(new PlatformData(context.getSystem(), platformPropValue, devicePropValue, precisionPropValue, cpuPmePropValue));
 }

 void OpenCLPlatform::contextDestroyed(ContextImpl& context) const {
@@ -121,7 +132,7 @@ void OpenCLPlatform::contextDestroyed(ContextImpl& context) const {
 }

 OpenCLPlatform::PlatformData::PlatformData(const System& system, const string& platformPropValue, const string& deviceIndexProperty,
-        const string& precisionProperty) : removeCM(false), stepCount(0), computeForceCount(0), time(0.0)  {
+        const string& precisionProperty, const string& cpuPmeProperty) : removeCM(false), stepCount(0), computeForceCount(0), time(0.0)  {
    int platformIndex = 0;
    if (platformPropValue.length() > 0)
        stringstream(platformPropValue) >> platformIndex;
@@ -150,6 +161,7 @@ OpenCLPlatform::PlatformData::PlatformData(const System& system, const string& p
        deviceIndex << contexts[i]->getDeviceIndex();
        deviceName << contexts[i]->getDevice().getInfo<CL_DEVICE_NAME>();
    }
+    useCpuPme = (cpuPmeProperty == "true" && !contexts[0]->getUseDoublePrecision());
    propertyValues[OpenCLPlatform::OpenCLDeviceIndex()] = deviceIndex.str();
    propertyValues[OpenCLPlatform::OpenCLDeviceName()] = deviceName.str();
    propertyValues[OpenCLPlatform::OpenCLPlatformIndex()] = contexts[0]->intToString(platformIndex);
@@ -157,6 +169,7 @@ OpenCLPlatform::PlatformData::PlatformData(const System& system, const string& p
    cl::Platform::get(&platforms);
    propertyValues[OpenCLPlatform::OpenCLPlatformName()] = platforms[platformIndex].getInfo<CL_PLATFORM_NAME>();
    propertyValues[OpenCLPlatform::OpenCLPrecision()] = precisionProperty;
+    propertyValues[OpenCLPlatform::OpenCLUseCpuPme()] = useCpuPme ? "true" : "false";
    contextEnergy.resize(contexts.size());
 }


--- a/platforms/opencl/src/kernels/pme.cl
+++ b/platforms/opencl/src/kernels/pme.cl
@@ -391,3 +391,8 @@ __kernel void gridInterpolateForce(__global const real4* restrict posq, __global
        forceBuffers[atom] = totalForce;
    }
 }
+
+__kernel void addForces(__global const real4* restrict forces, __global real4* restrict forceBuffers) {
+    for (int atom = get_global_id(0); atom < NUM_ATOMS; atom += get_global_size(0))
+        forceBuffers[atom] += forces[atom];
+}
--- a/platforms/opencl/tests/TestOpenCLFFT.cpp
+++ b/platforms/opencl/tests/TestOpenCLFFT.cpp
@@ -54,7 +54,7 @@ template <class Real2>
 void testTransform() {
    System system;
    system.addParticle(0.0);
-    OpenCLPlatform::PlatformData platformData(system, "", "", platform.getPropertyDefaultValue("OpenCLPrecision"));
+    OpenCLPlatform::PlatformData platformData(system, "", "", platform.getPropertyDefaultValue("OpenCLPrecision"), "false");
    OpenCLContext& context = *platformData.contexts[0];
    context.initialize();
    OpenMM_SFMT::SFMT sfmt;