Merge branch 'pme' of https://github.com/peastman/openmm into pme

68f8aed8 · Peter · 84acc13d · 5e0b0e3a · 68f8aed8 · 68f8aed8
Commit 68f8aed8 authored Jun 24, 2013 by Peter
17 changed files
--- a/cmake_modules/FindFFTW.cmake
+++ b/cmake_modules/FindFFTW.cmake
+# - Find FFTW
+# Find the native FFTW includes and library
+#
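+#  FFTW_INCLUDES        - where to find fftw3.h
+#  FFTW_LIBRARY         - the main FFTW library (single precision, fftw3f).
+#  FFTW_THREADS_LIBRARY - the FFTW multithreading support library (fftw3f_threads).
+#  FFTW_FOUND           - True if FFTW found.
+#  FFTW_THREADS_FOUND   - True if the FFTW multithreading support library found.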
+
+if (FFTW_INCLUDES)
+  # Already in cache, be silent
+  set (FFTW_FIND_QUIETLY TRUE)
+endif (FFTW_INCLUDES)
+
+find_path (FFTW_INCLUDES fftw3.h)
+
+find_library (FFTW_LIBRARY NAMES fftw3f)
+find_library (FFTW_THREADS_LIBRARY NAMES fftw3f_threads)
+
+# handle the QUIETLY and REQUIRED arguments and set FFTW_FOUND to TRUE if
+# all listed variables are TRUE
+include (FindPackageHandleStandardArgs)
+find_package_handle_standard_args (FFTW DEFAULT_MSG FFTW_LIBRARY FFTW_INCLUDES)
+find_package_handle_standard_args (FFTW_THREADS DEFAULT_MSG FFTW_THREADS_LIBRARY FFTW_INCLUDES)
+
+mark_as_advanced (FFTW_LIBRARY FFTW_THREADS_LIBRARY FFTW_INCLUDES)
--- a/olla/include/openmm/kernels.h
+++ b/olla/include/openmm/kernels.h
@@ -1152,6 +1152,70 @@ public:
    virtual void execute(ContextImpl& context) = 0;
 };

+/**
+ * This kernel performs the reciprocal space calculation for PME.  In most cases, this
+ * calculation is done directly by CalcNonbondedForceKernel so this kernel is unneeded.
+ * In some cases it may want to outsource the work to a different kernel.  In particular,
+ * GPU based platforms sometimes use a CPU based implementation provided by a separate
+ * plugin.
+ */
+class CalcPmeReciprocalForceKernel : public KernelImpl {
+public:
+    class IO;
+    static std::string Name() {
+        return "CalcPmeReciprocalForce";
+    }
+    CalcPmeReciprocalForceKernel(std::string name, const Platform& platform) : KernelImpl(name, platform) {
+    }
+    /**
+     * Initialize the kernel.
+     * 
+     * @param gridx        the x size of the PME grid
+     * @param gridy        the y size of the PME grid
+     * @param gridz        the z size of the PME grid
+     * @param numParticles the number of particles in the system
+     * @param alpha        the Ewald blending parameter
+     */
+    virtual void initialize(int gridx, int gridy, int gridz, int numParticles, double alpha) = 0;
+    /**
+     * Begin computing the force and energy.
+     * 
+     * @param io               an object that coordinates data transfer
+     * @param periodicBoxSize  the size of the periodic box (measured in nm)
+     * @param includeEnergy    true if potential energy should be computed
+     */
+    virtual void beginComputation(IO& io, Vec3 periodicBoxSize, bool includeEnergy) = 0;
+    /**
+     * Finish computing the force and energy.
+     * 
+     * @param io   an object that coordinates data transfer
+     * @return the potential energy due to the PME reciprocal space interactions
+     */
+    virtual double finishComputation(IO& io) = 0;
+};
+
+/**
+ * Any class that uses CalcPmeReciprocalForceKernel should create an implementation of this
+ * class, then pass it to the kernel to manage communication with it.
+ */
+class CalcPmeReciprocalForceKernel::IO {
+public:
+    /**
+     * This is an interface that other classes implement, so give it a virtual
+     * destructor to make destroying an implementation through an IO pointer well defined.
+     */
+    virtual ~IO() {
+    }
+    /**
+     * Get a pointer to the atom charges and positions.  This array should contain four
+     * elements for each atom: x, y, z, and q in that order.
+     */
+    virtual float* getPosq() = 0;
+    /**
+     * Record the forces calculated by the kernel.
+     * 
+     * @param force    an array containing four elements for each atom.  The first three
+     *                 are the x, y, and z components of the force, while the fourth element
+     *                 should be ignored.
+     */
+    virtual void setForce(float* force) = 0;
+};
+
+
 } // namespace OpenMM

 #endif /*OPENMM_KERNELS_H_*/
--- a/platforms/cuda/include/CudaPlatform.h
+++ b/platforms/cuda/include/CudaPlatform.h
@@ -99,11 +99,12 @@ public:

 class OPENMM_EXPORT_CUDA CudaPlatform::PlatformData {
 public:
-    PlatformData(const System& system, const std::string& deviceIndexProperty, const std::string& blockingProperty, const std::string& precisionProperty,
+    PlatformData(ContextImpl* context, const System& system, const std::string& deviceIndexProperty, const std::string& blockingProperty, const std::string& precisionProperty,
            const std::string& compilerProperty, const std::string& tempProperty);
    ~PlatformData();
    void initializeContexts(const System& system);
    void syncContexts();
+    ContextImpl* context;
    std::vector<CudaContext*> contexts;
    std::vector<double> contextEnergy;
    bool removeCM, peerAccessSupported;

--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -231,6 +231,10 @@ CudaContext::~CudaContext() {
        delete forces[i];
    for (int i = 0; i < (int) reorderListeners.size(); i++)
        delete reorderListeners[i];
+    for (int i = 0; i < (int) preComputations.size(); i++)
+        delete preComputations[i];
+    for (int i = 0; i < (int) postComputations.size(); i++)
+        delete postComputations[i];
    if (pinnedBuffer != NULL)
        cuMemFreeHost(pinnedBuffer);
    if (posq != NULL)
@@ -1102,6 +1106,14 @@ void CudaContext::addReorderListener(ReorderListener* listener) {
    reorderListeners.push_back(listener);
 }

+void CudaContext::addPreComputation(ForcePreComputation* computation) {
+    preComputations.push_back(computation);
+}
+
+void CudaContext::addPostComputation(ForcePostComputation* computation) {
+    postComputations.push_back(computation);
+}
+
 struct CudaContext::WorkThread::ThreadData {
    ThreadData(std::queue<CudaContext::WorkTask*>& tasks, bool& waiting,  bool& finished,
            pthread_mutex_t& queueLock, pthread_cond_t& waitForTaskCondition, pthread_cond_t& queueEmptyCondition) :

--- a/platforms/cuda/src/CudaContext.h
+++ b/platforms/cuda/src/CudaContext.h
@@ -70,6 +70,8 @@ public:
    class WorkTask;
    class WorkThread;
    class ReorderListener;
+    class ForcePreComputation;
+    class ForcePostComputation;
    static const int ThreadBlockSize;
    static const int TileSize;
    CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const std::string& precision,
@@ -454,6 +456,28 @@ public:
    std::vector<ReorderListener*>& getReorderListeners() {
        return reorderListeners;
    }
+    /**
+     * Add a pre-computation that should be called at the very start of force and energy evaluations.
+     * The CudaContext assumes ownership of the object, and deletes it when the context itself is deleted.
+     */
+    void addPreComputation(ForcePreComputation* computation);
+    /**
+     * Get the list of ForcePreComputations.
+     */
+    std::vector<ForcePreComputation*>& getPreComputations() {
+        return preComputations;
+    }
+    /**
+     * Add a post-computation that should be called at the very end of force and energy evaluations.
+     * The CudaContext assumes ownership of the object, and deletes it when the context itself is deleted.
+     */
+    void addPostComputation(ForcePostComputation* computation);
+    /**
+     * Get the list of ForcePostComputations.
+     */
+    std::vector<ForcePostComputation*>& getPostComputations() {
+        return postComputations;
+    }
    /**
     * Mark that the current molecule definitions (and hence the atom order) may be invalid.
     * This should be called whenever force field parameters change.  It will cause the definitions
@@ -519,6 +543,8 @@ private:
    std::vector<CUdeviceptr> autoclearBuffers;
    std::vector<int> autoclearBufferSizes;
    std::vector<ReorderListener*> reorderListeners;
+    std::vector<ForcePreComputation*> preComputations;
+    std::vector<ForcePostComputation*> postComputations;
    CudaIntegrationUtilities* integration;
    CudaExpressionUtilities* expression;
    CudaBondedUtilities* bonded;
@@ -580,7 +606,7 @@ private:

 /**
 * This abstract class defines a function to be executed whenever atoms get reordered.
- * Objects that need to know when reordering happens should create a reorderListener
+ * Objects that need to know when reordering happens should create a ReorderListener
 * and register it by calling addReorderListener().
 */
 class CudaContext::ReorderListener {
@@ -590,6 +616,38 @@ public:
    }
 };

+/**
+ * This abstract class defines a function to be executed at the very beginning of force and
+ * energy evaluation, before any other calculation has been done.  It is useful for operations
+ * that need to be performed at a nonstandard point in the process.  After creating a
+ * ForcePreComputation, register it by calling addPreComputation().
+ */
+class CudaContext::ForcePreComputation {
+public:
+    /**
+     * The CudaContext owns registered objects and deletes them through a pointer to this
+     * base class, so the destructor must be virtual for subclasses to be destroyed correctly.
+     */
+    virtual ~ForcePreComputation() {
+    }
+    /**
+     * This is invoked at the start of each force and energy evaluation.
+     *
+     * @param includeForces true if forces should be computed
+     * @param includeEnergy true if potential energy should be computed
+     * @param groups        a set of bit flags for which force groups to include
+     */
+    virtual void computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) = 0;
+};
+
+/**
+ * This abstract class defines a function to be executed at the very end of force and
+ * energy evaluation, after all other calculations have been done.  It is useful for operations
+ * that need to be performed at a nonstandard point in the process.  After creating a
+ * ForcePostComputation, register it by calling addPostComputation().
+ */
+class CudaContext::ForcePostComputation {
+public:
+    /**
+     * The CudaContext owns registered objects and deletes them through a pointer to this
+     * base class, so the destructor must be virtual for subclasses to be destroyed correctly.
+     */
+    virtual ~ForcePostComputation() {
+    }
+    /**
+     * This is invoked at the end of each force and energy evaluation.
+     *
+     * @param includeForces true if forces should be computed
+     * @param includeEnergy true if potential energy should be computed
+     * @param groups        a set of bit flags for which force groups to include
+     * @return an optional contribution to add to the potential energy.
+     */
+    virtual double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) = 0;
+};
+
 } // namespace OpenMM

 #endif /*OPENMM_CUDACONTEXT_H_*/
--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -84,10 +84,12 @@ void CudaCalcForcesAndEnergyKernel::initialize(const System& system) {

 void CudaCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForces, bool includeEnergy, int groups) {
    cu.setAsCurrent();
+    cu.clearAutoclearBuffers();
+    for (vector<CudaContext::ForcePreComputation*>::iterator iter = cu.getPreComputations().begin(); iter != cu.getPreComputations().end(); ++iter)
+        (*iter)->computeForceAndEnergy(includeForces, includeEnergy, groups);
    CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
    bool includeNonbonded = ((groups&(1<<nb.getForceGroup())) != 0);
    cu.setComputeForceCount(cu.getComputeForceCount()+1);
-    cu.clearAutoclearBuffers();
    if (includeNonbonded)
        nb.prepareInteractions();
 }
@@ -96,8 +98,10 @@ double CudaCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bo
    cu.getBondedUtilities().computeInteractions(groups);
    if ((groups&(1<<cu.getNonbondedUtilities().getForceGroup())) != 0)
        cu.getNonbondedUtilities().computeInteractions();
-    cu.getIntegrationUtilities().distributeForcesFromVirtualSites();
    double sum = 0.0;
+    for (vector<CudaContext::ForcePostComputation*>::iterator iter = cu.getPostComputations().begin(); iter != cu.getPostComputations().end(); ++iter)
+        sum += (*iter)->computeForceAndEnergy(includeForces, includeEnergy, groups);
+    cu.getIntegrationUtilities().distributeForcesFromVirtualSites();
    if (includeEnergy) {
        CudaArray& energyArray = cu.getEnergyBuffer();
        if (cu.getUseDoublePrecision()) {
@@ -1330,6 +1334,59 @@ private:
    const NonbondedForce& force;
 };

+/**
+ * This is the IO implementation CudaCalcNonbondedForceKernel passes to a
+ * CalcPmeReciprocalForceKernel.  It gives the kernel a host copy of the context's posq
+ * array, and launches addForcesKernel on the forces the kernel reports back.
+ *
+ * NOTE(review): the host side of both transfers is single precision (vector<float4> posq and
+ * the float* passed to setForce()), while forceTemp is sized with sizeof(double4) when the
+ * context uses double precision.  Confirm that download()/upload() and the addForces kernel
+ * handle that combination, or restrict this path to single precision contexts.
+ */
+class CudaCalcNonbondedForceKernel::PmeIO : public CalcPmeReciprocalForceKernel::IO {
+public:
+    /**
+     * @param cu               the context whose posq array is read and whose force array is updated
+     * @param addForcesKernel  the kernel setForce() launches with the uploaded forces and the
+     *                         context's force array as arguments
+     */
+    PmeIO(CudaContext& cu, CUfunction addForcesKernel) : cu(cu), forceTemp(NULL), addForcesKernel(addForcesKernel) {
+        // Members are initialized in declaration order, so list them in that order.
+        int elementSize = (cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4));
+        forceTemp = new CudaArray(cu, cu.getNumAtoms(), elementSize, "PmeForce");
+    }
+    ~PmeIO() {
+        if (forceTemp != NULL)
+            delete forceTemp;
+    }
+    /**
+     * Download the context's posq array into a host buffer owned by this object and return
+     * a pointer to it.  The buffer is overwritten by the next call.
+     */
+    float* getPosq() {
+        cu.setAsCurrent();
+        cu.getPosq().download(posq);
+        return (float*) &posq[0];
+    }
+    /**
+     * Upload the forces to a temporary device array, then launch addForcesKernel with that
+     * array and the context's force array as arguments, using one work item per atom.
+     */
+    void setForce(float* force) {
+        forceTemp->upload(force);
+        void* args[] = {&forceTemp->getDevicePointer(), &cu.getForce().getDevicePointer()};
+        cu.executeKernel(addForcesKernel, args, cu.getNumAtoms());
+    }
+private:
+    CudaContext& cu;
+    vector<float4> posq;
+    CudaArray* forceTemp;
+    CUfunction addForcesKernel;
+};
+
+class CudaCalcNonbondedForceKernel::PmePreComputation : public CudaContext::ForcePreComputation {
+public:
+    PmePreComputation(CudaContext& cu, Kernel& pme, CalcPmeReciprocalForceKernel::IO& io) : cu(cu), pme(pme), io(io) {
+    }
+    void computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
+        Vec3 boxSize(cu.getPeriodicBoxSize().x, cu.getPeriodicBoxSize().y, cu.getPeriodicBoxSize().z);
+        pme.getAs<CalcPmeReciprocalForceKernel>().beginComputation(io, boxSize, includeEnergy);
+    }
+private:
+    CudaContext& cu;
+    Kernel pme;
+    CalcPmeReciprocalForceKernel::IO& io;
+};
+
+class CudaCalcNonbondedForceKernel::PmePostComputation : public CudaContext::ForcePostComputation {
+public:
+    PmePostComputation(Kernel& pme, CalcPmeReciprocalForceKernel::IO& io) : pme(pme), io(io) {
+    }
+    double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
+        return pme.getAs<CalcPmeReciprocalForceKernel>().finishComputation(io);
+    }
+private:
+    Kernel pme;
+    CalcPmeReciprocalForceKernel::IO& io;
+};
+
 CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
    cu.setAsCurrent();
    if (sigmaEpsilon != NULL)
@@ -1354,6 +1411,8 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
        delete pmeAtomGridIndex;
    if (sort != NULL)
        delete sort;
+    if (pmeio != NULL)
+        delete pmeio;
    if (hasInitializedFFT) {
        cufftDestroy(fftForward);
        cufftDestroy(fftBackward);
@@ -1457,7 +1516,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
    else
        dispersionCoefficient = 0.0;
    alpha = 0;
-    if (force.getNonbondedMethod() == NonbondedForce::Ewald) {
+    if (force.getNonbondedMethod() == NonbondedForce::Ewald && cu.getContextIndex() == 0) {
        // Compute the Ewald parameters.

        int kmaxx, kmaxy, kmaxz;
@@ -1465,7 +1524,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
        defines["EWALD_ALPHA"] = cu.doubleToString(alpha);
        defines["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI));
        defines["USE_EWALD"] = "1";
-        ewaldSelfEnergy = (cu.getContextIndex() == 0 ? -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI) : 0.0);
+        ewaldSelfEnergy = -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI);

        // Create the reciprocal space kernels.

@@ -1484,7 +1543,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
        int elementSize = (cu.getUseDoublePrecision() ? sizeof(double2) : sizeof(float2));
        cosSinSums = new CudaArray(cu, (2*kmaxx-1)*(2*kmaxy-1)*(2*kmaxz-1), elementSize, "cosSinSums");
    }
-    else if (force.getNonbondedMethod() == NonbondedForce::PME) {
+    else if (force.getNonbondedMethod() == NonbondedForce::PME && cu.getContextIndex() == 0) {
        // Compute the PME parameters.

        int gridSizeX, gridSizeY, gridSizeZ;
@@ -1497,7 +1556,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
        defines["EWALD_ALPHA"] = cu.doubleToString(alpha);
        defines["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI));
        defines["USE_EWALD"] = "1";
-        ewaldSelfEnergy = (cu.getContextIndex() == 0 ? -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI) : 0.0);
+        ewaldSelfEnergy = -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI);
        pmeDefines["PME_ORDER"] = cu.intToString(PmeOrder);
        pmeDefines["NUM_ATOMS"] = cu.intToString(numParticles);
        pmeDefines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
@@ -1510,111 +1569,127 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
        if (cu.getUseDoublePrecision())
            pmeDefines["USE_DOUBLE_PRECISION"] = "1";
        CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::pme, pmeDefines);
-        pmeGridIndexKernel = cu.getKernel(module, "findAtomGridIndex");
-        pmeSpreadChargeKernel = cu.getKernel(module, "gridSpreadCharge");
-        pmeConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
-        pmeInterpolateForceKernel = cu.getKernel(module, "gridInterpolateForce");
-        pmeEvalEnergyKernel = cu.getKernel(module, "gridEvaluateEnergy");
-        pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
-        cuFuncSetCacheConfig(pmeSpreadChargeKernel, CU_FUNC_CACHE_PREFER_L1);
-        cuFuncSetCacheConfig(pmeInterpolateForceKernel, CU_FUNC_CACHE_PREFER_L1);
-
-        // Create required data structures.
-
-        int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
-
-        directPmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*gridSizeZ, cu.getComputeCapability() >= 2.0 ? elementSize : sizeof(long long), "originalPmeGrid");
-        reciprocalPmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*(gridSizeZ/2+1), 2*elementSize, "reciprocalPmeGrid");
-
-        cu.addAutoclearBuffer(*directPmeGrid);
-
-        pmeBsplineModuliX = new CudaArray(cu, gridSizeX, elementSize, "pmeBsplineModuliX");
-        pmeBsplineModuliY = new CudaArray(cu, gridSizeY, elementSize, "pmeBsplineModuliY");
-        pmeBsplineModuliZ = new CudaArray(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
-        pmeAtomRange = CudaArray::create<int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
-        pmeAtomGridIndex = CudaArray::create<int2>(cu, numParticles, "pmeAtomGridIndex");
-        sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms());
-
-        cufftResult result = cufftPlan3d(&fftForward, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_D2Z : CUFFT_R2C);
-        if (result != CUFFT_SUCCESS)
-            throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
-        result = cufftPlan3d(&fftBackward, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_Z2D : CUFFT_C2R);
-        if (result != CUFFT_SUCCESS)
-            throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
-
-        cufftSetCompatibilityMode(fftForward, CUFFT_COMPATIBILITY_NATIVE);
-        cufftSetCompatibilityMode(fftBackward, CUFFT_COMPATIBILITY_NATIVE);
+        bool useCpuPme = true;
+        if (useCpuPme) {
+            try {
+                cpuPme = getPlatform().createKernel(CalcPmeReciprocalForceKernel::Name(), *cu.getPlatformData().context);
+                cpuPme.getAs<CalcPmeReciprocalForceKernel>().initialize(gridSizeX, gridSizeY, gridSizeZ, numParticles, alpha);
+                CUfunction addForcesKernel = cu.getKernel(module, "addForces");
+                pmeio = new PmeIO(cu, addForcesKernel);
+                cu.addPreComputation(new PmePreComputation(cu, cpuPme, *pmeio));
+                cu.addPostComputation(new PmePostComputation(cpuPme, *pmeio));
+            }
+            catch (OpenMMException& ex) {
+                // The CPU PME plugin isn't available.
+            }
+        }
+        if (pmeio == NULL) {
+            pmeGridIndexKernel = cu.getKernel(module, "findAtomGridIndex");
+            pmeSpreadChargeKernel = cu.getKernel(module, "gridSpreadCharge");
+            pmeConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
+            pmeInterpolateForceKernel = cu.getKernel(module, "gridInterpolateForce");
+            pmeEvalEnergyKernel = cu.getKernel(module, "gridEvaluateEnergy");
+            pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
+            cuFuncSetCacheConfig(pmeSpreadChargeKernel, CU_FUNC_CACHE_PREFER_L1);
+            cuFuncSetCacheConfig(pmeInterpolateForceKernel, CU_FUNC_CACHE_PREFER_L1);
+
+            // Create required data structures.
+
+            int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
+
+            directPmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*gridSizeZ, cu.getComputeCapability() >= 2.0 ? elementSize : sizeof(long long), "originalPmeGrid");
+            reciprocalPmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*(gridSizeZ/2+1), 2*elementSize, "reciprocalPmeGrid");
+
+            cu.addAutoclearBuffer(*directPmeGrid);
+
+            pmeBsplineModuliX = new CudaArray(cu, gridSizeX, elementSize, "pmeBsplineModuliX");
+            pmeBsplineModuliY = new CudaArray(cu, gridSizeY, elementSize, "pmeBsplineModuliY");
+            pmeBsplineModuliZ = new CudaArray(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
+            pmeAtomRange = CudaArray::create<int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
+            pmeAtomGridIndex = CudaArray::create<int2>(cu, numParticles, "pmeAtomGridIndex");
+            sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms());
+
+            cufftResult result = cufftPlan3d(&fftForward, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_D2Z : CUFFT_R2C);
+            if (result != CUFFT_SUCCESS)
+                throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
+            result = cufftPlan3d(&fftBackward, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_Z2D : CUFFT_C2R);
+            if (result != CUFFT_SUCCESS)
+                throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
+
+            cufftSetCompatibilityMode(fftForward, CUFFT_COMPATIBILITY_NATIVE);
+            cufftSetCompatibilityMode(fftBackward, CUFFT_COMPATIBILITY_NATIVE);
+
+            hasInitializedFFT = true;
+
+            // Initialize the b-spline moduli.
+
+            int maxSize = max(max(gridSizeX, gridSizeY), gridSizeZ);
+            vector<double> data(PmeOrder);
+            vector<double> ddata(PmeOrder);
+            vector<double> bsplines_data(maxSize);
+            data[PmeOrder-1] = 0.0;
+            data[1] = 0.0;
+            data[0] = 1.0;
+            for (int i = 3; i < PmeOrder; i++) {
+                double div = 1.0/(i-1.0);
+                data[i-1] = 0.0;
+                for (int j = 1; j < (i-1); j++)
+                    data[i-j-1] = div*(j*data[i-j-2]+(i-j)*data[i-j-1]);
+                data[0] = div*data[0];
+            }

-        hasInitializedFFT = true;
+            // Differentiate.

-        // Initialize the b-spline moduli.
-        
-        int maxSize = max(max(gridSizeX, gridSizeY), gridSizeZ);
-        vector<double> data(PmeOrder);
-        vector<double> ddata(PmeOrder);
-        vector<double> bsplines_data(maxSize);
-        data[PmeOrder-1] = 0.0;
-        data[1] = 0.0;
-        data[0] = 1.0;
-        for (int i = 3; i < PmeOrder; i++) {
-            double div = 1.0/(i-1.0);
-            data[i-1] = 0.0;
-            for (int j = 1; j < (i-1); j++)
-                data[i-j-1] = div*(j*data[i-j-2]+(i-j)*data[i-j-1]);
+            ddata[0] = -data[0];
+            for (int i = 1; i < PmeOrder; i++)
+                ddata[i] = data[i-1]-data[i];
+            double div = 1.0/(PmeOrder-1);
+            data[PmeOrder-1] = 0.0;
+            for (int i = 1; i < (PmeOrder-1); i++)
+                data[PmeOrder-i-1] = div*(i*data[PmeOrder-i-2]+(PmeOrder-i)*data[PmeOrder-i-1]);
            data[0] = div*data[0];
-        }
-
-        // Differentiate.
-
-        ddata[0] = -data[0];
-        for (int i = 1; i < PmeOrder; i++)
-            ddata[i] = data[i-1]-data[i];
-        double div = 1.0/(PmeOrder-1);
-        data[PmeOrder-1] = 0.0;
-        for (int i = 1; i < (PmeOrder-1); i++)
-            data[PmeOrder-i-1] = div*(i*data[PmeOrder-i-2]+(PmeOrder-i)*data[PmeOrder-i-1]);
-        data[0] = div*data[0];
-        for (int i = 0; i < maxSize; i++)
-            bsplines_data[i] = 0.0;
-        for (int i = 1; i <= PmeOrder; i++)
-            bsplines_data[i] = data[i-1];
-
-        // Evaluate the actual bspline moduli for X/Y/Z.
-
-        for(int dim = 0; dim < 3; dim++) {
-            int ndata = (dim == 0 ? gridSizeX : dim == 1 ? gridSizeY : gridSizeZ);
-            vector<double> moduli(ndata);
-            for (int i = 0; i < ndata; i++) {
-                double sc = 0.0;
-                double ss = 0.0;
-                for (int j = 0; j < ndata; j++) {
-                    double arg = (2.0*M_PI*i*j)/ndata;
-                    sc += bsplines_data[j]*cos(arg);
-                    ss += bsplines_data[j]*sin(arg);
+            for (int i = 0; i < maxSize; i++)
+                bsplines_data[i] = 0.0;
+            for (int i = 1; i <= PmeOrder; i++)
+                bsplines_data[i] = data[i-1];
+
+            // Evaluate the actual bspline moduli for X/Y/Z.
+
+            for(int dim = 0; dim < 3; dim++) {
+                int ndata = (dim == 0 ? gridSizeX : dim == 1 ? gridSizeY : gridSizeZ);
+                vector<double> moduli(ndata);
+                for (int i = 0; i < ndata; i++) {
+                    double sc = 0.0;
+                    double ss = 0.0;
+                    for (int j = 0; j < ndata; j++) {
+                        double arg = (2.0*M_PI*i*j)/ndata;
+                        sc += bsplines_data[j]*cos(arg);
+                        ss += bsplines_data[j]*sin(arg);
+                    }
+                    moduli[i] = sc*sc+ss*ss;
                }
-                moduli[i] = sc*sc+ss*ss;
-            }
-            for (int i = 0; i < ndata; i++)
-                if (moduli[i] < 1.0e-7)
-                    moduli[i] = (moduli[i-1]+moduli[i+1])*0.5;
-            if (cu.getUseDoublePrecision()) {
-                if (dim == 0)
-                    pmeBsplineModuliX->upload(moduli);
-                else if (dim == 1)
-                    pmeBsplineModuliY->upload(moduli);
-                else
-                    pmeBsplineModuliZ->upload(moduli);
-            }
-            else {
-                vector<float> modulif(ndata);
                for (int i = 0; i < ndata; i++)
-                    modulif[i] = (float) moduli[i];
-                if (dim == 0)
-                    pmeBsplineModuliX->upload(modulif);
-                else if (dim == 1)
-                    pmeBsplineModuliY->upload(modulif);
-                else
-                    pmeBsplineModuliZ->upload(modulif);
+                    if (moduli[i] < 1.0e-7)
+                        moduli[i] = (moduli[i-1]+moduli[i+1])*0.5;
+                if (cu.getUseDoublePrecision()) {
+                    if (dim == 0)
+                        pmeBsplineModuliX->upload(moduli);
+                    else if (dim == 1)
+                        pmeBsplineModuliY->upload(moduli);
+                    else
+                        pmeBsplineModuliZ->upload(moduli);
+                }
+                else {
+                    vector<float> modulif(ndata);
+                    for (int i = 0; i < ndata; i++)
+                        modulif[i] = (float) moduli[i];
+                    if (dim == 0)
+                        pmeBsplineModuliX->upload(modulif);
+                    else if (dim == 1)
+                        pmeBsplineModuliY->upload(modulif);
+                    else
+                        pmeBsplineModuliZ->upload(modulif);
+                }
            }
        }
    }
@@ -1654,13 +1729,14 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
 }

 double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal) {
-    if (cosSinSums != NULL && cu.getContextIndex() == 0 && includeReciprocal) {
+    double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0);
+    if (cosSinSums != NULL && includeReciprocal) {
        void* sumsArgs[] = {&cu.getEnergyBuffer().getDevicePointer(), &cu.getPosq().getDevicePointer(), &cosSinSums->getDevicePointer(), cu.getPeriodicBoxSizePointer()};
        cu.executeKernel(ewaldSumsKernel, sumsArgs, cosSinSums->getSize());
        void* forcesArgs[] = {&cu.getForce().getDevicePointer(), &cu.getPosq().getDevicePointer(), &cosSinSums->getDevicePointer(), cu.getPeriodicBoxSizePointer()};
        cu.executeKernel(ewaldForcesKernel, forcesArgs, cu.getNumAtoms());
    }
-    if (directPmeGrid != NULL && cu.getContextIndex() == 0 && includeReciprocal) {
+    if (directPmeGrid != NULL && includeReciprocal) {
        void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
        cu.executeKernel(pmeGridIndexKernel, gridIndexArgs, cu.getNumAtoms());

@@ -1699,7 +1775,6 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
        cu.executeKernel(pmeInterpolateForceKernel, interpolateArgs, cu.getNumAtoms(), 128);

    }
-    double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0);
    if (dispersionCoefficient != 0.0 && includeDirect) {
        double4 boxSize = cu.getPeriodicBoxSize();
        energy += dispersionCoefficient/(boxSize.x*boxSize.y*boxSize.z);

--- a/platforms/cuda/src/CudaKernels.h
+++ b/platforms/cuda/src/CudaKernels.h
@@ -557,7 +557,7 @@ class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
 public:
    CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcNonbondedForceKernel(name, platform),
            cu(cu), hasInitializedFFT(false), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), directPmeGrid(NULL), reciprocalPmeGrid(NULL),
-            pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL),  pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL) {
+            pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL),  pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL), pmeio(NULL) {
    }
    ~CudaCalcNonbondedForceKernel();
    /**
@@ -596,6 +596,9 @@ private:
        const char* getMaxValue() const {return "make_int2(INT_MAX, INT_MAX)";}
        const char* getSortKey() const {return "value.y";}
    };
+    class PmeIO;
+    class PmePreComputation;
+    class PmePostComputation;
    CudaContext& cu;
    bool hasInitializedFFT;
    CudaArray* sigmaEpsilon;
@@ -609,6 +612,8 @@ private:
    CudaArray* pmeAtomRange;
    CudaArray* pmeAtomGridIndex;
    CudaSort* sort;
+    Kernel cpuPme;
+    PmeIO* pmeio;
    cufftHandle fftForward;
    cufftHandle fftBackward;
    CUfunction ewaldSumsKernel;

--- a/platforms/cuda/src/CudaPlatform.cpp
+++ b/platforms/cuda/src/CudaPlatform.cpp
@@ -147,7 +147,7 @@ void CudaPlatform::contextCreated(ContextImpl& context, const map<string, string
            getPropertyDefaultValue(CudaTempDirectory()) : properties.find(CudaTempDirectory())->second);
    transform(blockingPropValue.begin(), blockingPropValue.end(), blockingPropValue.begin(), ::tolower);
    transform(precisionPropValue.begin(), precisionPropValue.end(), precisionPropValue.begin(), ::tolower);
-    context.setPlatformData(new PlatformData(context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, compilerPropValue, tempPropValue));
+    context.setPlatformData(new PlatformData(&context, context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, compilerPropValue, tempPropValue));
 }

 void CudaPlatform::contextDestroyed(ContextImpl& context) const {
@@ -155,8 +155,8 @@ void CudaPlatform::contextDestroyed(ContextImpl& context) const {
    delete data;
 }

-CudaPlatform::PlatformData::PlatformData(const System& system, const string& deviceIndexProperty, const string& blockingProperty, const string& precisionProperty,
-            const string& compilerProperty, const string& tempProperty) : removeCM(false), stepCount(0), computeForceCount(0), time(0.0)  {
+CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& system, const string& deviceIndexProperty, const string& blockingProperty, const string& precisionProperty,
+            const string& compilerProperty, const string& tempProperty) : context(context), removeCM(false), stepCount(0), computeForceCount(0), time(0.0)  {
    bool blocking = (blockingProperty == "true");
    vector<string> devices;
    size_t searchPos = 0, nextPos;

--- a/platforms/cuda/src/kernels/pme.cu
+++ b/platforms/cuda/src/kernels/pme.cu
@@ -266,8 +266,18 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __
            }
        }
        real q = pos.w*EPSILON_FACTOR;
-        forceBuffers[atom] +=  static_cast<unsigned long long>((long long) (-q*force.x*GRID_SIZE_X*invPeriodicBoxSize.x*0x100000000));
-        forceBuffers[atom+PADDED_NUM_ATOMS] +=  static_cast<unsigned long long>((long long) (-q*force.y*GRID_SIZE_Y*invPeriodicBoxSize.y*0x100000000));
-        forceBuffers[atom+2*PADDED_NUM_ATOMS] +=  static_cast<unsigned long long>((long long) (-q*force.z*GRID_SIZE_Z*invPeriodicBoxSize.z*0x100000000));
+        forceBuffers[atom] += static_cast<unsigned long long>((long long) (-q*force.x*GRID_SIZE_X*invPeriodicBoxSize.x*0x100000000));
+        forceBuffers[atom+PADDED_NUM_ATOMS] += static_cast<unsigned long long>((long long) (-q*force.y*GRID_SIZE_Y*invPeriodicBoxSize.y*0x100000000));
+        forceBuffers[atom+2*PADDED_NUM_ATOMS] += static_cast<unsigned long long>((long long) (-q*force.z*GRID_SIZE_Z*invPeriodicBoxSize.z*0x100000000));
    }
 }
+
+extern "C" __global__ // kernel: adds the x/y/z of each entry in forces[] into the 64-bit integer force buffers
+void addForces(const real4* __restrict__ forces, unsigned long long* __restrict__ forceBuffers) {
+    for (int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < NUM_ATOMS; atom += blockDim.x*gridDim.x) { // grid-stride loop: each thread handles atoms spaced by the total number of launched threads
+        real4 f = forces[atom]; // only f.x, f.y and f.z are read below; f.w is ignored
+        forceBuffers[atom] += static_cast<unsigned long long>((long long) (f.x*0x100000000)); // scale by 2^32, truncate to a signed 64-bit integer, then add through the unsigned buffer
+        forceBuffers[atom+PADDED_NUM_ATOMS] += static_cast<unsigned long long>((long long) (f.y*0x100000000)); // y components are stored PADDED_NUM_ATOMS entries after the x components
+        forceBuffers[atom+2*PADDED_NUM_ATOMS] += static_cast<unsigned long long>((long long) (f.z*0x100000000)); // z components are stored 2*PADDED_NUM_ATOMS entries after the x components
+    }
+}
\ No newline at end of file
--- a/platforms/cuda/tests/TestCudaRandom.cpp
+++ b/platforms/cuda/tests/TestCudaRandom.cpp
@@ -54,7 +54,7 @@ void testGaussian() {
    System system;
    for (int i = 0; i < numAtoms; i++)
        system.addParticle(1.0);
-    CudaPlatform::PlatformData platformData(system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"),
+    CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"),
            platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()));
    CudaContext& context = *platformData.contexts[0];
    context.initialize();

--- a/platforms/cuda/tests/TestCudaSort.cpp
+++ b/platforms/cuda/tests/TestCudaSort.cpp
@@ -64,7 +64,7 @@ void verifySorting(vector<float> array) {

    System system;
    system.addParticle(0.0);
-    CudaPlatform::PlatformData platformData(system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"),
+    CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"),
            platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()));
    CudaContext& context = *platformData.contexts[0];
    context.initialize();

--- a/plugins/cpupme/CMakeLists.txt
+++ b/plugins/cpupme/CMakeLists.txt
+#---------------------------------------------------
+# OpenMM CPU PME Plugin
+#
+# Creates plugin library, base name=OpenMMPME.
+# Default libraries are shared & optimized.
+#
+# Windows:
+#   OpenMMPME[_d].dll
+#   OpenMMPME[_d].lib
+# Unix:
+#   libOpenMMPME[_d].so
+#----------------------------------------------------
+
+IF (APPLE)
+    SET (CMAKE_OSX_DEPLOYMENT_TARGET "10.6") # oldest OS X release the plugin is built to run on
+ENDIF (APPLE)
+
+# The source is organized into subdirectories, but we handle them all from
+# this CMakeLists file rather than letting CMake visit them as SUBDIRS.
+SET(OPENMM_SOURCE_SUBDIRS .)
+
+
+# Name of the plugin library.  The shared target starts out with the same name;
+# Unix debug builds append "_d" to it further down.
+
+SET(OPENMMPME_LIBRARY_NAME OpenMMPME)
+
+SET(SHARED_TARGET ${OPENMMPME_LIBRARY_NAME})
+
+
+# Ensure that debug libraries have "_d" appended to their names.
+# CMake gets this right on Windows automatically with this definition.
+IF (${CMAKE_GENERATOR} MATCHES "Visual Studio")
+    SET(CMAKE_DEBUG_POSTFIX "_d" CACHE INTERNAL "" FORCE)
+ENDIF (${CMAKE_GENERATOR} MATCHES "Visual Studio")
+
+# But on Unix or Cygwin we have to add the suffix manually
+IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
+    SET(SHARED_TARGET ${SHARED_TARGET}_d)
+    SET(STATIC_TARGET ${STATIC_TARGET}_d) # NOTE(review): STATIC_TARGET is not set in this file (it may be inherited from a parent scope) and no static target is built below - confirm this line is needed
+ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
+
+
+# These are all the places to search for header files which are
+# to be part of the API.
+SET(API_INCLUDE_DIRS) # start empty
+FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS}) # OPENMM_SOURCE_SUBDIRS is just "." in this file, so this loop runs once
+    # append
+    SET(API_INCLUDE_DIRS ${API_INCLUDE_DIRS}
+                         ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include
+                         ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include/internal)
+ENDFOREACH(subdir)
+
+# Find the include files.
+SET(API_INCLUDE_FILES)
+FOREACH(dir ${API_INCLUDE_DIRS})
+    FILE(GLOB fullpaths ${dir}/*.h)	# returns full pathnames
+    SET(API_INCLUDE_FILES ${API_INCLUDE_FILES} ${fullpaths})
+ENDFOREACH(dir)
+
+# collect up source files
+SET(SOURCE_FILES) # empty
+SET(SOURCE_INCLUDE_FILES)
+
+FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
+    FILE(GLOB_RECURSE src_files  ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.c) # recursive: also picks up sources in directories nested under src/
+    FILE(GLOB incl_files ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.h) # not recursive: only headers directly inside src/
+    SET(SOURCE_FILES         ${SOURCE_FILES}         ${src_files})   #append
+    SET(SOURCE_INCLUDE_FILES ${SOURCE_INCLUDE_FILES} ${incl_files})
+    INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include) # BEFORE places this directory ahead of include paths that were already set
+ENDFOREACH(subdir)
+
+INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/src) # private headers in src/ are searched first
+
+
+
+# Include FFTW related files.
+INCLUDE(FindFFTW) # the FindFFTW module added in this commit sets FFTW_INCLUDES, FFTW_LIBRARY (fftw3f) and FFTW_THREADS_LIBRARY (fftw3f_threads)
+INCLUDE_DIRECTORIES(${FFTW_INCLUDES})
+
+# Build the plugin library.
+ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_INCLUDE_FILES})
+
+IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug) # Unix debug builds link against the "_d" variant of the main OpenMM library
+    SET(MAIN_OPENMM_LIB ${OPENMM_LIBRARY_NAME}_d)
+ELSE (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
+    SET(MAIN_OPENMM_LIB ${OPENMM_LIBRARY_NAME})
+ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
+TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${MAIN_OPENMM_LIB} ${PTHREADS_LIB} ${FFTW_LIBRARY} ${FFTW_THREADS_LIBRARY}) # NOTE(review): OPENMM_LIBRARY_NAME and PTHREADS_LIB are not set in this file (presumably inherited), and FFTW_FOUND / FFTW_THREADS_FOUND are not checked here - confirm the parent script guards this directory
+SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "-DOPENMM_PME_BUILDING_SHARED_LIBRARY") # under MSVC this define makes windowsExportPme.h expand OPENMM_EXPORT_PME to __declspec(dllexport)
+
+INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET}) # both the library and the runtime (DLL) part go to lib/plugins
--- a/plugins/cpupme/include/internal/windowsExportPme.h
+++ b/plugins/cpupme/include/internal/windowsExportPme.h
+#ifndef OPENMM_WINDOWSEXPORTPME_H_
+#define OPENMM_WINDOWSEXPORTPME_H_
+
+/*
+ * Shared libraries are messy in Visual Studio. We have to distinguish three
+ * cases:
+ *   (1) this header is being used to build the OpenMM shared library
+ *       (dllexport)
+ *   (2) this header is being used by a *client* of the OpenMM shared
+ *       library (dllimport)
+ *   (3) we are building the OpenMM static library, or the client is
+ *       being compiled with the expectation of linking with the
+ *       OpenMM static library (nothing special needed)
+ * In the CMake script for building this library, we define one of the symbols
+ *     OPENMM_PME_BUILDING_{SHARED|STATIC}_LIBRARY
+ * Client code normally has no special symbol defined, in which case we'll
+ * assume it wants to use the shared library. However, if the client defines
+ * the symbol OPENMM_USE_STATIC_LIBRARIES we'll suppress the dllimport so
+ * that the client code can be linked with static libraries. Note that
+ * the client symbol is not library dependent, while the library symbols
+ * affect only the OpenMM library, meaning that other libraries can
+ * be clients of this one. However, we are assuming all-static or all-shared.
+ */
+
+#ifdef _MSC_VER
+    // We don't want to hear about how sprintf is "unsafe".
+    #pragma warning(disable:4996)
+    // Keep MS VC++ quiet about lack of dll export of private members.
+    #pragma warning(disable:4251)
+    // Choose the decoration for the plugin's public symbols.  Static linking is
+    // recognised through the plugin-specific symbols and also through the
+    // library-independent client symbol OPENMM_USE_STATIC_LIBRARIES, which is
+    // the one this header's documentation tells clients to define.
+    #if defined(OPENMM_PME_BUILDING_SHARED_LIBRARY)
+        #define OPENMM_EXPORT_PME __declspec(dllexport)
+    #elif defined(OPENMM_PME_BUILDING_STATIC_LIBRARY) || defined(OPENMM_PME_USE_STATIC_LIBRARIES) || defined(OPENMM_USE_STATIC_LIBRARIES)
+        #define OPENMM_EXPORT_PME
+    #else
+        #define OPENMM_EXPORT_PME __declspec(dllimport)   // i.e., a client of a shared library
+    #endif
+#else
+    #define OPENMM_EXPORT_PME // Linux, Mac
+#endif
+
+#endif // OPENMM_WINDOWSEXPORTPME_H_
--- a/plugins/cpupme/src/CpuPmeKernelFactory.cpp
+++ b/plugins/cpupme/src/CpuPmeKernelFactory.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "CpuPmeKernelFactory.h"
+#include "CpuPmeKernels.h"
+#include "internal/windowsExportPme.h"
+#include "openmm/internal/ContextImpl.h"
+#include "openmm/OpenMMException.h"
+
+using namespace OpenMM;
+
+/**
+ * Plugin entry point for registering platforms.  This plugin defines no
+ * platform of its own, so the body is empty.  It is marked OPENMM_EXPORT_PME
+ * so that the symbol is exported from the DLL when building with MSVC; the
+ * macro expands to nothing on other compilers.
+ */
+extern "C" OPENMM_EXPORT_PME void registerPlatforms() {
+}
+
+/**
+ * Plugin entry point for registering kernel factories.  If the processor is
+ * supported, a single CpuPmeKernelFactory is created and registered for
+ * CalcPmeReciprocalForceKernel::Name() with every Platform that currently
+ * exists, so all platforms share the same factory object.  Nothing is
+ * registered on unsupported processors.  Exported with OPENMM_EXPORT_PME for
+ * the same reason as registerPlatforms().
+ */
+extern "C" OPENMM_EXPORT_PME void registerKernelFactories() {
+    if (CpuCalcPmeReciprocalForceKernel::isProcessorSupported()) {
+        CpuPmeKernelFactory* factory = new CpuPmeKernelFactory();
+        for (int i = 0; i < Platform::getNumPlatforms(); i++)
+            Platform::getPlatform(i).registerKernelFactory(CalcPmeReciprocalForceKernel::Name(), factory);
+    }
+}
+
+extern "C" OPENMM_EXPORT_PME void registerCpuPmeKernelFactories() { // exported alias with a plugin-specific name
+    registerKernelFactories(); // simply forwards; NOTE(review): presumably meant for code that links this library directly instead of loading it as a plugin - confirm
+}
+
+KernelImpl* CpuPmeKernelFactory::createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const { // creates the kernel called `name`; the `context` argument is not used
+    if (name == CalcPmeReciprocalForceKernel::Name()) // the only kernel name this factory handles
+        return new CpuCalcPmeReciprocalForceKernel(name, platform); // allocated with new and returned as a raw pointer
+    throw OpenMMException((std::string("Tried to create kernel with illegal kernel name '")+name+"'").c_str()); // every other name is rejected with an OpenMMException
+}
--- a/plugins/cpupme/src/CpuPmeKernelFactory.h
+++ b/plugins/cpupme/src/CpuPmeKernelFactory.h
+#ifndef OPENMM_CPUPMEKERNELFACTORY_H_
+#define OPENMM_CPUPMEKERNELFACTORY_H_
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "openmm/KernelFactory.h"
+
+namespace OpenMM {
+
+/**
+ * This KernelFactory creates kernels for the CPU implementation of PME.
+ *
+ * The only kernel it knows how to create is the one named by
+ * CalcPmeReciprocalForceKernel::Name(), for which createKernelImpl() returns a
+ * new CpuCalcPmeReciprocalForceKernel.  Any other name makes createKernelImpl()
+ * throw an OpenMMException.
+ */
+
+class CpuPmeKernelFactory : public KernelFactory {
+public:
+    KernelImpl* createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const;
+};
+
+} // namespace OpenMM
+
+#endif /*OPENMM_CPUPMEKERNELFACTORY_H_*/
--- a/plugins/cpupme/src/CpuPmeKernels.cpp
+++ b/plugins/cpupme/src/CpuPmeKernels.cpp
--- a/plugins/cpupme/src/CpuPmeKernels.h
+++ b/plugins/cpupme/src/CpuPmeKernels.h
+#ifndef OPENMM_CPU_PME_KERNELS_H_
+#define OPENMM_CPU_PME_KERNELS_H_
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "internal/windowsExportPme.h"
+#include "openmm/kernels.h"
+#include "openmm/Vec3.h"
+#include <fftw3.h>
+#include <pthread.h>
+#include <vector>
+
+namespace OpenMM {
+
+/**
+ * CPU implementation of CalcPmeReciprocalForceKernel.
+ *
+ * The state declared here consists of single precision FFTW grids and plans,
+ * pthread mutex/condition variables and thread handles, and a record of the
+ * calculation currently in progress.  Only the declarations are in this
+ * header; the member functions are defined elsewhere (presumably in
+ * CpuPmeKernels.cpp).
+ */
+
+class OPENMM_EXPORT_PME CpuCalcPmeReciprocalForceKernel : public CalcPmeReciprocalForceKernel {
+public:
+    class ThreadData; // forward declaration only; not defined in this header
+    CpuCalcPmeReciprocalForceKernel(std::string name, const Platform& platform) : CalcPmeReciprocalForceKernel(name, platform),
+            hasCreatedPlan(false), isDeleted(false), realGrid(NULL), complexGrid(NULL) { // only these four members are initialized here; isFinished, waitCount, io, posq and the FFT plans are left unset by the constructor
+    }
+    void initialize(int gridx, int gridy, int gridz, int numParticles, double alpha); // parameter names mirror the data members of the same names below
+    ~CpuCalcPmeReciprocalForceKernel();
+    void beginComputation(IO& io, Vec3 periodicBoxSize, bool includeEnergy); // NOTE(review): presumably starts a calculation that finishComputation() later completes - confirm in the implementation
+    double finishComputation(IO& io); // NOTE(review): presumably returns the energy of the calculation started by beginComputation() - confirm
+    void runThread(int index); // NOTE(review): presumably the body executed by the thread with this index - confirm
+    static bool isProcessorSupported(); // checked by registerKernelFactories() before the factory is registered
+private:
+    void threadWait();
+    void advanceThreads();
+    static bool hasInitializedThreads; // static: shared by all instances of this class
+    static int numThreads; // static: shared by all instances of this class
+    int gridx, gridy, gridz, numParticles;
+    double alpha;
+    bool hasCreatedPlan, isFinished, isDeleted; // hasCreatedPlan and isDeleted start out false; isFinished is not set by the constructor
+    std::vector<float> force;
+    std::vector<float> bsplineModuli[3]; // three arrays - presumably one per grid dimension
+    float* realGrid; // NULL until allocated (set to NULL by the constructor)
+    fftwf_complex* complexGrid; // NULL until allocated (set to NULL by the constructor)
+    fftwf_plan forwardFFT, backwardFFT; // single precision (fftwf_) plan handles
+    int waitCount;
+    pthread_cond_t startCondition, endCondition;
+    pthread_cond_t mainThreadStartCondition, mainThreadEndCondition;
+    pthread_mutex_t lock;
+    pthread_t mainThread;
+    std::vector<pthread_t> thread;
+    std::vector<ThreadData*> threadData; // raw pointers; ownership is not visible from this header
+    // The following variables are used to store information about the calculation currently being performed.
+    IO* io;
+    float energy;
+    float* posq;
+    Vec3 periodicBoxSize;
+    bool includeEnergy;
+};
+
+} // namespace OpenMM
+
+#endif /*OPENMM_CPU_PME_KERNELS_H_*/