Moved CPU based PME to a separate plugin

5e0b0e3a · peastman · 932a61b6 · 5e0b0e3a · 5e0b0e3a · 5e0b0e3a
Commit 5e0b0e3a authored Jun 24, 2013 by peastman
14 changed files
--- a/olla/include/openmm/kernels.h
+++ b/olla/include/openmm/kernels.h
@@ -1152,6 +1152,70 @@ public:
    virtual void execute(ContextImpl& context) = 0;
 };
+/**
+ * This kernel performs the reciprocal space calculation for PME.  In most cases, this
+ * calculation is done directly by CalcNonbondedForceKernel so this kernel is unneeded.
+ * In some cases it may want to outsource the work to a different kernel.  In particular,
+ * GPU based platforms sometimes use a CPU based implementation provided by a separate
+ * plugin.
+ */
+class CalcPmeReciprocalForceKernel : public KernelImpl {
+public:
+    class IO;
+    static std::string Name() {
+        return "CalcPmeReciprocalForce";
+    }
+    CalcPmeReciprocalForceKernel(std::string name, const Platform& platform) : KernelImpl(name, platform) {
+    }
+    /**
+     * Initialize the kernel.
+     * 
+     * @param gridx        the x size of the PME grid
+     * @param gridy        the y size of the PME grid
+     * @param gridz        the z size of the PME grid
+     * @param numParticles the number of particles in the system
+     * @param alpha        the Ewald blending parameter
+     */
+    virtual void initialize(int gridx, int gridy, int gridz, int numParticles, double alpha) = 0;
+    /**
+     * Begin computing the force and energy.
+     * 
+     * @param io               an object that coordinates data transfer
+     * @param periodicBoxSize  the size of the periodic box (measured in nm)
+     * @param includeEnergy    true if potential energy should be computed
+     */
+    virtual void beginComputation(IO& io, Vec3 periodicBoxSize, bool includeEnergy) = 0;
+    /**
+     * Finish computing the force and energy.
+     * 
+     * @param io   an object that coordinates data transfer
+     * @return the potential energy due to the PME reciprocal space interactions
+     */
+    virtual double finishComputation(IO& io) = 0;
+};
+/**
+ * Any class that uses CalcPmeReciprocalForceKernel should create an implementation of this
+ * class, then pass it to the kernel to manage communication with it.
+ */
+class CalcPmeReciprocalForceKernel::IO {
+public:
+    /**
+     * Get a pointer to the atom charges and positions.  This array should contain four
+     * elements for each atom: x, y, z, and q in that order.
+     */
+    virtual float* getPosq() = 0;
+    /**
+     * Record the forces calculated by the kernel.
+     * 
+     * @param force    an array containing four elements for each atom.  The first three
+     *                 are the x, y, and z components of the force, while the fourth element
+     *                 should be ignored.
+     */
+    virtual void setForce(float* force) = 0;
+};
 } // namespace OpenMM
 #endif /*OPENMM_KERNELS_H_*/
--- a/platforms/cuda/include/CudaPlatform.h
+++ b/platforms/cuda/include/CudaPlatform.h
@@ -99,11 +99,12 @@ public:
 class OPENMM_EXPORT_CUDA CudaPlatform::PlatformData {
 public:
-    PlatformData(const System& system, const std::string& deviceIndexProperty, const std::string& blockingProperty, const std::string& precisionProperty,
+    PlatformData(ContextImpl* context, const System& system, const std::string& deviceIndexProperty, const std::string& blockingProperty, const std::string& precisionProperty,
            const std::string& compilerProperty, const std::string& tempProperty);
    ~PlatformData();
    void initializeContexts(const System& system);
    void syncContexts();
+    ContextImpl* context;
    std::vector<CudaContext*> contexts;
    std::vector<double> contextEnergy;
    bool removeCM, peerAccessSupported;

--- a/platforms/cuda/sharedTarget/CMakeLists.txt
+++ b/platforms/cuda/sharedTarget/CMakeLists.txt
@@ -2,8 +2,7 @@
 # Include CUDA related files.
 #
 INCLUDE(FindCUDA)
-INCLUDE(FindFFTW)
+INCLUDE_DIRECTORIES(${CUDA_TOOLKIT_INCLUDE})
-INCLUDE_DIRECTORIES(${CUDA_TOOLKIT_INCLUDE} ${FFTW_INCLUDES})
 FILE(GLOB CUDA_KERNELS ${CUDA_SOURCE_DIR}/kernels/*.cu)
 ADD_CUSTOM_COMMAND(OUTPUT ${CUDA_KERNELS_CPP} ${CUDA_KERNELS_H}
@@ -19,7 +18,7 @@ IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
 ELSE (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
    SET(MAIN_OPENMM_LIB ${OPENMM_LIBRARY_NAME})
 ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
-TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${MAIN_OPENMM_LIB} ${CUDA_CUDA_LIBRARY} ${CUDA_cufft_LIBRARY} ${PTHREADS_LIB} ${FFTW_LIBRARY} ${FFTW_THREADS_LIBRARY})
+TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${MAIN_OPENMM_LIB} ${CUDA_CUDA_LIBRARY} ${CUDA_cufft_LIBRARY} ${PTHREADS_LIB})
 SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "-DOPENMM_CUDA_BUILDING_SHARED_LIBRARY")
 INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET})
--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -1334,7 +1334,7 @@ private:
    const NonbondedForce& force;
 };
-class CudaCalcNonbondedForceKernel::PmeIO : public CpuPme::IO {
+class CudaCalcNonbondedForceKernel::PmeIO : public CalcPmeReciprocalForceKernel::IO {
 public:
    PmeIO(CudaContext& cu, CUfunction addForcesKernel) : cu(cu), addForcesKernel(addForcesKernel), forceTemp(NULL) {
        int elementSize = (cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4));        
@@ -1363,28 +1363,28 @@ private:
 class CudaCalcNonbondedForceKernel::PmePreComputation : public CudaContext::ForcePreComputation {
 public:
-    PmePreComputation(CudaContext& cu, CpuPme& pme, CpuPme::IO& io) : cu(cu), pme(pme), io(io) {
+    PmePreComputation(CudaContext& cu, Kernel& pme, CalcPmeReciprocalForceKernel::IO& io) : cu(cu), pme(pme), io(io) {
    }
    void computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
        Vec3 boxSize(cu.getPeriodicBoxSize().x, cu.getPeriodicBoxSize().y, cu.getPeriodicBoxSize().z);
-        pme.beginComputation(io, boxSize, includeEnergy);
+        pme.getAs<CalcPmeReciprocalForceKernel>().beginComputation(io, boxSize, includeEnergy);
    }
 private:
    CudaContext& cu;
-    CpuPme& pme;
+    Kernel pme;
-    CpuPme::IO& io;
+    CalcPmeReciprocalForceKernel::IO& io;
 };
 class CudaCalcNonbondedForceKernel::PmePostComputation : public CudaContext::ForcePostComputation {
 public:
-    PmePostComputation(CpuPme& pme, CpuPme::IO& io) : pme(pme), io(io) {
+    PmePostComputation(Kernel& pme, CalcPmeReciprocalForceKernel::IO& io) : pme(pme), io(io) {
    }
    double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
-        return pme.finishComputation(io);
+        return pme.getAs<CalcPmeReciprocalForceKernel>().finishComputation(io);
    }
 private:
-    CpuPme& pme;
+    Kernel pme;
-    CpuPme::IO& io;
+    CalcPmeReciprocalForceKernel::IO& io;
 };
 CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
@@ -1411,8 +1411,6 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
        delete pmeAtomGridIndex;
    if (sort != NULL)
        delete sort;
-    if (cpuPme != NULL)
-        delete cpuPme;
    if (pmeio != NULL)
        delete pmeio;
    if (hasInitializedFFT) {
@@ -1571,15 +1569,21 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
        if (cu.getUseDoublePrecision())
            pmeDefines["USE_DOUBLE_PRECISION"] = "1";
        CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::pme, pmeDefines);
-        bool useCpuPme = false;
+        bool useCpuPme = true;
        if (useCpuPme) {
-            cpuPme = new CpuPme(gridSizeX, gridSizeY, gridSizeZ, numParticles, alpha);
+            try {
-            CUfunction addForcesKernel = cu.getKernel(module, "addForces");
+                cpuPme = getPlatform().createKernel(CalcPmeReciprocalForceKernel::Name(), *cu.getPlatformData().context);
-            pmeio = new PmeIO(cu, addForcesKernel);
+                cpuPme.getAs<CalcPmeReciprocalForceKernel>().initialize(gridSizeX, gridSizeY, gridSizeZ, numParticles, alpha);
-            cu.addPreComputation(new PmePreComputation(cu, *cpuPme, *pmeio));
+                CUfunction addForcesKernel = cu.getKernel(module, "addForces");
-            cu.addPostComputation(new PmePostComputation(*cpuPme, *pmeio));
+                pmeio = new PmeIO(cu, addForcesKernel);
+                cu.addPreComputation(new PmePreComputation(cu, cpuPme, *pmeio));
+                cu.addPostComputation(new PmePostComputation(cpuPme, *pmeio));
+            }
+            catch (OpenMMException& ex) {
+                // The CPU PME plugin isn't available.
+            }
        }
-        else {
+        if (pmeio == NULL) {
            pmeGridIndexKernel = cu.getKernel(module, "findAtomGridIndex");
            pmeSpreadChargeKernel = cu.getKernel(module, "gridSpreadCharge");
            pmeConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");

--- a/platforms/cuda/src/CudaKernels.h
+++ b/platforms/cuda/src/CudaKernels.h
@@ -34,7 +34,6 @@
 #include "CudaSort.h"
 #include "openmm/kernels.h"
 #include "openmm/System.h"
-#include "CpuPme.h"
 #include <cufft.h>
 namespace OpenMM {
@@ -558,7 +557,7 @@ class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
 public:
    CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcNonbondedForceKernel(name, platform),
            cu(cu), hasInitializedFFT(false), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), directPmeGrid(NULL), reciprocalPmeGrid(NULL),
-            pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL),  pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL), cpuPme(NULL), pmeio(NULL) {
+            pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL),  pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL), pmeio(NULL) {
    }
    ~CudaCalcNonbondedForceKernel();
    /**
@@ -613,7 +612,7 @@ private:
    CudaArray* pmeAtomRange;
    CudaArray* pmeAtomGridIndex;
    CudaSort* sort;
-    CpuPme* cpuPme;
+    Kernel cpuPme;
    PmeIO* pmeio;
    cufftHandle fftForward;
    cufftHandle fftBackward;

--- a/platforms/cuda/src/CudaPlatform.cpp
+++ b/platforms/cuda/src/CudaPlatform.cpp
@@ -147,7 +147,7 @@ void CudaPlatform::contextCreated(ContextImpl& context, const map<string, string
            getPropertyDefaultValue(CudaTempDirectory()) : properties.find(CudaTempDirectory())->second);
    transform(blockingPropValue.begin(), blockingPropValue.end(), blockingPropValue.begin(), ::tolower);
    transform(precisionPropValue.begin(), precisionPropValue.end(), precisionPropValue.begin(), ::tolower);
-    context.setPlatformData(new PlatformData(context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, compilerPropValue, tempPropValue));
+    context.setPlatformData(new PlatformData(&context, context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, compilerPropValue, tempPropValue));
 }
 void CudaPlatform::contextDestroyed(ContextImpl& context) const {
@@ -155,8 +155,8 @@ void CudaPlatform::contextDestroyed(ContextImpl& context) const {
    delete data;
 }
-CudaPlatform::PlatformData::PlatformData(const System& system, const string& deviceIndexProperty, const string& blockingProperty, const string& precisionProperty,
+CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& system, const string& deviceIndexProperty, const string& blockingProperty, const string& precisionProperty,
-            const string& compilerProperty, const string& tempProperty) : removeCM(false), stepCount(0), computeForceCount(0), time(0.0)  {
+            const string& compilerProperty, const string& tempProperty) : context(context), removeCM(false), stepCount(0), computeForceCount(0), time(0.0)  {
    bool blocking = (blockingProperty == "true");
    vector<string> devices;
    size_t searchPos = 0, nextPos;

--- a/platforms/cuda/tests/TestCudaRandom.cpp
+++ b/platforms/cuda/tests/TestCudaRandom.cpp
@@ -54,7 +54,7 @@ void testGaussian() {
    System system;
    for (int i = 0; i < numAtoms; i++)
        system.addParticle(1.0);
-    CudaPlatform::PlatformData platformData(system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"),
+    CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"),
            platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()));
    CudaContext& context = *platformData.contexts[0];
    context.initialize();

--- a/platforms/cuda/tests/TestCudaSort.cpp
+++ b/platforms/cuda/tests/TestCudaSort.cpp
@@ -64,7 +64,7 @@ void verifySorting(vector<float> array) {
    System system;
    system.addParticle(0.0);
-    CudaPlatform::PlatformData platformData(system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"),
+    CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"),
            platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()));
    CudaContext& context = *platformData.contexts[0];
    context.initialize();

--- a/plugins/cpupme/CMakeLists.txt
+++ b/plugins/cpupme/CMakeLists.txt
+#---------------------------------------------------
+# OpenMM CPU PME Plugin
+#
+# Creates plugin library, base name=OpenMMPME.
+# Default libraries are shared & optimized.
+#
+# Windows:
+#   OpenMMPME[_d].dll
+#   OpenMMPME[_d].lib
+# Unix:
+#   libOpenMMPME[_d].so
+#----------------------------------------------------
+IF (APPLE)
+    SET (CMAKE_OSX_DEPLOYMENT_TARGET "10.6")
+ENDIF (APPLE)
+# The source is organized into subdirectories, but we handle them all from
+# this CMakeLists file rather than letting CMake visit them as SUBDIRS.
+SET(OPENMM_SOURCE_SUBDIRS .)
+# Collect up information about the version of the OpenMM library we're building
+# and make it available to the code so it can be built into the binaries.
+SET(OPENMMPME_LIBRARY_NAME OpenMMPME)
+SET(SHARED_TARGET ${OPENMMPME_LIBRARY_NAME})
+# Ensure that debug libraries have "_d" appended to their names.
+# CMake gets this right on Windows automatically with this definition.
+IF (${CMAKE_GENERATOR} MATCHES "Visual Studio")
+    SET(CMAKE_DEBUG_POSTFIX "_d" CACHE INTERNAL "" FORCE)
+ENDIF (${CMAKE_GENERATOR} MATCHES "Visual Studio")
+# But on Unix or Cygwin we have to add the suffix manually
+IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
+    SET(SHARED_TARGET ${SHARED_TARGET}_d)
+    SET(STATIC_TARGET ${STATIC_TARGET}_d)
+ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
+# These are all the places to search for header files which are
+# to be part of the API.
+SET(API_INCLUDE_DIRS) # start empty
+FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
+    # append
+    SET(API_INCLUDE_DIRS ${API_INCLUDE_DIRS}
+                         ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include
+                         ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include/internal)
+ENDFOREACH(subdir)
+# Find the include files.
+SET(API_INCLUDE_FILES)
+FOREACH(dir ${API_INCLUDE_DIRS})
+    FILE(GLOB fullpaths ${dir}/*.h)	# returns full pathnames
+    SET(API_INCLUDE_FILES ${API_INCLUDE_FILES} ${fullpaths})
+ENDFOREACH(dir)
+# collect up source files
+SET(SOURCE_FILES) # empty
+SET(SOURCE_INCLUDE_FILES)
+FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
+    FILE(GLOB_RECURSE src_files  ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.c)
+    FILE(GLOB incl_files ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.h)
+    SET(SOURCE_FILES         ${SOURCE_FILES}         ${src_files})   #append
+    SET(SOURCE_INCLUDE_FILES ${SOURCE_INCLUDE_FILES} ${incl_files})
+    INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include)
+ENDFOREACH(subdir)
+INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/src)
+# Include FFTW related files.
+INCLUDE(FindFFTW)
+INCLUDE_DIRECTORIES(${FFTW_INCLUDES})
+# Build the plugin library.
+ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_INCLUDE_FILES})
+IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
+    SET(MAIN_OPENMM_LIB ${OPENMM_LIBRARY_NAME}_d)
+ELSE (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
+    SET(MAIN_OPENMM_LIB ${OPENMM_LIBRARY_NAME})
+ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
+TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${MAIN_OPENMM_LIB} ${PTHREADS_LIB} ${FFTW_LIBRARY} ${FFTW_THREADS_LIBRARY})
+SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "-DOPENMM_PME_BUILDING_SHARED_LIBRARY")
+INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET})
--- a/plugins/cpupme/include/internal/windowsExportPme.h
+++ b/plugins/cpupme/include/internal/windowsExportPme.h
+#ifndef OPENMM_WINDOWSEXPORTPME_H_
+#define OPENMM_WINDOWSEXPORTPME_H_
+/*
+ * Shared libraries are messy in Visual Studio. We have to distinguish three
+ * cases:
+ *   (1) this header is being used to build the OpenMM shared library
+ *       (dllexport)
+ *   (2) this header is being used by a *client* of the OpenMM shared
+ *       library (dllimport)
+ *   (3) we are building the OpenMM static library, or the client is
+ *       being compiled with the expectation of linking with the
+ *       OpenMM static library (nothing special needed)
+ * In the CMake script for building this library, we define one of the symbols
+ *     OPENMM_PME_BUILDING_{SHARED|STATIC}_LIBRARY
+ * Client code normally has no special symbol defined, in which case we'll
+ * assume it wants to use the shared library. However, if the client defines
+ * the symbol OPENMM_USE_STATIC_LIBRARIES we'll suppress the dllimport so
+ * that the client code can be linked with static libraries. Note that
+ * the client symbol is not library dependent, while the library symbols
+ * affect only the OpenMM library, meaning that other libraries can
+ * be clients of this one. However, we are assuming all-static or all-shared.
+ */
+#ifdef _MSC_VER
+    // We don't want to hear about how sprintf is "unsafe".
+    #pragma warning(disable:4996)
+    // Keep MS VC++ quiet about lack of dll export of private members.
+    #pragma warning(disable:4251)
+    #if defined(OPENMM_PME_BUILDING_SHARED_LIBRARY)
+        #define OPENMM_EXPORT_PME __declspec(dllexport)
+    #elif defined(OPENMM_PME_BUILDING_STATIC_LIBRARY) || defined(OPENMM_PME_USE_STATIC_LIBRARIES)
+        #define OPENMM_EXPORT_PME
+    #else
+        #define OPENMM_EXPORT_PME __declspec(dllimport)   // i.e., a client of a shared library
+    #endif
+#else
+    #define OPENMM_EXPORT_PME // Linux, Mac
+#endif
+#endif // OPENMM_WINDOWSEXPORTPME_H_
--- a/plugins/cpupme/src/CpuPmeKernelFactory.cpp
+++ b/plugins/cpupme/src/CpuPmeKernelFactory.cpp
+/* -------------------------------------------------------------------------- *
+ *                              OpenMMAmoeba                                  *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "CpuPmeKernelFactory.h"
+#include "CpuPmeKernels.h"
+#include "internal/windowsExportPme.h"
+#include "openmm/internal/ContextImpl.h"
+#include "openmm/OpenMMException.h"
+using namespace OpenMM;
+extern "C" void registerPlatforms() {
+}
+extern "C" void registerKernelFactories() {
+    if (CpuCalcPmeReciprocalForceKernel::isProcessorSupported()) {
+        CpuPmeKernelFactory* factory = new CpuPmeKernelFactory();
+        for (int i = 0; i < Platform::getNumPlatforms(); i++)
+            Platform::getPlatform(i).registerKernelFactory(CalcPmeReciprocalForceKernel::Name(), factory);
+    }
+}
+extern "C" OPENMM_EXPORT_PME void registerCpuPmeKernelFactories() {
+    registerKernelFactories();
+}
+KernelImpl* CpuPmeKernelFactory::createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const {
+    if (name == CalcPmeReciprocalForceKernel::Name())
+        return new CpuCalcPmeReciprocalForceKernel(name, platform);
+    throw OpenMMException((std::string("Tried to create kernel with illegal kernel name '")+name+"'").c_str());
+}
--- a/plugins/cpupme/src/CpuPmeKernelFactory.h
+++ b/plugins/cpupme/src/CpuPmeKernelFactory.h
+#ifndef OPENMM_CPUPMEKERNELFACTORY_H_
+#define OPENMM_CPUPMEKERNELFACTORY_H_
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+#include "openmm/KernelFactory.h"
+namespace OpenMM {
+/**
+ * This KernelFactory creates kernels for the CPU implementation of PME.
+ */
+class CpuPmeKernelFactory : public KernelFactory {
+public:
+    KernelImpl* createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const;
+};
+} // namespace OpenMM
+#endif /*OPENMM_CPUPMEKERNELFACTORY_H_*/
--- a/platforms/cuda/src/CpuPme.cpp
+++ b/platforms/cuda/src/CpuPme.cpp
@@ -32,7 +32,7 @@
 #ifdef WIN32
  #define _USE_MATH_DEFINES // Needed to get M_PI
 #endif
-#include "CpuPme.h"
+#include "CpuPmeKernels.h"
 #include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
 #include <cmath>
 #include <smmintrin.h>
@@ -42,8 +42,8 @@ using namespace std;
 static const int PME_ORDER = 5;
-bool CpuPme::hasInitializedThreads = false;
+bool CpuCalcPmeReciprocalForceKernel::hasInitializedThreads = false;
-int CpuPme::numThreads = 0;
+int CpuCalcPmeReciprocalForceKernel::numThreads = 0;
 static float extractFloat(__m128 v, unsigned int element) {
    float f[4];
@@ -51,6 +51,8 @@ static float extractFloat(__m128 v, unsigned int element) {
    return f[element];
 }
+// Define function to get the number of processors.
 #ifdef __APPLE__
   #include <sys/sysctl.h>
   #include <dlfcn.h>
@@ -90,6 +92,23 @@ static int getNumProcessors() {
 #endif
 }
+// Define a function to check the CPU's capabilities.
+#ifdef _WIN32
+#define cpuid __cpuid
+#else
+static void cpuid(int cpuInfo[4], int infoType){
+    __asm__ __volatile__ (
+        "cpuid":
+        "=a" (cpuInfo[0]),
+        "=b" (cpuInfo[1]),
+        "=c" (cpuInfo[2]),
+        "=d" (cpuInfo[3]) :
+        "a" (infoType)
+    );
+}
+#endif
 static void spreadCharge(int start, int end, float* posq, float* grid, int gridx, int gridy, int gridz, int numParticles, Vec3 periodicBoxSize) {
    float temp[4];
    __m128 boxSize = _mm_set_ps(0, (float) periodicBoxSize[2], (float) periodicBoxSize[1], (float) periodicBoxSize[0]);
@@ -336,17 +355,17 @@ static void interpolateForces(int start, int end, float* posq, float* force, flo
    }
 }
-class CpuPme::ThreadData {
+class CpuCalcPmeReciprocalForceKernel::ThreadData {
 public:
-    CpuPme& owner;
+    CpuCalcPmeReciprocalForceKernel& owner;
    int index;
    float* tempGrid;
-    ThreadData(CpuPme& owner, int index) : owner(owner), index(index), tempGrid(NULL) {
+    ThreadData(CpuCalcPmeReciprocalForceKernel& owner, int index) : owner(owner), index(index), tempGrid(NULL) {
    }
 };
 static void* threadBody(void* args) {
-    CpuPme::ThreadData& data = *reinterpret_cast<CpuPme::ThreadData*>(args);
+    CpuCalcPmeReciprocalForceKernel::ThreadData& data = *reinterpret_cast<CpuCalcPmeReciprocalForceKernel::ThreadData*>(args);
    data.owner.runThread(data.index);
    if (data.tempGrid != NULL)
        fftwf_free(data.tempGrid);
@@ -354,13 +373,19 @@ static void* threadBody(void* args) {
    return 0;
 }
-CpuPme::CpuPme(int gridx, int gridy, int gridz, int numParticles, double alpha) :
+void CpuCalcPmeReciprocalForceKernel::initialize(int gridx, int gridy, int gridz, int numParticles, double alpha) {
-        gridx(gridx), gridy(gridy), gridz(gridz), numParticles(numParticles), alpha(alpha), hasCreatedPlan(false), isDeleted(false), force(4*numParticles), realGrid(NULL), complexGrid(NULL) {
    if (!hasInitializedThreads) {
        numThreads = getNumProcessors();
        fftwf_init_threads();
        hasInitializedThreads = true;
    }
+    this->gridx = gridx;
+    this->gridy = gridy;
+    this->gridz = gridz;
+    this->numParticles = numParticles;
+    this->alpha = alpha;
+    force.resize(4*numParticles);
    // Initialize threads.
    pthread_cond_init(&startCondition, NULL);
@@ -442,7 +467,7 @@ CpuPme::CpuPme(int gridx, int gridy, int gridz, int numParticles, double alpha)
    }
 }
-CpuPme::~CpuPme() {
+CpuCalcPmeReciprocalForceKernel::~CpuCalcPmeReciprocalForceKernel() {
    isDeleted = true;
    pthread_mutex_lock(&lock);
    pthread_cond_broadcast(&startCondition);
@@ -469,7 +494,7 @@ double diff(struct timeval t1, struct timeval t2) {
    return t2.tv_usec-t1.tv_usec+1e6*(t2.tv_sec-t1.tv_sec);
 }
-void CpuPme::runThread(int index) {
+void CpuCalcPmeReciprocalForceKernel::runThread(int index) {
    if (index == -1) {
        // This is the main thread that coordinates all the other ones.
@@ -541,7 +566,7 @@ void CpuPme::runThread(int index) {
    }
 }
-void CpuPme::threadWait() {
+void CpuCalcPmeReciprocalForceKernel::threadWait() {
    pthread_mutex_lock(&lock);
    waitCount++;
    pthread_cond_signal(&endCondition);
@@ -549,7 +574,7 @@ void CpuPme::threadWait() {
    pthread_mutex_unlock(&lock);
 }
-void CpuPme::advanceThreads() {
+void CpuCalcPmeReciprocalForceKernel::advanceThreads() {
    waitCount = 0;
    pthread_cond_broadcast(&startCondition);
    while (waitCount < numThreads) {
@@ -557,7 +582,7 @@ void CpuPme::advanceThreads() {
    }
 }
-void CpuPme::beginComputation(IO& io, Vec3 periodicBoxSize, bool includeEnergy) {
+void CpuCalcPmeReciprocalForceKernel::beginComputation(IO& io, Vec3 periodicBoxSize, bool includeEnergy) {
    this->io = &io;
    this->periodicBoxSize = periodicBoxSize;
    this->includeEnergy = includeEnergy;
@@ -568,7 +593,7 @@ void CpuPme::beginComputation(IO& io, Vec3 periodicBoxSize, bool includeEnergy)
    pthread_mutex_unlock(&lock);
 }
-double CpuPme::finishComputation(IO& io) {
+double CpuCalcPmeReciprocalForceKernel::finishComputation(IO& io) {
    pthread_mutex_lock(&lock);
    while (!isFinished) {
        pthread_cond_wait(&mainThreadEndCondition, &lock);
@@ -577,3 +602,13 @@ double CpuPme::finishComputation(IO& io) {
    io.setForce(&force[0]);
    return energy;
 }
+bool CpuCalcPmeReciprocalForceKernel::isProcessorSupported() {
+    int cpuInfo[4];
+    cpuid(cpuInfo, 0);
+    if (cpuInfo[0] >= 1) {
+        cpuid(cpuInfo, 1);
+        return ((cpuInfo[2] & ((int) 1 << 19)) != 0); // Require SSE 4.1
+    }
+    return false;
+}
\ No newline at end of file
--- a/platforms/cuda/src/CpuPme.h
+++ b/platforms/cuda/src/CpuPme.h
-#ifndef OPENMM_CPU_PME_H_
+#ifndef OPENMM_CPU_PME_KERNELS_H_
-#define OPENMM_CPU_PME_H_
+#define OPENMM_CPU_PME_KERNELS_H_
 /* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
@@ -32,7 +32,8 @@
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */
-#include "windowsExportCuda.h"
+#include "internal/windowsExportPme.h"
+#include "openmm/kernels.h"
 #include "openmm/Vec3.h"
 #include <fftw3.h>
 #include <pthread.h>
@@ -43,17 +44,18 @@ namespace OpenMM {
 /**
 */
-class OPENMM_EXPORT_CUDA CpuPme {
+class OPENMM_EXPORT_PME CpuCalcPmeReciprocalForceKernel : public CalcPmeReciprocalForceKernel {
 public:
-    class IO;
    class ThreadData;
-    /**
+    CpuCalcPmeReciprocalForceKernel(std::string name, const Platform& platform) : CalcPmeReciprocalForceKernel(name, platform),
-     */
+            hasCreatedPlan(false), isDeleted(false), realGrid(NULL), complexGrid(NULL) {
-    CpuPme(int gridx, int gridy, int gridz, int numParticles, double alpha);
+    }
-    ~CpuPme();
+    void initialize(int gridx, int gridy, int gridz, int numParticles, double alpha);
+    ~CpuCalcPmeReciprocalForceKernel();
    void beginComputation(IO& io, Vec3 periodicBoxSize, bool includeEnergy);
    double finishComputation(IO& io);
    void runThread(int index);
+    static bool isProcessorSupported();
 private:
    void threadWait();
    void advanceThreads();
@@ -82,12 +84,6 @@ private:
    bool includeEnergy;
 };
-class CpuPme::IO {
-public:
-    virtual float* getPosq() = 0;
-    virtual void setForce(float* force) = 0;
-};
 } // namespace OpenMM
-#endif /*OPENMM_CPU_PME_H_*/
+#endif /*OPENMM_CPU_PME_KERNELS_H_*/