Merge pull request #32 from peastman/pme

Created CPU implementation of PME

Merge pull request #32 from peastman/pme
Created CPU implementation of PME
b1d621b3 · Yutong Zhao · cce1ca81 · 794ab9a7 · b1d621b3 · b1d621b3
Commit b1d621b3 authored Jul 01, 2013 by Yutong Zhao
10 changed files
--- a/platforms/opencl/tests/TestOpenCLRandom.cpp
+++ b/platforms/opencl/tests/TestOpenCLRandom.cpp
@@ -54,7 +54,7 @@ void testGaussian() {
    System system;
    for (int i = 0; i < numAtoms; i++)
        system.addParticle(1.0);
-    OpenCLPlatform::PlatformData platformData(system, "", "", platform.getPropertyDefaultValue("OpenCLPrecision"));
+    OpenCLPlatform::PlatformData platformData(system, "", "", platform.getPropertyDefaultValue("OpenCLPrecision"), "false");
    OpenCLContext& context = *platformData.contexts[0];
    context.initialize();
    context.getIntegrationUtilities().initRandomNumberGenerator(0);

--- a/platforms/opencl/tests/TestOpenCLSort.cpp
+++ b/platforms/opencl/tests/TestOpenCLSort.cpp
@@ -64,7 +64,7 @@ void verifySorting(vector<float> array) {

    System system;
    system.addParticle(0.0);
-    OpenCLPlatform::PlatformData platformData(system, "", "", platform.getPropertyDefaultValue("OpenCLPrecision"));
+    OpenCLPlatform::PlatformData platformData(system, "", "", platform.getPropertyDefaultValue("OpenCLPrecision"), "false");
    OpenCLContext& context = *platformData.contexts[0];
    context.initialize();
    OpenCLArray data(context, array.size(), sizeof(float), "sortData");

--- a/plugins/cpupme/CMakeLists.txt
+++ b/plugins/cpupme/CMakeLists.txt
+#---------------------------------------------------
+# OpenMM CPU PME Plugin
+#
+# Creates plugin library, base name=OpenMMPME.
+# Default libraries are shared & optimized.
+#
+# Windows:
+#   OpenMMPME[_d].dll
+#   OpenMMPME[_d].lib
+# Unix:
+#   libOpenMMPME[_d].so
+#----------------------------------------------------
+
+IF (APPLE)
+    SET (CMAKE_OSX_DEPLOYMENT_TARGET "10.6")
+ENDIF (APPLE)
+
+# The source is organized into subdirectories, but we handle them all from
+# this CMakeLists file rather than letting CMake visit them as SUBDIRS.
+SET(OPENMM_SOURCE_SUBDIRS .)
+
+
+# Base name of the plugin library. The shared-library target name below is
+# derived from it.
+
+SET(OPENMMPME_LIBRARY_NAME OpenMMPME)
+
+SET(SHARED_TARGET ${OPENMMPME_LIBRARY_NAME})
+
+
+# Ensure that debug libraries have "_d" appended to their names.
+# CMake gets this right on Windows automatically with this definition.
+IF (${CMAKE_GENERATOR} MATCHES "Visual Studio")
+    SET(CMAKE_DEBUG_POSTFIX "_d" CACHE INTERNAL "" FORCE)
+ENDIF (${CMAKE_GENERATOR} MATCHES "Visual Studio")
+
+# But on Unix or Cygwin we have to add the suffix manually
+# NOTE(review): STATIC_TARGET is not assigned anywhere in this script, so the
+# second SET only yields "_d"; it is presumably left over from the script this
+# one was derived from -- confirm before relying on it.
+IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
+    SET(SHARED_TARGET ${SHARED_TARGET}_d)
+    SET(STATIC_TARGET ${STATIC_TARGET}_d)
+ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
+
+
+# These are all the places to search for header files which are
+# to be part of the API.
+SET(API_INCLUDE_DIRS) # start empty
+FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
+    # append
+    SET(API_INCLUDE_DIRS ${API_INCLUDE_DIRS}
+                         ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include
+                         ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include/internal)
+ENDFOREACH(subdir)
+
+# Find the include files.
+SET(API_INCLUDE_FILES)
+FOREACH(dir ${API_INCLUDE_DIRS})
+    FILE(GLOB fullpaths ${dir}/*.h)	# returns full pathnames
+    SET(API_INCLUDE_FILES ${API_INCLUDE_FILES} ${fullpaths})
+ENDFOREACH(dir)
+
+# collect up source files
+SET(SOURCE_FILES) # empty
+SET(SOURCE_INCLUDE_FILES)
+
+FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
+    FILE(GLOB_RECURSE src_files  ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.c)
+    FILE(GLOB incl_files ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.h)
+    SET(SOURCE_FILES         ${SOURCE_FILES}         ${src_files})   #append
+    SET(SOURCE_INCLUDE_FILES ${SOURCE_INCLUDE_FILES} ${incl_files})
+    INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include)
+ENDFOREACH(subdir)
+
+INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/src)
+# The kernels use SSE4.1 intrinsics (<smmintrin.h>). GCC-style compilers only
+# enable them when -msse4.1 is given; MSVC needs no switch for them and does
+# not understand the GCC-style flag, so it is not passed there.
+IF (NOT MSVC)
+    SET_SOURCE_FILES_PROPERTIES(${SOURCE_FILES} PROPERTIES COMPILE_FLAGS "-msse4.1")
+ENDIF (NOT MSVC)
+
+
+# Include FFTW related files.
+INCLUDE_DIRECTORIES(${FFTW_INCLUDES})
+
+# Build the plugin library.
+ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_INCLUDE_FILES})
+
+IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
+    SET(MAIN_OPENMM_LIB ${OPENMM_LIBRARY_NAME}_d)
+ELSE (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
+    SET(MAIN_OPENMM_LIB ${OPENMM_LIBRARY_NAME})
+ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
+TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${MAIN_OPENMM_LIB} ${PTHREADS_LIB} ${FFTW_LIBRARY} ${FFTW_THREADS_LIBRARY})
+SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "-DOPENMM_PME_BUILDING_SHARED_LIBRARY")
+
+INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET})
+
+SUBDIRS(tests)
--- a/plugins/cpupme/include/internal/windowsExportPme.h
+++ b/plugins/cpupme/include/internal/windowsExportPme.h
+#ifndef OPENMM_WINDOWSEXPORTPME_H_
+#define OPENMM_WINDOWSEXPORTPME_H_
+
+/*
+ * Shared libraries are messy in Visual Studio. We have to distinguish three
+ * cases:
+ *   (1) this header is being used to build the OpenMM shared library
+ *       (dllexport)
+ *   (2) this header is being used by a *client* of the OpenMM shared
+ *       library (dllimport)
+ *   (3) we are building the OpenMM static library, or the client is
+ *       being compiled with the expectation of linking with the
+ *       OpenMM static library (nothing special needed)
+ * In the CMake script for building this library, we define one of the symbols
+ *     OPENMM_PME_BUILDING_{SHARED|STATIC}_LIBRARY
+ * Client code normally has no special symbol defined, in which case we'll
+ * assume it wants to use the shared library. However, if the client defines
+ * the symbol OPENMM_USE_STATIC_LIBRARIES (or the plugin-specific
+ * OPENMM_PME_USE_STATIC_LIBRARIES) we'll suppress the dllimport so
+ * that the client code can be linked with static libraries. Note that
+ * the OPENMM_USE_STATIC_LIBRARIES client symbol is not library dependent,
+ * while the library symbols affect only this library, meaning that other
+ * libraries can be clients of this one. However, we are assuming all-static
+ * or all-shared.
+ */
+
+#ifdef _MSC_VER
+    // We don't want to hear about how sprintf is "unsafe".
+    #pragma warning(disable:4996)
+    // Keep MS VC++ quiet about lack of dll export of private members.
+    #pragma warning(disable:4251)
+    #if defined(OPENMM_PME_BUILDING_SHARED_LIBRARY)
+        #define OPENMM_EXPORT_PME __declspec(dllexport)
+    #elif defined(OPENMM_PME_BUILDING_STATIC_LIBRARY) || defined(OPENMM_PME_USE_STATIC_LIBRARIES) || defined(OPENMM_USE_STATIC_LIBRARIES)
+        // Static build or static client: no import/export decoration needed.
+        #define OPENMM_EXPORT_PME
+    #else
+        #define OPENMM_EXPORT_PME __declspec(dllimport)   // i.e., a client of a shared library
+    #endif
+#else
+    #define OPENMM_EXPORT_PME // Linux, Mac
+#endif
+
+#endif // OPENMM_WINDOWSEXPORTPME_H_
--- a/plugins/cpupme/src/CpuPmeKernelFactory.cpp
+++ b/plugins/cpupme/src/CpuPmeKernelFactory.cpp
+/* -------------------------------------------------------------------------- *
+ *                              OpenMMCpuPme                                  *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "CpuPmeKernelFactory.h"
+#include "CpuPmeKernels.h"
+#include "internal/windowsExportPme.h"
+#include "openmm/internal/ContextImpl.h"
+#include "openmm/OpenMMException.h"
+
+using namespace OpenMM;
+
+/**
+ * Plugin hook for registering platforms. This plugin defines no platform of
+ * its own, so the body is intentionally empty.
+ *
+ * Declared with C linkage so the symbol keeps its plain name, and marked
+ * OPENMM_EXPORT_PME so that it is also exported from a Windows DLL build
+ * (the macro expands to nothing on other compilers).
+ */
+extern "C" OPENMM_EXPORT_PME void registerPlatforms() {
+}
+
+/**
+ * Plugin hook that makes the CPU PME kernel available. If the processor passes
+ * CpuCalcPmeReciprocalForceKernel::isProcessorSupported(), a single
+ * CpuPmeKernelFactory is created and registered with every currently known
+ * Platform under the CalcPmeReciprocalForceKernel name. Otherwise nothing is
+ * registered.
+ *
+ * Declared with C linkage so the symbol keeps its plain name, and marked
+ * OPENMM_EXPORT_PME so that it is also exported from a Windows DLL build
+ * (the macro expands to nothing on other compilers).
+ */
+extern "C" OPENMM_EXPORT_PME void registerKernelFactories() {
+    if (CpuCalcPmeReciprocalForceKernel::isProcessorSupported()) {
+        // One factory instance is shared by all platforms.
+        CpuPmeKernelFactory* factory = new CpuPmeKernelFactory();
+        for (int i = 0; i < Platform::getNumPlatforms(); i++)
+            Platform::getPlatform(i).registerKernelFactory(CalcPmeReciprocalForceKernel::Name(), factory);
+    }
+}
+
+/**
+ * Exported, uniquely named entry point that simply forwards to
+ * registerKernelFactories().
+ */
+extern "C" OPENMM_EXPORT_PME void registerCpuPmeKernelFactories() {
+    registerKernelFactories();
+}
+
+/**
+ * Create the implementation for the requested kernel.
+ *
+ * @param name      the kernel name; only CalcPmeReciprocalForceKernel::Name() is accepted
+ * @param platform  the Platform the kernel is being created for
+ * @param context   the context requesting the kernel (not used here)
+ * @return a newly allocated CpuCalcPmeReciprocalForceKernel
+ * @throws OpenMMException if name is any other kernel name
+ */
+KernelImpl* CpuPmeKernelFactory::createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const {
+    // Reject anything this factory does not know how to build.
+    if (name != CalcPmeReciprocalForceKernel::Name())
+        throw OpenMMException((std::string("Tried to create kernel with illegal kernel name '")+name+"'").c_str());
+    return new CpuCalcPmeReciprocalForceKernel(name, platform);
+}
--- a/plugins/cpupme/src/CpuPmeKernelFactory.h
+++ b/plugins/cpupme/src/CpuPmeKernelFactory.h
+#ifndef OPENMM_CPUPMEKERNELFACTORY_H_
+#define OPENMM_CPUPMEKERNELFACTORY_H_
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "openmm/KernelFactory.h"
+
+namespace OpenMM {
+
+/**
+ * This KernelFactory creates kernels for the CPU implementation of PME.
+ */
+
+class CpuPmeKernelFactory : public KernelFactory {
+public:
+    /**
+     * Create a kernel implementation for the given kernel name.
+     *
+     * @param name      the name of the kernel to create
+     * @param platform  the Platform the kernel is created for
+     * @param context   the context the kernel will belong to
+     * @return a newly created KernelImpl
+     */
+    KernelImpl* createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const;
+};
+
+} // namespace OpenMM
+
+#endif /*OPENMM_CPUPMEKERNELFACTORY_H_*/
--- a/plugins/cpupme/src/CpuPmeKernels.cpp
+++ b/plugins/cpupme/src/CpuPmeKernels.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#ifdef WIN32
+  #define _USE_MATH_DEFINES // Needed to get M_PI
+#endif
+#include "CpuPmeKernels.h"
+#include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
+#include <cmath>
+#include <cstring>
+#include <smmintrin.h>
+
+using namespace OpenMM;
+using namespace std;
+
+static const int PME_ORDER = 5;
+
+bool CpuCalcPmeReciprocalForceKernel::hasInitializedThreads = false;
+int CpuCalcPmeReciprocalForceKernel::numThreads = 0;
+
+// Read lane `element` (0-3, must be a compile-time constant) of the __m128 `v`
+// as a scalar float: the shuffle moves that lane into lane 0, which
+// _mm_cvtss_f32 then returns.
+#define EXTRACT_FLOAT(v, element) _mm_cvtss_f32(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, element)))
+
+// Define function to get the number of processors.
+
+#ifdef __APPLE__
+   #include <sys/sysctl.h>
+   #include <dlfcn.h>
+#else
+   #ifdef WIN32
+      #include <windows.h>
+   #else
+      #include <dlfcn.h>
+      #include <unistd.h>
+   #endif
+#endif
+
+/**
+ * Return the number of logical processors reported by the operating system.
+ *
+ * The result is never less than 1: if the query fails or reports a
+ * non-positive count, 1 is returned, so the value is always usable as a
+ * thread count.
+ */
+static int getNumProcessors() {
+#ifdef __APPLE__
+    int ncpu = 0;
+    size_t len = sizeof(ncpu);  // size of the output buffer, not a magic 4
+    if (sysctlbyname("hw.logicalcpu", &ncpu, &len, NULL, 0) != 0 || ncpu < 1)
+        return 1;
+    return ncpu;
+#else
+#ifdef WIN32
+    SYSTEM_INFO siSysInfo;
+    GetSystemInfo(&siSysInfo);
+    int ncpu = (int) siSysInfo.dwNumberOfProcessors;
+    if (ncpu < 1)
+        ncpu = 1;
+    return ncpu;
+#else
+    // sysconf() returns -1 on error; treat any non-positive answer the same way.
+    long nProcessorsOnline = sysconf(_SC_NPROCESSORS_ONLN);
+    if (nProcessorsOnline < 1)
+        return 1;
+    return (int) nProcessorsOnline;
+#endif
+#endif
+}
+
+// Define a function to check the CPU's capabilities.
+
+#ifdef _WIN32
+// Use the compiler intrinsic, which has the same (int[4], int) signature.
+#define cpuid __cpuid
+#else
+/**
+ * Execute the x86 CPUID instruction.
+ *
+ * @param cpuInfo   receives EAX, EBX, ECX and EDX (in that order)
+ * @param infoType  the leaf to query; loaded into EAX
+ *
+ * ECX is set to 0 before the instruction so that leaves which take a
+ * sub-leaf index in ECX give a well-defined answer instead of depending on
+ * whatever happened to be in the register.
+ */
+static void cpuid(int cpuInfo[4], int infoType){
+    __asm__ __volatile__ (
+        "cpuid":
+        "=a" (cpuInfo[0]),
+        "=b" (cpuInfo[1]),
+        "=c" (cpuInfo[2]),
+        "=d" (cpuInfo[3]) :
+        "a" (infoType),
+        "c" (0)
+    );
+}
+#endif
+
+/**
+ * Spread the charges of particles [start, end) onto a real-space grid using
+ * B-splines of order PME_ORDER.
+ *
+ * The whole grid is zeroed first, so after the call it holds only the
+ * contribution of this particle range.
+ *
+ * @param start            index of the first particle to process
+ * @param end              one past the index of the last particle to process
+ * @param posq             four floats per particle (x, y, z, charge). It is read
+ *                         with _mm_load_ps, so it must be 16-byte aligned.
+ * @param grid             output array of gridx*gridy*gridz floats, indexed as
+ *                         (x*gridy + y)*gridz + z
+ * @param gridx            grid size along x
+ * @param gridy            grid size along y
+ * @param gridz            grid size along z
+ * @param numParticles     not used by this function
+ * @param periodicBoxSize  edge lengths of the periodic box
+ */
+static void spreadCharge(int start, int end, float* posq, float* grid, int gridx, int gridy, int gridz, int numParticles, Vec3 periodicBoxSize) {
+    float temp[4];
+    __m128 boxSize = _mm_set_ps(0, (float) periodicBoxSize[2], (float) periodicBoxSize[1], (float) periodicBoxSize[0]);
+    __m128 invBoxSize = _mm_set_ps(0, (float) (1/periodicBoxSize[2]), (float) (1/periodicBoxSize[1]), (float) (1/periodicBoxSize[0]));
+    __m128 gridSize = _mm_set_ps(0, gridz, gridy, gridx);
+    __m128i gridSizeInt = _mm_set_epi32(0, gridz, gridy, gridx);
+    __m128 one  = _mm_set1_ps(1);
+    __m128 scale = _mm_set1_ps(1.0f/(PME_ORDER-1));
+    const float epsilonFactor = sqrt(ONE_4PI_EPS0);
+    memset(grid, 0, sizeof(float)*gridx*gridy*gridz);
+    for (int i = start; i < end; i++) {
+        // Find the position relative to the nearest grid point.
+        
+        __m128 pos = _mm_load_ps(&posq[4*i]);
+        __m128 posInBox = _mm_sub_ps(pos, _mm_mul_ps(boxSize, _mm_floor_ps(_mm_mul_ps(pos, invBoxSize))));
+        __m128 t = _mm_mul_ps(_mm_mul_ps(posInBox, invBoxSize), gridSize);
+        __m128i ti = _mm_cvttps_epi32(t);
+        __m128 dr = _mm_sub_ps(t, _mm_cvtepi32_ps(ti));
+        // An index equal to the grid size is wrapped back to 0.
+        __m128i gridIndex = _mm_sub_epi32(ti, _mm_and_si128(gridSizeInt, _mm_cmpeq_epi32(ti, gridSizeInt)));
+        
+        // Compute the B-spline coefficients.
+        
+        __m128 data[PME_ORDER];
+        data[PME_ORDER-1] = _mm_setzero_ps();
+        data[1] = dr;
+        data[0] = _mm_sub_ps(one, dr);
+        for (int j = 3; j < PME_ORDER; j++) {
+            __m128 div = _mm_set1_ps(1.0f/(j-1));
+            data[j-1] = _mm_mul_ps(_mm_mul_ps(div, dr), data[j-2]);
+            for (int k = 1; k < j-1; k++)
+                data[j-k-1] = _mm_mul_ps(div, _mm_add_ps(_mm_mul_ps(_mm_add_ps(dr, _mm_set1_ps(k)), data[j-k-2]), _mm_mul_ps(_mm_sub_ps(_mm_set1_ps(j-k), dr), data[j-k-1])));
+            data[0] = _mm_mul_ps(_mm_mul_ps(div, _mm_sub_ps(one, dr)), data[0]);
+        }
+        data[PME_ORDER-1] = _mm_mul_ps(_mm_mul_ps(scale, dr), data[PME_ORDER-2]);
+        for (int j = 1; j < (PME_ORDER-1); j++)
+            data[PME_ORDER-j-1] = _mm_mul_ps(scale, _mm_add_ps(_mm_mul_ps(_mm_add_ps(dr, _mm_set1_ps(j)), data[PME_ORDER-j-2]), _mm_mul_ps(_mm_sub_ps(_mm_set1_ps(PME_ORDER-j), dr), data[PME_ORDER-j-1])));
+        data[0] = _mm_mul_ps(_mm_mul_ps(scale, _mm_sub_ps(one, dr)), data[0]);
+        
+        // Spread the charges.
+        
+        int gridIndexX = _mm_extract_epi32(gridIndex, 0);
+        int gridIndexY = _mm_extract_epi32(gridIndex, 1);
+        int gridIndexZ = _mm_extract_epi32(gridIndex, 2);
+        int zindex[PME_ORDER];
+        for (int j = 0; j < PME_ORDER; j++) {
+            zindex[j] = gridIndexZ+j;
+            zindex[j] -= (zindex[j] >= gridz ? gridz : 0);
+        }
+        float charge = epsilonFactor*posq[4*i+3];
+        // z coefficients 0-3 packed into one vector; the fifth is kept as a scalar.
+        __m128 zdata0to3 = _mm_set_ps(EXTRACT_FLOAT(data[3], 2), EXTRACT_FLOAT(data[2], 2), EXTRACT_FLOAT(data[1], 2), EXTRACT_FLOAT(data[0], 2));
+        float zdata4 = EXTRACT_FLOAT(data[4], 2);
+        if (gridIndexZ+4 < gridz) {
+            // Fast path: the five z points are contiguous, so the first four
+            // can be updated with a single (unaligned) vector add.
+            for (int ix = 0; ix < PME_ORDER; ix++) {
+                int xbase = gridIndexX+ix;
+                xbase -= (xbase >= gridx ? gridx : 0);
+                xbase = xbase*gridy*gridz;
+                float xdata = charge*EXTRACT_FLOAT(data[ix], 0);
+                for (int iy = 0; iy < PME_ORDER; iy++) {
+                    int ybase = gridIndexY+iy;
+                    ybase -= (ybase >= gridy ? gridy : 0);
+                    ybase = xbase + ybase*gridz;
+                    float multiplier = xdata*EXTRACT_FLOAT(data[iy], 1);
+                    __m128 add0to3 = _mm_mul_ps(zdata0to3, _mm_set1_ps(multiplier));
+                    _mm_storeu_ps(&grid[ybase+gridIndexZ], _mm_add_ps(_mm_loadu_ps(&grid[ybase+gridIndexZ]), add0to3));
+                    grid[ybase+zindex[4]] += multiplier*zdata4;
+                }
+            }
+        }
+        else {
+            // Slow path: the z points wrap around the periodic boundary, so
+            // each one is added through its own wrapped index.
+            for (int ix = 0; ix < PME_ORDER; ix++) {
+                int xbase = gridIndexX+ix;
+                xbase -= (xbase >= gridx ? gridx : 0);
+                xbase = xbase*gridy*gridz;
+                float xdata = charge*EXTRACT_FLOAT(data[ix], 0);
+                for (int iy = 0; iy < PME_ORDER; iy++) {
+                    int ybase = gridIndexY+iy;
+                    ybase -= (ybase >= gridy ? gridy : 0);
+                    ybase = xbase + ybase*gridz;
+                    float multiplier = xdata*EXTRACT_FLOAT(data[iy], 1);
+                    __m128 add0to3 = _mm_mul_ps(zdata0to3, _mm_set1_ps(multiplier));
+                    // temp is a plain float array with no guaranteed 16-byte
+                    // alignment, so the unaligned store must be used here.
+                    _mm_storeu_ps(temp, add0to3);
+                    grid[ybase+zindex[0]] += temp[0];
+                    grid[ybase+zindex[1]] += temp[1];
+                    grid[ybase+zindex[2]] += temp[2];
+                    grid[ybase+zindex[3]] += temp[3];
+                    grid[ybase+zindex[4]] += multiplier*zdata4;
+                }
+            }
+        }
+    }
+}
+
+/**
+ * Precompute the reciprocal-space factor exp(-recipExpFactor*m^2)/denominator
+ * for the x-planes [start, end) of the half-complex grid.
+ *
+ * Only kz in [0, gridz/2] is stored; the entry for (kx, ky, kz) lives at
+ * kx*gridy*(gridz/2+1) + ky*(gridz/2+1) + kz in recipEterm. The zero wave
+ * vector (the first entry when start == 0), whose denominator would be zero,
+ * is left untouched.
+ *
+ * @param start            first kx to process
+ * @param end              one past the last kx to process
+ * @param gridx            grid size along x
+ * @param gridy            grid size along y
+ * @param gridz            grid size along z
+ * @param recipEterm       output array, written in place
+ * @param alpha            the Ewald alpha parameter
+ * @param bsplineModuli    three arrays (x, y, z) of B-spline moduli
+ * @param periodicBoxSize  edge lengths of the periodic box
+ */
+static void computeReciprocalEterm(int start, int end, int gridx, int gridy, int gridz, vector<float>& recipEterm, double alpha, vector<float>* bsplineModuli, Vec3 periodicBoxSize) {
+    const unsigned int halfZ = gridz/2+1;
+    const unsigned int planeSize = gridy*halfZ;
+    const float volumeFactor = (float) (M_PI*periodicBoxSize[0]*periodicBoxSize[1]*periodicBoxSize[2]);
+    const float expFactor = (float) (M_PI*M_PI/(alpha*alpha));
+    const float invBoxX = (float) (1.0/periodicBoxSize[0]);
+    const float invBoxY = (float) (1.0/periodicBoxSize[1]);
+    const float invBoxZ = (float) (1.0/periodicBoxSize[2]);
+
+    // Only the very first (kx, ky) row handled by the range that starts at 0
+    // skips kz == 0.
+    bool skipOrigin = (start == 0);
+    for (int kx = start; kx < end; kx++) {
+        const int mx = (kx < (gridx+1)/2) ? kx : kx-gridx;
+        const float mhx = mx*invBoxX;
+        const float bx = volumeFactor*bsplineModuli[0][kx];
+        for (int ky = 0; ky < gridy; ky++) {
+            const int my = (ky < (gridy+1)/2) ? ky : ky-gridy;
+            const float mhy = my*invBoxY;
+            const float mhx2y2 = mhx*mhx + mhy*mhy;
+            const float bxby = bx*bsplineModuli[1][ky];
+            for (int kz = (skipOrigin ? 1 : 0); kz < halfZ; kz++) {
+                const int mz = (kz < (gridz+1)/2) ? kz : kz-gridz;
+                const float mhz = mz*invBoxZ;
+                const float m2 = mhx2y2 + mhz*mhz;
+                const float denom = m2*bxby*bsplineModuli[2][kz];
+                const int index = kx*planeSize + ky*halfZ + kz;
+                recipEterm[index] = exp(-expFactor*m2)/denom;
+            }
+            skipOrigin = false;
+        }
+    }
+}
+
+/**
+ * Compute the reciprocal-space energy contributed by the x-planes
+ * [start, end) of the transformed grid.
+ *
+ * The loop runs over the full kz range [0, gridz). Because only the
+ * half-complex grid (kz <= gridz/2) is stored, entries with kz > gridz/2 are
+ * read from the mirrored index (gridx-kx, gridy-ky, gridz-kz). The zero wave
+ * vector (first entry when start == 0) is skipped.
+ *
+ * @param start            first kx to process
+ * @param end              one past the last kx to process
+ * @param grid             half-complex grid of gridx*gridy*(gridz/2+1) values
+ * @param gridx            grid size along x
+ * @param gridy            grid size along y
+ * @param gridz            grid size along z
+ * @param alpha            the Ewald alpha parameter
+ * @param bsplineModuli    three arrays (x, y, z) of B-spline moduli
+ * @param periodicBoxSize  edge lengths of the periodic box
+ * @return half the accumulated sum of eterm*|grid value|^2
+ */
+static float reciprocalEnergy(int start, int end, fftwf_complex* grid, int gridx, int gridy, int gridz, double alpha, vector<float>* bsplineModuli, Vec3 periodicBoxSize) {
+    const unsigned int halfZ = gridz/2+1;
+    const unsigned int planeSize = gridy*halfZ;
+    const float volumeFactor = (float) (M_PI*periodicBoxSize[0]*periodicBoxSize[1]*periodicBoxSize[2]);
+    const float expFactor = (float) (M_PI*M_PI/(alpha*alpha));
+    const float invBoxX = (float) (1.0/periodicBoxSize[0]);
+    const float invBoxY = (float) (1.0/periodicBoxSize[1]);
+    const float invBoxZ = (float) (1.0/periodicBoxSize[2]);
+    float total = 0.0f;
+
+    // Only the very first (kx, ky) row handled by the range that starts at 0
+    // skips kz == 0.
+    bool skipOrigin = (start == 0);
+    for (int kx = start; kx < end; kx++) {
+        const int mx = (kx < (gridx+1)/2) ? kx : kx-gridx;
+        const float mhx = mx*invBoxX;
+        const float bx = volumeFactor*bsplineModuli[0][kx];
+        for (int ky = 0; ky < gridy; ky++) {
+            const int my = (ky < (gridy+1)/2) ? ky : ky-gridy;
+            const float mhy = my*invBoxY;
+            const float mhx2y2 = mhx*mhx + mhy*mhy;
+            const float bxby = bx*bsplineModuli[1][ky];
+            for (int kz = (skipOrigin ? 1 : 0); kz < gridz; kz++) {
+                const int mz = (kz < (gridz+1)/2) ? kz : kz-gridz;
+                const float mhz = mz*invBoxZ;
+                const float m2 = mhx2y2 + mhz*mhz;
+                const float denom = m2*bxby*bsplineModuli[2][kz];
+                const float eterm = exp(-expFactor*m2)/denom;
+
+                // Map onto the stored half of the grid.
+                const bool mirrored = (kz >= gridz/2+1);
+                const int kx1 = mirrored ? (kx == 0 ? kx : gridx-kx) : kx;
+                const int ky1 = mirrored ? (ky == 0 ? ky : gridy-ky) : ky;
+                const int kz1 = mirrored ? gridz-kz : kz;
+                const int index = kx1*planeSize + ky1*halfZ + kz1;
+
+                const float re = grid[index][0];
+                const float im = grid[index][1];
+                total += eterm*(re*re+im*im);
+            }
+            skipOrigin = false;
+        }
+    }
+    return 0.5f*total;
+}
+
+/**
+ * Multiply every stored complex grid value in the x-planes [start, end) by the
+ * matching precomputed factor in recipEterm.
+ *
+ * The very first entry (the zero wave vector) is skipped when start == 0.
+ *
+ * @param start       first kx to process
+ * @param end         one past the last kx to process
+ * @param grid        half-complex grid, scaled in place
+ * @param gridx       grid size along x (not used here)
+ * @param gridy       grid size along y
+ * @param gridz       grid size along z
+ * @param recipEterm  factors laid out like the grid
+ */
+static void reciprocalConvolution(int start, int end, fftwf_complex* grid, int gridx, int gridy, int gridz, vector<float>& recipEterm) {
+    const unsigned int halfZ = gridz/2+1;
+    const unsigned int planeSize = gridy*halfZ;
+
+    bool skipOrigin = (start == 0);
+    for (int kx = start; kx < end; kx++) {
+        for (int ky = 0; ky < gridy; ky++) {
+            for (int kz = (skipOrigin ? 1 : 0); kz < halfZ; kz++) {
+                const int index = kx*planeSize + ky*halfZ + kz;
+                const float factor = recipEterm[index];
+                grid[index][0] *= factor;
+                grid[index][1] *= factor;
+            }
+            skipOrigin = false;
+        }
+    }
+}
+
+/**
+ * Interpolate the force on particles [start, end) from the real-space grid.
+ *
+ * For every particle the B-spline coefficients (data) and their derivatives
+ * (ddata) are computed per axis, and the grid values of the surrounding
+ * PME_ORDER^3 points are accumulated into a vector whose lanes 0, 1, 2 hold the
+ * x, y and z components. Lane 3 of the result is always 0.
+ *
+ * @param start            index of the first particle to process
+ * @param end              one past the index of the last particle to process
+ * @param posq             four floats per particle (x, y, z, charge)
+ * @param force            output, four floats per particle
+ * @param grid             real-space grid of gridx*gridy*gridz floats
+ * @param gridx            grid size along x
+ * @param gridy            grid size along y
+ * @param gridz            grid size along z
+ * @param numParticles     not used by this function
+ * @param periodicBoxSize  edge lengths of the periodic box
+ *
+ * NOTE(review): posq is read with _mm_load_ps and force is written with
+ * _mm_store_ps, so both pointers must be 16-byte aligned -- confirm that the
+ * callers guarantee this.
+ */
+static void interpolateForces(int start, int end, float* posq, float* force, float* grid, int gridx, int gridy, int gridz, int numParticles, Vec3 periodicBoxSize) {
+    __m128 boxSize = _mm_set_ps(0, (float) periodicBoxSize[2], (float) periodicBoxSize[1], (float) periodicBoxSize[0]);
+    __m128 invBoxSize = _mm_set_ps(0, (float) (1/periodicBoxSize[2]), (float) (1/periodicBoxSize[1]), (float) (1/periodicBoxSize[0]));
+    __m128 gridSize = _mm_set_ps(0, gridz, gridy, gridx);
+    __m128i gridSizeInt = _mm_set_epi32(0, gridz, gridy, gridx);
+    __m128 one  = _mm_set1_ps(1);
+    __m128 scale = _mm_set1_ps(1.0f/(PME_ORDER-1));
+    const float epsilonFactor = sqrt(ONE_4PI_EPS0);
+    for (int i = start; i < end; i++) {
+        // Find the position relative to the nearest grid point.
+        
+        __m128 pos = _mm_load_ps(&posq[4*i]);
+        __m128 posInBox = _mm_sub_ps(pos, _mm_mul_ps(boxSize, _mm_floor_ps(_mm_mul_ps(pos, invBoxSize))));
+        __m128 t = _mm_mul_ps(_mm_mul_ps(posInBox, invBoxSize), gridSize);
+        __m128i ti = _mm_cvttps_epi32(t);
+        __m128 dr = _mm_sub_ps(t, _mm_cvtepi32_ps(ti));
+        // An index equal to the grid size is wrapped back to 0.
+        __m128i gridIndex = _mm_sub_epi32(ti, _mm_and_si128(gridSizeInt, _mm_cmpeq_epi32(ti, gridSizeInt)));
+        
+        // Compute the B-spline coefficients.
+        
+        __m128 data[PME_ORDER];
+        __m128 ddata[PME_ORDER];
+        data[PME_ORDER-1] = _mm_setzero_ps();
+        data[1] = dr;
+        data[0] = _mm_sub_ps(one, dr);
+        for (int j = 3; j < PME_ORDER; j++) {
+            __m128 div = _mm_set1_ps(1.0f/(j-1));
+            data[j-1] = _mm_mul_ps(_mm_mul_ps(div, dr), data[j-2]);
+            for (int k = 1; k < j-1; k++)
+                data[j-k-1] = _mm_mul_ps(div, _mm_add_ps(_mm_mul_ps(_mm_add_ps(dr, _mm_set1_ps(k)), data[j-k-2]), _mm_mul_ps(_mm_sub_ps(_mm_set1_ps(j-k), dr), data[j-k-1])));
+            data[0] = _mm_mul_ps(_mm_mul_ps(div, _mm_sub_ps(one, dr)), data[0]);
+        }
+        // The derivatives are differences of the coefficients as they stand
+        // before the final recursion step below.
+        ddata[0] = _mm_sub_ps(_mm_set1_ps(0), data[0]);
+        for (int j = 1; j < PME_ORDER; j++)
+            ddata[j] = _mm_sub_ps(data[j-1], data[j]);
+        data[PME_ORDER-1] = _mm_mul_ps(_mm_mul_ps(scale, dr), data[PME_ORDER-2]);
+        for (int j = 1; j < (PME_ORDER-1); j++)
+            data[PME_ORDER-j-1] = _mm_mul_ps(scale, _mm_add_ps(_mm_mul_ps(_mm_add_ps(dr, _mm_set1_ps(j)), data[PME_ORDER-j-2]), _mm_mul_ps(_mm_sub_ps(_mm_set1_ps(PME_ORDER-j), dr), data[PME_ORDER-j-1])));
+        data[0] = _mm_mul_ps(_mm_mul_ps(scale, _mm_sub_ps(one, dr)), data[0]);
+                
+        // Compute the force on this atom.
+        
+        int gridIndexX = _mm_extract_epi32(gridIndex, 0);
+        int gridIndexY = _mm_extract_epi32(gridIndex, 1);
+        int gridIndexZ = _mm_extract_epi32(gridIndex, 2);
+        int zindex[PME_ORDER];
+        for (int j = 0; j < PME_ORDER; j++) {
+            zindex[j] = gridIndexZ+j;
+            zindex[j] -= (zindex[j] >= gridz ? gridz : 0);
+        }
+        // Per-axis factors are packed so that lane 0/1/2 yields the x/y/z force
+        // component: the derivative is used on the lane's own axis and the
+        // plain coefficient on the other two. Lane 3 is zero.
+        __m128 zdata[PME_ORDER];
+        for (int j = 0; j < PME_ORDER; j++)
+            zdata[j] = _mm_set_ps(0, EXTRACT_FLOAT(ddata[j], 2), EXTRACT_FLOAT(data[j], 2), EXTRACT_FLOAT(data[j], 2));
+        __m128 f = _mm_set1_ps(0);
+        for (int ix = 0; ix < PME_ORDER; ix++) {
+            int xbase = gridIndexX+ix;
+            xbase -= (xbase >= gridx ? gridx : 0);
+            xbase = xbase*gridy*gridz;
+            float dx = EXTRACT_FLOAT(data[ix], 0);
+            float ddx = EXTRACT_FLOAT(ddata[ix], 0);
+            __m128 xdata = _mm_set_ps(0, dx, dx, ddx);
+
+            for (int iy = 0; iy < PME_ORDER; iy++) {
+                int ybase = gridIndexY+iy;
+                ybase -= (ybase >= gridy ? gridy : 0);
+                ybase = xbase + ybase*gridz;
+                float dy = EXTRACT_FLOAT(data[iy], 1);
+                float ddy = EXTRACT_FLOAT(ddata[iy], 1);
+                __m128 xydata = _mm_mul_ps(xdata, _mm_set_ps(0, dy, ddy, dy));
+
+                for (int iz = 0; iz < PME_ORDER; iz++) {
+                    __m128 gridValue = _mm_set1_ps(grid[ybase+zindex[iz]]);
+                    f = _mm_add_ps(f, _mm_mul_ps(xydata, _mm_mul_ps(zdata[iz], gridValue)));
+                }
+            }
+        }
+        // Scale by -epsilonFactor*charge and by gridSize/boxSize per axis.
+        f = _mm_mul_ps(invBoxSize, _mm_mul_ps(gridSize, _mm_mul_ps(f, _mm_set1_ps(-epsilonFactor*posq[4*i+3]))));
+        _mm_store_ps(&force[4*i], f);        
+    }
+}
+
+/**
+ * Argument bundle handed to threadBody() for one thread.
+ */
+class CpuCalcPmeReciprocalForceKernel::ThreadData {
+public:
+    // The kernel whose runThread() the thread executes.
+    CpuCalcPmeReciprocalForceKernel& owner;
+    // Index passed to runThread().
+    int index;
+    // Optional scratch grid; starts out NULL and, if set, is released with
+    // fftwf_free() by threadBody().
+    float* tempGrid;
+
+    ThreadData(CpuCalcPmeReciprocalForceKernel& owner, int index)
+        : owner(owner),
+          index(index),
+          tempGrid(NULL) {
+    }
+};
+
+/**
+ * Thread entry point.
+ *
+ * @param args  a heap-allocated CpuCalcPmeReciprocalForceKernel::ThreadData;
+ *              this function takes ownership of it
+ * @return always 0
+ *
+ * Runs owner.runThread(index) and, once that returns, frees the thread's
+ * tempGrid (if any) with fftwf_free() and deletes the ThreadData itself.
+ */
+static void* threadBody(void* args) {
+    CpuCalcPmeReciprocalForceKernel::ThreadData* threadData = reinterpret_cast<CpuCalcPmeReciprocalForceKernel::ThreadData*>(args);
+    threadData->owner.runThread(threadData->index);
+
+    // Clean up only after the thread's work is finished.
+    float* scratch = threadData->tempGrid;
+    if (scratch != NULL)
+        fftwf_free(scratch);
+    delete threadData;
+    return 0;
+}
+
+/**
+ * Prepare the kernel for use: choose the grid dimensions, start the threads,
+ * create the FFTW plans and compute the B-spline moduli.
+ *
+ * @param xsize        the minimum x size of the PME grid (rounded up by findFFTDimension())
+ * @param ysize        the minimum y size of the PME grid (rounded up by findFFTDimension())
+ * @param zsize        the minimum z size of the PME grid (rounded up by findFFTDimension())
+ * @param numParticles the number of particles in the system
+ * @param alpha        the Ewald blending parameter
+ */
+void CpuCalcPmeReciprocalForceKernel::initialize(int xsize, int ysize, int zsize, int numParticles, double alpha) {
+    if (!hasInitializedThreads) {
+        // One-time setup shared by every kernel instance (both flags are static members).
+        numThreads = getNumProcessors();
+        fftwf_init_threads();
+        hasInitializedThreads = true;
+    }
+    gridx = findFFTDimension(xsize);
+    gridy = findFFTDimension(ysize);
+    gridz = findFFTDimension(zsize);
+    this->numParticles = numParticles;
+    this->alpha = alpha;
+    force.resize(4*numParticles);
+    recipEterm.resize(gridx*gridy*gridz);
+
+    // Initialize threads.
+
+    pthread_cond_init(&startCondition, NULL);
+    pthread_cond_init(&endCondition, NULL);
+    pthread_cond_init(&mainThreadStartCondition, NULL);
+    pthread_cond_init(&mainThreadEndCondition, NULL);
+    pthread_mutex_init(&lock, NULL);
+    thread.resize(numThreads);
+    for (int i = 0; i < numThreads; i++) {
+        ThreadData* data = new ThreadData(*this, i);
+
+        // Give the record its grid and register it before the thread starts, so the
+        // thread can never observe a ThreadData without its buffer.  The 3 extra floats
+        // let runThread() process the grid in whole groups of 4 floats.
+
+        data->tempGrid = (float*) fftwf_malloc(sizeof(float)*(gridx*gridy*gridz+3));
+        threadData.push_back(data);
+        pthread_create(&thread[i], NULL, threadBody, data);
+    }
+    pthread_create(&mainThread, NULL, threadBody, new ThreadData(*this, -1));
+
+    // Initialize FFTW.  The first thread's grid doubles as the real-space FFT buffer.
+
+    realGrid = threadData[0]->tempGrid;
+    complexGrid = (fftwf_complex*) fftwf_malloc(sizeof(fftwf_complex)*gridx*gridy*(gridz/2+1));
+    fftwf_plan_with_nthreads(numThreads);
+    forwardFFT = fftwf_plan_dft_r2c_3d(gridx, gridy, gridz, realGrid, complexGrid, FFTW_MEASURE);
+    backwardFFT = fftwf_plan_dft_c2r_3d(gridx, gridy, gridz, complexGrid, realGrid, FFTW_MEASURE);
+    hasCreatedPlan = true;
+
+    // Initialize the b-spline moduli.
+
+    int maxSize = max(max(gridx, gridy), gridz);
+    vector<double> data(PME_ORDER);
+    vector<double> ddata(PME_ORDER);
+    vector<double> bsplinesData(maxSize);
+    data[PME_ORDER-1] = 0.0;
+    data[1] = 0.0;
+    data[0] = 1.0;
+    for (int i = 3; i < PME_ORDER; i++) {
+        double div = 1.0/(i-1.0);
+        data[i-1] = 0.0;
+        for (int j = 1; j < (i-1); j++)
+            data[i-j-1] = div*(j*data[i-j-2]+(i-j)*data[i-j-1]);
+        data[0] = div*data[0];
+    }
+
+    // Differentiate.
+
+    ddata[0] = -data[0];
+    for (int i = 1; i < PME_ORDER; i++)
+        ddata[i] = data[i-1]-data[i];
+    double div = 1.0/(PME_ORDER-1);
+    data[PME_ORDER-1] = 0.0;
+    for (int i = 1; i < (PME_ORDER-1); i++)
+        data[PME_ORDER-i-1] = div*(i*data[PME_ORDER-i-2]+(PME_ORDER-i)*data[PME_ORDER-i-1]);
+    data[0] = div*data[0];
+    for (int i = 0; i < maxSize; i++)
+        bsplinesData[i] = 0.0;
+
+    // NOTE(review): this writes indices 1..PME_ORDER, so it assumes maxSize > PME_ORDER.
+    // Confirm that callers never request a grid that small.
+
+    for (int i = 1; i <= PME_ORDER; i++)
+        bsplinesData[i] = data[i-1];
+
+    // Evaluate the actual bspline moduli for X/Y/Z.
+
+    bsplineModuli[0].resize(gridx);
+    bsplineModuli[1].resize(gridy);
+    bsplineModuli[2].resize(gridz);
+    for (int dim = 0; dim < 3; dim++) {
+        int ndata = bsplineModuli[dim].size();
+        vector<float>& moduli = bsplineModuli[dim];
+        for (int i = 0; i < ndata; i++) {
+            double sc = 0.0;
+            double ss = 0.0;
+            for (int j = 0; j < ndata; j++) {
+                double arg = (2.0*M_PI*i*j)/ndata;
+                sc += bsplinesData[j]*cos(arg);
+                ss += bsplinesData[j]*sin(arg);
+            }
+            moduli[i] = (float) (sc*sc+ss*ss);
+        }
+
+        // Replace vanishing moduli with the average of their neighbors.  The indices wrap
+        // around so that the first and last elements never read outside the vector.
+
+        for (int i = 0; i < ndata; i++)
+            if (moduli[i] < 1.0e-7f)
+                moduli[i] = (moduli[(i-1+ndata)%ndata]+moduli[(i+1)%ndata])*0.5f;
+    }
+}
+
+/**
+ * Stop and join all threads, then release the synchronization objects and the
+ * FFTW resources.  A kernel on which initialize() was never called owns none of
+ * these, so nothing is touched in that case.
+ */
+CpuCalcPmeReciprocalForceKernel::~CpuCalcPmeReciprocalForceKernel() {
+    // hasCreatedPlan is only set by initialize(), after the mutex, the condition
+    // variables and the threads have all been created.
+    if (hasCreatedPlan) {
+        // Set the flag while holding the lock so that waiting threads are guaranteed
+        // to see it when the broadcasts wake them.
+        pthread_mutex_lock(&lock);
+        isDeleted = true;
+        pthread_cond_broadcast(&startCondition);
+        pthread_cond_broadcast(&mainThreadStartCondition);
+        pthread_mutex_unlock(&lock);
+        for (int i = 0; i < (int) thread.size(); i++)
+            pthread_join(thread[i], NULL);
+        pthread_join(mainThread, NULL);
+        pthread_mutex_destroy(&lock);
+        pthread_cond_destroy(&startCondition);
+        pthread_cond_destroy(&endCondition);
+        pthread_cond_destroy(&mainThreadStartCondition);
+        pthread_cond_destroy(&mainThreadEndCondition);
+    }
+    if (complexGrid != NULL)
+        fftwf_free(complexGrid);
+    if (hasCreatedPlan) {
+        fftwf_destroy_plan(forwardFFT);
+        fftwf_destroy_plan(backwardFFT);
+    }
+}
+
+void CpuCalcPmeReciprocalForceKernel::runThread(int index) { // Body of every thread: index -1 is the coordinator, any other value is a worker.
+    if (index == -1) {
+        // This is the main thread that coordinates all the other ones.
+        // It keeps the lock for its whole lifetime except while blocked inside pthread_cond_wait().
+        pthread_mutex_lock(&lock);
+        while (true) {
+            // Wait for the signal to start.  beginComputation() sends it; the destructor sends it with isDeleted set.
+            // NOTE(review): there is no predicate loop here, so a signal sent before this thread is waiting is lost - confirm callers cannot race with startup.
+            pthread_cond_wait(&mainThreadStartCondition, &lock);
+            if (isDeleted)
+                break;
+            posq = io->getPosq(); // 4 floats per particle; the 4th is used as the charge by the force code
+            advanceThreads(); // Signal threads to perform charge spreading.
+            advanceThreads(); // Signal threads to sum the charge grids.
+            fftwf_execute_dft_r2c(forwardFFT, realGrid, complexGrid); // forward FFT of the summed charge grid
+            if (lastBoxSize != periodicBoxSize) // same test as in the worker branch, so both sides take the same number of barrier steps
+                advanceThreads(); // Signal threads to compute the reciprocal scale factors.
+            if (includeEnergy)
+                advanceThreads(); // Signal threads to compute energy.
+            advanceThreads(); // Signal threads to perform reciprocal convolution.
+            fftwf_execute_dft_c2r(backwardFFT, complexGrid, realGrid); // backward FFT into the real grid
+            advanceThreads(); // Signal threads to interpolate forces.
+            isFinished = true; // finishComputation() waits for this flag
+            lastBoxSize = periodicBoxSize; // remembered so the scale factors are only recomputed when the box changes
+            pthread_cond_signal(&mainThreadEndCondition);
+        }
+        pthread_mutex_unlock(&lock);
+    }
+    else {
+        // This is a worker thread.
+        // Each worker owns a contiguous share of the particles, of the x grid planes and of the flattened grid.
+        int particleStart = (index*numParticles)/numThreads;
+        int particleEnd = ((index+1)*numParticles)/numThreads;
+        int gridxStart = (index*gridx)/numThreads;
+        int gridxEnd = ((index+1)*gridx)/numThreads;
+        int gridSize = (gridx*gridy*gridz+3)/4; // number of 4-float groups in the grid, rounded up
+        int gridStart = 4*((index*gridSize)/numThreads); // always a multiple of 4
+        int gridEnd = 4*(((index+1)*gridSize)/numThreads);
+        while (true) {
+            threadWait(); // idle here until the coordinator starts a computation or the kernel is destroyed
+            if (isDeleted)
+                break;
+            spreadCharge(particleStart, particleEnd, posq, threadData[index]->tempGrid, gridx, gridy, gridz, numParticles, periodicBoxSize); // each worker spreads into its own grid
+            threadWait();
+            int numGrids = threadData.size();
+            for (int i = gridStart; i < gridEnd; i += 4) { // sum this worker's slice of all the per-thread grids into realGrid
+                __m128 sum = _mm_load_ps(&realGrid[i]);
+                for (int j = 1; j < numGrids; j++) // starts at 1: initialize() makes realGrid the same buffer as threadData[0]->tempGrid
+                    sum = _mm_add_ps(sum, _mm_load_ps(&threadData[j]->tempGrid[i]));
+                _mm_store_ps(&realGrid[i], sum);
+            }
+            threadWait(); // the coordinator runs the forward FFT before releasing this barrier
+            if (lastBoxSize != periodicBoxSize) {
+                computeReciprocalEterm(gridxStart, gridxEnd, gridx, gridy, gridz, recipEterm, alpha, bsplineModuli, periodicBoxSize);
+                threadWait();
+            }
+            if (includeEnergy) {
+                double threadEnergy = reciprocalEnergy(gridxStart, gridxEnd, complexGrid, gridx, gridy, gridz, alpha, bsplineModuli, periodicBoxSize);
+                pthread_mutex_lock(&lock); // energy is shared by all workers, so accumulate under the lock
+                energy += threadEnergy;
+                pthread_mutex_unlock(&lock);
+                threadWait();
+            }
+            reciprocalConvolution(gridxStart, gridxEnd, complexGrid, gridx, gridy, gridz, recipEterm);
+            threadWait(); // the coordinator runs the backward FFT before releasing this barrier
+            interpolateForces(particleStart, particleEnd, posq, &force[0], realGrid, gridx, gridy, gridz, numParticles, periodicBoxSize);
+        }
+    }
+}
+
+void CpuCalcPmeReciprocalForceKernel::threadWait() { // Worker side of the barrier: report arrival, then sleep until the next start broadcast.
+    pthread_mutex_lock(&lock);
+    waitCount++; // count this worker as having finished its current stage
+    pthread_cond_signal(&endCondition); // wake advanceThreads(), which rechecks waitCount
+    pthread_cond_wait(&startCondition, &lock); // NOTE(review): no predicate loop, so a spurious wakeup would let this worker run ahead - confirm this is acceptable
+    pthread_mutex_unlock(&lock);
+}
+
+void CpuCalcPmeReciprocalForceKernel::advanceThreads() { // Coordinator side of the barrier; does not lock itself, runThread() calls it with the lock already held.
+    waitCount = 0; // reset before releasing the workers so that only arrivals for the new stage are counted
+    pthread_cond_broadcast(&startCondition); // release every worker blocked in threadWait()
+    while (waitCount < numThreads) { // block until every worker has called threadWait() again
+        pthread_cond_wait(&endCondition, &lock);
+    }
+}
+
+/**
+ * Record the inputs of a new computation and wake the coordinating thread.
+ * This does not wait for the result; finishComputation() does that.
+ */
+void CpuCalcPmeReciprocalForceKernel::beginComputation(IO& io, Vec3 periodicBoxSize, bool includeEnergy) {
+    // Store the arguments in the members that the threads read.
+
+    energy = 0.0;
+    this->includeEnergy = includeEnergy;
+    this->periodicBoxSize = periodicBoxSize;
+    this->io = &io;
+
+    // Mark the computation as in progress and signal the coordinating thread.
+
+    pthread_mutex_lock(&lock);
+    isFinished = false;
+    pthread_cond_signal(&mainThreadStartCondition);
+    pthread_mutex_unlock(&lock);
+}
+
+/**
+ * Block until the coordinating thread reports that the computation is done,
+ * pass the force array to io, and return the accumulated energy.
+ */
+double CpuCalcPmeReciprocalForceKernel::finishComputation(IO& io) {
+    pthread_mutex_lock(&lock);
+    for (;;) {
+        if (isFinished)
+            break;
+        pthread_cond_wait(&mainThreadEndCondition, &lock);
+    }
+    pthread_mutex_unlock(&lock);
+    float* computedForces = &force[0];
+    io.setForce(computedForces);
+    return energy;
+}
+
+/**
+ * Query cpuid to decide whether this processor can run the kernel.
+ */
+bool CpuCalcPmeReciprocalForceKernel::isProcessorSupported() {
+    int registers[4];
+    cpuid(registers, 0);
+    if (registers[0] < 1)
+        return false; // function 1 is not available, so the feature bits cannot be read
+    cpuid(registers, 1);
+    const int sse41Bit = (int) 1 << 19;
+    return ((registers[2] & sse41Bit) != 0); // Require SSE 4.1
+}
+
+/**
+ * Return the smallest value >= minimum whose only prime factors are 2, 3, 5, 7
+ * and 11.  Values below 1 yield 1.
+ */
+int CpuCalcPmeReciprocalForceKernel::findFFTDimension(int minimum) {
+    if (minimum < 1)
+        return 1;
+    for (int candidate = minimum; ; candidate++) {
+        // Divide out every factor from 2 to 11; the candidate is accepted if nothing else remains.
+
+        int remaining = candidate;
+        int factor = 2;
+        while (factor < 12 && remaining > 1) {
+            if (remaining%factor == 0)
+                remaining /= factor;
+            else
+                factor++;
+        }
+        if (remaining == 1)
+            return candidate;
+    }
+}
--- a/plugins/cpupme/src/CpuPmeKernels.h
+++ b/plugins/cpupme/src/CpuPmeKernels.h
+#ifndef OPENMM_CPU_PME_KERNELS_H_
+#define OPENMM_CPU_PME_KERNELS_H_
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "internal/windowsExportPme.h"
+#include "openmm/kernels.h"
+#include "openmm/Vec3.h"
+#include <fftw3.h>
+#include <pthread.h>
+#include <vector>
+
+namespace OpenMM {
+
+/**
+ * This is an optimized CPU implementation of CalcPmeReciprocalForceKernel.  It is both
+ * vectorized (requiring SSE 4.1) and multithreaded.  It uses FFTW to perform the FFTs.
+ */
+
+class OPENMM_EXPORT_PME CpuCalcPmeReciprocalForceKernel : public CalcPmeReciprocalForceKernel {
+public:
+    class ThreadData;
+    /**
+     * Create the kernel.  No threads, grids or FFTW plans exist until initialize() is called.
+     * All flags, pointers and counters start in a defined state; isFinished starts as true
+     * because no computation is in progress.
+     */
+    CpuCalcPmeReciprocalForceKernel(std::string name, const Platform& platform) : CalcPmeReciprocalForceKernel(name, platform),
+            hasCreatedPlan(false), isFinished(true), isDeleted(false), realGrid(NULL), complexGrid(NULL), waitCount(0),
+            io(NULL), energy(0.0), posq(NULL) {
+    }
+    /**
+     * Initialize the kernel.
+     * 
+     * @param xsize        the minimum x size of the PME grid; the actual size may be rounded up
+     * @param ysize        the minimum y size of the PME grid; the actual size may be rounded up
+     * @param zsize        the minimum z size of the PME grid; the actual size may be rounded up
+     * @param numParticles the number of particles in the system
+     * @param alpha        the Ewald blending parameter
+     */
+    void initialize(int xsize, int ysize, int zsize, int numParticles, double alpha);
+    ~CpuCalcPmeReciprocalForceKernel();
+    /**
+     * Begin computing the force and energy.
+     * 
+     * @param io               an object that coordinates data transfer
+     * @param periodicBoxSize  the size of the periodic box (measured in nm)
+     * @param includeEnergy    true if potential energy should be computed
+     */
+    void beginComputation(IO& io, Vec3 periodicBoxSize, bool includeEnergy);
+    /**
+     * Finish computing the force and energy.
+     * 
+     * @param io   an object that coordinates data transfer
+     * @return the potential energy due to the PME reciprocal space interactions
+     */
+    double finishComputation(IO& io);
+    /**
+     * This routine contains the code executed by each thread.
+     * 
+     * @param index   the index of the worker thread, or -1 for the thread that coordinates the workers
+     */
+    void runThread(int index);
+    /**
+     * Get whether the current CPU supports all features needed by this kernel.
+     */
+    static bool isProcessorSupported();
+private:
+    /**
+     * This is called by the worker threads to wait until the master thread instructs them to advance.
+     */
+    void threadWait();
+    /**
+     * This is called by the master thread to instruct all the worker threads to advance.
+     */
+    void advanceThreads();
+    /**
+     * Select a size for one grid dimension that FFTW can handle efficiently.
+     */
+    int findFFTDimension(int minimum);
+    static bool hasInitializedThreads;
+    static int numThreads;
+    int gridx, gridy, gridz, numParticles;
+    double alpha;
+    bool hasCreatedPlan, isFinished, isDeleted;
+    std::vector<float> force;
+    std::vector<float> bsplineModuli[3];
+    std::vector<float> recipEterm;
+    Vec3 lastBoxSize;
+    float* realGrid;
+    fftwf_complex* complexGrid;
+    fftwf_plan forwardFFT, backwardFFT;
+    int waitCount;
+    pthread_cond_t startCondition, endCondition;
+    pthread_cond_t mainThreadStartCondition, mainThreadEndCondition;
+    pthread_mutex_t lock;
+    pthread_t mainThread;
+    std::vector<pthread_t> thread;
+    std::vector<ThreadData*> threadData;
+    // The following variables are used to store information about the calculation currently being performed.
+    IO* io;
+    // Kept in double precision: the per-thread contributions are doubles and finishComputation() returns a double.
+    double energy;
+    float* posq;
+    Vec3 periodicBoxSize;
+    bool includeEnergy;
+};
+
+} // namespace OpenMM
+
+#endif /*OPENMM_CPU_PME_KERNELS_H_*/
--- a/plugins/cpupme/tests/CMakeLists.txt
+++ b/plugins/cpupme/tests/CMakeLists.txt
+#
+# Testing: builds one executable per test source in this directory and registers it as a test.
+#
+ENABLE_TESTING()
+
+SET(SHARED_OPENMM_PME_TARGET OpenMMPME) # name of the plugin library the tests link against
+
+IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
+    SET(SHARED_CUDA_TARGET ${SHARED_CUDA_TARGET}_d) # NOTE(review): not referenced anywhere else in this file - looks like a copy/paste leftover; confirm before removing
+    SET(SHARED_OPENMM_PME_TARGET ${SHARED_OPENMM_PME_TARGET}_d) # Unix debug builds use the "_d" suffix
+ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
+
+#LINK_DIRECTORIES
+
+# Automatically create tests using files matching "*Test*.cpp"
+FILE(GLOB TEST_PROGS "*Test*.cpp")
+FOREACH(TEST_PROG ${TEST_PROGS})
+    GET_FILENAME_COMPONENT(TEST_ROOT ${TEST_PROG} NAME_WE) # file name without directory or extension, used as target and test name
+
+    # Link with the shared libraries (SHARED_TARGET and SHARED_OPENMM_TARGET are presumably set by a parent CMakeLists - verify)
+
+    ADD_EXECUTABLE(${TEST_ROOT} ${TEST_PROG})
+    TARGET_LINK_LIBRARIES(${TEST_ROOT} ${SHARED_TARGET} ${SHARED_OPENMM_TARGET} ${SHARED_OPENMM_PME_TARGET})
+    ADD_TEST(${TEST_ROOT} ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT})
+ENDFOREACH(TEST_PROG ${TEST_PROGS})
--- a/plugins/cpupme/tests/TestCpuPme.cpp
+++ b/plugins/cpupme/tests/TestCpuPme.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+/**
+ * This tests the CPU implementation of PME.
+ */
+
+#include "openmm/internal/AssertionUtilities.h"
+#include "openmm/Context.h"
+#include "openmm/NonbondedForce.h"
+#include "openmm/internal/NonbondedForceImpl.h"
+#include "openmm/System.h"
+#include "openmm/VerletIntegrator.h"
+#include "openmm/internal/ContextImpl.h"
+#include "../src/CpuPmeKernels.h"
+#include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
+#include "sfmt/SFMT.h"
+#include <iostream>
+#include <vector>
+
+using namespace OpenMM;
+using namespace std;
+
+/**
+ * Minimal IO implementation for the test: it hands the kernel a posq array
+ * stored in a vector and remembers the force pointer the kernel passes back.
+ */
+class IO : public CalcPmeReciprocalForceKernel::IO {
+public:
+    float* getPosq() {
+        float* first = &posq[0];
+        return first;
+    }
+    void setForce(float* computedForce) {
+        force = computedForce;
+    }
+    vector<float> posq; // filled by the test with x, y, z, charge for each particle
+    float* force;       // whatever was last passed to setForce(); not owned by this object
+};
+
+/**
+ * Compare the reciprocal space energy and forces produced by the CPU PME kernel
+ * with those of the Reference platform for a cloud of random point charges.
+ */
+void testPME() {
+    // Build a cubic periodic box containing randomly placed point charges.
+
+    const int numParticles = 51;
+    const double boxWidth = 5.0;
+    const double cutoff = 1.0;
+    System system;
+    system.setDefaultPeriodicBoxVectors(Vec3(boxWidth, 0, 0), Vec3(0, boxWidth, 0), Vec3(0, 0, boxWidth));
+    NonbondedForce* nonbonded = new NonbondedForce();
+    system.addForce(nonbonded);
+    vector<Vec3> positions(numParticles);
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+
+    for (int particle = 0; particle < numParticles; particle++) {
+        system.addParticle(1.0);
+        nonbonded->addParticle(-1.0+particle*2.0/(numParticles-1), 1.0, 0.0);
+        positions[particle] = Vec3(boxWidth*genrand_real2(sfmt), boxWidth*genrand_real2(sfmt), boxWidth*genrand_real2(sfmt));
+    }
+    nonbonded->setNonbondedMethod(NonbondedForce::PME);
+    nonbonded->setCutoffDistance(cutoff);
+    nonbonded->setReciprocalSpaceForceGroup(1);
+    nonbonded->setEwaldErrorTolerance(1e-4);
+
+    // Reference result: only force group 1, which holds the reciprocal space part.
+
+    Platform& platform = Platform::getPlatformByName("Reference");
+    VerletIntegrator integrator(0.01);
+    Context context(system, integrator, platform);
+    context.setPositions(positions);
+    State refState = context.getState(State::Forces | State::Energy, false, 1<<1);
+
+    // Feed the same system to the optimized kernel.
+
+    double alpha;
+    int gridx, gridy, gridz;
+    NonbondedForceImpl::calcPMEParameters(system, *nonbonded, alpha, gridx, gridy, gridz);
+    CpuCalcPmeReciprocalForceKernel pme(CalcPmeReciprocalForceKernel::Name(), platform);
+    IO io;
+    double sumSquaredCharges = 0;
+    for (int particle = 0; particle < numParticles; particle++) {
+        const Vec3& pos = positions[particle];
+        io.posq.push_back(pos[0]);
+        io.posq.push_back(pos[1]);
+        io.posq.push_back(pos[2]);
+        double charge, sigma, epsilon;
+        nonbonded->getParticleParameters(particle, charge, sigma, epsilon);
+        io.posq.push_back(charge);
+        sumSquaredCharges += charge*charge;
+    }
+    double ewaldSelfEnergy = -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI);
+    pme.initialize(gridx, gridy, gridz, numParticles, alpha);
+    pme.beginComputation(io, Vec3(boxWidth, boxWidth, boxWidth), true);
+    double energy = pme.finishComputation(io);
+
+    // The kernel's energy excludes the Ewald self energy, so add it before comparing.
+
+    ASSERT_EQUAL_TOL(refState.getPotentialEnergy(), energy+ewaldSelfEnergy, 1e-3);
+    const vector<Vec3>& refForces = refState.getForces();
+    for (int particle = 0; particle < numParticles; particle++) {
+        const float* f = &io.force[4*particle];
+        ASSERT_EQUAL_VEC(refForces[particle], Vec3(f[0], f[1], f[2]), 1e-3);
+    }
+}
+
+/**
+ * Run the test.  Returns 0 and prints "Done" on success; prints the exception
+ * message and returns 1 on failure.
+ */
+int main(int argc, char* argv[]) {
+    int status = 0;
+    try {
+        testPME();
+        cout << "Done" << endl;
+    }
+    catch(const exception& e) {
+        cout << "exception: " << e.what() << endl;
+        status = 1;
+    }
+    return status;
+}