Added initial CUDA LJPME code, and modified cpupme plugin to handle LJPME as a standalone method.

7b66ba19 · Andy Simmonett · 94cb8614 · 7b66ba19 · 7b66ba19 · 7b66ba19
Commit 7b66ba19 authored Jan 19, 2017 by Andy Simmonett
16 changed files
--- a/olla/include/openmm/kernels.h
+++ b/olla/include/openmm/kernels.h
@@ -1345,6 +1345,57 @@ public:
 };


+/**
+ * This kernel performs the dispersion reciprocal space calculation for LJPME.  In most cases, this
+ * calculation is done directly by CalcNonbondedForceKernel so this kernel is unneeded.
+ * In some cases it may want to outsource the work to a different kernel.  In particular,
+ * GPU based platforms sometimes use a CPU based implementation provided by a separate
+ * plugin.
+ */
+class CalcDispersionPmeReciprocalForceKernel : public KernelImpl {
+public:
+    class IO;
+    static std::string Name() {
+        return "CalcDispersionPmeReciprocalForce";
+    }
+    CalcDispersionPmeReciprocalForceKernel(std::string name, const Platform& platform) : KernelImpl(name, platform) {
+    }
+    /**
+     * Initialize the kernel.
+     * 
+     * @param gridx        the x size of the PME grid
+     * @param gridy        the y size of the PME grid
+     * @param gridz        the z size of the PME grid
+     * @param numParticles the number of particles in the system
+     * @param alpha        the Ewald blending parameter
+     */
+    virtual void initialize(int gridx, int gridy, int gridz, int numParticles, double alpha) = 0;
+    /**
+     * Begin computing the force and energy.
+     *
+     * @param io                  an object that coordinates data transfer
+     * @param periodicBoxVectors  the vectors defining the periodic box (measured in nm)
+     * @param includeEnergy       true if potential energy should be computed
+     */
+    virtual void beginComputation(IO& io, const Vec3* periodicBoxVectors, bool includeEnergy) = 0;
+    /**
+     * Finish computing the force and energy.
+     * 
+     * @param io   an object that coordinates data transfer
+     * @return the potential energy due to the PME reciprocal space interactions
+     */
+    virtual double finishComputation(IO& io) = 0;
+    /**
+     * Get the parameters being used for PME.
+     * 
+     * @param alpha   the separation parameter
+     * @param nx      the number of grid points along the X axis
+     * @param ny      the number of grid points along the Y axis
+     * @param nz      the number of grid points along the Z axis
+     */
+    virtual void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const = 0;
+};
+
 } // namespace OpenMM

 #endif /*OPENMM_KERNELS_H_*/
--- a/platforms/cpu/src/CpuKernels.cpp
+++ b/platforms/cpu/src/CpuKernels.cpp
@@ -674,8 +674,9 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
            if (useOptimizedPme) {
                optimizedPme = getPlatform().createKernel(CalcPmeReciprocalForceKernel::Name(), context);
                optimizedPme.getAs<CalcPmeReciprocalForceKernel>().initialize(gridSize[0], gridSize[1], gridSize[2], numParticles, ewaldAlpha);
-                optimizedDispersionPme = getPlatform().createKernel(CalcPmeReciprocalForceKernel::Name(), context);
-                optimizedDispersionPme.getAs<CalcPmeReciprocalForceKernel>().initialize(dispersionGridSize[0], dispersionGridSize[1], dispersionGridSize[2], numParticles, ewaldDispersionAlpha);
+                optimizedDispersionPme = getPlatform().createKernel(CalcDispersionPmeReciprocalForceKernel::Name(), context);
+                optimizedDispersionPme.getAs<CalcDispersionPmeReciprocalForceKernel>().initialize(dispersionGridSize[0], dispersionGridSize[1],
+                                                                                                  dispersionGridSize[2], numParticles, ewaldDispersionAlpha);
            }
        }
    }

--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -598,8 +598,10 @@ private:
 class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
 public:
    CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcNonbondedForceKernel(name, platform),
-            cu(cu), hasInitializedFFT(false), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), directPmeGrid(NULL), reciprocalPmeGrid(NULL),
-            pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL),  pmeAtomRange(NULL), pmeAtomGridIndex(NULL), pmeEnergyBuffer(NULL), sort(NULL), fft(NULL), pmeio(NULL) {
+            cu(cu), hasInitializedFFT(false), sigmaEpsilon(NULL), C6s(NULL), exceptionParams(NULL), cosSinSums(NULL), directPmeGrid(NULL), reciprocalPmeGrid(NULL),
+            directDispersionPmeGrid(NULL), reciprocalDispersionPmeGrid(NULL),
+            pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL),  pmeAtomRange(NULL), pmeAtomGridIndex(NULL), pmeAtomDispersionGridIndex(NULL),
+            pmeEnergyBuffer(NULL), dispersionPmeEnergyBuffer(NULL), sort(NULL), dispersionFft(NULL), fft(NULL), pmeio(NULL), dispersionPmeio(NULL) {
    }
    ~CudaCalcNonbondedForceKernel();
    /**
@@ -636,6 +638,15 @@ public:
     * @param nz      the number of grid points along the Z axis
     */
    void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
+    /**
+     * Get the dispersion parameters being used for the dispersion term in LJPME.
+     * 
+     * @param alpha   the separation parameter
+     * @param nx      the number of grid points along the X axis
+     * @param ny      the number of grid points along the Y axis
+     * @param nz      the number of grid points along the Z axis
+     */
+    void getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
 private:
    class SortTrait : public CudaSort::SortTrait {
        int getDataSize() const {return 8;}
@@ -655,38 +666,55 @@ private:
    CudaContext& cu;
    bool hasInitializedFFT;
    CudaArray* sigmaEpsilon;
+    CudaArray* C6s;
    CudaArray* exceptionParams;
    CudaArray* cosSinSums;
    CudaArray* directPmeGrid;
    CudaArray* reciprocalPmeGrid;
+    CudaArray* directDispersionPmeGrid;
+    CudaArray* reciprocalDispersionPmeGrid;
    CudaArray* pmeBsplineModuliX;
    CudaArray* pmeBsplineModuliY;
    CudaArray* pmeBsplineModuliZ;
    CudaArray* pmeAtomRange;
    CudaArray* pmeAtomGridIndex;
+    CudaArray* pmeAtomDispersionGridIndex;
    CudaArray* pmeEnergyBuffer;
+    CudaArray* dispersionPmeEnergyBuffer;
    CudaSort* sort;
    Kernel cpuPme;
+    Kernel cpuDispersionPme;
    PmeIO* pmeio;
-    CUstream pmeStream;
-    CUevent pmeSyncEvent;
+    PmeIO* dispersionPmeio;
+    CUstream pmeStream, dispersionPmeStream;
+    CUevent pmeSyncEvent, dispersionPmeSyncEvent;
    CudaFFT3D* fft;
    cufftHandle fftForward;
    cufftHandle fftBackward;
+    CudaFFT3D* dispersionFft;
+    cufftHandle dispersionFftForward;
+    cufftHandle dispersionFftBackward;
    CUfunction ewaldSumsKernel;
    CUfunction ewaldForcesKernel;
    CUfunction pmeGridIndexKernel;
+    CUfunction pmeDispersionGridIndexKernel;
    CUfunction pmeSpreadChargeKernel;
+    CUfunction pmeDispersionSpreadChargeKernel;
    CUfunction pmeFinishSpreadChargeKernel;
+    CUfunction pmeDispersionFinishSpreadChargeKernel;
    CUfunction pmeEvalEnergyKernel;
+    CUfunction pmeEvalDispersionEnergyKernel;
    CUfunction pmeConvolutionKernel;
+    CUfunction pmeDispersionConvolutionKernel;
    CUfunction pmeInterpolateForceKernel;
+    CUfunction pmeInterpolateDispersionForceKernel;
    std::map<std::string, std::string> pmeDefines;
    std::vector<std::pair<int, int> > exceptionAtoms;
-    double ewaldSelfEnergy, dispersionCoefficient, alpha;
+    double ewaldSelfEnergy, dispersionSelfEnergy, dispersionCoefficient, alpha, dispersionAlpha;
    int interpolateForceThreads;
    int gridSizeX, gridSizeY, gridSizeZ;
-    bool hasCoulomb, hasLJ, usePmeStream, useCudaFFT;
+    int dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ;
+    bool hasCoulomb, hasLJ, usePmeStream, useCudaFFT, doLJPME;
    NonbondedMethod nonbondedMethod;
    static const int PmeOrder = 5;
 };

--- a/platforms/cuda/include/CudaParallelKernels.h
+++ b/platforms/cuda/include/CudaParallelKernels.h
@@ -439,6 +439,15 @@ public:
     * @param nz      the number of grid points along the Z axis
     */
    void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
+    /**
+     * Get the dispersion parameters being used for the dispersion term in LJPME.
+     * 
+     * @param alpha   the separation parameter
+     * @param nx      the number of grid points along the X axis
+     * @param ny      the number of grid points along the Y axis
+     * @param nz      the number of grid points along the Z axis
+     */
+    void getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
 private:
    class Task;
    CudaPlatform::PlatformData& data;

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
--- a/platforms/cuda/src/CudaParallelKernels.cpp
+++ b/platforms/cuda/src/CudaParallelKernels.cpp
@@ -628,6 +628,10 @@ void CudaParallelCalcNonbondedForceKernel::getPMEParameters(double& alpha, int&
    dynamic_cast<const CudaCalcNonbondedForceKernel&>(kernels[0].getImpl()).getPMEParameters(alpha, nx, ny, nz);
 }

+void CudaParallelCalcNonbondedForceKernel::getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
+    dynamic_cast<const CudaCalcNonbondedForceKernel&>(kernels[0].getImpl()).getLJPMEParameters(alpha, nx, ny, nz);
+}
+
 class CudaParallelCalcCustomNonbondedForceKernel::Task : public CudaContext::WorkTask {
 public:
    Task(ContextImpl& context, CudaCalcCustomNonbondedForceKernel& kernel, bool includeForce,

--- a/platforms/cuda/src/CudaPlatform.cpp
+++ b/platforms/cuda/src/CudaPlatform.cpp
@@ -247,6 +247,10 @@ CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& sys
        CHECK_RESULT(cuDeviceGetName(name, 1000, contexts[i]->getDevice()), "Error querying device name");
        deviceName << name;
    }
+    size_t printfsize;
+    cuCtxGetLimit(&printfsize, CU_LIMIT_PRINTF_FIFO_SIZE);
+    cuCtxSetLimit(CU_LIMIT_PRINTF_FIFO_SIZE, 10*printfsize);
+
    useCpuPme = (cpuPmeProperty == "true" && !contexts[0]->getUseDoublePrecision());
    disablePmeStream = (pmeStreamProperty == "true");
    deterministicForces = (deterministicForcesProperty == "true");

--- a/platforms/cuda/src/kernels/coulombLennardJones.cu
+++ b/platforms/cuda/src/kernels/coulombLennardJones.cu
@@ -17,6 +17,25 @@
    const real erfcAlphaR = (0.254829592f+(-0.284496736f+(1.421413741f+(-1.453152027f+1.061405429f*t)*t)*t)*t)*t*expAlphaRSqr;
 #endif
    real tempForce = 0.0f;
+#if HAS_LENNARD_JONES
+        // The multiplicative term to correct for the multiplicative terms that are always
+        // present in reciprocal space.  The real terms have an additive contribution
+        // added in, but for excluded terms the multiplicative term is just subtracted.
+        // These factors are needed in both clauses of the needCorrection statement, so
+        // I declare them up here.
+        #if DO_LJPME
+            const real dispersionAlphaR = EWALD_DISPERSION_ALPHA*r;
+            const real dar2 = dispersionAlphaR*dispersionAlphaR;
+            const real dar4 = dar2*dar2;
+            const real dar6 = dar4*dar2;
+            const real invR2 = invR*invR;
+            const real expDar2 = EXP(-dar2);
+            const real c6 = C6s1*C6s2;
+            const real coef = invR2*invR2*invR2*c6;
+            const real eprefac = 1.0f + dar2 + 0.5f*dar4;
+            const real dprefac = eprefac + dar6/6.0f;
+        #endif
+#endif
    if (needCorrection) {
        // Subtract off the part of this interaction that was included in the reciprocal space contribution.

@@ -29,6 +48,13 @@
            includeInteraction = false;
            tempEnergy -= TWO_OVER_SQRT_PI*EWALD_ALPHA*138.935456f*posq1.w*posq2.w;
        }
+#if HAS_LENNARD_JONES
+        #if DO_LJPME
+            // The multiplicative grid term
+            tempEnergy += coef*(1.0f - expDar2*eprefac);
+            tempForce += 6.0f*coef*(1.0f - expDar2*dprefac);
+        #endif
+#endif
    }
    else {
 #if HAS_LENNARD_JONES
@@ -36,7 +62,8 @@
        real sig2 = invR*sig;
        sig2 *= sig2;
        real sig6 = sig2*sig2*sig2;
-        real epssig6 = sig6*(sigmaEpsilon1.y*sigmaEpsilon2.y);
+        real eps = sigmaEpsilon1.y*sigmaEpsilon2.y;
+        real epssig6 = sig6*eps;
        tempForce = epssig6*(12.0f*sig6 - 6.0f);
        real ljEnergy = epssig6*(sig6 - 1.0f);
        #if USE_LJ_SWITCH
@@ -48,6 +75,22 @@
            ljEnergy *= switchValue;
        }
        #endif
+        #if DO_LJPME
+            // The multiplicative grid term
+            ljEnergy += coef*(1.0f - expDar2*eprefac);
+            tempForce += 6.0f*coef*(1.0f - expDar2*dprefac);
+            // The potential shift accounts for the step at the cutoff introduced by the
+            // transition from additive to multiplicative combintion rules and is only
+            // needed for the real (not excluded) terms.  By addin these terms to ljEnergy
+            // instead of tempEnergy here, the includeInteraction mask is correctly applied.
+            sig2 = sig*sig;
+            sig6 = sig2*sig2*sig2*INVCUT6;
+            epssig6 = eps*sig6;
+            // The additive part of the potential shift
+            ljEnergy += epssig6*(1.0f - sig6);
+            // The multiplicative part of the potential shift
+            ljEnergy += MULTSHIFT6*c6;
+        #endif
        tempForce += prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
        tempEnergy += includeInteraction ? ljEnergy + prefactor*erfcAlphaR : 0;
 #else

--- a/platforms/cuda/src/kernels/ljpme.cu
+++ b/platforms/cuda/src/kernels/ljpme.cu
+extern "C" __global__ void findAtomDispersionGridIndex(const real4* __restrict__ posq, int2* __restrict__ pmeAtomGridIndex,
+            real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
+            real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
+    // Compute the index of the grid point each atom is associated with.
+    
+    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
+        real4 pos = posq[i];
+        APPLY_PERIODIC_TO_POS(pos)
+        real3 t = make_real3(pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x,
+                             pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y,
+                             pos.z*recipBoxVecZ.z);
+        t.x = (t.x-floor(t.x))*DISPERSION_GRID_SIZE_X;
+        t.y = (t.y-floor(t.y))*DISPERSION_GRID_SIZE_Y;
+        t.z = (t.z-floor(t.z))*DISPERSION_GRID_SIZE_Z;
+        int3 gridIndex = make_int3(((int) t.x) % DISPERSION_GRID_SIZE_X,
+                                   ((int) t.y) % DISPERSION_GRID_SIZE_Y,
+                                   ((int) t.z) % DISPERSION_GRID_SIZE_Z);
+        pmeAtomGridIndex[i] = make_int2(i, gridIndex.x*DISPERSION_GRID_SIZE_Y*DISPERSION_GRID_SIZE_Z+gridIndex.y*DISPERSION_GRID_SIZE_Z+gridIndex.z);
+    }
+}
+
+extern "C" __global__ void gridSpreadC6(const real4* __restrict__ posq, real* __restrict__ originalPmeGrid,
+        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
+        real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ, const int2* __restrict__ pmeAtomGridIndex,
+        const real* __restrict__ C6s) {
+    real3 data[PME_ORDER];
+    const real scale = RECIP(PME_ORDER-1);
+    
+    // Process the atoms in spatially sorted order.  This improves efficiency when writing
+    // the grid values.
+    
+    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
+        int atom = pmeAtomGridIndex[i].x;
+        real4 pos = posq[atom];
+        APPLY_PERIODIC_TO_POS(pos)
+        real3 t = make_real3(pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x,
+                             pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y,
+                             pos.z*recipBoxVecZ.z);
+        t.x = (t.x-floor(t.x))*DISPERSION_GRID_SIZE_X;
+        t.y = (t.y-floor(t.y))*DISPERSION_GRID_SIZE_Y;
+        t.z = (t.z-floor(t.z))*DISPERSION_GRID_SIZE_Z;
+        int3 gridIndex = make_int3(((int) t.x) % DISPERSION_GRID_SIZE_X,
+                                   ((int) t.y) % DISPERSION_GRID_SIZE_Y,
+                                   ((int) t.z) % DISPERSION_GRID_SIZE_Z);
+
+        // Since we need the full set of thetas, it's faster to compute them here than load them
+        // from global memory.
+        
+        real3 dr = make_real3(t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z);
+        data[PME_ORDER-1] = make_real3(0);
+        data[1] = dr;
+        data[0] = make_real3(1)-dr;
+        for (int j = 3; j < PME_ORDER; j++) {
+            real div = RECIP(j-1);
+            data[j-1] = div*dr*data[j-2];
+            for (int k = 1; k < (j-1); k++)
+                data[j-k-1] = div*((dr+make_real3(k))*data[j-k-2] + (make_real3(j-k)-dr)*data[j-k-1]);
+            data[0] = div*(make_real3(1)-dr)*data[0];
+        }
+        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
+        for (int j = 1; j < (PME_ORDER-1); j++)
+            data[PME_ORDER-j-1] = scale*((dr+make_real3(j))*data[PME_ORDER-j-2] + (make_real3(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
+        data[0] = scale*(make_real3(1)-dr)*data[0];
+        
+        // Spread the charge from this atom onto each grid point.
+         
+        for (int ix = 0; ix < PME_ORDER; ix++) {
+            int xbase = gridIndex.x+ix;
+            xbase -= (xbase >= DISPERSION_GRID_SIZE_X ? DISPERSION_GRID_SIZE_X : 0);
+            xbase = xbase*DISPERSION_GRID_SIZE_Y*DISPERSION_GRID_SIZE_Z;
+            real dx = data[ix].x;
+            
+            for (int iy = 0; iy < PME_ORDER; iy++) {
+                int ybase = gridIndex.y+iy;
+                ybase -= (ybase >= DISPERSION_GRID_SIZE_Y ? DISPERSION_GRID_SIZE_Y : 0);
+                ybase = xbase + ybase*DISPERSION_GRID_SIZE_Z;
+                real dy = data[iy].y;
+                
+                for (int iz = 0; iz < PME_ORDER; iz++) {
+                    int zindex = gridIndex.z+iz;
+                    zindex -= (zindex >= DISPERSION_GRID_SIZE_Z ? DISPERSION_GRID_SIZE_Z : 0);
+                    int index = ybase + zindex;
+
+                    // We need to grab the C6 coefficient from the array
+                    real add = C6s[atom]*dx*dy*data[iz].z;
+#ifdef USE_DOUBLE_PRECISION
+                    unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid;
+                    atomicAdd(&ulonglong_p[index],  static_cast<unsigned long long>((long long) (add*0x100000000)));
+#elif __CUDA_ARCH__ < 200 || defined(USE_DETERMINISTIC_FORCES)
+                    unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid;
+                    int gridIndex = index;
+                    gridIndex = (gridIndex%2 == 0 ? gridIndex/2 : (gridIndex+DISPERSION_GRID_SIZE_X*DISPERSION_GRID_SIZE_Y*DISPERSION_GRID_SIZE_Z)/2);
+                    atomicAdd(&ulonglong_p[gridIndex],  static_cast<unsigned long long>((long long) (add*0x100000000)));
+#else
+                    atomicAdd(&originalPmeGrid[index], add);
+#endif
+
+                }
+            }
+        }
+    }
+}
+
+
+extern "C" __global__ void finishSpreadC6(long long* __restrict__ originalPmeGrid) {
+    real* floatGrid = (real*) originalPmeGrid;
+    const unsigned int gridSize = DISPERSION_GRID_SIZE_X*DISPERSION_GRID_SIZE_Y*DISPERSION_GRID_SIZE_Z;
+    real scale = 1.0f/(real) 0x100000000;
+#ifdef USE_DOUBLE_PRECISION
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x)
+        floatGrid[index] = scale*originalPmeGrid[index];
+#else
+    for (int index = 2*(blockIdx.x*blockDim.x+threadIdx.x); index < gridSize; index += 2*blockDim.x*gridDim.x) {
+        floatGrid[index] = scale*originalPmeGrid[index/2];
+        if (index+1 < gridSize)
+            floatGrid[index+1] = scale*originalPmeGrid[(index+gridSize+1)/2];
+    }
+#endif
+}
+
+
+// convolutes the dispersion grid on the halfcomplex_pmeGrid, which is of size NX*NY*(NZ/2+1) as F(Q) is conjugate symmetric
+extern "C" __global__ void 
+reciprocalDispersionConvolution(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__ energyBuffer, 
+                      const real* __restrict__ pmeBsplineModuliX, const real* __restrict__ pmeBsplineModuliY, const real* __restrict__ pmeBsplineModuliZ, 
+                      real4 periodicBoxSize, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
+    // R2C stores into a half complex matrix where the last dimension is cut by half
+    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*(GRID_SIZE_Z/2+1);
+    const real scaleFactor =  -2*M_PI*SQRT(M_PI)*RECIP(6*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
+
+    const real alpha = EWALD_DISPERSION_ALPHA;
+    real bfac = M_PI / alpha;
+    real fac1 = 2*M_PI*M_PI*M_PI*SQRT(M_PI);
+    real fac2 = alpha*alpha*alpha;
+    real fac3 = -2*alpha*M_PI*M_PI;
+
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) {
+        // real indices
+        int kx = index/(GRID_SIZE_Y*(GRID_SIZE_Z/2+1));
+        int remainder = index-kx*GRID_SIZE_Y*(GRID_SIZE_Z/2+1);
+        int ky = remainder/(GRID_SIZE_Z/2+1);
+        int kz = remainder-ky*(GRID_SIZE_Z/2+1);
+        int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
+        int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
+        int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
+        real mhx = mx*recipBoxVecX.x;
+        real mhy = mx*recipBoxVecY.x+my*recipBoxVecY.y;
+        real mhz = mx*recipBoxVecZ.x+my*recipBoxVecZ.y+mz*recipBoxVecZ.z;
+        real bx = pmeBsplineModuliX[kx];
+        real by = pmeBsplineModuliY[ky];
+        real bz = pmeBsplineModuliZ[kz];
+        real denom = scaleFactor/(bx*by*bz);
+        real2 grid = halfcomplex_pmeGrid[index];
+        real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
+        real m = SQRT(m2);
+        real m3 = m*m2;
+        real b = bfac*m;
+        real expfac = -b*b;
+        real expterm = EXP(expfac);
+#if FAST_ERFC
+        // This approximation for erfc is from Abramowitz and Stegun (1964) p. 299.  They cite the following as
+        // the original source: C. Hastings, Jr., Approximations for Digital Computers (1955).  It has a maximum
+        // error of 1.5e-7.  Stolen by ACS from the CUDA platform's AMOEBA plugin.
+        real t = 1.0f/(1.0f+0.3275911f*b);
+        real erfcterm = (0.254829592f+(-0.284496736f+(1.421413741f+(-1.453152027f+1.061405429f*t)*t)*t)*t)*t*expterm;
+#else
+        real erfcterm = ERFC(b);
+#endif
+        real eterm = (fac1*erfcterm*m3 + expterm*(fac2 + fac3*m2)) * denom;
+        halfcomplex_pmeGrid[index] = make_real2(grid.x*eterm, grid.y*eterm);
+    }
+}
+
+
+extern "C" __global__ void
+gridEvaluateDispersionEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__ energyBuffer,
+                      const real* __restrict__ pmeBsplineModuliX, const real* __restrict__ pmeBsplineModuliY, const real* __restrict__ pmeBsplineModuliZ,
+                      real4 periodicBoxSize, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
+    // R2C stores into a half complex matrix where the last dimension is cut by half
+    const unsigned int gridSize = DISPERSION_GRID_SIZE_X*DISPERSION_GRID_SIZE_Y*DISPERSION_GRID_SIZE_Z;
+    const real scaleFactor =  -2*M_PI*SQRT(M_PI)*RECIP(6*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
+
+    const real alpha = EWALD_DISPERSION_ALPHA;
+    real bfac = M_PI / alpha;
+    real fac1 = 2*M_PI*M_PI*M_PI*SQRT(M_PI);
+    real fac2 = alpha*alpha*alpha;
+    real fac3 = -2*alpha*M_PI*M_PI;
+
+    mixed energy = 0;
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) {
+        // real indices
+        int kx = index/(DISPERSION_GRID_SIZE_Y*(DISPERSION_GRID_SIZE_Z));
+        int remainder = index-kx*DISPERSION_GRID_SIZE_Y*(DISPERSION_GRID_SIZE_Z);
+        int ky = remainder/(DISPERSION_GRID_SIZE_Z);
+        int kz = remainder-ky*(DISPERSION_GRID_SIZE_Z);
+        int mx = (kx < (DISPERSION_GRID_SIZE_X+1)/2) ? kx : (kx-DISPERSION_GRID_SIZE_X);
+        int my = (ky < (DISPERSION_GRID_SIZE_Y+1)/2) ? ky : (ky-DISPERSION_GRID_SIZE_Y);
+        int mz = (kz < (DISPERSION_GRID_SIZE_Z+1)/2) ? kz : (kz-DISPERSION_GRID_SIZE_Z);
+        real mhx = mx*recipBoxVecX.x;
+        real mhy = mx*recipBoxVecY.x+my*recipBoxVecY.y;
+        real mhz = mx*recipBoxVecZ.x+my*recipBoxVecZ.y+mz*recipBoxVecZ.z;
+        real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
+        real bx = pmeBsplineModuliX[kx];
+        real by = pmeBsplineModuliY[ky];
+        real bz = pmeBsplineModuliZ[kz];
+        real denom = scaleFactor/(bx*by*bz);
+        real m = SQRT(m2);
+        real m3 = m*m2;
+        real b = bfac*m;
+        real expfac = -b*b;
+        real expterm = EXP(expfac);
+#if FAST_ERFC
+        // This approximation for erfc is from Abramowitz and Stegun (1964) p. 299.  They cite the following as
+        // the original source: C. Hastings, Jr., Approximations for Digital Computers (1955).  It has a maximum
+        // error of 1.5e-7.  Stolen by ACS from the CUDA platform's AMOEBA plugin.
+        real t = 1.0f/(1.0f+0.3275911f*b);
+        real erfcterm = (0.254829592f+(-0.284496736f+(1.421413741f+(-1.453152027f+1.061405429f*t)*t)*t)*t)*t*expterm;
+#else
+        real erfcterm = ERFC(b);
+#endif
+        real eterm = (fac1*erfcterm*m3 + expterm*(fac2 + fac3*m2)) * denom;
+
+        if (kz >= (DISPERSION_GRID_SIZE_Z/2+1)) {
+            kx = ((kx == 0) ? kx : DISPERSION_GRID_SIZE_X-kx);
+            ky = ((ky == 0) ? ky : DISPERSION_GRID_SIZE_Y-ky);
+            kz = DISPERSION_GRID_SIZE_Z-kz;
+        } 
+        int indexInHalfComplexGrid = kz + ky*(DISPERSION_GRID_SIZE_Z/2+1)+kx*(DISPERSION_GRID_SIZE_Y*(DISPERSION_GRID_SIZE_Z/2+1));
+        real2 grid = halfcomplex_pmeGrid[indexInHalfComplexGrid];
+        // N.B. We inlcude the 0,0,0 point for dispersion
+        energy += eterm*(grid.x*grid.x + grid.y*grid.y);
+    }
+#ifdef USE_PME_STREAM
+    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] = 0.5f*energy;
+#else
+    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += 0.5f*energy;
+#endif
+}
+
+
+extern "C" __global__
+void gridInterpolateDispersionForce(const real4* __restrict__ posq, unsigned long long* __restrict__ forceBuffers, const real* __restrict__ originalPmeGrid,
+        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
+        real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ, const int2* __restrict__ pmeAtomGridIndex, const real* __restrict__ C6s) {
+    real3 data[PME_ORDER];
+    real3 ddata[PME_ORDER];
+    const real scale = RECIP(PME_ORDER-1);
+    
+    // Process the atoms in spatially sorted order.  This improves cache performance when loading
+    // the grid values.
+    
+    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
+        int atom = pmeAtomGridIndex[i].x;
+        real3 force = make_real3(0);
+        real4 pos = posq[atom];
+        APPLY_PERIODIC_TO_POS(pos)
+        real3 t = make_real3(pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x,
+                             pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y,
+                             pos.z*recipBoxVecZ.z);
+        t.x = (t.x-floor(t.x))*DISPERSION_GRID_SIZE_X;
+        t.y = (t.y-floor(t.y))*DISPERSION_GRID_SIZE_Y;
+        t.z = (t.z-floor(t.z))*DISPERSION_GRID_SIZE_Z;
+        int3 gridIndex = make_int3(((int) t.x) % DISPERSION_GRID_SIZE_X,
+                                   ((int) t.y) % DISPERSION_GRID_SIZE_Y,
+                                   ((int) t.z) % DISPERSION_GRID_SIZE_Z);
+        // Since we need the full set of thetas, it's faster to compute them here than load them
+        // from global memory.
+        
+        real3 dr = make_real3(t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z);
+        data[PME_ORDER-1] = make_real3(0);
+        data[1] = dr;
+        data[0] = make_real3(1)-dr;
+        for (int j = 3; j < PME_ORDER; j++) {
+            real div = RECIP(j-1);
+            data[j-1] = div*dr*data[j-2];
+            for (int k = 1; k < (j-1); k++)
+                data[j-k-1] = div*((dr+make_real3(k))*data[j-k-2] + (make_real3(j-k)-dr)*data[j-k-1]);
+            data[0] = div*(make_real3(1)-dr)*data[0];
+        }
+        ddata[0] = -data[0];
+        for (int j = 1; j < PME_ORDER; j++)
+            ddata[j] = data[j-1]-data[j];
+        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
+        for (int j = 1; j < (PME_ORDER-1); j++)
+            data[PME_ORDER-j-1] = scale*((dr+make_real3(j))*data[PME_ORDER-j-2] + (make_real3(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
+        data[0] = scale*(make_real3(1)-dr)*data[0];
+
+        
+        // Compute the force on this atom.
+         
+        for (int ix = 0; ix < PME_ORDER; ix++) {
+            int xbase = gridIndex.x+ix;
+            xbase -= (xbase >= DISPERSION_GRID_SIZE_X ? DISPERSION_GRID_SIZE_X : 0);
+            xbase = xbase*DISPERSION_GRID_SIZE_Y*DISPERSION_GRID_SIZE_Z;
+            real dx = data[ix].x;
+            real ddx = ddata[ix].x;
+            
+            for (int iy = 0; iy < PME_ORDER; iy++) {
+                int ybase = gridIndex.y+iy;
+                ybase -= (ybase >= DISPERSION_GRID_SIZE_Y ? DISPERSION_GRID_SIZE_Y : 0);
+                ybase = xbase + ybase*DISPERSION_GRID_SIZE_Z;
+                real dy = data[iy].y;
+                real ddy = ddata[iy].y;
+                
+                for (int iz = 0; iz < PME_ORDER; iz++) {
+                    int zindex = gridIndex.z+iz;
+                    zindex -= (zindex >= DISPERSION_GRID_SIZE_Z ? DISPERSION_GRID_SIZE_Z : 0);
+                    int index = ybase + zindex;
+                    real gridvalue = originalPmeGrid[index];
+                    force.x += ddx*dy*data[iz].z*gridvalue;
+                    force.y += dx*ddy*data[iz].z*gridvalue;
+                    force.z += dx*dy*ddata[iz].z*gridvalue;
+                }
+            }
+        }
+        real q = C6s[atom];
+        real forceX = -q*(force.x*DISPERSION_GRID_SIZE_X*recipBoxVecX.x);
+        real forceY = -q*(force.x*DISPERSION_GRID_SIZE_X*recipBoxVecY.x+force.y*DISPERSION_GRID_SIZE_Y*recipBoxVecY.y);
+        real forceZ = -q*(force.x*DISPERSION_GRID_SIZE_X*recipBoxVecZ.x+force.y*DISPERSION_GRID_SIZE_Y*recipBoxVecZ.y+force.z*DISPERSION_GRID_SIZE_Z*recipBoxVecZ.z);
+        atomicAdd(&forceBuffers[atom], static_cast<unsigned long long>((long long) (forceX*0x100000000)));
+        atomicAdd(&forceBuffers[atom+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (forceY*0x100000000)));
+        atomicAdd(&forceBuffers[atom+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (forceZ*0x100000000)));
+    }
+}
+
--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -2209,12 +2209,13 @@ void OpenCLCalcNonbondedForceKernel::getLJPMEParameters(double& alpha, int& nx,
    if (nonbondedMethod != LJPME)
        throw OpenMMException("getPMEParametersInContext: This Context is not using PME");
    if (cl.getPlatformData().useCpuPme)
-        cpuPme.getAs<CalcPmeReciprocalForceKernel>().getPMEParameters(alpha, nx, ny, nz);
+        //cpuPme.getAs<CalcPmeReciprocalForceKernel>().getLJPMEParameters(alpha, nx, ny, nz);
+        throw OpenMMException("getPMEParametersInContext: CPUPME has not been implemented for LJPME yet.");
    else {
-        alpha = this->alpha;
-        nx = gridSizeX;
-        ny = gridSizeY;
-        nz = gridSizeZ;
+        alpha = this->dispersionAlpha;
+        nx = dispersionGridSizeX;
+        ny = dispersionGridSizeY;
+        nz = dispersionGridSizeZ;
    }
 }


--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
@@ -1155,7 +1155,7 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
            NonbondedForce nb;
            nb.setEwaldErrorTolerance(force.getEwaldErrorTolerance());
            nb.setCutoffDistance(force.getCutoffDistance());
-            NonbondedForceImpl::calcPMEParameters(system, nb, alpha, gridSizeX, gridSizeY, gridSizeZ);
+            NonbondedForceImpl::calcPMEParameters(system, nb, alpha, gridSizeX, gridSizeY, gridSizeZ, false);
            gridSizeX = CudaFFT3D::findLegalDimension(gridSizeX);
            gridSizeY = CudaFFT3D::findLegalDimension(gridSizeY);
            gridSizeZ = CudaFFT3D::findLegalDimension(gridSizeZ);

--- a/plugins/cpupme/src/CpuPmeKernelFactory.cpp
+++ b/plugins/cpupme/src/CpuPmeKernelFactory.cpp
@@ -55,5 +55,7 @@ extern "C" OPENMM_EXPORT_PME void registerPlatforms() {
 KernelImpl* CpuPmeKernelFactory::createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const {
    if (name == CalcPmeReciprocalForceKernel::Name())
        return new CpuCalcPmeReciprocalForceKernel(name, platform);
+    if (name == CalcDispersionPmeReciprocalForceKernel::Name())
+        return new CpuCalcDispersionPmeReciprocalForceKernel(name, platform);
    throw OpenMMException((std::string("Tried to create kernel with illegal kernel name '")+name+"'").c_str());
 }
--- a/plugins/cpupme/src/CpuPmeKernels.cpp
+++ b/plugins/cpupme/src/CpuPmeKernels.cpp
@@ -48,8 +48,8 @@ using namespace std;

 static const int PME_ORDER = 5;

-bool CpuCalcPmeReciprocalForceKernel::hasInitializedThreads = false;
-int CpuCalcPmeReciprocalForceKernel::numThreads = 0;
+bool CpuCalcDispersionPmeReciprocalForceKernel::hasInitializedThreads = false;
+int CpuCalcDispersionPmeReciprocalForceKernel::numThreads = 0;

 static void spreadCharge(float* posq, float* grid, int gridx, int gridy, int gridz, int numParticles, Vec3* periodicBoxVectors, Vec3* recipBoxVectors, gmx_atomic_t& atomicCounter, const float epsilonFactor) {
    float temp[4];
@@ -590,7 +590,7 @@ void CpuCalcPmeReciprocalForceKernel::runWorkerThread(ThreadPool& threads, int i
    int complexSize = gridx*gridy*(gridz/2+1);
    int complexStart = std::max(1, ((index*complexSize)/numThreads));
    int complexEnd = (((index+1)*complexSize)/numThreads);
-    const float epsilonFactor = calculationType==Electrostatic ? sqrt(ONE_4PI_EPS0) : 1.0f;
+    const float epsilonFactor = sqrt(ONE_4PI_EPS0);
    spreadCharge(posq, tempGrid[index], gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors, atomicCounter, epsilonFactor);
    threads.syncThreads();
    int numGrids = tempGrid.size();
@@ -601,37 +601,16 @@ void CpuCalcPmeReciprocalForceKernel::runWorkerThread(ThreadPool& threads, int i
        sum.store(&realGrid[i]);
    }
    threads.syncThreads();
-    switch(calculationType){
-    case Electrostatic:
-        if (lastBoxVectors[0] != periodicBoxVectors[0] || lastBoxVectors[1] != periodicBoxVectors[1] || lastBoxVectors[2] != periodicBoxVectors[2]) {
-            computeReciprocalEterm(gridxStart, gridxEnd, gridx, gridy, gridz, recipEterm, alpha, bsplineModuli, periodicBoxVectors, recipBoxVectors);
-            threads.syncThreads();
-        }
-        if (includeEnergy) {
-            threadEnergy[index] = reciprocalEnergy(gridxStart, gridxEnd, complexGrid, recipEterm, gridx, gridy, gridz, alpha, bsplineModuli, periodicBoxVectors, recipBoxVectors);
-            threads.syncThreads();
-        }
-        reciprocalConvolution(complexStart, complexEnd, complexGrid, recipEterm);
+    if (lastBoxVectors[0] != periodicBoxVectors[0] || lastBoxVectors[1] != periodicBoxVectors[1] || lastBoxVectors[2] != periodicBoxVectors[2]) {
+        computeReciprocalEterm(gridxStart, gridxEnd, gridx, gridy, gridz, recipEterm, alpha, bsplineModuli, periodicBoxVectors, recipBoxVectors);
        threads.syncThreads();
-        break;
-    case Dispersion:
-        if (lastBoxVectors[0] != periodicBoxVectors[0] || lastBoxVectors[1] != periodicBoxVectors[1] || lastBoxVectors[2] != periodicBoxVectors[2]) {
-            computeReciprocalDispersionEterm(gridxStart, gridxEnd, gridx, gridy, gridz, recipEterm, alpha, bsplineModuli, periodicBoxVectors, recipBoxVectors);
-            threads.syncThreads();
-        }
-        if (includeEnergy) {
-            threadEnergy[index] = reciprocalDispersionEnergy(gridxStart, gridxEnd, complexGrid, recipEterm, gridx, gridy, gridz, alpha, bsplineModuli, periodicBoxVectors, recipBoxVectors);
-            threads.syncThreads();
-        }
-        // For dispersion, we include the {0,0,0} term, so the start point needs to be redefined
-        complexStart = (index*complexSize)/numThreads;
-        reciprocalConvolution(complexStart, complexEnd, complexGrid, recipEterm);
+    }
+    if (includeEnergy) {
+        threadEnergy[index] = reciprocalEnergy(gridxStart, gridxEnd, complexGrid, recipEterm, gridx, gridy, gridz, alpha, bsplineModuli, periodicBoxVectors, recipBoxVectors);
        threads.syncThreads();
-        break;
-    default:
-        throw OpenMMException("Unimplemented convolution type");
    }
-
+    reciprocalConvolution(complexStart, complexEnd, complexGrid, recipEterm);
+    threads.syncThreads();
    interpolateForces(posq, &force[0], realGrid, gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors, atomicCounter, epsilonFactor);
 }

@@ -702,3 +681,296 @@ int CpuCalcPmeReciprocalForceKernel::findFFTDimension(int minimum, bool isZ) {
        minimum++;
    }
 }
+
+/*
+ * Everything below here is just a clone of the above, but to handle the dispersion term
+ * instead of electrostatics.
+ */
+
+bool CpuCalcPmeReciprocalForceKernel::hasInitializedThreads = false;
+int CpuCalcPmeReciprocalForceKernel::numThreads = 0;
+
+
+class CpuCalcDispersionPmeReciprocalForceKernel::ComputeTask : public ThreadPool::Task {
+public:
+    ComputeTask(CpuCalcDispersionPmeReciprocalForceKernel& owner) : owner(owner) {
+    }
+    void execute(ThreadPool& threads, int threadIndex) {
+        owner.runWorkerThread(threads, threadIndex);
+    }
+    CpuCalcDispersionPmeReciprocalForceKernel& owner;
+};
+
+static void* dispersionThreadBody(void* args) {
+    CpuCalcDispersionPmeReciprocalForceKernel& owner = *reinterpret_cast<CpuCalcDispersionPmeReciprocalForceKernel*>(args);
+    owner.runMainThread();
+    return 0;
+}
+
+void CpuCalcDispersionPmeReciprocalForceKernel::initialize(int xsize, int ysize, int zsize, int numParticles, double alpha) {
+    if (!hasInitializedThreads) {
+        numThreads = getNumProcessors();
+        char* threadsEnv = getenv("OPENMM_CPU_THREADS");
+        if (threadsEnv != NULL)
+            stringstream(threadsEnv) >> numThreads;
+        fftwf_init_threads();
+        hasInitializedThreads = true;
+    }
+    threadEnergy.resize(numThreads);
+    gridx = findFFTDimension(xsize, false);
+    gridy = findFFTDimension(ysize, false);
+    gridz = findFFTDimension(zsize, true);
+    this->numParticles = numParticles;
+    this->alpha = alpha;
+    force.resize(4*numParticles);
+    recipEterm.resize(gridx*gridy*gridz);
+    
+    // Initialize threads.
+    
+    isFinished = false;
+    pthread_cond_init(&startCondition, NULL);
+    pthread_cond_init(&endCondition, NULL);
+    pthread_mutex_init(&lock, NULL);
+    pthread_create(&mainThread, NULL, dispersionThreadBody, this);
+    
+    // Wait until the main thread is up and running.
+    
+    pthread_mutex_lock(&lock);
+    while (!isFinished)
+        pthread_cond_wait(&endCondition, &lock);
+    pthread_mutex_unlock(&lock);
+    
+    // Initialize FFTW.
+    
+    for (int i = 0; i < numThreads; i++)
+        tempGrid.push_back((float*) fftwf_malloc(sizeof(float)*(gridx*gridy*gridz+3)));
+    realGrid = tempGrid[0];
+    complexGrid = (fftwf_complex*) fftwf_malloc(sizeof(fftwf_complex)*gridx*gridy*(gridz/2+1));
+    fftwf_plan_with_nthreads(numThreads);
+    forwardFFT = fftwf_plan_dft_r2c_3d(gridx, gridy, gridz, realGrid, complexGrid, FFTW_MEASURE);
+    backwardFFT = fftwf_plan_dft_c2r_3d(gridx, gridy, gridz, complexGrid, realGrid, FFTW_MEASURE);
+    hasCreatedPlan = true;
+    
+    // Initialize the b-spline moduli.
+
+    int maxSize = std::max(std::max(gridx, gridy), gridz);
+    vector<double> data(PME_ORDER);
+    vector<double> ddata(PME_ORDER);
+    vector<double> bsplinesData(maxSize);
+    data[PME_ORDER-1] = 0.0;
+    data[1] = 0.0;
+    data[0] = 1.0;
+    for (int i = 3; i < PME_ORDER; i++) {
+        double div = 1.0/(i-1.0);
+        data[i-1] = 0.0;
+        for (int j = 1; j < (i-1); j++)
+            data[i-j-1] = div*(j*data[i-j-2]+(i-j)*data[i-j-1]);
+        data[0] = div*data[0];
+    }
+
+    // Differentiate.
+
+    ddata[0] = -data[0];
+    for (int i = 1; i < PME_ORDER; i++)
+        ddata[i] = data[i-1]-data[i];
+    double div = 1.0/(PME_ORDER-1);
+    data[PME_ORDER-1] = 0.0;
+    for (int i = 1; i < (PME_ORDER-1); i++)
+        data[PME_ORDER-i-1] = div*(i*data[PME_ORDER-i-2]+(PME_ORDER-i)*data[PME_ORDER-i-1]);
+    data[0] = div*data[0];
+    for (int i = 0; i < maxSize; i++)
+        bsplinesData[i] = 0.0;
+    for (int i = 1; i <= PME_ORDER; i++)
+        bsplinesData[i] = data[i-1];
+
+    // Evaluate the actual bspline moduli for X/Y/Z.
+
+    bsplineModuli[0].resize(gridx);
+    bsplineModuli[1].resize(gridy);
+    bsplineModuli[2].resize(gridz);
+    for (int dim = 0; dim < 3; dim++) {
+        int ndata = bsplineModuli[dim].size();
+        vector<float>& moduli = bsplineModuli[dim];
+        for (int i = 0; i < ndata; i++) {
+            double sc = 0.0;
+            double ss = 0.0;
+            for (int j = 0; j < ndata; j++) {
+                double arg = (2.0*M_PI*i*j)/ndata;
+                sc += bsplinesData[j]*cos(arg);
+                ss += bsplinesData[j]*sin(arg);
+            }
+            moduli[i] = (float) (sc*sc+ss*ss);
+        }
+        for (int i = 0; i < ndata; i++)
+            if (moduli[i] < 1.0e-7f)
+                moduli[i] = (moduli[i-1]+moduli[i+1])*0.5f;
+    }
+}
+
+CpuCalcDispersionPmeReciprocalForceKernel::~CpuCalcDispersionPmeReciprocalForceKernel() {
+    isDeleted = true;
+    pthread_mutex_lock(&lock);
+    pthread_cond_broadcast(&startCondition);
+    pthread_mutex_unlock(&lock);
+    pthread_join(mainThread, NULL);
+    pthread_mutex_destroy(&lock);
+    pthread_cond_destroy(&startCondition);
+    pthread_cond_destroy(&endCondition);
+    for (int i = 0; i < (int) tempGrid.size(); i++)
+        fftwf_free(tempGrid[i]);
+    if (complexGrid != NULL)
+        fftwf_free(complexGrid);
+    if (hasCreatedPlan) {
+        fftwf_destroy_plan(forwardFFT);
+        fftwf_destroy_plan(backwardFFT);
+    }
+}
+
+void CpuCalcDispersionPmeReciprocalForceKernel::runMainThread() {
+    // This is the main thread that coordinates all the other ones.
+
+    pthread_mutex_lock(&lock);
+    isFinished = true;
+    pthread_cond_signal(&endCondition);
+    ThreadPool threads(numThreads);
+    while (true) {
+        // Wait for the signal to start.
+
+        pthread_cond_wait(&startCondition, &lock);
+        if (isDeleted)
+            break;
+        posq = io->getPosq();
+        ComputeTask task(*this);
+        gmx_atomic_set(&atomicCounter, 0);
+        threads.execute(task); // Signal threads to perform charge spreading.
+        threads.waitForThreads();
+        threads.resumeThreads(); // Signal threads to sum the charge grids.
+        threads.waitForThreads();
+        fftwf_execute_dft_r2c(forwardFFT, realGrid, complexGrid);
+        if (lastBoxVectors[0] != periodicBoxVectors[0] || lastBoxVectors[1] != periodicBoxVectors[1] || lastBoxVectors[2] != periodicBoxVectors[2]) {
+            threads.resumeThreads(); // Signal threads to compute the reciprocal scale factors.
+            threads.waitForThreads();
+        }
+        if (includeEnergy) {
+            threads.resumeThreads(); // Signal threads to compute energy.
+            threads.waitForThreads();
+            for (int i = 0; i < (int) threadEnergy.size(); i++)
+                energy += threadEnergy[i];
+        }
+        threads.resumeThreads(); // Signal threads to perform reciprocal convolution.
+        threads.waitForThreads();
+        fftwf_execute_dft_c2r(backwardFFT, complexGrid, realGrid);
+        gmx_atomic_set(&atomicCounter, 0);
+        threads.resumeThreads(); // Signal threads to interpolate forces.
+        threads.waitForThreads();
+        isFinished = true;
+        lastBoxVectors[0] = periodicBoxVectors[0];
+        lastBoxVectors[1] = periodicBoxVectors[1];
+        lastBoxVectors[2] = periodicBoxVectors[2];
+        pthread_cond_signal(&endCondition);
+    }
+    pthread_mutex_unlock(&lock);
+}
+
+void CpuCalcDispersionPmeReciprocalForceKernel::runWorkerThread(ThreadPool& threads, int index) {
+    int gridxStart = (index*gridx)/numThreads;
+    int gridxEnd = ((index+1)*gridx)/numThreads;
+    int gridSize = (gridx*gridy*gridz+3)/4;
+    int gridStart = 4*((index*gridSize)/numThreads);
+    int gridEnd = 4*(((index+1)*gridSize)/numThreads);
+    int complexSize = gridx*gridy*(gridz/2+1);
+    int complexStart = std::max(1, ((index*complexSize)/numThreads));
+    int complexEnd = (((index+1)*complexSize)/numThreads);
+    const float epsilonFactor = 1.0f;
+    spreadCharge(posq, tempGrid[index], gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors, atomicCounter, epsilonFactor);
+    threads.syncThreads();
+    int numGrids = tempGrid.size();
+    for (int i = gridStart; i < gridEnd; i += 4) {
+        fvec4 sum(&realGrid[i]);
+        for (int j = 1; j < numGrids; j++)
+            sum += fvec4(&tempGrid[j][i]);
+        sum.store(&realGrid[i]);
+    }
+    threads.syncThreads();
+    if (lastBoxVectors[0] != periodicBoxVectors[0] || lastBoxVectors[1] != periodicBoxVectors[1] || lastBoxVectors[2] != periodicBoxVectors[2]) {
+        computeReciprocalDispersionEterm(gridxStart, gridxEnd, gridx, gridy, gridz, recipEterm, alpha, bsplineModuli, periodicBoxVectors, recipBoxVectors);
+        threads.syncThreads();
+    }
+    if (includeEnergy) {
+        threadEnergy[index] = reciprocalDispersionEnergy(gridxStart, gridxEnd, complexGrid, recipEterm, gridx, gridy, gridz, alpha, bsplineModuli, periodicBoxVectors, recipBoxVectors);
+        threads.syncThreads();
+    }
+    // For dispersion, we include the {0,0,0} term, so the start point needs to be redefined
+    complexStart = (index*complexSize)/numThreads;
+    reciprocalConvolution(complexStart, complexEnd, complexGrid, recipEterm);
+    threads.syncThreads();
+    interpolateForces(posq, &force[0], realGrid, gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors, atomicCounter, epsilonFactor);
+}
+
+void CpuCalcDispersionPmeReciprocalForceKernel::beginComputation(IO& io, const Vec3* periodicBoxVectors, bool includeEnergy) {
+    this->io = &io;
+    this->periodicBoxVectors[0] = periodicBoxVectors[0];
+    this->periodicBoxVectors[1] = periodicBoxVectors[1];
+    this->periodicBoxVectors[2] = periodicBoxVectors[2];
+    this->includeEnergy = includeEnergy;
+    energy = 0.0;
+
+    // Invert the box vectors.
+
+    double determinant = periodicBoxVectors[0][0]*periodicBoxVectors[1][1]*periodicBoxVectors[2][2];
+    double scale = 1.0/determinant;
+    recipBoxVectors[0] = Vec3(periodicBoxVectors[1][1]*periodicBoxVectors[2][2], 0, 0)*scale;
+    recipBoxVectors[1] = Vec3(-periodicBoxVectors[1][0]*periodicBoxVectors[2][2], periodicBoxVectors[0][0]*periodicBoxVectors[2][2], 0)*scale;
+    recipBoxVectors[2] = Vec3(periodicBoxVectors[1][0]*periodicBoxVectors[2][1]-periodicBoxVectors[1][1]*periodicBoxVectors[2][0], -periodicBoxVectors[0][0]*periodicBoxVectors[2][1], periodicBoxVectors[0][0]*periodicBoxVectors[1][1])*scale;
+
+    // Do the calculation.
+
+    pthread_mutex_lock(&lock);
+    isFinished = false;
+    pthread_cond_signal(&startCondition);
+    pthread_mutex_unlock(&lock);
+}
+
+double CpuCalcDispersionPmeReciprocalForceKernel::finishComputation(IO& io) {
+    pthread_mutex_lock(&lock);
+    while (!isFinished) {
+        pthread_cond_wait(&endCondition, &lock);
+    }
+    pthread_mutex_unlock(&lock);
+    io.setForce(&force[0]);
+    return energy;
+}
+
+bool CpuCalcDispersionPmeReciprocalForceKernel::isProcessorSupported() {
+    return isVec4Supported();
+}
+
+void CpuCalcDispersionPmeReciprocalForceKernel::getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
+    alpha = this->alpha;
+    nx = gridx;
+    ny = gridy;
+    nz = gridz;
+}
+
+int CpuCalcDispersionPmeReciprocalForceKernel::findFFTDimension(int minimum, bool isZ) {
+    if (minimum < 1)
+        return 1;
+    while (true) {
+        // Attempt to factor the current value.
+
+        if (isZ && minimum%2 == 1) {
+            // Force the last dimension to be even, since this produces better performance in FFTW.
+
+            minimum++;
+            continue;
+        }
+        int unfactored = minimum;
+        for (int factor = 2; factor < 8; factor++) {
+            while (unfactored > 1 && unfactored%factor == 0)
+                unfactored /= factor;
+        }
+        if (unfactored == 1 || unfactored == 11 || unfactored == 13)
+            return minimum;
+        minimum++;
+    }
+}
--- a/plugins/cpupme/src/CpuPmeKernels.h
+++ b/plugins/cpupme/src/CpuPmeKernels.h
@@ -51,10 +51,8 @@ namespace OpenMM {

 class OPENMM_EXPORT_PME CpuCalcPmeReciprocalForceKernel : public CalcPmeReciprocalForceKernel {
 public:
-    enum CalculationType { Electrostatic=0, Dispersion=1 };
-
    CpuCalcPmeReciprocalForceKernel(std::string name, const Platform& platform) : CalcPmeReciprocalForceKernel(name, platform),
-            hasCreatedPlan(false), isDeleted(false), realGrid(NULL), complexGrid(NULL), calculationType(Electrostatic) {
+            hasCreatedPlan(false), isDeleted(false), realGrid(NULL), complexGrid(NULL) {
    }
    /**
     * Initialize the kernel.
@@ -103,11 +101,98 @@ public:
     * @param nz      the number of grid points along the Z axis
     */
    void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
+private:
+    class ComputeTask;
+    /**
+     * Select a size for one grid dimension that FFTW can handle efficiently.
+     */
+    int findFFTDimension(int minimum, bool isZ);
+    static bool hasInitializedThreads;
+    static int numThreads;
+    int gridx, gridy, gridz, numParticles;
+    double alpha;
+    bool hasCreatedPlan, isFinished, isDeleted;
+    std::vector<float> force;
+    std::vector<float> bsplineModuli[3];
+    std::vector<float> recipEterm;
+    Vec3 lastBoxVectors[3];
+    std::vector<float> threadEnergy;
+    std::vector<float*> tempGrid;
+    float* realGrid;
+    fftwf_complex* complexGrid;
+    fftwf_plan forwardFFT, backwardFFT;
+    int waitCount;
+    pthread_cond_t startCondition, endCondition;
+    pthread_mutex_t lock;
+    pthread_t mainThread;
+    // The following variables are used to store information about the calculation currently being performed.
+    IO* io;
+    float energy;
+    float* posq;
+    Vec3 periodicBoxVectors[3], recipBoxVectors[3];
+    bool includeEnergy;
+    gmx_atomic_t atomicCounter;
+};
+
+
+
+/**
+ * This is an optimized CPU implementation of CalcDispersionPmeReciprocalForceKernel.  It is both
+ * vectorized (requiring SSE 4.1) and multithreaded.  It uses FFTW to perform the FFTs.
+ */
+
+class OPENMM_EXPORT_PME CpuCalcDispersionPmeReciprocalForceKernel : public CalcPmeReciprocalForceKernel {
+public:
+    CpuCalcDispersionPmeReciprocalForceKernel(std::string name, const Platform& platform) : CalcPmeReciprocalForceKernel(name, platform),
+            hasCreatedPlan(false), isDeleted(false), realGrid(NULL), complexGrid(NULL)  {
+    }
+    /**
+     * Initialize the kernel.
+     * 
+     * @param gridx        the x size of the PME grid
+     * @param gridy        the y size of the PME grid
+     * @param gridz        the z size of the PME grid
+     * @param numParticles the number of particles in the system
+     * @param alpha        the Ewald blending parameter
+     */
+    void initialize(int xsize, int ysize, int zsize, int numParticles, double alpha);
+    ~CpuCalcDispersionPmeReciprocalForceKernel();
+    /**
+     * Begin computing the force and energy.
+     * 
+     * @param io                  an object that coordinates data transfer
+     * @param periodicBoxVectors  the vectors defining the periodic box (measured in nm)
+     * @param includeEnergy       true if potential energy should be computed
+     */
+    void beginComputation(IO& io, const Vec3* periodicBoxVectors, bool includeEnergy);
    /**
-     * Sets the type of reciprocal space computation to perform (Electrostatic or Dispersion).
-     * @param type The type of computation
+     * Finish computing the force and energy.
+     * 
+     * @param io   an object that coordinates data transfer
+     * @return the potential energy due to the PME reciprocal space interactions
     */
-    void setCalculationType(CalculationType type) { calculationType = type; }
+    double finishComputation(IO& io);
+    /**
+     * This routine contains the code executed by the main thread.
+     */
+    void runMainThread();
+    /**
+     * This routine contains the code executed by each worker thread.
+     */
+    void runWorkerThread(ThreadPool& threads, int index);
+    /**
+     * Get whether the current CPU supports all features needed by this kernel.
+     */
+    static bool isProcessorSupported();
+    /**
+     * Get the parameters being used for PME.
+     * 
+     * @param alpha   the separation parameter
+     * @param nx      the number of grid points along the X axis
+     * @param ny      the number of grid points along the Y axis
+     * @param nz      the number of grid points along the Z axis
+     */
+    void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
 private:
    class ComputeTask;
    /**
@@ -138,7 +223,6 @@ private:
    float* posq;
    Vec3 periodicBoxVectors[3], recipBoxVectors[3];
    bool includeEnergy;
-    CalculationType calculationType;
    gmx_atomic_t atomicCounter;
 };


--- a/plugins/cpupme/tests/TestCpuPme.cpp
+++ b/plugins/cpupme/tests/TestCpuPme.cpp
@@ -523,8 +523,7 @@ void test_water2_dpme_energies_forces_no_exclusions() {
    const vector<Vec3>& refforces = state.getForces();

    // Optimized CPU calculation
-    CpuCalcPmeReciprocalForceKernel pme(CalcPmeReciprocalForceKernel::Name(), platform);
-    pme.setCalculationType(CpuCalcPmeReciprocalForceKernel::Dispersion);
+    CpuCalcDispersionPmeReciprocalForceKernel pme(CalcPmeReciprocalForceKernel::Name(), platform);
    IO io;
    double selfEwaldEnergy = 0;
    double dalpha6 = pow(dalpha, 6.0);

--- a/tests/TestNonbondedForce.h
+++ b/tests/TestNonbondedForce.h
@@ -1695,8 +1695,10 @@ void test_water125_dpme_vs_long_cutoff_with_exclusions() {
    ASSERT_EQUAL_TOL(refenergy, energy, 5E-4);
    ASSERT_EQUAL_TOL(gromacs_energy, energy, 5E-5);

+    // Forces accumulated in single precision are tested to a more permissive criterion; the double
+    // precision platform can match to 5E-5.
    for(int n = 0; n < NATOMS; ++n)
-        ASSERT_EQUAL_VEC(refforces[n], forces[n], 5E-5);
+        ASSERT_EQUAL_VEC(refforces[n], forces[n], 1E-4);

 }