Beginnings of the CUDA implementation of PME

a4b2a13b · Peter Eastman · e009228a · a4b2a13b · a4b2a13b · a4b2a13b
Commit a4b2a13b authored Aug 27, 2009 by Peter Eastman
10 changed files
--- a/platforms/cuda/sharedTarget/CMakeLists.txt
+++ b/platforms/cuda/sharedTarget/CMakeLists.txt
@@ -17,7 +17,7 @@ CUDA_INCLUDE_DIRECTORIES(BEFORE ${CMAKE_SOURCE_DIR}/jama/include)
 CUDA_ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
-TARGET_LINK_LIBRARIES(${SHARED_TARGET} debug ${OPENMM_LIBRARY_NAME}_d optimized ${OPENMM_LIBRARY_NAME} cudpp cutil)
+TARGET_LINK_LIBRARIES(${SHARED_TARGET} debug ${OPENMM_LIBRARY_NAME}_d optimized ${OPENMM_LIBRARY_NAME} cudpp cutil ${CUFFT_TARGET_LINK})
 SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "-DOPENMM_BUILDING_SHARED_LIBRARY")
 INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET})
--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -298,7 +298,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
            gpuSetPeriodicBoxSize(gpu, (float)boxVectors[0][0], (float)boxVectors[1][1], (float)boxVectors[2][2]);
            method = PERIODIC;
        }
-        if (force.getNonbondedMethod() == NonbondedForce::Ewald) {
+        if (force.getNonbondedMethod() == NonbondedForce::Ewald || force.getNonbondedMethod() == NonbondedForce::PME) {
            Vec3 boxVectors[3];
            force.getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
            gpuSetPeriodicBoxSize(gpu, (float)boxVectors[0][0], (float)boxVectors[1][1], (float)boxVectors[2][2]);
@@ -308,17 +308,23 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
            double my = boxVectors[1][1]/force.getCutoffDistance();
            double mz = boxVectors[2][2]/force.getCutoffDistance();
            double pi = 3.1415926535897932385;
-            int kmaxx = (int)std::ceil(-(mx/pi)*std::log(ewaldErrorTol));
+            if (force.getNonbondedMethod() == NonbondedForce::Ewald) {
-            int kmaxy = (int)std::ceil(-(my/pi)*std::log(ewaldErrorTol));
+                int kmaxx = (int)std::ceil(-(mx/pi)*std::log(ewaldErrorTol));
-            int kmaxz = (int)std::ceil(-(mz/pi)*std::log(ewaldErrorTol));
+                int kmaxy = (int)std::ceil(-(my/pi)*std::log(ewaldErrorTol));
-            if (kmaxx%2 == 0)
+                int kmaxz = (int)std::ceil(-(mz/pi)*std::log(ewaldErrorTol));
-                kmaxx++;
+                if (kmaxx%2 == 0)
-            if (kmaxy%2 == 0)
+                    kmaxx++;
-                kmaxy++;
+                if (kmaxy%2 == 0)
-            if (kmaxz%2 == 0)
+                    kmaxy++;
-                kmaxz++;
+                if (kmaxz%2 == 0)
-            gpuSetEwaldParameters(gpu, (float)alpha, kmaxx, kmaxy, kmaxz);
+                    kmaxz++;
-            method = EWALD;
+                gpuSetEwaldParameters(gpu, (float) alpha, kmaxx, kmaxy, kmaxz);
+                method = EWALD;
+            }
+            else {
+                gpuSetPMEParameters(gpu, (float) alpha);
+                method = PARTICLE_MESH_EWALD;
+            }
        }
        data.nonbondedMethod = method;
        gpuSetCoulombParameters(gpu, 138.935485f, particle, c6, c12, q, symbol, exclusionList, method);

--- a/platforms/cuda/src/kernels/cudaKernels.h
+++ b/platforms/cuda/src/kernels/cudaKernels.h
@@ -77,6 +77,8 @@ extern void SetCalculateObcGbsaForces2Sim(gpuContext gpu);
 extern void GetCalculateObcGbsaForces2Sim(gpuContext gpu);
 extern void SetCalculateAndersenThermostatSim(gpuContext gpu);
 extern void GetCalculateAndersenThermostatSim(gpuContext gpu);
+extern void SetCalculatePMESim(gpuContext gpu);
+extern void GetCalculatePMESim(gpuContext gpu);
 extern void SetForcesSim(gpuContext gpu);
 extern void GetForcesSim(gpuContext gpu);
 extern void SetShakeHSim(gpuContext gpu);

--- a/platforms/cuda/src/kernels/cudatypes.h
+++ b/platforms/cuda/src/kernels/cudatypes.h
@@ -35,6 +35,7 @@
 #include <string>
 #include <cuda.h>
 #include <cuda_runtime_api.h>
+#include <cufft.h>
 #include <builtin_types.h>
 #include <vector_functions.h>
@@ -236,12 +237,15 @@ static const unsigned int MAX_TABULATED_FUNCTIONS = 4;
 static const float PI = 3.14159265358979323846f;
+static const int PME_ORDER = 4;
 enum CudaNonbondedMethod
 {
    NO_CUTOFF,
    CUTOFF,
    PERIODIC,
-    EWALD
+    EWALD,
+    PARTICLE_MESH_EWALD
 };
 enum ExpressionOp {
@@ -354,6 +358,13 @@ struct cudaGmxSimulation {
    float4*         pTabulatedFunctionCoefficients[MAX_TABULATED_FUNCTIONS]; // The spline coefficients for each tabulated function
    float4*         pTabulatedFunctionParams;       // The min, max, and spacing for each tabulated function
    float2*         pEwaldCosSinSum;                // Pointer to the cos/sin sums (ewald)
+    int3            pmeGridSize;                    // The dimensions of the grid for particle mesh Ewald
+    cufftComplex*   pPmeGrid;                       // Grid points for particle mesh Ewald
+    float*          pPmeBsplineModuli[3];
+    float4*         pPmeBsplineTheta;
+    float4*         pPmeBsplineDtheta;
+    int4*           pPmeParticleIndex;              // The grid indices for each atom
+    float4*         pPmeParticleFraction;           // Fractional offset in the grid for each atom in all three dimensions.
    unsigned int    bonds;                          // Number of bonds
    int4*           pBondID;                        // Bond atom and output buffer IDs
    float2*         pBondParameter;                 // Bond parameters

--- a/platforms/cuda/src/kernels/gpu.cpp
+++ b/platforms/cuda/src/kernels/gpu.cpp
@@ -757,6 +757,31 @@ void gpuSetEwaldParameters(gpuContext gpu, float alpha, int kmaxx, int kmaxy, in
    gpu->sim.pEwaldCosSinSum    = gpu->psEwaldCosSinSum->_pDevStream[0];
 }
+extern "C"
+void gpuSetPMEParameters(gpuContext gpu, float alpha)
+{
+    gpu->sim.alphaEwald         = alpha;
+    int3 gridSize = make_int3(16, 16, 16);
+    gpu->sim.pmeGridSize = gridSize;
+    cufftPlan3d(&gpu->fftplan, gridSize.x, gridSize.y, gridSize.z, CUFFT_C2C);
+    gpu->psPmeGrid = new CUDAStream<cufftComplex>(gridSize.x*gridSize.y*gridSize.z, 1, "PmeGrid");
+    gpu->sim.pPmeGrid = gpu->psPmeGrid->_pDevData;
+    gpu->psPmeBsplineModuli[0] = new CUDAStream<float>(gridSize.x, 1, "PmeBsplineModuli0");
+    gpu->sim.pPmeBsplineModuli[0] = gpu->psPmeBsplineModuli[0]->_pDevData;
+    gpu->psPmeBsplineModuli[1] = new CUDAStream<float>(gridSize.y, 1, "PmeBsplineModuli1");
+    gpu->sim.pPmeBsplineModuli[1] = gpu->psPmeBsplineModuli[1]->_pDevData;
+    gpu->psPmeBsplineModuli[2] = new CUDAStream<float>(gridSize.z, 1, "PmeBsplineModuli2");
+    gpu->sim.pPmeBsplineModuli[2] = gpu->psPmeBsplineModuli[2]->_pDevData;
+    gpu->psPmeBsplineTheta = new CUDAStream<float4>(PME_ORDER*gpu->natoms, 1, "PmeBsplineTheta");
+    gpu->sim.pPmeBsplineTheta = gpu->psPmeBsplineTheta->_pDevData;
+    gpu->psPmeBsplineDtheta = new CUDAStream<float4>(PME_ORDER*gpu->natoms, 1, "PmeBsplineDtheta");
+    gpu->sim.pPmeBsplineDtheta = gpu->psPmeBsplineDtheta->_pDevData;
+    gpu->psPmeParticleIndex = new CUDAStream<int4>(gpu->natoms, 1, "PmeParticleIndex");
+    gpu->sim.pPmeParticleIndex = gpu->psPmeParticleIndex->_pDevData;
+    gpu->psPmeParticleFraction = new CUDAStream<float4>(gpu->natoms, 1, "PmeParticleFraction");
+    gpu->sim.pPmeParticleFraction = gpu->psPmeParticleFraction->_pDevData;
+}
 extern "C"
 void gpuSetPeriodicBoxSize(gpuContext gpu, float xsize, float ysize, float zsize)
 {
@@ -1589,6 +1614,14 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync)
    gpu->psCustomExceptionID        = NULL;
    gpu->psCustomExceptionParams    = NULL;
    gpu->psEwaldCosSinSum           = NULL;
+    gpu->psPmeGrid                  = NULL;
+    gpu->psPmeBsplineModuli[0]      = NULL;
+    gpu->psPmeBsplineModuli[1]      = NULL;
+    gpu->psPmeBsplineModuli[2]      = NULL;
+    gpu->psPmeBsplineTheta          = NULL;
+    gpu->psPmeBsplineDtheta         = NULL;
+    gpu->psPmeParticleIndex         = NULL;
+    gpu->psPmeParticleFraction      = NULL;
    gpu->psShakeID                  = NULL;
    gpu->psShakeParameter           = NULL;
    gpu->psSettleID                 = NULL;
@@ -1747,6 +1780,17 @@ void gpuShutDown(gpuContext gpu)
    }
    if (gpu->psEwaldCosSinSum != NULL)
        delete gpu->psEwaldCosSinSum;
+    if (gpu->psPmeGrid != NULL) {
+        delete gpu->psPmeGrid;
+        delete gpu->psPmeBsplineModuli[0];
+        delete gpu->psPmeBsplineModuli[1];
+        delete gpu->psPmeBsplineModuli[2];
+        delete gpu->psPmeBsplineTheta;
+        delete gpu->psPmeBsplineDtheta;
+        delete gpu->psPmeParticleIndex;
+        delete gpu->psPmeParticleFraction;
+        cufftDestroy(gpu->fftplan);
+    }
    delete gpu->psObcData;
    delete gpu->psObcChain;
    delete gpu->psBornForce;
@@ -2102,6 +2146,7 @@ int gpuSetConstants(gpuContext gpu)
    SetCalculateObcGbsaBornSumSim(gpu);
    SetCalculateObcGbsaForces2Sim(gpu);
    SetCalculateAndersenThermostatSim(gpu);
+    SetCalculatePMESim(gpu);
    SetForcesSim(gpu);
    SetShakeHSim(gpu);
    SetLangevinUpdateSim(gpu);

--- a/platforms/cuda/src/kernels/gputypes.h
+++ b/platforms/cuda/src/kernels/gputypes.h
@@ -91,6 +91,7 @@ struct _gpuContext {
    unsigned long seed;
    SM_VERSION sm_version;
    compactionPlan compactPlan;
+    cufftHandle fftplan;
    CUDAStream<float4>* psPosq4;
    CUDAStream<float4>* psPosqP4;
    CUDAStream<float4>* psOldPosq4;
@@ -105,6 +106,12 @@ struct _gpuContext {
    CUDAStream<float4>* psCustomExceptionParams; // Parameters for custom nonbonded exceptions
    CUDAStream<float4>* psTabulatedFunctionParams; // The min, max, and spacing for each tabulated function
    CUDAStream<float2>* psEwaldCosSinSum;
+    CUDAStream<cufftComplex>* psPmeGrid;    // Grid points for particle mesh Ewald
+    CUDAStream<float>* psPmeBsplineModuli[3];
+    CUDAStream<float4>* psPmeBsplineTheta;
+    CUDAStream<float4>* psPmeBsplineDtheta;
+    CUDAStream<int4>* psPmeParticleIndex; // The grid indices for each atom
+    CUDAStream<float4>* psPmeParticleFraction; // Fractional offset in the grid for each atom in all three dimensions.
    CUDAStream<float2>* psObcData; 
    CUDAStream<float>* psObcChain;
    CUDAStream<float>* psBornForce;
@@ -206,6 +213,9 @@ void gpuSetCustomNonbondedParameters(gpuContext gpu, const std::vector<std::vect
 extern "C"
 void gpuSetEwaldParameters(gpuContext gpu, float alpha, int kmaxx, int kmaxy, int kmaxz);
+extern "C"
+void gpuSetPMEParameters(gpuContext gpu, float alpha);
 extern "C"
 void gpuSetPeriodicBoxSize(gpuContext gpu, float xsize, float ysize, float zsize);

--- a/platforms/cuda/src/kernels/kCalculateCDLJForces.cu
+++ b/platforms/cuda/src/kernels/kCalculateCDLJForces.cu
@@ -120,6 +120,7 @@ void GetCalculateCDLJForcesSim(gpuContext gpu)
 // Reciprocal Space Ewald summation is in a separate kernel
 #include "kCalculateCDLJEwaldFastReciprocal.h"
+void kCalculatePME(gpuContext gpu);
 void kCalculateCDLJForces(gpuContext gpu)
 {
@@ -187,5 +188,22 @@ void kCalculateCDLJForces(gpuContext gpu)
            LAUNCHERROR("kCalculateEwaldFastCosSinSums");
            kCalculateEwaldFastForces_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
            LAUNCHERROR("kCalculateEwaldFastForces");
+            break;
+        case PARTICLE_MESH_EWALD:
+            kFindBlockBoundsEwaldDirect_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
+            LAUNCHERROR("kFindBlockBoundsEwaldDirect");
+            kFindBlocksWithInteractionsEwaldDirect_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
+            LAUNCHERROR("kFindBlocksWithInteractionsEwaldDirect");
+            compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
+            kFindInteractionsWithinBlocksEwaldDirect_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                    sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+            if (gpu->bOutputBufferPerWarp)
+                kCalculateCDLJEwaldDirectByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+            else
+                kCalculateCDLJEwaldDirectForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+            LAUNCHERROR("kCalculateCDLJEwaldDirectForces");
+            kCalculatePME(gpu);
    }
 }
--- a/platforms/cuda/src/kernels/kCalculateLocalForces.cu
+++ b/platforms/cuda/src/kernels/kCalculateLocalForces.cu
@@ -448,7 +448,7 @@ __global__ void kCalculateLocalForces_kernel()
        pos += blockDim.x * gridDim.x;
    }   
-    if (cSim.nonbondedMethod == NO_CUTOFF || cSim.nonbondedMethod == EWALD)
+    if (cSim.nonbondedMethod == NO_CUTOFF || cSim.nonbondedMethod == EWALD || cSim.nonbondedMethod == PARTICLE_MESH_EWALD)
    {
        while (pos < cSim.LJ14_offset)
        {

--- a/platforms/cuda/src/kernels/kCalculatePME.cu
+++ b/platforms/cuda/src/kernels/kCalculatePME.cu
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Erik Lindahl, Rossen Apostolov, Szilard Pall, Peter Eastman       *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "gputypes.h"
+#include <cuda.h>
+using namespace std;
+static __constant__ cudaGmxSimulation cSim;
+void SetCalculatePMESim(gpuContext gpu)
+{
+    cudaError_t status;
+    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
+    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
+}
+void GetCalculatePMESim(gpuContext gpu)
+{
+    cudaError_t status;
+    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
+    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
+}
+inline __host__ __device__ int fast_mod(int a, int b)
+{
+    return (b & (b - 1)) ? a % b : a & (b - 1);
+}
+inline __host__ __device__ float4 make_float4(float s)
+{
+    return make_float4(s, s, s, s);
+}
+inline __host__ __device__ float4 operator-(float4 &a)
+{
+    return make_float4(-a.x, -a.y, -a.z, -a.w);
+}
+inline __host__ __device__ float4 operator-(float4 a, float4 b)
+{
+    return make_float4(a.x - b.x, a.y - b.y, a.z - b.z,  a.w - b.w);
+}
+inline __host__ __device__ float4 operator+(float4 a, float4 b)
+{
+    return make_float4(a.x + b.x, a.y + b.y, a.z + b.z,  a.w + b.w);
+}
+inline __host__ __device__ float4 operator+(float4 a, float b)
+{
+    return make_float4(a.x + b, a.y + b, a.z + b, a.w + b);
+}
+inline __host__ __device__ float4 operator+(float a, float4 b)
+{
+    return make_float4(a + b.x, a + b.y, a + b.z,  a + b.w);
+}
+inline __host__ __device__ float4 operator*(float s, float4 a)
+{
+    return make_float4(a.x * s, a.y * s, a.z * s, a.w * s);
+}
+inline __host__ __device__ float4 operator*(float4 a, float4 b)
+{
+    return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w + b.w);
+}
+inline __host__ __device__ float4 make_float4(int3 a)
+{
+    return make_float4(a.x, a.y, a.z, 0);
+}
+__global__ void kUpdateGridIndexAndFraction()
+{
+    unsigned int tnb = blockDim.x * gridDim.x;
+    unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    for (int i = tid; i < cSim.atoms; i += tnb)
+    {
+        float4 ftmp = cSim.pPosq[i];
+        __syncthreads();
+        float3 t = make_float3((ftmp.x/cSim.periodicBoxSizeX+1.0f)*cSim.pmeGridSize.x,
+                               (ftmp.y/cSim.periodicBoxSizeY+1.0f)*cSim.pmeGridSize.y,
+                               (ftmp.z/cSim.periodicBoxSizeZ+1.0f)*cSim.pmeGridSize.z);
+        float3 tix;
+        ftmp.x = modff(t.x, &tix.x);
+        ftmp.y = modff(t.y, &tix.y);
+        ftmp.z = modff(t.z, &tix.z);
+        cSim.pPmeParticleFraction[i] = ftmp;
+        /* avoid costly % operations if possible that is if dc_ngrid.* is pow. of 2 */
+        int4 itmp = make_int4(fast_mod(__float2int_rd(tix.x), cSim.pmeGridSize.x),
+                              fast_mod(__float2int_rd(tix.y), cSim.pmeGridSize.y),
+                              fast_mod(__float2int_rd(tix.z), cSim.pmeGridSize.z), 0);
+        cSim.pPmeParticleIndex[i] = itmp;
+        __syncthreads();
+    }
+}
+__global__ void kUpdateBsplines()
+{
+    unsigned int    tnb = blockDim.x * gridDim.x;
+    unsigned int    tid = blockIdx.x * blockDim.x + threadIdx.x;
+    extern __shared__ float4 bsplines_cache[]; /*size = 2 * block_size * pme_order*/
+    const float4 div_o   = make_float4(1.0f/(PME_ORDER - 1));
+    for (int i = tid; i < cSim.atoms; i += tnb)
+    {
+        float4* data    = &bsplines_cache[threadIdx.x*PME_ORDER];
+        float4* ddata   = &bsplines_cache[threadIdx.x*PME_ORDER + blockDim.x*PME_ORDER];
+        for (int j = 0; j < PME_ORDER; j++)
+        {
+	    data[j] = make_float4(0.0f);
+            ddata[j] = make_float4(0.0f);
+        }
+        /* load data */
+        float4 dr = cSim.pPmeParticleFraction[i];
+        __syncthreads();
+        data[PME_ORDER - 1] = make_float4(0.0f);
+        data[1]            = dr;
+        data[0]            = make_float4(1.0f) - dr;
+        for (int j = 3; j < PME_ORDER; j++)
+        {
+            float div = 1.0f / ((float)j - 1.0f);
+            data[j - 1] = div * dr * data[j - 2];
+            for (int k = 1; k < (j - 1); k++)
+            {
+                data[j - k - 1] =
+                   div * (
+                           (dr + float(k))          * data[j - k - 2] +
+                           (-dr + ((float)(j - k))) * data[j - k - 1]);
+            }
+            data[0] = div * (- dr + 1) * data[0];
+        }
+        ddata[0] = -data[0];
+        for (int j = 1; j < PME_ORDER; j++)
+        {
+            ddata[j] = data[j - 1] - data[j];
+        }
+        data[PME_ORDER - 1] = div_o * dr * data[PME_ORDER - 2];
+        for (int j = 1; j < (PME_ORDER - 1); j++)
+        {
+            data[PME_ORDER - j - 1] =
+                div_o * (
+                    (dr + (float)j)                 * data[PME_ORDER - j - 2] +
+                    (-dr + ((float)(PME_ORDER - j))) * data[PME_ORDER - j - 1]
+                );
+        }
+        data[0] = div_o * (-dr + 1.0f) * data[0];
+        __syncthreads();
+        /* write back */
+        for (int j = 0; j < PME_ORDER; j++)
+        {
+            cSim.pPmeBsplineTheta[i * PME_ORDER + j] =  data[j];
+            cSim.pPmeBsplineDtheta[i * PME_ORDER + j] = ddata[j];
+        }
+        __syncthreads();
+    }
+}
+void kCalculatePME(gpuContext gpu)
+{
+//    printf("kCalculatePME\n");
+    kUpdateGridIndexAndFraction<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
+    LAUNCHERROR("kUpdateGridIndexAndFraction");
+    kUpdateBsplines<<<gpu->sim.blocks, gpu->sim.update_threads_per_block, 2*gpu->sim.update_threads_per_block*PME_ORDER*sizeof(float4)>>>();
+    LAUNCHERROR("kUpdateBsplines");
+}
--- a/platforms/cuda/staticTarget/CMakeLists.txt
+++ b/platforms/cuda/staticTarget/CMakeLists.txt
@@ -25,6 +25,6 @@ SET(CUDA_STATIC_COMPILE_FLAG "-DOPENMM_BUILDING_STATIC_LIBRARY -DOPENMM_USE_STAT
 SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES COMPILE_FLAGS ${CUDA_STATIC_COMPILE_FLAG})
 TARGET_LINK_LIBRARIES(${STATIC_TARGET} optimized ${OPENMM_LIBRARY_NAME}_static cudpp cutil )
-TARGET_LINK_LIBRARIES(${STATIC_TARGET} debug ${OPENMM_LIBRARY_NAME}_static_d optimized ${OPENMM_LIBRARY_NAME}_static cudpp cutil)
+TARGET_LINK_LIBRARIES(${STATIC_TARGET} debug ${OPENMM_LIBRARY_NAME}_static_d optimized ${OPENMM_LIBRARY_NAME}_static cudpp cutil ${CUFFT_TARGET_LINK})
 INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${STATIC_TARGET})