Optimizations to PME reciprocal space calculation

592dc5a9 · Peter Eastman · c08d8a53 · 592dc5a9 · 592dc5a9 · 592dc5a9
Commit 592dc5a9 authored Sep 08, 2011 by Peter Eastman
4 changed files
--- a/platforms/opencl/src/OpenCLFFT3D.cpp
+++ b/platforms/opencl/src/OpenCLFFT3D.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Portions copyright (c) 2009-2011 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -1149,8 +1149,6 @@ OpenCLCalcNonbondedForceKernel::~OpenCLCalcNonbondedForceKernel() {
        delete pmeBsplineModuliZ;
    if (pmeBsplineTheta != NULL)
        delete pmeBsplineTheta;
-    if (pmeBsplineDtheta != NULL)
-        delete pmeBsplineDtheta;
    if (pmeAtomRange != NULL)
        delete pmeAtomRange;
    if (pmeAtomGridIndex != NULL)
@@ -1273,7 +1271,6 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
        pmeBsplineModuliY = new OpenCLArray<cl_float>(cl, gridSizeY, "pmeBsplineModuliY");
        pmeBsplineModuliZ = new OpenCLArray<cl_float>(cl, gridSizeZ, "pmeBsplineModuliZ");
        pmeBsplineTheta = new OpenCLArray<mm_float4>(cl, PmeOrder*numParticles, "pmeBsplineTheta");
-        pmeBsplineDtheta = new OpenCLArray<mm_float4>(cl, PmeOrder*numParticles, "pmeBsplineDtheta");
        pmeAtomRange = new OpenCLArray<cl_int>(cl, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
        pmeAtomGridIndex = new OpenCLArray<mm_int2>(cl, numParticles, "pmeAtomGridIndex");
        sort = new OpenCLSort<mm_int2>(cl, cl.getNumAtoms(), "int2", "value.y");
@@ -1411,9 +1408,8 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
            pmeInterpolateForceKernel = cl::Kernel(program, "gridInterpolateForce");
            pmeUpdateBsplinesKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
            pmeUpdateBsplinesKernel.setArg<cl::Buffer>(1, pmeBsplineTheta->getDeviceBuffer());
-            pmeUpdateBsplinesKernel.setArg<cl::Buffer>(2, pmeBsplineDtheta->getDeviceBuffer());
+            pmeUpdateBsplinesKernel.setArg(2, OpenCLContext::ThreadBlockSize*PmeOrder*sizeof(mm_float4), NULL);
-            pmeUpdateBsplinesKernel.setArg(3, 2*OpenCLContext::ThreadBlockSize*PmeOrder*sizeof(mm_float4), NULL);
+            pmeUpdateBsplinesKernel.setArg<cl::Buffer>(3, pmeAtomGridIndex->getDeviceBuffer());
-            pmeUpdateBsplinesKernel.setArg<cl::Buffer>(4, pmeAtomGridIndex->getDeviceBuffer());
            pmeAtomRangeKernel.setArg<cl::Buffer>(0, pmeAtomGridIndex->getDeviceBuffer());
            pmeAtomRangeKernel.setArg<cl::Buffer>(1, pmeAtomRange->getDeviceBuffer());
            pmeAtomRangeKernel.setArg<cl::Buffer>(2, cl.getPosq().getDeviceBuffer());
@@ -1429,9 +1425,8 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
            pmeConvolutionKernel.setArg<cl::Buffer>(4, pmeBsplineModuliZ->getDeviceBuffer());
            pmeInterpolateForceKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
            pmeInterpolateForceKernel.setArg<cl::Buffer>(1, cl.getForceBuffers().getDeviceBuffer());
-            pmeInterpolateForceKernel.setArg<cl::Buffer>(2, pmeBsplineTheta->getDeviceBuffer());
+            pmeInterpolateForceKernel.setArg<cl::Buffer>(2, pmeGrid->getDeviceBuffer());
-            pmeInterpolateForceKernel.setArg<cl::Buffer>(3, pmeBsplineDtheta->getDeviceBuffer());
+            pmeInterpolateForceKernel.setArg(5, 2*128*PmeOrder*sizeof(mm_float4), NULL);
-            pmeInterpolateForceKernel.setArg<cl::Buffer>(4, pmeGrid->getDeviceBuffer());
            if (cl.getSupports64BitGlobalAtomics()) {
                pmeFinishSpreadChargeKernel = cl::Kernel(program, "finishSpreadCharge");
                pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(0, pmeGrid->getDeviceBuffer());
@@ -1454,8 +1449,8 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
    if (pmeGrid != NULL && cl.getContextIndex() == 0) {
        mm_float4 boxSize = cl.getPeriodicBoxSize();
        mm_float4 invBoxSize = cl.getInvPeriodicBoxSize();
-        pmeUpdateBsplinesKernel.setArg<mm_float4>(5, boxSize);
+        pmeUpdateBsplinesKernel.setArg<mm_float4>(4, boxSize);
-        pmeUpdateBsplinesKernel.setArg<mm_float4>(6, invBoxSize);
+        pmeUpdateBsplinesKernel.setArg<mm_float4>(5, invBoxSize);
        cl.executeKernel(pmeUpdateBsplinesKernel, cl.getNumAtoms());
        if (deviceIsCpu) {
            pmeSpreadChargeKernel.setArg<mm_float4>(5, boxSize);
@@ -1482,9 +1477,9 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
        pmeConvolutionKernel.setArg<cl_float>(6, (float) (1.0/(M_PI*boxSize.x*boxSize.y*boxSize.z)));
        cl.executeKernel(pmeConvolutionKernel, cl.getNumAtoms());
        fft->execFFT(*pmeGrid2, *pmeGrid, false);
-        pmeInterpolateForceKernel.setArg<mm_float4>(5, boxSize);
+        pmeInterpolateForceKernel.setArg<mm_float4>(3, boxSize);
-        pmeInterpolateForceKernel.setArg<mm_float4>(6, invBoxSize);
+        pmeInterpolateForceKernel.setArg<mm_float4>(4, invBoxSize);
-        cl.executeKernel(pmeInterpolateForceKernel, cl.getNumAtoms());
+        cl.executeKernel(pmeInterpolateForceKernel, cl.getNumAtoms(), 128);
    }
    double energy = ewaldSelfEnergy;
    if (dispersionCoefficient != 0.0) {

--- a/platforms/opencl/src/OpenCLKernels.h
+++ b/platforms/opencl/src/OpenCLKernels.h
@@ -475,8 +475,8 @@ private:
 class OpenCLCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
 public:
    OpenCLCalcNonbondedForceKernel(std::string name, const Platform& platform, OpenCLContext& cl, System& system) : CalcNonbondedForceKernel(name, platform),
-            hasInitializedKernel(false), cl(cl), sigmaEpsilon(NULL), exceptionParams(NULL), exceptionIndices(NULL), cosSinSums(NULL), pmeGrid(NULL), pmeGrid2(NULL),
+            hasInitializedKernel(false), cl(cl), sigmaEpsilon(NULL), exceptionParams(NULL), exceptionIndices(NULL), cosSinSums(NULL), pmeGrid(NULL),
-            pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeBsplineTheta(NULL), pmeBsplineDtheta(NULL), pmeAtomRange(NULL),
+            pmeGrid2(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeBsplineTheta(NULL), pmeAtomRange(NULL),
            pmeAtomGridIndex(NULL), sort(NULL), fft(NULL) {
    }
    ~OpenCLCalcNonbondedForceKernel();
@@ -509,7 +509,6 @@ private:
    OpenCLArray<cl_float>* pmeBsplineModuliY;
    OpenCLArray<cl_float>* pmeBsplineModuliZ;
    OpenCLArray<mm_float4>* pmeBsplineTheta;
-    OpenCLArray<mm_float4>* pmeBsplineDtheta;
    OpenCLArray<cl_int>* pmeAtomRange;
    OpenCLArray<mm_int2>* pmeAtomGridIndex;
    OpenCLSort<mm_int2>* sort;

--- a/platforms/opencl/src/kernels/pme.cl
+++ b/platforms/opencl/src/kernels/pme.cl
-__kernel void updateBsplines(__global float4* posq, __global float4* pmeBsplineTheta, __global float4* pmeBsplineDTheta, __local float4* bsplinesCache, __global int2* pmeAtomGridIndex, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
+__kernel void updateBsplines(__global float4* posq, __global float4* pmeBsplineTheta, __local float4* bsplinesCache, __global int2* pmeAtomGridIndex, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
    const float4 scale = 1.0f/(PME_ORDER-1);
    for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) {
        __local float4* data = &bsplinesCache[get_local_id(0)*PME_ORDER];
-        __local float4* ddata = &bsplinesCache[get_local_id(0)*PME_ORDER + get_local_size(0)*PME_ORDER];
-        for (int j = 0; j < PME_ORDER; j++) {
-	    data[j] = 0.0f;
-            ddata[j] = 0.0f;
-        }
        float4 pos = posq[i];
        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
        pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
@@ -29,9 +24,6 @@ __kernel void updateBsplines(__global float4* posq, __global float4* pmeBsplineT
                data[j-k-1] = div*((dr+(float4) k) *data[j-k-2] + (-dr+(float4) (j-k))*data[j-k-1]);
            data[0] = div*(- dr+1.0f)*data[0];
        }
-        ddata[0] = -data[0];
-        for (int j = 1; j < PME_ORDER; j++)
-            ddata[j] = data[j-1]-data[j];
        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
        for (int j = 1; j < (PME_ORDER-1); j++)
            data[PME_ORDER-j-1] = scale*((dr+(float4) j)*data[PME_ORDER-j-2] + (-dr+(float4) (PME_ORDER-j))*data[PME_ORDER-j-1]);
@@ -39,7 +31,6 @@ __kernel void updateBsplines(__global float4* posq, __global float4* pmeBsplineT
        for (int j = 0; j < PME_ORDER; j++) {
            data[j].w = pos.w; // Storing the charge here improves cache coherency in the charge spreading kernel
            pmeBsplineTheta[i+j*NUM_ATOMS] = data[j];
-            pmeBsplineDTheta[i+j*NUM_ATOMS] = ddata[j];
        }
    }
 }
@@ -81,28 +72,54 @@ __kernel void findAtomRangeForGrid(__global int2* pmeAtomGridIndex, __global int
 #ifdef SUPPORTS_64_BIT_ATOMICS
 #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+#define BUFFER_SIZE (PME_ORDER*PME_ORDER*PME_ORDER)
+__kernel __attribute__((reqd_work_group_size(BUFFER_SIZE, 1, 1)))
 __kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGridIndex, __global int* pmeAtomRange, __global long* pmeGrid, __global float4* pmeBsplineTheta, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
    int ix = get_local_id(0)/(PME_ORDER*PME_ORDER);
    int remainder = get_local_id(0)-ix*PME_ORDER*PME_ORDER;
    int iy = remainder/PME_ORDER;
    int iz = remainder-iy*PME_ORDER;
+    __local float4 theta[PME_ORDER];
+    __local float charge[BUFFER_SIZE];
+    __local int basex[BUFFER_SIZE];
+    __local int basey[BUFFER_SIZE];
+    __local int basez[BUFFER_SIZE];
    if (ix < PME_ORDER) {
-        for (int atomIndex = get_group_id(0); atomIndex < NUM_ATOMS; atomIndex += get_num_groups(0)) {
+        for (int baseIndex = get_group_id(0)*BUFFER_SIZE; baseIndex < NUM_ATOMS; baseIndex += get_num_groups(0)*BUFFER_SIZE) {
+            // Load the next block of atoms into the buffers.
+            if (get_local_id(0) < BUFFER_SIZE) {
+                int atomIndex = baseIndex+get_local_id(0);
+                if (atomIndex < NUM_ATOMS) {
                    float4 pos = posq[atomIndex];
-            float atomCharge = pos.w;
+                    charge[get_local_id(0)] = pos.w;
-            float add = atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z;
                    pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
                    pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
                    pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
-            int x = (int) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X)+ix;
+                    basex[get_local_id(0)] = (int) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X);
-            int y = (int) ((pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y)+iy;
+                    basey[get_local_id(0)] = (int) ((pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y);
-            int z = (int) ((pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z)+iz;
+                    basez[get_local_id(0)] = (int) ((pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z);
+                }
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+            int lastIndex = min(BUFFER_SIZE, NUM_ATOMS-baseIndex);
+            for (int index = 0; index < lastIndex; index++) {
+                int atomIndex = index+baseIndex;
+                if (get_local_id(0) < PME_ORDER)
+                    theta[get_local_id(0)] = pmeBsplineTheta[atomIndex+get_local_id(0)*NUM_ATOMS];
+                barrier(CLK_LOCAL_MEM_FENCE);
+                float add = charge[index]*theta[ix].x*theta[iy].y*theta[iz].z;
+                int x = basex[index]+ix;
+                int y = basey[index]+iy;
+                int z = basez[index]+iz;
                x -= (x >= GRID_SIZE_X ? GRID_SIZE_X : 0);
                y -= (y >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
                z -= (z >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
                atom_add(&pmeGrid[x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z], (long) (add*0xFFFFFFFF));
            }
        }
+    }
 }
 __kernel void finishSpreadCharge(__global long* pmeGrid) {
@@ -203,7 +220,11 @@ __kernel void reciprocalConvolution(__global float2* pmeGrid, __global float* en
    energyBuffer[get_global_id(0)] += 0.5f*energy;
 }
-__kernel void gridInterpolateForce(__global float4* posq, __global float4* forceBuffers, __global float4* pmeBsplineTheta, __global float4* pmeBsplineDTheta, __global float2* pmeGrid, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
+__kernel __attribute__((reqd_work_group_size(128, 1, 1)))
+__kernel void gridInterpolateForce(__global float4* posq, __global float4* forceBuffers, __global float2* pmeGrid, float4 periodicBoxSize, float4 invPeriodicBoxSize, __local float4* bsplinesCache) {
+    const float4 scale = 1.0f/(PME_ORDER-1);
+    __local float4* data = &bsplinesCache[get_local_id(0)*PME_ORDER];
+    __local float4* ddata = &bsplinesCache[get_local_id(0)*PME_ORDER + get_local_size(0)*PME_ORDER];
    for (int atom = get_global_id(0); atom < NUM_ATOMS; atom += get_global_size(0)) {
        float4 force = 0.0f;
        float4 pos = posq[atom];
@@ -216,26 +237,45 @@ __kernel void gridInterpolateForce(__global float4* posq, __global float4* force
        int4 gridIndex = (int4) (((int) t.x) % GRID_SIZE_X,
                                 ((int) t.y) % GRID_SIZE_Y,
                                 ((int) t.z) % GRID_SIZE_Z, 0);
+        // Since we need the full set of thetas, it's faster to compute them here than load them
+        // from global memory.
+        float4 dr = (float4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f);
+        data[PME_ORDER-1] = 0.0f;
+        data[1] = dr;
+        data[0] = 1.0f-dr;
+        for (int j = 3; j < PME_ORDER; j++) {
+            float div = 1.0f/(j-1.0f);
+            data[j-1] = div*dr*data[j-2];
+            for (int k = 1; k < (j-1); k++)
+                data[j-k-1] = div*((dr+(float4) k) *data[j-k-2] + (-dr+(float4) (j-k))*data[j-k-1]);
+            data[0] = div*(- dr+1.0f)*data[0];
+        }
+        ddata[0] = -data[0];
+        for (int j = 1; j < PME_ORDER; j++)
+            ddata[j] = data[j-1]-data[j];
+        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
+        for (int j = 1; j < (PME_ORDER-1); j++)
+            data[PME_ORDER-j-1] = scale*((dr+(float4) j)*data[PME_ORDER-j-2] + (-dr+(float4) (PME_ORDER-j))*data[PME_ORDER-j-1]);
+        data[0] = scale*(-dr+1.0f)*data[0];
+        // Compute the force on this atom.
        for (int ix = 0; ix < PME_ORDER; ix++) {
            int xindex = gridIndex.x+ix;
            xindex -= (xindex >= GRID_SIZE_X ? GRID_SIZE_X : 0);
-            float tx = pmeBsplineTheta[atom+ix*NUM_ATOMS].x;
-            float dtx = pmeBsplineDTheta[atom+ix*NUM_ATOMS].x;
            for (int iy = 0; iy < PME_ORDER; iy++) {
                int yindex = gridIndex.y+iy;
                yindex -= (yindex >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
-                float ty = pmeBsplineTheta[atom+iy*NUM_ATOMS].y;
-                float dty = pmeBsplineDTheta[atom+iy*NUM_ATOMS].y;
                for (int iz = 0; iz < PME_ORDER; iz++) {
                    int zindex = gridIndex.z+iz;
                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
-                    float tz = pmeBsplineTheta[atom+iz*NUM_ATOMS].z;
-                    float dtz = pmeBsplineDTheta[atom+iz*NUM_ATOMS].z;
                    int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
                    float gridvalue = pmeGrid[index].x;
-                    force.x += dtx*ty*tz*gridvalue;
+                    force.x += ddata[ix].x*data[iy].y*data[iz].z*gridvalue;
-                    force.y += tx*dty*tz*gridvalue;
+                    force.y += data[ix].x*ddata[iy].y*data[iz].z*gridvalue;
-                    force.z += tx*ty*dtz*gridvalue;
+                    force.z += data[ix].x*data[iy].y*ddata[iz].z*gridvalue;
                }
            }
        }