Fixes and optimizations to PME on CPU

0b96aa2e · Peter Eastman · e28ce558 · 0b96aa2e · 0b96aa2e · 0b96aa2e
Commit 0b96aa2e authored Oct 28, 2010 by Peter Eastman
4 changed files
--- a/platforms/opencl/src/OpenCLFFT3D.cpp
+++ b/platforms/opencl/src/OpenCLFFT3D.cpp
@@ -43,6 +43,8 @@ OpenCLFFT3D::OpenCLFFT3D(OpenCLContext& context, int xsize, int ysize, int zsize
 void OpenCLFFT3D::execFFT(OpenCLArray<mm_float2>& data, bool forward) {
    int maxSize = xkernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice());
+    if (context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU)
+        maxSize = 1;
    xkernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
    xkernel.setArg<cl_float>(1, forward ? 1.0f : -1.0f);
    context.executeKernel(xkernel, xsize*ysize*zsize, min(xsize, (int) maxSize));

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -1361,6 +1361,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
 }
 double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    bool deviceIsCpu = (cl.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
    if (!hasInitializedKernel) {
        hasInitializedKernel = true;
        if (exceptionIndices != NULL) {
@@ -1379,7 +1380,8 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
            ewaldForcesKernel.setArg<cl::Buffer>(2, cosSinSums->getDeviceBuffer());
        }
        if (pmeGrid != NULL) {
-            cl::Program program = cl.createProgram(OpenCLKernelSources::pme, pmeDefines);
+            string file = (deviceIsCpu ? OpenCLKernelSources::pme_cpu : OpenCLKernelSources::pme);
+            cl::Program program = cl.createProgram(file, pmeDefines);
            pmeUpdateBsplinesKernel = cl::Kernel(program, "updateBsplines");
            pmeAtomRangeKernel = cl::Kernel(program, "findAtomRangeForGrid");
            pmeSpreadChargeKernel = cl::Kernel(program, "gridSpreadCharge");
@@ -1429,11 +1431,18 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
        pmeUpdateBsplinesKernel.setArg<mm_float4>(5, boxSize);
        pmeUpdateBsplinesKernel.setArg<mm_float4>(6, invBoxSize);
        cl.executeKernel(pmeUpdateBsplinesKernel, cl.getNumAtoms());
+        if (deviceIsCpu) {
+            pmeSpreadChargeKernel.setArg<mm_float4>(5, boxSize);
+            pmeSpreadChargeKernel.setArg<mm_float4>(6, invBoxSize);
+            cl.executeKernel(pmeSpreadChargeKernel, 2*cl.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(), 1);
+        }
+        else {
            sort->sort(*pmeAtomGridIndex);
            pmeAtomRangeKernel.setArg<mm_float4>(3, boxSize);
            pmeAtomRangeKernel.setArg<mm_float4>(4, invBoxSize);
            cl.executeKernel(pmeAtomRangeKernel, cl.getNumAtoms());
            cl.executeKernel(pmeSpreadChargeKernel, cl.getNumAtoms());
+        }
        fft->execFFT(*pmeGrid, true);
        pmeConvolutionKernel.setArg<mm_float4>(5, invBoxSize);
        pmeConvolutionKernel.setArg<cl_float>(6, (float) (1.0/(M_PI*boxSize.x*boxSize.y*boxSize.z)));

--- a/platforms/opencl/src/kernels/nonbonded_cpu.cl
+++ b/platforms/opencl/src/kernels/nonbonded_cpu.cl
@@ -91,7 +91,6 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
 #ifdef USE_EXCLUSIONS
                    bool isExcluded = !(excl & 0x1);
-                    if (!isExcluded) {
 #endif
                    float4 posq2 = (float4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
                    float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
@@ -123,9 +122,6 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
 #endif
 #ifdef USE_CUTOFF
                    }
-#endif
-#ifdef USE_EXCLUSIONS
-                    }
 #endif
                    excl >>= 1;
                }
@@ -219,7 +215,6 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
 #ifdef USE_EXCLUSIONS
                        bool isExcluded = !(excl & 0x1);
-                        if (!isExcluded) {
 #endif
                        float4 posq2 = (float4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
                        float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
@@ -260,7 +255,6 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
                        }
 #endif
 #ifdef USE_EXCLUSIONS
-                        }
                        excl >>= 1;
 #endif
                    }

--- a/platforms/opencl/src/kernels/pme_cpu.cl
+++ b/platforms/opencl/src/kernels/pme_cpu.cl
+__kernel void updateBsplines(__global float4* posq, __global float4* pmeBsplineTheta, __global float4* pmeBsplineDTheta, __local float4* bsplinesCache, __global int2* pmeAtomGridIndex, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
+    const float4 scale = 1.0f/(PME_ORDER-1);
+    for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) {
+        __local float4* data = &bsplinesCache[get_local_id(0)*PME_ORDER];
+        __local float4* ddata = &bsplinesCache[get_local_id(0)*PME_ORDER + get_local_size(0)*PME_ORDER];
+        for (int j = 0; j < PME_ORDER; j++) {
+	    data[j] = 0.0f;
+            ddata[j] = 0.0f;
+        }
+        float4 pos = posq[i];
+        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
+        pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
+        pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
+        float4 t = (float4) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
+                             (pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
+                             (pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z, 0.0f);
+        float4 dr = (float4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f);
+        data[PME_ORDER-1] = 0.0f;
+        data[1] = dr;
+        data[0] = 1.0f-dr;
+        for (int j = 3; j < PME_ORDER; j++) {
+            float div = 1.0f/(j-1.0f);
+            data[j-1] = div*dr*data[j-2];
+            for (int k = 1; k < (j-1); k++)
+                data[j-k-1] = div*((dr+(float4) k) *data[j-k-2] + (-dr+(float4) (j-k))*data[j-k-1]);
+            data[0] = div*(- dr+1.0f)*data[0];
+        }
+        ddata[0] = -data[0];
+        for (int j = 1; j < PME_ORDER; j++)
+            ddata[j] = data[j-1]-data[j];
+        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
+        for (int j = 1; j < (PME_ORDER-1); j++)
+            data[PME_ORDER-j-1] = scale*((dr+(float4) j)*data[PME_ORDER-j-2] + (-dr+(float4) (PME_ORDER-j))*data[PME_ORDER-j-1]);
+        data[0] = scale*(-dr+1.0f)*data[0];
+        for (int j = 0; j < PME_ORDER; j++) {
+            pmeBsplineTheta[i+j*NUM_ATOMS] = data[j];
+            pmeBsplineDTheta[i+j*NUM_ATOMS] = ddata[j];
+        }
+    }
+}
+/**
+ * This kernel is not actually used when running on a CPU.
+ */
+__kernel void findAtomRangeForGrid(__global int2* pmeAtomGridIndex, __global int* pmeAtomRange, __global float4* posq, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
+}
+__kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGridIndex, __global int* pmeAtomRange, __global float2* pmeGrid, __global float4* pmeBsplineTheta, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
+    const int firstx = get_global_id(0)*GRID_SIZE_X/get_global_size(0);
+    const int lastx = (get_global_id(0)+1)*GRID_SIZE_X/get_global_size(0);
+    for (int gridIndex = firstx*GRID_SIZE_Y*GRID_SIZE_Z; gridIndex < lastx*GRID_SIZE_Y*GRID_SIZE_Z; gridIndex++)
+        pmeGrid[gridIndex] = (float2) (0.0f, 0.0f);
+    for (int atom = 0; atom < NUM_ATOMS; atom++) {
+        float4 pos = posq[atom];
+        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
+        pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
+        pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
+        float4 t = (float4) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
+                             (pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
+                             (pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z, 0.0f);
+        float4 dr = (float4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f);
+        int4 gridIndex = (int4) (((int) t.x) % GRID_SIZE_X,
+                                 ((int) t.y) % GRID_SIZE_Y,
+                                 ((int) t.z) % GRID_SIZE_Z, 0);
+        float atomCharge = pos.w*EPSILON_FACTOR;
+        for (int ix = 0; ix < PME_ORDER; ix++) {
+            int xindex = gridIndex.x+ix;
+            xindex -= (xindex >= GRID_SIZE_X ? GRID_SIZE_X : 0);
+            if (xindex < firstx || xindex >= lastx)
+                continue;
+            for (int iy = 0; iy < PME_ORDER; iy++) {
+                int yindex = gridIndex.y+iy;
+                yindex -= (yindex >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
+                for(int iz = 0; iz < PME_ORDER; iz++) {
+                    int zindex = gridIndex.z+iz;
+                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
+                    int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
+                    pmeGrid[index].x += atomCharge*pmeBsplineTheta[atom+ix*NUM_ATOMS].x*pmeBsplineTheta[atom+iy*NUM_ATOMS].y*pmeBsplineTheta[atom+iz*NUM_ATOMS].z;
+                }
+            }
+        }
+    }
+}
+__kernel void reciprocalConvolution(__global float2* pmeGrid, __global float* energyBuffer, __global float* pmeBsplineModuliX,
+        __global float* pmeBsplineModuliY, __global float* pmeBsplineModuliZ, float4 invPeriodicBoxSize, float recipScaleFactor) {
+    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
+    float energy = 0.0f;
+    for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
+        int kx = index/(GRID_SIZE_Y*GRID_SIZE_Z);
+        int remainder = index-kx*GRID_SIZE_Y*GRID_SIZE_Z;
+        int ky = remainder/GRID_SIZE_Z;
+        int kz = remainder-ky*GRID_SIZE_Z;
+        if (kx == 0 && ky == 0 && kz == 0)
+            continue;
+        int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
+        int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
+        int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
+        float mhx = mx*invPeriodicBoxSize.x;
+        float mhy = my*invPeriodicBoxSize.y;
+        float mhz = mz*invPeriodicBoxSize.z;
+        float bx = pmeBsplineModuliX[kx];
+        float by = pmeBsplineModuliY[ky];
+        float bz = pmeBsplineModuliZ[kz];
+        float2 grid = pmeGrid[index];
+        float m2 = mhx*mhx+mhy*mhy+mhz*mhz;
+        float denom = m2*bx*by*bz;
+        float eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
+        pmeGrid[index] = (float2) (grid.x*eterm, grid.y*eterm);
+        energy += eterm*(grid.x*grid.x + grid.y*grid.y);
+    }
+    energyBuffer[get_global_id(0)] += 0.5f*energy;
+}
+__kernel void gridInterpolateForce(__global float4* posq, __global float4* forceBuffers, __global float4* pmeBsplineTheta, __global float4* pmeBsplineDTheta, __global float2* pmeGrid, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
+    for (int atom = get_global_id(0); atom < NUM_ATOMS; atom += get_global_size(0)) {
+        float4 force = 0.0f;
+        float4 pos = posq[atom];
+        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
+        pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
+        pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
+        float4 t = (float4) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
+                             (pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
+                             (pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z, 0.0f);
+        int4 gridIndex = (int4) (((int) t.x) % GRID_SIZE_X,
+                                 ((int) t.y) % GRID_SIZE_Y,
+                                 ((int) t.z) % GRID_SIZE_Z, 0);
+        for (int ix = 0; ix < PME_ORDER; ix++) {
+            int xindex = gridIndex.x+ix;
+            xindex -= (xindex >= GRID_SIZE_X ? GRID_SIZE_X : 0);
+            float tx = pmeBsplineTheta[atom+ix*NUM_ATOMS].x;
+            float dtx = pmeBsplineDTheta[atom+ix*NUM_ATOMS].x;
+            for (int iy = 0; iy < PME_ORDER; iy++) {
+                int yindex = gridIndex.y+iy;
+                yindex -= (yindex >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
+                float ty = pmeBsplineTheta[atom+iy*NUM_ATOMS].y;
+                float dty = pmeBsplineDTheta[atom+iy*NUM_ATOMS].y;
+                for (int iz = 0; iz < PME_ORDER; iz++) {
+                    int zindex = gridIndex.z+iz;
+                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
+                    float tz = pmeBsplineTheta[atom+iz*NUM_ATOMS].z;
+                    float dtz = pmeBsplineDTheta[atom+iz*NUM_ATOMS].z;
+                    int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
+                    float gridvalue = pmeGrid[index].x;
+                    force.x += dtx*ty*tz*gridvalue;
+                    force.y += tx*dty*tz*gridvalue;
+                    force.z += tx*ty*dtz*gridvalue;
+                }
+            }
+        }
+        float4 totalForce = forceBuffers[atom];
+        float q = pos.w*EPSILON_FACTOR;
+        totalForce.x -= q*force.x*GRID_SIZE_X*invPeriodicBoxSize.x;
+        totalForce.y -= q*force.y*GRID_SIZE_Y*invPeriodicBoxSize.y;
+        totalForce.z -= q*force.z*GRID_SIZE_Z*invPeriodicBoxSize.z;
+        forceBuffers[atom] = totalForce;
+    }
+}