Simplified PME code, eliminating an unnecessary kernel.

acb46362 · Peter Eastman · dae88c48 · acb46362 · acb46362 · acb46362
Commit acb46362 authored Sep 03, 2010 by Peter Eastman
3 changed files
--- a/platforms/cuda/src/kernels/kCalculatePME.cu
+++ b/platforms/cuda/src/kernels/kCalculatePME.cu
@@ -92,83 +92,6 @@ inline __host__ __device__ float4 make_float4(int3 a)
    return make_float4((float) a.x, (float) a.y, (float) a.z, 0);
 }
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(1024, 1)
-#elif (__CUDA_ARCH__ >= 130)
-__launch_bounds__(512, 1)
-#else
-__launch_bounds__(256, 1)
-#endif
-void kUpdateGridIndexAndFraction_kernel()
-{
-    unsigned int tnb = blockDim.x * gridDim.x;
-    unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    for (int i = tid; i < cSim.atoms; i += tnb)
-    {
-        float4 posq = cSim.pPosq[i];
-        posq.x -= floor(posq.x*cSim.invPeriodicBoxSizeX)*cSim.periodicBoxSizeX;
-        posq.y -= floor(posq.y*cSim.invPeriodicBoxSizeY)*cSim.periodicBoxSizeY;
-        posq.z -= floor(posq.z*cSim.invPeriodicBoxSizeZ)*cSim.periodicBoxSizeZ;
-        float3 t = make_float3((posq.x*cSim.invPeriodicBoxSizeX)*cSim.pmeGridSize.x,
-                               (posq.y*cSim.invPeriodicBoxSizeY)*cSim.pmeGridSize.y,
-                               (posq.z*cSim.invPeriodicBoxSizeZ)*cSim.pmeGridSize.z);
-        int3 gridIndex = make_int3(((int) t.x) % cSim.pmeGridSize.x,
-                              ((int) t.y) % cSim.pmeGridSize.y,
-                              ((int) t.z) % cSim.pmeGridSize.z);
-        cSim.pPmeAtomGridIndex[i] = make_int2(i, gridIndex.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+gridIndex.y*cSim.pmeGridSize.z+gridIndex.z);
-    }
-}
-/**
- * For each grid point, find the range of sorted atoms associated with that point.
- */
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(1024, 1)
-#elif (__CUDA_ARCH__ >= 130)
-__launch_bounds__(512, 1)
-#else
-__launch_bounds__(256, 1)
-#endif
-void kFindAtomRangeForGrid_kernel()
-{
-    int thread = blockIdx.x*blockDim.x+threadIdx.x;
-    int start = (cSim.atoms*thread)/(blockDim.x*gridDim.x);
-    int end = (cSim.atoms*(thread+1))/(blockDim.x*gridDim.x);
-    int last = (start == 0 ? -1 : cSim.pPmeAtomGridIndex[start-1].y);
-    for (int i = start; i < end; ++i)
-    {
-        int2 atomData = cSim.pPmeAtomGridIndex[i];
-        int gridIndex = atomData.y;
-        if (gridIndex != last)
-        {
-            for (int j = last+1; j <= gridIndex; ++j)
-                cSim.pPmeAtomRange[j] = i;
-            last = gridIndex;
-        }
-        // The grid index won't be needed again.  Reuse that component to hold the z index, thus saving
-        // some work in the charge spreading kernel.
-        float posz = cSim.pPosq[atomData.x].z;
-        posz -= floor(posz*cSim.invPeriodicBoxSizeZ)*cSim.periodicBoxSizeZ;
-        int z = ((int) ((posz*cSim.invPeriodicBoxSizeZ)*cSim.pmeGridSize.z)) % cSim.pmeGridSize.z;
-        cSim.pPmeAtomGridIndex[i].y = z;
-    }
-    // Fill in values beyond the last atom.
-    if (thread == blockDim.x*gridDim.x-1)
-    {
-        int gridSize = cSim.pmeGridSize.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z;
-        for (int j = last+1; j <= gridSize; ++j)
-            cSim.pPmeAtomRange[j] = cSim.atoms;
-    }
-}
 __global__ 
 #if (__CUDA_ARCH__ >= 200)
 __launch_bounds__(1024, 1)
@@ -205,6 +128,10 @@ void kUpdateBsplines_kernel()
                               (posq.y*cSim.invPeriodicBoxSizeY)*cSim.pmeGridSize.y,
                               (posq.z*cSim.invPeriodicBoxSizeZ)*cSim.pmeGridSize.z);
        float4 dr = make_float4(t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f);
+        int3 gridIndex = make_int3(((int) t.x) % cSim.pmeGridSize.x,
+                              ((int) t.y) % cSim.pmeGridSize.y,
+                              ((int) t.z) % cSim.pmeGridSize.z);
+        cSim.pPmeAtomGridIndex[i] = make_int2(i, gridIndex.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+gridIndex.y*cSim.pmeGridSize.z+gridIndex.z);
        data[PME_ORDER - 1] = make_float4(0.0f);
        data[1]            = dr;
@@ -250,6 +177,53 @@ void kUpdateBsplines_kernel()
    }
 }
+/**
+ * For each grid point, find the range of sorted atoms associated with that point.
+ */
+__global__
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(1024, 1)
+#elif (__CUDA_ARCH__ >= 130)
+__launch_bounds__(512, 1)
+#else
+__launch_bounds__(256, 1)
+#endif
+void kFindAtomRangeForGrid_kernel()
+{
+    int thread = blockIdx.x*blockDim.x+threadIdx.x;
+    int start = (cSim.atoms*thread)/(blockDim.x*gridDim.x);
+    int end = (cSim.atoms*(thread+1))/(blockDim.x*gridDim.x);
+    int last = (start == 0 ? -1 : cSim.pPmeAtomGridIndex[start-1].y);
+    for (int i = start; i < end; ++i)
+    {
+        int2 atomData = cSim.pPmeAtomGridIndex[i];
+        int gridIndex = atomData.y;
+        if (gridIndex != last)
+        {
+            for (int j = last+1; j <= gridIndex; ++j)
+                cSim.pPmeAtomRange[j] = i;
+            last = gridIndex;
+        }
+        // The grid index won't be needed again.  Reuse that component to hold the z index, thus saving
+        // some work in the charge spreading kernel.
+        float posz = cSim.pPosq[atomData.x].z;
+        posz -= floor(posz*cSim.invPeriodicBoxSizeZ)*cSim.periodicBoxSizeZ;
+        int z = ((int) ((posz*cSim.invPeriodicBoxSizeZ)*cSim.pmeGridSize.z)) % cSim.pmeGridSize.z;
+        cSim.pPmeAtomGridIndex[i].y = z;
+    }
+    // Fill in values beyond the last atom.
+    if (thread == blockDim.x*gridDim.x-1)
+    {
+        int gridSize = cSim.pmeGridSize.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z;
+        for (int j = last+1; j <= gridSize; ++j)
+            cSim.pPmeAtomRange[j] = cSim.atoms;
+    }
+}
 __global__ 
 #if (__CUDA_ARCH__ >= 200)
 __launch_bounds__(1024, 1)
@@ -418,14 +392,12 @@ void kCalculatePME(gpuContext gpu)
 //    printf("kCalculatePME\n");
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float4>();
    cudaBindTexture(NULL, &bsplineThetaRef, gpu->psPmeBsplineTheta->_pDevData, &channelDesc, gpu->psPmeBsplineTheta->_length*sizeof(float4));
-    kUpdateGridIndexAndFraction_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
-    LAUNCHERROR("kUpdateGridIndexAndFraction");
-    bbSort(gpu->psPmeAtomGridIndex->_pDevData, gpu->natoms);
-    kFindAtomRangeForGrid_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
-    LAUNCHERROR("kFindAtomRangeForGrid");
    unsigned int threads = 16380/(2*PME_ORDER*sizeof(float4));
    kUpdateBsplines_kernel<<<gpu->sim.blocks, threads, 2*threads*PME_ORDER*sizeof(float4)>>>();
    LAUNCHERROR("kUpdateBsplines");
+    bbSort(gpu->psPmeAtomGridIndex->_pDevData, gpu->natoms);
+    kFindAtomRangeForGrid_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
+    LAUNCHERROR("kFindAtomRangeForGrid");
    kGridSpreadCharge_kernel<<<8*gpu->sim.blocks, 64, 64*(sizeof(float)+sizeof(int4))>>>();
    LAUNCHERROR("kGridSpreadCharge");
    cufftExecC2C(gpu->fftplan, gpu->psPmeGrid->_pDevData, gpu->psPmeGrid->_pDevData, CUFFT_FORWARD);

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -1380,21 +1380,19 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
        }
        if (pmeGrid != NULL) {
            cl::Program program = cl.createProgram(OpenCLKernelSources::pme, pmeDefines);
-            pmeGridIndexKernel = cl::Kernel(program, "updateGridIndexAndFraction");
-            pmeAtomRangeKernel = cl::Kernel(program, "findAtomRangeForGrid");
            pmeUpdateBsplinesKernel = cl::Kernel(program, "updateBsplines");
+            pmeAtomRangeKernel = cl::Kernel(program, "findAtomRangeForGrid");
            pmeSpreadChargeKernel = cl::Kernel(program, "gridSpreadCharge");
            pmeConvolutionKernel = cl::Kernel(program, "reciprocalConvolution");
            pmeInterpolateForceKernel = cl::Kernel(program, "gridInterpolateForce");
-            pmeGridIndexKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
-            pmeGridIndexKernel.setArg<cl::Buffer>(1, pmeAtomGridIndex->getDeviceBuffer());
-            pmeAtomRangeKernel.setArg<cl::Buffer>(0, pmeAtomGridIndex->getDeviceBuffer());
-            pmeAtomRangeKernel.setArg<cl::Buffer>(1, pmeAtomRange->getDeviceBuffer());
            pmeUpdateBsplinesKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
            pmeUpdateBsplinesKernel.setArg<cl::Buffer>(1, pmeBsplineTheta->getDeviceBuffer());
            pmeUpdateBsplinesKernel.setArg<cl::Buffer>(2, pmeBsplineDtheta->getDeviceBuffer());
            pmeUpdateBsplinesKernel.setArg(3, 2*OpenCLContext::ThreadBlockSize*PmeOrder*sizeof(mm_float4), NULL);
            pmeUpdateBsplinesKernel.setArg<cl::Buffer>(4, pmeAtomGridIndex->getDeviceBuffer());
+            pmeAtomRangeKernel.setArg<cl::Buffer>(0, pmeAtomGridIndex->getDeviceBuffer());
+            pmeAtomRangeKernel.setArg<cl::Buffer>(1, pmeAtomRange->getDeviceBuffer());
+            pmeAtomRangeKernel.setArg<cl::Buffer>(2, cl.getPosq().getDeviceBuffer());
            pmeSpreadChargeKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
            pmeSpreadChargeKernel.setArg<cl::Buffer>(1, pmeAtomGridIndex->getDeviceBuffer());
            pmeSpreadChargeKernel.setArg<cl::Buffer>(2, pmeAtomRange->getDeviceBuffer());
@@ -1428,14 +1426,13 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
    if (pmeGrid != NULL) {
        mm_float4 boxSize = cl.getPeriodicBoxSize();
        mm_float4 invBoxSize = cl.getInvPeriodicBoxSize();
-        pmeGridIndexKernel.setArg<mm_float4>(2, boxSize);
-        pmeGridIndexKernel.setArg<mm_float4>(3, invBoxSize);
-        cl.executeKernel(pmeGridIndexKernel, cl.getNumAtoms());
-        sort->sort(*pmeAtomGridIndex);
-        cl.executeKernel(pmeAtomRangeKernel, cl.getNumAtoms());
        pmeUpdateBsplinesKernel.setArg<mm_float4>(5, boxSize);
        pmeUpdateBsplinesKernel.setArg<mm_float4>(6, invBoxSize);
        cl.executeKernel(pmeUpdateBsplinesKernel, cl.getNumAtoms());
+        sort->sort(*pmeAtomGridIndex);
+        pmeAtomRangeKernel.setArg<mm_float4>(3, boxSize);
+        pmeAtomRangeKernel.setArg<mm_float4>(4, invBoxSize);
+        cl.executeKernel(pmeAtomRangeKernel, cl.getNumAtoms());
        cl.executeKernel(pmeSpreadChargeKernel, cl.getNumAtoms());
        fft->execFFT(*pmeGrid, true);
        pmeConvolutionKernel.setArg<mm_float4>(5, invBoxSize);

--- a/platforms/opencl/src/kernels/pme.cl
+++ b/platforms/opencl/src/kernels/pme.cl
-__kernel void updateGridIndexAndFraction(__global float4* posq, __global int2* pmeAtomGridIndex, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
-    for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) {
-        float4 pos = posq[i];
-        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
-        pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
-        pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
-        float4 t = (float4) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
-                             (pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
-                             (pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z, 0.0f);
-        int4 gridIndex = (int4) (((int) t.x) % GRID_SIZE_X,
-                                 ((int) t.y) % GRID_SIZE_Y,
-                                 ((int) t.z) % GRID_SIZE_Z, 0);
-        pmeAtomGridIndex[i] = (int2) (i, gridIndex.x*GRID_SIZE_Y*GRID_SIZE_Z+gridIndex.y*GRID_SIZE_Z+gridIndex.z);
-    }
-}
-/**
- * For each grid point, find the range of sorted atoms associated with that point.
- */
-__kernel void findAtomRangeForGrid(__global int2* pmeAtomGridIndex, __global int* pmeAtomRange) {
-    int start = (NUM_ATOMS*get_global_id(0))/get_global_size(0);
-    int end = (NUM_ATOMS*(get_global_id(0)+1))/get_global_size(0);
-    int last = (start == 0 ? -1 : pmeAtomGridIndex[start-1].y);
-    for (int i = start; i < end; ++i) {
-        int2 atomData = pmeAtomGridIndex[i];
-        int gridIndex = atomData.y;
-        if (gridIndex != last) {
-            for (int j = last+1; j <= gridIndex; ++j)
-                pmeAtomRange[j] = i;
-            last = gridIndex;
-        }
-    }
-    // Fill in values beyond the last atom.
-    if (get_global_id(0) == get_global_size(0)-1) {
-        int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
-        for (int j = last+1; j <= gridSize; ++j)
-            pmeAtomRange[j] = NUM_ATOMS;
-    }
-}
 __kernel void updateBsplines(__global float4* posq, __global float4* pmeBsplineTheta, __global float4* pmeBsplineDTheta, __local float4* bsplinesCache, __global int2* pmeAtomGridIndex, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
    const float4 scale = 1.0f/(PME_ORDER-1);
    for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) {
@@ -58,6 +15,10 @@ __kernel void updateBsplines(__global float4* posq, __global float4* pmeBsplineT
                             (pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
                             (pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z, 0.0f);
        float4 dr = (float4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f);
+        int4 gridIndex = (int4) (((int) t.x) % GRID_SIZE_X,
+                                 ((int) t.y) % GRID_SIZE_Y,
+                                 ((int) t.z) % GRID_SIZE_Z, 0);
+        pmeAtomGridIndex[i] = (int2) (i, gridIndex.x*GRID_SIZE_Y*GRID_SIZE_Z+gridIndex.y*GRID_SIZE_Z+gridIndex.z);
        data[PME_ORDER-1] = 0.0f;
        data[1] = dr;
        data[0] = 1.0f-dr;
@@ -80,18 +41,40 @@ __kernel void updateBsplines(__global float4* posq, __global float4* pmeBsplineT
            pmeBsplineDTheta[i+j*NUM_ATOMS] = ddata[j];
        }
    }
+}
-    // The grid index won't be needed again.  Reuse that component to hold the z index, thus saving
+/**
-    // some work in the charge spreading kernel.
+ * For each grid point, find the range of sorted atoms associated with that point.
+ */
+__kernel void findAtomRangeForGrid(__global int2* pmeAtomGridIndex, __global int* pmeAtomRange, __global float4* posq, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
    int start = (NUM_ATOMS*get_global_id(0))/get_global_size(0);
    int end = (NUM_ATOMS*(get_global_id(0)+1))/get_global_size(0);
+    int last = (start == 0 ? -1 : pmeAtomGridIndex[start-1].y);
    for (int i = start; i < end; ++i) {
+        int2 atomData = pmeAtomGridIndex[i];
+        int gridIndex = atomData.y;
+        if (gridIndex != last) {
+            for (int j = last+1; j <= gridIndex; ++j)
+                pmeAtomRange[j] = i;
+            last = gridIndex;
+        }
+        // The grid index won't be needed again.  Reuse that component to hold the z index, thus saving
+        // some work in the charge spreading kernel.
        float posz = posq[pmeAtomGridIndex[i].x].z;
        posz -= floor(posz*invPeriodicBoxSize.z)*periodicBoxSize.z;
        int z = ((int) ((posz*invPeriodicBoxSize.z)*GRID_SIZE_Z)) % GRID_SIZE_Z;
        pmeAtomGridIndex[i].y = z;
    }
+    // Fill in values beyond the last atom.
+    if (get_global_id(0) == get_global_size(0)-1) {
+        int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
+        for (int j = last+1; j <= gridSize; ++j)
+            pmeAtomRange[j] = NUM_ATOMS;
+    }
 }
 __kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGridIndex, __global int* pmeAtomRange, __global float2* pmeGrid, __global float4* pmeBsplineTheta) {