Optimization to improve cache coherency in charge spreading kernel

839ab51e · Peter Eastman · 92766e8a · 839ab51e · 839ab51e
Commit 839ab51e authored May 27, 2011 by Peter Eastman
Hide whitespace changes
Inline Side-by-side

Showing with 6 additions and 4 deletions

platforms/cuda/src/kernels/kCalculatePME.cu platforms/cuda/src/kernels/kCalculatePME.cu +3 -2

platforms/opencl/src/kernels/pme.cl platforms/opencl/src/kernels/pme.cl +3 -2

No files found.
--- a/platforms/cuda/src/kernels/kCalculatePME.cu
+++ b/platforms/cuda/src/kernels/kCalculatePME.cu
@@ -171,6 +171,7 @@ void kUpdateBsplines_kernel()
        for (int j = 0; j < PME_ORDER; j++)
        {
+            data[j].w = posq.w; // Storing the charge here improves cache coherency in the charge spreading kernel
            cSim.pPmeBsplineTheta[i + j*cSim.atoms] =  data[j];
            cSim.pPmeBsplineDtheta[i + j*cSim.atoms] = ddata[j];
        }
@@ -256,7 +257,7 @@ void kGridSpreadCharge_kernel()
                    int atomIndex = atomData.x;
                    int z = atomData.y;
                    int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : cSim.pmeGridSize.z);
-                    float atomCharge = cSim.pPosq[atomIndex].w;
+                    float atomCharge = tex1Dfetch(bsplineThetaRef, atomIndex+ix*cSim.atoms).w;
                    result += atomCharge*tex1Dfetch(bsplineThetaRef, atomIndex+ix*cSim.atoms).x*tex1Dfetch(bsplineThetaRef, atomIndex+iy*cSim.atoms).y*tex1Dfetch(bsplineThetaRef, atomIndex+iz*cSim.atoms).z;
                }
                if (z1 > gridPoint.z)
@@ -271,7 +272,7 @@ void kGridSpreadCharge_kernel()
                        int atomIndex = atomData.x;
                        int z = atomData.y;
                        int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : cSim.pmeGridSize.z);
-                        float atomCharge = cSim.pPosq[atomIndex].w;
+                        float atomCharge = tex1Dfetch(bsplineThetaRef, atomIndex+ix*cSim.atoms).w;
                        result += atomCharge*tex1Dfetch(bsplineThetaRef, atomIndex+ix*cSim.atoms).x*tex1Dfetch(bsplineThetaRef, atomIndex+iy*cSim.atoms).y*tex1Dfetch(bsplineThetaRef, atomIndex+iz*cSim.atoms).z;
                    }
                }

--- a/platforms/opencl/src/kernels/pme.cl
+++ b/platforms/opencl/src/kernels/pme.cl
@@ -37,6 +37,7 @@ __kernel void updateBsplines(__global float4* posq, __global float4* pmeBsplineT
            data[PME_ORDER-j-1] = scale*((dr+(float4) j)*data[PME_ORDER-j-2] + (-dr+(float4) (PME_ORDER-j))*data[PME_ORDER-j-1]);
        data[0] = scale*(-dr+1.0f)*data[0];
        for (int j = 0; j < PME_ORDER; j++) {
+            data[j].w = pos.w; // Storing the charge here improves cache coherency in the charge spreading kernel
            pmeBsplineTheta[i+j*NUM_ATOMS] = data[j];
            pmeBsplineDTheta[i+j*NUM_ATOMS] = ddata[j];
        }
@@ -108,7 +109,7 @@ __kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGrid
                    int atomIndex = atomData.x;
                    int z = atomData.y;
                    int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
-                    float atomCharge = posq[atomIndex].w;
+                    float atomCharge = pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].w;
                    result += atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z;
                }
                if (z1 > gridPoint.z)
@@ -123,7 +124,7 @@ __kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGrid
                        int atomIndex = atomData.x;
                        int z = atomData.y;
                        int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
-                        float atomCharge = posq[atomIndex].w;
+                        float atomCharge = pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].w;
                        result += atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z;
                    }
                }