Commit 839ab51e authored by Peter Eastman's avatar Peter Eastman
Browse files

Optimization to improve cache coherency in charge spreading kernel

parent 92766e8a
...@@ -171,6 +171,7 @@ void kUpdateBsplines_kernel() ...@@ -171,6 +171,7 @@ void kUpdateBsplines_kernel()
for (int j = 0; j < PME_ORDER; j++) for (int j = 0; j < PME_ORDER; j++)
{ {
data[j].w = posq.w; // Storing the charge here improves cache coherency in the charge spreading kernel
cSim.pPmeBsplineTheta[i + j*cSim.atoms] = data[j]; cSim.pPmeBsplineTheta[i + j*cSim.atoms] = data[j];
cSim.pPmeBsplineDtheta[i + j*cSim.atoms] = ddata[j]; cSim.pPmeBsplineDtheta[i + j*cSim.atoms] = ddata[j];
} }
...@@ -256,7 +257,7 @@ void kGridSpreadCharge_kernel() ...@@ -256,7 +257,7 @@ void kGridSpreadCharge_kernel()
int atomIndex = atomData.x; int atomIndex = atomData.x;
int z = atomData.y; int z = atomData.y;
int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : cSim.pmeGridSize.z); int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : cSim.pmeGridSize.z);
float atomCharge = cSim.pPosq[atomIndex].w; float atomCharge = tex1Dfetch(bsplineThetaRef, atomIndex+ix*cSim.atoms).w;
result += atomCharge*tex1Dfetch(bsplineThetaRef, atomIndex+ix*cSim.atoms).x*tex1Dfetch(bsplineThetaRef, atomIndex+iy*cSim.atoms).y*tex1Dfetch(bsplineThetaRef, atomIndex+iz*cSim.atoms).z; result += atomCharge*tex1Dfetch(bsplineThetaRef, atomIndex+ix*cSim.atoms).x*tex1Dfetch(bsplineThetaRef, atomIndex+iy*cSim.atoms).y*tex1Dfetch(bsplineThetaRef, atomIndex+iz*cSim.atoms).z;
} }
if (z1 > gridPoint.z) if (z1 > gridPoint.z)
...@@ -271,7 +272,7 @@ void kGridSpreadCharge_kernel() ...@@ -271,7 +272,7 @@ void kGridSpreadCharge_kernel()
int atomIndex = atomData.x; int atomIndex = atomData.x;
int z = atomData.y; int z = atomData.y;
int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : cSim.pmeGridSize.z); int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : cSim.pmeGridSize.z);
float atomCharge = cSim.pPosq[atomIndex].w; float atomCharge = tex1Dfetch(bsplineThetaRef, atomIndex+ix*cSim.atoms).w;
result += atomCharge*tex1Dfetch(bsplineThetaRef, atomIndex+ix*cSim.atoms).x*tex1Dfetch(bsplineThetaRef, atomIndex+iy*cSim.atoms).y*tex1Dfetch(bsplineThetaRef, atomIndex+iz*cSim.atoms).z; result += atomCharge*tex1Dfetch(bsplineThetaRef, atomIndex+ix*cSim.atoms).x*tex1Dfetch(bsplineThetaRef, atomIndex+iy*cSim.atoms).y*tex1Dfetch(bsplineThetaRef, atomIndex+iz*cSim.atoms).z;
} }
} }
......
...@@ -37,6 +37,7 @@ __kernel void updateBsplines(__global float4* posq, __global float4* pmeBsplineT ...@@ -37,6 +37,7 @@ __kernel void updateBsplines(__global float4* posq, __global float4* pmeBsplineT
data[PME_ORDER-j-1] = scale*((dr+(float4) j)*data[PME_ORDER-j-2] + (-dr+(float4) (PME_ORDER-j))*data[PME_ORDER-j-1]); data[PME_ORDER-j-1] = scale*((dr+(float4) j)*data[PME_ORDER-j-2] + (-dr+(float4) (PME_ORDER-j))*data[PME_ORDER-j-1]);
data[0] = scale*(-dr+1.0f)*data[0]; data[0] = scale*(-dr+1.0f)*data[0];
for (int j = 0; j < PME_ORDER; j++) { for (int j = 0; j < PME_ORDER; j++) {
data[j].w = pos.w; // Storing the charge here improves cache coherency in the charge spreading kernel
pmeBsplineTheta[i+j*NUM_ATOMS] = data[j]; pmeBsplineTheta[i+j*NUM_ATOMS] = data[j];
pmeBsplineDTheta[i+j*NUM_ATOMS] = ddata[j]; pmeBsplineDTheta[i+j*NUM_ATOMS] = ddata[j];
} }
...@@ -108,7 +109,7 @@ __kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGrid ...@@ -108,7 +109,7 @@ __kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGrid
int atomIndex = atomData.x; int atomIndex = atomData.x;
int z = atomData.y; int z = atomData.y;
int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z); int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
float atomCharge = posq[atomIndex].w; float atomCharge = pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].w;
result += atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z; result += atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z;
} }
if (z1 > gridPoint.z) if (z1 > gridPoint.z)
...@@ -123,7 +124,7 @@ __kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGrid ...@@ -123,7 +124,7 @@ __kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGrid
int atomIndex = atomData.x; int atomIndex = atomData.x;
int z = atomData.y; int z = atomData.y;
int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z); int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
float atomCharge = posq[atomIndex].w; float atomCharge = pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].w;
result += atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z; result += atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z;
} }
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment