Commit 55801d55 authored by Marc Marí's avatar Marc Marí
Browse files

Use coarser granularity to split work in PME kernels

parent 7c7dc751
...@@ -66,96 +66,102 @@ static void spreadCharge(float* posq, float* grid, int gridx, int gridy, int gri ...@@ -66,96 +66,102 @@ static void spreadCharge(float* posq, float* grid, int gridx, int gridy, int gri
float posInBox[4] = {0,0,0,0}; float posInBox[4] = {0,0,0,0};
memset(grid, 0, sizeof(float)*gridx*gridy*gridz); memset(grid, 0, sizeof(float)*gridx*gridy*gridz);
int i = threadIndex; const int groupSize = max(1, numParticles / (10 * numThreads));
int start = groupSize * threadIndex;
while (true) { while (true) {
if (!deterministic) if (!deterministic)
i = atomicCounter++; start = atomicCounter.fetch_add(groupSize);
if (i >= numParticles)
if (start >= numParticles)
break; break;
// Find the position relative to the nearest grid point. int end = min(start + groupSize, numParticles);
for (int i = start; i < end; ++i) {
fvec4 pos(&posq[4*i]); // Find the position relative to the nearest grid point.
(pos-boxSize*floor(pos*invBoxSize)).store(posInBox);
fvec4 t = posInBox[0]*recipBoxVec0 + posInBox[1]*recipBoxVec1 + posInBox[2]*recipBoxVec2; fvec4 pos(&posq[4*i]);
t = (t-floor(t))*gridSize; (pos-boxSize*floor(pos*invBoxSize)).store(posInBox);
ivec4 ti = t; fvec4 t = posInBox[0]*recipBoxVec0 + posInBox[1]*recipBoxVec1 + posInBox[2]*recipBoxVec2;
fvec4 dr = t-ti; t = (t-floor(t))*gridSize;
ivec4 gridIndex = ti-(gridSizeInt&ti==gridSizeInt); ivec4 ti = t;
fvec4 dr = t-ti;
// Compute the B-spline coefficients. ivec4 gridIndex = ti-(gridSizeInt&ti==gridSizeInt);
fvec4 data[PME_ORDER]; // Compute the B-spline coefficients.
data[PME_ORDER-1] = 0.0f;
data[1] = dr; fvec4 data[PME_ORDER];
data[0] = one-dr; data[PME_ORDER-1] = 0.0f;
for (int j = 3; j < PME_ORDER; j++) { data[1] = dr;
fvec4 div(1.0f/(j-1)); data[0] = one-dr;
data[j-1] = div*dr*data[j-2]; for (int j = 3; j < PME_ORDER; j++) {
for (int k = 1; k < j-1; k++) fvec4 div(1.0f/(j-1));
data[j-k-1] = div*((dr+k)*data[j-k-2]+(fvec4(j-k)-dr)*data[j-k-1]); data[j-1] = div*dr*data[j-2];
data[0] = div*(one-dr)*data[0]; for (int k = 1; k < j-1; k++)
} data[j-k-1] = div*((dr+k)*data[j-k-2]+(fvec4(j-k)-dr)*data[j-k-1]);
data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2]; data[0] = div*(one-dr)*data[0];
for (int j = 1; j < (PME_ORDER-1); j++) }
data[PME_ORDER-j-1] = scale*((dr+j)*data[PME_ORDER-j-2]+(fvec4(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]); data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
data[0] = scale*(one-dr)*data[0]; for (int j = 1; j < (PME_ORDER-1); j++)
data[PME_ORDER-j-1] = scale*((dr+j)*data[PME_ORDER-j-2]+(fvec4(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
// Spread the charges. data[0] = scale*(one-dr)*data[0];
int gridIndexX = gridIndex[0]; // Spread the charges.
int gridIndexY = gridIndex[1];
int gridIndexZ = gridIndex[2]; int gridIndexX = gridIndex[0];
if (gridIndexX < 0) int gridIndexY = gridIndex[1];
return; // This happens when a simulation blows up and coordinates become NaN. int gridIndexZ = gridIndex[2];
int zindex[PME_ORDER]; if (gridIndexX < 0)
for (int j = 0; j < PME_ORDER; j++) { return; // This happens when a simulation blows up and coordinates become NaN.
zindex[j] = gridIndexZ+j; int zindex[PME_ORDER];
zindex[j] -= (zindex[j] >= gridz ? gridz : 0); for (int j = 0; j < PME_ORDER; j++) {
} zindex[j] = gridIndexZ+j;
float charge = epsilonFactor*posq[4*i+3]; zindex[j] -= (zindex[j] >= gridz ? gridz : 0);
fvec4 zdata0to3(data[0][2], data[1][2], data[2][2], data[3][2]); }
float zdata4 = data[4][2]; float charge = epsilonFactor*posq[4*i+3];
if (gridIndexZ+4 < gridz) { fvec4 zdata0to3(data[0][2], data[1][2], data[2][2], data[3][2]);
for (int ix = 0; ix < PME_ORDER; ix++) { float zdata4 = data[4][2];
int xbase = gridIndexX+ix; if (gridIndexZ+4 < gridz) {
xbase -= (xbase >= gridx ? gridx : 0); for (int ix = 0; ix < PME_ORDER; ix++) {
xbase = xbase*gridy*gridz; int xbase = gridIndexX+ix;
float xdata = charge*data[ix][0]; xbase -= (xbase >= gridx ? gridx : 0);
for (int iy = 0; iy < PME_ORDER; iy++) { xbase = xbase*gridy*gridz;
int ybase = gridIndexY+iy; float xdata = charge*data[ix][0];
ybase -= (ybase >= gridy ? gridy : 0); for (int iy = 0; iy < PME_ORDER; iy++) {
ybase = xbase + ybase*gridz; int ybase = gridIndexY+iy;
float multiplier = xdata*data[iy][1]; ybase -= (ybase >= gridy ? gridy : 0);
fvec4 add0to3 = zdata0to3*multiplier; ybase = xbase + ybase*gridz;
(fvec4(&grid[ybase+gridIndexZ])+add0to3).store(&grid[ybase+gridIndexZ]); float multiplier = xdata*data[iy][1];
grid[ybase+zindex[4]] += multiplier*zdata4; fvec4 add0to3 = zdata0to3*multiplier;
(fvec4(&grid[ybase+gridIndexZ])+add0to3).store(&grid[ybase+gridIndexZ]);
grid[ybase+zindex[4]] += multiplier*zdata4;
}
} }
} }
} else {
else { for (int ix = 0; ix < PME_ORDER; ix++) {
for (int ix = 0; ix < PME_ORDER; ix++) { int xbase = gridIndexX+ix;
int xbase = gridIndexX+ix; xbase -= (xbase >= gridx ? gridx : 0);
xbase -= (xbase >= gridx ? gridx : 0); xbase = xbase*gridy*gridz;
xbase = xbase*gridy*gridz; float xdata = charge*data[ix][0];
float xdata = charge*data[ix][0]; for (int iy = 0; iy < PME_ORDER; iy++) {
for (int iy = 0; iy < PME_ORDER; iy++) { int ybase = gridIndexY+iy;
int ybase = gridIndexY+iy; ybase -= (ybase >= gridy ? gridy : 0);
ybase -= (ybase >= gridy ? gridy : 0); ybase = xbase + ybase*gridz;
ybase = xbase + ybase*gridz; float multiplier = xdata*data[iy][1];
float multiplier = xdata*data[iy][1]; fvec4 add0to3 = zdata0to3*multiplier;
fvec4 add0to3 = zdata0to3*multiplier; add0to3.store(temp);
add0to3.store(temp); grid[ybase+zindex[0]] += temp[0];
grid[ybase+zindex[0]] += temp[0]; grid[ybase+zindex[1]] += temp[1];
grid[ybase+zindex[1]] += temp[1]; grid[ybase+zindex[2]] += temp[2];
grid[ybase+zindex[2]] += temp[2]; grid[ybase+zindex[3]] += temp[3];
grid[ybase+zindex[3]] += temp[3]; grid[ybase+zindex[4]] += multiplier*zdata4;
grid[ybase+zindex[4]] += multiplier*zdata4; }
} }
} }
} }
if (deterministic) if (deterministic)
i += numThreads; start += groupSize * numThreads;
} }
} }
...@@ -310,7 +316,7 @@ static void reciprocalConvolution(int start, int end, fftwf_complex* grid, vecto ...@@ -310,7 +316,7 @@ static void reciprocalConvolution(int start, int end, fftwf_complex* grid, vecto
} }
} }
static void interpolateForces(float* posq, float* force, float* grid, int gridx, int gridy, int gridz, int numParticles, Vec3* periodicBoxVectors, Vec3* recipBoxVectors, atomic<int>& atomicCounter, const float epsilonFactor) { static void interpolateForces(float* posq, float* force, float* grid, int gridx, int gridy, int gridz, int numParticles, Vec3* periodicBoxVectors, Vec3* recipBoxVectors, atomic<int>& atomicCounter, const float epsilonFactor, int numThreads) {
fvec4 boxSize((float) periodicBoxVectors[0][0], (float) periodicBoxVectors[1][1], (float) periodicBoxVectors[2][2], 0); fvec4 boxSize((float) periodicBoxVectors[0][0], (float) periodicBoxVectors[1][1], (float) periodicBoxVectors[2][2], 0);
fvec4 invBoxSize((float) recipBoxVectors[0][0], (float) recipBoxVectors[1][1], (float) recipBoxVectors[2][2], 0); fvec4 invBoxSize((float) recipBoxVectors[0][0], (float) recipBoxVectors[1][1], (float) recipBoxVectors[2][2], 0);
fvec4 recipBoxVec0((float) recipBoxVectors[0][0], (float) recipBoxVectors[0][1], (float) recipBoxVectors[0][2], 0); fvec4 recipBoxVec0((float) recipBoxVectors[0][0], (float) recipBoxVectors[0][1], (float) recipBoxVectors[0][2], 0);
...@@ -320,88 +326,94 @@ static void interpolateForces(float* posq, float* force, float* grid, int gridx, ...@@ -320,88 +326,94 @@ static void interpolateForces(float* posq, float* force, float* grid, int gridx,
ivec4 gridSizeInt(gridx, gridy, gridz, 0); ivec4 gridSizeInt(gridx, gridy, gridz, 0);
fvec4 one(1); fvec4 one(1);
fvec4 scale(1.0f/(PME_ORDER-1)); fvec4 scale(1.0f/(PME_ORDER-1));
const int groupSize = max(1, numParticles / (10 * numThreads));
while (true) { while (true) {
int i = atomicCounter++; int start = atomicCounter.fetch_add(groupSize);
if (i >= numParticles) if (start >= numParticles)
break; break;
// Find the position relative to the nearest grid point. int end = min(start + groupSize, numParticles);
fvec4 pos(&posq[4*i]); for (int i = start; i < end; i++) {
float posInBox[4]; // Find the position relative to the nearest grid point.
(pos-boxSize*floor(pos*invBoxSize)).store(posInBox);
fvec4 t = posInBox[0]*recipBoxVec0 + posInBox[1]*recipBoxVec1 + posInBox[2]*recipBoxVec2; fvec4 pos(&posq[4*i]);
t = (t-floor(t))*gridSize; float posInBox[4];
ivec4 ti = t; (pos-boxSize*floor(pos*invBoxSize)).store(posInBox);
fvec4 dr = t-ti; fvec4 t = posInBox[0]*recipBoxVec0 + posInBox[1]*recipBoxVec1 + posInBox[2]*recipBoxVec2;
ivec4 gridIndex = ti-(gridSizeInt&ti==gridSizeInt); t = (t-floor(t))*gridSize;
ivec4 ti = t;
// Compute the B-spline coefficients. fvec4 dr = t-ti;
ivec4 gridIndex = ti-(gridSizeInt&ti==gridSizeInt);
fvec4 data[PME_ORDER];
fvec4 ddata[PME_ORDER]; // Compute the B-spline coefficients.
data[PME_ORDER-1] = 0.0f;
data[1] = dr; fvec4 data[PME_ORDER];
data[0] = one-dr; fvec4 ddata[PME_ORDER];
for (int j = 3; j < PME_ORDER; j++) { data[PME_ORDER-1] = 0.0f;
fvec4 div(1.0f/(j-1)); data[1] = dr;
data[j-1] = div*dr*data[j-2]; data[0] = one-dr;
for (int k = 1; k < j-1; k++) for (int j = 3; j < PME_ORDER; j++) {
data[j-k-1] = div*((dr+k)*data[j-k-2]+(fvec4(j-k)-dr)*data[j-k-1]); fvec4 div(1.0f/(j-1));
data[0] = div*(one-dr)*data[0]; data[j-1] = div*dr*data[j-2];
} for (int k = 1; k < j-1; k++)
ddata[0] = -data[0]; data[j-k-1] = div*((dr+k)*data[j-k-2]+(fvec4(j-k)-dr)*data[j-k-1]);
for (int j = 1; j < PME_ORDER; j++) data[0] = div*(one-dr)*data[0];
ddata[j] = data[j-1]-data[j]; }
data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2]; ddata[0] = -data[0];
for (int j = 1; j < (PME_ORDER-1); j++) for (int j = 1; j < PME_ORDER; j++)
data[PME_ORDER-j-1] = scale*((dr+j)*data[PME_ORDER-j-2]+(fvec4(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]); ddata[j] = data[j-1]-data[j];
data[0] = scale*(one-dr)*data[0]; data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
for (int j = 1; j < (PME_ORDER-1); j++)
// Compute the force on this atom. data[PME_ORDER-j-1] = scale*((dr+j)*data[PME_ORDER-j-2]+(fvec4(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
data[0] = scale*(one-dr)*data[0];
int gridIndexX = gridIndex[0];
int gridIndexY = gridIndex[1]; // Compute the force on this atom.
int gridIndexZ = gridIndex[2];
if (gridIndexX < 0) int gridIndexX = gridIndex[0];
return; // This happens when a simulation blows up and coordinates become NaN. int gridIndexY = gridIndex[1];
int zindex[PME_ORDER]; int gridIndexZ = gridIndex[2];
for (int j = 0; j < PME_ORDER; j++) { if (gridIndexX < 0)
zindex[j] = gridIndexZ+j; return; // This happens when a simulation blows up and coordinates become NaN.
zindex[j] -= (zindex[j] >= gridz ? gridz : 0); int zindex[PME_ORDER];
} for (int j = 0; j < PME_ORDER; j++) {
fvec4 zdata[PME_ORDER]; zindex[j] = gridIndexZ+j;
for (int j = 0; j < PME_ORDER; j++) zindex[j] -= (zindex[j] >= gridz ? gridz : 0);
zdata[j] = fvec4(data[j][2], data[j][2], ddata[j][2], 0); }
fvec4 f = 0.0f; fvec4 zdata[PME_ORDER];
for (int ix = 0; ix < PME_ORDER; ix++) { for (int j = 0; j < PME_ORDER; j++)
int xbase = gridIndexX+ix; zdata[j] = fvec4(data[j][2], data[j][2], ddata[j][2], 0);
xbase -= (xbase >= gridx ? gridx : 0); fvec4 f = 0.0f;
xbase = xbase*gridy*gridz; for (int ix = 0; ix < PME_ORDER; ix++) {
float dx = data[ix][0]; int xbase = gridIndexX+ix;
float ddx = ddata[ix][0]; xbase -= (xbase >= gridx ? gridx : 0);
fvec4 xdata(ddx, dx, dx, 0); xbase = xbase*gridy*gridz;
float dx = data[ix][0];
for (int iy = 0; iy < PME_ORDER; iy++) { float ddx = ddata[ix][0];
int ybase = gridIndexY+iy; fvec4 xdata(ddx, dx, dx, 0);
ybase -= (ybase >= gridy ? gridy : 0);
ybase = xbase + ybase*gridz; for (int iy = 0; iy < PME_ORDER; iy++) {
float dy = data[iy][1]; int ybase = gridIndexY+iy;
float ddy = ddata[iy][1]; ybase -= (ybase >= gridy ? gridy : 0);
fvec4 xydata = xdata*fvec4(dy, ddy, dy, 0); ybase = xbase + ybase*gridz;
float dy = data[iy][1];
for (int iz = 0; iz < PME_ORDER; iz++) { float ddy = ddata[iy][1];
fvec4 gridValue(grid[ybase+zindex[iz]]); fvec4 xydata = xdata*fvec4(dy, ddy, dy, 0);
f = f+xydata*zdata[iz]*gridValue;
for (int iz = 0; iz < PME_ORDER; iz++) {
fvec4 gridValue(grid[ybase+zindex[iz]]);
f = f+xydata*zdata[iz]*gridValue;
}
} }
} }
f *= -epsilonFactor*posq[4*i+3];
float fc[4];
f.store(fc);
force[4*i+0] = fc[0]*gridx*(float)recipBoxVectors[0][0];
force[4*i+1] = fc[0]*gridx*(float)recipBoxVectors[1][0]+fc[1]*gridy*(float)recipBoxVectors[1][1];
force[4*i+2] = fc[0]*gridx*(float)recipBoxVectors[2][0]+fc[1]*gridy*(float)recipBoxVectors[2][1]+fc[2]*gridz*(float)recipBoxVectors[2][2];
} }
f *= -epsilonFactor*posq[4*i+3];
float fc[4];
f.store(fc);
force[4*i+0] = fc[0]*gridx*(float)recipBoxVectors[0][0];
force[4*i+1] = fc[0]*gridx*(float)recipBoxVectors[1][0]+fc[1]*gridy*(float)recipBoxVectors[1][1];
force[4*i+2] = fc[0]*gridx*(float)recipBoxVectors[2][0]+fc[1]*gridy*(float)recipBoxVectors[2][1]+fc[2]*gridz*(float)recipBoxVectors[2][2];
} }
} }
...@@ -606,7 +618,7 @@ void CpuCalcPmeReciprocalForceKernel::runWorkerThread(ThreadPool& threads, int i ...@@ -606,7 +618,7 @@ void CpuCalcPmeReciprocalForceKernel::runWorkerThread(ThreadPool& threads, int i
} }
reciprocalConvolution(complexStart, complexEnd, complexGrid, recipEterm); reciprocalConvolution(complexStart, complexEnd, complexGrid, recipEterm);
threads.syncThreads(); threads.syncThreads();
interpolateForces(posq, &force[0], realGrid, gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors, atomicCounter, epsilonFactor); interpolateForces(posq, &force[0], realGrid, gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors, atomicCounter, epsilonFactor, numThreads);
} }
void CpuCalcPmeReciprocalForceKernel::beginComputation(IO& io, const Vec3* periodicBoxVectors, bool includeEnergy) { void CpuCalcPmeReciprocalForceKernel::beginComputation(IO& io, const Vec3* periodicBoxVectors, bool includeEnergy) {
...@@ -900,7 +912,7 @@ void CpuCalcDispersionPmeReciprocalForceKernel::runWorkerThread(ThreadPool& thre ...@@ -900,7 +912,7 @@ void CpuCalcDispersionPmeReciprocalForceKernel::runWorkerThread(ThreadPool& thre
complexStart = (index*complexSize)/numThreads; complexStart = (index*complexSize)/numThreads;
reciprocalConvolution(complexStart, complexEnd, complexGrid, recipEterm); reciprocalConvolution(complexStart, complexEnd, complexGrid, recipEterm);
threads.syncThreads(); threads.syncThreads();
interpolateForces(posq, &force[0], realGrid, gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors, atomicCounter, epsilonFactor); interpolateForces(posq, &force[0], realGrid, gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors, atomicCounter, epsilonFactor, numThreads);
} }
void CpuCalcDispersionPmeReciprocalForceKernel::beginComputation(CalcPmeReciprocalForceKernel::IO& io, const Vec3* periodicBoxVectors, bool includeEnergy) { void CpuCalcDispersionPmeReciprocalForceKernel::beginComputation(CalcPmeReciprocalForceKernel::IO& io, const Vec3* periodicBoxVectors, bool includeEnergy) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment