Unverified Commit 01ead86e authored by peastman's avatar peastman Committed by GitHub
Browse files

Merge pull request #2662 from mark-mb/parallel

Use coarser granularity to split work in PME kernels
parents d51ef9e6 55801d55
...@@ -66,13 +66,17 @@ static void spreadCharge(float* posq, float* grid, int gridx, int gridy, int gri ...@@ -66,13 +66,17 @@ static void spreadCharge(float* posq, float* grid, int gridx, int gridy, int gri
float posInBox[4] = {0,0,0,0}; float posInBox[4] = {0,0,0,0};
memset(grid, 0, sizeof(float)*gridx*gridy*gridz); memset(grid, 0, sizeof(float)*gridx*gridy*gridz);
int i = threadIndex; const int groupSize = max(1, numParticles / (10 * numThreads));
int start = groupSize * threadIndex;
while (true) { while (true) {
if (!deterministic) if (!deterministic)
i = atomicCounter++; start = atomicCounter.fetch_add(groupSize);
if (i >= numParticles)
if (start >= numParticles)
break; break;
int end = min(start + groupSize, numParticles);
for (int i = start; i < end; ++i) {
// Find the position relative to the nearest grid point. // Find the position relative to the nearest grid point.
fvec4 pos(&posq[4*i]); fvec4 pos(&posq[4*i]);
...@@ -154,8 +158,10 @@ static void spreadCharge(float* posq, float* grid, int gridx, int gridy, int gri ...@@ -154,8 +158,10 @@ static void spreadCharge(float* posq, float* grid, int gridx, int gridy, int gri
} }
} }
} }
}
if (deterministic) if (deterministic)
i += numThreads; start += groupSize * numThreads;
} }
} }
...@@ -310,7 +316,7 @@ static void reciprocalConvolution(int start, int end, fftwf_complex* grid, vecto ...@@ -310,7 +316,7 @@ static void reciprocalConvolution(int start, int end, fftwf_complex* grid, vecto
} }
} }
static void interpolateForces(float* posq, float* force, float* grid, int gridx, int gridy, int gridz, int numParticles, Vec3* periodicBoxVectors, Vec3* recipBoxVectors, atomic<int>& atomicCounter, const float epsilonFactor) { static void interpolateForces(float* posq, float* force, float* grid, int gridx, int gridy, int gridz, int numParticles, Vec3* periodicBoxVectors, Vec3* recipBoxVectors, atomic<int>& atomicCounter, const float epsilonFactor, int numThreads) {
fvec4 boxSize((float) periodicBoxVectors[0][0], (float) periodicBoxVectors[1][1], (float) periodicBoxVectors[2][2], 0); fvec4 boxSize((float) periodicBoxVectors[0][0], (float) periodicBoxVectors[1][1], (float) periodicBoxVectors[2][2], 0);
fvec4 invBoxSize((float) recipBoxVectors[0][0], (float) recipBoxVectors[1][1], (float) recipBoxVectors[2][2], 0); fvec4 invBoxSize((float) recipBoxVectors[0][0], (float) recipBoxVectors[1][1], (float) recipBoxVectors[2][2], 0);
fvec4 recipBoxVec0((float) recipBoxVectors[0][0], (float) recipBoxVectors[0][1], (float) recipBoxVectors[0][2], 0); fvec4 recipBoxVec0((float) recipBoxVectors[0][0], (float) recipBoxVectors[0][1], (float) recipBoxVectors[0][2], 0);
...@@ -320,11 +326,16 @@ static void interpolateForces(float* posq, float* force, float* grid, int gridx, ...@@ -320,11 +326,16 @@ static void interpolateForces(float* posq, float* force, float* grid, int gridx,
ivec4 gridSizeInt(gridx, gridy, gridz, 0); ivec4 gridSizeInt(gridx, gridy, gridz, 0);
fvec4 one(1); fvec4 one(1);
fvec4 scale(1.0f/(PME_ORDER-1)); fvec4 scale(1.0f/(PME_ORDER-1));
const int groupSize = max(1, numParticles / (10 * numThreads));
while (true) { while (true) {
int i = atomicCounter++; int start = atomicCounter.fetch_add(groupSize);
if (i >= numParticles) if (start >= numParticles)
break; break;
int end = min(start + groupSize, numParticles);
for (int i = start; i < end; i++) {
// Find the position relative to the nearest grid point. // Find the position relative to the nearest grid point.
fvec4 pos(&posq[4*i]); fvec4 pos(&posq[4*i]);
...@@ -403,6 +414,7 @@ static void interpolateForces(float* posq, float* force, float* grid, int gridx, ...@@ -403,6 +414,7 @@ static void interpolateForces(float* posq, float* force, float* grid, int gridx,
force[4*i+1] = fc[0]*gridx*(float)recipBoxVectors[1][0]+fc[1]*gridy*(float)recipBoxVectors[1][1]; force[4*i+1] = fc[0]*gridx*(float)recipBoxVectors[1][0]+fc[1]*gridy*(float)recipBoxVectors[1][1];
force[4*i+2] = fc[0]*gridx*(float)recipBoxVectors[2][0]+fc[1]*gridy*(float)recipBoxVectors[2][1]+fc[2]*gridz*(float)recipBoxVectors[2][2]; force[4*i+2] = fc[0]*gridx*(float)recipBoxVectors[2][0]+fc[1]*gridy*(float)recipBoxVectors[2][1]+fc[2]*gridz*(float)recipBoxVectors[2][2];
} }
}
} }
static void* threadBody(void* args) { static void* threadBody(void* args) {
...@@ -606,7 +618,7 @@ void CpuCalcPmeReciprocalForceKernel::runWorkerThread(ThreadPool& threads, int i ...@@ -606,7 +618,7 @@ void CpuCalcPmeReciprocalForceKernel::runWorkerThread(ThreadPool& threads, int i
} }
reciprocalConvolution(complexStart, complexEnd, complexGrid, recipEterm); reciprocalConvolution(complexStart, complexEnd, complexGrid, recipEterm);
threads.syncThreads(); threads.syncThreads();
interpolateForces(posq, &force[0], realGrid, gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors, atomicCounter, epsilonFactor); interpolateForces(posq, &force[0], realGrid, gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors, atomicCounter, epsilonFactor, numThreads);
} }
void CpuCalcPmeReciprocalForceKernel::beginComputation(IO& io, const Vec3* periodicBoxVectors, bool includeEnergy) { void CpuCalcPmeReciprocalForceKernel::beginComputation(IO& io, const Vec3* periodicBoxVectors, bool includeEnergy) {
...@@ -900,7 +912,7 @@ void CpuCalcDispersionPmeReciprocalForceKernel::runWorkerThread(ThreadPool& thre ...@@ -900,7 +912,7 @@ void CpuCalcDispersionPmeReciprocalForceKernel::runWorkerThread(ThreadPool& thre
complexStart = (index*complexSize)/numThreads; complexStart = (index*complexSize)/numThreads;
reciprocalConvolution(complexStart, complexEnd, complexGrid, recipEterm); reciprocalConvolution(complexStart, complexEnd, complexGrid, recipEterm);
threads.syncThreads(); threads.syncThreads();
interpolateForces(posq, &force[0], realGrid, gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors, atomicCounter, epsilonFactor); interpolateForces(posq, &force[0], realGrid, gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors, atomicCounter, epsilonFactor, numThreads);
} }
void CpuCalcDispersionPmeReciprocalForceKernel::beginComputation(CalcPmeReciprocalForceKernel::IO& io, const Vec3* periodicBoxVectors, bool includeEnergy) { void CpuCalcDispersionPmeReciprocalForceKernel::beginComputation(CalcPmeReciprocalForceKernel::IO& io, const Vec3* periodicBoxVectors, bool includeEnergy) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment