Commit dc474876 authored by peastman's avatar peastman
Browse files

Merge pull request #1152 from peastman/cpuopt

Further optimizations to CPU platform
parents ec799972 15e2ba85
...@@ -28,7 +28,7 @@ ...@@ -28,7 +28,7 @@
#include "SimTKOpenMMUtilities.h" #include "SimTKOpenMMUtilities.h"
#include "ReferenceForce.h" #include "ReferenceForce.h"
#include "CpuCustomGBForce.h" #include "CpuCustomGBForce.h"
#include "gmx_atomic.h" #include "openmm/internal/gmx_atomic.h"
using namespace OpenMM; using namespace OpenMM;
using namespace std; using namespace std;
......
...@@ -32,7 +32,7 @@ ...@@ -32,7 +32,7 @@
#include "ReferenceTabulatedFunction.h" #include "ReferenceTabulatedFunction.h"
#include "openmm/internal/CustomManyParticleForceImpl.h" #include "openmm/internal/CustomManyParticleForceImpl.h"
#include "lepton/CustomFunction.h" #include "lepton/CustomFunction.h"
#include "gmx_atomic.h" #include "openmm/internal/gmx_atomic.h"
using namespace OpenMM; using namespace OpenMM;
using namespace std; using namespace std;
......
...@@ -28,7 +28,7 @@ ...@@ -28,7 +28,7 @@
#include "SimTKOpenMMUtilities.h" #include "SimTKOpenMMUtilities.h"
#include "ReferenceForce.h" #include "ReferenceForce.h"
#include "CpuCustomNonbondedForce.h" #include "CpuCustomNonbondedForce.h"
#include "gmx_atomic.h" #include "openmm/internal/gmx_atomic.h"
using namespace OpenMM; using namespace OpenMM;
using namespace std; using namespace std;
......
...@@ -25,7 +25,7 @@ ...@@ -25,7 +25,7 @@
#include "CpuGBSAOBCForce.h" #include "CpuGBSAOBCForce.h"
#include "SimTKOpenMMRealType.h" #include "SimTKOpenMMRealType.h"
#include "openmm/internal/vectorize.h" #include "openmm/internal/vectorize.h"
#include "gmx_atomic.h" #include "openmm/internal/gmx_atomic.h"
#include <algorithm> #include <algorithm>
#include <cmath> #include <cmath>
#include <cstdlib> #include <cstdlib>
......
...@@ -28,7 +28,7 @@ ...@@ -28,7 +28,7 @@
#include "CpuNonbondedForce.h" #include "CpuNonbondedForce.h"
#include "ReferenceForce.h" #include "ReferenceForce.h"
#include "ReferencePME.h" #include "ReferencePME.h"
#include "gmx_atomic.h" #include "openmm/internal/gmx_atomic.h"
#include <algorithm> #include <algorithm>
// In case we're using some primitive version of Visual Studio this will // In case we're using some primitive version of Visual Studio this will
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2013 Stanford University and the Authors. * * Portions copyright (c) 2013-2015 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -30,6 +30,7 @@ ...@@ -30,6 +30,7 @@
* -------------------------------------------------------------------------- */ * -------------------------------------------------------------------------- */
#include "CpuSETTLE.h" #include "CpuSETTLE.h"
#include "openmm/internal/gmx_atomic.h"
using namespace OpenMM; using namespace OpenMM;
using namespace std; using namespace std;
...@@ -39,10 +40,14 @@ public: ...@@ -39,10 +40,14 @@ public:
ApplyToPositionsTask(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& atomCoordinatesP, vector<RealOpenMM>& inverseMasses, ApplyToPositionsTask(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& atomCoordinatesP, vector<RealOpenMM>& inverseMasses,
RealOpenMM tolerance, vector<ReferenceSETTLEAlgorithm*>& threadSettle) : atomCoordinates(atomCoordinates), atomCoordinatesP(atomCoordinatesP), RealOpenMM tolerance, vector<ReferenceSETTLEAlgorithm*>& threadSettle) : atomCoordinates(atomCoordinates), atomCoordinatesP(atomCoordinatesP),
inverseMasses(inverseMasses), tolerance(tolerance), threadSettle(threadSettle) { inverseMasses(inverseMasses), tolerance(tolerance), threadSettle(threadSettle) {
gmx_atomic_set(&atomicCounter, 0);
} }
void execute(ThreadPool& threads, int threadIndex) { void execute(ThreadPool& threads, int threadIndex) {
if (threadIndex < threadSettle.size()) { while (true) {
threadSettle[threadIndex]->apply(atomCoordinates, atomCoordinatesP, inverseMasses, tolerance); int index = gmx_atomic_fetch_add(&atomicCounter, 1);
if (index >= threadSettle.size())
break;
threadSettle[index]->apply(atomCoordinates, atomCoordinatesP, inverseMasses, tolerance);
} }
} }
vector<OpenMM::RealVec>& atomCoordinates; vector<OpenMM::RealVec>& atomCoordinates;
...@@ -50,6 +55,7 @@ public: ...@@ -50,6 +55,7 @@ public:
vector<RealOpenMM>& inverseMasses; vector<RealOpenMM>& inverseMasses;
RealOpenMM tolerance; RealOpenMM tolerance;
vector<ReferenceSETTLEAlgorithm*>& threadSettle; vector<ReferenceSETTLEAlgorithm*>& threadSettle;
gmx_atomic_t atomicCounter;
}; };
class CpuSETTLE::ApplyToVelocitiesTask : public ThreadPool::Task { class CpuSETTLE::ApplyToVelocitiesTask : public ThreadPool::Task {
...@@ -57,10 +63,14 @@ public: ...@@ -57,10 +63,14 @@ public:
ApplyToVelocitiesTask(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& velocities, vector<RealOpenMM>& inverseMasses, ApplyToVelocitiesTask(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& velocities, vector<RealOpenMM>& inverseMasses,
RealOpenMM tolerance, vector<ReferenceSETTLEAlgorithm*>& threadSettle) : atomCoordinates(atomCoordinates), velocities(velocities), RealOpenMM tolerance, vector<ReferenceSETTLEAlgorithm*>& threadSettle) : atomCoordinates(atomCoordinates), velocities(velocities),
inverseMasses(inverseMasses), tolerance(tolerance), threadSettle(threadSettle) { inverseMasses(inverseMasses), tolerance(tolerance), threadSettle(threadSettle) {
gmx_atomic_set(&atomicCounter, 0);
} }
void execute(ThreadPool& threads, int threadIndex) { void execute(ThreadPool& threads, int threadIndex) {
if (threadIndex < threadSettle.size()) { while (true) {
threadSettle[threadIndex]->applyToVelocities(atomCoordinates, velocities, inverseMasses, tolerance); int index = gmx_atomic_fetch_add(&atomicCounter, 1);
if (index >= threadSettle.size())
break;
threadSettle[index]->applyToVelocities(atomCoordinates, velocities, inverseMasses, tolerance);
} }
} }
vector<OpenMM::RealVec>& atomCoordinates; vector<OpenMM::RealVec>& atomCoordinates;
...@@ -68,17 +78,18 @@ public: ...@@ -68,17 +78,18 @@ public:
vector<RealOpenMM>& inverseMasses; vector<RealOpenMM>& inverseMasses;
RealOpenMM tolerance; RealOpenMM tolerance;
vector<ReferenceSETTLEAlgorithm*>& threadSettle; vector<ReferenceSETTLEAlgorithm*>& threadSettle;
gmx_atomic_t atomicCounter;
}; };
CpuSETTLE::CpuSETTLE(const System& system, const ReferenceSETTLEAlgorithm& settle, ThreadPool& threads) : threads(threads) { CpuSETTLE::CpuSETTLE(const System& system, const ReferenceSETTLEAlgorithm& settle, ThreadPool& threads) : threads(threads) {
int numThreads = threads.getNumThreads(); int numBlocks = 10*threads.getNumThreads();
int numClusters = settle.getNumClusters(); int numClusters = settle.getNumClusters();
vector<RealOpenMM> mass(system.getNumParticles()); vector<RealOpenMM> mass(system.getNumParticles());
for (int i = 0; i < system.getNumParticles(); i++) for (int i = 0; i < system.getNumParticles(); i++)
mass[i] = system.getParticleMass(i); mass[i] = system.getParticleMass(i);
for (int i = 0; i < numThreads; i++) { for (int i = 0; i < numBlocks; i++) {
int start = i*numClusters/numThreads; int start = i*numClusters/numBlocks;
int end = (i+1)*numClusters/numThreads; int end = (i+1)*numClusters/numBlocks;
if (start != end) { if (start != end) {
int numThreadClusters = end-start; int numThreadClusters = end-start;
vector<int> atom1(numThreadClusters), atom2(numThreadClusters), atom3(numThreadClusters); vector<int> atom1(numThreadClusters), atom2(numThreadClusters), atom3(numThreadClusters);
......
...@@ -49,7 +49,7 @@ static const int PME_ORDER = 5; ...@@ -49,7 +49,7 @@ static const int PME_ORDER = 5;
bool CpuCalcPmeReciprocalForceKernel::hasInitializedThreads = false; bool CpuCalcPmeReciprocalForceKernel::hasInitializedThreads = false;
int CpuCalcPmeReciprocalForceKernel::numThreads = 0; int CpuCalcPmeReciprocalForceKernel::numThreads = 0;
static void spreadCharge(int start, int end, float* posq, float* grid, int gridx, int gridy, int gridz, int numParticles, Vec3* periodicBoxVectors, Vec3* recipBoxVectors) { static void spreadCharge(float* posq, float* grid, int gridx, int gridy, int gridz, int numParticles, Vec3* periodicBoxVectors, Vec3* recipBoxVectors, gmx_atomic_t& atomicCounter) {
float temp[4]; float temp[4];
fvec4 boxSize((float) periodicBoxVectors[0][0], (float) periodicBoxVectors[1][1], (float) periodicBoxVectors[2][2], 0); fvec4 boxSize((float) periodicBoxVectors[0][0], (float) periodicBoxVectors[1][1], (float) periodicBoxVectors[2][2], 0);
fvec4 invBoxSize((float) recipBoxVectors[0][0], (float) recipBoxVectors[1][1], (float) recipBoxVectors[2][2], 0); fvec4 invBoxSize((float) recipBoxVectors[0][0], (float) recipBoxVectors[1][1], (float) recipBoxVectors[2][2], 0);
...@@ -64,7 +64,11 @@ static void spreadCharge(int start, int end, float* posq, float* grid, int gridx ...@@ -64,7 +64,11 @@ static void spreadCharge(int start, int end, float* posq, float* grid, int gridx
const float epsilonFactor = sqrt(ONE_4PI_EPS0); const float epsilonFactor = sqrt(ONE_4PI_EPS0);
memset(grid, 0, sizeof(float)*gridx*gridy*gridz); memset(grid, 0, sizeof(float)*gridx*gridy*gridz);
for (int i = start; i < end; i++) { while (true) {
int i = gmx_atomic_fetch_add(&atomicCounter, 1);
if (i >= numParticles)
break;
// Find the position relative to the nearest grid point. // Find the position relative to the nearest grid point.
fvec4 pos(&posq[4*i]); fvec4 pos(&posq[4*i]);
...@@ -225,25 +229,15 @@ static double reciprocalEnergy(int start, int end, fftwf_complex* grid, int grid ...@@ -225,25 +229,15 @@ static double reciprocalEnergy(int start, int end, fftwf_complex* grid, int grid
return 0.5*energy; return 0.5*energy;
} }
static void reciprocalConvolution(int start, int end, fftwf_complex* grid, int gridx, int gridy, int gridz, vector<float>& recipEterm) { static void reciprocalConvolution(int start, int end, fftwf_complex* grid, vector<float>& recipEterm) {
const unsigned int zsize = gridz/2+1; for (int index = start; index < end; index++) {
const unsigned int yzsize = gridy*zsize;
int firstz = (start == 0 ? 1 : 0);
for (int kx = start; kx < end; kx++) {
for (int ky = 0; ky < gridy; ky++) {
for (int kz = firstz; kz < zsize; kz++) {
int index = kx*yzsize + ky*zsize + kz;
float eterm = recipEterm[index]; float eterm = recipEterm[index];
grid[index][0] *= eterm; grid[index][0] *= eterm;
grid[index][1] *= eterm; grid[index][1] *= eterm;
} }
firstz = 0;
}
}
} }
static void interpolateForces(int start, int end, float* posq, float* force, float* grid, int gridx, int gridy, int gridz, int numParticles, Vec3* periodicBoxVectors, Vec3* recipBoxVectors) { static void interpolateForces(float* posq, float* force, float* grid, int gridx, int gridy, int gridz, int numParticles, Vec3* periodicBoxVectors, Vec3* recipBoxVectors, gmx_atomic_t& atomicCounter) {
fvec4 boxSize((float) periodicBoxVectors[0][0], (float) periodicBoxVectors[1][1], (float) periodicBoxVectors[2][2], 0); fvec4 boxSize((float) periodicBoxVectors[0][0], (float) periodicBoxVectors[1][1], (float) periodicBoxVectors[2][2], 0);
fvec4 invBoxSize((float) recipBoxVectors[0][0], (float) recipBoxVectors[1][1], (float) recipBoxVectors[2][2], 0); fvec4 invBoxSize((float) recipBoxVectors[0][0], (float) recipBoxVectors[1][1], (float) recipBoxVectors[2][2], 0);
fvec4 recipBoxVec0((float) recipBoxVectors[0][0], (float) recipBoxVectors[0][1], (float) recipBoxVectors[0][2], 0); fvec4 recipBoxVec0((float) recipBoxVectors[0][0], (float) recipBoxVectors[0][1], (float) recipBoxVectors[0][2], 0);
...@@ -254,7 +248,11 @@ static void interpolateForces(int start, int end, float* posq, float* force, flo ...@@ -254,7 +248,11 @@ static void interpolateForces(int start, int end, float* posq, float* force, flo
fvec4 one(1); fvec4 one(1);
fvec4 scale(1.0f/(PME_ORDER-1)); fvec4 scale(1.0f/(PME_ORDER-1));
const float epsilonFactor = sqrt(ONE_4PI_EPS0); const float epsilonFactor = sqrt(ONE_4PI_EPS0);
for (int i = start; i < end; i++) { while (true) {
int i = gmx_atomic_fetch_add(&atomicCounter, 1);
if (i >= numParticles)
break;
// Find the position relative to the nearest grid point. // Find the position relative to the nearest grid point.
fvec4 pos(&posq[4*i]); fvec4 pos(&posq[4*i]);
...@@ -485,6 +483,7 @@ void CpuCalcPmeReciprocalForceKernel::runMainThread() { ...@@ -485,6 +483,7 @@ void CpuCalcPmeReciprocalForceKernel::runMainThread() {
break; break;
posq = io->getPosq(); posq = io->getPosq();
ComputeTask task(*this); ComputeTask task(*this);
gmx_atomic_set(&atomicCounter, 0);
threads.execute(task); // Signal threads to perform charge spreading. threads.execute(task); // Signal threads to perform charge spreading.
threads.waitForThreads(); threads.waitForThreads();
threads.resumeThreads(); // Signal threads to sum the charge grids. threads.resumeThreads(); // Signal threads to sum the charge grids.
...@@ -503,6 +502,7 @@ void CpuCalcPmeReciprocalForceKernel::runMainThread() { ...@@ -503,6 +502,7 @@ void CpuCalcPmeReciprocalForceKernel::runMainThread() {
threads.resumeThreads(); // Signal threads to perform reciprocal convolution. threads.resumeThreads(); // Signal threads to perform reciprocal convolution.
threads.waitForThreads(); threads.waitForThreads();
fftwf_execute_dft_c2r(backwardFFT, complexGrid, realGrid); fftwf_execute_dft_c2r(backwardFFT, complexGrid, realGrid);
gmx_atomic_set(&atomicCounter, 0);
threads.resumeThreads(); // Signal threads to interpolate forces. threads.resumeThreads(); // Signal threads to interpolate forces.
threads.waitForThreads(); threads.waitForThreads();
isFinished = true; isFinished = true;
...@@ -515,14 +515,15 @@ void CpuCalcPmeReciprocalForceKernel::runMainThread() { ...@@ -515,14 +515,15 @@ void CpuCalcPmeReciprocalForceKernel::runMainThread() {
} }
void CpuCalcPmeReciprocalForceKernel::runWorkerThread(ThreadPool& threads, int index) { void CpuCalcPmeReciprocalForceKernel::runWorkerThread(ThreadPool& threads, int index) {
int particleStart = (index*numParticles)/numThreads;
int particleEnd = ((index+1)*numParticles)/numThreads;
int gridxStart = (index*gridx)/numThreads; int gridxStart = (index*gridx)/numThreads;
int gridxEnd = ((index+1)*gridx)/numThreads; int gridxEnd = ((index+1)*gridx)/numThreads;
int gridSize = (gridx*gridy*gridz+3)/4; int gridSize = (gridx*gridy*gridz+3)/4;
int gridStart = 4*((index*gridSize)/numThreads); int gridStart = 4*((index*gridSize)/numThreads);
int gridEnd = 4*(((index+1)*gridSize)/numThreads); int gridEnd = 4*(((index+1)*gridSize)/numThreads);
spreadCharge(particleStart, particleEnd, posq, tempGrid[index], gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors); int complexSize = gridx*gridy*(gridz/2+1);
int complexStart = max(1, ((index*complexSize)/numThreads));
int complexEnd = (((index+1)*complexSize)/numThreads);
spreadCharge(posq, tempGrid[index], gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors, atomicCounter);
threads.syncThreads(); threads.syncThreads();
int numGrids = tempGrid.size(); int numGrids = tempGrid.size();
for (int i = gridStart; i < gridEnd; i += 4) { for (int i = gridStart; i < gridEnd; i += 4) {
...@@ -540,9 +541,9 @@ void CpuCalcPmeReciprocalForceKernel::runWorkerThread(ThreadPool& threads, int i ...@@ -540,9 +541,9 @@ void CpuCalcPmeReciprocalForceKernel::runWorkerThread(ThreadPool& threads, int i
threadEnergy[index] = reciprocalEnergy(gridxStart, gridxEnd, complexGrid, gridx, gridy, gridz, alpha, bsplineModuli, periodicBoxVectors, recipBoxVectors); threadEnergy[index] = reciprocalEnergy(gridxStart, gridxEnd, complexGrid, gridx, gridy, gridz, alpha, bsplineModuli, periodicBoxVectors, recipBoxVectors);
threads.syncThreads(); threads.syncThreads();
} }
reciprocalConvolution(gridxStart, gridxEnd, complexGrid, gridx, gridy, gridz, recipEterm); reciprocalConvolution(complexStart, complexEnd, complexGrid, recipEterm);
threads.syncThreads(); threads.syncThreads();
interpolateForces(particleStart, particleEnd, posq, &force[0], realGrid, gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors); interpolateForces(posq, &force[0], realGrid, gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors, atomicCounter);
} }
void CpuCalcPmeReciprocalForceKernel::beginComputation(IO& io, const Vec3* periodicBoxVectors, bool includeEnergy) { void CpuCalcPmeReciprocalForceKernel::beginComputation(IO& io, const Vec3* periodicBoxVectors, bool includeEnergy) {
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2013-2014 Stanford University and the Authors. * * Portions copyright (c) 2013-2015 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -36,6 +36,7 @@ ...@@ -36,6 +36,7 @@
#include "internal/windowsExportPme.h" #include "internal/windowsExportPme.h"
#include "openmm/kernels.h" #include "openmm/kernels.h"
#include "openmm/Vec3.h" #include "openmm/Vec3.h"
#include "openmm/internal/gmx_atomic.h"
#include "openmm/internal/ThreadPool.h" #include "openmm/internal/ThreadPool.h"
#include <fftw3.h> #include <fftw3.h>
#include <pthread.h> #include <pthread.h>
...@@ -130,6 +131,7 @@ private: ...@@ -130,6 +131,7 @@ private:
float* posq; float* posq;
Vec3 periodicBoxVectors[3], recipBoxVectors[3]; Vec3 periodicBoxVectors[3], recipBoxVectors[3];
bool includeEnergy; bool includeEnergy;
gmx_atomic_t atomicCounter;
}; };
} // namespace OpenMM } // namespace OpenMM
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment