Optimizations to CPU nonbonded forces: better load balancing between threads, use linear splines instead of cubic
be04ec57 · peastman · 56cf0fde · be04ec57 · be04ec57 · be04ec57
Commit be04ec57 authored Dec 04, 2013 by peastman
5 changed files
--- a/platforms/cpu/include/CpuGBSAOBCForce.h
+++ b/platforms/cpu/include/CpuGBSAOBCForce.h
@@ -107,6 +107,7 @@ private:
    float const* posq;
    std::vector<std::vector<float> >* threadForce;
    bool includeEnergy;
+    void* atomicCounter;
    static const int NUM_TABLE_POINTS;

--- a/platforms/cpu/include/CpuNonbondedForce.h
+++ b/platforms/cpu/include/CpuNonbondedForce.h
@@ -156,6 +156,7 @@ private:
        bool periodic;
        bool ewald;
        bool pme;
+        bool tableIsValid;
        const CpuNeighborList* neighborList;
        float periodicBoxSize[3];
        float cutoffDistance, switchingDistance;
@@ -174,6 +175,7 @@ private:
        std::set<int> const* exclusions;
        std::vector<std::vector<float> >* threadForce;
        bool includeEnergy;
+        void* atomicCounter;
        static const float TWO_OVER_SQRT_PI;
        static const int NUM_TABLE_POINTS;

--- a/platforms/cpu/src/CpuGBSAOBCForce.cpp
+++ b/platforms/cpu/src/CpuGBSAOBCForce.cpp
@@ -24,14 +24,14 @@
 #include "CpuGBSAOBCForce.h"
 #include "SimTKOpenMMRealType.h"
-#include "openmm/internal/SplineFitter.h"
 #include "openmm/internal/vectorize.h"
+#include "gmx_atomic.h"
 #include <cmath>
 using namespace std;
 using namespace OpenMM;
-const int CpuGBSAOBCForce::NUM_TABLE_POINTS = 1025;
+const int CpuGBSAOBCForce::NUM_TABLE_POINTS = 2048;
 class CpuGBSAOBCForce::ComputeTask : public ThreadPool::Task {
 public:
@@ -46,20 +46,10 @@ public:
 CpuGBSAOBCForce::CpuGBSAOBCForce() : cutoff(false), periodic(false) {
    logDX = 0.5/NUM_TABLE_POINTS;
    logDXInv = 1.0f/logDX;
-    vector<double> x(NUM_TABLE_POINTS+1);
+    logTable.resize(NUM_TABLE_POINTS+1);
-    vector<double> y(NUM_TABLE_POINTS+1);
-    vector<double> deriv;
    for (int i = 0; i < NUM_TABLE_POINTS+1; i++) {
-        x[i] = 0.5+i*0.5/NUM_TABLE_POINTS;
+        double x = 0.5+i*logDX;
-        y[i] = log(x[i]);
+        logTable[i] = log(x);
-    }
-    SplineFitter::createNaturalSpline(x, y, deriv);
-    logTable.resize(4*NUM_TABLE_POINTS);
-    for (int i = 0; i < NUM_TABLE_POINTS; i++) {
-        logTable[4*i] = (float) y[i];
-        logTable[4*i+1] = (float) y[i+1];
-        logTable[4*i+2] = (float) (deriv[i]*logDX*logDX/6);
-        logTable[4*i+3] = (float) (deriv[i+1]*logDX*logDX/6);
    }
 }
@@ -104,16 +94,22 @@ void CpuGBSAOBCForce::computeForce(const std::vector<float>& posq, vector<vector
    threadBornForces.resize(numThreads);
    for (int i = 0; i < numThreads; i++)
        threadBornForces[i].resize(particleParams.size()+3);
+    gmx_atomic_t counter;
+    this->atomicCounter = &counter;
    // Signal the threads to start running and wait for them to finish.
    ComputeTask task(*this);
+    gmx_atomic_set(&counter, 0);
    threads.execute(task);
    threads.waitForThreads(); // Compute Born radii
+    gmx_atomic_set(&counter, 0);
    threads.resumeThreads();
    threads.waitForThreads(); // Compute surface area term
+    gmx_atomic_set(&counter, 0);
    threads.resumeThreads();
    threads.waitForThreads(); // First loop
+    gmx_atomic_set(&counter, 0);
    threads.resumeThreads();
    threads.waitForThreads(); // Second loop
@@ -141,8 +137,11 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
    // Calculate Born radii
-    for (int blockStart = start; blockStart < end; blockStart += 4) {
+    while (true) {
-        int numInBlock = min(4, end-blockStart);
+        int blockStart = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 4);
+        if (blockStart >= numParticles)
+            break;
+        int numInBlock = min(4, numParticles-blockStart);
        ivec4 blockAtomIndex(blockStart, blockStart+1, blockStart+2, blockStart+3);
        float atomRadius[4], atomx[4], atomy[4], atomz[4];
        int blockMask[4] = {0, 0, 0, 0};
@@ -213,7 +212,10 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
    vector<float>& bornForces = threadBornForces[threadIndex];
    for (int i = 0; i < numParticles; i++)
        bornForces[i] = 0.0f;
-    for (int atomI = start; atomI < end; atomI++) {
+    while (true) {
+        int atomI = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
+        if (atomI >= numParticles)
+            break;
        if (bornRadii[atomI] > 0) {
            float radiusI = particleParams[atomI].first + dielectricOffset;
            float r = radiusI + probeRadius;
@@ -235,8 +237,11 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
        preFactor = ONE_4PI_EPS0*((1.0f/solventDielectric) - (1.0f/soluteDielectric));
    else
        preFactor = 0.0f;
-    for (int blockStart = start; blockStart < end; blockStart += 4) {
+    while (true) {
-        int numInBlock = min(4, end-blockStart);
+        int blockStart = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 4);
+        if (blockStart >= numParticles)
+            break;
+        int numInBlock = min(4, numParticles-blockStart);
        ivec4 blockAtomIndex(blockStart, blockStart+1, blockStart+2, blockStart+3);
        float atomCharge[4], atomx[4], atomy[4], atomz[4];
        int blockMask[4] = {0, 0, 0, 0};
@@ -303,13 +308,16 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
    // Second loop of Born energy computation.
-    for (int blockStart = start; blockStart < end; blockStart += 4) {
+    while (true) {
+        int blockStart = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 4);
+        if (blockStart >= numParticles)
+            break;
        fvec4 bornForce(0.0f);
        for (int i = 0; i < numThreads; i++)
            bornForce += fvec4(&threadBornForces[i][blockStart]);
        fvec4 radii(&bornRadii[blockStart]);
        bornForce *= radii*radii*fvec4(&obcChain[blockStart]);
-        int numInBlock = min(4, end-blockStart);
+        int numInBlock = min(4, numParticles-blockStart);
        ivec4 blockAtomIndex(blockStart, blockStart+1, blockStart+2, blockStart+3);
        float atomRadius[4], atomx[4], atomy[4], atomz[4];
        int blockMask[4] = {0, 0, 0, 0};
@@ -385,21 +393,22 @@ void CpuGBSAOBCForce::getDeltaR(const fvec4& posI, const fvec4& x, const fvec4&
 fvec4 CpuGBSAOBCForce::fastLog(fvec4 x) {
    // Evaluate log(x) using a lookup table for speed.
-    float y[4];
    fvec4 x1 = (x-0.5f)*logDXInv;
    ivec4 index = floor(x1);
-    fvec4 coeff[4];
+    fvec4 coeff2 = x1-index;
-    coeff[1] = x1-index;
+    fvec4 coeff1 = 1.0f-coeff2;
-    coeff[0] = 1.0f-coeff[1];
+    float table1[4], table2[4];
-    coeff[2] = coeff[0]*coeff[0]*coeff[0]-coeff[0];
-    coeff[3] = coeff[1]*coeff[1]*coeff[1]-coeff[1];
-    transpose(coeff[0], coeff[1], coeff[2], coeff[3]);
-    static float maxdiff = 0.0f;
    for (int i = 0; i < 4; i++) {
-        if (index[i] >= 0 && index[i] < NUM_TABLE_POINTS)
+        int tableIndex = index[i];
-            y[i] = dot4(coeff[i], fvec4(&logTable[4*index[i]]));
+        if (tableIndex >= 0 && tableIndex < NUM_TABLE_POINTS) {
-        else
+            table1[i] = logTable[tableIndex];
-            y[i] = logf(x[i]);
+            table2[i] = logTable[tableIndex+1];
+        }
+        else {
+            // Outside the tabulated range [0.5, 1.0]: use the exact value for both
+            // endpoints so the interpolation below reduces to logf(x[i]).
+            table1[i] = table2[i] = logf(x[i]);
+        }
    }
-    return fvec4(y);
+    return coeff1*fvec4(table1) + coeff2*fvec4(table2);
 }
--- a/platforms/cpu/src/CpuNonbondedForce.cpp
+++ b/platforms/cpu/src/CpuNonbondedForce.cpp
@@ -29,8 +29,8 @@
 #include "CpuNonbondedForce.h"
 #include "ReferenceForce.h"
 #include "ReferencePME.h"
-#include "openmm/internal/SplineFitter.h"
 #include "openmm/internal/vectorize.h"
+#include "gmx_atomic.h"
 // In case we're using some primitive version of Visual Studio this will
 // make sure that erf() and erfc() are defined.
@@ -40,7 +40,7 @@ using namespace std;
 using namespace OpenMM;
 const float CpuNonbondedForce::TWO_OVER_SQRT_PI = (float) (2/sqrt(PI_M));
-const int CpuNonbondedForce::NUM_TABLE_POINTS = 1025;
+const int CpuNonbondedForce::NUM_TABLE_POINTS = 2048;
 class CpuNonbondedForce::ComputeDirectTask : public ThreadPool::Task {
 public:
@@ -58,10 +58,10 @@ public:
   --------------------------------------------------------------------------------------- */
-CpuNonbondedForce::CpuNonbondedForce() : cutoff(false), useSwitch(false), periodic(false), ewald(false), pme(false) {
+CpuNonbondedForce::CpuNonbondedForce() : cutoff(false), useSwitch(false), periodic(false), ewald(false), pme(false), tableIsValid(false) {
 }
-  /**---------------------------------------------------------------------------------------
+/**---------------------------------------------------------------------------------------
   Set the force to use a cutoff.
@@ -71,8 +71,9 @@ CpuNonbondedForce::CpuNonbondedForce() : cutoff(false), useSwitch(false), period
     --------------------------------------------------------------------------------------- */
-  void CpuNonbondedForce::setUseCutoff(float distance, const CpuNeighborList& neighbors, float solventDielectric) {
+void CpuNonbondedForce::setUseCutoff(float distance, const CpuNeighborList& neighbors, float solventDielectric) {
+    if (distance != cutoffDistance)
+        tableIsValid = false;
    cutoff = true;
    cutoffDistance = distance;
    neighborList = &neighbors;
@@ -127,6 +128,8 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
     --------------------------------------------------------------------------------------- */
  void CpuNonbondedForce::setUseEwald(float alpha, int kmaxx, int kmaxy, int kmaxz) {
+      if (alpha != alphaEwald)
+          tableIsValid = false;
      alphaEwald = alpha;
      numRx = kmaxx;
      numRy = kmaxy;
@@ -145,6 +148,8 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
     --------------------------------------------------------------------------------------- */
  void CpuNonbondedForce::setUsePME(float alpha, int meshSize[3]) {
+      if (alpha != alphaEwald)
+          tableIsValid = false;
      alphaEwald = alpha;
      meshDim[0] = meshSize[0];
      meshDim[1] = meshSize[1];
@@ -155,24 +160,16 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
 void CpuNonbondedForce::tabulateEwaldScaleFactor() {
+    if (tableIsValid)
+        return;
+    tableIsValid = true;
    ewaldDX = cutoffDistance/(NUM_TABLE_POINTS-2);
    ewaldDXInv = 1.0f/ewaldDX;
-    vector<double> x(NUM_TABLE_POINTS+1);
+    ewaldScaleTable.resize(NUM_TABLE_POINTS+1);
-    vector<double> y(NUM_TABLE_POINTS+1);
-    vector<double> deriv;
    for (int i = 0; i < NUM_TABLE_POINTS+1; i++) {
-        double r = i*cutoffDistance/(NUM_TABLE_POINTS-2);
+        double r = i*ewaldDX;
        double alphaR = alphaEwald*r;
-        x[i] = r;
+        ewaldScaleTable[i] = erfc(alphaR) + TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR);
-        y[i] = erfc(alphaR) + TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR);
-    }
-    SplineFitter::createNaturalSpline(x, y, deriv);
-    ewaldScaleTable.resize(4*NUM_TABLE_POINTS);
-    for (int i = 0; i < NUM_TABLE_POINTS; i++) {
-        ewaldScaleTable[4*i] = (float) y[i];
-        ewaldScaleTable[4*i+1] = (float) y[i+1];
-        ewaldScaleTable[4*i+2] = (float) (deriv[i]*ewaldDX*ewaldDX/6);
-        ewaldScaleTable[4*i+3] = (float) (deriv[i+1]*ewaldDX*ewaldDX/6);
    }
 }
@@ -302,6 +299,9 @@ void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const
    this->threadForce = &threadForce;
    includeEnergy = (totalEnergy != NULL);
    threadEnergy.resize(threads.getNumThreads());
+    gmx_atomic_t counter;
+    gmx_atomic_set(&counter, 0);
+    this->atomicCounter = &counter;
    // Signal the threads to start running and wait for them to finish.
@@ -332,8 +332,12 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
    if (ewald || pme) {
        // Compute the interactions from the neighbor list.
-        for (int i = threadIndex; i < neighborList->getNumBlocks(); i += numThreads)
+        while (true) {
-            calculateBlockEwaldIxn(i, forces, energyPtr, boxSize, invBoxSize);
+            int nextBlock = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
+            if (nextBlock >= neighborList->getNumBlocks())
+                break;
+            calculateBlockEwaldIxn(nextBlock, forces, energyPtr, boxSize, invBoxSize);
+        }
        // Now subtract off the exclusions, since they were implicitly included in the reciprocal space sum.
@@ -367,13 +371,20 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
    else if (cutoff) {
        // Compute the interactions from the neighbor list.
-        for (int i = threadIndex; i < neighborList->getNumBlocks(); i += numThreads)
+        while (true) {
-            calculateBlockIxn(i, forces, energyPtr, boxSize, invBoxSize);
+            int nextBlock = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
+            if (nextBlock >= neighborList->getNumBlocks())
+                break;
+            calculateBlockIxn(nextBlock, forces, energyPtr, boxSize, invBoxSize);
+        }
    }
    else {
        // Loop over all atom pairs
-        for (int i = threadIndex; i < numberOfAtoms; i += numThreads){
+        while (true) {
+            int i = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
+            if (i >= numberOfAtoms)
+                break;
            for (int j = i+1; j < numberOfAtoms; j++)
                if (exclusions[j].find(i) == exclusions[j].end())
                    calculateOneIxn(i, j, forces, energyPtr, boxSize, invBoxSize);
@@ -609,10 +620,10 @@ void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, do
        fvec4 sig2 = inverseR*sig;
        sig2 *= sig2;
        fvec4 sig6 = sig2*sig2*sig2;
-        fvec4 eps = blockAtomEpsilon*atomParameters[atom].second;
+        fvec4 epsSig6 = blockAtomEpsilon*atomParameters[atom].second*sig6;
-        dEdR += switchValue*eps*(12.0f*sig6 - 6.0f)*sig6;
+        dEdR += switchValue*epsSig6*(12.0f*sig6 - 6.0f);
        dEdR *= inverseR*inverseR;
-        fvec4 energy = eps*(sig6-1.0f)*sig6;
+        fvec4 energy = epsSig6*(sig6-1.0f);
        if (useSwitch) {
            dEdR -= energy*switchDeriv*inverseR;
            energy *= switchValue;
@@ -683,18 +694,23 @@ fvec4 CpuNonbondedForce::erfcApprox(fvec4 x) {
 fvec4 CpuNonbondedForce::ewaldScaleFunction(fvec4 x) {
    // Compute the tabulated Ewald scale factor: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
-    float y[4];
    fvec4 x1 = x*ewaldDXInv;
    ivec4 index = floor(x1);
-    fvec4 coeff[4];
+    fvec4 coeff2 = x1-index;
-    coeff[1] = x1-index;
+    fvec4 coeff1 = 1.0f-coeff2;
-    coeff[0] = 1.0f-coeff[1];
+    float table1[4], table2[4];
-    coeff[2] = coeff[0]*coeff[0]*coeff[0]-coeff[0];
-    coeff[3] = coeff[1]*coeff[1]*coeff[1]-coeff[1];
-    transpose(coeff[0], coeff[1], coeff[2], coeff[3]);
    for (int i = 0; i < 4; i++) {
-        if (index[i] < NUM_TABLE_POINTS)
+        int tableIndex = index[i];
-            y[i] = dot4(coeff[i], fvec4(&ewaldScaleTable[4*index[i]]));
+        if (tableIndex >= 0 && tableIndex < NUM_TABLE_POINTS) {
+            table1[i] = ewaldScaleTable[tableIndex];
+            table2[i] = ewaldScaleTable[tableIndex+1];
+        }
+        else {
+            // Outside the table: never index past ewaldScaleTable. The table spans
+            // 0..NUM_TABLE_POINTS*ewaldDX, which extends past cutoffDistance, so these lanes
+            // are presumably masked out by the caller (TODO confirm); use 0 for them.
+            table1[i] = table2[i] = 0.0f;
+        }
    }
-    return fvec4(y);
+    return coeff1*fvec4(table1) + coeff2*fvec4(table2);
 }
--- a/platforms/cpu/src/gmx_atomic.h
+++ b/platforms/cpu/src/gmx_atomic.h