Merge pull request #212 from peastman/master

Tabulated log(x) to improve GB performance

Merge pull request #212 from peastman/master
Tabulated log(x) to improve GB performance
fd2708b4 · peastman · 59893e57 · f39a5545 · fd2708b4 · fd2708b4
Commit fd2708b4 authored Nov 07, 2013 by peastman
Hide whitespace changes
Inline Side-by-side

Showing with 53 additions and 4 deletions

platforms/cpu/include/CpuGBSAOBCForce.h platforms/cpu/include/CpuGBSAOBCForce.h +9 -0

platforms/cpu/src/CpuGBSAOBCForce.cpp platforms/cpu/src/CpuGBSAOBCForce.cpp +44 -4

No files found.
--- a/platforms/cpu/include/CpuGBSAOBCForce.h
+++ b/platforms/cpu/include/CpuGBSAOBCForce.h
@@ -101,16 +101,25 @@ private:
    std::vector<std::vector<float> > threadBornForces;
    std::vector<float> obcChain;
    std::vector<double> threadEnergy;
+    std::vector<float> logTable;
+    float logDX, logDXInv;
    // The following variables are used to make information accessible to the individual threads.
    float const* posq;
    std::vector<std::vector<float> >* threadForce;
    bool includeEnergy;
+    static const int NUM_TABLE_POINTS;
    /**
     * Compute the displacement and squared distance between a collection of points, optionally using
     * periodic boundary conditions.
     */
    void getDeltaR(const fvec4& posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;
+    /**
+     * Evaluate log(x) using a lookup table for speed.
+     */
+    fvec4 fastLog(fvec4 x);
 };
 } // namespace OpenMM

--- a/platforms/cpu/src/CpuGBSAOBCForce.cpp
+++ b/platforms/cpu/src/CpuGBSAOBCForce.cpp
@@ -24,12 +24,15 @@
 #include "CpuGBSAOBCForce.h"
 #include "SimTKOpenMMRealType.h"
+#include "openmm/internal/SplineFitter.h"
 #include "openmm/internal/vectorize.h"
 #include <cmath>
 using namespace std;
 using namespace OpenMM;
+const int CpuGBSAOBCForce::NUM_TABLE_POINTS = 1025;
 class CpuGBSAOBCForce::ComputeTask : public ThreadPool::Task {
 public:
    ComputeTask(CpuGBSAOBCForce& owner) : owner(owner) {
@@ -41,6 +44,23 @@ public:
 };
 CpuGBSAOBCForce::CpuGBSAOBCForce() : cutoff(false), periodic(false) {
+    logDX = 0.5/NUM_TABLE_POINTS;
+    logDXInv = 1.0f/logDX;
+    vector<double> x(NUM_TABLE_POINTS+1);
+    vector<double> y(NUM_TABLE_POINTS+1);
+    vector<double> deriv;
+    for (int i = 0; i < NUM_TABLE_POINTS+1; i++) {
+        x[i] = 0.5+i*0.5/NUM_TABLE_POINTS;
+        y[i] = log(x[i]);
+    }
+    SplineFitter::createNaturalSpline(x, y, deriv);
+    logTable.resize(4*NUM_TABLE_POINTS);
+    for (int i = 0; i < NUM_TABLE_POINTS; i++) {
+        logTable[4*i] = (float) y[i];
+        logTable[4*i+1] = (float) y[i+1];
+        logTable[4*i+2] = (float) (deriv[i]*logDX*logDX/6);
+        logTable[4*i+3] = (float) (deriv[i+1]*logDX*logDX/6);
+    }
 }
 void CpuGBSAOBCForce::setUseCutoff(float distance) {
@@ -161,8 +181,7 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
            fvec4 u_ij2 = u_ij*u_ij;
            fvec4 rInverse = 1.0f/r;
            fvec4 r2Inverse = rInverse*rInverse;
-            fvec4 ratio = u_ij/l_ij;
+            fvec4 logRatio = fastLog(u_ij/l_ij);
-            fvec4 logRatio(logf(ratio[0]), logf(ratio[1]), logf(ratio[2]), logf(ratio[3]));
            fvec4 term = l_ij - u_ij + 0.25f*r*(u_ij2 - l_ij2) + (0.5f*rInverse*logRatio) + (0.25f*scaledRadiusJ*scaledRadiusJ*rInverse)*(l_ij2 - u_ij2);
            for (int j = 0; j < 4; j++) {
                if (include[j]) {
@@ -329,8 +348,7 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
            fvec4 u_ij2 = u_ij*u_ij;
            fvec4 rInverse = 1.0f/r;
            fvec4 r2Inverse = rInverse*rInverse;
-            fvec4 ratio = u_ij/l_ij;
+            fvec4 logRatio = fastLog(u_ij/l_ij);
-            fvec4 logRatio(logf(ratio[0]), logf(ratio[1]), logf(ratio[2]), logf(ratio[3]));
            fvec4 t3 = 0.125f*(1.0f + scaledRadiusJ2*r2Inverse)*(l_ij2 - u_ij2) + 0.25f*logRatio*r2Inverse;
            fvec4 de = bornForce*t3*rInverse;
            fvec4 result[4] = {dx*de, dy*de, dz*de, 0.0f};
@@ -363,3 +381,25 @@ void CpuGBSAOBCForce::getDeltaR(const fvec4& posI, const fvec4& x, const fvec4&
    }
    r2 = dx*dx + dy*dy + dz*dz;
 }
+fvec4 CpuGBSAOBCForce::fastLog(fvec4 x) {
+    // Evaluate log(x) using a lookup table for speed.
+    float y[4];
+    fvec4 x1 = (x-0.5f)*logDXInv;
+    ivec4 index = floor(x1);
+    fvec4 coeff[4];
+    coeff[1] = x1-index;
+    coeff[0] = 1.0f-coeff[1];
+    coeff[2] = coeff[0]*coeff[0]*coeff[0]-coeff[0];
+    coeff[3] = coeff[1]*coeff[1]*coeff[1]-coeff[1];
+    transpose(coeff[0], coeff[1], coeff[2], coeff[3]);
+    static float maxdiff = 0.0f;
+    for (int i = 0; i < 4; i++) {
+        if (index[i] >= 0 && index[i] < NUM_TABLE_POINTS)
+            y[i] = dot4(coeff[i], fvec4(&logTable[4*index[i]]));
+        else
+            y[i] = logf(x[i]);
+    }
+    return fvec4(y);
+}