Tabulated erfc() to improve accuracy of energies

15465af0 · peastman · 4bdbcf4d · 15465af0 · 15465af0 · 15465af0
Commit 15465af0 authored May 14, 2015 by peastman
6 changed files
--- a/platforms/cpu/include/CpuNonbondedForce.h
+++ b/platforms/cpu/include/CpuNonbondedForce.h

-/* Portions copyright (c) 2006-2013 Stanford University and Simbios.
+/* Portions copyright (c) 2006-2015 Stanford University and Simbios.
 * Contributors: Pande Group
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -174,8 +174,8 @@ protected:
        float alphaEwald;
        int numRx, numRy, numRz;
        int meshDim[3];
-        std::vector<float> ewaldScaleTable;
-        float ewaldDX, ewaldDXInv;
+        std::vector<float> erfcTable, ewaldScaleTable;
+        float ewaldDX, ewaldDXInv, erfcDXInv;
        std::vector<double> threadEnergy;
        // The following variables are used to make information accessible to the individual threads.
        int numberOfAtoms;
@@ -241,7 +241,7 @@ protected:
      /**
       * Compute a fast approximation to erfc(x).
       */
-      static float erfcApprox(float x);
+      float erfcApprox(float x);
 };

 } // namespace OpenMM

--- a/platforms/cpu/include/CpuNonbondedForceVec4.h
+++ b/platforms/cpu/include/CpuNonbondedForceVec4.h

-/* Portions copyright (c) 2006-2014 Stanford University and Simbios.
+/* Portions copyright (c) 2006-2015 Stanford University and Simbios.
 * Contributors: Pande Group
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -87,7 +87,7 @@ protected:
      /**
       * Compute a fast approximation to erfc(x).
       */
-      static fvec4 erfcApprox(const fvec4& x);
+      fvec4 erfcApprox(const fvec4& x);
      
      /**
       * Evaluate the scale factor used with Ewald and PME: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)

--- a/platforms/cpu/include/CpuNonbondedForceVec8.h
+++ b/platforms/cpu/include/CpuNonbondedForceVec8.h

-/* Portions copyright (c) 2006-2014 Stanford University and Simbios.
+/* Portions copyright (c) 2006-2015 Stanford University and Simbios.
 * Contributors: Pande Group
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -86,7 +86,7 @@ protected:
      /**
       * Compute a fast approximation to erfc(x).
       */
-      static fvec8 erfcApprox(const fvec8& x);
+      fvec8 erfcApprox(const fvec8& x);
      
      /**
       * Evaluate the scale factor used with Ewald and PME: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)

--- a/platforms/cpu/src/CpuNonbondedForce.cpp
+++ b/platforms/cpu/src/CpuNonbondedForce.cpp
@@ -171,17 +171,20 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
  }

  
-void CpuNonbondedForce::tabulateEwaldScaleFactor() {
+  void CpuNonbondedForce::tabulateEwaldScaleFactor() {
    if (tableIsValid)
        return;
    tableIsValid = true;
    ewaldDX = cutoffDistance/NUM_TABLE_POINTS;
    ewaldDXInv = 1.0f/ewaldDX;
+    erfcDXInv = 1.0f/(ewaldDX*alphaEwald);
+    erfcTable.resize(NUM_TABLE_POINTS+4);
    ewaldScaleTable.resize(NUM_TABLE_POINTS+4);
    for (int i = 0; i < NUM_TABLE_POINTS+4; i++) {
        double r = i*ewaldDX;
        double alphaR = alphaEwald*r;
-        ewaldScaleTable[i] = erfc(alphaR) + TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR);
+        erfcTable[i] = erfc(alphaR);
+        ewaldScaleTable[i] = erfcTable[i] + TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR);
    }
 }
  
@@ -364,15 +367,15 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
                    float inverseR = 1/r;
                    float chargeProd = ONE_4PI_EPS0*posq[4*i+3]*posq[4*j+3];
                    float alphaR = alphaEwald*r;
-                    float erfAlphaR = erf(alphaR);
-                    if (erfAlphaR > 1e-6f) {
+                    float erfcAlphaR = erfcApprox(alphaR);
+                    if (1-erfcAlphaR > 1e-6f) {
                        float dEdR = (float) (chargeProd * inverseR * inverseR * inverseR);
-                        dEdR = (float) (dEdR * (erfAlphaR-TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR)));
+                        dEdR = (float) (dEdR * (1.0f-erfcAlphaR-TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR)));
                        fvec4 result = deltaR*dEdR;
                        (fvec4(forces+4*i)-result).store(forces+4*i);
                        (fvec4(forces+4*j)+result).store(forces+4*j);
                        if (includeEnergy)
-                            threadEnergy[threadIndex] -= chargeProd*inverseR*erfAlphaR;
+                            threadEnergy[threadIndex] -= chargeProd*inverseR*(1.0f-erfcAlphaR);
                    }
                }
            }
@@ -473,14 +476,10 @@ void CpuNonbondedForce::getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& d
 }

 float CpuNonbondedForce::erfcApprox(float x) {
-    // This approximation for erfc is from Abramowitz and Stegun (1964) p. 299.  They cite the following as
-    // the original source: C. Hastings, Jr., Approximations for Digital Computers (1955).  It has a maximum
-    // error of 3e-7.
-
-    float t = 1.0f+(0.0705230784f+(0.0422820123f+(0.0092705272f+(0.0001520143f+(0.0002765672f+0.0000430638f*x)*x)*x)*x)*x)*x;
-    t *= t;
-    t *= t;
-    t *= t;
-    return 1.0f/(t*t);
+    float x1 = x*erfcDXInv;
+    int index = min((int) floor(x1), NUM_TABLE_POINTS);
+    float coeff2 = x1-index;
+    float coeff1 = 1.0f-coeff2;
+    return coeff1*erfcTable[index] + coeff2*erfcTable[index+1];
 }

--- a/platforms/cpu/src/CpuNonbondedForceVec4.cpp
+++ b/platforms/cpu/src/CpuNonbondedForceVec4.cpp

-/* Portions copyright (c) 2006-2014 Stanford University and Simbios.
+/* Portions copyright (c) 2006-2015 Stanford University and Simbios.
 * Contributors: Pande Group
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -393,15 +393,16 @@ void CpuNonbondedForceVec4::getDeltaR(const fvec4& posI, const fvec4& x, const f
 }

 fvec4 CpuNonbondedForceVec4::erfcApprox(const fvec4& x) {
-    // This approximation for erfc is from Abramowitz and Stegun (1964) p. 299.  They cite the following as
-    // the original source: C. Hastings, Jr., Approximations for Digital Computers (1955).  It has a maximum
-    // error of 3e-7.
-
-    fvec4 t = 1.0f+(0.0705230784f+(0.0422820123f+(0.0092705272f+(0.0001520143f+(0.0002765672f+0.0000430638f*x)*x)*x)*x)*x)*x;
-    t *= t;
-    t *= t;
-    t *= t;
-    return 1.0f/(t*t);
+    fvec4 x1 = x*erfcDXInv;
+    ivec4 index = min(floor(x1), NUM_TABLE_POINTS);
+    fvec4 coeff2 = x1-index;
+    fvec4 coeff1 = 1.0f-coeff2;
+    fvec4 t1(&erfcTable[index[0]]);
+    fvec4 t2(&erfcTable[index[1]]);
+    fvec4 t3(&erfcTable[index[2]]);
+    fvec4 t4(&erfcTable[index[3]]);
+    transpose(t1, t2, t3, t4);
+    return coeff1*t1 + coeff2*t2;
 }

 fvec4 CpuNonbondedForceVec4::ewaldScaleFunction(const fvec4& x) {

--- a/platforms/cpu/src/CpuNonbondedForceVec8.cpp
+++ b/platforms/cpu/src/CpuNonbondedForceVec8.cpp

-/* Portions copyright (c) 2006-2014 Stanford University and Simbios.
+/* Portions copyright (c) 2006-2015 Stanford University and Simbios.
 * Contributors: Pande Group
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -423,15 +423,23 @@ void CpuNonbondedForceVec8::getDeltaR(const fvec4& posI, const fvec8& x, const f
 }

 fvec8 CpuNonbondedForceVec8::erfcApprox(const fvec8& x) {
-    // This approximation for erfc is from Abramowitz and Stegun (1964) p. 299.  They cite the following as
-    // the original source: C. Hastings, Jr., Approximations for Digital Computers (1955).  It has a maximum
-    // error of 3e-7.
-
-    fvec8 t = 1.0f+(0.0705230784f+(0.0422820123f+(0.0092705272f+(0.0001520143f+(0.0002765672f+0.0000430638f*x)*x)*x)*x)*x)*x;
-    t *= t;
-    t *= t;
-    t *= t;
-    return 1.0f/(t*t);
+    fvec8 x1 = x*erfcDXInv;
+    ivec8 index = min(floor(x1), NUM_TABLE_POINTS);
+    fvec8 coeff2 = x1-index;
+    fvec8 coeff1 = 1.0f-coeff2;
+    ivec4 indexLower = index.lowerVec();
+    ivec4 indexUpper = index.upperVec();
+    fvec4 t1(&erfcTable[indexLower[0]]);
+    fvec4 t2(&erfcTable[indexLower[1]]);
+    fvec4 t3(&erfcTable[indexLower[2]]);
+    fvec4 t4(&erfcTable[indexLower[3]]);
+    fvec4 t5(&erfcTable[indexUpper[0]]);
+    fvec4 t6(&erfcTable[indexUpper[1]]);
+    fvec4 t7(&erfcTable[indexUpper[2]]);
+    fvec4 t8(&erfcTable[indexUpper[3]]);
+    fvec8 s1, s2, s3, s4;
+    transpose(t1, t2, t3, t4, t5, t6, t7, t8, s1, s2, s3, s4);
+    return coeff1*s1 + coeff2*s2;
 }

 fvec8 CpuNonbondedForceVec8::ewaldScaleFunction(const fvec8& x) {