Optimizations to CpuNonbondedForce

3a415486 · peastman · 79b85ad7 · 3a415486 · 3a415486 · 3a415486
Commit 3a415486 authored Apr 10, 2015 by peastman
6 changed files
--- a/openmmapi/include/openmm/internal/vectorize8.h
+++ b/openmmapi/include/openmm/internal/vectorize8.h
@@ -191,6 +191,18 @@ static inline fvec8 sqrt(const fvec8& v) {
    return fvec8(_mm256_sqrt_ps(v.val));
 }

+static inline fvec8 rsqrt(const fvec8& v) {
+    // Initial estimate of rsqrt().
+
+    fvec8 y(_mm256_rsqrt_ps(v.val));
+
+    // Perform an iteration of Newton refinement.
+
+    fvec8 x2 = v*0.5f;
+    y *= fvec8(1.5f)-x2*y*y;
+    return y;
+}
+
 static inline float dot8(const fvec8& v1, const fvec8& v2) {
    fvec8 result = _mm256_dp_ps(v1, v2, 0xF1);
    return _mm_cvtss_f32(result.lowerVec())+_mm_cvtss_f32(result.upperVec());

--- a/openmmapi/include/openmm/internal/vectorize_neon.h
+++ b/openmmapi/include/openmm/internal/vectorize_neon.h
@@ -251,11 +251,15 @@ static inline fvec4 abs(const fvec4& v) {
    return vabsq_f32(v);
 }

-static inline fvec4 sqrt(const fvec4& v) {
+static inline fvec4 rsqrt(const fvec4& v) {
    float32x4_t recipSqrt = vrsqrteq_f32(v);
    recipSqrt = vmulq_f32(recipSqrt, vrsqrtsq_f32(vmulq_f32(recipSqrt, v), recipSqrt));
    recipSqrt = vmulq_f32(recipSqrt, vrsqrtsq_f32(vmulq_f32(recipSqrt, v), recipSqrt));
-    return vmulq_f32(v, recipSqrt);
+    return recipSqrt;
+}
+
+static inline fvec4 sqrt(const fvec4& v) {
+    return rsqrt(v)*v;
 }

 static inline float dot3(const fvec4& v1, const fvec4& v2) {

--- a/openmmapi/include/openmm/internal/vectorize_pnacl.h
+++ b/openmmapi/include/openmm/internal/vectorize_pnacl.h
@@ -330,7 +330,7 @@ static inline fvec4 ceil(const fvec4& v) {
    return truncated + blend(0.0f, 1.0f, truncated<v);
 }

-static inline fvec4 sqrt(const fvec4& v) {
+static inline fvec4 rsqrt(const fvec4& v) {
    // Initial estimate of rsqrt().

    ivec4 i = (__m128i) v;
@@ -343,7 +343,11 @@ static inline fvec4 sqrt(const fvec4& v) {
    y *= 1.5f-x2*y*y;
    y *= 1.5f-x2*y*y;
    y *= 1.5f-x2*y*y;
-    return y*v;
+    return y;
+}
+
+static inline fvec4 sqrt(const fvec4& v) {
+    return rsqrt(v)*v;
 }

 #endif /*OPENMM_VECTORIZE_PNACL_H_*/

--- a/openmmapi/include/openmm/internal/vectorize_sse.h
+++ b/openmmapi/include/openmm/internal/vectorize_sse.h
@@ -241,6 +241,18 @@ static inline fvec4 sqrt(const fvec4& v) {
    return fvec4(_mm_sqrt_ps(v.val));
 }

+static inline fvec4 rsqrt(const fvec4& v) {
+    // Initial estimate of rsqrt().
+
+    fvec4 y(_mm_rsqrt_ps(v.val));
+
+    // Perform an iteration of Newton refinement.
+
+    fvec4 x2 = v*0.5f;
+    y *= fvec4(1.5f)-x2*y*y;
+    return y;
+}
+
 static inline float dot3(const fvec4& v1, const fvec4& v2) {
    return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0x71));
 }

--- a/platforms/cpu/src/CpuNonbondedForceVec4.cpp
+++ b/platforms/cpu/src/CpuNonbondedForceVec4.cpp
@@ -95,8 +95,7 @@ void CpuNonbondedForceVec4::calculateBlockIxnImpl(int blockIndex, float* forces,
        
        // Compute the interactions.
        
-        fvec4 r = sqrt(r2);
-        fvec4 inverseR = fvec4(1.0f)/r;
+        fvec4 inverseR = rsqrt(r2);
        fvec4 energy, dEdR;
        float atomEpsilon = atomParameters[atom].second;
        if (atomEpsilon != 0.0f) {
@@ -108,6 +107,7 @@ void CpuNonbondedForceVec4::calculateBlockIxnImpl(int blockIndex, float* forces,
            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
            energy = epsSig6*(sig6-1.0f);
            if (useSwitch) {
+                fvec4 r = r2*inverseR;
                fvec4 t = blend(0.0f, (r-switchingDistance)*invSwitchingInterval, r>switchingDistance);
                fvec4 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
                fvec4 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
@@ -212,8 +212,8 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
        
        // Compute the interactions.
        
-        fvec4 r = sqrt(r2);
-        fvec4 inverseR = fvec4(1.0f)/r;
+        fvec4 inverseR = rsqrt(r2);
+        fvec4 r = r2*inverseR;
        fvec4 energy, dEdR;
        float atomEpsilon = atomParameters[atom].second;
        if (atomEpsilon != 0.0f) {

--- a/platforms/cpu/src/CpuNonbondedForceVec8.cpp
+++ b/platforms/cpu/src/CpuNonbondedForceVec8.cpp
@@ -126,8 +126,7 @@ void CpuNonbondedForceVec8::calculateBlockIxnImpl(int blockIndex, float* forces,
        
        // Compute the interactions.
        
-        fvec8 r = sqrt(r2);
-        fvec8 inverseR = fvec8(1.0f)/r;
+        fvec8 inverseR = rsqrt(r2);
        fvec8 energy, dEdR;
        float atomEpsilon = atomParameters[atom].second;
        if (atomEpsilon != 0.0f) {
@@ -139,6 +138,7 @@ void CpuNonbondedForceVec8::calculateBlockIxnImpl(int blockIndex, float* forces,
            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
            energy = epsSig6*(sig6-1.0f);
            if (useSwitch) {
+                fvec8 r = r2*inverseR;
                fvec8 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval);
                fvec8 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
                fvec8 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
@@ -242,8 +242,8 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
        
        // Compute the interactions.
        
-        fvec8 r = sqrt(r2);
-        fvec8 inverseR = fvec8(1.0f)/r;
+        fvec8 inverseR = rsqrt(r2);
+        fvec8 r = r2*inverseR;
        fvec8 energy, dEdR;
        float atomEpsilon = atomParameters[atom].second;
        if (atomEpsilon != 0.0f) {