Improvements to vectorization on ARM (#3555)

99a9fdc9 · Peter Eastman · GitHub · fd13a655 · 99a9fdc9
Commit 99a9fdc9 (unverified) · authored Apr 08, 2022 by Peter Eastman · committed by GitHub on Apr 08, 2022
Show whitespace changes
Inline Side-by-side

Showing 1 changed file with 6 additions and 25 deletions

openmmapi/include/openmm/internal/vectorize_neon.h (path unchanged): +6 -25

No files found.
--- a/openmmapi/include/openmm/internal/vectorize_neon.h
+++ b/openmmapi/include/openmm/internal/vectorize_neon.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2013-2014 Stanford University and the Authors.      *
+ * Portions copyright (c) 2013-2022 Stanford University and the Authors.      *
 * Authors: Mateus Lima, Peter Eastman                                        *
 * Contributors:                                                              *
 *                                                                            *
@@ -130,13 +130,7 @@ public:
        return vmulq_f32(val, other);
    }
    fvec4 operator/(fvec4 other) const {
-        // NEON does not have a divide float-point operator, so we get the reciprocal and multiply.
-
-        float32x4_t reciprocal = vrecpeq_f32(other);
-        reciprocal = vmulq_f32(vrecpsq_f32(other, reciprocal), reciprocal);
-        reciprocal = vmulq_f32(vrecpsq_f32(other, reciprocal), reciprocal);
-        fvec4 result = vmulq_f32(val,reciprocal);
-        return result;
+        return vdivq_f32(val, other);
    }
    void operator+=(fvec4 other) {
        val = vaddq_f32(val, other);
@@ -337,16 +331,11 @@ static inline float dot3(fvec4 v1, fvec4 v2) {
 }

 static inline float dot4(fvec4 v1, fvec4 v2) {
-    fvec4 result = v1*v2;
-    return vgetq_lane_f32(result, 0) + vgetq_lane_f32(result, 1) + vgetq_lane_f32(result, 2) + vgetq_lane_f32(result,3);
+    return vaddvq_f32(v1*v2);
 }

 static inline float reduceAdd(fvec4 v) {
-#ifdef __ARM64__
    return vaddvq_f32(v);
-#else
-    return dot4(v, fvec4(1.0f));
-#endif
 }

 static inline fvec4 cross(fvec4 v1, fvec4 v2) {
@@ -397,11 +386,7 @@ static inline ivec4 abs(ivec4 v) {
 }

 static inline bool any(ivec4 v) {
-#ifdef __ARM64__
    return (vmaxvq_u32(vreinterpretq_u32_s32(v)) != 0);
-#else
-    return (vgetq_lane_s32(v, 0) != 0 || vgetq_lane_s32(v, 1) != 0 || vgetq_lane_s32(v, 2) != 0 || vgetq_lane_s32(v, 3) != 0);
-#endif
 }

 // Mathematical operators involving a scalar and a vector.
@@ -439,19 +424,15 @@ static inline ivec4 blendZero(ivec4 v, ivec4 mask) {
 // These are at the end since they involve other functions defined above.

 static inline fvec4 round(fvec4 v) {
-    fvec4 shift(0x1.0p23f);
-    fvec4 absResult = (abs(v)+shift)-shift;
-    return blend(v, absResult, ivec4(0x7FFFFFFF));
+    return vrndnq_f32(v);
 }

 static inline fvec4 floor(fvec4 v) {
-    fvec4 rounded = round(v);
-    return rounded + blend(0.0f, -1.0f, rounded>v);
+    return vrndmq_f32(v);
 }

 static inline fvec4 ceil(fvec4 v) {
-    fvec4 rounded = round(v);
-    return rounded + blend(0.0f, 1.0f, rounded<v);
+    return vrndpq_f32(v);
 }

 /* Given a table of floating-point values and a set of indexes, perform a gather read into a pair