Unverified Commit 99a9fdc9 authored by Peter Eastman, committed by GitHub
Browse files

Improvements to vectorization on ARM (#3555)

parent fd13a655
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2013-2014 Stanford University and the Authors. * * Portions copyright (c) 2013-2022 Stanford University and the Authors. *
* Authors: Mateus Lima, Peter Eastman * * Authors: Mateus Lima, Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -130,13 +130,7 @@ public: ...@@ -130,13 +130,7 @@ public:
return vmulq_f32(val, other); return vmulq_f32(val, other);
} }
fvec4 operator/(fvec4 other) const { fvec4 operator/(fvec4 other) const {
// NEON does not have a divide float-point operator, so we get the reciprocal and multiply. return vdivq_f32(val, other);
float32x4_t reciprocal = vrecpeq_f32(other);
reciprocal = vmulq_f32(vrecpsq_f32(other, reciprocal), reciprocal);
reciprocal = vmulq_f32(vrecpsq_f32(other, reciprocal), reciprocal);
fvec4 result = vmulq_f32(val,reciprocal);
return result;
} }
void operator+=(fvec4 other) { void operator+=(fvec4 other) {
val = vaddq_f32(val, other); val = vaddq_f32(val, other);
...@@ -337,16 +331,11 @@ static inline float dot3(fvec4 v1, fvec4 v2) { ...@@ -337,16 +331,11 @@ static inline float dot3(fvec4 v1, fvec4 v2) {
} }
static inline float dot4(fvec4 v1, fvec4 v2) { static inline float dot4(fvec4 v1, fvec4 v2) {
fvec4 result = v1*v2; return vaddvq_f32(v1*v2);
return vgetq_lane_f32(result, 0) + vgetq_lane_f32(result, 1) + vgetq_lane_f32(result, 2) + vgetq_lane_f32(result,3);
} }
static inline float reduceAdd(fvec4 v) { static inline float reduceAdd(fvec4 v) {
#ifdef __ARM64__
return vaddvq_f32(v); return vaddvq_f32(v);
#else
return dot4(v, fvec4(1.0f));
#endif
} }
static inline fvec4 cross(fvec4 v1, fvec4 v2) { static inline fvec4 cross(fvec4 v1, fvec4 v2) {
...@@ -397,11 +386,7 @@ static inline ivec4 abs(ivec4 v) { ...@@ -397,11 +386,7 @@ static inline ivec4 abs(ivec4 v) {
} }
static inline bool any(ivec4 v) { static inline bool any(ivec4 v) {
#ifdef __ARM64__
return (vmaxvq_u32(vreinterpretq_u32_s32(v)) != 0); return (vmaxvq_u32(vreinterpretq_u32_s32(v)) != 0);
#else
return (vgetq_lane_s32(v, 0) != 0 || vgetq_lane_s32(v, 1) != 0 || vgetq_lane_s32(v, 2) != 0 || vgetq_lane_s32(v, 3) != 0);
#endif
} }
// Mathematical operators involving a scalar and a vector. // Mathematical operators involving a scalar and a vector.
...@@ -439,19 +424,15 @@ static inline ivec4 blendZero(ivec4 v, ivec4 mask) { ...@@ -439,19 +424,15 @@ static inline ivec4 blendZero(ivec4 v, ivec4 mask) {
// These are at the end since they involve other functions defined above. // These are at the end since they involve other functions defined above.
static inline fvec4 round(fvec4 v) { static inline fvec4 round(fvec4 v) {
fvec4 shift(0x1.0p23f); return vrndnq_f32(v);
fvec4 absResult = (abs(v)+shift)-shift;
return blend(v, absResult, ivec4(0x7FFFFFFF));
} }
static inline fvec4 floor(fvec4 v) { static inline fvec4 floor(fvec4 v) {
fvec4 rounded = round(v); return vrndmq_f32(v);
return rounded + blend(0.0f, -1.0f, rounded>v);
} }
static inline fvec4 ceil(fvec4 v) { static inline fvec4 ceil(fvec4 v) {
fvec4 rounded = round(v); return vrndpq_f32(v);
return rounded + blend(0.0f, 1.0f, rounded<v);
} }
/* Given a table of floating-point values and a set of indexes, perform a gather read into a pair /* Given a table of floating-point values and a set of indexes, perform a gather read into a pair
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment