Commit 04f2d85a authored by peastman
Browse files

Further cleanup and optimization to NEON vector operations

parent 9cac3275
...@@ -61,23 +61,31 @@ public: ...@@ -61,23 +61,31 @@ public:
return val; return val;
} }
float operator[](int i) const { float operator[](int i) const {
float result[4]; switch (i) {
store(result); case 0:
return result[i]; return vgetq_lane_f32(val, 0);
case 1:
return vgetq_lane_f32(val, 1);
case 2:
return vgetq_lane_f32(val, 2);
case 3:
return vgetq_lane_f32(val, 3);
}
return 0.0f;
} }
void store(float* v) const { void store(float* v) const {
vst1q_f32(v, val); vst1q_f32(v, val);
} }
fvec4 operator+(const fvec4& other) const { // Tested OK fvec4 operator+(const fvec4& other) const {
return vaddq_f32(val, other); return vaddq_f32(val, other);
} }
fvec4 operator-(const fvec4& other) const { // Tested OK fvec4 operator-(const fvec4& other) const {
return vsubq_f32(val, other); return vsubq_f32(val, other);
} }
fvec4 operator*(const fvec4& other) const { // Tested OK fvec4 operator*(const fvec4& other) const {
return vmulq_f32(val, other); return vmulq_f32(val, other);
} }
fvec4 operator/(const fvec4& other) const { // Tested OK fvec4 operator/(const fvec4& other) const {
// NEON does not have a divide floating-point operator, so we get the reciprocal and multiply. // NEON does not have a divide floating-point operator, so we get the reciprocal and multiply.
float32x4_t reciprocal = vrecpeq_f32(other); float32x4_t reciprocal = vrecpeq_f32(other);
...@@ -96,7 +104,7 @@ public: ...@@ -96,7 +104,7 @@ public:
val = vmulq_f32(val, other); val = vmulq_f32(val, other);
} }
void operator/=(const fvec4& other) { void operator/=(const fvec4& other) {
val = val / other.val; val = *this/other;
} }
fvec4 operator-() const { fvec4 operator-() const {
return vnegq_f32(val); return vnegq_f32(val);
...@@ -105,7 +113,7 @@ public: ...@@ -105,7 +113,7 @@ public:
return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(val), vreinterpretq_u32_f32(other))); return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(val), vreinterpretq_u32_f32(other)));
} }
fvec4 operator|(const fvec4& other) const { fvec4 operator|(const fvec4& other) const {
return vcvtq_f32_s32(vreinterpretq_s32_u32(vorrq_u32(vcvtq_u32_f32(val), vcvtq_u32_f32(other)))); return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(val), vreinterpretq_u32_f32(other)));
} }
fvec4 operator==(const fvec4& other) const { fvec4 operator==(const fvec4& other) const {
return vcvtq_f32_s32(vreinterpretq_s32_u32(vceqq_f32(val, other))); return vcvtq_f32_s32(vreinterpretq_s32_u32(vceqq_f32(val, other)));
...@@ -148,9 +156,17 @@ public: ...@@ -148,9 +156,17 @@ public:
return val; return val;
} }
int operator[](int i) const { int operator[](int i) const {
int result[4]; switch (i) {
store(result); case 0:
return result[i]; return vgetq_lane_s32(val, 0);
case 1:
return vgetq_lane_s32(val, 1);
case 2:
return vgetq_lane_s32(val, 2);
case 3:
return vgetq_lane_s32(val, 3);
}
return 0;
} }
void store(int* v) const { void store(int* v) const {
vst1q_s32(v, val); vst1q_s32(v, val);
...@@ -176,29 +192,29 @@ public: ...@@ -176,29 +192,29 @@ public:
ivec4 operator-() const { ivec4 operator-() const {
return vnegq_s32(val); return vnegq_s32(val);
} }
ivec4 operator&(const ivec4& other) const { // Tested OK ivec4 operator&(const ivec4& other) const {
return ivec4(vandq_s32(val, other)); return vandq_s32(val, other);
} }
ivec4 operator|(const ivec4& other) const { ivec4 operator|(const ivec4& other) const {
return ivec4(vorrq_s32(val, other)); return vorrq_s32(val, other);
} }
ivec4 operator==(const ivec4& other) const { ivec4 operator==(const ivec4& other) const {
return ivec4(vreinterpretq_s32_u32(vceqq_s32(val, other))); return vreinterpretq_s32_u32(vceqq_s32(val, other));
} }
ivec4 operator!=(const ivec4& other) const { // OK ivec4 operator!=(const ivec4& other) const {
return ivec4(vreinterpretq_s32_u32(vmvnq_u32(vceqq_s32(val, other)))); // not(equal(val, other)) return vreinterpretq_s32_u32(vmvnq_u32(vceqq_s32(val, other))); // not(equal(val, other))
} }
ivec4 operator>(const ivec4& other) const { ivec4 operator>(const ivec4& other) const {
return ivec4(vreinterpretq_s32_u32(vcgtq_s32(val, other))); return vreinterpretq_s32_u32(vcgtq_s32(val, other));
} }
ivec4 operator<(const ivec4& other) const { ivec4 operator<(const ivec4& other) const {
return ivec4(vreinterpretq_s32_u32(vcltq_s32(val, other))); return vreinterpretq_s32_u32(vcltq_s32(val, other));
} }
ivec4 operator>=(const ivec4& other) const { ivec4 operator>=(const ivec4& other) const {
return ivec4(vreinterpretq_s32_u32(vcgeq_s32(val, other))); return vreinterpretq_s32_u32(vcgeq_s32(val, other));
} }
ivec4 operator<=(const ivec4& other) const { // OK ivec4 operator<=(const ivec4& other) const {
return ivec4(vreinterpretq_s32_u32(vcleq_s32(val, other))); return vreinterpretq_s32_u32(vcleq_s32(val, other));
} }
operator fvec4() const; operator fvec4() const;
}; };
...@@ -215,11 +231,11 @@ inline ivec4::operator fvec4() const { ...@@ -215,11 +231,11 @@ inline ivec4::operator fvec4() const {
// Functions that operate on fvec4s. // Functions that operate on fvec4s.
static inline fvec4 min(const fvec4& v1, const fvec4& v2) { // Tested OK static inline fvec4 min(const fvec4& v1, const fvec4& v2) {
return vminq_f32(v1, v2); return vminq_f32(v1, v2);
} }
static inline fvec4 max(const fvec4& v1, const fvec4& v2) { // Tested OK static inline fvec4 max(const fvec4& v1, const fvec4& v2) {
return vmaxq_f32(v1, v2); return vmaxq_f32(v1, v2);
} }
...@@ -234,57 +250,43 @@ static inline fvec4 sqrt(const fvec4& v) { ...@@ -234,57 +250,43 @@ static inline fvec4 sqrt(const fvec4& v) {
return vmulq_f32(v, recipSqrt); return vmulq_f32(v, recipSqrt);
} }
static inline float dot3(const fvec4& v1, const fvec4& v2) { // Tested: OK static inline float dot3(const fvec4& v1, const fvec4& v2) {
fvec4 result = v1 * v2; fvec4 result = v1*v2;
float aux[4]; return vgetq_lane_f32(result, 0) + vgetq_lane_f32(result, 1) + vgetq_lane_f32(result, 2);
vst1q_f32(aux, result);
return aux[0] + aux[1] + aux[2]; // Ignore w component
} }
static inline float dot4(const fvec4& v1, const fvec4& v2) { // Tested: OK static inline float dot4(const fvec4& v1, const fvec4& v2) {
fvec4 result = v1 * v2; fvec4 result = v1*v2;
float aux[4]; return vgetq_lane_f32(result, 0) + vgetq_lane_f32(result, 1) + vgetq_lane_f32(result, 2) + vgetq_lane_f32(result,3);
vst1q_f32(aux, result);
return aux[0] + aux[1] + aux[2] + aux[3];
} }
static inline void transpose(fvec4& v1, fvec4& v2, fvec4& v3, fvec4& v4) { // Tested: OK static inline void transpose(fvec4& v1, fvec4& v2, fvec4& v3, fvec4& v4) {
float aux1[4]; float32x4x2_t t1 = vuzpq_f32(v1, v3);
float aux2[4]; float32x4x2_t t2 = vuzpq_f32(v2, v4);
float aux3[4]; float32x4x2_t t3 = vtrnq_f32(t1.val[0], t2.val[0]);
float aux4[4]; float32x4x2_t t4 = vtrnq_f32(t1.val[1], t2.val[1]);
vst1q_f32(aux1, v1); v1 = t3.val[0];
vst1q_f32(aux2, v2); v2 = t4.val[0];
vst1q_f32(aux3, v3); v3 = t3.val[1];
vst1q_f32(aux4, v4); v4 = t4.val[1];
v1 = fvec4(aux1[0], aux2[0], aux3[0], aux4[0]);
v2 = fvec4(aux1[1], aux2[1], aux3[1], aux4[1]);
v3 = fvec4(aux1[2], aux2[2], aux3[2], aux4[2]);
v4 = fvec4(aux1[3], aux2[3], aux3[3], aux4[3]);
} }
// Functions that operate on ivec4s. // Functions that operate on ivec4s.
static inline ivec4 min(const ivec4& v1, const ivec4& v2) { // Tested: not tested static inline ivec4 min(const ivec4& v1, const ivec4& v2) {
ivec4 res = ivec4(vminq_s32(v1.val, v2.val)); return vminq_s32(v1, v2);
return res;
} }
static inline ivec4 max(const ivec4& v1, const ivec4& v2) { // Tested: not tested static inline ivec4 max(const ivec4& v1, const ivec4& v2) {
ivec4 res = ivec4(vmaxq_s32(v1.val, v2.val)); return vmaxq_s32(v1, v2);
return res;
} }
static inline ivec4 abs(const ivec4& v) { // Tested: Not tested static inline ivec4 abs(const ivec4& v) {
ivec4 res = ivec4(vabdq_s32(v.val, ivec4(0))); return vabdq_s32(v, ivec4(0));
return res;
} }
static inline bool any(const ivec4& v) { // Tested: OK static inline bool any(const ivec4& v) {
int result[4]; return (vgetq_lane_s32(v, 0) != 0 || vgetq_lane_s32(v, 1) != 0 || vgetq_lane_s32(v, 2) != 0 || vgetq_lane_s32(v, 3) != 0);
vst1q_s32(result, v);
return result[0] != 0 || result[1] != 0 || result[2] != 0 || result[3] != 0;
} }
// Mathematical operators involving a scalar and a vector. // Mathematical operators involving a scalar and a vector.
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment