Commit f83289a4 authored by Marc Marí
Browse files

Pass by value in vectorization API

parent 828ae646
...@@ -75,55 +75,55 @@ public: ...@@ -75,55 +75,55 @@ public:
void store(float* v) const { void store(float* v) const {
_mm256_storeu_ps(v, val); _mm256_storeu_ps(v, val);
} }
fvec8 operator+(const fvec8& other) const { fvec8 operator+(fvec8 other) const {
return _mm256_add_ps(val, other); return _mm256_add_ps(val, other);
} }
fvec8 operator-(const fvec8& other) const { fvec8 operator-(fvec8 other) const {
return _mm256_sub_ps(val, other); return _mm256_sub_ps(val, other);
} }
fvec8 operator*(const fvec8& other) const { fvec8 operator*(fvec8 other) const {
return _mm256_mul_ps(val, other); return _mm256_mul_ps(val, other);
} }
fvec8 operator/(const fvec8& other) const { fvec8 operator/(fvec8 other) const {
return _mm256_div_ps(val, other); return _mm256_div_ps(val, other);
} }
void operator+=(const fvec8& other) { void operator+=(fvec8 other) {
val = _mm256_add_ps(val, other); val = _mm256_add_ps(val, other);
} }
void operator-=(const fvec8& other) { void operator-=(fvec8 other) {
val = _mm256_sub_ps(val, other); val = _mm256_sub_ps(val, other);
} }
void operator*=(const fvec8& other) { void operator*=(fvec8 other) {
val = _mm256_mul_ps(val, other); val = _mm256_mul_ps(val, other);
} }
void operator/=(const fvec8& other) { void operator/=(fvec8 other) {
val = _mm256_div_ps(val, other); val = _mm256_div_ps(val, other);
} }
fvec8 operator-() const { fvec8 operator-() const {
return _mm256_sub_ps(_mm256_set1_ps(0.0f), val); return _mm256_sub_ps(_mm256_set1_ps(0.0f), val);
} }
fvec8 operator&(const fvec8& other) const { fvec8 operator&(fvec8 other) const {
return _mm256_and_ps(val, other); return _mm256_and_ps(val, other);
} }
fvec8 operator|(const fvec8& other) const { fvec8 operator|(fvec8 other) const {
return _mm256_or_ps(val, other); return _mm256_or_ps(val, other);
} }
fvec8 operator==(const fvec8& other) const { fvec8 operator==(fvec8 other) const {
return _mm256_cmp_ps(val, other, _CMP_EQ_OQ); return _mm256_cmp_ps(val, other, _CMP_EQ_OQ);
} }
fvec8 operator!=(const fvec8& other) const { fvec8 operator!=(fvec8 other) const {
return _mm256_cmp_ps(val, other, _CMP_NEQ_OQ); return _mm256_cmp_ps(val, other, _CMP_NEQ_OQ);
} }
fvec8 operator>(const fvec8& other) const { fvec8 operator>(fvec8 other) const {
return _mm256_cmp_ps(val, other, _CMP_GT_OQ); return _mm256_cmp_ps(val, other, _CMP_GT_OQ);
} }
fvec8 operator<(const fvec8& other) const { fvec8 operator<(fvec8 other) const {
return _mm256_cmp_ps(val, other, _CMP_LT_OQ); return _mm256_cmp_ps(val, other, _CMP_LT_OQ);
} }
fvec8 operator>=(const fvec8& other) const { fvec8 operator>=(fvec8 other) const {
return _mm256_cmp_ps(val, other, _CMP_GE_OQ); return _mm256_cmp_ps(val, other, _CMP_GE_OQ);
} }
fvec8 operator<=(const fvec8& other) const { fvec8 operator<=(fvec8 other) const {
return _mm256_cmp_ps(val, other, _CMP_LE_OQ); return _mm256_cmp_ps(val, other, _CMP_LE_OQ);
} }
operator ivec8() const; operator ivec8() const;
...@@ -159,10 +159,10 @@ public: ...@@ -159,10 +159,10 @@ public:
void store(int* v) const { void store(int* v) const {
_mm256_storeu_si256((__m256i*) v, val); _mm256_storeu_si256((__m256i*) v, val);
} }
ivec8 operator&(const ivec8& other) const { ivec8 operator&(ivec8 other) const {
return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(val), _mm256_castsi256_ps(other.val))); return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(val), _mm256_castsi256_ps(other.val)));
} }
ivec8 operator|(const ivec8& other) const { ivec8 operator|(ivec8 other) const {
return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(val), _mm256_castsi256_ps(other.val))); return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(val), _mm256_castsi256_ps(other.val)));
} }
operator fvec8() const; operator fvec8() const;
...@@ -193,36 +193,36 @@ inline fvec8 fvec8::expandBitsToMask(int bitmask) { ...@@ -193,36 +193,36 @@ inline fvec8 fvec8::expandBitsToMask(int bitmask) {
// Functions that operate on fvec8s. // Functions that operate on fvec8s.
static inline fvec8 floor(const fvec8& v) { static inline fvec8 floor(fvec8 v) {
return fvec8(_mm256_round_ps(v.val, 0x09)); return fvec8(_mm256_round_ps(v.val, 0x09));
} }
static inline fvec8 ceil(const fvec8& v) { static inline fvec8 ceil(fvec8 v) {
return fvec8(_mm256_round_ps(v.val, 0x0A)); return fvec8(_mm256_round_ps(v.val, 0x0A));
} }
static inline fvec8 round(const fvec8& v) { static inline fvec8 round(fvec8 v) {
return fvec8(_mm256_round_ps(v.val, _MM_FROUND_TO_NEAREST_INT)); return fvec8(_mm256_round_ps(v.val, _MM_FROUND_TO_NEAREST_INT));
} }
static inline fvec8 min(const fvec8& v1, const fvec8& v2) { static inline fvec8 min(fvec8 v1, fvec8 v2) {
return fvec8(_mm256_min_ps(v1.val, v2.val)); return fvec8(_mm256_min_ps(v1.val, v2.val));
} }
static inline fvec8 max(const fvec8& v1, const fvec8& v2) { static inline fvec8 max(fvec8 v1, fvec8 v2) {
return fvec8(_mm256_max_ps(v1.val, v2.val)); return fvec8(_mm256_max_ps(v1.val, v2.val));
} }
static inline fvec8 abs(const fvec8& v) { static inline fvec8 abs(fvec8 v) {
static const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); static const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
return fvec8(_mm256_and_ps(v.val, mask)); return fvec8(_mm256_and_ps(v.val, mask));
} }
static inline fvec8 sqrt(const fvec8& v) { static inline fvec8 sqrt(fvec8 v) {
return fvec8(_mm256_sqrt_ps(v.val)); return fvec8(_mm256_sqrt_ps(v.val));
} }
static inline fvec8 rsqrt(const fvec8& v) { static inline fvec8 rsqrt(fvec8 v) {
// Initial estimate of rsqrt(). // Initial estimate of rsqrt().
fvec8 y(_mm256_rsqrt_ps(v.val)); fvec8 y(_mm256_rsqrt_ps(v.val));
...@@ -234,17 +234,17 @@ static inline fvec8 rsqrt(const fvec8& v) { ...@@ -234,17 +234,17 @@ static inline fvec8 rsqrt(const fvec8& v) {
return y; return y;
} }
static inline float dot8(const fvec8& v1, const fvec8& v2) { static inline float dot8(fvec8 v1, fvec8 v2) {
fvec8 result = _mm256_dp_ps(v1, v2, 0xF1); fvec8 result = _mm256_dp_ps(v1, v2, 0xF1);
return _mm_cvtss_f32(result.lowerVec())+_mm_cvtss_f32(result.upperVec()); return _mm_cvtss_f32(result.lowerVec())+_mm_cvtss_f32(result.upperVec());
} }
static inline float reduceAdd(const fvec8 v) { static inline float reduceAdd(fvec8 v) {
// :TODO: There are more efficient ways to do this. // :TODO: There are more efficient ways to do this.
return dot8(v, fvec8(1.0f)); return dot8(v, fvec8(1.0f));
} }
static inline void transpose(const fvec4& in1, const fvec4& in2, const fvec4& in3, const fvec4& in4, const fvec4& in5, const fvec4& in6, const fvec4& in7, const fvec4& in8, fvec8& out1, fvec8& out2, fvec8& out3, fvec8& out4) { static inline void transpose(fvec4 in1, fvec4 in2, fvec4 in3, fvec4 in4, fvec4 in5, fvec4 in6, fvec4 in7, fvec4 in8, fvec8& out1, fvec8& out2, fvec8& out3, fvec8& out4) {
fvec4 i1 = in1, i2 = in2, i3 = in3, i4 = in4; fvec4 i1 = in1, i2 = in2, i3 = in3, i4 = in4;
fvec4 i5 = in5, i6 = in6, i7 = in7, i8 = in8; fvec4 i5 = in5, i6 = in6, i7 = in7, i8 = in8;
_MM_TRANSPOSE4_PS(i1, i2, i3, i4); _MM_TRANSPOSE4_PS(i1, i2, i3, i4);
...@@ -275,7 +275,7 @@ static inline void transpose(const fvec4 in[8], fvec8& out1, fvec8& out2, fvec8& ...@@ -275,7 +275,7 @@ static inline void transpose(const fvec4 in[8], fvec8& out1, fvec8& out2, fvec8&
transpose(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], out1, out2, out3, out4); transpose(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], out1, out2, out3, out4);
} }
static inline void transpose(const fvec8& in1, const fvec8& in2, const fvec8& in3, const fvec8& in4, fvec4& out1, fvec4& out2, fvec4& out3, fvec4& out4, fvec4& out5, fvec4& out6, fvec4& out7, fvec4& out8) { static inline void transpose(fvec8 in1, fvec8 in2, fvec8 in3, fvec8 in4, fvec4& out1, fvec4& out2, fvec4& out3, fvec4& out4, fvec4& out5, fvec4& out6, fvec4& out7, fvec4& out8) {
out1 = in1.lowerVec(); out1 = in1.lowerVec();
out2 = in2.lowerVec(); out2 = in2.lowerVec();
out3 = in3.lowerVec(); out3 = in3.lowerVec();
...@@ -291,40 +291,40 @@ static inline void transpose(const fvec8& in1, const fvec8& in2, const fvec8& in ...@@ -291,40 +291,40 @@ static inline void transpose(const fvec8& in1, const fvec8& in2, const fvec8& in
/** /**
* Given 4 input vectors of 8 elements, transpose them to form 8 output vectors of 4 elements. * Given 4 input vectors of 8 elements, transpose them to form 8 output vectors of 4 elements.
*/ */
static inline void transpose(const fvec8& in1, const fvec8& in2, const fvec8& in3, const fvec8& in4, fvec4 out[8]) { static inline void transpose(fvec8 in1, fvec8 in2, fvec8 in3, fvec8 in4, fvec4 out[8]) {
transpose(in1, in2, in3, in4, out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7]); transpose(in1, in2, in3, in4, out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7]);
} }
// Functions that operate on ivec8s. // Functions that operate on ivec8s.
static inline bool any(const ivec8& v) { static inline bool any(ivec8 v) {
return !_mm256_testz_si256(v, _mm256_set1_epi32(0xFFFFFFFF)); return !_mm256_testz_si256(v, _mm256_set1_epi32(0xFFFFFFFF));
} }
// Mathematical operators involving a scalar and a vector. // Mathematical operators involving a scalar and a vector.
static inline fvec8 operator+(float v1, const fvec8& v2) { static inline fvec8 operator+(float v1, fvec8 v2) {
return fvec8(v1)+v2; return fvec8(v1)+v2;
} }
static inline fvec8 operator-(float v1, const fvec8& v2) { static inline fvec8 operator-(float v1, fvec8 v2) {
return fvec8(v1)-v2; return fvec8(v1)-v2;
} }
static inline fvec8 operator*(float v1, const fvec8& v2) { static inline fvec8 operator*(float v1, fvec8 v2) {
return fvec8(v1)*v2; return fvec8(v1)*v2;
} }
static inline fvec8 operator/(float v1, const fvec8& v2) { static inline fvec8 operator/(float v1, fvec8 v2) {
return fvec8(v1)/v2; return fvec8(v1)/v2;
} }
// Operation for blending fvec8 from a full bitmask. // Operation for blending fvec8 from a full bitmask.
static inline fvec8 blend(const fvec8& v1, const fvec8& v2, const fvec8& mask) { static inline fvec8 blend(fvec8 v1, fvec8 v2, fvec8 mask) {
return fvec8(_mm256_blendv_ps(v1.val, v2.val, mask.val)); return fvec8(_mm256_blendv_ps(v1.val, v2.val, mask.val));
} }
static inline fvec8 blendZero(const fvec8 v, const fvec8 mask) { static inline fvec8 blendZero(fvec8 v, fvec8 mask) {
return blend(0.0f, v, mask); return blend(0.0f, v, mask);
} }
...@@ -333,7 +333,7 @@ static inline fvec8 blendZero(const fvec8 v, const fvec8 mask) { ...@@ -333,7 +333,7 @@ static inline fvec8 blendZero(const fvec8 v, const fvec8 mask) {
* of vectors. The first result vector contains the values at the given indexes, and the second * of vectors. The first result vector contains the values at the given indexes, and the second
* result vector contains the values from each respective index+1. * result vector contains the values from each respective index+1.
*/ */
static inline void gatherVecPair(const float* table, const ivec8 index, fvec8& out0, fvec8& out1) { static inline void gatherVecPair(const float* table, ivec8 index, fvec8& out0, fvec8& out1) {
const auto lower = index.lowerVec(); const auto lower = index.lowerVec();
const auto upper = index.upperVec(); const auto upper = index.upperVec();
...@@ -368,7 +368,7 @@ static inline void gatherVecPair(const float* table, const ivec8 index, fvec8& o ...@@ -368,7 +368,7 @@ static inline void gatherVecPair(const float* table, const ivec8 index, fvec8& o
* output[2] = (Z0 + Z1 + Z2 + ...) * output[2] = (Z0 + Z1 + Z2 + ...)
* output[3] = undefined * output[3] = undefined
*/ */
static inline fvec4 reduceToVec3(const fvec8 x, const fvec8 y, const fvec8 z) { static inline fvec4 reduceToVec3(fvec8 x, fvec8 y, fvec8 z) {
// The general strategy for a vector reduce-add operation is to take values from // The general strategy for a vector reduce-add operation is to take values from
// different parts of the vector and overlap them onto a different part of the vector and then // different parts of the vector and overlap them onto a different part of the vector and then
// add together. Repeat this several times until all values have been summed. Initially 8 // add together. Repeat this several times until all values have been summed. Initially 8
......
...@@ -118,16 +118,16 @@ public: ...@@ -118,16 +118,16 @@ public:
v[2] = vgetq_lane_f32(val, 2); v[2] = vgetq_lane_f32(val, 2);
} }
fvec4 operator+(const fvec4& other) const { fvec4 operator+(fvec4 other) const {
return vaddq_f32(val, other); return vaddq_f32(val, other);
} }
fvec4 operator-(const fvec4& other) const { fvec4 operator-(fvec4 other) const {
return vsubq_f32(val, other); return vsubq_f32(val, other);
} }
fvec4 operator*(const fvec4& other) const { fvec4 operator*(fvec4 other) const {
return vmulq_f32(val, other); return vmulq_f32(val, other);
} }
fvec4 operator/(const fvec4& other) const { fvec4 operator/(fvec4 other) const {
// NEON does not have a floating-point divide operator, so we get the reciprocal and multiply. // NEON does not have a floating-point divide operator, so we get the reciprocal and multiply.
float32x4_t reciprocal = vrecpeq_f32(other); float32x4_t reciprocal = vrecpeq_f32(other);
...@@ -136,43 +136,43 @@ public: ...@@ -136,43 +136,43 @@ public:
fvec4 result = vmulq_f32(val,reciprocal); fvec4 result = vmulq_f32(val,reciprocal);
return result; return result;
} }
void operator+=(const fvec4& other) { void operator+=(fvec4 other) {
val = vaddq_f32(val, other); val = vaddq_f32(val, other);
} }
void operator-=(const fvec4& other) { void operator-=(fvec4 other) {
val = vsubq_f32(val, other); val = vsubq_f32(val, other);
} }
void operator*=(const fvec4& other) { void operator*=(fvec4 other) {
val = vmulq_f32(val, other); val = vmulq_f32(val, other);
} }
void operator/=(const fvec4& other) { void operator/=(fvec4 other) {
val = *this/other; val = *this/other;
} }
fvec4 operator-() const { fvec4 operator-() const {
return vnegq_f32(val); return vnegq_f32(val);
} }
fvec4 operator&(const fvec4& other) const { fvec4 operator&(fvec4 other) const {
return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(val), vreinterpretq_u32_f32(other))); return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(val), vreinterpretq_u32_f32(other)));
} }
fvec4 operator|(const fvec4& other) const { fvec4 operator|(fvec4 other) const {
return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(val), vreinterpretq_u32_f32(other))); return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(val), vreinterpretq_u32_f32(other)));
} }
fvec4 operator==(const fvec4& other) const { fvec4 operator==(fvec4 other) const {
return vcvtq_f32_s32(vreinterpretq_s32_u32(vceqq_f32(val, other))); return vcvtq_f32_s32(vreinterpretq_s32_u32(vceqq_f32(val, other)));
} }
fvec4 operator!=(const fvec4& other) const { fvec4 operator!=(fvec4 other) const {
return vcvtq_f32_s32(vreinterpretq_s32_u32(vmvnq_u32(vceqq_f32(val, other)))); // not(equals(val, other)) return vcvtq_f32_s32(vreinterpretq_s32_u32(vmvnq_u32(vceqq_f32(val, other)))); // not(equals(val, other))
} }
fvec4 operator>(const fvec4& other) const { fvec4 operator>(fvec4 other) const {
return vcvtq_f32_s32(vreinterpretq_s32_u32(vcgtq_f32(val, other))); return vcvtq_f32_s32(vreinterpretq_s32_u32(vcgtq_f32(val, other)));
} }
fvec4 operator<(const fvec4& other) const { fvec4 operator<(fvec4 other) const {
return vcvtq_f32_s32(vreinterpretq_s32_u32(vcltq_f32(val, other))); return vcvtq_f32_s32(vreinterpretq_s32_u32(vcltq_f32(val, other)));
} }
fvec4 operator>=(const fvec4& other) const { fvec4 operator>=(fvec4 other) const {
return vcvtq_f32_s32(vreinterpretq_s32_u32(vcgeq_f32(val, other))); return vcvtq_f32_s32(vreinterpretq_s32_u32(vcgeq_f32(val, other)));
} }
fvec4 operator<=(const fvec4& other) const { fvec4 operator<=(fvec4 other) const {
return vcvtq_f32_s32(vreinterpretq_s32_u32(vcleq_f32(val, other))); return vcvtq_f32_s32(vreinterpretq_s32_u32(vcleq_f32(val, other)));
} }
operator ivec4() const; operator ivec4() const;
...@@ -217,49 +217,49 @@ public: ...@@ -217,49 +217,49 @@ public:
void store(int* v) const { void store(int* v) const {
vst1q_s32(v, val); vst1q_s32(v, val);
} }
ivec4 operator+(const ivec4& other) const { ivec4 operator+(ivec4 other) const {
return vaddq_s32(val, other); return vaddq_s32(val, other);
} }
ivec4 operator-(const ivec4& other) const { ivec4 operator-(ivec4 other) const {
return vsubq_s32(val, other); return vsubq_s32(val, other);
} }
ivec4 operator*(const ivec4& other) const { ivec4 operator*(ivec4 other) const {
return vmulq_s32(val, other); return vmulq_s32(val, other);
} }
void operator+=(const ivec4& other) { void operator+=(ivec4 other) {
val = vaddq_s32(val, other); val = vaddq_s32(val, other);
} }
void operator-=(const ivec4& other) { void operator-=(ivec4 other) {
val = vsubq_s32(val, other); val = vsubq_s32(val, other);
} }
void operator*=(const ivec4& other) { void operator*=(ivec4 other) {
val = vmulq_s32(val, other); val = vmulq_s32(val, other);
} }
ivec4 operator-() const { ivec4 operator-() const {
return vnegq_s32(val); return vnegq_s32(val);
} }
ivec4 operator&(const ivec4& other) const { ivec4 operator&(ivec4 other) const {
return vandq_s32(val, other); return vandq_s32(val, other);
} }
ivec4 operator|(const ivec4& other) const { ivec4 operator|(ivec4 other) const {
return vorrq_s32(val, other); return vorrq_s32(val, other);
} }
ivec4 operator==(const ivec4& other) const { ivec4 operator==(ivec4 other) const {
return vreinterpretq_s32_u32(vceqq_s32(val, other)); return vreinterpretq_s32_u32(vceqq_s32(val, other));
} }
ivec4 operator!=(const ivec4& other) const { ivec4 operator!=(ivec4 other) const {
return vreinterpretq_s32_u32(vmvnq_u32(vceqq_s32(val, other))); // not(equal(val, other)) return vreinterpretq_s32_u32(vmvnq_u32(vceqq_s32(val, other))); // not(equal(val, other))
} }
ivec4 operator>(const ivec4& other) const { ivec4 operator>(ivec4 other) const {
return vreinterpretq_s32_u32(vcgtq_s32(val, other)); return vreinterpretq_s32_u32(vcgtq_s32(val, other));
} }
ivec4 operator<(const ivec4& other) const { ivec4 operator<(ivec4 other) const {
return vreinterpretq_s32_u32(vcltq_s32(val, other)); return vreinterpretq_s32_u32(vcltq_s32(val, other));
} }
ivec4 operator>=(const ivec4& other) const { ivec4 operator>=(ivec4 other) const {
return vreinterpretq_s32_u32(vcgeq_s32(val, other)); return vreinterpretq_s32_u32(vcgeq_s32(val, other));
} }
ivec4 operator<=(const ivec4& other) const { ivec4 operator<=(ivec4 other) const {
return vreinterpretq_s32_u32(vcleq_s32(val, other)); return vreinterpretq_s32_u32(vcleq_s32(val, other));
} }
operator fvec4() const; operator fvec4() const;
...@@ -283,52 +283,52 @@ inline ivec4 fvec4::expandBitsToMask(int bitmask) { ...@@ -283,52 +283,52 @@ inline ivec4 fvec4::expandBitsToMask(int bitmask) {
} }
// Functions that operate on fvec4s. // Functions that operate on fvec4s.
static inline fvec4 min(const fvec4& v1, const fvec4& v2) { static inline fvec4 min(fvec4 v1, fvec4 v2) {
return vminq_f32(v1, v2); return vminq_f32(v1, v2);
} }
static inline fvec4 max(const fvec4& v1, const fvec4& v2) { static inline fvec4 max(fvec4 v1, fvec4 v2) {
return vmaxq_f32(v1, v2); return vmaxq_f32(v1, v2);
} }
static inline fvec4 abs(const fvec4& v) { static inline fvec4 abs(fvec4 v) {
return vabsq_f32(v); return vabsq_f32(v);
} }
static inline fvec4 rsqrt(const fvec4& v) { static inline fvec4 rsqrt(fvec4 v) {
float32x4_t recipSqrt = vrsqrteq_f32(v); float32x4_t recipSqrt = vrsqrteq_f32(v);
recipSqrt = vmulq_f32(recipSqrt, vrsqrtsq_f32(vmulq_f32(recipSqrt, v), recipSqrt)); recipSqrt = vmulq_f32(recipSqrt, vrsqrtsq_f32(vmulq_f32(recipSqrt, v), recipSqrt));
recipSqrt = vmulq_f32(recipSqrt, vrsqrtsq_f32(vmulq_f32(recipSqrt, v), recipSqrt)); recipSqrt = vmulq_f32(recipSqrt, vrsqrtsq_f32(vmulq_f32(recipSqrt, v), recipSqrt));
return recipSqrt; return recipSqrt;
} }
static inline fvec4 sqrt(const fvec4& v) { static inline fvec4 sqrt(fvec4 v) {
return rsqrt(v)*v; return rsqrt(v)*v;
} }
static inline fvec4 exp(const fvec4& v) { static inline fvec4 exp(fvec4 v) {
return fvec4(exp_ps(v.val)); return fvec4(exp_ps(v.val));
} }
static inline fvec4 log(const fvec4& v) { static inline fvec4 log(fvec4 v) {
return fvec4(log_ps(v.val)); return fvec4(log_ps(v.val));
} }
static inline float dot3(const fvec4& v1, const fvec4& v2) { static inline float dot3(fvec4 v1, fvec4 v2) {
fvec4 result = v1*v2; fvec4 result = v1*v2;
return vgetq_lane_f32(result, 0) + vgetq_lane_f32(result, 1) + vgetq_lane_f32(result, 2); return vgetq_lane_f32(result, 0) + vgetq_lane_f32(result, 1) + vgetq_lane_f32(result, 2);
} }
static inline float dot4(const fvec4& v1, const fvec4& v2) { static inline float dot4(fvec4 v1, fvec4 v2) {
fvec4 result = v1*v2; fvec4 result = v1*v2;
return vgetq_lane_f32(result, 0) + vgetq_lane_f32(result, 1) + vgetq_lane_f32(result, 2) + vgetq_lane_f32(result,3); return vgetq_lane_f32(result, 0) + vgetq_lane_f32(result, 1) + vgetq_lane_f32(result, 2) + vgetq_lane_f32(result,3);
} }
static inline float reduceAdd(const fvec4 v) { static inline float reduceAdd(fvec4 v) {
return dot4(v, fvec4(1.0f)); return dot4(v, fvec4(1.0f));
} }
static inline fvec4 cross(const fvec4& v1, const fvec4& v2) { static inline fvec4 cross(fvec4 v1, fvec4 v2) {
return fvec4(v1[1]*v2[2] - v1[2]*v2[1], return fvec4(v1[1]*v2[2] - v1[2]*v2[1],
v1[2]*v2[0] - v1[0]*v2[2], v1[2]*v2[0] - v1[0]*v2[2],
v1[0]*v2[1] - v1[1]*v2[0], 0); v1[0]*v2[1] - v1[1]*v2[0], 0);
...@@ -356,26 +356,26 @@ static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2, ...@@ -356,26 +356,26 @@ static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2,
/** /**
* Out-of-place transpose from named variables into an array. * Out-of-place transpose from named variables into an array.
*/ */
static inline void transpose(const fvec4 v0, const fvec4 v1, const fvec4 v2, const fvec4 v3, fvec4 out[4]) { static inline void transpose(fvec4 v0, fvec4 v1, fvec4 v2, fvec4 v3, fvec4 out[4]) {
out[0] = v0; out[1] = v1; out[2] = v2; out[3] = v3; out[0] = v0; out[1] = v1; out[2] = v2; out[3] = v3;
transpose(out[0], out[1], out[2], out[3]); transpose(out[0], out[1], out[2], out[3]);
} }
// Functions that operate on ivec4s. // Functions that operate on ivec4s.
static inline ivec4 min(const ivec4& v1, const ivec4& v2) { static inline ivec4 min(ivec4 v1, ivec4 v2) {
return vminq_s32(v1, v2); return vminq_s32(v1, v2);
} }
static inline ivec4 max(const ivec4& v1, const ivec4& v2) { static inline ivec4 max(ivec4 v1, ivec4 v2) {
return vmaxq_s32(v1, v2); return vmaxq_s32(v1, v2);
} }
static inline ivec4 abs(const ivec4& v) { static inline ivec4 abs(ivec4 v) {
return vabdq_s32(v, ivec4(0)); return vabdq_s32(v, ivec4(0));
} }
static inline bool any(const ivec4& v) { static inline bool any(ivec4 v) {
#ifdef __ARM64__ #ifdef __ARM64__
return (vmaxvq_u32(vreinterpretq_u32_s32(v)) != 0); return (vmaxvq_u32(vreinterpretq_u32_s32(v)) != 0);
#else #else
...@@ -385,46 +385,46 @@ static inline bool any(const ivec4& v) { ...@@ -385,46 +385,46 @@ static inline bool any(const ivec4& v) {
// Mathematical operators involving a scalar and a vector. // Mathematical operators involving a scalar and a vector.
static inline fvec4 operator+(float v1, const fvec4& v2) { static inline fvec4 operator+(float v1, fvec4 v2) {
return fvec4(v1)+v2; return fvec4(v1)+v2;
} }
static inline fvec4 operator-(float v1, const fvec4& v2) { static inline fvec4 operator-(float v1, fvec4 v2) {
return fvec4(v1)-v2; return fvec4(v1)-v2;
} }
static inline fvec4 operator*(float v1, const fvec4& v2) { static inline fvec4 operator*(float v1, fvec4 v2) {
return fvec4(v1)*v2; return fvec4(v1)*v2;
} }
static inline fvec4 operator/(float v1, const fvec4& v2) { static inline fvec4 operator/(float v1, fvec4 v2) {
return fvec4(v1)/v2; return fvec4(v1)/v2;
} }
// Operations for blending fvec4s based on an ivec4. // Operations for blending fvec4s based on an ivec4.
static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const ivec4& mask) { static inline fvec4 blend(fvec4 v1, fvec4 v2, ivec4 mask) {
return vbslq_f32(vreinterpretq_u32_s32(mask), v2, v1); return vbslq_f32(vreinterpretq_u32_s32(mask), v2, v1);
} }
static inline fvec4 blendZero(const fvec4 v, const ivec4 mask) { static inline fvec4 blendZero(fvec4 v, ivec4 mask) {
return blend(0.0f, v, mask); return blend(0.0f, v, mask);
} }
// These are at the end since they involve other functions defined above. // These are at the end since they involve other functions defined above.
static inline fvec4 round(const fvec4& v) { static inline fvec4 round(fvec4 v) {
fvec4 shift(0x1.0p23f); fvec4 shift(0x1.0p23f);
fvec4 absResult = (abs(v)+shift)-shift; fvec4 absResult = (abs(v)+shift)-shift;
return blend(v, absResult, ivec4(0x7FFFFFFF)); return blend(v, absResult, ivec4(0x7FFFFFFF));
} }
static inline fvec4 floor(const fvec4& v) { static inline fvec4 floor(fvec4 v) {
fvec4 rounded = round(v); fvec4 rounded = round(v);
return rounded + blend(0.0f, -1.0f, rounded>v); return rounded + blend(0.0f, -1.0f, rounded>v);
} }
static inline fvec4 ceil(const fvec4& v) { static inline fvec4 ceil(fvec4 v) {
fvec4 rounded = round(v); fvec4 rounded = round(v);
return rounded + blend(0.0f, 1.0f, rounded<v); return rounded + blend(0.0f, 1.0f, rounded<v);
} }
...@@ -433,7 +433,7 @@ static inline fvec4 ceil(const fvec4& v) { ...@@ -433,7 +433,7 @@ static inline fvec4 ceil(const fvec4& v) {
* of vectors. The first result vector contains the values at the given indexes, and the second * of vectors. The first result vector contains the values at the given indexes, and the second
* result vector contains the values from each respective index+1. * result vector contains the values from each respective index+1.
*/ */
static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& out0, fvec4& out1) { static inline void gatherVecPair(const float* table, ivec4 index, fvec4& out0, fvec4& out1) {
fvec4 t0(table + index[0]); fvec4 t0(table + index[0]);
fvec4 t1(table + index[1]); fvec4 t1(table + index[1]);
fvec4 t2(table + index[2]); fvec4 t2(table + index[2]);
...@@ -456,7 +456,7 @@ static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& o ...@@ -456,7 +456,7 @@ static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& o
* output[2] = (Z0 + Z1 + Z2 + Z3) * output[2] = (Z0 + Z1 + Z2 + Z3)
* output[3] = undefined * output[3] = undefined
*/ */
static inline fvec4 reduceToVec3(const fvec4 x, const fvec4 y, const fvec4 z) { static inline fvec4 reduceToVec3(fvec4 x, fvec4 y, fvec4 z) {
const auto nx = reduceAdd(x); const auto nx = reduceAdd(x);
const auto ny = reduceAdd(y); const auto ny = reduceAdd(y);
const auto nz = reduceAdd(z); const auto nz = reduceAdd(z);
......
...@@ -95,45 +95,45 @@ public: ...@@ -95,45 +95,45 @@ public:
v[1] = val[1]; v[1] = val[1];
v[2] = val[2]; v[2] = val[2];
} }
fvec4 operator+(const fvec4& other) const { fvec4 operator+(fvec4 other) const {
return val+other; return val+other;
} }
fvec4 operator-(const fvec4& other) const { fvec4 operator-(fvec4 other) const {
return val-other; return val-other;
} }
fvec4 operator*(const fvec4& other) const { fvec4 operator*(fvec4 other) const {
return val*other; return val*other;
} }
fvec4 operator/(const fvec4& other) const { fvec4 operator/(fvec4 other) const {
return val/other; return val/other;
} }
void operator+=(const fvec4& other) { void operator+=(fvec4 other) {
val = val+other; val = val+other;
} }
void operator-=(const fvec4& other) { void operator-=(fvec4 other) {
val = val-other; val = val-other;
} }
void operator*=(const fvec4& other) { void operator*=(fvec4 other) {
val = val*other; val = val*other;
} }
void operator/=(const fvec4& other) { void operator/=(fvec4 other) {
val = val/other; val = val/other;
} }
fvec4 operator-() const { fvec4 operator-() const {
return -val; return -val;
} }
fvec4 operator&(const fvec4& other) const { fvec4 operator&(fvec4 other) const {
return (fvec4) (((__m128i)val)&((__m128i)other.val)); return (fvec4) (((__m128i)val)&((__m128i)other.val));
} }
fvec4 operator|(const fvec4& other) const { fvec4 operator|(fvec4 other) const {
return (fvec4) (((__m128i)val)|((__m128i)other.val)); return (fvec4) (((__m128i)val)|((__m128i)other.val));
} }
ivec4 operator==(const fvec4& other) const; ivec4 operator==(fvec4 other) const;
ivec4 operator!=(const fvec4& other) const; ivec4 operator!=(fvec4 other) const;
ivec4 operator>(const fvec4& other) const; ivec4 operator>(fvec4 other) const;
ivec4 operator<(const fvec4& other) const; ivec4 operator<(fvec4 other) const;
ivec4 operator>=(const fvec4& other) const; ivec4 operator>=(fvec4 other) const;
ivec4 operator<=(const fvec4& other) const; ivec4 operator<=(fvec4 other) const;
operator ivec4() const; operator ivec4() const;
/** /**
...@@ -171,49 +171,49 @@ public: ...@@ -171,49 +171,49 @@ public:
void store(int* v) const { void store(int* v) const {
*((__m128*) v) = val; *((__m128*) v) = val;
} }
ivec4 operator+(const ivec4& other) const { ivec4 operator+(ivec4 other) const {
return val+other; return val+other;
} }
ivec4 operator-(const ivec4& other) const { ivec4 operator-(ivec4 other) const {
return val-other; return val-other;
} }
ivec4 operator*(const ivec4& other) const { ivec4 operator*(ivec4 other) const {
return val*other; return val*other;
} }
void operator+=(const ivec4& other) { void operator+=(ivec4 other) {
val = val+other; val = val+other;
} }
void operator-=(const ivec4& other) { void operator-=(ivec4 other) {
val = val-other; val = val-other;
} }
void operator*=(const ivec4& other) { void operator*=(ivec4 other) {
val = val*other; val = val*other;
} }
ivec4 operator-() const { ivec4 operator-() const {
return -val; return -val;
} }
ivec4 operator&(const ivec4& other) const { ivec4 operator&(ivec4 other) const {
return val&other.val; return val&other.val;
} }
ivec4 operator|(const ivec4& other) const { ivec4 operator|(ivec4 other) const {
return val|other.val; return val|other.val;
} }
ivec4 operator==(const ivec4& other) const { ivec4 operator==(ivec4 other) const {
return (val==other.val); return (val==other.val);
} }
ivec4 operator!=(const ivec4& other) const { ivec4 operator!=(ivec4 other) const {
return (val!=other.val); return (val!=other.val);
} }
ivec4 operator>(const ivec4& other) const { ivec4 operator>(ivec4 other) const {
return (val>other.val); return (val>other.val);
} }
ivec4 operator<(const ivec4& other) const { ivec4 operator<(ivec4 other) const {
return (val<other.val); return (val<other.val);
} }
ivec4 operator>=(const ivec4& other) const { ivec4 operator>=(ivec4 other) const {
return (val>=other.val); return (val>=other.val);
} }
ivec4 operator<=(const ivec4& other) const { ivec4 operator<=(ivec4 other) const {
return (val<=other.val); return (val<=other.val);
} }
operator fvec4() const; operator fvec4() const;
...@@ -221,27 +221,27 @@ public: ...@@ -221,27 +221,27 @@ public:
// Conversion operators. // Conversion operators.
inline ivec4 fvec4::operator==(const fvec4& other) const { inline ivec4 fvec4::operator==(fvec4 other) const {
return (__m128i) (val==other.val); return (__m128i) (val==other.val);
} }
inline ivec4 fvec4::operator!=(const fvec4& other) const { inline ivec4 fvec4::operator!=(fvec4 other) const {
return (__m128i) (val!=other.val); return (__m128i) (val!=other.val);
} }
inline ivec4 fvec4::operator>(const fvec4& other) const { inline ivec4 fvec4::operator>(fvec4 other) const {
return (__m128i) (val>other.val); return (__m128i) (val>other.val);
} }
inline ivec4 fvec4::operator<(const fvec4& other) const { inline ivec4 fvec4::operator<(fvec4 other) const {
return (__m128i) (val<other.val); return (__m128i) (val<other.val);
} }
inline ivec4 fvec4::operator>=(const fvec4& other) const { inline ivec4 fvec4::operator>=(fvec4 other) const {
return (__m128i) (val>=other.val); return (__m128i) (val>=other.val);
} }
inline ivec4 fvec4::operator<=(const fvec4& other) const { inline ivec4 fvec4::operator<=(fvec4 other) const {
return (__m128i) (val<=other.val); return (__m128i) (val<=other.val);
} }
...@@ -262,34 +262,34 @@ inline ivec4 fvec4::expandBitsToMask(int bitmask) { ...@@ -262,34 +262,34 @@ inline ivec4 fvec4::expandBitsToMask(int bitmask) {
// Functions that operate on fvec4s. // Functions that operate on fvec4s.
// Absolute value of v: ANDs v with the integer pattern 0x7FFFFFFF (cast to __m128),
// which clears the top (sign) bit of a 32-bit float.
// NOTE(review): ivec4(0x7FFFFFFF) presumably fills every element — constructor not visible here.
static inline fvec4 abs(const fvec4& v) { static inline fvec4 abs(fvec4 v) {
return v&(__m128) ivec4(0x7FFFFFFF); return v&(__m128) ivec4(0x7FFFFFFF);
} }
// Exponential of v: calls scalar expf() on each of the four elements v[0]..v[3]
// and packs the results into a new fvec4.
static inline fvec4 exp(const fvec4& v) { static inline fvec4 exp(fvec4 v) {
return fvec4(expf(v[0]), expf(v[1]), expf(v[2]), expf(v[3])); return fvec4(expf(v[0]), expf(v[1]), expf(v[2]), expf(v[3]));
} }
// Natural logarithm of v: calls scalar logf() on each of the four elements v[0]..v[3]
// and packs the results into a new fvec4.
static inline fvec4 log(const fvec4& v) { static inline fvec4 log(fvec4 v) {
return fvec4(logf(v[0]), logf(v[1]), logf(v[2]), logf(v[3])); return fvec4(logf(v[0]), logf(v[1]), logf(v[2]), logf(v[3]));
} }
// Three-component dot product: multiplies v1 and v2 and sums elements 0, 1 and 2
// of the product; element 3 is ignored.
static inline float dot3(const fvec4& v1, const fvec4& v2) { static inline float dot3(fvec4 v1, fvec4 v2) {
fvec4 r = v1*v2; fvec4 r = v1*v2;
return r[0]+r[1]+r[2]; return r[0]+r[1]+r[2];
} }
// Four-component dot product of v1 and v2, returned as a scalar.
static inline float dot4(const fvec4& v1, const fvec4& v2) { static inline float dot4(fvec4 v1, fvec4 v2) {
fvec4 r = v1*v2; fvec4 r = v1*v2;
// Pairwise add: elements (0,1) of r plus elements (2,3) of r land in temp[0] and temp[1].
// The -1 shuffle indices leave the upper two result elements unspecified; they are not read.
fvec4 temp = __builtin_shufflevector(r.val, r.val, 0, 1, -1, -1)+__builtin_shufflevector(r.val, r.val, 2, 3, -1, -1); fvec4 temp = __builtin_shufflevector(r.val, r.val, 0, 1, -1, -1)+__builtin_shufflevector(r.val, r.val, 2, 3, -1, -1);
return temp[0]+temp[1]; return temp[0]+temp[1];
} }
// Horizontal sum of v, computed as dot4(v, fvec4(1.0f)).
// NOTE(review): assumes fvec4(1.0f) sets every element to 1.0f — constructor not visible here.
static inline float reduceAdd(const fvec4 v) { static inline float reduceAdd(fvec4 v) {
return dot4(v, fvec4(1.0f)); return dot4(v, fvec4(1.0f));
} }
static inline fvec4 cross(const fvec4& v1, const fvec4& v2) { static inline fvec4 cross(fvec4 v1, fvec4 v2) {
__m128 temp = v2.val*__builtin_shufflevector(v1.val, v1.val, 2, 0, 1, 3) - __m128 temp = v2.val*__builtin_shufflevector(v1.val, v1.val, 2, 0, 1, 3) -
v1.val*__builtin_shufflevector(v2.val, v2.val, 2, 0, 1, 3); v1.val*__builtin_shufflevector(v2.val, v2.val, 2, 0, 1, 3);
return __builtin_shufflevector(temp, temp, 2, 0, 1, 3); return __builtin_shufflevector(temp, temp, 2, 0, 1, 3);
...@@ -317,85 +317,85 @@ static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2, ...@@ -317,85 +317,85 @@ static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2,
/** /**
* Out-of-place transpose from named variables into an array. * Out-of-place transpose from named variables into an array.
*/ */
static inline void transpose(const fvec4 v0, const fvec4 v1, const fvec4 v2, const fvec4 v3, fvec4 out[4]) { static inline void transpose(fvec4 v0, fvec4 v1, fvec4 v2, fvec4 v3, fvec4 out[4]) {
// Copy the four inputs into out[], then hand out[0..3] to the four-argument
// transpose overload (defined outside this view), so the inputs are left untouched.
out[0] = v0; out[1] = v1; out[2] = v2; out[3] = v3; out[0] = v0; out[1] = v1; out[2] = v2; out[3] = v3;
transpose(out[0], out[1], out[2], out[3]); transpose(out[0], out[1], out[2], out[3]);
} }
// Functions that operate on ivec4s. // Functions that operate on ivec4s.
// Element-wise minimum of two ivec4s: applies std::min to each of the four
// element pairs and packs the results into a new ivec4.
static inline ivec4 min(const ivec4& v1, const ivec4& v2) { static inline ivec4 min(ivec4 v1, ivec4 v2) {
return ivec4(std::min(v1[0], v2[0]), std::min(v1[1], v2[1]), std::min(v1[2], v2[2]), std::min(v1[3], v2[3])); return ivec4(std::min(v1[0], v2[0]), std::min(v1[1], v2[1]), std::min(v1[2], v2[2]), std::min(v1[3], v2[3]));
} }
// Element-wise maximum of two ivec4s: applies std::max to each of the four
// element pairs and packs the results into a new ivec4.
static inline ivec4 max(const ivec4& v1, const ivec4& v2) { static inline ivec4 max(ivec4 v1, ivec4 v2) {
return ivec4(std::max(v1[0], v2[0]), std::max(v1[1], v2[1]), std::max(v1[2], v2[2]), std::max(v1[3], v2[3])); return ivec4(std::max(v1[0], v2[0]), std::max(v1[1], v2[1]), std::max(v1[2], v2[2]), std::max(v1[3], v2[3]));
} }
// Element-wise absolute value of an ivec4: calls abs() on each of the four
// elements and packs the results into a new ivec4.
static inline ivec4 abs(const ivec4& v) { static inline ivec4 abs(ivec4 v) {
return ivec4(abs(v[0]), abs(v[1]), abs(v[2]), abs(v[3])); return ivec4(abs(v[0]), abs(v[1]), abs(v[2]), abs(v[3]));
} }
// Returns true if any of the four elements of v is non-zero.
static inline bool any(const __m128i& v) { static inline bool any(__m128i v) {
// OR elements (0,1) with elements (2,3); only temp[0] and temp[1] are meaningful,
// the -1 shuffle indices leave the upper two elements unspecified.
ivec4 temp = __builtin_shufflevector(v, v, 0, 1, -1, -1) | __builtin_shufflevector(v, v, 2, 3, -1, -1); ivec4 temp = __builtin_shufflevector(v, v, 0, 1, -1, -1) | __builtin_shufflevector(v, v, 2, 3, -1, -1);
return (temp[0] || temp[1]); return (temp[0] || temp[1]);
} }
// Mathematical operators involving a scalar and a vector. // Mathematical operators involving a scalar and a vector.
// scalar + vector: builds an fvec4 from the float v1 and adds v2 with fvec4::operator+.
// NOTE(review): fvec4(float) presumably copies v1 into every element — constructor not visible here.
static inline fvec4 operator+(float v1, const fvec4& v2) { static inline fvec4 operator+(float v1, fvec4 v2) {
return fvec4(v1)+v2; return fvec4(v1)+v2;
} }
// scalar - vector: builds an fvec4 from the float v1 and subtracts v2 with fvec4::operator-.
// NOTE(review): fvec4(float) presumably copies v1 into every element — constructor not visible here.
static inline fvec4 operator-(float v1, const fvec4& v2) { static inline fvec4 operator-(float v1, fvec4 v2) {
return fvec4(v1)-v2; return fvec4(v1)-v2;
} }
// scalar * vector: builds an fvec4 from the float v1 and multiplies by v2 with fvec4::operator*.
// NOTE(review): fvec4(float) presumably copies v1 into every element — constructor not visible here.
static inline fvec4 operator*(float v1, const fvec4& v2) { static inline fvec4 operator*(float v1, fvec4 v2) {
return fvec4(v1)*v2; return fvec4(v1)*v2;
} }
// scalar / vector: builds an fvec4 from the float v1 and divides it by v2 with fvec4::operator/.
// NOTE(review): fvec4(float) presumably copies v1 into every element — constructor not visible here.
static inline fvec4 operator/(float v1, const fvec4& v2) { static inline fvec4 operator/(float v1, fvec4 v2) {
return fvec4(v1)/v2; return fvec4(v1)/v2;
} }
// Operations for blending fvec4s based on an ivec4. // Operations for blending fvec4s based on an ivec4.
// Bitwise select between v1 and v2: bits set in mask are taken from v2, bits clear
// in mask are taken from v1.
static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const __m128i& mask) { static inline fvec4 blend(fvec4 v1, fvec4 v2, __m128i mask) {
// 0xFFFFFFFF - mask equals the bitwise complement of mask for 32-bit values, so the
// two AND terms cover disjoint bits and the + acts as a bitwise OR.
return (__m128) ((mask&(__m128i)v2) + ((ivec4(0xFFFFFFFF)-ivec4(mask))&(__m128i)v1)); return (__m128) ((mask&(__m128i)v2) + ((ivec4(0xFFFFFFFF)-ivec4(mask))&(__m128i)v1));
} }
// Calls blend(0.0f, v, mask): the result takes v where mask is set and the value
// built from 0.0f elsewhere.
static inline fvec4 blendZero(const fvec4 v, const ivec4 mask) { static inline fvec4 blendZero(fvec4 v, ivec4 mask) {
return blend(0.0f, v, mask); return blend(0.0f, v, mask);
} }
// These are at the end since they involve other functions defined above. // These are at the end since they involve other functions defined above.
// Minimum of two fvec4s via blend: where the mask `v1 > v2` is set the result
// takes v2, otherwise it takes v1.
static inline fvec4 min(const fvec4& v1, const fvec4& v2) { static inline fvec4 min(fvec4 v1, fvec4 v2) {
return blend(v1, v2, v1 > v2); return blend(v1, v2, v1 > v2);
} }
// Maximum of two fvec4s via blend: where the mask `v1 < v2` is set the result
// takes v2, otherwise it takes v1.
static inline fvec4 max(const fvec4& v1, const fvec4& v2) { static inline fvec4 max(fvec4 v1, fvec4 v2) {
return blend(v1, v2, v1 < v2); return blend(v1, v2, v1 < v2);
} }
// Rounds v to integer values while keeping the sign bit of the input.
static inline fvec4 round(const fvec4& v) { static inline fvec4 round(fvec4 v) {
// 0x1.0p23f is 2^23. Adding it to |v| and subtracting it again drops the fractional bits.
// NOTE(review): presumably relies on round-to-nearest mode and |v| < 2^23 — confirm.
fvec4 shift(0x1.0p23f); fvec4 shift(0x1.0p23f);
fvec4 absResult = (abs(v)+shift)-shift; fvec4 absResult = (abs(v)+shift)-shift;
// Recombine: sign bit (0x80000000) from v, remaining 31 bits from absResult.
return (__m128) ((ivec4(0x80000000)&(__m128i)v) + (ivec4(0x7FFFFFFF)&(__m128i)absResult)); return (__m128) ((ivec4(0x80000000)&(__m128i)v) + (ivec4(0x7FFFFFFF)&(__m128i)absResult));
} }
// Floor of v: converts to integers and back (float -> __m128i -> __m128), then adds -1
// wherever the converted value is greater than v.
// NOTE(review): the int round-trip presumably limits the valid input range to values
// representable as 32-bit integers — confirm.
static inline fvec4 floor(const fvec4& v) { static inline fvec4 floor(fvec4 v) {
fvec4 truncated = __builtin_convertvector(__builtin_convertvector(v.val, __m128i), __m128); fvec4 truncated = __builtin_convertvector(__builtin_convertvector(v.val, __m128i), __m128);
return truncated + blend(0.0f, -1.0f, truncated>v); return truncated + blend(0.0f, -1.0f, truncated>v);
} }
// Ceiling of v: converts to integers and back (float -> __m128i -> __m128), then adds 1
// wherever the converted value is less than v.
// NOTE(review): the int round-trip presumably limits the valid input range to values
// representable as 32-bit integers — confirm.
static inline fvec4 ceil(const fvec4& v) { static inline fvec4 ceil(fvec4 v) {
fvec4 truncated = __builtin_convertvector(__builtin_convertvector(v.val, __m128i), __m128); fvec4 truncated = __builtin_convertvector(__builtin_convertvector(v.val, __m128i), __m128);
return truncated + blend(0.0f, 1.0f, truncated<v); return truncated + blend(0.0f, 1.0f, truncated<v);
} }
static inline fvec4 rsqrt(const fvec4& v) { static inline fvec4 rsqrt(fvec4 v) {
// Initial estimate of rsqrt(). // Initial estimate of rsqrt().
ivec4 i = (__m128i) v; ivec4 i = (__m128i) v;
...@@ -411,7 +411,7 @@ static inline fvec4 rsqrt(const fvec4& v) { ...@@ -411,7 +411,7 @@ static inline fvec4 rsqrt(const fvec4& v) {
return y; return y;
} }
// Square root computed as rsqrt(v)*v, i.e. (1/sqrt(v)) * v.
// NOTE(review): for v == 0 this may evaluate inf*0 depending on what rsqrt returns
// for zero; rsqrt is only partly visible here — confirm.
static inline fvec4 sqrt(const fvec4& v) { static inline fvec4 sqrt(fvec4 v) {
return rsqrt(v)*v; return rsqrt(v)*v;
} }
...@@ -420,7 +420,7 @@ static inline fvec4 sqrt(const fvec4& v) { ...@@ -420,7 +420,7 @@ static inline fvec4 sqrt(const fvec4& v) {
* of vectors. The first result vector contains the values at the given indexes, and the second * of vectors. The first result vector contains the values at the given indexes, and the second
* result vector contains the values from each respective index+1. * result vector contains the values from each respective index+1.
*/ */
static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& out0, fvec4& out1) { static inline void gatherVecPair(const float* table, ivec4 index, fvec4& out0, fvec4& out1) {
fvec4 t0(table + index[0]); fvec4 t0(table + index[0]);
fvec4 t1(table + index[1]); fvec4 t1(table + index[1]);
fvec4 t2(table + index[2]); fvec4 t2(table + index[2]);
...@@ -443,7 +443,7 @@ static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& o ...@@ -443,7 +443,7 @@ static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& o
* output[2] = (Z0 + Z1 + Z2 + Z3) * output[2] = (Z0 + Z1 + Z2 + Z3)
* output[3] = undefined * output[3] = undefined
*/ */
static inline fvec4 reduceToVec3(const fvec4 x, const fvec4 y, const fvec4 z) { static inline fvec4 reduceToVec3(fvec4 x, fvec4 y, fvec4 z) {
const auto nx = reduceAdd(x); const auto nx = reduceAdd(x);
const auto ny = reduceAdd(y); const auto ny = reduceAdd(y);
const auto nz = reduceAdd(z); const auto nz = reduceAdd(z);
......
...@@ -97,45 +97,45 @@ public: ...@@ -97,45 +97,45 @@ public:
v[2] = val[2]; v[2] = val[2];
} }
// Sum of this vector and other, computed with vec_add on the underlying values.
fvec4 operator+(const fvec4& other) const { fvec4 operator+(fvec4 other) const {
return vec_add(val, other.val); return vec_add(val, other.val);
} }
// Difference of this vector and other, computed with vec_sub on the underlying values.
fvec4 operator-(const fvec4& other) const { fvec4 operator-(fvec4 other) const {
return vec_sub(val, other.val); return vec_sub(val, other.val);
} }
// Product of this vector and other, computed with vec_mul on the underlying values.
fvec4 operator*(const fvec4& other) const { fvec4 operator*(fvec4 other) const {
return vec_mul(val, other.val); return vec_mul(val, other.val);
} }
// Quotient of this vector and other, computed with vec_div on the underlying values.
fvec4 operator/(const fvec4& other) const { fvec4 operator/(fvec4 other) const {
return vec_div(val, other.val); return vec_div(val, other.val);
} }
// In-place add: replaces val with vec_add(val, other.val). Returns nothing, so it cannot be chained.
void operator+=(const fvec4& other) { void operator+=(fvec4 other) {
val = vec_add(val, other.val); val = vec_add(val, other.val);
} }
// In-place subtract: replaces val with vec_sub(val, other.val). Returns nothing, so it cannot be chained.
void operator-=(const fvec4& other) { void operator-=(fvec4 other) {
val = vec_sub(val, other.val); val = vec_sub(val, other.val);
} }
// In-place multiply: replaces val with vec_mul(val, other.val). Returns nothing, so it cannot be chained.
void operator*=(const fvec4& other) { void operator*=(fvec4 other) {
val = vec_mul(val, other.val); val = vec_mul(val, other.val);
} }
// In-place divide: replaces val with vec_div(val, other.val). Returns nothing, so it cannot be chained.
void operator/=(const fvec4& other) { void operator/=(fvec4 other) {
val = vec_div(val, other.val); val = vec_div(val, other.val);
} }
// Unary minus: returns the negation of the underlying value (`-val`) as a new fvec4.
fvec4 operator-() const { fvec4 operator-() const {
return -val; return -val;
} }
// Bitwise AND of this vector and other, computed with vec_and on the underlying values.
fvec4 operator&(const fvec4& other) const { fvec4 operator&(fvec4 other) const {
return vec_and(val, other.val); return vec_and(val, other.val);
} }
// Bitwise OR of this vector and other, computed with vec_or on the underlying values.
fvec4 operator|(const fvec4& other) const { fvec4 operator|(fvec4 other) const {
return vec_or(val, other.val); return vec_or(val, other.val);
} }
ivec4 operator==(const fvec4& other) const; ivec4 operator==(fvec4 other) const;
ivec4 operator!=(const fvec4& other) const; ivec4 operator!=(fvec4 other) const;
ivec4 operator>(const fvec4& other) const; ivec4 operator>(fvec4 other) const;
ivec4 operator<(const fvec4& other) const; ivec4 operator<(fvec4 other) const;
ivec4 operator>=(const fvec4& other) const; ivec4 operator>=(fvec4 other) const;
ivec4 operator<=(const fvec4& other) const; ivec4 operator<=(fvec4 other) const;
operator ivec4() const; operator ivec4() const;
/*** /***
...@@ -173,49 +173,49 @@ public: ...@@ -173,49 +173,49 @@ public:
// Writes the vector to memory at v by assigning val through a __m128i pointer cast.
// NOTE(review): a plain pointer-cast store presumably requires v to be suitably
// aligned for __m128i — confirm against callers.
void store(int* v) const { void store(int* v) const {
*((__m128i*) v) = val; *((__m128i*) v) = val;
} }
// Sum of this integer vector and other, computed with vec_add on the underlying values.
ivec4 operator+(const ivec4& other) const { ivec4 operator+(ivec4 other) const {
return vec_add(val, other.val); return vec_add(val, other.val);
} }
// Difference of this integer vector and other, computed with vec_sub on the underlying values.
ivec4 operator-(const ivec4& other) const { ivec4 operator-(ivec4 other) const {
return vec_sub(val, other.val); return vec_sub(val, other.val);
} }
// Product of this integer vector and other, using the built-in `*` on the underlying
// vector values rather than a named intrinsic.
ivec4 operator*(const ivec4& other) const { ivec4 operator*(ivec4 other) const {
return val*other.val; return val*other.val;
} }
// In-place add: replaces val with vec_add(val, other.val). Returns nothing, so it cannot be chained.
void operator+=(const ivec4& other) { void operator+=(ivec4 other) {
val = vec_add(val, other.val); val = vec_add(val, other.val);
} }
// In-place subtract: replaces val with vec_sub(val, other.val). Returns nothing, so it cannot be chained.
void operator-=(const ivec4& other) { void operator-=(ivec4 other) {
val = vec_sub(val, other.val); val = vec_sub(val, other.val);
} }
// In-place multiply: replaces val with `val*other.val`. Returns nothing, so it cannot be chained.
void operator*=(const ivec4& other) { void operator*=(ivec4 other) {
val = val*other.val; val = val*other.val;
} }
// Unary minus: returns the negation of the underlying value (`-val`) as a new ivec4.
ivec4 operator-() const { ivec4 operator-() const {
return -val; return -val;
} }
// Bitwise AND of this integer vector and other, using the built-in `&` on the underlying values.
ivec4 operator&(const ivec4& other) const { ivec4 operator&(ivec4 other) const {
return val&other.val; return val&other.val;
} }
// Bitwise OR of this integer vector and other, using the built-in `|` on the underlying values.
ivec4 operator|(const ivec4& other) const { ivec4 operator|(ivec4 other) const {
return val|other.val; return val|other.val;
} }
// Equality comparison: returns `val == other.val` converted to an ivec4.
// NOTE(review): presumably a per-element mask; the type of `val` is declared outside this view.
ivec4 operator==(const ivec4& other) const { ivec4 operator==(ivec4 other) const {
return (val==other.val); return (val==other.val);
} }
// Inequality comparison: returns `val != other.val` converted to an ivec4.
// NOTE(review): presumably a per-element mask; the type of `val` is declared outside this view.
ivec4 operator!=(const ivec4& other) const { ivec4 operator!=(ivec4 other) const {
return (val!=other.val); return (val!=other.val);
} }
// Greater-than comparison: returns `val > other.val` converted to an ivec4.
// NOTE(review): presumably a per-element mask; the type of `val` is declared outside this view.
ivec4 operator>(const ivec4& other) const { ivec4 operator>(ivec4 other) const {
return (val>other.val); return (val>other.val);
} }
// Less-than comparison: returns `val < other.val` converted to an ivec4.
// NOTE(review): presumably a per-element mask; the type of `val` is declared outside this view.
ivec4 operator<(const ivec4& other) const { ivec4 operator<(ivec4 other) const {
return (val<other.val); return (val<other.val);
} }
// Greater-or-equal comparison: returns `val >= other.val` converted to an ivec4.
// NOTE(review): presumably a per-element mask; the type of `val` is declared outside this view.
ivec4 operator>=(const ivec4& other) const { ivec4 operator>=(ivec4 other) const {
return (val>=other.val); return (val>=other.val);
} }
// Less-or-equal comparison: returns `val <= other.val` converted to an ivec4.
// NOTE(review): presumably a per-element mask; the type of `val` is declared outside this view.
ivec4 operator<=(const ivec4& other) const { ivec4 operator<=(ivec4 other) const {
return (val<=other.val); return (val<=other.val);
} }
operator fvec4() const; operator fvec4() const;
...@@ -223,27 +223,27 @@ public: ...@@ -223,27 +223,27 @@ public:
// Conversion operators. // Conversion operators.
// Out-of-class definition of fvec4::operator==: evaluates `val == other.val` and
// returns the comparison result as an ivec4 (no explicit cast is applied here).
inline ivec4 fvec4::operator==(const fvec4& other) const { inline ivec4 fvec4::operator==(fvec4 other) const {
return (val==other.val); return (val==other.val);
} }
// Out-of-class definition of fvec4::operator!=: evaluates `val != other.val` and
// returns the comparison result as an ivec4 (no explicit cast is applied here).
inline ivec4 fvec4::operator!=(const fvec4& other) const { inline ivec4 fvec4::operator!=(fvec4 other) const {
return (val!=other.val); return (val!=other.val);
} }
// Out-of-class definition of fvec4::operator>: evaluates `val > other.val` and
// returns the comparison result as an ivec4 (no explicit cast is applied here).
inline ivec4 fvec4::operator>(const fvec4& other) const { inline ivec4 fvec4::operator>(fvec4 other) const {
return (val>other.val); return (val>other.val);
} }
// Out-of-class definition of fvec4::operator<: evaluates `val < other.val` and
// returns the comparison result as an ivec4 (no explicit cast is applied here).
inline ivec4 fvec4::operator<(const fvec4& other) const { inline ivec4 fvec4::operator<(fvec4 other) const {
return (val<other.val); return (val<other.val);
} }
// Out-of-class definition of fvec4::operator>=: evaluates `val >= other.val` and
// returns the comparison result as an ivec4 (no explicit cast is applied here).
inline ivec4 fvec4::operator>=(const fvec4& other) const { inline ivec4 fvec4::operator>=(fvec4 other) const {
return (val>=other.val); return (val>=other.val);
} }
// Out-of-class definition of fvec4::operator<=: evaluates `val <= other.val` and
// returns the comparison result as an ivec4 (no explicit cast is applied here).
inline ivec4 fvec4::operator<=(const fvec4& other) const { inline ivec4 fvec4::operator<=(fvec4 other) const {
return (val<=other.val); return (val<=other.val);
} }
...@@ -264,34 +264,34 @@ inline ivec4 fvec4::expandBitsToMask(int bitmask) { ...@@ -264,34 +264,34 @@ inline ivec4 fvec4::expandBitsToMask(int bitmask) {
// Functions that operate on fvec4s. // Functions that operate on fvec4s.
// Absolute value of v, computed with vec_abs on the underlying value.
static inline fvec4 abs(const fvec4& v) { static inline fvec4 abs(fvec4 v) {
return vec_abs(v.val); return vec_abs(v.val);
} }
// Exponential of v: calls scalar expf() on each of the four elements v[0]..v[3]
// and packs the results into a new fvec4.
static inline fvec4 exp(const fvec4& v) { static inline fvec4 exp(fvec4 v) {
return fvec4(expf(v[0]), expf(v[1]), expf(v[2]), expf(v[3])); return fvec4(expf(v[0]), expf(v[1]), expf(v[2]), expf(v[3]));
} }
// Natural logarithm of v: calls scalar logf() on each of the four elements v[0]..v[3]
// and packs the results into a new fvec4.
static inline fvec4 log(const fvec4& v) { static inline fvec4 log(fvec4 v) {
return fvec4(logf(v[0]), logf(v[1]), logf(v[2]), logf(v[3])); return fvec4(logf(v[0]), logf(v[1]), logf(v[2]), logf(v[3]));
} }
// Three-component dot product: multiplies v1 and v2 and sums elements 0, 1 and 2
// of the product; element 3 is ignored.
static inline float dot3(const fvec4& v1, const fvec4& v2) { static inline float dot3(fvec4 v1, fvec4 v2) {
fvec4 r = v1*v2; fvec4 r = v1*v2;
return r[0]+r[1]+r[2]; return r[0]+r[1]+r[2];
} }
// Four-component dot product of v1 and v2, returned as a scalar.
static inline float dot4(const fvec4& v1, const fvec4& v2) { static inline float dot4(fvec4 v1, fvec4 v2) {
fvec4 r = v1*v2; fvec4 r = v1*v2;
// vec_sld(r, r, 8) shifts the concatenation of r with itself by 8 bytes, i.e. rotates r
// by two floats, so temp[0] and temp[1] hold the two pair sums whose total is the dot product.
fvec4 temp = r + vec_sld(r.val, r.val, 8); fvec4 temp = r + vec_sld(r.val, r.val, 8);
return temp[0]+temp[1]; return temp[0]+temp[1];
} }
// Horizontal sum of v, computed as dot4(v, fvec4(1.0f)).
// NOTE(review): assumes fvec4(1.0f) sets every element to 1.0f — constructor not visible here.
static inline float reduceAdd(const fvec4 v) { static inline float reduceAdd(fvec4 v) {
return dot4(v, fvec4(1.0f)); return dot4(v, fvec4(1.0f));
} }
static inline fvec4 cross(const fvec4& v1, const fvec4& v2) { static inline fvec4 cross(fvec4 v1, fvec4 v2) {
vector unsigned char perm = (vector unsigned char) {8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15}; vector unsigned char perm = (vector unsigned char) {8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15};
__m128 temp = v2.val*vec_perm(v1.val, v1.val, perm) - __m128 temp = v2.val*vec_perm(v1.val, v1.val, perm) -
v1.val*vec_perm(v2.val, v2.val, perm); v1.val*vec_perm(v2.val, v2.val, perm);
...@@ -324,80 +324,80 @@ static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2, ...@@ -324,80 +324,80 @@ static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2,
/** /**
* Out-of-place transpose from named variables into an array. * Out-of-place transpose from named variables into an array.
*/ */
static inline void transpose(const fvec4 v0, const fvec4 v1, const fvec4 v2, const fvec4 v3, fvec4 out[4]) { static inline void transpose(fvec4 v0, fvec4 v1, fvec4 v2, fvec4 v3, fvec4 out[4]) {
// Copy the four inputs into out[], then hand out[0..3] to the four-argument
// transpose overload (defined outside this view), so the inputs are left untouched.
out[0] = v0; out[1] = v1; out[2] = v2; out[3] = v3; out[0] = v0; out[1] = v1; out[2] = v2; out[3] = v3;
transpose(out[0], out[1], out[2], out[3]); transpose(out[0], out[1], out[2], out[3]);
} }
// Functions that operate on ivec4s. // Functions that operate on ivec4s.
// Minimum of two ivec4s, computed with vec_min on the underlying values.
static inline ivec4 min(const ivec4& v1, const ivec4& v2) { static inline ivec4 min(ivec4 v1, ivec4 v2) {
return vec_min(v1.val, v2.val); return vec_min(v1.val, v2.val);
} }
// Maximum of two ivec4s, computed with vec_max on the underlying values.
static inline ivec4 max(const ivec4& v1, const ivec4& v2) { static inline ivec4 max(ivec4 v1, ivec4 v2) {
return vec_max(v1.val, v2.val); return vec_max(v1.val, v2.val);
} }
// Absolute value of an ivec4, computed with vec_abs on the underlying value.
static inline ivec4 abs(const ivec4& v) { static inline ivec4 abs(ivec4 v) {
return vec_abs(v.val); return vec_abs(v.val);
} }
// Returns true unless every element of v equals the corresponding element of ivec4(0),
// i.e. (assuming ivec4(0) is all zeros) true when any element of v is non-zero.
static inline bool any(const ivec4 v) { static inline bool any(ivec4 v) {
return !vec_all_eq(v.val, ivec4(0).val); return !vec_all_eq(v.val, ivec4(0).val);
} }
// Mathematical operators involving a scalar and a vector. // Mathematical operators involving a scalar and a vector.
// scalar + vector: builds an fvec4 from the float v1 and adds v2 with fvec4::operator+.
// NOTE(review): fvec4(float) presumably copies v1 into every element — constructor not visible here.
static inline fvec4 operator+(float v1, const fvec4& v2) { static inline fvec4 operator+(float v1, fvec4 v2) {
return fvec4(v1)+v2; return fvec4(v1)+v2;
} }
// scalar - vector: builds an fvec4 from the float v1 and subtracts v2 with fvec4::operator-.
// NOTE(review): fvec4(float) presumably copies v1 into every element — constructor not visible here.
static inline fvec4 operator-(float v1, const fvec4& v2) { static inline fvec4 operator-(float v1, fvec4 v2) {
return fvec4(v1)-v2; return fvec4(v1)-v2;
} }
// scalar * vector: builds an fvec4 from the float v1 and multiplies by v2 with fvec4::operator*.
// NOTE(review): fvec4(float) presumably copies v1 into every element — constructor not visible here.
static inline fvec4 operator*(float v1, const fvec4& v2) { static inline fvec4 operator*(float v1, fvec4 v2) {
return fvec4(v1)*v2; return fvec4(v1)*v2;
} }
// scalar / vector: builds an fvec4 from the float v1 and divides it by v2 with fvec4::operator/.
// NOTE(review): fvec4(float) presumably copies v1 into every element — constructor not visible here.
static inline fvec4 operator/(float v1, const fvec4& v2) { static inline fvec4 operator/(float v1, fvec4 v2) {
return fvec4(v1)/v2; return fvec4(v1)/v2;
} }
// Operations for blending fvec4s based on an ivec4. // Operations for blending fvec4s based on an ivec4.
// Bitwise select between v1 and v2: bits set in mask are taken from v2, bits clear
// in mask are taken from v1.
static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const __m128i& mask) { static inline fvec4 blend(fvec4 v1, fvec4 v2, __m128i mask) {
// 0xFFFFFFFF - mask equals the bitwise complement of mask for 32-bit values, so the
// two AND terms cover disjoint bits and the + acts as a bitwise OR.
return (__m128) ((mask&(__m128i)v2.val) + ((ivec4(0xFFFFFFFF)-ivec4(mask))&(__m128i)v1.val).val); return (__m128) ((mask&(__m128i)v2.val) + ((ivec4(0xFFFFFFFF)-ivec4(mask))&(__m128i)v1.val).val);
} }
// Calls blend(0.0f, v, mask): the result takes v where mask is set and the value
// built from 0.0f elsewhere.
static inline fvec4 blendZero(const fvec4 v, const ivec4 mask) { static inline fvec4 blendZero(fvec4 v, ivec4 mask) {
return blend(0.0f, v, mask); return blend(0.0f, v, mask);
} }
// These are at the end since they involve other functions defined above. // These are at the end since they involve other functions defined above.
// Minimum of two fvec4s, computed with vec_min on the underlying values.
static inline fvec4 min(const fvec4& v1, const fvec4& v2) { static inline fvec4 min(fvec4 v1, fvec4 v2) {
return vec_min(v1.val, v2.val); return vec_min(v1.val, v2.val);
} }
// Maximum of two fvec4s, computed with vec_max on the underlying values.
static inline fvec4 max(const fvec4& v1, const fvec4& v2) { static inline fvec4 max(fvec4 v1, fvec4 v2) {
return vec_max(v1.val, v2.val); return vec_max(v1.val, v2.val);
} }
// Rounds v to integer values with vec_round.
// NOTE(review): tie-breaking behaviour is whatever vec_round defines — confirm if it matters.
static inline fvec4 round(const fvec4& v) { static inline fvec4 round(fvec4 v) {
return vec_round(v.val); return vec_round(v.val);
} }
// Floor of v, computed with vec_floor on the underlying value.
static inline fvec4 floor(const fvec4& v) { static inline fvec4 floor(fvec4 v) {
return vec_floor(v.val); return vec_floor(v.val);
} }
// Ceiling of v, computed with vec_ceil on the underlying value.
static inline fvec4 ceil(const fvec4& v) { static inline fvec4 ceil(fvec4 v) {
return vec_ceil(v.val); return vec_ceil(v.val);
} }
static inline fvec4 rsqrt(const fvec4& v) { static inline fvec4 rsqrt(fvec4 v) {
// Initial estimate of rsqrt(). // Initial estimate of rsqrt().
fvec4 y(vec_rsqrte(v.val)); fvec4 y(vec_rsqrte(v.val));
...@@ -409,7 +409,7 @@ static inline fvec4 rsqrt(const fvec4& v) { ...@@ -409,7 +409,7 @@ static inline fvec4 rsqrt(const fvec4& v) {
return y; return y;
} }
// Square root of v, computed directly with vec_sqrt on the underlying value.
static inline fvec4 sqrt(const fvec4& v) { static inline fvec4 sqrt(fvec4 v) {
return vec_sqrt(v.val); return vec_sqrt(v.val);
} }
...@@ -417,7 +417,7 @@ static inline fvec4 sqrt(const fvec4& v) { ...@@ -417,7 +417,7 @@ static inline fvec4 sqrt(const fvec4& v) {
* of vectors. The first result vector contains the values at the given indexes, and the second * of vectors. The first result vector contains the values at the given indexes, and the second
* result vector contains the values from each respective index+1. * result vector contains the values from each respective index+1.
*/ */
static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& out0, fvec4& out1) { static inline void gatherVecPair(const float* table, ivec4 index, fvec4& out0, fvec4& out1) {
fvec4 t0(table + index[0]); fvec4 t0(table + index[0]);
fvec4 t1(table + index[1]); fvec4 t1(table + index[1]);
fvec4 t2(table + index[2]); fvec4 t2(table + index[2]);
...@@ -440,7 +440,7 @@ static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& o ...@@ -440,7 +440,7 @@ static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& o
* output[2] = (Z0 + Z1 + Z2 + Z3) * output[2] = (Z0 + Z1 + Z2 + Z3)
* output[3] = undefined * output[3] = undefined
*/ */
static inline fvec4 reduceToVec3(const fvec4 x, const fvec4 y, const fvec4 z) { static inline fvec4 reduceToVec3(fvec4 x, fvec4 y, fvec4 z) {
const auto nx = reduceAdd(x); const auto nx = reduceAdd(x);
const auto ny = reduceAdd(y); const auto ny = reduceAdd(y);
const auto nz = reduceAdd(z); const auto nz = reduceAdd(z);
......
...@@ -108,55 +108,55 @@ public: ...@@ -108,55 +108,55 @@ public:
#endif #endif
} }
// Sum of this vector and other, computed with _mm_add_ps (packed single-precision add).
fvec4 operator+(const fvec4& other) const { fvec4 operator+(fvec4 other) const {
return _mm_add_ps(val, other); return _mm_add_ps(val, other);
} }
// Difference of this vector and other, computed with _mm_sub_ps (packed single-precision subtract).
fvec4 operator-(const fvec4& other) const { fvec4 operator-(fvec4 other) const {
return _mm_sub_ps(val, other); return _mm_sub_ps(val, other);
} }
// Product of this vector and other, computed with _mm_mul_ps (packed single-precision multiply).
fvec4 operator*(const fvec4& other) const { fvec4 operator*(fvec4 other) const {
return _mm_mul_ps(val, other); return _mm_mul_ps(val, other);
} }
// Quotient of this vector and other, computed with _mm_div_ps (packed single-precision divide).
fvec4 operator/(const fvec4& other) const { fvec4 operator/(fvec4 other) const {
return _mm_div_ps(val, other); return _mm_div_ps(val, other);
} }
// In-place add: replaces val with _mm_add_ps(val, other). Returns nothing, so it cannot be chained.
void operator+=(const fvec4& other) { void operator+=(fvec4 other) {
val = _mm_add_ps(val, other); val = _mm_add_ps(val, other);
} }
// In-place subtract: replaces val with _mm_sub_ps(val, other). Returns nothing, so it cannot be chained.
void operator-=(const fvec4& other) { void operator-=(fvec4 other) {
val = _mm_sub_ps(val, other); val = _mm_sub_ps(val, other);
} }
// In-place multiply: replaces val with _mm_mul_ps(val, other). Returns nothing, so it cannot be chained.
void operator*=(const fvec4& other) { void operator*=(fvec4 other) {
val = _mm_mul_ps(val, other); val = _mm_mul_ps(val, other);
} }
// In-place divide: replaces val with _mm_div_ps(val, other). Returns nothing, so it cannot be chained.
void operator/=(const fvec4& other) { void operator/=(fvec4 other) {
val = _mm_div_ps(val, other); val = _mm_div_ps(val, other);
} }
// Unary minus implemented as 0.0f - val in every lane (not as a sign-bit flip).
// NOTE(review): under IEEE-754 arithmetic 0.0f - (+0.0f) is +0.0f, so a positive zero
// input is presumably not turned into -0.0f — confirm whether callers care.
fvec4 operator-() const { fvec4 operator-() const {
return _mm_sub_ps(_mm_set1_ps(0.0f), val); return _mm_sub_ps(_mm_set1_ps(0.0f), val);
} }
// Bitwise AND of the raw bits of this vector and other, computed with _mm_and_ps.
fvec4 operator&(const fvec4& other) const { fvec4 operator&(fvec4 other) const {
return _mm_and_ps(val, other); return _mm_and_ps(val, other);
} }
// Bitwise OR of the raw bits of this vector and other, computed with _mm_or_ps.
fvec4 operator|(const fvec4& other) const { fvec4 operator|(fvec4 other) const {
return _mm_or_ps(val, other); return _mm_or_ps(val, other);
} }
// Lane-wise equality via _mm_cmpeq_ps. The result is an fvec4 mask (all bits set in
// lanes where the comparison holds, zero elsewhere), not a bool.
fvec4 operator==(const fvec4& other) const { fvec4 operator==(fvec4 other) const {
return _mm_cmpeq_ps(val, other); return _mm_cmpeq_ps(val, other);
} }
// Lane-wise inequality via _mm_cmpneq_ps. The result is an fvec4 mask (all bits set in
// lanes where the comparison holds, zero elsewhere), not a bool.
fvec4 operator!=(const fvec4& other) const { fvec4 operator!=(fvec4 other) const {
return _mm_cmpneq_ps(val, other); return _mm_cmpneq_ps(val, other);
} }
// Lane-wise greater-than via _mm_cmpgt_ps. The result is an fvec4 mask (all bits set in
// lanes where the comparison holds, zero elsewhere), not a bool.
fvec4 operator>(const fvec4& other) const { fvec4 operator>(fvec4 other) const {
return _mm_cmpgt_ps(val, other); return _mm_cmpgt_ps(val, other);
} }
// Lane-wise less-than via _mm_cmplt_ps. The result is an fvec4 mask (all bits set in
// lanes where the comparison holds, zero elsewhere), not a bool.
fvec4 operator<(const fvec4& other) const { fvec4 operator<(fvec4 other) const {
return _mm_cmplt_ps(val, other); return _mm_cmplt_ps(val, other);
} }
// Lane-wise greater-or-equal via _mm_cmpge_ps. The result is an fvec4 mask (all bits set
// in lanes where the comparison holds, zero elsewhere), not a bool.
fvec4 operator>=(const fvec4& other) const { fvec4 operator>=(fvec4 other) const {
return _mm_cmpge_ps(val, other); return _mm_cmpge_ps(val, other);
} }
// Lane-wise less-or-equal via _mm_cmple_ps. The result is an fvec4 mask (all bits set
// in lanes where the comparison holds, zero elsewhere), not a bool.
fvec4 operator<=(const fvec4& other) const { fvec4 operator<=(fvec4 other) const {
return _mm_cmple_ps(val, other); return _mm_cmple_ps(val, other);
} }
operator ivec4() const; operator ivec4() const;
...@@ -191,49 +191,49 @@ public: ...@@ -191,49 +191,49 @@ public:
// Writes the four integers to memory at v using _mm_storeu_si128, the unaligned
// store, so v does not need 16-byte alignment.
void store(int* v) const { void store(int* v) const {
_mm_storeu_si128((__m128i*) v, val); _mm_storeu_si128((__m128i*) v, val);
} }
// Sum of this integer vector and other, computed with _mm_add_epi32 (packed 32-bit add).
ivec4 operator+(const ivec4& other) const { ivec4 operator+(ivec4 other) const {
return _mm_add_epi32(val, other); return _mm_add_epi32(val, other);
} }
// Difference of this integer vector and other, computed with _mm_sub_epi32 (packed 32-bit subtract).
ivec4 operator-(const ivec4& other) const { ivec4 operator-(ivec4 other) const {
return _mm_sub_epi32(val, other); return _mm_sub_epi32(val, other);
} }
// Product of this integer vector and other via _mm_mullo_epi32, which keeps the low
// 32 bits of each lane's product.
ivec4 operator*(const ivec4& other) const { ivec4 operator*(ivec4 other) const {
return _mm_mullo_epi32(val, other); return _mm_mullo_epi32(val, other);
} }
// In-place add: replaces val with _mm_add_epi32(val, other). Returns nothing, so it cannot be chained.
void operator+=(const ivec4& other) { void operator+=(ivec4 other) {
val = _mm_add_epi32(val, other); val = _mm_add_epi32(val, other);
} }
// In-place subtract: replaces val with _mm_sub_epi32(val, other). Returns nothing, so it cannot be chained.
void operator-=(const ivec4& other) { void operator-=(ivec4 other) {
val = _mm_sub_epi32(val, other); val = _mm_sub_epi32(val, other);
} }
// In-place multiply: replaces val with _mm_mullo_epi32(val, other) (low 32 bits of each
// product). Returns nothing, so it cannot be chained.
void operator*=(const ivec4& other) { void operator*=(ivec4 other) {
val = _mm_mullo_epi32(val, other); val = _mm_mullo_epi32(val, other);
} }
// Unary minus implemented as 0 - val in every lane using _mm_sub_epi32.
ivec4 operator-() const { ivec4 operator-() const {
return _mm_sub_epi32(_mm_set1_epi32(0), val); return _mm_sub_epi32(_mm_set1_epi32(0), val);
} }
// Bitwise AND of this integer vector and other, computed with _mm_and_si128.
ivec4 operator&(const ivec4& other) const { ivec4 operator&(ivec4 other) const {
return _mm_and_si128(val, other); return _mm_and_si128(val, other);
} }
// Bitwise OR of this integer vector and other, computed with _mm_or_si128.
ivec4 operator|(const ivec4& other) const { ivec4 operator|(ivec4 other) const {
return _mm_or_si128(val, other); return _mm_or_si128(val, other);
} }
// Lane-wise equality via _mm_cmpeq_epi32. The result is an ivec4 mask (all bits set in
// lanes that are equal, zero elsewhere), not a bool.
ivec4 operator==(const ivec4& other) const { ivec4 operator==(ivec4 other) const {
return _mm_cmpeq_epi32(val, other); return _mm_cmpeq_epi32(val, other);
} }
// Lane-wise inequality: takes the equality mask from operator== and inverts every bit
// by XOR-ing with an all-ones vector.
ivec4 operator!=(const ivec4& other) const { ivec4 operator!=(ivec4 other) const {
return _mm_xor_si128(*this==other, _mm_set1_epi32(0xFFFFFFFF)); return _mm_xor_si128(*this==other, _mm_set1_epi32(0xFFFFFFFF));
} }
// Lane-wise signed greater-than via _mm_cmpgt_epi32. The result is an ivec4 mask
// (all bits set where the comparison holds, zero elsewhere).
ivec4 operator>(const ivec4& other) const { ivec4 operator>(ivec4 other) const {
return _mm_cmpgt_epi32(val, other); return _mm_cmpgt_epi32(val, other);
} }
// Lane-wise signed less-than via _mm_cmplt_epi32. The result is an ivec4 mask
// (all bits set where the comparison holds, zero elsewhere).
ivec4 operator<(const ivec4& other) const { ivec4 operator<(ivec4 other) const {
return _mm_cmplt_epi32(val, other); return _mm_cmplt_epi32(val, other);
} }
// Lane-wise greater-or-equal, built as NOT(less-than): the _mm_cmplt_epi32 mask is
// inverted by XOR-ing with an all-ones vector.
ivec4 operator>=(const ivec4& other) const { ivec4 operator>=(ivec4 other) const {
return _mm_xor_si128(_mm_cmplt_epi32(val, other), _mm_set1_epi32(0xFFFFFFFF)); return _mm_xor_si128(_mm_cmplt_epi32(val, other), _mm_set1_epi32(0xFFFFFFFF));
} }
// Lane-wise less-or-equal, built as NOT(greater-than): the _mm_cmpgt_epi32 mask is
// inverted by XOR-ing with an all-ones vector.
ivec4 operator<=(const ivec4& other) const { ivec4 operator<=(ivec4 other) const {
return _mm_xor_si128(_mm_cmpgt_epi32(val, other), _mm_set1_epi32(0xFFFFFFFF)); return _mm_xor_si128(_mm_cmpgt_epi32(val, other), _mm_set1_epi32(0xFFFFFFFF));
} }
operator fvec4() const; operator fvec4() const;
...@@ -258,36 +258,36 @@ inline fvec4 fvec4::expandBitsToMask(int bitmask) { ...@@ -258,36 +258,36 @@ inline fvec4 fvec4::expandBitsToMask(int bitmask) {
// Functions that operate on fvec4s. // Functions that operate on fvec4s.
// Floor of v, computed with _mm_floor_ps on the underlying value.
static inline fvec4 floor(const fvec4& v) { static inline fvec4 floor(fvec4 v) {
return fvec4(_mm_floor_ps(v.val)); return fvec4(_mm_floor_ps(v.val));
} }
// Ceiling of v, computed with _mm_ceil_ps on the underlying value.
static inline fvec4 ceil(const fvec4& v) { static inline fvec4 ceil(fvec4 v) {
return fvec4(_mm_ceil_ps(v.val)); return fvec4(_mm_ceil_ps(v.val));
} }
// Rounds v to the nearest integer value using _mm_round_ps with the
// _MM_FROUND_TO_NEAREST_INT rounding control.
static inline fvec4 round(const fvec4& v) { static inline fvec4 round(fvec4 v) {
return fvec4(_mm_round_ps(v.val, _MM_FROUND_TO_NEAREST_INT)); return fvec4(_mm_round_ps(v.val, _MM_FROUND_TO_NEAREST_INT));
} }
// Lane-wise minimum of two fvec4s, computed with _mm_min_ps.
static inline fvec4 min(const fvec4& v1, const fvec4& v2) { static inline fvec4 min(fvec4 v1, fvec4 v2) {
return fvec4(_mm_min_ps(v1.val, v2.val)); return fvec4(_mm_min_ps(v1.val, v2.val));
} }
// Lane-wise maximum of two fvec4s, computed with _mm_max_ps.
static inline fvec4 max(const fvec4& v1, const fvec4& v2) { static inline fvec4 max(fvec4 v1, fvec4 v2) {
return fvec4(_mm_max_ps(v1.val, v2.val)); return fvec4(_mm_max_ps(v1.val, v2.val));
} }
// Absolute value of v: clears the sign bit of every lane by AND-ing with 0x7FFFFFFF.
static inline fvec4 abs(const fvec4& v) { static inline fvec4 abs(fvec4 v) {
// Function-local static: the mask is built once and reused on later calls.
static const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)); static const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
return fvec4(_mm_and_ps(v.val, mask)); return fvec4(_mm_and_ps(v.val, mask));
} }
// Square root of v, computed with _mm_sqrt_ps on the underlying value.
static inline fvec4 sqrt(const fvec4& v) { static inline fvec4 sqrt(fvec4 v) {
return fvec4(_mm_sqrt_ps(v.val)); return fvec4(_mm_sqrt_ps(v.val));
} }
static inline fvec4 rsqrt(const fvec4& v) { static inline fvec4 rsqrt(fvec4 v) {
// Initial estimate of rsqrt(). // Initial estimate of rsqrt().
fvec4 y(_mm_rsqrt_ps(v.val)); fvec4 y(_mm_rsqrt_ps(v.val));
...@@ -299,27 +299,27 @@ static inline fvec4 rsqrt(const fvec4& v) { ...@@ -299,27 +299,27 @@ static inline fvec4 rsqrt(const fvec4& v) {
return y; return y;
} }
// Exponential of v: forwards the underlying value to exp_ps and wraps the result.
// NOTE(review): exp_ps is defined outside this view; its accuracy and domain are not visible here.
static inline fvec4 exp(const fvec4& v) { static inline fvec4 exp(fvec4 v) {
return fvec4(exp_ps(v.val)); return fvec4(exp_ps(v.val));
} }
// Logarithm of v: forwards the underlying value to log_ps and wraps the result.
// NOTE(review): log_ps is defined outside this view; its accuracy and domain are not visible here.
static inline fvec4 log(const fvec4& v) { static inline fvec4 log(fvec4 v) {
return fvec4(log_ps(v.val)); return fvec4(log_ps(v.val));
} }
// Three-component dot product via _mm_dp_ps. In the 0x71 control byte the high nibble (0x7)
// selects lanes 0-2 for the multiply-and-sum and the low nibble (0x1) writes the sum to
// lane 0, which _mm_cvtss_f32 then extracts as a scalar.
static inline float dot3(const fvec4& v1, const fvec4& v2) { static inline float dot3(fvec4 v1, fvec4 v2) {
return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0x71)); return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0x71));
} }
// Four-component dot product via _mm_dp_ps. In the 0xF1 control byte the high nibble (0xF)
// selects all four lanes for the multiply-and-sum and the low nibble (0x1) writes the sum
// to lane 0, which _mm_cvtss_f32 then extracts as a scalar.
static inline float dot4(const fvec4& v1, const fvec4& v2) { static inline float dot4(fvec4 v1, fvec4 v2) {
return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0xF1)); return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0xF1));
} }
// Horizontal sum of v, computed as dot4(v, fvec4(1.0f)).
// NOTE(review): assumes fvec4(1.0f) sets every element to 1.0f — constructor not visible here.
static inline float reduceAdd(const fvec4 v) { static inline float reduceAdd(fvec4 v) {
return dot4(v, fvec4(1.0f)); return dot4(v, fvec4(1.0f));
} }
static inline fvec4 cross(const fvec4& v1, const fvec4& v2) { static inline fvec4 cross(fvec4 v1, fvec4 v2) {
fvec4 temp = fvec4(_mm_mul_ps(v1, _mm_shuffle_ps(v2, v2, _MM_SHUFFLE(3, 0, 2, 1)))) - fvec4 temp = fvec4(_mm_mul_ps(v1, _mm_shuffle_ps(v2, v2, _MM_SHUFFLE(3, 0, 2, 1)))) -
fvec4(_mm_mul_ps(v2, _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(3, 0, 2, 1)))); fvec4(_mm_mul_ps(v2, _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(3, 0, 2, 1))));
return _mm_shuffle_ps(temp, temp, _MM_SHUFFLE(3, 0, 2, 1)); return _mm_shuffle_ps(temp, temp, _MM_SHUFFLE(3, 0, 2, 1));
...@@ -340,53 +340,53 @@ static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2, ...@@ -340,53 +340,53 @@ static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2,
/** /**
* Out-of-place transpose from named variables into an array. * Out-of-place transpose from named variables into an array.
*/ */
static inline void transpose(const fvec4 v0, const fvec4 v1, const fvec4 v2, const fvec4 v3, fvec4 out[4]) { static inline void transpose(fvec4 v0, fvec4 v1, fvec4 v2, fvec4 v3, fvec4 out[4]) {
// Copy the four inputs into out[], then hand out[0..3] to the four-argument
// transpose overload (defined outside this view), so the inputs are left untouched.
out[0] = v0; out[1] = v1; out[2] = v2; out[3] = v3; out[0] = v0; out[1] = v1; out[2] = v2; out[3] = v3;
transpose(out[0], out[1], out[2], out[3]); transpose(out[0], out[1], out[2], out[3]);
} }
// Functions that operate on ivec4s. // Functions that operate on ivec4s.
// Lane-wise signed minimum of two ivec4s, computed with _mm_min_epi32.
static inline ivec4 min(const ivec4& v1, const ivec4& v2) { static inline ivec4 min(ivec4 v1, ivec4 v2) {
return ivec4(_mm_min_epi32(v1.val, v2.val)); return ivec4(_mm_min_epi32(v1.val, v2.val));
} }
// Lane-wise signed maximum of two ivec4s, computed with _mm_max_epi32.
static inline ivec4 max(const ivec4& v1, const ivec4& v2) { static inline ivec4 max(ivec4 v1, ivec4 v2) {
return ivec4(_mm_max_epi32(v1.val, v2.val)); return ivec4(_mm_max_epi32(v1.val, v2.val));
} }
// Lane-wise absolute value of an ivec4, computed with _mm_abs_epi32.
static inline ivec4 abs(const ivec4& v) { static inline ivec4 abs(ivec4 v) {
return ivec4(_mm_abs_epi32(v.val)); return ivec4(_mm_abs_epi32(v.val));
} }
static inline bool any(const ivec4& v) { static inline bool any(ivec4 v) {
return !_mm_test_all_zeros(v, _mm_set1_epi32(0xFFFFFFFF)); return !_mm_test_all_zeros(v, _mm_set1_epi32(0xFFFFFFFF));
} }
// Mathematical operators involving a scalar and a vector. // Mathematical operators involving a scalar and a vector.
static inline fvec4 operator+(float v1, const fvec4& v2) { static inline fvec4 operator+(float v1, fvec4 v2) {
return fvec4(v1)+v2; return fvec4(v1)+v2;
} }
static inline fvec4 operator-(float v1, const fvec4& v2) { static inline fvec4 operator-(float v1, fvec4 v2) {
return fvec4(v1)-v2; return fvec4(v1)-v2;
} }
static inline fvec4 operator*(float v1, const fvec4& v2) { static inline fvec4 operator*(float v1, fvec4 v2) {
return fvec4(v1)*v2; return fvec4(v1)*v2;
} }
static inline fvec4 operator/(float v1, const fvec4& v2) { static inline fvec4 operator/(float v1, fvec4 v2) {
return fvec4(v1)/v2; return fvec4(v1)/v2;
} }
// Operations for blending fvec4 // Operations for blending fvec4
static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const fvec4& mask) { static inline fvec4 blend(fvec4 v1, fvec4 v2, fvec4 mask) {
return fvec4(_mm_blendv_ps(v1.val, v2.val, mask.val)); return fvec4(_mm_blendv_ps(v1.val, v2.val, mask.val));
} }
static inline fvec4 blendZero(const fvec4 v, const fvec4 mask) { static inline fvec4 blendZero(fvec4 v, fvec4 mask) {
return blend(0.0f, v, mask); return blend(0.0f, v, mask);
} }
...@@ -394,7 +394,7 @@ static inline fvec4 blendZero(const fvec4 v, const fvec4 mask) { ...@@ -394,7 +394,7 @@ static inline fvec4 blendZero(const fvec4 v, const fvec4 mask) {
* of vectors. The first result vector contains the values at the given indexes, and the second * of vectors. The first result vector contains the values at the given indexes, and the second
* result vector contains the values from each respective index+1. * result vector contains the values from each respective index+1.
*/ */
static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& out0, fvec4& out1) { static inline void gatherVecPair(const float* table, ivec4 index, fvec4& out0, fvec4& out1) {
fvec4 t0(table + index[0]); fvec4 t0(table + index[0]);
fvec4 t1(table + index[1]); fvec4 t1(table + index[1]);
fvec4 t2(table + index[2]); fvec4 t2(table + index[2]);
...@@ -417,7 +417,7 @@ static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& o ...@@ -417,7 +417,7 @@ static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& o
* output[2] = (Z0 + Z1 + Z2 + Z3) * output[2] = (Z0 + Z1 + Z2 + Z3)
* output[3] = undefined * output[3] = undefined
*/ */
static inline fvec4 reduceToVec3(const fvec4 x, const fvec4 y, const fvec4 z) { static inline fvec4 reduceToVec3(fvec4 x, fvec4 y, fvec4 z) {
// :TODO: Could be made more efficient. // :TODO: Could be made more efficient.
const auto nx = reduceAdd(x); const auto nx = reduceAdd(x);
const auto ny = reduceAdd(y); const auto ny = reduceAdd(y);
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment