Merge pull request #2658 from mark-mb/master

Improvements for ARM64 vector implementation

Merge pull request #2658 from mark-mb/master
Improvements for ARM64 vector implementation
d36e3414 · peastman · GitHub · 6b39ec7e · 8e3f5bc6 · d36e3414
Unverified commit d36e3414, authored Jun 01, 2020 by peastman; committed by GitHub on Jun 01, 2020
5 changed files
--- a/openmmapi/include/openmm/internal/vectorize8.h
+++ b/openmmapi/include/openmm/internal/vectorize8.h
@@ -75,55 +75,55 @@ public:
    void store(float* v) const {
        _mm256_storeu_ps(v, val);
    }
-    fvec8 operator+(const fvec8& other) const {
+    fvec8 operator+(fvec8 other) const {
        return _mm256_add_ps(val, other);
    }
-    fvec8 operator-(const fvec8& other) const {
+    fvec8 operator-(fvec8 other) const {
        return _mm256_sub_ps(val, other);
    }
-    fvec8 operator*(const fvec8& other) const {
+    fvec8 operator*(fvec8 other) const {
        return _mm256_mul_ps(val, other);
    }
-    fvec8 operator/(const fvec8& other) const {
+    fvec8 operator/(fvec8 other) const {
        return _mm256_div_ps(val, other);
    }
-    void operator+=(const fvec8& other) {
+    void operator+=(fvec8 other) {
        val = _mm256_add_ps(val, other);
    }
-    void operator-=(const fvec8& other) {
+    void operator-=(fvec8 other) {
        val = _mm256_sub_ps(val, other);
    }
-    void operator*=(const fvec8& other) {
+    void operator*=(fvec8 other) {
        val = _mm256_mul_ps(val, other);
    }
-    void operator/=(const fvec8& other) {
+    void operator/=(fvec8 other) {
        val = _mm256_div_ps(val, other);
    }
    fvec8 operator-() const {
        return _mm256_sub_ps(_mm256_set1_ps(0.0f), val);
    }
-    fvec8 operator&(const fvec8& other) const {
+    fvec8 operator&(fvec8 other) const {
        return _mm256_and_ps(val, other);
    }
-    fvec8 operator|(const fvec8& other) const {
+    fvec8 operator|(fvec8 other) const {
        return _mm256_or_ps(val, other);
    }
-    fvec8 operator==(const fvec8& other) const {
+    fvec8 operator==(fvec8 other) const {
        return _mm256_cmp_ps(val, other, _CMP_EQ_OQ);
    }
-    fvec8 operator!=(const fvec8& other) const {
+    fvec8 operator!=(fvec8 other) const {
        return _mm256_cmp_ps(val, other, _CMP_NEQ_OQ);
    }
-    fvec8 operator>(const fvec8& other) const {
+    fvec8 operator>(fvec8 other) const {
        return _mm256_cmp_ps(val, other, _CMP_GT_OQ);
    }
-    fvec8 operator<(const fvec8& other) const {
+    fvec8 operator<(fvec8 other) const {
        return _mm256_cmp_ps(val, other, _CMP_LT_OQ);
    }
-    fvec8 operator>=(const fvec8& other) const {
+    fvec8 operator>=(fvec8 other) const {
        return _mm256_cmp_ps(val, other, _CMP_GE_OQ);
    }
-    fvec8 operator<=(const fvec8& other) const {
+    fvec8 operator<=(fvec8 other) const {
        return _mm256_cmp_ps(val, other, _CMP_LE_OQ);
    }
    operator ivec8() const;
@@ -159,10 +159,10 @@ public:
    void store(int* v) const {
        _mm256_storeu_si256((__m256i*) v, val);
    }
-    ivec8 operator&(const ivec8& other) const {
+    ivec8 operator&(ivec8 other) const {
        return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(val), _mm256_castsi256_ps(other.val)));
    }
-    ivec8 operator|(const ivec8& other) const {
+    ivec8 operator|(ivec8 other) const {
        return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(val), _mm256_castsi256_ps(other.val)));
    }
    operator fvec8() const;
@@ -193,36 +193,36 @@ inline fvec8 fvec8::expandBitsToMask(int bitmask) {
 // Functions that operate on fvec8s.
-static inline fvec8 floor(const fvec8& v) {
+static inline fvec8 floor(fvec8 v) {
    return fvec8(_mm256_round_ps(v.val, 0x09));
 }
-static inline fvec8 ceil(const fvec8& v) {
+static inline fvec8 ceil(fvec8 v) {
    return fvec8(_mm256_round_ps(v.val, 0x0A));
 }
-static inline fvec8 round(const fvec8& v) {
+static inline fvec8 round(fvec8 v) {
    return fvec8(_mm256_round_ps(v.val, _MM_FROUND_TO_NEAREST_INT));
 }
-static inline fvec8 min(const fvec8& v1, const fvec8& v2) {
+static inline fvec8 min(fvec8 v1, fvec8 v2) {
    return fvec8(_mm256_min_ps(v1.val, v2.val));
 }
-static inline fvec8 max(const fvec8& v1, const fvec8& v2) {
+static inline fvec8 max(fvec8 v1, fvec8 v2) {
    return fvec8(_mm256_max_ps(v1.val, v2.val));
 }
-static inline fvec8 abs(const fvec8& v) {
+static inline fvec8 abs(fvec8 v) {
    static const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    return fvec8(_mm256_and_ps(v.val, mask));
 }
-static inline fvec8 sqrt(const fvec8& v) {
+static inline fvec8 sqrt(fvec8 v) {
    return fvec8(_mm256_sqrt_ps(v.val));
 }
-static inline fvec8 rsqrt(const fvec8& v) {
+static inline fvec8 rsqrt(fvec8 v) {
    // Initial estimate of rsqrt().
    fvec8 y(_mm256_rsqrt_ps(v.val));
@@ -234,17 +234,17 @@ static inline fvec8 rsqrt(const fvec8& v) {
    return y;
 }
-static inline float dot8(const fvec8& v1, const fvec8& v2) {
+static inline float dot8(fvec8 v1, fvec8 v2) {
    fvec8 result = _mm256_dp_ps(v1, v2, 0xF1);
    return _mm_cvtss_f32(result.lowerVec())+_mm_cvtss_f32(result.upperVec());
 }
-static inline float reduceAdd(const fvec8 v) {
+static inline float reduceAdd(fvec8 v) {
    // :TODO: There are more efficient ways to do this.
    return dot8(v, fvec8(1.0f));
 }
-static inline void transpose(const fvec4& in1, const fvec4& in2, const fvec4& in3, const fvec4& in4, const fvec4& in5, const fvec4& in6, const fvec4& in7, const fvec4& in8, fvec8& out1, fvec8& out2, fvec8& out3, fvec8& out4) {
+static inline void transpose(fvec4 in1, fvec4 in2, fvec4 in3, fvec4 in4, fvec4 in5, fvec4 in6, fvec4 in7, fvec4 in8, fvec8& out1, fvec8& out2, fvec8& out3, fvec8& out4) {
    fvec4 i1 = in1, i2 = in2, i3 = in3, i4 = in4;
    fvec4 i5 = in5, i6 = in6, i7 = in7, i8 = in8;
    _MM_TRANSPOSE4_PS(i1, i2, i3, i4);
@@ -275,7 +275,7 @@ static inline void transpose(const fvec4 in[8], fvec8& out1, fvec8& out2, fvec8&
    transpose(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], out1, out2, out3, out4);
 }
-static inline void transpose(const fvec8& in1, const fvec8& in2, const fvec8& in3, const fvec8& in4, fvec4& out1, fvec4& out2, fvec4& out3, fvec4& out4, fvec4& out5, fvec4& out6, fvec4& out7, fvec4& out8) {
+static inline void transpose(fvec8 in1, fvec8 in2, fvec8 in3, fvec8 in4, fvec4& out1, fvec4& out2, fvec4& out3, fvec4& out4, fvec4& out5, fvec4& out6, fvec4& out7, fvec4& out8) {
    out1 = in1.lowerVec();
    out2 = in2.lowerVec();
    out3 = in3.lowerVec();
@@ -291,40 +291,40 @@ static inline void transpose(const fvec8& in1, const fvec8& in2, const fvec8& in
 /**
 * Given 4 input vectors of 8 elements, transpose them to form 8 output vectors of 4 elements.
 */
-static inline void transpose(const fvec8& in1, const fvec8& in2, const fvec8& in3, const fvec8& in4, fvec4 out[8]) {
+static inline void transpose(fvec8 in1, fvec8 in2, fvec8 in3, fvec8 in4, fvec4 out[8]) {
    transpose(in1, in2, in3, in4, out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7]);
 }
 // Functions that operate on ivec8s.
-static inline bool any(const ivec8& v) {
+static inline bool any(ivec8 v) {
    return !_mm256_testz_si256(v, _mm256_set1_epi32(0xFFFFFFFF));
 }
 // Mathematical operators involving a scalar and a vector.
-static inline fvec8 operator+(float v1, const fvec8& v2) {
+static inline fvec8 operator+(float v1, fvec8 v2) {
    return fvec8(v1)+v2;
 }
-static inline fvec8 operator-(float v1, const fvec8& v2) {
+static inline fvec8 operator-(float v1, fvec8 v2) {
    return fvec8(v1)-v2;
 }
-static inline fvec8 operator*(float v1, const fvec8& v2) {
+static inline fvec8 operator*(float v1, fvec8 v2) {
    return fvec8(v1)*v2;
 }
-static inline fvec8 operator/(float v1, const fvec8& v2) {
+static inline fvec8 operator/(float v1, fvec8 v2) {
    return fvec8(v1)/v2;
 }
 // Operation for blending fvec8 from a full bitmask.
-static inline fvec8 blend(const fvec8& v1, const fvec8& v2, const fvec8& mask) {
+static inline fvec8 blend(fvec8 v1, fvec8 v2, fvec8 mask) {
    return fvec8(_mm256_blendv_ps(v1.val, v2.val, mask.val));
 }
-static inline fvec8 blendZero(const fvec8 v, const fvec8 mask) {
+static inline fvec8 blendZero(fvec8 v, fvec8 mask) {
    return blend(0.0f, v, mask);
 }
@@ -333,7 +333,7 @@ static inline fvec8 blendZero(const fvec8 v, const fvec8 mask) {
 * of vectors. The first result vector contains the values at the given indexes, and the second
 * result vector contains the values from each respective index+1.
 */
-static inline void gatherVecPair(const float* table, const ivec8 index, fvec8& out0, fvec8& out1) {
+static inline void gatherVecPair(const float* table, ivec8 index, fvec8& out0, fvec8& out1) {
    const auto lower = index.lowerVec();
    const auto upper = index.upperVec();
@@ -368,7 +368,7 @@ static inline void gatherVecPair(const float* table, const ivec8 index, fvec8& o
 *   output[2] = (Z0 + Z1 + Z2 + ...)
 *   output[3] = undefined
 */
-static inline fvec4 reduceToVec3(const fvec8 x, const fvec8 y, const fvec8 z) {
+static inline fvec4 reduceToVec3(fvec8 x, fvec8 y, fvec8 z) {
    // The general strategy for a vector reduce-add operation is to take values from
    // different parts of the vector and overlap them a different part of the vector and then
    // add together. Repeat this several times until all values have been summed. Initially 8

--- a/openmmapi/include/openmm/internal/vectorize_neon.h
+++ b/openmmapi/include/openmm/internal/vectorize_neon.h
@@ -76,10 +76,7 @@ public:
    fvec4() = default;
    fvec4(float v) : val(vdupq_n_f32(v)) {}
-    fvec4(float v1, float v2, float v3, float v4) {
+    fvec4(float v1, float v2, float v3, float v4) : val {v1, v2, v3, v4} {}
-        float v[] = {v1, v2, v3, v4};
-        val = vld1q_f32(v);
-    }
    fvec4(float32x4_t v) : val(v) {}
    fvec4(const float* v) : val(vld1q_f32(v)) {}
    operator float32x4_t() const {
@@ -121,16 +118,16 @@ public:
        v[2] = vgetq_lane_f32(val, 2);
    }
-    fvec4 operator+(const fvec4& other) const {
+    fvec4 operator+(fvec4 other) const {
        return vaddq_f32(val, other);
    }
-    fvec4 operator-(const fvec4& other) const {
+    fvec4 operator-(fvec4 other) const {
        return vsubq_f32(val, other);
    }
-    fvec4 operator*(const fvec4& other) const {
+    fvec4 operator*(fvec4 other) const {
        return vmulq_f32(val, other);
    }
-    fvec4 operator/(const fvec4& other) const {
+    fvec4 operator/(fvec4 other) const {
        // NEON does not have a divide float-point operator, so we get the reciprocal and multiply.
        float32x4_t reciprocal = vrecpeq_f32(other);
@@ -139,45 +136,34 @@ public:
        fvec4 result = vmulq_f32(val,reciprocal);
        return result;
    }
-    void operator+=(const fvec4& other) {
+    void operator+=(fvec4 other) {
        val = vaddq_f32(val, other);
    }
-    void operator-=(const fvec4& other) {
+    void operator-=(fvec4 other) {
        val = vsubq_f32(val, other);
    }
-    void operator*=(const fvec4& other) {
+    void operator*=(fvec4 other) {
        val = vmulq_f32(val, other);
    }
-    void operator/=(const fvec4& other) {
+    void operator/=(fvec4 other) {
        val = *this/other;
    }
    fvec4 operator-() const {
        return vnegq_f32(val);
    }
-    fvec4 operator&(const fvec4& other) const {
+    fvec4 operator&(fvec4 other) const {
        return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(val), vreinterpretq_u32_f32(other)));
    }
-    fvec4 operator|(const fvec4& other) const {
+    fvec4 operator|(fvec4 other) const {
        return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(val), vreinterpretq_u32_f32(other)));
    }
-    fvec4 operator==(const fvec4& other) const {
-        return vcvtq_f32_s32(vreinterpretq_s32_u32(vceqq_f32(val, other)));
+    ivec4 operator==(fvec4 other) const;
-    }
+    ivec4 operator!=(fvec4 other) const;
-    fvec4 operator!=(const fvec4& other) const {
+    ivec4 operator>(fvec4 other) const;
-        return vcvtq_f32_s32(vreinterpretq_s32_u32(vmvnq_u32(vceqq_f32(val, other)))); // not(equals(val, other))
+    ivec4 operator<(fvec4 other) const;
-    }
+    ivec4 operator>=(fvec4 other) const;
-    fvec4 operator>(const fvec4& other) const {
+    ivec4 operator<=(fvec4 other) const;
-        return vcvtq_f32_s32(vreinterpretq_s32_u32(vcgtq_f32(val, other)));
-    }
-    fvec4 operator<(const fvec4& other) const {
-        return vcvtq_f32_s32(vreinterpretq_s32_u32(vcltq_f32(val, other)));
-    }
-    fvec4 operator>=(const fvec4& other) const {
-        return vcvtq_f32_s32(vreinterpretq_s32_u32(vcgeq_f32(val, other)));
-    }
-    fvec4 operator<=(const fvec4& other) const {
-        return vcvtq_f32_s32(vreinterpretq_s32_u32(vcleq_f32(val, other)));
-    }
    operator ivec4() const;
    /**
@@ -198,10 +184,7 @@ public:
    ivec4() {}
    ivec4(int v) : val(vdupq_n_s32(v)) {}
-    ivec4(int v1, int v2, int v3, int v4) {
+    ivec4(int v1, int v2, int v3, int v4) : val {v1, v2, v3, v4} {}
-        int v[] = {v1, v2, v3, v4};
-        val = vld1q_s32(v);
-    }
    ivec4(int32x4_t v) : val(v) {}
    ivec4(const int* v) : val(vld1q_s32(v)) {}
    operator int32x4_t() const {
@@ -223,49 +206,49 @@ public:
    void store(int* v) const {
        vst1q_s32(v, val);
    }
-    ivec4 operator+(const ivec4& other) const {
+    ivec4 operator+(ivec4 other) const {
        return vaddq_s32(val, other);
    }
-    ivec4 operator-(const ivec4& other) const {
+    ivec4 operator-(ivec4 other) const {
        return vsubq_s32(val, other);
    }
-    ivec4 operator*(const ivec4& other) const {
+    ivec4 operator*(ivec4 other) const {
        return vmulq_s32(val, other);
    }
-    void operator+=(const ivec4& other) {
+    void operator+=(ivec4 other) {
        val = vaddq_s32(val, other);
    }
-    void operator-=(const ivec4& other) {
+    void operator-=(ivec4 other) {
        val = vsubq_s32(val, other);
    }
-    void operator*=(const ivec4& other) {
+    void operator*=(ivec4 other) {
        val = vmulq_s32(val, other);
    }
    ivec4 operator-() const {
        return vnegq_s32(val);
    }
-    ivec4 operator&(const ivec4& other) const {
+    ivec4 operator&(ivec4 other) const {
        return vandq_s32(val, other);
    }
-    ivec4 operator|(const ivec4& other) const {
+    ivec4 operator|(ivec4 other) const {
        return vorrq_s32(val, other);
    }
-    ivec4 operator==(const ivec4& other) const {
+    ivec4 operator==(ivec4 other) const {
        return vreinterpretq_s32_u32(vceqq_s32(val, other));
    }
-    ivec4 operator!=(const ivec4& other) const {
+    ivec4 operator!=(ivec4 other) const {
        return vreinterpretq_s32_u32(vmvnq_u32(vceqq_s32(val, other))); // not(equal(val, other))
    }
-    ivec4 operator>(const ivec4& other) const {
+    ivec4 operator>(ivec4 other) const {
        return vreinterpretq_s32_u32(vcgtq_s32(val, other));
    }
-    ivec4 operator<(const ivec4& other) const {
+    ivec4 operator<(ivec4 other) const {
        return vreinterpretq_s32_u32(vcltq_s32(val, other));
    }
-    ivec4 operator>=(const ivec4& other) const {
+    ivec4 operator>=(ivec4 other) const {
        return vreinterpretq_s32_u32(vcgeq_s32(val, other));
    }
-    ivec4 operator<=(const ivec4& other) const {
+    ivec4 operator<=(ivec4 other) const {
        return vreinterpretq_s32_u32(vcleq_s32(val, other));
    }
    operator fvec4() const;
@@ -287,54 +270,84 @@ inline ivec4 fvec4::expandBitsToMask(int bitmask) {
                 bitmask & 4 ? -1 : 0,
                 bitmask & 8 ? -1 : 0);
 }
+// Comparison operators
+inline ivec4 fvec4::operator==(fvec4 other) const {
+    return vreinterpretq_s32_u32(vceqq_f32(val, other));
+}
+inline ivec4 fvec4::operator!=(fvec4 other) const {
+    return vreinterpretq_s32_u32(vmvnq_u32(vceqq_f32(val, other))); // not(equals(val, other))
+}
+inline ivec4 fvec4::operator>(fvec4 other) const {
+    return vreinterpretq_s32_u32(vcgtq_f32(val, other));
+}
+inline ivec4 fvec4::operator<(fvec4 other) const {
+    return vreinterpretq_s32_u32(vcltq_f32(val, other));
+}
+inline ivec4 fvec4::operator>=(fvec4 other) const {
+    return vreinterpretq_s32_u32(vcgeq_f32(val, other));
+}
+inline ivec4 fvec4::operator<=(fvec4 other) const {
+    return vreinterpretq_s32_u32(vcleq_f32(val, other));
+}
 // Functions that operate on fvec4s.
-static inline fvec4 min(const fvec4& v1, const fvec4& v2) {
+static inline fvec4 min(fvec4 v1, fvec4 v2) {
    return vminq_f32(v1, v2);
 }
-static inline fvec4 max(const fvec4& v1, const fvec4& v2) {
+static inline fvec4 max(fvec4 v1, fvec4 v2) {
    return vmaxq_f32(v1, v2);
 }
-static inline fvec4 abs(const fvec4& v) {
+static inline fvec4 abs(fvec4 v) {
    return vabsq_f32(v);
 }
-static inline fvec4 rsqrt(const fvec4& v) {
+static inline fvec4 rsqrt(fvec4 v) {
    float32x4_t recipSqrt = vrsqrteq_f32(v);
    recipSqrt = vmulq_f32(recipSqrt, vrsqrtsq_f32(vmulq_f32(recipSqrt, v), recipSqrt));
    recipSqrt = vmulq_f32(recipSqrt, vrsqrtsq_f32(vmulq_f32(recipSqrt, v), recipSqrt));
    return recipSqrt;
 }
-static inline fvec4 sqrt(const fvec4& v) {
+static inline fvec4 sqrt(fvec4 v) {
    return rsqrt(v)*v;
 }
-static inline fvec4 exp(const fvec4& v) {
+static inline fvec4 exp(fvec4 v) {
    return fvec4(exp_ps(v.val));
 }
-static inline fvec4 log(const fvec4& v) {
+static inline fvec4 log(fvec4 v) {
    return fvec4(log_ps(v.val));
 }
-static inline float dot3(const fvec4& v1, const fvec4& v2) {
+static inline float dot3(fvec4 v1, fvec4 v2) {
    fvec4 result = v1*v2;
    return vgetq_lane_f32(result, 0) + vgetq_lane_f32(result, 1) + vgetq_lane_f32(result, 2);
 }
-static inline float dot4(const fvec4& v1, const fvec4& v2) {
+static inline float dot4(fvec4 v1, fvec4 v2) {
    fvec4 result = v1*v2;
    return vgetq_lane_f32(result, 0) + vgetq_lane_f32(result, 1) + vgetq_lane_f32(result, 2) + vgetq_lane_f32(result,3);
 }
-static inline float reduceAdd(const fvec4 v) {
+static inline float reduceAdd(fvec4 v) {
+#ifdef __ARM64__
+    return vaddvq_f32(v);
+#else
    return dot4(v, fvec4(1.0f));
+#endif
 }
-static inline fvec4 cross(const fvec4& v1, const fvec4& v2) {
+static inline fvec4 cross(fvec4 v1, fvec4 v2) {
    return fvec4(v1[1]*v2[2] - v1[2]*v2[1],
                 v1[2]*v2[0] - v1[0]*v2[2],
                 v1[0]*v2[1] - v1[1]*v2[0], 0);
@@ -362,71 +375,79 @@ static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2,
 /**
 * Out-of-place transpose from named variables into an array.
 */
-static inline void transpose(const fvec4 v0, const fvec4 v1, const fvec4 v2, const fvec4 v3, fvec4 out[4]) {
+static inline void transpose(fvec4 v0, fvec4 v1, fvec4 v2, fvec4 v3, fvec4 out[4]) {
    out[0] = v0; out[1] = v1; out[2] = v2; out[3] = v3;
    transpose(out[0], out[1], out[2], out[3]);
 }
 // Functions that operate on ivec4s.
-static inline ivec4 min(const ivec4& v1, const ivec4& v2) {
+static inline ivec4 min(ivec4 v1, ivec4 v2) {
    return vminq_s32(v1, v2);
 }
-static inline ivec4 max(const ivec4& v1, const ivec4& v2) {
+static inline ivec4 max(ivec4 v1, ivec4 v2) {
    return vmaxq_s32(v1, v2);
 }
-static inline ivec4 abs(const ivec4& v) {
+static inline ivec4 abs(ivec4 v) {
    return vabdq_s32(v, ivec4(0));
 }
-static inline bool any(const ivec4& v) {
+static inline bool any(ivec4 v) {
+#ifdef __ARM64__
+    return (vmaxvq_u32(vreinterpretq_u32_s32(v)) != 0);
+#else
    return (vgetq_lane_s32(v, 0) != 0 || vgetq_lane_s32(v, 1) != 0 || vgetq_lane_s32(v, 2) != 0 || vgetq_lane_s32(v, 3) != 0);
+#endif
 }
 // Mathematical operators involving a scalar and a vector.
-static inline fvec4 operator+(float v1, const fvec4& v2) {
+static inline fvec4 operator+(float v1, fvec4 v2) {
    return fvec4(v1)+v2;
 }
-static inline fvec4 operator-(float v1, const fvec4& v2) {
+static inline fvec4 operator-(float v1, fvec4 v2) {
    return fvec4(v1)-v2;
 }
-static inline fvec4 operator*(float v1, const fvec4& v2) {
+static inline fvec4 operator*(float v1, fvec4 v2) {
    return fvec4(v1)*v2;
 }
-static inline fvec4 operator/(float v1, const fvec4& v2) {
+static inline fvec4 operator/(float v1, fvec4 v2) {
    return fvec4(v1)/v2;
 }
 // Operations for blending fvec4s based on an ivec4.
-static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const ivec4& mask) {
+static inline fvec4 blend(fvec4 v1, fvec4 v2, ivec4 mask) {
    return vbslq_f32(vreinterpretq_u32_s32(mask), v2, v1);
 }
-static inline fvec4 blendZero(const fvec4 v, const ivec4 mask) {
+static inline fvec4 blendZero(fvec4 v, ivec4 mask) {
-    return blend(0.0f, v, mask);
+    return vreinterpretq_f32_s32(vandq_s32(vreinterpretq_s32_f32(v), mask));
+}
+static inline ivec4 blendZero(ivec4 v, ivec4 mask) {
+    return v & mask;
 }
 // These are at the end since they involve other functions defined above.
-static inline fvec4 round(const fvec4& v) {
+static inline fvec4 round(fvec4 v) {
    fvec4 shift(0x1.0p23f);
    fvec4 absResult = (abs(v)+shift)-shift;
    return blend(v, absResult, ivec4(0x7FFFFFFF));
 }
-static inline fvec4 floor(const fvec4& v) {
+static inline fvec4 floor(fvec4 v) {
    fvec4 rounded = round(v);
    return rounded + blend(0.0f, -1.0f, rounded>v);
 }
-static inline fvec4 ceil(const fvec4& v) {
+static inline fvec4 ceil(fvec4 v) {
    fvec4 rounded = round(v);
    return rounded + blend(0.0f, 1.0f, rounded<v);
 }
@@ -435,7 +456,7 @@ static inline fvec4 ceil(const fvec4& v) {
 * of vectors. The first result vector contains the values at the given indexes, and the second
 * result vector contains the values from each respective index+1.
 */
-static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& out0, fvec4& out1) {
+static inline void gatherVecPair(const float* table, ivec4 index, fvec4& out0, fvec4& out1) {
    fvec4 t0(table + index[0]);
    fvec4 t1(table + index[1]);
    fvec4 t2(table + index[2]);
@@ -458,7 +479,7 @@ static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& o
 *   output[2] = (Z0 + Z1 + Z2 + Z3)
 *   output[3] = undefined
 */
-static inline fvec4 reduceToVec3(const fvec4 x, const fvec4 y, const fvec4 z) {
+static inline fvec4 reduceToVec3(fvec4 x, fvec4 y, fvec4 z) {
    const auto nx = reduceAdd(x);
    const auto ny = reduceAdd(y);
    const auto nz = reduceAdd(z);

--- a/openmmapi/include/openmm/internal/vectorize_pnacl.h
+++ b/openmmapi/include/openmm/internal/vectorize_pnacl.h
@@ -95,45 +95,45 @@ public:
        v[1] = val[1];
        v[2] = val[2];
    }
-    fvec4 operator+(const fvec4& other) const {
+    fvec4 operator+(fvec4 other) const {
        return val+other;
    }
-    fvec4 operator-(const fvec4& other) const {
+    fvec4 operator-(fvec4 other) const {
        return val-other;
    }
-    fvec4 operator*(const fvec4& other) const {
+    fvec4 operator*(fvec4 other) const {
        return val*other;
    }
-    fvec4 operator/(const fvec4& other) const {
+    fvec4 operator/(fvec4 other) const {
        return val/other;
    }
-    void operator+=(const fvec4& other) {
+    void operator+=(fvec4 other) {
        val = val+other;
    }
-    void operator-=(const fvec4& other) {
+    void operator-=(fvec4 other) {
        val = val-other;
    }
-    void operator*=(const fvec4& other) {
+    void operator*=(fvec4 other) {
        val = val*other;
    }
-    void operator/=(const fvec4& other) {
+    void operator/=(fvec4 other) {
        val = val/other;
    }
    fvec4 operator-() const {
        return -val;
    }
-    fvec4 operator&(const fvec4& other) const {
+    fvec4 operator&(fvec4 other) const {
        return (fvec4) (((__m128i)val)&((__m128i)other.val));
    }
-    fvec4 operator|(const fvec4& other) const {
+    fvec4 operator|(fvec4 other) const {
        return (fvec4) (((__m128i)val)|((__m128i)other.val));
    }
-    ivec4 operator==(const fvec4& other) const;
+    ivec4 operator==(fvec4 other) const;
-    ivec4 operator!=(const fvec4& other) const;
+    ivec4 operator!=(fvec4 other) const;
-    ivec4 operator>(const fvec4& other) const;
+    ivec4 operator>(fvec4 other) const;
-    ivec4 operator<(const fvec4& other) const;
+    ivec4 operator<(fvec4 other) const;
-    ivec4 operator>=(const fvec4& other) const;
+    ivec4 operator>=(fvec4 other) const;
-    ivec4 operator<=(const fvec4& other) const;
+    ivec4 operator<=(fvec4 other) const;
    operator ivec4() const;
    /**
@@ -171,49 +171,49 @@ public:
    void store(int* v) const {
        *((__m128*) v) = val;
    }
-    ivec4 operator+(const ivec4& other) const {
+    ivec4 operator+(ivec4 other) const {
        return val+other;
    }
-    ivec4 operator-(const ivec4& other) const {
+    ivec4 operator-(ivec4 other) const {
        return val-other;
    }
-    ivec4 operator*(const ivec4& other) const {
+    ivec4 operator*(ivec4 other) const {
        return val*other;
    }
-    void operator+=(const ivec4& other) {
+    void operator+=(ivec4 other) {
        val = val+other;
    }
-    void operator-=(const ivec4& other) {
+    void operator-=(ivec4 other) {
        val = val-other;
    }
-    void operator*=(const ivec4& other) {
+    void operator*=(ivec4 other) {
        val = val*other;
    }
    ivec4 operator-() const {
        return -val;
    }
-    ivec4 operator&(const ivec4& other) const {
+    ivec4 operator&(ivec4 other) const {
        return val&other.val;
    }
-    ivec4 operator|(const ivec4& other) const {
+    ivec4 operator|(ivec4 other) const {
        return val|other.val;
    }
-    ivec4 operator==(const ivec4& other) const {
+    ivec4 operator==(ivec4 other) const {
        return (val==other.val);
    }
-    ivec4 operator!=(const ivec4& other) const {
+    ivec4 operator!=(ivec4 other) const {
        return (val!=other.val);
    }
-    ivec4 operator>(const ivec4& other) const {
+    ivec4 operator>(ivec4 other) const {
        return (val>other.val);
    }
-    ivec4 operator<(const ivec4& other) const {
+    ivec4 operator<(ivec4 other) const {
        return (val<other.val);
    }
-    ivec4 operator>=(const ivec4& other) const {
+    ivec4 operator>=(ivec4 other) const {
        return (val>=other.val);
    }
-    ivec4 operator<=(const ivec4& other) const {
+    ivec4 operator<=(ivec4 other) const {
        return (val<=other.val);
    }
    operator fvec4() const;
@@ -221,27 +221,27 @@ public:
 // Conversion operators.
-inline ivec4 fvec4::operator==(const fvec4& other) const {
+inline ivec4 fvec4::operator==(fvec4 other) const {
    return (__m128i) (val==other.val);
 }
-inline ivec4 fvec4::operator!=(const fvec4& other) const {
+inline ivec4 fvec4::operator!=(fvec4 other) const {
    return (__m128i) (val!=other.val);
 }
-inline ivec4 fvec4::operator>(const fvec4& other) const {
+inline ivec4 fvec4::operator>(fvec4 other) const {
    return (__m128i) (val>other.val);
 }
-inline ivec4 fvec4::operator<(const fvec4& other) const {
+inline ivec4 fvec4::operator<(fvec4 other) const {
    return (__m128i) (val<other.val);
 }
-inline ivec4 fvec4::operator>=(const fvec4& other) const {
+inline ivec4 fvec4::operator>=(fvec4 other) const {
    return (__m128i) (val>=other.val);
 }
-inline ivec4 fvec4::operator<=(const fvec4& other) const {
+inline ivec4 fvec4::operator<=(fvec4 other) const {
    return (__m128i) (val<=other.val);
 }
@@ -262,34 +262,34 @@ inline ivec4 fvec4::expandBitsToMask(int bitmask) {
 // Functions that operate on fvec4s.
-static inline fvec4 abs(const fvec4& v) {
+static inline fvec4 abs(fvec4 v) {
    return v&(__m128) ivec4(0x7FFFFFFF);
 }
-static inline fvec4 exp(const fvec4& v) {
+static inline fvec4 exp(fvec4 v) {
    return fvec4(expf(v[0]), expf(v[1]), expf(v[2]), expf(v[3]));
 }
-static inline fvec4 log(const fvec4& v) {
+static inline fvec4 log(fvec4 v) {
    return fvec4(logf(v[0]), logf(v[1]), logf(v[2]), logf(v[3]));
 }
-static inline float dot3(const fvec4& v1, const fvec4& v2) {
+static inline float dot3(fvec4 v1, fvec4 v2) {
    fvec4 r = v1*v2;
    return r[0]+r[1]+r[2];
 }
-static inline float dot4(const fvec4& v1, const fvec4& v2) {
+static inline float dot4(fvec4 v1, fvec4 v2) {
    fvec4 r = v1*v2;
    fvec4 temp = __builtin_shufflevector(r.val, r.val, 0, 1, -1, -1)+__builtin_shufflevector(r.val, r.val, 2, 3, -1, -1);
    return temp[0]+temp[1];
 }
-static inline float reduceAdd(const fvec4 v) {
+static inline float reduceAdd(fvec4 v) {
    return dot4(v, fvec4(1.0f));
 }
-static inline fvec4 cross(const fvec4& v1, const fvec4& v2) {
+static inline fvec4 cross(fvec4 v1, fvec4 v2) {
    __m128 temp = v2.val*__builtin_shufflevector(v1.val, v1.val, 2, 0, 1, 3) -
                  v1.val*__builtin_shufflevector(v2.val, v2.val, 2, 0, 1, 3);
    return __builtin_shufflevector(temp, temp, 2, 0, 1, 3);
@@ -317,85 +317,89 @@ static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2,
 /**
 * Out-of-place transpose from named variables into an array.
 */
-static inline void transpose(const fvec4 v0, const fvec4 v1, const fvec4 v2, const fvec4 v3, fvec4 out[4]) {
+static inline void transpose(fvec4 v0, fvec4 v1, fvec4 v2, fvec4 v3, fvec4 out[4]) {
    out[0] = v0; out[1] = v1; out[2] = v2; out[3] = v3;
    transpose(out[0], out[1], out[2], out[3]);
 }
 // Functions that operate on ivec4s.
-static inline ivec4 min(const ivec4& v1, const ivec4& v2) {
+static inline ivec4 min(ivec4 v1, ivec4 v2) {
    return ivec4(std::min(v1[0], v2[0]), std::min(v1[1], v2[1]), std::min(v1[2], v2[2]), std::min(v1[3], v2[3]));
 }
-static inline ivec4 max(const ivec4& v1, const ivec4& v2) {
+static inline ivec4 max(ivec4 v1, ivec4 v2) {
    return ivec4(std::max(v1[0], v2[0]), std::max(v1[1], v2[1]), std::max(v1[2], v2[2]), std::max(v1[3], v2[3]));
 }
-static inline ivec4 abs(const ivec4& v) {
+static inline ivec4 abs(ivec4 v) {
    return ivec4(abs(v[0]), abs(v[1]), abs(v[2]), abs(v[3]));
 }
-static inline bool any(const __m128i& v) {
+static inline bool any(__m128i v) {
    ivec4 temp = __builtin_shufflevector(v, v, 0, 1, -1, -1) | __builtin_shufflevector(v, v, 2, 3, -1, -1);
    return (temp[0] || temp[1]);
 }
 // Mathematical operators involving a scalar and a vector.
-static inline fvec4 operator+(float v1, const fvec4& v2) {
+static inline fvec4 operator+(float v1, fvec4 v2) {
    return fvec4(v1)+v2;
 }
-static inline fvec4 operator-(float v1, const fvec4& v2) {
+static inline fvec4 operator-(float v1, fvec4 v2) {
    return fvec4(v1)-v2;
 }
-static inline fvec4 operator*(float v1, const fvec4& v2) {
+static inline fvec4 operator*(float v1, fvec4 v2) {
    return fvec4(v1)*v2;
 }
-static inline fvec4 operator/(float v1, const fvec4& v2) {
+static inline fvec4 operator/(float v1, fvec4 v2) {
    return fvec4(v1)/v2;
 }
 // Operations for blending fvec4s based on an ivec4.
-static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const __m128i& mask) {
+static inline fvec4 blend(fvec4 v1, fvec4 v2, __m128i mask) {
    return (__m128) ((mask&(__m128i)v2) + ((ivec4(0xFFFFFFFF)-ivec4(mask))&(__m128i)v1));
 }
-static inline fvec4 blendZero(const fvec4 v, const ivec4 mask) {
+static inline fvec4 blendZero(fvec4 v, ivec4 mask) {
    return blend(0.0f, v, mask);
 }
+static inline ivec4 blendZero(ivec4 v, ivec4 mask) {
+    return v & mask;
+}
 // These are at the end since they involve other functions defined above.
-static inline fvec4 min(const fvec4& v1, const fvec4& v2) {
+static inline fvec4 min(fvec4 v1, fvec4 v2) {
    return blend(v1, v2, v1 > v2);
 }
-static inline fvec4 max(const fvec4& v1, const fvec4& v2) {
+static inline fvec4 max(fvec4 v1, fvec4 v2) {
    return blend(v1, v2, v1 < v2);
 }
-static inline fvec4 round(const fvec4& v) {
+static inline fvec4 round(fvec4 v) {
    fvec4 shift(0x1.0p23f);
    fvec4 absResult = (abs(v)+shift)-shift;
    return (__m128) ((ivec4(0x80000000)&(__m128i)v) + (ivec4(0x7FFFFFFF)&(__m128i)absResult));
 }
-static inline fvec4 floor(const fvec4& v) {
+static inline fvec4 floor(fvec4 v) {
    fvec4 truncated = __builtin_convertvector(__builtin_convertvector(v.val, __m128i), __m128);
    return truncated + blend(0.0f, -1.0f, truncated>v);
 }
-static inline fvec4 ceil(const fvec4& v) {
+static inline fvec4 ceil(fvec4 v) {
    fvec4 truncated = __builtin_convertvector(__builtin_convertvector(v.val, __m128i), __m128);
    return truncated + blend(0.0f, 1.0f, truncated<v);
 }
-static inline fvec4 rsqrt(const fvec4& v) {
+static inline fvec4 rsqrt(fvec4 v) {
    // Initial estimate of rsqrt().
    ivec4 i = (__m128i) v;
@@ -411,7 +415,7 @@ static inline fvec4 rsqrt(const fvec4& v) {
    return y;
 }
-static inline fvec4 sqrt(const fvec4& v) {
+static inline fvec4 sqrt(fvec4 v) {
    return rsqrt(v)*v;
 }
@@ -420,7 +424,7 @@ static inline fvec4 sqrt(const fvec4& v) {
 * of vectors. The first result vector contains the values at the given indexes, and the second
 * result vector contains the values from each respective index+1.
 */
-static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& out0, fvec4& out1) {
+static inline void gatherVecPair(const float* table, ivec4 index, fvec4& out0, fvec4& out1) {
    fvec4 t0(table + index[0]);
    fvec4 t1(table + index[1]);
    fvec4 t2(table + index[2]);
@@ -443,7 +447,7 @@ static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& o
 *   output[2] = (Z0 + Z1 + Z2 + Z3)
 *   output[3] = undefined
 */
-static inline fvec4 reduceToVec3(const fvec4 x, const fvec4 y, const fvec4 z) {
+static inline fvec4 reduceToVec3(fvec4 x, fvec4 y, fvec4 z) {
    const auto nx = reduceAdd(x);
    const auto ny = reduceAdd(y);
    const auto nz = reduceAdd(z);

--- a/openmmapi/include/openmm/internal/vectorize_ppc.h
+++ b/openmmapi/include/openmm/internal/vectorize_ppc.h
@@ -97,45 +97,45 @@ public:
        v[2] = val[2];
    }
-    fvec4 operator+(const fvec4& other) const {
+    fvec4 operator+(fvec4 other) const {
        return vec_add(val, other.val);
    }
-    fvec4 operator-(const fvec4& other) const {
+    fvec4 operator-(fvec4 other) const {
        return vec_sub(val, other.val);
    }
-    fvec4 operator*(const fvec4& other) const {
+    fvec4 operator*(fvec4 other) const {
        return vec_mul(val, other.val);
    }
-    fvec4 operator/(const fvec4& other) const {
+    fvec4 operator/(fvec4 other) const {
        return vec_div(val, other.val);
    }
-    void operator+=(const fvec4& other) {
+    void operator+=(fvec4 other) {
        val = vec_add(val, other.val);
    }
-    void operator-=(const fvec4& other) {
+    void operator-=(fvec4 other) {
        val = vec_sub(val, other.val);
    }
-    void operator*=(const fvec4& other) {
+    void operator*=(fvec4 other) {
        val = vec_mul(val, other.val);
    }
-    void operator/=(const fvec4& other) {
+    void operator/=(fvec4 other) {
        val = vec_div(val, other.val);
    }
    fvec4 operator-() const {
        return -val;
    }
-    fvec4 operator&(const fvec4& other) const {
+    fvec4 operator&(fvec4 other) const {
        return vec_and(val, other.val);
    }
-    fvec4 operator|(const fvec4& other) const {
+    fvec4 operator|(fvec4 other) const {
        return vec_or(val, other.val);
    }
-    ivec4 operator==(const fvec4& other) const;
+    ivec4 operator==(fvec4 other) const;
-    ivec4 operator!=(const fvec4& other) const;
+    ivec4 operator!=(fvec4 other) const;
-    ivec4 operator>(const fvec4& other) const;
+    ivec4 operator>(fvec4 other) const;
-    ivec4 operator<(const fvec4& other) const;
+    ivec4 operator<(fvec4 other) const;
-    ivec4 operator>=(const fvec4& other) const;
+    ivec4 operator>=(fvec4 other) const;
-    ivec4 operator<=(const fvec4& other) const;
+    ivec4 operator<=(fvec4 other) const;
    operator ivec4() const;
    /***
@@ -173,49 +173,49 @@ public:
    void store(int* v) const {
        *((__m128i*) v) = val;
    }
-    ivec4 operator+(const ivec4& other) const {
+    ivec4 operator+(ivec4 other) const {
        return vec_add(val, other.val);
    }
-    ivec4 operator-(const ivec4& other) const {
+    ivec4 operator-(ivec4 other) const {
        return vec_sub(val, other.val);
    }
-    ivec4 operator*(const ivec4& other) const {
+    ivec4 operator*(ivec4 other) const {
        return val*other.val;
    }
-    void operator+=(const ivec4& other) {
+    void operator+=(ivec4 other) {
        val = vec_add(val, other.val);
    }
-    void operator-=(const ivec4& other) {
+    void operator-=(ivec4 other) {
        val = vec_sub(val, other.val);
    }
-    void operator*=(const ivec4& other) {
+    void operator*=(ivec4 other) {
        val = val*other.val;
    }
    ivec4 operator-() const {
        return -val;
    }
-    ivec4 operator&(const ivec4& other) const {
+    ivec4 operator&(ivec4 other) const {
        return val&other.val;
    }
-    ivec4 operator|(const ivec4& other) const {
+    ivec4 operator|(ivec4 other) const {
        return val|other.val;
    }
-    ivec4 operator==(const ivec4& other) const {
+    ivec4 operator==(ivec4 other) const {
        return (val==other.val);
    }
-    ivec4 operator!=(const ivec4& other) const {
+    ivec4 operator!=(ivec4 other) const {
        return (val!=other.val);
    }
-    ivec4 operator>(const ivec4& other) const {
+    ivec4 operator>(ivec4 other) const {
        return (val>other.val);
    }
-    ivec4 operator<(const ivec4& other) const {
+    ivec4 operator<(ivec4 other) const {
        return (val<other.val);
    }
-    ivec4 operator>=(const ivec4& other) const {
+    ivec4 operator>=(ivec4 other) const {
        return (val>=other.val);
    }
-    ivec4 operator<=(const ivec4& other) const {
+    ivec4 operator<=(ivec4 other) const {
        return (val<=other.val);
    }
    operator fvec4() const;
@@ -223,27 +223,27 @@ public:
 // Conversion operators.
-inline ivec4 fvec4::operator==(const fvec4& other) const {
+inline ivec4 fvec4::operator==(fvec4 other) const {
    return  (val==other.val);
 }
-inline ivec4 fvec4::operator!=(const fvec4& other) const {
+inline ivec4 fvec4::operator!=(fvec4 other) const {
    return  (val!=other.val);
 }
-inline ivec4 fvec4::operator>(const fvec4& other) const {
+inline ivec4 fvec4::operator>(fvec4 other) const {
    return  (val>other.val);
 }
-inline ivec4 fvec4::operator<(const fvec4& other) const {
+inline ivec4 fvec4::operator<(fvec4 other) const {
    return  (val<other.val);
 }
-inline ivec4 fvec4::operator>=(const fvec4& other) const {
+inline ivec4 fvec4::operator>=(fvec4 other) const {
    return  (val>=other.val);
 }
-inline ivec4 fvec4::operator<=(const fvec4& other) const {
+inline ivec4 fvec4::operator<=(fvec4 other) const {
    return  (val<=other.val);
 }
@@ -264,34 +264,34 @@ inline ivec4 fvec4::expandBitsToMask(int bitmask) {
 // Functions that operate on fvec4s.
-static inline fvec4 abs(const fvec4& v) {
+static inline fvec4 abs(fvec4 v) {
    return vec_abs(v.val);
 }
-static inline fvec4 exp(const fvec4& v) {
+static inline fvec4 exp(fvec4 v) {
    return fvec4(expf(v[0]), expf(v[1]), expf(v[2]), expf(v[3]));
 }
-static inline fvec4 log(const fvec4& v) {
+static inline fvec4 log(fvec4 v) {
    return fvec4(logf(v[0]), logf(v[1]), logf(v[2]), logf(v[3]));
 }
-static inline float dot3(const fvec4& v1, const fvec4& v2) {
+static inline float dot3(fvec4 v1, fvec4 v2) {
    fvec4 r = v1*v2;
    return r[0]+r[1]+r[2];
 }
-static inline float dot4(const fvec4& v1, const fvec4& v2) {
+static inline float dot4(fvec4 v1, fvec4 v2) {
    fvec4 r = v1*v2;
    fvec4 temp = r + vec_sld(r.val, r.val, 8);
    return temp[0]+temp[1];
 }
-static inline float reduceAdd(const fvec4 v) {
+static inline float reduceAdd(fvec4 v) {
    return dot4(v, fvec4(1.0f));
 }
-static inline fvec4 cross(const fvec4& v1, const fvec4& v2) {
+static inline fvec4 cross(fvec4 v1, fvec4 v2) {
    vector unsigned char perm = (vector unsigned char) {8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15};
    __m128 temp = v2.val*vec_perm(v1.val, v1.val, perm) -
                  v1.val*vec_perm(v2.val, v2.val, perm);
@@ -324,80 +324,84 @@ static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2,
 /**
 * Out-of-place transpose from named variables into an array.
 */
-static inline void transpose(const fvec4 v0, const fvec4 v1, const fvec4 v2, const fvec4 v3, fvec4 out[4]) {
+static inline void transpose(fvec4 v0, fvec4 v1, fvec4 v2, fvec4 v3, fvec4 out[4]) {
    out[0] = v0; out[1] = v1; out[2] = v2; out[3] = v3;
    transpose(out[0], out[1], out[2], out[3]);
 }
 // Functions that operate on ivec4s.
-static inline ivec4 min(const ivec4& v1, const ivec4& v2) {
+static inline ivec4 min(ivec4 v1, ivec4 v2) {
    return vec_min(v1.val, v2.val);
 }
-static inline ivec4 max(const ivec4& v1, const ivec4& v2) {
+static inline ivec4 max(ivec4 v1, ivec4 v2) {
    return vec_max(v1.val, v2.val);
 }
-static inline ivec4 abs(const ivec4& v) {
+static inline ivec4 abs(ivec4 v) {
    return vec_abs(v.val);
 }
-static inline bool any(const ivec4 v) {
+static inline bool any(ivec4 v) {
    return !vec_all_eq(v.val, ivec4(0).val);
 }
 // Mathematical operators involving a scalar and a vector.
-static inline fvec4 operator+(float v1, const fvec4& v2) {
+static inline fvec4 operator+(float v1, fvec4 v2) {
    return fvec4(v1)+v2;
 }
-static inline fvec4 operator-(float v1, const fvec4& v2) {
+static inline fvec4 operator-(float v1, fvec4 v2) {
    return fvec4(v1)-v2;
 }
-static inline fvec4 operator*(float v1, const fvec4& v2) {
+static inline fvec4 operator*(float v1, fvec4 v2) {
    return fvec4(v1)*v2;
 }
-static inline fvec4 operator/(float v1, const fvec4& v2) {
+static inline fvec4 operator/(float v1, fvec4 v2) {
    return fvec4(v1)/v2;
 }
 // Operations for blending fvec4s based on an ivec4.
-static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const __m128i& mask) {
+static inline fvec4 blend(fvec4 v1, fvec4 v2, __m128i mask) {
    return (__m128) ((mask&(__m128i)v2.val) + ((ivec4(0xFFFFFFFF)-ivec4(mask))&(__m128i)v1.val).val);
 }
-static inline fvec4 blendZero(const fvec4 v, const ivec4 mask) {
+static inline fvec4 blendZero(fvec4 v, ivec4 mask) {
    return blend(0.0f, v, mask);
 }
+static inline ivec4 blendZero(ivec4 v, ivec4 mask) {
+    return v & mask;
+}
 // These are at the end since they involve other functions defined above.
-static inline fvec4 min(const fvec4& v1, const fvec4& v2) {
+static inline fvec4 min(fvec4 v1, fvec4 v2) {
    return vec_min(v1.val, v2.val);
 }
-static inline fvec4 max(const fvec4& v1, const fvec4& v2) {
+static inline fvec4 max(fvec4 v1, fvec4 v2) {
    return vec_max(v1.val, v2.val);
 }
-static inline fvec4 round(const fvec4& v) {
+static inline fvec4 round(fvec4 v) {
    return vec_round(v.val);
 }
-static inline fvec4 floor(const fvec4& v) {
+static inline fvec4 floor(fvec4 v) {
    return vec_floor(v.val);
 }
-static inline fvec4 ceil(const fvec4& v) {
+static inline fvec4 ceil(fvec4 v) {
    return vec_ceil(v.val);
 }
-static inline fvec4 rsqrt(const fvec4& v) {
+static inline fvec4 rsqrt(fvec4 v) {
    // Initial estimate of rsqrt().
    fvec4 y(vec_rsqrte(v.val));
@@ -409,7 +413,7 @@ static inline fvec4 rsqrt(const fvec4& v) {
    return y;
 }
-static inline fvec4 sqrt(const fvec4& v) {
+static inline fvec4 sqrt(fvec4 v) {
    return vec_sqrt(v.val);
 }
@@ -417,7 +421,7 @@ static inline fvec4 sqrt(const fvec4& v) {
 * of vectors. The first result vector contains the values at the given indexes, and the second
 * result vector contains the values from each respective index+1.
 */
-static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& out0, fvec4& out1) {
+static inline void gatherVecPair(const float* table, ivec4 index, fvec4& out0, fvec4& out1) {
    fvec4 t0(table + index[0]);
    fvec4 t1(table + index[1]);
    fvec4 t2(table + index[2]);
@@ -440,7 +444,7 @@ static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& o
 *   output[2] = (Z0 + Z1 + Z2 + Z3)
 *   output[3] = undefined
 */
-static inline fvec4 reduceToVec3(const fvec4 x, const fvec4 y, const fvec4 z) {
+static inline fvec4 reduceToVec3(fvec4 x, fvec4 y, fvec4 z) {
    const auto nx = reduceAdd(x);
    const auto ny = reduceAdd(y);
    const auto nz = reduceAdd(z);

--- a/openmmapi/include/openmm/internal/vectorize_sse.h
+++ b/openmmapi/include/openmm/internal/vectorize_sse.h
@@ -108,55 +108,55 @@ public:
 #endif
    }
-    fvec4 operator+(const fvec4& other) const {
+    fvec4 operator+(fvec4 other) const {
        return _mm_add_ps(val, other);
    }
-    fvec4 operator-(const fvec4& other) const {
+    fvec4 operator-(fvec4 other) const {
        return _mm_sub_ps(val, other);
    }
-    fvec4 operator*(const fvec4& other) const {
+    fvec4 operator*(fvec4 other) const {
        return _mm_mul_ps(val, other);
    }
-    fvec4 operator/(const fvec4& other) const {
+    fvec4 operator/(fvec4 other) const {
        return _mm_div_ps(val, other);
    }
-    void operator+=(const fvec4& other) {
+    void operator+=(fvec4 other) {
        val = _mm_add_ps(val, other);
    }
-    void operator-=(const fvec4& other) {
+    void operator-=(fvec4 other) {
        val = _mm_sub_ps(val, other);
    }
-    void operator*=(const fvec4& other) {
+    void operator*=(fvec4 other) {
        val = _mm_mul_ps(val, other);
    }
-    void operator/=(const fvec4& other) {
+    void operator/=(fvec4 other) {
        val = _mm_div_ps(val, other);
    }
    fvec4 operator-() const {
        return _mm_sub_ps(_mm_set1_ps(0.0f), val);
    }
-    fvec4 operator&(const fvec4& other) const {
+    fvec4 operator&(fvec4 other) const {
        return _mm_and_ps(val, other);
    }
-    fvec4 operator|(const fvec4& other) const {
+    fvec4 operator|(fvec4 other) const {
        return _mm_or_ps(val, other);
    }
-    fvec4 operator==(const fvec4& other) const {
+    fvec4 operator==(fvec4 other) const {
        return _mm_cmpeq_ps(val, other);
    }
-    fvec4 operator!=(const fvec4& other) const {
+    fvec4 operator!=(fvec4 other) const {
        return _mm_cmpneq_ps(val, other);
    }
-    fvec4 operator>(const fvec4& other) const {
+    fvec4 operator>(fvec4 other) const {
        return _mm_cmpgt_ps(val, other);
    }
-    fvec4 operator<(const fvec4& other) const {
+    fvec4 operator<(fvec4 other) const {
        return _mm_cmplt_ps(val, other);
    }
-    fvec4 operator>=(const fvec4& other) const {
+    fvec4 operator>=(fvec4 other) const {
        return _mm_cmpge_ps(val, other);
    }
-    fvec4 operator<=(const fvec4& other) const {
+    fvec4 operator<=(fvec4 other) const {
        return _mm_cmple_ps(val, other);
    }
    operator ivec4() const;
@@ -191,49 +191,49 @@ public:
    void store(int* v) const {
        _mm_storeu_si128((__m128i*) v, val);
    }
-    ivec4 operator+(const ivec4& other) const {
+    ivec4 operator+(ivec4 other) const {
        return _mm_add_epi32(val, other);
    }
-    ivec4 operator-(const ivec4& other) const {
+    ivec4 operator-(ivec4 other) const {
        return _mm_sub_epi32(val, other);
    }
-    ivec4 operator*(const ivec4& other) const {
+    ivec4 operator*(ivec4 other) const {
        return _mm_mullo_epi32(val, other);
    }
-    void operator+=(const ivec4& other) {
+    void operator+=(ivec4 other) {
        val = _mm_add_epi32(val, other);
    }
-    void operator-=(const ivec4& other) {
+    void operator-=(ivec4 other) {
        val = _mm_sub_epi32(val, other);
    }
-    void operator*=(const ivec4& other) {
+    void operator*=(ivec4 other) {
        val = _mm_mullo_epi32(val, other);
    }
    ivec4 operator-() const {
        return _mm_sub_epi32(_mm_set1_epi32(0), val);
    }
-    ivec4 operator&(const ivec4& other) const {
+    ivec4 operator&(ivec4 other) const {
        return _mm_and_si128(val, other);
    }
-    ivec4 operator|(const ivec4& other) const {
+    ivec4 operator|(ivec4 other) const {
        return _mm_or_si128(val, other);
    }
-    ivec4 operator==(const ivec4& other) const {
+    ivec4 operator==(ivec4 other) const {
        return _mm_cmpeq_epi32(val, other);
    }
-    ivec4 operator!=(const ivec4& other) const {
+    ivec4 operator!=(ivec4 other) const {
        return _mm_xor_si128(*this==other, _mm_set1_epi32(0xFFFFFFFF));
    }
-    ivec4 operator>(const ivec4& other) const {
+    ivec4 operator>(ivec4 other) const {
        return _mm_cmpgt_epi32(val, other);
    }
-    ivec4 operator<(const ivec4& other) const {
+    ivec4 operator<(ivec4 other) const {
        return _mm_cmplt_epi32(val, other);
    }
-    ivec4 operator>=(const ivec4& other) const {
+    ivec4 operator>=(ivec4 other) const {
        return _mm_xor_si128(_mm_cmplt_epi32(val, other), _mm_set1_epi32(0xFFFFFFFF));
    }
-    ivec4 operator<=(const ivec4& other) const {
+    ivec4 operator<=(ivec4 other) const {
        return _mm_xor_si128(_mm_cmpgt_epi32(val, other), _mm_set1_epi32(0xFFFFFFFF));
    }
    operator fvec4() const;
@@ -258,36 +258,36 @@ inline fvec4 fvec4::expandBitsToMask(int bitmask) {
 // Functions that operate on fvec4s.
-static inline fvec4 floor(const fvec4& v) {
+static inline fvec4 floor(fvec4 v) {
    return fvec4(_mm_floor_ps(v.val));
 }
-static inline fvec4 ceil(const fvec4& v) {
+static inline fvec4 ceil(fvec4 v) {
    return fvec4(_mm_ceil_ps(v.val));
 }
-static inline fvec4 round(const fvec4& v) {
+static inline fvec4 round(fvec4 v) {
    return fvec4(_mm_round_ps(v.val, _MM_FROUND_TO_NEAREST_INT));
 }
-static inline fvec4 min(const fvec4& v1, const fvec4& v2) {
+static inline fvec4 min(fvec4 v1, fvec4 v2) {
    return fvec4(_mm_min_ps(v1.val, v2.val));
 }
-static inline fvec4 max(const fvec4& v1, const fvec4& v2) {
+static inline fvec4 max(fvec4 v1, fvec4 v2) {
    return fvec4(_mm_max_ps(v1.val, v2.val));
 }
-static inline fvec4 abs(const fvec4& v) {
+static inline fvec4 abs(fvec4 v) {
    static const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
    return fvec4(_mm_and_ps(v.val, mask));
 }
-static inline fvec4 sqrt(const fvec4& v) {
+static inline fvec4 sqrt(fvec4 v) {
    return fvec4(_mm_sqrt_ps(v.val));
 }
-static inline fvec4 rsqrt(const fvec4& v) {
+static inline fvec4 rsqrt(fvec4 v) {
    // Initial estimate of rsqrt().
    fvec4 y(_mm_rsqrt_ps(v.val));
@@ -299,27 +299,27 @@ static inline fvec4 rsqrt(const fvec4& v) {
    return y;
 }
-static inline fvec4 exp(const fvec4& v) {
+static inline fvec4 exp(fvec4 v) {
    return fvec4(exp_ps(v.val));
 }
-static inline fvec4 log(const fvec4& v) {
+static inline fvec4 log(fvec4 v) {
    return fvec4(log_ps(v.val));
 }
-static inline float dot3(const fvec4& v1, const fvec4& v2) {
+static inline float dot3(fvec4 v1, fvec4 v2) {
    return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0x71));
 }
-static inline float dot4(const fvec4& v1, const fvec4& v2) {
+static inline float dot4(fvec4 v1, fvec4 v2) {
    return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0xF1));
 }
-static inline float reduceAdd(const fvec4 v) {
+static inline float reduceAdd(fvec4 v) {
    return dot4(v, fvec4(1.0f));
 }
-static inline fvec4 cross(const fvec4& v1, const fvec4& v2) {
+static inline fvec4 cross(fvec4 v1, fvec4 v2) {
    fvec4 temp = fvec4(_mm_mul_ps(v1, _mm_shuffle_ps(v2, v2, _MM_SHUFFLE(3, 0, 2, 1)))) -
                 fvec4(_mm_mul_ps(v2, _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(3, 0, 2, 1))));
    return _mm_shuffle_ps(temp, temp, _MM_SHUFFLE(3, 0, 2, 1));
@@ -340,53 +340,53 @@ static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2,
 /**
 * Out-of-place transpose from named variables into an array.
 */
-static inline void transpose(const fvec4 v0, const fvec4 v1, const fvec4 v2, const fvec4 v3, fvec4 out[4]) {
+static inline void transpose(fvec4 v0, fvec4 v1, fvec4 v2, fvec4 v3, fvec4 out[4]) {
    out[0] = v0; out[1] = v1; out[2] = v2; out[3] = v3;
    transpose(out[0], out[1], out[2], out[3]);
 }
 // Functions that operate on ivec4s.
-static inline ivec4 min(const ivec4& v1, const ivec4& v2) {
+static inline ivec4 min(ivec4 v1, ivec4 v2) {
    return ivec4(_mm_min_epi32(v1.val, v2.val));
 }
-static inline ivec4 max(const ivec4& v1, const ivec4& v2) {
+static inline ivec4 max(ivec4 v1, ivec4 v2) {
    return ivec4(_mm_max_epi32(v1.val, v2.val));
 }
-static inline ivec4 abs(const ivec4& v) {
+static inline ivec4 abs(ivec4 v) {
    return ivec4(_mm_abs_epi32(v.val));
 }
-static inline bool any(const ivec4& v) {
+static inline bool any(ivec4 v) {
    return !_mm_test_all_zeros(v, _mm_set1_epi32(0xFFFFFFFF));
 }
 // Mathematical operators involving a scalar and a vector.
-static inline fvec4 operator+(float v1, const fvec4& v2) {
+static inline fvec4 operator+(float v1, fvec4 v2) {
    return fvec4(v1)+v2;
 }
-static inline fvec4 operator-(float v1, const fvec4& v2) {
+static inline fvec4 operator-(float v1, fvec4 v2) {
    return fvec4(v1)-v2;
 }
-static inline fvec4 operator*(float v1, const fvec4& v2) {
+static inline fvec4 operator*(float v1, fvec4 v2) {
    return fvec4(v1)*v2;
 }
-static inline fvec4 operator/(float v1, const fvec4& v2) {
+static inline fvec4 operator/(float v1, fvec4 v2) {
    return fvec4(v1)/v2;
 }
 // Operations for blending fvec4
-static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const fvec4& mask) {
+static inline fvec4 blend(fvec4 v1, fvec4 v2, fvec4 mask) {
    return fvec4(_mm_blendv_ps(v1.val, v2.val, mask.val));
 }
-static inline fvec4 blendZero(const fvec4 v, const fvec4 mask) {
+static inline fvec4 blendZero(fvec4 v, fvec4 mask) {
    return blend(0.0f, v, mask);
 }
@@ -394,7 +394,7 @@ static inline fvec4 blendZero(const fvec4 v, const fvec4 mask) {
 * of vectors. The first result vector contains the values at the given indexes, and the second
 * result vector contains the values from each respective index+1.
 */
-static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& out0, fvec4& out1) {
+static inline void gatherVecPair(const float* table, ivec4 index, fvec4& out0, fvec4& out1) {
    fvec4 t0(table + index[0]);
    fvec4 t1(table + index[1]);
    fvec4 t2(table + index[2]);
@@ -417,7 +417,7 @@ static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& o
 *   output[2] = (Z0 + Z1 + Z2 + Z3)
 *   output[3] = undefined
 */
-static inline fvec4 reduceToVec3(const fvec4 x, const fvec4 y, const fvec4 z) {
+static inline fvec4 reduceToVec3(fvec4 x, fvec4 y, fvec4 z) {
    // :TODO: Could be made more efficient.
    const auto nx = reduceAdd(x);
    const auto ny = reduceAdd(y);