Created PNaCl implementation of vector instructions (not yet optimized!)

43b669e9 · peastman · 86a8c924 · 43b669e9 · 43b669e9 · 43b669e9
Commit 43b669e9 authored Jul 24, 2014 by peastman
6 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -115,7 +115,7 @@ ELSE( CMAKE_SIZEOF_VOID_P EQUAL 8 )
  SET( LIB64  )
 ENDIF( CMAKE_SIZEOF_VOID_P EQUAL 8 )
-IF (APPLE)
+IF (APPLE AND (NOT PNACL))
    # Build universal binaries compatible with OS X 10.7
    IF (NOT CMAKE_OSX_DEPLOYMENT_TARGET)
        SET (CMAKE_OSX_DEPLOYMENT_TARGET "10.7" CACHE STRING "The minimum version of OS X to support" FORCE)
@@ -127,13 +127,13 @@ IF (APPLE)
    # Improve the linking behavior of Mac libraries
    SET (CMAKE_INSTALL_NAME_DIR "@rpath")
    SET(EXTRA_COMPILE_FLAGS "-msse2 -stdlib=libc++")
-ELSE (APPLE)
+ELSE (APPLE AND (NOT PNACL))
    IF (MSVC OR ANDROID OR PNACL)
        SET(EXTRA_COMPILE_FLAGS)
    ELSE (MSVC OR ANDROID OR PNACL)
        SET(EXTRA_COMPILE_FLAGS "-msse2")
    ENDIF (MSVC OR ANDROID OR PNACL)
-ENDIF (APPLE)
+ENDIF (APPLE AND (NOT PNACL))
 IF(UNIX AND NOT CMAKE_BUILD_TYPE)
    SET(CMAKE_BUILD_TYPE Release CACHE STRING "Debug or Release build" FORCE)
@@ -307,7 +307,9 @@ ENDIF(OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS)
 # On Linux need to link to libdl
 FIND_LIBRARY(DL_LIBRARY dl)
 IF(DL_LIBRARY)
-    TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${DL_LIBRARY} ${PTHREADS_LIB})
+    IF(OPENMM_BUILD_SHARED_LIB)
+        TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${DL_LIBRARY} ${PTHREADS_LIB})
+    ENDIF(OPENMM_BUILD_SHARED_LIB)
    IF(OPENMM_BUILD_STATIC_LIB)
        TARGET_LINK_LIBRARIES(${STATIC_TARGET} ${DL_LIBRARY} ${PTHREADS_LIB})
    ENDIF(OPENMM_BUILD_STATIC_LIB)

--- a/openmmapi/include/openmm/internal/vectorize.h
+++ b/openmmapi/include/openmm/internal/vectorize.h
@@ -35,7 +35,11 @@
 #if defined(__ANDROID__)
    #include "vectorize_neon.h"
 #else
-    #include "vectorize_sse.h"
+    #if defined(__PNACL__)
+        #include "vectorize_pnacl.h"
+    #else
+        #include "vectorize_sse.h"
+    #endif
 #endif
 #endif /*OPENMM_VECTORIZE_H_*/
--- a/openmmapi/include/openmm/internal/vectorize_pnacl.h
+++ b/openmmapi/include/openmm/internal/vectorize_pnacl.h
+#ifndef OPENMM_VECTORIZE_PNACL_H_
+#define OPENMM_VECTORIZE_PNACL_H_
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013-2014 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+// This file defines classes and functions to simplify vectorizing code with portable SIMD vectors.
+/**
+ * Determine whether ivec4 and fvec4 are supported on this processor.
+ *
+ * This implementation always reports true.  The function is declared inline,
+ * like the other free functions in this header, so that translation units
+ * which include the header without calling it do not each carry an unused
+ * static function.
+ */
+static inline bool isVec4Supported() {
+    return true;
+}
+typedef float __m128 __attribute__((vector_size(16)));
+typedef int __m128i __attribute__((vector_size(16)));
+class ivec4;
+/**
+ * A four element vector of floats.
+ *
+ * The elements are stored in a 16 byte compiler vector (val), and arithmetic is
+ * delegated to the compiler's element-wise vector operators.  The comparison
+ * operators do not return a bool: the integer vector produced by the compiler's
+ * vector comparison is reinterpreted bit for bit as floats, giving a per-element
+ * mask that is meant to be combined with operator& and operator|.
+ */
+class fvec4 {
+public:
+    __m128 val;
+    /** Create a vector without initializing its elements. */
+    fvec4() {}
+    /** Create a vector whose four elements all equal v. */
+    fvec4(float v) {
+        val = {v, v, v, v};
+    }
+    /** Create a vector from four individual elements. */
+    fvec4(float v1, float v2, float v3, float v4) {
+        val = {v1, v2, v3, v4};
+    }
+    /** Wrap a raw compiler vector. */
+    fvec4(__m128 v) : val(v) {}
+    /**
+     * Load four consecutive floats starting at v.  The data is copied with
+     * memcpy rather than by dereferencing a vector pointer, so v only has to
+     * satisfy the alignment of a float, not of a 16 byte vector.
+     */
+    fvec4(const float* v) {
+        std::memcpy(&val, v, sizeof(val));
+    }
+    /** Expose the underlying compiler vector. */
+    operator __m128() const {
+        return val;
+    }
+    /** Read element i (0 to 3). */
+    float operator[](int i) const {
+        return val[i];
+    }
+    /**
+     * Write the four elements to consecutive floats starting at v.  As with
+     * the pointer constructor, v does not need 16 byte alignment.
+     */
+    void store(float* v) const {
+        std::memcpy(v, &val, sizeof(val));
+    }
+    // Element-wise arithmetic.  The operands are always the raw vectors, so no
+    // implicit fvec4 conversion is needed to resolve the built-in operators.
+    fvec4 operator+(const fvec4& other) const {
+        return fvec4(val+other.val);
+    }
+    fvec4 operator-(const fvec4& other) const {
+        return fvec4(val-other.val);
+    }
+    fvec4 operator*(const fvec4& other) const {
+        return fvec4(val*other.val);
+    }
+    fvec4 operator/(const fvec4& other) const {
+        return fvec4(val/other.val);
+    }
+    void operator+=(const fvec4& other) {
+        val = val+other.val;
+    }
+    void operator-=(const fvec4& other) {
+        val = val-other.val;
+    }
+    void operator*=(const fvec4& other) {
+        val = val*other.val;
+    }
+    void operator/=(const fvec4& other) {
+        val = val/other.val;
+    }
+    fvec4 operator-() const {
+        return fvec4(-val);
+    }
+    // Bitwise operations work on the raw bit patterns: both operands are
+    // reinterpreted as integer vectors and the result is reinterpreted back.
+    fvec4 operator&(const fvec4& other) const {
+        return fvec4((__m128) (((__m128i) val)&((__m128i) other.val)));
+    }
+    fvec4 operator|(const fvec4& other) const {
+        return fvec4((__m128) (((__m128i) val)|((__m128i) other.val)));
+    }
+    // Comparisons.  The cast to __m128 keeps the bits of the compiler's
+    // comparison result unchanged; it is not a numeric conversion.
+    fvec4 operator==(const fvec4& other) const {
+        return fvec4((__m128) (val==other.val));
+    }
+    fvec4 operator!=(const fvec4& other) const {
+        return fvec4((__m128) (val!=other.val));
+    }
+    fvec4 operator>(const fvec4& other) const {
+        return fvec4((__m128) (val>other.val));
+    }
+    fvec4 operator<(const fvec4& other) const {
+        return fvec4((__m128) (val<other.val));
+    }
+    fvec4 operator>=(const fvec4& other) const {
+        return fvec4((__m128) (val>=other.val));
+    }
+    fvec4 operator<=(const fvec4& other) const {
+        return fvec4((__m128) (val<=other.val));
+    }
+    /** Convert each element to an int (defined after ivec4). */
+    operator ivec4() const;
+};
+/**
+ * A four element vector of ints.
+ *
+ * The elements are stored in a 16 byte compiler vector (val), and arithmetic is
+ * delegated to the compiler's element-wise vector operators.  The comparison
+ * operators return the integer vector produced by the compiler's vector
+ * comparison, which serves as a per-element mask for operator& and operator|.
+ */
+class ivec4 {
+public:
+    __m128i val;
+    /** Create a vector without initializing its elements. */
+    ivec4() {}
+    /** Create a vector whose four elements all equal v. */
+    ivec4(int v) {
+        val = {v, v, v, v};
+    }
+    /** Create a vector from four individual elements. */
+    ivec4(int v1, int v2, int v3, int v4) {
+        val = {v1, v2, v3, v4};
+    }
+    /** Wrap a raw compiler vector. */
+    ivec4(__m128i v) : val(v) {}
+    /**
+     * Load four consecutive ints starting at v.  The data is copied with
+     * memcpy, so the ints are never viewed through the float vector type and
+     * v only has to satisfy the alignment of an int.
+     */
+    ivec4(const int* v) {
+        std::memcpy(&val, v, sizeof(val));
+    }
+    /** Expose the underlying compiler vector. */
+    operator __m128i() const {
+        return val;
+    }
+    /** Read element i (0 to 3). */
+    int operator[](int i) const {
+        return val[i];
+    }
+    /**
+     * Write the four elements to consecutive ints starting at v.  As with the
+     * pointer constructor, v does not need 16 byte alignment.
+     */
+    void store(int* v) const {
+        std::memcpy(v, &val, sizeof(val));
+    }
+    // Element-wise arithmetic.  The operands are always the raw vectors, so no
+    // implicit ivec4 conversion is needed to resolve the built-in operators.
+    ivec4 operator+(const ivec4& other) const {
+        return ivec4(val+other.val);
+    }
+    ivec4 operator-(const ivec4& other) const {
+        return ivec4(val-other.val);
+    }
+    ivec4 operator*(const ivec4& other) const {
+        return ivec4(val*other.val);
+    }
+    void operator+=(const ivec4& other) {
+        val = val+other.val;
+    }
+    void operator-=(const ivec4& other) {
+        val = val-other.val;
+    }
+    void operator*=(const ivec4& other) {
+        val = val*other.val;
+    }
+    ivec4 operator-() const {
+        return ivec4(-val);
+    }
+    // Bitwise operations.
+    ivec4 operator&(const ivec4& other) const {
+        return ivec4(val&other.val);
+    }
+    ivec4 operator|(const ivec4& other) const {
+        return ivec4(val|other.val);
+    }
+    // Comparisons: each returns the compiler's element-wise comparison result.
+    ivec4 operator==(const ivec4& other) const {
+        return ivec4(val==other.val);
+    }
+    ivec4 operator!=(const ivec4& other) const {
+        return ivec4(val!=other.val);
+    }
+    ivec4 operator>(const ivec4& other) const {
+        return ivec4(val>other.val);
+    }
+    ivec4 operator<(const ivec4& other) const {
+        return ivec4(val<other.val);
+    }
+    ivec4 operator>=(const ivec4& other) const {
+        return ivec4(val>=other.val);
+    }
+    ivec4 operator<=(const ivec4& other) const {
+        return ivec4(val<=other.val);
+    }
+    /** Convert each element to a float (defined after the class). */
+    operator fvec4() const;
+};
+// Conversion operators.
+// Element-wise numeric conversion from float to int using the compiler's
+// __builtin_convertvector.
+// NOTE(review): this is a value conversion, not a bit cast, so lanes holding
+// comparison masks or out-of-range values may not convert predictably - confirm.
+inline fvec4::operator ivec4() const {
+    const __m128i asInts = __builtin_convertvector(val, __m128i);
+    return asInts;
+}
+// Element-wise numeric conversion from int to float using the compiler's
+// __builtin_convertvector.
+inline ivec4::operator fvec4() const {
+    const __m128 asFloats = __builtin_convertvector(val, __m128);
+    return asFloats;
+}
+// Functions that operate on fvec4s.
+/**
+ * Apply std::floor to each of the four elements of v.
+ */
+static inline fvec4 floor(const fvec4& v) {
+    float lane[4];
+    for (int i = 0; i < 4; i++)
+        lane[i] = std::floor(v[i]);
+    return fvec4(lane[0], lane[1], lane[2], lane[3]);
+}
+/**
+ * Apply std::ceil to each of the four elements of v.
+ */
+static inline fvec4 ceil(const fvec4& v) {
+    float lane[4];
+    for (int i = 0; i < 4; i++)
+        lane[i] = std::ceil(v[i]);
+    return fvec4(lane[0], lane[1], lane[2], lane[3]);
+}
+/**
+ * Apply std::round to each of the four elements of v.
+ */
+static inline fvec4 round(const fvec4& v) {
+    float lane[4];
+    for (int i = 0; i < 4; i++)
+        lane[i] = std::round(v[i]);
+    return fvec4(lane[0], lane[1], lane[2], lane[3]);
+}
+/**
+ * Element-wise minimum of two float vectors, computed with std::min on each
+ * pair of elements.
+ */
+static inline fvec4 min(const fvec4& v1, const fvec4& v2) {
+    const float m0 = std::min(v1[0], v2[0]);
+    const float m1 = std::min(v1[1], v2[1]);
+    const float m2 = std::min(v1[2], v2[2]);
+    const float m3 = std::min(v1[3], v2[3]);
+    return fvec4(m0, m1, m2, m3);
+}
+/**
+ * Element-wise maximum of two float vectors, computed with std::max on each
+ * pair of elements.
+ */
+static inline fvec4 max(const fvec4& v1, const fvec4& v2) {
+    const float m0 = std::max(v1[0], v2[0]);
+    const float m1 = std::max(v1[1], v2[1]);
+    const float m2 = std::max(v1[2], v2[2]);
+    const float m3 = std::max(v1[3], v2[3]);
+    return fvec4(m0, m1, m2, m3);
+}
+/**
+ * Apply std::abs to each of the four elements of v.
+ */
+static inline fvec4 abs(const fvec4& v) {
+    float lane[4];
+    for (int i = 0; i < 4; i++)
+        lane[i] = std::abs(v[i]);
+    return fvec4(lane[0], lane[1], lane[2], lane[3]);
+}
+/**
+ * Apply std::sqrt to each of the four elements of v.
+ */
+static inline fvec4 sqrt(const fvec4& v) {
+    float lane[4];
+    for (int i = 0; i < 4; i++)
+        lane[i] = std::sqrt(v[i]);
+    return fvec4(lane[0], lane[1], lane[2], lane[3]);
+}
+/**
+ * Dot product of the first three elements of v1 and v2.  The fourth element
+ * of each vector is multiplied but not included in the sum.
+ */
+static inline float dot3(const fvec4& v1, const fvec4& v2) {
+    const fvec4 product = v1*v2;
+    float total = product[0]+product[1];
+    total = total+product[2];
+    return total;
+}
+/**
+ * Dot product of all four elements of v1 and v2.
+ */
+static inline float dot4(const fvec4& v1, const fvec4& v2) {
+    const fvec4 product = v1*v2;
+    float total = product[0]+product[1];
+    total = total+product[2];
+    total = total+product[3];
+    return total;
+}
+/**
+ * Transpose, in place, the 4x4 matrix whose rows are v1, v2, v3 and v4.
+ *
+ * The first round of shuffles interleaves the even and the odd columns of each
+ * pair of rows; the second round joins the matching halves of those
+ * intermediate vectors to form the output rows.
+ */
+static inline void transpose(fvec4& v1, fvec4& v2, fvec4& v3, fvec4& v4) {
+    // Even columns (0 and 2) and odd columns (1 and 3) of rows 1 and 2.
+    const __m128 even12 = __builtin_shufflevector(v1.val, v2.val, 0, 4, 2, 6);
+    const __m128 odd12 = __builtin_shufflevector(v1.val, v2.val, 1, 5, 3, 7);
+    // The same for rows 3 and 4.
+    const __m128 even34 = __builtin_shufflevector(v3.val, v4.val, 0, 4, 2, 6);
+    const __m128 odd34 = __builtin_shufflevector(v3.val, v4.val, 1, 5, 3, 7);
+    // Low halves give columns 0 and 1, high halves give columns 2 and 3.
+    v1 = __builtin_shufflevector(even12, even34, 0, 1, 4, 5);
+    v2 = __builtin_shufflevector(odd12, odd34, 0, 1, 4, 5);
+    v3 = __builtin_shufflevector(even12, even34, 2, 3, 6, 7);
+    v4 = __builtin_shufflevector(odd12, odd34, 2, 3, 6, 7);
+}
+// Functions that operate on ivec4s.
+/**
+ * Element-wise minimum of two int vectors, computed with std::min on each pair
+ * of elements.
+ */
+static inline ivec4 min(const ivec4& v1, const ivec4& v2) {
+    const int m0 = std::min(v1[0], v2[0]);
+    const int m1 = std::min(v1[1], v2[1]);
+    const int m2 = std::min(v1[2], v2[2]);
+    const int m3 = std::min(v1[3], v2[3]);
+    return ivec4(m0, m1, m2, m3);
+}
+/**
+ * Element-wise maximum of two int vectors, computed with std::max on each pair
+ * of elements.
+ */
+static inline ivec4 max(const ivec4& v1, const ivec4& v2) {
+    const int m0 = std::max(v1[0], v2[0]);
+    const int m1 = std::max(v1[1], v2[1]);
+    const int m2 = std::max(v1[2], v2[2]);
+    const int m3 = std::max(v1[3], v2[3]);
+    return ivec4(m0, m1, m2, m3);
+}
+/**
+ * Apply std::abs to each of the four elements of v.
+ */
+static inline ivec4 abs(const ivec4& v) {
+    int lane[4];
+    for (int i = 0; i < 4; i++)
+        lane[i] = std::abs(v[i]);
+    return ivec4(lane[0], lane[1], lane[2], lane[3]);
+}
+/**
+ * Return true if at least one element of v is nonzero.
+ */
+static inline bool any(const ivec4& v) {
+    for (int i = 0; i < 4; i++) {
+        if (v[i] != 0)
+            return true;
+    }
+    return false;
+}
+// Mathematical operators involving a scalar and a vector.
+/**
+ * Add a scalar to every element of a vector (scalar on the left).
+ */
+static inline fvec4 operator+(float v1, const fvec4& v2) {
+    const fvec4 broadcast(v1);
+    return broadcast+v2;
+}
+/**
+ * Subtract every element of a vector from a scalar (scalar on the left).
+ */
+static inline fvec4 operator-(float v1, const fvec4& v2) {
+    const fvec4 broadcast(v1);
+    return broadcast-v2;
+}
+/**
+ * Multiply every element of a vector by a scalar (scalar on the left).
+ */
+static inline fvec4 operator*(float v1, const fvec4& v2) {
+    const fvec4 broadcast(v1);
+    return broadcast*v2;
+}
+/**
+ * Divide a scalar by every element of a vector (scalar on the left).
+ */
+static inline fvec4 operator/(float v1, const fvec4& v2) {
+    const fvec4 broadcast(v1);
+    return broadcast/v2;
+}
+// Operations for blending fvec4s based on an ivec4.
+/**
+ * Select elements from two vectors: wherever the corresponding element of
+ * mask is nonzero the result takes its element from v2, otherwise from v1.
+ */
+static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const ivec4& mask) {
+    float chosen[4];
+    for (int i = 0; i < 4; i++) {
+        if (mask[i])
+            chosen[i] = v2[i];
+        else
+            chosen[i] = v1[i];
+    }
+    return fvec4(chosen[0], chosen[1], chosen[2], chosen[3]);
+}
+#endif /*OPENMM_VECTORIZE_PNACL_H_*/
--- a/openmmapi/include/openmm/internal/vectorize_sse.h
+++ b/openmmapi/include/openmm/internal/vectorize_sse.h
@@ -159,7 +159,7 @@ public:
        return _mm_sub_epi32(val, other);
    }
    ivec4 operator*(const ivec4& other) const {
-        return _mm_mul_epi32(val, other);
+        return _mm_mullo_epi32(val, other);
    }
    void operator+=(const ivec4& other) {
        val = _mm_add_epi32(val, other);
@@ -168,7 +168,7 @@ public:
        val = _mm_sub_epi32(val, other);
    }
    void operator*=(const ivec4& other) {
-        val = _mm_mul_epi32(val, other);
+        val = _mm_mullo_epi32(val, other);
    }
    ivec4 operator-() const {
        return _mm_sub_epi32(_mm_set1_epi32(0), val);

--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -14,7 +14,11 @@ FOREACH(TEST_PROG ${TEST_PROGS})
    ELSE (OPENMM_BUILD_SHARED_LIB)
        TARGET_LINK_LIBRARIES(${TEST_ROOT} ${STATIC_TARGET})
    ENDIF (OPENMM_BUILD_SHARED_LIB)
-    SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
+    SET(EXTRA_TEST_FLAGS "${EXTRA_COMPILE_FLAGS}")
+    IF ((${TEST_ROOT} MATCHES TestVectorize) AND NOT (MSVC OR ANDROID OR PNACL))
+        SET(EXTRA_TEST_FLAGS "${EXTRA_COMPILE_FLAGS} -msse4.1")
+    ENDIF ((${TEST_ROOT} MATCHES TestVectorize) AND NOT (MSVC OR ANDROID OR PNACL))
+    SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES LINK_FLAGS "${EXTRA_TEST_FLAGS}" COMPILE_FLAGS "${EXTRA_TEST_FLAGS}")
    ADD_TEST(${TEST_ROOT} ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT})
 ENDFOREACH(TEST_PROG ${TEST_PROGS})
--- a/tests/TestVectorize.cpp
+++ b/tests/TestVectorize.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2014 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+/**
+ * This tests vectorized operations.
+ */
+#include "openmm/internal/AssertionUtilities.h"
+#include "openmm/internal/vectorize.h"
+#include <iostream>
+using namespace OpenMM;
+using namespace std;
+// Check that each of the four elements of found is within 1e-6 of the matching
+// expected value.  On a mismatch, build a message listing the expected and the
+// found elements and pass it to throwException() with the current file and line.
+// Note that found is evaluated once for every element access.
+#define ASSERT_VEC4_EQUAL(found, expected0, expected1, expected2, expected3) { \
+    if (std::abs((found)[0]-(expected0))>1e-6 || \
+        std::abs((found)[1]-(expected1))>1e-6 || \
+        std::abs((found)[2]-(expected2))>1e-6 || \
+        std::abs((found)[3]-(expected3))>1e-6) { \
+        std::stringstream details; \
+        details << " Expected ("<<(expected0)<<","<<(expected1)<<","<<(expected2)<<","<<(expected3)<<"), found ("<<(found)[0]<<","<<(found)[1]<<","<<(found)[2]<<","<<(found)[3]<<")"; \
+        throwException(__FILE__, __LINE__, details.str()); \
+    } \
+};
+/**
+ * Check construction (broadcast, per-element and from an array), store() and
+ * element access for both fvec4 and ivec4.
+ */
+void testLoadStore() {
+    // Broadcast constructors.
+    fvec4 floatSplat(2.0);
+    ASSERT_VEC4_EQUAL(floatSplat, 2.0, 2.0, 2.0, 2.0);
+    ivec4 intSplat(3);
+    ASSERT_VEC4_EQUAL(intSplat, 3, 3, 3, 3);
+    // Per-element constructors.
+    fvec4 floatElems(2.5, 3.0, 3.5, 4.0);
+    ASSERT_VEC4_EQUAL(floatElems, 2.5, 3.0, 3.5, 4.0);
+    ivec4 intElems(2, 3, 4, 5);
+    ASSERT_VEC4_EQUAL(intElems, 2, 3, 4, 5);
+    // Round trip through plain arrays.
+    float floatBuffer[4];
+    floatElems.store(floatBuffer);
+    fvec4 floatLoaded(floatBuffer);
+    ASSERT_VEC4_EQUAL(floatLoaded, 2.5, 3.0, 3.5, 4.0);
+    int intBuffer[4];
+    intElems.store(intBuffer);
+    ivec4 intLoaded(intBuffer);
+    ASSERT_VEC4_EQUAL(intLoaded, 2, 3, 4, 5);
+    // Individual element access.
+    ASSERT_EQUAL(floatLoaded[0], 2.5);
+    ASSERT_EQUAL(floatLoaded[1], 3.0);
+    ASSERT_EQUAL(floatLoaded[2], 3.5);
+    ASSERT_EQUAL(floatLoaded[3], 4.0);
+    ASSERT_EQUAL(intLoaded[0], 2);
+    ASSERT_EQUAL(intLoaded[1], 3);
+    ASSERT_EQUAL(intLoaded[2], 4);
+    ASSERT_EQUAL(intLoaded[3], 5);
+}
+/**
+ * Check the binary arithmetic operators and their compound-assignment forms
+ * for fvec4 (+, -, *, /) and ivec4 (+, -, *).
+ */
+void testArithmetic() {
+    const fvec4 fa(0.5, 1.0, 1.5, 2.0);
+    const fvec4 fb(1, 2, 3, 4);
+    const ivec4 ia(1, 2, 3, 4);
+    const ivec4 ib(5, 2, 1, 3);
+    // Binary operators.
+    ASSERT_VEC4_EQUAL(fa+fb, 1.5, 3, 4.5, 6);
+    ASSERT_VEC4_EQUAL(fa-fb, -0.5, -1.0, -1.5, -2.0);
+    ASSERT_VEC4_EQUAL(fa*fb, 0.5, 2.0, 4.5, 8.0);
+    ASSERT_VEC4_EQUAL(fa/fb, 0.5, 0.5, 0.5, 0.5);
+    ASSERT_VEC4_EQUAL(ia+ib, 6, 4, 4, 7);
+    ASSERT_VEC4_EQUAL(ia-ib, -4, 0, 2, 1);
+    ASSERT_VEC4_EQUAL(ia*ib, 5, 4, 3, 12);
+    // Compound assignment on floats; the accumulator is reset before each one.
+    fvec4 facc = fa;
+    facc += fb;
+    ASSERT_VEC4_EQUAL(facc, 1.5, 3, 4.5, 6);
+    facc = fa;
+    facc -= fb;
+    ASSERT_VEC4_EQUAL(facc, -0.5, -1.0, -1.5, -2.0);
+    facc = fa;
+    facc *= fb;
+    ASSERT_VEC4_EQUAL(facc, 0.5, 2.0, 4.5, 8.0);
+    facc = fa;
+    facc /= fb;
+    ASSERT_VEC4_EQUAL(facc, 0.5, 0.5, 0.5, 0.5);
+    // Compound assignment on ints.
+    ivec4 iacc = ia;
+    iacc += ib;
+    ASSERT_VEC4_EQUAL(iacc, 6, 4, 4, 7);
+    iacc = ia;
+    iacc -= ib;
+    ASSERT_VEC4_EQUAL(iacc, -4, 0, 2, 1);
+    iacc = ia;
+    iacc *= ib;
+    ASSERT_VEC4_EQUAL(iacc, 5, 4, 3, 12);
+}
+/**
+ * Check the bitwise & and | operators of fvec4 and ivec4 using a mask whose two
+ * middle elements have every bit set and whose outer elements are zero.
+ */
+void testLogic() {
+    int ones = -1;
+    // The same all-ones bit pattern viewed as a float.
+    float onesAsFloat = *((float*) &ones);
+    ivec4 intMask(0, ones, ones, 0);
+    fvec4 floatMask(0, onesAsFloat, onesAsFloat, 0);
+    fvec4 floats(0.5, 1.0, 1.5, 2.0);
+    ivec4 ints(1, 2, 3, 4);
+    ASSERT_VEC4_EQUAL(floats&floatMask, 0, 1.0, 1.5, 0);
+    fvec4 ored = floats|floatMask;
+    ASSERT_EQUAL(0.5, ored[0]);
+    ASSERT(ored[1] != ored[1]); // All bits set, which is nan
+    ASSERT(ored[2] != ored[2]); // All bits set, which is nan
+    ASSERT_EQUAL(2.0, ored[3]);
+    ASSERT_VEC4_EQUAL(ints&intMask, 0, 2, 3, 0);
+    ASSERT_VEC4_EQUAL(ints|intMask, 1, ones, ones, 4);
+}
+/**
+ * Check the six element-wise comparison operators of fvec4 and ivec4.
+ *
+ * Each comparison result is ANDed with a constant vector, so an element reads
+ * back as that constant where the comparison held and as zero where it did not.
+ * The integer constant is an ivec4, so the integer checks exercise
+ * ivec4 & ivec4 directly instead of converting a float vector on every use.
+ */
+void testComparisons() {
+    fvec4 fmask(1.5, 1.5, 1.5, 1.5);
+    fvec4 fa(1.0, 1.5, 3.0, 2.2);
+    fvec4 fb(1.1, 1.5, 3.0, 2.1);
+    ASSERT_VEC4_EQUAL((fa==fb)&fmask, 0.0, 1.5, 1.5, 0.0);
+    ASSERT_VEC4_EQUAL((fa!=fb)&fmask, 1.5, 0.0, 0.0, 1.5);
+    ASSERT_VEC4_EQUAL((fa<fb)&fmask, 1.5, 0.0, 0.0, 0.0);
+    ASSERT_VEC4_EQUAL((fa>fb)&fmask, 0.0, 0.0, 0.0, 1.5);
+    ASSERT_VEC4_EQUAL((fa<=fb)&fmask, 1.5, 1.5, 1.5, 0.0);
+    ASSERT_VEC4_EQUAL((fa>=fb)&fmask, 0.0, 1.5, 1.5, 1.5);
+    ivec4 imask(3, 3, 3, 3);
+    ivec4 ia(1, 3, 7, 5);
+    ivec4 ib(2, 3, 7, 4);
+    ASSERT_VEC4_EQUAL((ia==ib)&imask, 0, 3, 3, 0);
+    ASSERT_VEC4_EQUAL((ia!=ib)&imask, 3, 0, 0, 3);
+    ASSERT_VEC4_EQUAL((ia<ib)&imask, 3, 0, 0, 0);
+    ASSERT_VEC4_EQUAL((ia>ib)&imask, 0, 0, 0, 3);
+    ASSERT_VEC4_EQUAL((ia<=ib)&imask, 3, 3, 3, 0);
+    ASSERT_VEC4_EQUAL((ia>=ib)&imask, 0, 3, 3, 3);
+}
+/**
+ * Check the element-wise math helpers (floor, ceil, round, abs, min, max,
+ * sqrt), the dot products, any() and blend().
+ */
+void testMathFunctions() {
+    fvec4 a(0.4, 1.9, -1.2, -3.8);
+    fvec4 b(1.1, 1.2, 1.3, -5.0);
+    // Rounding and absolute value.
+    ASSERT_VEC4_EQUAL(floor(a), 0.0, 1.0, -2.0, -4.0);
+    ASSERT_VEC4_EQUAL(ceil(a), 1.0, 2.0, -1.0, -3.0);
+    ASSERT_VEC4_EQUAL(round(a), 0.0, 2.0, -1.0, -4.0);
+    ASSERT_VEC4_EQUAL(abs(a), 0.4, 1.9, 1.2, 3.8);
+    // Element-wise minimum, maximum and square root.
+    ASSERT_VEC4_EQUAL(min(a, b), 0.4, 1.2, -1.2, -5.0);
+    ASSERT_VEC4_EQUAL(max(a, b), 1.1, 1.9, 1.3, -3.8);
+    ASSERT_VEC4_EQUAL(sqrt(fvec4(1.5, 3.1, 4.0, 15.0)), sqrt(1.5), sqrt(3.1), sqrt(4.0), sqrt(15.0));
+    // Dot products against the same sums written out by hand.
+    ASSERT_EQUAL_TOL(a[0]*b[0]+a[1]*b[1]+a[2]*b[2], dot3(a, b), 1e-6);
+    ASSERT_EQUAL_TOL(a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3], dot4(a, b), 1e-6);
+    // NOTE(review): the comparison yields a float mask that is then converted
+    // to the integer vector any() expects - confirm that conversion is well
+    // defined for mask values on every backend.
+    ASSERT(any(a > 0.5));
+    ASSERT(!any(a > 2.0));
+    // Nonzero mask elements select from the second vector.
+    ASSERT_VEC4_EQUAL(blend(a, b, ivec4(-1, 0, -1, 0)), 1.1, 1.9, 1.3, -3.8);
+}
+/**
+ * Check that transpose() turns the rows of a 4x4 matrix holding 1 to 16 in row
+ * order into its columns.
+ */
+void testTranspose() {
+    fvec4 row0(1.0, 2.0, 3.0, 4.0);
+    fvec4 row1(5.0, 6.0, 7.0, 8.0);
+    fvec4 row2(9.0, 10.0, 11.0, 12.0);
+    fvec4 row3(13.0, 14.0, 15.0, 16.0);
+    transpose(row0, row1, row2, row3);
+    // Each vector now holds what used to be one column.
+    ASSERT_VEC4_EQUAL(row0, 1.0, 5.0, 9.0, 13.0);
+    ASSERT_VEC4_EQUAL(row1, 2.0, 6.0, 10.0, 14.0);
+    ASSERT_VEC4_EQUAL(row2, 3.0, 7.0, 11.0, 15.0);
+    ASSERT_VEC4_EQUAL(row3, 4.0, 8.0, 12.0, 16.0);
+}
+/**
+ * Run every test in this file.  Returns 0 when all tests pass or when
+ * isVec4Supported() reports that vectors are unavailable, and 1 when a test
+ * throws an exception.
+ */
+int main(int argc, char* argv[]) {
+    try {
+        if (isVec4Supported()) {
+            testLoadStore();
+            testArithmetic();
+            testLogic();
+            testComparisons();
+            testMathFunctions();
+            testTranspose();
+        }
+        else {
+            // Nothing can be tested on this processor; treat it as a pass.
+            cout << "CPU is not supported.  Exiting." << endl;
+            return 0;
+        }
+    }
+    catch(const exception& e) {
+        cout << "exception: " << e.what() << endl;
+        return 1;
+    }
+    cout << "Done" << endl;
+    return 0;
+}