Merge pull request #2692 from dwtowner/cpu_generic_vector_test

Cpu generic vector test

Merge pull request #2692 from dwtowner/cpu_generic_vector_test
Cpu generic vector test
b0d13582 · peastman · GitHub · f902295b · 154e2854 · b0d13582
Unverified Commit b0d13582 authored May 26, 2020 by peastman Committed by GitHub May 26, 2020
8 changed files
--- a/openmmapi/include/openmm/internal/vectorize8.h
+++ b/openmmapi/include/openmm/internal/vectorize8.h
@@ -46,7 +46,7 @@ class fvec8 {
 public:
    __m256 val;

-    fvec8() {}
+    fvec8() = default;
    fvec8(float v) : val(_mm256_set1_ps(v)) {}
    fvec8(float v1, float v2, float v3, float v4, float v5, float v6, float v7, float v8) : val(_mm256_set_ps(v8, v7, v6, v5, v4, v3, v2, v1)) {}
    fvec8(__m256 v) : val(v) {}

--- a/openmmapi/include/openmm/internal/vectorize_neon.h
+++ b/openmmapi/include/openmm/internal/vectorize_neon.h
@@ -74,7 +74,7 @@ class fvec4 {
 public:
    float32x4_t val;

-    fvec4() {}
+    fvec4() = default;
    fvec4(float v) : val(vdupq_n_f32(v)) {}
    fvec4(float v1, float v2, float v3, float v4) {
        float v[] = {v1, v2, v3, v4};

--- a/openmmapi/include/openmm/internal/vectorize_pnacl.h
+++ b/openmmapi/include/openmm/internal/vectorize_pnacl.h
@@ -56,7 +56,7 @@ class fvec4 {
 public:
    __m128 val;
    
-    fvec4() {}
+    fvec4() = default;
    fvec4(float v) {
        val = {v, v, v, v};
    }

--- a/openmmapi/include/openmm/internal/vectorize_ppc.h
+++ b/openmmapi/include/openmm/internal/vectorize_ppc.h
@@ -57,7 +57,7 @@ class fvec4 {
 public:
    __m128 val;
    
-    fvec4() {}
+    fvec4() = default;
    fvec4(float v) {
        val = (__m128) {v, v, v, v};
    }

--- a/openmmapi/include/openmm/internal/vectorize_sse.h
+++ b/openmmapi/include/openmm/internal/vectorize_sse.h
@@ -68,7 +68,7 @@ class fvec4 {
 public:
    __m128 val;
    
-    fvec4() {}
+    fvec4() = default;
    fvec4(float v) : val(_mm_set1_ps(v)) {}
    fvec4(float v1, float v2, float v3, float v4) : val(_mm_set_ps(v4, v3, v2, v1)) {}
    fvec4(__m128 v) : val(v) {}

--- a/tests/TestVectorize.cpp
+++ b/tests/TestVectorize.cpp
@@ -8,7 +8,7 @@
 *                                                                            *
 * Portions copyright (c) 2014-2015 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
- * Contributors:                                                              *
+ * Contributors: Daniel Towner                                                *
 *                                                                            *
 * Permission is hereby granted, free of charge, to any person obtaining a    *
 * copy of this software and associated documentation files (the "Software"), *
@@ -35,6 +35,9 @@

 #include "openmm/internal/AssertionUtilities.h"
 #include "openmm/internal/vectorize.h"
+
+#include "TestVectorizeGeneric.h"
+
 #include <iostream>

 using namespace OpenMM;
@@ -203,45 +206,6 @@ void testTranspose() {
    ASSERT_VEC4_EQUAL(h[3], 0.4, 0.8, 1.2, 1.6);
 }

-void testUtility() {
-    fvec4 f1(7, 2, -5, 13);
-    fvec4 f2(1, 2, 4, 7);
-    fvec4 f3(0.5, 1.0, 1.5, 2.0);
-
-    // Reduce-add across three vectors into a single vec3.
-    const auto computedVec3 = reduceToVec3(f1, f2, f3);
-    ASSERT_EQUAL(17, computedVec3[0]);
-    ASSERT_EQUAL(14, computedVec3[1]);
-    ASSERT_EQUAL(5,  computedVec3[2]);
-
-    // Gather values from a table. Variants for both one vector and two vector gathers are provided.
-    float table[2048];
-    for (int i=0; i<2048;++i)
-        table[i] = -i; // Same index to make it easy to debug, but negative to avoid copying idx.
-
-    // Single vector gather.
-    const int vidx[4] = {156, 1987, 33, 1003};
-    fvec4 g(table, vidx);
-    ASSERT_VEC4_EQUAL(g, -156, -1987, -33, -1003);
-
-    // Pair-wise vector gather.
-    fvec4 p0, p1;
-    gatherVecPair(table, ivec4(57, 105, 1976, 91), p0, p1);
-    ASSERT_VEC4_EQUAL(p0, -57, -105, -1976, -91);
-    ASSERT_VEC4_EQUAL(p1, -58, -106, -1977, -92);
-
-    // Verify building blend mask from integer. The mask isn't checked directly, as different platforms
-    // use different types of mask. Instead, check the side effect of using the mask in a blend.
-    const auto elements = fvec4(1, 2, 3, 4);
-    const auto maskZero = fvec4::expandBitsToMask(0);
-    ASSERT_VEC4_EQUAL_INT(blendZero(elements, maskZero), 0, 0, 0, 0);
-    const auto maskOne = fvec4::expandBitsToMask(0b1111);
-    ASSERT_VEC4_EQUAL_INT(blendZero(elements, maskOne), 1, 2, 3, 4);
-    const auto maskMix = fvec4::expandBitsToMask(0b1001);
-    ASSERT_VEC4_EQUAL_INT(blendZero(elements, maskMix), 1, 0, 0, 4);
-
-}
-
 int main(int argc, char* argv[]) {
    try {
        if (!isVec4Supported()) {
@@ -249,12 +213,10 @@ int main(int argc, char* argv[]) {
            return 0;
        }
        testLoadStore();
-        testArithmetic();
        testLogic();
-        testComparisons();
-        testMathFunctions();
-        testTranspose();
-        testUtility();
+
+        TestFvec<fvec4>::testAll();
+
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;

--- a/tests/TestVectorize8.cpp
+++ b/tests/TestVectorize8.cpp
@@ -8,7 +8,7 @@
 *                                                                            *
 * Portions copyright (c) 2014-2015 Stanford University and the Authors.      *
 * Authors: Robert T. McGibbon                                                *
- * Contributors:                                                              *
+ * Contributors: Daniel Towner                                                *
 *                                                                            *
 * Permission is hereby granted, free of charge, to any person obtaining a    *
 * copy of this software and associated documentation files (the "Software"), *
@@ -37,6 +37,7 @@
 #include "openmm/internal/vectorize8.h"
 #include <iostream>

+#include "TestVectorizeGeneric.h"

 #ifndef __AVX__
 bool isVec8Supported() {
@@ -66,32 +67,15 @@ using namespace std;
 #define ASSERT_VEC8_EQUAL(found, expected0, expected1, expected2, expected3, expected4, expected5, expected6, expected7) {if (std::abs((found).lowerVec()[0]-(expected0))>1e-6 || std::abs((found).lowerVec()[1]-(expected1))>1e-6 || std::abs((found).lowerVec()[2]-(expected2))>1e-6 || std::abs((found).lowerVec()[3]-(expected3))>1e-6 || std::abs((found).upperVec()[0]-(expected4))>1e-6 || std::abs((found).upperVec()[1]-(expected5))>1e-6 || std::abs((found).upperVec()[2]-(expected6))>1e-6 || std::abs((found).upperVec()[3]-(expected7))>1e-6) {std::stringstream details; details << " Expected ("<<(expected0)<<","<<(expected1)<<","<<(expected2)<<","<<(expected3)<<","<<(expected4)<<","<<(expected5)<<","<<(expected6)<<","<<(expected7)<<"), found ("<<(found).lowerVec()[0]<<","<<(found).lowerVec()[1]<<","<<(found).lowerVec()[2]<<","<<(found).lowerVec()[3]<<","<<(found).upperVec()[0]<<","<<(found).upperVec()[1]<<","<<(found).upperVec()[2]<<","<<(found).upperVec()[3]<<")"; throwException(__FILE__, __LINE__, details.str());}};
 #define ASSERT_VEC8_EQUAL_INT(found, expected0, expected1, expected2, expected3, expected4, expected5, expected6, expected7) {if ((found).lowerVec()[0] != (expected0) || (found).lowerVec()[1] != (expected1) || (found).lowerVec()[2] != (expected2) || (found).lowerVec()[3] != (expected3) || (found).upperVec()[0] != (expected4) || (found).upperVec()[1] != (expected5) ||(found).upperVec()[2] != (expected6) || (found).upperVec()[3] != (expected7)) {std::stringstream details; details << " Expected ("<<(expected0)<<","<<(expected1)<<","<<(expected2)<<","<<(expected3)<<","<<(expected4)<<","<<(expected5)<<","<<(expected6)<<","<<(expected7)<<"), found ("<<(found).lowerVec()[0]<<","<<(found).lowerVec()[1]<<","<<(found).lowerVec()[2]<<","<<(found).lowerVec()[3]<<","<<(found).upperVec()[0]<<","<<(found).upperVec()[1]<<","<<(found).upperVec()[2]<<","<<(found).upperVec()[3]<<")"; throwException(__FILE__, __LINE__, details.str());}};

-
 void testLoadStore() {
-    fvec8 f1(2.0);
    ivec8 i1(3);
-    ASSERT_VEC8_EQUAL(f1, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0);
    ASSERT_VEC8_EQUAL_INT(i1, 3, 3, 3, 3, 3, 3, 3, 3);
-    fvec8 f2(2.5, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 6.0);
    ivec8 i2(2, 3, 4, 5, 6, 7, 8, 9);
-    ASSERT_VEC8_EQUAL(f2, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 6.0);
    ASSERT_VEC8_EQUAL_INT(i2, 2, 3, 4, 5, 6, 7, 8, 9);
-    float farray[8];
    int iarray[8];
-    f2.store(farray);
    i2.store(iarray);
-    fvec8 f3(farray);
    ivec8 i3(iarray);
-    ASSERT_VEC8_EQUAL(f3, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 6.0);
    ASSERT_VEC8_EQUAL_INT(i3, 2, 3, 4, 5, 6, 7, 8, 9);
-    ASSERT_EQUAL(f3.lowerVec()[0], 2.5);
-    ASSERT_EQUAL(f3.lowerVec()[1], 3.0);
-    ASSERT_EQUAL(f3.lowerVec()[2], 3.5);
-    ASSERT_EQUAL(f3.lowerVec()[3], 4.0);
-    ASSERT_EQUAL(f3.upperVec()[0], 4.5);
-    ASSERT_EQUAL(f3.upperVec()[1], 5.0);
-    ASSERT_EQUAL(f3.upperVec()[2], 5.5);
-    ASSERT_EQUAL(f3.upperVec()[3], 6.0);
    ASSERT_EQUAL(i3.lowerVec()[0], 2);
    ASSERT_EQUAL(i3.lowerVec()[1], 3);
    ASSERT_EQUAL(i3.lowerVec()[2], 4);
@@ -112,27 +96,6 @@ void testLoadStore() {
    ASSERT_EQUAL(overwriteTest[3], 9);
 }

-void testArithmetic() {
-    fvec8 f1(0.5, 1.0, 1.5, 2.0,   2.5, 3.0, 3.5, 4.0);
-    ASSERT_VEC8_EQUAL(f1+fvec8(1, 2, 3, 4, 5, 6, 7, 8), 1.5,   3. ,   4.5,   6. ,   7.5,   9. ,  10.5,  12.);
-    ASSERT_VEC8_EQUAL(f1-fvec8(1, 2, 3, 4, 5, 6, 7, 8), -0.5, -1. , -1.5, -2. , -2.5, -3. , -3.5, -4.);
-    ASSERT_VEC8_EQUAL(f1*fvec8(1, 2, 3, 4, 5, 6, 7, 8), 0.5,   2. ,   4.5,   8. ,  12.5,  18. ,  24.5,  32.);
-    ASSERT_VEC8_EQUAL(f1/fvec8(1, 2, 3, 4, 5, 6, 7, 8), 0.5,  0.5,  0.5,  0.5,  0.5,  0.5,  0.5,  0.5);
-
-    f1 = fvec8(0.5, 1.0, 1.5, 2.0,   2.5, 3.0, 3.5, 4.0);
-    f1 += fvec8(1, 2, 3, 4, 5, 6, 7, 8);
-    ASSERT_VEC8_EQUAL(f1, 1.5,   3. ,   4.5,   6. ,   7.5,   9. ,  10.5,  12.);
-    f1 = fvec8(0.5, 1.0, 1.5, 2.0,   2.5, 3.0, 3.5, 4.0);
-    f1 -= fvec8(1, 2, 3, 4, 5, 6, 7, 8);
-    ASSERT_VEC8_EQUAL(f1, -0.5, -1. , -1.5, -2. , -2.5, -3. , -3.5, -4.);
-    f1 = fvec8(0.5, 1.0, 1.5, 2.0,   2.5, 3.0, 3.5, 4.0);
-    f1 *= fvec8(1, 2, 3, 4, 5, 6, 7, 8);
-    ASSERT_VEC8_EQUAL(f1, 0.5,   2. ,   4.5,   8. ,  12.5,  18. ,  24.5,  32.);
-    f1 = fvec8(0.5, 1.0, 1.5, 2.0,   2.5, 3.0, 3.5, 4.0);
-    f1 /= fvec8(1, 2, 3, 4, 5, 6, 7, 8);
-    ASSERT_VEC8_EQUAL(f1, 0.5,  0.5,  0.5,  0.5,  0.5,  0.5,  0.5,  0.5);
-}
-
 void testLogic() {
    int allBits = -1;
    float allBitsf = *((float*) &allBits);
@@ -154,134 +117,6 @@ void testLogic() {
    ASSERT_VEC8_EQUAL_INT(i1|mask, 1, allBits, allBits, 4, 5, allBits, allBits, 8);
 }

-void testComparisons() {
-    fvec8 v1(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0);
-    fvec8 v2(1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5);
-    ASSERT_VEC8_EQUAL(blend(v1, v2,
-        fvec8(1.0, 1.5, 3.0, 2.2, 10.0, 10.5, 13.0, 12.2)==fvec8(1.1, 1.5, 3.0, 2.1, 10.1, 10.5, 13.0, 12.1)),
-        0.0, 1.5, 1.5, 0.0, 0.0, 1.5, 1.5, 0.0);
-    ASSERT_VEC8_EQUAL(blend(v1, v2,
-        fvec8(1.0, 1.5, 3.0, 2.2, 10.0, 10.5, 13.0, 12.2)!=fvec8(1.1, 1.5, 3.0, 2.1, 10.1, 10.5, 13.0, 12.1)),
-        1.5, 0.0, 0.0, 1.5, 1.5, 0.0, 0.0, 1.5);
-    ASSERT_VEC8_EQUAL(blend(v1, v2,
-        fvec8(1.0, 1.5, 3.0, 2.2, 10.0, 10.5, 13.0, 12.2)<fvec8(1.1, 1.5, 3.0, 2.1, 10.1, 10.5, 13.0, 12.1)),
-        1.5, 0.0, 0.0, 0.0, 1.5, 0.0, 0.0, 0.0);
-    ASSERT_VEC8_EQUAL(blend(v1, v2,
-        fvec8(1.0, 1.5, 3.0, 2.2, 10.0, 10.5, 13.0, 12.2)>fvec8(1.1, 1.5, 3.0, 2.1, 10.1, 10.5, 13.0, 12.1)),
-        0.0, 0.0, 0.0, 1.5, 0.0, 0.0, 0.0, 1.5);
-    ASSERT_VEC8_EQUAL(blend(v1, v2,
-        fvec8(1.0, 1.5, 3.0, 2.2, 10.0, 10.5, 13.0, 12.2)<=fvec8(1.1, 1.5, 3.0, 2.1, 10.1, 10.5, 13.0, 12.1)),
-        1.5, 1.5, 1.5, 0.0, 1.5, 1.5, 1.5, 0.0);
-    ASSERT_VEC8_EQUAL(blend(v1, v2,
-        fvec8(1.0, 1.5, 3.0, 2.2, 10.0, 10.5, 13.0, 12.2)>=fvec8(1.1, 1.5, 3.0, 2.1, 10.1, 10.5, 13.0, 12.1)),
-        0.0, 1.5, 1.5, 1.5, 0.0, 1.5, 1.5, 1.5);
-}
-
-void testMathFunctions() {
-    fvec8 f1(0.4, 1.9, -1.2, -3.8, 0.4, 1.9, -1.2, -3.8);
-    fvec8 f2(1.1, 1.2, 1.3, -5.0, 1.1, 1.2, 1.3, -5.0);
-    ASSERT_VEC8_EQUAL(floor(f1), 0.0, 1.0, -2.0, -4.0, 0.0, 1.0, -2.0, -4.0);
-    ASSERT_VEC8_EQUAL(ceil(f1), 1.0, 2.0, -1.0, -3.0, 1.0, 2.0, -1.0, -3.0);
-    ASSERT_VEC8_EQUAL(round(f1), 0.0, 2.0, -1.0, -4.0, 0.0, 2.0, -1.0, -4.0);
-    ASSERT_VEC8_EQUAL(abs(f1), 0.4, 1.9, 1.2, 3.8, 0.4, 1.9, 1.2, 3.8);
-    ASSERT_VEC8_EQUAL(min(f1, f2), 0.4, 1.2, -1.2, -5.0, 0.4, 1.2, -1.2, -5.0);
-    ASSERT_VEC8_EQUAL(max(f1, f2), 1.1, 1.9, 1.3, -3.8, 1.1, 1.9, 1.3, -3.8);
-    ASSERT_VEC8_EQUAL(sqrt(fvec8(1.5, 3.1, 4.0, 15.0, 1.5, 3.1, 4.0, 15.0)), sqrt(1.5), sqrt(3.1), sqrt(4.0), sqrt(15.0), sqrt(1.5), sqrt(3.1), sqrt(4.0), sqrt(15.0));
-    ASSERT_VEC8_EQUAL(rsqrt(fvec8(1.5, 3.1, 4.0, 15.0, 1.5, 3.1, 4.0, 15.0)), 1.0/sqrt(1.5), 1.0/sqrt(3.1), 1.0/sqrt(4.0), 1.0/sqrt(15.0), 1.0/sqrt(1.5), 1.0/sqrt(3.1), 1.0/sqrt(4.0), 1.0/sqrt(15.0));
-    ASSERT_EQUAL_TOL(f1.lowerVec()[0]*f2.lowerVec()[0]+f1.lowerVec()[1]*f2.lowerVec()[1]+f1.lowerVec()[2]*f2.lowerVec()[2]+f1.lowerVec()[3]*f2.lowerVec()[3]+f1.upperVec()[0]*f2.upperVec()[0]+f1.upperVec()[1]*f2.upperVec()[1]+f1.upperVec()[2]*f2.upperVec()[2]+f1.upperVec()[3]*f2.upperVec()[3], dot8(f1, f2), 1e-6);
-    ASSERT(any(f1 > 0.5));
-    ASSERT(!any(f1 > 2.0));
-    ASSERT_VEC8_EQUAL(blend(f1, f2, ivec8(-1, 0, -1, 0, -1, 0, -1, 0)), 1.1, 1.9, 1.3, -3.8, 1.1, 1.9, 1.3, -3.8);
-}
-
-void testTranspose() {
-    fvec4 f[8] = {
-        {0.0,   1.0,  2.0,  3.0},
-        {10.0, 11.0, 12.0, 13.0},
-        {20.0, 21.0, 22.0, 23.0},
-        {30.0, 31.0, 32.0, 33.0},
-        {40.0, 41.0, 42.0, 43.0},
-        {50.0, 51.0, 52.0, 53.0},
-        {60.0, 61.0, 62.0, 63.0},
-        {70.0, 71.0, 72.0, 73.0}
-    };
-
-    fvec8 o1, o2, o3, o4;
-    transpose(f[0], f[1], f[2], f[3], f[4], f[5], f[6], f[7], o1, o2, o3, o4);
-    ASSERT_VEC8_EQUAL(o1, 0.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0);
-    ASSERT_VEC8_EQUAL(o2, 1.0, 11.0, 21.0, 31.0, 41.0, 51.0, 61.0, 71.0);
-    ASSERT_VEC8_EQUAL(o3, 2.0, 12.0, 22.0, 32.0, 42.0, 52.0, 62.0, 72.0);
-    ASSERT_VEC8_EQUAL(o4, 3.0, 13.0, 23.0, 33.0, 43.0, 53.0, 63.0, 73.0);
-
-    fvec8 q1, q2, q3, q4;
-    transpose(f, q1, q2, q3, q4);
-    ASSERT_VEC8_EQUAL(q1, 0.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0);
-    ASSERT_VEC8_EQUAL(q2, 1.0, 11.0, 21.0, 31.0, 41.0, 51.0, 61.0, 71.0);
-    ASSERT_VEC8_EQUAL(q3, 2.0, 12.0, 22.0, 32.0, 42.0, 52.0, 62.0, 72.0);
-    ASSERT_VEC8_EQUAL(q4, 3.0, 13.0, 23.0, 33.0, 43.0, 53.0, 63.0, 73.0);
-
-    fvec4 g[8];
-    transpose(o1, o2, o3, o4, g[0], g[1], g[2], g[3], g[4], g[5], g[6], g[7]);
-    ASSERT_VEC4_EQUAL(g[0], 0.0,   1.0,  2.0,  3.0);
-    ASSERT_VEC4_EQUAL(g[1], 10.0, 11.0, 12.0, 13.0);
-    ASSERT_VEC4_EQUAL(g[2], 20.0, 21.0, 22.0, 23.0);
-    ASSERT_VEC4_EQUAL(g[3], 30.0, 31.0, 32.0, 33.0);
-    ASSERT_VEC4_EQUAL(g[4], 40.0, 41.0, 42.0, 43.0);
-    ASSERT_VEC4_EQUAL(g[5], 50.0, 51.0, 52.0, 53.0);
-    ASSERT_VEC4_EQUAL(g[6], 60.0, 61.0, 62.0, 63.0);
-    ASSERT_VEC4_EQUAL(g[7], 70.0, 71.0, 72.0, 73.0);
-
-    fvec4 h[8];
-    transpose(o1, o2, o3, o4, h);
-    ASSERT_VEC4_EQUAL(h[0], 0.0,   1.0,  2.0,  3.0);
-    ASSERT_VEC4_EQUAL(h[1], 10.0, 11.0, 12.0, 13.0);
-    ASSERT_VEC4_EQUAL(h[2], 20.0, 21.0, 22.0, 23.0);
-    ASSERT_VEC4_EQUAL(h[3], 30.0, 31.0, 32.0, 33.0);
-    ASSERT_VEC4_EQUAL(h[4], 40.0, 41.0, 42.0, 43.0);
-    ASSERT_VEC4_EQUAL(h[5], 50.0, 51.0, 52.0, 53.0);
-    ASSERT_VEC4_EQUAL(h[6], 60.0, 61.0, 62.0, 63.0);
-    ASSERT_VEC4_EQUAL(h[7], 70.0, 71.0, 72.0, 73.0);
-}
-
-void testUtility() {
-    fvec8 f1(0.4, 1.9, -1.2, -3.8, 0.4, 1.9, -6.8, -3.8);
-    fvec8 f2(1, 2, 4, 7, 19, 31, 64, 5);
-    fvec8 f3(0.5, 1.0, 1.5, 2.0,   2.5, 3.0, 3.5, 4.0);
-
-    // Reduce-add across three vectors into a single vec3.
-    const auto computedVec3 = reduceToVec3(f1, f2, f3);
-    ASSERT_EQUAL(-11, computedVec3[0]);
-    ASSERT_EQUAL(133, computedVec3[1]);
-    ASSERT_EQUAL(18,  computedVec3[2]);
-
-    // Gather values from a table. Variants for both one vector and two vector gathers are provided.
-    float table[2048];
-    for (int i=0; i<2048;++i)
-        table[i] = -i; // Same index to make it easy to debug, but negative to avoid copying idx.
-
-    // Single vector gather.
-    const int vidx[8] = {4, 8, 156, 1987, 23, 65, 33, 1003};
-    fvec8 g(table, vidx);
-    ASSERT_VEC8_EQUAL(g, -4, -8, -156, -1987, -23, -65, -33, -1003);
-
-    // Pair-wise vector gather.
-    fvec8 p0, p1;
-    gatherVecPair(table, ivec8(57, 105, 1976, 91, 636, 1952, 345, 12), p0, p1);
-    ASSERT_VEC8_EQUAL(p0, -57, -105, -1976, -91, -636, -1952, -345, -12);
-    ASSERT_VEC8_EQUAL(p1, -58, -106, -1977, -92, -637, -1953, -346, -13);
-
-    // Verify building blend mask from integer. The mask isn't checked directly, as different platforms
-    // use different types of mask. Instead, check the side effect of using the mask in a blend.
-    const auto elements = fvec8(1, 2, 3, 4, 5, 6, 7, 8);
-    const auto maskZero = fvec8::expandBitsToMask(0);
-    ASSERT_VEC8_EQUAL_INT(blendZero(elements, maskZero), 0, 0, 0, 0, 0, 0, 0, 0);
-    const auto maskOne = fvec8::expandBitsToMask(0b11111111);
-    ASSERT_VEC8_EQUAL_INT(blendZero(elements, maskOne), 1, 2, 3, 4, 5, 6, 7, 8);
-    const auto maskMix = fvec8::expandBitsToMask(0b01101001);
-    ASSERT_VEC8_EQUAL_INT(blendZero(elements, maskMix), 1, 0, 0, 4, 0, 6, 7, 0);
-
-}
-
 int main(int argc, char* argv[]) {
    try {
        if (!isVec8Supported()) {
@@ -289,12 +124,10 @@ int main(int argc, char* argv[]) {
            return 0;
        }
        testLoadStore();
-        testArithmetic();
        testLogic();
-        testComparisons();
-        testMathFunctions();
-        testTranspose();
-        testUtility();
+
+        TestFvec<fvec8>::testAll();
+
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;

--- a/tests/TestVectorizeGeneric.h
+++ b/tests/TestVectorizeGeneric.h
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2014-2020 Stanford University and the Authors.      *
+ * Authors: Daniel Towner                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#pragma once
+
+/**
+ * This tests all sizes of vectorized operations using templated test code.
+ */
+
+#include <algorithm>
+#include <array>
+#include <cmath>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <functional>
+#include <iostream>
+#include <iterator>
+#include <memory.h>
+#include <numeric>
+#include <sstream>
+#include <typeinfo>
+
+/**
+ * Reinterpret the storage of a float as a signed 32-bit integer and return that raw bit pattern.
+ * The bytes are copied with memcpy instead of being read through a pointer cast.
+ */
+static int32_t floatAsIntBits(float f) {
+    static_assert(sizeof(int32_t) == sizeof(float), "float is expected to occupy 32 bits");
+    int32_t bits;
+    memcpy(&bits, &f, sizeof(bits));
+    return bits;
+}
+
+/**
+ * Compare two floating-point values using units-in-last-place (ULP) as a measure of equality. Two values
+ * which are only a few representable values apart are considered to be equal. A single IEEE operation
+ * (add, mul, etc.) is expected to match its scalar counterpart, but sequences of operations might end up
+ * a few ULP apart while still being close enough to be considered equal. ULP comparisons work at any
+ * scale of number, unlike an epsilon-based approach.
+ *
+ * Returns true when a == b, or when both values are finite, have the same sign and their bit patterns
+ * differ by fewer than 4. Returns false in every other case.
+ */
+static bool almostEqual(float a, float b) {
+    // Maybe they really are equal. This shortcut also accepts two identical infinities.
+    if (a == b)
+        return true;
+
+    // A NaN, or an infinity which was not matched exactly above, is never almost equal to anything.
+    if (std::isnan(a) || std::isinf(a) ||
+        std::isnan(b) || std::isinf(b))
+        return false;
+
+    // If they are different signs then they can't be equal. Two very small denormal values might
+    // be very close to each other but either side of 0, but denormals are a corner case which doesn't
+    // deserve to be treated as equal.
+    if (std::signbit(a) != std::signbit(b))
+        return false;
+
+    // The two numbers must be finite values with the same sign, so treat them as basic integers to
+    // get at their ULP values. If they are only a few ULP apart, then they are essentially equal.
+    int32_t intDiff = std::abs(floatAsIntBits(a) - floatAsIntBits(b));
+    return intDiff < 4;
+}
+
+/**
+ * Strict comparison predicate: true only when operator== holds for the two floats.
+ */
+static bool exactlyEqual(float a, float b) {
+    const bool identical = (a == b);
+    return identical;
+}
+
+/**
+ * Print every float stored in an array-like object to the stream, each one followed by ", ".
+ * The object is viewed as sizeof(FVEC) / sizeof(float) consecutive floats; no other formatting is applied.
+ */
+template<typename FVEC>
+void VecToStream(std::ostream& stream, const FVEC& vec)
+{
+    const float* const first = (const float*)&vec;
+    const float* const last = first + sizeof(FVEC) / sizeof(float);
+    for (const float* cursor = first; cursor != last; ++cursor)
+        stream << *cursor << ", ";
+}
+
+/**
+ * Given two vector-like objects, compare each of their elements for equality. The vector objects can be
+ * anything which in memory is a list of 32-bit floating-point values, so SIMD vectors, C arrays or
+ * C++ arrays would all be valid.
+ *
+ * equal_fn decides whether a (computed, expected) pair of floats matches. If any pair is rejected,
+ * OpenMM::throwException is called with the given file and line and a message listing both operands.
+ */
+template<typename S, typename T>
+static void checkElementsEqual(const S& computed, const T& expected,
+                               std::function<bool(float, float)> equal_fn,
+                               const char* file, int line) {
+    // Both S and T should be arrays of floats of the same length.
+    static_assert(sizeof(T) == sizeof(S), "Array-like elements must have the same size");
+
+    const float* const computedValues = (const float*)&computed;
+    const float* const expectedValues = (const float*)&expected;
+
+    // Every element is visited (there is no early exit), counting the pairs which were rejected.
+    int mismatches = 0;
+    for (int idx = 0; idx < int(sizeof(S) / sizeof(float)); ++idx)
+        mismatches += equal_fn(computedValues[idx], expectedValues[idx]) ? 0 : 1;
+
+    if (mismatches == 0)
+        return;
+
+    // Only build the diagnostic text once a failure is known.
+    std::ostringstream details;
+    details << "Error during test for type " << typeid(S).name() << '\n';
+    details << "Values differ. ";
+    VecToStream(details, computed);
+    details << " and ";
+    VecToStream(details, expected);
+    OpenMM::throwException(file, line, details.str());
+}
+
+/**
+ * Convenience wrappers around checkElementsEqual which record the call site via __FILE__ and __LINE__.
+ * ASSERT_VEC_EQUAL compares elements with exactlyEqual; ASSERT_VEC_ALMOST_EQUAL uses the ULP-based
+ * almostEqual instead.
+ */
+#define ASSERT_VEC_EQUAL(computed, expected) {checkElementsEqual(computed, expected, exactlyEqual, __FILE__, __LINE__);}
+#define ASSERT_VEC_ALMOST_EQUAL(computed, expected) {checkElementsEqual(computed, expected, almostEqual, __FILE__, __LINE__);}
+
+/**
+ * Produce a pseudo-random float by scaling rand() into the range -50 to 50.
+ */
+static float getRandomFloat () {
+    const float rawPerUnit = float(RAND_MAX/100.0f); // rand() counts per 1.0 of output
+    const float sample = float(rand());
+    return sample / rawPerUnit - 50.0f;
+}
+
+/**
+ * Build a new array-like object by calling fn on each float of v in turn. Both the input and the
+ * result are viewed as sizeof(FVEC) / sizeof(float) consecutive floats.
+ */
+template<typename FVEC>
+FVEC applyUnaryFn(const FVEC& v, std::function<float(float)> fn) {
+    FVEC result;
+
+    const float* src = (const float*)&v;
+    float* dst = (float*)&result;
+    float* const dstEnd = dst + sizeof(FVEC) / sizeof(float);
+
+    while (dst != dstEnd)
+        *dst++ = fn(*src++);
+
+    return result;
+}
+
+/**
+ * Build a new array-like object whose i'th float is fn(a[i], b[i]). The operands and the result are
+ * all viewed as sizeof(FVEC) / sizeof(float) consecutive floats.
+ */
+template<typename FVEC>
+FVEC applyBinaryFn(const FVEC& a, const FVEC& b, std::function<float(float, float)> fn) {
+    FVEC result;
+
+    const float* lhs = (const float*)&a;
+    const float* rhs = (const float*)&b;
+    float* dst = (float*)&result;
+    float* const dstEnd = dst + sizeof(FVEC) / sizeof(float);
+
+    while (dst != dstEnd)
+        *dst++ = fn(*lhs++, *rhs++);
+
+    return result;
+}
+
+/**
+ * Provide a test fixture class which underpins all verification for a given
+ * type of vector SIMD implementation, as well as providing common utility functions.
+ *
+ * FVEC is treated throughout as a block of sizeof(FVEC) / sizeof(float) consecutive floats.
+ */
+template<typename FVEC>
+class TestFvec {
+public:
+
+    /** Number of float elements held by one FVEC. */
+    static constexpr int numElements = sizeof(FVEC) / sizeof(float);
+
+    static_assert(sizeof(FVEC) % sizeof(float) == 0, "FVEC must be a whole number of floats");
+
+    void testInitializers() const;
+    void testUnaryOps() const;
+    void testBinaryOps() const;
+    void testUtilities() const;
+    void testBlendAndCompare() const;
+    void testTranspose() const;
+
+    /**
+     * Run every test group in turn on a freshly constructed fixture.
+     */
+    static void testAll() {
+        TestFvec<FVEC> testUnit;
+        testUnit.testInitializers();
+        testUnit.testUnaryOps();
+        testUnit.testBinaryOps();
+        testUnit.testUtilities();
+        testUnit.testBlendAndCompare();
+        testUnit.testTranspose();
+    }
+
+    /**
+     * Return a vector whose elements are each drawn from getRandomFloat().
+     */
+    FVEC getRandomFvec() const {
+        float lanes[numElements];
+        for (auto& lane : lanes)
+            lane = getRandomFloat();
+
+        // Copy the bytes across rather than writing one union member and reading another,
+        // because reading the inactive member of a union is undefined behaviour in C++.
+        FVEC v;
+        std::memcpy(&v, lanes, sizeof(lanes));
+        return v;
+    }
+
+};
+
+/**
+ * Check the ways of building an FVEC: empty-brace initialisation, broadcast of a single float,
+ * load from a float array, gather from a table through an index array, and gatherVecPair.
+ */
+template<typename FVEC>
+void TestFvec<FVEC>::testInitializers() const {
+    FVEC computedZero = {};
+    float expectedZero[numElements] = {};
+    ASSERT_VEC_EQUAL(computedZero, expectedZero);
+
+    FVEC computedBroadcast(14.5f);
+    float expectedBroadcast[numElements];
+    std::fill_n(expectedBroadcast, numElements, 14.5f);
+    ASSERT_VEC_EQUAL(computedBroadcast, expectedBroadcast);
+
+    float expectedArray[numElements];
+    std::iota(expectedArray, expectedArray + numElements, 23);
+    FVEC computedFromLoad(expectedArray);
+    ASSERT_VEC_EQUAL(computedFromLoad, expectedArray);
+
+    // Gather values from a table. Variants for both one vector and two vector gathers are provided.
+    // The indexes to gather (multiples of 7) are also generated, along with the expected answers.
+    float gatherTable[2048];
+    for (int i=0; i<2048;++i)
+        gatherTable[i] = -i; // Same index to make it easy to debug, but negative to avoid copying idx.
+
+    int gatherIndexes[numElements];
+    float gatherIndexesAsFloat[numElements]; // Same as above, but in float format.
+    float expectedGather0[numElements];
+    float expectedGather1[numElements];
+    for (int i=0; i<numElements; ++i)
+    {
+        gatherIndexes[i] = i * 7;
+        gatherIndexesAsFloat[i] = float(gatherIndexes[i]);
+        expectedGather0[i] = -(i * 7);
+        expectedGather1[i] = -(i * 7) - 1; // The table entry one past the gathered index, i.e. gatherTable[i * 7 + 1].
+    }
+
+    // Single value gather
+    FVEC computedFromGather(gatherTable, gatherIndexes);
+    ASSERT_VEC_EQUAL(computedFromGather, expectedGather0);
+
+    // Pair-wise vector gather. The first values should be the same as a normal gather, and the
+    // second are the table entries immediately following the first. Note that there must be some
+    // suitable conversion from a floating-point index (i.e., an integer value in float format) to the
+    // type required for the second operand of gatherVecPair. gatherVecPair can then take either an
+    // actual float vector, or some suitable format like ivec4 or ivec8.
+    FVEC findex(gatherIndexesAsFloat);
+    FVEC p0, p1;
+    gatherVecPair(gatherTable, findex, p0, p1);
+    ASSERT_VEC_EQUAL(p0, expectedGather0);
+    ASSERT_VEC_EQUAL(p1, expectedGather1);
+}
+
+/**
+ * Check the unary operations (abs, negation, floor, ceil, round, sqrt and rsqrt) on a random vector,
+ * using per-element scalar results as the reference.
+ */
+template<typename FVEC>
+void TestFvec<FVEC>::testUnaryOps() const {
+    const auto v = getRandomFvec();
+
+    // Note that these are exact comparisons because all these SIMD operators are
+    // just applying the scalar operator, so there should be no loss of precision.
+
+    ASSERT_VEC_EQUAL(abs(v), applyUnaryFn(v, [](float x) { return std::abs(x);} ));
+
+    ASSERT_VEC_EQUAL(-v, applyUnaryFn(v, [](float x) { return 0 - x;} ));
+
+    ASSERT_VEC_EQUAL(floor(v), applyUnaryFn(v, [](float x) { return std::floor(x);} ));
+    ASSERT_VEC_EQUAL(ceil(v), applyUnaryFn(v, [](float x) { return std::ceil(x);} ));
+    ASSERT_VEC_EQUAL(round(v), applyUnaryFn(v, [](float x) { return std::round(x);} ));
+
+    // Borrow a few other functions to test sqrt neatly: squaring a value which is at least 1 and then
+    // taking the root should give back (almost) the starting value.
+    const auto positiveValue = abs(v) + 1;
+    ASSERT_VEC_ALMOST_EQUAL(sqrt(positiveValue * positiveValue), positiveValue);
+    ASSERT_VEC_ALMOST_EQUAL(rsqrt(positiveValue * positiveValue), 1.0f / abs(positiveValue));
+}
+
+/**
+ * Check the binary arithmetic operators, their compound-assignment forms, the mixed vector/scalar
+ * forms, and min/max on random operands, using per-element scalar results as the reference.
+ */
+template<typename FVEC>
+void TestFvec<FVEC>::testBinaryOps() const {
+    const auto v0 = getRandomFvec();
+    const auto v1 = getRandomFvec();
+
+    // Note that most of these are exact comparisons because all these SIMD operators are
+    // just applying the scalar operator, so there should be no loss of precision. The one
+    // exception is division, which does often do something slightly different
+    // since division is an expensive operation (e.g., multiply by reciprocal).
+
+    // Binary operators.
+    ASSERT_VEC_EQUAL(v0 + v1, applyBinaryFn(v0, v1, std::plus<float>()));
+    ASSERT_VEC_EQUAL(v0 - v1, applyBinaryFn(v0, v1, std::minus<float>()));
+    ASSERT_VEC_EQUAL(v0 * v1, applyBinaryFn(v0, v1, std::multiplies<float>()));
+    ASSERT_VEC_ALMOST_EQUAL(v0 / v1, applyBinaryFn(v0, v1, std::divides<float>()));
+
+    // Assignment operators.
+    auto addAssign = v0;
+    addAssign += v1;
+    ASSERT_VEC_EQUAL(addAssign, applyBinaryFn(v0, v1, std::plus<float>()));
+
+    auto subAssign = v0;
+    subAssign -= v1;
+    ASSERT_VEC_EQUAL(subAssign, applyBinaryFn(v0, v1, std::minus<float>()));
+
+    auto mulAssign = v0;
+    mulAssign *= v1;
+    ASSERT_VEC_EQUAL(mulAssign, applyBinaryFn(v0, v1, std::multiplies<float>()));
+
+    auto divAssign = v0;
+    divAssign /= v1;
+    ASSERT_VEC_ALMOST_EQUAL(divAssign, applyBinaryFn(v0, v1, std::divides<float>()));
+
+    // Binary ops between SIMD and scalar. The scalar is broadcast into fdup to build the reference.
+    const float f = getRandomFloat();
+    const FVEC fdup(f);
+
+    ASSERT_VEC_EQUAL(v0 + f, applyBinaryFn(v0, fdup, std::plus<float>()));
+    ASSERT_VEC_EQUAL(f + v0, applyBinaryFn(fdup, v0, std::plus<float>()));
+    ASSERT_VEC_EQUAL(v0 - f, applyBinaryFn(v0, fdup, std::minus<float>()));
+    ASSERT_VEC_EQUAL(f - v0, applyBinaryFn(fdup, v0, std::minus<float>()));
+    ASSERT_VEC_EQUAL(v0 * f, applyBinaryFn(v0, fdup, std::multiplies<float>()));
+    ASSERT_VEC_EQUAL(f * v0, applyBinaryFn(fdup, v0, std::multiplies<float>()));
+    ASSERT_VEC_ALMOST_EQUAL(v0 / f, applyBinaryFn(v0, fdup, std::divides<float>()));
+    ASSERT_VEC_ALMOST_EQUAL(f / v0, applyBinaryFn(fdup, v0, std::divides<float>()));
+
+    // Binary functions. The using-declarations make std::min/std::max visible to the scalar lambdas.
+    using std::min;
+    using std::max;
+    ASSERT_VEC_EQUAL(min(v0, v1),
+                     applyBinaryFn(v0, v1, [](float x, float y) { return min(x, y); }));
+    ASSERT_VEC_EQUAL(max(v0, v1),
+                     applyBinaryFn(v0, v1, [](float x, float y) { return max(x, y); }));
+}
+
+/**
+ * Exercise both transpose() overloads: four FVEC rows into an array of fvec4
+ * columns, and that array back into four FVEC rows.
+ */
+template<typename FVEC>
+void TestFvec<FVEC>::testTranspose() const {
+    // Four rows of numElements random values each, rounded to whole numbers.
+    float table[numElements * 4];
+    for (float& cell : table) {
+        cell = std::round(getRandomFloat());
+    }
+
+    // Reference result built with scalar indexing: element (row, col) of the
+    // table lands at position (col, row) of a numElements x 4 layout.
+    std::array<float, numElements * 4> scalarTranspose;
+    for (auto col = 0; col < numElements; ++col) {
+        for (auto row = 0; row < 4; ++row) {
+            scalarTranspose[col * 4 + row] = table[row * numElements + col];
+        }
+    }
+
+    // Each table row becomes one input vector.
+    const FVEC rowIn0(table + 0 * numElements);
+    const FVEC rowIn1(table + 1 * numElements);
+    const FVEC rowIn2(table + 2 * numElements);
+    const FVEC rowIn3(table + 3 * numElements);
+
+    // Forward direction: rows -> array of 4-wide columns.
+    fvec4 simdTranspose[numElements];
+    transpose(rowIn0, rowIn1, rowIn2, rowIn3, simdTranspose);
+    ASSERT_VEC_EQUAL(simdTranspose, scalarTranspose);
+
+    // Reverse direction: transposing back must reproduce the original rows.
+    FVEC rowOut0;
+    FVEC rowOut1;
+    FVEC rowOut2;
+    FVEC rowOut3;
+    transpose(simdTranspose, rowOut0, rowOut1, rowOut2, rowOut3);
+
+    ASSERT_VEC_EQUAL(rowIn0, rowOut0);
+    ASSERT_VEC_EQUAL(rowIn1, rowOut1);
+    ASSERT_VEC_EQUAL(rowIn2, rowOut2);
+    ASSERT_VEC_EQUAL(rowIn3, rowOut3);
+}
+
+/**
+ * Check mask expansion, blend()/blendZero(), and the four ordering comparisons.
+ *
+ * Different targets use different types of mask, so masks are never inspected
+ * directly; each one is fed to a blend and the blended vector is checked instead.
+ */
+template<typename FVEC>
+void TestFvec<FVEC>::testBlendAndCompare() const {
+    const FVEC allZero = {};
+    const FVEC allOne(1.0f);
+    const FVEC allTwo(2.0f);
+
+    // No bits set: blend keeps its first argument, blendZero yields zeros.
+    const auto noLanes = FVEC::expandBitsToMask(0);
+    ASSERT_VEC_EQUAL(blend(allOne, allTwo, noLanes), allOne);
+    ASSERT_VEC_EQUAL(blendZero(allOne, noLanes), allZero);
+
+    // Every bit set: blend takes its second argument, blendZero passes the value through.
+    const auto allLanes = FVEC::expandBitsToMask(-1);
+    ASSERT_VEC_EQUAL(blend(allOne, allTwo, allLanes), allTwo);
+    ASSERT_VEC_EQUAL(blendZero(allOne, allLanes), allOne);
+
+    // A 16-bit pattern, big enough to cover most SIMD lengths.
+    const int lanePattern = 0b1100001101101001;
+    const auto someLanes = FVEC::expandBitsToMask(lanePattern);
+    float blendExpected[numElements];
+    float blendZeroExpected[numElements];
+    for (int lane = 0; lane < numElements; ++lane) {
+        const bool laneSelected = ((lanePattern >> lane) & 1) != 0;
+        if (laneSelected) {
+            blendExpected[lane] = 2.0f;
+            blendZeroExpected[lane] = 2.0f;
+        } else {
+            blendExpected[lane] = 1.0f;
+            blendZeroExpected[lane] = 0.0f;
+        }
+    }
+    ASSERT_VEC_EQUAL(blend(allOne, allTwo, someLanes), blendExpected);
+    ASSERT_VEC_EQUAL(blendZero(allTwo, someLanes), blendZeroExpected);
+
+    // Comparisons on random vectors: the comparison result is used as the blend
+    // mask, so a lane holds 2 where the comparison is true and 1 where it is false.
+    const auto lhs = getRandomFvec();
+    const auto rhs = getRandomFvec();
+
+    const auto pickLess = [](float x, float y) { return x < y ? 2.0f : 1.0f; };
+    const auto pickLessEqual = [](float x, float y) { return x <= y ? 2.0f : 1.0f; };
+    const auto pickGreater = [](float x, float y) { return x > y ? 2.0f : 1.0f; };
+    const auto pickGreaterEqual = [](float x, float y) { return x >= y ? 2.0f : 1.0f; };
+
+    ASSERT_VEC_EQUAL(blend(allOne, allTwo, lhs < rhs), applyBinaryFn(lhs, rhs, pickLess));
+    ASSERT_VEC_EQUAL(blend(allOne, allTwo, lhs <= rhs), applyBinaryFn(lhs, rhs, pickLessEqual));
+    // A vector compared with itself must satisfy <= in every lane.
+    ASSERT_VEC_EQUAL(blend(allOne, allTwo, lhs <= lhs), allTwo);
+    ASSERT_VEC_EQUAL(blend(allOne, allTwo, lhs > rhs), applyBinaryFn(lhs, rhs, pickGreater));
+    ASSERT_VEC_EQUAL(blend(allOne, allTwo, lhs >= rhs), applyBinaryFn(lhs, rhs, pickGreaterEqual));
+    // Likewise for >=.
+    ASSERT_VEC_EQUAL(blend(allOne, allTwo, lhs >= lhs), allTwo);
+}
+
+/**
+ * Check the horizontal reductions reduceAdd() and reduceToVec3().
+ *
+ * The inputs are rounded to whole numbers on purpose. Reductions are very
+ * sensitive to the order in which elements are accumulated, so with arbitrary
+ * random values the outcome would depend more on which numbers were drawn than
+ * on the code under test. Rounded values behave sanely under reduction, which
+ * keeps this a test of the reduction rather than of the float format.
+ */
+template<typename FVEC>
+void TestFvec<FVEC>::testUtilities() const {
+    const auto a = round(getRandomFvec());
+    const auto b = round(getRandomFvec());
+    const auto c = round(getRandomFvec());
+
+    // Scalar reference: view the vector's storage as numElements floats and
+    // sum them from first to last, starting from 0.0f.
+    const auto laneSum = [](const auto& vec) {
+        const float* lanes = reinterpret_cast<const float*>(&vec);
+        return std::accumulate(lanes, lanes + numElements, 0.0f);
+    };
+    const auto sumA = laneSum(a);
+    const auto sumB = laneSum(b);
+    const auto sumC = laneSum(c);
+
+    ASSERT_VEC_EQUAL(reduceAdd(a), sumA);
+
+    // Three vectors reduced by addition into the first three lanes of one fvec4.
+    // The final lane of that result is undefined, so the expected vector simply
+    // copies whatever value the computed vector holds there.
+    const auto reducedTriple = reduceToVec3(a, b, c);
+    const fvec4 expectedTriple(sumA, sumB, sumC, reducedTriple[3]);
+    ASSERT_VEC_EQUAL(reducedTriple, expectedTriple);
+}
\ No newline at end of file