CPU: Added support for AVX2.

170ac691 · Daniel Towner · 8c43e37a · 170ac691 · 170ac691 · 170ac691
Commit 170ac691 authored Jun 09, 2020 by Daniel Towner
19 changed files
--- a/openmmapi/include/openmm/internal/vectorize8.h
+++ b/openmmapi/include/openmm/internal/vectorize8.h
-#ifndef OPENMM_VECTORIZE8_H_
-#define OPENMM_VECTORIZE8_H_
+#ifndef OPENMM_VECTORIZEAVX_H_
+#define OPENMM_VECTORIZEAVX_H_

 /* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
@@ -57,9 +57,7 @@ public:
     * @param table The table from which to do a lookup.
     * @param indexes The indexes to gather.
     */
-    fvec8(const float* table, const int idx[8]) {
-        // :TODO: Using int32_t explicitly as the index type could allow the real gather instruction to be used.
-        // Use gather and static assert? Conditional code?
+    fvec8(const float* table, const int32_t idx[8]) {
        val = _mm256_setr_ps(table[idx[0]], table[idx[1]], table[idx[2]], table[idx[3]], table[idx[4]], table[idx[5]], table[idx[6]], table[idx[7]]);
    }

@@ -415,4 +413,4 @@ static inline fvec4 reduceToVec3(fvec8 x, fvec8 y, fvec8 z) {
    return laneResult.lowerVec() + laneResult.upperVec();
 }

-#endif /*OPENMM_VECTORIZE8_H_*/
+#endif /*OPENMM_VECTORIZEAVX_H_*/
--- a/openmmapi/include/openmm/internal/vectorizeAvx2.h
+++ b/openmmapi/include/openmm/internal/vectorizeAvx2.h
+#ifndef OPENMM_VECTORIZE_AVX2_H_
+#define OPENMM_VECTORIZE_AVX2_H_
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013-2014 Stanford University and the Authors.      *
+ * Authors: Daniel Towner                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "vectorizeAvx.h"
+#include <immintrin.h>
+
+// This file defines classes and functions to simplify vectorizing code with AVX.
+
+bool isAvx2Supported() {
+
+    // Provide an alternative implementation of CPUID to support AVX2. On older
+    // non-Windows OSes the hardware.h support for CPUID doesn't set the CX register
+    // properly and gives the wrong answer when detecting AVX2 and beyond. On Windows
+    // the cpuid seems to work as expected so can be used.
+#if !(defined(_WIN32) || defined(WIN32))
+    auto cpuid = [](int output[4], int functionnumber) {
+        int a, b, c, d;
+        __asm("cpuid" : "=a"(a),"=b"(b),"=c"(c),"=d"(d) : "a"(functionnumber), "c"(0) : );
+        output[0] = a;
+        output[1] = b;
+        output[2] = c;
+        output[3] = d;
+    };
+#endif
+
+    int cpuInfo[4];
+    cpuid(cpuInfo, 0);
+
+    if (cpuInfo[0] >= 7) {
+        cpuInfo[2] = 0;
+        cpuid(cpuInfo, 7);
+        return ((cpuInfo[1] & ((int) 1 << 5)) != 0);
+    }
+
+    return false;
+}
+
+/** 
+ * Derive from fvec8 so that default implementations of everything are provided,
+ * but can be overriden with AVX2-specific variants where possible.
+ */
+class fvecAvx2 : public fvec8 {
+public:
+
+    fvecAvx2() = default;
+    fvecAvx2(fvec8 v) : fvec8(v) {}
+    fvecAvx2(float v) : fvec8(v) {}
+    fvecAvx2(float v1, float v2, float v3, float v4, float v5, float v6, float v7, float v8) : fvec8(v8, v7, v6, v5, v4, v3, v2, v1) {}
+    fvecAvx2(__m256 v) : fvec8(v) {}
+    fvecAvx2(const float* v) : fvec8(v) {}
+
+    /** Create a vector by gathering individual indexes of data from a table. Element i of the vector will
+     * be loaded from table[idx[i]].
+     * @param table The table from which to do a lookup.
+     * @param indexes The indexes to gather.
+     */
+    fvecAvx2(const float* table, const int idx[8])
+        : fvec8(_mm256_i32gather_ps(table, _mm256_loadu_si256((const __m256i*)idx), 4)) {}
+
+    static fvecAvx2 expandBitsToMask(int bitmask);
+};
+
+inline fvecAvx2 fvecAvx2::expandBitsToMask(int bitmask) {
+    // Put a copy of all bits into each vector element and then shift so that the
+    // appropriate sub-bit becomes the MSB. For masking purposes, only the MSB matters and
+    // the other bits can be completely arbitrary.
+    const auto msb = _mm256_sllv_epi32(_mm256_set1_epi8(bitmask),
+                                       _mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0));
+
+    return _mm256_castsi256_ps(msb);
+}
+
+#endif /*OPENMM_VECTORIZE_AVX2_H_*/
--- a/openmmapi/include/openmm/internal/vectorize_neon.h
+++ b/openmmapi/include/openmm/internal/vectorize_neon.h
@@ -89,7 +89,7 @@ public:
      * @param table The table from which to do a lookup.
      * @param indexes The indexes to gather.
      */
-    fvec4(const float* table, const int idx[4])
+    fvec4(const float* table, const int32_t idx[4])
        : fvec4(table[idx[0]], table[idx[1]], table[idx[2]], table[idx[3]]) { }

    float operator[](int i) const {

--- a/openmmapi/include/openmm/internal/vectorize_pnacl.h
+++ b/openmmapi/include/openmm/internal/vectorize_pnacl.h
@@ -74,7 +74,7 @@ public:
     * @param table The table from which to do a lookup.
     * @param indexes The indexes to gather.
     */
-    fvec4(const float* table, const int idx[4])
+    fvec4(const float* table, const int32_t idx[4])
        : fvec4(table[idx[0]], table[idx[1]], table[idx[2]], table[idx[3]]) { }

    operator __m128() const {

--- a/openmmapi/include/openmm/internal/vectorize_sse.h
+++ b/openmmapi/include/openmm/internal/vectorize_sse.h
@@ -80,7 +80,7 @@ public:
     * @param table The table from which to do a lookup.
     * @param indexes The indexes to gather.
     */
-    fvec4(const float* table, const int idx[4])
+    fvec4(const float* table, const int32_t idx[4])
        : fvec4(table[idx[0]], table[idx[1]], table[idx[2]], table[idx[3]]) { }

    operator __m128() const {

--- a/platforms/cpu/include/CpuNeighborList.h
+++ b/platforms/cpu/include/CpuNeighborList.h
@@ -51,7 +51,7 @@ public:
            const Vec3* periodicBoxVectors, bool usePeriodic, float maxDistance, ThreadPool& threads);
    int getNumBlocks() const;
    int getBlockSize() const;
-    const std::vector<int>& getSortedAtoms() const;
+    const std::vector<int32_t>& getSortedAtoms() const;
    const std::vector<int>& getBlockNeighbors(int blockIndex) const;

    /**

--- a/platforms/cpu/include/CpuNonbondedForceFvec.h
+++ b/platforms/cpu/include/CpuNonbondedForceFvec.h
@@ -140,7 +140,7 @@ void CpuNonbondedForceFvec<FVEC>::calculateBlockIxnHandler(int blockIndex, float
        using std::min;
        using std::max;

-        const int* blockAtom = &neighborList->getSortedAtoms()[blockSize*blockIndex];
+        const int32_t* blockAtom = &neighborList->getSortedAtoms()[blockSize*blockIndex];
        float minx, maxx, miny, maxy, minz, maxz;
        minx = maxx = posq[4*blockAtom[0]];
        miny = maxy = posq[4*blockAtom[0]+1];
@@ -183,7 +183,7 @@ template <int PERIODIC_TYPE, BlockType BLOCK_TYPE>
 void CpuNonbondedForceFvec<FVEC>::calculateBlockIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter) {
    // Load the positions and parameters of the atoms in the block.

-    const int* blockAtom = &neighborList->getSortedAtoms()[blockSize * blockIndex];
+    const int32_t* blockAtom = &neighborList->getSortedAtoms()[blockSize * blockIndex];
    fvec4 blockAtomPosq[blockSize];
    FVEC blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
    FVEC blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge;

--- a/platforms/cpu/sharedTarget/CMakeLists.txt
+++ b/platforms/cpu/sharedTarget/CMakeLists.txt
 FOREACH(file ${SOURCE_FILES})
-    IF(file MATCHES ".*Vec8.*")
-        IF(MSVC)
-            SET_SOURCE_FILES_PROPERTIES(${file} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} /arch:AVX /D__AVX__")
-        ELSEIF(X86)
-            SET_SOURCE_FILES_PROPERTIES(${file} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -msse4.1 -mavx")
-        ELSE()
-            SET_SOURCE_FILES_PROPERTIES(${file} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
-        ENDIF()
-    ELSE()
-        IF(X86 AND NOT MSVC)
-            SET_SOURCE_FILES_PROPERTIES(${file} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -msse4.1")
-        ENDIF()
+    IF(X86 AND NOT MSVC)
+        SET_SOURCE_FILES_PROPERTIES(${file} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -msse4.1")
    ENDIF()
 ENDFOREACH(file)
+
+# Override some sources files with platform specific flags.
+IF(MSVC)
+    SET_SOURCE_FILES_PROPERTIES(${CMAKE_SOURCE_DIR}/platforms/cpu/src/CpuNonbondedForceAvx.cpp PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} /arch:AVX /D__AVX__")
+    SET_SOURCE_FILES_PROPERTIES(${CMAKE_SOURCE_DIR}/platforms/cpu/src/CpuNonbondedForceAvx2.cpp PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} /arch:AVX2 /D__AVX2__")
+ELSEIF(X86)
+    SET_SOURCE_FILES_PROPERTIES(${CMAKE_SOURCE_DIR}/platforms/cpu/src/CpuNonbondedForceAvx.cpp PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -mavx")
+    SET_SOURCE_FILES_PROPERTIES(${CMAKE_SOURCE_DIR}/platforms/cpu/src/CpuNonbondedForceAvx2.cpp PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -mavx2 -mfma")
+ENDIF()
+
 ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})

 TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${OPENMM_LIBRARY_NAME} ${PTHREADS_LIB})

--- a/platforms/cpu/src/CpuCustomGBForce.cpp
+++ b/platforms/cpu/src/CpuCustomGBForce.cpp
@@ -361,7 +361,7 @@ void CpuCustomGBForce::calculateParticlePairValue(int index, ThreadData& data, i
            if (blockIndex >= neighborList->getNumBlocks())
                break;
            const int blockSize = neighborList->getBlockSize();
-            const int* blockAtom = &neighborList->getSortedAtoms()[blockSize*blockIndex];
+            const int32_t* blockAtom = &neighborList->getSortedAtoms()[blockSize*blockIndex];
            const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
            const auto& blockExclusions = neighborList->getBlockExclusions(blockIndex);
            for (int i = 0; i < (int) neighbors.size(); i++) {
@@ -456,7 +456,7 @@ void CpuCustomGBForce::calculateParticlePairEnergyTerm(int index, ThreadData& da
            if (blockIndex >= neighborList->getNumBlocks())
                break;
            const int blockSize = neighborList->getBlockSize();
-            const int* blockAtom = &neighborList->getSortedAtoms()[blockSize*blockIndex];
+            const int32_t* blockAtom = &neighborList->getSortedAtoms()[blockSize*blockIndex];
            const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
            const auto& blockExclusions = neighborList->getBlockExclusions(blockIndex);
            for (int i = 0; i < (int) neighbors.size(); i++) {
@@ -543,7 +543,7 @@ void CpuCustomGBForce::calculateChainRuleForces(ThreadData& data, int numAtoms,
            if (blockIndex >= neighborList->getNumBlocks())
                break;
            const int blockSize = neighborList->getBlockSize();
-            const int* blockAtom = &neighborList->getSortedAtoms()[blockSize*blockIndex];
+            const int32_t* blockAtom = &neighborList->getSortedAtoms()[blockSize*blockIndex];
            const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
            const auto& blockExclusions = neighborList->getBlockExclusions(blockIndex);
            for (int i = 0; i < (int) neighbors.size(); i++) {

--- a/platforms/cpu/src/CpuCustomNonbondedForce.cpp
+++ b/platforms/cpu/src/CpuCustomNonbondedForce.cpp
@@ -193,7 +193,7 @@ void CpuCustomNonbondedForce::threadComputeForce(ThreadPool& threads, int thread
            if (blockIndex >= neighborList->getNumBlocks())
                break;
            const int blockSize = neighborList->getBlockSize();
-            const int* blockAtom = &neighborList->getSortedAtoms()[blockSize*blockIndex];
+            const int32_t* blockAtom = &neighborList->getSortedAtoms()[blockSize*blockIndex];
            const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
            const auto& exclusions = neighborList->getBlockExclusions(blockIndex);
            for (int i = 0; i < (int) neighbors.size(); i++) {

--- a/platforms/cpu/src/CpuGayBerneForce.cpp
+++ b/platforms/cpu/src/CpuGayBerneForce.cpp
@@ -181,7 +181,7 @@ void CpuGayBerneForce::threadComputeForce(ThreadPool& threads, int threadIndex,
            if (blockIndex >= neighborList->getNumBlocks())
                break;
            const int blockSize = neighborList->getBlockSize();
-            const int* blockAtom = &neighborList->getSortedAtoms()[blockSize*blockIndex];
+            const int32_t* blockAtom = &neighborList->getSortedAtoms()[blockSize*blockIndex];
            const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
            const auto& exclusions = neighborList->getBlockExclusions(blockIndex);
            for (int i = 0; i < (int) neighbors.size(); i++) {

--- a/platforms/cpu/src/CpuNeighborList.cpp
+++ b/platforms/cpu/src/CpuNeighborList.cpp
@@ -501,7 +501,7 @@ int CpuNeighborList::getBlockSize() const {
    return blockSize;
 }

-const std::vector<int>& CpuNeighborList::getSortedAtoms() const {
+const std::vector<int32_t>& CpuNeighborList::getSortedAtoms() const {
    return sortedAtoms;
 }


--- a/platforms/cpu/src/CpuNonbondedForceVec8.cpp
+++ b/platforms/cpu/src/CpuNonbondedForceVec8.cpp
@@ -27,9 +27,9 @@

 #ifdef __AVX__

-#include "openmm/internal/vectorize8.h"
+#include "openmm/internal/vectorizeAvx.h"

-bool isVec8Supported() {
+bool isAvxSupported() {
    // Make sure the CPU supports AVX.
    int cpuInfo[4];
    cpuid(cpuInfo, 0);
@@ -40,16 +40,16 @@ bool isVec8Supported() {
    return false;
 }

-OpenMM::CpuNonbondedForce* createCpuNonbondedForceVec8() {
+OpenMM::CpuNonbondedForce* createCpuNonbondedForceAvx() {
    return new OpenMM::CpuNonbondedForceFvec<fvec8>();
 }

 #else
-bool isVec8Supported() {
+bool isAvxSupported() {
    return false;
 }

-OpenMM::CpuNonbondedForce* createCpuNonbondedForceVec8() {
+OpenMM::CpuNonbondedForce* createCpuNonbondedForceAvx() {
   throw OpenMM::OpenMMException("Internal error: OpenMM was compiled without AVX support");
 }
 #endif
--- a/platforms/cpu/src/CpuNonbondedForceAvx2.cpp
+++ b/platforms/cpu/src/CpuNonbondedForceAvx2.cpp
+
+/* Portions copyright (c) 2006-2015 Stanford University and Simbios.
+ * Contributors: Daniel Towner
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "CpuNonbondedForceFvec.h"
+#include "openmm/OpenMMException.h"
+
+#ifdef __AVX2__
+
+#include "openmm/internal/vectorizeAvx2.h"
+OpenMM::CpuNonbondedForce* createCpuNonbondedForceAvx2() {
+    return new OpenMM::CpuNonbondedForceFvec<fvecAvx2>();
+}
+
+#else
+
+bool isAvx2Supported() {
+    return false;
+}
+
+OpenMM::CpuNonbondedForce* createCpuNonbondedForceAvx2() {
+   throw OpenMM::OpenMMException("Internal error: OpenMM was compiled without AVX2 support");
+}
+#endif
--- a/platforms/cpu/src/CpuNonbondedForceFvec.cpp
+++ b/platforms/cpu/src/CpuNonbondedForceFvec.cpp
@@ -25,19 +25,25 @@
 #include "CpuNonbondedForceFvec.h"

 OpenMM::CpuNonbondedForce* createCpuNonbondedForceVec4();
-OpenMM::CpuNonbondedForce* createCpuNonbondedForceVec8();
+OpenMM::CpuNonbondedForce* createCpuNonbondedForceAvx();
+OpenMM::CpuNonbondedForce* createCpuNonbondedForceAvx2();

-bool isVec8Supported();
+bool isAvxSupported();
+bool isAvx2Supported();
+
+#include <iostream>

 OpenMM::CpuNonbondedForce* createCpuNonbondedForceVec() {
-    if (isVec8Supported())
-        return createCpuNonbondedForceVec8();
+    if (isAvx2Supported())
+        return createCpuNonbondedForceAvx2();
+    else if (isAvxSupported())
+        return createCpuNonbondedForceAvx();
    else
        return createCpuNonbondedForceVec4();
 }

 int getVecBlockSize() {
-    if (isVec8Supported())
+    if (isAvx2Supported() || isAvxSupported())
        return 8;
    else
        return 4;

--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -8,7 +8,7 @@ ENABLE_TESTING()
 FILE(GLOB TEST_PROGS "*Test*.cpp")
 FOREACH(TEST_PROG ${TEST_PROGS})
    GET_FILENAME_COMPONENT(TEST_ROOT ${TEST_PROG} NAME_WE)
-    IF ((${TEST_ROOT} MATCHES TestVectorize8) AND NOT X86)
+    IF ((${TEST_ROOT} MATCHES TestVectorizeAvx*) AND NOT X86)
        CONTINUE()
    ENDIF()
    ADD_EXECUTABLE(${TEST_ROOT} ${TEST_PROG})
@@ -21,9 +21,12 @@ FOREACH(TEST_PROG ${TEST_PROGS})
    IF((${TEST_ROOT} MATCHES TestVectorize) AND X86 AND NOT MSVC)
        SET(EXTRA_TEST_FLAGS "${EXTRA_COMPILE_FLAGS} -msse4.1")
    ENDIF()
-    IF((${TEST_ROOT} MATCHES TestVectorize8) AND X86 AND NOT MSVC)
+    IF((${TEST_ROOT} MATCHES TestVectorizeAvx) AND X86 AND NOT MSVC)
        SET(EXTRA_TEST_FLAGS "${EXTRA_COMPILE_FLAGS} -mavx")
    ENDIF()
+    IF((${TEST_ROOT} MATCHES TestVectorizeAvx2) AND X86 AND NOT MSVC)
+        SET(EXTRA_TEST_FLAGS "${EXTRA_COMPILE_FLAGS} -mfma -mavx2")
+    ENDIF()
    SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}" COMPILE_FLAGS "${EXTRA_TEST_FLAGS}")
    ADD_TEST(${TEST_ROOT} ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT})
 ENDFOREACH(TEST_PROG ${TEST_PROGS})

--- a/tests/TestVectorize8.cpp
+++ b/tests/TestVectorize8.cpp
@@ -34,7 +34,7 @@
 */

 #include "openmm/internal/AssertionUtilities.h"
-#include "openmm/internal/vectorize8.h"
+#include "openmm/internal/vectorizeAvx.h"
 #include <iostream>

 #include "TestVectorizeGeneric.h"

--- a/tests/TestVectorizeAvx2.cpp
+++ b/tests/TestVectorizeAvx2.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2014-2015 Stanford University and the Authors.      *
+ * Authors: Daniel Towner                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+/**
+ * This tests vectorized operations.
+ */
+
+#include "openmm/internal/AssertionUtilities.h"
+
+#include <iostream>
+
+#ifndef __AVX2__
+int main () {
+    std::cout << "AVX2 CPU is not supported. Exiting." << std::endl;
+    return 0;
+}
+#else
+
+#include "openmm/internal/vectorizeAvx2.h"
+#include "TestVectorizeGeneric.h"
+
+using namespace OpenMM;
+
+int main(int argc, char* argv[]) {
+    try {
+        if (!isAvx2Supported()) {
+            std::cout << "CPU is not supported. Exiting." << std::endl;
+            return 0;
+        }
+
+        TestFvec<fvecAvx2>::testAll();
+    }
+    catch(const std::exception& e) {
+        std::cout << "exception: " << e.what() << std::endl;
+        return 1;
+    }
+    std::cout << "Done" << std::endl;
+    return 0;
+}
+
+#endif
\ No newline at end of file
--- a/tests/TestVectorizeGeneric.h
+++ b/tests/TestVectorizeGeneric.h
@@ -338,12 +338,10 @@ void TestFvec<FVEC>::testBinaryOps() const {
    ASSERT_VEC_ALMOST_EQUAL(f / v0, applyBinaryFn(fdup, v0, std::divides<float>()));

    // Binary functions.
-    using std::min;
-    using std::max;
    ASSERT_VEC_EQUAL(min(v0, v1),
-                     applyBinaryFn(v0, v1, [](float x, float y) { return min(x, y); }));
+                     applyBinaryFn(v0, v1, [](float x, float y) { return std::min<float>(x, y); }));
    ASSERT_VEC_EQUAL(max(v0, v1),
-                     applyBinaryFn(v0, v1, [](float x, float y) { return max(x, y); }));
+                     applyBinaryFn(v0, v1, [](float x, float y) { return std::max<float>(x, y); }));
 }

 template<typename FVEC>