CPU: Various optimisations which improve AVX/AVX2

f8dc7aa1 · Daniel Towner · e4c1f73c · f8dc7aa1 · f8dc7aa1
Commit f8dc7aa1 authored Jun 15, 2020 by Daniel Towner
Showing with 42 additions and 6 deletions

openmmapi/include/openmm/internal/vectorizeAvx2.h openmmapi/include/openmm/internal/vectorizeAvx2.h +32 -0

platforms/cpu/include/CpuNonbondedForceFvec.h platforms/cpu/include/CpuNonbondedForceFvec.h +10 -6

No files found.
--- a/openmmapi/include/openmm/internal/vectorizeAvx2.h
+++ b/openmmapi/include/openmm/internal/vectorizeAvx2.h
@@ -101,4 +101,36 @@ inline fvecAvx2 fvecAvx2::expandBitsToMask(int bitmask) {
    return _mm256_castsi256_ps(msb);
 }

+/**
+ * Given a table of floating-point values and a set of indexes, perform a gather read into a pair
+ * of vectors. The first result vector contains the values at the given indexes, and the second
+ * result vector contains the values from each respective index+1.
+ */
+static inline void gatherVecPair(const float* table, ivec8 index, fvecAvx2& out0, fvecAvx2& out1) {
+    const double* tableAsDbl = (const double*)table;
+
+    // The input is a set of 8 indexes, each of which refers to a pair of floating point
+    // values. The most efficient way to load from indexes in a vector is the gather instruction,
+    // and the 64-bit variant should be used to get the pairs.
+
+    // Given indexes ABCDEFGH, load the pairs corresponding to A C E G. This gives a set of
+    // 4 pairs. The high indexes (in the upper part of each 64-bit index) are cleared.
+    const auto lowerIdx = _mm256_and_si256(index, _mm256_set1_epi64x(0xFFFFFFFF));
+    const auto lowerGather = _mm256_castpd_ps(_mm256_i64gather_pd(tableAsDbl, lowerIdx, 4));
+
+    // Load indexes B D F H, this time by shifting the high 32-bit indexes into the lower 32-bits.
+    const auto upperIdx = _mm256_srli_epi64(index, 32);
+    const auto upperGather = _mm256_castpd_ps(_mm256_i64gather_pd(tableAsDbl, upperIdx, 4));
+
+    // All the first values must now be brought together. The lower values are already in the
+    // correct place, but the upper gather values must be moved over and blended in.
+    const auto swapUpper = _mm256_permute_ps(upperGather, 0b10110001);
+    out0 = fvecAvx2(_mm256_blend_ps(lowerGather, swapUpper, 0b10101010));
+
+    // And the same for the upper values.
+    const auto swapLower = _mm256_permute_ps(lowerGather, 0b10110001);
+    out1 = fvecAvx2(_mm256_blend_ps(swapLower, upperGather, 0b10101010));
+
+}
+
 #endif /*OPENMM_VECTORIZE_AVX2_H_*/
--- a/platforms/cpu/include/CpuNonbondedForceFvec.h
+++ b/platforms/cpu/include/CpuNonbondedForceFvec.h
@@ -109,10 +109,8 @@ CpuNonbondedForceFvec<FVEC>::approximateFunctionFromTable(const std::vector<floa
    FVEC s1, s2;
    gatherVecPair(table.data(), index, s1, s2);

-    const auto coeff2 = x1-FVEC(index);
-    const auto coeff1 = 1.0f-coeff2;
-
-    return coeff1*s1 + coeff2*s2;
+    const auto coeff2 = x1 - FVEC(index);
+    return (s1 - coeff2 * s1) + coeff2 * s2;
 }

 template<typename FVEC>
@@ -216,6 +214,8 @@ void CpuNonbondedForceFvec<FVEC>::calculateBlockIxnImpl(int blockIndex, float* f
    // Loop over neighbors for this block.
    const auto& neighbors = neighborList->getBlockNeighbors(blockIndex);
    const auto& exclusions = neighborList->getBlockExclusions(blockIndex);
+    FVEC partialEnergy = {};
+
    for (int i = 0; i < (int) neighbors.size(); i++) {
        // Load the next neighbor.
        
@@ -230,7 +230,7 @@ void CpuNonbondedForceFvec<FVEC>::calculateBlockIxnImpl(int blockIndex, float* f
        getDeltaR<PERIODIC_TYPE>(atomPos, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, boxSize, invBoxSize);

        const auto exclNotMask = FVEC::expandBitsToMask(~exclusions[i]);
-        const auto include = blendZero(r2 < cutoffDistance*cutoffDistance, exclNotMask);
+        const auto include = blendZero(r2 < cutoffDistanceSquared, exclNotMask);
        if (!any(include))
            continue; // No interactions to compute.

@@ -296,7 +296,8 @@ void CpuNonbondedForceFvec<FVEC>::calculateBlockIxnImpl(int blockIndex, float* f
                    energy += chargeProd*inverseR;
            }
            energy = blendZero(energy, include);
-            *totalEnergy += reduceAdd(energy);
+
+            partialEnergy += energy;
        }

        // Accumulate forces.
@@ -313,6 +314,9 @@ void CpuNonbondedForceFvec<FVEC>::calculateBlockIxnImpl(int blockIndex, float* f
        newAtomForce.store(atomForce);
    }
    
+    if (totalEnergy)
+        *totalEnergy += reduceAdd(partialEnergy);
+
    // Record the forces on the block atoms.
    fvec4 f[blockSize];
    transpose(blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f, f);