Commit f8dc7aa1 authored by Daniel Towner's avatar Daniel Towner
Browse files

CPU: Various optimisations which improve AVX/AVX2

parent e4c1f73c
...@@ -101,4 +101,36 @@ inline fvecAvx2 fvecAvx2::expandBitsToMask(int bitmask) { ...@@ -101,4 +101,36 @@ inline fvecAvx2 fvecAvx2::expandBitsToMask(int bitmask) {
return _mm256_castsi256_ps(msb); return _mm256_castsi256_ps(msb);
} }
/**
* Given a table of floating-point values and a set of indexes, perform a gather read into a pair
* of vectors. The first result vector contains the values at the given indexes, and the second
* result vector contains the values from each respective index+1.
*/
static inline void gatherVecPair(const float* table, ivec8 index, fvecAvx2& out0, fvecAvx2& out1) {
const double* tableAsDbl = (const double*)table;
// The input is a set of 8 indexes, each of which refers to a pair of floating point
// values. The most efficient way to load from indexes in a vector is the gather instruction,
// and the 64-bit variant should be used to get the pairs.
// Given indexes ABCDEFGH, load the pairs corresponding to A C E G. This gives a set of
// 4 pairs. The high indexes (in the upper part of each 64-bit index) are cleared.
const auto lowerIdx = _mm256_and_si256(index, _mm256_set1_epi64x(0xFFFFFFFF));
const auto lowerGather = _mm256_castpd_ps(_mm256_i64gather_pd(tableAsDbl, lowerIdx, 4));
// Load indexes B D F H, this time by shifting the high 32-bit indexes into the lower 32-bits.
const auto upperIdx = _mm256_srli_epi64(index, 32);
const auto upperGather = _mm256_castpd_ps(_mm256_i64gather_pd(tableAsDbl, upperIdx, 4));
// All the first values must now be brought together. The lower values are already in the
// correct place, but the upper gather values must be moved over and blended in.
const auto swapUpper = _mm256_permute_ps(upperGather, 0b10110001);
out0 = fvecAvx2(_mm256_blend_ps(lowerGather, swapUpper, 0b10101010));
// And the same for the upper values.
const auto swapLower = _mm256_permute_ps(lowerGather, 0b10110001);
out1 = fvecAvx2(_mm256_blend_ps(swapLower, upperGather, 0b10101010));
}
#endif /*OPENMM_VECTORIZE_AVX2_H_*/ #endif /*OPENMM_VECTORIZE_AVX2_H_*/
...@@ -109,10 +109,8 @@ CpuNonbondedForceFvec<FVEC>::approximateFunctionFromTable(const std::vector<floa ...@@ -109,10 +109,8 @@ CpuNonbondedForceFvec<FVEC>::approximateFunctionFromTable(const std::vector<floa
FVEC s1, s2; FVEC s1, s2;
gatherVecPair(table.data(), index, s1, s2); gatherVecPair(table.data(), index, s1, s2);
const auto coeff2 = x1-FVEC(index); const auto coeff2 = x1 - FVEC(index);
const auto coeff1 = 1.0f-coeff2; return (s1 - coeff2 * s1) + coeff2 * s2;
return coeff1*s1 + coeff2*s2;
} }
template<typename FVEC> template<typename FVEC>
...@@ -216,6 +214,8 @@ void CpuNonbondedForceFvec<FVEC>::calculateBlockIxnImpl(int blockIndex, float* f ...@@ -216,6 +214,8 @@ void CpuNonbondedForceFvec<FVEC>::calculateBlockIxnImpl(int blockIndex, float* f
// Loop over neighbors for this block. // Loop over neighbors for this block.
const auto& neighbors = neighborList->getBlockNeighbors(blockIndex); const auto& neighbors = neighborList->getBlockNeighbors(blockIndex);
const auto& exclusions = neighborList->getBlockExclusions(blockIndex); const auto& exclusions = neighborList->getBlockExclusions(blockIndex);
FVEC partialEnergy = {};
for (int i = 0; i < (int) neighbors.size(); i++) { for (int i = 0; i < (int) neighbors.size(); i++) {
// Load the next neighbor. // Load the next neighbor.
...@@ -230,7 +230,7 @@ void CpuNonbondedForceFvec<FVEC>::calculateBlockIxnImpl(int blockIndex, float* f ...@@ -230,7 +230,7 @@ void CpuNonbondedForceFvec<FVEC>::calculateBlockIxnImpl(int blockIndex, float* f
getDeltaR<PERIODIC_TYPE>(atomPos, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, boxSize, invBoxSize); getDeltaR<PERIODIC_TYPE>(atomPos, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, boxSize, invBoxSize);
const auto exclNotMask = FVEC::expandBitsToMask(~exclusions[i]); const auto exclNotMask = FVEC::expandBitsToMask(~exclusions[i]);
const auto include = blendZero(r2 < cutoffDistance*cutoffDistance, exclNotMask); const auto include = blendZero(r2 < cutoffDistanceSquared, exclNotMask);
if (!any(include)) if (!any(include))
continue; // No interactions to compute. continue; // No interactions to compute.
...@@ -296,7 +296,8 @@ void CpuNonbondedForceFvec<FVEC>::calculateBlockIxnImpl(int blockIndex, float* f ...@@ -296,7 +296,8 @@ void CpuNonbondedForceFvec<FVEC>::calculateBlockIxnImpl(int blockIndex, float* f
energy += chargeProd*inverseR; energy += chargeProd*inverseR;
} }
energy = blendZero(energy, include); energy = blendZero(energy, include);
*totalEnergy += reduceAdd(energy);
partialEnergy += energy;
} }
// Accumulate forces. // Accumulate forces.
...@@ -313,6 +314,9 @@ void CpuNonbondedForceFvec<FVEC>::calculateBlockIxnImpl(int blockIndex, float* f ...@@ -313,6 +314,9 @@ void CpuNonbondedForceFvec<FVEC>::calculateBlockIxnImpl(int blockIndex, float* f
newAtomForce.store(atomForce); newAtomForce.store(atomForce);
} }
if (totalEnergy)
*totalEnergy += reduceAdd(partialEnergy);
// Record the forces on the block atoms. // Record the forces on the block atoms.
fvec4 f[blockSize]; fvec4 f[blockSize];
transpose(blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f, f); transpose(blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f, f);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment