Commit f8dc7aa1 authored by Daniel Towner's avatar Daniel Towner
Browse files

CPU: Various optimisations which improve AVX/AVX2

parent e4c1f73c
......@@ -101,4 +101,36 @@ inline fvecAvx2 fvecAvx2::expandBitsToMask(int bitmask) {
return _mm256_castsi256_ps(msb);
}
/**
* Given a table of floating-point values and a set of indexes, perform a gather read into a pair
* of vectors. The first result vector contains the values at the given indexes, and the second
* result vector contains the values from each respective index+1.
*/
static inline void gatherVecPair(const float* table, ivec8 index, fvecAvx2& out0, fvecAvx2& out1) {
const double* tableAsDbl = (const double*)table;
// The input is a set of 8 indexes, each of which refers to a pair of floating point
// values. The most efficient way to load from indexes in a vector is the gather instruction,
// and the 64-bit variant should be used to get the pairs.
// Given indexes ABCDEFGH, load the pairs corresponding to A C E G. This gives a set of
// 4 pairs. The high indexes (in the upper part of each 64-bit index) are cleared.
const auto lowerIdx = _mm256_and_si256(index, _mm256_set1_epi64x(0xFFFFFFFF));
const auto lowerGather = _mm256_castpd_ps(_mm256_i64gather_pd(tableAsDbl, lowerIdx, 4));
// Load indexes B D F H, this time by shifting the high 32-bit indexes into the lower 32-bits.
const auto upperIdx = _mm256_srli_epi64(index, 32);
const auto upperGather = _mm256_castpd_ps(_mm256_i64gather_pd(tableAsDbl, upperIdx, 4));
// All the first values must now be brought together. The lower values are already in the
// correct place, but the upper gather values must be moved over and blended in.
const auto swapUpper = _mm256_permute_ps(upperGather, 0b10110001);
out0 = fvecAvx2(_mm256_blend_ps(lowerGather, swapUpper, 0b10101010));
// And the same for the upper values.
const auto swapLower = _mm256_permute_ps(lowerGather, 0b10110001);
out1 = fvecAvx2(_mm256_blend_ps(swapLower, upperGather, 0b10101010));
}
#endif /*OPENMM_VECTORIZE_AVX2_H_*/
......@@ -109,10 +109,8 @@ CpuNonbondedForceFvec<FVEC>::approximateFunctionFromTable(const std::vector<floa
FVEC s1, s2;
gatherVecPair(table.data(), index, s1, s2);
const auto coeff2 = x1-FVEC(index);
const auto coeff1 = 1.0f-coeff2;
return coeff1*s1 + coeff2*s2;
const auto coeff2 = x1 - FVEC(index);
return (s1 - coeff2 * s1) + coeff2 * s2;
}
template<typename FVEC>
......@@ -216,6 +214,8 @@ void CpuNonbondedForceFvec<FVEC>::calculateBlockIxnImpl(int blockIndex, float* f
// Loop over neighbors for this block.
const auto& neighbors = neighborList->getBlockNeighbors(blockIndex);
const auto& exclusions = neighborList->getBlockExclusions(blockIndex);
FVEC partialEnergy = {};
for (int i = 0; i < (int) neighbors.size(); i++) {
// Load the next neighbor.
......@@ -230,7 +230,7 @@ void CpuNonbondedForceFvec<FVEC>::calculateBlockIxnImpl(int blockIndex, float* f
getDeltaR<PERIODIC_TYPE>(atomPos, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, boxSize, invBoxSize);
const auto exclNotMask = FVEC::expandBitsToMask(~exclusions[i]);
const auto include = blendZero(r2 < cutoffDistance*cutoffDistance, exclNotMask);
const auto include = blendZero(r2 < cutoffDistanceSquared, exclNotMask);
if (!any(include))
continue; // No interactions to compute.
......@@ -296,7 +296,8 @@ void CpuNonbondedForceFvec<FVEC>::calculateBlockIxnImpl(int blockIndex, float* f
energy += chargeProd*inverseR;
}
energy = blendZero(energy, include);
*totalEnergy += reduceAdd(energy);
partialEnergy += energy;
}
// Accumulate forces.
......@@ -313,6 +314,9 @@ void CpuNonbondedForceFvec<FVEC>::calculateBlockIxnImpl(int blockIndex, float* f
newAtomForce.store(atomForce);
}
if (totalEnergy)
*totalEnergy += reduceAdd(partialEnergy);
// Record the forces on the block atoms.
fvec4 f[blockSize];
transpose(blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f, f);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment