Commit 939ecf28 authored by one
Browse files

Optimize HIP pair-list handling for CDNA LJPME

- Use bitwise prefix accounting when storing sparse interactions as single pairs in the HIP pair-list kernel. This reduces the number of ballot operations needed to compute per-lane single-pair offsets.
- For HIP CDNA single precision, set MAX_BITS_FOR_PAIRS to 8 regardless of system size (previously 4 for systems with fewer than 100000 atoms and 0 for larger ones), so more sparse interactions are emitted as single pairs instead of full tiles. Keep the existing double precision and RDNA thresholds unchanged.
- Also simplify the HIP LJPME direct correction by computing alpha^2*r2 directly from r2 instead of first forming alpha*r and squaring it.
parent 14f1b515
...@@ -41,8 +41,12 @@ ...@@ -41,8 +41,12 @@
#if DO_LJPME #if DO_LJPME
// The multiplicative term to correct for the multiplicative terms that are always // The multiplicative term to correct for the multiplicative terms that are always
// present in reciprocal space. // present in reciprocal space.
#if defined(USE_HIP)
const real dar2 = EWALD_DISPERSION_ALPHA*EWALD_DISPERSION_ALPHA*r2;
#else
const real dispersionAlphaR = EWALD_DISPERSION_ALPHA*r; const real dispersionAlphaR = EWALD_DISPERSION_ALPHA*r;
const real dar2 = dispersionAlphaR*dispersionAlphaR; const real dar2 = dispersionAlphaR*dispersionAlphaR;
#endif
const real dar4 = dar2*dar2; const real dar4 = dar2*dar2;
const real dar6 = dar4*dar2; const real dar6 = dar4*dar2;
const real invR2 = invR*invR; const real invR2 = invR*invR;
......
...@@ -549,11 +549,9 @@ void HipNonbondedUtilities::createKernelsForGroups(int groups) { ...@@ -549,11 +549,9 @@ void HipNonbondedUtilities::createKernelsForGroups(int groups) {
} }
else { else {
if (context.getSIMDWidth() > 32) { if (context.getSIMDWidth() > 32) {
// CDNA // CDNA wave64 GPUs benefit from storing more sparse
if (context.getNumAtoms() < 100000) // interactions as single pairs.
maxBits = 4; maxBits = 8;
else // Large systems
maxBits = 0;
} }
else { else {
// RDNA // RDNA
......
...@@ -553,10 +553,10 @@ extern "C" __global__ __launch_bounds__(GROUP_SIZE) void findBlocksWithInteracti ...@@ -553,10 +553,10 @@ extern "C" __global__ __launch_bounds__(GROUP_SIZE) void findBlocksWithInteracti
if (__ballot(storeAsSinglePair)) { if (__ballot(storeAsSinglePair)) {
unsigned int sum = 0; unsigned int sum = 0;
unsigned int prevSum = 0; unsigned int prevSum = 0;
for (int i = 1; i <= MAX_BITS_FOR_PAIRS; i++) { for (int bit = 1; bit <= MAX_BITS_FOR_PAIRS; bit <<= 1) {
warpflags b = __ballot(interactCount == i); warpflags b = __ballot(storeAsSinglePair && ((interactCount & bit) != 0));
sum += warpPopc(b) * i; sum += warpPopc(b) * bit;
prevSum += warpPopc(b&warpMask) * i; prevSum += warpPopc(b&warpMask) * bit;
} }
unsigned int pairStartIndex = 0; unsigned int pairStartIndex = 0;
if (indexInWarp == 0) if (indexInWarp == 0)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment