Port changes from the main repository (mostly related to large blocks)

Skip neighbor list for very small systems https://github.com/openmm/openmm/pull/4070 Store bounding box sizes in half precision https://github.com/openmm/openmm/commit/2ae50f9 Use large blocks to optimize building the neighbor list https://github.com/openmm/openmm/commit/3955033 Improved sorting of blocks when building neighbor list https://github.com/openmm/openmm/commit/796ffaa Fixed bug in large blocks optimization with triclinic boxes https://github.com/openmm/openmm/commit/4c10732 Optimize sorting of non-uniformly distributed data https://github.com/openmm/openmm/commit/71d9bb1 Co-authored-by: bdenhollander <44237618+bdenhollander@users.noreply.github.com>

Port changes from the main repository (mostly related to large blocks)
Skip neighbor list for very small systems https://github.com/openmm/openmm/pull/4070 Store bounding box sizes in half precision https://github.com/openmm/openmm/commit/2ae50f9 Use large blocks to optimize building the neighbor list https://github.com/openmm/openmm/commit/3955033 Improved sorting of blocks when building neighbor list https://github.com/openmm/openmm/commit/796ffaa Fixed bug in large blocks optimization with triclinic boxes https://github.com/openmm/openmm/commit/4c10732 Optimize sorting of non-uniformly distributed data https://github.com/openmm/openmm/commit/71d9bb1 Co-authored-by: bdenhollander <44237618+bdenhollander@users.noreply.github.com>
7d7490ea · Anton Gorenko · f965707b · 7d7490ea · 7d7490ea · 7d7490ea
Unverified Commit 7d7490ea authored Aug 25, 2024 by Anton Gorenko
7 changed files
--- a/platforms/hip/include/HipNonbondedUtilities.h
+++ b/platforms/hip/include/HipNonbondedUtilities.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2022 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2023 Stanford University and the Authors.      *
 * Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc.              *
 * Authors: Peter Eastman, Nicholas Curtis                                    *
 * Contributors:                                                              *
@@ -82,8 +82,10 @@ public:
     * @param exclusionList    for each atom, specifies the list of other atoms whose interactions should be excluded
     * @param kernel           the code to evaluate the interaction
     * @param forceGroup       the force group in which the interaction should be calculated
+     * @param usesNeighborList specifies whether a neighbor list should be used to optimize this interaction.  This should
+     *                         be viewed as only a suggestion.  Even when it is false, a neighbor list may be used anyway.
     */
-    void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList, const std::string& kernel, int forceGroup);
+    void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList, const std::string& kernel, int forceGroup, bool usesNeighborList = true);
    /**
     * Add a nonbonded interaction to be evaluated by the default interaction kernel.
     *
@@ -94,9 +96,11 @@ public:
     * @param exclusionList    for each atom, specifies the list of other atoms whose interactions should be excluded
     * @param kernel           the code to evaluate the interaction
     * @param forceGroup       the force group in which the interaction should be calculated
+     * @param usesNeighborList specifies whether a neighbor list should be used to optimize this interaction.  This should
+     *                         be viewed as only a suggestion.  Even when it is false, a neighbor list may be used anyway.
     * @param supportsPairList specifies whether this interaction can work with a neighbor list that uses a separate pair list
     */
-    void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList, const std::string& kernel, int forceGroup, bool supportsPairList);
+    void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList, const std::string& kernel, int forceGroup, bool usesNeighborList, bool supportsPairList);
    /**
     * Add a per-atom parameter that the default interaction kernel may depend on.
     */
@@ -335,12 +339,15 @@ private:
    HipArray sortedBlocks;
    HipArray sortedBlockCenter;
    HipArray sortedBlockBoundingBox;
+    HipArray blockSizeRange;
+    HipArray largeBlockCenter;
+    HipArray largeBlockBoundingBox;
    HipArray oldPositions;
    HipArray rebuildNeighborList;
    HipSort* blockSorter;
    hipEvent_t downloadCountEvent;
    unsigned int* pinnedCountBuffer;
-    std::vector<void*> forceArgs, findBlockBoundsArgs, sortBoxDataArgs, findInteractingBlocksArgs, copyInteractionCountsArgs;
+    std::vector<void*> forceArgs, findBlockBoundsArgs, computeSortKeysArgs, sortBoxDataArgs, findInteractingBlocksArgs, copyInteractionCountsArgs;
    std::vector<std::vector<int> > atomExclusions;
    std::vector<ParameterInfo> parameters;
    std::vector<ParameterInfo> arguments;
@@ -348,7 +355,7 @@ private:
    std::map<int, double> groupCutoff;
    std::map<int, std::string> groupKernelSource;
    double lastCutoff;
-    bool useCutoff, usePeriodic, anyExclusions, usePadding, forceRebuildNeighborList, canUsePairList;
+    bool useCutoff, usePeriodic, anyExclusions, usePadding, useNeighborList, forceRebuildNeighborList, canUsePairList, useLargeBlocks;
    int startTileIndex, startBlockIndex, numBlocks, numTilesInBatch, maxExclusions;
    int numForceThreadBlocks, forceThreadBlockSize, findInteractingBlocksThreadBlockSize, numAtoms, groupFlags;
    unsigned int maxTiles, maxSinglePairs, tilesAfterReorder;
@@ -367,6 +374,7 @@ public:
    std::string source;
    hipFunction_t forceKernel, energyKernel, forceEnergyKernel;
    hipFunction_t findBlockBoundsKernel;
+    hipFunction_t computeSortKeysKernel;
    hipFunction_t sortBoxDataKernel;
    hipFunction_t findInteractingBlocksKernel;
    hipFunction_t copyInteractionCountsKernel;

--- a/platforms/hip/src/HipKernels.cpp
+++ b/platforms/hip/src/HipKernels.cpp
@@ -995,7 +995,7 @@ void HipCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
    }
    source = cu.replaceStrings(source, replacements);
    if (force.getIncludeDirectSpace())
-        cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup(), true);
+        cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup(), numParticles > 3000, true);

    // Initialize the exceptions.


--- a/platforms/hip/src/HipNonbondedUtilities.cpp
+++ b/platforms/hip/src/HipNonbondedUtilities.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2022 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2023 Stanford University and the Authors.      *
 * Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc.              *
 * Authors: Peter Eastman, Nicholas Curtis                                    *
 * Contributors:                                                              *
@@ -50,21 +50,18 @@ using namespace std;

 class HipNonbondedUtilities::BlockSortTrait : public HipSort::SortTrait {
 public:
-    BlockSortTrait(bool useDouble) : useDouble(useDouble) {
-    }
-    int getDataSize() const {return useDouble ? sizeof(double2) : sizeof(float2);}
-    int getKeySize() const {return useDouble ? sizeof(double) : sizeof(float);}
-    const char* getDataType() const {return "real2";}
-    const char* getKeyType() const {return "real";}
-    const char* getMinKey() const {return "-3.40282e+38f";}
-    const char* getMaxKey() const {return "3.40282e+38f";}
-    const char* getMaxValue() const {return "make_real2(3.40282e+38f, 3.40282e+38f)";}
-    const char* getSortKey() const {return "value.x";}
-private:
-    bool useDouble;
+    BlockSortTrait() {}
+    int getDataSize() const {return sizeof(int);}
+    int getKeySize() const {return sizeof(int);}
+    const char* getDataType() const {return "unsigned int";}
+    const char* getKeyType() const {return "unsigned int";}
+    const char* getMinKey() const {return "0";}
+    const char* getMaxKey() const {return "0xFFFFFFFFu";}
+    const char* getMaxValue() const {return "0xFFFFFFFFu";}
+    const char* getSortKey() const {return "value";}
 };

-HipNonbondedUtilities::HipNonbondedUtilities(HipContext& context) : context(context), useCutoff(false), usePeriodic(false), anyExclusions(false), usePadding(true),
+HipNonbondedUtilities::HipNonbondedUtilities(HipContext& context) : context(context), useCutoff(false), usePeriodic(false), useNeighborList(false), anyExclusions(false), usePadding(true),
        blockSorter(NULL), pinnedCountBuffer(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0), canUsePairList(true), tilesAfterReorder(0) {
    // Decide how many thread blocks to use.

@@ -74,6 +71,13 @@ HipNonbondedUtilities::HipNonbondedUtilities(HipContext& context) : context(cont
    numForceThreadBlocks = 5*4*context.getMultiprocessors();
    forceThreadBlockSize = 64;
    findInteractingBlocksThreadBlockSize = context.getSIMDWidth();
+
+    // When building the neighbor list, we can optionally use large blocks (32 * warpSize atoms) to
+    // accelerate the process.  This makes building the neighbor list faster, but it prevents
+    // us from sorting atom blocks by size, which leads to a slightly less efficient neighbor
+    // list.  We guess based on system size which will be faster.
+
+    useLargeBlocks = (context.getNumAtoms() > 90000);
    setKernelSource(HipKernelSources::nonbonded);
 }

@@ -85,11 +89,11 @@ HipNonbondedUtilities::~HipNonbondedUtilities() {
    hipEventDestroy(downloadCountEvent);
 }

-void HipNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup) {
-    addInteraction(usesCutoff, usesPeriodic, usesExclusions, cutoffDistance, exclusionList, kernel, forceGroup, false);
+void HipNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup, bool usesNeighborList) {
+    addInteraction(usesCutoff, usesPeriodic, usesExclusions, cutoffDistance, exclusionList, kernel, forceGroup, usesNeighborList, false);
 }

-void HipNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup, bool supportsPairList) {
+void HipNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup, bool usesNeighborList, bool supportsPairList) {
    if (groupCutoff.size() > 0) {
        if (usesCutoff != useCutoff)
            throw OpenMMException("All Forces must agree on whether to use a cutoff");
@@ -102,6 +106,7 @@ void HipNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, b
        requestExclusions(exclusionList);
    useCutoff = usesCutoff;
    usePeriodic = usesPeriodic;
+    useNeighborList |= (usesNeighborList && useCutoff);
    groupCutoff[forceGroup] = cutoffDistance;
    groupFlags |= 1<<forceGroup;
    canUsePairList &= supportsPairList;
@@ -290,15 +295,23 @@ void HipNonbondedUtilities::initialize(const System& system) {
        int elementSize = (context.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
        blockCenter.initialize(context, numAtomBlocks, 4*elementSize, "blockCenter");
        blockBoundingBox.initialize(context, numAtomBlocks, 4*elementSize, "blockBoundingBox");
-        sortedBlocks.initialize(context, numAtomBlocks, 2*elementSize, "sortedBlocks");
+        sortedBlocks.initialize<unsigned int>(context, numAtomBlocks, "sortedBlocks");
        sortedBlockCenter.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockCenter");
        sortedBlockBoundingBox.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockBoundingBox");
+        blockSizeRange.initialize(context, 2, elementSize, "blockSizeRange");
+        largeBlockCenter.initialize(context, numAtomBlocks, 4*elementSize, "largeBlockCenter");
+        largeBlockBoundingBox.initialize(context, numAtomBlocks*4, elementSize, "largeBlockBoundingBox");
        oldPositions.initialize(context, numAtoms, 4*elementSize, "oldPositions");
        rebuildNeighborList.initialize<int>(context, 1, "rebuildNeighborList");
-        blockSorter = new HipSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks, false);
+        blockSorter = new HipSort(context, new BlockSortTrait(), numAtomBlocks, false);
        vector<unsigned int> count(2, 0);
        interactionCount.upload(count);
        rebuildNeighborList.upload(&count[0]);
+        if (context.getUseDoublePrecision()) {
+            blockSizeRange.upload(vector<double>{1e38, 0});
+        } else {
+            blockSizeRange.upload(vector<float>{1e38, 0});
+        }
    }

    // Record arguments for kernels.
@@ -342,17 +355,30 @@ void HipNonbondedUtilities::initialize(const System& system) {
        findBlockBoundsArgs.push_back(&blockCenter.getDevicePointer());
        findBlockBoundsArgs.push_back(&blockBoundingBox.getDevicePointer());
        findBlockBoundsArgs.push_back(&rebuildNeighborList.getDevicePointer());
-        findBlockBoundsArgs.push_back(&sortedBlocks.getDevicePointer());
+        findBlockBoundsArgs.push_back(&blockSizeRange.getDevicePointer());
+        computeSortKeysArgs.push_back(&blockBoundingBox.getDevicePointer());
+        computeSortKeysArgs.push_back(&sortedBlocks.getDevicePointer());
+        computeSortKeysArgs.push_back(&blockSizeRange.getDevicePointer());
        sortBoxDataArgs.push_back(&sortedBlocks.getDevicePointer());
        sortBoxDataArgs.push_back(&blockCenter.getDevicePointer());
        sortBoxDataArgs.push_back(&blockBoundingBox.getDevicePointer());
        sortBoxDataArgs.push_back(&sortedBlockCenter.getDevicePointer());
        sortBoxDataArgs.push_back(&sortedBlockBoundingBox.getDevicePointer());
+        if (useLargeBlocks) {
+            sortBoxDataArgs.push_back(&largeBlockCenter.getDevicePointer());
+            sortBoxDataArgs.push_back(&largeBlockBoundingBox.getDevicePointer());
+            sortBoxDataArgs.push_back(context.getPeriodicBoxSizePointer());
+            sortBoxDataArgs.push_back(context.getInvPeriodicBoxSizePointer());
+            sortBoxDataArgs.push_back(context.getPeriodicBoxVecXPointer());
+            sortBoxDataArgs.push_back(context.getPeriodicBoxVecYPointer());
+            sortBoxDataArgs.push_back(context.getPeriodicBoxVecZPointer());
+        }
        sortBoxDataArgs.push_back(&context.getPosq().getDevicePointer());
        sortBoxDataArgs.push_back(&oldPositions.getDevicePointer());
        sortBoxDataArgs.push_back(&interactionCount.getDevicePointer());
        sortBoxDataArgs.push_back(&rebuildNeighborList.getDevicePointer());
        sortBoxDataArgs.push_back(&forceRebuildNeighborList);
+        sortBoxDataArgs.push_back(&blockSizeRange.getDevicePointer());
        findInteractingBlocksArgs.push_back(context.getPeriodicBoxSizePointer());
        findInteractingBlocksArgs.push_back(context.getInvPeriodicBoxSizePointer());
        findInteractingBlocksArgs.push_back(context.getPeriodicBoxVecXPointer());
@@ -370,6 +396,10 @@ void HipNonbondedUtilities::initialize(const System& system) {
        findInteractingBlocksArgs.push_back(&sortedBlocks.getDevicePointer());
        findInteractingBlocksArgs.push_back(&sortedBlockCenter.getDevicePointer());
        findInteractingBlocksArgs.push_back(&sortedBlockBoundingBox.getDevicePointer());
+        if (useLargeBlocks) {
+            findInteractingBlocksArgs.push_back(&largeBlockCenter.getDevicePointer());
+            findInteractingBlocksArgs.push_back(&largeBlockBoundingBox.getDevicePointer());
+        }
        findInteractingBlocksArgs.push_back(&exclusionIndices.getDevicePointer());
        findInteractingBlocksArgs.push_back(&exclusionRowIndices.getDevicePointer());
        findInteractingBlocksArgs.push_back(&oldPositions.getDevicePointer());
@@ -396,23 +426,24 @@ void HipNonbondedUtilities::prepareInteractions(int forceGroups) {
        return;
    if (groupKernels.find(forceGroups) == groupKernels.end())
        createKernelsForGroups(forceGroups);
-    if (!useCutoff)
-        return;
-    if (numTiles == 0)
-        return;
    KernelSet& kernels = groupKernels[forceGroups];
-    if (usePeriodic) {
+    if (useCutoff && usePeriodic) {
        double4 box = context.getPeriodicBoxSize();
        double minAllowedSize = 1.999999*kernels.cutoffDistance;
        if (box.x < minAllowedSize || box.y < minAllowedSize || box.z < minAllowedSize)
            throw OpenMMException("The periodic box size has decreased to less than twice the nonbonded cutoff.");
    }
+    if (!useNeighborList)
+        return;
+    if (numTiles == 0)
+        return;

    // Compute the neighbor list.

    if (lastCutoff != kernels.cutoffDistance)
        forceRebuildNeighborList = true;
    context.executeKernelFlat(kernels.findBlockBoundsKernel, &findBlockBoundsArgs[0], context.getPaddedNumAtoms(), context.getSIMDWidth());
+    context.executeKernelFlat(kernels.computeSortKeysKernel, &computeSortKeysArgs[0], context.getNumAtomBlocks());
    blockSorter->sort(sortedBlocks);
    context.executeKernelFlat(kernels.sortBoxDataKernel, &sortBoxDataArgs[0], context.getNumAtoms(), 64);
    context.executeKernelFlat(kernels.findInteractingBlocksKernel, &findInteractingBlocksArgs[0], context.getNumAtomBlocks() * context.getSIMDWidth() * numTilesInBatch, findInteractingBlocksThreadBlockSize);
@@ -432,7 +463,7 @@ void HipNonbondedUtilities::computeInteractions(int forceGroups, bool includeFor
            kernel = createInteractionKernel(kernels.source, parameters, arguments, true, true, forceGroups, includeForces, includeEnergy);
        context.executeKernelFlat(kernel, &forceArgs[0], numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
    }
-    if (useCutoff && numTiles > 0) {
+    if (useNeighborList && numTiles > 0) {
        hipEventSynchronize(downloadCountEvent);
        updateNeighborListSize();
    }
@@ -521,6 +552,8 @@ void HipNonbondedUtilities::createKernelsForGroups(int groups) {
            defines["USE_PERIODIC"] = "1";
        if (context.getBoxIsTriclinic())
            defines["TRICLINIC"] = "1";
+        if (useLargeBlocks)
+            defines["USE_LARGE_BLOCKS"] = "1";
        defines["MAX_EXCLUSIONS"] = context.intToString(maxExclusions);
        int maxBits = 0;
        if (canUsePairList) {
@@ -549,8 +582,14 @@ void HipNonbondedUtilities::createKernelsForGroups(int groups) {
        defines["MAX_BITS_FOR_PAIRS"] = context.intToString(maxBits);
        defines["NUM_TILES_IN_BATCH"] = context.intToString(numTilesInBatch);
        defines["GROUP_SIZE"] = context.intToString(findInteractingBlocksThreadBlockSize);
+        int binShift = 1;
+        while (1<<binShift <= context.getNumAtomBlocks())
+            binShift++;
+        defines["BIN_SHIFT"] = context.intToString(binShift);
+        defines["BLOCK_INDEX_MASK"] = context.intToString((1<<binShift)-1);
        hipModule_t interactingBlocksProgram = context.createModule(HipKernelSources::vectorOps+HipKernelSources::findInteractingBlocks, defines);
        kernels.findBlockBoundsKernel = context.getKernel(interactingBlocksProgram, "findBlockBounds");
+        kernels.computeSortKeysKernel = context.getKernel(interactingBlocksProgram, "computeSortKeys");
        kernels.sortBoxDataKernel = context.getKernel(interactingBlocksProgram, "sortBoxData");
        kernels.findInteractingBlocksKernel = context.getKernel(interactingBlocksProgram, "findBlocksWithInteractions");
        kernels.copyInteractionCountsKernel = context.getKernel(interactingBlocksProgram, "copyInteractionCounts");
@@ -669,6 +708,8 @@ hipFunction_t HipNonbondedUtilities::createInteractionKernel(const string& sourc
        defines["USE_EXCLUSIONS"] = "1";
    if (isSymmetric)
        defines["USE_SYMMETRIC"] = "1";
+    if (useNeighborList)
+        defines["USE_NEIGHBOR_LIST"] = "1";
    defines["ENABLE_SHUFFLE"] = "1"; // Used only in hippoNonbonded.cc
    if (includeForces)
        defines["INCLUDE_FORCES"] = "1";

--- a/platforms/hip/src/HipSort.cpp
+++ b/platforms/hip/src/HipSort.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2010-2018 Stanford University and the Authors.      *
+ * Portions copyright (c) 2010-2021 Stanford University and the Authors.      *
 * Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc.              *
 * Authors: Peter Eastman, Nicholas Curtis                                    *
 * Contributors:                                                              *
@@ -44,11 +44,12 @@ HipSort::HipSort(HipContext& context, SortTrait* trait, unsigned int length, boo
    replacements["MIN_KEY"] = trait->getMinKey();
    replacements["MAX_KEY"] = trait->getMaxKey();
    replacements["MAX_VALUE"] = trait->getMaxValue();
+    replacements["UNIFORM"] = (uniform ? "1" : "0");
    hipModule_t module = context.createModule(context.replaceStrings(HipKernelSources::sort, replacements));
    shortListKernel = context.getKernel(module, "sortShortList");
    shortList2Kernel = context.getKernel(module, "sortShortList2");
    computeRangeKernel = context.getKernel(module, "computeRange");
-    assignElementsKernel = context.getKernel(module, "assignElementsToBuckets");
+    assignElementsKernel = context.getKernel(module, uniform ? "assignElementsToBuckets" : "assignElementsToBuckets2");
    computeBucketPositionsKernel = context.getKernel(module, "computeBucketPositions");
    copyToBucketsKernel = context.getKernel(module, "copyDataToBuckets");
    sortBucketsKernel = context.getKernel(module, "sortBuckets");
@@ -58,7 +59,7 @@ HipSort::HipSort(HipContext& context, SortTrait* trait, unsigned int length, boo
    int maxSharedMem;
    hipDeviceGetAttribute(&maxSharedMem, hipDeviceAttributeMaxSharedMemoryPerBlock, context.getDevice());
    int maxLocalBuffer = (maxSharedMem/trait->getDataSize())/2;
-    int maxShortList = min(3000, max(maxLocalBuffer, HipContext::ThreadBlockSize*context.getNumThreadBlocks()));
+    int maxShortList = min(1024, max(maxLocalBuffer, HipContext::ThreadBlockSize*context.getNumThreadBlocks()));
    isShortList = (length <= maxShortList);
    sortKernelSize = 256;
    rangeKernelSize = 256;

--- a/platforms/hip/src/kernels/findInteractingBlocks.hip
+++ b/platforms/hip/src/kernels/findInteractingBlocks.hip
@@ -18,12 +18,27 @@ __device__ inline int warpPopc(warpflags x) {

 #endif

+struct alignas(sizeof(__half) * 4) BoundingBox {
+    __device__ BoundingBox(real3 f) {
+        // Round up so we'll err on the side of making the box a little too large.
+        // This ensures interactions will never be missed.
+        v[0] = __float2half_ru((float) f.x);
+        v[1] = __float2half_ru((float) f.y);
+        v[2] = __float2half_ru((float) f.z);
+    }
+    __device__ real3 toReal3() const {
+        return make_real3(__half2float(v[0]), __half2float(v[1]), __half2float(v[2]));
+    }
+private:
+    __half v[3];
+};
+
 /**
 * Find a bounding box for the atoms in each block.
 */
 extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        const real4* __restrict__ posq, real4* __restrict__ blockCenter, real4* __restrict__ blockBoundingBox, int* __restrict__ rebuildNeighborList,
-        real2* __restrict__ sortedBlocks) {
+        real2* __restrict__ blockSizeRange) {
    const int indexInTile = threadIdx.x%TILE_SIZE;
    const int index = warpSize == TILE_SIZE ? blockIdx.x : (blockIdx.x*(warpSize/TILE_SIZE) + threadIdx.x/TILE_SIZE);
    const int base = index * TILE_SIZE;
@@ -97,27 +112,77 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize,
        center.w = SQRT(tcenter);
        blockBoundingBox[index] = blockSize;
        blockCenter[index] = center;
-        // blockSize.x+blockSize.y+blockSize.z has a distibution that looks like a normal distribution.
-        // This causes HipSort's buckets to have very non-uniform sizes, so a few very long buckets are
-        // sorted in global memory. -1/max(x, y, z) or -1/(x+y+z) have a "faster" distribution.
-        sortedBlocks[index] = make_real2(-RECIP(max(max(blockSize.x, blockSize.y), blockSize.z)), index);
+
+        // Record the range of sizes.
+        real totalSize = blockSize.x+blockSize.y+blockSize.z;
+        atomicMin(&blockSizeRange->x, totalSize);
+        atomicMax(&blockSizeRange->y, totalSize);
    }
    if (blockIdx.x == 0 && threadIdx.x == 0)
        rebuildNeighborList[0] = 0;
 }

+extern "C" __global__ void computeSortKeys(const real4* __restrict__ blockBoundingBox, unsigned int* __restrict__ sortedBlocks, const real2* __restrict__ blockSizeRange/*, int numSizes*/) {
+    // Sort keys store the bin in the high order part and the block in the low
+    // order part.
+
+    real2 sizeRange = make_real2(LOG(blockSizeRange->x), LOG(blockSizeRange->y));
+    int numSizeBins = 20;
+    real scale = numSizeBins/(sizeRange.y-sizeRange.x);
+    unsigned int i = threadIdx.x+blockIdx.x*blockDim.x;
+    if (i < NUM_BLOCKS) {
+        real4 box = blockBoundingBox[i];
+        real size = LOG(box.x+box.y+box.z);
+        int bin = (size-sizeRange.x)*scale;
+        bin = max(0, min(bin, numSizeBins-1));
+        sortedBlocks[i] = (((unsigned int) bin)<<BIN_SHIFT) + i;
+    }
+}
+
 /**
 * Sort the data about bounding boxes so it can be accessed more efficiently in the next kernel.
 */
-extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, const real4* __restrict__ blockCenter,
-        const real4* __restrict__ blockBoundingBox, real4* __restrict__ sortedBlockCenter,
-        real4* __restrict__ sortedBlockBoundingBox, const real4* __restrict__ posq, const real4* __restrict__ oldPositions,
-        unsigned int* __restrict__ interactionCount, int* __restrict__ rebuildNeighborList, bool forceRebuild) {
+extern "C" __global__ void sortBoxData(const unsigned int* __restrict__ sortedBlocks, const real4* __restrict__ blockCenter,
+        const real4* __restrict__ blockBoundingBox, real4* __restrict__ sortedBlockCenter, BoundingBox* __restrict__ sortedBlockBoundingBox,
+#ifdef USE_LARGE_BLOCKS
+        real4* __restrict__ largeBlockCenter, BoundingBox* __restrict__ largeBlockBoundingBox, real4 periodicBoxSize,
+        real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
+#endif
+        const real4* __restrict__ posq, const real4* __restrict__ oldPositions,
+        unsigned int* __restrict__ interactionCount, int* __restrict__ rebuildNeighborList, bool forceRebuild,
+        real2* __restrict__ blockSizeRange) {
    int i = threadIdx.x+blockIdx.x*blockDim.x;
    if (i < NUM_BLOCKS) {
-        int index = (int) sortedBlock[i].y;
+        unsigned int index = sortedBlocks[i] & BLOCK_INDEX_MASK;
        sortedBlockCenter[i] = blockCenter[index];
-        sortedBlockBoundingBox[i] = blockBoundingBox[index];
+        sortedBlockBoundingBox[i] = BoundingBox(trimTo3(blockBoundingBox[index]));
+
+#ifdef USE_LARGE_BLOCKS
+        // Compute the sizes of large blocks (composed of warpSize regular blocks) starting from each block.
+
+        real4 minPos = blockCenter[index]-blockBoundingBox[index];
+        real4 maxPos = blockCenter[index]+blockBoundingBox[index];
+        int last = min(i+warpSize, NUM_BLOCKS);
+        for (int j = i+1; j < last; j++) {
+            unsigned int index2 = sortedBlocks[j] & BLOCK_INDEX_MASK;
+            real4 blockPos = blockCenter[index2];
+            real4 width = blockBoundingBox[index2];
+#ifdef USE_PERIODIC
+            real4 center = 0.5f*(maxPos+minPos);
+            APPLY_PERIODIC_TO_POS_WITH_CENTER(blockPos, center)
+#endif
+            minPos = make_real4(min(minPos.x, blockPos.x-width.x), min(minPos.y, blockPos.y-width.y), min(minPos.z, blockPos.z-width.z), 0);
+            maxPos = make_real4(max(maxPos.x, blockPos.x+width.x), max(maxPos.y, blockPos.y+width.y), max(maxPos.z, blockPos.z+width.z), 0);
+        }
+        largeBlockCenter[i] = 0.5f*(maxPos+minPos);
+        largeBlockBoundingBox[i] = BoundingBox(trimTo3(0.5f*(maxPos-minPos)));
+#endif
+    }
+
+    // Initialize min and max for the atomic reduction in findBlockBounds (for the next step)
+    if (i == 0) {
+        blockSizeRange->x = 1e38;
+        blockSizeRange->y = 0;
    }

    // Also check whether any atom has moved enough so that we really need to rebuild the neighbor list.
@@ -239,8 +304,12 @@ void mfma4x4(const float4& pos1, const float4& pos2, const vfloat& c, unsigned i
 extern "C" __global__ __launch_bounds__(GROUP_SIZE) void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        unsigned int* __restrict__ interactionCount, int* __restrict__ interactingTiles, unsigned int* __restrict__ interactingAtoms,
        int2* __restrict__ singlePairs, const real4* __restrict__ posq, unsigned int maxTiles, unsigned int maxSinglePairs,
-        unsigned int startBlockIndex, unsigned int numBlocks, real2* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter,
-        const real4* __restrict__ sortedBlockBoundingBox, const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
+        unsigned int startBlockIndex, unsigned int numBlocks, const unsigned int* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter,
+        const BoundingBox* __restrict__ sortedBlockBoundingBox,
+#ifdef USE_LARGE_BLOCKS
+        const real4* __restrict__ largeBlockCenter, const BoundingBox* __restrict__ largeBlockBoundingBox,
+#endif
+        const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
        real4* __restrict__ oldPositions, const int* __restrict__ rebuildNeighborList) {

    if (rebuildNeighborList[0] == 0)
@@ -271,10 +340,9 @@ extern "C" __global__ __launch_bounds__(GROUP_SIZE) void findBlocksWithInteracti
    if (block1 < startBlockIndex+numBlocks) {
        // Load data for this block.  Note that all threads in a warp are processing the same block.

-        real2 sortedKey = sortedBlocks[block1];
-        int x = (int) sortedKey.y;
+        int x = sortedBlocks[block1] & BLOCK_INDEX_MASK;
        real4 blockCenterX = sortedBlockCenter[block1];
-        real4 blockSizeX = sortedBlockBoundingBox[block1];
+        real3 blockSizeX = sortedBlockBoundingBox[block1].toReal3();
        int neighborsInBuffer = 0;
        real4 pos1 = posq[x*TILE_SIZE+indexInTile];
 #ifdef USE_PERIODIC
@@ -309,10 +377,52 @@ extern "C" __global__ __launch_bounds__(GROUP_SIZE) void findBlocksWithInteracti
        // units are idle at the end of the kernel (the kernel works on the upper triangle of
        // the NUM_BLOCKS x NUM_BLOCKS matrix).

+#ifdef USE_LARGE_BLOCKS
+        warpflags largeBlockFlags = 0;
+        int loadedLargeBlocks = 0;
+#endif
        int block2Count = 0;
        // Load blocks from addresses aligned by warpSize for faster loading from sortedBlockCenter and sortedBlockBoundingBox.
        for (int block2Base = ((block1+1)/warpSize + warpIndex%NUM_TILES_IN_BATCH)*warpSize; block2Base < NUM_BLOCKS; block2Base += warpSize*NUM_TILES_IN_BATCH) {
+            // Last iteration cannot be skipped (on CDNA where tilesPerWarp == 2)
            const bool lastIteration = block2Base + warpSize*NUM_TILES_IN_BATCH >= NUM_BLOCKS;
+#ifdef USE_LARGE_BLOCKS
+            if (loadedLargeBlocks == 0) {
+                // Check the next set of large blocks.
+
+                int largeBlockIndex = block2Base + warpSize*NUM_TILES_IN_BATCH*indexInWarp;
+                bool includeLargeBlock = false;
+                if (largeBlockIndex < NUM_BLOCKS) {
+                    real4 largeCenter = largeBlockCenter[largeBlockIndex];
+                    real3 largeSize = largeBlockBoundingBox[largeBlockIndex].toReal3();
+                    real4 blockDelta = blockCenterX-largeCenter;
+#ifdef USE_PERIODIC
+                    APPLY_PERIODIC_TO_DELTA(blockDelta)
+#endif
+                    blockDelta.x = max(0.0f, fabs(blockDelta.x)-blockSizeX.x-largeSize.x);
+                    blockDelta.y = max(0.0f, fabs(blockDelta.y)-blockSizeX.y-largeSize.y);
+                    blockDelta.z = max(0.0f, fabs(blockDelta.z)-blockSizeX.z-largeSize.z);
+                    includeLargeBlock = (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < PADDED_CUTOFF_SQUARED);
+#ifdef TRICLINIC
+                    // The calculation to find the nearest periodic copy is only guaranteed to work if the nearest copy is less than half a box width away.
+                    // If there's any possibility we might have missed it, do a detailed check.
+
+                    if (periodicBoxSize.z/2-blockSizeX.z-largeSize.z < PADDED_CUTOFF || periodicBoxSize.y/2-blockSizeX.y-largeSize.y < PADDED_CUTOFF)
+                        includeLargeBlock = true;
+#endif
+                }
+                largeBlockFlags = __ballot(includeLargeBlock);
+                loadedLargeBlocks = warpSize;
+            }
+            loadedLargeBlocks--;
+            if ((largeBlockFlags&1) == 0 && !lastIteration) {
+                // None of the next warpSize blocks interact with block 1.
+
+                largeBlockFlags >>= 1;
+                continue;
+            }
+            largeBlockFlags >>= 1;
+#endif
            int block2 = block2Base+indexInWarp;
            bool includeBlock2 = (block1 < block2 && block2 < NUM_BLOCKS);
            block2 = includeBlock2 ? block2 : block1;
@@ -327,7 +437,7 @@ extern "C" __global__ __launch_bounds__(GROUP_SIZE) void findBlocksWithInteracti
            if (!lastIteration && __ballot(includeBlock2) == 0)
                continue;
 #endif
-            real4 blockSizeY = sortedBlockBoundingBox[block2];
+            real3 blockSizeY = sortedBlockBoundingBox[block2].toReal3();
            blockDelta.x = max(0.0f, fabs(blockDelta.x)-blockSizeX.x-blockSizeY.x);
            blockDelta.y = max(0.0f, fabs(blockDelta.y)-blockSizeX.y-blockSizeY.y);
            blockDelta.z = max(0.0f, fabs(blockDelta.z)-blockSizeX.z-blockSizeY.z);
@@ -359,7 +469,7 @@ extern "C" __global__ __launch_bounds__(GROUP_SIZE) void findBlocksWithInteracti
                const int b = block2Buffer[min(block2Index + tileInWarp, block2Count - 1)];
                const bool forceInclude = b & 1;
                const int block2 = b >> 1;
-                int y = (int) sortedBlocks[block2].y;
+                int y = sortedBlocks[block2] & BLOCK_INDEX_MASK;

                #pragma unroll 1
                for (int k = indexInTile; k < numExclusions; k += TILE_SIZE)

--- a/platforms/hip/src/kernels/nonbonded.hip
+++ b/platforms/hip/src/kernels/nonbonded.hip
@@ -236,7 +236,7 @@ extern "C" __launch_bounds__(THREAD_BLOCK_SIZE) __global__ void computeNonbonded
    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
    // of them (no cutoff).

-#ifdef USE_CUTOFF
+#ifdef USE_NEIGHBOR_LIST
    const unsigned int numTiles = interactionCount[0];
    if (numTiles > maxTiles)
        return; // There wasn't enough memory for the neighbor list.
@@ -261,7 +261,7 @@ extern "C" __launch_bounds__(THREAD_BLOCK_SIZE) __global__ void computeNonbonded
        // Extract the coordinates of this tile.
        int x, y;
        bool singlePeriodicCopy = false;
-#ifdef USE_CUTOFF
+#ifdef USE_NEIGHBOR_LIST
        x = tiles[pos];
        real4 blockSizeX = blockSize[x];
        singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= MAX_CUTOFF &&
@@ -296,7 +296,7 @@ extern "C" __launch_bounds__(THREAD_BLOCK_SIZE) __global__ void computeNonbonded
            // Load atom data for this tile.
            real4 posq1 = posq[atom1];
            LOAD_ATOM1_PARAMETERS
-#ifdef USE_CUTOFF
+#ifdef USE_NEIGHBOR_LIST
            unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
 #else
            unsigned int j = y*TILE_SIZE + tgx;
@@ -453,7 +453,7 @@ extern "C" __launch_bounds__(THREAD_BLOCK_SIZE) __global__ void computeNonbonded

    // Third loop: single pairs that aren't part of a tile.

-#if USE_CUTOFF
+#if USE_NEIGHBOR_LIST
    const unsigned int numPairs = interactionCount[1];
    if (numPairs > maxSinglePairs)
        return; // There wasn't enough memory for the neighbor list.

--- a/platforms/hip/src/kernels/sort.hip
+++ b/platforms/hip/src/kernels/sort.hip
@@ -95,6 +95,7 @@ inline __device__ void reduceMinMax(KEY_TYPE minimum, KEY_TYPE maximum, KEY_TYPE
 */
 __global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int length, KEY_TYPE* __restrict__ range,
        unsigned int numBuckets, unsigned int* __restrict__ bucketOffset, unsigned int* __restrict__ counters) {
+#if UNIFORM
    extern __shared__ KEY_TYPE minBuffer[];
    KEY_TYPE* maxBuffer = minBuffer+blockDim.x;
    KEY_TYPE minimum = MAX_KEY;
@@ -131,6 +132,7 @@ __global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int le
        }
        reduceMinMax(minimum, maximum, minBuffer, maxBuffer, &range[0], &range[1]);
    }
+#endif

    // Clear the bucket counters in preparation for the next kernel.

@@ -139,7 +141,7 @@ __global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int le
 }

 /**
- * Assign elements to buckets.
+ * Assign elements to buckets.  This version is optimized for uniformly distributed data.
 */
 __global__ void assignElementsToBuckets(const DATA_TYPE* __restrict__ data, unsigned int length, unsigned int numBuckets, const KEY_TYPE* __restrict__ range,
        unsigned int* __restrict__ bucketOffset, unsigned int* __restrict__ bucketOfElement, unsigned int* __restrict__ offsetInBucket) {
@@ -154,6 +156,86 @@ __global__ void assignElementsToBuckets(const DATA_TYPE* __restrict__ data, unsi
    }
 }

+
+/**
+ * Assign elements to buckets.  This version is optimized for non-uniformly distributed data.
+ */
+__global__ void assignElementsToBuckets2(const DATA_TYPE* __restrict__ data, unsigned int length, unsigned int numBuckets, const KEY_TYPE* __restrict__ range,
+        unsigned int* __restrict__ bucketOffset, unsigned int* __restrict__ bucketOfElement, unsigned int* __restrict__ offsetInBucket) {
+    // Load 64 datapoints and sort them to get an estimate of the data distribution.
+
+    __shared__ KEY_TYPE elements[64];
+    if (threadIdx.x < 64) {
+        int index = (int) (threadIdx.x*length/64.0);
+        elements[threadIdx.x] = getValue(data[index]);
+    }
+    __syncthreads();
+    for (unsigned int k = 2; k <= 64; k *= 2) {
+        for (unsigned int j = k/2; j > 0; j /= 2) {
+            if (threadIdx.x < 64) {
+                int ixj = threadIdx.x^j;
+                if (ixj > threadIdx.x) {
+                    KEY_TYPE value1 = elements[threadIdx.x];
+                    KEY_TYPE value2 = elements[ixj];
+                    bool ascending = (threadIdx.x&k) == 0;
+                    KEY_TYPE lowKey = (ascending ? value1 : value2);
+                    KEY_TYPE highKey = (ascending ? value2 : value1);
+                    if (lowKey > highKey) {
+                        elements[threadIdx.x] = value2;
+                        elements[ixj] = value1;
+                    }
+                }
+            }
+            __syncthreads();
+        }
+    }
+
+    // Create a function composed of linear segments mapping data values to bucket indices.
+
+    __shared__ float segmentLowerBound[9];
+    __shared__ float segmentBaseIndex[9];
+    __shared__ float segmentIndexScale[9];
+    if (threadIdx.x == 0) {
+        segmentLowerBound[0] = elements[0]-0.2f*(elements[5]-elements[0]);
+        segmentLowerBound[1] = elements[5];
+        segmentLowerBound[2] = elements[10];
+        segmentLowerBound[3] = elements[20];
+        segmentLowerBound[4] = elements[30];
+        segmentLowerBound[5] = elements[40];
+        segmentLowerBound[6] = elements[50];
+        segmentLowerBound[7] = elements[60];
+        segmentLowerBound[8] = elements[63]+0.2f*(elements[63]-elements[58]);
+        segmentBaseIndex[0] = numBuckets/16;
+        segmentBaseIndex[1] = 3*numBuckets/16;
+        segmentBaseIndex[2] = 5*numBuckets/16;
+        segmentBaseIndex[3] = 7*numBuckets/16;
+        segmentBaseIndex[4] = 9*numBuckets/16;
+        segmentBaseIndex[5] = 11*numBuckets/16;
+        segmentBaseIndex[6] = 13*numBuckets/16;
+        segmentBaseIndex[7] = 15*numBuckets/16;
+        segmentBaseIndex[8] = numBuckets;
+        for (int i = 0; i < 8; i++)
+            if (segmentLowerBound[i+1] == segmentLowerBound[i])
+                segmentIndexScale[i] = 0;
+            else
+                segmentIndexScale[i] = (segmentBaseIndex[i+1]-segmentBaseIndex[i])/(segmentLowerBound[i+1]-segmentLowerBound[i]);
+    }
+    __syncthreads();
+
+    // Assign elements to buckets.
+
+    for (unsigned int index = blockDim.x*blockIdx.x+threadIdx.x; index < length; index += blockDim.x*gridDim.x) {
+        float key = (float) getValue(data[index]);
+        int segment;
+        for (segment = 0; segment < 7 && key > segmentLowerBound[segment+1]; segment++)
+            ;
+        unsigned int bucketIndex = segmentBaseIndex[segment]+(key-segmentLowerBound[segment])*segmentIndexScale[segment];
+        bucketIndex = min(max(0, bucketIndex), numBuckets-1);
+        offsetInBucket[index] = atomicAdd(&bucketOffset[bucketIndex], 1);
+        bucketOfElement[index] = bucketIndex;
+    }
+}
+
 /**
 * Sum the bucket sizes to compute the start position of each bucket.  This kernel
 * is executed as a single work group.