Unverified Commit 7d7490ea authored by Anton Gorenko's avatar Anton Gorenko
Browse files

Port changes from the main repository (mostly related to large blocks)

Skip neighbor list for very small systems

    https://github.com/openmm/openmm/pull/4070

Store bounding box sizes in half precision

    https://github.com/openmm/openmm/commit/2ae50f9

Use large blocks to optimize building the neighbor list

    https://github.com/openmm/openmm/commit/3955033

Improved sorting of blocks when building neighbor list

    https://github.com/openmm/openmm/commit/796ffaa

Fixed bug in large blocks optimization with triclinic boxes

    https://github.com/openmm/openmm/commit/4c10732

Optimize sorting of non-uniformly distributed data

    https://github.com/openmm/openmm/commit/71d9bb1

Co-authored-by: default avatarbdenhollander <44237618+bdenhollander@users.noreply.github.com>
parent f965707b
......@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2022 Stanford University and the Authors. *
* Portions copyright (c) 2009-2023 Stanford University and the Authors. *
* Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
......@@ -82,8 +82,10 @@ public:
* @param exclusionList for each atom, specifies the list of other atoms whose interactions should be excluded
* @param kernel the code to evaluate the interaction
* @param forceGroup the force group in which the interaction should be calculated
* @param usesNeighborList specifies whether a neighbor list should be used to optimize this interaction. This should
* be viewed as only a suggestion. Even when it is false, a neighbor list may be used anyway.
*/
void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList, const std::string& kernel, int forceGroup);
void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList, const std::string& kernel, int forceGroup, bool usesNeighborList = true);
/**
* Add a nonbonded interaction to be evaluated by the default interaction kernel.
*
......@@ -94,9 +96,11 @@ public:
* @param exclusionList for each atom, specifies the list of other atoms whose interactions should be excluded
* @param kernel the code to evaluate the interaction
* @param forceGroup the force group in which the interaction should be calculated
* @param usesNeighborList specifies whether a neighbor list should be used to optimize this interaction. This should
* be viewed as only a suggestion. Even when it is false, a neighbor list may be used anyway.
* @param supportsPairList specifies whether this interaction can work with a neighbor list that uses a separate pair list
*/
void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList, const std::string& kernel, int forceGroup, bool supportsPairList);
void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList, const std::string& kernel, int forceGroup, bool usesNeighborList, bool supportsPairList);
/**
* Add a per-atom parameter that the default interaction kernel may depend on.
*/
......@@ -335,12 +339,15 @@ private:
HipArray sortedBlocks;
HipArray sortedBlockCenter;
HipArray sortedBlockBoundingBox;
HipArray blockSizeRange;
HipArray largeBlockCenter;
HipArray largeBlockBoundingBox;
HipArray oldPositions;
HipArray rebuildNeighborList;
HipSort* blockSorter;
hipEvent_t downloadCountEvent;
unsigned int* pinnedCountBuffer;
std::vector<void*> forceArgs, findBlockBoundsArgs, sortBoxDataArgs, findInteractingBlocksArgs, copyInteractionCountsArgs;
std::vector<void*> forceArgs, findBlockBoundsArgs, computeSortKeysArgs, sortBoxDataArgs, findInteractingBlocksArgs, copyInteractionCountsArgs;
std::vector<std::vector<int> > atomExclusions;
std::vector<ParameterInfo> parameters;
std::vector<ParameterInfo> arguments;
......@@ -348,7 +355,7 @@ private:
std::map<int, double> groupCutoff;
std::map<int, std::string> groupKernelSource;
double lastCutoff;
bool useCutoff, usePeriodic, anyExclusions, usePadding, forceRebuildNeighborList, canUsePairList;
bool useCutoff, usePeriodic, anyExclusions, usePadding, useNeighborList, forceRebuildNeighborList, canUsePairList, useLargeBlocks;
int startTileIndex, startBlockIndex, numBlocks, numTilesInBatch, maxExclusions;
int numForceThreadBlocks, forceThreadBlockSize, findInteractingBlocksThreadBlockSize, numAtoms, groupFlags;
unsigned int maxTiles, maxSinglePairs, tilesAfterReorder;
......@@ -367,6 +374,7 @@ public:
std::string source;
hipFunction_t forceKernel, energyKernel, forceEnergyKernel;
hipFunction_t findBlockBoundsKernel;
hipFunction_t computeSortKeysKernel;
hipFunction_t sortBoxDataKernel;
hipFunction_t findInteractingBlocksKernel;
hipFunction_t copyInteractionCountsKernel;
......
......@@ -995,7 +995,7 @@ void HipCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
}
source = cu.replaceStrings(source, replacements);
if (force.getIncludeDirectSpace())
cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup(), true);
cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup(), numParticles > 3000, true);
// Initialize the exceptions.
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2022 Stanford University and the Authors. *
* Portions copyright (c) 2009-2023 Stanford University and the Authors. *
* Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
......@@ -50,21 +50,18 @@ using namespace std;
class HipNonbondedUtilities::BlockSortTrait : public HipSort::SortTrait {
public:
BlockSortTrait(bool useDouble) : useDouble(useDouble) {
}
int getDataSize() const {return useDouble ? sizeof(double2) : sizeof(float2);}
int getKeySize() const {return useDouble ? sizeof(double) : sizeof(float);}
const char* getDataType() const {return "real2";}
const char* getKeyType() const {return "real";}
const char* getMinKey() const {return "-3.40282e+38f";}
const char* getMaxKey() const {return "3.40282e+38f";}
const char* getMaxValue() const {return "make_real2(3.40282e+38f, 3.40282e+38f)";}
const char* getSortKey() const {return "value.x";}
private:
bool useDouble;
BlockSortTrait() {}
int getDataSize() const {return sizeof(int);}
int getKeySize() const {return sizeof(int);}
const char* getDataType() const {return "unsigned int";}
const char* getKeyType() const {return "unsigned int";}
const char* getMinKey() const {return "0";}
const char* getMaxKey() const {return "0xFFFFFFFFu";}
const char* getMaxValue() const {return "0xFFFFFFFFu";}
const char* getSortKey() const {return "value";}
};
HipNonbondedUtilities::HipNonbondedUtilities(HipContext& context) : context(context), useCutoff(false), usePeriodic(false), anyExclusions(false), usePadding(true),
HipNonbondedUtilities::HipNonbondedUtilities(HipContext& context) : context(context), useCutoff(false), usePeriodic(false), useNeighborList(false), anyExclusions(false), usePadding(true),
blockSorter(NULL), pinnedCountBuffer(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0), canUsePairList(true), tilesAfterReorder(0) {
// Decide how many thread blocks to use.
......@@ -74,6 +71,13 @@ HipNonbondedUtilities::HipNonbondedUtilities(HipContext& context) : context(cont
numForceThreadBlocks = 5*4*context.getMultiprocessors();
forceThreadBlockSize = 64;
findInteractingBlocksThreadBlockSize = context.getSIMDWidth();
// When building the neighbor list, we can optionally use large blocks (32 * warpSize atoms) to
// accelerate the process. This makes building the neighbor list faster, but it prevents
// us from sorting atom blocks by size, which leads to a slightly less efficient neighbor
// list. We guess based on system size which will be faster.
useLargeBlocks = (context.getNumAtoms() > 90000);
setKernelSource(HipKernelSources::nonbonded);
}
......@@ -85,11 +89,11 @@ HipNonbondedUtilities::~HipNonbondedUtilities() {
hipEventDestroy(downloadCountEvent);
}
void HipNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup) {
addInteraction(usesCutoff, usesPeriodic, usesExclusions, cutoffDistance, exclusionList, kernel, forceGroup, false);
void HipNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup, bool usesNeighborList) {
addInteraction(usesCutoff, usesPeriodic, usesExclusions, cutoffDistance, exclusionList, kernel, forceGroup, usesNeighborList, false);
}
void HipNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup, bool supportsPairList) {
void HipNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup, bool usesNeighborList, bool supportsPairList) {
if (groupCutoff.size() > 0) {
if (usesCutoff != useCutoff)
throw OpenMMException("All Forces must agree on whether to use a cutoff");
......@@ -102,6 +106,7 @@ void HipNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, b
requestExclusions(exclusionList);
useCutoff = usesCutoff;
usePeriodic = usesPeriodic;
useNeighborList |= (usesNeighborList && useCutoff);
groupCutoff[forceGroup] = cutoffDistance;
groupFlags |= 1<<forceGroup;
canUsePairList &= supportsPairList;
......@@ -290,15 +295,23 @@ void HipNonbondedUtilities::initialize(const System& system) {
int elementSize = (context.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
blockCenter.initialize(context, numAtomBlocks, 4*elementSize, "blockCenter");
blockBoundingBox.initialize(context, numAtomBlocks, 4*elementSize, "blockBoundingBox");
sortedBlocks.initialize(context, numAtomBlocks, 2*elementSize, "sortedBlocks");
sortedBlocks.initialize<unsigned int>(context, numAtomBlocks, "sortedBlocks");
sortedBlockCenter.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockCenter");
sortedBlockBoundingBox.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockBoundingBox");
blockSizeRange.initialize(context, 2, elementSize, "blockSizeRange");
largeBlockCenter.initialize(context, numAtomBlocks, 4*elementSize, "largeBlockCenter");
largeBlockBoundingBox.initialize(context, numAtomBlocks*4, elementSize, "largeBlockBoundingBox");
oldPositions.initialize(context, numAtoms, 4*elementSize, "oldPositions");
rebuildNeighborList.initialize<int>(context, 1, "rebuildNeighborList");
blockSorter = new HipSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks, false);
blockSorter = new HipSort(context, new BlockSortTrait(), numAtomBlocks, false);
vector<unsigned int> count(2, 0);
interactionCount.upload(count);
rebuildNeighborList.upload(&count[0]);
if (context.getUseDoublePrecision()) {
blockSizeRange.upload(vector<double>{1e38, 0});
} else {
blockSizeRange.upload(vector<float>{1e38, 0});
}
}
// Record arguments for kernels.
......@@ -342,17 +355,30 @@ void HipNonbondedUtilities::initialize(const System& system) {
findBlockBoundsArgs.push_back(&blockCenter.getDevicePointer());
findBlockBoundsArgs.push_back(&blockBoundingBox.getDevicePointer());
findBlockBoundsArgs.push_back(&rebuildNeighborList.getDevicePointer());
findBlockBoundsArgs.push_back(&sortedBlocks.getDevicePointer());
findBlockBoundsArgs.push_back(&blockSizeRange.getDevicePointer());
computeSortKeysArgs.push_back(&blockBoundingBox.getDevicePointer());
computeSortKeysArgs.push_back(&sortedBlocks.getDevicePointer());
computeSortKeysArgs.push_back(&blockSizeRange.getDevicePointer());
sortBoxDataArgs.push_back(&sortedBlocks.getDevicePointer());
sortBoxDataArgs.push_back(&blockCenter.getDevicePointer());
sortBoxDataArgs.push_back(&blockBoundingBox.getDevicePointer());
sortBoxDataArgs.push_back(&sortedBlockCenter.getDevicePointer());
sortBoxDataArgs.push_back(&sortedBlockBoundingBox.getDevicePointer());
if (useLargeBlocks) {
sortBoxDataArgs.push_back(&largeBlockCenter.getDevicePointer());
sortBoxDataArgs.push_back(&largeBlockBoundingBox.getDevicePointer());
sortBoxDataArgs.push_back(context.getPeriodicBoxSizePointer());
sortBoxDataArgs.push_back(context.getInvPeriodicBoxSizePointer());
sortBoxDataArgs.push_back(context.getPeriodicBoxVecXPointer());
sortBoxDataArgs.push_back(context.getPeriodicBoxVecYPointer());
sortBoxDataArgs.push_back(context.getPeriodicBoxVecZPointer());
}
sortBoxDataArgs.push_back(&context.getPosq().getDevicePointer());
sortBoxDataArgs.push_back(&oldPositions.getDevicePointer());
sortBoxDataArgs.push_back(&interactionCount.getDevicePointer());
sortBoxDataArgs.push_back(&rebuildNeighborList.getDevicePointer());
sortBoxDataArgs.push_back(&forceRebuildNeighborList);
sortBoxDataArgs.push_back(&blockSizeRange.getDevicePointer());
findInteractingBlocksArgs.push_back(context.getPeriodicBoxSizePointer());
findInteractingBlocksArgs.push_back(context.getInvPeriodicBoxSizePointer());
findInteractingBlocksArgs.push_back(context.getPeriodicBoxVecXPointer());
......@@ -370,6 +396,10 @@ void HipNonbondedUtilities::initialize(const System& system) {
findInteractingBlocksArgs.push_back(&sortedBlocks.getDevicePointer());
findInteractingBlocksArgs.push_back(&sortedBlockCenter.getDevicePointer());
findInteractingBlocksArgs.push_back(&sortedBlockBoundingBox.getDevicePointer());
if (useLargeBlocks) {
findInteractingBlocksArgs.push_back(&largeBlockCenter.getDevicePointer());
findInteractingBlocksArgs.push_back(&largeBlockBoundingBox.getDevicePointer());
}
findInteractingBlocksArgs.push_back(&exclusionIndices.getDevicePointer());
findInteractingBlocksArgs.push_back(&exclusionRowIndices.getDevicePointer());
findInteractingBlocksArgs.push_back(&oldPositions.getDevicePointer());
......@@ -396,23 +426,24 @@ void HipNonbondedUtilities::prepareInteractions(int forceGroups) {
return;
if (groupKernels.find(forceGroups) == groupKernels.end())
createKernelsForGroups(forceGroups);
if (!useCutoff)
return;
if (numTiles == 0)
return;
KernelSet& kernels = groupKernels[forceGroups];
if (usePeriodic) {
if (useCutoff && usePeriodic) {
double4 box = context.getPeriodicBoxSize();
double minAllowedSize = 1.999999*kernels.cutoffDistance;
if (box.x < minAllowedSize || box.y < minAllowedSize || box.z < minAllowedSize)
throw OpenMMException("The periodic box size has decreased to less than twice the nonbonded cutoff.");
}
if (!useNeighborList)
return;
if (numTiles == 0)
return;
// Compute the neighbor list.
if (lastCutoff != kernels.cutoffDistance)
forceRebuildNeighborList = true;
context.executeKernelFlat(kernels.findBlockBoundsKernel, &findBlockBoundsArgs[0], context.getPaddedNumAtoms(), context.getSIMDWidth());
context.executeKernelFlat(kernels.computeSortKeysKernel, &computeSortKeysArgs[0], context.getNumAtomBlocks());
blockSorter->sort(sortedBlocks);
context.executeKernelFlat(kernels.sortBoxDataKernel, &sortBoxDataArgs[0], context.getNumAtoms(), 64);
context.executeKernelFlat(kernels.findInteractingBlocksKernel, &findInteractingBlocksArgs[0], context.getNumAtomBlocks() * context.getSIMDWidth() * numTilesInBatch, findInteractingBlocksThreadBlockSize);
......@@ -432,7 +463,7 @@ void HipNonbondedUtilities::computeInteractions(int forceGroups, bool includeFor
kernel = createInteractionKernel(kernels.source, parameters, arguments, true, true, forceGroups, includeForces, includeEnergy);
context.executeKernelFlat(kernel, &forceArgs[0], numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
}
if (useCutoff && numTiles > 0) {
if (useNeighborList && numTiles > 0) {
hipEventSynchronize(downloadCountEvent);
updateNeighborListSize();
}
......@@ -521,6 +552,8 @@ void HipNonbondedUtilities::createKernelsForGroups(int groups) {
defines["USE_PERIODIC"] = "1";
if (context.getBoxIsTriclinic())
defines["TRICLINIC"] = "1";
if (useLargeBlocks)
defines["USE_LARGE_BLOCKS"] = "1";
defines["MAX_EXCLUSIONS"] = context.intToString(maxExclusions);
int maxBits = 0;
if (canUsePairList) {
......@@ -549,8 +582,14 @@ void HipNonbondedUtilities::createKernelsForGroups(int groups) {
defines["MAX_BITS_FOR_PAIRS"] = context.intToString(maxBits);
defines["NUM_TILES_IN_BATCH"] = context.intToString(numTilesInBatch);
defines["GROUP_SIZE"] = context.intToString(findInteractingBlocksThreadBlockSize);
int binShift = 1;
while (1<<binShift <= context.getNumAtomBlocks())
binShift++;
defines["BIN_SHIFT"] = context.intToString(binShift);
defines["BLOCK_INDEX_MASK"] = context.intToString((1<<binShift)-1);
hipModule_t interactingBlocksProgram = context.createModule(HipKernelSources::vectorOps+HipKernelSources::findInteractingBlocks, defines);
kernels.findBlockBoundsKernel = context.getKernel(interactingBlocksProgram, "findBlockBounds");
kernels.computeSortKeysKernel = context.getKernel(interactingBlocksProgram, "computeSortKeys");
kernels.sortBoxDataKernel = context.getKernel(interactingBlocksProgram, "sortBoxData");
kernels.findInteractingBlocksKernel = context.getKernel(interactingBlocksProgram, "findBlocksWithInteractions");
kernels.copyInteractionCountsKernel = context.getKernel(interactingBlocksProgram, "copyInteractionCounts");
......@@ -669,6 +708,8 @@ hipFunction_t HipNonbondedUtilities::createInteractionKernel(const string& sourc
defines["USE_EXCLUSIONS"] = "1";
if (isSymmetric)
defines["USE_SYMMETRIC"] = "1";
if (useNeighborList)
defines["USE_NEIGHBOR_LIST"] = "1";
defines["ENABLE_SHUFFLE"] = "1"; // Used only in hippoNonbonded.cc
if (includeForces)
defines["INCLUDE_FORCES"] = "1";
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-2018 Stanford University and the Authors. *
* Portions copyright (c) 2010-2021 Stanford University and the Authors. *
* Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
......@@ -44,11 +44,12 @@ HipSort::HipSort(HipContext& context, SortTrait* trait, unsigned int length, boo
replacements["MIN_KEY"] = trait->getMinKey();
replacements["MAX_KEY"] = trait->getMaxKey();
replacements["MAX_VALUE"] = trait->getMaxValue();
replacements["UNIFORM"] = (uniform ? "1" : "0");
hipModule_t module = context.createModule(context.replaceStrings(HipKernelSources::sort, replacements));
shortListKernel = context.getKernel(module, "sortShortList");
shortList2Kernel = context.getKernel(module, "sortShortList2");
computeRangeKernel = context.getKernel(module, "computeRange");
assignElementsKernel = context.getKernel(module, "assignElementsToBuckets");
assignElementsKernel = context.getKernel(module, uniform ? "assignElementsToBuckets" : "assignElementsToBuckets2");
computeBucketPositionsKernel = context.getKernel(module, "computeBucketPositions");
copyToBucketsKernel = context.getKernel(module, "copyDataToBuckets");
sortBucketsKernel = context.getKernel(module, "sortBuckets");
......@@ -58,7 +59,7 @@ HipSort::HipSort(HipContext& context, SortTrait* trait, unsigned int length, boo
int maxSharedMem;
hipDeviceGetAttribute(&maxSharedMem, hipDeviceAttributeMaxSharedMemoryPerBlock, context.getDevice());
int maxLocalBuffer = (maxSharedMem/trait->getDataSize())/2;
int maxShortList = min(3000, max(maxLocalBuffer, HipContext::ThreadBlockSize*context.getNumThreadBlocks()));
int maxShortList = min(1024, max(maxLocalBuffer, HipContext::ThreadBlockSize*context.getNumThreadBlocks()));
isShortList = (length <= maxShortList);
sortKernelSize = 256;
rangeKernelSize = 256;
......
......@@ -18,12 +18,27 @@ __device__ inline int warpPopc(warpflags x) {
#endif
struct alignas(sizeof(__half) * 4) BoundingBox {
__device__ BoundingBox(real3 f) {
// Round up so we'll err on the side of making the box a little too large.
// This ensures interactions will never be missed.
v[0] = __float2half_ru((float) f.x);
v[1] = __float2half_ru((float) f.y);
v[2] = __float2half_ru((float) f.z);
}
__device__ real3 toReal3() const {
return make_real3(__half2float(v[0]), __half2float(v[1]), __half2float(v[2]));
}
private:
__half v[3];
};
/**
* Find a bounding box for the atoms in each block.
*/
extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
const real4* __restrict__ posq, real4* __restrict__ blockCenter, real4* __restrict__ blockBoundingBox, int* __restrict__ rebuildNeighborList,
real2* __restrict__ sortedBlocks) {
real2* __restrict__ blockSizeRange) {
const int indexInTile = threadIdx.x%TILE_SIZE;
const int index = warpSize == TILE_SIZE ? blockIdx.x : (blockIdx.x*(warpSize/TILE_SIZE) + threadIdx.x/TILE_SIZE);
const int base = index * TILE_SIZE;
......@@ -97,27 +112,77 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize,
center.w = SQRT(tcenter);
blockBoundingBox[index] = blockSize;
blockCenter[index] = center;
// blockSize.x+blockSize.y+blockSize.z has a distibution that looks like a normal distribution.
// This causes HipSort's buckets to have very non-uniform sizes, so a few very long buckets are
// sorted in global memory. -1/max(x, y, z) or -1/(x+y+z) have a "faster" distribution.
sortedBlocks[index] = make_real2(-RECIP(max(max(blockSize.x, blockSize.y), blockSize.z)), index);
// Record the range of sizes.
real totalSize = blockSize.x+blockSize.y+blockSize.z;
atomicMin(&blockSizeRange->x, totalSize);
atomicMax(&blockSizeRange->y, totalSize);
}
if (blockIdx.x == 0 && threadIdx.x == 0)
rebuildNeighborList[0] = 0;
}
extern "C" __global__ void computeSortKeys(const real4* __restrict__ blockBoundingBox, unsigned int* __restrict__ sortedBlocks, const real2* __restrict__ blockSizeRange/*, int numSizes*/) {
// Sort keys store the bin in the high order part and the block in the low
// order part.
real2 sizeRange = make_real2(LOG(blockSizeRange->x), LOG(blockSizeRange->y));
int numSizeBins = 20;
real scale = numSizeBins/(sizeRange.y-sizeRange.x);
unsigned int i = threadIdx.x+blockIdx.x*blockDim.x;
if (i < NUM_BLOCKS) {
real4 box = blockBoundingBox[i];
real size = LOG(box.x+box.y+box.z);
int bin = (size-sizeRange.x)*scale;
bin = max(0, min(bin, numSizeBins-1));
sortedBlocks[i] = (((unsigned int) bin)<<BIN_SHIFT) + i;
}
}
/**
* Sort the data about bounding boxes so it can be accessed more efficiently in the next kernel.
*/
extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, const real4* __restrict__ blockCenter,
const real4* __restrict__ blockBoundingBox, real4* __restrict__ sortedBlockCenter,
real4* __restrict__ sortedBlockBoundingBox, const real4* __restrict__ posq, const real4* __restrict__ oldPositions,
unsigned int* __restrict__ interactionCount, int* __restrict__ rebuildNeighborList, bool forceRebuild) {
extern "C" __global__ void sortBoxData(const unsigned int* __restrict__ sortedBlocks, const real4* __restrict__ blockCenter,
const real4* __restrict__ blockBoundingBox, real4* __restrict__ sortedBlockCenter, BoundingBox* __restrict__ sortedBlockBoundingBox,
#ifdef USE_LARGE_BLOCKS
real4* __restrict__ largeBlockCenter, BoundingBox* __restrict__ largeBlockBoundingBox, real4 periodicBoxSize,
real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
#endif
const real4* __restrict__ posq, const real4* __restrict__ oldPositions,
unsigned int* __restrict__ interactionCount, int* __restrict__ rebuildNeighborList, bool forceRebuild,
real2* __restrict__ blockSizeRange) {
int i = threadIdx.x+blockIdx.x*blockDim.x;
if (i < NUM_BLOCKS) {
int index = (int) sortedBlock[i].y;
unsigned int index = sortedBlocks[i] & BLOCK_INDEX_MASK;
sortedBlockCenter[i] = blockCenter[index];
sortedBlockBoundingBox[i] = blockBoundingBox[index];
sortedBlockBoundingBox[i] = BoundingBox(trimTo3(blockBoundingBox[index]));
#ifdef USE_LARGE_BLOCKS
// Compute the sizes of large blocks (composed of warpSize regular blocks) starting from each block.
real4 minPos = blockCenter[index]-blockBoundingBox[index];
real4 maxPos = blockCenter[index]+blockBoundingBox[index];
int last = min(i+warpSize, NUM_BLOCKS);
for (int j = i+1; j < last; j++) {
unsigned int index2 = sortedBlocks[j] & BLOCK_INDEX_MASK;
real4 blockPos = blockCenter[index2];
real4 width = blockBoundingBox[index2];
#ifdef USE_PERIODIC
real4 center = 0.5f*(maxPos+minPos);
APPLY_PERIODIC_TO_POS_WITH_CENTER(blockPos, center)
#endif
minPos = make_real4(min(minPos.x, blockPos.x-width.x), min(minPos.y, blockPos.y-width.y), min(minPos.z, blockPos.z-width.z), 0);
maxPos = make_real4(max(maxPos.x, blockPos.x+width.x), max(maxPos.y, blockPos.y+width.y), max(maxPos.z, blockPos.z+width.z), 0);
}
largeBlockCenter[i] = 0.5f*(maxPos+minPos);
largeBlockBoundingBox[i] = BoundingBox(trimTo3(0.5f*(maxPos-minPos)));
#endif
}
// Initialize min and max for the atomic reduction in findBlockBounds (for the next step)
if (i == 0) {
blockSizeRange->x = 1e38;
blockSizeRange->y = 0;
}
// Also check whether any atom has moved enough so that we really need to rebuild the neighbor list.
......@@ -239,8 +304,12 @@ void mfma4x4(const float4& pos1, const float4& pos2, const vfloat& c, unsigned i
extern "C" __global__ __launch_bounds__(GROUP_SIZE) void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
unsigned int* __restrict__ interactionCount, int* __restrict__ interactingTiles, unsigned int* __restrict__ interactingAtoms,
int2* __restrict__ singlePairs, const real4* __restrict__ posq, unsigned int maxTiles, unsigned int maxSinglePairs,
unsigned int startBlockIndex, unsigned int numBlocks, real2* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter,
const real4* __restrict__ sortedBlockBoundingBox, const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
unsigned int startBlockIndex, unsigned int numBlocks, const unsigned int* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter,
const BoundingBox* __restrict__ sortedBlockBoundingBox,
#ifdef USE_LARGE_BLOCKS
const real4* __restrict__ largeBlockCenter, const BoundingBox* __restrict__ largeBlockBoundingBox,
#endif
const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
real4* __restrict__ oldPositions, const int* __restrict__ rebuildNeighborList) {
if (rebuildNeighborList[0] == 0)
......@@ -271,10 +340,9 @@ extern "C" __global__ __launch_bounds__(GROUP_SIZE) void findBlocksWithInteracti
if (block1 < startBlockIndex+numBlocks) {
// Load data for this block. Note that all threads in a warp are processing the same block.
real2 sortedKey = sortedBlocks[block1];
int x = (int) sortedKey.y;
int x = sortedBlocks[block1] & BLOCK_INDEX_MASK;
real4 blockCenterX = sortedBlockCenter[block1];
real4 blockSizeX = sortedBlockBoundingBox[block1];
real3 blockSizeX = sortedBlockBoundingBox[block1].toReal3();
int neighborsInBuffer = 0;
real4 pos1 = posq[x*TILE_SIZE+indexInTile];
#ifdef USE_PERIODIC
......@@ -309,10 +377,52 @@ extern "C" __global__ __launch_bounds__(GROUP_SIZE) void findBlocksWithInteracti
// units are idle at the end of the kernel (the kernel works on the upper triangle of
// the NUM_BLOCKS x NUM_BLOCKS matrix).
#ifdef USE_LARGE_BLOCKS
warpflags largeBlockFlags = 0;
int loadedLargeBlocks = 0;
#endif
int block2Count = 0;
// Load blocks from addresses aligned by warpSize for faster loading from sortedBlockCenter and sortedBlockBoundingBox.
for (int block2Base = ((block1+1)/warpSize + warpIndex%NUM_TILES_IN_BATCH)*warpSize; block2Base < NUM_BLOCKS; block2Base += warpSize*NUM_TILES_IN_BATCH) {
// Last iteration cannot be skipped (on CDNA where tilesPerWarp == 2)
const bool lastIteration = block2Base + warpSize*NUM_TILES_IN_BATCH >= NUM_BLOCKS;
#ifdef USE_LARGE_BLOCKS
if (loadedLargeBlocks == 0) {
// Check the next set of large blocks.
int largeBlockIndex = block2Base + warpSize*NUM_TILES_IN_BATCH*indexInWarp;
bool includeLargeBlock = false;
if (largeBlockIndex < NUM_BLOCKS) {
real4 largeCenter = largeBlockCenter[largeBlockIndex];
real3 largeSize = largeBlockBoundingBox[largeBlockIndex].toReal3();
real4 blockDelta = blockCenterX-largeCenter;
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(blockDelta)
#endif
blockDelta.x = max(0.0f, fabs(blockDelta.x)-blockSizeX.x-largeSize.x);
blockDelta.y = max(0.0f, fabs(blockDelta.y)-blockSizeX.y-largeSize.y);
blockDelta.z = max(0.0f, fabs(blockDelta.z)-blockSizeX.z-largeSize.z);
includeLargeBlock = (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < PADDED_CUTOFF_SQUARED);
#ifdef TRICLINIC
// The calculation to find the nearest periodic copy is only guaranteed to work if the nearest copy is less than half a box width away.
// If there's any possibility we might have missed it, do a detailed check.
if (periodicBoxSize.z/2-blockSizeX.z-largeSize.z < PADDED_CUTOFF || periodicBoxSize.y/2-blockSizeX.y-largeSize.y < PADDED_CUTOFF)
includeLargeBlock = true;
#endif
}
largeBlockFlags = __ballot(includeLargeBlock);
loadedLargeBlocks = warpSize;
}
loadedLargeBlocks--;
if ((largeBlockFlags&1) == 0 && !lastIteration) {
// None of the next warpSize blocks interact with block 1.
largeBlockFlags >>= 1;
continue;
}
largeBlockFlags >>= 1;
#endif
int block2 = block2Base+indexInWarp;
bool includeBlock2 = (block1 < block2 && block2 < NUM_BLOCKS);
block2 = includeBlock2 ? block2 : block1;
......@@ -327,7 +437,7 @@ extern "C" __global__ __launch_bounds__(GROUP_SIZE) void findBlocksWithInteracti
if (!lastIteration && __ballot(includeBlock2) == 0)
continue;
#endif
real4 blockSizeY = sortedBlockBoundingBox[block2];
real3 blockSizeY = sortedBlockBoundingBox[block2].toReal3();
blockDelta.x = max(0.0f, fabs(blockDelta.x)-blockSizeX.x-blockSizeY.x);
blockDelta.y = max(0.0f, fabs(blockDelta.y)-blockSizeX.y-blockSizeY.y);
blockDelta.z = max(0.0f, fabs(blockDelta.z)-blockSizeX.z-blockSizeY.z);
......@@ -359,7 +469,7 @@ extern "C" __global__ __launch_bounds__(GROUP_SIZE) void findBlocksWithInteracti
const int b = block2Buffer[min(block2Index + tileInWarp, block2Count - 1)];
const bool forceInclude = b & 1;
const int block2 = b >> 1;
int y = (int) sortedBlocks[block2].y;
int y = sortedBlocks[block2] & BLOCK_INDEX_MASK;
#pragma unroll 1
for (int k = indexInTile; k < numExclusions; k += TILE_SIZE)
......
......@@ -236,7 +236,7 @@ extern "C" __launch_bounds__(THREAD_BLOCK_SIZE) __global__ void computeNonbonded
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
#ifdef USE_NEIGHBOR_LIST
const unsigned int numTiles = interactionCount[0];
if (numTiles > maxTiles)
return; // There wasn't enough memory for the neighbor list.
......@@ -261,7 +261,7 @@ extern "C" __launch_bounds__(THREAD_BLOCK_SIZE) __global__ void computeNonbonded
// Extract the coordinates of this tile.
int x, y;
bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
#ifdef USE_NEIGHBOR_LIST
x = tiles[pos];
real4 blockSizeX = blockSize[x];
singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= MAX_CUTOFF &&
......@@ -296,7 +296,7 @@ extern "C" __launch_bounds__(THREAD_BLOCK_SIZE) __global__ void computeNonbonded
// Load atom data for this tile.
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
#ifdef USE_CUTOFF
#ifdef USE_NEIGHBOR_LIST
unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
#else
unsigned int j = y*TILE_SIZE + tgx;
......@@ -453,7 +453,7 @@ extern "C" __launch_bounds__(THREAD_BLOCK_SIZE) __global__ void computeNonbonded
// Third loop: single pairs that aren't part of a tile.
#if USE_CUTOFF
#if USE_NEIGHBOR_LIST
const unsigned int numPairs = interactionCount[1];
if (numPairs > maxSinglePairs)
return; // There wasn't enough memory for the neighbor list.
......
......@@ -95,6 +95,7 @@ inline __device__ void reduceMinMax(KEY_TYPE minimum, KEY_TYPE maximum, KEY_TYPE
*/
__global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int length, KEY_TYPE* __restrict__ range,
unsigned int numBuckets, unsigned int* __restrict__ bucketOffset, unsigned int* __restrict__ counters) {
#if UNIFORM
extern __shared__ KEY_TYPE minBuffer[];
KEY_TYPE* maxBuffer = minBuffer+blockDim.x;
KEY_TYPE minimum = MAX_KEY;
......@@ -131,6 +132,7 @@ __global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int le
}
reduceMinMax(minimum, maximum, minBuffer, maxBuffer, &range[0], &range[1]);
}
#endif
// Clear the bucket counters in preparation for the next kernel.
......@@ -139,7 +141,7 @@ __global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int le
}
/**
* Assign elements to buckets.
* Assign elements to buckets. This version is optimized for uniformly distributed data.
*/
__global__ void assignElementsToBuckets(const DATA_TYPE* __restrict__ data, unsigned int length, unsigned int numBuckets, const KEY_TYPE* __restrict__ range,
unsigned int* __restrict__ bucketOffset, unsigned int* __restrict__ bucketOfElement, unsigned int* __restrict__ offsetInBucket) {
......@@ -154,6 +156,86 @@ __global__ void assignElementsToBuckets(const DATA_TYPE* __restrict__ data, unsi
}
}
/**
* Assign elements to buckets. This version is optimized for non-uniformly distributed data.
*/
__global__ void assignElementsToBuckets2(const DATA_TYPE* __restrict__ data, unsigned int length, unsigned int numBuckets, const KEY_TYPE* __restrict__ range,
unsigned int* __restrict__ bucketOffset, unsigned int* __restrict__ bucketOfElement, unsigned int* __restrict__ offsetInBucket) {
// Load 64 datapoints and sort them to get an estimate of the data distribution.
__shared__ KEY_TYPE elements[64];
if (threadIdx.x < 64) {
int index = (int) (threadIdx.x*length/64.0);
elements[threadIdx.x] = getValue(data[index]);
}
__syncthreads();
for (unsigned int k = 2; k <= 64; k *= 2) {
for (unsigned int j = k/2; j > 0; j /= 2) {
if (threadIdx.x < 64) {
int ixj = threadIdx.x^j;
if (ixj > threadIdx.x) {
KEY_TYPE value1 = elements[threadIdx.x];
KEY_TYPE value2 = elements[ixj];
bool ascending = (threadIdx.x&k) == 0;
KEY_TYPE lowKey = (ascending ? value1 : value2);
KEY_TYPE highKey = (ascending ? value2 : value1);
if (lowKey > highKey) {
elements[threadIdx.x] = value2;
elements[ixj] = value1;
}
}
}
__syncthreads();
}
}
// Create a function composed of linear segments mapping data values to bucket indices.
__shared__ float segmentLowerBound[9];
__shared__ float segmentBaseIndex[9];
__shared__ float segmentIndexScale[9];
if (threadIdx.x == 0) {
segmentLowerBound[0] = elements[0]-0.2f*(elements[5]-elements[0]);
segmentLowerBound[1] = elements[5];
segmentLowerBound[2] = elements[10];
segmentLowerBound[3] = elements[20];
segmentLowerBound[4] = elements[30];
segmentLowerBound[5] = elements[40];
segmentLowerBound[6] = elements[50];
segmentLowerBound[7] = elements[60];
segmentLowerBound[8] = elements[63]+0.2f*(elements[63]-elements[58]);
segmentBaseIndex[0] = numBuckets/16;
segmentBaseIndex[1] = 3*numBuckets/16;
segmentBaseIndex[2] = 5*numBuckets/16;
segmentBaseIndex[3] = 7*numBuckets/16;
segmentBaseIndex[4] = 9*numBuckets/16;
segmentBaseIndex[5] = 11*numBuckets/16;
segmentBaseIndex[6] = 13*numBuckets/16;
segmentBaseIndex[7] = 15*numBuckets/16;
segmentBaseIndex[8] = numBuckets;
for (int i = 0; i < 8; i++)
if (segmentLowerBound[i+1] == segmentLowerBound[i])
segmentIndexScale[i] = 0;
else
segmentIndexScale[i] = (segmentBaseIndex[i+1]-segmentBaseIndex[i])/(segmentLowerBound[i+1]-segmentLowerBound[i]);
}
__syncthreads();
// Assign elements to buckets.
for (unsigned int index = blockDim.x*blockIdx.x+threadIdx.x; index < length; index += blockDim.x*gridDim.x) {
float key = (float) getValue(data[index]);
int segment;
for (segment = 0; segment < 7 && key > segmentLowerBound[segment+1]; segment++)
;
unsigned int bucketIndex = segmentBaseIndex[segment]+(key-segmentLowerBound[segment])*segmentIndexScale[segment];
bucketIndex = min(max(0, bucketIndex), numBuckets-1);
offsetInBucket[index] = atomicAdd(&bucketOffset[bucketIndex], 1);
bucketOfElement[index] = bucketIndex;
}
}
/**
* Sum the bucket sizes to compute the start position of each bucket. This kernel
* is executed as a single work group.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment