Unverified Commit 3955033a authored by Peter Eastman's avatar Peter Eastman Committed by GitHub
Browse files

Use large blocks to optimize building the neighbor list (#4147)

* Use large blocks to optimize building the neighbor list

* Large blocks optimization for OpenCL

* Fix test failures

* Select whether to use large blocks based on system size
parent ae5bc677
......@@ -338,6 +338,8 @@ private:
CudaArray sortedBlocks;
CudaArray sortedBlockCenter;
CudaArray sortedBlockBoundingBox;
CudaArray largeBlockCenter;
CudaArray largeBlockBoundingBox;
CudaArray oldPositions;
CudaArray rebuildNeighborList;
CudaSort* blockSorter;
......@@ -351,7 +353,7 @@ private:
std::map<int, double> groupCutoff;
std::map<int, std::string> groupKernelSource;
double lastCutoff;
bool useCutoff, usePeriodic, anyExclusions, usePadding, useNeighborList, forceRebuildNeighborList, canUsePairList;
bool useCutoff, usePeriodic, anyExclusions, usePadding, useNeighborList, forceRebuildNeighborList, canUsePairList, useLargeBlocks;
int startTileIndex, startBlockIndex, numBlocks, maxExclusions, numForceThreadBlocks, forceThreadBlockSize, numAtoms, groupFlags;
unsigned int maxTiles, maxSinglePairs, tilesAfterReorder;
long long numTiles;
......
......@@ -74,6 +74,13 @@ CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(c
CHECK_RESULT(cuMemHostAlloc((void**) &pinnedCountBuffer, 2*sizeof(unsigned int), CU_MEMHOSTALLOC_PORTABLE));
numForceThreadBlocks = 4*multiprocessors;
forceThreadBlockSize = (context.getComputeCapability() < 2.0 ? 128 : 256);
// When building the neighbor list, we can optionally use large blocks (1024 atoms) to
// accelerate the process. This makes building the neighbor list faster, but it prevents
// us from sorting atom blocks by size, which leads to a slightly less efficient neighbor
// list. We guess based on system size which will be faster.
useLargeBlocks = (context.getNumAtoms() > 90000);
setKernelSource(CudaKernelSources::nonbonded);
}
......@@ -278,6 +285,8 @@ void CudaNonbondedUtilities::initialize(const System& system) {
sortedBlocks.initialize(context, numAtomBlocks, 2*elementSize, "sortedBlocks");
sortedBlockCenter.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockCenter");
sortedBlockBoundingBox.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockBoundingBox");
largeBlockCenter.initialize(context, numAtomBlocks, 4*elementSize, "largeBlockCenter");
largeBlockBoundingBox.initialize(context, numAtomBlocks, 4*elementSize, "largeBlockBoundingBox");
oldPositions.initialize(context, numAtoms, 4*elementSize, "oldPositions");
rebuildNeighborList.initialize<int>(context, 1, "rebuildNeighborList");
blockSorter = new CudaSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks, false);
......@@ -333,6 +342,15 @@ void CudaNonbondedUtilities::initialize(const System& system) {
sortBoxDataArgs.push_back(&blockBoundingBox.getDevicePointer());
sortBoxDataArgs.push_back(&sortedBlockCenter.getDevicePointer());
sortBoxDataArgs.push_back(&sortedBlockBoundingBox.getDevicePointer());
if (useLargeBlocks) {
sortBoxDataArgs.push_back(&largeBlockCenter.getDevicePointer());
sortBoxDataArgs.push_back(&largeBlockBoundingBox.getDevicePointer());
sortBoxDataArgs.push_back(context.getPeriodicBoxSizePointer());
sortBoxDataArgs.push_back(context.getInvPeriodicBoxSizePointer());
sortBoxDataArgs.push_back(context.getPeriodicBoxVecXPointer());
sortBoxDataArgs.push_back(context.getPeriodicBoxVecYPointer());
sortBoxDataArgs.push_back(context.getPeriodicBoxVecZPointer());
}
sortBoxDataArgs.push_back(&context.getPosq().getDevicePointer());
sortBoxDataArgs.push_back(&oldPositions.getDevicePointer());
sortBoxDataArgs.push_back(&interactionCount.getDevicePointer());
......@@ -355,6 +373,10 @@ void CudaNonbondedUtilities::initialize(const System& system) {
findInteractingBlocksArgs.push_back(&sortedBlocks.getDevicePointer());
findInteractingBlocksArgs.push_back(&sortedBlockCenter.getDevicePointer());
findInteractingBlocksArgs.push_back(&sortedBlockBoundingBox.getDevicePointer());
if (useLargeBlocks) {
findInteractingBlocksArgs.push_back(&largeBlockCenter.getDevicePointer());
findInteractingBlocksArgs.push_back(&largeBlockBoundingBox.getDevicePointer());
}
findInteractingBlocksArgs.push_back(&exclusionIndices.getDevicePointer());
findInteractingBlocksArgs.push_back(&exclusionRowIndices.getDevicePointer());
findInteractingBlocksArgs.push_back(&oldPositions.getDevicePointer());
......@@ -396,6 +418,7 @@ void CudaNonbondedUtilities::prepareInteractions(int forceGroups) {
if (lastCutoff != kernels.cutoffDistance)
forceRebuildNeighborList = true;
context.executeKernel(kernels.findBlockBoundsKernel, &findBlockBoundsArgs[0], context.getNumAtoms());
if (!useLargeBlocks)
blockSorter->sort(sortedBlocks);
context.executeKernel(kernels.sortBoxDataKernel, &sortBoxDataArgs[0], context.getNumAtoms());
context.executeKernel(kernels.findInteractingBlocksKernel, &findInteractingBlocksArgs[0], context.getNumAtoms(), 256);
......@@ -503,6 +526,8 @@ void CudaNonbondedUtilities::createKernelsForGroups(int groups) {
defines["USE_PERIODIC"] = "1";
if (context.getBoxIsTriclinic())
defines["TRICLINIC"] = "1";
if (useLargeBlocks)
defines["USE_LARGE_BLOCKS"] = "1";
defines["MAX_EXCLUSIONS"] = context.intToString(maxExclusions);
defines["MAX_BITS_FOR_PAIRS"] = (canUsePairList ? (context.getComputeCapability() < 8.0 ? "2" : "3") : "0");
CUmodule interactingBlocksProgram = context.createModule(CudaKernelSources::vectorOps+CudaKernelSources::findInteractingBlocks, defines);
......
......@@ -86,15 +86,39 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize,
* Sort the data about bounding boxes so it can be accessed more efficiently in the next kernel.
*/
extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, const real4* __restrict__ blockCenter,
const real4* __restrict__ blockBoundingBox, real4* __restrict__ sortedBlockCenter,
half3* __restrict__ sortedBlockBoundingBox, const real4* __restrict__ posq, const real4* __restrict__ oldPositions,
const real4* __restrict__ blockBoundingBox, real4* __restrict__ sortedBlockCenter, half3* __restrict__ sortedBlockBoundingBox,
#ifdef USE_LARGE_BLOCKS
real4* __restrict__ largeBlockCenter, half3* __restrict__ largeBlockBoundingBox, real4 periodicBoxSize,
real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
#endif
const real4* __restrict__ posq, const real4* __restrict__ oldPositions,
unsigned int* __restrict__ interactionCount, int* __restrict__ rebuildNeighborList, bool forceRebuild) {
for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_BLOCKS; i += blockDim.x*gridDim.x) {
int index = (int) sortedBlock[i].y;
sortedBlockCenter[i] = blockCenter[index];
sortedBlockBoundingBox[i] = half3(trimTo3(blockBoundingBox[index]));
}
#ifdef USE_LARGE_BLOCKS
// Compute the sizes of large blocks (composed of 32 regular blocks) starting from each block.
for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_BLOCKS; i += blockDim.x*gridDim.x) {
real4 minPos = blockCenter[i]-blockBoundingBox[i];
real4 maxPos = blockCenter[i]+blockBoundingBox[i];
int last = min(i+32, NUM_BLOCKS);
for (int j = i+1; j < last; j++) {
real4 blockPos = blockCenter[j];
real4 width = blockBoundingBox[j];
#ifdef USE_PERIODIC
real4 center = 0.5f*(maxPos+minPos);
APPLY_PERIODIC_TO_POS_WITH_CENTER(blockPos, center)
#endif
minPos = make_real4(min(minPos.x, blockPos.x-width.x), min(minPos.y, blockPos.y-width.y), min(minPos.z, blockPos.z-width.z), 0);
maxPos = make_real4(max(maxPos.x, blockPos.x+width.x), max(maxPos.y, blockPos.y+width.y), max(maxPos.z, blockPos.z+width.z), 0);
}
largeBlockCenter[i] = 0.5f*(maxPos+minPos);
largeBlockBoundingBox[i] = half3(trimTo3(0.5f*(maxPos-minPos)));
}
#endif
// Also check whether any atom has moved enough so that we really need to rebuild the neighbor list.
bool rebuild = forceRebuild;
......@@ -213,9 +237,12 @@ __device__ int saveSinglePairs(int x, int* atoms, int* flags, int length, unsign
*/
extern "C" __global__ __launch_bounds__(GROUP_SIZE,3) void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
unsigned int* __restrict__ interactionCount, int* __restrict__ interactingTiles, unsigned int* __restrict__ interactingAtoms,
int2* __restrict__ singlePairs, const real4* __restrict__ posq, unsigned int maxTiles, unsigned int maxSinglePairs,
unsigned int startBlockIndex, unsigned int numBlocks, real2* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter,
const half3* __restrict__ sortedBlockBoundingBox, const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
int2* __restrict__ singlePairs, const real4* __restrict__ posq, unsigned int maxTiles, unsigned int maxSinglePairs, unsigned int startBlockIndex,
unsigned int numBlocks, real2* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter, const half3* __restrict__ sortedBlockBoundingBox,
#ifdef USE_LARGE_BLOCKS
real4* __restrict__ largeBlockCenter, half3* __restrict__ largeBlockBoundingBox,
#endif
const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
real4* __restrict__ oldPositions, const int* __restrict__ rebuildNeighborList) {
if (rebuildNeighborList[0] == 0)
......@@ -278,7 +305,41 @@ extern "C" __global__ __launch_bounds__(GROUP_SIZE,3) void findBlocksWithInterac
// Loop over atom blocks to search for neighbors. The threads in a warp compare block1 against 32
// other blocks in parallel.
#ifdef USE_LARGE_BLOCKS
int largeBlockFlags = 0;
int loadedLargeBlocks = 0;
#endif
for (int block2Base = block1+1; block2Base < NUM_BLOCKS; block2Base += 32) {
#ifdef USE_LARGE_BLOCKS
if (loadedLargeBlocks == 0) {
// Check the next set of large blocks.
int largeBlockIndex = block2Base + 32*indexInWarp;
bool includeLargeBlock = false;
if (largeBlockIndex < NUM_BLOCKS) {
real4 largeCenter = largeBlockCenter[largeBlockIndex];
real3 largeSize = largeBlockBoundingBox[largeBlockIndex].toReal3();
real4 blockDelta = blockCenterX-largeCenter;
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(blockDelta)
#endif
blockDelta.x = max(0.0f, fabs(blockDelta.x)-blockSizeX.x-largeSize.x);
blockDelta.y = max(0.0f, fabs(blockDelta.y)-blockSizeX.y-largeSize.y);
blockDelta.z = max(0.0f, fabs(blockDelta.z)-blockSizeX.z-largeSize.z);
includeLargeBlock = (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < PADDED_CUTOFF_SQUARED);
}
largeBlockFlags = BALLOT(includeLargeBlock);
loadedLargeBlocks = 32;
}
loadedLargeBlocks--;
if ((largeBlockFlags&1) == 0) {
// None of the next 32 blocks interact with block 1.
largeBlockFlags >>= 1;
continue;
}
largeBlockFlags >>= 1;
#endif
int block2 = block2Base+indexInWarp;
bool includeBlock2 = (block2 < NUM_BLOCKS);
bool forceInclude = false;
......
......@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2022 Stanford University and the Authors. *
* Portions copyright (c) 2009-2023 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -319,6 +319,8 @@ private:
OpenCLArray sortedBlocks;
OpenCLArray sortedBlockCenter;
OpenCLArray sortedBlockBoundingBox;
OpenCLArray largeBlockCenter;
OpenCLArray largeBlockBoundingBox;
OpenCLArray oldPositions;
OpenCLArray rebuildNeighborList;
OpenCLSort* blockSorter;
......@@ -332,7 +334,7 @@ private:
std::map<int, double> groupCutoff;
std::map<int, std::string> groupKernelSource;
double lastCutoff;
bool useCutoff, usePeriodic, deviceIsCpu, anyExclusions, usePadding, useNeighborList, forceRebuildNeighborList;
bool useCutoff, usePeriodic, deviceIsCpu, anyExclusions, usePadding, useNeighborList, forceRebuildNeighborList, useLargeBlocks;
int startTileIndex, startBlockIndex, numBlocks, maxExclusions, numForceThreadBlocks;
int forceThreadBlockSize, interactingBlocksThreadBlockSize, groupFlags;
unsigned int tilesAfterReorder;
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2022 Stanford University and the Authors. *
* Portions copyright (c) 2009-2023 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -80,6 +80,13 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
}
pinnedCountBuffer = new cl::Buffer(context.getContext(), CL_MEM_ALLOC_HOST_PTR, sizeof(unsigned int));
pinnedCountMemory = (unsigned int*) context.getQueue().enqueueMapBuffer(*pinnedCountBuffer, CL_TRUE, CL_MAP_READ, 0, sizeof(int));
// When building the neighbor list, we can optionally use large blocks (1024 atoms) to
// accelerate the process. This makes building the neighbor list faster, but it prevents
// us from sorting atom blocks by size, which leads to a slightly less efficient neighbor
// list. We guess based on system size which will be faster.
useLargeBlocks = (context.getNumAtoms() > 100000);
setKernelSource(deviceIsCpu ? OpenCLKernelSources::nonbonded_cpu : OpenCLKernelSources::nonbonded);
}
......@@ -293,6 +300,8 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
sortedBlocks.initialize(context, numAtomBlocks, 2*elementSize, "sortedBlocks");
sortedBlockCenter.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockCenter");
sortedBlockBoundingBox.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockBoundingBox");
largeBlockCenter.initialize(context, numAtomBlocks, 4*elementSize, "largeBlockCenter");
largeBlockBoundingBox.initialize(context, numAtomBlocks, 4*elementSize, "largeBlockBoundingBox");
oldPositions.initialize(context, numAtoms, 4*elementSize, "oldPositions");
rebuildNeighborList.initialize<int>(context, 1, "rebuildNeighborList");
blockSorter = new OpenCLSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks, false);
......@@ -354,6 +363,9 @@ void OpenCLNonbondedUtilities::prepareInteractions(int forceGroups) {
forceRebuildNeighborList = true;
setPeriodicBoxArgs(context, kernels.findBlockBoundsKernel, 1);
context.executeKernel(kernels.findBlockBoundsKernel, context.getNumAtoms());
if (useLargeBlocks)
setPeriodicBoxArgs(context, kernels.sortBoxDataKernel, 12);
else
blockSorter->sort(sortedBlocks);
kernels.sortBoxDataKernel.setArg<cl_int>(9, forceRebuildNeighborList);
context.executeKernel(kernels.sortBoxDataKernel, context.getNumAtoms());
......@@ -502,6 +514,8 @@ void OpenCLNonbondedUtilities::createKernelsForGroups(int groups) {
defines["USE_PERIODIC"] = "1";
if (context.getBoxIsTriclinic())
defines["TRICLINIC"] = "1";
if (useLargeBlocks)
defines["USE_LARGE_BLOCKS"] = "1";
defines["MAX_EXCLUSIONS"] = context.intToString(maxExclusions);
defines["BUFFER_GROUPS"] = (deviceIsCpu ? "4" : "2");
string file = (deviceIsCpu ? OpenCLKernelSources::findInteractingBlocks_cpu : OpenCLKernelSources::findInteractingBlocks);
......@@ -527,6 +541,10 @@ void OpenCLNonbondedUtilities::createKernelsForGroups(int groups) {
kernels.sortBoxDataKernel.setArg<cl::Buffer>(7, interactionCount.getDeviceBuffer());
kernels.sortBoxDataKernel.setArg<cl::Buffer>(8, rebuildNeighborList.getDeviceBuffer());
kernels.sortBoxDataKernel.setArg<cl_int>(9, true);
if (useLargeBlocks) {
kernels.sortBoxDataKernel.setArg<cl::Buffer>(10, largeBlockCenter.getDeviceBuffer());
kernels.sortBoxDataKernel.setArg<cl::Buffer>(11, largeBlockBoundingBox.getDeviceBuffer());
}
kernels.findInteractingBlocksKernel = cl::Kernel(interactingBlocksProgram, "findBlocksWithInteractions");
kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(5, interactionCount.getDeviceBuffer());
kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactingTiles.getDeviceBuffer());
......@@ -542,6 +560,10 @@ void OpenCLNonbondedUtilities::createKernelsForGroups(int groups) {
kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(16, exclusionRowIndices.getDeviceBuffer());
kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(17, oldPositions.getDeviceBuffer());
kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(18, rebuildNeighborList.getDeviceBuffer());
if (useLargeBlocks) {
kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(19, largeBlockCenter.getDeviceBuffer());
kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(20, largeBlockBoundingBox.getDeviceBuffer());
}
if (kernels.findInteractingBlocksKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()) < groupSize) {
// The device can't handle this block size, so reduce it.
......
......@@ -54,13 +54,38 @@ __kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeri
__kernel void sortBoxData(__global const real2* restrict sortedBlock, __global const real4* restrict blockCenter,
__global const real4* restrict blockBoundingBox, __global real4* restrict sortedBlockCenter,
__global real4* restrict sortedBlockBoundingBox, __global const real4* restrict posq, __global const real4* restrict oldPositions,
__global unsigned int* restrict interactionCount, __global int* restrict rebuildNeighborList, int forceRebuild) {
__global unsigned int* restrict interactionCount, __global int* restrict rebuildNeighborList, int forceRebuild
#ifdef USE_LARGE_BLOCKS
, __global real4* restrict largeBlockCenter, __global real4* restrict largeBlockBoundingBox, real4 periodicBoxSize,
real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
#endif
) {
for (int i = get_global_id(0); i < NUM_BLOCKS; i += get_global_size(0)) {
int index = (int) sortedBlock[i].y;
sortedBlockCenter[i] = blockCenter[index];
sortedBlockBoundingBox[i] = blockBoundingBox[index];
}
#ifdef USE_LARGE_BLOCKS
// Compute the sizes of large blocks (composed of 32 regular blocks) starting from each block.
for (int i = get_global_id(0); i < NUM_BLOCKS; i += get_global_size(0)) {
real4 minPos = blockCenter[i]-blockBoundingBox[i];
real4 maxPos = blockCenter[i]+blockBoundingBox[i];
int last = min(i+32, NUM_BLOCKS);
for (int j = i+1; j < last; j++) {
real4 blockPos = blockCenter[j];
real4 width = blockBoundingBox[j];
#ifdef USE_PERIODIC
real4 center = 0.5f*(maxPos+minPos);
APPLY_PERIODIC_TO_POS_WITH_CENTER(blockPos, center)
#endif
minPos = min(minPos, blockPos-width);
maxPos = max(maxPos, blockPos+width);
}
largeBlockCenter[i] = 0.5f*(maxPos+minPos);
largeBlockBoundingBox[i] = 0.5f*(maxPos-minPos);
}
#endif
// Also check whether any atom has moved enough so that we really need to rebuild the neighbor list.
bool rebuild = forceRebuild;
......@@ -84,7 +109,11 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
__global const real4* restrict posq, unsigned int maxTiles, unsigned int startBlockIndex, unsigned int numBlocks, __global real2* restrict sortedBlocks,
__global const real4* restrict sortedBlockCenter, __global const real4* restrict sortedBlockBoundingBox,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __global real4* restrict oldPositions,
__global const int* restrict rebuildNeighborList) {
__global const int* restrict rebuildNeighborList
#ifdef USE_LARGE_BLOCKS
, __global real4* restrict largeBlockCenter, __global real4* restrict largeBlockBoundingBox
#endif
) {
if (rebuildNeighborList[0] == 0)
return; // The neighbor list doesn't need to be rebuilt.
......@@ -103,6 +132,9 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
__local int* buffer = workgroupBuffer+BUFFER_SIZE*(warpStart/32);
__local int* exclusionsForX = warpExclusions+MAX_EXCLUSIONS*(warpStart/32);
__local volatile unsigned int* tileStartIndex = workgroupTileIndex+(warpStart/32);
#ifdef USE_LARGE_BLOCKS
__local bool largeBlockFlags[GROUP_SIZE];
#endif
// Loop over blocks.
......@@ -143,7 +175,38 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
// Loop over atom blocks to search for neighbors. The threads in a warp compare block1 against 32
// other blocks in parallel.
#ifdef USE_LARGE_BLOCKS
int loadedLargeBlocks = 0;
#endif
for (int block2Base = block1+1; block2Base < NUM_BLOCKS; block2Base += 32) {
#ifdef USE_LARGE_BLOCKS
if (loadedLargeBlocks == 0) {
// Check the next set of large blocks.
int largeBlockIndex = block2Base + 32*indexInWarp;
bool includeLargeBlock = false;
if (largeBlockIndex < NUM_BLOCKS) {
real4 largeCenter = largeBlockCenter[largeBlockIndex];
real4 largeSize = largeBlockBoundingBox[largeBlockIndex];
real4 blockDelta = blockCenterX-largeCenter;
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(blockDelta)
#endif
blockDelta.x = max((real) 0, fabs(blockDelta.x)-blockSizeX.x-largeSize.x);
blockDelta.y = max((real) 0, fabs(blockDelta.y)-blockSizeX.y-largeSize.y);
blockDelta.z = max((real) 0, fabs(blockDelta.z)-blockSizeX.z-largeSize.z);
includeLargeBlock = (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < PADDED_CUTOFF_SQUARED);
}
largeBlockFlags[get_local_id(0)] = includeLargeBlock;
loadedLargeBlocks = 32;
SYNC_WARPS;
}
if (!largeBlockFlags[warpStart+32-(loadedLargeBlocks--)]) {
// None of the next 32 blocks interact with block 1.
continue;
}
#endif
int block2 = block2Base+indexInWarp;
bool includeBlock2 = (block2 < NUM_BLOCKS);
if (includeBlock2) {
......
......@@ -44,7 +44,12 @@ __kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeri
__kernel void sortBoxData(__global const real2* restrict sortedBlock, __global const real4* restrict blockCenter,
__global const real4* restrict blockBoundingBox, __global real4* restrict sortedBlockCenter,
__global real4* restrict sortedBlockBoundingBox, __global const real4* restrict posq, __global const real4* restrict oldPositions,
__global unsigned int* restrict interactionCount, __global int* restrict rebuildNeighborList, int forceRebuild) {
__global unsigned int* restrict interactionCount, __global int* restrict rebuildNeighborList, int forceRebuild
#ifdef USE_LARGE_BLOCKS
, __global real4* restrict largeBlockCenter, __global real4* restrict largeBlockBoundingBox, real4 periodicBoxSize,
real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
#endif
) {
for (int i = get_global_id(0); i < NUM_BLOCKS; i += get_global_size(0)) {
int index = (int) sortedBlock[i].y;
sortedBlockCenter[i] = blockCenter[index];
......@@ -164,7 +169,11 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
__global const real4* restrict posq, unsigned int maxTiles, unsigned int startBlockIndex, unsigned int numBlocks, __global real2* restrict sortedBlocks,
__global const real4* restrict sortedBlockCenter, __global const real4* restrict sortedBlockBoundingBox,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __global real4* restrict oldPositions,
__global const int* restrict rebuildNeighborList) {
__global const int* restrict rebuildNeighborList
#ifdef USE_LARGE_BLOCKS
, __global real4* restrict largeBlockCenter, __global real4* restrict largeBlockBoundingBox
#endif
) {
if (rebuildNeighborList[0] == 0)
return; // The neighbor list doesn't need to be rebuilt.
int buffer[BUFFER_SIZE];
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment