Unverified Commit 796ffaaa authored by Peter Eastman's avatar Peter Eastman Committed by GitHub
Browse files

Improved sorting of blocks when building neighbor list (#4343)

* Improved sorting of blocks when building neighbor list

* Improved block sorting for OpenCL

* Made sort keys more evenly distributed
parent 3bff14a2
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009-2022 Stanford University and the Authors. * * Portions copyright (c) 2009-2023 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -338,6 +338,7 @@ private: ...@@ -338,6 +338,7 @@ private:
CudaArray sortedBlocks; CudaArray sortedBlocks;
CudaArray sortedBlockCenter; CudaArray sortedBlockCenter;
CudaArray sortedBlockBoundingBox; CudaArray sortedBlockBoundingBox;
CudaArray blockSizeRange;
CudaArray largeBlockCenter; CudaArray largeBlockCenter;
CudaArray largeBlockBoundingBox; CudaArray largeBlockBoundingBox;
CudaArray oldPositions; CudaArray oldPositions;
...@@ -345,7 +346,7 @@ private: ...@@ -345,7 +346,7 @@ private:
CudaSort* blockSorter; CudaSort* blockSorter;
CUevent downloadCountEvent; CUevent downloadCountEvent;
unsigned int* pinnedCountBuffer; unsigned int* pinnedCountBuffer;
std::vector<void*> forceArgs, findBlockBoundsArgs, sortBoxDataArgs, findInteractingBlocksArgs; std::vector<void*> forceArgs, findBlockBoundsArgs, computeSortKeysArgs, sortBoxDataArgs, findInteractingBlocksArgs;
std::vector<std::vector<int> > atomExclusions; std::vector<std::vector<int> > atomExclusions;
std::vector<ParameterInfo> parameters; std::vector<ParameterInfo> parameters;
std::vector<ParameterInfo> arguments; std::vector<ParameterInfo> arguments;
...@@ -354,7 +355,7 @@ private: ...@@ -354,7 +355,7 @@ private:
std::map<int, std::string> groupKernelSource; std::map<int, std::string> groupKernelSource;
double lastCutoff; double lastCutoff;
bool useCutoff, usePeriodic, anyExclusions, usePadding, useNeighborList, forceRebuildNeighborList, canUsePairList, useLargeBlocks; bool useCutoff, usePeriodic, anyExclusions, usePadding, useNeighborList, forceRebuildNeighborList, canUsePairList, useLargeBlocks;
int startTileIndex, startBlockIndex, numBlocks, maxExclusions, numForceThreadBlocks, forceThreadBlockSize, numAtoms, groupFlags; int startTileIndex, startBlockIndex, numBlocks, maxExclusions, numForceThreadBlocks, forceThreadBlockSize, numAtoms, groupFlags, numBlockSizes;
unsigned int maxTiles, maxSinglePairs, tilesAfterReorder; unsigned int maxTiles, maxSinglePairs, tilesAfterReorder;
long long numTiles; long long numTiles;
std::string kernelSource; std::string kernelSource;
...@@ -371,6 +372,7 @@ public: ...@@ -371,6 +372,7 @@ public:
std::string source; std::string source;
CUfunction forceKernel, energyKernel, forceEnergyKernel; CUfunction forceKernel, energyKernel, forceEnergyKernel;
CUfunction findBlockBoundsKernel; CUfunction findBlockBoundsKernel;
CUfunction computeSortKeysKernel;
CUfunction sortBoxDataKernel; CUfunction sortBoxDataKernel;
CUfunction findInteractingBlocksKernel; CUfunction findInteractingBlocksKernel;
CUfunction findInteractionsWithinBlocksKernel; CUfunction findInteractionsWithinBlocksKernel;
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009-2022 Stanford University and the Authors. * * Portions copyright (c) 2009-2023 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -49,18 +49,15 @@ using namespace std; ...@@ -49,18 +49,15 @@ using namespace std;
class CudaNonbondedUtilities::BlockSortTrait : public CudaSort::SortTrait { class CudaNonbondedUtilities::BlockSortTrait : public CudaSort::SortTrait {
public: public:
BlockSortTrait(bool useDouble) : useDouble(useDouble) { BlockSortTrait() {}
} int getDataSize() const {return sizeof(int);}
int getDataSize() const {return useDouble ? sizeof(double2) : sizeof(float2);} int getKeySize() const {return sizeof(int);}
int getKeySize() const {return useDouble ? sizeof(double) : sizeof(float);} const char* getDataType() const {return "unsigned int";}
const char* getDataType() const {return "real2";} const char* getKeyType() const {return "unsigned int";}
const char* getKeyType() const {return "real";} const char* getMinKey() const {return "0";}
const char* getMinKey() const {return "-3.40282e+38f";} const char* getMaxKey() const {return "0xFFFFFFFFu";}
const char* getMaxKey() const {return "3.40282e+38f";} const char* getMaxValue() const {return "0xFFFFFFFFu";}
const char* getMaxValue() const {return "make_real2(3.40282e+38f, 3.40282e+38f)";} const char* getSortKey() const {return "value";}
const char* getSortKey() const {return "value.x";}
private:
bool useDouble;
}; };
CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(context), useCutoff(false), usePeriodic(false), useNeighborList(false), anyExclusions(false), usePadding(true), CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(context), useCutoff(false), usePeriodic(false), useNeighborList(false), anyExclusions(false), usePadding(true),
...@@ -282,14 +279,16 @@ void CudaNonbondedUtilities::initialize(const System& system) { ...@@ -282,14 +279,16 @@ void CudaNonbondedUtilities::initialize(const System& system) {
int elementSize = (context.getUseDoublePrecision() ? sizeof(double) : sizeof(float)); int elementSize = (context.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
blockCenter.initialize(context, numAtomBlocks, 4*elementSize, "blockCenter"); blockCenter.initialize(context, numAtomBlocks, 4*elementSize, "blockCenter");
blockBoundingBox.initialize(context, numAtomBlocks, 4*elementSize, "blockBoundingBox"); blockBoundingBox.initialize(context, numAtomBlocks, 4*elementSize, "blockBoundingBox");
sortedBlocks.initialize(context, numAtomBlocks, 2*elementSize, "sortedBlocks"); sortedBlocks.initialize<unsigned int>(context, numAtomBlocks, "sortedBlocks");
sortedBlockCenter.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockCenter"); sortedBlockCenter.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockCenter");
sortedBlockBoundingBox.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockBoundingBox"); sortedBlockBoundingBox.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockBoundingBox");
numBlockSizes = min((context.getNumAtomBlocks()+63)/64, context.getNumThreadBlocks());
blockSizeRange.initialize(context, numBlockSizes, 2*elementSize, "blockSizeRange");
largeBlockCenter.initialize(context, numAtomBlocks, 4*elementSize, "largeBlockCenter"); largeBlockCenter.initialize(context, numAtomBlocks, 4*elementSize, "largeBlockCenter");
largeBlockBoundingBox.initialize(context, numAtomBlocks, 4*elementSize, "largeBlockBoundingBox"); largeBlockBoundingBox.initialize(context, numAtomBlocks, 4*elementSize, "largeBlockBoundingBox");
oldPositions.initialize(context, numAtoms, 4*elementSize, "oldPositions"); oldPositions.initialize(context, numAtoms, 4*elementSize, "oldPositions");
rebuildNeighborList.initialize<int>(context, 1, "rebuildNeighborList"); rebuildNeighborList.initialize<int>(context, 1, "rebuildNeighborList");
blockSorter = new CudaSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks, false); blockSorter = new CudaSort(context, new BlockSortTrait(), numAtomBlocks, false);
vector<unsigned int> count(2, 0); vector<unsigned int> count(2, 0);
interactionCount.upload(count); interactionCount.upload(count);
rebuildNeighborList.upload(&count[0]); rebuildNeighborList.upload(&count[0]);
...@@ -336,7 +335,11 @@ void CudaNonbondedUtilities::initialize(const System& system) { ...@@ -336,7 +335,11 @@ void CudaNonbondedUtilities::initialize(const System& system) {
findBlockBoundsArgs.push_back(&blockCenter.getDevicePointer()); findBlockBoundsArgs.push_back(&blockCenter.getDevicePointer());
findBlockBoundsArgs.push_back(&blockBoundingBox.getDevicePointer()); findBlockBoundsArgs.push_back(&blockBoundingBox.getDevicePointer());
findBlockBoundsArgs.push_back(&rebuildNeighborList.getDevicePointer()); findBlockBoundsArgs.push_back(&rebuildNeighborList.getDevicePointer());
findBlockBoundsArgs.push_back(&sortedBlocks.getDevicePointer()); findBlockBoundsArgs.push_back(&blockSizeRange.getDevicePointer());
computeSortKeysArgs.push_back(&blockBoundingBox.getDevicePointer());
computeSortKeysArgs.push_back(&sortedBlocks.getDevicePointer());
computeSortKeysArgs.push_back(&blockSizeRange.getDevicePointer());
computeSortKeysArgs.push_back(&numBlockSizes);
sortBoxDataArgs.push_back(&sortedBlocks.getDevicePointer()); sortBoxDataArgs.push_back(&sortedBlocks.getDevicePointer());
sortBoxDataArgs.push_back(&blockCenter.getDevicePointer()); sortBoxDataArgs.push_back(&blockCenter.getDevicePointer());
sortBoxDataArgs.push_back(&blockBoundingBox.getDevicePointer()); sortBoxDataArgs.push_back(&blockBoundingBox.getDevicePointer());
...@@ -417,9 +420,9 @@ void CudaNonbondedUtilities::prepareInteractions(int forceGroups) { ...@@ -417,9 +420,9 @@ void CudaNonbondedUtilities::prepareInteractions(int forceGroups) {
if (lastCutoff != kernels.cutoffDistance) if (lastCutoff != kernels.cutoffDistance)
forceRebuildNeighborList = true; forceRebuildNeighborList = true;
context.executeKernel(kernels.findBlockBoundsKernel, &findBlockBoundsArgs[0], context.getNumAtoms()); context.executeKernel(kernels.findBlockBoundsKernel, &findBlockBoundsArgs[0], context.getNumAtomBlocks());
if (!useLargeBlocks) context.executeKernel(kernels.computeSortKeysKernel, &computeSortKeysArgs[0], context.getNumAtomBlocks());
blockSorter->sort(sortedBlocks); blockSorter->sort(sortedBlocks);
context.executeKernel(kernels.sortBoxDataKernel, &sortBoxDataArgs[0], context.getNumAtoms()); context.executeKernel(kernels.sortBoxDataKernel, &sortBoxDataArgs[0], context.getNumAtoms());
context.executeKernel(kernels.findInteractingBlocksKernel, &findInteractingBlocksArgs[0], context.getNumAtoms(), 256); context.executeKernel(kernels.findInteractingBlocksKernel, &findInteractingBlocksArgs[0], context.getNumAtoms(), 256);
forceRebuildNeighborList = false; forceRebuildNeighborList = false;
...@@ -530,8 +533,14 @@ void CudaNonbondedUtilities::createKernelsForGroups(int groups) { ...@@ -530,8 +533,14 @@ void CudaNonbondedUtilities::createKernelsForGroups(int groups) {
defines["USE_LARGE_BLOCKS"] = "1"; defines["USE_LARGE_BLOCKS"] = "1";
defines["MAX_EXCLUSIONS"] = context.intToString(maxExclusions); defines["MAX_EXCLUSIONS"] = context.intToString(maxExclusions);
defines["MAX_BITS_FOR_PAIRS"] = (canUsePairList ? (context.getComputeCapability() < 8.0 ? "2" : "3") : "0"); defines["MAX_BITS_FOR_PAIRS"] = (canUsePairList ? (context.getComputeCapability() < 8.0 ? "2" : "3") : "0");
int binShift = 1;
while (1<<binShift <= context.getNumAtomBlocks())
binShift++;
defines["BIN_SHIFT"] = context.intToString(binShift);
defines["BLOCK_INDEX_MASK"] = context.intToString((1<<binShift)-1);
CUmodule interactingBlocksProgram = context.createModule(CudaKernelSources::vectorOps+CudaKernelSources::findInteractingBlocks, defines); CUmodule interactingBlocksProgram = context.createModule(CudaKernelSources::vectorOps+CudaKernelSources::findInteractingBlocks, defines);
kernels.findBlockBoundsKernel = context.getKernel(interactingBlocksProgram, "findBlockBounds"); kernels.findBlockBoundsKernel = context.getKernel(interactingBlocksProgram, "findBlockBounds");
kernels.computeSortKeysKernel = context.getKernel(interactingBlocksProgram, "computeSortKeys");
kernels.sortBoxDataKernel = context.getKernel(interactingBlocksProgram, "sortBoxData"); kernels.sortBoxDataKernel = context.getKernel(interactingBlocksProgram, "sortBoxData");
kernels.findInteractingBlocksKernel = context.getKernel(interactingBlocksProgram, "findBlocksWithInteractions"); kernels.findInteractingBlocksKernel = context.getKernel(interactingBlocksProgram, "findBlocksWithInteractions");
} }
......
...@@ -40,9 +40,10 @@ private: ...@@ -40,9 +40,10 @@ private:
*/ */
extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
const real4* __restrict__ posq, real4* __restrict__ blockCenter, real4* __restrict__ blockBoundingBox, int* __restrict__ rebuildNeighborList, const real4* __restrict__ posq, real4* __restrict__ blockCenter, real4* __restrict__ blockBoundingBox, int* __restrict__ rebuildNeighborList,
real2* __restrict__ sortedBlocks) { real2* __restrict__ blockSizeRange) {
int index = blockIdx.x*blockDim.x+threadIdx.x; int index = blockIdx.x*blockDim.x+threadIdx.x;
int base = index*TILE_SIZE; int base = index*TILE_SIZE;
real minSize = 1e38, maxSize = 0;
while (base < numAtoms) { while (base < numAtoms) {
real4 pos = posq[base]; real4 pos = posq[base];
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
...@@ -74,18 +75,66 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, ...@@ -74,18 +75,66 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize,
center.w = sqrt(center.w); center.w = sqrt(center.w);
blockBoundingBox[index] = blockSize; blockBoundingBox[index] = blockSize;
blockCenter[index] = center; blockCenter[index] = center;
sortedBlocks[index] = make_real2(blockSize.x+blockSize.y+blockSize.z, index); real totalSize = blockSize.x+blockSize.y+blockSize.z;
minSize = min(minSize, totalSize);
maxSize = max(maxSize, totalSize);
index += blockDim.x*gridDim.x; index += blockDim.x*gridDim.x;
base = index*TILE_SIZE; base = index*TILE_SIZE;
} }
// Record the range of sizes seen by threads in this block.
__shared__ real minBuffer[64], maxBuffer[64];
minBuffer[threadIdx.x] = minSize;
maxBuffer[threadIdx.x] = maxSize;
__syncthreads();
for (int step = 1; step < 64; step *= 2) {
if (threadIdx.x+step < 64 && threadIdx.x%(2*step) == 0) {
minBuffer[threadIdx.x] = min(minBuffer[threadIdx.x], minBuffer[threadIdx.x+step]);
maxBuffer[threadIdx.x] = max(maxBuffer[threadIdx.x], maxBuffer[threadIdx.x+step]);
}
__syncthreads();
}
if (threadIdx.x == 0)
blockSizeRange[blockIdx.x] = make_real2(minBuffer[0], maxBuffer[0]);
if (blockIdx.x == 0 && threadIdx.x == 0) if (blockIdx.x == 0 && threadIdx.x == 0)
rebuildNeighborList[0] = 0; rebuildNeighborList[0] = 0;
} }
extern "C" __global__ void computeSortKeys(const real4* __restrict__ blockBoundingBox, unsigned int* __restrict__ sortedBlocks, real2* __restrict__ blockSizeRange, int numSizes) {
// Find the total range of sizes recorded by all blocks.
__shared__ real2 sizeRange;
if (threadIdx.x == 0) {
sizeRange = blockSizeRange[0];
for (int i = 1; i < numSizes; i++) {
real2 size = blockSizeRange[i];
sizeRange.x = min(sizeRange.x, size.x);
sizeRange.y = max(sizeRange.y, size.y);
}
sizeRange.x = LOG(sizeRange.x);
sizeRange.y = LOG(sizeRange.y);
}
__syncthreads();
// Sort keys store the bin in the high order part and the block in the low
// order part.
int numSizeBins = 20;
real scale = numSizeBins/(sizeRange.y-sizeRange.x);
for (unsigned int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_BLOCKS; i += blockDim.x*gridDim.x) {
real4 box = blockBoundingBox[i];
real size = LOG(box.x+box.y+box.z);
int bin = (size-sizeRange.x)*scale;
bin = max(0, min(bin, numSizeBins-1));
sortedBlocks[i] = (((unsigned int) bin)<<BIN_SHIFT) + i;
}
}
/** /**
* Sort the data about bounding boxes so it can be accessed more efficiently in the next kernel. * Sort the data about bounding boxes so it can be accessed more efficiently in the next kernel.
*/ */
extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, const real4* __restrict__ blockCenter, extern "C" __global__ void sortBoxData(const unsigned int* __restrict__ sortedBlocks, const real4* __restrict__ blockCenter,
const real4* __restrict__ blockBoundingBox, real4* __restrict__ sortedBlockCenter, half3* __restrict__ sortedBlockBoundingBox, const real4* __restrict__ blockBoundingBox, real4* __restrict__ sortedBlockCenter, half3* __restrict__ sortedBlockBoundingBox,
#ifdef USE_LARGE_BLOCKS #ifdef USE_LARGE_BLOCKS
real4* __restrict__ largeBlockCenter, half3* __restrict__ largeBlockBoundingBox, real4 periodicBoxSize, real4* __restrict__ largeBlockCenter, half3* __restrict__ largeBlockBoundingBox, real4 periodicBoxSize,
...@@ -94,7 +143,7 @@ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, co ...@@ -94,7 +143,7 @@ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, co
const real4* __restrict__ posq, const real4* __restrict__ oldPositions, const real4* __restrict__ posq, const real4* __restrict__ oldPositions,
unsigned int* __restrict__ interactionCount, int* __restrict__ rebuildNeighborList, bool forceRebuild) { unsigned int* __restrict__ interactionCount, int* __restrict__ rebuildNeighborList, bool forceRebuild) {
for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_BLOCKS; i += blockDim.x*gridDim.x) { for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_BLOCKS; i += blockDim.x*gridDim.x) {
int index = (int) sortedBlock[i].y; unsigned int index = sortedBlocks[i] & BLOCK_INDEX_MASK;
sortedBlockCenter[i] = blockCenter[index]; sortedBlockCenter[i] = blockCenter[index];
sortedBlockBoundingBox[i] = half3(trimTo3(blockBoundingBox[index])); sortedBlockBoundingBox[i] = half3(trimTo3(blockBoundingBox[index]));
} }
...@@ -102,12 +151,14 @@ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, co ...@@ -102,12 +151,14 @@ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, co
// Compute the sizes of large blocks (composed of 32 regular blocks) starting from each block. // Compute the sizes of large blocks (composed of 32 regular blocks) starting from each block.
for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_BLOCKS; i += blockDim.x*gridDim.x) { for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_BLOCKS; i += blockDim.x*gridDim.x) {
real4 minPos = blockCenter[i]-blockBoundingBox[i]; unsigned int index = sortedBlocks[i] & BLOCK_INDEX_MASK;
real4 maxPos = blockCenter[i]+blockBoundingBox[i]; real4 minPos = blockCenter[index]-blockBoundingBox[index];
real4 maxPos = blockCenter[index]+blockBoundingBox[index];
int last = min(i+32, NUM_BLOCKS); int last = min(i+32, NUM_BLOCKS);
for (int j = i+1; j < last; j++) { for (int j = i+1; j < last; j++) {
real4 blockPos = blockCenter[j]; index = sortedBlocks[j] & BLOCK_INDEX_MASK;
real4 width = blockBoundingBox[j]; real4 blockPos = blockCenter[index];
real4 width = blockBoundingBox[index];
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
real4 center = 0.5f*(maxPos+minPos); real4 center = 0.5f*(maxPos+minPos);
APPLY_PERIODIC_TO_POS_WITH_CENTER(blockPos, center) APPLY_PERIODIC_TO_POS_WITH_CENTER(blockPos, center)
...@@ -238,7 +289,7 @@ __device__ int saveSinglePairs(int x, int* atoms, int* flags, int length, unsign ...@@ -238,7 +289,7 @@ __device__ int saveSinglePairs(int x, int* atoms, int* flags, int length, unsign
extern "C" __global__ __launch_bounds__(GROUP_SIZE,3) void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, extern "C" __global__ __launch_bounds__(GROUP_SIZE,3) void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
unsigned int* __restrict__ interactionCount, int* __restrict__ interactingTiles, unsigned int* __restrict__ interactingAtoms, unsigned int* __restrict__ interactionCount, int* __restrict__ interactingTiles, unsigned int* __restrict__ interactingAtoms,
int2* __restrict__ singlePairs, const real4* __restrict__ posq, unsigned int maxTiles, unsigned int maxSinglePairs, unsigned int startBlockIndex, int2* __restrict__ singlePairs, const real4* __restrict__ posq, unsigned int maxTiles, unsigned int maxSinglePairs, unsigned int startBlockIndex,
unsigned int numBlocks, real2* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter, const half3* __restrict__ sortedBlockBoundingBox, unsigned int numBlocks, unsigned int* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter, const half3* __restrict__ sortedBlockBoundingBox,
#ifdef USE_LARGE_BLOCKS #ifdef USE_LARGE_BLOCKS
real4* __restrict__ largeBlockCenter, half3* __restrict__ largeBlockBoundingBox, real4* __restrict__ largeBlockCenter, half3* __restrict__ largeBlockBoundingBox,
#endif #endif
...@@ -271,8 +322,7 @@ extern "C" __global__ __launch_bounds__(GROUP_SIZE,3) void findBlocksWithInterac ...@@ -271,8 +322,7 @@ extern "C" __global__ __launch_bounds__(GROUP_SIZE,3) void findBlocksWithInterac
for (int block1 = startBlockIndex+warpIndex; block1 < startBlockIndex+numBlocks; block1 += totalWarps) { for (int block1 = startBlockIndex+warpIndex; block1 < startBlockIndex+numBlocks; block1 += totalWarps) {
// Load data for this block. Note that all threads in a warp are processing the same block. // Load data for this block. Note that all threads in a warp are processing the same block.
real2 sortedKey = sortedBlocks[block1]; int x = sortedBlocks[block1] & BLOCK_INDEX_MASK;
int x = (int) sortedKey.y;
real4 blockCenterX = sortedBlockCenter[block1]; real4 blockCenterX = sortedBlockCenter[block1];
real3 blockSizeX = sortedBlockBoundingBox[block1].toReal3(); real3 blockSizeX = sortedBlockBoundingBox[block1].toReal3();
int neighborsInBuffer = 0; int neighborsInBuffer = 0;
...@@ -363,7 +413,7 @@ extern "C" __global__ __launch_bounds__(GROUP_SIZE,3) void findBlocksWithInterac ...@@ -363,7 +413,7 @@ extern "C" __global__ __launch_bounds__(GROUP_SIZE,3) void findBlocksWithInterac
includeBlock2 = forceInclude = true; includeBlock2 = forceInclude = true;
#endif #endif
if (includeBlock2) { if (includeBlock2) {
int y = (int) sortedBlocks[block2].y; int y = sortedBlocks[block2] & BLOCK_INDEX_MASK;
#pragma unroll 4 // (MAX_EXCLUSIONS) #pragma unroll 4 // (MAX_EXCLUSIONS)
for (int k = 0; k < numExclusions; k++) for (int k = 0; k < numExclusions; k++)
includeBlock2 &= (exclusionsForX[k] != y); includeBlock2 &= (exclusionsForX[k] != y);
...@@ -378,7 +428,7 @@ extern "C" __global__ __launch_bounds__(GROUP_SIZE,3) void findBlocksWithInterac ...@@ -378,7 +428,7 @@ extern "C" __global__ __launch_bounds__(GROUP_SIZE,3) void findBlocksWithInterac
int i = __ffs(includeBlockFlags)-1; int i = __ffs(includeBlockFlags)-1;
includeBlockFlags &= includeBlockFlags-1; includeBlockFlags &= includeBlockFlags-1;
forceInclude = (forceIncludeFlags>>i) & 1; forceInclude = (forceIncludeFlags>>i) & 1;
int y = (int) sortedBlocks[block2Base+i].y; int y = sortedBlocks[block2Base+i] & BLOCK_INDEX_MASK;
// Check each atom in block Y for interactions. // Check each atom in block Y for interactions.
......
...@@ -319,6 +319,7 @@ private: ...@@ -319,6 +319,7 @@ private:
OpenCLArray sortedBlocks; OpenCLArray sortedBlocks;
OpenCLArray sortedBlockCenter; OpenCLArray sortedBlockCenter;
OpenCLArray sortedBlockBoundingBox; OpenCLArray sortedBlockBoundingBox;
OpenCLArray blockSizeRange;
OpenCLArray largeBlockCenter; OpenCLArray largeBlockCenter;
OpenCLArray largeBlockBoundingBox; OpenCLArray largeBlockBoundingBox;
OpenCLArray oldPositions; OpenCLArray oldPositions;
...@@ -336,7 +337,7 @@ private: ...@@ -336,7 +337,7 @@ private:
double lastCutoff; double lastCutoff;
bool useCutoff, usePeriodic, deviceIsCpu, anyExclusions, usePadding, useNeighborList, forceRebuildNeighborList, useLargeBlocks, isAMD; bool useCutoff, usePeriodic, deviceIsCpu, anyExclusions, usePadding, useNeighborList, forceRebuildNeighborList, useLargeBlocks, isAMD;
int startTileIndex, startBlockIndex, numBlocks, maxExclusions, numForceThreadBlocks; int startTileIndex, startBlockIndex, numBlocks, maxExclusions, numForceThreadBlocks;
int forceThreadBlockSize, interactingBlocksThreadBlockSize, groupFlags; int forceThreadBlockSize, interactingBlocksThreadBlockSize, groupFlags, numBlockSizes;
unsigned int tilesAfterReorder; unsigned int tilesAfterReorder;
long long numTiles; long long numTiles;
std::string kernelSource; std::string kernelSource;
...@@ -353,6 +354,7 @@ public: ...@@ -353,6 +354,7 @@ public:
std::string source; std::string source;
cl::Kernel forceKernel, energyKernel, forceEnergyKernel; cl::Kernel forceKernel, energyKernel, forceEnergyKernel;
cl::Kernel findBlockBoundsKernel; cl::Kernel findBlockBoundsKernel;
cl::Kernel computeSortKeysKernel;
cl::Kernel sortBoxDataKernel; cl::Kernel sortBoxDataKernel;
cl::Kernel findInteractingBlocksKernel; cl::Kernel findInteractingBlocksKernel;
cl::Kernel findInteractionsWithinBlocksKernel; cl::Kernel findInteractionsWithinBlocksKernel;
......
...@@ -41,18 +41,15 @@ using namespace std; ...@@ -41,18 +41,15 @@ using namespace std;
class OpenCLNonbondedUtilities::BlockSortTrait : public OpenCLSort::SortTrait { class OpenCLNonbondedUtilities::BlockSortTrait : public OpenCLSort::SortTrait {
public: public:
BlockSortTrait(bool useDouble) : useDouble(useDouble) { BlockSortTrait() {}
} int getDataSize() const {return sizeof(int);}
int getDataSize() const {return useDouble ? sizeof(mm_double2) : sizeof(mm_float2);} int getKeySize() const {return sizeof(int);}
int getKeySize() const {return useDouble ? sizeof(cl_double) : sizeof(cl_float);} const char* getDataType() const {return "unsigned int";}
const char* getDataType() const {return "real2";} const char* getKeyType() const {return "unsigned int";}
const char* getKeyType() const {return "real";} const char* getMinKey() const {return "0";}
const char* getMinKey() const {return "-MAXFLOAT";} const char* getMaxKey() const {return "0xFFFFFFFFu";}
const char* getMaxKey() const {return "MAXFLOAT";} const char* getMaxValue() const {return "0xFFFFFFFFu";}
const char* getMaxValue() const {return "(real2) (MAXFLOAT, MAXFLOAT)";} const char* getSortKey() const {return "value";}
const char* getSortKey() const {return "value.x";}
private:
bool useDouble;
}; };
OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : context(context), useCutoff(false), usePeriodic(false), useNeighborList(false), anyExclusions(false), usePadding(true), OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : context(context), useCutoff(false), usePeriodic(false), useNeighborList(false), anyExclusions(false), usePadding(true),
...@@ -301,14 +298,16 @@ void OpenCLNonbondedUtilities::initialize(const System& system) { ...@@ -301,14 +298,16 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
int elementSize = (context.getUseDoublePrecision() ? sizeof(cl_double) : sizeof(cl_float)); int elementSize = (context.getUseDoublePrecision() ? sizeof(cl_double) : sizeof(cl_float));
blockCenter.initialize(context, numAtomBlocks, 4*elementSize, "blockCenter"); blockCenter.initialize(context, numAtomBlocks, 4*elementSize, "blockCenter");
blockBoundingBox.initialize(context, numAtomBlocks, 4*elementSize, "blockBoundingBox"); blockBoundingBox.initialize(context, numAtomBlocks, 4*elementSize, "blockBoundingBox");
sortedBlocks.initialize(context, numAtomBlocks, 2*elementSize, "sortedBlocks"); sortedBlocks.initialize<cl_uint>(context, numAtomBlocks, "sortedBlocks");
sortedBlockCenter.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockCenter"); sortedBlockCenter.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockCenter");
sortedBlockBoundingBox.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockBoundingBox"); sortedBlockBoundingBox.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockBoundingBox");
numBlockSizes = min((context.getNumAtomBlocks()+63)/64, context.getNumThreadBlocks());
blockSizeRange.initialize(context, numBlockSizes, 2*elementSize, "blockSizeRange");
largeBlockCenter.initialize(context, numAtomBlocks, 4*elementSize, "largeBlockCenter"); largeBlockCenter.initialize(context, numAtomBlocks, 4*elementSize, "largeBlockCenter");
largeBlockBoundingBox.initialize(context, numAtomBlocks, 4*elementSize, "largeBlockBoundingBox"); largeBlockBoundingBox.initialize(context, numAtomBlocks, 4*elementSize, "largeBlockBoundingBox");
oldPositions.initialize(context, numAtoms, 4*elementSize, "oldPositions"); oldPositions.initialize(context, numAtoms, 4*elementSize, "oldPositions");
rebuildNeighborList.initialize<int>(context, 1, "rebuildNeighborList"); rebuildNeighborList.initialize<int>(context, 1, "rebuildNeighborList");
blockSorter = new OpenCLSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks, false); blockSorter = new OpenCLSort(context, new BlockSortTrait(), numAtomBlocks, false);
vector<cl_uint> count(1, 0); vector<cl_uint> count(1, 0);
interactionCount.upload(count); interactionCount.upload(count);
rebuildNeighborList.upload(count); rebuildNeighborList.upload(count);
...@@ -366,11 +365,11 @@ void OpenCLNonbondedUtilities::prepareInteractions(int forceGroups) { ...@@ -366,11 +365,11 @@ void OpenCLNonbondedUtilities::prepareInteractions(int forceGroups) {
if (lastCutoff != kernels.cutoffDistance) if (lastCutoff != kernels.cutoffDistance)
forceRebuildNeighborList = true; forceRebuildNeighborList = true;
setPeriodicBoxArgs(context, kernels.findBlockBoundsKernel, 1); setPeriodicBoxArgs(context, kernels.findBlockBoundsKernel, 1);
context.executeKernel(kernels.findBlockBoundsKernel, context.getNumAtoms()); context.executeKernel(kernels.findBlockBoundsKernel, context.getNumAtomBlocks());
context.executeKernel(kernels.computeSortKeysKernel, context.getNumAtomBlocks());
if (useLargeBlocks) if (useLargeBlocks)
setPeriodicBoxArgs(context, kernels.sortBoxDataKernel, 12); setPeriodicBoxArgs(context, kernels.sortBoxDataKernel, 12);
else blockSorter->sort(sortedBlocks);
blockSorter->sort(sortedBlocks);
kernels.sortBoxDataKernel.setArg<cl_int>(9, forceRebuildNeighborList); kernels.sortBoxDataKernel.setArg<cl_int>(9, forceRebuildNeighborList);
context.executeKernel(kernels.sortBoxDataKernel, context.getNumAtoms()); context.executeKernel(kernels.sortBoxDataKernel, context.getNumAtoms());
setPeriodicBoxArgs(context, kernels.findInteractingBlocksKernel, 0); setPeriodicBoxArgs(context, kernels.findInteractingBlocksKernel, 0);
...@@ -526,6 +525,11 @@ void OpenCLNonbondedUtilities::createKernelsForGroups(int groups) { ...@@ -526,6 +525,11 @@ void OpenCLNonbondedUtilities::createKernelsForGroups(int groups) {
defines["USE_LARGE_BLOCKS"] = "1"; defines["USE_LARGE_BLOCKS"] = "1";
defines["MAX_EXCLUSIONS"] = context.intToString(maxExclusions); defines["MAX_EXCLUSIONS"] = context.intToString(maxExclusions);
defines["BUFFER_GROUPS"] = (deviceIsCpu ? "4" : "2"); defines["BUFFER_GROUPS"] = (deviceIsCpu ? "4" : "2");
int binShift = 1;
while (1<<binShift <= context.getNumAtomBlocks())
binShift++;
defines["BIN_SHIFT"] = context.intToString(binShift);
defines["BLOCK_INDEX_MASK"] = context.intToString((1<<binShift)-1);
string file = (deviceIsCpu ? OpenCLKernelSources::findInteractingBlocks_cpu : OpenCLKernelSources::findInteractingBlocks); string file = (deviceIsCpu ? OpenCLKernelSources::findInteractingBlocks_cpu : OpenCLKernelSources::findInteractingBlocks);
int groupSize = (deviceIsCpu || context.getSIMDWidth() < 32 ? 32 : 256); int groupSize = (deviceIsCpu || context.getSIMDWidth() < 32 ? 32 : 256);
while (true) { while (true) {
...@@ -537,7 +541,12 @@ void OpenCLNonbondedUtilities::createKernelsForGroups(int groups) { ...@@ -537,7 +541,12 @@ void OpenCLNonbondedUtilities::createKernelsForGroups(int groups) {
kernels.findBlockBoundsKernel.setArg<cl::Buffer>(7, blockCenter.getDeviceBuffer()); kernels.findBlockBoundsKernel.setArg<cl::Buffer>(7, blockCenter.getDeviceBuffer());
kernels.findBlockBoundsKernel.setArg<cl::Buffer>(8, blockBoundingBox.getDeviceBuffer()); kernels.findBlockBoundsKernel.setArg<cl::Buffer>(8, blockBoundingBox.getDeviceBuffer());
kernels.findBlockBoundsKernel.setArg<cl::Buffer>(9, rebuildNeighborList.getDeviceBuffer()); kernels.findBlockBoundsKernel.setArg<cl::Buffer>(9, rebuildNeighborList.getDeviceBuffer());
kernels.findBlockBoundsKernel.setArg<cl::Buffer>(10, sortedBlocks.getDeviceBuffer()); kernels.findBlockBoundsKernel.setArg<cl::Buffer>(10, blockSizeRange.getDeviceBuffer());
kernels.computeSortKeysKernel = cl::Kernel(interactingBlocksProgram, "computeSortKeys");
kernels.computeSortKeysKernel.setArg<cl::Buffer>(0, blockBoundingBox.getDeviceBuffer());
kernels.computeSortKeysKernel.setArg<cl::Buffer>(1, sortedBlocks.getDeviceBuffer());
kernels.computeSortKeysKernel.setArg<cl::Buffer>(2, blockSizeRange.getDeviceBuffer());
kernels.computeSortKeysKernel.setArg<cl_int>(3, numBlockSizes);
kernels.sortBoxDataKernel = cl::Kernel(interactingBlocksProgram, "sortBoxData"); kernels.sortBoxDataKernel = cl::Kernel(interactingBlocksProgram, "sortBoxData");
kernels.sortBoxDataKernel.setArg<cl::Buffer>(0, sortedBlocks.getDeviceBuffer()); kernels.sortBoxDataKernel.setArg<cl::Buffer>(0, sortedBlocks.getDeviceBuffer());
kernels.sortBoxDataKernel.setArg<cl::Buffer>(1, blockCenter.getDeviceBuffer()); kernels.sortBoxDataKernel.setArg<cl::Buffer>(1, blockCenter.getDeviceBuffer());
......
...@@ -6,9 +6,10 @@ ...@@ -6,9 +6,10 @@
*/ */
__kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, __kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
__global const real4* restrict posq, __global real4* restrict blockCenter, __global real4* restrict blockBoundingBox, __global int* restrict rebuildNeighborList, __global const real4* restrict posq, __global real4* restrict blockCenter, __global real4* restrict blockBoundingBox, __global int* restrict rebuildNeighborList,
__global real2* restrict sortedBlocks) { __global real2* restrict blockSizeRange) {
int index = get_global_id(0); int index = get_global_id(0);
int base = index*TILE_SIZE; int base = index*TILE_SIZE;
real minSize = 1e38, maxSize = 0;
while (base < numAtoms) { while (base < numAtoms) {
real4 pos = posq[base]; real4 pos = posq[base];
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
...@@ -40,18 +41,65 @@ __kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeri ...@@ -40,18 +41,65 @@ __kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeri
center.w = sqrt(center.w); center.w = sqrt(center.w);
blockBoundingBox[index] = blockSize; blockBoundingBox[index] = blockSize;
blockCenter[index] = center; blockCenter[index] = center;
sortedBlocks[index] = (real2) (blockSize.x+blockSize.y+blockSize.z, index); real totalSize = blockSize.x+blockSize.y+blockSize.z;
minSize = min(minSize, totalSize);
maxSize = max(maxSize, totalSize);
index += get_global_size(0); index += get_global_size(0);
base = index*TILE_SIZE; base = index*TILE_SIZE;
} }
// Record the range of sizes seen by threads in this block.
__local real minBuffer[64], maxBuffer[64];
minBuffer[get_local_id(0)] = minSize;
maxBuffer[get_local_id(0)] = maxSize;
barrier(CLK_LOCAL_MEM_FENCE);
for (int step = 1; step < 64; step *= 2) {
if (get_local_id(0)+step < 64 && get_local_id(0)%(2*step) == 0) {
minBuffer[get_local_id(0)] = min(minBuffer[get_local_id(0)], minBuffer[get_local_id(0)+step]);
maxBuffer[get_local_id(0)] = max(maxBuffer[get_local_id(0)], maxBuffer[get_local_id(0)+step]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (get_local_id(0) == 0)
blockSizeRange[get_group_id(0)] = make_real2(minBuffer[0], maxBuffer[0]);
if (get_global_id(0) == 0) if (get_global_id(0) == 0)
rebuildNeighborList[0] = 0; rebuildNeighborList[0] = 0;
} }
__kernel void computeSortKeys(__global const real4* restrict blockBoundingBox, __global unsigned int* restrict sortedBlocks, __global real2* restrict blockSizeRange, int numSizes) {
// Find the total range of sizes recorded by all blocks.
__local real2 sizeRange;
if (get_local_id(0) == 0) {
sizeRange = blockSizeRange[0];
for (int i = 1; i < numSizes; i++) {
real2 size = blockSizeRange[i];
sizeRange.x = min(sizeRange.x, size.x);
sizeRange.y = max(sizeRange.y, size.y);
}
sizeRange.x = LOG(sizeRange.x);
sizeRange.y = LOG(sizeRange.y);
}
barrier(CLK_LOCAL_MEM_FENCE);
// Sort keys store the bin in the high order part and the block in the low
// order part.
int numSizeBins = 20;
real scale = numSizeBins/(sizeRange.y-sizeRange.x);
for (unsigned int i = get_global_id(0); i < NUM_BLOCKS; i += get_global_size(0)) {
real4 box = blockBoundingBox[i];
real size = LOG(box.x+box.y+box.z);
int bin = (size-sizeRange.x)*scale;
bin = max(0, min(bin, numSizeBins-1));
sortedBlocks[i] = (((unsigned int) bin)<<BIN_SHIFT) + i;
}
}
/** /**
* Sort the data about bounding boxes so it can be accessed more efficiently in the next kernel. * Sort the data about bounding boxes so it can be accessed more efficiently in the next kernel.
*/ */
__kernel void sortBoxData(__global const real2* restrict sortedBlock, __global const real4* restrict blockCenter, __kernel void sortBoxData(__global const unsigned int* restrict sortedBlocks, __global const real4* restrict blockCenter,
__global const real4* restrict blockBoundingBox, __global real4* restrict sortedBlockCenter, __global const real4* restrict blockBoundingBox, __global real4* restrict sortedBlockCenter,
__global real4* restrict sortedBlockBoundingBox, __global const real4* restrict posq, __global const real4* restrict oldPositions, __global real4* restrict sortedBlockBoundingBox, __global const real4* restrict posq, __global const real4* restrict oldPositions,
__global unsigned int* restrict interactionCount, __global int* restrict rebuildNeighborList, int forceRebuild __global unsigned int* restrict interactionCount, __global int* restrict rebuildNeighborList, int forceRebuild
...@@ -61,7 +109,7 @@ __kernel void sortBoxData(__global const real2* restrict sortedBlock, __global c ...@@ -61,7 +109,7 @@ __kernel void sortBoxData(__global const real2* restrict sortedBlock, __global c
#endif #endif
) { ) {
for (int i = get_global_id(0); i < NUM_BLOCKS; i += get_global_size(0)) { for (int i = get_global_id(0); i < NUM_BLOCKS; i += get_global_size(0)) {
int index = (int) sortedBlock[i].y; unsigned int index = sortedBlocks[i] & BLOCK_INDEX_MASK;
sortedBlockCenter[i] = blockCenter[index]; sortedBlockCenter[i] = blockCenter[index];
sortedBlockBoundingBox[i] = blockBoundingBox[index]; sortedBlockBoundingBox[i] = blockBoundingBox[index];
} }
...@@ -69,12 +117,14 @@ __kernel void sortBoxData(__global const real2* restrict sortedBlock, __global c ...@@ -69,12 +117,14 @@ __kernel void sortBoxData(__global const real2* restrict sortedBlock, __global c
// Compute the sizes of large blocks (composed of 32 regular blocks) starting from each block. // Compute the sizes of large blocks (composed of 32 regular blocks) starting from each block.
for (int i = get_global_id(0); i < NUM_BLOCKS; i += get_global_size(0)) { for (int i = get_global_id(0); i < NUM_BLOCKS; i += get_global_size(0)) {
real4 minPos = blockCenter[i]-blockBoundingBox[i]; unsigned int index = sortedBlocks[i] & BLOCK_INDEX_MASK;
real4 maxPos = blockCenter[i]+blockBoundingBox[i]; real4 minPos = blockCenter[index]-blockBoundingBox[index];
real4 maxPos = blockCenter[index]+blockBoundingBox[index];
int last = min(i+32, NUM_BLOCKS); int last = min(i+32, NUM_BLOCKS);
for (int j = i+1; j < last; j++) { for (int j = i+1; j < last; j++) {
real4 blockPos = blockCenter[j]; index = sortedBlocks[j] & BLOCK_INDEX_MASK;
real4 width = blockBoundingBox[j]; real4 blockPos = blockCenter[index];
real4 width = blockBoundingBox[index];
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
real4 center = 0.5f*(maxPos+minPos); real4 center = 0.5f*(maxPos+minPos);
APPLY_PERIODIC_TO_POS_WITH_CENTER(blockPos, center) APPLY_PERIODIC_TO_POS_WITH_CENTER(blockPos, center)
...@@ -106,7 +156,7 @@ __kernel void sortBoxData(__global const real2* restrict sortedBlock, __global c ...@@ -106,7 +156,7 @@ __kernel void sortBoxData(__global const real2* restrict sortedBlock, __global c
__kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
__global unsigned int* restrict interactionCount, __global int* restrict interactingTiles, __global unsigned int* restrict interactingAtoms, __global unsigned int* restrict interactionCount, __global int* restrict interactingTiles, __global unsigned int* restrict interactingAtoms,
__global const real4* restrict posq, unsigned int maxTiles, unsigned int startBlockIndex, unsigned int numBlocks, __global real2* restrict sortedBlocks, __global const real4* restrict posq, unsigned int maxTiles, unsigned int startBlockIndex, unsigned int numBlocks, __global unsigned int* restrict sortedBlocks,
__global const real4* restrict sortedBlockCenter, __global const real4* restrict sortedBlockBoundingBox, __global const real4* restrict sortedBlockCenter, __global const real4* restrict sortedBlockBoundingBox,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __global real4* restrict oldPositions, __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __global real4* restrict oldPositions,
__global const int* restrict rebuildNeighborList __global const int* restrict rebuildNeighborList
...@@ -141,8 +191,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi ...@@ -141,8 +191,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
for (int block1 = startBlockIndex+warpIndex; block1 < startBlockIndex+numBlocks; block1 += totalWarps) { for (int block1 = startBlockIndex+warpIndex; block1 < startBlockIndex+numBlocks; block1 += totalWarps) {
// Load data for this block. Note that all threads in a warp are processing the same block. // Load data for this block. Note that all threads in a warp are processing the same block.
real2 sortedKey = sortedBlocks[block1]; int x = sortedBlocks[block1] & BLOCK_INDEX_MASK;
int x = (int) sortedKey.y;
real4 blockCenterX = sortedBlockCenter[block1]; real4 blockCenterX = sortedBlockCenter[block1];
real4 blockSizeX = sortedBlockBoundingBox[block1]; real4 blockSizeX = sortedBlockBoundingBox[block1];
int neighborsInBuffer = 0; int neighborsInBuffer = 0;
...@@ -229,7 +278,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi ...@@ -229,7 +278,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
includeBlock2 = true; includeBlock2 = true;
#endif #endif
if (includeBlock2) { if (includeBlock2) {
int y = (int) sortedBlocks[block2].y; int y = sortedBlocks[block2] & BLOCK_INDEX_MASK;
for (int k = 0; k < numExclusions; k++) for (int k = 0; k < numExclusions; k++)
includeBlock2 &= (exclusionsForX[k] != y); includeBlock2 &= (exclusionsForX[k] != y);
} }
...@@ -243,7 +292,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi ...@@ -243,7 +292,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
while (i < TILE_SIZE && !includeBlockFlags[warpStart+i]) while (i < TILE_SIZE && !includeBlockFlags[warpStart+i])
i++; i++;
if (i < TILE_SIZE) { if (i < TILE_SIZE) {
int y = (int) sortedBlocks[block2Base+i].y; int y = sortedBlocks[block2Base+i] & BLOCK_INDEX_MASK;
// Check each atom in block Y for interactions. // Check each atom in block Y for interactions.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment