"platforms/cuda/vscode:/vscode.git/clone" did not exist on "30fa6ecb8106793ce9464f2c0ab0264c1ea6805e"
Commit 80be998e authored by peastman's avatar peastman
Browse files

Merge pull request #609 from peastman/nb

Rewrote the neighbor list generation kernel to be simpler and (slightly) faster
parents f9e24fea 68984320
#define GROUP_SIZE 256
#define BUFFER_GROUPS 2
#define BUFFER_SIZE BUFFER_GROUPS*GROUP_SIZE
#define WARP_SIZE 32
#define INVALID 0xFFFF
#define BUFFER_SIZE 256
/**
* Find a bounding box for the atoms in each block.
......@@ -70,220 +67,27 @@ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, co
}
}
/**
* Perform a parallel prefix sum over an array. The input values are all assumed to be 0 or 1.
*/
__device__ void prefixSum(short* sum, ushort2* temp) {
#if __CUDA_ARCH__ >= 300
const int indexInWarp = threadIdx.x%WARP_SIZE;
const int warpMask = (2<<indexInWarp)-1;
for (int base = 0; base < BUFFER_SIZE; base += blockDim.x)
temp[base+threadIdx.x].x = __popc(__ballot(sum[base+threadIdx.x])&warpMask);
__syncthreads();
if (threadIdx.x < BUFFER_SIZE/WARP_SIZE) {
int multiWarpSum = temp[(threadIdx.x+1)*WARP_SIZE-1].x;
for (int offset = 1; offset < BUFFER_SIZE/WARP_SIZE; offset *= 2) {
short n = __shfl_up(multiWarpSum, offset, WARP_SIZE);
if (indexInWarp >= offset)
multiWarpSum += n;
}
temp[threadIdx.x].y = multiWarpSum;
}
__syncthreads();
for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
sum[i] = temp[i].x+(i < WARP_SIZE ? 0 : temp[i/WARP_SIZE-1].y);
__syncthreads();
#else
for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
temp[i].x = sum[i];
__syncthreads();
int whichBuffer = 0;
for (int offset = 1; offset < BUFFER_SIZE; offset *= 2) {
if (whichBuffer == 0)
for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
temp[i].y = (i < offset ? temp[i].x : temp[i].x+temp[i-offset].x);
else
for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
temp[i].x = (i < offset ? temp[i].y : temp[i].y+temp[i-offset].y);
whichBuffer = 1-whichBuffer;
__syncthreads();
}
if (whichBuffer == 0)
for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
sum[i] = temp[i].x;
else
for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
sum[i] = temp[i].y;
__syncthreads();
#endif
}
/**
* This is called by findBlocksWithInteractions(). It compacts the list of blocks, identifies interactions
* in them, and writes the result to global memory.
*/
__device__ void storeInteractionData(int x, unsigned short* buffer, short* sum, ushort2* temp, int* atoms, int& numAtoms,
int& baseIndex, unsigned int* interactionCount, int* interactingTiles, unsigned int* interactingAtoms, real4 periodicBoxSize,
real4 invPeriodicBoxSize, const real4* posq, real3* posBuffer, real4 blockCenterX, real4 blockSizeX, unsigned int maxTiles, bool finish) {
const bool singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= PADDED_CUTOFF &&
0.5f*periodicBoxSize.y-blockSizeX.y >= PADDED_CUTOFF &&
0.5f*periodicBoxSize.z-blockSizeX.z >= PADDED_CUTOFF);
if (threadIdx.x < TILE_SIZE) {
real3 pos = trimTo3(posq[x*TILE_SIZE+threadIdx.x]);
posBuffer[threadIdx.x] = pos;
#ifdef USE_PERIODIC
if (singlePeriodicCopy) {
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
pos.x -= floor((pos.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
pos.y -= floor((pos.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
pos.z -= floor((pos.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
posBuffer[threadIdx.x] = pos;
}
#endif
}
// The buffer is full, so we need to compact it and write out results. Start by doing a parallel prefix sum.
for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
sum[i] = (buffer[i] == INVALID ? 0 : 1);
__syncthreads();
prefixSum(sum, temp);
int numValid = sum[BUFFER_SIZE-1];
// Compact the buffer.
for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
if (buffer[i] != INVALID)
temp[sum[i]-1].x = buffer[i];
__syncthreads();
for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
buffer[i] = temp[i].x;
__syncthreads();
// Loop over the tiles and find specific interactions in them.
const int indexInWarp = threadIdx.x%WARP_SIZE;
for (int base = 0; base < numValid; base += BUFFER_SIZE/WARP_SIZE) {
for (int i = threadIdx.x/WARP_SIZE; i < BUFFER_SIZE/WARP_SIZE && base+i < numValid; i += GROUP_SIZE/WARP_SIZE) {
// Check each atom in block Y for interactions.
real3 pos = trimTo3(posq[buffer[base+i]*TILE_SIZE+indexInWarp]);
#ifdef USE_PERIODIC
if (singlePeriodicCopy) {
pos.x -= floor((pos.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
pos.y -= floor((pos.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
pos.z -= floor((pos.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
}
#endif
bool interacts = false;
#ifdef USE_PERIODIC
if (!singlePeriodicCopy) {
for (int j = 0; j < TILE_SIZE; j++) {
real3 delta = pos-posBuffer[j];
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
}
}
else {
#endif
for (int j = 0; j < TILE_SIZE; j++) {
real3 delta = pos-posBuffer[j];
interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
}
#ifdef USE_PERIODIC
}
#endif
sum[i*WARP_SIZE+indexInWarp] = (interacts ? 1 : 0);
}
for (int i = numValid-base+threadIdx.x/WARP_SIZE; i < BUFFER_SIZE/WARP_SIZE; i += GROUP_SIZE/WARP_SIZE)
sum[i*WARP_SIZE+indexInWarp] = 0;
// Compact the list of atoms.
__syncthreads();
prefixSum(sum, temp);
for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
if (sum[i] != (i == 0 ? 0 : sum[i-1]))
atoms[numAtoms+sum[i]-1] = buffer[base+i/WARP_SIZE]*TILE_SIZE+indexInWarp;
// Store them to global memory.
int atomsToStore = numAtoms+sum[BUFFER_SIZE-1];
bool storePartialTile = (finish && base >= numValid-BUFFER_SIZE/WARP_SIZE);
int tilesToStore = (storePartialTile ? (atomsToStore+TILE_SIZE-1)/TILE_SIZE : atomsToStore/TILE_SIZE);
if (tilesToStore > 0) {
if (threadIdx.x == 0)
baseIndex = atomicAdd(interactionCount, tilesToStore);
__syncthreads();
if (threadIdx.x == 0)
numAtoms = atomsToStore-tilesToStore*TILE_SIZE;
if (baseIndex+tilesToStore <= maxTiles) {
if (threadIdx.x < tilesToStore)
interactingTiles[baseIndex+threadIdx.x] = x;
for (int i = threadIdx.x; i < tilesToStore*TILE_SIZE; i += blockDim.x)
interactingAtoms[baseIndex*TILE_SIZE+i] = (i < atomsToStore ? atoms[i] : NUM_ATOMS);
}
}
else {
__syncthreads();
if (threadIdx.x == 0)
numAtoms += sum[BUFFER_SIZE-1];
}
__syncthreads();
if (threadIdx.x < numAtoms && !storePartialTile)
atoms[threadIdx.x] = atoms[tilesToStore*TILE_SIZE+threadIdx.x];
}
if (numValid == 0 && numAtoms > 0 && finish) {
// We didn't have any more tiles to process, but there were some atoms left over from a
// previous call to this function. Save them now.
if (threadIdx.x == 0)
baseIndex = atomicAdd(interactionCount, 1);
__syncthreads();
if (baseIndex < maxTiles) {
if (threadIdx.x == 0)
interactingTiles[baseIndex] = x;
if (threadIdx.x < TILE_SIZE)
interactingAtoms[baseIndex*TILE_SIZE+threadIdx.x] = (threadIdx.x < numAtoms ? atoms[threadIdx.x] : NUM_ATOMS);
}
}
// Reset the buffer for processing more tiles.
for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
buffer[i] = INVALID;
__syncthreads();
}
/**
* Compare the bounding boxes for each pair of atom blocks (comprised of 32 atoms each), forming a tile. If the two
* atom blocks are sufficiently far apart, mark them as non-interacting. There are two stages in the algorithm.
*
* STAGE 1:
*
* A coarse grain atomblock against interacting atomblock neighbourlist is constructed.
* A coarse grained atom block against interacting atom block neighbour list is constructed.
*
* Each threadblock first loads in some block X of interest. Each thread within the threadblock then loads
* in a different atomblock Y. If Y has exclusions with X, then Y is not processed. If the bounding boxes
* of the two atomblocks are within the cutoff distance, then the two atomblocks are considered to be
* interacting and Y is added to the buffer for X. If during any given iteration an atomblock (or thread)
* finds BUFFER_GROUP interacting blocks, the entire buffer is sent for compaction by storeInteractionData().
* Each warp first loads in some block X of interest. Each thread within the warp then loads
* in a different atom block Y. If Y has exclusions with X, then Y is not processed. If the bounding boxes
* of the two atom blocks are within the cutoff distance, then the two atom blocks are considered to be
* interacting and Y is added to the buffer for X.
*
* STAGE 2:
*
* A fine grain atomblock against interacting atoms neighbourlist is constructed.
* A fine grained atom block against interacting atoms neighbour list is constructed.
*
* The input is an atomblock list detailing the interactions with other atomblocks. The list of interacting
* atom blocks are initially stored in the buffer array in shared memory. buffer is then compacted using
* prefixSum. Afterwards, each threadblock processes one contiguous atomblock X. Each warp in a threadblock
* processes a block Y to find the atoms that interact with any given atom in X. Once BUFFER_SIZE/WARP_SIZE
* (eg. 16) atomblocks have been processed for a given X, the list of interacting atoms in these 16 blocks
* are subsequently compacted. The process repeats until all atomblocks that interact with X are computed.
* The warp loops over atom blocks Y that were found to (possibly) interact with atom block X. Each thread
* in the warp loops over the 32 atoms in X and compares their positions to one particular atom from block Y.
* If it finds one closer than the cutoff distance, the atom is added to the list of atoms interacting with block X.
* This continues until the buffer fills up, at which point the results are written to global memory.
*
* [in] periodicBoxSize - size of the rectangular periodic box
* [in] invPeriodicBoxSize - inverse of the periodic box
......@@ -317,87 +121,167 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
unsigned int numBlocks, real2* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter, const real4* __restrict__ sortedBlockBoundingBox,
const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices, real4* __restrict__ oldPositions,
const int* __restrict__ rebuildNeighborList) {
__shared__ unsigned short buffer[BUFFER_SIZE];
__shared__ short sum[BUFFER_SIZE];
__shared__ ushort2 temp[BUFFER_SIZE];
__shared__ int atoms[BUFFER_SIZE+TILE_SIZE];
__shared__ real3 posBuffer[TILE_SIZE];
__shared__ int exclusionsForX[MAX_EXCLUSIONS];
__shared__ int bufferFull;
__shared__ int globalIndex;
__shared__ int numAtoms;
if (rebuildNeighborList[0] == 0)
return; // The neighbor list doesn't need to be rebuilt.
int valuesInBuffer = 0;
if (threadIdx.x == 0)
bufferFull = false;
for (int i = 0; i < BUFFER_GROUPS; ++i)
buffer[i*GROUP_SIZE+threadIdx.x] = INVALID;
__syncthreads();
// Loop over blocks sorted by size.
for (int i = startBlockIndex+blockIdx.x; i < startBlockIndex+numBlocks; i += gridDim.x) {
if (threadIdx.x == blockDim.x-1)
numAtoms = 0;
real2 sortedKey = sortedBlocks[i];
const int indexInWarp = threadIdx.x%32;
const int warpStart = threadIdx.x-indexInWarp;
const int totalWarps = blockDim.x*gridDim.x/32;
const int warpIndex = (blockIdx.x*blockDim.x+threadIdx.x)/32;
const int warpMask = (1<<indexInWarp)-1;
__shared__ int workgroupBuffer[BUFFER_SIZE*(GROUP_SIZE/32)];
__shared__ int warpExclusions[MAX_EXCLUSIONS*(GROUP_SIZE/32)];
__shared__ real3 posBuffer[GROUP_SIZE];
__shared__ volatile int workgroupTileIndex[GROUP_SIZE/32];
int* buffer = workgroupBuffer+BUFFER_SIZE*(warpStart/32);
int* exclusionsForX = warpExclusions+MAX_EXCLUSIONS*(warpStart/32);
volatile int& tileStartIndex = workgroupTileIndex[warpStart/32];
// Loop over blocks.
for (int block1 = startBlockIndex+warpIndex; block1 < startBlockIndex+numBlocks; block1 += totalWarps) {
// Load data for this block. Note that all threads in a warp are processing the same block.
real2 sortedKey = sortedBlocks[block1];
int x = (int) sortedKey.y;
real4 blockCenterX = sortedBlockCenter[i];
real4 blockSizeX = sortedBlockBoundingBox[i];
real4 blockCenterX = sortedBlockCenter[block1];
real4 blockSizeX = sortedBlockBoundingBox[block1];
int neighborsInBuffer = 0;
real3 pos1 = trimTo3(posq[x*TILE_SIZE+indexInWarp]);
#ifdef USE_PERIODIC
const bool singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= PADDED_CUTOFF &&
0.5f*periodicBoxSize.y-blockSizeX.y >= PADDED_CUTOFF &&
0.5f*periodicBoxSize.z-blockSizeX.z >= PADDED_CUTOFF);
if (singlePeriodicCopy) {
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
pos1.x -= floor((pos1.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
pos1.y -= floor((pos1.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
pos1.z -= floor((pos1.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
}
#endif
posBuffer[threadIdx.x] = pos1;
// Load exclusion data for block x.
const int exclusionStart = exclusionRowIndices[x];
const int exclusionEnd = exclusionRowIndices[x+1];
const int numExclusions = exclusionEnd-exclusionStart;
for (int j = threadIdx.x; j < numExclusions; j += blockDim.x)
for (int j = indexInWarp; j < numExclusions; j += 32)
exclusionsForX[j] = exclusionIndices[exclusionStart+j];
if (MAX_EXCLUSIONS > 32)
__syncthreads();
// Compare it to other blocks after this one in sorted order.
// Loop over atom blocks to search for neighbors. The threads in a warp compare block1 against 32
// other blocks in parallel.
for (int block2Base = block1+1; block2Base < NUM_BLOCKS; block2Base += 32) {
int block2 = block2Base+indexInWarp;
bool includeBlock2 = (block2 < NUM_BLOCKS);
if (includeBlock2) {
real4 blockCenterY = (block2 < NUM_BLOCKS ? sortedBlockCenter[block2] : make_real4(0));
real4 blockSizeY = (block2 < NUM_BLOCKS ? sortedBlockBoundingBox[block2] : make_real4(0));
real4 blockDelta = blockCenterX-blockCenterY;
#ifdef USE_PERIODIC
blockDelta.x -= floor(blockDelta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
blockDelta.y -= floor(blockDelta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
blockDelta.z -= floor(blockDelta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
blockDelta.x = max(0.0f, fabs(blockDelta.x)-blockSizeX.x-blockSizeY.x);
blockDelta.y = max(0.0f, fabs(blockDelta.y)-blockSizeX.y-blockSizeY.y);
blockDelta.z = max(0.0f, fabs(blockDelta.z)-blockSizeX.z-blockSizeY.z);
includeBlock2 &= (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < PADDED_CUTOFF_SQUARED);
if (includeBlock2) {
unsigned short y = (unsigned short) sortedBlocks[block2].y;
for (int k = 0; k < numExclusions; k++)
includeBlock2 &= (exclusionsForX[k] != y);
}
}
// Loop over any blocks we identified as potentially containing neighbors.
int includeBlockFlags = __ballot(includeBlock2);
while (includeBlockFlags != 0) {
int i = __ffs(includeBlockFlags)-1;
includeBlockFlags &= includeBlockFlags-1;
unsigned short y = (unsigned short) sortedBlocks[block2Base+i].y;
for (int base = i+1; base < NUM_BLOCKS; base += blockDim.x) {
int j = base+threadIdx.x;
real2 sortedKey2 = (j < NUM_BLOCKS ? sortedBlocks[j] : make_real2(0));
real4 blockCenterY = (j < NUM_BLOCKS ? sortedBlockCenter[j] : make_real4(0));
real4 blockSizeY = (j < NUM_BLOCKS ? sortedBlockBoundingBox[j] : make_real4(0));
unsigned short y = (unsigned short) sortedKey2.y;
real4 delta = blockCenterX-blockCenterY;
// Check each atom in block Y for interactions.
int start = y*TILE_SIZE;
int atom2 = start+indexInWarp;
real3 pos2 = trimTo3(posq[atom2]);
#ifdef USE_PERIODIC
if (singlePeriodicCopy) {
pos2.x -= floor((pos2.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
pos2.y -= floor((pos2.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
pos2.z -= floor((pos2.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
}
#endif
bool interacts = false;
if (atom2 < NUM_ATOMS) {
#ifdef USE_PERIODIC
if (!singlePeriodicCopy) {
for (int j = 0; j < TILE_SIZE; j++) {
real3 delta = pos2-posBuffer[warpStart+j];
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
}
}
else {
#endif
delta.x = max(0.0f, fabs(delta.x)-blockSizeX.x-blockSizeY.x);
delta.y = max(0.0f, fabs(delta.y)-blockSizeX.y-blockSizeY.y);
delta.z = max(0.0f, fabs(delta.z)-blockSizeX.z-blockSizeY.z);
bool hasExclusions = false;
for (int k = 0; k < numExclusions; k++)
hasExclusions |= (exclusionsForX[k] == y);
if (j < NUM_BLOCKS && delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED && !hasExclusions) {
// Add this tile to the buffer.
int bufferIndex = valuesInBuffer*GROUP_SIZE+threadIdx.x;
buffer[bufferIndex] = y;
valuesInBuffer++;
for (int j = 0; j < TILE_SIZE; j++) {
real3 delta = pos2-posBuffer[warpStart+j];
interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
}
#ifdef USE_PERIODIC
}
#endif
}
// cuda-memcheck --tool racecheck will throw errors about this as
// RAW/WAW/WAR race condition errors. But this is safe in all instances
if (!bufferFull && valuesInBuffer == BUFFER_GROUPS)
bufferFull = true;
// Add any interacting atoms to the buffer.
int includeAtomFlags = __ballot(interacts);
if (interacts)
buffer[neighborsInBuffer+__popc(includeAtomFlags&warpMask)] = atom2;
neighborsInBuffer += __popc(includeAtomFlags);
if (neighborsInBuffer > BUFFER_SIZE-TILE_SIZE) {
// Store the new tiles to memory.
int tilesToStore = neighborsInBuffer/TILE_SIZE;
if (indexInWarp == 0)
tileStartIndex = atomicAdd(interactionCount, tilesToStore);
int newTileStartIndex = tileStartIndex;
if (newTileStartIndex+tilesToStore < maxTiles) {
if (indexInWarp < tilesToStore)
interactingTiles[newTileStartIndex+indexInWarp] = x;
for (int j = 0; j < tilesToStore; j++)
interactingAtoms[(newTileStartIndex+j)*TILE_SIZE+indexInWarp] = buffer[indexInWarp+j*TILE_SIZE];
}
__syncthreads();
if (bufferFull) {
storeInteractionData(x, buffer, sum, temp, atoms, numAtoms, globalIndex, interactionCount, interactingTiles, interactingAtoms, periodicBoxSize, invPeriodicBoxSize, posq, posBuffer, blockCenterX, blockSizeX, maxTiles, false);
valuesInBuffer = 0;
if (threadIdx.x == 0)
bufferFull = false;
buffer[indexInWarp] = buffer[indexInWarp+TILE_SIZE*tilesToStore];
neighborsInBuffer -= TILE_SIZE*tilesToStore;
}
}
}
// If we have a partially filled buffer, store it to memory.
if (neighborsInBuffer > 0) {
int tilesToStore = (neighborsInBuffer+TILE_SIZE-1)/TILE_SIZE;
if (indexInWarp == 0)
tileStartIndex = atomicAdd(interactionCount, tilesToStore);
int newTileStartIndex = tileStartIndex;
if (newTileStartIndex+tilesToStore < maxTiles) {
if (indexInWarp < tilesToStore)
interactingTiles[newTileStartIndex+indexInWarp] = x;
for (int j = 0; j < tilesToStore; j++)
interactingAtoms[(newTileStartIndex+j)*TILE_SIZE+indexInWarp] = (indexInWarp+j*TILE_SIZE < neighborsInBuffer ? buffer[indexInWarp+j*TILE_SIZE] : NUM_ATOMS);
}
__syncthreads();
}
storeInteractionData(x, buffer, sum, temp, atoms, numAtoms, globalIndex, interactionCount, interactingTiles, interactingAtoms, periodicBoxSize, invPeriodicBoxSize, posq, posBuffer, blockCenterX, blockSizeX, maxTiles, true);
}
// Record the positions the neighbor list is based on.
......
......@@ -471,21 +471,25 @@ cl::Program OpenCLContext::createProgram(const string source, const map<string,
if (useDoublePrecision) {
src << "typedef double real;\n";
src << "typedef double2 real2;\n";
src << "typedef double3 real3;\n";
src << "typedef double4 real4;\n";
}
else {
src << "typedef float real;\n";
src << "typedef float2 real2;\n";
src << "typedef float3 real3;\n";
src << "typedef float4 real4;\n";
}
if (useDoublePrecision || useMixedPrecision) {
src << "typedef double mixed;\n";
src << "typedef double2 mixed2;\n";
src << "typedef double3 mixed3;\n";
src << "typedef double4 mixed4;\n";
}
else {
src << "typedef float mixed;\n";
src << "typedef float2 mixed2;\n";
src << "typedef float3 mixed3;\n";
src << "typedef float4 mixed4;\n";
}
for (map<string, string>::const_iterator iter = defines.begin(); iter != defines.end(); ++iter) {
......
......@@ -319,7 +319,7 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
defines["MAX_EXCLUSIONS"] = context.intToString(maxExclusions);
defines["BUFFER_GROUPS"] = (deviceIsCpu ? "4" : "2");
string file = (deviceIsCpu ? OpenCLKernelSources::findInteractingBlocks_cpu : OpenCLKernelSources::findInteractingBlocks);
int groupSize = (deviceIsCpu ? 32 : 128);
int groupSize = (deviceIsCpu || context.getSIMDWidth() < 32 ? 32 : 256);
while (true) {
defines["GROUP_SIZE"] = context.intToString(groupSize);
cl::Program interactingBlocksProgram = context.createProgram(file, defines);
......
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
#define BUFFER_SIZE BUFFER_GROUPS*GROUP_SIZE
#define WARP_SIZE 32
#define INVALID 0xFFFF
#define BUFFER_SIZE 256
/**
* Find a bounding box for the atoms in each block.
......@@ -67,92 +65,115 @@ __kernel void sortBoxData(__global const real2* restrict sortedBlock, __global c
}
}
/**
* Perform a parallel prefix sum over an array. The input values are all assumed to be 0 or 1.
*/
void prefixSum(__local short* sum, __local ushort2* temp) {
for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
temp[i].x = sum[i];
barrier(CLK_LOCAL_MEM_FENCE);
int whichBuffer = 0;
for (int offset = 1; offset < BUFFER_SIZE; offset *= 2) {
if (whichBuffer == 0)
for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
temp[i].y = (i < offset ? temp[i].x : temp[i].x+temp[i-offset].x);
else
for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
temp[i].x = (i < offset ? temp[i].y : temp[i].y+temp[i-offset].y);
whichBuffer = 1-whichBuffer;
barrier(CLK_LOCAL_MEM_FENCE);
}
if (whichBuffer == 0)
for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
sum[i] = temp[i].x;
else
for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
sum[i] = temp[i].y;
barrier(CLK_LOCAL_MEM_FENCE);
}
__kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, __global unsigned int* restrict interactionCount,
__global int* restrict interactingTiles, __global unsigned int* restrict interactingAtoms, __global const real4* restrict posq, unsigned int maxTiles, unsigned int startBlockIndex,
unsigned int numBlocks, __global real2* restrict sortedBlocks, __global const real4* restrict sortedBlockCenter, __global const real4* restrict sortedBlockBoundingBox,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __global real4* restrict oldPositions,
__global const int* restrict rebuildNeighborList) {
/**
* This is called by findBlocksWithInteractions(). It compacts the list of blocks, identifies interactions
* in them, and writes the result to global memory.
*/
void storeInteractionData(int x, __local unsigned short* buffer, __local short* sum, __local ushort2* temp, __local int* atoms, __local int* numAtoms,
__local int* baseIndex, __global unsigned int* interactionCount, __global int* interactingTiles, __global unsigned int* interactingAtoms, real4 periodicBoxSize,
real4 invPeriodicBoxSize, __global const real4* posq, __local real4* posBuffer, real4 blockCenterX, real4 blockSizeX, unsigned int maxTiles, bool finish) {
if (rebuildNeighborList[0] == 0)
return; // The neighbor list doesn't need to be rebuilt.
const int indexInWarp = get_local_id(0)%32;
const int warpStart = get_local_id(0)-indexInWarp;
const int totalWarps = get_global_size(0)/32;
const int warpIndex = get_global_id(0)/32;
const int warpMask = (1<<indexInWarp)-1;
__local int workgroupBuffer[BUFFER_SIZE*(GROUP_SIZE/32)];
__local int warpExclusions[MAX_EXCLUSIONS*(GROUP_SIZE/32)];
__local real3 posBuffer[GROUP_SIZE];
__local volatile int workgroupTileIndex[GROUP_SIZE/32];
__local bool includeBlockFlags[GROUP_SIZE];
__local short2 atomCountBuffer[GROUP_SIZE];
__local int* buffer = workgroupBuffer+BUFFER_SIZE*(warpStart/32);
__local int* exclusionsForX = warpExclusions+MAX_EXCLUSIONS*(warpStart/32);
__local volatile int* tileStartIndex = workgroupTileIndex+(warpStart/32);
// Loop over blocks.
for (int block1 = startBlockIndex+warpIndex; block1 < startBlockIndex+numBlocks; block1 += totalWarps) {
// Load data for this block. Note that all threads in a warp are processing the same block.
real2 sortedKey = sortedBlocks[block1];
int x = (int) sortedKey.y;
real4 blockCenterX = sortedBlockCenter[block1];
real4 blockSizeX = sortedBlockBoundingBox[block1];
int neighborsInBuffer = 0;
real3 pos1 = posq[x*TILE_SIZE+indexInWarp].xyz;
#ifdef USE_PERIODIC
const bool singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= PADDED_CUTOFF &&
0.5f*periodicBoxSize.y-blockSizeX.y >= PADDED_CUTOFF &&
0.5f*periodicBoxSize.z-blockSizeX.z >= PADDED_CUTOFF);
if (get_local_id(0) < TILE_SIZE) {
real4 pos = posq[x*TILE_SIZE+get_local_id(0)];
#ifdef USE_PERIODIC
if (singlePeriodicCopy) {
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
pos.xyz -= floor((pos.xyz-blockCenterX.xyz)*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
pos1.xyz -= floor((pos1.xyz-blockCenterX.xyz)*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
}
#endif
posBuffer[get_local_id(0)] = pos;
}
posBuffer[get_local_id(0)] = pos1;
// The buffer is full, so we need to compact it and write out results. Start by doing a parallel prefix sum.
// Load exclusion data for block x.
const int exclusionStart = exclusionRowIndices[x];
const int exclusionEnd = exclusionRowIndices[x+1];
const int numExclusions = exclusionEnd-exclusionStart;
for (int j = indexInWarp; j < numExclusions; j += 32)
exclusionsForX[j] = exclusionIndices[exclusionStart+j];
if (MAX_EXCLUSIONS > 32)
barrier(CLK_LOCAL_MEM_FENCE);
for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
sum[i] = (buffer[i] == INVALID ? 0 : 1);
barrier(CLK_LOCAL_MEM_FENCE);
prefixSum(sum, temp);
int numValid = sum[BUFFER_SIZE-1];
// Compact the buffer.
else
SYNC_WARPS;
// Loop over atom blocks to search for neighbors. The threads in a warp compare block1 against 32
// other blocks in parallel.
for (int block2Base = block1+1; block2Base < NUM_BLOCKS; block2Base += 32) {
int block2 = block2Base+indexInWarp;
bool includeBlock2 = (block2 < NUM_BLOCKS);
if (includeBlock2) {
real4 blockCenterY = (block2 < NUM_BLOCKS ? sortedBlockCenter[block2] : (real4) (0));
real4 blockSizeY = (block2 < NUM_BLOCKS ? sortedBlockBoundingBox[block2] : (real4) (0));
real4 blockDelta = blockCenterX-blockCenterY;
#ifdef USE_PERIODIC
blockDelta.xyz -= floor(blockDelta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
blockDelta.x = max(0.0f, fabs(blockDelta.x)-blockSizeX.x-blockSizeY.x);
blockDelta.y = max(0.0f, fabs(blockDelta.y)-blockSizeX.y-blockSizeY.y);
blockDelta.z = max(0.0f, fabs(blockDelta.z)-blockSizeX.z-blockSizeY.z);
includeBlock2 &= (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < PADDED_CUTOFF_SQUARED);
if (includeBlock2) {
unsigned short y = (unsigned short) sortedBlocks[block2].y;
for (int k = 0; k < numExclusions; k++)
includeBlock2 &= (exclusionsForX[k] != y);
}
}
for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
if (buffer[i] != INVALID)
temp[sum[i]-1].x = buffer[i];
barrier(CLK_LOCAL_MEM_FENCE);
for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
buffer[i] = temp[i].x;
barrier(CLK_LOCAL_MEM_FENCE);
// Loop over any blocks we identified as potentially containing neighbors.
// Loop over the tiles and find specific interactions in them.
includeBlockFlags[get_local_id(0)] = includeBlock2;
SYNC_WARPS;
for (int i = 0; i < TILE_SIZE; i++) {
while (i < TILE_SIZE && !includeBlockFlags[warpStart+i])
i++;
if (i < TILE_SIZE) {
unsigned short y = (unsigned short) sortedBlocks[block2Base+i].y;
const int indexInWarp = get_local_id(0)%WARP_SIZE;
for (int base = 0; base < numValid; base += BUFFER_SIZE/WARP_SIZE) {
for (int i = get_local_id(0)/WARP_SIZE; i < BUFFER_SIZE/WARP_SIZE && base+i < numValid; i += GROUP_SIZE/WARP_SIZE) {
// Check each atom in block Y for interactions.
real4 pos = posq[buffer[base+i]*TILE_SIZE+indexInWarp];
int start = y*TILE_SIZE;
int atom2 = start+indexInWarp;
real3 pos2 = posq[atom2].xyz;
#ifdef USE_PERIODIC
if (singlePeriodicCopy)
pos.xyz -= floor((pos.xyz-blockCenterX.xyz)*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
pos2.xyz -= floor((pos2.xyz-blockCenterX.xyz)*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
bool interacts = false;
if (atom2 < NUM_ATOMS) {
#ifdef USE_PERIODIC
if (!singlePeriodicCopy) {
for (int j = 0; j < TILE_SIZE; j++) {
real4 delta = pos-posBuffer[j];
real3 delta = pos2-posBuffer[warpStart+j];
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
}
......@@ -160,170 +181,69 @@ void storeInteractionData(int x, __local unsigned short* buffer, __local short*
else {
#endif
for (int j = 0; j < TILE_SIZE; j++) {
real4 delta = pos-posBuffer[j];
real3 delta = pos2-posBuffer[warpStart+j];
interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
}
#ifdef USE_PERIODIC
}
#endif
sum[i*WARP_SIZE+indexInWarp] = (interacts ? 1 : 0);
}
for (int i = numValid-base+get_local_id(0)/WARP_SIZE; i < BUFFER_SIZE/WARP_SIZE; i += GROUP_SIZE/WARP_SIZE)
sum[i*WARP_SIZE+indexInWarp] = 0;
// Compact the list of atoms.
// Do a prefix sum to compact the list of atoms.
barrier(CLK_LOCAL_MEM_FENCE);
prefixSum(sum, temp);
for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
if (sum[i] != (i == 0 ? 0 : sum[i-1]))
atoms[*numAtoms+sum[i]-1] = buffer[base+i/WARP_SIZE]*TILE_SIZE+indexInWarp;
atomCountBuffer[get_local_id(0)].x = (interacts ? 1 : 0);
SYNC_WARPS;
int whichBuffer = 0;
for (int offset = 1; offset < TILE_SIZE; offset *= 2) {
if (whichBuffer == 0)
atomCountBuffer[get_local_id(0)].y = (indexInWarp < offset ? atomCountBuffer[get_local_id(0)].x : atomCountBuffer[get_local_id(0)].x+atomCountBuffer[get_local_id(0)-offset].x);
else
atomCountBuffer[get_local_id(0)].x = (indexInWarp < offset ? atomCountBuffer[get_local_id(0)].y : atomCountBuffer[get_local_id(0)].y+atomCountBuffer[get_local_id(0)-offset].y);
whichBuffer = 1-whichBuffer;
SYNC_WARPS;
}
// Store them to global memory.
// Add any interacting atoms to the buffer.
int atomsToStore = *numAtoms+sum[BUFFER_SIZE-1];
bool storePartialTile = (finish && base >= numValid-BUFFER_SIZE/WARP_SIZE);
int tilesToStore = (storePartialTile ? (atomsToStore+TILE_SIZE-1)/TILE_SIZE : atomsToStore/TILE_SIZE);
if (tilesToStore > 0) {
if (get_local_id(0) == 0)
*baseIndex = atom_add(interactionCount, tilesToStore);
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) == 0)
*numAtoms = atomsToStore-tilesToStore*TILE_SIZE;
if (*baseIndex+tilesToStore <= maxTiles) {
if (get_local_id(0) < tilesToStore)
interactingTiles[*baseIndex+get_local_id(0)] = x;
for (int i = get_local_id(0); i < tilesToStore*TILE_SIZE; i += get_local_size(0))
interactingAtoms[*baseIndex*TILE_SIZE+i] = (i < atomsToStore ? atoms[i] : NUM_ATOMS);
}
if (interacts)
buffer[neighborsInBuffer+atomCountBuffer[get_local_id(0)].y-1] = atom2;
neighborsInBuffer += atomCountBuffer[warpStart+TILE_SIZE-1].y;
if (neighborsInBuffer > BUFFER_SIZE-TILE_SIZE) {
// Store the new tiles to memory.
int tilesToStore = neighborsInBuffer/TILE_SIZE;
if (indexInWarp == 0)
*tileStartIndex = atom_add(interactionCount, tilesToStore);
SYNC_WARPS;
int newTileStartIndex = *tileStartIndex;
if (newTileStartIndex+tilesToStore < maxTiles) {
if (indexInWarp < tilesToStore)
interactingTiles[newTileStartIndex+indexInWarp] = x;
for (int j = 0; j < tilesToStore; j++)
interactingAtoms[(newTileStartIndex+j)*TILE_SIZE+indexInWarp] = buffer[indexInWarp+j*TILE_SIZE];
}
else {
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) == 0)
*numAtoms += sum[BUFFER_SIZE-1];
buffer[indexInWarp] = buffer[indexInWarp+TILE_SIZE*tilesToStore];
neighborsInBuffer -= TILE_SIZE*tilesToStore;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < *numAtoms && !storePartialTile)
atoms[get_local_id(0)] = atoms[tilesToStore*TILE_SIZE+get_local_id(0)];
}
if (numValid == 0 && *numAtoms > 0 && finish) {
// We didn't have any more tiles to process, but there were some atoms left over from a
// previous call to this function. Save them now.
if (get_local_id(0) == 0)
*baseIndex = atom_add(interactionCount, 1);
barrier(CLK_LOCAL_MEM_FENCE);
if (*baseIndex < maxTiles) {
if (get_local_id(0) == 0)
interactingTiles[*baseIndex] = x;
if (get_local_id(0) < TILE_SIZE)
interactingAtoms[*baseIndex*TILE_SIZE+get_local_id(0)] = (get_local_id(0) < *numAtoms ? atoms[get_local_id(0)] : NUM_ATOMS);
}
}
// Reset the buffer for processing more tiles.
for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
buffer[i] = INVALID;
barrier(CLK_LOCAL_MEM_FENCE);
}
/**
* Compare the bounding boxes for each pair of blocks. If they are sufficiently far apart,
* mark them as non-interacting.
*/
__kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, __global unsigned int* restrict interactionCount,
__global int* restrict interactingTiles, __global unsigned int* restrict interactingAtoms, __global const real4* restrict posq, unsigned int maxTiles, unsigned int startBlockIndex,
unsigned int numBlocks, __global real2* restrict sortedBlocks, __global const real4* restrict sortedBlockCenter, __global const real4* restrict sortedBlockBoundingBox,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __global real4* restrict oldPositions,
__global const int* restrict rebuildNeighborList) {
__local unsigned short buffer[BUFFER_SIZE];
__local short sum[BUFFER_SIZE];
__local ushort2 temp[BUFFER_SIZE];
__local int atoms[BUFFER_SIZE+TILE_SIZE];
__local real4 posBuffer[TILE_SIZE];
__local int exclusionsForX[MAX_EXCLUSIONS];
__local int bufferFull;
__local int globalIndex;
__local int numAtoms;
#ifdef AMD_ATOMIC_WORK_AROUND
// Do a byte write to force all memory accesses to interactionCount to use the complete path.
// This avoids the atomic access from causing all word accesses to other buffers from using the slow complete path.
// The IF actually causes the write to never be executed, its presence is all that is needed.
// AMD APP SDK 2.4 has this problem.
if (get_global_id(0) == get_local_id(0)+1)
((__global char*)interactionCount)[sizeof(unsigned int)+1] = 0;
#endif
if (rebuildNeighborList[0] == 0)
return; // The neighbor list doesn't need to be rebuilt.
int valuesInBuffer = 0;
if (get_local_id(0) == 0)
bufferFull = false;
for (int i = 0; i < BUFFER_GROUPS; ++i)
buffer[i*GROUP_SIZE+get_local_id(0)] = INVALID;
barrier(CLK_LOCAL_MEM_FENCE);
// Loop over blocks sorted by size.
for (int i = startBlockIndex+get_group_id(0); i < startBlockIndex+numBlocks; i += get_num_groups(0)) {
if (get_local_id(0) == get_local_size(0)-1)
numAtoms = 0;
real2 sortedKey = sortedBlocks[i];
int x = (int) sortedKey.y;
real4 blockCenterX = sortedBlockCenter[i];
real4 blockSizeX = sortedBlockBoundingBox[i];
// Load exclusion data for block x.
// If we have a partially filled buffer, store it to memory.
const int exclusionStart = exclusionRowIndices[x];
const int exclusionEnd = exclusionRowIndices[x+1];
const int numExclusions = exclusionEnd-exclusionStart;
for (int j = get_local_id(0); j < numExclusions; j += get_local_size(0))
exclusionsForX[j] = exclusionIndices[exclusionStart+j];
barrier(CLK_LOCAL_MEM_FENCE);
// Compare it to other blocks after this one in sorted order.
for (int base = i+1; base < NUM_BLOCKS; base += get_local_size(0)) {
int j = base+get_local_id(0);
real2 sortedKey2 = (j < NUM_BLOCKS ? sortedBlocks[j] : (real2) 0);
real4 blockCenterY = (j < NUM_BLOCKS ? sortedBlockCenter[j] : (real4) 0);
real4 blockSizeY = (j < NUM_BLOCKS ? sortedBlockBoundingBox[j] : (real4) 0);
unsigned short y = (unsigned short) sortedKey2.y;
real4 delta = blockCenterX-blockCenterY;
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
delta.x = max((real) 0, fabs(delta.x)-blockSizeX.x-blockSizeY.x);
delta.y = max((real) 0, fabs(delta.y)-blockSizeX.y-blockSizeY.y);
delta.z = max((real) 0, fabs(delta.z)-blockSizeX.z-blockSizeY.z);
bool hasExclusions = false;
for (int k = 0; k < numExclusions; k++)
hasExclusions |= (exclusionsForX[k] == y);
if (j < NUM_BLOCKS && delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED && !hasExclusions) {
// Add this tile to the buffer.
int bufferIndex = valuesInBuffer*GROUP_SIZE+get_local_id(0);
buffer[bufferIndex] = y;
valuesInBuffer++;
if (!bufferFull && valuesInBuffer == BUFFER_GROUPS)
bufferFull = true;
if (neighborsInBuffer > 0) {
int tilesToStore = (neighborsInBuffer+TILE_SIZE-1)/TILE_SIZE;
if (indexInWarp == 0)
*tileStartIndex = atom_add(interactionCount, tilesToStore);
SYNC_WARPS;
int newTileStartIndex = *tileStartIndex;
if (newTileStartIndex+tilesToStore < maxTiles) {
if (indexInWarp < tilesToStore)
interactingTiles[newTileStartIndex+indexInWarp] = x;
for (int j = 0; j < tilesToStore; j++)
interactingAtoms[(newTileStartIndex+j)*TILE_SIZE+indexInWarp] = (indexInWarp+j*TILE_SIZE < neighborsInBuffer ? buffer[indexInWarp+j*TILE_SIZE] : NUM_ATOMS);
}
barrier(CLK_LOCAL_MEM_FENCE);
if (bufferFull) {
storeInteractionData(x, buffer, sum, temp, atoms, &numAtoms, &globalIndex, interactionCount, interactingTiles, interactingAtoms, periodicBoxSize, invPeriodicBoxSize, posq, posBuffer, blockCenterX, blockSizeX, maxTiles, false);
valuesInBuffer = 0;
if (get_local_id(0) == 0)
bufferFull = false;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
storeInteractionData(x, buffer, sum, temp, atoms, &numAtoms, &globalIndex, interactionCount, interactingTiles, interactingAtoms, periodicBoxSize, invPeriodicBoxSize, posq, posBuffer, blockCenterX, blockSizeX, maxTiles, true);
}
// Record the positions the neighbor list is based on.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment