#define BUFFER_SIZE 256 #if defined(AMD_RDNA) typedef unsigned int warpflags; __device__ inline int warpPopc(warpflags x) { return __popc(x); } #else typedef unsigned long long warpflags; __device__ inline int warpPopc(warpflags x) { return __popcll(x); } #endif struct alignas(sizeof(__half) * 4) BoundingBox { __device__ BoundingBox(real3 f) { // Round up so we'll err on the side of making the box a little too large. // This ensures interactions will never be missed. v[0] = __float2half_ru((float) f.x); v[1] = __float2half_ru((float) f.y); v[2] = __float2half_ru((float) f.z); } __device__ real3 toReal3() const { return make_real3(__half2float(v[0]), __half2float(v[1]), __half2float(v[2])); } private: __half v[3]; }; /** * Find a bounding box for the atoms in each block. */ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, const real4* __restrict__ posq, real4* __restrict__ blockCenter, real4* __restrict__ blockBoundingBox, int* __restrict__ rebuildNeighborList, real2* __restrict__ blockSizeRange) { const int indexInTile = threadIdx.x%TILE_SIZE; const int index = warpSize == TILE_SIZE ? blockIdx.x : (blockIdx.x*(warpSize/TILE_SIZE) + threadIdx.x/TILE_SIZE); const int base = index * TILE_SIZE; if (base >= numAtoms) return; real4 tPos = posq[base + indexInTile < numAtoms ? base + indexInTile : 0]; #ifdef USE_PERIODIC real4 pos; pos.x = SHFL(tPos.x, 0); pos.y = SHFL(tPos.y, 0); pos.z = SHFL(tPos.z, 0); APPLY_PERIODIC_TO_POS(pos) real4 minPos = pos; real4 maxPos = pos; for (int i = 1; i < TILE_SIZE; i++) { pos.x = SHFL(tPos.x, i); pos.y = SHFL(tPos.y, i); pos.z = SHFL(tPos.z, i); real4 center = 0.5f*(maxPos+minPos); APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center) minPos = make_real4(min(minPos.x,pos.x), min(minPos.y,pos.y), min(minPos.z,pos.z), 0); maxPos = make_real4(max(maxPos.x,pos.x), max(maxPos.y,pos.y), max(maxPos.z,pos.z), 0); } #else real4 minPos = tPos; real4 maxPos = tPos; for (int i = TILE_SIZE >> 1; i > 0; i >>= 1) { real4 tpos1, tpos2; tpos1.x = __shfl_down(minPos.x, i, TILE_SIZE); tpos1.y = __shfl_down(minPos.y, i, TILE_SIZE); tpos1.z = __shfl_down(minPos.z, i, TILE_SIZE); tpos2.x = __shfl_down(maxPos.x, i, TILE_SIZE); tpos2.y = __shfl_down(maxPos.y, i, TILE_SIZE); tpos2.z = __shfl_down(maxPos.z, i, TILE_SIZE); minPos.x = min(minPos.x, tpos1.x); minPos.y = min(minPos.y, tpos1.y); minPos.z = min(minPos.z, tpos1.z); maxPos.x = max(maxPos.x, tpos2.x); maxPos.y = max(maxPos.y, tpos2.y); maxPos.z = max(maxPos.z, tpos2.z); } minPos.x = SHFL(minPos.x, 0); minPos.y = SHFL(minPos.y, 0); minPos.z = SHFL(minPos.z, 0); maxPos.x = SHFL(maxPos.x, 0); maxPos.y = SHFL(maxPos.y, 0); maxPos.z = SHFL(maxPos.z, 0); #endif real4 blockSize = 0.5f*(maxPos-minPos); real4 center = 0.5f*(maxPos+minPos); center.w = 0; real4 delta = tPos - center; #ifdef USE_PERIODIC APPLY_PERIODIC_TO_DELTA(delta) #endif real tdelta = delta.x*delta.x+delta.y*delta.y+delta.z*delta.z; real tcenter = max(center.w, tdelta); for (int i = TILE_SIZE >> 1; i > 0; i >>= 1) { real t = __shfl_down(tcenter, i, TILE_SIZE); tcenter = max(tcenter, t); } if (indexInTile == 0) { center.w = SQRT(tcenter); blockBoundingBox[index] = blockSize; blockCenter[index] = center; // Record the range of sizes. real totalSize = blockSize.x+blockSize.y+blockSize.z; atomicMin(&blockSizeRange->x, totalSize); atomicMax(&blockSizeRange->y, totalSize); } if (blockIdx.x == 0 && threadIdx.x == 0) rebuildNeighborList[0] = 0; } extern "C" __global__ void computeSortKeys(const real4* __restrict__ blockBoundingBox, unsigned int* __restrict__ sortedBlocks, const real2* __restrict__ blockSizeRange/*, int numSizes*/) { // Sort keys store the bin in the high order part and the block in the low // order part. real2 sizeRange = make_real2(LOG(blockSizeRange->x), LOG(blockSizeRange->y)); int numSizeBins = 20; real scale = numSizeBins/(sizeRange.y-sizeRange.x); unsigned int i = threadIdx.x+blockIdx.x*blockDim.x; if (i < NUM_BLOCKS) { real4 box = blockBoundingBox[i]; real size = LOG(box.x+box.y+box.z); int bin = (size-sizeRange.x)*scale; bin = max(0, min(bin, numSizeBins-1)); sortedBlocks[i] = (((unsigned int) bin)<x = 1e38; blockSizeRange->y = 0; } // Also check whether any atom has moved enough so that we really need to rebuild the neighbor list. bool rebuild = forceRebuild; if (i < NUM_ATOMS) { real4 delta = oldPositions[i]-posq[i]; if (delta.x*delta.x + delta.y*delta.y + delta.z*delta.z > 0.25f*PADDING*PADDING) rebuild = true; } if (rebuild) { rebuildNeighborList[0] = 1; interactionCount[0] = 0; interactionCount[1] = 0; } } #if MAX_BITS_FOR_PAIRS > 0 __device__ inline void collectInteractions(unsigned int& interacts, float d, int bit) { interacts |= (__float_as_uint(d) >> 31) << bit; } __device__ inline void collectInteractions(unsigned int& interacts, double d, int bit) { interacts |= ((unsigned int)__double2hiint(d) >> 31) << bit; } #else // Simplified version that does not collect individual bits (they are not needed without single pairs), // only sets the flag that there are any interactions. __device__ inline void collectInteractions(unsigned int& interacts, float d, int) { interacts |= __float_as_uint(d) & (1 << 31); } __device__ inline void collectInteractions(unsigned int& interacts, double d, int) { interacts |= (unsigned int)__double2hiint(d) & (1 << 31); } #endif #if !defined(USE_DOUBLE_PRECISION) && __has_builtin(__builtin_amdgcn_mfma_f32_4x4x1f32) #define USE_MFMA using vfloat = __attribute__((__vector_size__(4 * sizeof(float)))) float; template inline __device__ void mfma4x4(const float4& pos1, const float4& pos2, const vfloat& c, unsigned int& interacts) { vfloat d; d = __builtin_amdgcn_mfma_f32_4x4x1f32(pos1.x, -pos2.x, c, 4, BlockId, 0); d = __builtin_amdgcn_mfma_f32_4x4x1f32(pos1.y, -pos2.y, d, 4, BlockId, 0); d = __builtin_amdgcn_mfma_f32_4x4x1f32(pos1.z, -pos2.z, d, 4, BlockId, 0); d = __builtin_amdgcn_mfma_f32_4x4x1f32(pos1.w, 1.0f, d, 4, BlockId, 0); #pragma unroll for (int i = 0; i < 4; i++) { collectInteractions(interacts, d[i], BlockId * 4 + i); } } #endif /** * Compare the bounding boxes for each pair of atom blocks (comprised of TILE_SIZE atoms each), forming a tile. If the two * atom blocks are sufficiently far apart, mark them as non-interacting. There are two stages in the algorithm. * * STAGE 1: * * A coarse grained atom block against interacting atom block neighbour list is constructed. * * Each warp first loads in some block X of interest. Each thread within the warp then loads * in a different atom block Y. If Y has exclusions with X, then Y is not processed. If the bounding boxes * of the two atom blocks are within the cutoff distance, then the two atom blocks are considered to be * interacting and Y is added to the buffer for X. * * STAGE 2: * * A fine grained atom block against interacting atoms neighbour list is constructed. * * The warp loops over atom blocks Y that were found to (possibly) interact with atom block X. Each thread * in the warp loops over the TILE_SIZE atoms in X and compares their positions to one particular atom from block Y. * If it finds one closer than the cutoff distance, the atom is added to the list of atoms interacting with block X. * This continues until the buffer fills up, at which point the results are written to global memory. * * [in] periodicBoxSize - size of the rectangular periodic box * [in] invPeriodicBoxSize - inverse of the periodic box * [in] blockCenter - the center of each bounding box * [in] blockBoundingBox - bounding box of each atom block * [out] interactionCount - total number of tiles that have interactions * [out] interactingTiles - set of blocks that have interactions * [out] interactingAtoms - a list of atoms that interact with each atom block * [in] posq - x,y,z coordinates of each atom and charge q * [in] maxTiles - maximum number of tiles to process, used for multi-GPUs * [in] startBlockIndex - first block to process, used for multi-GPUs, * [in] numBlocks - total number of atom blocks * [in] sortedBlocks - a sorted list of atom blocks based on volume * [in] sortedBlockCenter - sorted centers, duplicated for fast access to avoid indexing * [in] sortedBlockBoundingBox - sorted bounding boxes, duplicated for fast access * [in] exclusionIndices - maps into exclusionRowIndices with the starting position for a given atom * [in] exclusionRowIndices - stores the a continuous list of exclusions * eg: block 0 is excluded from atom 3,5,6 * block 1 is excluded from atom 3,4 * block 2 is excluded from atom 1,3,5,6 * exclusionIndices[0][3][5][8] * exclusionRowIndices[3][5][6][3][4][1][3][5][6] * index 0 1 2 3 4 5 6 7 8 * [out] oldPos - stores the positions of the atoms in which this neighbourlist was built on * - this is used to decide when to rebuild a neighbourlist * [in] rebuildNeighbourList - whether or not to execute this kernel * */ extern "C" __global__ __launch_bounds__(GROUP_SIZE) void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int* __restrict__ interactionCount, int* __restrict__ interactingTiles, unsigned int* __restrict__ interactingAtoms, int2* __restrict__ singlePairs, const real4* __restrict__ posq, unsigned int maxTiles, unsigned int maxSinglePairs, unsigned int startBlockIndex, unsigned int numBlocks, const unsigned int* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter, const BoundingBox* __restrict__ sortedBlockBoundingBox, #ifdef USE_LARGE_BLOCKS const real4* __restrict__ largeBlockCenter, const BoundingBox* __restrict__ largeBlockBoundingBox, #endif const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices, real4* __restrict__ oldPositions, const int* __restrict__ rebuildNeighborList) { if (rebuildNeighborList[0] == 0) return; // The neighbor list doesn't need to be rebuilt. constexpr int tilesPerWarp = warpSize/TILE_SIZE; constexpr int warpsPerBlock = GROUP_SIZE/warpSize; const int indexInWarp = threadIdx.x%warpSize; const int indexInTile = threadIdx.x%TILE_SIZE; const int tileInWarp = tilesPerWarp == 1 ? 0 : (threadIdx.x/TILE_SIZE)%tilesPerWarp; const int warpInBlock = warpsPerBlock == 1 ? 0 : threadIdx.x/warpSize; const int warpIndex = blockIdx.x*warpsPerBlock + (warpsPerBlock == 1 ? 0 : warpInBlock); const warpflags warpMask = (static_cast(1)<= PADDED_CUTOFF && 0.5f*periodicBoxSize.y-blockSizeX.y >= PADDED_CUTOFF && 0.5f*periodicBoxSize.z-blockSizeX.z >= PADDED_CUTOFF); if (singlePeriodicCopy) { // The box is small enough that we can just translate all the atoms into a single periodic // box, then skip having to apply periodic boundary conditions later. APPLY_PERIODIC_TO_POS_WITH_CENTER(pos1, blockCenterX) } #endif pos1.w = 0.5f * (pos1.x * pos1.x + pos1.y * pos1.y + pos1.z * pos1.z); if (tileInWarp == 0) { posBuffer[indexInTile] = pos1; } // Load exclusion data for block x. const int exclusionStart = exclusionRowIndices[x]; const int exclusionEnd = exclusionRowIndices[x+1]; const int numExclusions = exclusionEnd-exclusionStart; #pragma unroll 1 for (int j = indexInWarp; j < numExclusions; j += warpSize) exclusionsForX[j] = exclusionIndices[exclusionStart+j]; // Loop over atom blocks to search for neighbors. The threads in a warp compare block1 against warpSize // other blocks in parallel. // For small systems multiple warps (NUM_TILES_IN_BATCH = 4, 2...) process one block1 reducing the overall // duration of the kernel because first blocks block1 have to process more block2 blocks so most of compute // units are idle at the end of the kernel (the kernel works on the upper triangle of // the NUM_BLOCKS x NUM_BLOCKS matrix). #ifdef USE_LARGE_BLOCKS warpflags largeBlockFlags = 0; int loadedLargeBlocks = 0; #endif int block2Count = 0; // Load blocks from addresses aligned by warpSize for faster loading from sortedBlockCenter and sortedBlockBoundingBox. for (int block2Base = ((block1+1)/warpSize + warpIndex%NUM_TILES_IN_BATCH)*warpSize; block2Base < NUM_BLOCKS; block2Base += warpSize*NUM_TILES_IN_BATCH) { // Last iteration cannot be skipped (on CDNA where tilesPerWarp == 2) const bool lastIteration = block2Base + warpSize*NUM_TILES_IN_BATCH >= NUM_BLOCKS; #ifdef USE_LARGE_BLOCKS if (loadedLargeBlocks == 0) { // Check the next set of large blocks. int largeBlockIndex = block2Base + warpSize*NUM_TILES_IN_BATCH*indexInWarp; bool includeLargeBlock = false; if (largeBlockIndex < NUM_BLOCKS) { real4 largeCenter = largeBlockCenter[largeBlockIndex]; real3 largeSize = largeBlockBoundingBox[largeBlockIndex].toReal3(); real4 blockDelta = blockCenterX-largeCenter; #ifdef USE_PERIODIC APPLY_PERIODIC_TO_DELTA(blockDelta) #endif blockDelta.x = max(0.0f, fabs(blockDelta.x)-blockSizeX.x-largeSize.x); blockDelta.y = max(0.0f, fabs(blockDelta.y)-blockSizeX.y-largeSize.y); blockDelta.z = max(0.0f, fabs(blockDelta.z)-blockSizeX.z-largeSize.z); includeLargeBlock = (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < PADDED_CUTOFF_SQUARED); #ifdef TRICLINIC // The calculation to find the nearest periodic copy is only guaranteed to work if the nearest copy is less than half a box width away. // If there's any possibility we might have missed it, do a detailed check. if (periodicBoxSize.z/2-blockSizeX.z-largeSize.z < PADDED_CUTOFF || periodicBoxSize.y/2-blockSizeX.y-largeSize.y < PADDED_CUTOFF) includeLargeBlock = true; #endif } largeBlockFlags = __ballot(includeLargeBlock); loadedLargeBlocks = warpSize; } loadedLargeBlocks--; if ((largeBlockFlags&1) == 0 && !lastIteration) { // None of the next warpSize blocks interact with block 1. largeBlockFlags >>= 1; continue; } largeBlockFlags >>= 1; #endif int block2 = block2Base+indexInWarp; bool includeBlock2 = (block1 < block2 && block2 < NUM_BLOCKS); block2 = includeBlock2 ? block2 : block1; bool forceInclude = false; real4 blockCenterY = sortedBlockCenter[block2]; real4 blockDelta = blockCenterX-blockCenterY; #ifdef USE_PERIODIC APPLY_PERIODIC_TO_DELTA(blockDelta) #endif includeBlock2 &= (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < (PADDED_CUTOFF+blockCenterX.w+blockCenterY.w)*(PADDED_CUTOFF+blockCenterX.w+blockCenterY.w)); #ifndef TRICLINIC if (!lastIteration && __ballot(includeBlock2) == 0) continue; #endif real3 blockSizeY = sortedBlockBoundingBox[block2].toReal3(); blockDelta.x = max(0.0f, fabs(blockDelta.x)-blockSizeX.x-blockSizeY.x); blockDelta.y = max(0.0f, fabs(blockDelta.y)-blockSizeX.y-blockSizeY.y); blockDelta.z = max(0.0f, fabs(blockDelta.z)-blockSizeX.z-blockSizeY.z); includeBlock2 &= (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < PADDED_CUTOFF_SQUARED); #ifdef TRICLINIC // The calculation to find the nearest periodic copy is only guaranteed to work if the nearest copy is less than half a box width away. // If there's any possibility we might have missed it, do a detailed check. if (periodicBoxSize.z/2-blockSizeX.z-blockSizeY.z < PADDED_CUTOFF || periodicBoxSize.y/2-blockSizeX.y-blockSizeY.y < PADDED_CUTOFF) includeBlock2 = forceInclude = true; #endif // Collect any blocks we identified as potentially containing neighbors. warpflags includeBlockFlags = __ballot(includeBlock2); if (includeBlock2) { int index = block2Count + warpPopc(includeBlockFlags&warpMask); block2Buffer[index] = (block2 << 1) | (forceInclude ? 1 : 0); } block2Count += warpPopc(includeBlockFlags); // Loop over the collected candidates (each warp processes 2 blocks on CDNA or 1 block on RDNA). // Process even number of blocks on CDNA so both half-warps have work to do (except for // the last iteration of the for-block2Base loop when the left-over must be processed). const int block2ToProcess = lastIteration ? block2Count : block2Count/tilesPerWarp*tilesPerWarp; for (int block2Index = 0; block2Index < block2ToProcess; block2Index += tilesPerWarp) { bool includeBlock2 = block2Index + tileInWarp < block2Count; const int b = block2Buffer[min(block2Index + tileInWarp, block2Count - 1)]; const bool forceInclude = b & 1; const int block2 = b >> 1; int y = sortedBlocks[block2] & BLOCK_INDEX_MASK; #pragma unroll 1 for (int k = indexInTile; k < numExclusions; k += TILE_SIZE) includeBlock2 &= (exclusionsForX[k] != y); includeBlock2 = BALLOT(!includeBlock2) == 0; // Check each atom in block Y for interactions. int atom2 = y*TILE_SIZE+indexInTile; real4 pos2 = posq[atom2]; #ifdef USE_PERIODIC if (singlePeriodicCopy) { APPLY_PERIODIC_TO_POS_WITH_CENTER(pos2, blockCenterX) } #endif pos2.w = 0.5f * (pos2.x * pos2.x + pos2.y * pos2.y + pos2.z * pos2.z); real4 blockCenterY = sortedBlockCenter[block2]; real3 atomDelta = trimTo3(pos1)-trimTo3(blockCenterY); #ifdef USE_PERIODIC APPLY_PERIODIC_TO_DELTA(atomDelta) #endif tileflags atomFlags = BALLOT(forceInclude || atomDelta.x*atomDelta.x+atomDelta.y*atomDelta.y+atomDelta.z*atomDelta.z < (PADDED_CUTOFF+blockCenterY.w)*(PADDED_CUTOFF+blockCenterY.w)); tileflags interacts = 0; // The condition `posj.w + pos2.w - posj.x*pos2.x - posj.y*pos2.y - posj.z*pos2.z < 0.5f * PADDED_CUTOFF_SQUARED` is expressed as // `posj.x*pos2.x - posj.y*pos2.y - posj.z*pos2.z - posj.w - 0.5f * PADDED_CUTOFF_SQUARED - pos2.w` and computed using fma // (it saves 1 instruction). // Sign bit is used directly instead of `halfDist2 < 0.5f * PADDED_CUTOFF_SQUARED ? 1<(1) << j); real3 delta = trimTo3(pos2)-trimTo3(posBuffer[j]); APPLY_PERIODIC_TO_DELTA(delta) real d = delta.x*delta.x+delta.y*delta.y+delta.z*delta.z - PADDED_CUTOFF_SQUARED; collectInteractions(interacts, d, j); } } else { #endif const real lim = 0.5f * PADDED_CUTOFF_SQUARED - pos2.w; #if defined(USE_MFMA) const vfloat c = { -lim, -lim, -lim, -lim }; mfma4x4<0>(pos1, pos2, c, interacts); mfma4x4<1>(pos1, pos2, c, interacts); mfma4x4<2>(pos1, pos2, c, interacts); mfma4x4<3>(pos1, pos2, c, interacts); mfma4x4<4>(pos1, pos2, c, interacts); mfma4x4<5>(pos1, pos2, c, interacts); mfma4x4<6>(pos1, pos2, c, interacts); mfma4x4<7>(pos1, pos2, c, interacts); #else while (atomFlags) { int j = __ffs(atomFlags)-1; atomFlags = atomFlags ^ (static_cast(1) << j); real4 posj = posBuffer[j]; real d = fma(-posj.x, pos2.x, fma(-posj.y, pos2.y, fma(-posj.z, pos2.z, posj.w - lim))); collectInteractions(interacts, d, j); } #endif #ifdef USE_PERIODIC } #endif if (atom2 >= NUM_ATOMS || !includeBlock2) { interacts = 0; } #if MAX_BITS_FOR_PAIRS > 0 const unsigned int interactCount = __popc(interacts); // Record interactions that should be computed as single pairs rather than in blocks. const bool storeAsSinglePair = interactCount > 0 && interactCount <= MAX_BITS_FOR_PAIRS; if (__ballot(storeAsSinglePair)) { unsigned int sum = 0; unsigned int prevSum = 0; for (int i = 1; i <= MAX_BITS_FOR_PAIRS; i++) { warpflags b = __ballot(interactCount == i); sum += warpPopc(b) * i; prevSum += warpPopc(b&warpMask) * i; } unsigned int pairStartIndex = 0; if (indexInWarp == 0) pairStartIndex = atomicAdd(&interactionCount[1], sum); pairStartIndex = __shfl(pairStartIndex, 0); unsigned int pairIndex = pairStartIndex + prevSum; if (storeAsSinglePair && pairIndex+interactCount <= maxSinglePairs) { while (interacts != 0) { int j = __ffs(interacts)-1; singlePairs[pairIndex] = make_int2(atom2, x*TILE_SIZE+j); interacts = interacts ^ (static_cast(1) << j); pairIndex++; } } } #else const unsigned int interactCount = interacts; #endif // Add any interacting atoms to the buffer. warpflags includeAtomFlags = __ballot(interactCount > MAX_BITS_FOR_PAIRS); if (interactCount > MAX_BITS_FOR_PAIRS) { int index = neighborsInBuffer+warpPopc(includeAtomFlags&warpMask); buffer[index] = atom2; } neighborsInBuffer += warpPopc(includeAtomFlags); if (neighborsInBuffer > BUFFER_SIZE-warpSize) { // Store the new tiles to memory. unsigned int tilesToStore = neighborsInBuffer/warpSize*tilesPerWarp; unsigned int tileStartIndex = 0; if (indexInWarp == 0) tileStartIndex = atomicAdd(&interactionCount[0], tilesToStore); unsigned int newTileStartIndex = __shfl(tileStartIndex, 0); if (newTileStartIndex+tilesToStore <= maxTiles) { if (indexInWarp < tilesToStore) interactingTiles[newTileStartIndex+indexInWarp] = x; for (int j = 0; j < tilesToStore/tilesPerWarp; j++) interactingAtoms[newTileStartIndex*TILE_SIZE+j*warpSize+indexInWarp] = buffer[j*warpSize+indexInWarp]; } if (indexInWarp+TILE_SIZE*tilesToStore < BUFFER_SIZE) buffer[indexInWarp] = buffer[indexInWarp+TILE_SIZE*tilesToStore]; neighborsInBuffer -= TILE_SIZE*tilesToStore; } } // Move not processed blocks to the head of block2Buffer. if (indexInWarp < block2Count - block2ToProcess) block2Buffer[indexInWarp] = block2Buffer[block2ToProcess + indexInWarp]; block2Count = block2Count - block2ToProcess; } // If we have a partially filled buffer, store it to memory. if (neighborsInBuffer > 0) { unsigned int tilesToStore = (neighborsInBuffer+TILE_SIZE-1)/TILE_SIZE; unsigned int tileStartIndex = 0; if (indexInWarp == 0) tileStartIndex = atomicAdd(&interactionCount[0], tilesToStore); unsigned int newTileStartIndex = __shfl(tileStartIndex, 0); if (newTileStartIndex+tilesToStore <= maxTiles) { if (indexInWarp < tilesToStore) interactingTiles[newTileStartIndex+indexInWarp] = x; for (int j = 0; j <= tilesToStore/tilesPerWarp; j++) { if (j*warpSize+indexInWarp < tilesToStore*TILE_SIZE) interactingAtoms[newTileStartIndex*TILE_SIZE+j*warpSize+indexInWarp] = (j*warpSize+indexInWarp < neighborsInBuffer ? buffer[j*warpSize+indexInWarp] : PADDED_NUM_ATOMS); } } } } // Record the positions the neighbor list is based on. int i = threadIdx.x+blockIdx.x*GROUP_SIZE; if (i < NUM_ATOMS) { oldPositions[i] = posq[i]; } } extern "C" __global__ void copyInteractionCounts(const unsigned int* __restrict__ interactionCount, unsigned int* __restrict__ pinnedInteractionCount) { pinnedInteractionCount[0] = interactionCount[0]; pinnedInteractionCount[1] = interactionCount[1]; }