#define GROUP_SIZE 256 #define BUFFER_SIZE 256 /** * To use half precision, we're supposed to include cuda_fp16.h. Unfortunately, * it isn't included in the search path automatically, and there's no reliable * way to find where it's located on disk. Instead we provide our own definitions * for the few symbols we need. */ struct __align__(2) __half { unsigned short x; }; __device__ __half __float2half_ru(const float f) { __half h; asm("{cvt.rp.f16.f32 %0, %1;}" : "=h"(*reinterpret_cast(&h)) : "f"(f)); return h; } __device__ float __half2float(const __half h) { float f; asm("{cvt.f32.f16 %0, %1;}" : "=f"(f) : "h"(*reinterpret_cast(&h))); return f; } struct half3 { __device__ half3(real3 f) { // Round up so we'll err on the side of making the box a little too large. // This ensures interactions will never be missed. v[0] = __float2half_ru((float) f.x); v[1] = __float2half_ru((float) f.y); v[2] = __float2half_ru((float) f.z); } __device__ real3 toReal3() const { return make_real3(__half2float(v[0]), __half2float(v[1]), __half2float(v[2])); } private: __half v[3]; }; /** * Find a bounding box for the atoms in each block. */ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, const real4* __restrict__ posq, real4* __restrict__ blockCenter, real4* __restrict__ blockBoundingBox, int* __restrict__ rebuildNeighborList, real2* __restrict__ blockSizeRange) { int index = blockIdx.x*blockDim.x+threadIdx.x; int base = index*TILE_SIZE; real minSize = 1e38, maxSize = 0; while (base < numAtoms) { real4 pos = posq[base]; #ifdef USE_PERIODIC APPLY_PERIODIC_TO_POS(pos) #endif real4 minPos = pos; real4 maxPos = pos; int last = min(base+TILE_SIZE, numAtoms); for (int i = base+1; i < last; i++) { pos = posq[i]; #ifdef USE_PERIODIC real4 center = 0.5f*(maxPos+minPos); APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center) #endif minPos = make_real4(min(minPos.x,pos.x), min(minPos.y,pos.y), min(minPos.z,pos.z), 0); maxPos = make_real4(max(maxPos.x,pos.x), max(maxPos.y,pos.y), max(maxPos.z,pos.z), 0); } real4 blockSize = 0.5f*(maxPos-minPos); real4 center = 0.5f*(maxPos+minPos); center.w = 0; for (int i = base; i < last; i++) { pos = posq[i]; real4 delta = posq[i]-center; #ifdef USE_PERIODIC APPLY_PERIODIC_TO_DELTA(delta) #endif center.w = max(center.w, delta.x*delta.x+delta.y*delta.y+delta.z*delta.z); } center.w = sqrt(center.w); blockBoundingBox[index] = blockSize; blockCenter[index] = center; real totalSize = blockSize.x+blockSize.y+blockSize.z; minSize = min(minSize, totalSize); maxSize = max(maxSize, totalSize); index += blockDim.x*gridDim.x; base = index*TILE_SIZE; } // Record the range of sizes seen by threads in this block. __shared__ real minBuffer[64], maxBuffer[64]; minBuffer[threadIdx.x] = minSize; maxBuffer[threadIdx.x] = maxSize; __syncthreads(); for (int step = 1; step < 64; step *= 2) { if (threadIdx.x+step < 64 && threadIdx.x%(2*step) == 0) { minBuffer[threadIdx.x] = min(minBuffer[threadIdx.x], minBuffer[threadIdx.x+step]); maxBuffer[threadIdx.x] = max(maxBuffer[threadIdx.x], maxBuffer[threadIdx.x+step]); } __syncthreads(); } if (threadIdx.x == 0) blockSizeRange[blockIdx.x] = make_real2(minBuffer[0], maxBuffer[0]); if (blockIdx.x == 0 && threadIdx.x == 0) rebuildNeighborList[0] = 0; } extern "C" __global__ void computeSortKeys(const real4* __restrict__ blockBoundingBox, unsigned int* __restrict__ sortedBlocks, real2* __restrict__ blockSizeRange, int numSizes) { // Find the total range of sizes recorded by all blocks. __shared__ real2 sizeRange; if (threadIdx.x == 0) { sizeRange = blockSizeRange[0]; for (int i = 1; i < numSizes; i++) { real2 size = blockSizeRange[i]; if (size.x > 0) sizeRange.x = min(sizeRange.x, size.x); sizeRange.y = max(sizeRange.y, size.y); } sizeRange.x = LOG(sizeRange.x); sizeRange.y = LOG(sizeRange.y); } __syncthreads(); // Sort keys store the bin in the high order part and the block in the low // order part. int numSizeBins = 20; real scale = numSizeBins/(sizeRange.y-sizeRange.x); for (unsigned int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_BLOCKS; i += blockDim.x*gridDim.x) { real4 box = blockBoundingBox[i]; real size = LOG(box.x+box.y+box.z); int bin = (size-sizeRange.x)*scale; bin = max(0, min(bin, numSizeBins-1)); sortedBlocks[i] = (((unsigned int) bin)< 0.25f*PADDING*PADDING) rebuild = true; } if (rebuild) { rebuildNeighborList[0] = 1; interactionCount[0] = 0; interactionCount[1] = 0; } } __device__ int saveSinglePairs(int x, int* atoms, int* flags, int length, unsigned int maxSinglePairs, unsigned int* singlePairCount, int2* singlePairs, int* sumBuffer, volatile unsigned int& pairStartIndex) { // Record interactions that should be computed as single pairs rather than in blocks. const int indexInWarp = threadIdx.x%32; int sum = 0; #pragma unroll 8 // (GROUP_SIZE / TILE_SIZE) for (int i = indexInWarp; i < length; i += 32) { int count = __popc(flags[i]); sum += (count <= MAX_BITS_FOR_PAIRS ? count : 0); } for (int i = 1; i < 32; i *= 2) { int n = __shfl_up_sync(0xffffffff, sum, i); if (indexInWarp >= i) sum += n; } if (indexInWarp == 31) pairStartIndex = atomicAdd(singlePairCount,(unsigned int) sum); __syncwarp(); int prevSum = __shfl_up_sync(0xffffffff, sum, 1); unsigned int pairIndex = pairStartIndex + (indexInWarp > 0 ? prevSum : 0); for (int i = indexInWarp; i < length; i += 32) { int count = __popc(flags[i]); if (count <= MAX_BITS_FOR_PAIRS && pairIndex+count <= maxSinglePairs) { int f = flags[i]; while (f != 0) { singlePairs[pairIndex] = make_int2(atoms[i], x*TILE_SIZE+__ffs(f)-1); f &= f-1; pairIndex++; } } } // Compact the remaining interactions. const int warpMask = (1< MAX_BITS_FOR_PAIRS); int includeFlags = BALLOT(include); if (include) { int index = numCompacted+__popc(includeFlags&warpMask); atoms[index] = atom; flags[index] = flag; } numCompacted += __popc(includeFlags); } return numCompacted; } /** * Compare the bounding boxes for each pair of atom blocks (comprised of 32 atoms each), forming a tile. If the two * atom blocks are sufficiently far apart, mark them as non-interacting. There are two stages in the algorithm. * * STAGE 1: * * A coarse grained atom block against interacting atom block neighbour list is constructed. * * Each warp first loads in some block X of interest. Each thread within the warp then loads * in a different atom block Y. If Y has exclusions with X, then Y is not processed. If the bounding boxes * of the two atom blocks are within the cutoff distance, then the two atom blocks are considered to be * interacting and Y is added to the buffer for X. * * STAGE 2: * * A fine grained atom block against interacting atoms neighbour list is constructed. * * The warp loops over atom blocks Y that were found to (possibly) interact with atom block X. Each thread * in the warp loops over the 32 atoms in X and compares their positions to one particular atom from block Y. * If it finds one closer than the cutoff distance, the atom is added to the list of atoms interacting with block X. * This continues until the buffer fills up, at which point the results are written to global memory. * * [in] periodicBoxSize - size of the rectangular periodic box * [in] invPeriodicBoxSize - inverse of the periodic box * [in] blockCenter - the center of each bounding box * [in] blockBoundingBox - bounding box of each atom block * [out] interactionCount - total number of tiles that have interactions * [out] interactingTiles - set of blocks that have interactions * [out] interactingAtoms - a list of atoms that interact with each atom block * [in] posq - x,y,z coordinates of each atom and charge q * [in] maxTiles - maximum number of tiles to process, used for multi-GPUs * [in] startBlockIndex - first block to process, used for multi-GPUs, * [in] numBlocks - total number of atom blocks * [in] sortedBlocks - a sorted list of atom blocks based on volume * [in] sortedBlockCenter - sorted centers, duplicated for fast access to avoid indexing * [in] sortedBlockBoundingBox - sorted bounding boxes, duplicated for fast access * [in] exclusionIndices - maps into exclusionRowIndices with the starting position for a given atom * [in] exclusionRowIndices - stores the a continuous list of exclusions * eg: block 0 is excluded from atom 3,5,6 * block 1 is excluded from atom 3,4 * block 2 is excluded from atom 1,3,5,6 * exclusionIndices[0][3][5][8] * exclusionRowIndices[3][5][6][3][4][1][3][5][6] * index 0 1 2 3 4 5 6 7 8 * [out] oldPos - stores the positions of the atoms in which this neighbourlist was built on * - this is used to decide when to rebuild a neighbourlist * [in] rebuildNeighbourList - whether or not to execute this kernel * */ extern "C" __global__ __launch_bounds__(GROUP_SIZE,3) void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int* __restrict__ interactionCount, int* __restrict__ interactingTiles, unsigned int* __restrict__ interactingAtoms, int2* __restrict__ singlePairs, const real4* __restrict__ posq, unsigned int maxTiles, unsigned int maxSinglePairs, unsigned int startBlockIndex, unsigned int numBlocks, unsigned int* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter, const half3* __restrict__ sortedBlockBoundingBox, #ifdef USE_LARGE_BLOCKS real4* __restrict__ largeBlockCenter, half3* __restrict__ largeBlockBoundingBox, #endif const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices, real4* __restrict__ oldPositions, const int* __restrict__ rebuildNeighborList) { if (rebuildNeighborList[0] == 0) return; // The neighbor list doesn't need to be rebuilt. const int indexInWarp = threadIdx.x%32; const int warpStart = threadIdx.x-indexInWarp; const int totalWarps = blockDim.x*gridDim.x/32; const int warpIndex = (blockIdx.x*blockDim.x+threadIdx.x)/32; const int warpMask = (1<= PADDED_CUTOFF && 0.5f*periodicBoxSize.y-blockSizeX.y >= PADDED_CUTOFF && 0.5f*periodicBoxSize.z-blockSizeX.z >= PADDED_CUTOFF); if (singlePeriodicCopy) { // The box is small enough that we can just translate all the atoms into a single periodic // box, then skip having to apply periodic boundary conditions later. APPLY_PERIODIC_TO_POS_WITH_CENTER(pos1, blockCenterX) } #endif pos1.w = 0.5f * (pos1.x * pos1.x + pos1.y * pos1.y + pos1.z * pos1.z); posBuffer[threadIdx.x] = pos1; // Load exclusion data for block x. const int exclusionStart = exclusionRowIndices[x]; const int exclusionEnd = exclusionRowIndices[x+1]; const int numExclusions = exclusionEnd-exclusionStart; #pragma unroll 4 // (MAX_EXCLUSIONS) for (int j = indexInWarp; j < numExclusions; j += 32) exclusionsForX[j] = exclusionIndices[exclusionStart+j]; if (MAX_EXCLUSIONS > 32) __syncthreads(); // Loop over atom blocks to search for neighbors. The threads in a warp compare block1 against 32 // other blocks in parallel. #ifdef USE_LARGE_BLOCKS int largeBlockFlags = 0; int loadedLargeBlocks = 0; #endif for (int block2Base = block1+1; block2Base < NUM_BLOCKS; block2Base += 32) { #ifdef USE_LARGE_BLOCKS if (loadedLargeBlocks == 0) { // Check the next set of large blocks. int largeBlockIndex = block2Base + 32*indexInWarp; bool includeLargeBlock = false; if (largeBlockIndex < NUM_BLOCKS) { real4 largeCenter = largeBlockCenter[largeBlockIndex]; real3 largeSize = largeBlockBoundingBox[largeBlockIndex].toReal3(); real4 blockDelta = blockCenterX-largeCenter; #ifdef USE_PERIODIC APPLY_PERIODIC_TO_DELTA(blockDelta) #endif blockDelta.x = max(0.0f, fabs(blockDelta.x)-blockSizeX.x-largeSize.x); blockDelta.y = max(0.0f, fabs(blockDelta.y)-blockSizeX.y-largeSize.y); blockDelta.z = max(0.0f, fabs(blockDelta.z)-blockSizeX.z-largeSize.z); includeLargeBlock = (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < PADDED_CUTOFF_SQUARED); #ifdef TRICLINIC // The calculation to find the nearest periodic copy is only guaranteed to work if the nearest copy is less than half a box width away. // If there's any possibility we might have missed it, do a detailed check. if (periodicBoxSize.z/2-blockSizeX.z-largeSize.z < PADDED_CUTOFF || periodicBoxSize.y/2-blockSizeX.y-largeSize.y < PADDED_CUTOFF) includeLargeBlock = true; #endif } largeBlockFlags = BALLOT(includeLargeBlock); loadedLargeBlocks = 32; } loadedLargeBlocks--; if ((largeBlockFlags&1) == 0) { // None of the next 32 blocks interact with block 1. largeBlockFlags >>= 1; continue; } largeBlockFlags >>= 1; #endif int block2 = block2Base+indexInWarp; bool includeBlock2 = (block2 < NUM_BLOCKS); bool forceInclude = false; if (includeBlock2) { real4 blockCenterY = sortedBlockCenter[block2]; real3 blockSizeY = sortedBlockBoundingBox[block2].toReal3(); real4 blockDelta = blockCenterX-blockCenterY; #ifdef USE_PERIODIC APPLY_PERIODIC_TO_DELTA(blockDelta) #endif includeBlock2 &= (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < (PADDED_CUTOFF+blockCenterX.w+blockCenterY.w)*(PADDED_CUTOFF+blockCenterX.w+blockCenterY.w)); blockDelta.x = max(0.0f, fabs(blockDelta.x)-blockSizeX.x-blockSizeY.x); blockDelta.y = max(0.0f, fabs(blockDelta.y)-blockSizeX.y-blockSizeY.y); blockDelta.z = max(0.0f, fabs(blockDelta.z)-blockSizeX.z-blockSizeY.z); includeBlock2 &= (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < PADDED_CUTOFF_SQUARED); #ifdef TRICLINIC // The calculation to find the nearest periodic copy is only guaranteed to work if the nearest copy is less than half a box width away. // If there's any possibility we might have missed it, do a detailed check. if (periodicBoxSize.z/2-blockSizeX.z-blockSizeY.z < PADDED_CUTOFF || periodicBoxSize.y/2-blockSizeX.y-blockSizeY.y < PADDED_CUTOFF) includeBlock2 = forceInclude = true; #endif if (includeBlock2) { int y = sortedBlocks[block2] & BLOCK_INDEX_MASK; #pragma unroll 4 // (MAX_EXCLUSIONS) for (int k = 0; k < numExclusions; k++) includeBlock2 &= (exclusionsForX[k] != y); } } // Loop over any blocks we identified as potentially containing neighbors. int includeBlockFlags = BALLOT(includeBlock2); int forceIncludeFlags = BALLOT(forceInclude); while (includeBlockFlags != 0) { int i = __ffs(includeBlockFlags)-1; includeBlockFlags &= includeBlockFlags-1; forceInclude = (forceIncludeFlags>>i) & 1; int y = sortedBlocks[block2Base+i] & BLOCK_INDEX_MASK; // Check each atom in block Y for interactions. int atom2 = y*TILE_SIZE+indexInWarp; real4 pos2 = posq[atom2]; #ifdef USE_PERIODIC if (singlePeriodicCopy) { APPLY_PERIODIC_TO_POS_WITH_CENTER(pos2, blockCenterX) } #endif pos2.w = 0.5f * (pos2.x * pos2.x + pos2.y * pos2.y + pos2.z * pos2.z); real4 blockCenterY = sortedBlockCenter[block2Base+i]; real3 atomDelta = trimTo3(posBuffer[warpStart+indexInWarp])-trimTo3(blockCenterY); #ifdef USE_PERIODIC APPLY_PERIODIC_TO_DELTA(atomDelta) #endif int atomFlags = BALLOT(forceInclude || atomDelta.x*atomDelta.x+atomDelta.y*atomDelta.y+atomDelta.z*atomDelta.z < (PADDED_CUTOFF+blockCenterY.w)*(PADDED_CUTOFF+blockCenterY.w)); int interacts = 0; if (atom2 < NUM_ATOMS && atomFlags != 0) { #ifdef USE_PERIODIC if (!singlePeriodicCopy) { int first = __ffs(atomFlags)-1; int last = 32-__clz(atomFlags); for (int j = first; j < last; j++) { real3 delta = trimTo3(pos2)-trimTo3(posBuffer[warpStart+j]); APPLY_PERIODIC_TO_DELTA(delta) interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED ? 1< BUFFER_SIZE-TILE_SIZE) { // Store the new tiles to memory. #if MAX_BITS_FOR_PAIRS > 0 neighborsInBuffer = saveSinglePairs(x, buffer, flagsBuffer, neighborsInBuffer, maxSinglePairs, &interactionCount[1], singlePairs, sumBuffer+warpStart, pairStartIndex); #endif unsigned int tilesToStore = neighborsInBuffer/TILE_SIZE; if (tilesToStore > 0) { if (indexInWarp == 0) tileStartIndex = atomicAdd(&interactionCount[0], tilesToStore); unsigned int newTileStartIndex = tileStartIndex; if (newTileStartIndex+tilesToStore <= maxTiles) { if (indexInWarp < tilesToStore) interactingTiles[newTileStartIndex+indexInWarp] = x; #pragma unroll 8 // (GROUP_SIZE / TILE_SIZE) for (int j = 0; j < tilesToStore; j++) interactingAtoms[(newTileStartIndex+j)*TILE_SIZE+indexInWarp] = buffer[indexInWarp+j*TILE_SIZE]; } if (indexInWarp+TILE_SIZE*tilesToStore < BUFFER_SIZE) buffer[indexInWarp] = buffer[indexInWarp+TILE_SIZE*tilesToStore]; neighborsInBuffer -= TILE_SIZE*tilesToStore; } } } } // If we have a partially filled buffer, store it to memory. #if MAX_BITS_FOR_PAIRS > 0 if (neighborsInBuffer > 32) neighborsInBuffer = saveSinglePairs(x, buffer, flagsBuffer, neighborsInBuffer, maxSinglePairs, &interactionCount[1], singlePairs, sumBuffer+warpStart, pairStartIndex); #endif if (neighborsInBuffer > 0) { unsigned int tilesToStore = (neighborsInBuffer+TILE_SIZE-1)/TILE_SIZE; if (indexInWarp == 0) tileStartIndex = atomicAdd(&interactionCount[0], tilesToStore); unsigned int newTileStartIndex = tileStartIndex; if (newTileStartIndex+tilesToStore <= maxTiles) { if (indexInWarp < tilesToStore) interactingTiles[newTileStartIndex+indexInWarp] = x; #pragma unroll 8 // (GROUP_SIZE / TILE_SIZE) for (int j = 0; j < tilesToStore; j++) interactingAtoms[(newTileStartIndex+j)*TILE_SIZE+indexInWarp] = (indexInWarp+j*TILE_SIZE < neighborsInBuffer ? buffer[indexInWarp+j*TILE_SIZE] : NUM_ATOMS); } } } // Record the positions the neighbor list is based on. for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) oldPositions[i] = posq[i]; }