findInteractingBlocks.cu 25.8 KB
Newer Older
1
#define GROUP_SIZE 256
2
#define BUFFER_SIZE 256
3

4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
/**
 * To use half precision, we're supposed to include cuda_fp16.h.  Unfortunately,
 * it isn't included in the search path automatically, and there's no reliable
 * way to find where it's located on disk.  Instead we provide our own definitions
 * for the few symbols we need.
 */
struct __align__(2) __half {
    unsigned short x;
};
__device__ __half __float2half_ru(const float f) {
    __half h;
    asm("{cvt.rp.f16.f32 %0, %1;}" : "=h"(*reinterpret_cast<unsigned short *>(&h)) : "f"(f));
    return h;
}
__device__ float __half2float(const __half h) {
    float f;
    asm("{cvt.f32.f16 %0, %1;}" : "=f"(f) : "h"(*reinterpret_cast<const unsigned short *>(&h)));
    return f;
}
struct half3 {
    __device__ half3(real3 f) {
        // Round up so we'll err on the side of making the box a little too large.
        // This ensures interactions will never be missed.
        v[0] = __float2half_ru((float) f.x);
        v[1] = __float2half_ru((float) f.y);
        v[2] = __float2half_ru((float) f.z);
    }
    __device__ real3 toReal3() const {
        return make_real3(__half2float(v[0]), __half2float(v[1]), __half2float(v[2]));
    }
private:
    __half v[3];
};

38
39
40
/**
 * Find a bounding box for the atoms in each block.
 */
41
42
extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        const real4* __restrict__ posq, real4* __restrict__ blockCenter, real4* __restrict__ blockBoundingBox, int* __restrict__ rebuildNeighborList,
43
        real2* __restrict__ blockSizeRange) {
44
45
    int index = blockIdx.x*blockDim.x+threadIdx.x;
    int base = index*TILE_SIZE;
46
    real minSize = 1e38, maxSize = 0;
47
48
49
    while (base < numAtoms) {
        real4 pos = posq[base];
#ifdef USE_PERIODIC
50
        APPLY_PERIODIC_TO_POS(pos)
51
52
53
54
55
56
57
#endif
        real4 minPos = pos;
        real4 maxPos = pos;
        int last = min(base+TILE_SIZE, numAtoms);
        for (int i = base+1; i < last; i++) {
            pos = posq[i];
#ifdef USE_PERIODIC
58
            real4 center = 0.5f*(maxPos+minPos);
59
            APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)
60
61
62
63
#endif
            minPos = make_real4(min(minPos.x,pos.x), min(minPos.y,pos.y), min(minPos.z,pos.z), 0);
            maxPos = make_real4(max(maxPos.x,pos.x), max(maxPos.y,pos.y), max(maxPos.z,pos.z), 0);
        }
64
        real4 blockSize = 0.5f*(maxPos-minPos);
Peter Eastman's avatar
Peter Eastman committed
65
        real4 center = 0.5f*(maxPos+minPos);
66
        center.w = 0;
67
        for (int i = base; i < last; i++) {
Peter Eastman's avatar
Peter Eastman committed
68
69
70
71
72
            pos = posq[i];
            real4 delta = posq[i]-center;
#ifdef USE_PERIODIC
            APPLY_PERIODIC_TO_DELTA(delta)
#endif
73
            center.w = max(center.w, delta.x*delta.x+delta.y*delta.y+delta.z*delta.z);
Peter Eastman's avatar
Peter Eastman committed
74
        }
75
        center.w = sqrt(center.w);
76
        blockBoundingBox[index] = blockSize;
Peter Eastman's avatar
Peter Eastman committed
77
        blockCenter[index] = center;
78
79
80
        real totalSize = blockSize.x+blockSize.y+blockSize.z;
        minSize = min(minSize, totalSize);
        maxSize = max(maxSize, totalSize);
81
82
83
        index += blockDim.x*gridDim.x;
        base = index*TILE_SIZE;
    }
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
    
    // Record the range of sizes seen by threads in this block.

    __shared__ real minBuffer[64], maxBuffer[64];
    minBuffer[threadIdx.x] = minSize;
    maxBuffer[threadIdx.x] = maxSize;
    __syncthreads();
    for (int step = 1; step < 64; step *= 2) {
        if (threadIdx.x+step < 64 && threadIdx.x%(2*step) == 0) {
            minBuffer[threadIdx.x] = min(minBuffer[threadIdx.x], minBuffer[threadIdx.x+step]);
            maxBuffer[threadIdx.x] = max(maxBuffer[threadIdx.x], maxBuffer[threadIdx.x+step]);
        }
        __syncthreads();
    }
    if (threadIdx.x == 0)
        blockSizeRange[blockIdx.x] = make_real2(minBuffer[0], maxBuffer[0]);
100
    if (blockIdx.x == 0 && threadIdx.x == 0)
101
        rebuildNeighborList[0] = 0;
102
103
}

104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
extern "C" __global__ void computeSortKeys(const real4* __restrict__ blockBoundingBox, unsigned int* __restrict__ sortedBlocks, real2* __restrict__ blockSizeRange, int numSizes) {
    // Find the total range of sizes recorded by all blocks.

    __shared__ real2 sizeRange;
    if (threadIdx.x == 0) {
        sizeRange = blockSizeRange[0];
        for (int i = 1; i < numSizes; i++) {
            real2 size = blockSizeRange[i];
            sizeRange.x = min(sizeRange.x, size.x);
            sizeRange.y = max(sizeRange.y, size.y);
        }
        sizeRange.x = LOG(sizeRange.x);
        sizeRange.y = LOG(sizeRange.y);
    }
    __syncthreads();

    // Sort keys store the bin in the high order part and the block in the low
    // order part.

    int numSizeBins = 20;
    real scale = numSizeBins/(sizeRange.y-sizeRange.x);
    for (unsigned int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_BLOCKS; i += blockDim.x*gridDim.x) {
        real4 box = blockBoundingBox[i];
        real size = LOG(box.x+box.y+box.z);
        int bin = (size-sizeRange.x)*scale;
        bin = max(0, min(bin, numSizeBins-1));
        sortedBlocks[i] = (((unsigned int) bin)<<BIN_SHIFT) + i;
    }
}

134
/**
135
 * Sort the data about bounding boxes so it can be accessed more efficiently in the next kernel.
136
 */
137
extern "C" __global__ void sortBoxData(const unsigned int* __restrict__ sortedBlocks, const real4* __restrict__ blockCenter,
138
139
140
141
142
143
        const real4* __restrict__ blockBoundingBox, real4* __restrict__ sortedBlockCenter, half3* __restrict__ sortedBlockBoundingBox,
#ifdef USE_LARGE_BLOCKS
        real4* __restrict__ largeBlockCenter, half3* __restrict__ largeBlockBoundingBox, real4 periodicBoxSize,
        real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
#endif
        const real4* __restrict__ posq, const real4* __restrict__ oldPositions,
144
        unsigned int* __restrict__ interactionCount, int* __restrict__ rebuildNeighborList, bool forceRebuild) {
145
    for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_BLOCKS; i += blockDim.x*gridDim.x) {
146
        unsigned int index = sortedBlocks[i] & BLOCK_INDEX_MASK;
147
        sortedBlockCenter[i] = blockCenter[index];
148
        sortedBlockBoundingBox[i] = half3(trimTo3(blockBoundingBox[index]));
149
    }
150
151
#ifdef USE_LARGE_BLOCKS
    // Compute the sizes of large blocks (composed of 32 regular blocks) starting from each block.
152
    
153
    for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_BLOCKS; i += blockDim.x*gridDim.x) {
154
155
156
        unsigned int index = sortedBlocks[i] & BLOCK_INDEX_MASK;
        real4 minPos = blockCenter[index]-blockBoundingBox[index];
        real4 maxPos = blockCenter[index]+blockBoundingBox[index];
157
158
        int last = min(i+32, NUM_BLOCKS);
        for (int j = i+1; j < last; j++) {
159
160
161
            index = sortedBlocks[j] & BLOCK_INDEX_MASK;
            real4 blockPos = blockCenter[index];
            real4 width = blockBoundingBox[index];
162
163
164
165
166
167
168
169
170
171
172
#ifdef USE_PERIODIC
            real4 center = 0.5f*(maxPos+minPos);
            APPLY_PERIODIC_TO_POS_WITH_CENTER(blockPos, center)
#endif
            minPos = make_real4(min(minPos.x, blockPos.x-width.x), min(minPos.y, blockPos.y-width.y), min(minPos.z, blockPos.z-width.z), 0);
            maxPos = make_real4(max(maxPos.x, blockPos.x+width.x), max(maxPos.y, blockPos.y+width.y), max(maxPos.z, blockPos.z+width.z), 0);
        }
        largeBlockCenter[i] = 0.5f*(maxPos+minPos);
        largeBlockBoundingBox[i] = half3(trimTo3(0.5f*(maxPos-minPos)));
    }
#endif
173
174
    // Also check whether any atom has moved enough so that we really need to rebuild the neighbor list.

175
    bool rebuild = forceRebuild;
176
177
178
179
180
181
182
183
    for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
        real4 delta = oldPositions[i]-posq[i];
        if (delta.x*delta.x + delta.y*delta.y + delta.z*delta.z > 0.25f*PADDING*PADDING)
            rebuild = true;
    }
    if (rebuild) {
        rebuildNeighborList[0] = 1;
        interactionCount[0] = 0;
184
        interactionCount[1] = 0;
185
186
    }
}
187

188
__device__ int saveSinglePairs(int x, int* atoms, int* flags, int length, unsigned int maxSinglePairs, unsigned int* singlePairCount, int2* singlePairs, int* sumBuffer, volatile unsigned int& pairStartIndex) {
189
190
    // Record interactions that should be computed as single pairs rather than in blocks.
    
191
192
    const int indexInWarp = threadIdx.x%32;
    int sum = 0;
193
    #pragma unroll 8 // (GROUP_SIZE / TILE_SIZE)
194
195
    for (int i = indexInWarp; i < length; i += 32) {
        int count = __popc(flags[i]);
196
        sum += (count <= MAX_BITS_FOR_PAIRS ? count : 0);
197
    }
peastman's avatar
peastman committed
198
199
200
201
    for (int i = 1; i < 32; i *= 2) {
        int n = __shfl_up_sync(0xffffffff, sum, i);
        if (indexInWarp >= i)
            sum += n;
202
    }
peastman's avatar
peastman committed
203
204
205
206
    if (indexInWarp == 31)
        pairStartIndex = atomicAdd(singlePairCount,(unsigned int) sum);
    __syncwarp();
    int prevSum = __shfl_up_sync(0xffffffff, sum, 1);
207
    unsigned int pairIndex = pairStartIndex + (indexInWarp > 0 ? prevSum : 0);
208
209
    for (int i = indexInWarp; i < length; i += 32) {
        int count = __popc(flags[i]);
210
        if (count <= MAX_BITS_FOR_PAIRS && pairIndex+count <= maxSinglePairs) {
211
212
213
214
215
216
217
218
219
            int f = flags[i];
            while (f != 0) {
                singlePairs[pairIndex] = make_int2(atoms[i], x*TILE_SIZE+__ffs(f)-1);
                f &= f-1;
                pairIndex++;
            }
        }
    }
    
220
    // Compact the remaining interactions.
221
222
223
    
    const int warpMask = (1<<indexInWarp)-1;
    int numCompacted = 0;
224
225
    for (int start = 0; start < length; start += 32) {
        int i = start+indexInWarp;
226
227
        int atom = atoms[i];
        int flag = flags[i];
228
        bool include = (i < length && __popc(flags[i]) > MAX_BITS_FOR_PAIRS);
229
        int includeFlags = BALLOT(include);
230
231
232
233
234
235
236
237
238
239
        if (include) {
            int index = numCompacted+__popc(includeFlags&warpMask);
            atoms[index] = atom;
            flags[index] = flag;
        }
        numCompacted += __popc(includeFlags);
    }
    return numCompacted;
}

240
/**
241
242
243
244
245
 * Compare the bounding boxes for each pair of atom blocks (comprised of 32 atoms each), forming a tile. If the two
 * atom blocks are sufficiently far apart, mark them as non-interacting. There are two stages in the algorithm.
 *
 * STAGE 1:
 *
246
 * A coarse grained atom block against interacting atom block neighbour list is constructed. 
247
 *
248
249
250
251
 * Each warp first loads in some block X of interest. Each thread within the warp then loads 
 * in a different atom block Y. If Y has exclusions with X, then Y is not processed.  If the bounding boxes 
 * of the two atom blocks are within the cutoff distance, then the two atom blocks are considered to be
 * interacting and Y is added to the buffer for X.
252
253
254
 *
 * STAGE 2:
 *
255
 * A fine grained atom block against interacting atoms neighbour list is constructed.
256
 *
257
258
259
260
 * The warp loops over atom blocks Y that were found to (possibly) interact with atom block X.  Each thread
 * in the warp loops over the 32 atoms in X and compares their positions to one particular atom from block Y.
 * If it finds one closer than the cutoff distance, the atom is added to the list of atoms interacting with block X.
 * This continues until the buffer fills up, at which point the results are written to global memory.
261
262
263
264
265
266
 *
 * [in] periodicBoxSize        - size of the rectangular periodic box
 * [in] invPeriodicBoxSize     - inverse of the periodic box
 * [in] blockCenter            - the center of each bounding box
 * [in] blockBoundingBox       - bounding box of each atom block
 * [out] interactionCount      - total number of tiles that have interactions
267
 * [out] interactingTiles      - set of blocks that have interactions
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
 * [out] interactingAtoms      - a list of atoms that interact with each atom block
 * [in] posq                   - x,y,z coordinates of each atom and charge q
 * [in] maxTiles               - maximum number of tiles to process, used for multi-GPUs
 * [in] startBlockIndex        - first block to process, used for multi-GPUs,
 * [in] numBlocks              - total number of atom blocks
 * [in] sortedBlocks           - a sorted list of atom blocks based on volume
 * [in] sortedBlockCenter      - sorted centers, duplicated for fast access to avoid indexing
 * [in] sortedBlockBoundingBox - sorted bounding boxes, duplicated for fast access
 * [in] exclusionIndices       - maps into exclusionRowIndices with the starting position for a given atom
 * [in] exclusionRowIndices    - stores the a continuous list of exclusions
 *           eg: block 0 is excluded from atom 3,5,6
 *               block 1 is excluded from atom 3,4
 *               block 2 is excluded from atom 1,3,5,6
 *              exclusionIndices[0][3][5][8]
 *           exclusionRowIndices[3][5][6][3][4][1][3][5][6]
 *                         index 0  1  2  3  4  5  6  7  8 
 * [out] oldPos                - stores the positions of the atoms in which this neighbourlist was built on
 *                             - this is used to decide when to rebuild a neighbourlist
 * [in] rebuildNeighbourList   - whether or not to execute this kernel
 *
288
 */
289
extern "C" __global__ __launch_bounds__(GROUP_SIZE,3) void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
290
        unsigned int* __restrict__ interactionCount, int* __restrict__ interactingTiles, unsigned int* __restrict__ interactingAtoms,
291
        int2* __restrict__ singlePairs, const real4* __restrict__ posq, unsigned int maxTiles, unsigned int maxSinglePairs, unsigned int startBlockIndex,
292
        unsigned int numBlocks, unsigned int* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter, const half3* __restrict__ sortedBlockBoundingBox,
293
294
295
296
#ifdef USE_LARGE_BLOCKS
        real4* __restrict__ largeBlockCenter, half3* __restrict__ largeBlockBoundingBox,
#endif
        const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
297
        real4* __restrict__ oldPositions, const int* __restrict__ rebuildNeighborList) {
298

299
300
    if (rebuildNeighborList[0] == 0)
        return; // The neighbor list doesn't need to be rebuilt.
301
302
303
304
305
306
307

    const int indexInWarp = threadIdx.x%32;
    const int warpStart = threadIdx.x-indexInWarp;
    const int totalWarps = blockDim.x*gridDim.x/32;
    const int warpIndex = (blockIdx.x*blockDim.x+threadIdx.x)/32;
    const int warpMask = (1<<indexInWarp)-1;
    __shared__ int workgroupBuffer[BUFFER_SIZE*(GROUP_SIZE/32)];
308
    __shared__ int workgroupFlagsBuffer[BUFFER_SIZE*(GROUP_SIZE/32)];
309
    __shared__ int warpExclusions[MAX_EXCLUSIONS*(GROUP_SIZE/32)];
310
    __shared__ real4 posBuffer[GROUP_SIZE];
311
312
    __shared__ volatile unsigned int workgroupTileIndex[GROUP_SIZE/32];
    __shared__ unsigned int workgroupPairStartIndex[GROUP_SIZE/32];
peastman's avatar
peastman committed
313
    int* sumBuffer = (int*) posBuffer; // Reuse the same buffer to save memory
314
    int* buffer = workgroupBuffer+BUFFER_SIZE*(warpStart/32);
315
    int* flagsBuffer = workgroupFlagsBuffer+BUFFER_SIZE*(warpStart/32);
316
    int* exclusionsForX = warpExclusions+MAX_EXCLUSIONS*(warpStart/32);
317
318
    volatile unsigned int& tileStartIndex = workgroupTileIndex[warpStart/32];
    volatile unsigned int& pairStartIndex = workgroupPairStartIndex[warpStart/32];
319
320

    // Loop over blocks.
321
    
322
323
324
    for (int block1 = startBlockIndex+warpIndex; block1 < startBlockIndex+numBlocks; block1 += totalWarps) {
        // Load data for this block.  Note that all threads in a warp are processing the same block.
        
325
        int x = sortedBlocks[block1] & BLOCK_INDEX_MASK;
326
        real4 blockCenterX = sortedBlockCenter[block1];
327
        real3 blockSizeX = sortedBlockBoundingBox[block1].toReal3();
328
        int neighborsInBuffer = 0;
329
        real4 pos1 = posq[x*TILE_SIZE+indexInWarp];
330
331
332
333
334
335
336
337
#ifdef USE_PERIODIC
        const bool singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= PADDED_CUTOFF &&
                                         0.5f*periodicBoxSize.y-blockSizeX.y >= PADDED_CUTOFF &&
                                         0.5f*periodicBoxSize.z-blockSizeX.z >= PADDED_CUTOFF);
        if (singlePeriodicCopy) {
            // The box is small enough that we can just translate all the atoms into a single periodic
            // box, then skip having to apply periodic boundary conditions later.
            
338
            APPLY_PERIODIC_TO_POS_WITH_CENTER(pos1, blockCenterX)
339
340
        }
#endif
341
        pos1.w = 0.5f * (pos1.x * pos1.x + pos1.y * pos1.y + pos1.z * pos1.z);
342
        posBuffer[threadIdx.x] = pos1;
343

344
345
346
347
348
        // Load exclusion data for block x.
        
        const int exclusionStart = exclusionRowIndices[x];
        const int exclusionEnd = exclusionRowIndices[x+1];
        const int numExclusions = exclusionEnd-exclusionStart;
349
        #pragma unroll 4 // (MAX_EXCLUSIONS)
350
        for (int j = indexInWarp; j < numExclusions; j += 32)
351
            exclusionsForX[j] = exclusionIndices[exclusionStart+j];
352
353
        if (MAX_EXCLUSIONS > 32)
            __syncthreads();
354
        
355
356
357
        // Loop over atom blocks to search for neighbors.  The threads in a warp compare block1 against 32
        // other blocks in parallel.

358
359
360
361
#ifdef USE_LARGE_BLOCKS
        int largeBlockFlags = 0;
        int loadedLargeBlocks = 0;
#endif
362
        for (int block2Base = block1+1; block2Base < NUM_BLOCKS; block2Base += 32) {
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
#ifdef USE_LARGE_BLOCKS
            if (loadedLargeBlocks == 0) {
                // Check the next set of large blocks.

                int largeBlockIndex = block2Base + 32*indexInWarp;
                bool includeLargeBlock = false;
                if (largeBlockIndex < NUM_BLOCKS) {
                    real4 largeCenter = largeBlockCenter[largeBlockIndex];
                    real3 largeSize = largeBlockBoundingBox[largeBlockIndex].toReal3();
                    real4 blockDelta = blockCenterX-largeCenter;
#ifdef USE_PERIODIC
                    APPLY_PERIODIC_TO_DELTA(blockDelta)
#endif
                    blockDelta.x = max(0.0f, fabs(blockDelta.x)-blockSizeX.x-largeSize.x);
                    blockDelta.y = max(0.0f, fabs(blockDelta.y)-blockSizeX.y-largeSize.y);
                    blockDelta.z = max(0.0f, fabs(blockDelta.z)-blockSizeX.z-largeSize.z);
                    includeLargeBlock = (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < PADDED_CUTOFF_SQUARED);
                }
                largeBlockFlags = BALLOT(includeLargeBlock);
                loadedLargeBlocks = 32;
            }
            loadedLargeBlocks--;
            if ((largeBlockFlags&1) == 0) {
                // None of the next 32 blocks interact with block 1.

                largeBlockFlags >>= 1;
                continue;
            }
            largeBlockFlags >>= 1;
#endif
393
394
            int block2 = block2Base+indexInWarp;
            bool includeBlock2 = (block2 < NUM_BLOCKS);
395
            bool forceInclude = false;
396
            if (includeBlock2) {
397
                real4 blockCenterY = sortedBlockCenter[block2];
398
                real3 blockSizeY = sortedBlockBoundingBox[block2].toReal3();
399
                real4 blockDelta = blockCenterX-blockCenterY;
400
#ifdef USE_PERIODIC
401
                APPLY_PERIODIC_TO_DELTA(blockDelta)
402
#endif
403
                includeBlock2 &= (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < (PADDED_CUTOFF+blockCenterX.w+blockCenterY.w)*(PADDED_CUTOFF+blockCenterX.w+blockCenterY.w));
404
405
406
407
                blockDelta.x = max(0.0f, fabs(blockDelta.x)-blockSizeX.x-blockSizeY.x);
                blockDelta.y = max(0.0f, fabs(blockDelta.y)-blockSizeX.y-blockSizeY.y);
                blockDelta.z = max(0.0f, fabs(blockDelta.z)-blockSizeX.z-blockSizeY.z);
                includeBlock2 &= (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < PADDED_CUTOFF_SQUARED);
408
409
410
411
412
#ifdef TRICLINIC
                // The calculation to find the nearest periodic copy is only guaranteed to work if the nearest copy is less than half a box width away.
                // If there's any possibility we might have missed it, do a detailed check.

                if (periodicBoxSize.z/2-blockSizeX.z-blockSizeY.z < PADDED_CUTOFF || periodicBoxSize.y/2-blockSizeX.y-blockSizeY.y < PADDED_CUTOFF)
413
                    includeBlock2 = forceInclude = true;
414
#endif
415
                if (includeBlock2) {
416
                    int y = sortedBlocks[block2] & BLOCK_INDEX_MASK;
417
                    #pragma unroll 4 // (MAX_EXCLUSIONS)
418
419
420
421
422
423
424
                    for (int k = 0; k < numExclusions; k++)
                        includeBlock2 &= (exclusionsForX[k] != y);
                }
            }
            
            // Loop over any blocks we identified as potentially containing neighbors.
            
425
            int includeBlockFlags = BALLOT(includeBlock2);
Peter Eastman's avatar
Peter Eastman committed
426
            int forceIncludeFlags = BALLOT(forceInclude);
427
428
429
            while (includeBlockFlags != 0) {
                int i = __ffs(includeBlockFlags)-1;
                includeBlockFlags &= includeBlockFlags-1;
Peter Eastman's avatar
Peter Eastman committed
430
                forceInclude = (forceIncludeFlags>>i) & 1;
431
                int y = sortedBlocks[block2Base+i] & BLOCK_INDEX_MASK;
432

433
                // Check each atom in block Y for interactions.
434

435
                int atom2 = y*TILE_SIZE+indexInWarp;
436
                real4 pos2 = posq[atom2];
437
438
#ifdef USE_PERIODIC
                if (singlePeriodicCopy) {
439
                    APPLY_PERIODIC_TO_POS_WITH_CENTER(pos2, blockCenterX)
440
441
                }
#endif
442
443
                pos2.w = 0.5f * (pos2.x * pos2.x + pos2.y * pos2.y + pos2.z * pos2.z);

444
                real4 blockCenterY = sortedBlockCenter[block2Base+i];
445
                real3 atomDelta = trimTo3(posBuffer[warpStart+indexInWarp])-trimTo3(blockCenterY);
446
447
448
#ifdef USE_PERIODIC
                APPLY_PERIODIC_TO_DELTA(atomDelta)
#endif
449
                int atomFlags = BALLOT(forceInclude || atomDelta.x*atomDelta.x+atomDelta.y*atomDelta.y+atomDelta.z*atomDelta.z < (PADDED_CUTOFF+blockCenterY.w)*(PADDED_CUTOFF+blockCenterY.w));
450
                int interacts = 0;
451
                if (atom2 < NUM_ATOMS && atomFlags != 0) {
452
453
#ifdef USE_PERIODIC
                    if (!singlePeriodicCopy) {
454
455
                        int first = __ffs(atomFlags)-1;
                        int last = 32-__clz(atomFlags);
456
                        for (int j = first; j < last; j++) {
457
                            real3 delta = trimTo3(pos2)-trimTo3(posBuffer[warpStart+j]);
458
                            APPLY_PERIODIC_TO_DELTA(delta)
459
                            interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED ? 1<<j : 0);
460
461
462
463
                        }
                    }
                    else {
#endif
464
465
466
467
468
                        #pragma unroll
                        for (int j = 0; j < 32; j++) {
                            real4 posj = posBuffer[warpStart+j];
                            real halfDist2 = posj.w + pos2.w - posj.x*pos2.x - posj.y*pos2.y - posj.z*pos2.z;
                            interacts |= (halfDist2 < 0.5f * PADDED_CUTOFF_SQUARED ? 1<<j : 0);
469
470
471
472
473
474
475
476
                        }
#ifdef USE_PERIODIC
                    }
#endif
                }
                
                // Add any interacting atoms to the buffer.
                
477
                int includeAtomFlags = BALLOT(interacts);
478
479
480
481
482
                if (interacts) {
                    int index = neighborsInBuffer+__popc(includeAtomFlags&warpMask);
                    buffer[index] = atom2;
                    flagsBuffer[index] = interacts;
                }
483
484
485
486
                neighborsInBuffer += __popc(includeAtomFlags);
                if (neighborsInBuffer > BUFFER_SIZE-TILE_SIZE) {
                    // Store the new tiles to memory.
                    
487
488
489
#if MAX_BITS_FOR_PAIRS > 0
                    neighborsInBuffer = saveSinglePairs(x, buffer, flagsBuffer, neighborsInBuffer, maxSinglePairs, &interactionCount[1], singlePairs, sumBuffer+warpStart, pairStartIndex);
#endif
490
                    unsigned int tilesToStore = neighborsInBuffer/TILE_SIZE;
491
492
                    if (tilesToStore > 0) {
                        if (indexInWarp == 0)
493
                            tileStartIndex = atomicAdd(&interactionCount[0], tilesToStore);
494
                        unsigned int newTileStartIndex = tileStartIndex;
495
496
497
                        if (newTileStartIndex+tilesToStore <= maxTiles) {
                            if (indexInWarp < tilesToStore)
                                interactingTiles[newTileStartIndex+indexInWarp] = x;
498
                            #pragma unroll 8 // (GROUP_SIZE / TILE_SIZE)
499
500
501
                            for (int j = 0; j < tilesToStore; j++)
                                interactingAtoms[(newTileStartIndex+j)*TILE_SIZE+indexInWarp] = buffer[indexInWarp+j*TILE_SIZE];
                        }
502
503
                        if (indexInWarp+TILE_SIZE*tilesToStore < BUFFER_SIZE)
                            buffer[indexInWarp] = buffer[indexInWarp+TILE_SIZE*tilesToStore];
504
                        neighborsInBuffer -= TILE_SIZE*tilesToStore;
505
506
                    }
                }
507
            }
508
509
510
511
        }
        
        // If we have a partially filled buffer,  store it to memory.
        
512
#if MAX_BITS_FOR_PAIRS > 0
513
        if (neighborsInBuffer > 32)
514
515
            neighborsInBuffer = saveSinglePairs(x, buffer, flagsBuffer, neighborsInBuffer, maxSinglePairs, &interactionCount[1], singlePairs, sumBuffer+warpStart, pairStartIndex);
#endif
516
        if (neighborsInBuffer > 0) {
517
            unsigned int tilesToStore = (neighborsInBuffer+TILE_SIZE-1)/TILE_SIZE;
518
            if (indexInWarp == 0)
519
                tileStartIndex = atomicAdd(&interactionCount[0], tilesToStore);
520
            unsigned int newTileStartIndex = tileStartIndex;
521
            if (newTileStartIndex+tilesToStore <= maxTiles) {
522
523
                if (indexInWarp < tilesToStore)
                    interactingTiles[newTileStartIndex+indexInWarp] = x;
524
                #pragma unroll 8 // (GROUP_SIZE / TILE_SIZE)
525
526
                for (int j = 0; j < tilesToStore; j++)
                    interactingAtoms[(newTileStartIndex+j)*TILE_SIZE+indexInWarp] = (indexInWarp+j*TILE_SIZE < neighborsInBuffer ? buffer[indexInWarp+j*TILE_SIZE] : NUM_ATOMS);
527
528
529
            }
        }
    }
530
531
532
533
534
    
    // Record the positions the neighbor list is based on.
    
    for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x)
        oldPositions[i] = posq[i];
535
}