findInteractingBlocks.cu 26.1 KB
Newer Older
1
#define GROUP_SIZE 256
2
#define BUFFER_SIZE 256
3

4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
/**
 * To use half precision, we're supposed to include cuda_fp16.h.  Unfortunately,
 * it isn't included in the search path automatically, and there's no reliable
 * way to find where it's located on disk.  Instead we provide our own definitions
 * for the few symbols we need.
 */
struct __align__(2) __half {
    unsigned short x;
};
__device__ __half __float2half_ru(const float f) {
    __half h;
    asm("{cvt.rp.f16.f32 %0, %1;}" : "=h"(*reinterpret_cast<unsigned short *>(&h)) : "f"(f));
    return h;
}
__device__ float __half2float(const __half h) {
    float f;
    asm("{cvt.f32.f16 %0, %1;}" : "=f"(f) : "h"(*reinterpret_cast<const unsigned short *>(&h)));
    return f;
}
struct half3 {
    __device__ half3(real3 f) {
        // Round up so we'll err on the side of making the box a little too large.
        // This ensures interactions will never be missed.
        v[0] = __float2half_ru((float) f.x);
        v[1] = __float2half_ru((float) f.y);
        v[2] = __float2half_ru((float) f.z);
    }
    __device__ real3 toReal3() const {
        return make_real3(__half2float(v[0]), __half2float(v[1]), __half2float(v[2]));
    }
private:
    __half v[3];
};

38
39
40
/**
 * Find a bounding box for the atoms in each block.
 */
41
42
extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        const real4* __restrict__ posq, real4* __restrict__ blockCenter, real4* __restrict__ blockBoundingBox, int* __restrict__ rebuildNeighborList,
43
        real2* __restrict__ blockSizeRange) {
44
45
    int index = blockIdx.x*blockDim.x+threadIdx.x;
    int base = index*TILE_SIZE;
46
    real minSize = 1e38, maxSize = 0;
47
48
49
    while (base < numAtoms) {
        real4 pos = posq[base];
#ifdef USE_PERIODIC
50
        APPLY_PERIODIC_TO_POS(pos)
51
52
53
54
55
56
57
#endif
        real4 minPos = pos;
        real4 maxPos = pos;
        int last = min(base+TILE_SIZE, numAtoms);
        for (int i = base+1; i < last; i++) {
            pos = posq[i];
#ifdef USE_PERIODIC
58
            real4 center = 0.5f*(maxPos+minPos);
59
            APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)
60
61
62
63
#endif
            minPos = make_real4(min(minPos.x,pos.x), min(minPos.y,pos.y), min(minPos.z,pos.z), 0);
            maxPos = make_real4(max(maxPos.x,pos.x), max(maxPos.y,pos.y), max(maxPos.z,pos.z), 0);
        }
64
        real4 blockSize = 0.5f*(maxPos-minPos);
Peter Eastman's avatar
Peter Eastman committed
65
        real4 center = 0.5f*(maxPos+minPos);
66
        center.w = 0;
67
        for (int i = base; i < last; i++) {
Peter Eastman's avatar
Peter Eastman committed
68
69
70
71
72
            pos = posq[i];
            real4 delta = posq[i]-center;
#ifdef USE_PERIODIC
            APPLY_PERIODIC_TO_DELTA(delta)
#endif
73
            center.w = max(center.w, delta.x*delta.x+delta.y*delta.y+delta.z*delta.z);
Peter Eastman's avatar
Peter Eastman committed
74
        }
75
        center.w = sqrt(center.w);
76
        blockBoundingBox[index] = blockSize;
Peter Eastman's avatar
Peter Eastman committed
77
        blockCenter[index] = center;
78
79
80
        real totalSize = blockSize.x+blockSize.y+blockSize.z;
        minSize = min(minSize, totalSize);
        maxSize = max(maxSize, totalSize);
81
82
83
        index += blockDim.x*gridDim.x;
        base = index*TILE_SIZE;
    }
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
    
    // Record the range of sizes seen by threads in this block.

    __shared__ real minBuffer[64], maxBuffer[64];
    minBuffer[threadIdx.x] = minSize;
    maxBuffer[threadIdx.x] = maxSize;
    __syncthreads();
    for (int step = 1; step < 64; step *= 2) {
        if (threadIdx.x+step < 64 && threadIdx.x%(2*step) == 0) {
            minBuffer[threadIdx.x] = min(minBuffer[threadIdx.x], minBuffer[threadIdx.x+step]);
            maxBuffer[threadIdx.x] = max(maxBuffer[threadIdx.x], maxBuffer[threadIdx.x+step]);
        }
        __syncthreads();
    }
    if (threadIdx.x == 0)
        blockSizeRange[blockIdx.x] = make_real2(minBuffer[0], maxBuffer[0]);
100
    if (blockIdx.x == 0 && threadIdx.x == 0)
101
        rebuildNeighborList[0] = 0;
102
103
}

104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
extern "C" __global__ void computeSortKeys(const real4* __restrict__ blockBoundingBox, unsigned int* __restrict__ sortedBlocks, real2* __restrict__ blockSizeRange, int numSizes) {
    // Find the total range of sizes recorded by all blocks.

    __shared__ real2 sizeRange;
    if (threadIdx.x == 0) {
        sizeRange = blockSizeRange[0];
        for (int i = 1; i < numSizes; i++) {
            real2 size = blockSizeRange[i];
            sizeRange.x = min(sizeRange.x, size.x);
            sizeRange.y = max(sizeRange.y, size.y);
        }
        sizeRange.x = LOG(sizeRange.x);
        sizeRange.y = LOG(sizeRange.y);
    }
    __syncthreads();

    // Sort keys store the bin in the high order part and the block in the low
    // order part.

    int numSizeBins = 20;
    real scale = numSizeBins/(sizeRange.y-sizeRange.x);
    for (unsigned int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_BLOCKS; i += blockDim.x*gridDim.x) {
        real4 box = blockBoundingBox[i];
        real size = LOG(box.x+box.y+box.z);
        int bin = (size-sizeRange.x)*scale;
        bin = max(0, min(bin, numSizeBins-1));
        sortedBlocks[i] = (((unsigned int) bin)<<BIN_SHIFT) + i;
    }
}

134
/**
135
 * Sort the data about bounding boxes so it can be accessed more efficiently in the next kernel.
136
 */
137
extern "C" __global__ void sortBoxData(const unsigned int* __restrict__ sortedBlocks, const real4* __restrict__ blockCenter,
138
139
140
141
142
143
        const real4* __restrict__ blockBoundingBox, real4* __restrict__ sortedBlockCenter, half3* __restrict__ sortedBlockBoundingBox,
#ifdef USE_LARGE_BLOCKS
        real4* __restrict__ largeBlockCenter, half3* __restrict__ largeBlockBoundingBox, real4 periodicBoxSize,
        real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
#endif
        const real4* __restrict__ posq, const real4* __restrict__ oldPositions,
144
        unsigned int* __restrict__ interactionCount, int* __restrict__ rebuildNeighborList, bool forceRebuild) {
145
    for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_BLOCKS; i += blockDim.x*gridDim.x) {
146
        unsigned int index = sortedBlocks[i] & BLOCK_INDEX_MASK;
147
        sortedBlockCenter[i] = blockCenter[index];
148
        sortedBlockBoundingBox[i] = half3(trimTo3(blockBoundingBox[index]));
149

150
#ifdef USE_LARGE_BLOCKS
151
        // Compute the sizes of large blocks (composed of 32 regular blocks) starting from each block.
152
    
153
154
        real4 minPos = blockCenter[index]-blockBoundingBox[index];
        real4 maxPos = blockCenter[index]+blockBoundingBox[index];
155
156
        int last = min(i+32, NUM_BLOCKS);
        for (int j = i+1; j < last; j++) {
157
158
159
            unsigned int index2 = sortedBlocks[j] & BLOCK_INDEX_MASK;
            real4 blockPos = blockCenter[index2];
            real4 width = blockBoundingBox[index2];
160
161
162
163
164
165
166
167
168
169
#ifdef USE_PERIODIC
            real4 center = 0.5f*(maxPos+minPos);
            APPLY_PERIODIC_TO_POS_WITH_CENTER(blockPos, center)
#endif
            minPos = make_real4(min(minPos.x, blockPos.x-width.x), min(minPos.y, blockPos.y-width.y), min(minPos.z, blockPos.z-width.z), 0);
            maxPos = make_real4(max(maxPos.x, blockPos.x+width.x), max(maxPos.y, blockPos.y+width.y), max(maxPos.z, blockPos.z+width.z), 0);
        }
        largeBlockCenter[i] = 0.5f*(maxPos+minPos);
        largeBlockBoundingBox[i] = half3(trimTo3(0.5f*(maxPos-minPos)));
#endif
170
171
    }

172
173
    // Also check whether any atom has moved enough so that we really need to rebuild the neighbor list.

174
    bool rebuild = forceRebuild;
175
176
177
178
179
180
181
182
    for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
        real4 delta = oldPositions[i]-posq[i];
        if (delta.x*delta.x + delta.y*delta.y + delta.z*delta.z > 0.25f*PADDING*PADDING)
            rebuild = true;
    }
    if (rebuild) {
        rebuildNeighborList[0] = 1;
        interactionCount[0] = 0;
183
        interactionCount[1] = 0;
184
185
    }
}
186

187
__device__ int saveSinglePairs(int x, int* atoms, int* flags, int length, unsigned int maxSinglePairs, unsigned int* singlePairCount, int2* singlePairs, int* sumBuffer, volatile unsigned int& pairStartIndex) {
188
189
    // Record interactions that should be computed as single pairs rather than in blocks.
    
190
191
    const int indexInWarp = threadIdx.x%32;
    int sum = 0;
192
    #pragma unroll 8 // (GROUP_SIZE / TILE_SIZE)
193
194
    for (int i = indexInWarp; i < length; i += 32) {
        int count = __popc(flags[i]);
195
        sum += (count <= MAX_BITS_FOR_PAIRS ? count : 0);
196
    }
peastman's avatar
peastman committed
197
198
199
200
    for (int i = 1; i < 32; i *= 2) {
        int n = __shfl_up_sync(0xffffffff, sum, i);
        if (indexInWarp >= i)
            sum += n;
201
    }
peastman's avatar
peastman committed
202
203
204
205
    if (indexInWarp == 31)
        pairStartIndex = atomicAdd(singlePairCount,(unsigned int) sum);
    __syncwarp();
    int prevSum = __shfl_up_sync(0xffffffff, sum, 1);
206
    unsigned int pairIndex = pairStartIndex + (indexInWarp > 0 ? prevSum : 0);
207
208
    for (int i = indexInWarp; i < length; i += 32) {
        int count = __popc(flags[i]);
209
        if (count <= MAX_BITS_FOR_PAIRS && pairIndex+count <= maxSinglePairs) {
210
211
212
213
214
215
216
217
218
            int f = flags[i];
            while (f != 0) {
                singlePairs[pairIndex] = make_int2(atoms[i], x*TILE_SIZE+__ffs(f)-1);
                f &= f-1;
                pairIndex++;
            }
        }
    }
    
219
    // Compact the remaining interactions.
220
221
222
    
    const int warpMask = (1<<indexInWarp)-1;
    int numCompacted = 0;
223
224
    for (int start = 0; start < length; start += 32) {
        int i = start+indexInWarp;
225
226
        int atom = atoms[i];
        int flag = flags[i];
227
        bool include = (i < length && __popc(flags[i]) > MAX_BITS_FOR_PAIRS);
228
        int includeFlags = BALLOT(include);
229
230
231
232
233
234
235
236
237
238
        if (include) {
            int index = numCompacted+__popc(includeFlags&warpMask);
            atoms[index] = atom;
            flags[index] = flag;
        }
        numCompacted += __popc(includeFlags);
    }
    return numCompacted;
}

239
/**
240
241
242
243
244
 * Compare the bounding boxes for each pair of atom blocks (comprised of 32 atoms each), forming a tile. If the two
 * atom blocks are sufficiently far apart, mark them as non-interacting. There are two stages in the algorithm.
 *
 * STAGE 1:
 *
245
 * A coarse grained atom block against interacting atom block neighbour list is constructed. 
246
 *
247
248
249
250
 * Each warp first loads in some block X of interest. Each thread within the warp then loads 
 * in a different atom block Y. If Y has exclusions with X, then Y is not processed.  If the bounding boxes 
 * of the two atom blocks are within the cutoff distance, then the two atom blocks are considered to be
 * interacting and Y is added to the buffer for X.
251
252
253
 *
 * STAGE 2:
 *
254
 * A fine grained atom block against interacting atoms neighbour list is constructed.
255
 *
256
257
258
259
 * The warp loops over atom blocks Y that were found to (possibly) interact with atom block X.  Each thread
 * in the warp loops over the 32 atoms in X and compares their positions to one particular atom from block Y.
 * If it finds one closer than the cutoff distance, the atom is added to the list of atoms interacting with block X.
 * This continues until the buffer fills up, at which point the results are written to global memory.
260
261
262
263
264
265
 *
 * [in] periodicBoxSize        - size of the rectangular periodic box
 * [in] invPeriodicBoxSize     - inverse of the periodic box
 * [in] blockCenter            - the center of each bounding box
 * [in] blockBoundingBox       - bounding box of each atom block
 * [out] interactionCount      - total number of tiles that have interactions
266
 * [out] interactingTiles      - set of blocks that have interactions
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
 * [out] interactingAtoms      - a list of atoms that interact with each atom block
 * [in] posq                   - x,y,z coordinates of each atom and charge q
 * [in] maxTiles               - maximum number of tiles to process, used for multi-GPUs
 * [in] startBlockIndex        - first block to process, used for multi-GPUs,
 * [in] numBlocks              - total number of atom blocks
 * [in] sortedBlocks           - a sorted list of atom blocks based on volume
 * [in] sortedBlockCenter      - sorted centers, duplicated for fast access to avoid indexing
 * [in] sortedBlockBoundingBox - sorted bounding boxes, duplicated for fast access
 * [in] exclusionIndices       - maps into exclusionRowIndices with the starting position for a given atom
 * [in] exclusionRowIndices    - stores the a continuous list of exclusions
 *           eg: block 0 is excluded from atom 3,5,6
 *               block 1 is excluded from atom 3,4
 *               block 2 is excluded from atom 1,3,5,6
 *              exclusionIndices[0][3][5][8]
 *           exclusionRowIndices[3][5][6][3][4][1][3][5][6]
 *                         index 0  1  2  3  4  5  6  7  8 
 * [out] oldPos                - stores the positions of the atoms in which this neighbourlist was built on
 *                             - this is used to decide when to rebuild a neighbourlist
 * [in] rebuildNeighbourList   - whether or not to execute this kernel
 *
287
 */
288
extern "C" __global__ __launch_bounds__(GROUP_SIZE,3) void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
289
        unsigned int* __restrict__ interactionCount, int* __restrict__ interactingTiles, unsigned int* __restrict__ interactingAtoms,
290
        int2* __restrict__ singlePairs, const real4* __restrict__ posq, unsigned int maxTiles, unsigned int maxSinglePairs, unsigned int startBlockIndex,
291
        unsigned int numBlocks, unsigned int* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter, const half3* __restrict__ sortedBlockBoundingBox,
292
293
294
295
#ifdef USE_LARGE_BLOCKS
        real4* __restrict__ largeBlockCenter, half3* __restrict__ largeBlockBoundingBox,
#endif
        const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
296
        real4* __restrict__ oldPositions, const int* __restrict__ rebuildNeighborList) {
297

298
299
    if (rebuildNeighborList[0] == 0)
        return; // The neighbor list doesn't need to be rebuilt.
300
301
302
303
304
305
306

    const int indexInWarp = threadIdx.x%32;
    const int warpStart = threadIdx.x-indexInWarp;
    const int totalWarps = blockDim.x*gridDim.x/32;
    const int warpIndex = (blockIdx.x*blockDim.x+threadIdx.x)/32;
    const int warpMask = (1<<indexInWarp)-1;
    __shared__ int workgroupBuffer[BUFFER_SIZE*(GROUP_SIZE/32)];
307
    __shared__ int workgroupFlagsBuffer[BUFFER_SIZE*(GROUP_SIZE/32)];
308
    __shared__ int warpExclusions[MAX_EXCLUSIONS*(GROUP_SIZE/32)];
309
    __shared__ real4 posBuffer[GROUP_SIZE];
310
311
    __shared__ volatile unsigned int workgroupTileIndex[GROUP_SIZE/32];
    __shared__ unsigned int workgroupPairStartIndex[GROUP_SIZE/32];
peastman's avatar
peastman committed
312
    int* sumBuffer = (int*) posBuffer; // Reuse the same buffer to save memory
313
    int* buffer = workgroupBuffer+BUFFER_SIZE*(warpStart/32);
314
    int* flagsBuffer = workgroupFlagsBuffer+BUFFER_SIZE*(warpStart/32);
315
    int* exclusionsForX = warpExclusions+MAX_EXCLUSIONS*(warpStart/32);
316
317
    volatile unsigned int& tileStartIndex = workgroupTileIndex[warpStart/32];
    volatile unsigned int& pairStartIndex = workgroupPairStartIndex[warpStart/32];
318
319

    // Loop over blocks.
320
    
321
322
323
    for (int block1 = startBlockIndex+warpIndex; block1 < startBlockIndex+numBlocks; block1 += totalWarps) {
        // Load data for this block.  Note that all threads in a warp are processing the same block.
        
324
        int x = sortedBlocks[block1] & BLOCK_INDEX_MASK;
325
        real4 blockCenterX = sortedBlockCenter[block1];
326
        real3 blockSizeX = sortedBlockBoundingBox[block1].toReal3();
327
        int neighborsInBuffer = 0;
328
        real4 pos1 = posq[x*TILE_SIZE+indexInWarp];
329
330
331
332
333
334
335
336
#ifdef USE_PERIODIC
        const bool singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= PADDED_CUTOFF &&
                                         0.5f*periodicBoxSize.y-blockSizeX.y >= PADDED_CUTOFF &&
                                         0.5f*periodicBoxSize.z-blockSizeX.z >= PADDED_CUTOFF);
        if (singlePeriodicCopy) {
            // The box is small enough that we can just translate all the atoms into a single periodic
            // box, then skip having to apply periodic boundary conditions later.
            
337
            APPLY_PERIODIC_TO_POS_WITH_CENTER(pos1, blockCenterX)
338
339
        }
#endif
340
        pos1.w = 0.5f * (pos1.x * pos1.x + pos1.y * pos1.y + pos1.z * pos1.z);
341
        posBuffer[threadIdx.x] = pos1;
342

343
344
345
346
347
        // Load exclusion data for block x.
        
        const int exclusionStart = exclusionRowIndices[x];
        const int exclusionEnd = exclusionRowIndices[x+1];
        const int numExclusions = exclusionEnd-exclusionStart;
348
        #pragma unroll 4 // (MAX_EXCLUSIONS)
349
        for (int j = indexInWarp; j < numExclusions; j += 32)
350
            exclusionsForX[j] = exclusionIndices[exclusionStart+j];
351
352
        if (MAX_EXCLUSIONS > 32)
            __syncthreads();
353
        
354
355
356
        // Loop over atom blocks to search for neighbors.  The threads in a warp compare block1 against 32
        // other blocks in parallel.

357
358
359
360
#ifdef USE_LARGE_BLOCKS
        int largeBlockFlags = 0;
        int loadedLargeBlocks = 0;
#endif
361
        for (int block2Base = block1+1; block2Base < NUM_BLOCKS; block2Base += 32) {
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
#ifdef USE_LARGE_BLOCKS
            if (loadedLargeBlocks == 0) {
                // Check the next set of large blocks.

                int largeBlockIndex = block2Base + 32*indexInWarp;
                bool includeLargeBlock = false;
                if (largeBlockIndex < NUM_BLOCKS) {
                    real4 largeCenter = largeBlockCenter[largeBlockIndex];
                    real3 largeSize = largeBlockBoundingBox[largeBlockIndex].toReal3();
                    real4 blockDelta = blockCenterX-largeCenter;
#ifdef USE_PERIODIC
                    APPLY_PERIODIC_TO_DELTA(blockDelta)
#endif
                    blockDelta.x = max(0.0f, fabs(blockDelta.x)-blockSizeX.x-largeSize.x);
                    blockDelta.y = max(0.0f, fabs(blockDelta.y)-blockSizeX.y-largeSize.y);
                    blockDelta.z = max(0.0f, fabs(blockDelta.z)-blockSizeX.z-largeSize.z);
                    includeLargeBlock = (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < PADDED_CUTOFF_SQUARED);
379
380
381
382
383
384
385
#ifdef TRICLINIC
                    // The calculation to find the nearest periodic copy is only guaranteed to work if the nearest copy is less than half a box width away.
                    // If there's any possibility we might have missed it, do a detailed check.

                    if (periodicBoxSize.z/2-blockSizeX.z-largeSize.z < PADDED_CUTOFF || periodicBoxSize.y/2-blockSizeX.y-largeSize.y < PADDED_CUTOFF)
                        includeLargeBlock = true;
#endif
386
387
388
389
390
391
392
393
394
395
396
397
398
                }
                largeBlockFlags = BALLOT(includeLargeBlock);
                loadedLargeBlocks = 32;
            }
            loadedLargeBlocks--;
            if ((largeBlockFlags&1) == 0) {
                // None of the next 32 blocks interact with block 1.

                largeBlockFlags >>= 1;
                continue;
            }
            largeBlockFlags >>= 1;
#endif
399
400
            int block2 = block2Base+indexInWarp;
            bool includeBlock2 = (block2 < NUM_BLOCKS);
401
            bool forceInclude = false;
402
            if (includeBlock2) {
403
                real4 blockCenterY = sortedBlockCenter[block2];
404
                real3 blockSizeY = sortedBlockBoundingBox[block2].toReal3();
405
                real4 blockDelta = blockCenterX-blockCenterY;
406
#ifdef USE_PERIODIC
407
                APPLY_PERIODIC_TO_DELTA(blockDelta)
408
#endif
409
                includeBlock2 &= (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < (PADDED_CUTOFF+blockCenterX.w+blockCenterY.w)*(PADDED_CUTOFF+blockCenterX.w+blockCenterY.w));
410
411
412
413
                blockDelta.x = max(0.0f, fabs(blockDelta.x)-blockSizeX.x-blockSizeY.x);
                blockDelta.y = max(0.0f, fabs(blockDelta.y)-blockSizeX.y-blockSizeY.y);
                blockDelta.z = max(0.0f, fabs(blockDelta.z)-blockSizeX.z-blockSizeY.z);
                includeBlock2 &= (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < PADDED_CUTOFF_SQUARED);
414
415
416
417
418
#ifdef TRICLINIC
                // The calculation to find the nearest periodic copy is only guaranteed to work if the nearest copy is less than half a box width away.
                // If there's any possibility we might have missed it, do a detailed check.

                if (periodicBoxSize.z/2-blockSizeX.z-blockSizeY.z < PADDED_CUTOFF || periodicBoxSize.y/2-blockSizeX.y-blockSizeY.y < PADDED_CUTOFF)
419
                    includeBlock2 = forceInclude = true;
420
#endif
421
                if (includeBlock2) {
422
                    int y = sortedBlocks[block2] & BLOCK_INDEX_MASK;
423
                    #pragma unroll 4 // (MAX_EXCLUSIONS)
424
425
426
427
428
429
430
                    for (int k = 0; k < numExclusions; k++)
                        includeBlock2 &= (exclusionsForX[k] != y);
                }
            }
            
            // Loop over any blocks we identified as potentially containing neighbors.
            
431
            int includeBlockFlags = BALLOT(includeBlock2);
Peter Eastman's avatar
Peter Eastman committed
432
            int forceIncludeFlags = BALLOT(forceInclude);
433
434
435
            while (includeBlockFlags != 0) {
                int i = __ffs(includeBlockFlags)-1;
                includeBlockFlags &= includeBlockFlags-1;
Peter Eastman's avatar
Peter Eastman committed
436
                forceInclude = (forceIncludeFlags>>i) & 1;
437
                int y = sortedBlocks[block2Base+i] & BLOCK_INDEX_MASK;
438

439
                // Check each atom in block Y for interactions.
440

441
                int atom2 = y*TILE_SIZE+indexInWarp;
442
                real4 pos2 = posq[atom2];
443
444
#ifdef USE_PERIODIC
                if (singlePeriodicCopy) {
445
                    APPLY_PERIODIC_TO_POS_WITH_CENTER(pos2, blockCenterX)
446
447
                }
#endif
448
449
                pos2.w = 0.5f * (pos2.x * pos2.x + pos2.y * pos2.y + pos2.z * pos2.z);

450
                real4 blockCenterY = sortedBlockCenter[block2Base+i];
451
                real3 atomDelta = trimTo3(posBuffer[warpStart+indexInWarp])-trimTo3(blockCenterY);
452
453
454
#ifdef USE_PERIODIC
                APPLY_PERIODIC_TO_DELTA(atomDelta)
#endif
455
                int atomFlags = BALLOT(forceInclude || atomDelta.x*atomDelta.x+atomDelta.y*atomDelta.y+atomDelta.z*atomDelta.z < (PADDED_CUTOFF+blockCenterY.w)*(PADDED_CUTOFF+blockCenterY.w));
456
                int interacts = 0;
457
                if (atom2 < NUM_ATOMS && atomFlags != 0) {
458
459
#ifdef USE_PERIODIC
                    if (!singlePeriodicCopy) {
460
461
                        int first = __ffs(atomFlags)-1;
                        int last = 32-__clz(atomFlags);
462
                        for (int j = first; j < last; j++) {
463
                            real3 delta = trimTo3(pos2)-trimTo3(posBuffer[warpStart+j]);
464
                            APPLY_PERIODIC_TO_DELTA(delta)
465
                            interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED ? 1<<j : 0);
466
467
468
469
                        }
                    }
                    else {
#endif
470
471
472
473
474
                        #pragma unroll
                        for (int j = 0; j < 32; j++) {
                            real4 posj = posBuffer[warpStart+j];
                            real halfDist2 = posj.w + pos2.w - posj.x*pos2.x - posj.y*pos2.y - posj.z*pos2.z;
                            interacts |= (halfDist2 < 0.5f * PADDED_CUTOFF_SQUARED ? 1<<j : 0);
475
476
477
478
479
480
481
482
                        }
#ifdef USE_PERIODIC
                    }
#endif
                }
                
                // Add any interacting atoms to the buffer.
                
483
                int includeAtomFlags = BALLOT(interacts);
484
485
486
487
488
                if (interacts) {
                    int index = neighborsInBuffer+__popc(includeAtomFlags&warpMask);
                    buffer[index] = atom2;
                    flagsBuffer[index] = interacts;
                }
489
490
491
492
                neighborsInBuffer += __popc(includeAtomFlags);
                if (neighborsInBuffer > BUFFER_SIZE-TILE_SIZE) {
                    // Store the new tiles to memory.
                    
493
494
495
#if MAX_BITS_FOR_PAIRS > 0
                    neighborsInBuffer = saveSinglePairs(x, buffer, flagsBuffer, neighborsInBuffer, maxSinglePairs, &interactionCount[1], singlePairs, sumBuffer+warpStart, pairStartIndex);
#endif
496
                    unsigned int tilesToStore = neighborsInBuffer/TILE_SIZE;
497
498
                    if (tilesToStore > 0) {
                        if (indexInWarp == 0)
499
                            tileStartIndex = atomicAdd(&interactionCount[0], tilesToStore);
500
                        unsigned int newTileStartIndex = tileStartIndex;
501
502
503
                        if (newTileStartIndex+tilesToStore <= maxTiles) {
                            if (indexInWarp < tilesToStore)
                                interactingTiles[newTileStartIndex+indexInWarp] = x;
504
                            #pragma unroll 8 // (GROUP_SIZE / TILE_SIZE)
505
506
507
                            for (int j = 0; j < tilesToStore; j++)
                                interactingAtoms[(newTileStartIndex+j)*TILE_SIZE+indexInWarp] = buffer[indexInWarp+j*TILE_SIZE];
                        }
508
509
                        if (indexInWarp+TILE_SIZE*tilesToStore < BUFFER_SIZE)
                            buffer[indexInWarp] = buffer[indexInWarp+TILE_SIZE*tilesToStore];
510
                        neighborsInBuffer -= TILE_SIZE*tilesToStore;
511
512
                    }
                }
513
            }
514
515
516
517
        }
        
        // If we have a partially filled buffer,  store it to memory.
        
518
#if MAX_BITS_FOR_PAIRS > 0
519
        if (neighborsInBuffer > 32)
520
521
            neighborsInBuffer = saveSinglePairs(x, buffer, flagsBuffer, neighborsInBuffer, maxSinglePairs, &interactionCount[1], singlePairs, sumBuffer+warpStart, pairStartIndex);
#endif
522
        if (neighborsInBuffer > 0) {
523
            unsigned int tilesToStore = (neighborsInBuffer+TILE_SIZE-1)/TILE_SIZE;
524
            if (indexInWarp == 0)
525
                tileStartIndex = atomicAdd(&interactionCount[0], tilesToStore);
526
            unsigned int newTileStartIndex = tileStartIndex;
527
            if (newTileStartIndex+tilesToStore <= maxTiles) {
528
529
                if (indexInWarp < tilesToStore)
                    interactingTiles[newTileStartIndex+indexInWarp] = x;
530
                #pragma unroll 8 // (GROUP_SIZE / TILE_SIZE)
531
532
                for (int j = 0; j < tilesToStore; j++)
                    interactingAtoms[(newTileStartIndex+j)*TILE_SIZE+indexInWarp] = (indexInWarp+j*TILE_SIZE < neighborsInBuffer ? buffer[indexInWarp+j*TILE_SIZE] : NUM_ATOMS);
533
534
535
            }
        }
    }
536
537
538
539
540
    
    // Record the positions the neighbor list is based on.
    
    for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x)
        oldPositions[i] = posq[i];
541
}