findInteractingBlocks.cu 26.1 KB
Newer Older
1
#define GROUP_SIZE 256
2
#define BUFFER_SIZE 256
3

4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
/**
 * To use half precision, we're supposed to include cuda_fp16.h.  Unfortunately,
 * it isn't included in the search path automatically, and there's no reliable
 * way to find where it's located on disk.  Instead we provide our own definitions
 * for the few symbols we need.
 */
struct __align__(2) __half {
    unsigned short x;
};
__device__ __half __float2half_ru(const float f) {
    __half h;
    asm("{cvt.rp.f16.f32 %0, %1;}" : "=h"(*reinterpret_cast<unsigned short *>(&h)) : "f"(f));
    return h;
}
__device__ float __half2float(const __half h) {
    float f;
    asm("{cvt.f32.f16 %0, %1;}" : "=f"(f) : "h"(*reinterpret_cast<const unsigned short *>(&h)));
    return f;
}
struct half3 {
    __device__ half3(real3 f) {
        // Round up so we'll err on the side of making the box a little too large.
        // This ensures interactions will never be missed.
        v[0] = __float2half_ru((float) f.x);
        v[1] = __float2half_ru((float) f.y);
        v[2] = __float2half_ru((float) f.z);
    }
    __device__ real3 toReal3() const {
        return make_real3(__half2float(v[0]), __half2float(v[1]), __half2float(v[2]));
    }
private:
    __half v[3];
};

38
39
40
/**
 * Find a bounding box for the atoms in each block.
 */
41
42
extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        const real4* __restrict__ posq, real4* __restrict__ blockCenter, real4* __restrict__ blockBoundingBox, int* __restrict__ rebuildNeighborList,
43
        real2* __restrict__ blockSizeRange) {
44
45
    int index = blockIdx.x*blockDim.x+threadIdx.x;
    int base = index*TILE_SIZE;
46
    real minSize = 1e38, maxSize = 0;
47
48
49
    while (base < numAtoms) {
        real4 pos = posq[base];
#ifdef USE_PERIODIC
50
        APPLY_PERIODIC_TO_POS(pos)
51
52
53
54
55
56
57
#endif
        real4 minPos = pos;
        real4 maxPos = pos;
        int last = min(base+TILE_SIZE, numAtoms);
        for (int i = base+1; i < last; i++) {
            pos = posq[i];
#ifdef USE_PERIODIC
58
            real4 center = 0.5f*(maxPos+minPos);
59
            APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)
60
61
62
63
#endif
            minPos = make_real4(min(minPos.x,pos.x), min(minPos.y,pos.y), min(minPos.z,pos.z), 0);
            maxPos = make_real4(max(maxPos.x,pos.x), max(maxPos.y,pos.y), max(maxPos.z,pos.z), 0);
        }
64
        real4 blockSize = 0.5f*(maxPos-minPos);
Peter Eastman's avatar
Peter Eastman committed
65
        real4 center = 0.5f*(maxPos+minPos);
66
        center.w = 0;
67
        for (int i = base; i < last; i++) {
Peter Eastman's avatar
Peter Eastman committed
68
69
70
71
72
            pos = posq[i];
            real4 delta = posq[i]-center;
#ifdef USE_PERIODIC
            APPLY_PERIODIC_TO_DELTA(delta)
#endif
73
            center.w = max(center.w, delta.x*delta.x+delta.y*delta.y+delta.z*delta.z);
Peter Eastman's avatar
Peter Eastman committed
74
        }
75
        center.w = sqrt(center.w);
76
        blockBoundingBox[index] = blockSize;
Peter Eastman's avatar
Peter Eastman committed
77
        blockCenter[index] = center;
78
79
80
        real totalSize = blockSize.x+blockSize.y+blockSize.z;
        minSize = min(minSize, totalSize);
        maxSize = max(maxSize, totalSize);
81
82
83
        index += blockDim.x*gridDim.x;
        base = index*TILE_SIZE;
    }
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
    
    // Record the range of sizes seen by threads in this block.

    __shared__ real minBuffer[64], maxBuffer[64];
    minBuffer[threadIdx.x] = minSize;
    maxBuffer[threadIdx.x] = maxSize;
    __syncthreads();
    for (int step = 1; step < 64; step *= 2) {
        if (threadIdx.x+step < 64 && threadIdx.x%(2*step) == 0) {
            minBuffer[threadIdx.x] = min(minBuffer[threadIdx.x], minBuffer[threadIdx.x+step]);
            maxBuffer[threadIdx.x] = max(maxBuffer[threadIdx.x], maxBuffer[threadIdx.x+step]);
        }
        __syncthreads();
    }
    if (threadIdx.x == 0)
        blockSizeRange[blockIdx.x] = make_real2(minBuffer[0], maxBuffer[0]);
100
    if (blockIdx.x == 0 && threadIdx.x == 0)
101
        rebuildNeighborList[0] = 0;
102
103
}

104
105
106
107
108
109
110
111
extern "C" __global__ void computeSortKeys(const real4* __restrict__ blockBoundingBox, unsigned int* __restrict__ sortedBlocks, real2* __restrict__ blockSizeRange, int numSizes) {
    // Find the total range of sizes recorded by all blocks.

    __shared__ real2 sizeRange;
    if (threadIdx.x == 0) {
        sizeRange = blockSizeRange[0];
        for (int i = 1; i < numSizes; i++) {
            real2 size = blockSizeRange[i];
112
113
            if (size.x > 0)
                sizeRange.x = min(sizeRange.x, size.x);
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
            sizeRange.y = max(sizeRange.y, size.y);
        }
        sizeRange.x = LOG(sizeRange.x);
        sizeRange.y = LOG(sizeRange.y);
    }
    __syncthreads();

    // Sort keys store the bin in the high order part and the block in the low
    // order part.

    int numSizeBins = 20;
    real scale = numSizeBins/(sizeRange.y-sizeRange.x);
    for (unsigned int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_BLOCKS; i += blockDim.x*gridDim.x) {
        real4 box = blockBoundingBox[i];
        real size = LOG(box.x+box.y+box.z);
        int bin = (size-sizeRange.x)*scale;
        bin = max(0, min(bin, numSizeBins-1));
        sortedBlocks[i] = (((unsigned int) bin)<<BIN_SHIFT) + i;
    }
}

135
/**
136
 * Sort the data about bounding boxes so it can be accessed more efficiently in the next kernel.
137
 */
138
extern "C" __global__ void sortBoxData(const unsigned int* __restrict__ sortedBlocks, const real4* __restrict__ blockCenter,
139
140
141
142
143
144
        const real4* __restrict__ blockBoundingBox, real4* __restrict__ sortedBlockCenter, half3* __restrict__ sortedBlockBoundingBox,
#ifdef USE_LARGE_BLOCKS
        real4* __restrict__ largeBlockCenter, half3* __restrict__ largeBlockBoundingBox, real4 periodicBoxSize,
        real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
#endif
        const real4* __restrict__ posq, const real4* __restrict__ oldPositions,
145
        unsigned int* __restrict__ interactionCount, int* __restrict__ rebuildNeighborList, bool forceRebuild) {
146
    for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_BLOCKS; i += blockDim.x*gridDim.x) {
147
        unsigned int index = sortedBlocks[i] & BLOCK_INDEX_MASK;
148
        sortedBlockCenter[i] = blockCenter[index];
149
        sortedBlockBoundingBox[i] = half3(trimTo3(blockBoundingBox[index]));
150

151
#ifdef USE_LARGE_BLOCKS
152
        // Compute the sizes of large blocks (composed of 32 regular blocks) starting from each block.
153
    
154
155
        real4 minPos = blockCenter[index]-blockBoundingBox[index];
        real4 maxPos = blockCenter[index]+blockBoundingBox[index];
156
157
        int last = min(i+32, NUM_BLOCKS);
        for (int j = i+1; j < last; j++) {
158
159
160
            unsigned int index2 = sortedBlocks[j] & BLOCK_INDEX_MASK;
            real4 blockPos = blockCenter[index2];
            real4 width = blockBoundingBox[index2];
161
162
163
164
165
166
167
168
169
170
#ifdef USE_PERIODIC
            real4 center = 0.5f*(maxPos+minPos);
            APPLY_PERIODIC_TO_POS_WITH_CENTER(blockPos, center)
#endif
            minPos = make_real4(min(minPos.x, blockPos.x-width.x), min(minPos.y, blockPos.y-width.y), min(minPos.z, blockPos.z-width.z), 0);
            maxPos = make_real4(max(maxPos.x, blockPos.x+width.x), max(maxPos.y, blockPos.y+width.y), max(maxPos.z, blockPos.z+width.z), 0);
        }
        largeBlockCenter[i] = 0.5f*(maxPos+minPos);
        largeBlockBoundingBox[i] = half3(trimTo3(0.5f*(maxPos-minPos)));
#endif
171
172
    }

173
174
    // Also check whether any atom has moved enough so that we really need to rebuild the neighbor list.

175
    bool rebuild = forceRebuild;
176
177
178
179
180
181
182
183
    for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
        real4 delta = oldPositions[i]-posq[i];
        if (delta.x*delta.x + delta.y*delta.y + delta.z*delta.z > 0.25f*PADDING*PADDING)
            rebuild = true;
    }
    if (rebuild) {
        rebuildNeighborList[0] = 1;
        interactionCount[0] = 0;
184
        interactionCount[1] = 0;
185
186
    }
}
187

188
__device__ int saveSinglePairs(int x, int* atoms, int* flags, int length, unsigned int maxSinglePairs, unsigned int* singlePairCount, int2* singlePairs, int* sumBuffer, volatile unsigned int& pairStartIndex) {
189
190
    // Record interactions that should be computed as single pairs rather than in blocks.
    
191
192
    const int indexInWarp = threadIdx.x%32;
    int sum = 0;
193
    #pragma unroll 8 // (GROUP_SIZE / TILE_SIZE)
194
195
    for (int i = indexInWarp; i < length; i += 32) {
        int count = __popc(flags[i]);
196
        sum += (count <= MAX_BITS_FOR_PAIRS ? count : 0);
197
    }
peastman's avatar
peastman committed
198
199
200
201
    for (int i = 1; i < 32; i *= 2) {
        int n = __shfl_up_sync(0xffffffff, sum, i);
        if (indexInWarp >= i)
            sum += n;
202
    }
peastman's avatar
peastman committed
203
204
205
206
    if (indexInWarp == 31)
        pairStartIndex = atomicAdd(singlePairCount,(unsigned int) sum);
    __syncwarp();
    int prevSum = __shfl_up_sync(0xffffffff, sum, 1);
207
    unsigned int pairIndex = pairStartIndex + (indexInWarp > 0 ? prevSum : 0);
208
209
    for (int i = indexInWarp; i < length; i += 32) {
        int count = __popc(flags[i]);
210
        if (count <= MAX_BITS_FOR_PAIRS && pairIndex+count <= maxSinglePairs) {
211
212
213
214
215
216
217
218
219
            int f = flags[i];
            while (f != 0) {
                singlePairs[pairIndex] = make_int2(atoms[i], x*TILE_SIZE+__ffs(f)-1);
                f &= f-1;
                pairIndex++;
            }
        }
    }
    
220
    // Compact the remaining interactions.
221
222
223
    
    const int warpMask = (1<<indexInWarp)-1;
    int numCompacted = 0;
224
225
    for (int start = 0; start < length; start += 32) {
        int i = start+indexInWarp;
226
227
        int atom = atoms[i];
        int flag = flags[i];
228
        bool include = (i < length && __popc(flags[i]) > MAX_BITS_FOR_PAIRS);
229
        int includeFlags = BALLOT(include);
230
231
232
233
234
235
236
237
238
239
        if (include) {
            int index = numCompacted+__popc(includeFlags&warpMask);
            atoms[index] = atom;
            flags[index] = flag;
        }
        numCompacted += __popc(includeFlags);
    }
    return numCompacted;
}

240
/**
241
242
243
244
245
 * Compare the bounding boxes for each pair of atom blocks (comprised of 32 atoms each), forming a tile. If the two
 * atom blocks are sufficiently far apart, mark them as non-interacting. There are two stages in the algorithm.
 *
 * STAGE 1:
 *
246
 * A coarse grained atom block against interacting atom block neighbour list is constructed. 
247
 *
248
249
250
251
 * Each warp first loads in some block X of interest. Each thread within the warp then loads 
 * in a different atom block Y. If Y has exclusions with X, then Y is not processed.  If the bounding boxes 
 * of the two atom blocks are within the cutoff distance, then the two atom blocks are considered to be
 * interacting and Y is added to the buffer for X.
252
253
254
 *
 * STAGE 2:
 *
255
 * A fine grained atom block against interacting atoms neighbour list is constructed.
256
 *
257
258
259
260
 * The warp loops over atom blocks Y that were found to (possibly) interact with atom block X.  Each thread
 * in the warp loops over the 32 atoms in X and compares their positions to one particular atom from block Y.
 * If it finds one closer than the cutoff distance, the atom is added to the list of atoms interacting with block X.
 * This continues until the buffer fills up, at which point the results are written to global memory.
261
262
263
264
265
266
 *
 * [in] periodicBoxSize        - size of the rectangular periodic box
 * [in] invPeriodicBoxSize     - inverse of the periodic box
 * [in] blockCenter            - the center of each bounding box
 * [in] blockBoundingBox       - bounding box of each atom block
 * [out] interactionCount      - total number of tiles that have interactions
267
 * [out] interactingTiles      - set of blocks that have interactions
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
 * [out] interactingAtoms      - a list of atoms that interact with each atom block
 * [in] posq                   - x,y,z coordinates of each atom and charge q
 * [in] maxTiles               - maximum number of tiles to process, used for multi-GPUs
 * [in] startBlockIndex        - first block to process, used for multi-GPUs,
 * [in] numBlocks              - total number of atom blocks
 * [in] sortedBlocks           - a sorted list of atom blocks based on volume
 * [in] sortedBlockCenter      - sorted centers, duplicated for fast access to avoid indexing
 * [in] sortedBlockBoundingBox - sorted bounding boxes, duplicated for fast access
 * [in] exclusionIndices       - maps into exclusionRowIndices with the starting position for a given atom
 * [in] exclusionRowIndices    - stores the a continuous list of exclusions
 *           eg: block 0 is excluded from atom 3,5,6
 *               block 1 is excluded from atom 3,4
 *               block 2 is excluded from atom 1,3,5,6
 *              exclusionIndices[0][3][5][8]
 *           exclusionRowIndices[3][5][6][3][4][1][3][5][6]
 *                         index 0  1  2  3  4  5  6  7  8 
 * [out] oldPos                - stores the positions of the atoms in which this neighbourlist was built on
 *                             - this is used to decide when to rebuild a neighbourlist
 * [in] rebuildNeighbourList   - whether or not to execute this kernel
 *
288
 */
289
extern "C" __global__ __launch_bounds__(GROUP_SIZE,3) void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
290
        unsigned int* __restrict__ interactionCount, int* __restrict__ interactingTiles, unsigned int* __restrict__ interactingAtoms,
291
        int2* __restrict__ singlePairs, const real4* __restrict__ posq, unsigned int maxTiles, unsigned int maxSinglePairs, unsigned int startBlockIndex,
292
        unsigned int numBlocks, unsigned int* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter, const half3* __restrict__ sortedBlockBoundingBox,
293
294
295
296
#ifdef USE_LARGE_BLOCKS
        real4* __restrict__ largeBlockCenter, half3* __restrict__ largeBlockBoundingBox,
#endif
        const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
297
        real4* __restrict__ oldPositions, const int* __restrict__ rebuildNeighborList) {
298

299
300
    if (rebuildNeighborList[0] == 0)
        return; // The neighbor list doesn't need to be rebuilt.
301
302
303
304
305
306
307

    const int indexInWarp = threadIdx.x%32;
    const int warpStart = threadIdx.x-indexInWarp;
    const int totalWarps = blockDim.x*gridDim.x/32;
    const int warpIndex = (blockIdx.x*blockDim.x+threadIdx.x)/32;
    const int warpMask = (1<<indexInWarp)-1;
    __shared__ int workgroupBuffer[BUFFER_SIZE*(GROUP_SIZE/32)];
308
    __shared__ int workgroupFlagsBuffer[BUFFER_SIZE*(GROUP_SIZE/32)];
309
    __shared__ int warpExclusions[MAX_EXCLUSIONS*(GROUP_SIZE/32)];
310
    __shared__ real4 posBuffer[GROUP_SIZE];
311
312
    __shared__ volatile unsigned int workgroupTileIndex[GROUP_SIZE/32];
    __shared__ unsigned int workgroupPairStartIndex[GROUP_SIZE/32];
peastman's avatar
peastman committed
313
    int* sumBuffer = (int*) posBuffer; // Reuse the same buffer to save memory
314
    int* buffer = workgroupBuffer+BUFFER_SIZE*(warpStart/32);
315
    int* flagsBuffer = workgroupFlagsBuffer+BUFFER_SIZE*(warpStart/32);
316
    int* exclusionsForX = warpExclusions+MAX_EXCLUSIONS*(warpStart/32);
317
318
    volatile unsigned int& tileStartIndex = workgroupTileIndex[warpStart/32];
    volatile unsigned int& pairStartIndex = workgroupPairStartIndex[warpStart/32];
319
320

    // Loop over blocks.
321
    
322
323
324
    for (int block1 = startBlockIndex+warpIndex; block1 < startBlockIndex+numBlocks; block1 += totalWarps) {
        // Load data for this block.  Note that all threads in a warp are processing the same block.
        
325
        int x = sortedBlocks[block1] & BLOCK_INDEX_MASK;
326
        real4 blockCenterX = sortedBlockCenter[block1];
327
        real3 blockSizeX = sortedBlockBoundingBox[block1].toReal3();
328
        int neighborsInBuffer = 0;
329
        real4 pos1 = posq[x*TILE_SIZE+indexInWarp];
330
331
332
333
334
335
336
337
#ifdef USE_PERIODIC
        const bool singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= PADDED_CUTOFF &&
                                         0.5f*periodicBoxSize.y-blockSizeX.y >= PADDED_CUTOFF &&
                                         0.5f*periodicBoxSize.z-blockSizeX.z >= PADDED_CUTOFF);
        if (singlePeriodicCopy) {
            // The box is small enough that we can just translate all the atoms into a single periodic
            // box, then skip having to apply periodic boundary conditions later.
            
338
            APPLY_PERIODIC_TO_POS_WITH_CENTER(pos1, blockCenterX)
339
340
        }
#endif
341
        pos1.w = 0.5f * (pos1.x * pos1.x + pos1.y * pos1.y + pos1.z * pos1.z);
342
        posBuffer[threadIdx.x] = pos1;
343

344
345
346
347
348
        // Load exclusion data for block x.
        
        const int exclusionStart = exclusionRowIndices[x];
        const int exclusionEnd = exclusionRowIndices[x+1];
        const int numExclusions = exclusionEnd-exclusionStart;
349
        #pragma unroll 4 // (MAX_EXCLUSIONS)
350
        for (int j = indexInWarp; j < numExclusions; j += 32)
351
            exclusionsForX[j] = exclusionIndices[exclusionStart+j];
352
353
        if (MAX_EXCLUSIONS > 32)
            __syncthreads();
354
        
355
356
357
        // Loop over atom blocks to search for neighbors.  The threads in a warp compare block1 against 32
        // other blocks in parallel.

358
359
360
361
#ifdef USE_LARGE_BLOCKS
        int largeBlockFlags = 0;
        int loadedLargeBlocks = 0;
#endif
362
        for (int block2Base = block1+1; block2Base < NUM_BLOCKS; block2Base += 32) {
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
#ifdef USE_LARGE_BLOCKS
            if (loadedLargeBlocks == 0) {
                // Check the next set of large blocks.

                int largeBlockIndex = block2Base + 32*indexInWarp;
                bool includeLargeBlock = false;
                if (largeBlockIndex < NUM_BLOCKS) {
                    real4 largeCenter = largeBlockCenter[largeBlockIndex];
                    real3 largeSize = largeBlockBoundingBox[largeBlockIndex].toReal3();
                    real4 blockDelta = blockCenterX-largeCenter;
#ifdef USE_PERIODIC
                    APPLY_PERIODIC_TO_DELTA(blockDelta)
#endif
                    blockDelta.x = max(0.0f, fabs(blockDelta.x)-blockSizeX.x-largeSize.x);
                    blockDelta.y = max(0.0f, fabs(blockDelta.y)-blockSizeX.y-largeSize.y);
                    blockDelta.z = max(0.0f, fabs(blockDelta.z)-blockSizeX.z-largeSize.z);
                    includeLargeBlock = (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < PADDED_CUTOFF_SQUARED);
380
381
382
383
384
385
386
#ifdef TRICLINIC
                    // The calculation to find the nearest periodic copy is only guaranteed to work if the nearest copy is less than half a box width away.
                    // If there's any possibility we might have missed it, do a detailed check.

                    if (periodicBoxSize.z/2-blockSizeX.z-largeSize.z < PADDED_CUTOFF || periodicBoxSize.y/2-blockSizeX.y-largeSize.y < PADDED_CUTOFF)
                        includeLargeBlock = true;
#endif
387
388
389
390
391
392
393
394
395
396
397
398
399
                }
                largeBlockFlags = BALLOT(includeLargeBlock);
                loadedLargeBlocks = 32;
            }
            loadedLargeBlocks--;
            if ((largeBlockFlags&1) == 0) {
                // None of the next 32 blocks interact with block 1.

                largeBlockFlags >>= 1;
                continue;
            }
            largeBlockFlags >>= 1;
#endif
400
401
            int block2 = block2Base+indexInWarp;
            bool includeBlock2 = (block2 < NUM_BLOCKS);
402
            bool forceInclude = false;
403
            if (includeBlock2) {
404
                real4 blockCenterY = sortedBlockCenter[block2];
405
                real3 blockSizeY = sortedBlockBoundingBox[block2].toReal3();
406
                real4 blockDelta = blockCenterX-blockCenterY;
407
#ifdef USE_PERIODIC
408
                APPLY_PERIODIC_TO_DELTA(blockDelta)
409
#endif
410
                includeBlock2 &= (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < (PADDED_CUTOFF+blockCenterX.w+blockCenterY.w)*(PADDED_CUTOFF+blockCenterX.w+blockCenterY.w));
411
412
413
414
                blockDelta.x = max(0.0f, fabs(blockDelta.x)-blockSizeX.x-blockSizeY.x);
                blockDelta.y = max(0.0f, fabs(blockDelta.y)-blockSizeX.y-blockSizeY.y);
                blockDelta.z = max(0.0f, fabs(blockDelta.z)-blockSizeX.z-blockSizeY.z);
                includeBlock2 &= (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < PADDED_CUTOFF_SQUARED);
415
416
417
418
419
#ifdef TRICLINIC
                // The calculation to find the nearest periodic copy is only guaranteed to work if the nearest copy is less than half a box width away.
                // If there's any possibility we might have missed it, do a detailed check.

                if (periodicBoxSize.z/2-blockSizeX.z-blockSizeY.z < PADDED_CUTOFF || periodicBoxSize.y/2-blockSizeX.y-blockSizeY.y < PADDED_CUTOFF)
420
                    includeBlock2 = forceInclude = true;
421
#endif
422
                if (includeBlock2) {
423
                    int y = sortedBlocks[block2] & BLOCK_INDEX_MASK;
424
                    #pragma unroll 4 // (MAX_EXCLUSIONS)
425
426
427
428
429
430
431
                    for (int k = 0; k < numExclusions; k++)
                        includeBlock2 &= (exclusionsForX[k] != y);
                }
            }
            
            // Loop over any blocks we identified as potentially containing neighbors.
            
432
            int includeBlockFlags = BALLOT(includeBlock2);
Peter Eastman's avatar
Peter Eastman committed
433
            int forceIncludeFlags = BALLOT(forceInclude);
434
435
436
            while (includeBlockFlags != 0) {
                int i = __ffs(includeBlockFlags)-1;
                includeBlockFlags &= includeBlockFlags-1;
Peter Eastman's avatar
Peter Eastman committed
437
                forceInclude = (forceIncludeFlags>>i) & 1;
438
                int y = sortedBlocks[block2Base+i] & BLOCK_INDEX_MASK;
439

440
                // Check each atom in block Y for interactions.
441

442
                int atom2 = y*TILE_SIZE+indexInWarp;
443
                real4 pos2 = posq[atom2];
444
445
#ifdef USE_PERIODIC
                if (singlePeriodicCopy) {
446
                    APPLY_PERIODIC_TO_POS_WITH_CENTER(pos2, blockCenterX)
447
448
                }
#endif
449
450
                pos2.w = 0.5f * (pos2.x * pos2.x + pos2.y * pos2.y + pos2.z * pos2.z);

451
                real4 blockCenterY = sortedBlockCenter[block2Base+i];
452
                real3 atomDelta = trimTo3(posBuffer[warpStart+indexInWarp])-trimTo3(blockCenterY);
453
454
455
#ifdef USE_PERIODIC
                APPLY_PERIODIC_TO_DELTA(atomDelta)
#endif
456
                int atomFlags = BALLOT(forceInclude || atomDelta.x*atomDelta.x+atomDelta.y*atomDelta.y+atomDelta.z*atomDelta.z < (PADDED_CUTOFF+blockCenterY.w)*(PADDED_CUTOFF+blockCenterY.w));
457
                int interacts = 0;
458
                if (atom2 < NUM_ATOMS && atomFlags != 0) {
459
460
#ifdef USE_PERIODIC
                    if (!singlePeriodicCopy) {
461
462
                        int first = __ffs(atomFlags)-1;
                        int last = 32-__clz(atomFlags);
463
                        for (int j = first; j < last; j++) {
464
                            real3 delta = trimTo3(pos2)-trimTo3(posBuffer[warpStart+j]);
465
                            APPLY_PERIODIC_TO_DELTA(delta)
466
                            interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED ? 1<<j : 0);
467
468
469
470
                        }
                    }
                    else {
#endif
471
472
473
474
475
                        #pragma unroll
                        for (int j = 0; j < 32; j++) {
                            real4 posj = posBuffer[warpStart+j];
                            real halfDist2 = posj.w + pos2.w - posj.x*pos2.x - posj.y*pos2.y - posj.z*pos2.z;
                            interacts |= (halfDist2 < 0.5f * PADDED_CUTOFF_SQUARED ? 1<<j : 0);
476
477
478
479
480
481
482
483
                        }
#ifdef USE_PERIODIC
                    }
#endif
                }
                
                // Add any interacting atoms to the buffer.
                
484
                int includeAtomFlags = BALLOT(interacts);
485
486
487
488
489
                if (interacts) {
                    int index = neighborsInBuffer+__popc(includeAtomFlags&warpMask);
                    buffer[index] = atom2;
                    flagsBuffer[index] = interacts;
                }
490
491
492
493
                neighborsInBuffer += __popc(includeAtomFlags);
                if (neighborsInBuffer > BUFFER_SIZE-TILE_SIZE) {
                    // Store the new tiles to memory.
                    
494
495
496
#if MAX_BITS_FOR_PAIRS > 0
                    neighborsInBuffer = saveSinglePairs(x, buffer, flagsBuffer, neighborsInBuffer, maxSinglePairs, &interactionCount[1], singlePairs, sumBuffer+warpStart, pairStartIndex);
#endif
497
                    unsigned int tilesToStore = neighborsInBuffer/TILE_SIZE;
498
499
                    if (tilesToStore > 0) {
                        if (indexInWarp == 0)
500
                            tileStartIndex = atomicAdd(&interactionCount[0], tilesToStore);
501
                        unsigned int newTileStartIndex = tileStartIndex;
502
503
504
                        if (newTileStartIndex+tilesToStore <= maxTiles) {
                            if (indexInWarp < tilesToStore)
                                interactingTiles[newTileStartIndex+indexInWarp] = x;
505
                            #pragma unroll 8 // (GROUP_SIZE / TILE_SIZE)
506
507
508
                            for (int j = 0; j < tilesToStore; j++)
                                interactingAtoms[(newTileStartIndex+j)*TILE_SIZE+indexInWarp] = buffer[indexInWarp+j*TILE_SIZE];
                        }
509
510
                        if (indexInWarp+TILE_SIZE*tilesToStore < BUFFER_SIZE)
                            buffer[indexInWarp] = buffer[indexInWarp+TILE_SIZE*tilesToStore];
511
                        neighborsInBuffer -= TILE_SIZE*tilesToStore;
512
513
                    }
                }
514
            }
515
516
517
518
        }
        
        // If we have a partially filled buffer,  store it to memory.
        
519
#if MAX_BITS_FOR_PAIRS > 0
520
        if (neighborsInBuffer > 32)
521
522
            neighborsInBuffer = saveSinglePairs(x, buffer, flagsBuffer, neighborsInBuffer, maxSinglePairs, &interactionCount[1], singlePairs, sumBuffer+warpStart, pairStartIndex);
#endif
523
        if (neighborsInBuffer > 0) {
524
            unsigned int tilesToStore = (neighborsInBuffer+TILE_SIZE-1)/TILE_SIZE;
525
            if (indexInWarp == 0)
526
                tileStartIndex = atomicAdd(&interactionCount[0], tilesToStore);
527
            unsigned int newTileStartIndex = tileStartIndex;
528
            if (newTileStartIndex+tilesToStore <= maxTiles) {
529
530
                if (indexInWarp < tilesToStore)
                    interactingTiles[newTileStartIndex+indexInWarp] = x;
531
                #pragma unroll 8 // (GROUP_SIZE / TILE_SIZE)
532
533
                for (int j = 0; j < tilesToStore; j++)
                    interactingAtoms[(newTileStartIndex+j)*TILE_SIZE+indexInWarp] = (indexInWarp+j*TILE_SIZE < neighborsInBuffer ? buffer[indexInWarp+j*TILE_SIZE] : NUM_ATOMS);
534
535
536
            }
        }
    }
537
538
539
540
541
    
    // Record the positions the neighbor list is based on.
    
    for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x)
        oldPositions[i] = posq[i];
542
}