findInteractingBlocks.cu 23.7 KB
Newer Older
1
#define GROUP_SIZE 256
2
#define BUFFER_SIZE 256
3

4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
/**
 * To use half precision, we're supposed to include cuda_fp16.h.  Unfortunately,
 * it isn't included in the search path automatically, and there's no reliable
 * way to find where it's located on disk.  Instead we provide our own definitions
 * for the few symbols we need.
 */
struct __align__(2) __half {
    unsigned short x;
};
__device__ __half __float2half_ru(const float f) {
    __half h;
    asm("{cvt.rp.f16.f32 %0, %1;}" : "=h"(*reinterpret_cast<unsigned short *>(&h)) : "f"(f));
    return h;
}
__device__ float __half2float(const __half h) {
    float f;
    asm("{cvt.f32.f16 %0, %1;}" : "=f"(f) : "h"(*reinterpret_cast<const unsigned short *>(&h)));
    return f;
}
struct half3 {
    __device__ half3(real3 f) {
        // Round up so we'll err on the side of making the box a little too large.
        // This ensures interactions will never be missed.
        v[0] = __float2half_ru((float) f.x);
        v[1] = __float2half_ru((float) f.y);
        v[2] = __float2half_ru((float) f.z);
    }
    __device__ real3 toReal3() const {
        return make_real3(__half2float(v[0]), __half2float(v[1]), __half2float(v[2]));
    }
private:
    __half v[3];
};

38
39
40
/**
 * Find a bounding box for the atoms in each block.
 */
41
42
43
extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        const real4* __restrict__ posq, real4* __restrict__ blockCenter, real4* __restrict__ blockBoundingBox, int* __restrict__ rebuildNeighborList,
        real2* __restrict__ sortedBlocks) {
44
45
46
47
48
    int index = blockIdx.x*blockDim.x+threadIdx.x;
    int base = index*TILE_SIZE;
    while (base < numAtoms) {
        real4 pos = posq[base];
#ifdef USE_PERIODIC
49
        APPLY_PERIODIC_TO_POS(pos)
50
51
52
53
54
55
56
#endif
        real4 minPos = pos;
        real4 maxPos = pos;
        int last = min(base+TILE_SIZE, numAtoms);
        for (int i = base+1; i < last; i++) {
            pos = posq[i];
#ifdef USE_PERIODIC
57
            real4 center = 0.5f*(maxPos+minPos);
58
            APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)
59
60
61
62
#endif
            minPos = make_real4(min(minPos.x,pos.x), min(minPos.y,pos.y), min(minPos.z,pos.z), 0);
            maxPos = make_real4(max(maxPos.x,pos.x), max(maxPos.y,pos.y), max(maxPos.z,pos.z), 0);
        }
63
        real4 blockSize = 0.5f*(maxPos-minPos);
Peter Eastman's avatar
Peter Eastman committed
64
        real4 center = 0.5f*(maxPos+minPos);
65
        center.w = 0;
66
        for (int i = base; i < last; i++) {
Peter Eastman's avatar
Peter Eastman committed
67
68
69
70
71
            pos = posq[i];
            real4 delta = posq[i]-center;
#ifdef USE_PERIODIC
            APPLY_PERIODIC_TO_DELTA(delta)
#endif
72
            center.w = max(center.w, delta.x*delta.x+delta.y*delta.y+delta.z*delta.z);
Peter Eastman's avatar
Peter Eastman committed
73
        }
74
        center.w = sqrt(center.w);
75
        blockBoundingBox[index] = blockSize;
Peter Eastman's avatar
Peter Eastman committed
76
        blockCenter[index] = center;
77
        sortedBlocks[index] = make_real2(blockSize.x+blockSize.y+blockSize.z, index);
78
79
80
81
        index += blockDim.x*gridDim.x;
        base = index*TILE_SIZE;
    }
    if (blockIdx.x == 0 && threadIdx.x == 0)
82
        rebuildNeighborList[0] = 0;
83
84
85
}

/**
86
 * Sort the data about bounding boxes so it can be accessed more efficiently in the next kernel.
87
 */
88
extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, const real4* __restrict__ blockCenter,
89
90
91
92
93
94
        const real4* __restrict__ blockBoundingBox, real4* __restrict__ sortedBlockCenter, half3* __restrict__ sortedBlockBoundingBox,
#ifdef USE_LARGE_BLOCKS
        real4* __restrict__ largeBlockCenter, half3* __restrict__ largeBlockBoundingBox, real4 periodicBoxSize,
        real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
#endif
        const real4* __restrict__ posq, const real4* __restrict__ oldPositions,
95
        unsigned int* __restrict__ interactionCount, int* __restrict__ rebuildNeighborList, bool forceRebuild) {
96
97
98
    for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_BLOCKS; i += blockDim.x*gridDim.x) {
        int index = (int) sortedBlock[i].y;
        sortedBlockCenter[i] = blockCenter[index];
99
        sortedBlockBoundingBox[i] = half3(trimTo3(blockBoundingBox[index]));
100
    }
101
102
#ifdef USE_LARGE_BLOCKS
    // Compute the sizes of large blocks (composed of 32 regular blocks) starting from each block.
103
    
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
    for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_BLOCKS; i += blockDim.x*gridDim.x) {
        real4 minPos = blockCenter[i]-blockBoundingBox[i];
        real4 maxPos = blockCenter[i]+blockBoundingBox[i];
        int last = min(i+32, NUM_BLOCKS);
        for (int j = i+1; j < last; j++) {
            real4 blockPos = blockCenter[j];
            real4 width = blockBoundingBox[j];
#ifdef USE_PERIODIC
            real4 center = 0.5f*(maxPos+minPos);
            APPLY_PERIODIC_TO_POS_WITH_CENTER(blockPos, center)
#endif
            minPos = make_real4(min(minPos.x, blockPos.x-width.x), min(minPos.y, blockPos.y-width.y), min(minPos.z, blockPos.z-width.z), 0);
            maxPos = make_real4(max(maxPos.x, blockPos.x+width.x), max(maxPos.y, blockPos.y+width.y), max(maxPos.z, blockPos.z+width.z), 0);
        }
        largeBlockCenter[i] = 0.5f*(maxPos+minPos);
        largeBlockBoundingBox[i] = half3(trimTo3(0.5f*(maxPos-minPos)));
    }
#endif
122
123
    // Also check whether any atom has moved enough so that we really need to rebuild the neighbor list.

124
    bool rebuild = forceRebuild;
125
126
127
128
129
130
131
132
    for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
        real4 delta = oldPositions[i]-posq[i];
        if (delta.x*delta.x + delta.y*delta.y + delta.z*delta.z > 0.25f*PADDING*PADDING)
            rebuild = true;
    }
    if (rebuild) {
        rebuildNeighborList[0] = 1;
        interactionCount[0] = 0;
133
        interactionCount[1] = 0;
134
135
    }
}
136

137
__device__ int saveSinglePairs(int x, int* atoms, int* flags, int length, unsigned int maxSinglePairs, unsigned int* singlePairCount, int2* singlePairs, int* sumBuffer, volatile unsigned int& pairStartIndex) {
138
139
    // Record interactions that should be computed as single pairs rather than in blocks.
    
140
141
    const int indexInWarp = threadIdx.x%32;
    int sum = 0;
142
    #pragma unroll 8 // (GROUP_SIZE / TILE_SIZE)
143
144
    for (int i = indexInWarp; i < length; i += 32) {
        int count = __popc(flags[i]);
145
        sum += (count <= MAX_BITS_FOR_PAIRS ? count : 0);
146
    }
peastman's avatar
peastman committed
147
148
149
150
    for (int i = 1; i < 32; i *= 2) {
        int n = __shfl_up_sync(0xffffffff, sum, i);
        if (indexInWarp >= i)
            sum += n;
151
    }
peastman's avatar
peastman committed
152
153
154
155
    if (indexInWarp == 31)
        pairStartIndex = atomicAdd(singlePairCount,(unsigned int) sum);
    __syncwarp();
    int prevSum = __shfl_up_sync(0xffffffff, sum, 1);
156
    unsigned int pairIndex = pairStartIndex + (indexInWarp > 0 ? prevSum : 0);
157
158
    for (int i = indexInWarp; i < length; i += 32) {
        int count = __popc(flags[i]);
159
        if (count <= MAX_BITS_FOR_PAIRS && pairIndex+count <= maxSinglePairs) {
160
161
162
163
164
165
166
167
168
            int f = flags[i];
            while (f != 0) {
                singlePairs[pairIndex] = make_int2(atoms[i], x*TILE_SIZE+__ffs(f)-1);
                f &= f-1;
                pairIndex++;
            }
        }
    }
    
169
    // Compact the remaining interactions.
170
171
172
    
    const int warpMask = (1<<indexInWarp)-1;
    int numCompacted = 0;
173
174
    for (int start = 0; start < length; start += 32) {
        int i = start+indexInWarp;
175
176
        int atom = atoms[i];
        int flag = flags[i];
177
        bool include = (i < length && __popc(flags[i]) > MAX_BITS_FOR_PAIRS);
178
        int includeFlags = BALLOT(include);
179
180
181
182
183
184
185
186
187
188
        if (include) {
            int index = numCompacted+__popc(includeFlags&warpMask);
            atoms[index] = atom;
            flags[index] = flag;
        }
        numCompacted += __popc(includeFlags);
    }
    return numCompacted;
}

189
/**
190
191
192
193
194
 * Compare the bounding boxes for each pair of atom blocks (comprised of 32 atoms each), forming a tile. If the two
 * atom blocks are sufficiently far apart, mark them as non-interacting. There are two stages in the algorithm.
 *
 * STAGE 1:
 *
195
 * A coarse grained atom block against interacting atom block neighbour list is constructed. 
196
 *
197
198
199
200
 * Each warp first loads in some block X of interest. Each thread within the warp then loads 
 * in a different atom block Y. If Y has exclusions with X, then Y is not processed.  If the bounding boxes 
 * of the two atom blocks are within the cutoff distance, then the two atom blocks are considered to be
 * interacting and Y is added to the buffer for X.
201
202
203
 *
 * STAGE 2:
 *
204
 * A fine grained atom block against interacting atoms neighbour list is constructed.
205
 *
206
207
208
209
 * The warp loops over atom blocks Y that were found to (possibly) interact with atom block X.  Each thread
 * in the warp loops over the 32 atoms in X and compares their positions to one particular atom from block Y.
 * If it finds one closer than the cutoff distance, the atom is added to the list of atoms interacting with block X.
 * This continues until the buffer fills up, at which point the results are written to global memory.
210
211
212
213
214
215
 *
 * [in] periodicBoxSize        - size of the rectangular periodic box
 * [in] invPeriodicBoxSize     - inverse of the periodic box
 * [in] blockCenter            - the center of each bounding box
 * [in] blockBoundingBox       - bounding box of each atom block
 * [out] interactionCount      - total number of tiles that have interactions
216
 * [out] interactingTiles      - set of blocks that have interactions
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
 * [out] interactingAtoms      - a list of atoms that interact with each atom block
 * [in] posq                   - x,y,z coordinates of each atom and charge q
 * [in] maxTiles               - maximum number of tiles to process, used for multi-GPUs
 * [in] startBlockIndex        - first block to process, used for multi-GPUs,
 * [in] numBlocks              - total number of atom blocks
 * [in] sortedBlocks           - a sorted list of atom blocks based on volume
 * [in] sortedBlockCenter      - sorted centers, duplicated for fast access to avoid indexing
 * [in] sortedBlockBoundingBox - sorted bounding boxes, duplicated for fast access
 * [in] exclusionIndices       - maps into exclusionRowIndices with the starting position for a given atom
 * [in] exclusionRowIndices    - stores the a continuous list of exclusions
 *           eg: block 0 is excluded from atom 3,5,6
 *               block 1 is excluded from atom 3,4
 *               block 2 is excluded from atom 1,3,5,6
 *              exclusionIndices[0][3][5][8]
 *           exclusionRowIndices[3][5][6][3][4][1][3][5][6]
 *                         index 0  1  2  3  4  5  6  7  8 
 * [out] oldPos                - stores the positions of the atoms in which this neighbourlist was built on
 *                             - this is used to decide when to rebuild a neighbourlist
 * [in] rebuildNeighbourList   - whether or not to execute this kernel
 *
237
 */
238
extern "C" __global__ __launch_bounds__(GROUP_SIZE,3) void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
239
        unsigned int* __restrict__ interactionCount, int* __restrict__ interactingTiles, unsigned int* __restrict__ interactingAtoms,
240
241
242
243
244
245
        int2* __restrict__ singlePairs, const real4* __restrict__ posq, unsigned int maxTiles, unsigned int maxSinglePairs, unsigned int startBlockIndex,
        unsigned int numBlocks, real2* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter, const half3* __restrict__ sortedBlockBoundingBox,
#ifdef USE_LARGE_BLOCKS
        real4* __restrict__ largeBlockCenter, half3* __restrict__ largeBlockBoundingBox,
#endif
        const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
246
        real4* __restrict__ oldPositions, const int* __restrict__ rebuildNeighborList) {
247

248
249
    if (rebuildNeighborList[0] == 0)
        return; // The neighbor list doesn't need to be rebuilt.
250
251
252
253
254
255
256

    const int indexInWarp = threadIdx.x%32;
    const int warpStart = threadIdx.x-indexInWarp;
    const int totalWarps = blockDim.x*gridDim.x/32;
    const int warpIndex = (blockIdx.x*blockDim.x+threadIdx.x)/32;
    const int warpMask = (1<<indexInWarp)-1;
    __shared__ int workgroupBuffer[BUFFER_SIZE*(GROUP_SIZE/32)];
257
    __shared__ int workgroupFlagsBuffer[BUFFER_SIZE*(GROUP_SIZE/32)];
258
    __shared__ int warpExclusions[MAX_EXCLUSIONS*(GROUP_SIZE/32)];
259
    __shared__ real4 posBuffer[GROUP_SIZE];
260
261
    __shared__ volatile unsigned int workgroupTileIndex[GROUP_SIZE/32];
    __shared__ unsigned int workgroupPairStartIndex[GROUP_SIZE/32];
peastman's avatar
peastman committed
262
    int* sumBuffer = (int*) posBuffer; // Reuse the same buffer to save memory
263
    int* buffer = workgroupBuffer+BUFFER_SIZE*(warpStart/32);
264
    int* flagsBuffer = workgroupFlagsBuffer+BUFFER_SIZE*(warpStart/32);
265
    int* exclusionsForX = warpExclusions+MAX_EXCLUSIONS*(warpStart/32);
266
267
    volatile unsigned int& tileStartIndex = workgroupTileIndex[warpStart/32];
    volatile unsigned int& pairStartIndex = workgroupPairStartIndex[warpStart/32];
268
269

    // Loop over blocks.
270
    
271
272
273
274
    for (int block1 = startBlockIndex+warpIndex; block1 < startBlockIndex+numBlocks; block1 += totalWarps) {
        // Load data for this block.  Note that all threads in a warp are processing the same block.
        
        real2 sortedKey = sortedBlocks[block1];
275
        int x = (int) sortedKey.y;
276
        real4 blockCenterX = sortedBlockCenter[block1];
277
        real3 blockSizeX = sortedBlockBoundingBox[block1].toReal3();
278
        int neighborsInBuffer = 0;
279
        real4 pos1 = posq[x*TILE_SIZE+indexInWarp];
280
281
282
283
284
285
286
287
#ifdef USE_PERIODIC
        const bool singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= PADDED_CUTOFF &&
                                         0.5f*periodicBoxSize.y-blockSizeX.y >= PADDED_CUTOFF &&
                                         0.5f*periodicBoxSize.z-blockSizeX.z >= PADDED_CUTOFF);
        if (singlePeriodicCopy) {
            // The box is small enough that we can just translate all the atoms into a single periodic
            // box, then skip having to apply periodic boundary conditions later.
            
288
            APPLY_PERIODIC_TO_POS_WITH_CENTER(pos1, blockCenterX)
289
290
        }
#endif
291
        pos1.w = 0.5f * (pos1.x * pos1.x + pos1.y * pos1.y + pos1.z * pos1.z);
292
        posBuffer[threadIdx.x] = pos1;
293

294
295
296
297
298
        // Load exclusion data for block x.
        
        const int exclusionStart = exclusionRowIndices[x];
        const int exclusionEnd = exclusionRowIndices[x+1];
        const int numExclusions = exclusionEnd-exclusionStart;
299
        #pragma unroll 4 // (MAX_EXCLUSIONS)
300
        for (int j = indexInWarp; j < numExclusions; j += 32)
301
            exclusionsForX[j] = exclusionIndices[exclusionStart+j];
302
303
        if (MAX_EXCLUSIONS > 32)
            __syncthreads();
304
        
305
306
307
        // Loop over atom blocks to search for neighbors.  The threads in a warp compare block1 against 32
        // other blocks in parallel.

308
309
310
311
#ifdef USE_LARGE_BLOCKS
        int largeBlockFlags = 0;
        int loadedLargeBlocks = 0;
#endif
312
        for (int block2Base = block1+1; block2Base < NUM_BLOCKS; block2Base += 32) {
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
#ifdef USE_LARGE_BLOCKS
            if (loadedLargeBlocks == 0) {
                // Check the next set of large blocks.

                int largeBlockIndex = block2Base + 32*indexInWarp;
                bool includeLargeBlock = false;
                if (largeBlockIndex < NUM_BLOCKS) {
                    real4 largeCenter = largeBlockCenter[largeBlockIndex];
                    real3 largeSize = largeBlockBoundingBox[largeBlockIndex].toReal3();
                    real4 blockDelta = blockCenterX-largeCenter;
#ifdef USE_PERIODIC
                    APPLY_PERIODIC_TO_DELTA(blockDelta)
#endif
                    blockDelta.x = max(0.0f, fabs(blockDelta.x)-blockSizeX.x-largeSize.x);
                    blockDelta.y = max(0.0f, fabs(blockDelta.y)-blockSizeX.y-largeSize.y);
                    blockDelta.z = max(0.0f, fabs(blockDelta.z)-blockSizeX.z-largeSize.z);
                    includeLargeBlock = (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < PADDED_CUTOFF_SQUARED);
                }
                largeBlockFlags = BALLOT(includeLargeBlock);
                loadedLargeBlocks = 32;
            }
            loadedLargeBlocks--;
            if ((largeBlockFlags&1) == 0) {
                // None of the next 32 blocks interact with block 1.

                largeBlockFlags >>= 1;
                continue;
            }
            largeBlockFlags >>= 1;
#endif
343
344
            int block2 = block2Base+indexInWarp;
            bool includeBlock2 = (block2 < NUM_BLOCKS);
345
            bool forceInclude = false;
346
            if (includeBlock2) {
347
                real4 blockCenterY = sortedBlockCenter[block2];
348
                real3 blockSizeY = sortedBlockBoundingBox[block2].toReal3();
349
                real4 blockDelta = blockCenterX-blockCenterY;
350
#ifdef USE_PERIODIC
351
                APPLY_PERIODIC_TO_DELTA(blockDelta)
352
#endif
353
                includeBlock2 &= (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < (PADDED_CUTOFF+blockCenterX.w+blockCenterY.w)*(PADDED_CUTOFF+blockCenterX.w+blockCenterY.w));
354
355
356
357
                blockDelta.x = max(0.0f, fabs(blockDelta.x)-blockSizeX.x-blockSizeY.x);
                blockDelta.y = max(0.0f, fabs(blockDelta.y)-blockSizeX.y-blockSizeY.y);
                blockDelta.z = max(0.0f, fabs(blockDelta.z)-blockSizeX.z-blockSizeY.z);
                includeBlock2 &= (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < PADDED_CUTOFF_SQUARED);
358
359
360
361
362
#ifdef TRICLINIC
                // The calculation to find the nearest periodic copy is only guaranteed to work if the nearest copy is less than half a box width away.
                // If there's any possibility we might have missed it, do a detailed check.

                if (periodicBoxSize.z/2-blockSizeX.z-blockSizeY.z < PADDED_CUTOFF || periodicBoxSize.y/2-blockSizeX.y-blockSizeY.y < PADDED_CUTOFF)
363
                    includeBlock2 = forceInclude = true;
364
#endif
365
                if (includeBlock2) {
366
                    int y = (int) sortedBlocks[block2].y;
367
                    #pragma unroll 4 // (MAX_EXCLUSIONS)
368
369
370
371
372
373
374
                    for (int k = 0; k < numExclusions; k++)
                        includeBlock2 &= (exclusionsForX[k] != y);
                }
            }
            
            // Loop over any blocks we identified as potentially containing neighbors.
            
375
            int includeBlockFlags = BALLOT(includeBlock2);
Peter Eastman's avatar
Peter Eastman committed
376
            int forceIncludeFlags = BALLOT(forceInclude);
377
378
379
            while (includeBlockFlags != 0) {
                int i = __ffs(includeBlockFlags)-1;
                includeBlockFlags &= includeBlockFlags-1;
Peter Eastman's avatar
Peter Eastman committed
380
                forceInclude = (forceIncludeFlags>>i) & 1;
381
                int y = (int) sortedBlocks[block2Base+i].y;
382

383
                // Check each atom in block Y for interactions.
384

385
                int atom2 = y*TILE_SIZE+indexInWarp;
386
                real4 pos2 = posq[atom2];
387
388
#ifdef USE_PERIODIC
                if (singlePeriodicCopy) {
389
                    APPLY_PERIODIC_TO_POS_WITH_CENTER(pos2, blockCenterX)
390
391
                }
#endif
392
393
                pos2.w = 0.5f * (pos2.x * pos2.x + pos2.y * pos2.y + pos2.z * pos2.z);

394
                real4 blockCenterY = sortedBlockCenter[block2Base+i];
395
                real3 atomDelta = trimTo3(posBuffer[warpStart+indexInWarp])-trimTo3(blockCenterY);
396
397
398
#ifdef USE_PERIODIC
                APPLY_PERIODIC_TO_DELTA(atomDelta)
#endif
399
                int atomFlags = BALLOT(forceInclude || atomDelta.x*atomDelta.x+atomDelta.y*atomDelta.y+atomDelta.z*atomDelta.z < (PADDED_CUTOFF+blockCenterY.w)*(PADDED_CUTOFF+blockCenterY.w));
400
                int interacts = 0;
401
                if (atom2 < NUM_ATOMS && atomFlags != 0) {
402
403
#ifdef USE_PERIODIC
                    if (!singlePeriodicCopy) {
404
405
                        int first = __ffs(atomFlags)-1;
                        int last = 32-__clz(atomFlags);
406
                        for (int j = first; j < last; j++) {
407
                            real3 delta = trimTo3(pos2)-trimTo3(posBuffer[warpStart+j]);
408
                            APPLY_PERIODIC_TO_DELTA(delta)
409
                            interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED ? 1<<j : 0);
410
411
412
413
                        }
                    }
                    else {
#endif
414
415
416
417
418
                        #pragma unroll
                        for (int j = 0; j < 32; j++) {
                            real4 posj = posBuffer[warpStart+j];
                            real halfDist2 = posj.w + pos2.w - posj.x*pos2.x - posj.y*pos2.y - posj.z*pos2.z;
                            interacts |= (halfDist2 < 0.5f * PADDED_CUTOFF_SQUARED ? 1<<j : 0);
419
420
421
422
423
424
425
426
                        }
#ifdef USE_PERIODIC
                    }
#endif
                }
                
                // Add any interacting atoms to the buffer.
                
427
                int includeAtomFlags = BALLOT(interacts);
428
429
430
431
432
                if (interacts) {
                    int index = neighborsInBuffer+__popc(includeAtomFlags&warpMask);
                    buffer[index] = atom2;
                    flagsBuffer[index] = interacts;
                }
433
434
435
436
                neighborsInBuffer += __popc(includeAtomFlags);
                if (neighborsInBuffer > BUFFER_SIZE-TILE_SIZE) {
                    // Store the new tiles to memory.
                    
437
438
439
#if MAX_BITS_FOR_PAIRS > 0
                    neighborsInBuffer = saveSinglePairs(x, buffer, flagsBuffer, neighborsInBuffer, maxSinglePairs, &interactionCount[1], singlePairs, sumBuffer+warpStart, pairStartIndex);
#endif
440
                    unsigned int tilesToStore = neighborsInBuffer/TILE_SIZE;
441
442
                    if (tilesToStore > 0) {
                        if (indexInWarp == 0)
443
                            tileStartIndex = atomicAdd(&interactionCount[0], tilesToStore);
444
                        unsigned int newTileStartIndex = tileStartIndex;
445
446
447
                        if (newTileStartIndex+tilesToStore <= maxTiles) {
                            if (indexInWarp < tilesToStore)
                                interactingTiles[newTileStartIndex+indexInWarp] = x;
448
                            #pragma unroll 8 // (GROUP_SIZE / TILE_SIZE)
449
450
451
                            for (int j = 0; j < tilesToStore; j++)
                                interactingAtoms[(newTileStartIndex+j)*TILE_SIZE+indexInWarp] = buffer[indexInWarp+j*TILE_SIZE];
                        }
452
453
                        if (indexInWarp+TILE_SIZE*tilesToStore < BUFFER_SIZE)
                            buffer[indexInWarp] = buffer[indexInWarp+TILE_SIZE*tilesToStore];
454
                        neighborsInBuffer -= TILE_SIZE*tilesToStore;
455
456
                    }
                }
457
            }
458
459
460
461
        }
        
        // If we have a partially filled buffer,  store it to memory.
        
462
#if MAX_BITS_FOR_PAIRS > 0
463
        if (neighborsInBuffer > 32)
464
465
            neighborsInBuffer = saveSinglePairs(x, buffer, flagsBuffer, neighborsInBuffer, maxSinglePairs, &interactionCount[1], singlePairs, sumBuffer+warpStart, pairStartIndex);
#endif
466
        if (neighborsInBuffer > 0) {
467
            unsigned int tilesToStore = (neighborsInBuffer+TILE_SIZE-1)/TILE_SIZE;
468
            if (indexInWarp == 0)
469
                tileStartIndex = atomicAdd(&interactionCount[0], tilesToStore);
470
            unsigned int newTileStartIndex = tileStartIndex;
471
            if (newTileStartIndex+tilesToStore <= maxTiles) {
472
473
                if (indexInWarp < tilesToStore)
                    interactingTiles[newTileStartIndex+indexInWarp] = x;
474
                #pragma unroll 8 // (GROUP_SIZE / TILE_SIZE)
475
476
                for (int j = 0; j < tilesToStore; j++)
                    interactingAtoms[(newTileStartIndex+j)*TILE_SIZE+indexInWarp] = (indexInWarp+j*TILE_SIZE < neighborsInBuffer ? buffer[indexInWarp+j*TILE_SIZE] : NUM_ATOMS);
477
478
479
            }
        }
    }
480
481
482
483
484
    
    // Record the positions the neighbor list is based on.
    
    for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x)
        oldPositions[i] = posq[i];
485
}