findInteractingBlocks.cl 31.7 KB
Newer Older
1
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
2
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
3
4
5
6

/**
 * Find a bounding box for the atoms in each block.
 */
7
8
__kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        __global const real4* restrict posq, __global real4* restrict blockCenter, __global real4* restrict blockBoundingBox, __global int* restrict rebuildNeighborList,
9
        __global real2* restrict blockSizeRange) {
10
    int index = get_global_id(0);
11
    int base = index*TILE_SIZE;
12
    real minSize = 1e38, maxSize = 0;
13
    while (base < numAtoms) {
14
        real4 pos = posq[base];
15
#ifdef USE_PERIODIC
16
        APPLY_PERIODIC_TO_POS(pos)
17
#endif
18
19
        real4 minPos = pos;
        real4 maxPos = pos;
20
        int last = min(base+TILE_SIZE, numAtoms);
21
22
23
        for (int i = base+1; i < last; i++) {
            pos = posq[i];
#ifdef USE_PERIODIC
24
            real4 center = 0.5f*(maxPos+minPos);
25
            APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)
26
27
28
29
#endif
            minPos = min(minPos, pos);
            maxPos = max(maxPos, pos);
        }
30
        real4 blockSize = 0.5f*(maxPos-minPos);
Peter Eastman's avatar
Peter Eastman committed
31
        real4 center = 0.5f*(maxPos+minPos);
32
        center.w = 0;
33
        for (int i = base; i < last; i++) {
Peter Eastman's avatar
Peter Eastman committed
34
35
36
37
38
            pos = posq[i];
            real4 delta = posq[i]-center;
#ifdef USE_PERIODIC
            APPLY_PERIODIC_TO_DELTA(delta)
#endif
39
            center.w = max(center.w, delta.x*delta.x+delta.y*delta.y+delta.z*delta.z);
Peter Eastman's avatar
Peter Eastman committed
40
        }
Peter Eastman's avatar
Peter Eastman committed
41
        center.w = sqrt(center.w);
42
        blockBoundingBox[index] = blockSize;
Peter Eastman's avatar
Peter Eastman committed
43
        blockCenter[index] = center;
44
45
46
        real totalSize = blockSize.x+blockSize.y+blockSize.z;
        minSize = min(minSize, totalSize);
        maxSize = max(maxSize, totalSize);
47
        index += get_global_size(0);
48
        base = index*TILE_SIZE;
49
    }
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65

    // Record the range of sizes seen by threads in this block.

    __local real minBuffer[64], maxBuffer[64];
    minBuffer[get_local_id(0)] = minSize;
    maxBuffer[get_local_id(0)] = maxSize;
    barrier(CLK_LOCAL_MEM_FENCE);
    for (int step = 1; step < 64; step *= 2) {
        if (get_local_id(0)+step < 64 && get_local_id(0)%(2*step) == 0) {
            minBuffer[get_local_id(0)] = min(minBuffer[get_local_id(0)], minBuffer[get_local_id(0)+step]);
            maxBuffer[get_local_id(0)] = max(maxBuffer[get_local_id(0)], maxBuffer[get_local_id(0)+step]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (get_local_id(0) == 0)
        blockSizeRange[get_group_id(0)] = make_real2(minBuffer[0], maxBuffer[0]);
66
    if (get_global_id(0) == 0)
67
        rebuildNeighborList[0] = 0;
68
}
69

70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
__kernel void computeSortKeys(__global const real4* restrict blockBoundingBox, __global unsigned int* restrict sortedBlocks, __global real2* restrict blockSizeRange, int numSizes) {
    // Find the total range of sizes recorded by all blocks.

    __local real2 sizeRange;
    if (get_local_id(0) == 0) {
        sizeRange = blockSizeRange[0];
        for (int i = 1; i < numSizes; i++) {
            real2 size = blockSizeRange[i];
            sizeRange.x = min(sizeRange.x, size.x);
            sizeRange.y = max(sizeRange.y, size.y);
        }
        sizeRange.x = LOG(sizeRange.x);
        sizeRange.y = LOG(sizeRange.y);
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Sort keys store the bin in the high order part and the block in the low
    // order part.

    int numSizeBins = 20;
    real scale = numSizeBins/(sizeRange.y-sizeRange.x);
    for (unsigned int i = get_global_id(0); i < NUM_BLOCKS; i += get_global_size(0)) {
        real4 box = blockBoundingBox[i];
        real size = LOG(box.x+box.y+box.z);
        int bin = (size-sizeRange.x)*scale;
        bin = max(0, min(bin, numSizeBins-1));
        sortedBlocks[i] = (((unsigned int) bin)<<BIN_SHIFT) + i;
    }
}
99
100

/**
101
 * Sort the data about bounding boxes so it can be accessed more efficiently in the next kernel.
102
 */
103
__kernel void sortBoxData(__global const unsigned int* restrict sortedBlocks, __global const real4* restrict blockCenter,
104
105
        __global const real4* restrict blockBoundingBox, __global real4* restrict sortedBlockCenter,
        __global real4* restrict sortedBlockBoundingBox, __global const real4* restrict posq, __global const real4* restrict oldPositions,
106
107
108
109
110
111
        __global unsigned int* restrict interactionCount, __global int* restrict rebuildNeighborList, int forceRebuild
#ifdef USE_LARGE_BLOCKS
        , __global real4* restrict largeBlockCenter, __global real4* restrict largeBlockBoundingBox, real4 periodicBoxSize,
        real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
#endif
        ) {
112
    for (int i = get_global_id(0); i < NUM_BLOCKS; i += get_global_size(0)) {
113
        unsigned int index = sortedBlocks[i] & BLOCK_INDEX_MASK;
114
115
        sortedBlockCenter[i] = blockCenter[index];
        sortedBlockBoundingBox[i] = blockBoundingBox[index];
116

117
#ifdef USE_LARGE_BLOCKS
118
        // Compute the sizes of large blocks (composed of 32 regular blocks) starting from each block.
119

120
121
        real4 minPos = blockCenter[index]-blockBoundingBox[index];
        real4 maxPos = blockCenter[index]+blockBoundingBox[index];
122
123
        int last = min(i+32, NUM_BLOCKS);
        for (int j = i+1; j < last; j++) {
124
125
126
            unsigned int index2 = sortedBlocks[j] & BLOCK_INDEX_MASK;
            real4 blockPos = blockCenter[index2];
            real4 width = blockBoundingBox[index2];
127
128
129
130
131
132
133
134
135
136
#ifdef USE_PERIODIC
            real4 center = 0.5f*(maxPos+minPos);
            APPLY_PERIODIC_TO_POS_WITH_CENTER(blockPos, center)
#endif
            minPos = min(minPos, blockPos-width);
            maxPos = max(maxPos, blockPos+width);
        }
        largeBlockCenter[i] = 0.5f*(maxPos+minPos);
        largeBlockBoundingBox[i] = 0.5f*(maxPos-minPos);
#endif
137
138
    }

139
    // Also check whether any atom has moved enough so that we really need to rebuild the neighbor list.
140

141
    bool rebuild = forceRebuild;
142
143
144
145
146
147
148
149
150
151
152
    for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) {
        real4 delta = oldPositions[i]-posq[i];
        if (delta.x*delta.x + delta.y*delta.y + delta.z*delta.z > 0.25f*PADDING*PADDING)
            rebuild = true;
    }
    if (rebuild) {
        rebuildNeighborList[0] = 1;
        interactionCount[0] = 0;
    }
}

peastman's avatar
peastman committed
153
154
155
156
#if SIMD_WIDTH <= 32

#define BUFFER_SIZE 256

157
158
__kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        __global unsigned int* restrict interactionCount, __global int* restrict interactingTiles, __global unsigned int* restrict interactingAtoms,
159
        __global const real4* restrict posq, unsigned int maxTiles, unsigned int startBlockIndex, unsigned int numBlocks, __global unsigned int* restrict sortedBlocks,
160
        __global const real4* restrict sortedBlockCenter, __global const real4* restrict sortedBlockBoundingBox,
161
        __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __global real4* restrict oldPositions,
162
163
164
165
166
        __global const int* restrict rebuildNeighborList
#ifdef USE_LARGE_BLOCKS
        , __global real4* restrict largeBlockCenter, __global real4* restrict largeBlockBoundingBox
#endif
        ) {
167
168
169
170
171
172
173
174
175
176
177
178

    if (rebuildNeighborList[0] == 0)
        return; // The neighbor list doesn't need to be rebuilt.

    const int indexInWarp = get_local_id(0)%32;
    const int warpStart = get_local_id(0)-indexInWarp;
    const int totalWarps = get_global_size(0)/32;
    const int warpIndex = get_global_id(0)/32;
    const int warpMask = (1<<indexInWarp)-1;
    __local int workgroupBuffer[BUFFER_SIZE*(GROUP_SIZE/32)];
    __local int warpExclusions[MAX_EXCLUSIONS*(GROUP_SIZE/32)];
    __local real3 posBuffer[GROUP_SIZE];
179
    __local volatile unsigned int workgroupTileIndex[GROUP_SIZE/32];
180
    __local bool includeBlockFlags[GROUP_SIZE];
Peter Eastman's avatar
Peter Eastman committed
181
    __local volatile short2 atomCountBuffer[GROUP_SIZE];
182
183
    __local int* buffer = workgroupBuffer+BUFFER_SIZE*(warpStart/32);
    __local int* exclusionsForX = warpExclusions+MAX_EXCLUSIONS*(warpStart/32);
184
    __local volatile unsigned int* tileStartIndex = workgroupTileIndex+(warpStart/32);
185
186
187
#ifdef USE_LARGE_BLOCKS
    __local bool largeBlockFlags[GROUP_SIZE];
#endif
188
189

    // Loop over blocks.
190

191
192
193
    for (int block1 = startBlockIndex+warpIndex; block1 < startBlockIndex+numBlocks; block1 += totalWarps) {
        // Load data for this block.  Note that all threads in a warp are processing the same block.
        
194
        int x = sortedBlocks[block1] & BLOCK_INDEX_MASK;
195
196
197
198
199
200
201
202
203
204
205
206
        real4 blockCenterX = sortedBlockCenter[block1];
        real4 blockSizeX = sortedBlockBoundingBox[block1];
        int neighborsInBuffer = 0;
        real3 pos1 = posq[x*TILE_SIZE+indexInWarp].xyz;
#ifdef USE_PERIODIC
        const bool singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= PADDED_CUTOFF &&
                                         0.5f*periodicBoxSize.y-blockSizeX.y >= PADDED_CUTOFF &&
                                         0.5f*periodicBoxSize.z-blockSizeX.z >= PADDED_CUTOFF);
        if (singlePeriodicCopy) {
            // The box is small enough that we can just translate all the atoms into a single periodic
            // box, then skip having to apply periodic boundary conditions later.
            
207
            APPLY_PERIODIC_TO_POS_WITH_CENTER(pos1, blockCenterX)
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
        }
#endif
        posBuffer[get_local_id(0)] = pos1;

        // Load exclusion data for block x.
        
        const int exclusionStart = exclusionRowIndices[x];
        const int exclusionEnd = exclusionRowIndices[x+1];
        const int numExclusions = exclusionEnd-exclusionStart;
        for (int j = indexInWarp; j < numExclusions; j += 32)
            exclusionsForX[j] = exclusionIndices[exclusionStart+j];
        if (MAX_EXCLUSIONS > 32)
            barrier(CLK_LOCAL_MEM_FENCE);
        else
            SYNC_WARPS;
        
        // Loop over atom blocks to search for neighbors.  The threads in a warp compare block1 against 32
        // other blocks in parallel.

227
228
229
#ifdef USE_LARGE_BLOCKS
        int loadedLargeBlocks = 0;
#endif
230
        for (int block2Base = block1+1; block2Base < NUM_BLOCKS; block2Base += 32) {
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
#ifdef USE_LARGE_BLOCKS
            if (loadedLargeBlocks == 0) {
                // Check the next set of large blocks.

                int largeBlockIndex = block2Base + 32*indexInWarp;
                bool includeLargeBlock = false;
                if (largeBlockIndex < NUM_BLOCKS) {
                    real4 largeCenter = largeBlockCenter[largeBlockIndex];
                    real4 largeSize = largeBlockBoundingBox[largeBlockIndex];
                    real4 blockDelta = blockCenterX-largeCenter;
#ifdef USE_PERIODIC
                    APPLY_PERIODIC_TO_DELTA(blockDelta)
#endif
                    blockDelta.x = max((real) 0, fabs(blockDelta.x)-blockSizeX.x-largeSize.x);
                    blockDelta.y = max((real) 0, fabs(blockDelta.y)-blockSizeX.y-largeSize.y);
                    blockDelta.z = max((real) 0, fabs(blockDelta.z)-blockSizeX.z-largeSize.z);
                    includeLargeBlock = (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < PADDED_CUTOFF_SQUARED);
248
249
250
251
252
253
254
#ifdef TRICLINIC
                    // The calculation to find the nearest periodic copy is only guaranteed to work if the nearest copy is less than half a box width away.
                    // If there's any possibility we might have missed it, do a detailed check.

                    if (periodicBoxSize.z/2-blockSizeX.z-largeSize.z < PADDED_CUTOFF || periodicBoxSize.y/2-blockSizeX.y-largeSize.y < PADDED_CUTOFF)
                        includeLargeBlock = true;
#endif
255
256
257
258
259
260
261
262
263
264
265
                }
                largeBlockFlags[get_local_id(0)] = includeLargeBlock;
                loadedLargeBlocks = 32;
                SYNC_WARPS;
            }
            if (!largeBlockFlags[warpStart+32-(loadedLargeBlocks--)]) {
                // None of the next 32 blocks interact with block 1.

                continue;
            }
#endif
266
267
268
            int block2 = block2Base+indexInWarp;
            bool includeBlock2 = (block2 < NUM_BLOCKS);
            if (includeBlock2) {
peastman's avatar
peastman committed
269
270
                real4 blockCenterY = sortedBlockCenter[block2];
                real4 blockSizeY = sortedBlockBoundingBox[block2];
271
272
                real4 blockDelta = blockCenterX-blockCenterY;
#ifdef USE_PERIODIC
273
                APPLY_PERIODIC_TO_DELTA(blockDelta)
274
#endif
275
                includeBlock2 &= (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < (PADDED_CUTOFF+blockCenterX.w+blockCenterY.w)*(PADDED_CUTOFF+blockCenterX.w+blockCenterY.w));
276
277
278
                blockDelta.x = max((real) 0, fabs(blockDelta.x)-blockSizeX.x-blockSizeY.x);
                blockDelta.y = max((real) 0, fabs(blockDelta.y)-blockSizeX.y-blockSizeY.y);
                blockDelta.z = max((real) 0, fabs(blockDelta.z)-blockSizeX.z-blockSizeY.z);
279
                includeBlock2 &= (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < PADDED_CUTOFF_SQUARED);
280
281
282
283
284
285
286
#ifdef TRICLINIC
                // The calculation to find the nearest periodic copy is only guaranteed to work if the nearest copy is less than half a box width away.
                // If there's any possibility we might have missed it, do a detailed check.

                if (periodicBoxSize.z/2-blockSizeX.z-blockSizeY.z < PADDED_CUTOFF || periodicBoxSize.y/2-blockSizeX.y-blockSizeY.y < PADDED_CUTOFF)
                    includeBlock2 = true;
#endif
287
                if (includeBlock2) {
288
                    int y = sortedBlocks[block2] & BLOCK_INDEX_MASK;
289
290
291
292
293
294
295
296
297
298
                    for (int k = 0; k < numExclusions; k++)
                        includeBlock2 &= (exclusionsForX[k] != y);
                }
            }
            
            // Loop over any blocks we identified as potentially containing neighbors.
            
            includeBlockFlags[get_local_id(0)] = includeBlock2;
            SYNC_WARPS;
            for (int i = 0; i < TILE_SIZE; i++) {
299
300
301
                while (i < TILE_SIZE && !includeBlockFlags[warpStart+i])
                    i++;
                if (i < TILE_SIZE) {
302
                    int y = sortedBlocks[block2Base+i] & BLOCK_INDEX_MASK;
303
304
305

                    // Check each atom in block Y for interactions.

306
                    int atom2 = y*TILE_SIZE+indexInWarp;
307
308
309
                    real3 pos2 = posq[atom2].xyz;
#ifdef USE_PERIODIC
                    if (singlePeriodicCopy)
310
                        APPLY_PERIODIC_TO_POS_WITH_CENTER(pos2, blockCenterX)
311
312
313
314
315
316
317
#endif
                    bool interacts = false;
                    if (atom2 < NUM_ATOMS) {
#ifdef USE_PERIODIC
                        if (!singlePeriodicCopy) {
                            for (int j = 0; j < TILE_SIZE; j++) {
                                real3 delta = pos2-posBuffer[warpStart+j];
318
                                APPLY_PERIODIC_TO_DELTA(delta)
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
                                interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
                            }
                        }
                        else {
#endif
                            for (int j = 0; j < TILE_SIZE; j++) {
                                real3 delta = pos2-posBuffer[warpStart+j];
                                interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
                            }
#ifdef USE_PERIODIC
                        }
#endif
                    }
                    
                    // Do a prefix sum to compact the list of atoms.

                    atomCountBuffer[get_local_id(0)].x = (interacts ? 1 : 0);
                    SYNC_WARPS;
                    int whichBuffer = 0;
                    for (int offset = 1; offset < TILE_SIZE; offset *= 2) {
                        if (whichBuffer == 0)
peastman's avatar
peastman committed
340
                            atomCountBuffer[get_local_id(0)].y = (indexInWarp < offset ? atomCountBuffer[get_local_id(0)].x : atomCountBuffer[get_local_id(0)].x+atomCountBuffer[get_local_id(0)-offset].x);
341
                        else
peastman's avatar
peastman committed
342
                            atomCountBuffer[get_local_id(0)].x = (indexInWarp < offset ? atomCountBuffer[get_local_id(0)].y : atomCountBuffer[get_local_id(0)].y+atomCountBuffer[get_local_id(0)-offset].y);
343
344
345
346
347
348
349
                        whichBuffer = 1-whichBuffer;
                        SYNC_WARPS;
                    }
                    
                    // Add any interacting atoms to the buffer.

                    if (interacts)
peastman's avatar
peastman committed
350
351
                        buffer[neighborsInBuffer+atomCountBuffer[get_local_id(0)].y-1] = atom2;
                    neighborsInBuffer += atomCountBuffer[warpStart+TILE_SIZE-1].y;
352
353
354
                    if (neighborsInBuffer > BUFFER_SIZE-TILE_SIZE) {
                        // Store the new tiles to memory.

355
                        unsigned int tilesToStore = neighborsInBuffer/TILE_SIZE;
356
                        if (indexInWarp == 0)
357
                            *tileStartIndex = ATOMIC_ADD(interactionCount, tilesToStore);
peastman's avatar
peastman committed
358
                        SYNC_WARPS;
359
                        unsigned int newTileStartIndex = *tileStartIndex;
360
                        if (newTileStartIndex+tilesToStore <= maxTiles) {
361
362
363
364
365
                            if (indexInWarp < tilesToStore)
                                interactingTiles[newTileStartIndex+indexInWarp] = x;
                            for (int j = 0; j < tilesToStore; j++)
                                interactingAtoms[(newTileStartIndex+j)*TILE_SIZE+indexInWarp] = buffer[indexInWarp+j*TILE_SIZE];
                        }
366
367
                        if (indexInWarp+TILE_SIZE*tilesToStore < BUFFER_SIZE)
                            buffer[indexInWarp] = buffer[indexInWarp+TILE_SIZE*tilesToStore];
368
                        neighborsInBuffer -= TILE_SIZE*tilesToStore;
peastman's avatar
peastman committed
369
                   }
370
                }
371
372
373
                else {
                    SYNC_WARPS;
                }
374
375
376
377
378
379
            }
        }
        
        // If we have a partially filled buffer,  store it to memory.
        
        if (neighborsInBuffer > 0) {
380
            unsigned int tilesToStore = (neighborsInBuffer+TILE_SIZE-1)/TILE_SIZE;
381
            if (indexInWarp == 0)
382
                *tileStartIndex = ATOMIC_ADD(interactionCount, tilesToStore);
peastman's avatar
peastman committed
383
            SYNC_WARPS;
384
            unsigned int newTileStartIndex = *tileStartIndex;
385
            if (newTileStartIndex+tilesToStore <= maxTiles) {
386
387
388
389
390
391
392
393
394
395
396
397
                if (indexInWarp < tilesToStore)
                    interactingTiles[newTileStartIndex+indexInWarp] = x;
                for (int j = 0; j < tilesToStore; j++)
                    interactingAtoms[(newTileStartIndex+j)*TILE_SIZE+indexInWarp] = (indexInWarp+j*TILE_SIZE < neighborsInBuffer ? buffer[indexInWarp+j*TILE_SIZE] : NUM_ATOMS);
            }
        }
    }
    
    // Record the positions the neighbor list is based on.
    
    for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0))
        oldPositions[i] = posq[i];
peastman's avatar
peastman committed
398
399
400
401
402
403
404
405
}

#else
// This is the old implementation of finding interacting blocks.  It is quite a bit more complicated,
// and slower on most GPUs.  On AMD, however, it is faster, so we keep it around to use there.

#define BUFFER_SIZE BUFFER_GROUPS*GROUP_SIZE
#define WARP_SIZE 32
406
#define INVALID -1
peastman's avatar
peastman committed
407
408
409
410

/**
 * Perform a parallel prefix sum over an array.  The input values are all assumed to be 0 or 1.
 */
411
void prefixSum(__local int* sum, __local int2* temp) {
peastman's avatar
peastman committed
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
    for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
        temp[i].x = sum[i];
    barrier(CLK_LOCAL_MEM_FENCE);
    int whichBuffer = 0;
    for (int offset = 1; offset < BUFFER_SIZE; offset *= 2) {
        if (whichBuffer == 0)
            for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
                temp[i].y = (i < offset ? temp[i].x : temp[i].x+temp[i-offset].x);
        else
            for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
                temp[i].x = (i < offset ? temp[i].y : temp[i].y+temp[i-offset].y);
        whichBuffer = 1-whichBuffer;
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (whichBuffer == 0)
        for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
            sum[i] = temp[i].x;
    else
        for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
            sum[i] = temp[i].y;
    barrier(CLK_LOCAL_MEM_FENCE);
}

/**
 * This is called by findBlocksWithInteractions().  It compacts the list of blocks, identifies interactions
 * in them, and writes the result to global memory.
 */
439
void storeInteractionData(int x, __local int* buffer, __local int* sum, __local int2* temp, __local int* atoms, __local int* numAtoms,
peastman's avatar
peastman committed
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
            __local int* baseIndex, __global unsigned int* interactionCount, __global int* interactingTiles, __global unsigned int* interactingAtoms, real4 periodicBoxSize,
            real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, __global const real4* posq, __local real4* posBuffer,
            real4 blockCenterX, real4 blockSizeX, unsigned int maxTiles, bool finish) {
    const bool singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= PADDED_CUTOFF &&
                                     0.5f*periodicBoxSize.y-blockSizeX.y >= PADDED_CUTOFF &&
                                     0.5f*periodicBoxSize.z-blockSizeX.z >= PADDED_CUTOFF);
    if (get_local_id(0) < TILE_SIZE) {
        real4 pos = posq[x*TILE_SIZE+get_local_id(0)];
#ifdef USE_PERIODIC
        if (singlePeriodicCopy) {
            // The box is small enough that we can just translate all the atoms into a single periodic
            // box, then skip having to apply periodic boundary conditions later.
            
            APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, blockCenterX)
        }
#endif
        posBuffer[get_local_id(0)] = pos;
    }
    
    // The buffer is full, so we need to compact it and write out results.  Start by doing a parallel prefix sum.

    barrier(CLK_LOCAL_MEM_FENCE);
    for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
        sum[i] = (buffer[i] == INVALID ? 0 : 1);
    barrier(CLK_LOCAL_MEM_FENCE);
    prefixSum(sum, temp);
    int numValid = sum[BUFFER_SIZE-1];

    // Compact the buffer.

    for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
        if (buffer[i] != INVALID)
            temp[sum[i]-1].x = buffer[i];
    barrier(CLK_LOCAL_MEM_FENCE);
    for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
        buffer[i] = temp[i].x;
    barrier(CLK_LOCAL_MEM_FENCE);

    // Loop over the tiles and find specific interactions in them.

    const int indexInWarp = get_local_id(0)%WARP_SIZE;
    for (int base = 0; base < numValid; base += BUFFER_SIZE/WARP_SIZE) {
        for (int i = get_local_id(0)/WARP_SIZE; i < BUFFER_SIZE/WARP_SIZE && base+i < numValid; i += GROUP_SIZE/WARP_SIZE) {
            // Check each atom in block Y for interactions.
            
            real4 pos = posq[buffer[base+i]*TILE_SIZE+indexInWarp];
#ifdef USE_PERIODIC
            if (singlePeriodicCopy)
                APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, blockCenterX)
#endif
            bool interacts = false;
#ifdef USE_PERIODIC
            if (!singlePeriodicCopy) {
                for (int j = 0; j < TILE_SIZE; j++) {
                    real4 delta = pos-posBuffer[j];
                    APPLY_PERIODIC_TO_DELTA(delta)
                    interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
                }
            }
            else {
#endif
                for (int j = 0; j < TILE_SIZE; j++) {
                    real4 delta = pos-posBuffer[j];
                    interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
                }
#ifdef USE_PERIODIC
            }
#endif
            sum[i*WARP_SIZE+indexInWarp] = (interacts ? 1 : 0);
        }
        for (int i = numValid-base+get_local_id(0)/WARP_SIZE; i < BUFFER_SIZE/WARP_SIZE; i += GROUP_SIZE/WARP_SIZE)
            sum[i*WARP_SIZE+indexInWarp] = 0;

        // Compact the list of atoms.

        barrier(CLK_LOCAL_MEM_FENCE);
        prefixSum(sum, temp);
        for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
            if (sum[i] != (i == 0 ? 0 : sum[i-1]))
                atoms[*numAtoms+sum[i]-1] = buffer[base+i/WARP_SIZE]*TILE_SIZE+indexInWarp;

        // Store them to global memory.

        int atomsToStore = *numAtoms+sum[BUFFER_SIZE-1];
        bool storePartialTile = (finish && base >= numValid-BUFFER_SIZE/WARP_SIZE);
        int tilesToStore = (storePartialTile ? (atomsToStore+TILE_SIZE-1)/TILE_SIZE : atomsToStore/TILE_SIZE);
        if (tilesToStore > 0) {
            if (get_local_id(0) == 0)
528
                *baseIndex = ATOMIC_ADD(interactionCount, tilesToStore);
peastman's avatar
peastman committed
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
            barrier(CLK_LOCAL_MEM_FENCE);
            if (get_local_id(0) == 0)
                *numAtoms = atomsToStore-tilesToStore*TILE_SIZE;
            if (*baseIndex+tilesToStore <= maxTiles) {
                if (get_local_id(0) < tilesToStore)
                    interactingTiles[*baseIndex+get_local_id(0)] = x;
                for (int i = get_local_id(0); i < tilesToStore*TILE_SIZE; i += get_local_size(0))
                    interactingAtoms[*baseIndex*TILE_SIZE+i] = (i < atomsToStore ? atoms[i] : NUM_ATOMS);
            }
        }
        else {
            barrier(CLK_LOCAL_MEM_FENCE);
            if (get_local_id(0) == 0)
                *numAtoms += sum[BUFFER_SIZE-1];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        if (get_local_id(0) < *numAtoms && !storePartialTile)
            atoms[get_local_id(0)] = atoms[tilesToStore*TILE_SIZE+get_local_id(0)];
    }

    if (numValid == 0 && *numAtoms > 0 && finish) {
        // We didn't have any more tiles to process, but there were some atoms left over from a
        // previous call to this function.  Save them now.

        if (get_local_id(0) == 0)
554
            *baseIndex = ATOMIC_ADD(interactionCount, 1);
peastman's avatar
peastman committed
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
        barrier(CLK_LOCAL_MEM_FENCE);
        if (*baseIndex < maxTiles) {
            if (get_local_id(0) == 0)
                interactingTiles[*baseIndex] = x;
            if (get_local_id(0) < TILE_SIZE)
                interactingAtoms[*baseIndex*TILE_SIZE+get_local_id(0)] = (get_local_id(0) < *numAtoms ? atoms[get_local_id(0)] : NUM_ATOMS);
        }
    }

    // Reset the buffer for processing more tiles.

    for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
        buffer[i] = INVALID;
    barrier(CLK_LOCAL_MEM_FENCE);
}

/**
 * Compare the bounding boxes for each pair of blocks.  If they are sufficiently far apart,
 * mark them as non-interacting.
 */
__kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        __global unsigned int* restrict interactionCount, __global int* restrict interactingTiles, __global unsigned int* restrict interactingAtoms,
577
        __global const real4* restrict posq, unsigned int maxTiles, unsigned int startBlockIndex, unsigned int numBlocks, __global unsigned int* restrict sortedBlocks,
peastman's avatar
peastman committed
578
579
        __global const real4* restrict sortedBlockCenter, __global const real4* restrict sortedBlockBoundingBox,
        __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __global real4* restrict oldPositions,
580
581
582
583
584
        __global const int* restrict rebuildNeighborList
#ifdef USE_LARGE_BLOCKS
        , __global real4* restrict largeBlockCenter, __global real4* restrict largeBlockBoundingBox
#endif
        ) {
585
586
587
    __local int buffer[BUFFER_SIZE];
    __local int sum[BUFFER_SIZE];
    __local int2 temp[BUFFER_SIZE];
peastman's avatar
peastman committed
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
    __local int atoms[BUFFER_SIZE+TILE_SIZE];
    __local real4 posBuffer[TILE_SIZE];
    __local int exclusionsForX[MAX_EXCLUSIONS];
    __local int bufferFull;
    __local int globalIndex;
    __local int numAtoms;
#ifdef AMD_ATOMIC_WORK_AROUND
    // Do a byte write to force all memory accesses to interactionCount to use the complete path.
    // This avoids the atomic access from causing all word accesses to other buffers from using the slow complete path.
    // The IF actually causes the write to never be executed, its presence is all that is needed.
    // AMD APP SDK 2.4 has this problem.
    if (get_global_id(0) == get_local_id(0)+1)
        ((__global char*)interactionCount)[sizeof(unsigned int)+1] = 0;
#endif

    if (rebuildNeighborList[0] == 0)
        return; // The neighbor list doesn't need to be rebuilt.

    int valuesInBuffer = 0;
    if (get_local_id(0) == 0)
        bufferFull = false;
    for (int i = 0; i < BUFFER_GROUPS; ++i)
        buffer[i*GROUP_SIZE+get_local_id(0)] = INVALID;
    barrier(CLK_LOCAL_MEM_FENCE);
    
    // Loop over blocks sorted by size.
    
    for (int i = startBlockIndex+get_group_id(0); i < startBlockIndex+numBlocks; i += get_num_groups(0)) {
        if (get_local_id(0) == get_local_size(0)-1)
            numAtoms = 0;
618
        unsigned int x = sortedBlocks[i] & BLOCK_INDEX_MASK;
peastman's avatar
peastman committed
619
620
621
622
623
624
625
626
627
628
629
630
631
        real4 blockCenterX = sortedBlockCenter[i];
        real4 blockSizeX = sortedBlockBoundingBox[i];

        // Load exclusion data for block x.
        
        const int exclusionStart = exclusionRowIndices[x];
        const int exclusionEnd = exclusionRowIndices[x+1];
        const int numExclusions = exclusionEnd-exclusionStart;
        for (int j = get_local_id(0); j < numExclusions; j += get_local_size(0))
            exclusionsForX[j] = exclusionIndices[exclusionStart+j];
        barrier(CLK_LOCAL_MEM_FENCE);
        
        // Compare it to other blocks after this one in sorted order.
632

peastman's avatar
peastman committed
633
634
635
636
        for (int base = i+1; base < NUM_BLOCKS; base += get_local_size(0)) {
            int j = base+get_local_id(0);
            real4 blockCenterY = (j < NUM_BLOCKS ? sortedBlockCenter[j] : (real4) 0);
            real4 blockSizeY = (j < NUM_BLOCKS ? sortedBlockBoundingBox[j] : (real4) 0);
637
            unsigned int y = (j < NUM_BLOCKS ? sortedBlocks[j] & BLOCK_INDEX_MASK : 0);
peastman's avatar
peastman committed
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
            real4 delta = blockCenterX-blockCenterY;
#ifdef USE_PERIODIC
            APPLY_PERIODIC_TO_DELTA(delta)
#endif
            delta.x = max((real) 0, fabs(delta.x)-blockSizeX.x-blockSizeY.x);
            delta.y = max((real) 0, fabs(delta.y)-blockSizeX.y-blockSizeY.y);
            delta.z = max((real) 0, fabs(delta.z)-blockSizeX.z-blockSizeY.z);
            bool hasExclusions = false;
            for (int k = 0; k < numExclusions; k++)
                hasExclusions |= (exclusionsForX[k] == y);
            if (j < NUM_BLOCKS && delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED && !hasExclusions) {
                // Add this tile to the buffer.

                int bufferIndex = valuesInBuffer*GROUP_SIZE+get_local_id(0);
                buffer[bufferIndex] = y;
                valuesInBuffer++;
                if (!bufferFull && valuesInBuffer == BUFFER_GROUPS)
                    bufferFull = true;
            }
            barrier(CLK_LOCAL_MEM_FENCE);
            if (bufferFull) {
                storeInteractionData(x, buffer, sum, temp, atoms, &numAtoms, &globalIndex, interactionCount, interactingTiles, interactingAtoms, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ, posq, posBuffer, blockCenterX, blockSizeX, maxTiles, false);
                valuesInBuffer = 0;
                if (get_local_id(0) == 0)
                    bufferFull = false;
            }
            barrier(CLK_LOCAL_MEM_FENCE);
        }
        storeInteractionData(x, buffer, sum, temp, atoms, &numAtoms, &globalIndex, interactionCount, interactingTiles, interactingAtoms, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ, posq, posBuffer, blockCenterX, blockSizeX, maxTiles, true);
    }
    
    // Record the positions the neighbor list is based on.
    
    for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0))
        oldPositions[i] = posq[i];
}

#endif