findInteractingBlocks.cl 31.2 KB
Newer Older
1
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
2
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
3
4
5
6

/**
 * Find a bounding box for the atoms in each block.
 */
7
8
__kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        __global const real4* restrict posq, __global real4* restrict blockCenter, __global real4* restrict blockBoundingBox, __global int* restrict rebuildNeighborList,
9
        __global real2* restrict blockSizeRange) {
10
    int index = get_global_id(0);
11
    int base = index*TILE_SIZE;
12
    real minSize = 1e38, maxSize = 0;
13
    while (base < numAtoms) {
14
        real4 pos = posq[base];
15
#ifdef USE_PERIODIC
16
        APPLY_PERIODIC_TO_POS(pos)
17
#endif
18
19
        real4 minPos = pos;
        real4 maxPos = pos;
20
        int last = min(base+TILE_SIZE, numAtoms);
21
22
23
        for (int i = base+1; i < last; i++) {
            pos = posq[i];
#ifdef USE_PERIODIC
24
            real4 center = 0.5f*(maxPos+minPos);
25
            APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)
26
27
28
29
#endif
            minPos = min(minPos, pos);
            maxPos = max(maxPos, pos);
        }
30
        real4 blockSize = 0.5f*(maxPos-minPos);
Peter Eastman's avatar
Peter Eastman committed
31
        real4 center = 0.5f*(maxPos+minPos);
32
        center.w = 0;
33
        for (int i = base; i < last; i++) {
Peter Eastman's avatar
Peter Eastman committed
34
35
36
37
38
            pos = posq[i];
            real4 delta = posq[i]-center;
#ifdef USE_PERIODIC
            APPLY_PERIODIC_TO_DELTA(delta)
#endif
39
            center.w = max(center.w, delta.x*delta.x+delta.y*delta.y+delta.z*delta.z);
Peter Eastman's avatar
Peter Eastman committed
40
        }
Peter Eastman's avatar
Peter Eastman committed
41
        center.w = sqrt(center.w);
42
        blockBoundingBox[index] = blockSize;
Peter Eastman's avatar
Peter Eastman committed
43
        blockCenter[index] = center;
44
45
46
        real totalSize = blockSize.x+blockSize.y+blockSize.z;
        minSize = min(minSize, totalSize);
        maxSize = max(maxSize, totalSize);
47
        index += get_global_size(0);
48
        base = index*TILE_SIZE;
49
    }
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65

    // Record the range of sizes seen by threads in this block.

    __local real minBuffer[64], maxBuffer[64];
    minBuffer[get_local_id(0)] = minSize;
    maxBuffer[get_local_id(0)] = maxSize;
    barrier(CLK_LOCAL_MEM_FENCE);
    for (int step = 1; step < 64; step *= 2) {
        if (get_local_id(0)+step < 64 && get_local_id(0)%(2*step) == 0) {
            minBuffer[get_local_id(0)] = min(minBuffer[get_local_id(0)], minBuffer[get_local_id(0)+step]);
            maxBuffer[get_local_id(0)] = max(maxBuffer[get_local_id(0)], maxBuffer[get_local_id(0)+step]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (get_local_id(0) == 0)
        blockSizeRange[get_group_id(0)] = make_real2(minBuffer[0], maxBuffer[0]);
66
    if (get_global_id(0) == 0)
67
        rebuildNeighborList[0] = 0;
68
}
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
__kernel void computeSortKeys(__global const real4* restrict blockBoundingBox, __global unsigned int* restrict sortedBlocks, __global real2* restrict blockSizeRange, int numSizes) {
    // Find the total range of sizes recorded by all blocks.

    __local real2 sizeRange;
    if (get_local_id(0) == 0) {
        sizeRange = blockSizeRange[0];
        for (int i = 1; i < numSizes; i++) {
            real2 size = blockSizeRange[i];
            sizeRange.x = min(sizeRange.x, size.x);
            sizeRange.y = max(sizeRange.y, size.y);
        }
        sizeRange.x = LOG(sizeRange.x);
        sizeRange.y = LOG(sizeRange.y);
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Sort keys store the bin in the high order part and the block in the low
    // order part.

    int numSizeBins = 20;
    real scale = numSizeBins/(sizeRange.y-sizeRange.x);
    for (unsigned int i = get_global_id(0); i < NUM_BLOCKS; i += get_global_size(0)) {
        real4 box = blockBoundingBox[i];
        real size = LOG(box.x+box.y+box.z);
        int bin = (size-sizeRange.x)*scale;
        bin = max(0, min(bin, numSizeBins-1));
        sortedBlocks[i] = (((unsigned int) bin)<<BIN_SHIFT) + i;
    }
}
98
99

/**
100
 * Sort the data about bounding boxes so it can be accessed more efficiently in the next kernel.
101
 */
102
__kernel void sortBoxData(__global const unsigned int* restrict sortedBlocks, __global const real4* restrict blockCenter,
103
104
        __global const real4* restrict blockBoundingBox, __global real4* restrict sortedBlockCenter,
        __global real4* restrict sortedBlockBoundingBox, __global const real4* restrict posq, __global const real4* restrict oldPositions,
105
106
107
108
109
110
        __global unsigned int* restrict interactionCount, __global int* restrict rebuildNeighborList, int forceRebuild
#ifdef USE_LARGE_BLOCKS
        , __global real4* restrict largeBlockCenter, __global real4* restrict largeBlockBoundingBox, real4 periodicBoxSize,
        real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
#endif
        ) {
111
    for (int i = get_global_id(0); i < NUM_BLOCKS; i += get_global_size(0)) {
112
        unsigned int index = sortedBlocks[i] & BLOCK_INDEX_MASK;
113
114
115
        sortedBlockCenter[i] = blockCenter[index];
        sortedBlockBoundingBox[i] = blockBoundingBox[index];
    }
116
117
118
119
#ifdef USE_LARGE_BLOCKS
    // Compute the sizes of large blocks (composed of 32 regular blocks) starting from each block.

    for (int i = get_global_id(0); i < NUM_BLOCKS; i += get_global_size(0)) {
120
121
122
        unsigned int index = sortedBlocks[i] & BLOCK_INDEX_MASK;
        real4 minPos = blockCenter[index]-blockBoundingBox[index];
        real4 maxPos = blockCenter[index]+blockBoundingBox[index];
123
124
        int last = min(i+32, NUM_BLOCKS);
        for (int j = i+1; j < last; j++) {
125
126
127
            index = sortedBlocks[j] & BLOCK_INDEX_MASK;
            real4 blockPos = blockCenter[index];
            real4 width = blockBoundingBox[index];
128
129
130
131
132
133
134
135
136
137
138
#ifdef USE_PERIODIC
            real4 center = 0.5f*(maxPos+minPos);
            APPLY_PERIODIC_TO_POS_WITH_CENTER(blockPos, center)
#endif
            minPos = min(minPos, blockPos-width);
            maxPos = max(maxPos, blockPos+width);
        }
        largeBlockCenter[i] = 0.5f*(maxPos+minPos);
        largeBlockBoundingBox[i] = 0.5f*(maxPos-minPos);
    }
#endif
139
    // Also check whether any atom has moved enough so that we really need to rebuild the neighbor list.
140

141
    bool rebuild = forceRebuild;
142
143
144
145
146
147
148
149
150
151
152
    for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) {
        real4 delta = oldPositions[i]-posq[i];
        if (delta.x*delta.x + delta.y*delta.y + delta.z*delta.z > 0.25f*PADDING*PADDING)
            rebuild = true;
    }
    if (rebuild) {
        rebuildNeighborList[0] = 1;
        interactionCount[0] = 0;
    }
}

peastman's avatar
peastman committed
153
154
155
156
#if SIMD_WIDTH <= 32

#define BUFFER_SIZE 256

157
158
__kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        __global unsigned int* restrict interactionCount, __global int* restrict interactingTiles, __global unsigned int* restrict interactingAtoms,
159
        __global const real4* restrict posq, unsigned int maxTiles, unsigned int startBlockIndex, unsigned int numBlocks, __global unsigned int* restrict sortedBlocks,
160
        __global const real4* restrict sortedBlockCenter, __global const real4* restrict sortedBlockBoundingBox,
161
        __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __global real4* restrict oldPositions,
162
163
164
165
166
        __global const int* restrict rebuildNeighborList
#ifdef USE_LARGE_BLOCKS
        , __global real4* restrict largeBlockCenter, __global real4* restrict largeBlockBoundingBox
#endif
        ) {
167
168
169
170
171
172
173
174
175
176
177
178

    if (rebuildNeighborList[0] == 0)
        return; // The neighbor list doesn't need to be rebuilt.

    const int indexInWarp = get_local_id(0)%32;
    const int warpStart = get_local_id(0)-indexInWarp;
    const int totalWarps = get_global_size(0)/32;
    const int warpIndex = get_global_id(0)/32;
    const int warpMask = (1<<indexInWarp)-1;
    __local int workgroupBuffer[BUFFER_SIZE*(GROUP_SIZE/32)];
    __local int warpExclusions[MAX_EXCLUSIONS*(GROUP_SIZE/32)];
    __local real3 posBuffer[GROUP_SIZE];
179
    __local volatile unsigned int workgroupTileIndex[GROUP_SIZE/32];
180
    __local bool includeBlockFlags[GROUP_SIZE];
Peter Eastman's avatar
Peter Eastman committed
181
    __local volatile short2 atomCountBuffer[GROUP_SIZE];
182
183
    __local int* buffer = workgroupBuffer+BUFFER_SIZE*(warpStart/32);
    __local int* exclusionsForX = warpExclusions+MAX_EXCLUSIONS*(warpStart/32);
184
    __local volatile unsigned int* tileStartIndex = workgroupTileIndex+(warpStart/32);
185
186
187
#ifdef USE_LARGE_BLOCKS
    __local bool largeBlockFlags[GROUP_SIZE];
#endif
188
189

    // Loop over blocks.
190

191
192
193
    for (int block1 = startBlockIndex+warpIndex; block1 < startBlockIndex+numBlocks; block1 += totalWarps) {
        // Load data for this block.  Note that all threads in a warp are processing the same block.
        
194
        int x = sortedBlocks[block1] & BLOCK_INDEX_MASK;
195
196
197
198
199
200
201
202
203
204
205
206
        real4 blockCenterX = sortedBlockCenter[block1];
        real4 blockSizeX = sortedBlockBoundingBox[block1];
        int neighborsInBuffer = 0;
        real3 pos1 = posq[x*TILE_SIZE+indexInWarp].xyz;
#ifdef USE_PERIODIC
        const bool singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= PADDED_CUTOFF &&
                                         0.5f*periodicBoxSize.y-blockSizeX.y >= PADDED_CUTOFF &&
                                         0.5f*periodicBoxSize.z-blockSizeX.z >= PADDED_CUTOFF);
        if (singlePeriodicCopy) {
            // The box is small enough that we can just translate all the atoms into a single periodic
            // box, then skip having to apply periodic boundary conditions later.
            
207
            APPLY_PERIODIC_TO_POS_WITH_CENTER(pos1, blockCenterX)
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
        }
#endif
        posBuffer[get_local_id(0)] = pos1;

        // Load exclusion data for block x.
        
        const int exclusionStart = exclusionRowIndices[x];
        const int exclusionEnd = exclusionRowIndices[x+1];
        const int numExclusions = exclusionEnd-exclusionStart;
        for (int j = indexInWarp; j < numExclusions; j += 32)
            exclusionsForX[j] = exclusionIndices[exclusionStart+j];
        if (MAX_EXCLUSIONS > 32)
            barrier(CLK_LOCAL_MEM_FENCE);
        else
            SYNC_WARPS;
        
        // Loop over atom blocks to search for neighbors.  The threads in a warp compare block1 against 32
        // other blocks in parallel.

227
228
229
#ifdef USE_LARGE_BLOCKS
        int loadedLargeBlocks = 0;
#endif
230
        for (int block2Base = block1+1; block2Base < NUM_BLOCKS; block2Base += 32) {
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
#ifdef USE_LARGE_BLOCKS
            if (loadedLargeBlocks == 0) {
                // Check the next set of large blocks.

                int largeBlockIndex = block2Base + 32*indexInWarp;
                bool includeLargeBlock = false;
                if (largeBlockIndex < NUM_BLOCKS) {
                    real4 largeCenter = largeBlockCenter[largeBlockIndex];
                    real4 largeSize = largeBlockBoundingBox[largeBlockIndex];
                    real4 blockDelta = blockCenterX-largeCenter;
#ifdef USE_PERIODIC
                    APPLY_PERIODIC_TO_DELTA(blockDelta)
#endif
                    blockDelta.x = max((real) 0, fabs(blockDelta.x)-blockSizeX.x-largeSize.x);
                    blockDelta.y = max((real) 0, fabs(blockDelta.y)-blockSizeX.y-largeSize.y);
                    blockDelta.z = max((real) 0, fabs(blockDelta.z)-blockSizeX.z-largeSize.z);
                    includeLargeBlock = (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < PADDED_CUTOFF_SQUARED);
                }
                largeBlockFlags[get_local_id(0)] = includeLargeBlock;
                loadedLargeBlocks = 32;
                SYNC_WARPS;
            }
            if (!largeBlockFlags[warpStart+32-(loadedLargeBlocks--)]) {
                // None of the next 32 blocks interact with block 1.

                continue;
            }
#endif
259
260
261
            int block2 = block2Base+indexInWarp;
            bool includeBlock2 = (block2 < NUM_BLOCKS);
            if (includeBlock2) {
peastman's avatar
peastman committed
262
263
                real4 blockCenterY = sortedBlockCenter[block2];
                real4 blockSizeY = sortedBlockBoundingBox[block2];
264
265
                real4 blockDelta = blockCenterX-blockCenterY;
#ifdef USE_PERIODIC
266
                APPLY_PERIODIC_TO_DELTA(blockDelta)
267
#endif
268
                includeBlock2 &= (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < (PADDED_CUTOFF+blockCenterX.w+blockCenterY.w)*(PADDED_CUTOFF+blockCenterX.w+blockCenterY.w));
269
270
271
                blockDelta.x = max((real) 0, fabs(blockDelta.x)-blockSizeX.x-blockSizeY.x);
                blockDelta.y = max((real) 0, fabs(blockDelta.y)-blockSizeX.y-blockSizeY.y);
                blockDelta.z = max((real) 0, fabs(blockDelta.z)-blockSizeX.z-blockSizeY.z);
272
                includeBlock2 &= (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < PADDED_CUTOFF_SQUARED);
273
274
275
276
277
278
279
#ifdef TRICLINIC
                // The calculation to find the nearest periodic copy is only guaranteed to work if the nearest copy is less than half a box width away.
                // If there's any possibility we might have missed it, do a detailed check.

                if (periodicBoxSize.z/2-blockSizeX.z-blockSizeY.z < PADDED_CUTOFF || periodicBoxSize.y/2-blockSizeX.y-blockSizeY.y < PADDED_CUTOFF)
                    includeBlock2 = true;
#endif
280
                if (includeBlock2) {
281
                    int y = sortedBlocks[block2] & BLOCK_INDEX_MASK;
282
283
284
285
286
287
288
289
290
291
                    for (int k = 0; k < numExclusions; k++)
                        includeBlock2 &= (exclusionsForX[k] != y);
                }
            }
            
            // Loop over any blocks we identified as potentially containing neighbors.
            
            includeBlockFlags[get_local_id(0)] = includeBlock2;
            SYNC_WARPS;
            for (int i = 0; i < TILE_SIZE; i++) {
292
293
294
                while (i < TILE_SIZE && !includeBlockFlags[warpStart+i])
                    i++;
                if (i < TILE_SIZE) {
295
                    int y = sortedBlocks[block2Base+i] & BLOCK_INDEX_MASK;
296
297
298

                    // Check each atom in block Y for interactions.

299
                    int atom2 = y*TILE_SIZE+indexInWarp;
300
301
302
                    real3 pos2 = posq[atom2].xyz;
#ifdef USE_PERIODIC
                    if (singlePeriodicCopy)
303
                        APPLY_PERIODIC_TO_POS_WITH_CENTER(pos2, blockCenterX)
304
305
306
307
308
309
310
#endif
                    bool interacts = false;
                    if (atom2 < NUM_ATOMS) {
#ifdef USE_PERIODIC
                        if (!singlePeriodicCopy) {
                            for (int j = 0; j < TILE_SIZE; j++) {
                                real3 delta = pos2-posBuffer[warpStart+j];
311
                                APPLY_PERIODIC_TO_DELTA(delta)
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
                                interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
                            }
                        }
                        else {
#endif
                            for (int j = 0; j < TILE_SIZE; j++) {
                                real3 delta = pos2-posBuffer[warpStart+j];
                                interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
                            }
#ifdef USE_PERIODIC
                        }
#endif
                    }
                    
                    // Do a prefix sum to compact the list of atoms.

                    atomCountBuffer[get_local_id(0)].x = (interacts ? 1 : 0);
                    SYNC_WARPS;
                    int whichBuffer = 0;
                    for (int offset = 1; offset < TILE_SIZE; offset *= 2) {
                        if (whichBuffer == 0)
peastman's avatar
peastman committed
333
                            atomCountBuffer[get_local_id(0)].y = (indexInWarp < offset ? atomCountBuffer[get_local_id(0)].x : atomCountBuffer[get_local_id(0)].x+atomCountBuffer[get_local_id(0)-offset].x);
334
                        else
peastman's avatar
peastman committed
335
                            atomCountBuffer[get_local_id(0)].x = (indexInWarp < offset ? atomCountBuffer[get_local_id(0)].y : atomCountBuffer[get_local_id(0)].y+atomCountBuffer[get_local_id(0)-offset].y);
336
337
338
339
340
341
342
                        whichBuffer = 1-whichBuffer;
                        SYNC_WARPS;
                    }
                    
                    // Add any interacting atoms to the buffer.

                    if (interacts)
peastman's avatar
peastman committed
343
344
                        buffer[neighborsInBuffer+atomCountBuffer[get_local_id(0)].y-1] = atom2;
                    neighborsInBuffer += atomCountBuffer[warpStart+TILE_SIZE-1].y;
345
346
347
                    if (neighborsInBuffer > BUFFER_SIZE-TILE_SIZE) {
                        // Store the new tiles to memory.

348
                        unsigned int tilesToStore = neighborsInBuffer/TILE_SIZE;
349
                        if (indexInWarp == 0)
350
                            *tileStartIndex = ATOMIC_ADD(interactionCount, tilesToStore);
peastman's avatar
peastman committed
351
                        SYNC_WARPS;
352
                        unsigned int newTileStartIndex = *tileStartIndex;
353
                        if (newTileStartIndex+tilesToStore <= maxTiles) {
354
355
356
357
358
                            if (indexInWarp < tilesToStore)
                                interactingTiles[newTileStartIndex+indexInWarp] = x;
                            for (int j = 0; j < tilesToStore; j++)
                                interactingAtoms[(newTileStartIndex+j)*TILE_SIZE+indexInWarp] = buffer[indexInWarp+j*TILE_SIZE];
                        }
359
360
                        if (indexInWarp+TILE_SIZE*tilesToStore < BUFFER_SIZE)
                            buffer[indexInWarp] = buffer[indexInWarp+TILE_SIZE*tilesToStore];
361
                        neighborsInBuffer -= TILE_SIZE*tilesToStore;
peastman's avatar
peastman committed
362
                   }
363
                }
364
365
366
                else {
                    SYNC_WARPS;
                }
367
368
369
370
371
372
            }
        }
        
        // If we have a partially filled buffer,  store it to memory.
        
        if (neighborsInBuffer > 0) {
373
            unsigned int tilesToStore = (neighborsInBuffer+TILE_SIZE-1)/TILE_SIZE;
374
            if (indexInWarp == 0)
375
                *tileStartIndex = ATOMIC_ADD(interactionCount, tilesToStore);
peastman's avatar
peastman committed
376
            SYNC_WARPS;
377
            unsigned int newTileStartIndex = *tileStartIndex;
378
            if (newTileStartIndex+tilesToStore <= maxTiles) {
379
380
381
382
383
384
385
386
387
388
389
390
                if (indexInWarp < tilesToStore)
                    interactingTiles[newTileStartIndex+indexInWarp] = x;
                for (int j = 0; j < tilesToStore; j++)
                    interactingAtoms[(newTileStartIndex+j)*TILE_SIZE+indexInWarp] = (indexInWarp+j*TILE_SIZE < neighborsInBuffer ? buffer[indexInWarp+j*TILE_SIZE] : NUM_ATOMS);
            }
        }
    }
    
    // Record the positions the neighbor list is based on.
    
    for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0))
        oldPositions[i] = posq[i];
peastman's avatar
peastman committed
391
392
393
394
395
396
397
398
}

#else
// This is the old implementation of finding interacting blocks.  It is quite a bit more complicated,
// and slower on most GPUs.  On AMD, however, it is faster, so we keep it around to use there.

#define BUFFER_SIZE BUFFER_GROUPS*GROUP_SIZE
#define WARP_SIZE 32
399
#define INVALID -1
peastman's avatar
peastman committed
400
401
402
403

/**
 * Perform a parallel prefix sum over an array.  The input values are all assumed to be 0 or 1.
 */
404
void prefixSum(__local int* sum, __local int2* temp) {
peastman's avatar
peastman committed
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
    for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
        temp[i].x = sum[i];
    barrier(CLK_LOCAL_MEM_FENCE);
    int whichBuffer = 0;
    for (int offset = 1; offset < BUFFER_SIZE; offset *= 2) {
        if (whichBuffer == 0)
            for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
                temp[i].y = (i < offset ? temp[i].x : temp[i].x+temp[i-offset].x);
        else
            for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
                temp[i].x = (i < offset ? temp[i].y : temp[i].y+temp[i-offset].y);
        whichBuffer = 1-whichBuffer;
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (whichBuffer == 0)
        for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
            sum[i] = temp[i].x;
    else
        for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
            sum[i] = temp[i].y;
    barrier(CLK_LOCAL_MEM_FENCE);
}

/**
 * This is called by findBlocksWithInteractions().  It compacts the list of blocks, identifies interactions
 * in them, and writes the result to global memory.
 */
432
void storeInteractionData(int x, __local int* buffer, __local int* sum, __local int2* temp, __local int* atoms, __local int* numAtoms,
peastman's avatar
peastman committed
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
            __local int* baseIndex, __global unsigned int* interactionCount, __global int* interactingTiles, __global unsigned int* interactingAtoms, real4 periodicBoxSize,
            real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, __global const real4* posq, __local real4* posBuffer,
            real4 blockCenterX, real4 blockSizeX, unsigned int maxTiles, bool finish) {
    const bool singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= PADDED_CUTOFF &&
                                     0.5f*periodicBoxSize.y-blockSizeX.y >= PADDED_CUTOFF &&
                                     0.5f*periodicBoxSize.z-blockSizeX.z >= PADDED_CUTOFF);
    if (get_local_id(0) < TILE_SIZE) {
        real4 pos = posq[x*TILE_SIZE+get_local_id(0)];
#ifdef USE_PERIODIC
        if (singlePeriodicCopy) {
            // The box is small enough that we can just translate all the atoms into a single periodic
            // box, then skip having to apply periodic boundary conditions later.
            
            APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, blockCenterX)
        }
#endif
        posBuffer[get_local_id(0)] = pos;
    }
    
    // The buffer is full, so we need to compact it and write out results.  Start by doing a parallel prefix sum.

    barrier(CLK_LOCAL_MEM_FENCE);
    for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
        sum[i] = (buffer[i] == INVALID ? 0 : 1);
    barrier(CLK_LOCAL_MEM_FENCE);
    prefixSum(sum, temp);
    int numValid = sum[BUFFER_SIZE-1];

    // Compact the buffer.

    for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
        if (buffer[i] != INVALID)
            temp[sum[i]-1].x = buffer[i];
    barrier(CLK_LOCAL_MEM_FENCE);
    for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
        buffer[i] = temp[i].x;
    barrier(CLK_LOCAL_MEM_FENCE);

    // Loop over the tiles and find specific interactions in them.

    const int indexInWarp = get_local_id(0)%WARP_SIZE;
    for (int base = 0; base < numValid; base += BUFFER_SIZE/WARP_SIZE) {
        for (int i = get_local_id(0)/WARP_SIZE; i < BUFFER_SIZE/WARP_SIZE && base+i < numValid; i += GROUP_SIZE/WARP_SIZE) {
            // Check each atom in block Y for interactions.
            
            real4 pos = posq[buffer[base+i]*TILE_SIZE+indexInWarp];
#ifdef USE_PERIODIC
            if (singlePeriodicCopy)
                APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, blockCenterX)
#endif
            bool interacts = false;
#ifdef USE_PERIODIC
            if (!singlePeriodicCopy) {
                for (int j = 0; j < TILE_SIZE; j++) {
                    real4 delta = pos-posBuffer[j];
                    APPLY_PERIODIC_TO_DELTA(delta)
                    interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
                }
            }
            else {
#endif
                for (int j = 0; j < TILE_SIZE; j++) {
                    real4 delta = pos-posBuffer[j];
                    interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
                }
#ifdef USE_PERIODIC
            }
#endif
            sum[i*WARP_SIZE+indexInWarp] = (interacts ? 1 : 0);
        }
        for (int i = numValid-base+get_local_id(0)/WARP_SIZE; i < BUFFER_SIZE/WARP_SIZE; i += GROUP_SIZE/WARP_SIZE)
            sum[i*WARP_SIZE+indexInWarp] = 0;

        // Compact the list of atoms.

        barrier(CLK_LOCAL_MEM_FENCE);
        prefixSum(sum, temp);
        for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
            if (sum[i] != (i == 0 ? 0 : sum[i-1]))
                atoms[*numAtoms+sum[i]-1] = buffer[base+i/WARP_SIZE]*TILE_SIZE+indexInWarp;

        // Store them to global memory.

        int atomsToStore = *numAtoms+sum[BUFFER_SIZE-1];
        bool storePartialTile = (finish && base >= numValid-BUFFER_SIZE/WARP_SIZE);
        int tilesToStore = (storePartialTile ? (atomsToStore+TILE_SIZE-1)/TILE_SIZE : atomsToStore/TILE_SIZE);
        if (tilesToStore > 0) {
            if (get_local_id(0) == 0)
521
                *baseIndex = ATOMIC_ADD(interactionCount, tilesToStore);
peastman's avatar
peastman committed
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
            barrier(CLK_LOCAL_MEM_FENCE);
            if (get_local_id(0) == 0)
                *numAtoms = atomsToStore-tilesToStore*TILE_SIZE;
            if (*baseIndex+tilesToStore <= maxTiles) {
                if (get_local_id(0) < tilesToStore)
                    interactingTiles[*baseIndex+get_local_id(0)] = x;
                for (int i = get_local_id(0); i < tilesToStore*TILE_SIZE; i += get_local_size(0))
                    interactingAtoms[*baseIndex*TILE_SIZE+i] = (i < atomsToStore ? atoms[i] : NUM_ATOMS);
            }
        }
        else {
            barrier(CLK_LOCAL_MEM_FENCE);
            if (get_local_id(0) == 0)
                *numAtoms += sum[BUFFER_SIZE-1];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        if (get_local_id(0) < *numAtoms && !storePartialTile)
            atoms[get_local_id(0)] = atoms[tilesToStore*TILE_SIZE+get_local_id(0)];
    }

    if (numValid == 0 && *numAtoms > 0 && finish) {
        // We didn't have any more tiles to process, but there were some atoms left over from a
        // previous call to this function.  Save them now.

        if (get_local_id(0) == 0)
547
            *baseIndex = ATOMIC_ADD(interactionCount, 1);
peastman's avatar
peastman committed
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
        barrier(CLK_LOCAL_MEM_FENCE);
        if (*baseIndex < maxTiles) {
            if (get_local_id(0) == 0)
                interactingTiles[*baseIndex] = x;
            if (get_local_id(0) < TILE_SIZE)
                interactingAtoms[*baseIndex*TILE_SIZE+get_local_id(0)] = (get_local_id(0) < *numAtoms ? atoms[get_local_id(0)] : NUM_ATOMS);
        }
    }

    // Reset the buffer for processing more tiles.

    for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
        buffer[i] = INVALID;
    barrier(CLK_LOCAL_MEM_FENCE);
}

/**
 * Compare the bounding boxes for each pair of blocks.  If they are sufficiently far apart,
 * mark them as non-interacting.
 */
__kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        __global unsigned int* restrict interactionCount, __global int* restrict interactingTiles, __global unsigned int* restrict interactingAtoms,
        __global const real4* restrict posq, unsigned int maxTiles, unsigned int startBlockIndex, unsigned int numBlocks, __global real2* restrict sortedBlocks,
        __global const real4* restrict sortedBlockCenter, __global const real4* restrict sortedBlockBoundingBox,
        __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __global real4* restrict oldPositions,
        __global const int* restrict rebuildNeighborList) {
574
575
576
    __local int buffer[BUFFER_SIZE];
    __local int sum[BUFFER_SIZE];
    __local int2 temp[BUFFER_SIZE];
peastman's avatar
peastman committed
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
    __local int atoms[BUFFER_SIZE+TILE_SIZE];
    __local real4 posBuffer[TILE_SIZE];
    __local int exclusionsForX[MAX_EXCLUSIONS];
    __local int bufferFull;
    __local int globalIndex;
    __local int numAtoms;
#ifdef AMD_ATOMIC_WORK_AROUND
    // Do a byte write to force all memory accesses to interactionCount to use the complete path.
    // This avoids the atomic access from causing all word accesses to other buffers from using the slow complete path.
    // The IF actually causes the write to never be executed, its presence is all that is needed.
    // AMD APP SDK 2.4 has this problem.
    if (get_global_id(0) == get_local_id(0)+1)
        ((__global char*)interactionCount)[sizeof(unsigned int)+1] = 0;
#endif

    if (rebuildNeighborList[0] == 0)
        return; // The neighbor list doesn't need to be rebuilt.

    int valuesInBuffer = 0;
    if (get_local_id(0) == 0)
        bufferFull = false;
    for (int i = 0; i < BUFFER_GROUPS; ++i)
        buffer[i*GROUP_SIZE+get_local_id(0)] = INVALID;
    barrier(CLK_LOCAL_MEM_FENCE);
    
    // Loop over blocks sorted by size.
    
    for (int i = startBlockIndex+get_group_id(0); i < startBlockIndex+numBlocks; i += get_num_groups(0)) {
        if (get_local_id(0) == get_local_size(0)-1)
            numAtoms = 0;
        real2 sortedKey = sortedBlocks[i];
        int x = (int) sortedKey.y;
        real4 blockCenterX = sortedBlockCenter[i];
        real4 blockSizeX = sortedBlockBoundingBox[i];

        // Load exclusion data for block x.
        
        const int exclusionStart = exclusionRowIndices[x];
        const int exclusionEnd = exclusionRowIndices[x+1];
        const int numExclusions = exclusionEnd-exclusionStart;
        for (int j = get_local_id(0); j < numExclusions; j += get_local_size(0))
            exclusionsForX[j] = exclusionIndices[exclusionStart+j];
        barrier(CLK_LOCAL_MEM_FENCE);
        
        // Compare it to other blocks after this one in sorted order.
622

peastman's avatar
peastman committed
623
624
625
626
627
        for (int base = i+1; base < NUM_BLOCKS; base += get_local_size(0)) {
            int j = base+get_local_id(0);
            real2 sortedKey2 = (j < NUM_BLOCKS ? sortedBlocks[j] : (real2) 0);
            real4 blockCenterY = (j < NUM_BLOCKS ? sortedBlockCenter[j] : (real4) 0);
            real4 blockSizeY = (j < NUM_BLOCKS ? sortedBlockBoundingBox[j] : (real4) 0);
628
            int y = (int) sortedKey2.y;
peastman's avatar
peastman committed
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
            real4 delta = blockCenterX-blockCenterY;
#ifdef USE_PERIODIC
            APPLY_PERIODIC_TO_DELTA(delta)
#endif
            delta.x = max((real) 0, fabs(delta.x)-blockSizeX.x-blockSizeY.x);
            delta.y = max((real) 0, fabs(delta.y)-blockSizeX.y-blockSizeY.y);
            delta.z = max((real) 0, fabs(delta.z)-blockSizeX.z-blockSizeY.z);
            bool hasExclusions = false;
            for (int k = 0; k < numExclusions; k++)
                hasExclusions |= (exclusionsForX[k] == y);
            if (j < NUM_BLOCKS && delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED && !hasExclusions) {
                // Add this tile to the buffer.

                int bufferIndex = valuesInBuffer*GROUP_SIZE+get_local_id(0);
                buffer[bufferIndex] = y;
                valuesInBuffer++;
                if (!bufferFull && valuesInBuffer == BUFFER_GROUPS)
                    bufferFull = true;
            }
            barrier(CLK_LOCAL_MEM_FENCE);
            if (bufferFull) {
                storeInteractionData(x, buffer, sum, temp, atoms, &numAtoms, &globalIndex, interactionCount, interactingTiles, interactingAtoms, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ, posq, posBuffer, blockCenterX, blockSizeX, maxTiles, false);
                valuesInBuffer = 0;
                if (get_local_id(0) == 0)
                    bufferFull = false;
            }
            barrier(CLK_LOCAL_MEM_FENCE);
        }
        storeInteractionData(x, buffer, sum, temp, atoms, &numAtoms, &globalIndex, interactionCount, interactingTiles, interactingAtoms, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ, posq, posBuffer, blockCenterX, blockSizeX, maxTiles, true);
    }
    
    // Record the positions the neighbor list is based on.
    
    for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0))
        oldPositions[i] = posq[i];
}

#endif