findInteractingBlocks.cl 29.1 KB
Newer Older
1
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
2
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
3
4
5
6

/**
 * Find a bounding box for the atoms in each block.
 */
7
8
__kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        __global const real4* restrict posq, __global real4* restrict blockCenter, __global real4* restrict blockBoundingBox, __global int* restrict rebuildNeighborList,
9
        __global real2* restrict sortedBlocks) {
10
    int index = get_global_id(0);
11
    int base = index*TILE_SIZE;
12
    while (base < numAtoms) {
13
        real4 pos = posq[base];
14
#ifdef USE_PERIODIC
15
        APPLY_PERIODIC_TO_POS(pos)
16
#endif
17
18
        real4 minPos = pos;
        real4 maxPos = pos;
19
        int last = min(base+TILE_SIZE, numAtoms);
20
21
22
        for (int i = base+1; i < last; i++) {
            pos = posq[i];
#ifdef USE_PERIODIC
23
            real4 center = 0.5f*(maxPos+minPos);
24
            APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)
25
26
27
28
#endif
            minPos = min(minPos, pos);
            maxPos = max(maxPos, pos);
        }
29
        real4 blockSize = 0.5f*(maxPos-minPos);
Peter Eastman's avatar
Peter Eastman committed
30
        real4 center = 0.5f*(maxPos+minPos);
31
        center.w = 0;
32
        for (int i = base; i < last; i++) {
Peter Eastman's avatar
Peter Eastman committed
33
34
35
36
37
            pos = posq[i];
            real4 delta = posq[i]-center;
#ifdef USE_PERIODIC
            APPLY_PERIODIC_TO_DELTA(delta)
#endif
38
            center.w = max(center.w, delta.x*delta.x+delta.y*delta.y+delta.z*delta.z);
Peter Eastman's avatar
Peter Eastman committed
39
        }
Peter Eastman's avatar
Peter Eastman committed
40
        center.w = sqrt(center.w);
41
        blockBoundingBox[index] = blockSize;
Peter Eastman's avatar
Peter Eastman committed
42
        blockCenter[index] = center;
43
        sortedBlocks[index] = (real2) (blockSize.x+blockSize.y+blockSize.z, index);
44
        index += get_global_size(0);
45
        base = index*TILE_SIZE;
46
    }
47
    if (get_global_id(0) == 0)
48
        rebuildNeighborList[0] = 0;
49
50
51
}

/**
52
 * Sort the data about bounding boxes so it can be accessed more efficiently in the next kernel.
53
 */
54
55
56
__kernel void sortBoxData(__global const real2* restrict sortedBlock, __global const real4* restrict blockCenter,
        __global const real4* restrict blockBoundingBox, __global real4* restrict sortedBlockCenter,
        __global real4* restrict sortedBlockBoundingBox, __global const real4* restrict posq, __global const real4* restrict oldPositions,
57
58
59
60
61
62
        __global unsigned int* restrict interactionCount, __global int* restrict rebuildNeighborList, int forceRebuild
#ifdef USE_LARGE_BLOCKS
        , __global real4* restrict largeBlockCenter, __global real4* restrict largeBlockBoundingBox, real4 periodicBoxSize,
        real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
#endif
        ) {
63
64
65
66
67
    for (int i = get_global_id(0); i < NUM_BLOCKS; i += get_global_size(0)) {
        int index = (int) sortedBlock[i].y;
        sortedBlockCenter[i] = blockCenter[index];
        sortedBlockBoundingBox[i] = blockBoundingBox[index];
    }
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#ifdef USE_LARGE_BLOCKS
    // Compute the sizes of large blocks (composed of 32 regular blocks) starting from each block.

    for (int i = get_global_id(0); i < NUM_BLOCKS; i += get_global_size(0)) {
        real4 minPos = blockCenter[i]-blockBoundingBox[i];
        real4 maxPos = blockCenter[i]+blockBoundingBox[i];
        int last = min(i+32, NUM_BLOCKS);
        for (int j = i+1; j < last; j++) {
            real4 blockPos = blockCenter[j];
            real4 width = blockBoundingBox[j];
#ifdef USE_PERIODIC
            real4 center = 0.5f*(maxPos+minPos);
            APPLY_PERIODIC_TO_POS_WITH_CENTER(blockPos, center)
#endif
            minPos = min(minPos, blockPos-width);
            maxPos = max(maxPos, blockPos+width);
        }
        largeBlockCenter[i] = 0.5f*(maxPos+minPos);
        largeBlockBoundingBox[i] = 0.5f*(maxPos-minPos);
    }
#endif
89
    // Also check whether any atom has moved enough so that we really need to rebuild the neighbor list.
90

91
    bool rebuild = forceRebuild;
92
93
94
95
96
97
98
99
100
101
102
    for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) {
        real4 delta = oldPositions[i]-posq[i];
        if (delta.x*delta.x + delta.y*delta.y + delta.z*delta.z > 0.25f*PADDING*PADDING)
            rebuild = true;
    }
    if (rebuild) {
        rebuildNeighborList[0] = 1;
        interactionCount[0] = 0;
    }
}

peastman's avatar
peastman committed
103
104
105
106
#if SIMD_WIDTH <= 32

#define BUFFER_SIZE 256

107
108
109
110
__kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        __global unsigned int* restrict interactionCount, __global int* restrict interactingTiles, __global unsigned int* restrict interactingAtoms,
        __global const real4* restrict posq, unsigned int maxTiles, unsigned int startBlockIndex, unsigned int numBlocks, __global real2* restrict sortedBlocks,
        __global const real4* restrict sortedBlockCenter, __global const real4* restrict sortedBlockBoundingBox,
111
        __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __global real4* restrict oldPositions,
112
113
114
115
116
        __global const int* restrict rebuildNeighborList
#ifdef USE_LARGE_BLOCKS
        , __global real4* restrict largeBlockCenter, __global real4* restrict largeBlockBoundingBox
#endif
        ) {
117
118
119
120
121
122
123
124
125
126
127
128

    if (rebuildNeighborList[0] == 0)
        return; // The neighbor list doesn't need to be rebuilt.

    const int indexInWarp = get_local_id(0)%32;
    const int warpStart = get_local_id(0)-indexInWarp;
    const int totalWarps = get_global_size(0)/32;
    const int warpIndex = get_global_id(0)/32;
    const int warpMask = (1<<indexInWarp)-1;
    __local int workgroupBuffer[BUFFER_SIZE*(GROUP_SIZE/32)];
    __local int warpExclusions[MAX_EXCLUSIONS*(GROUP_SIZE/32)];
    __local real3 posBuffer[GROUP_SIZE];
129
    __local volatile unsigned int workgroupTileIndex[GROUP_SIZE/32];
130
    __local bool includeBlockFlags[GROUP_SIZE];
Peter Eastman's avatar
Peter Eastman committed
131
    __local volatile short2 atomCountBuffer[GROUP_SIZE];
132
133
    __local int* buffer = workgroupBuffer+BUFFER_SIZE*(warpStart/32);
    __local int* exclusionsForX = warpExclusions+MAX_EXCLUSIONS*(warpStart/32);
134
    __local volatile unsigned int* tileStartIndex = workgroupTileIndex+(warpStart/32);
135
136
137
#ifdef USE_LARGE_BLOCKS
    __local bool largeBlockFlags[GROUP_SIZE];
#endif
138
139

    // Loop over blocks.
140

141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
    for (int block1 = startBlockIndex+warpIndex; block1 < startBlockIndex+numBlocks; block1 += totalWarps) {
        // Load data for this block.  Note that all threads in a warp are processing the same block.
        
        real2 sortedKey = sortedBlocks[block1];
        int x = (int) sortedKey.y;
        real4 blockCenterX = sortedBlockCenter[block1];
        real4 blockSizeX = sortedBlockBoundingBox[block1];
        int neighborsInBuffer = 0;
        real3 pos1 = posq[x*TILE_SIZE+indexInWarp].xyz;
#ifdef USE_PERIODIC
        const bool singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= PADDED_CUTOFF &&
                                         0.5f*periodicBoxSize.y-blockSizeX.y >= PADDED_CUTOFF &&
                                         0.5f*periodicBoxSize.z-blockSizeX.z >= PADDED_CUTOFF);
        if (singlePeriodicCopy) {
            // The box is small enough that we can just translate all the atoms into a single periodic
            // box, then skip having to apply periodic boundary conditions later.
            
158
            APPLY_PERIODIC_TO_POS_WITH_CENTER(pos1, blockCenterX)
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
        }
#endif
        posBuffer[get_local_id(0)] = pos1;

        // Load exclusion data for block x.
        
        const int exclusionStart = exclusionRowIndices[x];
        const int exclusionEnd = exclusionRowIndices[x+1];
        const int numExclusions = exclusionEnd-exclusionStart;
        for (int j = indexInWarp; j < numExclusions; j += 32)
            exclusionsForX[j] = exclusionIndices[exclusionStart+j];
        if (MAX_EXCLUSIONS > 32)
            barrier(CLK_LOCAL_MEM_FENCE);
        else
            SYNC_WARPS;
        
        // Loop over atom blocks to search for neighbors.  The threads in a warp compare block1 against 32
        // other blocks in parallel.

178
179
180
#ifdef USE_LARGE_BLOCKS
        int loadedLargeBlocks = 0;
#endif
181
        for (int block2Base = block1+1; block2Base < NUM_BLOCKS; block2Base += 32) {
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
#ifdef USE_LARGE_BLOCKS
            if (loadedLargeBlocks == 0) {
                // Check the next set of large blocks.

                int largeBlockIndex = block2Base + 32*indexInWarp;
                bool includeLargeBlock = false;
                if (largeBlockIndex < NUM_BLOCKS) {
                    real4 largeCenter = largeBlockCenter[largeBlockIndex];
                    real4 largeSize = largeBlockBoundingBox[largeBlockIndex];
                    real4 blockDelta = blockCenterX-largeCenter;
#ifdef USE_PERIODIC
                    APPLY_PERIODIC_TO_DELTA(blockDelta)
#endif
                    blockDelta.x = max((real) 0, fabs(blockDelta.x)-blockSizeX.x-largeSize.x);
                    blockDelta.y = max((real) 0, fabs(blockDelta.y)-blockSizeX.y-largeSize.y);
                    blockDelta.z = max((real) 0, fabs(blockDelta.z)-blockSizeX.z-largeSize.z);
                    includeLargeBlock = (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < PADDED_CUTOFF_SQUARED);
                }
                largeBlockFlags[get_local_id(0)] = includeLargeBlock;
                loadedLargeBlocks = 32;
                SYNC_WARPS;
            }
            if (!largeBlockFlags[warpStart+32-(loadedLargeBlocks--)]) {
                // None of the next 32 blocks interact with block 1.

                continue;
            }
#endif
210
211
212
            int block2 = block2Base+indexInWarp;
            bool includeBlock2 = (block2 < NUM_BLOCKS);
            if (includeBlock2) {
peastman's avatar
peastman committed
213
214
                real4 blockCenterY = sortedBlockCenter[block2];
                real4 blockSizeY = sortedBlockBoundingBox[block2];
215
216
                real4 blockDelta = blockCenterX-blockCenterY;
#ifdef USE_PERIODIC
217
                APPLY_PERIODIC_TO_DELTA(blockDelta)
218
#endif
219
                includeBlock2 &= (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < (PADDED_CUTOFF+blockCenterX.w+blockCenterY.w)*(PADDED_CUTOFF+blockCenterX.w+blockCenterY.w));
220
221
222
                blockDelta.x = max((real) 0, fabs(blockDelta.x)-blockSizeX.x-blockSizeY.x);
                blockDelta.y = max((real) 0, fabs(blockDelta.y)-blockSizeX.y-blockSizeY.y);
                blockDelta.z = max((real) 0, fabs(blockDelta.z)-blockSizeX.z-blockSizeY.z);
223
                includeBlock2 &= (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < PADDED_CUTOFF_SQUARED);
224
225
226
227
228
229
230
#ifdef TRICLINIC
                // The calculation to find the nearest periodic copy is only guaranteed to work if the nearest copy is less than half a box width away.
                // If there's any possibility we might have missed it, do a detailed check.

                if (periodicBoxSize.z/2-blockSizeX.z-blockSizeY.z < PADDED_CUTOFF || periodicBoxSize.y/2-blockSizeX.y-blockSizeY.y < PADDED_CUTOFF)
                    includeBlock2 = true;
#endif
231
                if (includeBlock2) {
232
                    int y = (int) sortedBlocks[block2].y;
233
234
235
236
237
238
239
240
241
242
                    for (int k = 0; k < numExclusions; k++)
                        includeBlock2 &= (exclusionsForX[k] != y);
                }
            }
            
            // Loop over any blocks we identified as potentially containing neighbors.
            
            includeBlockFlags[get_local_id(0)] = includeBlock2;
            SYNC_WARPS;
            for (int i = 0; i < TILE_SIZE; i++) {
243
244
245
                while (i < TILE_SIZE && !includeBlockFlags[warpStart+i])
                    i++;
                if (i < TILE_SIZE) {
246
                    int y = (int) sortedBlocks[block2Base+i].y;
247
248
249

                    // Check each atom in block Y for interactions.

250
                    int atom2 = y*TILE_SIZE+indexInWarp;
251
252
253
                    real3 pos2 = posq[atom2].xyz;
#ifdef USE_PERIODIC
                    if (singlePeriodicCopy)
254
                        APPLY_PERIODIC_TO_POS_WITH_CENTER(pos2, blockCenterX)
255
256
257
258
259
260
261
#endif
                    bool interacts = false;
                    if (atom2 < NUM_ATOMS) {
#ifdef USE_PERIODIC
                        if (!singlePeriodicCopy) {
                            for (int j = 0; j < TILE_SIZE; j++) {
                                real3 delta = pos2-posBuffer[warpStart+j];
262
                                APPLY_PERIODIC_TO_DELTA(delta)
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
                                interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
                            }
                        }
                        else {
#endif
                            for (int j = 0; j < TILE_SIZE; j++) {
                                real3 delta = pos2-posBuffer[warpStart+j];
                                interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
                            }
#ifdef USE_PERIODIC
                        }
#endif
                    }
                    
                    // Do a prefix sum to compact the list of atoms.

                    atomCountBuffer[get_local_id(0)].x = (interacts ? 1 : 0);
                    SYNC_WARPS;
                    int whichBuffer = 0;
                    for (int offset = 1; offset < TILE_SIZE; offset *= 2) {
                        if (whichBuffer == 0)
peastman's avatar
peastman committed
284
                            atomCountBuffer[get_local_id(0)].y = (indexInWarp < offset ? atomCountBuffer[get_local_id(0)].x : atomCountBuffer[get_local_id(0)].x+atomCountBuffer[get_local_id(0)-offset].x);
285
                        else
peastman's avatar
peastman committed
286
                            atomCountBuffer[get_local_id(0)].x = (indexInWarp < offset ? atomCountBuffer[get_local_id(0)].y : atomCountBuffer[get_local_id(0)].y+atomCountBuffer[get_local_id(0)-offset].y);
287
288
289
290
291
292
293
                        whichBuffer = 1-whichBuffer;
                        SYNC_WARPS;
                    }
                    
                    // Add any interacting atoms to the buffer.

                    if (interacts)
peastman's avatar
peastman committed
294
295
                        buffer[neighborsInBuffer+atomCountBuffer[get_local_id(0)].y-1] = atom2;
                    neighborsInBuffer += atomCountBuffer[warpStart+TILE_SIZE-1].y;
296
297
298
                    if (neighborsInBuffer > BUFFER_SIZE-TILE_SIZE) {
                        // Store the new tiles to memory.

299
                        unsigned int tilesToStore = neighborsInBuffer/TILE_SIZE;
300
                        if (indexInWarp == 0)
301
                            *tileStartIndex = ATOMIC_ADD(interactionCount, tilesToStore);
peastman's avatar
peastman committed
302
                        SYNC_WARPS;
303
                        unsigned int newTileStartIndex = *tileStartIndex;
304
                        if (newTileStartIndex+tilesToStore <= maxTiles) {
305
306
307
308
309
                            if (indexInWarp < tilesToStore)
                                interactingTiles[newTileStartIndex+indexInWarp] = x;
                            for (int j = 0; j < tilesToStore; j++)
                                interactingAtoms[(newTileStartIndex+j)*TILE_SIZE+indexInWarp] = buffer[indexInWarp+j*TILE_SIZE];
                        }
310
311
                        if (indexInWarp+TILE_SIZE*tilesToStore < BUFFER_SIZE)
                            buffer[indexInWarp] = buffer[indexInWarp+TILE_SIZE*tilesToStore];
312
                        neighborsInBuffer -= TILE_SIZE*tilesToStore;
peastman's avatar
peastman committed
313
                   }
314
                }
315
316
317
                else {
                    SYNC_WARPS;
                }
318
319
320
321
322
323
            }
        }
        
        // If we have a partially filled buffer,  store it to memory.
        
        if (neighborsInBuffer > 0) {
324
            unsigned int tilesToStore = (neighborsInBuffer+TILE_SIZE-1)/TILE_SIZE;
325
            if (indexInWarp == 0)
326
                *tileStartIndex = ATOMIC_ADD(interactionCount, tilesToStore);
peastman's avatar
peastman committed
327
            SYNC_WARPS;
328
            unsigned int newTileStartIndex = *tileStartIndex;
329
            if (newTileStartIndex+tilesToStore <= maxTiles) {
330
331
332
333
334
335
336
337
338
339
340
341
                if (indexInWarp < tilesToStore)
                    interactingTiles[newTileStartIndex+indexInWarp] = x;
                for (int j = 0; j < tilesToStore; j++)
                    interactingAtoms[(newTileStartIndex+j)*TILE_SIZE+indexInWarp] = (indexInWarp+j*TILE_SIZE < neighborsInBuffer ? buffer[indexInWarp+j*TILE_SIZE] : NUM_ATOMS);
            }
        }
    }
    
    // Record the positions the neighbor list is based on.
    
    for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0))
        oldPositions[i] = posq[i];
peastman's avatar
peastman committed
342
343
344
345
346
347
348
349
}

#else
// This is the old implementation of finding interacting blocks.  It is quite a bit more complicated,
// and slower on most GPUs.  On AMD, however, it is faster, so we keep it around to use there.

#define BUFFER_SIZE BUFFER_GROUPS*GROUP_SIZE
#define WARP_SIZE 32
350
#define INVALID -1
peastman's avatar
peastman committed
351
352
353
354

/**
 * Perform a parallel prefix sum over an array.  The input values are all assumed to be 0 or 1.
 */
355
void prefixSum(__local int* sum, __local int2* temp) {
peastman's avatar
peastman committed
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
    for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
        temp[i].x = sum[i];
    barrier(CLK_LOCAL_MEM_FENCE);
    int whichBuffer = 0;
    for (int offset = 1; offset < BUFFER_SIZE; offset *= 2) {
        if (whichBuffer == 0)
            for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
                temp[i].y = (i < offset ? temp[i].x : temp[i].x+temp[i-offset].x);
        else
            for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
                temp[i].x = (i < offset ? temp[i].y : temp[i].y+temp[i-offset].y);
        whichBuffer = 1-whichBuffer;
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (whichBuffer == 0)
        for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
            sum[i] = temp[i].x;
    else
        for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
            sum[i] = temp[i].y;
    barrier(CLK_LOCAL_MEM_FENCE);
}

/**
 * This is called by findBlocksWithInteractions().  It compacts the list of blocks, identifies interactions
 * in them, and writes the result to global memory.
 */
383
void storeInteractionData(int x, __local int* buffer, __local int* sum, __local int2* temp, __local int* atoms, __local int* numAtoms,
peastman's avatar
peastman committed
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
            __local int* baseIndex, __global unsigned int* interactionCount, __global int* interactingTiles, __global unsigned int* interactingAtoms, real4 periodicBoxSize,
            real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, __global const real4* posq, __local real4* posBuffer,
            real4 blockCenterX, real4 blockSizeX, unsigned int maxTiles, bool finish) {
    const bool singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= PADDED_CUTOFF &&
                                     0.5f*periodicBoxSize.y-blockSizeX.y >= PADDED_CUTOFF &&
                                     0.5f*periodicBoxSize.z-blockSizeX.z >= PADDED_CUTOFF);
    if (get_local_id(0) < TILE_SIZE) {
        real4 pos = posq[x*TILE_SIZE+get_local_id(0)];
#ifdef USE_PERIODIC
        if (singlePeriodicCopy) {
            // The box is small enough that we can just translate all the atoms into a single periodic
            // box, then skip having to apply periodic boundary conditions later.
            
            APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, blockCenterX)
        }
#endif
        posBuffer[get_local_id(0)] = pos;
    }
    
    // The buffer is full, so we need to compact it and write out results.  Start by doing a parallel prefix sum.

    barrier(CLK_LOCAL_MEM_FENCE);
    for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
        sum[i] = (buffer[i] == INVALID ? 0 : 1);
    barrier(CLK_LOCAL_MEM_FENCE);
    prefixSum(sum, temp);
    int numValid = sum[BUFFER_SIZE-1];

    // Compact the buffer.

    for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
        if (buffer[i] != INVALID)
            temp[sum[i]-1].x = buffer[i];
    barrier(CLK_LOCAL_MEM_FENCE);
    for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
        buffer[i] = temp[i].x;
    barrier(CLK_LOCAL_MEM_FENCE);

    // Loop over the tiles and find specific interactions in them.

    const int indexInWarp = get_local_id(0)%WARP_SIZE;
    for (int base = 0; base < numValid; base += BUFFER_SIZE/WARP_SIZE) {
        for (int i = get_local_id(0)/WARP_SIZE; i < BUFFER_SIZE/WARP_SIZE && base+i < numValid; i += GROUP_SIZE/WARP_SIZE) {
            // Check each atom in block Y for interactions.
            
            real4 pos = posq[buffer[base+i]*TILE_SIZE+indexInWarp];
#ifdef USE_PERIODIC
            if (singlePeriodicCopy)
                APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, blockCenterX)
#endif
            bool interacts = false;
#ifdef USE_PERIODIC
            if (!singlePeriodicCopy) {
                for (int j = 0; j < TILE_SIZE; j++) {
                    real4 delta = pos-posBuffer[j];
                    APPLY_PERIODIC_TO_DELTA(delta)
                    interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
                }
            }
            else {
#endif
                for (int j = 0; j < TILE_SIZE; j++) {
                    real4 delta = pos-posBuffer[j];
                    interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
                }
#ifdef USE_PERIODIC
            }
#endif
            sum[i*WARP_SIZE+indexInWarp] = (interacts ? 1 : 0);
        }
        for (int i = numValid-base+get_local_id(0)/WARP_SIZE; i < BUFFER_SIZE/WARP_SIZE; i += GROUP_SIZE/WARP_SIZE)
            sum[i*WARP_SIZE+indexInWarp] = 0;

        // Compact the list of atoms.

        barrier(CLK_LOCAL_MEM_FENCE);
        prefixSum(sum, temp);
        for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
            if (sum[i] != (i == 0 ? 0 : sum[i-1]))
                atoms[*numAtoms+sum[i]-1] = buffer[base+i/WARP_SIZE]*TILE_SIZE+indexInWarp;

        // Store them to global memory.

        int atomsToStore = *numAtoms+sum[BUFFER_SIZE-1];
        bool storePartialTile = (finish && base >= numValid-BUFFER_SIZE/WARP_SIZE);
        int tilesToStore = (storePartialTile ? (atomsToStore+TILE_SIZE-1)/TILE_SIZE : atomsToStore/TILE_SIZE);
        if (tilesToStore > 0) {
            if (get_local_id(0) == 0)
472
                *baseIndex = ATOMIC_ADD(interactionCount, tilesToStore);
peastman's avatar
peastman committed
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
            barrier(CLK_LOCAL_MEM_FENCE);
            if (get_local_id(0) == 0)
                *numAtoms = atomsToStore-tilesToStore*TILE_SIZE;
            if (*baseIndex+tilesToStore <= maxTiles) {
                if (get_local_id(0) < tilesToStore)
                    interactingTiles[*baseIndex+get_local_id(0)] = x;
                for (int i = get_local_id(0); i < tilesToStore*TILE_SIZE; i += get_local_size(0))
                    interactingAtoms[*baseIndex*TILE_SIZE+i] = (i < atomsToStore ? atoms[i] : NUM_ATOMS);
            }
        }
        else {
            barrier(CLK_LOCAL_MEM_FENCE);
            if (get_local_id(0) == 0)
                *numAtoms += sum[BUFFER_SIZE-1];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        if (get_local_id(0) < *numAtoms && !storePartialTile)
            atoms[get_local_id(0)] = atoms[tilesToStore*TILE_SIZE+get_local_id(0)];
    }

    if (numValid == 0 && *numAtoms > 0 && finish) {
        // We didn't have any more tiles to process, but there were some atoms left over from a
        // previous call to this function.  Save them now.

        if (get_local_id(0) == 0)
498
            *baseIndex = ATOMIC_ADD(interactionCount, 1);
peastman's avatar
peastman committed
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
        barrier(CLK_LOCAL_MEM_FENCE);
        if (*baseIndex < maxTiles) {
            if (get_local_id(0) == 0)
                interactingTiles[*baseIndex] = x;
            if (get_local_id(0) < TILE_SIZE)
                interactingAtoms[*baseIndex*TILE_SIZE+get_local_id(0)] = (get_local_id(0) < *numAtoms ? atoms[get_local_id(0)] : NUM_ATOMS);
        }
    }

    // Reset the buffer for processing more tiles.

    for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
        buffer[i] = INVALID;
    barrier(CLK_LOCAL_MEM_FENCE);
}

/**
 * Compare the bounding boxes for each pair of blocks.  If they are sufficiently far apart,
 * mark them as non-interacting.
 */
__kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        __global unsigned int* restrict interactionCount, __global int* restrict interactingTiles, __global unsigned int* restrict interactingAtoms,
        __global const real4* restrict posq, unsigned int maxTiles, unsigned int startBlockIndex, unsigned int numBlocks, __global real2* restrict sortedBlocks,
        __global const real4* restrict sortedBlockCenter, __global const real4* restrict sortedBlockBoundingBox,
        __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __global real4* restrict oldPositions,
        __global const int* restrict rebuildNeighborList) {
525
526
527
    __local int buffer[BUFFER_SIZE];
    __local int sum[BUFFER_SIZE];
    __local int2 temp[BUFFER_SIZE];
peastman's avatar
peastman committed
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
    __local int atoms[BUFFER_SIZE+TILE_SIZE];
    __local real4 posBuffer[TILE_SIZE];
    __local int exclusionsForX[MAX_EXCLUSIONS];
    __local int bufferFull;
    __local int globalIndex;
    __local int numAtoms;
#ifdef AMD_ATOMIC_WORK_AROUND
    // Do a byte write to force all memory accesses to interactionCount to use the complete path.
    // This avoids the atomic access from causing all word accesses to other buffers from using the slow complete path.
    // The IF actually causes the write to never be executed, its presence is all that is needed.
    // AMD APP SDK 2.4 has this problem.
    if (get_global_id(0) == get_local_id(0)+1)
        ((__global char*)interactionCount)[sizeof(unsigned int)+1] = 0;
#endif

    if (rebuildNeighborList[0] == 0)
        return; // The neighbor list doesn't need to be rebuilt.

    int valuesInBuffer = 0;
    if (get_local_id(0) == 0)
        bufferFull = false;
    for (int i = 0; i < BUFFER_GROUPS; ++i)
        buffer[i*GROUP_SIZE+get_local_id(0)] = INVALID;
    barrier(CLK_LOCAL_MEM_FENCE);
    
    // Loop over blocks sorted by size.
    
    for (int i = startBlockIndex+get_group_id(0); i < startBlockIndex+numBlocks; i += get_num_groups(0)) {
        if (get_local_id(0) == get_local_size(0)-1)
            numAtoms = 0;
        real2 sortedKey = sortedBlocks[i];
        int x = (int) sortedKey.y;
        real4 blockCenterX = sortedBlockCenter[i];
        real4 blockSizeX = sortedBlockBoundingBox[i];

        // Load exclusion data for block x.
        
        const int exclusionStart = exclusionRowIndices[x];
        const int exclusionEnd = exclusionRowIndices[x+1];
        const int numExclusions = exclusionEnd-exclusionStart;
        for (int j = get_local_id(0); j < numExclusions; j += get_local_size(0))
            exclusionsForX[j] = exclusionIndices[exclusionStart+j];
        barrier(CLK_LOCAL_MEM_FENCE);
        
        // Compare it to other blocks after this one in sorted order.
573

peastman's avatar
peastman committed
574
575
576
577
578
        for (int base = i+1; base < NUM_BLOCKS; base += get_local_size(0)) {
            int j = base+get_local_id(0);
            real2 sortedKey2 = (j < NUM_BLOCKS ? sortedBlocks[j] : (real2) 0);
            real4 blockCenterY = (j < NUM_BLOCKS ? sortedBlockCenter[j] : (real4) 0);
            real4 blockSizeY = (j < NUM_BLOCKS ? sortedBlockBoundingBox[j] : (real4) 0);
579
            int y = (int) sortedKey2.y;
peastman's avatar
peastman committed
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
            real4 delta = blockCenterX-blockCenterY;
#ifdef USE_PERIODIC
            APPLY_PERIODIC_TO_DELTA(delta)
#endif
            delta.x = max((real) 0, fabs(delta.x)-blockSizeX.x-blockSizeY.x);
            delta.y = max((real) 0, fabs(delta.y)-blockSizeX.y-blockSizeY.y);
            delta.z = max((real) 0, fabs(delta.z)-blockSizeX.z-blockSizeY.z);
            bool hasExclusions = false;
            for (int k = 0; k < numExclusions; k++)
                hasExclusions |= (exclusionsForX[k] == y);
            if (j < NUM_BLOCKS && delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED && !hasExclusions) {
                // Add this tile to the buffer.

                int bufferIndex = valuesInBuffer*GROUP_SIZE+get_local_id(0);
                buffer[bufferIndex] = y;
                valuesInBuffer++;
                if (!bufferFull && valuesInBuffer == BUFFER_GROUPS)
                    bufferFull = true;
            }
            barrier(CLK_LOCAL_MEM_FENCE);
            if (bufferFull) {
                storeInteractionData(x, buffer, sum, temp, atoms, &numAtoms, &globalIndex, interactionCount, interactingTiles, interactingAtoms, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ, posq, posBuffer, blockCenterX, blockSizeX, maxTiles, false);
                valuesInBuffer = 0;
                if (get_local_id(0) == 0)
                    bufferFull = false;
            }
            barrier(CLK_LOCAL_MEM_FENCE);
        }
        storeInteractionData(x, buffer, sum, temp, atoms, &numAtoms, &globalIndex, interactionCount, interactingTiles, interactingAtoms, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ, posq, posBuffer, blockCenterX, blockSizeX, maxTiles, true);
    }
    
    // Record the positions the neighbor list is based on.
    
    for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0))
        oldPositions[i] = posq[i];
}

#endif