nonbonded.hip 20.4 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
/**
 * Compute nonbonded interactions. The kernel is separated into two parts,
 * tiles with exclusions and tiles without exclusions. It relies heavily on
 * implicit warp-level synchronization. A tile is defined by two atom blocks
 * each of warpsize. Each warp computes a range of tiles.
 *
 * Tiles with exclusions compute the entire set of interactions across
 * atom blocks, equal to warpsize*warpsize. In order to avoid access conflicts
 * the forces are computed and accumulated diagonally in the manner shown below
 * where, suppose
 *
 * [a-h] comprise atom block 1, [i-p] comprise atom block 2
 *
 * 1 denotes the first set of calculations within the warp
 * 2 denotes the second set of calculations within the warp
 * ... etc.
 *
 *        threads
 *     0 1 2 3 4 5 6 7
 *         atom1
 * L    a b c d e f g h
 * o  i 1 2 3 4 5 6 7 8
 * c  j 8 1 2 3 4 5 6 7
 * a  k 7 8 1 2 3 4 5 6
 * l  l 6 7 8 1 2 3 4 5
 * D  m 5 6 7 8 1 2 3 4
 * a  n 4 5 6 7 8 1 2 3
 * t  o 3 4 5 6 7 8 1 2
 * a  p 2 3 4 5 6 7 8 1
 *
 * Tiles without exclusions read off directly from the neighbourlist interactingAtoms
 * and follows the same force accumulation method. If more there are more interactingTiles
 * than the size of the neighbourlist initially allocated, the neighbourlist is rebuilt
 * and the full tileset is computed. This should happen on the first step, and very rarely
 * afterwards.
 *
 * On diagonal exclusion tiles use __shfl to broadcast. For all other types of tiles __shfl
 * is used to pass around the forces, positions, and parameters when computing the forces.
 *
 * [out]forceBuffers    - forces on each atom to eventually be accumulated
 * [out]energyBuffer    - energyBuffer to eventually be accumulated
 * [in]posq             - x,y,z,charge
 * [in]exclusions       - 1024-bit flags denoting atom-atom exclusions for each tile
 * [in]exclusionTiles   - x,y denotes the indices of tiles that have an exclusion
 * [in]startTileIndex   - index into first tile to be processed
 * [in]numTileIndices   - number of tiles this context is responsible for processing
 * [in]int tiles        - the atom block for each tile
 * [in]interactionCount - total number of tiles that have an interaction
 * [in]maxTiles         - stores the size of the neighbourlist in case it needs
 *                      - to be expanded
 * [in]periodicBoxSize  - size of the Periodic Box, last dimension (w) not used
 * [in]invPeriodicBox   - inverse of the periodicBoxSize, pre-computed for speed
 * [in]blockCenter      - the center of each block in euclidean coordinates
 * [in]blockSize        - size of the each block, radiating from the center
 *                      - x is half the distance of total length
 *                      - y is half the distance of total width
 *                      - z is half the distance of total height
 *                      - w is not used
 * [in]interactingAtoms - a list of interactions within a given tile
 *
 */
Anton Gorenko's avatar
Anton Gorenko committed
62
extern "C" __launch_bounds__(THREAD_BLOCK_SIZE) __global__ void computeNonbonded(
63
64
65
66
67
68
69
70
71
72
        unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const tileflags* __restrict__ exclusions,
        const int2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned long long numTileIndices
#ifdef USE_CUTOFF
        , const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
        const real4* __restrict__ blockSize, const unsigned int* __restrict__ interactingAtoms, unsigned int maxSinglePairs,
        const int2* __restrict__ singlePairs
#endif
        PARAMETER_ARGUMENTS) {
    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
Anton Gorenko's avatar
Anton Gorenko committed
73
    const unsigned int warp = blockIdx.x*(THREAD_BLOCK_SIZE/TILE_SIZE) + threadIdx.x/TILE_SIZE; // global warpIndex
74
75
76
77
78
79
    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // index within the warp
    mixed energy = 0;
    INIT_DERIVATIVES

    // First loop: process tiles that contain exclusions.

Anton Gorenko's avatar
Anton Gorenko committed
80
    for (int pos = FIRST_EXCLUSION_TILE+warp; pos < LAST_EXCLUSION_TILE; pos+=totalWarps) {
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
        const int2 tileIndices = exclusionTiles[pos];
        const unsigned int x = tileIndices.x;
        const unsigned int y = tileIndices.y;
        real3 force = make_real3(0);
        unsigned int atom1 = x*TILE_SIZE + tgx;
        real4 posq1 = posq[atom1];
        LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS
        tileflags excl = exclusions[pos*TILE_SIZE+tgx];
#endif
        if (x == y) {
            // This tile is on the diagonal.
            real4 shflPosq = posq1;

            // we do not need to fetch parameters from global since this is a symmetric tile
            // instead we can broadcast the values using shuffle
Anton Gorenko's avatar
Anton Gorenko committed
97
            #pragma unroll 2
98
99
100
101
102
103
104
105
            for (unsigned int j = 0; j < TILE_SIZE; j++) {
                real4 posq2;
                BROADCAST_WARP_DATA
                real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
                APPLY_PERIODIC_TO_DELTA(delta)
#endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
Anton Gorenko's avatar
Anton Gorenko committed
106
107
108
109
110
111
112
#ifdef USE_CUTOFF
                if (r2 < MAX_CUTOFF_SQUARED) {
#endif
                    real invR = RSQRT(r2);
                    real r = r2*invR;
                    LOAD_ATOM2_PARAMETERS
                    int atom2 = y*TILE_SIZE+j;
113
#ifdef USE_SYMMETRIC
Anton Gorenko's avatar
Anton Gorenko committed
114
                    real dEdR = 0.0f;
115
#else
Anton Gorenko's avatar
Anton Gorenko committed
116
117
                    real3 dEdR1 = make_real3(0);
                    real3 dEdR2 = make_real3(0);
118
119
#endif
#ifdef USE_EXCLUSIONS
Anton Gorenko's avatar
Anton Gorenko committed
120
121
122
123
124
125
126
                    bool isExcluded = !(atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && (excl & 1));
#endif
                    real tempEnergy = 0.0f;
                    const real interactionScale = 0.5f;
                    COMPUTE_INTERACTION
#ifdef INCLUDE_ENERGY
                    energy += 0.5f*tempEnergy;
127
128
129
#endif
#ifdef INCLUDE_FORCES
#ifdef USE_SYMMETRIC
Anton Gorenko's avatar
Anton Gorenko committed
130
131
132
                    force.x -= delta.x*dEdR;
                    force.y -= delta.y*dEdR;
                    force.z -= delta.z*dEdR;
133
#else
Anton Gorenko's avatar
Anton Gorenko committed
134
135
136
                    force.x -= dEdR1.x;
                    force.y -= dEdR1.y;
                    force.z -= dEdR1.z;
137
138
#endif
#endif
Anton Gorenko's avatar
Anton Gorenko committed
139
140
141
#ifdef USE_CUTOFF
                }
#endif
142
143
144
145
146
147
148
149
#ifdef USE_EXCLUSIONS
                excl >>= 1;
#endif
            }
        }
        else {
            // This is an off-diagonal tile.
            unsigned int j = y*TILE_SIZE + tgx;
Anton Gorenko's avatar
Anton Gorenko committed
150
            int atomIndex = j;
151
152
153
154
155
156
157
158
159
160
            real4 shflPosq = posq[j];
            real3 shflForce;
            shflForce.x = 0.0f;
            shflForce.y = 0.0f;
            shflForce.z = 0.0f;
            DECLARE_LOCAL_PARAMETERS
            LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
#ifdef USE_EXCLUSIONS
            excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
#endif
Anton Gorenko's avatar
Anton Gorenko committed
161
            #pragma unroll 2
162
163
164
165
166
167
168
            for (j = 0; j < TILE_SIZE; j++) {
                real4 posq2 = shflPosq;
                real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
                APPLY_PERIODIC_TO_DELTA(delta)
#endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
Anton Gorenko's avatar
Anton Gorenko committed
169
170
171
172
173
174
175
#ifdef USE_CUTOFF
                if (r2 < MAX_CUTOFF_SQUARED) {
#endif
                    real invR = RSQRT(r2);
                    real r = r2*invR;
                    LOAD_ATOM2_PARAMETERS
                    int atom2 = atomIndex;
176
#ifdef USE_SYMMETRIC
Anton Gorenko's avatar
Anton Gorenko committed
177
                    real dEdR = 0.0f;
178
#else
Anton Gorenko's avatar
Anton Gorenko committed
179
180
                    real3 dEdR1 = make_real3(0);
                    real3 dEdR2 = make_real3(0);
181
182
#endif
#ifdef USE_EXCLUSIONS
Anton Gorenko's avatar
Anton Gorenko committed
183
184
185
186
187
188
189
                    bool isExcluded = !(atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && (excl & 1));
#endif
                    real tempEnergy = 0.0f;
                    const real interactionScale = 1.0f;
                    COMPUTE_INTERACTION
#ifdef INCLUDE_ENERGY
                    energy += tempEnergy;
190
191
192
#endif
#ifdef INCLUDE_FORCES
#ifdef USE_SYMMETRIC
Anton Gorenko's avatar
Anton Gorenko committed
193
194
195
196
197
198
199
                    delta *= dEdR;
                    force.x -= delta.x;
                    force.y -= delta.y;
                    force.z -= delta.z;
                    shflForce.x += delta.x;
                    shflForce.y += delta.y;
                    shflForce.z += delta.z;
200
#else // !USE_SYMMETRIC
Anton Gorenko's avatar
Anton Gorenko committed
201
202
203
204
205
206
                    force.x -= dEdR1.x;
                    force.y -= dEdR1.y;
                    force.z -= dEdR1.z;
                    shflForce.x += dEdR2.x;
                    shflForce.y += dEdR2.y;
                    shflForce.z += dEdR2.z;
207
208
#endif // end USE_SYMMETRIC
#endif
Anton Gorenko's avatar
Anton Gorenko committed
209
210
#ifdef USE_CUTOFF
                }
211
212
213
214
#endif
#ifdef USE_EXCLUSIONS
                excl >>= 1;
#endif
Anton Gorenko's avatar
Anton Gorenko committed
215
216
                SHUFFLE_WARP_DATA
                atomIndex = warpRotateLeft<TILE_SIZE>(atomIndex);
217
            }
Anton Gorenko's avatar
Anton Gorenko committed
218
            const unsigned int offset = atomIndex;
219
220
            // write results for off diagonal tiles
#ifdef INCLUDE_FORCES
Anton Gorenko's avatar
Anton Gorenko committed
221
222
223
            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>(realToFixedPoint(shflForce.x)));
            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(shflForce.y)));
            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(shflForce.z)));
224
225
226
227
228
#endif
        }
        // Write results for on and off diagonal tiles
#ifdef INCLUDE_FORCES
        const unsigned int offset = x*TILE_SIZE + tgx;
Anton Gorenko's avatar
Anton Gorenko committed
229
230
231
        atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>(realToFixedPoint(force.x)));
        atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(force.y)));
        atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(force.z)));
232
233
234
235
236
237
#endif
    }

    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
    // of them (no cutoff).

238
#ifdef USE_NEIGHBOR_LIST
239
240
241
    const unsigned int numTiles = interactionCount[0];
    if (numTiles > maxTiles)
        return; // There wasn't enough memory for the neighbor list.
Anton Gorenko's avatar
Anton Gorenko committed
242
243
244
245
246
247
    for (unsigned int pos0 = warp; pos0 < LAST_EXCLUSION_TILE+numTiles; pos0+=totalWarps) {
        // Skip warps that may be still busy in the first loop
        if (pos0 < LAST_EXCLUSION_TILE) {
            continue;
        }
        const unsigned int pos = pos0-LAST_EXCLUSION_TILE;
248
249
250
251
#else
    int pos = (int) (startTileIndex+warp*numTileIndices/totalWarps);
    int end = (int) (startTileIndex+(warp+1)*numTileIndices/totalWarps);
    int skipBase = 0;
252
    int skipTiles = -1;
Anton Gorenko's avatar
Anton Gorenko committed
253
254
    for (; pos < end; pos++) {
#endif
255
256
257
258
259
260
        real3 force = make_real3(0);
        bool includeTile = true;

        // Extract the coordinates of this tile.
        int x, y;
        bool singlePeriodicCopy = false;
261
#ifdef USE_NEIGHBOR_LIST
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
        x = tiles[pos];
        real4 blockSizeX = blockSize[x];
        singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= MAX_CUTOFF &&
                              0.5f*periodicBoxSize.y-blockSizeX.y >= MAX_CUTOFF &&
                              0.5f*periodicBoxSize.z-blockSizeX.z >= MAX_CUTOFF);
#else
        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
            y += (x < y ? -1 : 1);
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
        }

        // Skip over tiles that have exclusions, since they were already processed.

277
        while (BALLOT(skipTiles >= pos) == 0) {
278
279
            if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
                int2 tile = exclusionTiles[skipBase+tgx];
Anton Gorenko's avatar
Anton Gorenko committed
280
                skipTiles = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
281
282
            }
            else
Anton Gorenko's avatar
Anton Gorenko committed
283
                skipTiles = end;
284
285
            skipBase += TILE_SIZE;
        }
286
        includeTile = BALLOT(skipTiles == pos) == 0;
287
288
289
290
291
292
#endif
        if (includeTile) {
            unsigned int atom1 = x*TILE_SIZE + tgx;
            // Load atom data for this tile.
            real4 posq1 = posq[atom1];
            LOAD_ATOM1_PARAMETERS
293
#ifdef USE_NEIGHBOR_LIST
294
295
296
297
            unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
#else
            unsigned int j = y*TILE_SIZE + tgx;
#endif
Anton Gorenko's avatar
Anton Gorenko committed
298
            int atomIndex = j;
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
            DECLARE_LOCAL_PARAMETERS
            real4 shflPosq;
            real3 shflForce;
            shflForce.x = 0.0f;
            shflForce.y = 0.0f;
            shflForce.z = 0.0f;
            if (j < PADDED_NUM_ATOMS) {
                // Load position of atom j from from global memory
                shflPosq = posq[j];
                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
            }
            else {
                shflPosq = make_real4(0, 0, 0, 0);
                CLEAR_LOCAL_PARAMETERS
            }
#ifdef USE_PERIODIC
            if (singlePeriodicCopy) {
                // The box is small enough that we can just translate all the atoms into a single periodic
                // box, then skip having to apply periodic boundary conditions later.
                real4 blockCenterX = blockCenter[x];
                APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
                APPLY_PERIODIC_TO_POS_WITH_CENTER(shflPosq, blockCenterX)
Anton Gorenko's avatar
Anton Gorenko committed
321
                #pragma unroll 2
322
323
324
325
                for (j = 0; j < TILE_SIZE; j++) {
                    real4 posq2 = shflPosq;
                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
Anton Gorenko's avatar
Anton Gorenko committed
326
327
328
329
330
331
332
#ifdef USE_CUTOFF
                    if (r2 < MAX_CUTOFF_SQUARED) {
#endif
                        real invR = RSQRT(r2);
                        real r = r2*invR;
                        LOAD_ATOM2_PARAMETERS
                        int atom2 = atomIndex;
333
#ifdef USE_SYMMETRIC
Anton Gorenko's avatar
Anton Gorenko committed
334
                        real dEdR = 0.0f;
335
#else
Anton Gorenko's avatar
Anton Gorenko committed
336
337
                        real3 dEdR1 = make_real3(0);
                        real3 dEdR2 = make_real3(0);
338
339
#endif
#ifdef USE_EXCLUSIONS
Anton Gorenko's avatar
Anton Gorenko committed
340
341
342
343
344
345
346
                        bool isExcluded = !(atom1 < NUM_ATOMS && atom2 < NUM_ATOMS);
#endif
                        real tempEnergy = 0.0f;
                        const real interactionScale = 1.0f;
                        COMPUTE_INTERACTION
#ifdef INCLUDE_ENERGY
                        energy += tempEnergy;
347
348
349
#endif
#ifdef INCLUDE_FORCES
#ifdef USE_SYMMETRIC
Anton Gorenko's avatar
Anton Gorenko committed
350
351
352
353
354
355
356
                        delta *= dEdR;
                        force.x -= delta.x;
                        force.y -= delta.y;
                        force.z -= delta.z;
                        shflForce.x += delta.x;
                        shflForce.y += delta.y;
                        shflForce.z += delta.z;
357
#else // !USE_SYMMETRIC
Anton Gorenko's avatar
Anton Gorenko committed
358
359
360
361
362
363
                        force.x -= dEdR1.x;
                        force.y -= dEdR1.y;
                        force.z -= dEdR1.z;
                        shflForce.x += dEdR2.x;
                        shflForce.y += dEdR2.y;
                        shflForce.z += dEdR2.z;
364
365
#endif // end USE_SYMMETRIC
#endif
Anton Gorenko's avatar
Anton Gorenko committed
366
367
#ifdef USE_CUTOFF
                    }
368
#endif
Anton Gorenko's avatar
Anton Gorenko committed
369
370
                    SHUFFLE_WARP_DATA
                    atomIndex = warpRotateLeft<TILE_SIZE>(atomIndex);
371
372
373
374
375
376
                }
            }
            else
#endif
            {
                // We need to apply periodic boundary conditions separately for each interaction.
Anton Gorenko's avatar
Anton Gorenko committed
377
                #pragma unroll 2
378
379
380
381
382
383
384
                for (j = 0; j < TILE_SIZE; j++) {
                    real4 posq2 = shflPosq;
                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
                    APPLY_PERIODIC_TO_DELTA(delta)
#endif
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
Anton Gorenko's avatar
Anton Gorenko committed
385
386
387
388
389
390
391
#ifdef USE_CUTOFF
                    if (r2 < MAX_CUTOFF_SQUARED) {
#endif
                        real invR = RSQRT(r2);
                        real r = r2*invR;
                        LOAD_ATOM2_PARAMETERS
                        int atom2 = atomIndex;
392
#ifdef USE_SYMMETRIC
Anton Gorenko's avatar
Anton Gorenko committed
393
                        real dEdR = 0.0f;
394
#else
Anton Gorenko's avatar
Anton Gorenko committed
395
396
                        real3 dEdR1 = make_real3(0);
                        real3 dEdR2 = make_real3(0);
397
398
#endif
#ifdef USE_EXCLUSIONS
Anton Gorenko's avatar
Anton Gorenko committed
399
400
401
402
403
404
405
                        bool isExcluded = !(atom1 < NUM_ATOMS && atom2 < NUM_ATOMS);
#endif
                        real tempEnergy = 0.0f;
                        const real interactionScale = 1.0f;
                        COMPUTE_INTERACTION
#ifdef INCLUDE_ENERGY
                        energy += tempEnergy;
406
407
408
#endif
#ifdef INCLUDE_FORCES
#ifdef USE_SYMMETRIC
Anton Gorenko's avatar
Anton Gorenko committed
409
410
411
412
413
414
415
                        delta *= dEdR;
                        force.x -= delta.x;
                        force.y -= delta.y;
                        force.z -= delta.z;
                        shflForce.x += delta.x;
                        shflForce.y += delta.y;
                        shflForce.z += delta.z;
416
#else // !USE_SYMMETRIC
Anton Gorenko's avatar
Anton Gorenko committed
417
418
419
420
421
422
                        force.x -= dEdR1.x;
                        force.y -= dEdR1.y;
                        force.z -= dEdR1.z;
                        shflForce.x += dEdR2.x;
                        shflForce.y += dEdR2.y;
                        shflForce.z += dEdR2.z;
423
424
#endif // end USE_SYMMETRIC
#endif
Anton Gorenko's avatar
Anton Gorenko committed
425
426
#ifdef USE_CUTOFF
                    }
427
#endif
Anton Gorenko's avatar
Anton Gorenko committed
428
429
                    SHUFFLE_WARP_DATA
                    atomIndex = warpRotateLeft<TILE_SIZE>(atomIndex);
430
431
432
433
434
                }
            }

            // Write results.
#ifdef INCLUDE_FORCES
Anton Gorenko's avatar
Anton Gorenko committed
435
436
437
438
            atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>(realToFixedPoint(force.x)));
            atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(force.y)));
            atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(force.z)));
            unsigned int atom2 = atomIndex;
439
            if (atom2 < PADDED_NUM_ATOMS) {
Anton Gorenko's avatar
Anton Gorenko committed
440
441
442
                atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>(realToFixedPoint(shflForce.x)));
                atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(shflForce.y)));
                atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(shflForce.z)));
443
444
445
446
447
448
449
            }
#endif
        }
    }

    // Third loop: single pairs that aren't part of a tile.

450
#if USE_NEIGHBOR_LIST
451
452
453
454
455
456
457
458
459
460
461
462
463
464
    const unsigned int numPairs = interactionCount[1];
    if (numPairs > maxSinglePairs)
        return; // There wasn't enough memory for the neighbor list.
    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < numPairs; i += blockDim.x*gridDim.x) {
        int2 pair = singlePairs[i];
        int atom1 = pair.x;
        int atom2 = pair.y;
        real4 posq1 = posq[atom1];
        real4 posq2 = posq[atom2];
        real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
        APPLY_PERIODIC_TO_DELTA(delta)
#endif
        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
Anton Gorenko's avatar
Anton Gorenko committed
465
466
467
468
469
470
471
472
        if (r2 < MAX_CUTOFF_SQUARED) {
            LOAD_ATOM1_PARAMETERS
            int j = atom2;
            DECLARE_LOCAL_PARAMETERS
            LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
            LOAD_ATOM2_PARAMETERS
            real invR = RSQRT(r2);
            real r = r2*invR;
473
#ifdef USE_SYMMETRIC
Anton Gorenko's avatar
Anton Gorenko committed
474
            real dEdR = 0.0f;
475
#else
Anton Gorenko's avatar
Anton Gorenko committed
476
477
478
479
480
481
482
483
484
485
            real3 dEdR1 = make_real3(0);
            real3 dEdR2 = make_real3(0);
#endif
            bool isExcluded = false;
            real tempEnergy = 0.0f;
            const real interactionScale = 1.0f;
            COMPUTE_INTERACTION
#ifdef INCLUDE_ENERGY
            energy += tempEnergy;
#endif
486
487
#ifdef INCLUDE_FORCES
#ifdef USE_SYMMETRIC
Anton Gorenko's avatar
Anton Gorenko committed
488
489
            real3 dEdR1 = delta*dEdR;
            real3 dEdR2 = -dEdR1;
490
#endif
Anton Gorenko's avatar
Anton Gorenko committed
491
492
493
494
495
496
            atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>(realToFixedPoint(-dEdR1.x)));
            atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(-dEdR1.y)));
            atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(-dEdR1.z)));
            atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>(realToFixedPoint(-dEdR2.x)));
            atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(-dEdR2.y)));
            atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(-dEdR2.z)));
497
#endif
Anton Gorenko's avatar
Anton Gorenko committed
498
        }
499
500
501
502
503
504
505
    }
#endif
#ifdef INCLUDE_ENERGY
    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
#endif
    SAVE_DERIVATIVES
}