nonbonded.cu 21.1 KB
Newer Older
1
2
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)

3
4
//support for 64 bit shuffles
static __inline__ __device__ float real_shfl(float var, int srcLane) {
Peter Eastman's avatar
Peter Eastman committed
5
    return SHFL(var, srcLane);
6
7
}

8
9
10
11
static __inline__ __device__ float real_shfl(int var, int srcLane) {
    return SHFL(var, srcLane);
}

12
static __inline__ __device__ double real_shfl(double var, int srcLane) {
13
14
    int hi, lo;
    asm volatile("mov.b64 { %0, %1 }, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
Peter Eastman's avatar
Peter Eastman committed
15
16
    hi = SHFL(hi, srcLane);
    lo = SHFL(lo, srcLane);
17
18
    return __hiloint2double( hi, lo );
}
19
20
21
22

static __inline__ __device__ long long real_shfl(long long var, int srcLane) {
    int hi, lo;
    asm volatile("mov.b64 { %0, %1 }, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
Peter Eastman's avatar
Peter Eastman committed
23
24
    hi = SHFL(hi, srcLane);
    lo = SHFL(lo, srcLane);
25
26
27
28
    // unforunately there isn't an __nv_hiloint2long(hi,lo) intrinsic cast
    int2 fuse; fuse.x = lo; fuse.y = hi;
    return *reinterpret_cast<long long*>(&fuse);
}
29

30
31
32
33
34
/**
 * Save the force on a single atom.
 */
__device__ void saveSingleForce(int atom, real3 force, unsigned long long* forceBuffers) {
    if (force.x != 0)
35
        atomicAdd(&forceBuffers[atom], static_cast<unsigned long long>(realToFixedPoint(force.x)));
36
    if (force.y != 0)
37
        atomicAdd(&forceBuffers[atom+PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(force.y)));
38
    if (force.z != 0)
39
        atomicAdd(&forceBuffers[atom+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(force.z)));
40
41
}

42
/**
43
44
45
46
47
 * Compute nonbonded interactions. The kernel is separated into two parts,
 * tiles with exclusions and tiles without exclusions. It relies heavily on 
 * implicit warp-level synchronization. A tile is defined by two atom blocks 
 * each of warpsize. Each warp computes a range of tiles.
 * 
48
 * Tiles with exclusions compute the entire set of interactions across
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
 * atom blocks, equal to warpsize*warpsize. In order to avoid access conflicts 
 * the forces are computed and accumulated diagonally in the manner shown below
 * where, suppose
 *
 * [a-h] comprise atom block 1, [i-p] comprise atom block 2
 *
 * 1 denotes the first set of calculations within the warp
 * 2 denotes the second set of calculations within the warp
 * ... etc.
 * 
 *        threads
 *     0 1 2 3 4 5 6 7
 *         atom1 
 * L    a b c d e f g h 
 * o  i 1 2 3 4 5 6 7 8
 * c  j 8 1 2 3 4 5 6 7
 * a  k 7 8 1 2 3 4 5 6
 * l  l 6 7 8 1 2 3 4 5
 * D  m 5 6 7 8 1 2 3 4 
 * a  n 4 5 6 7 8 1 2 3
 * t  o 3 4 5 6 7 8 1 2
70
71
72
 * a  p 2 3 4 5 6 7 8 1
 *
 * Tiles without exclusions read off directly from the neighbourlist interactingAtoms
73
 * and follows the same force accumulation method. If more there are more interactingTiles
74
 * than the size of the neighbourlist initially allocated, the neighbourlist is rebuilt
75
76
77
78
79
80
 * and the full tileset is computed. This should happen on the first step, and very rarely 
 * afterwards.
 *
 * On CUDA devices that support the shuffle intrinsic, on diagonal exclusion tiles use
 * __shfl to broadcast. For all other types of tiles __shfl is used to pass around the 
 * forces, positions, and parameters when computing the forces. 
81
82
83
84
85
86
87
88
 *
 * [out]forceBuffers    - forces on each atom to eventually be accumulated
 * [out]energyBuffer    - energyBuffer to eventually be accumulated
 * [in]posq             - x,y,z,charge 
 * [in]exclusions       - 1024-bit flags denoting atom-atom exclusions for each tile
 * [in]exclusionTiles   - x,y denotes the indices of tiles that have an exclusion
 * [in]startTileIndex   - index into first tile to be processed
 * [in]numTileIndices   - number of tiles this context is responsible for processing
89
 * [in]int tiles        - the atom block for each tile
90
91
92
93
94
95
96
97
98
99
100
101
102
 * [in]interactionCount - total number of tiles that have an interaction
 * [in]maxTiles         - stores the size of the neighbourlist in case it needs 
 *                      - to be expanded
 * [in]periodicBoxSize  - size of the Periodic Box, last dimension (w) not used
 * [in]invPeriodicBox   - inverse of the periodicBoxSize, pre-computed for speed
 * [in]blockCenter      - the center of each block in euclidean coordinates
 * [in]blockSize        - size of the each block, radiating from the center
 *                      - x is half the distance of total length
 *                      - y is half the distance of total width
 *                      - z is half the distance of total height
 *                      - w is not used
 * [in]interactingAtoms - a list of interactions within a given tile     
 *
103
104
 */
extern "C" __global__ void computeNonbonded(
105
        unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const tileflags* __restrict__ exclusions,
106
        const int2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned long long numTileIndices
107
#ifdef USE_CUTOFF
108
        , const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, 
109
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
110
        const real4* __restrict__ blockSize, const unsigned int* __restrict__ interactingAtoms, unsigned int maxSinglePairs,
111
        const int2* __restrict__ singlePairs
112
113
#endif
        PARAMETER_ARGUMENTS) {
114
    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
115
116
117
    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; // global warpIndex
    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // index within the warp
    const unsigned int tbx = threadIdx.x - tgx;           // block warpIndex
118
    mixed energy = 0;
119
    INIT_DERIVATIVES
120

121
    // First loop: process tiles that contain exclusions.
122

123
124
125
    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
126
        const int2 tileIndices = exclusionTiles[pos];
127
128
        const unsigned int x = tileIndices.x;
        const unsigned int y = tileIndices.y;
129
        real3 force = make_real3(0);
130
131
132
133
134
        unsigned int atom1 = x*TILE_SIZE + tgx;
        real4 posq1 = posq[atom1];
        LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS
        tileflags excl = exclusions[pos*TILE_SIZE+tgx];
135
#endif
136
137
138
        const bool hasExclusions = true;
        if (x == y) {
            // This tile is on the diagonal.
139
            real4 shflPosq = posq1;
140

141
142
            // we do not need to fetch parameters from global since this is a symmetric tile
            // instead we can broadcast the values using shuffle
143
144
            for (unsigned int j = 0; j < TILE_SIZE; j++) {
                int atom2 = tbx+j;
145
146
                real4 posq2;
                BROADCAST_WARP_DATA
147
148
                real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
149
                APPLY_PERIODIC_TO_DELTA(delta)
150
151
152
#endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                real invR = RSQRT(r2);
peastman's avatar
peastman committed
153
                real r = r2*invR;
154
155
156
157
158
159
160
161
                LOAD_ATOM2_PARAMETERS
                atom2 = y*TILE_SIZE+j;
#ifdef USE_SYMMETRIC
                real dEdR = 0.0f;
#else
                real3 dEdR1 = make_real3(0);
                real3 dEdR2 = make_real3(0);
#endif
162
#ifdef USE_EXCLUSIONS
163
164
165
                bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
#endif
                real tempEnergy = 0.0f;
166
                const real interactionScale = 0.5f;
167
168
                COMPUTE_INTERACTION
                energy += 0.5f*tempEnergy;
169
#ifdef INCLUDE_FORCES
170
171
172
173
#ifdef USE_SYMMETRIC
                force.x -= delta.x*dEdR;
                force.y -= delta.y*dEdR;
                force.z -= delta.z*dEdR;
174
#else
175
176
177
                force.x -= dEdR1.x;
                force.y -= dEdR1.y;
                force.z -= dEdR1.z;
178
#endif
179
#endif
180
#ifdef USE_EXCLUSIONS
181
                excl >>= 1;
182
#endif
183
184
            }
        }
185
186
        else {
            // This is an off-diagonal tile.
187
            unsigned int j = y*TILE_SIZE + tgx;
188
189
190
191
192
            real4 shflPosq = posq[j];
            real3 shflForce;
            shflForce.x = 0.0f;
            shflForce.y = 0.0f;
            shflForce.z = 0.0f;
193
            DECLARE_LOCAL_PARAMETERS
194
            LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
195
#ifdef USE_EXCLUSIONS
196
            excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
197
#endif
198
199
200
            unsigned int tj = tgx;
            for (j = 0; j < TILE_SIZE; j++) {
                int atom2 = tbx+tj;
201
                real4 posq2 = shflPosq;
202
                real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
203
#ifdef USE_PERIODIC
204
                APPLY_PERIODIC_TO_DELTA(delta)
205
206
#endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
207
208
209
210
                real invR = RSQRT(r2);
                real r = r2*invR;
                LOAD_ATOM2_PARAMETERS
                atom2 = y*TILE_SIZE+tj;
211
#ifdef USE_SYMMETRIC
212
                real dEdR = 0.0f;
213
#else
214
215
                real3 dEdR1 = make_real3(0);
                real3 dEdR2 = make_real3(0);
216
217
#endif
#ifdef USE_EXCLUSIONS
218
                bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
219
#endif
220
                real tempEnergy = 0.0f;
221
                const real interactionScale = 1.0f;
222
223
                COMPUTE_INTERACTION
                energy += tempEnergy;
224
#ifdef INCLUDE_FORCES
225
#ifdef USE_SYMMETRIC
226
227
228
229
230
231
232
                delta *= dEdR;
                force.x -= delta.x;
                force.y -= delta.y;
                force.z -= delta.z;
                shflForce.x += delta.x;
                shflForce.y += delta.y;
                shflForce.z += delta.z;
233
#else // !USE_SYMMETRIC
234
235
236
237
238
239
                force.x -= dEdR1.x;
                force.y -= dEdR1.y;
                force.z -= dEdR1.z;
                shflForce.x += dEdR2.x;
                shflForce.y += dEdR2.y;
                shflForce.z += dEdR2.z;
240
#endif // end USE_SYMMETRIC
241
#endif
242
                SHUFFLE_WARP_DATA
243
244
#ifdef USE_EXCLUSIONS
                excl >>= 1;
245
#endif
246
247
                // cycles the indices
                // 0 1 2 3 4 5 6 7 -> 1 2 3 4 5 6 7 0
248
249
                tj = (tj + 1) & (TILE_SIZE - 1);
            }
250
251
            const unsigned int offset = y*TILE_SIZE + tgx;
            // write results for off diagonal tiles
252
#ifdef INCLUDE_FORCES
253
254
255
            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>(realToFixedPoint(shflForce.x)));
            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(shflForce.y)));
            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(shflForce.z)));
256
#endif
257
        }
258
        // Write results for on and off diagonal tiles
259
#ifdef INCLUDE_FORCES
260
        const unsigned int offset = x*TILE_SIZE + tgx;
261
262
263
        atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>(realToFixedPoint(force.x)));
        atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(force.y)));
        atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(force.z)));
264
#endif
265
266
267
268
    }

    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
    // of them (no cutoff).
269

270
#ifdef USE_NEIGHBOR_LIST
271
    const unsigned int numTiles = interactionCount[0];
272
273
    if (numTiles > maxTiles)
        return; // There wasn't enough memory for the neighbor list.
274
275
    int pos = (int) (warp*(long long)numTiles/totalWarps);
    int end = (int) ((warp+1)*(long long)numTiles/totalWarps);
276
#else
277
278
    int pos = (int) (startTileIndex+warp*numTileIndices/totalWarps);
    int end = (int) (startTileIndex+(warp+1)*numTileIndices/totalWarps);
279
280
    int skipBase = 0;
    int currentSkipIndex = tbx;
281
282
283
    __shared__ volatile int skipTiles[THREAD_BLOCK_SIZE];
    skipTiles[threadIdx.x] = -1;
#endif
284
285
    // atomIndices can probably be shuffled as well
    // but it probably wouldn't make things any faster
286
287
288
289
290
291
292
293
    __shared__ int atomIndices[THREAD_BLOCK_SIZE];
    
    while (pos < end) {
        const bool hasExclusions = false;
        real3 force = make_real3(0);
        bool includeTile = true;

        // Extract the coordinates of this tile.
294
        int x, y;
295
        bool singlePeriodicCopy = false;
296
#ifdef USE_NEIGHBOR_LIST
297
298
299
300
301
302
303
304
305
306
        x = tiles[pos];
        real4 blockSizeX = blockSize[x];
        singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= MAX_CUTOFF &&
                              0.5f*periodicBoxSize.y-blockSizeX.y >= MAX_CUTOFF &&
                              0.5f*periodicBoxSize.z-blockSizeX.z >= MAX_CUTOFF);
#else
        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
            y += (x < y ? -1 : 1);
307
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
308
        }
309

310
        // Skip over tiles that have exclusions, since they were already processed.
311

312
313
        while (skipTiles[tbx+TILE_SIZE-1] < pos) {
            if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
314
                int2 tile = exclusionTiles[skipBase+tgx];
315
                skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
316
            }
317
318
319
320
            else
                skipTiles[threadIdx.x] = end;
            skipBase += TILE_SIZE;            
            currentSkipIndex = tbx;
321
        }
322
323
324
325
        while (skipTiles[currentSkipIndex] < pos)
            currentSkipIndex++;
        includeTile = (skipTiles[currentSkipIndex] != pos);
#endif
326
327
        if (includeTile) {
            unsigned int atom1 = x*TILE_SIZE + tgx;
328
            // Load atom data for this tile.
329
330
            real4 posq1 = posq[atom1];
            LOAD_ATOM1_PARAMETERS
331
#ifdef USE_NEIGHBOR_LIST
peastman's avatar
peastman committed
332
            unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
333
334
335
336
#else
            unsigned int j = y*TILE_SIZE + tgx;
#endif
            atomIndices[threadIdx.x] = j;
337
            DECLARE_LOCAL_PARAMETERS
338
339
340
341
342
            real4 shflPosq;
            real3 shflForce;
            shflForce.x = 0.0f;
            shflForce.y = 0.0f;
            shflForce.z = 0.0f;
343
            if (j < PADDED_NUM_ATOMS) {
344
                // Load position of atom j from from global memory
345
                shflPosq = posq[j];
346
                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
347
            }
348
349
            else {
                shflPosq = make_real4(0, 0, 0, 0);
350
                CLEAR_LOCAL_PARAMETERS
351
            }
352
353
354
355
356
#ifdef USE_PERIODIC
            if (singlePeriodicCopy) {
                // The box is small enough that we can just translate all the atoms into a single periodic
                // box, then skip having to apply periodic boundary conditions later.
                real4 blockCenterX = blockCenter[x];
357
358
                APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
                APPLY_PERIODIC_TO_POS_WITH_CENTER(shflPosq, blockCenterX)
359
360
361
                unsigned int tj = tgx;
                for (j = 0; j < TILE_SIZE; j++) {
                    int atom2 = tbx+tj;
362
                    real4 posq2 = shflPosq; 
363
                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
364
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
365
366
367
368
                    real invR = RSQRT(r2);
                    real r = r2*invR;
                    LOAD_ATOM2_PARAMETERS
                    atom2 = atomIndices[tbx+tj];
369
#ifdef USE_SYMMETRIC
370
                    real dEdR = 0.0f;
371
#else
372
373
                    real3 dEdR1 = make_real3(0);
                    real3 dEdR2 = make_real3(0);
374
375
#endif
#ifdef USE_EXCLUSIONS
376
                    bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS);
377
#endif
378
                    real tempEnergy = 0.0f;
379
                    const real interactionScale = 1.0f;
380
381
                    COMPUTE_INTERACTION
                    energy += tempEnergy;
382
#ifdef INCLUDE_FORCES
383
#ifdef USE_SYMMETRIC
384
385
386
387
388
389
390
                    delta *= dEdR;
                    force.x -= delta.x;
                    force.y -= delta.y;
                    force.z -= delta.z;
                    shflForce.x += delta.x;
                    shflForce.y += delta.y;
                    shflForce.z += delta.z;
391
#else // !USE_SYMMETRIC
392
393
394
395
396
397
                    force.x -= dEdR1.x;
                    force.y -= dEdR1.y;
                    force.z -= dEdR1.z;
                    shflForce.x += dEdR2.x;
                    shflForce.y += dEdR2.y;
                    shflForce.z += dEdR2.z;
398
#endif // end USE_SYMMETRIC
399
#endif
400
                    SHUFFLE_WARP_DATA
401
                    tj = (tj + 1) & (TILE_SIZE - 1);
402
                }
403
404
            }
            else
405
#endif
406
407
408
409
410
            {
                // We need to apply periodic boundary conditions separately for each interaction.
                unsigned int tj = tgx;
                for (j = 0; j < TILE_SIZE; j++) {
                    int atom2 = tbx+tj;
411
                    real4 posq2 = shflPosq;
412
                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
413
#ifdef USE_PERIODIC
414
                    APPLY_PERIODIC_TO_DELTA(delta)
415
#endif
416
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
417
418
419
420
                    real invR = RSQRT(r2);
                    real r = r2*invR;
                    LOAD_ATOM2_PARAMETERS
                    atom2 = atomIndices[tbx+tj];
421
#ifdef USE_SYMMETRIC
422
                    real dEdR = 0.0f;
423
#else
424
425
                    real3 dEdR1 = make_real3(0);
                    real3 dEdR2 = make_real3(0);
426
427
#endif
#ifdef USE_EXCLUSIONS
428
                    bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS);
429
#endif
430
                    real tempEnergy = 0.0f;
431
                    const real interactionScale = 1.0f;
432
433
                    COMPUTE_INTERACTION
                    energy += tempEnergy;
434
#ifdef INCLUDE_FORCES
435
#ifdef USE_SYMMETRIC
436
437
438
439
440
441
442
                    delta *= dEdR;
                    force.x -= delta.x;
                    force.y -= delta.y;
                    force.z -= delta.z;
                    shflForce.x += delta.x;
                    shflForce.y += delta.y;
                    shflForce.z += delta.z;
443
#else // !USE_SYMMETRIC
444
445
446
447
448
449
                    force.x -= dEdR1.x;
                    force.y -= dEdR1.y;
                    force.z -= dEdR1.z;
                    shflForce.x += dEdR2.x;
                    shflForce.y += dEdR2.y;
                    shflForce.z += dEdR2.z;
450
#endif // end USE_SYMMETRIC
451
#endif
452
                    SHUFFLE_WARP_DATA
453
                    tj = (tj + 1) & (TILE_SIZE - 1);
454
455
                }
            }
456
457

            // Write results.
458
#ifdef INCLUDE_FORCES
459
460
461
            atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>(realToFixedPoint(force.x)));
            atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(force.y)));
            atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(force.z)));
462
#ifdef USE_NEIGHBOR_LIST
463
464
465
466
467
            unsigned int atom2 = atomIndices[threadIdx.x];
#else
            unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
            if (atom2 < PADDED_NUM_ATOMS) {
468
469
470
                atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>(realToFixedPoint(shflForce.x)));
                atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(shflForce.y)));
                atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(shflForce.z)));
471
            }
472
#endif
473
474
        }
        pos++;
475
    }
476
477
478
    
    // Third loop: single pairs that aren't part of a tile.
    
479
#if USE_NEIGHBOR_LIST
480
    const unsigned int numPairs = interactionCount[1];
481
482
483
484
485
486
487
488
489
    if (numPairs > maxSinglePairs)
        return; // There wasn't enough memory for the neighbor list.
    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < numPairs; i += blockDim.x*gridDim.x) {
        int2 pair = singlePairs[i];
        int atom1 = pair.x;
        int atom2 = pair.y;
        real4 posq1 = posq[atom1];
        real4 posq2 = posq[atom2];
        LOAD_ATOM1_PARAMETERS
490
        LOAD_ATOM2_PARAMETERS_FROM_GLOBAL
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
        real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
        APPLY_PERIODIC_TO_DELTA(delta)
#endif
        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
        real invR = RSQRT(r2);
        real r = r2*invR;
#ifdef USE_SYMMETRIC
        real dEdR = 0.0f;
#else
        real3 dEdR1 = make_real3(0);
        real3 dEdR2 = make_real3(0);
#endif
        bool hasExclusions = false;
        bool isExcluded = false;
        real tempEnergy = 0.0f;
        const real interactionScale = 1.0f;
        COMPUTE_INTERACTION
        energy += tempEnergy;
#ifdef INCLUDE_FORCES
#ifdef USE_SYMMETRIC
        real3 dEdR1 = delta*dEdR;
        real3 dEdR2 = -dEdR1;
#endif
515
516
        saveSingleForce(atom1, -dEdR1, forceBuffers);
        saveSingleForce(atom2, -dEdR2, forceBuffers);
517
518
519
#endif
    }
#endif
520
#ifdef INCLUDE_ENERGY
521
    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
522
#endif
523
    SAVE_DERIVATIVES
524
}