nonbonded.cu 25.6 KB
Newer Older
1
2
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)

3
#ifndef ENABLE_SHUFFLE
4
5
6
typedef struct {
    real x, y, z;
    real q;
7
    real fx, fy, fz;
8
9
10
11
12
    ATOM_PARAMETER_DATA
#ifndef PARAMETER_SIZE_IS_EVEN
    real padding;
#endif
} AtomData;
13
14
#endif

15
#ifdef ENABLE_SHUFFLE
16
17
//support for 64 bit shuffles
static __inline__ __device__ float real_shfl(float var, int srcLane) {
Peter Eastman's avatar
Peter Eastman committed
18
    return SHFL(var, srcLane);
19
20
}

21
22
23
24
static __inline__ __device__ float real_shfl(int var, int srcLane) {
    return SHFL(var, srcLane);
}

25
static __inline__ __device__ double real_shfl(double var, int srcLane) {
26
27
    int hi, lo;
    asm volatile("mov.b64 { %0, %1 }, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
Peter Eastman's avatar
Peter Eastman committed
28
29
    hi = SHFL(hi, srcLane);
    lo = SHFL(lo, srcLane);
30
31
    return __hiloint2double( hi, lo );
}
32
33
34
35

static __inline__ __device__ long long real_shfl(long long var, int srcLane) {
    int hi, lo;
    asm volatile("mov.b64 { %0, %1 }, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
Peter Eastman's avatar
Peter Eastman committed
36
37
    hi = SHFL(hi, srcLane);
    lo = SHFL(lo, srcLane);
38
39
40
41
    // unforunately there isn't an __nv_hiloint2long(hi,lo) intrinsic cast
    int2 fuse; fuse.x = lo; fuse.y = hi;
    return *reinterpret_cast<long long*>(&fuse);
}
42
#endif
43

44
45
46
47
48
49
50
51
52
53
54
55
/**
 * Save the force on a single atom.
 */
__device__ void saveSingleForce(int atom, real3 force, unsigned long long* forceBuffers) {
    if (force.x != 0)
        atomicAdd(&forceBuffers[atom], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
    if (force.y != 0)
        atomicAdd(&forceBuffers[atom+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
    if (force.z != 0)
        atomicAdd(&forceBuffers[atom+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
}

56
/**
57
58
59
60
61
 * Compute nonbonded interactions. The kernel is separated into two parts,
 * tiles with exclusions and tiles without exclusions. It relies heavily on 
 * implicit warp-level synchronization. A tile is defined by two atom blocks 
 * each of warpsize. Each warp computes a range of tiles.
 * 
62
 * Tiles with exclusions compute the entire set of interactions across
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
 * atom blocks, equal to warpsize*warpsize. In order to avoid access conflicts 
 * the forces are computed and accumulated diagonally in the manner shown below
 * where, suppose
 *
 * [a-h] comprise atom block 1, [i-p] comprise atom block 2
 *
 * 1 denotes the first set of calculations within the warp
 * 2 denotes the second set of calculations within the warp
 * ... etc.
 * 
 *        threads
 *     0 1 2 3 4 5 6 7
 *         atom1 
 * L    a b c d e f g h 
 * o  i 1 2 3 4 5 6 7 8
 * c  j 8 1 2 3 4 5 6 7
 * a  k 7 8 1 2 3 4 5 6
 * l  l 6 7 8 1 2 3 4 5
 * D  m 5 6 7 8 1 2 3 4 
 * a  n 4 5 6 7 8 1 2 3
 * t  o 3 4 5 6 7 8 1 2
84
85
86
 * a  p 2 3 4 5 6 7 8 1
 *
 * Tiles without exclusions read off directly from the neighbourlist interactingAtoms
87
 * and follows the same force accumulation method. If more there are more interactingTiles
88
 * than the size of the neighbourlist initially allocated, the neighbourlist is rebuilt
89
90
91
92
93
94
 * and the full tileset is computed. This should happen on the first step, and very rarely 
 * afterwards.
 *
 * On CUDA devices that support the shuffle intrinsic, on diagonal exclusion tiles use
 * __shfl to broadcast. For all other types of tiles __shfl is used to pass around the 
 * forces, positions, and parameters when computing the forces. 
95
96
97
98
99
100
101
102
 *
 * [out]forceBuffers    - forces on each atom to eventually be accumulated
 * [out]energyBuffer    - energyBuffer to eventually be accumulated
 * [in]posq             - x,y,z,charge 
 * [in]exclusions       - 1024-bit flags denoting atom-atom exclusions for each tile
 * [in]exclusionTiles   - x,y denotes the indices of tiles that have an exclusion
 * [in]startTileIndex   - index into first tile to be processed
 * [in]numTileIndices   - number of tiles this context is responsible for processing
103
 * [in]int tiles        - the atom block for each tile
104
105
106
107
108
109
110
111
112
113
114
115
116
 * [in]interactionCount - total number of tiles that have an interaction
 * [in]maxTiles         - stores the size of the neighbourlist in case it needs 
 *                      - to be expanded
 * [in]periodicBoxSize  - size of the Periodic Box, last dimension (w) not used
 * [in]invPeriodicBox   - inverse of the periodicBoxSize, pre-computed for speed
 * [in]blockCenter      - the center of each block in euclidean coordinates
 * [in]blockSize        - size of the each block, radiating from the center
 *                      - x is half the distance of total length
 *                      - y is half the distance of total width
 *                      - z is half the distance of total height
 *                      - w is not used
 * [in]interactingAtoms - a list of interactions within a given tile     
 *
117
118
 */
extern "C" __global__ void computeNonbonded(
119
        unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const tileflags* __restrict__ exclusions,
120
        const int2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned long long numTileIndices
121
#ifdef USE_CUTOFF
122
        , const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, 
123
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
124
        const real4* __restrict__ blockSize, const unsigned int* __restrict__ interactingAtoms, unsigned int maxSinglePairs,
125
        const int2* __restrict__ singlePairs
126
127
#endif
        PARAMETER_ARGUMENTS) {
128
    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
129
130
131
    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; // global warpIndex
    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // index within the warp
    const unsigned int tbx = threadIdx.x - tgx;           // block warpIndex
132
    mixed energy = 0;
133
    INIT_DERIVATIVES
134
135
136
137
    // used shared memory if the device cannot shuffle
#ifndef ENABLE_SHUFFLE
    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
#endif
138

139
    // First loop: process tiles that contain exclusions.
140

141
142
143
    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
144
        const int2 tileIndices = exclusionTiles[pos];
145
146
        const unsigned int x = tileIndices.x;
        const unsigned int y = tileIndices.y;
147
        real3 force = make_real3(0);
148
149
150
151
152
        unsigned int atom1 = x*TILE_SIZE + tgx;
        real4 posq1 = posq[atom1];
        LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS
        tileflags excl = exclusions[pos*TILE_SIZE+tgx];
153
#endif
154
155
156
        const bool hasExclusions = true;
        if (x == y) {
            // This tile is on the diagonal.
157
158
#ifdef ENABLE_SHUFFLE
            real4 shflPosq = posq1;
159
#else
160
161
162
163
164
165
            localData[threadIdx.x].x = posq1.x;
            localData[threadIdx.x].y = posq1.y;
            localData[threadIdx.x].z = posq1.z;
            localData[threadIdx.x].q = posq1.w;
            LOAD_LOCAL_PARAMETERS_FROM_1
#endif
166

167
168
            // we do not need to fetch parameters from global since this is a symmetric tile
            // instead we can broadcast the values using shuffle
169
170
            for (unsigned int j = 0; j < TILE_SIZE; j++) {
                int atom2 = tbx+j;
171
                real4 posq2;
172
#ifdef ENABLE_SHUFFLE
173
                BROADCAST_WARP_DATA
174
#else   
175
176
                posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
#endif
177
178
                real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
179
                APPLY_PERIODIC_TO_DELTA(delta)
180
181
182
#endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                real invR = RSQRT(r2);
peastman's avatar
peastman committed
183
                real r = r2*invR;
184
185
186
187
188
189
190
191
                LOAD_ATOM2_PARAMETERS
                atom2 = y*TILE_SIZE+j;
#ifdef USE_SYMMETRIC
                real dEdR = 0.0f;
#else
                real3 dEdR1 = make_real3(0);
                real3 dEdR2 = make_real3(0);
#endif
192
#ifdef USE_EXCLUSIONS
193
194
195
                bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
#endif
                real tempEnergy = 0.0f;
196
                const real interactionScale = 0.5f;
197
198
                COMPUTE_INTERACTION
                energy += 0.5f*tempEnergy;
199
#ifdef INCLUDE_FORCES
200
201
202
203
#ifdef USE_SYMMETRIC
                force.x -= delta.x*dEdR;
                force.y -= delta.y*dEdR;
                force.z -= delta.z*dEdR;
204
#else
205
206
207
                force.x -= dEdR1.x;
                force.y -= dEdR1.y;
                force.z -= dEdR1.z;
208
#endif
209
#endif
210
#ifdef USE_EXCLUSIONS
211
                excl >>= 1;
212
#endif
213
214
            }
        }
215
216
        else {
            // This is an off-diagonal tile.
217
            unsigned int j = y*TILE_SIZE + tgx;
218
219
220
221
222
223
            real4 shflPosq = posq[j];
#ifdef ENABLE_SHUFFLE
            real3 shflForce;
            shflForce.x = 0.0f;
            shflForce.y = 0.0f;
            shflForce.z = 0.0f;
224
#else
225
226
227
228
229
230
231
232
            localData[threadIdx.x].x = shflPosq.x;
            localData[threadIdx.x].y = shflPosq.y;
            localData[threadIdx.x].z = shflPosq.z;
            localData[threadIdx.x].q = shflPosq.w;
            localData[threadIdx.x].fx = 0.0f;
            localData[threadIdx.x].fy = 0.0f;
            localData[threadIdx.x].fz = 0.0f;
#endif
233
            DECLARE_LOCAL_PARAMETERS
234
            LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
235
#ifdef USE_EXCLUSIONS
236
            excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
237
#endif
238
239
240
            unsigned int tj = tgx;
            for (j = 0; j < TILE_SIZE; j++) {
                int atom2 = tbx+tj;
241
242
#ifdef ENABLE_SHUFFLE
                real4 posq2 = shflPosq;
243
#else
244
245
                real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
#endif
246
                real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
247
#ifdef USE_PERIODIC
248
                APPLY_PERIODIC_TO_DELTA(delta)
249
250
#endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
251
252
253
254
                real invR = RSQRT(r2);
                real r = r2*invR;
                LOAD_ATOM2_PARAMETERS
                atom2 = y*TILE_SIZE+tj;
255
#ifdef USE_SYMMETRIC
256
                real dEdR = 0.0f;
257
#else
258
259
                real3 dEdR1 = make_real3(0);
                real3 dEdR2 = make_real3(0);
260
261
#endif
#ifdef USE_EXCLUSIONS
262
                bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
263
#endif
264
                real tempEnergy = 0.0f;
265
                const real interactionScale = 1.0f;
266
267
                COMPUTE_INTERACTION
                energy += tempEnergy;
268
#ifdef INCLUDE_FORCES
269
#ifdef USE_SYMMETRIC
270
271
272
273
                delta *= dEdR;
                force.x -= delta.x;
                force.y -= delta.y;
                force.z -= delta.z;
274
#ifdef ENABLE_SHUFFLE
275
276
277
                shflForce.x += delta.x;
                shflForce.y += delta.y;
                shflForce.z += delta.z;
278

279
#else
280
281
282
                localData[tbx+tj].fx += delta.x;
                localData[tbx+tj].fy += delta.y;
                localData[tbx+tj].fz += delta.z;
283
284
#endif
#else // !USE_SYMMETRIC
285
286
287
                force.x -= dEdR1.x;
                force.y -= dEdR1.y;
                force.z -= dEdR1.z;
288
#ifdef ENABLE_SHUFFLE
289
290
291
                shflForce.x += dEdR2.x;
                shflForce.y += dEdR2.y;
                shflForce.z += dEdR2.z;
292
#else
293
294
295
                localData[tbx+tj].fx += dEdR2.x;
                localData[tbx+tj].fy += dEdR2.y;
                localData[tbx+tj].fz += dEdR2.z;
296
297
#endif 
#endif // end USE_SYMMETRIC
298
#endif
299
300
#ifdef ENABLE_SHUFFLE
                SHUFFLE_WARP_DATA
301
302
303
#endif
#ifdef USE_EXCLUSIONS
                excl >>= 1;
304
#endif
305
306
                // cycles the indices
                // 0 1 2 3 4 5 6 7 -> 1 2 3 4 5 6 7 0
307
308
                tj = (tj + 1) & (TILE_SIZE - 1);
            }
309
310
            const unsigned int offset = y*TILE_SIZE + tgx;
            // write results for off diagonal tiles
311
#ifdef INCLUDE_FORCES
312
313
314
315
#ifdef ENABLE_SHUFFLE
            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (shflForce.x*0x100000000)));
            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (shflForce.y*0x100000000)));
            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (shflForce.z*0x100000000)));
316
#else
317
318
319
            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
320
#endif
321
#endif
322
        }
323
        // Write results for on and off diagonal tiles
324
#ifdef INCLUDE_FORCES
325
        const unsigned int offset = x*TILE_SIZE + tgx;
326
327
328
        atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
        atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
        atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
329
#endif
330
331
332
333
    }

    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
    // of them (no cutoff).
334

335
336
#ifdef USE_CUTOFF
    const unsigned int numTiles = interactionCount[0];
337
338
    if (numTiles > maxTiles)
        return; // There wasn't enough memory for the neighbor list.
339
340
    int pos = (int) (warp*(long long)numTiles/totalWarps);
    int end = (int) ((warp+1)*(long long)numTiles/totalWarps);
341
#else
342
343
    int pos = (int) (startTileIndex+warp*numTileIndices/totalWarps);
    int end = (int) (startTileIndex+(warp+1)*numTileIndices/totalWarps);
344
345
    int skipBase = 0;
    int currentSkipIndex = tbx;
346
347
348
    __shared__ volatile int skipTiles[THREAD_BLOCK_SIZE];
    skipTiles[threadIdx.x] = -1;
#endif
349
350
    // atomIndices can probably be shuffled as well
    // but it probably wouldn't make things any faster
351
352
353
354
355
356
357
358
    __shared__ int atomIndices[THREAD_BLOCK_SIZE];
    
    while (pos < end) {
        const bool hasExclusions = false;
        real3 force = make_real3(0);
        bool includeTile = true;

        // Extract the coordinates of this tile.
359
        int x, y;
360
361
        bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
362
363
364
365
366
367
368
369
370
371
        x = tiles[pos];
        real4 blockSizeX = blockSize[x];
        singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= MAX_CUTOFF &&
                              0.5f*periodicBoxSize.y-blockSizeX.y >= MAX_CUTOFF &&
                              0.5f*periodicBoxSize.z-blockSizeX.z >= MAX_CUTOFF);
#else
        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
            y += (x < y ? -1 : 1);
372
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
373
        }
374

375
        // Skip over tiles that have exclusions, since they were already processed.
376

377
378
        while (skipTiles[tbx+TILE_SIZE-1] < pos) {
            if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
379
                int2 tile = exclusionTiles[skipBase+tgx];
380
                skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
381
            }
382
383
384
385
            else
                skipTiles[threadIdx.x] = end;
            skipBase += TILE_SIZE;            
            currentSkipIndex = tbx;
386
        }
387
388
389
390
        while (skipTiles[currentSkipIndex] < pos)
            currentSkipIndex++;
        includeTile = (skipTiles[currentSkipIndex] != pos);
#endif
391
392
        if (includeTile) {
            unsigned int atom1 = x*TILE_SIZE + tgx;
393
            // Load atom data for this tile.
394
395
396
            real4 posq1 = posq[atom1];
            LOAD_ATOM1_PARAMETERS
#ifdef USE_CUTOFF
peastman's avatar
peastman committed
397
            unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
398
399
400
401
#else
            unsigned int j = y*TILE_SIZE + tgx;
#endif
            atomIndices[threadIdx.x] = j;
402
#ifdef ENABLE_SHUFFLE
403
            DECLARE_LOCAL_PARAMETERS
404
405
406
407
408
409
            real4 shflPosq;
            real3 shflForce;
            shflForce.x = 0.0f;
            shflForce.y = 0.0f;
            shflForce.z = 0.0f;
#endif
410
            if (j < PADDED_NUM_ATOMS) {
411
                // Load position of atom j from from global memory
412
413
#ifdef ENABLE_SHUFFLE
                shflPosq = posq[j];
414
#else
415
416
417
418
419
420
421
422
                localData[threadIdx.x].x = posq[j].x;
                localData[threadIdx.x].y = posq[j].y;
                localData[threadIdx.x].z = posq[j].z;
                localData[threadIdx.x].q = posq[j].w;
                localData[threadIdx.x].fx = 0.0f;
                localData[threadIdx.x].fy = 0.0f;
                localData[threadIdx.x].fz = 0.0f;
#endif                
423
                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
424
            }
425
426
427
428
429
430
431
432
            else {
#ifdef ENABLE_SHUFFLE
                shflPosq = make_real4(0, 0, 0, 0);
#else
                localData[threadIdx.x].x = 0;
                localData[threadIdx.x].y = 0;
                localData[threadIdx.x].z = 0;
#endif
433
                CLEAR_LOCAL_PARAMETERS
434
            }
435
436
437
438
439
#ifdef USE_PERIODIC
            if (singlePeriodicCopy) {
                // The box is small enough that we can just translate all the atoms into a single periodic
                // box, then skip having to apply periodic boundary conditions later.
                real4 blockCenterX = blockCenter[x];
440
                APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
441
#ifdef ENABLE_SHUFFLE
442
                APPLY_PERIODIC_TO_POS_WITH_CENTER(shflPosq, blockCenterX)
443
#else
444
                APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[threadIdx.x], blockCenterX)
445
#endif
446
447
448
                unsigned int tj = tgx;
                for (j = 0; j < TILE_SIZE; j++) {
                    int atom2 = tbx+tj;
449
450
#ifdef ENABLE_SHUFFLE
                    real4 posq2 = shflPosq; 
451
#else
452
453
                    real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
#endif
454
                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
455
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
456
457
458
459
                    real invR = RSQRT(r2);
                    real r = r2*invR;
                    LOAD_ATOM2_PARAMETERS
                    atom2 = atomIndices[tbx+tj];
460
#ifdef USE_SYMMETRIC
461
                    real dEdR = 0.0f;
462
#else
463
464
                    real3 dEdR1 = make_real3(0);
                    real3 dEdR2 = make_real3(0);
465
466
#endif
#ifdef USE_EXCLUSIONS
467
                    bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS);
468
#endif
469
                    real tempEnergy = 0.0f;
470
                    const real interactionScale = 1.0f;
471
472
                    COMPUTE_INTERACTION
                    energy += tempEnergy;
473
#ifdef INCLUDE_FORCES
474
#ifdef USE_SYMMETRIC
475
476
477
478
                    delta *= dEdR;
                    force.x -= delta.x;
                    force.y -= delta.y;
                    force.z -= delta.z;
479
#ifdef ENABLE_SHUFFLE
480
481
482
                    shflForce.x += delta.x;
                    shflForce.y += delta.y;
                    shflForce.z += delta.z;
483

484
#else
485
486
487
                    localData[tbx+tj].fx += delta.x;
                    localData[tbx+tj].fy += delta.y;
                    localData[tbx+tj].fz += delta.z;
488
489
#endif
#else // !USE_SYMMETRIC
490
491
492
                    force.x -= dEdR1.x;
                    force.y -= dEdR1.y;
                    force.z -= dEdR1.z;
493
#ifdef ENABLE_SHUFFLE
494
495
496
                    shflForce.x += dEdR2.x;
                    shflForce.y += dEdR2.y;
                    shflForce.z += dEdR2.z;
497
#else
498
499
500
                    localData[tbx+tj].fx += dEdR2.x;
                    localData[tbx+tj].fy += dEdR2.y;
                    localData[tbx+tj].fz += dEdR2.z;
501
502
#endif 
#endif // end USE_SYMMETRIC
503
#endif
504
#ifdef ENABLE_SHUFFLE
505
                    SHUFFLE_WARP_DATA
506
#endif
507
                    tj = (tj + 1) & (TILE_SIZE - 1);
508
                }
509
510
            }
            else
511
#endif
512
513
514
515
516
            {
                // We need to apply periodic boundary conditions separately for each interaction.
                unsigned int tj = tgx;
                for (j = 0; j < TILE_SIZE; j++) {
                    int atom2 = tbx+tj;
517
518
#ifdef ENABLE_SHUFFLE
                    real4 posq2 = shflPosq;
519
#else
520
521
                    real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
#endif
522
                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
523
#ifdef USE_PERIODIC
524
                    APPLY_PERIODIC_TO_DELTA(delta)
525
#endif
526
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
527
528
529
530
                    real invR = RSQRT(r2);
                    real r = r2*invR;
                    LOAD_ATOM2_PARAMETERS
                    atom2 = atomIndices[tbx+tj];
531
#ifdef USE_SYMMETRIC
532
                    real dEdR = 0.0f;
533
#else
534
535
                    real3 dEdR1 = make_real3(0);
                    real3 dEdR2 = make_real3(0);
536
537
#endif
#ifdef USE_EXCLUSIONS
538
                    bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS);
539
#endif
540
                    real tempEnergy = 0.0f;
541
                    const real interactionScale = 1.0f;
542
543
                    COMPUTE_INTERACTION
                    energy += tempEnergy;
544
#ifdef INCLUDE_FORCES
545
#ifdef USE_SYMMETRIC
546
547
548
549
                    delta *= dEdR;
                    force.x -= delta.x;
                    force.y -= delta.y;
                    force.z -= delta.z;
550
#ifdef ENABLE_SHUFFLE
551
552
553
                    shflForce.x += delta.x;
                    shflForce.y += delta.y;
                    shflForce.z += delta.z;
554

555
#else
556
557
558
                    localData[tbx+tj].fx += delta.x;
                    localData[tbx+tj].fy += delta.y;
                    localData[tbx+tj].fz += delta.z;
559
#endif
560
#else // !USE_SYMMETRIC
561
562
563
                    force.x -= dEdR1.x;
                    force.y -= dEdR1.y;
                    force.z -= dEdR1.z;
564
#ifdef ENABLE_SHUFFLE
565
566
567
                    shflForce.x += dEdR2.x;
                    shflForce.y += dEdR2.y;
                    shflForce.z += dEdR2.z;
568
#else
569
570
571
                    localData[tbx+tj].fx += dEdR2.x;
                    localData[tbx+tj].fy += dEdR2.y;
                    localData[tbx+tj].fz += dEdR2.z;
572
573
#endif 
#endif // end USE_SYMMETRIC
574
#endif
575
#ifdef ENABLE_SHUFFLE
576
                    SHUFFLE_WARP_DATA
577
#endif
578
                    tj = (tj + 1) & (TILE_SIZE - 1);
579
580
                }
            }
581
582

            // Write results.
583
#ifdef INCLUDE_FORCES
584
585
586
587
588
589
590
591
592
            atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
            atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
            atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
#ifdef USE_CUTOFF
            unsigned int atom2 = atomIndices[threadIdx.x];
#else
            unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
            if (atom2 < PADDED_NUM_ATOMS) {
593
594
595
596
#ifdef ENABLE_SHUFFLE
                atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (shflForce.x*0x100000000)));
                atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (shflForce.y*0x100000000)));
                atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (shflForce.z*0x100000000)));
597
#else
598
599
600
601
                atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
                atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
                atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
#endif
602
            }
603
#endif
604
605
        }
        pos++;
606
    }
607
608
609
610
    
    // Third loop: single pairs that aren't part of a tile.
    
#if USE_CUTOFF
611
    const unsigned int numPairs = interactionCount[1];
612
613
614
615
616
617
618
619
620
    if (numPairs > maxSinglePairs)
        return; // There wasn't enough memory for the neighbor list.
    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < numPairs; i += blockDim.x*gridDim.x) {
        int2 pair = singlePairs[i];
        int atom1 = pair.x;
        int atom2 = pair.y;
        real4 posq1 = posq[atom1];
        real4 posq2 = posq[atom2];
        LOAD_ATOM1_PARAMETERS
621
        LOAD_ATOM2_PARAMETERS_FROM_GLOBAL
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
        real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
        APPLY_PERIODIC_TO_DELTA(delta)
#endif
        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
        real invR = RSQRT(r2);
        real r = r2*invR;
#ifdef USE_SYMMETRIC
        real dEdR = 0.0f;
#else
        real3 dEdR1 = make_real3(0);
        real3 dEdR2 = make_real3(0);
#endif
        bool hasExclusions = false;
        bool isExcluded = false;
        real tempEnergy = 0.0f;
        const real interactionScale = 1.0f;
        COMPUTE_INTERACTION
        energy += tempEnergy;
#ifdef INCLUDE_FORCES
#ifdef USE_SYMMETRIC
        real3 dEdR1 = delta*dEdR;
        real3 dEdR2 = -dEdR1;
#endif
646
647
        saveSingleForce(atom1, -dEdR1, forceBuffers);
        saveSingleForce(atom2, -dEdR2, forceBuffers);
648
649
650
#endif
    }
#endif
651
#ifdef INCLUDE_ENERGY
652
    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
653
#endif
654
    SAVE_DERIVATIVES
655
}