nonbonded.cu 26 KB
Newer Older
1
2
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)

3
#ifndef ENABLE_SHUFFLE
4
5
6
typedef struct {
    real x, y, z;
    real q;
7
    real fx, fy, fz;
8
9
10
11
12
    ATOM_PARAMETER_DATA
#ifndef PARAMETER_SIZE_IS_EVEN
    real padding;
#endif
} AtomData;
13
14
#endif

15
#ifdef ENABLE_SHUFFLE
16
17
18
19
20
21
//support for 64 bit shuffles
static __inline__ __device__ float real_shfl(float var, int srcLane) {
    return __shfl(var, srcLane);
}

static __inline__ __device__ double real_shfl(double var, int srcLane) {
22
23
24
25
    int hi, lo;
    asm volatile("mov.b64 { %0, %1 }, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
    hi = __shfl(hi, srcLane);
    lo = __shfl(lo, srcLane);
26
27
    return __hiloint2double( hi, lo );
}
28
29
30
31
32
33
34
35
36
37

static __inline__ __device__ long long real_shfl(long long var, int srcLane) {
    int hi, lo;
    asm volatile("mov.b64 { %0, %1 }, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
    hi = __shfl(hi, srcLane);
    lo = __shfl(lo, srcLane);
    // unforunately there isn't an __nv_hiloint2long(hi,lo) intrinsic cast
    int2 fuse; fuse.x = lo; fuse.y = hi;
    return *reinterpret_cast<long long*>(&fuse);
}
38
#endif
39
40

/**
41
42
43
44
45
 * Compute nonbonded interactions. The kernel is separated into two parts,
 * tiles with exclusions and tiles without exclusions. It relies heavily on 
 * implicit warp-level synchronization. A tile is defined by two atom blocks 
 * each of warpsize. Each warp computes a range of tiles.
 * 
46
 * Tiles with exclusions compute the entire set of interactions across
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
 * atom blocks, equal to warpsize*warpsize. In order to avoid access conflicts 
 * the forces are computed and accumulated diagonally in the manner shown below
 * where, suppose
 *
 * [a-h] comprise atom block 1, [i-p] comprise atom block 2
 *
 * 1 denotes the first set of calculations within the warp
 * 2 denotes the second set of calculations within the warp
 * ... etc.
 * 
 *        threads
 *     0 1 2 3 4 5 6 7
 *         atom1 
 * L    a b c d e f g h 
 * o  i 1 2 3 4 5 6 7 8
 * c  j 8 1 2 3 4 5 6 7
 * a  k 7 8 1 2 3 4 5 6
 * l  l 6 7 8 1 2 3 4 5
 * D  m 5 6 7 8 1 2 3 4 
 * a  n 4 5 6 7 8 1 2 3
 * t  o 3 4 5 6 7 8 1 2
68
69
70
 * a  p 2 3 4 5 6 7 8 1
 *
 * Tiles without exclusions read off directly from the neighbourlist interactingAtoms
71
 * and follows the same force accumulation method. If more there are more interactingTiles
72
 * than the size of the neighbourlist initially allocated, the neighbourlist is rebuilt
73
74
75
76
77
78
 * and the full tileset is computed. This should happen on the first step, and very rarely 
 * afterwards.
 *
 * On CUDA devices that support the shuffle intrinsic, on diagonal exclusion tiles use
 * __shfl to broadcast. For all other types of tiles __shfl is used to pass around the 
 * forces, positions, and parameters when computing the forces. 
79
80
81
82
83
84
85
86
 *
 * [out]forceBuffers    - forces on each atom to eventually be accumulated
 * [out]energyBuffer    - energyBuffer to eventually be accumulated
 * [in]posq             - x,y,z,charge 
 * [in]exclusions       - 1024-bit flags denoting atom-atom exclusions for each tile
 * [in]exclusionTiles   - x,y denotes the indices of tiles that have an exclusion
 * [in]startTileIndex   - index into first tile to be processed
 * [in]numTileIndices   - number of tiles this context is responsible for processing
87
 * [in]int tiles        - the atom block for each tile
88
89
90
91
92
93
94
95
96
97
98
99
100
 * [in]interactionCount - total number of tiles that have an interaction
 * [in]maxTiles         - stores the size of the neighbourlist in case it needs 
 *                      - to be expanded
 * [in]periodicBoxSize  - size of the Periodic Box, last dimension (w) not used
 * [in]invPeriodicBox   - inverse of the periodicBoxSize, pre-computed for speed
 * [in]blockCenter      - the center of each block in euclidean coordinates
 * [in]blockSize        - size of the each block, radiating from the center
 *                      - x is half the distance of total length
 *                      - y is half the distance of total width
 *                      - z is half the distance of total height
 *                      - w is not used
 * [in]interactingAtoms - a list of interactions within a given tile     
 *
101
102
 */
extern "C" __global__ void computeNonbonded(
103
        unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const tileflags* __restrict__ exclusions,
104
        const ushort2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices
105
#ifdef USE_CUTOFF
106
107
        , const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount,real4 periodicBoxSize, real4 invPeriodicBoxSize, 
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
108
109
        const real4* __restrict__ blockSize, const unsigned int* __restrict__ interactingAtoms, unsigned int maxSinglePairs, const int* __restrict__ singlePairCount,
        const int2* __restrict__ singlePairs
110
111
#endif
        PARAMETER_ARGUMENTS) {
112
    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
113
114
115
    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; // global warpIndex
    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // index within the warp
    const unsigned int tbx = threadIdx.x - tgx;           // block warpIndex
116
    mixed energy = 0;
117
    INIT_DERIVATIVES
118
119
120
121
    // used shared memory if the device cannot shuffle
#ifndef ENABLE_SHUFFLE
    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
#endif
122

123
    // First loop: process tiles that contain exclusions.
124

125
126
127
128
129
130
    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
        const ushort2 tileIndices = exclusionTiles[pos];
        const unsigned int x = tileIndices.x;
        const unsigned int y = tileIndices.y;
131
        real3 force = make_real3(0);
132
133
134
135
136
        unsigned int atom1 = x*TILE_SIZE + tgx;
        real4 posq1 = posq[atom1];
        LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS
        tileflags excl = exclusions[pos*TILE_SIZE+tgx];
137
#endif
138
139
140
        const bool hasExclusions = true;
        if (x == y) {
            // This tile is on the diagonal.
141
142
#ifdef ENABLE_SHUFFLE
            real4 shflPosq = posq1;
143
#else
144
145
146
147
148
149
            localData[threadIdx.x].x = posq1.x;
            localData[threadIdx.x].y = posq1.y;
            localData[threadIdx.x].z = posq1.z;
            localData[threadIdx.x].q = posq1.w;
            LOAD_LOCAL_PARAMETERS_FROM_1
#endif
150

151
152
            // we do not need to fetch parameters from global since this is a symmetric tile
            // instead we can broadcast the values using shuffle
153
154
            for (unsigned int j = 0; j < TILE_SIZE; j++) {
                int atom2 = tbx+j;
155
                real4 posq2;
156
#ifdef ENABLE_SHUFFLE
157
                BROADCAST_WARP_DATA
158
#else   
159
160
                posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
#endif
161
162
                real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
163
                APPLY_PERIODIC_TO_DELTA(delta)
164
165
166
#endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                real invR = RSQRT(r2);
peastman's avatar
peastman committed
167
                real r = r2*invR;
168
169
170
171
172
173
174
175
                LOAD_ATOM2_PARAMETERS
                atom2 = y*TILE_SIZE+j;
#ifdef USE_SYMMETRIC
                real dEdR = 0.0f;
#else
                real3 dEdR1 = make_real3(0);
                real3 dEdR2 = make_real3(0);
#endif
176
#ifdef USE_EXCLUSIONS
177
178
179
                bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
#endif
                real tempEnergy = 0.0f;
180
                const real interactionScale = 0.5f;
181
182
                COMPUTE_INTERACTION
                energy += 0.5f*tempEnergy;
183
#ifdef INCLUDE_FORCES
184
185
186
187
#ifdef USE_SYMMETRIC
                force.x -= delta.x*dEdR;
                force.y -= delta.y*dEdR;
                force.z -= delta.z*dEdR;
188
#else
189
190
191
                force.x -= dEdR1.x;
                force.y -= dEdR1.y;
                force.z -= dEdR1.z;
192
#endif
193
#endif
194
#ifdef USE_EXCLUSIONS
195
                excl >>= 1;
196
#endif
197
198
            }
        }
199
200
        else {
            // This is an off-diagonal tile.
201
            unsigned int j = y*TILE_SIZE + tgx;
202
203
204
205
206
207
            real4 shflPosq = posq[j];
#ifdef ENABLE_SHUFFLE
            real3 shflForce;
            shflForce.x = 0.0f;
            shflForce.y = 0.0f;
            shflForce.z = 0.0f;
208
#else
209
210
211
212
213
214
215
216
            localData[threadIdx.x].x = shflPosq.x;
            localData[threadIdx.x].y = shflPosq.y;
            localData[threadIdx.x].z = shflPosq.z;
            localData[threadIdx.x].q = shflPosq.w;
            localData[threadIdx.x].fx = 0.0f;
            localData[threadIdx.x].fy = 0.0f;
            localData[threadIdx.x].fz = 0.0f;
#endif
217
            DECLARE_LOCAL_PARAMETERS
218
            LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
219
#ifdef USE_EXCLUSIONS
220
            excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
221
#endif
222
223
224
            unsigned int tj = tgx;
            for (j = 0; j < TILE_SIZE; j++) {
                int atom2 = tbx+tj;
225
226
#ifdef ENABLE_SHUFFLE
                real4 posq2 = shflPosq;
227
#else
228
229
                real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
#endif
230
                real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
231
#ifdef USE_PERIODIC
232
                APPLY_PERIODIC_TO_DELTA(delta)
233
234
#endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
235
236
237
238
                real invR = RSQRT(r2);
                real r = r2*invR;
                LOAD_ATOM2_PARAMETERS
                atom2 = y*TILE_SIZE+tj;
239
#ifdef USE_SYMMETRIC
240
                real dEdR = 0.0f;
241
#else
242
243
                real3 dEdR1 = make_real3(0);
                real3 dEdR2 = make_real3(0);
244
245
#endif
#ifdef USE_EXCLUSIONS
246
                bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
247
#endif
248
                real tempEnergy = 0.0f;
249
                const real interactionScale = 1.0f;
250
251
                COMPUTE_INTERACTION
                energy += tempEnergy;
252
#ifdef INCLUDE_FORCES
253
#ifdef USE_SYMMETRIC
254
255
256
257
                delta *= dEdR;
                force.x -= delta.x;
                force.y -= delta.y;
                force.z -= delta.z;
258
#ifdef ENABLE_SHUFFLE
259
260
261
                shflForce.x += delta.x;
                shflForce.y += delta.y;
                shflForce.z += delta.z;
262

263
#else
264
265
266
                localData[tbx+tj].fx += delta.x;
                localData[tbx+tj].fy += delta.y;
                localData[tbx+tj].fz += delta.z;
267
268
#endif
#else // !USE_SYMMETRIC
269
270
271
                force.x -= dEdR1.x;
                force.y -= dEdR1.y;
                force.z -= dEdR1.z;
272
#ifdef ENABLE_SHUFFLE
273
274
275
                shflForce.x += dEdR2.x;
                shflForce.y += dEdR2.y;
                shflForce.z += dEdR2.z;
276
#else
277
278
279
                localData[tbx+tj].fx += dEdR2.x;
                localData[tbx+tj].fy += dEdR2.y;
                localData[tbx+tj].fz += dEdR2.z;
280
281
282
283
#endif 
#endif // end USE_SYMMETRIC
#ifdef ENABLE_SHUFFLE
                SHUFFLE_WARP_DATA
284
285
286
287
#endif
#endif
#ifdef USE_EXCLUSIONS
                excl >>= 1;
288
#endif
289
290
                // cycles the indices
                // 0 1 2 3 4 5 6 7 -> 1 2 3 4 5 6 7 0
291
292
                tj = (tj + 1) & (TILE_SIZE - 1);
            }
293
294
            const unsigned int offset = y*TILE_SIZE + tgx;
            // write results for off diagonal tiles
295
#ifdef INCLUDE_FORCES
296
297
298
299
#ifdef ENABLE_SHUFFLE
            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (shflForce.x*0x100000000)));
            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (shflForce.y*0x100000000)));
            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (shflForce.z*0x100000000)));
300
#else
301
302
303
            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
304
#endif
305
#endif
306
        }
307
        // Write results for on and off diagonal tiles
308
#ifdef INCLUDE_FORCES
309
        const unsigned int offset = x*TILE_SIZE + tgx;
310
311
312
        atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
        atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
        atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
313
#endif
314
315
316
317
    }

    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
    // of them (no cutoff).
318

319
320
#ifdef USE_CUTOFF
    const unsigned int numTiles = interactionCount[0];
321
322
    if (numTiles > maxTiles)
        return; // There wasn't enough memory for the neighbor list.
323
324
    int pos = (int) (numTiles > maxTiles ? startTileIndex+warp*(long long)numTileIndices/totalWarps : warp*(long long)numTiles/totalWarps);
    int end = (int) (numTiles > maxTiles ? startTileIndex+(warp+1)*(long long)numTileIndices/totalWarps : (warp+1)*(long long)numTiles/totalWarps);
325
326
#else
    const unsigned int numTiles = numTileIndices;
327
328
    int pos = (int) (startTileIndex+warp*(long long)numTiles/totalWarps);
    int end = (int) (startTileIndex+(warp+1)*(long long)numTiles/totalWarps);
329
330
331
#endif
    int skipBase = 0;
    int currentSkipIndex = tbx;
332
333
    // atomIndices can probably be shuffled as well
    // but it probably wouldn't make things any faster
334
    __shared__ int atomIndices[THREAD_BLOCK_SIZE];
335
    __shared__ volatile int skipTiles[THREAD_BLOCK_SIZE];
336
337
338
339
340
341
342
343
    skipTiles[threadIdx.x] = -1;
    
    while (pos < end) {
        const bool hasExclusions = false;
        real3 force = make_real3(0);
        bool includeTile = true;

        // Extract the coordinates of this tile.
344
        int x, y;
345
346
        bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
347
348
349
350
351
352
353
354
355
356
        x = tiles[pos];
        real4 blockSizeX = blockSize[x];
        singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= MAX_CUTOFF &&
                              0.5f*periodicBoxSize.y-blockSizeX.y >= MAX_CUTOFF &&
                              0.5f*periodicBoxSize.z-blockSizeX.z >= MAX_CUTOFF);
#else
        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
            y += (x < y ? -1 : 1);
357
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
358
        }
359

360
        // Skip over tiles that have exclusions, since they were already processed.
361

362
363
364
365
        while (skipTiles[tbx+TILE_SIZE-1] < pos) {
            if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
                ushort2 tile = exclusionTiles[skipBase+tgx];
                skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
366
            }
367
368
369
370
            else
                skipTiles[threadIdx.x] = end;
            skipBase += TILE_SIZE;            
            currentSkipIndex = tbx;
371
        }
372
373
374
375
        while (skipTiles[currentSkipIndex] < pos)
            currentSkipIndex++;
        includeTile = (skipTiles[currentSkipIndex] != pos);
#endif
376
377
        if (includeTile) {
            unsigned int atom1 = x*TILE_SIZE + tgx;
378
            // Load atom data for this tile.
379
380
            real4 posq1 = posq[atom1];
            LOAD_ATOM1_PARAMETERS
381
            //const unsigned int localAtomIndex = threadIdx.x;
382
#ifdef USE_CUTOFF
peastman's avatar
peastman committed
383
            unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
384
385
386
387
#else
            unsigned int j = y*TILE_SIZE + tgx;
#endif
            atomIndices[threadIdx.x] = j;
388
#ifdef ENABLE_SHUFFLE
389
            DECLARE_LOCAL_PARAMETERS
390
391
392
393
394
395
            real4 shflPosq;
            real3 shflForce;
            shflForce.x = 0.0f;
            shflForce.y = 0.0f;
            shflForce.z = 0.0f;
#endif
396
            if (j < PADDED_NUM_ATOMS) {
397
                // Load position of atom j from from global memory
398
399
#ifdef ENABLE_SHUFFLE
                shflPosq = posq[j];
400
#else
401
402
403
404
405
406
407
408
                localData[threadIdx.x].x = posq[j].x;
                localData[threadIdx.x].y = posq[j].y;
                localData[threadIdx.x].z = posq[j].z;
                localData[threadIdx.x].q = posq[j].w;
                localData[threadIdx.x].fx = 0.0f;
                localData[threadIdx.x].fy = 0.0f;
                localData[threadIdx.x].fz = 0.0f;
#endif                
409
                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
410
            }
411
412
413
414
415
416
417
418
419
            else {
#ifdef ENABLE_SHUFFLE
                shflPosq = make_real4(0, 0, 0, 0);
#else
                localData[threadIdx.x].x = 0;
                localData[threadIdx.x].y = 0;
                localData[threadIdx.x].z = 0;
#endif
            }
420
421
422
423
424
#ifdef USE_PERIODIC
            if (singlePeriodicCopy) {
                // The box is small enough that we can just translate all the atoms into a single periodic
                // box, then skip having to apply periodic boundary conditions later.
                real4 blockCenterX = blockCenter[x];
425
                APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
426
#ifdef ENABLE_SHUFFLE
427
                APPLY_PERIODIC_TO_POS_WITH_CENTER(shflPosq, blockCenterX)
428
#else
429
                APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[threadIdx.x], blockCenterX)
430
#endif
431
432
433
                unsigned int tj = tgx;
                for (j = 0; j < TILE_SIZE; j++) {
                    int atom2 = tbx+tj;
434
435
#ifdef ENABLE_SHUFFLE
                    real4 posq2 = shflPosq; 
436
#else
437
438
                    real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
#endif
439
                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
440
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
441
442
443
444
                    real invR = RSQRT(r2);
                    real r = r2*invR;
                    LOAD_ATOM2_PARAMETERS
                    atom2 = atomIndices[tbx+tj];
445
#ifdef USE_SYMMETRIC
446
                    real dEdR = 0.0f;
447
#else
448
449
                    real3 dEdR1 = make_real3(0);
                    real3 dEdR2 = make_real3(0);
450
451
#endif
#ifdef USE_EXCLUSIONS
452
                    bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS);
453
#endif
454
                    real tempEnergy = 0.0f;
455
                    const real interactionScale = 1.0f;
456
457
                    COMPUTE_INTERACTION
                    energy += tempEnergy;
458
#ifdef INCLUDE_FORCES
459
#ifdef USE_SYMMETRIC
460
461
462
463
                    delta *= dEdR;
                    force.x -= delta.x;
                    force.y -= delta.y;
                    force.z -= delta.z;
464
#ifdef ENABLE_SHUFFLE
465
466
467
                    shflForce.x += delta.x;
                    shflForce.y += delta.y;
                    shflForce.z += delta.z;
468

469
#else
470
471
472
                    localData[tbx+tj].fx += delta.x;
                    localData[tbx+tj].fy += delta.y;
                    localData[tbx+tj].fz += delta.z;
473
474
#endif
#else // !USE_SYMMETRIC
475
476
477
                    force.x -= dEdR1.x;
                    force.y -= dEdR1.y;
                    force.z -= dEdR1.z;
478
#ifdef ENABLE_SHUFFLE
479
480
481
                    shflForce.x += dEdR2.x;
                    shflForce.y += dEdR2.y;
                    shflForce.z += dEdR2.z;
482
#else
483
484
485
                    localData[tbx+tj].fx += dEdR2.x;
                    localData[tbx+tj].fy += dEdR2.y;
                    localData[tbx+tj].fz += dEdR2.z;
486
487
488
#endif 
#endif // end USE_SYMMETRIC
#ifdef ENABLE_SHUFFLE
489
                    SHUFFLE_WARP_DATA
490
#endif
491
#endif
492
                    tj = (tj + 1) & (TILE_SIZE - 1);
493
                }
494
495
            }
            else
496
#endif
497
498
499
500
501
            {
                // We need to apply periodic boundary conditions separately for each interaction.
                unsigned int tj = tgx;
                for (j = 0; j < TILE_SIZE; j++) {
                    int atom2 = tbx+tj;
502
503
#ifdef ENABLE_SHUFFLE
                    real4 posq2 = shflPosq;
504
#else
505
506
                    real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
#endif
507
                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
508
#ifdef USE_PERIODIC
509
                    APPLY_PERIODIC_TO_DELTA(delta)
510
#endif
511
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
512
513
514
515
                    real invR = RSQRT(r2);
                    real r = r2*invR;
                    LOAD_ATOM2_PARAMETERS
                    atom2 = atomIndices[tbx+tj];
516
#ifdef USE_SYMMETRIC
517
                    real dEdR = 0.0f;
518
#else
519
520
                    real3 dEdR1 = make_real3(0);
                    real3 dEdR2 = make_real3(0);
521
522
#endif
#ifdef USE_EXCLUSIONS
523
                    bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS);
524
#endif
525
                    real tempEnergy = 0.0f;
526
                    const real interactionScale = 1.0f;
527
528
                    COMPUTE_INTERACTION
                    energy += tempEnergy;
529
#ifdef INCLUDE_FORCES
530
#ifdef USE_SYMMETRIC
531
532
533
534
                    delta *= dEdR;
                    force.x -= delta.x;
                    force.y -= delta.y;
                    force.z -= delta.z;
535
#ifdef ENABLE_SHUFFLE
536
537
538
                    shflForce.x += delta.x;
                    shflForce.y += delta.y;
                    shflForce.z += delta.z;
539

540
#else
541
542
543
                    localData[tbx+tj].fx += delta.x;
                    localData[tbx+tj].fy += delta.y;
                    localData[tbx+tj].fz += delta.z;
544
#endif
545
#else // !USE_SYMMETRIC
546
547
548
                    force.x -= dEdR1.x;
                    force.y -= dEdR1.y;
                    force.z -= dEdR1.z;
549
#ifdef ENABLE_SHUFFLE
550
551
552
                    shflForce.x += dEdR2.x;
                    shflForce.y += dEdR2.y;
                    shflForce.z += dEdR2.z;
553
#else
554
555
556
                    localData[tbx+tj].fx += dEdR2.x;
                    localData[tbx+tj].fy += dEdR2.y;
                    localData[tbx+tj].fz += dEdR2.z;
557
558
559
#endif 
#endif // end USE_SYMMETRIC
#ifdef ENABLE_SHUFFLE
560
                    SHUFFLE_WARP_DATA
561
#endif
562
#endif
563
                    tj = (tj + 1) & (TILE_SIZE - 1);
564
565
                }
            }
566
567

            // Write results.
568
#ifdef INCLUDE_FORCES
569
570
571
572
573
574
575
576
577
            atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
            atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
            atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
#ifdef USE_CUTOFF
            unsigned int atom2 = atomIndices[threadIdx.x];
#else
            unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
            if (atom2 < PADDED_NUM_ATOMS) {
578
579
580
581
#ifdef ENABLE_SHUFFLE
                atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (shflForce.x*0x100000000)));
                atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (shflForce.y*0x100000000)));
                atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (shflForce.z*0x100000000)));
582
#else
583
584
585
586
                atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
                atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
                atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
#endif
587
            }
588
#endif
589
590
        }
        pos++;
591
    }
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
    
    // Third loop: single pairs that aren't part of a tile.
    
#if USE_CUTOFF
    const unsigned int numPairs = singlePairCount[0];
    if (numPairs > maxSinglePairs)
        return; // There wasn't enough memory for the neighbor list.
    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < numPairs; i += blockDim.x*gridDim.x) {
        int2 pair = singlePairs[i];
        int atom1 = pair.x;
        int atom2 = pair.y;
        real4 posq1 = posq[atom1];
        real4 posq2 = posq[atom2];
        LOAD_ATOM1_PARAMETERS
        int j = atom2;
atom2 = threadIdx.x;
        DECLARE_LOCAL_PARAMETERS
        LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
        LOAD_ATOM2_PARAMETERS
atom2 = pair.y;
        real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
        APPLY_PERIODIC_TO_DELTA(delta)
#endif
        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
        real invR = RSQRT(r2);
        real r = r2*invR;
#ifdef USE_SYMMETRIC
        real dEdR = 0.0f;
#else
        real3 dEdR1 = make_real3(0);
        real3 dEdR2 = make_real3(0);
#endif
        bool hasExclusions = false;
        bool isExcluded = false;
        real tempEnergy = 0.0f;
        const real interactionScale = 1.0f;
        COMPUTE_INTERACTION
        energy += tempEnergy;
#ifdef INCLUDE_FORCES
#ifdef USE_SYMMETRIC
        real3 dEdR1 = delta*dEdR;
        real3 dEdR2 = -dEdR1;
#endif
        atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (-dEdR1.x*0x100000000)));
        atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-dEdR1.y*0x100000000)));
        atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-dEdR1.z*0x100000000)));
        atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (-dEdR2.x*0x100000000)));
        atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-dEdR2.y*0x100000000)));
        atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-dEdR2.z*0x100000000)));
#endif
    }
#endif
645
#ifdef INCLUDE_ENERGY
646
    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
647
#endif
648
    SAVE_DERIVATIVES
649
}