nonbonded.cu 25.9 KB
Newer Older
1
2
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)

3
#ifndef ENABLE_SHUFFLE
4
5
6
typedef struct {
    real x, y, z;
    real q;
7
    real fx, fy, fz;
8
9
10
11
12
    ATOM_PARAMETER_DATA
#ifndef PARAMETER_SIZE_IS_EVEN
    real padding;
#endif
} AtomData;
13
14
#endif

15
#ifdef ENABLE_SHUFFLE
16
17
//support for 64 bit shuffles
static __inline__ __device__ float real_shfl(float var, int srcLane) {
Peter Eastman's avatar
Peter Eastman committed
18
    return SHFL(var, srcLane);
19
20
}

21
22
23
24
static __inline__ __device__ float real_shfl(int var, int srcLane) {
    return SHFL(var, srcLane);
}

25
static __inline__ __device__ double real_shfl(double var, int srcLane) {
26
27
    int hi, lo;
    asm volatile("mov.b64 { %0, %1 }, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
Peter Eastman's avatar
Peter Eastman committed
28
29
    hi = SHFL(hi, srcLane);
    lo = SHFL(lo, srcLane);
30
31
    return __hiloint2double( hi, lo );
}
32
33
34
35

static __inline__ __device__ long long real_shfl(long long var, int srcLane) {
    int hi, lo;
    asm volatile("mov.b64 { %0, %1 }, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
Peter Eastman's avatar
Peter Eastman committed
36
37
    hi = SHFL(hi, srcLane);
    lo = SHFL(lo, srcLane);
38
39
40
41
    // unforunately there isn't an __nv_hiloint2long(hi,lo) intrinsic cast
    int2 fuse; fuse.x = lo; fuse.y = hi;
    return *reinterpret_cast<long long*>(&fuse);
}
42
#endif
43
44

/**
45
46
47
48
49
 * Compute nonbonded interactions. The kernel is separated into two parts,
 * tiles with exclusions and tiles without exclusions. It relies heavily on 
 * implicit warp-level synchronization. A tile is defined by two atom blocks 
 * each of warpsize. Each warp computes a range of tiles.
 * 
50
 * Tiles with exclusions compute the entire set of interactions across
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
 * atom blocks, equal to warpsize*warpsize. In order to avoid access conflicts 
 * the forces are computed and accumulated diagonally in the manner shown below
 * where, suppose
 *
 * [a-h] comprise atom block 1, [i-p] comprise atom block 2
 *
 * 1 denotes the first set of calculations within the warp
 * 2 denotes the second set of calculations within the warp
 * ... etc.
 * 
 *        threads
 *     0 1 2 3 4 5 6 7
 *         atom1 
 * L    a b c d e f g h 
 * o  i 1 2 3 4 5 6 7 8
 * c  j 8 1 2 3 4 5 6 7
 * a  k 7 8 1 2 3 4 5 6
 * l  l 6 7 8 1 2 3 4 5
 * D  m 5 6 7 8 1 2 3 4 
 * a  n 4 5 6 7 8 1 2 3
 * t  o 3 4 5 6 7 8 1 2
72
73
74
 * a  p 2 3 4 5 6 7 8 1
 *
 * Tiles without exclusions read off directly from the neighbourlist interactingAtoms
75
 * and follows the same force accumulation method. If more there are more interactingTiles
76
 * than the size of the neighbourlist initially allocated, the neighbourlist is rebuilt
77
78
79
80
81
82
 * and the full tileset is computed. This should happen on the first step, and very rarely 
 * afterwards.
 *
 * On CUDA devices that support the shuffle intrinsic, on diagonal exclusion tiles use
 * __shfl to broadcast. For all other types of tiles __shfl is used to pass around the 
 * forces, positions, and parameters when computing the forces. 
83
84
85
86
87
88
89
90
 *
 * [out]forceBuffers    - forces on each atom to eventually be accumulated
 * [out]energyBuffer    - energyBuffer to eventually be accumulated
 * [in]posq             - x,y,z,charge 
 * [in]exclusions       - 1024-bit flags denoting atom-atom exclusions for each tile
 * [in]exclusionTiles   - x,y denotes the indices of tiles that have an exclusion
 * [in]startTileIndex   - index into first tile to be processed
 * [in]numTileIndices   - number of tiles this context is responsible for processing
91
 * [in]int tiles        - the atom block for each tile
92
93
94
95
96
97
98
99
100
101
102
103
104
 * [in]interactionCount - total number of tiles that have an interaction
 * [in]maxTiles         - stores the size of the neighbourlist in case it needs 
 *                      - to be expanded
 * [in]periodicBoxSize  - size of the Periodic Box, last dimension (w) not used
 * [in]invPeriodicBox   - inverse of the periodicBoxSize, pre-computed for speed
 * [in]blockCenter      - the center of each block in euclidean coordinates
 * [in]blockSize        - size of the each block, radiating from the center
 *                      - x is half the distance of total length
 *                      - y is half the distance of total width
 *                      - z is half the distance of total height
 *                      - w is not used
 * [in]interactingAtoms - a list of interactions within a given tile     
 *
105
106
 */
extern "C" __global__ void computeNonbonded(
107
        unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const tileflags* __restrict__ exclusions,
108
        const int2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned long long numTileIndices
109
#ifdef USE_CUTOFF
110
        , const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, 
111
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
112
        const real4* __restrict__ blockSize, const unsigned int* __restrict__ interactingAtoms, unsigned int maxSinglePairs,
113
        const int2* __restrict__ singlePairs
114
115
#endif
        PARAMETER_ARGUMENTS) {
116
    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
117
118
119
    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; // global warpIndex
    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // index within the warp
    const unsigned int tbx = threadIdx.x - tgx;           // block warpIndex
120
    mixed energy = 0;
121
    INIT_DERIVATIVES
122
123
124
125
    // used shared memory if the device cannot shuffle
#ifndef ENABLE_SHUFFLE
    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
#endif
126

127
    // First loop: process tiles that contain exclusions.
128

129
130
131
    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
132
        const int2 tileIndices = exclusionTiles[pos];
133
134
        const unsigned int x = tileIndices.x;
        const unsigned int y = tileIndices.y;
135
        real3 force = make_real3(0);
136
137
138
139
140
        unsigned int atom1 = x*TILE_SIZE + tgx;
        real4 posq1 = posq[atom1];
        LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS
        tileflags excl = exclusions[pos*TILE_SIZE+tgx];
141
#endif
142
143
144
        const bool hasExclusions = true;
        if (x == y) {
            // This tile is on the diagonal.
145
146
#ifdef ENABLE_SHUFFLE
            real4 shflPosq = posq1;
147
#else
148
149
150
151
152
153
            localData[threadIdx.x].x = posq1.x;
            localData[threadIdx.x].y = posq1.y;
            localData[threadIdx.x].z = posq1.z;
            localData[threadIdx.x].q = posq1.w;
            LOAD_LOCAL_PARAMETERS_FROM_1
#endif
154

155
156
            // we do not need to fetch parameters from global since this is a symmetric tile
            // instead we can broadcast the values using shuffle
157
158
            for (unsigned int j = 0; j < TILE_SIZE; j++) {
                int atom2 = tbx+j;
159
                real4 posq2;
160
#ifdef ENABLE_SHUFFLE
161
                BROADCAST_WARP_DATA
162
#else   
163
164
                posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
#endif
165
166
                real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
167
                APPLY_PERIODIC_TO_DELTA(delta)
168
169
170
#endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                real invR = RSQRT(r2);
peastman's avatar
peastman committed
171
                real r = r2*invR;
172
173
174
175
176
177
178
179
                LOAD_ATOM2_PARAMETERS
                atom2 = y*TILE_SIZE+j;
#ifdef USE_SYMMETRIC
                real dEdR = 0.0f;
#else
                real3 dEdR1 = make_real3(0);
                real3 dEdR2 = make_real3(0);
#endif
180
#ifdef USE_EXCLUSIONS
181
182
183
                bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
#endif
                real tempEnergy = 0.0f;
184
                const real interactionScale = 0.5f;
185
186
                COMPUTE_INTERACTION
                energy += 0.5f*tempEnergy;
187
#ifdef INCLUDE_FORCES
188
189
190
191
#ifdef USE_SYMMETRIC
                force.x -= delta.x*dEdR;
                force.y -= delta.y*dEdR;
                force.z -= delta.z*dEdR;
192
#else
193
194
195
                force.x -= dEdR1.x;
                force.y -= dEdR1.y;
                force.z -= dEdR1.z;
196
#endif
197
#endif
198
#ifdef USE_EXCLUSIONS
199
                excl >>= 1;
200
#endif
201
202
            }
        }
203
204
        else {
            // This is an off-diagonal tile.
205
            unsigned int j = y*TILE_SIZE + tgx;
206
207
208
209
210
211
            real4 shflPosq = posq[j];
#ifdef ENABLE_SHUFFLE
            real3 shflForce;
            shflForce.x = 0.0f;
            shflForce.y = 0.0f;
            shflForce.z = 0.0f;
212
#else
213
214
215
216
217
218
219
220
            localData[threadIdx.x].x = shflPosq.x;
            localData[threadIdx.x].y = shflPosq.y;
            localData[threadIdx.x].z = shflPosq.z;
            localData[threadIdx.x].q = shflPosq.w;
            localData[threadIdx.x].fx = 0.0f;
            localData[threadIdx.x].fy = 0.0f;
            localData[threadIdx.x].fz = 0.0f;
#endif
221
            DECLARE_LOCAL_PARAMETERS
222
            LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
223
#ifdef USE_EXCLUSIONS
224
            excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
225
#endif
226
227
228
            unsigned int tj = tgx;
            for (j = 0; j < TILE_SIZE; j++) {
                int atom2 = tbx+tj;
229
230
#ifdef ENABLE_SHUFFLE
                real4 posq2 = shflPosq;
231
#else
232
233
                real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
#endif
234
                real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
235
#ifdef USE_PERIODIC
236
                APPLY_PERIODIC_TO_DELTA(delta)
237
238
#endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
239
240
241
242
                real invR = RSQRT(r2);
                real r = r2*invR;
                LOAD_ATOM2_PARAMETERS
                atom2 = y*TILE_SIZE+tj;
243
#ifdef USE_SYMMETRIC
244
                real dEdR = 0.0f;
245
#else
246
247
                real3 dEdR1 = make_real3(0);
                real3 dEdR2 = make_real3(0);
248
249
#endif
#ifdef USE_EXCLUSIONS
250
                bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
251
#endif
252
                real tempEnergy = 0.0f;
253
                const real interactionScale = 1.0f;
254
255
                COMPUTE_INTERACTION
                energy += tempEnergy;
256
#ifdef INCLUDE_FORCES
257
#ifdef USE_SYMMETRIC
258
259
260
261
                delta *= dEdR;
                force.x -= delta.x;
                force.y -= delta.y;
                force.z -= delta.z;
262
#ifdef ENABLE_SHUFFLE
263
264
265
                shflForce.x += delta.x;
                shflForce.y += delta.y;
                shflForce.z += delta.z;
266

267
#else
268
269
270
                localData[tbx+tj].fx += delta.x;
                localData[tbx+tj].fy += delta.y;
                localData[tbx+tj].fz += delta.z;
271
272
#endif
#else // !USE_SYMMETRIC
273
274
275
                force.x -= dEdR1.x;
                force.y -= dEdR1.y;
                force.z -= dEdR1.z;
276
#ifdef ENABLE_SHUFFLE
277
278
279
                shflForce.x += dEdR2.x;
                shflForce.y += dEdR2.y;
                shflForce.z += dEdR2.z;
280
#else
281
282
283
                localData[tbx+tj].fx += dEdR2.x;
                localData[tbx+tj].fy += dEdR2.y;
                localData[tbx+tj].fz += dEdR2.z;
284
285
#endif 
#endif // end USE_SYMMETRIC
286
#endif
287
288
#ifdef ENABLE_SHUFFLE
                SHUFFLE_WARP_DATA
289
290
291
#endif
#ifdef USE_EXCLUSIONS
                excl >>= 1;
292
#endif
293
294
                // cycles the indices
                // 0 1 2 3 4 5 6 7 -> 1 2 3 4 5 6 7 0
295
296
                tj = (tj + 1) & (TILE_SIZE - 1);
            }
297
298
            const unsigned int offset = y*TILE_SIZE + tgx;
            // write results for off diagonal tiles
299
#ifdef INCLUDE_FORCES
300
301
302
303
#ifdef ENABLE_SHUFFLE
            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (shflForce.x*0x100000000)));
            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (shflForce.y*0x100000000)));
            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (shflForce.z*0x100000000)));
304
#else
305
306
307
            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
308
#endif
309
#endif
310
        }
311
        // Write results for on and off diagonal tiles
312
#ifdef INCLUDE_FORCES
313
        const unsigned int offset = x*TILE_SIZE + tgx;
314
315
316
        atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
        atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
        atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
317
#endif
318
319
320
321
    }

    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
    // of them (no cutoff).
322

323
324
#ifdef USE_CUTOFF
    const unsigned int numTiles = interactionCount[0];
325
326
    if (numTiles > maxTiles)
        return; // There wasn't enough memory for the neighbor list.
327
328
    int pos = (int) (warp*(long long)numTiles/totalWarps);
    int end = (int) ((warp+1)*(long long)numTiles/totalWarps);
329
#else
330
331
    int pos = (int) (startTileIndex+warp*numTileIndices/totalWarps);
    int end = (int) (startTileIndex+(warp+1)*numTileIndices/totalWarps);
332
333
334
#endif
    int skipBase = 0;
    int currentSkipIndex = tbx;
335
336
    // atomIndices can probably be shuffled as well
    // but it probably wouldn't make things any faster
337
    __shared__ int atomIndices[THREAD_BLOCK_SIZE];
338
    __shared__ volatile int skipTiles[THREAD_BLOCK_SIZE];
339
340
341
342
343
344
345
346
    skipTiles[threadIdx.x] = -1;
    
    while (pos < end) {
        const bool hasExclusions = false;
        real3 force = make_real3(0);
        bool includeTile = true;

        // Extract the coordinates of this tile.
347
        int x, y;
348
349
        bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
350
351
352
353
354
355
356
357
358
359
        x = tiles[pos];
        real4 blockSizeX = blockSize[x];
        singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= MAX_CUTOFF &&
                              0.5f*periodicBoxSize.y-blockSizeX.y >= MAX_CUTOFF &&
                              0.5f*periodicBoxSize.z-blockSizeX.z >= MAX_CUTOFF);
#else
        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
            y += (x < y ? -1 : 1);
360
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
361
        }
362

363
        // Skip over tiles that have exclusions, since they were already processed.
364

365
366
        while (skipTiles[tbx+TILE_SIZE-1] < pos) {
            if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
367
                int2 tile = exclusionTiles[skipBase+tgx];
368
                skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
369
            }
370
371
372
373
            else
                skipTiles[threadIdx.x] = end;
            skipBase += TILE_SIZE;            
            currentSkipIndex = tbx;
374
        }
375
376
377
378
        while (skipTiles[currentSkipIndex] < pos)
            currentSkipIndex++;
        includeTile = (skipTiles[currentSkipIndex] != pos);
#endif
379
380
        if (includeTile) {
            unsigned int atom1 = x*TILE_SIZE + tgx;
381
            // Load atom data for this tile.
382
383
            real4 posq1 = posq[atom1];
            LOAD_ATOM1_PARAMETERS
384
            //const unsigned int localAtomIndex = threadIdx.x;
385
#ifdef USE_CUTOFF
peastman's avatar
peastman committed
386
            unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
387
388
389
390
#else
            unsigned int j = y*TILE_SIZE + tgx;
#endif
            atomIndices[threadIdx.x] = j;
391
#ifdef ENABLE_SHUFFLE
392
            DECLARE_LOCAL_PARAMETERS
393
394
395
396
397
398
            real4 shflPosq;
            real3 shflForce;
            shflForce.x = 0.0f;
            shflForce.y = 0.0f;
            shflForce.z = 0.0f;
#endif
399
            if (j < PADDED_NUM_ATOMS) {
400
                // Load position of atom j from from global memory
401
402
#ifdef ENABLE_SHUFFLE
                shflPosq = posq[j];
403
#else
404
405
406
407
408
409
410
411
                localData[threadIdx.x].x = posq[j].x;
                localData[threadIdx.x].y = posq[j].y;
                localData[threadIdx.x].z = posq[j].z;
                localData[threadIdx.x].q = posq[j].w;
                localData[threadIdx.x].fx = 0.0f;
                localData[threadIdx.x].fy = 0.0f;
                localData[threadIdx.x].fz = 0.0f;
#endif                
412
                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
413
            }
414
415
416
417
418
419
420
421
            else {
#ifdef ENABLE_SHUFFLE
                shflPosq = make_real4(0, 0, 0, 0);
#else
                localData[threadIdx.x].x = 0;
                localData[threadIdx.x].y = 0;
                localData[threadIdx.x].z = 0;
#endif
422
                CLEAR_LOCAL_PARAMETERS
423
            }
424
425
426
427
428
#ifdef USE_PERIODIC
            if (singlePeriodicCopy) {
                // The box is small enough that we can just translate all the atoms into a single periodic
                // box, then skip having to apply periodic boundary conditions later.
                real4 blockCenterX = blockCenter[x];
429
                APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
430
#ifdef ENABLE_SHUFFLE
431
                APPLY_PERIODIC_TO_POS_WITH_CENTER(shflPosq, blockCenterX)
432
#else
433
                APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[threadIdx.x], blockCenterX)
434
#endif
435
436
437
                unsigned int tj = tgx;
                for (j = 0; j < TILE_SIZE; j++) {
                    int atom2 = tbx+tj;
438
439
#ifdef ENABLE_SHUFFLE
                    real4 posq2 = shflPosq; 
440
#else
441
442
                    real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
#endif
443
                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
444
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
445
446
447
448
                    real invR = RSQRT(r2);
                    real r = r2*invR;
                    LOAD_ATOM2_PARAMETERS
                    atom2 = atomIndices[tbx+tj];
449
#ifdef USE_SYMMETRIC
450
                    real dEdR = 0.0f;
451
#else
452
453
                    real3 dEdR1 = make_real3(0);
                    real3 dEdR2 = make_real3(0);
454
455
#endif
#ifdef USE_EXCLUSIONS
456
                    bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS);
457
#endif
458
                    real tempEnergy = 0.0f;
459
                    const real interactionScale = 1.0f;
460
461
                    COMPUTE_INTERACTION
                    energy += tempEnergy;
462
#ifdef INCLUDE_FORCES
463
#ifdef USE_SYMMETRIC
464
465
466
467
                    delta *= dEdR;
                    force.x -= delta.x;
                    force.y -= delta.y;
                    force.z -= delta.z;
468
#ifdef ENABLE_SHUFFLE
469
470
471
                    shflForce.x += delta.x;
                    shflForce.y += delta.y;
                    shflForce.z += delta.z;
472

473
#else
474
475
476
                    localData[tbx+tj].fx += delta.x;
                    localData[tbx+tj].fy += delta.y;
                    localData[tbx+tj].fz += delta.z;
477
478
#endif
#else // !USE_SYMMETRIC
479
480
481
                    force.x -= dEdR1.x;
                    force.y -= dEdR1.y;
                    force.z -= dEdR1.z;
482
#ifdef ENABLE_SHUFFLE
483
484
485
                    shflForce.x += dEdR2.x;
                    shflForce.y += dEdR2.y;
                    shflForce.z += dEdR2.z;
486
#else
487
488
489
                    localData[tbx+tj].fx += dEdR2.x;
                    localData[tbx+tj].fy += dEdR2.y;
                    localData[tbx+tj].fz += dEdR2.z;
490
491
#endif 
#endif // end USE_SYMMETRIC
492
#endif
493
#ifdef ENABLE_SHUFFLE
494
                    SHUFFLE_WARP_DATA
495
#endif
496
                    tj = (tj + 1) & (TILE_SIZE - 1);
497
                }
498
499
            }
            else
500
#endif
501
502
503
504
505
            {
                // We need to apply periodic boundary conditions separately for each interaction.
                unsigned int tj = tgx;
                for (j = 0; j < TILE_SIZE; j++) {
                    int atom2 = tbx+tj;
506
507
#ifdef ENABLE_SHUFFLE
                    real4 posq2 = shflPosq;
508
#else
509
510
                    real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
#endif
511
                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
512
#ifdef USE_PERIODIC
513
                    APPLY_PERIODIC_TO_DELTA(delta)
514
#endif
515
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
516
517
518
519
                    real invR = RSQRT(r2);
                    real r = r2*invR;
                    LOAD_ATOM2_PARAMETERS
                    atom2 = atomIndices[tbx+tj];
520
#ifdef USE_SYMMETRIC
521
                    real dEdR = 0.0f;
522
#else
523
524
                    real3 dEdR1 = make_real3(0);
                    real3 dEdR2 = make_real3(0);
525
526
#endif
#ifdef USE_EXCLUSIONS
527
                    bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS);
528
#endif
529
                    real tempEnergy = 0.0f;
530
                    const real interactionScale = 1.0f;
531
532
                    COMPUTE_INTERACTION
                    energy += tempEnergy;
533
#ifdef INCLUDE_FORCES
534
#ifdef USE_SYMMETRIC
535
536
537
538
                    delta *= dEdR;
                    force.x -= delta.x;
                    force.y -= delta.y;
                    force.z -= delta.z;
539
#ifdef ENABLE_SHUFFLE
540
541
542
                    shflForce.x += delta.x;
                    shflForce.y += delta.y;
                    shflForce.z += delta.z;
543

544
#else
545
546
547
                    localData[tbx+tj].fx += delta.x;
                    localData[tbx+tj].fy += delta.y;
                    localData[tbx+tj].fz += delta.z;
548
#endif
549
#else // !USE_SYMMETRIC
550
551
552
                    force.x -= dEdR1.x;
                    force.y -= dEdR1.y;
                    force.z -= dEdR1.z;
553
#ifdef ENABLE_SHUFFLE
554
555
556
                    shflForce.x += dEdR2.x;
                    shflForce.y += dEdR2.y;
                    shflForce.z += dEdR2.z;
557
#else
558
559
560
                    localData[tbx+tj].fx += dEdR2.x;
                    localData[tbx+tj].fy += dEdR2.y;
                    localData[tbx+tj].fz += dEdR2.z;
561
562
#endif 
#endif // end USE_SYMMETRIC
563
#endif
564
#ifdef ENABLE_SHUFFLE
565
                    SHUFFLE_WARP_DATA
566
#endif
567
                    tj = (tj + 1) & (TILE_SIZE - 1);
568
569
                }
            }
570
571

            // Write results.
572
#ifdef INCLUDE_FORCES
573
574
575
576
577
578
579
580
581
            atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
            atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
            atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
#ifdef USE_CUTOFF
            unsigned int atom2 = atomIndices[threadIdx.x];
#else
            unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
            if (atom2 < PADDED_NUM_ATOMS) {
582
583
584
585
#ifdef ENABLE_SHUFFLE
                atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (shflForce.x*0x100000000)));
                atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (shflForce.y*0x100000000)));
                atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (shflForce.z*0x100000000)));
586
#else
587
588
589
590
                atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
                atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
                atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
#endif
591
            }
592
#endif
593
594
        }
        pos++;
595
    }
596
597
598
599
    
    // Third loop: single pairs that aren't part of a tile.
    
#if USE_CUTOFF
600
    const unsigned int numPairs = interactionCount[1];
601
602
603
604
605
606
607
608
609
610
    if (numPairs > maxSinglePairs)
        return; // There wasn't enough memory for the neighbor list.
    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < numPairs; i += blockDim.x*gridDim.x) {
        int2 pair = singlePairs[i];
        int atom1 = pair.x;
        int atom2 = pair.y;
        real4 posq1 = posq[atom1];
        real4 posq2 = posq[atom2];
        LOAD_ATOM1_PARAMETERS
        int j = atom2;
611
        atom2 = threadIdx.x;
612
613
614
        DECLARE_LOCAL_PARAMETERS
        LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
        LOAD_ATOM2_PARAMETERS
615
        atom2 = pair.y;
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
        real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
        APPLY_PERIODIC_TO_DELTA(delta)
#endif
        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
        real invR = RSQRT(r2);
        real r = r2*invR;
#ifdef USE_SYMMETRIC
        real dEdR = 0.0f;
#else
        real3 dEdR1 = make_real3(0);
        real3 dEdR2 = make_real3(0);
#endif
        bool hasExclusions = false;
        bool isExcluded = false;
        real tempEnergy = 0.0f;
        const real interactionScale = 1.0f;
        COMPUTE_INTERACTION
        energy += tempEnergy;
#ifdef INCLUDE_FORCES
#ifdef USE_SYMMETRIC
        real3 dEdR1 = delta*dEdR;
        real3 dEdR2 = -dEdR1;
#endif
        atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (-dEdR1.x*0x100000000)));
        atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-dEdR1.y*0x100000000)));
        atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-dEdR1.z*0x100000000)));
        atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (-dEdR2.x*0x100000000)));
        atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-dEdR2.y*0x100000000)));
        atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-dEdR2.z*0x100000000)));
#endif
    }
#endif
649
#ifdef INCLUDE_ENERGY
650
    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
651
#endif
652
    SAVE_DERIVATIVES
653
}