nonbonded.cu 23.7 KB
Newer Older
1
2
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)

3
#ifndef ENABLE_SHUFFLE
4
5
6
typedef struct {
    real x, y, z;
    real q;
7
    real fx, fy, fz;
8
9
10
11
12
    ATOM_PARAMETER_DATA
#ifndef PARAMETER_SIZE_IS_EVEN
    real padding;
#endif
} AtomData;
13
14
#endif

15
#ifdef ENABLE_SHUFFLE
16
17
18
19
20
21
//support for 64 bit shuffles
static __inline__ __device__ float real_shfl(float var, int srcLane) {
    return __shfl(var, srcLane);
}

static __inline__ __device__ double real_shfl(double var, int srcLane) {
22
23
24
25
    int hi, lo;
    asm volatile("mov.b64 { %0, %1 }, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
    hi = __shfl(hi, srcLane);
    lo = __shfl(lo, srcLane);
26
27
    return __hiloint2double( hi, lo );
}
28
29
30
31
32
33
34
35
36
37

static __inline__ __device__ long long real_shfl(long long var, int srcLane) {
    int hi, lo;
    asm volatile("mov.b64 { %0, %1 }, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
    hi = __shfl(hi, srcLane);
    lo = __shfl(lo, srcLane);
    // unforunately there isn't an __nv_hiloint2long(hi,lo) intrinsic cast
    int2 fuse; fuse.x = lo; fuse.y = hi;
    return *reinterpret_cast<long long*>(&fuse);
}
38
#endif
39
40

/**
41
42
43
44
45
 * Compute nonbonded interactions. The kernel is separated into two parts,
 * tiles with exclusions and tiles without exclusions. It relies heavily on 
 * implicit warp-level synchronization. A tile is defined by two atom blocks 
 * each of warpsize. Each warp computes a range of tiles.
 * 
46
 * Tiles with exclusions compute the entire set of interactions across
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
 * atom blocks, equal to warpsize*warpsize. In order to avoid access conflicts 
 * the forces are computed and accumulated diagonally in the manner shown below
 * where, suppose
 *
 * [a-h] comprise atom block 1, [i-p] comprise atom block 2
 *
 * 1 denotes the first set of calculations within the warp
 * 2 denotes the second set of calculations within the warp
 * ... etc.
 * 
 *        threads
 *     0 1 2 3 4 5 6 7
 *         atom1 
 * L    a b c d e f g h 
 * o  i 1 2 3 4 5 6 7 8
 * c  j 8 1 2 3 4 5 6 7
 * a  k 7 8 1 2 3 4 5 6
 * l  l 6 7 8 1 2 3 4 5
 * D  m 5 6 7 8 1 2 3 4 
 * a  n 4 5 6 7 8 1 2 3
 * t  o 3 4 5 6 7 8 1 2
68
69
70
 * a  p 2 3 4 5 6 7 8 1
 *
 * Tiles without exclusions read off directly from the neighbourlist interactingAtoms
71
 * and follows the same force accumulation method. If more there are more interactingTiles
72
 * than the size of the neighbourlist initially allocated, the neighbourlist is rebuilt
73
74
75
76
77
78
 * and the full tileset is computed. This should happen on the first step, and very rarely 
 * afterwards.
 *
 * On CUDA devices that support the shuffle intrinsic, on diagonal exclusion tiles use
 * __shfl to broadcast. For all other types of tiles __shfl is used to pass around the 
 * forces, positions, and parameters when computing the forces. 
79
80
81
82
83
84
85
86
 *
 * [out]forceBuffers    - forces on each atom to eventually be accumulated
 * [out]energyBuffer    - energyBuffer to eventually be accumulated
 * [in]posq             - x,y,z,charge 
 * [in]exclusions       - 1024-bit flags denoting atom-atom exclusions for each tile
 * [in]exclusionTiles   - x,y denotes the indices of tiles that have an exclusion
 * [in]startTileIndex   - index into first tile to be processed
 * [in]numTileIndices   - number of tiles this context is responsible for processing
87
 * [in]int tiles        - the atom block for each tile
88
89
90
91
92
93
94
95
96
97
98
99
100
 * [in]interactionCount - total number of tiles that have an interaction
 * [in]maxTiles         - stores the size of the neighbourlist in case it needs 
 *                      - to be expanded
 * [in]periodicBoxSize  - size of the Periodic Box, last dimension (w) not used
 * [in]invPeriodicBox   - inverse of the periodicBoxSize, pre-computed for speed
 * [in]blockCenter      - the center of each block in euclidean coordinates
 * [in]blockSize        - size of the each block, radiating from the center
 *                      - x is half the distance of total length
 *                      - y is half the distance of total width
 *                      - z is half the distance of total height
 *                      - w is not used
 * [in]interactingAtoms - a list of interactions within a given tile     
 *
101
102
 */
extern "C" __global__ void computeNonbonded(
103
        unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const tileflags* __restrict__ exclusions,
104
        const ushort2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices
105
#ifdef USE_CUTOFF
106
107
108
        , const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount,real4 periodicBoxSize, real4 invPeriodicBoxSize, 
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
        const real4* __restrict__ blockSize, const unsigned int* __restrict__ interactingAtoms
109
110
#endif
        PARAMETER_ARGUMENTS) {
111
    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
112
113
114
    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; // global warpIndex
    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // index within the warp
    const unsigned int tbx = threadIdx.x - tgx;           // block warpIndex
115
    mixed energy = 0;
116
    INIT_DERIVATIVES
117
118
119
120
    // used shared memory if the device cannot shuffle
#ifndef ENABLE_SHUFFLE
    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
#endif
121

122
    // First loop: process tiles that contain exclusions.
123

124
125
126
127
128
129
    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
        const ushort2 tileIndices = exclusionTiles[pos];
        const unsigned int x = tileIndices.x;
        const unsigned int y = tileIndices.y;
130
        real3 force = make_real3(0);
131
132
133
134
135
        unsigned int atom1 = x*TILE_SIZE + tgx;
        real4 posq1 = posq[atom1];
        LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS
        tileflags excl = exclusions[pos*TILE_SIZE+tgx];
136
#endif
137
138
139
        const bool hasExclusions = true;
        if (x == y) {
            // This tile is on the diagonal.
140
141
#ifdef ENABLE_SHUFFLE
            real4 shflPosq = posq1;
142
#else
143
144
145
146
147
148
            localData[threadIdx.x].x = posq1.x;
            localData[threadIdx.x].y = posq1.y;
            localData[threadIdx.x].z = posq1.z;
            localData[threadIdx.x].q = posq1.w;
            LOAD_LOCAL_PARAMETERS_FROM_1
#endif
149

150
151
            // we do not need to fetch parameters from global since this is a symmetric tile
            // instead we can broadcast the values using shuffle
152
153
            for (unsigned int j = 0; j < TILE_SIZE; j++) {
                int atom2 = tbx+j;
154
                real4 posq2;
155
#ifdef ENABLE_SHUFFLE
156
                BROADCAST_WARP_DATA
157
#else   
158
159
                posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
#endif
160
161
                real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
162
                APPLY_PERIODIC_TO_DELTA(delta)
163
164
165
#endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                real invR = RSQRT(r2);
peastman's avatar
peastman committed
166
                real r = r2*invR;
167
168
169
170
171
172
173
174
                LOAD_ATOM2_PARAMETERS
                atom2 = y*TILE_SIZE+j;
#ifdef USE_SYMMETRIC
                real dEdR = 0.0f;
#else
                real3 dEdR1 = make_real3(0);
                real3 dEdR2 = make_real3(0);
#endif
175
#ifdef USE_EXCLUSIONS
176
177
178
                bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
#endif
                real tempEnergy = 0.0f;
179
                const real interactionScale = 0.5f;
180
181
                COMPUTE_INTERACTION
                energy += 0.5f*tempEnergy;
182
#ifdef INCLUDE_FORCES
183
184
185
186
#ifdef USE_SYMMETRIC
                force.x -= delta.x*dEdR;
                force.y -= delta.y*dEdR;
                force.z -= delta.z*dEdR;
187
#else
188
189
190
                force.x -= dEdR1.x;
                force.y -= dEdR1.y;
                force.z -= dEdR1.z;
191
#endif
192
#endif
193
#ifdef USE_EXCLUSIONS
194
                excl >>= 1;
195
#endif
196
197
            }
        }
198
199
        else {
            // This is an off-diagonal tile.
200
            unsigned int j = y*TILE_SIZE + tgx;
201
202
203
204
205
206
            real4 shflPosq = posq[j];
#ifdef ENABLE_SHUFFLE
            real3 shflForce;
            shflForce.x = 0.0f;
            shflForce.y = 0.0f;
            shflForce.z = 0.0f;
207
#else
208
209
210
211
212
213
214
215
            localData[threadIdx.x].x = shflPosq.x;
            localData[threadIdx.x].y = shflPosq.y;
            localData[threadIdx.x].z = shflPosq.z;
            localData[threadIdx.x].q = shflPosq.w;
            localData[threadIdx.x].fx = 0.0f;
            localData[threadIdx.x].fy = 0.0f;
            localData[threadIdx.x].fz = 0.0f;
#endif
216
            DECLARE_LOCAL_PARAMETERS
217
            LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
218
#ifdef USE_EXCLUSIONS
219
            excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
220
#endif
221
222
223
            unsigned int tj = tgx;
            for (j = 0; j < TILE_SIZE; j++) {
                int atom2 = tbx+tj;
224
225
#ifdef ENABLE_SHUFFLE
                real4 posq2 = shflPosq;
226
#else
227
228
                real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
#endif
229
                real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
230
#ifdef USE_PERIODIC
231
                APPLY_PERIODIC_TO_DELTA(delta)
232
233
#endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
234
235
236
237
                real invR = RSQRT(r2);
                real r = r2*invR;
                LOAD_ATOM2_PARAMETERS
                atom2 = y*TILE_SIZE+tj;
238
#ifdef USE_SYMMETRIC
239
                real dEdR = 0.0f;
240
#else
241
242
                real3 dEdR1 = make_real3(0);
                real3 dEdR2 = make_real3(0);
243
244
#endif
#ifdef USE_EXCLUSIONS
245
                bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
246
#endif
247
                real tempEnergy = 0.0f;
248
                const real interactionScale = 1.0f;
249
250
                COMPUTE_INTERACTION
                energy += tempEnergy;
251
#ifdef INCLUDE_FORCES
252
#ifdef USE_SYMMETRIC
253
254
255
256
                delta *= dEdR;
                force.x -= delta.x;
                force.y -= delta.y;
                force.z -= delta.z;
257
#ifdef ENABLE_SHUFFLE
258
259
260
                shflForce.x += delta.x;
                shflForce.y += delta.y;
                shflForce.z += delta.z;
261

262
#else
263
264
265
                localData[tbx+tj].fx += delta.x;
                localData[tbx+tj].fy += delta.y;
                localData[tbx+tj].fz += delta.z;
266
267
#endif
#else // !USE_SYMMETRIC
268
269
270
                force.x -= dEdR1.x;
                force.y -= dEdR1.y;
                force.z -= dEdR1.z;
271
#ifdef ENABLE_SHUFFLE
272
273
274
                shflForce.x += dEdR2.x;
                shflForce.y += dEdR2.y;
                shflForce.z += dEdR2.z;
275
#else
276
277
278
                localData[tbx+tj].fx += dEdR2.x;
                localData[tbx+tj].fy += dEdR2.y;
                localData[tbx+tj].fz += dEdR2.z;
279
280
#endif 
#endif // end USE_SYMMETRIC
281
#endif
282
283
#ifdef ENABLE_SHUFFLE
                SHUFFLE_WARP_DATA
284
285
286
#endif
#ifdef USE_EXCLUSIONS
                excl >>= 1;
287
#endif
288
289
                // cycles the indices
                // 0 1 2 3 4 5 6 7 -> 1 2 3 4 5 6 7 0
290
291
                tj = (tj + 1) & (TILE_SIZE - 1);
            }
292
293
            const unsigned int offset = y*TILE_SIZE + tgx;
            // write results for off diagonal tiles
294
#ifdef INCLUDE_FORCES
295
296
297
298
#ifdef ENABLE_SHUFFLE
            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (shflForce.x*0x100000000)));
            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (shflForce.y*0x100000000)));
            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (shflForce.z*0x100000000)));
299
#else
300
301
302
            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
303
#endif
304
#endif
305
        }
306
        // Write results for on and off diagonal tiles
307
#ifdef INCLUDE_FORCES
308
        const unsigned int offset = x*TILE_SIZE + tgx;
309
310
311
        atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
        atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
        atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
312
#endif
313
314
315
316
    }

    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
    // of them (no cutoff).
317

318
319
#ifdef USE_CUTOFF
    const unsigned int numTiles = interactionCount[0];
320
321
    if (numTiles > maxTiles)
        return; // There wasn't enough memory for the neighbor list.
322
323
    int pos = (int) (numTiles > maxTiles ? startTileIndex+warp*(long long)numTileIndices/totalWarps : warp*(long long)numTiles/totalWarps);
    int end = (int) (numTiles > maxTiles ? startTileIndex+(warp+1)*(long long)numTileIndices/totalWarps : (warp+1)*(long long)numTiles/totalWarps);
324
325
#else
    const unsigned int numTiles = numTileIndices;
326
327
    int pos = (int) (startTileIndex+warp*(long long)numTiles/totalWarps);
    int end = (int) (startTileIndex+(warp+1)*(long long)numTiles/totalWarps);
328
329
330
#endif
    int skipBase = 0;
    int currentSkipIndex = tbx;
331
332
    // atomIndices can probably be shuffled as well
    // but it probably wouldn't make things any faster
333
    __shared__ int atomIndices[THREAD_BLOCK_SIZE];
334
    __shared__ volatile int skipTiles[THREAD_BLOCK_SIZE];
335
336
337
338
339
340
341
342
    skipTiles[threadIdx.x] = -1;
    
    while (pos < end) {
        const bool hasExclusions = false;
        real3 force = make_real3(0);
        bool includeTile = true;

        // Extract the coordinates of this tile.
343
        int x, y;
344
345
        bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
346
347
348
349
350
351
352
353
354
355
        x = tiles[pos];
        real4 blockSizeX = blockSize[x];
        singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= MAX_CUTOFF &&
                              0.5f*periodicBoxSize.y-blockSizeX.y >= MAX_CUTOFF &&
                              0.5f*periodicBoxSize.z-blockSizeX.z >= MAX_CUTOFF);
#else
        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
            y += (x < y ? -1 : 1);
356
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
357
        }
358

359
        // Skip over tiles that have exclusions, since they were already processed.
360

361
362
363
364
        while (skipTiles[tbx+TILE_SIZE-1] < pos) {
            if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
                ushort2 tile = exclusionTiles[skipBase+tgx];
                skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
365
            }
366
367
368
369
            else
                skipTiles[threadIdx.x] = end;
            skipBase += TILE_SIZE;            
            currentSkipIndex = tbx;
370
        }
371
372
373
374
        while (skipTiles[currentSkipIndex] < pos)
            currentSkipIndex++;
        includeTile = (skipTiles[currentSkipIndex] != pos);
#endif
375
376
        if (includeTile) {
            unsigned int atom1 = x*TILE_SIZE + tgx;
377
            // Load atom data for this tile.
378
379
            real4 posq1 = posq[atom1];
            LOAD_ATOM1_PARAMETERS
380
            //const unsigned int localAtomIndex = threadIdx.x;
381
#ifdef USE_CUTOFF
peastman's avatar
peastman committed
382
            unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
383
384
385
386
#else
            unsigned int j = y*TILE_SIZE + tgx;
#endif
            atomIndices[threadIdx.x] = j;
387
#ifdef ENABLE_SHUFFLE
388
            DECLARE_LOCAL_PARAMETERS
389
390
391
392
393
394
            real4 shflPosq;
            real3 shflForce;
            shflForce.x = 0.0f;
            shflForce.y = 0.0f;
            shflForce.z = 0.0f;
#endif
395
            if (j < PADDED_NUM_ATOMS) {
396
                // Load position of atom j from from global memory
397
398
#ifdef ENABLE_SHUFFLE
                shflPosq = posq[j];
399
#else
400
401
402
403
404
405
406
407
                localData[threadIdx.x].x = posq[j].x;
                localData[threadIdx.x].y = posq[j].y;
                localData[threadIdx.x].z = posq[j].z;
                localData[threadIdx.x].q = posq[j].w;
                localData[threadIdx.x].fx = 0.0f;
                localData[threadIdx.x].fy = 0.0f;
                localData[threadIdx.x].fz = 0.0f;
#endif                
408
                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
409
            }
410
411
412
413
414
415
416
417
418
            else {
#ifdef ENABLE_SHUFFLE
                shflPosq = make_real4(0, 0, 0, 0);
#else
                localData[threadIdx.x].x = 0;
                localData[threadIdx.x].y = 0;
                localData[threadIdx.x].z = 0;
#endif
            }
419
420
421
422
423
#ifdef USE_PERIODIC
            if (singlePeriodicCopy) {
                // The box is small enough that we can just translate all the atoms into a single periodic
                // box, then skip having to apply periodic boundary conditions later.
                real4 blockCenterX = blockCenter[x];
424
                APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
425
#ifdef ENABLE_SHUFFLE
426
                APPLY_PERIODIC_TO_POS_WITH_CENTER(shflPosq, blockCenterX)
427
#else
428
                APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[threadIdx.x], blockCenterX)
429
#endif
430
431
432
                unsigned int tj = tgx;
                for (j = 0; j < TILE_SIZE; j++) {
                    int atom2 = tbx+tj;
433
434
#ifdef ENABLE_SHUFFLE
                    real4 posq2 = shflPosq; 
435
#else
436
437
                    real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
#endif
438
                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
439
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
440
441
442
443
                    real invR = RSQRT(r2);
                    real r = r2*invR;
                    LOAD_ATOM2_PARAMETERS
                    atom2 = atomIndices[tbx+tj];
444
#ifdef USE_SYMMETRIC
445
                    real dEdR = 0.0f;
446
#else
447
448
                    real3 dEdR1 = make_real3(0);
                    real3 dEdR2 = make_real3(0);
449
450
#endif
#ifdef USE_EXCLUSIONS
451
                    bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS);
452
#endif
453
                    real tempEnergy = 0.0f;
454
                    const real interactionScale = 1.0f;
455
456
                    COMPUTE_INTERACTION
                    energy += tempEnergy;
457
#ifdef INCLUDE_FORCES
458
#ifdef USE_SYMMETRIC
459
460
461
462
                    delta *= dEdR;
                    force.x -= delta.x;
                    force.y -= delta.y;
                    force.z -= delta.z;
463
#ifdef ENABLE_SHUFFLE
464
465
466
                    shflForce.x += delta.x;
                    shflForce.y += delta.y;
                    shflForce.z += delta.z;
467

468
#else
469
470
471
                    localData[tbx+tj].fx += delta.x;
                    localData[tbx+tj].fy += delta.y;
                    localData[tbx+tj].fz += delta.z;
472
473
#endif
#else // !USE_SYMMETRIC
474
475
476
                    force.x -= dEdR1.x;
                    force.y -= dEdR1.y;
                    force.z -= dEdR1.z;
477
#ifdef ENABLE_SHUFFLE
478
479
480
                    shflForce.x += dEdR2.x;
                    shflForce.y += dEdR2.y;
                    shflForce.z += dEdR2.z;
481
#else
482
483
484
                    localData[tbx+tj].fx += dEdR2.x;
                    localData[tbx+tj].fy += dEdR2.y;
                    localData[tbx+tj].fz += dEdR2.z;
485
486
#endif 
#endif // end USE_SYMMETRIC
487
#endif
488
#ifdef ENABLE_SHUFFLE
489
                    SHUFFLE_WARP_DATA
490
#endif
491
                    tj = (tj + 1) & (TILE_SIZE - 1);
492
                }
493
494
            }
            else
495
#endif
496
497
498
499
500
            {
                // We need to apply periodic boundary conditions separately for each interaction.
                unsigned int tj = tgx;
                for (j = 0; j < TILE_SIZE; j++) {
                    int atom2 = tbx+tj;
501
502
#ifdef ENABLE_SHUFFLE
                    real4 posq2 = shflPosq;
503
#else
504
505
                    real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
#endif
506
                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
507
#ifdef USE_PERIODIC
508
                    APPLY_PERIODIC_TO_DELTA(delta)
509
#endif
510
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
511
512
513
514
                    real invR = RSQRT(r2);
                    real r = r2*invR;
                    LOAD_ATOM2_PARAMETERS
                    atom2 = atomIndices[tbx+tj];
515
#ifdef USE_SYMMETRIC
516
                    real dEdR = 0.0f;
517
#else
518
519
                    real3 dEdR1 = make_real3(0);
                    real3 dEdR2 = make_real3(0);
520
521
#endif
#ifdef USE_EXCLUSIONS
522
                    bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS);
523
#endif
524
                    real tempEnergy = 0.0f;
525
                    const real interactionScale = 1.0f;
526
527
                    COMPUTE_INTERACTION
                    energy += tempEnergy;
528
#ifdef INCLUDE_FORCES
529
#ifdef USE_SYMMETRIC
530
531
532
533
                    delta *= dEdR;
                    force.x -= delta.x;
                    force.y -= delta.y;
                    force.z -= delta.z;
534
#ifdef ENABLE_SHUFFLE
535
536
537
                    shflForce.x += delta.x;
                    shflForce.y += delta.y;
                    shflForce.z += delta.z;
538

539
#else
540
541
542
                    localData[tbx+tj].fx += delta.x;
                    localData[tbx+tj].fy += delta.y;
                    localData[tbx+tj].fz += delta.z;
543
#endif
544
#else // !USE_SYMMETRIC
545
546
547
                    force.x -= dEdR1.x;
                    force.y -= dEdR1.y;
                    force.z -= dEdR1.z;
548
#ifdef ENABLE_SHUFFLE
549
550
551
                    shflForce.x += dEdR2.x;
                    shflForce.y += dEdR2.y;
                    shflForce.z += dEdR2.z;
552
#else
553
554
555
                    localData[tbx+tj].fx += dEdR2.x;
                    localData[tbx+tj].fy += dEdR2.y;
                    localData[tbx+tj].fz += dEdR2.z;
556
557
#endif 
#endif // end USE_SYMMETRIC
558
#endif
559
#ifdef ENABLE_SHUFFLE
560
                    SHUFFLE_WARP_DATA
561
#endif
562
                    tj = (tj + 1) & (TILE_SIZE - 1);
563
564
                }
            }
565
566

            // Write results.
567
#ifdef INCLUDE_FORCES
568
569
570
571
572
573
574
575
576
            atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
            atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
            atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
#ifdef USE_CUTOFF
            unsigned int atom2 = atomIndices[threadIdx.x];
#else
            unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
            if (atom2 < PADDED_NUM_ATOMS) {
577
578
579
580
#ifdef ENABLE_SHUFFLE
                atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (shflForce.x*0x100000000)));
                atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (shflForce.y*0x100000000)));
                atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (shflForce.z*0x100000000)));
581
#else
582
583
584
585
                atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
                atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
                atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
#endif
586
            }
587
#endif
588
589
        }
        pos++;
590
    }
591
#ifdef INCLUDE_ENERGY
592
    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
593
#endif
594
    SAVE_DERIVATIVES
595
}