// gbsaObc.cc
// Number of TILE_SIZE-wide thread groups in one work group.  The operands are
// parenthesized so the result is still correct if either macro expands to an
// expression rather than a single literal.
#define WARPS_PER_GROUP ((FORCE_WORK_GROUP_SIZE)/(TILE_SIZE))

// ALIGN is placed on the per-atom structs declared below.  When USE_HIP is
// defined it requests 16-byte alignment; otherwise it expands to nothing.
#if defined(USE_HIP)
    #define ALIGN alignas(16)
#else
    #define ALIGN
#endif

typedef struct ALIGN {
10
11
12
13
14
15
16
17
18
    real x, y, z;
    real q;
    float radius, scaledRadius;
    real bornSum;
} AtomData1;

/**
 * Compute the Born sum.
 */
19
20
21
KERNEL void computeBornSum(
        GLOBAL mm_ulong* RESTRICT global_bornSum,
        GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT charge, GLOBAL const float2* RESTRICT global_params,
22
#ifdef USE_CUTOFF
23
24
25
        GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
        GLOBAL const real4* RESTRICT blockSize, GLOBAL const int* RESTRICT interactingAtoms,
26
27
28
#else
        unsigned int numTiles,
#endif
29
        GLOBAL const int2* RESTRICT exclusionTiles) {
30
31
32
33
34
    const unsigned int totalWarps = GLOBAL_SIZE/TILE_SIZE;
    const unsigned int warp = GLOBAL_ID/TILE_SIZE;
    const unsigned int tgx = LOCAL_ID & (TILE_SIZE-1);
    const unsigned int tbx = LOCAL_ID - tgx;
    LOCAL AtomData1 localData[FORCE_WORK_GROUP_SIZE];
35
36
37
38
39
40

    // First loop: process tiles that contain exclusions.
    
    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
41
        const int2 tileIndices = exclusionTiles[pos];
42
43
        const unsigned int x = tileIndices.x;
        const unsigned int y = tileIndices.y;
44
        real bornSum = 0;
45
46
        unsigned int atom1 = x*TILE_SIZE + tgx;
        real4 posq1 = posq[atom1];
47
        real charge1 = charge[atom1];
48
49
50
51
        float2 params1 = global_params[atom1];
        if (x == y) {
            // This tile is on the diagonal.

52
53
54
55
56
57
            localData[LOCAL_ID].x = posq1.x;
            localData[LOCAL_ID].y = posq1.y;
            localData[LOCAL_ID].z = posq1.z;
            localData[LOCAL_ID].q = charge1;
            localData[LOCAL_ID].radius = params1.x;
            localData[LOCAL_ID].scaledRadius = params1.y;
58
59
            SYNC_WARPS;
            for (unsigned int j = 0; j < TILE_SIZE; j++) {
60
                real3 delta = make_real3(localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z);
61
#ifdef USE_PERIODIC
62
                APPLY_PERIODIC_TO_DELTA(delta)
63
64
65
66
67
68
69
70
#endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
                if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
#endif
                    real invR = RSQRT(r2);
peastman's avatar
peastman committed
71
                    real r = r2*invR;
72
                    float2 params2 = make_float2(localData[tbx+j].radius, localData[tbx+j].scaledRadius);
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
                    real rScaledRadiusJ = r+params2.y;
                    if ((j != tgx) && (params1.x < rScaledRadiusJ)) {
                        real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
                        real u_ij = RECIP(rScaledRadiusJ);
                        real l_ij2 = l_ij*l_ij;
                        real u_ij2 = u_ij*u_ij;
                        real ratio = LOG(u_ij * RECIP(l_ij));
                        bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                         (params2.y*params2.y*invR)*(l_ij2-u_ij2));
                        bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
                    }
                }
                SYNC_WARPS;
            }
        }
        else {
            // This is an off-diagonal tile.

            unsigned int j = y*TILE_SIZE + tgx;
            real4 tempPosq = posq[j];
93
94
95
96
            localData[LOCAL_ID].x = tempPosq.x;
            localData[LOCAL_ID].y = tempPosq.y;
            localData[LOCAL_ID].z = tempPosq.z;
            localData[LOCAL_ID].q = charge[j];
97
            float2 tempParams = global_params[j];
98
99
100
            localData[LOCAL_ID].radius = tempParams.x;
            localData[LOCAL_ID].scaledRadius = tempParams.y;
            localData[LOCAL_ID].bornSum = 0.0f;
101
102
103
104
105
106
            SYNC_WARPS;

            // Compute the full set of interactions in this tile.

            unsigned int tj = tgx;
            for (j = 0; j < TILE_SIZE; j++) {
107
                real3 delta = make_real3(localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z);
108
#ifdef USE_PERIODIC
109
                APPLY_PERIODIC_TO_DELTA(delta)
110
111
112
113
114
115
116
117
#endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
                if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
#endif
                    real invR = RSQRT(r2);
peastman's avatar
peastman committed
118
                    real r = r2*invR;
119
                    float2 params2 = make_float2(localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
                    real rScaledRadiusJ = r+params2.y;
                    if (params1.x < rScaledRadiusJ) {
                        real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
                        real u_ij = RECIP(rScaledRadiusJ);
                        real l_ij2 = l_ij*l_ij;
                        real u_ij2 = u_ij*u_ij;
                        real ratio = LOG(u_ij * RECIP(l_ij));
                        bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                         (params2.y*params2.y*invR)*(l_ij2-u_ij2));
                        bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
                    }
                    real rScaledRadiusI = r+params1.y;
                    if (params2.x < rScaledRadiusI) {
                        real l_ij = RECIP(max((real) params2.x, fabs(r-params1.y)));
                        real u_ij = RECIP(rScaledRadiusI);
                        real l_ij2 = l_ij*l_ij;
                        real u_ij2 = u_ij*u_ij;
                        real ratio = LOG(u_ij * RECIP(l_ij));
                        real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                         (params1.y*params1.y*invR)*(l_ij2-u_ij2));
                        term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ij) : 0);
                        localData[tbx+tj].bornSum += term;
                    }
                }
                tj = (tj + 1) & (TILE_SIZE - 1);
                SYNC_WARPS;
            }
        }

        // Write results.

        unsigned int offset = x*TILE_SIZE + tgx;
152
        ATOMIC_ADD(&global_bornSum[offset], (mm_ulong) realToFixedPoint(bornSum));
153
154
        if (x != y) {
            offset = y*TILE_SIZE + tgx;
155
            ATOMIC_ADD(&global_bornSum[offset], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].bornSum));
156
157
158
159
160
161
162
163
        }
    }

    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
    // of them (no cutoff).

#ifdef USE_CUTOFF
    unsigned int numTiles = interactionCount[0];
164
165
    if (numTiles > maxTiles)
        return; // There wasn't enough memory for the neighbor list.
166
167
    int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : (mm_long)numTiles)/totalWarps);
    int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : (mm_long)numTiles)/totalWarps);
168
#else
169
170
    int pos = (int) (warp*(mm_long)numTiles/totalWarps);
    int end = (int) ((warp+1)*(mm_long)numTiles/totalWarps);
171
172
173
#endif
    int skipBase = 0;
    int currentSkipIndex = tbx;
174
175
176
    LOCAL int atomIndices[FORCE_WORK_GROUP_SIZE];
    LOCAL volatile int skipTiles[FORCE_WORK_GROUP_SIZE];
    skipTiles[LOCAL_ID] = -1;
177
178
179
180
181
182
183

    while (pos < end) {
        real bornSum = 0;
        bool includeTile = true;

        // Extract the coordinates of this tile.
        
184
        int x, y;
185
186
        bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
187
188
189
190
191
192
193
194
195
196
        x = tiles[pos];
        real4 blockSizeX = blockSize[x];
        singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
                              0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
                              0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
#else
        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
            y += (x < y ? -1 : 1);
197
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
198
        }
199

200
        // Skip over tiles that have exclusions, since they were already processed.
201

202
203
        SYNC_WARPS;
        while (skipTiles[tbx+TILE_SIZE-1] < pos) {
204
            SYNC_WARPS;
205
            if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
206
                int2 tile = exclusionTiles[skipBase+tgx];
207
                skipTiles[LOCAL_ID] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
208
            }
209
            else
210
                skipTiles[LOCAL_ID] = end;
211
212
213
            skipBase += TILE_SIZE;            
            currentSkipIndex = tbx;
            SYNC_WARPS;
214
        }
215
216
217
218
        while (skipTiles[currentSkipIndex] < pos)
            currentSkipIndex++;
        includeTile = (skipTiles[currentSkipIndex] != pos);
#endif
219
220
221
222
223
224
        if (includeTile) {
            unsigned int atom1 = x*TILE_SIZE + tgx;

            // Load atom data for this tile.

            real4 posq1 = posq[atom1];
225
            real charge1 = charge[atom1];
226
227
            float2 params1 = global_params[atom1];
#ifdef USE_CUTOFF
peastman's avatar
peastman committed
228
            unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
229
230
231
#else
            unsigned int j = y*TILE_SIZE + tgx;
#endif
232
            atomIndices[LOCAL_ID] = j;
233
234
            if (j < PADDED_NUM_ATOMS) {
                real4 tempPosq = posq[j];
235
236
237
238
                localData[LOCAL_ID].x = tempPosq.x;
                localData[LOCAL_ID].y = tempPosq.y;
                localData[LOCAL_ID].z = tempPosq.z;
                localData[LOCAL_ID].q = charge[j];
239
                float2 tempParams = global_params[j];
240
241
242
                localData[LOCAL_ID].radius = tempParams.x;
                localData[LOCAL_ID].scaledRadius = tempParams.y;
                localData[LOCAL_ID].bornSum = 0.0f;
243
244
245
246
247
248
249
250
            }
            SYNC_WARPS;
#ifdef USE_PERIODIC
            if (singlePeriodicCopy) {
                // The box is small enough that we can just translate all the atoms into a single periodic
                // box, then skip having to apply periodic boundary conditions later.

                real4 blockCenterX = blockCenter[x];
251
                APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
252
                APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[LOCAL_ID], blockCenterX)
253
254
255
                SYNC_WARPS;
                unsigned int tj = tgx;
                for (j = 0; j < TILE_SIZE; j++) {
256
                    real3 delta = make_real3(localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z);
257
258
259
260
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                    int atom2 = atomIndices[tbx+tj];
                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
                        real invR = RSQRT(r2);
peastman's avatar
peastman committed
261
                        real r = r2*invR;
262
                        float2 params2 = make_float2(localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
                        real rScaledRadiusJ = r+params2.y;
                        if (params1.x < rScaledRadiusJ) {
                            real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
                            real u_ij = RECIP(rScaledRadiusJ);
                            real l_ij2 = l_ij*l_ij;
                            real u_ij2 = u_ij*u_ij;
                            real ratio = LOG(u_ij * RECIP(l_ij));
                            bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                             (params2.y*params2.y*invR)*(l_ij2-u_ij2));
                            bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
                        }
                        real rScaledRadiusI = r+params1.y;
                        if (params2.x < rScaledRadiusI) {
                            real l_ij = RECIP(max((real) params2.x, fabs(r-params1.y)));
                            real u_ij = RECIP(rScaledRadiusI);
                            real l_ij2 = l_ij*l_ij;
                            real u_ij2 = u_ij*u_ij;
                            real ratio = LOG(u_ij * RECIP(l_ij));
                            real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                             (params1.y*params1.y*invR)*(l_ij2-u_ij2));
                            term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ij) : 0);
                            localData[tbx+tj].bornSum += term;
                        }
                    }
                    tj = (tj + 1) & (TILE_SIZE - 1);
                    SYNC_WARPS;
                }
            }
            else
#endif
            {
                // We need to apply periodic boundary conditions separately for each interaction.

                unsigned int tj = tgx;
                for (j = 0; j < TILE_SIZE; j++) {
298
                    real3 delta = make_real3(localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z);
299
#ifdef USE_PERIODIC
300
                    APPLY_PERIODIC_TO_DELTA(delta)
301
302
303
304
305
306
307
308
309
#endif
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                    int atom2 = atomIndices[tbx+tj];
#ifdef USE_CUTOFF
                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#endif
                        real invR = RSQRT(r2);
peastman's avatar
peastman committed
310
                        real r = r2*invR;
311
                        float2 params2 = make_float2(localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
                        real rScaledRadiusJ = r+params2.y;
                        if (params1.x < rScaledRadiusJ) {
                            real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
                            real u_ij = RECIP(rScaledRadiusJ);
                            real l_ij2 = l_ij*l_ij;
                            real u_ij2 = u_ij*u_ij;
                            real ratio = LOG(u_ij * RECIP(l_ij));
                            bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                             (params2.y*params2.y*invR)*(l_ij2-u_ij2));
                            bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
                        }
                        real rScaledRadiusI = r+params1.y;
                        if (params2.x < rScaledRadiusI) {
                            real l_ij = RECIP(max((real) params2.x, fabs(r-params1.y)));
                            real u_ij = RECIP(rScaledRadiusI);
                            real l_ij2 = l_ij*l_ij;
                            real u_ij2 = u_ij*u_ij;
                            real ratio = LOG(u_ij * RECIP(l_ij));
                            real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                             (params1.y*params1.y*invR)*(l_ij2-u_ij2));
                            term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ij) : 0);
                            localData[tbx+tj].bornSum += term;
                        }
                    }
                    tj = (tj + 1) & (TILE_SIZE - 1);
                    SYNC_WARPS;
                }
            }

            // Write results.

#ifdef USE_CUTOFF
344
            unsigned int atom2 = atomIndices[LOCAL_ID];
345
346
347
#else
            unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
348
            ATOMIC_ADD(&global_bornSum[atom1], (mm_ulong) realToFixedPoint(bornSum));
349
            if (atom2 < PADDED_NUM_ATOMS)
350
                ATOMIC_ADD(&global_bornSum[atom2], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].bornSum));
351
352
353
354
355
        }
        pos++;
    }
}

356
typedef struct ALIGN {
357
358
359
360
361
362
363
364
365
366
    real x, y, z;
    real q;
    real fx, fy, fz, fw;
    real bornRadius;
} AtomData2;

/**
 * First part of computing the GBSA interaction.
 */

367
368
369
370
KERNEL void computeGBSAForce1(
        GLOBAL mm_ulong* RESTRICT forceBuffers, GLOBAL mm_ulong* RESTRICT global_bornForce,
        GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT charge,
        GLOBAL const real* RESTRICT global_bornRadii, int needEnergy,
371
#ifdef USE_CUTOFF
372
373
374
        GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, 
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
        GLOBAL const real4* RESTRICT blockSize, GLOBAL const int* RESTRICT interactingAtoms,
375
376
377
#else
        unsigned int numTiles,
#endif
378
        GLOBAL const int2* RESTRICT exclusionTiles) {
379
380
381
382
    const unsigned int totalWarps = GLOBAL_SIZE/TILE_SIZE;
    const unsigned int warp = GLOBAL_ID/TILE_SIZE;
    const unsigned int tgx = LOCAL_ID & (TILE_SIZE-1);
    const unsigned int tbx = LOCAL_ID - tgx;
383
    mixed energy = 0;
384
    LOCAL AtomData2 localData[FORCE_WORK_GROUP_SIZE];
385
386
387
388
389
390

    // First loop: process tiles that contain exclusions.
    
    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
391
        const int2 tileIndices = exclusionTiles[pos];
392
393
        const unsigned int x = tileIndices.x;
        const unsigned int y = tileIndices.y;
394
        real4 force = make_real4(0);
395
396
        unsigned int atom1 = x*TILE_SIZE + tgx;
        real4 posq1 = posq[atom1];
397
        real charge1 = charge[atom1];
398
399
400
401
        real bornRadius1 = global_bornRadii[atom1];
        if (x == y) {
            // This tile is on the diagonal.

402
403
404
405
406
            localData[LOCAL_ID].x = posq1.x;
            localData[LOCAL_ID].y = posq1.y;
            localData[LOCAL_ID].z = posq1.z;
            localData[LOCAL_ID].q = charge1;
            localData[LOCAL_ID].bornRadius = bornRadius1;
peastman's avatar
peastman committed
407
            SYNC_WARPS;
408
409
            for (unsigned int j = 0; j < TILE_SIZE; j++) {
                if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
410
                    real3 pos2 = make_real3(localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z);
411
                    real charge2 = localData[tbx+j].q;
412
                    real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
413
#ifdef USE_PERIODIC
414
                    APPLY_PERIODIC_TO_DELTA(delta)
415
416
417
418
419
420
#endif
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                    if (r2 < CUTOFF_SQUARED) {
#endif
                        real invR = RSQRT(r2);
peastman's avatar
peastman committed
421
                        real r = r2*invR;
422
423
424
425
426
427
                        real bornRadius2 = localData[tbx+j].bornRadius;
                        real alpha2_ij = bornRadius1*bornRadius2;
                        real D_ij = r2*RECIP(4.0f*alpha2_ij);
                        real expTerm = EXP(-D_ij);
                        real denominator2 = r2 + alpha2_ij*expTerm;
                        real denominator = SQRT(denominator2);
428
                        real scaledChargeProduct = PREFACTOR*charge1*charge2;
429
                        real tempEnergy = scaledChargeProduct*RECIP(denominator);
430
431
432
433
                        real Gpol = tempEnergy*RECIP(denominator2);
                        real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
                        real dEdR = Gpol*(1.0f - 0.25f*expTerm);
                        force.w += dGpol_dalpha2_ij*bornRadius2;
434
435
436
437
#ifdef USE_CUTOFF
                        if (atom1 != y*TILE_SIZE+j)
                            tempEnergy -= scaledChargeProduct/CUTOFF;
#endif
438
439
                        if (needEnergy)
                            energy += 0.5f*tempEnergy;
440
441
442
443
                        delta *= dEdR;
                        force.x -= delta.x;
                        force.y -= delta.y;
                        force.z -= delta.z;
444
445
446
447
#ifdef USE_CUTOFF
                    }
#endif
                }
peastman's avatar
peastman committed
448
                SYNC_WARPS;
449
450
451
452
453
454
455
            }
        }
        else {
            // This is an off-diagonal tile.

            unsigned int j = y*TILE_SIZE + tgx;
            real4 tempPosq = posq[j];
456
457
458
459
460
461
462
463
464
            localData[LOCAL_ID].x = tempPosq.x;
            localData[LOCAL_ID].y = tempPosq.y;
            localData[LOCAL_ID].z = tempPosq.z;
            localData[LOCAL_ID].q = charge[j];
            localData[LOCAL_ID].bornRadius = global_bornRadii[j];
            localData[LOCAL_ID].fx = 0.0f;
            localData[LOCAL_ID].fy = 0.0f;
            localData[LOCAL_ID].fz = 0.0f;
            localData[LOCAL_ID].fw = 0.0f;
peastman's avatar
peastman committed
465
            SYNC_WARPS;
466
467
468
            unsigned int tj = tgx;
            for (j = 0; j < TILE_SIZE; j++) {
                if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
469
                    real3 pos2 = make_real3(localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z);
470
                    real charge2 = localData[tbx+tj].q;
471
                    real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
472
#ifdef USE_PERIODIC
473
                    APPLY_PERIODIC_TO_DELTA(delta)
474
475
476
477
478
479
#endif
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                    if (r2 < CUTOFF_SQUARED) {
#endif
                        real invR = RSQRT(r2);
peastman's avatar
peastman committed
480
                        real r = r2*invR;
481
482
483
484
485
486
                        real bornRadius2 = localData[tbx+tj].bornRadius;
                        real alpha2_ij = bornRadius1*bornRadius2;
                        real D_ij = r2*RECIP(4.0f*alpha2_ij);
                        real expTerm = EXP(-D_ij);
                        real denominator2 = r2 + alpha2_ij*expTerm;
                        real denominator = SQRT(denominator2);
487
                        real scaledChargeProduct = PREFACTOR*charge1*charge2;
488
                        real tempEnergy = scaledChargeProduct*RECIP(denominator);
489
490
491
492
                        real Gpol = tempEnergy*RECIP(denominator2);
                        real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
                        real dEdR = Gpol*(1.0f - 0.25f*expTerm);
                        force.w += dGpol_dalpha2_ij*bornRadius2;
493
494
495
#ifdef USE_CUTOFF
                        tempEnergy -= scaledChargeProduct/CUTOFF;
#endif
496
497
                        if (needEnergy)
                            energy += tempEnergy;
498
499
500
501
                        delta *= dEdR;
                        force.x -= delta.x;
                        force.y -= delta.y;
                        force.z -= delta.z;
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
                        localData[tbx+tj].fx += delta.x;
                        localData[tbx+tj].fy += delta.y;
                        localData[tbx+tj].fz += delta.z;
                        localData[tbx+tj].fw += dGpol_dalpha2_ij*bornRadius1;
#ifdef USE_CUTOFF
                    }
#endif
                }
                tj = (tj + 1) & (TILE_SIZE - 1);
                SYNC_WARPS;
            }
        }
        
        // Write results.
        
        unsigned int offset = x*TILE_SIZE + tgx;
518
519
520
521
        ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(force.x));
        ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
        ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
        ATOMIC_ADD(&global_bornForce[offset], (mm_ulong) realToFixedPoint(force.w));
522
523
        if (x != y) {
            offset = y*TILE_SIZE + tgx;
524
525
526
527
            ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].fx));
            ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].fy));
            ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].fz));
            ATOMIC_ADD(&global_bornForce[offset], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].fw));
528
529
530
531
532
533
534
535
        }
    }

    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
    // of them (no cutoff).

#ifdef USE_CUTOFF
    unsigned int numTiles = interactionCount[0];
536
537
    if (numTiles > maxTiles)
        return; // There wasn't enough memory for the neighbor list.
538
539
    int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : (mm_long)numTiles)/totalWarps);
    int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : (mm_long)numTiles)/totalWarps);
540
#else
541
542
    int pos = (int) (warp*(mm_long)numTiles/totalWarps);
    int end = (int) ((warp+1)*(mm_long)numTiles/totalWarps);
543
544
545
#endif
    int skipBase = 0;
    int currentSkipIndex = tbx;
546
547
548
    LOCAL int atomIndices[FORCE_WORK_GROUP_SIZE];
    LOCAL volatile int skipTiles[FORCE_WORK_GROUP_SIZE];
    skipTiles[LOCAL_ID] = -1;
549
550

    while (pos < end) {
551
        real4 force = make_real4(0);
552
553
554
555
        bool includeTile = true;

        // Extract the coordinates of this tile.
        
556
        int x, y;
557
558
        bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
559
560
561
562
563
564
565
566
567
568
        x = tiles[pos];
        real4 blockSizeX = blockSize[x];
        singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
                              0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
                              0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
#else
        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
            y += (x < y ? -1 : 1);
569
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
570
        }
571

572
        // Skip over tiles that have exclusions, since they were already processed.
573

574
575
        SYNC_WARPS;
        while (skipTiles[tbx+TILE_SIZE-1] < pos) {
576
            SYNC_WARPS;
577
            if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
578
                int2 tile = exclusionTiles[skipBase+tgx];
579
                skipTiles[LOCAL_ID] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
580
            }
581
            else
582
                skipTiles[LOCAL_ID] = end;
583
584
585
            skipBase += TILE_SIZE;            
            currentSkipIndex = tbx;
            SYNC_WARPS;
586
        }
587
588
589
590
        while (skipTiles[currentSkipIndex] < pos)
            currentSkipIndex++;
        includeTile = (skipTiles[currentSkipIndex] != pos);
#endif
591
592
593
594
595
596
        if (includeTile) {
            unsigned int atom1 = x*TILE_SIZE + tgx;

            // Load atom data for this tile.
            
            real4 posq1 = posq[atom1];
597
            real charge1 = charge[atom1];
598
599
            real bornRadius1 = global_bornRadii[atom1];
#ifdef USE_CUTOFF
peastman's avatar
peastman committed
600
            unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
601
602
603
#else
            unsigned int j = y*TILE_SIZE + tgx;
#endif
604
            atomIndices[LOCAL_ID] = j;
605
606
            if (j < PADDED_NUM_ATOMS) {
                real4 tempPosq = posq[j];
607
608
609
610
611
612
613
614
615
                localData[LOCAL_ID].x = tempPosq.x;
                localData[LOCAL_ID].y = tempPosq.y;
                localData[LOCAL_ID].z = tempPosq.z;
                localData[LOCAL_ID].q = charge[j];
                localData[LOCAL_ID].bornRadius = global_bornRadii[j];
                localData[LOCAL_ID].fx = 0.0f;
                localData[LOCAL_ID].fy = 0.0f;
                localData[LOCAL_ID].fz = 0.0f;
                localData[LOCAL_ID].fw = 0.0f;
616
            }
peastman's avatar
peastman committed
617
            SYNC_WARPS;
618
619
620
621
622
623
#ifdef USE_PERIODIC
            if (singlePeriodicCopy) {
                // The box is small enough that we can just translate all the atoms into a single periodic
                // box, then skip having to apply periodic boundary conditions later.

                real4 blockCenterX = blockCenter[x];
624
                APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
625
                APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[LOCAL_ID], blockCenterX)
626
627
628
629
630
                SYNC_WARPS;
                unsigned int tj = tgx;
                for (j = 0; j < TILE_SIZE; j++) {
                    int atom2 = atomIndices[tbx+tj];
                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
631
                        real3 pos2 = make_real3(localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z);
632
                        real charge2 = localData[tbx+tj].q;
633
                        real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
634
635
636
                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                        if (r2 < CUTOFF_SQUARED) {
                            real invR = RSQRT(r2);
peastman's avatar
peastman committed
637
                            real r = r2*invR;
638
639
640
641
642
643
                            real bornRadius2 = localData[tbx+tj].bornRadius;
                            real alpha2_ij = bornRadius1*bornRadius2;
                            real D_ij = r2*RECIP(4.0f*alpha2_ij);
                            real expTerm = EXP(-D_ij);
                            real denominator2 = r2 + alpha2_ij*expTerm;
                            real denominator = SQRT(denominator2);
644
                            real scaledChargeProduct = PREFACTOR*charge1*charge2;
645
                            real tempEnergy = scaledChargeProduct*RECIP(denominator);
646
647
648
649
                            real Gpol = tempEnergy*RECIP(denominator2);
                            real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
                            real dEdR = Gpol*(1.0f - 0.25f*expTerm);
                            force.w += dGpol_dalpha2_ij*bornRadius2;
650
651
652
#ifdef USE_CUTOFF
                            tempEnergy -= scaledChargeProduct/CUTOFF;
#endif
653
654
                            if (needEnergy)
                                energy += tempEnergy;
655
656
657
658
                            delta *= dEdR;
                            force.x -= delta.x;
                            force.y -= delta.y;
                            force.z -= delta.z;
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
                            localData[tbx+tj].fx += delta.x;
                            localData[tbx+tj].fy += delta.y;
                            localData[tbx+tj].fz += delta.z;
                            localData[tbx+tj].fw += dGpol_dalpha2_ij*bornRadius1;
                        }
                    }
                    tj = (tj + 1) & (TILE_SIZE - 1);
                    SYNC_WARPS;
                }
            }
            else
#endif
            {
                // General path: when USE_PERIODIC is defined, periodic boundary conditions are applied
                // separately to each interaction's delta; otherwise no periodic wrapping is done.

                unsigned int tj = tgx;
                for (j = 0; j < TILE_SIZE; j++) {
                    int atom2 = atomIndices[tbx+tj];
                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
678
                        real3 pos2 = make_real3(localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z);
679
                        real charge2 = localData[tbx+tj].q;
680
                        real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
681
#ifdef USE_PERIODIC
682
                        APPLY_PERIODIC_TO_DELTA(delta)
683
684
685
686
687
688
#endif
                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                        if (r2 < CUTOFF_SQUARED) {
#endif
                            real invR = RSQRT(r2);
peastman's avatar
peastman committed
689
                            real r = r2*invR;
690
691
692
693
694
695
                            real bornRadius2 = localData[tbx+tj].bornRadius;
                            real alpha2_ij = bornRadius1*bornRadius2;
                            real D_ij = r2*RECIP(4.0f*alpha2_ij);
                            real expTerm = EXP(-D_ij);
                            real denominator2 = r2 + alpha2_ij*expTerm;
                            real denominator = SQRT(denominator2);
696
                            real scaledChargeProduct = PREFACTOR*charge1*charge2;
697
                            real tempEnergy = scaledChargeProduct*RECIP(denominator);
698
699
700
701
                            real Gpol = tempEnergy*RECIP(denominator2);
                            real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
                            real dEdR = Gpol*(1.0f - 0.25f*expTerm);
                            force.w += dGpol_dalpha2_ij*bornRadius2;
702
703
704
#ifdef USE_CUTOFF
                            tempEnergy -= scaledChargeProduct/CUTOFF;
#endif
705
706
                            if (needEnergy)
                                energy += tempEnergy;
707
708
709
710
                            delta *= dEdR;
                            force.x -= delta.x;
                            force.y -= delta.y;
                            force.z -= delta.z;
711
712
713
714
715
716
717
718
719
720
721
722
                            localData[tbx+tj].fx += delta.x;
                            localData[tbx+tj].fy += delta.y;
                            localData[tbx+tj].fz += delta.z;
                            localData[tbx+tj].fw += dGpol_dalpha2_ij*bornRadius1;
#ifdef USE_CUTOFF
                        }
#endif
                    }
                    tj = (tj + 1) & (TILE_SIZE - 1);
                    SYNC_WARPS;
                }
            }
723

724
            // Write results.
725

726
#ifdef USE_CUTOFF
727
            unsigned int atom2 = atomIndices[LOCAL_ID];
728
729
730
#else
            unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
731
732
733
734
            ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) realToFixedPoint(force.x));
            ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
            ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
            ATOMIC_ADD(&global_bornForce[atom1], (mm_ulong) realToFixedPoint(force.w));
735
            if (atom2 < PADDED_NUM_ATOMS) {
736
737
738
739
                ATOMIC_ADD(&forceBuffers[atom2], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].fx));
                ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].fy));
                ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].fz));
                ATOMIC_ADD(&global_bornForce[atom2], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].fw));
740
741
742
743
            }
        }
        pos++;
    }
744
    energyBuffer[GLOBAL_ID] += energy;
745
}