gbsaObc_cpu.cc 35.9 KB
Newer Older
1
typedef struct {
2
3
    real x, y, z;
    real q;
4
    float radius, scaledRadius;
5
    real bornSum;
6
} AtomData1;
7
8
9
10

/**
 * Compute the Born sum.
 */
11
12
13
KERNEL void computeBornSum(
        GLOBAL mm_long* RESTRICT global_bornSum,
        GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT charge, GLOBAL const float2* RESTRICT global_params,
14
#ifdef USE_CUTOFF
15
16
17
        GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
        GLOBAL const real4* RESTRICT blockSize, GLOBAL const int* RESTRICT interactingAtoms,
18
#else
19
        unsigned int numTiles,
20
#endif
21
        GLOBAL const int2* exclusionTiles) {
22
    LOCAL AtomData1 localData[TILE_SIZE];
23

24
25
    // First loop: process tiles that contain exclusions.
    
26
27
    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+GROUP_ID*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(GROUP_ID+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
28
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
29
        const int2 tileIndices = exclusionTiles[pos];
30
31
32
33
34
35
36
37
38
39
40
        const unsigned int x = tileIndices.x;
        const unsigned int y = tileIndices.y;

        // Load the data for this tile.

        for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
            unsigned int j = y*TILE_SIZE + localAtomIndex;
            real4 tempPosq = posq[j];
            localData[localAtomIndex].x = tempPosq.x;
            localData[localAtomIndex].y = tempPosq.y;
            localData[localAtomIndex].z = tempPosq.z;
41
            localData[localAtomIndex].q = charge[j];
42
43
44
            float2 tempParams = global_params[j];
            localData[localAtomIndex].radius = tempParams.x;
            localData[localAtomIndex].scaledRadius = tempParams.y;
45
46
47
48
49
50
        }
        if (x == y) {
            // This tile is on the diagonal.

            for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                unsigned int atom1 = x*TILE_SIZE+tgx;
51
                real bornSum = 0;
52
                real4 posq1 = posq[atom1];
53
54
                float2 params1 = global_params[atom1];
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
55
                    real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
56
                    real charge2 = localData[j].q;
57
                    real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
58
#ifdef USE_PERIODIC
59
                    APPLY_PERIODIC_TO_DELTA(delta)
60
#endif
61
                    real r2 = dot(trimTo3(delta), trimTo3(delta));
62
63
64
65
66
#ifdef USE_CUTOFF
                    if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
                    if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
#endif
67
                        real invR = RSQRT(r2);
peastman's avatar
peastman committed
68
                        real r = r2*invR;
69
                        float2 params2 = make_float2(localData[j].radius, localData[j].scaledRadius);
70
                        real rScaledRadiusJ = r+params2.y;
71
                        if ((j != tgx) && (params1.x < rScaledRadiusJ)) {
72
73
74
75
76
                            real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
                            real u_ij = RECIP(rScaledRadiusJ);
                            real l_ij2 = l_ij*l_ij;
                            real u_ij2 = u_ij*u_ij;
                            real ratio = LOG(u_ij * RECIP(l_ij));
77
78
79
                            bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                             (params2.y*params2.y*invR)*(l_ij2-u_ij2));
                            bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
80
81
82
83
84
85
                        }
                    }
                }

                // Write results.

86
                ATOMIC_ADD(&global_bornSum[atom1], realToFixedPoint(bornSum));
87
88
89
90
91
92
            }
        }
        else {
            // This is an off-diagonal tile.

            for (int tgx = 0; tgx < TILE_SIZE; tgx++)
93
                localData[tgx].bornSum = 0;
94
95
            for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                unsigned int atom1 = x*TILE_SIZE+tgx;
96
                real bornSum = 0;
97
                real4 posq1 = posq[atom1];
98
99
                float2 params1 = global_params[atom1];
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
100
                    real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
101
                    real charge2 = localData[j].q;
102
                    real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
103
#ifdef USE_PERIODIC
104
                    APPLY_PERIODIC_TO_DELTA(delta)
105
#endif
106
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
107
108
109
110
111
#ifdef USE_CUTOFF
                    if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
                    if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
#endif
112
                        real invR = RSQRT(r2);
peastman's avatar
peastman committed
113
                        real r = r2*invR;
114
                        float2 params2 = make_float2(localData[j].radius, localData[j].scaledRadius);
115
                        real rScaledRadiusJ = r+params2.y;
116
                        if (params1.x < rScaledRadiusJ) {
117
118
119
120
121
                            real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
                            real u_ij = RECIP(rScaledRadiusJ);
                            real l_ij2 = l_ij*l_ij;
                            real u_ij2 = u_ij*u_ij;
                            real ratio = LOG(u_ij * RECIP(l_ij));
122
123
124
                            bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                             (params2.y*params2.y*invR)*(l_ij2-u_ij2));
                            bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
125
                        }
126
                        real rScaledRadiusI = r+params1.y;
127
                        if (params2.x < rScaledRadiusI) {
128
129
130
131
132
                            real l_ij = RECIP(max((real) params2.x, fabs(r-params1.y)));
                            real u_ij = RECIP(rScaledRadiusI);
                            real l_ij2 = l_ij*l_ij;
                            real u_ij2 = u_ij*u_ij;
                            real ratio = LOG(u_ij * RECIP(l_ij));
133
134
135
                            real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                             (params1.y*params1.y*invR)*(l_ij2-u_ij2));
                            term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ij) : 0);
136
137
138
139
140
141
142
                            localData[j].bornSum += term;
                        }
                    }
                }

               // Write results for atom1.

143
                ATOMIC_ADD(&global_bornSum[atom1], realToFixedPoint(bornSum));
144
145
            }

146
            // Write results.
147

148
            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
149
                unsigned int offset = y*TILE_SIZE + tgx;
150
                ATOMIC_ADD(&global_bornSum[offset], realToFixedPoint(localData[tgx].bornSum));
151
            }
152
153
154
        }
    }

155
156
    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
    // of them (no cutoff).
157

158
159
#ifdef USE_CUTOFF
    unsigned int numTiles = interactionCount[0];
160
161
    if (numTiles > maxTiles)
        return; // There wasn't enough memory for the neighbor list.
162
163
    int pos = (int) (GROUP_ID*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : numTiles)/NUM_GROUPS);
    int end = (int) ((GROUP_ID+1)*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : numTiles)/NUM_GROUPS);
164
#else
165
166
    int pos = (int) (GROUP_ID*(mm_long)numTiles/NUM_GROUPS);
    int end = (int) ((GROUP_ID+1)*(mm_long)numTiles/NUM_GROUPS);
167
#endif
168
169
    int nextToSkip = -1;
    int currentSkipIndex = 0;
170
    LOCAL int atomIndices[TILE_SIZE];
171
172

    while (pos < end) {
173
174
175
176
        bool includeTile = true;

        // Extract the coordinates of this tile.
        
177
        int x, y;
178
        bool singlePeriodicCopy = false;
179
#ifdef USE_CUTOFF
180
181
182
183
184
185
186
187
188
189
        x = tiles[pos];
        real4 blockSizeX = blockSize[x];
        singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
                              0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
                              0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
#else
        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
            y += (x < y ? -1 : 1);
190
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
191
        }
192

193
        // Skip over tiles that have exclusions, since they were already processed.
194

195
196
        while (nextToSkip < pos) {
            if (currentSkipIndex < NUM_TILES_WITH_EXCLUSIONS) {
197
                int2 tile = exclusionTiles[currentSkipIndex++];
198
                nextToSkip = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
199
            }
200
201
            else
                nextToSkip = end;
202
        }
203
204
        includeTile = (nextToSkip != pos);
#endif
205
206
        if (includeTile) {
            // Load the data for this tile.
207
208

            for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
209
#ifdef USE_CUTOFF
peastman's avatar
peastman committed
210
                unsigned int j = interactingAtoms[pos*TILE_SIZE+localAtomIndex];
211
212
213
214
215
216
217
218
219
#else
                unsigned int j = y*TILE_SIZE+localAtomIndex;
#endif
                atomIndices[localAtomIndex] = j;
                if (j < PADDED_NUM_ATOMS) {
                    real4 tempPosq = posq[j];
                    localData[localAtomIndex].x = tempPosq.x;
                    localData[localAtomIndex].y = tempPosq.y;
                    localData[localAtomIndex].z = tempPosq.z;
220
                    localData[localAtomIndex].q = charge[j];
221
222
223
224
225
226
227
228
229
230
231
232
233
                    float2 tempParams = global_params[j];
                    localData[localAtomIndex].radius = tempParams.x;
                    localData[localAtomIndex].scaledRadius = tempParams.y;
                    localData[localAtomIndex].bornSum = 0.0f;
                }
            }
#ifdef USE_PERIODIC
            if (singlePeriodicCopy) {
                // The box is small enough that we can just translate all the atoms into a single periodic
                // box, then skip having to apply periodic boundary conditions later.

                real4 blockCenterX = blockCenter[x];
                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
234
                    APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[tgx], blockCenterX)
235
236
237
238
239
                }
                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                    unsigned int atom1 = x*TILE_SIZE+tgx;
                    real bornSum = 0;
                    real4 posq1 = posq[atom1];
240
                    APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
241
242
                    float2 params1 = global_params[atom1];
                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
243
                        real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
244
                        real charge2 = localData[j].q;
245
                        real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
246
247
248
249
                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                        int atom2 = atomIndices[j];
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
                            real invR = RSQRT(r2);
peastman's avatar
peastman committed
250
                            real r = r2*invR;
251
                            float2 params2 = make_float2(localData[j].radius, localData[j].scaledRadius);
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
                            real rScaledRadiusJ = r+params2.y;
                            if (params1.x < rScaledRadiusJ) {
                                real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
                                real u_ij = RECIP(rScaledRadiusJ);
                                real l_ij2 = l_ij*l_ij;
                                real u_ij2 = u_ij*u_ij;
                                real ratio = LOG(u_ij * RECIP(l_ij));
                                bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                                 (params2.y*params2.y*invR)*(l_ij2-u_ij2));
                                bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
                            }
                            real rScaledRadiusI = r+params1.y;
                            if (params2.x < rScaledRadiusI) {
                                real l_ij = RECIP(max((real) params2.x, fabs(r-params1.y)));
                                real u_ij = RECIP(rScaledRadiusI);
                                real l_ij2 = l_ij*l_ij;
                                real u_ij2 = u_ij*u_ij;
                                real ratio = LOG(u_ij * RECIP(l_ij));
                                real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                                 (params1.y*params1.y*invR)*(l_ij2-u_ij2));
                                term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ij) : 0);
                                localData[j].bornSum += term;
                            }
                        }
                    }

                    // Write results for atom1.

280
                    ATOMIC_ADD(&global_bornSum[atom1], realToFixedPoint(bornSum));
281
282
283
284
285
286
287
288
289
290
291
292
293
                }
            }
            else
#endif
            {
                // We need to apply periodic boundary conditions separately for each interaction.

                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                    unsigned int atom1 = x*TILE_SIZE+tgx;
                    real bornSum = 0;
                    real4 posq1 = posq[atom1];
                    float2 params1 = global_params[atom1];
                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
294
                        real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
295
                        real charge2 = localData[j].q;
296
                        real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
297
#ifdef USE_PERIODIC
298
                        APPLY_PERIODIC_TO_DELTA(delta)
299
300
301
302
303
304
305
306
307
#endif
                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                        int atom2 = atomIndices[j];
#ifdef USE_CUTOFF
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#endif
                            real invR = RSQRT(r2);
peastman's avatar
peastman committed
308
                            real r = r2*invR;
309
                            float2 params2 = make_float2(localData[j].radius, localData[j].scaledRadius);
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
                            real rScaledRadiusJ = r+params2.y;
                            if (params1.x < rScaledRadiusJ) {
                                real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
                                real u_ij = RECIP(rScaledRadiusJ);
                                real l_ij2 = l_ij*l_ij;
                                real u_ij2 = u_ij*u_ij;
                                real ratio = LOG(u_ij * RECIP(l_ij));
                                bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                                 (params2.y*params2.y*invR)*(l_ij2-u_ij2));
                                bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
                            }
                            real rScaledRadiusI = r+params1.y;
                            if (params2.x < rScaledRadiusI) {
                                real l_ij = RECIP(max((real) params2.x, fabs(r-params1.y)));
                                real u_ij = RECIP(rScaledRadiusI);
                                real l_ij2 = l_ij*l_ij;
                                real u_ij2 = u_ij*u_ij;
                                real ratio = LOG(u_ij * RECIP(l_ij));
                                real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                                 (params1.y*params1.y*invR)*(l_ij2-u_ij2));
                                term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ij) : 0);
                                localData[j].bornSum += term;
                            }
                        }
                    }

                    // Write results for atom1.

338
                    ATOMIC_ADD(&global_bornSum[atom1], realToFixedPoint(bornSum));
339
340
341
342
343
344
345
346
347
348
349
350
                }
            }

            // Write results.

            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef USE_CUTOFF
                unsigned int atom2 = atomIndices[tgx];
#else
                unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
                if (atom2 < PADDED_NUM_ATOMS) {
351
                    ATOMIC_ADD(&global_bornSum[atom2], realToFixedPoint(localData[tgx].bornSum));
352
                }
353
354
            }
        }
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
        pos++;
    }
}

typedef struct {
    real x, y, z;
    real q;
    real fx, fy, fz, fw;
    real bornRadius;
} AtomData2;

/**
 * First part of computing the GBSA interaction.
 */

370
371
372
373
KERNEL void computeGBSAForce1(
        GLOBAL mm_long* RESTRICT forceBuffers, GLOBAL mm_long* RESTRICT global_bornForce,
        GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT charge,
        GLOBAL const real* RESTRICT global_bornRadii, int needEnergy,
374
#ifdef USE_CUTOFF
375
376
377
        GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
        GLOBAL const real4* RESTRICT blockSize, GLOBAL const int* RESTRICT interactingAtoms,
378
379
380
#else
        unsigned int numTiles,
#endif
381
        GLOBAL const int2* exclusionTiles) {
382
    mixed energy = 0;
383
    LOCAL AtomData2 localData[TILE_SIZE];
384
385
386

    // First loop: process tiles that contain exclusions.
    
387
388
    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+GROUP_ID*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(GROUP_ID+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
389
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
390
        const int2 tileIndices = exclusionTiles[pos];
391
392
393
394
395
396
397
398
399
400
401
        const unsigned int x = tileIndices.x;
        const unsigned int y = tileIndices.y;

        // Load the data for this tile.

        for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
            unsigned int j = y*TILE_SIZE + localAtomIndex;
            real4 tempPosq = posq[j];
            localData[localAtomIndex].x = tempPosq.x;
            localData[localAtomIndex].y = tempPosq.y;
            localData[localAtomIndex].z = tempPosq.z;
peastman's avatar
peastman committed
402
            localData[localAtomIndex].q = charge[j];
403
404
            localData[localAtomIndex].bornRadius = global_bornRadii[j];
        }
405
406
407
408
409
        if (x == y) {
            // This tile is on the diagonal.

            for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                unsigned int atom1 = x*TILE_SIZE+tgx;
410
                real4 force = make_real4(0);
411
                real4 posq1 = posq[atom1];
412
                real charge1 = charge[atom1];
413
                real bornRadius1 = global_bornRadii[atom1];
414
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
415
                    real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
416
                    real charge2 = localData[j].q;
417
                    real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
418
#ifdef USE_PERIODIC
419
                    APPLY_PERIODIC_TO_DELTA(delta)
420
#endif
421
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
422
423
424
425
426
#ifdef USE_CUTOFF
                    if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
                    if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
#endif
427
                        real invR = RSQRT(r2);
peastman's avatar
peastman committed
428
                        real r = r2*invR;
429
430
431
432
433
434
                        real bornRadius2 = localData[j].bornRadius;
                        real alpha2_ij = bornRadius1*bornRadius2;
                        real D_ij = r2*RECIP(4.0f*alpha2_ij);
                        real expTerm = EXP(-D_ij);
                        real denominator2 = r2 + alpha2_ij*expTerm;
                        real denominator = SQRT(denominator2);
435
                        real scaledChargeProduct = PREFACTOR*charge1*charge2;
436
                        real tempEnergy = scaledChargeProduct*RECIP(denominator);
437
438
439
                        real Gpol = tempEnergy*RECIP(denominator2);
                        real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
                        real dEdR = Gpol*(1.0f - 0.25f*expTerm);
440
                        force.w += dGpol_dalpha2_ij*bornRadius2;
441
442
443
444
#ifdef USE_CUTOFF
                        if (atom1 != y*TILE_SIZE+j)
                            tempEnergy -= scaledChargeProduct/CUTOFF;
#endif
445
                        energy += 0.5f*tempEnergy;
446
447
448
449
                        delta *= dEdR;
                        force.x -= delta.x;
                        force.y -= delta.y;
                        force.z -= delta.z;
450
451
452
453
454
                    }
                }

                // Write results.

455
456
457
458
                ATOMIC_ADD(&forceBuffers[atom1], realToFixedPoint(force.x));
                ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], realToFixedPoint(force.y));
                ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], realToFixedPoint(force.z));
                ATOMIC_ADD(&global_bornForce[atom1], realToFixedPoint(force.w));
459
460
461
462
463
464
            }
        }
        else {
            // This is an off-diagonal tile.

            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
465
466
467
468
                localData[tgx].fx = 0;
                localData[tgx].fy = 0;
                localData[tgx].fz = 0;
                localData[tgx].fw = 0;
469
470
471
            }
            for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                unsigned int atom1 = x*TILE_SIZE+tgx;
472
                real4 force = make_real4(0);
473
                real4 posq1 = posq[atom1];
474
                real charge1 = charge[atom1];
475
                real bornRadius1 = global_bornRadii[atom1];
476
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
477
                    real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
478
                    real charge2 = localData[j].q;
479
                    real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
480
#ifdef USE_PERIODIC
481
                    APPLY_PERIODIC_TO_DELTA(delta)
482
#endif
483
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
484
485
486
487
488
#ifdef USE_CUTOFF
                    if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
                    if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
#endif
489
                        real invR = RSQRT(r2);
peastman's avatar
peastman committed
490
                        real r = r2*invR;
491
492
493
494
495
496
                        real bornRadius2 = localData[j].bornRadius;
                        real alpha2_ij = bornRadius1*bornRadius2;
                        real D_ij = r2*RECIP(4.0f*alpha2_ij);
                        real expTerm = EXP(-D_ij);
                        real denominator2 = r2 + alpha2_ij*expTerm;
                        real denominator = SQRT(denominator2);
497
                        real scaledChargeProduct = PREFACTOR*charge1*charge2;
498
                        real tempEnergy = scaledChargeProduct*RECIP(denominator);
499
500
501
                        real Gpol = tempEnergy*RECIP(denominator2);
                        real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
                        real dEdR = Gpol*(1.0f - 0.25f*expTerm);
502
                        force.w += dGpol_dalpha2_ij*bornRadius2;
503
504
505
#ifdef USE_CUTOFF
                        tempEnergy -= scaledChargeProduct/CUTOFF;
#endif
506
                        energy += tempEnergy;
507
508
509
510
                        delta *= dEdR;
                        force.x -= delta.x;
                        force.y -= delta.y;
                        force.z -= delta.z;
511
512
513
514
515
516
517
                        localData[j].fx += delta.x;
                        localData[j].fy += delta.y;
                        localData[j].fz += delta.z;
                        localData[j].fw += dGpol_dalpha2_ij*bornRadius1;
                    }
                }

518
               // Write results for atom1.
519

520
521
522
523
                ATOMIC_ADD(&forceBuffers[atom1], realToFixedPoint(force.x));
                ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], realToFixedPoint(force.y));
                ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], realToFixedPoint(force.z));
                ATOMIC_ADD(&global_bornForce[atom1], realToFixedPoint(force.w));
524
525
            }

526
            // Write results.
527

528
            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
529
                unsigned int offset = y*TILE_SIZE + tgx;
530
531
532
533
                ATOMIC_ADD(&forceBuffers[offset], realToFixedPoint(localData[tgx].fx));
                ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], realToFixedPoint(localData[tgx].fy));
                ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], realToFixedPoint(localData[tgx].fz));
                ATOMIC_ADD(&global_bornForce[offset], realToFixedPoint(localData[tgx].fw));
534
535
536
537
538
539
540
541
542
            }
        }
    }

    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
    // of them (no cutoff).

#ifdef USE_CUTOFF
    unsigned int numTiles = interactionCount[0];
543
544
    if (numTiles > maxTiles)
        return; // There wasn't enough memory for the neighbor list.
545
546
    int pos = (int) (GROUP_ID*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : numTiles)/NUM_GROUPS);
    int end = (int) ((GROUP_ID+1)*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : numTiles)/NUM_GROUPS);
547
#else
548
549
    int pos = (int) (GROUP_ID*(mm_long)numTiles/NUM_GROUPS);
    int end = (int) ((GROUP_ID+1)*(mm_long)numTiles/NUM_GROUPS);
550
551
552
#endif
    int nextToSkip = -1;
    int currentSkipIndex = 0;
553
    LOCAL int atomIndices[TILE_SIZE];
554
555
556
557
558
559

    while (pos < end) {
        bool includeTile = true;

        // Extract the coordinates of this tile.
        
560
        int x, y;
561
562
        bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
563
564
565
566
567
568
569
570
571
572
        x = tiles[pos];
        real4 blockSizeX = blockSize[x];
        singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
                              0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
                              0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
#else
        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
            y += (x < y ? -1 : 1);
573
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
574
        }
575

576
        // Skip over tiles that have exclusions, since they were already processed.
577

578
579
        while (nextToSkip < pos) {
            if (currentSkipIndex < NUM_TILES_WITH_EXCLUSIONS) {
580
                int2 tile = exclusionTiles[currentSkipIndex++];
581
                nextToSkip = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
582
            }
583
584
            else
                nextToSkip = end;
585
        }
586
587
        includeTile = (nextToSkip != pos);
#endif
588
589
590
591
592
        if (includeTile) {
            // Load the data for this tile.

            for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
#ifdef USE_CUTOFF
peastman's avatar
peastman committed
593
                unsigned int j = interactingAtoms[pos*TILE_SIZE+localAtomIndex];
594
595
596
597
598
599
600
601
602
#else
                unsigned int j = y*TILE_SIZE+localAtomIndex;
#endif
                atomIndices[localAtomIndex] = j;
                if (j < PADDED_NUM_ATOMS) {
                    real4 tempPosq = posq[j];
                    localData[localAtomIndex].x = tempPosq.x;
                    localData[localAtomIndex].y = tempPosq.y;
                    localData[localAtomIndex].z = tempPosq.z;
603
                    localData[localAtomIndex].q = charge[j];
604
605
606
607
608
609
610
611
612
613
614
615
616
617
                    localData[localAtomIndex].bornRadius = global_bornRadii[j];
                    localData[localAtomIndex].fx = 0.0f;
                    localData[localAtomIndex].fy = 0.0f;
                    localData[localAtomIndex].fz = 0.0f;
                    localData[localAtomIndex].fw = 0.0f;
                }
            }
#ifdef USE_PERIODIC
            if (singlePeriodicCopy) {
                // The box is small enough that we can just translate all the atoms into a single periodic
                // box, then skip having to apply periodic boundary conditions later.

                real4 blockCenterX = blockCenter[x];
                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
618
                    APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[tgx], blockCenterX)
619
620
621
                }
                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                    unsigned int atom1 = x*TILE_SIZE+tgx;
622
                    real4 force = make_real4(0);
623
                    real4 posq1 = posq[atom1];
peastman's avatar
peastman committed
624
                    real charge1 = charge[atom1];
625
                    APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
626
627
                    float bornRadius1 = global_bornRadii[atom1];
                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
628
                        real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
629
                        real charge2 = localData[j].q;
630
                        real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
631
632
633
634
                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                        int atom2 = atomIndices[j];
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
                            real invR = RSQRT(r2);
peastman's avatar
peastman committed
635
                            real r = r2*invR;
636
637
638
639
640
641
                            real bornRadius2 = localData[j].bornRadius;
                            real alpha2_ij = bornRadius1*bornRadius2;
                            real D_ij = r2*RECIP(4.0f*alpha2_ij);
                            real expTerm = EXP(-D_ij);
                            real denominator2 = r2 + alpha2_ij*expTerm;
                            real denominator = SQRT(denominator2);
642
                            real scaledChargeProduct = PREFACTOR*charge1*charge2;
643
                            real tempEnergy = scaledChargeProduct*RECIP(denominator);
644
645
646
647
                            real Gpol = tempEnergy*RECIP(denominator2);
                            real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
                            real dEdR = Gpol*(1.0f - 0.25f*expTerm);
                            force.w += dGpol_dalpha2_ij*bornRadius2;
648
649
650
#ifdef USE_CUTOFF
                            tempEnergy -= scaledChargeProduct/CUTOFF;
#endif
651
                            energy += tempEnergy;
652
653
654
655
                            delta *= dEdR;
                            force.x -= delta.x;
                            force.y -= delta.y;
                            force.z -= delta.z;
656
657
658
659
660
661
662
663
664
                            localData[j].fx += delta.x;
                            localData[j].fy += delta.y;
                            localData[j].fz += delta.z;
                            localData[j].fw += dGpol_dalpha2_ij*bornRadius1;
                        }
                    }

                    // Write results for atom1.

665
666
667
668
                    ATOMIC_ADD(&forceBuffers[atom1], realToFixedPoint(force.x));
                    ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], realToFixedPoint(force.y));
                    ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], realToFixedPoint(force.z));
                    ATOMIC_ADD(&global_bornForce[atom1], realToFixedPoint(force.w));
669
670
671
672
673
674
675
676
677
                }
            }
            else
#endif
            {
                // We need to apply periodic boundary conditions separately for each interaction.

                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                    unsigned int atom1 = x*TILE_SIZE+tgx;
678
                    real4 force = make_real4(0);
679
                    real4 posq1 = posq[atom1];
680
                    real charge1 = charge[atom1];
681
682
                    float bornRadius1 = global_bornRadii[atom1];
                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
683
                        real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
684
                        real charge2 = localData[j].q;
685
                        real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
686
#ifdef USE_PERIODIC
687
                        APPLY_PERIODIC_TO_DELTA(delta)
688
689
690
691
692
693
694
695
696
#endif
                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                        int atom2 = atomIndices[j];
#ifdef USE_CUTOFF
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#endif
                            real invR = RSQRT(r2);
peastman's avatar
peastman committed
697
                            real r = r2*invR;
698
699
700
701
702
703
                            real bornRadius2 = localData[j].bornRadius;
                            real alpha2_ij = bornRadius1*bornRadius2;
                            real D_ij = r2*RECIP(4.0f*alpha2_ij);
                            real expTerm = EXP(-D_ij);
                            real denominator2 = r2 + alpha2_ij*expTerm;
                            real denominator = SQRT(denominator2);
704
                            real scaledChargeProduct = PREFACTOR*charge1*charge2;
705
                            real tempEnergy = scaledChargeProduct*RECIP(denominator);
706
707
708
709
                            real Gpol = tempEnergy*RECIP(denominator2);
                            real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
                            real dEdR = Gpol*(1.0f - 0.25f*expTerm);
                            force.w += dGpol_dalpha2_ij*bornRadius2;
710
711
712
#ifdef USE_CUTOFF
                            tempEnergy -= scaledChargeProduct/CUTOFF;
#endif
713
                            energy += tempEnergy;
714
715
716
717
                            delta *= dEdR;
                            force.x -= delta.x;
                            force.y -= delta.y;
                            force.z -= delta.z;
718
719
720
721
722
723
724
725
726
                            localData[j].fx += delta.x;
                            localData[j].fy += delta.y;
                            localData[j].fz += delta.z;
                            localData[j].fw += dGpol_dalpha2_ij*bornRadius1;
                        }
                    }

                    // Write results for atom1.

727
728
729
730
                    ATOMIC_ADD(&forceBuffers[atom1], realToFixedPoint(force.x));
                    ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], realToFixedPoint(force.y));
                    ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], realToFixedPoint(force.z));
                    ATOMIC_ADD(&global_bornForce[atom1], realToFixedPoint(force.w));
731
732
733
734
735
736
737
738
739
740
741
742
                }
            }

            // Write results.

            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef USE_CUTOFF
                unsigned int atom2 = atomIndices[tgx];
#else
                unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
                if (atom2 < PADDED_NUM_ATOMS) {
743
744
745
746
                    ATOMIC_ADD(&forceBuffers[atom2], realToFixedPoint(localData[tgx].fx));
                    ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], realToFixedPoint(localData[tgx].fy));
                    ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], realToFixedPoint(localData[tgx].fz));
                    ATOMIC_ADD(&global_bornForce[atom2], realToFixedPoint(localData[tgx].fw));
747
                }
748
            }
749
750
751
        }
        pos++;
    }
752
    energyBuffer[GLOBAL_ID] += energy;
753
}