gbsaObc_cpu.cc 39.1 KB
Newer Older
1
typedef struct {
2
3
    real x, y, z;
    real q;
4
    float radius, scaledRadius;
5
    real bornSum;
6
} AtomData1;
7
8
9
10

/**
 * Compute the Born sum.
 */
11
KERNEL void computeBornSum(
12
#ifdef SUPPORTS_64_BIT_ATOMICS
13
        GLOBAL mm_long* RESTRICT global_bornSum,
14
#else
15
        GLOBAL real* RESTRICT global_bornSum,
16
#endif
17
        GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT charge, GLOBAL const float2* RESTRICT global_params,
18
#ifdef USE_CUTOFF
19
20
21
        GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
        GLOBAL const real4* RESTRICT blockSize, GLOBAL const int* RESTRICT interactingAtoms,
22
#else
23
        unsigned int numTiles,
24
#endif
25
        GLOBAL const int2* exclusionTiles) {
26
    LOCAL AtomData1 localData[TILE_SIZE];
27

28
29
    // First loop: process tiles that contain exclusions.
    
30
31
    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+GROUP_ID*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(GROUP_ID+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
32
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
33
        const int2 tileIndices = exclusionTiles[pos];
34
35
36
37
38
39
40
41
42
43
44
        const unsigned int x = tileIndices.x;
        const unsigned int y = tileIndices.y;

        // Load the data for this tile.

        for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
            unsigned int j = y*TILE_SIZE + localAtomIndex;
            real4 tempPosq = posq[j];
            localData[localAtomIndex].x = tempPosq.x;
            localData[localAtomIndex].y = tempPosq.y;
            localData[localAtomIndex].z = tempPosq.z;
45
            localData[localAtomIndex].q = charge[j];
46
47
48
            float2 tempParams = global_params[j];
            localData[localAtomIndex].radius = tempParams.x;
            localData[localAtomIndex].scaledRadius = tempParams.y;
49
50
51
52
53
54
        }
        if (x == y) {
            // This tile is on the diagonal.

            for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                unsigned int atom1 = x*TILE_SIZE+tgx;
55
                real bornSum = 0;
56
                real4 posq1 = posq[atom1];
57
58
                float2 params1 = global_params[atom1];
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
59
                    real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
60
                    real charge2 = localData[j].q;
61
                    real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
62
#ifdef USE_PERIODIC
63
                    APPLY_PERIODIC_TO_DELTA(delta)
64
#endif
65
                    real r2 = dot(trimTo3(delta), trimTo3(delta));
66
67
68
69
70
#ifdef USE_CUTOFF
                    if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
                    if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
#endif
71
                        real invR = RSQRT(r2);
peastman's avatar
peastman committed
72
                        real r = r2*invR;
73
                        float2 params2 = make_float2(localData[j].radius, localData[j].scaledRadius);
74
                        real rScaledRadiusJ = r+params2.y;
75
                        if ((j != tgx) && (params1.x < rScaledRadiusJ)) {
76
77
78
79
80
                            real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
                            real u_ij = RECIP(rScaledRadiusJ);
                            real l_ij2 = l_ij*l_ij;
                            real u_ij2 = u_ij*u_ij;
                            real ratio = LOG(u_ij * RECIP(l_ij));
81
82
83
                            bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                             (params2.y*params2.y*invR)*(l_ij2-u_ij2));
                            bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
84
85
86
87
88
89
                        }
                    }
                }

                // Write results.

90
#ifdef SUPPORTS_64_BIT_ATOMICS
91
                ATOMIC_ADD(&global_bornSum[atom1], (mm_long) (bornSum*0x100000000));
92
#else
93
                unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
94
                global_bornSum[offset] += bornSum;
95
#endif
96
97
98
99
100
101
            }
        }
        else {
            // This is an off-diagonal tile.

            for (int tgx = 0; tgx < TILE_SIZE; tgx++)
102
                localData[tgx].bornSum = 0;
103
104
            for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                unsigned int atom1 = x*TILE_SIZE+tgx;
105
                real bornSum = 0;
106
                real4 posq1 = posq[atom1];
107
108
                float2 params1 = global_params[atom1];
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
109
                    real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
110
                    real charge2 = localData[j].q;
111
                    real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
112
#ifdef USE_PERIODIC
113
                    APPLY_PERIODIC_TO_DELTA(delta)
114
#endif
115
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
116
117
118
119
120
#ifdef USE_CUTOFF
                    if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
                    if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
#endif
121
                        real invR = RSQRT(r2);
peastman's avatar
peastman committed
122
                        real r = r2*invR;
123
                        float2 params2 = make_float2(localData[j].radius, localData[j].scaledRadius);
124
                        real rScaledRadiusJ = r+params2.y;
125
                        if (params1.x < rScaledRadiusJ) {
126
127
128
129
130
                            real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
                            real u_ij = RECIP(rScaledRadiusJ);
                            real l_ij2 = l_ij*l_ij;
                            real u_ij2 = u_ij*u_ij;
                            real ratio = LOG(u_ij * RECIP(l_ij));
131
132
133
                            bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                             (params2.y*params2.y*invR)*(l_ij2-u_ij2));
                            bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
134
                        }
135
                        real rScaledRadiusI = r+params1.y;
136
                        if (params2.x < rScaledRadiusI) {
137
138
139
140
141
                            real l_ij = RECIP(max((real) params2.x, fabs(r-params1.y)));
                            real u_ij = RECIP(rScaledRadiusI);
                            real l_ij2 = l_ij*l_ij;
                            real u_ij2 = u_ij*u_ij;
                            real ratio = LOG(u_ij * RECIP(l_ij));
142
143
144
                            real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                             (params1.y*params1.y*invR)*(l_ij2-u_ij2));
                            term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ij) : 0);
145
146
147
148
149
150
151
                            localData[j].bornSum += term;
                        }
                    }
                }

               // Write results for atom1.

152
#ifdef SUPPORTS_64_BIT_ATOMICS
153
                ATOMIC_ADD(&global_bornSum[atom1], (mm_long) (bornSum*0x100000000));
154
#else
155
                unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
156
                global_bornSum[offset] += bornSum;
157
#endif
158
159
            }

160
            // Write results.
161

162
            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
163
164
#ifdef SUPPORTS_64_BIT_ATOMICS
                unsigned int offset = y*TILE_SIZE + tgx;
165
                ATOMIC_ADD(&global_bornSum[offset], (mm_long) (localData[tgx].bornSum*0x100000000));
166
#else
167
                unsigned int offset = y*TILE_SIZE+tgx + GROUP_ID*PADDED_NUM_ATOMS;
168
                global_bornSum[offset] += localData[tgx].bornSum;
169
#endif
170
            }
171
172
173
        }
    }

174
175
    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
    // of them (no cutoff).
176

177
178
#ifdef USE_CUTOFF
    unsigned int numTiles = interactionCount[0];
179
180
    if (numTiles > maxTiles)
        return; // There wasn't enough memory for the neighbor list.
181
182
    int pos = (int) (GROUP_ID*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : numTiles)/NUM_GROUPS);
    int end = (int) ((GROUP_ID+1)*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : numTiles)/NUM_GROUPS);
183
#else
184
185
    int pos = (int) (GROUP_ID*(mm_long)numTiles/NUM_GROUPS);
    int end = (int) ((GROUP_ID+1)*(mm_long)numTiles/NUM_GROUPS);
186
#endif
187
188
    int nextToSkip = -1;
    int currentSkipIndex = 0;
189
    LOCAL int atomIndices[TILE_SIZE];
190
191

    while (pos < end) {
192
193
194
195
        bool includeTile = true;

        // Extract the coordinates of this tile.
        
196
        int x, y;
197
        bool singlePeriodicCopy = false;
198
#ifdef USE_CUTOFF
199
200
201
202
203
204
205
206
207
208
        x = tiles[pos];
        real4 blockSizeX = blockSize[x];
        singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
                              0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
                              0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
#else
        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
            y += (x < y ? -1 : 1);
209
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
210
        }
211

212
        // Skip over tiles that have exclusions, since they were already processed.
213

214
215
        while (nextToSkip < pos) {
            if (currentSkipIndex < NUM_TILES_WITH_EXCLUSIONS) {
216
                int2 tile = exclusionTiles[currentSkipIndex++];
217
                nextToSkip = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
218
            }
219
220
            else
                nextToSkip = end;
221
        }
222
223
        includeTile = (nextToSkip != pos);
#endif
224
225
        if (includeTile) {
            // Load the data for this tile.
226
227

            for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
228
#ifdef USE_CUTOFF
peastman's avatar
peastman committed
229
                unsigned int j = interactingAtoms[pos*TILE_SIZE+localAtomIndex];
230
231
232
233
234
235
236
237
238
#else
                unsigned int j = y*TILE_SIZE+localAtomIndex;
#endif
                atomIndices[localAtomIndex] = j;
                if (j < PADDED_NUM_ATOMS) {
                    real4 tempPosq = posq[j];
                    localData[localAtomIndex].x = tempPosq.x;
                    localData[localAtomIndex].y = tempPosq.y;
                    localData[localAtomIndex].z = tempPosq.z;
239
                    localData[localAtomIndex].q = charge[j];
240
241
242
243
244
245
246
247
248
249
250
251
252
                    float2 tempParams = global_params[j];
                    localData[localAtomIndex].radius = tempParams.x;
                    localData[localAtomIndex].scaledRadius = tempParams.y;
                    localData[localAtomIndex].bornSum = 0.0f;
                }
            }
#ifdef USE_PERIODIC
            if (singlePeriodicCopy) {
                // The box is small enough that we can just translate all the atoms into a single periodic
                // box, then skip having to apply periodic boundary conditions later.

                real4 blockCenterX = blockCenter[x];
                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
253
                    APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[tgx], blockCenterX)
254
255
256
257
258
                }
                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                    unsigned int atom1 = x*TILE_SIZE+tgx;
                    real bornSum = 0;
                    real4 posq1 = posq[atom1];
259
                    APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
260
261
                    float2 params1 = global_params[atom1];
                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
262
                        real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
263
                        real charge2 = localData[j].q;
264
                        real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
265
266
267
268
                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                        int atom2 = atomIndices[j];
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
                            real invR = RSQRT(r2);
peastman's avatar
peastman committed
269
                            real r = r2*invR;
270
                            float2 params2 = make_float2(localData[j].radius, localData[j].scaledRadius);
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
                            real rScaledRadiusJ = r+params2.y;
                            if (params1.x < rScaledRadiusJ) {
                                real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
                                real u_ij = RECIP(rScaledRadiusJ);
                                real l_ij2 = l_ij*l_ij;
                                real u_ij2 = u_ij*u_ij;
                                real ratio = LOG(u_ij * RECIP(l_ij));
                                bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                                 (params2.y*params2.y*invR)*(l_ij2-u_ij2));
                                bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
                            }
                            real rScaledRadiusI = r+params1.y;
                            if (params2.x < rScaledRadiusI) {
                                real l_ij = RECIP(max((real) params2.x, fabs(r-params1.y)));
                                real u_ij = RECIP(rScaledRadiusI);
                                real l_ij2 = l_ij*l_ij;
                                real u_ij2 = u_ij*u_ij;
                                real ratio = LOG(u_ij * RECIP(l_ij));
                                real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                                 (params1.y*params1.y*invR)*(l_ij2-u_ij2));
                                term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ij) : 0);
                                localData[j].bornSum += term;
                            }
                        }
                    }

                    // Write results for atom1.

#ifdef SUPPORTS_64_BIT_ATOMICS
300
                    ATOMIC_ADD(&global_bornSum[atom1], (mm_long) (bornSum*0x100000000));
301
#else
302
                    unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
                    global_bornSum[offset] += bornSum;
#endif
                }
            }
            else
#endif
            {
                // We need to apply periodic boundary conditions separately for each interaction.

                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                    unsigned int atom1 = x*TILE_SIZE+tgx;
                    real bornSum = 0;
                    real4 posq1 = posq[atom1];
                    float2 params1 = global_params[atom1];
                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
318
                        real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
319
                        real charge2 = localData[j].q;
320
                        real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
321
#ifdef USE_PERIODIC
322
                        APPLY_PERIODIC_TO_DELTA(delta)
323
324
325
326
327
328
329
330
331
#endif
                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                        int atom2 = atomIndices[j];
#ifdef USE_CUTOFF
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#endif
                            real invR = RSQRT(r2);
peastman's avatar
peastman committed
332
                            real r = r2*invR;
333
                            float2 params2 = make_float2(localData[j].radius, localData[j].scaledRadius);
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
                            real rScaledRadiusJ = r+params2.y;
                            if (params1.x < rScaledRadiusJ) {
                                real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
                                real u_ij = RECIP(rScaledRadiusJ);
                                real l_ij2 = l_ij*l_ij;
                                real u_ij2 = u_ij*u_ij;
                                real ratio = LOG(u_ij * RECIP(l_ij));
                                bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                                 (params2.y*params2.y*invR)*(l_ij2-u_ij2));
                                bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
                            }
                            real rScaledRadiusI = r+params1.y;
                            if (params2.x < rScaledRadiusI) {
                                real l_ij = RECIP(max((real) params2.x, fabs(r-params1.y)));
                                real u_ij = RECIP(rScaledRadiusI);
                                real l_ij2 = l_ij*l_ij;
                                real u_ij2 = u_ij*u_ij;
                                real ratio = LOG(u_ij * RECIP(l_ij));
                                real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                                 (params1.y*params1.y*invR)*(l_ij2-u_ij2));
                                term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ij) : 0);
                                localData[j].bornSum += term;
                            }
                        }
                    }

                    // Write results for atom1.

#ifdef SUPPORTS_64_BIT_ATOMICS
363
                    ATOMIC_ADD(&global_bornSum[atom1], (mm_long) (bornSum*0x100000000));
364
#else
365
                    unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
                    global_bornSum[offset] += bornSum;
#endif
                }
            }

            // Write results.

            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef USE_CUTOFF
                unsigned int atom2 = atomIndices[tgx];
#else
                unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
                if (atom2 < PADDED_NUM_ATOMS) {
#ifdef SUPPORTS_64_BIT_ATOMICS
381
                    ATOMIC_ADD(&global_bornSum[atom2], (mm_long) (localData[tgx].bornSum*0x100000000));
382
#else
383
                    unsigned int offset = atom2 + GROUP_ID*PADDED_NUM_ATOMS;
384
385
386
                    global_bornSum[offset] += localData[tgx].bornSum;
#endif
                }
387
388
            }
        }
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
        pos++;
    }
}

typedef struct {
    real x, y, z;
    real q;
    real fx, fy, fz, fw;
    real bornRadius;
} AtomData2;

/**
 * First part of computing the GBSA interaction.
 */

404
KERNEL void computeGBSAForce1(
405
#ifdef SUPPORTS_64_BIT_ATOMICS
406
        GLOBAL mm_long* RESTRICT forceBuffers, GLOBAL mm_long* RESTRICT global_bornForce,
407
#else
408
        GLOBAL real4* RESTRICT forceBuffers, GLOBAL real* RESTRICT global_bornForce,
409
#endif
410
411
        GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT charge,
        GLOBAL const real* RESTRICT global_bornRadii, int needEnergy,
412
#ifdef USE_CUTOFF
413
414
415
        GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
        GLOBAL const real4* RESTRICT blockSize, GLOBAL const int* RESTRICT interactingAtoms,
416
417
418
#else
        unsigned int numTiles,
#endif
419
        GLOBAL const int2* exclusionTiles) {
420
    mixed energy = 0;
421
    LOCAL AtomData2 localData[TILE_SIZE];
422
423
424

    // First loop: process tiles that contain exclusions.
    
425
426
    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+GROUP_ID*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(GROUP_ID+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
427
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
428
        const int2 tileIndices = exclusionTiles[pos];
429
430
431
432
433
434
435
436
437
438
439
        const unsigned int x = tileIndices.x;
        const unsigned int y = tileIndices.y;

        // Load the data for this tile.

        for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
            unsigned int j = y*TILE_SIZE + localAtomIndex;
            real4 tempPosq = posq[j];
            localData[localAtomIndex].x = tempPosq.x;
            localData[localAtomIndex].y = tempPosq.y;
            localData[localAtomIndex].z = tempPosq.z;
peastman's avatar
peastman committed
440
            localData[localAtomIndex].q = charge[j];
441
442
            localData[localAtomIndex].bornRadius = global_bornRadii[j];
        }
443
444
445
446
447
        if (x == y) {
            // This tile is on the diagonal.

            for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                unsigned int atom1 = x*TILE_SIZE+tgx;
448
                real4 force = make_real4(0);
449
                real4 posq1 = posq[atom1];
450
                real charge1 = charge[atom1];
451
                real bornRadius1 = global_bornRadii[atom1];
452
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
453
                    real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
454
                    real charge2 = localData[j].q;
455
                    real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
456
#ifdef USE_PERIODIC
457
                    APPLY_PERIODIC_TO_DELTA(delta)
458
#endif
459
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
460
461
462
463
464
#ifdef USE_CUTOFF
                    if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
                    if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
#endif
465
                        real invR = RSQRT(r2);
peastman's avatar
peastman committed
466
                        real r = r2*invR;
467
468
469
470
471
472
                        real bornRadius2 = localData[j].bornRadius;
                        real alpha2_ij = bornRadius1*bornRadius2;
                        real D_ij = r2*RECIP(4.0f*alpha2_ij);
                        real expTerm = EXP(-D_ij);
                        real denominator2 = r2 + alpha2_ij*expTerm;
                        real denominator = SQRT(denominator2);
473
                        real scaledChargeProduct = PREFACTOR*charge1*charge2;
474
                        real tempEnergy = scaledChargeProduct*RECIP(denominator);
475
476
477
                        real Gpol = tempEnergy*RECIP(denominator2);
                        real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
                        real dEdR = Gpol*(1.0f - 0.25f*expTerm);
478
                        force.w += dGpol_dalpha2_ij*bornRadius2;
479
480
481
482
#ifdef USE_CUTOFF
                        if (atom1 != y*TILE_SIZE+j)
                            tempEnergy -= scaledChargeProduct/CUTOFF;
#endif
483
                        energy += 0.5f*tempEnergy;
484
485
486
487
                        delta *= dEdR;
                        force.x -= delta.x;
                        force.y -= delta.y;
                        force.z -= delta.z;
488
489
490
491
492
                    }
                }

                // Write results.

493
#ifdef SUPPORTS_64_BIT_ATOMICS
494
495
496
497
                ATOMIC_ADD(&forceBuffers[atom1], (mm_long) (force.x*0x100000000));
                ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_long) (force.y*0x100000000));
                ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_long) (force.z*0x100000000));
                ATOMIC_ADD(&global_bornForce[atom1], (mm_long) (force.w*0x100000000));
498
#else
499
500
                unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
                forceBuffers[offset] += make_real4(force.x, force.y, force.z, 0);
501
                global_bornForce[offset] += force.w;
502
#endif
503
504
505
506
507
508
            }
        }
        else {
            // This is an off-diagonal tile.

            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
509
510
511
512
                localData[tgx].fx = 0;
                localData[tgx].fy = 0;
                localData[tgx].fz = 0;
                localData[tgx].fw = 0;
513
514
515
            }
            for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                unsigned int atom1 = x*TILE_SIZE+tgx;
516
                real4 force = make_real4(0);
517
                real4 posq1 = posq[atom1];
518
                real charge1 = charge[atom1];
519
                real bornRadius1 = global_bornRadii[atom1];
520
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
521
                    real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
522
                    real charge2 = localData[j].q;
523
                    real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
524
#ifdef USE_PERIODIC
525
                    APPLY_PERIODIC_TO_DELTA(delta)
526
#endif
527
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
528
529
530
531
532
#ifdef USE_CUTOFF
                    if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
                    if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
#endif
533
                        real invR = RSQRT(r2);
peastman's avatar
peastman committed
534
                        real r = r2*invR;
535
536
537
538
539
540
                        real bornRadius2 = localData[j].bornRadius;
                        real alpha2_ij = bornRadius1*bornRadius2;
                        real D_ij = r2*RECIP(4.0f*alpha2_ij);
                        real expTerm = EXP(-D_ij);
                        real denominator2 = r2 + alpha2_ij*expTerm;
                        real denominator = SQRT(denominator2);
541
                        real scaledChargeProduct = PREFACTOR*charge1*charge2;
542
                        real tempEnergy = scaledChargeProduct*RECIP(denominator);
543
544
545
                        real Gpol = tempEnergy*RECIP(denominator2);
                        real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
                        real dEdR = Gpol*(1.0f - 0.25f*expTerm);
546
                        force.w += dGpol_dalpha2_ij*bornRadius2;
547
548
549
#ifdef USE_CUTOFF
                        tempEnergy -= scaledChargeProduct/CUTOFF;
#endif
550
                        energy += tempEnergy;
551
552
553
554
                        delta *= dEdR;
                        force.x -= delta.x;
                        force.y -= delta.y;
                        force.z -= delta.z;
555
556
557
558
559
560
561
                        localData[j].fx += delta.x;
                        localData[j].fy += delta.y;
                        localData[j].fz += delta.z;
                        localData[j].fw += dGpol_dalpha2_ij*bornRadius1;
                    }
                }

562
               // Write results for atom1.
563

564
#ifdef SUPPORTS_64_BIT_ATOMICS
565
566
567
568
                ATOMIC_ADD(&forceBuffers[atom1], (mm_long) (force.x*0x100000000));
                ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_long) (force.y*0x100000000));
                ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_long) (force.z*0x100000000));
                ATOMIC_ADD(&global_bornForce[atom1], (mm_long) (force.w*0x100000000));
569
#else
570
571
                unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
                forceBuffers[offset] += make_real4(force.x, force.y, force.z, 0);
572
                global_bornForce[offset] += force.w;
573
#endif
574
575
            }

576
            // Write results.
577

578
            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
579
580
#ifdef SUPPORTS_64_BIT_ATOMICS
                unsigned int offset = y*TILE_SIZE + tgx;
581
582
583
584
                ATOMIC_ADD(&forceBuffers[offset], (mm_long) (localData[tgx].fx*0x100000000));
                ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_long) (localData[tgx].fy*0x100000000));
                ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_long) (localData[tgx].fz*0x100000000));
                ATOMIC_ADD(&global_bornForce[offset], (mm_long) (localData[tgx].fw*0x100000000));
585
#else
586
                unsigned int offset = y*TILE_SIZE+tgx + GROUP_ID*PADDED_NUM_ATOMS;
587
                real4 f = forceBuffers[offset];
588
589
590
591
592
                f.x += localData[tgx].fx;
                f.y += localData[tgx].fy;
                f.z += localData[tgx].fz;
                forceBuffers[offset] = f;
                global_bornForce[offset] += localData[tgx].fw;
593
594
595
596
597
598
599
600
601
602
#endif
            }
        }
    }

    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
    // of them (no cutoff).

#ifdef USE_CUTOFF
    unsigned int numTiles = interactionCount[0];
603
604
    if (numTiles > maxTiles)
        return; // There wasn't enough memory for the neighbor list.
605
606
    int pos = (int) (GROUP_ID*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : numTiles)/NUM_GROUPS);
    int end = (int) ((GROUP_ID+1)*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : numTiles)/NUM_GROUPS);
607
#else
608
609
    int pos = (int) (GROUP_ID*(mm_long)numTiles/NUM_GROUPS);
    int end = (int) ((GROUP_ID+1)*(mm_long)numTiles/NUM_GROUPS);
610
611
612
#endif
    int nextToSkip = -1;
    int currentSkipIndex = 0;
613
    LOCAL int atomIndices[TILE_SIZE];
614
615
616
617
618
619

    while (pos < end) {
        bool includeTile = true;

        // Extract the coordinates of this tile.
        
620
        int x, y;
621
622
        bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
623
624
625
626
627
628
629
630
631
632
        x = tiles[pos];
        real4 blockSizeX = blockSize[x];
        singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
                              0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
                              0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
#else
        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
            y += (x < y ? -1 : 1);
633
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
634
        }
635

636
        // Skip over tiles that have exclusions, since they were already processed.
637

638
639
        while (nextToSkip < pos) {
            if (currentSkipIndex < NUM_TILES_WITH_EXCLUSIONS) {
640
                int2 tile = exclusionTiles[currentSkipIndex++];
641
                nextToSkip = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
642
            }
643
644
            else
                nextToSkip = end;
645
        }
646
647
        includeTile = (nextToSkip != pos);
#endif
648
649
650
651
652
        if (includeTile) {
            // Load the data for this tile.

            for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
#ifdef USE_CUTOFF
peastman's avatar
peastman committed
653
                unsigned int j = interactingAtoms[pos*TILE_SIZE+localAtomIndex];
654
655
656
657
658
659
660
661
662
#else
                unsigned int j = y*TILE_SIZE+localAtomIndex;
#endif
                atomIndices[localAtomIndex] = j;
                if (j < PADDED_NUM_ATOMS) {
                    real4 tempPosq = posq[j];
                    localData[localAtomIndex].x = tempPosq.x;
                    localData[localAtomIndex].y = tempPosq.y;
                    localData[localAtomIndex].z = tempPosq.z;
663
                    localData[localAtomIndex].q = charge[j];
664
665
666
667
668
669
670
671
672
673
674
675
676
677
                    localData[localAtomIndex].bornRadius = global_bornRadii[j];
                    localData[localAtomIndex].fx = 0.0f;
                    localData[localAtomIndex].fy = 0.0f;
                    localData[localAtomIndex].fz = 0.0f;
                    localData[localAtomIndex].fw = 0.0f;
                }
            }
#ifdef USE_PERIODIC
            if (singlePeriodicCopy) {
                // The box is small enough that we can just translate all the atoms into a single periodic
                // box, then skip having to apply periodic boundary conditions later.

                real4 blockCenterX = blockCenter[x];
                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
678
                    APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[tgx], blockCenterX)
679
680
681
                }
                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                    unsigned int atom1 = x*TILE_SIZE+tgx;
682
                    real4 force = make_real4(0);
683
                    real4 posq1 = posq[atom1];
peastman's avatar
peastman committed
684
                    real charge1 = charge[atom1];
685
                    APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
686
687
                    float bornRadius1 = global_bornRadii[atom1];
                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
688
                        real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
689
                        real charge2 = localData[j].q;
690
                        real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
691
692
693
694
                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                        int atom2 = atomIndices[j];
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
                            real invR = RSQRT(r2);
peastman's avatar
peastman committed
695
                            real r = r2*invR;
696
697
698
699
700
701
                            real bornRadius2 = localData[j].bornRadius;
                            real alpha2_ij = bornRadius1*bornRadius2;
                            real D_ij = r2*RECIP(4.0f*alpha2_ij);
                            real expTerm = EXP(-D_ij);
                            real denominator2 = r2 + alpha2_ij*expTerm;
                            real denominator = SQRT(denominator2);
702
                            real scaledChargeProduct = PREFACTOR*charge1*charge2;
703
                            real tempEnergy = scaledChargeProduct*RECIP(denominator);
704
705
706
707
                            real Gpol = tempEnergy*RECIP(denominator2);
                            real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
                            real dEdR = Gpol*(1.0f - 0.25f*expTerm);
                            force.w += dGpol_dalpha2_ij*bornRadius2;
708
709
710
#ifdef USE_CUTOFF
                            tempEnergy -= scaledChargeProduct/CUTOFF;
#endif
711
                            energy += tempEnergy;
712
713
714
715
                            delta *= dEdR;
                            force.x -= delta.x;
                            force.y -= delta.y;
                            force.z -= delta.z;
716
717
718
719
720
721
722
723
724
725
                            localData[j].fx += delta.x;
                            localData[j].fy += delta.y;
                            localData[j].fz += delta.z;
                            localData[j].fw += dGpol_dalpha2_ij*bornRadius1;
                        }
                    }

                    // Write results for atom1.

#ifdef SUPPORTS_64_BIT_ATOMICS
726
727
728
729
                    ATOMIC_ADD(&forceBuffers[atom1], (mm_long) (force.x*0x100000000));
                    ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_long) (force.y*0x100000000));
                    ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_long) (force.z*0x100000000));
                    ATOMIC_ADD(&global_bornForce[atom1], (mm_long) (force.w*0x100000000));
730
#else
731
732
                    unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
                    forceBuffers[offset] += make_real4(force.x, force.y, force.z, 0);
733
734
735
736
737
738
739
740
741
742
743
                    global_bornForce[offset] += force.w;
#endif
                }
            }
            else
#endif
            {
                // We need to apply periodic boundary conditions separately for each interaction.

                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                    unsigned int atom1 = x*TILE_SIZE+tgx;
744
                    real4 force = make_real4(0);
745
                    real4 posq1 = posq[atom1];
746
                    real charge1 = charge[atom1];
747
748
                    float bornRadius1 = global_bornRadii[atom1];
                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
749
                        real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
750
                        real charge2 = localData[j].q;
751
                        real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
752
#ifdef USE_PERIODIC
753
                        APPLY_PERIODIC_TO_DELTA(delta)
754
755
756
757
758
759
760
761
762
#endif
                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                        int atom2 = atomIndices[j];
#ifdef USE_CUTOFF
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#endif
                            real invR = RSQRT(r2);
peastman's avatar
peastman committed
763
                            real r = r2*invR;
764
765
766
767
768
769
                            real bornRadius2 = localData[j].bornRadius;
                            real alpha2_ij = bornRadius1*bornRadius2;
                            real D_ij = r2*RECIP(4.0f*alpha2_ij);
                            real expTerm = EXP(-D_ij);
                            real denominator2 = r2 + alpha2_ij*expTerm;
                            real denominator = SQRT(denominator2);
770
                            real scaledChargeProduct = PREFACTOR*charge1*charge2;
771
                            real tempEnergy = scaledChargeProduct*RECIP(denominator);
772
773
774
775
                            real Gpol = tempEnergy*RECIP(denominator2);
                            real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
                            real dEdR = Gpol*(1.0f - 0.25f*expTerm);
                            force.w += dGpol_dalpha2_ij*bornRadius2;
776
777
778
#ifdef USE_CUTOFF
                            tempEnergy -= scaledChargeProduct/CUTOFF;
#endif
779
                            energy += tempEnergy;
780
781
782
783
                            delta *= dEdR;
                            force.x -= delta.x;
                            force.y -= delta.y;
                            force.z -= delta.z;
784
785
786
787
788
789
790
791
792
793
                            localData[j].fx += delta.x;
                            localData[j].fy += delta.y;
                            localData[j].fz += delta.z;
                            localData[j].fw += dGpol_dalpha2_ij*bornRadius1;
                        }
                    }

                    // Write results for atom1.

#ifdef SUPPORTS_64_BIT_ATOMICS
794
795
796
797
                    ATOMIC_ADD(&forceBuffers[atom1], (mm_long) (force.x*0x100000000));
                    ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_long) (force.y*0x100000000));
                    ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_long) (force.z*0x100000000));
                    ATOMIC_ADD(&global_bornForce[atom1], (mm_long) (force.w*0x100000000));
798
#else
799
800
                    unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
                    forceBuffers[offset] += make_real4(force.x, force.y, force.z, 0);
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
                    global_bornForce[offset] += force.w;
#endif
                }
            }

            // Write results.

            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef USE_CUTOFF
                unsigned int atom2 = atomIndices[tgx];
#else
                unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
                if (atom2 < PADDED_NUM_ATOMS) {
#ifdef SUPPORTS_64_BIT_ATOMICS
816
817
818
819
                    ATOMIC_ADD(&forceBuffers[atom2], (mm_long) (localData[tgx].fx*0x100000000));
                    ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], (mm_long) (localData[tgx].fy*0x100000000));
                    ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_long) (localData[tgx].fz*0x100000000));
                    ATOMIC_ADD(&global_bornForce[atom2], (mm_long) (localData[tgx].fw*0x100000000));
820
#else
821
                    unsigned int offset = atom2 + GROUP_ID*PADDED_NUM_ATOMS;
822
823
824
825
826
827
828
829
                    real4 f = forceBuffers[offset];
                    f.x += localData[tgx].fx;
                    f.y += localData[tgx].fy;
                    f.z += localData[tgx].fz;
                    forceBuffers[offset] = f;
                    global_bornForce[offset] += localData[tgx].fw;
#endif
                }
830
            }
831
832
833
        }
        pos++;
    }
834
    energyBuffer[GLOBAL_ID] += energy;
835
}