kCalculateCDLJObcGbsaForces1.h 33.9 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
/* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
 * -------------------------------------------------------------------------- *
 * This is part of the OpenMM molecular simulation toolkit originating from   *
 * Simbios, the NIH National Center for Physics-Based Simulation of           *
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
 * Portions copyright (c) 2009 Stanford University and the Authors.           *
 * Authors: Scott Le Grand, Peter Eastman                                     *
 * Contributors:                                                              *
 *                                                                            *
13
14
15
16
 * This program is free software: you can redistribute it and/or modify       *
 * it under the terms of the GNU Lesser General Public License as published   *
 * by the Free Software Foundation, either version 3 of the License, or       *
 * (at your option) any later version.                                        *
17
 *                                                                            *
18
19
20
21
 * This program is distributed in the hope that it will be useful,            *
 * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
 * GNU Lesser General Public License for more details.                        *
22
 *                                                                            *
23
24
 * You should have received a copy of the GNU Lesser General Public License   *
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
25
26
27
28
29
30
31
32
 * -------------------------------------------------------------------------- */

/**
 * This file contains the kernel for evalauating nonbonded forces and the first stage of GBSA.
 * It is included several times in kCalculateCDLJObcGbsaForces1.cu with different #defines to generate
 * different versions of the kernels.
 */

33
34
35
/* Cuda compiler on Windows does not recognized "static const float" values */
#define LOCAL_HACK_PI 3.1415926535897932384626433832795

Scott Le Grand's avatar
Scott Le Grand committed
36
37
38
__global__ 
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
39
#elif (__CUDA_ARCH__ >= 120)
Scott Le Grand's avatar
Scott Le Grand committed
40
41
42
43
44
__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
#endif
void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit)
45
46
47
48
{
    extern __shared__ Atom sA[];
    unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
49
    unsigned int numWorkUnits = cSim.pInteractionCount[0];
50
51
    unsigned int pos = warp*numWorkUnits/totalWarps;
    unsigned int end = (warp+1)*numWorkUnits/totalWarps;
52
53
    float CDLJObcGbsa_energy;
    float energy = 0.0f;
54
55
56
#ifdef USE_CUTOFF
    float* tempBuffer = (float*) &sA[cSim.nonbond_threads_per_block];
#endif
57

58
#ifdef USE_EWALD
59
    const float TWO_OVER_SQRT_PI = 2.0f/sqrt(LOCAL_HACK_PI);
60
61
#endif

62
    unsigned int lasty = -0xFFFFFFFF;
63
64
65
66
67
68
69
70
71
72
73
74
75
76
    while (pos < end)
    {

        // Extract cell coordinates from appropriate work unit
        unsigned int x                      = workUnit[pos];
        unsigned int y                      = ((x >> 2) & 0x7fff) << GRIDBITS;
        bool bExclusionFlag                 = (x & 0x1);
        x                                   = (x >> 17) << GRIDBITS;
        unsigned int tgx                    = threadIdx.x & (GRID - 1);
        unsigned int i                      = x + tgx;
        float4 apos                         = cSim.pPosq[i];
        float2 a                            = cSim.pAttr[i];
        float br                            = cSim.pBornRadii[i];
        unsigned int tbx                    = threadIdx.x - tgx;
77
        unsigned int tj                     = tgx;
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
        Atom* psA                           = &sA[tbx];
        float4 af;
        af.x                        = 0.0f;
        af.y                        = 0.0f;
        af.z                        = 0.0f;
        af.w                        = 0.0f;
        if (x == y) // Handle diagonals uniquely at 50% efficiency
        {
            // Read fixed atom data into registers and GRF
            sA[threadIdx.x].x           = apos.x;
            sA[threadIdx.x].y           = apos.y;
            sA[threadIdx.x].z           = apos.z;
            sA[threadIdx.x].q           = apos.w;
            float q2                    = cSim.preFactor * apos.w;
            apos.w                     *= cSim.epsfac;
            sA[threadIdx.x].sig         = a.x;
            sA[threadIdx.x].eps         = a.y;
            sA[threadIdx.x].br          = br;
            if (!bExclusionFlag)
            {
                for (unsigned int j = 0; j < GRID; j++)
                {
                    float dx                = psA[j].x - apos.x;
                    float dy                = psA[j].y - apos.y;
                    float dz                = psA[j].z - apos.z;
#ifdef USE_PERIODIC
104
105
106
                    dx -= floor(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
                    dy -= floor(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
                    dz -= floor(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
107
108
109
110
111
112
113
114
115
116
117
#endif
                    float r2                = dx * dx + dy * dy + dz * dz;

                    // CDLJ part
                    float invR              = 1.0f / sqrt(r2);
                    float sig               = a.x + psA[j].sig;
                    float sig2              = invR * sig;
                    sig2                   *= sig2;
                    float sig6              = sig2 * sig2 * sig2;
                    float eps               = a.y * psA[j].eps;
                    float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
118
119
                    /* E */ 
		    CDLJObcGbsa_energy      = eps * (sig6 - 1.0f) * sig6;
120
#ifdef USE_CUTOFF
121
122
123
    #ifdef USE_EWALD
                    float r                 = sqrt(r2);
                    float alphaR            = cSim.alphaEwald * r;
Peter Eastman's avatar
Peter Eastman committed
124
                    float erfcAlphaR        = fastErfc(alphaR);
125
                    dEdR                   += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
126
		    /* E */
127
                    CDLJObcGbsa_energy     += apos.w * psA[j].q * invR * erfcAlphaR;
128
    #else
129
                    dEdR                   += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
130
131
                    /* E */
                    CDLJObcGbsa_energy     += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
132
    #endif
133
#else
134
                    float factorX           = apos.w * psA[j].q * invR;
135

136
137
138
                    dEdR                   += factorX;
                    /* E */
                    CDLJObcGbsa_energy     += factorX;
139
140
141
142
143
144
145
146
147
148
149
150
#endif
                    dEdR                   *= invR * invR;

                    // ObcGbsaForce1 part
                    float alpha2_ij         = br * psA[j].br;
                    float D_ij              = r2 / (4.0f * alpha2_ij);
                    float expTerm           = exp(-D_ij);
                    float denominator2      = r2 + alpha2_ij * expTerm;
                    float denominator       = sqrt(denominator2);
                    float Gpol              = (q2 * psA[j].q) / (denominator * denominator2);
                    float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
                    dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
151
                    CDLJObcGbsa_energy     += (q2 * psA[j].q) / denominator;
152
153
154
#ifdef USE_CUTOFF
                    if (r2 > cSim.nonbondedCutoffSqr)
                    {
155
156
157
                        dEdR                = 0.0f;
                        dGpol_dalpha2_ij    = 0.0f;
                        CDLJObcGbsa_energy  = 0.0f;
158
                   }
159
#endif
160
161
                    if (i < cSim.atoms)
                    {
162
                        energy             += 0.5f*CDLJObcGbsa_energy;
163
                    }
164
165
166
167
168
169
170
                    // Add Forces
                    dx                     *= dEdR;
                    dy                     *= dEdR;
                    dz                     *= dEdR;
                    af.x                   -= dx;
                    af.y                   -= dy;
                    af.z                   -= dz;
171
                    af.w                   += dGpol_dalpha2_ij * psA[j].br;
172
173
174
175
                }
            }
            else  // bExclusion
            {
176
                unsigned int xi   = x>>GRIDBITS;
177
                unsigned int cell = xi+xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
178
                unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
179
180
181
182
183
184
                for (unsigned int j = 0; j < GRID; j++)
                {
                    float dx                = psA[j].x - apos.x;
                    float dy                = psA[j].y - apos.y;
                    float dz                = psA[j].z - apos.z;
#ifdef USE_PERIODIC
185
186
187
                    dx -= floor(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
                    dy -= floor(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
                    dz -= floor(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
188
189
190
191
192
193
194
195
196
197
198
#endif
                        float r2                = dx * dx + dy * dy + dz * dz;

                    // CDLJ part
                    float invR              = 1.0f / sqrt(r2);
                    float sig               = a.x + psA[j].sig;
                    float sig2              = invR * sig;
                    sig2                   *= sig2;
                    float sig6              = sig2 * sig2 * sig2;
                    float eps               = a.y * psA[j].eps;
                    float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
199
                    CDLJObcGbsa_energy      = eps * (sig6 - 1.0f) * sig6;
200
#ifdef USE_CUTOFF
201
202
203
    #ifdef USE_EWALD
                    float r                 = sqrt(r2);
                    float alphaR            = cSim.alphaEwald * r;
Peter Eastman's avatar
Peter Eastman committed
204
                    float erfcAlphaR        = fastErfc(alphaR);
205
206
                    dEdR                   += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
                    CDLJObcGbsa_energy     += apos.w * psA[j].q * invR * erfcAlphaR;
Peter Eastman's avatar
Peter Eastman committed
207
208
209
210
211
                    bool needCorrection = !(excl & 0x1) && x+tgx != y+j && x+tgx < cSim.atoms && y+j < cSim.atoms;
                    if (needCorrection)
                    {
                        // Subtract off the part of this interaction that was included in the reciprocal space contribution.

212
213
                        dEdR               = -apos.w * psA[j].q * invR * ((1.0f-erfcAlphaR) - alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
                        CDLJObcGbsa_energy = -apos.w * psA[j].q * invR * (1.0f-erfcAlphaR);
Peter Eastman's avatar
Peter Eastman committed
214
                    }
215
    #else
216
                    dEdR                   += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
217
                    CDLJObcGbsa_energy     += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
218
    #endif
219
#else
220
                    float factorX           = apos.w * psA[j].q * invR;
221

222
223
                    dEdR                   += factorX;
                    CDLJObcGbsa_energy     += factorX;
224
225
#endif
                    dEdR                   *= invR * invR;
Peter Eastman's avatar
Peter Eastman committed
226
227
228
#ifdef USE_EWALD
                    if (!(excl & 0x1) && !needCorrection)
#else
229
                    if (!(excl & 0x1))
Peter Eastman's avatar
Peter Eastman committed
230
#endif
231
232
                    {
                        dEdR = 0.0f;
233
		                  CDLJObcGbsa_energy  = 0.0f;
234
235
236
237
238
239
240
241
242
243
244
                    }

                    // ObcGbsaForce1 part
                    float alpha2_ij         = br * psA[j].br;
                    float D_ij              = r2 / (4.0f * alpha2_ij);
                    float expTerm           = exp(-D_ij);
                    float denominator2      = r2 + alpha2_ij * expTerm;
                    float denominator       = sqrt(denominator2);
                    float Gpol              = (q2 * psA[j].q) / (denominator * denominator2);
                    float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
                    dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
245
                    CDLJObcGbsa_energy     += (q2 * psA[j].q) / denominator;
246
247
248
#if defined USE_PERIODIC
                    if (i >= cSim.atoms || x+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
                    {
249
250
251
                        dEdR               = 0.0f;
                        dGpol_dalpha2_ij   = 0.0f;
		                  CDLJObcGbsa_energy = 0.0f;
252
253
254
255
                    }
#elif defined USE_CUTOFF
                    if (r2 > cSim.nonbondedCutoffSqr)
                    {
256
257
258
                        dEdR               = 0.0f;
                        dGpol_dalpha2_ij   = 0.0f;
		                  CDLJObcGbsa_energy = 0.0f;
259
260
                    }
#endif
261
262
263
		    /* E */
                    if (i < cSim.atoms)
                    {
264
                        energy         += 0.5f*CDLJObcGbsa_energy;
265
                    }
266
267
268
269
270
271
272
                    // Add Forces
                    dx                     *= dEdR;
                    dy                     *= dEdR;
                    dz                     *= dEdR;
                    af.x                   -= dx;
                    af.y                   -= dy;
                    af.z                   -= dz;
273
                    af.w                   += dGpol_dalpha2_ij * psA[j].br;
274
275
276
277
278
279
                    excl                  >>= 1;
                }
            }

            // Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
280
            unsigned int offset         = x + tgx + warp*cSim.stride;
Peter Eastman's avatar
Peter Eastman committed
281
282
283
#else
            unsigned int offset         = x + tgx + (x >> GRIDBITS) * cSim.stride;
#endif
284
            float4 of                   = cSim.pForce4[offset];
285
286
287
288
            of.x                       += af.x;
            of.y                       += af.y;
            of.z                       += af.z;
            of.w                       += af.w;
289
            cSim.pForce4[offset]        = of;
Peter Eastman's avatar
Peter Eastman committed
290
            cSim.pBornForce[offset]     = of.w;
291
292
293
294
295
296
        }
        else        // 100% utilization
        {
            // Read fixed atom data into registers and GRF
            if (lasty != y)
            {
297
                unsigned int j              = y + tgx;
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
                float4 temp                 = cSim.pPosq[j];
                float2 temp1                = cSim.pAttr[j];
                sA[threadIdx.x].br          = cSim.pBornRadii[j];
                sA[threadIdx.x].x           = temp.x;
                sA[threadIdx.x].y           = temp.y;
                sA[threadIdx.x].z           = temp.z;
                sA[threadIdx.x].q           = temp.w;
                sA[threadIdx.x].sig         = temp1.x;
                sA[threadIdx.x].eps         = temp1.y;
            }
            sA[threadIdx.x].fx          = 0.0f;
            sA[threadIdx.x].fy          = 0.0f;
            sA[threadIdx.x].fz          = 0.0f;
            sA[threadIdx.x].fb          = 0.0f;
            float q2                    = apos.w * cSim.preFactor;
            apos.w                     *= cSim.epsfac;
            if (!bExclusionFlag)
            {
316
317
318
#ifdef USE_CUTOFF
                unsigned int flags = cSim.pInteractionFlag[pos];
                if (flags == 0)
319
                {
320
321
322
323
324
325
326
                    // No interactions in this block.
                }
                else if (flags == 0xFFFFFFFF)
#endif
                {
                    // Compute all interactions within this block.

327
                    for (unsigned int j = 0; j < GRID; j++)
328
329
330
331
                    {
                        float dx                = psA[tj].x - apos.x;
                        float dy                = psA[tj].y - apos.y;
                        float dz                = psA[tj].z - apos.z;
332
#ifdef USE_PERIODIC
333
334
335
                        dx -= floor(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
                        dy -= floor(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
                        dz -= floor(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
336
#endif
337
                        float r2                = dx * dx + dy * dy + dz * dz;
338

339
340
341
342
343
344
345
346
                        // CDLJ part
                        float invR              = 1.0f / sqrt(r2);
                        float sig               = a.x + psA[tj].sig;
                        float sig2              = invR * sig;
                        sig2                   *= sig2;
                        float sig6              = sig2 * sig2 * sig2;
                        float eps               = a.y * psA[tj].eps;
                        float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
347
                        CDLJObcGbsa_energy      = eps * (sig6 - 1.0f) * sig6;
348
#ifdef USE_CUTOFF
349
350
351
    #ifdef USE_EWALD
                        float r                 = sqrt(r2);
                        float alphaR            = cSim.alphaEwald * r;
Peter Eastman's avatar
Peter Eastman committed
352
                        float erfcAlphaR        = fastErfc(alphaR);
353
354
                        dEdR                   += apos.w * psA[tj].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
                        CDLJObcGbsa_energy     += apos.w * psA[tj].q * invR * erfcAlphaR;
355
    #else
356
                        dEdR                   += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
357
                        CDLJObcGbsa_energy     += apos.w * psA[tj].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
358
    #endif
359
#else
360
                        float factorX           = apos.w * psA[tj].q * invR;
361

362
363
                        dEdR                   += factorX;
                        CDLJObcGbsa_energy     += factorX;
364
#endif
365
                        dEdR                   *= invR * invR;
366

367
368
369
370
371
372
373
374
375
                        // ObcGbsaForce1 part
                        float alpha2_ij         = br * psA[tj].br;
                        float D_ij              = r2 / (4.0f * alpha2_ij);
                        float expTerm           = exp(-D_ij);
                        float denominator2      = r2 + alpha2_ij * expTerm;
                        float denominator       = sqrt(denominator2);
                        float Gpol              = (q2 * psA[tj].q) / (denominator * denominator2);
                        float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
                        dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
376
                        CDLJObcGbsa_energy     += (q2 * psA[tj].q) / denominator;
377
#ifdef USE_CUTOFF
378
379
                        if (r2 > cSim.nonbondedCutoffSqr)
                        {
380
381
382
                            dEdR               = 0.0f;
                            dGpol_dalpha2_ij   = 0.0f;
       			             CDLJObcGbsa_energy = 0.0f;
383
384
                        }
#endif
385
386
387
388
389
			/* E */
                        if (i < cSim.atoms)
                        {
                            energy         += CDLJObcGbsa_energy;
                        }
390
391
392
393
394
395
396
                        // Add forces
                        dx                     *= dEdR;
                        dy                     *= dEdR;
                        dz                     *= dEdR;
                        af.x                   -= dx;
                        af.y                   -= dy;
                        af.z                   -= dz;
397
                        af.w                   += dGpol_dalpha2_ij * psA[tj].br;
398
399
400
                        psA[tj].fx             += dx;
                        psA[tj].fy             += dy;
                        psA[tj].fz             += dz;
401
                        psA[tj].fb             += dGpol_dalpha2_ij * br;
402
                        tj                      = (tj + 1) & (GRID - 1);
403
                    }
404
405
406
407
408
409
                }
#ifdef USE_CUTOFF
                else
                {
                    // Compute only a subset of the interactions in this block.

410
                    for (unsigned int j = 0; j < GRID; j++)
411
412
413
414
415
416
417
                    {
                        if ((flags&(1<<j)) != 0)
                        {
                            float dx                = psA[j].x - apos.x;
                            float dy                = psA[j].y - apos.y;
                            float dz                = psA[j].z - apos.z;
#ifdef USE_PERIODIC
418
419
420
                            dx -= floor(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
                            dy -= floor(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
                            dz -= floor(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
421
#endif
422
                            float r2                = dx * dx + dy * dy + dz * dz;
423

424
425
426
427
428
429
430
431
                            // CDLJ part
                            float invR              = 1.0f / sqrt(r2);
                            float sig               = a.x + psA[j].sig;
                            float sig2              = invR * sig;
                            sig2                   *= sig2;
                            float sig6              = sig2 * sig2 * sig2;
                            float eps               = a.y * psA[j].eps;
                            float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
432
                            CDLJObcGbsa_energy      = eps * (sig6 - 1.0f) * sig6;
433
#ifdef USE_CUTOFF
434
435
436
    #ifdef USE_EWALD
                            float r                 = sqrt(r2);
                            float alphaR            = cSim.alphaEwald * r;
Peter Eastman's avatar
Peter Eastman committed
437
                            float erfcAlphaR        = fastErfc(alphaR);
438
439
                            dEdR                   += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
                            CDLJObcGbsa_energy     += apos.w * psA[j].q * invR * erfcAlphaR;
440
    #else
441
                            dEdR                   += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
442
                            CDLJObcGbsa_energy     += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
443
    #endif
444
#else
445
                            float factorX           = apos.w * psA[j].q * invR;
446

447
448
                            dEdR                   += factorX;
                            CDLJObcGbsa_energy     += factorX;
449
450
451
452
453
454
455
456
457
458
459
460
#endif
                            dEdR                   *= invR * invR;

                            // ObcGbsaForce1 part
                            float alpha2_ij         = br * psA[j].br;
                            float D_ij              = r2 / (4.0f * alpha2_ij);
                            float expTerm           = exp(-D_ij);
                            float denominator2      = r2 + alpha2_ij * expTerm;
                            float denominator       = sqrt(denominator2);
                            float Gpol              = (q2 * psA[j].q) / (denominator * denominator2);
                            float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
                            dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
461
                            CDLJObcGbsa_energy     += (q2 * psA[j].q) / denominator;
462
463
464
465
466

#ifdef USE_CUTOFF
                            if (r2 > cSim.nonbondedCutoffSqr)
                            {
                                dEdR = 0.0f;
467
468
                                dGpol_dalpha2_ij = 0.0f;
				                    CDLJObcGbsa_energy  = 0.0f;
469
470
                            }
#endif
471
472
                            if (i < cSim.atoms)
                            {
473
                                 energy         += CDLJObcGbsa_energy;
474
                            }
475
476
477
478
479
480
481
                            // Add forces
                            dx                     *= dEdR;
                            dy                     *= dEdR;
                            dz                     *= dEdR;
                            af.x                   -= dx;
                            af.y                   -= dy;
                            af.z                   -= dz;
482
483
                            af.w                   += dGpol_dalpha2_ij * psA[j].br;

484
485
486
487
488
489
490
491
492
493
494
                            tempBuffer[threadIdx.x] = dx;
                            if (tgx % 2 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
                            if (tgx % 4 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
                            if (tgx % 8 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
                            if (tgx % 16 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
                            if (tgx == 0)
                                psA[j].fx += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
495

496
497
498
499
500
501
502
503
504
505
506
                            tempBuffer[threadIdx.x] = dy;
                            if (tgx % 2 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
                            if (tgx % 4 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
                            if (tgx % 8 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
                            if (tgx % 16 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
                            if (tgx == 0)
                                psA[j].fy += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
507

508
509
510
511
512
513
514
515
516
517
518
                            tempBuffer[threadIdx.x] = dz;
                            if (tgx % 2 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
                            if (tgx % 4 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
                            if (tgx % 8 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
                            if (tgx % 16 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
                            if (tgx == 0)
                                psA[j].fz += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533

                            // Sum the Born forces.

                            tempBuffer[threadIdx.x] = dGpol_dalpha2_ij * br;
                            if (tgx % 2 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
                            if (tgx % 4 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
                            if (tgx % 8 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
                            if (tgx % 16 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
                            if (tgx == 0)
                                psA[j].fb += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];

534
535
                        }
                    }
536
                }
537
#endif
538
539
540
            }
            else  // bExclusion
            {
541
542
                unsigned int xi   = x>>GRIDBITS;
                unsigned int yi   = y>>GRIDBITS;
543
                unsigned int cell = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
544
545
                unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
                excl              = (excl >> tgx) | (excl << (GRID - tgx));
546
                for (unsigned int j = 0; j < GRID; j++)
547
548
549
550
551
                {
                    float dx                = psA[tj].x - apos.x;
                    float dy                = psA[tj].y - apos.y;
                    float dz                = psA[tj].z - apos.z;
#ifdef USE_PERIODIC
552
553
554
                    dx -= floor(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
                    dy -= floor(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
                    dz -= floor(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
555
556
557
558
559
560
561
562
563
564
565
#endif
                    float r2                = dx * dx + dy * dy + dz * dz;

                    // CDLJ part
                    float invR              = 1.0f / sqrt(r2);
                    float sig               = a.x + psA[tj].sig;
                    float sig2              = invR * sig;
                    sig2                   *= sig2;
                    float sig6              = sig2 * sig2 * sig2;
                    float eps               = a.y * psA[tj].eps;
                    float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
566
		              CDLJObcGbsa_energy      = eps * (sig6 - 1.0f) * sig6;
567
#ifdef USE_CUTOFF
568
569
570
    #ifdef USE_EWALD
                    float r                 = sqrt(r2);
                    float alphaR            = cSim.alphaEwald * r;
Peter Eastman's avatar
Peter Eastman committed
571
                    float erfcAlphaR        = fastErfc(alphaR);
572
573
                    dEdR                   += apos.w * psA[tj].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
                    CDLJObcGbsa_energy     += apos.w * psA[tj].q * invR * erfcAlphaR;
Peter Eastman's avatar
Peter Eastman committed
574
575
576
577
578
                    bool needCorrection = !(excl & 0x1) && x+tgx != y+tj && x+tgx < cSim.atoms && y+tj < cSim.atoms;
                    if (needCorrection)
                    {
                        // Subtract off the part of this interaction that was included in the reciprocal space contribution.

579
580
                        dEdR               = -apos.w * psA[tj].q * invR * ((1.0f-erfcAlphaR) - alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
                        CDLJObcGbsa_energy = -apos.w * psA[tj].q * invR * (1.0f-erfcAlphaR);
Peter Eastman's avatar
Peter Eastman committed
581
                    }
582
    #else
583
                    dEdR                   += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
584
                    CDLJObcGbsa_energy     += apos.w * psA[tj].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
585
    #endif
586
#else
587
588
589
                    float factorX           = apos.w * psA[tj].q * invR;
                    dEdR                   += factorX;
                    CDLJObcGbsa_energy     += factorX;
590
591
#endif
                    dEdR                   *= invR * invR;
Peter Eastman's avatar
Peter Eastman committed
592
593
594
#ifdef USE_EWALD
                    if (!(excl & 0x1) && !needCorrection)
#else
595
                    if (!(excl & 0x1))
Peter Eastman's avatar
Peter Eastman committed
596
#endif
597
598
                    {
                        dEdR = 0.0f;
599
			               CDLJObcGbsa_energy = 0.0f;
600
601
602
603
604
605
606
607
608
609
610
                    }

                    // ObcGbsaForce1 part
                    float alpha2_ij         = br * psA[tj].br;
                    float D_ij              = r2 / (4.0f * alpha2_ij);
                    float expTerm           = exp(-D_ij);
                    float denominator2      = r2 + alpha2_ij * expTerm;
                    float denominator       = sqrt(denominator2);
                    float Gpol              = (q2 * psA[tj].q) / (denominator * denominator2);
                    float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
                    dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
611
		              CDLJObcGbsa_energy     += (q2 * psA[tj].q) / denominator;
612
613
614
#if defined USE_PERIODIC
                    if (i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
                    {
615
616
617
                        dEdR               = 0.0f;
                        dGpol_dalpha2_ij   = 0.0f;
			               CDLJObcGbsa_energy = 0.0f;
618
619
620
621
                    }
#elif defined USE_CUTOFF
                    if (r2 > cSim.nonbondedCutoffSqr)
                    {
622
623
624
                        dEdR               = 0.0f;
                        dGpol_dalpha2_ij   = 0.0f;
			               CDLJObcGbsa_energy = 0.0f;
625
626
                    }
#endif
627
628
629
630
631
		    /* E */
                    if (i < cSim.atoms)
                    {
                        energy             += CDLJObcGbsa_energy;
                    }
632
633
634
635
636
637
638
                    // Add forces
                    dx                     *= dEdR;
                    dy                     *= dEdR;
                    dz                     *= dEdR;
                    af.x                   -= dx;
                    af.y                   -= dy;
                    af.z                   -= dz;
639
                    af.w                   += dGpol_dalpha2_ij * psA[tj].br;
640
641
642
                    psA[tj].fx             += dx;
                    psA[tj].fy             += dy;
                    psA[tj].fz             += dz;
643
                    psA[tj].fb             += dGpol_dalpha2_ij * br;
644
645
646
647
648
649
650
                    excl                  >>= 1;
                    tj                      = (tj + 1) & (GRID - 1);
                }
            }

            // Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
651
            unsigned int offset         = x + tgx + warp*cSim.stride;
Peter Eastman's avatar
Peter Eastman committed
652
653
654
#else
            unsigned int offset         = x + tgx + (y >> GRIDBITS) * cSim.stride;
#endif
655
            float4 of                   = cSim.pForce4[offset];
656
657
658
659
            of.x                       += af.x;
            of.y                       += af.y;
            of.z                       += af.z;
            of.w                       += af.w;
660
            cSim.pForce4[offset]       = of;
661
            cSim.pBornForce[offset]     = of.w;
Peter Eastman's avatar
Peter Eastman committed
662
#ifdef USE_OUTPUT_BUFFER_PER_WARP
663
            offset                      = y + tgx + warp*cSim.stride;
Peter Eastman's avatar
Peter Eastman committed
664
665
666
#else
            offset                      = y + tgx + (x >> GRIDBITS) * cSim.stride;
#endif
667
            of                          = cSim.pForce4[offset];
668
669
670
671
            of.x                       += sA[threadIdx.x].fx;
            of.y                       += sA[threadIdx.x].fy;
            of.z                       += sA[threadIdx.x].fz;
            of.w                       += sA[threadIdx.x].fb;
672
            cSim.pForce4[offset]       = of;
673
            cSim.pBornForce[offset]     = of.w;
674
675
676
677
            lasty = y;
        }
        pos++;
    }
678
    cSim.pEnergy[blockIdx.x*blockDim.x+threadIdx.x] += energy;
679
}