kCalculateCDLJObcGbsaForces1.h 33.9 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
/* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
 * -------------------------------------------------------------------------- *
 * This is part of the OpenMM molecular simulation toolkit originating from   *
 * Simbios, the NIH National Center for Physics-Based Simulation of           *
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
 * Portions copyright (c) 2009 Stanford University and the Authors.           *
 * Authors: Scott Le Grand, Peter Eastman                                     *
 * Contributors:                                                              *
 *                                                                            *
13
14
15
16
 * This program is free software: you can redistribute it and/or modify       *
 * it under the terms of the GNU Lesser General Public License as published   *
 * by the Free Software Foundation, either version 3 of the License, or       *
 * (at your option) any later version.                                        *
17
 *                                                                            *
18
19
20
21
 * This program is distributed in the hope that it will be useful,            *
 * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
 * GNU Lesser General Public License for more details.                        *
22
 *                                                                            *
23
24
 * You should have received a copy of the GNU Lesser General Public License   *
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
25
26
27
28
29
30
31
32
 * -------------------------------------------------------------------------- */

/**
 * This file contains the kernel for evalauating nonbonded forces and the first stage of GBSA.
 * It is included several times in kCalculateCDLJObcGbsaForces1.cu with different #defines to generate
 * different versions of the kernels.
 */

33
34
35
/* Cuda compiler on Windows does not recognized "static const float" values */
#define LOCAL_HACK_PI 3.1415926535897932384626433832795

Scott Le Grand's avatar
Scott Le Grand committed
36
37
38
39
40
41
42
43
44
__global__ 
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
#endif
void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit)
45
46
47
48
{
    extern __shared__ Atom sA[];
    unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
49
    unsigned int numWorkUnits = cSim.pInteractionCount[0];
50
51
    unsigned int pos = warp*numWorkUnits/totalWarps;
    unsigned int end = (warp+1)*numWorkUnits/totalWarps;
52
53
    float CDLJObcGbsa_energy;
    float energy = 0.0f;
54
55
56
#ifdef USE_CUTOFF
    float* tempBuffer = (float*) &sA[cSim.nonbond_threads_per_block];
#endif
57

58
#ifdef USE_EWALD
59
    const float TWO_OVER_SQRT_PI = 2.0f/sqrt(LOCAL_HACK_PI);
60
61
#endif

62
    unsigned int lasty = -0xFFFFFFFF;
63
64
65
66
67
68
69
70
71
72
73
74
75
76
    while (pos < end)
    {

        // Extract cell coordinates from appropriate work unit
        unsigned int x                      = workUnit[pos];
        unsigned int y                      = ((x >> 2) & 0x7fff) << GRIDBITS;
        bool bExclusionFlag                 = (x & 0x1);
        x                                   = (x >> 17) << GRIDBITS;
        unsigned int tgx                    = threadIdx.x & (GRID - 1);
        unsigned int i                      = x + tgx;
        float4 apos                         = cSim.pPosq[i];
        float2 a                            = cSim.pAttr[i];
        float br                            = cSim.pBornRadii[i];
        unsigned int tbx                    = threadIdx.x - tgx;
77
        unsigned int tj                     = tgx;
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
        Atom* psA                           = &sA[tbx];
        float4 af;
        af.x                        = 0.0f;
        af.y                        = 0.0f;
        af.z                        = 0.0f;
        af.w                        = 0.0f;
        if (x == y) // Handle diagonals uniquely at 50% efficiency
        {
            // Read fixed atom data into registers and GRF
            sA[threadIdx.x].x           = apos.x;
            sA[threadIdx.x].y           = apos.y;
            sA[threadIdx.x].z           = apos.z;
            sA[threadIdx.x].q           = apos.w;
            float q2                    = cSim.preFactor * apos.w;
            apos.w                     *= cSim.epsfac;
            sA[threadIdx.x].sig         = a.x;
            sA[threadIdx.x].eps         = a.y;
            sA[threadIdx.x].br          = br;
            if (!bExclusionFlag)
            {
                for (unsigned int j = 0; j < GRID; j++)
                {
                    float dx                = psA[j].x - apos.x;
                    float dy                = psA[j].y - apos.y;
                    float dz                = psA[j].z - apos.z;
#ifdef USE_PERIODIC
                    dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
                    dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
                    dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
                    float r2                = dx * dx + dy * dy + dz * dz;

                    // CDLJ part
                    float invR              = 1.0f / sqrt(r2);
                    float sig               = a.x + psA[j].sig;
                    float sig2              = invR * sig;
                    sig2                   *= sig2;
                    float sig6              = sig2 * sig2 * sig2;
                    float eps               = a.y * psA[j].eps;
                    float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
118
119
                    /* E */ 
		    CDLJObcGbsa_energy      = eps * (sig6 - 1.0f) * sig6;
120
#ifdef USE_CUTOFF
121
122
123
    #ifdef USE_EWALD
                    float r                 = sqrt(r2);
                    float alphaR            = cSim.alphaEwald * r;
Peter Eastman's avatar
Peter Eastman committed
124
                    float erfcAlphaR        = fastErfc(alphaR);
125
                    dEdR                   += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
126
		    /* E */
127
                    CDLJObcGbsa_energy     += apos.w * psA[j].q * invR * erfcAlphaR;
128
    #else
129
                    dEdR                   += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
130
131
                    /* E */
                    CDLJObcGbsa_energy     += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
132
    #endif
133
#else
134
                    float factorX           = apos.w * psA[j].q * invR;
135

136
137
138
                    dEdR                   += factorX;
                    /* E */
                    CDLJObcGbsa_energy     += factorX;
139
140
141
142
143
144
145
146
147
148
149
150
151
#endif
                    dEdR                   *= invR * invR;

                    // ObcGbsaForce1 part
                    float alpha2_ij         = br * psA[j].br;
                    float D_ij              = r2 / (4.0f * alpha2_ij);
                    float expTerm           = exp(-D_ij);
                    float denominator2      = r2 + alpha2_ij * expTerm;
                    float denominator       = sqrt(denominator2);
                    float Gpol              = (q2 * psA[j].q) / (denominator * denominator2);
                    float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
                    af.w                   += dGpol_dalpha2_ij * psA[j].br;
                    dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
152
153
		    /* E */
		    CDLJObcGbsa_energy     += (q2 * psA[j].q) / denominator;
154
155
156
157
#ifdef USE_CUTOFF
                    if (r2 > cSim.nonbondedCutoffSqr)
                    {
                        dEdR = 0.0f;
158
159
160
 			/* E */
			CDLJObcGbsa_energy  = 0.0f;
                   }
161
#endif
162
163
164
		    /* E */
                    if (i < cSim.atoms)
                    {
165
                        energy             += 0.5f*CDLJObcGbsa_energy;
166
                    }
167
168
169
170
171
172
173
174
175
176
177
                    // Add Forces
                    dx                     *= dEdR;
                    dy                     *= dEdR;
                    dz                     *= dEdR;
                    af.x                   -= dx;
                    af.y                   -= dy;
                    af.z                   -= dz;
                }
            }
            else  // bExclusion
            {
178
                unsigned int xi   = x>>GRIDBITS;
179
                unsigned int cell = xi+xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
180
                unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
                for (unsigned int j = 0; j < GRID; j++)
                {
                    float dx                = psA[j].x - apos.x;
                    float dy                = psA[j].y - apos.y;
                    float dz                = psA[j].z - apos.z;
#ifdef USE_PERIODIC
                    dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
                    dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
                    dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
                        float r2                = dx * dx + dy * dy + dz * dz;

                    // CDLJ part
                    float invR              = 1.0f / sqrt(r2);
                    float sig               = a.x + psA[j].sig;
                    float sig2              = invR * sig;
                    sig2                   *= sig2;
                    float sig6              = sig2 * sig2 * sig2;
                    float eps               = a.y * psA[j].eps;
                    float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
201
202
                    /* E */ 
		    CDLJObcGbsa_energy      = eps * (sig6 - 1.0f) * sig6;
203
#ifdef USE_CUTOFF
204
205
206
    #ifdef USE_EWALD
                    float r                 = sqrt(r2);
                    float alphaR            = cSim.alphaEwald * r;
Peter Eastman's avatar
Peter Eastman committed
207
                    float erfcAlphaR        = fastErfc(alphaR);
208
                    dEdR                   += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
209
		    /* E */
210
                    CDLJObcGbsa_energy     += apos.w * psA[j].q * invR * erfcAlphaR;
Peter Eastman's avatar
Peter Eastman committed
211
212
213
214
215
                    bool needCorrection = !(excl & 0x1) && x+tgx != y+j && x+tgx < cSim.atoms && y+j < cSim.atoms;
                    if (needCorrection)
                    {
                        // Subtract off the part of this interaction that was included in the reciprocal space contribution.

216
217
                        dEdR               = -apos.w * psA[j].q * invR * ((1.0f-erfcAlphaR) - alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
                        CDLJObcGbsa_energy = -apos.w * psA[j].q * invR * (1.0f-erfcAlphaR);
Peter Eastman's avatar
Peter Eastman committed
218
                    }
219
    #else
220
                    dEdR                   += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
221
222
                    /* E */
                    CDLJObcGbsa_energy     += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
223
    #endif
224
#else
225
                    float factorX           = apos.w * psA[j].q * invR;
226

227
228
229
                    dEdR                   += factorX;
                    /* E */
                    CDLJObcGbsa_energy     += factorX;
230
231
#endif
                    dEdR                   *= invR * invR;
Peter Eastman's avatar
Peter Eastman committed
232
233
234
#ifdef USE_EWALD
                    if (!(excl & 0x1) && !needCorrection)
#else
235
                    if (!(excl & 0x1))
Peter Eastman's avatar
Peter Eastman committed
236
#endif
237
238
                    {
                        dEdR = 0.0f;
239
240
			/* E */
		        CDLJObcGbsa_energy  = 0.0f;
241
242
243
244
245
246
247
248
249
250
251
252
                    }

                    // ObcGbsaForce1 part
                    float alpha2_ij         = br * psA[j].br;
                    float D_ij              = r2 / (4.0f * alpha2_ij);
                    float expTerm           = exp(-D_ij);
                    float denominator2      = r2 + alpha2_ij * expTerm;
                    float denominator       = sqrt(denominator2);
                    float Gpol              = (q2 * psA[j].q) / (denominator * denominator2);
                    float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
                    af.w                   += dGpol_dalpha2_ij * psA[j].br;
                    dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
253
254
                    /* E */
		    CDLJObcGbsa_energy     += (q2 * psA[j].q) / denominator;
255
256
257
258
#if defined USE_PERIODIC
                    if (i >= cSim.atoms || x+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
                    {
                        dEdR = 0.0f;
259
260
			/* E */
		        CDLJObcGbsa_energy = 0.0f;
261
262
263
264
265
                    }
#elif defined USE_CUTOFF
                    if (r2 > cSim.nonbondedCutoffSqr)
                    {
                        dEdR = 0.0f;
266
267
			/* E */
		        CDLJObcGbsa_energy = 0.0f;
268
269
                    }
#endif
270
271
272
		    /* E */
                    if (i < cSim.atoms)
                    {
273
                        energy         += 0.5f*CDLJObcGbsa_energy;
274
                    }
275
276
277
278
279
280
281
282
283
284
285
286
287
                    // Add Forces
                    dx                     *= dEdR;
                    dy                     *= dEdR;
                    dz                     *= dEdR;
                    af.x                   -= dx;
                    af.y                   -= dy;
                    af.z                   -= dz;
                    excl                  >>= 1;
                }
            }

            // Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
288
            unsigned int offset         = x + tgx + warp*cSim.stride;
Peter Eastman's avatar
Peter Eastman committed
289
290
291
#else
            unsigned int offset         = x + tgx + (x >> GRIDBITS) * cSim.stride;
#endif
292
            float4 of                   = cSim.pForce4[offset];
293
294
295
296
            of.x                       += af.x;
            of.y                       += af.y;
            of.z                       += af.z;
            of.w                       += af.w;
297
            cSim.pForce4[offset]        = of;
Peter Eastman's avatar
Peter Eastman committed
298
            cSim.pBornForce[offset]     = of.w;
299
300
301
302
303
304
        }
        else        // 100% utilization
        {
            // Read fixed atom data into registers and GRF
            if (lasty != y)
            {
305
                unsigned int j              = y + tgx;
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
                float4 temp                 = cSim.pPosq[j];
                float2 temp1                = cSim.pAttr[j];
                sA[threadIdx.x].br          = cSim.pBornRadii[j];
                sA[threadIdx.x].x           = temp.x;
                sA[threadIdx.x].y           = temp.y;
                sA[threadIdx.x].z           = temp.z;
                sA[threadIdx.x].q           = temp.w;
                sA[threadIdx.x].sig         = temp1.x;
                sA[threadIdx.x].eps         = temp1.y;
            }
            sA[threadIdx.x].fx          = 0.0f;
            sA[threadIdx.x].fy          = 0.0f;
            sA[threadIdx.x].fz          = 0.0f;
            sA[threadIdx.x].fb          = 0.0f;
            float q2                    = apos.w * cSim.preFactor;
            apos.w                     *= cSim.epsfac;
            if (!bExclusionFlag)
            {
324
325
326
#ifdef USE_CUTOFF
                unsigned int flags = cSim.pInteractionFlag[pos];
                if (flags == 0)
327
                {
328
329
330
331
332
333
334
                    // No interactions in this block.
                }
                else if (flags == 0xFFFFFFFF)
#endif
                {
                    // Compute all interactions within this block.

335
                    for (unsigned int j = 0; j < GRID; j++)
336
337
338
339
                    {
                        float dx                = psA[tj].x - apos.x;
                        float dy                = psA[tj].y - apos.y;
                        float dz                = psA[tj].z - apos.z;
340
#ifdef USE_PERIODIC
341
342
343
                        dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
                        dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
                        dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
344
#endif
345
                        float r2                = dx * dx + dy * dy + dz * dz;
346

347
348
349
350
351
352
353
354
                        // CDLJ part
                        float invR              = 1.0f / sqrt(r2);
                        float sig               = a.x + psA[tj].sig;
                        float sig2              = invR * sig;
                        sig2                   *= sig2;
                        float sig6              = sig2 * sig2 * sig2;
                        float eps               = a.y * psA[tj].eps;
                        float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
355
356
                        /* E */ 
                        CDLJObcGbsa_energy      = eps * (sig6 - 1.0f) * sig6;
357
#ifdef USE_CUTOFF
358
359
360
    #ifdef USE_EWALD
                        float r                 = sqrt(r2);
                        float alphaR            = cSim.alphaEwald * r;
Peter Eastman's avatar
Peter Eastman committed
361
                        float erfcAlphaR        = fastErfc(alphaR);
362
                        dEdR                   += apos.w * psA[tj].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
363
                        /* E */
364
                        CDLJObcGbsa_energy     += apos.w * psA[tj].q * invR * erfcAlphaR;
365
    #else
366
                        dEdR                   += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
367
368
                        /* E */
                        CDLJObcGbsa_energy     += apos.w * psA[tj].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
369
    #endif
370
#else
371
                        float factorX           = apos.w * psA[tj].q * invR;
372

373
374
375
                        dEdR                   += factorX;
                        /* E */
                        CDLJObcGbsa_energy     += factorX;
376
#endif
377
                        dEdR                   *= invR * invR;
378

379
380
381
382
383
384
385
386
387
388
389
                        // ObcGbsaForce1 part
                        float alpha2_ij         = br * psA[tj].br;
                        float D_ij              = r2 / (4.0f * alpha2_ij);
                        float expTerm           = exp(-D_ij);
                        float denominator2      = r2 + alpha2_ij * expTerm;
                        float denominator       = sqrt(denominator2);
                        float Gpol              = (q2 * psA[tj].q) / (denominator * denominator2);
                        float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
                        af.w                   += dGpol_dalpha2_ij * psA[tj].br;
                        psA[tj].fb             += dGpol_dalpha2_ij * br;
                        dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
390
391
                        /* E */
                        CDLJObcGbsa_energy     += (q2 * psA[tj].q) / denominator;
392
#ifdef USE_CUTOFF
393
394
395
                        if (r2 > cSim.nonbondedCutoffSqr)
                        {
                            dEdR = 0.0f;
396
397
			    /* E */
       			    CDLJObcGbsa_energy = 0.0f;
398
399
                        }
#endif
400
401
402
403
404
			/* E */
                        if (i < cSim.atoms)
                        {
                            energy         += CDLJObcGbsa_energy;
                        }
405
406
407
408
409
410
411
412
413
414
415
                        // Add forces
                        dx                     *= dEdR;
                        dy                     *= dEdR;
                        dz                     *= dEdR;
                        af.x                   -= dx;
                        af.y                   -= dy;
                        af.z                   -= dz;
                        psA[tj].fx             += dx;
                        psA[tj].fy             += dy;
                        psA[tj].fz             += dz;
                        tj                      = (tj + 1) & (GRID - 1);
416
                    }
417
418
419
420
421
422
                }
#ifdef USE_CUTOFF
                else
                {
                    // Compute only a subset of the interactions in this block.

423
                    for (unsigned int j = 0; j < GRID; j++)
424
425
426
427
428
429
430
431
432
433
                    {
                        if ((flags&(1<<j)) != 0)
                        {
                            float dx                = psA[j].x - apos.x;
                            float dy                = psA[j].y - apos.y;
                            float dz                = psA[j].z - apos.z;
#ifdef USE_PERIODIC
                            dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
                            dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
                            dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
434
#endif
435
                            float r2                = dx * dx + dy * dy + dz * dz;
436

437
438
439
440
441
442
443
444
                            // CDLJ part
                            float invR              = 1.0f / sqrt(r2);
                            float sig               = a.x + psA[j].sig;
                            float sig2              = invR * sig;
                            sig2                   *= sig2;
                            float sig6              = sig2 * sig2 * sig2;
                            float eps               = a.y * psA[j].eps;
                            float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
445
446
                            /* E */ 
                            CDLJObcGbsa_energy      = eps * (sig6 - 1.0f) * sig6;
447
#ifdef USE_CUTOFF
448
449
450
    #ifdef USE_EWALD
                            float r                 = sqrt(r2);
                            float alphaR            = cSim.alphaEwald * r;
Peter Eastman's avatar
Peter Eastman committed
451
                            float erfcAlphaR        = fastErfc(alphaR);
452
453
                            dEdR                   += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
                            CDLJObcGbsa_energy     += apos.w * psA[j].q * invR * erfcAlphaR;
454
    #else
455
                            dEdR                   += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
456
457
                            /* E */
                            CDLJObcGbsa_energy     += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
458
    #endif
459
#else
460
                            float factorX           = apos.w * psA[j].q * invR;
461

462
463
464
                            dEdR                   += factorX;
                            /* E */
                            CDLJObcGbsa_energy     += factorX;
465
466
467
468
469
470
471
472
473
474
475
476
477
#endif
                            dEdR                   *= invR * invR;

                            // ObcGbsaForce1 part
                            float alpha2_ij         = br * psA[j].br;
                            float D_ij              = r2 / (4.0f * alpha2_ij);
                            float expTerm           = exp(-D_ij);
                            float denominator2      = r2 + alpha2_ij * expTerm;
                            float denominator       = sqrt(denominator2);
                            float Gpol              = (q2 * psA[j].q) / (denominator * denominator2);
                            float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
                            af.w                   += dGpol_dalpha2_ij * psA[j].br;
                            dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
478
479
                            /* E */
                            CDLJObcGbsa_energy     += (q2 * psA[j].q) / denominator;
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497

                            // Sum the Born forces.

                            tempBuffer[threadIdx.x] = dGpol_dalpha2_ij * br;
                            if (tgx % 2 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
                            if (tgx % 4 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
                            if (tgx % 8 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
                            if (tgx % 16 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
                            if (tgx == 0)
                                psA[j].fb += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
#ifdef USE_CUTOFF
                            if (r2 > cSim.nonbondedCutoffSqr)
                            {
                                dEdR = 0.0f;
498
499
				/* E */
				CDLJObcGbsa_energy  = 0.0f;
500
501
                            }
#endif
502
503
504
505
506
			    /* E */
                            if (i < cSim.atoms)
                            {
            			energy         += CDLJObcGbsa_energy;
                            }
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
                            // Add forces
                            dx                     *= dEdR;
                            dy                     *= dEdR;
                            dz                     *= dEdR;
                            af.x                   -= dx;
                            af.y                   -= dy;
                            af.z                   -= dz;
                            tempBuffer[threadIdx.x] = dx;
                            if (tgx % 2 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
                            if (tgx % 4 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
                            if (tgx % 8 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
                            if (tgx % 16 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
                            if (tgx == 0)
                                psA[j].fx += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
                            tempBuffer[threadIdx.x] = dy;
                            if (tgx % 2 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
                            if (tgx % 4 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
                            if (tgx % 8 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
                            if (tgx % 16 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
                            if (tgx == 0)
                                psA[j].fy += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
                            tempBuffer[threadIdx.x] = dz;
                            if (tgx % 2 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
                            if (tgx % 4 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
                            if (tgx % 8 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
                            if (tgx % 16 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
                            if (tgx == 0)
                                psA[j].fz += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
                        }
                    }
549
                }
550
#endif
551
552
553
            }
            else  // bExclusion
            {
554
555
                unsigned int xi   = x>>GRIDBITS;
                unsigned int yi   = y>>GRIDBITS;
556
                unsigned int cell = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
557
558
                unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
                excl              = (excl >> tgx) | (excl << (GRID - tgx));
559
                for (unsigned int j = 0; j < GRID; j++)
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
                {
                    float dx                = psA[tj].x - apos.x;
                    float dy                = psA[tj].y - apos.y;
                    float dz                = psA[tj].z - apos.z;
#ifdef USE_PERIODIC
                    dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
                    dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
                    dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
                    float r2                = dx * dx + dy * dy + dz * dz;

                    // CDLJ part
                    float invR              = 1.0f / sqrt(r2);
                    float sig               = a.x + psA[tj].sig;
                    float sig2              = invR * sig;
                    sig2                   *= sig2;
                    float sig6              = sig2 * sig2 * sig2;
                    float eps               = a.y * psA[tj].eps;
                    float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
579
580
                    /* E */ 
		    CDLJObcGbsa_energy      = eps * (sig6 - 1.0f) * sig6;
581
#ifdef USE_CUTOFF
582
583
584
    #ifdef USE_EWALD
                    float r                 = sqrt(r2);
                    float alphaR            = cSim.alphaEwald * r;
Peter Eastman's avatar
Peter Eastman committed
585
                    float erfcAlphaR        = fastErfc(alphaR);
586
                    dEdR                   += apos.w * psA[tj].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
587
                    /* E */
588
                    CDLJObcGbsa_energy     += apos.w * psA[tj].q * invR * erfcAlphaR;
Peter Eastman's avatar
Peter Eastman committed
589
590
591
592
593
                    bool needCorrection = !(excl & 0x1) && x+tgx != y+tj && x+tgx < cSim.atoms && y+tj < cSim.atoms;
                    if (needCorrection)
                    {
                        // Subtract off the part of this interaction that was included in the reciprocal space contribution.

594
595
                        dEdR               = -apos.w * psA[tj].q * invR * ((1.0f-erfcAlphaR) - alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
                        CDLJObcGbsa_energy = -apos.w * psA[tj].q * invR * (1.0f-erfcAlphaR);
Peter Eastman's avatar
Peter Eastman committed
596
                    }
597
    #else
598
                    dEdR                   += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
599
600
                    /* E */
                    CDLJObcGbsa_energy     += apos.w * psA[tj].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
601
    #endif
602
#else
603
604
605
606
                    float factorX           = apos.w * psA[tj].q * invR;
                    dEdR                   += factorX;
                    /* E */
                    CDLJObcGbsa_energy     += factorX;
607
608
#endif
                    dEdR                   *= invR * invR;
Peter Eastman's avatar
Peter Eastman committed
609
610
611
#ifdef USE_EWALD
                    if (!(excl & 0x1) && !needCorrection)
#else
612
                    if (!(excl & 0x1))
Peter Eastman's avatar
Peter Eastman committed
613
#endif
614
615
                    {
                        dEdR = 0.0f;
616
617
			/* E */
			CDLJObcGbsa_energy = 0.0f;
618
619
620
621
622
623
624
625
626
627
628
629
630
                    }

                    // ObcGbsaForce1 part
                    float alpha2_ij         = br * psA[tj].br;
                    float D_ij              = r2 / (4.0f * alpha2_ij);
                    float expTerm           = exp(-D_ij);
                    float denominator2      = r2 + alpha2_ij * expTerm;
                    float denominator       = sqrt(denominator2);
                    float Gpol              = (q2 * psA[tj].q) / (denominator * denominator2);
                    float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
                    af.w                   += dGpol_dalpha2_ij * psA[tj].br;
                    psA[tj].fb             += dGpol_dalpha2_ij * br;
                    dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
631
632
		    /* E */
		    CDLJObcGbsa_energy     += (q2 * psA[tj].q) / denominator;
633
634
635
636
#if defined USE_PERIODIC
                    if (i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
                    {
                        dEdR = 0.0f;
637
638
			/* E */
			CDLJObcGbsa_energy = 0.0f;
639
640
641
642
643
                    }
#elif defined USE_CUTOFF
                    if (r2 > cSim.nonbondedCutoffSqr)
                    {
                        dEdR = 0.0f;
644
645
			/* E */
			CDLJObcGbsa_energy = 0.0f;
646
647
                    }
#endif
648
649
650
651
652
		    /* E */
                    if (i < cSim.atoms)
                    {
                        energy             += CDLJObcGbsa_energy;
                    }
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
                    // Add forces
                    dx                     *= dEdR;
                    dy                     *= dEdR;
                    dz                     *= dEdR;
                    af.x                   -= dx;
                    af.y                   -= dy;
                    af.z                   -= dz;
                    psA[tj].fx             += dx;
                    psA[tj].fy             += dy;
                    psA[tj].fz             += dz;
                    excl                  >>= 1;
                    tj                      = (tj + 1) & (GRID - 1);
                }
            }

            // Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
670
            unsigned int offset         = x + tgx + warp*cSim.stride;
Peter Eastman's avatar
Peter Eastman committed
671
672
673
#else
            unsigned int offset         = x + tgx + (y >> GRIDBITS) * cSim.stride;
#endif
674
            float4 of                   = cSim.pForce4[offset];
675
676
677
678
            of.x                       += af.x;
            of.y                       += af.y;
            of.z                       += af.z;
            of.w                       += af.w;
679
            cSim.pForce4[offset]       = of;
680
            cSim.pBornForce[offset]     = of.w;
Peter Eastman's avatar
Peter Eastman committed
681
#ifdef USE_OUTPUT_BUFFER_PER_WARP
682
            offset                      = y + tgx + warp*cSim.stride;
Peter Eastman's avatar
Peter Eastman committed
683
684
685
#else
            offset                      = y + tgx + (x >> GRIDBITS) * cSim.stride;
#endif
686
            of                          = cSim.pForce4[offset];
687
688
689
690
            of.x                       += sA[threadIdx.x].fx;
            of.y                       += sA[threadIdx.x].fy;
            of.z                       += sA[threadIdx.x].fz;
            of.w                       += sA[threadIdx.x].fb;
691
            cSim.pForce4[offset]       = of;
692
            cSim.pBornForce[offset]     = of.w;
693
694
695
696
            lasty = y;
        }
        pos++;
    }
697
    cSim.pEnergy[blockIdx.x*blockDim.x+threadIdx.x] += energy;
698
}