kCalculateCDLJObcGbsaForces1.h 32.4 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
/* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
 * -------------------------------------------------------------------------- *
 * This is part of the OpenMM molecular simulation toolkit originating from   *
 * Simbios, the NIH National Center for Physics-Based Simulation of           *
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
 * Portions copyright (c) 2009 Stanford University and the Authors.           *
 * Authors: Scott Le Grand, Peter Eastman                                     *
 * Contributors:                                                              *
 *                                                                            *
13
14
15
16
 * This program is free software: you can redistribute it and/or modify       *
 * it under the terms of the GNU Lesser General Public License as published   *
 * by the Free Software Foundation, either version 3 of the License, or       *
 * (at your option) any later version.                                        *
17
 *                                                                            *
18
19
20
21
 * This program is distributed in the hope that it will be useful,            *
 * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
 * GNU Lesser General Public License for more details.                        *
22
 *                                                                            *
23
24
 * You should have received a copy of the GNU Lesser General Public License   *
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
25
26
27
28
29
30
31
32
 * -------------------------------------------------------------------------- */

/**
 * This file contains the kernel for evalauating nonbonded forces and the first stage of GBSA.
 * It is included several times in kCalculateCDLJObcGbsaForces1.cu with different #defines to generate
 * different versions of the kernels.
 */

33
__global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit)
34
35
36
37
{
    extern __shared__ Atom sA[];
    unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
38
    unsigned int numWorkUnits = cSim.pInteractionCount[0];
39
40
    unsigned int pos = warp*numWorkUnits/totalWarps;
    unsigned int end = (warp+1)*numWorkUnits/totalWarps;
41
42
    float CDLJObcGbsa_energy;
    float energy = 0.0f;
43
44
45
#ifdef USE_CUTOFF
    float* tempBuffer = (float*) &sA[cSim.nonbond_threads_per_block];
#endif
46

47
48
49
50
#ifdef USE_EWALD
    const float SQRT_PI = sqrt(PI);
#endif

51
    unsigned int lasty = -0xFFFFFFFF;
52
53
54
55
56
57
58
59
60
61
62
63
64
65
    while (pos < end)
    {

        // Extract cell coordinates from appropriate work unit
        unsigned int x                      = workUnit[pos];
        unsigned int y                      = ((x >> 2) & 0x7fff) << GRIDBITS;
        bool bExclusionFlag                 = (x & 0x1);
        x                                   = (x >> 17) << GRIDBITS;
        unsigned int tgx                    = threadIdx.x & (GRID - 1);
        unsigned int i                      = x + tgx;
        float4 apos                         = cSim.pPosq[i];
        float2 a                            = cSim.pAttr[i];
        float br                            = cSim.pBornRadii[i];
        unsigned int tbx                    = threadIdx.x - tgx;
66
        unsigned int tj                     = tgx;
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
        Atom* psA                           = &sA[tbx];
        float4 af;
        af.x                        = 0.0f;
        af.y                        = 0.0f;
        af.z                        = 0.0f;
        af.w                        = 0.0f;
        if (x == y) // Handle diagonals uniquely at 50% efficiency
        {
            // Read fixed atom data into registers and GRF
            sA[threadIdx.x].x           = apos.x;
            sA[threadIdx.x].y           = apos.y;
            sA[threadIdx.x].z           = apos.z;
            sA[threadIdx.x].q           = apos.w;
            float q2                    = cSim.preFactor * apos.w;
            apos.w                     *= cSim.epsfac;
            sA[threadIdx.x].sig         = a.x;
            sA[threadIdx.x].eps         = a.y;
            sA[threadIdx.x].br          = br;
            if (!bExclusionFlag)
            {
                for (unsigned int j = 0; j < GRID; j++)
                {
                    float dx                = psA[j].x - apos.x;
                    float dy                = psA[j].y - apos.y;
                    float dz                = psA[j].z - apos.z;
#ifdef USE_PERIODIC
                    dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
                    dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
                    dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
                    float r2                = dx * dx + dy * dy + dz * dz;

                    // CDLJ part
                    float invR              = 1.0f / sqrt(r2);
                    float sig               = a.x + psA[j].sig;
                    float sig2              = invR * sig;
                    sig2                   *= sig2;
                    float sig6              = sig2 * sig2 * sig2;
                    float eps               = a.y * psA[j].eps;
                    float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
107
108
                    /* E */ 
		    CDLJObcGbsa_energy      = eps * (sig6 - 1.0f) * sig6;
109
#ifdef USE_CUTOFF
110
111
112
113
114
115
116
    #ifdef USE_EWALD
                    float r                 = sqrt(r2);
                    float alphaR            = cSim.alphaEwald * r;
                    dEdR                   += apos.w * psA[j].q * invR * (erfc(alphaR) + 2.0f * alphaR * exp ( - alphaR * alphaR) / SQRT_PI );
		    /* E */
                    CDLJObcGbsa_energy     += apos.w * psA[j].q * invR * erfc(alphaR);
    #else
117
                    dEdR                   += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
118
119
                    /* E */
                    CDLJObcGbsa_energy     += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
120
    #endif
121
#else
122
123
124
125
                    float factorX           = apos.w * psA[j].q * invR;
                    dEdR                   += factorX;
                    /* E */
                    CDLJObcGbsa_energy     += factorX;
126
127
128
129
130
131
132
133
134
135
136
137
138
#endif
                    dEdR                   *= invR * invR;

                    // ObcGbsaForce1 part
                    float alpha2_ij         = br * psA[j].br;
                    float D_ij              = r2 / (4.0f * alpha2_ij);
                    float expTerm           = exp(-D_ij);
                    float denominator2      = r2 + alpha2_ij * expTerm;
                    float denominator       = sqrt(denominator2);
                    float Gpol              = (q2 * psA[j].q) / (denominator * denominator2);
                    float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
                    af.w                   += dGpol_dalpha2_ij * psA[j].br;
                    dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
139
140
		    /* E */
		    CDLJObcGbsa_energy     += (q2 * psA[j].q) / denominator;
141
142
143
144
#ifdef USE_CUTOFF
                    if (r2 > cSim.nonbondedCutoffSqr)
                    {
                        dEdR = 0.0f;
145
146
147
 			/* E */
			CDLJObcGbsa_energy  = 0.0f;
                   }
148
#endif
149
150
151
		    /* E */
                    if (i < cSim.atoms)
                    {
152
                        energy             += 0.5f*CDLJObcGbsa_energy;
153
                    }
154
155
156
157
158
159
160
161
162
163
164
                    // Add Forces
                    dx                     *= dEdR;
                    dy                     *= dEdR;
                    dz                     *= dEdR;
                    af.x                   -= dx;
                    af.y                   -= dy;
                    af.z                   -= dz;
                }
            }
            else  // bExclusion
            {
165
                unsigned int xi   = x>>GRIDBITS;
166
                unsigned int cell = xi+xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
167
                unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
                for (unsigned int j = 0; j < GRID; j++)
                {
                    float dx                = psA[j].x - apos.x;
                    float dy                = psA[j].y - apos.y;
                    float dz                = psA[j].z - apos.z;
#ifdef USE_PERIODIC
                    dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
                    dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
                    dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
                        float r2                = dx * dx + dy * dy + dz * dz;

                    // CDLJ part
                    float invR              = 1.0f / sqrt(r2);
                    float sig               = a.x + psA[j].sig;
                    float sig2              = invR * sig;
                    sig2                   *= sig2;
                    float sig6              = sig2 * sig2 * sig2;
                    float eps               = a.y * psA[j].eps;
                    float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
188
189
                    /* E */ 
		    CDLJObcGbsa_energy      = eps * (sig6 - 1.0f) * sig6;
190
#ifdef USE_CUTOFF
191
192
193
194
195
196
197
    #ifdef USE_EWALD
                    float r                 = sqrt(r2);
                    float alphaR            = cSim.alphaEwald * r;
                    dEdR                   += apos.w * psA[j].q * invR * (erfc(alphaR) + 2.0f * alphaR * exp ( - alphaR * alphaR) / SQRT_PI );
		    /* E */
                    CDLJObcGbsa_energy     += apos.w * psA[j].q * invR * erfc(alphaR);
    #else
198
                    dEdR                   += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
199
200
                    /* E */
                    CDLJObcGbsa_energy     += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
201
    #endif
202
#else
203
204
205
206
                    float factorX           = apos.w * psA[j].q * invR;
                    dEdR                   += factorX;
                    /* E */
                    CDLJObcGbsa_energy     += factorX;
207
208
209
210
211
#endif
                    dEdR                   *= invR * invR;
                    if (!(excl & 0x1))
                    {
                        dEdR = 0.0f;
212
213
			/* E */
		        CDLJObcGbsa_energy  = 0.0f;
214
215
216
217
218
219
220
221
222
223
224
225
                    }

                    // ObcGbsaForce1 part
                    float alpha2_ij         = br * psA[j].br;
                    float D_ij              = r2 / (4.0f * alpha2_ij);
                    float expTerm           = exp(-D_ij);
                    float denominator2      = r2 + alpha2_ij * expTerm;
                    float denominator       = sqrt(denominator2);
                    float Gpol              = (q2 * psA[j].q) / (denominator * denominator2);
                    float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
                    af.w                   += dGpol_dalpha2_ij * psA[j].br;
                    dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
226
227
                    /* E */
		    CDLJObcGbsa_energy     += (q2 * psA[j].q) / denominator;
228
229
230
231
#if defined USE_PERIODIC
                    if (i >= cSim.atoms || x+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
                    {
                        dEdR = 0.0f;
232
233
			/* E */
		        CDLJObcGbsa_energy = 0.0f;
234
235
236
237
238
                    }
#elif defined USE_CUTOFF
                    if (r2 > cSim.nonbondedCutoffSqr)
                    {
                        dEdR = 0.0f;
239
240
			/* E */
		        CDLJObcGbsa_energy = 0.0f;
241
242
                    }
#endif
243
244
245
		    /* E */
                    if (i < cSim.atoms)
                    {
246
                        energy         += 0.5f*CDLJObcGbsa_energy;
247
                    }
248
249
250
251
252
253
254
255
256
257
258
259
260
                    // Add Forces
                    dx                     *= dEdR;
                    dy                     *= dEdR;
                    dz                     *= dEdR;
                    af.x                   -= dx;
                    af.y                   -= dy;
                    af.z                   -= dz;
                    excl                  >>= 1;
                }
            }

            // Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
261
            unsigned int offset         = x + tgx + warp*cSim.stride;
262
263
264
265
266
267
268
269
            float4 of                   = cSim.pForce4a[offset];
            of.x                       += af.x;
            of.y                       += af.y;
            of.z                       += af.z;
            of.w                       += af.w;
            cSim.pForce4a[offset]       = of;
            cSim.pBornForce[offset]     = af.w;
#else
270
            unsigned int offset         = x + tgx + (x >> GRIDBITS) * cSim.stride;
271
272
273
274
275
276
277
278
279
            cSim.pForce4a[offset]       = af;
            cSim.pBornForce[offset]     = af.w;
#endif
        }
        else        // 100% utilization
        {
            // Read fixed atom data into registers and GRF
            if (lasty != y)
            {
280
                unsigned int j              = y + tgx;
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
                float4 temp                 = cSim.pPosq[j];
                float2 temp1                = cSim.pAttr[j];
                sA[threadIdx.x].br          = cSim.pBornRadii[j];
                sA[threadIdx.x].x           = temp.x;
                sA[threadIdx.x].y           = temp.y;
                sA[threadIdx.x].z           = temp.z;
                sA[threadIdx.x].q           = temp.w;
                sA[threadIdx.x].sig         = temp1.x;
                sA[threadIdx.x].eps         = temp1.y;
            }
            sA[threadIdx.x].fx          = 0.0f;
            sA[threadIdx.x].fy          = 0.0f;
            sA[threadIdx.x].fz          = 0.0f;
            sA[threadIdx.x].fb          = 0.0f;
            float q2                    = apos.w * cSim.preFactor;
            apos.w                     *= cSim.epsfac;
            if (!bExclusionFlag)
            {
299
300
301
#ifdef USE_CUTOFF
                unsigned int flags = cSim.pInteractionFlag[pos];
                if (flags == 0)
302
                {
303
304
305
306
307
308
309
                    // No interactions in this block.
                }
                else if (flags == 0xFFFFFFFF)
#endif
                {
                    // Compute all interactions within this block.

310
                    for (unsigned int j = 0; j < GRID; j++)
311
312
313
314
                    {
                        float dx                = psA[tj].x - apos.x;
                        float dy                = psA[tj].y - apos.y;
                        float dz                = psA[tj].z - apos.z;
315
#ifdef USE_PERIODIC
316
317
318
                        dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
                        dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
                        dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
319
#endif
320
                        float r2                = dx * dx + dy * dy + dz * dz;
321

322
323
324
325
326
327
328
329
                        // CDLJ part
                        float invR              = 1.0f / sqrt(r2);
                        float sig               = a.x + psA[tj].sig;
                        float sig2              = invR * sig;
                        sig2                   *= sig2;
                        float sig6              = sig2 * sig2 * sig2;
                        float eps               = a.y * psA[tj].eps;
                        float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
330
331
                        /* E */ 
                        CDLJObcGbsa_energy      = eps * (sig6 - 1.0f) * sig6;
332
#ifdef USE_CUTOFF
333
334
335
336
337
338
339
    #ifdef USE_EWALD
                        float r                 = sqrt(r2);
                        float alphaR            = cSim.alphaEwald * r;
                        dEdR                   += apos.w * psA[tj].q * invR * (erfc(alphaR) + 2.0f * alphaR * exp ( - alphaR * alphaR) / SQRT_PI );
                        /* E */
                        CDLJObcGbsa_energy     += apos.w * psA[tj].q * invR * erfc(alphaR);
    #else
340
                        dEdR                   += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
341
342
                        /* E */
                        CDLJObcGbsa_energy     += apos.w * psA[tj].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
343
    #endif
344
#else
345
346
347
348
                        float factorX           = apos.w * psA[tj].q * invR;
                        dEdR                   += factorX;
                        /* E */
                        CDLJObcGbsa_energy     += factorX;
349
#endif
350
                        dEdR                   *= invR * invR;
351

352
353
354
355
356
357
358
359
360
361
362
                        // ObcGbsaForce1 part
                        float alpha2_ij         = br * psA[tj].br;
                        float D_ij              = r2 / (4.0f * alpha2_ij);
                        float expTerm           = exp(-D_ij);
                        float denominator2      = r2 + alpha2_ij * expTerm;
                        float denominator       = sqrt(denominator2);
                        float Gpol              = (q2 * psA[tj].q) / (denominator * denominator2);
                        float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
                        af.w                   += dGpol_dalpha2_ij * psA[tj].br;
                        psA[tj].fb             += dGpol_dalpha2_ij * br;
                        dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
363
364
                        /* E */
                        CDLJObcGbsa_energy     += (q2 * psA[tj].q) / denominator;
365
#ifdef USE_CUTOFF
366
367
368
                        if (r2 > cSim.nonbondedCutoffSqr)
                        {
                            dEdR = 0.0f;
369
370
			    /* E */
       			    CDLJObcGbsa_energy = 0.0f;
371
372
                        }
#endif
373
374
375
376
377
			/* E */
                        if (i < cSim.atoms)
                        {
                            energy         += CDLJObcGbsa_energy;
                        }
378
379
380
381
382
383
384
385
386
387
388
                        // Add forces
                        dx                     *= dEdR;
                        dy                     *= dEdR;
                        dz                     *= dEdR;
                        af.x                   -= dx;
                        af.y                   -= dy;
                        af.z                   -= dz;
                        psA[tj].fx             += dx;
                        psA[tj].fy             += dy;
                        psA[tj].fz             += dz;
                        tj                      = (tj + 1) & (GRID - 1);
389
                    }
390
391
392
393
394
395
                }
#ifdef USE_CUTOFF
                else
                {
                    // Compute only a subset of the interactions in this block.

396
                    for (unsigned int j = 0; j < GRID; j++)
397
398
399
400
401
402
403
404
405
406
                    {
                        if ((flags&(1<<j)) != 0)
                        {
                            float dx                = psA[j].x - apos.x;
                            float dy                = psA[j].y - apos.y;
                            float dz                = psA[j].z - apos.z;
#ifdef USE_PERIODIC
                            dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
                            dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
                            dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
407
#endif
408
                            float r2                = dx * dx + dy * dy + dz * dz;
409

410
411
412
413
414
415
416
417
                            // CDLJ part
                            float invR              = 1.0f / sqrt(r2);
                            float sig               = a.x + psA[j].sig;
                            float sig2              = invR * sig;
                            sig2                   *= sig2;
                            float sig6              = sig2 * sig2 * sig2;
                            float eps               = a.y * psA[j].eps;
                            float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
418
419
                            /* E */ 
                            CDLJObcGbsa_energy      = eps * (sig6 - 1.0f) * sig6;
420
#ifdef USE_CUTOFF
421
422
423
424
425
426
    #ifdef USE_EWALD
                            float r                 = sqrt(r2);
                            float alphaR            = cSim.alphaEwald * r;
                            dEdR                   += apos.w * psA[j].q * invR * (erfc(alphaR) + 2.0f * alphaR * exp ( - alphaR * alphaR) / SQRT_PI );
                            CDLJObcGbsa_energy     += apos.w * psA[j].q * invR * erfc(alphaR);
    #else
427
                            dEdR                   += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
428
429
                            /* E */
                            CDLJObcGbsa_energy     += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
430
    #endif
431
#else
432
433
434
435
                            float factorX           = apos.w * psA[j].q * invR;
                            dEdR                   += factorX;
                            /* E */
                            CDLJObcGbsa_energy     += factorX;
436
437
438
439
440
441
442
443
444
445
446
447
448
#endif
                            dEdR                   *= invR * invR;

                            // ObcGbsaForce1 part
                            float alpha2_ij         = br * psA[j].br;
                            float D_ij              = r2 / (4.0f * alpha2_ij);
                            float expTerm           = exp(-D_ij);
                            float denominator2      = r2 + alpha2_ij * expTerm;
                            float denominator       = sqrt(denominator2);
                            float Gpol              = (q2 * psA[j].q) / (denominator * denominator2);
                            float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
                            af.w                   += dGpol_dalpha2_ij * psA[j].br;
                            dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
449
450
                            /* E */
                            CDLJObcGbsa_energy     += (q2 * psA[j].q) / denominator;
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468

                            // Sum the Born forces.

                            tempBuffer[threadIdx.x] = dGpol_dalpha2_ij * br;
                            if (tgx % 2 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
                            if (tgx % 4 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
                            if (tgx % 8 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
                            if (tgx % 16 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
                            if (tgx == 0)
                                psA[j].fb += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
#ifdef USE_CUTOFF
                            if (r2 > cSim.nonbondedCutoffSqr)
                            {
                                dEdR = 0.0f;
469
470
				/* E */
				CDLJObcGbsa_energy  = 0.0f;
471
472
                            }
#endif
473
474
475
476
477
			    /* E */
                            if (i < cSim.atoms)
                            {
            			energy         += CDLJObcGbsa_energy;
                            }
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
                            // Add forces
                            dx                     *= dEdR;
                            dy                     *= dEdR;
                            dz                     *= dEdR;
                            af.x                   -= dx;
                            af.y                   -= dy;
                            af.z                   -= dz;
                            tempBuffer[threadIdx.x] = dx;
                            if (tgx % 2 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
                            if (tgx % 4 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
                            if (tgx % 8 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
                            if (tgx % 16 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
                            if (tgx == 0)
                                psA[j].fx += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
                            tempBuffer[threadIdx.x] = dy;
                            if (tgx % 2 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
                            if (tgx % 4 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
                            if (tgx % 8 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
                            if (tgx % 16 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
                            if (tgx == 0)
                                psA[j].fy += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
                            tempBuffer[threadIdx.x] = dz;
                            if (tgx % 2 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
                            if (tgx % 4 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
                            if (tgx % 8 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
                            if (tgx % 16 == 0)
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
                            if (tgx == 0)
                                psA[j].fz += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
                        }
                    }
520
                }
521
#endif
522
523
524
            }
            else  // bExclusion
            {
525
526
                unsigned int xi   = x>>GRIDBITS;
                unsigned int yi   = y>>GRIDBITS;
527
                unsigned int cell = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
528
529
                unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
                excl              = (excl >> tgx) | (excl << (GRID - tgx));
530
                for (unsigned int j = 0; j < GRID; j++)
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
                {
                    float dx                = psA[tj].x - apos.x;
                    float dy                = psA[tj].y - apos.y;
                    float dz                = psA[tj].z - apos.z;
#ifdef USE_PERIODIC
                    dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
                    dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
                    dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
                    float r2                = dx * dx + dy * dy + dz * dz;

                    // CDLJ part
                    float invR              = 1.0f / sqrt(r2);
                    float sig               = a.x + psA[tj].sig;
                    float sig2              = invR * sig;
                    sig2                   *= sig2;
                    float sig6              = sig2 * sig2 * sig2;
                    float eps               = a.y * psA[tj].eps;
                    float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
550
551
                    /* E */ 
		    CDLJObcGbsa_energy      = eps * (sig6 - 1.0f) * sig6;
552
#ifdef USE_CUTOFF
553
554
555
556
557
558
559
    #ifdef USE_EWALD
                    float r                 = sqrt(r2);
                    float alphaR            = cSim.alphaEwald * r;
                    dEdR                   += apos.w * psA[tj].q * invR * (erfc(alphaR) + 2.0f * alphaR * exp ( - alphaR * alphaR) / SQRT_PI );
                    /* E */
                    CDLJObcGbsa_energy     += apos.w * psA[tj].q * invR * erfc(alphaR);
    #else
560
                    dEdR                   += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
561
562
                    /* E */
                    CDLJObcGbsa_energy     += apos.w * psA[tj].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
563
    #endif
564
#else
565
566
567
568
                    float factorX           = apos.w * psA[tj].q * invR;
                    dEdR                   += factorX;
                    /* E */
                    CDLJObcGbsa_energy     += factorX;
569
570
571
572
573
#endif
                    dEdR                   *= invR * invR;
                    if (!(excl & 0x1))
                    {
                        dEdR = 0.0f;
574
575
			/* E */
			CDLJObcGbsa_energy = 0.0f;
576
577
578
579
580
581
582
583
584
585
586
587
588
                    }

                    // ObcGbsaForce1 part
                    float alpha2_ij         = br * psA[tj].br;
                    float D_ij              = r2 / (4.0f * alpha2_ij);
                    float expTerm           = exp(-D_ij);
                    float denominator2      = r2 + alpha2_ij * expTerm;
                    float denominator       = sqrt(denominator2);
                    float Gpol              = (q2 * psA[tj].q) / (denominator * denominator2);
                    float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
                    af.w                   += dGpol_dalpha2_ij * psA[tj].br;
                    psA[tj].fb             += dGpol_dalpha2_ij * br;
                    dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
589
590
		    /* E */
		    CDLJObcGbsa_energy     += (q2 * psA[tj].q) / denominator;
591
592
593
594
#if defined USE_PERIODIC
                    if (i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
                    {
                        dEdR = 0.0f;
595
596
			/* E */
			CDLJObcGbsa_energy = 0.0f;
597
598
599
600
601
                    }
#elif defined USE_CUTOFF
                    if (r2 > cSim.nonbondedCutoffSqr)
                    {
                        dEdR = 0.0f;
602
603
			/* E */
			CDLJObcGbsa_energy = 0.0f;
604
605
                    }
#endif
606
607
608
609
610
		    /* E */
                    if (i < cSim.atoms)
                    {
                        energy             += CDLJObcGbsa_energy;
                    }
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
                    // Add forces
                    dx                     *= dEdR;
                    dy                     *= dEdR;
                    dz                     *= dEdR;
                    af.x                   -= dx;
                    af.y                   -= dy;
                    af.z                   -= dz;
                    psA[tj].fx             += dx;
                    psA[tj].fy             += dy;
                    psA[tj].fz             += dz;
                    excl                  >>= 1;
                    tj                      = (tj + 1) & (GRID - 1);
                }
            }

            // Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
628
            unsigned int offset         = x + tgx + warp*cSim.stride;
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
            float4 of                   = cSim.pForce4a[offset];
            of.x                       += af.x;
            of.y                       += af.y;
            of.z                       += af.z;
            of.w                       += af.w;
            cSim.pForce4a[offset]       = of;
            cSim.pBornForce[offset]     = af.w;
            offset                      = y + tgx + warp*cSim.stride;
            of                          = cSim.pForce4a[offset];
            of.x                       += sA[threadIdx.x].fx;
            of.y                       += sA[threadIdx.x].fy;
            of.z                       += sA[threadIdx.x].fz;
            of.w                       += sA[threadIdx.x].fb;
            cSim.pForce4a[offset]       = of;
            cSim.pBornForce[offset]     = af.w;
#else
645
            unsigned int offset         = x + tgx + (y >> GRIDBITS) * cSim.stride;
646
647
648
649
650
651
652
653
654
655
656
657
658
659
            cSim.pForce4a[offset]       = af;
            cSim.pBornForce[offset]     = af.w;
            af.x                        = sA[threadIdx.x].fx;
            af.y                        = sA[threadIdx.x].fy;
            af.z                        = sA[threadIdx.x].fz;
            af.w                        = sA[threadIdx.x].fb;
            offset                      = y + tgx + (x >> GRIDBITS) * cSim.stride;
            cSim.pForce4a[offset]       = af;
            cSim.pBornForce[offset]     = af.w;
#endif
            lasty = y;
        }
        pos++;
    }
660
    cSim.pEnergy[blockIdx.x*blockDim.x+threadIdx.x] += energy;
661
}