kCalculateObcGbsaBornSum.h 16 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
/* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
 * -------------------------------------------------------------------------- *
 * This is part of the OpenMM molecular simulation toolkit originating from   *
 * Simbios, the NIH National Center for Physics-Based Simulation of           *
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
 * Portions copyright (c) 2009 Stanford University and the Authors.           *
 * Authors: Scott Le Grand, Peter Eastman                                     *
 * Contributors:                                                              *
 *                                                                            *
13
14
15
16
 * This program is free software: you can redistribute it and/or modify       *
 * it under the terms of the GNU Lesser General Public License as published   *
 * by the Free Software Foundation, either version 3 of the License, or       *
 * (at your option) any later version.                                        *
17
 *                                                                            *
18
19
20
21
 * This program is distributed in the hope that it will be useful,            *
 * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
 * GNU Lesser General Public License for more details.                        *
22
 *                                                                            *
23
24
 * You should have received a copy of the GNU Lesser General Public License   *
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
25
26
27
28
29
30
31
32
 * -------------------------------------------------------------------------- */

/**
 * This file contains the kernel for calculating Born sums.  It is included
 * several times in kCalculateObcGbsaBornSum.cu with different #defines to generate
 * different versions of the kernels.
 */

Scott Le Grand's avatar
Scott Le Grand committed
33
34
35
__global__ 
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
36
#elif (__CUDA_ARCH__ >= 120)
Scott Le Grand's avatar
Scott Le Grand committed
37
38
39
40
41
__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
#endif
void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* workUnit)
42
{
43
    extern __shared__ volatile Atom sA[];
44
45
    unsigned int totalWarps   = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
    unsigned int warp         = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
46
    unsigned int numWorkUnits = cSim.pInteractionCount[0];
47
48
    unsigned int pos          = warp*numWorkUnits/totalWarps;
    unsigned int end          = (warp+1)*numWorkUnits/totalWarps;
49

50
#ifdef USE_CUTOFF
51
    volatile float* tempBuffer = (volatile float*) &sA[cSim.nonbond_threads_per_block];
52
#endif
53

54
    while (pos < end)
55
56
    {
        // Extract cell coordinates from appropriate work unit
57
58
        
        unsigned int x = workUnit[pos];
59
60
61
62
63
64
65
66
67
68
        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
        x = (x >> 17) << GRIDBITS;
        float       dx;
        float       dy;
        float       dz;
        float       r2;
        float       r;

        unsigned int tgx = threadIdx.x & (GRID - 1);
        unsigned int tbx = threadIdx.x - tgx;
69
        unsigned int tj  = tgx;
70
        volatile Atom* psA = &sA[tbx];
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90

        if (x == y) // Handle diagonals uniquely at 50% efficiency
        {
            // Read fixed atom data into registers and GRF
            unsigned int i = x + tgx;
            float4 apos = cSim.pPosq[i];    // Local atom x, y, z, sum
            float2 ar = cSim.pObcData[i];   // Local atom vr, sr
            sA[threadIdx.x].x           = apos.x;
            sA[threadIdx.x].y           = apos.y;
            sA[threadIdx.x].z           = apos.z;
            sA[threadIdx.x].r           = ar.x;
            sA[threadIdx.x].sr          = ar.y;
            apos.w                      = 0.0f;

            for (unsigned int j = 0; j < GRID; j++)
            {
                dx                      = psA[j].x - apos.x;
                dy                      = psA[j].y - apos.y;
                dz                      = psA[j].z - apos.z;
#ifdef USE_PERIODIC
91
92
93
                dx                     -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
                dy                     -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
                dz                     -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
94
95
#endif
                r2                      = dx * dx + dy * dy + dz * dz;
96
#if defined USE_CUTOFF
97
                if (i < cSim.atoms && x+j < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
98
99
#else
                if (i < cSim.atoms && x+j < cSim.atoms )
100
101
#endif
                {
102
                    r                       = sqrtf(r2);
103
104
105
106
107
108
109
110
                    float rInverse          = 1.0f / r;
                    float rScaledRadiusJ    = r + psA[j].sr;
                    if ((j != tgx) && (ar.x < rScaledRadiusJ))
                    {
                        float l_ij     = 1.0f / max(ar.x, fabs(r - psA[j].sr));
                        float u_ij     = 1.0f / rScaledRadiusJ;
                        float l_ij2    = l_ij * l_ij;
                        float u_ij2    = u_ij * u_ij;
111
                        float ratio    = logf(u_ij / l_ij);
112
113
114
115
116
117
                        apos.w        += l_ij -
                                         u_ij +
                                         0.25f * r * (u_ij2 - l_ij2) +
                                         (0.50f * rInverse * ratio) +
                                         (0.25f * psA[j].sr * psA[j].sr * rInverse) *
                                         (l_ij2 - u_ij2);
118
                        float rj = psA[j].sr;
119
                        if (ar.x < (rj - r))
120
121
122
123
124
125
126
127
128
                        {
                            apos.w += 2.0f * ((1.0f / ar.x) - l_ij);
                        }
                    }
                }
            }

            // Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
129
            unsigned int offset = x + tgx + warp*cSim.stride;
130
131
            cSim.pBornSum[offset] += apos.w;
#else
132
            unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
133
134
135
136
137
138
            cSim.pBornSum[offset] = apos.w;
#endif
        }
        else        // 100% utilization
        {
            // Read fixed atom data into registers and GRF
139
            unsigned int j                           = y + tgx;
140
141
142
143
144
145
146
147
148
149
150
151
152
            unsigned int i                  = x + tgx;

            float4 temp                     = cSim.pPosq[j];
            float2 temp1                    = cSim.pObcData[j];
            float4 apos                     = cSim.pPosq[i];        // Local atom x, y, z, sum
            float2 ar                       = cSim.pObcData[i];    // Local atom vr, sr
            sA[threadIdx.x].x               = temp.x;
            sA[threadIdx.x].y               = temp.y;
            sA[threadIdx.x].z               = temp.z;
            sA[threadIdx.x].r               = temp1.x;
            sA[threadIdx.x].sr              = temp1.y;
            sA[threadIdx.x].sum = apos.w    = 0.0f;

153
#ifdef USE_CUTOFF
154
            unsigned int flags = cSim.pInteractionFlag[pos];
155
156
157
158
159
160
            if (flags == 0)
            {
                // No interactions in this block.
            }
            else if (flags == 0xFFFFFFFF)
#endif
161
            {
162
163
164
165
166
167
168
                // Compute all interactions within this block.

                for (unsigned int j = 0; j < GRID; j++)
                {
                    dx                      = psA[tj].x - apos.x;
                    dy                      = psA[tj].y - apos.y;
                    dz                      = psA[tj].z - apos.z;
169
#ifdef USE_PERIODIC
170
171
172
                    dx                     -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
                    dy                     -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
                    dz                     -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
173
#endif
174
                    r2                      = dx * dx + dy * dy + dz * dz;
175
#ifdef USE_CUTOFF
176
                    if (i < cSim.atoms && y+tj < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
177
178
#else
                    if (i < cSim.atoms && y+tj < cSim.atoms)
179
180
#endif
                    {
181
                        r                       = sqrtf(r2);
182
183
184
                        float rInverse          = 1.0f / r;
                        float rScaledRadiusJ    = r + psA[tj].sr;
                        if (ar.x < rScaledRadiusJ)
185
                        {
186
187
188
189
                            float l_ij     = 1.0f / max(ar.x, fabs(r - psA[tj].sr));
                            float u_ij     = 1.0f / rScaledRadiusJ;
                            float l_ij2    = l_ij * l_ij;
                            float u_ij2    = u_ij * u_ij;
190
                            float ratio    = logf(u_ij / l_ij);
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
                            float term     = l_ij -
                                             u_ij +
                                             0.25f * r * (u_ij2 - l_ij2) +
                                             (0.50f * rInverse * ratio) +
                                             (0.25f * psA[tj].sr * psA[tj].sr * rInverse) *
                                             (l_ij2 - u_ij2);
                            float srj = psA[tj].sr;
                            if (ar.x < (srj - r))
                            {
                                term += 2.0f * ((1.0f / ar.x) - l_ij);
                            }
                            apos.w        += term;
                        }
                        float rScaledRadiusI    = r + ar.y;
                        if (psA[tj].r < rScaledRadiusI)
                        {
                            float l_ij     = 1.0f / max(psA[tj].r, fabs(r - ar.y));
                            float u_ij     = 1.0f / rScaledRadiusI;
                            float l_ij2    = l_ij * l_ij;
                            float u_ij2    = u_ij * u_ij;
211
                            float ratio    = logf(u_ij / l_ij);
212
213
214
215
216
217
218
219
220
221
222
223
                            float term     = l_ij -
                                             u_ij +
                                             0.25f * r * (u_ij2 - l_ij2) +
                                             (0.50f * rInverse * ratio) +
                                             (0.25f * ar.y * ar.y * rInverse) *
                                             (l_ij2 - u_ij2);
                            float rj = psA[tj].r;
                            if (rj < (ar.y - r))
                            {
                                term += 2.0f * ((1.0f / psA[tj].r) - l_ij);
                            }
                            psA[tj].sum    += term;
224
225
                        }
                    }
226
227
228
229
230
231
232
                    tj = (tj - 1) & (GRID - 1);
                }
            }
#ifdef USE_CUTOFF
            else
            {
                // Compute only a subset of the interactions in this block.
233

234
235
236
237
238
239
240
241
242
                for (unsigned int j = 0; j < GRID; j++)
                {
                    if ((flags&(1<<j)) != 0)
                    {
                        tempBuffer[threadIdx.x] = 0.0f;
                        dx                      = psA[j].x - apos.x;
                        dy                      = psA[j].y - apos.y;
                        dz                      = psA[j].z - apos.z;
#ifdef USE_PERIODIC
243
244
245
                        dx                     -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
                        dy                     -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
                        dz                     -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
246
247
#endif
                        r2                      = dx * dx + dy * dy + dz * dz;
248
#ifdef USE_CUTOFF
249
                        if (i < cSim.atoms && y+j < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
250
251
#else
                        if (i < cSim.atoms && y+j < cSim.atoms)
252
#endif
253
                        {
254
                            r                       = sqrtf(r2);
255
256
257
258
259
260
261
262
                            float rInverse          = 1.0f / r;
                            float rScaledRadiusJ    = r + psA[j].sr;
                            if (ar.x < rScaledRadiusJ)
                            {
                                float l_ij     = 1.0f / max(ar.x, fabs(r - psA[j].sr));
                                float u_ij     = 1.0f / rScaledRadiusJ;
                                float l_ij2    = l_ij * l_ij;
                                float u_ij2    = u_ij * u_ij;
263
                                float ratio    = logf(u_ij / l_ij);
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
                                float term     = l_ij -
                                                 u_ij +
                                                 0.25f * r * (u_ij2 - l_ij2) +
                                                 (0.50f * rInverse * ratio) +
                                                 (0.25f * psA[j].sr * psA[j].sr * rInverse) *
                                                 (l_ij2 - u_ij2);
                                float srj = psA[j].sr;
                                if (ar.x < (srj - r))
                                {
                                    term += 2.0f * ((1.0f / ar.x) - l_ij);
                                }
                                apos.w        += term;
                            }
                            float rScaledRadiusI    = r + ar.y;
                            if (psA[j].r < rScaledRadiusI)
                            {
                                float l_ij     = 1.0f / max(psA[j].r, fabs(r - ar.y));
                                float u_ij     = 1.0f / rScaledRadiusI;
                                float l_ij2    = l_ij * l_ij;
                                float u_ij2    = u_ij * u_ij;
284
                                float ratio    = logf(u_ij / l_ij);
285
286
287
288
289
290
291
292
293
294
295
296
297
                                float term     = l_ij -
                                                 u_ij +
                                                 0.25f * r * (u_ij2 - l_ij2) +
                                                 (0.50f * rInverse * ratio) +
                                                 (0.25f * ar.y * ar.y * rInverse) *
                                                 (l_ij2 - u_ij2);
                                float rj = psA[j].r;
                                if (rj < (ar.y - r))
                                {
                                    term += 2.0f * ((1.0f / psA[j].r) - l_ij);
                                }
                                tempBuffer[threadIdx.x] = term;
                            }
298
                        }
299
300
301
302
303
304
305
306
307
308
309
310
311

                        // Sum the terms.

                        if (tgx % 2 == 0)
                            tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
                        if (tgx % 4 == 0)
                            tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
                        if (tgx % 8 == 0)
                            tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
                        if (tgx % 16 == 0)
                            tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
                        if (tgx == 0)
                            psA[j].sum += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
312
313
314
                    }
                }
            }
315
#endif
316
317
318

            // Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
319
            unsigned int offset = x + tgx + warp*cSim.stride;
320
321
322
323
            cSim.pBornSum[offset] += apos.w;
            offset = y + tgx + warp*cSim.stride;
            cSim.pBornSum[offset] += sA[threadIdx.x].sum;
#else
324
            unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
325
326
327
328
329
            cSim.pBornSum[offset] = apos.w;
            offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
            cSim.pBornSum[offset] = sA[threadIdx.x].sum;
#endif
        }
330
        pos++;
331
332
    }
}