kCalculatePME.cu 17.7 KB
Newer Older
1
2
3
4
5
6
7
8
/* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
 * -------------------------------------------------------------------------- *
 * This is part of the OpenMM molecular simulation toolkit originating from   *
 * Simbios, the NIH National Center for Physics-Based Simulation of           *
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
9
 * Portions copyright (c) 2009-2010 Stanford University and the Authors.      *
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
 * Authors: Erik Lindahl, Rossen Apostolov, Szilard Pall, Peter Eastman       *
 * Contributors:                                                              *
 *                                                                            *
 * This program is free software: you can redistribute it and/or modify       *
 * it under the terms of the GNU Lesser General Public License as published   *
 * by the Free Software Foundation, either version 3 of the License, or       *
 * (at your option) any later version.                                        *
 *                                                                            *
 * This program is distributed in the hope that it will be useful,            *
 * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
 * GNU Lesser General Public License for more details.                        *
 *                                                                            *
 * You should have received a copy of the GNU Lesser General Public License   *
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
 * -------------------------------------------------------------------------- */

#include "gputypes.h"
28
#include "bbsort.h"
29
30
31
32
33
34
#include <cuda.h>

using namespace std;

static __constant__ cudaGmxSimulation cSim;

Peter Eastman's avatar
Peter Eastman committed
35
36
/* Cuda compiler on Windows does not recognized "static const float" values */
#define LOCAL_HACK_PI 3.1415926535897932384626433832795
37

38
39
40
41
42
43
44
45
46
47
48
49
50
51
void SetCalculatePMESim(gpuContext gpu)
{
    cudaError_t status;
    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}

void GetCalculatePMESim(gpuContext gpu)
{
    cudaError_t status;
    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}

Peter Eastman's avatar
Peter Eastman committed
52
53
texture<float4, 1, cudaReadModeElementType> bsplineThetaRef;

54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
inline __host__ __device__ int fast_mod(int a, int b)
{
    return (b & (b - 1)) ? a % b : a & (b - 1);
}
inline __host__ __device__ float4 make_float4(float s)
{
    return make_float4(s, s, s, s);
}
inline __host__ __device__ float4 operator-(float4 &a)
{
    return make_float4(-a.x, -a.y, -a.z, -a.w);
}
inline __host__ __device__ float4 operator-(float4 a, float4 b)
{
    return make_float4(a.x - b.x, a.y - b.y, a.z - b.z,  a.w - b.w);
}
inline __host__ __device__ float4 operator+(float4 a, float4 b)
{
    return make_float4(a.x + b.x, a.y + b.y, a.z + b.z,  a.w + b.w);
}
inline __host__ __device__ float4 operator+(float4 a, float b)
{
    return make_float4(a.x + b, a.y + b, a.z + b, a.w + b);
}
inline __host__ __device__ float4 operator+(float a, float4 b)
{
    return make_float4(a + b.x, a + b.y, a + b.z,  a + b.w);
}
inline __host__ __device__ float4 operator*(float s, float4 a)
{
    return make_float4(a.x * s, a.y * s, a.z * s, a.w * s);
}
inline __host__ __device__ float4 operator*(float4 a, float4 b)
{
    return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w + b.w);
}
inline __host__ __device__ float4 make_float4(int3 a)
{
92
    return make_float4((float) a.x, (float) a.y, (float) a.z, 0);
93
94
}

Scott Le Grand's avatar
Scott Le Grand committed
95
96
97
__global__ 
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(1024, 1)
98
#elif (__CUDA_ARCH__ >= 120)
Scott Le Grand's avatar
Scott Le Grand committed
99
100
101
102
103
__launch_bounds__(512, 1)
#else
__launch_bounds__(256, 1)
#endif
void kUpdateBsplines_kernel()
104
105
106
{
    unsigned int    tnb = blockDim.x * gridDim.x;
    unsigned int    tid = blockIdx.x * blockDim.x + threadIdx.x;
107
    extern __shared__ float4 bsplines_cache[]; // size = 2 * block_size * pme_order
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122

    const float4 div_o   = make_float4(1.0f/(PME_ORDER - 1));

    for (int i = tid; i < cSim.atoms; i += tnb)
    {

        float4* data    = &bsplines_cache[threadIdx.x*PME_ORDER];
        float4* ddata   = &bsplines_cache[threadIdx.x*PME_ORDER + blockDim.x*PME_ORDER];

        for (int j = 0; j < PME_ORDER; j++)
        {
	    data[j] = make_float4(0.0f);
            ddata[j] = make_float4(0.0f);
        }

Peter Eastman's avatar
Peter Eastman committed
123
        float4 posq = cSim.pPosq[i];
124
125
126
127
128
129
        posq.x -= floor(posq.x*cSim.invPeriodicBoxSizeX)*cSim.periodicBoxSizeX;
        posq.y -= floor(posq.y*cSim.invPeriodicBoxSizeY)*cSim.periodicBoxSizeY;
        posq.z -= floor(posq.z*cSim.invPeriodicBoxSizeZ)*cSim.periodicBoxSizeZ;
        float3 t = make_float3((posq.x*cSim.invPeriodicBoxSizeX)*cSim.pmeGridSize.x,
                               (posq.y*cSim.invPeriodicBoxSizeY)*cSim.pmeGridSize.y,
                               (posq.z*cSim.invPeriodicBoxSizeZ)*cSim.pmeGridSize.z);
Peter Eastman's avatar
Peter Eastman committed
130
        float4 dr = make_float4(t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f);
131
132
133
134
        int3 gridIndex = make_int3(((int) t.x) % cSim.pmeGridSize.x,
                              ((int) t.y) % cSim.pmeGridSize.y,
                              ((int) t.z) % cSim.pmeGridSize.z);
        cSim.pPmeAtomGridIndex[i] = make_int2(i, gridIndex.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+gridIndex.y*cSim.pmeGridSize.z+gridIndex.z);
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173

        data[PME_ORDER - 1] = make_float4(0.0f);
        data[1]            = dr;
        data[0]            = make_float4(1.0f) - dr;

        for (int j = 3; j < PME_ORDER; j++)
        {
            float div = 1.0f / ((float)j - 1.0f);
            data[j - 1] = div * dr * data[j - 2];

            for (int k = 1; k < (j - 1); k++)
            {
                data[j - k - 1] =
                   div * (
                           (dr + float(k))          * data[j - k - 2] +
                           (-dr + ((float)(j - k))) * data[j - k - 1]);
            }
            data[0] = div * (- dr + 1) * data[0];
        }

        ddata[0] = -data[0];

        for (int j = 1; j < PME_ORDER; j++)
            ddata[j] = data[j - 1] - data[j];

        data[PME_ORDER - 1] = div_o * dr * data[PME_ORDER - 2];

        for (int j = 1; j < (PME_ORDER - 1); j++)
        {
            data[PME_ORDER - j - 1] =
                div_o * (
                    (dr + (float)j)                 * data[PME_ORDER - j - 2] +
                    (-dr + ((float)(PME_ORDER - j))) * data[PME_ORDER - j - 1]
                );
        }
        data[0] = div_o * (-dr + 1.0f) * data[0];

        for (int j = 0; j < PME_ORDER; j++)
        {
174
            data[j].w = posq.w; // Storing the charge here improves cache coherency in the charge spreading kernel
175
176
            cSim.pPmeBsplineTheta[i + j*cSim.atoms] =  data[j];
            cSim.pPmeBsplineDtheta[i + j*cSim.atoms] = ddata[j];
177
178
179
180
        }
    }
}

181
182
183
184
185
186
/**
 * For each grid point, find the range of sorted atoms associated with that point.
 */
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(1024, 1)
187
#elif (__CUDA_ARCH__ >= 120)
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
__launch_bounds__(512, 1)
#else
__launch_bounds__(256, 1)
#endif
void kFindAtomRangeForGrid_kernel()
{
    int thread = blockIdx.x*blockDim.x+threadIdx.x;
    int start = (cSim.atoms*thread)/(blockDim.x*gridDim.x);
    int end = (cSim.atoms*(thread+1))/(blockDim.x*gridDim.x);
    int last = (start == 0 ? -1 : cSim.pPmeAtomGridIndex[start-1].y);
    for (int i = start; i < end; ++i)
    {
        int2 atomData = cSim.pPmeAtomGridIndex[i];
        int gridIndex = atomData.y;
        if (gridIndex != last)
        {
            for (int j = last+1; j <= gridIndex; ++j)
                cSim.pPmeAtomRange[j] = i;
            last = gridIndex;
        }

        // The grid index won't be needed again.  Reuse that component to hold the z index, thus saving
        // some work in the charge spreading kernel.

        float posz = cSim.pPosq[atomData.x].z;
        posz -= floor(posz*cSim.invPeriodicBoxSizeZ)*cSim.periodicBoxSizeZ;
        int z = ((int) ((posz*cSim.invPeriodicBoxSizeZ)*cSim.pmeGridSize.z)) % cSim.pmeGridSize.z;
        cSim.pPmeAtomGridIndex[i].y = z;
    }

    // Fill in values beyond the last atom.

    if (thread == blockDim.x*gridDim.x-1)
    {
        int gridSize = cSim.pmeGridSize.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z;
        for (int j = last+1; j <= gridSize; ++j)
            cSim.pPmeAtomRange[j] = cSim.atoms;
    }
}

Peter Eastman's avatar
Peter Eastman committed
228
__global__
Scott Le Grand's avatar
Scott Le Grand committed
229
void kGridSpreadCharge_kernel()
230
{
231
232
233
    unsigned int numGridPoints = cSim.pmeGridSize.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z;
    unsigned int numThreads = gridDim.x*blockDim.x;
    for (int gridIndex = blockIdx.x*blockDim.x+threadIdx.x; gridIndex < numGridPoints; gridIndex += numThreads)
234
235
    {
        int3 gridPoint;
236
237
238
239
        gridPoint.x = gridIndex/(cSim.pmeGridSize.y*cSim.pmeGridSize.z);
        int remainder = gridIndex-gridPoint.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z;
        gridPoint.y = remainder/cSim.pmeGridSize.z;
        gridPoint.z = remainder-gridPoint.y*cSim.pmeGridSize.z;
240
        float result = 0.0f;
241
        for (int ix = 0; ix < PME_ORDER; ++ix)
242
243
        {
            int x = gridPoint.x-ix+(gridPoint.x >= ix ? 0 : cSim.pmeGridSize.x);
244
            for (int iy = 0; iy < PME_ORDER; ++iy)
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
            {
                int y = gridPoint.y-iy+(gridPoint.y >= iy ? 0 : cSim.pmeGridSize.y);
                int z1 = gridPoint.z-PME_ORDER+1;
                z1 += (z1 >= 0 ? 0 : cSim.pmeGridSize.z);
                int z2 = (z1 < gridPoint.z ? gridPoint.z : cSim.pmeGridSize.z-1);
                int gridIndex1 = x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+y*cSim.pmeGridSize.z+z1;
                int gridIndex2 = x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+y*cSim.pmeGridSize.z+z2;
                int firstAtom = cSim.pPmeAtomRange[gridIndex1];
                int lastAtom = cSim.pPmeAtomRange[gridIndex2+1];
                for (int i = firstAtom; i < lastAtom; ++i)
                {
                    int2 atomData = cSim.pPmeAtomGridIndex[i];
                    int atomIndex = atomData.x;
                    int z = atomData.y;
                    int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : cSim.pmeGridSize.z);
260
                    float atomCharge = tex1Dfetch(bsplineThetaRef, atomIndex+ix*cSim.atoms).w;
261
262
263
                    result += atomCharge*tex1Dfetch(bsplineThetaRef, atomIndex+ix*cSim.atoms).x*tex1Dfetch(bsplineThetaRef, atomIndex+iy*cSim.atoms).y*tex1Dfetch(bsplineThetaRef, atomIndex+iz*cSim.atoms).z;
                }
                if (z1 > gridPoint.z)
264
                {
265
266
267
268
                    gridIndex1 = x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+y*cSim.pmeGridSize.z;
                    gridIndex2 = x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+y*cSim.pmeGridSize.z+gridPoint.z;
                    firstAtom = cSim.pPmeAtomRange[gridIndex1];
                    lastAtom = cSim.pPmeAtomRange[gridIndex2+1];
269
270
                    for (int i = firstAtom; i < lastAtom; ++i)
                    {
271
                        int2 atomData = cSim.pPmeAtomGridIndex[i];
272
                        int atomIndex = atomData.x;
273
274
                        int z = atomData.y;
                        int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : cSim.pmeGridSize.z);
275
                        float atomCharge = tex1Dfetch(bsplineThetaRef, atomIndex+ix*cSim.atoms).w;
Peter Eastman's avatar
Peter Eastman committed
276
                        result += atomCharge*tex1Dfetch(bsplineThetaRef, atomIndex+ix*cSim.atoms).x*tex1Dfetch(bsplineThetaRef, atomIndex+iy*cSim.atoms).y*tex1Dfetch(bsplineThetaRef, atomIndex+iz*cSim.atoms).z;
277
278
                    }
                }
279
280
            }
        }
281
        cSim.pPmeGrid[gridIndex] = make_cuComplex(result*sqrt(cSim.epsfac), 0.0f);
282
283
284
    }
}

Scott Le Grand's avatar
Scott Le Grand committed
285
286
287
__global__ 
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(1024, 1)
288
#elif (__CUDA_ARCH__ >= 120)
Scott Le Grand's avatar
Scott Le Grand committed
289
290
291
292
293
__launch_bounds__(512, 1)
#else
__launch_bounds__(256, 1)
#endif
void kReciprocalConvolution_kernel()
294
295
{
    const unsigned int gridSize = cSim.pmeGridSize.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z;
296
297
    float expFactor = LOCAL_HACK_PI*LOCAL_HACK_PI/(cSim.alphaEwald*cSim.alphaEwald);
    float scaleFactor = 1.0/(LOCAL_HACK_PI*cSim.periodicBoxSizeX*cSim.periodicBoxSizeY*cSim.periodicBoxSizeZ);
298
299
300
301
302
303
304
305
306
307
308
309
    float energy = 0.0f;
    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x)
    {
        int kx = index/(cSim.pmeGridSize.y*cSim.pmeGridSize.z);
        int remainder = index-kx*cSim.pmeGridSize.y*cSim.pmeGridSize.z;
        int ky = remainder/cSim.pmeGridSize.z;
        int kz = remainder-ky*cSim.pmeGridSize.z;
        if (kx == 0 && ky == 0 && kz == 0)
            continue;
        int mx = (kx < (cSim.pmeGridSize.x+1)/2) ? kx : (kx-cSim.pmeGridSize.x);
        int my = (ky < (cSim.pmeGridSize.y+1)/2) ? ky : (ky-cSim.pmeGridSize.y);
        int mz = (kz < (cSim.pmeGridSize.z+1)/2) ? kz : (kz-cSim.pmeGridSize.z);
310
311
312
        float mhx = mx*cSim.invPeriodicBoxSizeX;
        float mhy = my*cSim.invPeriodicBoxSizeY;
        float mhz = mz*cSim.invPeriodicBoxSizeZ;
313
314
315
316
317
318
319
320
321
322
        float bx = cSim.pPmeBsplineModuli[0][kx];
        float by = cSim.pPmeBsplineModuli[1][ky];
        float bz = cSim.pPmeBsplineModuli[2][kz];
        cuComplex grid = cSim.pPmeGrid[index];
        float m2 = mhx*mhx+mhy*mhy+mhz*mhz;
        float denom = m2*bx*by*bz;
        float eterm = scaleFactor*exp(-expFactor*m2)/denom;
        cSim.pPmeGrid[index] = make_cuComplex(grid.x*eterm, grid.y*eterm);
        energy += eterm*(grid.x*grid.x + grid.y*grid.y);
    }
323
324
325
    cSim.pEnergy[blockIdx.x*blockDim.x+threadIdx.x] += 0.5f*energy;
}

Scott Le Grand's avatar
Scott Le Grand committed
326
327
328
__global__ 
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(1024, 1)
329
#elif (__CUDA_ARCH__ >= 120)
Scott Le Grand's avatar
Scott Le Grand committed
330
331
332
333
334
__launch_bounds__(512, 1)
#else
__launch_bounds__(256, 1)
#endif
void kGridInterpolateForce_kernel()
335
336
337
338
339
{
    for (int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < cSim.atoms; atom += blockDim.x*gridDim.x)
    {
        float3 force = make_float3(0.0f, 0.0f, 0.0f);
        float4 posq = cSim.pPosq[atom];
340
341
342
343
344
345
        posq.x -= floor(posq.x*cSim.invPeriodicBoxSizeX)*cSim.periodicBoxSizeX;
        posq.y -= floor(posq.y*cSim.invPeriodicBoxSizeY)*cSim.periodicBoxSizeY;
        posq.z -= floor(posq.z*cSim.invPeriodicBoxSizeZ)*cSim.periodicBoxSizeZ;
        float3 t = make_float3((posq.x*cSim.invPeriodicBoxSizeX)*cSim.pmeGridSize.x,
                               (posq.y*cSim.invPeriodicBoxSizeY)*cSim.pmeGridSize.y,
                               (posq.z*cSim.invPeriodicBoxSizeZ)*cSim.pmeGridSize.z);
Peter Eastman's avatar
Peter Eastman committed
346
347
348
        int3 gridIndex = make_int3(((int) t.x) % cSim.pmeGridSize.x,
                              ((int) t.y) % cSim.pmeGridSize.y,
                              ((int) t.z) % cSim.pmeGridSize.z);
349
350
        for (int ix = 0; ix < PME_ORDER; ix++)
        {
351
352
            int xindex = gridIndex.x+ix;
            xindex -= (xindex >= cSim.pmeGridSize.x ? cSim.pmeGridSize.x : 0);
353
354
            float tx = cSim.pPmeBsplineTheta[atom+ix*cSim.atoms].x;
            float dtx = cSim.pPmeBsplineDtheta[atom+ix*cSim.atoms].x;
355
356
            for (int iy = 0; iy < PME_ORDER; iy++)
            {
357
358
                int yindex = gridIndex.y+iy;
                yindex -= (yindex >= cSim.pmeGridSize.y ? cSim.pmeGridSize.y : 0);
359
360
                float ty = cSim.pPmeBsplineTheta[atom+iy*cSim.atoms].y;
                float dty = cSim.pPmeBsplineDtheta[atom+iy*cSim.atoms].y;
361
362
                for (int iz = 0; iz < PME_ORDER; iz++)
                {
363
364
                    int zindex               = gridIndex.z+iz;
                    zindex -= (zindex >= cSim.pmeGridSize.z ? cSim.pmeGridSize.z : 0);
365
366
                    float tz = cSim.pPmeBsplineTheta[atom+iz*cSim.atoms].z;
                    float dtz = cSim.pPmeBsplineDtheta[atom+iz*cSim.atoms].z;
367
                    int index                = xindex*cSim.pmeGridSize.y*cSim.pmeGridSize.z + yindex*cSim.pmeGridSize.z + zindex;
Peter Eastman's avatar
Peter Eastman committed
368
369
370
371
                    float gridvalue          = cSim.pPmeGrid[index].x;
                    force.x                 += dtx*ty*tz*gridvalue;
                    force.y                 += tx*dty*tz*gridvalue;
                    force.z                 += tx*ty*dtz*gridvalue;
372
373
374
375
376
                }
            }
        }
        float4 totalForce = cSim.pForce4[atom];
        float q = posq.w*sqrt(cSim.epsfac);
377
378
379
        totalForce.x -= q*force.x*cSim.pmeGridSize.x*cSim.invPeriodicBoxSizeX;
        totalForce.y -= q*force.y*cSim.pmeGridSize.y*cSim.invPeriodicBoxSizeY;
        totalForce.z -= q*force.z*cSim.pmeGridSize.z*cSim.invPeriodicBoxSizeZ;
380
381
        cSim.pForce4[atom] = totalForce;
    }
382
383
}

384
385
386
void kCalculatePME(gpuContext gpu)
{
//    printf("kCalculatePME\n");
Peter Eastman's avatar
Peter Eastman committed
387
388
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float4>();
    cudaBindTexture(NULL, &bsplineThetaRef, gpu->psPmeBsplineTheta->_pDevData, &channelDesc, gpu->psPmeBsplineTheta->_length*sizeof(float4));
389
390
    unsigned int threads = 16380/(2*PME_ORDER*sizeof(float4));
    kUpdateBsplines_kernel<<<gpu->sim.blocks, threads, 2*threads*PME_ORDER*sizeof(float4)>>>();
391
    LAUNCHERROR("kUpdateBsplines");
392
393
394
    bbSort(gpu->psPmeAtomGridIndex->_pDevData, gpu->natoms);
    kFindAtomRangeForGrid_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
    LAUNCHERROR("kFindAtomRangeForGrid");
Peter Eastman's avatar
Peter Eastman committed
395
    kGridSpreadCharge_kernel<<<16*gpu->sim.blocks, 64>>>();
396
    LAUNCHERROR("kGridSpreadCharge");
397
    cufftExecC2C(gpu->fftplan, gpu->psPmeGrid->_pDevData, gpu->psPmeGrid->_pDevData, CUFFT_FORWARD);
398
    kReciprocalConvolution_kernel<<<gpu->sim.blocks, gpu->sim.nonbond_threads_per_block>>>();
399
400
    LAUNCHERROR("kReciprocalConvolution");
    cufftExecC2C(gpu->fftplan, gpu->psPmeGrid->_pDevData, gpu->psPmeGrid->_pDevData, CUFFT_INVERSE);
Peter Eastman's avatar
Peter Eastman committed
401
    kGridInterpolateForce_kernel<<<2*gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
402
    LAUNCHERROR("kGridInterpolateForce");
403
}