kCalculatePME.cu 16.5 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
/* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
 * -------------------------------------------------------------------------- *
 * This is part of the OpenMM molecular simulation toolkit originating from   *
 * Simbios, the NIH National Center for Physics-Based Simulation of           *
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
 * Portions copyright (c) 2009 Stanford University and the Authors.           *
 * Authors: Erik Lindahl, Rossen Apostolov, Szilard Pall, Peter Eastman       *
 * Contributors:                                                              *
 *                                                                            *
 * This program is free software: you can redistribute it and/or modify       *
 * it under the terms of the GNU Lesser General Public License as published   *
 * by the Free Software Foundation, either version 3 of the License, or       *
 * (at your option) any later version.                                        *
 *                                                                            *
 * This program is distributed in the hope that it will be useful,            *
 * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
 * GNU Lesser General Public License for more details.                        *
 *                                                                            *
 * You should have received a copy of the GNU Lesser General Public License   *
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
 * -------------------------------------------------------------------------- */

#include "gputypes.h"
28
#include "bbsort.h"
29
30
31
32
33
34
#include <cuda.h>

using namespace std;

static __constant__ cudaGmxSimulation cSim;

Peter Eastman's avatar
Peter Eastman committed
35
36
/* Cuda compiler on Windows does not recognized "static const float" values */
#define LOCAL_HACK_PI 3.1415926535897932384626433832795
37

38
39
40
41
42
43
44
45
46
47
48
49
50
51
void SetCalculatePMESim(gpuContext gpu)
{
    cudaError_t status;
    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}

void GetCalculatePMESim(gpuContext gpu)
{
    cudaError_t status;
    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}

Peter Eastman's avatar
Peter Eastman committed
52
53
texture<float4, 1, cudaReadModeElementType> bsplineThetaRef;

54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
inline __host__ __device__ int fast_mod(int a, int b)
{
    return (b & (b - 1)) ? a % b : a & (b - 1);
}
inline __host__ __device__ float4 make_float4(float s)
{
    return make_float4(s, s, s, s);
}
inline __host__ __device__ float4 operator-(float4 &a)
{
    return make_float4(-a.x, -a.y, -a.z, -a.w);
}
inline __host__ __device__ float4 operator-(float4 a, float4 b)
{
    return make_float4(a.x - b.x, a.y - b.y, a.z - b.z,  a.w - b.w);
}
inline __host__ __device__ float4 operator+(float4 a, float4 b)
{
    return make_float4(a.x + b.x, a.y + b.y, a.z + b.z,  a.w + b.w);
}
inline __host__ __device__ float4 operator+(float4 a, float b)
{
    return make_float4(a.x + b, a.y + b, a.z + b, a.w + b);
}
inline __host__ __device__ float4 operator+(float a, float4 b)
{
    return make_float4(a + b.x, a + b.y, a + b.z,  a + b.w);
}
inline __host__ __device__ float4 operator*(float s, float4 a)
{
    return make_float4(a.x * s, a.y * s, a.z * s, a.w * s);
}
inline __host__ __device__ float4 operator*(float4 a, float4 b)
{
    return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w + b.w);
}
inline __host__ __device__ float4 make_float4(int3 a)
{
92
    return make_float4((float) a.x, (float) a.y, (float) a.z, 0);
93
94
}

Scott Le Grand's avatar
Scott Le Grand committed
95
96
97
98
99
100
101
102
103
__global__ 
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(1024, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(512, 1)
#else
__launch_bounds__(256, 1)
#endif
void kUpdateGridIndexAndFraction_kernel()
104
105
106
107
108
109
{
    unsigned int tnb = blockDim.x * gridDim.x;
    unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;

    for (int i = tid; i < cSim.atoms; i += tnb)
    {
Peter Eastman's avatar
Peter Eastman committed
110
111
112
113
114
115
116
117
        float4 posq = cSim.pPosq[i];
        float3 t = make_float3((posq.x/cSim.periodicBoxSizeX+1.0f)*cSim.pmeGridSize.x,
                               (posq.y/cSim.periodicBoxSizeY+1.0f)*cSim.pmeGridSize.y,
                               (posq.z/cSim.periodicBoxSizeZ+1.0f)*cSim.pmeGridSize.z);
        int3 gridIndex = make_int3(((int) t.x) % cSim.pmeGridSize.x,
                              ((int) t.y) % cSim.pmeGridSize.y,
                              ((int) t.z) % cSim.pmeGridSize.z);
        cSim.pPmeAtomGridIndex[i] = make_float2(i, gridIndex.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+gridIndex.y*cSim.pmeGridSize.z+gridIndex.z);
Peter Eastman's avatar
Peter Eastman committed
118
    }
119
}
Peter Eastman's avatar
Peter Eastman committed
120

121
122
123
/**
 * For each grid point, find the range of sorted atoms associated with that point.
 */
124

Scott Le Grand's avatar
Scott Le Grand committed
125
126
127
128
129
130
131
132
133
__global__ 
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(1024, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(512, 1)
#else
__launch_bounds__(256, 1)
#endif
void kFindAtomRangeForGrid_kernel()
134
135
136
137
{
    int thread = blockIdx.x*blockDim.x+threadIdx.x;
    int start = (cSim.atoms*thread)/(blockDim.x*gridDim.x);
    int end = (cSim.atoms*(thread+1))/(blockDim.x*gridDim.x);
Peter Eastman's avatar
Peter Eastman committed
138
    int last = (start == 0 ? -1 : cSim.pPmeAtomGridIndex[start-1].y);
139
    for (int i = start; i < end; ++i)
Peter Eastman's avatar
Peter Eastman committed
140
    {
141
142
143
        float2 atomData = cSim.pPmeAtomGridIndex[i];
        int gridIndex = atomData.y;
        if (gridIndex != last)
Peter Eastman's avatar
Peter Eastman committed
144
        {
145
146
147
            for (int j = last+1; j <= gridIndex; ++j)
                cSim.pPmeAtomRange[j] = i;
            last = gridIndex;
Peter Eastman's avatar
Peter Eastman committed
148
        }
149
150
151
152
153
154
155
156
157
158
159
160
161
162

        // The grid index won't be needed again.  Reuse that component to hold the atom charge, thus saving
        // an extra load operation in the charge spreading kernel.

        cSim.pPmeAtomGridIndex[i].y = cSim.pPosq[(int) atomData.x].w;
    }

    // Fill in values beyond the last atom.
    
    if (thread == blockDim.x*gridDim.x-1)
    {
        int gridSize = cSim.pmeGridSize.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z;
        for (int j = last+1; j <= gridSize; ++j)
            cSim.pPmeAtomRange[j] = cSim.atoms;
163
164
165
    }
}

Scott Le Grand's avatar
Scott Le Grand committed
166
167
168
169
170
171
172
173
174
__global__ 
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(1024, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(512, 1)
#else
__launch_bounds__(256, 1)
#endif
void kUpdateBsplines_kernel()
175
176
177
{
    unsigned int    tnb = blockDim.x * gridDim.x;
    unsigned int    tid = blockIdx.x * blockDim.x + threadIdx.x;
178
    extern __shared__ float4 bsplines_cache[]; // size = 2 * block_size * pme_order
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193

    const float4 div_o   = make_float4(1.0f/(PME_ORDER - 1));

    for (int i = tid; i < cSim.atoms; i += tnb)
    {

        float4* data    = &bsplines_cache[threadIdx.x*PME_ORDER];
        float4* ddata   = &bsplines_cache[threadIdx.x*PME_ORDER + blockDim.x*PME_ORDER];

        for (int j = 0; j < PME_ORDER; j++)
        {
	    data[j] = make_float4(0.0f);
            ddata[j] = make_float4(0.0f);
        }

Peter Eastman's avatar
Peter Eastman committed
194
        float4 posq = cSim.pPosq[i];
Peter Eastman's avatar
Peter Eastman committed
195
196
197
198
        float3 t = make_float3((posq.x/cSim.periodicBoxSizeX+1.0f)*cSim.pmeGridSize.x,
                               (posq.y/cSim.periodicBoxSizeY+1.0f)*cSim.pmeGridSize.y,
                               (posq.z/cSim.periodicBoxSizeZ+1.0f)*cSim.pmeGridSize.z);
        float4 dr = make_float4(t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f);
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237

        data[PME_ORDER - 1] = make_float4(0.0f);
        data[1]            = dr;
        data[0]            = make_float4(1.0f) - dr;

        for (int j = 3; j < PME_ORDER; j++)
        {
            float div = 1.0f / ((float)j - 1.0f);
            data[j - 1] = div * dr * data[j - 2];

            for (int k = 1; k < (j - 1); k++)
            {
                data[j - k - 1] =
                   div * (
                           (dr + float(k))          * data[j - k - 2] +
                           (-dr + ((float)(j - k))) * data[j - k - 1]);
            }
            data[0] = div * (- dr + 1) * data[0];
        }

        ddata[0] = -data[0];

        for (int j = 1; j < PME_ORDER; j++)
            ddata[j] = data[j - 1] - data[j];

        data[PME_ORDER - 1] = div_o * dr * data[PME_ORDER - 2];

        for (int j = 1; j < (PME_ORDER - 1); j++)
        {
            data[PME_ORDER - j - 1] =
                div_o * (
                    (dr + (float)j)                 * data[PME_ORDER - j - 2] +
                    (-dr + ((float)(PME_ORDER - j))) * data[PME_ORDER - j - 1]
                );
        }
        data[0] = div_o * (-dr + 1.0f) * data[0];

        for (int j = 0; j < PME_ORDER; j++)
        {
238
239
            cSim.pPmeBsplineTheta[i + j*cSim.atoms] =  data[j];
            cSim.pPmeBsplineDtheta[i + j*cSim.atoms] = ddata[j];
240
241
242
243
        }
    }
}

Scott Le Grand's avatar
Scott Le Grand committed
244
245
246
247
248
249
250
251
252
__global__ 
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(1024, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(512, 1)
#else
__launch_bounds__(256, 1)
#endif
void kGridSpreadCharge_kernel()
253
{
254
255
256
    unsigned int numGridPoints = cSim.pmeGridSize.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z;
    unsigned int numThreads = gridDim.x*blockDim.x;
    for (int gridIndex = blockIdx.x*blockDim.x+threadIdx.x; gridIndex < numGridPoints; gridIndex += numThreads)
257
258
    {
        int3 gridPoint;
259
260
261
262
263
264
265
        gridPoint.x = gridIndex/(cSim.pmeGridSize.y*cSim.pmeGridSize.z);
        int remainder = gridIndex-gridPoint.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z;
        gridPoint.y = remainder/cSim.pmeGridSize.z;
        gridPoint.z = remainder-gridPoint.y*cSim.pmeGridSize.z;
        gridPoint.x += cSim.pmeGridSize.x;
        gridPoint.y += cSim.pmeGridSize.y;
        gridPoint.z += cSim.pmeGridSize.z;
266
        float result = 0.0f;
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
        for (int ix = 0; ix < PME_ORDER; ++ix)
            for (int iy = 0; iy < PME_ORDER; ++iy)
                for (int iz = 0; iz < PME_ORDER; ++iz)
                {
                    int x = (gridPoint.x-ix)%cSim.pmeGridSize.x;
                    int y = (gridPoint.y-iy)%cSim.pmeGridSize.y;
                    int z = (gridPoint.z-iz)%cSim.pmeGridSize.z;
                    int gridIndex = x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+y*cSim.pmeGridSize.z+z;
                    int firstAtom = cSim.pPmeAtomRange[gridIndex];
                    int lastAtom = cSim.pPmeAtomRange[gridIndex+1];
                    for (int i = firstAtom; i < lastAtom; ++i)
                    {
                        float2 atomData = cSim.pPmeAtomGridIndex[i];
                        int atomIndex = atomData.x;
                        float atomCharge = atomData.y;
Peter Eastman's avatar
Peter Eastman committed
282
                        result += atomCharge*tex1Dfetch(bsplineThetaRef, atomIndex+ix*cSim.atoms).x*tex1Dfetch(bsplineThetaRef, atomIndex+iy*cSim.atoms).y*tex1Dfetch(bsplineThetaRef, atomIndex+iz*cSim.atoms).z;
283
284
285
                    }
                }
        cSim.pPmeGrid[gridIndex] = make_cuComplex(result*sqrt(cSim.epsfac), 0.0f);
286
287
288
    }
}

Scott Le Grand's avatar
Scott Le Grand committed
289
290
291
292
293
294
295
296
297
__global__ 
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(1024, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(512, 1)
#else
__launch_bounds__(256, 1)
#endif
void kReciprocalConvolution_kernel()
298
299
{
    const unsigned int gridSize = cSim.pmeGridSize.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z;
300
301
    float expFactor = LOCAL_HACK_PI*LOCAL_HACK_PI/(cSim.alphaEwald*cSim.alphaEwald);
    float scaleFactor = 1.0/(LOCAL_HACK_PI*cSim.periodicBoxSizeX*cSim.periodicBoxSizeY*cSim.periodicBoxSizeZ);
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
    float energy = 0.0f;
    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x)
    {
        int kx = index/(cSim.pmeGridSize.y*cSim.pmeGridSize.z);
        int remainder = index-kx*cSim.pmeGridSize.y*cSim.pmeGridSize.z;
        int ky = remainder/cSim.pmeGridSize.z;
        int kz = remainder-ky*cSim.pmeGridSize.z;
        if (kx == 0 && ky == 0 && kz == 0)
            continue;
        int mx = (kx < (cSim.pmeGridSize.x+1)/2) ? kx : (kx-cSim.pmeGridSize.x);
        int my = (ky < (cSim.pmeGridSize.y+1)/2) ? ky : (ky-cSim.pmeGridSize.y);
        int mz = (kz < (cSim.pmeGridSize.z+1)/2) ? kz : (kz-cSim.pmeGridSize.z);
        float mhx = mx/cSim.periodicBoxSizeX;
        float mhy = my/cSim.periodicBoxSizeY;
        float mhz = mz/cSim.periodicBoxSizeZ;
        float bx = cSim.pPmeBsplineModuli[0][kx];
        float by = cSim.pPmeBsplineModuli[1][ky];
        float bz = cSim.pPmeBsplineModuli[2][kz];
        cuComplex grid = cSim.pPmeGrid[index];
        float m2 = mhx*mhx+mhy*mhy+mhz*mhz;
        float denom = m2*bx*by*bz;
        float eterm = scaleFactor*exp(-expFactor*m2)/denom;
        cSim.pPmeGrid[index] = make_cuComplex(grid.x*eterm, grid.y*eterm);
        energy += eterm*(grid.x*grid.x + grid.y*grid.y);
    }
327
328
329
    cSim.pEnergy[blockIdx.x*blockDim.x+threadIdx.x] += 0.5f*energy;
}

Scott Le Grand's avatar
Scott Le Grand committed
330
331
332
333
334
335
336
337
338
__global__ 
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(1024, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(512, 1)
#else
__launch_bounds__(256, 1)
#endif
void kGridInterpolateForce_kernel()
339
340
341
342
343
{
    for (int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < cSim.atoms; atom += blockDim.x*gridDim.x)
    {
        float3 force = make_float3(0.0f, 0.0f, 0.0f);
        float4 posq = cSim.pPosq[atom];
Peter Eastman's avatar
Peter Eastman committed
344
345
346
347
348
349
        float3 t = make_float3((posq.x/cSim.periodicBoxSizeX+1.0f)*cSim.pmeGridSize.x,
                               (posq.y/cSim.periodicBoxSizeY+1.0f)*cSim.pmeGridSize.y,
                               (posq.z/cSim.periodicBoxSizeZ+1.0f)*cSim.pmeGridSize.z);
        int3 gridIndex = make_int3(((int) t.x) % cSim.pmeGridSize.x,
                              ((int) t.y) % cSim.pmeGridSize.y,
                              ((int) t.z) % cSim.pmeGridSize.z);
350
351
352
        for (int ix = 0; ix < PME_ORDER; ix++)
        {
            int xindex = (gridIndex.x + ix) % cSim.pmeGridSize.x;
353
354
            float tx = cSim.pPmeBsplineTheta[atom+ix*cSim.atoms].x;
            float dtx = cSim.pPmeBsplineDtheta[atom+ix*cSim.atoms].x;
355
356
357
            for (int iy = 0; iy < PME_ORDER; iy++)
            {
                int yindex = (gridIndex.y + iy) % cSim.pmeGridSize.y;
358
359
                float ty = cSim.pPmeBsplineTheta[atom+iy*cSim.atoms].y;
                float dty = cSim.pPmeBsplineDtheta[atom+iy*cSim.atoms].y;
360
361
362
                for (int iz = 0; iz < PME_ORDER; iz++)
                {
                    int zindex               = (gridIndex.z + iz) % cSim.pmeGridSize.z;
363
364
                    float tz = cSim.pPmeBsplineTheta[atom+iz*cSim.atoms].z;
                    float dtz = cSim.pPmeBsplineDtheta[atom+iz*cSim.atoms].z;
365
                    int index                = xindex*cSim.pmeGridSize.y*cSim.pmeGridSize.z + yindex*cSim.pmeGridSize.z + zindex;
Peter Eastman's avatar
Peter Eastman committed
366
367
368
369
                    float gridvalue          = cSim.pPmeGrid[index].x;
                    force.x                 += dtx*ty*tz*gridvalue;
                    force.y                 += tx*dty*tz*gridvalue;
                    force.z                 += tx*ty*dtz*gridvalue;
370
371
372
373
374
375
376
377
378
379
                }
            }
        }
        float4 totalForce = cSim.pForce4[atom];
        float q = posq.w*sqrt(cSim.epsfac);
        totalForce.x -= q*force.x*cSim.pmeGridSize.x/cSim.periodicBoxSizeX;
        totalForce.y -= q*force.y*cSim.pmeGridSize.y/cSim.periodicBoxSizeY;
        totalForce.z -= q*force.z*cSim.pmeGridSize.z/cSim.periodicBoxSizeZ;
        cSim.pForce4[atom] = totalForce;
    }
380
381
}

382
383
384
void kCalculatePME(gpuContext gpu)
{
//    printf("kCalculatePME\n");
Peter Eastman's avatar
Peter Eastman committed
385
386
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float4>();
    cudaBindTexture(NULL, &bsplineThetaRef, gpu->psPmeBsplineTheta->_pDevData, &channelDesc, gpu->psPmeBsplineTheta->_length*sizeof(float4));
387
    kUpdateGridIndexAndFraction_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
388
    LAUNCHERROR("kUpdateGridIndexAndFraction");
389
390
391
    bbSort(gpu->psPmeAtomGridIndex->_pDevData, gpu->natoms);
    kFindAtomRangeForGrid_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
    LAUNCHERROR("kFindAtomRangeForGrid");
392
393
    unsigned int threads = 16380/(2*PME_ORDER*sizeof(float4));
    kUpdateBsplines_kernel<<<gpu->sim.blocks, threads, 2*threads*PME_ORDER*sizeof(float4)>>>();
394
    LAUNCHERROR("kUpdateBsplines");
Peter Eastman's avatar
Peter Eastman committed
395
    kGridSpreadCharge_kernel<<<8*gpu->sim.blocks, 64, 64*(sizeof(float)+sizeof(int4))>>>();
396
    LAUNCHERROR("kGridSpreadCharge");
397
    cufftExecC2C(gpu->fftplan, gpu->psPmeGrid->_pDevData, gpu->psPmeGrid->_pDevData, CUFFT_FORWARD);
398
    kReciprocalConvolution_kernel<<<gpu->sim.blocks, gpu->sim.nonbond_threads_per_block>>>();
399
400
    LAUNCHERROR("kReciprocalConvolution");
    cufftExecC2C(gpu->fftplan, gpu->psPmeGrid->_pDevData, gpu->psPmeGrid->_pDevData, CUFFT_INVERSE);
Peter Eastman's avatar
Peter Eastman committed
401
    kGridInterpolateForce_kernel<<<2*gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
402
    LAUNCHERROR("kGridInterpolateForce");
403
}