pme.cl 16.2 KB
Newer Older
1
2
3
__kernel void updateBsplines(__global const real4* restrict posq, __global real4* restrict pmeBsplineTheta, __local real4* restrict bsplinesCache,
        __global int2* restrict pmeAtomGridIndex, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
    const real4 scale = 1/(real) (PME_ORDER-1);
4
    for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) {
5
6
        __local real4* data = &bsplinesCache[get_local_id(0)*PME_ORDER];
        real4 pos = posq[i];
7
8
9
        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
        pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
        pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
10
        real4 t = (real4) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
11
12
                             (pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
                             (pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z, 0.0f);
13
        real4 dr = (real4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f);
14
15
16
17
        int4 gridIndex = (int4) (((int) t.x) % GRID_SIZE_X,
                                 ((int) t.y) % GRID_SIZE_Y,
                                 ((int) t.z) % GRID_SIZE_Z, 0);
        pmeAtomGridIndex[i] = (int2) (i, gridIndex.x*GRID_SIZE_Y*GRID_SIZE_Z+gridIndex.y*GRID_SIZE_Z+gridIndex.z);
18
19
20
21
        data[PME_ORDER-1] = 0.0f;
        data[1] = dr;
        data[0] = 1.0f-dr;
        for (int j = 3; j < PME_ORDER; j++) {
22
            real div = RECIP(j-1.0f);
23
24
            data[j-1] = div*dr*data[j-2];
            for (int k = 1; k < (j-1); k++)
25
                data[j-k-1] = div*((dr+(real4) k) *data[j-k-2] + (-dr+(real4) (j-k))*data[j-k-1]);
26
27
28
29
            data[0] = div*(- dr+1.0f)*data[0];
        }
        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
        for (int j = 1; j < (PME_ORDER-1); j++)
30
            data[PME_ORDER-j-1] = scale*((dr+(real4) j)*data[PME_ORDER-j-2] + (-dr+(real4) (PME_ORDER-j))*data[PME_ORDER-j-1]);
31
32
        data[0] = scale*(-dr+1.0f)*data[0];
        for (int j = 0; j < PME_ORDER; j++) {
33
            data[j].w = pos.w; // Storing the charge here improves cache coherency in the charge spreading kernel
34
35
36
            pmeBsplineTheta[i+j*NUM_ATOMS] = data[j];
        }
    }
37
}
Peter Eastman's avatar
Peter Eastman committed
38

39
40
41
/**
 * For each grid point, find the range of sorted atoms associated with that point.
 */
42
__kernel void findAtomRangeForGrid(__global int2* restrict pmeAtomGridIndex, __global int* restrict pmeAtomRange, __global const real4* restrict posq, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
Peter Eastman's avatar
Peter Eastman committed
43
44
    int start = (NUM_ATOMS*get_global_id(0))/get_global_size(0);
    int end = (NUM_ATOMS*(get_global_id(0)+1))/get_global_size(0);
45
    int last = (start == 0 ? -1 : pmeAtomGridIndex[start-1].y);
Peter Eastman's avatar
Peter Eastman committed
46
    for (int i = start; i < end; ++i) {
47
48
49
50
51
52
53
        int2 atomData = pmeAtomGridIndex[i];
        int gridIndex = atomData.y;
        if (gridIndex != last) {
            for (int j = last+1; j <= gridIndex; ++j)
                pmeAtomRange[j] = i;
            last = gridIndex;
        }
Peter Eastman's avatar
Peter Eastman committed
54
    }
55
56
57
58
59
60
61
62

    // Fill in values beyond the last atom.

    if (get_global_id(0) == get_global_size(0)-1) {
        int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
        for (int j = last+1; j <= gridSize; ++j)
            pmeAtomRange[j] = NUM_ATOMS;
    }
63
64
}

65
66
67
68
/**
 * The grid index won't be needed again.  Reuse that component to hold the z index, thus saving
 * some work in the charge spreading kernel.
 */
69
__kernel void recordZIndex(__global int2* restrict pmeAtomGridIndex, __global const real4* restrict posq, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
70
71
72
    int start = (NUM_ATOMS*get_global_id(0))/get_global_size(0);
    int end = (NUM_ATOMS*(get_global_id(0)+1))/get_global_size(0);
    for (int i = start; i < end; ++i) {
73
        real posz = posq[pmeAtomGridIndex[i].x].z;
74
75
76
77
78
79
        posz -= floor(posz*invPeriodicBoxSize.z)*periodicBoxSize.z;
        int z = ((int) ((posz*invPeriodicBoxSize.z)*GRID_SIZE_Z)) % GRID_SIZE_Z;
        pmeAtomGridIndex[i].y = z;
    }
}

80
81
82
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable

83
84
85
#define BUFFER_SIZE (PME_ORDER*PME_ORDER*PME_ORDER)

__kernel __attribute__((reqd_work_group_size(BUFFER_SIZE, 1, 1)))
86
87
void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
        __global long* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
88
89
90
91
    int ix = get_local_id(0)/(PME_ORDER*PME_ORDER);
    int remainder = get_local_id(0)-ix*PME_ORDER*PME_ORDER;
    int iy = remainder/PME_ORDER;
    int iz = remainder-iy*PME_ORDER;
92
93
    __local real4 theta[PME_ORDER];
    __local real charge[BUFFER_SIZE];
94
95
96
    __local int basex[BUFFER_SIZE];
    __local int basey[BUFFER_SIZE];
    __local int basez[BUFFER_SIZE];
97
    if (ix < PME_ORDER) {
98
99
100
101
102
103
        for (int baseIndex = get_group_id(0)*BUFFER_SIZE; baseIndex < NUM_ATOMS; baseIndex += get_num_groups(0)*BUFFER_SIZE) {
            // Load the next block of atoms into the buffers.

            if (get_local_id(0) < BUFFER_SIZE) {
                int atomIndex = baseIndex+get_local_id(0);
                if (atomIndex < NUM_ATOMS) {
104
                    real4 pos = posq[atomIndex];
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
                    charge[get_local_id(0)] = pos.w;
                    pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
                    pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
                    pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
                    basex[get_local_id(0)] = (int) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X);
                    basey[get_local_id(0)] = (int) ((pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y);
                    basez[get_local_id(0)] = (int) ((pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z);
                }
            }
            barrier(CLK_LOCAL_MEM_FENCE);
            int lastIndex = min(BUFFER_SIZE, NUM_ATOMS-baseIndex);
            for (int index = 0; index < lastIndex; index++) {
                int atomIndex = index+baseIndex;
                if (get_local_id(0) < PME_ORDER)
                    theta[get_local_id(0)] = pmeBsplineTheta[atomIndex+get_local_id(0)*NUM_ATOMS];
                barrier(CLK_LOCAL_MEM_FENCE);
121
                real add = charge[index]*theta[ix].x*theta[iy].y*theta[iz].z;
122
123
124
125
126
127
                int x = basex[index]+ix;
                int y = basey[index]+iy;
                int z = basez[index]+iz;
                x -= (x >= GRID_SIZE_X ? GRID_SIZE_X : 0);
                y -= (y >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
                z -= (z >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
128
129
130
#ifdef USE_DOUBLE_PRECISION
                atom_add(&pmeGrid[2*(x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z)], (long) (add*0xFFFFFFFF));
#else
131
                atom_add(&pmeGrid[x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z], (long) (add*0xFFFFFFFF));
132
#endif
133
            }
134
135
136
137
        }
    }
}

138
__kernel void finishSpreadCharge(__global long* restrict pmeGrid) {
139
    __global real2* realGrid = (__global real2*) pmeGrid;
140
    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
141
    real scale = EPSILON_FACTOR/(real) 0xFFFFFFFF;
142
    for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
143
144
145
#ifdef USE_DOUBLE_PRECISION
        long value = pmeGrid[2*index];
#else
146
        long value = pmeGrid[index];
147
148
149
#endif
        real2 realValue = (real2) ((real) (value*scale), 0);
        realGrid[index] = realValue;
150
151
152
    }
}
#else
153
154
__kernel void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
        __global real2* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta) {
155
156
    unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
    for (int gridIndex = get_global_id(0); gridIndex < numGridPoints; gridIndex += get_global_size(0)) {
157
158
        // Compute the charge on a grid point.

Peter Eastman's avatar
Peter Eastman committed
159
        int4 gridPoint;
160
161
162
163
        gridPoint.x = gridIndex/(GRID_SIZE_Y*GRID_SIZE_Z);
        int remainder = gridIndex-gridPoint.x*GRID_SIZE_Y*GRID_SIZE_Z;
        gridPoint.y = remainder/GRID_SIZE_Z;
        gridPoint.z = remainder-gridPoint.y*GRID_SIZE_Z;
164
        real result = 0.0f;
165
166

        // Loop over all atoms that affect this grid point.
167

168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
        for (int ix = 0; ix < PME_ORDER; ++ix) {
            int x = gridPoint.x-ix+(gridPoint.x >= ix ? 0 : GRID_SIZE_X);
            for (int iy = 0; iy < PME_ORDER; ++iy) {
                int y = gridPoint.y-iy+(gridPoint.y >= iy ? 0 : GRID_SIZE_Y);
                int z1 = gridPoint.z-PME_ORDER+1;
                z1 += (z1 >= 0 ? 0 : GRID_SIZE_Z);
                int z2 = (z1 < gridPoint.z ? gridPoint.z : GRID_SIZE_Z-1);
                int gridIndex1 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z1;
                int gridIndex2 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z2;
                int firstAtom = pmeAtomRange[gridIndex1];
                int lastAtom = pmeAtomRange[gridIndex2+1];
                for (int i = firstAtom; i < lastAtom; ++i)
                {
                    int2 atomData = pmeAtomGridIndex[i];
                    int atomIndex = atomData.x;
                    int z = atomData.y;
                    int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
185
                    real atomCharge = pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].w;
186
187
188
189
190
191
192
193
194
195
196
                    result += atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z;
                }
                if (z1 > gridPoint.z)
                {
                    gridIndex1 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z;
                    gridIndex2 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+gridPoint.z;
                    firstAtom = pmeAtomRange[gridIndex1];
                    lastAtom = pmeAtomRange[gridIndex2+1];
                    for (int i = firstAtom; i < lastAtom; ++i)
                    {
                        int2 atomData = pmeAtomGridIndex[i];
197
                        int atomIndex = atomData.x;
198
199
                        int z = atomData.y;
                        int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
200
                        real atomCharge = pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].w;
201
202
203
                        result += atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z;
                    }
                }
204
205
            }
        }
206
        pmeGrid[gridIndex] = (real2) (result*EPSILON_FACTOR, 0);
207
208
    }
}
209
#endif
210

211
212
__kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global real* restrict energyBuffer, __global const real* restrict pmeBsplineModuliX,
        __global const real* restrict pmeBsplineModuliY, __global const real* restrict pmeBsplineModuliZ, real4 invPeriodicBoxSize, real recipScaleFactor) {
213
    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
214
    real energy = 0.0f;
215
216
217
218
219
220
221
222
223
224
    for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
        int kx = index/(GRID_SIZE_Y*GRID_SIZE_Z);
        int remainder = index-kx*GRID_SIZE_Y*GRID_SIZE_Z;
        int ky = remainder/GRID_SIZE_Z;
        int kz = remainder-ky*GRID_SIZE_Z;
        if (kx == 0 && ky == 0 && kz == 0)
            continue;
        int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
        int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
        int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
225
226
227
228
229
230
231
232
233
234
235
        real mhx = mx*invPeriodicBoxSize.x;
        real mhy = my*invPeriodicBoxSize.y;
        real mhz = mz*invPeriodicBoxSize.z;
        real bx = pmeBsplineModuliX[kx];
        real by = pmeBsplineModuliY[ky];
        real bz = pmeBsplineModuliZ[kz];
        real2 grid = pmeGrid[index];
        real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
        real denom = m2*bx*by*bz;
        real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
        pmeGrid[index] = (real2) (grid.x*eterm, grid.y*eterm);
236
237
238
239
240
        energy += eterm*(grid.x*grid.x + grid.y*grid.y);
    }
    energyBuffer[get_global_id(0)] += 0.5f*energy;
}

241
242
243
244
245
__kernel void gridInterpolateForce(__global const real4* restrict posq, __global real4* restrict forceBuffers, __global const real2* restrict pmeGrid,
        real4 periodicBoxSize, real4 invPeriodicBoxSize, __local real4* restrict bsplinesCache) {
    const real4 scale = 1/(real) (PME_ORDER-1);
    __local real4* data = &bsplinesCache[get_local_id(0)*PME_ORDER];
    __local real4* ddata = &bsplinesCache[get_local_id(0)*PME_ORDER + get_local_size(0)*PME_ORDER];
246
    for (int atom = get_global_id(0); atom < NUM_ATOMS; atom += get_global_size(0)) {
247
248
        real4 force = 0.0f;
        real4 pos = posq[atom];
249
250
251
        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
        pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
        pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
252
253
254
        real4 t = (real4) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
                           (pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
                           (pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z, 0.0f);
255
256
257
        int4 gridIndex = (int4) (((int) t.x) % GRID_SIZE_X,
                                 ((int) t.y) % GRID_SIZE_Y,
                                 ((int) t.z) % GRID_SIZE_Z, 0);
258
259
260
261

        // Since we need the full set of thetas, it's faster to compute them here than load them
        // from global memory.

262
        real4 dr = (real4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f);
263
264
265
266
        data[PME_ORDER-1] = 0.0f;
        data[1] = dr;
        data[0] = 1.0f-dr;
        for (int j = 3; j < PME_ORDER; j++) {
267
            real div = RECIP(j-1.0f);
268
269
            data[j-1] = div*dr*data[j-2];
            for (int k = 1; k < (j-1); k++)
270
                data[j-k-1] = div*((dr+(real4) k) *data[j-k-2] + (-dr+(real4) (j-k))*data[j-k-1]);
271
272
273
274
275
276
277
            data[0] = div*(- dr+1.0f)*data[0];
        }
        ddata[0] = -data[0];
        for (int j = 1; j < PME_ORDER; j++)
            ddata[j] = data[j-1]-data[j];
        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
        for (int j = 1; j < (PME_ORDER-1); j++)
278
            data[PME_ORDER-j-1] = scale*((dr+(real4) j)*data[PME_ORDER-j-2] + (-dr+(real4) (PME_ORDER-j))*data[PME_ORDER-j-1]);
279
280
281
282
        data[0] = scale*(-dr+1.0f)*data[0];

        // Compute the force on this atom.

283
        for (int ix = 0; ix < PME_ORDER; ix++) {
284
285
            int xindex = gridIndex.x+ix;
            xindex -= (xindex >= GRID_SIZE_X ? GRID_SIZE_X : 0);
286
            for (int iy = 0; iy < PME_ORDER; iy++) {
287
288
                int yindex = gridIndex.y+iy;
                yindex -= (yindex >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
289
                for (int iz = 0; iz < PME_ORDER; iz++) {
290
291
                    int zindex = gridIndex.z+iz;
                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
292
                    int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
293
                    real gridvalue = pmeGrid[index].x;
294
295
                    force.x += ddata[ix].x*data[iy].y*data[iz].z*gridvalue;
                    force.y += data[ix].x*ddata[iy].y*data[iz].z*gridvalue;
296
#ifndef MAC_AMD_WORKAROUND
297
                    force.z += data[ix].x*data[iy].y*ddata[iz].z*gridvalue;
298
#endif
299
300
301
                }
            }
        }
302
303
304
305
306
307
308
309
310
311
312
#ifdef MAC_AMD_WORKAROUND
        for (int ix = 0; ix < PME_ORDER; ix++) {
            int xindex = gridIndex.x+ix;
            xindex -= (xindex >= GRID_SIZE_X ? GRID_SIZE_X : 0);
            for (int iy = 0; iy < PME_ORDER; iy++) {
                int yindex = gridIndex.y+iy;
                yindex -= (yindex >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
                for (int iz = 0; iz < PME_ORDER; iz++) {
                    int zindex = gridIndex.z+iz;
                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
                    int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
313
                    real gridvalue = pmeGrid[index].x;
314
315
316
317
318
                    force.z += data[ix].x*data[iy].y*ddata[iz].z*gridvalue;
                }
            }
        }
#endif
319
320
        real4 totalForce = forceBuffers[atom];
        real q = pos.w*EPSILON_FACTOR;
321
322
323
        totalForce.x -= q*force.x*GRID_SIZE_X*invPeriodicBoxSize.x;
        totalForce.y -= q*force.y*GRID_SIZE_Y*invPeriodicBoxSize.y;
        totalForce.z -= q*force.z*GRID_SIZE_Z*invPeriodicBoxSize.z;
324
325
326
        forceBuffers[atom] = totalForce;
    }
}