pme.cl 19.2 KB
Newer Older
1
2
3
__kernel void updateBsplines(__global const real4* restrict posq, __global real4* restrict pmeBsplineTheta, __local real4* restrict bsplinesCache,
        __global int2* restrict pmeAtomGridIndex, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
    const real4 scale = 1/(real) (PME_ORDER-1);
4
    for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) {
5
6
        __local real4* data = &bsplinesCache[get_local_id(0)*PME_ORDER];
        real4 pos = posq[i];
7
8
9
        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
        pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
        pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
10
        real4 t = (real4) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
11
12
                             (pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
                             (pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z, 0.0f);
13
        real4 dr = (real4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f);
14
15
16
17
        int4 gridIndex = (int4) (((int) t.x) % GRID_SIZE_X,
                                 ((int) t.y) % GRID_SIZE_Y,
                                 ((int) t.z) % GRID_SIZE_Z, 0);
        pmeAtomGridIndex[i] = (int2) (i, gridIndex.x*GRID_SIZE_Y*GRID_SIZE_Z+gridIndex.y*GRID_SIZE_Z+gridIndex.z);
18
#ifndef SUPPORTS_64_BIT_ATOMICS
19
20
21
22
        data[PME_ORDER-1] = 0.0f;
        data[1] = dr;
        data[0] = 1.0f-dr;
        for (int j = 3; j < PME_ORDER; j++) {
23
            real div = RECIP(j-1.0f);
24
25
            data[j-1] = div*dr*data[j-2];
            for (int k = 1; k < (j-1); k++)
26
                data[j-k-1] = div*((dr+(real4) k) *data[j-k-2] + (-dr+(real4) (j-k))*data[j-k-1]);
27
28
29
30
            data[0] = div*(- dr+1.0f)*data[0];
        }
        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
        for (int j = 1; j < (PME_ORDER-1); j++)
31
            data[PME_ORDER-j-1] = scale*((dr+(real4) j)*data[PME_ORDER-j-2] + (-dr+(real4) (PME_ORDER-j))*data[PME_ORDER-j-1]);
32
33
        data[0] = scale*(-dr+1.0f)*data[0];
        for (int j = 0; j < PME_ORDER; j++) {
34
            data[j].w = pos.w; // Storing the charge here improves cache coherency in the charge spreading kernel
35
36
            pmeBsplineTheta[i+j*NUM_ATOMS] = data[j];
        }
37
#endif
38
    }
39
}
Peter Eastman's avatar
Peter Eastman committed
40

41
42
43
/**
 * For each grid point, find the range of sorted atoms associated with that point.
 */
44
__kernel void findAtomRangeForGrid(__global int2* restrict pmeAtomGridIndex, __global int* restrict pmeAtomRange, __global const real4* restrict posq, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
Peter Eastman's avatar
Peter Eastman committed
45
46
    int start = (NUM_ATOMS*get_global_id(0))/get_global_size(0);
    int end = (NUM_ATOMS*(get_global_id(0)+1))/get_global_size(0);
47
    int last = (start == 0 ? -1 : pmeAtomGridIndex[start-1].y);
Peter Eastman's avatar
Peter Eastman committed
48
    for (int i = start; i < end; ++i) {
49
50
51
52
53
54
55
        int2 atomData = pmeAtomGridIndex[i];
        int gridIndex = atomData.y;
        if (gridIndex != last) {
            for (int j = last+1; j <= gridIndex; ++j)
                pmeAtomRange[j] = i;
            last = gridIndex;
        }
Peter Eastman's avatar
Peter Eastman committed
56
    }
57
58
59
60
61
62
63
64

    // Fill in values beyond the last atom.

    if (get_global_id(0) == get_global_size(0)-1) {
        int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
        for (int j = last+1; j <= gridSize; ++j)
            pmeAtomRange[j] = NUM_ATOMS;
    }
65
66
}

67
68
69
70
/**
 * The grid index won't be needed again.  Reuse that component to hold the z index, thus saving
 * some work in the charge spreading kernel.
 */
71
__kernel void recordZIndex(__global int2* restrict pmeAtomGridIndex, __global const real4* restrict posq, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
72
73
74
    int start = (NUM_ATOMS*get_global_id(0))/get_global_size(0);
    int end = (NUM_ATOMS*(get_global_id(0)+1))/get_global_size(0);
    for (int i = start; i < end; ++i) {
75
        real posz = posq[pmeAtomGridIndex[i].x].z;
76
77
78
79
80
81
        posz -= floor(posz*invPeriodicBoxSize.z)*periodicBoxSize.z;
        int z = ((int) ((posz*invPeriodicBoxSize.z)*GRID_SIZE_Z)) % GRID_SIZE_Z;
        pmeAtomGridIndex[i].y = z;
    }
}

82
83
84
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable

85
__kernel void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
86
        __global long* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
    const real4 scale = 1/(real) (PME_ORDER-1);
    real4 data[PME_ORDER];
    
    // Process the atoms in spatially sorted order.  This improves efficiency when writing
    // the grid values.
    
    for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) {
        int atom = pmeAtomGridIndex[i].x;
        real4 pos = posq[atom];
        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
        pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
        pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
        real4 t = (real4) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
                           (pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
                           (pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z, 0.0f);
        int4 gridIndex = (int4) (((int) t.x) % GRID_SIZE_X,
                                 ((int) t.y) % GRID_SIZE_Y,
                                 ((int) t.z) % GRID_SIZE_Z, 0);
105

106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
        // Since we need the full set of thetas, it's faster to compute them here than load them
        // from global memory.

        real4 dr = (real4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f);
        data[PME_ORDER-1] = 0.0f;
        data[1] = dr;
        data[0] = 1.0f-dr;
        for (int j = 3; j < PME_ORDER; j++) {
            real div = RECIP(j-1.0f);
            data[j-1] = div*dr*data[j-2];
            for (int k = 1; k < (j-1); k++)
                data[j-k-1] = div*((dr+(real4) k) *data[j-k-2] + (-dr+(real4) (j-k))*data[j-k-1]);
            data[0] = div*(- dr+1.0f)*data[0];
        }
        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
        for (int j = 1; j < (PME_ORDER-1); j++)
            data[PME_ORDER-j-1] = scale*((dr+(real4) j)*data[PME_ORDER-j-2] + (-dr+(real4) (PME_ORDER-j))*data[PME_ORDER-j-1]);
        data[0] = scale*(-dr+1.0f)*data[0];

        // Spread the charge from this atom onto each grid point.

        for (int ix = 0; ix < PME_ORDER; ix++) {
            int xindex = gridIndex.x+ix;
            xindex -= (xindex >= GRID_SIZE_X ? GRID_SIZE_X : 0);
            for (int iy = 0; iy < PME_ORDER; iy++) {
                int yindex = gridIndex.y+iy;
                yindex -= (yindex >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
                for (int iz = 0; iz < PME_ORDER; iz++) {
                    int zindex = gridIndex.z+iz;
                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
                    int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
                    real add = pos.w*data[ix].x*data[iy].y*data[iz].z;
138
#ifdef USE_DOUBLE_PRECISION
139
                    atom_add(&pmeGrid[2*index], (long) (add*0x100000000));
140
#else
141
                    atom_add(&pmeGrid[index], (long) (add*0x100000000));
142
#endif
143
144

                }
145
            }
146
147
148
149
        }
    }
}

150
__kernel void finishSpreadCharge(__global long* restrict pmeGrid) {
151
    __global real2* realGrid = (__global real2*) pmeGrid;
152
    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
153
    real scale = EPSILON_FACTOR/(real) 0x100000000;
154
    for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
155
156
157
#ifdef USE_DOUBLE_PRECISION
        long value = pmeGrid[2*index];
#else
158
        long value = pmeGrid[index];
159
160
161
#endif
        real2 realValue = (real2) ((real) (value*scale), 0);
        realGrid[index] = realValue;
162
163
    }
}
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
#elif defined(DEVICE_IS_CPU)
__kernel void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
        __global real2* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
    const int firstx = get_global_id(0)*GRID_SIZE_X/get_global_size(0);
    const int lastx = (get_global_id(0)+1)*GRID_SIZE_X/get_global_size(0);
    if (firstx == lastx)
        return;
    const real4 scale = 1/(real) (PME_ORDER-1);
    real4 data[PME_ORDER];
    
    // Process the atoms in spatially sorted order.  This improves efficiency when writing
    // the grid values.
    
    for (int i = 0; i < NUM_ATOMS; i++) {
        int atom = i;//pmeAtomGridIndex[i].x;
        real4 pos = posq[atom];
        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
        pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
        pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
        real4 t = (real4) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
                           (pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
                           (pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z, 0.0f);
        int4 gridIndex = (int4) (((int) t.x) % GRID_SIZE_X,
                                 ((int) t.y) % GRID_SIZE_Y,
                                 ((int) t.z) % GRID_SIZE_Z, 0);

        // Spread the charge from this atom onto each grid point.

        bool hasComputedThetas = false;
        for (int ix = 0; ix < PME_ORDER; ix++) {
            int xindex = gridIndex.x+ix;
            xindex -= (xindex >= GRID_SIZE_X ? GRID_SIZE_X : 0);
            if (xindex < firstx || xindex >= lastx)
                continue;
            if (!hasComputedThetas) {
                hasComputedThetas = true;
                
                // Since we need the full set of thetas, it's faster to compute them here than load them
                // from global memory.

                real4 dr = (real4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f);
                data[PME_ORDER-1] = 0.0f;
                data[1] = dr;
                data[0] = 1.0f-dr;
                for (int j = 3; j < PME_ORDER; j++) {
                    real div = RECIP(j-1.0f);
                    data[j-1] = div*dr*data[j-2];
                    for (int k = 1; k < (j-1); k++)
                        data[j-k-1] = div*((dr+(real4) k) *data[j-k-2] + (-dr+(real4) (j-k))*data[j-k-1]);
                    data[0] = div*(- dr+1.0f)*data[0];
                }
                data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
                for (int j = 1; j < (PME_ORDER-1); j++)
                    data[PME_ORDER-j-1] = scale*((dr+(real4) j)*data[PME_ORDER-j-2] + (-dr+(real4) (PME_ORDER-j))*data[PME_ORDER-j-1]);
                data[0] = scale*(-dr+1.0f)*data[0];
            }
            for (int iy = 0; iy < PME_ORDER; iy++) {
                int yindex = gridIndex.y+iy;
                yindex -= (yindex >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
                for (int iz = 0; iz < PME_ORDER; iz++) {
                    int zindex = gridIndex.z+iz;
                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
                    int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
                    pmeGrid[index].x += EPSILON_FACTOR*pos.w*data[ix].x*data[iy].y*data[iz].z;
                }
            }
        }
    }
}
233
#else
234
235
__kernel void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
        __global real2* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta) {
236
237
    unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
    for (int gridIndex = get_global_id(0); gridIndex < numGridPoints; gridIndex += get_global_size(0)) {
238
239
        // Compute the charge on a grid point.

Peter Eastman's avatar
Peter Eastman committed
240
        int4 gridPoint;
241
242
243
244
        gridPoint.x = gridIndex/(GRID_SIZE_Y*GRID_SIZE_Z);
        int remainder = gridIndex-gridPoint.x*GRID_SIZE_Y*GRID_SIZE_Z;
        gridPoint.y = remainder/GRID_SIZE_Z;
        gridPoint.z = remainder-gridPoint.y*GRID_SIZE_Z;
245
        real result = 0.0f;
246
247

        // Loop over all atoms that affect this grid point.
248

249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
        for (int ix = 0; ix < PME_ORDER; ++ix) {
            int x = gridPoint.x-ix+(gridPoint.x >= ix ? 0 : GRID_SIZE_X);
            for (int iy = 0; iy < PME_ORDER; ++iy) {
                int y = gridPoint.y-iy+(gridPoint.y >= iy ? 0 : GRID_SIZE_Y);
                int z1 = gridPoint.z-PME_ORDER+1;
                z1 += (z1 >= 0 ? 0 : GRID_SIZE_Z);
                int z2 = (z1 < gridPoint.z ? gridPoint.z : GRID_SIZE_Z-1);
                int gridIndex1 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z1;
                int gridIndex2 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z2;
                int firstAtom = pmeAtomRange[gridIndex1];
                int lastAtom = pmeAtomRange[gridIndex2+1];
                for (int i = firstAtom; i < lastAtom; ++i)
                {
                    int2 atomData = pmeAtomGridIndex[i];
                    int atomIndex = atomData.x;
                    int z = atomData.y;
                    int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
266
                    real atomCharge = pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].w;
267
268
269
270
271
272
273
274
275
276
277
                    result += atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z;
                }
                if (z1 > gridPoint.z)
                {
                    gridIndex1 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z;
                    gridIndex2 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+gridPoint.z;
                    firstAtom = pmeAtomRange[gridIndex1];
                    lastAtom = pmeAtomRange[gridIndex2+1];
                    for (int i = firstAtom; i < lastAtom; ++i)
                    {
                        int2 atomData = pmeAtomGridIndex[i];
278
                        int atomIndex = atomData.x;
279
280
                        int z = atomData.y;
                        int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
281
                        real atomCharge = pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].w;
282
283
284
                        result += atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z;
                    }
                }
285
286
            }
        }
287
        pmeGrid[gridIndex] = (real2) (result*EPSILON_FACTOR, 0);
288
289
    }
}
290
#endif
291

292
293
__kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global real* restrict energyBuffer, __global const real* restrict pmeBsplineModuliX,
        __global const real* restrict pmeBsplineModuliY, __global const real* restrict pmeBsplineModuliZ, real4 invPeriodicBoxSize, real recipScaleFactor) {
294
    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
295
    real energy = 0.0f;
296
297
298
299
300
301
302
303
304
305
    for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
        int kx = index/(GRID_SIZE_Y*GRID_SIZE_Z);
        int remainder = index-kx*GRID_SIZE_Y*GRID_SIZE_Z;
        int ky = remainder/GRID_SIZE_Z;
        int kz = remainder-ky*GRID_SIZE_Z;
        if (kx == 0 && ky == 0 && kz == 0)
            continue;
        int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
        int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
        int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
306
307
308
309
310
311
312
313
314
315
316
        real mhx = mx*invPeriodicBoxSize.x;
        real mhy = my*invPeriodicBoxSize.y;
        real mhz = mz*invPeriodicBoxSize.z;
        real bx = pmeBsplineModuliX[kx];
        real by = pmeBsplineModuliY[ky];
        real bz = pmeBsplineModuliZ[kz];
        real2 grid = pmeGrid[index];
        real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
        real denom = m2*bx*by*bz;
        real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
        pmeGrid[index] = (real2) (grid.x*eterm, grid.y*eterm);
317
318
319
320
321
        energy += eterm*(grid.x*grid.x + grid.y*grid.y);
    }
    energyBuffer[get_global_id(0)] += 0.5f*energy;
}

322
__kernel void gridInterpolateForce(__global const real4* restrict posq, __global real4* restrict forceBuffers, __global const real2* restrict pmeGrid,
323
        real4 periodicBoxSize, real4 invPeriodicBoxSize, __global int2* restrict pmeAtomGridIndex) {
324
    const real4 scale = 1/(real) (PME_ORDER-1);
325
326
327
328
329
330
331
332
    real4 data[PME_ORDER];
    real4 ddata[PME_ORDER];
    
    // Process the atoms in spatially sorted order.  This improves cache performance when loading
    // the grid values.
    
    for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) {
        int atom = pmeAtomGridIndex[i].x;
333
334
        real4 force = 0.0f;
        real4 pos = posq[atom];
335
336
337
        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
        pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
        pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
338
339
340
        real4 t = (real4) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
                           (pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
                           (pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z, 0.0f);
341
342
343
        int4 gridIndex = (int4) (((int) t.x) % GRID_SIZE_X,
                                 ((int) t.y) % GRID_SIZE_Y,
                                 ((int) t.z) % GRID_SIZE_Z, 0);
344
345
346
347

        // Since we need the full set of thetas, it's faster to compute them here than load them
        // from global memory.

348
        real4 dr = (real4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f);
349
350
351
352
        data[PME_ORDER-1] = 0.0f;
        data[1] = dr;
        data[0] = 1.0f-dr;
        for (int j = 3; j < PME_ORDER; j++) {
353
            real div = RECIP(j-1.0f);
354
355
            data[j-1] = div*dr*data[j-2];
            for (int k = 1; k < (j-1); k++)
356
                data[j-k-1] = div*((dr+(real4) k) *data[j-k-2] + (-dr+(real4) (j-k))*data[j-k-1]);
357
358
359
360
361
362
363
            data[0] = div*(- dr+1.0f)*data[0];
        }
        ddata[0] = -data[0];
        for (int j = 1; j < PME_ORDER; j++)
            ddata[j] = data[j-1]-data[j];
        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
        for (int j = 1; j < (PME_ORDER-1); j++)
364
            data[PME_ORDER-j-1] = scale*((dr+(real4) j)*data[PME_ORDER-j-2] + (-dr+(real4) (PME_ORDER-j))*data[PME_ORDER-j-1]);
365
366
367
368
        data[0] = scale*(-dr+1.0f)*data[0];

        // Compute the force on this atom.

369
        for (int ix = 0; ix < PME_ORDER; ix++) {
370
371
            int xindex = gridIndex.x+ix;
            xindex -= (xindex >= GRID_SIZE_X ? GRID_SIZE_X : 0);
372
            for (int iy = 0; iy < PME_ORDER; iy++) {
373
374
                int yindex = gridIndex.y+iy;
                yindex -= (yindex >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
375
                for (int iz = 0; iz < PME_ORDER; iz++) {
376
377
                    int zindex = gridIndex.z+iz;
                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
378
                    int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
379
                    real gridvalue = pmeGrid[index].x;
380
381
                    force.x += ddata[ix].x*data[iy].y*data[iz].z*gridvalue;
                    force.y += data[ix].x*ddata[iy].y*data[iz].z*gridvalue;
382
383
384
385
                    force.z += data[ix].x*data[iy].y*ddata[iz].z*gridvalue;
                }
            }
        }
386
387
        real4 totalForce = forceBuffers[atom];
        real q = pos.w*EPSILON_FACTOR;
388
389
390
        totalForce.x -= q*force.x*GRID_SIZE_X*invPeriodicBoxSize.x;
        totalForce.y -= q*force.y*GRID_SIZE_Y*invPeriodicBoxSize.y;
        totalForce.z -= q*force.z*GRID_SIZE_Z*invPeriodicBoxSize.z;
391
392
393
        forceBuffers[atom] = totalForce;
    }
}