pme.cc 26 KB
Newer Older
1
2
KERNEL void findAtomGridIndex(GLOBAL const real4* RESTRICT posq, GLOBAL int2* RESTRICT pmeAtomGridIndex,
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
3
4
        real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ
    ) {
5
6
7
    // Compute the index of the grid point each atom is associated with.

    for (int atom = GLOBAL_ID; atom < NUM_ATOMS; atom += GLOBAL_SIZE) {
8
        real4 pos = posq[atom];
9
        APPLY_PERIODIC_TO_POS(pos)
10
11
12
        real3 t = make_real3(pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x,
                             pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y,
                             pos.z*recipBoxVecZ.z);
13
14
15
        t.x = (t.x-floor(t.x))*GRID_SIZE_X;
        t.y = (t.y-floor(t.y))*GRID_SIZE_Y;
        t.z = (t.z-floor(t.z))*GRID_SIZE_Z;
16
17
18
19
        int3 gridIndex = make_int3(((int) t.x) % GRID_SIZE_X,
                                   ((int) t.y) % GRID_SIZE_Y,
                                   ((int) t.z) % GRID_SIZE_Z);
        pmeAtomGridIndex[atom] = make_int2(atom, gridIndex.x*GRID_SIZE_Y*GRID_SIZE_Z+gridIndex.y*GRID_SIZE_Z+gridIndex.z);
20
    }
21
}
Peter Eastman's avatar
Peter Eastman committed
22

23
24
25
KERNEL void gridSpreadCharge(GLOBAL const real4* RESTRICT posq,
#ifdef USE_FIXED_POINT_CHARGE_SPREADING
        GLOBAL mm_ulong* RESTRICT pmeGrid,
26
#else
27
        GLOBAL real* RESTRICT pmeGrid,
28
#endif
29
30
31
32
33
34
35
36
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ, GLOBAL const int2* RESTRICT pmeAtomGridIndex,
#ifdef CHARGE_FROM_SIGEPS
        GLOBAL const float2* RESTRICT sigmaEpsilon
#else
        GLOBAL const real* RESTRICT charges
#endif
        ) {
37
    // Process the atoms in spatially sorted order.  This improves efficiency when writing
38
39
    // the grid values.  PME_ORDER threads process one atom.

40
41
    real3 data[PME_ORDER];
    const real scale = RECIP((real) (PME_ORDER-1));
42
43
    for (int i = GLOBAL_ID; i < NUM_ATOMS*PME_ORDER; i += GLOBAL_SIZE) {
        int atom = pmeAtomGridIndex[i/PME_ORDER].x;
44
        real4 pos = posq[atom];
45
#ifdef CHARGE_FROM_SIGEPS
46
47
48
        const float2 sigEps = sigmaEpsilon[atom];
        const real charge = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
#else
Peter Eastman's avatar
Peter Eastman committed
49
        const real charge = (CHARGE)*EPSILON_FACTOR;
50
#endif
51
        APPLY_PERIODIC_TO_POS(pos)
52
53
54
        real3 t = make_real3(pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x,
                             pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y,
                             pos.z*recipBoxVecZ.z);
55
56
57
        t.x = (t.x-floor(t.x))*GRID_SIZE_X;
        t.y = (t.y-floor(t.y))*GRID_SIZE_Y;
        t.z = (t.z-floor(t.z))*GRID_SIZE_Z;
58
59
60
61
62
        int3 gridIndex = make_int3(((int) t.x) % GRID_SIZE_X,
                                   ((int) t.y) % GRID_SIZE_Y,
                                   ((int) t.z) % GRID_SIZE_Z);
        if (charge == 0)
            continue;
63

64
65
66
        // Since we need the full set of thetas, it's faster to compute them here than load them
        // from global memory.

67
68
        real3 dr = make_real3(t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z);
        data[PME_ORDER-1] = make_real3(0);
69
        data[1] = dr;
70
        data[0] = make_real3(1)-dr;
71
        for (int j = 3; j < PME_ORDER; j++) {
72
            real div = RECIP((real) (j-1));
73
74
            data[j-1] = div*dr*data[j-2];
            for (int k = 1; k < (j-1); k++)
75
76
                data[j-k-1] = div*((dr+make_real3(k))*data[j-k-2] + (make_real3(j-k)-dr)*data[j-k-1]);
            data[0] = div*(make_real3(1)-dr)*data[0];
77
78
79
        }
        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
        for (int j = 1; j < (PME_ORDER-1); j++)
80
81
            data[PME_ORDER-j-1] = scale*((dr+make_real3(j))*data[PME_ORDER-j-2] + (make_real3(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
        data[0] = scale*(make_real3(1)-dr)*data[0];
82

83
84
        // Spread the charge from this atom onto each grid point.  PME_ORDER threads access
        // consecutive addresses.
85

86
87
88
89
90
91
92
93
        int iz = i%PME_ORDER;
        int zindex = gridIndex.z+iz;
        zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
        real dz = 0;
        for (int i = 0; i < PME_ORDER; i++) {
            dz = i == iz ? data[i].z : dz;
        }
        dz *= charge;
94
        for (int ix = 0; ix < PME_ORDER; ix++) {
Peter Eastman's avatar
Peter Eastman committed
95
96
            int xbase = gridIndex.x+ix;
            xbase -= (xbase >= GRID_SIZE_X ? GRID_SIZE_X : 0);
97
98
            xbase = xbase*GRID_SIZE_Y*GRID_SIZE_Z;
            real dzdx = dz*data[ix].x;
99
            for (int iy = 0; iy < PME_ORDER; iy++) {
Peter Eastman's avatar
Peter Eastman committed
100
101
                int ybase = gridIndex.y+iy;
                ybase -= (ybase >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
102
103
104
                ybase = ybase*GRID_SIZE_Z;
                int index = xbase + ybase + zindex;
                real add = dzdx*data[iy].y;
105
                if (fabs(add) > 2.3e-10f) { // Smallest value representable in 64 bit fixed point
106
#ifdef USE_FIXED_POINT_CHARGE_SPREADING
107
                    ATOMIC_ADD(&pmeGrid[index], (mm_ulong) realToFixedPoint(add));
108
#if defined(__GFX12__) && defined(USE_HIP)
109
110
111
112
                    // Workaround for rare cases when few values of pmeGrid are very large and
                    // incorrect. The cause is unknown. Why this workaround or other irrelevant
                    // changes like printf help is also unknown.
                    asm volatile("s_wait_storecnt 0x0");
113
#endif
114
#else
115
                    ATOMIC_ADD(&pmeGrid[index], add);
116
#endif
117
                }
118
            }
119
120
121
122
        }
    }
}

one's avatar
one committed
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
#if defined(USE_HIP) && defined(CHARGE_FROM_SIGEPS) && PME_ORDER == 5 && defined(PME_USE_WAVE64_LDS_SPREAD)
// HIP LJ-PME dispersion spread variant for wave64 hardware.  It keeps the generic
// PME spread algorithm unchanged, but maps one atom across PME_ORDER wavefront
// lanes so the five z-slices can be accumulated in parallel after one lane
// computes the atom's charge, grid index, and B-spline coefficients.
KERNEL void gridSpreadChargeWave64Lds(GLOBAL const real4* RESTRICT posq,
#ifdef USE_FIXED_POINT_CHARGE_SPREADING
        GLOBAL mm_ulong* RESTRICT pmeGrid,
#else
        GLOBAL real* RESTRICT pmeGrid,
#endif
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ, GLOBAL const int2* RESTRICT pmeAtomGridIndex,
        GLOBAL const float2* RESTRICT sigmaEpsilon
        ) {
    real3 data[PME_ORDER];
    const real scale = RECIP((real) (PME_ORDER-1));
    // B-spline coefficients are stored as [localAtom][order] so all wavefront
    // lanes for the same atom read contiguous LDS entries during the spread phase.
    LOCAL real sharedDataX[PME_SPREAD_BLOCK_SIZE];
    LOCAL real sharedDataY[PME_SPREAD_BLOCK_SIZE];
    LOCAL real sharedDataZ[PME_SPREAD_BLOCK_SIZE];
    LOCAL real sharedCharge[PME_SPREAD_ATOMS_PER_BLOCK];
    LOCAL int sharedGridX[PME_SPREAD_ATOMS_PER_BLOCK];
    LOCAL int sharedGridY[PME_SPREAD_ATOMS_PER_BLOCK];
    LOCAL int sharedGridZ[PME_SPREAD_ATOMS_PER_BLOCK];
    // Each wavefront handles floor(PME_SPREAD_WAVE_SIZE/PME_ORDER) atoms.  For
    // PME_ORDER=5, lanes 0..4 spread one atom, lanes 5..9 spread the next, and
    // the remaining lanes in the wavefront are inactive.
    const int waveLane = LOCAL_ID & (PME_SPREAD_WAVE_SIZE-1);
    const int wave = LOCAL_ID/PME_SPREAD_WAVE_SIZE;
    int atomLane = waveLane-(waveLane/PME_ORDER)*PME_ORDER;
    int atomInWave = waveLane/PME_ORDER;
    int localAtom = wave*PME_SPREAD_ATOMS_PER_WAVE+atomInWave;
    const int atomBlockStride = (GLOBAL_SIZE/PME_SPREAD_BLOCK_SIZE)*PME_SPREAD_ATOMS_PER_BLOCK;
    for (int atomBlockStart = GROUP_ID*PME_SPREAD_ATOMS_PER_BLOCK; atomBlockStart < NUM_ATOMS; atomBlockStart += atomBlockStride) {
        int atomIndex = atomBlockStart+localAtom;
        bool isActive = (waveLane < PME_SPREAD_ATOMS_PER_WAVE*PME_ORDER && atomIndex < NUM_ATOMS);
        if (isActive && atomLane == 0) {
            int atom = pmeAtomGridIndex[atomIndex].x;
            real4 pos = posq[atom];
            const float2 sigEps = sigmaEpsilon[atom];
            const real charge = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
            APPLY_PERIODIC_TO_POS(pos)
            real3 t = make_real3(pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x,
                                 pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y,
                                 pos.z*recipBoxVecZ.z);
            t.x = (t.x-floor(t.x))*GRID_SIZE_X;
            t.y = (t.y-floor(t.y))*GRID_SIZE_Y;
            t.z = (t.z-floor(t.z))*GRID_SIZE_Z;
            int3 gridIndex = make_int3(((int) t.x) % GRID_SIZE_X,
                                       ((int) t.y) % GRID_SIZE_Y,
                                       ((int) t.z) % GRID_SIZE_Z);
            sharedCharge[localAtom] = charge;
            sharedGridX[localAtom] = gridIndex.x;
            sharedGridY[localAtom] = gridIndex.y;
            sharedGridZ[localAtom] = gridIndex.z;

            real3 dr = make_real3(t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z);
            data[PME_ORDER-1] = make_real3(0);
            data[1] = dr;
            data[0] = make_real3(1)-dr;
            for (int j = 3; j < PME_ORDER; j++) {
                real div = RECIP((real) (j-1));
                data[j-1] = div*dr*data[j-2];
                for (int k = 1; k < (j-1); k++)
                    data[j-k-1] = div*((dr+make_real3(k))*data[j-k-2] + (make_real3(j-k)-dr)*data[j-k-1]);
                data[0] = div*(make_real3(1)-dr)*data[0];
            }
            data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
            for (int j = 1; j < (PME_ORDER-1); j++)
                data[PME_ORDER-j-1] = scale*((dr+make_real3(j))*data[PME_ORDER-j-2] + (make_real3(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
            data[0] = scale*(make_real3(1)-dr)*data[0];
            for (int j = 0; j < PME_ORDER; j++) {
                int offset = localAtom*PME_ORDER+j;
                sharedDataX[offset] = data[j].x;
                sharedDataY[offset] = data[j].y;
                sharedDataZ[offset] = data[j].z;
            }
        }
        // All consumers of an atom's LDS data are in the same wavefront as the
        // producing lane, so a wave-level barrier is sufficient here.
        SYNC_WARPS
        if (isActive) {
            real charge = sharedCharge[localAtom];
            if (charge != 0) {
                int gridX = sharedGridX[localAtom];
                int gridY = sharedGridY[localAtom];
                int gridZ = sharedGridZ[localAtom];
                int zindex = gridZ+atomLane;
                zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
                real dz = sharedDataZ[localAtom*PME_ORDER+atomLane]*charge;
                for (int ix = 0; ix < PME_ORDER; ix++) {
                    int xbase = gridX+ix;
                    xbase -= (xbase >= GRID_SIZE_X ? GRID_SIZE_X : 0);
                    xbase = xbase*GRID_SIZE_Y*GRID_SIZE_Z;
                    real dzdx = dz*sharedDataX[localAtom*PME_ORDER+ix];
                    for (int iy = 0; iy < PME_ORDER; iy++) {
                        int ybase = gridY+iy;
                        ybase -= (ybase >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
                        ybase = ybase*GRID_SIZE_Z;
                        int index = xbase + ybase + zindex;
                        real add = dzdx*sharedDataY[localAtom*PME_ORDER+iy];
                        if (fabs(add) > 2.3e-10f) { // Smallest value representable in 64 bit fixed point
#ifdef USE_FIXED_POINT_CHARGE_SPREADING
                            ATOMIC_ADD(&pmeGrid[index], (mm_ulong) realToFixedPoint(add));
#if defined(__GFX12__) && defined(USE_HIP)
                            // Workaround for rare cases when few values of pmeGrid are very large and
                            // incorrect. The cause is unknown. Why this workaround or other irrelevant
                            // changes like printf help is also unknown.
                            asm volatile("s_wait_storecnt 0x0");
#endif
#else
                            ATOMIC_ADD(&pmeGrid[index], add);
#endif
                        }
                    }
                }
            }
        }
        SYNC_WARPS
    }
}
#endif

248
#ifdef USE_FIXED_POINT_CHARGE_SPREADING
249
250

KERNEL void finishSpreadCharge(
251
252
        GLOBAL const mm_long* RESTRICT grid1,
        GLOBAL real* RESTRICT grid2) {
253
    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
Peter Eastman's avatar
Peter Eastman committed
254
    real scale = 1/(real) 0x100000000;
255
    for (int index = GLOBAL_ID; index < gridSize; index += GLOBAL_SIZE) {
256
        grid2[index] = scale*grid1[index];
257
258
    }
}
259

260
261
#endif

262
263
264
KERNEL void reciprocalConvolution(GLOBAL real2* RESTRICT pmeGrid, GLOBAL const real* RESTRICT pmeBsplineModuliX,
        GLOBAL const real* RESTRICT pmeBsplineModuliY, GLOBAL const real* RESTRICT pmeBsplineModuliZ,
        real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
265
266
    // R2C stores into a half complex matrix where the last dimension is cut by half
    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*(GRID_SIZE_Z/2+1);
267
268
269
270
271
272
273
#ifdef USE_LJPME
    const real recipScaleFactor = -(2*M_PI/6)*SQRT(M_PI)*recipBoxVecX.x*recipBoxVecY.y*recipBoxVecZ.z;
    real bfac = M_PI / EWALD_ALPHA;
    real fac1 = 2*M_PI*M_PI*M_PI*SQRT(M_PI);
    real fac2 = EWALD_ALPHA*EWALD_ALPHA*EWALD_ALPHA;
    real fac3 = -2*EWALD_ALPHA*M_PI*M_PI;
#else
274
    const real recipScaleFactor = RECIP(M_PI)*recipBoxVecX.x*recipBoxVecY.y*recipBoxVecZ.z;
275
#endif
276

277
    for (int index = GLOBAL_ID; index < gridSize; index += GLOBAL_SIZE) {
278
279
280
281
282
        // real indices
        int kx = index/(GRID_SIZE_Y*(GRID_SIZE_Z/2+1));
        int remainder = index-kx*GRID_SIZE_Y*(GRID_SIZE_Z/2+1);
        int ky = remainder/(GRID_SIZE_Z/2+1);
        int kz = remainder-ky*(GRID_SIZE_Z/2+1);
283
284
285
        int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
        int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
        int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
286
287
288
        real mhx = mx*recipBoxVecX.x;
        real mhy = mx*recipBoxVecY.x+my*recipBoxVecY.y;
        real mhz = mx*recipBoxVecZ.x+my*recipBoxVecZ.y+mz*recipBoxVecZ.z;
289
290
291
292
293
        real bx = pmeBsplineModuliX[kx];
        real by = pmeBsplineModuliY[ky];
        real bz = pmeBsplineModuliZ[kz];
        real2 grid = pmeGrid[index];
        real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
294
295
296
297
298
299
300
#ifdef USE_LJPME
        real denom = recipScaleFactor/(bx*by*bz);
        real m = SQRT(m2);
        real m3 = m*m2;
        real b = bfac*m;
        real expfac = -b*b;
        real expterm = EXP(expfac);
301
        real erfcterm = ERFC(b);
302
        real eterm = (fac1*erfcterm*m3 + expterm*(fac2 + fac3*m2)) * denom;
303
        pmeGrid[index] = make_real2(grid.x*eterm, grid.y*eterm);
304
#else
305
306
        real denom = m2*bx*by*bz;
        real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
307
        if (kx != 0 || ky != 0 || kz != 0) {
308
            pmeGrid[index] = make_real2(grid.x*eterm, grid.y*eterm);
309
310
        } else {
            pmeGrid[index] = make_real2(0);
311
        }
312
#endif
313
314
315
    }
}

316
317
KERNEL void gridEvaluateEnergy(GLOBAL real2* RESTRICT pmeGrid, GLOBAL mixed* RESTRICT energyBuffer,
                      GLOBAL const real* RESTRICT pmeBsplineModuliX, GLOBAL const real* RESTRICT pmeBsplineModuliY, GLOBAL const real* RESTRICT pmeBsplineModuliZ,
318
319
320
                      real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
    // R2C stores into a half complex matrix where the last dimension is cut by half
    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
321
322
323
324
325
326
327
 #ifdef USE_LJPME
    const real recipScaleFactor = -(2*M_PI/6)*SQRT(M_PI)*recipBoxVecX.x*recipBoxVecY.y*recipBoxVecZ.z;
    real bfac = M_PI / EWALD_ALPHA;
    real fac1 = 2*M_PI*M_PI*M_PI*SQRT(M_PI);
    real fac2 = EWALD_ALPHA*EWALD_ALPHA*EWALD_ALPHA;
    real fac3 = -2*EWALD_ALPHA*M_PI*M_PI;
#else
328
    const real recipScaleFactor = RECIP(M_PI)*recipBoxVecX.x*recipBoxVecY.y*recipBoxVecZ.z;
329
#endif
330

331
    mixed energy = 0;
332
    for (int index = GLOBAL_ID; index < gridSize; index += GLOBAL_SIZE) {
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
        // real indices
        int kx = index/(GRID_SIZE_Y*(GRID_SIZE_Z));
        int remainder = index-kx*GRID_SIZE_Y*(GRID_SIZE_Z);
        int ky = remainder/(GRID_SIZE_Z);
        int kz = remainder-ky*(GRID_SIZE_Z);
        int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
        int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
        int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
        real mhx = mx*recipBoxVecX.x;
        real mhy = mx*recipBoxVecY.x+my*recipBoxVecY.y;
        real mhz = mx*recipBoxVecZ.x+my*recipBoxVecZ.y+mz*recipBoxVecZ.z;
        real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
        real bx = pmeBsplineModuliX[kx];
        real by = pmeBsplineModuliY[ky];
        real bz = pmeBsplineModuliZ[kz];
348
349
350
351
352
353
354
#ifdef USE_LJPME
        real denom = recipScaleFactor/(bx*by*bz);
        real m = SQRT(m2);
        real m3 = m*m2;
        real b = bfac*m;
        real expfac = -b*b;
        real expterm = EXP(expfac);
355
        real erfcterm = ERFC(b);
356
357
        real eterm = (fac1*erfcterm*m3 + expterm*(fac2 + fac3*m2)) * denom;
#else
358
359
        real denom = m2*bx*by*bz;
        real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
360
#endif
361
362
363
364
        if (kz >= (GRID_SIZE_Z/2+1)) {
            kx = ((kx == 0) ? kx : GRID_SIZE_X-kx);
            ky = ((ky == 0) ? ky : GRID_SIZE_Y-ky);
            kz = GRID_SIZE_Z-kz;
365
        }
366
367
        int indexInHalfComplexGrid = kz + ky*(GRID_SIZE_Z/2+1)+kx*(GRID_SIZE_Y*(GRID_SIZE_Z/2+1));
        real2 grid = pmeGrid[indexInHalfComplexGrid];
368
369
370
#ifndef USE_LJPME
        if (kx != 0 || ky != 0 || kz != 0)
#endif
371
            energy += eterm*(grid.x*grid.x + grid.y*grid.y);
372
    }
373
#if defined(USE_PME_STREAM) && !defined(USE_LJPME)
374
    energyBuffer[GLOBAL_ID] = 0.5f*energy;
peastman's avatar
peastman committed
375
#else
376
    energyBuffer[GLOBAL_ID] += 0.5f*energy;
peastman's avatar
peastman committed
377
#endif
378
379
}

380
381
382
383
384
KERNEL void gridInterpolateForce(GLOBAL const real4* RESTRICT posq, GLOBAL mm_ulong* RESTRICT forceBuffers, GLOBAL const real* RESTRICT pmeGrid,
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ, GLOBAL const int2* RESTRICT pmeAtomGridIndex,
#ifdef CHARGE_FROM_SIGEPS
        GLOBAL const float2* RESTRICT sigmaEpsilon
385
#else
386
        GLOBAL const real* RESTRICT charges
387
#endif
388
389
390
391
        ) {
    real3 data[PME_ORDER];
    real3 ddata[PME_ORDER];
    const real scale = RECIP((real) (PME_ORDER-1));
392

393
394
    // Process the atoms in spatially sorted order.  This improves cache performance when loading
    // the grid values.
395

396
    for (int i = GLOBAL_ID; i < NUM_ATOMS; i += GLOBAL_SIZE) {
397
        int atom = pmeAtomGridIndex[i].x;
398
        real3 force = make_real3(0);
399
        real4 pos = posq[atom];
400
        APPLY_PERIODIC_TO_POS(pos)
401
402
403
        real3 t = make_real3(pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x,
                             pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y,
                             pos.z*recipBoxVecZ.z);
404
405
406
        t.x = (t.x-floor(t.x))*GRID_SIZE_X;
        t.y = (t.y-floor(t.y))*GRID_SIZE_Y;
        t.z = (t.z-floor(t.z))*GRID_SIZE_Z;
407
408
409
        int3 gridIndex = make_int3(((int) t.x) % GRID_SIZE_X,
                                   ((int) t.y) % GRID_SIZE_Y,
                                   ((int) t.z) % GRID_SIZE_Z);
410
411
412
413
414
415
416
417
#ifdef CHARGE_FROM_SIGEPS
        const float2 sigEps = sigmaEpsilon[atom];
        real q = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
#else
        real q = CHARGE*EPSILON_FACTOR;
#endif
        if (q == 0)
            continue;
418
419
420
421

        // Since we need the full set of thetas, it's faster to compute them here than load them
        // from global memory.

422
423
        real3 dr = make_real3(t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z);
        data[PME_ORDER-1] = make_real3(0);
424
        data[1] = dr;
425
        data[0] = make_real3(1)-dr;
426
        for (int j = 3; j < PME_ORDER; j++) {
427
            real div = RECIP((real) (j-1));
428
429
            data[j-1] = div*dr*data[j-2];
            for (int k = 1; k < (j-1); k++)
430
431
                data[j-k-1] = div*((dr+make_real3(k))*data[j-k-2] + (make_real3(j-k)-dr)*data[j-k-1]);
            data[0] = div*(make_real3(1)-dr)*data[0];
432
433
434
435
436
437
        }
        ddata[0] = -data[0];
        for (int j = 1; j < PME_ORDER; j++)
            ddata[j] = data[j-1]-data[j];
        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
        for (int j = 1; j < (PME_ORDER-1); j++)
438
439
            data[PME_ORDER-j-1] = scale*((dr+make_real3(j))*data[PME_ORDER-j-2] + (make_real3(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
        data[0] = scale*(make_real3(1)-dr)*data[0];
440
441
442

        // Compute the force on this atom.

443
        for (int ix = 0; ix < PME_ORDER; ix++) {
444
445
446
447
448
            int xbase = gridIndex.x+ix;
            xbase -= (xbase >= GRID_SIZE_X ? GRID_SIZE_X : 0);
            xbase = xbase*GRID_SIZE_Y*GRID_SIZE_Z;
            real dx = data[ix].x;
            real ddx = ddata[ix].x;
449

450
            for (int iy = 0; iy < PME_ORDER; iy++) {
451
452
453
454
455
                int ybase = gridIndex.y+iy;
                ybase -= (ybase >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
                ybase = xbase + ybase*GRID_SIZE_Z;
                real dy = data[iy].y;
                real ddy = ddata[iy].y;
456

457
                for (int iz = 0; iz < PME_ORDER; iz++) {
458
459
                    int zindex = gridIndex.z+iz;
                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
460
                    int index = ybase + zindex;
461
                    real gridvalue = pmeGrid[index];
462
463
464
                    force.x += ddx*dy*data[iz].z*gridvalue;
                    force.y += dx*ddy*data[iz].z*gridvalue;
                    force.z += dx*dy*ddata[iz].z*gridvalue;
465
466
467
                }
            }
        }
468
469
470
471
        real forceX = -q*(force.x*GRID_SIZE_X*recipBoxVecX.x);
        real forceY = -q*(force.x*GRID_SIZE_X*recipBoxVecY.x+force.y*GRID_SIZE_Y*recipBoxVecY.y);
        real forceZ = -q*(force.x*GRID_SIZE_X*recipBoxVecZ.x+force.y*GRID_SIZE_Y*recipBoxVecZ.y+force.z*GRID_SIZE_Z*recipBoxVecZ.z);
#ifdef USE_PME_STREAM
472
473
474
        ATOMIC_ADD(&forceBuffers[atom], (mm_ulong) realToFixedPoint(forceX));
        ATOMIC_ADD(&forceBuffers[atom+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(forceY));
        ATOMIC_ADD(&forceBuffers[atom+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(forceZ));
475
#else
476
477
478
        forceBuffers[atom] += (mm_ulong) realToFixedPoint(forceX);
        forceBuffers[atom+PADDED_NUM_ATOMS] += (mm_ulong) realToFixedPoint(forceY);
        forceBuffers[atom+2*PADDED_NUM_ATOMS] += (mm_ulong) realToFixedPoint(forceZ);
479
#endif
480
481
    }
}
482

483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
KERNEL void gridInterpolateChargeDerivatives(GLOBAL const real4* RESTRICT posq, GLOBAL mm_ulong* RESTRICT derivatives, GLOBAL const real* RESTRICT pmeGrid,
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ, GLOBAL const int* RESTRICT atomIndices
        ) {
    real3 data[PME_ORDER];
    const real scale = RECIP((real) (PME_ORDER-1));

    for (int i = GLOBAL_ID; i < NUM_INDICES; i += GLOBAL_SIZE) {
        int atom = atomIndices[i];
        real derivative = 0;
        real4 pos = posq[atom];
        APPLY_PERIODIC_TO_POS(pos)
        real3 t = make_real3(pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x,
                             pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y,
                             pos.z*recipBoxVecZ.z);
        t.x = (t.x-floor(t.x))*GRID_SIZE_X;
        t.y = (t.y-floor(t.y))*GRID_SIZE_Y;
        t.z = (t.z-floor(t.z))*GRID_SIZE_Z;
        int3 gridIndex = make_int3(((int) t.x) % GRID_SIZE_X,
                                   ((int) t.y) % GRID_SIZE_Y,
                                   ((int) t.z) % GRID_SIZE_Z);

        // Since we need the full set of thetas, it's faster to compute them here than load them
        // from global memory.

        real3 dr = make_real3(t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z);
        data[PME_ORDER-1] = make_real3(0);
        data[1] = dr;
        data[0] = make_real3(1)-dr;
        for (int j = 3; j < PME_ORDER; j++) {
            real div = RECIP((real) (j-1));
            data[j-1] = div*dr*data[j-2];
            for (int k = 1; k < (j-1); k++)
                data[j-k-1] = div*((dr+make_real3(k))*data[j-k-2] + (make_real3(j-k)-dr)*data[j-k-1]);
            data[0] = div*(make_real3(1)-dr)*data[0];
        }
        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
        for (int j = 1; j < (PME_ORDER-1); j++)
            data[PME_ORDER-j-1] = scale*((dr+make_real3(j))*data[PME_ORDER-j-2] + (make_real3(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
        data[0] = scale*(make_real3(1)-dr)*data[0];

        // Compute the charge derivative on this atom.

        for (int ix = 0; ix < PME_ORDER; ix++) {
            int xbase = gridIndex.x+ix;
            xbase -= (xbase >= GRID_SIZE_X ? GRID_SIZE_X : 0);
            xbase = xbase*GRID_SIZE_Y*GRID_SIZE_Z;
            real dx = data[ix].x;

            for (int iy = 0; iy < PME_ORDER; iy++) {
                int ybase = gridIndex.y+iy;
                ybase -= (ybase >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
                ybase = xbase + ybase*GRID_SIZE_Z;
                real dy = data[iy].y;

                for (int iz = 0; iz < PME_ORDER; iz++) {
                    int zindex = gridIndex.z+iz;
                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
                    derivative += dx*dy*data[iz].z*pmeGrid[ybase + zindex];
                }
            }
        }
        derivative *= EPSILON_FACTOR;
#ifdef USE_PME_STREAM
        ATOMIC_ADD(&derivatives[i], (mm_ulong) realToFixedPoint(derivative));
#else
        derivatives[i] += (mm_ulong) realToFixedPoint(derivative);
#endif
    }
}

554
555
556
KERNEL void addForces(GLOBAL const real4* RESTRICT forces, GLOBAL mm_long* RESTRICT forceBuffers) {
    for (int atom = GLOBAL_ID; atom < NUM_ATOMS; atom += GLOBAL_SIZE) {
        real4 f = forces[atom];
557
558
559
        forceBuffers[atom] += realToFixedPoint(f.x);
        forceBuffers[atom+PADDED_NUM_ATOMS] += realToFixedPoint(f.y);
        forceBuffers[atom+2*PADDED_NUM_ATOMS] += realToFixedPoint(f.z);
560
    }
561
}
562

563
564
KERNEL void addEnergy(GLOBAL const mixed* RESTRICT pmeEnergyBuffer, GLOBAL mixed* RESTRICT energyBuffer, int bufferSize) {
    for (int i = GLOBAL_ID; i < bufferSize; i += GLOBAL_SIZE)
565
566
        energyBuffer[i] += pmeEnergyBuffer[i];
}