pme.cc 27.7 KB
Newer Older
1
2
KERNEL void findAtomGridIndex(GLOBAL const real4* RESTRICT posq, GLOBAL int2* RESTRICT pmeAtomGridIndex,
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
3
        real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ
4
5
6
7
#ifndef SUPPORTS_64_BIT_ATOMICS
        , GLOBAL real4* RESTRICT pmeBsplineTheta, LOCAL real4* RESTRICT bsplinesCache,
#ifdef CHARGE_FROM_SIGEPS
        GLOBAL const float2* RESTRICT sigmaEpsilon
8
#else
9
10
        GLOBAL const real* RESTRICT charges
#endif
11
12
#endif
    ) {
13
14
15
    // Compute the index of the grid point each atom is associated with.

    for (int atom = GLOBAL_ID; atom < NUM_ATOMS; atom += GLOBAL_SIZE) {
16
        real4 pos = posq[atom];
17
        APPLY_PERIODIC_TO_POS(pos)
18
19
20
        real3 t = make_real3(pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x,
                             pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y,
                             pos.z*recipBoxVecZ.z);
21
22
23
        t.x = (t.x-floor(t.x))*GRID_SIZE_X;
        t.y = (t.y-floor(t.y))*GRID_SIZE_Y;
        t.z = (t.z-floor(t.z))*GRID_SIZE_Z;
24
25
26
27
        int3 gridIndex = make_int3(((int) t.x) % GRID_SIZE_X,
                                   ((int) t.y) % GRID_SIZE_Y,
                                   ((int) t.z) % GRID_SIZE_Z);
        pmeAtomGridIndex[atom] = make_int2(atom, gridIndex.x*GRID_SIZE_Y*GRID_SIZE_Z+gridIndex.y*GRID_SIZE_Z+gridIndex.z);
28
#ifndef SUPPORTS_64_BIT_ATOMICS
29
30
31
32
        // Compute B-splines here for use in the charge spreading kernel.
        const real4 scale = 1/(real) (PME_ORDER-1);
        LOCAL real4* data = &bsplinesCache[LOCAL_ID*PME_ORDER];
        real4 dr = (real4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f);
33
34
35
36
        data[PME_ORDER-1] = 0.0f;
        data[1] = dr;
        data[0] = 1.0f-dr;
        for (int j = 3; j < PME_ORDER; j++) {
37
            real div = RECIP(j-1.0f);
38
39
            data[j-1] = div*dr*data[j-2];
            for (int k = 1; k < (j-1); k++)
40
                data[j-k-1] = div*((dr+make_real4(k))*data[j-k-2] + (-dr+make_real4(j-k))*data[j-k-1]);
41
42
43
44
            data[0] = div*(- dr+1.0f)*data[0];
        }
        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
        for (int j = 1; j < (PME_ORDER-1); j++)
45
            data[PME_ORDER-j-1] = scale*((dr+make_real4(j))*data[PME_ORDER-j-2] + (-dr+make_real4(PME_ORDER-j))*data[PME_ORDER-j-1]);
46
47
        data[0] = scale*(-dr+1.0f)*data[0];
        for (int j = 0; j < PME_ORDER; j++) {
48
#ifdef CHARGE_FROM_SIGEPS
49
            const float2 sigEps = sigmaEpsilon[atom];
50
51
            const real charge = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
#else
52
            const real charge = CHARGE;
53
54
#endif
            data[j].w = charge; // Storing the charge here improves cache coherency in the charge spreading kernel
55
            pmeBsplineTheta[atom+j*NUM_ATOMS] = data[j];
56
        }
57
#endif
58
    }
59
}
Peter Eastman's avatar
Peter Eastman committed
60

61
62
63
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable

64
65
66
#if defined(USE_HIP) && !defined(AMD_RDNA)
LAUNCH_BOUNDS_EXACT(128, 1)
#endif
67
68
69
KERNEL void gridSpreadCharge(GLOBAL const real4* RESTRICT posq,
#ifdef USE_FIXED_POINT_CHARGE_SPREADING
        GLOBAL mm_ulong* RESTRICT pmeGrid,
70
#else
71
        GLOBAL real* RESTRICT pmeGrid,
72
#endif
73
74
75
76
77
78
79
80
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ, GLOBAL const int2* RESTRICT pmeAtomGridIndex,
#ifdef CHARGE_FROM_SIGEPS
        GLOBAL const float2* RESTRICT sigmaEpsilon
#else
        GLOBAL const real* RESTRICT charges
#endif
        ) {
81
82
83
84
// HIP-TODO: Workaround for RDNA, remove it when the compiler issue is fixed
#if defined(USE_HIP)
    (void)GLOBAL_ID;
#endif
Peter Eastman's avatar
Peter Eastman committed
85
86
87
88
    // To improve memory efficiency, we divide indices along the z axis into
    // PME_ORDER blocks, where the data for each block is stored together.  We
    // can ensure that all threads write to the same block at the same time,
    // which leads to better coalescing of writes.
89
    
90
    LOCAL int zindexTable[GRID_SIZE_Z+PME_ORDER];
Peter Eastman's avatar
Peter Eastman committed
91
    int blockSize = (int) ceil(GRID_SIZE_Z/(real) PME_ORDER);
92
    for (int i = LOCAL_ID; i < GRID_SIZE_Z+PME_ORDER; i += LOCAL_SIZE) {
Peter Eastman's avatar
Peter Eastman committed
93
        int zindex = i % GRID_SIZE_Z;
94
        int block = zindex % PME_ORDER;
Peter Eastman's avatar
Peter Eastman committed
95
96
        zindexTable[i] = zindex/PME_ORDER + block*GRID_SIZE_X*GRID_SIZE_Y*blockSize;
    }
97
98
    SYNC_THREADS;
    
99
100
101
    // Process the atoms in spatially sorted order.  This improves efficiency when writing
    // the grid values.
    
102
103
104
    real3 data[PME_ORDER];
    const real scale = RECIP((real) (PME_ORDER-1));
    for (int i = GLOBAL_ID; i < NUM_ATOMS; i += GLOBAL_SIZE) {
105
106
        int atom = pmeAtomGridIndex[i].x;
        real4 pos = posq[atom];
107
#ifdef CHARGE_FROM_SIGEPS
108
109
110
        const float2 sigEps = sigmaEpsilon[atom];
        const real charge = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
#else
Peter Eastman's avatar
Peter Eastman committed
111
        const real charge = (CHARGE)*EPSILON_FACTOR;
112
#endif
113
        APPLY_PERIODIC_TO_POS(pos)
114
115
116
        real3 t = make_real3(pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x,
                             pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y,
                             pos.z*recipBoxVecZ.z);
117
118
119
        t.x = (t.x-floor(t.x))*GRID_SIZE_X;
        t.y = (t.y-floor(t.y))*GRID_SIZE_Y;
        t.z = (t.z-floor(t.z))*GRID_SIZE_Z;
120
121
122
123
124
        int3 gridIndex = make_int3(((int) t.x) % GRID_SIZE_X,
                                   ((int) t.y) % GRID_SIZE_Y,
                                   ((int) t.z) % GRID_SIZE_Z);
        if (charge == 0)
            continue;
125

126
127
128
        // Since we need the full set of thetas, it's faster to compute them here than load them
        // from global memory.

129
130
        real3 dr = make_real3(t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z);
        data[PME_ORDER-1] = make_real3(0);
131
        data[1] = dr;
132
        data[0] = make_real3(1)-dr;
133
        for (int j = 3; j < PME_ORDER; j++) {
134
            real div = RECIP((real) (j-1));
135
136
            data[j-1] = div*dr*data[j-2];
            for (int k = 1; k < (j-1); k++)
137
138
                data[j-k-1] = div*((dr+make_real3(k))*data[j-k-2] + (make_real3(j-k)-dr)*data[j-k-1]);
            data[0] = div*(make_real3(1)-dr)*data[0];
139
140
141
        }
        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
        for (int j = 1; j < (PME_ORDER-1); j++)
142
143
            data[PME_ORDER-j-1] = scale*((dr+make_real3(j))*data[PME_ORDER-j-2] + (make_real3(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
        data[0] = scale*(make_real3(1)-dr)*data[0];
144
145
146

        // Spread the charge from this atom onto each grid point.

147
        int izoffset = (PME_ORDER-(gridIndex.z%PME_ORDER)) % PME_ORDER;
148
        for (int ix = 0; ix < PME_ORDER; ix++) {
Peter Eastman's avatar
Peter Eastman committed
149
150
151
152
            int xbase = gridIndex.x+ix;
            xbase -= (xbase >= GRID_SIZE_X ? GRID_SIZE_X : 0);
            xbase = xbase*GRID_SIZE_Y;
            real dx = charge*data[ix].x;
153
            for (int iy = 0; iy < PME_ORDER; iy++) {
Peter Eastman's avatar
Peter Eastman committed
154
155
156
157
158
                int ybase = gridIndex.y+iy;
                ybase -= (ybase >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
                ybase = (xbase+ybase)*blockSize;
                real dxdy = dx*data[iy].y;
                for (int i = 0; i < PME_ORDER; i++) {
159
                    int iz = (i+izoffset) % PME_ORDER;
160
                    int zindex = gridIndex.z+iz;
Peter Eastman's avatar
Peter Eastman committed
161
162
                    int index = ybase + zindexTable[zindex];
                    real add = dxdy*data[iz].z;
163
#ifdef USE_FIXED_POINT_CHARGE_SPREADING
164
                    ATOMIC_ADD(&pmeGrid[index], (mm_ulong) realToFixedPoint(add));
165
166
167
#else
                    ATOMIC_ADD(&pmeGrid[index], add);
#endif
168
                }
169
            }
170
171
172
173
        }
    }
}

174
175
176
177
178
179
180
KERNEL void finishSpreadCharge(
#ifdef USE_FIXED_POINT_CHARGE_SPREADING
        GLOBAL const mm_long* RESTRICT grid1,
#else
        GLOBAL const real* RESTRICT grid1,
#endif
        GLOBAL real* RESTRICT grid2) {
181
182
183
184
// HIP-TODO: Workaround for RDNA, remove it when the compiler issue is fixed
#if defined(USE_HIP)
    (void)GLOBAL_ID;
#endif
Peter Eastman's avatar
Peter Eastman committed
185
186
    // During charge spreading, we shuffled the order of indices along the z
    // axis to make memory access more efficient.  We now need to unshuffle
187
188
    // them.  If the values were accumulated as fixed point, we also need to
    // convert them to floating point.
Peter Eastman's avatar
Peter Eastman committed
189

190
    LOCAL int zindexTable[GRID_SIZE_Z];
Peter Eastman's avatar
Peter Eastman committed
191
    int blockSize = (int) ceil(GRID_SIZE_Z/(real) PME_ORDER);
192
    for (int i = LOCAL_ID; i < GRID_SIZE_Z; i += LOCAL_SIZE) {
193
        int block = i % PME_ORDER;
Peter Eastman's avatar
Peter Eastman committed
194
195
        zindexTable[i] = i/PME_ORDER + block*GRID_SIZE_X*GRID_SIZE_Y*blockSize;
    }
196
    SYNC_THREADS;
197
    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
Peter Eastman's avatar
Peter Eastman committed
198
    real scale = 1/(real) 0x100000000;
199
    for (int index = GLOBAL_ID; index < gridSize; index += GLOBAL_SIZE) {
Peter Eastman's avatar
Peter Eastman committed
200
201
        int zindex = index%GRID_SIZE_Z;
        int loadIndex = zindexTable[zindex] + blockSize*(int) (index/GRID_SIZE_Z);
202
#ifdef USE_FIXED_POINT_CHARGE_SPREADING
Peter Eastman's avatar
Peter Eastman committed
203
        grid2[index] = scale*grid1[loadIndex];
204
205
206
#else
        grid2[index] = grid1[loadIndex];
#endif
207
208
    }
}
209
#elif defined(DEVICE_IS_CPU)
210
211
212
213
214
KERNEL void gridSpreadCharge(GLOBAL const real4* RESTRICT posq, GLOBAL real* RESTRICT pmeGrid,
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ,
#ifdef CHARGE_FROM_SIGEPS
        GLOBAL const float2* RESTRICT sigmaEpsilon
215
#else
216
        GLOBAL const real* RESTRICT charges
217
218
#endif
    ) {
219
220
    const int firstx = GLOBAL_ID*GRID_SIZE_X/GLOBAL_SIZE;
    const int lastx = (GLOBAL_ID+1)*GRID_SIZE_X/GLOBAL_SIZE;
221
222
223
224
225
226
227
228
229
    if (firstx == lastx)
        return;
    const real4 scale = 1/(real) (PME_ORDER-1);
    real4 data[PME_ORDER];
    
    // Process the atoms in spatially sorted order.  This improves efficiency when writing
    // the grid values.
    
    for (int i = 0; i < NUM_ATOMS; i++) {
230
        int atom = i;
231
        real4 pos = posq[atom];
232
        APPLY_PERIODIC_TO_POS(pos)
233
234
235
236
237
238
        real3 t = (real3) (pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x,
                           pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y,
                           pos.z*recipBoxVecZ.z);
        t.x = (t.x-floor(t.x))*GRID_SIZE_X;
        t.y = (t.y-floor(t.y))*GRID_SIZE_Y;
        t.z = (t.z-floor(t.z))*GRID_SIZE_Z;
239
240
241
242
243
244
        int4 gridIndex = (int4) (((int) t.x) % GRID_SIZE_X,
                                 ((int) t.y) % GRID_SIZE_Y,
                                 ((int) t.z) % GRID_SIZE_Z, 0);

        // Spread the charge from this atom onto each grid point.

245
#ifdef CHARGE_FROM_SIGEPS
246
247
248
        const float2 sigEps = sigmaEpsilon[atom];
        const real charge = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
#else
Peter Eastman's avatar
Peter Eastman committed
249
        const real charge = (CHARGE)*EPSILON_FACTOR;
250
#endif
251
252
        if (charge == 0)
            continue;
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
        bool hasComputedThetas = false;
        for (int ix = 0; ix < PME_ORDER; ix++) {
            int xindex = gridIndex.x+ix;
            xindex -= (xindex >= GRID_SIZE_X ? GRID_SIZE_X : 0);
            if (xindex < firstx || xindex >= lastx)
                continue;
            if (!hasComputedThetas) {
                hasComputedThetas = true;
                
                // Since we need the full set of thetas, it's faster to compute them here than load them
                // from global memory.

                real4 dr = (real4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f);
                data[PME_ORDER-1] = 0.0f;
                data[1] = dr;
                data[0] = 1.0f-dr;
                for (int j = 3; j < PME_ORDER; j++) {
                    real div = RECIP(j-1.0f);
                    data[j-1] = div*dr*data[j-2];
                    for (int k = 1; k < (j-1); k++)
                        data[j-k-1] = div*((dr+(real4) k) *data[j-k-2] + (-dr+(real4) (j-k))*data[j-k-1]);
                    data[0] = div*(- dr+1.0f)*data[0];
                }
                data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
                for (int j = 1; j < (PME_ORDER-1); j++)
                    data[PME_ORDER-j-1] = scale*((dr+(real4) j)*data[PME_ORDER-j-2] + (-dr+(real4) (PME_ORDER-j))*data[PME_ORDER-j-1]);
                data[0] = scale*(-dr+1.0f)*data[0];
            }
            for (int iy = 0; iy < PME_ORDER; iy++) {
                int yindex = gridIndex.y+iy;
                yindex -= (yindex >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
                for (int iz = 0; iz < PME_ORDER; iz++) {
                    int zindex = gridIndex.z+iz;
                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
                    int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
Peter Eastman's avatar
Peter Eastman committed
288
                    pmeGrid[index] += charge*data[ix].x*data[iy].y*data[iz].z;
289
290
291
292
293
                }
            }
        }
    }
}
294
#else
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
/**
 * For each grid point, find the range of sorted atoms associated with that point.
 */
KERNEL void findAtomRangeForGrid(GLOBAL int2* RESTRICT pmeAtomGridIndex, GLOBAL int* RESTRICT pmeAtomRange, GLOBAL const real4* RESTRICT posq) {
    int start = (NUM_ATOMS*GLOBAL_ID)/GLOBAL_SIZE;
    int end = (NUM_ATOMS*(GLOBAL_ID+1))/GLOBAL_SIZE;
    int last = (start == 0 ? -1 : pmeAtomGridIndex[start-1].y);
    for (int i = start; i < end; ++i) {
        int2 atomData = pmeAtomGridIndex[i];
        int gridIndex = atomData.y;
        if (gridIndex != last) {
            for (int j = last+1; j <= gridIndex; ++j)
                pmeAtomRange[j] = i;
            last = gridIndex;
        }
    }

    // Fill in values beyond the last atom.

    if (GLOBAL_ID == GLOBAL_SIZE-1) {
        int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
        for (int j = last+1; j <= gridSize; ++j)
            pmeAtomRange[j] = NUM_ATOMS;
    }
}

/**
 * The grid index won't be needed again.  Reuse that component to hold the z index, thus saving
 * some work in the charge spreading kernel.
 */
KERNEL void recordZIndex(GLOBAL int2* RESTRICT pmeAtomGridIndex, GLOBAL const real4* RESTRICT posq, real4 periodicBoxSize, real4 recipBoxVecZ) {
    int start = (NUM_ATOMS*GLOBAL_ID)/GLOBAL_SIZE;
    int end = (NUM_ATOMS*(GLOBAL_ID+1))/GLOBAL_SIZE;
    for (int i = start; i < end; ++i) {
        real posz = posq[pmeAtomGridIndex[i].x].z;
        posz -= floor(posz*recipBoxVecZ.z)*periodicBoxSize.z;
        int z = ((int) ((posz*recipBoxVecZ.z)*GRID_SIZE_Z)) % GRID_SIZE_Z;
        pmeAtomGridIndex[i].y = z;
    }
}

KERNEL void gridSpreadCharge(GLOBAL const real4* RESTRICT posq, GLOBAL real* RESTRICT pmeGrid,
        GLOBAL const int2* RESTRICT pmeAtomGridIndex, GLOBAL const int* RESTRICT pmeAtomRange,
        GLOBAL const real4* RESTRICT pmeBsplineTheta
#ifdef CHARGE_FROM_SIGEPS
        , GLOBAL const float2* RESTRICT sigmaEpsilon
341
#else
342
        , GLOBAL const real* RESTRICT charges
343
344
#endif
    ) {
345
    unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
346
    for (int gridIndex = GLOBAL_ID; gridIndex < numGridPoints; gridIndex += GLOBAL_SIZE) {
347
348
        // Compute the charge on a grid point.

Peter Eastman's avatar
Peter Eastman committed
349
        int4 gridPoint;
350
351
352
353
        gridPoint.x = gridIndex/(GRID_SIZE_Y*GRID_SIZE_Z);
        int remainder = gridIndex-gridPoint.x*GRID_SIZE_Y*GRID_SIZE_Z;
        gridPoint.y = remainder/GRID_SIZE_Z;
        gridPoint.z = remainder-gridPoint.y*GRID_SIZE_Z;
354
        real result = 0.0f;
355
356

        // Loop over all atoms that affect this grid point.
357

358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
        for (int ix = 0; ix < PME_ORDER; ++ix) {
            int x = gridPoint.x-ix+(gridPoint.x >= ix ? 0 : GRID_SIZE_X);
            for (int iy = 0; iy < PME_ORDER; ++iy) {
                int y = gridPoint.y-iy+(gridPoint.y >= iy ? 0 : GRID_SIZE_Y);
                int z1 = gridPoint.z-PME_ORDER+1;
                z1 += (z1 >= 0 ? 0 : GRID_SIZE_Z);
                int z2 = (z1 < gridPoint.z ? gridPoint.z : GRID_SIZE_Z-1);
                int gridIndex1 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z1;
                int gridIndex2 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z2;
                int firstAtom = pmeAtomRange[gridIndex1];
                int lastAtom = pmeAtomRange[gridIndex2+1];
                for (int i = firstAtom; i < lastAtom; ++i)
                {
                    int2 atomData = pmeAtomGridIndex[i];
                    int atomIndex = atomData.x;
                    int z = atomData.y;
                    int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
375
                    real atomCharge = pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].w;
376
377
378
379
380
381
382
383
384
385
386
                    result += atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z;
                }
                if (z1 > gridPoint.z)
                {
                    gridIndex1 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z;
                    gridIndex2 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+gridPoint.z;
                    firstAtom = pmeAtomRange[gridIndex1];
                    lastAtom = pmeAtomRange[gridIndex2+1];
                    for (int i = firstAtom; i < lastAtom; ++i)
                    {
                        int2 atomData = pmeAtomGridIndex[i];
387
                        int atomIndex = atomData.x;
388
389
                        int z = atomData.y;
                        int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
390
                        real atomCharge = pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].w;
391
392
393
                        result += atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z;
                    }
                }
394
395
            }
        }
396
        pmeGrid[gridIndex] = result*EPSILON_FACTOR;
397
398
    }
}
399
#endif
400

401
402
403
KERNEL void reciprocalConvolution(GLOBAL real2* RESTRICT pmeGrid, GLOBAL const real* RESTRICT pmeBsplineModuliX,
        GLOBAL const real* RESTRICT pmeBsplineModuliY, GLOBAL const real* RESTRICT pmeBsplineModuliZ,
        real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
404
405
    // R2C stores into a half complex matrix where the last dimension is cut by half
    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*(GRID_SIZE_Z/2+1);
406
407
408
409
410
411
412
#ifdef USE_LJPME
    const real recipScaleFactor = -(2*M_PI/6)*SQRT(M_PI)*recipBoxVecX.x*recipBoxVecY.y*recipBoxVecZ.z;
    real bfac = M_PI / EWALD_ALPHA;
    real fac1 = 2*M_PI*M_PI*M_PI*SQRT(M_PI);
    real fac2 = EWALD_ALPHA*EWALD_ALPHA*EWALD_ALPHA;
    real fac3 = -2*EWALD_ALPHA*M_PI*M_PI;
#else
413
    const real recipScaleFactor = RECIP(M_PI)*recipBoxVecX.x*recipBoxVecY.y*recipBoxVecZ.z;
414
#endif
415

416
    for (int index = GLOBAL_ID; index < gridSize; index += GLOBAL_SIZE) {
417
418
419
420
421
        // real indices
        int kx = index/(GRID_SIZE_Y*(GRID_SIZE_Z/2+1));
        int remainder = index-kx*GRID_SIZE_Y*(GRID_SIZE_Z/2+1);
        int ky = remainder/(GRID_SIZE_Z/2+1);
        int kz = remainder-ky*(GRID_SIZE_Z/2+1);
422
423
424
        int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
        int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
        int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
425
426
427
        real mhx = mx*recipBoxVecX.x;
        real mhy = mx*recipBoxVecY.x+my*recipBoxVecY.y;
        real mhz = mx*recipBoxVecZ.x+my*recipBoxVecZ.y+mz*recipBoxVecZ.z;
428
429
430
431
432
        real bx = pmeBsplineModuliX[kx];
        real by = pmeBsplineModuliY[ky];
        real bz = pmeBsplineModuliZ[kz];
        real2 grid = pmeGrid[index];
        real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
433
434
435
436
437
438
439
#ifdef USE_LJPME
        real denom = recipScaleFactor/(bx*by*bz);
        real m = SQRT(m2);
        real m3 = m*m2;
        real b = bfac*m;
        real expfac = -b*b;
        real expterm = EXP(expfac);
440
        real erfcterm = ERFC(b);
441
        real eterm = (fac1*erfcterm*m3 + expterm*(fac2 + fac3*m2)) * denom;
442
        pmeGrid[index] = make_real2(grid.x*eterm, grid.y*eterm);
443
#else
444
445
        real denom = m2*bx*by*bz;
        real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
446
        if (kx != 0 || ky != 0 || kz != 0) {
447
            pmeGrid[index] = make_real2(grid.x*eterm, grid.y*eterm);
448
        }
449
#endif
450
451
452
    }
}

453
454
KERNEL void gridEvaluateEnergy(GLOBAL real2* RESTRICT pmeGrid, GLOBAL mixed* RESTRICT energyBuffer,
                      GLOBAL const real* RESTRICT pmeBsplineModuliX, GLOBAL const real* RESTRICT pmeBsplineModuliY, GLOBAL const real* RESTRICT pmeBsplineModuliZ,
455
456
457
                      real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
    // R2C stores into a half complex matrix where the last dimension is cut by half
    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
458
459
460
461
462
463
464
 #ifdef USE_LJPME
    const real recipScaleFactor = -(2*M_PI/6)*SQRT(M_PI)*recipBoxVecX.x*recipBoxVecY.y*recipBoxVecZ.z;
    real bfac = M_PI / EWALD_ALPHA;
    real fac1 = 2*M_PI*M_PI*M_PI*SQRT(M_PI);
    real fac2 = EWALD_ALPHA*EWALD_ALPHA*EWALD_ALPHA;
    real fac3 = -2*EWALD_ALPHA*M_PI*M_PI;
#else
465
    const real recipScaleFactor = RECIP(M_PI)*recipBoxVecX.x*recipBoxVecY.y*recipBoxVecZ.z;
466
#endif
467

468
    mixed energy = 0;
469
    for (int index = GLOBAL_ID; index < gridSize; index += GLOBAL_SIZE) {
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
        // real indices
        int kx = index/(GRID_SIZE_Y*(GRID_SIZE_Z));
        int remainder = index-kx*GRID_SIZE_Y*(GRID_SIZE_Z);
        int ky = remainder/(GRID_SIZE_Z);
        int kz = remainder-ky*(GRID_SIZE_Z);
        int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
        int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
        int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
        real mhx = mx*recipBoxVecX.x;
        real mhy = mx*recipBoxVecY.x+my*recipBoxVecY.y;
        real mhz = mx*recipBoxVecZ.x+my*recipBoxVecZ.y+mz*recipBoxVecZ.z;
        real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
        real bx = pmeBsplineModuliX[kx];
        real by = pmeBsplineModuliY[ky];
        real bz = pmeBsplineModuliZ[kz];
485
486
487
488
489
490
491
#ifdef USE_LJPME
        real denom = recipScaleFactor/(bx*by*bz);
        real m = SQRT(m2);
        real m3 = m*m2;
        real b = bfac*m;
        real expfac = -b*b;
        real expterm = EXP(expfac);
492
        real erfcterm = ERFC(b);
493
494
        real eterm = (fac1*erfcterm*m3 + expterm*(fac2 + fac3*m2)) * denom;
#else
495
496
        real denom = m2*bx*by*bz;
        real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
497
#endif
498
499
500
501
502
503
504
        if (kz >= (GRID_SIZE_Z/2+1)) {
            kx = ((kx == 0) ? kx : GRID_SIZE_X-kx);
            ky = ((ky == 0) ? ky : GRID_SIZE_Y-ky);
            kz = GRID_SIZE_Z-kz;
        } 
        int indexInHalfComplexGrid = kz + ky*(GRID_SIZE_Z/2+1)+kx*(GRID_SIZE_Y*(GRID_SIZE_Z/2+1));
        real2 grid = pmeGrid[indexInHalfComplexGrid];
505
506
507
#ifndef USE_LJPME
        if (kx != 0 || ky != 0 || kz != 0)
#endif
508
            energy += eterm*(grid.x*grid.x + grid.y*grid.y);
509
    }
510
#if defined(USE_PME_STREAM) && !defined(USE_LJPME)
511
    energyBuffer[GLOBAL_ID] = 0.5f*energy;
peastman's avatar
peastman committed
512
#else
513
    energyBuffer[GLOBAL_ID] += 0.5f*energy;
peastman's avatar
peastman committed
514
#endif
515
516
}

517
518
519
#if defined(USE_HIP) && !defined(AMD_RDNA) && !defined(USE_DOUBLE_PRECISION)
LAUNCH_BOUNDS_EXACT(128, 1)
#endif
520
521
522
523
524
KERNEL void gridInterpolateForce(GLOBAL const real4* RESTRICT posq, GLOBAL mm_ulong* RESTRICT forceBuffers, GLOBAL const real* RESTRICT pmeGrid,
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ, GLOBAL const int2* RESTRICT pmeAtomGridIndex,
#ifdef CHARGE_FROM_SIGEPS
        GLOBAL const float2* RESTRICT sigmaEpsilon
525
#else
526
        GLOBAL const real* RESTRICT charges
527
#endif
528
529
530
531
        ) {
    real3 data[PME_ORDER];
    real3 ddata[PME_ORDER];
    const real scale = RECIP((real) (PME_ORDER-1));
532
533
534
535
    
    // Process the atoms in spatially sorted order.  This improves cache performance when loading
    // the grid values.
    
536
    for (int i = GLOBAL_ID; i < NUM_ATOMS; i += GLOBAL_SIZE) {
537
        int atom = pmeAtomGridIndex[i].x;
538
        real3 force = make_real3(0);
539
        real4 pos = posq[atom];
540
        APPLY_PERIODIC_TO_POS(pos)
541
542
543
        real3 t = make_real3(pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x,
                             pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y,
                             pos.z*recipBoxVecZ.z);
544
545
546
        t.x = (t.x-floor(t.x))*GRID_SIZE_X;
        t.y = (t.y-floor(t.y))*GRID_SIZE_Y;
        t.z = (t.z-floor(t.z))*GRID_SIZE_Z;
547
548
549
        int3 gridIndex = make_int3(((int) t.x) % GRID_SIZE_X,
                                   ((int) t.y) % GRID_SIZE_Y,
                                   ((int) t.z) % GRID_SIZE_Z);
550
551
552
553

        // Since we need the full set of thetas, it's faster to compute them here than load them
        // from global memory.

554
555
        real3 dr = make_real3(t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z);
        data[PME_ORDER-1] = make_real3(0);
556
        data[1] = dr;
557
        data[0] = make_real3(1)-dr;
558
        for (int j = 3; j < PME_ORDER; j++) {
559
            real div = RECIP((real) (j-1));
560
561
            data[j-1] = div*dr*data[j-2];
            for (int k = 1; k < (j-1); k++)
562
563
                data[j-k-1] = div*((dr+make_real3(k))*data[j-k-2] + (make_real3(j-k)-dr)*data[j-k-1]);
            data[0] = div*(make_real3(1)-dr)*data[0];
564
565
566
567
568
569
        }
        ddata[0] = -data[0];
        for (int j = 1; j < PME_ORDER; j++)
            ddata[j] = data[j-1]-data[j];
        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
        for (int j = 1; j < (PME_ORDER-1); j++)
570
571
            data[PME_ORDER-j-1] = scale*((dr+make_real3(j))*data[PME_ORDER-j-2] + (make_real3(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
        data[0] = scale*(make_real3(1)-dr)*data[0];
572
573
574

        // Compute the force on this atom.

575
        for (int ix = 0; ix < PME_ORDER; ix++) {
576
577
578
579
580
581
            int xbase = gridIndex.x+ix;
            xbase -= (xbase >= GRID_SIZE_X ? GRID_SIZE_X : 0);
            xbase = xbase*GRID_SIZE_Y*GRID_SIZE_Z;
            real dx = data[ix].x;
            real ddx = ddata[ix].x;
            
582
            for (int iy = 0; iy < PME_ORDER; iy++) {
583
584
585
586
587
588
                int ybase = gridIndex.y+iy;
                ybase -= (ybase >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
                ybase = xbase + ybase*GRID_SIZE_Z;
                real dy = data[iy].y;
                real ddy = ddata[iy].y;
                
589
                for (int iz = 0; iz < PME_ORDER; iz++) {
590
591
                    int zindex = gridIndex.z+iz;
                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
592
                    int index = ybase + zindex;
593
                    real gridvalue = pmeGrid[index];
594
595
596
                    force.x += ddx*dy*data[iz].z*gridvalue;
                    force.y += dx*ddy*data[iz].z*gridvalue;
                    force.z += dx*dy*ddata[iz].z*gridvalue;
597
598
599
                }
            }
        }
600
#ifdef CHARGE_FROM_SIGEPS
601
602
603
        const float2 sigEps = sigmaEpsilon[atom];
        real q = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
#else
604
        real q = CHARGE*EPSILON_FACTOR;
605
#endif
606
607
608
609
        real forceX = -q*(force.x*GRID_SIZE_X*recipBoxVecX.x);
        real forceY = -q*(force.x*GRID_SIZE_X*recipBoxVecY.x+force.y*GRID_SIZE_Y*recipBoxVecY.y);
        real forceZ = -q*(force.x*GRID_SIZE_X*recipBoxVecZ.x+force.y*GRID_SIZE_Y*recipBoxVecZ.y+force.z*GRID_SIZE_Z*recipBoxVecZ.z);
#ifdef USE_PME_STREAM
610
611
612
        ATOMIC_ADD(&forceBuffers[atom], (mm_ulong) realToFixedPoint(forceX));
        ATOMIC_ADD(&forceBuffers[atom+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(forceY));
        ATOMIC_ADD(&forceBuffers[atom+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(forceZ));
613
#else
614
615
616
        forceBuffers[atom] += (mm_ulong) realToFixedPoint(forceX);
        forceBuffers[atom+PADDED_NUM_ATOMS] += (mm_ulong) realToFixedPoint(forceY);
        forceBuffers[atom+2*PADDED_NUM_ATOMS] += (mm_ulong) realToFixedPoint(forceZ);
617
#endif
618
619
    }
}
620

621
622
623
KERNEL void addForces(GLOBAL const real4* RESTRICT forces, GLOBAL mm_long* RESTRICT forceBuffers) {
    for (int atom = GLOBAL_ID; atom < NUM_ATOMS; atom += GLOBAL_SIZE) {
        real4 f = forces[atom];
624
625
626
        forceBuffers[atom] += realToFixedPoint(f.x);
        forceBuffers[atom+PADDED_NUM_ATOMS] += realToFixedPoint(f.y);
        forceBuffers[atom+2*PADDED_NUM_ATOMS] += realToFixedPoint(f.z);
627
    }
628
}
629

630
631
KERNEL void addEnergy(GLOBAL const mixed* RESTRICT pmeEnergyBuffer, GLOBAL mixed* RESTRICT energyBuffer, int bufferSize) {
    for (int i = GLOBAL_ID; i < bufferSize; i += GLOBAL_SIZE)
632
633
        energyBuffer[i] += pmeEnergyBuffer[i];
}