CpuPmeKernels.cpp 31.7 KB
Newer Older
peastman's avatar
peastman committed
1
2
3
4
5
6
7
8
/* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
 * -------------------------------------------------------------------------- *
 * This is part of the OpenMM molecular simulation toolkit originating from   *
 * Simbios, the NIH National Center for Physics-Based Simulation of           *
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
 * Portions copyright (c) 2013-2015 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
 * Permission is hereby granted, free of charge, to any person obtaining a    *
 * copy of this software and associated documentation files (the "Software"), *
 * to deal in the Software without restriction, including without limitation  *
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
 * and/or sell copies of the Software, and to permit persons to whom the      *
 * Software is furnished to do so, subject to the following conditions:       *
 *                                                                            *
 * The above copyright notice and this permission notice shall be included in *
 * all copies or substantial portions of the Software.                        *
 *                                                                            *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
 * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */

32
33
34
#ifdef WIN32
  #define _USE_MATH_DEFINES // Needed to get M_PI
#endif
35
#include "CpuPmeKernels.h"
36
#include "SimTKOpenMMRealType.h"
37
#include "openmm/internal/hardware.h"
38
#include "openmm/internal/vectorize.h"
39
#include "openmm/OpenMMException.h"
40
#include <cmath>
41
#include <algorithm>
peastman's avatar
peastman committed
42
#include <cstring>
43
#include <sstream>
Robert McGibbon's avatar
Robert McGibbon committed
44
#include <cstdlib>
peastman's avatar
peastman committed
45
46
47
48
49
50

using namespace OpenMM;
using namespace std;

// Order of the B-splines used in this file: each particle touches PME_ORDER grid points
// along every axis when charges are spread and when forces are interpolated.
static const int PME_ORDER = 5;

51
52
// Static members shared by every kernel instance. initialize() fills in numThreads and
// sets hasInitializedThreads the first time it runs.
bool CpuCalcPmeReciprocalForceKernel::hasInitializedThreads = false;
int CpuCalcPmeReciprocalForceKernel::numThreads = 0;
53

54
static void spreadCharge(float* posq, float* grid, int gridx, int gridy, int gridz, int numParticles, Vec3* periodicBoxVectors, Vec3* recipBoxVectors, gmx_atomic_t& atomicCounter, const float epsilonFactor) {
55
    float temp[4];
peastman's avatar
peastman committed
56
57
58
59
60
    fvec4 boxSize((float) periodicBoxVectors[0][0], (float) periodicBoxVectors[1][1], (float) periodicBoxVectors[2][2], 0);
    fvec4 invBoxSize((float) recipBoxVectors[0][0], (float) recipBoxVectors[1][1], (float) recipBoxVectors[2][2], 0);
    fvec4 recipBoxVec0((float) recipBoxVectors[0][0], (float) recipBoxVectors[0][1], (float) recipBoxVectors[0][2], 0);
    fvec4 recipBoxVec1((float) recipBoxVectors[1][0], (float) recipBoxVectors[1][1], (float) recipBoxVectors[1][2], 0);
    fvec4 recipBoxVec2((float) recipBoxVectors[2][0], (float) recipBoxVectors[2][1], (float) recipBoxVectors[2][2], 0);
61
62
63
64
    fvec4 gridSize(gridx, gridy, gridz, 0);
    ivec4 gridSizeInt(gridx, gridy, gridz, 0);
    fvec4 one(1);
    fvec4 scale(1.0f/(PME_ORDER-1));
Robert McGibbon's avatar
Robert McGibbon committed
65
    float posInBox[4] = {0,0,0,0};
66
    memset(grid, 0, sizeof(float)*gridx*gridy*gridz);
Robert McGibbon's avatar
Robert McGibbon committed
67

68
69
70
71
72
    while (true) {
        int i = gmx_atomic_fetch_add(&atomicCounter, 1);
        if (i >= numParticles)
            break;

peastman's avatar
peastman committed
73
        // Find the position relative to the nearest grid point.
Robert McGibbon's avatar
Robert McGibbon committed
74

75
        fvec4 pos(&posq[4*i]);
peastman's avatar
peastman committed
76
77
78
        (pos-boxSize*floor(pos*invBoxSize)).store(posInBox);
        fvec4 t = posInBox[0]*recipBoxVec0 + posInBox[1]*recipBoxVec1 + posInBox[2]*recipBoxVec2;
        t = (t-floor(t))*gridSize;
79
80
81
        ivec4 ti = t;
        fvec4 dr = t-ti;
        ivec4 gridIndex = ti-(gridSizeInt&ti==gridSizeInt);
peastman's avatar
peastman committed
82
83
        
        // Compute the B-spline coefficients.
Robert McGibbon's avatar
Robert McGibbon committed
84

85
86
        fvec4 data[PME_ORDER];
        data[PME_ORDER-1] = 0.0f;
peastman's avatar
peastman committed
87
        data[1] = dr;
88
        data[0] = one-dr;
peastman's avatar
peastman committed
89
        for (int j = 3; j < PME_ORDER; j++) {
90
91
            fvec4 div(1.0f/(j-1));
            data[j-1] = div*dr*data[j-2];
peastman's avatar
peastman committed
92
            for (int k = 1; k < j-1; k++)
93
94
                data[j-k-1] = div*((dr+k)*data[j-k-2]+(fvec4(j-k)-dr)*data[j-k-1]);
            data[0] = div*(one-dr)*data[0];
peastman's avatar
peastman committed
95
        }
96
        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
peastman's avatar
peastman committed
97
        for (int j = 1; j < (PME_ORDER-1); j++)
98
99
            data[PME_ORDER-j-1] = scale*((dr+j)*data[PME_ORDER-j-2]+(fvec4(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
        data[0] = scale*(one-dr)*data[0];
peastman's avatar
peastman committed
100
101
102
        
        // Spread the charges.
        
103
104
105
        int gridIndexX = gridIndex[0];
        int gridIndexY = gridIndex[1];
        int gridIndexZ = gridIndex[2];
106
107
        if (gridIndexX < 0)
            return; // This happens when a simulation blows up and coordinates become NaN.
peastman's avatar
peastman committed
108
109
110
111
112
        int zindex[PME_ORDER];
        for (int j = 0; j < PME_ORDER; j++) {
            zindex[j] = gridIndexZ+j;
            zindex[j] -= (zindex[j] >= gridz ? gridz : 0);
        }
peastman's avatar
peastman committed
113
        float charge = epsilonFactor*posq[4*i+3];
114
115
        fvec4 zdata0to3(data[0][2], data[1][2], data[2][2], data[3][2]);
        float zdata4 = data[4][2];
116
117
118
119
120
        if (gridIndexZ+4 < gridz) {
            for (int ix = 0; ix < PME_ORDER; ix++) {
                int xbase = gridIndexX+ix;
                xbase -= (xbase >= gridx ? gridx : 0);
                xbase = xbase*gridy*gridz;
121
                float xdata = charge*data[ix][0];
122
123
124
125
                for (int iy = 0; iy < PME_ORDER; iy++) {
                    int ybase = gridIndexY+iy;
                    ybase -= (ybase >= gridy ? gridy : 0);
                    ybase = xbase + ybase*gridz;
126
127
128
                    float multiplier = xdata*data[iy][1];
                    fvec4 add0to3 = zdata0to3*multiplier;
                    (fvec4(&grid[ybase+gridIndexZ])+add0to3).store(&grid[ybase+gridIndexZ]);
129
130
131
132
133
134
135
136
137
                    grid[ybase+zindex[4]] += multiplier*zdata4;
                }
            }
        }
        else {
            for (int ix = 0; ix < PME_ORDER; ix++) {
                int xbase = gridIndexX+ix;
                xbase -= (xbase >= gridx ? gridx : 0);
                xbase = xbase*gridy*gridz;
138
                float xdata = charge*data[ix][0];
139
140
141
142
                for (int iy = 0; iy < PME_ORDER; iy++) {
                    int ybase = gridIndexY+iy;
                    ybase -= (ybase >= gridy ? gridy : 0);
                    ybase = xbase + ybase*gridz;
143
144
145
                    float multiplier = xdata*data[iy][1];
                    fvec4 add0to3 = zdata0to3*multiplier;
                    add0to3.store(temp);
peastman's avatar
peastman committed
146
147
148
149
                    grid[ybase+zindex[0]] += temp[0];
                    grid[ybase+zindex[1]] += temp[1];
                    grid[ybase+zindex[2]] += temp[2];
                    grid[ybase+zindex[3]] += temp[3];
150
                    grid[ybase+zindex[4]] += multiplier*zdata4;
peastman's avatar
peastman committed
151
152
153
154
155
156
                }
            }
        }
    }
}

157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
/**
 * Fill recipEterm with the reciprocal-space scale factors for the dispersion calculation.
 *
 * Only the slab start <= kx < end is written. Entries are indexed as
 * kx*gridy*(gridz/2+1) + ky*(gridz/2+1) + kz, with kz running over 0..gridz/2.
 *
 * @param start,end           range of kx values handled by this call
 * @param gridx,gridy,gridz   grid dimensions
 * @param recipEterm          output array of scale factors
 * @param alpha               Ewald separation parameter
 * @param bsplineModuli       B-spline moduli for the x, y and z axes
 * @param periodicBoxVectors  periodic box vectors (the diagonal gives the volume factor)
 * @param recipBoxVectors     reciprocal box vectors
 */
static void computeReciprocalDispersionEterm(int start, int end, int gridx, int gridy, int gridz, vector<float>& recipEterm, double alpha, vector<float>* bsplineModuli, Vec3* periodicBoxVectors, Vec3* recipBoxVectors) {
    const unsigned int zsize = gridz/2+1;
    const unsigned int yzsize = gridy*zsize;
    const float scaleFactor = (float)  M_PI*sqrtf(M_PI) / (6.0*periodicBoxVectors[0][0]*periodicBoxVectors[1][1]*periodicBoxVectors[2][2]);

    // Constants that do not depend on the wave vector.

    const float bfac = M_PI / alpha;
    const float fac1 = 2.0f*M_PI*M_PI*M_PI*sqrtf(M_PI);
    const float fac2 = alpha*alpha*alpha;
    const float fac3 = -2.0f*alpha*M_PI*M_PI;

    for (int kx = start; kx < end; kx++) {
        // Map the grid index onto a signed wave number.

        int mx = kx;
        if (kx >= (gridx+1)/2)
            mx = kx-gridx;
        const float mhx = mx*(float)recipBoxVectors[0][0];
        const float modX = bsplineModuli[0][kx];
        for (int ky = 0; ky < gridy; ky++) {
            int my = ky;
            if (ky >= (gridy+1)/2)
                my = ky-gridy;
            const float mhy = mx*(float)recipBoxVectors[1][0] + my*(float)recipBoxVectors[1][1];
            const float mhx2y2 = mhx*mhx + mhy*mhy;
            const float modXY = modX*bsplineModuli[1][ky];
            for (int kz = 0; kz < zsize; kz++) {
                const int index = kx*yzsize + ky*zsize + kz;
                int mz = kz;
                if (kz >= (gridz+1)/2)
                    mz = kz-gridz;
                const float mhz = mx*(float)recipBoxVectors[2][0] + my*(float)recipBoxVectors[2][1] + mz*(float)recipBoxVectors[2][2];
                const float modZ = bsplineModuli[2][kz];
                const float m2 = mhx2y2 + mhz*mhz;
                const float denom = scaleFactor/(modXY*modZ);

                const float m = sqrtf(m2);
                const float m3 = m*m2;
                const float b = bfac*m;
                const float expfac = -b*b;
                const float erfcterm = erfc(b);
                const float expterm = exp(expfac);
                recipEterm[index] = -2.0f*(fac1*erfcterm*m3 + expterm*(fac2 + fac3*m2)) * denom;
            }
        }
    }
}

peastman's avatar
peastman committed
197
static void computeReciprocalEterm(int start, int end, int gridx, int gridy, int gridz, vector<float>& recipEterm, double alpha, vector<float>* bsplineModuli, Vec3* periodicBoxVectors, Vec3* recipBoxVectors) {
198
199
    const unsigned int zsize = gridz/2+1;
    const unsigned int yzsize = gridy*zsize;
peastman's avatar
peastman committed
200
    const float scaleFactor = (float) (M_PI*periodicBoxVectors[0][0]*periodicBoxVectors[1][1]*periodicBoxVectors[2][2]);
201
202
    const float recipExpFactor = (float) (M_PI*M_PI/(alpha*alpha));

203
204
    int firstz = (start == 0 ? 1 : 0);
    for (int kx = start; kx < end; kx++) {
205
        int mx = (kx < (gridx+1)/2) ? kx : kx-gridx;
peastman's avatar
peastman committed
206
        float mhx = mx*(float)recipBoxVectors[0][0];
207
208
209
        float bx = scaleFactor*bsplineModuli[0][kx];
        for (int ky = 0; ky < gridy; ky++) {
            int my = (ky < (gridy+1)/2) ? ky : ky-gridy;
peastman's avatar
peastman committed
210
            float mhy = mx*(float)recipBoxVectors[1][0] + my*(float)recipBoxVectors[1][1];
211
212
            float mhx2y2 = mhx*mhx + mhy*mhy;
            float bxby = bx*bsplineModuli[1][ky];
213
214
            for (int kz = firstz; kz < zsize; kz++) {
                int index = kx*yzsize + ky*zsize + kz;
215
                int mz = (kz < (gridz+1)/2) ? kz : kz-gridz;
peastman's avatar
peastman committed
216
                float mhz = mx*(float)recipBoxVectors[2][0] + my*(float)recipBoxVectors[2][1] + mz*(float)recipBoxVectors[2][2];
217
                float bz = bsplineModuli[2][kz];
218
219
                float m2 = mhx2y2 + mhz*mhz;
                float denom = m2*bxby*bz;
220
221
222
223
224
225
226
                recipEterm[index] = exp(-recipExpFactor*m2)/denom;
            }
            firstz = 0;
        }
    }
}

227
static double reciprocalEnergy(int start, int end, fftwf_complex* grid, int gridx, int gridy, int gridz, double alpha, vector<float>* bsplineModuli, Vec3* periodicBoxVectors, Vec3* recipBoxVectors) {
228
229
    const unsigned int zsizeHalf = gridz/2+1;
    const unsigned int yzsizeHalf = gridy*zsizeHalf;
peastman's avatar
peastman committed
230
    const float scaleFactor = (float) (M_PI*periodicBoxVectors[0][0]*periodicBoxVectors[1][1]*periodicBoxVectors[2][2]);
231
    const float recipExpFactor = (float) (M_PI*M_PI/(alpha*alpha));
232
    double energy = 0.0;
233
234
235

    int firstz = (start == 0 ? 1 : 0);
    for (int kx = start; kx < end; kx++) {
236
        int mx = (kx < (gridx+1)/2) ? kx : kx-gridx;
peastman's avatar
peastman committed
237
        float mhx = mx*(float)recipBoxVectors[0][0];
238
        float bx = scaleFactor*bsplineModuli[0][kx];
239
240
        for (int ky = 0; ky < gridy; ky++) {
            int my = (ky < (gridy+1)/2) ? ky : ky-gridy;
peastman's avatar
peastman committed
241
            float mhy = mx*(float)recipBoxVectors[1][0] + my*(float)recipBoxVectors[1][1];
242
243
            float mhx2y2 = mhx*mhx + mhy*mhy;
            float bxby = bx*bsplineModuli[1][ky];
244
            for (int kz = firstz; kz < gridz; kz++) {
245
                int mz = (kz < (gridz+1)/2) ? kz : kz-gridz;
peastman's avatar
peastman committed
246
                float mhz = mx*(float)recipBoxVectors[2][0] + my*(float)recipBoxVectors[2][1] + mz*(float)recipBoxVectors[2][2];
247
248
249
250
                float bz = bsplineModuli[2][kz];
                float m2 = mhx2y2 + mhz*mhz;
                float denom = m2*bxby*bz;
                float eterm = exp(-recipExpFactor*m2)/denom;
251
252
253
254
255
256
257
258
259
260
261
                int kx1, ky1, kz1;
                if (kz >= gridz/2+1) {
                    kx1 = (kx == 0 ? kx : gridx-kx);
                    ky1 = (ky == 0 ? ky : gridy-ky);
                    kz1 = gridz-kz;
                }
                else {
                    kx1 = kx;
                    ky1 = ky;
                    kz1 = kz;
                }
262
                int index = kx1*yzsizeHalf + ky1*zsizeHalf + kz1;
263
264
265
266
267
268
269
                float gridReal = grid[index][0];
                float gridImag = grid[index][1];
                energy += eterm*(gridReal*gridReal+gridImag*gridImag);
            }
            firstz = 0;
        }
    }
270
    return 0.5*energy;
271
272
}

273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329

/**
 * Compute the contribution of the slab start <= kx < end to the dispersion reciprocal-space energy.
 *
 * grid stores only kz in 0..gridz/2. For larger kz the value is read from the mirrored
 * element (gridx-kx, gridy-ky, gridz-kz); only the squared magnitude is used.
 *
 * @param start,end           range of kx values handled by this call
 * @param grid                Fourier-transformed grid, gridx*gridy*(gridz/2+1) complex values
 * @param gridx,gridy,gridz   grid dimensions
 * @param alpha               Ewald separation parameter
 * @param bsplineModuli       B-spline moduli for the x, y and z axes
 * @param periodicBoxVectors  periodic box vectors (the diagonal gives the volume factor)
 * @param recipBoxVectors     reciprocal box vectors
 * @return the negated sum of eterm*|grid|^2 over the wave vectors in the slab
 */
static double reciprocalDispersionEnergy(int start, int end, fftwf_complex* grid, int gridx, int gridy, int gridz, double alpha, vector<float>* bsplineModuli, Vec3* periodicBoxVectors, Vec3* recipBoxVectors) {
    const unsigned int zsizeHalf = gridz/2+1;
    const unsigned int yzsizeHalf = gridy*zsizeHalf;
    const float scaleFactor = (float)  M_PI*sqrtf(M_PI) / (6.0*periodicBoxVectors[0][0]*periodicBoxVectors[1][1]*periodicBoxVectors[2][2]);

    // Constants that do not depend on the wave vector.

    const float bfac = M_PI / alpha;
    const float fac1 = 2.0f*M_PI*M_PI*M_PI*sqrtf(M_PI);
    const float fac2 = alpha*alpha*alpha;
    const float fac3 = -2.0f*alpha*M_PI*M_PI;
    double total = 0.0;

    for (int kx = start; kx < end; kx++) {
        int mx = kx;
        if (kx >= (gridx+1)/2)
            mx = kx-gridx;
        const float mhx = mx*(float)recipBoxVectors[0][0];
        const float modX = bsplineModuli[0][kx];
        for (int ky = 0; ky < gridy; ky++) {
            int my = ky;
            if (ky >= (gridy+1)/2)
                my = ky-gridy;
            const float mhy = mx*(float)recipBoxVectors[1][0] + my*(float)recipBoxVectors[1][1];
            const float mhx2y2 = mhx*mhx + mhy*mhy;
            const float modXY = modX*bsplineModuli[1][ky];
            for (int kz = 0; kz < gridz; kz++) {
                int mz = kz;
                if (kz >= (gridz+1)/2)
                    mz = kz-gridz;
                const float mhz = mx*(float)recipBoxVectors[2][0] + my*(float)recipBoxVectors[2][1] + mz*(float)recipBoxVectors[2][2];
                const float modZ = bsplineModuli[2][kz];
                const float m2 = mhx2y2 + mhz*mhz;
                const float denom = scaleFactor/(modXY*modZ);
                const float m = sqrtf(m2);
                const float m3 = m*m2;
                const float b = bfac*m;
                const float expfac = -b*b;
                const float erfcterm = erfc(b);
                const float expterm = exp(expfac);
                const float eterm = (fac1*erfcterm*m3 + expterm*(fac2 + fac3*m2)) * denom;

                // Locate the stored element that corresponds to (kx, ky, kz).

                int srcX = kx;
                int srcY = ky;
                int srcZ = kz;
                if (kz >= gridz/2+1) {
                    if (kx != 0)
                        srcX = gridx-kx;
                    if (ky != 0)
                        srcY = gridy-ky;
                    srcZ = gridz-kz;
                }
                const int index = srcX*yzsizeHalf + srcY*zsizeHalf + srcZ;
                const float gridReal = grid[index][0];
                const float gridImag = grid[index][1];
                total += eterm*(gridReal*gridReal+gridImag*gridImag);
            }
        }
    }
    return -total;
}

330
331
332
333
334
/**
 * Multiply the complex grid elements start <= index < end by the matching entries of recipEterm.
 *
 * @param start,end   range of flat grid indices handled by this call
 * @param grid        complex grid, scaled in place (real and imaginary parts alike)
 * @param recipEterm  per-element scale factors, indexed like grid
 */
static void reciprocalConvolution(int start, int end, fftwf_complex* grid, vector<float>& recipEterm) {
    for (int index = start; index < end; index++) {
        float eterm = recipEterm[index];
        grid[index][0] *= eterm;
        grid[index][1] *= eterm;
    }
}

338
static void interpolateForces(float* posq, float* force, float* grid, int gridx, int gridy, int gridz, int numParticles, Vec3* periodicBoxVectors, Vec3* recipBoxVectors, gmx_atomic_t& atomicCounter, const float epsilonFactor) {
peastman's avatar
peastman committed
339
340
341
342
343
    fvec4 boxSize((float) periodicBoxVectors[0][0], (float) periodicBoxVectors[1][1], (float) periodicBoxVectors[2][2], 0);
    fvec4 invBoxSize((float) recipBoxVectors[0][0], (float) recipBoxVectors[1][1], (float) recipBoxVectors[2][2], 0);
    fvec4 recipBoxVec0((float) recipBoxVectors[0][0], (float) recipBoxVectors[0][1], (float) recipBoxVectors[0][2], 0);
    fvec4 recipBoxVec1((float) recipBoxVectors[1][0], (float) recipBoxVectors[1][1], (float) recipBoxVectors[1][2], 0);
    fvec4 recipBoxVec2((float) recipBoxVectors[2][0], (float) recipBoxVectors[2][1], (float) recipBoxVectors[2][2], 0);
344
345
346
347
    fvec4 gridSize(gridx, gridy, gridz, 0);
    ivec4 gridSizeInt(gridx, gridy, gridz, 0);
    fvec4 one(1);
    fvec4 scale(1.0f/(PME_ORDER-1));
348
349
350
351
352
    while (true) {
        int i = gmx_atomic_fetch_add(&atomicCounter, 1);
        if (i >= numParticles)
            break;

353
354
        // Find the position relative to the nearest grid point.
        
355
        fvec4 pos(&posq[4*i]);
peastman's avatar
peastman committed
356
357
358
359
        float posInBox[4];
        (pos-boxSize*floor(pos*invBoxSize)).store(posInBox);
        fvec4 t = posInBox[0]*recipBoxVec0 + posInBox[1]*recipBoxVec1 + posInBox[2]*recipBoxVec2;
        t = (t-floor(t))*gridSize;
360
361
362
        ivec4 ti = t;
        fvec4 dr = t-ti;
        ivec4 gridIndex = ti-(gridSizeInt&ti==gridSizeInt);
363
364
365
        
        // Compute the B-spline coefficients.
        
366
367
368
        fvec4 data[PME_ORDER];
        fvec4 ddata[PME_ORDER];
        data[PME_ORDER-1] = 0.0f;
369
        data[1] = dr;
370
        data[0] = one-dr;
371
        for (int j = 3; j < PME_ORDER; j++) {
372
373
            fvec4 div(1.0f/(j-1));
            data[j-1] = div*dr*data[j-2];
374
            for (int k = 1; k < j-1; k++)
375
376
                data[j-k-1] = div*((dr+k)*data[j-k-2]+(fvec4(j-k)-dr)*data[j-k-1]);
            data[0] = div*(one-dr)*data[0];
377
        }
378
        ddata[0] = -data[0];
379
        for (int j = 1; j < PME_ORDER; j++)
380
381
            ddata[j] = data[j-1]-data[j];
        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
382
        for (int j = 1; j < (PME_ORDER-1); j++)
383
384
            data[PME_ORDER-j-1] = scale*((dr+j)*data[PME_ORDER-j-2]+(fvec4(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
        data[0] = scale*(one-dr)*data[0];
385
                
386
387
        // Compute the force on this atom.
        
388
389
390
        int gridIndexX = gridIndex[0];
        int gridIndexY = gridIndex[1];
        int gridIndexZ = gridIndex[2];
391
392
        if (gridIndexX < 0)
            return; // This happens when a simulation blows up and coordinates become NaN.
peastman's avatar
peastman committed
393
394
395
396
397
        int zindex[PME_ORDER];
        for (int j = 0; j < PME_ORDER; j++) {
            zindex[j] = gridIndexZ+j;
            zindex[j] -= (zindex[j] >= gridz ? gridz : 0);
        }
398
        fvec4 zdata[PME_ORDER];
399
        for (int j = 0; j < PME_ORDER; j++)
400
401
            zdata[j] = fvec4(data[j][2], data[j][2], ddata[j][2], 0);
        fvec4 f = 0.0f;
402
403
404
405
        for (int ix = 0; ix < PME_ORDER; ix++) {
            int xbase = gridIndexX+ix;
            xbase -= (xbase >= gridx ? gridx : 0);
            xbase = xbase*gridy*gridz;
406
407
408
            float dx = data[ix][0];
            float ddx = ddata[ix][0];
            fvec4 xdata(ddx, dx, dx, 0);
409
410
411
412
413

            for (int iy = 0; iy < PME_ORDER; iy++) {
                int ybase = gridIndexY+iy;
                ybase -= (ybase >= gridy ? gridy : 0);
                ybase = xbase + ybase*gridz;
414
415
416
                float dy = data[iy][1];
                float ddy = ddata[iy][1];
                fvec4 xydata = xdata*fvec4(dy, ddy, dy, 0);
417
418

                for (int iz = 0; iz < PME_ORDER; iz++) {
419
420
                    fvec4 gridValue(grid[ybase+zindex[iz]]);
                    f = f+xydata*zdata[iz]*gridValue;
421
422
423
                }
            }
        }
peastman's avatar
peastman committed
424
425
426
427
428
429
        f *= -epsilonFactor*posq[4*i+3];
        float fc[4];
        f.store(fc);
        force[4*i+0] = fc[0]*gridx*(float)recipBoxVectors[0][0];
        force[4*i+1] = fc[0]*gridx*(float)recipBoxVectors[1][0]+fc[1]*gridy*(float)recipBoxVectors[1][1];
        force[4*i+2] = fc[0]*gridx*(float)recipBoxVectors[2][0]+fc[1]*gridy*(float)recipBoxVectors[2][1]+fc[2]*gridz*(float)recipBoxVectors[2][2];
430
431
432
    }
}

433
/**
 * Thread pool task that forwards each pool thread to the owning kernel's runWorkerThread().
 */
class CpuCalcPmeReciprocalForceKernel::ComputeTask : public ThreadPool::Task {
public:
    ComputeTask(CpuCalcPmeReciprocalForceKernel& owner) : owner(owner) {
    }
    void execute(ThreadPool& threads, int threadIndex) {
        owner.runWorkerThread(threads, threadIndex);
    }
    // The kernel whose work this task performs.
    CpuCalcPmeReciprocalForceKernel& owner;
};

static void* threadBody(void* args) {
444
445
    CpuCalcPmeReciprocalForceKernel& owner = *reinterpret_cast<CpuCalcPmeReciprocalForceKernel*>(args);
    owner.runMainThread();
446
447
448
    return 0;
}

449
void CpuCalcPmeReciprocalForceKernel::initialize(int xsize, int ysize, int zsize, int numParticles, double alpha) {
450
451
    if (!hasInitializedThreads) {
        numThreads = getNumProcessors();
452
453
454
        char* threadsEnv = getenv("OPENMM_CPU_THREADS");
        if (threadsEnv != NULL)
            stringstream(threadsEnv) >> numThreads;
455
456
457
        fftwf_init_threads();
        hasInitializedThreads = true;
    }
458
    threadEnergy.resize(numThreads);
peastman's avatar
peastman committed
459
460
461
    gridx = findFFTDimension(xsize, false);
    gridy = findFFTDimension(ysize, false);
    gridz = findFFTDimension(zsize, true);
462
463
464
    this->numParticles = numParticles;
    this->alpha = alpha;
    force.resize(4*numParticles);
465
    recipEterm.resize(gridx*gridy*gridz);
466
    
467
468
    // Initialize threads.
    
469
    isFinished = false;
470
471
472
    pthread_cond_init(&startCondition, NULL);
    pthread_cond_init(&endCondition, NULL);
    pthread_mutex_init(&lock, NULL);
473
    pthread_create(&mainThread, NULL, threadBody, this);
474
    
475
476
477
    // Wait until the main thread is up and running.
    
    pthread_mutex_lock(&lock);
478
479
    while (!isFinished)
        pthread_cond_wait(&endCondition, &lock);
480
481
    pthread_mutex_unlock(&lock);
    
482
483
    // Initialize FFTW.
    
484
485
486
    for (int i = 0; i < numThreads; i++)
        tempGrid.push_back((float*) fftwf_malloc(sizeof(float)*(gridx*gridy*gridz+3)));
    realGrid = tempGrid[0];
487
488
489
490
491
492
493
494
    complexGrid = (fftwf_complex*) fftwf_malloc(sizeof(fftwf_complex)*gridx*gridy*(gridz/2+1));
    fftwf_plan_with_nthreads(numThreads);
    forwardFFT = fftwf_plan_dft_r2c_3d(gridx, gridy, gridz, realGrid, complexGrid, FFTW_MEASURE);
    backwardFFT = fftwf_plan_dft_c2r_3d(gridx, gridy, gridz, complexGrid, realGrid, FFTW_MEASURE);
    hasCreatedPlan = true;
    
    // Initialize the b-spline moduli.

495
    int maxSize = std::max(std::max(gridx, gridy), gridz);
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
    vector<double> data(PME_ORDER);
    vector<double> ddata(PME_ORDER);
    vector<double> bsplinesData(maxSize);
    data[PME_ORDER-1] = 0.0;
    data[1] = 0.0;
    data[0] = 1.0;
    for (int i = 3; i < PME_ORDER; i++) {
        double div = 1.0/(i-1.0);
        data[i-1] = 0.0;
        for (int j = 1; j < (i-1); j++)
            data[i-j-1] = div*(j*data[i-j-2]+(i-j)*data[i-j-1]);
        data[0] = div*data[0];
    }

    // Differentiate.

    ddata[0] = -data[0];
    for (int i = 1; i < PME_ORDER; i++)
        ddata[i] = data[i-1]-data[i];
    double div = 1.0/(PME_ORDER-1);
    data[PME_ORDER-1] = 0.0;
    for (int i = 1; i < (PME_ORDER-1); i++)
        data[PME_ORDER-i-1] = div*(i*data[PME_ORDER-i-2]+(PME_ORDER-i)*data[PME_ORDER-i-1]);
    data[0] = div*data[0];
    for (int i = 0; i < maxSize; i++)
        bsplinesData[i] = 0.0;
    for (int i = 1; i <= PME_ORDER; i++)
        bsplinesData[i] = data[i-1];

    // Evaluate the actual bspline moduli for X/Y/Z.

    bsplineModuli[0].resize(gridx);
    bsplineModuli[1].resize(gridy);
    bsplineModuli[2].resize(gridz);
    for (int dim = 0; dim < 3; dim++) {
        int ndata = bsplineModuli[dim].size();
        vector<float>& moduli = bsplineModuli[dim];
        for (int i = 0; i < ndata; i++) {
            double sc = 0.0;
            double ss = 0.0;
            for (int j = 0; j < ndata; j++) {
                double arg = (2.0*M_PI*i*j)/ndata;
                sc += bsplinesData[j]*cos(arg);
                ss += bsplinesData[j]*sin(arg);
            }
            moduli[i] = (float) (sc*sc+ss*ss);
        }
        for (int i = 0; i < ndata; i++)
            if (moduli[i] < 1.0e-7f)
                moduli[i] = (moduli[i-1]+moduli[i+1])*0.5f;
    }
}

549
/**
 * Shut down the coordinating thread, then release the synchronization objects, the grids
 * and the FFTW plans.
 */
CpuCalcPmeReciprocalForceKernel::~CpuCalcPmeReciprocalForceKernel() {
    // Set the shutdown flag while holding the lock: the coordinating thread reads it
    // under the same lock right after waking from startCondition, so writing it outside
    // the lock would be an unsynchronized access.

    pthread_mutex_lock(&lock);
    isDeleted = true;
    pthread_cond_broadcast(&startCondition);
    pthread_mutex_unlock(&lock);
    pthread_join(mainThread, NULL);
    pthread_mutex_destroy(&lock);
    pthread_cond_destroy(&startCondition);
    pthread_cond_destroy(&endCondition);
    for (int i = 0; i < (int) tempGrid.size(); i++)
        fftwf_free(tempGrid[i]);
    if (complexGrid != NULL)
        fftwf_free(complexGrid);
    if (hasCreatedPlan) {
        fftwf_destroy_plan(forwardFFT);
        fftwf_destroy_plan(backwardFFT);
    }
}

568
569
void CpuCalcPmeReciprocalForceKernel::runMainThread() {
    // This is the main thread that coordinates all the other ones.
570
571

    pthread_mutex_lock(&lock);
572
    isFinished = true;
573
    pthread_cond_signal(&endCondition);
574
575
576
577
578
579
580
581
582
    ThreadPool threads(numThreads);
    while (true) {
        // Wait for the signal to start.

        pthread_cond_wait(&startCondition, &lock);
        if (isDeleted)
            break;
        posq = io->getPosq();
        ComputeTask task(*this);
583
        gmx_atomic_set(&atomicCounter, 0);
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
        threads.execute(task); // Signal threads to perform charge spreading.
        threads.waitForThreads();
        threads.resumeThreads(); // Signal threads to sum the charge grids.
        threads.waitForThreads();
        fftwf_execute_dft_r2c(forwardFFT, realGrid, complexGrid);
        if (lastBoxVectors[0] != periodicBoxVectors[0] || lastBoxVectors[1] != periodicBoxVectors[1] || lastBoxVectors[2] != periodicBoxVectors[2]) {
            threads.resumeThreads(); // Signal threads to compute the reciprocal scale factors.
            threads.waitForThreads();
        }
        if (includeEnergy) {
            threads.resumeThreads(); // Signal threads to compute energy.
            threads.waitForThreads();
            for (int i = 0; i < (int) threadEnergy.size(); i++)
                energy += threadEnergy[i];
        }
        threads.resumeThreads(); // Signal threads to perform reciprocal convolution.
        threads.waitForThreads();
        fftwf_execute_dft_c2r(backwardFFT, complexGrid, realGrid);
602
        gmx_atomic_set(&atomicCounter, 0);
603
604
605
606
607
608
609
610
        threads.resumeThreads(); // Signal threads to interpolate forces.
        threads.waitForThreads();
        isFinished = true;
        lastBoxVectors[0] = periodicBoxVectors[0];
        lastBoxVectors[1] = periodicBoxVectors[1];
        lastBoxVectors[2] = periodicBoxVectors[2];
        pthread_cond_signal(&endCondition);
    }
611
612
613
    pthread_mutex_unlock(&lock);
}

614
615
616
617
618
619
/**
 * Perform one worker's share of a reciprocal space calculation.
 *
 * The work is split into phases separated by calls to threads.syncThreads():
 * charge spreading, summation of the per-thread charge grids, optional
 * recomputation of the reciprocal scale factors, optional energy evaluation,
 * the reciprocal convolution, and finally force interpolation.
 *
 * @param threads    the thread pool this worker synchronizes through between phases
 * @param index      the index of this worker; it selects the worker's scratch grid,
 *                   its energy slot, and the slice of each grid it processes
 * @throws OpenMMException if calculationType is neither Electrostatic nor Dispersion
 */
void CpuCalcPmeReciprocalForceKernel::runWorkerThread(ThreadPool& threads, int index) {
    // Range along the x axis of the grid handled by this worker.

    int xBegin = (index*gridx)/numThreads;
    int xEnd = ((index+1)*gridx)/numThreads;

    // Range of real grid elements handled by this worker.  The grid is divided in
    // blocks of four elements, since it is summed four elements at a time below.

    int numBlocks = (gridx*gridy*gridz+3)/4;
    int elementBegin = 4*((index*numBlocks)/numThreads);
    int elementEnd = 4*(((index+1)*numBlocks)/numThreads);

    // Range of complex grid elements handled by this worker.

    int numComplex = gridx*gridy*(gridz/2+1);
    int complexShareBegin = (index*numComplex)/numThreads;
    int complexEnd = ((index+1)*numComplex)/numThreads;

    const float epsilonFactor = calculationType != Electrostatic ? 1.0f : sqrt(ONE_4PI_EPS0);

    // Spread the charges onto this worker's own grid.

    spreadCharge(posq, tempGrid[index], gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors, atomicCounter, epsilonFactor);
    threads.syncThreads();

    // Add grids 1 and up into realGrid over this worker's range of elements.

    int gridCount = tempGrid.size();
    for (int offset = elementBegin; offset < elementEnd; offset += 4) {
        fvec4 total(&realGrid[offset]);
        int grid = 1;
        while (grid < gridCount) {
            total += fvec4(&tempGrid[grid][offset]);
            grid++;
        }
        total.store(&realGrid[offset]);
    }
    threads.syncThreads();

    // The scale factors only need to be recomputed when the box differs from the one
    // used last time.  This test and includeEnergy depend only on members of the
    // kernel, not on the worker index, so every worker takes the same branches and
    // makes the same sequence of syncThreads() calls.

    const bool boxChanged = (lastBoxVectors[0] != periodicBoxVectors[0] || lastBoxVectors[1] != periodicBoxVectors[1] || lastBoxVectors[2] != periodicBoxVectors[2]);

    if (calculationType == Electrostatic) {
        if (boxChanged) {
            computeReciprocalEterm(xBegin, xEnd, gridx, gridy, gridz, recipEterm, alpha, bsplineModuli, periodicBoxVectors, recipBoxVectors);
            threads.syncThreads();
        }
        if (includeEnergy) {
            threadEnergy[index] = reciprocalEnergy(xBegin, xEnd, complexGrid, gridx, gridy, gridz, alpha, bsplineModuli, periodicBoxVectors, recipBoxVectors);
            threads.syncThreads();
        }

        // Element 0 of the complex grid is never convolved for electrostatics.

        int complexBegin = std::max(1, complexShareBegin);
        reciprocalConvolution(complexBegin, complexEnd, complexGrid, recipEterm);
        threads.syncThreads();
    }
    else if (calculationType == Dispersion) {
        if (boxChanged) {
            computeReciprocalDispersionEterm(xBegin, xEnd, gridx, gridy, gridz, recipEterm, alpha, bsplineModuli, periodicBoxVectors, recipBoxVectors);
            threads.syncThreads();
        }
        if (includeEnergy) {
            threadEnergy[index] = reciprocalDispersionEnergy(xBegin, xEnd, complexGrid, gridx, gridy, gridz, alpha, bsplineModuli, periodicBoxVectors, recipBoxVectors);
            threads.syncThreads();
        }

        // For dispersion the {0,0,0} term is included, so the range may start at element 0.

        int complexBegin = std::max(0, complexShareBegin);
        reciprocalConvolution(complexBegin, complexEnd, complexGrid, recipEterm);
        threads.syncThreads();
    }
    else {
        throw OpenMMException("Unimplemented convolution type");
    }

    // Interpolate the forces from the real grid.

    interpolateForces(posq, &force[0], realGrid, gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors, atomicCounter, epsilonFactor);
}

peastman's avatar
peastman committed
668
/**
 * Begin a reciprocal space calculation.  This records the inputs, computes the
 * reciprocal box vectors, and signals startCondition, then returns without
 * waiting for the calculation to complete.
 *
 * @param io                  the object through which positions are read and forces are returned
 * @param periodicBoxVectors  the three periodic box vectors to use for this calculation
 * @param includeEnergy       true if the energy should be computed in addition to the forces
 */
void CpuCalcPmeReciprocalForceKernel::beginComputation(IO& io, const Vec3* periodicBoxVectors, bool includeEnergy) {
    // Record the inputs and clear the energy accumulated by any previous calculation.

    this->io = &io;
    this->includeEnergy = includeEnergy;
    energy = 0.0;
    for (int i = 0; i < 3; i++)
        this->periodicBoxVectors[i] = periodicBoxVectors[i];

    // Invert the box vectors.  Only the components on or below the diagonal are read,
    // so this presumably relies on the box vectors being in lower triangular form.

    const Vec3& a = periodicBoxVectors[0];
    const Vec3& b = periodicBoxVectors[1];
    const Vec3& c = periodicBoxVectors[2];
    const double det = a[0]*b[1]*c[2];
    const double invDet = 1.0/det;
    recipBoxVectors[0] = Vec3(b[1]*c[2], 0, 0)*invDet;
    recipBoxVectors[1] = Vec3(-b[0]*c[2], a[0]*c[2], 0)*invDet;
    recipBoxVectors[2] = Vec3(b[0]*c[1]-b[1]*c[0], -a[0]*c[1], a[0]*b[1])*invDet;

    // Mark the calculation as unfinished and signal that it should start.

    pthread_mutex_lock(&lock);
    isFinished = false;
    pthread_cond_signal(&startCondition);
    pthread_mutex_unlock(&lock);
}

692
/**
 * Block until the calculation is marked as finished, then report its results.
 *
 * @param io    the object that receives the computed forces through setForce()
 * @return the value of the energy member once the calculation has finished
 */
double CpuCalcPmeReciprocalForceKernel::finishComputation(IO& io) {
    pthread_mutex_lock(&lock);
    for (;;) {
        // Recheck the flag after every wakeup, since being woken does not by itself
        // guarantee that the calculation is done.

        if (isFinished)
            break;
        pthread_cond_wait(&endCondition, &lock);
    }
    pthread_mutex_unlock(&lock);

    io.setForce(&force[0]);
    return energy;
}
701
702

bool CpuCalcPmeReciprocalForceKernel::isProcessorSupported() {
703
    return isVec4Supported();
704
}
705

706
707
708
709
710
711
712
/**
 * Get the parameters being used for PME.
 *
 * @param alpha   on exit, the value of this kernel's alpha member
 * @param nx      on exit, the grid size along the x axis
 * @param ny      on exit, the grid size along the y axis
 * @param nz      on exit, the grid size along the z axis
 */
void CpuCalcPmeReciprocalForceKernel::getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
    nx = this->gridx;
    ny = this->gridy;
    nz = this->gridz;
    // The parameter hides the member of the same name, so the member must be qualified.
    alpha = this->alpha;
}

peastman's avatar
peastman committed
713
/**
 * Select a size for one dimension of the FFT grid.
 *
 * The result is the smallest value that is no less than the requested minimum and
 * that, once every factor of 2, 3, 5 and 7 has been divided out, leaves 1, 11 or 13.
 *
 * @param minimum   the smallest acceptable size; values below 1 yield 1
 * @param isZ       true if this is the last dimension, in which case odd sizes are skipped
 * @return the selected grid size
 */
int CpuCalcPmeReciprocalForceKernel::findFFTDimension(int minimum, bool isZ) {
    if (minimum < 1)
        return 1;

    // Only the primes need to be tried: once every 2 and 3 has been divided out,
    // neither 4 nor 6 can divide what is left.

    static const int smallPrimes[] = {2, 3, 5, 7};
    static const int numSmallPrimes = 4;

    for (int candidate = minimum; ; candidate++) {
        // Force the last dimension to be even, since this produces better performance in FFTW.

        if (isZ && candidate%2 == 1)
            continue;

        // Attempt to factor the current value.

        int remaining = candidate;
        for (int i = 0; i < numSmallPrimes; i++) {
            const int prime = smallPrimes[i];
            while (remaining > 1 && remaining%prime == 0)
                remaining /= prime;
        }
        if (remaining == 1 || remaining == 11 || remaining == 13)
            return candidate;
    }
}