CpuPmeKernels.cpp 39.8 KB
Newer Older
peastman's avatar
peastman committed
1
2
3
4
5
6
7
8
/* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
 * -------------------------------------------------------------------------- *
 * This is part of the OpenMM molecular simulation toolkit originating from   *
 * Simbios, the NIH National Center for Physics-Based Simulation of           *
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
peastman's avatar
peastman committed
9
 * Portions copyright (c) 2013-2017 Stanford University and the Authors.      *
peastman's avatar
peastman committed
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
 * Permission is hereby granted, free of charge, to any person obtaining a    *
 * copy of this software and associated documentation files (the "Software"), *
 * to deal in the Software without restriction, including without limitation  *
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
 * and/or sell copies of the Software, and to permit persons to whom the      *
 * Software is furnished to do so, subject to the following conditions:       *
 *                                                                            *
 * The above copyright notice and this permission notice shall be included in *
 * all copies or substantial portions of the Software.                        *
 *                                                                            *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
 * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */

32
33
34
#ifdef WIN32
  #define _USE_MATH_DEFINES // Needed to get M_PI
#endif
35
#include "CpuPmeKernels.h"
36
#include "SimTKOpenMMRealType.h"
37
#include "openmm/internal/hardware.h"
38
#include "openmm/internal/vectorize.h"
39
#include "openmm/OpenMMException.h"
40
#include <cmath>
41
#include <algorithm>
peastman's avatar
peastman committed
42
#include <cstring>
43
#include <sstream>
Robert McGibbon's avatar
Robert McGibbon committed
44
#include <cstdlib>
peastman's avatar
peastman committed
45
46
47
48
49
50

using namespace OpenMM;
using namespace std;

static const int PME_ORDER = 5;

51
52
bool CpuCalcDispersionPmeReciprocalForceKernel::hasInitializedThreads = false;
int CpuCalcDispersionPmeReciprocalForceKernel::numThreads = 0;
53

54
static void spreadCharge(float* posq, float* grid, int gridx, int gridy, int gridz, int numParticles, Vec3* periodicBoxVectors, Vec3* recipBoxVectors, gmx_atomic_t& atomicCounter, const float epsilonFactor) {
55
    float temp[4];
peastman's avatar
peastman committed
56
57
58
59
60
    fvec4 boxSize((float) periodicBoxVectors[0][0], (float) periodicBoxVectors[1][1], (float) periodicBoxVectors[2][2], 0);
    fvec4 invBoxSize((float) recipBoxVectors[0][0], (float) recipBoxVectors[1][1], (float) recipBoxVectors[2][2], 0);
    fvec4 recipBoxVec0((float) recipBoxVectors[0][0], (float) recipBoxVectors[0][1], (float) recipBoxVectors[0][2], 0);
    fvec4 recipBoxVec1((float) recipBoxVectors[1][0], (float) recipBoxVectors[1][1], (float) recipBoxVectors[1][2], 0);
    fvec4 recipBoxVec2((float) recipBoxVectors[2][0], (float) recipBoxVectors[2][1], (float) recipBoxVectors[2][2], 0);
61
62
63
64
    fvec4 gridSize(gridx, gridy, gridz, 0);
    ivec4 gridSizeInt(gridx, gridy, gridz, 0);
    fvec4 one(1);
    fvec4 scale(1.0f/(PME_ORDER-1));
Robert McGibbon's avatar
Robert McGibbon committed
65
    float posInBox[4] = {0,0,0,0};
66
    memset(grid, 0, sizeof(float)*gridx*gridy*gridz);
Robert McGibbon's avatar
Robert McGibbon committed
67

68
69
70
71
72
    while (true) {
        int i = gmx_atomic_fetch_add(&atomicCounter, 1);
        if (i >= numParticles)
            break;

peastman's avatar
peastman committed
73
        // Find the position relative to the nearest grid point.
Robert McGibbon's avatar
Robert McGibbon committed
74

75
        fvec4 pos(&posq[4*i]);
peastman's avatar
peastman committed
76
77
78
        (pos-boxSize*floor(pos*invBoxSize)).store(posInBox);
        fvec4 t = posInBox[0]*recipBoxVec0 + posInBox[1]*recipBoxVec1 + posInBox[2]*recipBoxVec2;
        t = (t-floor(t))*gridSize;
79
80
81
        ivec4 ti = t;
        fvec4 dr = t-ti;
        ivec4 gridIndex = ti-(gridSizeInt&ti==gridSizeInt);
peastman's avatar
peastman committed
82
83
        
        // Compute the B-spline coefficients.
Robert McGibbon's avatar
Robert McGibbon committed
84

85
86
        fvec4 data[PME_ORDER];
        data[PME_ORDER-1] = 0.0f;
peastman's avatar
peastman committed
87
        data[1] = dr;
88
        data[0] = one-dr;
peastman's avatar
peastman committed
89
        for (int j = 3; j < PME_ORDER; j++) {
90
91
            fvec4 div(1.0f/(j-1));
            data[j-1] = div*dr*data[j-2];
peastman's avatar
peastman committed
92
            for (int k = 1; k < j-1; k++)
93
94
                data[j-k-1] = div*((dr+k)*data[j-k-2]+(fvec4(j-k)-dr)*data[j-k-1]);
            data[0] = div*(one-dr)*data[0];
peastman's avatar
peastman committed
95
        }
96
        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
peastman's avatar
peastman committed
97
        for (int j = 1; j < (PME_ORDER-1); j++)
98
99
            data[PME_ORDER-j-1] = scale*((dr+j)*data[PME_ORDER-j-2]+(fvec4(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
        data[0] = scale*(one-dr)*data[0];
peastman's avatar
peastman committed
100
101
102
        
        // Spread the charges.
        
103
104
105
        int gridIndexX = gridIndex[0];
        int gridIndexY = gridIndex[1];
        int gridIndexZ = gridIndex[2];
106
107
        if (gridIndexX < 0)
            return; // This happens when a simulation blows up and coordinates become NaN.
peastman's avatar
peastman committed
108
109
110
111
112
        int zindex[PME_ORDER];
        for (int j = 0; j < PME_ORDER; j++) {
            zindex[j] = gridIndexZ+j;
            zindex[j] -= (zindex[j] >= gridz ? gridz : 0);
        }
peastman's avatar
peastman committed
113
        float charge = epsilonFactor*posq[4*i+3];
114
115
        fvec4 zdata0to3(data[0][2], data[1][2], data[2][2], data[3][2]);
        float zdata4 = data[4][2];
116
117
118
119
120
        if (gridIndexZ+4 < gridz) {
            for (int ix = 0; ix < PME_ORDER; ix++) {
                int xbase = gridIndexX+ix;
                xbase -= (xbase >= gridx ? gridx : 0);
                xbase = xbase*gridy*gridz;
121
                float xdata = charge*data[ix][0];
122
123
124
125
                for (int iy = 0; iy < PME_ORDER; iy++) {
                    int ybase = gridIndexY+iy;
                    ybase -= (ybase >= gridy ? gridy : 0);
                    ybase = xbase + ybase*gridz;
126
127
128
                    float multiplier = xdata*data[iy][1];
                    fvec4 add0to3 = zdata0to3*multiplier;
                    (fvec4(&grid[ybase+gridIndexZ])+add0to3).store(&grid[ybase+gridIndexZ]);
129
130
131
132
133
134
135
136
137
                    grid[ybase+zindex[4]] += multiplier*zdata4;
                }
            }
        }
        else {
            for (int ix = 0; ix < PME_ORDER; ix++) {
                int xbase = gridIndexX+ix;
                xbase -= (xbase >= gridx ? gridx : 0);
                xbase = xbase*gridy*gridz;
138
                float xdata = charge*data[ix][0];
139
140
141
142
                for (int iy = 0; iy < PME_ORDER; iy++) {
                    int ybase = gridIndexY+iy;
                    ybase -= (ybase >= gridy ? gridy : 0);
                    ybase = xbase + ybase*gridz;
143
144
145
                    float multiplier = xdata*data[iy][1];
                    fvec4 add0to3 = zdata0to3*multiplier;
                    add0to3.store(temp);
peastman's avatar
peastman committed
146
147
148
149
                    grid[ybase+zindex[0]] += temp[0];
                    grid[ybase+zindex[1]] += temp[1];
                    grid[ybase+zindex[2]] += temp[2];
                    grid[ybase+zindex[3]] += temp[3];
150
                    grid[ybase+zindex[4]] += multiplier*zdata4;
peastman's avatar
peastman committed
151
152
153
154
155
156
                }
            }
        }
    }
}

157
#define FAST_ERFC 1
158
159
160
static void computeReciprocalDispersionEterm(int start, int end, int gridx, int gridy, int gridz, vector<float>& recipEterm, double alpha, vector<float>* bsplineModuli, Vec3* periodicBoxVectors, Vec3* recipBoxVectors) {
    const unsigned int zsize = gridz/2+1;
    const unsigned int yzsize = gridy*zsize;
161
    const float scaleFactor = (float)  -2.0f*M_PI*sqrtf(M_PI) / (6.0*periodicBoxVectors[0][0]*periodicBoxVectors[1][1]*periodicBoxVectors[2][2]);
162
163
164
165
166

    float bfac = M_PI / alpha;
    float fac1 = 2.0f*M_PI*M_PI*M_PI*sqrtf(M_PI);
    float fac2 = alpha*alpha*alpha;
    float fac3 = -2.0f*alpha*M_PI*M_PI;
167
    float b, m, m3, expterm, erfcterm, t;
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188

    for (int kx = start; kx < end; kx++) {
        int mx = (kx < (gridx+1)/2) ? kx : kx-gridx;
        float mhx = mx*(float)recipBoxVectors[0][0];
        float bx = bsplineModuli[0][kx];
        for (int ky = 0; ky < gridy; ky++) {
            int my = (ky < (gridy+1)/2) ? ky : ky-gridy;
            float mhy = mx*(float)recipBoxVectors[1][0] + my*(float)recipBoxVectors[1][1];
            float mhx2y2 = mhx*mhx + mhy*mhy;
            float bxby = bx*bsplineModuli[1][ky];
            for (int kz = 0; kz < zsize; kz++) {
                int index = kx*yzsize + ky*zsize + kz;
                int mz = (kz < (gridz+1)/2) ? kz : kz-gridz;
                float mhz = mx*(float)recipBoxVectors[2][0] + my*(float)recipBoxVectors[2][1] + mz*(float)recipBoxVectors[2][2];
                float bz = bsplineModuli[2][kz];
                float m2 = mhx2y2 + mhz*mhz;
                float denom = scaleFactor/(bxby*bz);

                m = sqrtf(m2);
                m3 = m*m2;
                b = bfac*m;
189
190
191
192
193
194
195
196
197
                expterm = exp(-b*b);

#if FAST_ERFC
                // This approximation for erfc is from Abramowitz and Stegun (1964) p. 299.  They cite the following as
                // the original source: C. Hastings, Jr., Approximations for Digital Computers (1955).  It has a maximum
                // error of 1.5e-7.  Stolen by ACS from the CUDA platform's AMOEBA plugin.
                t = 1.0f/(1.0f+0.3275911f*b);
                erfcterm = (0.254829592f+(-0.284496736f+(1.421413741f+(-1.453152027f+1.061405429f*t)*t)*t)*t)*t*expterm;
#else
198
                erfcterm = erfc(b);
199
200
#endif
                recipEterm[index] = (fac1*erfcterm*m3 + expterm*(fac2 + fac3*m2)) * denom;
201
202
203
204
205
            }
        }
    }
}

peastman's avatar
peastman committed
206
static void computeReciprocalEterm(int start, int end, int gridx, int gridy, int gridz, vector<float>& recipEterm, double alpha, vector<float>* bsplineModuli, Vec3* periodicBoxVectors, Vec3* recipBoxVectors) {
207
208
    const unsigned int zsize = gridz/2+1;
    const unsigned int yzsize = gridy*zsize;
peastman's avatar
peastman committed
209
    const float scaleFactor = (float) (M_PI*periodicBoxVectors[0][0]*periodicBoxVectors[1][1]*periodicBoxVectors[2][2]);
210
211
    const float recipExpFactor = (float) (M_PI*M_PI/(alpha*alpha));

212
213
    int firstz = (start == 0 ? 1 : 0);
    for (int kx = start; kx < end; kx++) {
214
        int mx = (kx < (gridx+1)/2) ? kx : kx-gridx;
peastman's avatar
peastman committed
215
        float mhx = mx*(float)recipBoxVectors[0][0];
216
217
218
        float bx = scaleFactor*bsplineModuli[0][kx];
        for (int ky = 0; ky < gridy; ky++) {
            int my = (ky < (gridy+1)/2) ? ky : ky-gridy;
peastman's avatar
peastman committed
219
            float mhy = mx*(float)recipBoxVectors[1][0] + my*(float)recipBoxVectors[1][1];
220
221
            float mhx2y2 = mhx*mhx + mhy*mhy;
            float bxby = bx*bsplineModuli[1][ky];
222
223
            for (int kz = firstz; kz < zsize; kz++) {
                int index = kx*yzsize + ky*zsize + kz;
224
                int mz = (kz < (gridz+1)/2) ? kz : kz-gridz;
peastman's avatar
peastman committed
225
                float mhz = mx*(float)recipBoxVectors[2][0] + my*(float)recipBoxVectors[2][1] + mz*(float)recipBoxVectors[2][2];
226
                float bz = bsplineModuli[2][kz];
227
228
                float m2 = mhx2y2 + mhz*mhz;
                float denom = m2*bxby*bz;
229
230
231
232
233
234
235
                recipEterm[index] = exp(-recipExpFactor*m2)/denom;
            }
            firstz = 0;
        }
    }
}

236
static double reciprocalEnergy(int start, int end, fftwf_complex* grid, vector<float>& recipEterm, int gridx, int gridy, int gridz, double alpha, vector<float>* bsplineModuli, Vec3* periodicBoxVectors, Vec3* recipBoxVectors) {
237
238
    const unsigned int zsizeHalf = gridz/2+1;
    const unsigned int yzsizeHalf = gridy*zsizeHalf;
239

240
    double energy = 0.0;
241
242
243
244
245

    int firstz = (start == 0 ? 1 : 0);
    for (int kx = start; kx < end; kx++) {
        for (int ky = 0; ky < gridy; ky++) {
            for (int kz = firstz; kz < gridz; kz++) {
246
247
248
249
250
251
252
253
254
255
256
                int kx1, ky1, kz1;
                if (kz >= gridz/2+1) {
                    kx1 = (kx == 0 ? kx : gridx-kx);
                    ky1 = (ky == 0 ? ky : gridy-ky);
                    kz1 = gridz-kz;
                }
                else {
                    kx1 = kx;
                    ky1 = ky;
                    kz1 = kz;
                }
257
                int index = kx1*yzsizeHalf + ky1*zsizeHalf + kz1;
258
259
                float gridReal = grid[index][0];
                float gridImag = grid[index][1];
260
                energy += recipEterm[index]*(gridReal*gridReal+gridImag*gridImag);
261
262
263
264
            }
            firstz = 0;
        }
    }
265
    return 0.5*energy;
266
267
}

268

269
static double reciprocalDispersionEnergy(int start, int end, fftwf_complex* grid, const vector<float>& recipEterm, int gridx, int gridy, int gridz, double alpha, vector<float>* bsplineModuli, Vec3* periodicBoxVectors, Vec3* recipBoxVectors) {
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
    const unsigned int zsizeHalf = gridz/2+1;
    const unsigned int yzsizeHalf = gridy*zsizeHalf;

    double energy = 0.0;

    for (int kx = start; kx < end; kx++) {
        for (int ky = 0; ky < gridy; ky++) {
            for (int kz = 0; kz < gridz; kz++) {
                int kx1, ky1, kz1;
                if (kz >= gridz/2+1) {
                    kx1 = (kx == 0 ? kx : gridx-kx);
                    ky1 = (ky == 0 ? ky : gridy-ky);
                    kz1 = gridz-kz;
                }
                else {
                    kx1 = kx;
                    ky1 = ky;
                    kz1 = kz;
                }
                int index = kx1*yzsizeHalf + ky1*zsizeHalf + kz1;
                float gridReal = grid[index][0];
                float gridImag = grid[index][1];
292
                energy += recipEterm[index]*(gridReal*gridReal+gridImag*gridImag);
293
294
295
            }
        }
    }
296
    return 0.5f*energy;
297
298
}

299

300
301
302
303
304
static void reciprocalConvolution(int start, int end, fftwf_complex* grid, vector<float>& recipEterm) {
    for (int index = start; index < end; index++) {
        float eterm = recipEterm[index];
        grid[index][0] *= eterm;
        grid[index][1] *= eterm;
305
306
307
    }
}

308
static void interpolateForces(float* posq, float* force, float* grid, int gridx, int gridy, int gridz, int numParticles, Vec3* periodicBoxVectors, Vec3* recipBoxVectors, gmx_atomic_t& atomicCounter, const float epsilonFactor) {
peastman's avatar
peastman committed
309
310
311
312
313
    fvec4 boxSize((float) periodicBoxVectors[0][0], (float) periodicBoxVectors[1][1], (float) periodicBoxVectors[2][2], 0);
    fvec4 invBoxSize((float) recipBoxVectors[0][0], (float) recipBoxVectors[1][1], (float) recipBoxVectors[2][2], 0);
    fvec4 recipBoxVec0((float) recipBoxVectors[0][0], (float) recipBoxVectors[0][1], (float) recipBoxVectors[0][2], 0);
    fvec4 recipBoxVec1((float) recipBoxVectors[1][0], (float) recipBoxVectors[1][1], (float) recipBoxVectors[1][2], 0);
    fvec4 recipBoxVec2((float) recipBoxVectors[2][0], (float) recipBoxVectors[2][1], (float) recipBoxVectors[2][2], 0);
314
315
316
317
    fvec4 gridSize(gridx, gridy, gridz, 0);
    ivec4 gridSizeInt(gridx, gridy, gridz, 0);
    fvec4 one(1);
    fvec4 scale(1.0f/(PME_ORDER-1));
318
319
320
321
322
    while (true) {
        int i = gmx_atomic_fetch_add(&atomicCounter, 1);
        if (i >= numParticles)
            break;

323
324
        // Find the position relative to the nearest grid point.
        
325
        fvec4 pos(&posq[4*i]);
peastman's avatar
peastman committed
326
327
328
329
        float posInBox[4];
        (pos-boxSize*floor(pos*invBoxSize)).store(posInBox);
        fvec4 t = posInBox[0]*recipBoxVec0 + posInBox[1]*recipBoxVec1 + posInBox[2]*recipBoxVec2;
        t = (t-floor(t))*gridSize;
330
331
332
        ivec4 ti = t;
        fvec4 dr = t-ti;
        ivec4 gridIndex = ti-(gridSizeInt&ti==gridSizeInt);
333
334
335
        
        // Compute the B-spline coefficients.
        
336
337
338
        fvec4 data[PME_ORDER];
        fvec4 ddata[PME_ORDER];
        data[PME_ORDER-1] = 0.0f;
339
        data[1] = dr;
340
        data[0] = one-dr;
341
        for (int j = 3; j < PME_ORDER; j++) {
342
343
            fvec4 div(1.0f/(j-1));
            data[j-1] = div*dr*data[j-2];
344
            for (int k = 1; k < j-1; k++)
345
346
                data[j-k-1] = div*((dr+k)*data[j-k-2]+(fvec4(j-k)-dr)*data[j-k-1]);
            data[0] = div*(one-dr)*data[0];
347
        }
348
        ddata[0] = -data[0];
349
        for (int j = 1; j < PME_ORDER; j++)
350
351
            ddata[j] = data[j-1]-data[j];
        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
352
        for (int j = 1; j < (PME_ORDER-1); j++)
353
354
            data[PME_ORDER-j-1] = scale*((dr+j)*data[PME_ORDER-j-2]+(fvec4(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
        data[0] = scale*(one-dr)*data[0];
355
                
356
357
        // Compute the force on this atom.
        
358
359
360
        int gridIndexX = gridIndex[0];
        int gridIndexY = gridIndex[1];
        int gridIndexZ = gridIndex[2];
361
362
        if (gridIndexX < 0)
            return; // This happens when a simulation blows up and coordinates become NaN.
peastman's avatar
peastman committed
363
364
365
366
367
        int zindex[PME_ORDER];
        for (int j = 0; j < PME_ORDER; j++) {
            zindex[j] = gridIndexZ+j;
            zindex[j] -= (zindex[j] >= gridz ? gridz : 0);
        }
368
        fvec4 zdata[PME_ORDER];
369
        for (int j = 0; j < PME_ORDER; j++)
370
371
            zdata[j] = fvec4(data[j][2], data[j][2], ddata[j][2], 0);
        fvec4 f = 0.0f;
372
373
374
375
        for (int ix = 0; ix < PME_ORDER; ix++) {
            int xbase = gridIndexX+ix;
            xbase -= (xbase >= gridx ? gridx : 0);
            xbase = xbase*gridy*gridz;
376
377
378
            float dx = data[ix][0];
            float ddx = ddata[ix][0];
            fvec4 xdata(ddx, dx, dx, 0);
379
380
381
382
383

            for (int iy = 0; iy < PME_ORDER; iy++) {
                int ybase = gridIndexY+iy;
                ybase -= (ybase >= gridy ? gridy : 0);
                ybase = xbase + ybase*gridz;
384
385
386
                float dy = data[iy][1];
                float ddy = ddata[iy][1];
                fvec4 xydata = xdata*fvec4(dy, ddy, dy, 0);
387
388

                for (int iz = 0; iz < PME_ORDER; iz++) {
389
390
                    fvec4 gridValue(grid[ybase+zindex[iz]]);
                    f = f+xydata*zdata[iz]*gridValue;
391
392
393
                }
            }
        }
peastman's avatar
peastman committed
394
395
396
397
398
399
        f *= -epsilonFactor*posq[4*i+3];
        float fc[4];
        f.store(fc);
        force[4*i+0] = fc[0]*gridx*(float)recipBoxVectors[0][0];
        force[4*i+1] = fc[0]*gridx*(float)recipBoxVectors[1][0]+fc[1]*gridy*(float)recipBoxVectors[1][1];
        force[4*i+2] = fc[0]*gridx*(float)recipBoxVectors[2][0]+fc[1]*gridy*(float)recipBoxVectors[2][1]+fc[2]*gridz*(float)recipBoxVectors[2][2];
400
401
402
    }
}

403
static void* threadBody(void* args) {
404
405
    CpuCalcPmeReciprocalForceKernel& owner = *reinterpret_cast<CpuCalcPmeReciprocalForceKernel*>(args);
    owner.runMainThread();
406
407
408
    return 0;
}

409
void CpuCalcPmeReciprocalForceKernel::initialize(int xsize, int ysize, int zsize, int numParticles, double alpha) {
410
411
    if (!hasInitializedThreads) {
        numThreads = getNumProcessors();
412
413
414
        char* threadsEnv = getenv("OPENMM_CPU_THREADS");
        if (threadsEnv != NULL)
            stringstream(threadsEnv) >> numThreads;
415
416
417
        fftwf_init_threads();
        hasInitializedThreads = true;
    }
418
    threadEnergy.resize(numThreads);
peastman's avatar
peastman committed
419
420
421
    gridx = findFFTDimension(xsize, false);
    gridy = findFFTDimension(ysize, false);
    gridz = findFFTDimension(zsize, true);
422
423
424
    this->numParticles = numParticles;
    this->alpha = alpha;
    force.resize(4*numParticles);
425
    recipEterm.resize(gridx*gridy*gridz);
426
    
427
428
    // Initialize threads.
    
429
    isFinished = false;
430
431
432
    pthread_cond_init(&startCondition, NULL);
    pthread_cond_init(&endCondition, NULL);
    pthread_mutex_init(&lock, NULL);
433
    pthread_create(&mainThread, NULL, threadBody, this);
434
    
435
436
437
    // Wait until the main thread is up and running.
    
    pthread_mutex_lock(&lock);
438
439
    while (!isFinished)
        pthread_cond_wait(&endCondition, &lock);
440
441
    pthread_mutex_unlock(&lock);
    
442
443
    // Initialize FFTW.
    
444
445
446
    for (int i = 0; i < numThreads; i++)
        tempGrid.push_back((float*) fftwf_malloc(sizeof(float)*(gridx*gridy*gridz+3)));
    realGrid = tempGrid[0];
447
448
449
450
451
452
453
454
    complexGrid = (fftwf_complex*) fftwf_malloc(sizeof(fftwf_complex)*gridx*gridy*(gridz/2+1));
    fftwf_plan_with_nthreads(numThreads);
    forwardFFT = fftwf_plan_dft_r2c_3d(gridx, gridy, gridz, realGrid, complexGrid, FFTW_MEASURE);
    backwardFFT = fftwf_plan_dft_c2r_3d(gridx, gridy, gridz, complexGrid, realGrid, FFTW_MEASURE);
    hasCreatedPlan = true;
    
    // Initialize the b-spline moduli.

455
    int maxSize = std::max(std::max(gridx, gridy), gridz);
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
    vector<double> data(PME_ORDER);
    vector<double> ddata(PME_ORDER);
    vector<double> bsplinesData(maxSize);
    data[PME_ORDER-1] = 0.0;
    data[1] = 0.0;
    data[0] = 1.0;
    for (int i = 3; i < PME_ORDER; i++) {
        double div = 1.0/(i-1.0);
        data[i-1] = 0.0;
        for (int j = 1; j < (i-1); j++)
            data[i-j-1] = div*(j*data[i-j-2]+(i-j)*data[i-j-1]);
        data[0] = div*data[0];
    }

    // Differentiate.

    ddata[0] = -data[0];
    for (int i = 1; i < PME_ORDER; i++)
        ddata[i] = data[i-1]-data[i];
    double div = 1.0/(PME_ORDER-1);
    data[PME_ORDER-1] = 0.0;
    for (int i = 1; i < (PME_ORDER-1); i++)
        data[PME_ORDER-i-1] = div*(i*data[PME_ORDER-i-2]+(PME_ORDER-i)*data[PME_ORDER-i-1]);
    data[0] = div*data[0];
    for (int i = 0; i < maxSize; i++)
        bsplinesData[i] = 0.0;
    for (int i = 1; i <= PME_ORDER; i++)
        bsplinesData[i] = data[i-1];

    // Evaluate the actual bspline moduli for X/Y/Z.

    bsplineModuli[0].resize(gridx);
    bsplineModuli[1].resize(gridy);
    bsplineModuli[2].resize(gridz);
    for (int dim = 0; dim < 3; dim++) {
        int ndata = bsplineModuli[dim].size();
        vector<float>& moduli = bsplineModuli[dim];
        for (int i = 0; i < ndata; i++) {
            double sc = 0.0;
            double ss = 0.0;
            for (int j = 0; j < ndata; j++) {
                double arg = (2.0*M_PI*i*j)/ndata;
                sc += bsplinesData[j]*cos(arg);
                ss += bsplinesData[j]*sin(arg);
            }
            moduli[i] = (float) (sc*sc+ss*ss);
        }
        for (int i = 0; i < ndata; i++)
            if (moduli[i] < 1.0e-7f)
                moduli[i] = (moduli[i-1]+moduli[i+1])*0.5f;
    }
}

509
CpuCalcPmeReciprocalForceKernel::~CpuCalcPmeReciprocalForceKernel() {
510
511
    isDeleted = true;
    pthread_mutex_lock(&lock);
512
513
    pthread_cond_broadcast(&startCondition);
    pthread_mutex_unlock(&lock);
514
    pthread_join(mainThread, NULL);
515
516
517
    pthread_mutex_destroy(&lock);
    pthread_cond_destroy(&startCondition);
    pthread_cond_destroy(&endCondition);
518
519
    for (int i = 0; i < (int) tempGrid.size(); i++)
        fftwf_free(tempGrid[i]);
520
521
522
523
524
525
    if (complexGrid != NULL)
        fftwf_free(complexGrid);
    if (hasCreatedPlan) {
        fftwf_destroy_plan(forwardFFT);
        fftwf_destroy_plan(backwardFFT);
    }
526
527
}

528
529
void CpuCalcPmeReciprocalForceKernel::runMainThread() {
    // This is the main thread that coordinates all the other ones.
530
531

    pthread_mutex_lock(&lock);
532
    isFinished = true;
533
    pthread_cond_signal(&endCondition);
534
535
536
537
538
539
540
541
    ThreadPool threads(numThreads);
    while (true) {
        // Wait for the signal to start.

        pthread_cond_wait(&startCondition, &lock);
        if (isDeleted)
            break;
        posq = io->getPosq();
542
        gmx_atomic_set(&atomicCounter, 0);
peastman's avatar
peastman committed
543
        threads.execute([&] (ThreadPool& threads, int threadIndex) { runWorkerThread(threads, threadIndex); }); // Signal threads to perform charge spreading.
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
        threads.waitForThreads();
        threads.resumeThreads(); // Signal threads to sum the charge grids.
        threads.waitForThreads();
        fftwf_execute_dft_r2c(forwardFFT, realGrid, complexGrid);
        if (lastBoxVectors[0] != periodicBoxVectors[0] || lastBoxVectors[1] != periodicBoxVectors[1] || lastBoxVectors[2] != periodicBoxVectors[2]) {
            threads.resumeThreads(); // Signal threads to compute the reciprocal scale factors.
            threads.waitForThreads();
        }
        if (includeEnergy) {
            threads.resumeThreads(); // Signal threads to compute energy.
            threads.waitForThreads();
            for (int i = 0; i < (int) threadEnergy.size(); i++)
                energy += threadEnergy[i];
        }
        threads.resumeThreads(); // Signal threads to perform reciprocal convolution.
        threads.waitForThreads();
        fftwf_execute_dft_c2r(backwardFFT, complexGrid, realGrid);
561
        gmx_atomic_set(&atomicCounter, 0);
562
563
564
565
566
567
568
569
        threads.resumeThreads(); // Signal threads to interpolate forces.
        threads.waitForThreads();
        isFinished = true;
        lastBoxVectors[0] = periodicBoxVectors[0];
        lastBoxVectors[1] = periodicBoxVectors[1];
        lastBoxVectors[2] = periodicBoxVectors[2];
        pthread_cond_signal(&endCondition);
    }
570
571
572
    pthread_mutex_unlock(&lock);
}

573
574
575
576
577
578
void CpuCalcPmeReciprocalForceKernel::runWorkerThread(ThreadPool& threads, int index) {
    int gridxStart = (index*gridx)/numThreads;
    int gridxEnd = ((index+1)*gridx)/numThreads;
    int gridSize = (gridx*gridy*gridz+3)/4;
    int gridStart = 4*((index*gridSize)/numThreads);
    int gridEnd = 4*(((index+1)*gridSize)/numThreads);
579
    int complexSize = gridx*gridy*(gridz/2+1);
580
    int complexStart = std::max(1, ((index*complexSize)/numThreads));
581
    int complexEnd = (((index+1)*complexSize)/numThreads);
582
    const float epsilonFactor = sqrt(ONE_4PI_EPS0);
583
    spreadCharge(posq, tempGrid[index], gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors, atomicCounter, epsilonFactor);
584
585
586
587
588
589
590
    threads.syncThreads();
    int numGrids = tempGrid.size();
    for (int i = gridStart; i < gridEnd; i += 4) {
        fvec4 sum(&realGrid[i]);
        for (int j = 1; j < numGrids; j++)
            sum += fvec4(&tempGrid[j][i]);
        sum.store(&realGrid[i]);
591
    }
592
    threads.syncThreads();
593
594
    if (lastBoxVectors[0] != periodicBoxVectors[0] || lastBoxVectors[1] != periodicBoxVectors[1] || lastBoxVectors[2] != periodicBoxVectors[2]) {
        computeReciprocalEterm(gridxStart, gridxEnd, gridx, gridy, gridz, recipEterm, alpha, bsplineModuli, periodicBoxVectors, recipBoxVectors);
595
        threads.syncThreads();
596
597
598
    }
    if (includeEnergy) {
        threadEnergy[index] = reciprocalEnergy(gridxStart, gridxEnd, complexGrid, recipEterm, gridx, gridy, gridz, alpha, bsplineModuli, periodicBoxVectors, recipBoxVectors);
599
600
        threads.syncThreads();
    }
601
602
    reciprocalConvolution(complexStart, complexEnd, complexGrid, recipEterm);
    threads.syncThreads();
603
    interpolateForces(posq, &force[0], realGrid, gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors, atomicCounter, epsilonFactor);
604
605
}

peastman's avatar
peastman committed
606
void CpuCalcPmeReciprocalForceKernel::beginComputation(IO& io, const Vec3* periodicBoxVectors, bool includeEnergy) {
607
    this->io = &io;
peastman's avatar
peastman committed
608
609
610
    this->periodicBoxVectors[0] = periodicBoxVectors[0];
    this->periodicBoxVectors[1] = periodicBoxVectors[1];
    this->periodicBoxVectors[2] = periodicBoxVectors[2];
611
612
    this->includeEnergy = includeEnergy;
    energy = 0.0;
peastman's avatar
peastman committed
613
614
615
616
617
618
619
620
621
622
623

    // Invert the box vectors.

    double determinant = periodicBoxVectors[0][0]*periodicBoxVectors[1][1]*periodicBoxVectors[2][2];
    double scale = 1.0/determinant;
    recipBoxVectors[0] = Vec3(periodicBoxVectors[1][1]*periodicBoxVectors[2][2], 0, 0)*scale;
    recipBoxVectors[1] = Vec3(-periodicBoxVectors[1][0]*periodicBoxVectors[2][2], periodicBoxVectors[0][0]*periodicBoxVectors[2][2], 0)*scale;
    recipBoxVectors[2] = Vec3(periodicBoxVectors[1][0]*periodicBoxVectors[2][1]-periodicBoxVectors[1][1]*periodicBoxVectors[2][0], -periodicBoxVectors[0][0]*periodicBoxVectors[2][1], periodicBoxVectors[0][0]*periodicBoxVectors[1][1])*scale;

    // Do the calculation.

624
    pthread_mutex_lock(&lock);
625
    isFinished = false;
626
    pthread_cond_signal(&startCondition);
627
628
629
    pthread_mutex_unlock(&lock);
}

630
double CpuCalcPmeReciprocalForceKernel::finishComputation(IO& io) {
631
632
    pthread_mutex_lock(&lock);
    while (!isFinished) {
633
        pthread_cond_wait(&endCondition, &lock);
634
635
636
    }
    pthread_mutex_unlock(&lock);
    io.setForce(&force[0]);
637
    return energy;
peastman's avatar
peastman committed
638
}
639
640

bool CpuCalcPmeReciprocalForceKernel::isProcessorSupported() {
641
    return isVec4Supported();
642
}
643

644
645
646
647
648
649
650
void CpuCalcPmeReciprocalForceKernel::getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
    alpha = this->alpha;
    nx = gridx;
    ny = gridy;
    nz = gridz;
}

peastman's avatar
peastman committed
651
int CpuCalcPmeReciprocalForceKernel::findFFTDimension(int minimum, bool isZ) {
652
653
654
655
656
    if (minimum < 1)
        return 1;
    while (true) {
        // Attempt to factor the current value.

peastman's avatar
peastman committed
657
658
659
660
661
662
        if (isZ && minimum%2 == 1) {
            // Force the last dimension to be even, since this produces better performance in FFTW.

            minimum++;
            continue;
        }
663
        int unfactored = minimum;
664
        for (int factor = 2; factor < 8; factor++) {
665
666
667
            while (unfactored > 1 && unfactored%factor == 0)
                unfactored /= factor;
        }
668
        if (unfactored == 1 || unfactored == 11 || unfactored == 13)
669
670
671
672
            return minimum;
        minimum++;
    }
}
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965

/*
 * Everything below here is just a clone of the above, but to handle the dispersion term
 * instead of electrostatics.
 */

bool CpuCalcPmeReciprocalForceKernel::hasInitializedThreads = false;
int CpuCalcPmeReciprocalForceKernel::numThreads = 0;


class CpuCalcDispersionPmeReciprocalForceKernel::ComputeTask : public ThreadPool::Task {
public:
    ComputeTask(CpuCalcDispersionPmeReciprocalForceKernel& owner) : owner(owner) {
    }
    void execute(ThreadPool& threads, int threadIndex) {
        owner.runWorkerThread(threads, threadIndex);
    }
    CpuCalcDispersionPmeReciprocalForceKernel& owner;
};

static void* dispersionThreadBody(void* args) {
    CpuCalcDispersionPmeReciprocalForceKernel& owner = *reinterpret_cast<CpuCalcDispersionPmeReciprocalForceKernel*>(args);
    owner.runMainThread();
    return 0;
}

void CpuCalcDispersionPmeReciprocalForceKernel::initialize(int xsize, int ysize, int zsize, int numParticles, double alpha) {
    if (!hasInitializedThreads) {
        numThreads = getNumProcessors();
        char* threadsEnv = getenv("OPENMM_CPU_THREADS");
        if (threadsEnv != NULL)
            stringstream(threadsEnv) >> numThreads;
        fftwf_init_threads();
        hasInitializedThreads = true;
    }
    threadEnergy.resize(numThreads);
    gridx = findFFTDimension(xsize, false);
    gridy = findFFTDimension(ysize, false);
    gridz = findFFTDimension(zsize, true);
    this->numParticles = numParticles;
    this->alpha = alpha;
    force.resize(4*numParticles);
    recipEterm.resize(gridx*gridy*gridz);
    
    // Initialize threads.
    
    isFinished = false;
    pthread_cond_init(&startCondition, NULL);
    pthread_cond_init(&endCondition, NULL);
    pthread_mutex_init(&lock, NULL);
    pthread_create(&mainThread, NULL, dispersionThreadBody, this);
    
    // Wait until the main thread is up and running.
    
    pthread_mutex_lock(&lock);
    while (!isFinished)
        pthread_cond_wait(&endCondition, &lock);
    pthread_mutex_unlock(&lock);
    
    // Initialize FFTW.
    
    for (int i = 0; i < numThreads; i++)
        tempGrid.push_back((float*) fftwf_malloc(sizeof(float)*(gridx*gridy*gridz+3)));
    realGrid = tempGrid[0];
    complexGrid = (fftwf_complex*) fftwf_malloc(sizeof(fftwf_complex)*gridx*gridy*(gridz/2+1));
    fftwf_plan_with_nthreads(numThreads);
    forwardFFT = fftwf_plan_dft_r2c_3d(gridx, gridy, gridz, realGrid, complexGrid, FFTW_MEASURE);
    backwardFFT = fftwf_plan_dft_c2r_3d(gridx, gridy, gridz, complexGrid, realGrid, FFTW_MEASURE);
    hasCreatedPlan = true;
    
    // Initialize the b-spline moduli.

    int maxSize = std::max(std::max(gridx, gridy), gridz);
    vector<double> data(PME_ORDER);
    vector<double> ddata(PME_ORDER);
    vector<double> bsplinesData(maxSize);
    data[PME_ORDER-1] = 0.0;
    data[1] = 0.0;
    data[0] = 1.0;
    for (int i = 3; i < PME_ORDER; i++) {
        double div = 1.0/(i-1.0);
        data[i-1] = 0.0;
        for (int j = 1; j < (i-1); j++)
            data[i-j-1] = div*(j*data[i-j-2]+(i-j)*data[i-j-1]);
        data[0] = div*data[0];
    }

    // Differentiate.

    ddata[0] = -data[0];
    for (int i = 1; i < PME_ORDER; i++)
        ddata[i] = data[i-1]-data[i];
    double div = 1.0/(PME_ORDER-1);
    data[PME_ORDER-1] = 0.0;
    for (int i = 1; i < (PME_ORDER-1); i++)
        data[PME_ORDER-i-1] = div*(i*data[PME_ORDER-i-2]+(PME_ORDER-i)*data[PME_ORDER-i-1]);
    data[0] = div*data[0];
    for (int i = 0; i < maxSize; i++)
        bsplinesData[i] = 0.0;
    for (int i = 1; i <= PME_ORDER; i++)
        bsplinesData[i] = data[i-1];

    // Evaluate the actual bspline moduli for X/Y/Z.

    bsplineModuli[0].resize(gridx);
    bsplineModuli[1].resize(gridy);
    bsplineModuli[2].resize(gridz);
    for (int dim = 0; dim < 3; dim++) {
        int ndata = bsplineModuli[dim].size();
        vector<float>& moduli = bsplineModuli[dim];
        for (int i = 0; i < ndata; i++) {
            double sc = 0.0;
            double ss = 0.0;
            for (int j = 0; j < ndata; j++) {
                double arg = (2.0*M_PI*i*j)/ndata;
                sc += bsplinesData[j]*cos(arg);
                ss += bsplinesData[j]*sin(arg);
            }
            moduli[i] = (float) (sc*sc+ss*ss);
        }
        for (int i = 0; i < ndata; i++)
            if (moduli[i] < 1.0e-7f)
                moduli[i] = (moduli[i-1]+moduli[i+1])*0.5f;
    }
}

CpuCalcDispersionPmeReciprocalForceKernel::~CpuCalcDispersionPmeReciprocalForceKernel() {
    isDeleted = true;
    pthread_mutex_lock(&lock);
    pthread_cond_broadcast(&startCondition);
    pthread_mutex_unlock(&lock);
    pthread_join(mainThread, NULL);
    pthread_mutex_destroy(&lock);
    pthread_cond_destroy(&startCondition);
    pthread_cond_destroy(&endCondition);
    for (int i = 0; i < (int) tempGrid.size(); i++)
        fftwf_free(tempGrid[i]);
    if (complexGrid != NULL)
        fftwf_free(complexGrid);
    if (hasCreatedPlan) {
        fftwf_destroy_plan(forwardFFT);
        fftwf_destroy_plan(backwardFFT);
    }
}

void CpuCalcDispersionPmeReciprocalForceKernel::runMainThread() {
    // This is the main thread that coordinates all the other ones.

    pthread_mutex_lock(&lock);
    isFinished = true;
    pthread_cond_signal(&endCondition);
    ThreadPool threads(numThreads);
    while (true) {
        // Wait for the signal to start.

        pthread_cond_wait(&startCondition, &lock);
        if (isDeleted)
            break;
        posq = io->getPosq();
        ComputeTask task(*this);
        gmx_atomic_set(&atomicCounter, 0);
        threads.execute(task); // Signal threads to perform charge spreading.
        threads.waitForThreads();
        threads.resumeThreads(); // Signal threads to sum the charge grids.
        threads.waitForThreads();
        fftwf_execute_dft_r2c(forwardFFT, realGrid, complexGrid);
        if (lastBoxVectors[0] != periodicBoxVectors[0] || lastBoxVectors[1] != periodicBoxVectors[1] || lastBoxVectors[2] != periodicBoxVectors[2]) {
            threads.resumeThreads(); // Signal threads to compute the reciprocal scale factors.
            threads.waitForThreads();
        }
        if (includeEnergy) {
            threads.resumeThreads(); // Signal threads to compute energy.
            threads.waitForThreads();
            for (int i = 0; i < (int) threadEnergy.size(); i++)
                energy += threadEnergy[i];
        }
        threads.resumeThreads(); // Signal threads to perform reciprocal convolution.
        threads.waitForThreads();
        fftwf_execute_dft_c2r(backwardFFT, complexGrid, realGrid);
        gmx_atomic_set(&atomicCounter, 0);
        threads.resumeThreads(); // Signal threads to interpolate forces.
        threads.waitForThreads();
        isFinished = true;
        lastBoxVectors[0] = periodicBoxVectors[0];
        lastBoxVectors[1] = periodicBoxVectors[1];
        lastBoxVectors[2] = periodicBoxVectors[2];
        pthread_cond_signal(&endCondition);
    }
    pthread_mutex_unlock(&lock);
}

void CpuCalcDispersionPmeReciprocalForceKernel::runWorkerThread(ThreadPool& threads, int index) {
    int gridxStart = (index*gridx)/numThreads;
    int gridxEnd = ((index+1)*gridx)/numThreads;
    int gridSize = (gridx*gridy*gridz+3)/4;
    int gridStart = 4*((index*gridSize)/numThreads);
    int gridEnd = 4*(((index+1)*gridSize)/numThreads);
    int complexSize = gridx*gridy*(gridz/2+1);
    int complexStart = std::max(1, ((index*complexSize)/numThreads));
    int complexEnd = (((index+1)*complexSize)/numThreads);
    const float epsilonFactor = 1.0f;
    spreadCharge(posq, tempGrid[index], gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors, atomicCounter, epsilonFactor);
    threads.syncThreads();
    int numGrids = tempGrid.size();
    for (int i = gridStart; i < gridEnd; i += 4) {
        fvec4 sum(&realGrid[i]);
        for (int j = 1; j < numGrids; j++)
            sum += fvec4(&tempGrid[j][i]);
        sum.store(&realGrid[i]);
    }
    threads.syncThreads();
    if (lastBoxVectors[0] != periodicBoxVectors[0] || lastBoxVectors[1] != periodicBoxVectors[1] || lastBoxVectors[2] != periodicBoxVectors[2]) {
        computeReciprocalDispersionEterm(gridxStart, gridxEnd, gridx, gridy, gridz, recipEterm, alpha, bsplineModuli, periodicBoxVectors, recipBoxVectors);
        threads.syncThreads();
    }
    if (includeEnergy) {
        threadEnergy[index] = reciprocalDispersionEnergy(gridxStart, gridxEnd, complexGrid, recipEterm, gridx, gridy, gridz, alpha, bsplineModuli, periodicBoxVectors, recipBoxVectors);
        threads.syncThreads();
    }
    // For dispersion, we include the {0,0,0} term, so the start point needs to be redefined
    complexStart = (index*complexSize)/numThreads;
    reciprocalConvolution(complexStart, complexEnd, complexGrid, recipEterm);
    threads.syncThreads();
    interpolateForces(posq, &force[0], realGrid, gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors, atomicCounter, epsilonFactor);
}

void CpuCalcDispersionPmeReciprocalForceKernel::beginComputation(IO& io, const Vec3* periodicBoxVectors, bool includeEnergy) {
    this->io = &io;
    this->periodicBoxVectors[0] = periodicBoxVectors[0];
    this->periodicBoxVectors[1] = periodicBoxVectors[1];
    this->periodicBoxVectors[2] = periodicBoxVectors[2];
    this->includeEnergy = includeEnergy;
    energy = 0.0;

    // Invert the box vectors.

    double determinant = periodicBoxVectors[0][0]*periodicBoxVectors[1][1]*periodicBoxVectors[2][2];
    double scale = 1.0/determinant;
    recipBoxVectors[0] = Vec3(periodicBoxVectors[1][1]*periodicBoxVectors[2][2], 0, 0)*scale;
    recipBoxVectors[1] = Vec3(-periodicBoxVectors[1][0]*periodicBoxVectors[2][2], periodicBoxVectors[0][0]*periodicBoxVectors[2][2], 0)*scale;
    recipBoxVectors[2] = Vec3(periodicBoxVectors[1][0]*periodicBoxVectors[2][1]-periodicBoxVectors[1][1]*periodicBoxVectors[2][0], -periodicBoxVectors[0][0]*periodicBoxVectors[2][1], periodicBoxVectors[0][0]*periodicBoxVectors[1][1])*scale;

    // Do the calculation.

    pthread_mutex_lock(&lock);
    isFinished = false;
    pthread_cond_signal(&startCondition);
    pthread_mutex_unlock(&lock);
}

double CpuCalcDispersionPmeReciprocalForceKernel::finishComputation(IO& io) {
    pthread_mutex_lock(&lock);
    while (!isFinished) {
        pthread_cond_wait(&endCondition, &lock);
    }
    pthread_mutex_unlock(&lock);
    io.setForce(&force[0]);
    return energy;
}

bool CpuCalcDispersionPmeReciprocalForceKernel::isProcessorSupported() {
    return isVec4Supported();
}

void CpuCalcDispersionPmeReciprocalForceKernel::getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
    alpha = this->alpha;
    nx = gridx;
    ny = gridy;
    nz = gridz;
}

int CpuCalcDispersionPmeReciprocalForceKernel::findFFTDimension(int minimum, bool isZ) {
    if (minimum < 1)
        return 1;
    while (true) {
        // Attempt to factor the current value.

        if (isZ && minimum%2 == 1) {
            // Force the last dimension to be even, since this produces better performance in FFTW.

            minimum++;
            continue;
        }
        int unfactored = minimum;
        for (int factor = 2; factor < 8; factor++) {
            while (unfactored > 1 && unfactored%factor == 0)
                unfactored /= factor;
        }
        if (unfactored == 1 || unfactored == 11 || unfactored == 13)
            return minimum;
        minimum++;
    }
}