CpuPmeKernels.cpp 26.7 KB
Newer Older
peastman's avatar
peastman committed
1
2
3
4
5
6
7
8
/* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
 * -------------------------------------------------------------------------- *
 * This is part of the OpenMM molecular simulation toolkit originating from   *
 * Simbios, the NIH National Center for Physics-Based Simulation of           *
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
peastman's avatar
peastman committed
9
 * Portions copyright (c) 2013-2014 Stanford University and the Authors.      *
peastman's avatar
peastman committed
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
 * Permission is hereby granted, free of charge, to any person obtaining a    *
 * copy of this software and associated documentation files (the "Software"), *
 * to deal in the Software without restriction, including without limitation  *
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
 * and/or sell copies of the Software, and to permit persons to whom the      *
 * Software is furnished to do so, subject to the following conditions:       *
 *                                                                            *
 * The above copyright notice and this permission notice shall be included in *
 * all copies or substantial portions of the Software.                        *
 *                                                                            *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
 * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */

32
33
34
#ifdef WIN32
  #define _USE_MATH_DEFINES // Needed to get M_PI
#endif
35
#include "CpuPmeKernels.h"
36
#include "SimTKOpenMMRealType.h"
37
#include "openmm/internal/hardware.h"
38
#include "openmm/internal/vectorize.h"
39
#include <cmath>
peastman's avatar
peastman committed
40
#include <cstring>
peastman's avatar
peastman committed
41
42
43
44
45
46

using namespace OpenMM;
using namespace std;

static const int PME_ORDER = 5;

47
48
bool CpuCalcPmeReciprocalForceKernel::hasInitializedThreads = false;
int CpuCalcPmeReciprocalForceKernel::numThreads = 0;
49

peastman's avatar
peastman committed
50
static void spreadCharge(int start, int end, float* posq, float* grid, int gridx, int gridy, int gridz, int numParticles, Vec3* periodicBoxVectors, Vec3* recipBoxVectors) {
51
    float temp[4];
peastman's avatar
peastman committed
52
53
54
55
56
    fvec4 boxSize((float) periodicBoxVectors[0][0], (float) periodicBoxVectors[1][1], (float) periodicBoxVectors[2][2], 0);
    fvec4 invBoxSize((float) recipBoxVectors[0][0], (float) recipBoxVectors[1][1], (float) recipBoxVectors[2][2], 0);
    fvec4 recipBoxVec0((float) recipBoxVectors[0][0], (float) recipBoxVectors[0][1], (float) recipBoxVectors[0][2], 0);
    fvec4 recipBoxVec1((float) recipBoxVectors[1][0], (float) recipBoxVectors[1][1], (float) recipBoxVectors[1][2], 0);
    fvec4 recipBoxVec2((float) recipBoxVectors[2][0], (float) recipBoxVectors[2][1], (float) recipBoxVectors[2][2], 0);
57
58
59
60
    fvec4 gridSize(gridx, gridy, gridz, 0);
    ivec4 gridSizeInt(gridx, gridy, gridz, 0);
    fvec4 one(1);
    fvec4 scale(1.0f/(PME_ORDER-1));
peastman's avatar
peastman committed
61
    const float epsilonFactor = sqrt(ONE_4PI_EPS0);
62
    memset(grid, 0, sizeof(float)*gridx*gridy*gridz);
63
    for (int i = start; i < end; i++) {
peastman's avatar
peastman committed
64
65
        // Find the position relative to the nearest grid point.
        
66
        fvec4 pos(&posq[4*i]);
peastman's avatar
peastman committed
67
68
69
70
        float posInBox[4];
        (pos-boxSize*floor(pos*invBoxSize)).store(posInBox);
        fvec4 t = posInBox[0]*recipBoxVec0 + posInBox[1]*recipBoxVec1 + posInBox[2]*recipBoxVec2;
        t = (t-floor(t))*gridSize;
71
72
73
        ivec4 ti = t;
        fvec4 dr = t-ti;
        ivec4 gridIndex = ti-(gridSizeInt&ti==gridSizeInt);
peastman's avatar
peastman committed
74
75
76
        
        // Compute the B-spline coefficients.
        
77
78
        fvec4 data[PME_ORDER];
        data[PME_ORDER-1] = 0.0f;
peastman's avatar
peastman committed
79
        data[1] = dr;
80
        data[0] = one-dr;
peastman's avatar
peastman committed
81
        for (int j = 3; j < PME_ORDER; j++) {
82
83
            fvec4 div(1.0f/(j-1));
            data[j-1] = div*dr*data[j-2];
peastman's avatar
peastman committed
84
            for (int k = 1; k < j-1; k++)
85
86
                data[j-k-1] = div*((dr+k)*data[j-k-2]+(fvec4(j-k)-dr)*data[j-k-1]);
            data[0] = div*(one-dr)*data[0];
peastman's avatar
peastman committed
87
        }
88
        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
peastman's avatar
peastman committed
89
        for (int j = 1; j < (PME_ORDER-1); j++)
90
91
            data[PME_ORDER-j-1] = scale*((dr+j)*data[PME_ORDER-j-2]+(fvec4(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
        data[0] = scale*(one-dr)*data[0];
peastman's avatar
peastman committed
92
93
94
        
        // Spread the charges.
        
95
96
97
        int gridIndexX = gridIndex[0];
        int gridIndexY = gridIndex[1];
        int gridIndexZ = gridIndex[2];
98
99
        if (gridIndexX < 0)
            return; // This happens when a simulation blows up and coordinates become NaN.
peastman's avatar
peastman committed
100
101
102
103
104
        int zindex[PME_ORDER];
        for (int j = 0; j < PME_ORDER; j++) {
            zindex[j] = gridIndexZ+j;
            zindex[j] -= (zindex[j] >= gridz ? gridz : 0);
        }
peastman's avatar
peastman committed
105
        float charge = epsilonFactor*posq[4*i+3];
106
107
        fvec4 zdata0to3(data[0][2], data[1][2], data[2][2], data[3][2]);
        float zdata4 = data[4][2];
108
109
110
111
112
        if (gridIndexZ+4 < gridz) {
            for (int ix = 0; ix < PME_ORDER; ix++) {
                int xbase = gridIndexX+ix;
                xbase -= (xbase >= gridx ? gridx : 0);
                xbase = xbase*gridy*gridz;
113
                float xdata = charge*data[ix][0];
114
115
116
117
                for (int iy = 0; iy < PME_ORDER; iy++) {
                    int ybase = gridIndexY+iy;
                    ybase -= (ybase >= gridy ? gridy : 0);
                    ybase = xbase + ybase*gridz;
118
119
120
                    float multiplier = xdata*data[iy][1];
                    fvec4 add0to3 = zdata0to3*multiplier;
                    (fvec4(&grid[ybase+gridIndexZ])+add0to3).store(&grid[ybase+gridIndexZ]);
121
122
123
124
125
126
127
128
129
                    grid[ybase+zindex[4]] += multiplier*zdata4;
                }
            }
        }
        else {
            for (int ix = 0; ix < PME_ORDER; ix++) {
                int xbase = gridIndexX+ix;
                xbase -= (xbase >= gridx ? gridx : 0);
                xbase = xbase*gridy*gridz;
130
                float xdata = charge*data[ix][0];
131
132
133
134
                for (int iy = 0; iy < PME_ORDER; iy++) {
                    int ybase = gridIndexY+iy;
                    ybase -= (ybase >= gridy ? gridy : 0);
                    ybase = xbase + ybase*gridz;
135
136
137
                    float multiplier = xdata*data[iy][1];
                    fvec4 add0to3 = zdata0to3*multiplier;
                    add0to3.store(temp);
peastman's avatar
peastman committed
138
139
140
141
                    grid[ybase+zindex[0]] += temp[0];
                    grid[ybase+zindex[1]] += temp[1];
                    grid[ybase+zindex[2]] += temp[2];
                    grid[ybase+zindex[3]] += temp[3];
142
                    grid[ybase+zindex[4]] += multiplier*zdata4;
peastman's avatar
peastman committed
143
144
145
146
147
148
                }
            }
        }
    }
}

peastman's avatar
peastman committed
149
static void computeReciprocalEterm(int start, int end, int gridx, int gridy, int gridz, vector<float>& recipEterm, double alpha, vector<float>* bsplineModuli, Vec3* periodicBoxVectors, Vec3* recipBoxVectors) {
150
151
    const unsigned int zsize = gridz/2+1;
    const unsigned int yzsize = gridy*zsize;
peastman's avatar
peastman committed
152
    const float scaleFactor = (float) (M_PI*periodicBoxVectors[0][0]*periodicBoxVectors[1][1]*periodicBoxVectors[2][2]);
153
154
    const float recipExpFactor = (float) (M_PI*M_PI/(alpha*alpha));

155
156
    int firstz = (start == 0 ? 1 : 0);
    for (int kx = start; kx < end; kx++) {
157
        int mx = (kx < (gridx+1)/2) ? kx : kx-gridx;
peastman's avatar
peastman committed
158
        float mhx = mx*(float)recipBoxVectors[0][0];
159
160
161
        float bx = scaleFactor*bsplineModuli[0][kx];
        for (int ky = 0; ky < gridy; ky++) {
            int my = (ky < (gridy+1)/2) ? ky : ky-gridy;
peastman's avatar
peastman committed
162
            float mhy = mx*(float)recipBoxVectors[1][0] + my*(float)recipBoxVectors[1][1];
163
164
            float mhx2y2 = mhx*mhx + mhy*mhy;
            float bxby = bx*bsplineModuli[1][ky];
165
166
            for (int kz = firstz; kz < zsize; kz++) {
                int index = kx*yzsize + ky*zsize + kz;
167
                int mz = (kz < (gridz+1)/2) ? kz : kz-gridz;
peastman's avatar
peastman committed
168
                float mhz = mx*(float)recipBoxVectors[2][0] + my*(float)recipBoxVectors[2][1] + mz*(float)recipBoxVectors[2][2];
169
                float bz = bsplineModuli[2][kz];
170
171
                float m2 = mhx2y2 + mhz*mhz;
                float denom = m2*bxby*bz;
172
173
174
175
176
177
178
                recipEterm[index] = exp(-recipExpFactor*m2)/denom;
            }
            firstz = 0;
        }
    }
}

peastman's avatar
peastman committed
179
static float reciprocalEnergy(int start, int end, fftwf_complex* grid, int gridx, int gridy, int gridz, double alpha, vector<float>* bsplineModuli, Vec3* periodicBoxVectors, Vec3* recipBoxVectors) {
180
181
    const unsigned int zsizeHalf = gridz/2+1;
    const unsigned int yzsizeHalf = gridy*zsizeHalf;
peastman's avatar
peastman committed
182
    const float scaleFactor = (float) (M_PI*periodicBoxVectors[0][0]*periodicBoxVectors[1][1]*periodicBoxVectors[2][2]);
183
    const float recipExpFactor = (float) (M_PI*M_PI/(alpha*alpha));
184
185
186
187
    float energy = 0.0f;

    int firstz = (start == 0 ? 1 : 0);
    for (int kx = start; kx < end; kx++) {
188
        int mx = (kx < (gridx+1)/2) ? kx : kx-gridx;
peastman's avatar
peastman committed
189
        float mhx = mx*(float)recipBoxVectors[0][0];
190
        float bx = scaleFactor*bsplineModuli[0][kx];
191
192
        for (int ky = 0; ky < gridy; ky++) {
            int my = (ky < (gridy+1)/2) ? ky : ky-gridy;
peastman's avatar
peastman committed
193
            float mhy = mx*(float)recipBoxVectors[1][0] + my*(float)recipBoxVectors[1][1];
194
195
            float mhx2y2 = mhx*mhx + mhy*mhy;
            float bxby = bx*bsplineModuli[1][ky];
196
            for (int kz = firstz; kz < gridz; kz++) {
197
                int mz = (kz < (gridz+1)/2) ? kz : kz-gridz;
peastman's avatar
peastman committed
198
                float mhz = mx*(float)recipBoxVectors[2][0] + my*(float)recipBoxVectors[2][1] + mz*(float)recipBoxVectors[2][2];
199
200
201
202
                float bz = bsplineModuli[2][kz];
                float m2 = mhx2y2 + mhz*mhz;
                float denom = m2*bxby*bz;
                float eterm = exp(-recipExpFactor*m2)/denom;
203
204
205
206
207
208
209
210
211
212
213
                int kx1, ky1, kz1;
                if (kz >= gridz/2+1) {
                    kx1 = (kx == 0 ? kx : gridx-kx);
                    ky1 = (ky == 0 ? ky : gridy-ky);
                    kz1 = gridz-kz;
                }
                else {
                    kx1 = kx;
                    ky1 = ky;
                    kz1 = kz;
                }
214
                int index = kx1*yzsizeHalf + ky1*zsizeHalf + kz1;
215
216
217
218
219
220
221
                float gridReal = grid[index][0];
                float gridImag = grid[index][1];
                energy += eterm*(gridReal*gridReal+gridImag*gridImag);
            }
            firstz = 0;
        }
    }
222
    return 0.5f*energy;
223
224
}

225
static void reciprocalConvolution(int start, int end, fftwf_complex* grid, int gridx, int gridy, int gridz, vector<float>& recipEterm) {
226
227
228
    const unsigned int zsize = gridz/2+1;
    const unsigned int yzsize = gridy*zsize;

229
230
    int firstz = (start == 0 ? 1 : 0);
    for (int kx = start; kx < end; kx++) {
231
232
233
        for (int ky = 0; ky < gridy; ky++) {
            for (int kz = firstz; kz < zsize; kz++) {
                int index = kx*yzsize + ky*zsize + kz;
234
                float eterm = recipEterm[index];
235
236
                grid[index][0] *= eterm;
                grid[index][1] *= eterm;
237
238
239
240
241
242
            }
            firstz = 0;
        }
    }
}

peastman's avatar
peastman committed
243
244
245
246
247
248
static void interpolateForces(int start, int end, float* posq, float* force, float* grid, int gridx, int gridy, int gridz, int numParticles, Vec3* periodicBoxVectors, Vec3* recipBoxVectors) {
    fvec4 boxSize((float) periodicBoxVectors[0][0], (float) periodicBoxVectors[1][1], (float) periodicBoxVectors[2][2], 0);
    fvec4 invBoxSize((float) recipBoxVectors[0][0], (float) recipBoxVectors[1][1], (float) recipBoxVectors[2][2], 0);
    fvec4 recipBoxVec0((float) recipBoxVectors[0][0], (float) recipBoxVectors[0][1], (float) recipBoxVectors[0][2], 0);
    fvec4 recipBoxVec1((float) recipBoxVectors[1][0], (float) recipBoxVectors[1][1], (float) recipBoxVectors[1][2], 0);
    fvec4 recipBoxVec2((float) recipBoxVectors[2][0], (float) recipBoxVectors[2][1], (float) recipBoxVectors[2][2], 0);
249
250
251
252
    fvec4 gridSize(gridx, gridy, gridz, 0);
    ivec4 gridSizeInt(gridx, gridy, gridz, 0);
    fvec4 one(1);
    fvec4 scale(1.0f/(PME_ORDER-1));
253
    const float epsilonFactor = sqrt(ONE_4PI_EPS0);
254
    for (int i = start; i < end; i++) {
255
256
        // Find the position relative to the nearest grid point.
        
257
        fvec4 pos(&posq[4*i]);
peastman's avatar
peastman committed
258
259
260
261
        float posInBox[4];
        (pos-boxSize*floor(pos*invBoxSize)).store(posInBox);
        fvec4 t = posInBox[0]*recipBoxVec0 + posInBox[1]*recipBoxVec1 + posInBox[2]*recipBoxVec2;
        t = (t-floor(t))*gridSize;
262
263
264
        ivec4 ti = t;
        fvec4 dr = t-ti;
        ivec4 gridIndex = ti-(gridSizeInt&ti==gridSizeInt);
265
266
267
        
        // Compute the B-spline coefficients.
        
268
269
270
        fvec4 data[PME_ORDER];
        fvec4 ddata[PME_ORDER];
        data[PME_ORDER-1] = 0.0f;
271
        data[1] = dr;
272
        data[0] = one-dr;
273
        for (int j = 3; j < PME_ORDER; j++) {
274
275
            fvec4 div(1.0f/(j-1));
            data[j-1] = div*dr*data[j-2];
276
            for (int k = 1; k < j-1; k++)
277
278
                data[j-k-1] = div*((dr+k)*data[j-k-2]+(fvec4(j-k)-dr)*data[j-k-1]);
            data[0] = div*(one-dr)*data[0];
279
        }
280
        ddata[0] = -data[0];
281
        for (int j = 1; j < PME_ORDER; j++)
282
283
            ddata[j] = data[j-1]-data[j];
        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
284
        for (int j = 1; j < (PME_ORDER-1); j++)
285
286
            data[PME_ORDER-j-1] = scale*((dr+j)*data[PME_ORDER-j-2]+(fvec4(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
        data[0] = scale*(one-dr)*data[0];
287
                
288
289
        // Compute the force on this atom.
        
290
291
292
        int gridIndexX = gridIndex[0];
        int gridIndexY = gridIndex[1];
        int gridIndexZ = gridIndex[2];
293
294
        if (gridIndexX < 0)
            return; // This happens when a simulation blows up and coordinates become NaN.
peastman's avatar
peastman committed
295
296
297
298
299
        int zindex[PME_ORDER];
        for (int j = 0; j < PME_ORDER; j++) {
            zindex[j] = gridIndexZ+j;
            zindex[j] -= (zindex[j] >= gridz ? gridz : 0);
        }
300
        fvec4 zdata[PME_ORDER];
301
        for (int j = 0; j < PME_ORDER; j++)
302
303
            zdata[j] = fvec4(data[j][2], data[j][2], ddata[j][2], 0);
        fvec4 f = 0.0f;
304
305
306
307
        for (int ix = 0; ix < PME_ORDER; ix++) {
            int xbase = gridIndexX+ix;
            xbase -= (xbase >= gridx ? gridx : 0);
            xbase = xbase*gridy*gridz;
308
309
310
            float dx = data[ix][0];
            float ddx = ddata[ix][0];
            fvec4 xdata(ddx, dx, dx, 0);
311
312
313
314
315

            for (int iy = 0; iy < PME_ORDER; iy++) {
                int ybase = gridIndexY+iy;
                ybase -= (ybase >= gridy ? gridy : 0);
                ybase = xbase + ybase*gridz;
316
317
318
                float dy = data[iy][1];
                float ddy = ddata[iy][1];
                fvec4 xydata = xdata*fvec4(dy, ddy, dy, 0);
319
320

                for (int iz = 0; iz < PME_ORDER; iz++) {
321
322
                    fvec4 gridValue(grid[ybase+zindex[iz]]);
                    f = f+xydata*zdata[iz]*gridValue;
323
324
325
                }
            }
        }
peastman's avatar
peastman committed
326
327
328
329
330
331
        f *= -epsilonFactor*posq[4*i+3];
        float fc[4];
        f.store(fc);
        force[4*i+0] = fc[0]*gridx*(float)recipBoxVectors[0][0];
        force[4*i+1] = fc[0]*gridx*(float)recipBoxVectors[1][0]+fc[1]*gridy*(float)recipBoxVectors[1][1];
        force[4*i+2] = fc[0]*gridx*(float)recipBoxVectors[2][0]+fc[1]*gridy*(float)recipBoxVectors[2][1]+fc[2]*gridz*(float)recipBoxVectors[2][2];
332
333
334
    }
}

335
class CpuCalcPmeReciprocalForceKernel::ThreadData {
336
public:
337
    CpuCalcPmeReciprocalForceKernel& owner;
338
    int index;
339
    float* tempGrid;
340
    ThreadData(CpuCalcPmeReciprocalForceKernel& owner, int index) : owner(owner), index(index), tempGrid(NULL) {
341
342
343
344
    }
};

static void* threadBody(void* args) {
345
    CpuCalcPmeReciprocalForceKernel::ThreadData& data = *reinterpret_cast<CpuCalcPmeReciprocalForceKernel::ThreadData*>(args);
346
    data.owner.runThread(data.index);
347
348
    if (data.tempGrid != NULL)
        fftwf_free(data.tempGrid);
349
350
351
352
    delete &data;
    return 0;
}

353
void CpuCalcPmeReciprocalForceKernel::initialize(int xsize, int ysize, int zsize, int numParticles, double alpha) {
354
355
356
357
358
    if (!hasInitializedThreads) {
        numThreads = getNumProcessors();
        fftwf_init_threads();
        hasInitializedThreads = true;
    }
peastman's avatar
peastman committed
359
360
361
    gridx = findFFTDimension(xsize, false);
    gridy = findFFTDimension(ysize, false);
    gridz = findFFTDimension(zsize, true);
362
363
364
    this->numParticles = numParticles;
    this->alpha = alpha;
    force.resize(4*numParticles);
365
    recipEterm.resize(gridx*gridy*gridz);
366
    
367
368
369
370
    // Initialize threads.
    
    pthread_cond_init(&startCondition, NULL);
    pthread_cond_init(&endCondition, NULL);
371
372
    pthread_cond_init(&mainThreadStartCondition, NULL);
    pthread_cond_init(&mainThreadEndCondition, NULL);
373
374
375
376
377
378
    pthread_mutex_init(&lock, NULL);
    thread.resize(numThreads);
    for (int i = 0; i < numThreads; i++) {
        ThreadData* data = new ThreadData(*this, i);
        threadData.push_back(data);
        pthread_create(&thread[i], NULL, threadBody, data);
379
        data->tempGrid = (float*) fftwf_malloc(sizeof(float)*(gridx*gridy*gridz+3));
380
    }
381
382
    pthread_create(&mainThread, NULL, threadBody, new ThreadData(*this, -1));
    
383
384
    // Initialize FFTW.
    
385
    realGrid = threadData[0]->tempGrid;
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
    complexGrid = (fftwf_complex*) fftwf_malloc(sizeof(fftwf_complex)*gridx*gridy*(gridz/2+1));
    fftwf_plan_with_nthreads(numThreads);
    forwardFFT = fftwf_plan_dft_r2c_3d(gridx, gridy, gridz, realGrid, complexGrid, FFTW_MEASURE);
    backwardFFT = fftwf_plan_dft_c2r_3d(gridx, gridy, gridz, complexGrid, realGrid, FFTW_MEASURE);
    hasCreatedPlan = true;
    
    // Initialize the b-spline moduli.

    int maxSize = max(max(gridx, gridy), gridz);
    vector<double> data(PME_ORDER);
    vector<double> ddata(PME_ORDER);
    vector<double> bsplinesData(maxSize);
    data[PME_ORDER-1] = 0.0;
    data[1] = 0.0;
    data[0] = 1.0;
    for (int i = 3; i < PME_ORDER; i++) {
        double div = 1.0/(i-1.0);
        data[i-1] = 0.0;
        for (int j = 1; j < (i-1); j++)
            data[i-j-1] = div*(j*data[i-j-2]+(i-j)*data[i-j-1]);
        data[0] = div*data[0];
    }

    // Differentiate.

    ddata[0] = -data[0];
    for (int i = 1; i < PME_ORDER; i++)
        ddata[i] = data[i-1]-data[i];
    double div = 1.0/(PME_ORDER-1);
    data[PME_ORDER-1] = 0.0;
    for (int i = 1; i < (PME_ORDER-1); i++)
        data[PME_ORDER-i-1] = div*(i*data[PME_ORDER-i-2]+(PME_ORDER-i)*data[PME_ORDER-i-1]);
    data[0] = div*data[0];
    for (int i = 0; i < maxSize; i++)
        bsplinesData[i] = 0.0;
    for (int i = 1; i <= PME_ORDER; i++)
        bsplinesData[i] = data[i-1];

    // Evaluate the actual bspline moduli for X/Y/Z.

    bsplineModuli[0].resize(gridx);
    bsplineModuli[1].resize(gridy);
    bsplineModuli[2].resize(gridz);
    for (int dim = 0; dim < 3; dim++) {
        int ndata = bsplineModuli[dim].size();
        vector<float>& moduli = bsplineModuli[dim];
        for (int i = 0; i < ndata; i++) {
            double sc = 0.0;
            double ss = 0.0;
            for (int j = 0; j < ndata; j++) {
                double arg = (2.0*M_PI*i*j)/ndata;
                sc += bsplinesData[j]*cos(arg);
                ss += bsplinesData[j]*sin(arg);
            }
            moduli[i] = (float) (sc*sc+ss*ss);
        }
        for (int i = 0; i < ndata; i++)
            if (moduli[i] < 1.0e-7f)
                moduli[i] = (moduli[i-1]+moduli[i+1])*0.5f;
    }
}

448
CpuCalcPmeReciprocalForceKernel::~CpuCalcPmeReciprocalForceKernel() {
449
450
    isDeleted = true;
    pthread_mutex_lock(&lock);
451
    pthread_cond_broadcast(&startCondition);
452
    pthread_cond_broadcast(&mainThreadStartCondition);
453
    pthread_mutex_unlock(&lock);
454
    for (int i = 0; i < (int) thread.size(); i++)
455
        pthread_join(thread[i], NULL);
456
    pthread_join(mainThread, NULL);
457
458
459
    pthread_mutex_destroy(&lock);
    pthread_cond_destroy(&startCondition);
    pthread_cond_destroy(&endCondition);
460
461
    pthread_cond_destroy(&mainThreadStartCondition);
    pthread_cond_destroy(&mainThreadEndCondition);
462
463
464
465
466
467
    if (complexGrid != NULL)
        fftwf_free(complexGrid);
    if (hasCreatedPlan) {
        fftwf_destroy_plan(forwardFFT);
        fftwf_destroy_plan(backwardFFT);
    }
468
469
}

470
void CpuCalcPmeReciprocalForceKernel::runThread(int index) {
471
472
473
    if (index == -1) {
        // This is the main thread that coordinates all the other ones.
        
474
        pthread_mutex_lock(&lock);
475
476
477
478
479
        while (true) {
            // Wait for the signal to start.
            
            pthread_cond_wait(&mainThreadStartCondition, &lock);
            if (isDeleted)
480
481
                break;
            posq = io->getPosq();
482
483
484
            advanceThreads(); // Signal threads to perform charge spreading.
            advanceThreads(); // Signal threads to sum the charge grids.
            fftwf_execute_dft_r2c(forwardFFT, realGrid, complexGrid);
peastman's avatar
peastman committed
485
            if (lastBoxVectors[0] != periodicBoxVectors[0] || lastBoxVectors[1] != periodicBoxVectors[1] || lastBoxVectors[2] != periodicBoxVectors[2])
486
                advanceThreads(); // Signal threads to compute the reciprocal scale factors.
487
488
489
490
491
492
            if (includeEnergy)
                advanceThreads(); // Signal threads to compute energy.
            advanceThreads(); // Signal threads to perform reciprocal convolution.
            fftwf_execute_dft_c2r(backwardFFT, complexGrid, realGrid);
            advanceThreads(); // Signal threads to interpolate forces.
            isFinished = true;
peastman's avatar
peastman committed
493
494
495
            lastBoxVectors[0] = periodicBoxVectors[0];
            lastBoxVectors[1] = periodicBoxVectors[1];
            lastBoxVectors[2] = periodicBoxVectors[2];
496
497
            pthread_cond_signal(&mainThreadEndCondition);
        }
498
        pthread_mutex_unlock(&lock);
499
500
501
502
503
504
505
506
507
508
509
510
    }
    else {
        // This is a worker thread.
        
        int particleStart = (index*numParticles)/numThreads;
        int particleEnd = ((index+1)*numParticles)/numThreads;
        int gridxStart = (index*gridx)/numThreads;
        int gridxEnd = ((index+1)*gridx)/numThreads;
        int gridSize = (gridx*gridy*gridz+3)/4;
        int gridStart = 4*((index*gridSize)/numThreads);
        int gridEnd = 4*(((index+1)*gridSize)/numThreads);
        while (true) {
511
            threadWait();
512
            if (isDeleted)
513
                break;
peastman's avatar
peastman committed
514
            spreadCharge(particleStart, particleEnd, posq, threadData[index]->tempGrid, gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors);
515
516
517
            threadWait();
            int numGrids = threadData.size();
            for (int i = gridStart; i < gridEnd; i += 4) {
518
                fvec4 sum(&realGrid[i]);
519
                for (int j = 1; j < numGrids; j++)
520
521
                    sum += fvec4(&threadData[j]->tempGrid[i]);
                sum.store(&realGrid[i]);
522
523
            }
            threadWait();
peastman's avatar
peastman committed
524
525
            if (lastBoxVectors[0] != periodicBoxVectors[0] || lastBoxVectors[1] != periodicBoxVectors[1] || lastBoxVectors[2] != periodicBoxVectors[2]) {
                computeReciprocalEterm(gridxStart, gridxEnd, gridx, gridy, gridz, recipEterm, alpha, bsplineModuli, periodicBoxVectors, recipBoxVectors);
526
527
                threadWait();
            }
528
            if (includeEnergy) {
peastman's avatar
peastman committed
529
                double threadEnergy = reciprocalEnergy(gridxStart, gridxEnd, complexGrid, gridx, gridy, gridz, alpha, bsplineModuli, periodicBoxVectors, recipBoxVectors);
530
531
532
533
534
                pthread_mutex_lock(&lock);
                energy += threadEnergy;
                pthread_mutex_unlock(&lock);
                threadWait();
            }
535
            reciprocalConvolution(gridxStart, gridxEnd, complexGrid, gridx, gridy, gridz, recipEterm);
536
            threadWait();
peastman's avatar
peastman committed
537
            interpolateForces(particleStart, particleEnd, posq, &force[0], realGrid, gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors);
538
539
540
541
        }
    }
}

542
void CpuCalcPmeReciprocalForceKernel::threadWait() {
543
544
545
546
547
548
549
    pthread_mutex_lock(&lock);
    waitCount++;
    pthread_cond_signal(&endCondition);
    pthread_cond_wait(&startCondition, &lock);
    pthread_mutex_unlock(&lock);
}

550
void CpuCalcPmeReciprocalForceKernel::advanceThreads() {
551
552
553
554
555
556
557
    waitCount = 0;
    pthread_cond_broadcast(&startCondition);
    while (waitCount < numThreads) {
        pthread_cond_wait(&endCondition, &lock);
    }
}

peastman's avatar
peastman committed
558
void CpuCalcPmeReciprocalForceKernel::beginComputation(IO& io, const Vec3* periodicBoxVectors, bool includeEnergy) {
559
    this->io = &io;
peastman's avatar
peastman committed
560
561
562
    this->periodicBoxVectors[0] = periodicBoxVectors[0];
    this->periodicBoxVectors[1] = periodicBoxVectors[1];
    this->periodicBoxVectors[2] = periodicBoxVectors[2];
563
564
    this->includeEnergy = includeEnergy;
    energy = 0.0;
peastman's avatar
peastman committed
565
566
567
568
569
570
571
572
573
574
575

    // Invert the box vectors.

    double determinant = periodicBoxVectors[0][0]*periodicBoxVectors[1][1]*periodicBoxVectors[2][2];
    double scale = 1.0/determinant;
    recipBoxVectors[0] = Vec3(periodicBoxVectors[1][1]*periodicBoxVectors[2][2], 0, 0)*scale;
    recipBoxVectors[1] = Vec3(-periodicBoxVectors[1][0]*periodicBoxVectors[2][2], periodicBoxVectors[0][0]*periodicBoxVectors[2][2], 0)*scale;
    recipBoxVectors[2] = Vec3(periodicBoxVectors[1][0]*periodicBoxVectors[2][1]-periodicBoxVectors[1][1]*periodicBoxVectors[2][0], -periodicBoxVectors[0][0]*periodicBoxVectors[2][1], periodicBoxVectors[0][0]*periodicBoxVectors[1][1])*scale;

    // Do the calculation.

576
    pthread_mutex_lock(&lock);
577
    isFinished = false;
578
579
580
581
    pthread_cond_signal(&mainThreadStartCondition);
    pthread_mutex_unlock(&lock);
}

582
double CpuCalcPmeReciprocalForceKernel::finishComputation(IO& io) {
583
584
585
586
587
588
    pthread_mutex_lock(&lock);
    while (!isFinished) {
        pthread_cond_wait(&mainThreadEndCondition, &lock);
    }
    pthread_mutex_unlock(&lock);
    io.setForce(&force[0]);
589
    return energy;
peastman's avatar
peastman committed
590
}
591
592

bool CpuCalcPmeReciprocalForceKernel::isProcessorSupported() {
593
    return isVec4Supported();
594
}
595

peastman's avatar
peastman committed
596
int CpuCalcPmeReciprocalForceKernel::findFFTDimension(int minimum, bool isZ) {
597
598
599
600
601
    if (minimum < 1)
        return 1;
    while (true) {
        // Attempt to factor the current value.

peastman's avatar
peastman committed
602
603
604
605
606
607
        if (isZ && minimum%2 == 1) {
            // Force the last dimension to be even, since this produces better performance in FFTW.

            minimum++;
            continue;
        }
608
        int unfactored = minimum;
609
        for (int factor = 2; factor < 8; factor++) {
610
611
612
            while (unfactored > 1 && unfactored%factor == 0)
                unfactored /= factor;
        }
613
        if (unfactored == 1 || unfactored == 11 || unfactored == 13)
614
615
616
617
            return minimum;
        minimum++;
    }
}