CpuPmeKernels.cpp 27.2 KB
Newer Older
peastman's avatar
peastman committed
1
2
3
4
5
6
7
8
/* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
 * -------------------------------------------------------------------------- *
 * This is part of the OpenMM molecular simulation toolkit originating from   *
 * Simbios, the NIH National Center for Physics-Based Simulation of           *
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
9
 * Portions copyright (c) 2013-2015 Stanford University and the Authors.      *
peastman's avatar
peastman committed
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
 * Permission is hereby granted, free of charge, to any person obtaining a    *
 * copy of this software and associated documentation files (the "Software"), *
 * to deal in the Software without restriction, including without limitation  *
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
 * and/or sell copies of the Software, and to permit persons to whom the      *
 * Software is furnished to do so, subject to the following conditions:       *
 *                                                                            *
 * The above copyright notice and this permission notice shall be included in *
 * all copies or substantial portions of the Software.                        *
 *                                                                            *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
 * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */

32
33
34
#ifdef WIN32
  #define _USE_MATH_DEFINES // Needed to get M_PI
#endif
35
#include "CpuPmeKernels.h"
36
#include "SimTKOpenMMRealType.h"
37
#include "openmm/internal/hardware.h"
38
#include "openmm/internal/vectorize.h"
39
#include <cmath>
peastman's avatar
peastman committed
40
#include <cstring>
41
#include <sstream>
Robert McGibbon's avatar
Robert McGibbon committed
42
#include <cstdlib>
peastman's avatar
peastman committed
43
44
45
46
47
48

using namespace OpenMM;
using namespace std;

static const int PME_ORDER = 5;

49
50
bool CpuCalcPmeReciprocalForceKernel::hasInitializedThreads = false;
int CpuCalcPmeReciprocalForceKernel::numThreads = 0;
51

peastman's avatar
peastman committed
52
static void spreadCharge(int start, int end, float* posq, float* grid, int gridx, int gridy, int gridz, int numParticles, Vec3* periodicBoxVectors, Vec3* recipBoxVectors) {
53
    float temp[4];
peastman's avatar
peastman committed
54
55
56
57
58
    fvec4 boxSize((float) periodicBoxVectors[0][0], (float) periodicBoxVectors[1][1], (float) periodicBoxVectors[2][2], 0);
    fvec4 invBoxSize((float) recipBoxVectors[0][0], (float) recipBoxVectors[1][1], (float) recipBoxVectors[2][2], 0);
    fvec4 recipBoxVec0((float) recipBoxVectors[0][0], (float) recipBoxVectors[0][1], (float) recipBoxVectors[0][2], 0);
    fvec4 recipBoxVec1((float) recipBoxVectors[1][0], (float) recipBoxVectors[1][1], (float) recipBoxVectors[1][2], 0);
    fvec4 recipBoxVec2((float) recipBoxVectors[2][0], (float) recipBoxVectors[2][1], (float) recipBoxVectors[2][2], 0);
59
60
61
62
    fvec4 gridSize(gridx, gridy, gridz, 0);
    ivec4 gridSizeInt(gridx, gridy, gridz, 0);
    fvec4 one(1);
    fvec4 scale(1.0f/(PME_ORDER-1));
Robert McGibbon's avatar
Robert McGibbon committed
63
    float posInBox[4] = {0,0,0,0};
peastman's avatar
peastman committed
64
    const float epsilonFactor = sqrt(ONE_4PI_EPS0);
65
    memset(grid, 0, sizeof(float)*gridx*gridy*gridz);
Robert McGibbon's avatar
Robert McGibbon committed
66

67
    for (int i = start; i < end; i++) {
peastman's avatar
peastman committed
68
        // Find the position relative to the nearest grid point.
Robert McGibbon's avatar
Robert McGibbon committed
69

70
        fvec4 pos(&posq[4*i]);
peastman's avatar
peastman committed
71
72
73
        (pos-boxSize*floor(pos*invBoxSize)).store(posInBox);
        fvec4 t = posInBox[0]*recipBoxVec0 + posInBox[1]*recipBoxVec1 + posInBox[2]*recipBoxVec2;
        t = (t-floor(t))*gridSize;
74
75
76
        ivec4 ti = t;
        fvec4 dr = t-ti;
        ivec4 gridIndex = ti-(gridSizeInt&ti==gridSizeInt);
peastman's avatar
peastman committed
77
78
        
        // Compute the B-spline coefficients.
Robert McGibbon's avatar
Robert McGibbon committed
79

80
81
        fvec4 data[PME_ORDER];
        data[PME_ORDER-1] = 0.0f;
peastman's avatar
peastman committed
82
        data[1] = dr;
83
        data[0] = one-dr;
peastman's avatar
peastman committed
84
        for (int j = 3; j < PME_ORDER; j++) {
85
86
            fvec4 div(1.0f/(j-1));
            data[j-1] = div*dr*data[j-2];
peastman's avatar
peastman committed
87
            for (int k = 1; k < j-1; k++)
88
89
                data[j-k-1] = div*((dr+k)*data[j-k-2]+(fvec4(j-k)-dr)*data[j-k-1]);
            data[0] = div*(one-dr)*data[0];
peastman's avatar
peastman committed
90
        }
91
        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
peastman's avatar
peastman committed
92
        for (int j = 1; j < (PME_ORDER-1); j++)
93
94
            data[PME_ORDER-j-1] = scale*((dr+j)*data[PME_ORDER-j-2]+(fvec4(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
        data[0] = scale*(one-dr)*data[0];
peastman's avatar
peastman committed
95
96
97
        
        // Spread the charges.
        
98
99
100
        int gridIndexX = gridIndex[0];
        int gridIndexY = gridIndex[1];
        int gridIndexZ = gridIndex[2];
101
102
        if (gridIndexX < 0)
            return; // This happens when a simulation blows up and coordinates become NaN.
peastman's avatar
peastman committed
103
104
105
106
107
        int zindex[PME_ORDER];
        for (int j = 0; j < PME_ORDER; j++) {
            zindex[j] = gridIndexZ+j;
            zindex[j] -= (zindex[j] >= gridz ? gridz : 0);
        }
peastman's avatar
peastman committed
108
        float charge = epsilonFactor*posq[4*i+3];
109
110
        fvec4 zdata0to3(data[0][2], data[1][2], data[2][2], data[3][2]);
        float zdata4 = data[4][2];
111
112
113
114
115
        if (gridIndexZ+4 < gridz) {
            for (int ix = 0; ix < PME_ORDER; ix++) {
                int xbase = gridIndexX+ix;
                xbase -= (xbase >= gridx ? gridx : 0);
                xbase = xbase*gridy*gridz;
116
                float xdata = charge*data[ix][0];
117
118
119
120
                for (int iy = 0; iy < PME_ORDER; iy++) {
                    int ybase = gridIndexY+iy;
                    ybase -= (ybase >= gridy ? gridy : 0);
                    ybase = xbase + ybase*gridz;
121
122
123
                    float multiplier = xdata*data[iy][1];
                    fvec4 add0to3 = zdata0to3*multiplier;
                    (fvec4(&grid[ybase+gridIndexZ])+add0to3).store(&grid[ybase+gridIndexZ]);
124
125
126
127
128
129
130
131
132
                    grid[ybase+zindex[4]] += multiplier*zdata4;
                }
            }
        }
        else {
            for (int ix = 0; ix < PME_ORDER; ix++) {
                int xbase = gridIndexX+ix;
                xbase -= (xbase >= gridx ? gridx : 0);
                xbase = xbase*gridy*gridz;
133
                float xdata = charge*data[ix][0];
134
135
136
137
                for (int iy = 0; iy < PME_ORDER; iy++) {
                    int ybase = gridIndexY+iy;
                    ybase -= (ybase >= gridy ? gridy : 0);
                    ybase = xbase + ybase*gridz;
138
139
140
                    float multiplier = xdata*data[iy][1];
                    fvec4 add0to3 = zdata0to3*multiplier;
                    add0to3.store(temp);
peastman's avatar
peastman committed
141
142
143
144
                    grid[ybase+zindex[0]] += temp[0];
                    grid[ybase+zindex[1]] += temp[1];
                    grid[ybase+zindex[2]] += temp[2];
                    grid[ybase+zindex[3]] += temp[3];
145
                    grid[ybase+zindex[4]] += multiplier*zdata4;
peastman's avatar
peastman committed
146
147
148
149
150
151
                }
            }
        }
    }
}

peastman's avatar
peastman committed
152
static void computeReciprocalEterm(int start, int end, int gridx, int gridy, int gridz, vector<float>& recipEterm, double alpha, vector<float>* bsplineModuli, Vec3* periodicBoxVectors, Vec3* recipBoxVectors) {
153
154
    const unsigned int zsize = gridz/2+1;
    const unsigned int yzsize = gridy*zsize;
peastman's avatar
peastman committed
155
    const float scaleFactor = (float) (M_PI*periodicBoxVectors[0][0]*periodicBoxVectors[1][1]*periodicBoxVectors[2][2]);
156
157
    const float recipExpFactor = (float) (M_PI*M_PI/(alpha*alpha));

158
159
    int firstz = (start == 0 ? 1 : 0);
    for (int kx = start; kx < end; kx++) {
160
        int mx = (kx < (gridx+1)/2) ? kx : kx-gridx;
peastman's avatar
peastman committed
161
        float mhx = mx*(float)recipBoxVectors[0][0];
162
163
164
        float bx = scaleFactor*bsplineModuli[0][kx];
        for (int ky = 0; ky < gridy; ky++) {
            int my = (ky < (gridy+1)/2) ? ky : ky-gridy;
peastman's avatar
peastman committed
165
            float mhy = mx*(float)recipBoxVectors[1][0] + my*(float)recipBoxVectors[1][1];
166
167
            float mhx2y2 = mhx*mhx + mhy*mhy;
            float bxby = bx*bsplineModuli[1][ky];
168
169
            for (int kz = firstz; kz < zsize; kz++) {
                int index = kx*yzsize + ky*zsize + kz;
170
                int mz = (kz < (gridz+1)/2) ? kz : kz-gridz;
peastman's avatar
peastman committed
171
                float mhz = mx*(float)recipBoxVectors[2][0] + my*(float)recipBoxVectors[2][1] + mz*(float)recipBoxVectors[2][2];
172
                float bz = bsplineModuli[2][kz];
173
174
                float m2 = mhx2y2 + mhz*mhz;
                float denom = m2*bxby*bz;
175
176
177
178
179
180
181
                recipEterm[index] = exp(-recipExpFactor*m2)/denom;
            }
            firstz = 0;
        }
    }
}

182
static double reciprocalEnergy(int start, int end, fftwf_complex* grid, int gridx, int gridy, int gridz, double alpha, vector<float>* bsplineModuli, Vec3* periodicBoxVectors, Vec3* recipBoxVectors) {
183
184
    const unsigned int zsizeHalf = gridz/2+1;
    const unsigned int yzsizeHalf = gridy*zsizeHalf;
peastman's avatar
peastman committed
185
    const float scaleFactor = (float) (M_PI*periodicBoxVectors[0][0]*periodicBoxVectors[1][1]*periodicBoxVectors[2][2]);
186
    const float recipExpFactor = (float) (M_PI*M_PI/(alpha*alpha));
187
    double energy = 0.0;
188
189
190

    int firstz = (start == 0 ? 1 : 0);
    for (int kx = start; kx < end; kx++) {
191
        int mx = (kx < (gridx+1)/2) ? kx : kx-gridx;
peastman's avatar
peastman committed
192
        float mhx = mx*(float)recipBoxVectors[0][0];
193
        float bx = scaleFactor*bsplineModuli[0][kx];
194
195
        for (int ky = 0; ky < gridy; ky++) {
            int my = (ky < (gridy+1)/2) ? ky : ky-gridy;
peastman's avatar
peastman committed
196
            float mhy = mx*(float)recipBoxVectors[1][0] + my*(float)recipBoxVectors[1][1];
197
198
            float mhx2y2 = mhx*mhx + mhy*mhy;
            float bxby = bx*bsplineModuli[1][ky];
199
            for (int kz = firstz; kz < gridz; kz++) {
200
                int mz = (kz < (gridz+1)/2) ? kz : kz-gridz;
peastman's avatar
peastman committed
201
                float mhz = mx*(float)recipBoxVectors[2][0] + my*(float)recipBoxVectors[2][1] + mz*(float)recipBoxVectors[2][2];
202
203
204
205
                float bz = bsplineModuli[2][kz];
                float m2 = mhx2y2 + mhz*mhz;
                float denom = m2*bxby*bz;
                float eterm = exp(-recipExpFactor*m2)/denom;
206
207
208
209
210
211
212
213
214
215
216
                int kx1, ky1, kz1;
                if (kz >= gridz/2+1) {
                    kx1 = (kx == 0 ? kx : gridx-kx);
                    ky1 = (ky == 0 ? ky : gridy-ky);
                    kz1 = gridz-kz;
                }
                else {
                    kx1 = kx;
                    ky1 = ky;
                    kz1 = kz;
                }
217
                int index = kx1*yzsizeHalf + ky1*zsizeHalf + kz1;
218
219
220
221
222
223
224
                float gridReal = grid[index][0];
                float gridImag = grid[index][1];
                energy += eterm*(gridReal*gridReal+gridImag*gridImag);
            }
            firstz = 0;
        }
    }
225
    return 0.5*energy;
226
227
}

228
static void reciprocalConvolution(int start, int end, fftwf_complex* grid, int gridx, int gridy, int gridz, vector<float>& recipEterm) {
229
230
231
    const unsigned int zsize = gridz/2+1;
    const unsigned int yzsize = gridy*zsize;

232
233
    int firstz = (start == 0 ? 1 : 0);
    for (int kx = start; kx < end; kx++) {
234
235
236
        for (int ky = 0; ky < gridy; ky++) {
            for (int kz = firstz; kz < zsize; kz++) {
                int index = kx*yzsize + ky*zsize + kz;
237
                float eterm = recipEterm[index];
238
239
                grid[index][0] *= eterm;
                grid[index][1] *= eterm;
240
241
242
243
244
245
            }
            firstz = 0;
        }
    }
}

peastman's avatar
peastman committed
246
247
248
249
250
251
static void interpolateForces(int start, int end, float* posq, float* force, float* grid, int gridx, int gridy, int gridz, int numParticles, Vec3* periodicBoxVectors, Vec3* recipBoxVectors) {
    fvec4 boxSize((float) periodicBoxVectors[0][0], (float) periodicBoxVectors[1][1], (float) periodicBoxVectors[2][2], 0);
    fvec4 invBoxSize((float) recipBoxVectors[0][0], (float) recipBoxVectors[1][1], (float) recipBoxVectors[2][2], 0);
    fvec4 recipBoxVec0((float) recipBoxVectors[0][0], (float) recipBoxVectors[0][1], (float) recipBoxVectors[0][2], 0);
    fvec4 recipBoxVec1((float) recipBoxVectors[1][0], (float) recipBoxVectors[1][1], (float) recipBoxVectors[1][2], 0);
    fvec4 recipBoxVec2((float) recipBoxVectors[2][0], (float) recipBoxVectors[2][1], (float) recipBoxVectors[2][2], 0);
252
253
254
255
    fvec4 gridSize(gridx, gridy, gridz, 0);
    ivec4 gridSizeInt(gridx, gridy, gridz, 0);
    fvec4 one(1);
    fvec4 scale(1.0f/(PME_ORDER-1));
256
    const float epsilonFactor = sqrt(ONE_4PI_EPS0);
257
    for (int i = start; i < end; i++) {
258
259
        // Find the position relative to the nearest grid point.
        
260
        fvec4 pos(&posq[4*i]);
peastman's avatar
peastman committed
261
262
263
264
        float posInBox[4];
        (pos-boxSize*floor(pos*invBoxSize)).store(posInBox);
        fvec4 t = posInBox[0]*recipBoxVec0 + posInBox[1]*recipBoxVec1 + posInBox[2]*recipBoxVec2;
        t = (t-floor(t))*gridSize;
265
266
267
        ivec4 ti = t;
        fvec4 dr = t-ti;
        ivec4 gridIndex = ti-(gridSizeInt&ti==gridSizeInt);
268
269
270
        
        // Compute the B-spline coefficients.
        
271
272
273
        fvec4 data[PME_ORDER];
        fvec4 ddata[PME_ORDER];
        data[PME_ORDER-1] = 0.0f;
274
        data[1] = dr;
275
        data[0] = one-dr;
276
        for (int j = 3; j < PME_ORDER; j++) {
277
278
            fvec4 div(1.0f/(j-1));
            data[j-1] = div*dr*data[j-2];
279
            for (int k = 1; k < j-1; k++)
280
281
                data[j-k-1] = div*((dr+k)*data[j-k-2]+(fvec4(j-k)-dr)*data[j-k-1]);
            data[0] = div*(one-dr)*data[0];
282
        }
283
        ddata[0] = -data[0];
284
        for (int j = 1; j < PME_ORDER; j++)
285
286
            ddata[j] = data[j-1]-data[j];
        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
287
        for (int j = 1; j < (PME_ORDER-1); j++)
288
289
            data[PME_ORDER-j-1] = scale*((dr+j)*data[PME_ORDER-j-2]+(fvec4(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
        data[0] = scale*(one-dr)*data[0];
290
                
291
292
        // Compute the force on this atom.
        
293
294
295
        int gridIndexX = gridIndex[0];
        int gridIndexY = gridIndex[1];
        int gridIndexZ = gridIndex[2];
296
297
        if (gridIndexX < 0)
            return; // This happens when a simulation blows up and coordinates become NaN.
peastman's avatar
peastman committed
298
299
300
301
302
        int zindex[PME_ORDER];
        for (int j = 0; j < PME_ORDER; j++) {
            zindex[j] = gridIndexZ+j;
            zindex[j] -= (zindex[j] >= gridz ? gridz : 0);
        }
303
        fvec4 zdata[PME_ORDER];
304
        for (int j = 0; j < PME_ORDER; j++)
305
306
            zdata[j] = fvec4(data[j][2], data[j][2], ddata[j][2], 0);
        fvec4 f = 0.0f;
307
308
309
310
        for (int ix = 0; ix < PME_ORDER; ix++) {
            int xbase = gridIndexX+ix;
            xbase -= (xbase >= gridx ? gridx : 0);
            xbase = xbase*gridy*gridz;
311
312
313
            float dx = data[ix][0];
            float ddx = ddata[ix][0];
            fvec4 xdata(ddx, dx, dx, 0);
314
315
316
317
318

            for (int iy = 0; iy < PME_ORDER; iy++) {
                int ybase = gridIndexY+iy;
                ybase -= (ybase >= gridy ? gridy : 0);
                ybase = xbase + ybase*gridz;
319
320
321
                float dy = data[iy][1];
                float ddy = ddata[iy][1];
                fvec4 xydata = xdata*fvec4(dy, ddy, dy, 0);
322
323

                for (int iz = 0; iz < PME_ORDER; iz++) {
324
325
                    fvec4 gridValue(grid[ybase+zindex[iz]]);
                    f = f+xydata*zdata[iz]*gridValue;
326
327
328
                }
            }
        }
peastman's avatar
peastman committed
329
330
331
332
333
334
        f *= -epsilonFactor*posq[4*i+3];
        float fc[4];
        f.store(fc);
        force[4*i+0] = fc[0]*gridx*(float)recipBoxVectors[0][0];
        force[4*i+1] = fc[0]*gridx*(float)recipBoxVectors[1][0]+fc[1]*gridy*(float)recipBoxVectors[1][1];
        force[4*i+2] = fc[0]*gridx*(float)recipBoxVectors[2][0]+fc[1]*gridy*(float)recipBoxVectors[2][1]+fc[2]*gridz*(float)recipBoxVectors[2][2];
335
336
337
    }
}

338
class CpuCalcPmeReciprocalForceKernel::ThreadData {
339
public:
340
    CpuCalcPmeReciprocalForceKernel& owner;
341
    int index;
342
    float* tempGrid;
343
    ThreadData(CpuCalcPmeReciprocalForceKernel& owner, int index) : owner(owner), index(index), tempGrid(NULL) {
344
345
346
347
    }
};

static void* threadBody(void* args) {
348
    CpuCalcPmeReciprocalForceKernel::ThreadData& data = *reinterpret_cast<CpuCalcPmeReciprocalForceKernel::ThreadData*>(args);
349
    data.owner.runThread(data.index);
350
351
    if (data.tempGrid != NULL)
        fftwf_free(data.tempGrid);
352
353
354
355
    delete &data;
    return 0;
}

356
void CpuCalcPmeReciprocalForceKernel::initialize(int xsize, int ysize, int zsize, int numParticles, double alpha) {
357
358
    if (!hasInitializedThreads) {
        numThreads = getNumProcessors();
359
360
361
        char* threadsEnv = getenv("OPENMM_CPU_THREADS");
        if (threadsEnv != NULL)
            stringstream(threadsEnv) >> numThreads;
362
363
364
        fftwf_init_threads();
        hasInitializedThreads = true;
    }
peastman's avatar
peastman committed
365
366
367
    gridx = findFFTDimension(xsize, false);
    gridy = findFFTDimension(ysize, false);
    gridz = findFFTDimension(zsize, true);
368
369
370
    this->numParticles = numParticles;
    this->alpha = alpha;
    force.resize(4*numParticles);
371
    recipEterm.resize(gridx*gridy*gridz);
372
    
373
374
    // Initialize threads.
    
375
    isFinished = false;
376
377
    pthread_cond_init(&startCondition, NULL);
    pthread_cond_init(&endCondition, NULL);
378
379
    pthread_cond_init(&mainThreadStartCondition, NULL);
    pthread_cond_init(&mainThreadEndCondition, NULL);
380
381
382
383
384
385
    pthread_mutex_init(&lock, NULL);
    thread.resize(numThreads);
    for (int i = 0; i < numThreads; i++) {
        ThreadData* data = new ThreadData(*this, i);
        threadData.push_back(data);
        pthread_create(&thread[i], NULL, threadBody, data);
386
        data->tempGrid = (float*) fftwf_malloc(sizeof(float)*(gridx*gridy*gridz+3));
387
    }
388
389
    pthread_create(&mainThread, NULL, threadBody, new ThreadData(*this, -1));
    
390
391
392
393
394
395
396
397
    // Wait until the main thread is up and running.
    
    pthread_mutex_lock(&lock);
    while (!isFinished) {
        pthread_cond_wait(&mainThreadEndCondition, &lock);
    }
    pthread_mutex_unlock(&lock);
    
398
399
    // Initialize FFTW.
    
400
    realGrid = threadData[0]->tempGrid;
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
    complexGrid = (fftwf_complex*) fftwf_malloc(sizeof(fftwf_complex)*gridx*gridy*(gridz/2+1));
    fftwf_plan_with_nthreads(numThreads);
    forwardFFT = fftwf_plan_dft_r2c_3d(gridx, gridy, gridz, realGrid, complexGrid, FFTW_MEASURE);
    backwardFFT = fftwf_plan_dft_c2r_3d(gridx, gridy, gridz, complexGrid, realGrid, FFTW_MEASURE);
    hasCreatedPlan = true;
    
    // Initialize the b-spline moduli.

    int maxSize = max(max(gridx, gridy), gridz);
    vector<double> data(PME_ORDER);
    vector<double> ddata(PME_ORDER);
    vector<double> bsplinesData(maxSize);
    data[PME_ORDER-1] = 0.0;
    data[1] = 0.0;
    data[0] = 1.0;
    for (int i = 3; i < PME_ORDER; i++) {
        double div = 1.0/(i-1.0);
        data[i-1] = 0.0;
        for (int j = 1; j < (i-1); j++)
            data[i-j-1] = div*(j*data[i-j-2]+(i-j)*data[i-j-1]);
        data[0] = div*data[0];
    }

    // Differentiate.

    ddata[0] = -data[0];
    for (int i = 1; i < PME_ORDER; i++)
        ddata[i] = data[i-1]-data[i];
    double div = 1.0/(PME_ORDER-1);
    data[PME_ORDER-1] = 0.0;
    for (int i = 1; i < (PME_ORDER-1); i++)
        data[PME_ORDER-i-1] = div*(i*data[PME_ORDER-i-2]+(PME_ORDER-i)*data[PME_ORDER-i-1]);
    data[0] = div*data[0];
    for (int i = 0; i < maxSize; i++)
        bsplinesData[i] = 0.0;
    for (int i = 1; i <= PME_ORDER; i++)
        bsplinesData[i] = data[i-1];

    // Evaluate the actual bspline moduli for X/Y/Z.

    bsplineModuli[0].resize(gridx);
    bsplineModuli[1].resize(gridy);
    bsplineModuli[2].resize(gridz);
    for (int dim = 0; dim < 3; dim++) {
        int ndata = bsplineModuli[dim].size();
        vector<float>& moduli = bsplineModuli[dim];
        for (int i = 0; i < ndata; i++) {
            double sc = 0.0;
            double ss = 0.0;
            for (int j = 0; j < ndata; j++) {
                double arg = (2.0*M_PI*i*j)/ndata;
                sc += bsplinesData[j]*cos(arg);
                ss += bsplinesData[j]*sin(arg);
            }
            moduli[i] = (float) (sc*sc+ss*ss);
        }
        for (int i = 0; i < ndata; i++)
            if (moduli[i] < 1.0e-7f)
                moduli[i] = (moduli[i-1]+moduli[i+1])*0.5f;
    }
}

463
CpuCalcPmeReciprocalForceKernel::~CpuCalcPmeReciprocalForceKernel() {
464
465
    isDeleted = true;
    pthread_mutex_lock(&lock);
466
    pthread_cond_broadcast(&startCondition);
467
    pthread_cond_broadcast(&mainThreadStartCondition);
468
    pthread_mutex_unlock(&lock);
469
    for (int i = 0; i < (int) thread.size(); i++)
470
        pthread_join(thread[i], NULL);
471
    pthread_join(mainThread, NULL);
472
473
474
    pthread_mutex_destroy(&lock);
    pthread_cond_destroy(&startCondition);
    pthread_cond_destroy(&endCondition);
475
476
    pthread_cond_destroy(&mainThreadStartCondition);
    pthread_cond_destroy(&mainThreadEndCondition);
477
478
479
480
481
482
    if (complexGrid != NULL)
        fftwf_free(complexGrid);
    if (hasCreatedPlan) {
        fftwf_destroy_plan(forwardFFT);
        fftwf_destroy_plan(backwardFFT);
    }
483
484
}

485
void CpuCalcPmeReciprocalForceKernel::runThread(int index) {
486
487
488
    if (index == -1) {
        // This is the main thread that coordinates all the other ones.
        
489
        pthread_mutex_lock(&lock);
490
491
        isFinished = true;
        pthread_cond_signal(&mainThreadEndCondition);
492
493
494
495
496
        while (true) {
            // Wait for the signal to start.
            
            pthread_cond_wait(&mainThreadStartCondition, &lock);
            if (isDeleted)
497
498
                break;
            posq = io->getPosq();
499
500
501
            advanceThreads(); // Signal threads to perform charge spreading.
            advanceThreads(); // Signal threads to sum the charge grids.
            fftwf_execute_dft_r2c(forwardFFT, realGrid, complexGrid);
peastman's avatar
peastman committed
502
            if (lastBoxVectors[0] != periodicBoxVectors[0] || lastBoxVectors[1] != periodicBoxVectors[1] || lastBoxVectors[2] != periodicBoxVectors[2])
503
                advanceThreads(); // Signal threads to compute the reciprocal scale factors.
504
505
506
507
508
509
            if (includeEnergy)
                advanceThreads(); // Signal threads to compute energy.
            advanceThreads(); // Signal threads to perform reciprocal convolution.
            fftwf_execute_dft_c2r(backwardFFT, complexGrid, realGrid);
            advanceThreads(); // Signal threads to interpolate forces.
            isFinished = true;
peastman's avatar
peastman committed
510
511
512
            lastBoxVectors[0] = periodicBoxVectors[0];
            lastBoxVectors[1] = periodicBoxVectors[1];
            lastBoxVectors[2] = periodicBoxVectors[2];
513
514
            pthread_cond_signal(&mainThreadEndCondition);
        }
515
        pthread_mutex_unlock(&lock);
516
517
518
519
520
521
522
523
524
525
526
527
    }
    else {
        // This is a worker thread.
        
        int particleStart = (index*numParticles)/numThreads;
        int particleEnd = ((index+1)*numParticles)/numThreads;
        int gridxStart = (index*gridx)/numThreads;
        int gridxEnd = ((index+1)*gridx)/numThreads;
        int gridSize = (gridx*gridy*gridz+3)/4;
        int gridStart = 4*((index*gridSize)/numThreads);
        int gridEnd = 4*(((index+1)*gridSize)/numThreads);
        while (true) {
528
            threadWait();
529
            if (isDeleted)
530
                break;
peastman's avatar
peastman committed
531
            spreadCharge(particleStart, particleEnd, posq, threadData[index]->tempGrid, gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors);
532
533
534
            threadWait();
            int numGrids = threadData.size();
            for (int i = gridStart; i < gridEnd; i += 4) {
535
                fvec4 sum(&realGrid[i]);
536
                for (int j = 1; j < numGrids; j++)
537
538
                    sum += fvec4(&threadData[j]->tempGrid[i]);
                sum.store(&realGrid[i]);
539
540
            }
            threadWait();
peastman's avatar
peastman committed
541
542
            if (lastBoxVectors[0] != periodicBoxVectors[0] || lastBoxVectors[1] != periodicBoxVectors[1] || lastBoxVectors[2] != periodicBoxVectors[2]) {
                computeReciprocalEterm(gridxStart, gridxEnd, gridx, gridy, gridz, recipEterm, alpha, bsplineModuli, periodicBoxVectors, recipBoxVectors);
543
544
                threadWait();
            }
545
            if (includeEnergy) {
peastman's avatar
peastman committed
546
                double threadEnergy = reciprocalEnergy(gridxStart, gridxEnd, complexGrid, gridx, gridy, gridz, alpha, bsplineModuli, periodicBoxVectors, recipBoxVectors);
547
548
549
550
551
                pthread_mutex_lock(&lock);
                energy += threadEnergy;
                pthread_mutex_unlock(&lock);
                threadWait();
            }
552
            reciprocalConvolution(gridxStart, gridxEnd, complexGrid, gridx, gridy, gridz, recipEterm);
553
            threadWait();
peastman's avatar
peastman committed
554
            interpolateForces(particleStart, particleEnd, posq, &force[0], realGrid, gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors);
555
556
557
558
        }
    }
}

559
void CpuCalcPmeReciprocalForceKernel::threadWait() {
560
561
562
563
564
565
566
    pthread_mutex_lock(&lock);
    waitCount++;
    pthread_cond_signal(&endCondition);
    pthread_cond_wait(&startCondition, &lock);
    pthread_mutex_unlock(&lock);
}

567
void CpuCalcPmeReciprocalForceKernel::advanceThreads() {
568
569
570
571
572
573
574
    waitCount = 0;
    pthread_cond_broadcast(&startCondition);
    while (waitCount < numThreads) {
        pthread_cond_wait(&endCondition, &lock);
    }
}

peastman's avatar
peastman committed
575
void CpuCalcPmeReciprocalForceKernel::beginComputation(IO& io, const Vec3* periodicBoxVectors, bool includeEnergy) {
576
    this->io = &io;
peastman's avatar
peastman committed
577
578
579
    this->periodicBoxVectors[0] = periodicBoxVectors[0];
    this->periodicBoxVectors[1] = periodicBoxVectors[1];
    this->periodicBoxVectors[2] = periodicBoxVectors[2];
580
581
    this->includeEnergy = includeEnergy;
    energy = 0.0;
peastman's avatar
peastman committed
582
583
584
585
586
587
588
589
590
591
592

    // Invert the box vectors.

    double determinant = periodicBoxVectors[0][0]*periodicBoxVectors[1][1]*periodicBoxVectors[2][2];
    double scale = 1.0/determinant;
    recipBoxVectors[0] = Vec3(periodicBoxVectors[1][1]*periodicBoxVectors[2][2], 0, 0)*scale;
    recipBoxVectors[1] = Vec3(-periodicBoxVectors[1][0]*periodicBoxVectors[2][2], periodicBoxVectors[0][0]*periodicBoxVectors[2][2], 0)*scale;
    recipBoxVectors[2] = Vec3(periodicBoxVectors[1][0]*periodicBoxVectors[2][1]-periodicBoxVectors[1][1]*periodicBoxVectors[2][0], -periodicBoxVectors[0][0]*periodicBoxVectors[2][1], periodicBoxVectors[0][0]*periodicBoxVectors[1][1])*scale;

    // Do the calculation.

593
    pthread_mutex_lock(&lock);
594
    isFinished = false;
595
596
597
598
    pthread_cond_signal(&mainThreadStartCondition);
    pthread_mutex_unlock(&lock);
}

599
double CpuCalcPmeReciprocalForceKernel::finishComputation(IO& io) {
600
601
602
603
604
605
    pthread_mutex_lock(&lock);
    while (!isFinished) {
        pthread_cond_wait(&mainThreadEndCondition, &lock);
    }
    pthread_mutex_unlock(&lock);
    io.setForce(&force[0]);
606
    return energy;
peastman's avatar
peastman committed
607
}
608
609

bool CpuCalcPmeReciprocalForceKernel::isProcessorSupported() {
610
    return isVec4Supported();
611
}
612

peastman's avatar
peastman committed
613
int CpuCalcPmeReciprocalForceKernel::findFFTDimension(int minimum, bool isZ) {
614
615
616
617
618
    if (minimum < 1)
        return 1;
    while (true) {
        // Attempt to factor the current value.

peastman's avatar
peastman committed
619
620
621
622
623
624
        if (isZ && minimum%2 == 1) {
            // Force the last dimension to be even, since this produces better performance in FFTW.

            minimum++;
            continue;
        }
625
        int unfactored = minimum;
626
        for (int factor = 2; factor < 8; factor++) {
627
628
629
            while (unfactored > 1 && unfactored%factor == 0)
                unfactored /= factor;
        }
630
        if (unfactored == 1 || unfactored == 11 || unfactored == 13)
631
632
633
634
            return minimum;
        minimum++;
    }
}