CpuPmeKernels.cpp 26.8 KB
Newer Older
peastman's avatar
peastman committed
1
2
3
4
5
6
7
8
/* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
 * -------------------------------------------------------------------------- *
 * This is part of the OpenMM molecular simulation toolkit originating from   *
 * Simbios, the NIH National Center for Physics-Based Simulation of           *
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
9
 * Portions copyright (c) 2013-2015 Stanford University and the Authors.      *
peastman's avatar
peastman committed
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
 * Permission is hereby granted, free of charge, to any person obtaining a    *
 * copy of this software and associated documentation files (the "Software"), *
 * to deal in the Software without restriction, including without limitation  *
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
 * and/or sell copies of the Software, and to permit persons to whom the      *
 * Software is furnished to do so, subject to the following conditions:       *
 *                                                                            *
 * The above copyright notice and this permission notice shall be included in *
 * all copies or substantial portions of the Software.                        *
 *                                                                            *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
 * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */

32
33
34
#ifdef WIN32
  #define _USE_MATH_DEFINES // Needed to get M_PI
#endif
35
#include "CpuPmeKernels.h"
36
#include "SimTKOpenMMRealType.h"
37
#include "openmm/internal/hardware.h"
38
#include "openmm/internal/vectorize.h"
39
#include <cmath>
peastman's avatar
peastman committed
40
#include <cstring>
41
#include <sstream>
peastman's avatar
peastman committed
42
43
44
45
46
47

using namespace OpenMM;
using namespace std;

static const int PME_ORDER = 5;

48
49
bool CpuCalcPmeReciprocalForceKernel::hasInitializedThreads = false;
int CpuCalcPmeReciprocalForceKernel::numThreads = 0;
50

peastman's avatar
peastman committed
51
static void spreadCharge(int start, int end, float* posq, float* grid, int gridx, int gridy, int gridz, int numParticles, Vec3* periodicBoxVectors, Vec3* recipBoxVectors) {
52
    float temp[4];
peastman's avatar
peastman committed
53
54
55
56
57
    fvec4 boxSize((float) periodicBoxVectors[0][0], (float) periodicBoxVectors[1][1], (float) periodicBoxVectors[2][2], 0);
    fvec4 invBoxSize((float) recipBoxVectors[0][0], (float) recipBoxVectors[1][1], (float) recipBoxVectors[2][2], 0);
    fvec4 recipBoxVec0((float) recipBoxVectors[0][0], (float) recipBoxVectors[0][1], (float) recipBoxVectors[0][2], 0);
    fvec4 recipBoxVec1((float) recipBoxVectors[1][0], (float) recipBoxVectors[1][1], (float) recipBoxVectors[1][2], 0);
    fvec4 recipBoxVec2((float) recipBoxVectors[2][0], (float) recipBoxVectors[2][1], (float) recipBoxVectors[2][2], 0);
58
59
60
61
    fvec4 gridSize(gridx, gridy, gridz, 0);
    ivec4 gridSizeInt(gridx, gridy, gridz, 0);
    fvec4 one(1);
    fvec4 scale(1.0f/(PME_ORDER-1));
peastman's avatar
peastman committed
62
    const float epsilonFactor = sqrt(ONE_4PI_EPS0);
63
    memset(grid, 0, sizeof(float)*gridx*gridy*gridz);
64
    for (int i = start; i < end; i++) {
peastman's avatar
peastman committed
65
66
        // Find the position relative to the nearest grid point.
        
67
        fvec4 pos(&posq[4*i]);
peastman's avatar
peastman committed
68
69
70
71
        float posInBox[4];
        (pos-boxSize*floor(pos*invBoxSize)).store(posInBox);
        fvec4 t = posInBox[0]*recipBoxVec0 + posInBox[1]*recipBoxVec1 + posInBox[2]*recipBoxVec2;
        t = (t-floor(t))*gridSize;
72
73
74
        ivec4 ti = t;
        fvec4 dr = t-ti;
        ivec4 gridIndex = ti-(gridSizeInt&ti==gridSizeInt);
peastman's avatar
peastman committed
75
76
77
        
        // Compute the B-spline coefficients.
        
78
79
        fvec4 data[PME_ORDER];
        data[PME_ORDER-1] = 0.0f;
peastman's avatar
peastman committed
80
        data[1] = dr;
81
        data[0] = one-dr;
peastman's avatar
peastman committed
82
        for (int j = 3; j < PME_ORDER; j++) {
83
84
            fvec4 div(1.0f/(j-1));
            data[j-1] = div*dr*data[j-2];
peastman's avatar
peastman committed
85
            for (int k = 1; k < j-1; k++)
86
87
                data[j-k-1] = div*((dr+k)*data[j-k-2]+(fvec4(j-k)-dr)*data[j-k-1]);
            data[0] = div*(one-dr)*data[0];
peastman's avatar
peastman committed
88
        }
89
        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
peastman's avatar
peastman committed
90
        for (int j = 1; j < (PME_ORDER-1); j++)
91
92
            data[PME_ORDER-j-1] = scale*((dr+j)*data[PME_ORDER-j-2]+(fvec4(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
        data[0] = scale*(one-dr)*data[0];
peastman's avatar
peastman committed
93
94
95
        
        // Spread the charges.
        
96
97
98
        int gridIndexX = gridIndex[0];
        int gridIndexY = gridIndex[1];
        int gridIndexZ = gridIndex[2];
99
100
        if (gridIndexX < 0)
            return; // This happens when a simulation blows up and coordinates become NaN.
peastman's avatar
peastman committed
101
102
103
104
105
        int zindex[PME_ORDER];
        for (int j = 0; j < PME_ORDER; j++) {
            zindex[j] = gridIndexZ+j;
            zindex[j] -= (zindex[j] >= gridz ? gridz : 0);
        }
peastman's avatar
peastman committed
106
        float charge = epsilonFactor*posq[4*i+3];
107
108
        fvec4 zdata0to3(data[0][2], data[1][2], data[2][2], data[3][2]);
        float zdata4 = data[4][2];
109
110
111
112
113
        if (gridIndexZ+4 < gridz) {
            for (int ix = 0; ix < PME_ORDER; ix++) {
                int xbase = gridIndexX+ix;
                xbase -= (xbase >= gridx ? gridx : 0);
                xbase = xbase*gridy*gridz;
114
                float xdata = charge*data[ix][0];
115
116
117
118
                for (int iy = 0; iy < PME_ORDER; iy++) {
                    int ybase = gridIndexY+iy;
                    ybase -= (ybase >= gridy ? gridy : 0);
                    ybase = xbase + ybase*gridz;
119
120
121
                    float multiplier = xdata*data[iy][1];
                    fvec4 add0to3 = zdata0to3*multiplier;
                    (fvec4(&grid[ybase+gridIndexZ])+add0to3).store(&grid[ybase+gridIndexZ]);
122
123
124
125
126
127
128
129
130
                    grid[ybase+zindex[4]] += multiplier*zdata4;
                }
            }
        }
        else {
            for (int ix = 0; ix < PME_ORDER; ix++) {
                int xbase = gridIndexX+ix;
                xbase -= (xbase >= gridx ? gridx : 0);
                xbase = xbase*gridy*gridz;
131
                float xdata = charge*data[ix][0];
132
133
134
135
                for (int iy = 0; iy < PME_ORDER; iy++) {
                    int ybase = gridIndexY+iy;
                    ybase -= (ybase >= gridy ? gridy : 0);
                    ybase = xbase + ybase*gridz;
136
137
138
                    float multiplier = xdata*data[iy][1];
                    fvec4 add0to3 = zdata0to3*multiplier;
                    add0to3.store(temp);
peastman's avatar
peastman committed
139
140
141
142
                    grid[ybase+zindex[0]] += temp[0];
                    grid[ybase+zindex[1]] += temp[1];
                    grid[ybase+zindex[2]] += temp[2];
                    grid[ybase+zindex[3]] += temp[3];
143
                    grid[ybase+zindex[4]] += multiplier*zdata4;
peastman's avatar
peastman committed
144
145
146
147
148
149
                }
            }
        }
    }
}

peastman's avatar
peastman committed
150
static void computeReciprocalEterm(int start, int end, int gridx, int gridy, int gridz, vector<float>& recipEterm, double alpha, vector<float>* bsplineModuli, Vec3* periodicBoxVectors, Vec3* recipBoxVectors) {
151
152
    const unsigned int zsize = gridz/2+1;
    const unsigned int yzsize = gridy*zsize;
peastman's avatar
peastman committed
153
    const float scaleFactor = (float) (M_PI*periodicBoxVectors[0][0]*periodicBoxVectors[1][1]*periodicBoxVectors[2][2]);
154
155
    const float recipExpFactor = (float) (M_PI*M_PI/(alpha*alpha));

156
157
    int firstz = (start == 0 ? 1 : 0);
    for (int kx = start; kx < end; kx++) {
158
        int mx = (kx < (gridx+1)/2) ? kx : kx-gridx;
peastman's avatar
peastman committed
159
        float mhx = mx*(float)recipBoxVectors[0][0];
160
161
162
        float bx = scaleFactor*bsplineModuli[0][kx];
        for (int ky = 0; ky < gridy; ky++) {
            int my = (ky < (gridy+1)/2) ? ky : ky-gridy;
peastman's avatar
peastman committed
163
            float mhy = mx*(float)recipBoxVectors[1][0] + my*(float)recipBoxVectors[1][1];
164
165
            float mhx2y2 = mhx*mhx + mhy*mhy;
            float bxby = bx*bsplineModuli[1][ky];
166
167
            for (int kz = firstz; kz < zsize; kz++) {
                int index = kx*yzsize + ky*zsize + kz;
168
                int mz = (kz < (gridz+1)/2) ? kz : kz-gridz;
peastman's avatar
peastman committed
169
                float mhz = mx*(float)recipBoxVectors[2][0] + my*(float)recipBoxVectors[2][1] + mz*(float)recipBoxVectors[2][2];
170
                float bz = bsplineModuli[2][kz];
171
172
                float m2 = mhx2y2 + mhz*mhz;
                float denom = m2*bxby*bz;
173
174
175
176
177
178
179
                recipEterm[index] = exp(-recipExpFactor*m2)/denom;
            }
            firstz = 0;
        }
    }
}

180
static double reciprocalEnergy(int start, int end, fftwf_complex* grid, int gridx, int gridy, int gridz, double alpha, vector<float>* bsplineModuli, Vec3* periodicBoxVectors, Vec3* recipBoxVectors) {
181
182
    const unsigned int zsizeHalf = gridz/2+1;
    const unsigned int yzsizeHalf = gridy*zsizeHalf;
peastman's avatar
peastman committed
183
    const float scaleFactor = (float) (M_PI*periodicBoxVectors[0][0]*periodicBoxVectors[1][1]*periodicBoxVectors[2][2]);
184
    const float recipExpFactor = (float) (M_PI*M_PI/(alpha*alpha));
185
    double energy = 0.0;
186
187
188

    int firstz = (start == 0 ? 1 : 0);
    for (int kx = start; kx < end; kx++) {
189
        int mx = (kx < (gridx+1)/2) ? kx : kx-gridx;
peastman's avatar
peastman committed
190
        float mhx = mx*(float)recipBoxVectors[0][0];
191
        float bx = scaleFactor*bsplineModuli[0][kx];
192
193
        for (int ky = 0; ky < gridy; ky++) {
            int my = (ky < (gridy+1)/2) ? ky : ky-gridy;
peastman's avatar
peastman committed
194
            float mhy = mx*(float)recipBoxVectors[1][0] + my*(float)recipBoxVectors[1][1];
195
196
            float mhx2y2 = mhx*mhx + mhy*mhy;
            float bxby = bx*bsplineModuli[1][ky];
197
            for (int kz = firstz; kz < gridz; kz++) {
198
                int mz = (kz < (gridz+1)/2) ? kz : kz-gridz;
peastman's avatar
peastman committed
199
                float mhz = mx*(float)recipBoxVectors[2][0] + my*(float)recipBoxVectors[2][1] + mz*(float)recipBoxVectors[2][2];
200
201
202
203
                float bz = bsplineModuli[2][kz];
                float m2 = mhx2y2 + mhz*mhz;
                float denom = m2*bxby*bz;
                float eterm = exp(-recipExpFactor*m2)/denom;
204
205
206
207
208
209
210
211
212
213
214
                int kx1, ky1, kz1;
                if (kz >= gridz/2+1) {
                    kx1 = (kx == 0 ? kx : gridx-kx);
                    ky1 = (ky == 0 ? ky : gridy-ky);
                    kz1 = gridz-kz;
                }
                else {
                    kx1 = kx;
                    ky1 = ky;
                    kz1 = kz;
                }
215
                int index = kx1*yzsizeHalf + ky1*zsizeHalf + kz1;
216
217
218
219
220
221
222
                float gridReal = grid[index][0];
                float gridImag = grid[index][1];
                energy += eterm*(gridReal*gridReal+gridImag*gridImag);
            }
            firstz = 0;
        }
    }
223
    return 0.5*energy;
224
225
}

226
static void reciprocalConvolution(int start, int end, fftwf_complex* grid, int gridx, int gridy, int gridz, vector<float>& recipEterm) {
227
228
229
    const unsigned int zsize = gridz/2+1;
    const unsigned int yzsize = gridy*zsize;

230
231
    int firstz = (start == 0 ? 1 : 0);
    for (int kx = start; kx < end; kx++) {
232
233
234
        for (int ky = 0; ky < gridy; ky++) {
            for (int kz = firstz; kz < zsize; kz++) {
                int index = kx*yzsize + ky*zsize + kz;
235
                float eterm = recipEterm[index];
236
237
                grid[index][0] *= eterm;
                grid[index][1] *= eterm;
238
239
240
241
242
243
            }
            firstz = 0;
        }
    }
}

peastman's avatar
peastman committed
244
245
246
247
248
249
static void interpolateForces(int start, int end, float* posq, float* force, float* grid, int gridx, int gridy, int gridz, int numParticles, Vec3* periodicBoxVectors, Vec3* recipBoxVectors) {
    fvec4 boxSize((float) periodicBoxVectors[0][0], (float) periodicBoxVectors[1][1], (float) periodicBoxVectors[2][2], 0);
    fvec4 invBoxSize((float) recipBoxVectors[0][0], (float) recipBoxVectors[1][1], (float) recipBoxVectors[2][2], 0);
    fvec4 recipBoxVec0((float) recipBoxVectors[0][0], (float) recipBoxVectors[0][1], (float) recipBoxVectors[0][2], 0);
    fvec4 recipBoxVec1((float) recipBoxVectors[1][0], (float) recipBoxVectors[1][1], (float) recipBoxVectors[1][2], 0);
    fvec4 recipBoxVec2((float) recipBoxVectors[2][0], (float) recipBoxVectors[2][1], (float) recipBoxVectors[2][2], 0);
250
251
252
253
    fvec4 gridSize(gridx, gridy, gridz, 0);
    ivec4 gridSizeInt(gridx, gridy, gridz, 0);
    fvec4 one(1);
    fvec4 scale(1.0f/(PME_ORDER-1));
254
    const float epsilonFactor = sqrt(ONE_4PI_EPS0);
255
    for (int i = start; i < end; i++) {
256
257
        // Find the position relative to the nearest grid point.
        
258
        fvec4 pos(&posq[4*i]);
peastman's avatar
peastman committed
259
260
261
262
        float posInBox[4];
        (pos-boxSize*floor(pos*invBoxSize)).store(posInBox);
        fvec4 t = posInBox[0]*recipBoxVec0 + posInBox[1]*recipBoxVec1 + posInBox[2]*recipBoxVec2;
        t = (t-floor(t))*gridSize;
263
264
265
        ivec4 ti = t;
        fvec4 dr = t-ti;
        ivec4 gridIndex = ti-(gridSizeInt&ti==gridSizeInt);
266
267
268
        
        // Compute the B-spline coefficients.
        
269
270
271
        fvec4 data[PME_ORDER];
        fvec4 ddata[PME_ORDER];
        data[PME_ORDER-1] = 0.0f;
272
        data[1] = dr;
273
        data[0] = one-dr;
274
        for (int j = 3; j < PME_ORDER; j++) {
275
276
            fvec4 div(1.0f/(j-1));
            data[j-1] = div*dr*data[j-2];
277
            for (int k = 1; k < j-1; k++)
278
279
                data[j-k-1] = div*((dr+k)*data[j-k-2]+(fvec4(j-k)-dr)*data[j-k-1]);
            data[0] = div*(one-dr)*data[0];
280
        }
281
        ddata[0] = -data[0];
282
        for (int j = 1; j < PME_ORDER; j++)
283
284
            ddata[j] = data[j-1]-data[j];
        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
285
        for (int j = 1; j < (PME_ORDER-1); j++)
286
287
            data[PME_ORDER-j-1] = scale*((dr+j)*data[PME_ORDER-j-2]+(fvec4(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
        data[0] = scale*(one-dr)*data[0];
288
                
289
290
        // Compute the force on this atom.
        
291
292
293
        int gridIndexX = gridIndex[0];
        int gridIndexY = gridIndex[1];
        int gridIndexZ = gridIndex[2];
294
295
        if (gridIndexX < 0)
            return; // This happens when a simulation blows up and coordinates become NaN.
peastman's avatar
peastman committed
296
297
298
299
300
        int zindex[PME_ORDER];
        for (int j = 0; j < PME_ORDER; j++) {
            zindex[j] = gridIndexZ+j;
            zindex[j] -= (zindex[j] >= gridz ? gridz : 0);
        }
301
        fvec4 zdata[PME_ORDER];
302
        for (int j = 0; j < PME_ORDER; j++)
303
304
            zdata[j] = fvec4(data[j][2], data[j][2], ddata[j][2], 0);
        fvec4 f = 0.0f;
305
306
307
308
        for (int ix = 0; ix < PME_ORDER; ix++) {
            int xbase = gridIndexX+ix;
            xbase -= (xbase >= gridx ? gridx : 0);
            xbase = xbase*gridy*gridz;
309
310
311
            float dx = data[ix][0];
            float ddx = ddata[ix][0];
            fvec4 xdata(ddx, dx, dx, 0);
312
313
314
315
316

            for (int iy = 0; iy < PME_ORDER; iy++) {
                int ybase = gridIndexY+iy;
                ybase -= (ybase >= gridy ? gridy : 0);
                ybase = xbase + ybase*gridz;
317
318
319
                float dy = data[iy][1];
                float ddy = ddata[iy][1];
                fvec4 xydata = xdata*fvec4(dy, ddy, dy, 0);
320
321

                for (int iz = 0; iz < PME_ORDER; iz++) {
322
323
                    fvec4 gridValue(grid[ybase+zindex[iz]]);
                    f = f+xydata*zdata[iz]*gridValue;
324
325
326
                }
            }
        }
peastman's avatar
peastman committed
327
328
329
330
331
332
        f *= -epsilonFactor*posq[4*i+3];
        float fc[4];
        f.store(fc);
        force[4*i+0] = fc[0]*gridx*(float)recipBoxVectors[0][0];
        force[4*i+1] = fc[0]*gridx*(float)recipBoxVectors[1][0]+fc[1]*gridy*(float)recipBoxVectors[1][1];
        force[4*i+2] = fc[0]*gridx*(float)recipBoxVectors[2][0]+fc[1]*gridy*(float)recipBoxVectors[2][1]+fc[2]*gridz*(float)recipBoxVectors[2][2];
333
334
335
    }
}

336
class CpuCalcPmeReciprocalForceKernel::ThreadData {
337
public:
338
    CpuCalcPmeReciprocalForceKernel& owner;
339
    int index;
340
    float* tempGrid;
341
    ThreadData(CpuCalcPmeReciprocalForceKernel& owner, int index) : owner(owner), index(index), tempGrid(NULL) {
342
343
344
345
    }
};

static void* threadBody(void* args) {
346
    CpuCalcPmeReciprocalForceKernel::ThreadData& data = *reinterpret_cast<CpuCalcPmeReciprocalForceKernel::ThreadData*>(args);
347
    data.owner.runThread(data.index);
348
349
    if (data.tempGrid != NULL)
        fftwf_free(data.tempGrid);
350
351
352
353
    delete &data;
    return 0;
}

354
void CpuCalcPmeReciprocalForceKernel::initialize(int xsize, int ysize, int zsize, int numParticles, double alpha) {
355
356
    if (!hasInitializedThreads) {
        numThreads = getNumProcessors();
357
358
359
        char* threadsEnv = getenv("OPENMM_CPU_THREADS");
        if (threadsEnv != NULL)
            stringstream(threadsEnv) >> numThreads;
360
361
362
        fftwf_init_threads();
        hasInitializedThreads = true;
    }
peastman's avatar
peastman committed
363
364
365
    gridx = findFFTDimension(xsize, false);
    gridy = findFFTDimension(ysize, false);
    gridz = findFFTDimension(zsize, true);
366
367
368
    this->numParticles = numParticles;
    this->alpha = alpha;
    force.resize(4*numParticles);
369
    recipEterm.resize(gridx*gridy*gridz);
370
    
371
372
373
374
    // Initialize threads.
    
    pthread_cond_init(&startCondition, NULL);
    pthread_cond_init(&endCondition, NULL);
375
376
    pthread_cond_init(&mainThreadStartCondition, NULL);
    pthread_cond_init(&mainThreadEndCondition, NULL);
377
378
379
380
381
382
    pthread_mutex_init(&lock, NULL);
    thread.resize(numThreads);
    for (int i = 0; i < numThreads; i++) {
        ThreadData* data = new ThreadData(*this, i);
        threadData.push_back(data);
        pthread_create(&thread[i], NULL, threadBody, data);
383
        data->tempGrid = (float*) fftwf_malloc(sizeof(float)*(gridx*gridy*gridz+3));
384
    }
385
386
    pthread_create(&mainThread, NULL, threadBody, new ThreadData(*this, -1));
    
387
388
    // Initialize FFTW.
    
389
    realGrid = threadData[0]->tempGrid;
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
    complexGrid = (fftwf_complex*) fftwf_malloc(sizeof(fftwf_complex)*gridx*gridy*(gridz/2+1));
    fftwf_plan_with_nthreads(numThreads);
    forwardFFT = fftwf_plan_dft_r2c_3d(gridx, gridy, gridz, realGrid, complexGrid, FFTW_MEASURE);
    backwardFFT = fftwf_plan_dft_c2r_3d(gridx, gridy, gridz, complexGrid, realGrid, FFTW_MEASURE);
    hasCreatedPlan = true;
    
    // Initialize the b-spline moduli.

    int maxSize = max(max(gridx, gridy), gridz);
    vector<double> data(PME_ORDER);
    vector<double> ddata(PME_ORDER);
    vector<double> bsplinesData(maxSize);
    data[PME_ORDER-1] = 0.0;
    data[1] = 0.0;
    data[0] = 1.0;
    for (int i = 3; i < PME_ORDER; i++) {
        double div = 1.0/(i-1.0);
        data[i-1] = 0.0;
        for (int j = 1; j < (i-1); j++)
            data[i-j-1] = div*(j*data[i-j-2]+(i-j)*data[i-j-1]);
        data[0] = div*data[0];
    }

    // Differentiate.

    ddata[0] = -data[0];
    for (int i = 1; i < PME_ORDER; i++)
        ddata[i] = data[i-1]-data[i];
    double div = 1.0/(PME_ORDER-1);
    data[PME_ORDER-1] = 0.0;
    for (int i = 1; i < (PME_ORDER-1); i++)
        data[PME_ORDER-i-1] = div*(i*data[PME_ORDER-i-2]+(PME_ORDER-i)*data[PME_ORDER-i-1]);
    data[0] = div*data[0];
    for (int i = 0; i < maxSize; i++)
        bsplinesData[i] = 0.0;
    for (int i = 1; i <= PME_ORDER; i++)
        bsplinesData[i] = data[i-1];

    // Evaluate the actual bspline moduli for X/Y/Z.

    bsplineModuli[0].resize(gridx);
    bsplineModuli[1].resize(gridy);
    bsplineModuli[2].resize(gridz);
    for (int dim = 0; dim < 3; dim++) {
        int ndata = bsplineModuli[dim].size();
        vector<float>& moduli = bsplineModuli[dim];
        for (int i = 0; i < ndata; i++) {
            double sc = 0.0;
            double ss = 0.0;
            for (int j = 0; j < ndata; j++) {
                double arg = (2.0*M_PI*i*j)/ndata;
                sc += bsplinesData[j]*cos(arg);
                ss += bsplinesData[j]*sin(arg);
            }
            moduli[i] = (float) (sc*sc+ss*ss);
        }
        for (int i = 0; i < ndata; i++)
            if (moduli[i] < 1.0e-7f)
                moduli[i] = (moduli[i-1]+moduli[i+1])*0.5f;
    }
}

452
CpuCalcPmeReciprocalForceKernel::~CpuCalcPmeReciprocalForceKernel() {
453
454
    isDeleted = true;
    pthread_mutex_lock(&lock);
455
    pthread_cond_broadcast(&startCondition);
456
    pthread_cond_broadcast(&mainThreadStartCondition);
457
    pthread_mutex_unlock(&lock);
458
    for (int i = 0; i < (int) thread.size(); i++)
459
        pthread_join(thread[i], NULL);
460
    pthread_join(mainThread, NULL);
461
462
463
    pthread_mutex_destroy(&lock);
    pthread_cond_destroy(&startCondition);
    pthread_cond_destroy(&endCondition);
464
465
    pthread_cond_destroy(&mainThreadStartCondition);
    pthread_cond_destroy(&mainThreadEndCondition);
466
467
468
469
470
471
    if (complexGrid != NULL)
        fftwf_free(complexGrid);
    if (hasCreatedPlan) {
        fftwf_destroy_plan(forwardFFT);
        fftwf_destroy_plan(backwardFFT);
    }
472
473
}

474
void CpuCalcPmeReciprocalForceKernel::runThread(int index) {
475
476
477
    if (index == -1) {
        // This is the main thread that coordinates all the other ones.
        
478
        pthread_mutex_lock(&lock);
479
480
481
482
483
        while (true) {
            // Wait for the signal to start.
            
            pthread_cond_wait(&mainThreadStartCondition, &lock);
            if (isDeleted)
484
485
                break;
            posq = io->getPosq();
486
487
488
            advanceThreads(); // Signal threads to perform charge spreading.
            advanceThreads(); // Signal threads to sum the charge grids.
            fftwf_execute_dft_r2c(forwardFFT, realGrid, complexGrid);
peastman's avatar
peastman committed
489
            if (lastBoxVectors[0] != periodicBoxVectors[0] || lastBoxVectors[1] != periodicBoxVectors[1] || lastBoxVectors[2] != periodicBoxVectors[2])
490
                advanceThreads(); // Signal threads to compute the reciprocal scale factors.
491
492
493
494
495
496
            if (includeEnergy)
                advanceThreads(); // Signal threads to compute energy.
            advanceThreads(); // Signal threads to perform reciprocal convolution.
            fftwf_execute_dft_c2r(backwardFFT, complexGrid, realGrid);
            advanceThreads(); // Signal threads to interpolate forces.
            isFinished = true;
peastman's avatar
peastman committed
497
498
499
            lastBoxVectors[0] = periodicBoxVectors[0];
            lastBoxVectors[1] = periodicBoxVectors[1];
            lastBoxVectors[2] = periodicBoxVectors[2];
500
501
            pthread_cond_signal(&mainThreadEndCondition);
        }
502
        pthread_mutex_unlock(&lock);
503
504
505
506
507
508
509
510
511
512
513
514
    }
    else {
        // This is a worker thread.
        
        int particleStart = (index*numParticles)/numThreads;
        int particleEnd = ((index+1)*numParticles)/numThreads;
        int gridxStart = (index*gridx)/numThreads;
        int gridxEnd = ((index+1)*gridx)/numThreads;
        int gridSize = (gridx*gridy*gridz+3)/4;
        int gridStart = 4*((index*gridSize)/numThreads);
        int gridEnd = 4*(((index+1)*gridSize)/numThreads);
        while (true) {
515
            threadWait();
516
            if (isDeleted)
517
                break;
peastman's avatar
peastman committed
518
            spreadCharge(particleStart, particleEnd, posq, threadData[index]->tempGrid, gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors);
519
520
521
            threadWait();
            int numGrids = threadData.size();
            for (int i = gridStart; i < gridEnd; i += 4) {
522
                fvec4 sum(&realGrid[i]);
523
                for (int j = 1; j < numGrids; j++)
524
525
                    sum += fvec4(&threadData[j]->tempGrid[i]);
                sum.store(&realGrid[i]);
526
527
            }
            threadWait();
peastman's avatar
peastman committed
528
529
            if (lastBoxVectors[0] != periodicBoxVectors[0] || lastBoxVectors[1] != periodicBoxVectors[1] || lastBoxVectors[2] != periodicBoxVectors[2]) {
                computeReciprocalEterm(gridxStart, gridxEnd, gridx, gridy, gridz, recipEterm, alpha, bsplineModuli, periodicBoxVectors, recipBoxVectors);
530
531
                threadWait();
            }
532
            if (includeEnergy) {
peastman's avatar
peastman committed
533
                double threadEnergy = reciprocalEnergy(gridxStart, gridxEnd, complexGrid, gridx, gridy, gridz, alpha, bsplineModuli, periodicBoxVectors, recipBoxVectors);
534
535
536
537
538
                pthread_mutex_lock(&lock);
                energy += threadEnergy;
                pthread_mutex_unlock(&lock);
                threadWait();
            }
539
            reciprocalConvolution(gridxStart, gridxEnd, complexGrid, gridx, gridy, gridz, recipEterm);
540
            threadWait();
peastman's avatar
peastman committed
541
            interpolateForces(particleStart, particleEnd, posq, &force[0], realGrid, gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors);
542
543
544
545
        }
    }
}

546
void CpuCalcPmeReciprocalForceKernel::threadWait() {
547
548
549
550
551
552
553
    pthread_mutex_lock(&lock);
    waitCount++;
    pthread_cond_signal(&endCondition);
    pthread_cond_wait(&startCondition, &lock);
    pthread_mutex_unlock(&lock);
}

554
void CpuCalcPmeReciprocalForceKernel::advanceThreads() {
555
556
557
558
559
560
561
    waitCount = 0;
    pthread_cond_broadcast(&startCondition);
    while (waitCount < numThreads) {
        pthread_cond_wait(&endCondition, &lock);
    }
}

peastman's avatar
peastman committed
562
void CpuCalcPmeReciprocalForceKernel::beginComputation(IO& io, const Vec3* periodicBoxVectors, bool includeEnergy) {
563
    this->io = &io;
peastman's avatar
peastman committed
564
565
566
    this->periodicBoxVectors[0] = periodicBoxVectors[0];
    this->periodicBoxVectors[1] = periodicBoxVectors[1];
    this->periodicBoxVectors[2] = periodicBoxVectors[2];
567
568
    this->includeEnergy = includeEnergy;
    energy = 0.0;
peastman's avatar
peastman committed
569
570
571
572
573
574
575
576
577
578
579

    // Invert the box vectors.

    double determinant = periodicBoxVectors[0][0]*periodicBoxVectors[1][1]*periodicBoxVectors[2][2];
    double scale = 1.0/determinant;
    recipBoxVectors[0] = Vec3(periodicBoxVectors[1][1]*periodicBoxVectors[2][2], 0, 0)*scale;
    recipBoxVectors[1] = Vec3(-periodicBoxVectors[1][0]*periodicBoxVectors[2][2], periodicBoxVectors[0][0]*periodicBoxVectors[2][2], 0)*scale;
    recipBoxVectors[2] = Vec3(periodicBoxVectors[1][0]*periodicBoxVectors[2][1]-periodicBoxVectors[1][1]*periodicBoxVectors[2][0], -periodicBoxVectors[0][0]*periodicBoxVectors[2][1], periodicBoxVectors[0][0]*periodicBoxVectors[1][1])*scale;

    // Do the calculation.

580
    pthread_mutex_lock(&lock);
581
    isFinished = false;
582
583
584
585
    pthread_cond_signal(&mainThreadStartCondition);
    pthread_mutex_unlock(&lock);
}

586
double CpuCalcPmeReciprocalForceKernel::finishComputation(IO& io) {
587
588
589
590
591
592
    pthread_mutex_lock(&lock);
    while (!isFinished) {
        pthread_cond_wait(&mainThreadEndCondition, &lock);
    }
    pthread_mutex_unlock(&lock);
    io.setForce(&force[0]);
593
    return energy;
peastman's avatar
peastman committed
594
}
595
596

bool CpuCalcPmeReciprocalForceKernel::isProcessorSupported() {
597
    return isVec4Supported();
598
}
599

peastman's avatar
peastman committed
600
int CpuCalcPmeReciprocalForceKernel::findFFTDimension(int minimum, bool isZ) {
601
602
603
604
605
    if (minimum < 1)
        return 1;
    while (true) {
        // Attempt to factor the current value.

peastman's avatar
peastman committed
606
607
608
609
610
611
        if (isZ && minimum%2 == 1) {
            // Force the last dimension to be even, since this produces better performance in FFTW.

            minimum++;
            continue;
        }
612
        int unfactored = minimum;
613
        for (int factor = 2; factor < 8; factor++) {
614
615
616
            while (unfactored > 1 && unfactored%factor == 0)
                unfactored /= factor;
        }
617
        if (unfactored == 1 || unfactored == 11 || unfactored == 13)
618
619
620
621
            return minimum;
        minimum++;
    }
}