"openmmapi/vscode:/vscode.git/clone" did not exist on "9d779a3975a76da0b3a28c5bbe02eaaa9c49e98f"
CpuPmeKernels.cpp 26.2 KB
Newer Older
peastman's avatar
peastman committed
1
2
3
4
5
6
7
8
/* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
 * -------------------------------------------------------------------------- *
 * This is part of the OpenMM molecular simulation toolkit originating from   *
 * Simbios, the NIH National Center for Physics-Based Simulation of           *
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
9
 * Portions copyright (c) 2013-2015 Stanford University and the Authors.      *
peastman's avatar
peastman committed
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
 * Permission is hereby granted, free of charge, to any person obtaining a    *
 * copy of this software and associated documentation files (the "Software"), *
 * to deal in the Software without restriction, including without limitation  *
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
 * and/or sell copies of the Software, and to permit persons to whom the      *
 * Software is furnished to do so, subject to the following conditions:       *
 *                                                                            *
 * The above copyright notice and this permission notice shall be included in *
 * all copies or substantial portions of the Software.                        *
 *                                                                            *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
 * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */

32
33
34
#ifdef WIN32
  #define _USE_MATH_DEFINES // Needed to get M_PI
#endif
35
#include "CpuPmeKernels.h"
36
#include "SimTKOpenMMRealType.h"
37
#include "openmm/internal/hardware.h"
38
#include "openmm/internal/vectorize.h"
39
#include <cmath>
40
#include <algorithm>
peastman's avatar
peastman committed
41
#include <cstring>
42
#include <sstream>
Robert McGibbon's avatar
Robert McGibbon committed
43
#include <cstdlib>
peastman's avatar
peastman committed
44
45
46
47
48
49

using namespace OpenMM;
using namespace std;

static const int PME_ORDER = 5;

50
51
bool CpuCalcPmeReciprocalForceKernel::hasInitializedThreads = false;
int CpuCalcPmeReciprocalForceKernel::numThreads = 0;
52

53
static void spreadCharge(float* posq, float* grid, int gridx, int gridy, int gridz, int numParticles, Vec3* periodicBoxVectors, Vec3* recipBoxVectors, gmx_atomic_t& atomicCounter) {
54
    float temp[4];
peastman's avatar
peastman committed
55
56
57
58
59
    fvec4 boxSize((float) periodicBoxVectors[0][0], (float) periodicBoxVectors[1][1], (float) periodicBoxVectors[2][2], 0);
    fvec4 invBoxSize((float) recipBoxVectors[0][0], (float) recipBoxVectors[1][1], (float) recipBoxVectors[2][2], 0);
    fvec4 recipBoxVec0((float) recipBoxVectors[0][0], (float) recipBoxVectors[0][1], (float) recipBoxVectors[0][2], 0);
    fvec4 recipBoxVec1((float) recipBoxVectors[1][0], (float) recipBoxVectors[1][1], (float) recipBoxVectors[1][2], 0);
    fvec4 recipBoxVec2((float) recipBoxVectors[2][0], (float) recipBoxVectors[2][1], (float) recipBoxVectors[2][2], 0);
60
61
62
63
    fvec4 gridSize(gridx, gridy, gridz, 0);
    ivec4 gridSizeInt(gridx, gridy, gridz, 0);
    fvec4 one(1);
    fvec4 scale(1.0f/(PME_ORDER-1));
Robert McGibbon's avatar
Robert McGibbon committed
64
    float posInBox[4] = {0,0,0,0};
peastman's avatar
peastman committed
65
    const float epsilonFactor = sqrt(ONE_4PI_EPS0);
66
    memset(grid, 0, sizeof(float)*gridx*gridy*gridz);
Robert McGibbon's avatar
Robert McGibbon committed
67

68
69
70
71
72
    while (true) {
        int i = gmx_atomic_fetch_add(&atomicCounter, 1);
        if (i >= numParticles)
            break;

peastman's avatar
peastman committed
73
        // Find the position relative to the nearest grid point.
Robert McGibbon's avatar
Robert McGibbon committed
74

75
        fvec4 pos(&posq[4*i]);
peastman's avatar
peastman committed
76
77
78
        (pos-boxSize*floor(pos*invBoxSize)).store(posInBox);
        fvec4 t = posInBox[0]*recipBoxVec0 + posInBox[1]*recipBoxVec1 + posInBox[2]*recipBoxVec2;
        t = (t-floor(t))*gridSize;
79
80
81
        ivec4 ti = t;
        fvec4 dr = t-ti;
        ivec4 gridIndex = ti-(gridSizeInt&ti==gridSizeInt);
peastman's avatar
peastman committed
82
83
        
        // Compute the B-spline coefficients.
Robert McGibbon's avatar
Robert McGibbon committed
84

85
86
        fvec4 data[PME_ORDER];
        data[PME_ORDER-1] = 0.0f;
peastman's avatar
peastman committed
87
        data[1] = dr;
88
        data[0] = one-dr;
peastman's avatar
peastman committed
89
        for (int j = 3; j < PME_ORDER; j++) {
90
91
            fvec4 div(1.0f/(j-1));
            data[j-1] = div*dr*data[j-2];
peastman's avatar
peastman committed
92
            for (int k = 1; k < j-1; k++)
93
94
                data[j-k-1] = div*((dr+k)*data[j-k-2]+(fvec4(j-k)-dr)*data[j-k-1]);
            data[0] = div*(one-dr)*data[0];
peastman's avatar
peastman committed
95
        }
96
        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
peastman's avatar
peastman committed
97
        for (int j = 1; j < (PME_ORDER-1); j++)
98
99
            data[PME_ORDER-j-1] = scale*((dr+j)*data[PME_ORDER-j-2]+(fvec4(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
        data[0] = scale*(one-dr)*data[0];
peastman's avatar
peastman committed
100
101
102
        
        // Spread the charges.
        
103
104
105
        int gridIndexX = gridIndex[0];
        int gridIndexY = gridIndex[1];
        int gridIndexZ = gridIndex[2];
106
107
        if (gridIndexX < 0)
            return; // This happens when a simulation blows up and coordinates become NaN.
peastman's avatar
peastman committed
108
109
110
111
112
        int zindex[PME_ORDER];
        for (int j = 0; j < PME_ORDER; j++) {
            zindex[j] = gridIndexZ+j;
            zindex[j] -= (zindex[j] >= gridz ? gridz : 0);
        }
peastman's avatar
peastman committed
113
        float charge = epsilonFactor*posq[4*i+3];
114
115
        fvec4 zdata0to3(data[0][2], data[1][2], data[2][2], data[3][2]);
        float zdata4 = data[4][2];
116
117
118
119
120
        if (gridIndexZ+4 < gridz) {
            for (int ix = 0; ix < PME_ORDER; ix++) {
                int xbase = gridIndexX+ix;
                xbase -= (xbase >= gridx ? gridx : 0);
                xbase = xbase*gridy*gridz;
121
                float xdata = charge*data[ix][0];
122
123
124
125
                for (int iy = 0; iy < PME_ORDER; iy++) {
                    int ybase = gridIndexY+iy;
                    ybase -= (ybase >= gridy ? gridy : 0);
                    ybase = xbase + ybase*gridz;
126
127
128
                    float multiplier = xdata*data[iy][1];
                    fvec4 add0to3 = zdata0to3*multiplier;
                    (fvec4(&grid[ybase+gridIndexZ])+add0to3).store(&grid[ybase+gridIndexZ]);
129
130
131
132
133
134
135
136
137
                    grid[ybase+zindex[4]] += multiplier*zdata4;
                }
            }
        }
        else {
            for (int ix = 0; ix < PME_ORDER; ix++) {
                int xbase = gridIndexX+ix;
                xbase -= (xbase >= gridx ? gridx : 0);
                xbase = xbase*gridy*gridz;
138
                float xdata = charge*data[ix][0];
139
140
141
142
                for (int iy = 0; iy < PME_ORDER; iy++) {
                    int ybase = gridIndexY+iy;
                    ybase -= (ybase >= gridy ? gridy : 0);
                    ybase = xbase + ybase*gridz;
143
144
145
                    float multiplier = xdata*data[iy][1];
                    fvec4 add0to3 = zdata0to3*multiplier;
                    add0to3.store(temp);
peastman's avatar
peastman committed
146
147
148
149
                    grid[ybase+zindex[0]] += temp[0];
                    grid[ybase+zindex[1]] += temp[1];
                    grid[ybase+zindex[2]] += temp[2];
                    grid[ybase+zindex[3]] += temp[3];
150
                    grid[ybase+zindex[4]] += multiplier*zdata4;
peastman's avatar
peastman committed
151
152
153
154
155
156
                }
            }
        }
    }
}

peastman's avatar
peastman committed
157
static void computeReciprocalEterm(int start, int end, int gridx, int gridy, int gridz, vector<float>& recipEterm, double alpha, vector<float>* bsplineModuli, Vec3* periodicBoxVectors, Vec3* recipBoxVectors) {
158
159
    const unsigned int zsize = gridz/2+1;
    const unsigned int yzsize = gridy*zsize;
peastman's avatar
peastman committed
160
    const float scaleFactor = (float) (M_PI*periodicBoxVectors[0][0]*periodicBoxVectors[1][1]*periodicBoxVectors[2][2]);
161
162
    const float recipExpFactor = (float) (M_PI*M_PI/(alpha*alpha));

163
164
    int firstz = (start == 0 ? 1 : 0);
    for (int kx = start; kx < end; kx++) {
165
        int mx = (kx < (gridx+1)/2) ? kx : kx-gridx;
peastman's avatar
peastman committed
166
        float mhx = mx*(float)recipBoxVectors[0][0];
167
168
169
        float bx = scaleFactor*bsplineModuli[0][kx];
        for (int ky = 0; ky < gridy; ky++) {
            int my = (ky < (gridy+1)/2) ? ky : ky-gridy;
peastman's avatar
peastman committed
170
            float mhy = mx*(float)recipBoxVectors[1][0] + my*(float)recipBoxVectors[1][1];
171
172
            float mhx2y2 = mhx*mhx + mhy*mhy;
            float bxby = bx*bsplineModuli[1][ky];
173
174
            for (int kz = firstz; kz < zsize; kz++) {
                int index = kx*yzsize + ky*zsize + kz;
175
                int mz = (kz < (gridz+1)/2) ? kz : kz-gridz;
peastman's avatar
peastman committed
176
                float mhz = mx*(float)recipBoxVectors[2][0] + my*(float)recipBoxVectors[2][1] + mz*(float)recipBoxVectors[2][2];
177
                float bz = bsplineModuli[2][kz];
178
179
                float m2 = mhx2y2 + mhz*mhz;
                float denom = m2*bxby*bz;
180
181
182
183
184
185
186
                recipEterm[index] = exp(-recipExpFactor*m2)/denom;
            }
            firstz = 0;
        }
    }
}

187
static double reciprocalEnergy(int start, int end, fftwf_complex* grid, int gridx, int gridy, int gridz, double alpha, vector<float>* bsplineModuli, Vec3* periodicBoxVectors, Vec3* recipBoxVectors) {
188
189
    const unsigned int zsizeHalf = gridz/2+1;
    const unsigned int yzsizeHalf = gridy*zsizeHalf;
peastman's avatar
peastman committed
190
    const float scaleFactor = (float) (M_PI*periodicBoxVectors[0][0]*periodicBoxVectors[1][1]*periodicBoxVectors[2][2]);
191
    const float recipExpFactor = (float) (M_PI*M_PI/(alpha*alpha));
192
    double energy = 0.0;
193
194
195

    int firstz = (start == 0 ? 1 : 0);
    for (int kx = start; kx < end; kx++) {
196
        int mx = (kx < (gridx+1)/2) ? kx : kx-gridx;
peastman's avatar
peastman committed
197
        float mhx = mx*(float)recipBoxVectors[0][0];
198
        float bx = scaleFactor*bsplineModuli[0][kx];
199
200
        for (int ky = 0; ky < gridy; ky++) {
            int my = (ky < (gridy+1)/2) ? ky : ky-gridy;
peastman's avatar
peastman committed
201
            float mhy = mx*(float)recipBoxVectors[1][0] + my*(float)recipBoxVectors[1][1];
202
203
            float mhx2y2 = mhx*mhx + mhy*mhy;
            float bxby = bx*bsplineModuli[1][ky];
204
            for (int kz = firstz; kz < gridz; kz++) {
205
                int mz = (kz < (gridz+1)/2) ? kz : kz-gridz;
peastman's avatar
peastman committed
206
                float mhz = mx*(float)recipBoxVectors[2][0] + my*(float)recipBoxVectors[2][1] + mz*(float)recipBoxVectors[2][2];
207
208
209
210
                float bz = bsplineModuli[2][kz];
                float m2 = mhx2y2 + mhz*mhz;
                float denom = m2*bxby*bz;
                float eterm = exp(-recipExpFactor*m2)/denom;
211
212
213
214
215
216
217
218
219
220
221
                int kx1, ky1, kz1;
                if (kz >= gridz/2+1) {
                    kx1 = (kx == 0 ? kx : gridx-kx);
                    ky1 = (ky == 0 ? ky : gridy-ky);
                    kz1 = gridz-kz;
                }
                else {
                    kx1 = kx;
                    ky1 = ky;
                    kz1 = kz;
                }
222
                int index = kx1*yzsizeHalf + ky1*zsizeHalf + kz1;
223
224
225
226
227
228
229
                float gridReal = grid[index][0];
                float gridImag = grid[index][1];
                energy += eterm*(gridReal*gridReal+gridImag*gridImag);
            }
            firstz = 0;
        }
    }
230
    return 0.5*energy;
231
232
}

233
234
235
236
237
static void reciprocalConvolution(int start, int end, fftwf_complex* grid, vector<float>& recipEterm) {
    for (int index = start; index < end; index++) {
        float eterm = recipEterm[index];
        grid[index][0] *= eterm;
        grid[index][1] *= eterm;
238
239
240
    }
}

241
static void interpolateForces(float* posq, float* force, float* grid, int gridx, int gridy, int gridz, int numParticles, Vec3* periodicBoxVectors, Vec3* recipBoxVectors, gmx_atomic_t& atomicCounter) {
peastman's avatar
peastman committed
242
243
244
245
246
    fvec4 boxSize((float) periodicBoxVectors[0][0], (float) periodicBoxVectors[1][1], (float) periodicBoxVectors[2][2], 0);
    fvec4 invBoxSize((float) recipBoxVectors[0][0], (float) recipBoxVectors[1][1], (float) recipBoxVectors[2][2], 0);
    fvec4 recipBoxVec0((float) recipBoxVectors[0][0], (float) recipBoxVectors[0][1], (float) recipBoxVectors[0][2], 0);
    fvec4 recipBoxVec1((float) recipBoxVectors[1][0], (float) recipBoxVectors[1][1], (float) recipBoxVectors[1][2], 0);
    fvec4 recipBoxVec2((float) recipBoxVectors[2][0], (float) recipBoxVectors[2][1], (float) recipBoxVectors[2][2], 0);
247
248
249
250
    fvec4 gridSize(gridx, gridy, gridz, 0);
    ivec4 gridSizeInt(gridx, gridy, gridz, 0);
    fvec4 one(1);
    fvec4 scale(1.0f/(PME_ORDER-1));
251
    const float epsilonFactor = sqrt(ONE_4PI_EPS0);
252
253
254
255
256
    while (true) {
        int i = gmx_atomic_fetch_add(&atomicCounter, 1);
        if (i >= numParticles)
            break;

257
258
        // Find the position relative to the nearest grid point.
        
259
        fvec4 pos(&posq[4*i]);
peastman's avatar
peastman committed
260
261
262
263
        float posInBox[4];
        (pos-boxSize*floor(pos*invBoxSize)).store(posInBox);
        fvec4 t = posInBox[0]*recipBoxVec0 + posInBox[1]*recipBoxVec1 + posInBox[2]*recipBoxVec2;
        t = (t-floor(t))*gridSize;
264
265
266
        ivec4 ti = t;
        fvec4 dr = t-ti;
        ivec4 gridIndex = ti-(gridSizeInt&ti==gridSizeInt);
267
268
269
        
        // Compute the B-spline coefficients.
        
270
271
272
        fvec4 data[PME_ORDER];
        fvec4 ddata[PME_ORDER];
        data[PME_ORDER-1] = 0.0f;
273
        data[1] = dr;
274
        data[0] = one-dr;
275
        for (int j = 3; j < PME_ORDER; j++) {
276
277
            fvec4 div(1.0f/(j-1));
            data[j-1] = div*dr*data[j-2];
278
            for (int k = 1; k < j-1; k++)
279
280
                data[j-k-1] = div*((dr+k)*data[j-k-2]+(fvec4(j-k)-dr)*data[j-k-1]);
            data[0] = div*(one-dr)*data[0];
281
        }
282
        ddata[0] = -data[0];
283
        for (int j = 1; j < PME_ORDER; j++)
284
285
            ddata[j] = data[j-1]-data[j];
        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
286
        for (int j = 1; j < (PME_ORDER-1); j++)
287
288
            data[PME_ORDER-j-1] = scale*((dr+j)*data[PME_ORDER-j-2]+(fvec4(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
        data[0] = scale*(one-dr)*data[0];
289
                
290
291
        // Compute the force on this atom.
        
292
293
294
        int gridIndexX = gridIndex[0];
        int gridIndexY = gridIndex[1];
        int gridIndexZ = gridIndex[2];
295
296
        if (gridIndexX < 0)
            return; // This happens when a simulation blows up and coordinates become NaN.
peastman's avatar
peastman committed
297
298
299
300
301
        int zindex[PME_ORDER];
        for (int j = 0; j < PME_ORDER; j++) {
            zindex[j] = gridIndexZ+j;
            zindex[j] -= (zindex[j] >= gridz ? gridz : 0);
        }
302
        fvec4 zdata[PME_ORDER];
303
        for (int j = 0; j < PME_ORDER; j++)
304
305
            zdata[j] = fvec4(data[j][2], data[j][2], ddata[j][2], 0);
        fvec4 f = 0.0f;
306
307
308
309
        for (int ix = 0; ix < PME_ORDER; ix++) {
            int xbase = gridIndexX+ix;
            xbase -= (xbase >= gridx ? gridx : 0);
            xbase = xbase*gridy*gridz;
310
311
312
            float dx = data[ix][0];
            float ddx = ddata[ix][0];
            fvec4 xdata(ddx, dx, dx, 0);
313
314
315
316
317

            for (int iy = 0; iy < PME_ORDER; iy++) {
                int ybase = gridIndexY+iy;
                ybase -= (ybase >= gridy ? gridy : 0);
                ybase = xbase + ybase*gridz;
318
319
320
                float dy = data[iy][1];
                float ddy = ddata[iy][1];
                fvec4 xydata = xdata*fvec4(dy, ddy, dy, 0);
321
322

                for (int iz = 0; iz < PME_ORDER; iz++) {
323
324
                    fvec4 gridValue(grid[ybase+zindex[iz]]);
                    f = f+xydata*zdata[iz]*gridValue;
325
326
327
                }
            }
        }
peastman's avatar
peastman committed
328
329
330
331
332
333
        f *= -epsilonFactor*posq[4*i+3];
        float fc[4];
        f.store(fc);
        force[4*i+0] = fc[0]*gridx*(float)recipBoxVectors[0][0];
        force[4*i+1] = fc[0]*gridx*(float)recipBoxVectors[1][0]+fc[1]*gridy*(float)recipBoxVectors[1][1];
        force[4*i+2] = fc[0]*gridx*(float)recipBoxVectors[2][0]+fc[1]*gridy*(float)recipBoxVectors[2][1]+fc[2]*gridz*(float)recipBoxVectors[2][2];
334
335
336
    }
}

337
class CpuCalcPmeReciprocalForceKernel::ComputeTask : public ThreadPool::Task {
338
public:
339
340
341
342
    ComputeTask(CpuCalcPmeReciprocalForceKernel& owner) : owner(owner) {
    }
    void execute(ThreadPool& threads, int threadIndex) {
        owner.runWorkerThread(threads, threadIndex);
343
    }
344
    CpuCalcPmeReciprocalForceKernel& owner;
345
346
347
};

static void* threadBody(void* args) {
348
349
    CpuCalcPmeReciprocalForceKernel& owner = *reinterpret_cast<CpuCalcPmeReciprocalForceKernel*>(args);
    owner.runMainThread();
350
351
352
    return 0;
}

353
void CpuCalcPmeReciprocalForceKernel::initialize(int xsize, int ysize, int zsize, int numParticles, double alpha) {
354
355
    if (!hasInitializedThreads) {
        numThreads = getNumProcessors();
356
357
358
        char* threadsEnv = getenv("OPENMM_CPU_THREADS");
        if (threadsEnv != NULL)
            stringstream(threadsEnv) >> numThreads;
359
360
361
        fftwf_init_threads();
        hasInitializedThreads = true;
    }
362
    threadEnergy.resize(numThreads);
peastman's avatar
peastman committed
363
364
365
    gridx = findFFTDimension(xsize, false);
    gridy = findFFTDimension(ysize, false);
    gridz = findFFTDimension(zsize, true);
366
367
368
    this->numParticles = numParticles;
    this->alpha = alpha;
    force.resize(4*numParticles);
369
    recipEterm.resize(gridx*gridy*gridz);
370
    
371
372
    // Initialize threads.
    
373
    isFinished = false;
374
375
376
    pthread_cond_init(&startCondition, NULL);
    pthread_cond_init(&endCondition, NULL);
    pthread_mutex_init(&lock, NULL);
377
    pthread_create(&mainThread, NULL, threadBody, this);
378
    
379
380
381
    // Wait until the main thread is up and running.
    
    pthread_mutex_lock(&lock);
382
383
    while (!isFinished)
        pthread_cond_wait(&endCondition, &lock);
384
385
    pthread_mutex_unlock(&lock);
    
386
387
    // Initialize FFTW.
    
388
389
390
    for (int i = 0; i < numThreads; i++)
        tempGrid.push_back((float*) fftwf_malloc(sizeof(float)*(gridx*gridy*gridz+3)));
    realGrid = tempGrid[0];
391
392
393
394
395
396
397
398
    complexGrid = (fftwf_complex*) fftwf_malloc(sizeof(fftwf_complex)*gridx*gridy*(gridz/2+1));
    fftwf_plan_with_nthreads(numThreads);
    forwardFFT = fftwf_plan_dft_r2c_3d(gridx, gridy, gridz, realGrid, complexGrid, FFTW_MEASURE);
    backwardFFT = fftwf_plan_dft_c2r_3d(gridx, gridy, gridz, complexGrid, realGrid, FFTW_MEASURE);
    hasCreatedPlan = true;
    
    // Initialize the b-spline moduli.

399
    int maxSize = std::max(std::max(gridx, gridy), gridz);
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
    vector<double> data(PME_ORDER);
    vector<double> ddata(PME_ORDER);
    vector<double> bsplinesData(maxSize);
    data[PME_ORDER-1] = 0.0;
    data[1] = 0.0;
    data[0] = 1.0;
    for (int i = 3; i < PME_ORDER; i++) {
        double div = 1.0/(i-1.0);
        data[i-1] = 0.0;
        for (int j = 1; j < (i-1); j++)
            data[i-j-1] = div*(j*data[i-j-2]+(i-j)*data[i-j-1]);
        data[0] = div*data[0];
    }

    // Differentiate.

    ddata[0] = -data[0];
    for (int i = 1; i < PME_ORDER; i++)
        ddata[i] = data[i-1]-data[i];
    double div = 1.0/(PME_ORDER-1);
    data[PME_ORDER-1] = 0.0;
    for (int i = 1; i < (PME_ORDER-1); i++)
        data[PME_ORDER-i-1] = div*(i*data[PME_ORDER-i-2]+(PME_ORDER-i)*data[PME_ORDER-i-1]);
    data[0] = div*data[0];
    for (int i = 0; i < maxSize; i++)
        bsplinesData[i] = 0.0;
    for (int i = 1; i <= PME_ORDER; i++)
        bsplinesData[i] = data[i-1];

    // Evaluate the actual bspline moduli for X/Y/Z.

    bsplineModuli[0].resize(gridx);
    bsplineModuli[1].resize(gridy);
    bsplineModuli[2].resize(gridz);
    for (int dim = 0; dim < 3; dim++) {
        int ndata = bsplineModuli[dim].size();
        vector<float>& moduli = bsplineModuli[dim];
        for (int i = 0; i < ndata; i++) {
            double sc = 0.0;
            double ss = 0.0;
            for (int j = 0; j < ndata; j++) {
                double arg = (2.0*M_PI*i*j)/ndata;
                sc += bsplinesData[j]*cos(arg);
                ss += bsplinesData[j]*sin(arg);
            }
            moduli[i] = (float) (sc*sc+ss*ss);
        }
        for (int i = 0; i < ndata; i++)
            if (moduli[i] < 1.0e-7f)
                moduli[i] = (moduli[i-1]+moduli[i+1])*0.5f;
    }
}

453
CpuCalcPmeReciprocalForceKernel::~CpuCalcPmeReciprocalForceKernel() {
454
455
    isDeleted = true;
    pthread_mutex_lock(&lock);
456
457
    pthread_cond_broadcast(&startCondition);
    pthread_mutex_unlock(&lock);
458
    pthread_join(mainThread, NULL);
459
460
461
    pthread_mutex_destroy(&lock);
    pthread_cond_destroy(&startCondition);
    pthread_cond_destroy(&endCondition);
462
463
    for (int i = 0; i < (int) tempGrid.size(); i++)
        fftwf_free(tempGrid[i]);
464
465
466
467
468
469
    if (complexGrid != NULL)
        fftwf_free(complexGrid);
    if (hasCreatedPlan) {
        fftwf_destroy_plan(forwardFFT);
        fftwf_destroy_plan(backwardFFT);
    }
470
471
}

472
473
void CpuCalcPmeReciprocalForceKernel::runMainThread() {
    // This is the main thread that coordinates all the other ones.
474
475

    pthread_mutex_lock(&lock);
476
    isFinished = true;
477
    pthread_cond_signal(&endCondition);
478
479
480
481
482
483
484
485
486
    ThreadPool threads(numThreads);
    while (true) {
        // Wait for the signal to start.

        pthread_cond_wait(&startCondition, &lock);
        if (isDeleted)
            break;
        posq = io->getPosq();
        ComputeTask task(*this);
487
        gmx_atomic_set(&atomicCounter, 0);
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
        threads.execute(task); // Signal threads to perform charge spreading.
        threads.waitForThreads();
        threads.resumeThreads(); // Signal threads to sum the charge grids.
        threads.waitForThreads();
        fftwf_execute_dft_r2c(forwardFFT, realGrid, complexGrid);
        if (lastBoxVectors[0] != periodicBoxVectors[0] || lastBoxVectors[1] != periodicBoxVectors[1] || lastBoxVectors[2] != periodicBoxVectors[2]) {
            threads.resumeThreads(); // Signal threads to compute the reciprocal scale factors.
            threads.waitForThreads();
        }
        if (includeEnergy) {
            threads.resumeThreads(); // Signal threads to compute energy.
            threads.waitForThreads();
            for (int i = 0; i < (int) threadEnergy.size(); i++)
                energy += threadEnergy[i];
        }
        threads.resumeThreads(); // Signal threads to perform reciprocal convolution.
        threads.waitForThreads();
        fftwf_execute_dft_c2r(backwardFFT, complexGrid, realGrid);
506
        gmx_atomic_set(&atomicCounter, 0);
507
508
509
510
511
512
513
514
        threads.resumeThreads(); // Signal threads to interpolate forces.
        threads.waitForThreads();
        isFinished = true;
        lastBoxVectors[0] = periodicBoxVectors[0];
        lastBoxVectors[1] = periodicBoxVectors[1];
        lastBoxVectors[2] = periodicBoxVectors[2];
        pthread_cond_signal(&endCondition);
    }
515
516
517
    pthread_mutex_unlock(&lock);
}

518
519
520
521
522
523
void CpuCalcPmeReciprocalForceKernel::runWorkerThread(ThreadPool& threads, int index) {
    int gridxStart = (index*gridx)/numThreads;
    int gridxEnd = ((index+1)*gridx)/numThreads;
    int gridSize = (gridx*gridy*gridz+3)/4;
    int gridStart = 4*((index*gridSize)/numThreads);
    int gridEnd = 4*(((index+1)*gridSize)/numThreads);
524
    int complexSize = gridx*gridy*(gridz/2+1);
525
    int complexStart = std::max(1, ((index*complexSize)/numThreads));
526
527
    int complexEnd = (((index+1)*complexSize)/numThreads);
    spreadCharge(posq, tempGrid[index], gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors, atomicCounter);
528
529
530
531
532
533
534
    threads.syncThreads();
    int numGrids = tempGrid.size();
    for (int i = gridStart; i < gridEnd; i += 4) {
        fvec4 sum(&realGrid[i]);
        for (int j = 1; j < numGrids; j++)
            sum += fvec4(&tempGrid[j][i]);
        sum.store(&realGrid[i]);
535
    }
536
537
538
539
540
541
542
543
544
    threads.syncThreads();
    if (lastBoxVectors[0] != periodicBoxVectors[0] || lastBoxVectors[1] != periodicBoxVectors[1] || lastBoxVectors[2] != periodicBoxVectors[2]) {
        computeReciprocalEterm(gridxStart, gridxEnd, gridx, gridy, gridz, recipEterm, alpha, bsplineModuli, periodicBoxVectors, recipBoxVectors);
        threads.syncThreads();
    }
    if (includeEnergy) {
        threadEnergy[index] = reciprocalEnergy(gridxStart, gridxEnd, complexGrid, gridx, gridy, gridz, alpha, bsplineModuli, periodicBoxVectors, recipBoxVectors);
        threads.syncThreads();
    }
545
    reciprocalConvolution(complexStart, complexEnd, complexGrid, recipEterm);
546
    threads.syncThreads();
547
    interpolateForces(posq, &force[0], realGrid, gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors, atomicCounter);
548
549
}

peastman's avatar
peastman committed
550
void CpuCalcPmeReciprocalForceKernel::beginComputation(IO& io, const Vec3* periodicBoxVectors, bool includeEnergy) {
551
    this->io = &io;
peastman's avatar
peastman committed
552
553
554
    this->periodicBoxVectors[0] = periodicBoxVectors[0];
    this->periodicBoxVectors[1] = periodicBoxVectors[1];
    this->periodicBoxVectors[2] = periodicBoxVectors[2];
555
556
    this->includeEnergy = includeEnergy;
    energy = 0.0;
peastman's avatar
peastman committed
557
558
559
560
561
562
563
564
565
566
567

    // Invert the box vectors.

    double determinant = periodicBoxVectors[0][0]*periodicBoxVectors[1][1]*periodicBoxVectors[2][2];
    double scale = 1.0/determinant;
    recipBoxVectors[0] = Vec3(periodicBoxVectors[1][1]*periodicBoxVectors[2][2], 0, 0)*scale;
    recipBoxVectors[1] = Vec3(-periodicBoxVectors[1][0]*periodicBoxVectors[2][2], periodicBoxVectors[0][0]*periodicBoxVectors[2][2], 0)*scale;
    recipBoxVectors[2] = Vec3(periodicBoxVectors[1][0]*periodicBoxVectors[2][1]-periodicBoxVectors[1][1]*periodicBoxVectors[2][0], -periodicBoxVectors[0][0]*periodicBoxVectors[2][1], periodicBoxVectors[0][0]*periodicBoxVectors[1][1])*scale;

    // Do the calculation.

568
    pthread_mutex_lock(&lock);
569
    isFinished = false;
570
    pthread_cond_signal(&startCondition);
571
572
573
    pthread_mutex_unlock(&lock);
}

574
double CpuCalcPmeReciprocalForceKernel::finishComputation(IO& io) {
575
576
    pthread_mutex_lock(&lock);
    while (!isFinished) {
577
        pthread_cond_wait(&endCondition, &lock);
578
579
580
    }
    pthread_mutex_unlock(&lock);
    io.setForce(&force[0]);
581
    return energy;
peastman's avatar
peastman committed
582
}
583
584

bool CpuCalcPmeReciprocalForceKernel::isProcessorSupported() {
585
    return isVec4Supported();
586
}
587

588
589
590
591
592
593
594
void CpuCalcPmeReciprocalForceKernel::getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
    alpha = this->alpha;
    nx = gridx;
    ny = gridy;
    nz = gridz;
}

peastman's avatar
peastman committed
595
int CpuCalcPmeReciprocalForceKernel::findFFTDimension(int minimum, bool isZ) {
596
597
598
599
600
    if (minimum < 1)
        return 1;
    while (true) {
        // Attempt to factor the current value.

peastman's avatar
peastman committed
601
602
603
604
605
606
        if (isZ && minimum%2 == 1) {
            // Force the last dimension to be even, since this produces better performance in FFTW.

            minimum++;
            continue;
        }
607
        int unfactored = minimum;
608
        for (int factor = 2; factor < 8; factor++) {
609
610
611
            while (unfactored > 1 && unfactored%factor == 0)
                unfactored /= factor;
        }
612
        if (unfactored == 1 || unfactored == 11 || unfactored == 13)
613
614
615
616
            return minimum;
        minimum++;
    }
}