PME.cpp 23.3 KB
Newer Older
1
/*
2
 * Reference implementation of PME reciprocal space interactions.
3
 *
4
5
6
 * Copyright (c) 2009, Erik Lindahl, Rossen Apostolov, Szilard Pall
 * All rights reserved.
 * Contact: lindahl@cbr.su.se Stockholm University, Sweden.
7
8
 *
 * Redistribution and use in source and binary forms, with or without
9
10
 * modification, are permitted provided that the following conditions are met:
 *
11
12
13
14
 * Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer. Redistributions in binary
 * form must reproduce the above copyright notice, this list of conditions and
 * the following disclaimer in the documentation and/or other materials provided
15
 * with the distribution.
16
17
 * Neither the name of the author/university nor the names of its contributors may
 * be used to endorse or promote products derived from this software without
18
19
 * specific prior written permission.
 *
20
21
22
23
24
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
25
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
26
 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29
30
31
32
33
34
35
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

36
#include "PME.h"
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#include "fftpack.h"


//#define ONE_4PI_EPS0 (33.20636930*4.184) /* Units of kJ/mol and nm */
// In OpenMM, atom charges are already scaled by sqrt(ONE_4PI_EPS0), so we don't need this
#define ONE_4PI_EPS0 (1.0) /* Units of kJ/mol and nm */

typedef int    ivec[3];


struct pme
{
	int          natoms;
	RealOpenMM       ewaldcoeff;
51

52
53
54
55
56
57
	t_complex *  grid;                 /* Memory for the grid we spread charges on.
                                        * Element (i,j,k) is accessed as:
                                        * grid[i*ngrid[1]*ngrid[2] + j*ngrid[2] + k]
                                        */
	int          ngrid[3];             /* Total grid dimensions (all data is complex!) */
	fftpack_t    fftplan;              /* Handle to fourier transform setup  */
58

59
	int          order;                /* PME interpolation order. Almost always 4 */
60

61
62
63
64
65
    /* Data for bspline interpolation, see the Essman PME paper */
	RealOpenMM *     bsplines_moduli[3];   /* 3 pointers, to x/y/z bspline moduli, each of length ngrid[x/y/z]   */
	RealOpenMM *     bsplines_theta[3];    /* each of x/y/z has length order*natoms */
	RealOpenMM *     bsplines_dtheta[3];   /* each of x/y/z has length order*natoms */

66
	ivec *       particleindex;        /* Array of length natoms. Each element is
67
68
69
                                        * an ivec (3 ints) that specify the grid
                                        * indices for that particular atom. Updated every step!
                                        */
70
    rvec *       particlefraction;     /* Array of length natoms. Fractional offset in the grid for
71
72
                                        * each atom in all three dimensions.
                                        */
73

74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
    /* Further explanation of index/fraction:
     *
     * Assume we have a cell of size 10*10*10nm, and a total grid dimension of 100*100*100 cells.
     * In other words, each cell is 0.1*0.1*0.1 nm.
     *
     * If particle i has coordinates { 0.543 , 6.235 , -0.73 }, we will get:
     *
     * particleindex[i]    = { 5 , 62 , 92 }         (-0.73 + 10 = 9.27, we always apply PBC for grid calculations!)
     * particlefraction[i] = { 0.43 , 0.35 , 0.7 }   ( this is the fraction of the cell length where the atom is)
     *
     * (The reason for precaculating / storing these is that it gets a bit more complex for triclinic cells :-)
     *
     * In the current code version we might assume that a particle is not more than a whole box length away from
     * the central cell, i.e., in this case we would assume all coordinates fall in -10 nm < x,y,z < 20 nm.
     */
89

90
	RealOpenMM       epsilon_r;             /* Dielectric coefficient to use, typically 1.0 */
91
};
92
93
94
95
96
97
98


/* Internal setup routines */



/* Only called once from init_pme(), performance does not matter! */
99
static void
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
pme_calculate_bsplines_moduli(pme_t pme)
{
	int       nmax;
	int       i,j,k,l,d;
	int       order;
	int       ndata;
	RealOpenMM *  data;
	RealOpenMM *  ddata;
	RealOpenMM *  bsplines_data;
	RealOpenMM    div;
	RealOpenMM    sc,ss,arg;

	nmax = 0;
	for(d=0;d<3;d++)
	{
		nmax = (pme->ngrid[d] > nmax) ? pme->ngrid[d] : nmax;
		pme->bsplines_moduli[d] = (RealOpenMM *) malloc(sizeof(RealOpenMM)*pme->ngrid[d]);
	}
118

119
	order = pme->order;
120

121
122
123
124
	/* temp storage in this routine */
	data          = (RealOpenMM *) malloc(sizeof(RealOpenMM)*order);
	ddata         = (RealOpenMM *) malloc(sizeof(RealOpenMM)*order);
	bsplines_data = (RealOpenMM *) malloc(sizeof(RealOpenMM)*nmax);
125

126
127
128
	data[order-1]=0;
	data[1]=0;
	data[0]=1;
129

130
131
132
133
134
135
136
137
138
139
	for(k=3;k<order;k++)
	{
		div=1.0/(k-1.0);
		data[k-1]=0;
		for(l=1;l<(k-1);l++)
		{
			data[k-l-1]=div*(l*data[k-l-2]+(k-l)*data[k-l-1]);
		}
		data[0]=div*data[0];
	}
140

141
142
143
144
145
146
	/* differentiate */
	ddata[0]=-data[0];
	for(k=1;k<order;k++)
	{
		ddata[k]=data[k-1]-data[k];
	}
147

148
149
	div=1.0/(order-1);
	data[order-1]=0;
150

151
152
153
154
	for(l=1;l<(order-1);l++)
	{
		data[order-l-1]=div*(l*data[order-l-2]+(order-l)*data[order-l-1]);
	}
155
156
	data[0]=div*data[0];

157
158
159
160
161
162
163
164
	for(i=0;i<nmax;i++)
	{
		bsplines_data[i]=0;
	}
	for(i=1;i<=order;i++)
	{
		bsplines_data[i]=data[i-1];
    }
165

166
167
168
169
170
171
172
	/* Evaluate the actual bspline moduli for X/Y/Z */
	for(d=0;d<3;d++)
	{
		ndata = pme->ngrid[d];
		for(i=0;i<ndata;i++)
		{
			sc=ss=0;
173
			for(j=0;j<ndata;j++)
174
175
176
177
178
179
180
181
182
183
184
185
186
187
			{
				arg=(2.0*M_PI*i*j)/ndata;
				sc+=bsplines_data[j]*cos(arg);
				ss+=bsplines_data[j]*sin(arg);
			}
			pme->bsplines_moduli[d][i]=sc*sc+ss*ss;
		}
		for(i=0;i<ndata;i++)
		{
			if(pme->bsplines_moduli[d][i]<1.0e-7)
			{
				pme->bsplines_moduli[d][i]=(pme->bsplines_moduli[d][i-1]+pme->bsplines_moduli[d][i+1])*0.5;
			}
		}
188
	}
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206

	/* Release temp storage */
	free(data);
	free(ddata);
	free(bsplines_data);
}



static void
pme_update_grid_index_and_fraction(pme_t    pme,
								   RealOpenMM ** atomCoordinates,
								   const RealOpenMM   periodicBoxSize[3])
{
	int    i;
	int    d;
	RealOpenMM t;
	int    ti;
207

208
209
210
211
212
213
214
215
216
217
218
	for(i=0;i<pme->natoms;i++)
	{
		for(d=0;d<3;d++)
		{
			/* Index calculation (Look mom, no conditionals!):
             *
             * Both for Cuda and modern CPUs it is nice to avoid conditionals, but we still need to apply periodic boundary conditions.
             * Instead of having loops to add/subtract the box dimension, we do it this way:
             *
			 * 1. First add the box size, to make sure this atom coordinate isnt -0.1 or something.
             *    After this we assume all fractional box positions are *positive*.
219
             *    The reason for this is that we always want to round coordinates _down_ to get
220
221
222
223
224
225
226
227
228
229
230
231
             *    their grid index, and when taking the integer part of -3.4 we would get -3, not -4 as we want.
             *    Since we anyway need the grid indices to fall in the central box, it is more convenient
             *    to first manipulate the coordinates to be positive.
			 * 2. Convert to integer grid index
             *    Since we have added a whole box unit in step 1, this index might actually be larger than
             *    the grid dimension. Examples, assuming 10*10*10nm box and grid dimension 100*100*100 (spacing 0.1 nm):
             *
             *    coordinate is { 0.543 , 6.235 , -0.73 }
             *
             *    x[i][d]/box[d]                      becomes   { 0.0543 , 0.6235 , -0.073 }
             *    (x[i][d]/box[d] + 1.0)              becomes   { 1.0543 , 1.6235 , 0.927 }
             *    (x[i][d]/box[d] + 1.0)*ngrid[d]     becomes   { 105.43 , 162.35 , 92.7 }
232
             *
233
234
235
236
237
238
             *    integer part is now { 105 , 162 , 92 }
             *
             *    The fraction is calculates as t-ti, which becomes { 0.43 , 0.35 , 0.7 }
             *
			 * 3. Take the first integer index part (which can be larger than the grid) modulo the grid dimension
             *
239
             *    Now we get { 5 , 62 , 92 }
240
241
242
243
244
245
246
247
248
249
             *
             *    Voila, both index and fraction, entirely without conditionals. The one limitation here is that
             *    we only add one box length, so if the particle had a coordinate <=-10.0, we would be screwed.
             *    In principle we can of course add 100.0, but that just moves the problem, it doesnt solve it.
             *    In practice, MD programs will apply PBC to reset particles inside the central box to avoid
             *    numerical problems, so this shouldnt cause any problems.
             *    (And, by adding 100.0 box lengths, we would lose a bit of numerical accuracy here!)
			 */
			t  = (atomCoordinates[i][d] / periodicBoxSize[d] + 1.0)*pme->ngrid[d];
			ti = t;
250
251

			pme->particlefraction[i][d] = t - ti;
252
			pme->particleindex[i][d]    = ti % pme->ngrid[d];
253
		}
254
255
256
257
	}
}


258
/* Ugly bspline calculation taken from Tom Dardens reference equations.
259
260
261
262
 * This probably very sub-optimal in Cuda? Separate kernel?
 *
 * In practice, it might help to require order=4 for the cuda port.
 */
263
static void
264
265
266
267
268
269
270
pme_update_bsplines(pme_t    pme)
{
	int       i,j,k,l;
	int       order;
	RealOpenMM    dr,div;
	RealOpenMM *  data;
	RealOpenMM *  ddata;
271

272
	order = pme->order;
273
274

    for(i=0; (i<pme->natoms); i++)
275
	{
276
		for(j=0; j<3; j++)
277
278
279
		{
			/* dr is relative offset from lower cell limit */
			dr = pme->particlefraction[i][j];
280

281
282
283
284
285
			data  = &(pme->bsplines_theta[j][i*order]);
			ddata = &(pme->bsplines_dtheta[j][i*order]);
			data[order-1] = 0;
			data[1]       = dr;
			data[0]       = 1-dr;
286
287

			for(k=3; k<order; k++)
288
			{
289
				div = 1.0/(k-1.0);
290
				data[k-1] = div*dr*data[k-2];
291
				for(l=1; l<(k-1); l++)
292
293
294
295
296
297
298
299
				{
					data[k-l-1] = div*((dr+l)*data[k-l-2]+(k-l-dr)*data[k-l-1]);
				}
				data[0] = div*(1-dr)*data[0];
			}

			/* differentiate */
			ddata[0] = -data[0];
300
301

			for(k=1; k<order; k++)
302
303
304
			{
				ddata[k] = data[k-1]-data[k];
			}
305

306
307
			div           = 1.0/(order-1);
			data[order-1] = div*dr*data[order-2];
308
309

			for(l=1; l<(order-1); l++)
310
311
312
			{
				data[order-l-1] = div*((dr+l)*data[order-l-2]+(order-l-dr)*data[order-l-1]);
			}
313
			data[0] = div*(1-dr)*data[0];
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
		}
    }
}


static void
pme_grid_spread_charge(pme_t      pme,
					   RealOpenMM **   atomParameters)
{
        static const int   QIndex = 2; // atom charges are stored in atomParameters[atomID][2]
	int       order;
	int       i;
	int       ix,iy,iz;
	int       x0index,y0index,z0index;
	int       xindex,yindex,zindex;
	int       index;
	RealOpenMM    q;
	RealOpenMM *  thetax;
	RealOpenMM *  thetay;
	RealOpenMM *  thetaz;
334

335
	order = pme->order;
336

337
338
339
340
341
    /* Reset the grid */
	for(i=0;i<pme->ngrid[0]*pme->ngrid[1]*pme->ngrid[2];i++)
	{
		pme->grid[i].re = pme->grid[i].im = 0;
	}
342

343
344
345
	for(i=0;i<pme->natoms;i++)
	{
		q = atomParameters[i][QIndex];
346

347
348
349
350
        /* Grid index for the actual atom position */
		x0index = pme->particleindex[i][0];
		y0index = pme->particleindex[i][1];
		z0index = pme->particleindex[i][2];
351

352
353
354
355
        /* Bspline factors for this atom in each dimension , calculated from fractional coordinates */
		thetax  = &(pme->bsplines_theta[0][i*order]);
		thetay  = &(pme->bsplines_theta[1][i*order]);
		thetaz  = &(pme->bsplines_theta[2][i*order]);
356

357
358
359
        /* Loop over norder*norder*norder (typically 4*4*4) neighbor cells.
         *
         * As a neat optimization, we only spread in the forward direction, but apply PBC!
360
         *
361
         * Since we are going to do an FFT on the grid, it doesnt matter where the data is,
362
         * in frequency space the result will be the same.
363
364
365
366
367
         *
         * So, the influence function (bsplines) will probably be something like (0.15,0.35,0.35,0.15),
         * with largest weight 2-3 steps forward (you dont need to understand that for the implementation :-)
         * Effectively, you can look at this as translating the entire grid.
         *
368
         * Why do we do this stupid thing?
369
         *
370
         * 1) The loops get much simpler
371
372
373
         * 2) Just looking forward will hopefully get us more cache hits
         * 3) When we parallelize things, we only need to communicate in one direction instead of two!
         */
374

375
376
377
378
		for(ix=0;ix<order;ix++)
		{
            /* Calculate index, apply PBC so we spread to index 0/1/2 when a particle is close to the upper limit of the grid */
			xindex = (x0index + ix) % pme->ngrid[0];
379

380
381
382
			for(iy=0;iy<order;iy++)
			{
				yindex = (y0index + iy) % pme->ngrid[1];
383

384
385
386
				for(iz=0;iz<order;iz++)
				{
					/* Can be optimized, but we keep it simple here */
387
					zindex               = (z0index + iz) % pme->ngrid[2];
388
389
390
                    /* Calculate index in the charge grid */
					index                = xindex*pme->ngrid[1]*pme->ngrid[2] + yindex*pme->ngrid[2] + zindex;
                    /* Add the charge times the bspline spread/interpolation factors to this grid position */
391
					pme->grid[index].re += q*thetax[ix]*thetay[iy]*thetaz[iz];
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
				}
			}
		}
	}
}



static void
pme_reciprocal_convolution(pme_t     pme,
						   const RealOpenMM    periodicBoxSize[3],
						   RealOpenMM *  energy,
						   RealOpenMM    pme_virial[3][3])
{
	int kx,ky,kz;
	int nx,ny,nz;
	RealOpenMM mx,my,mz;
	RealOpenMM mhx,mhy,mhz,m2;
	RealOpenMM one_4pi_eps;
	RealOpenMM virxx,virxy,virxz,viryy,viryz,virzz;
	RealOpenMM bx,by,bz;
	RealOpenMM d1,d2;
	RealOpenMM eterm,vfactor,struct2,ets2;
	RealOpenMM esum;
	RealOpenMM factor;
	RealOpenMM denom;
	RealOpenMM boxfactor;
	RealOpenMM maxkx,maxky,maxkz;
420

421
	t_complex *ptr;
422

423
424
425
	nx = pme->ngrid[0];
	ny = pme->ngrid[1];
	nz = pme->ngrid[2];
426

427
428
429
	one_4pi_eps=ONE_4PI_EPS0/pme->epsilon_r;
	factor=M_PI*M_PI/(pme->ewaldcoeff*pme->ewaldcoeff);
	boxfactor = M_PI*periodicBoxSize[0]*periodicBoxSize[1]*periodicBoxSize[2];
430

431
432
433
434
435
436
437
	esum = 0;
	virxx = 0;
	virxy = 0;
	virxz = 0;
	viryy = 0;
	viryz = 0;
	virzz = 0;
438

439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
	maxkx = (nx+1)/2;
	maxky = (ny+1)/2;
	maxkz = (nz+1)/2;

    for(kx=0;kx<nx;kx++)
    {
        /* Calculate frequency. Grid indices in the upper half correspond to negative frequencies! */
        mx  = (kx<maxkx) ? kx : (kx-nx);
        mhx = mx/periodicBoxSize[0];
        bx  = boxfactor*pme->bsplines_moduli[0][kx];

        for(ky=0;ky<ny;ky++)
        {
            /* Calculate frequency. Grid indices in the upper half correspond to negative frequencies! */
            my  = (ky<maxky) ? ky : (ky-ny);
            mhy = my/periodicBoxSize[1];
            by  = pme->bsplines_moduli[1][ky];
456

457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
			for(kz=0;kz<nz;kz++)
			{
                /* If the net charge of the system is 0.0, there will not be any DC (direct current, zero frequency) component. However,
                 * we can still handle charged systems through a charge correction, in which case the DC
                 * component should be excluded from recprocal space. We will anyway run into problems below when dividing with the
                 * frequency if it is zero...
                 *
                 * In cuda you could probably work around this by setting something to 0.0 instead, but the short story is that we
                 * should skip the zero frequency case!
                 */
				if (kx==0 && ky==0 && kz==0)
				{
					continue;
				}

                /* Calculate frequency. Grid indices in the upper half correspond to negative frequencies! */
				mz        = (kz<maxkz) ? kz : (kz-nz);
				mhz       = mz/periodicBoxSize[2];

                /* Pointer to the grid cell in question */
477
				ptr       = pme->grid + kx*ny*nz + ky*nz + kz;
478
479
480
481

                /* Get grid data for this frequency */
				d1        = ptr->re;
				d2        = ptr->im;
482

483
484
485
486
                /* Calculate the convolution - see the Essman/Darden paper for the equation! */
				m2        = mhx*mhx+mhy*mhy+mhz*mhz;
				bz        = pme->bsplines_moduli[2][kz];
				denom     = m2*bx*by*bz;
487

488
489
490
491
492
				eterm     = one_4pi_eps*exp(-factor*m2)/denom;

                /* write back convolution data to grid */
				ptr->re   = d1*eterm;
				ptr->im   = d2*eterm;
493

494
495
496
497
498
499
500
501
502
503
504
505
506
				struct2   = (d1*d1+d2*d2);

				/* Long-range PME contribution to the energy for this frequency */
				ets2      = eterm*struct2;
				esum     += ets2;

				/* PME long-range contribution to atomic virial. Since it is symmetric, we only calculate half the matrix inside this loop. */
				vfactor   = (factor*m2+1.0)*2.0/m2;
				virxx    += ets2*(vfactor*mhx*mhx-1.0);
                virxy    += ets2*vfactor*mhx*mhy;
                virxz    += ets2*vfactor*mhx*mhz;
                viryy    += ets2*(vfactor*mhy*mhy-1.0);
                viryz    += ets2*vfactor*mhy*mhz;
507
                virzz    += ets2*(vfactor*mhz*mhz-1.0);
508
509
510
511
512
513
514
515
516
			}
		}
	}
	pme_virial[0][0] = 0.25*virxx;
    pme_virial[1][1] = 0.25*viryy;
    pme_virial[2][2] = 0.25*virzz;
    pme_virial[0][1] = pme_virial[1][0] = 0.25*virxy;
    pme_virial[0][2] = pme_virial[2][0] = 0.25*virxz;
    pme_virial[1][2] = pme_virial[2][1] = 0.25*viryz;
517

518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
    /* The factor 0.5 is nothing special, but it is better to have it here than inside the loop :-) */
	*energy = 0.5*esum;
}


static void
pme_grid_interpolate_force(pme_t      pme,
						   const RealOpenMM     periodicBoxSize[3],
						   RealOpenMM **   atomParameters,
						   RealOpenMM **   forces)
{
        static const int   QIndex = 2; // atom charges are stored in atomParameters[atomID][2]
	int       i;
	int       ix,iy,iz;
	int       x0index,y0index,z0index;
	int       xindex,yindex,zindex;
	int       index;
	int       order;
	RealOpenMM    q;
	RealOpenMM *  thetax;
	RealOpenMM *  thetay;
	RealOpenMM *  thetaz;
	RealOpenMM *  dthetax;
	RealOpenMM *  dthetay;
	RealOpenMM *  dthetaz;
	RealOpenMM    tx,ty,tz;
	RealOpenMM    dtx,dty,dtz;
	RealOpenMM    fx,fy,fz;
	RealOpenMM    gridvalue;
	int       nx,ny,nz;
548

549
550
551
	nx    = pme->ngrid[0];
	ny    = pme->ngrid[1];
	nz    = pme->ngrid[2];
552

553
	order = pme->order;
554

555
    /* This is almost identical to the charge spreading routine! */
556

557
558
559
	for(i=0;i<pme->natoms;i++)
	{
		fx = fy = fz = 0;
560

561
		q = atomParameters[i][QIndex];
562

563
564
565
566
        /* Grid index for the actual atom position */
		x0index = pme->particleindex[i][0];
		y0index = pme->particleindex[i][1];
		z0index = pme->particleindex[i][2];
567

568
569
570
571
572
573
574
575
576
        /* Bspline factors for this atom in each dimension , calculated from fractional coordinates */
		thetax  = &(pme->bsplines_theta[0][i*order]);
		thetay  = &(pme->bsplines_theta[1][i*order]);
		thetaz  = &(pme->bsplines_theta[2][i*order]);
		dthetax = &(pme->bsplines_dtheta[0][i*order]);
		dthetay = &(pme->bsplines_dtheta[1][i*order]);
		dthetaz = &(pme->bsplines_dtheta[2][i*order]);

		/* See pme_grid_spread_charge() for comments about the order here, and only interpolation in one direction */
577

578
579
580
581
582
583
584
585
586
        /* Since we will add order^3 (typically 4*4*4=64) terms to the force on each particle, we use temporary fx/fy/fz
         * variables, and only add it to memory forces[] at the end.
         */
		for(ix=0;ix<order;ix++)
		{
			xindex = (x0index + ix) % pme->ngrid[0];
            /* Get both the bspline factor and its derivative with respect to the x coordinate! */
			tx     = thetax[ix];
			dtx    = dthetax[ix];
587

588
589
590
591
592
593
			for(iy=0;iy<order;iy++)
			{
				yindex = (y0index + iy) % pme->ngrid[1];
                /* bspline + derivative wrt y */
				ty     = thetay[iy];
				dty    = dthetay[iy];
594

595
596
597
598
599
600
601
602
				for(iz=0;iz<order;iz++)
				{
					/* Can be optimized, but we keep it simple here */
					zindex               = (z0index + iz) % pme->ngrid[2];
                    /* bspline + derivative wrt z */
					tz                   = thetaz[iz];
					dtz                  = dthetaz[iz];
					index                = xindex*pme->ngrid[1]*pme->ngrid[2] + yindex*pme->ngrid[2] + zindex;
603

604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
                    /* Get the fft+convoluted+ifft:d data from the grid, which must be real by definition */
                    /* Checking that the imaginary part is indeed zero might be a good check :-) */
					gridvalue            = pme->grid[index].re;

					/* The d component of the force is calculated by taking the derived bspline in dimension d, normal bsplines in the other two */
					fx                  += dtx*ty*tz*gridvalue;
					fy                  += tx*dty*tz*gridvalue;
					fz                  += tx*ty*dtz*gridvalue;
				}
			}
		}
        /* Update memory force, note that we multiply by charge and some box stuff */
		forces[i][0] -= q*fx*nx/periodicBoxSize[0];
		forces[i][1] -= q*fy*ny/periodicBoxSize[1];
		forces[i][2] -= q*fz*nz/periodicBoxSize[2];
	}
}



/* EXPORTED ROUTINES */

626
int
627
628
629
pme_init(pme_t *       ppme,
		 RealOpenMM        ewaldcoeff,
		 int           natoms,
630
		 const int           ngrid[3],
631
632
633
634
635
		 int           pme_order,
         RealOpenMM        epsilon_r)
{
    pme_t pme;
    int   d;
636

637
638
639
640
641
642
    pme = (pme_t) malloc(sizeof(struct pme));

    pme->order       = pme_order;
    pme->epsilon_r   = epsilon_r;
    pme->ewaldcoeff  = ewaldcoeff;
	pme->natoms      = natoms;
643

644
645
646
647
648
649
650
651
652
    for(d=0;d<3;d++)
	{
		pme->ngrid[d]            = ngrid[d];
		pme->bsplines_theta[d]   = (RealOpenMM *)malloc(sizeof(RealOpenMM)*pme_order*natoms);
		pme->bsplines_dtheta[d]  = (RealOpenMM *)malloc(sizeof(RealOpenMM)*pme_order*natoms);
	}

	pme->particlefraction = (rvec *)malloc(sizeof(rvec)*natoms);
	pme->particleindex    = (ivec *)malloc(sizeof(ivec)*natoms);
653

654
655
	/* Allocate charge grid storage */
	pme->grid        = (t_complex *)malloc(sizeof(t_complex)*ngrid[0]*ngrid[1]*ngrid[2]);
656

657
	fftpack_init_3d(&pme->fftplan,ngrid[0],ngrid[1],ngrid[2]);
658

659
	/* Setup bspline moduli (see Essman paper) */
660
	pme_calculate_bsplines_moduli(pme);
661
662

	*ppme = pme;
663

664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
    return 0;
}





int pme_exec(pme_t       pme,
			 RealOpenMM **   atomCoordinates,
			 RealOpenMM **   forces,
			 RealOpenMM **   atomParameters,
			 const RealOpenMM      periodicBoxSize[3],
			 RealOpenMM *    energy,
			 RealOpenMM      pme_virial[3][3])
{
    /* Routine is called with coordinates in x, a box, and charges in q */
680

681
682
683
684
    /* Before we can do the actual interpolation, we need to recalculate and update
     * the indices for each particle in the charge grid (initialized in pme_init()),
     * and what its fractional offset in this grid cell is.
     */
685

686
	/* Update charge grid indices and fractional offsets for each atom.
687
     * The indices/fractions are stored internally in the pme datatype
688
689
     */
	pme_update_grid_index_and_fraction(pme,atomCoordinates,periodicBoxSize);
690

691
692
693
694
695
	/* Calculate bsplines (and their differentials) from current fractional coordinates, store in pme structure */
	pme_update_bsplines(pme);

	/* Spread the charges on grid (using newly calculated bsplines in the pme structure) */
	pme_grid_spread_charge(pme,atomParameters);
696
697

	/* do 3d-fft */
698
699
700
701
	fftpack_exec_3d(pme->fftplan,FFTPACK_FORWARD,pme->grid,pme->grid);

	/* solve in k-space */
	pme_reciprocal_convolution(pme,periodicBoxSize,energy,pme_virial);
702

703
704
	/* do 3d-invfft */
	fftpack_exec_3d(pme->fftplan,FFTPACK_BACKWARD,pme->grid,pme->grid);
705

706
707
708
709
710
711
712
713
714
715
716
717
	/* Get the particle forces from the grid and bsplines in the pme structure */
	pme_grid_interpolate_force(pme,periodicBoxSize,atomParameters,forces);

    return 0;
}



int
pme_destroy(pme_t    pme)
{
	int d;
718

719
	free(pme->grid);
720

721
722
723
724
725
726
	for(d=0;d<3;d++)
	{
		free(pme->bsplines_moduli[d]);
		free(pme->bsplines_theta[d]);
		free(pme->bsplines_dtheta[d]);
	}
727

728
729
	free(pme->particlefraction);
	free(pme->particleindex);
730

731
	fftpack_destroy(pme->fftplan);
732

733
734
	/* destroy structure itself */
	free(pme);
735

736
737
	return 0;
}