Implemented the PME method in the reference platform.

c66dd5a3 · Rossen Apostolov · a5f8fb35 · c66dd5a3 · c66dd5a3 · c66dd5a3
Commit c66dd5a3 authored Jul 07, 2009 by Rossen Apostolov
10 changed files
--- a/olla/include/openmm/kernels.h
+++ b/olla/include/openmm/kernels.h
@@ -246,7 +246,8 @@ public:
        NoCutoff = 0,
        CutoffNonPeriodic = 1,
        CutoffPeriodic = 2,
-        Ewald = 3
+        Ewald = 3,
+        PME = 4
    };
    static std::string Name() {
        return "CalcNonbondedForce";

--- a/openmmapi/include/openmm/NonbondedForce.h
+++ b/openmmapi/include/openmm/NonbondedForce.h
@@ -91,7 +91,12 @@ public:
         * Periodic boundary conditions are used, and Ewald summation is used to compute the interaction of each particle
         * with all periodic copies of every other particle.
         */
-        Ewald = 3
+        Ewald = 3,
+        /**
+         * Periodic boundary conditions are used, and Particle-Mesh Ewald (PME) summation is used to compute the interaction of each particle
+         * with all periodic copies of every other particle.
+         */
+        PME = 4
    };
    /**
     * Create a NonbondedForce.

--- a/platforms/reference/src/ReferenceKernels.cpp
+++ b/platforms/reference/src/ReferenceKernels.cpp
@@ -392,7 +392,7 @@ void ReferenceCalcNonbondedForceKernel::initialize(const System& system, const N
        neighborList = NULL;
    else
        neighborList = new NeighborList();
-    if (nonbondedMethod == Ewald) {
+    if (nonbondedMethod == Ewald || nonbondedMethod == PME) {
        RealOpenMM ewaldErrorTol = (RealOpenMM) force.getEwaldErrorTolerance();
        ewaldAlpha = (RealOpenMM) (std::sqrt(-std::log(ewaldErrorTol))/nonbondedCutoff);
        RealOpenMM mx = periodicBoxSize[0]/nonbondedCutoff;
@@ -417,14 +417,17 @@ void ReferenceCalcNonbondedForceKernel::executeForces(OpenMMContextImpl& context
    ReferenceLJCoulombIxn clj;
    bool periodic = (nonbondedMethod == CutoffPeriodic);
    bool ewald  = (nonbondedMethod == Ewald);
+    bool pme  = (nonbondedMethod == PME);
    if (nonbondedMethod != NoCutoff) {
-        computeNeighborListVoxelHash(*neighborList, numParticles, posData, exclusions, (periodic || ewald) ? periodicBoxSize : NULL, nonbondedCutoff, 0.0);
+        computeNeighborListVoxelHash(*neighborList, numParticles, posData, exclusions, (periodic || ewald || pme) ? periodicBoxSize : NULL, nonbondedCutoff, 0.0);
        clj.setUseCutoff(nonbondedCutoff, *neighborList, 78.3f);
    }
-    if (periodic||ewald)
+    if (periodic||ewald||pme)
        clj.setPeriodic(periodicBoxSize);
    if (ewald)
        clj.setUseEwald(ewaldAlpha, kmax[0], kmax[1], kmax[2]);
+    if (pme)
+        clj.setUsePME(ewaldAlpha);
    clj.calculatePairIxn(numParticles, posData, particleParamArray, exclusionArray, 0, forceData, 0, 0);
    ReferenceBondForce refBondForce;
    ReferenceLJCoulomb14 nonbonded14;
@@ -440,14 +443,17 @@ double ReferenceCalcNonbondedForceKernel::executeEnergy(OpenMMContextImpl& conte
    ReferenceLJCoulombIxn clj;
    bool periodic = (nonbondedMethod == CutoffPeriodic);
    bool ewald  = (nonbondedMethod == Ewald);
+    bool pme  = (nonbondedMethod == PME);
    if (nonbondedMethod != NoCutoff) {
-        computeNeighborListVoxelHash(*neighborList, numParticles, posData, exclusions, (periodic || ewald) ? periodicBoxSize : NULL, nonbondedCutoff, 0.0);
+        computeNeighborListVoxelHash(*neighborList, numParticles, posData, exclusions, (periodic || ewald || pme) ? periodicBoxSize : NULL, nonbondedCutoff, 0.0);
        clj.setUseCutoff(nonbondedCutoff, *neighborList, 78.3f);
    }
-    if (periodic || ewald)
+    if (periodic || ewald || pme)
        clj.setPeriodic(periodicBoxSize);
    if (ewald)
        clj.setUseEwald(ewaldAlpha, kmax[0], kmax[1], kmax[2]);
+    if (pme)
+        clj.setUsePME(ewaldAlpha);
    clj.calculatePairIxn(numParticles, posData, particleParamArray, exclusionArray, 0, forceData, 0, &energy);
    ReferenceBondForce refBondForce;
    ReferenceLJCoulomb14 nonbonded14;

--- a/platforms/reference/src/SimTKReference/ReferenceLJCoulombIxn.cpp
+++ b/platforms/reference/src/SimTKReference/ReferenceLJCoulombIxn.cpp
@@ -44,7 +44,7 @@ using std::vector;
   --------------------------------------------------------------------------------------- */
-ReferenceLJCoulombIxn::ReferenceLJCoulombIxn( ) : cutoff(false), periodic(false), ewald(false) {
+ReferenceLJCoulombIxn::ReferenceLJCoulombIxn( ) : cutoff(false), periodic(false), ewald(false), pme(false) {
   // ---------------------------------------------------------------------------------------
@@ -138,6 +138,19 @@ ReferenceLJCoulombIxn::~ReferenceLJCoulombIxn( ){
      ewald = true;
  }
+  /**---------------------------------------------------------------------------------------
+     Set the force to use Particle-Mesh Ewald (PME) summation.
+     @param alpha  the Ewald separation parameter
+     --------------------------------------------------------------------------------------- */
+  void ReferenceLJCoulombIxn::setUsePME(RealOpenMM alpha) {
+      alphaEwald = alpha;
+      pme = true;
+  }
 /**---------------------------------------------------------------------------------------
   Calculate parameters for LJ Coulomb ixn
@@ -222,7 +235,6 @@ int ReferenceLJCoulombIxn::calculateEwaldIxn( int numberOfAtoms, RealOpenMM** at
    RealOpenMM  factorEwald = -1 / (4*alphaEwald*alphaEwald);
    RealOpenMM SQRT_PI = sqrt(PI);
    RealOpenMM TWO_PI = 2.0 * PI;
    static const RealOpenMM epsilon     =  1.0;
@@ -415,6 +427,143 @@ int ReferenceLJCoulombIxn::calculateEwaldIxn( int numberOfAtoms, RealOpenMM** at
 }
+/**---------------------------------------------------------------------------------------
+   Calculate PME ixn
+   @param numberOfAtoms    number of atoms
+   @param atomCoordinates  atom coordinates
+   @param atomParameters   atom parameters                             atomParameters[atomIndex][paramterIndex]
+   @param exclusions       atom exclusion indices                      exclusions[atomIndex][atomToExcludeIndex]
+                           exclusions[atomIndex][0] = number of exclusions
+                           exclusions[atomIndex][1-no.] = atom indices of atoms to excluded from
+                           interacting w/ atom atomIndex
+   @param fixedParameters  non atom parameters (not currently used)
+   @param forces           force array (forces added)
+   @param energyByAtom     atom energy
+   @param totalEnergy      total energy
+   @return ReferenceForce::DefaultReturn
+   --------------------------------------------------------------------------------------- */
+int ReferenceLJCoulombIxn::calculatePMEIxn( int numberOfAtoms, RealOpenMM** atomCoordinates,
+                                             RealOpenMM** atomParameters, int** exclusions,
+                                             RealOpenMM* fixedParameters, RealOpenMM** forces,
+                                             RealOpenMM* energyByAtom, RealOpenMM* totalEnergy) const {
+    #include "../SimTKUtilities/RealTypeSimTk.h"
+    #include "pme.h"
+    RealOpenMM SQRT_PI = sqrt(PI);
+    static const RealOpenMM one         =  1.0;
+    RealOpenMM selfEwaldEnergy = 0.0;
+    RealOpenMM realSpaceEwaldEnergy = 0.0;
+    RealOpenMM recipEnergy = 0.0;
+// **************************************************************************************
+// SELF ENERGY
+// **************************************************************************************
+    for( int atomID = 0; atomID < numberOfAtoms; atomID++ ){
+        selfEwaldEnergy = selfEwaldEnergy + atomParameters[atomID][QIndex]*atomParameters[atomID][QIndex];
+    }
+       selfEwaldEnergy = selfEwaldEnergy * alphaEwald/SQRT_PI ;
+// **************************************************************************************
+// RECIPROCAL SPACE EWALD ENERGY AND FORCES
+// **************************************************************************************
+	pme_t          pmedata; /* abstract handle for PME data */
+	int            ngrid[3];
+	RealOpenMM virial[3][3];
+	/* PME grid dimensions.
+	 * We typically want to set this as the spacing rather than absolute dimensions, but
+	 * to be able to reproduce results from other programs (e.g. Gromacs) we need to be
+	 * able to set exact grid dimenisions occasionally.
+	 */
+	ngrid[0] = 16;
+	ngrid[1] = 16;
+	ngrid[2] = 16;
+	pme_init(&pmedata,alphaEwald,numberOfAtoms,ngrid,4,1);
+	pme_exec(pmedata,atomCoordinates,forces,atomParameters,periodicBoxSize,&recipEnergy,virial);
+// **************************************************************************************
+// SHORT-RANGE ENERGY AND FORCES
+// **************************************************************************************
+       RealOpenMM deltaR[2][ReferenceForce::LastDeltaRIndex];
+       for( int atomID1 = 0; atomID1 < numberOfAtoms; atomID1++ ){
+        for( int atomID2 = atomID1 + 1; atomID2 < numberOfAtoms; atomID2++ ){
+          ReferenceForce::getDeltaRPeriodic( atomCoordinates[atomID2], atomCoordinates[atomID1], periodicBoxSize, deltaR[0] );  
+          RealOpenMM r         = deltaR[0][ReferenceForce::RIndex];
+          RealOpenMM r2        = deltaR[0][ReferenceForce::R2Index];
+          RealOpenMM inverseR  = one/(deltaR[0][ReferenceForce::RIndex]);
+          realSpaceEwaldEnergy = 
+              (RealOpenMM)(realSpaceEwaldEnergy + atomParameters[atomID1][QIndex]*atomParameters[atomID2][QIndex]*inverseR*erfc(alphaEwald*r)); 
+        }
+       }
+       // allocate and initialize exclusion array
+       vector<int> exclusionIndices(numberOfAtoms);
+       for( int ii = 0; ii < numberOfAtoms; ii++ ){
+          exclusionIndices[ii] = -1;
+       }
+       for( int ii = 0; ii < numberOfAtoms; ii++ ){
+          // set exclusions
+          for( int jj = 1; jj <= exclusions[ii][0]; jj++ ){
+             exclusionIndices[exclusions[ii][jj]] = ii;
+          }
+          // loop over atom pairs
+          for( int jj = ii+1; jj < numberOfAtoms; jj++ ){
+             if( exclusionIndices[jj] != ii ){
+       ReferenceForce::getDeltaRPeriodic( atomCoordinates[jj], atomCoordinates[ii], periodicBoxSize, deltaR[0] );  
+       RealOpenMM r         = deltaR[0][ReferenceForce::RIndex];
+       RealOpenMM r2        = deltaR[0][ReferenceForce::R2Index];
+       RealOpenMM inverseR  = one/(deltaR[0][ReferenceForce::RIndex]);
+       RealOpenMM alphaR    = alphaEwald * r;
+       realSpaceEwaldEnergy = 
+           (RealOpenMM)(realSpaceEwaldEnergy + atomParameters[ii][QIndex]*atomParameters[jj][QIndex]*inverseR*erfc(alphaR)); 
+       RealOpenMM dEdR = atomParameters[ii][QIndex] * atomParameters[jj][QIndex] * inverseR * inverseR * inverseR;
+       dEdR = (RealOpenMM)(dEdR * (erfc(alphaR) + 2 * alphaR * exp ( - alphaR * alphaR) / SQRT_PI ));
+                for( int kk = 0; kk < 3; kk++ ){
+                   RealOpenMM force  = dEdR*deltaR[0][kk];
+                   forces[ii][kk]   += force;
+                   forces[jj][kk]   -= force;
+                } 
+             }
+          }
+       }
+// ***********************************************************************
+        if( totalEnergy ) {
+              *totalEnergy += recipEnergy + realSpaceEwaldEnergy - selfEwaldEnergy;
+        }
+   return ReferenceForce::DefaultReturn;
+}
 /**---------------------------------------------------------------------------------------
   Calculate LJ Coulomb pair ixn
@@ -440,8 +589,10 @@ int ReferenceLJCoulombIxn::calculatePairIxn( int numberOfAtoms, RealOpenMM** ato
                                             RealOpenMM* fixedParameters, RealOpenMM** forces,
                                             RealOpenMM* energyByAtom, RealOpenMM* totalEnergy ) const {
-    if (ewald)
+   if (ewald)
        return calculateEwaldIxn(numberOfAtoms, atomCoordinates, atomParameters, exclusions, fixedParameters, forces, energyByAtom, totalEnergy);
+   if (pme)
+        return calculatePMEIxn(numberOfAtoms, atomCoordinates, atomParameters, exclusions, fixedParameters, forces, energyByAtom, totalEnergy);
   if (cutoff) {
       for (int i = 0; i < (int) neighborList->size(); i++) {
           OpenMM::AtomPair pair = (*neighborList)[i];

--- a/platforms/reference/src/SimTKReference/ReferenceLJCoulombIxn.h
+++ b/platforms/reference/src/SimTKReference/ReferenceLJCoulombIxn.h
@@ -37,6 +37,7 @@ class ReferenceLJCoulombIxn : public ReferencePairIxn {
      bool cutoff;
      bool periodic;
      bool ewald;
+      bool pme;
      const OpenMM::NeighborList* neighborList;
      RealOpenMM periodicBoxSize[3];
      RealOpenMM cutoffDistance;
@@ -116,7 +117,7 @@ class ReferenceLJCoulombIxn : public ReferencePairIxn {
         --------------------------------------------------------------------------------------- */
      int setPeriodic( RealOpenMM* boxSize );      
      /**---------------------------------------------------------------------------------------
         Set the force to use Ewald summation.
@@ -130,6 +131,17 @@ class ReferenceLJCoulombIxn : public ReferencePairIxn {
      void setUseEwald(RealOpenMM alpha, int kmaxx, int kmaxy, int kmaxz);
+      /**---------------------------------------------------------------------------------------
+         Set the force to use Particle-Mesh Ewald (PME) summation.
+         @param alpha  the Ewald separation parameter
+         --------------------------------------------------------------------------------------- */
+      void setUsePME(RealOpenMM alpha);
      /**---------------------------------------------------------------------------------------
@@ -203,6 +215,31 @@ private:
                            RealOpenMM* fixedParameters, RealOpenMM** forces,
                            RealOpenMM* energyByAtom, RealOpenMM* totalEnergy ) const;
+      /**---------------------------------------------------------------------------------------
+         Calculate PME ixn
+         @param numberOfAtoms    number of atoms
+         @param atomCoordinates  atom coordinates
+         @param atomParameters   atom parameters (charges, c6, c12, ...)     atomParameters[atomIndex][paramterIndex]
+         @param exclusions       atom exclusion indices                      exclusions[atomIndex][atomToExcludeIndex]
+                                 exclusions[atomIndex][0] = number of exclusions
+                                 exclusions[atomIndex][1-no.] = atom indices of atoms to excluded from
+                                 interacting w/ atom atomIndex
+         @param fixedParameters  non atom parameters (not currently used)
+         @param forces           force array (forces added)
+         @param energyByAtom     atom energy
+         @param totalEnergy      total energy
+         @return ReferenceForce::DefaultReturn
+         --------------------------------------------------------------------------------------- */
+      int calculatePMEIxn( int numberOfAtoms, RealOpenMM** atomCoordinates,
+                            RealOpenMM** atomParameters, int** exclusions,
+                            RealOpenMM* fixedParameters, RealOpenMM** forces,
+                            RealOpenMM* energyByAtom, RealOpenMM* totalEnergy ) const;
 };
 // ---------------------------------------------------------------------------------------

--- a/platforms/reference/src/SimTKReference/fftpack.cpp
+++ b/platforms/reference/src/SimTKReference/fftpack.cpp
+/* 
+ *
+ * 1D transformations based on F77 to C translation of FFTPACK.
+ * FFTPACK was originally written by paul n swarztrauber
+ * at the national center for atmospheric research
+ * 
+ * FFTPACK is in the public domain.
+ * 
+ * Higher-dimension transforms copyright Erik Lindahl, 2008-2009.
+ * Just as FFTPACK, this file may be redistributed freely, and can be
+ * considered to be in the public domain.
+ *
+ * Any errors in this (threadsafe, but not threaded) C version
+ * are due to the f2c translator, or hacks by Erik Lindahl...
+ *
+ * Erik Lindahl, lindahl@cbr.su.se
+ * Center for Biomembrane Research
+ * Stockholm University, Sweden
+ */
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <stdio.h>
+#include "fftpack.h"
+/** Contents of the FFTPACK fft datatype. 
+ *
+ *  FFTPACK only does 1d transforms, so we use a pointers to another fft for 
+ *  the transform in the next dimension.
+ * Thus, a 3d-structure contains a pointer to a 2d one, which in turns contains
+ * a pointer to a 1d. The 1d structure has next==NULL.
+ */
+struct fftpack
+{
+    int                    ndim;     /**< Dimensions, including our subdimensions.  */
+    int                    n;        /**< Number of points in this dimension.       */
+    int                    ifac[15]; /**< 15 bytes needed for cfft and rfft         */
+    struct fftpack *       next;     /**< Pointer to next dimension, or NULL.       */
+    RealOpenMM *               work;     /**< 1st 4n reserved for cfft, 1st 2n for rfft */
+};
+static void 
+fftpack_passf2(int         ido, 
+               int         l1, 
+               RealOpenMM     cc[],
+               RealOpenMM     ch[],
+               RealOpenMM     wa1[],
+               int         isign)
+{
+    int i, k, ah, ac;
+    RealOpenMM ti2, tr2;
+    if (ido <= 2)
+    {
+        for (k=0; k<l1; k++) 
+        {
+            ah = k*ido;
+            ac = 2*k*ido;
+            ch[ah]              = cc[ac]   + cc[ac + ido];
+            ch[ah + ido*l1]     = cc[ac]   - cc[ac + ido];
+            ch[ah+1]            = cc[ac+1] + cc[ac + ido + 1];
+            ch[ah + ido*l1 + 1] = cc[ac+1] - cc[ac + ido + 1];
+        }
+    } 
+    else
+    {
+        for (k=0; k<l1; k++) 
+        {
+            for (i=0; i<ido-1; i+=2) 
+            {
+                ah              = i + k*ido;
+                ac              = i + 2*k*ido;
+                ch[ah]          = cc[ac] + cc[ac + ido];
+                tr2             = cc[ac] - cc[ac + ido];
+                ch[ah+1]        = cc[ac+1] + cc[ac + 1 + ido];
+                ti2             = cc[ac+1] - cc[ac + 1 + ido];
+                ch[ah+l1*ido+1] = wa1[i]*ti2 + isign*wa1[i+1]*tr2;
+                ch[ah+l1*ido]   = wa1[i]*tr2 - isign*wa1[i+1]*ti2;
+            }
+        }
+    }
+} 
+static void 
+fftpack_passf3(int         ido,
+               int         l1, 
+               RealOpenMM     cc[],
+               RealOpenMM     ch[],
+               RealOpenMM     wa1[], 
+               RealOpenMM     wa2[],
+               int         isign)
+{
+    const RealOpenMM taur = -0.5;
+    const RealOpenMM taui = 0.866025403784439;
+    int i, k, ac, ah;
+    RealOpenMM ci2, ci3, di2, di3, cr2, cr3, dr2, dr3, ti2, tr2;
+    if (ido == 2) 
+    {
+        for (k=1; k<=l1; k++) 
+        {
+            ac = (3*k - 2)*ido;
+            tr2 = cc[ac] + cc[ac + ido];
+            cr2 = cc[ac - ido] + taur*tr2;
+            ah = (k - 1)*ido;
+            ch[ah] = cc[ac - ido] + tr2;
+            ti2 = cc[ac + 1] + cc[ac + ido + 1];
+            ci2 = cc[ac - ido + 1] + taur*ti2;
+            ch[ah + 1] = cc[ac - ido + 1] + ti2;
+            cr3 = isign*taui*(cc[ac] - cc[ac + ido]);
+            ci3 = isign*taui*(cc[ac + 1] - cc[ac + ido + 1]);
+            ch[ah + l1*ido] = cr2 - ci3;
+            ch[ah + 2*l1*ido] = cr2 + ci3;
+            ch[ah + l1*ido + 1] = ci2 + cr3;
+            ch[ah + 2*l1*ido + 1] = ci2 - cr3;
+        }
+    } 
+    else
+    {
+        for (k=1; k<=l1; k++) 
+        {
+            for (i=0; i<ido-1; i+=2)
+            {
+                ac = i + (3*k - 2)*ido;
+                tr2 = cc[ac] + cc[ac + ido];
+                cr2 = cc[ac - ido] + taur*tr2;
+                ah = i + (k-1)*ido;
+                ch[ah] = cc[ac - ido] + tr2;
+                ti2 = cc[ac + 1] + cc[ac + ido + 1];
+                ci2 = cc[ac - ido + 1] + taur*ti2;
+                ch[ah + 1] = cc[ac - ido + 1] + ti2;
+                cr3 = isign*taui*(cc[ac] - cc[ac + ido]);
+                ci3 = isign*taui*(cc[ac + 1] - cc[ac + ido + 1]);
+                dr2 = cr2 - ci3;
+                dr3 = cr2 + ci3;
+                di2 = ci2 + cr3;
+                di3 = ci2 - cr3;
+                ch[ah + l1*ido + 1] = wa1[i]*di2 + isign*wa1[i+1]*dr2;
+                ch[ah + l1*ido] = wa1[i]*dr2 - isign*wa1[i+1]*di2;
+                ch[ah + 2*l1*ido + 1] = wa2[i]*di3 + isign*wa2[i+1]*dr3;
+                ch[ah + 2*l1*ido] = wa2[i]*dr3 - isign*wa2[i+1]*di3;
+            }
+        }
+    }
+} 
+static void 
+fftpack_passf4(int          ido, 
+               int          l1,
+               RealOpenMM      cc[], 
+               RealOpenMM      ch[],
+               RealOpenMM      wa1[],
+               RealOpenMM      wa2[], 
+               RealOpenMM      wa3[],
+               int          isign)
+{
+    int i, k, ac, ah;
+    RealOpenMM ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
+    if (ido == 2) 
+    {
+        for (k=0; k<l1; k++)
+        {
+            ac = 4*k*ido + 1;
+            ti1 = cc[ac] - cc[ac + 2*ido];
+            ti2 = cc[ac] + cc[ac + 2*ido];
+            tr4 = cc[ac + 3*ido] - cc[ac + ido];
+            ti3 = cc[ac + ido] + cc[ac + 3*ido];
+            tr1 = cc[ac - 1] - cc[ac + 2*ido - 1];
+            tr2 = cc[ac - 1] + cc[ac + 2*ido - 1];
+            ti4 = cc[ac + ido - 1] - cc[ac + 3*ido - 1];
+            tr3 = cc[ac + ido - 1] + cc[ac + 3*ido - 1];
+            ah = k*ido;
+            ch[ah] = tr2 + tr3;
+            ch[ah + 2*l1*ido] = tr2 - tr3;
+            ch[ah + 1] = ti2 + ti3;
+            ch[ah + 2*l1*ido + 1] = ti2 - ti3;
+            ch[ah + l1*ido] = tr1 + isign*tr4;
+            ch[ah + 3*l1*ido] = tr1 - isign*tr4;
+            ch[ah + l1*ido + 1] = ti1 + isign*ti4;
+            ch[ah + 3*l1*ido + 1] = ti1 - isign*ti4;
+        }
+    } 
+    else
+    {
+        for (k=0; k<l1; k++)
+        {
+            for (i=0; i<ido-1; i+=2)
+            {
+                ac = i + 1 + 4*k*ido;
+                ti1 = cc[ac] - cc[ac + 2*ido];
+                ti2 = cc[ac] + cc[ac + 2*ido];
+                ti3 = cc[ac + ido] + cc[ac + 3*ido];
+                tr4 = cc[ac + 3*ido] - cc[ac + ido];
+                tr1 = cc[ac - 1] - cc[ac + 2*ido - 1];
+                tr2 = cc[ac - 1] + cc[ac + 2*ido - 1];
+                ti4 = cc[ac + ido - 1] - cc[ac + 3*ido - 1];
+                tr3 = cc[ac + ido - 1] + cc[ac + 3*ido - 1];
+                ah = i + k*ido;
+                ch[ah] = tr2 + tr3;
+                cr3 = tr2 - tr3;
+                ch[ah + 1] = ti2 + ti3;
+                ci3 = ti2 - ti3;
+                cr2 = tr1 + isign*tr4;
+                cr4 = tr1 - isign*tr4;
+                ci2 = ti1 + isign*ti4;
+                ci4 = ti1 - isign*ti4;
+                ch[ah + l1*ido] = wa1[i]*cr2 - isign*wa1[i + 1]*ci2;
+                ch[ah + l1*ido + 1] = wa1[i]*ci2 + isign*wa1[i + 1]*cr2;
+                ch[ah + 2*l1*ido] = wa2[i]*cr3 - isign*wa2[i + 1]*ci3;
+                ch[ah + 2*l1*ido + 1] = wa2[i]*ci3 + isign*wa2[i + 1]*cr3;
+                ch[ah + 3*l1*ido] = wa3[i]*cr4 -isign*wa3[i + 1]*ci4;
+                ch[ah + 3*l1*ido + 1] = wa3[i]*ci4 + isign*wa3[i + 1]*cr4;
+            }
+        }
+    }
+} 
+static void 
+fftpack_passf5(int          ido,
+               int          l1, 
+               RealOpenMM      cc[],
+               RealOpenMM      ch[],
+               RealOpenMM      wa1[], 
+               RealOpenMM      wa2[],
+               RealOpenMM      wa3[], 
+               RealOpenMM      wa4[],
+               int          isign)
+{
+    const RealOpenMM tr11 = 0.309016994374947;
+    const RealOpenMM ti11 = 0.951056516295154;
+    const RealOpenMM tr12 = -0.809016994374947;
+    const RealOpenMM ti12 = 0.587785252292473;
+    int i, k, ac, ah;
+    RealOpenMM ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
+        ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
+    if (ido == 2) 
+    {
+        for (k = 1; k <= l1; ++k) 
+        {
+            ac = (5*k - 4)*ido + 1;
+            ti5 = cc[ac] - cc[ac + 3*ido];
+            ti2 = cc[ac] + cc[ac + 3*ido];
+            ti4 = cc[ac + ido] - cc[ac + 2*ido];
+            ti3 = cc[ac + ido] + cc[ac + 2*ido];
+            tr5 = cc[ac - 1] - cc[ac + 3*ido - 1];
+            tr2 = cc[ac - 1] + cc[ac + 3*ido - 1];
+            tr4 = cc[ac + ido - 1] - cc[ac + 2*ido - 1];
+            tr3 = cc[ac + ido - 1] + cc[ac + 2*ido - 1];
+            ah = (k - 1)*ido;
+            ch[ah] = cc[ac - ido - 1] + tr2 + tr3;
+            ch[ah + 1] = cc[ac - ido] + ti2 + ti3;
+            cr2 = cc[ac - ido - 1] + tr11*tr2 + tr12*tr3;
+            ci2 = cc[ac - ido] + tr11*ti2 + tr12*ti3;
+            cr3 = cc[ac - ido - 1] + tr12*tr2 + tr11*tr3;
+            ci3 = cc[ac - ido] + tr12*ti2 + tr11*ti3;
+            cr5 = isign*(ti11*tr5 + ti12*tr4);
+            ci5 = isign*(ti11*ti5 + ti12*ti4);
+            cr4 = isign*(ti12*tr5 - ti11*tr4);
+            ci4 = isign*(ti12*ti5 - ti11*ti4);
+            ch[ah + l1*ido] = cr2 - ci5;
+            ch[ah + 4*l1*ido] = cr2 + ci5;
+            ch[ah + l1*ido + 1] = ci2 + cr5;
+            ch[ah + 2*l1*ido + 1] = ci3 + cr4;
+            ch[ah + 2*l1*ido] = cr3 - ci4;
+            ch[ah + 3*l1*ido] = cr3 + ci4;
+            ch[ah + 3*l1*ido + 1] = ci3 - cr4;
+            ch[ah + 4*l1*ido + 1] = ci2 - cr5;
+        }
+    } 
+    else
+    {
+        for (k=1; k<=l1; k++) 
+        {
+            for (i=0; i<ido-1; i+=2) 
+            {
+                ac = i + 1 + (k*5 - 4)*ido;
+                ti5 = cc[ac] - cc[ac + 3*ido];
+                ti2 = cc[ac] + cc[ac + 3*ido];
+                ti4 = cc[ac + ido] - cc[ac + 2*ido];
+                ti3 = cc[ac + ido] + cc[ac + 2*ido];
+                tr5 = cc[ac - 1] - cc[ac + 3*ido - 1];
+                tr2 = cc[ac - 1] + cc[ac + 3*ido - 1];
+                tr4 = cc[ac + ido - 1] - cc[ac + 2*ido - 1];
+                tr3 = cc[ac + ido - 1] + cc[ac + 2*ido - 1];
+                ah = i + (k - 1)*ido;
+                ch[ah] = cc[ac - ido - 1] + tr2 + tr3;
+                ch[ah + 1] = cc[ac - ido] + ti2 + ti3;
+                cr2 = cc[ac - ido - 1] + tr11*tr2 + tr12*tr3;
+                ci2 = cc[ac - ido] + tr11*ti2 + tr12*ti3;
+                cr3 = cc[ac - ido - 1] + tr12*tr2 + tr11*tr3;
+                ci3 = cc[ac - ido] + tr12*ti2 + tr11*ti3;
+                cr5 = isign*(ti11*tr5 + ti12*tr4);
+                ci5 = isign*(ti11*ti5 + ti12*ti4);
+                cr4 = isign*(ti12*tr5 - ti11*tr4);
+                ci4 = isign*(ti12*ti5 - ti11*ti4);
+                dr3 = cr3 - ci4;
+                dr4 = cr3 + ci4;
+                di3 = ci3 + cr4;
+                di4 = ci3 - cr4;
+                dr5 = cr2 + ci5;
+                dr2 = cr2 - ci5;
+                di5 = ci2 - cr5;
+                di2 = ci2 + cr5;
+                ch[ah + l1*ido] = wa1[i]*dr2 - isign*wa1[i+1]*di2;
+                ch[ah + l1*ido + 1] = wa1[i]*di2 + isign*wa1[i+1]*dr2;
+                ch[ah + 2*l1*ido] = wa2[i]*dr3 - isign*wa2[i+1]*di3;
+                ch[ah + 2*l1*ido + 1] = wa2[i]*di3 + isign*wa2[i+1]*dr3;
+                ch[ah + 3*l1*ido] = wa3[i]*dr4 - isign*wa3[i+1]*di4;
+                ch[ah + 3*l1*ido + 1] = wa3[i]*di4 + isign*wa3[i+1]*dr4;
+                ch[ah + 4*l1*ido] = wa4[i]*dr5 - isign*wa4[i+1]*di5;
+                ch[ah + 4*l1*ido + 1] = wa4[i]*di5 + isign*wa4[i+1]*dr5;
+            }
+        }
+    }
+} 
+static void 
+fftpack_passf(int *        nac, 
+              int          ido,
+              int          ip, 
+              int          l1,
+              int          idl1,
+              RealOpenMM      cc[],
+              RealOpenMM      ch[],
+              RealOpenMM      wa[],
+              int          isign)
+{
+    int idij, idlj, idot, ipph, i, j, k, l, jc, lc, ik, nt, idj, idl, inc,idp;
+    RealOpenMM wai, war;
+    idot = ido / 2;
+    nt = ip*idl1;
+    ipph = (ip + 1) / 2;
+    idp = ip*ido;
+    if (ido >= l1) 
+    {
+        for (j=1; j<ipph; j++)
+        {
+            jc = ip - j;
+            for (k=0; k<l1; k++) 
+            {
+                for (i=0; i<ido; i++) 
+                {
+                    ch[i + (k + j*l1)*ido]  = cc[i + (j + k*ip)*ido] + cc[i + (jc + k*ip)*ido];
+                    ch[i + (k + jc*l1)*ido] = cc[i + (j + k*ip)*ido] - cc[i + (jc + k*ip)*ido];
+                }
+            }
+        }
+        for (k=0; k<l1; k++)
+            for (i=0; i<ido; i++)
+                ch[i + k*ido] = cc[i + k*ip*ido];
+    } 
+    else
+    {
+        for (j=1; j<ipph; j++) 
+        {
+            jc = ip - j;
+            for (i=0; i<ido; i++) 
+            {
+                for (k=0; k<l1; k++) 
+                {
+                    ch[i + (k + j*l1)*ido] =  cc[i + (j + k*ip)*ido] + cc[i + (jc + k*ip)*ido];
+                    ch[i + (k + jc*l1)*ido] = cc[i + (j + k*ip)*ido] - cc[i + (jc + k*ip)*ido];
+                }
+            }
+        }
+        for (i=0; i<ido; i++)
+            for (k=0; k<l1; k++)
+                ch[i + k*ido] = cc[i + k*ip*ido];
+    }
+    idl = 2 - ido;
+    inc = 0;
+    for (l=1; l<ipph; l++) 
+    {
+        lc = ip - l;
+        idl += ido;
+        for (ik=0; ik<idl1; ik++)
+        {
+            cc[ik + l*idl1] = ch[ik] + wa[idl - 2]*ch[ik + idl1];
+            cc[ik + lc*idl1] = isign*wa[idl-1]*ch[ik + (ip-1)*idl1];
+        }
+        idlj = idl;
+        inc += ido;
+        for (j=2; j<ipph; j++)
+        {
+            jc = ip - j;
+            idlj += inc;
+            if (idlj > idp) idlj -= idp;
+            war = wa[idlj - 2];
+            wai = wa[idlj-1];
+            for (ik=0; ik<idl1; ik++)
+            {
+                cc[ik + l*idl1] += war*ch[ik + j*idl1];
+                cc[ik + lc*idl1] += isign*wai*ch[ik + jc*idl1];
+            }
+        }
+    }
+    for (j=1; j<ipph; j++)
+        for (ik=0; ik<idl1; ik++)
+            ch[ik] += ch[ik + j*idl1];
+    for (j=1; j<ipph; j++) 
+    {
+        jc = ip - j;
+        for (ik=1; ik<idl1; ik+=2) 
+        {
+            ch[ik - 1 + j*idl1] = cc[ik - 1 + j*idl1] - cc[ik + jc*idl1];
+            ch[ik - 1 + jc*idl1] = cc[ik - 1 + j*idl1] + cc[ik + jc*idl1];
+            ch[ik + j*idl1] = cc[ik + j*idl1] + cc[ik - 1 + jc*idl1];
+            ch[ik + jc*idl1] = cc[ik + j*idl1] - cc[ik - 1 + jc*idl1];
+        }
+    }
+    *nac = 1;
+    if (ido == 2) 
+        return;
+    *nac = 0;
+    for (ik=0; ik<idl1; ik++)
+    {
+        cc[ik] = ch[ik];
+    }
+    for (j=1; j<ip; j++)
+    {
+        for (k=0; k<l1; k++) 
+        {
+            cc[(k + j*l1)*ido + 0] = ch[(k + j*l1)*ido + 0];
+            cc[(k + j*l1)*ido + 1] = ch[(k + j*l1)*ido + 1];
+        }
+    }
+    if (idot <= l1) 
+    {
+        idij = 0;
+        for (j=1; j<ip; j++)
+        {
+            idij += 2;
+            for (i=3; i<ido; i+=2) 
+            {
+                idij += 2;
+                for (k=0; k<l1; k++)
+                {
+                    cc[i - 1 + (k + j*l1)*ido] =
+                    wa[idij - 2]*ch[i - 1 + (k + j*l1)*ido] -
+                    isign*wa[idij-1]*ch[i + (k + j*l1)*ido];
+                    cc[i + (k + j*l1)*ido] =
+                        wa[idij - 2]*ch[i + (k + j*l1)*ido] +
+                        isign*wa[idij-1]*ch[i - 1 + (k + j*l1)*ido];
+                }
+            }
+        }
+    }
+    else
+    {
+        idj = 2 - ido;
+        for (j=1; j<ip; j++) 
+        {
+            idj += ido;
+            for (k = 0; k < l1; k++) 
+            {
+                idij = idj;
+                for (i=3; i<ido; i+=2)
+                {
+                    idij += 2;
+                    cc[i - 1 + (k + j*l1)*ido] =
+                        wa[idij - 2]*ch[i - 1 + (k + j*l1)*ido] -
+                        isign*wa[idij-1]*ch[i + (k + j*l1)*ido];
+                    cc[i + (k + j*l1)*ido] =
+                        wa[idij - 2]*ch[i + (k + j*l1)*ido] +
+                        isign*wa[idij-1]*ch[i - 1 + (k + j*l1)*ido];
+                }
+            }
+        }
+    }
+} 
+static void 
+fftpack_cfftf1(int          n,
+               RealOpenMM      c[],
+               RealOpenMM      ch[],
+               RealOpenMM      wa[],
+               int          ifac[15],
+               int          isign)
+{
+    int idot, i;
+    int k1, l1, l2;
+    int na, nf, ip, iw, ix2, ix3, ix4, nac, ido, idl1;
+    RealOpenMM *cinput, *coutput;
+    nf = ifac[1];
+    na = 0;
+    l1 = 1;
+    iw = 0;
+    for (k1=2; k1<=nf+1; k1++) 
+    {
+        ip = ifac[k1];
+        l2 = ip*l1;
+        ido = n / l2;
+        idot = ido + ido;
+        idl1 = idot*l1;
+        if (na) 
+        {
+            cinput = ch;
+            coutput = c;
+        }
+        else 
+        {
+            cinput = c;
+            coutput = ch;
+        }
+        switch (ip) 
+        {
+            case 4:
+                ix2 = iw + idot;
+                ix3 = ix2 + idot;
+                fftpack_passf4(idot, l1, cinput, coutput, &wa[iw], &wa[ix2], &wa[ix3], isign);
+                na = !na;
+                break;
+            case 2:
+                fftpack_passf2(idot, l1, cinput, coutput, &wa[iw], isign);
+                na = !na;
+                break;
+            case 3:
+                ix2 = iw + idot;
+                fftpack_passf3(idot, l1, cinput, coutput, &wa[iw], &wa[ix2], isign);
+                na = !na;
+                break;
+            case 5:
+                ix2 = iw + idot;
+                ix3 = ix2 + idot;
+                ix4 = ix3 + idot;
+                fftpack_passf5(idot, l1, cinput, coutput, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
+                na = !na;
+                break;
+            default:
+                fftpack_passf(&nac, idot, ip, l1, idl1, cinput, coutput, &wa[iw], isign);
+                if (nac != 0) na = !na;
+        }
+        l1 = l2;
+        iw += (ip - 1)*idot;
+    }
+    if (na == 0) 
+        return;
+    for (i=0; i<2*n; i++) 
+        c[i] = ch[i];
+}
+static void 
+fftpack_factorize(int    n,
+                  int    ifac[15])
+{
+    static const int ntryh[4] = { 3,4,2,5 };
+    int ntry=3, i, j=0, ib, nf=0, nl=n, nq, nr;
+startloop:
+    if (j < 4)
+        ntry = ntryh[j];
+    else
+        ntry+= 2;
+    j++;
+    do 
+    {
+        nq = nl / ntry;
+        nr = nl - ntry*nq;
+        if (nr != 0) goto startloop;
+        nf++;
+        ifac[nf + 1] = ntry;
+        nl = nq;
+        if (ntry == 2 && nf != 1) 
+        {
+            for (i=2; i<=nf; i++) 
+            {
+                ib = nf - i + 2;
+                ifac[ib + 1] = ifac[ib];
+            }
+            ifac[2] = 2;
+        }
+    } 
+    while (nl != 1);
+    ifac[0] = n;
+    ifac[1] = nf;
+}
+static void
+fftpack_cffti1(int          n, 
+               RealOpenMM      wa[], 
+               int          ifac[15])
+{
+    const RealOpenMM twopi = 6.28318530717959;
+    RealOpenMM arg, argh, argld, fi;
+    int idot, i, j;
+    int i1, k1, l1, l2;
+    int ld, ii, nf, ip;
+    int ido, ipm;
+    fftpack_factorize(n,ifac);
+    nf = ifac[1];
+    argh = twopi/(RealOpenMM)n;
+    i = 1;
+    l1 = 1;
+    for (k1=1; k1<=nf; k1++) 
+    {
+        ip = ifac[k1+1];
+        ld = 0;
+        l2 = l1*ip;
+        ido = n / l2;
+        idot = ido + ido + 2;
+        ipm = ip - 1;
+        for (j=1; j<=ipm; j++)
+        {
+            i1 = i;
+            wa[i-1] = 1;
+            wa[i] = 0;
+            ld += l1;
+            fi = 0;
+            argld = ld*argh;
+            for (ii=4; ii<=idot; ii+=2) 
+            {
+                i+= 2;
+                fi+= 1;
+                arg = fi*argld;
+                wa[i-1] = cos(arg);
+                wa[i] = sin(arg);
+            }
+            if (ip > 5) 
+            {
+                wa[i1-1] = wa[i-1];
+                wa[i1] = wa[i];
+            }
+        }
+        l1 = l2;
+    }
+} 
+static int
+fftpack_transpose_2d(t_complex *          in_data,
+                     t_complex *          out_data,
+                     int                  nx,
+                     int                  ny)
+{
+	t_complex *  src;
+	int          i,j;
+    if(nx<2 || ny<2)
+    {
+        if(in_data != out_data)
+        {
+            memcpy(out_data,in_data,sizeof(t_complex)*nx*ny);
+        }
+        return 0;
+    }
+    if(in_data == out_data)
+	{
+		src = (t_complex *)malloc(sizeof(t_complex)*nx*ny);
+		memcpy(src,in_data,sizeof(t_complex)*nx*ny);
+	}
+	else
+	{
+		src = in_data;
+	}
+	for(i=0;i<nx;i++)
+	{
+		for(j=0;j<ny;j++)
+		{
+			out_data[j*nx+i].re = src[i*ny+j].re;
+			out_data[j*nx+i].im = src[i*ny+j].im;
+		}
+	}
+	if(src != in_data)
+	{
+		free(src);
+	}
+	return 0;
+}
+static int
+fftpack_transpose_2d_nelem(t_complex *          in_data,
+						   t_complex *          out_data,
+						   int                  nx,
+						   int                  ny,
+                           int                  nelem)
+{
+	t_complex *   src;
+	int           ncopy;
+	int           i,j;
+	ncopy = nelem*sizeof(t_complex);
+    if(nx<2 || ny<2)
+    {
+        if(in_data != out_data)
+        {
+            memcpy(out_data,in_data,nx*ny*ncopy);
+        }
+        return 0;
+    }
+    if(in_data == out_data)
+	{
+		src = (t_complex *)malloc(nx*ny*ncopy);
+		memcpy(src,in_data,nx*ny*ncopy);
+	}
+	else
+	{
+		src = in_data;
+	}
+	for(i=0;i<nx;i++)
+	{
+		for(j=0;j<ny;j++)
+		{
+			memcpy(out_data + (j*nx+i)*nelem , src + (i*ny+j)*nelem , ncopy);
+		}
+	}
+	if(src != in_data)
+	{
+		free(src);
+	}
+	return 0;
+}
+int
+fftpack_init_1d(fftpack_t *        pfft,
+                int                nx)
+{
+    fftpack_t    fft;
+    if(pfft==NULL)
+    {
+        fprintf(stderr,"Fatal error - Invalid FFT opaque type pointer.");
+        return EINVAL;
+    }
+    *pfft = NULL;
+    if( (fft = (struct fftpack *)malloc(sizeof(struct fftpack))) == NULL)
+    {
+        return ENOMEM;
+    }    
+    fft->next = NULL;
+    fft->n    = nx;
+    /* Need 4*n storage for 1D complex FFT */
+    if( (fft->work = (RealOpenMM *)malloc(sizeof(RealOpenMM)*(4*nx))) == NULL) 
+    {
+        free(fft);
+        return ENOMEM;
+    }
+    if(fft->n>1)
+        fftpack_cffti1(nx,fft->work,fft->ifac);
+    *pfft = fft;
+    return 0;
+};
+int
+fftpack_init_2d(fftpack_t *        pfft,
+                int                nx,
+                int                ny)
+{
+    fftpack_t     fft;
+    int           rc;
+    if(pfft==NULL)
+    {
+        fprintf(stderr,"Fatal error - Invalid FFT opaque type pointer.");
+        return EINVAL;
+    }
+    *pfft = NULL;
+    /* Create the X transform */
+    if( (rc = fftpack_init_1d(&fft,nx)) != 0)
+    {
+        return rc;
+    }    
+    /* Create Y transform as a link from X */
+    if( (rc=fftpack_init_1d(&(fft->next),ny)) != 0)
+    {
+        free(fft);
+        return rc;
+    }
+    *pfft = fft;
+    return 0;
+};
+int
+fftpack_init_3d(fftpack_t *        pfft,
+                int                nx,
+                int                ny,
+                int                nz)
+{
+    fftpack_t     fft;
+    int           rc;
+    if(pfft==NULL)
+    {
+        fprintf(stderr,"Fatal error - Invalid FFT opaque type pointer.");
+        return EINVAL;
+    }
+    *pfft = NULL;
+    /* Create the X transform */
+    if( (fft = (struct fftpack *)malloc(sizeof(struct fftpack))) == NULL)
+    {
+        return ENOMEM;
+    }    
+    fft->n    = nx;
+    /* Need 4*nx storage for 1D complex FFT.
+     */
+    if( (fft->work = (RealOpenMM *)malloc(sizeof(RealOpenMM)*(4*nx))) == NULL) 
+    {
+        free(fft);
+        return ENOMEM;
+    }
+    fftpack_cffti1(nx,fft->work,fft->ifac);
+    /* Create 2D Y/Z transforms as a link from X */
+    if( (rc=fftpack_init_2d(&(fft->next),ny,nz)) != 0)
+    {
+        free(fft);
+        return rc;
+    }
+    *pfft = fft;
+	return 0;
+};
+int 
+fftpack_exec_1d          (fftpack_t                  fft,
+                          enum fftpack_direction     dir,
+                          t_complex *                in_data,
+                          t_complex *                out_data)
+{
+    int             i,n;
+    RealOpenMM *    p1;
+    RealOpenMM *    p2;
+    n=fft->n;
+    if(n==1)
+    {
+        p1 = (RealOpenMM *)in_data;
+        p2 = (RealOpenMM *)out_data;        
+        p2[0] = p1[0];
+        p2[1] = p1[1];
+    }
+    /* FFTPACK only does in-place transforms, so emulate out-of-place
+     * by copying data to the output array first.
+     */
+    if( in_data != out_data )
+    {
+        p1 = (RealOpenMM *)in_data;
+        p2 = (RealOpenMM *)out_data;
+        /* n complex = 2*n RealOpenMM elements */
+        for(i=0;i<2*n;i++)
+        {
+            p2[i] = p1[i];
+        }
+    }
+    /* Elements 0   .. 2*n-1 in work are used for ffac values,
+     * Elements 2*n .. 4*n-1 are internal FFTPACK work space.
+     */
+    if(dir == FFTPACK_FORWARD) 
+    {
+        fftpack_cfftf1(n,(RealOpenMM *)out_data,fft->work+2*n,fft->work,fft->ifac, -1);
+    }
+    else if(dir == FFTPACK_BACKWARD)
+    {
+        fftpack_cfftf1(n,(RealOpenMM *)out_data,fft->work+2*n,fft->work,fft->ifac, 1); 
+    }
+    else
+    {
+        fprintf(stderr,"FFT plan mismatch - bad plan or direction.");
+        return EINVAL;
+    }    
+    return 0;
+}
+int 
+fftpack_exec_2d          (fftpack_t                  fft,
+                          enum fftpack_direction     dir,
+                          t_complex *                in_data,
+                          t_complex *                out_data)
+{
+    int                i,nx,ny;
+    t_complex *    data;
+    nx = fft->n;
+    ny = fft->next->n;
+    /* FFTPACK only does in-place transforms, so emulate out-of-place
+     * by copying data to the output array first.
+     * For 2D there is likely enough data to benefit from memcpy().
+     */
+    if( in_data != out_data )
+    {
+        memcpy(out_data,in_data,sizeof(t_complex)*nx*ny);
+    }
+    /* Much easier to do pointer arithmetic when base has the correct type */
+    data = (t_complex *)out_data;
+    /* y transforms */
+    for(i=0;i<nx;i++)
+    {
+        fftpack_exec_1d(fft->next,dir,data+i*ny,data+i*ny);
+    }
+    /* Transpose in-place to get data in place for x transform now */
+    fftpack_transpose_2d(data,data,nx,ny);
+    /* x transforms */
+    for(i=0;i<ny;i++)
+    {
+        fftpack_exec_1d(fft,dir,data+i*nx,data+i*nx);
+    }
+    /* Transpose in-place to get data back in original order */
+    fftpack_transpose_2d(data,data,ny,nx);
+    return 0;
+}
+int 
+fftpack_exec_3d     (fftpack_t                  fft,
+                     enum fftpack_direction     dir,
+                     t_complex *                in_data,
+                     t_complex *                out_data)
+{
+    int              i,nx,ny,nz,rc;
+    t_complex *      data;
+    nx=fft->n;
+    ny=fft->next->n;
+    nz=fft->next->next->n;
+    /* FFTPACK only does in-place transforms, so emulate out-of-place
+     * by copying data to the output array first.
+     * For 3D there is likely enough data to benefit from memcpy().
+     */
+    if( in_data != out_data )
+    {
+        memcpy(out_data,in_data,sizeof(t_complex)*nx*ny*nz);
+    }
+    /* Much easier to do pointer arithmetic when base has the correct type */
+    data = (t_complex *)out_data;
+    /* Perform z transforms */
+    for(i=0;i<nx*ny;i++)
+        fftpack_exec_1d(fft->next->next,dir,data+i*nz,data+i*nz);
+    /* For each X slice, transpose the y & z dimensions inside the slice */
+    for(i=0;i<nx;i++)
+    {
+        fftpack_transpose_2d(data+i*ny*nz,data+i*ny*nz,ny,nz);
+    }
+    /* Array is now (nx,nz,ny) - perform y transforms */
+    for(i=0;i<nx*nz;i++)
+    {
+        fftpack_exec_1d(fft->next,dir,data+i*ny,data+i*ny);
+    }
+    /* Transpose back to (nx,ny,nz) */
+    for(i=0;i<nx;i++)
+    {
+        fftpack_transpose_2d(data+i*ny*nz,data+i*ny*nz,nz,ny);
+    }
+    /* Transpose entire x & y slices to go from
+     * (nx,ny,nz) to (ny,nx,nz).
+     */
+    rc=fftpack_transpose_2d_nelem(data,data,nx,ny,nz);
+    if( rc != 0)
+    {
+        fprintf(stderr,"Fatal error - cannot transpose X & Y/Z in fftpack_exec_3d().");
+        return rc;
+    }
+    /* Then go from (ny,nx,nz) to (ny,nz,nx) */
+    for(i=0;i<ny;i++)
+    {
+        fftpack_transpose_2d(data+i*nx*nz,data+i*nx*nz,nx,nz);
+    }
+    /* Perform x transforms */
+    for(i=0;i<ny*nz;i++)
+    {
+        fftpack_exec_1d(fft,dir,data+i*nx,data+i*nx);
+    }
+    /* Transpose back from (ny,nz,nx) to (ny,nx,nz) */
+    for(i=0;i<ny;i++)
+    {
+        fftpack_transpose_2d(data+i*nz*nx,data+i*nz*nx,nz,nx);
+    }
+    /* Transpose from (ny,nx,nz) to (nx,ny,nz). 
+     */
+    rc = fftpack_transpose_2d_nelem(data,data,ny,nx,nz);
+    if( rc != 0)
+    {
+        fprintf(stderr,"Fatal error - cannot transpose Y/Z & X in fftpack_exec_3d().");
+        return rc;
+    }
+    return 0;
+}
+void
+fftpack_destroy(fftpack_t      fft)
+{
+    if(fft != NULL) 
+    {
+        free(fft->work);
+        if(fft->next != NULL)
+            fftpack_destroy(fft->next);
+        free(fft);
+    }
+}
--- a/platforms/reference/src/SimTKReference/fftpack.h
+++ b/platforms/reference/src/SimTKReference/fftpack.h
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- 
+ *
+ *
+ * Gromacs 4.0                         Copyright (c) 1991-2003
+ * David van der Spoel, Erik Lindahl, University of Groningen.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org
+ * 
+ * And Hey:
+ * Gnomes, ROck Monsters And Chili Sauce
+ */
+#include "../SimTKUtilities/SimTKOpenMMCommon.h"
+#ifndef _FFTPACK_H_
+#define _FFTPACK_H_
+#include <stdio.h>
+#ifdef __cplusplus
+extern "C" {
+#endif
+#if 0
+} /* fixes auto-indentation problems */
+#endif
+typedef struct {
+	RealOpenMM re;
+	RealOpenMM im;
+} t_complex;
+/*! \brief Datatype for FFT setup 
+ *
+ *  The fftpack_t type contains all the setup information, e.g. twiddle
+ *  factors, necessary to perform an FFT. Internally it is mapped to 
+ *  whatever FFT library we are using, or the built-in FFTPACK if no fast
+ *  external library is available.
+ */
+typedef struct fftpack *
+fftpack_t;
+/*! \brief Specifier for FFT direction. 
+ *
+ *  The definition of the 1D forward transform from input x[] to output y[] is
+ *  \f[
+ *  y_{k} = \sum_{j=0}^{N-1} x_{j} \exp{-i 2 \pi j k /N}
+ *  \f]
+ *
+ *  while the corresponding backward transform is
+ *
+ *  \f[
+ *  y_{k} = \sum_{j=0}^{N-1} x_{j} \exp{i 2 \pi j k /N}
+ *  \f]
+ *
+ *  A forward-backward transform pair will this result in data scaled by N.
+ *
+ */
+typedef enum fftpack_direction
+{
+    FFTPACK_FORWARD,         /*!< Forward complex-to-complex transform  */
+    FFTPACK_BACKWARD,        /*!< Backward complex-to-complex transform */
+} fftpack_direction;
+/*! \brief Setup a 1-dimensional complex-to-complex transform 
+ *
+ *  \param fft    Pointer to opaque Gromacs FFT datatype
+ *  \param nx     Length of transform 
+ *
+ *  \return status - 0 or a standard error message.
+ */
+int
+fftpack_init_1d        (fftpack_t *       fft,
+                        int               nx);
+/*! \brief Setup a 2-dimensional complex-to-complex transform 
+ *
+ *  \param fft    Pointer to opaque Gromacs FFT datatype
+ *  \param nx     Length of transform in first dimension
+ *  \param ny     Length of transform in second dimension
+ *
+ *  \return status - 0 or a standard error message.
+ *   
+ */
+int
+fftpack_init_2d        (fftpack_t *         fft,
+                        int                 nx, 
+                        int                 ny);
+/*! \brief Setup a 3-dimensional complex-to-complex transform 
+ *
+ *  \param fft    Pointer to opaque Gromacs FFT datatype
+ *  \param nx     Length of transform in first dimension
+ *  \param ny     Length of transform in second dimension
+ *  \param nz     Length of transform in third dimension
+ *
+ *  \return status - 0 or a standard error message.
+ *   
+ */
+int
+fftpack_init_3d        (fftpack_t *         fft,
+                        int                 nx, 
+                        int                 ny,
+                        int                 nz);
+/*! \brief Perform a 1-dimensional complex-to-complex transform
+ *
+ *  Performs an instance of a transform previously initiated.
+ *
+ *  \param setup     Setup returned from fftpack_init_1d()
+ *  \param dir       Forward or Backward
+ *  \param in_data   Input grid data. 
+ *  \param out_data  Output grid data. 
+ *                   You can provide the same pointer for in_data and out_data
+ *                   to perform an in-place transform.
+ *
+ * \return 0 on success, or an error code.
+ *
+ * \note Data pointers are declared as void, to avoid casting pointers 
+ *       depending on your grid type.
+ */
+int 
+fftpack_exec_1d          (fftpack_t                  setup,
+                          enum fftpack_direction     dir,
+                          t_complex *                in_data,
+                          t_complex *                out_data);
+/*! \brief Perform a 2-dimensional complex-to-complex transform
+ *
+ *  Performs an instance of a transform previously initiated.
+ *
+ *  \param setup     Setup returned from fftpack_init_1d()
+ *  \param dir       Forward or Backward
+ *  \param in_data   Input grid data. 
+ *  \param out_data  Output grid data. 
+ *                   You can provide the same pointer for in_data and out_data
+ *                   to perform an in-place transform.
+ *
+ * \return 0 on success, or an error code.
+ *
+ * \note Data pointers are declared as void, to avoid casting pointers 
+ *       depending on your grid type.
+ */
+int 
+fftpack_exec_2d          (fftpack_t                  setup,
+                          enum fftpack_direction     dir,
+                          t_complex *                in_data,
+                          t_complex *                out_data);
+/*! \brief Perform a 3-dimensional complex-to-complex transform
+ *
+ *  Performs an instance of a transform previously initiated.
+ *
+ *  \param setup     Setup returned from fftpack_init_1d()
+ *  \param dir       Forward or Backward
+ *  \param in_data   Input grid data. 
+ *  \param out_data  Output grid data. 
+ *                   You can provide the same pointer for in_data and out_data
+ *                   to perform an in-place transform.
+ *
+ * \return 0 on success, or an error code.
+ *
+ * \note Data pointers are declared as void, to avoid casting pointers 
+ *       depending on your grid type.
+ */
+int 
+fftpack_exec_3d          (fftpack_t                  setup,
+                          enum fftpack_direction     dir,
+                          t_complex *                in_data,
+                          t_complex *                out_data);
+/*! \brief Release an FFT setup structure 
+ *
+ *  Destroy setup and release all allocated memory.
+ *
+ *  \param setup Setup returned from fftpack_init_1d(), or one
+ *		 of the other initializers.
+ *
+ */
+void
+fftpack_destroy          (fftpack_t                 setup);
+#ifdef __cplusplus
+}
+#endif
+#endif /* _FFTPACK_H_ */
--- a/platforms/reference/src/SimTKReference/pme.cpp
+++ b/platforms/reference/src/SimTKReference/pme.cpp
+/* 
+ * Reference implementation of PME reciprocal space interactions.
+ * 
+ * Copyright (c) 2009, Erik Lindahl, Rossen Apostolov, Szilard Pall
+ * All rights reserved.
+ * Contact: lindahl@cbr.su.se Stockholm University, Sweden.
+ * 
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer. Redistributions in binary 
+ * form must reproduce the above copyright notice, this list of conditions and 
+ * the following disclaimer in the documentation and/or other materials provided 
+ * with the distribution.
+ * Neither the name of the author/university nor the names of its contributors may 
+ * be used to endorse or promote products derived from this software without 
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "pme.h"
+#include "fftpack.h"
+//#define ONE_4PI_EPS0 (33.20636930*4.184) /* Units of kJ/mol and nm */
+// In OpenMM, atom charges are already scaled by sqrt(ONE_4PI_EPS0), so we don't need this
+#define ONE_4PI_EPS0 (1.0) /* Units of kJ/mol and nm */
+typedef int    ivec[3];
+struct pme
+{
+	int          natoms;
+	RealOpenMM       ewaldcoeff;
+	t_complex *  grid;                 /* Memory for the grid we spread charges on.
+                                        * Element (i,j,k) is accessed as:
+                                        * grid[i*ngrid[1]*ngrid[2] + j*ngrid[2] + k]
+                                        */
+	int          ngrid[3];             /* Total grid dimensions (all data is complex!) */
+	fftpack_t    fftplan;              /* Handle to fourier transform setup  */
+	int          order;                /* PME interpolation order. Almost always 4 */
+    /* Data for bspline interpolation, see the Essman PME paper */
+	RealOpenMM *     bsplines_moduli[3];   /* 3 pointers, to x/y/z bspline moduli, each of length ngrid[x/y/z]   */
+	RealOpenMM *     bsplines_theta[3];    /* each of x/y/z has length order*natoms */
+	RealOpenMM *     bsplines_dtheta[3];   /* each of x/y/z has length order*natoms */
+	ivec *       particleindex;        /* Array of length natoms. Each element is 
+                                        * an ivec (3 ints) that specify the grid
+                                        * indices for that particular atom. Updated every step!
+                                        */
+    rvec *       particlefraction;     /* Array of length natoms. Fractional offset in the grid for 
+                                        * each atom in all three dimensions.
+                                        */
+    /* Further explanation of index/fraction:
+     *
+     * Assume we have a cell of size 10*10*10nm, and a total grid dimension of 100*100*100 cells.
+     * In other words, each cell is 0.1*0.1*0.1 nm.
+     *
+     * If particle i has coordinates { 0.543 , 6.235 , -0.73 }, we will get:
+     *
+     * particleindex[i]    = { 5 , 62 , 92 }         (-0.73 + 10 = 9.27, we always apply PBC for grid calculations!)
+     * particlefraction[i] = { 0.43 , 0.35 , 0.7 }   ( this is the fraction of the cell length where the atom is)
+     *
+     * (The reason for precaculating / storing these is that it gets a bit more complex for triclinic cells :-)
+     *
+     * In the current code version we might assume that a particle is not more than a whole box length away from
+     * the central cell, i.e., in this case we would assume all coordinates fall in -10 nm < x,y,z < 20 nm.
+     */
+	RealOpenMM       epsilon_r;             /* Dielectric coefficient to use, typically 1.0 */
+}; 
+/* Internal setup routines */
+/* Only called once from init_pme(), performance does not matter! */
+static void 
+pme_calculate_bsplines_moduli(pme_t pme)
+{
+	int       nmax;
+	int       i,j,k,l,d;
+	int       order;
+	int       ndata;
+	RealOpenMM *  data;
+	RealOpenMM *  ddata;
+	RealOpenMM *  bsplines_data;
+	RealOpenMM    div;
+	RealOpenMM    sc,ss,arg;
+	nmax = 0;
+	for(d=0;d<3;d++)
+	{
+		nmax = (pme->ngrid[d] > nmax) ? pme->ngrid[d] : nmax;
+		pme->bsplines_moduli[d] = (RealOpenMM *) malloc(sizeof(RealOpenMM)*pme->ngrid[d]);
+	}
+	order = pme->order;
+	/* temp storage in this routine */
+	data          = (RealOpenMM *) malloc(sizeof(RealOpenMM)*order);
+	ddata         = (RealOpenMM *) malloc(sizeof(RealOpenMM)*order);
+	bsplines_data = (RealOpenMM *) malloc(sizeof(RealOpenMM)*nmax);
+	data[order-1]=0;
+	data[1]=0;
+	data[0]=1;
+	for(k=3;k<order;k++)
+	{
+		div=1.0/(k-1.0);
+		data[k-1]=0;
+		for(l=1;l<(k-1);l++)
+		{
+			data[k-l-1]=div*(l*data[k-l-2]+(k-l)*data[k-l-1]);
+		}
+		data[0]=div*data[0];
+	}
+	/* differentiate */
+	ddata[0]=-data[0];
+	for(k=1;k<order;k++)
+	{
+		ddata[k]=data[k-1]-data[k];
+	}
+	div=1.0/(order-1);
+	data[order-1]=0;
+	for(l=1;l<(order-1);l++)
+	{
+		data[order-l-1]=div*(l*data[order-l-2]+(order-l)*data[order-l-1]);
+	}
+	data[0]=div*data[0]; 
+	for(i=0;i<nmax;i++)
+	{
+		bsplines_data[i]=0;
+	}
+	for(i=1;i<=order;i++)
+	{
+		bsplines_data[i]=data[i-1];
+    }
+	/* Evaluate the actual bspline moduli for X/Y/Z */
+	for(d=0;d<3;d++)
+	{
+		ndata = pme->ngrid[d];
+		for(i=0;i<ndata;i++)
+		{
+			sc=ss=0;
+			for(j=0;j<ndata;j++) 
+			{
+				arg=(2.0*M_PI*i*j)/ndata;
+				sc+=bsplines_data[j]*cos(arg);
+				ss+=bsplines_data[j]*sin(arg);
+			}
+			pme->bsplines_moduli[d][i]=sc*sc+ss*ss;
+		}
+		for(i=0;i<ndata;i++)
+		{
+			if(pme->bsplines_moduli[d][i]<1.0e-7)
+			{
+				pme->bsplines_moduli[d][i]=(pme->bsplines_moduli[d][i-1]+pme->bsplines_moduli[d][i+1])*0.5;
+			}
+		}
+	}		
+	/* Release temp storage */
+	free(data);
+	free(ddata);
+	free(bsplines_data);
+}
+static void
+pme_update_grid_index_and_fraction(pme_t    pme,
+								   RealOpenMM ** atomCoordinates,
+								   const RealOpenMM   periodicBoxSize[3])
+{
+	int    i;
+	int    d;
+	RealOpenMM t;
+	int    ti;
+	for(i=0;i<pme->natoms;i++)
+	{
+		for(d=0;d<3;d++)
+		{
+			/* Index calculation (Look mom, no conditionals!):
+             *
+             * Both for Cuda and modern CPUs it is nice to avoid conditionals, but we still need to apply periodic boundary conditions.
+             * Instead of having loops to add/subtract the box dimension, we do it this way:
+             *
+			 * 1. First add the box size, to make sure this atom coordinate isnt -0.1 or something.
+             *    After this we assume all fractional box positions are *positive*.
+             *    The reason for this is that we always want to round coordinates _down_ to get 
+             *    their grid index, and when taking the integer part of -3.4 we would get -3, not -4 as we want.
+             *    Since we anyway need the grid indices to fall in the central box, it is more convenient
+             *    to first manipulate the coordinates to be positive.
+			 * 2. Convert to integer grid index
+             *    Since we have added a whole box unit in step 1, this index might actually be larger than
+             *    the grid dimension. Examples, assuming 10*10*10nm box and grid dimension 100*100*100 (spacing 0.1 nm):
+             *
+             *    coordinate is { 0.543 , 6.235 , -0.73 }
+             *
+             *    x[i][d]/box[d]                      becomes   { 0.0543 , 0.6235 , -0.073 }
+             *    (x[i][d]/box[d] + 1.0)              becomes   { 1.0543 , 1.6235 , 0.927 }
+             *    (x[i][d]/box[d] + 1.0)*ngrid[d]     becomes   { 105.43 , 162.35 , 92.7 }
+             * 
+             *    integer part is now { 105 , 162 , 92 }
+             *
+             *    The fraction is calculates as t-ti, which becomes { 0.43 , 0.35 , 0.7 }
+             *
+			 * 3. Take the first integer index part (which can be larger than the grid) modulo the grid dimension
+             *
+             *    Now we get { 5 , 62 , 92 } 
+             *
+             *    Voila, both index and fraction, entirely without conditionals. The one limitation here is that
+             *    we only add one box length, so if the particle had a coordinate <=-10.0, we would be screwed.
+             *    In principle we can of course add 100.0, but that just moves the problem, it doesnt solve it.
+             *    In practice, MD programs will apply PBC to reset particles inside the central box to avoid
+             *    numerical problems, so this shouldnt cause any problems.
+             *    (And, by adding 100.0 box lengths, we would lose a bit of numerical accuracy here!)
+			 */
+			t  = (atomCoordinates[i][d] / periodicBoxSize[d] + 1.0)*pme->ngrid[d];
+			ti = t;
+			pme->particlefraction[i][d] = t - ti; 
+			pme->particleindex[i][d]    = ti % pme->ngrid[d];
+		}		
+	}
+}
+/* Ugly bspline calculation taken from Tom Dardens reference equations. 
+ * This probably very sub-optimal in Cuda? Separate kernel?
+ *
+ * In practice, it might help to require order=4 for the cuda port.
+ */
+static void 
+pme_update_bsplines(pme_t    pme)
+{
+	int       i,j,k,l;
+	int       order;
+	RealOpenMM    dr,div;
+	RealOpenMM *  data;
+	RealOpenMM *  ddata;
+	order = pme->order;
+    for(i=0; (i<pme->natoms); i++) 
+	{
+		for(j=0; j<3; j++) 
+		{
+			/* dr is relative offset from lower cell limit */
+			dr = pme->particlefraction[i][j];
+			data  = &(pme->bsplines_theta[j][i*order]);
+			ddata = &(pme->bsplines_dtheta[j][i*order]);
+			data[order-1] = 0;
+			data[1]       = dr;
+			data[0]       = 1-dr;
+			for(k=3; k<order; k++) 
+			{
+				div = 1.0/(k-1.0);    
+				data[k-1] = div*dr*data[k-2];
+				for(l=1; l<(k-1); l++) 
+				{
+					data[k-l-1] = div*((dr+l)*data[k-l-2]+(k-l-dr)*data[k-l-1]);
+				}
+				data[0] = div*(1-dr)*data[0];
+			}
+			/* differentiate */
+			ddata[0] = -data[0];
+			for(k=1; k<order; k++) 
+			{
+				ddata[k] = data[k-1]-data[k];
+			}
+			div           = 1.0/(order-1);
+			data[order-1] = div*dr*data[order-2];
+			for(l=1; l<(order-1); l++) 
+			{
+				data[order-l-1] = div*((dr+l)*data[order-l-2]+(order-l-dr)*data[order-l-1]);
+			}
+			data[0] = div*(1-dr)*data[0]; 
+		}
+    }
+}
+static void
+pme_grid_spread_charge(pme_t      pme,
+					   RealOpenMM **   atomParameters)
+{
+        static const int   QIndex = 2; // atom charges are stored in atomParameters[atomID][2]
+	int       order;
+	int       i;
+	int       ix,iy,iz;
+	int       x0index,y0index,z0index;
+	int       xindex,yindex,zindex;
+	int       index;
+	RealOpenMM    q;
+	RealOpenMM *  thetax;
+	RealOpenMM *  thetay;
+	RealOpenMM *  thetaz;
+	order = pme->order;
+    /* Reset the grid */
+	for(i=0;i<pme->ngrid[0]*pme->ngrid[1]*pme->ngrid[2];i++)
+	{
+		pme->grid[i].re = pme->grid[i].im = 0;
+	}
+	for(i=0;i<pme->natoms;i++)
+	{
+		q = atomParameters[i][QIndex];
+        /* Grid index for the actual atom position */
+		x0index = pme->particleindex[i][0];
+		y0index = pme->particleindex[i][1];
+		z0index = pme->particleindex[i][2];
+        /* Bspline factors for this atom in each dimension , calculated from fractional coordinates */
+		thetax  = &(pme->bsplines_theta[0][i*order]);
+		thetay  = &(pme->bsplines_theta[1][i*order]);
+		thetaz  = &(pme->bsplines_theta[2][i*order]);
+        /* Loop over norder*norder*norder (typically 4*4*4) neighbor cells.
+         *
+         * As a neat optimization, we only spread in the forward direction, but apply PBC!
+         * 
+         * Since we are going to do an FFT on the grid, it doesnt matter where the data is,
+         * in frequency space the result will be the same. 
+         *
+         * So, the influence function (bsplines) will probably be something like (0.15,0.35,0.35,0.15),
+         * with largest weight 2-3 steps forward (you dont need to understand that for the implementation :-)
+         * Effectively, you can look at this as translating the entire grid.
+         *
+         * Why do we do this stupid thing? 
+         *
+         * 1) The loops get much simpler 
+         * 2) Just looking forward will hopefully get us more cache hits
+         * 3) When we parallelize things, we only need to communicate in one direction instead of two!
+         */
+		for(ix=0;ix<order;ix++)
+		{
+            /* Calculate index, apply PBC so we spread to index 0/1/2 when a particle is close to the upper limit of the grid */
+			xindex = (x0index + ix) % pme->ngrid[0];
+			for(iy=0;iy<order;iy++)
+			{
+				yindex = (y0index + iy) % pme->ngrid[1];
+				for(iz=0;iz<order;iz++)
+				{
+					/* Can be optimized, but we keep it simple here */
+					zindex               = (z0index + iz) % pme->ngrid[2];	
+                    /* Calculate index in the charge grid */
+					index                = xindex*pme->ngrid[1]*pme->ngrid[2] + yindex*pme->ngrid[2] + zindex;
+                    /* Add the charge times the bspline spread/interpolation factors to this grid position */
+					pme->grid[index].re += q*thetax[ix]*thetay[iy]*thetaz[iz];       
+				}
+			}
+		}
+	}
+}
+static void
+pme_reciprocal_convolution(pme_t     pme,
+						   const RealOpenMM    periodicBoxSize[3],
+						   RealOpenMM *  energy,
+						   RealOpenMM    pme_virial[3][3])
+{
+	int kx,ky,kz;
+	int nx,ny,nz;
+	RealOpenMM mx,my,mz;
+	RealOpenMM mhx,mhy,mhz,m2;
+	RealOpenMM one_4pi_eps;
+	RealOpenMM virxx,virxy,virxz,viryy,viryz,virzz;
+	RealOpenMM bx,by,bz;
+	RealOpenMM d1,d2;
+	RealOpenMM eterm,vfactor,struct2,ets2;
+	RealOpenMM esum;
+	RealOpenMM factor;
+	RealOpenMM denom;
+	RealOpenMM boxfactor;
+	RealOpenMM maxkx,maxky,maxkz;
+	t_complex *ptr;
+	nx = pme->ngrid[0];
+	ny = pme->ngrid[1];
+	nz = pme->ngrid[2];
+	one_4pi_eps=ONE_4PI_EPS0/pme->epsilon_r;
+	factor=M_PI*M_PI/(pme->ewaldcoeff*pme->ewaldcoeff);
+	boxfactor = M_PI*periodicBoxSize[0]*periodicBoxSize[1]*periodicBoxSize[2];
+	esum = 0;
+	virxx = 0;
+	virxy = 0;
+	virxz = 0;
+	viryy = 0;
+	viryz = 0;
+	virzz = 0;
+	maxkx = (nx+1)/2;
+	maxky = (ny+1)/2;
+	maxkz = (nz+1)/2;
+    for(kx=0;kx<nx;kx++)
+    {
+        /* Calculate frequency. Grid indices in the upper half correspond to negative frequencies! */
+        mx  = (kx<maxkx) ? kx : (kx-nx);
+        mhx = mx/periodicBoxSize[0];
+        bx  = boxfactor*pme->bsplines_moduli[0][kx];
+        for(ky=0;ky<ny;ky++)
+        {
+            /* Calculate frequency. Grid indices in the upper half correspond to negative frequencies! */
+            my  = (ky<maxky) ? ky : (ky-ny);
+            mhy = my/periodicBoxSize[1];
+            by  = pme->bsplines_moduli[1][ky];
+			for(kz=0;kz<nz;kz++)
+			{
+                /* If the net charge of the system is 0.0, there will not be any DC (direct current, zero frequency) component. However,
+                 * we can still handle charged systems through a charge correction, in which case the DC
+                 * component should be excluded from recprocal space. We will anyway run into problems below when dividing with the
+                 * frequency if it is zero...
+                 *
+                 * In cuda you could probably work around this by setting something to 0.0 instead, but the short story is that we
+                 * should skip the zero frequency case!
+                 */
+				if (kx==0 && ky==0 && kz==0)
+				{
+					continue;
+				}
+                /* Calculate frequency. Grid indices in the upper half correspond to negative frequencies! */
+				mz        = (kz<maxkz) ? kz : (kz-nz);
+				mhz       = mz/periodicBoxSize[2];
+                /* Pointer to the grid cell in question */
+				ptr       = pme->grid + kx*ny*nz + ky*nz + kz;				
+                /* Get grid data for this frequency */
+				d1        = ptr->re;
+				d2        = ptr->im;
+                /* Calculate the convolution - see the Essman/Darden paper for the equation! */
+				m2        = mhx*mhx+mhy*mhy+mhz*mhz;
+				bz        = pme->bsplines_moduli[2][kz];
+				denom     = m2*bx*by*bz;
+				eterm     = one_4pi_eps*exp(-factor*m2)/denom;
+                /* write back convolution data to grid */
+				ptr->re   = d1*eterm;
+				ptr->im   = d2*eterm;
+				struct2   = (d1*d1+d2*d2);
+				/* Long-range PME contribution to the energy for this frequency */
+				ets2      = eterm*struct2;
+				esum     += ets2;
+				/* PME long-range contribution to atomic virial. Since it is symmetric, we only calculate half the matrix inside this loop. */
+				vfactor   = (factor*m2+1.0)*2.0/m2;
+				virxx    += ets2*(vfactor*mhx*mhx-1.0);
+                virxy    += ets2*vfactor*mhx*mhy;
+                virxz    += ets2*vfactor*mhx*mhz;
+                viryy    += ets2*(vfactor*mhy*mhy-1.0);
+                viryz    += ets2*vfactor*mhy*mhz;
+                virzz    += ets2*(vfactor*mhz*mhz-1.0);				
+			}
+		}
+	}
+	pme_virial[0][0] = 0.25*virxx;
+    pme_virial[1][1] = 0.25*viryy;
+    pme_virial[2][2] = 0.25*virzz;
+    pme_virial[0][1] = pme_virial[1][0] = 0.25*virxy;
+    pme_virial[0][2] = pme_virial[2][0] = 0.25*virxz;
+    pme_virial[1][2] = pme_virial[2][1] = 0.25*viryz;
+    /* The factor 0.5 is nothing special, but it is better to have it here than inside the loop :-) */
+	*energy = 0.5*esum;
+}
+static void
+pme_grid_interpolate_force(pme_t      pme,
+						   const RealOpenMM     periodicBoxSize[3],
+						   RealOpenMM **   atomParameters,
+						   RealOpenMM **   forces)
+{
+        static const int   QIndex = 2; // atom charges are stored in atomParameters[atomID][2]
+	int       i;
+	int       ix,iy,iz;
+	int       x0index,y0index,z0index;
+	int       xindex,yindex,zindex;
+	int       index;
+	int       order;
+	RealOpenMM    q;
+	RealOpenMM *  thetax;
+	RealOpenMM *  thetay;
+	RealOpenMM *  thetaz;
+	RealOpenMM *  dthetax;
+	RealOpenMM *  dthetay;
+	RealOpenMM *  dthetaz;
+	RealOpenMM    tx,ty,tz;
+	RealOpenMM    dtx,dty,dtz;
+	RealOpenMM    fx,fy,fz;
+	RealOpenMM    gridvalue;
+	int       nx,ny,nz;
+	nx    = pme->ngrid[0];
+	ny    = pme->ngrid[1];
+	nz    = pme->ngrid[2];
+	order = pme->order;
+    /* This is almost identical to the charge spreading routine! */
+	for(i=0;i<pme->natoms;i++)
+	{
+		fx = fy = fz = 0;
+		q = atomParameters[i][QIndex];
+        /* Grid index for the actual atom position */
+		x0index = pme->particleindex[i][0];
+		y0index = pme->particleindex[i][1];
+		z0index = pme->particleindex[i][2];
+        /* Bspline factors for this atom in each dimension , calculated from fractional coordinates */
+		thetax  = &(pme->bsplines_theta[0][i*order]);
+		thetay  = &(pme->bsplines_theta[1][i*order]);
+		thetaz  = &(pme->bsplines_theta[2][i*order]);
+		dthetax = &(pme->bsplines_dtheta[0][i*order]);
+		dthetay = &(pme->bsplines_dtheta[1][i*order]);
+		dthetaz = &(pme->bsplines_dtheta[2][i*order]);
+		/* See pme_grid_spread_charge() for comments about the order here, and only interpolation in one direction */
+        /* Since we will add order^3 (typically 4*4*4=64) terms to the force on each particle, we use temporary fx/fy/fz
+         * variables, and only add it to memory forces[] at the end.
+         */
+		for(ix=0;ix<order;ix++)
+		{
+			xindex = (x0index + ix) % pme->ngrid[0];
+            /* Get both the bspline factor and its derivative with respect to the x coordinate! */
+			tx     = thetax[ix];
+			dtx    = dthetax[ix];
+			for(iy=0;iy<order;iy++)
+			{
+				yindex = (y0index + iy) % pme->ngrid[1];
+                /* bspline + derivative wrt y */
+				ty     = thetay[iy];
+				dty    = dthetay[iy];
+				for(iz=0;iz<order;iz++)
+				{
+					/* Can be optimized, but we keep it simple here */
+					zindex               = (z0index + iz) % pme->ngrid[2];
+                    /* bspline + derivative wrt z */
+					tz                   = thetaz[iz];
+					dtz                  = dthetaz[iz];
+					index                = xindex*pme->ngrid[1]*pme->ngrid[2] + yindex*pme->ngrid[2] + zindex;
+                    /* Get the fft+convoluted+ifft:d data from the grid, which must be real by definition */
+                    /* Checking that the imaginary part is indeed zero might be a good check :-) */
+					gridvalue            = pme->grid[index].re;
+					/* The d component of the force is calculated by taking the derived bspline in dimension d, normal bsplines in the other two */
+					fx                  += dtx*ty*tz*gridvalue;
+					fy                  += tx*dty*tz*gridvalue;
+					fz                  += tx*ty*dtz*gridvalue;
+				}
+			}
+		}
+        /* Update memory force, note that we multiply by charge and some box stuff */
+		forces[i][0] -= q*fx*nx/periodicBoxSize[0];
+		forces[i][1] -= q*fy*ny/periodicBoxSize[1];
+		forces[i][2] -= q*fz*nz/periodicBoxSize[2];
+	}
+}
+/* EXPORTED ROUTINES */
+int 
+pme_init(pme_t *       ppme,
+		 RealOpenMM        ewaldcoeff,
+		 int           natoms,
+		 int           ngrid[3],
+		 int           pme_order,
+         RealOpenMM        epsilon_r)
+{
+    pme_t pme;
+    int   d;
+    pme = (pme_t) malloc(sizeof(struct pme));
+    pme->order       = pme_order;
+    pme->epsilon_r   = epsilon_r;
+    pme->ewaldcoeff  = ewaldcoeff;
+	pme->natoms      = natoms;
+    for(d=0;d<3;d++)
+	{
+		pme->ngrid[d]            = ngrid[d];
+		pme->bsplines_theta[d]   = (RealOpenMM *)malloc(sizeof(RealOpenMM)*pme_order*natoms);
+		pme->bsplines_dtheta[d]  = (RealOpenMM *)malloc(sizeof(RealOpenMM)*pme_order*natoms);
+	}
+	pme->particlefraction = (rvec *)malloc(sizeof(rvec)*natoms);
+	pme->particleindex    = (ivec *)malloc(sizeof(ivec)*natoms);
+	/* Allocate charge grid storage */
+	pme->grid        = (t_complex *)malloc(sizeof(t_complex)*ngrid[0]*ngrid[1]*ngrid[2]);
+	fftpack_init_3d(&pme->fftplan,ngrid[0],ngrid[1],ngrid[2]);
+	/* Setup bspline moduli (see Essman paper) */
+	pme_calculate_bsplines_moduli(pme);	
+	*ppme = pme;
+    return 0;
+}
+int pme_exec(pme_t       pme,
+			 RealOpenMM **   atomCoordinates,
+			 RealOpenMM **   forces,
+			 RealOpenMM **   atomParameters,
+			 const RealOpenMM      periodicBoxSize[3],
+			 RealOpenMM *    energy,
+			 RealOpenMM      pme_virial[3][3])
+{
+    /* Routine is called with coordinates in x, a box, and charges in q */
+    /* Before we can do the actual interpolation, we need to recalculate and update
+     * the indices for each particle in the charge grid (initialized in pme_init()),
+     * and what its fractional offset in this grid cell is.
+     */
+	/* Update charge grid indices and fractional offsets for each atom.
+     * The indices/fractions are stored internally in the pme datatype 
+     */
+	pme_update_grid_index_and_fraction(pme,atomCoordinates,periodicBoxSize);
+	/* Calculate bsplines (and their differentials) from current fractional coordinates, store in pme structure */
+	pme_update_bsplines(pme);
+	/* Spread the charges on grid (using newly calculated bsplines in the pme structure) */
+	pme_grid_spread_charge(pme,atomParameters);
+	/* do 3d-fft */ 
+	fftpack_exec_3d(pme->fftplan,FFTPACK_FORWARD,pme->grid,pme->grid);
+	/* solve in k-space */
+	pme_reciprocal_convolution(pme,periodicBoxSize,energy,pme_virial);
+	/* do 3d-invfft */
+	fftpack_exec_3d(pme->fftplan,FFTPACK_BACKWARD,pme->grid,pme->grid);
+	/* Get the particle forces from the grid and bsplines in the pme structure */
+	pme_grid_interpolate_force(pme,periodicBoxSize,atomParameters,forces);
+    return 0;
+}
+int
+pme_destroy(pme_t    pme)
+{
+	int d;
+	free(pme->grid);
+	for(d=0;d<3;d++)
+	{
+		free(pme->bsplines_moduli[d]);
+		free(pme->bsplines_theta[d]);
+		free(pme->bsplines_dtheta[d]);
+	}
+	free(pme->particlefraction);
+	free(pme->particleindex);
+	fftpack_destroy(pme->fftplan);
+	/* destroy structure itself */
+	free(pme);
+	return 0;
+}
--- a/platforms/reference/src/SimTKReference/pme.h
+++ b/platforms/reference/src/SimTKReference/pme.h
+/* 
+ * Reference implementation of PME reciprocal space interactions.
+ * 
+ * Copyright (c) 2009, Erik Lindahl, Rossen Apostolov, Szilard Pall
+ * All rights reserved.
+ * Contact: lindahl@cbr.su.se Stockholm University, Sweden.
+ * 
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer. Redistributions in binary 
+ * form must reproduce the above copyright notice, this list of conditions and 
+ * the following disclaimer in the documentation and/or other materials provided 
+ * with the distribution.
+ * Neither the name of the author/university nor the names of its contributors may 
+ * be used to endorse or promote products derived from this software without 
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#include "../SimTKUtilities/SimTKOpenMMCommon.h"
+typedef RealOpenMM rvec[3];
+typedef struct pme *
+pme_t;
+/*
+ * Initialize a PME calculation and set up data structures
+ *
+ * Arguments:
+ * 
+ * ppme        Pointer to an opaque pme_t object
+ * ewaldcoeff  Coefficient derived from the beta factor to participate
+ *             direct/reciprocal space. See gromacs code for documentation!
+ *             We assume that you are using nm units...
+ * natoms      Number of atoms to set up data structure sof
+ * ngrid       Size of the full pme grid
+ * pme_order   Interpolation order, almost always 4
+ * epsilon_r   Dielectric coefficient, typically 1.0.
+ */
+int 
+pme_init(pme_t *       ppme,
+		 RealOpenMM        ewaldcoeff,
+		 int           natoms,
+		 int           ngrid[3],
+		 int           pme_order,
+	         RealOpenMM        epsilon_r);
+/*
+ * Evaluate reciprocal space PME energy and forces.
+ *
+ * Args:
+ *
+ * pme         Opaque pme_t object, must have been initialized with pme_init()
+ * x           Pointer to coordinate data array (nm)
+ * f           Pointer to force data array (will be written as kJ/mol/nm)
+ * charge      Array of charges (units of e)
+ * box         Simulation cell dimensions (nm)
+ * energy      Total energy (will be written in units of kJ/mol)
+ * pme_virial  Long-range part of the virial, output.
+ */
+int
+pme_exec(pme_t       pme,
+		 RealOpenMM **  atomCoordinates,
+		 RealOpenMM **  forces,
+		 RealOpenMM **  atomParameters,
+		 const RealOpenMM  periodicBoxSize[3],
+		 RealOpenMM *    energy,
+		 RealOpenMM      pme_virial[3][3]);
+/* Release all memory in pme structure */
+int
+pme_destroy(pme_t    pme);
--- a/platforms/reference/tests/TestReferenceEwald.cpp
+++ b/platforms/reference/tests/TestReferenceEwald.cpp
@@ -76,85 +76,88 @@ void testEwald() {
    cout << "PotentialEnergy: " << state.getPotentialEnergy() << endl;
    ASSERT_EQUAL_VEC(Vec3(-123.711, 64.1877, -302.716), forces[0], 10*TOL);
    ASSERT_EQUAL_VEC(Vec3(123.711, -64.1877, 302.716), forces[1], 10*TOL);
-//    const double eps = 78.3;
-//    const double krf = (1.0/(cutoff*cutoff*cutoff))*(eps-1.0)/(2.0*eps+1.0);
-//    const double crf = (1.0/cutoff)*(3.0*eps)/(2.0*eps+1.0);
-//    const double force = 138.935485*(1.0)*(1.0-2.0*krf*1.0);
-//    ASSERT_EQUAL_VEC(Vec3(force, 0, 0), forces[0], TOL);
-//    ASSERT_EQUAL_VEC(Vec3(-force, 0, 0), forces[1], TOL);
-//    ASSERT_EQUAL_VEC(Vec3(0, 0, 0), forces[2], TOL);
-//    ASSERT_EQUAL_TOL(2*138.935485*(1.0)*(1.0+krf*1.0-crf), state.getPotentialEnergy(), TOL);
 }
-/*
+void testPME() {
-void testEwald4() {
    ReferencePlatform platform;
-    System system(4, 0);
+    System system;
+    for (int i = 0 ; i < 42 ; i++)
+    {
+       system.addParticle(1.0);
+    }
    VerletIntegrator integrator(0.01);
-    NonbondedForce* nonbonded = new NonbondedForce(4, 0);
+    NonbondedForce* nonbonded = new NonbondedForce();
-    nonbonded->setParticleParameters(0, 1.0, 1, 0);
+    for (int i = 0 ; i < 14 ; i++)
-    nonbonded->setParticleParameters(1, 1.0, 1, 0);
+    {
-    nonbonded->setParticleParameters(2, -1.0, 1, 0);
+      nonbonded->addParticle(-0.82, 1, 0);
-    nonbonded->setParticleParameters(3, -1.0, 1, 0);
+      nonbonded->addParticle(0.41, 1, 0);
-    nonbonded->setNonbondedMethod(NonbondedForce::Ewald);
+      nonbonded->addParticle(0.41, 1, 0);
-    const double cutoff = 2.0;
+    }
+    nonbonded->setNonbondedMethod(NonbondedForce::PME);
+    const double cutoff = 0.8;
    nonbonded->setCutoffDistance(cutoff);
-    nonbonded->setPeriodicBoxVectors(Vec3(6, 0, 0), Vec3(0, 6, 0), Vec3(0, 0, 6));
+    nonbonded->setPeriodicBoxVectors(Vec3(1.86206, 0, 0), Vec3(0, 1.86206, 0), Vec3(0, 0, 1.86206));
+    nonbonded->setEwaldErrorTolerance(TOL);
    system.addForce(nonbonded);
    OpenMMContext context(system, integrator, platform);
-    vector<Vec3> positions(4);
+    vector<Vec3> positions(42);
-    positions[0] = Vec3(3.048000,2.764000,3.156000);
+positions[0] = Vec3( 0.23,0.628,0.113);
-    positions[1] = Vec3(3.348000,2.764000,3.156000);
+positions[1] = Vec3(0.137,0.626, 0.15);
-    positions[2] = Vec3(2.809000,2.888000,2.571000);
+positions[2] = Vec3(0.231,0.589,0.021);
-    positions[3] = Vec3(2.509000,2.888000,2.571000);
+positions[3] = Vec3(-0.307,-0.351,0.703);
+positions[4] = Vec3(-0.364,-0.367,0.784);
+positions[5] = Vec3(-0.366,-0.341,0.623);
+positions[6] = Vec3(-0.569,-0.634,-0.439);
+positions[7] = Vec3(-0.532,-0.707,-0.497);
+positions[8] = Vec3(-0.517,-0.629,-0.354);
+positions[9] = Vec3(-0.871, 0.41,-0.62);
+positions[10] = Vec3(-0.948,0.444,-0.566);
+positions[11] = Vec3(-0.905,0.359,-0.699);
+positions[12] = Vec3(0.249,-0.077,-0.621);
+positions[13] = Vec3(0.306,-0.142,-0.571);
+positions[14] = Vec3(0.233,-0.11,-0.714);
+positions[15] = Vec3(0.561,0.222,-0.715);
+positions[16] = Vec3(0.599,0.138,-0.678);
+positions[17] = Vec3(0.473,0.241,-0.671);
+positions[18] = Vec3(-0.515,-0.803,-0.628);
+positions[19] = Vec3(-0.491,-0.866,-0.702);
+positions[20] = Vec3(-0.605,-0.763,-0.646);
+positions[21] = Vec3(-0.021,0.175,-0.899);
+positions[22] = Vec3(0.018, 0.09,-0.935);
+positions[23] = Vec3(-0.119,0.177,-0.918);
+positions[24] = Vec3(-0.422,0.856,-0.464);
+positions[25] = Vec3(-0.479,0.908,-0.527);
+positions[26] = Vec3(-0.326,0.868,-0.488);
+positions[27] = Vec3(-0.369,-0.095,-0.903);
+positions[28] = Vec3(-0.336,-0.031,-0.972);
+positions[29] = Vec3(-0.303,-0.101,-0.828);
+positions[30] = Vec3(0.594,0.745,0.652);
+positions[31] = Vec3(0.644, 0.83,0.633);
+positions[32] = Vec3(0.506,0.747,0.604);
+positions[33] = Vec3(-0.157,-0.375,-0.758);
+positions[34] = Vec3(-0.25, -0.4,-0.785);
+positions[35] = Vec3(-0.131,-0.425,-0.676);
+positions[36] = Vec3(0.618,-0.295,-0.578);
+positions[37] = Vec3(0.613,-0.213,-0.521);
+positions[38] = Vec3(0.707,-0.298,-0.623);
+positions[39] = Vec3(0.039,-0.785,  0.3);
+positions[40] = Vec3(0.138,-0.796,0.291);
+positions[41] = Vec3(-0.001,-0.871,0.332);
    context.setPositions(positions);
    State state = context.getState(State::Forces | State::Energy);
    const vector<Vec3>& forces = state.getForces();
-//    cout << "force 0: " << forces[0] << endl;
+   // for (int i = 0 ; i < 42 ; i++)
-//    cout << "force 1: " << forces[1] << endl;
+   //   cout << "f [" << i << " : ]" << forces[i] << endl;
-    cout << "energyPoten: " << state.getPotentialEnergy() << endl;
+   // cout << "PotentialEnergy: " << state.getPotentialEnergy() << endl;
- //   const double eps = 78.3;
+//    ASSERT_EQUAL_VEC(Vec3(-123.711, 64.1877, -302.716), forces[0], 10*TOL);
- //   const double krf = (1.0/(cutoff*cutoff*cutoff))*(eps-1.0)/(2.0*eps+1.0);
+//    ASSERT_EQUAL_VEC(Vec3(123.711, -64.1877, 302.716), forces[1], 10*TOL);
- //   const double crf = (1.0/cutoff)*(3.0*eps)/(2.0*eps+1.0);
-//    const double force = 138.935485*(1.0)*(1.0-2.0*krf*1.0);
-//    ASSERT_EQUAL_VEC(Vec3(force, 0, 0), forces[0], TOL);
-//    ASSERT_EQUAL_VEC(Vec3(-force, 0, 0), forces[1], TOL);
-//    ASSERT_EQUAL_VEC(Vec3(0, 0, 0), forces[2], TOL);
-//    ASSERT_EQUAL_TOL(2*138.935485*(1.0)*(1.0+krf*1.0-crf), state.getPotentialEnergy(), TOL);
 }
-*/
-/*
-void testPeriodic() {
-    ReferencePlatform platform;
-    System system(2, 0);
-    VerletIntegrator integrator(0.01);
-    NonbondedForce* nonbonded = new NonbondedForce(2, 0);
-    nonbonded->setParticleParameters(0, 1.0, 1, 0);
-    nonbonded->setParticleParameters(1, -1.0, 1, 0);
-    nonbonded->setNonbondedMethod(NonbondedForce::CutoffPeriodci);
-    const double cutoff = 1.0;
-    nonbonded->setCutoffDistance(cutoff);
-    nonbonded->setPeriodicBoxVectors(Vec3(6, 0, 0), Vec3(0, 6, 0), Vec3(0, 0, 6));
-    system.addForce(nonbonded);
-    OpenMMContext context(system, integrator, platform);
-    vector<Vec3> positions(2);
-    positions[0] = Vec3(3.044293,2.765923,3.146914);
-    positions[1] = Vec3(2.812707,2.886077,2.580086);
-    context.setPositions(positions);
-    State state = context.getState(State::Forces | State::Energy);
-    const vector<Vec3>& forces = state.getForces();
-    cout << "force 0: " << forces[0] << endl;
-    cout << "force 1: " << forces[1] << endl;
-    cout << "energyPoten: " << state.getPotentialEnergy() << endl;
-}
-*/
 int main() {
    try {
-//        testPeriodic();
        testEwald();
-//        testEwald4();
+        testPME();
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;