Completed Cuda implementation of Ewald method

fe1e6ffa · Rossen Apostolov · 57606845 · fe1e6ffa · fe1e6ffa · fe1e6ffa
Commit fe1e6ffa authored May 15, 2009 by Rossen Apostolov
9 changed files
--- a/platforms/cuda/src/kernels/kCalculateCDLJEwaldReciprocal.h
+++ b/platforms/cuda/src/kernels/kCalculateCDLJEwaldReciprocal.h
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Rossen P. Apostolov, Peter Eastman                                *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+/**
+ * This file contains the kernel for evaluating nonbonded forces using the
+ * Ewald summation method (Reciprocal space summation).
+ */
+/**
+ * Subtracts the reciprocal-space Ewald contribution from cSim.pForce4.
+ *
+ * Launch layout: 1D grid of 1D blocks, no shared memory. Each thread starts at
+ * atom threadIdx.x + blockIdx.x * blockDim.x and advances by
+ * blockDim.x * gridDim.x, so any launch configuration covers all cSim.atoms
+ * atoms and each atom's force entry is updated by exactly one thread.
+ *
+ * For its atom i, a thread visits every atom j (position in .xyz and charge in
+ * .w of cSim.pPosq) and every wave vector
+ *     k = (rx * 2*pi/Lx, ry * 2*pi/Ly, rz * 2*pi/Lz)
+ * of the half space enumerated below, and accumulates
+ *     2/(V*eps0) * Qi*Qj * exp(-k^2 / (4*alpha^2)) / k^2
+ *         * (cos(k.ri)*sin(k.rj) - sin(k.ri)*cos(k.rj)) * k
+ * The accumulated vector is subtracted from cSim.pForce4[i] once per atom
+ * (x, y, z only; the w component is written back unchanged).
+ *
+ * NOTE(review): alphaEwald, eps0 and the 20+1 wave-vector counts are
+ * hard-coded here; alphaEwald matches the value used by the direct-space
+ * kernel. The unit convention behind eps0 is not visible from this file --
+ * confirm before changing it.
+ */
+__global__ void kCalculateCDLJEwaldReciprocalForces_kernel()
+{
+    // All constants carry the 'f' suffix so the arithmetic stays in single precision.
+    const float alphaEwald    = 3.123413f;
+    const float TWO_PI        = 2.0f * 3.14159265358979323846f;
+    const float eps0          = 5.72765E-4f;
+    const int   numRx         = 20+1;
+    const int   numRy         = 20+1;
+    const int   numRz         = 20+1;
+    const float factorEwald   = -1.0f / (4.0f * alphaEwald * alphaEwald);
+    const float recipBoxSizeX = TWO_PI / cSim.periodicBoxSizeX;
+    const float recipBoxSizeY = TWO_PI / cSim.periodicBoxSizeY;
+    const float recipBoxSizeZ = TWO_PI / cSim.periodicBoxSizeZ;
+    const float V             = cSim.periodicBoxSizeX * cSim.periodicBoxSizeY * cSim.periodicBoxSizeZ;
+    const float prefactor     = 2.0f / (V * eps0);
+
+    unsigned int atomID1      = threadIdx.x + blockIdx.x * blockDim.x;
+    while (atomID1 < cSim.atoms)
+    {
+        const float4 apos1    = cSim.pPosq[atomID1];
+        const float  Qi       = apos1.w;
+
+        // Accumulate in registers; global memory is touched once per atom
+        // instead of three read-modify-writes per (j, k) pair.
+        float fx              = 0.0f;
+        float fy              = 0.0f;
+        float fz              = 0.0f;
+
+        for (unsigned int atomID2 = 0; atomID2 < cSim.atoms; atomID2++)
+        {
+            const float4 apos2 = cSim.pPosq[atomID2];
+            const float  QiQj  = Qi * apos2.w;
+
+            // Half-space enumeration: (rx = 0, ry = 0) starts at rz = 1, and
+            // rx = 0 starts at ry = 0; every later row/plane runs over the
+            // full negative-to-positive range. k = (0,0,0) is never visited,
+            // so k2 below is never zero.
+            int lowry = 0;
+            int lowrz = 1;
+            for (int rx = 0; rx < numRx; rx++)
+            {
+                const float kx = rx * recipBoxSizeX;
+                for (int ry = lowry; ry < numRy; ry++)
+                {
+                    const float ky = ry * recipBoxSizeY;
+                    for (int rz = lowrz; rz < numRz; rz++)
+                    {
+                        const float kz     = rz * recipBoxSizeZ;
+                        const float k2     = kx * kx + ky * ky + kz * kz;
+
+                        // Each phase is evaluated once and reused for both sin and cos.
+                        const float phaseI = kx * apos1.x + ky * apos1.y + kz * apos1.z;
+                        const float phaseJ = kx * apos2.x + ky * apos2.y + kz * apos2.z;
+                        const float mixed  = cosf(phaseI) * sinf(phaseJ) - sinf(phaseI) * cosf(phaseJ);
+
+                        const float scale  = prefactor * QiQj * (expf(k2 * factorEwald) / k2) * mixed;
+                        fx                += scale * kx;
+                        fy                += scale * ky;
+                        fz                += scale * kz;
+                    }
+                    lowrz = 1 - numRz;
+                }
+                lowry = 1 - numRy;
+            }
+        }
+
+        // Single read-modify-write of this atom's force entry.
+        float4 force           = cSim.pForce4[atomID1];
+        force.x               -= fx;
+        force.y               -= fy;
+        force.z               -= fz;
+        cSim.pForce4[atomID1]  = force;
+
+        atomID1               += blockDim.x * gridDim.x;
+    }
+}
--- a/platforms/cuda/src/kernels/kCalculateCDLJForces.cu
+++ b/platforms/cuda/src/kernels/kCalculateCDLJForces.cu
@@ -104,8 +104,7 @@ void GetCalculateCDLJForcesSim(gpuContext gpu)
 // Include version of the kernel with Ewald method
-    // Real Space Ewald uses almost the same kernels as Periodic
+// Real Space Ewald summation utilizes almost the same kernel as Periodic
 #undef METHOD_NAME
 #undef USE_OUTPUT_BUFFER_PER_WARP
 #define USE_PERIODIC
@@ -118,8 +117,9 @@ void GetCalculateCDLJForcesSim(gpuContext gpu)
 #define METHOD_NAME(a, b) a##EwaldDirectByWarp##b
 #include "kCalculateCDLJForces.h"
-     // Reciprocal Space Ewald summation is in a separate kernel
+// Reciprocal Space Ewald summation is in a separate kernel
-//#include "kCalculateEwaldReciprocal.h"
+#include "kCalculateCDLJEwaldReciprocal.h"
 __global__ extern void kCalculateCDLJCutoffForces_12_kernel();
@@ -197,12 +197,20 @@ void kCalculateCDLJForces(gpuContext gpu)
            kFindInteractionsWithinBlocksEwaldDirect_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
                    sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
            if (gpu->bOutputBufferPerWarp)
+            {
                kCalculateCDLJEwaldDirectByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+                kCalculateCDLJEwaldReciprocalForces_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
+                LAUNCHERROR("kCalculateCDLJEwaldReciprocalForces");
+            }
            else
+            {
                kCalculateCDLJEwaldDirectForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-            LAUNCHERROR("kCalculateCDLJEwaldDirectForces");
+                LAUNCHERROR("kCalculateCDLJEwaldDirectForces");
+                kCalculateCDLJEwaldReciprocalForces_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
+                LAUNCHERROR("kCalculateCDLJEwaldReciprocalForces");
+            }
    }
 }
--- a/platforms/cuda/src/kernels/kCalculateCDLJForces.h
+++ b/platforms/cuda/src/kernels/kCalculateCDLJForces.h
@@ -43,9 +43,9 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
 #endif
 #ifdef USE_EWALD
-    float alphaEwald       =  3.123413;
+    float alphaEwald   =  3.123413;
-    float PI = 3.14159265358979323846f;
+    float PI           = 3.14159265358979323846f;
-    float SQRT_PI  = sqrt(PI);
+    float SQRT_PI      = sqrt(PI);
 #endif
    unsigned int lasty = 0xFFFFFFFF;
@@ -113,7 +113,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
    #ifdef USE_EWALD
                    float r               = sqrt(r2);
                    float alphaR    = alphaEwald * r;
-                    dEdR += apos.w * psA[tj].q * invR * invR * invR * (erfc(alphaR) + 2.0 * alphaR * exp ( - alphaR * alphaR) / SQRT_PI );
+                    dEdR += apos.w * psA[j].q * invR * (erfc(alphaR) + 2.0 * alphaR * exp ( - alphaR * alphaR) / SQRT_PI );
    #else
                    dEdR           += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
    #endif
@@ -162,7 +162,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
    #ifdef USE_EWALD
                    float r               = sqrt(r2);
                    float alphaR    = alphaEwald * r;
-                    dEdR += apos.w * psA[tj].q * invR * invR * invR * (erfc(alphaR) + 2.0 * alphaR * exp ( - alphaR * alphaR) / SQRT_PI );
+                    dEdR += apos.w * psA[j].q * invR * (erfc(alphaR) + 2.0 * alphaR * exp ( - alphaR * alphaR) / SQRT_PI );
    #else
                    dEdR           += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
    #endif
@@ -260,7 +260,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
    #ifdef USE_EWALD
                    float r               = sqrt(r2);
                    float alphaR    = alphaEwald * r;
-                    dEdR += apos.w * psA[tj].q * invR * invR * invR * (erfc(alphaR) + 2.0 * alphaR * exp ( - alphaR * alphaR) / SQRT_PI );
+                    dEdR += apos.w * psA[tj].q * invR * (erfc(alphaR) + 2.0 * alphaR * exp ( - alphaR * alphaR) / SQRT_PI );
    #else
                        dEdR           += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
    #endif
@@ -315,7 +315,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
    #ifdef USE_EWALD
                    float r               = sqrt(r2);
                    float alphaR    = alphaEwald * r;
-                    dEdR += apos.w * psA[tj].q * invR * invR * invR * (erfc(alphaR) + 2.0 * alphaR * exp ( - alphaR * alphaR) / SQRT_PI );
+                    dEdR += apos.w * psA[j].q * invR * (erfc(alphaR) + 2.0 * alphaR * exp ( - alphaR * alphaR) / SQRT_PI );
    #else
                            dEdR           += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
    #endif
@@ -406,7 +406,7 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
    #ifdef USE_EWALD
                    float r               = sqrt(r2);
                    float alphaR    = alphaEwald * r;
-                    dEdR += apos.w * psA[tj].q * invR * invR * invR * (erfc(alphaR) + 2.0 * alphaR * exp ( - alphaR * alphaR) / SQRT_PI );
+                    dEdR += apos.w * psA[tj].q * invR * (erfc(alphaR) + 2.0 * alphaR * exp ( - alphaR * alphaR) / SQRT_PI );
    #else
                    dEdR           += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
    #endif

--- a/platforms/cuda/src/kernels/kCalculateEwaldReciprocal.h
+++ b/platforms/cuda/src/kernels/kCalculateEwaldReciprocal.h
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-/**
- * This file contains the kernel for evalauating nonbonded forces
- * using the Ewald summation method.
- */
-#include <cuComplex.h>
-/* Define complex multiply operations */
-__device__ cuComplex ComplexMul(cuComplex a, cuComplex b)
-{
-     cuComplex c;
-     c.x = a.x * b.x - a.y * b.y;
-     c.y = a.x * b.y + a.y * b.x;
-     return c;
-}
-__device__ cuComplex ComplexConjMul(cuComplex a, cuComplex b)
-{
-     cuComplex c;
-     c.x = a.x*b.x + a.y*b.y;
-     c.y = a.y*b.x - a.x*b.y;
-     return c;
-}
-__device__ cuComplex FloatComplexMul(float r, cuComplex a)
-{
-     cuComplex b;
-     b.x = r*a.x;
-     b.y = r*a.y;
-     return b;
-}
-/* This kernel is under development */
-__global__ void  kCalculateCDLJEwaldForces_kernel(unsigned int* workUnit, int numWorkUnits)
-{
-// *******************************************************************
-    float alphaEwald       =  3.123413f;
-    float factorEwald = -1.0 / (4*alphaEwald*alphaEwald);
-    float PI = 3.14159265358979323846f;
-    float SQRT_PI  = sqrt(PI);
-    float TWO_PI   = 2.0f * PI;
-    float epsilon  =  1.0f;
-/////##############################################################################
-    float recipCoeff = 4.0*PI/(cSim.periodicBoxSizeX * cSim.periodicBoxSizeY * cSim.periodicBoxSizeZ) /epsilon;
-// setup reciprocal box
-    float recipBoxSizeX = TWO_PI / cSim.periodicBoxSizeX;
-    float recipBoxSizeY = TWO_PI / cSim.periodicBoxSizeY;
-    float recipBoxSizeZ = TWO_PI / cSim.periodicBoxSizeZ;
-// setup K-vectors
-    unsigned int numRx = 60+1;
-    unsigned int numRy = 60+1;
-    unsigned int numRz = 60+1;
-    const int kmax = 61;
-    unsigned int pos    = threadIdx.x + blockIdx.x * blockDim.x;
-    cuComplex eir[kmax][cSim.atoms][3];
-    cuComplex tab_xy[cSim.atoms];
-    cuComplex tab_qxyz[cSim.atoms];
-  while (pos < cSim.atoms)
-  {
-      float4 apos             = cSim.pPosq[pos];
-      for(unsigned int m = 0; (m < 3); m++) {
-        eir[0][pos][m].x = 1;
-        eir[0][pos][m].y = 0;
-      }
-      eir[1][pos][0].x = cos(apos.x*recipBoxSizeX);
-      eir[1][pos][0].y = sin(apos.x*recipBoxSizeX);
-      eir[1][pos][1].x = cos(apos.y*recipBoxSizeY);
-      eir[1][pos][1].y = sin(apos.y*recipBoxSizeY);
-      eir[1][pos][2].x = cos(apos.z*recipBoxSizeZ);
-      eir[1][pos][2].y = sin(apos.z*recipBoxSizeZ);
-      for(unsigned int j=2; (j<kmax); j++) {
-        for(unsigned int m=0; (m<3); m++) {
-          eir[j][pos][m] = ComplexMul (eir[j-1][pos][m] , eir[1][pos][m]);
-        }
-      }
-      pos                    += blockDim.x * gridDim.x;
-  }
-// *******************************************************************
-    int lowry = 0;
-    int lowrz = 1;
-    for(int rx = 0; rx < numRx; rx++) {
-      float kx = rx * recipBoxSizeX;
-      for(int ry = lowry; ry < numRy; ry++) {
-        float ky = ry * recipBoxSizeY;
-        if(ry >= 0) {
-          while (pos < cSim.atoms)
-          {
-            tab_xy[pos] = ComplexMul (eir[rx][pos][0] , eir[ry][pos][1]);
-            pos                    += blockDim.x * gridDim.x;
-          }
-        }
-        else {
-          while (pos < cSim.atoms)
-          {
-            tab_xy[pos]= ComplexConjMul (eir[rx][pos][0] , (eir[-ry][pos][1]));
-            pos                    += blockDim.x * gridDim.x;
-          }
-        }
-        for (int rz = lowrz; rz < numRz; rz++) {
-          float kz = rz * recipBoxSizeZ;
-          float k2 = kx * kx + ky * ky + kz * kz;
-          float ak = exp(k2*factorEwald) / k2;
-          float akv = 2.0 * ak * (1.0/k2 - factorEwald);
-          if( rz >= 0) {
-           while (pos < cSim.atoms)
-           {
-             float4 apos         = cSim.pPosq[pos];
-             apos.w             *= cSim.epsfac;
-             tab_qxyz[pos] = FloatComplexMul ( apos.w * ComplexMul (tab_xy[pos] , eir[rz][pos][2]));
-             pos                    += blockDim.x * gridDim.x;
-           }
-          }
-          else {
-            while (pos < cSim.atoms)
-            {
-              float4 apos         = cSim.pPosq[pos];
-              apos.w             *= cSim.epsfac;
-              tab_qxyz[pos] = FloatComplexMul( apos.w * ComplexConjMul (tab_xy[pos] , conj(eir[-rz][pos][2])) );
-              pos                    += blockDim.x * gridDim.x;
-            }
-          }
-          float cs = 0.0f;
-          float ss = 0.0f;
-          while (pos < cSim.atoms)
-          {
-            cs += tab_qxyz[pos].x;
-            ss += tab_qxyz[pos].y;
-            pos                    += blockDim.x * gridDim.x;
-          }
-          recipEnergy += ak * ( cs * cs + ss * ss);
-          float vir =  akv * ( cs * cs + ss * ss);
-          while (pos < cSim.atoms)
-          {
-            float4 force            = cSim.pForce4[pos];
-            float dEdR = ak * (cs * tab_qxyz[pos].y - ss * tab_qxyz[pos].x);
-            force.x += 2.0 * recipCoeff * dEdR * kx ;
-            force.y += 2.0 * recipCoeff * dEdR * ky ;
-            force.z += 2.0 * recipCoeff * dEdR * kz ;
-          } 
-          lowrz = 1 - numRz;
-        }
-        lowry = 1 - numRy;
-      }
-    }
-//###########################################################################
-//###########################################################################
-// END EWALD RECIP SPACE
-//###########################################################################
-}
--- a/platforms/cuda/tests/TestCudaEwald.cpp
+++ b/platforms/cuda/tests/TestCudaEwald.cpp
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2008 Stanford University and the Authors.           *
- * Authors: Peter Eastman                                                     *
- * Contributors:                                                              *
- *                                                                            *
- * Permission is hereby granted, free of charge, to any person obtaining a    *
- * copy of this software and associated documentation files (the "Software"), *
- * to deal in the Software without restriction, including without limitation  *
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
- * and/or sell copies of the Software, and to permit persons to whom the      *
- * Software is furnished to do so, subject to the following conditions:       *
- *                                                                            *
- * The above copyright notice and this permission notice shall be included in *
- * all copies or substantial portions of the Software.                        *
- *                                                                            *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
- * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
- * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
- * -------------------------------------------------------------------------- */
-/**
- * This tests all the different force terms in the reference implementation of NonbondedForce.
- */
-#include "../../../tests/AssertionUtilities.h"
-#include "openmm/OpenMMContext.h"
-#include "CudaPlatform.h"
-#include "ReferencePlatform.h"
-#include "openmm/HarmonicBondForce.h"
-#include "openmm/NonbondedForce.h"
-#include "openmm/System.h"
-#include "openmm/LangevinIntegrator.h"
-#include "openmm/VerletIntegrator.h"
-#include "openmm/internal/OpenMMContextImpl.h"
-#include "kernels/gputypes.h"
-#include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
-#include "../src/sfmt/SFMT.h"
-#include <iostream>
-#include <vector>
-using namespace OpenMM;
-using namespace std;
-const double TOL = 1e-5;
-void testEwald() {
-    CudaPlatform platform;
-    System system;
-    system.addParticle(1.0);
-    system.addParticle(1.0);
-    VerletIntegrator integrator(0.01);
-    NonbondedForce* nonbonded = new NonbondedForce();
-    nonbonded->addParticle(1.0, 1, 0);
-    nonbonded->addParticle(-1.0, 1, 0);
-    nonbonded->setNonbondedMethod(NonbondedForce::Ewald);
-    const double cutoff = 2.0;
-    nonbonded->setCutoffDistance(cutoff);
-    nonbonded->setPeriodicBoxVectors(Vec3(6, 0, 0), Vec3(0, 6, 0), Vec3(0, 0, 6));
-    system.addForce(nonbonded);
-    OpenMMContext context(system, integrator, platform);
-    vector<Vec3> positions(2);
-    positions[0] = Vec3(3.048000,2.764000,3.156000);
-    positions[1] = Vec3(2.809000,2.888000,2.571000);
-    context.setPositions(positions);
-    State state = context.getState(State::Forces | State::Energy);
-    const vector<Vec3>& forces = state.getForces();
-    cout << "    force 0: " << forces[0] << endl;
-    cout << "    force 1: " << forces[1] << endl;
-    cout << "    energyPoten: " << state.getPotentialEnergy() << endl;
-    ASSERT_EQUAL_VEC(Vec3(-123.711, 64.1877, -302.716), forces[0], TOL);
-    ASSERT_EQUAL_VEC(Vec3(123.711, -64.1877, 302.716), forces[1], TOL);
-    //ASSERT_EQUAL_TOL(2*138.935485*(1.0)*(1.0+krf*1.0-crf), state.getPotentialEnergy(), TOL);
-}
-void testPeriodic() {
-    CudaPlatform platform;
-    System system;
-    system.addParticle(1.0);
-    system.addParticle(1.0);
-    system.addParticle(1.0);
-    LangevinIntegrator integrator(0.0, 0.1, 0.01);
-    HarmonicBondForce* bonds = new HarmonicBondForce();
-    bonds->addBond(0, 1, 1, 0);
-    system.addForce(bonds);
-    NonbondedForce* nonbonded = new NonbondedForce();
-    nonbonded->addParticle(1.0, 1, 0);
-    nonbonded->addParticle(1.0, 1, 0);
-    nonbonded->addParticle(1.0, 1, 0);
-    nonbonded->setNonbondedMethod(NonbondedForce::CutoffPeriodic);
-    const double cutoff = 2.0;
-    nonbonded->setCutoffDistance(cutoff);
-    nonbonded->setPeriodicBoxVectors(Vec3(4, 0, 0), Vec3(0, 4, 0), Vec3(0, 0, 4));
-    system.addForce(nonbonded);
-    OpenMMContext context(system, integrator, platform);
-    vector<Vec3> positions(3);
-    positions[0] = Vec3(0, 0, 0);
-    positions[1] = Vec3(2, 0, 0);
-    positions[2] = Vec3(3, 0, 0);
-    context.setPositions(positions);
-    State state = context.getState(State::Forces | State::Energy);
-    const vector<Vec3>& forces = state.getForces();
-    cout << ""  << endl;
-    cout << "    force 0: " << forces[0] << endl;
-    cout << "    force 1: " << forces[1] << endl;
-    cout << "    force 2: " << forces[2] << endl;
-    cout << "energyPoten: " << state.getPotentialEnergy() << endl;
-    cout << ""  << endl;
-    cout << "    no cutoff force: 138.935485"  << endl;
-    cout << ""  << endl;
-    const double eps = 78.3;
-    const double krf = (1.0/(cutoff*cutoff*cutoff))*(eps-1.0)/(2.0*eps+1.0);
-    const double crf = (1.0/cutoff)*(3.0*eps)/(2.0*eps+1.0);
-    const double force = 138.935485*(1.0)*(1.0-2.0*krf*1.0);
-    ASSERT_EQUAL_VEC(Vec3(force, 0, 0), forces[0], TOL);
-    ASSERT_EQUAL_VEC(Vec3(-force, 0, 0), forces[1], TOL);
-    ASSERT_EQUAL_VEC(Vec3(0, 0, 0), forces[2], TOL);
-    ASSERT_EQUAL_TOL(2*138.935485*(1.0)*(1.0+krf*1.0-crf), state.getPotentialEnergy(), TOL);
-}
-int main() {
-    try {
-    cout << ""  << endl;
-    cout << "Executing Periodic..." << endl;
-        testPeriodic();
-    cout << ""  << endl;
-    cout << "Executing Ewald..." << endl;
-        testEwald();
-    cout << ""  << endl;
-    cout << "Done" << endl;
-    }
-    catch(const exception& e) {
-        cout << "exception: " << e.what() << endl;
-        return 1;
-    }
-    cout << "Done" << endl;
-    return 0;
-}
--- a/platforms/cuda/tests/TestCudaNonbondedForce.cpp
+++ b/platforms/cuda/tests/TestCudaNonbondedForce.cpp
@@ -357,6 +357,34 @@ void testPeriodic() {
    ASSERT_EQUAL_TOL(2*138.935485*(1.0)*(1.0+krf*1.0-crf), state.getPotentialEnergy(), TOL);
 }
+/**
+ * Runs a two-particle system with NonbondedForce::Ewald on the CUDA platform
+ * and compares the resulting forces and potential energy with fixed expected
+ * values. The two particles get nonbonded parameters (1.0, 1, 0) and
+ * (-1.0, 1, 0) -- presumably (charge, sigma, epsilon); verify against
+ * NonbondedForce::addParticle.
+ */
+void testEwald() {
+    CudaPlatform platform;
+    System system;
+    for (int particle = 0; particle < 2; particle++)
+        system.addParticle(1.0);
+    VerletIntegrator integrator(0.01);
+
+    // Ewald nonbonded force in a 6 x 6 x 6 periodic box with cutoff 2.0.
+    NonbondedForce* ewaldForce = new NonbondedForce();
+    ewaldForce->addParticle( 1.0, 1, 0);
+    ewaldForce->addParticle(-1.0, 1, 0);
+    ewaldForce->setNonbondedMethod(NonbondedForce::Ewald);
+    ewaldForce->setCutoffDistance(2.0);
+    ewaldForce->setPeriodicBoxVectors(Vec3(6, 0, 0), Vec3(0, 6, 0), Vec3(0, 0, 6));
+    system.addForce(ewaldForce);
+
+    OpenMMContext context(system, integrator, platform);
+    vector<Vec3> positions;
+    positions.push_back(Vec3(3.048000, 2.764000, 3.156000));
+    positions.push_back(Vec3(2.809000, 2.888000, 2.571000));
+    context.setPositions(positions);
+
+    // The expected forces on the two particles are equal and opposite.
+    State state = context.getState(State::Forces | State::Energy);
+    const vector<Vec3>& forces = state.getForces();
+    const Vec3 expectedForce0(-123.711,  64.1877, -302.716);
+    const Vec3 expectedForce1( 123.711, -64.1877,  302.716);
+    ASSERT_EQUAL_VEC(expectedForce0, forces[0], TOL);
+    ASSERT_EQUAL_VEC(expectedForce1, forces[1], TOL);
+    ASSERT_EQUAL_TOL(-217.276, state.getPotentialEnergy(), TOL);
+}
 void testLargeSystem() {
    const int numMolecules = 600;
    const int numParticles = numMolecules*2;
@@ -602,6 +630,7 @@ int main() {
        testCutoff();
        testCutoff14();
        testPeriodic();
+        testEwald();
        testLargeSystem();
        testBlockInteractions(false);
        testBlockInteractions(true);

--- a/platforms/reference/src/SimTKReference/ReferenceLJCoulombIxn.cpp
+++ b/platforms/reference/src/SimTKReference/ReferenceLJCoulombIxn.cpp
@@ -234,14 +234,13 @@ int ReferenceLJCoulombIxn::calculateEwaldIxn( int numberOfAtoms, RealOpenMM** at
    #include "../SimTKUtilities/RealTypeSimTk.h"
    typedef std::complex<RealOpenMM> d_complex;
-    typedef std::complex<int> int_complex;
 // Number of R-vectors (real space vectors)
 // to be calculated automatically eventually from alphaEwald and desired precision
-    int numRx = 60+1;
+    int numRx = 20+1;
-    int numRy = 60+1;
+    int numRy = 20+1;
-    int numRz = 60+1;
+    int numRz = 20+1;
    int kmax = std::max(numRx, std::max(numRy,numRz));
    static const RealOpenMM alphaEwald       =  (RealOpenMM) 3.123413;
@@ -253,7 +252,7 @@ int ReferenceLJCoulombIxn::calculateEwaldIxn( int numberOfAtoms, RealOpenMM** at
    static const RealOpenMM epsilon     =  1.0;
    static const RealOpenMM one         =  1.0;
-    RealOpenMM recipCoeff = (RealOpenMM)(4*M_PI/(periodicBoxSize[0] * periodicBoxSize[1] * periodicBoxSize[2]) /epsilon);
+    RealOpenMM recipCoeff = (RealOpenMM)(4*PI/(periodicBoxSize[0] * periodicBoxSize[1] * periodicBoxSize[2]) /epsilon);
    RealOpenMM selfEwaldEnergy = 0.0;
    RealOpenMM realSpaceEwaldEnergy = 0.0;
@@ -283,9 +282,6 @@ int ReferenceLJCoulombIxn::calculateEwaldIxn( int numberOfAtoms, RealOpenMM** at
  d_complex* eir = new d_complex[kmax*numberOfAtoms*3];
  d_complex* tab_xy = new d_complex[numberOfAtoms];
  d_complex* tab_qxyz = new d_complex[numberOfAtoms];
-  d_complex a,b,c;
-  int  i,j,m;
  if (kmax < 1) {
      std::stringstream message;
@@ -293,16 +289,16 @@ int ReferenceLJCoulombIxn::calculateEwaldIxn( int numberOfAtoms, RealOpenMM** at
      SimTKOpenMMLog::printError( message );
  }
-  for(i = 0; (i < numberOfAtoms); i++) {
+  for(int i = 0; (i < numberOfAtoms); i++) {
-    for(m = 0; (m < 3); m++)
+    for(int m = 0; (m < 3); m++)
      EIR(0, i, m) = d_complex(1,0);
-    for(m=0; (m<3); m++)
+    for(int m=0; (m<3); m++)
      EIR(1, i, m) = d_complex(cos(atomCoordinates[i][m]*recipBoxSize[m]),
                               sin(atomCoordinates[i][m]*recipBoxSize[m]));
-    for(j=2; (j<kmax); j++)
+    for(int j=2; (j<kmax); j++)
-      for(m=0; (m<3); m++)
+      for(int m=0; (m<3); m++)
        EIR(j, i, m) = EIR(j-1, i, m) * EIR(1, i, m);
  }
@@ -353,10 +349,8 @@ int ReferenceLJCoulombIxn::calculateEwaldIxn( int numberOfAtoms, RealOpenMM** at
          RealOpenMM kz = rz * recipBoxSize[2];
          RealOpenMM k2 = kx * kx + ky * ky + kz * kz;
          RealOpenMM ak = exp(k2*factorEwald) / k2;
-          RealOpenMM akv = 2 * ak * (1/k2 - factorEwald);
          recipEnergy += ak * ( cs * cs + ss * ss);
-          RealOpenMM vir =  akv * ( cs * cs + ss * ss);
          for(int n = 0; n < numberOfAtoms; n++) {
            RealOpenMM force = ak * (cs * tab_qxyz[n].imag() - ss * tab_qxyz[n].real());

--- a/platforms/reference/src/SimTKReference/ReferenceLJCoulombIxn.cpp.Ewald-optimized
+++ b/platforms/reference/src/SimTKReference/ReferenceLJCoulombIxn.cpp.Ewald-optimized
--- a/platforms/reference/src/SimTKReference/ReferenceLJCoulombIxn.cpp.Ewald-vanilla
+++ b/platforms/reference/src/SimTKReference/ReferenceLJCoulombIxn.cpp.Ewald-vanilla