Continuing converting AMOEBA to the new CUDA platform: starting on AmoebaMultipoleForce

d0426ba9 · Peter Eastman · a99a2d0e · d0426ba9 · d0426ba9 · d0426ba9
Commit d0426ba9 authored Jul 20, 2012 by Peter Eastman
13 changed files
--- a/platforms/cuda2/src/CudaArray.cpp
+++ b/platforms/cuda2/src/CudaArray.cpp
@@ -78,3 +78,14 @@ void CudaArray::download(void* data, bool blocking) const {
        throw OpenMMException(str.str());
    }
 }
+void CudaArray::copyTo(CudaArray& dest) const {
+    if (dest.getSize() != size || dest.getElementSize() != elementSize)
+        throw OpenMMException("Error copying array "+name+" to "+dest.getName()+": The destination array does not match the size of the array");
+    CUresult result = cuMemcpyDtoDAsync(dest.getDevicePointer(), pointer, size*elementSize, 0);
+    if (result != CUDA_SUCCESS) {
+        std::stringstream str;
+        str<<"Error copying array "<<name<<" to "<<dest.getName()<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";
+        throw OpenMMException(str.str());
+    }
+}
--- a/platforms/cuda2/src/CudaArray.h
+++ b/platforms/cuda2/src/CudaArray.h
@@ -127,6 +127,12 @@ public:
     *                 the destination array must be in page-locked memory.
     */
    void download(void* data, bool blocking = true) const;
+    /**
+     * Copy the values in the device memory to a second array.
+     * 
+     * @param dest     the destination array to copy to
+     */
+    void copyTo(CudaArray& dest) const;
 private:
    CudaContext& context;
    CUdeviceptr pointer;

--- a/platforms/cuda2/src/CudaContext.cpp
+++ b/platforms/cuda2/src/CudaContext.cpp
@@ -468,6 +468,10 @@ void CudaContext::clearBuffer(CUdeviceptr memory, int size) {
    executeKernel(clearBufferKernel, args, size, 128);
 }
+void CudaContext::addAutoclearBuffer(CudaArray& array) {
+    addAutoclearBuffer(array.getDevicePointer(), array.getSize()*array.getElementSize());
+}
 void CudaContext::addAutoclearBuffer(CUdeviceptr memory, int size) {
    autoclearBuffers.push_back(memory);
    autoclearBufferSizes.push_back(size/4);

--- a/platforms/cuda2/src/CudaContext.h
+++ b/platforms/cuda2/src/CudaContext.h
@@ -231,6 +231,10 @@ public:
     * @param size       the size of the buffer in bytes
     */
    void clearBuffer(CUdeviceptr memory, int size);
+    /**
+     * Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
+     */
+    void addAutoclearBuffer(CudaArray& array);
    /**
     * Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
     *

--- a/platforms/cuda2/src/CudaKernels.cpp
+++ b/platforms/cuda2/src/CudaKernels.cpp
@@ -1454,7 +1454,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
        int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
        pmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*gridSizeZ, 2*elementSize, "pmeGrid");
-        cu.addAutoclearBuffer(pmeGrid->getDevicePointer(), pmeGrid->getSize()*2*elementSize);
+        cu.addAutoclearBuffer(*pmeGrid);
        pmeBsplineModuliX = new CudaArray(cu, gridSizeX, elementSize, "pmeBsplineModuliX");
        pmeBsplineModuliY = new CudaArray(cu, gridSizeY, elementSize, "pmeBsplineModuliY");
        pmeBsplineModuliZ = new CudaArray(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
@@ -1928,8 +1928,8 @@ void CudaCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOBCF
    }
    bornSum = CudaArray::create<long long>(cu, cu.getPaddedNumAtoms(), "bornSum");
    bornForce = CudaArray::create<long long>(cu, cu.getPaddedNumAtoms(), "bornForce");
-    cu.addAutoclearBuffer(bornSum->getDevicePointer(), bornSum->getSize()*sizeof(long long));
+    cu.addAutoclearBuffer(*bornSum);
-    cu.addAutoclearBuffer(bornForce->getDevicePointer(), bornForce->getSize()*sizeof(long long));
+    cu.addAutoclearBuffer(*bornForce);
    CudaArray& posq = cu.getPosq();
    float4* posqf = (float4*) cu.getPinnedBuffer();
    double4* posqd = (double4*) cu.getPinnedBuffer();
@@ -2757,7 +2757,7 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
            cu.getNonbondedUtilities().addArgument(arguments[i]);
    }
    cu.addForce(new CudaCustomGBForceInfo(force));
-    cu.addAutoclearBuffer(longEnergyDerivs->getDevicePointer(), sizeof(long long)*longEnergyDerivs->getSize());
+    cu.addAutoclearBuffer(*longEnergyDerivs);
 }
 double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
@@ -2766,7 +2766,7 @@ double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeFo
        hasInitializedKernels = true;
        maxTiles = (nb.getUseCutoff() ? nb.getInteractingTiles().getSize() : cu.getNumAtomBlocks()*(cu.getNumAtomBlocks()+1)/2);
        valueBuffers = CudaArray::create<long long>(cu, cu.getPaddedNumAtoms(), "customGBValueBuffers");
-        cu.addAutoclearBuffer(valueBuffers->getDevicePointer(), sizeof(long long)*valueBuffers->getSize());
+        cu.addAutoclearBuffer(*valueBuffers);
        cu.clearBuffer(valueBuffers->getDevicePointer(), sizeof(long long)*valueBuffers->getSize());
        pairValueArgs.push_back(&cu.getPosq().getDevicePointer());
        pairValueArgs.push_back(&cu.getNonbondedUtilities().getExclusions().getDevicePointer());

--- a/platforms/cuda2/src/CudaNonbondedUtilities.cpp
+++ b/platforms/cuda2/src/CudaNonbondedUtilities.cpp
@@ -177,9 +177,7 @@ void CudaNonbondedUtilities::initialize(const System& system) {
    // Record the exclusion data.
    exclusions = CudaArray::create<unsigned int>(context, tilesWithExclusions.size()*CudaContext::TileSize, "exclusions");
-    vector<unsigned int> exclusionVec(exclusions->getSize());
+    vector<unsigned int> exclusionVec(exclusions->getSize(), 0xFFFFFFFF);
-    for (int i = 0; i < exclusions->getSize(); ++i)
-        exclusionVec[i] = 0xFFFFFFFF;
    for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) {
        int x = atom1/CudaContext::TileSize;
        int offset1 = atom1-x*CudaContext::TileSize;
@@ -249,7 +247,8 @@ void CudaNonbondedUtilities::initialize(const System& system) {
    // Create kernels.
-    forceKernel = createInteractionKernel(kernelSource, parameters, arguments, true, true);
+    if (kernelSource.size() > 0)
+        forceKernel = createInteractionKernel(kernelSource, parameters, arguments, true, true);
    if (useCutoff) {
        map<string, string> defines;
        defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
@@ -291,6 +290,8 @@ void CudaNonbondedUtilities::initialize(const System& system) {
 }
 int CudaNonbondedUtilities::findExclusionIndex(int x, int y, const vector<unsigned int>& exclusionIndices, const vector<unsigned int>& exclusionRowIndices) {
+    if (x < y)
+        throw OpenMMException("Internal error: called findExclusionIndex with x<y");
    int start = exclusionRowIndices[x];
    int end = exclusionRowIndices[x+1];
    for (int i = start; i < end; i++)
@@ -317,7 +318,7 @@ void CudaNonbondedUtilities::prepareInteractions() {
 }
 void CudaNonbondedUtilities::computeInteractions() {
-    if (cutoff != -1.0)
+    if (kernelSource.size() > 0)
        context.executeKernel(forceKernel, &forceArgs[0], numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
 }

--- a/platforms/cuda2/src/CudaNonbondedUtilities.h
+++ b/platforms/cuda2/src/CudaNonbondedUtilities.h
@@ -232,8 +232,20 @@ public:
     * @param isSymmetric   specifies whether the interaction is symmetric
     */
    CUfunction createInteractionKernel(const std::string& source, std::vector<ParameterInfo>& params, std::vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric);
-private:
+    /**
+     * This is a utility routine for locating data in the exclusions array.  It takes the (x,y) indices of a tile,
+     * and returns the location in the array where the data for that tile begins.
+     * 
+     * This routine requires that x >= y.  If not, it will throw an exception.
+     * 
+     * @param x                   the x index of the tile
+     * @param y                   the y index of the tile
+     * @param exclusionIndices    the content of the exclusionIndices array
+     * @param exclusionRowIndices the content of the exclusionRowIndices array
+     * @return the index in the exclusions array at which the data for that tile begins
+     */
    static int findExclusionIndex(int x, int y, const std::vector<unsigned int>& exclusionIndices, const std::vector<unsigned int>& exclusionRowIndices);
+private:
    CudaContext& context;
    CUfunction forceKernel;
    CUfunction findBlockBoundsKernel;

--- a/plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernelFactory.cpp
+++ b/plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernelFactory.cpp
@@ -97,9 +97,9 @@ KernelImpl* AmoebaCudaKernelFactory::createKernelImpl(std::string name, const Pl
    if (name == CalcAmoebaTorsionTorsionForceKernel::Name())
        return new CudaCalcAmoebaTorsionTorsionForceKernel(name, platform, cu, context.getSystem());
-//    if (name == CalcAmoebaMultipoleForceKernel::Name())
+    if (name == CalcAmoebaMultipoleForceKernel::Name())
-//        return new CudaCalcAmoebaMultipoleForceKernel(name, platform, cu, context.getSystem());
+        return new CudaCalcAmoebaMultipoleForceKernel(name, platform, cu, context.getSystem());
-//
 //    if (name == CalcAmoebaGeneralizedKirkwoodForceKernel::Name())
 //        return new CudaCalcAmoebaGeneralizedKirkwoodForceKernel(name, platform, cu, context.getSystem());

--- a/plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernels.cpp
--- a/plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernels.h
+++ b/plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernels.h
@@ -30,8 +30,10 @@
 #include "openmm/amoebaKernels.h"
 #include "openmm/kernels.h"
 #include "openmm/System.h"
-#include "CudaContext.h"
 #include "CudaArray.h"
+#include "CudaContext.h"
+#include "CudaSort.h"
+#include <cufft.h>
 namespace OpenMM {
@@ -371,9 +373,46 @@ public:
 private:
    class ForceInfo;
+    void initializeScaleFactors();
    int numMultipoles;
+    bool hasInitializedScaleFactors;
    CudaContext& cu;
    System& system;
+    std::vector<int3> covalentFlagValues;
+    std::vector<int2> polarizationFlagValues;
+    CudaArray* multipoleParticles;
+    CudaArray* torqueBufferIndices;
+    CudaArray* molecularDipoles;
+    CudaArray* molecularQuadrupoles;
+    CudaArray* labFrameDipoles;
+    CudaArray* labFrameQuadrupoles;
+    CudaArray* field;
+    CudaArray* fieldPolar;
+    CudaArray* dampingAndThole;
+    CudaArray* inducedDipole;
+    CudaArray* inducedDipolePolar;
+    CudaArray* currentEpsilon;
+    CudaArray* polarizability;
+    CudaArray* covalentFlags;
+    CudaArray* polarizationGroupFlags;
+    CudaArray* pmeGrid;
+    CudaArray* pmeBsplineModuliX;
+    CudaArray* pmeBsplineModuliY;
+    CudaArray* pmeBsplineModuliZ;
+    CudaArray* pmeTheta1;
+    CudaArray* pmeTheta2;
+    CudaArray* pmeIgrid;
+    CudaArray* pmePhi;
+    CudaArray* pmePhid;
+    CudaArray* pmePhip;
+    CudaArray* pmePhidp;
+    CudaArray* pmeBsplineTheta;
+    CudaArray* pmeBsplineDTheta;
+    CudaArray* pmeAtomRange;
+    CudaArray* pmeAtomGridIndex;
+    CudaSort* sort;
+    cufftHandle fft;
+    CUfunction computeMomentsKernel, computeFixedFieldKernel;
 };
 /**

--- a/plugins/amoeba/platforms/cuda2/src/kernels/multipoleFixedField.cu
+++ b/plugins/amoeba/platforms/cuda2/src/kernels/multipoleFixedField.cu
+#define TILE_SIZE 32
+#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
+typedef struct {
+    real4 posq;
+    real3 field, fieldPolar, dipole;
+    real quadrupoleXX, quadrupoleXY, quadrupoleXZ;
+    real quadrupoleYY, quadrupoleYZ, quadrupoleZZ;
+    float thole, damp;
+} AtomData;
+inline __device__ void loadAtomData(AtomData& data, int atom, const real4* __restrict__ posq, const real* __restrict__ labFrameDipole, const real* __restrict__ labFrameQuadrupole, const float2* __restrict__ dampingAndThole) {
+    data.posq = posq[atom];
+    data.dipole.x = labFrameDipole[atom*3];
+    data.dipole.y = labFrameDipole[atom*3+1];
+    data.dipole.z = labFrameDipole[atom*3+2];
+    data.quadrupoleXX = labFrameQuadrupole[atom*9];
+    data.quadrupoleXY = labFrameQuadrupole[atom*9+1];
+    data.quadrupoleXZ = labFrameQuadrupole[atom*9+2];
+    data.quadrupoleYY = labFrameQuadrupole[atom*9+4];
+    data.quadrupoleYZ = labFrameQuadrupole[atom*9+5];
+    data.quadrupoleZZ = labFrameQuadrupole[atom*9+8];
+    float2 temp = dampingAndThole[atom];
+    data.damp = temp.x;
+    data.thole = temp.y;
+}
+__device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real3 deltaR, real3& field1, real3& field2) {
+    real rI = RSQRT(dot(deltaR, deltaR));
+    real r = RECIP(rI);
+    real r2I = rI*rI;
+    real rr3 = rI*r2I;
+    real rr5 = 3*rr3*r2I;
+    real rr7 = 5*rr5*r2I;
+    // get scaling factors, if needed
+    float damp = atom1.damp*atom2.damp;
+    real dampExp;
+    if (damp != 0 && r < SCALING_DISTANCE_CUTOFF) {
+        // get scaling factors
+        real ratio = r/damp;
+        float pGamma = atom2.thole > atom1.thole ? atom1.thole : atom2.thole; 
+        damp = ratio*ratio*ratio*pGamma;
+        dampExp = EXP(-damp);
+    }
+    else
+        dampExp = 0;
+    rr3 *= 1 - dampExp;
+    rr5 *= 1 - (1+damp)*dampExp;
+    rr7 *= 1 - (1+damp+(0.6f*damp*damp))*dampExp;
+    real rr5_2 = 2*rr5;
+    real3 qDotDelta;
+    qDotDelta.x = deltaR.x*atom2.quadrupoleXX + deltaR.y*atom2.quadrupoleXY + deltaR.z*atom2.quadrupoleXZ;
+    qDotDelta.y = deltaR.x*atom2.quadrupoleXY + deltaR.y*atom2.quadrupoleYY + deltaR.z*atom2.quadrupoleYZ;
+    qDotDelta.z = deltaR.x*atom2.quadrupoleXZ + deltaR.y*atom2.quadrupoleYZ + deltaR.z*atom2.quadrupoleZZ;
+    real dotdd = dot(deltaR, atom2.dipole);
+    real dotqd = dot(deltaR, qDotDelta);
+    real factor = -rr3*atom2.posq.w + rr5*dotdd - rr7*dotqd;
+    field1 = deltaR*factor - rr3*atom2.dipole + rr5_2*qDotDelta;
+    qDotDelta.x = deltaR.x*atom1.quadrupoleXX + deltaR.y*atom1.quadrupoleXY + deltaR.z*atom1.quadrupoleXZ;
+    qDotDelta.y = deltaR.x*atom1.quadrupoleXY + deltaR.y*atom1.quadrupoleYY + deltaR.z*atom1.quadrupoleYZ;
+    qDotDelta.z = deltaR.x*atom1.quadrupoleXZ + deltaR.y*atom1.quadrupoleYZ + deltaR.z*atom1.quadrupoleZZ;
+    dotdd = dot(deltaR, atom1.dipole);
+    dotqd = dot(deltaR, qDotDelta);
+    factor = rr3*atom1.posq.w + rr5*dotdd + rr7*dotqd;
+    field2 = deltaR*factor - rr3*atom1.dipole - rr5_2*qDotDelta;
+}
+__device__ real computeDScaleFactor(unsigned int polarizationGroup) {
+    return (polarizationGroup & 1 ? 0 : 1);
+}
+__device__ float computeDScaleFactor(uint2 covalent) {
+//        int f1 = (value == 0 || value == 1 ? 1 : 0);
+//        int f2 = (value == 0 || value == 2 ? 1 : 0);
+    // 0 = 12 or 13: x and y: 0
+    // 1 = 14: x: 0.4
+    // 2 = 15: y: 0.8
+    bool x = (covalent.x & 1);
+    bool y = (covalent.y & 1);
+    return (x ? (y ? 0.0f : 0.4f) : (y ? 0.8f : 1.0f));
+}
+__device__ float computePScaleFactor(uint2 covalent, unsigned int polarizationGroup) {
+    bool x = (covalent.x & 1);
+    bool y = (covalent.y & 1);
+    bool p = (polarizationGroup & 1);
+    return (x && y ? 0.0f : (x && p ? 0.5f : 1.0f));
+}
+/**
+ * Compute nonbonded interactions.
+ */
+extern "C" __global__ void computeFixedField(
+        unsigned long long* __restrict__ fieldBuffers, unsigned long long* __restrict__ fieldPolarBuffers, const real4* __restrict__ posq,
+        const unsigned int* __restrict__ exclusions, const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
+        const uint2* __restrict__ covalentFlags, const unsigned int* __restrict__ polarizationGroupFlags, unsigned int startTileIndex, unsigned int numTileIndices,
+#ifdef USE_CUTOFF
+        const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags,
+#endif
+        const real* __restrict__ labFrameDipole, const real* __restrict__ labFrameQuadrupole, const float2* __restrict__ dampingAndThole) {
+    unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
+    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
+#ifdef USE_CUTOFF
+    const unsigned int numTiles = interactionCount[0];
+    unsigned int pos = (numTiles > maxTiles ? startTileIndex+warp*numTileIndices/totalWarps : warp*numTiles/totalWarps);
+    unsigned int end = (numTiles > maxTiles ? startTileIndex+(warp+1)*numTileIndices/totalWarps : (warp+1)*numTiles/totalWarps);
+#else
+    const unsigned int numTiles = numTileIndices;
+    unsigned int pos = startTileIndex+warp*numTiles/totalWarps;
+    unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps;
+#endif
+    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
+    __shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
+    __shared__ int exclusionIndex[WARPS_PER_GROUP];
+#ifndef ENABLE_SHUFFLE
+    __shared__ real tempBuffer[3*THREAD_BLOCK_SIZE];
+#endif
+    do {
+        // Extract the coordinates of this tile
+        const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
+        const unsigned int tbx = threadIdx.x - tgx;
+        const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
+        unsigned int x, y;
+        AtomData data;
+        data.field = make_real3(0);
+        data.fieldPolar = make_real3(0);
+        if (pos < end) {
+#ifdef USE_CUTOFF
+            if (numTiles <= maxTiles) {
+                ushort2 tileIndices = tiles[pos];
+                x = tileIndices.x;
+                y = tileIndices.y;
+            }
+            else
+#endif
+            {
+                y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+                if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+                    y += (x < y ? -1 : 1);
+                    x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+                }
+            }
+            unsigned int atom1 = x*TILE_SIZE + tgx;
+            loadAtomData(data, atom1, posq, labFrameDipole, labFrameQuadrupole, dampingAndThole);
+            // Locate the exclusion data for this tile.
+            if (tgx < 2)
+                exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
+            if (tgx == 0)
+                exclusionIndex[localGroupIndex] = -1;
+            for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
+                if (exclusionIndices[i] == y)
+                    exclusionIndex[localGroupIndex] = i*TILE_SIZE;
+            bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
+            if (pos >= end)
+                ; // This warp is done.
+            else if (x == y) {
+                // This tile is on the diagonal.
+                const unsigned int localAtomIndex = threadIdx.x;
+                localData[localAtomIndex].posq = data.posq;
+                localData[localAtomIndex].dipole = data.dipole;
+                localData[localAtomIndex].quadrupoleXX = data.quadrupoleXX;
+                localData[localAtomIndex].quadrupoleXY = data.quadrupoleXY;
+                localData[localAtomIndex].quadrupoleXZ = data.quadrupoleXZ;
+                localData[localAtomIndex].quadrupoleYY = data.quadrupoleYY;
+                localData[localAtomIndex].quadrupoleYZ = data.quadrupoleYZ;
+                localData[localAtomIndex].quadrupoleZZ = data.quadrupoleZZ;
+                localData[localAtomIndex].thole = data.thole; // IS THIS CORRECT?
+                localData[localAtomIndex].damp = data.damp; // IS THIS CORRECT?
+                unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
+                uint2 covalent = covalentFlags[exclusionIndex[localGroupIndex]+tgx];
+                unsigned int polarizationGroup = polarizationGroupFlags[exclusionIndex[localGroupIndex]+tgx];
+                for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                    bool isExcluded = !(excl & 0x1);
+                    int atom2 = tbx+j;
+                    real3 delta = make_real3(localData[atom2].posq.x-data.posq.x, localData[atom2].posq.y-data.posq.y, localData[atom2].posq.z-data.posq.z);
+#ifdef USE_PERIODIC
+                    delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                    delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                    delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+                    real3 field1;
+                    real3 field2;
+                    computeOneInteraction(data, localData[atom2], delta, field1, field2);
+                    if (!isExcluded) {
+                        float d = computeDScaleFactor(covalent);
+                        data.field += d*field1;
+                        float p = computePScaleFactor(covalent, polarizationGroup);
+                        data.fieldPolar += p*field1;
+                    }
+                    excl >>= 1;
+                    covalent.x >>= 1;
+                    covalent.y >>= 1;
+                    polarizationGroup >>= 1;
+                }
+            }
+            else {
+                // This is an off-diagonal tile.
+                const unsigned int localAtomIndex = threadIdx.x;
+                unsigned int j = y*TILE_SIZE + tgx;
+                loadAtomData(localData[localAtomIndex], j, posq, labFrameDipole, labFrameQuadrupole, dampingAndThole);
+                localData[localAtomIndex].field = make_real3(0);
+                localData[localAtomIndex].fieldPolar = make_real3(0);
+#ifdef USE_CUTOFF
+                unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
+                if (!hasExclusions && flags != 0xFFFFFFFF) {
+                    if (flags == 0) {
+                        // No interactions in this tile.
+                    }
+                    else {
+                        // Compute only a subset of the interactions in this tile.
+                        for (j = 0; j < TILE_SIZE; j++) {
+                            if ((flags&(1<<j)) != 0) {
+                                bool isExcluded = false;
+                                int atom2 = tbx+j;
+                                int bufferIndex = 3*threadIdx.x;
+                                real3 dEdR1 = make_real3(0);
+                                real3 dEdR2 = make_real3(0);
+                                real3 delta = make_real3(localData[atom2].posq.x-data.posq.x, localData[atom2].posq.y-data.posq.y, localData[atom2].posq.z-data.posq.z);
+#ifdef USE_PERIODIC
+                                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+                                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+                                if (r2 < CUTOFF_SQUARED) {
+#endif
+                                    real invR = RSQRT(r2);
+                                    real r = RECIP(invR);
+                                    LOAD_ATOM2_PARAMETERS
+                                    atom2 = y*TILE_SIZE+j;
+                                    COMPUTE_INTERACTION
+#ifdef USE_CUTOFF
+                                }
+#endif
+#ifdef ENABLE_SHUFFLE
+                                force.x -= dEdR1.x;
+                                force.y -= dEdR1.y;
+                                force.z -= dEdR1.z;
+                                for (int i = 16; i >= 1; i /= 2) {
+                                    dEdR2.x += __shfl_xor(dEdR2.x, i, 32);
+                                    dEdR2.y += __shfl_xor(dEdR2.y, i, 32);
+                                    dEdR2.z += __shfl_xor(dEdR2.z, i, 32);
+                                }
+                                if (tgx == 0) {
+                                    localData[tbx+j].fx += dEdR2.x;
+                                    localData[tbx+j].fy += dEdR2.y;
+                                    localData[tbx+j].fz += dEdR2.z;
+                                }
+#else
+                                force.x -= dEdR1.x;
+                                force.y -= dEdR1.y;
+                                force.z -= dEdR1.z;
+                                tempBuffer[bufferIndex] = dEdR2.x;
+                                tempBuffer[bufferIndex+1] = dEdR2.y;
+                                tempBuffer[bufferIndex+2] = dEdR2.z;
+                                // Sum the forces on atom2.
+                                if (tgx % 4 == 0) {
+                                    tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9];
+                                    tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10];
+                                    tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11];
+                                }
+                                if (tgx == 0) {
+                                    localData[tbx+j].fx += tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
+                                    localData[tbx+j].fy += tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
+                                    localData[tbx+j].fz += tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
+                                }
+#endif
+                            }
+                        }
+                    }
+                }
+                else
+#endif
+                {
+                    // Compute the full set of interactions in this tile.
+                    unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
+                    uint2 covalent = (hasExclusions ? covalentFlags[exclusionIndex[localGroupIndex]+tgx] : make_uint2(0, 0));
+                    unsigned int polarizationGroup = (hasExclusions ? polarizationGroupFlags[exclusionIndex[localGroupIndex]+tgx] : 0);
+                    excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
+                    covalent.x = (covalent.x >> tgx) | (covalent.x << (TILE_SIZE - tgx));
+                    covalent.y = (covalent.y >> tgx) | (covalent.y << (TILE_SIZE - tgx));
+                    polarizationGroup = (polarizationGroup >> tgx) | (polarizationGroup << (TILE_SIZE - tgx));
+                    unsigned int tj = tgx;
+                    for (j = 0; j < TILE_SIZE; j++) {
+                        bool isExcluded = !(excl & 0x1);
+                        int atom2 = tbx+tj;
+                        real3 delta = make_real3(localData[atom2].posq.x-data.posq.x, localData[atom2].posq.y-data.posq.y, localData[atom2].posq.z-data.posq.z);
+#ifdef USE_PERIODIC
+                        delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                        delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                        delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+                        real3 field1;
+                        real3 field2;
+                        computeOneInteraction(data, localData[atom2], delta, field1, field2);
+                        if (!isExcluded) {
+                            float d = computeDScaleFactor(covalent);
+                            data.field += d*field1;
+                            localData[atom2].field += d*field2;
+                            float p = computePScaleFactor(covalent, polarizationGroup);
+                            data.fieldPolar += p*field1;
+                            localData[atom2].fieldPolar += p*field2;
+                        }
+                        excl >>= 1;
+                        covalent.x >>= 1;
+                        covalent.y >>= 1;
+                        polarizationGroup >>= 1;
+                        tj = (tj + 1) & (TILE_SIZE - 1);
+                    }
+                }
+            }
+        }
+        // Write results.
+        if (pos < end) {
+            const unsigned int offset = x*TILE_SIZE + tgx;
+            atomicAdd(&fieldBuffers[offset], static_cast<unsigned long long>((long long) (data.field.x*0xFFFFFFFF)));
+            atomicAdd(&fieldBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.y*0xFFFFFFFF)));
+            atomicAdd(&fieldBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.z*0xFFFFFFFF)));
+            atomicAdd(&fieldPolarBuffers[offset], static_cast<unsigned long long>((long long) (data.fieldPolar.x*0xFFFFFFFF)));
+            atomicAdd(&fieldPolarBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolar.y*0xFFFFFFFF)));
+            atomicAdd(&fieldPolarBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolar.z*0xFFFFFFFF)));
+        }
+        if (pos < end && x != y) {
+            const unsigned int offset = y*TILE_SIZE + tgx;
+            atomicAdd(&fieldBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.x*0xFFFFFFFF)));
+            atomicAdd(&fieldBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.y*0xFFFFFFFF)));
+            atomicAdd(&fieldBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.z*0xFFFFFFFF)));
+            atomicAdd(&fieldPolarBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.x*0xFFFFFFFF)));
+            atomicAdd(&fieldPolarBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.y*0xFFFFFFFF)));
+            atomicAdd(&fieldPolarBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.z*0xFFFFFFFF)));
+        }
+        pos++;
+    } while (pos < end);
+}
--- a/plugins/amoeba/platforms/cuda2/src/kernels/multipoles.cu
+++ b/plugins/amoeba/platforms/cuda2/src/kernels/multipoles.cu
+extern "C" __global__ void computeLabFrameMoments(real4* __restrict__ posq, int4* __restrict__ multipoleParticles, float* __restrict__ molecularDipoles,
+        float* __restrict__ molecularQuadrupoles, real* __restrict__ labFrameDipoles, real* __restrict__ labFrameQuadrupoles) {
+    // get coordinates of this atom and the z & x axis atoms
+    // compute the vector between the atoms and 1/sqrt(d2), d2 is distance between
+    // this atom and the axis atom
+    // this atom is referred to as the k-atom in notes below
+    // code common to ZThenX and Bisector
+    for (int particleIndex = blockIdx.x*blockDim.x+threadIdx.x; particleIndex < NUM_ATOMS; particleIndex += gridDim.x*blockDim.x) {
+        int4 particles = multipoleParticles[particleIndex];
+        if (particles.x >= 0 && particles.z >= 0) {
+            real4 thisParticlePos = posq[particleIndex];
+            real4 posZ = posq[particles.z];
+            real3 vectorZ = make_real3(posZ.x-thisParticlePos.x, posZ.y-thisParticlePos.y, posZ.z-thisParticlePos.z);
+            real4 posX = posq[particles.x];
+            real3 vectorX = make_real3(posX.x-thisParticlePos.x, posX.y-thisParticlePos.y, posX.z-thisParticlePos.z);
+            int axisType = particles.w; 
+            /*
+                z-only
+                   (1) norm z
+                   (2) select random x
+                   (3) x = x - (x.z)z
+                   (4) norm x
+                z-then-x
+                   (1) norm z
+                   (2) norm x (not needed)
+                   (3) x = x - (x.z)z
+                   (4) norm x
+                bisector
+                   (1) norm z
+                   (2) norm x 
+                   (3) z = x + z
+                   (4) norm z
+                   (5) x = x - (x.z)z 
+                   (6) norm x 
+                z-bisect
+                   (1) norm z
+                   (2) norm x 
+                   (3) norm y 
+                   (3) x = x + y
+                   (4) norm x
+                   (5) x = x - (x.z)z 
+                   (6) norm x 
+                3-fold
+                   (1) norm z
+                   (2) norm x 
+                   (3) norm y 
+                   (4) z = x + y + z
+                   (5) norm z
+                   (6) x = x - (x.z)z 
+                   (7) norm x 
+            */
+            // branch based on axis type
+            vectorZ = normalize(vectorZ);
+            if (axisType == 1) {
+                // bisector
+                vectorX = normalize(vectorX);
+                vectorZ += vectorX;
+                vectorZ = normalize(vectorZ);
+            }
+            else if (axisType == 2 || axisType == 3) { 
+                // z-bisect
+                if (particles.y >= 0 && particles.y < NUM_ATOMS) {
+                    real4 posY = posq[particles.y];
+                    real3 vectorY = make_real3(posY.x-thisParticlePos.x, posY.y-thisParticlePos.y, posY.z-thisParticlePos.z);
+                    vectorY = normalize(vectorY);
+                    vectorX = normalize(vectorX);
+                    if (axisType == 2) {
+                        vectorX += vectorY;
+                        vectorX = normalize(vectorX);
+                    }
+                    else { 
+                        // 3-fold
+                        vectorZ += vectorX + vectorY;
+                        vectorZ = normalize(vectorZ);
+                    }
+                }
+            }
+            else if (axisType >= 4)
+                vectorX = make_real3((real) 0.1f);
+            // x = x - (x.z)z
+            vectorX -= dot(vectorZ, vectorX)*vectorZ;
+            vectorX = normalize(vectorX);
+            real3 vectorY = cross(vectorZ, vectorX);
+            // use identity rotation matrix for unrecognized axis types
+            if (axisType < 0 || axisType > 4) {
+                vectorX.x = 1;
+                vectorX.y = 0;
+                vectorX.z = 0;
+                vectorY.x = 0;
+                vectorY.y = 1;
+                vectorY.z = 0;
+                vectorZ.x = 0;
+                vectorZ.y = 0;
+                vectorZ.z = 1;
+            }
+            // Check the chirality and see whether it needs to be reversed
+            bool reverse = false;
+            if (axisType != 0 && particles.x >= 0 && particles.y >=0 && particles.z >= 0) {
+                real4 posY = posq[particles.y];
+                real delta[4][3];
+                delta[0][0] = thisParticlePos.x - posY.x;
+                delta[0][1] = thisParticlePos.y - posY.y;
+                delta[0][2] = thisParticlePos.z - posY.z;
+                delta[1][0] = posZ.x - posY.x;
+                delta[1][1] = posZ.y - posY.y;
+                delta[1][2] = posZ.z - posY.z;
+                delta[2][0] = posX.x - posY.x;
+                delta[2][1] = posX.y - posY.y;
+                delta[2][2] = posX.z - posY.z;
+                delta[3][0] = delta[1][1]*delta[2][2] - delta[1][2]*delta[2][1];
+                delta[3][1] = delta[2][1]*delta[0][2] - delta[2][2]*delta[0][1];
+                delta[3][2] = delta[0][1]*delta[1][2] - delta[0][2]*delta[1][1];
+                real volume = delta[3][0]*delta[0][0] + delta[3][1]*delta[1][0] + delta[3][2]*delta[2][0];
+                reverse = (volume < 0);
+            }
+            // Transform the dipole
+            unsigned int offset = 3*particleIndex;
+            real molDipole[3];
+            molDipole[0] = molecularDipoles[offset];
+            molDipole[1] = molecularDipoles[offset+1];
+            molDipole[2] = molecularDipoles[offset+2];
+            if (reverse)
+                molDipole[1] *= -1;
+            labFrameDipoles[offset] = molDipole[0]*vectorX.x + molDipole[1]*vectorY.x + molDipole[2]*vectorZ.x;
+            labFrameDipoles[offset+1] = molDipole[0]*vectorX.y + molDipole[1]*vectorY.y + molDipole[2]*vectorZ.y;
+            labFrameDipoles[offset+2] = molDipole[0]*vectorX.z + molDipole[1]*vectorY.z + molDipole[2]*vectorZ.z;
+            // ---------------------------------------------------------------------------------------
+            // Transform the quadrupole
+            real mPole[3][3];
+            offset = 9*particleIndex;
+            mPole[0][0] = molecularQuadrupoles[offset];
+            mPole[0][1] = molecularQuadrupoles[offset+1];
+            mPole[0][2] = molecularQuadrupoles[offset+2];
+            mPole[1][0] = molecularQuadrupoles[offset+3];
+            mPole[1][1] = molecularQuadrupoles[offset+4];
+            mPole[1][2] = molecularQuadrupoles[offset+5];
+            mPole[2][0] = molecularQuadrupoles[offset+6];
+            mPole[2][1] = molecularQuadrupoles[offset+7];
+            mPole[2][2] = molecularQuadrupoles[offset+8];
+            if (reverse) {
+                mPole[0][1] *= -1;
+                mPole[1][0] *= -1;
+                mPole[1][2] *= -1;
+                mPole[2][1] *= -1;
+            }
+            labFrameQuadrupoles[offset+8] = vectorX.z*(vectorX.z*mPole[0][0] + vectorY.z*mPole[0][1] + vectorZ.z*mPole[0][2]);
+            labFrameQuadrupoles[offset+8] += vectorY.z*(vectorX.z*mPole[1][0] + vectorY.z*mPole[1][1] + vectorZ.z*mPole[1][2]);
+            labFrameQuadrupoles[offset+8] += vectorZ.z*(vectorX.z*mPole[2][0] + vectorY.z*mPole[2][1] + vectorZ.z*mPole[2][2]);
+            labFrameQuadrupoles[offset+4] = vectorX.y*(vectorX.y*mPole[0][0] + vectorY.y*mPole[0][1] + vectorZ.y*mPole[0][2]);
+            labFrameQuadrupoles[offset+4] += vectorY.y*(vectorX.y*mPole[1][0] + vectorY.y*mPole[1][1] + vectorZ.y*mPole[1][2]);
+            labFrameQuadrupoles[offset+4] += vectorZ.y*(vectorX.y*mPole[2][0] + vectorY.y*mPole[2][1] + vectorZ.y*mPole[2][2]);
+            labFrameQuadrupoles[offset+5] = vectorX.y*(vectorX.z*mPole[0][0] + vectorY.z*mPole[0][1] + vectorZ.z*mPole[0][2]);
+            labFrameQuadrupoles[offset+5] += vectorY.y*(vectorX.z*mPole[1][0] + vectorY.z*mPole[1][1] + vectorZ.z*mPole[1][2]);
+            labFrameQuadrupoles[offset+5] += vectorZ.y*(vectorX.z*mPole[2][0] + vectorY.z*mPole[2][1] + vectorZ.z*mPole[2][2]);
+            labFrameQuadrupoles[offset] = vectorX.x*(vectorX.x*mPole[0][0] + vectorY.x*mPole[0][1] + vectorZ.x*mPole[0][2]);
+            labFrameQuadrupoles[offset] += vectorY.x*(vectorX.x*mPole[1][0] + vectorY.x*mPole[1][1] + vectorZ.x*mPole[1][2]);
+            labFrameQuadrupoles[offset] += vectorZ.x*(vectorX.x*mPole[2][0] + vectorY.x*mPole[2][1] + vectorZ.x*mPole[2][2]);
+            labFrameQuadrupoles[offset+1] = vectorX.x*(vectorX.y*mPole[0][0] + vectorY.y*mPole[0][1] + vectorZ.y*mPole[0][2]);
+            labFrameQuadrupoles[offset+1] += vectorY.x*(vectorX.y*mPole[1][0] + vectorY.y*mPole[1][1] + vectorZ.y*mPole[1][2]);
+            labFrameQuadrupoles[offset+1] += vectorZ.x*(vectorX.y*mPole[2][0] + vectorY.y*mPole[2][1] + vectorZ.y*mPole[2][2]);
+            labFrameQuadrupoles[offset+2] = vectorX.x*(vectorX.z*mPole[0][0] + vectorY.z*mPole[0][1] + vectorZ.z*mPole[0][2]);
+            labFrameQuadrupoles[offset+2] += vectorY.x*(vectorX.z*mPole[1][0] + vectorY.z*mPole[1][1] + vectorZ.z*mPole[1][2]);
+            labFrameQuadrupoles[offset+2] += vectorZ.x*(vectorX.z*mPole[2][0] + vectorY.z*mPole[2][1] + vectorZ.z*mPole[2][2]);
+            labFrameQuadrupoles[offset+3] = labFrameQuadrupoles[offset+1];
+            labFrameQuadrupoles[offset+6] = labFrameQuadrupoles[offset+2];
+            labFrameQuadrupoles[offset+7] = labFrameQuadrupoles[offset+5];
+        }
+    }
+}
--- a/plugins/amoeba/platforms/cuda2/tests/TestCudaAmoebaPME.cpp
+++ b/plugins/amoeba/platforms/cuda2/tests/TestCudaAmoebaPME.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMMAmoeba                             *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2010 Stanford University and the Authors.      *
+ * Authors: Mark Friedrichs                                                   *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+/**
+ * This tests the CUDA implementation of AmoebaMultipoleForce.
+ */
+#include "openmm/internal/AssertionUtilities.h"
+#include "AmoebaTinkerParameterFile.h"
+#include "openmm/Context.h"
+#include "CudaPlatform.h"
+#include "openmm/AmoebaMultipoleForce.h"
+#include "openmm/System.h"
+#include "openmm/VerletIntegrator.h"
+#include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
+#include <iostream>
+#include <vector>
+using namespace OpenMM;
+using namespace std;
+const double TOL = 1e-5;
+void testPMEWater() {
+    Platform& platform = Platform::getPlatformByName("CUDA");
+    System system;
+    system.addParticle(16);
+    system.addParticle(1);
+    system.addParticle(1);
+    VerletIntegrator integrator(0.01);
+    AmoebaMultipoleForce* mp = new AmoebaMultipoleForce();
+    mp->setNonbondedMethod(AmoebaMultipoleForce::PME);
+    vector<double> dipole(3, 0.0);
+    dipole[2] = 7.556121361e-2;
+    vector<double> quadrupole(9, 0.0);
+    quadrupole[0] = 3.540307211e-2;
+    quadrupole[4] = -3.902570771e-2;
+    quadrupole[8] = 3.622635596e-3;
+    double damp = 9.707801995e-01*sqrt(0.1);
+    double polarity = 0.837*0.001;
+    mp->addParticle(-0.51966, dipole, quadrupole, 1, 1, 2, -1, 0.39, damp, polarity);
+    dipole[0] = -2.042094848e-2;
+    dipole[2] = -3.078753000e-2;
+    quadrupole[0] = -3.428482490e-3;
+    quadrupole[2] = -1.894859639e-4;
+    quadrupole[4] = -1.002408752e-2;
+    quadrupole[6] = -1.894859639e-4;
+    quadrupole[8] = 1.345257001e-2;
+    damp          = 8.897068742e-01*sqrt(0.1);
+    polarity      = 0.496*0.001;
+    mp->addParticle(0.25983, dipole, quadrupole, 0, 0, 2, -1, 0.39, damp, polarity);
+    mp->addParticle(0.25983, dipole, quadrupole, 0, 0, 1, -1, 0.39, damp, polarity);
+    mp->setCutoffDistance(1.0);
+    std::vector<int> intVector;
+    intVector.push_back( 0 );
+    intVector.push_back( 1 );
+    intVector.push_back( 2 );
+    mp->setCovalentMap( 0, AmoebaMultipoleForce::PolarizationCovalent11, intVector );
+    mp->setCovalentMap( 1, AmoebaMultipoleForce::PolarizationCovalent11, intVector );
+    mp->setCovalentMap( 2, AmoebaMultipoleForce::PolarizationCovalent11, intVector );
+    intVector.resize(0); intVector.push_back( 1 ); intVector.push_back( 2 );
+    mp->setCovalentMap( 0, AmoebaMultipoleForce::Covalent12, intVector );
+    intVector.resize(0); intVector.push_back( 0 ); intVector.push_back( 2 );
+    mp->setCovalentMap( 1, AmoebaMultipoleForce::Covalent12, intVector );
+    intVector.resize(0); intVector.push_back( 0 ); intVector.push_back( 1 );
+    mp->setCovalentMap( 2, AmoebaMultipoleForce::Covalent12, intVector );
+    mp->setEwaldErrorTolerance(TOL);
+    system.setDefaultPeriodicBoxVectors(Vec3(2, 0, 0), Vec3(0, 2, 0), Vec3(0, 0, 2));
+    system.addForce(mp);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(3);
+    const double angle = 109.47*M_PI/180;
+    const double dOH = 0.1;
+    positions[0] = Vec3();
+    positions[1] = Vec3(dOH, 0, 0);
+    positions[2] = Vec3(dOH*std::cos(angle), dOH*std::sin(angle), 0);
+    context.setPositions(positions);
+    State state = context.getState(State::Forces | State::Energy);
+    const vector<Vec3>& forces = state.getForces();
+#ifdef AMOEBA_DEBUG
+    (void) fprintf( stderr, "PME forces\n" );
+        for( unsigned int ii = 0; ii < forces.size(); ii++ ){
+            (void) fprintf( stderr, "%6u [%14.7e %14.7e %14.7e]\n", ii, 
+                            forces[ii][0], forces[ii][1], forces[ii][2] );
+    }
+    (void) fflush( stderr );
+#endif
+}
+int main() {
+    try {
+        registerAmoebaCudaKernelFactories();
+        testPMEWater();
+    } catch(const exception& e) {
+        cout << "exception: " << e.what() << endl;
+        return 1;
+    }
+    std::cout << "Done" << std::endl;
+    return 0;
+}