Continuing converting AMOEBA to the new CUDA platform: starting on AmoebaMultipoleForce

d0426ba9 · Peter Eastman · a99a2d0e · d0426ba9 · d0426ba9 · d0426ba9
Commit d0426ba9 authored Jul 20, 2012 by Peter Eastman
13 changed files
--- a/platforms/cuda2/src/CudaArray.cpp
+++ b/platforms/cuda2/src/CudaArray.cpp
@@ -78,3 +78,14 @@ void CudaArray::download(void* data, bool blocking) const {
        throw OpenMMException(str.str());
    }
 }
+
+void CudaArray::copyTo(CudaArray& dest) const {
+    // The copy is a raw byte transfer, so the destination must have the same
+    // element count and the same element width as this array.
+    const bool sameLayout = (dest.getSize() == size && dest.getElementSize() == elementSize);
+    if (!sameLayout)
+        throw OpenMMException("Error copying array "+name+" to "+dest.getName()+": The destination array does not match the size of the array");
+
+    // Issue the device-to-device copy asynchronously on stream 0.
+    const CUresult status = cuMemcpyDtoDAsync(dest.getDevicePointer(), pointer, size*elementSize, 0);
+    if (status == CUDA_SUCCESS)
+        return;
+    std::stringstream message;
+    message<<"Error copying array "<<name<<" to "<<dest.getName()<<": "<<CudaContext::getErrorString(status)<<" ("<<status<<")";
+    throw OpenMMException(message.str());
+}
--- a/platforms/cuda2/src/CudaArray.h
+++ b/platforms/cuda2/src/CudaArray.h
@@ -127,6 +127,12 @@ public:
     *                 the destination array must be in page-locked memory.
     */
    void download(void* data, bool blocking = true) const;
+    /**
+     * Copy the values in the device memory to a second array.
+     * 
+     * @param dest     the destination array to copy to
+     */
+    void copyTo(CudaArray& dest) const;
 private:
    CudaContext& context;
    CUdeviceptr pointer;

--- a/platforms/cuda2/src/CudaContext.cpp
+++ b/platforms/cuda2/src/CudaContext.cpp
@@ -468,6 +468,10 @@ void CudaContext::clearBuffer(CUdeviceptr memory, int size) {
    executeKernel(clearBufferKernel, args, size, 128);
 }

+void CudaContext::addAutoclearBuffer(CudaArray& array) {
+    // Convenience overload: register the whole array, passing its total size in bytes
+    // (element count times element width) to the pointer/size overload.
+    const int numBytes = array.getSize()*array.getElementSize();
+    addAutoclearBuffer(array.getDevicePointer(), numBytes);
+}
+
 void CudaContext::addAutoclearBuffer(CUdeviceptr memory, int size) {
    autoclearBuffers.push_back(memory);
    autoclearBufferSizes.push_back(size/4);

--- a/platforms/cuda2/src/CudaContext.h
+++ b/platforms/cuda2/src/CudaContext.h
@@ -231,6 +231,10 @@ public:
     * @param size       the size of the buffer in bytes
     */
    void clearBuffer(CUdeviceptr memory, int size);
+    /**
+     * Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
+     */
+    void addAutoclearBuffer(CudaArray& array);
    /**
     * Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
     *

--- a/platforms/cuda2/src/CudaKernels.cpp
+++ b/platforms/cuda2/src/CudaKernels.cpp
@@ -1454,7 +1454,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon

        int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
        pmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*gridSizeZ, 2*elementSize, "pmeGrid");
-        cu.addAutoclearBuffer(pmeGrid->getDevicePointer(), pmeGrid->getSize()*2*elementSize);
+        cu.addAutoclearBuffer(*pmeGrid);
        pmeBsplineModuliX = new CudaArray(cu, gridSizeX, elementSize, "pmeBsplineModuliX");
        pmeBsplineModuliY = new CudaArray(cu, gridSizeY, elementSize, "pmeBsplineModuliY");
        pmeBsplineModuliZ = new CudaArray(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
@@ -1928,8 +1928,8 @@ void CudaCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOBCF
    }
    bornSum = CudaArray::create<long long>(cu, cu.getPaddedNumAtoms(), "bornSum");
    bornForce = CudaArray::create<long long>(cu, cu.getPaddedNumAtoms(), "bornForce");
-    cu.addAutoclearBuffer(bornSum->getDevicePointer(), bornSum->getSize()*sizeof(long long));
-    cu.addAutoclearBuffer(bornForce->getDevicePointer(), bornForce->getSize()*sizeof(long long));
+    cu.addAutoclearBuffer(*bornSum);
+    cu.addAutoclearBuffer(*bornForce);
    CudaArray& posq = cu.getPosq();
    float4* posqf = (float4*) cu.getPinnedBuffer();
    double4* posqd = (double4*) cu.getPinnedBuffer();
@@ -2757,7 +2757,7 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
            cu.getNonbondedUtilities().addArgument(arguments[i]);
    }
    cu.addForce(new CudaCustomGBForceInfo(force));
-    cu.addAutoclearBuffer(longEnergyDerivs->getDevicePointer(), sizeof(long long)*longEnergyDerivs->getSize());
+    cu.addAutoclearBuffer(*longEnergyDerivs);
 }

 double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
@@ -2766,7 +2766,7 @@ double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeFo
        hasInitializedKernels = true;
        maxTiles = (nb.getUseCutoff() ? nb.getInteractingTiles().getSize() : cu.getNumAtomBlocks()*(cu.getNumAtomBlocks()+1)/2);
        valueBuffers = CudaArray::create<long long>(cu, cu.getPaddedNumAtoms(), "customGBValueBuffers");
-        cu.addAutoclearBuffer(valueBuffers->getDevicePointer(), sizeof(long long)*valueBuffers->getSize());
+        cu.addAutoclearBuffer(*valueBuffers);
        cu.clearBuffer(valueBuffers->getDevicePointer(), sizeof(long long)*valueBuffers->getSize());
        pairValueArgs.push_back(&cu.getPosq().getDevicePointer());
        pairValueArgs.push_back(&cu.getNonbondedUtilities().getExclusions().getDevicePointer());

--- a/platforms/cuda2/src/CudaNonbondedUtilities.cpp
+++ b/platforms/cuda2/src/CudaNonbondedUtilities.cpp
@@ -177,9 +177,7 @@ void CudaNonbondedUtilities::initialize(const System& system) {
    // Record the exclusion data.

    exclusions = CudaArray::create<unsigned int>(context, tilesWithExclusions.size()*CudaContext::TileSize, "exclusions");
-    vector<unsigned int> exclusionVec(exclusions->getSize());
-    for (int i = 0; i < exclusions->getSize(); ++i)
-        exclusionVec[i] = 0xFFFFFFFF;
+    vector<unsigned int> exclusionVec(exclusions->getSize(), 0xFFFFFFFF);
    for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) {
        int x = atom1/CudaContext::TileSize;
        int offset1 = atom1-x*CudaContext::TileSize;
@@ -249,6 +247,7 @@ void CudaNonbondedUtilities::initialize(const System& system) {

    // Create kernels.

+    if (kernelSource.size() > 0)
        forceKernel = createInteractionKernel(kernelSource, parameters, arguments, true, true);
    if (useCutoff) {
        map<string, string> defines;
@@ -291,6 +290,8 @@ void CudaNonbondedUtilities::initialize(const System& system) {
 }

 int CudaNonbondedUtilities::findExclusionIndex(int x, int y, const vector<unsigned int>& exclusionIndices, const vector<unsigned int>& exclusionRowIndices) {
+    if (x < y)
+        throw OpenMMException("Internal error: called findExclusionIndex with x<y");
    int start = exclusionRowIndices[x];
    int end = exclusionRowIndices[x+1];
    for (int i = start; i < end; i++)
@@ -317,7 +318,7 @@ void CudaNonbondedUtilities::prepareInteractions() {
 }

 void CudaNonbondedUtilities::computeInteractions() {
-    if (cutoff != -1.0)
+    if (kernelSource.size() > 0)
        context.executeKernel(forceKernel, &forceArgs[0], numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
 }


--- a/platforms/cuda2/src/CudaNonbondedUtilities.h
+++ b/platforms/cuda2/src/CudaNonbondedUtilities.h
@@ -232,8 +232,20 @@ public:
     * @param isSymmetric   specifies whether the interaction is symmetric
     */
    CUfunction createInteractionKernel(const std::string& source, std::vector<ParameterInfo>& params, std::vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric);
-private:
+    /**
+     * This is a utility routine for locating data in the exclusions array.  It takes the (x,y) indices of a tile,
+     * and returns the location in the array where the data for that tile begins.
+     * 
+     * This routine requires that x >= y.  If not, it will throw an exception.
+     * 
+     * @param x                   the x index of the tile
+     * @param y                   the y index of the tile
+     * @param exclusionIndices    the content of the exclusionIndices array
+     * @param exclusionRowIndices the content of the exclusionRowIndices array
+     * @return the index in the exclusions array at which the data for that tile begins
+     */
    static int findExclusionIndex(int x, int y, const std::vector<unsigned int>& exclusionIndices, const std::vector<unsigned int>& exclusionRowIndices);
+private:
    CudaContext& context;
    CUfunction forceKernel;
    CUfunction findBlockBoundsKernel;

--- a/plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernelFactory.cpp
+++ b/plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernelFactory.cpp
@@ -97,9 +97,9 @@ KernelImpl* AmoebaCudaKernelFactory::createKernelImpl(std::string name, const Pl
    if (name == CalcAmoebaTorsionTorsionForceKernel::Name())
        return new CudaCalcAmoebaTorsionTorsionForceKernel(name, platform, cu, context.getSystem());

-//    if (name == CalcAmoebaMultipoleForceKernel::Name())
-//        return new CudaCalcAmoebaMultipoleForceKernel(name, platform, cu, context.getSystem());
-//
+    if (name == CalcAmoebaMultipoleForceKernel::Name())
+        return new CudaCalcAmoebaMultipoleForceKernel(name, platform, cu, context.getSystem());
+
 //    if (name == CalcAmoebaGeneralizedKirkwoodForceKernel::Name())
 //        return new CudaCalcAmoebaGeneralizedKirkwoodForceKernel(name, platform, cu, context.getSystem());


--- a/plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernels.cpp
@@ -754,10 +754,10 @@ double CudaCalcAmoebaTorsionTorsionForceKernel::execute(ContextImpl& context, bo
    return 0.0;
 }

-///* -------------------------------------------------------------------------- *
-// *                             AmoebaMultipole                                *
-// * -------------------------------------------------------------------------- */
-//
+/* -------------------------------------------------------------------------- *
+ *                             AmoebaMultipole                                *
+ * -------------------------------------------------------------------------- */
+
 //static void computeAmoebaMultipoleForce( CudaContext& cu ) {
 //
 //    amoebaGpuContext gpu = data.getAmoebaGpu();
@@ -833,47 +833,330 @@ double CudaCalcAmoebaTorsionTorsionForceKernel::execute(ContextImpl& context, bo
 //    kCalculateAmoebaSystemMultipoleMoments( gpu, origin, outputMultipoleMonents );
 //
 //}
-//
-//class CudaCalcAmoebaMultipoleForceKernel::ForceInfo : public CudaForceInfo {
-//public:
-//    ForceInfo(const AmoebaMultipoleForce& force) : force(force) {
+
+/**
+ * CudaForceInfo implementation for AmoebaMultipoleForce.  It reports whether two
+ * particles carry identical multipole parameters.
+ */
+class CudaCalcAmoebaMultipoleForceKernel::ForceInfo : public CudaForceInfo {
+public:
+    ForceInfo(const AmoebaMultipoleForce& force) : force(force) {
+    }
+    /**
+     * Compare the multipole parameters of two particles.
+     *
+     * The charge, Thole, damping, polarity, axis type, dipole and quadrupole are compared.
+     * The indices of the axis-defining particles are retrieved but not compared.
+     *
+     * @param particle1    the index of the first particle
+     * @param particle2    the index of the second particle
+     * @return true if every compared parameter is exactly equal
+     */
+    bool areParticlesIdentical(int particle1, int particle2) {
+        double charge1, charge2, thole1, thole2, damping1, damping2, polarity1, polarity2;
+        int axis1, axis2, multipole11, multipole12, multipole21, multipole22, multipole31, multipole32;
+        vector<double> dipole1, dipole2, quadrupole1, quadrupole2;
+        force.getMultipoleParameters(particle1, charge1, dipole1, quadrupole1, axis1, multipole11, multipole21, multipole31, thole1, damping1, polarity1);
+        force.getMultipoleParameters(particle2, charge2, dipole2, quadrupole2, axis2, multipole12, multipole22, multipole32, thole2, damping2, polarity2);
+        if (charge1 != charge2 || thole1 != thole2 || damping1 != damping2 || polarity1 != polarity2 || axis1 != axis2)
+            return false;
+
+        // Whole-vector comparison checks the lengths as well as the elements, so the
+        // second particle's vectors are never indexed past their end when the two
+        // particles were given dipole/quadrupole vectors of different lengths.
+        return (dipole1 == dipole2 && quadrupole1 == quadrupole2);
+    }
+private:
+    const AmoebaMultipoleForce& force;
+};
+
+CudaCalcAmoebaMultipoleForceKernel::CudaCalcAmoebaMultipoleForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system)
+        : CalcAmoebaMultipoleForceKernel(name, platform),
+          cu(cu),
+          system(system),
+          hasInitializedScaleFactors(false),
+          multipoleParticles(NULL),
+          torqueBufferIndices(NULL),
+          molecularDipoles(NULL),
+          molecularQuadrupoles(NULL),
+          labFrameDipoles(NULL),
+          labFrameQuadrupoles(NULL),
+          field(NULL),
+          fieldPolar(NULL),
+          dampingAndThole(NULL),
+          inducedDipole(NULL),
+          inducedDipolePolar(NULL),
+          currentEpsilon(NULL),
+          polarizability(NULL),
+          covalentFlags(NULL),
+          polarizationGroupFlags(NULL),
+          pmeGrid(NULL) {
+    // Every array pointer starts out as NULL; nothing is allocated here.
+}
+
+/**
+ * Release every array owned by this kernel.
+ */
+CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() {
+    // NOTE(review): presumably selects this kernel's CUDA context before the arrays are freed - confirm.
+    cu.setAsCurrent();
+
+    // Deleting a NULL pointer is a no-op, so arrays that were never allocated need no guard.
+    delete multipoleParticles;
+    delete torqueBufferIndices;
+    delete molecularDipoles;
+    delete molecularQuadrupoles;
+    delete labFrameDipoles;
+    delete labFrameQuadrupoles;
+    delete field;
+    delete fieldPolar;
+    delete dampingAndThole;
+    delete inducedDipole;
+    delete inducedDipolePolar;
+    delete currentEpsilon;
+    delete polarizability;
+    delete covalentFlags;
+    delete polarizationGroupFlags;
+    // pmeGrid is initialized to NULL by the constructor like the other arrays, so it is released here too.
+    delete pmeGrid;
+}
+
+void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const AmoebaMultipoleForce& force) {
+    cu.setAsCurrent();
+
+    // Initialize multipole parameters.
+
+    numMultipoles = force.getNumMultipoles();
+    CudaArray& posq = cu.getPosq();
+    float4* posqf = (float4*) cu.getPinnedBuffer();
+    double4* posqd = (double4*) cu.getPinnedBuffer();
+    vector<float2> dampingAndTholeVec;
+    vector<float> polarizabilityVec;
+    vector<float> molecularDipolesVec;
+    vector<float> molecularQuadrupolesVec;
+    vector<int4> multipoleParticlesVec;
+    for (int i = 0; i < numMultipoles; i++) {
+        double charge, thole, damping, polarity;
+        int axisType, atomX, atomY, atomZ;
+        vector<double> dipole, quadrupole;
+        force.getMultipoleParameters(i, charge, dipole, quadrupole, axisType, atomZ, atomX, atomY, thole, damping, polarity);
+        if (cu.getUseDoublePrecision())
+            posqd[i] = make_double4(0, 0, 0, charge);
+        else
+            posqf[i] = make_float4(0, 0, 0, (float) charge);
+        dampingAndTholeVec.push_back(make_float2((float) damping, (float) thole));
+        polarizabilityVec.push_back((float) polarity);
+        multipoleParticlesVec.push_back(make_int4(atomX, atomY, atomZ, axisType));
+        for (int j = 0; j < 3; j++)
+            molecularDipolesVec.push_back((float) dipole[j]);
+        for (int j = 0; j < 9; j++)
+            molecularQuadrupolesVec.push_back((float) quadrupole[j]);
+    }
+    int paddedNumAtoms = cu.getPaddedNumAtoms();
+    for (int i = numMultipoles; i < paddedNumAtoms; i++) {
+        dampingAndTholeVec.push_back(make_float2(0, 0));
+        polarizabilityVec.push_back(0);
+        multipoleParticlesVec.push_back(make_int4(0, 0, 0, 0));
+        for (int j = 0; j < 3; j++)
+            molecularDipolesVec.push_back(0);
+        for (int j = 0; j < 9; j++)
+            molecularQuadrupolesVec.push_back(0);
+    }
+    dampingAndThole = CudaArray::create<float2>(cu, paddedNumAtoms, "dampingAndThole");
+    polarizability = CudaArray::create<float>(cu, paddedNumAtoms, "polarizability");
+    multipoleParticles = CudaArray::create<int4>(cu, paddedNumAtoms, "multipoleParticles");
+    molecularDipoles = CudaArray::create<float>(cu, 3*paddedNumAtoms, "molecularDipoles");
+    molecularQuadrupoles = CudaArray::create<float>(cu, 9*paddedNumAtoms, "molecularQuadrupoles");
+    dampingAndThole->upload(dampingAndTholeVec);
+    polarizability->upload(polarizabilityVec);
+    multipoleParticles->upload(multipoleParticlesVec);
+    molecularDipoles->upload(molecularDipolesVec);
+    molecularQuadrupoles->upload(molecularQuadrupolesVec);
+    posq.upload(cu.getPinnedBuffer());
+    
+    // Create workspace arrays.
+    
+    int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
+    labFrameDipoles = new CudaArray(cu, 3*paddedNumAtoms, elementSize, "labFrameDipoles");
+    labFrameQuadrupoles = new CudaArray(cu, 9*paddedNumAtoms, elementSize, "labFrameQuadrupoles");
+    field = new CudaArray(cu, 3*paddedNumAtoms, sizeof(long long), "field");
+    fieldPolar = new CudaArray(cu, 3*paddedNumAtoms, sizeof(long long), "fieldPolar");
+    inducedDipole = new CudaArray(cu, 3*paddedNumAtoms, elementSize, "inducedDipole");
+    inducedDipolePolar = new CudaArray(cu, 3*paddedNumAtoms, elementSize, "inducedDipolePolar");
+    cu.addAutoclearBuffer(*field);
+    cu.addAutoclearBuffer(*fieldPolar);
+    
+    // Record which atoms should be flagged as exclusions based on covalent groups, and determine
+    // the values for the covalent group flags.
+    
+    vector<vector<int> > exclusions(numMultipoles);
+    for (int i = 0; i < numMultipoles; i++) {
+        vector<int> atoms;
+        set<int> allAtoms;
+        allAtoms.insert(i);
+        force.getCovalentMap(i, AmoebaMultipoleForce::Covalent12, atoms);
+        allAtoms.insert(atoms.begin(), atoms.end());
+        force.getCovalentMap(i, AmoebaMultipoleForce::Covalent13, atoms);
+        allAtoms.insert(atoms.begin(), atoms.end());
+        for (set<int>::const_iterator iter = allAtoms.begin(); iter != allAtoms.end(); ++iter)
+            covalentFlagValues.push_back(make_int3(i, *iter, 0));
+        force.getCovalentMap(i, AmoebaMultipoleForce::Covalent14, atoms);
+        allAtoms.insert(atoms.begin(), atoms.end());
+        for (int j = 0; j < (int) atoms.size(); j++)
+            covalentFlagValues.push_back(make_int3(i, atoms[j], 1));
+        force.getCovalentMap(i, AmoebaMultipoleForce::Covalent15, atoms);
+        for (int j = 0; j < (int) atoms.size(); j++)
+            covalentFlagValues.push_back(make_int3(i, atoms[j], 2));
+        allAtoms.insert(atoms.begin(), atoms.end());
+        force.getCovalentMap(i, AmoebaMultipoleForce::PolarizationCovalent11, atoms);
+        allAtoms.insert(atoms.begin(), atoms.end());
+        exclusions[i].insert(exclusions[i].end(), allAtoms.begin(), allAtoms.end());
+        for (int j = 0; j < (int) atoms.size(); j++)
+            polarizationFlagValues.push_back(make_int2(i, atoms[j]));
+    }
+    
+    // Create the kernels.
+
+    
+    // Create the other kernels.
+    
+    map<string, string> defines;
+    defines["NUM_ATOMS"] = cu.intToString(numMultipoles);
+    defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
+    defines["SCALING_DISTANCE_CUTOFF"] = cu.doubleToString(50.0);
+    defines["THREAD_BLOCK_SIZE"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize());
+    defines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks());
+    CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipoles, defines);
+    computeMomentsKernel = cu.getKernel(module, "computeLabFrameMoments");
+    module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipoleFixedField, defines);
+    computeFixedFieldKernel = cu.getKernel(module, "computeFixedField");
+
+    // Set up PME.
+    
+    bool usePME = (force.getNonbondedMethod() == AmoebaMultipoleForce::PME);
+//    map<string, string> defines;
+//    alpha = 0;
+//    if (usePME) {
+//        // Compute the PME parameters.
+//
+//        int gridSizeX, gridSizeY, gridSizeZ;
+//        NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSizeX, gridSizeY, gridSizeZ);
+//        gridSizeX = findFFTDimension(gridSizeX);
+//        gridSizeY = findFFTDimension(gridSizeY);
+//        gridSizeZ = findFFTDimension(gridSizeZ);
+//        defines["EWALD_ALPHA"] = cu.doubleToString(alpha);
+//        defines["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI));
+//        defines["USE_EWALD"] = "1";
+//        pmeDefines["PME_ORDER"] = cu.intToString(PmeOrder);
+//        pmeDefines["NUM_ATOMS"] = cu.intToString(numMultipoles);
+//        pmeDefines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
+//        pmeDefines["RECIP_EXP_FACTOR"] = cu.doubleToString(M_PI*M_PI/(alpha*alpha));
+//        pmeDefines["GRID_SIZE_X"] = cu.intToString(gridSizeX);
+//        pmeDefines["GRID_SIZE_Y"] = cu.intToString(gridSizeY);
+//        pmeDefines["GRID_SIZE_Z"] = cu.intToString(gridSizeZ);
+//        pmeDefines["EPSILON_FACTOR"] = cu.doubleToString(sqrt(ONE_4PI_EPS0));
+//        pmeDefines["M_PI"] = cu.doubleToString(M_PI);
+//        if (cu.getUseDoublePrecision())
+//            pmeDefines["USE_DOUBLE_PRECISION"] = "1";
+//        CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::pme, pmeDefines);
+//        pmeUpdateBsplinesKernel = cu.getKernel(module, "updateBsplines");
+//        pmeAtomRangeKernel = cu.getKernel(module, "findAtomRangeForGrid");
+//        pmeSpreadChargeKernel = cu.getKernel(module, "gridSpreadCharge");
+//        pmeConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
+//        pmeInterpolateForceKernel = cu.getKernel(module, "gridInterpolateForce");
+//        pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
+//        cuFuncSetCacheConfig(pmeInterpolateForceKernel, CU_FUNC_CACHE_PREFER_L1);
+//
+//        // Create required data structures.
+//
+//        int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
+//        pmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*gridSizeZ, 2*elementSize, "pmeGrid");
+//        cu.addAutoclearBuffer(*pmeGrid);
+//        pmeBsplineModuliX = new CudaArray(cu, gridSizeX, elementSize, "pmeBsplineModuliX");
+//        pmeBsplineModuliY = new CudaArray(cu, gridSizeY, elementSize, "pmeBsplineModuliY");
+//        pmeBsplineModuliZ = new CudaArray(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
+//        pmeBsplineTheta = new CudaArray(cu, PmeOrder*numMultipoles, 4*elementSize, "pmeBsplineTheta");
+//        pmeAtomRange = CudaArray::create<int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
+//        pmeAtomGridIndex = CudaArray::create<int2>(cu, numMultipoles, "pmeAtomGridIndex");
+//        sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms());
+//        cufftResult result = cufftPlan3d(&fft, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_Z2Z : CUFFT_C2C);
+//        if (result != CUFFT_SUCCESS)
+//            throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
+//        hasInitializedFFT = true;
+//
+//        // Initialize the b-spline moduli.
+//
+//        int maxSize = max(max(gridSizeX, gridSizeY), gridSizeZ);
+//        vector<double> data(PmeOrder);
+//        vector<double> ddata(PmeOrder);
+//        vector<double> bsplines_data(maxSize);
+//        data[PmeOrder-1] = 0.0;
+//        data[1] = 0.0;
+//        data[0] = 1.0;
+//        for (int i = 3; i < PmeOrder; i++) {
+//            double div = 1.0/(i-1.0);
+//            data[i-1] = 0.0;
+//            for (int j = 1; j < (i-1); j++)
+//                data[i-j-1] = div*(j*data[i-j-2]+(i-j)*data[i-j-1]);
+//            data[0] = div*data[0];
 //        }
-//    bool areParticlesIdentical(int particle1, int particle2) {
-//        double charge1, charge2, thole1, thole2, damping1, damping2, polarity1, polarity2;
-//        int axis1, axis2, multipole11, multipole12, multipole21, multipole22, multipole31, multipole32;
-//        vector<double> dipole1, dipole2, quadrupole1, quadrupole2;
-//        force.getMultipoleParameters(particle1, charge1, dipole1, quadrupole1, axis1, multipole11, multipole21, multipole31, thole1, damping1, polarity1);
-//        force.getMultipoleParameters(particle2, charge2, dipole2, quadrupole2, axis2, multipole12, multipole22, multipole32, thole2, damping2, polarity2);
-//        if (charge1 != charge2 || thole1 != thole2 || damping1 != damping2 || polarity1 != polarity2 || axis1 != axis2){
-//            return false;
+//
+//        // Differentiate.
+//
+//        ddata[0] = -data[0];
+//        for (int i = 1; i < PmeOrder; i++)
+//            ddata[i] = data[i-1]-data[i];
+//        double div = 1.0/(PmeOrder-1);
+//        data[PmeOrder-1] = 0.0;
+//        for (int i = 1; i < (PmeOrder-1); i++)
+//            data[PmeOrder-i-1] = div*(i*data[PmeOrder-i-2]+(PmeOrder-i)*data[PmeOrder-i-1]);
+//        data[0] = div*data[0];
+//        for (int i = 0; i < maxSize; i++)
+//            bsplines_data[i] = 0.0;
+//        for (int i = 1; i <= PmeOrder; i++)
+//            bsplines_data[i] = data[i-1];
+//
+//        // Evaluate the actual bspline moduli for X/Y/Z.
+//
+//        for(int dim = 0; dim < 3; dim++) {
+//            int ndata = (dim == 0 ? gridSizeX : dim == 1 ? gridSizeY : gridSizeZ);
+//            vector<double> moduli(ndata);
+//            for (int i = 0; i < ndata; i++) {
+//                double sc = 0.0;
+//                double ss = 0.0;
+//                for (int j = 0; j < ndata; j++) {
+//                    double arg = (2.0*M_PI*i*j)/ndata;
+//                    sc += bsplines_data[j]*cos(arg);
+//                    ss += bsplines_data[j]*sin(arg);
 //                }
-//        for (int i = 0; i < (int) dipole1.size(); ++i){
-//            if (dipole1[i] != dipole2[i]){
-//                return false;
+//                moduli[i] = sc*sc+ss*ss;
 //            }
+//            for (int i = 0; i < ndata; i++)
+//                if (moduli[i] < 1.0e-7)
+//                    moduli[i] = (moduli[i-1]+moduli[i+1])*0.5;
+//            if (cu.getUseDoublePrecision()) {
+//                if (dim == 0)
+//                    pmeBsplineModuliX->upload(moduli);
+//                else if (dim == 1)
+//                    pmeBsplineModuliY->upload(moduli);
+//                else
+//                    pmeBsplineModuliZ->upload(moduli);
 //            }
-//        for (int i = 0; i < (int) quadrupole1.size(); ++i){
-//            if (quadrupole1[i] != quadrupole2[i]){
-//                return false;
+//            else {
+//                vector<float> modulif(ndata);
+//                for (int i = 0; i < ndata; i++)
+//                    modulif[i] = (float) moduli[i];
+//                if (dim == 0)
+//                    pmeBsplineModuliX->upload(modulif);
+//                else if (dim == 1)
+//                    pmeBsplineModuliY->upload(modulif);
+//                else
+//                    pmeBsplineModuliZ->upload(modulif);
 //            }
 //        }
-//        return true;
 //    }
-//private:
-//    const AmoebaMultipoleForce& force;
-//};
-//
-//CudaCalcAmoebaMultipoleForceKernel::CudaCalcAmoebaMultipoleForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : 
-//         CalcAmoebaMultipoleForceKernel(name, platform), cu(cu), system(system) {
-//    data.incrementKernelCount();
-//}
-//
-//CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() {
-//    data.decrementKernelCount();
-//}
-//
-//void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const AmoebaMultipoleForce& force) {
-//
+
+    // Add an interaction to the default nonbonded kernel.  This doesn't actually do any calculations.  It's
+    // just so that CudaNonbondedUtilities will build the exclusion flags and maintain the neighbor list.
+    
+    cu.getNonbondedUtilities().addInteraction(usePME, usePME, true, force.getCutoffDistance(), exclusions, "", force.getForceGroup());
+    cu.addForce(new ForceInfo(force));
+
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    
 //    numMultipoles   = force.getNumMultipoles();
 //
 //    data.setHasAmoebaMultipole( true );
@@ -1036,25 +1319,116 @@ double CudaCalcAmoebaTorsionTorsionForceKernel::execute(ContextImpl& context, bo
 //        gpu->sim.nonbondedMethod              = PARTICLE_MESH_EWALD;
 //    }
 //    data.getAmoebaGpu()->gpuContext->forces.push_back(new ForceInfo(force));
-//}
-//
-//double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
-//    computeAmoebaMultipoleForce( data );
-//    return 0.0;
-//}
-//
-//void CudaCalcAmoebaMultipoleForceKernel::getElectrostaticPotential(ContextImpl& context,  const std::vector< Vec3 >& inputGrid,
-//                                                                   std::vector< double >& outputElectrostaticPotential) {
+}
+
+/**
+ * Build and upload the covalentFlags and polarizationGroupFlags arrays.
+ *
+ * Both arrays have the same number of elements as the exclusions array of
+ * CudaNonbondedUtilities and are addressed the same way: findExclusionIndex() gives the
+ * start of a tile's data, the offset of one atom within its tile selects the element,
+ * and the offset of the other atom selects the bit inside that element.
+ *
+ * For covalentFlags each atom pair gets two bits (.x, .y), derived from the value recorded
+ * in covalentFlagValues: 0 sets both bits, 1 sets only .x, 2 sets only .y.
+ */
+void CudaCalcAmoebaMultipoleForceKernel::initializeScaleFactors() {
+    hasInitializedScaleFactors = true;
+    CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
+
+    // Figure out the covalent flag values to use for each atom pair.
+
+    vector<unsigned int> exclusionIndices;
+    vector<unsigned int> exclusionRowIndices;
+    nb.getExclusionIndices().download(exclusionIndices);
+    nb.getExclusionRowIndices().download(exclusionRowIndices);
+    covalentFlags = CudaArray::create<uint2>(cu, nb.getExclusions().getSize(), "covalentFlags");
+    vector<uint2> covalentFlagsVec(nb.getExclusions().getSize(), make_uint2(0, 0));
+    for (int i = 0; i < (int) covalentFlagValues.size(); i++) {
+        int atom1 = covalentFlagValues[i].x;
+        int atom2 = covalentFlagValues[i].y;
+        int value = covalentFlagValues[i].z;
+        int x = atom1/CudaContext::TileSize;
+        int offset1 = atom1-x*CudaContext::TileSize;
+        int y = atom2/CudaContext::TileSize;
+        int offset2 = atom2-y*CudaContext::TileSize;
+        // The flags are unsigned because the shift amount can be as large as TileSize-1,
+        // and shifting a signed 1 into the sign bit is not well defined.
+        unsigned int f1 = (value == 0 || value == 1 ? 1u : 0u);
+        unsigned int f2 = (value == 0 || value == 2 ? 1u : 0u);
+        // findExclusionIndex() requires its first tile index to be >= the second,
+        // so the roles of the two atoms are swapped when x <= y.
+        if (x > y) {
+            int index = CudaNonbondedUtilities::findExclusionIndex(x, y, exclusionIndices, exclusionRowIndices);
+            covalentFlagsVec[index+offset1].x |= f1<<offset2;
+            covalentFlagsVec[index+offset1].y |= f2<<offset2;
+        }
+        else {
+            int index = CudaNonbondedUtilities::findExclusionIndex(y, x, exclusionIndices, exclusionRowIndices);
+            covalentFlagsVec[index+offset2].x |= f1<<offset1;
+            covalentFlagsVec[index+offset2].y |= f2<<offset1;
+        }
+    }
+    covalentFlags->upload(covalentFlagsVec);
+
+    // Do the same for the polarization flags.
+
+    polarizationGroupFlags = CudaArray::create<unsigned int>(cu, nb.getExclusions().getSize(), "polarizationGroupFlags");
+    vector<unsigned int> polarizationGroupFlagsVec(nb.getExclusions().getSize(), 0);
+    for (int i = 0; i < (int) polarizationFlagValues.size(); i++) {
+        int atom1 = polarizationFlagValues[i].x;
+        int atom2 = polarizationFlagValues[i].y;
+        int x = atom1/CudaContext::TileSize;
+        int offset1 = atom1-x*CudaContext::TileSize;
+        int y = atom2/CudaContext::TileSize;
+        int offset2 = atom2-y*CudaContext::TileSize;
+        if (x > y) {
+            int index = CudaNonbondedUtilities::findExclusionIndex(x, y, exclusionIndices, exclusionRowIndices);
+            polarizationGroupFlagsVec[index+offset1] |= 1u<<offset2;
+        }
+        else {
+            int index = CudaNonbondedUtilities::findExclusionIndex(y, x, exclusionIndices, exclusionRowIndices);
+            polarizationGroupFlagsVec[index+offset2] |= 1u<<offset1;
+        }
+    }
+    polarizationGroupFlags->upload(polarizationGroupFlagsVec);
+}
+
+/**
+ * Run the multipole kernels implemented so far: compute the lab frame moments and, when no
+ * PME grid has been created, the fixed field.  Intermediate results are printed to stdout
+ * for debugging.
+ *
+ * @param context        not used by the current implementation
+ * @param includeForces  not used by the current implementation
+ * @param includeEnergy  not used by the current implementation
+ * @return always 0.0
+ */
+double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    // The flag arrays are built on the first call rather than in initialize().
+    if (!hasInitializedScaleFactors)
+        initializeScaleFactors();
+    CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
+
+    // Compute the lab frame moments.
+
+    void* computeMomentsArgs[] = {&cu.getPosq().getDevicePointer(), &multipoleParticles->getDevicePointer(),
+        &molecularDipoles->getDevicePointer(), &molecularQuadrupoles->getDevicePointer(),
+        &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer()};
+    cu.executeKernel(computeMomentsKernel, computeMomentsArgs, cu.getPaddedNumAtoms());
+
+    // Debug output.  The lab frame arrays hold float or double elements depending on the
+    // precision mode, so they are downloaded into a host vector of the matching type.
+    vector<double> d, q;
+    if (cu.getUseDoublePrecision()) {
+        labFrameDipoles->download(d);
+        labFrameQuadrupoles->download(q);
+    }
+    else {
+        vector<float> df, qf;
+        labFrameDipoles->download(df);
+        labFrameQuadrupoles->download(qf);
+        d.assign(df.begin(), df.end());
+        q.assign(qf.begin(), qf.end());
+    }
+    for (int i = 0; i < cu.getNumAtoms(); i++)
+        printf("%d %g %g %g\n", i, d[3*i], d[3*i+1], d[3*i+2]);
+    for (int i = 0; i < cu.getNumAtoms(); i++)
+        printf("%d %g %g %g %g %g %g %g %g %g\n", i, q[9*i], q[9*i+1], q[9*i+2], q[9*i+3], q[9*i+4], q[9*i+5], q[9*i+6], q[9*i+7], q[9*i+8]);
+    int startTileIndex = nb.getStartTileIndex();
+    int numTileIndices = nb.getNumTiles();
+    int numForceThreadBlocks = nb.getNumForceThreadBlocks();
+    int forceThreadBlockSize = nb.getForceThreadBlockSize();
+    if (pmeGrid == NULL) {
+        void* computeFixedFieldArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(), &cu.getPosq().getDevicePointer(),
+            &nb.getExclusions().getDevicePointer(), &nb.getExclusionIndices().getDevicePointer(), &nb.getExclusionRowIndices().getDevicePointer(),
+            &covalentFlags->getDevicePointer(), &polarizationGroupFlags->getDevicePointer(), &startTileIndex, &numTileIndices,
+            &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &dampingAndThole->getDevicePointer()};
+        cu.executeKernel(computeFixedFieldKernel, computeFixedFieldArgs, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
+
+        // The field buffer is allocated with long long elements, so read it back as signed
+        // values; reading it as unsigned would print negative components as huge positives.
+        vector<long long> f;
+        field->download(f);
+        int pad = cu.getPaddedNumAtoms();
+        for (int i = 0; i < cu.getNumAtoms(); i++) {
+            printf("%d %g %g %g\n", i, f[i]/(double)0xFFFFFFFF, f[i+pad]/(double)0xFFFFFFFF, f[i+pad*2]/(double)0xFFFFFFFF);
+        }
+    }
+    return 0.0;
+}
+
+void CudaCalcAmoebaMultipoleForceKernel::getElectrostaticPotential(ContextImpl& context,  const std::vector< Vec3 >& inputGrid,
+                                                                   std::vector< double >& outputElectrostaticPotential) {
 //    computeAmoebaMultipolePotential( data, inputGrid, outputElectrostaticPotential );
-//    return;
-//}
-//
-//void CudaCalcAmoebaMultipoleForceKernel::getSystemMultipoleMoments(ContextImpl& context,  const Vec3& origin,
-//                                                                   std::vector< double >& outputMultipoleMonents) {
+    return;
+}
+
+void CudaCalcAmoebaMultipoleForceKernel::getSystemMultipoleMoments(ContextImpl& context,  const Vec3& origin,
+                                                                   std::vector< double >& outputMultipoleMonents) {
 //    computeAmoebaSystemMultipoleMoments( data, origin, outputMultipoleMonents);
-//    return;
-//}
-//
+    return;
+}
+
 ///* -------------------------------------------------------------------------- *
 // *                       AmoebaGeneralizedKirkwood                            *
 // * -------------------------------------------------------------------------- */
@@ -1264,9 +1638,8 @@ double CudaCalcAmoebaVdwForceKernel::execute(ContextImpl& context, bool includeF
        hasInitializedNonbonded = true;
        nonbonded->initialize(system);
    }
-    const char* errorMessage = "Error copying array";
-    CHECK_RESULT(cuMemcpyDtoDAsync(tempPosq->getDevicePointer(), cu.getPosq().getDevicePointer(), tempPosq->getSize()*tempPosq->getElementSize(), 0));
-    CHECK_RESULT(cuMemcpyDtoDAsync(tempForces->getDevicePointer(), cu.getForce().getDevicePointer(), tempForces->getSize()*tempForces->getElementSize(), 0));
+    cu.getPosq().copyTo(*tempPosq);
+    cu.getForce().copyTo(*tempForces);
    void* prepareArgs[] = {&cu.getForce().getDevicePointer(), &cu.getPosq().getDevicePointer(), &tempPosq->getDevicePointer(),
        &bondReductionAtoms->getDevicePointer(), &bondReductionFactors->getDevicePointer()};
    cu.executeKernel(prepareKernel, prepareArgs, cu.getPaddedNumAtoms());
@@ -1274,8 +1647,8 @@ double CudaCalcAmoebaVdwForceKernel::execute(ContextImpl& context, bool includeF
    nonbonded->computeInteractions();
    void* spreadArgs[] = {&cu.getForce().getDevicePointer(), &tempForces->getDevicePointer(), &bondReductionAtoms->getDevicePointer(), &bondReductionFactors->getDevicePointer()};
    cu.executeKernel(spreadKernel, spreadArgs, cu.getPaddedNumAtoms());
-    CHECK_RESULT(cuMemcpyDtoDAsync(cu.getPosq().getDevicePointer(), tempPosq->getDevicePointer(), tempPosq->getSize()*tempPosq->getElementSize(), 0));
-    CHECK_RESULT(cuMemcpyDtoDAsync(cu.getForce().getDevicePointer(), tempForces->getDevicePointer(), tempForces->getSize()*tempForces->getElementSize(), 0));
+    tempPosq->copyTo(cu.getPosq());
+    tempForces->copyTo(cu.getForce());
    return 0.0;
 }


--- a/plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernels.h
+++ b/plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernels.h
@@ -30,8 +30,10 @@
 #include "openmm/amoebaKernels.h"
 #include "openmm/kernels.h"
 #include "openmm/System.h"
-#include "CudaContext.h"
 #include "CudaArray.h"
+#include "CudaContext.h"
+#include "CudaSort.h"
+#include <cufft.h>

 namespace OpenMM {

@@ -371,9 +373,46 @@ public:

 private:
    class ForceInfo;
+    void initializeScaleFactors();
    int numMultipoles;
+    bool hasInitializedScaleFactors;
    CudaContext& cu;
    System& system;
+    std::vector<int3> covalentFlagValues;
+    std::vector<int2> polarizationFlagValues;
+    CudaArray* multipoleParticles;
+    CudaArray* torqueBufferIndices;
+    CudaArray* molecularDipoles;
+    CudaArray* molecularQuadrupoles;
+    CudaArray* labFrameDipoles;
+    CudaArray* labFrameQuadrupoles;
+    CudaArray* field;
+    CudaArray* fieldPolar;
+    CudaArray* dampingAndThole;
+    CudaArray* inducedDipole;
+    CudaArray* inducedDipolePolar;
+    CudaArray* currentEpsilon;
+    CudaArray* polarizability;
+    CudaArray* covalentFlags;
+    CudaArray* polarizationGroupFlags;
+    CudaArray* pmeGrid;
+    CudaArray* pmeBsplineModuliX;
+    CudaArray* pmeBsplineModuliY;
+    CudaArray* pmeBsplineModuliZ;
+    CudaArray* pmeTheta1;
+    CudaArray* pmeTheta2;
+    CudaArray* pmeIgrid;
+    CudaArray* pmePhi;
+    CudaArray* pmePhid;
+    CudaArray* pmePhip;
+    CudaArray* pmePhidp;
+    CudaArray* pmeBsplineTheta;
+    CudaArray* pmeBsplineDTheta;
+    CudaArray* pmeAtomRange;
+    CudaArray* pmeAtomGridIndex;
+    CudaSort* sort;
+    cufftHandle fft;
+    CUfunction computeMomentsKernel, computeFixedFieldKernel;
 };

 /**

--- a/plugins/amoeba/platforms/cuda2/src/kernels/multipoleFixedField.cu
+++ b/plugins/amoeba/platforms/cuda2/src/kernels/multipoleFixedField.cu
+#define TILE_SIZE 32
+#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
+
+/**
+ * Per-atom working set for the fixed-field kernel in this file.
+ *
+ * posq.x/y/z are used as the atom's coordinates and posq.w enters the field
+ * expressions as its charge term. field and fieldPolar are the two field
+ * accumulators; the remaining members are the multipole and damping parameters
+ * filled in by loadAtomData().
+ */
+struct AtomData {
+    real4 posq;
+    real3 field;
+    real3 fieldPolar;
+    real3 dipole;
+    // Six independent components of the symmetric quadrupole tensor.
+    real quadrupoleXX;
+    real quadrupoleXY;
+    real quadrupoleXZ;
+    real quadrupoleYY;
+    real quadrupoleYZ;
+    real quadrupoleZZ;
+    // Damping parameters: loaded from dampingAndThole as (x = damp, y = thole).
+    float thole;
+    float damp;
+};
+
+/**
+ * Populate the position/charge, dipole, quadrupole and damping members of
+ * `data` for atom index `atom`. The field accumulators are not touched.
+ *
+ * labFrameDipole is read with a stride of 3 values per atom and
+ * labFrameQuadrupole with a stride of 9 values per atom (row-major 3x3); only
+ * the upper triangle (elements 0, 1, 2, 4, 5, 8) of the quadrupole is read.
+ */
+inline __device__ void loadAtomData(AtomData& data, int atom, const real4* __restrict__ posq, const real* __restrict__ labFrameDipole, const real* __restrict__ labFrameQuadrupole, const float2* __restrict__ dampingAndThole) {
+    const real* __restrict__ dipoleIn = labFrameDipole + atom*3;
+    const real* __restrict__ quadrupoleIn = labFrameQuadrupole + atom*9;
+
+    data.posq = posq[atom];
+    data.dipole = make_real3(dipoleIn[0], dipoleIn[1], dipoleIn[2]);
+
+    // Upper triangle of the 3x3 tensor: XX XY XZ / -- YY YZ / -- -- ZZ.
+    data.quadrupoleXX = quadrupoleIn[0];
+    data.quadrupoleXY = quadrupoleIn[1];
+    data.quadrupoleXZ = quadrupoleIn[2];
+    data.quadrupoleYY = quadrupoleIn[4];
+    data.quadrupoleYZ = quadrupoleIn[5];
+    data.quadrupoleZZ = quadrupoleIn[8];
+
+    // dampingAndThole packs (damp, thole) into (x, y).
+    const float2 dampThole = dampingAndThole[atom];
+    data.damp = dampThole.x;
+    data.thole = dampThole.y;
+}
+
+/**
+ * Evaluate the pairwise field contributions between two atoms.
+ *
+ * @param atom1   first atom of the pair
+ * @param atom2   second atom of the pair
+ * @param deltaR  separation vector; the kernel in this file passes position(atom2) - position(atom1)
+ * @param field1  output: contribution built from atom2's charge (posq.w), dipole and quadrupole
+ * @param field2  output: contribution built from atom1's charge (posq.w), dipole and quadrupole
+ *
+ * The inverse-distance powers are damped when the product of the two damp
+ * parameters is non-zero and the separation is below SCALING_DISTANCE_CUTOFF.
+ * No scale factors are applied here; callers weight the returned fields.
+ */
+__device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real3 deltaR, real3& field1, real3& field2) {
+    const real invR = RSQRT(dot(deltaR, deltaR));
+    const real dist = RECIP(invR);
+    const real invR2 = invR*invR;
+
+    real rr3 = invR*invR2;
+    real rr5 = 3*rr3*invR2;
+    real rr7 = 5*rr5*invR2;
+
+    // Damping. dampExp stays 0 (no damping) unless both conditions below hold;
+    // in that case damp is overwritten with the exponent argument.
+    float damp = atom1.damp*atom2.damp;
+    real dampExp = 0;
+    if (damp != 0 && dist < SCALING_DISTANCE_CUTOFF) {
+        const real ratio = dist/damp;
+        // Use the smaller of the two thole values.
+        const float pGamma = (atom1.thole < atom2.thole ? atom1.thole : atom2.thole);
+        damp = ratio*ratio*ratio*pGamma;
+        dampExp = EXP(-damp);
+    }
+
+    rr3 *= 1 - dampExp;
+    rr5 *= 1 - (1+damp)*dampExp;
+    rr7 *= 1 - (1+damp+(0.6f*damp*damp))*dampExp;
+
+    const real twoRr5 = 2*rr5;
+
+    // Contribution built from atom2's moments.
+    real3 quadDotDelta = make_real3(
+            deltaR.x*atom2.quadrupoleXX + deltaR.y*atom2.quadrupoleXY + deltaR.z*atom2.quadrupoleXZ,
+            deltaR.x*atom2.quadrupoleXY + deltaR.y*atom2.quadrupoleYY + deltaR.z*atom2.quadrupoleYZ,
+            deltaR.x*atom2.quadrupoleXZ + deltaR.y*atom2.quadrupoleYZ + deltaR.z*atom2.quadrupoleZZ);
+    real dipoleDotDelta = dot(deltaR, atom2.dipole);
+    real quadDotDeltaDotDelta = dot(deltaR, quadDotDelta);
+    real radialFactor = -rr3*atom2.posq.w + rr5*dipoleDotDelta - rr7*quadDotDeltaDotDelta;
+    field1 = deltaR*radialFactor - rr3*atom2.dipole + twoRr5*quadDotDelta;
+
+    // Contribution built from atom1's moments (note the sign changes relative to the block above).
+    quadDotDelta = make_real3(
+            deltaR.x*atom1.quadrupoleXX + deltaR.y*atom1.quadrupoleXY + deltaR.z*atom1.quadrupoleXZ,
+            deltaR.x*atom1.quadrupoleXY + deltaR.y*atom1.quadrupoleYY + deltaR.z*atom1.quadrupoleYZ,
+            deltaR.x*atom1.quadrupoleXZ + deltaR.y*atom1.quadrupoleYZ + deltaR.z*atom1.quadrupoleZZ);
+    dipoleDotDelta = dot(deltaR, atom1.dipole);
+    quadDotDeltaDotDelta = dot(deltaR, quadDotDelta);
+    radialFactor = rr3*atom1.posq.w + rr5*dipoleDotDelta + rr7*quadDotDeltaDotDelta;
+    field2 = deltaR*radialFactor - rr3*atom1.dipole - twoRr5*quadDotDelta;
+}
+
+/**
+ * Scale factor derived from the lowest bit of a polarization-group flag word:
+ * 0 when the bit is set, 1 when it is clear.
+ */
+__device__ real computeDScaleFactor(unsigned int polarizationGroup) {
+    if ((polarizationGroup & 1) != 0)
+        return 0;
+    return 1;
+}
+
+/**
+ * Scale factor selected by the lowest bits of the two covalent flag words.
+ *
+ *   x bit  y bit   result
+ *     1      1      0.0   (original note: "12 or 13")
+ *     1      0      0.4   (original note: "14")
+ *     0      1      0.8   (original note: "15")
+ *     0      0      1.0
+ */
+__device__ float computeDScaleFactor(uint2 covalent) {
+    // Pack the two low bits into a 2-bit selector: bit 0 = x, bit 1 = y.
+    const unsigned int selector = (covalent.x & 1) | ((covalent.y & 1) << 1);
+    switch (selector) {
+        case 3:
+            return 0.0f;
+        case 1:
+            return 0.4f;
+        case 2:
+            return 0.8f;
+        default:
+            return 1.0f;
+    }
+}
+
+/**
+ * Scale factor selected by the lowest bits of the covalent flag words and of
+ * the polarization-group flag word:
+ *   - x and y both set                 -> 0.0
+ *   - x set, y clear, group bit set    -> 0.5
+ *   - anything else                    -> 1.0
+ */
+__device__ float computePScaleFactor(uint2 covalent, unsigned int polarizationGroup) {
+    const bool xSet = ((covalent.x & 1) != 0);
+    const bool ySet = ((covalent.y & 1) != 0);
+    const bool groupSet = ((polarizationGroup & 1) != 0);
+    if (xSet) {
+        if (ySet)
+            return 0.0f;
+        if (groupSet)
+            return 0.5f;
+    }
+    return 1.0f;
+}
+
+/**
+ * Accumulate two per-atom field vectors (field and fieldPolar) from pairwise
+ * interactions evaluated by computeOneInteraction(), processing the atoms in
+ * TILE_SIZE x TILE_SIZE tiles.
+ *
+ * Work distribution: threads are grouped into runs of TILE_SIZE consecutive
+ * threads; each group walks its own contiguous range of tile indices [pos, end).
+ * Within a tile, thread tgx owns atom x*TILE_SIZE+tgx and loops over the atoms
+ * of block y, which are staged in the shared array localData.
+ *
+ * Weighting: each pair contribution is multiplied by computeDScaleFactor() before
+ * being added to field and by computePScaleFactor() before being added to
+ * fieldPolar; pairs whose exclusion bit is clear are skipped.
+ *
+ * Output: results are added with atomicAdd into fieldBuffers / fieldPolarBuffers
+ * as 64-bit fixed point (value * 0xFFFFFFFF cast to long long). The x, y and z
+ * components of atom i live at [i], [i+PADDED_NUM_ATOMS] and [i+2*PADDED_NUM_ATOMS].
+ * The kernel only adds to the buffers; it never zeroes them.
+ *
+ * Shared memory: localData[THREAD_BLOCK_SIZE], exclusionRange, exclusionIndex and,
+ * without ENABLE_SHUFFLE, tempBuffer[3*THREAD_BLOCK_SIZE].
+ *
+ * NOTE(review): there is no explicit barrier between shared-memory writes and
+ * reads by other threads of the same TILE_SIZE group; presumably this relies on
+ * the group executing in lockstep - confirm for the target architectures.
+ *
+ * NOTE(review): the USE_CUTOFF partial-tile branch refers to names that are not
+ * declared anywhere in this kernel (force, LOAD_ATOM2_PARAMETERS,
+ * COMPUTE_INTERACTION, localData[].fx/.fy/.fz). It looks like it was carried over
+ * from a different kernel and still needs porting before USE_CUTOFF can be enabled.
+ */
+extern "C" __global__ void computeFixedField(
+        unsigned long long* __restrict__ fieldBuffers, unsigned long long* __restrict__ fieldPolarBuffers, const real4* __restrict__ posq,
+        const unsigned int* __restrict__ exclusions, const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
+        const uint2* __restrict__ covalentFlags, const unsigned int* __restrict__ polarizationGroupFlags, unsigned int startTileIndex, unsigned int numTileIndices,
+#ifdef USE_CUTOFF
+        const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags,
+#endif
+        const real* __restrict__ labFrameDipole, const real* __restrict__ labFrameQuadrupole, const float2* __restrict__ dampingAndThole) {
+    unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE; // number of TILE_SIZE-thread groups in the whole launch
+    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; // global index of this thread's group
+#ifdef USE_CUTOFF
+    const unsigned int numTiles = interactionCount[0];
+    unsigned int pos = (numTiles > maxTiles ? startTileIndex+warp*numTileIndices/totalWarps : warp*numTiles/totalWarps); // when the tile list overflowed maxTiles, fall back to the [startTileIndex, +numTileIndices) range
+    unsigned int end = (numTiles > maxTiles ? startTileIndex+(warp+1)*numTileIndices/totalWarps : (warp+1)*numTiles/totalWarps);
+#else
+    const unsigned int numTiles = numTileIndices;
+    unsigned int pos = startTileIndex+warp*numTiles/totalWarps; // first tile index handled by this group
+    unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps; // one past the last tile index handled by this group
+#endif
+    __shared__ AtomData localData[THREAD_BLOCK_SIZE]; // one slot per thread: staged copy of the block-y atom for the current tile
+    __shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP]; // per group: [begin, end) into exclusionIndices for row x
+    __shared__ int exclusionIndex[WARPS_PER_GROUP]; // per group: offset of this tile's exclusion data, or -1 if none was found
+#ifndef ENABLE_SHUFFLE
+    __shared__ real tempBuffer[3*THREAD_BLOCK_SIZE]; // scratch space for the shared-memory reduction used when shuffles are unavailable
+#endif
+    
+    do {
+        // Work out which tile (x, y) this iteration handles and this thread's place in its group.
+        const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // this thread's index within its group
+        const unsigned int tbx = threadIdx.x - tgx; // index within the block of the group's first thread
+        const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE; // index of the group within the block
+        unsigned int x, y;
+        AtomData data;
+        data.field = make_real3(0);
+        data.fieldPolar = make_real3(0);
+        if (pos < end) {
+#ifdef USE_CUTOFF
+            if (numTiles <= maxTiles) {
+                ushort2 tileIndices = tiles[pos];
+                x = tileIndices.x;
+                y = tileIndices.y;
+            }
+            else
+#endif
+            {
+                y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos)); // invert the linear tile index into (x, y) block coordinates with y <= x < NUM_BLOCKS
+                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+                if (x < y || x >= NUM_BLOCKS) { // Roundoff in the square root can leave y off by one; step it and recompute x.
+                    y += (x < y ? -1 : 1);
+                    x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+                }
+            }
+            unsigned int atom1 = x*TILE_SIZE + tgx;
+            loadAtomData(data, atom1, posq, labFrameDipole, labFrameQuadrupole, dampingAndThole);
+            
+            // Locate the exclusion data for this tile: scan row x of exclusionIndices for an entry equal to y.
+
+            if (tgx < 2)
+                exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
+            if (tgx == 0)
+                exclusionIndex[localGroupIndex] = -1;
+            for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
+                if (exclusionIndices[i] == y)
+                    exclusionIndex[localGroupIndex] = i*TILE_SIZE;
+            bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
+            if (pos >= end)
+                ; // This warp is done. (Cannot be taken here: the enclosing block already requires pos < end.)
+            else if (x == y) {
+                // This tile is on the diagonal: both atoms of every pair come from block x, so only data.field/fieldPolar are accumulated.
+
+                const unsigned int localAtomIndex = threadIdx.x;
+                localData[localAtomIndex].posq = data.posq;
+                localData[localAtomIndex].dipole = data.dipole;
+                localData[localAtomIndex].quadrupoleXX = data.quadrupoleXX;
+                localData[localAtomIndex].quadrupoleXY = data.quadrupoleXY;
+                localData[localAtomIndex].quadrupoleXZ = data.quadrupoleXZ;
+                localData[localAtomIndex].quadrupoleYY = data.quadrupoleYY;
+                localData[localAtomIndex].quadrupoleYZ = data.quadrupoleYZ;
+                localData[localAtomIndex].quadrupoleZZ = data.quadrupoleZZ;
+                localData[localAtomIndex].thole = data.thole; // IS THIS CORRECT? (open question left by the original author)
+                localData[localAtomIndex].damp = data.damp; // IS THIS CORRECT? (open question left by the original author)
+                unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx]; // read unconditionally: assumes a diagonal tile always has exclusion data - TODO confirm
+                uint2 covalent = covalentFlags[exclusionIndex[localGroupIndex]+tgx];
+                unsigned int polarizationGroup = polarizationGroupFlags[exclusionIndex[localGroupIndex]+tgx];
+                for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                    bool isExcluded = !(excl & 0x1); // a clear low bit means the pair is excluded
+                    int atom2 = tbx+j;
+                    real3 delta = make_real3(localData[atom2].posq.x-data.posq.x, localData[atom2].posq.y-data.posq.y, localData[atom2].posq.z-data.posq.z);
+#ifdef USE_PERIODIC
+                    delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                    delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                    delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+                    real3 field1;
+                    real3 field2;
+                    computeOneInteraction(data, localData[atom2], delta, field1, field2); // evaluated even for excluded pairs; the result is only used below when not excluded
+                    if (!isExcluded) {
+                        float d = computeDScaleFactor(covalent);
+                        data.field += d*field1;
+                        float p = computePScaleFactor(covalent, polarizationGroup);
+                        data.fieldPolar += p*field1;
+                    }
+                    excl >>= 1; // advance all flag words to the bit for the next partner atom
+                    covalent.x >>= 1;
+                    covalent.y >>= 1;
+                    polarizationGroup >>= 1;
+                }
+            }
+            else {
+                // This is an off-diagonal tile: block-y atoms are staged in localData and receive their own field contributions.
+
+                const unsigned int localAtomIndex = threadIdx.x;
+                unsigned int j = y*TILE_SIZE + tgx;
+                loadAtomData(localData[localAtomIndex], j, posq, labFrameDipole, labFrameQuadrupole, dampingAndThole);
+                localData[localAtomIndex].field = make_real3(0);
+                localData[localAtomIndex].fieldPolar = make_real3(0);
+#ifdef USE_CUTOFF
+                unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
+                if (!hasExclusions && flags != 0xFFFFFFFF) {
+                    if (flags == 0) {
+                        // No interactions in this tile.
+                    }
+                    else {
+                        // Compute only a subset of the interactions in this tile. NOTE(review): force, LOAD_ATOM2_PARAMETERS, COMPUTE_INTERACTION and localData[].fx/.fy/.fz used below are not declared in this kernel; this branch appears unported.
+
+                        for (j = 0; j < TILE_SIZE; j++) {
+                            if ((flags&(1<<j)) != 0) {
+                                bool isExcluded = false;
+                                int atom2 = tbx+j;
+                                int bufferIndex = 3*threadIdx.x;
+                                real3 dEdR1 = make_real3(0);
+                                real3 dEdR2 = make_real3(0);
+                                real3 delta = make_real3(localData[atom2].posq.x-data.posq.x, localData[atom2].posq.y-data.posq.y, localData[atom2].posq.z-data.posq.z);
+#ifdef USE_PERIODIC
+                                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+                                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+                                if (r2 < CUTOFF_SQUARED) {
+#endif
+                                    real invR = RSQRT(r2);
+                                    real r = RECIP(invR);
+                                    LOAD_ATOM2_PARAMETERS
+                                    atom2 = y*TILE_SIZE+j;
+                                    COMPUTE_INTERACTION
+#ifdef USE_CUTOFF
+                                }
+#endif
+#ifdef ENABLE_SHUFFLE
+                                force.x -= dEdR1.x;
+                                force.y -= dEdR1.y;
+                                force.z -= dEdR1.z;
+                                for (int i = 16; i >= 1; i /= 2) {
+                                    dEdR2.x += __shfl_xor(dEdR2.x, i, 32);
+                                    dEdR2.y += __shfl_xor(dEdR2.y, i, 32);
+                                    dEdR2.z += __shfl_xor(dEdR2.z, i, 32);
+                                }
+                                if (tgx == 0) {
+                                    localData[tbx+j].fx += dEdR2.x;
+                                    localData[tbx+j].fy += dEdR2.y;
+                                    localData[tbx+j].fz += dEdR2.z;
+                                }
+#else
+                                force.x -= dEdR1.x;
+                                force.y -= dEdR1.y;
+                                force.z -= dEdR1.z;
+                                tempBuffer[bufferIndex] = dEdR2.x;
+                                tempBuffer[bufferIndex+1] = dEdR2.y;
+                                tempBuffer[bufferIndex+2] = dEdR2.z;
+
+                                // Sum the forces on atom2: every fourth thread adds up four neighbouring entries, then thread 0 adds the eight partial sums.
+
+                                if (tgx % 4 == 0) {
+                                    tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9];
+                                    tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10];
+                                    tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11];
+                                }
+                                if (tgx == 0) {
+                                    localData[tbx+j].fx += tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
+                                    localData[tbx+j].fy += tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
+                                    localData[tbx+j].fz += tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
+                                }
+#endif
+                            }
+                        }
+                    }
+                }
+                else
+#endif
+                {
+                    // Compute the full set of interactions in this tile. Without exclusion data every pair is allowed (all exclusion bits set, all scaling flags clear).
+
+                    unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
+                    uint2 covalent = (hasExclusions ? covalentFlags[exclusionIndex[localGroupIndex]+tgx] : make_uint2(0, 0));
+                    unsigned int polarizationGroup = (hasExclusions ? polarizationGroupFlags[exclusionIndex[localGroupIndex]+tgx] : 0);
+                    excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx)); // rotate each flag word right by tgx so bit 0 matches the first partner (tj = tgx) visited below
+                    covalent.x = (covalent.x >> tgx) | (covalent.x << (TILE_SIZE - tgx));
+                    covalent.y = (covalent.y >> tgx) | (covalent.y << (TILE_SIZE - tgx));
+                    polarizationGroup = (polarizationGroup >> tgx) | (polarizationGroup << (TILE_SIZE - tgx));
+                    unsigned int tj = tgx; // each thread starts at a different partner, so threads of a group update different localData slots in the same iteration
+                    for (j = 0; j < TILE_SIZE; j++) {
+                        bool isExcluded = !(excl & 0x1);
+                        int atom2 = tbx+tj;
+                        real3 delta = make_real3(localData[atom2].posq.x-data.posq.x, localData[atom2].posq.y-data.posq.y, localData[atom2].posq.z-data.posq.z);
+#ifdef USE_PERIODIC
+                        delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                        delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                        delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+                        real3 field1;
+                        real3 field2;
+                        computeOneInteraction(data, localData[atom2], delta, field1, field2);
+                        if (!isExcluded) {
+                            float d = computeDScaleFactor(covalent);
+                            data.field += d*field1;
+                            localData[atom2].field += d*field2;
+                            float p = computePScaleFactor(covalent, polarizationGroup);
+                            data.fieldPolar += p*field1;
+                            localData[atom2].fieldPolar += p*field2;
+                        }
+                        excl >>= 1;
+                        covalent.x >>= 1;
+                        covalent.y >>= 1;
+                        polarizationGroup >>= 1;
+                        tj = (tj + 1) & (TILE_SIZE - 1); // wrap around within the tile
+                    }
+                }
+            }
+        }
+        
+        // Write results: add this tile's contributions to the global buffers as 64-bit fixed point values.
+        
+        if (pos < end) {
+            const unsigned int offset = x*TILE_SIZE + tgx; // the block-x atom owned by this thread
+            atomicAdd(&fieldBuffers[offset], static_cast<unsigned long long>((long long) (data.field.x*0xFFFFFFFF)));
+            atomicAdd(&fieldBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.y*0xFFFFFFFF)));
+            atomicAdd(&fieldBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.z*0xFFFFFFFF)));
+            atomicAdd(&fieldPolarBuffers[offset], static_cast<unsigned long long>((long long) (data.fieldPolar.x*0xFFFFFFFF)));
+            atomicAdd(&fieldPolarBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolar.y*0xFFFFFFFF)));
+            atomicAdd(&fieldPolarBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolar.z*0xFFFFFFFF)));
+        }
+        if (pos < end && x != y) {
+            const unsigned int offset = y*TILE_SIZE + tgx; // the block-y atom staged in this thread's localData slot
+            atomicAdd(&fieldBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.x*0xFFFFFFFF)));
+            atomicAdd(&fieldBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.y*0xFFFFFFFF)));
+            atomicAdd(&fieldBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.z*0xFFFFFFFF)));
+            atomicAdd(&fieldPolarBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.x*0xFFFFFFFF)));
+            atomicAdd(&fieldPolarBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.y*0xFFFFFFFF)));
+            atomicAdd(&fieldPolarBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.z*0xFFFFFFFF)));
+        }
+        pos++;
+    } while (pos < end);
+}
--- a/plugins/amoeba/platforms/cuda2/src/kernels/multipoles.cu
+++ b/plugins/amoeba/platforms/cuda2/src/kernels/multipoles.cu
+/**
+ * Rotate every atom's molecular-frame dipole and quadrupole into the lab frame.
+ *
+ * Launch layout: a grid-stride loop over NUM_ATOMS with one atom per thread per
+ * iteration, so any 1D grid/block size is valid. No shared memory is used.
+ *
+ * multipoleParticles holds, per atom, (x, y, z, w) = (index of the atom defining
+ * the x axis, index of the atom defining the y axis, index of the atom defining
+ * the z axis, axis type). Axis types handled here: 0 z-then-x, 1 bisector,
+ * 2 z-bisect, 3 3-fold, 4 z-only. Any other value selects the identity rotation.
+ *
+ * molecularDipoles / labFrameDipoles hold 3 values per atom; molecularQuadrupoles /
+ * labFrameQuadrupoles hold 9 values per atom (row-major 3x3).
+ *
+ * Atoms whose x- or z-axis atom index is negative have no local frame. Their
+ * molecular moments are copied through unrotated so that the lab-frame arrays
+ * are defined for every atom rather than left unwritten.
+ * NOTE(review): a z-only atom with no x-axis atom also takes this path - confirm
+ * whether such atoms should get a frame built from z alone instead.
+ */
+extern "C" __global__ void computeLabFrameMoments(real4* __restrict__ posq, int4* __restrict__ multipoleParticles, float* __restrict__ molecularDipoles,
+        float* __restrict__ molecularQuadrupoles, real* __restrict__ labFrameDipoles, real* __restrict__ labFrameQuadrupoles) {
+    for (int particleIndex = blockIdx.x*blockDim.x+threadIdx.x; particleIndex < NUM_ATOMS; particleIndex += gridDim.x*blockDim.x) {
+        const unsigned int dipoleOffset = 3*particleIndex;
+        const unsigned int quadrupoleOffset = 9*particleIndex;
+        const int4 particles = multipoleParticles[particleIndex];
+
+        if (particles.x < 0 || particles.z < 0) {
+            // No local frame can be built: pass the molecular moments through unchanged.
+            for (int i = 0; i < 3; i++)
+                labFrameDipoles[dipoleOffset+i] = molecularDipoles[dipoleOffset+i];
+            for (int i = 0; i < 9; i++)
+                labFrameQuadrupoles[quadrupoleOffset+i] = molecularQuadrupoles[quadrupoleOffset+i];
+            continue;
+        }
+
+        // Vectors from this atom to its z-axis and x-axis atoms.
+
+        const real4 thisParticlePos = posq[particleIndex];
+        const real4 posZ = posq[particles.z];
+        const real4 posX = posq[particles.x];
+        real3 vectorZ = normalize(make_real3(posZ.x-thisParticlePos.x, posZ.y-thisParticlePos.y, posZ.z-thisParticlePos.z));
+        real3 vectorX = make_real3(posX.x-thisParticlePos.x, posX.y-thisParticlePos.y, posX.z-thisParticlePos.z);
+        const int axisType = particles.w;
+        const bool hasValidY = (particles.y >= 0 && particles.y < NUM_ATOMS);
+
+        /*
+            Frame construction per axis type (z is already normalized above):
+
+            z-only:    pick an arbitrary x;            x = x - (x.z)z; norm x
+            z-then-x:                                   x = x - (x.z)z; norm x
+            bisector:  norm x; z = x + z; norm z;       x = x - (x.z)z; norm x
+            z-bisect:  norm x; norm y; x = x + y; norm x; x = x - (x.z)z; norm x
+            3-fold:    norm x; norm y; z = x + y + z; norm z; x = x - (x.z)z; norm x
+        */
+
+        if (axisType == 1) {
+            // bisector
+            vectorX = normalize(vectorX);
+            vectorZ += vectorX;
+            vectorZ = normalize(vectorZ);
+        }
+        else if (axisType == 2 || axisType == 3) {
+            // z-bisect and 3-fold both need the y-axis atom; without one, x and z are left as they are.
+            if (hasValidY) {
+                const real4 posY = posq[particles.y];
+                const real3 directionY = normalize(make_real3(posY.x-thisParticlePos.x, posY.y-thisParticlePos.y, posY.z-thisParticlePos.z));
+                vectorX = normalize(vectorX);
+                if (axisType == 2) {
+                    // z-bisect
+                    vectorX += directionY;
+                    vectorX = normalize(vectorX);
+                }
+                else {
+                    // 3-fold
+                    vectorZ += vectorX + directionY;
+                    vectorZ = normalize(vectorZ);
+                }
+            }
+        }
+        else if (axisType >= 4)
+            vectorX = make_real3((real) 0.1f); // z-only: arbitrary starting x, orthogonalized against z below
+
+        // x = x - (x.z)z, then complete the right-handed frame with y = z cross x.
+
+        vectorX -= dot(vectorZ, vectorX)*vectorZ;
+        vectorX = normalize(vectorX);
+        real3 vectorY = cross(vectorZ, vectorX);
+
+        // Use the identity rotation for unrecognized axis types.
+
+        if (axisType < 0 || axisType > 4) {
+            vectorX = make_real3(1, 0, 0);
+            vectorY = make_real3(0, 1, 0);
+            vectorZ = make_real3(0, 0, 1);
+        }
+
+        // Check the chirality: if the signed volume spanned by the frame atoms (relative
+        // to the y-axis atom) is negative, the y components of the moments are flipped.
+        // The y index is bounds-checked the same way as in the z-bisect/3-fold branch.
+
+        bool reverse = false;
+        if (axisType != 0 && hasValidY) {
+            const real4 posY = posq[particles.y];
+            const real3 d0 = make_real3(thisParticlePos.x-posY.x, thisParticlePos.y-posY.y, thisParticlePos.z-posY.z);
+            const real3 d1 = make_real3(posZ.x-posY.x, posZ.y-posY.y, posZ.z-posY.z);
+            const real3 d2 = make_real3(posX.x-posY.x, posX.y-posY.y, posX.z-posY.z);
+            const real cx = d1.y*d2.z - d1.z*d2.y;
+            const real cy = d2.y*d0.z - d2.z*d0.y;
+            const real cz = d0.y*d1.z - d0.z*d1.y;
+            const real volume = cx*d0.x + cy*d1.x + cz*d2.x;
+            reverse = (volume < 0);
+        }
+
+        // Transform the dipole.
+
+        real molDipole[3];
+        molDipole[0] = molecularDipoles[dipoleOffset];
+        molDipole[1] = molecularDipoles[dipoleOffset+1];
+        molDipole[2] = molecularDipoles[dipoleOffset+2];
+        if (reverse)
+            molDipole[1] *= -1;
+        labFrameDipoles[dipoleOffset] = molDipole[0]*vectorX.x + molDipole[1]*vectorY.x + molDipole[2]*vectorZ.x;
+        labFrameDipoles[dipoleOffset+1] = molDipole[0]*vectorX.y + molDipole[1]*vectorY.y + molDipole[2]*vectorZ.y;
+        labFrameDipoles[dipoleOffset+2] = molDipole[0]*vectorX.z + molDipole[1]*vectorY.z + molDipole[2]*vectorZ.z;
+
+        // Transform the quadrupole: lab(a,b) = sum_k axis_k.a * (sum_l axis_l.b * mPole[k][l]),
+        // with axis_0/1/2 = vectorX/Y/Z.
+
+        real mPole[3][3];
+        for (int row = 0; row < 3; row++)
+            for (int col = 0; col < 3; col++)
+                mPole[row][col] = molecularQuadrupoles[quadrupoleOffset+3*row+col];
+        if (reverse) {
+            mPole[0][1] *= -1;
+            mPole[1][0] *= -1;
+            mPole[1][2] *= -1;
+            mPole[2][1] *= -1;
+        }
+
+        // Inner sums for each row k of the molecular tensor, for all three lab components at once.
+        const real3 inner0 = mPole[0][0]*vectorX + mPole[0][1]*vectorY + mPole[0][2]*vectorZ;
+        const real3 inner1 = mPole[1][0]*vectorX + mPole[1][1]*vectorY + mPole[1][2]*vectorZ;
+        const real3 inner2 = mPole[2][0]*vectorX + mPole[2][1]*vectorY + mPole[2][2]*vectorZ;
+
+        // Accumulate in locals and store once, instead of repeated global read-modify-write.
+        const real labXX = vectorX.x*inner0.x + vectorY.x*inner1.x + vectorZ.x*inner2.x;
+        const real labXY = vectorX.x*inner0.y + vectorY.x*inner1.y + vectorZ.x*inner2.y;
+        const real labXZ = vectorX.x*inner0.z + vectorY.x*inner1.z + vectorZ.x*inner2.z;
+        const real labYY = vectorX.y*inner0.y + vectorY.y*inner1.y + vectorZ.y*inner2.y;
+        const real labYZ = vectorX.y*inner0.z + vectorY.y*inner1.z + vectorZ.y*inner2.z;
+        const real labZZ = vectorX.z*inner0.z + vectorY.z*inner1.z + vectorZ.z*inner2.z;
+
+        // The lab-frame tensor is stored symmetric: the lower triangle mirrors the upper one.
+        labFrameQuadrupoles[quadrupoleOffset] = labXX;
+        labFrameQuadrupoles[quadrupoleOffset+1] = labXY;
+        labFrameQuadrupoles[quadrupoleOffset+2] = labXZ;
+        labFrameQuadrupoles[quadrupoleOffset+3] = labXY;
+        labFrameQuadrupoles[quadrupoleOffset+4] = labYY;
+        labFrameQuadrupoles[quadrupoleOffset+5] = labYZ;
+        labFrameQuadrupoles[quadrupoleOffset+6] = labXZ;
+        labFrameQuadrupoles[quadrupoleOffset+7] = labYZ;
+        labFrameQuadrupoles[quadrupoleOffset+8] = labZZ;
+    }
+}
+
--- a/plugins/amoeba/platforms/cuda2/tests/TestCudaAmoebaPME.cpp
+++ b/plugins/amoeba/platforms/cuda2/tests/TestCudaAmoebaPME.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMMAmoeba                             *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2010 Stanford University and the Authors.      *
+ * Authors: Mark Friedrichs                                                   *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+/**
+ * This tests the CUDA implementation of AmoebaMultipoleForce.
+ */
+
+#include "openmm/internal/AssertionUtilities.h"
+#include "AmoebaTinkerParameterFile.h"
+#include "openmm/Context.h"
+#include "CudaPlatform.h"
+#include "openmm/AmoebaMultipoleForce.h"
+#include "openmm/System.h"
+#include "openmm/VerletIntegrator.h"
+#include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
+#include <iostream>
+#include <vector>
+
+using namespace OpenMM;
+using namespace std;
+
+const double TOL = 1e-5;
+
+/**
+ * Build a three-particle water system whose only force is an AmoebaMultipoleForce
+ * using the PME nonbonded method, create a Context for it on the CUDA platform,
+ * and request forces and energy.
+ *
+ * No reference values are compared here: the routine only exercises the
+ * computation, and prints the resulting forces to stderr when AMOEBA_DEBUG is
+ * defined.
+ */
+void testPMEWater() {
+    Platform& platform = Platform::getPlatformByName("CUDA");
+
+    // One particle of mass 16 followed by two particles of mass 1.
+    System system;
+    system.addParticle(16);
+    system.addParticle(1);
+    system.addParticle(1);
+    VerletIntegrator integrator(0.01);
+
+    AmoebaMultipoleForce* multipoles = new AmoebaMultipoleForce();
+    multipoles->setNonbondedMethod(AmoebaMultipoleForce::PME);
+
+    // Multipole parameters for particle 0 (the oxygen).
+    vector<double> oxygenDipole(3, 0.0);
+    oxygenDipole[2] = 7.556121361e-2;
+    vector<double> oxygenQuadrupole(9, 0.0);
+    oxygenQuadrupole[0] = 3.540307211e-2;
+    oxygenQuadrupole[4] = -3.902570771e-2;
+    oxygenQuadrupole[8] = 3.622635596e-3;
+    const double oxygenDamp = 9.707801995e-01*sqrt(0.1);
+    const double oxygenPolarity = 0.837*0.001;
+    // NOTE(review): the four integers are presumably the axis type followed by
+    // the indices of the frame-defining particles -- confirm against the
+    // AmoebaMultipoleForce::addParticle declaration.
+    multipoles->addParticle(-0.51966, oxygenDipole, oxygenQuadrupole, 1, 1, 2, -1, 0.39, oxygenDamp, oxygenPolarity);
+
+    // Multipole parameters shared by particles 1 and 2 (the hydrogens).
+    vector<double> hydrogenDipole(3, 0.0);
+    hydrogenDipole[0] = -2.042094848e-2;
+    hydrogenDipole[2] = -3.078753000e-2;
+    vector<double> hydrogenQuadrupole(9, 0.0);
+    hydrogenQuadrupole[0] = -3.428482490e-3;
+    hydrogenQuadrupole[2] = -1.894859639e-4;
+    hydrogenQuadrupole[4] = -1.002408752e-2;
+    hydrogenQuadrupole[6] = -1.894859639e-4;
+    hydrogenQuadrupole[8] = 1.345257001e-2;
+    const double hydrogenDamp = 8.897068742e-01*sqrt(0.1);
+    const double hydrogenPolarity = 0.496*0.001;
+    multipoles->addParticle(0.25983, hydrogenDipole, hydrogenQuadrupole, 0, 0, 2, -1, 0.39, hydrogenDamp, hydrogenPolarity);
+    multipoles->addParticle(0.25983, hydrogenDipole, hydrogenQuadrupole, 0, 0, 1, -1, 0.39, hydrogenDamp, hydrogenPolarity);
+    multipoles->setCutoffDistance(1.0);
+
+    // Every particle lists all three particles as its PolarizationCovalent11 set.
+    std::vector<int> wholeMolecule;
+    for (int atom = 0; atom < 3; atom++)
+        wholeMolecule.push_back(atom);
+    for (int atom = 0; atom < 3; atom++)
+        multipoles->setCovalentMap(atom, AmoebaMultipoleForce::PolarizationCovalent11, wholeMolecule);
+
+    // Each particle's Covalent12 set holds the other two particles, in ascending order.
+    for (int atom = 0; atom < 3; atom++) {
+        std::vector<int> others;
+        for (int partner = 0; partner < 3; partner++) {
+            if (partner != atom)
+                others.push_back(partner);
+        }
+        multipoles->setCovalentMap(atom, AmoebaMultipoleForce::Covalent12, others);
+    }
+
+    multipoles->setEwaldErrorTolerance(TOL);
+    system.setDefaultPeriodicBoxVectors(Vec3(2, 0, 0), Vec3(0, 2, 0), Vec3(0, 0, 2));
+    system.addForce(multipoles);
+    Context context(system, integrator, platform);
+
+    // Particle 0 sits at the origin; the other two lie in the xy plane at
+    // distance bondLength from it, separated by bondAngle.
+    const double bondLength = 0.1;
+    const double bondAngle = 109.47*M_PI/180;
+    vector<Vec3> positions(3);
+    positions[0] = Vec3();
+    positions[1] = Vec3(bondLength, 0, 0);
+    positions[2] = Vec3(bondLength*std::cos(bondAngle), bondLength*std::sin(bondAngle), 0);
+    context.setPositions(positions);
+
+    State state = context.getState(State::Forces | State::Energy);
+    const vector<Vec3>& computedForces = state.getForces();
+
+#ifdef AMOEBA_DEBUG
+    // Dump one line per particle: index followed by the three force components.
+    (void) fprintf( stderr, "PME forces\n" );
+    for( unsigned int particle = 0; particle < computedForces.size(); particle++ ){
+        const Vec3& force = computedForces[particle];
+        (void) fprintf( stderr, "%6u [%14.7e %14.7e %14.7e]\n", particle,
+                        force[0], force[1], force[2] );
+    }
+    (void) fflush( stderr );
+#endif
+
+}
+
+/**
+ * Test entry point: registers the AMOEBA CUDA kernel factories and runs
+ * testPMEWater().  Prints "Done" and returns 0 on success; if an exception is
+ * thrown, prints its message and returns 1.
+ */
+int main() {
+    int exitStatus = 0;
+    try {
+        registerAmoebaCudaKernelFactories();
+        testPMEWater();
+    }
+    catch (const std::exception& error) {
+        std::cout << "exception: " << error.what() << std::endl;
+        exitStatus = 1;
+    }
+    if (exitStatus == 0)
+        std::cout << "Done" << std::endl;
+    return exitStatus;
+}
+