Merge remote-tracking branch 'upstream/master'

047934e2 · Rafal P. Wiewiora · ce3a5dc0 · d12c9bd1 · 047934e2 · 047934e2
Commit 047934e2 authored Mar 01, 2017 by Rafal P. Wiewiora
20 changed files
--- a/platforms/reference/tests/TestReferenceNeighborList.cpp
+++ b/platforms/reference/tests/TestReferenceNeighborList.cpp
@@ -41,14 +41,14 @@ using namespace OpenMM;

 void testNeighborList()
 {
-    vector<RealVec> particleList(2);
-    particleList[0] = RealVec(13.6, 0, 0);
-    particleList[1] = RealVec(0, 0, 0);
+    vector<Vec3> particleList(2);
+    particleList[0] = Vec3(13.6, 0, 0);
+    particleList[1] = Vec3(0, 0, 0);
    vector<set<int> > exclusions(2);
    
    NeighborList neighborList;

-    RealVec boxVectors[3];
+    Vec3 boxVectors[3];
    computeNeighborListNaive(neighborList, 2, particleList, exclusions, boxVectors, false, 13.7, 0.01);
    assert(neighborList.size() == 1);
    
@@ -62,15 +62,15 @@ void testNeighborList()
    assert(neighborList.size() == 0);
 }

-double distance2(RealVec& pos1, RealVec& pos2, const RealVec* periodicBoxVectors) {
-    RealVec diff = pos1-pos2;
+double distance2(Vec3& pos1, Vec3& pos2, const Vec3* periodicBoxVectors) {
+    Vec3 diff = pos1-pos2;
    diff -= periodicBoxVectors[2]*floor(diff[2]/periodicBoxVectors[2][2]+0.5);
    diff -= periodicBoxVectors[1]*floor(diff[1]/periodicBoxVectors[1][1]+0.5);
    diff -= periodicBoxVectors[0]*floor(diff[0]/periodicBoxVectors[0][0]+0.5);
    return diff.dot(diff);
 }

-void verifyNeighborList(NeighborList& list, int numParticles, vector<RealVec>& positions, const RealVec* periodicBoxVectors, double cutoff) {
+void verifyNeighborList(NeighborList& list, int numParticles, vector<Vec3>& positions, const Vec3* periodicBoxVectors, double cutoff) {
    for (int i = 0; i < (int) list.size(); i++) {
        int particle1 = list[i].first;
        int particle2 = list[i].second;
@@ -87,18 +87,18 @@ void verifyNeighborList(NeighborList& list, int numParticles, vector<RealVec>& p
 void testPeriodic() {
    const int numParticles = 100;
    const double cutoff = 3.0;
-    RealVec periodicBoxVectors[3];
-    periodicBoxVectors[0] = RealVec(20, 0, 0);
-    periodicBoxVectors[1] = RealVec(0, 15, 0);
-    periodicBoxVectors[2] = RealVec(0, 0, 22);
-    vector<RealVec> particleList(numParticles);
+    Vec3 periodicBoxVectors[3];
+    periodicBoxVectors[0] = Vec3(20, 0, 0);
+    periodicBoxVectors[1] = Vec3(0, 15, 0);
+    periodicBoxVectors[2] = Vec3(0, 0, 22);
+    vector<Vec3> particleList(numParticles);
    OpenMM_SFMT::SFMT sfmt;
    init_gen_rand(0, sfmt);

    for (int i = 0; i <numParticles; i++) {
-        particleList[i][0] = (RealOpenMM) (genrand_real2(sfmt)*periodicBoxVectors[0][0]*3);
-        particleList[i][1] = (RealOpenMM) (genrand_real2(sfmt)*periodicBoxVectors[1][1]*3);
-        particleList[i][2] = (RealOpenMM) (genrand_real2(sfmt)*periodicBoxVectors[2][2]*3);
+        particleList[i][0] = genrand_real2(sfmt)*periodicBoxVectors[0][0]*3;
+        particleList[i][1] = genrand_real2(sfmt)*periodicBoxVectors[1][1]*3;
+        particleList[i][2] = genrand_real2(sfmt)*periodicBoxVectors[2][2]*3;
    }
    vector<set<int> > exclusions(numParticles);
    NeighborList neighborList;
@@ -111,18 +111,18 @@ void testPeriodic() {
 void testTriclinic() {
    const int numParticles = 1000;
    const double cutoff = 3.0;
-    RealVec periodicBoxVectors[3];
-    periodicBoxVectors[0] = RealVec(20, 0, 0);
-    periodicBoxVectors[1] = RealVec(5, 15, 0);
-    periodicBoxVectors[2] = RealVec(-3, -7, 22);
-    vector<RealVec> particleList(numParticles);
+    Vec3 periodicBoxVectors[3];
+    periodicBoxVectors[0] = Vec3(20, 0, 0);
+    periodicBoxVectors[1] = Vec3(5, 15, 0);
+    periodicBoxVectors[2] = Vec3(-3, -7, 22);
+    vector<Vec3> particleList(numParticles);
    OpenMM_SFMT::SFMT sfmt;
    init_gen_rand(0, sfmt);

    for (int i = 0; i <numParticles; i++) {
-        particleList[i][0] = (RealOpenMM) (genrand_real2(sfmt)*periodicBoxVectors[0][0]*3);
-        particleList[i][1] = (RealOpenMM) (genrand_real2(sfmt)*periodicBoxVectors[1][1]*3);
-        particleList[i][2] = (RealOpenMM) (genrand_real2(sfmt)*periodicBoxVectors[2][2]*3);
+        particleList[i][0] = genrand_real2(sfmt)*periodicBoxVectors[0][0]*3;
+        particleList[i][1] = genrand_real2(sfmt)*periodicBoxVectors[1][1]*3;
+        particleList[i][2] = genrand_real2(sfmt)*periodicBoxVectors[2][2]*3;
    }
    vector<set<int> > exclusions(numParticles);
    NeighborList neighborList;

--- a/plugins/amoeba/openmmapi/src/AmoebaAngleForceImpl.cpp
+++ b/plugins/amoeba/openmmapi/src/AmoebaAngleForceImpl.cpp
@@ -64,4 +64,5 @@ std::vector<std::string> AmoebaAngleForceImpl::getKernelNames() {

 void AmoebaAngleForceImpl::updateParametersInContext(ContextImpl& context) {
    kernel.getAs<CalcAmoebaAngleForceKernel>().copyParametersToContext(context, owner);
+    context.systemChanged();
 }
--- a/plugins/amoeba/openmmapi/src/AmoebaBondForceImpl.cpp
+++ b/plugins/amoeba/openmmapi/src/AmoebaBondForceImpl.cpp
@@ -75,4 +75,5 @@ vector<pair<int, int> > AmoebaBondForceImpl::getBondedParticles() const {

 void AmoebaBondForceImpl::updateParametersInContext(ContextImpl& context) {
    kernel.getAs<CalcAmoebaBondForceKernel>().copyParametersToContext(context, owner);
+    context.systemChanged();
 }
--- a/plugins/amoeba/openmmapi/src/AmoebaGeneralizedKirkwoodForceImpl.cpp
+++ b/plugins/amoeba/openmmapi/src/AmoebaGeneralizedKirkwoodForceImpl.cpp
@@ -66,4 +66,5 @@ std::vector<std::string> AmoebaGeneralizedKirkwoodForceImpl::getKernelNames() {

 void AmoebaGeneralizedKirkwoodForceImpl::updateParametersInContext(ContextImpl& context) {
    kernel.getAs<CalcAmoebaGeneralizedKirkwoodForceKernel>().copyParametersToContext(context, owner);
+    context.systemChanged();
 }
--- a/plugins/amoeba/openmmapi/src/AmoebaInPlaneAngleForceImpl.cpp
+++ b/plugins/amoeba/openmmapi/src/AmoebaInPlaneAngleForceImpl.cpp
@@ -64,4 +64,5 @@ std::vector<std::string> AmoebaInPlaneAngleForceImpl::getKernelNames() {

 void AmoebaInPlaneAngleForceImpl::updateParametersInContext(ContextImpl& context) {
    kernel.getAs<CalcAmoebaInPlaneAngleForceKernel>().copyParametersToContext(context, owner);
+    context.systemChanged();
 }
--- a/plugins/amoeba/openmmapi/src/AmoebaMultipoleForceImpl.cpp
+++ b/plugins/amoeba/openmmapi/src/AmoebaMultipoleForceImpl.cpp
@@ -50,7 +50,8 @@ AmoebaMultipoleForceImpl::~AmoebaMultipoleForceImpl() {
 void AmoebaMultipoleForceImpl::initialize(ContextImpl& context) {

    const System& system = context.getSystem();
-    if (owner.getNumMultipoles() != system.getNumParticles())
+    int numParticles = system.getNumParticles();
+    if (owner.getNumMultipoles() != numParticles)
        throw OpenMMException("AmoebaMultipoleForce must have exactly as many particles as the System it belongs to.");

    // check cutoff < 0.5*boxSize
@@ -64,7 +65,7 @@ void AmoebaMultipoleForceImpl::initialize(ContextImpl& context) {
    }

    double quadrupoleValidationTolerance = 1.0e-05;
-    for (int ii = 0; ii < system.getNumParticles(); ii++) {
+    for (int ii = 0; ii < numParticles; ii++) {

        int axisType, multipoleAtomZ, multipoleAtomX, multipoleAtomY;
        double charge, thole, dampingFactor, polarity ;
@@ -121,6 +122,23 @@ void AmoebaMultipoleForceImpl::initialize(ContextImpl& context) {
             buffer << "] (ZThenX, Bisector, Z-Bisect, ThreeFold, NoAxisType) currently handled .";
             throw OpenMMException(buffer.str());
        }
+        if (axisType != AmoebaMultipoleForce::NoAxisType && (multipoleAtomZ < 0 || multipoleAtomZ >= numParticles)) {
+            std::stringstream buffer;
+            buffer << "AmoebaMultipoleForce: invalid z axis particle: " << multipoleAtomZ;
+            throw OpenMMException(buffer.str());
+        }
+        if (axisType != AmoebaMultipoleForce::NoAxisType && axisType != AmoebaMultipoleForce::ZOnly &&
+                (multipoleAtomX < 0 || multipoleAtomX >= numParticles)) {
+            std::stringstream buffer;
+            buffer << "AmoebaMultipoleForce: invalid x axis particle: " << multipoleAtomX;
+            throw OpenMMException(buffer.str());
+        }
+        if ((axisType == AmoebaMultipoleForce::ZBisect || axisType == AmoebaMultipoleForce::ThreeFold) &&
+                (multipoleAtomY < 0 || multipoleAtomY >= numParticles)) {
+            std::stringstream buffer;
+            buffer << "AmoebaMultipoleForce: invalid y axis particle: " << multipoleAtomY;
+            throw OpenMMException(buffer.str());
+        }
    }
    kernel = context.getPlatform().createKernel(CalcAmoebaMultipoleForceKernel::Name(), context);
    kernel.getAs<CalcAmoebaMultipoleForceKernel>().initialize(context.getSystem(), owner);
@@ -206,6 +224,7 @@ void AmoebaMultipoleForceImpl::getSystemMultipoleMoments(ContextImpl& context, s

 void AmoebaMultipoleForceImpl::updateParametersInContext(ContextImpl& context) {
    kernel.getAs<CalcAmoebaMultipoleForceKernel>().copyParametersToContext(context, owner);
+    context.systemChanged();
 }

 void AmoebaMultipoleForceImpl::getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {

--- a/plugins/amoeba/openmmapi/src/AmoebaOutOfPlaneBendForceImpl.cpp
+++ b/plugins/amoeba/openmmapi/src/AmoebaOutOfPlaneBendForceImpl.cpp
@@ -64,4 +64,5 @@ std::vector<std::string> AmoebaOutOfPlaneBendForceImpl::getKernelNames() {

 void AmoebaOutOfPlaneBendForceImpl::updateParametersInContext(ContextImpl& context) {
    kernel.getAs<CalcAmoebaOutOfPlaneBendForceKernel>().copyParametersToContext(context, owner);
+    context.systemChanged();
 }
--- a/plugins/amoeba/openmmapi/src/AmoebaPiTorsionForceImpl.cpp
+++ b/plugins/amoeba/openmmapi/src/AmoebaPiTorsionForceImpl.cpp
@@ -64,4 +64,5 @@ std::vector<std::string> AmoebaPiTorsionForceImpl::getKernelNames() {

 void AmoebaPiTorsionForceImpl::updateParametersInContext(ContextImpl& context) {
    kernel.getAs<CalcAmoebaPiTorsionForceKernel>().copyParametersToContext(context, owner);
+    context.systemChanged();
 }
--- a/plugins/amoeba/openmmapi/src/AmoebaStretchBendForceImpl.cpp
+++ b/plugins/amoeba/openmmapi/src/AmoebaStretchBendForceImpl.cpp
@@ -64,4 +64,5 @@ std::vector<std::string> AmoebaStretchBendForceImpl::getKernelNames() {

 void AmoebaStretchBendForceImpl::updateParametersInContext(ContextImpl& context) {
    kernel.getAs<CalcAmoebaStretchBendForceKernel>().copyParametersToContext(context, owner);
+    context.systemChanged();
 }
--- a/plugins/amoeba/platforms/cuda/CMakeLists.txt
+++ b/plugins/amoeba/platforms/cuda/CMakeLists.txt
@@ -21,6 +21,7 @@ SET(OPENMM_SOURCE_SUBDIRS .)
 SET(OPENMMAMOEBACUDA_LIBRARY_NAME OpenMMAmoebaCUDA)

 SET(SHARED_TARGET ${OPENMMAMOEBACUDA_LIBRARY_NAME})
+SET(STATIC_TARGET ${OPENMMAMOEBACUDA_LIBRARY_NAME}_static)


 # These are all the places to search for header files which are
@@ -85,17 +86,42 @@ ADD_CUSTOM_COMMAND(OUTPUT ${CUDA_KERNELS_CPP} ${CUDA_KERNELS_H}
    DEPENDS ${CUDA_KERNELS}
 )
 SET_SOURCE_FILES_PROPERTIES(${CUDA_KERNELS_CPP} ${CUDA_KERNELS_H} PROPERTIES GENERATED TRUE)
-ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})

-TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${OPENMM_LIBRARY_NAME} ${PTHREADS_LIB})
-TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${OPENMM_LIBRARY_NAME}CUDA)
-TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${SHARED_AMOEBA_TARGET})
-SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_BUILDING_SHARED_LIBRARY")
-IF (APPLE)
+# Build the shared plugin library.
+
+IF (OPENMM_BUILD_SHARED_LIB)
+    ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
+
+    TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${OPENMM_LIBRARY_NAME} ${PTHREADS_LIB})
+    TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${OPENMM_LIBRARY_NAME}CUDA)
+    TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${SHARED_AMOEBA_TARGET})
+    SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_BUILDING_SHARED_LIBRARY")
+    IF (APPLE)
        SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS} -F/Library/Frameworks -framework CUDA")
-ELSE (APPLE)
+    ELSE (APPLE)
        SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}")
-ENDIF (APPLE)
+    ENDIF (APPLE)
+
+    INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET})
+ENDIF (OPENMM_BUILD_SHARED_LIB)
+
+# Build the static plugin library.
+
+IF(OPENMM_BUILD_STATIC_LIB)
+    ADD_LIBRARY(${STATIC_TARGET} STATIC ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
+
+    TARGET_LINK_LIBRARIES(${STATIC_TARGET} ${OPENMM_LIBRARY_NAME} ${PTHREADS_LIB})
+    TARGET_LINK_LIBRARIES(${STATIC_TARGET} ${OPENMM_LIBRARY_NAME}CUDA)
+    TARGET_LINK_LIBRARIES(${STATIC_TARGET} ${STATIC_AMOEBA_TARGET})
+    SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_BUILDING_STATIC_LIBRARY")
+    IF (APPLE)
+        SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS} -F/Library/Frameworks -framework CUDA")
+    ELSE (APPLE)
+        SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}")
+    ENDIF (APPLE)
+
+    INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${STATIC_TARGET})
+ENDIF(OPENMM_BUILD_STATIC_LIB)

 INSTALL(TARGETS ${SHARED_TARGET} DESTINATION ${CMAKE_INSTALL_PREFIX}/lib/plugins)
 # Ensure that links to the main CUDA library will be resolved.

--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernelFactory.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernelFactory.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2016 Stanford University and the Authors.      *
 * Authors: Mark Friedrichs, Peter Eastman                                    *
 * Contributors:                                                              *
 *                                                                            *
@@ -33,10 +33,18 @@

 using namespace OpenMM;

+#ifdef OPENMM_BUILDING_STATIC_LIBRARY
+static void registerPlatforms() {
+#else
 extern "C" OPENMM_EXPORT void registerPlatforms() {
+#endif
 }

+#ifdef OPENMM_BUILDING_STATIC_LIBRARY
+static void registerKernelFactories() {
+#else
 extern "C" OPENMM_EXPORT void registerKernelFactories() {
+#endif
    try {
        Platform& platform = Platform::getPlatformByName("CUDA");
        AmoebaCudaKernelFactory* factory = new AmoebaCudaKernelFactory();

--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
@@ -41,7 +41,7 @@
 #include "CudaForceInfo.h"
 #include "CudaKernelSources.h"
 #include "CudaNonbondedUtilities.h"
-#include "jama_svd.h"
+#include "jama_lu.h"

 #include <algorithm>
 #include <cmath>
@@ -52,10 +52,10 @@
 using namespace OpenMM;
 using namespace std;

-#define CHECK_RESULT(result) \
+#define CHECK_RESULT(result, prefix) \
    if (result != CUDA_SUCCESS) { \
        std::stringstream m; \
-        m<<errorMessage<<": "<<cu.getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
+        m<<prefix<<": "<<cu.getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
        throw OpenMMException(m.str());\
    }

@@ -813,7 +813,7 @@ private:
 };

 CudaCalcAmoebaMultipoleForceKernel::CudaCalcAmoebaMultipoleForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : 
-        CalcAmoebaMultipoleForceKernel(name, platform), cu(cu), system(system), hasInitializedScaleFactors(false), hasInitializedFFT(false), multipolesAreValid(false),
+        CalcAmoebaMultipoleForceKernel(name, platform), cu(cu), system(system), hasInitializedScaleFactors(false), hasInitializedFFT(false), multipolesAreValid(false), hasCreatedEvent(false),
        multipoleParticles(NULL), molecularDipoles(NULL), molecularQuadrupoles(NULL), labFrameDipoles(NULL), labFrameQuadrupoles(NULL), sphericalDipoles(NULL), sphericalQuadrupoles(NULL),
        fracDipoles(NULL), fracQuadrupoles(NULL), field(NULL), fieldPolar(NULL), inducedField(NULL), inducedFieldPolar(NULL), torque(NULL), dampingAndThole(NULL), inducedDipole(NULL),
        diisCoefficients(NULL), inducedDipolePolar(NULL), inducedDipoleErrors(NULL), prevDipoles(NULL), prevDipolesPolar(NULL), prevDipolesGk(NULL),
@@ -822,7 +822,7 @@ CudaCalcAmoebaMultipoleForceKernel::CudaCalcAmoebaMultipoleForceKernel(std::stri
        inducedDipoleFieldGradientGk(NULL), inducedDipoleFieldGradientGkPolar(NULL), extrapolatedDipoleFieldGradient(NULL), extrapolatedDipoleFieldGradientPolar(NULL),
        extrapolatedDipoleFieldGradientGk(NULL), extrapolatedDipoleFieldGradientGkPolar(NULL), covalentFlags(NULL), polarizationGroupFlags(NULL),
        pmeGrid(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeIgrid(NULL), pmePhi(NULL),
-        pmePhid(NULL), pmePhip(NULL), pmePhidp(NULL), pmeCphi(NULL), pmeAtomGridIndex(NULL), lastPositions(NULL), sort(NULL), gkKernel(NULL) {
+        pmePhid(NULL), pmePhip(NULL), pmePhidp(NULL), pmeCphi(NULL), lastPositions(NULL), sort(NULL), gkKernel(NULL) {
 }

 CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() {
@@ -927,14 +927,14 @@ CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() {
        delete pmePhidp;
    if (pmeCphi != NULL)
        delete pmeCphi;
-    if (pmeAtomGridIndex != NULL)
-        delete pmeAtomGridIndex;
    if (lastPositions != NULL)
        delete lastPositions;
    if (sort != NULL)
        delete sort;
    if (hasInitializedFFT)
        cufftDestroy(fft);
+    if (hasCreatedEvent)
+        cuEventDestroy(syncEvent);
 }

 void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const AmoebaMultipoleForce& force) {
@@ -1021,6 +1021,8 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
        prevErrors = new CudaArray(cu, 3*numMultipoles*MaxPrevDIISDipoles, elementSize, "prevErrors");
        diisMatrix = new CudaArray(cu, MaxPrevDIISDipoles*MaxPrevDIISDipoles, elementSize, "diisMatrix");
        diisCoefficients = new CudaArray(cu, MaxPrevDIISDipoles+1, sizeof(float), "diisMatrix");
+        CHECK_RESULT(cuEventCreate(&syncEvent, CU_EVENT_DISABLE_TIMING), "Error creating event for AmoebaMultipoleForce");
+        hasCreatedEvent = true;
    }
    else if (polarizationType == AmoebaMultipoleForce::Extrapolated) {
        int numOrders = force.getExtrapolationCoefficients().size();
@@ -1153,7 +1155,7 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
            NonbondedForce nb;
            nb.setEwaldErrorTolerance(force.getEwaldErrorTolerance());
            nb.setCutoffDistance(force.getCutoffDistance());
-            NonbondedForceImpl::calcPMEParameters(system, nb, alpha, gridSizeX, gridSizeY, gridSizeZ);
+            NonbondedForceImpl::calcPMEParameters(system, nb, alpha, gridSizeX, gridSizeY, gridSizeZ, false);
            gridSizeX = CudaFFT3D::findLegalDimension(gridSizeX);
            gridSizeY = CudaFFT3D::findLegalDimension(gridSizeY);
            gridSizeZ = CudaFFT3D::findLegalDimension(gridSizeZ);
@@ -1212,6 +1214,7 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
        updateInducedFieldKernel = cu.getKernel(module, "updateInducedFieldByDIIS");
        recordDIISDipolesKernel = cu.getKernel(module, "recordInducedDipolesForDIIS");
        buildMatrixKernel = cu.getKernel(module, "computeDIISMatrix");
+        solveMatrixKernel = cu.getKernel(module, "solveDIISMatrix");
        initExtrapolatedKernel = cu.getKernel(module, "initExtrapolatedDipoles");
        iterateExtrapolatedKernel = cu.getKernel(module, "iterateExtrapolatedDipoles");
        computeExtrapolatedKernel = cu.getKernel(module, "computeExtrapolatedDipoles");
@@ -1253,7 +1256,6 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
        else if (polarizationType == AmoebaMultipoleForce::Extrapolated)
            pmeDefines["EXTRAPOLATED_POLARIZATION"] = "";
        CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipolePme, pmeDefines);
-        pmeGridIndexKernel = cu.getKernel(module, "findAtomGridIndex");
        pmeTransformMultipolesKernel = cu.getKernel(module, "transformMultipolesToFractionalCoordinates");
        pmeTransformPotentialKernel = cu.getKernel(module, "transformPotentialToCartesianCoordinates");
        pmeSpreadFixedMultipolesKernel = cu.getKernel(module, "gridSpreadFixedMultipoles");
@@ -1285,7 +1287,6 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
        pmePhidp = new CudaArray(cu, 20*numMultipoles, elementSize, "pmePhidp");
        pmeCphi = new CudaArray(cu, 10*numMultipoles, elementSize, "pmeCphi");
        pmeAtomRange = CudaArray::create<int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
-        pmeAtomGridIndex = CudaArray::create<int2>(cu, numMultipoles, "pmeAtomGridIndex");
        sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms());
        cufftResult result = cufftPlan3d(&fft, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_Z2Z : CUFFT_C2C);
        if (result != CUFFT_SUCCESS)
@@ -1569,16 +1570,11 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
        // Reciprocal space calculation.
        
        unsigned int maxTiles = nb.getInteractingTiles().getSize();
-        void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(),
-            cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
-            recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
-        cu.executeKernel(pmeGridIndexKernel, gridIndexArgs, cu.getNumAtoms(), cu.ThreadBlockSize, cu.ThreadBlockSize*PmeOrder*PmeOrder*elementSize);
-        sort->sort(*pmeAtomGridIndex);
        void* pmeTransformMultipolesArgs[] = {&labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(),
            &fracDipoles->getDevicePointer(), &fracQuadrupoles->getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
        cu.executeKernel(pmeTransformMultipolesKernel, pmeTransformMultipolesArgs, cu.getNumAtoms());
        void* pmeSpreadFixedMultipolesArgs[] = {&cu.getPosq().getDevicePointer(), &fracDipoles->getDevicePointer(), &fracQuadrupoles->getDevicePointer(),
-            &pmeGrid->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(),  cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
+            &pmeGrid->getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
            recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
        cu.executeKernel(pmeSpreadFixedMultipolesKernel, pmeSpreadFixedMultipolesArgs, cu.getNumAtoms());
        void* finishSpreadArgs[] = {&pmeGrid->getDevicePointer()};
@@ -1590,7 +1586,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
            cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD);
        void* pmeConvolutionArgs[] = {&pmeGrid->getDevicePointer(), &pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(),
            &pmeBsplineModuliZ->getDevicePointer(), cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
-        cu.executeKernel(pmeConvolutionKernel, pmeConvolutionArgs, cu.getNumAtoms());
+        cu.executeKernel(pmeConvolutionKernel, pmeConvolutionArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
        if (cu.getUseDoublePrecision())
            cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE);
        else
@@ -1598,7 +1594,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
        void* pmeFixedPotentialArgs[] = {&pmeGrid->getDevicePointer(), &pmePhi->getDevicePointer(), &field->getDevicePointer(),
            &fieldPolar ->getDevicePointer(), &cu.getPosq().getDevicePointer(), &labFrameDipoles->getDevicePointer(),
            cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
-            recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex->getDevicePointer()};
+            recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
        cu.executeKernel(pmeFixedPotentialKernel, pmeFixedPotentialArgs, cu.getNumAtoms());
        void* pmeTransformFixedPotentialArgs[] = {&pmePhi->getDevicePointer(), &pmeCphi->getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
        cu.executeKernel(pmeTransformPotentialKernel, pmeTransformFixedPotentialArgs, cu.getNumAtoms());
@@ -1625,7 +1621,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in

        cu.clearBuffer(*pmeGrid);
        void* pmeSpreadInducedDipolesArgs[] = {&cu.getPosq().getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(),
-            &pmeGrid->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
+            &pmeGrid->getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
            recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
        cu.executeKernel(pmeSpreadInducedDipolesKernel, pmeSpreadInducedDipolesArgs, cu.getNumAtoms());
        if (cu.getUseDoublePrecision())
@@ -1634,15 +1630,14 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
            cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD);
        else
            cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD);
-        cu.executeKernel(pmeConvolutionKernel, pmeConvolutionArgs, cu.getNumAtoms());
+        cu.executeKernel(pmeConvolutionKernel, pmeConvolutionArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
        if (cu.getUseDoublePrecision())
            cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE);
        else
            cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE);
        void* pmeInducedPotentialArgs[] = {&pmeGrid->getDevicePointer(), &pmePhid->getDevicePointer(), &pmePhip->getDevicePointer(),
            &pmePhidp->getDevicePointer(), &cu.getPosq().getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(),
-            cu.getPeriodicBoxVecZPointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2],
-            &pmeAtomGridIndex->getDevicePointer()};
+            cu.getPeriodicBoxVecZPointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
        cu.executeKernel(pmeInducedPotentialKernel, pmeInducedPotentialArgs, cu.getNumAtoms());
        
        // Iterate until the dipoles converge.
@@ -1771,7 +1766,7 @@ void CudaCalcAmoebaMultipoleForceKernel::computeInducedField(void** recipBoxVect
        cu.executeKernel(computeInducedFieldKernel, &computeInducedFieldArgs[0], numForceThreadBlocks*inducedFieldThreads, inducedFieldThreads);
        cu.clearBuffer(*pmeGrid);
        void* pmeSpreadInducedDipolesArgs[] = {&cu.getPosq().getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(),
-            &pmeGrid->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
+            &pmeGrid->getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
            recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
        cu.executeKernel(pmeSpreadInducedDipolesKernel, pmeSpreadInducedDipolesArgs, cu.getNumAtoms());
        if (cu.getUseDoublePrecision()) {
@@ -1784,15 +1779,14 @@ void CudaCalcAmoebaMultipoleForceKernel::computeInducedField(void** recipBoxVect
            cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD);
        void* pmeConvolutionArgs[] = {&pmeGrid->getDevicePointer(), &pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(),
            &pmeBsplineModuliZ->getDevicePointer(), cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
-        cu.executeKernel(pmeConvolutionKernel, pmeConvolutionArgs, cu.getNumAtoms());
+        cu.executeKernel(pmeConvolutionKernel, pmeConvolutionArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
        if (cu.getUseDoublePrecision())
            cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE);
        else
            cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE);
        void* pmeInducedPotentialArgs[] = {&pmeGrid->getDevicePointer(), &pmePhid->getDevicePointer(), &pmePhip->getDevicePointer(),
            &pmePhidp->getDevicePointer(), &cu.getPosq().getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(),
-            cu.getPeriodicBoxVecZPointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2],
-            &pmeAtomGridIndex->getDevicePointer()};
+            cu.getPeriodicBoxVecZPointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
        cu.executeKernel(pmeInducedPotentialKernel, pmeInducedPotentialArgs, cu.getNumAtoms());
        if (polarizationType == AmoebaMultipoleForce::Extrapolated) {
            void* pmeRecordInducedFieldDipolesArgs[] = {&pmePhid->getDevicePointer(), &pmePhip->getDevicePointer(),
@@ -1831,22 +1825,24 @@ bool CudaCalcAmoebaMultipoleForceKernel::iterateDipolesByDIIS(int iteration) {
    cu.executeKernel(recordDIISDipolesKernel, recordDIISDipolesArgs, cu.getNumThreadBlocks()*cu.ThreadBlockSize, cu.ThreadBlockSize, cu.ThreadBlockSize*elementSize*2);
    float2* errors = (float2*) cu.getPinnedBuffer();
    inducedDipoleErrors->download(errors, false);
+    cuEventRecord(syncEvent, cu.getCurrentStream());
    
    // Build the DIIS matrix.
    
    int numPrev = (iteration+1 < MaxPrevDIISDipoles ? iteration+1 : MaxPrevDIISDipoles);
    void* buildMatrixArgs[] = {&prevErrors->getDevicePointer(), &iteration, &diisMatrix->getDevicePointer()};
    int threadBlocks = min(numPrev, cu.getNumThreadBlocks());
-    cu.executeKernel(buildMatrixKernel, buildMatrixArgs, threadBlocks*128, 128, 128*elementSize);
-    vector<float> matrixf;
-    vector<double> matrix;
-    if (cu.getUseDoublePrecision())
-        diisMatrix->download(matrix);
-    else
-        diisMatrix->download(matrixf);
+    int blockSize = 512;
+    cu.executeKernel(buildMatrixKernel, buildMatrixArgs, threadBlocks*blockSize, blockSize, blockSize*elementSize);
+    
+    // Solve the matrix.
+
+    void* solveMatrixArgs[] = {&iteration, &diisMatrix->getDevicePointer(), &diisCoefficients->getDevicePointer()};
+    cu.executeKernel(solveMatrixKernel, solveMatrixArgs, 32, 32);
    
    // Determine whether the iteration has converged.
    
+    cuEventSynchronize(syncEvent);
    double total1 = 0.0, total2 = 0.0;
    for (int j = 0; j < inducedDipoleErrors->getSize(); j++) {
        total1 += errors[j].x;
@@ -1855,55 +1851,15 @@ bool CudaCalcAmoebaMultipoleForceKernel::iterateDipolesByDIIS(int iteration) {
    if (48.033324*sqrt(max(total1, total2)/cu.getNumAtoms()) < inducedEpsilon)
        return true;
    
-    // Compute the coefficients for selecting the new dipoles.
-
-    float* coefficients = (float*) cu.getPinnedBuffer();
-    if (iteration == 0)
-        coefficients[0] = 1;
-    else {
-        int rank = numPrev+1;
-        Array2D<double> b(rank, rank);
-        b[0][0] = 0;
-        for (int i = 1; i < rank; i++)
-            b[i][0] = b[0][i] = -1;
-        if (cu.getUseDoublePrecision()) {
-            for (int i = 0; i < numPrev; i++)
-                for (int j = 0; j < numPrev; j++)
-                    b[i+1][j+1] = matrix[i*MaxPrevDIISDipoles+j];
-        }
-        else {
-            for (int i = 0; i < numPrev; i++)
-                for (int j = 0; j < numPrev; j++)
-                    b[i+1][j+1] = matrixf[i*MaxPrevDIISDipoles+j];
-        }
-
-        // Solve using SVD.  Since the right hand side is (-1, 0, 0, 0, ...), this is simpler than the general case.
-
-        JAMA::SVD<double> svd(b);
-        Array2D<double> u, v;
-        svd.getU(u);
-        svd.getV(v);
-        Array1D<double> s;
-        svd.getSingularValues(s);
-        int effectiveRank = svd.rank();
-        for (int i = 1; i < rank; i++) {
-            double d = 0;
-            for (int j = 0; j < effectiveRank; j++)
-                d -= u[0][j]*v[i][j]/s[j];
-            coefficients[i-1] = d;
-        }
-    }
-    diisCoefficients->upload(coefficients, false);
-    
    // Compute the dipoles.
    
    void* updateInducedFieldArgs[] = {&inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(),
        &prevDipoles->getDevicePointer(), &prevDipolesPolar->getDevicePointer(), &diisCoefficients->getDevicePointer(), &numPrev};
-    cu.executeKernel(updateInducedFieldKernel, updateInducedFieldArgs, cu.getNumThreadBlocks()*cu.ThreadBlockSize);
+    cu.executeKernel(updateInducedFieldKernel, updateInducedFieldArgs, 3*cu.getNumAtoms(), 256);
    if (gkKernel != NULL) {
        void* updateInducedFieldGkArgs[] = {&gkKernel->getInducedDipoles()->getDevicePointer(), &gkKernel->getInducedDipolesPolar()->getDevicePointer(),
            &prevDipolesGk->getDevicePointer(), &prevDipolesGkPolar->getDevicePointer(), &diisCoefficients->getDevicePointer(), &numPrev};
-        cu.executeKernel(updateInducedFieldKernel, updateInducedFieldGkArgs, cu.getNumThreadBlocks()*cu.ThreadBlockSize);
+        cu.executeKernel(updateInducedFieldKernel, updateInducedFieldGkArgs, 3*cu.getNumAtoms(), 256);
    }
    return false;
 }

--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
@@ -408,7 +408,7 @@ private:
    int fixedFieldThreads, inducedFieldThreads, electrostaticsThreads;
    int gridSizeX, gridSizeY, gridSizeZ;
    double alpha, inducedEpsilon;
-    bool usePME, hasQuadrupoles, hasInitializedScaleFactors, hasInitializedFFT, multipolesAreValid;
+    bool usePME, hasQuadrupoles, hasInitializedScaleFactors, hasInitializedFFT, multipolesAreValid, hasCreatedEvent;
    AmoebaMultipoleForce::PolarizationType polarizationType;
    CudaContext& cu;
    const System& system;
@@ -465,16 +465,16 @@ private:
    CudaArray* pmePhidp;
    CudaArray* pmeCphi;
    CudaArray* pmeAtomRange;
-    CudaArray* pmeAtomGridIndex;
    CudaArray* lastPositions;
    CudaSort* sort;
    cufftHandle fft;
    CUfunction computeMomentsKernel, recordInducedDipolesKernel, computeFixedFieldKernel, computeInducedFieldKernel, updateInducedFieldKernel, electrostaticsKernel, mapTorqueKernel;
-    CUfunction pmeGridIndexKernel, pmeSpreadFixedMultipolesKernel, pmeSpreadInducedDipolesKernel, pmeFinishSpreadChargeKernel, pmeConvolutionKernel;
+    CUfunction pmeSpreadFixedMultipolesKernel, pmeSpreadInducedDipolesKernel, pmeFinishSpreadChargeKernel, pmeConvolutionKernel;
    CUfunction pmeFixedPotentialKernel, pmeInducedPotentialKernel, pmeFixedForceKernel, pmeInducedForceKernel, pmeRecordInducedFieldDipolesKernel, computePotentialKernel;
-    CUfunction recordDIISDipolesKernel, buildMatrixKernel;
+    CUfunction recordDIISDipolesKernel, buildMatrixKernel, solveMatrixKernel;
    CUfunction initExtrapolatedKernel, iterateExtrapolatedKernel, computeExtrapolatedKernel, addExtrapolatedGradientKernel;
    CUfunction pmeTransformMultipolesKernel, pmeTransformPotentialKernel;
+    CUevent syncEvent;
    CudaCalcAmoebaGeneralizedKirkwoodForceKernel* gkKernel;
    static const int PmeOrder = 5;
    static const int MaxPrevDIISDipoles = 20;

--- a/plugins/amoeba/platforms/cuda/src/kernels/multipoleElectrostatics.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/multipoleElectrostatics.cu
@@ -107,9 +107,9 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, bool has

    real dmp = atom1.damp*atom2.damp;
    real a = min(atom1.thole, atom2.thole);
-    real u = fabs(dmp) > 1.0e-5f ? r/dmp : 1e10f;
-    real au3 = a*u*u*u;
-    real expau3 = au3 < 50 ? EXP(-au3) : 0;
+    real u = r/dmp;
+    real au3 = fabs(dmp) > 1.0e-5f ? a*u*u*u : 0;
+    real expau3 = fabs(dmp) > 1.0e-5f ? EXP(-au3) : 0;
    real a2u6 = au3*au3;
    real a3u9 = a2u6*au3;
    // Thole damping factors for energies
@@ -336,7 +336,7 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, bool has
    iEIY -= eCoef*(qiUinpI.y*qiUindJ.x + qiUindI.y*qiUinpJ.x);
    iEJY -= eCoef*(qiUinpJ.y*qiUindI.x + qiUindJ.y*qiUinpI.x);
    fIZ += dCoef*(qiUinpI.x*qiUindJ.x + qiUindI.x*qiUinpJ.x);
-    fIZ += dCoef*(qiUinpJ.x*qiUindI.x + qiUindJ.x*qiUinpI.x);
+    fJZ += dCoef*(qiUinpJ.x*qiUindI.x + qiUindJ.x*qiUinpI.x);
    // Uind-Uind terms (m=1)
    eCoef = 2*rInvVec[3]*thole_d1;
    dCoef = -3*rInvVec[4]*dthole_d1;
@@ -345,7 +345,7 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, bool has
    iEIY += eCoef*(qiUinpI.x*qiUindJ.y + qiUindI.x*qiUinpJ.y);
    iEJY += eCoef*(qiUinpJ.x*qiUindI.y + qiUindJ.x*qiUinpI.y);
    fIZ += dCoef*(qiUinpI.y*qiUindJ.y + qiUindI.y*qiUinpJ.y + qiUinpI.z*qiUindJ.z + qiUindI.z*qiUinpJ.z);
-    fIZ += dCoef*(qiUinpJ.y*qiUindI.y + qiUindJ.y*qiUinpI.y + qiUinpJ.z*qiUindI.z + qiUindJ.z*qiUinpI.z);
+    fJZ += dCoef*(qiUinpJ.y*qiUindI.y + qiUindJ.y*qiUinpI.y + qiUinpJ.z*qiUindI.z + qiUindJ.z*qiUinpI.z);
 #endif

    // The quasi-internal frame forces and torques.  Note that the induced torque intermediates are
@@ -545,7 +545,7 @@ extern "C" __global__ void computeElectrostatics(
            data.force = make_real3(0);
            data.torque = make_real3(0);
 #ifdef USE_CUTOFF
-            unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
+            unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
 #else
            unsigned int j = y*TILE_SIZE + tgx;
 #endif

--- a/plugins/amoeba/platforms/cuda/src/kernels/multipoleFixedField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/multipoleFixedField.cu
@@ -655,7 +655,7 @@ extern "C" __global__ void computeFixedField(
            data.bornRadius = bornRadii[atom1];
 #endif
 #ifdef USE_CUTOFF
-            unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
+            unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
 #else
            unsigned int j = y*TILE_SIZE + tgx;
 #endif

--- a/plugins/amoeba/platforms/cuda/src/kernels/multipoleInducedField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/multipoleInducedField.cu
@@ -514,7 +514,7 @@ extern "C" __global__ void computeInducedField(
            loadAtomData(data, atom1, posq, inducedDipole, inducedDipolePolar, dampingAndThole);
 #endif
 #ifdef USE_CUTOFF
-            unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
+            unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
 #else
            unsigned int j = y*TILE_SIZE + tgx;
 #endif
@@ -607,11 +607,6 @@ extern "C" __global__ void recordInducedDipolesForDIIS(const long long* __restri
        const real* __restrict__ inducedDipole, const real* __restrict__ inducedDipolePolar, const float* __restrict__ polarizability, float2* __restrict__ errors,
        real* __restrict__ prevDipoles, real* __restrict__ prevDipolesPolar, real* __restrict__ prevErrors, int iteration, bool recordPrevErrors, real* __restrict__ matrix) {
    extern __shared__ real2 buffer[];
-#ifdef USE_EWALD
-    const real ewaldScale = (4/(real) 3)*(EWALD_ALPHA*EWALD_ALPHA*EWALD_ALPHA)/SQRT_PI;
-#else
-    const real ewaldScale = 0;
-#endif
    const real fieldScale = 1/(real) 0x100000000;
    real sumErrors = 0;
    real sumPolarErrors = 0;
@@ -699,6 +694,126 @@ extern "C" __global__ void computeDIISMatrix(real* __restrict__ prevErrors, int
    }
 }

+extern "C" __global__ void solveDIISMatrix(int iteration, const real* __restrict__ matrix, float* __restrict__ coefficients) {
+    __shared__ real b[MAX_PREV_DIIS_DIPOLES+1][MAX_PREV_DIIS_DIPOLES+1];
+    __shared__ real piv[MAX_PREV_DIIS_DIPOLES+1];
+    __shared__ real x[MAX_PREV_DIIS_DIPOLES+1];
+
+    // On the first iteration we don't need to do any calculation.
+    
+    if (iteration == 0) {
+        if (threadIdx.x == 0)
+            coefficients[0] = 1;
+        return;
+    }
+    
+    // Load the matrix.
+    
+    int numPrev = min(iteration+1, MAX_PREV_DIIS_DIPOLES);
+    int rank = numPrev+1;
+    for (int index = threadIdx.x; index < numPrev*numPrev; index += blockDim.x) {
+        int i = index/numPrev;
+        int j = index-i*numPrev;
+        b[i+1][j+1] = matrix[i*MAX_PREV_DIIS_DIPOLES+j];
+    }
+    for (int i = threadIdx.x; i < rank; i += blockDim.x) {
+        b[i][0] = -1;
+        piv[i] = i;
+    }
+    __syncthreads();
+    
+    // Compute the mean absolute value of the values we just loaded.  We use that for preconditioning it,
+    // which is essential for doing the computation in single precision.
+    
+    if (threadIdx.x == 0) {
+        real mean = 0;
+        for (int i = 0; i < numPrev; i++)
+            for (int j = 0; j < numPrev; j++)
+                mean += fabs(b[i+1][j+1]);
+        mean /= numPrev*numPrev;
+        b[0][0] = 0;
+        for (int i = 1; i < rank; i++)
+            b[0][i] = -mean;
+
+        // Compute the LU decomposition of the matrix.  This code is adapted from JAMA.
+    
+        int pivsign = 1;
+        for (int j = 0; j < rank; j++) {
+            // Apply previous transformations.
+
+            for (int i = 0; i < rank; i++) {
+                // Most of the time is spent in the following dot product.
+
+                int kmax = min(i, j);
+                real s = 0;
+                for (int k = 0; k < kmax; k++)
+                    s += b[i][k] * b[k][j];
+                b[i][j] -= s;
+            }
+
+            // Find pivot and exchange if necessary.
+
+            int p = j;
+            for (int i = j+1; i < rank; i++)
+                if (abs(b[i][j]) > abs(b[p][j]))
+                    p = i;
+            if (p != j) {
+                int k = 0;
+                for (k = 0; k < rank; k++) {
+                    real t = b[p][k];
+                    b[p][k] = b[j][k];
+                    b[j][k] = t;
+                }
+                k = piv[p];
+                piv[p] = piv[j];
+                piv[j] = k;
+                pivsign = -pivsign;
+            }
+
+            // Compute multipliers.
+
+            if ((j < rank) && (b[j][j] != 0))
+                for (int i = j+1; i < rank; i++)
+                    b[i][j] /= b[j][j];
+        }
+        for (int i = 0; i < rank; i++)
+            if (b[i][i] == 0) {
+                // The matrix is singular.
+                
+                for (int j = 0; j < rank-1; j++)
+                    coefficients[j] = 0;
+                coefficients[rank-1] = 1;
+                return;
+            }
+
+        // Solve b*Y = X(piv)
+        
+        for (int i = 0; i < rank; i++) 
+            x[i] = (piv[i] == 0 ? -1 : 0);
+        for (int k = 0; k < rank; k++)
+            for (int i = k+1; i < rank; i++)
+                x[i] -= x[k] * b[i][k];
+
+        // Solve U*X = Y;
+        
+        for (int k = rank-1; k >= 0; k--) {
+            x[k] /= b[k][k];
+            for (int i = 0; i < k; i++)
+                x[i] -= x[k] * b[i][k];
+        }
+        
+        // Record the coefficients.
+        
+        real lastCoeff = 1;
+        for (int i = 0; i < rank-1; i++) {
+            real c = x[i+1]*mean;
+            coefficients[i] = c;
+            lastCoeff -= c;
+        }
+        coefficients[rank-1] = lastCoeff;
+    }
+}
+
 extern "C" __global__ void updateInducedFieldByDIIS(real* __restrict__ inducedDipole, real* __restrict__ inducedDipolePolar, 
        const real* __restrict__ prevDipoles, const real* __restrict__ prevDipolesPolar, const float* __restrict__ coefficients, int numPrev) {
    for (int index = blockIdx.x*blockDim.x + threadIdx.x; index < 3*NUM_ATOMS; index += blockDim.x*gridDim.x) {

--- a/plugins/amoeba/platforms/cuda/src/kernels/multipolePme.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/multipolePme.cu
@@ -69,47 +69,6 @@ __device__ void computeBSplinePoint(real4* thetai, real w, real* array) {
        thetai[i-1] = make_real4(ARRAY(PME_ORDER,i), ARRAY(PME_ORDER-1,i), ARRAY(PME_ORDER-2,i), ARRAY(PME_ORDER-3,i));
 }

-/**
- * Compute the index of the grid point each atom is associated with.
- */
-extern "C" __global__ void findAtomGridIndex(const real4* __restrict__ posq, int2* __restrict__ pmeAtomGridIndex,
-        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
-    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
-        real4 pos = posq[i];
-        pos -= periodicBoxVecZ*floor(pos.z*recipBoxVecZ.z+0.5f);
-        pos -= periodicBoxVecY*floor(pos.y*recipBoxVecY.z+0.5f);
-        pos -= periodicBoxVecX*floor(pos.x*recipBoxVecX.z+0.5f);
-
-        // First axis.
-
-        real w = pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x;
-        real fr = GRID_SIZE_X*(w-(int)(w+0.5f)+0.5f);
-        int ifr = (int) fr;
-        int igrid1 = ifr-PME_ORDER+1;
-
-        // Second axis.
-
-        w = pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y;
-        fr = GRID_SIZE_Y*(w-(int)(w+0.5f)+0.5f);
-        ifr = (int) fr;
-        int igrid2 = ifr-PME_ORDER+1;
-
-        // Third axis.
-
-        w = pos.z*recipBoxVecZ.z;
-        fr = GRID_SIZE_Z*(w-(int)(w+0.5f)+0.5f);
-        ifr = (int) fr;
-        int igrid3 = ifr-PME_ORDER+1;
-
-        // Record the grid point.
-
-        igrid1 += (igrid1 < 0 ? GRID_SIZE_X : 0);
-        igrid2 += (igrid2 < 0 ? GRID_SIZE_Y : 0);
-        igrid3 += (igrid3 < 0 ? GRID_SIZE_Z : 0);
-        pmeAtomGridIndex[i] = make_int2(i, igrid1*GRID_SIZE_Y*GRID_SIZE_Z+igrid2*GRID_SIZE_Z+igrid3);
-    }
-}
-
 /**
 * Convert the fixed multipoles from Cartesian to fractional coordinates.
 */
@@ -209,7 +168,7 @@ extern "C" __global__ void transformPotentialToCartesianCoordinates(const real*
 }

 extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ posq, const real* __restrict__ fracDipole,
-        const real* __restrict__ fracQuadrupole, real2* __restrict__ pmeGrid, int2* __restrict__ pmeAtomGridIndex,
+        const real* __restrict__ fracQuadrupole, real2* __restrict__ pmeGrid,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
 #if __CUDA_ARCH__ < 500
    real array[PME_ORDER*PME_ORDER];
@@ -300,7 +259,7 @@ extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ p
 }

 extern "C" __global__ void gridSpreadInducedDipoles(const real4* __restrict__ posq, const real* __restrict__ inducedDipole,
-        const real* __restrict__ inducedDipolePolar, real2* __restrict__ pmeGrid, int2* __restrict__ pmeAtomGridIndex,
+        const real* __restrict__ inducedDipolePolar, real2* __restrict__ pmeGrid,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
 #if __CUDA_ARCH__ < 500
    real array[PME_ORDER*PME_ORDER];
@@ -451,7 +410,7 @@ extern "C" __global__ void reciprocalConvolution(real2* __restrict__ pmeGrid, co
 extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict__ pmeGrid, real* __restrict__ phi,
        long long* __restrict__ fieldBuffers, long long* __restrict__ fieldPolarBuffers,  const real4* __restrict__ posq,
        const real* __restrict__ labFrameDipole, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
-        real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ, int2* __restrict__ pmeAtomGridIndex) {
+        real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
 #if __CUDA_ARCH__ < 500
    real array[PME_ORDER*PME_ORDER];
 #else
@@ -476,11 +435,7 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
    }
    __syncthreads();
    
-    // Process the atoms in spatially sorted order.  This improves cache performance when loading
-    // the grid values.
-    
-    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
-        int m = pmeAtomGridIndex[i].x;
+    for (int m = blockIdx.x*blockDim.x+threadIdx.x; m < NUM_ATOMS; m += blockDim.x*gridDim.x) {
        real4 pos = posq[m];
        pos -= periodicBoxVecZ*floor(pos.z*recipBoxVecZ.z+0.5f);
        pos -= periodicBoxVecY*floor(pos.y*recipBoxVecY.z+0.5f);
@@ -533,9 +488,9 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
        real tuv102 = 0;
        real tuv012 = 0;
        real tuv111 = 0;
-        for (int iz = 0; iz < PME_ORDER; iz++) {
-            int k = igrid3+iz-(igrid3+iz >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
-            real4 v = theta3[iz];
+        for (int ix = 0; ix < PME_ORDER; ix++) {
+            int i = igrid1+ix-(igrid1+ix >= GRID_SIZE_X ? GRID_SIZE_X : 0);
+            real4 v = theta1[ix];
            real tu00 = 0;
            real tu10 = 0;
            real tu01 = 0;
@@ -550,47 +505,47 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
                int j = igrid2+iy-(igrid2+iy >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
                real4 u = theta2[iy];
                real4 t = make_real4(0, 0, 0, 0);
-                for (int ix = 0; ix < PME_ORDER; ix++) {
-                    int i = igrid1+ix-(igrid1+ix >= GRID_SIZE_X ? GRID_SIZE_X : 0);
+                for (int iz = 0; iz < PME_ORDER; iz++) {
+                    int k = igrid3+iz-(igrid3+iz >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
                    int gridIndex = i*GRID_SIZE_Y*GRID_SIZE_Z + j*GRID_SIZE_Z + k;
                    real tq = pmeGrid[gridIndex].x;
-                    real4 tadd = theta1[ix];
+                    real4 tadd = theta3[iz];
                    t.x += tq*tadd.x;
                    t.y += tq*tadd.y;
                    t.z += tq*tadd.z;
                    t.w += tq*tadd.w;
                }
-                tu00 += t.x*u.x;
-                tu10 += t.y*u.x;
-                tu01 += t.x*u.y;
-                tu20 += t.z*u.x;
-                tu11 += t.y*u.y;
-                tu02 += t.x*u.z;
-                tu30 += t.w*u.x;
-                tu21 += t.z*u.y;
-                tu12 += t.y*u.z;
-                tu03 += t.x*u.w;
+                tu00 += u.x*t.x;
+                tu10 += u.y*t.x;
+                tu01 += u.x*t.y;
+                tu20 += u.z*t.x;
+                tu11 += u.y*t.y;
+                tu02 += u.x*t.z;
+                tu30 += u.w*t.x;
+                tu21 += u.z*t.y;
+                tu12 += u.y*t.z;
+                tu03 += u.x*t.w;
            }
-            tuv000 += tu00*v.x;
-            tuv100 += tu10*v.x;
-            tuv010 += tu01*v.x;
-            tuv001 += tu00*v.y;
-            tuv200 += tu20*v.x;
-            tuv020 += tu02*v.x;
-            tuv002 += tu00*v.z;
-            tuv110 += tu11*v.x;
-            tuv101 += tu10*v.y;
-            tuv011 += tu01*v.y;
-            tuv300 += tu30*v.x;
-            tuv030 += tu03*v.x;
-            tuv003 += tu00*v.w;
-            tuv210 += tu21*v.x;
-            tuv201 += tu20*v.y;
-            tuv120 += tu12*v.x;
-            tuv021 += tu02*v.y;
-            tuv102 += tu10*v.z;
-            tuv012 += tu01*v.z;
-            tuv111 += tu11*v.y;
+            tuv000 += v.x*tu00;
+            tuv100 += v.y*tu00;
+            tuv010 += v.x*tu10;
+            tuv001 += v.x*tu01;
+            tuv200 += v.z*tu00;
+            tuv020 += v.x*tu20;
+            tuv002 += v.x*tu02;
+            tuv110 += v.y*tu10;
+            tuv101 += v.y*tu01;
+            tuv011 += v.x*tu11;
+            tuv300 += v.w*tu00;
+            tuv030 += v.x*tu30;
+            tuv003 += v.x*tu03;
+            tuv210 += v.z*tu10;
+            tuv201 += v.z*tu01;
+            tuv120 += v.y*tu20;
+            tuv021 += v.x*tu21;
+            tuv102 += v.y*tu02;
+            tuv012 += v.x*tu12;
+            tuv111 += v.y*tu11;
        }
        phi[m] = tuv000;
        phi[m+NUM_ATOMS] = tuv100;
@@ -628,7 +583,7 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
 extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restrict__ pmeGrid, real* __restrict__ phid,
        real* __restrict__ phip, real* __restrict__ phidp, const real4* __restrict__ posq,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, real3 recipBoxVecX,
-        real3 recipBoxVecY, real3 recipBoxVecZ, int2* __restrict__ pmeAtomGridIndex) {
+        real3 recipBoxVecY, real3 recipBoxVecZ) {
 #if __CUDA_ARCH__ < 500
    real array[PME_ORDER*PME_ORDER];
 #else
@@ -640,11 +595,7 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
    real4 theta2[PME_ORDER];
    real4 theta3[PME_ORDER];
    
-    // Process the atoms in spatially sorted order.  This improves cache performance when loading
-    // the grid values.
-    
-    for (int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < NUM_ATOMS; atom += blockDim.x*gridDim.x) {
-        int m = pmeAtomGridIndex[atom].x;
+    for (int m = blockIdx.x*blockDim.x+threadIdx.x; m < NUM_ATOMS; m += blockDim.x*gridDim.x) {
        real4 pos = posq[m];
        pos -= periodicBoxVecZ*floor(pos.z*recipBoxVecZ.z+0.5f);
        pos -= periodicBoxVecY*floor(pos.y*recipBoxVecY.z+0.5f);
@@ -715,9 +666,9 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
        real tuv102 = 0;
        real tuv012 = 0;
        real tuv111 = 0;
-        for (int iz = 0; iz < PME_ORDER; iz++) {
-            int k = igrid3+iz-(igrid3+iz >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
-            real4 v = theta3[iz];
+        for (int ix = 0; ix < PME_ORDER; ix++) {
+            int i = igrid1+ix-(igrid1+ix >= GRID_SIZE_X ? GRID_SIZE_X : 0);
+            real4 v = theta1[ix];
            real tu00_1 = 0;
            real tu01_1 = 0;
            real tu10_1 = 0;
@@ -750,11 +701,11 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
                real t1_2 = 0;
                real t2_2 = 0;
                real t3 = 0;
-                for (int ix = 0; ix < PME_ORDER; ix++) {
-                    int i = igrid1+ix-(igrid1+ix >= GRID_SIZE_X ? GRID_SIZE_X : 0);
+                for (int iz = 0; iz < PME_ORDER; iz++) {
+                    int k = igrid3+iz-(igrid3+iz >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
                    int gridIndex = i*GRID_SIZE_Y*GRID_SIZE_Z + j*GRID_SIZE_Z + k;
                    real2 tq = pmeGrid[gridIndex];
-                    real4 tadd = theta1[ix];
+                    real4 tadd = theta3[iz];
                    t0_1 += tq.x*tadd.x;
                    t1_1 += tq.x*tadd.y;
                    t2_1 += tq.x*tadd.z;
@@ -763,70 +714,70 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
                    t2_2 += tq.y*tadd.z;
                    t3 += (tq.x+tq.y)*tadd.w;
                }
-                tu00_1 += t0_1*u.x;
-                tu10_1 += t1_1*u.x;
-                tu01_1 += t0_1*u.y;
-                tu20_1 += t2_1*u.x;
-                tu11_1 += t1_1*u.y;
-                tu02_1 += t0_1*u.z;
-                tu00_2 += t0_2*u.x;
-                tu10_2 += t1_2*u.x;
-                tu01_2 += t0_2*u.y;
-                tu20_2 += t2_2*u.x;
-                tu11_2 += t1_2*u.y;
-                tu02_2 += t0_2*u.z;
+                tu00_1 += u.x*t0_1;
+                tu10_1 += u.y*t0_1;
+                tu01_1 += u.x*t1_1;
+                tu20_1 += u.z*t0_1;
+                tu11_1 += u.y*t1_1;
+                tu02_1 += u.x*t2_1;
+                tu00_2 += u.x*t0_2;
+                tu10_2 += u.y*t0_2;
+                tu01_2 += u.x*t1_2;
+                tu20_2 += u.z*t0_2;
+                tu11_2 += u.y*t1_2;
+                tu02_2 += u.x*t2_2;
                real t0 = t0_1 + t0_2;
                real t1 = t1_1 + t1_2;
                real t2 = t2_1 + t2_2;
-                tu00 += t0*u.x;
-                tu10 += t1*u.x;
-                tu01 += t0*u.y;
-                tu20 += t2*u.x;
-                tu11 += t1*u.y;
-                tu02 += t0*u.z;
-                tu30 += t3*u.x;
-                tu21 += t2*u.y;
-                tu12 += t1*u.z;
-                tu03 += t0*u.w;
+                tu00 += u.x*t0;
+                tu10 += u.y*t0;
+                tu01 += u.x*t1;
+                tu20 += u.z*t0;
+                tu11 += u.y*t1;
+                tu02 += u.x*t2;
+                tu30 += u.w*t0;
+                tu21 += u.z*t1;
+                tu12 += u.y*t2;
+                tu03 += u.x*t3;
            }
-            tuv100_1 += tu10_1*v.x;
-            tuv010_1 += tu01_1*v.x;
-            tuv001_1 += tu00_1*v.y;
-            tuv200_1 += tu20_1*v.x;
-            tuv020_1 += tu02_1*v.x;
-            tuv002_1 += tu00_1*v.z;
-            tuv110_1 += tu11_1*v.x;
-            tuv101_1 += tu10_1*v.y;
-            tuv011_1 += tu01_1*v.y;
-            tuv100_2 += tu10_2*v.x;
-            tuv010_2 += tu01_2*v.x;
-            tuv001_2 += tu00_2*v.y;
-            tuv200_2 += tu20_2*v.x;
-            tuv020_2 += tu02_2*v.x;
-            tuv002_2 += tu00_2*v.z;
-            tuv110_2 += tu11_2*v.x;
-            tuv101_2 += tu10_2*v.y;
-            tuv011_2 += tu01_2*v.y;
-            tuv000 += tu00*v.x;
-            tuv100 += tu10*v.x;
-            tuv010 += tu01*v.x;
-            tuv001 += tu00*v.y;
-            tuv200 += tu20*v.x;
-            tuv020 += tu02*v.x;
-            tuv002 += tu00*v.z;
-            tuv110 += tu11*v.x;
-            tuv101 += tu10*v.y;
-            tuv011 += tu01*v.y;
-            tuv300 += tu30*v.x;
-            tuv030 += tu03*v.x;
-            tuv003 += tu00*v.w;
-            tuv210 += tu21*v.x;
-            tuv201 += tu20*v.y;
-            tuv120 += tu12*v.x;
-            tuv021 += tu02*v.y;
-            tuv102 += tu10*v.z;
-            tuv012 += tu01*v.z;
-            tuv111 += tu11*v.y;
+            tuv100_1 += v.y*tu00_1;
+            tuv010_1 += v.x*tu10_1;
+            tuv001_1 += v.x*tu01_1;
+            tuv200_1 += v.z*tu00_1;
+            tuv020_1 += v.x*tu20_1;
+            tuv002_1 += v.x*tu02_1;
+            tuv110_1 += v.y*tu10_1;
+            tuv101_1 += v.y*tu01_1;
+            tuv011_1 += v.x*tu11_1;
+            tuv100_2 += v.y*tu00_2;
+            tuv010_2 += v.x*tu10_2;
+            tuv001_2 += v.x*tu01_2;
+            tuv200_2 += v.z*tu00_2;
+            tuv020_2 += v.x*tu20_2;
+            tuv002_2 += v.x*tu02_2;
+            tuv110_2 += v.y*tu10_2;
+            tuv101_2 += v.y*tu01_2;
+            tuv011_2 += v.x*tu11_2;
+            tuv000 += v.x*tu00;
+            tuv100 += v.y*tu00;
+            tuv010 += v.x*tu10;
+            tuv001 += v.x*tu01;
+            tuv200 += v.z*tu00;
+            tuv020 += v.x*tu20;
+            tuv002 += v.x*tu02;
+            tuv110 += v.y*tu10;
+            tuv101 += v.y*tu01;
+            tuv011 += v.x*tu11;
+            tuv300 += v.w*tu00;
+            tuv030 += v.x*tu30;
+            tuv003 += v.x*tu03;
+            tuv210 += v.z*tu10;
+            tuv201 += v.z*tu01;
+            tuv120 += v.y*tu20;
+            tuv021 += v.x*tu21;
+            tuv102 += v.y*tu02;
+            tuv012 += v.x*tu12;
+            tuv111 += v.y*tu11;
        }
        phid[m]   = 0;
        phid[m+NUM_ATOMS] = tuv100_1;

--- a/plugins/amoeba/platforms/cuda/src/kernels/multipoles.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/multipoles.cu
@@ -24,13 +24,19 @@ extern "C" __global__ void computeLabFrameMoments(real4* __restrict__ posq, int4
        // code common to ZThenX and Bisector
        
        int4 particles = multipoleParticles[atom];
-        if (particles.x >= 0 && particles.z >= 0) {
+        if (particles.z >= 0) {
            real4 thisParticlePos = posq[atom];
            real4 posZ = posq[particles.z];
            real3 vectorZ = make_real3(posZ.x-thisParticlePos.x, posZ.y-thisParticlePos.y, posZ.z-thisParticlePos.z);
-            real4 posX = posq[particles.x];
-            real3 vectorX = make_real3(posX.x-thisParticlePos.x, posX.y-thisParticlePos.y, posX.z-thisParticlePos.z);
            int axisType = particles.w; 
+            real4 posX;
+            real3 vectorX;
+            if (axisType >= 4)
+                vectorX = make_real3((real) 0.1f);
+            else {
+                posX = posq[particles.x];
+                vectorX = make_real3(posX.x-thisParticlePos.x, posX.y-thisParticlePos.y, posX.z-thisParticlePos.z);
+            }
    
            /*
                z-only
@@ -108,8 +114,6 @@ extern "C" __global__ void computeLabFrameMoments(real4* __restrict__ posq, int4
                }
         
            }
-            else if (axisType >= 4)
-                vectorX = make_real3((real) 0.1f);
            
            // x = x - (x.z)z
        
@@ -137,7 +141,7 @@ extern "C" __global__ void computeLabFrameMoments(real4* __restrict__ posq, int4
            // Check the chirality and see whether it needs to be reversed
            
            bool reverse = false;
-            if (axisType != 0 && particles.x >= 0 && particles.y >=0 && particles.z >= 0) {
+            if (axisType == 0 && particles.x >= 0 && particles.y >=0 && particles.z >= 0) {
                real4 posY = posq[particles.y];
                real delta[4][3];


--- a/plugins/amoeba/platforms/cuda/src/kernels/pmeMultipoleElectrostatics.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/pmeMultipoleElectrostatics.cu
@@ -612,7 +612,7 @@ extern "C" __global__ void computeElectrostatics(
            data.force = make_real3(0);
            data.torque = make_real3(0);
 #ifdef USE_CUTOFF
-            unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
+            unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
 #else
            unsigned int j = y*TILE_SIZE + tgx;
 #endif

--- a/plugins/amoeba/platforms/cuda/tests/TestCudaAmoebaExtrapolatedPolarization.cpp
+++ b/plugins/amoeba/platforms/cuda/tests/TestCudaAmoebaExtrapolatedPolarization.cpp
@@ -284,8 +284,11 @@ vector<Vec3> setupWaterDimer(System& system,  AmoebaMultipoleForce* amoebaMultip

 static void check_finite_differences(vector<Vec3> analytic_forces, Context &context, vector<Vec3> positions)
 {
-    // Take a small step in the direction of the energy gradient and see whether the potential energy changes by the expected amount.
+    double tol = 1e-5;
+    // We allow more permissive testing for single precision.
+    if(Platform::getPlatformByName("CUDA").getPropertyValue(context, "Precision") != "double") tol = 5e-4;

+    // Take a small step in the direction of the energy gradient and see whether the potential energy changes by the expected amount.
    double norm = 0.0;
    for (int i = 0; i < (int) analytic_forces.size(); ++i)
        norm += analytic_forces[i].dot(analytic_forces[i]);
@@ -303,7 +306,7 @@ static void check_finite_differences(vector<Vec3> analytic_forces, Context &cont
    State state2 = context.getState(State::Energy);
    context.setPositions(positions3);
    State state3 = context.getState(State::Energy);
-    ASSERT_EQUAL_TOL(norm, (state2.getPotentialEnergy()-state3.getPotentialEnergy())/stepSize, 1e-4);
+    ASSERT_EQUAL_TOL(norm, (state2.getPotentialEnergy()-state3.getPotentialEnergy())/stepSize, tol);
 }


@@ -493,6 +496,8 @@ int main(int numberOfArguments, char* argv[]) {

    try {
        registerAmoebaCudaKernelFactories();
+        if (numberOfArguments > 1)
+            Platform::getPlatformByName("CUDA").setPropertyDefaultValue("Precision", std::string(argv[1]));

        /*
         * Water dimer energy / force tests under various conditions.