Continuing to convert AmoebaMultipoleForce: mutual induced dipoles now work

699fce6e · Peter Eastman · 29654b80 · 699fce6e · 699fce6e · 699fce6e
Commit 699fce6e authored Aug 06, 2012 by Peter Eastman
4 changed files
--- a/plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernels.cpp
@@ -865,9 +865,9 @@ private:
 CudaCalcAmoebaMultipoleForceKernel::CudaCalcAmoebaMultipoleForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : 
        CalcAmoebaMultipoleForceKernel(name, platform), cu(cu), system(system), hasInitializedScaleFactors(false),
-        multipoleParticles(NULL), molecularDipoles(NULL), molecularQuadrupoles(NULL),
+        multipoleParticles(NULL), molecularDipoles(NULL), molecularQuadrupoles(NULL), labFrameDipoles(NULL), labFrameQuadrupoles(NULL),
-        labFrameDipoles(NULL), labFrameQuadrupoles(NULL), field(NULL), fieldPolar(NULL), torque(NULL), dampingAndThole(NULL),
+        field(NULL), fieldPolar(NULL), inducedField(NULL), inducedFieldPolar(NULL), torque(NULL), dampingAndThole(NULL),
-        inducedDipole(NULL), inducedDipolePolar(NULL), currentEpsilon(NULL), polarizability(NULL), covalentFlags(NULL), polarizationGroupFlags(NULL),
+        inducedDipole(NULL), inducedDipolePolar(NULL), inducedDipoleErrors(NULL), polarizability(NULL), covalentFlags(NULL), polarizationGroupFlags(NULL),
        pmeGrid(NULL) {
 }
@@ -887,6 +887,10 @@ CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() {
        delete field;
    if (fieldPolar != NULL)
        delete fieldPolar;
+    if (inducedField != NULL)
+        delete inducedField;
+    if (inducedFieldPolar != NULL)
+        delete inducedFieldPolar;
    if (torque != NULL)
        delete torque;
    if (dampingAndThole != NULL)
@@ -895,8 +899,8 @@ CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() {
        delete inducedDipole;
    if (inducedDipolePolar != NULL)
        delete inducedDipolePolar;
-    if (currentEpsilon != NULL)
+    if (inducedDipoleErrors != NULL)
-        delete currentEpsilon;
+        delete inducedDipoleErrors;
    if (polarizability != NULL)
        delete polarizability;
    if (covalentFlags != NULL)
@@ -971,6 +975,7 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
    torque = new CudaArray(cu, 3*paddedNumAtoms, sizeof(long long), "torque");
    inducedDipole = new CudaArray(cu, 3*paddedNumAtoms, elementSize, "inducedDipole");
    inducedDipolePolar = new CudaArray(cu, 3*paddedNumAtoms, elementSize, "inducedDipolePolar");
+    inducedDipoleErrors = new CudaArray(cu, cu.getNumThreadBlocks(), sizeof(float2), "inducedDipoleErrors");
    cu.addAutoclearBuffer(*field);
    cu.addAutoclearBuffer(*fieldPolar);
    cu.addAutoclearBuffer(*torque);
@@ -1009,9 +1014,11 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
    if (force.getPolarizationType() == AmoebaMultipoleForce::Mutual) {
        maxInducedIterations = force.getMutualInducedMaxIterations();
        inducedEpsilon = force.getMutualInducedTargetEpsilon();
+        inducedField = new CudaArray(cu, 3*paddedNumAtoms, sizeof(long long), "inducedField");
+        inducedFieldPolar = new CudaArray(cu, 3*paddedNumAtoms, sizeof(long long), "inducedFieldPolar");
    }
    else
-        maxInducedIterations == 0;
+        maxInducedIterations = 0;
    // Create the kernels.
@@ -1032,6 +1039,11 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
    mapTorqueKernel = cu.getKernel(module, "mapTorqueToForce");
    module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipoleFixedField, defines);
    computeFixedFieldKernel = cu.getKernel(module, "computeFixedField");
+    if (maxInducedIterations > 0) {
+        module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipoleInducedField, defines);
+        computeInducedFieldKernel = cu.getKernel(module, "computeInducedField");
+        updateInducedFieldKernel = cu.getKernel(module, "updateInducedFieldBySOR");
+    }
    stringstream electrostaticsSource;
    electrostaticsSource << CudaKernelSources::vectorOps;
    electrostaticsSource << CudaAmoebaKernelSources::multipoleElectrostatics;
@@ -1436,8 +1448,26 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
        void* recordInducedDipolesArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(),
            &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &polarizability->getDevicePointer()};
        cu.executeKernel(recordInducedDipolesKernel, recordInducedDipolesArgs, cu.getNumAtoms());
+        vector<float2> errors;
        for (int i = 0; i < maxInducedIterations; i++) {
+            cu.clearBuffer(*inducedField);
+            cu.clearBuffer(*inducedFieldPolar);
+            void* computeInducedFieldArgs[] = {&inducedField->getDevicePointer(), &inducedFieldPolar->getDevicePointer(), &cu.getPosq().getDevicePointer(),
+                &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &startTileIndex, &numTileIndices,
+                &dampingAndThole->getDevicePointer()};
+            cu.executeKernel(computeInducedFieldKernel, computeInducedFieldArgs, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
+            void* updateInducedFieldArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(), &inducedField->getDevicePointer(),
+                &inducedFieldPolar->getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(),
+                &polarizability->getDevicePointer(), &inducedDipoleErrors->getDevicePointer()};
+            cu.executeKernel(updateInducedFieldKernel, updateInducedFieldArgs, cu.getNumThreadBlocks()*cu.ThreadBlockSize);
+            inducedDipoleErrors->download(errors);
+            double total1 = 0.0, total2 = 0.0;
+            for (int j = 0; j < (int) errors.size(); j++) {
+                total1 += errors[j].x;
+                total2 += errors[j].y;
+            }
+            if (48.033324*sqrt(max(total1, total2)/cu.getNumAtoms()) < inducedEpsilon)
+                break;
        }
        // Compute electrostatic force.

--- a/plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernels.h
+++ b/plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernels.h
@@ -388,11 +388,13 @@ private:
    CudaArray* labFrameQuadrupoles;
    CudaArray* field;
    CudaArray* fieldPolar;
+    CudaArray* inducedField;
+    CudaArray* inducedFieldPolar;
    CudaArray* torque;
    CudaArray* dampingAndThole;
    CudaArray* inducedDipole;
    CudaArray* inducedDipolePolar;
-    CudaArray* currentEpsilon;
+    CudaArray* inducedDipoleErrors;
    CudaArray* polarizability;
    CudaArray* covalentFlags;
    CudaArray* polarizationGroupFlags;
@@ -413,7 +415,7 @@ private:
    CudaArray* pmeAtomGridIndex;
    CudaSort* sort;
    cufftHandle fft;
-    CUfunction computeMomentsKernel, recordInducedDipolesKernel, computeFixedFieldKernel, electrostaticsKernel, mapTorqueKernel;
+    CUfunction computeMomentsKernel, recordInducedDipolesKernel, computeFixedFieldKernel, computeInducedFieldKernel, updateInducedFieldKernel, electrostaticsKernel, mapTorqueKernel;
 };
 /**

--- a/plugins/amoeba/platforms/cuda2/src/kernels/multipoleInducedField.cu
+++ b/plugins/amoeba/platforms/cuda2/src/kernels/multipoleInducedField.cu
+#define TILE_SIZE 32
+#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
+typedef struct {
+    real3 pos;
+    real3 field, fieldPolar, inducedDipole, inducedDipolePolar;
+    float thole, damp;
+} AtomData;
+inline __device__ void loadAtomData(AtomData& data, int atom, const real4* __restrict__ posq, const real* __restrict__ inducedDipole, const real* __restrict__ inducedDipolePolar, const float2* __restrict__ dampingAndThole) {
+    real4 atomPosq = posq[atom];
+    data.pos = make_real3(atomPosq.x, atomPosq.y, atomPosq.z);
+    data.inducedDipole.x = inducedDipole[atom*3];
+    data.inducedDipole.y = inducedDipole[atom*3+1];
+    data.inducedDipole.z = inducedDipole[atom*3+2];
+    data.inducedDipolePolar.x = inducedDipolePolar[atom*3];
+    data.inducedDipolePolar.y = inducedDipolePolar[atom*3+1];
+    data.inducedDipolePolar.z = inducedDipolePolar[atom*3+2];
+    float2 temp = dampingAndThole[atom];
+    data.damp = temp.x;
+    data.thole = temp.y;
+}
+__device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real3 deltaR, real3* fields) {
+    real rI = RSQRT(dot(deltaR, deltaR));
+    real r = RECIP(rI);
+    real r2I = rI*rI;
+    real rr3 = -rI*r2I;
+    real rr5 = -3*rr3*r2I;
+    real dampProd = atom1.damp*atom2.damp;
+    real ratio = (dampProd != 0 ? r/dampProd : 1);
+    float pGamma = (atom2.thole > atom1.thole ? atom1.thole: atom2.thole);
+    real damp = ratio*ratio*ratio*pGamma;
+    real dampExp = (dampProd != 0 ? EXP(-damp) : 0); 
+    rr3 *= 1 - dampExp;
+    rr5 *= 1 - (1+damp)*dampExp;
+    real dDotDelta = rr5*dot(deltaR, atom2.inducedDipole);
+    fields[0] = rr3*atom2.inducedDipole + dDotDelta*deltaR;
+    dDotDelta = rr5*dot(deltaR, atom2.inducedDipolePolar);
+    fields[1] = rr3*atom2.inducedDipolePolar + dDotDelta*deltaR;
+    dDotDelta = rr5*dot(deltaR, atom1.inducedDipole);
+    fields[2] = rr3*atom1.inducedDipole + dDotDelta*deltaR;
+    dDotDelta = rr5*dot(deltaR, atom1.inducedDipolePolar);
+    fields[3] = rr3*atom1.inducedDipolePolar + dDotDelta*deltaR;
+}
+/**
+ * Compute the mutual induced field.
+ */
+extern "C" __global__ void computeInducedField(
+        unsigned long long* __restrict__ fieldBuffers, unsigned long long* __restrict__ fieldPolarBuffers, const real4* __restrict__ posq,
+        const real* __restrict__ inducedDipole, const real* __restrict__ inducedDipolePolar, unsigned int startTileIndex, unsigned int numTileIndices,
+#ifdef USE_CUTOFF
+        const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags,
+#endif
+        const float2* __restrict__ dampingAndThole) {
+    unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
+    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
+#ifdef USE_CUTOFF
+    const unsigned int numTiles = interactionCount[0];
+    unsigned int pos = (numTiles > maxTiles ? startTileIndex+warp*numTileIndices/totalWarps : warp*numTiles/totalWarps);
+    unsigned int end = (numTiles > maxTiles ? startTileIndex+(warp+1)*numTileIndices/totalWarps : (warp+1)*numTiles/totalWarps);
+#else
+    const unsigned int numTiles = numTileIndices;
+    unsigned int pos = startTileIndex+warp*numTiles/totalWarps;
+    unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps;
+#endif
+    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
+#ifndef ENABLE_SHUFFLE
+    __shared__ real tempBuffer[3*THREAD_BLOCK_SIZE];
+#endif
+    do {
+        // Extract the coordinates of this tile
+        const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
+        const unsigned int tbx = threadIdx.x - tgx;
+        unsigned int x, y;
+        AtomData data;
+        data.field = make_real3(0);
+        data.fieldPolar = make_real3(0);
+        if (pos < end) {
+#ifdef USE_CUTOFF
+            if (numTiles <= maxTiles) {
+                ushort2 tileIndices = tiles[pos];
+                x = tileIndices.x;
+                y = tileIndices.y;
+            }
+            else
+#endif
+            {
+                y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+                if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+                    y += (x < y ? -1 : 1);
+                    x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+                }
+            }
+            unsigned int atom1 = x*TILE_SIZE + tgx;
+            loadAtomData(data, atom1, posq, inducedDipole, inducedDipolePolar, dampingAndThole);
+            if (pos >= end)
+                ; // This warp is done.
+            else if (x == y) {
+                // This tile is on the diagonal.
+                localData[threadIdx.x].pos = data.pos;
+                localData[threadIdx.x].inducedDipole = data.inducedDipole;
+                localData[threadIdx.x].inducedDipolePolar = data.inducedDipolePolar;
+                localData[threadIdx.x].thole = data.thole;
+                localData[threadIdx.x].damp = data.damp;
+                for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                    int atom2 = tbx+j;
+                    real3 delta = localData[atom2].pos-data.pos;
+#ifdef USE_PERIODIC
+                    delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                    delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                    delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+                    real3 fields[4];
+                    computeOneInteraction(data, localData[atom2], delta, fields);
+                    atom2 = y*TILE_SIZE+j;
+                    if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                        data.field += fields[0];
+                        data.fieldPolar += fields[1];
+                    }
+                }
+            }
+            else {
+                // This is an off-diagonal tile.
+                loadAtomData(localData[threadIdx.x], y*TILE_SIZE+tgx, posq, inducedDipole, inducedDipolePolar, dampingAndThole);
+                localData[threadIdx.x].field = make_real3(0);
+                localData[threadIdx.x].fieldPolar = make_real3(0);
+#ifdef USE_CUTOFF
+                unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
+                if (flags != 0xFFFFFFFF) {
+                    if (flags == 0) {
+                        // No interactions in this tile.
+                    }
+                    else {
+                        // Compute only a subset of the interactions in this tile.
+                        for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                            if ((flags&(1<<j)) != 0) {
+                                int atom2 = tbx+j;
+                                int bufferIndex = 3*threadIdx.x;
+                                real3 dEdR1 = make_real3(0);
+                                real3 dEdR2 = make_real3(0);
+                                real3 delta = localData[atom2].pos-data.pos;
+#ifdef USE_PERIODIC
+                                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+                                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+                                if (r2 < CUTOFF_SQUARED) {
+#endif
+                                    real invR = RSQRT(r2);
+                                    real r = RECIP(invR);
+                                    LOAD_ATOM2_PARAMETERS
+                                    atom2 = y*TILE_SIZE+j;
+                                    COMPUTE_INTERACTION
+#ifdef USE_CUTOFF
+                                }
+#endif
+#ifdef ENABLE_SHUFFLE
+                                force.x -= dEdR1.x;
+                                force.y -= dEdR1.y;
+                                force.z -= dEdR1.z;
+                                for (int i = 16; i >= 1; i /= 2) {
+                                    dEdR2.x += __shfl_xor(dEdR2.x, i, 32);
+                                    dEdR2.y += __shfl_xor(dEdR2.y, i, 32);
+                                    dEdR2.z += __shfl_xor(dEdR2.z, i, 32);
+                                }
+                                if (tgx == 0) {
+                                    localData[tbx+j].fx += dEdR2.x;
+                                    localData[tbx+j].fy += dEdR2.y;
+                                    localData[tbx+j].fz += dEdR2.z;
+                                }
+#else
+                                force.x -= dEdR1.x;
+                                force.y -= dEdR1.y;
+                                force.z -= dEdR1.z;
+                                tempBuffer[bufferIndex] = dEdR2.x;
+                                tempBuffer[bufferIndex+1] = dEdR2.y;
+                                tempBuffer[bufferIndex+2] = dEdR2.z;
+                                // Sum the forces on atom2.
+                                if (tgx % 4 == 0) {
+                                    tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9];
+                                    tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10];
+                                    tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11];
+                                }
+                                if (tgx == 0) {
+                                    localData[tbx+j].fx += tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
+                                    localData[tbx+j].fy += tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
+                                    localData[tbx+j].fz += tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
+                                }
+#endif
+                            }
+                        }
+                    }
+                }
+                else
+#endif
+                {
+                    // Compute the full set of interactions in this tile.
+                    unsigned int tj = tgx;
+                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                        int atom2 = tbx+tj;
+                        real3 delta = localData[atom2].pos-data.pos;
+#ifdef USE_PERIODIC
+                        delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                        delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                        delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+                        real3 fields[4];
+                        computeOneInteraction(data, localData[atom2], delta, fields);
+                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                            data.field += fields[0];
+                            data.fieldPolar += fields[1];
+                            localData[atom2].field += fields[2];
+                            localData[atom2].fieldPolar += fields[3];
+                        }
+                        tj = (tj + 1) & (TILE_SIZE - 1);
+                    }
+                }
+            }
+        }
+        // Write results.
+        if (pos < end) {
+            const unsigned int offset = x*TILE_SIZE + tgx;
+            atomicAdd(&fieldBuffers[offset], static_cast<unsigned long long>((long long) (data.field.x*0xFFFFFFFF)));
+            atomicAdd(&fieldBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.y*0xFFFFFFFF)));
+            atomicAdd(&fieldBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.z*0xFFFFFFFF)));
+            atomicAdd(&fieldPolarBuffers[offset], static_cast<unsigned long long>((long long) (data.fieldPolar.x*0xFFFFFFFF)));
+            atomicAdd(&fieldPolarBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolar.y*0xFFFFFFFF)));
+            atomicAdd(&fieldPolarBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolar.z*0xFFFFFFFF)));
+        }
+        if (pos < end && x != y) {
+            const unsigned int offset = y*TILE_SIZE + tgx;
+            atomicAdd(&fieldBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.x*0xFFFFFFFF)));
+            atomicAdd(&fieldBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.y*0xFFFFFFFF)));
+            atomicAdd(&fieldBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.z*0xFFFFFFFF)));
+            atomicAdd(&fieldPolarBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.x*0xFFFFFFFF)));
+            atomicAdd(&fieldPolarBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.y*0xFFFFFFFF)));
+            atomicAdd(&fieldPolarBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.z*0xFFFFFFFF)));
+        }
+        pos++;
+    } while (pos < end);
+}
+extern "C" __global__ void updateInducedFieldBySOR(const long long* __restrict__ fixedField, const long long* __restrict__ fixedFieldPolar,
+        const long long* __restrict__ inducedField, const long long* __restrict__ inducedFieldPolar, real* __restrict__ inducedDipole,
+        real* __restrict__ inducedDipolePolar, const float* __restrict__ polarizability, float2* __restrict__ errors) {
+    extern __shared__ real2 buffer[];
+    float polarSOR = 0.55f;
+    real sumErrors = 0;
+    real sumPolarErrors = 0;
+    for (int atom = blockIdx.x*blockDim.x + threadIdx.x; atom < NUM_ATOMS; atom += blockDim.x*gridDim.x) {
+        real scale = polarizability[atom]/(real) 0xFFFFFFFF;
+        for (int component = 0; component < 3; component++) {
+            int dipoleIndex = 3*atom+component;
+            int fieldIndex = atom+component*PADDED_NUM_ATOMS;
+            real previousDipole = inducedDipole[dipoleIndex];
+            real previousDipolePolar = inducedDipolePolar[dipoleIndex];
+            real newDipole = scale*(fixedField[fieldIndex]+inducedField[fieldIndex]);
+            real newDipolePolar = scale*(fixedFieldPolar[fieldIndex]+inducedFieldPolar[fieldIndex]);
+            newDipole = previousDipole + polarSOR*(newDipole-previousDipole);
+            newDipolePolar = previousDipolePolar + polarSOR*(newDipolePolar-previousDipolePolar);
+            inducedDipole[dipoleIndex] = newDipole;
+            inducedDipolePolar[dipoleIndex] = newDipolePolar;
+            sumErrors += (newDipole-previousDipole)*(newDipole-previousDipole);
+            sumPolarErrors += (newDipolePolar-previousDipolePolar)*(newDipolePolar-previousDipolePolar);
+        }
+    }
+    // Sum the errors over threads and store the total for this block.
+    buffer[threadIdx.x] = make_real2(sumErrors, sumPolarErrors);
+    __syncthreads();
+    for (int offset = 1; offset < blockDim.x; offset *= 2) {   
+        if (threadIdx.x+offset < blockDim.x && (threadIdx.x&(2*offset-1)) == 0) {
+            buffer[threadIdx.x].x += buffer[threadIdx.x+offset].x;
+            buffer[threadIdx.x].y += buffer[threadIdx.x+offset].y;
+        }
+        __syncthreads();
+    }
+    if (threadIdx.x == 0)
+        errors[blockIdx.x] = make_float2((float) buffer[0].x, (float) buffer[0].y);
+}
\ No newline at end of file
--- a/plugins/amoeba/platforms/cuda2/tests/TestCudaAmoebaMultipoleForce.cpp
+++ b/plugins/amoeba/platforms/cuda2/tests/TestCudaAmoebaMultipoleForce.cpp
@@ -1384,8 +1384,8 @@ int main( int numberOfArguments, char* argv[] ) {
        // test mutual polarization, no cutoff
-//        testMultipoleAmmoniaMutualPolarization( log );
+        testMultipoleAmmoniaMutualPolarization( log );
-//
 //        // test multipole direct & mutual polarization using PME
 //
 //        testMultipoleWaterPMEDirectPolarization( log );