Continuing to convert AmoebaGeneralizedKirkwoodForce to new CUDA platform

c2461875 · Peter Eastman · 6c0082a5 · c2461875 · c2461875 · c2461875
Commit c2461875 authored Sep 19, 2012 by Peter Eastman
4 changed files
--- a/plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernels.cpp
@@ -1321,6 +1321,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
    int numForceThreadBlocks = nb.getNumForceThreadBlocks();
    int forceThreadBlockSize = nb.getForceThreadBlockSize();
    int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
+    void* npt = NULL;
    if (pmeGrid == NULL) {
        // Compute induced dipoles.
@@ -1355,11 +1356,40 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
        for (int i = 0; i < maxInducedIterations; i++) {
            cu.clearBuffer(*inducedField);
            cu.clearBuffer(*inducedFieldPolar);
-            void* computeInducedFieldArgs[] = {&inducedField->getDevicePointer(), &inducedFieldPolar->getDevicePointer(), &cu.getPosq().getDevicePointer(),
+            if (gkKernel == NULL) {
-                &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &startTileIndex, &numTileIndices,
+                void* computeInducedFieldArgs[] = {&inducedField->getDevicePointer(), &inducedFieldPolar->getDevicePointer(), &cu.getPosq().getDevicePointer(),
-                &dampingAndThole->getDevicePointer()};
+                    &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &startTileIndex, &numTileIndices,
-            cu.executeKernel(computeInducedFieldKernel, computeInducedFieldArgs, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
+                    &dampingAndThole->getDevicePointer()};
-            void* updateInducedFieldArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(), &inducedField->getDevicePointer(),
+                cu.executeKernel(computeInducedFieldKernel, computeInducedFieldArgs, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
+            }
+            else {
+                cu.clearBuffer(*gkKernel->getInducedField());
+                cu.clearBuffer(*gkKernel->getInducedFieldPolar());
+                vector<float> d, dp;
+                gkKernel->getInducedDipoles()->download(d);
+                gkKernel->getInducedDipolesPolar()->download(dp);
+                printf("dipole\n");
+                for (int i = 0; i < cu.getNumAtoms(); i++)
+                    printf("%d %g %g %g, %g %g %g\n", i, d[3*i], d[3*i+1], d[3*i+2], dp[3*i], dp[3*i+1], dp[3*i+2]);
+                void* computeInducedFieldArgs[] = {&inducedField->getDevicePointer(), &inducedFieldPolar->getDevicePointer(), &cu.getPosq().getDevicePointer(),
+                    &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &startTileIndex, &numTileIndices,
+                    &gkKernel->getInducedField()->getDevicePointer(), &gkKernel->getInducedFieldPolar()->getDevicePointer(),
+                    &gkKernel->getInducedDipoles()->getDevicePointer(), &gkKernel->getInducedDipolesPolar()->getDevicePointer(),
+                    &gkKernel->getBornRadii()->getDevicePointer(), &dampingAndThole->getDevicePointer()};
+                cu.executeKernel(computeInducedFieldKernel, computeInducedFieldArgs, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
+                vector<long long> f, fp;
+                gkKernel->getInducedField()->download(f);
+                gkKernel->getInducedFieldPolar()->download(fp);
+                printf("field\n");
+                for (int i = 0; i < cu.getNumAtoms(); i++)
+                    printf("%d %g %g %g, %g %g %g\n", i, f[i]/(double) 0xFFFFFFFF, f[i+cu.getPaddedNumAtoms()]/(double) 0xFFFFFFFF, f[i+2*cu.getPaddedNumAtoms()]/(double) 0xFFFFFFFF, fp[i]/(double) 0xFFFFFFFF, fp[i+cu.getPaddedNumAtoms()]/(double) 0xFFFFFFFF, fp[i+2*cu.getPaddedNumAtoms()]/(double) 0xFFFFFFFF);
+                void* updateInducedGkFieldArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(),
+                    &gkKernel->getField()->getDevicePointer(), &gkKernel->getFieldPolar()->getDevicePointer(), &gkKernel->getInducedField()->getDevicePointer(),
+                    &gkKernel->getInducedFieldPolar()->getDevicePointer(), &gkKernel->getInducedDipoles()->getDevicePointer(), &gkKernel->getInducedDipolesPolar()->getDevicePointer(),
+                    &polarizability->getDevicePointer(), &inducedDipoleErrors->getDevicePointer()};
+                cu.executeKernel(updateInducedFieldKernel, updateInducedGkFieldArgs, cu.getNumThreadBlocks()*cu.ThreadBlockSize, cu.ThreadBlockSize, cu.ThreadBlockSize*elementSize*2);
+            }
+            void* updateInducedFieldArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(), &npt, &npt, &inducedField->getDevicePointer(),
                &inducedFieldPolar->getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(),
                &polarizability->getDevicePointer(), &inducedDipoleErrors->getDevicePointer()};
            cu.executeKernel(updateInducedFieldKernel, updateInducedFieldArgs, cu.getNumThreadBlocks()*cu.ThreadBlockSize, cu.ThreadBlockSize, cu.ThreadBlockSize*elementSize*2);
@@ -1487,7 +1517,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
            void* pmeRecordInducedFieldDipolesArgs[] = {&pmePhid->getDevicePointer(), &pmePhip->getDevicePointer(),
                &inducedField->getDevicePointer(), &inducedFieldPolar->getDevicePointer(), cu.getInvPeriodicBoxSizePointer()};
            cu.executeKernel(pmeRecordInducedFieldDipolesKernel, pmeRecordInducedFieldDipolesArgs, cu.getNumAtoms());
-            void* updateInducedFieldArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(), &inducedField->getDevicePointer(),
+            void* updateInducedFieldArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(), &npt, &npt, &inducedField->getDevicePointer(),
                &inducedFieldPolar->getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(),
                &polarizability->getDevicePointer(), &inducedDipoleErrors->getDevicePointer()};
            cu.executeKernel(updateInducedFieldKernel, updateInducedFieldArgs, cu.getNumThreadBlocks()*cu.ThreadBlockSize, cu.ThreadBlockSize, cu.ThreadBlockSize*elementSize*2);
@@ -1704,8 +1734,8 @@ private:
 };
 CudaCalcAmoebaGeneralizedKirkwoodForceKernel::CudaCalcAmoebaGeneralizedKirkwoodForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : 
-           CalcAmoebaGeneralizedKirkwoodForceKernel(name, platform), cu(cu), system(system), params(NULL), bornRadii(NULL), field(NULL), inducedDipoleS(NULL),
+           CalcAmoebaGeneralizedKirkwoodForceKernel(name, platform), cu(cu), system(system), params(NULL), bornRadii(NULL), field(NULL), fieldPolar(NULL),
-           inducedDipolePolarS(NULL), bornSum(NULL), bornForce(NULL) {
+           inducedField(NULL), inducedFieldPolar(NULL), inducedDipoleS(NULL), inducedDipolePolarS(NULL), bornSum(NULL), bornForce(NULL) {
 }
 CudaCalcAmoebaGeneralizedKirkwoodForceKernel::~CudaCalcAmoebaGeneralizedKirkwoodForceKernel() {
@@ -1716,6 +1746,12 @@ CudaCalcAmoebaGeneralizedKirkwoodForceKernel::~CudaCalcAmoebaGeneralizedKirkwood
        delete bornRadii;
    if (field != NULL)
        delete field;
+    if (fieldPolar != NULL)
+        delete fieldPolar;
+    if (inducedField != NULL)
+        delete inducedField;
+    if (inducedFieldPolar != NULL)
+        delete inducedFieldPolar;
    if (inducedDipoleS != NULL)
        delete inducedDipoleS;
    if (inducedDipolePolarS != NULL)
@@ -1741,11 +1777,17 @@ void CudaCalcAmoebaGeneralizedKirkwoodForceKernel::initialize(const System& syst
    params = CudaArray::create<float2>(cu, paddedNumAtoms, "amoebaGkParams");
    bornRadii = new CudaArray(cu, paddedNumAtoms, elementSize, "bornRadii");
    field = new CudaArray(cu, 3*paddedNumAtoms, sizeof(long long), "gkField");
+    fieldPolar = new CudaArray(cu, 3*paddedNumAtoms, sizeof(long long), "gkFieldPolar");
    bornSum = CudaArray::create<long long>(cu, paddedNumAtoms, "bornSum");
    bornForce = CudaArray::create<long long>(cu, paddedNumAtoms, "bornForce");
    inducedDipoleS = new CudaArray(cu, 3*paddedNumAtoms, elementSize, "inducedDipoleS");
    inducedDipolePolarS = new CudaArray(cu, 3*paddedNumAtoms, elementSize, "inducedDipolePolarS");
+    if (multipoles->getPolarizationType() == AmoebaMultipoleForce::Mutual) {
+        inducedField = new CudaArray(cu, 3*paddedNumAtoms, sizeof(long long), "gkInducedField");
+        inducedFieldPolar = new CudaArray(cu, 3*paddedNumAtoms, sizeof(long long), "gkInducedFieldPolar");
+    }
    cu.addAutoclearBuffer(*field);
+    cu.addAutoclearBuffer(*fieldPolar);
    cu.addAutoclearBuffer(*bornSum);
    cu.addAutoclearBuffer(*bornForce);
    vector<float2> paramsVector(paddedNumAtoms);

--- a/plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernels.h
+++ b/plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernels.h
@@ -470,6 +470,15 @@ public:
    CudaArray* getField() {
        return field;
    }
+    CudaArray* getFieldPolar() {
+        return fieldPolar;
+    }
+    CudaArray* getInducedField() {
+        return inducedField;
+    }
+    CudaArray* getInducedFieldPolar() {
+        return inducedFieldPolar;
+    }
    CudaArray* getInducedDipoles() {
        return inducedDipoleS;
    }
@@ -485,6 +494,9 @@ private:
    CudaArray* bornRadii;
    CudaArray* bornForce;
    CudaArray* field;
+    CudaArray* fieldPolar;
+    CudaArray* inducedField;
+    CudaArray* inducedFieldPolar;
    CudaArray* inducedDipoleS;
    CudaArray* inducedDipolePolarS;
    CUfunction computeBornSumKernel, reduceBornSumKernel, gkForceKernel, chainRuleKernel, ediffKernel;

--- a/plugins/amoeba/platforms/cuda2/src/kernels/gkEDiffPairForce.cu
+++ b/plugins/amoeba/platforms/cuda2/src/kernels/gkEDiffPairForce.cu
-#define APPLY_SCALE
 #if defined F1
 __device__ void computeOneEDiffInteractionF1(AtomData4& atom1, volatile AtomData4& atom2, float dScale, float pScale, real& outputEnergy, real3& outputForce) {
 #elif defined T1
@@ -77,7 +75,6 @@ __device__ void computeOneEDiffInteractionT3(AtomData4& atom1, volatile AtomData
    real scale3i = 3*scale3*uscale*rr3*rr2;
    real scale5i = 3*scale5*uscale*rr3*rr2;
-#ifdef APPLY_SCALE
    real dsc3 = scale3*dScale*rr3;
    real dsc5 = 3*scale5*dScale*rr3*rr2;
    real dsc7 = 15*scale7*dScale*rr3*rr3*rr1;
@@ -85,11 +82,6 @@ __device__ void computeOneEDiffInteractionT3(AtomData4& atom1, volatile AtomData
    real psc3 = scale3*pScale*rr3;
    real psc5 = 3*scale5*pScale*rr3*rr2;
    real psc7 = 15*scale7*pScale*rr3*rr3*rr1;
-#else
-    real psc3 = scale3*rr3;
-    real psc5 = 3*scale5*rr3*rr2;
-    real psc7 = 15*scale7*rr3*rr3*rr1;
-#endif
 #ifdef T1
    real dixr1 = atom1.dipole.y*zr - atom1.dipole.z*yr;
@@ -167,15 +159,9 @@ __device__ void computeOneEDiffInteractionT3(AtomData4& atom1, volatile AtomData
    qiukp2 -= atom1.quadrupoleXY*atom2.inducedDipolePolar.x + atom1.quadrupoleYY*atom2.inducedDipolePolar.y + atom1.quadrupoleYZ*atom2.inducedDipolePolar.z;
    qiukp3 -= atom1.quadrupoleXZ*atom2.inducedDipolePolar.x + atom1.quadrupoleYZ*atom2.inducedDipolePolar.y + atom1.quadrupoleZZ*atom2.inducedDipolePolar.z;
-#ifdef APPLY_SCALE
    ftm2i1 -= psc5*qiuk1 + dsc5*qiukp1;
    ftm2i2 -= psc5*qiuk2 + dsc5*qiukp2;
    ftm2i3 -= psc5*qiuk3 + dsc5*qiukp3;
-#else
-    ftm2i1 -= psc5*(qiuk1 + qiukp1);
-    ftm2i2 -= psc5*(qiuk2 + qiukp2);
-    ftm2i3 -= psc5*(qiuk3 + qiukp3);
-#endif
 #endif
 #endif
@@ -197,15 +183,9 @@ __device__ void computeOneEDiffInteractionT3(AtomData4& atom1, volatile AtomData
    qkuip2 -= atom2.quadrupoleXY*atom1.inducedDipolePolar.x + atom2.quadrupoleYY*atom1.inducedDipolePolar.y + atom2.quadrupoleYZ*atom1.inducedDipolePolar.z;
    qkuip3 -= atom2.quadrupoleXZ*atom1.inducedDipolePolar.x + atom2.quadrupoleYZ*atom1.inducedDipolePolar.y + atom2.quadrupoleZZ*atom1.inducedDipolePolar.z;
-#ifdef APPLY_SCALE
    ftm2i1 += psc5*qkui1 + dsc5*qkuip1;
    ftm2i2 += psc5*qkui2 + dsc5*qkuip2;
    ftm2i3 += psc5*qkui3 + dsc5*qkuip3;
-#else
-    ftm2i1 += psc5*(qkui1 + qkuip1);
-    ftm2i2 += psc5*(qkui2 + qkuip2);
-    ftm2i3 += psc5*(qkui3 + qkuip3);
-#endif
 #endif
 #endif
@@ -326,44 +306,24 @@ __device__ void computeOneEDiffInteractionT3(AtomData4& atom1, volatile AtomData
    real glip3 = scip3Y*sc6 - scip4Y*sc5;
 #ifdef F1
-#ifdef APPLY_SCALE
    ftm2i1 -= 0.5f*((gli1*pScale + glip1*dScale)*ddsc3_1 + (gli2*pScale + glip2*dScale)*ddsc5_1 + (gli3*pScale+glip3*dScale)*ddsc7_1);
    ftm2i2 -= 0.5f*((gli1*pScale + glip1*dScale)*ddsc3_2 + (gli2*pScale + glip2*dScale)*ddsc5_2 + (gli3*pScale+glip3*dScale)*ddsc7_2);
    ftm2i3 -= 0.5f*((gli1*pScale + glip1*dScale)*ddsc3_3 + (gli2*pScale + glip2*dScale)*ddsc5_3 + (gli3*pScale+glip3*dScale)*ddsc7_3);
-#else
-    ftm2i1 -= 0.5f*((gli1 + glip1)*ddsc3_1 + (gli2 + glip2)*ddsc5_1 + (gli3 + glip3)*ddsc7_1);
-    ftm2i2 -= 0.5f*((gli1 + glip1)*ddsc3_2 + (gli2 + glip2)*ddsc5_2 + (gli3 + glip3)*ddsc7_2);
-    ftm2i3 -= 0.5f*((gli1 + glip1)*ddsc3_3 + (gli2 + glip2)*ddsc5_3 + (gli3 + glip3)*ddsc7_3);
-#endif
    outputEnergy = gli1*psc3 + gli2*psc5 + gli3*psc7;
 #endif
-#ifdef APPLY_SCALE
    gfi1 += 1.5f*(gli1*psc3 + glip1*dsc3);
    gfi1 += 2.5f*(gli2*psc5 + glip2*dsc5);
    gfi1 += 3.5f*(gli3*psc7 + glip3*dsc7);
-#else
-    gfi1 += 1.5f*psc3*(gli1 + glip1);
-    gfi1 += 2.5f*psc5*(gli2 + glip2);
-    gfi1 += 3.5f*psc7*(gli3 + glip3);
-#endif
    gfi1 *= rr2;
    gfi1 += 0.5f*scip2*scale3i;
 #if defined F1 || defined T1
-#ifdef APPLY_SCALE
    real gfi5 =  (sci4Y*psc7+scip4Y*dsc7);
-#else
-    real gfi5 =  psc7*(sci4Y + scip4Y);
-#endif
 #endif
 #if defined F1 || defined T3
-#ifdef APPLY_SCALE
    real gfi6 = -(sci3Y*psc7+scip3Y*dsc7);
-#else
-    real gfi6 = -psc7*(sci3Y + scip3Y);
-#endif
 #endif
 #ifdef F1
@@ -371,66 +331,35 @@ __device__ void computeOneEDiffInteractionT3(AtomData4& atom1, volatile AtomData
    real diff0 = atom1.inducedDipoleS.x - atom1.inducedDipole.x;               
    real diff1 = atom1.inducedDipolePolarS.x - atom1.inducedDipolePolar.x;               
-#ifdef APPLY_SCALE
    ftm2i1 += 0.5f*(-atom2.q*(diff0*psc3 + diff1*dsc3) + sc4*(diff0*psc5 + diff1*dsc5) - sc6*(diff0*psc7 + diff1*dsc7));
-#else
-    ftm2i1 += 0.5f*(-atom2.q*psc3*(diff0 + diff1) + sc4*psc5*(diff0 + diff1) - sc6*psc7*(diff0 + diff1));
-#endif
    diff0 = atom2.inducedDipoleS.x - atom2.inducedDipole.x;               
    diff1 = atom2.inducedDipolePolarS.x - atom2.inducedDipolePolar.x;               
-#ifdef APPLY_SCALE
    ftm2i1 += 0.5f*(atom1.q*(diff0*psc3 + diff1*dsc3) + sc3*(diff0*psc5 + diff1*dsc5) + sc5*(diff0*psc7 + diff1*dsc7));
    ftm2i1 += 0.5f*(sci4Y*psc5+scip4Y*dsc5)*atom1.dipole.x + 0.5f*(sci3Y*psc5+scip3Y*dsc5)*atom2.dipole.x + gfi5*qir1 + gfi6*qkr1;
-#else
-    ftm2i1 += 0.5f*(atom1.q*psc3*(diff0 + diff1) + sc3*psc5*(diff0 + diff1) + sc5*psc7*(diff0 + diff1));
-    ftm2i1 += 0.5f*psc5*(sci4Y + scip4Y)*atom1.dipole.x + 0.5f*psc5*(sci3Y + scip3Y)*atom2.dipole.x + gfi5*qir1 + gfi6*qkr1;
-#endif
    ftm2i2 += gfi1*yr;
    diff0 = atom1.inducedDipoleS.y - atom1.inducedDipole.y;               
    diff1 = atom1.inducedDipolePolarS.y - atom1.inducedDipolePolar.y;               
-#ifdef APPLY_SCALE
    ftm2i2 += 0.5f*(-atom2.q*(diff0*psc3 + diff1*dsc3) + sc4*(diff0*psc5 + diff1*dsc5) - sc6*(diff0*psc7 + diff1*dsc7));
-#else
-    ftm2i2 += 0.5f*(-atom2.q*psc3*(diff0 + diff1) + sc4*psc5*(diff0 + diff1) - sc6*psc7*(diff0 + diff1));
-#endif
    diff0 = atom2.inducedDipoleS.y - atom2.inducedDipole.y;               
    diff1 = atom2.inducedDipolePolarS.y - atom2.inducedDipolePolar.y;               
-#ifdef APPLY_SCALE
    ftm2i2 += 0.5f*(atom1.q*(diff0*psc3 + diff1*dsc3) + sc3*(diff0*psc5 + diff1*dsc5) + sc5*(diff0*psc7 + diff1*dsc7));
    ftm2i2 += 0.5f*(sci4Y*psc5+scip4Y*dsc5)*atom1.dipole.y + 0.5f*(sci3Y*psc5+scip3Y*dsc5)*atom2.dipole.y + gfi5*qir2 + gfi6*qkr2;
-#else
-    ftm2i2 += 0.5f*(atom1.q*psc3*(diff0 + diff1) + sc3*psc5*(diff0 + diff1) + sc5*psc7*(diff0 + diff1));
-    ftm2i2 += 0.5f*psc5*(sci4Y +scip4Y)*atom1.dipole.y + 0.5f*psc5*(sci3Y +scip3Y)*atom2.dipole.y + gfi5*qir2 + gfi6*qkr2;
-#endif
    ftm2i3 += gfi1*zr;
    diff0 = atom1.inducedDipoleS.z - atom1.inducedDipole.z;               
    diff1 = atom1.inducedDipolePolarS.z - atom1.inducedDipolePolar.z;               
-#ifdef APPLY_SCALE
    ftm2i3 += 0.5f*(-atom2.q*(diff0*psc3 + diff1*dsc3) + sc4*(diff0*psc5 + diff1*dsc5) - sc6*(diff0*psc7 + diff1*dsc7));
-#else
-    ftm2i3 += 0.5f*(-atom2.q*psc3*(diff0 + diff1) + sc4*psc5*(diff0 + diff1) - sc6*psc7*(diff0 + diff1));
-#endif
    diff0 = atom2.inducedDipoleS.z - atom2.inducedDipole.z;               
    diff1 = atom2.inducedDipolePolarS.z - atom2.inducedDipolePolar.z;               
-#ifdef APPLY_SCALE
    ftm2i3 += 0.5f*(atom1.q*(diff0*psc3 + diff1*dsc3) + sc3*(diff0*psc5 + diff1*dsc5) + sc5*(diff0*psc7 + diff1*dsc7));
    ftm2i3 += 0.5f*(sci4Y*psc5+scip4Y*dsc5)*atom1.dipole.z + 0.5f*(sci3Y*psc5+scip3Y*dsc5)*atom2.dipole.z + gfi5*qir3 + gfi6*qkr3;
-#else
-    ftm2i3 += 0.5f*(atom1.q*psc3*(diff0 + diff1) + sc3*psc5*(diff0 + diff1) + sc5*psc7*(diff0 + diff1));
-    ftm2i3 += 0.5f*psc5*(sci4Y + scip4Y)*atom1.dipole.z + 0.5f*psc5*(sci3Y+scip3Y)*atom2.dipole.z + gfi5*qir3 + gfi6*qkr3;
-#endif
    // intermediate values needed for partially excluded interactions
@@ -475,31 +404,17 @@ __device__ void computeOneEDiffInteractionT3(AtomData4& atom1, volatile AtomData
 #endif
 #ifdef T1
-#ifdef APPLY_SCALE
    real gti2 = 0.5f*(sci4Y*psc5 + scip4Y*dsc5);
    real ttm2i1 = -(dixuk1*psc3+dixukp1*dsc3)*0.5f + gti2*dixr1 + ((ukxqir1+rxqiuk1)*psc5 +(ukxqirp1+rxqiukp1)*dsc5) - gfi5*rxqir1;
    real ttm2i2 = -(dixuk2*psc3+dixukp2*dsc3)*0.5f + gti2*dixr2 + ((ukxqir2+rxqiuk2)*psc5 +(ukxqirp2+rxqiukp2)*dsc5) - gfi5*rxqir2;
    real ttm2i3 = -(dixuk3*psc3+dixukp3*dsc3)*0.5f + gti2*dixr3 + ((ukxqir3+rxqiuk3)*psc5 +(ukxqirp3+rxqiukp3)*dsc5) - gfi5*rxqir3;
-#else
-    real gti2 = 0.5f*psc5*(sci4Y + scip4Y);
-    real ttm2i1 = -psc3*(dixuk1 + dixukp1)*0.5f + gti2*dixr1 + psc5*((ukxqir1+rxqiuk1) + (ukxqirp1+rxqiukp1)) - gfi5*rxqir1;
-    real ttm2i2 = -psc3*(dixuk2 + dixukp2)*0.5f + gti2*dixr2 + psc5*((ukxqir2+rxqiuk2) + (ukxqirp2+rxqiukp2)) - gfi5*rxqir2;
-    real ttm2i3 = -psc3*(dixuk3 + dixukp3)*0.5f + gti2*dixr3 + psc5*((ukxqir3+rxqiuk3) + (ukxqirp3+rxqiukp3)) - gfi5*rxqir3;
-#endif
 #endif
 #ifdef T3
-#ifdef APPLY_SCALE
    real gti3 = 0.5f*(sci3Y*psc5 + scip3Y*dsc5);
    real ttm3i1 = -(dkxui1*psc3+dkxuip1*dsc3)*0.5f + gti3*dkxr1 - ((uixqkr1+rxqkui1)*psc5 +(uixqkrp1+rxqkuip1)*dsc5) - gfi6*rxqkr1;
    real ttm3i2 = -(dkxui2*psc3+dkxuip2*dsc3)*0.5f + gti3*dkxr2 - ((uixqkr2+rxqkui2)*psc5 +(uixqkrp2+rxqkuip2)*dsc5) - gfi6*rxqkr2;
    real ttm3i3 = -(dkxui3*psc3+dkxuip3*dsc3)*0.5f + gti3*dkxr3 - ((uixqkr3+rxqkui3)*psc5 +(uixqkrp3+rxqkuip3)*dsc5) - gfi6*rxqkr3;
-#else
-    real gti3 = 0.5f*psc5*(sci3Y + scip3Y);
-    real ttm3i1 = -psc3*(dkxui1 + dkxuip1)*0.5f + gti3*dkxr1 - psc5*((uixqkr1+rxqkui1) + (uixqkrp1+rxqkuip1)) - gfi6*rxqkr1;
-    real ttm3i2 = -psc3*(dkxui2 + dkxuip2)*0.5f + gti3*dkxr2 - psc5*((uixqkr2+rxqkui2) + (uixqkrp2+rxqkuip2)) - gfi6*rxqkr2;
-    real ttm3i3 = -psc3*(dkxui3 + dkxuip3)*0.5f + gti3*dkxr3 - psc5*((uixqkr3+rxqkui3) + (uixqkrp3+rxqkuip3)) - gfi6*rxqkr3;
-#endif
 #endif
    // update force and torque on site k
@@ -603,27 +518,15 @@ __device__ void computeOneEDiffInteractionT3(AtomData4& atom1, volatile AtomData
 #endif
 #ifdef T1
-#ifdef APPLY_SCALE
     ttm2i1 = -(dixuk1*psc3+dixukp1*dsc3)*0.5f + ((ukxqir1+rxqiuk1)*psc5 +(ukxqirp1+rxqiukp1)*dsc5);
     ttm2i2 = -(dixuk2*psc3+dixukp2*dsc3)*0.5f + ((ukxqir2+rxqiuk2)*psc5 +(ukxqirp2+rxqiukp2)*dsc5);
     ttm2i3 = -(dixuk3*psc3+dixukp3*dsc3)*0.5f + ((ukxqir3+rxqiuk3)*psc5 +(ukxqirp3+rxqiukp3)*dsc5);
-#else
-     ttm2i1 = -psc3*(dixuk1+dixukp1)*0.5f + psc5*((ukxqir1+rxqiuk1) + (ukxqirp1+rxqiukp1));
-     ttm2i2 = -psc3*(dixuk2+dixukp2)*0.5f + psc5*((ukxqir2+rxqiuk2) + (ukxqirp2+rxqiukp2));
-     ttm2i3 = -psc3*(dixuk3+dixukp3)*0.5f + psc5*((ukxqir3+rxqiuk3) + (ukxqirp3+rxqiukp3));
-#endif
 #endif
 #ifdef T3
-#ifdef APPLY_SCALE
     ttm3i1 = -(dkxui1*psc3+dkxuip1*dsc3)*0.5f - ((uixqkr1+rxqkui1)*psc5 +(uixqkrp1+rxqkuip1)*dsc5);
     ttm3i2 = -(dkxui2*psc3+dkxuip2*dsc3)*0.5f - ((uixqkr2+rxqkui2)*psc5 +(uixqkrp2+rxqkuip2)*dsc5);
     ttm3i3 = -(dkxui3*psc3+dkxuip3*dsc3)*0.5f - ((uixqkr3+rxqkui3)*psc5 +(uixqkrp3+rxqkuip3)*dsc5);
-#else
-     ttm3i1 = -psc3*(dkxui1 + dkxuip1)*0.5f - psc5*((uixqkr1+rxqkui1) + (uixqkrp1+rxqkuip1));
-     ttm3i2 = -psc3*(dkxui2 + dkxuip2)*0.5f - psc5*((uixqkr2+rxqkui2) + (uixqkrp2+rxqkuip2));
-     ttm3i3 = -psc3*(dkxui3 + dkxuip3)*0.5f - psc5*((uixqkr3+rxqkui3) + (uixqkrp3+rxqkuip3));
-#endif
 #endif
    // update force and torque on site k;

--- a/plugins/amoeba/platforms/cuda2/src/kernels/multipoleInducedField.cu
+++ b/plugins/amoeba/platforms/cuda2/src/kernels/multipoleInducedField.cu
@@ -4,10 +4,21 @@
 typedef struct {
    real3 pos;
    real3 field, fieldPolar, inducedDipole, inducedDipolePolar;
+#ifdef USE_GK
+    real3 fieldS, fieldPolarS, inducedDipoleS, inducedDipolePolarS;
+    real bornRadius;
+#endif
    float thole, damp;
 } AtomData;
-inline __device__ void loadAtomData(AtomData& data, int atom, const real4* __restrict__ posq, const real* __restrict__ inducedDipole, const real* __restrict__ inducedDipolePolar, const float2* __restrict__ dampingAndThole) {
+#ifdef USE_GK
+inline __device__ void loadAtomData(AtomData& data, int atom, const real4* __restrict__ posq, const real* __restrict__ inducedDipole,
+        const real* __restrict__ inducedDipolePolar, const float2* __restrict__ dampingAndThole, const real* __restrict__ inducedDipoleS,
+        const real* __restrict__ inducedDipolePolarS, const real* __restrict__ bornRadii) {
+#else
+inline __device__ void loadAtomData(AtomData& data, int atom, const real4* __restrict__ posq, const real* __restrict__ inducedDipole,
+        const real* __restrict__ inducedDipolePolar, const float2* __restrict__ dampingAndThole) {
+#endif
    real4 atomPosq = posq[atom];
    data.pos = make_real3(atomPosq.x, atomPosq.y, atomPosq.z);
    data.inducedDipole.x = inducedDipole[atom*3];
@@ -19,10 +30,30 @@ inline __device__ void loadAtomData(AtomData& data, int atom, const real4* __res
    float2 temp = dampingAndThole[atom];
    data.damp = temp.x;
    data.thole = temp.y;
+#ifdef USE_GK
+    data.inducedDipoleS.x = inducedDipoleS[atom*3];
+    data.inducedDipoleS.y = inducedDipoleS[atom*3+1];
+    data.inducedDipoleS.z = inducedDipoleS[atom*3+2];
+    data.inducedDipolePolarS.x = inducedDipolePolarS[atom*3];
+    data.inducedDipolePolarS.y = inducedDipolePolarS[atom*3+1];
+    data.inducedDipolePolarS.z = inducedDipolePolarS[atom*3+2];
+    data.bornRadius = bornRadii[atom];
+#endif
+}
+inline __device__ void zeroAtomData(AtomData& data) {
+    data.field = make_real3(0);
+    data.fieldPolar = make_real3(0);
+#ifdef USE_GK
+    data.fieldS = make_real3(0);
+    data.fieldPolarS = make_real3(0);
+#endif
 }
 #ifdef USE_EWALD
-__device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real3 deltaR, real3* fields) {
+__device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real3 deltaR, bool isSelfInteraction) {
+    if (isSelfInteraction)
+        return;
    real scale1, scale2;
    real r2 = dot(deltaR, deltaR);
    if (r2 < CUTOFF_SQUARED) {
@@ -35,7 +66,7 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real3 de
        real bn0 = erfc(ralpha)*rI;
        real alsq2 = 2*EWALD_ALPHA*EWALD_ALPHA;
        real alsq2n = RECIP(SQRT_PI*EWALD_ALPHA);
-        real exp2a = expf(-(ralpha*ralpha));
+        real exp2a = EXP(-(ralpha*ralpha));
        alsq2n *= alsq2;
        real bn1 = (bn0+alsq2n*exp2a)*rI*rI;
@@ -73,16 +104,90 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real3 de
        scale2 = 0;
    }
    real dDotDelta = scale2*dot(deltaR, atom2.inducedDipole);
-    fields[0] = scale1*atom2.inducedDipole + dDotDelta*deltaR;
+    atom1.field += scale1*atom2.inducedDipole + dDotDelta*deltaR;
    dDotDelta = scale2*dot(deltaR, atom2.inducedDipolePolar);
-    fields[1] = scale1*atom2.inducedDipolePolar + dDotDelta*deltaR;
+    atom1.fieldPolar += scale1*atom2.inducedDipolePolar + dDotDelta*deltaR;
    dDotDelta = scale2*dot(deltaR, atom1.inducedDipole);
-    fields[2] = scale1*atom1.inducedDipole + dDotDelta*deltaR;
+    atom2.field += scale1*atom1.inducedDipole + dDotDelta*deltaR;
    dDotDelta = scale2*dot(deltaR, atom1.inducedDipolePolar);
-    fields[3] = scale1*atom1.inducedDipolePolar + dDotDelta*deltaR;
+    atom2.fieldPolar += scale1*atom1.inducedDipolePolar + dDotDelta*deltaR;
+}
+#elif defined USE_GK
+__device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real3 deltaR, bool isSelfInteraction) {
+    real r2 = dot(deltaR, deltaR);
+    real r = SQRT(r2);
+    if (!isSelfInteraction) {
+        real rI = RECIP(r);
+        real r2I = rI*rI;
+        real rr3 = -rI*r2I;
+        real rr5 = -3*rr3*r2I;
+        real dampProd = atom1.damp*atom2.damp;
+        real ratio = (dampProd != 0 ? r/dampProd : 1);
+        float pGamma = (atom1.thole > atom2.thole ? atom2.thole: atom1.thole);
+        real damp = ratio*ratio*ratio*pGamma;
+        real dampExp = (dampProd != 0 ? EXP(-damp) : 0); 
+        rr3 *= 1-dampExp;
+        rr5 *= 1-(1+damp)*dampExp;
+        real dDotDelta = rr5*dot(deltaR, atom2.inducedDipole);
+        atom1.field += rr3*atom2.inducedDipole + dDotDelta*deltaR;
+        dDotDelta = rr5*dot(deltaR, atom2.inducedDipolePolar);
+        atom1.fieldPolar += rr3*atom2.inducedDipolePolar + dDotDelta*deltaR;
+        dDotDelta = rr5*dot(deltaR, atom1.inducedDipole);
+        atom2.field += rr3*atom1.inducedDipole + dDotDelta*deltaR;
+        dDotDelta = rr5*dot(deltaR, atom1.inducedDipolePolar);
+        atom2.fieldPolar += rr3*atom1.inducedDipolePolar + dDotDelta*deltaR;
+        dDotDelta = rr5*dot(deltaR, atom2.inducedDipoleS);
+        atom1.fieldS += rr3*atom2.inducedDipoleS + dDotDelta*deltaR;
+        dDotDelta = rr5*dot(deltaR, atom2.inducedDipolePolarS);
+        atom1.fieldPolarS += rr3*atom2.inducedDipolePolarS + dDotDelta*deltaR;
+        dDotDelta = rr5*dot(deltaR, atom1.inducedDipoleS);
+        atom2.fieldS += rr3*atom1.inducedDipoleS + dDotDelta*deltaR;
+        dDotDelta = rr5*dot(deltaR, atom1.inducedDipolePolarS);
+        atom2.fieldPolarS += rr3*atom1.inducedDipolePolarS + dDotDelta*deltaR;
+    }
+    real rb2 = atom1.bornRadius*atom2.bornRadius;
+    real expterm = EXP(-r2/(GK_C*rb2));
+    real expc = expterm/GK_C; 
+    real gf2 = RECIP(r2+rb2*expterm);
+    real gf = SQRT(gf2);
+    real gf3 = gf2*gf;
+    real gf5 = gf3*gf2;
+    // reaction potential auxiliary terms
+    real a10 = -gf3;
+    // reaction potential gradient auxiliary terms
+    real expc1 = 1 - expc;
+    real a11 = expc1 * 3 * gf5;
+    // unweighted dipole reaction potential gradient tensor
+    real3 gux = GK_FD*make_real3(a10+deltaR.x*deltaR.x*a11, deltaR.x*deltaR.y*a11, deltaR.x*deltaR.z*a11);
+    real3 guy = make_real3(gux.y, GK_FD*(a10+deltaR.y*deltaR.y*a11), GK_FD*deltaR.y*deltaR.z*a11);
+    real3 guz = make_real3(gux.z, guy.z, GK_FD*(a10+deltaR.z*deltaR.z*a11));
+    atom1.fieldS += atom2.inducedDipoleS.x*gux+atom2.inducedDipoleS.y*guy+atom2.inducedDipoleS.z*guz;
+    atom2.fieldS += atom1.inducedDipoleS.x*gux+atom1.inducedDipoleS.y*guy+atom1.inducedDipoleS.z*guz;
+    atom1.fieldPolarS += atom2.inducedDipolePolarS.x*gux+atom2.inducedDipolePolarS.y*guy+atom2.inducedDipolePolarS.z*guz;
+    atom2.fieldPolarS += atom1.inducedDipolePolarS.x*gux+atom1.inducedDipolePolarS.y*guy+atom1.inducedDipolePolarS.z*guz;
 }
 #else
-__device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real3 deltaR, real3* fields) {
+__device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real3 deltaR, bool isSelfInteraction) {
+    if (isSelfInteraction)
+        return;
    real rI = RSQRT(dot(deltaR, deltaR));
    real r = RECIP(rI);
    real r2I = rI*rI;
@@ -96,13 +201,13 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real3 de
    rr3 *= 1 - dampExp;
    rr5 *= 1 - (1+damp)*dampExp;
    real dDotDelta = rr5*dot(deltaR, atom2.inducedDipole);
-    fields[0] = rr3*atom2.inducedDipole + dDotDelta*deltaR;
+    atom1.field += rr3*atom2.inducedDipole + dDotDelta*deltaR;
    dDotDelta = rr5*dot(deltaR, atom2.inducedDipolePolar);
-    fields[1] = rr3*atom2.inducedDipolePolar + dDotDelta*deltaR;
+    atom1.fieldPolar += rr3*atom2.inducedDipolePolar + dDotDelta*deltaR;
    dDotDelta = rr5*dot(deltaR, atom1.inducedDipole);
-    fields[2] = rr3*atom1.inducedDipole + dDotDelta*deltaR;
+    atom2.field += rr3*atom1.inducedDipole + dDotDelta*deltaR;
    dDotDelta = rr5*dot(deltaR, atom1.inducedDipolePolar);
-    fields[3] = rr3*atom1.inducedDipolePolar + dDotDelta*deltaR;
+    atom2.fieldPolar += rr3*atom1.inducedDipolePolar + dDotDelta*deltaR;
 }
 #endif
@@ -110,10 +215,13 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real3 de
 * Compute the mutual induced field.
 */
 extern "C" __global__ void computeInducedField(
-        unsigned long long* __restrict__ fieldBuffers, unsigned long long* __restrict__ fieldPolarBuffers, const real4* __restrict__ posq,
+        unsigned long long* __restrict__ field, unsigned long long* __restrict__ fieldPolar, const real4* __restrict__ posq,
        const real* __restrict__ inducedDipole, const real* __restrict__ inducedDipolePolar, unsigned int startTileIndex, unsigned int numTileIndices,
 #ifdef USE_CUTOFF
        const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags,
+#elif defined USE_GK
+        unsigned long long* __restrict__ fieldS, unsigned long long* __restrict__ fieldPolarS, const real* __restrict__ inducedDipoleS,
+        const real* __restrict__ inducedDipolePolarS, const real* __restrict__ bornRadii,
 #endif
        const float2* __restrict__ dampingAndThole) {
    unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
@@ -138,8 +246,7 @@ extern "C" __global__ void computeInducedField(
        const unsigned int tbx = threadIdx.x - tgx;
        unsigned int x, y;
        AtomData data;
-        data.field = make_real3(0);
+        zeroAtomData(data);
-        data.fieldPolar = make_real3(0);
        if (pos < end) {
 #ifdef USE_CUTOFF
            if (numTiles <= maxTiles) {
@@ -158,7 +265,11 @@ extern "C" __global__ void computeInducedField(
                }
            }
            unsigned int atom1 = x*TILE_SIZE + tgx;
+#ifdef USE_GK
+            loadAtomData(data, atom1, posq, inducedDipole, inducedDipolePolar, dampingAndThole, inducedDipoleS, inducedDipolePolarS, bornRadii);
+#else
            loadAtomData(data, atom1, posq, inducedDipole, inducedDipolePolar, dampingAndThole);
+#endif
            if (pos >= end)
                ; // This warp is done.
            else if (x == y) {
@@ -170,28 +281,26 @@ extern "C" __global__ void computeInducedField(
                localData[threadIdx.x].thole = data.thole;
                localData[threadIdx.x].damp = data.damp;
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                    int atom2 = tbx+j;
+                    real3 delta = localData[tbx+j].pos-data.pos;
-                    real3 delta = localData[atom2].pos-data.pos;
 #ifdef USE_PERIODIC
                    delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                    delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                    delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
 #endif
-                    real3 fields[4];
+                    int atom2 = y*TILE_SIZE+j;
-                    computeOneInteraction(data, localData[atom2], delta, fields);
+                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS)
-                    atom2 = y*TILE_SIZE+j;
+                        computeOneInteraction(data, localData[tbx+j], delta, atom1 == atom2);
-                    if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                        data.field += fields[0];
-                        data.fieldPolar += fields[1];
-                    }
                }
            }
            else {
                // This is an off-diagonal tile.
+#ifdef USE_GK
+                loadAtomData(localData[threadIdx.x], y*TILE_SIZE+tgx, posq, inducedDipole, inducedDipolePolar, dampingAndThole, inducedDipoleS, inducedDipolePolarS, bornRadii);
+#else
                loadAtomData(localData[threadIdx.x], y*TILE_SIZE+tgx, posq, inducedDipole, inducedDipolePolar, dampingAndThole);
-                localData[threadIdx.x].field = make_real3(0);
+#endif
-                localData[threadIdx.x].fieldPolar = make_real3(0);
+                zeroAtomData(localData[threadIdx.x]);
 #ifdef USE_CUTOFF
                unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
                if (flags == 0) { // TODO: Figure out what the flags != 0 case doesn't work!!!
@@ -270,21 +379,15 @@ extern "C" __global__ void computeInducedField(
                    unsigned int tj = tgx;
                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                        int atom2 = tbx+tj;
+                        real3 delta = localData[tbx+tj].pos-data.pos;
-                        real3 delta = localData[atom2].pos-data.pos;
 #ifdef USE_PERIODIC
                        delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                        delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                        delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
 #endif
-                        real3 fields[4];
+                        int atom2 = y*TILE_SIZE+j;
-                        computeOneInteraction(data, localData[atom2], delta, fields);
+                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS)
-                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                            computeOneInteraction(data, localData[tbx+tj], delta, false);
-                            data.field += fields[0];
-                            data.fieldPolar += fields[1];
-                            localData[atom2].field += fields[2];
-                            localData[atom2].fieldPolar += fields[3];
-                        }
                        tj = (tj + 1) & (TILE_SIZE - 1);
                    }
                }
@@ -295,27 +398,44 @@ extern "C" __global__ void computeInducedField(
        if (pos < end) {
            const unsigned int offset = x*TILE_SIZE + tgx;
-            atomicAdd(&fieldBuffers[offset], static_cast<unsigned long long>((long long) (data.field.x*0xFFFFFFFF)));
+            atomicAdd(&field[offset], static_cast<unsigned long long>((long long) (data.field.x*0xFFFFFFFF)));
-            atomicAdd(&fieldBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.y*0xFFFFFFFF)));
+            atomicAdd(&field[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.y*0xFFFFFFFF)));
-            atomicAdd(&fieldBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.z*0xFFFFFFFF)));
+            atomicAdd(&field[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.z*0xFFFFFFFF)));
-            atomicAdd(&fieldPolarBuffers[offset], static_cast<unsigned long long>((long long) (data.fieldPolar.x*0xFFFFFFFF)));
+            atomicAdd(&fieldPolar[offset], static_cast<unsigned long long>((long long) (data.fieldPolar.x*0xFFFFFFFF)));
-            atomicAdd(&fieldPolarBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolar.y*0xFFFFFFFF)));
+            atomicAdd(&fieldPolar[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolar.y*0xFFFFFFFF)));
-            atomicAdd(&fieldPolarBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolar.z*0xFFFFFFFF)));
+            atomicAdd(&fieldPolar[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolar.z*0xFFFFFFFF)));
+#ifdef USE_GK
+            atomicAdd(&fieldS[offset], static_cast<unsigned long long>((long long) (data.fieldS.x*0xFFFFFFFF)));
+            atomicAdd(&fieldS[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldS.y*0xFFFFFFFF)));
+            atomicAdd(&fieldS[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldS.z*0xFFFFFFFF)));
+            atomicAdd(&fieldPolarS[offset], static_cast<unsigned long long>((long long) (data.fieldPolarS.x*0xFFFFFFFF)));
+            atomicAdd(&fieldPolarS[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolarS.y*0xFFFFFFFF)));
+            atomicAdd(&fieldPolarS[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolarS.z*0xFFFFFFFF)));
+#endif
        }
        if (pos < end && x != y) {
            const unsigned int offset = y*TILE_SIZE + tgx;
-            atomicAdd(&fieldBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.x*0xFFFFFFFF)));
+            atomicAdd(&field[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.x*0xFFFFFFFF)));
-            atomicAdd(&fieldBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.y*0xFFFFFFFF)));
+            atomicAdd(&field[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.y*0xFFFFFFFF)));
-            atomicAdd(&fieldBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.z*0xFFFFFFFF)));
+            atomicAdd(&field[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.z*0xFFFFFFFF)));
-            atomicAdd(&fieldPolarBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.x*0xFFFFFFFF)));
+            atomicAdd(&fieldPolar[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.x*0xFFFFFFFF)));
-            atomicAdd(&fieldPolarBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.y*0xFFFFFFFF)));
+            atomicAdd(&fieldPolar[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.y*0xFFFFFFFF)));
-            atomicAdd(&fieldPolarBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.z*0xFFFFFFFF)));
+            atomicAdd(&fieldPolar[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.z*0xFFFFFFFF)));
+#ifdef USE_GK
+            atomicAdd(&fieldS[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldS.x*0xFFFFFFFF)));
+            atomicAdd(&fieldS[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldS.y*0xFFFFFFFF)));
+            atomicAdd(&fieldS[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldS.z*0xFFFFFFFF)));
+            atomicAdd(&fieldPolarS[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolarS.x*0xFFFFFFFF)));
+            atomicAdd(&fieldPolarS[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolarS.y*0xFFFFFFFF)));
+            atomicAdd(&fieldPolarS[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolarS.z*0xFFFFFFFF)));
+#endif
        }
        pos++;
    } while (pos < end);
 }
 extern "C" __global__ void updateInducedFieldBySOR(const long long* __restrict__ fixedField, const long long* __restrict__ fixedFieldPolar,
+        const long long* __restrict__ fixedFieldS, const long long* __restrict__ fixedFieldPolarS,
        const long long* __restrict__ inducedField, const long long* __restrict__ inducedFieldPolar, real* __restrict__ inducedDipole,
        real* __restrict__ inducedDipolePolar, const float* __restrict__ polarizability, float2* __restrict__ errors) {
    extern __shared__ real2 buffer[];
@@ -335,8 +455,10 @@ extern "C" __global__ void updateInducedFieldBySOR(const long long* __restrict__
            int fieldIndex = atom+component*PADDED_NUM_ATOMS;
            real previousDipole = inducedDipole[dipoleIndex];
            real previousDipolePolar = inducedDipolePolar[dipoleIndex];
-            real newDipole = scale*((fixedField[fieldIndex]+inducedField[fieldIndex])*fieldScale+ewaldScale*previousDipole);
+            long long fixed = fixedField[fieldIndex] + (fixedFieldS == NULL ? (long long) 0 : fixedFieldS[fieldIndex]);
-            real newDipolePolar = scale*((fixedFieldPolar[fieldIndex]+inducedFieldPolar[fieldIndex])*fieldScale+ewaldScale*previousDipolePolar);
+            long long fixedPolar = fixedFieldPolar[fieldIndex] + (fixedFieldPolarS == NULL ? (long long) 0 : fixedFieldPolarS[fieldIndex]);
+            real newDipole = scale*((fixed+inducedField[fieldIndex])*fieldScale+ewaldScale*previousDipole);
+            real newDipolePolar = scale*((fixedPolar+inducedFieldPolar[fieldIndex])*fieldScale+ewaldScale*previousDipolePolar);
            newDipole = previousDipole + polarSOR*(newDipole-previousDipole);
            newDipolePolar = previousDipolePolar + polarSOR*(newDipolePolar-previousDipolePolar);
            inducedDipole[dipoleIndex] = newDipole;