Added support for polarization type='direct' option

WCA force now written directly to cuda.force4 array

Added support for polarization type='direct' option
WCA force now written directly to cuda.force4 array
b1dc92cc · Mark Friedrichs · 959f2fab · b1dc92cc · b1dc92cc · b1dc92cc
Commit b1dc92cc authored Apr 08, 2011 by Mark Friedrichs
16 changed files
--- a/plugins/amoeba/openmmapi/include/openmm/AmoebaMultipoleForce.h
+++ b/plugins/amoeba/openmmapi/include/openmm/AmoebaMultipoleForce.h
@@ -66,6 +66,19 @@ public:
        PME = 1 
    };  
+    enum AmoebaPolarizationType {
+        /** 
+         * Mutual polarization
+         */
+        Mutual = 0,
+        /**
+         * Direct polarization 
+         */
+        Direct = 1 
+    };  
    enum MultipoleAxisTypes { ZThenX = 0, Bisector = 1, ZBisect = 2, ThreeFold = 3, ZOnly = 4, NoAxisType = 5, LastAxisTypeIndex = 6 };
    // Algorithm used to converge mutual induced dipoles:
@@ -100,6 +113,16 @@ public:
     */
    void setNonbondedMethod(AmoebaNonbondedMethod method);
+    /**
+     * Get polarization type
+     */
+    AmoebaPolarizationType getPolarizationType( void ) const;
+    /**
+     * Set the polarization type
+     */
+    void setPolarizationType(AmoebaPolarizationType type );
    /**
     * Get the cutoff distance (in nm) being used for nonbonded interactions.  If the NonbondedMethod in use
     * is NoCutoff, this value will have no effect.
@@ -326,6 +349,7 @@ protected:
 private:
    AmoebaNonbondedMethod nonbondedMethod;
+    AmoebaPolarizationType polarizationType;
    double cutoffDistance;
    double aewald;
    int pmeBSplineOrder;

--- a/plugins/amoeba/openmmapi/src/AmoebaMultipoleForce.cpp
+++ b/plugins/amoeba/openmmapi/src/AmoebaMultipoleForce.cpp
@@ -38,7 +38,7 @@ using namespace OpenMM;
 using std::string;
 using std::vector;
-AmoebaMultipoleForce::AmoebaMultipoleForce() : nonbondedMethod(NoCutoff), pmeBSplineOrder(5), cutoffDistance(1.0), ewaldErrorTol(1e-4), mutualInducedIterationMethod(SOR), mutualInducedMaxIterations(60),
+AmoebaMultipoleForce::AmoebaMultipoleForce() : nonbondedMethod(NoCutoff), polarizationType(Mutual), pmeBSplineOrder(5), cutoffDistance(1.0), ewaldErrorTol(1e-4), mutualInducedIterationMethod(SOR), mutualInducedMaxIterations(60),
                                               mutualInducedTargetEpsilon(1.0e-05), scalingDistanceCutoff(100.0), electricConstant(138.9354558456) {
 }
@@ -50,6 +50,14 @@ void AmoebaMultipoleForce::setNonbondedMethod( AmoebaMultipoleForce::AmoebaNonbo
    nonbondedMethod = method;
 }
+AmoebaMultipoleForce::AmoebaPolarizationType AmoebaMultipoleForce::getPolarizationType( void ) const {
+    return polarizationType;
+}
+void AmoebaMultipoleForce::setPolarizationType( AmoebaMultipoleForce::AmoebaPolarizationType type ) {
+    polarizationType = type;
+}
 double AmoebaMultipoleForce::getCutoffDistance( void ) const {
    return cutoffDistance;
 }

--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
@@ -940,8 +940,9 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
        }
    }
+    int polarizationType = static_cast<int>(force.getPolarizationType());
    int iterativeMethod = static_cast<int>(force.getMutualInducedIterationMethod());
-    if( iterativeMethod != 0 ){
+    if( iterativeMethod != 0 && polarizationType == 0 ){
         throw OpenMMException("Iterative method for mutual induced dipoles not recognized.\n");
    }
@@ -950,13 +951,17 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
         throw OpenMMException("AmoebaMultipoleForce nonbonded method not recognized.\n");
    }
+    if( polarizationType != 0 && polarizationType != 1 ){
+         throw OpenMMException("AmoebaMultipoleForce polarization type not recognized.\n");
+    }
    gpuSetAmoebaMultipoleParameters(data.getAmoebaGpu(), charges, dipoles, quadrupoles, axisTypes, multipoleAtomZs, multipoleAtomXs, multipoleAtomYs,
                                    tholes, scalingDistanceCutoff, dampingFactors, polarity,
                                    multipoleAtomCovalentInfo, covalentDegree, minCovalentIndices, minCovalentPolarizationIndices, (maxCovalentRange+2),
                                    static_cast<int>(force.getMutualInducedIterationMethod()),
                                    force.getMutualInducedMaxIterations(),
                                    static_cast<float>( force.getMutualInducedTargetEpsilon()),
-                                    nonbondedMethod,
+                                    nonbondedMethod, polarizationType,
                                    static_cast<float>( force.getCutoffDistance()),
                                    static_cast<float>( force.getAEwald()),
                                    static_cast<float>( force.getElectricConstant()) );

--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaGpu.cpp
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaGpu.cpp
@@ -358,6 +358,7 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
    gpuPrintCudaStreamFloat( amoebaGpu->psLabFrameDipole, log );
    gpuPrintCudaStreamFloat( amoebaGpu->psLabFrameQuadrupole, log );
+    (void) fprintf( log, "     polarizationType                   %d\n",      amoebaGpu->amoebaSim.polarizationType );
    (void) fprintf( log, "     maxCovalentDegreeSz                %d\n",      amoebaGpu->maxCovalentDegreeSz );
    (void) fprintf( log, "     solventDielectric                  %10.3f\n",  amoebaGpu->solventDielectric);
    (void) fprintf( log, "     scalingDistanceCutoff              %15.7e\n",  amoebaGpu->amoebaSim.scalingDistanceCutoff );
@@ -1519,7 +1520,7 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
                                     const std::vector< std::vector< std::vector<int> > >& multipoleParticleCovalentInfo, const std::vector<int>& covalentDegree,
                                     const std::vector<int>& minCovalentIndices,  const std::vector<int>& minCovalentPolarizationIndices, int maxCovalentRange, 
                                     int mutualInducedIterativeMethod, int mutualInducedMaxIterations, float mutualInducedTargetEpsilon,
-                                     int nonbondedMethod, float cutoffDistance, float alphaEwald, float electricConstant ) {
+                                     int nonbondedMethod, int polarizationType, float cutoffDistance, float alphaEwald, float electricConstant ) {
 // ---------------------------------------------------------------------------------------
@@ -1545,6 +1546,7 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
    // allocate memory
    amoebaGpu->mutualInducedIterativeMethod = mutualInducedIterativeMethod;
+    amoebaGpu->amoebaSim.polarizationType   = polarizationType;
    gpuMutualInducedFieldAllocate( amoebaGpu );
    amoebaGpu->mutualInducedMaxIterations   = mutualInducedMaxIterations;
    amoebaGpu->mutualInducedTargetEpsilon   = mutualInducedTargetEpsilon;
@@ -1562,9 +1564,9 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
    }
    if( amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log,"%s Nonbonded method=%d %d [NoCutoff=%d PME=%d]\n",
+        (void) fprintf( amoebaGpu->log,"%s Nonbonded method=%d %d [NoCutoff=%d PME=%d] polarizationType=%d (0=mutual/1=direct)\n",
                        methodName.c_str(), nonbondedMethod, amoebaGpu->multipoleNonbondedMethod,
-                        AMOEBA_NO_CUTOFF, AMOEBA_PARTICLE_MESH_EWALD );
+                        AMOEBA_NO_CUTOFF, AMOEBA_PARTICLE_MESH_EWALD, polarizationType );
        (void) fflush( amoebaGpu->log );
    }
    amoebaGpu->amoebaSim.sqrtPi                      = std::sqrt( 3.14159265358f );
@@ -2554,6 +2556,7 @@ void gpuSetAmoebaWcaDispersionParameters( amoebaGpuContext amoebaGpu,
    unsigned int particles                           = radii.size();
    amoebaGpu->psWcaDispersionRadiusEpsilon          = new CUDAStream<float2>(paddedNumberOfAtoms,   1, "WcaDispersionRadiusEpsilon");
+    amoebaGpu->amoebaSim.pWcaDispersionRadiusEpsilon = amoebaGpu->psWcaDispersionRadiusEpsilon->_pDevData;
    for (unsigned int ii = 0; ii < particles; ii++) 
    {    
        amoebaGpu->psWcaDispersionRadiusEpsilon->_pSysData[ii].x    = radii[ii];

--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaTypes.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaTypes.h
@@ -136,6 +136,7 @@ struct cudaAmoebaGmxSimulation {
    float sqrtPi;                                   // sqrt(PI)
    float scalingDistanceCutoff;                    // scaling cutoff
    float2*         pDampingFactorAndThole;         // Thole & damping factors
+    int polarizationType;                           // polarization type (0=Mutual, 1=Direct)
    int4*  pMultipoleParticlesIdsAndAxisType; 
    int4*  pMultipoleParticlesTorqueBufferIndices; 
@@ -181,8 +182,8 @@ struct cudaAmoebaGmxSimulation {
    float awater;
    float shctd;
    float dispoff;
    float totalMaxWcaDispersionEnergy;
+    float2* pWcaDispersionRadiusEpsilon;
                    // scaling indices
    int*            pScaleIndicesIndex;

--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
@@ -293,7 +293,7 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
                                     const std::vector< std::vector< std::vector<int> > >& multipoleAtomCovalentInfo, const std::vector<int>& covalentDegree,
                                     const std::vector<int>& minCovalentIndices,  const std::vector<int>& minCovalentPolarizationIndices, int maxCovalentRange,
                                     int mutualInducedIterationMethod, int mutualInducedMaxIterations, float mutualInducedTargetEpsilon,
-                                     int nonbondedMethod, float cutoffDistance,  float alphaEwald, float electricConstant );
+                                     int nonbondedMethod, int polarizationType, float cutoffDistance,  float alphaEwald, float electricConstant );
 extern "C"

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
@@ -388,23 +388,6 @@ if( 1 ){
    amatrixProductVector3(atomJ.labFrameQuadrupole,      atomI.inducedDipole,    temp5);
    amatrixProductVector3(atomI.labFrameQuadrupole,      atomJ.inducedDipole ,     qIuJ);//MK
-/*
-               ftm2i(1) = gfi(1)*xr + 0.5d0*
-     &           (- rr3*ck*(uind(1,i)*psc3+uinp(1,i)*dsc3)
-     &            + rr5*sc(4)*(uind(1,i)*psc5+uinp(1,i)*dsc5)
-     &            - rr7*sc(6)*(uind(1,i)*psc7+uinp(1,i)*dsc7))
-     &            + (rr3*ci*(uind(1,k)*psc3+uinp(1,k)*dsc3)
-     &            + rr5*sc(3)*(uind(1,k)*psc5+uinp(1,k)*dsc5)
-     &            + rr7*sc(5)*(uind(1,k)*psc7+uinp(1,k)*dsc7))*0.5d0
-     &            + rr5*scale5i*(sci(4)*uinp(1,i)+scip(4)*uind(1,i)
-     &            + sci(3)*uinp(1,k)+scip(3)*uind(1,k))*0.5d0
-     &            + 0.5d0*(sci(4)*psc5+scip(4)*dsc5)*rr5*di(1)
-     &            + 0.5d0*(sci(3)*psc5+scip(3)*dsc5)*rr5*dk(1)
-     &            + 0.5d0*gfi(4)*((qkui(1)-qiuk(1))*psc5
-     &            + (qkuip(1)-qiukp(1))*dsc5)
-     &            + gfi(5)*qir(1) + gfi(6)*qkr(1)
-*/
    float ftm2i_0 = gfi1*deltaR[0] +
                    0.5f*(-rr3*atomJ.q*(atomI.inducedDipole[0]*psc0 + atomI.inducedDipoleP[0]*dsc0) +
                    rr5*sc4*(atomI.inducedDipole[0]*psc1 + atomI.inducedDipoleP[0]*dsc1) -
@@ -460,6 +443,18 @@ if( 1 ){
    ftm2i_1 -= (fridmp_1 + findmp_1);
    ftm2i_2 -= (fridmp_2 + findmp_2);
+    if( cAmoebaSim.polarizationType )
+    {
+        float gfd     = 0.5*(rr5*scip2*scaleI0 - rr7*(scip3*sci4+sci3*scip4)*scaleI1);
+        float temp5   = 0.5*rr5*scaleI1;
+        float fdir_0  = gfd*deltaR[0] + temp5*(sci4*atomI.inducedDipoleP[0] + scip4*atomI.inducedDipole[0] + sci3*atomJ.inducedDipoleP[0] + scip3*atomJ.inducedDipole[0]);
+        float fdir_1  = gfd*deltaR[1] + temp5*(sci4*atomI.inducedDipoleP[1] + scip4*atomI.inducedDipole[1] + sci3*atomJ.inducedDipoleP[1] + scip3*atomJ.inducedDipole[1]);
+        float fdir_2  = gfd*deltaR[2] + temp5*(sci4*atomI.inducedDipoleP[2] + scip4*atomI.inducedDipole[2] + sci3*atomJ.inducedDipoleP[2] + scip3*atomJ.inducedDipole[2]);
+        ftm2i_0      -= fdir_0 - findmp_0;
+        ftm2i_1      -= fdir_1 - findmp_1;
+        ftm2i_2      -= fdir_2 - findmp_2;
+    }
    // now perform the torque calculation;
    // intermediate terms for torque between multipoles i and j;

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
@@ -1311,7 +1311,7 @@ __device__ void calculateKirkwoodPairIxn_kernel( KirkwoodParticle& atomI,
    // mutual polarization electrostatic solvation free energy gradient
-//   if (poltyp .eq. 'MUTUAL'){
+    if( cAmoebaSim.polarizationType == 0 ){
        dpdx = dpdx - 0.5f *
                           (atomI.inducedDipole[0]*(atomJ.inducedDipoleP[0]*gux5+atomJ.inducedDipoleP[1]*gux6+atomJ.inducedDipoleP[2]*gux7)
@@ -1345,7 +1345,7 @@ __device__ void calculateKirkwoodPairIxn_kernel( KirkwoodParticle& atomI,
                            + atomJ.inducedDipole[2]*(atomI.inducedDipoleP[0]*guz22+atomI.inducedDipoleP[1]*guz23+atomI.inducedDipoleP[2]*guz24);
        dpbi = dpbi - 0.5f*atomJ.bornRadius*duvdr;
        dpbk = dpbk - 0.5f*atomI.bornRadius*duvdr;
-//    }
+    }
    // torque due to induced reaction field on permanent dipoles

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.cu
@@ -515,24 +515,16 @@ __device__ void calculateKirkwoodEDiffPairIxn_kernel( KirkwoodEDiffParticle& ato
    // correction to convert mutual to direct polarization force
-#if 0
+    if ( cAmoebaSim.polarizationType ){
-               if (poltyp .eq. 'DIRECT') then;
+        float gfd      = 0.5f * (rr5*scip2*scale3i - rr7*(scip3*sci4+sci3*scip4)*scale5i);
-                  gfd             = 0.5f * (rr5*scip2*scale3i;
+        float fdir1    = gfd*xr + 0.5f*rr5*scale5i* (sci4*atomI.inducedDipolePS[0]+scip4*atomI.inducedDipoleS[0] + sci3*atomJ.inducedDipolePS[0]+scip3*atomJ.inducedDipoleS[0]);
-     &                  - rr7*(scip3*sci4+sci3*scip4)*scale5i);
+        float fdir2    = gfd*yr + 0.5f*rr5*scale5i* (sci4*atomI.inducedDipolePS[1]+scip4*atomI.inducedDipoleS[1] + sci3*atomJ.inducedDipolePS[1]+scip3*atomJ.inducedDipoleS[1]);
-                  fdir1             = gfd*xr + 0.5f*rr5*scale5i;
+        float fdir3    = gfd*zr + 0.5f*rr5*scale5i* (sci4*atomI.inducedDipolePS[2]+scip4*atomI.inducedDipoleS[2] + sci3*atomJ.inducedDipolePS[2]+scip3*atomJ.inducedDipoleS[2]);
-     &                         * (sci4*atomI.inducedDipolePS[0]+scip4*atomI.inducedDipoleS[0];
+        ftm2i1        -= fdir1 - findmp1;
-     &                           +sci3*atomJ.inducedDipolePS[0]+scip3*atomJ.inducedDipoleS[0]);
+        ftm2i2        -= fdir2 - findmp2;
-                  fdir2             = gfd*yr + 0.5f*rr5*scale5i;
+        ftm2i3        -= fdir3 - findmp3;
-     &                         * (sci4*atomI.inducedDipolePS[1]+scip4*atomI.inducedDipoleS[1];
-     &                           +sci3*atomJ.inducedDipolePS[1]+scip3*atomJ.inducedDipoleS[1]);
+    }
-                  fdir3             = gfd*zr + 0.5f*rr5*scale5i;
-     &                         * (sci4*atomI.inducedDipolePS[2]+scip4*atomI.inducedDipoleS[2];
-     &                           +sci3*atomJ.inducedDipolePS[2]+scip3*atomJ.inducedDipoleS[2]);
-                  ftm2i1             = ftm2i1 - fdir1 + findmp1;
-                  ftm2i2             = ftm2i2 - fdir2 + findmp2;
-                  ftm2i3             = ftm2i3 - fdir3 + findmp3;
-               end if;
-#endif
    // now perform the torque calculation
    // intermediate terms for torque between multipoles i and k
@@ -803,24 +795,16 @@ __device__ void calculateKirkwoodEDiffPairIxn_kernel( KirkwoodEDiffParticle& ato
    // correction to convert mutual to direct polarization force;
-#if 0
+    if ( cAmoebaSim.polarizationType ){
-               if (poltyp .eq. 'DIRECT') then;
-                  gfd = 0.5f * (rr5*scip2*scale3i;
+        float gfd    = 0.5f * (rr5*scip2*scale3i- rr7*(scip3*sci4+sci3*scip4)*scale5i);
-     &                  - rr7*(scip3*sci4+sci3*scip4)*scale5i);
+        float fdir1  = gfd*xr + 0.5f*rr5*scale5i* (sci4*atomI.inducedDipoleP[0]+scip4*atomI.inducedDipole[0] + sci3*atomJ.inducedDipoleP[0]+scip3*atomJ.inducedDipole[0]);
-                  fdir1 = gfd*xr + 0.5f*rr5*scale5i;
+        float fdir2  = gfd*yr + 0.5f*rr5*scale5i* (sci4*atomI.inducedDipoleP[1]+scip4*atomI.inducedDipole[1] + sci3*atomJ.inducedDipoleP[1]+scip3*atomJ.inducedDipole[1]);
-     &                         * (sci4*atomI.inducedDipoleP[0]+scip4*atomI.inducedDipole[0];
+        float fdir3  = gfd*zr + 0.5f*rr5*scale5i* (sci4*atomI.inducedDipoleP[2]+scip4*atomI.inducedDipole[2] + sci3*atomJ.inducedDipoleP[2]+scip3*atomJ.inducedDipole[2]);
-     &                           +sci3*atomJ.inducedDipoleP[0]+scip3*atomJ.inducedDipole[0]);
+        ftm2i1      -= fdir1 - findmp1;
-                  fdir2 = gfd*yr + 0.5f*rr5*scale5i;
+        ftm2i2      -= fdir2 - findmp2;
-     &                         * (sci4*atomI.inducedDipoleP[1]+scip4*atomI.inducedDipole[1];
+        ftm2i3      -= fdir3 - findmp3;
-     &                           +sci3*atomJ.inducedDipoleP[1]+scip3*atomJ.inducedDipole[1]);
+    }
-                  fdir3 = gfd*zr + 0.5f*rr5*scale5i;
-     &                         * (sci4*atomI.inducedDipoleP[2]+scip4*atomI.inducedDipole[2];
-     &                           +sci3*atomJ.inducedDipoleP[2]+scip3*atomJ.inducedDipole[2]);
-                  ftm2i1 = ftm2i1 - fdir1 + findmp1;
-                  ftm2i2 = ftm2i2 - fdir2 + findmp2;
-                  ftm2i3 = ftm2i3 - fdir3 + findmp3;
-               end if;
-#endif
    // now perform the torque calculation
    // intermediate terms for torque between multipoles i and k

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.cu
@@ -784,6 +784,15 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe
    }   
 #endif
+    // if polarization type is direct, set flags signalling done and return
+    if( amoebaGpu->amoebaSim.polarizationType )
+    {   
+        amoebaGpu->mutualInducedDone          = 1;
+        amoebaGpu->mutualInducedConverged     = 1;
+        return;
+    }   
    // ---------------------------------------------------------------------------------------
    done      = 0;

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedField.cu
@@ -532,6 +532,15 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
    }   
 #endif
+    // if polarization type is direct, set flags signalling done and return
+    if( amoebaGpu->amoebaSim.polarizationType )
+    {   
+        amoebaGpu->mutualInducedDone          = 1;
+        amoebaGpu->mutualInducedConverged     = 1;
+        return;
+    }   
    // ---------------------------------------------------------------------------------------
    done      = 0;

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPME.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPME.cu
@@ -911,9 +911,16 @@ void kComputeInducedDipoleForceAndEnergy_kernel()
            int j2 = deriv2[k+1];
            int j3 = deriv3[k+1];
-            f.x += (inducedDipole[k]+inducedDipolePolar[k])*phi[j1] + inducedDipole[k]*phip[j1] + inducedDipolePolar[k]*phid[j1];
+            f.x += (inducedDipole[k]+inducedDipolePolar[k])*phi[j1];
-            f.y += (inducedDipole[k]+inducedDipolePolar[k])*phi[j2] + inducedDipole[k]*phip[j2] + inducedDipolePolar[k]*phid[j2];
+            f.y += (inducedDipole[k]+inducedDipolePolar[k])*phi[j2];
-            f.z += (inducedDipole[k]+inducedDipolePolar[k])*phi[j3] + inducedDipole[k]*phip[j3] + inducedDipolePolar[k]*phid[j3];
+            f.z += (inducedDipole[k]+inducedDipolePolar[k])*phi[j3];
+            if( cAmoebaSim.polarizationType == 0 )
+            {
+                f.x += inducedDipole[k]*phip[j1] + inducedDipolePolar[k]*phid[j1];
+                f.y += inducedDipole[k]*phip[j2] + inducedDipolePolar[k]*phid[j2];
+                f.z += inducedDipole[k]*phip[j3] + inducedDipolePolar[k]*phid[j3];
+            }
        }

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.cu
@@ -704,36 +704,22 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
        // correction to convert mutual to direct polarization force
-/*
+        if( cAmoebaSim.polarizationType ){
-        if (poltyp .eq. 'DIRECT') {
-           gfd = 0.5f * (bn2*scip2;
+           float gfd     = 0.5f * (bn2*scip2 - bn3*(scip3*sci4+sci3*scip4));
-&                     - bn3*(scip3*sci4+sci3*scip4));
+           ftm2i1       -= gfd*xr + 0.5f*bn2*(sci4*atomI.inducedDipoleP[0]+scip4*atomI.inducedDipole[0]+sci3*atomJ.inducedDipoleP[0]+scip3*atomJ.inducedDipole[0]);
-           gfdr = 0.5f * (rr5*scip2*usc3;
+           ftm2i2       -= gfd*yr + 0.5f*bn2*(sci4*atomI.inducedDipoleP[1]+scip4*atomI.inducedDipole[1]+sci3*atomJ.inducedDipoleP[1]+scip3*atomJ.inducedDipole[1]);
-&                     - rr7*(scip3*sci4;
+           ftm2i3       -= gfd*zr + 0.5f*bn2*(sci4*atomI.inducedDipoleP[2]+scip4*atomI.inducedDipole[2]+sci3*atomJ.inducedDipoleP[2]+scip3*atomJ.inducedDipole[2]);
-&                           +sci3*scip4)*usc5);
-           ftm2i1 = ftm2i1 - gfd*xr - 0.5f*bn2*;
+           float gfdr    = 0.5f * (rr5*scip2*usc3 - rr7*(scip3*sci4 +sci3*scip4)*usc5);
-&                          (sci4*atomI.inducedDipoleP[0]+scip4*atomI.inducedDipole[0];
+           float fdir1   = gfdr*xr + 0.5f*usc5*rr5*(sci4*atomI.inducedDipoleP[0]+scip4*atomI.inducedDipole[0] + sci3*atomJ.inducedDipoleP[0]+scip3*atomJ.inducedDipole[0]);
-&                          +sci3*atomJ.inducedDipoleP[0]+scip3*atomJ.inducedDipole[0]);
+           float fdir2   = gfdr*yr + 0.5f*usc5*rr5*(sci4*atomI.inducedDipoleP[1]+scip4*atomI.inducedDipole[1] + sci3*atomJ.inducedDipoleP[1]+scip3*atomJ.inducedDipole[1]);
-           ftm2i2 = ftm2i2 - gfd*yr - 0.5f*bn2*;
+           float fdir3   = gfdr*zr + 0.5f*usc5*rr5*(sci4*atomI.inducedDipoleP[2]+scip4*atomI.inducedDipole[2] + sci3*atomJ.inducedDipoleP[2]+scip3*atomJ.inducedDipole[2]);
-&                          (sci4*atomI.inducedDipoleP[1]+scip4*atomI.inducedDipole[1];
-&                          +sci3*atomJ.inducedDipoleP[1]+scip3*atomJ.inducedDipole[1]);
+           ftm2i1       += fdir1 + findmp1;
-           ftm2i3 = ftm2i3 - gfd*zr - 0.5f*bn2*;
+           ftm2i2       += fdir2 + findmp2;
-&                          (sci4*atomI.inducedDipoleP[2]+scip4*atomI.inducedDipole[2];
+           ftm2i3       += fdir3 + findmp3;
-&                          +sci3*atomJ.inducedDipoleP[2]+scip3*atomJ.inducedDipole[2]);
-           fdir1 = gfdr*xr + 0.5f*usc5*rr5*;
-&                         (sci4*atomI.inducedDipoleP[0]+scip4*atomI.inducedDipole[0];
-&                        + sci3*atomJ.inducedDipoleP[0]+scip3*atomJ.inducedDipole[0]);
-           fdir2 = gfdr*yr + 0.5f*usc5*rr5*;
-&                         (sci4*atomI.inducedDipoleP[1]+scip4*atomI.inducedDipole[1];
-&                        + sci3*atomJ.inducedDipoleP[1]+scip3*atomJ.inducedDipole[1]);
-           fdir3 = gfdr*zr + 0.5f*usc5*rr5*;
-&                         (sci4*atomI.inducedDipoleP[2]+scip4*atomI.inducedDipole[2];
-&                        + sci3*atomJ.inducedDipoleP[2]+scip3*atomJ.inducedDipole[2]);
-           ftm2i1 = ftm2i1 + fdir1 + findmp1;
-           ftm2i2 = ftm2i2 + fdir2 + findmp2;
-           ftm2i3 = ftm2i3 + fdir3 + findmp3;
        }
-*/
        // intermediate variables for induced torque terms
@@ -1198,6 +1184,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
 void cudaComputeAmoebaPmeElectrostatic( amoebaGpuContext amoebaGpu )
 {
    cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpu );
    kCalculateAmoebaPMEInducedDipoleForces( amoebaGpu );
    cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpu, amoebaGpu->psTorque );

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
@@ -572,44 +572,25 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
 #ifdef AMOEBA_DEBUG
    if( amoebaGpu->log ){
-        gpuContext gpu = amoebaGpu->gpuContext;
        std::vector<int> fileId;
        VectorOfDoubleVectors outputVector;
        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_Field,            outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_FieldPolar,       outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipole,      outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipolePolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-        cudaWriteVectorOfDoubleVectorsToFile( "CudaEFieldPolarity", fileId, outputVector );
+        cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeEFieldPolarity", fileId, outputVector );
-/*
-        amoebaGpu->psE_FieldPolar->Download();
-        amoebaGpu->psInducedDipole->Download(),
-        amoebaGpu->psInducedDipolePolar->Download();
-        amoebaGpu->psPolarizability->Download();
-        (void) fprintf( amoebaGpu->log, "%s Initial setup for matrix multiply\n", methodName );
-        int offset   = 0;
-        int maxPrint = 10;
-        for( int ii = 0; ii < gpu->natoms; ii++ ){
-            (void) fprintf( amoebaGpu->log, "%4d pol=%12.4e ", ii, 
-                            amoebaGpu->psPolarizability->_pSysData[offset] );
-            if( amoebaGpu->psPolarizability->_pSysData[offset] != amoebaGpu->psPolarizability->_pSysData[offset+1] ||
-                amoebaGpu->psPolarizability->_pSysData[offset] != amoebaGpu->psPolarizability->_pSysData[offset+2] ){
-                (void) fprintf( amoebaGpu->log, "PolX!!! %12.4e %12.4e ", amoebaGpu->psPolarizability->_pSysData[offset+1], amoebaGpu->psPolarizability->_pSysData[offset+2] ); 
-            }
-            (void) fprintf( amoebaGpu->log," E[%14.6e %14.6e %14.6e] Mi[%14.6e %14.6e %14.6e] ",
-                            amoebaGpu->psE_Field->_pSysData[offset],       amoebaGpu->psE_Field->_pSysData[offset+1],       amoebaGpu->psE_Field->_pSysData[offset+2],
-                            amoebaGpu->psInducedDipole->_pSysData[offset], amoebaGpu->psInducedDipole->_pSysData[offset+1], amoebaGpu->psInducedDipole->_pSysData[offset+2] );
-            (void) fprintf( amoebaGpu->log,"Ep[%14.6e %14.6e %14.6e] Mip[%14.6e %14.6e %14.6e]\n",
-                            amoebaGpu->psE_FieldPolar->_pSysData[offset],       amoebaGpu->psE_FieldPolar->_pSysData[offset+1],       amoebaGpu->psE_FieldPolar->_pSysData[offset+2],
-                            amoebaGpu->psInducedDipolePolar->_pSysData[offset], amoebaGpu->psInducedDipolePolar->_pSysData[offset+1], amoebaGpu->psInducedDipolePolar->_pSysData[offset+2] );
-            offset += 3;
-            if( ii == maxPrint && (ii < (gpu->natoms - maxPrint) ) )ii =  (gpu->natoms - maxPrint);
    }   
+#endif
-void) fflush( amoebaGpu->log );
+    // if polarization type is direct, set flags signalling done and return
-*/
+    if( amoebaGpu->amoebaSim.polarizationType )
+    {
+        amoebaGpu->mutualInducedDone          = 1;
+        amoebaGpu->mutualInducedConverged     = 1;
+        kCalculateAmoebaPMEInducedDipoleField( amoebaGpu );
+        return;
    }
-#endif
    // ---------------------------------------------------------------------------------------
@@ -624,35 +605,6 @@ void) fflush( amoebaGpu->log );
        kCalculateAmoebaPMEInducedDipoleField( amoebaGpu );
        LAUNCHERROR("cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply Loop\n");  
-#ifdef GET_EFIELD_FROM_FILE
-{
-    std::string fileName = "waterInduceRecip.txt";
-    StringVectorVector fileContents;
-    readFile( fileName, fileContents );
-    unsigned int offset  = 0;
-    amoebaGpu->psWorkVector[0]->Download();
-    amoebaGpu->psWorkVector[1]->Download();
-    if( amoebaGpu->log )(void) fprintf( amoebaGpu->log, "Read file: %s %u\n", fileName.c_str(), fileContents.size() ); fflush(  amoebaGpu->log );
-    float conversion = 100.0f;
-    for( unsigned int ii = 1; ii < fileContents.size()-1; ii++ ){
-        StringVector lineTokens     = fileContents[ii];
-        unsigned int lineTokenIndex = 1;
-        // (void) fprintf( amoebaGpu->log, "   %u %s %s\n", ii, lineTokens[0].c_str(), lineTokens[lineTokenIndex].c_str() ); fflush(  amoebaGpu->log );
-        amoebaGpu->psWorkVector[0]->_pSysData[offset++]       += conversion*static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
-        amoebaGpu->psWorkVector[0]->_pSysData[offset++]       += conversion*static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
-        amoebaGpu->psWorkVector[0]->_pSysData[offset++]       += conversion*static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str())); 
-        offset                                                     -= 3;        
-        amoebaGpu->psWorkVector[1]->_pSysData[offset++]       += conversion*static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
-        amoebaGpu->psWorkVector[1]->_pSysData[offset++]       += conversion*static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
-        amoebaGpu->psWorkVector[1]->_pSysData[offset++]       += conversion*static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
-    }
-    amoebaGpu->psWorkVector[0]->Upload();
-    amoebaGpu->psWorkVector[1]->Upload();
-}
-#endif
        // post matrix multiply
        kSorUpdateMutualInducedField_kernel<<< numBlocks, numThreads >>>(

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaWcaDispersion.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaWcaDispersion.cu
@@ -353,28 +353,6 @@ __device__ void calculateWcaDispersionPairIxn_kernel( float4 atomCoordinatesI, f
 #define METHOD_NAME(a, b) a##N2ByWarp##b
 #include "kCalculateAmoebaCudaWcaDispersion.h"
-// reduce psWorkArray_3_1 -> outputArray
-static void kReduceWcaDispersion(amoebaGpuContext amoebaGpu, CUDAStream<float>* outputArray )
-{
-    gpuContext gpu = amoebaGpu->gpuContext;
-    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
-                               gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
-                               amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData, 0 );
-    LAUNCHERROR("kReduceWcaDispersion");
-}
-// reduce psWorkArray_3_1 -> outputArray
-static void kReduceWcaDispersionToFloat4(amoebaGpuContext amoebaGpu, CUDAStream<float4>* outputArray )
-{
-    gpuContext gpu = amoebaGpu->gpuContext;
-    kReduceFieldsToFloat4_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
-                                   gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
-                                   amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData );
-    LAUNCHERROR("kReduceWcaDispersion");
-}
 /**---------------------------------------------------------------------------------------
   Compute WCA dispersion
@@ -394,15 +372,6 @@ void kCalculateAmoebaWcaDispersionForces( amoebaGpuContext amoebaGpu )
     gpuContext gpu    = amoebaGpu->gpuContext;
-#ifdef AMOEBA_DEBUG
-    static const char* methodName = "kCalculateAmoebaWcaDispersionForces";
-    int paddedNumberOfAtoms                    = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
-    CUDAStream<float4>* debugArray             = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
-    memset( debugArray->_pSysData,      0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
-    debugArray->Upload();
-    int targetAtom                             = 3;
-#endif
    // set threads/block first time through
    if( threadsPerBlock == 0 ){
@@ -427,222 +396,17 @@ void kCalculateAmoebaWcaDispersionForces( amoebaGpuContext amoebaGpu )
 #endif
    if (gpu->bOutputBufferPerWarp){
        kCalculateAmoebaWcaDispersionN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(WcaDispersionParticle)*threadsPerBlock>>>(
-                                                            amoebaGpu->psWorkUnit->_pDevData,
+                                                            amoebaGpu->psWorkUnit->_pDevData );
-                                                            gpu->psPosq4->_pDevData,
-                                                            amoebaGpu->psWcaDispersionRadiusEpsilon->_pDevData,
-#ifdef AMOEBA_DEBUG
-                                                            amoebaGpu->psWorkArray_3_1->_pDevData,
-                                                            debugArray->_pDevData, targetAtom );
-#else
-                                                            amoebaGpu->psWorkArray_3_1->_pDevData );
-#endif
    } else {
        kCalculateAmoebaWcaDispersionN2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(WcaDispersionParticle)*threadsPerBlock>>>(
-                                                            amoebaGpu->psWorkUnit->_pDevData,
+                                                            amoebaGpu->psWorkUnit->_pDevData );
-                                                            gpu->psPosq4->_pDevData,
-                                                            amoebaGpu->psWcaDispersionRadiusEpsilon->_pDevData,
-#ifdef AMOEBA_DEBUG
-                                                            amoebaGpu->psWorkArray_3_1->_pDevData,
-                                                            debugArray->_pDevData, targetAtom );
-#else
-                                                            amoebaGpu->psWorkArray_3_1->_pDevData );
-#endif
    }
    LAUNCHERROR("kCalculateAmoebaWcaDispersion");  
-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log ){
-        amoebaGpu->psWorkArray_3_1->Download();
-        debugArray->Download();
-        (void) fprintf( amoebaGpu->log,"\n" );
-        int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
-        double sum = 0.0;
-        double sums[8] = { 0.0, 0.0, 0.0, 0.0,0.0, 0.0, 0.0, 0.0 };
-        std::map<int,double> buffers;
-        std::vector< std::vector<double> > trackD;
-        std::vector< std::vector<double> > trackT;
-        std::vector<double> maxD;
-        std::vector< std::vector<int> > trackI;
-        trackD.resize( gpu->natoms );
-        trackT.resize( gpu->natoms );
-        maxD.resize( gpu->natoms );
-        trackI.resize( gpu->natoms );
-        for( int jj = 0; jj < gpu->natoms; jj++ ){
-            unsigned int debugIndex = jj;
-            (void) fprintf( amoebaGpu->log,"%5d %5d DebugWca\n", targetAtom, jj );
-            int block = -1;
-            for( int kk = 0; kk < -3; kk++ ){
-                if( kk == 1 ){
-                    block =  static_cast<int>(debugArray->_pSysData[debugIndex].w + 1.0e-04); 
-                    if( buffers.find(block) == buffers.end() ){
-                        buffers[block] = 0.0;
-                    }
-                }
-                if( kk == 1  && jj != targetAtom ){
-                    sums[0] += debugArray->_pSysData[debugIndex].y;
-                    sums[1] += debugArray->_pSysData[debugIndex].z;
-                    sums[2] += debugArray->_pSysData[debugIndex].w;
-                    double x4 = debugArray->_pSysData[debugIndex].x - (debugArray->_pSysData[debugIndex].y + debugArray->_pSysData[debugIndex].z + debugArray->_pSysData[debugIndex].w);
-                    sums[3] += x4;
-                    //sum     += debugArray->_pSysData[debugIndex].x;
-                    sum     += debugArray->_pSysData[debugIndex].z;
-                    buffers[block] += debugArray->_pSysData[debugIndex].z; 
-                    (void) fprintf( amoebaGpu->log," %16.9e [%16.9e %16.9e %16.9e %16.9e]\n",
-                                    debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
-                                    debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w, x4);
-                } else if( kk == 2 &&  jj != targetAtom){
-                    //sum     += debugArray->_pSysData[debugIndex].x;
-                    sum     += debugArray->_pSysData[debugIndex].z;
-                    sums[4] += debugArray->_pSysData[debugIndex].y;
-                    sums[5] += debugArray->_pSysData[debugIndex].z;
-                    sums[6] += debugArray->_pSysData[debugIndex].w;
-                    double x4 = debugArray->_pSysData[debugIndex].x - (debugArray->_pSysData[debugIndex].y + debugArray->_pSysData[debugIndex].z + debugArray->_pSysData[debugIndex].w);
-                    sums[7] += x4;
-                    buffers[block] += debugArray->_pSysData[debugIndex].z; 
-                    (void) fprintf( amoebaGpu->log," %16.9e [%16.9e %16.9e %16.9e %16.9e]\n",
-                                    debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
-                                    debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w, x4);
-                } else {
-                    (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e] %7u\n",
-                                    debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
-                                    debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w, debugIndex );
-                }
-                if( kk == 5 )(void) fprintf( amoebaGpu->log,"\n" );
-                debugIndex += paddedNumberOfAtoms;
-            }
-            block =  static_cast<int>(debugArray->_pSysData[debugIndex+paddedNumberOfAtoms].w + 1.0e-04); 
-            if( buffers.find(block) == buffers.end() ){
-                buffers[block] = 0.0;
-                maxD[block]    = 0.0;
-            }
-            for( int kk = 0; kk < 3; kk++ ){
-                if( kk == 0 && jj != targetAtom ){
-                    sum            += debugArray->_pSysData[debugIndex].z;
-                    buffers[block] += debugArray->_pSysData[debugIndex].z; 
-                    trackI[block].push_back( jj );
-                    trackD[block].push_back( debugArray->_pSysData[debugIndex].z );
-                    trackT[block].push_back( debugArray->_pSysData[debugIndex].w );
-                    if( fabs( debugArray->_pSysData[debugIndex].w ) > maxD[block] ){
-                        maxD[block] = fabs( debugArray->_pSysData[debugIndex].w );
-                    }
-                    (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
-                                    debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
-                                    debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w);
-                } else if( kk == 2 &&  jj != targetAtom){
-                    sum             += debugArray->_pSysData[debugIndex].z;
-                    buffers[block]  += debugArray->_pSysData[debugIndex].z; 
-                    trackI[block].push_back( jj );
-                    trackD[block].push_back( debugArray->_pSysData[debugIndex].z );
-                    trackT[block].push_back( debugArray->_pSysData[debugIndex].w );
-                    if( fabs( debugArray->_pSysData[debugIndex].w ) > maxD[block] ){
-                        maxD[block] = fabs( debugArray->_pSysData[debugIndex].w );
-                    }
-                    (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
-                                    debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
-                                    debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w);
-                } else {
-                    (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e] %7u\n",
-                                    debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
-                                    debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w, debugIndex );
-                }
-                if( kk == 5 )(void) fprintf( amoebaGpu->log,"\n" );
-                debugIndex += paddedNumberOfAtoms;
-            }
-            (void) fprintf( amoebaGpu->log,"\n" );
-        }
-        (void) fprintf( amoebaGpu->log,"Total sum=%14.7e\n", sum );
-        (void) fprintf( amoebaGpu->log,"DeWW\n" );
-        sum = 0.0;
-        for( int jj = 0; jj < 4; jj++ ){
-            sum += sums[jj] + sums[jj+4];
-            (void) fprintf( amoebaGpu->log,"[%14.7e %14.7e] %14.7e\n", sums[jj], sums[jj+4], sums[jj] + sums[jj+4] );
-        }
-        (void) fprintf( amoebaGpu->log,"Total sum8=%14.7e\n", sum );
-        (void) fprintf( amoebaGpu->log,"Buffers\n", sum );
-        sum = 0.0;
-        for( std::map<int,double>::const_iterator jj = buffers.begin(); jj != buffers.end(); jj++ ){
-            (void) fprintf( amoebaGpu->log,"%5d %14.7e", jj->first, jj->second );
-            sum += jj->second;
-            int block       = jj->first;
-            double sumBlock = 0.0;
-            for( unsigned int kk = 0; kk < trackI[block].size(); kk++ ){
-                 sumBlock +=  trackD[block][kk];
-            }
-            (void) fprintf( amoebaGpu->log," %5u Total=%14.7e MaxD=%14.7e\n", trackI[block].size(), sumBlock, maxD[block]);
-            for( unsigned int kk = 0; kk < trackI[block].size(); kk++ ){
-                 (void) fprintf( amoebaGpu->log,"[%5d %14.7e  %14.7e] ", trackI[block][kk], trackD[block][kk], trackT[block][kk]);
-                 if( ((kk+1)%3) == 0 )(void) fprintf( amoebaGpu->log,"\n" );
-            }
-            (void) fprintf( amoebaGpu->log,"\n\n\n" );
-        }
-        (void) fprintf( amoebaGpu->log,"Total buffer sum=%14.7e\n", sum );
-/*
-        for( int jj = 0; jj < gpu->natoms; jj++ ){
-            int debugIndex   = jj;
-            int debugIndex4  = debugIndex + 4*paddedNumberOfAtoms;
-            int debugIndex5  = debugIndex + 5*paddedNumberOfAtoms;
-            int debugIndex9  = debugIndex + 9*paddedNumberOfAtoms;
-            int debugIndex10 = debugIndex + 10*paddedNumberOfAtoms;
-            (void) fprintf( amoebaGpu->log,"%6d %14.7e %14.7e %14.7e %14.7e %14.7e %14.7e %14.7e %14.7e %14.7e \n", jj,
-                                       debugArray->_pSysData[debugIndex4].x*debugArray->_pSysData[debugIndex4].x,  // r2
-                                       debugArray->_pSysData[debugIndex4].y, debugArray->_pSysData[debugIndex9].y, // de
-                                       debugArray->_pSysData[debugIndex5].x, debugArray->_pSysData[debugIndex10].x, // dll
-                                       debugArray->_pSysData[debugIndex5].y, debugArray->_pSysData[debugIndex10].y, // duu
-                                       debugArray->_pSysData[debugIndex4].z, debugArray->_pSysData[debugIndex9].z ); // emxio
-        }
-*/
-    }
-#endif
-    kReduceWcaDispersionToFloat4( amoebaGpu, gpu->psForce4 );
-#ifdef AMOEBA_DEBUG
-    if( 0 && amoebaGpu->log ){
-        gpu->psEnergy->Download();
-        double sum = 0.0;
-        for (int i = 0; i < gpu->sim.energyOutputBuffers; i++){
-            sum  += gpu->psEnergy->_pSysData[i];
-            if( fabsf( (*gpu->psEnergy)[i]) > 0.0 )
-                (void) fprintf( amoebaGpu->log,"SumQQ %6d %14.7e QQ SUM\n", i, (*gpu->psEnergy)[i] );
-        }   
-        (void) fprintf( amoebaGpu->log,"%14.7e QQ SUM\n", sum );
-    }
-#endif
-    if( 0 ){
-        int paddedNumberOfAtoms             = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
-        CUDAStream<float>* psTempForce      = new CUDAStream<float>(4*paddedNumberOfAtoms, 1, "psTempForce");
-        kClearFloat( amoebaGpu, 4*paddedNumberOfAtoms, psTempForce );
-        kReduceWcaDispersion( amoebaGpu, psTempForce );
-        std::vector<int> fileId;
-        //fileId.push_back( 0 );
-        VectorOfDoubleVectors outputVector;
-        cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,   outputVector, NULL, 1.0f );
-        cudaLoadCudaFloatArray(  gpu->natoms, 3, psTempForce,    outputVector, NULL, 1.0f );
-        cudaWriteVectorOfDoubleVectorsToFile( "CudaWca", fileId, outputVector );
-        delete psTempForce;
-        //exit(0);
-     }
-#ifdef AMOEBA_DEBUG
-    delete debugArray;
-    //exit(0);
-#endif
   // ---------------------------------------------------------------------------------------
 }
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaWcaDispersion.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaWcaDispersion.h
@@ -32,15 +32,8 @@ __launch_bounds__(192, 1)
 #else
 __launch_bounds__(64, 1)
 #endif
-void METHOD_NAME(kCalculateAmoebaWcaDispersion, _kernel)(
-                            unsigned int* workUnit,
+void METHOD_NAME(kCalculateAmoebaWcaDispersion, _kernel)( unsigned int* workUnit ){
-                            float4* atomCoord,
-                            float2*  wcaDispersionParameters,
-                            float* outputForce
-#ifdef AMOEBA_DEBUG
-                           , float4* debugArray, unsigned int targetAtom
-#endif
-){
    extern __shared__ WcaDispersionParticle sA[];
@@ -56,9 +49,6 @@ void METHOD_NAME(kCalculateAmoebaWcaDispersion, _kernel)(
    float jEpsilon;
    float totalEnergy            = 0.0f;
-#ifdef AMOEBA_DEBUG
-    float4 pullDebug[3];
-#endif
    while (pos < end)
    {
@@ -76,9 +66,9 @@ void METHOD_NAME(kCalculateAmoebaWcaDispersion, _kernel)(
        WcaDispersionParticle*  psA      = &sA[tbx];
        unsigned int atomI               = x + tgx;
-        float4 iCoord                    = atomCoord[atomI];
+        float4 iCoord                    = cSim.pPosq[atomI];
-        float iRadius                    = wcaDispersionParameters[atomI].x;
+        float iRadius                    = cAmoebaSim.pWcaDispersionRadiusEpsilon[atomI].x;
-        float iEpsilon                   = wcaDispersionParameters[atomI].y;
+        float iEpsilon                   = cAmoebaSim.pWcaDispersionRadiusEpsilon[atomI].y;
        float forceSum[3];
@@ -100,7 +90,7 @@ void METHOD_NAME(kCalculateAmoebaWcaDispersion, _kernel)(
        if (lasty != y)
        {
-            loadWcaDispersionShared( &(sA[threadIdx.x]), (y+tgx), atomCoord, wcaDispersionParameters );
+            loadWcaDispersionShared( &(sA[threadIdx.x]), (y+tgx), cSim.pPosq, cAmoebaSim.pWcaDispersionRadiusEpsilon );
        }
@@ -124,66 +114,8 @@ void METHOD_NAME(kCalculateAmoebaWcaDispersion, _kernel)(
                                                  iRadius,jRadius,
                                                  rmixo,  rmixh,
                                                  emixo,  emixh,
-                                                  ijForce, &energy
+                                                  ijForce, &energy);
-#ifdef AMOEBA_DEBUG
-,  pullDebug
-#endif
-   );
-#ifdef AMOEBA_DEBUG
-if( (atomI == targetAtom) || ( (y+tj) == targetAtom ) ){
-unsigned int index                 = (atomI == targetAtom) ? (y + tj) : atomI;
-debugArray[index].x                = (float) atomI;
-debugArray[index].y                = (float) (y + tj); 
-//debugArray[index].z                = (float) cSim.atoms;
-debugArray[index].z                = atomI == (y+tj) ? 0.0f : energy;
-energy                             = ( (atomI != (y+tj)) && (atomI < cSim.atoms) && ((y+tj) < cSim.atoms) )  ? (energy) : 0.0f;
-debugArray[index].w                = energy+totalEnergy;
-index                             += cSim.paddedNumberOfAtoms;
-debugArray[index].x                = iCoord.x;
-debugArray[index].y                = iCoord.y;
-debugArray[index].z                = iCoord.z;
-debugArray[index].w                = (float) (blockIdx.x * blockDim.x + threadIdx.x);
-index                             += cSim.paddedNumberOfAtoms;
-debugArray[index].x                = jCoord.x;
-debugArray[index].y                = jCoord.y;
-debugArray[index].z                = jCoord.z;
-debugArray[index].w                = -4.0f;
-index                             += cSim.paddedNumberOfAtoms;
-debugArray[index].x                = emixo;
-debugArray[index].y                = emixh;
-debugArray[index].z                = rmixo;
-debugArray[index].w                = rmixh;
-index                             += cSim.paddedNumberOfAtoms;
-debugArray[index].x                = pullDebug[0].x;
-debugArray[index].y                = pullDebug[0].y;
-debugArray[index].z                = pullDebug[0].z;
-debugArray[index].w                = pullDebug[0].w;
-#if 0
-index                             += cSim.paddedNumberOfAtoms;
-debugArray[index].x                = pullDebug[1].x;
-debugArray[index].y                = pullDebug[1].y;
-debugArray[index].z                = pullDebug[1].z;
-debugArray[index].w                = pullDebug[1].w;
-index                             += cSim.paddedNumberOfAtoms;
-debugArray[index].x                = pullDebug[2].x;
-debugArray[index].y                = pullDebug[2].y;
-debugArray[index].z                = pullDebug[2].z;
-debugArray[index].w                = pullDebug[2].w;
-#endif
-} else {
-//    energy = 0.0f;
-}
-#endif
            if( (atomI != (y+tj)) && (atomI < cSim.atoms) && ((y+tj) < cSim.atoms) ){
                // add to field at atomI the field due atomJ's dipole
@@ -209,54 +141,8 @@ debugArray[index].w                = pullDebug[2].w;
                                                  jRadius,iRadius,
                                                  rmjxo,  rmjxh,
                                                  emjxo,  emjxh,
-                                                  ijForce, &energy
+                                                  ijForce, &energy);
-#ifdef AMOEBA_DEBUG
-,  pullDebug
-#endif
-   );
-#ifdef AMOEBA_DEBUG
-if( (atomI == targetAtom) || ( (y+tj) == targetAtom ) ){
-unsigned int index                 = (atomI == targetAtom) ? (y + tj) : atomI;
-index                             += 2*cSim.paddedNumberOfAtoms;
-debugArray[index].x                = (float) atomI;
-debugArray[index].y                = (float) (y + tj); 
-debugArray[index].z                = atomI == (y+tj) ? 0.0f : energy;
-energy                             = ( (atomI != (y+tj)) && (atomI < cSim.atoms) && ((y+tj) < cSim.atoms) )  ? (energy) : 0.0f;
-debugArray[index].w                = energy+totalEnergy;
-//debugArray[index].w                = -2.0f;
-index                             += cSim.paddedNumberOfAtoms;
-debugArray[index].x                = emjxo;
-debugArray[index].y                = emjxh;
-debugArray[index].z                = rmjxo;
-debugArray[index].w                = rmjxh;
-index                             += cSim.paddedNumberOfAtoms;
-debugArray[index].x                = pullDebug[0].x;
-debugArray[index].y                = pullDebug[0].y;
-debugArray[index].z                = pullDebug[0].z;
-debugArray[index].w                = pullDebug[0].w;
-#if 0
-index                             += cSim.paddedNumberOfAtoms;
-debugArray[index].x                = pullDebug[1].x;
-debugArray[index].y                = pullDebug[1].y;
-debugArray[index].z                = pullDebug[1].z;
-debugArray[index].w                = pullDebug[1].w;
-index                             += cSim.paddedNumberOfAtoms;
-debugArray[index].x                = pullDebug[2].x;
-debugArray[index].y                = pullDebug[2].y;
-debugArray[index].z                = pullDebug[2].z;
-debugArray[index].w                = pullDebug[2].w;
-#endif
-} else {
-    //energy = 0.0f;
-}
-#endif
            if( (atomI != (y+tj)) && (atomI < cSim.atoms) && ((y+tj) < cSim.atoms) ){
                // add to field at atomI the field due atomJ's dipole
@@ -281,24 +167,32 @@ debugArray[index].w                = pullDebug[2].w;
        // Write results
 #ifdef USE_OUTPUT_BUFFER_PER_WARP
-        unsigned int offset                 = 3*(x + tgx + warp*cSim.paddedNumberOfAtoms);
+        unsigned int offset                 = (x + tgx + warp*cSim.paddedNumberOfAtoms);
-        load3dArrayBufferPerWarp( offset, forceSum,       outputForce );
+        add3dArrayToFloat4( offset, forceSum,  cSim.pForce4);
-        offset                              = 3*(y + tgx + warp*cSim.paddedNumberOfAtoms);
+        // include diagonal only once
-        load3dArrayBufferPerWarp( offset, sA[threadIdx.x].force,       outputForce );
+        if( x != y ){
+            offset                              = (y + tgx + warp*cSim.paddedNumberOfAtoms);
+            add3dArrayToFloat4( offset, sA[threadIdx.x].force, cSim.pForce4);
+        }
 #else
-        unsigned int offset                 = 3*(x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms);
+        unsigned int offset                 = (x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms);
-        load3dArray( offset, forceSum,       outputForce );
+        add3dArrayToFloat4( offset, forceSum,    cSim.pForce4);
-        offset                              = 3*(y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
+        // include diagonal only once
-        load3dArray( offset, sA[threadIdx.x].force,       outputForce );
+        if( x != y ){
+            offset                              = (y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
+            add3dArrayToFloat4( offset, sA[threadIdx.x].force,    cSim.pForce4 );
+        }
 #endif
        lasty = y;
        pos++;
    }
    cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] -= cAmoebaSim.awater*totalEnergy;
    if( (blockIdx.x*blockDim.x + threadIdx.x) == 0 ){
        cSim.pEnergy[0] += cAmoebaSim.totalMaxWcaDispersionEnergy;