Initial RealSpaceEwald code for AmoebaMultipoleForce

Removal SASA code Versioning introduced in AmoebaTinkerParameterFile Decomposition of PME forces included in prm files

Initial RealSpaceEwald code for AmoebaMultipoleForce
Removal SASA code Versioning introduced in AmoebaTinkerParameterFile Decomposition of PME forces included in prm files
3bcfe998 · Mark Friedrichs · f981842a · 3bcfe998 · 3bcfe998 · 3bcfe998
Commit 3bcfe998 authored Aug 27, 2010 by Mark Friedrichs
14 changed files
--- a/plugins/amoeba/openmmapi/include/AmoebaMultipoleForce.h
+++ b/plugins/amoeba/openmmapi/include/AmoebaMultipoleForce.h
@@ -50,6 +50,19 @@ namespace OpenMM {
 class OPENMM_EXPORT AmoebaMultipoleForce : public Force {
 public:
 
+    enum AmoebaNonbondedMethod {
+        /** 
+         * No cutoff is applied to nonbonded interactions.  The full set of N^2 interactions is computed exactly.
+         * This necessarily means that periodic boundary conditions cannot be used.  This is the default.
+         */
+        NoCutoff = 0,
+        /**
+         * Periodic boundary conditions are used, and Particle-Mesh Ewald (PME) summation is used to compute the interaction of each particle
+         * with all periodic copies of every other particle.
+         */
+        PME = 1 
+    };  
+
    enum MultipoleAxisTypes { ZThenX, Bisector };

    // Algorithm used to converge mutual induced dipoles:
@@ -74,6 +87,46 @@ public:
        return multipoles.size();
    }

+    /**
+     * Get the method used for handling long range nonbonded interactions.
+     */
+    AmoebaNonbondedMethod getNonbondedMethod( void ) const;
+
+    /**
+     * Set the method used for handling long range nonbonded interactions.
+     */
+    void setNonbondedMethod(AmoebaNonbondedMethod method);
+
+    /**
+     * Get the cutoff distance (in nm) being used for nonbonded interactions.  If the NonbondedMethod in use
+     * is NoCutoff, this value will have no effect.
+     *
+     * @return the cutoff distance, measured in nm
+     */
+    double getCutoffDistance( void ) const;
+
+    /**
+     * Set the cutoff distance (in nm) being used for nonbonded interactions.  If the NonbondedMethod in use
+     * is NoCutoff, this value will have no effect.
+     *
+     * @param distance    the cutoff distance, measured in nm
+     */
+    void setCutoffDistance(double distance);
+
+    /**
+     * Get the aEwald parameter
+     *
+     * @return the Ewald parameter
+     */
+    double getAEwald() const;
+
+    /**
+     * Set the aEwald parameter
+     *
+     * @param Ewald parameter
+     */
+    void setAEwald(double aewald);
+
    /**
     * Add multipole-related info for a particle 
     *
@@ -224,6 +277,9 @@ protected:
    ForceImpl* createImpl();
 private:

+    AmoebaNonbondedMethod nonbondedMethod;
+    double cutoffDistance;
+    double aewald;
    MutualInducedIterationMethod mutualInducedIterationMethod;
    int mutualInducedMaxIterations;
    double mutualInducedTargetEpsilon;

--- a/plugins/amoeba/openmmapi/src/AmoebaMultipoleForce.cpp
+++ b/plugins/amoeba/openmmapi/src/AmoebaMultipoleForce.cpp
@@ -36,15 +36,32 @@

 using namespace OpenMM;

-AmoebaMultipoleForce::AmoebaMultipoleForce() {
+AmoebaMultipoleForce::AmoebaMultipoleForce() : nonbondedMethod(NoCutoff), cutoffDistance(1.0), aewald(0.0), mutualInducedIterationMethod(SOR), mutualInducedMaxIterations(60),
+                                               mutualInducedTargetEpsilon(1.0e-05), scalingDistanceCutoff(100.0), electricConstant(138.9354558456) {
+}
+
+AmoebaMultipoleForce::AmoebaNonbondedMethod AmoebaMultipoleForce::getNonbondedMethod( void ) const {
+    return nonbondedMethod;
+}
+
+void AmoebaMultipoleForce::setNonbondedMethod( AmoebaMultipoleForce::AmoebaNonbondedMethod method) {
+    nonbondedMethod = method;
+}

-    mutualInducedIterationMethod  = SOR;
-    mutualInducedMaxIterations    = 60;
-    mutualInducedTargetEpsilon    = 1.0e-06;
-    scalingDistanceCutoff         = 100.0;
+double AmoebaMultipoleForce::getCutoffDistance( void ) const {
+    return cutoffDistance;
+}
+
+void AmoebaMultipoleForce::setCutoffDistance(double distance) {
+    cutoffDistance = distance;
+}
+
+double AmoebaMultipoleForce::getAEwald() const {
+    return aewald;
+}

-                                    // ONE_4PI_EPS0
-    electricConstant              = 138.9354558456;
+void AmoebaMultipoleForce::setAEwald(double inputAewald ) {
+    aewald = inputAewald;
 }

 AmoebaMultipoleForce::MutualInducedIterationMethod AmoebaMultipoleForce::getMutualInducedIterationMethod( void ) const {

--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaData.h
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaData.h
@@ -30,7 +30,6 @@
 #include "CudaPlatform.h"
 #include "kernels/amoebaGpuTypes.h"
 #include "kernels/cudaKernels.h"
-#include "kernels/amoebaCudaKernels.h"
 #include "openmm/KernelImpl.h"

 namespace OpenMM {

--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
@@ -575,12 +575,20 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
         throw OpenMMException("Iterative method for mutual induced dipoles not recognized.\n");
    }

+    int nonbondedMethod = static_cast<int>(force.getNonbondedMethod());
+    if( nonbondedMethod != 0 && nonbondedMethod != 1 ){
+         throw OpenMMException("AmoebaMultipoleForce nonbonded method not recognized.\n");
+    }
+
    gpuSetAmoebaMultipoleParameters(data.getAmoebaGpu(), charges, dipoles, quadrupoles, axisTypes, multipoleAtomId1s, multipoleAtomId2s,
                                    tholes, scalingDistanceCutoff, dampingFactors, polarity,
                                    multipoleAtomCovalentInfo, covalentDegree, minCovalentIndices, minCovalentPolarizationIndices, (maxCovalentRange+2),
                                    static_cast<int>(force.getMutualInducedIterationMethod()),
                                    force.getMutualInducedMaxIterations(),
                                    static_cast<float>( force.getMutualInducedTargetEpsilon()),
+                                    nonbondedMethod,
+                                    static_cast<float>( force.getCutoffDistance()),
+                                    static_cast<float>( force.getAEwald()),
                                    static_cast<float>( force.getElectricConstant()) );

 }

--- a/plugins/amoeba/platforms/cuda/src/kernels/AmoebaGpu.cpp
+++ b/plugins/amoeba/platforms/cuda/src/kernels/AmoebaGpu.cpp
@@ -29,6 +29,7 @@
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */

+#include "openmm/OpenMMException.h"
 #include "cudaKernels.h"
 #include "amoebaCudaKernels.h"

@@ -332,7 +333,11 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
    (void) fprintf( log, "     pD_ScaleIndices                    %p\n",      amoebaGpu->amoebaSim.pD_ScaleIndices );
    (void) fprintf( log, "     pP_ScaleIndices                    %p\n",      amoebaGpu->amoebaSim.pP_ScaleIndices );
    (void) fprintf( log, "     pM_ScaleIndices                    %p\n",      amoebaGpu->amoebaSim.pM_ScaleIndices );
+    (void) fprintf( log, "     sqrtPi                             %15.7e\n",  amoebaGpu->amoebaSim.sqrtPi );
+    (void) fprintf( log, "     cutoffDistance2                    %15.7e\n",  amoebaGpu->amoebaSim.cutoffDistance2 );
+    (void) fprintf( log, "     aewald                             %15.7e\n",  amoebaGpu->amoebaSim.aewald );
    (void) fprintf( log, "     electric                           %15.7e\n",  amoebaGpu->amoebaSim.electric );
+    (void) fprintf( log, "     box                                %15.7e %15.7e %15.7e\n", gpu->sim.periodicBoxSizeX, gpu->sim.periodicBoxSizeY, gpu->sim.periodicBoxSizeZ);
    (void) fprintf( log, "     gkc                                %15.7e\n",  amoebaGpu->amoebaSim.gkc );
    (void) fprintf( log, "     dielec                             %15.7e\n",  amoebaGpu->amoebaSim.dielec );
    (void) fprintf( log, "     dwater                             %15.7e\n",  amoebaGpu->amoebaSim.dwater );
@@ -1429,7 +1434,8 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
                                     const std::vector<float>& tholes, float scalingDistanceCutoff,const std::vector<float>& dampingFactors, const std::vector<float>& polarity,
                                     const std::vector< std::vector< std::vector<int> > >& multipoleParticleCovalentInfo, const std::vector<int>& covalentDegree,
                                     const std::vector<int>& minCovalentIndices,  const std::vector<int>& minCovalentPolarizationIndices, int maxCovalentRange, 
-                                     int mutualInducedIterativeMethod, int mutualInducedMaxIterations, float mutualInducedTargetEpsilon, float electricConstant ){
+                                     int mutualInducedIterativeMethod, int mutualInducedMaxIterations, float mutualInducedTargetEpsilon,
+                                     int nonbondedMethod, float cutoffDistance, float aewald, float electricConstant ){

 // ---------------------------------------------------------------------------------------

@@ -1514,6 +1520,16 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
        amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysStream[0][ii].z   = ii;
    }

+    if( nonbondedMethod == 0 ){
+        amoebaGpu->multipoleNonbondedMethod = AMOEBA_NO_CUTOFF;
+    } else if( nonbondedMethod == 1 ){
+        amoebaGpu->multipoleNonbondedMethod = AMOEBA_PARTICLE_MESH_EWALD;
+    } else {
+        throw OpenMM::OpenMMException("multipoleNonbondedMethod not recognzied.\n" );
+    }
+    amoebaGpu->amoebaSim.cutoffDistance2    = cutoffDistance*cutoffDistance;
+    amoebaGpu->amoebaSim.sqrtPi             = sqrt( 3.1415926535897932384626433832795 );
+    amoebaGpu->amoebaSim.aewald             = aewald;
    amoebaGpu->amoebaSim.electric           = electricConstant;
    if( amoebaGpu->amoebaSim.dielec < 1.0e-05 ){
        amoebaGpu->amoebaSim.dielec      = 1.0f;
@@ -2432,73 +2448,6 @@ void gpuSetAmoebaWcaDispersionParameters( amoebaGpuContext amoebaGpu,

 }

-extern "C"
-void gpuSetAmoebaSASAParameters( amoebaGpuContext amoebaGpu, float probeRadius, 
-                                 const std::vector<float>& radii, const std::vector<float>& weights )
-{
-   // ---------------------------------------------------------------------------------------
-
-    static const char* methodName = "gpuSetAmoebaSASAParameters";
-    static const int maxarc       = 500;
-
-   // ---------------------------------------------------------------------------------------
-
-
-    gpuContext gpu                         = amoebaGpu->gpuContext;
-    amoebaGpu->paddedNumberOfAtoms         = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
-    unsigned int particles                 = radii.size();
-    if( particles < 1 ){
-        (void) fprintf( stderr, "%s no particles\n", methodName );
-        return;
-    } 
-(void) fprintf( stderr, "%s radius converted Ang !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n", methodName );
-float scaleRadius = 10.0f;
-
-    amoebaGpu->amoebaSim.probeRadius       = probeRadius;
-    amoebaGpu->amoebaSim.maxarc            = maxarc;
-    amoebaGpu->psSASA_Radii                = new CUDAStream<float>(amoebaGpu->paddedNumberOfAtoms,   1, "SASARadii");
-    amoebaGpu->psSASA_Weights              = new CUDAStream<float>(amoebaGpu->paddedNumberOfAtoms,   1, "SASAWeights");
-    for (unsigned int ii = 0; ii < particles; ii++) 
-    {    
-        amoebaGpu->psSASA_Radii->_pSysStream[0][ii]     = scaleRadius*( radii[ii] + probeRadius );
-        amoebaGpu->psSASA_Weights->_pSysStream[0][ii]   = weights[ii];
-    }    
-
-    // Dummy out extra particles data
-    for (unsigned int ii = particles; ii < amoebaGpu->paddedNumberOfAtoms; ii++) 
-    {    
-        amoebaGpu->psSASA_Radii->_pSysStream[0][ii]     = 1.0f;
-        amoebaGpu->psSASA_Weights->_pSysStream[0][ii]   = 0.0f;
-    }    
-    for (unsigned int ii = 0; ii < 4; ii++) 
-    {
-        amoebaGpu->psIntWorkArray[ii] = new CUDAStream<int>(amoebaGpu->amoebaSim.maxarc*amoebaGpu->paddedNumberOfAtoms,   1, "SASAIntWorkArray");
-    }
-    amoebaGpu->psFloatWorkArray = new CUDAStream<float>(amoebaGpu->amoebaSim.maxarc*amoebaGpu->paddedNumberOfAtoms,   1, "SASAFloatWorkArray");
-    amoebaGpu->psIoListCount    = new CUDAStream<int>(amoebaGpu->paddedNumberOfAtoms,   1, "SASAIoCount");
-    amoebaGpu->psDoneAtom       = new CUDAStream<int>(amoebaGpu->paddedNumberOfAtoms,   1, "SASADoneAtom");
-
-    amoebaGpu->psSASA_Radii->Upload();
-    amoebaGpu->psSASA_Weights->Upload();
-
-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log ){
-        unsigned int maxPrint = 32;
-        (void) fprintf( amoebaGpu->log, "%s probeRadius=%12.3f\n", methodName, probeRadius );
-        for (unsigned int ii = 0; ii < gpu->natoms; ii++) 
-        {    
-            (void) fprintf( amoebaGpu->log, "%5u %15.7e %15.7e\n", ii, radii[ii], weights[ii] );
-            if( ii == maxPrint && ii < (amoebaGpu->paddedNumberOfAtoms - maxPrint) )
-            {
-                ii = (amoebaGpu->paddedNumberOfAtoms - maxPrint);
-            }
-        } 
-        (void) fflush( amoebaGpu->log );
-    }    
-#endif
-
-}
-
 extern "C"
 void amoebaGpuShutDown(amoebaGpuContext gpu)
 {
@@ -2584,16 +2533,6 @@ void amoebaGpuShutDown(amoebaGpuContext gpu)

    delete gpu->psWcaDispersionRadiusEpsilon;

-    delete gpu->psSASA_Radii;
-    delete gpu->psSASA_Weights;
-    //delete gpu->psSASA_WeightIntWorkArray[0];
-    //delete gpu->psSASA_WeightIntWorkArray[1];
-    //delete gpu->psSASA_WeightIntWorkArray[2];
-    //delete gpu->psSASA_WeightIntWorkArray[3];
-    delete gpu->psDoneAtom;
-    delete gpu->psIoListCount;
-    delete gpu->psFloatWorkArray;
-
    delete gpu->psWorkArray_3_1; 
    delete gpu->psWorkArray_3_2; 
    delete gpu->psWorkArray_3_3; 
@@ -2648,7 +2587,6 @@ void amoebaGpuSetConstants(amoebaGpuContext amoebaGpu)

    gpuSetAmoebaBondOffsets( amoebaGpu );
    SetCalculateAmoebaLocalForcesSim( amoebaGpu );
-    //SetCalculateAmoebaCudaSASAForcesSim( amoebaGpu );
    SetForcesSim( amoebaGpu->gpuContext );
    SetCalculateAmoebaMultipoleForcesSim( amoebaGpu );
    SetCalculateAmoebaCudaFixedEFieldSim( amoebaGpu );
@@ -2656,12 +2594,12 @@ void amoebaGpuSetConstants(amoebaGpuContext amoebaGpu)
    SetCalculateAmoebaCudaWcaDispersionSim( amoebaGpu );
    SetCalculateAmoebaCudaMutualInducedFieldSim( amoebaGpu );
    SetCalculateAmoebaElectrostaticSim( amoebaGpu );
+    SetCalculateAmoebaRealSpaceEwaldSim( amoebaGpu );
    SetCalculateAmoebaCudaMapTorquesSim( amoebaGpu );
    SetCalculateAmoebaKirkwoodSim( amoebaGpu );
    SetCalculateAmoebaKirkwoodEDiffSim( amoebaGpu );
    SetCalculateAmoebaCudaFixedEAndGKFieldsSim( amoebaGpu );
    SetCalculateAmoebaCudaMutualInducedAndGkFieldsSim( amoebaGpu );
-    //SetCalculateAmoebaObcGbsaForces2Sim( amoebaGpu );
    SetCalculateObcGbsaForces2Sim(  amoebaGpu->gpuContext  );
 }


--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
@@ -97,6 +97,10 @@ extern void SetCalculateAmoebaElectrostaticSim( amoebaGpuContext amoebaGpu );
 extern void GetCalculateAmoebaElectrostaticSim( amoebaGpuContext amoebaGpu );
 extern void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu );

+extern void SetCalculateAmoebaRealSpaceEwaldSim( amoebaGpuContext amoebaGpu );
+extern void GetCalculateAmoebaRealSpaceEwaldSim( amoebaGpuContext amoebaGpu );
+extern void cudaComputeAmoebaRealSpaceEwald( amoebaGpuContext amoebaGpu );
+
 extern void SetCalculateAmoebaCudaMapTorquesSim(amoebaGpuContext gpu);
 extern void GetCalculateAmoebaCudaMapTorquesSim(amoebaGpuContext gpu);
 extern void cudaComputeAmoebaMapTorques( amoebaGpuContext gpu, CUDAStream<float>* psTorque, CUDAStream<float>* psForce);
@@ -144,7 +148,7 @@ extern void kClearFields_1( amoebaGpuContext amoebaGpu );
 extern void kClearFields_3( amoebaGpuContext amoebaGpu, unsigned int numberToClear );
 extern unsigned int getThreadsPerBlock( amoebaGpuContext amoebaGpu, unsigned int sharedMemoryPerThread );

-extern int isNanOrInfinity( double number );
+//extern int isNanOrInfinity( double number );
 extern void trackMutualInducedIterations( amoebaGpuContext amoebaGpu, int iteration);

 #endif //__AMOEBA_GPU_TYPES_H__

--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaTypes.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaTypes.h
@@ -41,6 +41,12 @@
 #include <builtin_types.h>
 #include <vector_functions.h>

+enum CudaAmoebaNonbondedMethod
+{
+    AMOEBA_NO_CUTOFF,
+    AMOEBA_PARTICLE_MESH_EWALD
+};
+
 struct cudaAmoebaGmxSimulation {
    // Constants

@@ -118,6 +124,9 @@ struct cudaAmoebaGmxSimulation {

    unsigned int numberOfAtoms;                     // number of atoms
    unsigned int paddedNumberOfAtoms;               // padded number of atoms
+    float cutoffDistance2;                          // cutoff distance squared for PME
+    float sqrtPi;                                   // sqrt(PI)
+    float aewald;                                   // aewald parameter
    float scalingDistanceCutoff;                    // scaling cutoff
    float2*         pDampingFactorAndThole;         // Thole & damping factors

@@ -168,89 +177,6 @@ struct cudaAmoebaGmxSimulation {
    float fd;        // electric * 2.0f * (1.0f-dwater)/(1.0f+2.0f*dwater);
    float fq;        // electric * 3.0f * (1.0f-dwater)/(2.0f+3.0f*dwater);

-    // SASA probe radius
-    float probeRadius; 
-    int maxarc; 
-
-//    texture<float4,2,cudaReadModeElementType> texTorTorGrid;
-
-#if 0
-    unsigned int    dihedrals;                      // Number of dihedrals
-    int4*           pDihedralID1;                   // Dihedral IDs
-    int4*           pDihedralID2;                   // Dihedral output buffer IDs
-    float4*         pDihedralParameter;             // Dihedral parameters
-    unsigned int    rb_dihedrals;                   // Number of Ryckaert Bellemans dihedrals
-    int4*           pRbDihedralID1;                 // Ryckaert Bellemans Dihedral IDs
-    int4*           pRbDihedralID2;                 // Ryckaert Bellemans Dihedral output buffer IDs
-    float4*         pRbDihedralParameter1;          // Ryckaert Bellemans Dihedral parameters
-    float2*         pRbDihedralParameter2;          // Ryckaert Bellemans Dihedral parameters
-    unsigned int    LJ14s;                          // Number of Lennard Jones 1-4 interactions
-    int4*           pLJ14ID;                        // Lennard Jones 1-4 atom and output buffer IDs
-    float4*         pLJ14Parameter;                 // Lennard Jones 1-4 parameters
-    float           inverseTotalMass;               // Used in linear momentum removal
-    unsigned int    ShakeConstraints;               // Total number of Shake constraints
-    unsigned int    settleConstraints;              // Total number of Settle constraints
-    unsigned int    ccmaConstraints;                // Total number of CCMA constraints.
-    unsigned int    rigidClusters;                  // Total number of rigid clusters
-    unsigned int    maxRigidClusterSize;            // The size of the largest rigid cluster
-    unsigned int    clusterShakeBlockSize;          // The number of threads to process each rigid cluster
-    unsigned int    NonShakeConstraints;            // Total number of NonShake atoms
-    unsigned int    maxShakeIterations;             // Maximum shake iterations
-    unsigned int    degreesOfFreedom;               // Number of degrees of freedom in system
-    float           shakeTolerance;                 // Shake tolerance
-    float           InvMassJ;                       // Shake inverse mass for hydrogens
-    int*            pNonShakeID;                    // Not Shaking atoms
-    int4*           pShakeID;                       // Shake atoms and phase
-    float4*         pShakeParameter;                // Shake parameters
-    int4*           pSettleID;                      // Settle atoms
-    float2*         pSettleParameter;               // Settle parameters
-    unsigned int*   pExclusion;                     // Nonbond exclusion data
-    unsigned int*   pExclusionIndex;                // Index of exclusion data for each work unit
-    unsigned int    dihedral_offset;                // Offset to end of dihedrals
-    unsigned int    rb_dihedral_offset;             // Offset to end of Ryckaert Bellemans dihedrals
-    unsigned int    LJ14_offset;                    // Offset to end of Lennard Jones 1-4 parameters
-    int*            pAtomIndex;                     // The original index of each atom
-    float4*         pGridBoundingBox;               // The size of each grid cell
-    float4*         pGridCenter;                    // The center of each grid cell
-    int2*           pCcmaAtoms;                     // The atoms connected by each CCMA constraint
-    float4*         pCcmaDistance;                  // The displacement vector (x, y, z) and constraint distance (w) for each CCMA constraint
-    float*          pCcmaDelta1;                    // Workspace for CCMA
-    float*          pCcmaDelta2;                    // Workspace for CCMA
-    int*            pCcmaAtomConstraints;           // The indices of constraints involving each atom
-    int*            pCcmaNumAtomConstraints;        // The number of constraints involving each atom
-    short*          pSyncCounter;                   // Used for global thread synchronization
-    unsigned int*   pRequiredIterations;            // Used by CCMA to communicate whether iteration has converged
-    float*          pCcmaReducedMass;               // The reduced mass for each CCMA constraint
-    unsigned int*   pConstraintMatrixColumn;        // The column of each element in the constraint matrix.
-    float*          pConstraintMatrixValue;         // The value of each element in the constraint matrix.
-
-    // Mutable stuff
-    float4*         pPosq;                          // Pointer to atom positions and charges
-    float4*         pPosqP;                         // Pointer to mid-integration atom positions
-    float4*         pOldPosq;                       // Pointer to old atom positions
-    float4*         pVelm4;                         // Pointer to atom velocity and inverse mass
-    float4*         pvVector4;                      // Pointer to atom v Vector
-    float4*         pxVector4;                      // Pointer to atom x Vector
-    float*          pBornForce;                     // Pointer to Born force data
-    float*	    pBornSum;                       // Pointer to Born Radii calculation output buffers
-    float*	    pBornRadii;                     // Pointer to Born Radii
-    float*          pObcChain;                      // Pointer to OBC chain data
-    float4*         pLinearMomentum;                // Pointer to linear momentum
-    
-    // Random numbers
-    float4*         pRandom4a;                      // Pointer to first set of 4 random numbers
-    float4*         pRandom4b;                      // Pointer to second set of 4 random numbers
-    float2*         pRandom2a;                      // Pointer to first set of 2 random numbers
-    float2*         pRandom2b;                      // Pointer to second set of 2 random numbers
-    uint4*          pRandomSeed;                    // Pointer to random seeds
-    int*            pRandomPosition;                // Pointer to random number positions
-    unsigned int    randoms;                        // Number of randoms
-    unsigned int    totalRandoms;                   // Number of randoms plus overflow.
-    unsigned int    totalRandomsTimesTwo;           // Used for generating randoms
-    unsigned int    randomIterations;               // Number of iterations before regenerating randoms
-    unsigned int    randomFrames;                   // Number of frames of random numbers
-#endif
-
 };

 #endif
--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
@@ -152,6 +152,10 @@ struct _amoebaGpuContext {
    CUDAStream<float>*  psE_Field;
    CUDAStream<float>*  psE_FieldPolar;

+    int multipoleNonbondedMethod;
+    double cutoffDistance;
+    double aewald;
+
    // mutual induced field

    int mutualInducedIterativeMethod;
@@ -215,17 +219,6 @@ struct _amoebaGpuContext {
    // Wca dispersion fields

    CUDAStream<float2>*  psWcaDispersionRadiusEpsilon;
-
-    // SASA fields
-
-    CUDAStream<float>*  psSASA_Radii;
-    CUDAStream<float>*  psSASA_Weights; 
-    CUDAStream<int>*    psIntWorkArray[4]; 
-    CUDAStream<int>*    psDoneAtom; 
-    CUDAStream<int>*    psIoListCount; 
-    CUDAStream<float>*  psFloatWorkArray; 
-    float  sasaArea; 
-
 };

 typedef struct _amoebaGpuContext *amoebaGpuContext;
@@ -303,7 +296,8 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
                                     const std::vector<float>& tholes, float scalingDistanceCutoff,const std::vector<float>& dampingFactors, const std::vector<float>& polarity,
                                     const std::vector< std::vector< std::vector<int> > >& multipoleAtomCovalentInfo, const std::vector<int>& covalentDegree,
                                     const std::vector<int>& minCovalentIndices,  const std::vector<int>& minCovalentPolarizationIndices, int maxCovalentRange,
-                                     int mutualInducedIterationMethod, int mutualInducedMaxIterations, float mutualInducedTargetEpsilon, float electricConstant );
+                                     int mutualInducedIterationMethod, int mutualInducedMaxIterations, float mutualInducedTargetEpsilon,
+                                     int nonbondedMethod, float cutoffDistance, float aewald, float electricConstant );


 extern "C"
@@ -332,9 +326,6 @@ void gpuSetAmoebaWcaDispersionParameters( amoebaGpuContext amoebaGpu,
                                const float epso, const float epsh, const float rmino, const float rminh,
                                const float awater, const float shctd, const float dispoff );
 
-extern "C"
-void gpuSetAmoebaSASAParameters( amoebaGpuContext amoebaGpu , float probeRadius, const std::vector<float>& radii, const std::vector<float>& weights );
-
 extern "C"
 void amoebaGpuSetConstants(amoebaGpuContext gpu);


--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaRealSpaceEwald.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaRealSpaceEwald.cu
+//-----------------------------------------------------------------------------------------
+
+//-----------------------------------------------------------------------------------------
+
+#include "amoebaGpuTypes.h"
+#include "amoebaCudaKernels.h"
+#include "kCalculateAmoebaCudaUtilities.h"
+
+//#define AMOEBA_DEBUG
+
+static __constant__ cudaGmxSimulation cSim;
+static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;
+
+void SetCalculateAmoebaRealSpaceEwaldSim(amoebaGpuContext amoebaGpu)
+{
+    cudaError_t status;
+    gpuContext gpu = amoebaGpu->gpuContext;
+    status         = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));    
+    RTERROR(status, "SetCalculateAmoebaRealSpaceEwaldSim: cudaMemcpyToSymbol: SetSim copy to cSim failed");
+    status         = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation));    
+    RTERROR(status, "SetCalculateAmoebaRealSpaceEwaldSim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed");
+}
+
+void GetCalculateAmoebaRealSpaceEwaldSim(amoebaGpuContext amoebaGpu)
+{
+    cudaError_t status;
+    gpuContext gpu = amoebaGpu->gpuContext;
+    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));    
+    RTERROR(status, "GetCalculateAmoebaRealSpaceEwaldSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed");
+    status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));    
+    RTERROR(status, "GetCalculateAmoebaRealSpaceEwaldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
+}
+texture<float, 1, cudaReadModeElementType> tabulatedErfcRef;
+
+static __device__ float fastErfc(float r)
+{
+    float normalized = cSim.tabulatedErfcScale*r;
+    int index = (int) normalized;
+    float fract2 = normalized-index;
+    float fract1 = 1.0f-fract2;
+    return fract1*tex1Dfetch(tabulatedErfcRef, index) + fract2*tex1Dfetch(tabulatedErfcRef, index+1);
+}
+
+static int const PScaleIndex            =  0; 
+static int const DScaleIndex            =  1; 
+static int const UScaleIndex            =  2; 
+static int const MScaleIndex            =  3;
+//static int const Scale3Index            =  4;
+//static int const Scale5Index            =  5;
+//static int const Scale7Index            =  6;
+//static int const Scale9Index            =  7;
+//static int const Ddsc30Index            =  8;
+//static int const Ddsc31Index            =  9;
+//static int const Ddsc32Index            = 10; 
+//static int const Ddsc50Index            = 11;
+//static int const Ddsc51Index            = 12;
+//static int const Ddsc52Index            = 13; 
+//static int const Ddsc70Index            = 14;
+//static int const Ddsc71Index            = 15;
+//static int const Ddsc72Index            = 16;
+static int const LastScalingIndex       = 4;
+
+struct RealSpaceEwaldParticle {
+
+    // coordinates charge
+
+    float x;
+    float y;
+    float z;
+    float q;
+
+    // lab frame dipole
+
+    float labFrameDipole[3];
+
+    // lab frame quadrupole
+
+    float labFrameQuadrupole[9];
+
+    // induced dipole
+
+    float inducedDipole[3];
+
+    // polar induced dipole
+
+    float inducedDipoleP[3];
+
+    // scaling factors
+
+    float thole;
+    float damp;
+
+    float force[3];
+
+    float torque[3];
+    float padding;
+
+};
+
+__device__ void calculateRealSpaceEwaldPairIxn_kernel( RealSpaceEwaldParticle& atomI,   RealSpaceEwaldParticle& atomJ,
+                                                       float scalingDistanceCutoff,   float* scalingFactors,
+                                                       float*  outputForce,           float  outputTorque[2][3],
+                                                       float* energy
+#ifdef AMOEBA_DEBUG
+                                                    ,float4* debugArray 
+#endif
+ ){
+  
+
+    float e,ei,f;
+    //float gfd,gfdr;
+    //float xix,yix,zix;
+    //float xiy,yiy,ziy;
+    //float xiz,yiz,ziz;
+    //float xkx,ykx,zkx;
+    //float xky,yky,zky;
+    //float xkz,ykz,zkz;
+    //float rr1,rr3;
+    //float rr5,rr7,rr9,rr11;
+    float erl,erli;
+    //float frcxi[4],frcxk[4];
+    //float frcyi[4],frcyk[4];
+    //float frczi[4],frczk[4];
+    float di[4],qi[10];
+    float dk[4],qk[10];
+    float fridmp[4],findmp[4];
+    float ftm2[4],ftm2i[4];
+    float ftm2r[4],ftm2ri[4];
+    float ttm2[4],ttm3[4];
+    float ttm2i[4],ttm3i[4];
+    float ttm2r[4],ttm3r[4];
+    float ttm2ri[4],ttm3ri[4];
+    //float fdir[4]
+    float dixdk[4];
+    float dkxui[4],dixuk[4];
+    float dixukp[4],dkxuip[4];
+    float uixqkr[4],ukxqir[4];
+    float uixqkrp[4],ukxqirp[4];
+    float qiuk[4],qkui[4];
+    float qiukp[4],qkuip[4];
+    float rxqiuk[4],rxqkui[4];
+    float rxqiukp[4],rxqkuip[4];
+    float qidk[4],qkdi[4];
+    float qir[4],qkr[4];
+    float qiqkr[4],qkqir[4];
+    float qixqk[4],rxqir[4];
+    float dixr[4],dkxr[4];
+    float dixqkr[4],dkxqir[4];
+    float rxqkr[4],qkrxqir[4];
+    float rxqikr[4],rxqkir[4];
+    float rxqidk[4],rxqkdi[4];
+    float ddsc3[4],ddsc5[4];
+    float ddsc7[4];
+    float bn[6];
+    float sc[11],gl[9];
+    float sci[9],scip[9];
+    float gli[8],glip[8];
+    float gf[8],gfi[7];
+    float gfr[8],gfri[7];
+    float gti[7],gtri[7];
+
+    f = (cAmoebaSim.electric/cAmoebaSim.dielec);
+
+    // set the permanent multipole and induced dipole values;
+
+    float pdi   = atomI.damp;
+    float pti   = atomI.thole;
+    float ci    = atomI.q;
+
+    di[1]       = atomI.labFrameDipole[0];
+    di[2]       = atomI.labFrameDipole[1];
+    di[3]       = atomI.labFrameDipole[2];
+
+    qi[1]       = atomI.labFrameQuadrupole[0];
+    qi[2]       = atomI.labFrameQuadrupole[1];
+    qi[3]       = atomI.labFrameQuadrupole[2];
+    qi[4]       = atomI.labFrameQuadrupole[3];
+    qi[5]       = atomI.labFrameQuadrupole[4];
+    qi[6]       = atomI.labFrameQuadrupole[5];
+    qi[7]       = atomI.labFrameQuadrupole[6];
+    qi[8]       = atomI.labFrameQuadrupole[7];
+    qi[9]       = atomI.labFrameQuadrupole[8];
+
+    float xr    = atomJ.x - atomI.x;
+    float yr    = atomJ.y - atomI.y;
+    float zr    = atomJ.z - atomI.z;
+
+    // periodic box 
+
+    xr         -= floor(xr*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+    yr         -= floor(yr*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+    zr         -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+
+    float r2    = xr*xr + yr*yr + zr*zr;
+    if( r2 <= cAmoebaSim.cutoffDistance2 ){
+        float r      = sqrt(r2);
+        float ck     = atomJ.q;
+      
+              dk[1]  = atomJ.labFrameDipole[0];
+              dk[2]  = atomJ.labFrameDipole[1];
+              dk[3]  = atomJ.labFrameDipole[2];
+      
+              qk[1]  = atomJ.labFrameQuadrupole[0];
+              qk[2]  = atomJ.labFrameQuadrupole[1];
+              qk[3]  = atomJ.labFrameQuadrupole[2];
+              qk[4]  = atomJ.labFrameQuadrupole[3];
+              qk[5]  = atomJ.labFrameQuadrupole[4];
+              qk[6]  = atomJ.labFrameQuadrupole[5];
+              qk[7]  = atomJ.labFrameQuadrupole[6];
+              qk[8]  = atomJ.labFrameQuadrupole[7];
+              qk[9]  = atomJ.labFrameQuadrupole[8];
+      
+        // calculate the real space error function terms;
+
+        float ralpha = cAmoebaSim.aewald*r;
+
+               bn[0] = fastErfc(ralpha)/r;
+
+        float alsq2  = 2.0f*cAmoebaSim.aewald*cAmoebaSim.aewald;
+        float alsq2n = 0.0f;
+        if( cAmoebaSim.aewald > 0.0f){
+            alsq2n = 1.0f/(cAmoebaSim.sqrtPi*cAmoebaSim.aewald);
+        }
+        float exp2a  = exp(-(ralpha*ralpha));
+
+        alsq2n      *= alsq2;
+        bn[1]        = (bn[0]+alsq2n*exp2a)/r2;
+
+        alsq2n      *= alsq2;
+        bn[2]        = (3.0f*bn[1]+alsq2n*exp2a)/r2;
+
+        alsq2n      *= alsq2;
+        bn[3]        = (5.0f*bn[2]+alsq2n*exp2a)/r2;
+
+        alsq2n      *= alsq2;
+        bn[4]        = (7.0f*bn[3]+alsq2n*exp2a)/r2;
+
+        alsq2n      *= alsq2;
+        bn[5]        = (9.0f*bn[4]+alsq2n*exp2a)/r2;
+
+        // apply Thole polarization damping to scale factors;
+
+        float rr1    = 1.0f/r;
+        float rr3    = rr1 / r2;
+        float rr5    = 3.0f * rr3 / r2;
+        float rr7    = 5.0f * rr5 / r2;
+        float rr9    = 7.0f * rr7 / r2;
+        float rr11   = 9.0f * rr9 / r2;
+        float scale3 = 1.0f;
+        float scale5 = 1.0f;
+        float scale7 = 1.0f;
+
+            ddsc3[0] = 0.0f;
+            ddsc3[1] = 0.0f;
+            ddsc3[2] = 0.0f;
+
+            ddsc5[0] = 0.0f;
+            ddsc5[1] = 0.0f;
+            ddsc5[2] = 0.0f;
+
+            ddsc7[0] = 0.0f;
+            ddsc7[1] = 0.0f;
+            ddsc7[2] = 0.0f;
+
+        float pdk    = atomJ.damp;
+        float ptk    = atomJ.thole;
+        float damp   = pdi*pdk;
+        if( damp != 0.0f ){
+            float pgamma = pti < ptk ? pti : ptk;
+            float ratio  = r/damp;
+                damp     = -pgamma * ratio*ratio*ratio;
+            if( damp > -50.0f ){
+                float expdamp  = exp(damp);
+                scale3         = 1.0f - expdamp;
+                   scale5      = 1.0f - (1.0f-damp)*expdamp;
+                   scale7      = 1.0f - (1.0f-damp+0.6f*damp*damp)*expdamp;
+                float temp3    = -3.0f * damp * expdamp / r2;
+                float temp5    = -damp;
+                float temp7    = -0.2f - 0.6f*damp;
+                   ddsc3[1] = temp3 * xr;
+                   ddsc3[2] = temp3 * yr;
+                   ddsc3[3] = temp3 * zr;
+                   ddsc5[1] = temp5 * ddsc3[1];
+                   ddsc5[2] = temp5 * ddsc3[2];
+                   ddsc5[3] = temp5 * ddsc3[3];
+                   ddsc7[1] = temp7 * ddsc5[1];
+                   ddsc7[2] = temp7 * ddsc5[2];
+                   ddsc7[3] = temp7 * ddsc5[3];
+             }
+        }
+
+        float dsc3 = 1.0f - scale3*scalingFactors[DScaleIndex];
+        float dsc5 = 1.0f - scale5*scalingFactors[DScaleIndex];
+        float dsc7 = 1.0f - scale7*scalingFactors[DScaleIndex];
+
+        float psc3 = 1.0f - scale3*scalingFactors[PScaleIndex];
+        float psc5 = 1.0f - scale5*scalingFactors[PScaleIndex];
+        float psc7 = 1.0f - scale7*scalingFactors[PScaleIndex];
+
+        float usc3 = 1.0f - scale3*scalingFactors[UScaleIndex];
+        float usc5 = 1.0f - scale5*scalingFactors[UScaleIndex];
+
+        // construct necessary auxiliary vectors
+
+        dixdk[1]       = di[2]*dk[3] - di[3]*dk[2];
+        dixdk[2]       = di[3]*dk[1] - di[1]*dk[3];
+        dixdk[3]       = di[1]*dk[2] - di[2]*dk[1];
+
+        dixuk[1]       = di[2]*atomJ.inducedDipole[2] - di[3]*atomJ.inducedDipole[1];
+        dixuk[2]       = di[3]*atomJ.inducedDipole[0] - di[1]*atomJ.inducedDipole[2];
+        dixuk[3]       = di[1]*atomJ.inducedDipole[1] - di[2]*atomJ.inducedDipole[0];
+        dkxui[1]       = dk[2]*atomI.inducedDipole[2] - dk[3]*atomI.inducedDipole[1];
+        dkxui[2]       = dk[3]*atomI.inducedDipole[0] - dk[1]*atomI.inducedDipole[2];
+        dkxui[3]       = dk[1]*atomI.inducedDipole[1] - dk[2]*atomI.inducedDipole[0];
+        dixukp[1]      = di[2]*atomJ.inducedDipoleP[2] - di[3]*atomJ.inducedDipoleP[1];
+        dixukp[2]      = di[3]*atomJ.inducedDipoleP[0] - di[1]*atomJ.inducedDipoleP[2];
+        dixukp[3]      = di[1]*atomJ.inducedDipoleP[1] - di[2]*atomJ.inducedDipoleP[0];
+        dkxuip[1]      = dk[2]*atomI.inducedDipoleP[2] - dk[3]*atomI.inducedDipoleP[1];
+        dkxuip[2]      = dk[3]*atomI.inducedDipoleP[0] - dk[1]*atomI.inducedDipoleP[2];
+        dkxuip[3]      = dk[1]*atomI.inducedDipoleP[1] - dk[2]*atomI.inducedDipoleP[0];
+        dixr[1]        = di[2]*zr - di[3]*yr;
+        dixr[2]        = di[3]*xr - di[1]*zr;
+        dixr[3]        = di[1]*yr - di[2]*xr;
+        dkxr[1]        = dk[2]*zr - dk[3]*yr;
+        dkxr[2]        = dk[3]*xr - dk[1]*zr;
+        dkxr[3]        = dk[1]*yr - dk[2]*xr;
+        qir[1]         = qi[1]*xr + qi[4]*yr + qi[7]*zr;
+        qir[2]         = qi[2]*xr + qi[5]*yr + qi[8]*zr;
+        qir[3]         = qi[3]*xr + qi[6]*yr + qi[9]*zr;
+        qkr[1]         = qk[1]*xr + qk[4]*yr + qk[7]*zr;
+        qkr[2]         = qk[2]*xr + qk[5]*yr + qk[8]*zr;
+        qkr[3]         = qk[3]*xr + qk[6]*yr + qk[9]*zr;
+        qiqkr[1]       = qi[1]*qkr[1] + qi[4]*qkr[2] + qi[7]*qkr[3];
+        qiqkr[2]       = qi[2]*qkr[1] + qi[5]*qkr[2] + qi[8]*qkr[3];
+        qiqkr[3]       = qi[3]*qkr[1] + qi[6]*qkr[2] + qi[9]*qkr[3];
+        qkqir[1]       = qk[1]*qir[1] + qk[4]*qir[2] + qk[7]*qir[3];
+        qkqir[2]       = qk[2]*qir[1] + qk[5]*qir[2] + qk[8]*qir[3];
+        qkqir[3]       = qk[3]*qir[1] + qk[6]*qir[2] + qk[9]*qir[3];
+        qixqk[1]       = qi[2]*qk[3] + qi[5]*qk[6] + qi[8]*qk[9]
+                       - qi[3]*qk[2] - qi[6]*qk[5] - qi[9]*qk[8];
+        qixqk[2]       = qi[3]*qk[1] + qi[6]*qk[4] + qi[9]*qk[7]
+                       - qi[1]*qk[3] - qi[4]*qk[6] - qi[7]*qk[9];
+        qixqk[3]       = qi[1]*qk[2] + qi[4]*qk[5] + qi[7]*qk[8]
+                       - qi[2]*qk[1] - qi[5]*qk[4] - qi[8]*qk[7];
+        rxqir[1]       = yr*qir[3] - zr*qir[2];
+        rxqir[2]       = zr*qir[1] - xr*qir[3];
+        rxqir[3]       = xr*qir[2] - yr*qir[1];
+        rxqkr[1]       = yr*qkr[3] - zr*qkr[2];
+        rxqkr[2]       = zr*qkr[1] - xr*qkr[3];
+        rxqkr[3]       = xr*qkr[2] - yr*qkr[1];
+        rxqikr[1]      = yr*qiqkr[3] - zr*qiqkr[2];
+        rxqikr[2]      = zr*qiqkr[1] - xr*qiqkr[3];
+        rxqikr[3]      = xr*qiqkr[2] - yr*qiqkr[1];
+        rxqkir[1]      = yr*qkqir[3] - zr*qkqir[2];
+        rxqkir[2]      = zr*qkqir[1] - xr*qkqir[3];
+        rxqkir[3]      = xr*qkqir[2] - yr*qkqir[1];
+        qkrxqir[1]     = qkr[2]*qir[3] - qkr[3]*qir[2];
+        qkrxqir[2]     = qkr[3]*qir[1] - qkr[1]*qir[3];
+        qkrxqir[3]     = qkr[1]*qir[2] - qkr[2]*qir[1];
+        qidk[1]        = qi[1]*dk[1] + qi[4]*dk[2] + qi[7]*dk[3];
+        qidk[2]        = qi[2]*dk[1] + qi[5]*dk[2] + qi[8]*dk[3];
+        qidk[3]        = qi[3]*dk[1] + qi[6]*dk[2] + qi[9]*dk[3];
+        qkdi[1]        = qk[1]*di[1] + qk[4]*di[2] + qk[7]*di[3];
+        qkdi[2]        = qk[2]*di[1] + qk[5]*di[2] + qk[8]*di[3];
+        qkdi[3]        = qk[3]*di[1] + qk[6]*di[2] + qk[9]*di[3];
+        qiuk[1]        = qi[1]*atomJ.inducedDipole[0] + qi[4]*atomJ.inducedDipole[1]
+                       + qi[7]*atomJ.inducedDipole[2];
+        qiuk[2]        = qi[2]*atomJ.inducedDipole[0] + qi[5]*atomJ.inducedDipole[1]
+                       + qi[8]*atomJ.inducedDipole[2];
+        qiuk[3]        = qi[3]*atomJ.inducedDipole[0] + qi[6]*atomJ.inducedDipole[1] 
+                       + qi[9]*atomJ.inducedDipole[2];
+        qkui[1]        = qk[1]*atomI.inducedDipole[0] + qk[4]*atomI.inducedDipole[1]
+                       + qk[7]*atomI.inducedDipole[2];
+        qkui[2]        = qk[2]*atomI.inducedDipole[0] + qk[5]*atomI.inducedDipole[1]
+                       + qk[8]*atomI.inducedDipole[2];
+        qkui[3]        = qk[3]*atomI.inducedDipole[0] + qk[6]*atomI.inducedDipole[1]
+                       + qk[9]*atomI.inducedDipole[2];
+        qiukp[1]       = qi[1]*atomJ.inducedDipoleP[0] + qi[4]*atomJ.inducedDipoleP[1]
+                        + qi[7]*atomJ.inducedDipoleP[2];
+        qiukp[2]       = qi[2]*atomJ.inducedDipoleP[0] + qi[5]*atomJ.inducedDipoleP[1]
+                        + qi[8]*atomJ.inducedDipoleP[2];
+        qiukp[3]       = qi[3]*atomJ.inducedDipoleP[0] + qi[6]*atomJ.inducedDipoleP[1]
+                        + qi[9]*atomJ.inducedDipoleP[2];
+        qkuip[1]       = qk[1]*atomI.inducedDipoleP[0] + qk[4]*atomI.inducedDipoleP[1]
+                        + qk[7]*atomI.inducedDipoleP[2];
+        qkuip[2]       = qk[2]*atomI.inducedDipoleP[0] + qk[5]*atomI.inducedDipoleP[1]
+                        + qk[8]*atomI.inducedDipoleP[2];
+        qkuip[3]       = qk[3]*atomI.inducedDipoleP[0] + qk[6]*atomI.inducedDipoleP[1]
+                        + qk[9]*atomI.inducedDipoleP[2];
+        dixqkr[1]      = di[2]*qkr[3] - di[3]*qkr[2];
+        dixqkr[2]      = di[3]*qkr[1] - di[1]*qkr[3];
+        dixqkr[3]      = di[1]*qkr[2] - di[2]*qkr[1];
+        dkxqir[1]      = dk[2]*qir[3] - dk[3]*qir[2];
+        dkxqir[2]      = dk[3]*qir[1] - dk[1]*qir[3];
+        dkxqir[3]      = dk[1]*qir[2] - dk[2]*qir[1];
+        uixqkr[1]      = atomI.inducedDipole[1]*qkr[3] - atomI.inducedDipole[2]*qkr[2];
+        uixqkr[2]      = atomI.inducedDipole[2]*qkr[1] - atomI.inducedDipole[0]*qkr[3];
+        uixqkr[3]      = atomI.inducedDipole[0]*qkr[2] - atomI.inducedDipole[1]*qkr[1];
+        ukxqir[1]      = atomJ.inducedDipole[1]*qir[3] - atomJ.inducedDipole[2]*qir[2];
+        ukxqir[2]      = atomJ.inducedDipole[2]*qir[1] - atomJ.inducedDipole[0]*qir[3];
+        ukxqir[3]      = atomJ.inducedDipole[0]*qir[2] - atomJ.inducedDipole[1]*qir[1];
+        uixqkrp[1]     = atomI.inducedDipoleP[1]*qkr[3] - atomI.inducedDipoleP[2]*qkr[2];
+        uixqkrp[2]     = atomI.inducedDipoleP[2]*qkr[1] - atomI.inducedDipoleP[0]*qkr[3];
+        uixqkrp[3]     = atomI.inducedDipoleP[0]*qkr[2] - atomI.inducedDipoleP[1]*qkr[1];
+        ukxqirp[1]     = atomJ.inducedDipoleP[1]*qir[3] - atomJ.inducedDipoleP[2]*qir[2];
+        ukxqirp[2]     = atomJ.inducedDipoleP[2]*qir[1] - atomJ.inducedDipoleP[0]*qir[3];
+        ukxqirp[3]     = atomJ.inducedDipoleP[0]*qir[2] - atomJ.inducedDipoleP[1]*qir[1];
+        rxqidk[1]      = yr*qidk[3] - zr*qidk[2];
+        rxqidk[2]      = zr*qidk[1] - xr*qidk[3];
+        rxqidk[3]      = xr*qidk[2] - yr*qidk[1];
+        rxqkdi[1]      = yr*qkdi[3] - zr*qkdi[2];
+        rxqkdi[2]      = zr*qkdi[1] - xr*qkdi[3];
+        rxqkdi[3]      = xr*qkdi[2] - yr*qkdi[1];
+        rxqiuk[1]      = yr*qiuk[3] - zr*qiuk[2];
+        rxqiuk[2]      = zr*qiuk[1] - xr*qiuk[3];
+        rxqiuk[3]      = xr*qiuk[2] - yr*qiuk[1];
+        rxqkui[1]      = yr*qkui[3] - zr*qkui[2];
+        rxqkui[2]      = zr*qkui[1] - xr*qkui[3];
+        rxqkui[3]      = xr*qkui[2] - yr*qkui[1];
+        rxqiukp[1]     = yr*qiukp[3] - zr*qiukp[2];
+        rxqiukp[2]     = zr*qiukp[1] - xr*qiukp[3];
+        rxqiukp[3]     = xr*qiukp[2] - yr*qiukp[1];
+        rxqkuip[1]     = yr*qkuip[3] - zr*qkuip[2];
+        rxqkuip[2]     = zr*qkuip[1] - xr*qkuip[3];
+        rxqkuip[3]     = xr*qkuip[2] - yr*qkuip[1];
+
+        // calculate the scalar products for permanent components
+
+        sc[2]          = di[1]*dk[1] + di[2]*dk[2] + di[3]*dk[3];
+        sc[3]          = di[1]*xr + di[2]*yr + di[3]*zr;
+        sc[4]          = dk[1]*xr + dk[2]*yr + dk[3]*zr;
+        sc[5]          = qir[1]*xr + qir[2]*yr + qir[3]*zr;
+        sc[6]          = qkr[1]*xr + qkr[2]*yr + qkr[3]*zr;
+        sc[7]          = qir[1]*dk[1] + qir[2]*dk[2] + qir[3]*dk[3];
+        sc[8]          = qkr[1]*di[1] + qkr[2]*di[2] + qkr[3]*di[3];
+        sc[9]          = qir[1]*qkr[1] + qir[2]*qkr[2] + qir[3]*qkr[3];
+        sc[10]         = qi[1]*qk[1] + qi[2]*qk[2] + qi[3]*qk[3]
+                       + qi[4]*qk[4] + qi[5]*qk[5] + qi[6]*qk[6]
+                       + qi[7]*qk[7] + qi[8]*qk[8] + qi[9]*qk[9];
+
+        // calculate the scalar products for induced components
+
+        sci[1]          = atomI.inducedDipole[0]*dk[1] + atomI.inducedDipole[1]*dk[2]
+                      + atomI.inducedDipole[2]*dk[3] + di[1]*atomJ.inducedDipole[0]
+                      + di[2]*atomJ.inducedDipole[1] + di[3]*atomJ.inducedDipole[2];
+
+        sci[2]          = atomI.inducedDipole[0]*atomJ.inducedDipole[0] + atomI.inducedDipole[1]*atomJ.inducedDipole[1]
+                        + atomI.inducedDipole[2]*atomJ.inducedDipole[2];
+        sci[3]          = atomI.inducedDipole[0]*xr + atomI.inducedDipole[1]*yr + atomI.inducedDipole[2]*zr;
+        sci[4]          = atomJ.inducedDipole[0]*xr + atomJ.inducedDipole[1]*yr + atomJ.inducedDipole[2]*zr;
+        sci[7]          = qir[1]*atomJ.inducedDipole[0] + qir[2]*atomJ.inducedDipole[1]
+                        + qir[3]*atomJ.inducedDipole[2];
+        sci[8]          = qkr[1]*atomI.inducedDipole[0] + qkr[2]*atomI.inducedDipole[1]
+                        + qkr[3]*atomI.inducedDipole[2];
+        scip[1]         = atomI.inducedDipoleP[0]*dk[1] + atomI.inducedDipoleP[1]*dk[2]
+                        + atomI.inducedDipoleP[2]*dk[3] + di[1]*atomJ.inducedDipoleP[0]
+                        + di[2]*atomJ.inducedDipoleP[1] + di[3]*atomJ.inducedDipoleP[2];
+        scip[2]         = atomI.inducedDipole[0]*atomJ.inducedDipoleP[0]+atomI.inducedDipole[1]*atomJ.inducedDipoleP[1]
+                        + atomI.inducedDipole[2]*atomJ.inducedDipoleP[2]+atomI.inducedDipoleP[0]*atomJ.inducedDipole[0]
+                        + atomI.inducedDipoleP[1]*atomJ.inducedDipole[1]+atomI.inducedDipoleP[2]*atomJ.inducedDipole[2];
+        scip[3]         = atomI.inducedDipoleP[0]*xr + atomI.inducedDipoleP[1]*yr + atomI.inducedDipoleP[2]*zr;
+        scip[4]         = atomJ.inducedDipoleP[0]*xr + atomJ.inducedDipoleP[1]*yr + atomJ.inducedDipoleP[2]*zr;
+        scip[7]         = qir[1]*atomJ.inducedDipoleP[0] + qir[2]*atomJ.inducedDipoleP[1]
+                        + qir[3]*atomJ.inducedDipoleP[2];
+        scip[8]         = qkr[1]*atomI.inducedDipoleP[0] + qkr[2]*atomI.inducedDipoleP[1]
+                        + qkr[3]*atomI.inducedDipoleP[2];
+
+        // calculate the gl functions for permanent components
+
+        gl[0]           = ci*ck;
+        gl[1]           = ck*sc[3] - ci*sc[4];
+        gl[2]           = ci*sc[6] + ck*sc[5] - sc[3]*sc[4];
+        gl[3]           = sc[3]*sc[6] - sc[4]*sc[5];
+        gl[4]           = sc[5]*sc[6];
+        gl[5]           = -4.0f * sc[9];
+        gl[6]           = sc[2];
+        gl[7]           = 2.0f * (sc[7]-sc[8]);
+        gl[8]           = 2.0f * sc[10];
+
+        // calculate the gl functions for induced components
+
+        gli[1]          = ck*sci[3] - ci*sci[4];
+        gli[2]          = -sc[3]*sci[4] - sci[3]*sc[4];
+        gli[3]          = sci[3]*sc[6] - sci[4]*sc[5];
+        gli[6]          = sci[1];
+        gli[7]          = 2.0f * (sci[7]-sci[8]);
+        glip[1]         = ck*scip[3] - ci*scip[4];
+        glip[2]         = -sc[3]*scip[4] - scip[3]*sc[4];
+        glip[3]         = scip[3]*sc[6] - scip[4]*sc[5];
+        glip[6]         = scip[1];
+        glip[7]         = 2.0f * (scip[7]-scip[8]);
+
+        // compute the energy contributions for this interaction
+
+        e    = bn[0]*gl[0] + bn[1]*(gl[1]+gl[6])
+                 + bn[2]*(gl[2]+gl[7]+gl[8])
+                 + bn[3]*(gl[3]+gl[5]) + bn[4]*gl[4];
+        ei    = 0.5f * (bn[1]*(gli[1]+gli[6])
+                       + bn[2]*(gli[2]+gli[7]) + bn[3]*gli[3]);
+
+        // get the real energy without any screening function
+
+        erl = rr1*gl[0] + rr3*(gl[1]+gl[6])
+                   + rr5*(gl[2]+gl[7]+gl[8])
+                   + rr7*(gl[3]+gl[5]) + rr9*gl[4];
+        erli = 0.5f*(rr3*(gli[1]+gli[6])*psc3
+                    + rr5*(gli[2]+gli[7])*psc5
+                    + rr7*gli[3]*psc7);
+        e = e - (1.0f-scalingFactors[MScaleIndex])*erl;
+        ei = ei - erli;
+        e = f * e;
+        ei = f * ei;
+//        em = em + e;
+//        ep = ep + ei;
+
+        // increment the total intramolecular energy; assumes;
+        // intramolecular distances are less than half of cell;
+        // length and less than the ewald cutoff;
+/*
+        if (molcule(ii) .eq. molcule(kk)) {
+           eintra = eintra + mscale(kk)*erl*f;
+           eintra = eintra + 0.5f*pscale(kk);
+&                        * (rr3*(gli[1]+gli[6])*scale3;
+&                              + rr5*(gli[2]+gli[7])*scale5;
+&                              + rr7*gli[3]*scale7);
+        }
+*/
+
+        // intermediate variables for permanent force terms
+
+        gf[1] = bn[1]*gl[0] + bn[2]*(gl[1]+gl[6])
+                     + bn[3]*(gl[2]+gl[7]+gl[8])
+                     + bn[4]*(gl[3]+gl[5]) + bn[5]*gl[4];
+        gf[2] = -ck*bn[1] + sc[4]*bn[2] - sc[6]*bn[3];
+        gf[3] = ci*bn[1] + sc[3]*bn[2] + sc[5]*bn[3];
+        gf[4] = 2.0f * bn[2];
+        gf[5] = 2.0f * (-ck*bn[2]+sc[4]*bn[3]-sc[6]*bn[4]);
+        gf[6] = 2.0f * (-ci*bn[2]-sc[3]*bn[3]-sc[5]*bn[4]);
+        gf[7] = 4.0f * bn[3];
+        gfr[1] = rr3*gl[0] + rr5*(gl[1]+gl[6])
+                      + rr7*(gl[2]+gl[7]+gl[8])
+                      + rr9*(gl[3]+gl[5]) + rr11*gl[4];
+        gfr[2] = -ck*rr3 + sc[4]*rr5 - sc[6]*rr7;
+        gfr[3] = ci*rr3 + sc[3]*rr5 + sc[5]*rr7;
+        gfr[4] = 2.0f * rr5;
+        gfr[5] = 2.0f * (-ck*rr5+sc[4]*rr7-sc[6]*rr9);
+        gfr[6] = 2.0f * (-ci*rr5-sc[3]*rr7-sc[5]*rr9);
+        gfr[7] = 4.0f * rr7;
+
+        // intermediate variables for induced force terms
+
+        gfi[1] = 0.5f*bn[2]*(gli[1]+glip[1]+gli[6]+glip[6])
+                      + 0.5f*bn[2]*scip[2]
+                      + 0.5f*bn[3]*(gli[2]+glip[2]+gli[7]+glip[7])
+                      - 0.5f*bn[3]*(sci[3]*scip[4]+scip[3]*sci[4])
+                      + 0.5f*bn[4]*(gli[3]+glip[3]);
+        gfi[2] = -ck*bn[1] + sc[4]*bn[2] - sc[6]*bn[3];
+        gfi[3] = ci*bn[1] + sc[3]*bn[2] + sc[5]*bn[3];
+        gfi[4] = 2.0f * bn[2];
+        gfi[5] = bn[3] * (sci[4]+scip[4]);
+        gfi[6] = -bn[3] * (sci[3]+scip[3]);
+        gfri[1] = 0.5f*rr5*((gli[1]+gli[6])*psc3
+                             + (glip[1]+glip[6])*dsc3
+                             + scip[2]*usc3)
+                  + 0.5f*rr7*((gli[7]+gli[2])*psc5
+                             + (glip[7]+glip[2])*dsc5
+                      - (sci[3]*scip[4]+scip[3]*sci[4])*usc5)
+                  + 0.5f*rr9*(gli[3]*psc7+glip[3]*dsc7);
+        gfri[2] = -rr3*ck + rr5*sc[4] - rr7*sc[6];
+        gfri[3] = rr3*ci + rr5*sc[3] + rr7*sc[5];
+        gfri[4] = 2.0f * rr5;
+        gfri[5] = rr7 * (sci[4]*psc7+scip[4]*dsc7);
+        gfri[6] = -rr7 * (sci[3]*psc7+scip[3]*dsc7);
+
+        // get the permanent force with screening
+
+        ftm2[1] = gf[1]*xr + gf[2]*di[1] + gf[3]*dk[1]
+                       + gf[4]*(qkdi[1]-qidk[1]) + gf[5]*qir[1]
+                       + gf[6]*qkr[1] + gf[7]*(qiqkr[1]+qkqir[1]);
+        ftm2[2] = gf[1]*yr + gf[2]*di[2] + gf[3]*dk[2]
+                       + gf[4]*(qkdi[2]-qidk[2]) + gf[5]*qir[2]
+                       + gf[6]*qkr[2] + gf[7]*(qiqkr[2]+qkqir[2]);
+        ftm2[3] = gf[1]*zr + gf[2]*di[3] + gf[3]*dk[3]
+                       + gf[4]*(qkdi[3]-qidk[3]) + gf[5]*qir[3]
+                       + gf[6]*qkr[3] + gf[7]*(qiqkr[3]+qkqir[3]);
+
+        // get the permanent force without screening
+
+        ftm2r[1] = gfr[1]*xr + gfr[2]*di[1] + gfr[3]*dk[1]
+                       + gfr[4]*(qkdi[1]-qidk[1]) + gfr[5]*qir[1]
+                       + gfr[6]*qkr[1] + gfr[7]*(qiqkr[1]+qkqir[1]);
+        ftm2r[2] = gfr[1]*yr + gfr[2]*di[2] + gfr[3]*dk[2]
+                       + gfr[4]*(qkdi[2]-qidk[2]) + gfr[5]*qir[2]
+                       + gfr[6]*qkr[2] + gfr[7]*(qiqkr[2]+qkqir[2]);
+        ftm2r[3] = gfr[1]*zr + gfr[2]*di[3] + gfr[3]*dk[3]
+                       + gfr[4]*(qkdi[3]-qidk[3]) + gfr[5]*qir[3]
+                       + gfr[6]*qkr[3] + gfr[7]*(qiqkr[3]+qkqir[3]);
+
+        // get the induced force with screening
+
+        ftm2i[1] = gfi[1]*xr + 0.5f*
+              (gfi[2]*(atomI.inducedDipole[0]+atomI.inducedDipoleP[0])
+             + bn[2]*(sci[4]*atomI.inducedDipoleP[0]+scip[4]*atomI.inducedDipole[0])
+             + gfi[3]*(atomJ.inducedDipole[0]+atomJ.inducedDipoleP[0])
+             + bn[2]*(sci[3]*atomJ.inducedDipoleP[0]+scip[3]*atomJ.inducedDipole[0])
+             + (sci[4]+scip[4])*bn[2]*di[1]
+             + (sci[3]+scip[3])*bn[2]*dk[1]
+             + gfi[4]*(qkui[1]+qkuip[1]-qiuk[1]-qiukp[1]))
+             + gfi[5]*qir[1] + gfi[6]*qkr[1];
+        ftm2i[2] = gfi[1]*yr + 0.5f*
+              (gfi[2]*(atomI.inducedDipole[1]+atomI.inducedDipoleP[1])
+             + bn[2]*(sci[4]*atomI.inducedDipoleP[1]+scip[4]*atomI.inducedDipole[1])
+             + gfi[3]*(atomJ.inducedDipole[1]+atomJ.inducedDipoleP[1])
+             + bn[2]*(sci[3]*atomJ.inducedDipoleP[1]+scip[3]*atomJ.inducedDipole[1])
+             + (sci[4]+scip[4])*bn[2]*di[2]
+             + (sci[3]+scip[3])*bn[2]*dk[2]
+             + gfi[4]*(qkui[2]+qkuip[2]-qiuk[2]-qiukp[2]))
+             + gfi[5]*qir[2] + gfi[6]*qkr[2];
+        ftm2i[3] = gfi[1]*zr + 0.5f*
+              (gfi[2]*(atomI.inducedDipole[2]+atomI.inducedDipoleP[2])
+             + bn[2]*(sci[4]*atomI.inducedDipoleP[2]+scip[4]*atomI.inducedDipole[2])
+             + gfi[3]*(atomJ.inducedDipole[2]+atomJ.inducedDipoleP[2])
+             + bn[2]*(sci[3]*atomJ.inducedDipoleP[2]+scip[3]*atomJ.inducedDipole[2])
+             + (sci[4]+scip[4])*bn[2]*di[3]
+             + (sci[3]+scip[3])*bn[2]*dk[3]
+             + gfi[4]*(qkui[3]+qkuip[3]-qiuk[3]-qiukp[3]))
+             + gfi[5]*qir[3] + gfi[6]*qkr[3];
+
+        // get the induced force without screening
+
+        ftm2ri[1] = gfri[1]*xr + 0.5f*
+            (- rr3*ck*(atomI.inducedDipole[0]*psc3+atomI.inducedDipoleP[0]*dsc3)
+             + rr5*sc[4]*(atomI.inducedDipole[0]*psc5+atomI.inducedDipoleP[0]*dsc5)
+             - rr7*sc[6]*(atomI.inducedDipole[0]*psc7+atomI.inducedDipoleP[0]*dsc7))
+             + (rr3*ci*(atomJ.inducedDipole[0]*psc3+atomJ.inducedDipoleP[0]*dsc3)
+             + rr5*sc[3]*(atomJ.inducedDipole[0]*psc5+atomJ.inducedDipoleP[0]*dsc5)
+             + rr7*sc[5]*(atomJ.inducedDipole[0]*psc7+atomJ.inducedDipoleP[0]*dsc7))*0.5f
+             + rr5*usc5*(sci[4]*atomI.inducedDipoleP[0]+scip[4]*atomI.inducedDipole[0]
+             + sci[3]*atomJ.inducedDipoleP[0]+scip[3]*atomJ.inducedDipole[0])*0.5f
+             + 0.5f*(sci[4]*psc5+scip[4]*dsc5)*rr5*di[1]
+             + 0.5f*(sci[3]*psc5+scip[3]*dsc5)*rr5*dk[1]
+             + 0.5f*gfri[4]*((qkui[1]-qiuk[1])*psc5
+             + (qkuip[1]-qiukp[1])*dsc5)
+             + gfri[5]*qir[1] + gfri[6]*qkr[1];
+        ftm2ri[2] = gfri[1]*yr + 0.5f*
+            (- rr3*ck*(atomI.inducedDipole[1]*psc3+atomI.inducedDipoleP[1]*dsc3)
+             + rr5*sc[4]*(atomI.inducedDipole[1]*psc5+atomI.inducedDipoleP[1]*dsc5)
+             - rr7*sc[6]*(atomI.inducedDipole[1]*psc7+atomI.inducedDipoleP[1]*dsc7))
+             + (rr3*ci*(atomJ.inducedDipole[1]*psc3+atomJ.inducedDipoleP[1]*dsc3)
+             + rr5*sc[3]*(atomJ.inducedDipole[1]*psc5+atomJ.inducedDipoleP[1]*dsc5)
+             + rr7*sc[5]*(atomJ.inducedDipole[1]*psc7+atomJ.inducedDipoleP[1]*dsc7))*0.5f
+             + rr5*usc5*(sci[4]*atomI.inducedDipoleP[1]+scip[4]*atomI.inducedDipole[1]
+             + sci[3]*atomJ.inducedDipoleP[1]+scip[3]*atomJ.inducedDipole[1])*0.5f
+             + 0.5f*(sci[4]*psc5+scip[4]*dsc5)*rr5*di[2]
+             + 0.5f*(sci[3]*psc5+scip[3]*dsc5)*rr5*dk[2]
+             + 0.5f*gfri[4]*((qkui[2]-qiuk[2])*psc5
+             + (qkuip[2]-qiukp[2])*dsc5)
+             + gfri[5]*qir[2] + gfri[6]*qkr[2];
+        ftm2ri[3] = gfri[1]*zr + 0.5f*
+            (- rr3*ck*(atomI.inducedDipole[2]*psc3+atomI.inducedDipoleP[2]*dsc3)
+             + rr5*sc[4]*(atomI.inducedDipole[2]*psc5+atomI.inducedDipoleP[2]*dsc5)
+             - rr7*sc[6]*(atomI.inducedDipole[2]*psc7+atomI.inducedDipoleP[2]*dsc7))
+             + (rr3*ci*(atomJ.inducedDipole[2]*psc3+atomJ.inducedDipoleP[2]*dsc3)
+             + rr5*sc[3]*(atomJ.inducedDipole[2]*psc5+atomJ.inducedDipoleP[2]*dsc5)
+             + rr7*sc[5]*(atomJ.inducedDipole[2]*psc7+atomJ.inducedDipoleP[2]*dsc7))*0.5f
+             + rr5*usc5*(sci[4]*atomI.inducedDipoleP[2]+scip[4]*atomI.inducedDipole[2]
+             + sci[3]*atomJ.inducedDipoleP[2]+scip[3]*atomJ.inducedDipole[2])*0.5f
+             + 0.5f*(sci[4]*psc5+scip[4]*dsc5)*rr5*di[3]
+             + 0.5f*(sci[3]*psc5+scip[3]*dsc5)*rr5*dk[3]
+             + 0.5f*gfri[4]*((qkui[3]-qiuk[3])*psc5
+             + (qkuip[3]-qiukp[3])*dsc5)
+             + gfri[5]*qir[3] + gfri[6]*qkr[3];
+
+        // account for partially excluded induced interactions
+
+        float temp3 = 0.5f * rr3 * ((gli[1]+gli[6])*scalingFactors[PScaleIndex]
+                                   +(glip[1]+glip[6])*scalingFactors[DScaleIndex]);
+        float temp5 = 0.5f * rr5 * ((gli[2]+gli[7])*scalingFactors[PScaleIndex]
+                                   +(glip[2]+glip[7])*scalingFactors[DScaleIndex]);
+        float temp7 = 0.5f * rr7 * (gli[3]*scalingFactors[PScaleIndex]
+                                   +glip[3]*scalingFactors[DScaleIndex]);
+        fridmp[1] = temp3*ddsc3[1] + temp5*ddsc5[1]
+                         + temp7*ddsc7[1];
+        fridmp[2] = temp3*ddsc3[2] + temp5*ddsc5[2]
+                         + temp7*ddsc7[2];
+        fridmp[3] = temp3*ddsc3[3] + temp5*ddsc5[3]
+                         + temp7*ddsc7[3];
+
+        // find some scaling terms for induced-induced force
+
+        temp3 = 0.5f * rr3 * scalingFactors[UScaleIndex] * scip[2];
+        temp5 = -0.5f * rr5 * scalingFactors[UScaleIndex] * (sci[3]*scip[4]+scip[3]*sci[4]);
+        findmp[1] = temp3*ddsc3[1] + temp5*ddsc5[1];
+        findmp[2] = temp3*ddsc3[2] + temp5*ddsc5[2];
+        findmp[3] = temp3*ddsc3[3] + temp5*ddsc5[3];
+
+        // modify the forces for partially excluded interactions
+
+        ftm2i[1] = ftm2i[1] - fridmp[1] - findmp[1];
+        ftm2i[2] = ftm2i[2] - fridmp[2] - findmp[2];
+        ftm2i[3] = ftm2i[3] - fridmp[3] - findmp[3];
+
+        // correction to convert mutual to direct polarization force
+
+/*
+        if (poltyp .eq. 'DIRECT') {
+           gfd = 0.5f * (bn[2]*scip[2];
+&                     - bn[3]*(scip[3]*sci[4]+sci[3]*scip[4]));
+           gfdr = 0.5f * (rr5*scip[2]*usc3;
+&                     - rr7*(scip[3]*sci[4];
+&                           +sci[3]*scip[4])*usc5);
+           ftm2i[1] = ftm2i[1] - gfd*xr - 0.5f*bn[2]*;
+&                          (sci[4]*atomI.inducedDipoleP[0]+scip[4]*atomI.inducedDipole[0];
+&                          +sci[3]*atomJ.inducedDipoleP[0]+scip[3]*atomJ.inducedDipole[0]);
+           ftm2i[2] = ftm2i[2] - gfd*yr - 0.5f*bn[2]*;
+&                          (sci[4]*atomI.inducedDipoleP[1]+scip[4]*atomI.inducedDipole[1];
+&                          +sci[3]*atomJ.inducedDipoleP[1]+scip[3]*atomJ.inducedDipole[1]);
+           ftm2i[3] = ftm2i[3] - gfd*zr - 0.5f*bn[2]*;
+&                          (sci[4]*atomI.inducedDipoleP[2]+scip[4]*atomI.inducedDipole[2];
+&                          +sci[3]*atomJ.inducedDipoleP[2]+scip[3]*atomJ.inducedDipole[2]);
+           fdir[1] = gfdr*xr + 0.5f*usc5*rr5*;
+&                         (sci[4]*atomI.inducedDipoleP[0]+scip[4]*atomI.inducedDipole[0];
+&                        + sci[3]*atomJ.inducedDipoleP[0]+scip[3]*atomJ.inducedDipole[0]);
+           fdir[2] = gfdr*yr + 0.5f*usc5*rr5*;
+&                         (sci[4]*atomI.inducedDipoleP[1]+scip[4]*atomI.inducedDipole[1];
+&                        + sci[3]*atomJ.inducedDipoleP[1]+scip[3]*atomJ.inducedDipole[1]);
+           fdir[3] = gfdr*zr + 0.5f*usc5*rr5*;
+&                         (sci[4]*atomI.inducedDipoleP[2]+scip[4]*atomI.inducedDipole[2];
+&                        + sci[3]*atomJ.inducedDipoleP[2]+scip[3]*atomJ.inducedDipole[2]);
+           ftm2i[1] = ftm2i[1] + fdir[1] + findmp[1];
+           ftm2i[2] = ftm2i[2] + fdir[2] + findmp[2];
+           ftm2i[3] = ftm2i[3] + fdir[3] + findmp[3];
+        }
+*/
+
+        // intermediate variables for induced torque terms
+
+        gti[2] = 0.5f * bn[2] * (sci[4]+scip[4]);
+        gti[3] = 0.5f * bn[2] * (sci[3]+scip[3]);
+        gti[4] = gfi[4];
+        gti[5] = gfi[5];
+        gti[6] = gfi[6];
+        gtri[2] = 0.5f * rr5 * (sci[4]*psc5+scip[4]*dsc5);
+        gtri[3] = 0.5f * rr5 * (sci[3]*psc5+scip[3]*dsc5);
+        gtri[4] = gfri[4];
+        gtri[5] = gfri[5];
+        gtri[6] = gfri[6];
+
+        // get the permanent torque with screening
+
+        ttm2[1] = -bn[1]*dixdk[1] + gf[2]*dixr[1]
+            + gf[4]*(dixqkr[1]+dkxqir[1]+rxqidk[1]-2.0f*qixqk[1])
+            - gf[5]*rxqir[1] - gf[7]*(rxqikr[1]+qkrxqir[1]);
+        ttm2[2] = -bn[1]*dixdk[2] + gf[2]*dixr[2]
+            + gf[4]*(dixqkr[2]+dkxqir[2]+rxqidk[2]-2.0f*qixqk[2])
+            - gf[5]*rxqir[2] - gf[7]*(rxqikr[2]+qkrxqir[2]);
+        ttm2[3] = -bn[1]*dixdk[3] + gf[2]*dixr[3]
+            + gf[4]*(dixqkr[3]+dkxqir[3]+rxqidk[3]-2.0f*qixqk[3])
+            - gf[5]*rxqir[3] - gf[7]*(rxqikr[3]+qkrxqir[3]);
+        ttm3[1] = bn[1]*dixdk[1] + gf[3]*dkxr[1]
+            - gf[4]*(dixqkr[1]+dkxqir[1]+rxqkdi[1]-2.0f*qixqk[1])
+            - gf[6]*rxqkr[1] - gf[7]*(rxqkir[1]-qkrxqir[1]);
+        ttm3[2] = bn[1]*dixdk[2] + gf[3]*dkxr[2]
+            - gf[4]*(dixqkr[2]+dkxqir[2]+rxqkdi[2]-2.0f*qixqk[2])
+            - gf[6]*rxqkr[2] - gf[7]*(rxqkir[2]-qkrxqir[2]);
+        ttm3[3] = bn[1]*dixdk[3] + gf[3]*dkxr[3]
+            - gf[4]*(dixqkr[3]+dkxqir[3]+rxqkdi[3]-2.0f*qixqk[3])
+            - gf[6]*rxqkr[3] - gf[7]*(rxqkir[3]-qkrxqir[3]);
+
+        // get the permanent torque without screening
+
+        ttm2r[1] = -rr3*dixdk[1] + gfr[2]*dixr[1]-gfr[5]*rxqir[1]
+            + gfr[4]*(dixqkr[1]+dkxqir[1]+rxqidk[1]-2.0f*qixqk[1])
+            - gfr[7]*(rxqikr[1]+qkrxqir[1]);
+        ttm2r[2] = -rr3*dixdk[2] + gfr[2]*dixr[2]-gfr[5]*rxqir[2]
+            + gfr[4]*(dixqkr[2]+dkxqir[2]+rxqidk[2]-2.0f*qixqk[2])
+            - gfr[7]*(rxqikr[2]+qkrxqir[2]);
+        ttm2r[3] = -rr3*dixdk[3] + gfr[2]*dixr[3]-gfr[5]*rxqir[3]
+            + gfr[4]*(dixqkr[3]+dkxqir[3]+rxqidk[3]-2.0f*qixqk[3])
+            - gfr[7]*(rxqikr[3]+qkrxqir[3]);
+        ttm3r[1] = rr3*dixdk[1] + gfr[3]*dkxr[1] -gfr[6]*rxqkr[1]
+            - gfr[4]*(dixqkr[1]+dkxqir[1]+rxqkdi[1]-2.0f*qixqk[1])
+            - gfr[7]*(rxqkir[1]-qkrxqir[1]);
+        ttm3r[2] = rr3*dixdk[2] + gfr[3]*dkxr[2] -gfr[6]*rxqkr[2]
+            - gfr[4]*(dixqkr[2]+dkxqir[2]+rxqkdi[2]-2.0f*qixqk[2])
+            - gfr[7]*(rxqkir[2]-qkrxqir[2]);
+        ttm3r[3] = rr3*dixdk[3] + gfr[3]*dkxr[3] -gfr[6]*rxqkr[3]
+            - gfr[4]*(dixqkr[3]+dkxqir[3]+rxqkdi[3]-2.0f*qixqk[3])
+            - gfr[7]*(rxqkir[3]-qkrxqir[3]);
+
+        // get the induced torque with screening
+
+        ttm2i[1] = -bn[1]*(dixuk[1]+dixukp[1])*0.5f
+            + gti[2]*dixr[1] + gti[4]*(ukxqir[1]+rxqiuk[1]
+            + ukxqirp[1]+rxqiukp[1])*0.5f - gti[5]*rxqir[1];
+        ttm2i[2] = -bn[1]*(dixuk[2]+dixukp[2])*0.5f
+            + gti[2]*dixr[2] + gti[4]*(ukxqir[2]+rxqiuk[2]
+            + ukxqirp[2]+rxqiukp[2])*0.5f - gti[5]*rxqir[2];
+        ttm2i[3] = -bn[1]*(dixuk[3]+dixukp[3])*0.5f
+            + gti[2]*dixr[3] + gti[4]*(ukxqir[3]+rxqiuk[3]
+            + ukxqirp[3]+rxqiukp[3])*0.5f - gti[5]*rxqir[3];
+        ttm3i[1] = -bn[1]*(dkxui[1]+dkxuip[1])*0.5f
+            + gti[3]*dkxr[1] - gti[4]*(uixqkr[1]+rxqkui[1]
+            + uixqkrp[1]+rxqkuip[1])*0.5f - gti[6]*rxqkr[1];
+        ttm3i[2] = -bn[1]*(dkxui[2]+dkxuip[2])*0.5f
+            + gti[3]*dkxr[2] - gti[4]*(uixqkr[2]+rxqkui[2]
+            + uixqkrp[2]+rxqkuip[2])*0.5f - gti[6]*rxqkr[2];
+        ttm3i[3] = -bn[1]*(dkxui[3]+dkxuip[3])*0.5f
+            + gti[3]*dkxr[3] - gti[4]*(uixqkr[3]+rxqkui[3]
+            + uixqkrp[3]+rxqkuip[3])*0.5f - gti[6]*rxqkr[3];
+
+        // get the induced torque without screening
+
+        ttm2ri[1] = -rr3*(dixuk[1]*psc3+dixukp[1]*dsc3)*0.5f
+            + gtri[2]*dixr[1] + gtri[4]*((ukxqir[1]+rxqiuk[1])*psc5
+            +(ukxqirp[1]+rxqiukp[1])*dsc5)*0.5f - gtri[5]*rxqir[1];
+        ttm2ri[2] = -rr3*(dixuk[2]*psc3+dixukp[2]*dsc3)*0.5f
+            + gtri[2]*dixr[2] + gtri[4]*((ukxqir[2]+rxqiuk[2])*psc5
+            +(ukxqirp[2]+rxqiukp[2])*dsc5)*0.5f - gtri[5]*rxqir[2];
+        ttm2ri[3] = -rr3*(dixuk[3]*psc3+dixukp[3]*dsc3)*0.5f
+            + gtri[2]*dixr[3] + gtri[4]*((ukxqir[3]+rxqiuk[3])*psc5
+            +(ukxqirp[3]+rxqiukp[3])*dsc5)*0.5f - gtri[5]*rxqir[3];
+        ttm3ri[1] = -rr3*(dkxui[1]*psc3+dkxuip[1]*dsc3)*0.5f
+            + gtri[3]*dkxr[1] - gtri[4]*((uixqkr[1]+rxqkui[1])*psc5
+            +(uixqkrp[1]+rxqkuip[1])*dsc5)*0.5f - gtri[6]*rxqkr[1];
+        ttm3ri[2] = -rr3*(dkxui[2]*psc3+dkxuip[2]*dsc3)*0.5f
+            + gtri[3]*dkxr[2] - gtri[4]*((uixqkr[2]+rxqkui[2])*psc5
+            +(uixqkrp[2]+rxqkuip[2])*dsc5)*0.5f - gtri[6]*rxqkr[2];
+        ttm3ri[3] = -rr3*(dkxui[3]*psc3+dkxuip[3]*dsc3)*0.5f
+            + gtri[3]*dkxr[3] - gtri[4]*((uixqkr[3]+rxqkui[3])*psc5
+            +(uixqkrp[3]+rxqkuip[3])*dsc5)*0.5f - gtri[6]*rxqkr[3];
+
+        // handle the case where scaling is used
+
+        for( int j = 1; j <= 3; j++ ){
+           ftm2[j] = f * (ftm2[j]-(1.0f-scalingFactors[MScaleIndex])*ftm2r[j]);
+           ftm2i[j] = f * (ftm2i[j]-ftm2ri[j]);
+           ttm2[j] = f * (ttm2[j]-(1.0f-scalingFactors[MScaleIndex])*ttm2r[j]);
+           ttm2i[j] = f * (ttm2i[j]-ttm2ri[j]);
+           ttm3[j] = f * (ttm3[j]-(1.0f-scalingFactors[MScaleIndex])*ttm3r[j]);
+           ttm3i[j] = f * (ttm3i[j]-ttm3ri[j]);
+        }
+
+        // increment gradient due to force and torque on first site;
+/*
+        dem(1,ii) = dem(1,ii) + ftm2[1];
+        dem(2,ii) = dem(2,ii) + ftm2[2];
+        dem(3,ii) = dem(3,ii) + ftm2[3];
+        dep(1,ii) = dep(1,ii) + ftm2i[1];
+        dep(2,ii) = dep(2,ii) + ftm2i[2];
+        dep(3,ii) = dep(3,ii) + ftm2i[3];
+        call torque (i,ttm2,ttm2i,frcxi,frcyi,frczi);
+
+        // increment gradient due to force and torque on second site;
+
+        dem(1,kk) = dem(1,kk) - ftm2[1];
+        dem(2,kk) = dem(2,kk) - ftm2[2];
+        dem(3,kk) = dem(3,kk) - ftm2[3];
+        dep(1,kk) = dep(1,kk) - ftm2i[1];
+        dep(2,kk) = dep(2,kk) - ftm2i[2];
+        dep(3,kk) = dep(3,kk) - ftm2i[3];
+        call torque (k,ttm3,ttm3i,frcxk,frcyk,frczk);
+*/
+
+    }
+
+    return;
+
+}
+
+__device__ void loadRealSpaceEwaldShared( struct RealSpaceEwaldParticle* sA, unsigned int atomI,
+                                       float4* atomCoord, float* labFrameDipoleJ, float* labQuadrupole,
+                                       float* inducedDipole, float* inducedDipolePolar, float2* dampingFactorAndThole )
+{
+    // coordinates & charge
+
+    sA->x                        = atomCoord[atomI].x;
+    sA->y                        = atomCoord[atomI].y;
+    sA->z                        = atomCoord[atomI].z;
+    sA->q                        = atomCoord[atomI].w;
+
+    // lab dipole
+
+    sA->labFrameDipole[0]         = labFrameDipoleJ[atomI*3];
+    sA->labFrameDipole[1]         = labFrameDipoleJ[atomI*3+1];
+    sA->labFrameDipole[2]         = labFrameDipoleJ[atomI*3+2];
+
+    // lab quadrupole
+
+    sA->labFrameQuadrupole[0]    = labQuadrupole[atomI*9];
+    sA->labFrameQuadrupole[1]    = labQuadrupole[atomI*9+1];
+    sA->labFrameQuadrupole[2]    = labQuadrupole[atomI*9+2];
+    sA->labFrameQuadrupole[3]    = labQuadrupole[atomI*9+3];
+    sA->labFrameQuadrupole[4]    = labQuadrupole[atomI*9+4];
+    sA->labFrameQuadrupole[5]    = labQuadrupole[atomI*9+5];
+    sA->labFrameQuadrupole[6]    = labQuadrupole[atomI*9+6];
+    sA->labFrameQuadrupole[7]    = labQuadrupole[atomI*9+7];
+    sA->labFrameQuadrupole[8]    = labQuadrupole[atomI*9+8];
+
+    // induced dipole
+
+    sA->inducedDipole[0]          = inducedDipole[atomI*3];
+    sA->inducedDipole[1]          = inducedDipole[atomI*3+1];
+    sA->inducedDipole[2]          = inducedDipole[atomI*3+2];
+
+    // induced dipole polar
+
+    sA->inducedDipoleP[0]         = inducedDipolePolar[atomI*3];
+    sA->inducedDipoleP[1]         = inducedDipolePolar[atomI*3+1];
+    sA->inducedDipoleP[2]         = inducedDipolePolar[atomI*3+2];
+
+    sA->damp                     = dampingFactorAndThole[atomI].x;
+    sA->thole                    = dampingFactorAndThole[atomI].y;
+
+}
+
+// Include versions of the kernels for N^2 calculations.
+
+#undef USE_OUTPUT_BUFFER_PER_WARP
+#define METHOD_NAME(a, b) a##N2##b
+#include "kCalculateAmoebaCudaRealSpaceEwald.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##N2ByWarp##b
+#include "kCalculateAmoebaCudaRealSpaceEwald.h"
+
+// reduce psWorkArray_3_1 -> force
+// reduce psWorkArray_3_2 -> torque
+
+static void kReduceForceTorque(amoebaGpuContext amoebaGpu )
+{
+    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
+                             amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
+                             amoebaGpu->psWorkArray_3_1->_pDevStream[0], amoebaGpu->psForce->_pDevStream[0] );
+    LAUNCHERROR("kReduceRealSpaceEwaldForce");
+    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
+                             amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
+                             amoebaGpu->psWorkArray_3_2->_pDevStream[0], amoebaGpu->psTorque->_pDevStream[0] );
+    LAUNCHERROR("kReduceRealSpaceEwaldTorque");
+}
+
+/**---------------------------------------------------------------------------------------
+
+   Compute Amoeba electrostatic force & torque
+
+   @param amoebaGpu        amoebaGpu context
+   @param gpu              OpenMM gpu Cuda context
+
+   --------------------------------------------------------------------------------------- */
+
+void cudaComputeAmoebaRealSpaceEwald( amoebaGpuContext amoebaGpu )
+{
+  
+   // ---------------------------------------------------------------------------------------
+
+    static unsigned int threadsPerBlock = 0;
+
+#ifdef AMOEBA_DEBUG
+    static const char* methodName = "cudaComputeAmoebaRealSpaceEwald";
+    static int timestep = 0;
+    std::vector<int> fileId;
+    timestep++;
+    fileId.resize( 2 );
+    fileId[0] = timestep;
+    fileId[1] = 1;
+#endif
+
+    // ---------------------------------------------------------------------------------------
+
+    gpuContext gpu = amoebaGpu->gpuContext;
+
+    // apparently debug array can take up nontrivial no. registers
+
+#ifdef AMOEBA_DEBUG
+    if( amoebaGpu->log ){
+      (void) fprintf( amoebaGpu->log, "%s %d maxCovalentDegreeSz=%d"
+                      " gamma=%.3e scalingDistanceCutoff=%.3f ZZZ\n",
+                      methodName, gpu->natoms,
+                      amoebaGpu->maxCovalentDegreeSz, amoebaGpu->pGamma,
+                      amoebaGpu->scalingDistanceCutoff );
+    }   
+   int paddedNumberOfAtoms                    = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
+    CUDAStream<float4>* debugArray            = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
+    memset( debugArray->_pSysStream[0],      0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
+    debugArray->Upload();
+    unsigned int targetAtom                   = 0;
+#endif
+
+    // on first pass, set threads/block
+
+    if( threadsPerBlock == 0 ){
+      unsigned int maxThreads;
+      if (gpu->sm_version >= SM_20)
+          maxThreads = 384;
+      else if (gpu->sm_version >= SM_12)
+          maxThreads = 128;
+      else
+          maxThreads = 64;
+      threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(RealSpaceEwaldParticle)), maxThreads);
+    }
+
+    kClearFields_3( amoebaGpu, 2 );
+
+    if (gpu->bOutputBufferPerWarp){
+
+      kCalculateAmoebaCudaRealSpaceEwaldN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(RealSpaceEwaldParticle)*threadsPerBlock>>>(
+                                                                         amoebaGpu->psWorkUnit->_pDevStream[0],
+                                                                         gpu->psPosq4->_pDevStream[0],
+                                                                         amoebaGpu->psLabFrameDipole->_pDevStream[0],
+                                                                         amoebaGpu->psLabFrameQuadrupole->_pDevStream[0],
+                                                                         amoebaGpu->psInducedDipole->_pDevStream[0],
+                                                                         amoebaGpu->psInducedDipolePolar->_pDevStream[0],
+                                                                         amoebaGpu->psWorkArray_3_1->_pDevStream[0],
+#ifdef AMOEBA_DEBUG
+                                                                         amoebaGpu->psWorkArray_3_2->_pDevStream[0],
+                                                                         debugArray->_pDevStream[0], targetAtom );
+#else
+                                                                         amoebaGpu->psWorkArray_3_2->_pDevStream[0] );
+#endif
+
+    } else {
+
+#ifdef AMOEBA_DEBUG
+      (void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaRealSpaceEwaldN2Forces no warp:  numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n",
+                      amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
+                      sizeof(RealSpaceEwaldParticle), sizeof(RealSpaceEwaldParticle)*threadsPerBlock, amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
+      (void) fflush( amoebaGpu->log );
+#endif
+
+      kCalculateAmoebaCudaRealSpaceEwaldN2Forces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(RealSpaceEwaldParticle)*threadsPerBlock>>>(
+                                                                         amoebaGpu->psWorkUnit->_pDevStream[0],
+                                                                         gpu->psPosq4->_pDevStream[0],
+                                                                         amoebaGpu->psLabFrameDipole->_pDevStream[0],
+                                                                         amoebaGpu->psLabFrameQuadrupole->_pDevStream[0],
+                                                                         amoebaGpu->psInducedDipole->_pDevStream[0],
+                                                                         amoebaGpu->psInducedDipolePolar->_pDevStream[0],
+                                                                         amoebaGpu->psWorkArray_3_1->_pDevStream[0],
+#ifdef AMOEBA_DEBUG
+                                                                         amoebaGpu->psWorkArray_3_2->_pDevStream[0],
+                                                                         debugArray->_pDevStream[0], targetAtom );
+#else
+                                                                         amoebaGpu->psWorkArray_3_2->_pDevStream[0] );
+#endif
+    }
+    LAUNCHERROR("kCalculateAmoebaCudaRealSpaceEwaldN2Forces");
+    kReduceForceTorque( amoebaGpu );
+
+#ifdef AMOEBA_DEBUG
+    if( amoebaGpu->log ){
+
+      amoebaGpu->psForce->Download();
+      amoebaGpu->psTorque->Download();
+      debugArray->Download();
+
+      (void) fprintf( amoebaGpu->log, "Finished RealSpaceEwald kernel execution\n" ); (void) fflush( amoebaGpu->log );
+
+      int maxPrint        = 1400;
+      for( int ii = 0; ii < gpu->natoms; ii++ ){
+         (void) fprintf( amoebaGpu->log, "%5d ", ii); 
+
+          int indexOffset     = ii*3;
+    
+         // force
+
+         (void) fprintf( amoebaGpu->log,"RealSpaceEwaldF [%16.9e %16.9e %16.9e] ",
+                         amoebaGpu->psForce->_pSysStream[0][indexOffset],
+                         amoebaGpu->psForce->_pSysStream[0][indexOffset+1],
+                         amoebaGpu->psForce->_pSysStream[0][indexOffset+2] );
+    
+         // torque
+
+         (void) fprintf( amoebaGpu->log,"RealSpaceEwaldT [%16.9e %16.9e %16.9e] ",
+                         amoebaGpu->psTorque->_pSysStream[0][indexOffset],
+                         amoebaGpu->psTorque->_pSysStream[0][indexOffset+1],
+                         amoebaGpu->psTorque->_pSysStream[0][indexOffset+2] );
+
+         // coords
+
+#if 0
+          (void) fprintf( amoebaGpu->log,"x[%16.9e %16.9e %16.9e] ",
+                          gpu->psPosq4->_pSysStream[0][ii].x,
+                          gpu->psPosq4->_pSysStream[0][ii].y,
+                          gpu->psPosq4->_pSysStream[0][ii].z);
+
+
+         for( int jj = 0; jj < gpu->natoms && jj < 5; jj++ ){
+             int debugIndex = jj*gpu->natoms + ii;
+             float xx       =  gpu->psPosq4->_pSysStream[0][jj].x -  gpu->psPosq4->_pSysStream[0][ii].x;
+             float yy       =  gpu->psPosq4->_pSysStream[0][jj].y -  gpu->psPosq4->_pSysStream[0][ii].y;
+             float zz       =  gpu->psPosq4->_pSysStream[0][jj].z -  gpu->psPosq4->_pSysStream[0][ii].z;
+             (void) fprintf( amoebaGpu->log,"\n%4d %4d delta [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] ",
+                             ii, jj, xx, yy, zz,
+                             debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y, debugArray->_pSysStream[0][debugIndex].z );
+
+         }
+#endif
+         (void) fprintf( amoebaGpu->log,"\n" );
+         if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
+              ii = gpu->natoms - maxPrint;
+         }
+      }
+      if( 1 ){
+          (void) fprintf( amoebaGpu->log,"DebugElec\n" );
+          int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
+          for( int jj = 0; jj < gpu->natoms; jj++ ){
+              int debugIndex = jj;
+              for( int kk = 0; kk < 5; kk++ ){
+                  (void) fprintf( amoebaGpu->log,"%5d %5d [%16.9e %16.9e %16.9e %16.9e] E11\n", targetAtom, jj,
+                                  debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
+                                  debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w );
+                  debugIndex += paddedNumberOfAtoms;
+              }
+              (void) fprintf( amoebaGpu->log,"\n" );
+          }
+      }
+      (void) fflush( amoebaGpu->log );
+
+      if( 0 ){
+          (void) fprintf( amoebaGpu->log, "%s Tiled F & T\n", methodName ); fflush( amoebaGpu->log );
+          int maxPrint = 12;
+          for( int ii = 0; ii < gpu->natoms; ii++ ){
+    
+              // print cpu & gpu reductions
+    
+              int offset  = 3*ii;
+    
+              (void) fprintf( amoebaGpu->log,"%6d F[%16.7e %16.7e %16.7e] T[%16.7e %16.7e %16.7e]\n", ii,
+                              amoebaGpu->psForce->_pSysStream[0][offset],
+                              amoebaGpu->psForce->_pSysStream[0][offset+1],
+                              amoebaGpu->psForce->_pSysStream[0][offset+2],
+                              amoebaGpu->psTorque->_pSysStream[0][offset],
+                              amoebaGpu->psTorque->_pSysStream[0][offset+1],
+                              amoebaGpu->psTorque->_pSysStream[0][offset+2] );
+              if( (ii == maxPrint) && (ii < (gpu->natoms - maxPrint)) )ii = gpu->natoms - maxPrint; 
+          }   
+      }   
+
+      if( 1 ){
+          std::vector<int> fileId;
+          //fileId.push_back( 0 );
+          VectorOfDoubleVectors outputVector;
+          cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,            outputVector );
+          cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psForce,      outputVector );
+          cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psTorque,     outputVector);
+          cudaWriteVectorOfDoubleVectorsToFile( "CudaForceTorque", fileId, outputVector );
+       }
+
+    }   
+    delete debugArray;
+
+#endif
+
+   // ---------------------------------------------------------------------------------------
+}
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaRealSpaceEwald.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaRealSpaceEwald.h
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "amoebaScaleFactors.h"
+
+__global__ 
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(384, 1)
+#elif (__CUDA_ARCH__ >= 130)
+__launch_bounds__(128, 1)
+#else
+__launch_bounds__(64, 1)
+#endif
+void METHOD_NAME(kCalculateAmoebaCudaRealSpaceEwald, Forces_kernel)(
+                            unsigned int* workUnit,
+                            float4* atomCoord,
+                            float* labFrameDipole,
+                            float* labFrameQuadrupole,
+                            float* inducedDipole,
+                            float* inducedDipolePolar,
+                            float* outputForce,
+                            float* outputTorque
+
+#ifdef AMOEBA_DEBUG
+                           , float4* debugArray, unsigned int targetAtom
+#endif
+){
+
+#ifdef AMOEBA_DEBUG
+    float4 pullBack[20];
+#endif
+
+    extern __shared__ RealSpaceEwaldParticle sA[];
+
+    unsigned int totalWarps      = gridDim.x*blockDim.x/GRID;
+    unsigned int warp            = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
+    unsigned int numWorkUnits    = cSim.pInteractionCount[0];
+    unsigned int pos             = warp*numWorkUnits/totalWarps;
+    unsigned int end             = (warp+1)*numWorkUnits/totalWarps;
+    unsigned int lasty           = 0xFFFFFFFF;
+    float totalEnergy            = 0.0f;     
+
+    float scalingFactors[LastScalingIndex];
+
+    while (pos < end)
+    {
+
+        unsigned int x;
+        unsigned int y;
+        bool bExclusionFlag;
+
+        // Extract cell coordinates
+
+        decodeCell( workUnit[pos], &x, &y, &bExclusionFlag );
+
+        unsigned int tgx              = threadIdx.x & (GRID - 1);
+        unsigned int tbx              = threadIdx.x - tgx;
+        unsigned int tj               = tgx;
+
+        RealSpaceEwaldParticle* psA    = &sA[tbx];
+        unsigned int atomI            = x + tgx;
+        RealSpaceEwaldParticle localParticle;
+        loadRealSpaceEwaldShared(&localParticle, atomI,
+                                atomCoord, labFrameDipole, labFrameQuadrupole,
+                                inducedDipole, inducedDipolePolar, cAmoebaSim.pDampingFactorAndThole );
+
+        localParticle.force[0]                   = 0.0f;
+        localParticle.force[1]                   = 0.0f;
+        localParticle.force[2]                   = 0.0f;
+
+        localParticle.torque[0]                  = 0.0f;
+        localParticle.torque[1]                  = 0.0f;
+        localParticle.torque[2]                  = 0.0f;
+
+        scalingFactors[PScaleIndex]   = 1.0f;
+        scalingFactors[DScaleIndex]   = 1.0f;
+        scalingFactors[UScaleIndex]   = 1.0f;
+        scalingFactors[MScaleIndex]   = 1.0f;
+
+        if (x == y) // Handle diagonals uniquely at 50% efficiency
+        {
+
+            // load shared data
+
+           loadRealSpaceEwaldShared( &(sA[threadIdx.x]), atomI,
+                                    atomCoord, labFrameDipole, labFrameQuadrupole,
+                                    inducedDipole, inducedDipolePolar, cAmoebaSim.pDampingFactorAndThole );
+
+            if (!bExclusionFlag)
+            {
+
+                // this branch is never exercised since it includes the
+                // interaction between atomI and itself which is always excluded
+
+                for (unsigned int j = 0; j < GRID; j++)
+                {
+
+                    float force[3];
+                    float torque[2][3];
+
+                    float energy;
+                    calculateRealSpaceEwaldPairIxn_kernel( localParticle,                              psA[j],
+                                                          cAmoebaSim.scalingDistanceCutoff,           scalingFactors,
+                                                          force, torque, &energy
+#ifdef AMOEBA_DEBUG
+, pullBack
+#endif
+ );
+
+                    unsigned int mask       =  ( (atomI == (y + j)) || (atomI >= cAmoebaSim.numberOfAtoms) || ((y+j) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
+
+                    // add to field at atomI the field due atomJ's charge/dipole/quadrupole
+
+                    localParticle.force[0]            += mask ? force[0]     : 0.0f;
+                    localParticle.force[1]            += mask ? force[1]     : 0.0f;
+                    localParticle.force[2]            += mask ? force[2]     : 0.0f;
+
+                    localParticle.torque[0]           += mask ? torque[0][0] : 0.0f;
+                    localParticle.torque[1]           += mask ? torque[0][1] : 0.0f;
+                    localParticle.torque[2]           += mask ? torque[0][2] : 0.0f;
+ 
+                    totalEnergy            += mask ? 0.5*energy   : 0.0f;
+                }
+
+            }
+            else  // bExclusion
+            {
+                unsigned int xi       = x >> GRIDBITS;
+                unsigned int cell     = xi + xi*cAmoebaSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
+                int  dScaleMask       = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
+                int2 pScaleMask       = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
+                int2 mScaleMask       = cAmoebaSim.pM_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
+
+                for (unsigned int j = 0; j < GRID; j++)
+                {
+
+                    float force[3];
+                    float torque[2][3];
+
+                    unsigned int atomJ = y + j;
+
+                    // set scale factors
+
+                    getMaskedDScaleFactor( j, dScaleMask, scalingFactors + DScaleIndex );
+                    getMaskedPScaleFactor( j, pScaleMask, scalingFactors + PScaleIndex );
+                    getMaskedMScaleFactor( j, mScaleMask, scalingFactors + MScaleIndex );
+
+                    // force
+
+                    float energy;
+                    calculateRealSpaceEwaldPairIxn_kernel( localParticle,                              psA[j],
+                                                          cAmoebaSim.scalingDistanceCutoff,           scalingFactors,
+                                                          force, torque, &energy
+#ifdef AMOEBA_DEBUG
+, pullBack
+#endif
+ );
+
+                    // nan*0.0 = nan not 0.0, so explicitly exclude (atomI == atomJ) contribution
+                    // by setting match flag
+
+                    unsigned int mask       =  ( (atomI == atomJ) || (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
+
+                    // add to field at atomI the field due atomJ's charge/dipole/quadrupole
+
+                    localParticle.force[0]            += mask ? force[0]     : 0.0f;
+                    localParticle.force[1]            += mask ? force[1]     : 0.0f;
+                    localParticle.force[2]            += mask ? force[2]     : 0.0f;
+
+                    localParticle.torque[0]           += mask ? torque[0][0] : 0.0f;
+                    localParticle.torque[1]           += mask ? torque[0][1] : 0.0f;
+                    localParticle.torque[2]           += mask ? torque[0][2] : 0.0f;
+
+                    totalEnergy            += mask ? 0.5*energy   : 0.0f;
+
+#ifdef AMOEBA_DEBUG
+if( atomI == targetAtom ){
+            unsigned int index                 = (atomI == targetAtom) ? atomJ : atomI;
+
+            debugArray[index].x                = (float) atomI;
+            debugArray[index].y                = (float) atomJ;
+            debugArray[index].z                = 1.0f;
+            debugArray[index].w                = (float) y;
+/*
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            debugArray[index].x                = mask ? force[0]     : 0.0f;
+            debugArray[index].y                = mask ? force[1]     : 0.0f;
+            debugArray[index].z                = mask ? force[2]     : 0.0f;
+
+
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            debugArray[index].x                = mask ? torque[0][0] : 0.0f;
+            debugArray[index].y                = mask ? torque[0][1] : 0.0f;
+            debugArray[index].z                = mask ? torque[0][2] : 0.0f;
+*/
+
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            int pullIndex                      = 0;
+            debugArray[index].x                = pullBack[pullIndex].x;
+            debugArray[index].y                = pullBack[pullIndex].y;
+            debugArray[index].z                = pullBack[pullIndex].z;
+            debugArray[index].w                = pullBack[pullIndex].w;
+
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            pullIndex++;
+            debugArray[index].x                = pullBack[pullIndex].x;
+            debugArray[index].y                = pullBack[pullIndex].y;
+            debugArray[index].z                = pullBack[pullIndex].z;
+            debugArray[index].w                = pullBack[pullIndex].w;
+
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            pullIndex++;
+            debugArray[index].x                = pullBack[pullIndex].x;
+            debugArray[index].y                = pullBack[pullIndex].y;
+            debugArray[index].z                = pullBack[pullIndex].z;
+            debugArray[index].w                = pullBack[pullIndex].w;
+
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            pullIndex++;
+            debugArray[index].x                = pullBack[pullIndex].x;
+            debugArray[index].y                = pullBack[pullIndex].y;
+            debugArray[index].z                = pullBack[pullIndex].z;
+            debugArray[index].w                = pullBack[pullIndex].w;
+
+}
+#endif
+
+                }
+            }
+
+            // Write results
+
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+            float  of;
+            unsigned int offset                 = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
+            of                                  = outputForce[offset];
+            of                                 += localParticle.force[0];
+            outputForce[offset]                 = of;
+
+            of                                  = outputForce[offset+1];
+            of                                 += localParticle.force[1];
+            outputForce[offset+1]               = of;
+
+            of                                  = outputForce[offset+2];
+            of                                 += localParticle.force[2];
+            outputForce[offset+2]               = of;
+
+            of                                  = outputTorque[offset];
+            of                                 += localParticle.torque[0];
+            outputTorque[offset]                = of;
+
+            of                                  = outputTorque[offset+1];
+            of                                 += localParticle.torque[1];
+            outputTorque[offset+1]              = of;
+
+            of                                  = outputTorque[offset+2];
+            of                                 += localParticle.torque[2];
+            outputTorque[offset+2]              = of;
+
+#else
+            unsigned int offset                 = 3*(x + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
+            outputForce[offset]                 = localParticle.force[0];
+            outputForce[offset+1]               = localParticle.force[1];
+            outputForce[offset+2]               = localParticle.force[2];
+
+            outputTorque[offset]                = localParticle.torque[0];
+            outputTorque[offset+1]              = localParticle.torque[1];
+            outputTorque[offset+2]              = localParticle.torque[2];
+#endif
+
+        }
+        else        // 100% utilization
+        {
+            // Read fixed atom data into registers and GRF
+
+            if (lasty != y)
+            {
+                // load shared data
+
+               loadRealSpaceEwaldShared( &(sA[threadIdx.x]), (y+tgx),
+                                        atomCoord, labFrameDipole, labFrameQuadrupole,
+                                        inducedDipole, inducedDipolePolar, cAmoebaSim.pDampingFactorAndThole );
+
+            }
+
+            sA[threadIdx.x].force[0]     = 0.0f;
+            sA[threadIdx.x].force[1]     = 0.0f;
+            sA[threadIdx.x].force[2]     = 0.0f;
+
+            sA[threadIdx.x].torque[0]    = 0.0f;
+            sA[threadIdx.x].torque[1]    = 0.0f;
+            sA[threadIdx.x].torque[2]    = 0.0f;
+
+            if (!bExclusionFlag)
+            {
+                for (unsigned int j = 0; j < GRID; j++)
+                {
+
+                    float force[3];
+                    float torque[2][3];
+
+                    unsigned int atomJ = y + tj;
+
+                    float energy;
+                    calculateRealSpaceEwaldPairIxn_kernel( localParticle,                              psA[tj],
+                                                                 cAmoebaSim.scalingDistanceCutoff,           scalingFactors,
+                                                                 force, torque, &energy
+#ifdef AMOEBA_DEBUG
+, pullBack
+#endif
+ );
+       
+                           unsigned int mask       =  ( (atomI >= cAmoebaSim.numberOfAtoms) || ( atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
+       
+                           // add force and torque to atom I due atom J
+       
+                           localParticle.force[0]         += mask ? force[0]      : 0.0f;
+                           localParticle.force[1]         += mask ? force[1]      : 0.0f;
+                           localParticle.force[2]         += mask ? force[2]      : 0.0f;
+       
+                           localParticle.torque[0]        += mask ? torque[0][0]  : 0.0f;
+                           localParticle.torque[1]        += mask ? torque[0][1]  : 0.0f;
+                           localParticle.torque[2]        += mask ? torque[0][2]  : 0.0f;
+
+                           totalEnergy         += mask ? energy        : 0.0f;
+           
+                           // add force and torque to atom J due atom I
+       
+                           psA[tj].force[0]     -= mask ?  force[0]     : 0.0f;
+                           psA[tj].force[1]     -= mask ?  force[1]     : 0.0f;
+                           psA[tj].force[2]     -= mask ?  force[2]     : 0.0f;
+       
+                           psA[tj].torque[0]    += mask ?  torque[1][0] : 0.0f;
+                           psA[tj].torque[1]    += mask ?  torque[1][1] : 0.0f;
+                           psA[tj].torque[2]    += mask ?  torque[1][2] : 0.0f;
+       
+       
+#ifdef AMOEBA_DEBUG
+if( atomI == targetAtom  || atomJ == targetAtom ){
+        unsigned int index                 = (atomI == targetAtom) ? atomJ : atomI;
+        unsigned int indexI                = (atomI == targetAtom) ? 0 : 1;
+        unsigned int indexJ                = (atomI == targetAtom) ? 1 : 0;
+        float forceSign                    = (atomI == targetAtom) ? 1.0f : -1.0f;
+
+        debugArray[index].x                = (float) atomI;
+        debugArray[index].y                = (float) atomJ;
+        debugArray[index].z                = 2.0f;
+        debugArray[index].w                = (float) y;
+/*
+        index                             += cAmoebaSim.paddedNumberOfAtoms;
+        debugArray[index].x                = mask ? forceSign*force[0]     : 0.0f;
+        debugArray[index].y                = mask ? forceSign*force[1]     : 0.0f;
+        debugArray[index].z                = mask ? forceSign*force[2]     : 0.0f;
+
+
+        index                             += cAmoebaSim.paddedNumberOfAtoms;
+        debugArray[index].x                = mask ? torque[indexI][0] : 0.0f;
+        debugArray[index].y                = mask ? torque[indexI][1] : 0.0f;
+        debugArray[index].z                = mask ? torque[indexI][2] : 0.0f;
+
+        index                             += cAmoebaSim.paddedNumberOfAtoms;
+        debugArray[index].x                = mask ? torque[indexJ][0] : 0.0f;
+        debugArray[index].y                = mask ? torque[indexJ][1] : 0.0f;
+        debugArray[index].z                = mask ? torque[indexJ][2] : 0.0f;
+*/
+        index                             += cAmoebaSim.paddedNumberOfAtoms;
+        int pullIndex                      = 0;
+        debugArray[index].x                = pullBack[pullIndex].x;
+        debugArray[index].y                = pullBack[pullIndex].y;
+        debugArray[index].z                = pullBack[pullIndex].z;
+        debugArray[index].w                = pullBack[pullIndex].w;
+
+        index                             += cAmoebaSim.paddedNumberOfAtoms;
+        pullIndex++;
+        debugArray[index].x                = pullBack[pullIndex].x;
+        debugArray[index].y                = pullBack[pullIndex].y;
+        debugArray[index].z                = pullBack[pullIndex].z;
+        debugArray[index].w                = pullBack[pullIndex].w;
+
+        index                             += cAmoebaSim.paddedNumberOfAtoms;
+        pullIndex++;
+        debugArray[index].x                = pullBack[pullIndex].x;
+        debugArray[index].y                = pullBack[pullIndex].y;
+        debugArray[index].z                = pullBack[pullIndex].z;
+        debugArray[index].w                = pullBack[pullIndex].w;
+       
+        index                             += cAmoebaSim.paddedNumberOfAtoms;
+        pullIndex++;
+        debugArray[index].x                = pullBack[pullIndex].x;
+        debugArray[index].y                = pullBack[pullIndex].y;
+        debugArray[index].z                = pullBack[pullIndex].z;
+        debugArray[index].w                = pullBack[pullIndex].w;
+       
+}
+#endif
+       
+                           tj                  = (tj + 1) & (GRID - 1);
+       
+                       }
+                   }
+                   else  // bExclusion
+                   {
+                       // Read fixed atom data into registers and GRF
+       
+                       unsigned int xi   = x >> GRIDBITS;
+                       unsigned int yi   = y >> GRIDBITS;
+                       unsigned int cell = xi+yi*cAmoebaSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
+                       int  dScaleMask   = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
+                       int2 pScaleMask   = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
+                       int2 mScaleMask   = cAmoebaSim.pM_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
+       
+                       for (unsigned int j = 0; j < GRID; j++)
+                       {
+       
+                           float force[3];
+                           float torque[2][3];
+       
+                           unsigned int atomJ = y + tj;
+       
+                           // set scale factors
+       
+                           getMaskedDScaleFactor( tj, dScaleMask, scalingFactors + DScaleIndex );
+                           getMaskedPScaleFactor( tj, pScaleMask, scalingFactors + PScaleIndex );
+                           getMaskedMScaleFactor( tj, mScaleMask, scalingFactors + MScaleIndex );
+       
+                           // force
+       
+                    float energy;
+                    calculateRealSpaceEwaldPairIxn_kernel( localParticle,                              psA[tj],
+                                                          cAmoebaSim.scalingDistanceCutoff,           scalingFactors,
+                                                          force, torque, &energy
+#ifdef AMOEBA_DEBUG
+, pullBack
+#endif
+ );
+
+                    // check if atoms out-of-bounds
+
+                    unsigned int mask    =  ( (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
+
+                    // add force and torque to atom I due atom J
+
+                    localParticle.force[0]         += mask ? force[0]      : 0.0f;
+                    localParticle.force[1]         += mask ? force[1]      : 0.0f;
+                    localParticle.force[2]         += mask ? force[2]      : 0.0f;
+
+                    localParticle.torque[0]        += mask ? torque[0][0]  : 0.0f;
+                    localParticle.torque[1]        += mask ? torque[0][1]  : 0.0f;
+                    localParticle.torque[2]        += mask ? torque[0][2]  : 0.0f;
+    
+                    totalEnergy         += mask ? energy        : 0.0f;
+
+                    // add force and torque to atom J due atom I
+
+                    psA[tj].force[0]     -= mask ?  force[0]     : 0.0f;
+                    psA[tj].force[1]     -= mask ?  force[1]     : 0.0f;
+                    psA[tj].force[2]     -= mask ?  force[2]     : 0.0f;
+
+                    psA[tj].torque[0]    += mask ?  torque[1][0] : 0.0f;
+                    psA[tj].torque[1]    += mask ?  torque[1][1] : 0.0f;
+                    psA[tj].torque[2]    += mask ?  torque[1][2] : 0.0f;
+
+
+#ifdef AMOEBA_DEBUG
+if( atomI == targetAtom  || atomJ == targetAtom ){
+         unsigned int index                 = (atomI == targetAtom) ? atomJ : atomI;
+         unsigned int indexI                = (atomI == targetAtom) ? 0 : 1;
+         unsigned int indexJ                = (atomI == targetAtom) ? 1 : 0;
+         float forceSign                    = (atomI == targetAtom) ? 1.0f : -1.0f;
+
+         debugArray[index].x                = (float) atomI;
+         debugArray[index].y                = (float) atomJ;
+         debugArray[index].z                = 3.0f;
+         debugArray[index].w                = (float) y;
+/*
+         index                             += cAmoebaSim.paddedNumberOfAtoms;
+         debugArray[index].x                = mask ? forceSign*force[0]     : 0.0f;
+         debugArray[index].y                = mask ? forceSign*force[1]     : 0.0f;
+         debugArray[index].z                = mask ? forceSign*force[2]     : 0.0f;
+
+
+         index                             += cAmoebaSim.paddedNumberOfAtoms;
+         debugArray[index].x                = mask ? torque[indexI][0] : 0.0f;
+         debugArray[index].y                = mask ? torque[indexI][1] : 0.0f;
+         debugArray[index].z                = mask ? torque[indexI][2] : 0.0f;
+
+         index                             += cAmoebaSim.paddedNumberOfAtoms;
+         debugArray[index].x                = mask ? torque[indexJ][0] : 0.0f;
+         debugArray[index].y                = mask ? torque[indexJ][1] : 0.0f;
+         debugArray[index].z                = mask ? torque[indexJ][2] : 0.0f;
+*/
+        index                             += cAmoebaSim.paddedNumberOfAtoms;
+        int pullIndex                      = 0;
+        debugArray[index].x                = pullBack[pullIndex].x;
+        debugArray[index].y                = pullBack[pullIndex].y;
+        debugArray[index].z                = pullBack[pullIndex].z;
+        debugArray[index].w                = pullBack[pullIndex].w;
+
+        index                             += cAmoebaSim.paddedNumberOfAtoms;
+        pullIndex++;
+        debugArray[index].x                = pullBack[pullIndex].x;
+        debugArray[index].y                = pullBack[pullIndex].y;
+        debugArray[index].z                = pullBack[pullIndex].z;
+        debugArray[index].w                = pullBack[pullIndex].w;
+
+        index                             += cAmoebaSim.paddedNumberOfAtoms;
+        pullIndex++;
+        debugArray[index].x                = pullBack[pullIndex].x;
+        debugArray[index].y                = pullBack[pullIndex].y;
+        debugArray[index].z                = pullBack[pullIndex].z;
+        debugArray[index].w                = pullBack[pullIndex].w;
+       
+        index                             += cAmoebaSim.paddedNumberOfAtoms;
+        pullIndex++;
+        debugArray[index].x                = pullBack[pullIndex].x;
+        debugArray[index].y                = pullBack[pullIndex].y;
+        debugArray[index].z                = pullBack[pullIndex].z;
+        debugArray[index].w                = pullBack[pullIndex].w;
+       
+#if 0
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            debugArray[index].x                = scalingFactors[DScaleIndex];
+            debugArray[index].y                = dScaleVal;
+            debugArray[index].z                = scalingFactors[PScaleIndex];
+            debugArray[index].w                = pScaleVal;
+
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            debugArray[index].x                = scalingFactors[MScaleIndex];
+            debugArray[index].y                = mScaleVal;
+            for( int pIndex = 0; pIndex < 14; pIndex++ ){
+                index                             += cAmoebaSim.paddedNumberOfAtoms;
+                debugArray[index].x                = pullBack[pIndex].x;
+                debugArray[index].y                = pullBack[pIndex].y;
+                debugArray[index].z                = pullBack[pIndex].z;
+                debugArray[index].w                = pullBack[pIndex].w;
+            }
+
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            debugArray[index].x                = labFrameDipole[3*atomI];
+            debugArray[index].y                = labFrameDipole[3*atomI+1];
+            debugArray[index].z                = labFrameDipole[3*atomI+2];
+            debugArray[index].w                = 25.0f;
+
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            debugArray[index].x                = labFrameDipole[3*atomJ];
+            debugArray[index].y                = labFrameDipole[3*atomJ+1];
+            debugArray[index].z                = labFrameDipole[3*atomJ+2];
+            debugArray[index].w                = 26.0f;
+
+            index                             += cAmoebaSim.paddedNumberOfAtoms;
+            debugArray[index].x                = jDipole[0];
+            debugArray[index].y                = jDipole[1];
+            debugArray[index].z                = jDipole[2];
+            debugArray[index].w                = 27.0f;
+#endif
+
+
+}
+#endif
+
+
+                    tj                  = (tj + 1) & (GRID - 1);
+                }
+            }
+
+            // Write results
+
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+
+            float of;
+            unsigned int offset                 = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
+            of                                  = outputForce[offset];
+            of                                 += localParticle.force[0];
+            outputForce[offset]                 = of;
+
+            of                                  = outputForce[offset+1];
+            of                                 += localParticle.force[1];
+            outputForce[offset+1]               = of;
+
+            of                                  = outputForce[offset+2];
+            of                                 += localParticle.force[2];
+            outputForce[offset+2]               = of;
+
+            of                                  = outputTorque[offset];
+            of                                 += localParticle.torque[0];
+            outputTorque[offset]                 = of;
+
+            of                                  = outputTorque[offset+1];
+            of                                 += localParticle.torque[1];
+            outputTorque[offset+1]               = of;
+
+            of                                  = outputTorque[offset+2];
+            of                                 += localParticle.torque[2];
+            outputTorque[offset+2]               = of;
+
+            offset                              = 3*(y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
+
+            of                                  = outputForce[offset];
+            of                                 += sA[threadIdx.x].force[0];
+            outputForce[offset]                 = of;
+
+            of                                  = outputForce[offset+1];
+            of                                 += sA[threadIdx.x].force[1];
+            outputForce[offset+1]               = of;
+
+            of                                  = outputForce[offset+2];
+            of                                 += sA[threadIdx.x].force[2];
+            outputForce[offset+2]               = of;
+
+            of                                  = outputTorque[offset];
+            of                                 += sA[threadIdx.x].torque[0];
+            outputTorque[offset]                = of;
+
+            of                                  = outputTorque[offset+1];
+            of                                 += sA[threadIdx.x].torque[1];
+            outputTorque[offset+1]              = of;
+
+            of                                  = outputTorque[offset+2];
+            of                                 += sA[threadIdx.x].torque[2];
+            outputTorque[offset+2]              = of;
+
+#else
+            unsigned int offset                 = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
+
+            outputForce[offset]                 = localParticle.force[0];
+            outputForce[offset+1]               = localParticle.force[1];
+            outputForce[offset+2]               = localParticle.force[2];
+
+            outputTorque[offset]                = localParticle.torque[0];
+            outputTorque[offset+1]              = localParticle.torque[1];
+            outputTorque[offset+2]              = localParticle.torque[2];
+
+            offset                              = 3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
+
+            outputForce[offset]                 = sA[threadIdx.x].force[0];
+            outputForce[offset+1]               = sA[threadIdx.x].force[1];
+            outputForce[offset+2]               = sA[threadIdx.x].force[2];
+
+            outputTorque[offset]                = sA[threadIdx.x].torque[0];
+            outputTorque[offset+1]              = sA[threadIdx.x].torque[1];
+            outputTorque[offset+2]              = sA[threadIdx.x].torque[2];
+
+
+#endif
+            lasty = y;
+        }
+
+        pos++;
+    }
+    cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += totalEnergy;
+}
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaSASA.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaSASA.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-//-----------------------------------------------------------------------------------------
-
-//-----------------------------------------------------------------------------------------
-
-#include "amoebaCudaKernels.h"
-#include "kCalculateAmoebaCudaUtilities.h"
-#include <time.h>
-
-//#define AMOEBA_DEBUG
-
-static __constant__ cudaGmxSimulation cSim;
-static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;
-
-void SetCalculateAmoebaCudaSASAForcesSim(amoebaGpuContext amoebaGpu)
-{
-    cudaError_t status;
-    gpuContext gpu = amoebaGpu->gpuContext;
-    status         = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));    
-    RTERROR(status, "SetCalculateAmoebaCudaSASASim: cudaMemcpyToSymbol: SetSim copy to cSim failed");
-    status         = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation));    
-    RTERROR(status, "SetCalculateAmoebaCudaSASASim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed");
-}
-
-void GetCalculateAmoebaCudaSASAForcesSim(amoebaGpuContext amoebaGpu)
-{
-    cudaError_t status;
-    gpuContext gpu = amoebaGpu->gpuContext;
-    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));    
-    RTERROR(status, "GetCalculateAmoebaCudaSASASim: cudaMemcpyFromSymbol: SetSim copy from cSim failed");
-    status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));    
-    RTERROR(status, "GetCalculateAmoebaCudaSASASim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
-}
-
-
-__device__ void insertionSort( unsigned int length, float* A, int* P)
-{
-    for( unsigned int ii = 1; ii < length; ii++ ){
-        float value = A[ii];
-        int index   = P[ii];
-        int jj      = ii;
-        while( jj > 0 && A[jj-1] > value ){
-            A[jj] = A[jj-1];
-            P[jj] = P[jj-1];
-            jj--;
-        }
-        A[jj] = value;
-        P[jj] = index;
-    }
-}
-
-__device__ void insertionSortArc( unsigned int length, float4* arcList )
-{
-    for( unsigned int ii = 1; ii < length; ii++ ){
-        float4 value = arcList[ii];
-        int jj       = ii;
-        while( jj > 0 && arcList[jj-1].x > value.x ){
-            arcList[jj] = arcList[jj-1];
-            jj--;
-        }
-        arcList[jj] = value;
-    }
-}
-
-__device__ int setOmitListOrig( float4 atomCoordI, float4* atomCoord, float* grArray, unsigned int ioListLength, int* ioList,
-                                int* omitList, float delta )
-{
-
-    float pi_f  = 3.1415926535897f;
-    for( unsigned int jj = 0; jj < ioListLength-1; jj++ ){
-
-        int atomJ        = ioList[jj];
-
-        float txj        = atomCoord[atomJ].x - atomCoordI.x;
-        float tyj        = atomCoord[atomJ].y - atomCoordI.y;
-        float tzj        = atomCoord[atomJ].z - atomCoordI.z;
-
-        float bj         = sqrtf( txj*txj + tyj*tyj + tzj*tzj );
-        float therj      = 0.5f*pi_f - asin(min(1.0f,max(-1.0f,grArray[jj])));
-
-        for( unsigned int kk = jj + 1; kk < ioListLength; kk++ ){
-
-            int atomK        = ioList[kk];
-
-            float txk        = atomCoord[atomK].x - atomCoordI.x;
-            float tyk        = atomCoord[atomK].y - atomCoordI.y;
-            float tzk        = atomCoord[atomK].z - atomCoordI.z;
-
-            float bk         = sqrt( txk*txk + tyk*tyk + tzk*tzk );
-            float therk      = 0.5f*pi_f - asin(min(1.0f,max(-1.0f,grArray[kk])));
-   
-            // check to see if kk circle is intersecting jj circle
-            // get distance between circle centers and sum of radii
-   
-            float cc         = (txj*txk + tyj*tyk + tzj*tzk)/(bj*bk);
-                  cc         = acos(min(1.0f,max(-1.0f,cc)));
-            float td         = therj + therk;
-   
-            // check to see if circles enclose separate regions
-   
-            int mask1        = omitList[jj] == 0 && omitList[kk] == 0 && cc < td ? 1 : 0;
-            int mask2        = (cc + therk) < therj ? 1 : 0;
-            int mask3        = cc <= delta ? 1 : 0;
-            omitList[kk]     = omitList[kk] || (mask1 && (mask2 || mask3)) ? 1 : 0;
-
-            if( (cc > delta) && ( (2.0f*pi_f - cc) <= td) ){
-                return 1; // done atom
-            }
-        }
-    }
-    return 0;
-}
-
-__device__ int setOmitList( float4 atomCoordI, float4* atomCoord, float* grArray, unsigned int ioListLength, int* ioList,
-                            int* omitList, float delta )
-{
-
-    float pi_f                      = 3.1415926535897f;
-    unsigned int outputIncludeCount = 0;
-    ioList[outputIncludeCount++]    = ioList[0];
-    for( unsigned int jj = 0; jj < ioListLength-1; jj++ ){
-
-        int atomJ        = ioList[jj];
-
-        float txj        = atomCoord[atomJ].x - atomCoordI.x;
-        float tyj        = atomCoord[atomJ].y - atomCoordI.y;
-        float tzj        = atomCoord[atomJ].z - atomCoordI.z;
-
-        float bj         = txj*txj + tyj*tyj + tzj*tzj;
-        float therj      = 0.5f*pi_f - asin(min(1.0f,max(-1.0f,grArray[jj])));
-
-        for( unsigned int kk = jj + 1; kk < ioListLength; kk++ ){
-
-            int atomK        = ioList[kk];
-
-            float txk        = atomCoord[atomK].x - atomCoordI.x;
-            float tyk        = atomCoord[atomK].y - atomCoordI.y;
-            float tzk        = atomCoord[atomK].z - atomCoordI.z;
-
-            float bk         = txk*txk + tyk*tyk + tzk*tzk;
-            float therk      = 0.5f*pi_f - asin(min(1.0f,max(-1.0f,grArray[kk])));
-   
-            // check to see if kk circle is intersecting jj circle
-            // get distance between circle centers and sum of radii
-   
-            float cc         = (txj*txk + tyj*tyk + tzj*tzk)/sqrtf(bj*bk);
-                  cc         = acos(min(1.0f,max(-1.0f,cc)));
-            float td         = therj + therk;
-   
-            // check to see if circles enclose separate regions
-   
-            int mask1        = omitList[jj] == 0 && omitList[kk] == 0 && cc < td ? 1 : 0;
-            int mask2        = (cc + therk) < therj ? 1 : 0;
-            int mask3        = cc <= delta ? 1 : 0;
-            omitList[kk]     = omitList[kk] || (mask1 && (mask2 || mask3)) ? 1 : 0;
-
-            if( (cc > delta) && ( (2.0f*pi_f - cc) <= td) ){
-                return 0; // done atom
-            }
-        }
-        if( omitList[jj+1] == 0 ){
-            ioList[outputIncludeCount++] = atomJ;
-        }
-    }
-    return outputIncludeCount;
-}
-
-__device__ int setArcList( float4 atomCoordI, float4* atomCoord, float* grArray, unsigned int ioListLength, int* ioList,
-                           int* omitList, float delta )
-{
-
-    float pi_f                      = 3.1415926535897f;
-    unsigned int outputIncludeCount = 0;
-    ioList[outputIncludeCount++]    = ioList[0];
-    for( unsigned int jj = 0; jj < ioListLength-1; jj++ ){
-
-        int atomJ        = ioList[jj];
-
-        float txj        = atomCoord[atomJ].x - atomCoordI.x;
-        float tyj        = atomCoord[atomJ].y - atomCoordI.y;
-        float tzj        = atomCoord[atomJ].z - atomCoordI.z;
-
-        float bj         = sqrtf( txj*txj + tyj*tyj + tzj*tzj );
-        float therj      = 0.5f*pi_f - asin(min(1.0f,max(-1.0f,grArray[jj])));
-
-        for( unsigned int kk = jj + 1; kk < ioListLength; kk++ ){
-
-            int atomK        = ioList[kk];
-
-            float txk        = atomCoord[atomK].x - atomCoordI.x;
-            float tyk        = atomCoord[atomK].y - atomCoordI.y;
-            float tzk        = atomCoord[atomK].z - atomCoordI.z;
-
-            float bk         = sqrt( txk*txk + tyk*tyk + tzk*tzk );
-            float therk      = 0.5f*pi_f - asin(min(1.0f,max(-1.0f,grArray[kk])));
-   
-            // check to see if kk circle is intersecting jj circle
-            // get distance between circle centers and sum of radii
-   
-            float cc         = (txj*txk + tyj*tyk + tzj*tzk)/(bj*bk);
-                  cc         = acos(min(1.0f,max(-1.0f,cc)));
-            float td         = therj + therk;
-   
-            // check to see if circles enclose separate regions
-   
-            int mask1        = omitList[jj] == 0 && omitList[kk] == 0 && cc < td ? 1 : 0;
-            int mask2        = (cc + therk) < therj ? 1 : 0;
-            int mask3        = cc <= delta ? 1 : 0;
-            omitList[kk]     = omitList[kk] || (mask1 && (mask2 || mask3)) ? 1 : 0;
-
-            if( (cc > delta) && ( (2.0f*pi_f - cc) <= td) ){
-                return 0; // done atom
-            }
-        }
-        if( omitList[jj+1] == 0 ){
-            ioList[outputIncludeCount++] = atomJ;
-        }
-    }
-    return outputIncludeCount;
-}
-
-void insertionSortCpu( unsigned int length, float* A, int* P)
-{
-    for( unsigned int ii = 1; ii < length; ii++ ){
-        float value = A[ii];
-        int index   = P[ii];
-        int jj      = ii;
-        while( jj > 0 && A[jj-1] > value ){
-            A[jj] = A[jj-1];
-            P[jj] = P[jj-1];
-            jj--;
-        }
-        A[jj] = value;
-        P[jj] = index;
-    }
-}
-
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 130)
-__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
-#endif
-void kCalculateAmoebaCudaSASA( float4* atomCoord, float* rPlusProbe,
-                                          int* doneAtom, int* ioListCount,
-                                          int* workI1Array, int* workI2Array, int* workI3Array, int* workI4Array,
-                                          float* workFArray
-#ifdef AMOEBA_DEBUG
-                                          ,float4* debugArray
-#endif
-)
-{
-
-
-    unsigned int atomI           = blockIdx.x*blockDim.x+threadIdx.x;
-    if( atomI >= cAmoebaSim.numberOfAtoms)
-    {
-        return;
-    }
-
-    unsigned int arrayOffset     = atomI*cAmoebaSim.maxarc;
-
-    float* grArray               = workFArray  + arrayOffset;
-    int* ioList                  = workI1Array + arrayOffset;
-    int* omitList                = workI2Array + arrayOffset;
-
-    unsigned int atomJ           = 0;
-    float4 coordI                = atomCoord[atomI];
-    float  delta                 = 1.0e-08f;
-    float  rPlusProbeI           = rPlusProbe[atomI];
-    unsigned int io              = 0;
-    while (atomJ < cAmoebaSim.numberOfAtoms)
-    {
-        float4 coordJ            = atomCoord[atomJ];
-        float  rPlusProbeJ       = rPlusProbe[atomJ];
-       
-        float rplus              = rPlusProbeI + rPlusProbeJ;
-        float tx                 = coordJ.x - coordI.x;
-        float ty                 = coordJ.y - coordI.y;
-        float tz                 = coordJ.z - coordI.z;
-        float ccsq               = tx*tx + ty*ty + tz*tz;
-        float cc                 = sqrtf( ccsq );
-        float rminus             = rPlusProbeI - rPlusProbeJ;
-
-        int mask1                = (atomI != atomJ) && (fabsf( tx ) < rplus) && (fabsf( ty ) < rplus) && (fabsf( tz ) < rplus) ? 1 : 0;
-        int mask2                = ((rplus-cc) > delta) ? mask1 : 0;
-        int mask3                = (cc-fabsf(rminus) > delta) ? mask2 : 0; 
-
-        if( mask3 ){
-
-            ioList[io]           = atomJ;
-            omitList[io]         = 0;
-            grArray[io]          = (ccsq+rplus*rminus) / (2.0f*rPlusProbeI*cc);
-            io++;
-
-#ifdef AMOEBA_DEBUG
-            debugArray[arrayOffset+io].x = tx;
-            debugArray[arrayOffset+io].y = ty;
-            debugArray[arrayOffset+io].z = rplus;
-            debugArray[arrayOffset+io].w = rPlusProbe[atomJ];
-#endif
-        }
-        atomJ++;
-    }
-
-//    ioListCount[atomI] = io;
-    insertionSort( io, grArray, ioList );
-
-    // set omit/doneAtom values
-
-    io              = setOmitList( coordI, atomCoord, grArray, io, ioList, omitList, delta );
-    doneAtom[atomI] = (io == 0) ? 1 : 0;
-}
-
-
-/**---------------------------------------------------------------------------------------
-
-   Compute SASA force/energy
-
-   @param amoebaGpu        amoebaGpu context
-
-   --------------------------------------------------------------------------------------- */
-
-void kCalculateAmoebaSASAForces(amoebaGpuContext amoebaGpu )
-{
-
-   // ---------------------------------------------------------------------------------------
-
-#ifdef AMOEBA_DEBUG
-    static const char* methodName = "kCalculateAmoebaSASAForces";
-    static int timestep = 0;
-    std::vector<int> fileId;
-    timestep++;
-    fileId.resize( 2 );
-    fileId[0] = timestep;
-    fileId[1] = 1;
-#endif
-
-   if( amoebaGpu )return;
-
-   // ---------------------------------------------------------------------------------------
-
-     gpuContext gpu    = amoebaGpu->gpuContext;
-    int numOfElems     = amoebaGpu->paddedNumberOfAtoms;
-    int numThreads     = min( THREADS_PER_BLOCK, numOfElems );
-    int numBlocks      = numOfElems/numThreads;
-
-    if( (numOfElems % numThreads) != 0 )numBlocks++;
-
-#ifdef AMOEBA_DEBUG
-    CUDAStream<float4>* debugArray = NULL;
-    if( amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log, "%s %d numOfElems=%d numThreads=%d numBlocks=%d maxarc=%d\n",
-                        methodName, gpu->natoms, numOfElems, numThreads, numBlocks, amoebaGpu->amoebaSim.maxarc );
-        (void) fflush( amoebaGpu->log );
-        int paddedNumberOfAtoms                    = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
-        debugArray                                 = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
-        memset( debugArray->_pSysStream[0],      0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
-        debugArray->Upload();
-
-    }   
-#endif
-
-    // initialize [ InducedDipole & InducedDipolePolar ] to [ E_Field & E_FieldPolar]*Polarizability
-
-    kCalculateAmoebaCudaSASA<<< numBlocks, numThreads >>>(
-         gpu->psPosq4->_pDevStream[0],
-         amoebaGpu->psSASA_Radii->_pDevStream[0],
-         amoebaGpu->psDoneAtom->_pDevStream[0],
-         amoebaGpu->psIoListCount->_pDevStream[0],
-         amoebaGpu->psIntWorkArray[0]->_pDevStream[0],
-         amoebaGpu->psIntWorkArray[1]->_pDevStream[0],
-         amoebaGpu->psIntWorkArray[2]->_pDevStream[0],
-         amoebaGpu->psIntWorkArray[3]->_pDevStream[0],
-         amoebaGpu->psFloatWorkArray->_pDevStream[0]
-#ifdef AMOEBA_DEBUG
-, debugArray->_pDevStream[0]
-#endif
- );
-    LAUNCHERROR("kCalculateAmoebaCudaSASA");
-
-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log ){
-
-        debugArray->Download();
-        amoebaGpu->psIntWorkArray[0]->Download();
-        amoebaGpu->psIntWorkArray[1]->Download();
-        amoebaGpu->psIoListCount->Download();
-        amoebaGpu->psDoneAtom->Download();
-        amoebaGpu->psFloatWorkArray->Download();
-        (void) fprintf( amoebaGpu->log, "Post kCalculateAmoebaCudaSASA \n" );
-        for( unsigned int ii = 0; ii < amoebaGpu->paddedNumberOfAtoms; ii++)
-        {
-            int index            = ii*amoebaGpu->amoebaSim.maxarc;
-            unsigned int length  = (unsigned int) amoebaGpu->psIoListCount->_pSysStream[0][ii];
-            int omitSum = 0;
-            for( unsigned int jj = 0; jj < length; jj++)
-            {
-                omitSum += amoebaGpu->psIntWorkArray[1]->_pSysStream[0][index+jj];
-            }
-            (void) fprintf( amoebaGpu->log, "%5u %5d omitSum=%5d done=%d\n", ii, length, omitSum, amoebaGpu->psDoneAtom->_pSysStream[0][ii] );
-            if( ii == 0 ){
-                for( unsigned int jj = 0; jj < length; jj++)
-                {
-                    (void) fprintf( amoebaGpu->log, "    %5d omt=%5d %14.6e %14.6e %14.6e %14.6e %14.6e \n",
-                                    amoebaGpu->psIntWorkArray[0]->_pSysStream[0][index+jj],
-                                    amoebaGpu->psIntWorkArray[1]->_pSysStream[0][index+jj],
-                                    amoebaGpu->psFloatWorkArray->_pSysStream[0][index+jj],
-                                    debugArray->_pSysStream[0][index+jj].x, debugArray->_pSysStream[0][index+jj].y, debugArray->_pSysStream[0][index+jj].z, debugArray->_pSysStream[0][index+jj].w);
-                }
-/*
-                int*   permutation   = (int*) malloc( sizeof( int )*length );
-                float* values        = (float*) malloc( sizeof( float )*length );
-                for( unsigned int jj = 1; jj <= length; jj++)
-                {
-                    permutation[jj-1] = amoebaGpu->psIntWorkArray[0]->_pSysStream[0][index+jj];
-                    values[jj-1]      = amoebaGpu->psFloatWorkArray->_pSysStream[0][index+jj];
-                }
-                insertionSortCpu( length, values, permutation);
-                for( unsigned int jj = 0; jj < length; jj++)
-                {
-                    (void) fprintf( amoebaGpu->log, "Sortd: %5u %5d %14.6e\n", jj, permutation[jj], values[jj] );
-                }
-                free( permutation );
-                free( values );
-*/
-            }
-        }
-        (void) fflush( amoebaGpu->log );
-    }   
-#endif
- 
-    if( 0 )
-    {
-        unsigned int iterations = 5000;
-        time_t start            = clock();
-        for( unsigned int ii = 0; ii < iterations; ii++)
-        {
-            kCalculateAmoebaCudaSASA<<< numBlocks, numThreads >>>(
-                 gpu->psPosq4->_pDevStream[0],
-                 amoebaGpu->psSASA_Radii->_pDevStream[0],
-                 amoebaGpu->psDoneAtom->_pDevStream[0],
-                 amoebaGpu->psIoListCount->_pDevStream[0],
-                 amoebaGpu->psIntWorkArray[0]->_pDevStream[0],
-                 amoebaGpu->psIntWorkArray[1]->_pDevStream[0],
-                 amoebaGpu->psIntWorkArray[2]->_pDevStream[0],
-                 amoebaGpu->psIntWorkArray[3]->_pDevStream[0],
-                 amoebaGpu->psFloatWorkArray->_pDevStream[0]
-#ifdef AMOEBA_DEBUG
-                , debugArray->_pDevStream[0]
-#endif
-                );
-                LAUNCHERROR("kCalculateAmoebaCudaSASA");
-        }
-        float sasaTime = (float)(clock()-start)/CLOCKS_PER_SEC;
-        (void) fprintf( amoebaGpu->log, "Total sasa kernel time=%12.5e time/iter=%12.5e %u %u\n", sasaTime, sasaTime/(float) iterations, (unsigned int) start, (unsigned int) clock()); 
-
-    }
-#ifdef AMOEBA_DEBUG
-    delete debugArray;
-#endif
-
-}
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaRotateFrame.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaRotateFrame.cu
@@ -382,7 +382,11 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG

    // calculate electrostatic forces

+    if( amoebaGpu->multipoleNonbondedMethod == AMOEBA_NO_CUTOFF ){
        cudaComputeAmoebaElectrostatic( amoebaGpu );
+    } else {
+        cudaComputeAmoebaRealSpaceEwald( amoebaGpu );
+    }

    // map torques to forces


--- a/plugins/amoeba/platforms/cuda/tests/AmoebaTinkerParameterFile.cpp
+++ b/plugins/amoeba/platforms/cuda/tests/AmoebaTinkerParameterFile.cpp
@@ -40,6 +40,7 @@
 #include <sys/time.h>
 #endif

+extern int isNanOrInfinity( double number );

 using namespace std;

@@ -1937,6 +1938,7 @@ static void readAmoebaMultipoleCovalent( FILE* filePtr, AmoebaMultipoleForce* mu
    Read Amoeba multipole parameters

    @param filePtr              file pointer to parameter file
+    @param version              version id for file
    @param forceMap             map of forces to be included
    @param tokens               array of strings from first line of parameter file for this block of parameters
    @param system               System reference
@@ -1949,7 +1951,7 @@ static void readAmoebaMultipoleCovalent( FILE* filePtr, AmoebaMultipoleForce* mu

    --------------------------------------------------------------------------------------- */

-static int readAmoebaMultipoleParameters( FILE* filePtr, MapStringInt& forceMap, const StringVector& tokens,
+static int readAmoebaMultipoleParameters( FILE* filePtr, int version, MapStringInt& forceMap, const StringVector& tokens,
                                           System& system, int useOpenMMUnits, MapStringVectorOfVectors& supplementary,
                                           MapStringString& inputArgumentMap, int* lineCount, FILE* log ){

@@ -1984,8 +1986,27 @@ static int readAmoebaMultipoleParameters( FILE* filePtr, MapStringInt& forceMap,
    // load in parameters
 
    int numberOfMultipoles            = atoi( tokens[1].c_str() );
+    int usePme                        = 0;
+    double aewald                     = 0.0;
+    double cutoffDistance             = 0.0;
+
+    // usePme, aewald, cutoffDistance added w/ Version 1
+
+    if( version > 0 ){
+        usePme                        = atoi( tokens[2].c_str() );
+        aewald                        = atof( tokens[3].c_str() );
+        cutoffDistance                = atof( tokens[4].c_str() );
+    }
+    if( usePme ){
+        multipoleForce->setNonbondedMethod( AmoebaMultipoleForce::PME );
+    } else {
+        multipoleForce->setNonbondedMethod( AmoebaMultipoleForce::NoCutoff );
+    }
+    multipoleForce->setAEwald( aewald );
+    multipoleForce->setCutoffDistance( cutoffDistance );
    if( log ){
-        (void) fprintf( log, "%s number of MultipoleParameter terms=%d\n", methodName.c_str(), numberOfMultipoles );
+        (void) fprintf( log, "%s number of MultipoleParameter terms=%d usePme=%d aewald=%15.7e cutoffDistance=%12.4f\n",
+                        methodName.c_str(), numberOfMultipoles, usePme, aewald, cutoffDistance );
        (void) fflush( log );
    }
    for( int ii = 0; ii < numberOfMultipoles; ii++ ){
@@ -3380,7 +3401,7 @@ Integrator* readAmoebaParameterFile( const std::string& inputParameterFile, MapS
    }

    int lineCount                  = 0;
-    std::string version            = "0.1"; 
+    int version                    = 0; 
    int isNotEof                   = 1;
    Integrator* returnIntegrator   = NULL;

@@ -3404,9 +3425,9 @@ Integrator* readAmoebaParameterFile( const std::string& inputParameterFile, MapS
   
            if( field == "Version" ){
                if( tokens.size() > 1 ){
-                   version = tokens[1];
+                   version = atoi( tokens[1].c_str() );
                   if( log ){
-                       (void) fprintf( log, "Version=<%s> at line=%d\n", version.c_str(), lineCount );
+                       (void) fprintf( log, "Version=%d at line=%d\n", version, lineCount );
                   }
                }
            } else if( field == "Masses" ){
@@ -3522,13 +3543,28 @@ Integrator* readAmoebaParameterFile( const std::string& inputParameterFile, MapS
            // AmoebaMultipole
  
            } else if( field == "AmoebaMultipoleParameters" ){
-                readAmoebaMultipoleParameters( filePtr, forceMap, tokens, system, useOpenMMUnits, supplementary, inputArgumentMap, &lineCount, log );
-            } else if( field == "AmoebaMultipoleForce" ){
+                readAmoebaMultipoleParameters( filePtr, version, forceMap, tokens, system, useOpenMMUnits, supplementary, inputArgumentMap, &lineCount, log );
+            } else if( field == "AmoebaMultipoleForce" || field == "AmoebaPmeForce" ){
                readVec3( filePtr, tokens, forces[AMOEBA_MULTIPOLE_FORCE], &lineCount, field, log );
-            } else if( field == "AmoebaMultipoleEnergy" ){
+            } else if( field == "AmoebaMultipoleEnergy" || field == "AmoebaPmeEnergy" ){
               if( tokens.size() > 1 ){
                 potentialEnergy[AMOEBA_MULTIPOLE_FORCE] = atof( tokens[1].c_str() ); 
               }
+            } else if( field == "AmoebaRealPmeForce"                   || 
+                       field == "AmoebaKSpacePmeForce"                 ||
+                       field == "AmoebaSelfPmeForce"                   ){
+                std::vector< std::vector<double> > vectorOfDoubleVectors;
+                readVectorOfDoubleVectors( filePtr, tokens, vectorOfDoubleVectors, &lineCount, field, log );
+                supplementary[field] = vectorOfDoubleVectors;
+            } else if( field == "AmoebaRealPmeEnergy"                  || 
+                       field == "AmoebaKSpacePmeEnergy"                ||
+                       field == "AmoebaSelfPmeEnergy"                  ){
+                double value = atof( tokens[1].c_str() );
+                std::vector< std::vector<double> > vectorOfDoubleVectors;
+                std::vector<double> doubleVectors;
+                doubleVectors.push_back( value );
+                vectorOfDoubleVectors.push_back( doubleVectors ); 
+                supplementary[field] =  vectorOfDoubleVectors;

            // Amoeba GK

@@ -3677,7 +3713,6 @@ void initializeForceMap( MapStringInt& forceMap, int initialValue ){
     forceMap[AMOEBA_GK_FORCE]                       = initialValue;
     forceMap[AMOEBA_VDW_FORCE]                      = initialValue;
     forceMap[AMOEBA_WCA_DISPERSION_FORCE]           = initialValue;
-     forceMap[AMOEBA_SASA_FORCE]                     = 0;
 
     return;

@@ -5830,8 +5865,7 @@ Double "simulationTimeBetweenReportsRatio"       simulationTimeBetweenReportsRat
                   key == AMOEBA_MULTIPOLE_FORCE                  ||
                   key == AMOEBA_GK_FORCE                         ||
                   key == AMOEBA_VDW_FORCE                        ||
-                   key == AMOEBA_WCA_DISPERSION_FORCE             ||
-                   key == AMOEBA_SASA_FORCE                       ){
+                   key == AMOEBA_WCA_DISPERSION_FORCE             ){
            forceMap[key]                              = atoi( value.c_str() );
        } else {
            inputArgumentMap[key]                      = value;

--- a/plugins/amoeba/platforms/cuda/tests/AmoebaTinkerParameterFile.h
+++ b/plugins/amoeba/platforms/cuda/tests/AmoebaTinkerParameterFile.h
@@ -77,7 +77,6 @@ static std::string AMOEBA_GK_FORCE                                    = "AmoebaG
 static std::string AMOEBA_GK_CAVITY_FORCE                             = "AmoebaGkAndCavity";
 static std::string AMOEBA_VDW_FORCE                                   = "AmoebaVdw";
 static std::string AMOEBA_WCA_DISPERSION_FORCE                        = "AmoebaWcaDispersion";
-static std::string AMOEBA_SASA_FORCE                                  = "AmoebaSASA";
 static std::string ALL_FORCES                                         = "AllForces";

 static std::string AMOEBA_MULTIPOLE_ROTATION_MATRICES                 = "AmoebaMultipoleRotationMatrices";