Added copyright

Removed debugging code

Added copyright
Removed debugging code
2b508482 · Mark Friedrichs · 36762962 · 2b508482 · 2b508482 · 2b508482
Commit 2b508482 authored Nov 23, 2011 by Mark Friedrichs
13 changed files
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedField.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedField.h
@@ -36,11 +36,7 @@ __launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
 #endif
 void METHOD_NAME(kCalculateAmoebaMutualInducedField, _kernel)(
                            unsigned int* workUnit,
-                            float* outputField, float* outputFieldPolar
-#ifdef AMOEBA_DEBUG
-                           , float4* debugArray, unsigned int targetAtom
-#endif
-){
+                            float* outputField, float* outputFieldPolar){

    extern __shared__ MutualInducedParticle sA[];

@@ -99,11 +95,7 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedField, _kernel)(

                // load coords, charge, ...

-                calculateMutualInducedFieldPairIxn_kernel( localParticle, psA[j], ijField
-#ifdef AMOEBA_DEBUG
-,  debugArray
-#endif
-);
+                calculateMutualInducedFieldPairIxn_kernel( localParticle, psA[j], ijField);

                unsigned int mask       =  ( (atomI == (y + j)) || (atomI >= cSim.atoms) || ((y+j) >= cSim.atoms) ) ? 0 : 1;

@@ -117,34 +109,6 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedField, _kernel)(
                fieldPolarSum[1]       += mask ? ijField[1][1] : 0.0f;
                fieldPolarSum[2]       += mask ? ijField[1][2] : 0.0f;

-#ifdef AMOEBA_DEBUG
-if( atomI == targetAtom ){
-        unsigned int index                 = y + j;
-        unsigned int indexI                = 0;
-        //unsigned int indexJ                = 2;
-
-        debugArray[index].x                = (float) atomI;
-        debugArray[index].y                = (float) (y + j);
-        //debugArray[index].z                = cAmoebaSim.pDampingFactorAndThole[atomI].x;
-        debugArray[index].z                = (float) cSim.atoms;
-        debugArray[index].w                = (float) (mask + 1);
-
-        index                             += cSim.paddedNumberOfAtoms;
-        debugArray[index].x                = mask ? ijField[indexI][0] : 0.0f;
-        debugArray[index].y                = mask ? ijField[indexI][1] : 0.0f;
-        debugArray[index].z                = mask ? ijField[indexI][2] : 0.0f;
-
-        index                             += cSim.paddedNumberOfAtoms;
-        debugArray[index].x                = mask ? ijField[indexI+1][0] : 0.0f;
-        debugArray[index].y                = mask ? ijField[indexI+1][1] : 0.0f;
-        debugArray[index].z                = mask ? ijField[indexI+1][2] : 0.0f;
-
-        index                             += cSim.paddedNumberOfAtoms;
-        debugArray[index].x                = (float) x;
-        debugArray[index].y                = (float) y;
-        debugArray[index].z                = (float) 1.0f;
-}
-#endif
            }

            // Write results
@@ -161,9 +125,8 @@ if( atomI == targetAtom ){

 #endif

-        }
-        else        // 100% utilization
-        {
+        } else {
+
            // Read fixed atom data into registers and GRF
            if (lasty != y)
            {
@@ -185,13 +148,9 @@ if( atomI == targetAtom ){

                // load coords, charge, ...

-                calculateMutualInducedFieldPairIxn_kernel( localParticle, psA[tj], ijField
-#ifdef AMOEBA_DEBUG
-,  debugArray
-#endif
-   );
+                calculateMutualInducedFieldPairIxn_kernel( localParticle, psA[tj], ijField);

-                unsigned int mask   =  ( (atomI >= cSim.atoms) || ((y+tj) >= cSim.atoms) ) ? 0 : 1;
+                unsigned int mask         =  ( (atomI >= cSim.atoms) || ((y+tj) >= cSim.atoms) ) ? 0 : 1;
           
                 // add to field at atomI the field due to atomJ's dipole

@@ -217,36 +176,8 @@ if( atomI == targetAtom ){
                psA[tj].fieldPolar[1]    += mask ? ijField[3][1] : 0.0f;
                psA[tj].fieldPolar[2]    += mask ? ijField[3][2] : 0.0f;

-#ifdef AMOEBA_DEBUG
-//#if 0
-if( atomI == targetAtom  || (y + tj) == targetAtom ){
-        unsigned int index                 = (atomI == targetAtom) ? (y + tj) : atomI;
-        unsigned int indexI                = (atomI == targetAtom) ? 0 : 2;
-        //unsigned int indexJ                = (atomI == targetAtom) ? 2 : 0;
-
-        debugArray[index].x                = (float) atomI;
-        debugArray[index].y                = (float) (y + tj);
-        debugArray[index].z                = cAmoebaSim.pDampingFactorAndThole[atomI].x;
-        debugArray[index].w                = (float) (mask+1);
-
-        index                             += cSim.paddedNumberOfAtoms;
-        debugArray[index].x                = mask ? ijField[indexI][0] : 0.0f;
-        debugArray[index].y                = mask ? ijField[indexI][1] : 0.0f;
-        debugArray[index].z                = mask ? ijField[indexI][2] : 0.0f;
-
-        index                             += cSim.paddedNumberOfAtoms;
-        debugArray[index].x                = mask ? ijField[indexI+1][0] : 0.0f;
-        debugArray[index].y                = mask ? ijField[indexI+1][1] : 0.0f;
-        debugArray[index].z                = mask ? ijField[indexI+1][2] : 0.0f;
-
-        index                             += cSim.paddedNumberOfAtoms;
-        debugArray[index].x                = (float) x;
-        debugArray[index].y                = (float) y;
-        debugArray[index].z                = (float) -1.0f;
-}
-#endif

-                tj                  = (tj + 1) & (GRID - 1);
+                tj                        = (tj + 1) & (GRID - 1);

            }


--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPME.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPME.cu
-//-----------------------------------------------------------------------------------------
-
-//-----------------------------------------------------------------------------------------
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */

 #include "amoebaGpuTypes.h"
 #include "cudaKernels.h"

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.cu
-///-----------------------------------------------------------------------------------------
-
-//-----------------------------------------------------------------------------------------
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */

 #include "amoebaGpuTypes.h"
 #include "cudaKernels.h"
 #include "amoebaCudaKernels.h"
 #include "kCalculateAmoebaCudaUtilities.h"

-//#define AMOEBA_DEBUG
-
 static __constant__ cudaGmxSimulation cSim;
 static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;

@@ -213,11 +233,7 @@ __device__ void calculateBn_kernel( float r, float4* bn, float* bn0, float *bn5
 #undef SUB_METHOD_NAME

 __device__ void calculatePmeDirectElectrostaticPairIxnOrig_kernel( const PmeDirectElectrostaticParticle& atomI, const   PmeDirectElectrostaticParticle& atomJ,
-                                                                   const float* scalingFactors, float4 forceTorqueEnergy[3]
-#ifdef AMOEBA_DEBUG
-                                                               ,float4* debugArray
-#endif
- ){
+                                                                   const float* scalingFactors, float4 forceTorqueEnergy[3]){

    float xr    = atomJ.x - atomI.x;
    float yr    = atomJ.y - atomI.y;
@@ -909,132 +925,6 @@ __device__ void calculatePmeDirectElectrostaticPairIxnOrig_kernel( const PmeDire
        forceTorqueEnergy[2].y       =  (ttm32 + ttm3i2);
        forceTorqueEnergy[2].z       =  (ttm33 + ttm3i3);

-#ifdef AMOEBA_DEBUG
-    int debugIndex               = 0;
-    float idTracker              = 1.0f;
-/*
-    debugArray[debugIndex].x = atomI.labFrameDipole[0];
-    debugArray[debugIndex].y = atomI.labFrameDipole[1];
-    debugArray[debugIndex].z = atomI.labFrameDipole[2];
-    debugArray[debugIndex].w = r2;
-
-    debugIndex++;
-    idTracker               += 1.0;
-    debugArray[debugIndex].x = atomJ.labFrameDipole[0];
-    debugArray[debugIndex].y = atomJ.labFrameDipole[1];
-    debugArray[debugIndex].z = atomJ.labFrameDipole[2];
-    debugArray[debugIndex].w = cSim.alphaEwald;
-
-    debugIndex++;
-    idTracker               += 1.0;
-    debugArray[debugIndex].x = atomI.inducedDipole[0];
-    debugArray[debugIndex].y = atomI.inducedDipole[1];
-    debugArray[debugIndex].z = atomI.inducedDipole[2];
-    debugArray[debugIndex].w = idTracker;
-
-    debugIndex++;
-    idTracker               += 1.0;
-    debugArray[debugIndex].x = atomJ.inducedDipole[0];
-    debugArray[debugIndex].y = atomJ.inducedDipole[1];
-    debugArray[debugIndex].z = atomJ.inducedDipole[2];
-    debugArray[debugIndex].w = idTracker;
-
-    debugIndex++;
-    idTracker               += 1.0;
-    debugArray[debugIndex].x = atomI.inducedDipoleP[0];
-    debugArray[debugIndex].y = atomI.inducedDipoleP[1];
-    debugArray[debugIndex].z = atomI.inducedDipoleP[2];
-    debugArray[debugIndex].w = idTracker;
-
-    debugIndex++;
-    idTracker               += 1.0;
-    debugArray[debugIndex].x = atomJ.inducedDipoleP[0];
-    debugArray[debugIndex].y = atomJ.inducedDipoleP[1];
-    debugArray[debugIndex].z = atomJ.inducedDipoleP[2];
-    debugArray[debugIndex].w = idTracker;
-
-    debugIndex++;
-    idTracker               += 1.0;
-    debugArray[debugIndex].x = conversionFactor*ftm21;
-    debugArray[debugIndex].y = conversionFactor*ftm22;
-    debugArray[debugIndex].z = conversionFactor*ftm23;
-    debugArray[debugIndex].w = idTracker;
-    debugIndex++;
-
-    idTracker               += 1.0;
-    debugArray[debugIndex].x = e;
-    debugArray[debugIndex].y = ei;
-    debugArray[debugIndex].z = erl;
-    debugArray[debugIndex].w = erli;
-    debugIndex++;
-*/
-
-    idTracker               += 100.0;
-    debugArray[debugIndex].x = r2;
-    debugArray[debugIndex].y = cSim.alphaEwald;
-    debugArray[debugIndex].z = conversionFactor;
-    debugArray[debugIndex].w = idTracker;
-    debugIndex++;
-
-    idTracker               += 100.0;
-    debugArray[debugIndex].x = conversionFactor*ftm21;
-    debugArray[debugIndex].y = conversionFactor*ftm22;
-    debugArray[debugIndex].z = conversionFactor*ftm23;
-    debugArray[debugIndex].w = idTracker;
-    debugIndex++;
-
-    idTracker               += 100.0;
-    debugArray[debugIndex].x = conversionFactor*ftm2i1;
-    debugArray[debugIndex].y = conversionFactor*ftm2i2;
-    debugArray[debugIndex].z = conversionFactor*ftm2i3;
-    debugArray[debugIndex].w = idTracker;
-    debugIndex++;
-/*
-    idTracker               += 100.0;
-    debugArray[debugIndex].x = fridmp1;
-    debugArray[debugIndex].y = fridmp2;
-    debugArray[debugIndex].z = fridmp3;
-    debugArray[debugIndex].w = idTracker;
-    debugIndex++;
-
-    idTracker               += 100.0;
-    debugArray[debugIndex].x = findmp1;
-    debugArray[debugIndex].y = findmp2;
-    debugArray[debugIndex].z = findmp3;
-    debugArray[debugIndex].w = idTracker;
-    debugIndex++;
-*/
-
-    idTracker               += 100.0;
-    debugArray[debugIndex].x = conversionFactor*ttm21;
-    debugArray[debugIndex].y = conversionFactor*ttm22;
-    debugArray[debugIndex].z = conversionFactor*ttm23;
-    debugArray[debugIndex].w = idTracker;
-    debugIndex++;
-
-    idTracker               += 100.0;
-    debugArray[debugIndex].x = conversionFactor*ttm2i1;
-    debugArray[debugIndex].y = conversionFactor*ttm2i2;
-    debugArray[debugIndex].z = conversionFactor*ttm2i3;
-    debugArray[debugIndex].w = idTracker;
-    debugIndex++;
-
-    idTracker               += 100.0;
-    debugArray[debugIndex].x = conversionFactor*ttm31;
-    debugArray[debugIndex].y = conversionFactor*ttm32;
-    debugArray[debugIndex].z = conversionFactor*ttm33;
-    debugArray[debugIndex].w = idTracker;
-    debugIndex++;
-
-    idTracker               += 100.0;
-    debugArray[debugIndex].x = conversionFactor*ttm3i1;
-    debugArray[debugIndex].y = conversionFactor*ttm3i2;
-    debugArray[debugIndex].z = conversionFactor*ttm3i3;
-    debugArray[debugIndex].w = idTracker;
-    debugIndex++;
-
-#endif
-
    } else {

        forceTorqueEnergy[0].x  = 0.0f;
@@ -1051,15 +941,6 @@ __device__ void calculatePmeDirectElectrostaticPairIxnOrig_kernel( const PmeDire

        forceTorqueEnergy[0].w  = 0.0f;

-#ifdef AMOEBA_DEBUG
-for( int ii = 0; ii < 12; ii++ ){
-    debugArray[ii].x = 0.0f;
-    debugArray[ii].y = 0.0f;
-    debugArray[ii].z = 0.0f;
-    debugArray[ii].w = (float) (-ii);
-}
-#endif
-
    }

    return;
@@ -1351,35 +1232,11 @@ static void kReduceTorque(amoebaGpuContext amoebaGpu )
 void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
 {

-#ifdef AMOEBA_DEBUG
-    static const char* methodName = "cudaComputeAmoebaPmeDirectElectrostatic";
-    static int timestep = 0;
-    std::vector<int> fileId;
-    timestep++;
-    fileId.resize( 2 );
-    fileId[0] = timestep;
-    fileId[1] = 1;
-#endif
-
    // ---------------------------------------------------------------------------------------

    gpuContext gpu = amoebaGpu->gpuContext;

     // apparently the debug array can take up a nontrivial number of registers
-
-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log ){
-      (void) fprintf( amoebaGpu->log, "%s %d maxCovalentDegreeSz=%d ZZZ\n",
-                      methodName, gpu->natoms, amoebaGpu->maxCovalentDegreeSz );
-    }
-    int paddedNumberOfAtoms                   = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
-    int maxOffset                             = 20;
-    CUDAStream<float4>* debugArray            = new CUDAStream<float4>(maxOffset*paddedNumberOfAtoms, 1, "DebugArray");
-    memset( debugArray->_pSysData,      0, sizeof( float )*4*maxOffset*paddedNumberOfAtoms);
-    debugArray->Upload();
-    unsigned int targetAtom                   = 49;
-#endif
-
    // on first pass, set threads/block

    static unsigned int threadsPerBlock = 0;
@@ -1403,37 +1260,16 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )

    kClearFields_3( amoebaGpu, 1 );

-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log, "kCalculateAmoebaPmeDirectElectrostaticCutoffForces: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u maxL1=%d\n",
-                        gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
-                        sizeof(PmeDirectElectrostaticParticle), (sizeof(PmeDirectElectrostaticParticle))*threadsPerBlock,
-                        (*gpu->psInteractionCount)[0], gpu->sim.workUnits, maxL1 );
-        (void) fflush( amoebaGpu->log );
-    }   
-#endif
-
    if (gpu->bOutputBufferPerWarp){

        kCalculateAmoebaPmeDirectElectrostaticCutoffByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock>>>(
                                                                          gpu->sim.pInteractingWorkUnit,
-#ifdef AMOEBA_DEBUG
-                                                                          amoebaGpu->psWorkArray_3_1->_pDevData,
-                                                                          debugArray->_pDevData, targetAtom );
-#else
                                                                          amoebaGpu->psWorkArray_3_1->_pDevData );
-#endif
-
    } else {

        kCalculateAmoebaPmeDirectElectrostaticCutoffForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock>>>(
                                                                    gpu->sim.pInteractingWorkUnit,
-#ifdef AMOEBA_DEBUG
-                                                                    amoebaGpu->psWorkArray_3_1->_pDevData,
-                                                                    debugArray->_pDevData, targetAtom );
-#else
                                                                    amoebaGpu->psWorkArray_3_1->_pDevData );
-#endif
    }
    LAUNCHERROR("kCalculateAmoebaPmeDirectElectrostaticCutoffForces");


--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.cu

-//-----------------------------------------------------------------------------------------
-
-//-----------------------------------------------------------------------------------------
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */

 #include "cudaKernels.h"
 #include "amoebaCudaKernels.h"
 #include "kCalculateAmoebaCudaUtilities.h"

-//#define AMOEBA_DEBUG
-
 static __constant__ cudaGmxSimulation cSim;
 static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;

@@ -171,12 +191,7 @@ __device__ void sumTempBuffer( FixedFieldParticle& atomI, FixedFieldParticle& at
 }

 __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle& atomI, FixedFieldParticle& atomJ,
-                                                            float dscale, float pscale, float4 fields[3]
-#ifdef AMOEBA_DEBUG
-                                                            , float4* pullBack
-#endif
-
- ){
+                                                            float dscale, float pscale, float4 fields[3]){

    // compute the real space portion of the Ewald summation
  
@@ -329,18 +344,6 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&
        fields[2].w       = 0.0f;
    }

-#ifdef AMOEBA_DEBUG
-    pullBack[0].x = xr;
-    pullBack[0].y = yr;
-    pullBack[0].z = zr;
-    pullBack[0].w = r2;
-
-    pullBack[1].x = atomJ.x - atomI.x;
-    pullBack[1].y = atomJ.y - atomI.y;
-    pullBack[1].z = atomJ.z - atomI.z;
-    pullBack[1].w = (atomJ.x - atomI.x)*(atomJ.x - atomI.x) + (atomJ.y - atomI.y)*(atomJ.y - atomI.y)+ (atomJ.z - atomI.z)*(atomJ.z - atomI.z);
-
-#endif
 }

 // Include versions of the kernels for N^2 calculations.
@@ -361,40 +364,6 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&

   --------------------------------------------------------------------------------------- */

-#ifdef AMOEBA_DEBUG
-static int isNanOrInfinity( double number ){
-    return (number != number || number == std::numeric_limits<double>::infinity() || number == -std::numeric_limits<double>::infinity()) ? 1 : 0; 
-}
-
-static void bubbleSort( std::vector<int>& array, std::vector<int>& track, int length)
-{
-  int i, j, temp;
-  int test; /*use this only if unsure whether the list is already sorted or not*/
-  for(i = length - 1; i > 0; i--)
-  {
-    test=0;
-    for(j = 0; j < i; j++)
-    {
-      if(array[j] > array[j+1]) /* compare neighboring elements */
-      {
-
-        temp = array[j];    /* swap array[j] and array[j+1] */
-        array[j] = array[j+1];
-        array[j+1] = temp;
-
-        temp = track[j];    /* swap array[j] and array[j+1] */
-        track[j] = track[j+1];
-        track[j+1] = temp;
-
-        test=1;
-      }
-    } /*end for j*/
-    if(test==0) break; /*will exit if the list is sorted!*/
-  } /*end for i*/
-      
-}
-#endif
-
 /**---------------------------------------------------------------------------------------

   Compute fixed electric field using PME
@@ -409,22 +378,6 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
    static unsigned int threadsPerBlock  = 0;
    gpuContext gpu                       = amoebaGpu->gpuContext;

-#ifdef AMOEBA_DEBUG
-    static const char* methodName = "computeCudaAmoebaPmeFixedEField";
-    if( amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log, "\n%s\n", methodName ); (void) fflush( amoebaGpu->log );
-    }
-    int paddedNumberOfAtoms                    = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
-    int slots                                  = 15;
-    CUDAStream<float4>* debugArray             = new CUDAStream<float4>(paddedNumberOfAtoms*slots, 1, "DebugArray");
-    memset( debugArray->_pSysData, 0, sizeof( float )*4*paddedNumberOfAtoms*slots);
-    debugArray->Upload();
-
-    // print intermediate results for the targetAtom 
-
-    unsigned int targetAtom  = 0;
-#endif
-
    kClearFields_3( amoebaGpu, 2 );

    // on first pass, set threads/block
@@ -444,175 +397,17 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
        kCalculateAmoebaPmeDirectFixedE_FieldCutoffByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
                                                                           gpu->sim.pInteractingWorkUnit,
                                                                           amoebaGpu->psWorkArray_3_1->_pDevData,
-#ifdef AMOEBA_DEBUG
-                                                                           amoebaGpu->psWorkArray_3_2->_pDevData,
-                                                                           debugArray->_pDevData, targetAtom );
-#else
                                                                           amoebaGpu->psWorkArray_3_2->_pDevData );
-#endif
    } else {
        kCalculateAmoebaPmeDirectFixedE_FieldCutoff_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
                                                                           gpu->sim.pInteractingWorkUnit,
                                                                           amoebaGpu->psWorkArray_3_1->_pDevData,
-#ifdef AMOEBA_DEBUG
-                                                                           amoebaGpu->psWorkArray_3_2->_pDevData,
-                                                                           debugArray->_pDevData, targetAtom );
-#else
                                                                           amoebaGpu->psWorkArray_3_2->_pDevData );
-#endif
    }
    LAUNCHERROR("kCalculateAmoebaPmeDirectFixedE_Field_kernel");

    kReducePmeDirectE_Fields( amoebaGpu );

-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log ){
-        gpu->psInteractionCount->Download();
-        (void) fprintf( amoebaGpu->log, "cudaComputeAmoebaPmeDirectFixedEField:  threadsPerBlock=%u getThreadsPerBlock=%d sizeof=%u shrd=%u\n", 
-                        threadsPerBlock, getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle)+sizeof(float3), gpu->sharedMemoryPerBlock),
-                        (sizeof(FixedFieldParticle)+sizeof(float3)), (sizeof(FixedFieldParticle)+sizeof(float3))*threadsPerBlock );
-        (void) fprintf( amoebaGpu->log, "AmoebaCutoffForces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u warp=%d\n",
-                        gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
-                        sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*threadsPerBlock,
-                        (*gpu->psInteractionCount)[0], gpu->sim.workUnits, gpu->bOutputBufferPerWarp );
-        (void) fflush( amoebaGpu->log );
-/*
-        (void) fprintf( amoebaGpu->log, "Out WorkArray_3_[1,2]  paddedNumberOfAtoms=%d\n",  gpu->sim.paddedNumberOfAtoms, gpu->sim.outputBuffers );
-        amoebaGpu->psWorkArray_3_1->Download();
-        amoebaGpu->psWorkArray_3_2->Download();
-        for( int ii = 0; ii < gpu->sim.paddedNumberOfAtoms; ii++ ){
-           (void) fprintf( amoebaGpu->log, "%5d ", ii); 
-
-            int indexOffset     = ii*3;
-
-           // buffer 1
-
-           (void) fprintf( amoebaGpu->log,"WArry1[%16.9e %16.9e %16.9e] ",
-                           amoebaGpu->psWorkArray_3_1->_pSysData[indexOffset],
-                           amoebaGpu->psWorkArray_3_1->_pSysData[indexOffset+1],
-                           amoebaGpu->psWorkArray_3_1->_pSysData[indexOffset+2] );
-   
-           // buffer 2
-
-           (void) fprintf( amoebaGpu->log,"WArry2[%16.9e %16.9e %16.9e] ",
-                           amoebaGpu->psWorkArray_3_2->_pSysData[indexOffset],
-                           amoebaGpu->psWorkArray_3_2->_pSysData[indexOffset+1],
-                           amoebaGpu->psWorkArray_3_2->_pSysData[indexOffset+2] );
-
-           (void) fprintf( amoebaGpu->log,"\n" );
-           if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
-                ii = gpu->natoms - maxPrint;
-           }
-        }
-        (void) fflush( amoebaGpu->log );
-*/
-        amoebaGpu->psE_Field->Download();
-        amoebaGpu->psE_FieldPolar->Download();
-        (void) fprintf( amoebaGpu->log,"E-field (includes self term)" );
-        int maxPrint             = 3002;
-        for( int ii = 0; ii < gpu->natoms; ii++ ){
-           (void) fprintf( amoebaGpu->log, "%5d ", ii); 
-
-            int indexOffset     = ii*3;
-
-           // E_Field
-
-           (void) fprintf( amoebaGpu->log,"E[%16.9e %16.9e %16.9e] ",
-                           amoebaGpu->psE_Field->_pSysData[indexOffset],
-                           amoebaGpu->psE_Field->_pSysData[indexOffset+1],
-                           amoebaGpu->psE_Field->_pSysData[indexOffset+2] );
-   
-           // E_Field polar
-
-           (void) fprintf( amoebaGpu->log,"Epol[%16.9e %16.9e %16.9e] ",
-                           amoebaGpu->psE_FieldPolar->_pSysData[indexOffset],
-                           amoebaGpu->psE_FieldPolar->_pSysData[indexOffset+1],
-                           amoebaGpu->psE_FieldPolar->_pSysData[indexOffset+2] );
-
-           (void) fprintf( amoebaGpu->log,"\n" );
-           if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
-                ii = gpu->natoms - maxPrint;
-           }
-        }
-        (void) fflush( amoebaGpu->log );
-        (void) fprintf( amoebaGpu->log, "EFields End\n" );
-
-        (void) fprintf( amoebaGpu->log, "DebugQ\n" );
-        debugArray->Download();
-
-        std::vector<int> indices;
-        std::vector<int> track;
-        for( int jj = 0; jj < gpu->natoms; jj++ ){
-            int debugIndex = jj;
-            if( fabs(debugArray->_pSysData[jj+3*paddedNumberOfAtoms].x) > 0.0 ){
-                int orderIndex = gpu->psAtomIndex->_pSysData[jj];
-                indices.push_back( orderIndex );
-                track.push_back( jj );
-            }
-        }
-        bubbleSort( indices, track, static_cast<int>(track.size()) );
-
-        int paddedNumberOfAtoms                    = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
-        amoebaGpu->gpuContext->psPosq4->Download();
-        unsigned int count = 0;
-        float sum0[3] = { 0.0f, 0.0f, 0.0f };
-        float sum1[3] = { 0.0f, 0.0f, 0.0f };
-        int offset0   = 1;
-        int offset1   = 2;
-/*
-        for( int jj = 0; jj < gpu->natoms; jj++ ){
-            int debugIndex = jj;
-if( fabs(debugArray->_pSysData[jj+3*paddedNumberOfAtoms].x) > 0.0 ){
-            int orderIndex = gpu->psAtomIndex->_pSysData[jj];
-            count++;
-*/
-
-        for( unsigned int ii = 0; ii < track.size(); ii++ ){
-            int jj         = track[ii];
-            int debugIndex = jj;
-            int orderIndex = indices[ii];
-            if( orderIndex > 31 && offset0 == 1 ){
-                offset0 = 2;
-                offset1 = 2;
-            }
-            count++;
-
-            sum0[0] += debugArray->_pSysData[jj+offset0*paddedNumberOfAtoms].x;
-            sum0[1] += debugArray->_pSysData[jj+offset0*paddedNumberOfAtoms].y;
-            sum0[2] += debugArray->_pSysData[jj+offset0*paddedNumberOfAtoms].z;
-
-            sum1[0] += debugArray->_pSysData[jj+offset1*paddedNumberOfAtoms].x;
-            sum1[1] += debugArray->_pSysData[jj+offset1*paddedNumberOfAtoms].y;
-            sum1[2] += debugArray->_pSysData[jj+offset1*paddedNumberOfAtoms].z;
-
-            (void) fprintf( amoebaGpu->log,"%5d %5d %u PmeFixedEField\n", orderIndex, jj, count );
-            for( int kk = 0; kk < 7; kk++ ){
-                (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
-                                debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
-                                debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
-                debugIndex += paddedNumberOfAtoms;
-            }
-            (void) fprintf( amoebaGpu->log,"%6d %16.9e %16.9e %16.9e %16.9e %16.9e %16.9e %6d %6d cum sumsOp\n", 
-                            orderIndex, sum0[0], sum0[1], sum0[2], sum1[0], sum1[1], sum1[2], jj, count );
-            (void) fprintf( amoebaGpu->log,"\n" );
-
-        }
-
-        // write results to file
-
-        if( 1 ){
-            std::vector<int> fileId;
-            //fileId.push_back( 0 );
-            VectorOfDoubleVectors outputVector;
-            //cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,              outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_Field,      outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-            cudaWriteVectorOfDoubleVectorsToFile( "CudaEField", fileId, outputVector );
-         }
-         delete debugArray;
-    }
-#endif
-
 }

 void cudaComputeAmoebaPmeFixedEField( amoebaGpuContext amoebaGpu )
@@ -621,21 +416,4 @@ void cudaComputeAmoebaPmeFixedEField( amoebaGpuContext amoebaGpu )
    kCalculateAmoebaPMEFixedMultipoles( amoebaGpu );
    cudaComputeAmoebaPmeDirectFixedEField( amoebaGpu );

-#ifdef AMOEBA_DEBUG
-    if( 0 ){
-        gpuContext gpu                       = amoebaGpu->gpuContext;
-        std::vector<int> fileId;
-        fileId.push_back( 0 );
-        VectorOfDoubleVectors outputVector;
-        //cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,              outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-        kReduceForces( gpu );
-        cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psForce4,              outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_Field,      outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-        cudaWriteVectorOfDoubleVectorsToFile( "CudaRecipForceTorqueFixed", fileId, outputVector );
-        //cudaWriteVectorOfDoubleVectorsToFile( "CudaRecipEField", fileId, outputVector );
-        exit(0);
-    }
-#endif
-
 }
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.h
@@ -37,16 +37,7 @@ __launch_bounds__(64, 1)
 void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)(
                            unsigned int* workUnit,
                            float* outputEField,
-                            float* outputEFieldPolar
-#ifdef AMOEBA_DEBUG
-                           , float4* debugArray, unsigned int targetAtom
-#endif
-){
-
-#ifdef AMOEBA_DEBUG
-    int maxPullIndex = 1;
-    float4 pullBack[12];
-#endif
+                            float* outputEFieldPolar){ 

    extern __shared__ FixedFieldParticle sA[];

@@ -118,11 +109,7 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)(
                }

                float4 ijField[3];
-                calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[j], dScaleValue, pScaleValue, ijField
-#ifdef AMOEBA_DEBUG
-                                            , pullBack
-#endif
-                );
+                calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[j], dScaleValue, pScaleValue, ijField);

                // nan*0.0 = nan not 0.0, so explicitly exclude (atomI == atomJ) contribution
                // by setting match flag
@@ -139,66 +126,6 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)(
                fieldPolarSum[1]       += match ? 0.0f : ijField[1].z;
                fieldPolarSum[2]       += match ? 0.0f : ijField[2].z;

-#ifdef AMOEBA_DEBUG
-if( atomI == targetAtom || targetAtom == (y+j) ){
-    unsigned int index                 = atomI == targetAtom ? (y + j) : atomI;
-    unsigned int indexI                = 0;
-    unsigned int indexJ                = indexI ? 0 : 2;
-    float flag                         = 7.0f;
-
-    debugArray[index].x                = (float) atomI;
-    debugArray[index].y                = (float) (y + j);
-    debugArray[index].z                = dScaleValue;
-    debugArray[index].w                = pScaleValue;
-/*
-    index                             += cSim.paddedNumberOfAtoms;
-    debugArray[index].x                = (float) bExclusionFlag;
-    debugArray[index].y                = (float) (tgx);
-    debugArray[index].z                = (float) j;
-    debugArray[index].w                = flag;
-
-    index                             += cSim.paddedNumberOfAtoms;
-    debugArray[index].x                = (float) dScaleMask;
-    debugArray[index].y                = (float) pScaleMask.x;
-    debugArray[index].z                = (float) pScaleMask.y;
-    debugArray[index].w                = flag;
-*/
-    index                             += cSim.paddedNumberOfAtoms;
-    debugArray[index].x                = match ? 0.0f : ijField[0].x;
-    debugArray[index].y                = match ? 0.0f : ijField[1].x;
-    debugArray[index].z                = match ? 0.0f : ijField[2].x;
-    debugArray[index].w                = flag + 1.0f;
-
-    index                             += cSim.paddedNumberOfAtoms;
-    debugArray[index].x                = match ? 0.0f : ijField[0].z;
-    debugArray[index].y                = match ? 0.0f : ijField[1].z;
-    debugArray[index].z                = match ? 0.0f : ijField[2].z;
-    debugArray[index].w                = flag + 2.0f;
-
-    index                             += cSim.paddedNumberOfAtoms;
-    debugArray[index].x                = match ? 0.0f : ijField[0].y;
-    debugArray[index].y                = match ? 0.0f : ijField[1].y;
-    debugArray[index].z                = match ? 0.0f : ijField[2].y;
-    debugArray[index].w                = flag + 3.0f;
-
-    index                             += cSim.paddedNumberOfAtoms;
-    debugArray[index].x                = match ? 0.0f : ijField[0].w;
-    debugArray[index].y                = match ? 0.0f : ijField[1].w;
-    debugArray[index].z                = match ? 0.0f : ijField[2].w;
-    debugArray[index].w                = flag + 4.0f;
-
-
-    for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
-        index                             += cSim.paddedNumberOfAtoms;
-        debugArray[index].x                = pullBack[pullIndex].x;
-        debugArray[index].y                = pullBack[pullIndex].y;
-        debugArray[index].z                = pullBack[pullIndex].z;
-        debugArray[index].w                = pullBack[pullIndex].w;
-    }   
-
-
-}
-#endif
            }

            // Write results
@@ -252,11 +179,7 @@ if( atomI == targetAtom || targetAtom == (y+j) ){
                        }

                        float4 ijField[3];
-                        calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[jIdx], dScaleValue, pScaleValue, ijField
-    #ifdef AMOEBA_DEBUG
-                                                    , pullBack
-    #endif
-                        );
+                        calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[jIdx], dScaleValue, pScaleValue, ijField);

                        unsigned int outOfBounds     = ( (atomI >= cSim.atoms) || ((y+jIdx) >= cSim.atoms) ) ? 1 : 0;

@@ -317,67 +240,8 @@ if( atomI == targetAtom || targetAtom == (y+j) ){
                            }
                        }

-#ifdef AMOEBA_DEBUG
-if( (atomI == targetAtom || (y + jIdx) == targetAtom) ){
-
-            unsigned int index                 = (atomI == targetAtom) ? (y + jIdx) : atomI;
-            unsigned int indexI                = (atomI == targetAtom) ? 0 : 2;
-            unsigned int indexJ                = (atomI == targetAtom) ? 2 : 0;
-
-            debugArray[index].x                = (float) atomI;
-            debugArray[index].y                = (float) (y + jIdx);
-            debugArray[index].z                = dScaleValue;
-            debugArray[index].w                = pScaleValue;
-
-            float flag                         = 9.0f;
-/*
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = (float) bExclusionFlag;
-            debugArray[index].y                = (float) (tgx);
-            debugArray[index].z                = (float) j;
-            debugArray[index].w                = jIdx;
-        
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = (float) dScaleMask;
-            debugArray[index].y                = (float) pScaleMask.x;
-            debugArray[index].z                = (float) pScaleMask.y;
-            debugArray[index].w                = (float) flags;
- */       
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                =  outOfBounds ? 0.0f : ijField[0].x;
-            debugArray[index].y                =  outOfBounds ? 0.0f : ijField[1].x;
-            debugArray[index].z                =  outOfBounds ? 0.0f : ijField[2].x;
-            debugArray[index].w                =  flag + 1.0f;
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                =  outOfBounds ? 0.0f : ijField[0].y;
-            debugArray[index].y                =  outOfBounds ? 0.0f : ijField[1].y;
-            debugArray[index].z                =  outOfBounds ? 0.0f : ijField[2].y;
-            debugArray[index].w                = flag + 2.0f;
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                =  outOfBounds ? 0.0f : ijField[0].z;
-            debugArray[index].y                =  outOfBounds ? 0.0f : ijField[1].z;
-            debugArray[index].z                =  outOfBounds ? 0.0f : ijField[2].z;
-            debugArray[index].w                = flag + 3.0f;
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                =  outOfBounds ? 0.0f : ijField[0].w;
-            debugArray[index].y                =  outOfBounds ? 0.0f : ijField[1].w;
-            debugArray[index].z                =  outOfBounds ? 0.0f : ijField[2].w;
-            debugArray[index].w                = flag + 4.0f;
-
-            for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
-                index                             += cSim.paddedNumberOfAtoms;
-                debugArray[index].x                = pullBack[pullIndex].x;
-                debugArray[index].y                = pullBack[pullIndex].y;
-                debugArray[index].z                = pullBack[pullIndex].z;
-                debugArray[index].w                = pullBack[pullIndex].w;
-            }
-}        
-#endif
                    }
-                    tj                  = (tj + 1) & (GRID - 1);
+                    tj = (tj + 1) & (GRID - 1);

                } // j-loop block
    

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
-//-----------------------------------------------------------------------------------------
-
-//-----------------------------------------------------------------------------------------
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */

 #include "amoebaGpuTypes.h"
 #include "amoebaCudaKernels.h"
@@ -33,9 +55,6 @@ void GetCalculateAmoebaCudaPmeMutualInducedFieldSim(amoebaGpuContext amoebaGpu)
    RTERROR(status, "GetCalculateAmoebaCudaPmeMutualInducedFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
 }

-//#define AMOEBA_DEBUG
-#undef AMOEBA_DEBUG
-
 #undef INCLUDE_MI_FIELD_BUFFERS
 #define INCLUDE_MI_FIELD_BUFFERS 
 #include "kCalculateAmoebaCudaMutualInducedParticle.h"
@@ -331,10 +350,6 @@ static void kReduceMutualInducedFieldDelta_kernel(int numberOfEntries, float* ar
    {   
        epsilon[0]  = delta[0].x > delta[0].y ? delta[0].x : delta[0].y;
        epsilon[0]  = 48.033324f*sqrtf( epsilon[0]/( (float) (numberOfEntries/3)) );
-#ifdef AMOEBA_DEBUG
-        epsilon[1]  = 48.033324f*sqrtf( delta[0].x/( (float) (numberOfEntries/3)) );
-        epsilon[2]  = 48.033324f*sqrtf( delta[0].y/( (float) (numberOfEntries/3)) );
-#endif
    }   
 }

@@ -416,18 +431,8 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
                                                                  CUDAStream<float>* outputArray, CUDAStream<float>* outputPolarArray )
 {
  
-  static unsigned int threadsPerBlock  = 0;
-  gpuContext gpu                       = amoebaGpu->gpuContext;
-
-#ifdef AMOEBA_DEBUG
-    int targetAtom                = 546;
-    static const char* methodName = "cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply";
-    static int iteration          = 1;
-    if( 1 && amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log, "%s\n", methodName );
-        (void) fflush( amoebaGpu->log );
-    }
-#endif
+    static unsigned int threadsPerBlock  = 0;
+    gpuContext gpu                       = amoebaGpu->gpuContext;

    kClearFields_3( amoebaGpu, 2 );

@@ -444,17 +449,6 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(MutualInducedParticle), gpu->sharedMemoryPerBlock ), maxThreads);
    }    

-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log ){
-        gpu->psInteractionCount->Download();
-        (void) fprintf( amoebaGpu->log, "cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u\n",
-                        gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
-                        sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*threadsPerBlock,
-                        (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
-        (void) fflush( amoebaGpu->log );
-    }
-#endif
-
    if (gpu->bOutputBufferPerWarp){

        kCalculateAmoebaPmeMutualInducedFieldCutoffByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
@@ -474,43 +468,6 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte

    kReduceMutualInducedFields( amoebaGpu, outputArray, outputPolarArray );

-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log && iteration == 1 ){
-        (void) fprintf( amoebaGpu->log, "Finished maxtrixMultiply kernel execution %d -- Direct only -- self added in kSorUpdateMutualInducedField_kernel\n",
-                        iteration ); (void) fflush( amoebaGpu->log );
-        outputArray->Download();
-        outputPolarArray->Download();
-        //debugArray->Download();
-        int maxPrint = 5;
-        for( int ii = 0; ii < gpu->natoms; ii++ ){
-            (void) fprintf( amoebaGpu->log, "%5d ", ii); 
- 
-             int indexOffset     = ii*3;
-     
-            // MI
- 
-            (void) fprintf( amoebaGpu->log,"Mult[%16.9e %16.9e %16.9e] ",
-                            outputArray->_pSysData[indexOffset],
-                            outputArray->_pSysData[indexOffset+1],
-                            outputArray->_pSysData[indexOffset+2] );
-     
-            // MI polar
- 
-            (void) fprintf( amoebaGpu->log,"MultP[%16.9e %16.9e %16.9e]\n",
-                            outputPolarArray->_pSysData[indexOffset],
-                            outputPolarArray->_pSysData[indexOffset+1],
-                            outputPolarArray->_pSysData[indexOffset+2] );
-            if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
-                ii = gpu->natoms - maxPrint;
-            }
-
-        }
-        (void) fflush( amoebaGpu->log );
-        iteration++;
-
-     }
-#endif
-
 }

 /**---------------------------------------------------------------------------------------
@@ -526,25 +483,12 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
  
   // ---------------------------------------------------------------------------------------

-//#define AMOEBA_DEBUG
-#ifdef AMOEBA_DEBUG
-    static const char* methodName = "cudaComputeAmoebaPmeMutualInducedFieldBySOR";
-    static int timestep = 0;
-    std::vector<int> fileId;
-    timestep++;
-    fileId.resize( 2 );
-    fileId[0] = timestep;
-    fileId[1] = 1;
-#endif
-
-   // ---------------------------------------------------------------------------------------
-
    int done;
    int iteration;

-     gpuContext gpu    = amoebaGpu->gpuContext;
+    gpuContext gpu    = amoebaGpu->gpuContext;

-   // ---------------------------------------------------------------------------------------
+    // ---------------------------------------------------------------------------------------

    // set  E_Field & E_FieldPolar] to [ E_Field & E_FieldPolar]*Polarizability
    // initialize [ InducedDipole & InducedDipolePolar ] to [ E_Field & E_FieldPolar]*Polarizability
@@ -559,19 +503,6 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
    cudaMemcpy( amoebaGpu->psInducedDipole->_pDevData,        amoebaGpu->psE_Field->_pDevData,       3*gpu->sim.paddedNumberOfAtoms*sizeof( float ), cudaMemcpyDeviceToDevice );
    cudaMemcpy( amoebaGpu->psInducedDipolePolar->_pDevData,   amoebaGpu->psE_FieldPolar->_pDevData,  3*gpu->sim.paddedNumberOfAtoms*sizeof( float ), cudaMemcpyDeviceToDevice );

-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log ){
-
-        std::vector<int> fileId;
-        VectorOfDoubleVectors outputVector;
-        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_Field,            outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_FieldPolar,       outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipole,      outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipolePolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-        cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeEFieldPolarity", fileId, outputVector );
-    }   
-#endif
-
    // if polarization type is direct, set flags signalling done and return

    if( amoebaGpu->amoebaSim.polarizationType )
@@ -609,12 +540,6 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
           amoebaGpu->psCurrentEpsilon->_pDevData );
        LAUNCHERROR("kReducePmeMutualInducedFieldDelta");

-#ifdef AMOEBA_DEBUG
-        if( 0 && amoebaGpu->log ){ // trackMutualInducedIterations
-            trackMutualInducedIterations( amoebaGpu, iteration);
-        }
-#endif
-
        // Debye=48.033324f
        amoebaGpu->psCurrentEpsilon->Download();
        float currentEpsilon                     = amoebaGpu->psCurrentEpsilon->_pSysData[0];
@@ -624,79 +549,6 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
            done = 1;
        }

-#ifdef AMOEBA_DEBUG
-        if( amoebaGpu->log ){
-           amoebaGpu->psInducedDipole->Download();
-           amoebaGpu->psInducedDipolePolar->Download();
-#if 1
-           (void) fprintf( amoebaGpu->log, "cudaComputeAmoebaPmeMutualInducedFieldBySOR iteration=%3d eps %14.6e [%14.6e %14.6e] done=%d\n",
-                           iteration, amoebaGpu->mutualInducedCurrentEpsilon,
-                           amoebaGpu->psCurrentEpsilon->_pSysData[1], 
-                           amoebaGpu->psCurrentEpsilon->_pSysData[2], done );
-#else
-           (void) fprintf( amoebaGpu->log, "%s iteration=%3d eps %14.6e %14.6e crrntEps=%14.6e %14.6e %14.6e %14.6e done=%d\n",
-                           methodName, iteration, sum1, sum2, amoebaGpu->mutualInducedCurrentEpsilon,
-                           amoebaGpu->psCurrentEpsilon->_pSysData[0], 
-                           amoebaGpu->psCurrentEpsilon->_pSysData[1], 
-                           amoebaGpu->psCurrentEpsilon->_pSysData[2], done );
-#endif
-           (void) fflush( amoebaGpu->log );
-
-            if( 0 ){
-                gpuContext gpu = amoebaGpu->gpuContext;
-                std::vector<int> fileId;
-                fileId.push_back( iteration );
-                VectorOfDoubleVectors outputVector;
-                cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_Field, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-                cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-                cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipole, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-                cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipolePolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-                cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeMI", fileId, outputVector );
-            }
-/*
-            int offset   = 0;
-            int maxPrint = 10;
-            for( int ii = 0; ii < gpu->natoms; ii++ ){
-                (void) fprintf( amoebaGpu->log, "%4d ", ii ); 
-    
-                (void) fprintf( amoebaGpu->log," Mi[%14.6e %14.6e %14.6e] ",
-                                amoebaGpu->psInducedDipole->_pSysData[offset],
-                                amoebaGpu->psInducedDipole->_pSysData[offset+1],
-                                amoebaGpu->psInducedDipole->_pSysData[offset+2] );
-                (void) fprintf( amoebaGpu->log,"Mip[%14.6e %14.6e %14.6e]\n",
-                                amoebaGpu->psInducedDipolePolar->_pSysData[offset],
-                                amoebaGpu->psInducedDipolePolar->_pSysData[offset+1],
-                                amoebaGpu->psInducedDipolePolar->_pSysData[offset+2] );
-                if( ii == maxPrint && (ii < (gpu->natoms - maxPrint) ) ){
-                    ii =  (gpu->natoms - maxPrint);
-                    offset = 3*(ii+1);
-                } else {
-                    offset += 3;
-                }
-            }   
-            (void) fflush( amoebaGpu->log );
-*/
-
-            if( 0 ){
-                std::vector<int> fileId;
-                fileId.push_back( iteration );
-                VectorOfDoubleVectors outputVector;
-                cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,                    outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-                cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipole,      outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-                cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipolePolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-                cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeMI", fileId, outputVector );
-            }
-
-        }
-
-        (void) fprintf( amoebaGpu->log, "MI iteration=%3d eps %14.6e [%14.6e %14.6e] done=%d\n",
-                        iteration, amoebaGpu->mutualInducedCurrentEpsilon,
-                        amoebaGpu->psCurrentEpsilon->_pSysData[1], 
-                        amoebaGpu->psCurrentEpsilon->_pSysData[2], done );
-        (void) fflush( amoebaGpu->log );
-
-#endif
-
        // exit if nan

        if( amoebaGpu->mutualInducedCurrentEpsilon != amoebaGpu->mutualInducedCurrentEpsilon ){
@@ -710,25 +562,6 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
    amoebaGpu->mutualInducedDone             = done;
    amoebaGpu->mutualInducedConverged        = ( !done || iteration > amoebaGpu->mutualInducedMaxIterations ) ? 0 : 1;

-#ifdef AMOEBA_DEBUG
-    if( 0 ){
-        std::vector<int> fileId;
-        //fileId.push_back( 0 );
-        VectorOfDoubleVectors outputVector;
-        cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,                    outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipole,      outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipolePolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-        cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeMI", fileId, outputVector );
-     }
-
-    if( 0 ){
-        static int iteration = 0;
-        checkForNans( gpu->natoms,  3, amoebaGpu->psInducedDipole, gpu->psAtomIndex->_pSysData,    ++iteration, "CudaPmeMI", stderr );
-        checkForNans( gpu->natoms,  3, amoebaGpu->psInducedDipolePolar, gpu->psAtomIndex->_pSysData, iteration, "CudaPmeMIPolar", stderr );
-     }
-#endif
-
-   // ---------------------------------------------------------------------------------------
 }

 void cudaComputeAmoebaPmeMutualInducedField( amoebaGpuContext amoebaGpu )

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.h
@@ -37,9 +37,6 @@ __launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
 void METHOD_NAME(kCalculateAmoebaPmeMutualInducedField, _kernel)(
                            unsigned int* workUnit,
                            float* outputField, float* outputFieldPolar
-#ifdef AMOEBA_DEBUG
-                           , float4* debugArray, unsigned int targetAtom
-#endif
 ){

    extern __shared__ MutualInducedParticle sA[];
@@ -52,10 +49,6 @@ void METHOD_NAME(kCalculateAmoebaPmeMutualInducedField, _kernel)(
    unsigned int lasty           = 0xFFFFFFFF;
    const float uscale           = 1.0f;

-#ifdef AMOEBA_DEBUG
-    float4 pullBack[4];
-#endif
-
    while (pos < end)
    {


--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaRotateFrame.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaRotateFrame.cu
-//-----------------------------------------------------------------------------------------
-
-//-----------------------------------------------------------------------------------------
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */

 #include "cudaKernels.h"
 #include "amoebaCudaKernels.h"
@@ -51,8 +73,6 @@ __device__ static float normVector3( float* vector )
    return returnNorm;
 }

-#undef AMOEBA_DEBUG
-
 // ZThenX     == 0
 // Bisector   == 1
 // ZBisect    == 2
@@ -379,10 +399,6 @@ void kCudaComputeLabFrameMoments_kernel( void )
 void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu )
 {

-   // ---------------------------------------------------------------------------------------
-
-   static const char* methodName = "computeCudaAmoebaLabFrameMoments";
-
   // ---------------------------------------------------------------------------------------

    gpuContext gpu    = amoebaGpu->gpuContext;
@@ -390,33 +406,6 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu )
    int numBlocks     = gpu->sim.blocks;
    int numThreads    = gpu->sim.threads_per_block;

-//#define AMOEBA_DEBUG  
-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log, "%s: numBlocks/atoms=%d\n", methodName, numBlocks ); (void) fflush( amoebaGpu->log );
-        amoebaGpu->psMultipoleParticlesIdsAndAxisType->Download();
-        amoebaGpu->psMolecularDipole->Download();
-        amoebaGpu->psMultipoleParticlesTorqueBufferIndices->Download();
-        gpu->psPosq4->Download();
-        for( int ii = 0; ii < gpu->natoms; ii++ ){
-            int mIndex = 3*ii;
-             (void) fprintf( amoebaGpu->log,"%6d [%6d %6d %6d %6d] x[%16.9e %16.9e %16.9e] %s [%6d %6d %6d %6d]\n", ii,
-                             amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].x,
-                             amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].y,
-                             amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].z,
-                             amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].w,
-                             gpu->psPosq4->_pSysData[ii].x,
-                             gpu->psPosq4->_pSysData[ii].y,
-                             gpu->psPosq4->_pSysData[ii].z, (amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysData[ii].w > 1 ? " XXX" : ""),
-                             amoebaGpu->psMultipoleParticlesTorqueBufferIndices->_pSysData[ii].x,
-                             amoebaGpu->psMultipoleParticlesTorqueBufferIndices->_pSysData[ii].y,
-                             amoebaGpu->psMultipoleParticlesTorqueBufferIndices->_pSysData[ii].z,
-                             amoebaGpu->psMultipoleParticlesTorqueBufferIndices->_pSysData[ii].w );
-            //if( ii == 30 )ii = gpu->natoms - 30;
-        }
-    }
-#endif
-
    // copy molecular moments to lab frame moment arrays
    // check if chiral center requires moments to have sign flipped
    // compute lab frame moments
@@ -428,7 +417,7 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu )
    LAUNCHERROR("kCudaComputeCheckChiral");

    kCudaComputeLabFrameMoments_kernel<<< numBlocks, numThreads>>> ( );
-    LAUNCHERROR(methodName);
+    LAUNCHERROR("kCudaComputeLabFrameMoments");

 }

@@ -505,5 +494,3 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG
    }

 }
-
-#undef AMOEBA_DEBUG
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaUtilities.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaUtilities.cu
-
-//-----------------------------------------------------------------------------------------
-
-//-----------------------------------------------------------------------------------------
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */

 #include "amoebaCudaKernels.h"
-//#define AMOEBA_DEBUG

 static __constant__ cudaGmxSimulation cSim;
 static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaVdw14_7.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaVdw14_7.cu
-//-----------------------------------------------------------------------------------------
-
-//-----------------------------------------------------------------------------------------
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */

 #include "amoebaGpuTypes.h"
 #include "amoebaCudaKernels.h"
@@ -38,12 +60,6 @@ void GetCalculateAmoebaCudaVdw14_7Sim(amoebaGpuContext amoebaGpu)
    RTERROR(status, "GetCalculateAmoebaCudaVdw14_7Sim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
 }

-//#define AMOEBA_DEBUG_PRINT
-#undef AMOEBA_DEBUG_PRINT
-
-//#define AMOEBA_DEBUG
-#undef AMOEBA_DEBUG
-
 __device__ void zeroVdw14_7SharedForce( struct Vdw14_7Particle* sA ) 
 {
    // zero shared fields
@@ -101,11 +117,7 @@ __device__ void getVdw14_7CombindedSigmaEpsilon_kernel( int sigmaCombiningRule,
 }

 __device__ void calculateVdw14_7PairIxn_kernel( float combindedSigma,    float combindedEpsilon,
-                                                float force[3], float* energy
-#ifdef AMOEBA_DEBUG
-               , float4* debugArray
-#endif
-)
+                                                float force[3], float* energy)
 {

    const float deltaHalM1 = 0.07f;
@@ -120,14 +132,6 @@ __device__ void calculateVdw14_7PairIxn_kernel( float combindedSigma,    float c
    float r2                                     = force[0]*force[0] + force[1]*force[1] + force[2]*force[2];
    if( r2 > cAmoebaSim.vdwCutoff2 ){
        *energy = force[0] = force[1] = force[2] = 0.0f;
-#ifdef AMOEBA_DEBUG
-        float rI                                     =  rsqrtf( r2 );
-        float r                                      =  1.0f/rI;
-        debugArray[0].x = r;
-        debugArray[0].y = debugArray[0].z = debugArray[0].w = 0.0f;
-        debugArray[1].x = debugArray[1].y = debugArray[1].z = 0.0f;
-        debugArray[1].w = r;
-#endif
        return;
    }
    float rI                                     =  rsqrtf( r2 );
@@ -156,17 +160,6 @@ __device__ void calculateVdw14_7PairIxn_kernel( float combindedSigma,    float c
    force[1]                                    *= deltaE;
    force[2]                                    *= deltaE;

-#ifdef AMOEBA_DEBUG
-    debugArray[0].x                              = r;
-    debugArray[0].y                              = deltaE;
-    debugArray[0].z                              = combindedSigma;
-    debugArray[0].w                              = combindedEpsilon;
-
-    debugArray[1].x                              = tau;
-    debugArray[1].y                              = rho;
-    debugArray[1].z                              = gTau;
-    debugArray[1].w                              = r;
-#endif
 }

 // perform reduction of force on H's and add to heavy atom partner
@@ -504,22 +497,6 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff

     gpuContext gpu    = amoebaGpu->gpuContext;

-#ifdef AMOEBA_DEBUG_PRINT
-    static const char* methodName = "kCalculateAmoebaVdw14_7Forces";
-    if( 1 && amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log, "%s: \n", methodName );
-        (void) fflush( amoebaGpu->log );
-    }   
-#ifdef AMOEBA_DEBUG
-    int paddedNumberOfAtoms                    = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
-    int maxSlots                               = 10;
-    CUDAStream<float4>* debugArray             = new CUDAStream<float4>(maxSlots*paddedNumberOfAtoms, 1, "DebugArray");
-    memset( debugArray->_pSysData,      0, sizeof( float )*4*maxSlots*paddedNumberOfAtoms);
-    debugArray->Upload();
-    int targetAtom                             = 1;
-#endif
-#endif
-
    // set threads/block first time through

    // on first pass, set threads/block
@@ -535,44 +512,9 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(Vdw14_7Particle), gpu->sharedMemoryPerBlock ), maxThreads);
    }    

-#ifdef AMOEBA_DEBUG_PRINT
-    if( 0 ){
-        static int iteration = 0;
-        checkForNansFloat4( gpu->natoms, gpu->psPosq4, gpu->psAtomIndex->_pSysData, ++iteration, "\n\nzCoordPreCopyVdw", stderr );
-     }   
-#endif
-
    kCalculateAmoebaVdw14_7CopyCoordinates( amoebaGpu, gpu->psPosq4, amoebaGpu->psAmoebaVdwCoordinates );
    kCalculateAmoebaVdw14_7CoordinateReduction( amoebaGpu, amoebaGpu->psAmoebaVdwCoordinates, amoebaGpu->psAmoebaVdwCoordinates );

-#ifdef AMOEBA_DEBUG_PRINT
-    if( 1 && amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log, "Apply cutoff=%d warp=%d\n", applyCutoff, gpu->bOutputBufferPerWarp );
-        (void) fprintf( amoebaGpu->log, "numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n",
-                        gpu->sim.nonbond_blocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
-                        sizeof(Vdw14_7Particle), sizeof(Vdw14_7Particle)*threadsPerBlock,
-                        (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
-        if( 0 ){  
-            gpu->psInteractionCount->Download();
-            amoebaGpu->psVdwWorkUnit->Download();
-            unsigned int totalWarps  = (gpu->sim.nonbond_blocks*threadsPerBlock)/GRID;
-            float        ratiof     = (float)totalWarps/(float)amoebaGpu->psVdwWorkUnit->_length;
-            (void) fprintf( amoebaGpu->log, "Ixn warps=%u count=%u\n", totalWarps, gpu->psInteractionCount->_pSysData[0] );
-            for( unsigned int ii = 0; ii < amoebaGpu->psVdwWorkUnit->_length; ii++ ){
-        
-                unsigned int x          = amoebaGpu->psVdwWorkUnit->_pSysData[ii];
-                unsigned int y          = ((x >> 2) & 0x7fff) << GRIDBITS;
-                unsigned int exclusions = (x & 0x1);
-                             x          = (x >> 17) << GRIDBITS;
-                float        warp       = (float)(ii)*ratiof; 
-                (void) fprintf( amoebaGpu->log, "GpuCell %8u  [%5u %5u %1u] %10u warp=%15.6f\n", ii, x,y,exclusions, warp );
-        
-            }           
-        }
-        (void) fflush( amoebaGpu->log );
-    }
-#endif
-
    // clear output arrays

    kClearFields_3( amoebaGpu, 1 );
@@ -588,33 +530,6 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
                    sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
        LAUNCHERROR("kFindInteractionsWithinBlocksVdwPeriodic");

-#ifdef AMOEBA_DEBUG
-        if( 0 && amoebaGpu->log ){  
-            gpu->psInteractionCount->Download();
-            gpu->psInteractingWorkUnit->Download();
-            gpu->psInteractionFlag->Download();
-            amoebaGpu->psVdwWorkUnit->Download();
-            (void) fprintf( amoebaGpu->log, "Vdw Ixn count=%u\n", gpu->psInteractionCount->_pSysData[0] );
-
-            for( unsigned int ii = 0; ii < gpu->psInteractingWorkUnit->_length; ii++ ){
-        
-                unsigned int x          = gpu->psInteractingWorkUnit->_pSysData[ii];
-                unsigned int y          = ((x >> 2) & 0x7fff) << GRIDBITS;
-                unsigned int exclusions = (x & 0x1);
-                             x          = (x >> 17) << GRIDBITS;
-                (void) fprintf( amoebaGpu->log, "GpuCell %8u  %8u [%5u %5u %1u] %10u ", ii, gpu->psInteractingWorkUnit->_pSysData[ii], x,y,exclusions, gpu->psInteractionFlag->_pSysData[ii] );
-        
-                             x          = amoebaGpu->psVdwWorkUnit->_pSysData[ii];
-                             y          = ((x >> 2) & 0x7fff) << GRIDBITS;
-                             exclusions = (x & 0x1);
-                             x          = (x >> 17) << GRIDBITS;
-                (void) fprintf( amoebaGpu->log, "   AmGpu %8u [%5u %5u %1u]\n", amoebaGpu->psWorkUnit->_pSysData[ii], x,y,exclusions );
-            }    
-                (void) fflush( amoebaGpu->log );
-
-        }
-#endif
-
        if (gpu->bOutputBufferPerWarp){
            kCalculateAmoebaVdw14_7CutoffByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>(
                                                                gpu->sim.pInteractingWorkUnit,
@@ -622,12 +537,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
                                                                amoebaGpu->psVdwSigmaEpsilon->_pDevData,
                                                                amoebaGpu->vdwSigmaCombiningRule,
                                                                amoebaGpu->vdwEpsilonCombiningRule,
-#ifdef AMOEBA_DEBUG
-                                                                amoebaGpu->psWorkArray_3_1->_pDevData,
-                                                                debugArray->_pDevData, targetAtom );
-#else
                                                                amoebaGpu->psWorkArray_3_1->_pDevData );
-#endif
        } else {

            kCalculateAmoebaVdw14_7Cutoff_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>(
@@ -636,13 +546,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
                                                                amoebaGpu->psVdwSigmaEpsilon->_pDevData,
                                                                amoebaGpu->vdwSigmaCombiningRule,
                                                                amoebaGpu->vdwEpsilonCombiningRule,
-#ifdef AMOEBA_DEBUG
-                                                                amoebaGpu->psWorkArray_3_1->_pDevData,
-                                                                debugArray->_pDevData, targetAtom );
-#else
                                                                amoebaGpu->psWorkArray_3_1->_pDevData );
-#endif
-    
        }
        LAUNCHERROR("kCalculateAmoebaVdw14_7Cutoff");  

@@ -656,12 +560,7 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
                                                                amoebaGpu->psVdwSigmaEpsilon->_pDevData,
                                                                amoebaGpu->vdwSigmaCombiningRule,
                                                                amoebaGpu->vdwEpsilonCombiningRule,
-#ifdef AMOEBA_DEBUG
-                                                                amoebaGpu->psWorkArray_3_1->_pDevData,
-                                                                debugArray->_pDevData, targetAtom );
-#else
                                                                amoebaGpu->psWorkArray_3_1->_pDevData );
-#endif
        } else {

            kCalculateAmoebaVdw14_7N2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>(
@@ -670,112 +569,15 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff
                                                                amoebaGpu->psVdwSigmaEpsilon->_pDevData,
                                                                amoebaGpu->vdwSigmaCombiningRule,
                                                                amoebaGpu->vdwEpsilonCombiningRule,
-#ifdef AMOEBA_DEBUG
-                                                                amoebaGpu->psWorkArray_3_1->_pDevData,
-                                                                debugArray->_pDevData, targetAtom );
-#else
                                                                amoebaGpu->psWorkArray_3_1->_pDevData );
-#endif
    
        }
        LAUNCHERROR("kCalculateAmoebaVdw14_7N2");  
    }

-#ifdef AMOEBA_DEBUG_PRINT
-    if( amoebaGpu->log ){
-        static int iteration = 0;
-        (void) fprintf( amoebaGpu->log, "Finished 14-7 kernel execution step=%d\n", ++iteration );
-        (void) fflush( amoebaGpu->log );
-
-#ifdef AMOEBA_DEBUG
-        debugArray->Download();
-        int paddedNumberOfAtoms          = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
-        double cutOff                    = 1.0e+03;
-        for( int jj = 0; jj < gpu->natoms; jj++ ){
-            int debugIndex = jj;
-            (void) fprintf( amoebaGpu->log,"%5d %5d DebugVdw\n", targetAtom, jj );
-            for( int kk = 0; kk < 5; kk++ ){
-                (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
-                                debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
-                                debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
-                if( kk == 4 && ( fabs(  debugArray->_pSysData[debugIndex].x ) > cutOff ||
-                                 fabs(  debugArray->_pSysData[debugIndex].y ) > cutOff ||
-                                 fabs(  debugArray->_pSysData[debugIndex].z ) > cutOff ) ){
-                    (void) fprintf( amoebaGpu->log," XXXX\n" );
-                }
-                debugIndex += paddedNumberOfAtoms;
-            }
-            (void) fprintf( amoebaGpu->log,"\n" );
-        }
-#endif
-/*
-        amoebaGpu->psWorkArray_3_2->Download();
-        amoebaGpu->psWorkArray_3_1->Download();
-        //for( int jj = 0; jj < 3*gpu->natoms; jj += 3 )
-        for( int jj = 0; jj < 3*gpu->natoms; jj += 3 ){
-            for( int kk = 0; kk < gpu->sim.outputBuffers; kk++ ){
-                float delta = fabs(amoebaGpu->psWorkArray_3_1->_pSysStream[kk][jj+2] + 1.0f);
-                if( delta < 5.0e-06 || isNanOrInfinity( (double) amoebaGpu->psWorkArray_3_1->_pSysStream[kk][jj] ) || isNanOrInfinity( (double) amoebaGpu->psWorkArray_3_1->_pSysStream[kk][jj+2] )  )
-                (void) fprintf( amoebaGpu->log,"%6d %6d [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e]\n", jj, kk,
-                                amoebaGpu->psWorkArray_3_1->_pSysStream[kk][jj],
-                                amoebaGpu->psWorkArray_3_1->_pSysStream[kk][jj+1],
-                                amoebaGpu->psWorkArray_3_1->_pSysStream[kk][jj+2],
-                                amoebaGpu->psWorkArray_3_2->_pSysStream[kk][jj],
-                                amoebaGpu->psWorkArray_3_2->_pSysStream[kk][jj+1],
-                                amoebaGpu->psWorkArray_3_2->_pSysStream[kk][jj+2] );
-            }
-        }
-*/
-
-    }
-#endif
-
-#ifdef AMOEBA_DEBUG
-    if( 0 ){
-        static int iteration = 0;
-        checkForNansFloat4( gpu->natoms, amoebaGpu->gpuContext->psForce4, gpu->psAtomIndex->_pSysData, ++iteration, "PreVdw", stderr );
-        checkForNansFloat4( gpu->natoms, gpu->psPosq4, gpu->psAtomIndex->_pSysData, iteration, "zCoordPreVdw", stderr );
-     }   
-#endif
-
    kReduceVdw14_7( amoebaGpu, amoebaGpu->psWorkArray_3_2 );

-#ifdef AMOEBA_DEBUG
-    if( 0 ){
-        static int iteration = 0;
-        checkForNans( gpu->natoms, 3, amoebaGpu->psWorkArray_3_2, gpu->psAtomIndex->_pSysData, ++iteration, "Vdw32", stderr );
-     }   
-#endif
-
    kCalculateAmoebaVdw14_7Reduction( amoebaGpu, amoebaGpu->psWorkArray_3_2, amoebaGpu->gpuContext->psForce4 );
    kCalculateAmoebaVdw14_7NonReduction( amoebaGpu, amoebaGpu->psWorkArray_3_2, amoebaGpu->gpuContext->psForce4 );

-#ifdef AMOEBA_DEBUG
-    if( 0 ){
-        int paddedNumberOfAtoms             = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
-        CUDAStream<float4>* psTempForce     = new CUDAStream<float4>(paddedNumberOfAtoms, 1, "psTempForce");
-        kClearFloat4( amoebaGpu, paddedNumberOfAtoms, psTempForce );
-        //kCalculateAmoebaVdw14_7Reduction( amoebaGpu, amoebaGpu->psWorkArray_3_2, psTempForce );
-        kCalculateAmoebaVdw14_7NonReduction( amoebaGpu, amoebaGpu->psWorkArray_3_2, psTempForce );
-        std::vector<int> fileId;
-        //fileId.push_back( 0 );
-        VectorOfDoubleVectors outputVector;
-        cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-        cudaLoadCudaFloat4Array( gpu->natoms,  3, psTempForce, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-        cudaWriteVectorOfDoubleVectorsToFile( "CudaVdw", fileId, outputVector );
-        delete psTempForce;
-        //exit(0);
-     }
-
-    if( 0 ){
-        static int iteration = 0;
-        checkForNansFloat4( gpu->natoms, amoebaGpu->gpuContext->psForce4, gpu->psAtomIndex->_pSysData, ++iteration, "VdwForce", stderr );
-     }   
-#endif
-
-#ifdef AMOEBA_DEBUG
-    delete debugArray;
-#endif
-
-   // ---------------------------------------------------------------------------------------
 }
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaVdw14_7.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaVdw14_7.h
@@ -39,9 +39,6 @@ void METHOD_NAME(kCalculateAmoebaVdw14_7, _kernel)(
                            int sigmaCombiningRule,
                            int epsilonCombiningRule,
                            float* outputForce
-#ifdef AMOEBA_DEBUG
-                           , float4* debugArray, unsigned int targetAtom
-#endif
 ){

    extern __shared__ Vdw14_7Particle sA[];
@@ -57,9 +54,6 @@ void METHOD_NAME(kCalculateAmoebaVdw14_7, _kernel)(
    int exclusionMask;
    float totalEnergy            = 0.0f;

-#ifdef AMOEBA_DEBUG
-    float4 pullDebug[5];
-#endif
    while (pos < end)
    {

@@ -129,11 +123,7 @@ void METHOD_NAME(kCalculateAmoebaVdw14_7, _kernel)(
                }

                float energy;
-                calculateVdw14_7PairIxn_kernel( combindedSigma, combindedEpsilon, ijForce, &energy
-#ifdef AMOEBA_DEBUG
-,  pullDebug
-#endif
-);
+                calculateVdw14_7PairIxn_kernel( combindedSigma, combindedEpsilon, ijForce, &energy);
                // mask out excluded ixns

                unsigned int mask  =  ( (atomI >= cSim.atoms) || ((y+j) >= cSim.atoms) ) ? 0 : 1;
@@ -148,41 +138,6 @@ void METHOD_NAME(kCalculateAmoebaVdw14_7, _kernel)(
                forceSum[1]            += mask ? ijForce[1]  : 0.0f;
                forceSum[2]            += mask ? ijForce[2]  : 0.0f;
                totalEnergy            += mask ? 0.5f*energy : 0.0f;
-
-#ifdef AMOEBA_DEBUG
-if( atomI == targetAtom || (y+j) == targetAtom ){
-        unsigned int index                 = (atomI == targetAtom) ? (y + j) : atomI;
-
-        debugArray[index].x                = (float) atomI;
-        debugArray[index].y                = (float) (y + j); 
-        debugArray[index].z                = -1.0f;
-        debugArray[index].w                = (float) (mask + 1); 
-
-        index                             += cSim.paddedNumberOfAtoms;
-        debugArray[index].x                = (float) x;
-        debugArray[index].y                = (float) y;
-        debugArray[index].z                = (float) tgx;
-        debugArray[index].w                = energy;
-
-        index                             += cSim.paddedNumberOfAtoms;
-        debugArray[index].x                = pullDebug[0].x;
-        debugArray[index].y                = pullDebug[0].y;
-        debugArray[index].z                = pullDebug[0].z;
-        debugArray[index].w                = pullDebug[0].w;
-
-        index                             += cSim.paddedNumberOfAtoms;
-        debugArray[index].x                = pullDebug[1].x;
-        debugArray[index].y                = pullDebug[1].y;
-        debugArray[index].z                = pullDebug[1].z;
-        debugArray[index].w                = pullDebug[1].w;
-
-        index                             += cSim.paddedNumberOfAtoms;
-        debugArray[index].x                = mask ? ijForce[0] : 0.0f;
-        debugArray[index].y                = mask ? ijForce[1] : 0.0f;
-        debugArray[index].z                = mask ? ijForce[2] : 0.0f;
-}
-#endif
-
            }

            // Write results
@@ -260,11 +215,7 @@ flags = 0xFFFFFFFF;
                            ijForce[1]   -= floor(ijForce[1]*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
                            ijForce[2]   -= floor(ijForce[2]*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
                        }
-                        calculateVdw14_7PairIxn_kernel( combindedSigma, combindedEpsilon, ijForce, &energy
-#ifdef AMOEBA_DEBUG
-            ,  pullDebug
-#endif
-               );
+                        calculateVdw14_7PairIxn_kernel( combindedSigma, combindedEpsilon, ijForce, &energy);
            
                        // mask out excluded ixns
        
@@ -328,40 +279,6 @@ flags = 0xFFFFFFFF;
 #endif
        
                
-#ifdef AMOEBA_DEBUG
-if( atomI == targetAtom || (y+jIdx) == targetAtom ){
-        unsigned int index                 = (atomI == targetAtom) ? (y + jIdx) : atomI;
-
-        debugArray[index].x                = (float) atomI;
-        debugArray[index].y                = (float) (y + jIdx); 
-        debugArray[index].z                = -3.0;
-        debugArray[index].w                = (float) (mask + 1); 
-
-        index                             += cSim.paddedNumberOfAtoms;
-        debugArray[index].x                = (float) x;
-        debugArray[index].y                = (float) y;
-        debugArray[index].z                = (float) tgx;
-        debugArray[index].w                = energy;
-
-        index                             += cSim.paddedNumberOfAtoms;
-        debugArray[index].x                = pullDebug[0].x;
-        debugArray[index].y                = pullDebug[0].y;
-        debugArray[index].z                = pullDebug[0].z;
-        debugArray[index].w                = pullDebug[0].w;
-
-        index                             += cSim.paddedNumberOfAtoms;
-        debugArray[index].x                = pullDebug[1].x;
-        debugArray[index].y                = pullDebug[1].y;
-        debugArray[index].z                = pullDebug[1].z;
-        debugArray[index].w                = pullDebug[1].w;
-
-        index                             += cSim.paddedNumberOfAtoms;
-        debugArray[index].x                = mask ? ijForce[0] : 0.0f;
-        debugArray[index].y                = mask ? ijForce[1] : 0.0f;
-        debugArray[index].z                = mask ? ijForce[2] : 0.0f;
-}
-#endif
-
 #ifdef USE_CUTOFF
                    }
 #endif

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaWcaDispersion.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaWcaDispersion.cu
-//-----------------------------------------------------------------------------------------
-
-//-----------------------------------------------------------------------------------------
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */

 #include "amoebaGpuTypes.h"
 #include "amoebaCudaKernels.h"
@@ -35,9 +57,6 @@ void GetCalculateAmoebaCudaWcaDispersionSim(amoebaGpuContext amoebaGpu)
    RTERROR(status, "GetCalculateAmoebaCudaWcaDispersionSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
 }

-//#define AMOEBA_DEBUG
-#undef AMOEBA_DEBUG
-
 __device__ void zeroWcaDispersionSharedForce( struct WcaDispersionParticle* sA ) 
 {
    // zero shared fields
@@ -105,14 +124,7 @@ __device__ void calculateWcaDispersionPairIxn_kernel( float4 atomCoordinatesI, f
                                                      float radiusI,  float radiusJ,
                                                      float rmixo,    float rmixh,
                                                      float emixo,    float emixh,
-                                                      float force[3], float* energy
-
-#ifdef AMOEBA_DEBUG
-               , float4* debugArray
-#endif
-
- )
-{
+                                                      float force[3], float* energy ) {

    const float pi         = 3.1415926535897f;
    const float shctd      = cAmoebaSim.shctd;
@@ -318,29 +330,6 @@ __device__ void calculateWcaDispersionPairIxn_kernel( float4 atomCoordinatesI, f
    force[1]                                    *= de;
    force[2]                                    *= de;

-#ifdef AMOEBA_DEBUG
-    debugArray[0].x                              = sum;
-    debugArray[0].y                              = sum;
-    debugArray[0].z                              = sum;
-    debugArray[0].w                              = sum;
-#if 0
-    debugArray[0].x                              = r;
-    debugArray[0].y                              = -r*de/awater;
-    debugArray[0].z                              = emixo;
-    debugArray[0].w                              = mask2;
-
-    debugArray[1].x                              = dl;
-    debugArray[1].y                              = du;
-    debugArray[1].z                              = lik;
-    debugArray[1].w                              = uik;
-
-    debugArray[2].x                              = du1;
-    debugArray[2].y                              = du2;
-    debugArray[2].z                              = term;
-    debugArray[2].w                              = sk;
-#endif
-
-#endif
 }

 // Include versions of the kernels for N^2 calculations.
@@ -385,16 +374,6 @@ void kCalculateAmoebaWcaDispersionForces( amoebaGpuContext amoebaGpu )
       threadsPerBlock = std::min(getThreadsPerBlock( amoebaGpu, sizeof(WcaDispersionParticle), gpu->sharedMemoryPerBlock ), maxThreads);
    }

-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log, "%s numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n",
-                        methodName, gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
-                        sizeof(WcaDispersionParticle), sizeof(WcaDispersionParticle)*threadsPerBlock,
-                        (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
-        (void) fflush( amoebaGpu->log );
-    }
-#endif
-
    if (gpu->bOutputBufferPerWarp){

        kCalculateAmoebaWcaDispersionN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(WcaDispersionParticle)*threadsPerBlock>>>(

--- a/plugins/amoeba/platforms/cuda/src/kernels/kFindInteractingBlocksVdw.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kFindInteractingBlocksVdw.h
@@ -24,56 +24,6 @@
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
 * -------------------------------------------------------------------------- */

-/**
- * This file contains the kernels for identifying interacting blocks.  It is included
- * several times in kCalculateCDLJForces.cu with different #defines to generate
- * different versions of the kernels.
- */
-
-/**
- * Find a bounding box for the atoms in each block.
- */
-/*
-__global__ void METHOD_NAME(kFindBlockBounds, _kernel)()
-{
-    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
-    unsigned int base = pos << GRIDBITS;
-    if (base < cSim.atoms)
-    {
-        float4 apos = cSim.pPosq[base];
-#ifdef USE_PERIODIC
-        apos.x -= floor(apos.x*cSim.invPeriodicBoxSizeX)*cSim.periodicBoxSizeX;
-        apos.y -= floor(apos.y*cSim.invPeriodicBoxSizeY)*cSim.periodicBoxSizeY;
-        apos.z -= floor(apos.z*cSim.invPeriodicBoxSizeZ)*cSim.periodicBoxSizeZ;
-        float4 firstPoint = apos;
-#endif
-        float minx = apos.x;
-        float maxx = apos.x;
-        float miny = apos.y;
-        float maxy = apos.y;
-        float minz = apos.z;
-        float maxz = apos.z;
-        for (unsigned int i = 1; i < GRID; i++)
-        {
-            apos = cSim.pPosq[base+i];
-#ifdef USE_PERIODIC
-            apos.x -= floor((apos.x-firstPoint.x)*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-            apos.y -= floor((apos.y-firstPoint.y)*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-            apos.z -= floor((apos.z-firstPoint.z)*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-            minx = min(minx, apos.x);
-            maxx = max(maxx, apos.x);
-            miny = min(miny, apos.y);
-            maxy = max(maxy, apos.y);
-            minz = min(minz, apos.z);
-            maxz = max(maxz, apos.z);
-        }
-        cSim.pGridBoundingBox[pos] = make_float4(0.5f*(maxx-minx), 0.5f*(maxy-miny), 0.5f*(maxz-minz), 0);
-        cSim.pGridCenter[pos] = make_float4(0.5f*(maxx+minx), 0.5f*(maxy+miny), 0.5f*(maxz+minz), 0);
-    }
-}
-*/
-
 /**
 * Compare the bounding boxes for each pair of blocks.  If they are sufficiently far apart,
 * mark them as non-interacting.