Added copyright

Removed debugging code

Added copyright
Removed debugging code
2b508482 · Mark Friedrichs · 36762962 · 2b508482 · 2b508482 · 2b508482
Commit 2b508482 authored Nov 23, 2011 by Mark Friedrichs
20 changed files
--- a/platforms/cuda/tests/TestCudaGBVIForce2.cpp
+++ b/platforms/cuda/tests/TestCudaGBVIForce2.cpp
@@ -61,6 +61,17 @@
 #include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
 #include "OpenMM.h"

+#if TEST_PLATFORM == TEST_OPENCL_PLATFORM
+#include "ReferencePlatform.h"
+#include "OpenCLPlatform.h"
+#endif
+
+#if TEST_PLATFORM == TEST_CUDA_PLATFORM
+#include "ReferencePlatform.h"
+#include "CudaPlatform.h"
+#endif
+    
+
 #ifdef USE_SOFTCORE
 #include "OpenMMFreeEnergy.h"
 #include "openmm/freeEnergyKernels.h"
@@ -2900,23 +2911,32 @@ void runSystemComparisonTest( System& system1, System& system2,
        throw OpenMMException( msg.str() );
    }
 
+#if TEST_PLATFORM == TEST_OPENCL_PLATFORM
+    ReferencePlatform platform1;   
+    OpenCLPlatform platform2;
+#elif TEST_PLATFORM == TEST_CUDA_PLATFORM
+    ReferencePlatform platform1;   
+    CudaPlatform platform2;
+#else
    Platform& platform1 = Platform::getPlatformByName( platformName1 );
    if( deviceId1 ){
        setDeviceId( platform1, deviceId1, log );
    }
    setDeviceIdUsingEnvVariable( platform1, log );
-    Context context1( system1, integrator1, platform1 );
-    context1.setPositions(positions);
-    State state1 = context1.getState(State::Forces | State::Energy);

    Platform& platform2 = Platform::getPlatformByName( platformName2 );
    if( deviceId2 ){
        setDeviceId( platform2, deviceId2, log );
    }
    setDeviceIdUsingEnvVariable( platform2, log );
+#endif
+
+    Context context1( system1, integrator1, platform1 );
+    context1.setPositions(positions);
+    State state1 = context1.getState(State::Forces | State::Energy);
+
    Context context2( system2, integrator2, platform2 );
    context2.setPositions(positions);
-
    State state2 = context2.getState(State::Forces | State::Energy);

    double energyDiff = 0.0;
@@ -3419,10 +3439,19 @@ int main() {
            }
        }

+        // if TEST_PLATFORM is not set, then check that required libs are available
+     
+#if TEST_PLATFORM != TEST_OPENCL_PLATFORM && TEST_PLATFORM != TEST_CUDA_PLATFORM
        envVariableIsSet = checkRequiredLibsAreAvailable( requiredLibs, loadedLibs, log );
        if( envVariableIsSet == 0 && log ){
            (void) fprintf( log, "Aborting tests due to missing libs.\n" );
        }
+#else
+
+        // unit test path: force tests to run
+        
+        envVariableIsSet = 1;
+#endif

        if( platformId2s.size() > 0 ){
            generativeArgumentMaps["platformId2"] = platformId2s;

--- a/platforms/opencl/tests/TestOpenCLGBSAOBCForce2.cpp
+++ b/platforms/opencl/tests/TestOpenCLGBSAOBCForce2.cpp
@@ -61,6 +61,17 @@
 #include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
 #include "OpenMM.h"

+#if TEST_PLATFORM == TEST_OPENCL_PLATFORM
+#include "ReferencePlatform.h"
+#include "OpenCLPlatform.h"
+#endif
+
+#if TEST_PLATFORM == TEST_CUDA_PLATFORM
+#include "ReferencePlatform.h"
+#include "CudaPlatform.h"
+#endif
+    
+
 #ifdef USE_SOFTCORE
 #include "OpenMMFreeEnergy.h"
 #include "openmm/freeEnergyKernels.h"
@@ -2900,23 +2911,32 @@ void runSystemComparisonTest( System& system1, System& system2,
        throw OpenMMException( msg.str() );
    }
 
+#if TEST_PLATFORM == TEST_OPENCL_PLATFORM
+    ReferencePlatform platform1;   
+    OpenCLPlatform platform2;
+#elif TEST_PLATFORM == TEST_CUDA_PLATFORM
+    ReferencePlatform platform1;   
+    CudaPlatform platform2;
+#else
    Platform& platform1 = Platform::getPlatformByName( platformName1 );
    if( deviceId1 ){
        setDeviceId( platform1, deviceId1, log );
    }
    setDeviceIdUsingEnvVariable( platform1, log );
-    Context context1( system1, integrator1, platform1 );
-    context1.setPositions(positions);
-    State state1 = context1.getState(State::Forces | State::Energy);

    Platform& platform2 = Platform::getPlatformByName( platformName2 );
    if( deviceId2 ){
        setDeviceId( platform2, deviceId2, log );
    }
    setDeviceIdUsingEnvVariable( platform2, log );
+#endif
+
+    Context context1( system1, integrator1, platform1 );
+    context1.setPositions(positions);
+    State state1 = context1.getState(State::Forces | State::Energy);
+
    Context context2( system2, integrator2, platform2 );
    context2.setPositions(positions);
-
    State state2 = context2.getState(State::Forces | State::Energy);

    double energyDiff = 0.0;
@@ -3419,10 +3439,19 @@ int main() {
            }
        }

+        // if TEST_PLATFORM is not set, then check that required libs are available
+     
+#if TEST_PLATFORM != TEST_OPENCL_PLATFORM && TEST_PLATFORM != TEST_CUDA_PLATFORM
        envVariableIsSet = checkRequiredLibsAreAvailable( requiredLibs, loadedLibs, log );
        if( envVariableIsSet == 0 && log ){
            (void) fprintf( log, "Aborting tests due to missing libs.\n" );
        }
+#else
+
+        // unit test path: force tests to run
+        
+        envVariableIsSet = 1;
+#endif

        if( platformId2s.size() > 0 ){
            generativeArgumentMaps["platformId2"] = platformId2s;

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
-//-----------------------------------------------------------------------------------------
-
-//-----------------------------------------------------------------------------------------
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */

 #include "amoebaGpuTypes.h"
 #include "amoebaCudaKernels.h"
 #include "kCalculateAmoebaCudaUtilities.h"

-//#define AMOEBA_DEBUG
-
 static __constant__ cudaGmxSimulation cSim;
 static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;

@@ -591,8 +611,7 @@ __device__ void calculateElectrostaticPairIxn_kernel( ElectrostaticParticle& ato

 // reduce psWorkArray_3_1 -> torque

-static void kReduceTorque(amoebaGpuContext amoebaGpu )
-{
+static void kReduceTorque(amoebaGpuContext amoebaGpu ){
    gpuContext gpu = amoebaGpu->gpuContext;
    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
                               gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
@@ -609,21 +628,7 @@ static void kReduceTorque(amoebaGpuContext amoebaGpu )

   --------------------------------------------------------------------------------------- */

-void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueToForce )
-{
-  
-   // ---------------------------------------------------------------------------------------
-
-
-#ifdef AMOEBA_DEBUG
-    static const char* methodName = "cudaComputeAmoebaElectrostatic";
-    static int timestep = 0;
-    std::vector<int> fileId;
-    timestep++;
-    fileId.resize( 2 );
-    fileId[0] = timestep;
-    fileId[1] = 1;
-#endif
+void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueToForce ){
  
   // ---------------------------------------------------------------------------------------

@@ -631,20 +636,6 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueTo

    // apparently debug array can take up nontrivial no. registers

-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log, "%s %d maxCovalentDegreeSz=%d ZZZ\n",
-                        methodName, gpu->natoms, amoebaGpu->maxCovalentDegreeSz );
-    }   
-    static const int maxSlots                 =20;
-    int paddedNumberOfAtoms                   = gpu->sim.paddedNumberOfAtoms;
-    CUDAStream<float4>* debugArray            = new CUDAStream<float4>(maxSlots*paddedNumberOfAtoms, 1, "DebugArray");
-    memset( debugArray->_pSysData,      0, sizeof( float )*4*maxSlots*paddedNumberOfAtoms);
-    debugArray->Upload();
-    //unsigned int targetAtom                   = 1137;
-    unsigned int targetAtom                   = 1;
-#endif
-
    // on first pass, set threads/block

    static unsigned int threadsPerBlock = 0;
@@ -662,20 +653,6 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueTo

    kClearFields_3( amoebaGpu, 1 );

-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaElectrostaticN2Forces warp:  numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u\n",
-                        gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
-                        sizeof(ElectrostaticParticle), sizeof(ElectrostaticParticle)*threadsPerBlock, (*gpu->psInteractionCount)[0], gpu->sim.workUnits ); (void) fflush( amoebaGpu->log );
-    }
-    if (gpu->bOutputBufferPerWarp){
-        kCalculateAmoebaCudaElectrostaticN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>(
-                                                                           gpu->psWorkUnit->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData, debugArray->_pDevData, targetAtom );
-    } else {
-        kCalculateAmoebaCudaElectrostaticN2Forces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>(
-                                                                           gpu->psWorkUnit->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData, debugArray->_pDevData, targetAtom );
-    }
-#else
    if (gpu->bOutputBufferPerWarp){
        kCalculateAmoebaCudaElectrostaticN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>(
                                                                           gpu->psWorkUnit->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData );
@@ -683,66 +660,8 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueTo
        kCalculateAmoebaCudaElectrostaticN2Forces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>(
                                                                           gpu->psWorkUnit->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData );
    }
-#endif
    LAUNCHERROR("kCalculateAmoebaCudaElectrostaticN2Forces");

-#ifdef AMOEBA_DEBUG
-    if( 0 ){
-        debugArray->Download();
-        std::vector<double> conversions;
-        conversions.push_back( 0.1f/4.184f );
-        conversions.push_back( 0.1f/4.184f );
-        unsigned int kkBlocks = 4;
-        (void) fprintf( stderr, "\nTarget atom output %5u\n", targetAtom );
-        for( unsigned int ii = 0; ii < amoebaGpu->gpuContext->sim.paddedNumberOfAtoms; ii++ ){
-            double sum = 0.0;
-            for( unsigned int kk = 0; kk < kkBlocks && sum == 0.0; kk++ ){
-                unsigned int index = ii + kk*amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
-                sum               += debugArray->_pSysData[index].x + debugArray->_pSysData[index].y + debugArray->_pSysData[index].z +  debugArray->_pSysData[index].w;
-            }
-            if( sum > 0.0 ){
-                (void) fprintf( stderr, "%5u", ii );
-                for( unsigned int kk = 0; kk < kkBlocks; kk++ ){
-                    unsigned int index = ii + kk*amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
-                    (void) fprintf( stderr, " %15.7e %15.7e %15.7e %5.1f",
-                                    conversions[kk]*debugArray->_pSysData[index].x, conversions[kk]*debugArray->_pSysData[index].y, conversions[kk]*debugArray->_pSysData[index].z,
-                                    debugArray->_pSysData[index].w );
-                    if( ((kk+1) % 2) == 0 && (kk != (kkBlocks-1) ) ){
-                        (void) fprintf( stderr, "\n%5u", ii );
-                    }
-                }
-                (void) fprintf( stderr, "\n" );
-                if( kkBlocks > 2 ){
-                    (void) fprintf( stderr, "\n" );
-                }
-            }
-        }
-    }
-#endif
-
-#ifdef AMOEBA_DEBUG
-    if( 0 ){ 
-        VectorOfDoubleVectors outputVector;
-
-        std::vector<int> fileId;
-        static int call = 0; 
-        fileId.push_back( call++ );
-
-        int paddedNumberOfAtoms  = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
-        CUDAStream<float>* temp  = new CUDAStream<float>(3*paddedNumberOfAtoms, 1, "ElectrostaticTemp");
-
-        //cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,            outputVector, NULL, 1.0f );
-        reduceAndCopyCUDAStreamFloat4( gpu->psForce4, temp, 1.0 );
-        cudaLoadCudaFloatArray( gpu->natoms,  3, temp, outputVector, NULL, 1.0f/4.184f );
-
-        reduceAndCopyCUDAStreamFloat( amoebaGpu->psWorkArray_3_1, temp, 1.0 );
-        cudaLoadCudaFloatArray( gpu->natoms,  3, temp, outputVector, NULL, 1.0f/4.184f );
-
-        cudaWriteVectorOfDoubleVectorsToFile( "CudaElectrostaticTorque", fileId, outputVector );
-        delete temp;
-    }    
-#endif
-
    if( addTorqueToForce ){
        kReduceTorque( amoebaGpu );
        cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpu, amoebaGpu->psTorque );

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.h
@@ -35,12 +35,7 @@ __launch_bounds__(128, 1)
 __launch_bounds__(64, 1)
 #endif
 void METHOD_NAME(kCalculateAmoebaCudaElectrostatic, Forces_kernel)(
-                            unsigned int* workUnit, float* outputTorque
-
-#ifdef AMOEBA_DEBUG
-                           , float4* debugArray, unsigned int targetAtom
-#endif
-){
+                            unsigned int* workUnit, float* outputTorque){

    extern __shared__ volatile ElectrostaticParticle sA[];


--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.cu
-
-//-----------------------------------------------------------------------------------------
-
-//-----------------------------------------------------------------------------------------
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */

 #include "amoebaCudaKernels.h"
 #include "kCalculateAmoebaCudaUtilities.h"

-//#define AMOEBA_DEBUG
-#undef AMOEBA_DEBUG
-
 static __constant__ cudaGmxSimulation cSim;
 static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;

@@ -31,6 +49,7 @@ void GetCalculateAmoebaCudaFixedEAndGKFieldSim(amoebaGpuContext amoebaGpu)
    status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));         
    RTERROR(status, "GetCalculateAmoebaCudaFixedEAndGKFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
 }
+
 // reduce psWorkArray_3_1 -> E_Field
 // reduce psWorkArray_3_2 -> E_FieldPolar
 // reduce psWorkArray_3_3 -> Gk_FieldPolar
@@ -66,10 +85,6 @@ __device__ void calculateFixedGkFieldPairIxn_kernel( float4 atomCoordinatesI,
                                                     float* labFrameQuadrupoleI,    float* labFrameQuadrupoleJ,
                                                     float  rb2,
                                                     float outputField[2][3]
-#ifdef AMOEBA_DEBUG
-                                          , float4* debugArray
-#endif
-
 ){
  
    float xi,yi,zi;
@@ -325,32 +340,10 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
  
   // ---------------------------------------------------------------------------------------

-#ifdef AMOEBA_DEBUG
-    static const char* methodName = "computeCudaAmoebaFixedEAndGKFields";
-#endif
-
   // ---------------------------------------------------------------------------------------

    gpuContext gpu                             = amoebaGpu->gpuContext;

-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log, "\n%s\n", methodName ); (void) fflush( amoebaGpu->log );
-    }
-    int paddedNumberOfAtoms                    = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
-
-    // N2 debug array
- 
-    int maxSlots                               = 10;
-    CUDAStream<float4>* debugArray             = new CUDAStream<float4>(maxSlots*paddedNumberOfAtoms, 1, "DebugArray");
-    memset( debugArray->_pSysData,      0, sizeof( float )*4*maxSlots*paddedNumberOfAtoms);
-    debugArray->Upload();
-
-    // print intermediate results for the targetAtom 
-
-    unsigned int targetAtom  = 0;
-#endif
-
    // on first pass, set threads/block

    static unsigned int threadsPerBlock        = 0;
@@ -367,133 +360,21 @@ void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )

    kClearFields_3( amoebaGpu, 3 );

-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log, "%s numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n", methodName,
-                        gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
-                        sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*threadsPerBlock, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
-        (void) fflush( amoebaGpu->log );
-    }
-#endif
-
    if (gpu->bOutputBufferPerWarp){
        kCalculateAmoebaFixedEAndGkFieldN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
                                                                           gpu->psWorkUnit->_pDevData,
                                                                           amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psWorkArray_3_2->_pDevData,
-#ifdef AMOEBA_DEBUG
-                                                                           amoebaGpu->psWorkArray_3_3->_pDevData,
-                                                                           debugArray->_pDevData, targetAtom );
-#else
                                                                           amoebaGpu->psWorkArray_3_3->_pDevData );
-#endif

    } else {

        kCalculateAmoebaFixedEAndGkFieldN2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
                                                          gpu->psWorkUnit->_pDevData,
                                                          amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psWorkArray_3_2->_pDevData,
-#ifdef AMOEBA_DEBUG
-                                                          amoebaGpu->psWorkArray_3_3->_pDevData,
-                                                          debugArray->_pDevData, targetAtom );
-#else
                                                          amoebaGpu->psWorkArray_3_3->_pDevData );
-#endif
    }
    LAUNCHERROR("kCalculateAmoebaFixedEAndGkFieldN2_kernel");

    kReduceEAndGkFields( amoebaGpu );
 
-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log ){
-        gpu->psInteractionCount->Download();
-        (void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n",
-                        gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
-                        sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*threadsPerBlock, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
-        (void) fflush( amoebaGpu->log );
-
-        amoebaGpu->psWorkArray_3_1->Download();
-        amoebaGpu->psWorkArray_3_2->Download();
-        amoebaGpu->psWorkArray_3_3->Download();
-
-        amoebaGpu->psE_Field->Download();
-        amoebaGpu->psE_FieldPolar->Download();
-        amoebaGpu->psGk_Field->Download();
-        gpu->psBornRadii->Download();
-
-        (void) fprintf( amoebaGpu->log, "OutE & Gk Fields\n" );
-        int maxPrint        = 32;
-
-        for( int ii = 0; ii < gpu->natoms; ii++ ){
-           (void) fprintf( amoebaGpu->log, "%5d ", ii); 
-
-            int indexOffset     = ii*3;
-
-           // E_Field
-
-           (void) fprintf( amoebaGpu->log,"E[%16.9e %16.9e %16.9e] ",
-                           amoebaGpu->psE_Field->_pSysData[indexOffset],
-                           amoebaGpu->psE_Field->_pSysData[indexOffset+1],
-                           amoebaGpu->psE_Field->_pSysData[indexOffset+2] );
-   
-           // E_Field polar
-
-           (void) fprintf( amoebaGpu->log,"Epol[%16.9e %16.9e %16.9e] ",
-                           amoebaGpu->psE_FieldPolar->_pSysData[indexOffset],
-                           amoebaGpu->psE_FieldPolar->_pSysData[indexOffset+1],
-                           amoebaGpu->psE_FieldPolar->_pSysData[indexOffset+2] );
-
-           // Gk_Field polar
-
-           (void) fprintf( amoebaGpu->log,"Gk[%16.9e %16.9e %16.9e] ",
-                           amoebaGpu->psGk_Field->_pSysData[indexOffset],
-                           amoebaGpu->psGk_Field->_pSysData[indexOffset+1],
-                           amoebaGpu->psGk_Field->_pSysData[indexOffset+2] );
-
-           (void) fprintf( amoebaGpu->log,"\n" );
-           if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
-                ii = gpu->natoms - maxPrint;
-           }
-        }
-        (void) fflush( amoebaGpu->log );
-        (void) fprintf( amoebaGpu->log, "EFields End\n" );
-        (void) fprintf( amoebaGpu->log, "DebugQ\n" );
-        debugArray->Download();
-        if( 0 ){
-            int ii = targetAtom;
-            (void) fprintf( amoebaGpu->log,"\n" );
-            int paddedNumberOfAtoms                    = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
-            for( int jj = 0; jj < gpu->natoms; jj++ ){
-                int debugIndex = jj; 
-                (void) fprintf( amoebaGpu->log,"%4d %4d Qint [%16.9e %16.9e %16.9e %16.9e] %16.9e ",
-                                   ii, jj, 
-                                   debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
-                                   debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w,
-                                   gpu->psBornRadii->_pSysData[jj] );
-
-                for( int kk = 0; kk < 2; kk++ ){
-                    debugIndex += paddedNumberOfAtoms;
-                    (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e] ",
-                                    debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y, debugArray->_pSysData[debugIndex].z );
-                }
-                (void) fprintf( amoebaGpu->log,"\n" );
-            }
-        }
-
-        // write results to file
-
-        if( 0 ){
-            std::vector<int> fileId;
-            //fileId.push_back( 0 );
-            VectorOfDoubleVectors outputVector;
-            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_Field,      outputVector, NULL, 1.0f );
-            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_FieldPolar, outputVector, NULL, 1.0f);
-            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psGk_Field,     outputVector, NULL, 1.0f);
-            cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,              outputVector, NULL, 1.0f );
-            cudaWriteVectorOfDoubleVectorsToFile( "CudaEAndGkField", fileId, outputVector );
-
-         }
-         delete debugArray;
-    }
-#endif
- 
 }
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.h
@@ -38,15 +38,7 @@ void METHOD_NAME(kCalculateAmoebaFixedEAndGkField, _kernel)(
                            unsigned int* workUnit,
                            float* outputEField,
                            float* outputEFieldPolar,
-                            float* outputGkField
-#ifdef AMOEBA_DEBUG
-                           , float4* debugArray, unsigned int targetAtom
-#endif
-){
-
-#ifdef AMOEBA_DEBUG
-    float4 pullBack[12];
-#endif
+                            float* outputGkField){

    extern __shared__ FixedFieldParticle sA[];

@@ -127,11 +119,7 @@ void METHOD_NAME(kCalculateAmoebaFixedEAndGkField, _kernel)(

                    loadFixedFieldParticleData( &(psA[j]), &jCoord, jDipole, jQuadrupole, &jBornRadius );

-                    calculateFixedEFieldPairIxn_kernel( localParticle, psA[j], ijField
-#ifdef AMOEBA_DEBUG
-                                                , pullBack
-#endif
-                    );
+                    calculateFixedEFieldPairIxn_kernel( localParticle, psA[j], ijField);

                    unsigned int match      = (atomI == (y + j)) ? 1 : 0;

@@ -150,11 +138,7 @@ void METHOD_NAME(kCalculateAmoebaFixedEAndGkField, _kernel)(
                    calculateFixedGkFieldPairIxn_kernel( iCoord,                             jCoord,
                                                         &(labFrameDipole[atomI*3]),         jDipole,
                                                         &(labFrameQuadrupole[atomI*9]),     jQuadrupole,
-                                                         bornRadii[atomI]*jBornRadius,       ijField
-#ifdef AMOEBA_DEBUG
-                                                  , pullBack
-#endif
-                    );
+                                                         bornRadii[atomI]*jBornRadius,       ijField);

                    gkFieldSum[0]          += match ? 0.0f : ijField[0][0];
                    gkFieldSum[1]          += match ? 0.0f : ijField[0][1];
@@ -179,11 +163,7 @@ void METHOD_NAME(kCalculateAmoebaFixedEAndGkField, _kernel)(

                    loadFixedFieldParticleData( &(psA[j]), &jCoord, jDipole, jQuadrupole, &jBornRadius );

-                    calculateFixedEFieldPairIxn_kernel( localParticle, psA[j], ijField
-#ifdef AMOEBA_DEBUG
-                                                , pullBack
-#endif
-                    );
+                    calculateFixedEFieldPairIxn_kernel( localParticle, psA[j], ijField);


                    float dScaleVal;
@@ -206,85 +186,12 @@ void METHOD_NAME(kCalculateAmoebaFixedEAndGkField, _kernel)(
                    eFieldPolarSum[1]      += match ? 0.0f : pScaleVal*ijField[0][1];
                    eFieldPolarSum[2]      += match ? 0.0f : pScaleVal*ijField[0][2];

-//#ifdef AMOEBA_DEBUG
-#if 0
-if( 0 && atomI == targetAtom ){
-            unsigned int index                 = atomI == targetAtom ? (y + j) : atomI;
-            unsigned int pullBackIndex         = 0;
-            unsigned int indexI                = 0;
-            unsigned int indexJ                = indexI ? 0 : 1;
-
-            debugArray[index].x                = (float) atomI;
-            debugArray[index].y                = (float) (y + j);
-            debugArray[index].z                = dScaleVal;
-            debugArray[index].w                = pScaleVal;
-            pullBackIndex                     += 2;
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = pullBack[pullBackIndex].x;
-            debugArray[index].y                = pullBack[pullBackIndex].y;
-            debugArray[index].z                = pullBack[pullBackIndex].z;
-            pullBackIndex++;
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = pullBack[pullBackIndex].x;
-            debugArray[index].y                = pullBack[pullBackIndex].y;
-            debugArray[index].z                = pullBack[pullBackIndex].z;
-            pullBackIndex++;
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = ijField[indexI][0];
-            debugArray[index].y                = ijField[indexI][1];
-            debugArray[index].z                = ijField[indexI][2];
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = ijField[indexJ][0];
-            debugArray[index].y                = ijField[indexJ][1];
-            debugArray[index].z                = ijField[indexJ][2];
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = match ? 0.0f : dScaleVal*ijField[indexI][0];
-            debugArray[index].y                = match ? 0.0f : dScaleVal*ijField[indexI][1];
-            debugArray[index].z                = match ? 0.0f : dScaleVal*ijField[indexI][2];
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = match ? 0.0f : pScaleVal*ijField[indexI][0];
-            debugArray[index].y                = match ? 0.0f : pScaleVal*ijField[indexI][1];
-            debugArray[index].z                = match ? 0.0f : pScaleVal*ijField[indexI][2];
-
-/*
-            index                             += cSim.paddedNumberOfAtoms;
-            unsigned int mask                  = 1 << j;
-            unsigned int pScaleIndex           = (scaleMask.x & mask) ? 1 : 0;
-            pScaleIndex                       += (scaleMask.y & mask) ? 2 : 0;
-            debugArray[index].x                = (float) pScaleIndex;
-
-            debugArray[index].y                = scaleMask.x & mask ? 1.0f : -1.0f;
-            debugArray[index].z                = scaleMask.y & mask ? 1.0f : -1.0f;
-            debugArray[index].w                = pScaleVal + 10.0f;
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = jCoord.x;
-            debugArray[index].y                = jCoord.y;
-            debugArray[index].z                = jCoord.z;
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = iCoord.x;
-            debugArray[index].y                = iCoord.y;
-            debugArray[index].z                = iCoord.z;
-*/
-
-}
-#endif
                    // GK field

                    calculateFixedGkFieldPairIxn_kernel( iCoord,                                 jCoord,
                                                         &(labFrameDipole[atomI*3]),             jDipole,
                                                         &(labFrameQuadrupole[atomI*9]),         jQuadrupole,
-                                                         bornRadii[atomI]*jBornRadius,           ijField
-#ifdef AMOEBA_DEBUG
-                                                , pullBack
-#endif
-                    );
+                                                         bornRadii[atomI]*jBornRadius,           ijField);

                    match                   = (atomI >= cSim.atoms) || ((y+tj) >= cSim.atoms) ? 1 : 0;
                    gkFieldSum[0]          += match ? 0.0f : ijField[0][0];
@@ -292,45 +199,6 @@ if( 0 && atomI == targetAtom ){
                    gkFieldSum[2]          += match ? 0.0f : ijField[0][2];


-//#ifdef AMOEBA_DEBUG
-#if 0
-if( atomI == targetAtom ){
-            unsigned int index                 = atomI == targetAtom ? (y + j) : atomI;
-            ///unsigned int pullBackIndex         = 0;
-            unsigned int indexI                = 0;
-            unsigned int indexJ                = indexI ? 0 : 1;
-
-            debugArray[index].x                = (float) atomI;
-            debugArray[index].y                = (float) (y + j);
-            debugArray[index].z                = bornRadii[atomI];
-            debugArray[index].w                = jBornRadius;
-
-/*
-            pullBackIndex                     += 2;
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = pullBack[pullBackIndex].x;
-            debugArray[index].y                = pullBack[pullBackIndex].y;
-            debugArray[index].z                = pullBack[pullBackIndex].z;
-            pullBackIndex++;
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = pullBack[pullBackIndex].x;
-            debugArray[index].y                = pullBack[pullBackIndex].y;
-            debugArray[index].z                = pullBack[pullBackIndex].z;
-            pullBackIndex++;
-*/
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                =  match ? 0.0f :  ijField[indexI][0];
-            debugArray[index].y                =  match ? 0.0f :  ijField[indexI][1];
-            debugArray[index].z                =  match ? 0.0f :  ijField[indexI][2];
-
-            index                             +=  match ? 0.0f :  cSim.paddedNumberOfAtoms;
-            debugArray[index].x                =  match ? 0.0f :  ijField[indexJ][0];
-            debugArray[index].y                =  match ? 0.0f :  ijField[indexJ][1];
-            debugArray[index].z                =  match ? 0.0f :  ijField[indexJ][2];
-
-}
-#endif
                }
            }

@@ -377,11 +245,7 @@ if( atomI == targetAtom ){
    
                    loadFixedFieldParticleData( &(psA[tj]),  &jCoord, jDipole, jQuadrupole, &jBornRadius );

-                    calculateFixedEFieldPairIxn_kernel( localParticle, psA[tj], ijField
-#ifdef AMOEBA_DEBUG
-                                                 , pullBack
-#endif
-                    );
+                    calculateFixedEFieldPairIxn_kernel( localParticle, psA[tj], ijField);
    
                    // add to field at atomI the field due atomJ's charge/dipole/quadrupole

@@ -403,84 +267,12 @@ if( atomI == targetAtom ){
                    psA[tj].eFieldP[1] += ijField[1][1];
                    psA[tj].eFieldP[2] += ijField[1][2];

-// #ifdef AMOEBA_DEBUG
-#if 0
-if( 0 && (atomI == targetAtom  || (y + tj) == targetAtom) ){
-            unsigned int index                 = (atomI == targetAtom) ? (y + tj) : atomI;
-            unsigned int indexI                = (atomI == targetAtom) ? 0 : 1;
-            unsigned int indexJ                = (atomI == targetAtom) ? 1 : 0;
-            //unsigned int pullBackIndex         = 0;
-
-            debugArray[index].x                = (float) atomI;
-            debugArray[index].y                = (float) (y + tj);
-/*
-            debugArray[index].z                = pullBack[pullBackIndex++];
-            debugArray[index].w                = pullBack[pullBackIndex++];
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = pullBack[pullBackIndex++];
-            debugArray[index].y                = pullBack[pullBackIndex++];
-            debugArray[index].z                = pullBack[pullBackIndex++];
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = pullBack[pullBackIndex++];
-            debugArray[index].y                = pullBack[pullBackIndex++];
-            debugArray[index].z                = pullBack[pullBackIndex++];
-*/
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = ijField[indexI][0];
-            debugArray[index].y                = ijField[indexI][1];
-            debugArray[index].z                = ijField[indexI][2];
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = ijField[indexJ][0];
-            debugArray[index].y                = ijField[indexJ][1];
-            debugArray[index].z                = ijField[indexJ][2];
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = ijField[indexI][0];
-            debugArray[index].y                = ijField[indexI][1];
-            debugArray[index].z                = ijField[indexI][2];
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = ijField[indexI][0];
-            debugArray[index].y                = ijField[indexI][1];
-            debugArray[index].z                = ijField[indexI][2];
-
-#if 0
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = jCoord.x;
-            debugArray[index].y                = jCoord.y;
-            debugArray[index].z                = jCoord.z;
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = iCoord.x;
-            debugArray[index].y                = iCoord.y;
-            debugArray[index].z                = iCoord.z;
-
-            index                             += cSim.paddedNumberOfAtoms;
-            unsigned int mask                  = 1 << j;
-            unsigned int pScaleIndex           = (scaleMask.x & mask) ? 1 : 0;
-            pScaleIndex                       += (scaleMask.y & mask) ? 2 : 0;
-            debugArray[index].x                = (float) pScaleIndex;
-
-            debugArray[index].y                = scaleMask.x & mask ? 1.0f : -1.0f;
-            debugArray[index].z                = scaleMask.y & mask ? 1.0f : -1.0f;
-            debugArray[index].w                = pScaleVal + 10.0f;
-#endif
-}
-#endif
                    // Gk field

                    calculateFixedGkFieldPairIxn_kernel( iCoord,                                          jCoord,
                                                         &(labFrameDipole[atomI*3]),                      jDipole,
                                                         &(labFrameQuadrupole[atomI*9]),                  jQuadrupole,
-                                                         bornRadii[atomI]*jBornRadius,                    ijField
-#ifdef AMOEBA_DEBUG
-                                                , pullBack
-#endif
-                    );
+                                                         bornRadii[atomI]*jBornRadius,                    ijField);

                    gkFieldSum[0]              += ijField[0][0];
                    gkFieldSum[1]              += ijField[0][1];
@@ -491,41 +283,6 @@ if( 0 && (atomI == targetAtom  || (y + tj) == targetAtom) ){
                    psA[tj].gkField[2]         += ijField[1][2];


-//#ifdef AMOEBA_DEBUG
-#if 0
-if( (atomI == targetAtom  || (y + tj) == targetAtom) ){
-            unsigned int index                 = (atomI == targetAtom) ? (y + tj) : atomI;
-            unsigned int indexI                = (atomI == targetAtom) ? 0 : 1;
-            unsigned int indexJ                = (atomI == targetAtom) ? 1 : 0;
-            //unsigned int pullBackIndex         = 0;
-
-            debugArray[index].x                = (float) atomI;
-            debugArray[index].y                = (float) (y + tj);
-            debugArray[index].z                = bornRadii[atomI];
-            debugArray[index].w                = jBornRadius;
-/*
-            debugArray[index].z                = pullBack[pullBackIndex++];
-            debugArray[index].w                = pullBack[pullBackIndex++];
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = pullBack[pullBackIndex++];
-            debugArray[index].y                = pullBack[pullBackIndex++];
-            debugArray[index].z                = pullBack[pullBackIndex++];
-
-*/
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = ijField[indexI][0];
-            debugArray[index].y                = ijField[indexI][1];
-            debugArray[index].z                = ijField[indexI][2];
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = ijField[indexJ][0];
-            debugArray[index].y                = ijField[indexJ][1];
-            debugArray[index].z                = ijField[indexJ][2];
-
-}
-#endif

                    tj                  = (tj + 1) & (GRID - 1);

@@ -548,11 +305,7 @@ if( (atomI == targetAtom  || (y + tj) == targetAtom) ){

                    loadFixedFieldParticleData( &(psA[tj]),  &jCoord, jDipole, jQuadrupole, &jBornRadius );
 
-                    calculateFixedEFieldPairIxn_kernel( localParticle, psA[tj], ijField
-#ifdef AMOEBA_DEBUG
-                                                , pullBack
-#endif
-                    );
+                    calculateFixedEFieldPairIxn_kernel( localParticle, psA[tj], ijField);

                    float dScaleVal;
                    float pScaleVal;
@@ -579,85 +332,12 @@ if( (atomI == targetAtom  || (y + tj) == targetAtom) ){
                    psA[tj].eFieldP[1]   += pScaleVal*ijField[1][1];
                    psA[tj].eFieldP[2]   += pScaleVal*ijField[1][2];

-//#ifdef AMOEBA_DEBUG
-#if 0
-if( 0 && (atomI == targetAtom || (y + tj) == targetAtom) ){
-            unsigned int index                 = (atomI == targetAtom) ? (y + tj) : atomI;
-//            unsigned int indexI                = (atomI == targetAtom) ? 0 : 1;
-//            unsigned int indexJ                = (atomI == targetAtom) ? 1 : 0;
-//            unsigned int pullBackIndex         = 0;
-
-            debugArray[index].x                = (float) atomI;
-            debugArray[index].y                = (float) (y + tj);
-            debugArray[index].z                = dScaleVal;
-            debugArray[index].w                = pScaleVal;
-//            pullBackIndex                     += 2;
-
-/*
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = pullBack[pullBackIndex++];
-            debugArray[index].y                = pullBack[pullBackIndex++];
-            debugArray[index].z                = pullBack[pullBackIndex++];
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = pullBack[pullBackIndex++];
-            debugArray[index].y                = pullBack[pullBackIndex++];
-            debugArray[index].z                = pullBack[pullBackIndex++];
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = ijField[indexI][0];
-            debugArray[index].y                = ijField[indexI][1];
-            debugArray[index].z                = ijField[indexI][2];
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = ijField[indexJ][0];
-            debugArray[index].y                = ijField[indexJ][1];
-            debugArray[index].z                = ijField[indexJ][2];
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = dScaleVal*ijField[indexI][0];
-            debugArray[index].y                = dScaleVal*ijField[indexI][1];
-            debugArray[index].z                = dScaleVal*ijField[indexI][2];
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = pScaleVal*ijField[indexI][0];
-            debugArray[index].y                = pScaleVal*ijField[indexI][1];
-            debugArray[index].z                = pScaleVal*ijField[indexI][2];
-*/
-/*
-            index                             += cSim.paddedNumberOfAtoms;
-            unsigned int mask                  = 1 << j;
-            unsigned int pScaleIndex           = (scaleMask.x & mask) ? 1 : 0;
-            pScaleIndex                       += (scaleMask.y & mask) ? 2 : 0;
-            debugArray[index].x                = (float) pScaleIndex;
-
-            debugArray[index].y                = scaleMask.x & mask ? 1.0f : -1.0f;
-            debugArray[index].z                = scaleMask.y & mask ? 1.0f : -1.0f;
-            debugArray[index].w                = pScaleVal + 10.0f;
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = jCoord.x;
-            debugArray[index].y                = jCoord.y;
-            debugArray[index].z                = jCoord.z;
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = iCoord.x;
-            debugArray[index].y                = iCoord.y;
-            debugArray[index].z                = iCoord.z;
-*/
-
-}
-#endif
                    // GK field

                    calculateFixedGkFieldPairIxn_kernel( iCoord,                             jCoord,
                                                         &(labFrameDipole[atomI*3]),         jDipole,
                                                         &(labFrameQuadrupole[atomI*9]),     jQuadrupole,
-                                                         bornRadii[atomI]*jBornRadius,       ijField
-#ifdef AMOEBA_DEBUG
-                                                , pullBack
-#endif
-                    );
+                                                         bornRadii[atomI]*jBornRadius,       ijField);

                    if( (atomI < cSim.atoms) && ((y+tj) < cSim.atoms) ){
                        gkFieldSum[0]        += ijField[0][0];
@@ -668,61 +348,6 @@ if( 0 && (atomI == targetAtom || (y + tj) == targetAtom) ){
                        psA[tj].gkField[1]   += ijField[1][1];
                        psA[tj].gkField[2]   += ijField[1][2];
                    }
-
-
-//#ifdef AMOEBA_DEBUG
-#if 0
-if( (atomI == targetAtom || (y + tj) == targetAtom) ){
-            unsigned int index                 = (atomI == targetAtom) ? (y + tj) : atomI;
-            unsigned int indexI                = (atomI == targetAtom) ? 0 : 1;
-            unsigned int indexJ                = (atomI == targetAtom) ? 1 : 0;
-//            unsigned int pullBackIndex         = 0;
-
-            debugArray[index].x                = (float) atomI;
-            debugArray[index].y                = (float) (y + tj);
-            debugArray[index].z                = bornRadii[atomI];
-            debugArray[index].w                = jBornRadius;
-//            pullBackIndex                     += 2;
-
-/*
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = pullBack[pullBackIndex++];
-            debugArray[index].y                = pullBack[pullBackIndex++];
-            debugArray[index].z                = pullBack[pullBackIndex++];
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = pullBack[pullBackIndex++];
-            debugArray[index].y                = pullBack[pullBackIndex++];
-            debugArray[index].z                = pullBack[pullBackIndex++];
-
-*/
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = (atomI == targetAtom || (y + tj) == targetAtom) ?  ijField[indexI][0] : 0.0f;
-            debugArray[index].y                = (atomI == targetAtom || (y + tj) == targetAtom) ?  ijField[indexI][1] : 0.0f;
-            debugArray[index].z                = (atomI == targetAtom || (y + tj) == targetAtom) ?  ijField[indexI][2] : 0.0f;
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = (atomI == targetAtom || (y + tj) == targetAtom) ?  ijField[indexJ][0] : 0.0f;
-            debugArray[index].y                = (atomI == targetAtom || (y + tj) == targetAtom) ?  ijField[indexJ][1] : 0.0f;
-            debugArray[index].z                = (atomI == targetAtom || (y + tj) == targetAtom) ?  ijField[indexJ][2] : 0.0f;
-
-/*
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = jCoord.x;
-            debugArray[index].y                = jCoord.y;
-            debugArray[index].z                = jCoord.z;
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = iCoord.x;
-            debugArray[index].y                = iCoord.y;
-            debugArray[index].z                = iCoord.z;
-*/
-
-}
-#endif
-
-
-
                    tj                  = (tj + 1) & (GRID - 1);
                }
            }

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEField.cu
-
-//-----------------------------------------------------------------------------------------
-
-//-----------------------------------------------------------------------------------------
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */

 #include "amoebaCudaKernels.h"
 #include "kCalculateAmoebaCudaUtilities.h"

-//#define AMOEBA_DEBUG
-
 static __constant__ cudaGmxSimulation cSim;
 static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;

@@ -76,24 +95,6 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
  
    gpuContext gpu    = amoebaGpu->gpuContext;

-#ifdef AMOEBA_DEBUG
-    static const char* methodName = "computeCudaAmoebaFixedEField";
-    if( amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log, "\n%s\n", methodName ); (void) fflush( amoebaGpu->log );
-    }
-    int paddedNumberOfAtoms                    = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
-
-    // N2 debug array
-
-    CUDAStream<float4>* debugArray             = new CUDAStream<float4>(10*paddedNumberOfAtoms, 1, "DebugArray");
-    memset( debugArray->_pSysData,      0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
-    debugArray->Upload();
-
-    // print intermediate results for the targetAtom 
-
-    unsigned int targetAtom  = 3;
-#endif
-
    kClearFields_3( amoebaGpu, 2 );

    static unsigned int threadsPerBlock = 0;
@@ -108,15 +109,6 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle), gpu->sharedMemoryPerBlock ), maxThreads);
    }

-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log, "cudaComputeAmoebaFixedEField numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%lu ixnCt=%lu workUnits=%lu\n",
-                        gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
-                        sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*threadsPerBlock, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
-        (void) fflush( amoebaGpu->log );
-    }
-#endif
-
    if (gpu->bOutputBufferPerWarp){
        kCalculateAmoebaFixedE_FieldN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
                                                                           gpu->psWorkUnit->_pDevData,
@@ -131,78 +123,5 @@ void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
    }

    LAUNCHERROR("kCalculateAmoebaFixedE_FieldN2Forces_kernel");
-
-#if 0
-        for( unsigned int ii = 0; ii < gpu->sim.outputBuffers; ii++ ){
-            //float index = 1.0f;
-            float index = (float) ii;
-            for( unsigned int jj = 0; jj < 3*amoebaGpu->paddedNumberOfAtoms; jj += 3 ){
-                unsigned int kk = 3*ii*amoebaGpu->paddedNumberOfAtoms + jj;
-                amoebaGpu->psWorkArray_3_1->_pSysData[kk]   = index;
-                amoebaGpu->psWorkArray_3_1->_pSysData[kk+1] = index;
-                amoebaGpu->psWorkArray_3_1->_pSysData[kk+2] = index;
-            }
-        }
-        amoebaGpu->psWorkArray_3_1->Upload();
-#endif
-
    kReduceE_Fields_kernel( amoebaGpu );
-
-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log ){
-        gpu->psInteractionCount->Download();
-        (void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%lu ixnCt=%lu workUnits=%lu\n",
-                        gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
-                        sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*threadsPerBlock, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
-        (void) fflush( amoebaGpu->log );
-        amoebaGpu->psWorkArray_3_1->Download();
-        amoebaGpu->psWorkArray_3_2->Download();
-        amoebaGpu->psE_Field->Download();
-        amoebaGpu->psE_FieldPolar->Download();
-        (void) fprintf( amoebaGpu->log, "OutEFields\n" );
-        int maxPrint        = 32;
-        for( int ii = 0; ii < gpu->natoms; ii++ ){
-           (void) fprintf( amoebaGpu->log, "%5d ", ii); 
-
-            int indexOffset     = ii*3;
-
-           // E_Field
-
-           (void) fprintf( amoebaGpu->log,"E[%16.9e %16.9e %16.9e] ",
-                           amoebaGpu->psE_Field->_pSysData[indexOffset],
-                           amoebaGpu->psE_Field->_pSysData[indexOffset+1],
-                           amoebaGpu->psE_Field->_pSysData[indexOffset+2] );
-   
-           // E_Field polar
-
-           (void) fprintf( amoebaGpu->log,"Epol[%16.9e %16.9e %16.9e] ",
-                           amoebaGpu->psE_FieldPolar->_pSysData[indexOffset],
-                           amoebaGpu->psE_FieldPolar->_pSysData[indexOffset+1],
-                           amoebaGpu->psE_FieldPolar->_pSysData[indexOffset+2] );
-
-           (void) fprintf( amoebaGpu->log,"\n" );
-           if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
-                ii = gpu->natoms - maxPrint;
-           }
-        }
-        (void) fflush( amoebaGpu->log );
-
-        (void) fprintf( amoebaGpu->log, "EFields End\n" );
-
-        // write results to file
-
-        if( 0 ){
-            std::vector<int> fileId;
-            //fileId.push_back( 0 );
-            VectorOfDoubleVectors outputVector;
-            //cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,              outputVector, NULL, 1.0f );
-            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_Field,      outputVector, NULL, 1.0f );
-            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_FieldPolar, outputVector, NULL, 1.0f);
-            cudaWriteVectorOfDoubleVectorsToFile( "CudaEField", fileId, outputVector );
-
-         }
-         //delete debugArray;
-    }
-#endif
-
 }
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEField.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedEField.h
@@ -37,15 +37,7 @@ __launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
 void METHOD_NAME(kCalculateAmoebaFixedE_Field, Forces_kernel)(
                            unsigned int* workUnit,
                            float* outputEField,
-                            float* outputEFieldPolar
-#ifdef AMOEBA_DEBUG
-                           , float4* debugArray, unsigned int targetAtom
-#endif
-){
-
-#ifdef AMOEBA_DEBUG
-    float4 pullBack[12];
-#endif
+                            float* outputEFieldPolar){

    extern __shared__ FixedFieldParticle sA[];

@@ -106,11 +98,7 @@ void METHOD_NAME(kCalculateAmoebaFixedE_Field, Forces_kernel)(

                    float ijField[2][3];

-                    calculateFixedEFieldPairIxn_kernel( localParticle, psA[j], ijField
-#ifdef AMOEBA_DEBUG
-                                                , pullBack
-#endif
-                    );
+                    calculateFixedEFieldPairIxn_kernel( localParticle, psA[j], ijField);

                    unsigned int match      = (atomI == (y + j)) ? 1 : 0;

@@ -142,11 +130,7 @@ void METHOD_NAME(kCalculateAmoebaFixedE_Field, Forces_kernel)(

                    //loadFixedFieldParticleData( &(psA[j]), &jCoord, jDipole, jQuadrupole );

-                    calculateFixedEFieldPairIxn_kernel( localParticle, psA[j], ijField
-#ifdef AMOEBA_DEBUG
-                                                , pullBack
-#endif
-                    );
+                    calculateFixedEFieldPairIxn_kernel( localParticle, psA[j], ijField);

                    float dScaleVal;
                    float pScaleVal;
@@ -168,73 +152,6 @@ void METHOD_NAME(kCalculateAmoebaFixedE_Field, Forces_kernel)(
                    fieldPolarSum[1]       += match ? 0.0f : pScaleVal*ijField[0][1];
                    fieldPolarSum[2]       += match ? 0.0f : pScaleVal*ijField[0][2];

-#ifdef AMOEBA_DEBUG
-if( 0 && atomI == targetAtom ){
-            unsigned int index                 = atomI == targetAtom ? (y + j) : atomI;
-//            unsigned int pullBackIndex         = 0;
-            unsigned int indexI                = 0;
-            unsigned int indexJ                = indexI ? 0 : 1;
-
-            debugArray[index].x                = (float) atomI;
-            debugArray[index].y                = (float) (y + j);
-            debugArray[index].z                = dScaleVal;
-            debugArray[index].w                = pScaleVal;
-/*
-            pullBackIndex                     += 2;
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = pullBack[pullBackIndex++];
-            debugArray[index].y                = pullBack[pullBackIndex++];
-            debugArray[index].z                = pullBack[pullBackIndex++];
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = pullBack[pullBackIndex++];
-            debugArray[index].y                = pullBack[pullBackIndex++];
-            debugArray[index].z                = pullBack[pullBackIndex++];
-*/
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = ijField[indexI][0];
-            debugArray[index].y                = ijField[indexI][1];
-            debugArray[index].z                = ijField[indexI][2];
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = ijField[indexJ][0];
-            debugArray[index].y                = ijField[indexJ][1];
-            debugArray[index].z                = ijField[indexJ][2];
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = match ? 0.0f : dScaleVal*ijField[indexI][0];
-            debugArray[index].y                = match ? 0.0f : dScaleVal*ijField[indexI][1];
-            debugArray[index].z                = match ? 0.0f : dScaleVal*ijField[indexI][2];
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = match ? 0.0f : pScaleVal*ijField[indexI][0];
-            debugArray[index].y                = match ? 0.0f : pScaleVal*ijField[indexI][1];
-            debugArray[index].z                = match ? 0.0f : pScaleVal*ijField[indexI][2];
-/*
-            index                             += cSim.paddedNumberOfAtoms;
-            unsigned int mask                  = 1 << j;
-            unsigned int pScaleIndex           = (scaleMask.x & mask) ? 1 : 0;
-            pScaleIndex                       += (scaleMask.y & mask) ? 2 : 0;
-            debugArray[index].x                = (float) pScaleIndex;
-
-            debugArray[index].y                = scaleMask.x & mask ? 1.0f : -1.0f;
-            debugArray[index].z                = scaleMask.y & mask ? 1.0f : -1.0f;
-            debugArray[index].w                = pScaleVal + 10.0f;
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = jCoord.x;
-            debugArray[index].y                = jCoord.y;
-            debugArray[index].z                = jCoord.z;
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = iCoord.x;
-            debugArray[index].y                = iCoord.y;
-            debugArray[index].z                = iCoord.z;
-*/
-
-}
-#endif
                }
            }

@@ -274,11 +191,7 @@ if( 0 && atomI == targetAtom ){

                    float ijField[2][3];
    
-                    calculateFixedEFieldPairIxn_kernel( localParticle, psA[tj], ijField
-#ifdef AMOEBA_DEBUG
-                                                 , pullBack
-#endif
-                    );
+                    calculateFixedEFieldPairIxn_kernel( localParticle, psA[tj], ijField);
    
                    // add to field at atomI the field due atomJ's charge/dipole/quadrupole

@@ -300,74 +213,6 @@ if( 0 && atomI == targetAtom ){
                    psA[tj].eFieldP[1] += ijField[1][1];
                    psA[tj].eFieldP[2] += ijField[1][2];

-
-#ifdef AMOEBA_DEBUG
-if( 0 && (atomI == targetAtom  || (y + tj) == targetAtom) ){
-            unsigned int index                 = (atomI == targetAtom) ? (y + tj) : atomI;
-            unsigned int indexI                = (atomI == targetAtom) ? 0 : 1;
-            unsigned int indexJ                = (atomI == targetAtom) ? 1 : 0;
-//            unsigned int pullBackIndex         = 0;
-
-            debugArray[index].x                = (float) atomI;
-            debugArray[index].y                = (float) (y + tj);
-/*
-            debugArray[index].z                = pullBack[pullBackIndex++];
-            debugArray[index].w                = pullBack[pullBackIndex++];
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = pullBack[pullBackIndex++];
-            debugArray[index].y                = pullBack[pullBackIndex++];
-            debugArray[index].z                = pullBack[pullBackIndex++];
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = pullBack[pullBackIndex++];
-            debugArray[index].y                = pullBack[pullBackIndex++];
-            debugArray[index].z                = pullBack[pullBackIndex++];
-*/
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = ijField[indexI][0];
-            debugArray[index].y                = ijField[indexI][1];
-            debugArray[index].z                = ijField[indexI][2];
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = ijField[indexJ][0];
-            debugArray[index].y                = ijField[indexJ][1];
-            debugArray[index].z                = ijField[indexJ][2];
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = ijField[indexI][0];
-            debugArray[index].y                = ijField[indexI][1];
-            debugArray[index].z                = ijField[indexI][2];
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = ijField[indexI][0];
-            debugArray[index].y                = ijField[indexI][1];
-            debugArray[index].z                = ijField[indexI][2];
-
-#if 0
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = jCoord.x;
-            debugArray[index].y                = jCoord.y;
-            debugArray[index].z                = jCoord.z;
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = iCoord.x;
-            debugArray[index].y                = iCoord.y;
-            debugArray[index].z                = iCoord.z;
-
-            index                             += cSim.paddedNumberOfAtoms;
-            unsigned int mask                  = 1 << j;
-            unsigned int pScaleIndex           = (scaleMask.x & mask) ? 1 : 0;
-            pScaleIndex                       += (scaleMask.y & mask) ? 2 : 0;
-            debugArray[index].x                = (float) pScaleIndex;
-
-            debugArray[index].y                = scaleMask.x & mask ? 1.0f : -1.0f;
-            debugArray[index].z                = scaleMask.y & mask ? 1.0f : -1.0f;
-            debugArray[index].w                = pScaleVal + 10.0f;
-#endif
-}
-#endif
-
                    tj                  = (tj + 1) & (GRID - 1);

                }
@@ -388,11 +233,7 @@ if( 0 && (atomI == targetAtom  || (y + tj) == targetAtom) ){

                    float ijField[2][3];

-                    calculateFixedEFieldPairIxn_kernel( localParticle, psA[tj], ijField
-#ifdef AMOEBA_DEBUG
-                                                , pullBack
-#endif
-                    );
+                    calculateFixedEFieldPairIxn_kernel( localParticle, psA[tj], ijField);

                    float dScaleVal;
                    float pScaleVal;
@@ -419,79 +260,6 @@ if( 0 && (atomI == targetAtom  || (y + tj) == targetAtom) ){
                    psA[tj].eFieldP[1] += pScaleVal*ijField[1][1];
                    psA[tj].eFieldP[2] += pScaleVal*ijField[1][2];

-
-#ifdef AMOEBA_DEBUG
-if( 0 && (atomI == targetAtom || (y + tj) == targetAtom) ){
-            unsigned int index                 = (atomI == targetAtom) ? (y + tj) : atomI;
-            unsigned int indexI                = (atomI == targetAtom) ? 0 : 1;
-            unsigned int indexJ                = (atomI == targetAtom) ? 1 : 0;
-//            unsigned int pullBackIndex         = 0;
-
-            debugArray[index].x                = (float) atomI;
-            debugArray[index].y                = (float) (y + tj);
-            debugArray[index].z                = dScaleVal;
-            debugArray[index].w                = pScaleVal;
-/*
-            pullBackIndex                     += 2;
-
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = pullBack[pullBackIndex++];
-            debugArray[index].y                = pullBack[pullBackIndex++];
-            debugArray[index].z                = pullBack[pullBackIndex++];
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = pullBack[pullBackIndex++];
-            debugArray[index].y                = pullBack[pullBackIndex++];
-            debugArray[index].z                = pullBack[pullBackIndex++];
-*/
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = ijField[indexI][0];
-            debugArray[index].y                = ijField[indexI][1];
-            debugArray[index].z                = ijField[indexI][2];
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = ijField[indexJ][0];
-            debugArray[index].y                = ijField[indexJ][1];
-            debugArray[index].z                = ijField[indexJ][2];
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = dScaleVal*ijField[indexI][0];
-            debugArray[index].y                = dScaleVal*ijField[indexI][1];
-            debugArray[index].z                = dScaleVal*ijField[indexI][2];
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = pScaleVal*ijField[indexI][0];
-            debugArray[index].y                = pScaleVal*ijField[indexI][1];
-            debugArray[index].z                = pScaleVal*ijField[indexI][2];
-/*
-            index                             += cSim.paddedNumberOfAtoms;
-            unsigned int mask                  = 1 << j;
-            unsigned int pScaleIndex           = (scaleMask.x & mask) ? 1 : 0;
-            pScaleIndex                       += (scaleMask.y & mask) ? 2 : 0;
-            debugArray[index].x                = (float) pScaleIndex;
-
-            debugArray[index].y                = scaleMask.x & mask ? 1.0f : -1.0f;
-            debugArray[index].z                = scaleMask.y & mask ? 1.0f : -1.0f;
-            debugArray[index].w                = pScaleVal + 10.0f;
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = jCoord.x;
-            debugArray[index].y                = jCoord.y;
-            debugArray[index].z                = jCoord.z;
-
-            index                             += cSim.paddedNumberOfAtoms;
-            debugArray[index].x                = iCoord.x;
-            debugArray[index].y                = iCoord.y;
-            debugArray[index].z                = iCoord.z;
-*/
-
-}
-#endif
-
-
-
                    tj                  = (tj + 1) & (GRID - 1);
                }
            }

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedFieldParticle.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaFixedFieldParticle.h
@@ -151,11 +151,7 @@ __device__ static void zeroFixedFieldParticleSharedField( struct FixedFieldParti
 // body of fixed E-field calculation

 __device__ static void calculateFixedEFieldPairIxn_kernel( FixedFieldParticle& atomI, FixedFieldParticle& atomJ,
-                                                           float field[2][3]
-#ifdef AMOEBA_DEBUG
-                                                    , float4 debugArray[12]
-#endif
-)
+                                                           float field[2][3])
 {

 

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaGrycuk.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaGrycuk.cu
-//----------------------------------------------------------------------------------------- 
-//----------------------------------------------------------------------------------------- 
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */

 #include "cudaKernels.h"
 #include "amoebaCudaKernels.h"

-//#define AMOEBA_DEBUG
-#undef AMOEBA_DEBUG
-
 static __constant__ cudaGmxSimulation cSim;
 static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;

@@ -204,44 +224,12 @@ void kReduceGrycukGbsaBornSum( amoebaGpuContext amoebaGpu )
 void kCalculateAmoebaGrycukBornRadii( amoebaGpuContext amoebaGpu )
 {

-#ifdef AMOEBA_DEBUG
-    static const char* methodName       = "kCalculateAmoebaGrycukBornRadii";
-    static int timestep = 0;
-    std::vector<int> fileId;
-    timestep++;
-    fileId.resize( 2 );
-    fileId[0] = timestep;
-    fileId[1] = 1;
-#endif
-
    // ---------------------------------------------------------------------------------------

    gpuContext gpu = amoebaGpu->gpuContext;

    // apparently debug array can take up nontrivial no. registers

-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log, "%s %d maxCovalentDegreeSz=%d ZZZ\n",
-                        methodName, gpu->natoms, amoebaGpu->maxCovalentDegreeSz );
-                        amoebaGpu->scalingDistanceCutoff );
-    }
-    int paddedNumberOfAtoms                   = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
-    CUDAStream<float4>* debugArray            = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
-    memset( debugArray->_pSysData,      0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
-    debugArray->Upload();
-    unsigned int targetAtom                   = 0;
-
-    gpu->psBornRadii->Download();
-    if( amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log, "Grycuk input\n" ); (void) fflush( amoebaGpu->log );
-        for( int ii = 0; ii < amoebaGpu->gpuContext->sim.paddedNumberOfAtoms; ii++ ){
-            (void) fprintf( amoebaGpu->log,"Born %6d %16.9e\n", ii,
-                            gpu->psBornRadii->_pSysData[ii] );
-        }
-    }
-#endif
-
    // on first pass, set threads/block and based on that setting the energy buffer array

    static unsigned int threadsPerBlock = 0;
@@ -255,28 +243,7 @@ void kCalculateAmoebaGrycukBornRadii( amoebaGpuContext amoebaGpu )
        else
            maxThreads = 64;
        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(GrycukParticle), gpu->sharedMemoryPerBlock ), maxThreads);
-
-#ifdef AMOEBA_DEBUG
-        if( amoebaGpu->log ){
-            (void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaGrycuk: blcks=%u tds=%u %u bPrWrp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u\n",
-                            gpu->sim.nonbond_blocks, threadsPerBlock, maxThreads, gpu->bOutputBufferPerWarp,
-                            sizeof(GrycukParticle), sizeof(GrycukParticle)*threadsPerBlock,
-                            (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
-            (void) fflush( amoebaGpu->log );
    }
-#endif
-
-    }
-
-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaGrycukN2Forces%swarp:  numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u\n",
-                        (gpu->bOutputBufferPerWarp ? " " : " no "), gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
-                        sizeof(GrycukParticle), sizeof(GrycukParticle)*threadsPerBlock,
-                        (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
-        (void) fflush( amoebaGpu->log );
-    }
-#endif

    if (gpu->bOutputBufferPerWarp){
        kCalculateAmoebaGrycukBornRadiiN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(GrycukParticle)*threadsPerBlock>>>( gpu->psWorkUnit->_pDevData);
@@ -330,13 +297,7 @@ __device__ void zeroGrycukChainRuleParticleSharedField( struct GrycukChainRulePa

 }

-//#define AMOEBA_DEBUG
-
-__device__ void calculateGrycukChainRulePairIxn_kernel( GrycukChainRuleParticle& atomI, GrycukChainRuleParticle& atomJ, float force[3] 
-#ifdef AMOEBA_DEBUG
-,  float4 pullDebug[5]
-#endif
-){
+__device__ void calculateGrycukChainRulePairIxn_kernel( GrycukChainRuleParticle& atomI, GrycukChainRuleParticle& atomJ, float force[3] ){

    const float pi         = 3.1415926535897f;
    float third            = 1.0f/3.0f;
@@ -390,18 +351,6 @@ __device__ void calculateGrycukChainRulePairIxn_kernel( GrycukChainRuleParticle&
    float dbr  = term * de/r;
          de   = dbr*atomI.bornForce;

-#ifdef AMOEBA_DEBUG
-    pullDebug[0].x = de;
-    pullDebug[0].y = r;
-    pullDebug[0].z = factor;
-    pullDebug[0].w = -4.0f;
-
-    pullDebug[1].x = atomI.bornForce/4.184f;
-    pullDebug[1].y = atomI.bornRadius;
-    pullDebug[1].z = atomJ.bornForce/4.184f;
-    pullDebug[1].w = -5.0f;
-#endif
-
    force[0]   = xr*de;
    force[1]   = yr*de;
    force[2]   = zr*de;
@@ -430,43 +379,12 @@ __device__ void calculateGrycukChainRulePairIxn_kernel( GrycukChainRuleParticle&
 void kCalculateGrycukGbsaForces2( amoebaGpuContext amoebaGpu )
 {

-#ifdef AMOEBA_DEBUG
-    static const char* methodName       = "kCalculateGrycukGbsaForces2";
-    static int timestep = 0;
-    std::vector<int> fileId;
-    timestep++;
-    fileId.resize( 2 );
-    fileId[0] = timestep;
-    fileId[1] = 1;
-#endif
-
    // ---------------------------------------------------------------------------------------

    gpuContext gpu = amoebaGpu->gpuContext;

    // apparently debug array can take up nontrivial no. registers

-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log, "%s %d maxCovalentDegreeSz=%d ZZZ\n",
-                        methodName, gpu->natoms, amoebaGpu->maxCovalentDegreeSz );
-    }
-    int paddedNumberOfAtoms                   = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
-    CUDAStream<float4>* debugArray            = new CUDAStream<float4>(20*paddedNumberOfAtoms, 1, "DebugArray");
-    memset( debugArray->_pSysData,      0, sizeof( float )*4*20*paddedNumberOfAtoms);
-    debugArray->Upload();
-    unsigned int targetAtom                   = 0;
-
-    gpu->psBornRadii->Download();
-    if( amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log, "Grycuk input\n" ); (void) fflush( amoebaGpu->log );
-        for( int ii = 0; ii < amoebaGpu->gpuContext->sim.paddedNumberOfAtoms; ii++ ){
-            (void) fprintf( amoebaGpu->log,"Born %6d %16.9e\n", ii,
-                            gpu->psBornRadii->_pSysData[ii] );
-        }
-    }
-#endif
-
    // on first pass, set threads/block and based on that setting the energy buffer array

    static unsigned int threadsPerBlock = 0;
@@ -480,64 +398,15 @@ void kCalculateGrycukGbsaForces2( amoebaGpuContext amoebaGpu )
        else
            maxThreads = 64;
        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(GrycukChainRuleParticle), gpu->sharedMemoryPerBlock ), maxThreads);
-
-#ifdef AMOEBA_DEBUG
-        if( amoebaGpu->log ){
-            (void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaGrycuk: blcks=%u tds=%u %u bPrWrp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u\n",
-                            gpu->sim.nonbond_blocks, threadsPerBlock, maxThreads, gpu->bOutputBufferPerWarp,
-                            sizeof(GrycukChainRuleParticle), sizeof(GrycukChainRuleParticle)*threadsPerBlock,
-                            (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
-            (void) fflush( amoebaGpu->log );
-        }
-#endif
-
    }

-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaGrycukN2Forces%swarp:  numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u\n",
-                        (gpu->bOutputBufferPerWarp ? " " : " no "), gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
-                        sizeof(GrycukChainRuleParticle), sizeof(GrycukChainRuleParticle)*threadsPerBlock,
-                        (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
-        (void) fflush( amoebaGpu->log );
-    }
-#endif
-
-//kClearForces( gpu );
-
    if (gpu->bOutputBufferPerWarp){
-        kCalculateAmoebaGrycukChainRuleN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(GrycukChainRuleParticle)*threadsPerBlock>>>( gpu->psWorkUnit->_pDevData
-#ifdef AMOEBA_DEBUG
-    ,debugArray->_pDevData, targetAtom
-#endif
-);
+        kCalculateAmoebaGrycukChainRuleN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(GrycukChainRuleParticle)*threadsPerBlock>>>( gpu->psWorkUnit->_pDevData);
    } else {
-        kCalculateAmoebaGrycukChainRuleN2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(GrycukChainRuleParticle)*threadsPerBlock>>>( gpu->psWorkUnit->_pDevData
-#ifdef AMOEBA_DEBUG
-    ,debugArray->_pDevData, targetAtom
-#endif
-);
+        kCalculateAmoebaGrycukChainRuleN2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(GrycukChainRuleParticle)*threadsPerBlock>>>( gpu->psWorkUnit->_pDevData);
    }
    LAUNCHERROR("kCalculateAmoebaCudaGrycukN2Forces");

-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log ){
-        debugArray->Download();
-        int paddedNumberOfAtoms  = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
-        for( int jj = 0; jj < gpu->natoms; jj++ ){
-            int debugIndex = jj; 
-            (void) fprintf( amoebaGpu->log,"%5d %5d DebugGrycukChain\n", targetAtom, jj );
-            for( int kk = 0; kk < 7; kk++ ){
-                (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
-                                debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
-                                debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
-                debugIndex += paddedNumberOfAtoms;
-            }   
-            (void) fprintf( amoebaGpu->log,"\n" );
-        }   
-    }   
-#endif
-
   if( 0 ){ 
        static int callId                    = 0;
        gpuContext gpu                       = amoebaGpu->gpuContext;

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaGrycukChainRule.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaGrycukChainRule.h
@@ -34,11 +34,7 @@ __launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
 #else
 __launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
 #endif
-void METHOD_NAME(kCalculateAmoebaGrycukChainRule, _kernel)( unsigned int* workUnit 
-#ifdef AMOEBA_DEBUG
-                           , float4* debugArray, unsigned int targetAtom
-#endif
-){
+void METHOD_NAME(kCalculateAmoebaGrycukChainRule, _kernel)( unsigned int* workUnit ){

    extern __shared__ GrycukChainRuleParticle sAChainRule[];

@@ -49,10 +45,6 @@ void METHOD_NAME(kCalculateAmoebaGrycukChainRule, _kernel)( unsigned int* workUn
    unsigned int end             = (warp+1)*numWorkUnits/totalWarps;
    unsigned int lasty           = 0xFFFFFFFF;

-#ifdef AMOEBA_DEBUG
-    float4 pullDebug[5];
-#endif
-
    while (pos < end)
    {

@@ -85,11 +77,7 @@ void METHOD_NAME(kCalculateAmoebaGrycukChainRule, _kernel)( unsigned int* workUn
            for (unsigned int j = (tgx+1)&(GRID-1); j != tgx; j = (j+1)&(GRID-1))
            {
                float localForce[3];
-                calculateGrycukChainRulePairIxn_kernel( localParticle, psAChainRule[j], localForce
-#ifdef AMOEBA_DEBUG
-,  pullDebug
-#endif
- );
+                calculateGrycukChainRulePairIxn_kernel( localParticle, psAChainRule[j], localForce);
                if( (atomI != (y + j)) && (atomI < cSim.atoms) && ((y+j) < cSim.atoms) ){

                    localParticle.force[0]     -= localForce[0];
@@ -100,54 +88,6 @@ void METHOD_NAME(kCalculateAmoebaGrycukChainRule, _kernel)( unsigned int* workUn
                    psAChainRule[j].force[1]   += localForce[1];
                    psAChainRule[j].force[2]   += localForce[2];

-#ifdef AMOEBA_DEBUG
-if( atomI == targetAtom || (y+j) == targetAtom ){
-        unsigned int index                 = (atomI == targetAtom) ? (y + j) : atomI;
-
-        debugArray[index].x                = (float) atomI;
-        debugArray[index].y                = (float) (y + j); 
-        debugArray[index].z                = -1.0f;
-        debugArray[index].w                = -1.0f;
-
-        index                             += cSim.paddedNumberOfAtoms;
-        debugArray[index].x                = (float) x;
-        debugArray[index].y                = (float) y;
-        debugArray[index].z                = (float) tgx;
-        debugArray[index].w                = -2.0f;
-
-        index                             += cSim.paddedNumberOfAtoms;
-        debugArray[index].x                = pullDebug[0].x;
-        debugArray[index].y                = pullDebug[0].y;
-        debugArray[index].z                = pullDebug[0].z;
-        debugArray[index].w                = pullDebug[0].w;
-
-        index                             += cSim.paddedNumberOfAtoms;
-        debugArray[index].x                = pullDebug[1].x;
-        debugArray[index].y                = pullDebug[1].y;
-        debugArray[index].z                = pullDebug[1].z;
-        debugArray[index].w                = pullDebug[1].w;
-
-        index                             += cSim.paddedNumberOfAtoms;
-        debugArray[index].x                = localForce[0];
-        debugArray[index].y                = localForce[1];
-        debugArray[index].z                = localForce[2];
-        debugArray[index].w                = -12.0f;
-
- calculateGrycukChainRulePairIxn_kernel( psAChainRule[j], localParticle, localForce ,  pullDebug );
-
-        index                             += cSim.paddedNumberOfAtoms;
-        debugArray[index].x                = pullDebug[0].x;
-        debugArray[index].y                = pullDebug[0].y;
-        debugArray[index].z                = pullDebug[0].z;
-        debugArray[index].w                = -13.0f;
-
-        index                             += cSim.paddedNumberOfAtoms;
-        debugArray[index].x                = localForce[0];
-        debugArray[index].y                = localForce[1];
-        debugArray[index].z                = localForce[2];
-        debugArray[index].w                = -14.0f;
-}
-#endif

                }
            }
@@ -181,11 +121,7 @@ if( atomI == targetAtom || (y+j) == targetAtom ){

                if( (atomI < cSim.atoms) && ((y+tj) < cSim.atoms) ){
                    float localForce[3];
-                    calculateGrycukChainRulePairIxn_kernel( localParticle, psAChainRule[tj], localForce 
-#ifdef AMOEBA_DEBUG
-,  pullDebug
-#endif
-);
+                    calculateGrycukChainRulePairIxn_kernel( localParticle, psAChainRule[tj], localForce );
    
                    localParticle.force[0]     -= localForce[0];
                    localParticle.force[1]     -= localForce[1];
@@ -195,54 +131,7 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
                    psAChainRule[tj].force[1]  += localForce[1];
                    psAChainRule[tj].force[2]  += localForce[2];
    
-#ifdef AMOEBA_DEBUG
-unsigned int index                 = (atomI == targetAtom) ? (y + tj) : atomI;
-if( atomI == targetAtom || (y+tj) == targetAtom ){
-
-        debugArray[index].x                = (float) atomI;
-        debugArray[index].y                = (float) (y + tj); 
-        debugArray[index].z                = -1.0f;
-        debugArray[index].w                = -1.0f;
-
-        index                             += cSim.paddedNumberOfAtoms;
-        debugArray[index].x                = (float) x;
-        debugArray[index].y                = (float) y;
-        debugArray[index].z                = (float) tgx;
-        debugArray[index].w                = -2.0f;
-
-        index                             += cSim.paddedNumberOfAtoms;
-        debugArray[index].x                = pullDebug[0].x;
-        debugArray[index].y                = pullDebug[0].y;
-        debugArray[index].z                = pullDebug[0].z;
-        debugArray[index].w                = pullDebug[0].w;
-
-        index                             += cSim.paddedNumberOfAtoms;
-        debugArray[index].x                = pullDebug[1].x;
-        debugArray[index].y                = pullDebug[1].y;
-        debugArray[index].z                = pullDebug[1].z;
-        debugArray[index].w                = pullDebug[1].w;
-
-        index                             += cSim.paddedNumberOfAtoms;
-        debugArray[index].x                = localForce[0];
-        debugArray[index].y                = localForce[1];
-        debugArray[index].z                = localForce[2];
-        debugArray[index].w                = -10.0f;
-}
-#endif
-                    calculateGrycukChainRulePairIxn_kernel( psAChainRule[tj], localParticle, localForce
-#ifdef AMOEBA_DEBUG
-,  pullDebug
-#endif
- );
-#ifdef AMOEBA_DEBUG
-if( atomI == targetAtom || (y+tj) == targetAtom ){
-        index                             += cSim.paddedNumberOfAtoms;
-        debugArray[index].x                = pullDebug[0].x;
-        debugArray[index].y                = localForce[1];
-        debugArray[index].z                = localForce[2];
-        debugArray[index].w                = -11.0f;
-}
-#endif
+                    calculateGrycukChainRulePairIxn_kernel( psAChainRule[tj], localParticle, localForce);
    
                    localParticle.force[0]     += localForce[0];
                    localParticle.force[1]     += localForce[1];

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
-//----------------------------------------------------------------------------------------- 
-//----------------------------------------------------------------------------------------- 
-//#include "amoebaGpuTypes.h"
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
 #include "cudaKernels.h"
 #include "amoebaCudaKernels.h"
 #include "kCalculateAmoebaCudaUtilities.h"
-//#define INCLUDE_TORQUE
 #include "kCalculateAmoebaCudaKirkwoodParticle.h"
 extern void kCalculateObcGbsaForces2(gpuContext gpu);

-//#define AMOEBA_DEBUG
-#undef AMOEBA_DEBUG
-
 static __constant__ cudaGmxSimulation cSim;
 static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;

@@ -1638,26 +1657,6 @@ __device__ void zeroKirkwoodParticleSharedField( struct KirkwoodParticle* sA )
 // reduce psWorkArray_1_1 -> dBorn
 // reduce psWorkArray_1_2 -> dBornPolar

-#ifdef AMOEBA_DEBUG
-static void kReduce_dBorn(amoebaGpuContext amoebaGpu )
-{
-
-    gpuContext gpu = amoebaGpu->gpuContext;
-/*
-    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
-                           gpu->sim.paddedNumberOfAtoms, gpu->sim.outputBuffers,
-                           amoebaGpu->psWorkArray_1_1->_pDevData, amoebaGpu->psBorn->_pDevData, 0 );
-    LAUNCHERROR("kReduce_dBorn1");
-*/
-
-    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
-                           gpu->sim.paddedNumberOfAtoms, gpu->sim.outputBuffers,
-                           amoebaGpu->psWorkArray_1_2->_pDevData, amoebaGpu->psBornPolar->_pDevData, 0 );
-
-    LAUNCHERROR("kReduce_dBorn2");
-}
-#endif
-
 __global__
 #if (__CUDA_ARCH__ >= 200)
 __launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
@@ -1940,40 +1939,6 @@ static void kReduceToBornForcePrefactor( amoebaGpuContext amoebaGpu )
                                             amoebaGpu->gpuContext->psBornForce->_pDevData );
    }
    LAUNCHERROR("kReduceToBornForcePrefactor");
-
-//#define AMOEBA_DEBUG
-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log ){
-
-        // kClearEnergy() should be called prior to kReduceToBornForcePrefactorAndSASA_kernel
-
-        double energy = kReduceEnergy( amoebaGpu->gpuContext );
-        (void) fprintf( amoebaGpu->log, "Born force w/ cavity energy=%15.7e.\n", energy ); (void) fflush( amoebaGpu->log );
-/*
-        for( int ii = 0; ii < amoebaGpu->gpuContext->natoms; ii++ ){
-           (void) fprintf( amoebaGpu->log, "%5d ", ii);
-           (void) fprintf( amoebaGpu->log,"bF %16.9e obc=%16.9e bR=%16.9e\n",
-                           amoebaGpu->gpuContext->psBornForce->_pSysData[ii],
-                           amoebaGpu->gpuContext->psObcData->_pSysData[ii].x,
-                           amoebaGpu->gpuContext->psBornRadii->_pSysData[ii] );
-        }
-        (void) fflush( amoebaGpu->log );
-*/
-        if( 0 ){
-            std::vector<int> fileId;
-            //fileId.push_back( 0 );
-            VectorOfDoubleVectors outputVector;
-            cudaLoadCudaFloat4Array( amoebaGpu->gpuContext->natoms,  3, amoebaGpu->gpuContext->psPosq4,       outputVector, NULL, 1.0f );
-            cudaLoadCudaFloatArray(  amoebaGpu->gpuContext->natoms,  1, amoebaGpu->gpuContext->psBornRadii,   outputVector, NULL, 1.0f );
-            cudaLoadCudaFloat2Array( amoebaGpu->gpuContext->natoms,  2, amoebaGpu->gpuContext->psObcData,     outputVector, NULL, 1.0f );
-            cudaLoadCudaFloatArray(  amoebaGpu->gpuContext->natoms,  1, amoebaGpu->gpuContext->psBornForce,   outputVector, NULL, 1.0f );
-            cudaWriteVectorOfDoubleVectorsToFile( "CudaBornForce", fileId, outputVector );
-        }
-
-    }
-#endif
-#undef AMOEBA_DEBUG
-
 }

 /**---------------------------------------------------------------------------------------
@@ -1988,44 +1953,11 @@ static void kReduceToBornForcePrefactor( amoebaGpuContext amoebaGpu )
 void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
 {

-#ifdef AMOEBA_DEBUG
-    static const char* methodName       = "kCalculateAmoebaKirkwood";
-    static int timestep = 0;
-    std::vector<int> fileId;
-    timestep++;
-    fileId.resize( 2 );
-    fileId[0] = timestep;
-    fileId[1] = 1;
-#endif
-
    // ---------------------------------------------------------------------------------------

    gpuContext gpu = amoebaGpu->gpuContext;

    // apparently debug array can take up nontrivial no. registers
-
-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log, "%s %d maxCovalentDegreeSz=%d ZZZ\n",
-                        methodName, gpu->natoms, amoebaGpu->maxCovalentDegreeSz );
-                        amoebaGpu->scalingDistanceCutoff );
-    }
-    int paddedNumberOfAtoms                   = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
-    CUDAStream<float4>* debugArray            = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
-    memset( debugArray->_pSysData,      0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
-    debugArray->Upload();
-    unsigned int targetAtom                   = 0;
-
-    gpu->psBornRadii->Download();
-    if( amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log, "Kirkwood input\n" ); (void) fflush( amoebaGpu->log );
-        for( int ii = 0; ii < amoebaGpu->gpuContext->sim.paddedNumberOfAtoms; ii++ ){
-            (void) fprintf( amoebaGpu->log,"Born %6d %16.9e\n", ii,
-                            gpu->psBornRadii->_pSysData[ii] );
-        }
-    }
-#endif
-
    // on first pass, set threads/block and based on that setting the energy buffer array

    static unsigned int threadsPerBlock = 0;
@@ -2040,47 +1972,17 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
            maxThreads = 64;
        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(KirkwoodParticle), gpu->sharedMemoryPerBlock ), maxThreads);

-#ifdef AMOEBA_DEBUG
-        if( amoebaGpu->log ){
-            (void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaKirkwood: blcks=%u tds=%u %u bPrWrp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u\n",
-                            gpu->sim.nonbond_blocks, threadsPerBlock, maxThreads, gpu->bOutputBufferPerWarp,
-                            sizeof(KirkwoodParticle), sizeof(KirkwoodParticle)*threadsPerBlock,
-                            (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
-            (void) fflush( amoebaGpu->log );
-        }
-#endif
-
    }

    kClearFields_1( amoebaGpu );

-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaKirkwoodN2Forces%swarp:  numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u\n",
-                        (gpu->bOutputBufferPerWarp ? " " : " no "), gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
-                        sizeof(KirkwoodParticle), sizeof(KirkwoodParticle)*threadsPerBlock,
-                        (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
-        (void) fflush( amoebaGpu->log );
-    }
-#endif
-
    if (gpu->bOutputBufferPerWarp){
        kCalculateAmoebaCudaKirkwoodN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(KirkwoodParticle)*threadsPerBlock>>>(
-                                                                           gpu->psWorkUnit->_pDevData
-#ifdef AMOEBA_DEBUG
-                                                                           , debugArray->_pDevData, targetAtom );
-#else
-                                                                           );
-#endif
+                                                                           gpu->psWorkUnit->_pDevData);
    } else {

        kCalculateAmoebaCudaKirkwoodN2Forces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(KirkwoodParticle)*threadsPerBlock>>>(
-                                                                           gpu->psWorkUnit->_pDevData
-#ifdef AMOEBA_DEBUG
-                                                                           , debugArray->_pDevData, targetAtom );
-#else
-                                                                           );
-#endif
+                                                                           gpu->psWorkUnit->_pDevData);
    }
    LAUNCHERROR("kCalculateAmoebaCudaKirkwoodN2Forces");


--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.h
@@ -35,15 +35,7 @@ __launch_bounds__(128, 1)
 __launch_bounds__(64, 1)
 #endif
 void METHOD_NAME(kCalculateAmoebaCudaKirkwood, Forces_kernel)(
-                            unsigned int* workUnit
-#ifdef AMOEBA_DEBUG
-                           , float4* debugArray, unsigned int targetAtom
-#endif
-){
-
-#ifdef AMOEBA_DEBUG
-    float4 pullBack[20];
-#endif
+                            unsigned int* workUnit){

    extern __shared__ KirkwoodParticle sA[];


--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.cu
-//-----------------------------------------------------------------------------------------
-
-//-----------------------------------------------------------------------------------------
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */

 #include "amoebaGpuTypes.h"
 #include "amoebaCudaKernels.h"
 #include "kCalculateAmoebaCudaUtilities.h"
-//#include "kCalculateAmoebaCudaKirkwoodParticle.h"
 #include "kCalculateAmoebaCudaKirkwoodEDiffParticle.h"

-//#define AMOEBA_DEBUG
-
 static __constant__ cudaGmxSimulation cSim;
 static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;

@@ -865,19 +884,6 @@ __device__ void calculateKirkwoodEDiffPairIxn_kernel( KirkwoodEDiffParticle& ato

 #endif

-#ifdef AMOEBA_DEBUG
-__device__ static int debugAccumulate( unsigned int index, float4* debugArray, float* field, unsigned int addMask, float idLabel )
-{
-    index                             += cSim.paddedNumberOfAtoms;
-    debugArray[index].x                = addMask ? field[0] : 0.0f;
-    debugArray[index].y                = addMask ? field[1] : 0.0f;
-    debugArray[index].z                = addMask ? field[2] : 0.0f;
-    debugArray[index].w                = idLabel;
-
-    return index;
-}
-#endif
-
 __device__ void zeroKirkwoodEDiffParticleSharedField( struct KirkwoodEDiffParticle* sA )
 {
    // zero shared fields
@@ -929,24 +935,12 @@ static void kReduceTorque( amoebaGpuContext amoebaGpu )
 void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
 {
  
-   // ---------------------------------------------------------------------------------------
-
-    static int timestep                 = 0;
-    timestep++;
-#ifdef AMOEBA_DEBUG
-    static const char* methodName       = "kCalculateAmoebaKirkwoodEDiff";
-    std::vector<int> fileId;
-    fileId.resize( 2 );
-    fileId[0] = timestep;
-    fileId[1] = 1;
-#endif
-
   // ---------------------------------------------------------------------------------------

    gpuContext gpu = amoebaGpu->gpuContext;

    // apparently debug array can take up nontrivial no. registers
-    //
+
    static unsigned int threadsPerBlock = 0;
    if( threadsPerBlock == 0 ){
        unsigned int maxThreads;
@@ -960,23 +954,10 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
        threadsPerBlock = std::min(getThreadsPerBlock( amoebaGpu, sizeof(KirkwoodEDiffParticle), gpu->sharedMemoryPerBlock ), maxThreads);
    }   
    
-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaKirkwoodEDiffN2Forces: blocks=%u threads=%u bffr/Warp=%u atm=%lu shrd=%lu"
-                                        " ixnCt=%lu workUnits=%u sm=%d device=%d sharedMemoryPerBlock=%u step=%d\n",
-                        gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
-                        sizeof(KirkwoodEDiffParticle), sizeof(KirkwoodEDiffParticle)*threadsPerBlock,
-                        (*gpu->psInteractionCount)[0], gpu->sim.workUnits, gpu->sm_version, gpu->device, gpu->sharedMemoryPerBlock, timestep );
-        (void) fflush( amoebaGpu->log );
-    }   
-#endif
-
    if (gpu->bOutputBufferPerWarp){
        kCalculateAmoebaCudaKirkwoodEDiffN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(KirkwoodEDiffParticle)*threadsPerBlock>>>(
                                                                           gpu->psWorkUnit->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData );
-
    } else {
-
        kCalculateAmoebaCudaKirkwoodEDiffN2Forces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(KirkwoodEDiffParticle)*threadsPerBlock>>>(
                                                                           gpu->psWorkUnit->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData );
    }

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.h
@@ -35,8 +35,7 @@ __launch_bounds__(96, 1)
 __launch_bounds__(32, 1)
 #endif
 void METHOD_NAME(kCalculateAmoebaCudaKirkwoodEDiff, Forces_kernel)(
-                            unsigned int* workUnit, float* outputTorque
-){
+                            unsigned int* workUnit, float* outputTorque){

    extern __shared__ KirkwoodEDiffParticle sA[];


--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaLocalForces.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaLocalForces.cu
@@ -1616,19 +1616,7 @@ void kCalculateAmoebaLocalForces_kernel()

 void kCalculateAmoebaLocalForces(amoebaGpuContext gpu)
 {
-   
-#ifdef AMOEBA_DEBUG 
-    if( gpu->log ){
-        static int call = 1;
-        if( call == 0 ){
-            (void) fprintf( gpu->log,"kCalculateAmoebaLocalForces: blks=%u thrds/blk=%u\n",
-                            gpu->gpuContext->sim.blocks, gpu->gpuContext->sim.localForces_threads_per_block); fflush( gpu->log );
-            call++;
-        }
-    }
-#endif
    kCalculateAmoebaLocalForces_kernel<<<gpu->gpuContext->sim.blocks, gpu->gpuContext->sim.localForces_threads_per_block>>>();
-
    LAUNCHERROR("kCalculateAmoebaLocalForces");

 }

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMapTorques.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMapTorques.cu
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */

 #include "cudaKernels.h"
 #include "amoebaCudaKernels.h"

-//#define AMOEBA_DEBUG
-#define BLOCK_SIZE 128
-
 using namespace std; 

 static __constant__ cudaGmxSimulation cSim;
@@ -469,41 +491,4 @@ void cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpuContext amoebaGpu, CUDASt
        amoebaAddMapTorqueForceToForce_kernel<<< gpu->sim.blocks, gpu->sim.threads_per_block>>> ( );
        LAUNCHERROR("amoebaAddMapTorqueForceToForce");
    }
-
-#ifdef AMOEBA_DEBUG
-    if( 0 ){  
-        VectorOfDoubleVectors outputVector;
-
-        std::vector<int> fileId;
-        static int call = 0;  
-        fileId.push_back( call++ );
-
-        int paddedNumberOfAtoms  = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
-        CUDAStream<float>* temp  = new CUDAStream<float>(3*paddedNumberOfAtoms, 1, "TempMapTorqueAndAddToForce2");
-
-        reduceAndCopyCUDAStreamFloat4( gpu->psForce4, temp, 1.0 );
-        cudaLoadCudaFloatArray( gpu->natoms,  3, temp, outputVector, NULL, 1.0f/4.184f );
-
-        reduceAndCopyCUDAStreamFloat4( amoebaGpu->psTorqueMapForce4, temp, 1.0 );
-        cudaLoadCudaFloatArray( gpu->natoms,  3, temp, outputVector, NULL, 1.0f/4.184f );
-        for( int pId = 0; pId < 5; pId++ ){
-        float sum[3] = { 0.0f,  0.0f,  0.0f };
-        (void) fprintf( stderr, "\n\nTorqueForceToForce for part=%d\n", pId );
-        for( int ii = 0; ii < amoebaGpu->amoebaSim.maxTorqueBufferIndex; ii++ ){
-            (void) fprintf( stderr, "%4d [%15.7e %15.7e %15.7e]\n", ii,
-                            amoebaGpu->psTorqueMapForce4->_pSysStream[ii][pId].x,
-                            amoebaGpu->psTorqueMapForce4->_pSysStream[ii][pId].y,
-                            amoebaGpu->psTorqueMapForce4->_pSysStream[ii][pId].z );
-            sum[0] += amoebaGpu->psTorqueMapForce4->_pSysStream[ii][pId].x;
-            sum[1] += amoebaGpu->psTorqueMapForce4->_pSysStream[ii][pId].y;
-            sum[2] += amoebaGpu->psTorqueMapForce4->_pSysStream[ii][pId].z;
-        }
-        (void) fprintf( stderr, "TorqueForceToForce for partcle=%d [%15.7e %15.7e %15.7e]  [%15.7e %15.7e %15.7e]\n", pId, sum[0], sum[1], sum[2], sum[0]/4.184f, sum[1]/4.184f, sum[2]/4.184f );
-        }
-
-        cudaWriteVectorOfDoubleVectorsToFile( "CudaElectrostatiPostTorqueForce", fileId, outputVector );
-        delete temp;
-    }
-#endif
-
 }
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.cu
-//-----------------------------------------------------------------------------------------
-
-//-----------------------------------------------------------------------------------------
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */

 #include "amoebaGpuTypes.h"
 #include "amoebaCudaKernels.h"
@@ -33,21 +55,12 @@ void GetCalculateAmoebaCudaMutualInducedAndGkFieldsSim(amoebaGpuContext amoebaGp
    RTERROR(status, "GetCalculateAmoebaCudaMutualInducedAndGkFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
 }

-//#define AMOEBA_DEBUG
-#undef AMOEBA_DEBUG
-
 #define GK
 #include "kCalculateAmoebaCudaMutualInducedParticle.h"
 #undef GK

 __device__ void calculateMutualInducedAndGkFieldsPairIxn_kernel( MutualInducedParticle& atomI, MutualInducedParticle& atomJ,
-                                                                 float fields[8][3]
-
-#ifdef AMOEBA_DEBUG
-               , float4* debugArray
-#endif
-
- )
+                                                                 float fields[8][3] )
 {

    float deltaR[3];
@@ -119,13 +132,7 @@ __device__ void calculateMutualInducedAndGkFieldsPairIxn_kernel( MutualInducedPa
 }

 __device__ void calculateMutualInducedAndGkFieldsGkPairIxn_kernel( MutualInducedParticle& atomI, MutualInducedParticle& atomJ,
-                                                                   float gkField[8][3]
-
-#ifdef AMOEBA_DEBUG
-               , float4* debugArray
-#endif
-
- )
+                                                                   float gkField[8][3] )
 {

    float gux[5];
@@ -213,20 +220,6 @@ __device__ void calculateMutualInducedAndGkFieldsGkPairIxn_kernel( MutualInduced

 }

-#ifdef AMOEBA_DEBUG
-__device__ static int debugAccumulate( int index, float4* debugArray, float* field, unsigned int addMask, float idLabel )
-{
-    index                             += cSim.paddedNumberOfAtoms;
-    debugArray[index].x                = addMask ? field[0] : 0.0f;
-    debugArray[index].y                = addMask ? field[1] : 0.0f;
-    debugArray[index].z                = addMask ? field[2] : 0.0f;
-    debugArray[index].w                = idLabel;
-
-    return index;
-}
-#endif
-
-
 // Include versions of the kernels for N^2 calculations.

 #define METHOD_NAME(a, b) a##N2##b
@@ -324,12 +317,6 @@ void kReduceMutualInducedAndGkFieldDelta_kernel( float* arrayOfDeltas1, float* a
        epsilon[0]  = epsilon[0] < delta[0].z ? delta[0].z : epsilon[0];
        epsilon[0]  = epsilon[0] < delta[0].w ? delta[0].w : epsilon[0];
        epsilon[0]  = 48.033324f*sqrtf( epsilon[0]/( (float) cSim.atoms ) );
-#ifdef AMOEBA_DEBUG
-        epsilon[1]  = 48.033324f*sqrtf( delta[0].x/( (float) cSim.atoms ) );
-        epsilon[2]  = 48.033324f*sqrtf( delta[0].y/( (float) cSim.atoms ) );
-        epsilon[3]  = 48.033324f*sqrtf( delta[0].z/( (float) cSim.atoms ) );
-        epsilon[4]  = 48.033324f*sqrtf( delta[0].w/( (float) cSim.atoms ) );
-#endif
    }   
 }

@@ -462,20 +449,6 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon

    gpuContext gpu    = amoebaGpu->gpuContext;

-#ifdef AMOEBA_DEBUG
-    static int iteration = 1;
-    int targetAtom    = 0;
-    static const char* methodName       = "cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply";
-    if( 1 && amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log, "%s\n", methodName );
-        (void) fflush( amoebaGpu->log );
-    }
-    int paddedNumberOfAtoms                    = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
-    CUDAStream<float4>* debugArray             = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
-    memset( debugArray->_pSysData,      0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
-    debugArray->Upload();
-#endif
-
    // clear output arrays

    kClearFields_3( amoebaGpu, 4 );
@@ -493,28 +466,13 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
        threadsPerBlock = std::min(getThreadsPerBlock( amoebaGpu, sizeof(MutualInducedParticle), gpu->sharedMemoryPerBlock ), maxThreads);
    }
    
-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log, "cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%lu ixnCt=%lu workUnits=%lu\n",
-                        gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
-                        sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*threadsPerBlock,
-                        (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
-        (void) fflush( amoebaGpu->log );
-    }
-#endif
-
    if (gpu->bOutputBufferPerWarp){
        kCalculateAmoebaMutualInducedAndGkFieldsN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
                                                                 gpu->psWorkUnit->_pDevData,
                                                                 amoebaGpu->psWorkArray_3_1->_pDevData,
                                                                 amoebaGpu->psWorkArray_3_2->_pDevData,
                                                                 amoebaGpu->psWorkArray_3_3->_pDevData,
-#ifdef AMOEBA_DEBUG
-                                                                 amoebaGpu->psWorkArray_3_4->_pDevData,
-                                                                 debugArray->_pDevData, targetAtom );
-#else
                                                                 amoebaGpu->psWorkArray_3_4->_pDevData );
-#endif
    } else {

        kCalculateAmoebaMutualInducedAndGkFieldsN2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
@@ -522,120 +480,13 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
                                                            amoebaGpu->psWorkArray_3_1->_pDevData,
                                                            amoebaGpu->psWorkArray_3_2->_pDevData,
                                                            amoebaGpu->psWorkArray_3_3->_pDevData,
-#ifdef AMOEBA_DEBUG
-                                                            amoebaGpu->psWorkArray_3_4->_pDevData,
-                                                            debugArray->_pDevData, targetAtom );
-#else
                                                            amoebaGpu->psWorkArray_3_4->_pDevData );
-#endif

    }
    LAUNCHERROR("kCalculateAmoebaMutualInducedAndGkFields");  

    kReduceMutualInducedAndGkFields( amoebaGpu, outputArray, outputPolarArray, outputArrayS, outputPolarArrayS );

-#ifdef AMOEBA_DEBUG
-        amoebaGpu->psWorkArray_3_1->Download();
-        amoebaGpu->psWorkArray_3_2->Download();
-        amoebaGpu->psWorkArray_3_3->Download();
-        amoebaGpu->psWorkArray_3_4->Download();
-
-    if( amoebaGpu->log && iteration == 1 ){
-
-        (void) fprintf( amoebaGpu->log, "Finished MI kernel execution %d\n", iteration ); (void) fflush( amoebaGpu->log );
-
-        outputArray->Download();
-        outputPolarArray->Download();
-
-        outputArrayS->Download();
-        outputPolarArrayS->Download();
-
-        debugArray->Download();
-
-        int maxPrint        = 33;
-        for( int ii = 0; ii < gpu->natoms; ii++ ){
-           (void) fprintf( amoebaGpu->log, "%5d ", ii); 
-
-            int indexOffset     = ii*3;
-    
-           // Mi
-
-           (void) fprintf( amoebaGpu->log," Mult[%16.9e %16.9e %16.9e] ",
-                           outputArray->_pSysData[indexOffset],
-                           outputArray->_pSysData[indexOffset+1],
-                           outputArray->_pSysData[indexOffset+2] );
-    
-           // Mi polar
-
-           (void) fprintf( amoebaGpu->log," MultP[%16.9e %16.9e %16.9e]\n",
-                           outputPolarArray->_pSysData[indexOffset],
-                           outputPolarArray->_pSysData[indexOffset+1],
-                           outputPolarArray->_pSysData[indexOffset+2] );
-
-           // MiS
-
-           (void) fprintf( amoebaGpu->log,"      MultS[%16.9e %16.9e %16.9e] ",
-                           outputArrayS->_pSysData[indexOffset],
-                           outputArrayS->_pSysData[indexOffset+1],
-                           outputArrayS->_pSysData[indexOffset+2] );
-    
-           // Mi polarS
-
-           (void) fprintf( amoebaGpu->log,"MultPS[%16.9e %16.9e %16.9e]\n",
-                           outputPolarArrayS->_pSysData[indexOffset],
-                           outputPolarArrayS->_pSysData[indexOffset+1],
-                           outputPolarArrayS->_pSysData[indexOffset+2] );
-
-           // coords
-
-#if 0
-            (void) fprintf( amoebaGpu->log,"x[%16.9e %16.9e %16.9e] ",
-                            gpu->psPosq4->_pSysData[ii].x,
-                            gpu->psPosq4->_pSysData[ii].y,
-                            gpu->psPosq4->_pSysData[ii].z);
-
-
-           for( int jj = 0; jj < gpu->natoms && jj < 5; jj++ ){
-               int debugIndex = jj*gpu->natoms + ii;
-               float xx       =  gpu->psPosq4->_pSysData[jj].x -  gpu->psPosq4->_pSysData[ii].x;
-               float yy       =  gpu->psPosq4->_pSysData[jj].y -  gpu->psPosq4->_pSysData[ii].y;
-               float zz       =  gpu->psPosq4->_pSysData[jj].z -  gpu->psPosq4->_pSysData[ii].z;
-               (void) fprintf( amoebaGpu->log,"\n%4d %4d delta [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] ",
-                               ii, jj, xx, yy, zz,
-                               debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y, debugArray->_pSysData[debugIndex].z );
-
-           }
-#endif
-           if( ii == targetAtom ){
-               (void) fprintf( amoebaGpu->log,"\n" );
-               int paddedNumberOfAtoms                    = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
-               for( int jj = 0; jj < gpu->natoms; jj++ ){
-                   int debugIndex = jj;
-                   (void) fprintf( amoebaGpu->log,"%4d %4d Rint [%16.9e %16.9e %16.9e %16.9e]\n",
-                                   ii, jj,
-                                   debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
-                                   debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
-
-                   for( int kk = 0; kk < 9; kk++ ){
-                       debugIndex += paddedNumberOfAtoms;
-                       (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %5.1f]\n",
-                                       debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y, debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
-                   }
-               }
-               (void) fprintf( amoebaGpu->log,"\n" );
-           }
-           (void) fprintf( amoebaGpu->log,"\n" );
-           if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
-                ii = gpu->natoms - maxPrint;
-           }
-        }
-        (void) fflush( amoebaGpu->log );
-        iteration++;
-
-     }
-     delete debugArray;
-#endif
-
 }

 /**---------------------------------------------------------------------------------------
@@ -649,22 +500,12 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
 static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoebaGpu )
 {
  
-   // ---------------------------------------------------------------------------------------
-
-    static int timestep                  = 0;
-    timestep++;
-#ifdef AMOEBA_DEBUG
-    static const char* methodName        = "cudaComputeAmoebaMutualInducedAndGkFieldBySOR";
-    std::vector<int> fileId;
-    fileId.resize( 2 );
-    fileId[0] = timestep;
-    fileId[1] = 1;
-#endif
-
   // ---------------------------------------------------------------------------------------

    int done;
    int iteration;
+    static int timestep = 0;
+    timestep++;

    gpuContext gpu     = amoebaGpu->gpuContext;

@@ -685,49 +526,6 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe
    cudaMemcpy( amoebaGpu->psInducedDipole->_pDevData,        amoebaGpu->psE_Field->_pDevData,       3*gpu->sim.paddedNumberOfAtoms*sizeof( float ), cudaMemcpyDeviceToDevice );
    cudaMemcpy( amoebaGpu->psInducedDipolePolar->_pDevData,   amoebaGpu->psE_FieldPolar->_pDevData,  3*gpu->sim.paddedNumberOfAtoms*sizeof( float ), cudaMemcpyDeviceToDevice );

-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log ){
-
-        (void) fprintf( amoebaGpu->log, "cudaComputeAmoebaMutualInducedAndGkFieldBySOR Initial setup for matrix multiply\n" ); fflush( amoebaGpu->log );
-
-        gpu->psPosq4->Download();
-        amoebaGpu->psE_Field->Download();
-        amoebaGpu->psE_FieldPolar->Download();
-        amoebaGpu->psInducedDipole->Download(),
-        amoebaGpu->psInducedDipolePolar->Download();
-        amoebaGpu->psInducedDipoleS->Download(),
-        amoebaGpu->psInducedDipolePolarS->Download();
-        amoebaGpu->psPolarizability->Download();
-
-        int offset   = 0;
-        int maxPrint = 10;
-        for( int ii = 0; ii < gpu->natoms; ii++ ){
-            (void) fprintf( amoebaGpu->log, "\n%4d pol=%12.4e\n", ii, 
-                            amoebaGpu->psPolarizability->_pSysData[offset] );
-            if( amoebaGpu->psPolarizability->_pSysData[offset] != amoebaGpu->psPolarizability->_pSysData[offset+1] ||
-                amoebaGpu->psPolarizability->_pSysData[offset] != amoebaGpu->psPolarizability->_pSysData[offset+2] ){
-                (void) fprintf( amoebaGpu->log, "PolX!!! %12.4e %12.4e ", amoebaGpu->psPolarizability->_pSysData[offset+1], amoebaGpu->psPolarizability->_pSysData[offset+2] ); 
-            }
-
-            (void) fprintf( amoebaGpu->log," E[%14.6e %14.6e %14.6e]  Mi[%14.6e %14.6e %14.6e] Pos[%14.6e %14.6e %14.6e]\n",
-                            amoebaGpu->psE_Field->_pSysData[offset],       amoebaGpu->psE_Field->_pSysData[offset+1],       amoebaGpu->psE_Field->_pSysData[offset+2],
-                            amoebaGpu->psInducedDipole->_pSysData[offset], amoebaGpu->psInducedDipole->_pSysData[offset+1], amoebaGpu->psInducedDipole->_pSysData[offset+2],
-                            gpu->psPosq4->_pSysData[ii].x, gpu->psPosq4->_pSysData[ii].y, gpu->psPosq4->_pSysData[ii].z );
-            (void) fprintf( amoebaGpu->log,"Ep[%14.6e %14.6e %14.6e] Mip[%14.6e %14.6e %14.6e]\n",
-                            amoebaGpu->psE_FieldPolar->_pSysData[offset],       amoebaGpu->psE_FieldPolar->_pSysData[offset+1],       amoebaGpu->psE_FieldPolar->_pSysData[offset+2],
-                            amoebaGpu->psInducedDipolePolar->_pSysData[offset], amoebaGpu->psInducedDipolePolar->_pSysData[offset+1], amoebaGpu->psInducedDipolePolar->_pSysData[offset+2] );
-
-            (void) fprintf( amoebaGpu->log,"Gk[%14.6e %14.6e %14.6e] MiS[%14.6e %14.6e %14.6e] MipS[%14.6e %14.6e %14.6e]\n",
-                            amoebaGpu->psGk_Field->_pSysData[offset],            amoebaGpu->psGk_Field->_pSysData[offset+1],            amoebaGpu->psGk_Field->_pSysData[offset+2],
-                            amoebaGpu->psInducedDipoleS->_pSysData[offset],      amoebaGpu->psInducedDipoleS->_pSysData[offset+1],      amoebaGpu->psInducedDipoleS->_pSysData[offset+2],
-                            amoebaGpu->psInducedDipolePolarS->_pSysData[offset], amoebaGpu->psInducedDipolePolarS->_pSysData[offset+1], amoebaGpu->psInducedDipolePolarS->_pSysData[offset+2] );
-            offset += 3;
-            if( ii == maxPrint && (ii < (gpu->natoms - maxPrint) ) )ii =  (gpu->natoms - maxPrint);
-        }   
-        (void) fflush( amoebaGpu->log );
-    }   
-#endif
-
    // if polarization type is direct, set flags signalling done and return

    if( amoebaGpu->amoebaSim.polarizationType )
@@ -800,91 +598,11 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe
            done = 1;
        }

-#ifdef AMOEBA_DEBUG
-        if( amoebaGpu->log ){
-            (void) fprintf( amoebaGpu->log, "cudaComputeAmoebaMutualInducedAndGkFieldBySOR step=%d iteration=%3d eps %14.6e done=%d\n",
-                            timestep, iteration, amoebaGpu->mutualInducedCurrentEpsilon, done );
-        }
-
-        if( amoebaGpu->log ){
-           amoebaGpu->psInducedDipole->Download();
-           amoebaGpu->psInducedDipolePolar->Download();
-           amoebaGpu->psInducedDipoleS->Download();
-           amoebaGpu->psInducedDipolePolarS->Download();
-           amoebaGpu->psWorkVector[2]->Download();
-           amoebaGpu->psWorkVector[3]->Download();
-           amoebaGpu->psGk_Field->Download();
-
-#if 1
-           (void) fprintf( amoebaGpu->log, "%s iteration=%3d eps %14.6e [%14.6e %14.6e] done=%d\n",
-                           methodName, iteration, amoebaGpu->mutualInducedCurrentEpsilon,
-                           amoebaGpu->psCurrentEpsilon->_pSysData[1], 
-                           amoebaGpu->psCurrentEpsilon->_pSysData[2], done );
-#else
-           (void) fprintf( amoebaGpu->log, "%s iteration=%3d eps %14.6e %14.6e crrntEps=%14.6e %14.6e %14.6e %14.6e done=%d\n",
-                           methodName, iteration, sum1, sum2, amoebaGpu->mutualInducedCurrentEpsilon,
-                           amoebaGpu->psCurrentEpsilon->_pSysData[0], 
-                           amoebaGpu->psCurrentEpsilon->_pSysData[1], 
-                           amoebaGpu->psCurrentEpsilon->_pSysData[2], done );
-#endif
-           (void) fflush( amoebaGpu->log );
-
-            int offset   = 0;
-            int maxPrint = 20;
-            for( int ii = 0; ii < gpu->natoms; ii++ ){
-                (void) fprintf( amoebaGpu->log, "%4d ", ii ); 
-    
-                (void) fprintf( amoebaGpu->log," Mi[%14.6e %14.6e %14.6e] ",
-                                amoebaGpu->psInducedDipole->_pSysData[offset], amoebaGpu->psInducedDipole->_pSysData[offset+1], amoebaGpu->psInducedDipole->_pSysData[offset+2] );
-                (void) fprintf( amoebaGpu->log,"Mip[%14.6e %14.6e %14.6e]",
-                                amoebaGpu->psInducedDipolePolar->_pSysData[offset], amoebaGpu->psInducedDipolePolar->_pSysData[offset+1], amoebaGpu->psInducedDipolePolar->_pSysData[offset+2] );
-                (void) fprintf( amoebaGpu->log," MiS[%14.6e %14.6e %14.6e] ",
-                                amoebaGpu->psInducedDipoleS->_pSysData[offset], amoebaGpu->psInducedDipoleS->_pSysData[offset+1], amoebaGpu->psInducedDipoleS->_pSysData[offset+2] );
-                (void) fprintf( amoebaGpu->log,"MipS[%14.6e %14.6e %14.6e]\n",
-                                amoebaGpu->psInducedDipolePolarS->_pSysData[offset], amoebaGpu->psInducedDipolePolarS->_pSysData[offset+1], amoebaGpu->psInducedDipolePolarS->_pSysData[offset+2] );
-/*
-                (void) fprintf( amoebaGpu->log,"Gk [%14.6e %14.6e %14.6e]\n",
-                                amoebaGpu->psGk_Field->_pSysData[offset], amoebaGpu->psGk_Field->_pSysData[offset+1], amoebaGpu->psGk_Field->_pSysData[offset+2] );
-                (void) fprintf( amoebaGpu->log,"W2 [%14.6e %14.6e %14.6e] ",
-                                amoebaGpu->psWorkVector[2]->_pSysData[offset], amoebaGpu->psWorkVector[2]->_pSysData[offset+1], amoebaGpu->psWorkVector[2]->_pSysData[offset+2] );
-                (void) fprintf( amoebaGpu->log,"W3 [%14.6e %14.6e %14.6e]\n",
-                                amoebaGpu->psWorkVector[3]->_pSysData[offset], amoebaGpu->psWorkVector[3]->_pSysData[offset+1], amoebaGpu->psWorkVector[3]->_pSysData[offset+2] );
-*/
-                if( ii == maxPrint && (ii < (gpu->natoms - maxPrint) ) ){
-                    ii =  (gpu->natoms - maxPrint);
-                    offset = 3*(ii+1);
-                } else {
-                    offset += 3;
-                }
-            }   
-            (void) fflush( amoebaGpu->log );
-        }
-#endif
        iteration++;
    }

    amoebaGpu->mutualInducedDone             = done;
    amoebaGpu->mutualInducedConverged        = ( !done || iteration > amoebaGpu->mutualInducedMaxIterations ) ? 0 : 1;
-
-#ifdef AMOEBA_DEBUG
-    if( 0 && amoebaGpu->log ){
-        trackMutualInducedIterations( amoebaGpu, iteration );
-    }
-
-    if( 0 ){
-        std::vector<int> fileId;
-        //fileId.push_back( 0 );
-        VectorOfDoubleVectors outputVector;
-        //cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,                    outputVector, NULL, 1.0f );
-        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipole,       outputVector, NULL, 1.0f );
-        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipolePolar,  outputVector, NULL, 1.0f );
-        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipoleS,      outputVector, NULL, 1.0f );
-        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipolePolarS, outputVector, NULL, 1.0f );
-        cudaWriteVectorOfDoubleVectorsToFile( "Cuda_GK_MI", fileId, outputVector );
-     }
-#endif
-
-   // ---------------------------------------------------------------------------------------
 }

 void cudaComputeAmoebaMutualInducedAndGkField( amoebaGpuContext amoebaGpu )

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.h
@@ -39,11 +39,7 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedAndGkFields, _kernel)(
                            float* outputField,
                            float* outputFieldPolar,
                            float* outputFieldS,
-                            float* outputFieldPolarS
-#ifdef AMOEBA_DEBUG
-                           , float4* debugArray, unsigned int targetAtom
-#endif
-){
+                            float* outputFieldPolarS){

    extern __shared__ MutualInducedParticle sA[];

@@ -112,11 +108,7 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedAndGkFields, _kernel)(

                // load coords, charge, ...

-                calculateMutualInducedAndGkFieldsPairIxn_kernel( localParticle, psA[j], ijField
-#ifdef AMOEBA_DEBUG
-,  debugArray
-#endif
-);
+                calculateMutualInducedAndGkFieldsPairIxn_kernel( localParticle, psA[j], ijField);

                unsigned int mask       =  ( (atomI == (y + j)) || (atomI >= cSim.atoms) || ((y+j) >= cSim.atoms) ) ? 0 : 1;

@@ -138,27 +130,7 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedAndGkFields, _kernel)(
                fieldPolarSumS[1]      += mask ? ijField[5][1] : 0.0f;
                fieldPolarSumS[2]      += mask ? ijField[5][2] : 0.0f;

-//#ifdef AMOEBA_DEBUG
-#if 0
-unsigned int index                 = y + j;
-if( atomI == targetAtom ){
-
-        debugArray[index].x                = (float) atomI;
-        debugArray[index].y                = (float) (y + j);
-        debugArray[index].z                = bornRadii[atomI];
-        debugArray[index].w                = jBornRadius;
-
-        index                              = debugAccumulate( index, debugArray, ijField[0],   mask, 1.0f );
-        index                              = debugAccumulate( index, debugArray, ijField[1],   mask, 2.0f );
-        index                              = debugAccumulate( index, debugArray, ijField[4],   mask, 3.0f );
-        index                              = debugAccumulate( index, debugArray, ijField[5],   mask, 4.0f );
-}
-#endif
-                calculateMutualInducedAndGkFieldsGkPairIxn_kernel( localParticle, psA[j], ijField
-#ifdef AMOEBA_DEBUG
-                                                                   , debugArray
-#endif
-);
+                calculateMutualInducedAndGkFieldsGkPairIxn_kernel( localParticle, psA[j], ijField);

                // atomI == atomJ contribution included

@@ -171,24 +143,6 @@ if( atomI == targetAtom ){
                fieldPolarSumS[1]      += mask ? ijField[2][1] : 0.0f;
                fieldPolarSumS[2]      += mask ? ijField[2][2] : 0.0f;

-//#ifdef AMOEBA_DEBUG
-#if 0
-if( atomI == targetAtom ){
-
-        index                              = debugAccumulate( index, debugArray, ijField[0],   mask, 5.0f );
-        index                              = debugAccumulate( index, debugArray, ijField[2],   mask, 6.0f );
-
-        index                              = debugAccumulate( index, debugArray, jDipoleS, 1, 7.0f );
-        index                              = debugAccumulate( index, debugArray, jDipolePolarS, 1, 8.0f );
-
-        index                             += cSim.paddedNumberOfAtoms;
-        debugArray[index].x                = bornRadii[atomI];
-        debugArray[index].y                = jBornRadius;
-        debugArray[index].w                = 9.0f;
-
-
-}
-#endif
            }

            // Write results
@@ -213,9 +167,8 @@ if( atomI == targetAtom ){
            load3dArray( offset, fieldPolarSumS,  outputFieldPolarS );
 #endif

-        }
-        else        // 100% utilization
-        {
+        } else {
+
            // Read fixed atom data into registers and GRF
            if (lasty != y)
            {
@@ -235,11 +188,7 @@ if( atomI == targetAtom ){

                // load coords, charge, ...

-                calculateMutualInducedAndGkFieldsPairIxn_kernel( localParticle, psA[tj], ijField
-#ifdef AMOEBA_DEBUG
-,  debugArray
-#endif
-   );
+                calculateMutualInducedAndGkFieldsPairIxn_kernel( localParticle, psA[tj], ijField);

                if( (atomI < cSim.atoms) && ((y+tj) < cSim.atoms) ){
           
@@ -289,29 +238,7 @@ if( atomI == targetAtom ){
    
                }

-//#ifdef AMOEBA_DEBUG
-#if 0
-unsigned int index                 = (atomI == targetAtom) ? (y + tj) : atomI;
-if( atomI == targetAtom  || (y + tj) == targetAtom ){
-        unsigned int indexI                = (atomI == targetAtom) ? 0 : 2;
-        unsigned int maskD                 = (atomI < cSim.atoms) && ((y+tj) < cSim.atoms);
-
-        debugArray[index].x                = (float) atomI;
-        debugArray[index].y                = (float) (y + tj);
-        debugArray[index].z                = bornRadii[atomI];
-        debugArray[index].w                = jBornRadius;
-
-        index                              = debugAccumulate( index, debugArray, ijField[indexI],   maskD, -1.0f );
-        index                              = debugAccumulate( index, debugArray, ijField[indexI+1], maskD, -2.0f );
-        index                              = debugAccumulate( index, debugArray, ijField[indexI+4], maskD, -3.0f );
-        index                              = debugAccumulate( index, debugArray, ijField[indexI+5], maskD, -4.0f );
-}
-#endif
-                calculateMutualInducedAndGkFieldsGkPairIxn_kernel( localParticle, psA[tj], ijField
-#ifdef AMOEBA_DEBUG
-,  debugArray
-#endif
-);
+                calculateMutualInducedAndGkFieldsGkPairIxn_kernel( localParticle, psA[tj], ijField);


                if( (atomI < cSim.atoms) && ((y+tj) < cSim.atoms) ){
@@ -337,24 +264,6 @@ if( atomI == targetAtom  || (y + tj) == targetAtom ){
                    psA[tj].fieldPolarS[2] += ijField[3][2];
                }
   
-//#ifdef AMOEBA_DEBUG
-#if 0
-if( atomI == targetAtom  || (y + tj) == targetAtom ){
-        unsigned int indexI                = (atomI == targetAtom) ? 0 : 1;
-        unsigned int maskD                 = (atomI < cSim.atoms) && ((y+tj) < cSim.atoms);
-
-        index                              = debugAccumulate( index, debugArray, ijField[indexI],   maskD, -5.0f );
-        index                              = debugAccumulate( index, debugArray, ijField[indexI+2], maskD, -6.0f );
-        index                              = debugAccumulate( index, debugArray, jDipoleS, 1, -7.0f );
-        index                              = debugAccumulate( index, debugArray, jDipolePolarS, 1, -8.0f );
-
-        index                             += cSim.paddedNumberOfAtoms;
-        debugArray[index].x                = bornRadii[atomI];
-        debugArray[index].y                = jBornRadius;
-        debugArray[index].w                = -9.0f;
-}
-#endif
-
                tj                  = (tj + 1) & (GRID - 1);

            }

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedField.cu
-//-----------------------------------------------------------------------------------------
-
-//-----------------------------------------------------------------------------------------
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */

 #include "amoebaGpuTypes.h"
 #include "amoebaCudaKernels.h"
@@ -33,19 +55,10 @@ void GetCalculateAmoebaCudaMutualInducedFieldSim(amoebaGpuContext amoebaGpu)
    RTERROR(status, "GetCalculateAmoebaCudaMutualInducedFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
 }

-//#define AMOEBA_DEBUG
-#undef AMOEBA_DEBUG
-
 #include "kCalculateAmoebaCudaMutualInducedParticle.h"

 __device__ void calculateMutualInducedFieldPairIxn_kernel( MutualInducedParticle& atomI, MutualInducedParticle& atomJ,
-                                                           float fields[4][3]
-
-#ifdef AMOEBA_DEBUG
-               , float4* debugArray
-#endif
-
- )
+                                                           float fields[4][3] )
 {

    float deltaR[3];
@@ -252,20 +265,6 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext
  
    gpuContext gpu    = amoebaGpu->gpuContext;

-#ifdef AMOEBA_DEBUG
-    int targetAtom    = 1231;
-    static const char* methodName = "cudaComputeAmoebaMutualInducedFieldMatrixMultiply";
-    static int iteration = 1;
-    if( 1 && amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log, "%s\n", methodName );
-        (void) fflush( amoebaGpu->log );
-    }
-    int paddedNumberOfAtoms                    = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
-    CUDAStream<float4>* debugArray             = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
-    memset( debugArray->_pSysData,      0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
-    debugArray->Upload();
-#endif
-
    kClearFields_3( amoebaGpu, 2 );

    if( threadsPerBlock == 0 ){  
@@ -279,169 +278,23 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext
        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(MutualInducedParticle), gpu->sharedMemoryPerBlock ), maxThreads);
    }   

-#ifdef AMOEBA_DEBUG
-        (void) fprintf( amoebaGpu->log, "%s numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n", methodName,
-                        gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
-                        sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*threadsPerBlock,
-                        (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
-        (void) fflush( amoebaGpu->log );
-#endif
-
    if (gpu->bOutputBufferPerWarp){
        kCalculateAmoebaMutualInducedFieldN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
                                                                 gpu->psWorkUnit->_pDevData,
                                                                 amoebaGpu->psWorkArray_3_1->_pDevData,
-#ifdef AMOEBA_DEBUG
-                                                                 amoebaGpu->psWorkArray_3_2->_pDevData,
-                                                                 debugArray->_pDevData, targetAtom );
-#else
                                                                 amoebaGpu->psWorkArray_3_2->_pDevData );
-#endif

    } else {

        kCalculateAmoebaMutualInducedFieldN2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
                                                                 gpu->psWorkUnit->_pDevData,
                                                                 amoebaGpu->psWorkArray_3_1->_pDevData,
-#ifdef AMOEBA_DEBUG
-                                                                 amoebaGpu->psWorkArray_3_2->_pDevData,
-                                                                 debugArray->_pDevData, targetAtom );
-#else
                                                                 amoebaGpu->psWorkArray_3_2->_pDevData );
-#endif
-
-
    }
    LAUNCHERROR("kCalculateAmoebaMutualInducedField");

    kReduceMutualInducedFields( amoebaGpu, outputArray, outputPolarArray );

-#ifdef AMOEBA_DEBUG
-    amoebaGpu->psWorkArray_3_1->Download();
-    amoebaGpu->psWorkArray_3_2->Download();
-
-    if( amoebaGpu->log && iteration == -1 ){
-        (void) fprintf( amoebaGpu->log, "Finished MI kernel execution %d\n", iteration ); (void) fflush( amoebaGpu->log );
-        outputArray->Download();
-        outputPolarArray->Download();
-        debugArray->Download();
-
-        int maxPrint        = 1400;
-        for( int ii = 0; ii < gpu->natoms; ii++ ){
-           (void) fprintf( amoebaGpu->log, "%5d ", ii); 
-
-            int indexOffset     = ii*3;
-    
-           // MI
-
-           (void) fprintf( amoebaGpu->log,"Mult[%16.9e %16.9e %16.9e] ",
-                           outputArray->_pSysData[indexOffset],
-                           outputArray->_pSysData[indexOffset+1],
-                           outputArray->_pSysData[indexOffset+2] );
-    
-           // MI polar
-
-           (void) fprintf( amoebaGpu->log,"MultP[%16.9e %16.9e %16.9e] ",
-                           outputPolarArray->_pSysData[indexOffset],
-                           outputPolarArray->_pSysData[indexOffset+1],
-                           outputPolarArray->_pSysData[indexOffset+2] );
-
-           // coords
-
-#if 0
-            (void) fprintf( amoebaGpu->log,"x[%16.9e %16.9e %16.9e] ",
-                            gpu->psPosq4->_pSysData[ii].x,
-                            gpu->psPosq4->_pSysData[ii].y,
-                            gpu->psPosq4->_pSysData[ii].z);
-
-
-           for( int jj = 0; jj < gpu->natoms && jj < 5; jj++ ){
-               int debugIndex = jj*gpu->natoms + ii;
-               float xx       =  gpu->psPosq4->_pSysData[jj].x -  gpu->psPosq4->_pSysData[ii].x;
-               float yy       =  gpu->psPosq4->_pSysData[jj].y -  gpu->psPosq4->_pSysData[ii].y;
-               float zz       =  gpu->psPosq4->_pSysData[jj].z -  gpu->psPosq4->_pSysData[ii].z;
-               (void) fprintf( amoebaGpu->log,"\n%4d %4d delta [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] ",
-                               ii, jj, xx, yy, zz,
-                               debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y, debugArray->_pSysData[debugIndex].z );
-
-           }
-#endif
-           if( ii == targetAtom ){
-               float sums[4][3] = { { 0.0f, 0.0f, 0.0f },
-                                    { 0.0f, 0.0f, 0.0f },
-                                    { 0.0f, 0.0f, 0.0f },
-                                    { 0.0f, 0.0f, 0.0f } };
-               (void) fprintf( amoebaGpu->log,"\n" );
-               int paddedNumberOfAtoms                    = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
-               unsigned int count                         = 0;
-               for( int jj = 0; jj < gpu->natoms; jj++ ){
-                   int debugIndex = jj;
-                   (void) fprintf( amoebaGpu->log,"%4d %4d Pint [%16.9e %16.9e %16.9e %16.9e] ",
-                                   ii, jj,
-                                   debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
-                                   debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
-
-                   //debugIndex += gpu->natoms;
-                   debugIndex += paddedNumberOfAtoms;
-                   (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e] ",
-                                   debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y, debugArray->_pSysData[debugIndex].z );
-
-                   int index = 0;
-                   sums[index][0] += debugArray->_pSysData[debugIndex].x; 
-                   sums[index][1] += debugArray->_pSysData[debugIndex].y; 
-                   sums[index][2] += debugArray->_pSysData[debugIndex].z; 
-                   
-                   if( count && ( (count % 31) == 0) ){
-                      static float saveSum[3] = { 0.0f, 0.0f, 0.0f };
-                      (void) fprintf( amoebaGpu->log,"Block sum [%16.9e %16.9e %16.9e] ",
-                                      sums[index][0] - saveSum[0], sums[index][1] - saveSum[1], sums[index][2] - saveSum[2] );
-                      saveSum[0] = sums[index][0];
-                      saveSum[1] = sums[index][1];
-                      saveSum[2] = sums[index][2];
-                     
-                   }
-                   
-
-                   debugIndex += paddedNumberOfAtoms;
-                   (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e] ",
-                                   debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y, debugArray->_pSysData[debugIndex].z );
-
-                   index++;
-                   sums[index][0] += debugArray->_pSysData[debugIndex].x; 
-                   sums[index][1] += debugArray->_pSysData[debugIndex].y; 
-                   sums[index][2] += debugArray->_pSysData[debugIndex].z; 
-
-                   if( count && ( (count % 31) == 0) ){
-                      static float saveSum[3] = { 0.0f, 0.0f, 0.0f };
-                      (void) fprintf( amoebaGpu->log,"Block sumP [%16.9e %16.9e %16.9e] ",
-                                      sums[index][0] - saveSum[0], sums[index][1] - saveSum[1], sums[index][2] - saveSum[2] );
-                      saveSum[0] = sums[index][0];
-                      saveSum[1] = sums[index][1];
-                      saveSum[2] = sums[index][2];
-                   }
-                   (void) fprintf( amoebaGpu->log,"\n" );
-                   count++;
-               }
-
-               (void) fprintf( amoebaGpu->log,"\n" );
-               int index = 0;
-               (void) fprintf( amoebaGpu->log,"Sum1 [%16.9e %16.9e %16.9e]\n", sums[index][0], sums[index][1],sums[index][2] ); index++;
-               (void) fprintf( amoebaGpu->log,"Sum2 [%16.9e %16.9e %16.9e]\n", sums[index][0], sums[index][1],sums[index][2] ); index++;
-               (void) fprintf( amoebaGpu->log,"Sum3 [%16.9e %16.9e %16.9e]\n", sums[index][0], sums[index][1],sums[index][2] ); index++;
-               (void) fprintf( amoebaGpu->log,"Sum4 [%16.9e %16.9e %16.9e]\n", sums[index][0], sums[index][1],sums[index][2] ); index++;
-           }
-           (void) fprintf( amoebaGpu->log,"\n" );
-           if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
-                ii = gpu->natoms - maxPrint;
-           }
-        }
-        (void) fflush( amoebaGpu->log );
-        iteration++;
-
-     }
-     delete debugArray;
-#endif
-
 }

 /**---------------------------------------------------------------------------------------
@@ -457,18 +310,6 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
  
   // ---------------------------------------------------------------------------------------

-#ifdef AMOEBA_DEBUG
-    static const char* methodName = "cudaComputeAmoebaMutualInducedFieldBySOR";
-    static int timestep = 0;
-    std::vector<int> fileId;
-    timestep++;
-    fileId.resize( 2 );
-    fileId[0] = timestep;
-    fileId[1] = 1;
-#endif
-
-   // ---------------------------------------------------------------------------------------
-
    int done;
    int iteration;

@@ -489,38 +330,6 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
    cudaMemcpy( amoebaGpu->psInducedDipole->_pDevData,        amoebaGpu->psE_Field->_pDevData,       3*gpu->sim.paddedNumberOfAtoms*sizeof( float ), cudaMemcpyDeviceToDevice );
    cudaMemcpy( amoebaGpu->psInducedDipolePolar->_pDevData,   amoebaGpu->psE_FieldPolar->_pDevData,  3*gpu->sim.paddedNumberOfAtoms*sizeof( float ), cudaMemcpyDeviceToDevice );

-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log ){
-
-        amoebaGpu->psE_Field->Download();
-        amoebaGpu->psE_FieldPolar->Download();
-        amoebaGpu->psInducedDipole->Download(),
-        amoebaGpu->psInducedDipolePolar->Download();
-        amoebaGpu->psPolarizability->Download();
-        (void) fprintf( amoebaGpu->log, "%s Initial setup for matrix multiply\n", methodName );
-        int offset   = 0;
-        int maxPrint = 20000;
-        for( int ii = 0; ii < gpu->natoms; ii++ ){
-            (void) fprintf( amoebaGpu->log, "%4d pol=%12.4e ", ii, 
-                            amoebaGpu->psPolarizability->_pSysData[offset] );
-            if( amoebaGpu->psPolarizability->_pSysData[offset] != amoebaGpu->psPolarizability->_pSysData[offset+1] ||
-                amoebaGpu->psPolarizability->_pSysData[offset] != amoebaGpu->psPolarizability->_pSysData[offset+2] ){
-                (void) fprintf( amoebaGpu->log, "PolX!!! %12.4e %12.4e ", amoebaGpu->psPolarizability->_pSysData[offset+1], amoebaGpu->psPolarizability->_pSysData[offset+2] ); 
-            }
-
-            (void) fprintf( amoebaGpu->log," E[%14.6e %14.6e %14.6e] Mi[%14.6e %14.6e %14.6e] ",
-                            amoebaGpu->psE_Field->_pSysData[offset],       amoebaGpu->psE_Field->_pSysData[offset+1],       amoebaGpu->psE_Field->_pSysData[offset+2],
-                            amoebaGpu->psInducedDipole->_pSysData[offset], amoebaGpu->psInducedDipole->_pSysData[offset+1], amoebaGpu->psInducedDipole->_pSysData[offset+2] );
-            (void) fprintf( amoebaGpu->log,"Ep[%14.6e %14.6e %14.6e] Mip[%14.6e %14.6e %14.6e]\n",
-                            amoebaGpu->psE_FieldPolar->_pSysData[offset],       amoebaGpu->psE_FieldPolar->_pSysData[offset+1],       amoebaGpu->psE_FieldPolar->_pSysData[offset+2],
-                            amoebaGpu->psInducedDipolePolar->_pSysData[offset], amoebaGpu->psInducedDipolePolar->_pSysData[offset+1], amoebaGpu->psInducedDipolePolar->_pSysData[offset+2] );
-            offset += 3;
-            if( ii == maxPrint && (ii < (gpu->natoms - maxPrint) ) )ii =  (gpu->natoms - maxPrint);
-        }   
-        (void) fflush( amoebaGpu->log );
-    }   
-#endif
-
    // if polarization type is direct, set flags signalling done and return

    if( amoebaGpu->amoebaSim.polarizationType )
@@ -558,12 +367,6 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
           amoebaGpu->psCurrentEpsilon->_pDevData );
        LAUNCHERROR("kReduceMutualInducedFieldDelta");

-#ifdef AMOEBA_DEBUG
-        if( 0 && amoebaGpu->log ){ // trackMutualInducedIterations
-            trackMutualInducedIterations( amoebaGpu, iteration);
-        }
-#endif
-
        // Debye=48.033324f
        amoebaGpu->psCurrentEpsilon->Download();
        float currentEpsilon          = amoebaGpu->psCurrentEpsilon->_pSysData[0];
@@ -572,54 +375,12 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
        if( iteration > amoebaGpu->mutualInducedMaxIterations || amoebaGpu->mutualInducedCurrentEpsilon < amoebaGpu->mutualInducedTargetEpsilon ){ 
            done = 1;
        }
-
-#ifdef AMOEBA_DEBUG
-        if( amoebaGpu->log ){
-           amoebaGpu->psInducedDipole->Download();
-           amoebaGpu->psInducedDipolePolar->Download();
-           (void) fprintf( amoebaGpu->log, "%s iteration=%3d eps %14.6e done=%d\n",
-                           methodName, iteration, amoebaGpu->mutualInducedCurrentEpsilon, done );
-           (void) fflush( amoebaGpu->log );
-
-            int offset   = 0;
-            int maxPrint = 20;
-            for( int ii = 0; ii < gpu->natoms; ii++ ){
-                (void) fprintf( amoebaGpu->log, "%4d ", ii ); 
-    
-                (void) fprintf( amoebaGpu->log," Mi[%14.6e %14.6e %14.6e] ",
-                                amoebaGpu->psInducedDipole->_pSysData[offset], amoebaGpu->psInducedDipole->_pSysData[offset+1], amoebaGpu->psInducedDipole->_pSysData[offset+2] );
-                (void) fprintf( amoebaGpu->log,"Mip[%14.6e %14.6e %14.6e]\n",
-                                amoebaGpu->psInducedDipolePolar->_pSysData[offset], amoebaGpu->psInducedDipolePolar->_pSysData[offset+1], amoebaGpu->psInducedDipolePolar->_pSysData[offset+2] );
-                if( ii == maxPrint && (ii < (gpu->natoms - maxPrint) ) ){
-                    ii =  (gpu->natoms - maxPrint);
-                    offset = 3*(ii+1);
-                } else {
-                    offset += 3;
-                }
-            }   
-            (void) fflush( amoebaGpu->log );
-        }
-#endif
        iteration++;
    }

    amoebaGpu->mutualInducedDone             = done;
    amoebaGpu->mutualInducedConverged        = ( !done || iteration > amoebaGpu->mutualInducedMaxIterations ) ? 0 : 1;

-#ifdef AMOEBA_DEBUG
-    if( 0 ){
-        std::vector<int> fileId;
-        //fileId.push_back( 0 );
-        VectorOfDoubleVectors outputVector;
-//        cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,                    outputVector, NULL, 1.0f );
-        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipole,      outputVector, NULL, 1.0f );
-        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipolePolar, outputVector, NULL, 1.0f );
-        cudaWriteVectorOfDoubleVectorsToFile( "CudaMI", fileId, outputVector );
-     }
-
-#endif
-
-   // ---------------------------------------------------------------------------------------
 }

 void cudaComputeAmoebaMutualInducedField( amoebaGpuContext amoebaGpu )