Replaced calculation of Born radii using OBC algorithm w/ Grycuk algorithm

TorsionTorsion grids reordered, if needed, so that the first angle is the 'slow' index. Fixes for MonteCarloBarostat.

Replaced calculation of Born radii using OBC algorithm w/ Grycuk algorithm
TorsionTorsion grids reordered, if needed, so that the first angle is the 'slow' index. Fixes for MonteCarloBarostat.
25058a77 · Mark Friedrichs · e772769e · 25058a77 · 25058a77 · 25058a77
Commit 25058a77 authored Sep 30, 2011 by Mark Friedrichs
16 changed files
--- a/plugins/amoeba/CMakeLists.txt
+++ b/plugins/amoeba/CMakeLists.txt
@@ -18,6 +18,7 @@

 # ----------------------------------------------------------------------------
 #SET(CREATE_SERIALIZABLE_OPENMM_AMOEBA OFF CACHE BOOL "Build verison of OpenMMAmoeba w/ backdoor serialization capability")
+#SET(CREATE_SERIALIZABLE_OPENMM_AMOEBA TRUE )
 SET(CREATE_SERIALIZABLE_OPENMM_AMOEBA FALSE )
 # ----------------------------------------------------------------------------


--- a/plugins/amoeba/openmmapi/include/openmm/AmoebaTorsionTorsionForce.h
+++ b/plugins/amoeba/openmmapi/include/openmm/AmoebaTorsionTorsionForce.h
@@ -41,6 +41,7 @@
 namespace OpenMM {

 typedef std::vector< std::vector< std::vector<double> > > TorsionTorsionGrid;
+typedef std::vector< std::vector< std::vector<float> > > TorsionTorsionGridFloat;

 /**
 * This class implements the Amoeba torsion-torsion interaction
@@ -198,6 +199,13 @@ public:
                }
            }
        }   
+/*
+        for( unsigned int kk = 0; kk < grid.size(); kk++ ){
+            for( unsigned int jj = 0; jj < grid[kk].size(); jj++ ){
+                fprintf( stderr, "xGrid %4d %4d %12.3f %12.3f  %15.7e  %15.7e\n", kk, jj,  grid[kk][jj][0], grid[kk][jj][1], grid[kk][jj][2], grid[kk][jj][2]/4.184 );
+            }
+        }
+*/

        _startValues[0] =  _grid[0][0][0];
        _startValues[1] =  _grid[0][0][1];

--- a/plugins/amoeba/openmmapi/include/openmm/internal/AmoebaTorsionTorsionForceImpl.h
+++ b/plugins/amoeba/openmmapi/include/openmm/internal/AmoebaTorsionTorsionForceImpl.h
@@ -60,6 +60,8 @@ public:
        return std::map<std::string, double>(); // This force field doesn't define any parameters.
    }
    std::vector<std::string> getKernelNames();
+
+    static void reorderGrid( const TorsionTorsionGrid& grid, TorsionTorsionGrid& reorderedGrid );
 private:
    AmoebaTorsionTorsionForce& owner;
    Kernel kernel;

--- a/plugins/amoeba/openmmapi/src/AmoebaTorsionTorsionForceImpl.cpp
+++ b/plugins/amoeba/openmmapi/src/AmoebaTorsionTorsionForceImpl.cpp
@@ -54,6 +54,121 @@ double AmoebaTorsionTorsionForceImpl::calcForcesAndEnergy(ContextImpl& context,
    return dynamic_cast<CalcAmoebaTorsionTorsionForceKernel&>(kernel.getImpl()).execute(context, includeForces, includeEnergy);
 }

+struct IntPair {             // grid indices <ii, jj> of one grid entry; filled in by reorderGrid()
+    unsigned int index1;     // first (outer) grid index ii
+    unsigned int index2;     // second (inner) grid index jj
+};
+
+typedef std::map< double, struct IntPair > Map_Double_IntPair; // angleY -> <ii, jj> grid indices
+typedef Map_Double_IntPair::iterator Map_Double_IntPairI;
+typedef Map_Double_IntPair::const_iterator Map_Double_IntPairCI;
+
+typedef std::map< double, Map_Double_IntPair > Map_Double_MapDoubleIntPair; // angleX -> ( angleY -> <ii, jj> )
+typedef Map_Double_MapDoubleIntPair::iterator Map_Double_MapDoubleIntPairI;
+typedef Map_Double_MapDoubleIntPair::const_iterator Map_Double_MapDoubleIntPairCI;
+
+/**
+ * Reorder a torsion-torsion grid so that the first angle (x) is the 'slow' index
+ * and the second angle (y) is the 'fast' index, both in ascending order.
+ *
+ * The reordered grid keeps the dimensions of the input grid; entries are visited in
+ * sorted (x, y) order and stored consecutively in row-major order, so the input is
+ * expected to be a rectangular grid whose entries all have the same length.
+ *
+ * @param grid           input grid; grid[ii][jj][0] and grid[ii][jj][1] are the x and y angles
+ * @param reorderedGrid  output grid; resized and overwritten
+ *
+ * @throws OpenMMException if an entry has fewer than two values, if an (x, y) angle pair
+ *                         occurs more than once, or if the sorted entries run out early
+ */
+void AmoebaTorsionTorsionForceImpl::reorderGrid( const TorsionTorsionGrid& grid, TorsionTorsionGrid& reorderedGrid ){
+
+    // (1) set dimensions for reordered grid
+    // (2) build map:
+    //         map[angleX][angleY] = <ii, jj> indices
+    //     std::map iterates its keys from least to greatest, so walking the
+    //     outer map and then each inner map yields entries sorted by (x, y)
+
+    Map_Double_MapDoubleIntPair mapAngles;
+    reorderedGrid.resize( grid.size() );
+    for (unsigned int ii = 0; ii < grid.size(); ii++) {
+
+        reorderedGrid[ii].resize( grid[ii].size() );
+        for (unsigned int jj = 0; jj < grid[ii].size(); jj++) {
+
+            // the angles are read from the first two values, so both must be present
+
+            if( grid[ii][jj].size() < 2 ){
+                char buffer[1024];
+                (void) sprintf( buffer, "TorsionTorsion grid reorder: entry has fewer than 2 values: size=%u; ii/jj indices=%u %u.\n",
+                                static_cast<unsigned int>(grid[ii][jj].size()), ii, jj );
+                throw OpenMMException(buffer);
+            }
+            reorderedGrid[ii][jj].resize( grid[ii][jj].size() );
+
+            const double angleX = grid[ii][jj][0];
+            const double angleY = grid[ii][jj][1];
+
+            // operator[] creates the (empty) y-angle map the first time angleX is seen
+
+            Map_Double_IntPair& map_Double_IntPair = mapAngles[angleX];
+            if( map_Double_IntPair.find( angleY ) != map_Double_IntPair.end() ){
+                char buffer[1024];
+                (void) sprintf( buffer, "TorsionTorsion grid reorder: angle pair found twice: %15.7e %15.7e %u\n", angleX, angleY, static_cast<unsigned int>(map_Double_IntPair.size()) );
+                throw OpenMMException(buffer);
+            }
+            struct IntPair pair;
+            pair.index1 = ii;
+            pair.index2 = jj;
+            map_Double_IntPair[angleY] = pair;
+        }
+    }
+
+    // load reordered entries: slot (ii,jj) of the output receives the next entry in
+    // sorted (x, y) order; inner maps are walked in place rather than copied
+
+    Map_Double_MapDoubleIntPairCI mapII = mapAngles.begin();
+    Map_Double_IntPairCI mapJJ;
+    if( mapII != mapAngles.end() ){
+        mapJJ = mapII->second.begin();
+    }
+
+    for (unsigned int ii = 0; ii < grid.size(); ii++) {
+        for (unsigned int jj = 0; jj < grid[ii].size(); jj++) {
+
+            // every slot needs a source entry; running out of map entries before the
+            // last slot is an internal error and must not be dereferenced
+
+            if( mapII == mapAngles.end() ){
+                char buffer[1024];
+                (void) sprintf( buffer, "AmoebaTorsionTorsionForceImpl::reorderGrid: error detected with map iterators.\n" );
+                throw OpenMMException(buffer);
+            }
+
+            const unsigned int index1 = mapJJ->second.index1;
+            const unsigned int index2 = mapJJ->second.index2;
+
+            // copy at full double precision; narrowing to float is left to callers
+
+            for (unsigned int kk = 0; kk < grid[ii][jj].size(); kk++) {
+                reorderedGrid[ii][jj][kk] = grid[index1][index2][kk];
+            }
+
+            // increment map iterators
+
+            ++mapJJ;
+            if( mapJJ == mapII->second.end() ){
+                ++mapII;
+                if( mapII != mapAngles.end() ){
+                    mapJJ = mapII->second.begin();
+                }
+            }
+        }
+    }
+
+    return;
+}
+
 std::vector<std::string> AmoebaTorsionTorsionForceImpl::getKernelNames() {
    std::vector<std::string> names;
    names.push_back(CalcAmoebaTorsionTorsionForceKernel::Name());

--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaData.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaData.cpp
@@ -37,6 +37,7 @@ AmoebaCudaData::AmoebaCudaData( CudaPlatform::PlatformData& data ) : cudaPlatfor
    hasAmoebaBonds                = false;
    hasAmoebaGeneralizedKirkwood  = false;
    hasAmoebaMultipole            = false;
+    useGrycuk                     = true;
    amoebaGpu                     = amoebaGpuInit( cudaPlatformData.gpu );
    localForceKernel              = NULL;
    log                           = NULL;
@@ -45,6 +46,10 @@ AmoebaCudaData::AmoebaCudaData( CudaPlatform::PlatformData& data ) : cudaPlatfor
    applyMultipoleCutoff          = 0;
    useVdwNeighborList            = 0;
    multipoleForceCount           = 0;
+
+    boxDimensions[0]              = 0.0;
+    boxDimensions[1]              = 0.0;
+    boxDimensions[2]              = 0.0;
 }   

 AmoebaCudaData::~AmoebaCudaData() {
@@ -74,6 +79,10 @@ bool AmoebaCudaData::getHasAmoebaMultipole( void ) const {
    return hasAmoebaMultipole;
 }

+int AmoebaCudaData::getUseGrycuk( void ) const {
+    return useGrycuk; // flag is set to true in the constructor; nonzero selects the Grycuk Born radii path
+}
+
 void AmoebaCudaData::setHasAmoebaGeneralizedKirkwood( bool inputHasAmoebaGeneralizedKirkwood ) {
    hasAmoebaGeneralizedKirkwood = inputHasAmoebaGeneralizedKirkwood;
 }
@@ -108,6 +117,7 @@ void AmoebaCudaData::setContextImpl( void* inputContextImpl ) {
 }

 void AmoebaCudaData::initializeGpu( void ) {
+
    if( !gpuInitialized ){
        if( getHasAmoebaGeneralizedKirkwood() && !getHasAmoebaMultipole() ){
            throw OpenMMException("GK force requires Multipole force\n");
@@ -117,16 +127,33 @@ void AmoebaCudaData::initializeGpu( void ) {
        amoebaGpuBuildThreadBlockWorkList( amoebaGpu );
        amoebaGpuBuildVdwExclusionList( amoebaGpu );
        amoebaGpuBuildScalingList( amoebaGpu );
-        amoebaGpuSetConstants( amoebaGpu );
+        amoebaGpuSetConstants( amoebaGpu, 0 );

-        gpuInitialized = true;
+        boxDimensions[0] = amoebaGpu->gpuContext->sim.periodicBoxSizeX;
+        boxDimensions[1] = amoebaGpu->gpuContext->sim.periodicBoxSizeY;
+        boxDimensions[2] = amoebaGpu->gpuContext->sim.periodicBoxSizeZ;
+
+        gpuInitialized   = true;

        if( log ){
            gpuPrintCudaAmoebaGmxSimulation( amoebaGpu, getLog() );
            (void) fprintf( log, "Gpu initialized\n" );
            (void) fflush( log );
        }
+    } else {
+
+        if( boxDimensions[0] != amoebaGpu->gpuContext->sim.periodicBoxSizeX ||
+            boxDimensions[1] != amoebaGpu->gpuContext->sim.periodicBoxSizeY ||
+            boxDimensions[2] != amoebaGpu->gpuContext->sim.periodicBoxSizeZ ){
+            amoebaGpuSetConstants( amoebaGpu, 1 );
+            
+            boxDimensions[0] = amoebaGpu->gpuContext->sim.periodicBoxSizeX;
+            boxDimensions[1] = amoebaGpu->gpuContext->sim.periodicBoxSizeY;
+            boxDimensions[2] = amoebaGpu->gpuContext->sim.periodicBoxSizeZ;
+        }
+
    }
+
    return;
 }


--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaData.h
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaData.h
@@ -76,6 +76,13 @@ public:
     */
    bool getHasAmoebaMultipole( void ) const;

+    /**
+     * Get the flag selecting the algorithm used to calculate Born radii
+     * 
+     * @return value of useGrycuk: nonzero selects the Grycuk algorithm, zero the OBC algorithm
+     */
+    int getUseGrycuk( void ) const;
+
    /**
     * Set value of hasAmoebaGeneralizedKirkwood
     * 
@@ -190,11 +197,13 @@ private:
    int multipoleForceCount;
    int applyMultipoleCutoff;
    int useVdwNeighborList;
+    int useGrycuk;
    KernelImpl* localForceKernel;
    unsigned int kernelCount;
    void* contextImpl;
    FILE* log;
    bool gpuInitialized;
+    double boxDimensions[3];
 };



--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
@@ -31,6 +31,7 @@
 #include "kernels/amoebaCudaKernels.h"
 #include "openmm/internal/AmoebaMultipoleForceImpl.h"
 #include "openmm/internal/AmoebaWcaDispersionForceImpl.h"
+#include "openmm/internal/AmoebaTorsionTorsionForceImpl.h"
 #include "openmm/internal/NonbondedForceImpl.h"
 #include "CudaForceInfo.h"

@@ -738,22 +739,39 @@ void CudaCalcAmoebaTorsionTorsionForceKernel::initialize(const System& system, c
    // torsion-torsion grids

    numTorsionTorsionGrids = force.getNumTorsionTorsionGrids();
-    std::vector< std::vector< std::vector< std::vector<float> > > > floatGrids;
+    std::vector<TorsionTorsionGridFloat> floatGrids;

    floatGrids.resize(numTorsionTorsionGrids);
    for (int gridIndex = 0; gridIndex < numTorsionTorsionGrids; gridIndex++) {

        const TorsionTorsionGrid& grid = force.getTorsionTorsionGrid( gridIndex );
-
        floatGrids[gridIndex].resize( grid.size() );
+
+        // check if grid needs to be reordered: x-angle should be 'slow' index
+
+        TorsionTorsionGrid reorderedGrid;
+        int reorder = 0;
+        if( grid[0][0][0] != grid[0][1][0] ){
+            AmoebaTorsionTorsionForceImpl::reorderGrid( grid, reorderedGrid );
+            reorder = 1;
+            if( data.getLog() ){
+                (void) fprintf( data.getLog(), "CudaCalcAmoebaTorsionTorsionForceKernel::initialize: reordering torsion-torsion grid %d.\n", gridIndex );
+            }
+        }
        for (unsigned int ii = 0; ii < grid.size(); ii++) {

            floatGrids[gridIndex][ii].resize( grid[ii].size() );
            for (unsigned int jj = 0; jj < grid[ii].size(); jj++) {

                floatGrids[gridIndex][ii][jj].resize( grid[ii][jj].size() );
-                for (unsigned int kk = 0; kk < grid[ii][jj].size(); kk++) {
-                    floatGrids[gridIndex][ii][jj][kk] = static_cast<float>(grid[ii][jj][kk]);
+                if( reorder ){
+                    for( unsigned int kk = 0; kk < grid[ii][jj].size(); kk++) {
+                        floatGrids[gridIndex][ii][jj][kk] = static_cast<float>(reorderedGrid[ii][jj][kk]);
+                    }
+                } else {
+                    for( unsigned int kk = 0; kk < grid[ii][jj].size(); kk++) {
+                        floatGrids[gridIndex][ii][jj][kk] = static_cast<float>(grid[ii][jj][kk]);
+                    }
                }
            }
        }
@@ -788,12 +806,17 @@ static void computeAmoebaMultipoleForce( AmoebaCudaData& data ) {

    data.initializeGpu();

-    // calculate Born radii
+    // calculate Born radii using either the Grycuk or OBC algorithm if GK is active

    if( data.getHasAmoebaGeneralizedKirkwood() ){
        kClearBornSum( gpu->gpuContext );
-        kCalculateObcGbsaBornSum(gpu->gpuContext);
-        kReduceObcGbsaBornSum(gpu->gpuContext);
+        if( data.getUseGrycuk() ){
+            kCalculateAmoebaGrycukBornRadii( gpu );
+            kReduceGrycukGbsaBornSum( gpu );
+        } else {
+            kCalculateObcGbsaBornSum(gpu->gpuContext);
+            kReduceObcGbsaBornSum(gpu->gpuContext);
+       }
    }   

    // multipoles
@@ -990,23 +1013,27 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
            zsize = pmeGridDimension[2];
            pmeParametersSetBasedOnEwaldErrorTolerance = 0;
        }
+
+        gpuSetAmoebaPMEParameters(data.getAmoebaGpu(), (float) alpha, xsize, ysize, zsize);
+
        if( data.getLog() ){
-            (void) fprintf( data.getLog(), "AmoebaMultipoleForce: PME parameters tol=%12.3e cutoff=%12.3f alpha=%12.3f [%d %d %d] -",
+            (void) fprintf( data.getLog(), "AmoebaMultipoleForce: PME parameters tol=%12.3e cutoff=%12.3f alpha=%12.3f [%d %d %d]\n",
                            force.getEwaldErrorTolerance(), force.getCutoffDistance(),  alpha, xsize, ysize, zsize );
            if( pmeParametersSetBasedOnEwaldErrorTolerance  ){
-                 (void) fprintf( data.getLog(), " parameters set based on error tolerance and OpenMM algorithm.\n" );
+                 (void) fprintf( data.getLog(), "Parameters based on error tolerance and OpenMM algorithm.\n" );
            } else {
                 double alphaT;
                 int xsizeT, ysizeT, zsizeT;
                 NonbondedForceImpl::calcPMEParameters(system, nb, alphaT, xsizeT, ysizeT, zsizeT);
                 double impliedTolerance  = alpha*force.getCutoffDistance();
                        impliedTolerance  = 0.5*exp( -(impliedTolerance*impliedTolerance) );
-                 (void) fprintf( data.getLog(), " using input parameters implied tolerance=%12.3e;", impliedTolerance );
+                 (void) fprintf( data.getLog(), "Using input parameters implied tolerance=%12.3e;", impliedTolerance );
                 (void) fprintf( data.getLog(), "OpenMM param: aEwald=%12.3f [%6d %6d %6d]\n", alphaT, xsizeT, ysizeT, zsizeT);
            }
+            (void) fprintf( data.getLog(), "\n" );
            (void) fflush( data.getLog() );
        }
-        gpuSetAmoebaPMEParameters(data.getAmoebaGpu(), (float) alpha, xsize, ysize, zsize);
+
        data.setApplyMultipoleCutoff( 1 );

        data.cudaPlatformData.nonbondedMethod = PARTICLE_MESH_EWALD;
@@ -1067,12 +1094,25 @@ void CudaCalcAmoebaGeneralizedKirkwoodForceKernel::initialize(const System& syst
        scale[ii]   = static_cast<float>( scalingFactor );
        charge[ii]  = static_cast<float>( particleCharge );
    }   
-    gpuSetAmoebaObcParameters( data.getAmoebaGpu(), static_cast<float>(force.getSoluteDielectric() ), 
-                               static_cast<float>( force.getSolventDielectric() ), 
-                               static_cast<float>( force.getDielectricOffset() ), radius, scale, charge,
-                               force.getIncludeCavityTerm(),
-                               static_cast<float>( force.getProbeRadius() ), 
-                               static_cast<float>( force.getSurfaceAreaFactor() ) ); 
+    if( data.getUseGrycuk() ){
+
+        gpuSetAmoebaGrycukParameters( data.getAmoebaGpu(), static_cast<float>(force.getSoluteDielectric() ), 
+                                      static_cast<float>( force.getSolventDielectric() ), 
+                                      static_cast<float>( force.getDielectricOffset() ), radius, scale, charge,
+                                      force.getIncludeCavityTerm(),
+                                      static_cast<float>( force.getProbeRadius() ), 
+                                      static_cast<float>( force.getSurfaceAreaFactor() ) ); 
+        
+    } else {
+
+        gpuSetAmoebaObcParameters( data.getAmoebaGpu(), static_cast<float>(force.getSoluteDielectric() ), 
+                                   static_cast<float>( force.getSolventDielectric() ), 
+                                   static_cast<float>( force.getDielectricOffset() ), radius, scale, charge,
+                                   force.getIncludeCavityTerm(),
+                                   static_cast<float>( force.getProbeRadius() ), 
+                                   static_cast<float>( force.getSurfaceAreaFactor() ) ); 
+
+    }
    data.getAmoebaGpu()->gpuContext->forces.push_back(new ForceInfo(force));
 }


--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaGpu.cpp
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaGpu.cpp
@@ -32,12 +32,16 @@
 #ifdef WIN32
  #define _USE_MATH_DEFINES /* M_PI */
 #endif
+
+#define PARAMETER_PRINT 1
+#define MAX_PARAMETER_PRINT 10
+
 #include "openmm/OpenMMException.h"
 #include "cudaKernels.h"
 #include "amoebaCudaKernels.h"

 // for some reason, these are not being included w/ cudaKernels.h on Windows
-extern void OPENMMCUDA_EXPORT SetCalculateObcGbsaForces2Sim(gpuContext gpu);
+//extern void OPENMMCUDA_EXPORT SetCalculateObcGbsaForces2Sim(gpuContext gpu);
 extern void OPENMMCUDA_EXPORT SetForcesSim(gpuContext gpu);

 #include <cmath>
@@ -503,20 +507,33 @@ void gpuSetAmoebaBondParameters(amoebaGpuContext amoebaGpu, const std::vector<in
        (*psBondID)[i].w         = gpu->pOutputBufferCounter[(*psBondID)[i].y]++;
        (*psBondParameter)[i].x  = length[i];
        (*psBondParameter)[i].y  = k[i];
+    }

-#undef DUMP_PARAMETERS
-#define DUMP_PARAMETERS 5
-#if (DUMP_PARAMETERS > 0 )                
-if( amoebaGpu->log && (i < DUMP_PARAMETERS || i > bonds - (DUMP_PARAMETERS + 1)  ) )
-        fprintf( amoebaGpu->log, "Bonds: %5d [%5d %5d %5d %5d] L=%15.7e k[%15.7e %15.7e %15.7e] [%5d %5d]\n",
-            i, (*psBondID)[i].x, (*psBondID)[i].y, (*psBondID)[i].z, (*psBondID)[i].w, 
-            (*psBondParameter)[i].x, (*psBondParameter)[i].y, cubic, quartic, 
-            gpu->pOutputBufferCounter[(*psBondID)[i].x],
-            gpu->pOutputBufferCounter[(*psBondID)[i].y] );
-#endif
-#undef DUMP_PARAMETERS
+    // logging info

+    if( amoebaGpu->log ){
+        unsigned int maxPrint = MAX_PARAMETER_PRINT;
+        unsigned int maxIndex = bonds;
+
+        (void) fprintf( amoebaGpu->log, "gpuSetAmoebaBondParameters: number of bonds=%5d cubicK=%15.7e quarticK=%15.7e\n", bonds, cubic, quartic );
+#ifdef PARAMETER_PRINT
+        for( unsigned int ii = 0; ii < maxIndex; ii++ ){
+            (void) fprintf( amoebaGpu->log, "    %5d [%5d %5d %5d %5d] l=%15.7e k=%15.7e counters: [%5d %5d]\n",
+                            ii, (*psBondID)[ii].x, (*psBondID)[ii].y, (*psBondID)[ii].z, (*psBondID)[ii].w, 
+                            (*psBondParameter)[ii].x, (*psBondParameter)[ii].y,
+                            gpu->pOutputBufferCounter[(*psBondID)[ii].x],
+                            gpu->pOutputBufferCounter[(*psBondID)[ii].y] );
+            if( ii == maxPrint ){
+                (void) fprintf( amoebaGpu->log, "\n" );
+                ii = maxIndex - maxPrint;
+                if( ii < maxPrint )ii = maxPrint;
+            }
+        }
+        (void) fprintf( amoebaGpu->log, "\n" );
+#endif
+        (void) fflush( amoebaGpu->log );
    }
+
    psBondID->Upload();
    psBondParameter->Upload();
 }
@@ -548,19 +565,34 @@ void gpuSetAmoebaUreyBradleyParameters(amoebaGpuContext amoebaGpu, const std::ve
        (*psUreyBradleyParameter)[i].x  = length[i];
        (*psUreyBradleyParameter)[i].y  = k[i];

-#undef DUMP_PARAMETERS
-#define DUMP_PARAMETERS 5
-#if (DUMP_PARAMETERS > 0 )                
-if( amoebaGpu->log && (i < DUMP_PARAMETERS || i > bonds - (DUMP_PARAMETERS + 1)  ) )
-        fprintf( amoebaGpu->log, "UreyBradleys: %5d [%5d %5d %5d %5d] L=%15.7e k[%15.7e %15.7e %15.7e] [%5d %5d]\n",
-            i, (*psUreyBradleyID)[i].x, (*psUreyBradleyID)[i].y, (*psUreyBradleyID)[i].z, (*psUreyBradleyID)[i].w, 
-            (*psUreyBradleyParameter)[i].x, (*psUreyBradleyParameter)[i].y, cubic, quartic, 
-            gpu->pOutputBufferCounter[(*psUreyBradleyID)[i].x],
-            gpu->pOutputBufferCounter[(*psUreyBradleyID)[i].y] );
-#endif
-#undef DUMP_PARAMETERS
+    }

+    // logging info
+
+    if( amoebaGpu->log ){
+
+        unsigned int maxPrint = MAX_PARAMETER_PRINT;
+        unsigned int maxIndex = bonds;
+
+        (void) fprintf( amoebaGpu->log, "gpuSetAmoebaUreyBradleyParameters: number of bonds=%5d cubicK==%15.7e quarticK=%15.7e\n", bonds, cubic, quartic );
+#ifdef PARAMETER_PRINT
+        for( unsigned int ii = 0; ii < maxIndex; ii++ ){
+            (void) fprintf( amoebaGpu->log, "    %5d [%5d %5d %5d %5d] L=%15.7e k=%15.7e counters: [%5d %5d]\n",
+                            ii, (*psUreyBradleyID)[ii].x, (*psUreyBradleyID)[ii].y, (*psUreyBradleyID)[ii].z, (*psUreyBradleyID)[ii].w, 
+            (*psUreyBradleyParameter)[ii].x, (*psUreyBradleyParameter)[ii].y,
+            gpu->pOutputBufferCounter[(*psUreyBradleyID)[ii].x],
+            gpu->pOutputBufferCounter[(*psUreyBradleyID)[ii].y] );
+            if( ii == maxPrint ){
+                (void) fprintf( amoebaGpu->log, "\n" );
+                ii = maxIndex - maxPrint;
+                if( ii < maxPrint )ii = maxPrint;
+            }
+        }
+        (void) fprintf( amoebaGpu->log, "\n" );
+#endif
+        (void) fflush( amoebaGpu->log );
    }
+
    psUreyBradleyID->Upload();
    psUreyBradleyParameter->Upload();
 }
@@ -602,21 +634,37 @@ void gpuSetAmoebaAngleParameters(amoebaGpuContext amoebaGpu, const std::vector<i
        psAngleID1->_pSysData[i].w = gpu->pOutputBufferCounter[psAngleID1->_pSysData[i].x]++;
        psAngleID2->_pSysData[i].x = gpu->pOutputBufferCounter[psAngleID1->_pSysData[i].y]++;
        psAngleID2->_pSysData[i].y = gpu->pOutputBufferCounter[psAngleID1->_pSysData[i].z]++;
+    }
+
+    // logging info
+
+    if( amoebaGpu->log ){

-#undef DUMP_PARAMETERS
-#define DUMP_PARAMETERS 5
-#if (DUMP_PARAMETERS > 0 )
-if( (i < DUMP_PARAMETERS || i > bond_angles - (DUMP_PARAMETERS + 1)) && amoebaGpu->log )
-         fprintf( amoebaGpu->log, "Angles: %5d [%5d %5d %5d %5d] [%5d %5d] A=%15.7e k=%15.7e [%5d %5d %5d]\n", i, 
-                  (*psAngleID1)[i].x, (*psAngleID1)[i].y, (*psAngleID1)[i].z, (*psAngleID1)[i].w,
-                  (*psAngleID2)[i].x, (*psAngleID2)[i].y,
-                  (*psAngleParameter)[i].x, (*psAngleParameter)[i].y,
-                  gpu->pOutputBufferCounter[psAngleID1->_pSysData[i].x],
-                  gpu->pOutputBufferCounter[psAngleID1->_pSysData[i].y],
-                  gpu->pOutputBufferCounter[psAngleID1->_pSysData[i].z] );
+        unsigned int maxPrint = MAX_PARAMETER_PRINT;
+        unsigned int maxIndex = bond_angles;
+
+        (void) fprintf( amoebaGpu->log, "gpuSetAmoebaAngleParameters: number of angles=%5d cubicK=%15.7e quarticK=%15.7e penticK=%15.7e sexticK=%15.7e\n",
+                        bond_angles, cubicK, quarticK, penticK, sexticK );
+#ifdef PARAMETER_PRINT
+        for( unsigned int ii = 0; ii < maxIndex; ii++ ){
+            (void) fprintf( amoebaGpu->log, "    %5d [%5d %5d %5d %5d] [%5d %5d] A=%15.7e k=%15.7e [%5d %5d %5d]\n", ii, 
+                           (*psAngleID1)[ii].x, (*psAngleID1)[ii].y, (*psAngleID1)[ii].z, (*psAngleID1)[ii].w,
+                           (*psAngleID2)[ii].x, (*psAngleID2)[ii].y,
+                           (*psAngleParameter)[ii].x, (*psAngleParameter)[ii].y,
+                           gpu->pOutputBufferCounter[psAngleID1->_pSysData[ii].x],
+                           gpu->pOutputBufferCounter[psAngleID1->_pSysData[ii].y],
+                           gpu->pOutputBufferCounter[psAngleID1->_pSysData[ii].z] );
+            if( ii == maxPrint ){
+                (void) fprintf( amoebaGpu->log, "\n" );
+                ii = maxIndex - maxPrint;
+                if( ii < maxPrint )ii = maxPrint;
+            }
+        }
+        (void) fprintf( amoebaGpu->log, "\n" );
 #endif
-#undef DUMP_PARAMETERS
+        (void) fflush( amoebaGpu->log );
    }
+
    psAngleID1->Upload();
    psAngleID2->Upload();
    psAngleParameter->Upload();
@@ -662,21 +710,36 @@ void gpuSetAmoebaInPlaneAngleParameters(amoebaGpuContext amoebaGpu, const std::v
        psAngleID2->_pSysData[i].y = gpu->pOutputBufferCounter[psAngleID1->_pSysData[i].y]++;
        psAngleID2->_pSysData[i].z = gpu->pOutputBufferCounter[psAngleID1->_pSysData[i].z]++;
        psAngleID2->_pSysData[i].w = gpu->pOutputBufferCounter[psAngleID1->_pSysData[i].w]++;
+    }
+
+    // logging info
+
+    if( amoebaGpu->log ){

-#undef DUMP_PARAMETERS
-#define DUMP_PARAMETERS 10
-#if (DUMP_PARAMETERS > 0 )
-if( (i < DUMP_PARAMETERS || i > bond_angles - (DUMP_PARAMETERS + 1)) && amoebaGpu->log )
-         fprintf( amoebaGpu->log, "InPlaneAngles: %5d [%5d %5d %5d %5d] [%5d %5d %5d %5d] A=%15.7e k=%15.7e [%5d %5d %5d %5d]\n", i, 
-                  (*psAngleID1)[i].x, (*psAngleID1)[i].y, (*psAngleID1)[i].z, (*psAngleID1)[i].w,
-                  (*psAngleID2)[i].x, (*psAngleID2)[i].y, (*psAngleID2)[i].z, (*psAngleID2)[i].w,
-                  (*psAngleParameter)[i].x, (*psAngleParameter)[i].y,
-                  gpu->pOutputBufferCounter[psAngleID1->_pSysData[i].x],
-                  gpu->pOutputBufferCounter[psAngleID1->_pSysData[i].y],
-                  gpu->pOutputBufferCounter[psAngleID1->_pSysData[i].z],
-                  gpu->pOutputBufferCounter[psAngleID1->_pSysData[i].w] );
+        unsigned int maxPrint = MAX_PARAMETER_PRINT;
+        unsigned int maxIndex = bond_angles;
+
+        (void) fprintf( amoebaGpu->log, "gpuSetAmoebaInPlaneAngleParameters: number of angles=%5d cubicK=%15.7e quarticK=%15.7e penticK==%15.7e sexticK=%15.7e\n",
+                        bond_angles, cubicK, quarticK, penticK, sexticK );
+#ifdef PARAMETER_PRINT
+        for( unsigned int ii = 0; ii < maxIndex; ii++ ){
+            (void) fprintf( amoebaGpu->log, "    %5d [%5d %5d %5d %5d] [%5d %5d %5d %5d] A=%15.7e k=%15.7e [%5d %5d %5d %5d]\n", ii, 
+                            (*psAngleID1)[ii].x, (*psAngleID1)[ii].y, (*psAngleID1)[ii].z, (*psAngleID1)[ii].w,
+                            (*psAngleID2)[ii].x, (*psAngleID2)[ii].y, (*psAngleID2)[ii].z, (*psAngleID2)[ii].w,
+                            (*psAngleParameter)[ii].x, (*psAngleParameter)[ii].y,
+                            gpu->pOutputBufferCounter[psAngleID1->_pSysData[ii].x],
+                            gpu->pOutputBufferCounter[psAngleID1->_pSysData[ii].y],
+                            gpu->pOutputBufferCounter[psAngleID1->_pSysData[ii].z],
+                            gpu->pOutputBufferCounter[psAngleID1->_pSysData[ii].w] );
+            if( ii == maxPrint ){
+                (void) fprintf( amoebaGpu->log, "\n" );
+                ii = maxIndex - maxPrint;
+                if( ii < maxPrint )ii = maxPrint;
+            }
+        }
+        (void) fprintf( amoebaGpu->log, "\n" );
 #endif
-#undef DUMP_PARAMETERS
+        (void) fflush( amoebaGpu->log );
    }

    psAngleID1->Upload();
@@ -731,24 +794,37 @@ void gpuSetAmoebaTorsionParameters(amoebaGpuContext amoebaGpu, const std::vector
        psTorsionID2->_pSysData[i].y = gpu->pOutputBufferCounter[psTorsionID1->_pSysData[i].y]++;
        psTorsionID2->_pSysData[i].z = gpu->pOutputBufferCounter[psTorsionID1->_pSysData[i].z]++;
        psTorsionID2->_pSysData[i].w = gpu->pOutputBufferCounter[psTorsionID1->_pSysData[i].w]++;
+    }
+
+    // logging info
+
+    if( amoebaGpu->log ){

-#undef DUMP_PARAMETERS
-#define DUMP_PARAMETERS 5
-#if (DUMP_PARAMETERS > 0 )
-if( (i < DUMP_PARAMETERS || i > torsions - (DUMP_PARAMETERS + 1)) && amoebaGpu->log )
-         fprintf( amoebaGpu->log, "Torsions: %5d [%5d %5d %5d %5d] [%5d %5d %5d %5d] 0[%15.7e %15.7e] 1[%15.7e %15.7e] 2[%15.7e %15.7e] [%5d %5d %5d %5d]\n", i, 
-                  (*psTorsionID1)[i].x, (*psTorsionID1)[i].y, (*psTorsionID1)[i].z, (*psTorsionID1)[i].w,
-                  (*psTorsionID2)[i].x, (*psTorsionID2)[i].y, (*psTorsionID2)[i].z, (*psTorsionID2)[i].w,
-                  (*psTorsionParameter1)[i].x, (*psTorsionParameter1)[i].y, (*psTorsionParameter1)[i].z, (*psTorsionParameter1)[i].w,
-                  (*psTorsionParameter2)[i].x, (*psTorsionParameter2)[i].y,
-                  gpu->pOutputBufferCounter[psTorsionID1->_pSysData[i].x],
-                  gpu->pOutputBufferCounter[psTorsionID1->_pSysData[i].y],
-                  gpu->pOutputBufferCounter[psTorsionID1->_pSysData[i].z],
-                  gpu->pOutputBufferCounter[psTorsionID1->_pSysData[i].w] );
+        unsigned int maxPrint = MAX_PARAMETER_PRINT;
+        unsigned int maxIndex = torsions;
+
+        (void) fprintf( amoebaGpu->log, "gpuSetAmoebaTorsionParameters: number of torsions=%5d\n", torsions );
+#ifdef PARAMETER_PRINT
+        for( unsigned int ii = 0; ii < maxIndex; ii++ ){
+            (void) fprintf( amoebaGpu->log, "    %5d [%5d %5d %5d %5d] [%5d %5d %5d %5d] 0[%15.7e %15.7e] 1[%15.7e %15.7e] 2[%15.7e %15.7e] [%5d %5d %5d %5d]\n", ii, 
+                  (*psTorsionID1)[ii].x, (*psTorsionID1)[ii].y, (*psTorsionID1)[ii].z, (*psTorsionID1)[ii].w,
+                  (*psTorsionID2)[ii].x, (*psTorsionID2)[ii].y, (*psTorsionID2)[ii].z, (*psTorsionID2)[ii].w,
+                  (*psTorsionParameter1)[ii].x, (*psTorsionParameter1)[ii].y, (*psTorsionParameter1)[ii].z, (*psTorsionParameter1)[ii].w,
+                  (*psTorsionParameter2)[ii].x, (*psTorsionParameter2)[ii].y,
+                  gpu->pOutputBufferCounter[psTorsionID1->_pSysData[ii].x],
+                  gpu->pOutputBufferCounter[psTorsionID1->_pSysData[ii].y],
+                  gpu->pOutputBufferCounter[psTorsionID1->_pSysData[ii].z],
+                  gpu->pOutputBufferCounter[psTorsionID1->_pSysData[ii].w] );
+            if( ii == maxPrint ){
+                (void) fprintf( amoebaGpu->log, "\n" );
+                ii = maxIndex - maxPrint;
+                if( ii < maxPrint )ii = maxPrint;
+            }
+        }
+        (void) fprintf( amoebaGpu->log, "\n" );
 #endif
-#undef DUMP_PARAMETERS
+        (void) fflush( amoebaGpu->log );
    }
-
    psTorsionID1->Upload();
    psTorsionID2->Upload();
    psTorsionParameter1->Upload();
@@ -799,24 +875,38 @@ void gpuSetAmoebaPiTorsionParameters(amoebaGpuContext amoebaGpu, const std::vect
        psPiTorsionID3->_pSysData[i].y = gpu->pOutputBufferCounter[psPiTorsionID1->_pSysData[i].w]++;
        psPiTorsionID3->_pSysData[i].z = gpu->pOutputBufferCounter[psPiTorsionID2->_pSysData[i].x]++;
        psPiTorsionID3->_pSysData[i].w = gpu->pOutputBufferCounter[psPiTorsionID2->_pSysData[i].y]++;
+    }
+
+    // logging info
+
+    if( amoebaGpu->log ){

-#undef DUMP_PARAMETERS
-#define DUMP_PARAMETERS 5
-#if (DUMP_PARAMETERS > 0 )
-if( (i < DUMP_PARAMETERS || i > piTorsions - (DUMP_PARAMETERS + 1)) && amoebaGpu->log )
-         fprintf( amoebaGpu->log, "PiTorsions: %5d [%5d %5d %5d %5d %5d %5d [%5d %5d %5d %5d %5d %5d]  k=%15.7e [%5d %5d %5d %5d %5d %5d]\n", i, 
-                  (*psPiTorsionID1)[i].x, (*psPiTorsionID1)[i].y, (*psPiTorsionID1)[i].z, (*psPiTorsionID1)[i].w,
-                  (*psPiTorsionID2)[i].x, (*psPiTorsionID2)[i].y, (*psPiTorsionID2)[i].z, (*psPiTorsionID2)[i].w,
-                  (*psPiTorsionID3)[i].x, (*psPiTorsionID3)[i].y, (*psPiTorsionID3)[i].z, (*psPiTorsionID3)[i].w,
-                  (*psPiTorsionParameter)[i],
-                  gpu->pOutputBufferCounter[psPiTorsionID1->_pSysData[i].x],
-                  gpu->pOutputBufferCounter[psPiTorsionID1->_pSysData[i].y],
-                  gpu->pOutputBufferCounter[psPiTorsionID1->_pSysData[i].z],
-                  gpu->pOutputBufferCounter[psPiTorsionID1->_pSysData[i].w],
-                  gpu->pOutputBufferCounter[psPiTorsionID2->_pSysData[i].x],
-                  gpu->pOutputBufferCounter[psPiTorsionID2->_pSysData[i].y] );
+        unsigned int maxPrint = MAX_PARAMETER_PRINT;
+        unsigned int maxIndex = piTorsions;
+
+        (void) fprintf( amoebaGpu->log, "gpuSetAmoebaPiTorsionParameters: number of pi torsions=%5d\n", piTorsions );
+#ifdef PARAMETER_PRINT
+        for( unsigned int ii = 0; ii < maxIndex; ii++ ){
+         fprintf( amoebaGpu->log, "PiTorsions: %5d [%5d %5d %5d %5d %5d %5d [%5d %5d %5d %5d %5d %5d]  k=%15.7e [%5d %5d %5d %5d %5d %5d]\n", ii, 
+                  (*psPiTorsionID1)[ii].x, (*psPiTorsionID1)[ii].y, (*psPiTorsionID1)[ii].z, (*psPiTorsionID1)[ii].w,
+                  (*psPiTorsionID2)[ii].x, (*psPiTorsionID2)[ii].y, (*psPiTorsionID2)[ii].z, (*psPiTorsionID2)[ii].w,
+                  (*psPiTorsionID3)[ii].x, (*psPiTorsionID3)[ii].y, (*psPiTorsionID3)[ii].z, (*psPiTorsionID3)[ii].w,
+                  (*psPiTorsionParameter)[ii],
+                  gpu->pOutputBufferCounter[psPiTorsionID1->_pSysData[ii].x],
+                  gpu->pOutputBufferCounter[psPiTorsionID1->_pSysData[ii].y],
+                  gpu->pOutputBufferCounter[psPiTorsionID1->_pSysData[ii].z],
+                  gpu->pOutputBufferCounter[psPiTorsionID1->_pSysData[ii].w],
+                  gpu->pOutputBufferCounter[psPiTorsionID2->_pSysData[ii].x],
+                  gpu->pOutputBufferCounter[psPiTorsionID2->_pSysData[ii].y] );
+            if( ii == maxPrint ){
+                (void) fprintf( amoebaGpu->log, "\n" );
+                ii = maxIndex - maxPrint;
+                if( ii < maxPrint )ii = maxPrint;
+            }
+        }
+        (void) fprintf( amoebaGpu->log, "\n" );
 #endif
-#undef DUMP_PARAMETERS
+        (void) fflush( amoebaGpu->log );
    }

    psPiTorsionID1->Upload();
@@ -859,21 +949,36 @@ void gpuSetAmoebaStretchBendParameters(amoebaGpuContext amoebaGpu, const std::ve
        psStretchBendID1->_pSysData[i].w = gpu->pOutputBufferCounter[psStretchBendID1->_pSysData[i].x]++;
        psStretchBendID2->_pSysData[i].x = gpu->pOutputBufferCounter[psStretchBendID1->_pSysData[i].y]++;
        psStretchBendID2->_pSysData[i].y = gpu->pOutputBufferCounter[psStretchBendID1->_pSysData[i].z]++;
+    }

-#undef DUMP_PARAMETERS
-#define DUMP_PARAMETERS 5
-#if (DUMP_PARAMETERS > 0 )
-if( (i < DUMP_PARAMETERS || i > stretchBends - (DUMP_PARAMETERS + 1)) && amoebaGpu->log )
-         fprintf( amoebaGpu->log, "StretchBends: %5d [%5d %5d %5d] [%5d %5d %5d] [%15.7e %15.7e %15.7e %15.7e [%5d %5d %5d]\n", i, 
-                  (*psStretchBendID1)[i].x, (*psStretchBendID1)[i].y, (*psStretchBendID1)[i].z, (*psStretchBendID1)[i].w,
-                  (*psStretchBendID2)[i].x, (*psStretchBendID2)[i].y,
-                  (*psStretchBendParameter)[i].x, (*psStretchBendParameter)[i].y, (*psStretchBendParameter)[i].z, (*psStretchBendParameter)[i].w,
-                  gpu->pOutputBufferCounter[psStretchBendID1->_pSysData[i].x],
-                  gpu->pOutputBufferCounter[psStretchBendID1->_pSysData[i].y],
-                  gpu->pOutputBufferCounter[psStretchBendID1->_pSysData[i].z] );
+    // logging info
+
+    if( amoebaGpu->log ){
+
+        unsigned int maxPrint = MAX_PARAMETER_PRINT;
+        unsigned int maxIndex = stretchBends;
+
+        (void) fprintf( amoebaGpu->log, "gpuSetAmoebaStretchBendParameters: number of stretch bends=%5d\n", stretchBends);
+#ifdef PARAMETER_PRINT
+        for( unsigned int ii = 0; ii < maxIndex; ii++ ){
+            (void) fprintf( amoebaGpu->log, " %5d [%5d %5d %5d] [%5d %5d %5d] [%15.7e %15.7e %15.7e %15.7e [%5d %5d %5d]\n", ii, 
+                            (*psStretchBendID1)[ii].x, (*psStretchBendID1)[ii].y, (*psStretchBendID1)[ii].z, (*psStretchBendID1)[ii].w,
+                            (*psStretchBendID2)[ii].x, (*psStretchBendID2)[ii].y,
+                            (*psStretchBendParameter)[ii].x, (*psStretchBendParameter)[ii].y, (*psStretchBendParameter)[ii].z, (*psStretchBendParameter)[ii].w,
+                            gpu->pOutputBufferCounter[psStretchBendID1->_pSysData[ii].x],
+                            gpu->pOutputBufferCounter[psStretchBendID1->_pSysData[ii].y],
+                            gpu->pOutputBufferCounter[psStretchBendID1->_pSysData[ii].z] );
+            if( ii == maxPrint ){
+                (void) fprintf( amoebaGpu->log, "\n" );
+                ii = maxIndex - maxPrint;
+                if( ii < maxPrint )ii = maxPrint;
+            }
+        }
+        (void) fprintf( amoebaGpu->log, "\n" );
 #endif
-#undef DUMP_PARAMETERS
+        (void) fflush( amoebaGpu->log );
    }
+
    psStretchBendID1->Upload();
    psStretchBendID2->Upload();
    psStretchBendParameter->Upload();
@@ -906,13 +1011,6 @@ void gpuSetAmoebaOutOfPlaneBendParameters(amoebaGpuContext amoebaGpu, const std:
    amoebaGpu->amoebaSim.amoebaOutOfPlaneBendPenticK     = penticK;
    amoebaGpu->amoebaSim.amoebaOutOfPlaneBendSexticK     = sexticK;

-#undef DUMP_PARAMETERS
-#define DUMP_PARAMETERS 10
-#if (DUMP_PARAMETERS > 0 )
-    if( amoebaGpu->log )
-        fprintf( amoebaGpu->log, "OutOfPlaneBends: global ks[%15.7e %15.7e %15.7e %15.7e]\n", cubicK, quarticK, penticK, sexticK );
-#endif
-
    for (int i = 0; i < outOfPlaneBends; i++)
    {
        (*psOutOfPlaneBendID1)[i].x         = particles1[i];
@@ -924,20 +1022,38 @@ void gpuSetAmoebaOutOfPlaneBendParameters(amoebaGpuContext amoebaGpu, const std:
        psOutOfPlaneBendID2->_pSysData[i].y = gpu->pOutputBufferCounter[psOutOfPlaneBendID1->_pSysData[i].y]++;
        psOutOfPlaneBendID2->_pSysData[i].z = gpu->pOutputBufferCounter[psOutOfPlaneBendID1->_pSysData[i].z]++;
        psOutOfPlaneBendID2->_pSysData[i].w = gpu->pOutputBufferCounter[psOutOfPlaneBendID1->_pSysData[i].w]++;
+    }
+
+    // logging info

-#if (DUMP_PARAMETERS > 0 )
-if( (i < DUMP_PARAMETERS || i > outOfPlaneBends - (DUMP_PARAMETERS + 1)) && amoebaGpu->log )
-         fprintf( amoebaGpu->log, "OutOfPlaneBends: %5d [%5d %5d %5d %5d] [%5d %5d %5d %5d] k=%15.7e [%5d %5d %5d %5d]\n", i, 
-                  (*psOutOfPlaneBendID1)[i].x, (*psOutOfPlaneBendID1)[i].y, (*psOutOfPlaneBendID1)[i].z, (*psOutOfPlaneBendID1)[i].w,
-                  (*psOutOfPlaneBendID2)[i].x, (*psOutOfPlaneBendID2)[i].y, (*psOutOfPlaneBendID2)[i].z, (*psOutOfPlaneBendID2)[i].w,
-                  (*psOutOfPlaneBendParameter)[i], 
-                  gpu->pOutputBufferCounter[psOutOfPlaneBendID1->_pSysData[i].x],
-                  gpu->pOutputBufferCounter[psOutOfPlaneBendID1->_pSysData[i].y],
-                  gpu->pOutputBufferCounter[psOutOfPlaneBendID1->_pSysData[i].z],
-                  gpu->pOutputBufferCounter[psOutOfPlaneBendID1->_pSysData[i].w] );
+    if( amoebaGpu->log ){
+
+        unsigned int maxPrint = MAX_PARAMETER_PRINT;
+        unsigned int maxIndex = outOfPlaneBends;
+
+        (void) fprintf( amoebaGpu->log, "gpuSetAmoebaOutOfPlaneBendParameters: number of out-of-plane bends=%5d global ks[%15.7e %15.7e %15.7e %15.7e]\n", outOfPlaneBends,
+                        cubicK, quarticK, penticK, sexticK );
+#ifdef PARAMETER_PRINT
+        for( unsigned int ii = 0; ii < maxIndex; ii++ ){
+            (void) fprintf( amoebaGpu->log, "    %5d [%5d %5d %5d %5d] [%5d %5d %5d %5d] k=%15.7e [%5d %5d %5d %5d]\n", ii, 
+                            (*psOutOfPlaneBendID1)[ii].x, (*psOutOfPlaneBendID1)[ii].y, (*psOutOfPlaneBendID1)[ii].z, (*psOutOfPlaneBendID1)[ii].w,
+                            (*psOutOfPlaneBendID2)[ii].x, (*psOutOfPlaneBendID2)[ii].y, (*psOutOfPlaneBendID2)[ii].z, (*psOutOfPlaneBendID2)[ii].w,
+                            (*psOutOfPlaneBendParameter)[ii], 
+                            gpu->pOutputBufferCounter[psOutOfPlaneBendID1->_pSysData[ii].x],
+                            gpu->pOutputBufferCounter[psOutOfPlaneBendID1->_pSysData[ii].y],
+                            gpu->pOutputBufferCounter[psOutOfPlaneBendID1->_pSysData[ii].z],
+                            gpu->pOutputBufferCounter[psOutOfPlaneBendID1->_pSysData[ii].w] );
+            if( ii == maxPrint ){
+                (void) fprintf( amoebaGpu->log, "\n" );
+                ii = maxIndex - maxPrint;
+                if( ii < maxPrint )ii = maxPrint;
+            }
+        }
+        (void) fprintf( amoebaGpu->log, "\n" );
 #endif
-#undef DUMP_PARAMETERS
+        (void) fflush( amoebaGpu->log );
    }
+
    psOutOfPlaneBendID1->Upload();
    psOutOfPlaneBendID2->Upload();
    psOutOfPlaneBendParameter->Upload();
@@ -980,23 +1096,38 @@ void gpuSetAmoebaTorsionTorsionParameters(amoebaGpuContext amoebaGpu, const std:
        psTorsionTorsionID3->_pSysData[i].y = gpu->pOutputBufferCounter[psTorsionTorsionID1->_pSysData[i].z]++;
        psTorsionTorsionID3->_pSysData[i].z = gpu->pOutputBufferCounter[psTorsionTorsionID1->_pSysData[i].w]++;
        psTorsionTorsionID3->_pSysData[i].w = gpu->pOutputBufferCounter[psTorsionTorsionID2->_pSysData[i].x]++;
+    }
+
+    // logging info

-#undef DUMP_PARAMETERS
-#define DUMP_PARAMETERS 5
-#if (DUMP_PARAMETERS > 0 )
-if( (i < DUMP_PARAMETERS || i > torsionTorsions - (DUMP_PARAMETERS + 1)) && amoebaGpu->log )
-         fprintf( amoebaGpu->log, "TorsionTorsions: %5d [%5d %5d %5d %5d %5d] chiral=%5d grid=%5d [%5d %5d %5d %5d %5d] [%5d %5d %5d %5d %5d]\n", i, 
-                  (*psTorsionTorsionID1)[i].x, (*psTorsionTorsionID1)[i].y, (*psTorsionTorsionID1)[i].z, (*psTorsionTorsionID1)[i].w,
-                  (*psTorsionTorsionID2)[i].x, (*psTorsionTorsionID2)[i].y, (*psTorsionTorsionID2)[i].z, (*psTorsionTorsionID2)[i].w,
-                  (*psTorsionTorsionID3)[i].x, (*psTorsionTorsionID3)[i].y, (*psTorsionTorsionID3)[i].z,  (*psTorsionTorsionID3)[i].w,
-                  gpu->pOutputBufferCounter[psTorsionTorsionID1->_pSysData[i].x],
-                  gpu->pOutputBufferCounter[psTorsionTorsionID1->_pSysData[i].y],
-                  gpu->pOutputBufferCounter[psTorsionTorsionID1->_pSysData[i].z],
-                  gpu->pOutputBufferCounter[psTorsionTorsionID1->_pSysData[i].w], 
-                  gpu->pOutputBufferCounter[psTorsionTorsionID2->_pSysData[i].x] );
+    if( amoebaGpu->log ){
+
+        unsigned int maxPrint = MAX_PARAMETER_PRINT;
+        unsigned int maxIndex = torsionTorsions;
+
+        (void) fprintf( amoebaGpu->log, "gpuSetAmoebaTorsionTorsionParameters: number of torsion-torsions=%5d\n", torsionTorsions );
+#ifdef PARAMETER_PRINT
+        for( unsigned int ii = 0; ii < maxIndex; ii++ ){
+            (void) fprintf( amoebaGpu->log, "TorsionTorsions: %5d [%5d %5d %5d %5d %5d] chiral=%5d grid=%5d [%5d %5d %5d %5d %5d] [%5d %5d %5d %5d %5d]\n", ii, 
+                            (*psTorsionTorsionID1)[ii].x, (*psTorsionTorsionID1)[ii].y, (*psTorsionTorsionID1)[ii].z, (*psTorsionTorsionID1)[ii].w,
+                            (*psTorsionTorsionID2)[ii].x, (*psTorsionTorsionID2)[ii].y, (*psTorsionTorsionID2)[ii].z, (*psTorsionTorsionID2)[ii].w,
+                            (*psTorsionTorsionID3)[ii].x, (*psTorsionTorsionID3)[ii].y, (*psTorsionTorsionID3)[ii].z,  (*psTorsionTorsionID3)[ii].w,
+                            gpu->pOutputBufferCounter[psTorsionTorsionID1->_pSysData[ii].x],
+                            gpu->pOutputBufferCounter[psTorsionTorsionID1->_pSysData[ii].y],
+                            gpu->pOutputBufferCounter[psTorsionTorsionID1->_pSysData[ii].z],
+                            gpu->pOutputBufferCounter[psTorsionTorsionID1->_pSysData[ii].w], 
+                            gpu->pOutputBufferCounter[psTorsionTorsionID2->_pSysData[ii].x] );
+            if( ii == maxPrint ){
+                (void) fprintf( amoebaGpu->log, "\n" );
+                ii = maxIndex - maxPrint;
+                if( ii < maxPrint )ii = maxPrint;
+            }
+        }
+        (void) fprintf( amoebaGpu->log, "\n" );
 #endif
-#undef DUMP_PARAMETERS
+        (void) fflush( amoebaGpu->log );
    }
+
    psTorsionTorsionID1->Upload();
    psTorsionTorsionID2->Upload();
    psTorsionTorsionID3->Upload();
@@ -1069,7 +1200,7 @@ void gpuSetAmoebaTorsionTorsionGrids(amoebaGpuContext amoebaGpu, const std::vect
    if( amoebaGpu->log ){
        (void) fprintf( amoebaGpu->log, "Grids %u totalGridEntries=%u totalFloat4 entries=%u\n", torsionTorsionGrids, totalGridEntries, totalEntries );
        for (unsigned int ii = 0; ii < floatGrids.size(); ii++) {
-            (void) fprintf( amoebaGpu->log, "Grid %u offset=%d begin=%.3f delta=%.3f Ny=%d\n", ii, amoebaGpu->amoebaSim.amoebaTorTorGridOffset[ii],
+            (void) fprintf( amoebaGpu->log, "Grid %u offset=%6d begin=%10.3f delta=%10.3f Ny=%3d\n", ii, amoebaGpu->amoebaSim.amoebaTorTorGridOffset[ii],
                            amoebaGpu->amoebaSim.amoebaTorTorGridBegin[ii], amoebaGpu->amoebaSim.amoebaTorTorGridDelta[ii], amoebaGpu->amoebaSim.amoebaTorTorGridNy[ii]);
        }
    }
@@ -1080,25 +1211,43 @@ void gpuSetAmoebaTorsionTorsionGrids(amoebaGpuContext amoebaGpu, const std::vect
    for (unsigned int ii = 0; ii < floatGrids.size(); ii++) {
        for (unsigned int jj = 0; jj < floatGrids[ii].size(); jj++) {
            for (unsigned int kk = 0; kk < floatGrids[ii][jj].size(); kk++) {
+
                (*psTorsionTorsionGrids)[index].x    = floatGrids[ii][jj][kk][2];
                (*psTorsionTorsionGrids)[index].y    = floatGrids[ii][jj][kk][3];
                (*psTorsionTorsionGrids)[index].z    = floatGrids[ii][jj][kk][4];
                (*psTorsionTorsionGrids)[index].w    = floatGrids[ii][jj][kk][5];

-#undef DUMP_PARAMETERS
-#define DUMP_PARAMETERS 5
-#if (DUMP_PARAMETERS > 0 )
-if( (index < DUMP_PARAMETERS || index > totalEntries - (DUMP_PARAMETERS + 1)) && amoebaGpu->log )
-         (void) fprintf( amoebaGpu->log, "TorsionTorsionGrid: %5d %5d [%5d %5d ] [%10.3f %10.3f] [%15.7e %15.7e %15.7e %15.7e]\n", index, ii, jj, kk,
-                  floatGrids[ii][jj][kk][0], floatGrids[ii][jj][kk][1],
-                  (*psTorsionTorsionGrids)[index].x, (*psTorsionTorsionGrids)[index].y, (*psTorsionTorsionGrids)[index].z, (*psTorsionTorsionGrids)[index].w );
-#endif
-#undef DUMP_PARAMETERS
                index++;
            }
        }
    }
    
+    // logging info
+
+    if( amoebaGpu->log ){
+
+        unsigned int maxPrint = MAX_PARAMETER_PRINT;
+
+        (void) fprintf( amoebaGpu->log, "gpuSetAmoebaTorsionTorsionGrids: number of grids=%5u\n", static_cast<unsigned int>(floatGrids.size()) );
+#ifdef PARAMETER_PRINT
+        unsigned int index    = 0;
+        for (unsigned int ii = 0; ii < floatGrids.size(); ii++) {
+            for (unsigned int jj = 0; jj < floatGrids[ii].size(); jj++) {
+                for (unsigned int kk = 0; kk < floatGrids[ii][jj].size(); kk++) {
+                    if( index < maxPrint || (index > (totalEntries - maxPrint)) ){
+                        (void) fprintf( amoebaGpu->log, "     %5d %5d [%5d %5d ] [%10.3f %10.3f] [%15.7e %15.7e %15.7e %15.7e]\n", index, ii, jj, kk,
+                                        floatGrids[ii][jj][kk][0], floatGrids[ii][jj][kk][1],
+                                        (*psTorsionTorsionGrids)[index].x, (*psTorsionTorsionGrids)[index].y, (*psTorsionTorsionGrids)[index].z, (*psTorsionTorsionGrids)[index].w );
+                    }
+                    index++;
+                }
+            }
+        }
+        (void) fprintf( amoebaGpu->log, "\n" );
+#endif
+        (void) fflush( amoebaGpu->log );
+    }
+    
 #if 0
    std::vector<float> grids;
    grids.resize( totalGridEntries );
@@ -1155,8 +1304,6 @@ if( (index < DUMP_PARAMETERS || index > totalEntries - (DUMP_PARAMETERS + 1)) &&
 exit(0);
 #endif

-
-
    psTorsionTorsionGrids->Upload();

 }
@@ -1201,8 +1348,6 @@ void gpuSetAmoebaBondOffsets(amoebaGpuContext amoebaGpu )
    amoebaGpu->amoebaSim.amoebaUreyBradley_offset              = amoebaGpu->amoebaSim.amoebaTorsionTorsion_offset +
                                                                 (amoebaGpu->psAmoebaUreyBradleyParameter ?  amoebaGpu->psAmoebaUreyBradleyParameter->_stride : 0);

-    //gpu->sim.localForces_threads_per_block  = (std::max(amoebaGpu->amoebaSim.amoebaTorsionTorsion_offset, gpu->sim.customBonds) / gpu->sim.blocks + 15) & 0xfffffff0;
-
    unsigned int maxI                                          = (amoebaGpu->amoebaSim.amoebaUreyBradley_offset > gpu->sim.customBonds) ? amoebaGpu->amoebaSim.amoebaUreyBradley_offset : gpu->sim.customBonds;
    gpu->sim.localForces_threads_per_block                     = (maxI/gpu->sim.blocks + 15) & 0xfffffff0;
    if (gpu->sim.localForces_threads_per_block > gpu->sim.max_localForces_threads_per_block)
@@ -1403,8 +1548,6 @@ static void gpuFixedEFieldAllocate( amoebaGpuContext amoebaGpu )
    
    unsigned int offset                              = paddedNumberOfAtoms*sizeof( float );
    memset( amoebaGpu->psDampingFactorAndThole->_pSysData,              0,2*offset );
-    //memset( amoebaGpu->psE_Field->_pSysData,            0, offset*3 );
-    //memset( amoebaGpu->psE_FieldPolar->_pSysData,       0, offset*3 );

 }

@@ -1522,13 +1665,6 @@ void gpuKirkwoodAllocate( amoebaGpuContext amoebaGpu )

    int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;

-#ifdef AMOEBA_DEBUG
-    if( amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log,"%s: paddedNumberOfAtoms      =%d\n", methodName.c_str(), paddedNumberOfAtoms );
-        (void) fflush( amoebaGpu->log );
-    }
-#endif
-
    amoebaGpu->psBorn                            = new CUDAStream<float>(paddedNumberOfAtoms,   1, "KirkwoodBorn");
    amoebaGpu->psBornPolar                       = new CUDAStream<float>(paddedNumberOfAtoms,   1, "KirkwoodBornPolar");
    amoebaGpu->psGk_Field                        = new CUDAStream<float>(paddedNumberOfAtoms*3, 1, "Gk_Fixed_Field");
@@ -1596,7 +1732,7 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect

    unsigned int dipoleIndex                                                  = 0;
    unsigned int quadrupoleIndex                                              = 0;
-    unsigned int maxPrint                                                     = 5;
+    unsigned int maxPrint                                                     = 10;
    
    if( nonbondedMethod == 0 ){
        amoebaGpu->multipoleNonbondedMethod = AMOEBA_NO_CUTOFF;
@@ -1606,12 +1742,6 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
        throw OpenMM::OpenMMException("multipoleNonbondedMethod not recognized.\n" );
    }

-    if( amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log,"%s Nonbonded method=%d %d [NoCutoff=%d PME=%d] polarizationType=%d (0=mutual/1=direct)\n",
-                        methodName.c_str(), nonbondedMethod, amoebaGpu->multipoleNonbondedMethod,
-                        AMOEBA_NO_CUTOFF, AMOEBA_PARTICLE_MESH_EWALD, polarizationType );
-        (void) fflush( amoebaGpu->log );
-    }
    amoebaGpu->amoebaSim.sqrtPi                      = std::sqrt( 3.14159265358f );
    amoebaGpu->amoebaSim.electric                    = electricConstant;
    amoebaGpu->gpuContext->sim.alphaEwald            = alphaEwald;
@@ -1621,11 +1751,28 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
        amoebaGpu->amoebaSim.dielec               = 1.0f;
    }

-    static const int maxAxisType = 6;
-    int axisTypeCount[maxAxisType+1] = { 0, 0, 0, 0, 0, 0, 0 };
+    // logging info
+
+    if( amoebaGpu->log ){
+        (void) fprintf( amoebaGpu->log,"%s\n", methodName.c_str() );
+        (void) fprintf( amoebaGpu->log,"   input nonbonded method=%d multipoleNonbondedMethod=%d [NoCutoff=%d PME=%d]\n",
+                        nonbondedMethod, amoebaGpu->multipoleNonbondedMethod, AMOEBA_NO_CUTOFF, AMOEBA_PARTICLE_MESH_EWALD );
+        (void) fprintf( amoebaGpu->log,"   polarizationType=%d (0=mutual/1=direct)\n", polarizationType );
+        (void) fprintf( amoebaGpu->log,"   maxCovalentDegreeSz=%d minId=%s mutualInducedMaxIterations=%d mutualInducedTargetEpsilon=%15.7e\n", 
+                        amoebaGpu->maxCovalentDegreeSz, minId.c_str(), amoebaGpu->mutualInducedMaxIterations, amoebaGpu->mutualInducedTargetEpsilon );
+        (void) fprintf( amoebaGpu->log,"   electric=%15.7e alphaEwald=%15.7e nonbondedCutoff=%15.7e dielec=%15.7e\n", 
+                        amoebaGpu->amoebaSim.electric, amoebaGpu->gpuContext->sim.alphaEwald, amoebaGpu->gpuContext->sim.nonbondedCutoff,  amoebaGpu->amoebaSim.dielec );
+        (void) fflush( amoebaGpu->log );
+    }
+
    std::vector<int> axisCount(charges.size(),0);
-    int maxTorqueBufferIndex = 0;
-    for( int ii = 0; ii < static_cast<int>(charges.size()); ii++ ){
+
+    static const int maxAxisType     = 6;
+    int axisTypeCount[maxAxisType+1] = { 0, 0, 0, 0, 0, 0, 0 };
+    int maxTorqueBufferIndex         = 0;
+
+    int chargeSize                   = static_cast<int>(charges.size());
+    for( int ii = 0; ii < chargeSize; ii++ ){

        // axis type & multipole particles ids
 
@@ -1666,10 +1813,15 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
            }
        }

-        if( 0 && amoebaGpu->log ){
-            fprintf( amoebaGpu->log, "Z1 %4d %d [%4d %4d %4d] dmp/thole %15.7e %15.7e\n", ii,  axisType[ii],
-                     multipoleParticleX[ii], multipoleParticleY[ii], multipoleParticleZ[ii], dampingFactors[ii], tholes[ii] );
+#ifdef MAX_PARAMETER_PRINT
+        if( amoebaGpu->log && (ii < maxPrint || (ii > (chargeSize-maxPrint)) ) ){
+            (void) fprintf( amoebaGpu->log, "%6d axisType=%1d axis=[%6d %6d %6d] dipole[%15.7e %15.7e %15.7e] dmp/thole/polar %15.7e %15.7e %15.7e\n",
+                            ii,  axisType[ii],
+                            multipoleParticleX[ii], multipoleParticleY[ii], multipoleParticleZ[ii], 
+                            dipoles[dipoleIndex], dipoles[dipoleIndex+1], dipoles[dipoleIndex+2],
+                            dampingFactors[ii], tholes[ii], polarity[ii] );
        }
+#endif

        // charges

@@ -1748,8 +1900,9 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
            }
        }

-#ifdef AMOEBA_DEBUG
-        if( (amoebaGpu->log && ( ( ( ii < maxPrint ) || (ii >= (charges.size() - maxPrint) )) ) ) ){
+        // logging info
+
+        if( 0 && (amoebaGpu->log && ( ( ( ii < maxPrint ) || (ii >= (chargeSize - maxPrint) )) ) ) ){

            // axis particles

@@ -1848,9 +2001,8 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
            amoebaGpu->psDampingFactorAndThole->_pSysData[ii].x == -std::numeric_limits<double>::infinity()){
            (void) fprintf( amoebaGpu->log,"Nan detected at index=%d in psDampingFactor\n", ii );
        }
-#endif

-#ifdef AMOEBA_DEBUG
+#if 0
        if( amoebaGpu->log ){

            // covalent
@@ -1901,12 +2053,14 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
    }

    amoebaGpu->amoebaSim.maxTorqueBufferIndex = maxTorqueBufferIndex;
+
    if( amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log, "Max axis count=%d\n", maxTorqueBufferIndex );
+        (void) fprintf( amoebaGpu->log, "%s max axis count=%d\n", methodName.c_str(), maxTorqueBufferIndex );
        std::string axisLabel[maxAxisType+1] = {  "ZThenX", "Bisector", "ZBisect", "ThreeFold", "ZOnly", "NoAxisType", "Unknown"};
        for( unsigned int kk = 0; kk < (maxAxisType+1); kk++ ){
            (void) fprintf( amoebaGpu->log, "%2u %10s atom count=%d\n", kk, axisLabel[kk].c_str(), axisTypeCount[kk] );
        }
+        (void) fprintf( amoebaGpu->log, "\n" );
    }

    // upload
@@ -1932,7 +2086,6 @@ void gpuSetAmoebaObcParameters( amoebaGpuContext amoebaGpu, float innerDielectri
    gpuContext gpu                         = amoebaGpu->gpuContext;
    int paddedNumberOfAtoms                = gpu->sim.paddedNumberOfAtoms;
    gpu->sim.dielectricOffset              = dielectricOffset;
-    //gpu->bIncludeGBSA                      = 1;
    amoebaGpu->includeObcCavityTerm        = includeCavityTerm;
    gpu->sim.probeRadius                   = probeRadius;
    gpu->sim.surfaceAreaFactor             = surfaceAreaFactor;
@@ -1961,17 +2114,13 @@ void gpuSetAmoebaObcParameters( amoebaGpuContext amoebaGpu, float innerDielectri
    amoebaGpu->amoebaSim.dwater      = solventDielectric;
    amoebaGpu->amoebaSim.dielec      = innerDielectric;

-//    amoebaGpu->amoebaSim.fc          = amoebaGpu->amoebaSim.electric*1.0f*(1.0f-solventDielectric)/(0.0f+1.0f*solventDielectric);;
-//    amoebaGpu->amoebaSim.fd          = amoebaGpu->amoebaSim.electric*2.0f*(1.0f-solventDielectric)/(1.0f+2.0f*solventDielectric);;
-//    amoebaGpu->amoebaSim.fq          = amoebaGpu->amoebaSim.electric*3.0f*(1.0f-solventDielectric)/(2.0f+3.0f*solventDielectric);;
-
    amoebaGpu->amoebaSim.fc          = 1.0f*(1.0f-solventDielectric)/(0.0f+1.0f*solventDielectric);;
    amoebaGpu->amoebaSim.fd          = 2.0f*(1.0f-solventDielectric)/(1.0f+2.0f*solventDielectric);;
    amoebaGpu->amoebaSim.fq          = 3.0f*(1.0f-solventDielectric)/(2.0f+3.0f*solventDielectric);;

    gpu->sim.preFactor               = -amoebaGpu->amoebaSim.electric*((1.0f/innerDielectric)-(1.0f/solventDielectric));

-    if( 0 && amoebaGpu->log ){
+    if( amoebaGpu->log ){
        (void) fprintf( amoebaGpu->log,"gpuSetAmoebaObcParameters: cavity=%d dielectricOffset=%15.7e probeRadius=%15.7e surfaceAreaFactor=%15.7e\n", 
                        includeCavityTerm, dielectricOffset, probeRadius, surfaceAreaFactor );
        (void) fprintf( amoebaGpu->log,"                           gkc=%12.3f solventDielectric=%15.7e innerDielectric=%15.7e sim.preFactor=%15.7e\n", 
@@ -1993,6 +2142,84 @@ void gpuSetAmoebaObcParameters( amoebaGpuContext amoebaGpu, float innerDielectri

 }

+// Copies per-particle radius, scale*radius and charge into psObcData / psPosq4,
+// fills the entries from radius.size() up to paddedNumberOfAtoms with dummy values,
+// stores the dielectric and cavity settings, logs them when amoebaGpu->log is set,
+// and runs the allocators. NOTE(review): scale/charge are indexed like radius -- confirm equal sizes.
+extern "C"
+void gpuSetAmoebaGrycukParameters( amoebaGpuContext amoebaGpu, float innerDielectric, float solventDielectric, float dielectricOffset,
+                                   const std::vector<float>& radius, const std::vector<float>& scale, const std::vector<float>& charge,
+                                   int includeCavityTerm, float probeRadius, float surfaceAreaFactor )
+{
+    gpuContext gpu                         = amoebaGpu->gpuContext;
+    int paddedNumberOfAtoms                = gpu->sim.paddedNumberOfAtoms;
+    gpu->sim.dielectricOffset              = dielectricOffset;
+    amoebaGpu->includeObcCavityTerm        = includeCavityTerm;
+    gpu->sim.probeRadius                   = probeRadius;
+    gpu->sim.surfaceAreaFactor             = surfaceAreaFactor;
+    unsigned int particles                 = radius.size();
+
+    for (unsigned int i = 0; i < particles; i++)
+    {
+            (*gpu->psObcData)[i].x = radius[i];
+            (*gpu->psObcData)[i].y = scale[i]*(*gpu->psObcData)[i].x;
+            (*gpu->psPosq4)[i].w   = charge[i];
+    }
+
+    // Dummy out extra particles data
+    for (unsigned int i = particles; i < paddedNumberOfAtoms; i++)
+    {
+        (*gpu->psBornRadii)[i]     = 0.2f;
+        (*gpu->psObcData)[i].x     = 0.01f;
+        (*gpu->psObcData)[i].y     = 0.01f;
+    }
+
+    gpu->psBornRadii->Upload();
+    gpu->psObcData->Upload();
+    gpu->psPosq4->Upload();
+
+    amoebaGpu->amoebaSim.gkc         = 2.455f;
+    amoebaGpu->amoebaSim.dwater      = solventDielectric;
+    amoebaGpu->amoebaSim.dielec      = innerDielectric;
+    // fc, fd, fq = n*(1-eps)/((n-1)+n*eps) for n = 1, 2, 3 with eps = solventDielectric
+    amoebaGpu->amoebaSim.fc          = 1.0f*(1.0f-solventDielectric)/(0.0f+1.0f*solventDielectric);
+    amoebaGpu->amoebaSim.fd          = 2.0f*(1.0f-solventDielectric)/(1.0f+2.0f*solventDielectric);
+    amoebaGpu->amoebaSim.fq          = 3.0f*(1.0f-solventDielectric)/(2.0f+3.0f*solventDielectric);
+
+    gpu->sim.preFactor               = -amoebaGpu->amoebaSim.electric*((1.0f/innerDielectric)-(1.0f/solventDielectric));
+
+    // logging info (the per-particle table is compiled in only when PARAMETER_PRINT is defined)
+    if( amoebaGpu->log ){
+        unsigned int maxPrint = MAX_PARAMETER_PRINT;
+        unsigned int maxIndex = particles;
+        (void) fprintf( amoebaGpu->log,"gpuSetAmoebaGrycukParameters: cavity=%d dielectricOffset=%15.7e probeRadius=%15.7e surfaceAreaFactor=%15.7e\n", 
+                        includeCavityTerm, dielectricOffset, probeRadius, surfaceAreaFactor );
+        (void) fprintf( amoebaGpu->log,"                           gkc=%12.3f solventDielectric=%15.7e innerDielectric=%15.7e sim.preFactor=%15.7e\n", 
+                        amoebaGpu->amoebaSim.gkc, amoebaGpu->amoebaSim.dwater, amoebaGpu->amoebaSim.dielec, gpu->sim.preFactor );
+        (void) fprintf( amoebaGpu->log,"                           fc=%15.7e fd=%15.7e fq=%15.7e\n",
+                        amoebaGpu->amoebaSim.fc, amoebaGpu->amoebaSim.fd, amoebaGpu->amoebaSim.fq );
+        (void) fprintf( amoebaGpu->log,"\nRadius scl*radius scl\n" );
+#ifdef PARAMETER_PRINT
+        for( unsigned int ii = 0; ii < maxIndex; ii++ ){
+            (void) fprintf( amoebaGpu->log,"%6d %15.7e %15.7e %15.7e\n", ii,
+                            (*gpu->psObcData)[ii].x, (*gpu->psObcData)[ii].y, scale[ii] );
+            if( ii == maxPrint ){
+                (void) fprintf( amoebaGpu->log, "\n" );
+                ii = maxIndex - maxPrint;
+                if( ii < maxPrint )ii = maxPrint;
+            }
+        }
+        (void) fprintf( amoebaGpu->log, "\n" );
+#endif
+        (void) fflush( amoebaGpu->log );
+    }
+
+    gpuRotationToLabFrameAllocate( amoebaGpu );
+    gpuFixedEFieldAllocate( amoebaGpu );
+    gpuElectrostaticAllocate( amoebaGpu );
+    gpuKirkwoodAllocate( amoebaGpu );
+}
+
 static int encodeCell( unsigned int x, unsigned int y ){
    return ( (x << 17) | (y << 2) );
 }
@@ -2068,22 +2295,15 @@ void gpuSetAmoebaVdwParameters( amoebaGpuContext amoebaGpu,
        }
    }

-    if( particles < 1 ){
-        (void) fprintf( stderr, "%s number of particles\n", methodName );
-        return;
-    } 
-
    amoebaGpu->psVdwSigmaEpsilon           = new CUDAStream<float2>(gpu->sim.paddedNumberOfAtoms,   1, "VdwSigmaEpsilon");
-    for (unsigned int ii = 0; ii < particles; ii++) 
-    {    
+    for( unsigned int ii = 0; ii < particles; ii++ ){    
        amoebaGpu->psVdwSigmaEpsilon->_pSysData[ii].x    = sigmas[ii];
        amoebaGpu->psVdwSigmaEpsilon->_pSysData[ii].y    = epsilons[ii];
    }    

    // Dummy out extra particles data

-    for (unsigned int ii = particles; ii < gpu->sim.paddedNumberOfAtoms; ii++) 
-    {    
+    for( unsigned int ii = particles; ii < gpu->sim.paddedNumberOfAtoms; ii++ ){    
        amoebaGpu->psVdwSigmaEpsilon->_pSysData[ii].x     = 1.0f;
        amoebaGpu->psVdwSigmaEpsilon->_pSysData[ii].y     = 0.0f;
    }    
@@ -2114,23 +2334,12 @@ void gpuSetAmoebaVdwParameters( amoebaGpuContext amoebaGpu,

   unsigned int numberOfNonReductions = 0;
   unsigned int numberOfReductions    = 0;
-    for (unsigned int ii = 0; ii < particles; ii++) 
-    {    
-        if( ivNonMapping[ii] )
-        {
+    for( unsigned int ii = 0; ii < particles; ii++ ){    
+        if( ivNonMapping[ii] ){
            numberOfNonReductions++;
        }
-        if( ivMapping[ii].size() > 0 )
-        {
+        if( ivMapping[ii].size() > 0 ){
            numberOfReductions++;
-#ifdef AMOEBA_DEBUG
-            (void) fprintf( amoebaGpu->log, "Atom %u has %u reductions: [", ii, ivMapping[ii].size() );
-            for (unsigned int jj = 0; jj < ivMapping[ii].size(); jj++) 
-            {
-                (void) fprintf( amoebaGpu->log, "%u ", ivMapping[ii][jj] );
-            }
-            (void) fprintf( amoebaGpu->log, "]  %12.4f\n", reductionFactors[ii] );
-#endif
        }
    }    
    
@@ -2152,15 +2361,12 @@ void gpuSetAmoebaVdwParameters( amoebaGpuContext amoebaGpu,

   unsigned int count    = 0;
   unsigned int nonCount = 0;
-    for (unsigned int ii = 0; ii < particles; ii++) 
-    {    
-        if( ivNonMapping[ii] )
-        {
+    for( unsigned int ii = 0; ii < particles; ii++ ){    
+        if( ivNonMapping[ii] ){
            psVdwNonReductionID->_pSysData[nonCount++] = ii;
        }

-        if( ivMapping[ii].size() > 0 )
-        {
+        if( ivMapping[ii].size() > 0 ){
            psAmoebaVdwReduction->_pSysData[count] = reductionFactors[ii];
            psVdwReductionID->_pSysData[count].x   = ii;
            psVdwReductionID->_pSysData[count].y   = ivMapping[ii][0];
@@ -2186,79 +2392,108 @@ void gpuSetAmoebaVdwParameters( amoebaGpuContext amoebaGpu,
    psAmoebaVdwReduction->Upload();

    amoebaGpu->vdwExclusions.resize( gpu->natoms );
-    for( unsigned int ii = 0; ii < gpu->natoms; ii++)
-    {
+    for( unsigned int ii = 0; ii < gpu->natoms; ii++ ){
        for (unsigned int jj = 0; jj < allExclusions[ii].size(); jj++){ 
            amoebaGpu->vdwExclusions[ii].push_back( allExclusions[ii][jj] );
        }
    }

-#ifdef AMOEBA_DEBUG
    if( amoebaGpu->log ){
-        unsigned int maxPrint = 32;
+        unsigned int maxPrint = MAX_PARAMETER_PRINT;
        (void) fprintf( amoebaGpu->log, "%s sigma/epsilon combining rules=%d %d\n", methodName, 
                        amoebaGpu->vdwSigmaCombiningRule, amoebaGpu->vdwEpsilonCombiningRule);
-        for (unsigned int ii = 0; ii < gpu->natoms; ii++) 
-        {    
-            (void) fprintf( amoebaGpu->log, "%5u %15.7e %15.7e Ex[", ii, sigmas[ii], epsilons[ii] );
-            for (unsigned int jj = 0; jj < allExclusions[ii].size(); jj++){ 
-                (void) fprintf( amoebaGpu->log, "%d ", allExclusions[ii][jj] );
+        (void) fprintf( amoebaGpu->log, "%s particles=%d numberOfNonReductions=%d numberOfReductionsi=%d\n",
+                        methodName, particles, numberOfNonReductions, numberOfReductions );
+#ifdef PARAMETER_PRINT
+        for( unsigned int ii = 0; ii < gpu->natoms; ii++ ){    
+            (void) fprintf( amoebaGpu->log, "%5u %15.7e %15.7e %15.7e Ex[", ii, sigmas[ii], epsilons[ii], reductionFactors[ii] );
+            for( unsigned int jj = 0; jj < allExclusions[ii].size(); jj++ ){ 
+                (void) fprintf( amoebaGpu->log, "%6d ", allExclusions[ii][jj] );
            }
            (void) fprintf( amoebaGpu->log, "]\n", ii, sigmas[ii], epsilons[ii] );
-            if( ii == maxPrint && ii < (gpu->sim.paddedNumberOfAtoms - maxPrint) )
-            {
-                ii = (gpu->sim.addedNumberOfAtoms - maxPrint);
+
+            if( ii == maxPrint ){
+                (void) fprintf( amoebaGpu->log, "\n" );
+                ii = (gpu->sim.paddedNumberOfAtoms - maxPrint);
+                if( ii < maxPrint )ii = maxPrint;
            }
        } 
+        (void) fprintf( amoebaGpu->log, "\n" );
+#endif
        (void) fflush( amoebaGpu->log );
    }    
-#endif

-#undef AMOEBA_DEBUG
 }

 extern "C"
 void gpuSetAmoebaPMEParameters(amoebaGpuContext amoebaGpu, float alpha, int gridSizeX, int gridSizeY, int gridSizeZ)
 {
-    gpuContext gpu = amoebaGpu->gpuContext;
-    gpu->sim.alphaEwald = alpha;
-    int3 gridSize = make_int3(gridSizeX, gridSizeY, gridSizeZ);
-    gpu->sim.pmeGridSize = gridSize;
+    gpuContext gpu                         = amoebaGpu->gpuContext;
+
+    gpu->sim.alphaEwald                    = alpha;
+
+    int3 gridSize                          = make_int3(gridSizeX, gridSizeY, gridSizeZ);
+    gpu->sim.pmeGridSize                   = gridSize;
+
    int3 groupSize = make_int3(2, 4, 4);
    gpu->sim.pmeGroupSize = groupSize;
-    const int3 numGroups = make_int3((gridSize.x+groupSize.x-1)/groupSize.x, (gridSize.y+groupSize.y-1)/groupSize.y, (gridSize.z+groupSize.z-1)/groupSize.z);
-    const unsigned int totalGroups = numGroups.x*numGroups.y*numGroups.z;
+
+    // logging info
+
+    if( amoebaGpu->log ){
+        (void) fprintf( amoebaGpu->log, "gpuSetAmoebaPMEParameters alpha=%15.7e grid: [%d %d %d]\n",
+                        gpu->sim.alphaEwald , gridSizeX, gridSizeY, gridSizeZ );
+    }
+
+    const int3 numGroups                   = make_int3((gridSize.x+groupSize.x-1)/groupSize.x, (gridSize.y+groupSize.y-1)/groupSize.y, (gridSize.z+groupSize.z-1)/groupSize.z);
+    const unsigned int totalGroups         = numGroups.x*numGroups.y*numGroups.z;
+
    cufftPlan3d(&gpu->fftplan, gridSize.x, gridSize.y, gridSize.z, CUFFT_C2C);
-    gpu->psPmeGrid = new CUDAStream<cufftComplex>(gridSize.x*gridSize.y*gridSize.z, 1, "PmeGrid");
-    gpu->sim.pPmeGrid = gpu->psPmeGrid->_pDevData;
-    gpu->psPmeBsplineModuli[0] = new CUDAStream<float>(gridSize.x, 1, "PmeBsplineModuli0");
-    gpu->sim.pPmeBsplineModuli[0] = gpu->psPmeBsplineModuli[0]->_pDevData;
-    gpu->psPmeBsplineModuli[1] = new CUDAStream<float>(gridSize.y, 1, "PmeBsplineModuli1");
-    gpu->sim.pPmeBsplineModuli[1] = gpu->psPmeBsplineModuli[1]->_pDevData;
-    gpu->psPmeBsplineModuli[2] = new CUDAStream<float>(gridSize.z, 1, "PmeBsplineModuli2");
-    gpu->sim.pPmeBsplineModuli[2] = gpu->psPmeBsplineModuli[2]->_pDevData;
-    amoebaGpu->psThetai1 = new CUDAStream<float4>(AMOEBA_PME_ORDER*gpu->natoms, 1, "thetai1");
-    amoebaGpu->amoebaSim.pThetai1 = amoebaGpu->psThetai1->_pDevData;
-    amoebaGpu->psThetai2 = new CUDAStream<float4>(AMOEBA_PME_ORDER*gpu->natoms, 1, "thetai2");
-    amoebaGpu->amoebaSim.pThetai2 = amoebaGpu->psThetai2->_pDevData;
-    amoebaGpu->psThetai3 = new CUDAStream<float4>(AMOEBA_PME_ORDER*gpu->natoms, 1, "thetai3");
-    amoebaGpu->amoebaSim.pThetai3 = amoebaGpu->psThetai3->_pDevData;
-    amoebaGpu->psIgrid = new CUDAStream<int4>(gpu->natoms, 1, "igrid");
-    amoebaGpu->amoebaSim.pIgrid = amoebaGpu->psIgrid->_pDevData;
-    amoebaGpu->psPhi = new CUDAStream<float>(20*gpu->natoms, 1, "phi");
-    amoebaGpu->amoebaSim.pPhi = amoebaGpu->psPhi->_pDevData;
-    amoebaGpu->psPhid = new CUDAStream<float>(10*gpu->natoms, 1, "phid");
-    amoebaGpu->amoebaSim.pPhid = amoebaGpu->psPhid->_pDevData;
-    amoebaGpu->psPhip = new CUDAStream<float>(10*gpu->natoms, 1, "phip");
-    amoebaGpu->amoebaSim.pPhip = amoebaGpu->psPhip->_pDevData;
-    amoebaGpu->psPhidp = new CUDAStream<float>(20*gpu->natoms, 1, "phidp");
-    amoebaGpu->amoebaSim.pPhidp = amoebaGpu->psPhidp->_pDevData;
-    gpu->psPmeAtomRange = new CUDAStream<int>(gridSize.x*gridSize.y*gridSize.z+1, 1, "PmeAtomRange");
-    gpu->sim.pPmeAtomRange = gpu->psPmeAtomRange->_pDevData;
-    gpu->psPmeAtomGridIndex = new CUDAStream<int2>(gpu->natoms, 1, "PmeAtomGridIndex");
-    gpu->sim.pPmeAtomGridIndex = gpu->psPmeAtomGridIndex->_pDevData;
-    gpu->psPmeBsplineTheta = new CUDAStream<float4>(1, 1, "PmeBsplineTheta"); // Not actually uesd
-    gpu->psPmeBsplineDtheta = new CUDAStream<float4>(1, 1, "PmeBsplineDtheta"); // Not actually used
+
+    gpu->psPmeGrid                         = new CUDAStream<cufftComplex>(gridSize.x*gridSize.y*gridSize.z, 1, "PmeGrid");
+    gpu->sim.pPmeGrid                      = gpu->psPmeGrid->_pDevData;
+
+    gpu->psPmeBsplineModuli[0]             = new CUDAStream<float>(gridSize.x, 1, "PmeBsplineModuli0");
+    gpu->sim.pPmeBsplineModuli[0]          = gpu->psPmeBsplineModuli[0]->_pDevData;
+
+    gpu->psPmeBsplineModuli[1]             = new CUDAStream<float>(gridSize.y, 1, "PmeBsplineModuli1");
+    gpu->sim.pPmeBsplineModuli[1]          = gpu->psPmeBsplineModuli[1]->_pDevData;
+
+    gpu->psPmeBsplineModuli[2]             = new CUDAStream<float>(gridSize.z, 1, "PmeBsplineModuli2");
+    gpu->sim.pPmeBsplineModuli[2]          = gpu->psPmeBsplineModuli[2]->_pDevData;
+
+    amoebaGpu->psThetai1                   = new CUDAStream<float4>(AMOEBA_PME_ORDER*gpu->natoms, 1, "thetai1");
+    amoebaGpu->amoebaSim.pThetai1          = amoebaGpu->psThetai1->_pDevData;
+
+    amoebaGpu->psThetai2                   = new CUDAStream<float4>(AMOEBA_PME_ORDER*gpu->natoms, 1, "thetai2");
+    amoebaGpu->amoebaSim.pThetai2          = amoebaGpu->psThetai2->_pDevData;
+
+    amoebaGpu->psThetai3                   = new CUDAStream<float4>(AMOEBA_PME_ORDER*gpu->natoms, 1, "thetai3");
+    amoebaGpu->amoebaSim.pThetai3          = amoebaGpu->psThetai3->_pDevData;
+
+    amoebaGpu->psIgrid                     = new CUDAStream<int4>(gpu->natoms, 1, "igrid");
+    amoebaGpu->amoebaSim.pIgrid            = amoebaGpu->psIgrid->_pDevData;
+
+    amoebaGpu->psPhi                       = new CUDAStream<float>(20*gpu->natoms, 1, "phi");
+    amoebaGpu->amoebaSim.pPhi              = amoebaGpu->psPhi->_pDevData;
+
+    amoebaGpu->psPhid                      = new CUDAStream<float>(10*gpu->natoms, 1, "phid");
+    amoebaGpu->amoebaSim.pPhid             = amoebaGpu->psPhid->_pDevData;
+
+    amoebaGpu->psPhip                      = new CUDAStream<float>(10*gpu->natoms, 1, "phip");
+    amoebaGpu->amoebaSim.pPhip             = amoebaGpu->psPhip->_pDevData;
+
+    amoebaGpu->psPhidp                     = new CUDAStream<float>(20*gpu->natoms, 1, "phidp");
+    amoebaGpu->amoebaSim.pPhidp            = amoebaGpu->psPhidp->_pDevData;
+
+    gpu->psPmeAtomRange                    = new CUDAStream<int>(gridSize.x*gridSize.y*gridSize.z+1, 1, "PmeAtomRange");
+    gpu->sim.pPmeAtomRange                 = gpu->psPmeAtomRange->_pDevData;
+
+    gpu->psPmeAtomGridIndex                = new CUDAStream<int2>(gpu->natoms, 1, "PmeAtomGridIndex");
+    gpu->sim.pPmeAtomGridIndex             = gpu->psPmeAtomGridIndex->_pDevData;
+
+    gpu->psPmeBsplineTheta                 = new CUDAStream<float4>(1, 1, "PmeBsplineTheta"); // Not actually uesd
+    gpu->psPmeBsplineDtheta                = new CUDAStream<float4>(1, 1, "PmeBsplineDtheta"); // Not actually used

    // Initialize the b-spline moduli.

@@ -2624,22 +2859,23 @@ void gpuSetAmoebaWcaDispersionParameters( amoebaGpuContext amoebaGpu,
    amoebaGpu->amoebaSim.shctd                       = shctd;
    amoebaGpu->amoebaSim.dispoff                     = dispoff;

-#ifdef AMOEBA_DEBUG
    if( amoebaGpu->log ){
-        unsigned int maxPrint = 10;
+        unsigned int maxPrint = MAX_PARAMETER_PRINT;
        (void) fprintf( amoebaGpu->log, "%s particles=%u total max dispersion energy=%14.5e eps[%14.5e %14.5e] rmin[%14.5e %14.5e] awtr=%14.5e shctd=%14.5e dispoff=%14.5e\n",
-                        methodName, radii.size(), totalMaxWcaDisperionEnergy, epso, epsh, rmino, rminh, awater, shctd, dispoff );
-        for (unsigned int ii = 0; ii < gpu->natoms; ii++) 
-        {    
+                        methodName, static_cast<unsigned int>(radii.size()), totalMaxWcaDispersionEnergy, epso, epsh, rmino, rminh, awater, shctd, dispoff );
+#ifdef PARAMETER_PRINT
+        for( unsigned int ii = 0; ii < gpu->natoms; ii++ ){    
            (void) fprintf( amoebaGpu->log, "%5u %15.7e %15.7e\n", ii, radii[ii], epsilons[ii] );
-            if( ii == maxPrint && ii < (paddedNumberOfAtoms - maxPrint) )
-            {
+            if( ii == maxPrint && ii < (paddedNumberOfAtoms - maxPrint) ){
+                (void) fprintf( amoebaGpu->log, "\n" );
                ii = (paddedNumberOfAtoms - maxPrint);
+                if( ii < maxPrint )ii = maxPrint;
            }
        } 
+        (void) fprintf( amoebaGpu->log, "\n" );
+#endif
        (void) fflush( amoebaGpu->log );
    }    
-#endif

 }

@@ -2770,37 +3006,39 @@ void amoebaGpuShutDown(amoebaGpuContext gpu)
 }

 extern "C"
-void amoebaGpuSetConstants(amoebaGpuContext amoebaGpu) 
+void amoebaGpuSetConstants(amoebaGpuContext amoebaGpu, int updateFlag ) // updateFlag == 0 additionally runs the guarded setup section below
 {
 
-    if( amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log, "In amoebaGpuSetConstants\n" );
-        (void) fflush( amoebaGpu->log );
-    }
+    if( updateFlag == 0 ){ // everything inside this block is skipped for a nonzero updateFlag
 
-    if( amoebaGpu->amoebaSim.dielec > 0.0f && amoebaGpu->amoebaSim.dwater > 0.0f ){
-        amoebaGpu->gpuContext->sim.preFactor = -amoebaGpu->amoebaSim.electric*((1.0f/amoebaGpu->amoebaSim.dielec)-(1.0f/amoebaGpu->amoebaSim.dwater));
+        if( amoebaGpu->log ){
+            (void) fprintf( amoebaGpu->log, "AmoebaGpuSetConstants %d\n", updateFlag );
+            (void) fflush( amoebaGpu->log );
+        }
+
+        if( amoebaGpu->amoebaSim.dielec > 0.0f && amoebaGpu->amoebaSim.dwater > 0.0f ){ // preFactor is left untouched unless both dielectrics are positive
+            amoebaGpu->gpuContext->sim.preFactor = -amoebaGpu->amoebaSim.electric*((1.0f/amoebaGpu->amoebaSim.dielec)-(1.0f/amoebaGpu->amoebaSim.dwater));
+        }
+        gpuSetAmoebaBondOffsets( amoebaGpu );
+        SetCalculateAmoebaLocalForcesSim( amoebaGpu );
+        SetCalculateAmoebaCudaWcaDispersionSim( amoebaGpu );
+        SetCalculateAmoebaKirkwoodSim( amoebaGpu );
+        SetCalculateAmoebaKirkwoodEDiffSim( amoebaGpu );
+        SetCalculateAmoebaGrycukSim(  amoebaGpu  ); // copies gpu->sim and amoebaSim into the Grycuk kernels' __constant__ symbols
     }
 
-    gpuSetAmoebaBondOffsets( amoebaGpu );
-    SetCalculateAmoebaLocalForcesSim( amoebaGpu );
-    SetForcesSim( amoebaGpu->gpuContext );
+    SetCalculateAmoebaCudaUtilitiesSim( amoebaGpu ); // from here on the calls run on every invocation, whatever updateFlag is
     SetCalculateAmoebaMultipoleForcesSim( amoebaGpu );
     SetCalculateAmoebaCudaFixedEFieldSim( amoebaGpu );
     SetCalculateAmoebaCudaVdw14_7Sim( amoebaGpu );
-    SetCalculateAmoebaCudaWcaDispersionSim( amoebaGpu );
     SetCalculateAmoebaCudaMutualInducedFieldSim( amoebaGpu );
     SetCalculateAmoebaCudaPmeMutualInducedFieldSim( amoebaGpu );
     SetCalculateAmoebaCudaPmeFixedEFieldSim( amoebaGpu );
     SetCalculateAmoebaElectrostaticSim( amoebaGpu );
     SetCalculateAmoebaPmeDirectElectrostaticSim( amoebaGpu );
     SetCalculateAmoebaCudaMapTorquesSim( amoebaGpu );
-    SetCalculateAmoebaKirkwoodSim( amoebaGpu );
-    SetCalculateAmoebaCudaUtilitiesSim( amoebaGpu );
-    SetCalculateAmoebaKirkwoodEDiffSim( amoebaGpu );
     SetCalculateAmoebaCudaFixedEAndGKFieldsSim( amoebaGpu );
     SetCalculateAmoebaCudaMutualInducedAndGkFieldsSim( amoebaGpu );
-    SetCalculateObcGbsaForces2Sim(  amoebaGpu->gpuContext  );
     SetCalculateAmoebaPMESim( amoebaGpu );
 }


--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
@@ -124,9 +124,15 @@ extern void GetCalculateAmoebaKirkwoodEDiffSim( amoebaGpuContext amoebaGpu );
 //extern void cudaComputeAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu );
 extern void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu );

-extern void SetCalculateAmoebaObcGbsaBornSumSim( gpuContext gpu );
-extern void GetCalculateAmoebaObcGbsaBornSumSim( gpuContext gpu );
-extern void cudaComputeAmoebaBornRadii( amoebaGpuContext amoebaGpu );
+//extern void SetCalculateAmoebaObcGbsaBornSumSim( gpuContext gpu );
+//extern void GetCalculateAmoebaObcGbsaBornSumSim( gpuContext gpu );
+//extern void cudaComputeAmoebaBornRadii( amoebaGpuContext amoebaGpu );
+extern void kCalculateAmoebaGrycukBornRadii( amoebaGpuContext amoebaGpu );
+extern void kReduceGrycukGbsaBornSum( amoebaGpuContext gpu );
+extern void SetCalculateAmoebaGrycukSim(amoebaGpuContext amoebaGpu );
+extern void GetCalculateAmoebaGrycukSim(amoebaGpuContext amoebaGpu );
+extern void kCalculateGrycukGbsaForces2( amoebaGpuContext amoebaGpu );
+

 // OBC -- Part 1
 //extern void SetCalculateObcGbsaForces1Sim(gpuContext gpu);

--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
@@ -288,6 +288,11 @@ void gpuSetAmoebaObcParameters( amoebaGpuContext amoebaGpu , float innerDielectr
                                const std::vector<float>& radius, const std::vector<float>& scale, const std::vector<float>& charge,
                                int includeCavityTerm, float probeRadius, float surfaceAreaFactor);

+extern "C"
+void gpuSetAmoebaGrycukParameters( amoebaGpuContext amoebaGpu , float innerDielectric, float solventDielectric, float dielectricOffset,
+                                   const std::vector<float>& radius, const std::vector<float>& scale, const std::vector<float>& charge,
+                                   int includeCavityTerm, float probeRadius, float surfaceAreaFactor);
+
 extern "C"
 void gpuSetAmoebaVdwParameters( amoebaGpuContext amoebaGpu,
                                const std::vector<int>& indexIVs, 
@@ -313,7 +318,7 @@ void gpuSetAmoebaWcaDispersionParameters( amoebaGpuContext amoebaGpu,
                                const float awater, const float shctd, const float dispoff );
 
 extern "C"
-void amoebaGpuSetConstants(amoebaGpuContext gpu);
+void amoebaGpuSetConstants(amoebaGpuContext gpu, int updateFlag );

 extern "C"
 void gpuSetAmoebaBondOffsets(amoebaGpuContext gpu);

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaGrycuk.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaGrycuk.cu
+//----------------------------------------------------------------------------------------- 
+//----------------------------------------------------------------------------------------- 
+
+#include "cudaKernels.h"
+#include "amoebaCudaKernels.h"
+
+//#define AMOEBA_DEBUG
+#undef AMOEBA_DEBUG
+
+static __constant__ cudaGmxSimulation cSim;
+static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;
+
+void SetCalculateAmoebaGrycukSim(amoebaGpuContext amoebaGpu)
+{
+    // Copy the host-side simulation structs into this file's __constant__
+    // symbols (cSim, cAmoebaSim); each copy is checked with RTERROR.
+
+    gpuContext gpuCtx = amoebaGpu->gpuContext;
+
+    cudaError_t err   = cudaMemcpyToSymbol(cSim, &gpuCtx->sim, sizeof(cudaGmxSimulation));
+    RTERROR(err, "SetCalculateAmoebaGrycukSim: cudaMemcpyToSymbol: SetSim copy to cSim failed");
+
+    err               = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation));
+    RTERROR(err, "SetCalculateAmoebaGrycukSim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed");
+}
+
+void GetCalculateAmoebaGrycukSim(amoebaGpuContext amoebaGpu)
+{
+    // Read this file's __constant__ symbols (cSim, cAmoebaSim) back into the
+    // host-side simulation structs; each copy is checked with RTERROR.
+
+    gpuContext gpuCtx = amoebaGpu->gpuContext;
+
+    cudaError_t err   = cudaMemcpyFromSymbol(&gpuCtx->sim, cSim, sizeof(cudaGmxSimulation));
+    RTERROR(err, "GetCalculateAmoebaGrycukSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed");
+
+    err               = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));
+    RTERROR(err, "GetCalculateAmoebaGrycukSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
+}
+
+struct GrycukParticle { // per-atom data for the Grycuk Born-radii kernels; sizeof(GrycukParticle) sizes the dynamic shared memory at launch
+
+    float x; // position components, filled from cSim.pPosq by loadGrycukShared()
+    float y;
+    float z;
+
+    float radius; // cSim.pObcData[].x: radius (TINKER rsolv)
+    float scaledRadius; // cSim.pObcData[].y: radius*overlap scale factor (TINKER rsolv*shct)
+
+    float bornSum; // Born-sum field; reset to 0 by zeroGrycukParticleSharedField()
+};
+
+__device__ void loadGrycukShared( struct GrycukParticle* sA, unsigned int atomI )
+{
+    // Fill *sA with atom atomI's position (cSim.pPosq) and its radius and
+    // scaled radius (cSim.pObcData .x/.y); bornSum is not touched here.
+
+    GrycukParticle& particle     = *sA;
+
+    particle.radius              = cSim.pObcData[atomI].x;
+    particle.scaledRadius        = cSim.pObcData[atomI].y;
+
+    particle.x                   = cSim.pPosq[atomI].x;
+    particle.y                   = cSim.pPosq[atomI].y;
+    particle.z                   = cSim.pPosq[atomI].z;
+
+}
+
+__device__ void calculateGrycukBornRadiiPairIxn_kernel( GrycukParticle& atomI, GrycukParticle& atomJ, float*  bornSum ){
+
+    /*
+     * Contribution of atom J to the Grycuk Born sum of atom I.
+     *
+     * radius:       radius (TINKER rsolv)
+     * scaledRadius: radius*overlap scale factor (TINKER rsolv*shct)
+     *
+     * *bornSum is overwritten with the pair term (it is not accumulated into)
+     * and is left at 0 when atomI.radius <= 0.
+     */
+
+    *bornSum = 0.0f;
+    if( atomI.radius <= 0.0f ){
+        return;
+    }
+
+    // separation of the two atoms
+
+    const float dx       = atomJ.x - atomI.x;
+    const float dy       = atomJ.y - atomI.y;
+    const float dz       = atomJ.z - atomI.z;
+
+    const float dist2    = dx*dx + dy*dy + dz*dz;
+    const float dist     = sqrt(dist2);
+
+    const float sk       = atomJ.scaledRadius;
+    const float sk2      = sk*sk;
+
+    // lower limit depends on how the two spheres are arranged; when
+    // radius(I) + dist < sk an extra 1/u^3 - 1/l^3 term is subtracted first
+
+    float lower;
+    if( (atomI.radius + dist) < sk ){
+        const float inner   = atomI.radius;
+        const float outer   = sk - dist;
+        const float inner3  = inner*inner*inner;
+        const float outer3  = outer*outer*outer;
+        *bornSum           -= (1.0f/outer3 - 1.0f/inner3);
+        lower               = sk - dist;
+    } else if( dist < (atomI.radius + sk) ){
+        lower               = atomI.radius;
+    } else {
+        lower               = dist - sk;
+    }
+    const float upper    = dist + sk;
+
+    // powers of the limits used by the closed-form pair term
+
+    const float lower2   = lower*lower;
+    const float lower4   = lower2*lower2;
+    const float lowerR   = lower*dist;
+    const float lower4R  = lower4*dist;
+
+    const float upper2   = upper*upper;
+    const float upper4   = upper2*upper2;
+    const float upperR   = upper*dist;
+    const float upper4R  = upper4*dist;
+
+    const float pairTerm = (3.0f*(dist2-sk2)+6.0f*upper2-8.0f*upperR)/upper4R - (3.0f*(dist2-sk2)+6.0f*lower2-8.0f*lowerR)/lower4R;
+    *bornSum            += pairTerm/16.0f;
+
+}
+
+__device__ void zeroGrycukParticleSharedField( struct GrycukParticle* sA )
+{
+    // reset the particle's Born-sum field; nothing else in *sA is modified
+
+    GrycukParticle& particle  = *sA;
+    particle.bornSum          = 0.0f;
+}
+
+__global__ 
+__launch_bounds__(384, 1)
+void kReduceGrycukGbsaBornSum_kernel()
+{
+    // Grid-stride loop over atoms: each thread starts at its global index and
+    // advances by the total number of launched threads.
+
+    const unsigned int totalThreads = gridDim.x * blockDim.x;
+
+    for( unsigned int atom = (blockIdx.x * blockDim.x + threadIdx.x); atom < cSim.atoms; atom += totalThreads ){
+
+        // Add up this atom's entry from every nonbond output buffer;
+        // consecutive buffers are cSim.stride floats apart.
+
+        float bornSum   = 0.0f;
+        float* pEntry   = cSim.pBornSum + atom;
+        for( int buffer = 0; buffer < cSim.nonbondOutputBuffers; buffer++, pEntry += cSim.stride ){
+            bornSum += *pEntry;
+        }
+
+        // Born radius = ( 1/radius^3 - sum )^(-1/3); a non-positive argument
+        // is replaced by a Born radius of 1000.
+
+        float invRadius3        = cSim.pObcData[atom].x;
+        invRadius3              = 1.0f/(invRadius3*invRadius3*invRadius3);
+
+        const float argument    = invRadius3 - bornSum;
+        cSim.pBornRadii[atom]   = argument <= 0.0f ? 1000.0f : pow( argument, -1.0f/3.0f );
+    }
+}
+
+/**---------------------------------------------------------------------------------------
+
+   Reduce Born radii
+
+   @param amoebaGpu        amoebaGpu context
+
+   --------------------------------------------------------------------------------------- */
+
+void kReduceGrycukGbsaBornSum( amoebaGpuContext amoebaGpu )
+{
+
+    kReduceGrycukGbsaBornSum_kernel<<<amoebaGpu->gpuContext->sim.blocks, 384>>>();
+    LAUNCHERROR("kReduceGrycukGbsaBornSum");
+
+    if( 1 ){
+        static int callId                    = 0;
+        gpuContext gpu                       = amoebaGpu->gpuContext;
+        std::vector<int> fileId;
+        fileId.push_back( callId++ );
+        VectorOfDoubleVectors outputVector;
+        cudaLoadCudaFloatArray( gpu->natoms,  1, gpu->psBornRadii, outputVector, gpu->psAtomIndex->_pSysData, 1.0f ); 
+        cudaWriteVectorOfDoubleVectorsToFile( "BornRGry", fileId, outputVector );
+    }   
+}
+
+// Include versions of the kernels for N^2 calculations.
+
+#undef USE_OUTPUT_BUFFER_PER_WARP
+#define METHOD_NAME(a, b) a##N2##b
+#include "kCalculateAmoebaCudaGrycukBornRadii.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##N2ByWarp##b
+#include "kCalculateAmoebaCudaGrycukBornRadii.h"
+
+/**---------------------------------------------------------------------------------------
+
+   Compute Born radii using Grycuk algorithm
+
+   Launches the N^2 Born-sum kernel; the per-warp output-buffer variant is used
+   when gpu->bOutputBufferPerWarp is set. The threads/block value is chosen on
+   the first call and cached in a function-local static.
+
+   @param amoebaGpu        amoebaGpu context
+
+   --------------------------------------------------------------------------------------- */
+
+
+void kCalculateAmoebaGrycukBornRadii( amoebaGpuContext amoebaGpu )
+{
+
+#ifdef AMOEBA_DEBUG
+    static const char* methodName       = "kCalculateAmoebaGrycukBornRadii";
+    static int timestep = 0;
+    std::vector<int> fileId;
+    timestep++;
+    fileId.resize( 2 );
+    fileId[0] = timestep;
+    fileId[1] = 1;
+#endif
+
+    // ---------------------------------------------------------------------------------------
+
+    gpuContext gpu = amoebaGpu->gpuContext;
+
+    // apparently debug array can take up nontrivial no. registers
+
+#ifdef AMOEBA_DEBUG
+    if( amoebaGpu->log ){
+        // the format has three conversions; a stray fourth argument line after the
+        // closing parenthesis used to break the build whenever AMOEBA_DEBUG was defined
+        (void) fprintf( amoebaGpu->log, "%s %d maxCovalentDegreeSz=%d ZZZ\n",
+                        methodName, gpu->natoms, amoebaGpu->maxCovalentDegreeSz );
+    }
+    int paddedNumberOfAtoms                   = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
+    CUDAStream<float4>* debugArray            = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
+    memset( debugArray->_pSysData,      0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
+    debugArray->Upload();
+    unsigned int targetAtom                   = 0;
+
+    gpu->psBornRadii->Download();
+    if( amoebaGpu->log ){
+        (void) fprintf( amoebaGpu->log, "Grycuk input\n" ); (void) fflush( amoebaGpu->log );
+        for( int ii = 0; ii < amoebaGpu->gpuContext->sim.paddedNumberOfAtoms; ii++ ){
+            (void) fprintf( amoebaGpu->log,"Born %6d %16.9e\n", ii,
+                            gpu->psBornRadii->_pSysData[ii] );
+        }
+    }
+#endif
+
+    // on first pass, set threads/block and based on that setting the energy buffer array
+
+    static unsigned int threadsPerBlock = 0;
+    if( threadsPerBlock == 0 ){
+        unsigned int maxThreads;
+        if (gpu->sm_version >= SM_20)
+            //maxThreads = 384;
+            maxThreads = 512;
+        else if (gpu->sm_version >= SM_12)
+            maxThreads = 128;
+        else
+            maxThreads = 64;
+        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(GrycukParticle), gpu->sharedMemoryPerBlock ), maxThreads);
+
+#ifdef AMOEBA_DEBUG
+        if( amoebaGpu->log ){
+            (void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaGrycuk: blcks=%u tds=%u %u bPrWrp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u\n",
+                            gpu->sim.nonbond_blocks, threadsPerBlock, maxThreads, gpu->bOutputBufferPerWarp,
+                            sizeof(GrycukParticle), sizeof(GrycukParticle)*threadsPerBlock,
+                            (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
+            (void) fflush( amoebaGpu->log );
+        }
+#endif
+
+    }
+
+#ifdef AMOEBA_DEBUG
+    if( amoebaGpu->log ){
+        (void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaGrycukN2Forces%swarp:  numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u\n",
+                        (gpu->bOutputBufferPerWarp ? " " : " no "), gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
+                        sizeof(GrycukParticle), sizeof(GrycukParticle)*threadsPerBlock,
+                        (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
+        (void) fflush( amoebaGpu->log );
+    }
+#endif
+
+    // one GrycukParticle of dynamic shared memory per thread
+
+    if (gpu->bOutputBufferPerWarp){
+        kCalculateAmoebaGrycukBornRadiiN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(GrycukParticle)*threadsPerBlock>>>( gpu->psWorkUnit->_pDevData);
+    } else {
+       kCalculateAmoebaGrycukBornRadiiN2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(GrycukParticle)*threadsPerBlock>>>( gpu->psWorkUnit->_pDevData);
+    }
+    LAUNCHERROR("kCalculateAmoebaCudaGrycukN2Forces");
+
+#ifdef AMOEBA_DEBUG
+    // debugArray is never passed to the kernels above; release it so a debug
+    // build does not leak one allocation per call
+    delete debugArray;
+#endif
+
+   // ---------------------------------------------------------------------------------------
+}
+
+// Born radius chain rule component for Grycuk: per-atom data used by the chain-rule force kernels
+
+struct GrycukChainRuleParticle { // sizeof(GrycukChainRuleParticle) is passed to getThreadsPerBlock() when sizing the launch
+
+    float x; // position components, filled from cSim.pPosq by loadGrycukChainRuleParticleShared()
+    float y;
+    float z;
+
+    float radius; // cSim.pObcData[].x
+    float scaledRadius; // cSim.pObcData[].y
+
+    float bornRadius; // cSim.pBornRadii[]
+    float bornForce; // cSim.pBornForce[]
+
+    float force[3]; // force components; zeroed by zeroGrycukChainRuleParticleSharedField()
+};
+
+__device__ void loadGrycukChainRuleParticleShared( struct GrycukChainRuleParticle* sA, unsigned int atomI )
+{
+    // Fill *sA with atom atomI's position (cSim.pPosq), radius and scaled
+    // radius (cSim.pObcData .x/.y), Born radius and Born force; the force
+    // accumulator is not touched here.
+
+    GrycukChainRuleParticle& particle = *sA;
+
+    particle.bornRadius          = cSim.pBornRadii[atomI];
+    particle.bornForce           = cSim.pBornForce[atomI];
+
+    particle.radius              = cSim.pObcData[atomI].x;
+    particle.scaledRadius        = cSim.pObcData[atomI].y;
+
+    particle.x                   = cSim.pPosq[atomI].x;
+    particle.y                   = cSim.pPosq[atomI].y;
+    particle.z                   = cSim.pPosq[atomI].z;
+
+}
+
+__device__ void zeroGrycukChainRuleParticleSharedField( struct GrycukChainRuleParticle* sA )
+{
+    // clear all three force components; the other fields keep their values
+
+    for( int component = 0; component < 3; component++ ){
+        sA->force[component]     = 0.0f;
+    }
+
+}
+
+//#define AMOEBA_DEBUG
+
+__device__ void calculateGrycukChainRulePairIxn_kernel( GrycukChainRuleParticle& atomI, GrycukChainRuleParticle& atomJ, float force[3] 
+#ifdef AMOEBA_DEBUG
+,  float4 pullDebug[5]
+#endif
+){
+
+    // Chain-rule (Born-radius derivative) force term for the pair (I,J):
+    // force[] is overwritten with (rJ - rI)*de, where de combines the
+    // derivative of atom I's Born radius with respect to the pair distance
+    // and atomI.bornForce.
+
+    const float pi         = 3.1415926535897f;
+    float third            = 1.0f/3.0f;
+    float pi43             = 4.0f*third*pi;
+
+    // prefactor depending only on atom I's Born radius
+
+    float prefactor        = -pow(pi,third)*pow(6.0f,(2.0f*third))/9.0f;
+    float bornTerm         = pi43/(atomI.bornRadius*atomI.bornRadius*atomI.bornRadius);
+          bornTerm         = prefactor/pow( bornTerm, (4.0f*third) );
+
+    // pair geometry
+
+    float dx               = atomJ.x - atomI.x;
+    float dy               = atomJ.y - atomI.y;
+    float dz               = atomJ.z - atomI.z;
+
+    float sk               = atomJ.scaledRadius;
+    float sk2              = sk*sk;
+    float dist2            = dx*dx + dy*dy + dz*dz;
+    float dist             = sqrt(dist2);
+    float de               = 0.0f;
+
+    // lower-limit contribution; the first case (radius(I) + dist < sk) also
+    // carries the extra -4*pi/(sk - dist)^4 term
+
+    float lower, lower4;
+    if( (atomI.radius + dist) < sk ){
+        float gap   = sk - dist;
+        float gap4  = gap*gap;
+        gap4        = gap4*gap4;
+        de          = -4.0f*pi/gap4;
+
+        lower       = sk - dist;
+        lower4      = lower*lower;
+        lower4      = lower4*lower4;
+        de         += 0.25f*pi*(sk2-4.0f*sk*dist+17.0f*dist2)/ (dist2*lower4);
+    } else if( dist < (atomI.radius +sk) ){
+        lower       = atomI.radius;
+        lower4      = lower*lower;
+        lower4      = lower4*lower4;
+        de         += 0.25f*pi*(2.0f*atomI.radius*atomI.radius-sk2-dist2)/ (dist2*lower4);
+    } else {
+        lower       = dist - sk;
+        lower4      = lower*lower;
+        lower4      = lower4*lower4;
+        de         += 0.25f*pi*(sk2-4.0f*sk*dist+dist2)/ (dist2*lower4);
+    }
+
+    // upper-limit contribution
+
+    float upper     = dist + sk;
+    float upper4    = upper*upper;
+    upper4          = upper4*upper4;
+    
+    de             -= 0.25f*pi*(sk2+4.0f*sk*dist+dist2)/ (dist2*upper4);
+    float dbr       = bornTerm * de/dist;
+          de        = dbr*atomI.bornForce;
+
+#ifdef AMOEBA_DEBUG
+    pullDebug[0].x = de;
+    pullDebug[0].y = dist;
+    pullDebug[0].z = prefactor;
+    pullDebug[0].w = -4.0f;
+
+    pullDebug[1].x = atomI.bornForce/4.184f;
+    pullDebug[1].y = atomI.bornRadius;
+    pullDebug[1].z = atomJ.bornForce/4.184f;
+    pullDebug[1].w = -5.0f;
+#endif
+
+    force[0]   = dx*de;
+    force[1]   = dy*de;
+    force[2]   = dz*de;
+}
+
+// Include versions of the kernels for N^2 calculations.
+
+#undef METHOD_NAME
+#undef USE_OUTPUT_BUFFER_PER_WARP
+#define METHOD_NAME(a, b) a##N2##b
+#include "kCalculateAmoebaCudaGrycukChainRule.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##N2ByWarp##b
+#include "kCalculateAmoebaCudaGrycukChainRule.h"
+
+/**---------------------------------------------------------------------------------------
+
+   Compute Grycuk chain rule contribution to force
+
+   Launches the N2 Grycuk chain-rule kernel over gpu->sim.nonbond_blocks blocks; the
+   by-warp variant is used when gpu->bOutputBufferPerWarp is set. The threads-per-block
+   value is computed on the first call and kept in a function-local static.
+
+   When compiled with AMOEBA_DEBUG a temporary debug stream is allocated, handed to the
+   kernel, optionally written to amoebaGpu->log, and released before returning.
+
+   @param amoebaGpu        amoebaGpu context
+
+   --------------------------------------------------------------------------------------- */
+
+
+void kCalculateGrycukGbsaForces2( amoebaGpuContext amoebaGpu )
+{
+
+#ifdef AMOEBA_DEBUG
+    static const char* methodName       = "kCalculateGrycukGbsaForces2";
+    static int timestep = 0;
+    std::vector<int> fileId;
+    timestep++;
+    fileId.resize( 2 );
+    fileId[0] = timestep;
+    fileId[1] = 1;
+#endif
+
+    // ---------------------------------------------------------------------------------------
+
+    gpuContext gpu = amoebaGpu->gpuContext;
+
+    // apparently debug array can take up nontrivial no. registers
+
+#ifdef AMOEBA_DEBUG
+    if( amoebaGpu->log ){
+        (void) fprintf( amoebaGpu->log, "%s %d maxCovalentDegreeSz=%d ZZZ\n",
+                        methodName, gpu->natoms, amoebaGpu->maxCovalentDegreeSz );
+    }
+    int paddedNumberOfAtoms                   = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
+
+    // 20 float4 rows per padded atom, zeroed on the host and uploaded before the launch;
+    // owned by this function and deleted after the debug output below
+    CUDAStream<float4>* debugArray            = new CUDAStream<float4>(20*paddedNumberOfAtoms, 1, "DebugArray");
+    memset( debugArray->_pSysData,      0, sizeof( float )*4*20*paddedNumberOfAtoms);
+    debugArray->Upload();
+    unsigned int targetAtom                   = 0;
+
+    gpu->psBornRadii->Download();
+    if( amoebaGpu->log ){
+        (void) fprintf( amoebaGpu->log, "Grycuk input\n" ); (void) fflush( amoebaGpu->log );
+        for( int ii = 0; ii < amoebaGpu->gpuContext->sim.paddedNumberOfAtoms; ii++ ){
+            (void) fprintf( amoebaGpu->log,"Born %6d %16.9e\n", ii,
+                            gpu->psBornRadii->_pSysData[ii] );
+        }
+    }
+#endif
+
+    // on first pass, set threads/block and based on that setting the energy buffer array
+
+    static unsigned int threadsPerBlock = 0;
+    if( threadsPerBlock == 0 ){
+        unsigned int maxThreads;
+        if (gpu->sm_version >= SM_20)
+            //maxThreads = 384;
+            maxThreads = 512;
+        else if (gpu->sm_version >= SM_12)
+            maxThreads = 128;
+        else
+            maxThreads = 64;
+        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(GrycukChainRuleParticle), gpu->sharedMemoryPerBlock ), maxThreads);
+
+#ifdef AMOEBA_DEBUG
+        if( amoebaGpu->log ){
+            (void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaGrycuk: blcks=%u tds=%u %u bPrWrp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u\n",
+                            gpu->sim.nonbond_blocks, threadsPerBlock, maxThreads, gpu->bOutputBufferPerWarp,
+                            sizeof(GrycukChainRuleParticle), sizeof(GrycukChainRuleParticle)*threadsPerBlock,
+                            (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
+            (void) fflush( amoebaGpu->log );
+        }
+#endif
+
+    }
+
+#ifdef AMOEBA_DEBUG
+    if( amoebaGpu->log ){
+        (void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaGrycukN2Forces%swarp:  numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u\n",
+                        (gpu->bOutputBufferPerWarp ? " " : " no "), gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
+                        sizeof(GrycukChainRuleParticle), sizeof(GrycukChainRuleParticle)*threadsPerBlock,
+                        (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
+        (void) fflush( amoebaGpu->log );
+    }
+#endif
+
+//kClearForces( gpu );
+
+    // dynamic shared memory: one GrycukChainRuleParticle per thread in the block
+
+    if (gpu->bOutputBufferPerWarp){
+        kCalculateAmoebaGrycukChainRuleN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(GrycukChainRuleParticle)*threadsPerBlock>>>( gpu->psWorkUnit->_pDevData
+#ifdef AMOEBA_DEBUG
+    ,debugArray->_pDevData, targetAtom
+#endif
+);
+    } else {
+        kCalculateAmoebaGrycukChainRuleN2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(GrycukChainRuleParticle)*threadsPerBlock>>>( gpu->psWorkUnit->_pDevData
+#ifdef AMOEBA_DEBUG
+    ,debugArray->_pDevData, targetAtom
+#endif
+);
+    }
+    LAUNCHERROR("kCalculateAmoebaCudaGrycukN2Forces");
+
+#ifdef AMOEBA_DEBUG
+    if( amoebaGpu->log ){
+        debugArray->Download();
+        for( int jj = 0; jj < gpu->natoms; jj++ ){
+            int debugIndex = jj; 
+            (void) fprintf( amoebaGpu->log,"%5d %5d DebugGrycukChain\n", targetAtom, jj );
+
+            // rows for atom jj are paddedNumberOfAtoms entries apart
+            for( int kk = 0; kk < 7; kk++ ){
+                (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
+                                debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
+                                debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
+                debugIndex += paddedNumberOfAtoms;
+            }   
+            (void) fprintf( amoebaGpu->log,"\n" );
+        }   
+    }   
+
+    // release the debug stream whether or not a log is attached;
+    // it was previously leaked on every call
+    delete debugArray;
+#endif
+
+   // disabled diagnostic dump: writes forces, Born forces and Born radii to file and exits
+
+   if( 0 ){ 
+        static int callId                    = 0;
+        gpuContext gpu                       = amoebaGpu->gpuContext;
+        std::vector<int> fileId;
+        fileId.push_back( callId++ ); 
+        VectorOfDoubleVectors outputVector;
+        //cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,              outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
+        //cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psLabFrameDipole,     outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
+        CUDAStream<float>* temp  = new CUDAStream<float>(3*gpu->sim.paddedNumberOfAtoms, 1, "Temp1");
+
+        reduceAndCopyCUDAStreamFloat4( gpu->psForce4, temp, 1.0 );
+        cudaLoadCudaFloatArray( gpu->natoms,  3, temp, outputVector, gpu->psAtomIndex->_pSysData, 1.0f/4.184f ); 
+        cudaLoadCudaFloatArray( gpu->natoms,  1, gpu->psBornForce, outputVector, gpu->psAtomIndex->_pSysData, 1.0f/4.184f ); 
+        cudaLoadCudaFloatArray( gpu->natoms,  1, gpu->psBornRadii, outputVector, gpu->psAtomIndex->_pSysData, 1.0f ); 
+        cudaWriteVectorOfDoubleVectorsToFile( "GryF", fileId, outputVector );
+        delete temp;
+        exit(0);
+    }    
+
+   // ---------------------------------------------------------------------------------------
+}
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaGrycukBornRadii.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaGrycukBornRadii.h
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "amoebaScaleFactors.h"
+
+/**
+ * Accumulates pairwise Born-sum contributions into cSim.pBornSum
+ * (the kernel-name suffix is supplied by METHOD_NAME).
+ *
+ * Each group of GRID consecutive threads takes a contiguous slice of the work-unit
+ * list. A work unit decodes to two tile origins (x, y): thread lane tgx owns atom
+ * x + tgx in registers, while the atoms of tile y are staged in shared memory.
+ * Pairs involving an atom index >= cSim.atoms (padding) contribute zero.
+ *
+ * NOTE(review): shared entries written by one lane are read by other lanes with no
+ * explicit synchronization; presumably this relies on GRID matching the warp size.
+ *
+ * @param workUnit  encoded work units, decoded with decodeCell()
+ */
+__global__
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
+#elif (__CUDA_ARCH__ >= 120)
+__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
+#else
+__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
+#endif
+void METHOD_NAME(kCalculateAmoebaGrycukBornRadii, _kernel)( unsigned int* workUnit ){
+
+    extern __shared__ GrycukParticle sA[];
+
+    // split the work-unit list evenly across all GRID-sized thread groups in the launch
+
+    unsigned int totalWarps      = gridDim.x*blockDim.x/GRID;
+    unsigned int warp            = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
+    unsigned int numWorkUnits    = cSim.pInteractionCount[0];
+    unsigned int pos             = warp*numWorkUnits/totalWarps;
+    unsigned int end             = (warp+1)*numWorkUnits/totalWarps;
+
+    // y tile whose atoms were last loaded into shared memory by the off-diagonal branch
+
+    unsigned int lasty           = 0xFFFFFFFF;
+
+    while (pos < end)
+    {
+
+        unsigned int x;
+        unsigned int y;
+        bool bExclusionFlag;
+
+        // Extract cell coordinates
+
+        decodeCell( workUnit[pos], &x, &y, &bExclusionFlag );
+
+        unsigned int tgx                 = threadIdx.x & (GRID - 1);   // lane within the group
+        unsigned int tbx                 = threadIdx.x - tgx;          // first thread of the group
+        unsigned int tj                  = tgx;
+
+        GrycukParticle*  psA             = &sA[tbx];
+        unsigned int atomI               = x + tgx;
+        GrycukParticle localParticle;
+        loadGrycukShared( &localParticle, atomI );
+
+        float bornSum                    = 0.0f;
+
+        if (x == y) // Handle diagonals uniquely at 50% efficiency
+        {
+
+            // load shared data
+
+            loadGrycukShared( &(sA[threadIdx.x]), atomI );
+
+            for (unsigned int j = 0; j < GRID; j++)
+            {
+                float localBornSum;
+                calculateGrycukBornRadiiPairIxn_kernel( localParticle, psA[j], &localBornSum );
+
+                // skip the self pair and any pair that touches a padding atom;
+                // 0.0f (not 0.0) keeps the accumulation in single precision
+
+                bornSum   +=  ( (atomI == (y + j)) || (atomI >= cSim.atoms) || ((y+j) >= cSim.atoms) ) ? 0.0f : localBornSum;
+            }
+
+            // Write results
+
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+            unsigned int offset    = x + tgx + warp*cSim.stride;
+            cSim.pBornSum[offset] += bornSum;
+#else
+            unsigned int offset   = x + tgx + (y >> GRIDBITS) * cSim.stride;
+            cSim.pBornSum[offset] = bornSum;
+#endif
+
+        } else {
+
+            // reload the shared tile only when y differs from the previous off-diagonal work unit
+
+            if (lasty != y) {
+                unsigned int atomJ        = y + tgx;
+                loadGrycukShared( &(sA[threadIdx.x]), atomJ );
+            }
+
+           // zero shared fields
+
+            zeroGrycukParticleSharedField(  &(sA[threadIdx.x]) );
+
+            // tj starts at the lane index and advances modulo GRID, so at every step
+            // each lane addresses a different shared entry
+
+            for (unsigned int j = 0; j < GRID; j++)
+            {
+
+                float localBornSum;
+                calculateGrycukBornRadiiPairIxn_kernel( localParticle, psA[tj], &localBornSum );
+                bornSum           +=  ( (atomI >= cSim.atoms) || ((y+tj) >= cSim.atoms) ) ? 0.0f : localBornSum;
+
+                // same pair with the roles swapped: contribution to the shared (y tile) atom
+
+                calculateGrycukBornRadiiPairIxn_kernel( psA[tj], localParticle, &localBornSum );
+                psA[tj].bornSum   +=  ( (atomI >= cSim.atoms) || ((y+tj) >= cSim.atoms) ) ? 0.0f : localBornSum;
+
+                tj                 = (tj + 1) & (GRID - 1);
+
+            }
+
+            // Write results
+
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+
+            unsigned int offset    = x + tgx + warp*cSim.stride;
+            cSim.pBornSum[offset] += bornSum;
+
+            offset = y + tgx + warp*cSim.stride;
+            cSim.pBornSum[offset] += sA[threadIdx.x].bornSum;
+#else
+
+            unsigned int offset   = x + tgx + (y >> GRIDBITS) * cSim.stride;
+            cSim.pBornSum[offset] = bornSum;
+
+            offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
+            cSim.pBornSum[offset] = sA[threadIdx.x].bornSum;
+#endif
+
+            lasty = y;
+        }
+
+        pos++;
+    }
+}
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaGrycukChainRule.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaGrycukChainRule.h
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "amoebaScaleFactors.h"
+
+/**
+ * Grycuk chain-rule force kernel (the kernel-name suffix is supplied by METHOD_NAME).
+ *
+ * Each group of GRID consecutive threads takes a contiguous slice of the work-unit
+ * list. A work unit decodes to two tile origins (x, y): thread lane tgx keeps atom
+ * x + tgx in a local particle, the atoms of tile y are staged in shared memory, and
+ * the accumulated forces are added into cSim.pForce4. Pairs that involve an atom
+ * index >= cSim.atoms are skipped.
+ *
+ * NOTE(review): shared entries are updated by other lanes with no explicit
+ * synchronization; presumably this relies on GRID matching the warp size.
+ *
+ * @param workUnit    encoded work units, decoded with decodeCell()
+ * @param debugArray  (AMOEBA_DEBUG only) output rows, addressed by atom index plus
+ *                    multiples of cSim.paddedNumberOfAtoms
+ * @param targetAtom  (AMOEBA_DEBUG only) atom whose pair interactions are recorded
+ */
+__global__
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
+#elif (__CUDA_ARCH__ >= 120)
+__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
+#else
+__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
+#endif
+void METHOD_NAME(kCalculateAmoebaGrycukChainRule, _kernel)( unsigned int* workUnit 
+#ifdef AMOEBA_DEBUG
+                           , float4* debugArray, unsigned int targetAtom
+#endif
+){
+
+    extern __shared__ GrycukChainRuleParticle sAChainRule[];
+
+    // split the work-unit list evenly across all GRID-sized thread groups in the launch
+
+    unsigned int totalWarps      = gridDim.x*blockDim.x/GRID;
+    unsigned int warp            = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
+    unsigned int numWorkUnits    = cSim.pInteractionCount[0];
+    unsigned int pos             = warp*numWorkUnits/totalWarps;
+    unsigned int end             = (warp+1)*numWorkUnits/totalWarps;
+
+    // y tile whose atoms were last loaded into shared memory by the off-diagonal branch
+
+    unsigned int lasty           = 0xFFFFFFFF;
+
+#ifdef AMOEBA_DEBUG
+    float4 pullDebug[5];
+#endif
+
+    while (pos < end)
+    {
+
+        unsigned int x;
+        unsigned int y;
+        bool bExclusionFlag;
+
+        // Extract cell coordinates
+
+        decodeCell( workUnit[pos], &x, &y, &bExclusionFlag );
+
+        // tgx: lane within the group; tbx: first thread of the group; tj: rotating shared index
+
+        unsigned int tgx                          = threadIdx.x & (GRID - 1);
+        unsigned int tbx                          = threadIdx.x - tgx;
+        unsigned int tj                           = tgx;
+
+        GrycukChainRuleParticle*  psAChainRule    = &sAChainRule[tbx];
+        unsigned int atomI                        = x + tgx;
+        GrycukChainRuleParticle localParticle;
+        loadGrycukChainRuleParticleShared( &localParticle, atomI );
+
+        zeroGrycukChainRuleParticleSharedField( &localParticle );
+
+        if (x == y){
+
+            // load shared data and zero force
+
+            loadGrycukChainRuleParticleShared( &(sAChainRule[threadIdx.x]), atomI );
+            zeroGrycukChainRuleParticleSharedField( &(sAChainRule[threadIdx.x]));
+
+            // visit every other lane of the tile, starting at tgx+1 and wrapping modulo GRID,
+            // so at every step each lane addresses a different shared entry
+
+            for (unsigned int j = (tgx+1)&(GRID-1); j != tgx; j = (j+1)&(GRID-1))
+            {
+                float localForce[3];
+                calculateGrycukChainRulePairIxn_kernel( localParticle, psAChainRule[j], localForce
+#ifdef AMOEBA_DEBUG
+,  pullDebug
+#endif
+ );
+                if( (atomI != (y + j)) && (atomI < cSim.atoms) && ((y+j) < cSim.atoms) ){
+
+                    // equal and opposite update: subtract from the local atom, add to the shared atom
+
+                    localParticle.force[0]     -= localForce[0];
+                    localParticle.force[1]     -= localForce[1];
+                    localParticle.force[2]     -= localForce[2];
+
+                    psAChainRule[j].force[0]   += localForce[0];
+                    psAChainRule[j].force[1]   += localForce[1];
+                    psAChainRule[j].force[2]   += localForce[2];
+
+#ifdef AMOEBA_DEBUG
+if( atomI == targetAtom || (y+j) == targetAtom ){
+        unsigned int index                 = (atomI == targetAtom) ? (y + j) : atomI;
+
+        debugArray[index].x                = (float) atomI;
+        debugArray[index].y                = (float) (y + j); 
+        debugArray[index].z                = -1.0f;
+        debugArray[index].w                = -1.0f;
+
+        index                             += cSim.paddedNumberOfAtoms;
+        debugArray[index].x                = (float) x;
+        debugArray[index].y                = (float) y;
+        debugArray[index].z                = (float) tgx;
+        debugArray[index].w                = -2.0f;
+
+        index                             += cSim.paddedNumberOfAtoms;
+        debugArray[index].x                = pullDebug[0].x;
+        debugArray[index].y                = pullDebug[0].y;
+        debugArray[index].z                = pullDebug[0].z;
+        debugArray[index].w                = pullDebug[0].w;
+
+        index                             += cSim.paddedNumberOfAtoms;
+        debugArray[index].x                = pullDebug[1].x;
+        debugArray[index].y                = pullDebug[1].y;
+        debugArray[index].z                = pullDebug[1].z;
+        debugArray[index].w                = pullDebug[1].w;
+
+        index                             += cSim.paddedNumberOfAtoms;
+        debugArray[index].x                = localForce[0];
+        debugArray[index].y                = localForce[1];
+        debugArray[index].z                = localForce[2];
+        debugArray[index].w                = -12.0f;
+
+ calculateGrycukChainRulePairIxn_kernel( psAChainRule[j], localParticle, localForce ,  pullDebug );
+
+        index                             += cSim.paddedNumberOfAtoms;
+        debugArray[index].x                = pullDebug[0].x;
+        debugArray[index].y                = pullDebug[0].y;
+        debugArray[index].z                = pullDebug[0].z;
+        debugArray[index].w                = -13.0f;
+
+        index                             += cSim.paddedNumberOfAtoms;
+        debugArray[index].x                = localForce[0];
+        debugArray[index].y                = localForce[1];
+        debugArray[index].z                = localForce[2];
+        debugArray[index].w                = -14.0f;
+}
+#endif
+
+                }
+            }
+
+            // Write results
+            // (on the diagonal both accumulators belong to atom x + tgx, so they are summed)
+            float4 of; 
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+            unsigned int offset         = x + tgx + warp*cSim.stride;
+#else
+            unsigned int offset         = x + tgx + (x >> GRIDBITS) * cSim.stride;
+#endif
+            of                          = cSim.pForce4[offset];
+            of.x                       += localParticle.force[0]  + sAChainRule[threadIdx.x].force[0];
+            of.y                       += localParticle.force[1]  + sAChainRule[threadIdx.x].force[1];
+            of.z                       += localParticle.force[2]  + sAChainRule[threadIdx.x].force[2];
+            cSim.pForce4[offset]       = of; 
+
+        } else {
+
+            // reload the shared tile only when y differs from the previous off-diagonal work unit
+
+            if (lasty != y) {
+                unsigned int atomJ        = y + tgx;
+                loadGrycukChainRuleParticleShared( &(sAChainRule[threadIdx.x]), atomJ );
+            }
+
+           // zero shared fields
+
+            zeroGrycukChainRuleParticleSharedField(  &(sAChainRule[threadIdx.x]) );
+
+            // tj starts at the lane index and advances modulo GRID on every iteration
+
+            for (unsigned int j = 0; j < GRID; j++)
+            {
+
+                if( (atomI < cSim.atoms) && ((y+tj) < cSim.atoms) ){
+                    float localForce[3];
+                    calculateGrycukChainRulePairIxn_kernel( localParticle, psAChainRule[tj], localForce 
+#ifdef AMOEBA_DEBUG
+,  pullDebug
+#endif
+);
+    
+                    localParticle.force[0]     -= localForce[0];
+                    localParticle.force[1]     -= localForce[1];
+                    localParticle.force[2]     -= localForce[2];
+    
+                    psAChainRule[tj].force[0]  += localForce[0];
+                    psAChainRule[tj].force[1]  += localForce[1];
+                    psAChainRule[tj].force[2]  += localForce[2];
+    
+#ifdef AMOEBA_DEBUG
+unsigned int index                 = (atomI == targetAtom) ? (y + tj) : atomI;
+if( atomI == targetAtom || (y+tj) == targetAtom ){
+
+        debugArray[index].x                = (float) atomI;
+        debugArray[index].y                = (float) (y + tj); 
+        debugArray[index].z                = -1.0f;
+        debugArray[index].w                = -1.0f;
+
+        index                             += cSim.paddedNumberOfAtoms;
+        debugArray[index].x                = (float) x;
+        debugArray[index].y                = (float) y;
+        debugArray[index].z                = (float) tgx;
+        debugArray[index].w                = -2.0f;
+
+        index                             += cSim.paddedNumberOfAtoms;
+        debugArray[index].x                = pullDebug[0].x;
+        debugArray[index].y                = pullDebug[0].y;
+        debugArray[index].z                = pullDebug[0].z;
+        debugArray[index].w                = pullDebug[0].w;
+
+        index                             += cSim.paddedNumberOfAtoms;
+        debugArray[index].x                = pullDebug[1].x;
+        debugArray[index].y                = pullDebug[1].y;
+        debugArray[index].z                = pullDebug[1].z;
+        debugArray[index].w                = pullDebug[1].w;
+
+        index                             += cSim.paddedNumberOfAtoms;
+        debugArray[index].x                = localForce[0];
+        debugArray[index].y                = localForce[1];
+        debugArray[index].z                = localForce[2];
+        debugArray[index].w                = -10.0f;
+}
+#endif
+                    // same pair with the particle roles swapped; the result is applied
+                    // with the opposite signs (added to the local atom, subtracted from the shared atom)
+                    calculateGrycukChainRulePairIxn_kernel( psAChainRule[tj], localParticle, localForce
+#ifdef AMOEBA_DEBUG
+,  pullDebug
+#endif
+ );
+#ifdef AMOEBA_DEBUG
+if( atomI == targetAtom || (y+tj) == targetAtom ){
+        index                             += cSim.paddedNumberOfAtoms;
+        debugArray[index].x                = pullDebug[0].x;
+        debugArray[index].y                = localForce[1];
+        debugArray[index].z                = localForce[2];
+        debugArray[index].w                = -11.0f;
+}
+#endif
+    
+                    localParticle.force[0]     += localForce[0];
+                    localParticle.force[1]     += localForce[1];
+                    localParticle.force[2]     += localForce[2];
+    
+                    psAChainRule[tj].force[0]  -= localForce[0];
+                    psAChainRule[tj].force[1]  -= localForce[1];
+                    psAChainRule[tj].force[2]  -= localForce[2];
+                }
+
+                tj  = (tj + 1) & (GRID - 1);
+
+            }
+
+            // Write results
+            // (first the x-tile atom held locally, then the y-tile atom held in shared memory)
+
+            float4 of;
+
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+            unsigned int offset         = x + tgx + warp*cSim.stride;
+#else
+            unsigned int offset         = x + tgx + (y >> GRIDBITS) * cSim.stride;
+#endif
+            of                          = cSim.pForce4[offset];
+            of.x                       += localParticle.force[0];
+            of.y                       += localParticle.force[1];
+            of.z                       += localParticle.force[2];
+            cSim.pForce4[offset]       = of;
+
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+            offset                      = y + tgx + warp*cSim.stride;
+#else
+            offset                      = y + tgx + (x >> GRIDBITS) * cSim.stride;
+#endif
+            of                          = cSim.pForce4[offset];
+            of.x                       += sAChainRule[threadIdx.x].force[0];
+            of.y                       += sAChainRule[threadIdx.x].force[1];
+            of.z                       += sAChainRule[threadIdx.x].force[2];
+            cSim.pForce4[offset]       = of;
+
+            lasty = y;
+        }
+
+        pos++;
+    }
+}
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
@@ -1666,8 +1666,8 @@ __launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
 #else
 __launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
 #endif
- void kReduceToBornForcePrefactor_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn1, float* fieldIn2,
-                                          float* fieldOut )
+ void kReduceToObcBornForcePrefactor_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn1, float* fieldIn2,
+                                             float* fieldOut )
 {
    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;

@@ -1723,8 +1723,8 @@ __launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
 #else
 __launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
 #endif
-void kReduceToBornForcePrefactorAndSASA_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn1, float* fieldIn2,
-                                                float* fieldOut )
+void kReduceToObcBornForcePrefactorAndSASA_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn1, float* fieldIn2,
+                                                   float* fieldOut )
 {
    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;

@@ -1788,17 +1788,137 @@ void kReduceToBornForcePrefactorAndSASA_kernel( unsigned int fieldComponents, un
    cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += energy / -6.0f;
 }

-/*
-static void kReduceAndCombine_dBorn(amoebaGpuContext amoebaGpu )
+// Sums, per component, the outputBuffers partial values of fieldIn1 and fieldIn2 and stores the unscaled total in fieldOut.
+__global__
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
+#elif (__CUDA_ARCH__ >= 120)
+__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
+#else
+__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
+#endif
+ void kReduceToBornForcePrefactor_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn1, float* fieldIn2,
+                                          float* fieldOut )
+{
+    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // Reduce field: each thread handles components pos, pos + gridDim.x*blockDim.x, ...
+
+    while (pos < fieldComponents)
+    {
+
+        float totalField = 0.0f;
+        // pFt1/pFt2 walk the buffers for this component; consecutive buffers are fieldComponents floats apart
+        float* pFt1      = fieldIn1 + pos;
+        float* pFt2      = fieldIn2 + pos;
+
+        // Born radius and OBC chain factor are not read here: the scaling by them is disabled (see the store below)
+        //float bornRadius = cSim.pBornRadii[pos]; float obcChain = cSim.pObcChain[pos];
+
+        unsigned int i   = outputBuffers;
+        // consume the buffers four at a time, then a remaining pair, then a last single buffer
+        while (i >= 4)
+        {
+            totalField += pFt1[0] + pFt1[fieldComponents] + pFt1[2*fieldComponents] + pFt1[3*fieldComponents];
+            totalField += pFt2[0] + pFt2[fieldComponents] + pFt2[2*fieldComponents] + pFt2[3*fieldComponents];
+            pFt1       += fieldComponents*4;
+            pFt2       += fieldComponents*4;
+            i          -= 4;
+        }
+
+        if (i >= 2)
+        {
+            totalField += pFt1[0] + pFt1[fieldComponents];
+            totalField += pFt2[0] + pFt2[fieldComponents];
+            pFt1       += fieldComponents*2;
+            pFt2       += fieldComponents*2;
+            i          -= 2;
+        }
+
+        if (i > 0)
+        {
+            totalField += pFt1[0];
+            totalField += pFt2[0];
+        }
+
+        //fieldOut[pos]   = totalField*bornRadius*bornRadius*obcChain;
+        fieldOut[pos]   = totalField;
+        pos            += gridDim.x * blockDim.x;
+    }
+}
+// Sums the per-buffer partial values of fieldIn1/fieldIn2 per component, adds a surface-area term, and accumulates that term into cSim.pEnergy.
+__global__
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
+#elif (__CUDA_ARCH__ >= 120)
+__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
+#else
+__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
+#endif
+void kReduceToBornForcePrefactorAndSASA_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn1, float* fieldIn2,
+                                                float* fieldOut )
 {
+    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
+
+    float energy     = 0.0f;
+
+    // Reduce field: each thread handles components pos, pos + gridDim.x*blockDim.x, ...
+
+    while (pos < fieldComponents)
+    {
+
+        float totalForce = 0.0f;
+        // pFt1/pFt2 walk the buffers for this component; consecutive buffers are fieldComponents floats apart
+        float* pFt1      = fieldIn1 + pos;
+        float* pFt2      = fieldIn2 + pos;
+
+        float bornRadius = cSim.pBornRadii[pos];
+        //float obcChain   = cSim.pObcChain[pos];
+        float2 obcData   = cSim.pObcData[pos];
+
+        unsigned int i   = outputBuffers;
+        // consume the buffers four at a time, then a remaining pair, then a last single buffer
+        while (i >= 4)
+        {
+            totalForce += pFt1[0] + pFt1[fieldComponents] + pFt1[2*fieldComponents] + pFt1[3*fieldComponents];
+            totalForce += pFt2[0] + pFt2[fieldComponents] + pFt2[2*fieldComponents] + pFt2[3*fieldComponents];
+            pFt1       += fieldComponents*4;
+            pFt2       += fieldComponents*4;
+            i          -= 4;
+        }
+
+        if (i >= 2)
+        {
+            totalForce += pFt1[0] + pFt1[fieldComponents];
+            totalForce += pFt2[0] + pFt2[fieldComponents];
+            pFt1       += fieldComponents*2;
+            pFt2       += fieldComponents*2;
+            i          -= 2;
+        }
+
+        if (i > 0)
+        {
+            totalForce += pFt1[0];
+            totalForce += pFt2[0];
+        }
 
-    kReduceAndCombineFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
-                                     gpu->sim.paddedNumberOfAtoms, gpu->sim.outputBuffers,
-                                     amoebaGpu->psWorkArray_1_1->_pDevData,
-                                     amoebaGpu->psWorkArray_1_2->_pDevData,
-                                     amoebaGpu->psBorn->_pDevData );
-    LAUNCHERROR("kReduce_dBorn");
-} */
+        float r        = (obcData.x + cSim.dielectricOffset + cSim.probeRadius);
+        float ratio6   = ( (obcData.x + cSim.dielectricOffset) / bornRadius);
+              ratio6   = ratio6*ratio6*ratio6;
+              ratio6   = ratio6*ratio6;
+        float saTerm   = cSim.surfaceAreaFactor * r * r * ratio6;
+        // saTerm = surfaceAreaFactor * r^2 * ((obcData.x + dielectricOffset)/bornRadius)^6; the OBC-chain scaling below is disabled
+        totalForce    += saTerm / bornRadius;
+        //totalForce    *= bornRadius * bornRadius * obcChain;
+
+        fieldOut[pos]  = totalForce;
+
+        energy        += saTerm;
+        pos           += gridDim.x * blockDim.x;
+    }
+    // each thread adds its accumulated surface-area terms, divided by -6, to its own pEnergy slot
+    cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += energy / -6.0f;
+}

 static void kReduceToBornForcePrefactor( amoebaGpuContext amoebaGpu )
 {
@@ -1982,7 +2102,8 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )

    // Tinker's Born1 && E-diff

-    kCalculateObcGbsaForces2( amoebaGpu->gpuContext );
+    //kCalculateObcGbsaForces2( amoebaGpu->gpuContext );
+    kCalculateGrycukGbsaForces2( amoebaGpu );
    kCalculateAmoebaKirkwoodEDiff( amoebaGpu );

   // ---------------------------------------------------------------------------------------

--- a/plugins/amoeba/platforms/reference/src/AmoebaReferenceKernels.cpp
+++ b/plugins/amoeba/platforms/reference/src/AmoebaReferenceKernels.cpp
@@ -36,6 +36,7 @@
 #include "AmoebaReferenceMultipoleForce.h"
 #include "AmoebaReferenceVdwForce.h"
 #include "AmoebaReferenceWcaDispersionForce.h"
+#include "openmm/internal/AmoebaTorsionTorsionForceImpl.h"
 #include "openmm/internal/AmoebaWcaDispersionForceImpl.h"
 #include "AmoebaReferenceUreyBradleyForce.h"
 #include "ReferencePlatform.h"
@@ -412,16 +413,31 @@ void ReferenceCalcAmoebaTorsionTorsionForceKernel::initialize(const System& syst
    for (int ii = 0; ii < numTorsionTorsionGrids; ii++) {

        const TorsionTorsionGrid grid = force.getTorsionTorsionGrid( ii );
-
        torsionTorsionGrids[ii].resize( grid.size() );
+
+        // check if grid needs to be reordered: x-angle should be 'slow' index
+
+        TorsionTorsionGrid reorderedGrid;
+        int reorder = 0; 
+        if( grid[0][0][0] != grid[0][1][0] ){
+            AmoebaTorsionTorsionForceImpl::reorderGrid( grid, reorderedGrid );
+            reorder = 1; 
+        }    
+
        for (unsigned int kk = 0; kk < grid.size(); kk++) {

            torsionTorsionGrids[ii][kk].resize( grid[kk].size() );
            for (unsigned int jj = 0; jj < grid[kk].size(); jj++) {

                torsionTorsionGrids[ii][kk][jj].resize( grid[kk][jj].size() );
-                for (unsigned int ll = 0; ll < grid[ll][jj].size(); ll++) {
-                    torsionTorsionGrids[ii][kk][jj][ll] = static_cast<RealOpenMM>(grid[kk][jj][ll]);
+                // Copy the values stored at grid point (kk,jj): from the reordered grid when the
+                // input was not x-slow, otherwise from the grid as supplied (assumes reorderGrid
+                // keeps the grid dimensions -- TODO confirm). The bound is the size the destination
+                // was just resized to; the old bound grid[ll][jj] indexed the outer dimension with ll.
+                const TorsionTorsionGrid& sourceGrid = reorder ? reorderedGrid : grid;
+                const unsigned int numberOfValues    = static_cast<unsigned int>(grid[kk][jj].size());
+                for (unsigned int ll = 0; ll < numberOfValues; ll++) {
+                    torsionTorsionGrids[ii][kk][jj][ll] = static_cast<RealOpenMM>(sourceGrid[kk][jj][ll]);
+                }
            }
        }

--- a/plugins/amoeba/platforms/reference/src/AmoebaReferenceKernels.h
+++ b/plugins/amoeba/platforms/reference/src/AmoebaReferenceKernels.h
@@ -27,8 +27,8 @@
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
 * -------------------------------------------------------------------------- */

-#include "openmm/amoebaKernels.h"
 #include "openmm/System.h"
+#include "openmm/amoebaKernels.h"
 #include "SimTKUtilities/SimTKOpenMMRealType.h"

 namespace OpenMM {