Replaced calculation of Born radii using OBC algorithm w/ Grycuk algorithm

TorsionTorsion grids reordered, if needed, so that first angle is the 'slow' index Fixes for MonteCarloBarostat

Replaced calculation of Born radii using OBC algorithm w/ Grycuk algorithm
TorsionTorsion grids reordered, if needed, so that first angle is the 'slow' index Fixes for MonteCarloBarostat
25058a77 · Mark Friedrichs · e772769e · 25058a77 · 25058a77 · 25058a77
Commit 25058a77 authored Sep 30, 2011 by Mark Friedrichs
16 changed files
--- a/plugins/amoeba/CMakeLists.txt
+++ b/plugins/amoeba/CMakeLists.txt
@@ -18,6 +18,7 @@

 # ----------------------------------------------------------------------------
 #SET(CREATE_SERIALIZABLE_OPENMM_AMOEBA OFF CACHE BOOL "Build verison of OpenMMAmoeba w/ backdoor serialization capability")
+#SET(CREATE_SERIALIZABLE_OPENMM_AMOEBA TRUE )
 SET(CREATE_SERIALIZABLE_OPENMM_AMOEBA FALSE )
 # ----------------------------------------------------------------------------


--- a/plugins/amoeba/openmmapi/include/openmm/AmoebaTorsionTorsionForce.h
+++ b/plugins/amoeba/openmmapi/include/openmm/AmoebaTorsionTorsionForce.h
@@ -41,6 +41,7 @@
 namespace OpenMM {

 typedef std::vector< std::vector< std::vector<double> > > TorsionTorsionGrid;
+typedef std::vector< std::vector< std::vector<float> > > TorsionTorsionGridFloat;

 /**
 * This class implements the Amoeba torsion-torsion interaction
@@ -198,6 +199,13 @@ public:
                }
            }
        }   
+/*
+        for( unsigned int kk = 0; kk < grid.size(); kk++ ){
+            for( unsigned int jj = 0; jj < grid[kk].size(); jj++ ){
+                fprintf( stderr, "xGrid %4d %4d %12.3f %12.3f  %15.7e  %15.7e\n", kk, jj,  grid[kk][jj][0], grid[kk][jj][1], grid[kk][jj][2], grid[kk][jj][2]/4.184 );
+            }
+        }
+*/

        _startValues[0] =  _grid[0][0][0];
        _startValues[1] =  _grid[0][0][1];

--- a/plugins/amoeba/openmmapi/include/openmm/internal/AmoebaTorsionTorsionForceImpl.h
+++ b/plugins/amoeba/openmmapi/include/openmm/internal/AmoebaTorsionTorsionForceImpl.h
@@ -60,6 +60,8 @@ public:
        return std::map<std::string, double>(); // This force field doesn't define any parameters.
    }
    std::vector<std::string> getKernelNames();
+
+    static void reorderGrid( const TorsionTorsionGrid& grid, TorsionTorsionGrid& reorderedGrid );
 private:
    AmoebaTorsionTorsionForce& owner;
    Kernel kernel;

--- a/plugins/amoeba/openmmapi/src/AmoebaTorsionTorsionForceImpl.cpp
+++ b/plugins/amoeba/openmmapi/src/AmoebaTorsionTorsionForceImpl.cpp
@@ -54,6 +54,121 @@ double AmoebaTorsionTorsionForceImpl::calcForcesAndEnergy(ContextImpl& context,
    return dynamic_cast<CalcAmoebaTorsionTorsionForceKernel&>(kernel.getImpl()).execute(context, includeForces, includeEnergy);
 }

+struct IntPair {
+    unsigned int index1;
+    unsigned int index2;
+};
+
+typedef std::map< double, struct IntPair > Map_Double_IntPair;
+typedef Map_Double_IntPair::iterator Map_Double_IntPairI;
+typedef Map_Double_IntPair::const_iterator Map_Double_IntPairCI;
+
+typedef std::map< double, Map_Double_IntPair > Map_Double_MapDoubleIntPair;
+typedef Map_Double_MapDoubleIntPair::iterator Map_Double_MapDoubleIntPairI;
+typedef Map_Double_MapDoubleIntPair::const_iterator Map_Double_MapDoubleIntPairCI;
+
+void AmoebaTorsionTorsionForceImpl::reorderGrid( const TorsionTorsionGrid& grid, TorsionTorsionGrid& reorderedGrid ){
+
+    reorderedGrid.resize( grid.size() );
+    std::vector<Map_Double_IntPair> map_Double_IntPair_Vector( grid.size() );
+    Map_Double_MapDoubleIntPair mapAngles;
+    //(void) fprintf( stderr, "AmoebaTorsionTorsionForceImpl::reorder grid\n" );
+
+    // (1) set dimensions for reorderd grid
+    // (2) build map:
+    //         map[angleX][angleY] = <ii, jj> indices
+    //         assume map keys are sorted from least to greatest
+
+    for (unsigned int ii = 0; ii < grid.size(); ii++) {
+    
+        reorderedGrid[ii].resize( grid[ii].size() );
+        for (unsigned int jj = 0; jj < grid[ii].size(); jj++) {
+            reorderedGrid[ii][jj].resize( grid[ii][jj].size() );
+
+            double angleX =  grid[ii][jj][0]; 
+            double angleY =  grid[ii][jj][1]; 
+
+            if( mapAngles.find( angleX ) == mapAngles.end() ){
+                if( map_Double_IntPair_Vector[ii].size() > 0 ){
+                    char buffer[1024];
+                    (void) sprintf( buffer, "TorsionTorsion grid reorder: x-angle not set correctly: x=%15.7e y=%15.7e size=%u should be zero; ii/jj indies=%u %u.\n",
+                                    angleX, angleY, static_cast<unsigned int>(map_Double_IntPair_Vector[ii].size()), ii, jj );
+                    throw OpenMMException(buffer);
+                 }
+                 mapAngles[angleX] = map_Double_IntPair_Vector[ii];
+            }
+
+            Map_Double_IntPair& map_Double_IntPair  = mapAngles[angleX];
+            if( map_Double_IntPair.find( angleY ) != map_Double_IntPair.end() ){
+                char buffer[1024];
+                (void) sprintf( buffer, "TorsionTorsion grid reorder: angle pair found twice: %15.7e %15.7e %u\n", angleX, angleY, static_cast<unsigned int>(map_Double_IntPair.size()) );
+                throw OpenMMException(buffer);
+            }
+            struct IntPair pair; 
+            pair.index1 = ii;
+            pair.index2 = jj;
+            map_Double_IntPair[angleY] = pair; 
+        }
+    }
+
+#if 0
+     (void) fprintf( stderr, "TorsionTorsion grid reorder map\n" );
+     for( Map_Double_MapDoubleIntPairCI ii = mapAngles.begin(); ii != mapAngles.end(); ii++ ){
+         double angleX                          = ii->first;
+         Map_Double_IntPair map_Double_IntPair  = ii->second;
+         (void) fprintf( stderr, "   %15.7e %u \n", angleX, static_cast<unsigned int>(map_Double_IntPair.size()) );
+      }
+      for( Map_Double_MapDoubleIntPairCI ii = mapAngles.begin(); ii != mapAngles.end(); ii++ ){
+          double angleX                          = ii->first;
+          Map_Double_IntPair map_Double_IntPair  = ii->second;
+          for( Map_Double_IntPairCI jj = map_Double_IntPair.begin(); jj != map_Double_IntPair.end(); jj++ ){
+              double angle         = jj->first;
+              struct IntPair pair  = jj->second;
+              (void) fprintf( stderr, "   %15.7e %15.7e %d %d\n", angleX, angle, pair.index1, pair.index2 );
+          }
+            }
+#endif
+
+    // load reordered entries
+
+    Map_Double_MapDoubleIntPairCI mapII    = mapAngles.begin();
+    Map_Double_IntPair map_Double_IntPair  = mapII->second;
+    Map_Double_IntPairCI mapJJ             = map_Double_IntPair.begin();
+
+    for (unsigned int ii = 0; ii < grid.size(); ii++) {
+        for (unsigned int jj = 0; jj < grid[ii].size(); jj++) {
+
+            struct IntPair pair  = mapJJ->second;
+            int index1           = pair.index1;
+            int index2           = pair.index2;
+            //(void) fprintf( stderr, "   %3d %3d %15.7e %15.7e %3d %3d zzz\n", ii, jj, mapII->first, mapJJ->first, index1, index2 );
+
+            for (unsigned int kk = 0; kk < grid[ii][jj].size(); kk++) {
+                reorderedGrid[ii][jj][kk] = static_cast<float>(grid[index1][index2][kk]);
+            }
+
+            // increment map iterators
+
+            mapJJ++;
+            if( mapJJ == map_Double_IntPair.end() ){
+                mapII++;
+                if( mapII == mapAngles.end() ){
+                    if( (jj != (grid[ii].size()-1)) && (ii != (grid.size()-1)) ){
+                        char buffer[1024];
+                        (void) sprintf( buffer, "AmoebaTorsionTorsionForceImpl::reorderGrid: error detected with map iterators.\n" );
+                        throw OpenMMException(buffer);
+                    }
+                } else {
+                    map_Double_IntPair  = mapII->second;
+                    mapJJ               = map_Double_IntPair.begin();
+                }
+            }
+        }
+    }
+
+    return;
+}
+
 std::vector<std::string> AmoebaTorsionTorsionForceImpl::getKernelNames() {
    std::vector<std::string> names;
    names.push_back(CalcAmoebaTorsionTorsionForceKernel::Name());

--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaData.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaData.cpp
@@ -37,6 +37,7 @@ AmoebaCudaData::AmoebaCudaData( CudaPlatform::PlatformData& data ) : cudaPlatfor
    hasAmoebaBonds                = false;
    hasAmoebaGeneralizedKirkwood  = false;
    hasAmoebaMultipole            = false;
+    useGrycuk                     = true;
    amoebaGpu                     = amoebaGpuInit( cudaPlatformData.gpu );
    localForceKernel              = NULL;
    log                           = NULL;
@@ -45,6 +46,10 @@ AmoebaCudaData::AmoebaCudaData( CudaPlatform::PlatformData& data ) : cudaPlatfor
    applyMultipoleCutoff          = 0;
    useVdwNeighborList            = 0;
    multipoleForceCount           = 0;
+
+    boxDimensions[0]              = 0.0;
+    boxDimensions[1]              = 0.0;
+    boxDimensions[2]              = 0.0;
 }   

 AmoebaCudaData::~AmoebaCudaData() {
@@ -74,6 +79,10 @@ bool AmoebaCudaData::getHasAmoebaMultipole( void ) const {
    return hasAmoebaMultipole;
 }

+int AmoebaCudaData::getUseGrycuk( void ) const {
+    return useGrycuk;
+}
+
 void AmoebaCudaData::setHasAmoebaGeneralizedKirkwood( bool inputHasAmoebaGeneralizedKirkwood ) {
    hasAmoebaGeneralizedKirkwood = inputHasAmoebaGeneralizedKirkwood;
 }
@@ -108,6 +117,7 @@ void AmoebaCudaData::setContextImpl( void* inputContextImpl ) {
 }

 void AmoebaCudaData::initializeGpu( void ) {
+
    if( !gpuInitialized ){
        if( getHasAmoebaGeneralizedKirkwood() && !getHasAmoebaMultipole() ){
            throw OpenMMException("GK force requires Multipole force\n");
@@ -117,16 +127,33 @@ void AmoebaCudaData::initializeGpu( void ) {
        amoebaGpuBuildThreadBlockWorkList( amoebaGpu );
        amoebaGpuBuildVdwExclusionList( amoebaGpu );
        amoebaGpuBuildScalingList( amoebaGpu );
-        amoebaGpuSetConstants( amoebaGpu );
+        amoebaGpuSetConstants( amoebaGpu, 0 );

-        gpuInitialized = true;
+        boxDimensions[0] = amoebaGpu->gpuContext->sim.periodicBoxSizeX;
+        boxDimensions[1] = amoebaGpu->gpuContext->sim.periodicBoxSizeY;
+        boxDimensions[2] = amoebaGpu->gpuContext->sim.periodicBoxSizeZ;
+
+        gpuInitialized   = true;

        if( log ){
            gpuPrintCudaAmoebaGmxSimulation( amoebaGpu, getLog() );
            (void) fprintf( log, "Gpu initialized\n" );
            (void) fflush( log );
        }
+    } else {
+
+        if( boxDimensions[0] != amoebaGpu->gpuContext->sim.periodicBoxSizeX ||
+            boxDimensions[1] != amoebaGpu->gpuContext->sim.periodicBoxSizeY ||
+            boxDimensions[2] != amoebaGpu->gpuContext->sim.periodicBoxSizeZ ){
+            amoebaGpuSetConstants( amoebaGpu, 1 );
+            
+            boxDimensions[0] = amoebaGpu->gpuContext->sim.periodicBoxSizeX;
+            boxDimensions[1] = amoebaGpu->gpuContext->sim.periodicBoxSizeY;
+            boxDimensions[2] = amoebaGpu->gpuContext->sim.periodicBoxSizeZ;
+        }
+
    }
+
    return;
 }


--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaData.h
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaData.h
@@ -76,6 +76,13 @@ public:
     */
    bool getHasAmoebaMultipole( void ) const;

+    /**
+     * Get use Grycuk flag
+     * 
+     * @return value of useGrycuk
+     */
+    int getUseGrycuk( void ) const;
+
    /**
     * Set value of hasAmoebaGeneralizedKirkwood
     * 
@@ -190,11 +197,13 @@ private:
    int multipoleForceCount;
    int applyMultipoleCutoff;
    int useVdwNeighborList;
+    int useGrycuk;
    KernelImpl* localForceKernel;
    unsigned int kernelCount;
    void* contextImpl;
    FILE* log;
    bool gpuInitialized;
+    double boxDimensions[3];
 };



--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
@@ -31,6 +31,7 @@
 #include "kernels/amoebaCudaKernels.h"
 #include "openmm/internal/AmoebaMultipoleForceImpl.h"
 #include "openmm/internal/AmoebaWcaDispersionForceImpl.h"
+#include "openmm/internal/AmoebaTorsionTorsionForceImpl.h"
 #include "openmm/internal/NonbondedForceImpl.h"
 #include "CudaForceInfo.h"

@@ -738,22 +739,39 @@ void CudaCalcAmoebaTorsionTorsionForceKernel::initialize(const System& system, c
    // torsion-torsion grids

    numTorsionTorsionGrids = force.getNumTorsionTorsionGrids();
-    std::vector< std::vector< std::vector< std::vector<float> > > > floatGrids;
+    std::vector<TorsionTorsionGridFloat> floatGrids;

    floatGrids.resize(numTorsionTorsionGrids);
    for (int gridIndex = 0; gridIndex < numTorsionTorsionGrids; gridIndex++) {

        const TorsionTorsionGrid& grid = force.getTorsionTorsionGrid( gridIndex );
-
        floatGrids[gridIndex].resize( grid.size() );
+
+        // check if grid needs to be reordered: x-angle should be 'slow' index
+
+        TorsionTorsionGrid reorderedGrid;
+        int reorder = 0;
+        if( grid[0][0][0] != grid[0][1][0] ){
+            AmoebaTorsionTorsionForceImpl::reorderGrid( grid, reorderedGrid );
+            reorder = 1;
+            if( data.getLog() ){
+                (void) fprintf( data.getLog(), "CudaCalcAmoebaTorsionTorsionForceKernel::initialize: reordering torsion-torsion grid %d.\n", gridIndex );
+            }
+        }
        for (unsigned int ii = 0; ii < grid.size(); ii++) {

            floatGrids[gridIndex][ii].resize( grid[ii].size() );
            for (unsigned int jj = 0; jj < grid[ii].size(); jj++) {

                floatGrids[gridIndex][ii][jj].resize( grid[ii][jj].size() );
-                for (unsigned int kk = 0; kk < grid[ii][jj].size(); kk++) {
-                    floatGrids[gridIndex][ii][jj][kk] = static_cast<float>(grid[ii][jj][kk]);
+                if( reorder ){
+                    for( unsigned int kk = 0; kk < grid[ii][jj].size(); kk++) {
+                        floatGrids[gridIndex][ii][jj][kk] = static_cast<float>(reorderedGrid[ii][jj][kk]);
+                    }
+                } else {
+                    for( unsigned int kk = 0; kk < grid[ii][jj].size(); kk++) {
+                        floatGrids[gridIndex][ii][jj][kk] = static_cast<float>(grid[ii][jj][kk]);
+                    }
                }
            }
        }
@@ -788,12 +806,17 @@ static void computeAmoebaMultipoleForce( AmoebaCudaData& data ) {

    data.initializeGpu();

-    // calculate Born radii
+    // calculate Born radii using either the Grycuk or OBC algorithm if GK is active

    if( data.getHasAmoebaGeneralizedKirkwood() ){
        kClearBornSum( gpu->gpuContext );
-        kCalculateObcGbsaBornSum(gpu->gpuContext);
-        kReduceObcGbsaBornSum(gpu->gpuContext);
+        if( data.getUseGrycuk() ){
+            kCalculateAmoebaGrycukBornRadii( gpu );
+            kReduceGrycukGbsaBornSum( gpu );
+        } else {
+            kCalculateObcGbsaBornSum(gpu->gpuContext);
+            kReduceObcGbsaBornSum(gpu->gpuContext);
+       }
    }   

    // multipoles
@@ -990,23 +1013,27 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
            zsize = pmeGridDimension[2];
            pmeParametersSetBasedOnEwaldErrorTolerance = 0;
        }
+
+        gpuSetAmoebaPMEParameters(data.getAmoebaGpu(), (float) alpha, xsize, ysize, zsize);
+
        if( data.getLog() ){
-            (void) fprintf( data.getLog(), "AmoebaMultipoleForce: PME parameters tol=%12.3e cutoff=%12.3f alpha=%12.3f [%d %d %d] -",
+            (void) fprintf( data.getLog(), "AmoebaMultipoleForce: PME parameters tol=%12.3e cutoff=%12.3f alpha=%12.3f [%d %d %d]\n",
                            force.getEwaldErrorTolerance(), force.getCutoffDistance(),  alpha, xsize, ysize, zsize );
            if( pmeParametersSetBasedOnEwaldErrorTolerance  ){
-                 (void) fprintf( data.getLog(), " parameters set based on error tolerance and OpenMM algorithm.\n" );
+                 (void) fprintf( data.getLog(), "Parameters based on error tolerance and OpenMM algorithm.\n" );
            } else {
                 double alphaT;
                 int xsizeT, ysizeT, zsizeT;
                 NonbondedForceImpl::calcPMEParameters(system, nb, alphaT, xsizeT, ysizeT, zsizeT);
                 double impliedTolerance  = alpha*force.getCutoffDistance();
                        impliedTolerance  = 0.5*exp( -(impliedTolerance*impliedTolerance) );
-                 (void) fprintf( data.getLog(), " using input parameters implied tolerance=%12.3e;", impliedTolerance );
+                 (void) fprintf( data.getLog(), "Using input parameters implied tolerance=%12.3e;", impliedTolerance );
                 (void) fprintf( data.getLog(), "OpenMM param: aEwald=%12.3f [%6d %6d %6d]\n", alphaT, xsizeT, ysizeT, zsizeT);
            }
+            (void) fprintf( data.getLog(), "\n" );
            (void) fflush( data.getLog() );
        }
-        gpuSetAmoebaPMEParameters(data.getAmoebaGpu(), (float) alpha, xsize, ysize, zsize);
+
        data.setApplyMultipoleCutoff( 1 );

        data.cudaPlatformData.nonbondedMethod = PARTICLE_MESH_EWALD;
@@ -1067,12 +1094,25 @@ void CudaCalcAmoebaGeneralizedKirkwoodForceKernel::initialize(const System& syst
        scale[ii]   = static_cast<float>( scalingFactor );
        charge[ii]  = static_cast<float>( particleCharge );
    }   
-    gpuSetAmoebaObcParameters( data.getAmoebaGpu(), static_cast<float>(force.getSoluteDielectric() ), 
-                               static_cast<float>( force.getSolventDielectric() ), 
-                               static_cast<float>( force.getDielectricOffset() ), radius, scale, charge,
-                               force.getIncludeCavityTerm(),
-                               static_cast<float>( force.getProbeRadius() ), 
-                               static_cast<float>( force.getSurfaceAreaFactor() ) ); 
+    if( data.getUseGrycuk() ){
+
+        gpuSetAmoebaGrycukParameters( data.getAmoebaGpu(), static_cast<float>(force.getSoluteDielectric() ), 
+                                      static_cast<float>( force.getSolventDielectric() ), 
+                                      static_cast<float>( force.getDielectricOffset() ), radius, scale, charge,
+                                      force.getIncludeCavityTerm(),
+                                      static_cast<float>( force.getProbeRadius() ), 
+                                      static_cast<float>( force.getSurfaceAreaFactor() ) ); 
+        
+    } else {
+
+        gpuSetAmoebaObcParameters( data.getAmoebaGpu(), static_cast<float>(force.getSoluteDielectric() ), 
+                                   static_cast<float>( force.getSolventDielectric() ), 
+                                   static_cast<float>( force.getDielectricOffset() ), radius, scale, charge,
+                                   force.getIncludeCavityTerm(),
+                                   static_cast<float>( force.getProbeRadius() ), 
+                                   static_cast<float>( force.getSurfaceAreaFactor() ) ); 
+
+    }
    data.getAmoebaGpu()->gpuContext->forces.push_back(new ForceInfo(force));
 }


--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaGpu.cpp
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaGpu.cpp
--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
@@ -124,9 +124,15 @@ extern void GetCalculateAmoebaKirkwoodEDiffSim( amoebaGpuContext amoebaGpu );
 //extern void cudaComputeAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu );
 extern void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu );

-extern void SetCalculateAmoebaObcGbsaBornSumSim( gpuContext gpu );
-extern void GetCalculateAmoebaObcGbsaBornSumSim( gpuContext gpu );
-extern void cudaComputeAmoebaBornRadii( amoebaGpuContext amoebaGpu );
+//extern void SetCalculateAmoebaObcGbsaBornSumSim( gpuContext gpu );
+//extern void GetCalculateAmoebaObcGbsaBornSumSim( gpuContext gpu );
+//extern void cudaComputeAmoebaBornRadii( amoebaGpuContext amoebaGpu );
+extern void kCalculateAmoebaGrycukBornRadii( amoebaGpuContext amoebaGpu );
+extern void kReduceGrycukGbsaBornSum( amoebaGpuContext gpu );
+extern void SetCalculateAmoebaGrycukSim(amoebaGpuContext amoebaGpu );
+extern void GetCalculateAmoebaGrycukSim(amoebaGpuContext amoebaGpu );
+extern void kCalculateGrycukGbsaForces2( amoebaGpuContext amoebaGpu );
+

 // OBC -- Part 1
 //extern void SetCalculateObcGbsaForces1Sim(gpuContext gpu);

--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
@@ -288,6 +288,11 @@ void gpuSetAmoebaObcParameters( amoebaGpuContext amoebaGpu , float innerDielectr
                                const std::vector<float>& radius, const std::vector<float>& scale, const std::vector<float>& charge,
                                int includeCavityTerm, float probeRadius, float surfaceAreaFactor);

+extern "C"
+void gpuSetAmoebaGrycukParameters( amoebaGpuContext amoebaGpu , float innerDielectric, float solventDielectric, float dielectricOffset,
+                                   const std::vector<float>& radius, const std::vector<float>& scale, const std::vector<float>& charge,
+                                   int includeCavityTerm, float probeRadius, float surfaceAreaFactor);
+
 extern "C"
 void gpuSetAmoebaVdwParameters( amoebaGpuContext amoebaGpu,
                                const std::vector<int>& indexIVs, 
@@ -313,7 +318,7 @@ void gpuSetAmoebaWcaDispersionParameters( amoebaGpuContext amoebaGpu,
                                const float awater, const float shctd, const float dispoff );
 
 extern "C"
-void amoebaGpuSetConstants(amoebaGpuContext gpu);
+void amoebaGpuSetConstants(amoebaGpuContext gpu, int updateFlag );

 extern "C"
 void gpuSetAmoebaBondOffsets(amoebaGpuContext gpu);

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaGrycuk.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaGrycuk.cu
+//----------------------------------------------------------------------------------------- 
+//----------------------------------------------------------------------------------------- 
+
+#include "cudaKernels.h"
+#include "amoebaCudaKernels.h"
+
+//#define AMOEBA_DEBUG
+#undef AMOEBA_DEBUG
+
+static __constant__ cudaGmxSimulation cSim;
+static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;
+
+void SetCalculateAmoebaGrycukSim(amoebaGpuContext amoebaGpu)
+{
+    cudaError_t status;
+    gpuContext gpu = amoebaGpu->gpuContext;
+    status         = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
+    RTERROR(status, "SetCalculateAmoebaGrycukSim: cudaMemcpyToSymbol: SetSim copy to cSim failed");
+    status         = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation));
+    RTERROR(status, "SetCalculateAmoebaGrycukSim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed");
+}
+
+void GetCalculateAmoebaGrycukSim(amoebaGpuContext amoebaGpu)
+{
+    cudaError_t status;
+    gpuContext gpu = amoebaGpu->gpuContext;
+    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
+    RTERROR(status, "GetCalculateAmoebaGrycukSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed");
+    status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));
+    RTERROR(status, "GetCalculateAmoebaGrycukSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
+}
+
+struct GrycukParticle {
+
+    float x;
+    float y;
+    float z;
+
+    float radius;
+    float scaledRadius;
+
+    float bornSum;
+};
+
+__device__ void loadGrycukShared( struct GrycukParticle* sA, unsigned int atomI )
+{
+    // coordinates, radii and scaled radii
+
+    sA->x                        = cSim.pPosq[atomI].x;
+    sA->y                        = cSim.pPosq[atomI].y;
+    sA->z                        = cSim.pPosq[atomI].z;
+
+    sA->radius                   = cSim.pObcData[atomI].x;
+    sA->scaledRadius             = cSim.pObcData[atomI].y;
+
+}
+
+__device__ void calculateGrycukBornRadiiPairIxn_kernel( GrycukParticle& atomI, GrycukParticle& atomJ, float*  bornSum ){
+
+    /*
+     * radius:       radius (TINKER rsolv)
+     * scaledRadius: radius*overlap scale factor (TINKER rsolv*shct)
+     *
+     */
+
+    float xr,yr,zr;
+    float r,r2;
+    
+    float sk, sk2;
+    float lik, uik;
+    float lik3, uik3;
+    float l2, l4, lr, l4r;
+    float u2, u4, ur, u4r;
+    float term;
+
+    // decide whether to compute the current interaction;
+
+    *bornSum = 0.0f;
+    if( atomI.radius <= 0.0f ){
+        return;
+    }
+
+    xr           = atomJ.x - atomI.x;
+    yr           = atomJ.y - atomI.y;
+    zr           = atomJ.z - atomI.z;
+
+    r2           = xr*xr + yr*yr + zr*zr;
+    r            = sqrt(r2);
+
+    sk           = atomJ.scaledRadius;
+    sk2          = sk*sk;
+
+    if( (atomI.radius + r) < sk ){
+        lik       = atomI.radius;
+        uik       = sk - r; 
+        lik3      = lik*lik*lik;
+        uik3      = uik*uik*uik;
+        *bornSum -= (1.0f/uik3 - 1.0f/lik3);
+    }
+
+    uik = r + sk;
+    if( (atomI.radius + r) < sk ){
+        lik = sk - r; 
+    } else if( r < (atomI.radius + sk) ){
+        lik = atomI.radius;
+    } else {
+        lik = r - sk;
+    }
+
+    l2          = lik*lik; 
+    l4          = l2*l2;
+    lr          = lik*r;
+    l4r         = l4*r; 
+
+    u2          = uik*uik;
+    u4          = u2*u2;
+    ur          = uik*r; 
+    u4r         = u4*r;
+
+   term         = (3.0f*(r2-sk2)+6.0f*u2-8.0f*ur)/u4r - (3.0f*(r2-sk2)+6.0f*l2-8.0f*lr)/l4r;
+  *bornSum     += term/16.0f;
+
+}
+
+__device__ void zeroGrycukParticleSharedField( struct GrycukParticle* sA )
+{
+    sA->bornSum               = 0.0f;
+}
+
+__global__ 
+__launch_bounds__(384, 1)
+void kReduceGrycukGbsaBornSum_kernel()
+{
+    unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
+    
+    while (pos < cSim.atoms)
+    {   
+        float sum   = 0.0f;
+        float* pSt  = cSim.pBornSum + pos;
+    
+        // Get summed Born data
+        for (int i = 0; i < cSim.nonbondOutputBuffers; i++)
+        {   
+            sum += *pSt;
+            pSt += cSim.stride;
+        }   
+    
+        // Now calculate Born radius
+
+        float radius              = cSim.pObcData[pos].x;
+        radius                    = 1.0f/(radius*radius*radius);
+        sum                       = radius - sum;
+        sum                       = sum <= 0.0f ? 1000.0f : pow( sum, -1.0f/3.0f );
+        cSim.pBornRadii[pos]      = sum;
+
+        pos                      += gridDim.x * blockDim.x;
+    }   
+}
+
+/**---------------------------------------------------------------------------------------
+
+   Reduce Born radii
+
+   @param amoebaGpu        amoebaGpu context
+
+   --------------------------------------------------------------------------------------- */
+
+void kReduceGrycukGbsaBornSum( amoebaGpuContext amoebaGpu )
+{
+
+    kReduceGrycukGbsaBornSum_kernel<<<amoebaGpu->gpuContext->sim.blocks, 384>>>();
+    LAUNCHERROR("kReduceGrycukGbsaBornSum");
+
+    if( 1 ){
+        static int callId                    = 0;
+        gpuContext gpu                       = amoebaGpu->gpuContext;
+        std::vector<int> fileId;
+        fileId.push_back( callId++ );
+        VectorOfDoubleVectors outputVector;
+        cudaLoadCudaFloatArray( gpu->natoms,  1, gpu->psBornRadii, outputVector, gpu->psAtomIndex->_pSysData, 1.0f ); 
+        cudaWriteVectorOfDoubleVectorsToFile( "BornRGry", fileId, outputVector );
+    }   
+}
+
+// Include versions of the kernels for N^2 calculations.
+
+#undef USE_OUTPUT_BUFFER_PER_WARP
+#define METHOD_NAME(a, b) a##N2##b
+#include "kCalculateAmoebaCudaGrycukBornRadii.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##N2ByWarp##b
+#include "kCalculateAmoebaCudaGrycukBornRadii.h"
+
+/**---------------------------------------------------------------------------------------
+
+   Compute Born radii using Grycuk algorithm
+
+   @param amoebaGpu        amoebaGpu context
+
+   --------------------------------------------------------------------------------------- */
+
+
+void kCalculateAmoebaGrycukBornRadii( amoebaGpuContext amoebaGpu )
+{
+
+#ifdef AMOEBA_DEBUG
+    static const char* methodName       = "kCalculateAmoebaGrycukBornRadii";
+    static int timestep = 0;
+    std::vector<int> fileId;
+    timestep++;
+    fileId.resize( 2 );
+    fileId[0] = timestep;
+    fileId[1] = 1;
+#endif
+
+    // ---------------------------------------------------------------------------------------
+
+    gpuContext gpu = amoebaGpu->gpuContext;
+
+    // apparently debug array can take up nontrivial no. registers
+
+#ifdef AMOEBA_DEBUG
+    if( amoebaGpu->log ){
+        (void) fprintf( amoebaGpu->log, "%s %d maxCovalentDegreeSz=%d ZZZ\n",
+                        methodName, gpu->natoms, amoebaGpu->maxCovalentDegreeSz );
+                        amoebaGpu->scalingDistanceCutoff );
+    }
+    int paddedNumberOfAtoms                   = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
+    CUDAStream<float4>* debugArray            = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
+    memset( debugArray->_pSysData,      0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
+    debugArray->Upload();
+    unsigned int targetAtom                   = 0;
+
+    gpu->psBornRadii->Download();
+    if( amoebaGpu->log ){
+        (void) fprintf( amoebaGpu->log, "Grycuk input\n" ); (void) fflush( amoebaGpu->log );
+        for( int ii = 0; ii < amoebaGpu->gpuContext->sim.paddedNumberOfAtoms; ii++ ){
+            (void) fprintf( amoebaGpu->log,"Born %6d %16.9e\n", ii,
+                            gpu->psBornRadii->_pSysData[ii] );
+        }
+    }
+#endif
+
+    // on first pass, set threads/block and based on that setting the energy buffer array
+
+    static unsigned int threadsPerBlock = 0;
+    if( threadsPerBlock == 0 ){
+        unsigned int maxThreads;
+        if (gpu->sm_version >= SM_20)
+            //maxThreads = 384;
+            maxThreads = 512;
+        else if (gpu->sm_version >= SM_12)
+            maxThreads = 128;
+        else
+            maxThreads = 64;
+        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(GrycukParticle), gpu->sharedMemoryPerBlock ), maxThreads);
+
+#ifdef AMOEBA_DEBUG
+        if( amoebaGpu->log ){
+            (void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaGrycuk: blcks=%u tds=%u %u bPrWrp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u\n",
+                            gpu->sim.nonbond_blocks, threadsPerBlock, maxThreads, gpu->bOutputBufferPerWarp,
+                            sizeof(GrycukParticle), sizeof(GrycukParticle)*threadsPerBlock,
+                            (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
+            (void) fflush( amoebaGpu->log );
+        }
+#endif
+
+    }
+
+#ifdef AMOEBA_DEBUG
+    if( amoebaGpu->log ){
+        (void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaGrycukN2Forces%swarp:  numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u\n",
+                        (gpu->bOutputBufferPerWarp ? " " : " no "), gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
+                        sizeof(GrycukParticle), sizeof(GrycukParticle)*threadsPerBlock,
+                        (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
+        (void) fflush( amoebaGpu->log );
+    }
+#endif
+
+    if (gpu->bOutputBufferPerWarp){
+        kCalculateAmoebaGrycukBornRadiiN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(GrycukParticle)*threadsPerBlock>>>( gpu->psWorkUnit->_pDevData);
+    } else {
+       kCalculateAmoebaGrycukBornRadiiN2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(GrycukParticle)*threadsPerBlock>>>( gpu->psWorkUnit->_pDevData);
+    }
+    LAUNCHERROR("kCalculateAmoebaCudaGrycukN2Forces");
+
+   // ---------------------------------------------------------------------------------------
+}
+
+// Born radius chain rule component for Grycuk
+
+struct GrycukChainRuleParticle {
+
+    float x;
+    float y;
+    float z;
+
+    float radius;
+    float scaledRadius;
+
+    float bornRadius;
+    float bornForce;
+
+    float force[3];
+};
+
+__device__ void loadGrycukChainRuleParticleShared( struct GrycukChainRuleParticle* sA, unsigned int atomI )
+{
+    // coordinates, radii and scaled radii
+
+    sA->x                        = cSim.pPosq[atomI].x;
+    sA->y                        = cSim.pPosq[atomI].y;
+    sA->z                        = cSim.pPosq[atomI].z;
+
+    sA->radius                   = cSim.pObcData[atomI].x;
+    sA->scaledRadius             = cSim.pObcData[atomI].y;
+    sA->bornRadius               = cSim.pBornRadii[atomI];
+    sA->bornForce                = cSim.pBornForce[atomI];
+
+}
+
+__device__ void zeroGrycukChainRuleParticleSharedField( struct GrycukChainRuleParticle* sA )
+{
+    // zero force
+
+    sA->force[0]                 = 0.0f;
+    sA->force[1]                 = 0.0f;
+    sA->force[2]                 = 0.0f;
+
+}
+
+//#define AMOEBA_DEBUG
+
+__device__ void calculateGrycukChainRulePairIxn_kernel( GrycukChainRuleParticle& atomI, GrycukChainRuleParticle& atomJ, float force[3] 
+#ifdef AMOEBA_DEBUG
+,  float4 pullDebug[5]
+#endif
+){
+
+    const float pi         = 3.1415926535897f;
+    float third            = 1.0f/3.0f;
+    float pi43             = 4.0f*third*pi;
+
+    float lik, uik;
+    float lik4, uik4;
+    float factor           = -pow(pi,third)*pow(6.0f,(2.0f*third))/9.0f;
+    float term             = pi43/(atomI.bornRadius*atomI.bornRadius*atomI.bornRadius);
+          term             = factor/pow( term, (4.0f*third) );
+
+    float xr               = atomJ.x - atomI.x;
+    float yr               = atomJ.y - atomI.y;
+    float zr               = atomJ.z - atomI.z;
+
+    float sk               = atomJ.scaledRadius;
+    float sk2              = sk*sk;
+    float r2               = xr*xr + yr*yr + zr*zr;
+    float r                = sqrt(r2);
+    float de               = 0.0f;
+
+    if( (atomI.radius + r) < sk ){
+        float uik4;
+        uik        = sk - r;
+        uik4       = uik*uik;
+        uik4       = uik4*uik4;
+        de         = -4.0f*pi/uik4;
+    }
+
+    if( (atomI.radius + r) < sk){
+        lik  = sk - r;
+        lik4 = lik*lik;
+        lik4 = lik4*lik4;
+        de  += 0.25f*pi*(sk2-4.0f*sk*r+17.0f*r2)/ (r2*lik4);
+    } else if( r < (atomI.radius +sk) ){
+        lik  = atomI.radius;
+        lik4 = lik*lik;
+        lik4 = lik4*lik4;
+        de  += 0.25f*pi*(2.0f*atomI.radius*atomI.radius-sk2-r2)/ (r2*lik4);
+    } else {
+        lik  = r - sk;
+        lik4 = lik*lik;
+        lik4 = lik4*lik4;
+        de  += 0.25f*pi*(sk2-4.0f*sk*r+r2)/ (r2*lik4);
+    }
+    uik        = r + sk;
+    uik4       = uik*uik;
+    uik4       = uik4*uik4;
+    
+    de        -= 0.25f*pi*(sk2+4.0f*sk*r+r2)/ (r2*uik4);
+    float dbr  = term * de/r;
+          de   = dbr*atomI.bornForce;
+
+#ifdef AMOEBA_DEBUG
+    pullDebug[0].x = de;
+    pullDebug[0].y = r;
+    pullDebug[0].z = factor;
+    pullDebug[0].w = -4.0f;
+
+    pullDebug[1].x = atomI.bornForce/4.184f;
+    pullDebug[1].y = atomI.bornRadius;
+    pullDebug[1].z = atomJ.bornForce/4.184f;
+    pullDebug[1].w = -5.0f;
+#endif
+
+    force[0]   = xr*de;
+    force[1]   = yr*de;
+    force[2]   = zr*de;
+}
+
+// Include versions of the kernels for N^2 calculations.
+
+#undef METHOD_NAME
+#undef USE_OUTPUT_BUFFER_PER_WARP
+#define METHOD_NAME(a, b) a##N2##b
+#include "kCalculateAmoebaCudaGrycukChainRule.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##N2ByWarp##b
+#include "kCalculateAmoebaCudaGrycukChainRule.h"
+
+/**---------------------------------------------------------------------------------------
+
+   Compute Grycuk chain rule contribution to force
+
+   @param amoebaGpu        amoebaGpu context
+
+   --------------------------------------------------------------------------------------- */
+
+
+void kCalculateGrycukGbsaForces2( amoebaGpuContext amoebaGpu )
+{
+
+#ifdef AMOEBA_DEBUG
+    static const char* methodName       = "kCalculateGrycukGbsaForces2";
+    static int timestep = 0;
+    std::vector<int> fileId;
+    timestep++;
+    fileId.resize( 2 );
+    fileId[0] = timestep;
+    fileId[1] = 1;
+#endif
+
+    // ---------------------------------------------------------------------------------------
+
+    gpuContext gpu = amoebaGpu->gpuContext;
+
+    // apparently debug array can take up nontrivial no. registers
+
+#ifdef AMOEBA_DEBUG
+    if( amoebaGpu->log ){
+        (void) fprintf( amoebaGpu->log, "%s %d maxCovalentDegreeSz=%d ZZZ\n",
+                        methodName, gpu->natoms, amoebaGpu->maxCovalentDegreeSz );
+    }
+    int paddedNumberOfAtoms                   = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
+    CUDAStream<float4>* debugArray            = new CUDAStream<float4>(20*paddedNumberOfAtoms, 1, "DebugArray");
+    memset( debugArray->_pSysData,      0, sizeof( float )*4*20*paddedNumberOfAtoms);
+    debugArray->Upload();
+    unsigned int targetAtom                   = 0;
+
+    gpu->psBornRadii->Download();
+    if( amoebaGpu->log ){
+        (void) fprintf( amoebaGpu->log, "Grycuk input\n" ); (void) fflush( amoebaGpu->log );
+        for( int ii = 0; ii < amoebaGpu->gpuContext->sim.paddedNumberOfAtoms; ii++ ){
+            (void) fprintf( amoebaGpu->log,"Born %6d %16.9e\n", ii,
+                            gpu->psBornRadii->_pSysData[ii] );
+        }
+    }
+#endif
+
+    // on first pass, set threads/block and based on that setting the energy buffer array
+
+    static unsigned int threadsPerBlock = 0;
+    if( threadsPerBlock == 0 ){
+        unsigned int maxThreads;
+        if (gpu->sm_version >= SM_20)
+            //maxThreads = 384;
+            maxThreads = 512;
+        else if (gpu->sm_version >= SM_12)
+            maxThreads = 128;
+        else
+            maxThreads = 64;
+        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(GrycukChainRuleParticle), gpu->sharedMemoryPerBlock ), maxThreads);
+
+#ifdef AMOEBA_DEBUG
+        if( amoebaGpu->log ){
+            (void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaGrycuk: blcks=%u tds=%u %u bPrWrp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u\n",
+                            gpu->sim.nonbond_blocks, threadsPerBlock, maxThreads, gpu->bOutputBufferPerWarp,
+                            sizeof(GrycukChainRuleParticle), sizeof(GrycukChainRuleParticle)*threadsPerBlock,
+                            (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
+            (void) fflush( amoebaGpu->log );
+        }
+#endif
+
+    }
+
+#ifdef AMOEBA_DEBUG
+    if( amoebaGpu->log ){
+        (void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaGrycukN2Forces%swarp:  numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u\n",
+                        (gpu->bOutputBufferPerWarp ? " " : " no "), gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
+                        sizeof(GrycukChainRuleParticle), sizeof(GrycukChainRuleParticle)*threadsPerBlock,
+                        (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
+        (void) fflush( amoebaGpu->log );
+    }
+#endif
+
+//kClearForces( gpu );
+
+    if (gpu->bOutputBufferPerWarp){
+        kCalculateAmoebaGrycukChainRuleN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(GrycukChainRuleParticle)*threadsPerBlock>>>( gpu->psWorkUnit->_pDevData
+#ifdef AMOEBA_DEBUG
+    ,debugArray->_pDevData, targetAtom
+#endif
+);
+    } else {
+        kCalculateAmoebaGrycukChainRuleN2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(GrycukChainRuleParticle)*threadsPerBlock>>>( gpu->psWorkUnit->_pDevData
+#ifdef AMOEBA_DEBUG
+    ,debugArray->_pDevData, targetAtom
+#endif
+);
+    }
+    LAUNCHERROR("kCalculateAmoebaCudaGrycukN2Forces");
+
+#ifdef AMOEBA_DEBUG
+    if( amoebaGpu->log ){
+        debugArray->Download();
+        int paddedNumberOfAtoms  = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
+        for( int jj = 0; jj < gpu->natoms; jj++ ){
+            int debugIndex = jj; 
+            (void) fprintf( amoebaGpu->log,"%5d %5d DebugGrycukChain\n", targetAtom, jj );
+            for( int kk = 0; kk < 7; kk++ ){
+                (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
+                                debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
+                                debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
+                debugIndex += paddedNumberOfAtoms;
+            }   
+            (void) fprintf( amoebaGpu->log,"\n" );
+        }   
+    }   
+#endif
+
+   if( 0 ){ 
+        static int callId                    = 0;
+        gpuContext gpu                       = amoebaGpu->gpuContext;
+        std::vector<int> fileId;
+        fileId.push_back( callId++ ); 
+        VectorOfDoubleVectors outputVector;
+        //cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,              outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
+        //cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psLabFrameDipole,     outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
+        CUDAStream<float>* temp  = new CUDAStream<float>(3*gpu->sim.paddedNumberOfAtoms, 1, "Temp1");
+
+        reduceAndCopyCUDAStreamFloat4( gpu->psForce4, temp, 1.0 );
+        cudaLoadCudaFloatArray( gpu->natoms,  3, temp, outputVector, gpu->psAtomIndex->_pSysData, 1.0f/4.184f ); 
+        cudaLoadCudaFloatArray( gpu->natoms,  1, gpu->psBornForce, outputVector, gpu->psAtomIndex->_pSysData, 1.0f/4.184f ); 
+        cudaLoadCudaFloatArray( gpu->natoms,  1, gpu->psBornRadii, outputVector, gpu->psAtomIndex->_pSysData, 1.0f ); 
+        cudaWriteVectorOfDoubleVectorsToFile( "GryF", fileId, outputVector );
+        delete temp;
+        exit(0);
+    }    
+
+   // ---------------------------------------------------------------------------------------
+}
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaGrycukBornRadii.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaGrycukBornRadii.h
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "amoebaScaleFactors.h"
+
+__global__
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
+#elif (__CUDA_ARCH__ >= 120)
+__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
+#else
+__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
+#endif
+void METHOD_NAME(kCalculateAmoebaGrycukBornRadii, _kernel)( unsigned int* workUnit ){
+
+    extern __shared__ GrycukParticle sA[];
+
+    unsigned int totalWarps      = gridDim.x*blockDim.x/GRID;
+    unsigned int warp            = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
+    unsigned int numWorkUnits    = cSim.pInteractionCount[0];
+    unsigned int pos             = warp*numWorkUnits/totalWarps;
+    unsigned int end             = (warp+1)*numWorkUnits/totalWarps;
+    unsigned int lasty           = 0xFFFFFFFF;
+
+    while (pos < end)
+    {
+
+        unsigned int x;
+        unsigned int y;
+        bool bExclusionFlag;
+
+        // Extract cell coordinates
+
+        decodeCell( workUnit[pos], &x, &y, &bExclusionFlag );
+
+        unsigned int tgx                 = threadIdx.x & (GRID - 1);
+        unsigned int tbx                 = threadIdx.x - tgx;
+        unsigned int tj                  = tgx;
+
+        GrycukParticle*  psA             = &sA[tbx];
+        unsigned int atomI               = x + tgx;
+        GrycukParticle localParticle;
+        loadGrycukShared( &localParticle, atomI );
+
+        float bornSum                    = 0.0f;
+
+        if (x == y) // Handle diagonals uniquely at 50% efficiency
+        {
+
+            // load shared data
+
+            loadGrycukShared( &(sA[threadIdx.x]), atomI );
+
+            for (unsigned int j = 0; j < GRID; j++)
+            {
+                float localBornSum;
+                calculateGrycukBornRadiiPairIxn_kernel( localParticle, psA[j], &localBornSum );
+                bornSum   +=  ( (atomI == (y + j)) || (atomI >= cSim.atoms) || ((y+j) >= cSim.atoms) ) ? 0.0 : localBornSum;
+            }
+
+            // Write results
+
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+            unsigned int offset    = x + tgx + warp*cSim.stride;
+            cSim.pBornSum[offset] += bornSum;
+#else
+            unsigned int offset   = x + tgx + (y >> GRIDBITS) * cSim.stride;
+            cSim.pBornSum[offset] = bornSum;
+#endif
+
+        } else {
+
+            if (lasty != y) {
+                unsigned int atomJ        = y + tgx;
+                loadGrycukShared( &(sA[threadIdx.x]), atomJ );
+            }
+
+           // zero shared fields
+
+            zeroGrycukParticleSharedField(  &(sA[threadIdx.x]) );
+
+            for (unsigned int j = 0; j < GRID; j++)
+            {
+
+                float localBornSum;
+                calculateGrycukBornRadiiPairIxn_kernel( localParticle, psA[tj], &localBornSum );
+                bornSum           +=  ( (atomI >= cSim.atoms) || ((y+tj) >= cSim.atoms) ) ? 0.0 : localBornSum;
+
+                calculateGrycukBornRadiiPairIxn_kernel( psA[tj], localParticle, &localBornSum );
+                psA[tj].bornSum   +=  ( (atomI >= cSim.atoms) || ((y+tj) >= cSim.atoms) ) ? 0.0 : localBornSum;
+
+                tj                 = (tj + 1) & (GRID - 1);
+
+            }
+
+            // Write results
+
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+
+            unsigned int offset    = x + tgx + warp*cSim.stride;
+            cSim.pBornSum[offset] += bornSum;
+
+            offset = y + tgx + warp*cSim.stride;
+            cSim.pBornSum[offset] += sA[threadIdx.x].bornSum;
+#else
+
+            unsigned int offset   = x + tgx + (y >> GRIDBITS) * cSim.stride;
+            cSim.pBornSum[offset] = bornSum;
+
+            offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
+            cSim.pBornSum[offset] = sA[threadIdx.x].bornSum;
+#endif
+
+            lasty = y;
+        }
+
+        pos++;
+    }
+}
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaGrycukChainRule.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaGrycukChainRule.h
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "amoebaScaleFactors.h"
+
+__global__
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
+#elif (__CUDA_ARCH__ >= 120)
+__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
+#else
+__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
+#endif
+void METHOD_NAME(kCalculateAmoebaGrycukChainRule, _kernel)( unsigned int* workUnit 
+#ifdef AMOEBA_DEBUG
+                           , float4* debugArray, unsigned int targetAtom
+#endif
+){
+
+    extern __shared__ GrycukChainRuleParticle sAChainRule[];
+
+    unsigned int totalWarps      = gridDim.x*blockDim.x/GRID;
+    unsigned int warp            = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
+    unsigned int numWorkUnits    = cSim.pInteractionCount[0];
+    unsigned int pos             = warp*numWorkUnits/totalWarps;
+    unsigned int end             = (warp+1)*numWorkUnits/totalWarps;
+    unsigned int lasty           = 0xFFFFFFFF;
+
+#ifdef AMOEBA_DEBUG
+    float4 pullDebug[5];
+#endif
+
+    while (pos < end)
+    {
+
+        unsigned int x;
+        unsigned int y;
+        bool bExclusionFlag;
+
+        // Extract cell coordinates
+
+        decodeCell( workUnit[pos], &x, &y, &bExclusionFlag );
+
+        unsigned int tgx                          = threadIdx.x & (GRID - 1);
+        unsigned int tbx                          = threadIdx.x - tgx;
+        unsigned int tj                           = tgx;
+
+        GrycukChainRuleParticle*  psAChainRule    = &sAChainRule[tbx];
+        unsigned int atomI                        = x + tgx;
+        GrycukChainRuleParticle localParticle;
+        loadGrycukChainRuleParticleShared( &localParticle, atomI );
+
+        zeroGrycukChainRuleParticleSharedField( &localParticle );
+
+        if (x == y){
+
+            // load shared data and zero force
+
+            loadGrycukChainRuleParticleShared( &(sAChainRule[threadIdx.x]), atomI );
+            zeroGrycukChainRuleParticleSharedField( &(sAChainRule[threadIdx.x]));
+
+            for (unsigned int j = (tgx+1)&(GRID-1); j != tgx; j = (j+1)&(GRID-1))
+            {
+                float localForce[3];
+                calculateGrycukChainRulePairIxn_kernel( localParticle, psAChainRule[j], localForce
+#ifdef AMOEBA_DEBUG
+,  pullDebug
+#endif
+ );
+                if( (atomI != (y + j)) && (atomI < cSim.atoms) && ((y+j) < cSim.atoms) ){
+
+                    localParticle.force[0]     -= localForce[0];
+                    localParticle.force[1]     -= localForce[1];
+                    localParticle.force[2]     -= localForce[2];
+
+                    psAChainRule[j].force[0]   += localForce[0];
+                    psAChainRule[j].force[1]   += localForce[1];
+                    psAChainRule[j].force[2]   += localForce[2];
+
+#ifdef AMOEBA_DEBUG
+if( atomI == targetAtom || (y+j) == targetAtom ){
+        unsigned int index                 = (atomI == targetAtom) ? (y + j) : atomI;
+
+        debugArray[index].x                = (float) atomI;
+        debugArray[index].y                = (float) (y + j); 
+        debugArray[index].z                = -1.0f;
+        debugArray[index].w                = -1.0f;
+
+        index                             += cSim.paddedNumberOfAtoms;
+        debugArray[index].x                = (float) x;
+        debugArray[index].y                = (float) y;
+        debugArray[index].z                = (float) tgx;
+        debugArray[index].w                = -2.0f;
+
+        index                             += cSim.paddedNumberOfAtoms;
+        debugArray[index].x                = pullDebug[0].x;
+        debugArray[index].y                = pullDebug[0].y;
+        debugArray[index].z                = pullDebug[0].z;
+        debugArray[index].w                = pullDebug[0].w;
+
+        index                             += cSim.paddedNumberOfAtoms;
+        debugArray[index].x                = pullDebug[1].x;
+        debugArray[index].y                = pullDebug[1].y;
+        debugArray[index].z                = pullDebug[1].z;
+        debugArray[index].w                = pullDebug[1].w;
+
+        index                             += cSim.paddedNumberOfAtoms;
+        debugArray[index].x                = localForce[0];
+        debugArray[index].y                = localForce[1];
+        debugArray[index].z                = localForce[2];
+        debugArray[index].w                = -12.0f;
+
+ calculateGrycukChainRulePairIxn_kernel( psAChainRule[j], localParticle, localForce ,  pullDebug );
+
+        index                             += cSim.paddedNumberOfAtoms;
+        debugArray[index].x                = pullDebug[0].x;
+        debugArray[index].y                = pullDebug[0].y;
+        debugArray[index].z                = pullDebug[0].z;
+        debugArray[index].w                = -13.0f;
+
+        index                             += cSim.paddedNumberOfAtoms;
+        debugArray[index].x                = localForce[0];
+        debugArray[index].y                = localForce[1];
+        debugArray[index].z                = localForce[2];
+        debugArray[index].w                = -14.0f;
+}
+#endif
+
+                }
+            }
+
+            // Write results
+            float4 of; 
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+            unsigned int offset         = x + tgx + warp*cSim.stride;
+#else
+            unsigned int offset         = x + tgx + (x >> GRIDBITS) * cSim.stride;
+#endif
+            of                          = cSim.pForce4[offset];
+            of.x                       += localParticle.force[0]  + sAChainRule[threadIdx.x].force[0];
+            of.y                       += localParticle.force[1]  + sAChainRule[threadIdx.x].force[1];
+            of.z                       += localParticle.force[2]  + sAChainRule[threadIdx.x].force[2];
+            cSim.pForce4[offset]       = of; 
+
+        } else {
+
+            if (lasty != y) {
+                unsigned int atomJ        = y + tgx;
+                loadGrycukChainRuleParticleShared( &(sAChainRule[threadIdx.x]), atomJ );
+            }
+
+           // zero shared fields
+
+            zeroGrycukChainRuleParticleSharedField(  &(sAChainRule[threadIdx.x]) );
+
+            for (unsigned int j = 0; j < GRID; j++)
+            {
+
+                if( (atomI < cSim.atoms) && ((y+tj) < cSim.atoms) ){
+                    float localForce[3];
+                    calculateGrycukChainRulePairIxn_kernel( localParticle, psAChainRule[tj], localForce 
+#ifdef AMOEBA_DEBUG
+,  pullDebug
+#endif
+);
+    
+                    localParticle.force[0]     -= localForce[0];
+                    localParticle.force[1]     -= localForce[1];
+                    localParticle.force[2]     -= localForce[2];
+    
+                    psAChainRule[tj].force[0]  += localForce[0];
+                    psAChainRule[tj].force[1]  += localForce[1];
+                    psAChainRule[tj].force[2]  += localForce[2];
+    
+#ifdef AMOEBA_DEBUG
+unsigned int index                 = (atomI == targetAtom) ? (y + tj) : atomI;
+if( atomI == targetAtom || (y+tj) == targetAtom ){
+
+        debugArray[index].x                = (float) atomI;
+        debugArray[index].y                = (float) (y + tj); 
+        debugArray[index].z                = -1.0f;
+        debugArray[index].w                = -1.0f;
+
+        index                             += cSim.paddedNumberOfAtoms;
+        debugArray[index].x                = (float) x;
+        debugArray[index].y                = (float) y;
+        debugArray[index].z                = (float) tgx;
+        debugArray[index].w                = -2.0f;
+
+        index                             += cSim.paddedNumberOfAtoms;
+        debugArray[index].x                = pullDebug[0].x;
+        debugArray[index].y                = pullDebug[0].y;
+        debugArray[index].z                = pullDebug[0].z;
+        debugArray[index].w                = pullDebug[0].w;
+
+        index                             += cSim.paddedNumberOfAtoms;
+        debugArray[index].x                = pullDebug[1].x;
+        debugArray[index].y                = pullDebug[1].y;
+        debugArray[index].z                = pullDebug[1].z;
+        debugArray[index].w                = pullDebug[1].w;
+
+        index                             += cSim.paddedNumberOfAtoms;
+        debugArray[index].x                = localForce[0];
+        debugArray[index].y                = localForce[1];
+        debugArray[index].z                = localForce[2];
+        debugArray[index].w                = -10.0f;
+}
+#endif
+                    calculateGrycukChainRulePairIxn_kernel( psAChainRule[tj], localParticle, localForce
+#ifdef AMOEBA_DEBUG
+,  pullDebug
+#endif
+ );
+#ifdef AMOEBA_DEBUG
+if( atomI == targetAtom || (y+tj) == targetAtom ){
+        index                             += cSim.paddedNumberOfAtoms;
+        debugArray[index].x                = pullDebug[0].x;
+        debugArray[index].y                = localForce[1];
+        debugArray[index].z                = localForce[2];
+        debugArray[index].w                = -11.0f;
+}
+#endif
+    
+                    localParticle.force[0]     += localForce[0];
+                    localParticle.force[1]     += localForce[1];
+                    localParticle.force[2]     += localForce[2];
+    
+                    psAChainRule[tj].force[0]  -= localForce[0];
+                    psAChainRule[tj].force[1]  -= localForce[1];
+                    psAChainRule[tj].force[2]  -= localForce[2];
+                }
+
+                tj  = (tj + 1) & (GRID - 1);
+
+            }
+
+            // Write results
+
+            float4 of;
+
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+            unsigned int offset         = x + tgx + warp*cSim.stride;
+#else
+            unsigned int offset         = x + tgx + (y >> GRIDBITS) * cSim.stride;
+#endif
+            of                          = cSim.pForce4[offset];
+            of.x                       += localParticle.force[0];
+            of.y                       += localParticle.force[1];
+            of.z                       += localParticle.force[2];
+            cSim.pForce4[offset]       = of;
+
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+            offset                      = y + tgx + warp*cSim.stride;
+#else
+            offset                      = y + tgx + (x >> GRIDBITS) * cSim.stride;
+#endif
+            of                          = cSim.pForce4[offset];
+            of.x                       += sAChainRule[threadIdx.x].force[0];
+            of.y                       += sAChainRule[threadIdx.x].force[1];
+            of.z                       += sAChainRule[threadIdx.x].force[2];
+            cSim.pForce4[offset]       = of;
+
+            lasty = y;
+        }
+
+        pos++;
+    }
+}
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
@@ -1666,8 +1666,8 @@ __launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
 #else
 __launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
 #endif
- void kReduceToBornForcePrefactor_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn1, float* fieldIn2,
-                                          float* fieldOut )
+ void kReduceToObcBornForcePrefactor_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn1, float* fieldIn2,
+                                             float* fieldOut )
 {
    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;

@@ -1723,8 +1723,8 @@ __launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
 #else
 __launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
 #endif
-void kReduceToBornForcePrefactorAndSASA_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn1, float* fieldIn2,
-                                                float* fieldOut )
+void kReduceToObcBornForcePrefactorAndSASA_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn1, float* fieldIn2,
+                                                   float* fieldOut )
 {
    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;

@@ -1788,17 +1788,137 @@ void kReduceToBornForcePrefactorAndSASA_kernel( unsigned int fieldComponents, un
    cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += energy / -6.0f;
 }

-/*
-static void kReduceAndCombine_dBorn(amoebaGpuContext amoebaGpu )
+
+__global__
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
+#elif (__CUDA_ARCH__ >= 120)
+__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
+#else
+__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
+#endif
+ void kReduceToBornForcePrefactor_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn1, float* fieldIn2,
+                                          float* fieldOut )
+{
+    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // Reduce field
+
+    while (pos < fieldComponents)
+    {
+
+        float totalField = 0.0f;
+
+        float* pFt1      = fieldIn1 + pos;
+        float* pFt2      = fieldIn2 + pos;
+
+        //float bornRadius = cSim.pBornRadii[pos];
+        //float obcChain   = cSim.pObcChain[pos];
+
+        unsigned int i   = outputBuffers;
+
+        while (i >= 4)
+        {
+            totalField += pFt1[0] + pFt1[fieldComponents] + pFt1[2*fieldComponents] + pFt1[3*fieldComponents];
+            totalField += pFt2[0] + pFt2[fieldComponents] + pFt2[2*fieldComponents] + pFt2[3*fieldComponents];
+            pFt1       += fieldComponents*4;
+            pFt2       += fieldComponents*4;
+            i          -= 4;
+        }
+
+        if (i >= 2)
+        {
+            totalField += pFt1[0] + pFt1[fieldComponents];
+            totalField += pFt2[0] + pFt2[fieldComponents];
+            pFt1       += fieldComponents*2;
+            pFt2       += fieldComponents*2;
+            i          -= 2;
+        }
+
+        if (i > 0)
+        {
+            totalField += pFt1[0];
+            totalField += pFt2[0];
+        }
+
+        //fieldOut[pos]   = totalField*bornRadius*bornRadius*obcChain;
+        fieldOut[pos]   = totalField;
+        pos            += gridDim.x * blockDim.x;
+    }
+}
+
+__global__
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
+#elif (__CUDA_ARCH__ >= 120)
+__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
+#else
+__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
+#endif
+void kReduceToBornForcePrefactorAndSASA_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn1, float* fieldIn2,
+                                                float* fieldOut )
 {
+    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
+
+    float energy     = 0.0f;
+
+    // Reduce field
+
+    while (pos < fieldComponents)
+    {
+
+        float totalForce = 0.0f;
+
+        float* pFt1      = fieldIn1 + pos;
+        float* pFt2      = fieldIn2 + pos;
+
+        float bornRadius = cSim.pBornRadii[pos];
+        //float obcChain   = cSim.pObcChain[pos];
+        float2 obcData   = cSim.pObcData[pos];
+
+        unsigned int i   = outputBuffers;
+
+        while (i >= 4)
+        {
+            totalForce += pFt1[0] + pFt1[fieldComponents] + pFt1[2*fieldComponents] + pFt1[3*fieldComponents];
+            totalForce += pFt2[0] + pFt2[fieldComponents] + pFt2[2*fieldComponents] + pFt2[3*fieldComponents];
+            pFt1       += fieldComponents*4;
+            pFt2       += fieldComponents*4;
+            i          -= 4;
+        }
+
+        if (i >= 2)
+        {
+            totalForce += pFt1[0] + pFt1[fieldComponents];
+            totalForce += pFt2[0] + pFt2[fieldComponents];
+            pFt1       += fieldComponents*2;
+            pFt2       += fieldComponents*2;
+            i          -= 2;
+        }
+
+        if (i > 0)
+        {
+            totalForce += pFt1[0];
+            totalForce += pFt2[0];
+        }

-    kReduceAndCombineFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
-                                     gpu->sim.paddedNumberOfAtoms, gpu->sim.outputBuffers,
-                                     amoebaGpu->psWorkArray_1_1->_pDevData,
-                                     amoebaGpu->psWorkArray_1_2->_pDevData,
-                                     amoebaGpu->psBorn->_pDevData );
-    LAUNCHERROR("kReduce_dBorn");
-} */
+        float r        = (obcData.x + cSim.dielectricOffset + cSim.probeRadius);
+        float ratio6   = ( (obcData.x + cSim.dielectricOffset) / bornRadius);
+              ratio6   = ratio6*ratio6*ratio6;
+              ratio6   = ratio6*ratio6;
+        float saTerm   = cSim.surfaceAreaFactor * r * r * ratio6;
+
+        totalForce    += saTerm / bornRadius;
+        //totalForce    *= bornRadius * bornRadius * obcChain;
+
+        fieldOut[pos]  = totalForce;
+
+        energy        += saTerm;
+        pos           += gridDim.x * blockDim.x;
+    }
+
+    cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += energy / -6.0f;
+}

 static void kReduceToBornForcePrefactor( amoebaGpuContext amoebaGpu )
 {
@@ -1982,7 +2102,8 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )

    // Tinker's Born1 && E-diff

-    kCalculateObcGbsaForces2( amoebaGpu->gpuContext );
+    //kCalculateObcGbsaForces2( amoebaGpu->gpuContext );
+    kCalculateGrycukGbsaForces2( amoebaGpu );
    kCalculateAmoebaKirkwoodEDiff( amoebaGpu );

   // ---------------------------------------------------------------------------------------

--- a/plugins/amoeba/platforms/reference/src/AmoebaReferenceKernels.cpp
+++ b/plugins/amoeba/platforms/reference/src/AmoebaReferenceKernels.cpp
@@ -36,6 +36,7 @@
 #include "AmoebaReferenceMultipoleForce.h"
 #include "AmoebaReferenceVdwForce.h"
 #include "AmoebaReferenceWcaDispersionForce.h"
+#include "openmm/internal/AmoebaTorsionTorsionForceImpl.h"
 #include "openmm/internal/AmoebaWcaDispersionForceImpl.h"
 #include "AmoebaReferenceUreyBradleyForce.h"
 #include "ReferencePlatform.h"
@@ -412,16 +413,31 @@ void ReferenceCalcAmoebaTorsionTorsionForceKernel::initialize(const System& syst
    for (int ii = 0; ii < numTorsionTorsionGrids; ii++) {

        const TorsionTorsionGrid grid = force.getTorsionTorsionGrid( ii );
-
        torsionTorsionGrids[ii].resize( grid.size() );
+
+        // check if grid needs to be reordered: x-angle should be 'slow' index
+
+        TorsionTorsionGrid reorderedGrid;
+        int reorder = 0; 
+        if( grid[0][0][0] != grid[0][1][0] ){
+            AmoebaTorsionTorsionForceImpl::reorderGrid( grid, reorderedGrid );
+            reorder = 1; 
+        }    
+
        for (unsigned int kk = 0; kk < grid.size(); kk++) {

            torsionTorsionGrids[ii][kk].resize( grid[kk].size() );
            for (unsigned int jj = 0; jj < grid[kk].size(); jj++) {

                torsionTorsionGrids[ii][kk][jj].resize( grid[kk][jj].size() );
-                for (unsigned int ll = 0; ll < grid[ll][jj].size(); ll++) {
-                    torsionTorsionGrids[ii][kk][jj][ll] = static_cast<RealOpenMM>(grid[kk][jj][ll]);
+                if( reorder ){
+                    for (unsigned int ll = 0; ll < grid[ll][jj].size(); ll++) {
+                        torsionTorsionGrids[ii][kk][jj][ll] = static_cast<RealOpenMM>(reorderedGrid[kk][jj][ll]);
+                    }
+                } else {
+                    for (unsigned int ll = 0; ll < grid[ll][jj].size(); ll++) {
+                        torsionTorsionGrids[ii][kk][jj][ll] = static_cast<RealOpenMM>(grid[kk][jj][ll]);
+                    }
                }
            }
        }

--- a/plugins/amoeba/platforms/reference/src/AmoebaReferenceKernels.h
+++ b/plugins/amoeba/platforms/reference/src/AmoebaReferenceKernels.h
@@ -27,8 +27,8 @@
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
 * -------------------------------------------------------------------------- */

-#include "openmm/amoebaKernels.h"
 #include "openmm/System.h"
+#include "openmm/amoebaKernels.h"
 #include "SimTKUtilities/SimTKOpenMMRealType.h"

 namespace OpenMM {