Added code to calculate electrostatic potential given vector of grid points

e79b262b · Mark Friedrichs · 70ba177d · e79b262b · e79b262b · e79b262b
Commit e79b262b authored Jun 15, 2012 by Mark Friedrichs
20 changed files
--- a/plugins/amoeba/openmmapi/include/openmm/AmoebaMultipoleForce.h
+++ b/plugins/amoeba/openmmapi/include/openmm/AmoebaMultipoleForce.h
@@ -35,6 +35,7 @@
 #include "openmm/Force.h"
 #include "openmm/OpenMMException.h"
 #include "openmm/internal/windowsExport.h"
+#include "openmm/Vec3.h"
 #include <sstream>
 #include <vector>
@@ -302,34 +303,6 @@ public:
     */
    void setMutualInducedTargetEpsilon( double inputMutualInducedTargetEpsilon ); 
-    /**
-     * Get the scaling distance cutoff (nm)
-     * 
-     * @return scaling distance cutoff
-     */
-    //double getScalingDistanceCutoff( void ) const;
-    /**
-     * Set the scaling distance cutoff
-     * 
-     * @param scaling distance cutoff
-     */
-    //void setScalingDistanceCutoff( double inputScalingDistanceCutoff ); 
-    /**
-     * Get the electric constant
-     * @return the electric constant
-     */
-    //double getElectricConstant( void ) const;
-    /**
-     * Set the electric constant
-     * 
-     * @param the electric constant
-     */
-    //void setElectricConstant( double inputElectricConstant ); 
    /**
     * Get the error tolerance for Ewald summation.  This corresponds to the fractional error in the forces
     * which is acceptable.  This value is used to select the reciprocal space cutoff and separation
@@ -344,7 +317,19 @@ public:
     * rigorous guarantee that all forces on all atoms will be less than the tolerance, however.
     */
    void setEwaldErrorTolerance(double tol);
+    /**
+     * Get the electrostatic potential.
+     *
+     * @param inputGrid    input grid points over which the potential is to be evaluated
+     * @param context      context
+     * @param outputElectrostaticPotential output potential 
+     */
+    void getElectrostaticPotential( const std::vector< Vec3 >& inputGrid,
+                                    Context& context, std::vector< double >& outputElectrostaticPotential );
 protected:
    ForceImpl* createImpl();
 private:

--- a/plugins/amoeba/openmmapi/include/openmm/amoebaKernels.h
+++ b/plugins/amoeba/openmmapi/include/openmm/amoebaKernels.h
@@ -370,6 +370,10 @@ public:
     * @return the potential energy due to the force
     */
    virtual double execute(ContextImpl& context, bool includeForces, bool includeEnergy) = 0;
+    virtual void getElectrostaticPotential( ContextImpl& context, const std::vector< Vec3 >& inputGrid,
+                                            std::vector< double >& outputElectrostaticPotential ) = 0;
 };
 /**

--- a/plugins/amoeba/openmmapi/include/openmm/internal/AmoebaMultipoleForceImpl.h
+++ b/plugins/amoeba/openmmapi/include/openmm/internal/AmoebaMultipoleForceImpl.h
@@ -35,6 +35,7 @@
 #include "openmm/internal/ForceImpl.h"
 #include "openmm/AmoebaMultipoleForce.h"
 #include "openmm/Kernel.h"
+#include "openmm/Vec3.h"
 #include <utility>
 #include <string>
@@ -81,6 +82,10 @@ public:
     */
    static void getCovalentDegree( const AmoebaMultipoleForce& force, std::vector<int>& covalentDegree );
+    void getElectrostaticPotential( ContextImpl& context, const std::vector< Vec3 >& inputGrid,
+                                    std::vector< double >& outputElectrostaticPotential );
 private:
    AmoebaMultipoleForce& owner;
    Kernel kernel;

--- a/plugins/amoeba/openmmapi/src/AmoebaMultipoleForce.cpp
+++ b/plugins/amoeba/openmmapi/src/AmoebaMultipoleForce.cpp
@@ -33,6 +33,7 @@
 #include "openmm/OpenMMException.h"
 #include "openmm/AmoebaMultipoleForce.h"
 #include "openmm/internal/AmoebaMultipoleForceImpl.h"
+#include <stdio.h>
 using namespace OpenMM;
 using std::string;
@@ -130,22 +131,6 @@ double AmoebaMultipoleForce::getMutualInducedTargetEpsilon( void ) const {
 void AmoebaMultipoleForce::setMutualInducedTargetEpsilon( double inputMutualInducedTargetEpsilon ) {
    mutualInducedTargetEpsilon = inputMutualInducedTargetEpsilon;
 }
-/*
-double AmoebaMultipoleForce::getScalingDistanceCutoff( void ) const {
-    return scalingDistanceCutoff;
-}
-void AmoebaMultipoleForce::setScalingDistanceCutoff( double inputScalingDistanceCutoff ) {
-    scalingDistanceCutoff = inputScalingDistanceCutoff;
-}
-double AmoebaMultipoleForce::getElectricConstant( void ) const {
-    return electricConstant;
-}
-void AmoebaMultipoleForce::setElectricConstant( double inputElectricConstant ) {
-    electricConstant = inputElectricConstant;
-} */
 double AmoebaMultipoleForce::getEwaldErrorTolerance() const {
    return ewaldErrorTol;
@@ -254,6 +239,10 @@ void AmoebaMultipoleForce::getCovalentMaps(int index, std::vector< std::vector<i
    }
 }
+void AmoebaMultipoleForce::getElectrostaticPotential( const std::vector< Vec3 >& inputGrid, Context& context, std::vector< double >& outputElectrostaticPotential ){
+    dynamic_cast<AmoebaMultipoleForceImpl&>(getImplInContext(context)).getElectrostaticPotential(getContextImpl(context), inputGrid, outputElectrostaticPotential);
+}
 ForceImpl* AmoebaMultipoleForce::createImpl() {
    return new AmoebaMultipoleForceImpl(*this);
 }
--- a/plugins/amoeba/openmmapi/src/AmoebaMultipoleForceImpl.cpp
+++ b/plugins/amoeba/openmmapi/src/AmoebaMultipoleForceImpl.cpp
@@ -32,6 +32,7 @@
 #include "openmm/internal/ContextImpl.h"
 #include "openmm/internal/AmoebaMultipoleForceImpl.h"
 #include "openmm/amoebaKernels.h"
+#include <stdio.h>
 using namespace OpenMM;
@@ -137,3 +138,9 @@ void AmoebaMultipoleForceImpl::getCovalentDegree( const AmoebaMultipoleForce& fo
    }   
    return;
 }
+void AmoebaMultipoleForceImpl::getElectrostaticPotential( ContextImpl& context, const std::vector< Vec3 >& inputGrid,
+                                                          std::vector< double >& outputElectrostaticPotential ){
+    kernel.getAs<CalcAmoebaMultipoleForceKernel>().getElectrostaticPotential(context, inputGrid, outputElectrostaticPotential);
+}
--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaData.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaData.cpp
@@ -116,6 +116,10 @@ void AmoebaCudaData::setContextImpl( void* inputContextImpl ) {
    contextImpl = inputContextImpl;
 }
+void AmoebaCudaData::setGpuInitialized( bool inputGpuInitialized) {
+    gpuInitialized = inputGpuInitialized;
+}
 void AmoebaCudaData::initializeGpu( void ) {
    if( !gpuInitialized ){

--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaData.h
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaData.h
@@ -188,6 +188,13 @@ public:
     */
    void setUseVdwNeighborList( int useVdwNeighborList ); 
+    /**
+     * Set flag indicating whether gpu is initialized
+     * 
+     * @param flag indicating gpu is initialized
+     */
+    void setGpuInitialized( bool gpuInitialized ); 
    CudaPlatform::PlatformData& cudaPlatformData;
 private:

--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernelFactory.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernelFactory.cpp
@@ -109,7 +109,7 @@ KernelImpl* AmoebaCudaKernelFactory::createKernelImpl(std::string name, const Pl
    if( mapIterator == contextToAmoebaDataMap.end() ){
        amoebaCudaData                         = new AmoebaCudaData( cudaPlatformData );
        contextToAmoebaDataMap[&context]       = amoebaCudaData;
-        //amoebaCudaData->setLog( stderr );
+        amoebaCudaData->setLog( stderr );
        amoebaCudaData->setContextImpl( static_cast<void*>(&context) );
    } else {
        amoebaCudaData                         = mapIterator->second;

--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
@@ -798,9 +798,6 @@ double CudaCalcAmoebaTorsionTorsionForceKernel::execute(ContextImpl& context, bo
 static void computeAmoebaMultipoleForce( AmoebaCudaData& data ) {
    amoebaGpuContext gpu = data.getAmoebaGpu();
-    if( data.getMultipoleForceCount() == 0 ){
-        gpuCopyWorkUnit( gpu );
-    }
    data.incrementMultipoleForceCount();
    if( 0 && data.getLog() ){
@@ -839,6 +836,30 @@ static void computeAmoebaMultipoleForce( AmoebaCudaData& data ) {
    }
 }
+static void computeAmoebaMultipolePotential( AmoebaCudaData& data, const std::vector< Vec3 >& inputGrid,
+                                             std::vector< double >& outputElectrostaticPotential) {
+    amoebaGpuContext gpu = data.getAmoebaGpu();
+    // load grid to board and allocate board memory for potential buffers
+    // calculate potential
+    // load potential into return vector
+    // deallocate board memory
+    gpuSetupElectrostaticPotentialCalculation( gpu, inputGrid );
+    data.setGpuInitialized( false );
+    data.initializeGpu();
+    kCalculateAmoebaMultipolePotential( gpu );
+    gpuLoadElectrostaticPotential( gpu, inputGrid.size(), outputElectrostaticPotential );
+    gpuCleanupElectrostaticPotentialCalculation( gpu );
+    if( 0 && data.getLog() ){
+        (void) fprintf( data.getLog(), "completed computeAmoebaMultipolePotential\n" );
+        (void) fflush( data.getLog());
+    }
+}
 class CudaCalcAmoebaMultipoleForceKernel::ForceInfo : public CudaForceInfo {
 public:
    ForceInfo(const AmoebaMultipoleForce& force) : force(force) {
@@ -1048,6 +1069,12 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
    return 0.0;
 }
+void CudaCalcAmoebaMultipoleForceKernel::getElectrostaticPotential(ContextImpl& context,  const std::vector< Vec3 >& inputGrid,
+                                                                   std::vector< double >& outputElectrostaticPotential) {
+    computeAmoebaMultipolePotential( data, inputGrid, outputElectrostaticPotential );
+    return;
+}
 /* -------------------------------------------------------------------------- *
 *                       AmoebaGeneralizedKirkwood                            *
 * -------------------------------------------------------------------------- */

--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
@@ -335,6 +335,15 @@ public:
     * @return the potential energy due to the force
     */
    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+    /**
+     * Execute the kernel to calculate the electrostatic potential
+     *
+     * @param context        the context in which to execute this kernel
+     * @param inputGrid      input grid coordinates
+     * @param outputElectrostaticPotential output potential 
+     */
+    void getElectrostaticPotential(ContextImpl& context, const std::vector< Vec3 >& inputGrid,
+                                   std::vector< double >& outputElectrostaticPotential );
 private:
    class ForceInfo;
    int numMultipoles;

--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaGpu.cpp
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaGpu.cpp
@@ -37,6 +37,7 @@
 #define MAX_PARAMETER_PRINT 10
 #include "openmm/OpenMMException.h"
+#include "openmm/Vec3.h"
 #include "cudaKernels.h"
 #include "amoebaCudaKernels.h"
@@ -383,6 +384,21 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
    totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psLabFrameDipole, log );
    totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psLabFrameQuadrupole, log );
+    (void) fflush( log );
+    (void) fprintf( log, "     potentialGridSize                  %u\n",      amoebaGpu->amoebaSim.potentialGridSize);
+    (void) fprintf( log, "     paddedPotentialGridSize            %u\n",      amoebaGpu->amoebaSim.paddedPotentialGridSize);
+    (void) fprintf( log, "     potentialWorkUnits                 %u\n",      amoebaGpu->amoebaSim.potentialWorkUnits );
+    totalMemory += gpuPrintCudaStreamUnsignedInt( amoebaGpu->psPotentialWorkUnit, log );
+    (void) fprintf( log, "     pPotentialWorkUnit                 %p\n",      amoebaGpu->amoebaSim.pPotentialWorkUnit);
+    totalMemory += gpuPrintCudaStreamFloat4( amoebaGpu->psPotentialGrid, log );
+    (void) fprintf( log, "     pPotentialGrid                     %p\n",      amoebaGpu->amoebaSim.pPotentialGrid);
+    totalMemory += gpuPrintCudaStreamFloat( amoebaGpu->psPotential, log );
+    (void) fprintf( log, "     pPotential                         %p\n",      amoebaGpu->amoebaSim.pPotential);
+    (void) fflush( log );
    (void) fprintf( log, "     polarizationType                   %d\n",      amoebaGpu->amoebaSim.polarizationType );
    (void) fprintf( log, "     maxCovalentDegreeSz                %d\n",      amoebaGpu->maxCovalentDegreeSz );
@@ -1473,6 +1489,123 @@ void gpuSetAmoebaBondOffsets(amoebaGpuContext amoebaGpu )
    }
 }
+/**---------------------------------------------------------------------------------------
+   Allocate data structs associated w/ calculation of electrostatic potential
+   @param amoebaGpu        amoebaGpu context
+   @param inputGrid        input grid over which potential is to be calculated
+   --------------------------------------------------------------------------------------- */
+void gpuSetupElectrostaticPotentialCalculation( amoebaGpuContext amoebaGpu, const std::vector< OpenMM::Vec3 >& inputGrid )
+{
+    // ---------------------------------------------------------------------------------------
+    const unsigned int grid                       = amoebaGpu->gpuContext->grid;
+    const unsigned int potentialGridSize          = inputGrid.size();
+    amoebaGpu->amoebaSim.potentialGridSize        = potentialGridSize;
+    amoebaGpu->amoebaSim.paddedPotentialGridSize  = (potentialGridSize + grid - 1)/grid;
+    amoebaGpu->amoebaSim.paddedPotentialGridSize *= grid;
+    const unsigned int atoms                      = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
+    const unsigned int particleDim                = atoms/grid;
+    const unsigned int cells                      = (particleDim*amoebaGpu->amoebaSim.paddedPotentialGridSize)/grid; 
+    CUDAStream<unsigned int>* psPotentialWorkUnit = new CUDAStream<unsigned int>(cells, 1u, "PotentialWorkUnit");
+    unsigned int* pWorkList                       = psPotentialWorkUnit->_pSysData;
+    amoebaGpu->psPotentialWorkUnit                = psPotentialWorkUnit;
+    amoebaGpu->amoebaSim.pPotentialWorkUnit       = psPotentialWorkUnit->_pDevStream[0];
+    unsigned int count                            = 0;
+    for (unsigned int y = 0; y < particleDim; y++)
+    {
+        for (unsigned int x = 0; x < (amoebaGpu->amoebaSim.paddedPotentialGridSize/grid); x++)
+        {
+            pWorkList[count++] = (x << 17) | (y << 2);
+        }
+    }
+    amoebaGpu->amoebaSim.potentialWorkUnits = cells;
+    psPotentialWorkUnit->Upload();
+#ifdef AMOEBA_DEBUG
+    if( amoebaGpu->log ){
+        (void) fprintf( amoebaGpu->log, "gpuSetupElectrostaticPotentialCalculation:: Potential grid=%u %u buffers=%u ttl=%u particleDim=%u cells=%u\n", 
+                        potentialGridSize,  amoebaGpu->amoebaSim.paddedPotentialGridSize, amoebaGpu->gpuContext->sim.outputBuffers,
+                        amoebaGpu->amoebaSim.paddedPotentialGridSize*amoebaGpu->gpuContext->sim.outputBuffers, particleDim, cells );
+    }    
+#endif
+    // load grid 
+    amoebaGpu->psPotentialGrid = new CUDAStream<float4>( potentialGridSize,  1, "PotentialGrid");
+    for( int ii = 0; ii < potentialGridSize; ii++ ){
+        amoebaGpu->psPotentialGrid->_pSysData[ii].x = inputGrid[ii][0];
+        amoebaGpu->psPotentialGrid->_pSysData[ii].y = inputGrid[ii][1];
+        amoebaGpu->psPotentialGrid->_pSysData[ii].z = inputGrid[ii][2];
+    }
+    amoebaGpu->psPotentialGrid->Upload();
+    // allocate memory for potential buffers and zero
+    amoebaGpu->psPotential = new CUDAStream<float>( amoebaGpu->amoebaSim.paddedPotentialGridSize,  amoebaGpu->gpuContext->sim.outputBuffers, "Potential");
+    memset( amoebaGpu->psPotential->_pSysData,      0, sizeof(float)*amoebaGpu->amoebaSim.paddedPotentialGridSize*amoebaGpu->gpuContext->sim.outputBuffers);
+    amoebaGpu->psPotential->Upload();
+    amoebaGpu->amoebaSim.pPotentialGrid = amoebaGpu->psPotentialGrid->_pDevData;
+    amoebaGpu->amoebaSim.pPotential     = amoebaGpu->psPotential->_pDevData;
+}
+/**---------------------------------------------------------------------------------------
+   Load output potential
+   @param amoebaGpu                    amoebaGpu context
+   @param gridSize                     grid size
+   @param outputElectrostaticPotential output potential
+   --------------------------------------------------------------------------------------- */
+void gpuLoadElectrostaticPotential( amoebaGpuContext amoebaGpu, unsigned int gridSize, std::vector< double >& outputElectrostaticPotential ){
+    // ---------------------------------------------------------------------------------------
+    outputElectrostaticPotential.resize( gridSize );
+    amoebaGpu->psPotential->Download();
+    for( int ii = 0; ii < gridSize; ii++ ){
+        outputElectrostaticPotential[ii] = amoebaGpu->psPotential->_pSysData[ii];
+    }
+    return;
+}
+/**---------------------------------------------------------------------------------------
+   Deallocate data structs associated w/ calculation of electrostatic potential
+   @param amoebaGpu        amoebaGpu context
+   --------------------------------------------------------------------------------------- */
+void gpuCleanupElectrostaticPotentialCalculation( amoebaGpuContext amoebaGpu ){
+    // ---------------------------------------------------------------------------------------
+    delete amoebaGpu->psPotentialWorkUnit;
+    amoebaGpu->psPotentialWorkUnit              = NULL;
+    amoebaGpu->amoebaSim.pPotentialWorkUnit     = NULL;
+    delete amoebaGpu->psPotentialGrid;
+    amoebaGpu->psPotentialGrid                  = NULL;
+    amoebaGpu->amoebaSim.pPotentialGrid         = NULL;
+    delete amoebaGpu->psPotential;
+    amoebaGpu->psPotential                      = NULL;
+    amoebaGpu->amoebaSim.pPotential             = NULL;
+}
 /**---------------------------------------------------------------------------------------
   Allocate data structs associated w/ molecular -> lab frame calculation
@@ -4807,34 +4940,6 @@ void trackMutualInducedIterations( amoebaGpuContext amoebaGpu, int iteration){
    }
 }
-/**---------------------------------------------------------------------------------------
-   Track iterations for MI dipoles
-   @param amoebaGpu            amoebaGpuContext reference
-   @param iteration            MI iteration
-   --------------------------------------------------------------------------------------- */
-void gpuCopyWorkUnit( amoebaGpuContext amoebaGpu ){
-// ---------------------------------------------------------------------------------------
-/*
-    gpuContext gpu = amoebaGpu->gpuContext;
-    gpu->psInteractingWorkUnit->Download();
-    gpu->psWorkUnit->Download();
-    amoebaGpu->psWorkUnit->Download();
-    for( unsigned int ii = 0; ii < gpu->psInteractingWorkUnit->_length; ii++ ){
-        gpu->psWorkUnit->_pSysData[ii]            = amoebaGpu->psWorkUnit->_pSysData[ii];
-    }    
-    gpu->psInteractingWorkUnit->Upload();
-    gpu->psWorkUnit->Upload();
-*/
-// ---------------------------------------------------------------------------------------
-}
 #undef  AMOEBA_DEBUG
 /**---------------------------------------------------------------------------------------

--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
@@ -52,6 +52,7 @@ extern void kCalculateAmoebaLocalForces(amoebaGpuContext gpu);
 extern void SetCalculateAmoebaMultipoleForcesSim(amoebaGpuContext gpu);
 extern void GetCalculateAmoebaMultipoleForcesSim(amoebaGpuContext gpu);
 extern void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool performGk );
+extern void kCalculateAmoebaMultipolePotential(amoebaGpuContext amoebaGpu );
 // vdw
@@ -105,6 +106,7 @@ extern void cudaWriteFloat4AndFloat1ArraysToFile( int numberOfAtoms, const std::
 extern void SetCalculateAmoebaElectrostaticSim( amoebaGpuContext amoebaGpu );
 extern void GetCalculateAmoebaElectrostaticSim( amoebaGpuContext amoebaGpu );
 extern void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueToForce );
+extern void cudaComputeAmoebaElectrostaticPotential( amoebaGpuContext amoebaGpu );
 extern void SetCalculateAmoebaPmeDirectElectrostaticSim( amoebaGpuContext amoebaGpu );
 extern void GetCalculateAmoebaPmeDirectElectrostaticSim( amoebaGpuContext amoebaGpu );

--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaTypes.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaTypes.h
@@ -147,6 +147,13 @@ struct cudaAmoebaGmxSimulation {
    float* pMolecularDipole; 
    float* pMolecularQuadrupole; 
+    unsigned int paddedPotentialGridSize;
+    unsigned int potentialGridSize;
+    unsigned int* pPotentialWorkUnit;
+    unsigned int potentialWorkUnits;
+    float4* pPotentialGrid; 
+    float*  pPotential; 
    float* pLabFrameDipole;
    float* pLabFrameQuadrupole;

--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
@@ -28,6 +28,8 @@
 * -------------------------------------------------------------------------- */
 #include "kernels/gputypes.h"
+#include "OpenMM.h"
+#include "openmm/Vec3.h"
 #include "amoebaCudaTypes.h"
 #include <map>
@@ -105,12 +107,17 @@ struct _amoebaGpuContext {
    // buffer indices used for mapping torques onto forces 
    int torqueMapForce4Delete;
-    CUDAStream<int4>* psMultipoleParticlesTorqueBufferIndices;
+    CUDAStream<int4>*    psMultipoleParticlesTorqueBufferIndices;
    CUDAStream<float4>*  psTorqueMapForce4; 
    CUDAStream<float>* psMolecularDipole;
    CUDAStream<float>* psMolecularQuadrupole;
+    CUDAStream<unsigned int>* psPotentialWorkUnit;
+    CUDAStream<float4>* psPotentialGrid;
+    CUDAStream<float>*  psPotential;
    // molecular frame multipoles
    CUDAStream<float>* psLabFrameDipole;
@@ -283,6 +290,13 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
                                     int nonbondedMethod, int polarizationType, float cutoffDistance,  float alphaEwald );
+extern "C"
+void gpuSetupElectrostaticPotentialCalculation( amoebaGpuContext amoebaGpu, const std::vector< OpenMM::Vec3 >& inputGrid );
+extern "C"
+void gpuLoadElectrostaticPotential( amoebaGpuContext amoebaGpu, unsigned int gridSize, std::vector< double >& outputElectrostaticPotential );
+extern "C"
+void gpuCleanupElectrostaticPotentialCalculation( amoebaGpuContext amoebaGpu );
 extern "C"
 void gpuSetAmoebaObcParameters( amoebaGpuContext amoebaGpu , float innerDielectric, float solventDielectric,
                                const std::vector<float>& radius, const std::vector<float>& scale, const std::vector<float>& charge,
@@ -323,7 +337,4 @@ void amoebaGpuSetConstants(amoebaGpuContext gpu, int updateFlag );
 extern "C"
 void gpuSetAmoebaBondOffsets(amoebaGpuContext gpu);
-extern "C"
-void gpuCopyWorkUnit(amoebaGpuContext gpu);
 #endif //__AMOEBA_GPUTYPES_H__
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
@@ -27,6 +27,7 @@
 #include "amoebaGpuTypes.h"
 #include "amoebaCudaKernels.h"
 #include "kCalculateAmoebaCudaUtilities.h"
+#include <stdio.h>
 static __constant__ cudaGmxSimulation cSim;
 static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;
@@ -39,6 +40,7 @@ void SetCalculateAmoebaElectrostaticSim(amoebaGpuContext amoebaGpu)
    RTERROR(status, "SetCalculateAmoebaElectrostaticSim: cudaMemcpyToSymbol: SetSim copy to cSim failed");
    status         = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation));    
    RTERROR(status, "SetCalculateAmoebaElectrostaticSim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed");
 }
 void GetCalculateAmoebaElectrostaticSim(amoebaGpuContext amoebaGpu)
@@ -624,7 +626,7 @@ static void kReduceTorque(amoebaGpuContext amoebaGpu ){
   Compute Amoeba electrostatic force & torque
   @param amoebaGpu        amoebaGpu context
-   @param gpu              OpenMM gpu Cuda context
+   @param addTorqueToForce if set, then add force resulting from torque to force array
   --------------------------------------------------------------------------------------- */
@@ -634,8 +636,6 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueTo
    gpuContext gpu = amoebaGpu->gpuContext;
-    // apparently debug array can take up nontrivial no. registers
    // on first pass, set threads/block
    static unsigned int threadsPerBlock = 0;
@@ -669,3 +669,230 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueTo
   // ---------------------------------------------------------------------------------------
 }
+struct ElectrostaticPotentialParticle {
+    // coordinates charge
+    float x;
+    float y;
+    float z;
+    float q;
+    // lab frame dipole
+    float labFrameDipole[3];
+    // lab frame quadrupole
+    float labFrameQuadrupole[9];
+    // induced dipole
+    float inducedDipole[3];
+};
+/**---------------------------------------------------------------------------------------
+   Load data for particle w/ index=atomI
+   @param sa        address to store atomI's coordinates and multipole moments
+   @param atomI     index of atom whose data is to be stored
+   --------------------------------------------------------------------------------------- */
+static __device__ void loadElectrostaticPotentialParticle( volatile struct ElectrostaticPotentialParticle* sA, unsigned int atomI ){
+    // coordinates & charge
+    sA->x                        = cSim.pPosq[atomI].x;
+    sA->y                        = cSim.pPosq[atomI].y;
+    sA->z                        = cSim.pPosq[atomI].z;
+    sA->q                        = cSim.pPosq[atomI].w;
+    // lab dipole
+    sA->labFrameDipole[0]        = cAmoebaSim.pLabFrameDipole[atomI*3];
+    sA->labFrameDipole[1]        = cAmoebaSim.pLabFrameDipole[atomI*3+1];
+    sA->labFrameDipole[2]        = cAmoebaSim.pLabFrameDipole[atomI*3+2];
+    // lab quadrupole
+    sA->labFrameQuadrupole[0]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9];
+    sA->labFrameQuadrupole[1]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+1];
+    sA->labFrameQuadrupole[2]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+2];
+    sA->labFrameQuadrupole[3]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+3];
+    sA->labFrameQuadrupole[4]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+4];
+    sA->labFrameQuadrupole[5]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+5];
+    sA->labFrameQuadrupole[6]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+6];
+    sA->labFrameQuadrupole[7]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+7];
+    sA->labFrameQuadrupole[8]    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+8];
+    // induced dipole
+    sA->inducedDipole[0]         = cAmoebaSim.pInducedDipole[atomI*3];
+    sA->inducedDipole[1]         = cAmoebaSim.pInducedDipole[atomI*3+1];
+    sA->inducedDipole[2]         = cAmoebaSim.pInducedDipole[atomI*3+2];
+}
+/**---------------------------------------------------------------------------------------
+   Calculate potential at grid point due atomI
+   Code adapted from TINKER routine potpoint in potpoint.f
+   @param atomI     atomI's coordinates and multipole moments
+   @param gridPoint grid coordinates
+   @param potential output potential
+   --------------------------------------------------------------------------------------- */
+__device__ void calculateElectrostaticPotentialForAtomGridPoint_kernel( volatile ElectrostaticPotentialParticle& atomI, volatile float4& gridPoint, float* potential ){
+    float xr                 = atomI.x - gridPoint.x;
+    float yr                 = atomI.y - gridPoint.y;
+    float zr                 = atomI.z - gridPoint.z;
+    float r2                 = xr*xr + yr*yr + zr*zr;
+    float r                  = sqrtf( r2 );
+    float rr1                = 1.0f/r;
+    *potential               = atomI.q*rr1;
+    float rr2                = rr1*rr1;
+    float rr3                = rr1*rr2;
+    float scd                = atomI.labFrameDipole[0]*xr     +  atomI.labFrameDipole[1]*yr    + atomI.labFrameDipole[2]*zr;
+    float scu                =  atomI.inducedDipole[0]*xr     +   atomI.inducedDipole[1]*yr    +  atomI.inducedDipole[2]*zr;
+    *potential              -= (scd + scu)*rr3;
+    float rr5                = 3.0f*rr3*rr2;
+    float scq                = xr*(atomI.labFrameQuadrupole[0]*xr + atomI.labFrameQuadrupole[1]*yr + atomI.labFrameQuadrupole[2]*zr);
+          scq               += yr*(atomI.labFrameQuadrupole[1]*xr + atomI.labFrameQuadrupole[4]*yr + atomI.labFrameQuadrupole[5]*zr);
+          scq               += zr*(atomI.labFrameQuadrupole[2]*xr + atomI.labFrameQuadrupole[5]*yr + atomI.labFrameQuadrupole[8]*zr);
+    *potential              += scq*rr5;
+    return;
+}
+// Include versions of the kernels for N x PotentialGridSize calculations.
+#undef USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME 
+#define METHOD_NAME(a, b) a##NxG##b
+#include "kCalculateAmoebaCudaElectrostaticPotential.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##NxGByWarp##b
+#include "kCalculateAmoebaCudaElectrostaticPotential.h"
+// Kernel to reduce potential
+__global__ 
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
+#elif (__CUDA_ARCH__ >= 120)
+__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
+#else
+__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
+#endif
+void kReducePotential_kernel()
+{
+    unsigned int pos             = (blockIdx.x * blockDim.x + threadIdx.x);
+    float conversionFactor       = (cAmoebaSim.electric/cAmoebaSim.dielec);
+    // Reduce potential
+    while (pos < cAmoebaSim.paddedPotentialGridSize)
+    {
+        float totalPotential         = 0.0f;
+        float* pFt                   = cAmoebaSim.pPotential + pos;
+        int i                        = cSim.outputBuffers;
+        while (i >= 4)
+        {
+            float f1             = *pFt;
+            pFt                 += cAmoebaSim.paddedPotentialGridSize;
+            float f2             = *pFt;
+            pFt                 += cAmoebaSim.paddedPotentialGridSize;
+            float f3             = *pFt;
+            pFt                 += cAmoebaSim.paddedPotentialGridSize;
+            float f4             = *pFt;
+            pFt                 += cAmoebaSim.paddedPotentialGridSize;
+            totalPotential      += f1 + f2 + f3 + f4;
+            i                   -= 4;
+        }
+        if (i >= 2)
+        {
+            float f1             = *pFt;
+            pFt                 += cAmoebaSim.paddedPotentialGridSize;
+            float f2             = *pFt;
+            pFt                 += cAmoebaSim.paddedPotentialGridSize;
+            totalPotential      += f1 + f2;
+            i                   -= 2;
+        }
+        if (i > 0)
+        {
+            totalPotential += *pFt;
+        }
+        totalPotential *= conversionFactor;
+        pFt             = cAmoebaSim.pPotential + pos;
+        *pFt            = totalPotential;
+        pos            += gridDim.x*blockDim.x;
+    }   
+}
+/**---------------------------------------------------------------------------------------
+   Reduce Amoeba electrostatic potential
+   @param gpu        gpu context
+   --------------------------------------------------------------------------------------- */
+void kReducePotential(gpuContext gpu)
+{
+    kReducePotential_kernel<<<gpu->sim.blocks, gpu->sim.bsf_reduce_threads_per_block>>>();
+    LAUNCHERROR("kReducePotential");
+}
+/**---------------------------------------------------------------------------------------
+   Compute Amoeba electrostatic potential
+   @param amoebaGpu        amoebaGpu context
+   --------------------------------------------------------------------------------------- */
+void cudaComputeAmoebaElectrostaticPotential( amoebaGpuContext amoebaGpu ){
+   // ---------------------------------------------------------------------------------------
+    gpuContext gpu = amoebaGpu->gpuContext;
+    // on first pass, set threads/block
+    static unsigned int threadsPerBlock = 0;
+    if( threadsPerBlock == 0 ){
+        unsigned int maxThreads;
+        if (gpu->sm_version >= SM_20)
+            //maxThreads = 384;
+            maxThreads = 512;
+        else if (gpu->sm_version >= SM_12)
+            maxThreads = 128;
+        else
+            maxThreads = 64;
+        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(ElectrostaticPotentialParticle), gpu->sharedMemoryPerBlock), maxThreads);
+    }
+    if (gpu->bOutputBufferPerWarp){
+        kCalculateAmoebaCudaElectrostaticPotentialNxGByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(ElectrostaticPotentialParticle)*threadsPerBlock>>>( );
+    } else {
+        kCalculateAmoebaCudaElectrostaticPotentialNxG_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(ElectrostaticPotentialParticle)*threadsPerBlock>>>( );
+    }
+    LAUNCHERROR("kCalculateAmoebaCudaElectrostaticPotential");
+    kReducePotential( amoebaGpu->gpuContext );
+   // ---------------------------------------------------------------------------------------
+}
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostaticPotential.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostaticPotential.h
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+__global__ 
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(512, 1)
+#elif (__CUDA_ARCH__ >= 120)
+__launch_bounds__(128, 1)
+#else
+__launch_bounds__(64, 1)
+#endif
+void METHOD_NAME(kCalculateAmoebaCudaElectrostaticPotential, _kernel)( void ){
+    extern __shared__ volatile ElectrostaticPotentialParticle sAPotential[];
+    unsigned int* workUnit       = cAmoebaSim.pPotentialWorkUnit;
+    unsigned int totalWarps      = gridDim.x*blockDim.x/GRID;
+    unsigned int warp            = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
+    unsigned int numWorkUnits    = cAmoebaSim.potentialWorkUnits;
+    unsigned int pos             = warp*numWorkUnits/totalWarps;
+    unsigned int end             = (warp+1)*numWorkUnits/totalWarps;
+    while (pos < end){
+        unsigned int x;
+        unsigned int y;
+        bool bExclusionFlag;
+        // Extract cell coordinates
+        decodeCell( workUnit[pos], &x, &y, &bExclusionFlag );
+        unsigned int tgx              = threadIdx.x & (GRID - 1);
+        unsigned int tbx              = threadIdx.x - tgx;
+        unsigned int tj               = tgx;
+        volatile ElectrostaticPotentialParticle* psA = &sAPotential[tbx];
+        unsigned int gridPointIndex   = x + tgx;
+        unsigned int particleIndex    = y + tgx;
+        // load particle info
+        loadElectrostaticPotentialParticle( &(sAPotential[threadIdx.x]), particleIndex );
+        float totalPotential  = 0.0f;
+        for (unsigned int j = 0; j < GRID; j++){
+            unsigned int particleJ = y + tj;
+            float potential;
+            calculateElectrostaticPotentialForAtomGridPoint_kernel( psA[tj], cAmoebaSim.pPotentialGrid[gridPointIndex], &potential );
+            if( particleJ < cSim.atoms && gridPointIndex < cAmoebaSim.potentialGridSize ){
+                totalPotential += potential;
+            }
+            tj = (tj + 1) & (GRID - 1);
+        }
+        // Write results
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+        unsigned int offset            = (x + tgx + warp*cAmoebaSim.paddedPotentialGridSize);
+        cAmoebaSim.pPotential[offset] += totalPotential; 
+#else
+        unsigned int offset            = (x + tgx + (y >> GRIDBITS)*cAmoebaSim.paddedPotentialGridSize);
+        cAmoebaSim.pPotential[offset]  = totalPotential; 
+#endif
+        pos++;
+    }
+}
--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodParticle.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodParticle.h
@@ -42,7 +42,7 @@ struct KirkwoodParticle {
    float dBornRadius;
    float dBornRadiusPolar;
-//    float padding;
+    //float padding;
 };

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaMutualInducedAndGkFields.cu
@@ -30,6 +30,7 @@
 #include "openmm/OpenMMException.h"
 #include <stdio.h>
+#include <sstream>
 using namespace std;
@@ -587,7 +588,9 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoe
        // check for nans
        if( currentEpsilon != currentEpsilon ){
-             throw OpenMM::OpenMMException("GkFieldBySOR: Nans detected in induced dipole calculation.");
+             std::stringstream msg;            
+             msg << "GkFieldBySOR: Nans detected in induced dipole calculation at iteration=" << iteration << " call=" << timestep << ".";
+             throw OpenMM::OpenMMException( msg.str() );
        }
        // converged?

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaRotateFrame.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaRotateFrame.cu
@@ -493,3 +493,65 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG
    }
 }
+void kCalculateAmoebaMultipolePotential(amoebaGpuContext amoebaGpu ) 
+{
+    std::string methodName = "kCalculateAmoebaMultipolePotential";
+    // compute lab frame moments
+    cudaComputeAmoebaLabFrameMoments( amoebaGpu );
+    if( 0 ){
+        gpuContext gpu                       = amoebaGpu->gpuContext;
+        std::vector<int> fileId;
+        //fileId.push_back( 0 );
+        VectorOfDoubleVectors outputVector;
+        //cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,              outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
+        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psLabFrameDipole,     outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
+        cudaLoadCudaFloatArray( gpu->natoms,  9, amoebaGpu->psLabFrameQuadrupole, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
+        cudaWriteVectorOfDoubleVectorsToFile( "CudaLabMoments", fileId, outputVector );
+    }   
+    // compute fixed E-field and mutual induced field 
+    if( amoebaGpu->multipoleNonbondedMethod == AMOEBA_NO_CUTOFF ){
+        cudaComputeAmoebaFixedEField( amoebaGpu );
+        cudaComputeAmoebaMutualInducedField( amoebaGpu );
+    } else {
+        gpuContext gpu = amoebaGpu->gpuContext;
+        kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
+        LAUNCHERROR("kFindBlockBoundsPeriodic");
+        kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
+        LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
+        compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
+        //compactStream( gpu->compactPlan, 
+        //               gpu->sim.pInteractingWorkUnit, unsigned int* dOut
+        //               amoebaGpu->psWorkUnit->_pDevData, const unsigned int* dIn
+        //               gpu->sim.pInteractionFlag,        const unsigned int* dValid
+        //               gpu->sim.workUnits,               gpu
+        //               gpu->sim.pInteractionCount);
+        kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+        LAUNCHERROR("kFindInteractionsWithinBlocksPeriodic");
+        cudaComputeAmoebaPmeFixedEField( amoebaGpu );
+        cudaComputeAmoebaPmeMutualInducedField( amoebaGpu );
+    }
+    // check if induce dipole calculation converged -- abort if it did not
+    if( amoebaGpu->mutualInducedDone == 0 ){
+       throw OpenMM::OpenMMException("Induced dipole calculation did not converge" );
+    }
+    // calculate electrostatic potential
+    cudaComputeAmoebaElectrostaticPotential( amoebaGpu );
+}
--- a/plugins/amoeba/platforms/reference/src/AmoebaReferenceKernels.cpp
+++ b/plugins/amoeba/platforms/reference/src/AmoebaReferenceKernels.cpp
@@ -563,6 +563,11 @@ double ReferenceCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bo
    return static_cast<double>(energy);
 }
+void ReferenceCalcAmoebaMultipoleForceKernel::getElectrostaticPotential(ContextImpl& context, const std::vector< Vec3 >& inputGrid,
+                                                                        std::vector< double >& outputElectrostaticPotential ){
+    return;
+}
 ///* -------------------------------------------------------------------------- *
 // *                       AmoebaGeneralizedKirkwood                            *
 // * -------------------------------------------------------------------------- */