Cutoffs/PBC for Vdw force

b8e12104 · Mark Friedrichs · 408469c3 · b8e12104 · b8e12104 · b8e12104
Commit b8e12104 authored Oct 09, 2010 by Mark Friedrichs
18 changed files
--- a/platforms/cuda/src/kernels/cudaKernels.h
+++ b/platforms/cuda/src/kernels/cudaKernels.h
@@ -136,3 +136,8 @@ extern void SetCustomExternalGlobalParams(const std::vector<float>& paramValues)
 extern void SetCustomNonbondedForceExpression(const Expression<256>& expression);
 extern void SetCustomNonbondedEnergyExpression(const Expression<256>& expression);
 extern void SetCustomNonbondedGlobalParams(const std::vector<float>& paramValues);
+extern __global__ void OPENMMCUDA_EXPORT kFindBlockBoundsPeriodic_kernel( void );
+extern __global__ void OPENMMCUDA_EXPORT kFindBlocksWithInteractionsPeriodic_kernel( void );
+extern __global__ void OPENMMCUDA_EXPORT kFindInteractionsWithinBlocksPeriodic_kernel( unsigned int* workUnit );
--- a/plugins/amoeba/openmmapi/include/AmoebaVdwForce.h
+++ b/plugins/amoeba/openmmapi/include/AmoebaVdwForce.h
@@ -143,11 +143,41 @@ public:
     */
    void getParticleExclusions( int particleIndex, std::vector< int >& exclusions ) const;
+    /**
+     * Set cutoff
+     * 
+     * @param cutoff cutoff
+     */
+    void setCutoff( double cutoff );
+    /**
+     * Get cutoff
+     * 
+     * @return cutoff
+     */
+    double getCutoff( void ) const;
+    /**
+     * Set flag for employing periodic boundary conditions
+     * 
+     * @param pbcFlag if nonozero, use periodic boundary conditions
+     */
+    void setPBC( int pbcFlag );
+    /**
+     * Get periodic boundary conditions flag
+     * 
+     * @return periodic boundary conditions flag (nonzero -> use PBC)
+     */
+    int getPBC( void ) const;
 protected:
    ForceImpl* createImpl();
 private:
    class VdwInfo;
+    int usePBC;
+    double cutoff;
    std::string sigmaCombiningRule;
    std::string epsilonCombiningRule;
    std::vector< std::vector<int> > exclusions;
@@ -170,7 +200,7 @@ private:
 class AmoebaVdwForce::VdwInfo {
 public:
    int ivIndex, classIndex;
-    double reductionFactor, sigma, epsilon;
+    double reductionFactor, sigma, epsilon, cutoff;
    VdwInfo() {
        ivIndex = classIndex = -1;
        reductionFactor      = 0.0;

--- a/plugins/amoeba/openmmapi/src/AmoebaVdwForce.cpp
+++ b/plugins/amoeba/openmmapi/src/AmoebaVdwForce.cpp
@@ -36,7 +36,7 @@
 using namespace OpenMM;
-AmoebaVdwForce::AmoebaVdwForce() {
+AmoebaVdwForce::AmoebaVdwForce() : usePBC(0), cutoff(1.0e+10) {
 }
 int AmoebaVdwForce::addParticle(int ivIndex, int classIndex, double sigma, double epsilon, double reductionFactor ) {
@@ -102,6 +102,22 @@ void AmoebaVdwForce::getParticleExclusions( int particleIndex, std::vector< int
 }
+void AmoebaVdwForce::setCutoff( double inputCutoff ){
+    cutoff = inputCutoff;
+}
+double AmoebaVdwForce::getCutoff( void ) const {
+    return cutoff;
+}
+void AmoebaVdwForce::setPBC( int pbcFlag ){
+    usePBC = pbcFlag;
+}
+int AmoebaVdwForce::getPBC( void ) const {
+    return usePBC;
+}
 ForceImpl* AmoebaVdwForce::createImpl() {
    return new AmoebaVdwForceImpl(*this);
 }
--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
@@ -954,7 +954,7 @@ static void computeAmoebaVdwForce( AmoebaCudaData& data ) {
    // Vdw14_7F
-    kCalculateAmoebaVdw14_7Forces(gpu);
+    kCalculateAmoebaVdw14_7Forces(gpu, data.getApplyCutoff());
 }
 class CudaCalcAmoebaVdwForceKernel::ForceInfo : public CudaForceInfo {
@@ -1014,7 +1014,7 @@ void CudaCalcAmoebaVdwForceKernel::initialize(const System& system, const Amoeba
    gpuSetAmoebaVdwParameters( data.getAmoebaGpu(), indexIVs, indexClasses, sigmas, epsilons, reductions,
                               force.getSigmaCombiningRule(), force.getEpsilonCombiningRule(),
-                               allExclusions );
+                               allExclusions, force.getPBC(), static_cast<float>(force.getCutoff()) );
    data.getAmoebaGpu()->gpuContext->forces.push_back(new ForceInfo(force));
 }

--- a/plugins/amoeba/platforms/cuda/src/kernels/AmoebaGpu.cpp
+++ b/plugins/amoeba/platforms/cuda/src/kernels/AmoebaGpu.cpp
@@ -339,8 +339,6 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
    (void) fprintf( log, "     maxCovalentDegreeSz                %d\n",      amoebaGpu->maxCovalentDegreeSz );
    (void) fprintf( log, "     solventDielectric                  %10.3f\n",  amoebaGpu->solventDielectric);
-    (void) fprintf( log, "     pGamma                             %10.3f\n",  amoebaGpu->pGamma );
-    (void) fprintf( log, "     scalingDistanceCutoff              %10.3f\n",  amoebaGpu->scalingDistanceCutoff );
    (void) fprintf( log, "     scalingDistanceCutoff              %15.7e\n",  amoebaGpu->amoebaSim.scalingDistanceCutoff );
    (void) fprintf( log, "     pDampingFactorAndThole             %p\n",      amoebaGpu->amoebaSim.pDampingFactorAndThole );
    (void) fprintf( log, "     pScaleIndicesIndex                 %p\n",      amoebaGpu->amoebaSim.pScaleIndicesIndex );
@@ -384,7 +382,6 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
    gpuPrintCudaStreamFloat( amoebaGpu->psWorkVector[0], log );
    gpuPrintCudaStreamFloat( amoebaGpu->psForce, log );
    gpuPrintCudaStreamFloat( amoebaGpu->psTorque, log );
-    gpuPrintCudaStreamFloat( amoebaGpu->psEnergy, log );
    gpuPrintCudaStreamFloat( amoebaGpu->torqueMapForce, log );
    (void) fprintf( log, "     maxMapTorqueDifference             %d\n",  amoebaGpu->maxMapTorqueDifference );
@@ -404,12 +401,11 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
    (void) fprintf( log, "\n" );
-    (void) fprintf( log, "     useVdwTable                        %u\n",  amoebaGpu->useVdwTable);
-    (void) fprintf( log, "     vdwTableSize                       %u\n",  amoebaGpu->vdwTableSize);
    (void) fprintf( log, "     vdwSigmaCombiningRule              %d\n",      amoebaGpu->vdwSigmaCombiningRule);
    (void) fprintf( log, "     vdwEpsilonCombiningRule            %d\n",      amoebaGpu->vdwEpsilonCombiningRule);
+    (void) fprintf( log, "     vdwUsePBC                          %d\n",      amoebaGpu->amoebaSim.vdwUsePBC);
+    (void) fprintf( log, "     vdwCutoff2                         %15.7e\n",  amoebaGpu->amoebaSim.vdwCutoff2);
    gpuPrintCudaStreamFloat2( amoebaGpu->psVdwSigmaEpsilon, log );
-    gpuPrintCudaStreamFloat2( amoebaGpu->psVdwTable, log );
    gpuPrintCudaStreamInt( amoebaGpu->psAmoebaVdwNonReductionID, log );
    gpuPrintCudaStreamInt4( amoebaGpu->psAmoebaVdwReductionID, log );
    gpuPrintCudaStreamFloat( amoebaGpu->psAmoebaVdwReduction, log );
@@ -1469,7 +1465,6 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
    amoebaGpu->maxCovalentDegreeSz                                            = maxCovalentRange;
    amoebaGpu->paddedNumberOfAtoms                                            = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
-    amoebaGpu->scalingDistanceCutoff                                          = static_cast<float>(scalingDistanceCutoff);
    gpuRotationToLabFrameAllocate( amoebaGpu );
    gpuFixedEFieldAllocate( amoebaGpu );
@@ -1708,11 +1703,6 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
            // covalent/polarization degree
-            if( ii < 1 ){ 
-                (void) fprintf( amoebaGpu->log,"Gamma=%.5f scaledDistCutoff=%.5f\n",
-                                amoebaGpu->pGamma, amoebaGpu->scalingDistanceCutoff );
-            }
            (void) fprintf( amoebaGpu->log,"%3d covalent/polarization degree: minIdx[%6d %6d] Thole=%12.5f dampingFactor=%12.5f\n", ii,
                            amoebaGpu->psCovalentDegree->_pSysStream[0][particlesOffset],  amoebaGpu->psPolarizationDegree->_pSysStream[0][particlesOffset],
                            amoebaGpu->psDampingFactorAndThole->_pSysStream[0][ii].y, amoebaGpu->psDampingFactorAndThole->_pSysStream[0][ii].x );
@@ -1884,7 +1874,7 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
    // upload
-    amoebaGpu->amoebaSim.scalingDistanceCutoff = amoebaGpu->scalingDistanceCutoff;
+    amoebaGpu->amoebaSim.scalingDistanceCutoff = static_cast<float>(scalingDistanceCutoff);
    amoebaGpu->amoebaSim.numberOfAtoms         = amoebaGpu->gpuContext->natoms;
    amoebaGpu->amoebaSim.paddedNumberOfAtoms   = amoebaGpu->paddedNumberOfAtoms;
@@ -1992,7 +1982,8 @@ void gpuSetAmoebaVdwParameters( amoebaGpuContext amoebaGpu,
                                const std::vector<float>& reductions,
                                const std::string& vdwSigmaCombiningRule,
                                const std::string& vdwEpsilonCombiningRule,
-                                const std::vector< std::vector<int> >& allExclusions )
+                                const std::vector< std::vector<int> >& allExclusions,
+                                int usePBC, float cutoff )
 {
   // ---------------------------------------------------------------------------------------
@@ -2000,10 +1991,14 @@ void gpuSetAmoebaVdwParameters( amoebaGpuContext amoebaGpu,
   // ---------------------------------------------------------------------------------------
    gpuContext gpu                         = amoebaGpu->gpuContext;
    amoebaGpu->paddedNumberOfAtoms         = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
    unsigned int particles                 = sigmas.size();
+    amoebaGpu->amoebaSim.vdwUsePBC         = usePBC;
+    amoebaGpu->amoebaSim.vdwCutoff2        = cutoff*cutoff;
    // set sigma combining rule flag
    if( vdwSigmaCombiningRule.compare( "ARITHMETIC" ) == 0 ){
@@ -2157,13 +2152,12 @@ void gpuSetAmoebaVdwParameters( amoebaGpuContext amoebaGpu,
    psVdwReductionID->Upload();
    psAmoebaVdwReduction->Upload();
+    amoebaGpuBuildOutputBuffers( amoebaGpu );
    amoebaGpuBuildVdwExclusionList( amoebaGpu, allExclusions );
 #ifdef AMOEBA_DEBUG
    if( amoebaGpu->log ){
        unsigned int maxPrint = 32;
-        (void) fprintf( amoebaGpu->log, "%s useVdwTable=%d size=%d\n", methodName, amoebaGpu->useVdwTable, 
-                        (amoebaGpu->useVdwTable ? amoebaGpu->vdwTableSize : 0) ); 
        (void) fprintf( amoebaGpu->log, "%s sigma/epsilon combining rules=%d %d\n", methodName, 
                        amoebaGpu->vdwSigmaCombiningRule, amoebaGpu->vdwEpsilonCombiningRule);
        for (unsigned int ii = 0; ii < gpu->natoms; ii++) 
@@ -2659,7 +2653,6 @@ void amoebaGpuShutDown(amoebaGpuContext gpu)
    delete gpu->psWorkVector[3];
    delete gpu->psForce;
    delete gpu->psTorque;
-    delete gpu->psEnergy;
    delete gpu->torqueMapForce;
    delete gpu->psGk_Field;
@@ -2671,7 +2664,6 @@ void amoebaGpuShutDown(amoebaGpuContext gpu)
    delete gpu->psKirkwoodEDiffForce;
    delete gpu->psVdwSigmaEpsilon;
-    delete gpu->psVdwTable;
    delete gpu->psAmoebaVdwNonReductionID;
    delete gpu->psAmoebaVdwReductionID;
    delete gpu->psAmoebaVdwReduction;
@@ -2768,7 +2760,12 @@ extern "C"
 void amoebaGpuBuildOutputBuffers( amoebaGpuContext amoebaGpu )
 {
+    if( amoebaGpu->nonbondBlocks == amoebaGpu->gpuContext->sim.blocks ){
+        return;
+    }
    unsigned int paddedNumberOfAtoms                = amoebaGpu->paddedNumberOfAtoms;
    amoebaGpu->nonbondBlocks                        = amoebaGpu->gpuContext->sim.blocks;
    amoebaGpu->threadsPerBlock                      = amoebaGpu->gpuContext->sim.threads_per_block;
@@ -2810,6 +2807,15 @@ void amoebaGpuBuildOutputBuffers( amoebaGpuContext amoebaGpu )
                         amoebaGpu->fieldReduceThreadsPerBlock ); 
        (void) fflush( amoebaGpu->log );
    }
+    if( amoebaGpu->psWorkArray_3_1 ){
+        delete amoebaGpu->psWorkArray_3_1;
+        delete amoebaGpu->psWorkArray_3_2;
+        delete amoebaGpu->psWorkArray_3_3;
+        delete amoebaGpu->psWorkArray_3_4;
+        delete amoebaGpu->psWorkArray_1_1;
+        delete amoebaGpu->psWorkArray_1_2;
+    }
    amoebaGpu->psWorkArray_3_1            = new CUDAStream<float>(3*paddedNumberOfAtoms, (amoebaGpu->outputBuffers), "AmoebaField_3_1");
    amoebaGpu->amoebaSim.pWorkArray_3_1   = amoebaGpu->psWorkArray_3_1->_pDevStream[0];
@@ -2826,8 +2832,6 @@ void amoebaGpuBuildOutputBuffers( amoebaGpuContext amoebaGpu )
    amoebaGpu->psWorkArray_1_2            = new CUDAStream<float>(  paddedNumberOfAtoms, (amoebaGpu->outputBuffers), "AmoebaField_1_2");
    amoebaGpu->amoebaSim.pWorkArray_1_2   = amoebaGpu->psWorkArray_1_2->_pDevStream[0];
-    amoebaGpu->psEnergy                   = new CUDAStream<float>(amoebaGpu->energyOutputBuffers, 1, "AmoebaEnergy");
    return;
 }
@@ -2928,87 +2932,23 @@ int amoebaGpuBuildThreadBlockWorkList( amoebaGpuContext amoebaGpu )
    CUDAStream<unsigned int>* psWorkUnit       = new CUDAStream<unsigned int>(cells, 1u, "WorkUnit");
    unsigned int* pWorkList                    = psWorkUnit->_pSysData;
    amoebaGpu->psWorkUnit                      = psWorkUnit;
+    memset( amoebaGpu->psWorkUnit->_pSysStream[0], 0, cells*sizeof( unsigned int) );
    CUDAStream<unsigned int>* psVdwWorkUnit    = new CUDAStream<unsigned int>(cells, 1u, "VdwWorkUnit");
    unsigned int* pVdwWorkList                 = psVdwWorkUnit->_pSysData;
+    amoebaGpu->amoebaSim.pVdwWorkUnit          = psVdwWorkUnit->_pDevStream[0];
    amoebaGpu->psVdwWorkUnit                   = psVdwWorkUnit;
+    memset( amoebaGpu->psVdwWorkUnit->_pSysStream[0], 0, cells*sizeof( unsigned int) );
-/*
-    CUDAStream<unsigned int>* psInteractingWorkUnit  = new CUDAStream<unsigned int>(cells, 1u, "InteractingWorkUnit");
-    amoebaGpu->psInteractingWorkUnit                 = psInteractingWorkUnit;
-    amoebaGpu->workUnits                             = cells;
-    CUDAStream<unsigned int>* psInteractionFlag = new CUDAStream<unsigned int>(cells, 1u, "InteractionFlag");
-    amoebaGpu->psInteractionFlag = psInteractionFlag;
-    amoebaGpu->sim.pInteractionFlag = psInteractionFlag->_pDevStream[0];
-    CUDAStream<size_t>* psInteractionCount = new CUDAStream<size_t>(1, 1u, "InteractionCount");
-    amoebaGpu->psInteractionCount = psInteractionCount;
-    amoebaGpu->sim.pInteractionCount = psInteractionCount->_pDevStream[0];
-    CUDAStream<float4>* psGridBoundingBox = new CUDAStream<float4>(dim, 1u, "GridBoundingBox");
-    amoebaGpu->psGridBoundingBox = psGridBoundingBox;
-    amoebaGpu->sim.pGridBoundingBox = psGridBoundingBox->_pDevStream[0];
-    CUDAStream<float4>* psGridCenter = new CUDAStream<float4>(dim, 1u, "GridCenter");
-    amoebaGpu->psGridCenter = psGridCenter;
-    amoebaGpu->sim.pGridCenter = psGridCenter->_pDevStream[0];
-    amoebaGpu->sim.nonbond_workBlock      = amoebaGpu->sim.nonbondThreadsPerBlock / GRID;
-    amoebaGpu->sim.bornForce2_workBlock   = amoebaGpu->sim.bornForce2_threads_per_block / GRID;
-    amoebaGpu->sim.workUnits = cells;
-*/
-    // Initialize the plan for doing stream compaction.
-    // planCompaction(amoebaGpu->compactPlan);
-    // Increase block count if necessary for extra large molecules that would
-    // otherwise overflow the SM workunit buffers
-//    int minimumBlocks = (cells + amoebaGpu->sim.workUnitsPerSM - 1) / amoebaGpu->sim.workUnitsPerSM;
-//    if ((int) amoebaGpu->sim.nonbond_blocks < minimumBlocks)
-//    {
-//        amoebaGpu->sim.nonbond_blocks = amoebaGpu->sim.nonbond_blocks * ((minimumBlocks + amoebaGpu->sim.nonbond_blocks - 1) / amoebaGpu->sim.nonbond_blocks);
-//    }
-//    if ((int) amoebaGpu->sim.bornForce2_blocks < minimumBlocks)
-//    {
-//        amoebaGpu->sim.bornForce2_blocks = amoebaGpu->sim.bornForce2_blocks * ((minimumBlocks + amoebaGpu->sim.bornForce2_blocks - 1) / amoebaGpu->sim.bornForce2_blocks);
-//    }
-/*
-    amoebaGpu->sim.nbWorkUnitsPerBlock            = cells / amoebaGpu->sim.nonbond_blocks;
-    amoebaGpu->sim.nbWorkUnitsPerBlockRemainder   = cells - amoebaGpu->sim.nonbond_blocks * amoebaGpu->sim.nbWorkUnitsPerBlock;
-    amoebaGpu->sim.interaction_threads_per_block  = 64;
-    amoebaGpu->sim.interaction_blocks = (amoebaGpu->workUnits + amoebaGpu->sim.interaction_threads_per_block - 1) / amoebaGpu->sim.interaction_threads_per_block;
-    if (amoebaGpu->sim.interaction_blocks > 8*amoebaGpu->sim.blocks)
-        amoebaGpu->sim.interaction_blocks = 8*amoebaGpu->sim.blocks;
-    if (activeWorkUnits > (int) cells)
-    {
-        int balancedWorkBlock                   = (cells + amoebaGpu->sim.nonbond_blocks - 1) / amoebaGpu->sim.nonbond_blocks;
-        amoebaGpu->sim.nonbondThreadsPerBlock      = balancedWorkBlock * GRID;
-        amoebaGpu->sim.nonbond_workBlock              = balancedWorkBlock;
-    }
-    activeWorkUnits = amoebaGpu->sim.bornForce2_blocks * amoebaGpu->sim.bornForce2_workBlock;
-    if (activeWorkUnits > (int) cells)
-    {
-        int balancedWorkBlock                   = (cells + amoebaGpu->sim.bornForce2_blocks - 1) / amoebaGpu->sim.bornForce2_blocks;
-        amoebaGpu->sim.bornForce2_threads_per_block   = balancedWorkBlock * GRID;
-        amoebaGpu->sim.bornForce2_workBlock           = balancedWorkBlock;
-    }
-*/
    unsigned int count = 0;
    for (unsigned int y = 0; y < dim; y++)
    {
-        for (unsigned int x = y; x < dim; x++)
+        for (unsigned int x = y; x < dim; x++, count++)
        {
            pWorkList[count]      = encodeCell( x, y );
-            pVdwWorkList[count] = encodeCell( x, y );
+            pVdwWorkList[count]   =  pWorkList[count];
-            count++;
        }
    }
-    //(*amoebaGpu->psInteractionCount)[0] = amoebaGpu->workUnits;
-    //amoebaGpu->psInteractionCount->Upload();
    psWorkUnit->Upload();
    psVdwWorkUnit->Upload();
@@ -3100,8 +3040,8 @@ void amoebaGpuBuildScalingList( amoebaGpuContext amoebaGpu )
    CUDAStream<int>* psScalingIndicesIndex          = new CUDAStream<int>(cells, 1u, "ScalingIndicesIndex");
    amoebaGpu->psScalingIndicesIndex                = psScalingIndicesIndex;
    amoebaGpu->amoebaSim.pScaleIndicesIndex         = psScalingIndicesIndex->_pDevStream[0];
+    memset( amoebaGpu->psScalingIndicesIndex->_pSysStream[0], 0, cells*sizeof(unsigned int) );
-    memset( amoebaGpu->psScalingIndicesIndex->_pSysStream[0], 0, sizeof( cells)*sizeof( unsigned int) );
    int numWithScalingIndices                       = 0;
    int gridOffset                                  = grid - 1;
    int lastBlock                                   = (static_cast<int>(paddedAtoms) > amoebaGpu->gpuContext->natoms) ? (amoebaGpu->gpuContext->natoms)/grid : -1;
@@ -4308,19 +4248,17 @@ void trackMutualInducedIterations( amoebaGpuContext amoebaGpu, int iteration){
 void gpuCopyWorkUnit( amoebaGpuContext amoebaGpu ){
 // ---------------------------------------------------------------------------------------
+/*
    gpuContext gpu = amoebaGpu->gpuContext;
    gpu->psInteractingWorkUnit->Download();
    gpu->psWorkUnit->Download();
    amoebaGpu->psWorkUnit->Download();
-    (void) fprintf( amoebaGpu->log, "gpuCopyInteractingWorkUnit called -- to be removed.\n" );
    for( unsigned int ii = 0; ii < gpu->psInteractingWorkUnit->_length; ii++ ){
-        //gpu->psInteractingWorkUnit->_pSysStream[0][ii] = amoebaGpu->psWorkUnit->_pSysStream[0][ii];
        gpu->psWorkUnit->_pSysStream[0][ii]            = amoebaGpu->psWorkUnit->_pSysStream[0][ii];
    }    
    gpu->psInteractingWorkUnit->Upload();
    gpu->psWorkUnit->Upload();
+*/
 // ---------------------------------------------------------------------------------------

--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
@@ -57,7 +57,7 @@ extern void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool per
 extern void SetCalculateAmoebaCudaVdw14_7Sim(amoebaGpuContext gpu);
 extern void GetCalculateAmoebaCudaVdw14_7Sim(amoebaGpuContext gpu);
-extern void kCalculateAmoebaVdw14_7Forces(amoebaGpuContext amoebaGpu );
+extern void kCalculateAmoebaVdw14_7Forces(amoebaGpuContext amoebaGpu, int applyCutoff );
 // wca dispersion

--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaTypes.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaTypes.h
@@ -146,8 +146,11 @@ struct cudaAmoebaGmxSimulation {
    float* pWorkArray_1_1;
    float* pWorkArray_1_2;
+    int vdwUsePBC;
+    float vdwCutoff2;
    unsigned int amoebaVdwNonReductions;
    int* pAmoebaVdwNonReductionID;
+    unsigned int* pVdwWorkUnit;
    unsigned int amoebaVdwReductions;
    int4* pAmoebaVdwReductionID;

--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
@@ -136,8 +136,6 @@ struct _amoebaGpuContext {
    // scaling-related parameters
-    float pGamma;
-    float scalingDistanceCutoff;
    CUDAStream<float2>*  psDampingFactorAndThole;
    // slated for removal -- no longer used
@@ -177,7 +175,6 @@ struct _amoebaGpuContext {
    CUDAStream<float>*  psForce; 
    CUDAStream<float>*  psTorque; 
-    CUDAStream<float>*  psEnergy; 
    CUDAStream<float>*  torqueMapForce; 
    int maxMapTorqueDifference; 
    int maxMapTorqueDifferencePow2;
@@ -198,9 +195,6 @@ struct _amoebaGpuContext {
    CUDAStream<float2>*  psVdwSigmaEpsilon;
-    unsigned int useVdwTable;
-    unsigned int vdwTableSize;
-    CUDAStream<float2>*  psVdwTable; 
    CUDAStream<int>*     psAmoebaVdwNonReductionID; 
    CUDAStream<int4>*    psAmoebaVdwReductionID; 
    CUDAStream<float>*   psAmoebaVdwReduction; 
@@ -322,7 +316,7 @@ void gpuSetAmoebaVdwParameters( amoebaGpuContext amoebaGpu,
                                const std::vector<float>& reductions,
                                const std::string& sigmaCombiningRule,
                                const std::string& epsilonCombiningRule,
-                                const std::vector< std::vector<int> >& allExclusions );
+                                const std::vector< std::vector<int> >& allExclusions, int usePBC, float cutoff );
 extern "C"
 void gpuSetAmoebaPMEParameters(amoebaGpuContext amoebaGpu, float alpha, int gridSizeX, int gridSizeY, int gridSizeZ);

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
@@ -766,11 +766,8 @@ void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
 #ifdef AMOEBA_DEBUG
    if( amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log, "%s %d maxCovalentDegreeSz=%d"
+        (void) fprintf( amoebaGpu->log, "%s %d maxCovalentDegreeSz=%d ZZZ\n",
-                        " gamma=%.3e scalingDistanceCutoff=%.3f ZZZ\n",
+                        methodName, gpu->natoms, amoebaGpu->maxCovalentDegreeSz );
-                        methodName, gpu->natoms,
-                        amoebaGpu->maxCovalentDegreeSz, amoebaGpu->pGamma,
-                        amoebaGpu->scalingDistanceCutoff );
    }   
   int paddedNumberOfAtoms                    = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
    CUDAStream<float4>* debugArray            = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwood.cu
@@ -1861,10 +1861,8 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
 #ifdef AMOEBA_DEBUG
    if( amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log, "%s %d maxCovalentDegreeSz=%d"
+        (void) fprintf( amoebaGpu->log, "%s %d maxCovalentDegreeSz=%d ZZZ\n",
-                        " gamma=%.3e scalingDistanceCutoff=%.3f ZZZ\n",
+                        methodName, gpu->natoms, amoebaGpu->maxCovalentDegreeSz );
-                        methodName, gpu->natoms,
-                        amoebaGpu->maxCovalentDegreeSz, amoebaGpu->pGamma,
                        amoebaGpu->scalingDistanceCutoff );
    }
   int paddedNumberOfAtoms                    = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaKirkwoodEDiff.cu
@@ -1035,11 +1035,8 @@ void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu )
 #ifdef AMOEBA_DEBUG
    if( amoebaGpu->log ){
-        (void) fprintf( amoebaGpu->log, "%s %d maxCovalentDegreeSz=%d"
+        (void) fprintf( amoebaGpu->log, "%s %d maxCovalentDegreeSz=%d ZZZ\n",
-                        " gamma=%.3e scalingDistanceCutoff=%.3f ZZZ\n",
+                        methodName, gpu->natoms, amoebaGpu->maxCovalentDegreeSz );
-                        methodName, gpu->natoms,
-                        amoebaGpu->maxCovalentDegreeSz, amoebaGpu->pGamma,
-                        amoebaGpu->scalingDistanceCutoff );
        (void) fflush( amoebaGpu->log );
    }   
    int paddedNumberOfAtoms                   = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeDirectElectrostatic.cu
@@ -1028,11 +1028,11 @@ __device__ void loadPmeDirectElectrostaticShared( struct PmeDirectElectrostaticP
 // Include versions of the kernels for N^2 calculations.
 #undef USE_OUTPUT_BUFFER_PER_WARP
-#define METHOD_NAME(a, b) a##N2##b
+#define METHOD_NAME(a, b) a##Cutoff##b
 #include "kCalculateAmoebaCudaPmeDirectElectrostatic.h"
 #define USE_OUTPUT_BUFFER_PER_WARP
 #undef METHOD_NAME
-#define METHOD_NAME(a, b) a##N2ByWarp##b
+#define METHOD_NAME(a, b) a##CutoffByWarp##b
 #include "kCalculateAmoebaCudaPmeDirectElectrostatic.h"
 // reduce psWorkArray_3_1 -> force
@@ -1088,11 +1088,8 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
 #ifdef AMOEBA_DEBUG
    if( amoebaGpu->log ){
-      (void) fprintf( amoebaGpu->log, "%s %d maxCovalentDegreeSz=%d"
+      (void) fprintf( amoebaGpu->log, "%s %d maxCovalentDegreeSz=%d ZZZ\n",
-                      " gamma=%.3e scalingDistanceCutoff=%.3f ZZZ\n",
+                      methodName, gpu->natoms, amoebaGpu->maxCovalentDegreeSz );
-                      methodName, gpu->natoms,
-                      amoebaGpu->maxCovalentDegreeSz, amoebaGpu->pGamma,
-                      amoebaGpu->scalingDistanceCutoff );
    }
    int paddedNumberOfAtoms                    = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
    CUDAStream<float4>* debugArray            = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
@@ -1142,26 +1139,26 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
            maxThreads = 128;
        else
            maxThreads = 64;
-        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(PmeDirectElectrostaticParticle)+sizeof(float3)), maxThreads);
+        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(PmeDirectElectrostaticParticle)), maxThreads);
    }
    kClearFields_3( amoebaGpu, 2 );
 #ifdef AMOEBA_DEBUG
-    (void) fprintf( amoebaGpu->log, "kCalculateAmoebaPmeDirectElectrostaticN2Forces:  threadsPerBlock=%u getThreadsPerBlock=%d sizeof=%u\n",
+    (void) fprintf( amoebaGpu->log, "kCalculateAmoebaPmeDirectElectrostaticCutoffForces:  threadsPerBlock=%u getThreadsPerBlock=%d sizeof=%u\n", 
-                    threadsPerBlock, getThreadsPerBlock(amoebaGpu, sizeof(PmeDirectElectrostaticParticle)+sizeof(float3)),
+                    threadsPerBlock, getThreadsPerBlock(amoebaGpu, sizeof(PmeDirectElectrostaticParticle)),
-                    (sizeof(PmeDirectElectrostaticParticle)+sizeof(float3)) );
+                    sizeof(PmeDirectElectrostaticParticle) );
-      (void) fprintf( amoebaGpu->log, "kCalculateAmoebaPmeDirectElectrostaticN2Forces no warp:  numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Obuf=%u ixnCt=%u workUnits=%u gpu->nonbond_threads_per_block=%u\n",
+      (void) fprintf( amoebaGpu->log, "kCalculateAmoebaPmeDirectElectrostaticCutoffForces no warp:  numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Obuf=%u ixnCt=%u workUnits=%u gpu->nonbond_threads_per_block=%u\n",
                      amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
-                      sizeof(PmeDirectElectrostaticParticle)+sizeof(float3), (sizeof(PmeDirectElectrostaticParticle)+sizeof(float3))*threadsPerBlock, amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits,
+                      sizeof(PmeDirectElectrostaticParticle), (sizeof(PmeDirectElectrostaticParticle))*threadsPerBlock,
-                      gpu->sim.nonbond_threads_per_block );
+                      amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits, gpu->sim.nonbond_threads_per_block );
      (void) fflush( amoebaGpu->log );
 #endif
    if (gpu->bOutputBufferPerWarp){
-      kCalculateAmoebaPmeDirectElectrostaticN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, (sizeof(PmeDirectElectrostaticParticle)+sizeof(float3))*threadsPerBlock>>>(
+      kCalculateAmoebaPmeDirectElectrostaticCutoffByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock>>>(
                                                                         gpu->sim.pInteractingWorkUnit,
                                                                         amoebaGpu->psWorkArray_3_1->_pDevStream[0],
 #ifdef AMOEBA_DEBUG
@@ -1176,7 +1173,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
 //                                                                         gpu->sim.pInteractingWorkUnit,
 //                                                                         amoebaGpu->psWorkUnit->_pDevStream[0],
-      kCalculateAmoebaPmeDirectElectrostaticN2Forces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, (sizeof(PmeDirectElectrostaticParticle)+sizeof(float3))*threadsPerBlock>>>(
+      kCalculateAmoebaPmeDirectElectrostaticCutoffForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock>>>(
                                                                         gpu->sim.pInteractingWorkUnit,
                                                                         amoebaGpu->psWorkArray_3_1->_pDevStream[0],
 #ifdef AMOEBA_DEBUG
@@ -1186,7 +1183,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
                                                                         amoebaGpu->psWorkArray_3_2->_pDevStream[0] );
 #endif
    }
-    LAUNCHERROR("kCalculateAmoebaPmeDirectElectrostaticN2Forces");
+    LAUNCHERROR("kCalculateAmoebaPmeDirectElectrostaticCutoffForces");
    kReduceForceTorque( amoebaGpu );
 #ifdef AMOEBA_DEBUG

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeFixedEField.cu
@@ -373,11 +373,11 @@ __device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle&
 // Include versions of the kernels for N^2 calculations.
-#define METHOD_NAME(a, b) a##N2##b
+#define METHOD_NAME(a, b) a##Cutoff##b
 #include "kCalculateAmoebaCudaPmeFixedEField.h"
 #define USE_OUTPUT_BUFFER_PER_WARP
 #undef METHOD_NAME
-#define METHOD_NAME(a, b) a##N2ByWarp##b
+#define METHOD_NAME(a, b) a##CutoffByWarp##b
 #include "kCalculateAmoebaCudaPmeFixedEField.h"
 /**---------------------------------------------------------------------------------------
@@ -443,7 +443,7 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
    }    
    if (gpu->bOutputBufferPerWarp){
-        kCalculateAmoebaPmeDirectFixedE_FieldN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
+        kCalculateAmoebaPmeDirectFixedE_FieldCutoffByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
                                                                           gpu->sim.pInteractingWorkUnit,
                                                                           amoebaGpu->psWorkArray_3_1->_pDevStream[0],
 #ifdef AMOEBA_DEBUG
@@ -453,7 +453,7 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
                                                                           amoebaGpu->psWorkArray_3_2->_pDevStream[0] );
 #endif
    } else {
-        kCalculateAmoebaPmeDirectFixedE_FieldN2_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
+        kCalculateAmoebaPmeDirectFixedE_FieldCutoff_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
                                                                           gpu->sim.pInteractingWorkUnit,
                                                                           amoebaGpu->psWorkArray_3_1->_pDevStream[0],
 #ifdef AMOEBA_DEBUG
@@ -473,7 +473,7 @@ static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
        (void) fprintf( amoebaGpu->log, "cudaComputeAmoebaPmeDirectFixedEField:  threadsPerBlock=%u getThreadsPerBlock=%d sizeof=%u shrd=%u\n", 
                        threadsPerBlock, getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle)+sizeof(float3)),
                        (sizeof(FixedFieldParticle)+sizeof(float3)), (sizeof(FixedFieldParticle)+sizeof(float3))*threadsPerBlock );
-        (void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u warp=%d\n",
+        (void) fprintf( amoebaGpu->log, "AmoebaCutoffForces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u warp=%d\n",
                        amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
                        sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->energyOutputBuffers, 
                        (*gpu->psInteractionCount)[0], gpu->sim.workUnits, gpu->bOutputBufferPerWarp );

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPmeMutualInducedField.cu
@@ -218,11 +218,11 @@ __device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInduce
 // Include versions of the kernels for N^2 calculations.
-#define METHOD_NAME(a, b) a##N2##b
+#define METHOD_NAME(a, b) a##Cutoff##b
 #include "kCalculateAmoebaCudaPmeMutualInducedField.h"
 #define USE_OUTPUT_BUFFER_PER_WARP
 #undef METHOD_NAME
-#define METHOD_NAME(a, b) a##N2ByWarp##b
+#define METHOD_NAME(a, b) a##CutoffByWarp##b
 #include "kCalculateAmoebaCudaPmeMutualInducedField.h"
 __global__ 
@@ -413,7 +413,7 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
    if (gpu->bOutputBufferPerWarp){
                                                                 //gpu->sim.pInteractingWorkUnit,
                                                                 //amoebaGpu->psWorkUnit->_pDevStream[0],
-        kCalculateAmoebaPmeMutualInducedFieldN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
+        kCalculateAmoebaPmeMutualInducedFieldCutoffByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
                                                                 gpu->sim.pInteractingWorkUnit,
                                                                 amoebaGpu->psWorkArray_3_1->_pDevStream[0],
 #ifdef AMOEBA_DEBUG
@@ -426,14 +426,14 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
    } else {
 #ifdef AMOEBA_DEBUG
-        (void) fprintf( amoebaGpu->log, "N2 no warp\n" );
+        (void) fprintf( amoebaGpu->log, "Cutoff no warp\n" );
-        (void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n",
+        (void) fprintf( amoebaGpu->log, "AmoebaCutoffForces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n",
                        amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
                        sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*threadsPerBlock,
                        amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
        (void) fflush( amoebaGpu->log );
 #endif
-        kCalculateAmoebaPmeMutualInducedFieldN2_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
+        kCalculateAmoebaPmeMutualInducedFieldCutoff_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
                                                                 gpu->sim.pInteractingWorkUnit,
                                                                 amoebaGpu->psWorkArray_3_1->_pDevStream[0],
 #ifdef AMOEBA_DEBUG

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaVdw14_7.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaVdw14_7.cu
@@ -4,6 +4,7 @@
 #include "amoebaGpuTypes.h"
 #include "amoebaCudaKernels.h"
+#include "cudaKernels.h"
 #include "kCalculateAmoebaCudaUtilities.h"
 #include "kCalculateAmoebaCudaVdwParticle.h"
 #include "amoebaScaleFactors.h"
@@ -62,22 +63,6 @@ __device__ void loadVdw14_7Shared( struct Vdw14_7Particle* sA, unsigned int atom
 }
-// load struct and arrays w/ shared data in sA
-__device__ void loadVdw14_7Data( struct Vdw14_7Particle* sA,
-                                 float4* jCoord, float* jSigma, float* jEpsilon )
-{
-    // load coordinates, sigma, epsilon
-    jCoord->x               = sA->x;
-    jCoord->y               = sA->y;
-    jCoord->z               = sA->z;
-    *jSigma                 = sA->sigma;
-    *jEpsilon               = sA->epsilon;
-}
 __device__ void getVdw14_7CombindedSigmaEpsilon_kernel( int sigmaCombiningRule, float iSigma, float jSigma, float* combindedSigma,
                                                        int epsilonCombiningRule, float iEpsilon, float jEpsilon, float* combindedEpsilon )
 {
@@ -106,15 +91,12 @@ __device__ void getVdw14_7CombindedSigmaEpsilon_kernel( int sigmaCombiningRule,
 }
-__device__ void calculateVdw14_7PairIxn_kernel( float4 atomCoordinatesI, float4 atomCoordinatesJ,
+__device__ void calculateVdw14_7PairIxn_kernel( float combindedSigma,    float combindedEpsilon,
-                                                float combindedSigma,    float combindedEpsilon,
                                                float force[3], float* energy
 #ifdef AMOEBA_DEBUG
               , float4* debugArray
 #endif
+)
- )
 {
    const float deltaHalM1 = 0.07f;
@@ -124,15 +106,15 @@ __device__ void calculateVdw14_7PairIxn_kernel( float4 atomCoordinatesI, float4
    // ---------------------------------------------------------------------------------------
-    // get deltaR, and r between 2 atoms
+    // on input force[i] is assummed to contain delta[i] for coordinates of atom I and J 
-    force[0]                                     = atomCoordinatesJ.x - atomCoordinatesI.x;
-    force[1]                                     = atomCoordinatesJ.y - atomCoordinatesI.y;
-    force[2]                                     = atomCoordinatesJ.z - atomCoordinatesI.z;
-    float rI                                     =  rsqrtf( force[0]*force[0] + force[1]*force[1] + force[2]*force[2] );
+    float r2                                     = force[0]*force[0] + force[1]*force[1] + force[2]*force[2];
+    if( r2 > cAmoebaSim.vdwCutoff2 ){
+        *energy = force[0] = force[1] = force[2] = 0.0f;
+        return;
+    }
+    float rI                                     =  rsqrtf( r2 );
    float r                                      =  1.0f/rI;
-    float r2                                     =  r*r;
    float r6                                     =  r2*r2*r2;
    float r7                                     =  r6*r;
@@ -166,6 +148,7 @@ __device__ void calculateVdw14_7PairIxn_kernel( float4 atomCoordinatesI, float4
    debugArray[1].x                              = tau;
    debugArray[1].y                              = rho;
    debugArray[1].z                              = gTau;
+    debugArray[1].w                              = r;
 #endif
 }
@@ -424,6 +407,21 @@ static void kCalculateAmoebaVdw14_7NonReduction(amoebaGpuContext amoebaGpu, CUDA
 #undef METHOD_NAME
 #define METHOD_NAME(a, b) a##N2ByWarp##b
 #include "kCalculateAmoebaCudaVdw14_7.h"
+#undef METHOD_NAME
+#undef USE_OUTPUT_BUFFER_PER_WARP 
+#define USE_CUTOFF
+#define METHOD_NAME(a, b) a##Cutoff##b
+#include "kCalculateAmoebaCudaVdw14_7.h"
+#undef METHOD_NAME
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##CutoffByWarp##b
+#include "kCalculateAmoebaCudaVdw14_7.h"
+#undef METHOD_NAME
+#undef USE_OUTPUT_BUFFER_PER_WARP 
+#undef USE_CUTOFF
 // reduce psWorkArray_3_1 -> outputArray
@@ -471,7 +469,7 @@ void kCalculateAmoebaVdw14_7CopyCoordinates( amoebaGpuContext amoebaGpu, CUDAStr
   --------------------------------------------------------------------------------------- */
-void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu )
+void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu, int applyCutoff )
 {
   // ---------------------------------------------------------------------------------------
@@ -492,52 +490,122 @@ void kCalculateAmoebaVdw14_7Forces( amoebaGpuContext amoebaGpu )
    CUDAStream<float4>* debugArray             = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
    memset( debugArray->_pSysStream[0],      0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
    debugArray->Upload();
-    int targetAtom                             = 21;
+    int targetAtom                             = 342;
 #endif
-    // clear output arrays
-    kClearFields_3( amoebaGpu, 1 );
    // set threads/block first time through
+    // on first pass, set threads/block
    if( threadsPerBlock == 0 ){ 
-        threadsPerBlock = getThreadsPerBlock( amoebaGpu, sizeof(Vdw14_7Particle));
+        unsigned int maxThreads;
-threadsPerBlock = 192;
+        if (gpu->sm_version >= SM_20)
+            maxThreads = 384; 
+        else if (gpu->sm_version >= SM_12)
+            maxThreads = 192; 
+        else
+            maxThreads = 128;
+        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(Vdw14_7Particle)), maxThreads);
    }    
    kCalculateAmoebaVdw14_7CopyCoordinates( amoebaGpu, gpu->psPosq4, amoebaGpu->psAmoebaVdwCoordinates );
    kCalculateAmoebaVdw14_7CoordinateReduction( amoebaGpu, amoebaGpu->psAmoebaVdwCoordinates, amoebaGpu->psAmoebaVdwCoordinates );
-    if (gpu->bOutputBufferPerWarp){
+#ifdef AMOEBA_DEBUG
-#if 0
+    (void) fprintf( amoebaGpu->log, "Apply cutoff=%d warp=%d\n", applyCutoff, gpu->bOutputBufferPerWarp );
-        (void) fprintf( amoebaGpu->log, "N2 warp\n" ); (void) fflush( amoebaGpu->log );
+    (void) fprintf( amoebaGpu->log, "numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n",
+                    amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
+                    sizeof(Vdw14_7Particle), sizeof(Vdw14_7Particle)*threadsPerBlock,
+                    amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
+    (void) fflush( amoebaGpu->log );
+#endif
+    // clear output arrays
+    kClearFields_3( amoebaGpu, 1 );
-        kCalculateAmoebaVdw14_7N2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(Vdw14_7Particle)*amoebaGpu->nonbondThreadsPerBlock>>>(
+    if( applyCutoff ){
+        kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
+        LAUNCHERROR("kFindBlockBoundsPeriodic");
+        kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
+        LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
+        compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, amoebaGpu->amoebaSim.pVdwWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
+        kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                    sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+        LAUNCHERROR("kFindInteractionsWithinBlocksPeriodic");
+if( 0 ){  
+    gpu->psInteractionCount->Download();
+    gpu->psInteractingWorkUnit->Download();
+    gpu->psInteractionFlag->Download();
+    amoebaGpu->psVdwWorkUnit->Download();
+    (void) fprintf( amoebaGpu->log, "Vdw Ixn count=%u\n", gpu->psInteractionCount->_pSysStream[0][0] );
+    for( unsigned int ii = 0; ii < gpu->psInteractingWorkUnit->_length; ii++ ){
+        unsigned int x          = gpu->psInteractingWorkUnit->_pSysStream[0][ii];
+        unsigned int y          = ((x >> 2) & 0x7fff) << GRIDBITS;
+        unsigned int exclusions = (x & 0x1);
+                     x          = (x >> 17) << GRIDBITS;
+        (void) fprintf( amoebaGpu->log, "GpuCell %8u  %8u [%5u %5u %1u] %10u ", ii, gpu->psInteractingWorkUnit->_pSysStream[0][ii], x,y,exclusions, gpu->psInteractionFlag->_pSysStream[0][ii] );
+                     x          = amoebaGpu->psVdwWorkUnit->_pSysStream[0][ii];
+                     y          = ((x >> 2) & 0x7fff) << GRIDBITS;
+                     exclusions = (x & 0x1);
+                     x          = (x >> 17) << GRIDBITS;
+        (void) fprintf( amoebaGpu->log, "   AmGpu %8u [%5u %5u %1u]\n", amoebaGpu->psWorkUnit->_pSysStream[0][ii], x,y,exclusions );
+    }    
+    (void) fflush( amoebaGpu->log );
+}
-                                                                 amoebaGpu->psWorkUnit->_pDevStream[0],
+        if (gpu->bOutputBufferPerWarp){
+            kCalculateAmoebaVdw14_7CutoffByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>(
+                                                                gpu->sim.pInteractingWorkUnit,
                                                                amoebaGpu->psAmoebaVdwCoordinates->_pDevStream[0],
-                                                                 amoebaGpu->psInducedDipole->_pDevStream[0],
+                                                                amoebaGpu->psVdwSigmaEpsilon->_pDevStream[0],
-                                                                 amoebaGpu->psInducedDipolePolar->_pDevStream[0],
+                                                                amoebaGpu->vdwSigmaCombiningRule,
-                                                                 amoebaGpu->psWorkArray_3_1->_pDevStream[0],
+                                                                amoebaGpu->vdwEpsilonCombiningRule,
 #ifdef AMOEBA_DEBUG
-                                                                 amoebaGpu->psWorkArray_3_2->_pDevStream[0],
+                                                                amoebaGpu->psWorkArray_3_1->_pDevStream[0],
                                                                debugArray->_pDevStream[0], targetAtom );
 #else
-                                                                 amoebaGpu->psWorkArray_3_2->_pDevStream[0] );
+                                                                amoebaGpu->psWorkArray_3_1->_pDevStream[0] );
 #endif
+        } else {
+            kCalculateAmoebaVdw14_7Cutoff_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>(
+                                                                gpu->sim.pInteractingWorkUnit,
+                                                                amoebaGpu->psAmoebaVdwCoordinates->_pDevStream[0],
+                                                                amoebaGpu->psVdwSigmaEpsilon->_pDevStream[0],
+                                                                amoebaGpu->vdwSigmaCombiningRule,
+                                                                amoebaGpu->vdwEpsilonCombiningRule,
+#ifdef AMOEBA_DEBUG
+                                                                amoebaGpu->psWorkArray_3_1->_pDevStream[0],
+                                                                debugArray->_pDevStream[0], targetAtom );
+#else
+                                                                amoebaGpu->psWorkArray_3_1->_pDevStream[0] );
 #endif
+        }
+        LAUNCHERROR("kCalculateAmoebaVdw14_7Cutoff");  
    } else {
+        if (gpu->bOutputBufferPerWarp){
+                                                                //amoebaGpu->psVdwWorkUnit->_pDevStream[0],
+            kCalculateAmoebaVdw14_7N2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>(
+                                                                gpu->sim.pInteractingWorkUnit,
+                                                                amoebaGpu->psAmoebaVdwCoordinates->_pDevStream[0],
+                                                                amoebaGpu->psVdwSigmaEpsilon->_pDevStream[0],
+                                                                amoebaGpu->vdwSigmaCombiningRule,
+                                                                amoebaGpu->vdwEpsilonCombiningRule,
 #ifdef AMOEBA_DEBUG
-        (void) fprintf( amoebaGpu->log, "N2 no warp\n" );
+                                                                amoebaGpu->psWorkArray_3_1->_pDevStream[0],
-        (void) fprintf( amoebaGpu->log, "numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n",
+                                                                debugArray->_pDevStream[0], targetAtom );
-                        amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
+#else
-                        sizeof(Vdw14_7Particle), sizeof(Vdw14_7Particle)*threadsPerBlock,
+                                                                amoebaGpu->psWorkArray_3_1->_pDevStream[0] );
-                        amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
-        (void) fflush( amoebaGpu->log );
 #endif
+        } else {
            kCalculateAmoebaVdw14_7N2_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(Vdw14_7Particle)*threadsPerBlock>>>(
                                                                amoebaGpu->psVdwWorkUnit->_pDevStream[0],
                                                                amoebaGpu->psAmoebaVdwCoordinates->_pDevStream[0],
@@ -552,8 +620,8 @@ threadsPerBlock = 192;
 #endif
        }
+        LAUNCHERROR("kCalculateAmoebaVdw14_7N2");  
-    LAUNCHERROR("kCalculateAmoebaVdw14_7");  
+    }
 #ifdef AMOEBA_DEBUG
    if( amoebaGpu->log ){

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaVdw14_7.h
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaVdw14_7.h
@@ -53,9 +53,8 @@ void METHOD_NAME(kCalculateAmoebaVdw14_7, _kernel)(
    unsigned int end             = (warp+1)*numWorkUnits/totalWarps;
    unsigned int lasty           = 0xFFFFFFFF;
-    float4 jCoord;
+    int exclusionIndex;
-    float jSigma;
+    int exclusionMask;
-    float jEpsilon;
    float totalEnergy            = 0.0f;
 #ifdef AMOEBA_DEBUG
@@ -77,10 +76,10 @@ void METHOD_NAME(kCalculateAmoebaVdw14_7, _kernel)(
        unsigned int tj                  = tgx;
        Vdw14_7Particle* psA             = &sA[tbx];
+        Vdw14_7Particle  localParticle;
        unsigned int atomI               = x + tgx;
-        float4 iCoord                    = atomCoord[atomI];
+        loadVdw14_7Shared( &localParticle, atomI, atomCoord, vdwParameters );
-        float iSigma                     = vdwParameters[atomI].x;
-        float iEpsilon                   = vdwParameters[atomI].y;
        float forceSum[3];
@@ -94,10 +93,12 @@ void METHOD_NAME(kCalculateAmoebaVdw14_7, _kernel)(
        if (x == y) 
        {
+            if( bExclusionFlag ){
                unsigned int xi              = x >> GRIDBITS;
                unsigned int cell            = xi + xi*cAmoebaSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
-            int exclusionIndex           = cAmoebaSim.pVdwExclusionIndicesIndex[cell]+tgx;
+                exclusionIndex               = cAmoebaSim.pVdwExclusionIndicesIndex[cell]+tgx;
-            int exclusionMask            = cAmoebaSim.pVdwExclusionIndices[exclusionIndex];
+                exclusionMask                = cAmoebaSim.pVdwExclusionIndices[exclusionIndex];
+            }
            // load shared data
@@ -108,30 +109,38 @@ void METHOD_NAME(kCalculateAmoebaVdw14_7, _kernel)(
                float ijForce[3];
-                // load coords, charge, ...
-                loadVdw14_7Data( &(psA[j]), &jCoord, &jSigma, &jEpsilon ); 
                // get combined sigma and epsilon
                float combindedSigma;
                float combindedEpsilon;
-                getVdw14_7CombindedSigmaEpsilon_kernel( sigmaCombiningRule,   iSigma,   jSigma,   &combindedSigma,
+                getVdw14_7CombindedSigmaEpsilon_kernel( sigmaCombiningRule,   localParticle.sigma,   psA[j].sigma,   &combindedSigma,
-                                                        epsilonCombiningRule, iEpsilon, jEpsilon, &combindedEpsilon );
+                                                        epsilonCombiningRule, localParticle.epsilon, psA[j].epsilon, &combindedEpsilon );
                // calculate force
+                ijForce[0]    = psA[j].x - localParticle.x;
+                ijForce[1]    = psA[j].y - localParticle.y;
+                ijForce[2]    = psA[j].z - localParticle.z;
+                if( cAmoebaSim.vdwUsePBC )
+                {
+                    ijForce[0]   -= floor(ijForce[0]*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                    ijForce[1]   -= floor(ijForce[1]*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                    ijForce[2]   -= floor(ijForce[2]*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+                }
                float energy;
-                calculateVdw14_7PairIxn_kernel( iCoord, jCoord, combindedSigma, combindedEpsilon, ijForce, &energy
+                calculateVdw14_7PairIxn_kernel( combindedSigma, combindedEpsilon, ijForce, &energy
 #ifdef AMOEBA_DEBUG
 ,  pullDebug
 #endif
 );
                // mask out excluded ixns
+                unsigned int mask  =  ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+j) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
+                if( mask && bExclusionFlag ){
                    unsigned int maskIndex  = 1 << j;
-                unsigned int mask       =  ( (exclusionMask & maskIndex) || (atomI >= cAmoebaSim.numberOfAtoms) || ((y+j) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
+                    mask = (exclusionMask & maskIndex) ? 0 : 1;
+                }
                // add to field at atomI the field due atomJ's dipole
@@ -152,7 +161,7 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
        index                             += cAmoebaSim.paddedNumberOfAtoms;
        debugArray[index].x                = (float) x;
        debugArray[index].y                = (float) y;
-        debugArray[index].z                = (float) cell+tgx;
+        debugArray[index].z                = (float) tgx;
        debugArray[index].w                = energy;
        index                             += cAmoebaSim.paddedNumberOfAtoms;
@@ -188,10 +197,8 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
            load3dArray( offset, forceSum, outputForce );
 #endif
-        }
+        } else {
-        else
-        {
-            // Read fixed atom data into registers and GRF
            if (lasty != y)
            {
                // load coordinates, charge, ...
@@ -200,120 +207,51 @@ if( atomI == targetAtom || (y+j) == targetAtom ){
            }
-           // zero shared fields
+#ifdef USE_CUTOFF
+            unsigned int flags = cSim.pInteractionFlag[pos];
-            zeroVdw14_7SharedForce(  &(sA[threadIdx.x]) );
+            if (flags == 0) {
+            } else {
-            if( !bExclusionFlag )
-            {
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    float ijForce[3];
-                    // load coords, charge, ...
-                    loadVdw14_7Data( &(psA[tj]), &jCoord, &jSigma, &jEpsilon ); 
-                    // get combined sigma and epsilon
-                    float combindedSigma;
-                    float combindedEpsilon;
-                    getVdw14_7CombindedSigmaEpsilon_kernel( sigmaCombiningRule,   iSigma,   jSigma,   &combindedSigma,
-                                                            epsilonCombiningRule, iEpsilon, jEpsilon, &combindedEpsilon );
-                    // calculate force
-                    float energy;
-                    calculateVdw14_7PairIxn_kernel( iCoord, jCoord, combindedSigma, combindedEpsilon, ijForce, &energy
-#ifdef AMOEBA_DEBUG
-    ,  pullDebug
 #endif
-       );
-                    if( (atomI < cAmoebaSim.numberOfAtoms) && ((y+tj) < cAmoebaSim.numberOfAtoms) ){
-                        // add to field at atomI the field due atomJ's dipole
-                        forceSum[0]              += ijForce[0];
-                        forceSum[1]              += ijForce[1];
-                        forceSum[2]              += ijForce[2];
-                        // add to field at atomJ the field due atomI's dipole
-                        psA[tj].force[0]         -= ijForce[0];
-                        psA[tj].force[1]         -= ijForce[1];
-                        psA[tj].force[2]         -= ijForce[2];
-                        totalEnergy              += energy;
-                    }
-#ifdef AMOEBA_DEBUG
-if( atomI == targetAtom || (y+tj) == targetAtom ){
-        unsigned int index                 = (atomI == targetAtom) ? (y + tj) : atomI;
-        debugArray[index].x                = (float) atomI;
-        debugArray[index].y                = (float) (y + tj); 
-        debugArray[index].z                = -2.0f;
-        debugArray[index].w                = -1.0f;
-        index                             += cAmoebaSim.paddedNumberOfAtoms;
-        debugArray[index].x                = (float) x;
-        debugArray[index].y                = (float) y;
-        debugArray[index].z                = -1.0f;
-        debugArray[index].w                = energy;
-        index                             += cAmoebaSim.paddedNumberOfAtoms;
+           // zero shared fields
-        debugArray[index].x                = pullDebug[0].x;
-        debugArray[index].y                = pullDebug[0].y;
-        debugArray[index].z                = pullDebug[0].z;
-        debugArray[index].w                = pullDebug[0].w;
-        index                             += cAmoebaSim.paddedNumberOfAtoms;
+            zeroVdw14_7SharedForce( &(sA[threadIdx.x]) );
-        debugArray[index].x                = pullDebug[1].x;
-        debugArray[index].y                = pullDebug[1].y;
-        debugArray[index].z                = pullDebug[1].z;
-        debugArray[index].w                = pullDebug[1].w;
-        index                             += cAmoebaSim.paddedNumberOfAtoms;
+            if( bExclusionFlag ){
-        debugArray[index].x                = ijForce[0];
-        debugArray[index].y                = ijForce[1];
-        debugArray[index].z                = ijForce[2];
-}
-#endif
-                    tj                  = (tj + 1) & (GRID - 1);
-                }
-            }
-            else 
-            {
                unsigned int xi              = x >> GRIDBITS;
                unsigned int yi              = y >> GRIDBITS;
                unsigned int cell            = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
-                int exclusionIndex           = cAmoebaSim.pVdwExclusionIndicesIndex[cell]+tgx;
+                exclusionIndex               = cAmoebaSim.pVdwExclusionIndicesIndex[cell]+tgx;
-                int exclusionMask            = cAmoebaSim.pVdwExclusionIndices[exclusionIndex];
+                exclusionMask                = cAmoebaSim.pVdwExclusionIndices[exclusionIndex];
+            }
            for (unsigned int j = 0; j < GRID; j++)
            {
                float ijForce[3];
-                    // load coords, charge, ...
-                    loadVdw14_7Data( &(psA[tj]), &jCoord, &jSigma, &jEpsilon ); 
                // get combined sigma and epsilon
                float combindedSigma;
                float combindedEpsilon;
-                    getVdw14_7CombindedSigmaEpsilon_kernel( sigmaCombiningRule,   iSigma,   jSigma,   &combindedSigma,
+                getVdw14_7CombindedSigmaEpsilon_kernel( sigmaCombiningRule,   localParticle.sigma,   psA[tj].sigma,   &combindedSigma,
-                                                            epsilonCombiningRule, iEpsilon, jEpsilon, &combindedEpsilon );
+                                                        epsilonCombiningRule, localParticle.epsilon, psA[tj].epsilon, &combindedEpsilon );
                // calculate force
                float energy;
-                    calculateVdw14_7PairIxn_kernel( iCoord, jCoord, combindedSigma, combindedEpsilon, ijForce, &energy
+                ijForce[0]    = psA[tj].x - localParticle.x;
+                ijForce[1]    = psA[tj].y - localParticle.y;
+                ijForce[2]    = psA[tj].z - localParticle.z;
+                if( cAmoebaSim.vdwUsePBC )
+                {
+                    ijForce[0]   -= floor(ijForce[0]*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                    ijForce[1]   -= floor(ijForce[1]*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                    ijForce[2]   -= floor(ijForce[2]*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+                }
+                calculateVdw14_7PairIxn_kernel( combindedSigma, combindedEpsilon, ijForce, &energy
 #ifdef AMOEBA_DEBUG
    ,  pullDebug
 #endif
@@ -321,8 +259,11 @@ if( atomI == targetAtom || (y+tj) == targetAtom ){
                // mask out excluded ixns
+                unsigned int mask       =  ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+tj) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
+                if( mask && bExclusionFlag ){
                    unsigned int maskIndex  = 1 << tj;
-                    unsigned int mask       =  ( (exclusionMask & maskIndex) || (atomI >= cAmoebaSim.numberOfAtoms) || ((y+tj) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
+                    mask                    =  (exclusionMask & maskIndex) ? 0 : 1;
+                }
                // accumulate force for atomI
@@ -350,7 +291,7 @@ if( atomI == targetAtom || (y+tj) == targetAtom ){
        index                             += cAmoebaSim.paddedNumberOfAtoms;
        debugArray[index].x                = (float) x;
        debugArray[index].y                = (float) y;
-        debugArray[index].z                = (float) cell+tgx;
+        debugArray[index].z                = (float) tgx;
        debugArray[index].w                = energy;
        index                             += cAmoebaSim.paddedNumberOfAtoms;
@@ -373,8 +314,10 @@ if( atomI == targetAtom || (y+tj) == targetAtom ){
 #endif
                tj                  = (tj + 1) & (GRID - 1);
-                } 
+            } // end of j-loop 
-            } 
+#ifdef USE_CUTOFF
+}
+#endif
            // Write results
@@ -385,7 +328,6 @@ if( atomI == targetAtom || (y+tj) == targetAtom ){
            offset                              = 3*(y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
            load3dArrayBufferPerWarp( offset, sA[threadIdx.x].force,       outputForce );
 #else
            unsigned int offset                 = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
            load3dArray( offset, forceSum,       outputForce );
@@ -395,7 +337,8 @@ if( atomI == targetAtom || (y+tj) == targetAtom ){
 #endif
            lasty = y;
-        }
+        } // x == y block
        pos++;
    }
    cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += totalEnergy;

--- a/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaRotateFrame.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaRotateFrame.cu
@@ -2,6 +2,7 @@
 //-----------------------------------------------------------------------------------------
+#include "cudaKernels.h"
 #include "amoebaCudaKernels.h"
 #include <stdio.h>
@@ -353,13 +354,6 @@ void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu )
 }
-#undef USE_PERIODIC
-#define USE_PERIODIC
-#define METHOD_NAME(a, b) a##Periodic##b
-#include "kFindInteractingBlocks.h"
-#undef USE_PERIODIC
-#undef METHOD_NAME
 void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaGeneralizedKirkwood ) 
 {
    std::string methodName = "kCalculateAmoebaMultipoleForces";
@@ -384,7 +378,8 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG
            LAUNCHERROR("kFindBlockBoundsPeriodic");
            kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
            LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
-            compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
+            //compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
+            compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, amoebaGpu->psWorkUnit->_pDevStream[0], gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
            kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
                    sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
            LAUNCHERROR("kFindInteractionsWithinBlocksPeriodic");

--- a/plugins/amoeba/platforms/cuda/tests/AmoebaTinkerParameterFile.cpp
+++ b/plugins/amoeba/platforms/cuda/tests/AmoebaTinkerParameterFile.cpp
@@ -2460,7 +2460,7 @@ static int readAmoebaGeneralizedKirkwoodParameters( FILE* filePtr, MapStringInt&
    --------------------------------------------------------------------------------------- */
-static int readAmoebaVdwParameters( FILE* filePtr, MapStringInt& forceMap, const StringVector& tokens,
+static int readAmoebaVdwParameters( FILE* filePtr, int version, MapStringInt& forceMap, const StringVector& tokens,
                                    System& system,  int useOpenMMUnits,
                                    MapStringVectorOfVectors& supplementary,
                                    MapStringString& inputArgumentMap, int* lineCount, FILE* log ){
@@ -2542,6 +2542,21 @@ static int readAmoebaVdwParameters( FILE* filePtr, MapStringInt& forceMap, const
        }
    }
+    if( version > 1 ){
+        lineTokensT.resize(0);
+        isNotEof = readLine( filePtr, lineTokensT, lineCount, log );
+        if( lineTokensT[0] == "AmoebaVdw14_7Periodic" ){
+            int usePBC = atoi( lineTokensT[1].c_str() );
+            vdwForce->setPBC( usePBC );
+        }
+        lineTokensT.resize(0);
+        isNotEof = readLine( filePtr, lineTokensT, lineCount, log );
+        if( lineTokensT[0] == "AmoebaVdw14_7CutOff" ){
+            double cutoff = atof( lineTokensT[1].c_str() );
+            vdwForce->setCutoff( cutoff );
+        }
+    }
    lineTokensT.resize(0);
    isNotEof = readLine( filePtr, lineTokensT, lineCount, log );
    if( lineTokensT[0] == "AmoebaVdw14_7Exclusion" ){
@@ -2621,6 +2636,8 @@ static int readAmoebaVdwParameters( FILE* filePtr, MapStringInt& forceMap, const
                        methodName.c_str(), arraySize, (useOpenMMUnits ? "OpenMM" : "Amoeba"),
                        vdwForce->getSigmaCombiningRule().c_str(), vdwForce->getEpsilonCombiningRule().c_str() );
+        (void) fprintf( log, "use periodic boundary conditions=%d cutoff=%15.7e\n", vdwForce->getPBC(), vdwForce->getCutoff() );
        for( int ii = 0; ii < vdwForce->getNumParticles();  ii++ ){
            int indexIV, indexClass;
            double sigma, epsilon, reduction;
@@ -3669,7 +3686,7 @@ Integrator* readAmoebaParameterFile( const std::string& inputParameterFile, MapS
            // Amoeba Vdw
            } else if( field == "AmoebaVdw14_7SigEpsTable"  || field == "AmoebaVdw14_7Reduction" ){
-                readAmoebaVdwParameters( filePtr, forceMap, tokens, system, useOpenMMUnits, supplementary, inputArgumentMap, &lineCount, log );
+                readAmoebaVdwParameters( filePtr, version, forceMap, tokens, system, useOpenMMUnits, supplementary, inputArgumentMap, &lineCount, log );
            } else if( field == "AmoebaVdwForce" ){
                readVec3( filePtr, tokens, forces[AMOEBA_VDW_FORCE], &lineCount, field, log );
            } else if( field == "AmoebaVdwEnergy" ){
@@ -4560,7 +4577,7 @@ void testUsingAmoebaTinkerParameterFile( const std::string& amoebaTinkerParamete
    StringVector forceList;
    std::string activeForceNames;
    for( MapStringInt::const_iterator ii = forceMap.begin(); ii != forceMap.end(); ii++ ){
-        if( ii->second ){
+        if( ii->second && tinkerForces.find( ii->first ) != tinkerForces.end() ){
            if( includeCavityTerm && ii->first == AMOEBA_GK_FORCE ){
                forceList.push_back( AMOEBA_GK_CAVITY_FORCE );
                activeForceNames += AMOEBA_GK_CAVITY_FORCE + ":";