Deleted the old CUDA platform

2e451b9d · Peter Eastman · 352e2fc7 · 352e2fc7 · 352e2fc7 · 352e2fc7
Commit 2e451b9d authored Dec 13, 2012 by Peter Eastman
20 changed files
--- a/plugins/amoeba/platforms/cuda-old/src/AmoebaCudaKernelFactory.cpp
+++ b/plugins/amoeba/platforms/cuda-old/src/AmoebaCudaKernelFactory.cpp
-/* -------------------------------------------------------------------------- *
- *                              OpenMMAmoeba                                  *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2008 Stanford University and the Authors.           *
- * Authors:                                                                   *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-#include "AmoebaCudaKernelFactory.h"
-#include "AmoebaCudaKernels.h"
-#include "CudaPlatform.h"
-#include "AmoebaCudaData.h"
-#include "openmm/internal/ContextImpl.h"
-#include "openmm/OpenMMException.h"
-using namespace OpenMM;
-extern "C" void registerPlatforms() { // no-op: this entry point has an empty body and registers nothing
-}
-// Walks every registered Platform and, for each one named "Cuda", allocates an
-// AmoebaCudaKernelFactory and registers it under each AMOEBA kernel name below.
-// Platforms with any other name are skipped.
-extern "C" OPENMMCUDA_EXPORT void registerKernelFactories() {
-    for (int index = 0; index < Platform::getNumPlatforms(); ++index) {
-        Platform& candidate = Platform::getPlatform(index);
-        if (!(candidate.getName() == "Cuda"))
-            continue;
-        // One factory instance per matching platform, shared by all kernel names.
-        AmoebaCudaKernelFactory* amoebaFactory = new AmoebaCudaKernelFactory();
-        candidate.registerKernelFactory(CalcAmoebaBondForceKernel::Name(), amoebaFactory);
-        candidate.registerKernelFactory(CalcAmoebaAngleForceKernel::Name(), amoebaFactory);
-        candidate.registerKernelFactory(CalcAmoebaInPlaneAngleForceKernel::Name(), amoebaFactory);
-        candidate.registerKernelFactory(CalcAmoebaPiTorsionForceKernel::Name(), amoebaFactory);
-        candidate.registerKernelFactory(CalcAmoebaStretchBendForceKernel::Name(), amoebaFactory);
-        candidate.registerKernelFactory(CalcAmoebaOutOfPlaneBendForceKernel::Name(), amoebaFactory);
-        candidate.registerKernelFactory(CalcAmoebaTorsionTorsionForceKernel::Name(), amoebaFactory);
-        candidate.registerKernelFactory(CalcAmoebaMultipoleForceKernel::Name(), amoebaFactory);
-        candidate.registerKernelFactory(CalcAmoebaGeneralizedKirkwoodForceKernel::Name(), amoebaFactory);
-        candidate.registerKernelFactory(CalcAmoebaVdwForceKernel::Name(), amoebaFactory);
-        candidate.registerKernelFactory(CalcAmoebaWcaDispersionForceKernel::Name(), amoebaFactory);
-        // CalcAmoebaForcesAndEnergyKernel is deliberately left unregistered (disabled in the original code).
-    }
-}
-// Ensures a platform named "Cuda" exists (registering a new CudaPlatform when none
-// is found and gpuIsAvailable() reports a usable GPU), then registers the AMOEBA
-// kernel factories through registerKernelFactories().
-extern "C" OPENMMCUDA_EXPORT void registerAmoebaCudaKernelFactories( void ) {
-    bool cudaPlatformFound = false;
-    // The scan stops at the first platform whose name is "Cuda".
-    for (int index = 0; index < Platform::getNumPlatforms() && !cudaPlatformFound; ++index) {
-        if (Platform::getPlatform(index).getName() == "Cuda")
-            cudaPlatformFound = true;
-    }
-    if (!cudaPlatformFound && gpuIsAvailable())
-        Platform::registerPlatform(new CudaPlatform());
-    registerKernelFactories();
-}
-static std::map<ContextImpl*, AmoebaCudaData*> contextToAmoebaDataMap;
-// look up AmoebaCudaData for input contextImpl in contextToAmoebaDataMap
-// Returns the AmoebaCudaData registered for `context` in contextToAmoebaDataMap,
-// as an untyped pointer, or NULL when the context has no entry.
-extern "C" void* getAmoebaCudaData( ContextImpl& context ) {
-    typedef std::map<ContextImpl*, AmoebaCudaData*>::const_iterator DataIterator;
-    const DataIterator found = contextToAmoebaDataMap.find(&context);
-    if (found != contextToAmoebaDataMap.end())
-        return static_cast<void*>(found->second);
-    return NULL;
-}
-// remove AmoebaCudaData from contextToAmoebaDataMap
-// `inputContext` is treated as a ContextImpl*; its entry (if any) is erased from
-// contextToAmoebaDataMap. Only the map entry is dropped: the AmoebaCudaData the
-// entry pointed to is not deleted by this function.
-extern "C" void removeAmoebaCudaDataFromContextMap( void* inputContext ) {
-    contextToAmoebaDataMap.erase(static_cast<ContextImpl*>(inputContext));
-}
-// Creates the CUDA implementation of the AMOEBA kernel called `name`.
-// All kernels created for the same ContextImpl share one AmoebaCudaData, which is
-// looked up in contextToAmoebaDataMap and created on first use.
-// Throws OpenMMException when `name` matches none of the supported kernel names.
-KernelImpl* AmoebaCudaKernelFactory::createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const {
-    CudaPlatform::PlatformData& platformData = *static_cast<CudaPlatform::PlatformData*>(context.getPlatformData());
-    AmoebaCudaData* sharedData;
-    std::map<ContextImpl*, AmoebaCudaData*>::const_iterator existing = contextToAmoebaDataMap.find(&context);
-    if (existing != contextToAmoebaDataMap.end()) {
-        // This context already has its data object: reuse it.
-        sharedData = existing->second;
-    } else {
-        // First kernel for this context: create the data object, record it, and tell it which context owns it.
-        sharedData = new AmoebaCudaData(platformData);
-        contextToAmoebaDataMap[&context] = sharedData;
-        sharedData->setContextImpl(static_cast<void*>(&context));
-    }
-    if (name == CalcAmoebaBondForceKernel::Name()) {
-        return new CudaCalcAmoebaBondForceKernel(name, platform, *sharedData, context.getSystem());
-    }
-    if (name == CalcAmoebaAngleForceKernel::Name()) {
-        return new CudaCalcAmoebaAngleForceKernel(name, platform, *sharedData, context.getSystem());
-    }
-    if (name == CalcAmoebaInPlaneAngleForceKernel::Name()) {
-        return new CudaCalcAmoebaInPlaneAngleForceKernel(name, platform, *sharedData, context.getSystem());
-    }
-    if (name == CalcAmoebaPiTorsionForceKernel::Name()) {
-        return new CudaCalcAmoebaPiTorsionForceKernel(name, platform, *sharedData, context.getSystem());
-    }
-    if (name == CalcAmoebaStretchBendForceKernel::Name()) {
-        return new CudaCalcAmoebaStretchBendForceKernel(name, platform, *sharedData, context.getSystem());
-    }
-    if (name == CalcAmoebaOutOfPlaneBendForceKernel::Name()) {
-        return new CudaCalcAmoebaOutOfPlaneBendForceKernel(name, platform, *sharedData, context.getSystem());
-    }
-    if (name == CalcAmoebaTorsionTorsionForceKernel::Name()) {
-        return new CudaCalcAmoebaTorsionTorsionForceKernel(name, platform, *sharedData, context.getSystem());
-    }
-    if (name == CalcAmoebaMultipoleForceKernel::Name()) {
-        return new CudaCalcAmoebaMultipoleForceKernel(name, platform, *sharedData, context.getSystem());
-    }
-    if (name == CalcAmoebaGeneralizedKirkwoodForceKernel::Name()) {
-        return new CudaCalcAmoebaGeneralizedKirkwoodForceKernel(name, platform, *sharedData, context.getSystem());
-    }
-    if (name == CalcAmoebaVdwForceKernel::Name()) {
-        return new CudaCalcAmoebaVdwForceKernel(name, platform, *sharedData, context.getSystem());
-    }
-    if (name == CalcAmoebaWcaDispersionForceKernel::Name()) {
-        return new CudaCalcAmoebaWcaDispersionForceKernel(name, platform, *sharedData, context.getSystem());
-    }
-    const std::string message = std::string("Tried to create kernel with illegal kernel name '") + name + "'";
-    throw OpenMMException(message.c_str());
-}
--- a/plugins/amoeba/platforms/cuda-old/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda-old/src/AmoebaCudaKernels.cpp
-/* -------------------------------------------------------------------------- *
- *                               OpenMMAmoeba                                 *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2008-2009 Stanford University and the Authors.      *
- * Authors:                                                                   *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-#include "AmoebaCudaKernels.h"
-#include "openmm/internal/ContextImpl.h"
-#include "kernels/amoebaGpuTypes.h"
-#include "kernels/cudaKernels.h"
-#include "kernels/amoebaCudaKernels.h"
-#include "openmm/internal/AmoebaVdwForceImpl.h"
-#include "openmm/internal/AmoebaMultipoleForceImpl.h"
-#include "openmm/internal/AmoebaWcaDispersionForceImpl.h"
-#include "openmm/internal/AmoebaTorsionTorsionForceImpl.h"
-#include "openmm/internal/NonbondedForceImpl.h"
-#include "CudaForceInfo.h"
-#include <stdio.h>
-#include <cmath>
-#ifdef _MSC_VER
-#include <windows.h>
-#endif
-extern "C" int gpuSetConstants( gpuContext gpu );
-using namespace OpenMM;
-using namespace std;
-/* -------------------------------------------------------------------------- *
- *                           Calculates bonded forces                         *
- * -------------------------------------------------------------------------- */
-// Runs the combined AMOEBA local-force computation for `data`: fetches the GPU
-// context handle, calls initializeGpu(), then invokes kCalculateAmoebaLocalForces().
-// The handle is read before initializeGpu() is called; that ordering is kept as-is.
-static void computeAmoebaLocalForces( AmoebaCudaData& data ) {
-    amoebaGpuContext amoebaGpu = data.getAmoebaGpu();
-    data.initializeGpu();
-    kCalculateAmoebaLocalForces(amoebaGpu);
-}
-/* -------------------------------------------------------------------------- *
- *                            AmoebaBondForce                                 *
- * -------------------------------------------------------------------------- */
-// CudaForceInfo adapter that presents every bond of an AmoebaBondForce as a
-// two-particle group.
-class CudaCalcAmoebaBondForceKernel::ForceInfo : public CudaForceInfo {
-public:
-    ForceInfo(const AmoebaBondForce& force) : force(force) {
-    }
-    // One group per bond.
-    int getNumParticleGroups() {
-        return force.getNumBonds();
-    }
-    // Replaces the contents of `particles` with the two particle indices of bond `index`.
-    void getParticlesInGroup(int index, std::vector<int>& particles) {
-        int atomA, atomB;
-        double restLength, forceConstant;
-        force.getBondParameters(index, atomA, atomB, restLength, forceConstant);
-        const int members[] = { atomA, atomB };
-        particles.assign(members, members + 2);
-    }
-    // Bonds compare equal when length and k match exactly; particle indices are not compared.
-    bool areGroupsIdentical(int group1, int group2) {
-        int ignoredA, ignoredB;
-        double firstLength, firstK, secondLength, secondK;
-        force.getBondParameters(group1, ignoredA, ignoredB, firstLength, firstK);
-        force.getBondParameters(group2, ignoredA, ignoredB, secondLength, secondK);
-        if (firstLength != secondLength)
-            return false;
-        return firstK == secondK;
-    }
-private:
-    const AmoebaBondForce& force;
-};
-// Constructor: stores references to the shared AmoebaCudaData and the System, and
-// bumps the data object's kernel count.
-CudaCalcAmoebaBondForceKernel::CudaCalcAmoebaBondForceKernel(std::string name, const Platform& platform, AmoebaCudaData& data, System& system)
-    : CalcAmoebaBondForceKernel(name, platform),
-      data(data),
-      system(system)
-{
-    data.incrementKernelCount();
-}
-// Destructor: undoes the constructor's kernel-count increment.
-CudaCalcAmoebaBondForceKernel::~CudaCalcAmoebaBondForceKernel()
-{
-    data.decrementKernelCount();
-}
-// Copies the bond parameters of `force` (narrowed to float) into per-bond arrays,
-// hands them with the global cubic/quartic coefficients to gpuSetAmoebaBondParameters(),
-// records this kernel as the local-forces kernel, and appends a ForceInfo for the force.
-void CudaCalcAmoebaBondForceKernel::initialize(const System& system, const AmoebaBondForce& force) {
-    data.setAmoebaLocalForcesKernel( this );
-    numBonds = force.getNumBonds();
-    std::vector<int>   firstAtoms(numBonds), secondAtoms(numBonds);
-    std::vector<float> restLengths(numBonds), quadraticTerms(numBonds);
-    for (int bond = 0; bond < numBonds; ++bond) {
-        double restLength, forceConstant;
-        force.getBondParameters(bond, firstAtoms[bond], secondAtoms[bond], restLength, forceConstant);
-        restLengths[bond]    = static_cast<float>(restLength);
-        quadraticTerms[bond] = static_cast<float>(forceConstant);
-    }
-    const float cubicTerm   = static_cast<float>(force.getAmoebaGlobalBondCubic());
-    const float quarticTerm = static_cast<float>(force.getAmoebaGlobalBondQuartic());
-    gpuSetAmoebaBondParameters(data.getAmoebaGpu(), firstAtoms, secondAtoms, restLengths, quadraticTerms,
-                               cubicTerm, quarticTerm);
-    data.getAmoebaGpu()->gpuContext->forces.push_back(new ForceInfo(force));
-}
-// Launches the shared local-force computation only when this kernel is the one
-// currently recorded in `data` as the local-forces kernel. The three arguments are
-// not consulted, and the returned energy is always 0.0.
-double CudaCalcAmoebaBondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
-    if (data.getAmoebaLocalForcesKernel() != this)
-        return 0.0;
-    computeAmoebaLocalForces(data);
-    return 0.0;
-}
-/* -------------------------------------------------------------------------- *
- *                            AmoebaAngleForce                                *
- * -------------------------------------------------------------------------- */
-// CudaForceInfo adapter that presents every angle of an AmoebaAngleForce as a
-// three-particle group.
-class CudaCalcAmoebaAngleForceKernel::ForceInfo : public CudaForceInfo {
-public:
-    ForceInfo(const AmoebaAngleForce& force) : force(force) {
-    }
-    // One group per angle.
-    int getNumParticleGroups() {
-        return force.getNumAngles();
-    }
-    // Replaces the contents of `particles` with the three particle indices of angle `index`.
-    void getParticlesInGroup(int index, std::vector<int>& particles) {
-        int atomA, atomB, atomC;
-        double idealAngle, forceConstant;
-        force.getAngleParameters(index, atomA, atomB, atomC, idealAngle, forceConstant);
-        const int members[] = { atomA, atomB, atomC };
-        particles.assign(members, members + 3);
-    }
-    // Angles compare equal when angle and k match exactly; particle indices are not compared.
-    bool areGroupsIdentical(int group1, int group2) {
-        int ignoredA, ignoredB, ignoredC;
-        double firstAngle, firstK, secondAngle, secondK;
-        force.getAngleParameters(group1, ignoredA, ignoredB, ignoredC, firstAngle, firstK);
-        force.getAngleParameters(group2, ignoredA, ignoredB, ignoredC, secondAngle, secondK);
-        if (firstAngle != secondAngle)
-            return false;
-        return firstK == secondK;
-    }
-private:
-    const AmoebaAngleForce& force;
-};
-// Constructor: stores references to the shared AmoebaCudaData and the System, and
-// bumps the data object's kernel count.
-CudaCalcAmoebaAngleForceKernel::CudaCalcAmoebaAngleForceKernel(std::string name, const Platform& platform, AmoebaCudaData& data, System& system)
-    : CalcAmoebaAngleForceKernel(name, platform),
-      data(data),
-      system(system)
-{
-    data.incrementKernelCount();
-}
-// Destructor: undoes the constructor's kernel-count increment.
-CudaCalcAmoebaAngleForceKernel::~CudaCalcAmoebaAngleForceKernel()
-{
-    data.decrementKernelCount();
-}
-// Copies the angle parameters of `force` (narrowed to float) into per-angle arrays,
-// hands them with the global cubic/quartic/pentic/sextic coefficients to
-// gpuSetAmoebaAngleParameters(), records this kernel as the local-forces kernel,
-// and appends a ForceInfo for the force.
-void CudaCalcAmoebaAngleForceKernel::initialize(const System& system, const AmoebaAngleForce& force) {
-    data.setAmoebaLocalForcesKernel( this );
-    numAngles = force.getNumAngles();
-    std::vector<int>   atomsA(numAngles), atomsB(numAngles), atomsC(numAngles);
-    std::vector<float> idealAngles(numAngles), quadraticTerms(numAngles);
-    for (int angleIndex = 0; angleIndex < numAngles; ++angleIndex) {
-        double idealAngle, forceConstant;
-        force.getAngleParameters(angleIndex, atomsA[angleIndex], atomsB[angleIndex], atomsC[angleIndex], idealAngle, forceConstant);
-        idealAngles[angleIndex]    = static_cast<float>(idealAngle);
-        quadraticTerms[angleIndex] = static_cast<float>(forceConstant);
-    }
-    const float cubicTerm   = static_cast<float>(force.getAmoebaGlobalAngleCubic());
-    const float quarticTerm = static_cast<float>(force.getAmoebaGlobalAngleQuartic());
-    const float penticTerm  = static_cast<float>(force.getAmoebaGlobalAnglePentic());
-    const float sexticTerm  = static_cast<float>(force.getAmoebaGlobalAngleSextic());
-    gpuSetAmoebaAngleParameters(data.getAmoebaGpu(), atomsA, atomsB, atomsC, idealAngles, quadraticTerms,
-                                cubicTerm, quarticTerm, penticTerm, sexticTerm);
-    data.getAmoebaGpu()->gpuContext->forces.push_back(new ForceInfo(force));
-}
-// Launches the shared local-force computation only when this kernel is the one
-// currently recorded in `data` as the local-forces kernel. The three arguments are
-// not consulted, and the returned energy is always 0.0.
-double CudaCalcAmoebaAngleForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
-    if (data.getAmoebaLocalForcesKernel() != this)
-        return 0.0;
-    computeAmoebaLocalForces(data);
-    return 0.0;
-}
-/* -------------------------------------------------------------------------- *
- *                            AmoebaInPlaneAngleForce                         *
- * -------------------------------------------------------------------------- */
-// CudaForceInfo adapter that presents every angle of an AmoebaInPlaneAngleForce as a
-// four-particle group.
-class CudaCalcAmoebaInPlaneAngleForceKernel::ForceInfo : public CudaForceInfo {
-public:
-    ForceInfo(const AmoebaInPlaneAngleForce& force) : force(force) {
-    }
-    // One group per in-plane angle.
-    int getNumParticleGroups() {
-        return force.getNumAngles();
-    }
-    // Replaces the contents of `particles` with the four particle indices of angle `index`.
-    void getParticlesInGroup(int index, std::vector<int>& particles) {
-        int atomA, atomB, atomC, atomD;
-        double idealAngle, forceConstant;
-        force.getAngleParameters(index, atomA, atomB, atomC, atomD, idealAngle, forceConstant);
-        const int members[] = { atomA, atomB, atomC, atomD };
-        particles.assign(members, members + 4);
-    }
-    // Angles compare equal when angle and k match exactly; particle indices are not compared.
-    bool areGroupsIdentical(int group1, int group2) {
-        int ignoredA, ignoredB, ignoredC, ignoredD;
-        double firstAngle, firstK, secondAngle, secondK;
-        force.getAngleParameters(group1, ignoredA, ignoredB, ignoredC, ignoredD, firstAngle, firstK);
-        force.getAngleParameters(group2, ignoredA, ignoredB, ignoredC, ignoredD, secondAngle, secondK);
-        if (firstAngle != secondAngle)
-            return false;
-        return firstK == secondK;
-    }
-private:
-    const AmoebaInPlaneAngleForce& force;
-};
-// Constructor: stores references to the shared AmoebaCudaData and the System, and
-// bumps the data object's kernel count.
-CudaCalcAmoebaInPlaneAngleForceKernel::CudaCalcAmoebaInPlaneAngleForceKernel(std::string name, const Platform& platform, AmoebaCudaData& data, System& system)
-    : CalcAmoebaInPlaneAngleForceKernel(name, platform),
-      data(data),
-      system(system)
-{
-    data.incrementKernelCount();
-}
-// Destructor: undoes the constructor's kernel-count increment.
-CudaCalcAmoebaInPlaneAngleForceKernel::~CudaCalcAmoebaInPlaneAngleForceKernel()
-{
-    data.decrementKernelCount();
-}
-// Copies the in-plane angle parameters of `force` (narrowed to float) into per-angle
-// arrays, hands them with the global cubic/quartic/pentic/sextic coefficients to
-// gpuSetAmoebaInPlaneAngleParameters(), records this kernel as the local-forces
-// kernel, and appends a ForceInfo for the force.
-void CudaCalcAmoebaInPlaneAngleForceKernel::initialize(const System& system, const AmoebaInPlaneAngleForce& force) {
-    data.setAmoebaLocalForcesKernel( this );
-    numAngles = force.getNumAngles();
-    std::vector<int>   atomsA(numAngles), atomsB(numAngles), atomsC(numAngles), atomsD(numAngles);
-    std::vector<float> idealAngles(numAngles), quadraticTerms(numAngles);
-    for (int angleIndex = 0; angleIndex < numAngles; ++angleIndex) {
-        double idealAngle, forceConstant;
-        force.getAngleParameters(angleIndex, atomsA[angleIndex], atomsB[angleIndex], atomsC[angleIndex], atomsD[angleIndex], idealAngle, forceConstant);
-        // The angle is stored exactly as supplied; no unit conversion is applied here.
-        idealAngles[angleIndex]    = static_cast<float>(idealAngle);
-        quadraticTerms[angleIndex] = static_cast<float>(forceConstant);
-    }
-    const float cubicTerm   = static_cast<float>(force.getAmoebaGlobalInPlaneAngleCubic());
-    const float quarticTerm = static_cast<float>(force.getAmoebaGlobalInPlaneAngleQuartic());
-    const float penticTerm  = static_cast<float>(force.getAmoebaGlobalInPlaneAnglePentic());
-    const float sexticTerm  = static_cast<float>(force.getAmoebaGlobalInPlaneAngleSextic());
-    gpuSetAmoebaInPlaneAngleParameters(data.getAmoebaGpu(), atomsA, atomsB, atomsC, atomsD, idealAngles, quadraticTerms,
-                                       cubicTerm, quarticTerm, penticTerm, sexticTerm);
-    data.getAmoebaGpu()->gpuContext->forces.push_back(new ForceInfo(force));
-}
-// Launches the shared local-force computation only when this kernel is the one
-// currently recorded in `data` as the local-forces kernel. The three arguments are
-// not consulted, and the returned energy is always 0.0.
-double CudaCalcAmoebaInPlaneAngleForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
-    if (data.getAmoebaLocalForcesKernel() != this)
-        return 0.0;
-    computeAmoebaLocalForces(data);
-    return 0.0;
-}
-/* -------------------------------------------------------------------------- *
- *                            AmoebaPiTorsionForce                            *
- * -------------------------------------------------------------------------- */
-// CudaForceInfo adapter that presents every pi-torsion of an AmoebaPiTorsionForce as a
-// six-particle group.
-class CudaCalcAmoebaPiTorsionForceKernel::ForceInfo : public CudaForceInfo {
-public:
-    ForceInfo(const AmoebaPiTorsionForce& force) : force(force) {
-    }
-    // One group per pi-torsion.
-    int getNumParticleGroups() {
-        return force.getNumPiTorsions();
-    }
-    // Replaces the contents of `particles` with the six particle indices of pi-torsion `index`.
-    void getParticlesInGroup(int index, std::vector<int>& particles) {
-        int atomA, atomB, atomC, atomD, atomE, atomF;
-        double forceConstant;
-        force.getPiTorsionParameters(index, atomA, atomB, atomC, atomD, atomE, atomF, forceConstant);
-        const int members[] = { atomA, atomB, atomC, atomD, atomE, atomF };
-        particles.assign(members, members + 6);
-    }
-    // Pi-torsions compare equal when their k values match exactly; particle indices are not compared.
-    bool areGroupsIdentical(int group1, int group2) {
-        int ignoredA, ignoredB, ignoredC, ignoredD, ignoredE, ignoredF;
-        double firstK, secondK;
-        force.getPiTorsionParameters(group1, ignoredA, ignoredB, ignoredC, ignoredD, ignoredE, ignoredF, firstK);
-        force.getPiTorsionParameters(group2, ignoredA, ignoredB, ignoredC, ignoredD, ignoredE, ignoredF, secondK);
-        const bool sameParameters = (firstK == secondK);
-        return sameParameters;
-    }
-private:
-    const AmoebaPiTorsionForce& force;
-};
-// Constructor: stores references to the shared AmoebaCudaData and the System, and
-// bumps the data object's kernel count.
-CudaCalcAmoebaPiTorsionForceKernel::CudaCalcAmoebaPiTorsionForceKernel(std::string name, const Platform& platform, AmoebaCudaData& data, System& system)
-    : CalcAmoebaPiTorsionForceKernel(name, platform),
-      data(data),
-      system(system)
-{
-    data.incrementKernelCount();
-}
-// Destructor: undoes the constructor's kernel-count increment.
-CudaCalcAmoebaPiTorsionForceKernel::~CudaCalcAmoebaPiTorsionForceKernel()
-{
-    data.decrementKernelCount();
-}
-// Copies the pi-torsion parameters of `force` (k narrowed to float) into per-torsion
-// arrays, hands them to gpuSetAmoebaPiTorsionParameters(), records this kernel as the
-// local-forces kernel, and appends a ForceInfo for the force.
-void CudaCalcAmoebaPiTorsionForceKernel::initialize(const System& system, const AmoebaPiTorsionForce& force) {
-    data.setAmoebaLocalForcesKernel( this );
-    numPiTorsions = force.getNumPiTorsions();
-    std::vector<int>   atomsA(numPiTorsions), atomsB(numPiTorsions), atomsC(numPiTorsions);
-    std::vector<int>   atomsD(numPiTorsions), atomsE(numPiTorsions), atomsF(numPiTorsions);
-    std::vector<float> forceConstants(numPiTorsions);
-    for (int torsion = 0; torsion < numPiTorsions; ++torsion) {
-        double forceConstant;
-        force.getPiTorsionParameters(torsion, atomsA[torsion], atomsB[torsion], atomsC[torsion],
-                                     atomsD[torsion], atomsE[torsion], atomsF[torsion], forceConstant);
-        forceConstants[torsion] = static_cast<float>(forceConstant);
-    }
-    gpuSetAmoebaPiTorsionParameters(data.getAmoebaGpu(), atomsA, atomsB, atomsC, atomsD, atomsE, atomsF, forceConstants);
-    data.getAmoebaGpu()->gpuContext->forces.push_back(new ForceInfo(force));
-}
-// Launches the shared local-force computation only when this kernel is the one
-// currently recorded in `data` as the local-forces kernel. The three arguments are
-// not consulted, and the returned energy is always 0.0.
-double CudaCalcAmoebaPiTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
-    if (data.getAmoebaLocalForcesKernel() != this)
-        return 0.0;
-    computeAmoebaLocalForces(data);
-    return 0.0;
-}
-/* -------------------------------------------------------------------------- *
- *                           AmoebaStretchBend                                *
- * -------------------------------------------------------------------------- */
-// CudaForceInfo adapter that presents every stretch-bend of an AmoebaStretchBendForce
-// as a three-particle group.
-class CudaCalcAmoebaStretchBendForceKernel::ForceInfo : public CudaForceInfo {
-public:
-    ForceInfo(const AmoebaStretchBendForce& force) : force(force) {
-    }
-    // One group per stretch-bend term.
-    int getNumParticleGroups() {
-        return force.getNumStretchBends();
-    }
-    // Replaces the contents of `particles` with the three particle indices of term `index`.
-    void getParticlesInGroup(int index, std::vector<int>& particles) {
-        int atomA, atomB, atomC;
-        double bondAB, bondCB, idealAngle, forceConstant;
-        force.getStretchBendParameters(index, atomA, atomB, atomC, bondAB, bondCB, idealAngle, forceConstant);
-        const int members[] = { atomA, atomB, atomC };
-        particles.assign(members, members + 3);
-    }
-    // Terms compare equal when both lengths, the angle and k all match exactly;
-    // particle indices are not compared.
-    bool areGroupsIdentical(int group1, int group2) {
-        int ignoredA, ignoredB, ignoredC;
-        double firstAB, firstCB, firstAngle, firstK;
-        double secondAB, secondCB, secondAngle, secondK;
-        force.getStretchBendParameters(group1, ignoredA, ignoredB, ignoredC, firstAB, firstCB, firstAngle, firstK);
-        force.getStretchBendParameters(group2, ignoredA, ignoredB, ignoredC, secondAB, secondCB, secondAngle, secondK);
-        if (firstAB != secondAB)
-            return false;
-        if (firstCB != secondCB)
-            return false;
-        if (firstAngle != secondAngle)
-            return false;
-        return firstK == secondK;
-    }
-private:
-    const AmoebaStretchBendForce& force;
-};
-// Constructor: stores references to the shared AmoebaCudaData and the System, and
-// bumps the data object's kernel count.
-CudaCalcAmoebaStretchBendForceKernel::CudaCalcAmoebaStretchBendForceKernel(std::string name, const Platform& platform, AmoebaCudaData& data, System& system)
-    : CalcAmoebaStretchBendForceKernel(name, platform),
-      data(data),
-      system(system)
-{
-    data.incrementKernelCount();
-}
-// Destructor: undoes the constructor's kernel-count increment.
-CudaCalcAmoebaStretchBendForceKernel::~CudaCalcAmoebaStretchBendForceKernel()
-{
-    data.decrementKernelCount();
-}
-// Copies the stretch-bend parameters of `force` (narrowed to float) into per-term
-// arrays, hands them to gpuSetAmoebaStretchBendParameters(), records this kernel as
-// the local-forces kernel, and appends a ForceInfo for the force.
-void CudaCalcAmoebaStretchBendForceKernel::initialize(const System& system, const AmoebaStretchBendForce& force) {
-    data.setAmoebaLocalForcesKernel( this );
-    numStretchBends = force.getNumStretchBends();
-    std::vector<int>   atomsA(numStretchBends), atomsB(numStretchBends), atomsC(numStretchBends);
-    std::vector<float> bondsAB(numStretchBends), bondsCB(numStretchBends);
-    std::vector<float> idealAngles(numStretchBends), forceConstants(numStretchBends);
-    for (int term = 0; term < numStretchBends; ++term) {
-        double bondAB, bondCB, idealAngle, forceConstant;
-        force.getStretchBendParameters(term, atomsA[term], atomsB[term], atomsC[term], bondAB, bondCB, idealAngle, forceConstant);
-        bondsAB[term]        = static_cast<float>(bondAB);
-        bondsCB[term]        = static_cast<float>(bondCB);
-        idealAngles[term]    = static_cast<float>(idealAngle);
-        forceConstants[term] = static_cast<float>(forceConstant);
-    }
-    gpuSetAmoebaStretchBendParameters(data.getAmoebaGpu(), atomsA, atomsB, atomsC, bondsAB, bondsCB, idealAngles, forceConstants);
-    data.getAmoebaGpu()->gpuContext->forces.push_back(new ForceInfo(force));
-}
-// Launches the shared local-force computation only when this kernel is the one
-// currently recorded in `data` as the local-forces kernel. The three arguments are
-// not consulted, and the returned energy is always 0.0.
-double CudaCalcAmoebaStretchBendForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
-    if (data.getAmoebaLocalForcesKernel() != this)
-        return 0.0;
-    computeAmoebaLocalForces(data);
-    return 0.0;
-}
-/* -------------------------------------------------------------------------- *
- *                           AmoebaOutOfPlaneBend                             *
- * -------------------------------------------------------------------------- */
-// CudaForceInfo adapter that presents every out-of-plane bend of an
-// AmoebaOutOfPlaneBendForce as a four-particle group.
-class CudaCalcAmoebaOutOfPlaneBendForceKernel::ForceInfo : public CudaForceInfo {
-public:
-    ForceInfo(const AmoebaOutOfPlaneBendForce& force) : force(force) {
-    }
-    // One group per out-of-plane bend.
-    int getNumParticleGroups() {
-        return force.getNumOutOfPlaneBends();
-    }
-    // Replaces the contents of `particles` with the four particle indices of bend `index`.
-    void getParticlesInGroup(int index, std::vector<int>& particles) {
-        int atomA, atomB, atomC, atomD;
-        double forceConstant;
-        force.getOutOfPlaneBendParameters(index, atomA, atomB, atomC, atomD, forceConstant);
-        const int members[] = { atomA, atomB, atomC, atomD };
-        particles.assign(members, members + 4);
-    }
-    // Bends compare equal when their k values match exactly; particle indices are not compared.
-    bool areGroupsIdentical(int group1, int group2) {
-        int ignoredA, ignoredB, ignoredC, ignoredD;
-        double firstK, secondK;
-        force.getOutOfPlaneBendParameters(group1, ignoredA, ignoredB, ignoredC, ignoredD, firstK);
-        force.getOutOfPlaneBendParameters(group2, ignoredA, ignoredB, ignoredC, ignoredD, secondK);
-        const bool sameParameters = (firstK == secondK);
-        return sameParameters;
-    }
-private:
-    const AmoebaOutOfPlaneBendForce& force;
-};
-// Constructor: stores references to the shared AmoebaCudaData and the System, and
-// bumps the data object's kernel count.
-CudaCalcAmoebaOutOfPlaneBendForceKernel::CudaCalcAmoebaOutOfPlaneBendForceKernel(std::string name, const Platform& platform, AmoebaCudaData& data, System& system)
-    : CalcAmoebaOutOfPlaneBendForceKernel(name, platform),
-      data(data),
-      system(system)
-{
-    data.incrementKernelCount();
-}
-// Destructor: undoes the constructor's kernel-count increment.
-CudaCalcAmoebaOutOfPlaneBendForceKernel::~CudaCalcAmoebaOutOfPlaneBendForceKernel()
-{
-    data.decrementKernelCount();
-}
-// Copies the out-of-plane bend parameters of `force` (k narrowed to float) into
-// per-bend arrays, hands them with the global cubic/quartic/pentic/sextic
-// coefficients to gpuSetAmoebaOutOfPlaneBendParameters(), records this kernel as the
-// local-forces kernel, and appends a ForceInfo for the force.
-void CudaCalcAmoebaOutOfPlaneBendForceKernel::initialize(const System& system, const AmoebaOutOfPlaneBendForce& force) {
-    data.setAmoebaLocalForcesKernel( this );
-    numOutOfPlaneBends = force.getNumOutOfPlaneBends();
-    std::vector<int>   atomsA(numOutOfPlaneBends), atomsB(numOutOfPlaneBends);
-    std::vector<int>   atomsC(numOutOfPlaneBends), atomsD(numOutOfPlaneBends);
-    std::vector<float> forceConstants(numOutOfPlaneBends);
-    for (int bend = 0; bend < numOutOfPlaneBends; ++bend) {
-        double forceConstant;
-        force.getOutOfPlaneBendParameters(bend, atomsA[bend], atomsB[bend], atomsC[bend], atomsD[bend], forceConstant);
-        forceConstants[bend] = static_cast<float>(forceConstant);
-    }
-    const float cubicTerm   = static_cast<float>(force.getAmoebaGlobalOutOfPlaneBendCubic());
-    const float quarticTerm = static_cast<float>(force.getAmoebaGlobalOutOfPlaneBendQuartic());
-    const float penticTerm  = static_cast<float>(force.getAmoebaGlobalOutOfPlaneBendPentic());
-    const float sexticTerm  = static_cast<float>(force.getAmoebaGlobalOutOfPlaneBendSextic());
-    gpuSetAmoebaOutOfPlaneBendParameters(data.getAmoebaGpu(), atomsA, atomsB, atomsC, atomsD, forceConstants,
-                                         cubicTerm, quarticTerm, penticTerm, sexticTerm);
-    data.getAmoebaGpu()->gpuContext->forces.push_back(new ForceInfo(force));
-}
-// Launches the shared local-force computation only when this kernel is the one
-// currently recorded in `data` as the local-forces kernel. The three arguments are
-// not consulted, and the returned energy is always 0.0.
-double CudaCalcAmoebaOutOfPlaneBendForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
-    if (data.getAmoebaLocalForcesKernel() != this)
-        return 0.0;
-    computeAmoebaLocalForces(data);
-    return 0.0;
-}
-/* -------------------------------------------------------------------------- *
- *                           AmoebaTorsionTorsion                             *
- * -------------------------------------------------------------------------- */
-// CudaForceInfo adapter that presents every torsion-torsion of an
-// AmoebaTorsionTorsionForce as a five-particle group.
-class CudaCalcAmoebaTorsionTorsionForceKernel::ForceInfo : public CudaForceInfo {
-public:
-    ForceInfo(const AmoebaTorsionTorsionForce& force) : force(force) {
-    }
-    // One group per torsion-torsion term.
-    int getNumParticleGroups() {
-        return force.getNumTorsionTorsions();
-    }
-    // Replaces the contents of `particles` with the five particle indices of term `index`.
-    // The chiral-check atom index and grid index are fetched but not reported.
-    void getParticlesInGroup(int index, std::vector<int>& particles) {
-        int atomA, atomB, atomC, atomD, atomE, chiralAtom, grid;
-        force.getTorsionTorsionParameters(index, atomA, atomB, atomC, atomD, atomE, chiralAtom, grid);
-        const int members[] = { atomA, atomB, atomC, atomD, atomE };
-        particles.assign(members, members + 5);
-    }
-    // Terms compare equal when they use the same grid index; particle indices and the
-    // chiral-check atom index are not compared.
-    bool areGroupsIdentical(int group1, int group2) {
-        int ignoredA, ignoredB, ignoredC, ignoredD, ignoredE;
-        int firstChiral, firstGrid, secondChiral, secondGrid;
-        force.getTorsionTorsionParameters(group1, ignoredA, ignoredB, ignoredC, ignoredD, ignoredE, firstChiral, firstGrid);
-        force.getTorsionTorsionParameters(group2, ignoredA, ignoredB, ignoredC, ignoredD, ignoredE, secondChiral, secondGrid);
-        const bool sameGrid = (firstGrid == secondGrid);
-        return sameGrid;
-    }
-private:
-    const AmoebaTorsionTorsionForce& force;
-};
-/** Construct the kernel and add it to the kernel count kept by the shared AmoebaCudaData. */
-CudaCalcAmoebaTorsionTorsionForceKernel::CudaCalcAmoebaTorsionTorsionForceKernel(std::string name, const Platform& platform, AmoebaCudaData& data, System& system)
-    : CalcAmoebaTorsionTorsionForceKernel(name, platform),
-      data(data),
-      system(system)
-{
-    data.incrementKernelCount();
-}
-/** Remove this kernel from the kernel count kept by the shared AmoebaCudaData. */
-CudaCalcAmoebaTorsionTorsionForceKernel::~CudaCalcAmoebaTorsionTorsionForceKernel()
-{
-    data.decrementKernelCount();
-}
-/**
- * Register this kernel as the local-forces kernel, then pass the torsion-torsion
- * terms and single-precision copies of the torsion-torsion grids to the GPU setup
- * routines.
- *
- * A grid whose first entry differs between grid[0][0] and grid[0][1] is reordered
- * via AmoebaTorsionTorsionForceImpl::reorderGrid() before being copied.
- *
- * @param system     the System this kernel will be applied to
- * @param force      the AmoebaTorsionTorsionForce this kernel will be used for
- */
-void CudaCalcAmoebaTorsionTorsionForceKernel::initialize(const System& system, const AmoebaTorsionTorsionForce& force) {
-    data.setAmoebaLocalForcesKernel( this );
-    numTorsionTorsions = force.getNumTorsionTorsions();
-    // torsion-torsion parameters
-    std::vector<int>   particle1(numTorsionTorsions);
-    std::vector<int>   particle2(numTorsionTorsions);
-    std::vector<int>   particle3(numTorsionTorsions);
-    std::vector<int>   particle4(numTorsionTorsions);
-    std::vector<int>   particle5(numTorsionTorsions);
-    std::vector<int>   chiralCheckAtomIndex(numTorsionTorsions);
-    std::vector<int>   gridIndices(numTorsionTorsions);
-    for (int i = 0; i < numTorsionTorsions; i++) {
-        force.getTorsionTorsionParameters(i, particle1[i], particle2[i], particle3[i],
-                                             particle4[i], particle5[i],
-                                             chiralCheckAtomIndex[i], gridIndices[i]);
-    }
-    gpuSetAmoebaTorsionTorsionParameters(data.getAmoebaGpu(), particle1, particle2, particle3, particle4, particle5, chiralCheckAtomIndex, gridIndices );
-    // torsion-torsion grids
-    numTorsionTorsionGrids = force.getNumTorsionTorsionGrids();
-    std::vector<TorsionTorsionGridFloat> floatGrids;
-    floatGrids.resize(numTorsionTorsionGrids);
-    for (int gridIndex = 0; gridIndex < numTorsionTorsionGrids; gridIndex++) {
-        const TorsionTorsionGrid& grid = force.getTorsionTorsionGrid( gridIndex );
-        // check if grid needs to be reordered: x-angle should be 'slow' index.
-        // The check reads grid[0][0][0] and grid[0][1][0], so it is only made when both entries exist.
-        TorsionTorsionGrid reorderedGrid;
-        bool reorder = false;
-        const bool canProbeOrdering = !grid.empty() && grid[0].size() > 1 &&
-                                      !grid[0][0].empty() && !grid[0][1].empty();
-        if( canProbeOrdering && grid[0][0][0] != grid[0][1][0] ){
-            AmoebaTorsionTorsionForceImpl::reorderGrid( grid, reorderedGrid );
-            reorder = true;
-            if( data.getLog() ){
-                (void) fprintf( data.getLog(), "CudaCalcAmoebaTorsionTorsionForceKernel Reordered torsion-torsion grid %4d [%u %u] %12.3f %12.3f   [%u %u] %12.3f %12.3f.\n",
-                                gridIndex, static_cast<unsigned int>(grid.size()), static_cast<unsigned int>(grid[0].size()), grid[0][0][0], grid[0][1][0],
-                                static_cast<unsigned int>(reorderedGrid.size() ),  static_cast<unsigned int>(reorderedGrid[0].size() ), reorderedGrid[0][0][0], reorderedGrid[0][1][0] );
-            }
-        }
-        // Size and fill the float copy from the grid that is actually copied, so every read
-        // stays within that grid's own bounds even if reordering changed the dimensions.
-        const TorsionTorsionGrid& sourceGrid = reorder ? reorderedGrid : grid;
-        TorsionTorsionGridFloat& target      = floatGrids[gridIndex];
-        target.resize( sourceGrid.size() );
-        for (unsigned int ii = 0; ii < sourceGrid.size(); ii++) {
-            target[ii].resize( sourceGrid[ii].size() );
-            for (unsigned int jj = 0; jj < sourceGrid[ii].size(); jj++) {
-                target[ii][jj].resize( sourceGrid[ii][jj].size() );
-                for( unsigned int kk = 0; kk < sourceGrid[ii][jj].size(); kk++) {
-                    target[ii][jj][kk] = static_cast<float>(sourceGrid[ii][jj][kk]);
-                }
-            }
-        }
-    }
-    gpuSetAmoebaTorsionTorsionGrids(data.getAmoebaGpu(), floatGrids );
-    data.getAmoebaGpu()->gpuContext->forces.push_back(new ForceInfo(force));
-}
-/**
- * Run the shared AMOEBA local-forces computation, but only when this kernel is the
- * one returned by data.getAmoebaLocalForcesKernel().
- *
- * @param context        the context in which to execute this kernel (not read here)
- * @param includeForces  not read by this implementation
- * @param includeEnergy  not read by this implementation
- * @return always 0.0
- */
-double CudaCalcAmoebaTorsionTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
-    const bool isLocalForcesKernel = (data.getAmoebaLocalForcesKernel() == this);
-    if (isLocalForcesKernel)
-        computeAmoebaLocalForces(data);
-    return 0.0;
-}
-/* -------------------------------------------------------------------------- *
- *                             AmoebaMultipole                                *
- * -------------------------------------------------------------------------- */
-/**
- * Launch the multipole force calculation and, when generalized Kirkwood is enabled,
- * the Born-radii and Kirkwood steps that surround it.
- *
- * @param data  shared AMOEBA CUDA state; its multipole-force counter is incremented
- *              and initializeGpu() is called on it
- * @throws OpenMMException if generalized Kirkwood is enabled but the Grycuk
- *                         algorithm is not selected
- */
-static void computeAmoebaMultipoleForce( AmoebaCudaData& data ) {
-    amoebaGpuContext gpu = data.getAmoebaGpu();
-    data.incrementMultipoleForceCount();
-    // diagnostic output, disabled by the constant 0 in the condition
-    if( 0 && data.getLog() ){
-        (void) fprintf( data.getLog(), "In computeAmoebaMultipoleForce hasAmoebaGeneralizedKirkwood=%d\n",
-                        data.getHasAmoebaGeneralizedKirkwood() );
-        (void) fflush( data.getLog());
-    }
-    data.initializeGpu();
-    // calculate Born radii if GK is active; only the Grycuk algorithm is accepted here
-    if( data.getHasAmoebaGeneralizedKirkwood() ){
-        kClearBornSum( gpu->gpuContext );
-        if( data.getUseGrycuk() ){
-            kCalculateAmoebaGrycukBornRadii( gpu );
-            kReduceGrycukGbsaBornSum( gpu );
-        } else {
-            throw OpenMMException("AmoebaGeneralizedKirkwood: Born radii must be calculated using the Grycuk algorithm." );
-        }
-    }
-    // multipoles
-    kCalculateAmoebaMultipoleForces(gpu, data.getHasAmoebaGeneralizedKirkwood() );
-    // GK
-    if( data.getHasAmoebaGeneralizedKirkwood() ){
-        kCalculateAmoebaKirkwood(gpu);
-    }
-    // diagnostic output, disabled by the constant 0 in the condition
-    if( 0 && data.getLog() ){
-        (void) fprintf( data.getLog(), "completed computeAmoebaMultipoleForce\n" );
-        (void) fflush( data.getLog());
-    }
-}
-/**
- * Evaluate the electrostatic potential at the supplied grid points.
- *
- * @param data                          shared AMOEBA CUDA state
- * @param inputGrid                     points at which the potential is requested
- * @param outputElectrostaticPotential  receives the values loaded back by
- *                                      gpuLoadElectrostaticPotential()
- */
-static void computeAmoebaMultipolePotential( AmoebaCudaData& data, const std::vector< Vec3 >& inputGrid,
-                                             std::vector< double >& outputElectrostaticPotential) {
-    amoebaGpuContext amoebaGpu = data.getAmoebaGpu();
-
-    // step 1: set up the calculation for this grid
-    gpuSetupElectrostaticPotentialCalculation( amoebaGpu, inputGrid );
-
-    // step 2: clear the initialized flag before calling initializeGpu()
-    data.setGpuInitialized( false );
-    data.initializeGpu();
-
-    // step 3: calculate the potential and load it into the return vector
-    kCalculateAmoebaMultipolePotential( amoebaGpu );
-    gpuLoadElectrostaticPotential( amoebaGpu, inputGrid.size(), outputElectrostaticPotential );
-
-    // step 4: clean up what the setup call prepared
-    gpuCleanupElectrostaticPotentialCalculation( amoebaGpu );
-
-    // diagnostic output, disabled by the constant 0 in the condition
-    if( 0 && data.getLog() ){
-        (void) fprintf( data.getLog(), "completed computeAmoebaMultipolePotential\n" );
-        (void) fflush( data.getLog());
-    }
-}
-/**
- * Compute the multipole moments of the whole system.
- *
- * @param data                    shared AMOEBA CUDA state; its initialized flag is cleared
- *                                and initializeGpu() is called before the calculation
- * @param outputMultipoleMonents  receives the values written by
- *                                kCalculateAmoebaSystemMultipoleMoments()
- */
-static void computeAmoebaSystemMultipoleMoments( AmoebaCudaData& data, std::vector< double >& outputMultipoleMonents) {
-    amoebaGpuContext amoebaGpu = data.getAmoebaGpu();
-
-    data.setGpuInitialized( false );
-    data.initializeGpu();
-
-    kCalculateAmoebaSystemMultipoleMoments( amoebaGpu, outputMultipoleMonents );
-}
-/**
- * Force metadata for AmoebaMultipoleForce: decides whether two particles carry the
- * same multipole parameters.
- */
-class CudaCalcAmoebaMultipoleForceKernel::ForceInfo : public CudaForceInfo {
-public:
-    ForceInfo(const AmoebaMultipoleForce& force) : force(force) {
-    }
-    /**
-     * Compare charge, Thole, damping, polarity, axis type, dipole and quadrupole of the
-     * two particles. The three multipole-atom indices returned by the getter are not
-     * part of the comparison.
-     *
-     * @return true when every compared value is equal
-     */
-    bool areParticlesIdentical(int particle1, int particle2) {
-        double charge1, charge2, thole1, thole2, damping1, damping2, polarity1, polarity2;
-        int axis1, axis2, multipole11, multipole12, multipole21, multipole22, multipole31, multipole32;
-        std::vector<double> dipole1, dipole2, quadrupole1, quadrupole2;
-        force.getMultipoleParameters(particle1, charge1, dipole1, quadrupole1, axis1, multipole11, multipole21, multipole31, thole1, damping1, polarity1);
-        force.getMultipoleParameters(particle2, charge2, dipole2, quadrupole2, axis2, multipole12, multipole22, multipole32, thole2, damping2, polarity2);
-        if (charge1 != charge2 || thole1 != thole2 || damping1 != damping2 || polarity1 != polarity2 || axis1 != axis2){
-            return false;
-        }
-        // Whole-vector comparison checks the lengths first, so a shorter second vector
-        // is reported as different instead of being indexed past its end.
-        if (dipole1 != dipole2){
-            return false;
-        }
-        if (quadrupole1 != quadrupole2){
-            return false;
-        }
-        return true;
-    }
-private:
-    const AmoebaMultipoleForce& force;
-};
-/** Construct the kernel and add it to the kernel count kept by the shared AmoebaCudaData. */
-CudaCalcAmoebaMultipoleForceKernel::CudaCalcAmoebaMultipoleForceKernel(std::string name, const Platform& platform, AmoebaCudaData& data, System& system)
-    : CalcAmoebaMultipoleForceKernel(name, platform),
-      data(data),
-      system(system)
-{
-    data.incrementKernelCount();
-}
-/** Remove this kernel from the kernel count kept by the shared AmoebaCudaData. */
-CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel()
-{
-    data.decrementKernelCount();
-}
-/**
- * Collect the per-particle multipole parameters and covalent maps from the force,
- * convert the floating-point values to single precision, and pass everything to the
- * GPU setup routines. When the nonbonded method is PME, also select the Ewald
- * coefficient and grid size and mark the shared CUDA context as PARTICLE_MESH_EWALD.
- *
- * @param system     the System this kernel will be applied to
- * @param force      the AmoebaMultipoleForce this kernel will be used for
- * @throws OpenMMException if a particle supplies fewer than 3 dipole or 9 quadrupole
- *                         components, or if the nonbonded method or polarization
- *                         type is neither 0 nor 1
- */
-void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const AmoebaMultipoleForce& force) {
-    numMultipoles   = force.getNumMultipoles();
-    data.setHasAmoebaMultipole( true );
-    std::vector<float> charges(numMultipoles);
-    std::vector<float> dipoles(3*numMultipoles);
-    std::vector<float> quadrupoles(9*numMultipoles);
-    std::vector<float> tholes(numMultipoles);
-    std::vector<float> dampingFactors(numMultipoles);
-    std::vector<float> polarity(numMultipoles);
-    std::vector<int>   axisTypes(numMultipoles);
-    std::vector<int>   multipoleAtomZs(numMultipoles);
-    std::vector<int>   multipoleAtomXs(numMultipoles);
-    std::vector<int>   multipoleAtomYs(numMultipoles);
-    std::vector< std::vector< std::vector<int> > > multipoleAtomCovalentInfo(numMultipoles);
-    std::vector<int> minCovalentIndices(numMultipoles);
-    std::vector<int> minCovalentPolarizationIndices(numMultipoles);
-    float scalingDistanceCutoff = 50.0f;
-    std::vector<AmoebaMultipoleForce::CovalentType> covalentList;
-    covalentList.push_back( AmoebaMultipoleForce::Covalent12 );
-    covalentList.push_back( AmoebaMultipoleForce::Covalent13 );
-    covalentList.push_back( AmoebaMultipoleForce::Covalent14 );
-    covalentList.push_back( AmoebaMultipoleForce::Covalent15 );
-    std::vector<AmoebaMultipoleForce::CovalentType> polarizationCovalentList;
-    polarizationCovalentList.push_back( AmoebaMultipoleForce::PolarizationCovalent11 );
-    polarizationCovalentList.push_back( AmoebaMultipoleForce::PolarizationCovalent12 );
-    polarizationCovalentList.push_back( AmoebaMultipoleForce::PolarizationCovalent13 );
-    polarizationCovalentList.push_back( AmoebaMultipoleForce::PolarizationCovalent14 );
-    std::vector<int> covalentDegree;
-    AmoebaMultipoleForceImpl::getCovalentDegree( force, covalentDegree );
-    // largest (max - min) covalent index span seen over all particles and both covalent lists
-    int maxCovalentRange = 0;
-    for (int i = 0; i < numMultipoles; i++) {
-        // multipoles
-        int axisType, multipoleAtomZ, multipoleAtomX, multipoleAtomY;
-        double charge, tholeD, dampingFactorD, polarityD;
-        std::vector<double> dipolesD;
-        std::vector<double> quadrupolesD;
-        force.getMultipoleParameters(i, charge, dipolesD, quadrupolesD, axisType, multipoleAtomZ, multipoleAtomX, multipoleAtomY,
-                                     tholeD, dampingFactorD, polarityD );
-        // the copies below read 3 dipole and 9 quadrupole entries; reject shorter
-        // vectors instead of reading past their end
-        if( dipolesD.size() < 3 || quadrupolesD.size() < 9 ){
-            throw OpenMMException("AmoebaMultipoleForce: each particle must supply 3 dipole and 9 quadrupole components.");
-        }
-        axisTypes[i]                       = axisType;
-        multipoleAtomZs[i]                 = multipoleAtomZ;
-        multipoleAtomXs[i]                 = multipoleAtomX;
-        multipoleAtomYs[i]                 = multipoleAtomY;
-        charges[i]                         = static_cast<float>(charge);
-        tholes[i]                          = static_cast<float>(tholeD);
-        dampingFactors[i]                  = static_cast<float>(dampingFactorD);
-        polarity[i]                        = static_cast<float>(polarityD);
-        // particle i occupies entries [3*i, 3*i+3) of dipoles and [9*i, 9*i+9) of quadrupoles
-        for (int jj = 0; jj < 3; jj++) {
-            dipoles[3*i + jj]              = static_cast<float>(dipolesD[jj]);
-        }
-        for (int jj = 0; jj < 9; jj++) {
-            quadrupoles[9*i + jj]          = static_cast<float>(quadrupolesD[jj]);
-        }
-        // covalent info
-        std::vector< std::vector<int> > covalentLists;
-        force.getCovalentMaps(i, covalentLists );
-        multipoleAtomCovalentInfo[i] = covalentLists;
-        int minCovalentIndex, maxCovalentIndex;
-        AmoebaMultipoleForceImpl::getCovalentRange( force, i, covalentList, &minCovalentIndex, &maxCovalentIndex );
-        minCovalentIndices[i] = minCovalentIndex;
-        if( maxCovalentRange < (maxCovalentIndex - minCovalentIndex) ){
-            maxCovalentRange = maxCovalentIndex - minCovalentIndex;
-        }
-        AmoebaMultipoleForceImpl::getCovalentRange( force, i, polarizationCovalentList, &minCovalentIndex, &maxCovalentIndex );
-        minCovalentPolarizationIndices[i] = minCovalentIndex;
-        if( maxCovalentRange < (maxCovalentIndex - minCovalentIndex) ){
-            maxCovalentRange = maxCovalentIndex - minCovalentIndex;
-        }
-    }
-    int polarizationType = static_cast<int>(force.getPolarizationType());
-    int nonbondedMethod  = static_cast<int>(force.getNonbondedMethod());
-    if( nonbondedMethod != 0 && nonbondedMethod != 1 ){
-         throw OpenMMException("AmoebaMultipoleForce nonbonded method not recognized.\n");
-    }
-    if( polarizationType != 0 && polarizationType != 1 ){
-         throw OpenMMException("AmoebaMultipoleForce polarization type not recognized.\n");
-    }
-    gpuSetAmoebaMultipoleParameters(data.getAmoebaGpu(), charges, dipoles, quadrupoles, axisTypes, multipoleAtomZs, multipoleAtomXs, multipoleAtomYs,
-                                    tholes, scalingDistanceCutoff, dampingFactors, polarity,
-                                    multipoleAtomCovalentInfo, covalentDegree, minCovalentIndices, minCovalentPolarizationIndices, (maxCovalentRange+2),
-                                    0, force.getMutualInducedMaxIterations(),
-                                    static_cast<float>( force.getMutualInducedTargetEpsilon()),
-                                    nonbondedMethod, polarizationType,
-                                    static_cast<float>( force.getCutoffDistance()),
-                                    static_cast<float>( force.getAEwald()) );
-    if (nonbondedMethod == AmoebaMultipoleForce::PME) {
-        double alpha = force.getAEwald();
-        int xsize, ysize, zsize;
-        NonbondedForce nb;
-        nb.setEwaldErrorTolerance(force.getEwaldErrorTolerance());
-        nb.setCutoffDistance(force.getCutoffDistance());
-        std::vector<int> pmeGridDimension;
-        force.getPmeGridDimensions( pmeGridDimension );
-        // Derive the PME parameters from the error tolerance unless the force supplies both
-        // a grid (three entries, first one nonzero) and a nonzero Ewald coefficient.
-        const bool pmeParametersSetBasedOnEwaldErrorTolerance =
-            (pmeGridDimension.size() < 3 || pmeGridDimension[0] == 0 || alpha == 0.0);
-        if( pmeParametersSetBasedOnEwaldErrorTolerance ){
-            NonbondedForceImpl::calcPMEParameters(system, nb, alpha, xsize, ysize, zsize);
-        } else {
-            xsize = pmeGridDimension[0];
-            ysize = pmeGridDimension[1];
-            zsize = pmeGridDimension[2];
-        }
-        gpuSetAmoebaPMEParameters(data.getAmoebaGpu(), (float) alpha, xsize, ysize, zsize);
-        if( data.getLog() ){
-            (void) fprintf( data.getLog(), "AmoebaMultipoleForce: PME parameters tol=%12.3e cutoff=%12.3f alpha=%12.3f [%d %d %d]\n",
-                            force.getEwaldErrorTolerance(), force.getCutoffDistance(),  alpha, xsize, ysize, zsize );
-            if( pmeParametersSetBasedOnEwaldErrorTolerance  ){
-                 (void) fprintf( data.getLog(), "Parameters based on error tolerance and OpenMM algorithm.\n" );
-            } else {
-                 // for comparison, also report what the error-tolerance based routine would have chosen
-                 double alphaT;
-                 int xsizeT, ysizeT, zsizeT;
-                 NonbondedForceImpl::calcPMEParameters(system, nb, alphaT, xsizeT, ysizeT, zsizeT);
-                 double impliedTolerance  = alpha*force.getCutoffDistance();
-                        impliedTolerance  = 0.5*exp( -(impliedTolerance*impliedTolerance) );
-                 (void) fprintf( data.getLog(), "Using input parameters implied tolerance=%12.3e;", impliedTolerance );
-                 (void) fprintf( data.getLog(), "OpenMM param: aEwald=%12.3f [%6d %6d %6d]\n", alphaT, xsizeT, ysizeT, zsizeT);
-            }
-            (void) fprintf( data.getLog(), "\n" );
-            (void) fflush( data.getLog() );
-        }
-        data.setApplyMultipoleCutoff( 1 );
-        data.cudaPlatformData.nonbondedMethod = PARTICLE_MESH_EWALD;
-        amoebaGpuContext amoebaGpu            = data.getAmoebaGpu();
-        gpuContext gpu                        = amoebaGpu->gpuContext;
-        gpu->sim.nonbondedCutoffSqr           = static_cast<float>(force.getCutoffDistance()*force.getCutoffDistance());
-        gpu->sim.nonbondedMethod              = PARTICLE_MESH_EWALD;
-    }
-    data.getAmoebaGpu()->gpuContext->forces.push_back(new ForceInfo(force));
-}
-/**
- * Run the multipole force calculation through computeAmoebaMultipoleForce().
- *
- * @param context        the context in which to execute this kernel (not read here)
- * @param includeForces  not read by this implementation
- * @param includeEnergy  not read by this implementation
- * @return always 0.0
- */
-double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy)
-{
-    computeAmoebaMultipoleForce(data);
-    return 0.0;
-}
-/**
- * Obtain the electrostatic potential at a set of points by forwarding to
- * computeAmoebaMultipolePotential().
- *
- * @param context                       the context (not read here)
- * @param inputGrid                     points at which the potential is requested
- * @param outputElectrostaticPotential  receives the computed values
- */
-void CudaCalcAmoebaMultipoleForceKernel::getElectrostaticPotential(ContextImpl& context,  const std::vector< Vec3 >& inputGrid,
-                                                                   std::vector< double >& outputElectrostaticPotential)
-{
-    computeAmoebaMultipolePotential(data, inputGrid, outputElectrostaticPotential);
-}
-/**
- * Obtain the system multipole moments by forwarding to
- * computeAmoebaSystemMultipoleMoments().
- *
- * @param context                 the context (not read here)
- * @param outputMultipoleMonents  receives the computed values
- */
-void CudaCalcAmoebaMultipoleForceKernel::getSystemMultipoleMoments(ContextImpl& context, std::vector< double >& outputMultipoleMonents)
-{
-    computeAmoebaSystemMultipoleMoments(data, outputMultipoleMonents);
-}
-/* -------------------------------------------------------------------------- *
- *                       AmoebaGeneralizedKirkwood                            *
- * -------------------------------------------------------------------------- */
-/**
- * Force metadata for AmoebaGeneralizedKirkwoodForce: two particles are identical when
- * the three values returned by getParticleParameters() are all equal.
- */
-class CudaCalcAmoebaGeneralizedKirkwoodForceKernel::ForceInfo : public CudaForceInfo {
-public:
-    ForceInfo(const AmoebaGeneralizedKirkwoodForce& force) : force(force) {
-    }
-    bool areParticlesIdentical(int particle1, int particle2) {
-        double chargeA, radiusA, scaleA;
-        double chargeB, radiusB, scaleB;
-        force.getParticleParameters(particle1, chargeA, radiusA, scaleA);
-        force.getParticleParameters(particle2, chargeB, radiusB, scaleB);
-        if (!(chargeA == chargeB))
-            return false;
-        if (!(radiusA == radiusB))
-            return false;
-        return scaleA == scaleB;
-    }
-private:
-    const AmoebaGeneralizedKirkwoodForce& force;
-};
-/** Construct the kernel and add it to the kernel count kept by the shared AmoebaCudaData. */
-CudaCalcAmoebaGeneralizedKirkwoodForceKernel::CudaCalcAmoebaGeneralizedKirkwoodForceKernel(std::string name, const Platform& platform, AmoebaCudaData& data, System& system)
-    : CalcAmoebaGeneralizedKirkwoodForceKernel(name, platform),
-      data(data),
-      system(system)
-{
-    data.incrementKernelCount();
-}
-/** Remove this kernel from the kernel count kept by the shared AmoebaCudaData. */
-CudaCalcAmoebaGeneralizedKirkwoodForceKernel::~CudaCalcAmoebaGeneralizedKirkwoodForceKernel()
-{
-    data.decrementKernelCount();
-}
-/**
- * Flag generalized Kirkwood as active and pass single-precision copies of the
- * per-particle and global parameters to the Grycuk or OBC setup routine, depending
- * on data.getUseGrycuk().
- *
- * @param system     the System this kernel will be applied to; supplies the particle count
- * @param force      the AmoebaGeneralizedKirkwoodForce this kernel will be used for
- */
-void CudaCalcAmoebaGeneralizedKirkwoodForceKernel::initialize(const System& system, const AmoebaGeneralizedKirkwoodForce& force) {
-    data.setHasAmoebaGeneralizedKirkwood( true );
-
-    // per-particle parameters
-    const int particleCount = system.getNumParticles();
-    std::vector<float> radius(particleCount);
-    std::vector<float> scale(particleCount);
-    std::vector<float> charge(particleCount);
-    for (int p = 0; p < particleCount; ++p) {
-        double q, r, s;
-        force.getParticleParameters(p, q, r, s);
-        charge[p] = static_cast<float>(q);
-        radius[p] = static_cast<float>(r);
-        scale[p]  = static_cast<float>(s);
-    }
-
-    // global parameters, shared by both setup routines
-    const float soluteDielectric  = static_cast<float>(force.getSoluteDielectric());
-    const float solventDielectric = static_cast<float>(force.getSolventDielectric());
-    const float probeRadius       = static_cast<float>(force.getProbeRadius());
-    const float surfaceAreaFactor = static_cast<float>(force.getSurfaceAreaFactor());
-
-    if (data.getUseGrycuk()) {
-        gpuSetAmoebaGrycukParameters(data.getAmoebaGpu(), soluteDielectric, solventDielectric,
-                                     radius, scale, charge,
-                                     force.getIncludeCavityTerm(),
-                                     probeRadius, surfaceAreaFactor);
-    } else {
-        gpuSetAmoebaObcParameters(data.getAmoebaGpu(), soluteDielectric, solventDielectric,
-                                  radius, scale, charge,
-                                  force.getIncludeCavityTerm(),
-                                  probeRadius, surfaceAreaFactor);
-    }
-    data.getAmoebaGpu()->gpuContext->forces.push_back(new ForceInfo(force));
-}
-/**
- * Does no work of its own: the generalized Kirkwood calculation is handled in
- * computeAmoebaMultipoleForce().
- *
- * @param context        the context in which to execute this kernel (not read here)
- * @param includeForces  not read by this implementation
- * @param includeEnergy  not read by this implementation
- * @return always 0.0
- */
-double CudaCalcAmoebaGeneralizedKirkwoodForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy)
-{
-    return 0.0;
-}
-/**
- * Launch the AMOEBA buffered 14-7 van der Waals force calculation.
- *
- * @param data  shared AMOEBA CUDA state; initializeGpu() is called on it and its
- *              neighbor-list flag is forwarded to the calculation
- */
-static void computeAmoebaVdwForce( AmoebaCudaData& data )
-{
-    amoebaGpuContext amoebaGpu = data.getAmoebaGpu();
-
-    data.initializeGpu();
-
-    kCalculateAmoebaVdw14_7Forces( amoebaGpu, data.getUseVdwNeighborList() );
-}
-/* -------------------------------------------------------------------------- *
- *                           AmoebaVdw                                        *
- * -------------------------------------------------------------------------- */
-/**
- * Force metadata for AmoebaVdwForce: two particles are identical when their sigma,
- * epsilon and reduction values are equal. The integer index returned alongside them
- * is not compared.
- */
-class CudaCalcAmoebaVdwForceKernel::ForceInfo : public CudaForceInfo {
-public:
-    ForceInfo(const AmoebaVdwForce& force) : force(force) {
-    }
-    bool areParticlesIdentical(int particle1, int particle2) {
-        int    indexA, indexB;
-        double sigmaA, epsilonA, reductionA;
-        double sigmaB, epsilonB, reductionB;
-        force.getParticleParameters(particle1, indexA, sigmaA, epsilonA, reductionA);
-        force.getParticleParameters(particle2, indexB, sigmaB, epsilonB, reductionB);
-        if (!(sigmaA == sigmaB))
-            return false;
-        if (!(epsilonA == epsilonB))
-            return false;
-        return reductionA == reductionB;
-    }
-private:
-    const AmoebaVdwForce& force;
-};
-/** Construct the kernel and add it to the kernel count kept by the shared AmoebaCudaData. */
-CudaCalcAmoebaVdwForceKernel::CudaCalcAmoebaVdwForceKernel(std::string name, const Platform& platform, AmoebaCudaData& data, System& system)
-    : CalcAmoebaVdwForceKernel(name, platform),
-      data(data),
-      system(system)
-{
-    data.incrementKernelCount();
-}
-/** Remove this kernel from the kernel count kept by the shared AmoebaCudaData. */
-CudaCalcAmoebaVdwForceKernel::~CudaCalcAmoebaVdwForceKernel()
-{
-    data.decrementKernelCount();
-}
-/**
- * Pass the per-particle van der Waals parameters, exclusions, combining rules and
- * cutoff settings to the GPU setup routine, then record whether a neighbor list is
- * used and the dispersion-correction coefficient.
- *
- * @param system     the System this kernel will be applied to; supplies the particle count
- * @param force      the AmoebaVdwForce this kernel will be used for
- */
-void CudaCalcAmoebaVdwForceKernel::initialize(const System& system, const AmoebaVdwForce& force) {
-    // per-particle parameters
-    const int particleCount = system.getNumParticles();
-    std::vector<int>                indexIVs(particleCount);
-    std::vector< std::vector<int> > allExclusions(particleCount);
-    std::vector<float>              sigmas(particleCount);
-    std::vector<float>              epsilons(particleCount);
-    std::vector<float>              reductions(particleCount);
-    for (int p = 0; p < particleCount; ++p) {
-        int    parentIndex;
-        double sigmaD, epsilonD, reductionD;
-        std::vector<int> particleExclusions;
-        force.getParticleParameters(p, parentIndex, sigmaD, epsilonD, reductionD);
-        force.getParticleExclusions(p, particleExclusions);
-        // append this particle's exclusions to its slot
-        allExclusions[p].insert(allExclusions[p].end(), particleExclusions.begin(), particleExclusions.end());
-        indexIVs[p]   = parentIndex;
-        sigmas[p]     = static_cast<float>(sigmaD);
-        epsilons[p]   = static_cast<float>(epsilonD);
-        reductions[p] = static_cast<float>(reductionD);
-    }
-
-    bool useCutoff = (force.getNonbondedMethod() == AmoebaVdwForce::CutoffPeriodic);
-    gpuSetAmoebaVdwParameters(data.getAmoebaGpu(), indexIVs, sigmas, epsilons, reductions,
-                              force.getSigmaCombiningRule(), force.getEpsilonCombiningRule(),
-                              allExclusions, useCutoff, static_cast<float>(force.getCutoff()));
-    data.getAmoebaGpu()->gpuContext->forces.push_back(new ForceInfo(force));
-
-    if (data.getLog()) {
-        (void) fprintf(data.getLog(), "CudaCalcAmoebaVdwForceKernel useCutoff=%d\n", useCutoff);
-    }
-
-    // a neighbor list is used for every nonbonded method except NoCutoff
-    data.setUseVdwNeighborList(force.getNonbondedMethod() != AmoebaVdwForce::NoCutoff);
-
-    // a coefficient of 0.0 means that no dispersion correction is requested
-    if (!force.getUseDispersionCorrection()) {
-        data.dispersionCoefficient = 0.0;
-    } else {
-        data.dispersionCoefficient = AmoebaVdwForceImpl::calcDispersionCorrection(system, force);
-    }
-}
-/**
- * Run the van der Waals force calculation and report the dispersion-correction term.
- *
- * @param context        the context in which to execute this kernel (not read here)
- * @param includeForces  not read by this implementation
- * @param includeEnergy  not read by this implementation
- * @return the stored dispersion coefficient divided by the product of the three
- *         periodic box sizes, or 0.0 when the coefficient is zero
- */
-double CudaCalcAmoebaVdwForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
-    _gpuContext* gpu = data.cudaPlatformData.gpu;
-    computeAmoebaVdwForce(data);
-
-    // no dispersion correction requested
-    if (data.dispersionCoefficient == 0.0)
-        return 0.0;
-
-    return data.dispersionCoefficient/(gpu->sim.periodicBoxSizeX*gpu->sim.periodicBoxSizeY*gpu->sim.periodicBoxSizeZ);
-}
-/* -------------------------------------------------------------------------- *
- *                           AmoebaWcaDispersion                              *
- * -------------------------------------------------------------------------- */
-/**
- * Launch the AMOEBA WCA dispersion force calculation.
- *
- * @param data  shared AMOEBA CUDA state; initializeGpu() is called on it first
- */
-static void computeAmoebaWcaDispersionForce( AmoebaCudaData& data )
-{
-    data.initializeGpu();
-
-    // diagnostic output, disabled by the constant 0 in the condition
-    if( 0 && data.getLog() ){
-        (void) fprintf( data.getLog(), "Calling computeAmoebaWcaDispersionForce  " );
-        (void) fflush( data.getLog() );
-    }
-
-    kCalculateAmoebaWcaDispersionForces( data.getAmoebaGpu() );
-
-    // diagnostic output, disabled by the constant 0 in the condition
-    if( 0 && data.getLog() ){
-        (void) fprintf( data.getLog(), " -- completed\n" );
-        (void) fflush( data.getLog() );
-    }
-}
-/**
- * Force metadata for AmoebaWcaDispersionForce: two particles are identical when
- * their radius and epsilon values are equal.
- */
-class CudaCalcAmoebaWcaDispersionForceKernel::ForceInfo : public CudaForceInfo {
-public:
-    ForceInfo(const AmoebaWcaDispersionForce& force) : force(force) {
-    }
-    bool areParticlesIdentical(int particle1, int particle2) {
-        double radiusA, epsilonA;
-        double radiusB, epsilonB;
-        force.getParticleParameters(particle1, radiusA, epsilonA);
-        force.getParticleParameters(particle2, radiusB, epsilonB);
-        if (!(radiusA == radiusB))
-            return false;
-        return epsilonA == epsilonB;
-    }
-private:
-    const AmoebaWcaDispersionForce& force;
-};
-/** Construct the kernel and add it to the kernel count kept by the shared AmoebaCudaData. */
-CudaCalcAmoebaWcaDispersionForceKernel::CudaCalcAmoebaWcaDispersionForceKernel(std::string name, const Platform& platform, AmoebaCudaData& data, System& system)
-    : CalcAmoebaWcaDispersionForceKernel(name, platform),
-      data(data),
-      system(system)
-{
-    data.incrementKernelCount();
-}
-/** Remove this kernel from the kernel count kept by the shared AmoebaCudaData. */
-CudaCalcAmoebaWcaDispersionForceKernel::~CudaCalcAmoebaWcaDispersionForceKernel()
-{
-    data.decrementKernelCount();
-}
-/**
- * Pass single-precision copies of the per-particle radii and epsilons, the total
- * maximum dispersion energy, and the force's global parameters to the GPU setup routine.
- *
- * @param system     the System this kernel will be applied to; supplies the particle count
- * @param force      the AmoebaWcaDispersionForce this kernel will be used for
- */
-void CudaCalcAmoebaWcaDispersionForceKernel::initialize(const System& system, const AmoebaWcaDispersionForce& force) {
-    // per-particle parameters
-    const int particleCount = system.getNumParticles();
-    std::vector<float> radii(particleCount);
-    std::vector<float> epsilons(particleCount);
-    for (int p = 0; p < particleCount; ++p) {
-        double radiusD, epsilonD;
-        force.getParticleParameters(p, radiusD, epsilonD);
-        radii[p]    = static_cast<float>(radiusD);
-        epsilons[p] = static_cast<float>(epsilonD);
-    }
-
-    // global parameters
-    const float totalMaximumDispersionEnergy = static_cast<float>(AmoebaWcaDispersionForceImpl::getTotalMaximumDispersionEnergy(force));
-    const float epso    = static_cast<float>(force.getEpso());
-    const float epsh    = static_cast<float>(force.getEpsh());
-    const float rmino   = static_cast<float>(force.getRmino());
-    const float rminh   = static_cast<float>(force.getRminh());
-    const float awater  = static_cast<float>(force.getAwater());
-    const float shctd   = static_cast<float>(force.getShctd());
-    const float dispoff = static_cast<float>(force.getDispoff());
-
-    gpuSetAmoebaWcaDispersionParameters(data.getAmoebaGpu(), radii, epsilons, totalMaximumDispersionEnergy,
-                                        epso, epsh, rmino, rminh, awater, shctd, dispoff);
-    data.getAmoebaGpu()->gpuContext->forces.push_back(new ForceInfo(force));
-}
-/**
- * Run the WCA dispersion force calculation through computeAmoebaWcaDispersionForce().
- *
- * @param context        the context in which to execute this kernel (not read here)
- * @param includeForces  not read by this implementation
- * @param includeEnergy  not read by this implementation
- * @return always 0.0
- */
-double CudaCalcAmoebaWcaDispersionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy)
-{
-    computeAmoebaWcaDispersionForce(data);
-    return 0.0;
-}
--- a/plugins/amoeba/platforms/cuda-old/src/AmoebaCudaKernels.h
+++ b/plugins/amoeba/platforms/cuda-old/src/AmoebaCudaKernels.h
-#ifndef AMOEBA_OPENMM_CUDAKERNELS_H_
-#define AMOEBA_OPENMM_CUDAKERNELS_H_
-/* -------------------------------------------------------------------------- *
- *                              OpenMMAmoeba                                  *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2008 Stanford University and the Authors.           *
- * Authors:                                                                   *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-#include "openmm/amoebaKernels.h"
-#include "CudaKernels.h"
-#include "openmm/kernels.h"
-#include "openmm/System.h"
-#include "AmoebaCudaData.h"
-namespace OpenMM {
-/**
- * This kernel is invoked by AmoebaBondForce to calculate the forces acting on the system and the energy of the system.
- *
- * It is declared as a subclass of CalcAmoebaBondForceKernel. The constructor receives the kernel name,
- * the Platform, and references to an AmoebaCudaData and a System; the class stores an AmoebaCudaData&
- * and a System& (presumably those same constructor arguments -- confirm against the implementation).
- */
-class CudaCalcAmoebaBondForceKernel : public CalcAmoebaBondForceKernel {
-public:
-    CudaCalcAmoebaBondForceKernel(std::string name, 
-                                          const Platform& platform,
-                                          AmoebaCudaData& data,
-                                          System& system);
-    ~CudaCalcAmoebaBondForceKernel();
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param force      the AmoebaBondForce this kernel will be used for
-     */
-    void initialize(const System& system, const AmoebaBondForce& force);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-private:
-    class ForceInfo;        // forward declaration only; not defined in this header
-    int numBonds;           // NOTE(review): presumably the number of bonds in the force -- confirm in the .cpp
-    AmoebaCudaData& data;   // held by reference, not by value
-    System& system;         // held by reference, not by value
-};
-/**
- * This kernel is invoked by AmoebaAngleForce to calculate the forces acting on the system and the energy of the system.
- *
- * It is declared as a subclass of CalcAmoebaAngleForceKernel. The constructor receives the kernel name,
- * the Platform, and references to an AmoebaCudaData and a System; the class stores an AmoebaCudaData&
- * and a System& (presumably those same constructor arguments -- confirm against the implementation).
- */
-class CudaCalcAmoebaAngleForceKernel : public CalcAmoebaAngleForceKernel {
-public:
-    CudaCalcAmoebaAngleForceKernel(std::string name, const Platform& platform, AmoebaCudaData& data, System& system);
-    ~CudaCalcAmoebaAngleForceKernel();
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param force      the AmoebaAngleForce this kernel will be used for
-     */
-    void initialize(const System& system, const AmoebaAngleForce& force);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-private:
-    class ForceInfo;        // forward declaration only; not defined in this header
-    int numAngles;          // NOTE(review): presumably the number of angles in the force -- confirm in the .cpp
-    AmoebaCudaData& data;   // held by reference, not by value
-    System& system;         // held by reference, not by value
-};
-/**
- * This kernel is invoked by AmoebaInPlaneAngleForce to calculate the forces acting on the system and the energy of the system.
- *
- * It is declared as a subclass of CalcAmoebaInPlaneAngleForceKernel. The constructor receives the kernel name,
- * the Platform, and references to an AmoebaCudaData and a System; the class stores an AmoebaCudaData&
- * and a System& (presumably those same constructor arguments -- confirm against the implementation).
- */
-class CudaCalcAmoebaInPlaneAngleForceKernel : public CalcAmoebaInPlaneAngleForceKernel {
-public:
-    CudaCalcAmoebaInPlaneAngleForceKernel(std::string name, const Platform& platform, AmoebaCudaData& data, System& system);
-    ~CudaCalcAmoebaInPlaneAngleForceKernel();
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param force      the AmoebaInPlaneAngleForce this kernel will be used for
-     */
-    void initialize(const System& system, const AmoebaInPlaneAngleForce& force);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-private:
-    class ForceInfo;        // forward declaration only; not defined in this header
-    int numAngles;          // NOTE(review): presumably the number of in-plane angles in the force -- confirm in the .cpp
-    AmoebaCudaData& data;   // held by reference, not by value
-    System& system;         // held by reference, not by value
-};
-/**
- * This kernel is invoked by AmoebaPiTorsionForce to calculate the forces acting on the system and the energy of the system.
- *
- * It is declared as a subclass of CalcAmoebaPiTorsionForceKernel. The constructor receives the kernel name,
- * the Platform, and references to an AmoebaCudaData and a System; the class stores an AmoebaCudaData&
- * and a System& (presumably those same constructor arguments -- confirm against the implementation).
- */
-class CudaCalcAmoebaPiTorsionForceKernel : public CalcAmoebaPiTorsionForceKernel {
-public:
-    CudaCalcAmoebaPiTorsionForceKernel(std::string name, const Platform& platform, AmoebaCudaData& data, System& system);
-    ~CudaCalcAmoebaPiTorsionForceKernel();
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param force      the AmoebaPiTorsionForce this kernel will be used for
-     */
-    void initialize(const System& system, const AmoebaPiTorsionForce& force);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-private:
-    class ForceInfo;        // forward declaration only; not defined in this header
-    int numPiTorsions;      // NOTE(review): presumably the number of pi-torsions in the force -- confirm in the .cpp
-    AmoebaCudaData& data;   // held by reference, not by value
-    System& system;         // held by reference, not by value
-};
-/**
- * This kernel is invoked by AmoebaStretchBendForce to calculate the forces acting on the system and the energy of the system.
- *
- * It is declared as a subclass of CalcAmoebaStretchBendForceKernel. The constructor receives the kernel name,
- * the Platform, and references to an AmoebaCudaData and a System; the class stores an AmoebaCudaData&
- * and a System& (presumably those same constructor arguments -- confirm against the implementation).
- */
-class CudaCalcAmoebaStretchBendForceKernel : public CalcAmoebaStretchBendForceKernel {
-public:
-    CudaCalcAmoebaStretchBendForceKernel(std::string name, const Platform& platform, AmoebaCudaData& data, System& system);
-    ~CudaCalcAmoebaStretchBendForceKernel();
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param force      the AmoebaStretchBendForce this kernel will be used for
-     */
-    void initialize(const System& system, const AmoebaStretchBendForce& force);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-private:
-    class ForceInfo;        // forward declaration only; not defined in this header
-    int numStretchBends;    // NOTE(review): presumably the number of stretch-bends in the force -- confirm in the .cpp
-    AmoebaCudaData& data;   // held by reference, not by value
-    System& system;         // held by reference, not by value
-};
-/**
- * This kernel is invoked by AmoebaOutOfPlaneBendForce to calculate the forces acting on the system and the energy of the system.
- *
- * It is declared as a subclass of CalcAmoebaOutOfPlaneBendForceKernel. The constructor receives the kernel name,
- * the Platform, and references to an AmoebaCudaData and a System; the class stores an AmoebaCudaData&
- * and a System& (presumably those same constructor arguments -- confirm against the implementation).
- */
-class CudaCalcAmoebaOutOfPlaneBendForceKernel : public CalcAmoebaOutOfPlaneBendForceKernel {
-public:
-    CudaCalcAmoebaOutOfPlaneBendForceKernel(std::string name, const Platform& platform, AmoebaCudaData& data, System& system);
-    ~CudaCalcAmoebaOutOfPlaneBendForceKernel();
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param force      the AmoebaOutOfPlaneBendForce this kernel will be used for
-     */
-    void initialize(const System& system, const AmoebaOutOfPlaneBendForce& force);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-private:
-    class ForceInfo;         // forward declaration only; not defined in this header
-    int numOutOfPlaneBends;  // NOTE(review): presumably the number of out-of-plane bends in the force -- confirm in the .cpp
-    AmoebaCudaData& data;    // held by reference, not by value
-    System& system;          // held by reference, not by value
-};
-/**
- * This kernel is invoked by AmoebaTorsionTorsionForce to calculate the forces acting on the system and the energy of the system.
- *
- * It is declared as a subclass of CalcAmoebaTorsionTorsionForceKernel. The constructor receives the kernel name,
- * the Platform, and references to an AmoebaCudaData and a System; the class stores an AmoebaCudaData&
- * and a System& (presumably those same constructor arguments -- confirm against the implementation).
- */
-class CudaCalcAmoebaTorsionTorsionForceKernel : public CalcAmoebaTorsionTorsionForceKernel {
-public:
-    CudaCalcAmoebaTorsionTorsionForceKernel(std::string name, const Platform& platform, AmoebaCudaData& data, System& system);
-    ~CudaCalcAmoebaTorsionTorsionForceKernel();
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param force      the AmoebaTorsionTorsionForce this kernel will be used for
-     */
-    void initialize(const System& system, const AmoebaTorsionTorsionForce& force);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-private:
-    class ForceInfo;             // forward declaration only; not defined in this header
-    int numTorsionTorsions;      // NOTE(review): presumably the number of torsion-torsion terms -- confirm in the .cpp
-    int numTorsionTorsionGrids;  // NOTE(review): presumably the number of torsion-torsion grids -- confirm in the .cpp
-    AmoebaCudaData& data;        // held by reference, not by value
-    System& system;              // held by reference, not by value
-};
-/**
- * This kernel is invoked by AmoebaMultipoleForce to calculate the forces acting on the system and the energy of the system.
- *
- * It is declared as a subclass of CalcAmoebaMultipoleForceKernel. Besides initialize() and execute(), it
- * declares getElectrostaticPotential() and getSystemMultipoleMoments(). The constructor receives the
- * kernel name, the Platform, and references to an AmoebaCudaData and a System; the class stores an
- * AmoebaCudaData& and a System& (presumably those same constructor arguments -- confirm against the
- * implementation).
- */
-class CudaCalcAmoebaMultipoleForceKernel : public CalcAmoebaMultipoleForceKernel {
-public:
-    CudaCalcAmoebaMultipoleForceKernel(std::string name, const Platform& platform, AmoebaCudaData& data, System& system);
-    ~CudaCalcAmoebaMultipoleForceKernel();
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param force      the AmoebaMultipoleForce this kernel will be used for
-     */
-    void initialize(const System& system, const AmoebaMultipoleForce& force);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-    /**
-     * Execute the kernel to calculate the electrostatic potential.
-     *
-     * @param context                       the context in which to execute this kernel
-     * @param inputGrid                     input grid coordinates
-     * @param outputElectrostaticPotential  output potential
-     */
-    void getElectrostaticPotential(ContextImpl& context, const std::vector< Vec3 >& inputGrid,
-                                   std::vector< double >& outputElectrostaticPotential );
-   /**
-     * Get the system multipole moments.
-     *
-     * @param context                 the context
-     * @param outputMultipoleMonents  output vector of moments (parameter name is spelled this way in the
-     *                                signature): (charge,
-     *                                dipole_x, dipole_y, dipole_z,
-     *                                quadrupole_xx, quadrupole_xy, quadrupole_xz,
-     *                                quadrupole_yx, quadrupole_yy, quadrupole_yz,
-     *                                quadrupole_zx, quadrupole_zy, quadrupole_zz )
-     */
-    void getSystemMultipoleMoments( ContextImpl& context, std::vector< double >& outputMultipoleMonents );
-private:
-    class ForceInfo;        // forward declaration only; not defined in this header
-    int numMultipoles;      // NOTE(review): presumably the number of multipoles in the force -- confirm in the .cpp
-    AmoebaCudaData& data;   // held by reference, not by value
-    System& system;         // held by reference, not by value
-};
-/**
- * This kernel is invoked by AmoebaGeneralizedKirkwoodForce to calculate the forces acting on the system and the energy of the system.
- *
- * It is declared as a subclass of CalcAmoebaGeneralizedKirkwoodForceKernel. The constructor receives the
- * kernel name, the Platform, and references to an AmoebaCudaData and a System; the class stores an
- * AmoebaCudaData& and a System& (presumably those same constructor arguments -- confirm against the
- * implementation).
- */
-class CudaCalcAmoebaGeneralizedKirkwoodForceKernel : public CalcAmoebaGeneralizedKirkwoodForceKernel {
-public:
-    CudaCalcAmoebaGeneralizedKirkwoodForceKernel(std::string name, const Platform& platform, AmoebaCudaData& data, System& system);
-    ~CudaCalcAmoebaGeneralizedKirkwoodForceKernel();
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param force      the AmoebaGeneralizedKirkwoodForce this kernel will be used for
-     */
-    void initialize(const System& system, const AmoebaGeneralizedKirkwoodForce& force);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-private:
-    class ForceInfo;        // forward declaration only; not defined in this header
-    AmoebaCudaData& data;   // held by reference, not by value
-    System& system;         // held by reference, not by value
-};
-/**
- * This kernel is invoked to calculate the vdw forces acting on the system and the energy of the system.
- *
- * It is declared as a subclass of CalcAmoebaVdwForceKernel. The constructor receives the kernel name,
- * the Platform, and references to an AmoebaCudaData and a System; the class stores an AmoebaCudaData&
- * and a System& (presumably those same constructor arguments -- confirm against the implementation).
- */
-class CudaCalcAmoebaVdwForceKernel : public CalcAmoebaVdwForceKernel {
-public:
-    CudaCalcAmoebaVdwForceKernel(std::string name, const Platform& platform, AmoebaCudaData& data, System& system);
-    ~CudaCalcAmoebaVdwForceKernel();
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param force      the AmoebaVdwForce this kernel will be used for
-     */
-    void initialize(const System& system, const AmoebaVdwForce& force);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-private:
-    class ForceInfo;        // forward declaration only; not defined in this header
-    AmoebaCudaData& data;   // held by reference, not by value
-    System& system;         // held by reference, not by value
-};
-/**
- * This kernel is invoked to calculate the WCA dispersion forces acting on the system and the energy of the system.
- *
- * It is declared as a subclass of CalcAmoebaWcaDispersionForceKernel. The constructor receives the kernel
- * name, the Platform, and references to an AmoebaCudaData and a System; the class stores an
- * AmoebaCudaData& and a System& (presumably those same constructor arguments -- confirm against the
- * implementation).
- */
-class CudaCalcAmoebaWcaDispersionForceKernel : public CalcAmoebaWcaDispersionForceKernel {
-public:
-    CudaCalcAmoebaWcaDispersionForceKernel(std::string name, const Platform& platform, AmoebaCudaData& data, System& system);
-    ~CudaCalcAmoebaWcaDispersionForceKernel();
-    /**
-     * Initialize the kernel.
-     *
-     * @param system     the System this kernel will be applied to
-     * @param force      the AmoebaWcaDispersionForce this kernel will be used for
-     */
-    void initialize(const System& system, const AmoebaWcaDispersionForce& force);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-private:
-    class ForceInfo;        // forward declaration only; not defined in this header
-    AmoebaCudaData& data;   // held by reference, not by value
-    System& system;         // held by reference, not by value
-};
-} // namespace OpenMM
-#endif /*AMOEBA_OPENMM_CUDAKERNELS_H_*/
--- a/plugins/amoeba/platforms/cuda-old/src/kernels/amoebaCudaGpu.cpp
+++ b/plugins/amoeba/platforms/cuda-old/src/kernels/amoebaCudaGpu.cpp
--- a/plugins/amoeba/platforms/cuda-old/src/kernels/amoebaCudaKernels.h
+++ b/plugins/amoeba/platforms/cuda-old/src/kernels/amoebaCudaKernels.h
-#ifndef __AMOEBA_GPU_TYPES_H__
-#define __AMOEBA_GPU_TYPES_H__
-/* -------------------------------------------------------------------------- *
- *                             OpenMMAmoeba                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-#include "amoebaGpuTypes.h"
-#include "openmm/Vec3.h"
-#include <string>
-#include <vector>
-// Convenience container aliases used by the file-reading and array-dumping helpers declared below.
-typedef std::vector<std::string> StringVector;
-typedef std::vector<StringVector> StringVectorVector;
-// Single-precision math shorthands.
-#define SQRT sqrtf
-#define EXP  expf
-// Dot product of the first three elements of two indexable operands.
-// Each argument is parenthesized before subscripting so that expression arguments
-// such as DOT3(p + 1, q) index the intended operand instead of expanding to p + 1[0].
-#define DOT3(u,v) (((u)[0])*((v)[0]) + ((u)[1])*((v)[1]) + ((u)[2])*((v)[2]))
-typedef std::vector<std::vector<double> > VectorOfDoubleVectors;
-/* Declarations of the host-callable AMOEBA CUDA entry points and utility routines. Every function
- * here is only declared; the definitions live in other translation units. Most take an
- * amoebaGpuContext (kClearBornSum takes a gpuContext). Many kernels come with a
- * Set...Sim / Get...Sim pair -- NOTE(review): presumably these copy the simulation constants
- * to and from the device; confirm against the definitions.
- *
- * local (bond) forces
- */
-extern void SetCalculateAmoebaLocalForcesSim(amoebaGpuContext gpu);
-extern void GetCalculateAmoebaLocalForcesSim(amoebaGpuContext gpu);
-extern void kCalculateAmoebaLocalForces(amoebaGpuContext gpu);
-// multipole forces
-extern void SetCalculateAmoebaMultipoleForcesSim(amoebaGpuContext gpu);
-extern void GetCalculateAmoebaMultipoleForcesSim(amoebaGpuContext gpu);
-extern void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool performGk );
-extern void kSetupAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaGeneralizedKirkwood );
-// multipole potential
-extern void SetCalculateAmoebaMultipolePotentialSim(amoebaGpuContext gpu);
-extern void GetCalculateAmoebaMultipolePotentialSim(amoebaGpuContext gpu);
-extern void kCalculateAmoebaMultipolePotential(amoebaGpuContext amoebaGpu );
-// system multipole moments
-extern void kCalculateAmoebaSystemMultipoleMoments(amoebaGpuContext amoebaGpu, std::vector< double >& outputMultipoleMonents );
-// vdw
-extern void SetCalculateAmoebaCudaVdw14_7Sim(amoebaGpuContext gpu);
-extern void GetCalculateAmoebaCudaVdw14_7Sim(amoebaGpuContext gpu);
-extern void kCalculateAmoebaVdw14_7Forces(amoebaGpuContext amoebaGpu, int applyCutoff );
-// wca dispersion
-extern void SetCalculateAmoebaCudaWcaDispersionSim(amoebaGpuContext gpu);
-extern void GetCalculateAmoebaCudaWcaDispersionSim(amoebaGpuContext gpu);
-extern void kCalculateAmoebaWcaDispersionForces(amoebaGpuContext amoebaGpu );
-// fixed electric field -- no cutoff
-extern void SetCalculateAmoebaCudaFixedEFieldSim(amoebaGpuContext gpu);
-extern void GetCalculateAmoebaCudaFixedEFieldSim(amoebaGpuContext gpu);
-extern void cudaComputeAmoebaFixedEField( amoebaGpuContext gpu);
-// fixed electric field -- PME
-extern void SetCalculateAmoebaCudaPmeFixedEFieldSim(amoebaGpuContext gpu);
-extern void GetCalculateAmoebaCudaPmeFixedEFieldSim(amoebaGpuContext gpu);
-extern void cudaComputeAmoebaPmeFixedEField( amoebaGpuContext gpu);
-// fixed electric field and Gk
-extern void SetCalculateAmoebaCudaFixedEAndGKFieldsSim(amoebaGpuContext gpu);
-extern void GetCalculateAmoebaCudaFixedEAndGKFieldsSim(amoebaGpuContext gpu);
-extern void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext gpu);
-// mutual induced field (plain and PME variants)
-extern void SetCalculateAmoebaCudaMutualInducedFieldSim(amoebaGpuContext gpu);
-extern void GetCalculateAmoebaCudaMutualInducedFieldSim(amoebaGpuContext gpu);
-extern void cudaComputeAmoebaMutualInducedField( amoebaGpuContext gpu);
-extern void SetCalculateAmoebaCudaPmeMutualInducedFieldSim(amoebaGpuContext gpu);
-extern void GetCalculateAmoebaCudaPmeMutualInducedFieldSim(amoebaGpuContext gpu);
-extern void cudaComputeAmoebaPmeMutualInducedField( amoebaGpuContext gpu);
-// mutual induced and Gk
-extern void SetCalculateAmoebaCudaMutualInducedAndGkFieldsSim(amoebaGpuContext amoebaGpu);
-extern void GetCalculateAmoebaCudaMutualInducedAndGkFieldsSim(amoebaGpuContext amoebaGpu);
-extern void cudaComputeAmoebaMutualInducedAndGkField( amoebaGpuContext gpu);
-extern void cudaComputeAmoebaLabFrameMoments( amoebaGpuContext amoebaGpu );
-extern void cudaWriteFloat4AndFloat1ArraysToFile( int numberOfAtoms, const std::string& fname, int timestep, int entriesPerAtom1, CUDAStream<float4>* array1, 
-                                                  int entriesPerAtom2, CUDAStream<float>* array2 );
-extern void SetCalculateAmoebaElectrostaticSim( amoebaGpuContext amoebaGpu );
-extern void GetCalculateAmoebaElectrostaticSim( amoebaGpuContext amoebaGpu );
-extern void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueToForce );
-extern void cudaComputeAmoebaElectrostaticPotential( amoebaGpuContext amoebaGpu );
-extern void SetCalculateAmoebaPmeDirectElectrostaticSim( amoebaGpuContext amoebaGpu );
-extern void GetCalculateAmoebaPmeDirectElectrostaticSim( amoebaGpuContext amoebaGpu );
-extern void cudaComputeAmoebaPmeElectrostatic( amoebaGpuContext amoebaGpu );
-extern void SetCalculateAmoebaCudaMapTorquesSim(amoebaGpuContext gpu);
-extern void GetCalculateAmoebaCudaMapTorquesSim(amoebaGpuContext gpu);
-extern void cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpuContext gpu, CUDAStream<float>* psTorque );
-extern void SetCalculateAmoebaKirkwoodSim( amoebaGpuContext amoebaGpu );
-extern void GetCalculateAmoebaKirkwoodSim( amoebaGpuContext amoebaGpu );
-//extern void cudaComputeAmoebaKirkwood( amoebaGpuContext amoebaGpu );
-extern void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu );
-extern void SetCalculateAmoebaKirkwoodEDiffSim( amoebaGpuContext amoebaGpu );
-extern void GetCalculateAmoebaKirkwoodEDiffSim( amoebaGpuContext amoebaGpu );
-//extern void cudaComputeAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu );
-extern void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu );
-//extern void SetCalculateAmoebaObcGbsaBornSumSim( gpuContext gpu );
-//extern void GetCalculateAmoebaObcGbsaBornSumSim( gpuContext gpu );
-//extern void cudaComputeAmoebaBornRadii( amoebaGpuContext amoebaGpu );
-extern void kCalculateAmoebaGrycukBornRadii( amoebaGpuContext amoebaGpu );
-extern void kReduceGrycukGbsaBornSum( amoebaGpuContext gpu );
-extern void SetCalculateAmoebaGrycukSim(amoebaGpuContext amoebaGpu );
-extern void GetCalculateAmoebaGrycukSim(amoebaGpuContext amoebaGpu );
-extern void kCalculateGrycukGbsaForces2( amoebaGpuContext amoebaGpu );
-// OBC -- Part 1 (the three declarations below are commented out)
-//extern void SetCalculateObcGbsaForces1Sim(gpuContext gpu);
-//extern void GetCalculateObcGbsaForces1Sim(gpuContext gpu);
-//extern void kCalculateObcGbsaForces1(gpuContext gpu);
-extern void SetCalculateAmoebaObcGbsaForces2Sim(amoebaGpuContext amoebaGpu);
-extern void GetCalculateAmoebaObcGbsaForces2Sim(amoebaGpuContext amoebaGpu);
-extern void kCalculateAmoebaObcGbsaForces2(  amoebaGpuContext amoebaGpu );
-extern void  cudaReduceN2ToN( float *N2Array, int N, float *NArray, int includeDiagonal, int offset );
-extern float cudaGetSum( int numberOfElements, CUDAStream<float>* array );
-extern float cudaGetNorm2( int numberOfElements, CUDAStream<float>* array );
-extern int   checkForNansAndInfinities( int numberOfElements, CUDAStream<float>* array );
-extern void cudaWriteFloat1AndFloat1ArraysToFile( int numberOfAtoms, const std::string& fname, std::vector<int>& fileId, int entriesPerAtom1, CUDAStream<float>* array1, 
-                                                  int entriesPerAtom2, CUDAStream<float>* array2 );
-extern void readFile( std::string fileName, StringVectorVector& fileContents );
-extern void cudaLoadCudaFloatArray(  int numberOfParticles, int entriesPerParticle, CUDAStream<float>*  array, VectorOfDoubleVectors& outputVector, int* order, float conversion );
-extern void cudaLoadCudaFloat2Array( int numberOfParticles, int entriesPerParticle, CUDAStream<float2>* array, VectorOfDoubleVectors& outputVector, int* order, float conversion );
-extern void cudaLoadCudaFloat4Array( int numberOfParticles, int entriesPerParticle, CUDAStream<float4>* array, VectorOfDoubleVectors& outputVector, int* order, float conversion );
-extern void cudaWriteVectorOfDoubleVectorsToFile( const std::string& fname, std::vector<int>& fileId, VectorOfDoubleVectors& outputVector );
-extern void initializeCudaFloatArray( int numberOfParticles, int entriesPerParticle, CUDAStream<float>* array, float initValue );
-extern void checkForNans( int numberOfParticles, int entriesPerParticle,
-                          CUDAStream<float>* array, int* order, int iteration, std::string idString, FILE* log );
-extern void checkForNansFloat4( int numberOfParticles, CUDAStream<float4>* array, int* order, int iteration, std::string idString, FILE* log );
-extern void kClearFloat( amoebaGpuContext amoebaGpu, unsigned int entries, CUDAStream<float>* fieldToClear );
-extern void kClearFloat4( amoebaGpuContext amoebaGpu, unsigned int entries, CUDAStream<float4>* fieldToClear );
-extern void kClearFields_1( amoebaGpuContext amoebaGpu );
-extern void kClearFields_3( amoebaGpuContext amoebaGpu, unsigned int numberToClear );
-extern void kClearBornSum(gpuContext gpu);
-extern unsigned int getThreadsPerBlock( amoebaGpuContext amoebaGpu, unsigned int sharedMemoryPerThread, unsigned int sharedMemoryPerBlock );
-//extern int isNanOrInfinity( double number );
-extern void trackMutualInducedIterations( amoebaGpuContext amoebaGpu, int iteration);
-extern void zeroCUDAStreamFloat4( CUDAStream<float4>* streamToCopy );
-extern void reduceAndCopyCUDAStreamFloat4( CUDAStream<float4>* streamToCopy, CUDAStream<float>*  outputStream, float conversion );
-extern void reduceAndCopyCUDAStreamFloat( CUDAStream<float>* streamToCopy, CUDAStream<float>*  outputStream, float conversion );
-// PME
-extern void SetCalculateAmoebaPMESim( amoebaGpuContext amoebaGpu );
-extern void kCalculateAmoebaPMEFixedMultipoles(amoebaGpuContext amoebaGpu);
-extern void kCalculateAmoebaPMEInducedDipoleField(amoebaGpuContext amoebaGpu);
-extern void kCalculateAmoebaPMEInducedDipoleForces(amoebaGpuContext amoebaGpu);
-extern void SetCalculateAmoebaCudaUtilitiesSim( amoebaGpuContext amoebaGpu );
-double getTimeOfDay( void );  // declared without the explicit 'extern' keyword, unlike the declarations above
-#endif //__AMOEBA_GPU_TYPES_H__
--- a/plugins/amoeba/platforms/cuda-old/src/kernels/amoebaCudaTypes.h
+++ b/plugins/amoeba/platforms/cuda-old/src/kernels/amoebaCudaTypes.h
-#ifndef AMOEBA_CUDATYPES_H
-#define AMOEBA_CUDATYPES_H
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-#include <kernels/cudatypes.h>
-#include <stdarg.h>
-#include <limits>
-#include <iostream>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string>
-#include <cuda.h>
-#include <cuda_runtime_api.h>
-#include <cufft.h>
-#include <builtin_types.h>
-#include <vector_functions.h>
-enum CudaAmoebaNonbondedMethod  // unscoped enum with two nonbonded-method choices
-{
-    AMOEBA_NO_CUTOFF,            // first enumerator, so its value is 0
-    AMOEBA_PARTICLE_MESH_EWALD   // value 1
-};
-static const int AMOEBA_PME_ORDER = 5;                   // NOTE(review): presumably the PME interpolation order -- confirm against the PME kernels
-static const int AMOEBA_MAX_TORSION_TORSION_GRIDS = 12;  // NOTE(review): presumably an upper bound on the number of torsion-torsion grids -- confirm
-struct cudaAmoebaGmxSimulation {
-    // Constants
-    unsigned int    amoebaBonds;                    // Number of bonds
-    int4*           pAmoebaBondID;                  // Bond atom and output buffer IDs
-    float2*         pAmoebaBondParameter;           // Bond parameters
-    float           amoebaBondCubicParameter;       // cubic bond parameters
-    float           amoebaBondQuarticicParameter;   // quartic bond parameters
-    unsigned int    amoebaBond_offset;              // Offset to end of bonds
-    unsigned int    amoebaAngles;                   // Number of bond angles
-    int4*           pAmoebaAngleID1;                // Bond angle atom and first output buffer IDs
-    int2*           pAmoebaAngleID2;                // Bond angle output buffer IDs
-    float2*         pAmoebaAngleParameter;          // Bond angle parameters
-    unsigned int    amoebaAngle_offset;             // Offset to end of bond angles
-    float amoebaAngleCubicK;                        // cubic factor
-    float amoebaAngleQuarticK;                      // quartic factor
-    float amoebaAnglePenticK;                       // pentic factor
-    float amoebaAngleSexticK;                       // sextic factor
-    unsigned int    amoebaInPlaneAngles;            // Number of in-plane angles
-    int4*           pAmoebaInPlaneAngleID1;         // Bond angle atom and first output buffer IDs
-    int4*           pAmoebaInPlaneAngleID2;         // Bond angle output buffer IDs
-    float2*         pAmoebaInPlaneAngleParameter;   // Bond angle parameters
-    unsigned int    amoebaInPlaneAngle_offset;      // Offset to end of bond angles
-    float amoebaInPlaneAngleCubicK;                 // cubic factor
-    float amoebaInPlaneAngleQuarticK;               // quartic factor
-    float amoebaInPlaneAnglePenticK;                // pentic factor
-    float amoebaInPlaneAngleSexticK;                // sextic factor
-    unsigned int    amoebaTorsions;                 // Number of torsions
-    int4*           pAmoebaTorsionID1;              // Torsion atom and first output buffer IDs
-    int4*           pAmoebaTorsionID2;              // Torsion output buffer IDs
-    float4*         pAmoebaTorsionParameter1;       // Torsion parameters
-    float2*         pAmoebaTorsionParameter2;       // Torsion parameters
-    unsigned int    amoebaTorsion_offset;           // Offset to end of torsions
-    unsigned int    amoebaPiTorsions;               // Number of torsions
-    int4*           pAmoebaPiTorsionID1;            // PiTorsion atom and first output buffer IDs
-    int4*           pAmoebaPiTorsionID2;            // PiTorsion output buffer IDs
-    int4*           pAmoebaPiTorsionID3;            // PiTorsion output buffer IDs
-    float*          pAmoebaPiTorsionParameter;      // PiTorsion parameters
-    unsigned int    amoebaPiTorsion_offset;         // Offset to end of torsions
-    unsigned int    amoebaStretchBends;             // Number of stretch bends
-    int4*           pAmoebaStretchBendID1;          // stretch bend atoms and first output buffer IDs
-    int2*           pAmoebaStretchBendID2;          // stretch bend output buffer IDs
-    float4*         pAmoebaStretchBendParameter;    // stretch bend parameters
-    unsigned int    amoebaStretchBend_offset;       // Offset to end of stretch bends
-    unsigned int    amoebaOutOfPlaneBends;          // Number of stretch bends
-    int4*           pAmoebaOutOfPlaneBendID1;       // stretch bend atoms and first output buffer IDs
-    int4*           pAmoebaOutOfPlaneBendID2;       // stretch bend output buffer IDs
-    float*          pAmoebaOutOfPlaneBendParameter; // stretch bend parameters
-    unsigned int    amoebaOutOfPlaneBend_offset;    // Offset to end of stretch bends
-    float amoebaOutOfPlaneBendCubicK;               // cubic factor
-    float amoebaOutOfPlaneBendQuarticK;             // quartic factor
-    float amoebaOutOfPlaneBendPenticK;              // pentic factor
-    float amoebaOutOfPlaneBendSexticK;              // sextic factor
-    unsigned int    amoebaTorsionTorsions;          // Number of torsion torsions
-    int4*           pAmoebaTorsionTorsionID1;       // torsion torsion atoms and first output buffer IDs
-    int4*           pAmoebaTorsionTorsionID2;       // torsion torsion output buffer IDs
-    int4*           pAmoebaTorsionTorsionID3;       // torsion torsion parameters
-    unsigned int    amoebaTorsionTorsion_offset;    // Offset to end of torsion torsions
-                                                    // grids
-    int   amoebaTorTorGridOffset[AMOEBA_MAX_TORSION_TORSION_GRIDS];                // grid offset
-    int   amoebaTorTorGridNy[AMOEBA_MAX_TORSION_TORSION_GRIDS];                    // 25
-    float amoebaTorTorGridBegin[AMOEBA_MAX_TORSION_TORSION_GRIDS];                 // -180.0
-    float amoebaTorTorGridDelta[AMOEBA_MAX_TORSION_TORSION_GRIDS];                 // 15.0
-    float4*          pAmoebaTorsionTorsionGrids;    // torsion torsion grids
-    unsigned int    amoebaUreyBradleys;             // Number of UB ixns
-    int4*           pAmoebaUreyBradleyID;           // UreyBradley atom and output buffer IDs
-    float2*         pAmoebaUreyBradleyParameter;    // UreyBradley parameters
-    float           amoebaUreyBradleyCubicParameter;// cubic parameter
-    float           amoebaUreyBradleyQuarticicParameter; // quartic parameter
-    unsigned int    amoebaUreyBradley_offset;       // Offset to end of bonds
-    float sqrtPi;                                   // sqrt(PI)
-    float scalingDistanceCutoff;                    // scaling cutoff
-    float2*         pDampingFactorAndThole;         // Thole & damping factors
-    int polarizationType;                           // polarization type (0=Mutual, 1=Direct)
-    int4*  pMultipoleParticlesIdsAndAxisType; 
-    int4*  pMultipoleParticlesTorqueBufferIndices; 
-    int maxTorqueBufferIndex;
-    float4* pTorqueMapForce4;
-    float* pMolecularDipole; 
-    float* pMolecularQuadrupole; 
-    unsigned int paddedPotentialGridSize;
-    unsigned int potentialGridSize;
-    unsigned int* pPotentialWorkUnit;
-    unsigned int potentialWorkUnits;
-    float4* pPotentialGrid; 
-    float*  pPotential; 
-    float* pLabFrameDipole;
-    float* pLabFrameQuadrupole;
-    float* pInducedDipole;
-    float* pInducedDipolePolar;
-    float* pInducedDipoleS;
-    float* pInducedDipolePolarS;
-    float* pTorque;
-    float* pWorkArray_3_1;
-    float* pWorkArray_3_2;
-    float* pWorkArray_1_1;
-    float* pWorkArray_1_2;
-    int vdwUsePBC;
-    float vdwCutoff;
-    float vdwCutoff2;
-    float vdwTaperCutoff;
-    float vdwTaperCutoff2;
-    float vdwTaperDelta;
-#define VDW_TAPER_TABLE_SIZE 100
-    float vdwTaperTable[VDW_TAPER_TABLE_SIZE+1];
-    float vdw_dTaperTable[VDW_TAPER_TABLE_SIZE+1];
-    unsigned int amoebaVdwNonReductions;
-    int* pAmoebaVdwNonReductionID;
-    unsigned int* pVdwWorkUnit;
-    unsigned int amoebaVdwReductions;
-    int4* pAmoebaVdwReductionID;
-    float* pAmoebaVdwReduction;
-    int* pVdwExclusionIndicesIndex;
-    int* pVdwExclusionIndices;
-    // WCA constants
-    float epso;
-    float epsh;
-    float rmino;
-    float rminh;
-    float awater;
-    float shctd;
-    float dispoff;
-    float totalMaxWcaDispersionEnergy;
-    float2* pWcaDispersionRadiusEpsilon;
-                    // scaling indices
-    int*            pScaleIndicesIndex;
-    int*            pD_ScaleIndices;
-    int2*           pP_ScaleIndices;
-    int2*           pM_ScaleIndices;
-    float electric;   // 3.320637090E+02f;
-    float gkc;        // 2.455f;
-    float dielec;    // 1.0f;
-    float dwater;    // 78.3f;
-    float fc;        // electric * 1.0f * (1.0f-dwater)/(0.0f+1.0f*dwater);
-    float fd;        // electric * 2.0f * (1.0f-dwater)/(1.0f+2.0f*dwater);
-    float fq;        // electric * 3.0f * (1.0f-dwater)/(2.0f+3.0f*dwater);
-    // PME arrays
-    float4* pThetai1;
-    float4* pThetai2;
-    float4* pThetai3;
-    int4* pIgrid;
-    float* pPhi;
-    float* pPhid;
-    float* pPhip;
-    float* pPhidp;
-};
-#endif
--- a/plugins/amoeba/platforms/cuda-old/src/kernels/amoebaGpuTypes.h
+++ b/plugins/amoeba/platforms/cuda-old/src/kernels/amoebaGpuTypes.h
-#ifndef __AMOEBA_GPUTYPES_H__
-#define __AMOEBA_GPUTYPES_H__
-/* -------------------------------------------------------------------------- *
- *                          OpenMMAmoeba                                      *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-#include "kernels/gputypes.h"
-#include "OpenMM.h"
-#include "openmm/Vec3.h"
-#include "amoebaCudaTypes.h"
-#include <map>
-typedef std::map<int,float> MapIntFloat;   // ordered map from an int key to a float value
-typedef MapIntFloat::const_iterator MapIntFloatCI;   // read-only iterator over a MapIntFloat
-struct _amoebaGpuContext {   // AMOEBA context: base GPU context pointer, embedded simulation struct, CUDAStream buffers and host-side settings
-    _gpuContext* gpuContext;   // base GPU context; presumably the one passed to amoebaGpuInit -- confirm
-    cudaAmoebaGmxSimulation amoebaSim;   // embedded by value; the type is declared outside this header
-    FILE* log;   // log stream; NOTE(review): ownership/lifetime is not visible here
-    CUDAStream<int4>*   psAmoebaBondID;   // bond term: ID and parameter streams
-    CUDAStream<float2>* psAmoebaBondParameter;
-    CUDAStream<int4>*   psAmoebaUreyBradleyID;   // Urey-Bradley term: ID and parameter streams
-    CUDAStream<float2>* psAmoebaUreyBradleyParameter;
-    CUDAStream<int4>*   psAmoebaAngleID1;   // angle term: two ID streams plus parameters
-    CUDAStream<int2>*   psAmoebaAngleID2;
-    CUDAStream<float2>* psAmoebaAngleParameter;
-    CUDAStream<int4>*   psAmoebaInPlaneAngleID1;   // in-plane angle term: two ID streams plus parameters
-    CUDAStream<int4>*   psAmoebaInPlaneAngleID2;
-    CUDAStream<float2>* psAmoebaInPlaneAngleParameter;
-    CUDAStream<int4>*   psAmoebaTorsionID1;   // torsion term: two ID streams plus two parameter streams
-    CUDAStream<int4>*   psAmoebaTorsionID2;
-    CUDAStream<float4>* psAmoebaTorsionParameter1;
-    CUDAStream<float2>* psAmoebaTorsionParameter2;
-    CUDAStream<int4>*   psAmoebaPiTorsionID1;   // pi-torsion term: three ID streams plus parameters
-    CUDAStream<int4>*   psAmoebaPiTorsionID2;
-    CUDAStream<int4>*   psAmoebaPiTorsionID3;
-    CUDAStream<float>*  psAmoebaPiTorsionParameter;
-    CUDAStream<int4>*   psAmoebaStretchBendID1;   // stretch-bend term: two ID streams plus parameters
-    CUDAStream<int2>*   psAmoebaStretchBendID2;
-    CUDAStream<float4>* psAmoebaStretchBendParameter;
-    CUDAStream<int4>*   psAmoebaOutOfPlaneBendID1;   // out-of-plane bend term: two ID streams plus parameters
-    CUDAStream<int4>*   psAmoebaOutOfPlaneBendID2;
-    CUDAStream<float>*  psAmoebaOutOfPlaneBendParameter;
-    CUDAStream<int4>*   psAmoebaTorsionTorsionID1;   // torsion-torsion term: three ID streams plus grid data
-    CUDAStream<int4>*   psAmoebaTorsionTorsionID2;
-    CUDAStream<int4>*   psAmoebaTorsionTorsionID3;
-    CUDAStream<float4>* psAmoebaTorsionTorsionGrids;
-    unsigned int workUnits;   // NOTE(review): count of work units; what a unit covers is not visible here
-    // workspace arrays
-    CUDAStream<float>*  psWorkArray_3_1; 
-    CUDAStream<float>*  psWorkArray_3_2; 
-    CUDAStream<float>*  psWorkArray_3_3; 
-    CUDAStream<float>*  psWorkArray_3_4; 
-    CUDAStream<float>*  psWorkArray_1_1; 
-    CUDAStream<float>*  psWorkArray_1_2; 
-    CUDAStream<int>*  psScalingIndicesIndex;   // scaling index streams (int for the D set, int2 for the P and M sets)
-    CUDAStream<int>*  ps_D_ScaleIndices; 
-    CUDAStream<int2>* ps_P_ScaleIndices; 
-    CUDAStream<int2>* ps_M_ScaleIndices; 
-    int maxCovalentDegreeSz;
-    float solventDielectric;
-    // multipole parameters
-    CUDAStream<int4>* psMultipoleParticlesIdsAndAxisType;
-    // buffer indices used for mapping torques onto forces 
-    int torqueMapForce4Delete;   // NOTE(review): int flag by its name; how it is used is not visible here
-    CUDAStream<int4>*    psMultipoleParticlesTorqueBufferIndices;
-    CUDAStream<float4>*  psTorqueMapForce4; 
-    CUDAStream<float>* psMolecularDipole;
-    CUDAStream<float>* psMolecularQuadrupole;
-    CUDAStream<unsigned int>* psPotentialWorkUnit;
-    CUDAStream<float4>* psPotentialGrid;
-    CUDAStream<float>*  psPotential;
-    // lab-frame multipoles (this comment originally said "molecular frame"; the member names say LabFrame -- confirm)
-    CUDAStream<float>* psLabFrameDipole;
-    CUDAStream<float>* psLabFrameQuadrupole;
-    // scaling-related parameters
-    CUDAStream<float2>*  psDampingFactorAndThole;
-    // used to setup scaling constants
-    std::vector<int>    covalentDegree;
-    std::vector<int>    polarizationDegree;
-    // fixed-E field
-    CUDAStream<float>*  psE_Field;
-    CUDAStream<float>*  psE_FieldPolar;
-    int multipoleNonbondedMethod;
-    double cutoffDistance;   // held as double here, unlike the float members around it
-    // mutual induced field
-    int mutualInducedIterativeMethod;
-    int mutualInducedMaxIterations;
-    int mutualInducedConverged;
-    int mutualInducedDone;
-    int epsilonThreadsPerBlock;
-    float mutualInducedTargetEpsilon;
-    float mutualInducedCurrentEpsilon;
-    CUDAStream<float>*  psInducedDipole; 
-    CUDAStream<float>*  psInducedDipolePolar; 
-    CUDAStream<float>*  psPolarizability; 
-    CUDAStream<float>*  psCurrentEpsilon; 
-    // SOR arrays for mutual induced field
-    unsigned int numberOfSorWorkVectors;
-    CUDAStream<float>*  psWorkVector[4];   // fixed capacity of 4; numberOfSorWorkVectors presumably gives the number in use -- confirm
-    // electrostatic
-    CUDAStream<float>*  psTorque; 
-    // Kirkwood fields
-    CUDAStream<float>*  psGk_Field;
-    CUDAStream<float>*  psInducedDipoleS; 
-    CUDAStream<float>*  psInducedDipolePolarS; 
-    CUDAStream<float>*  psBorn; 
-    CUDAStream<float>*  psBornPolar; 
-    int includeObcCavityTerm;
-    // Vdw fields
-    CUDAStream<float2>*  psVdwSigmaEpsilon;
-    CUDAStream<int>*     psAmoebaVdwNonReductionID; 
-    CUDAStream<int4>*    psAmoebaVdwReductionID; 
-    CUDAStream<float>*   psAmoebaVdwReduction; 
-    CUDAStream<float4>*  psAmoebaVdwCoordinates; 
-    CUDAStream<unsigned int>*   psVdwWorkUnit; 
-    CUDAStream<int>* psVdwExclusionIndicesIndex;
-    CUDAStream<int>* psVdwExclusionIndices;
-    int vdwSigmaCombiningRule;   // combining rules are held as ints here
-    int vdwEpsilonCombiningRule;
-    std::vector< std::vector<int> > vdwExclusions;
-    // Wca dispersion fields
-    CUDAStream<float2>*  psWcaDispersionRadiusEpsilon;
-    // PME fields
-    CUDAStream<float4>* psThetai1;
-    CUDAStream<float4>* psThetai2;
-    CUDAStream<float4>* psThetai3;
-    CUDAStream<int4>* psIgrid;
-    CUDAStream<float>* psPhi;
-    CUDAStream<float>* psPhid;
-    CUDAStream<float>* psPhip;
-    CUDAStream<float>* psPhidp;
-};
-typedef struct _amoebaGpuContext *amoebaGpuContext;   // amoebaGpuContext is a POINTER to the struct, so passing it by value shares the same context
-// Function prototypes (each is declared with C linkage via extern "C")
-extern "C"
-amoebaGpuContext amoebaGpuInit( _gpuContext* gpu );   // takes an existing _gpuContext and returns an amoebaGpuContext
-extern "C"
-void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext gpu,  FILE* log );   // takes the context and the FILE* to write to
-extern "C"
-void amoebaGpuShutDown(amoebaGpuContext gpu);   // presumably releases what amoebaGpuInit created -- confirm
-extern "C"
-void amoebaGpuBuildOutputBuffers( amoebaGpuContext gpu, int hasKirkwood );   // hasKirkwood is an int flag
-extern "C"
-int amoebaGpuBuildThreadBlockWorkList( amoebaGpuContext gpu );   // returns int; the meaning of the value is not visible here
-extern "C"
-void amoebaGpuBuildScalingList( amoebaGpuContext gpu );
-extern "C"
-void gpuSetAmoebaBondParameters(amoebaGpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, 
-                                const std::vector<float>& length, const std::vector<float>& k, float cubic, float quartic);   // vectors are presumably parallel (one entry per bond) -- confirm; cubic/quartic are single scalars
-extern "C"
-void gpuSetAmoebaUreyBradleyParameters(amoebaGpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, 
-                                       const std::vector<float>& length, const std::vector<float>& k, float cubic, float quartic);   // same parameter shape as gpuSetAmoebaBondParameters
-extern "C"
-void gpuSetAmoebaAngleParameters(amoebaGpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<int>& atom3,
-                                 const std::vector<float>& angle, const std::vector<float>& k, float cubicK,
-                                 float quarticK, float penticK, float sexticK);   // three atom-index vectors; cubic..sextic factors are single scalars
-extern "C"
-void gpuSetAmoebaInPlaneAngleParameters(amoebaGpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2,
-                                                              const std::vector<int>& atom3, const std::vector<int>& atom4,
-                                        const std::vector<float>& angle, const std::vector<float>& k, float cubicK,
-                                        float quarticK, float penticK, float sexticK);   // four atom-index vectors; otherwise matches the angle setter
-extern "C"
-void gpuSetAmoebaTorsionParameters(amoebaGpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2,
-                                                         const std::vector<int>& atom3, const std::vector<int>& atom4,
-                                   const std::vector< std::vector<float> >& torsion1,
-                                   const std::vector< std::vector<float> >& torsion2,
-                                   const std::vector< std::vector<float> >& torsion3 );   // torsion1..3 are vectors of float vectors; inner layout is not visible here
-extern "C"
-void gpuSetAmoebaPiTorsionParameters(amoebaGpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2,
-                                                           const std::vector<int>& atom3, const std::vector<int>& atom4,
-                                                           const std::vector<int>& atom5, const std::vector<int>& atom6,
-                                                           const std::vector<float>& torsion1 );   // six atom-index vectors and one float parameter vector
-extern "C"
-void gpuSetAmoebaStretchBendParameters(amoebaGpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<int>& atom3,
-                                       const std::vector<float>& lengthAB,
-                                       const std::vector<float>& lengthCB,
-                                       const std::vector<float>& angle,
-                                       const std::vector<float>& k );   // four float vectors: lengthAB, lengthCB, angle, k
-extern "C"
-void gpuSetAmoebaOutOfPlaneBendParameters(amoebaGpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<int>& atom3,
-                                          const std::vector<int>& atom4, const std::vector<float>& k,
-                                          float cubicK, float quarticK, float penticK, float sexticK );   // four atom-index vectors; cubic..sextic factors are single scalars
-extern "C"
-void gpuSetAmoebaTorsionTorsionParameters(amoebaGpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<int>& atom3,
-                                          const std::vector<int>& atom4, const std::vector<int>& atom5, const std::vector<int>& chiralAtomIndex, const std::vector<int>& gridIndex );   // five atom-index vectors plus chiral-atom and grid index vectors
-extern "C"
-void gpuSetAmoebaTorsionTorsionGrids(amoebaGpuContext gpu, const std::vector< std::vector< std::vector< std::vector<float> > > >& floatGrids );   // four-level nested vector of floats
-extern "C"  
-void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vector<float>& charges, const std::vector<float>& dipoles, const std::vector<float>& quadrupoles,
-                                     const std::vector<int>& axisType, const std::vector<int>& multipoleAtomZ, const std::vector<int>& multipoleAtomX,  const std::vector<int>& multipoleAtomY,
-                                     const std::vector<float>& tholes, float scalingDistanceCutoff,const std::vector<float>& dampingFactors, const std::vector<float>& polarity,
-                                     const std::vector< std::vector< std::vector<int> > >& multipoleAtomCovalentInfo, const std::vector<int>& covalentDegree,
-                                     const std::vector<int>& minCovalentIndices,  const std::vector<int>& minCovalentPolarizationIndices, int maxCovalentRange,
-                                     int mutualInducedIterationMethod, int mutualInducedMaxIterations, float mutualInducedTargetEpsilon,
-                                     int nonbondedMethod, int polarizationType, float cutoffDistance,  float alphaEwald );   // note: cutoffDistance is float here, while _amoebaGpuContext declares its cutoffDistance member as double
-extern "C"
-void gpuSetupElectrostaticPotentialCalculation( amoebaGpuContext amoebaGpu, const std::vector< OpenMM::Vec3 >& inputGrid );   // inputGrid is read-only (const reference)
-extern "C"
-void gpuLoadElectrostaticPotential( amoebaGpuContext amoebaGpu, unsigned int gridSize, std::vector< double >& outputElectrostaticPotential );   // non-const reference: the vector is an output parameter
-extern "C"
-void gpuCleanupElectrostaticPotentialCalculation( amoebaGpuContext amoebaGpu );
-extern "C"
-void gpuSetAmoebaObcParameters( amoebaGpuContext amoebaGpu , float innerDielectric, float solventDielectric,
-                                const std::vector<float>& radius, const std::vector<float>& scale, const std::vector<float>& charge,
-                                int includeCavityTerm, float probeRadius, float surfaceAreaFactor);   // includeCavityTerm is an int flag
-extern "C"
-void gpuSetAmoebaGrycukParameters( amoebaGpuContext amoebaGpu , float innerDielectric, float solventDielectric,
-                                   const std::vector<float>& radius, const std::vector<float>& scale, const std::vector<float>& charge,
-                                   int includeCavityTerm, float probeRadius, float surfaceAreaFactor);   // same parameter list as gpuSetAmoebaObcParameters
-extern "C"
-void gpuSetAmoebaVdwParameters( amoebaGpuContext amoebaGpu,
-                                const std::vector<int>& indexIVs, 
-                                const std::vector<float>& sigmas,
-                                const std::vector<float>& epsilons,
-                                const std::vector<float>& reductions,
-                                const std::string& sigmaCombiningRule,
-                                const std::string& epsilonCombiningRule,
-                                const std::vector< std::vector<int> >& allExclusions, int usePBC, float cutoff );   // combining rules arrive as strings; _amoebaGpuContext holds them as ints, so presumably they are mapped -- confirm
-extern "C"
-void gpuSetAmoebaPMEParameters(amoebaGpuContext amoebaGpu, float alpha, int gridSizeX, int gridSizeY, int gridSizeZ);   // one grid size per axis
-extern "C"
-void amoebaGpuBuildVdwExclusionList( amoebaGpuContext amoebaGpu );
-extern "C"
-void gpuSetAmoebaWcaDispersionParameters( amoebaGpuContext amoebaGpu,
-                                const std::vector<float>& radii,
-                                const std::vector<float>& epsilons,
-                                const float totalMaxWcaDisperionEnergy,   // sic: the "Disperion" spelling is the declared parameter name
-                                const float epso, const float epsh, const float rmino, const float rminh,
-                                const float awater, const float shctd, const float dispoff );
-extern "C"
-void amoebaGpuSetConstants(amoebaGpuContext gpu, int updateFlag );   // updateFlag is an int; its accepted values are not visible here
-extern "C"
-void gpuSetAmoebaBondOffsets(amoebaGpuContext gpu);
-#endif //__AMOEBA_GPUTYPES_H__
--- a/plugins/amoeba/platforms/cuda-old/src/kernels/amoebaScaleFactors.h
+++ b/plugins/amoeba/platforms/cuda-old/src/kernels/amoebaScaleFactors.h
-#ifndef __AMOEBA_SCALE_FACTORS_H__
-#define __AMOEBA_SCALE_FACTORS_H__
-/* -------------------------------------------------------------------------- *
- *                          OpenMMAmoeba                                      *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-static __constant__ float mpoleScale[5]   = { 0.0f, 0.0f, 0.0f, 0.4f, 0.8f };   // 5-entry __constant__ table; static, so every translation unit including this header gets its own copy
-static __constant__ float polarScale[5]   = { 0.0f, 0.0f, 0.0f, 1.0f, 1.0f };   // NOTE(review): meaning of the index is not visible here -- presumably a covalent-separation class; confirm
-static __constant__ float directScale[5]  = { 0.0f, 1.0f, 1.0f, 1.0f, 1.0f };   // only entry 0 is zero
-//float mutualScale[5]  = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f };
-// The tables below are disabled (commented out). Original note: must be explicitly initialized!
-//static __constant__ float mScale[4]       = { 0.0f, 0.4f, 0.8f, 1.0f };
-//static __constant__ float pScale[4]       = { 1.0f, 0.5f, 0.0f, -2.0f };
-//static __constant__ float dScale[2]       = { 0.0f, 1.0f };
-//static __constant__ float uScale[5]       = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f };
-// subroutine to get masked scale factors
-// Selects the d-scale factor for one bit position of a packed bit mask.
-//   gridIndex  bit position to test; must be < 32
-//   scaleMask  packed mask, one bit per grid index
-//   dScale     output: 0.0f when bit gridIndex of scaleMask is set, 1.0f otherwise
-__device__ static void getMaskedDScaleFactor( unsigned int gridIndex, int scaleMask, float* dScale )
-{
-    // Shift an unsigned 1: a signed "1 << 31" would shift into the sign bit of int.
-    const unsigned int bit = 1u << gridIndex;
-    *dScale = ( static_cast<unsigned int>( scaleMask ) & bit ) ? 0.0f : 1.0f;
-}
-// Selects the p-scale factor for one bit position of two packed bit masks.
-//   gridIndex  bit position to test; must be < 32
-//   scaleMask  two packed masks (.x and .y), one bit per grid index each
-//   pScale     output:
-//                bit set in scaleMask.y (regardless of .x) -> 0.0f
-//                bit set in scaleMask.x only               -> 0.5f
-//                bit set in neither                        -> 1.0f
-__device__ static void getMaskedPScaleFactor( unsigned int gridIndex, int2 scaleMask, float* pScale )
-{
-    // Shift an unsigned 1: a signed "1 << 31" would shift into the sign bit of int.
-    const unsigned int bit = 1u << gridIndex;
-    const bool xSet        = ( static_cast<unsigned int>( scaleMask.x ) & bit ) != 0;
-    const bool ySet        = ( static_cast<unsigned int>( scaleMask.y ) & bit ) != 0;
-
-    // The y bit zeroes the factor outright; otherwise the x bit halves it.
-    *pScale = ySet ? 0.0f : ( xSet ? 0.5f : 1.0f );
-}
-// Selects the m-scale factor for one bit position of two packed bit masks.
-//   gridIndex  bit position to test; must be < 32
-//   scaleMask  two packed masks (.x and .y), one bit per grid index each
-//   mScale     output, chosen from the two tested bits:
-//                x bit  y bit   result
-//                  0      0      1.0f
-//                  1      0      0.8f
-//                  0      1      0.4f
-//                  1      1      0.0f
-__device__ static void getMaskedMScaleFactor( unsigned int gridIndex, int2 scaleMask, float* mScale )
-{
-    // Shift an unsigned 1: a signed "1 << 31" would shift into the sign bit of int.
-    const unsigned int bit = 1u << gridIndex;
-    const bool xSet        = ( static_cast<unsigned int>( scaleMask.x ) & bit ) != 0;
-    const bool ySet        = ( static_cast<unsigned int>( scaleMask.y ) & bit ) != 0;
-
-    float scale = 1.0f;
-    if( xSet && ySet ){
-        scale = 0.0f;
-    } else if( xSet ){
-        scale = 0.8f;
-    } else if( ySet ){
-        scale = 0.4f;
-    }
-    *mScale = scale;
-}
-// Unpacks a packed cell id into two coordinates and an exclusion flag.
-// Fields read from cellId:
-//   bit 0             -> *exclusions (true when the bit is set)
-//   bits 2..16        -> y (15 bits)
-//   bits 17 and above -> x
-// Both coordinates are shifted left by GRIDBITS before being stored. Bit 1 is not read.
-__device__ static void decodeCell( unsigned int cellId, unsigned int* x, unsigned int* y, bool* exclusions )
-{
-    const unsigned int yField = ( cellId >> 2 ) & 0x7fff;
-    const unsigned int xField = cellId >> 17;
-
-    *y          = yField << GRIDBITS;
-    *exclusions = ( cellId & 0x1 ) != 0;
-    *x          = xField << GRIDBITS;
-}
-// Accumulates a 3-component sum into a flat float array:
-//   outputForce[offset + k] += forceSum[k]   for k = 0, 1, 2
-// Despite "load" in the name, the existing values in outputForce are added to, not overwritten.
-__device__ static void load3dArrayBufferPerWarp( unsigned int offset, float* forceSum, float* outputForce )
-{
-    float* destination = outputForce + offset;
-    for( int component = 0; component < 3; component++ ){
-        const float updated    = destination[component] + forceSum[component];
-        destination[component] = updated;
-    }
-}
-// Adds a 3-component sum onto the x/y/z fields of outputForce[offset].
-// The w field is read and written back unchanged.
-__device__ static void add3dArrayToFloat4( unsigned int offset, volatile float* forceSum, float4* outputForce )
-{
-    float4* target = outputForce + offset;
-    float4 updated = *target;
-
-    updated.x = updated.x + forceSum[0];
-    updated.y = updated.y + forceSum[1];
-    updated.z = updated.z + forceSum[2];
-
-    *target = updated;
-}
-// Overwrites outputForce[offset] with ( forceSum[0], forceSum[1], forceSum[2], 0.0f ).
-// Any previous contents of that element, including w, are discarded.
-__device__ static void load3dArrayToFloat4( unsigned int offset, float* forceSum, float4* outputForce )
-{
-    float4 packed;
-    packed.w = 0.0f;
-    packed.x = forceSum[0];
-    packed.y = forceSum[1];
-    packed.z = forceSum[2];
-
-    float4* target = outputForce + offset;
-    *target        = packed;
-}
-// Copies a 3-component sum into a flat float array, overwriting what was there:
-//   outputForce[offset + k] = forceSum[k]   for k = 0, 1, 2
-__device__ static void load3dArray( unsigned int offset, volatile float* forceSum, float* outputForce )
-{
-    float* destination = outputForce + offset;
-    for( int component = 0; component < 3; component++ ){
-        destination[component] = forceSum[component];
-    }
-}
-// Accumulates a 3-component sum into a flat float array:
-//   outputForce[offset + k] += forceSum[k]   for k = 0, 1, 2
-__device__ static void add3dArray( unsigned int offset, volatile float* forceSum, float* outputForce )
-{
-    float* destination = outputForce + offset;
-    for( int component = 0; component < 3; component++ ){
-        destination[component] = destination[component] + forceSum[component];
-    }
-}
-// Multiplies the first three elements of force by scaleFactor, in place.
-__device__ static void scale3dArray( float scaleFactor, float* force )
-{
-    for( int component = 0; component < 3; component++ ){
-        force[component] = force[component] * scaleFactor;
-    }
-}
-#endif //__AMOEBA_SCALE_FACTORS_H__
--- a/plugins/amoeba/platforms/cuda-old/src/kernels/calculateSystemMultipoleMoments.cpp
+++ b/plugins/amoeba/platforms/cuda-old/src/kernels/calculateSystemMultipoleMoments.cpp
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-#include "amoebaCudaKernels.h"
-#include "openmm/OpenMMException.h"
-#include <stdio.h>
-using namespace std; 
-void kCalculateAmoebaSystemMultipoleMoments( amoebaGpuContext amoebaGpu, std::vector< double >& outputMultipoleMoments ) 
-{
-    // setup
-    kSetupAmoebaMultipoleForces(amoebaGpu, false ); 
-    gpuContext gpu         = amoebaGpu->gpuContext;
-    gpu->psPosq4->Download();
-    gpu->psVelm4->Download();
-    float4* posq           = gpu->psPosq4->_pSysData;    
-    float4* velm           = gpu->psVelm4->_pSysData;    
-    float totalMass        = 0.0f;
-    float centerOfMass[3]  = { 0.0f, 0.0f, 0.0f };
-    for( unsigned int ii  = 0; ii < gpu->natoms; ii++ ){
-        float mass;
-        if( velm->w > 0.0f ){
-            mass        = 1.0f/velm[ii].w;
-        } else {
-            mass        = 0.0f;
-        }
-        totalMass        += mass;
-        centerOfMass[0]  += mass*posq[ii].x;
-        centerOfMass[1]  += mass*posq[ii].y;
-        centerOfMass[2]  += mass*posq[ii].z;
-    }
-    std::vector<float4>  posqLocal(gpu->natoms);
-    if( totalMass > 0.0f ){
-        centerOfMass[0]  /= totalMass;
-        centerOfMass[1]  /= totalMass;
-        centerOfMass[2]  /= totalMass;
-    }
-    for( unsigned int ii  = 0; ii < gpu->natoms; ii++ ){
-        posqLocal[ii].x = posq[ii].x  - centerOfMass[0];
-        posqLocal[ii].y = posq[ii].y  - centerOfMass[1];
-        posqLocal[ii].z = posq[ii].z  - centerOfMass[2];
-        posqLocal[ii].w = posq[ii].w;
-    }
-    float netchg  = 0.0f;
-    float xdpl    = 0.0f;
-    float ydpl    = 0.0f;
-    float zdpl    = 0.0f;
-    float xxqdp   = 0.0f;
-    float xyqdp   = 0.0f;
-    float xzqdp   = 0.0f;
-    float yxqdp   = 0.0f;
-    float yyqdp   = 0.0f;
-    float yzqdp   = 0.0f;
-    float zxqdp   = 0.0f;
-    float zyqdp   = 0.0f;
-    float zzqdp   = 0.0f;
-    amoebaGpu->psLabFrameDipole->Download();
-    float* labFrameDipole      = amoebaGpu->psLabFrameDipole->_pSysData;    
-    amoebaGpu->psInducedDipole->Download();
-    float* inducedDipole       = amoebaGpu->psInducedDipole->_pSysData;    
-    amoebaGpu->psLabFrameQuadrupole->Download();
-    float* labFrameQuadrupole  = amoebaGpu->psLabFrameQuadrupole->_pSysData;    
-    for( unsigned int ii  = 0; ii < gpu->natoms; ii++ ){
-        netchg              += posqLocal[ii].w;
-        float netDipoleX     = (labFrameDipole[3*ii]    + inducedDipole[3*ii]);
-        float netDipoleY     = (labFrameDipole[3*ii+1]  + inducedDipole[3*ii+1]);
-        float netDipoleZ     = (labFrameDipole[3*ii+2]  + inducedDipole[3*ii+2]);
-        xdpl    += posqLocal[ii].x*posqLocal[ii].w + netDipoleX;
-        ydpl    += posqLocal[ii].y*posqLocal[ii].w + netDipoleY;
-        zdpl    += posqLocal[ii].z*posqLocal[ii].w + netDipoleZ;
-        xxqdp   += posqLocal[ii].x*posqLocal[ii].x*posqLocal[ii].w + 2.0f*posqLocal[ii].x*netDipoleX;
-        xyqdp   += posqLocal[ii].x*posqLocal[ii].y*posqLocal[ii].w + posqLocal[ii].x*netDipoleY + posqLocal[ii].y*netDipoleX;
-        xzqdp   += posqLocal[ii].x*posqLocal[ii].z*posqLocal[ii].w + posqLocal[ii].x*netDipoleZ + posqLocal[ii].z*netDipoleX;
-        yxqdp   += posqLocal[ii].y*posqLocal[ii].x*posqLocal[ii].w + posqLocal[ii].y*netDipoleX + posqLocal[ii].x*netDipoleY;
-        yyqdp   += posqLocal[ii].y*posqLocal[ii].y*posqLocal[ii].w + 2.0f*posqLocal[ii].y*netDipoleY;
-        yzqdp   += posqLocal[ii].y*posqLocal[ii].z*posqLocal[ii].w + posqLocal[ii].y*netDipoleZ + posqLocal[ii].z*netDipoleY;
-        zxqdp   += posqLocal[ii].z*posqLocal[ii].x*posqLocal[ii].w + posqLocal[ii].z*netDipoleX + posqLocal[ii].x*netDipoleZ;
-        zyqdp   += posqLocal[ii].z*posqLocal[ii].y*posqLocal[ii].w + posqLocal[ii].z*netDipoleY + posqLocal[ii].y*netDipoleZ;
-        zzqdp   += posqLocal[ii].z*posqLocal[ii].z*posqLocal[ii].w + 2.0f*posqLocal[ii].z*netDipoleZ;
-    }
-//  convert the quadrupole from traced to traceless form
-    float qave   = (xxqdp + yyqdp + zzqdp)/3.0f;
-          xxqdp  = 1.5f*(xxqdp-qave);
-          xyqdp  = 1.5f*xyqdp;
-          xzqdp  = 1.5f*xzqdp;
-          yxqdp  = 1.5f*yxqdp;
-          yyqdp  = 1.5f*(yyqdp-qave);
-          yzqdp  = 1.5f*yzqdp;
-          zxqdp  = 1.5f*zxqdp;
-          zyqdp  = 1.5f*zyqdp;
-          zzqdp  = 1.5f*(zzqdp-qave);
-//  add the traceless atomic quadrupoles to total quadrupole
-    for( unsigned int ii  = 0; ii < gpu->natoms; ii++ ){
-        xxqdp  = xxqdp + 3.0f*labFrameQuadrupole[9*ii];
-        xyqdp  = xyqdp + 3.0f*labFrameQuadrupole[9*ii+1];
-        xzqdp  = xzqdp + 3.0f*labFrameQuadrupole[9*ii+2];
-        yxqdp  = yxqdp + 3.0f*labFrameQuadrupole[9*ii+3];
-        yyqdp  = yyqdp + 3.0f*labFrameQuadrupole[9*ii+4];
-        yzqdp  = yzqdp + 3.0f*labFrameQuadrupole[9*ii+5];
-        zxqdp  = zxqdp + 3.0f*labFrameQuadrupole[9*ii+6];
-        zyqdp  = zyqdp + 3.0f*labFrameQuadrupole[9*ii+7];
-        zzqdp  = zzqdp + 3.0f*labFrameQuadrupole[9*ii+8];
-    }
-    float debye                = 4.80321f;
-    outputMultipoleMoments.resize( 13 );
-    outputMultipoleMoments[0]  = netchg;
-    outputMultipoleMoments[1]  = xdpl*debye;
-    outputMultipoleMoments[2]  = ydpl*debye;
-    outputMultipoleMoments[3]  = zdpl*debye;
-    outputMultipoleMoments[4]  = xxqdp*debye;
-    outputMultipoleMoments[5]  = xyqdp*debye;
-    outputMultipoleMoments[6]  = xzqdp*debye;
-    outputMultipoleMoments[7]  = yxqdp*debye;
-    outputMultipoleMoments[8]  = yyqdp*debye;
-    outputMultipoleMoments[9]  = yzqdp*debye;
-    outputMultipoleMoments[10] = zxqdp*debye;
-    outputMultipoleMoments[11] = zyqdp*debye;
-    outputMultipoleMoments[12] = zzqdp*debye;
-}
--- a/plugins/amoeba/platforms/cuda-old/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
+++ b/plugins/amoeba/platforms/cuda-old/src/kernels/kCalculateAmoebaCudaElectrostatic.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-#include "amoebaGpuTypes.h"
-#include "amoebaCudaKernels.h"
-#include "kCalculateAmoebaCudaUtilities.h"
-#include <stdio.h>
-static __constant__ cudaGmxSimulation cSim;              // constant-memory copy of gpu->sim (written by SetCalculateAmoebaElectrostaticSim)
-static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;  // constant-memory copy of amoebaGpu->amoebaSim (written by SetCalculateAmoebaElectrostaticSim)
-/**
- * Copy the host-side simulation parameter blocks into this file's
- * constant-memory symbols cSim and cAmoebaSim.
- *
- * @param amoebaGpu  Amoeba context; gpuContext->sim and amoebaSim are the copy sources
- */
-void SetCalculateAmoebaElectrostaticSim(amoebaGpuContext amoebaGpu)
-{
-    gpuContext baseGpu  = amoebaGpu->gpuContext;
-    // base simulation block -> cSim
-    cudaError_t result  = cudaMemcpyToSymbol(cSim, &baseGpu->sim, sizeof(cudaGmxSimulation));
-    RTERROR(result, "SetCalculateAmoebaElectrostaticSim: cudaMemcpyToSymbol: SetSim copy to cSim failed");
-    // Amoeba block -> cAmoebaSim
-    result              = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation));
-    RTERROR(result, "SetCalculateAmoebaElectrostaticSim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed");
-}
-/**
- * Copy this file's constant-memory symbols cSim and cAmoebaSim back into
- * the host-side simulation parameter blocks.
- *
- * @param amoebaGpu  Amoeba context; gpuContext->sim and amoebaSim are the copy destinations
- */
-void GetCalculateAmoebaElectrostaticSim(amoebaGpuContext amoebaGpu)
-{
-    gpuContext baseGpu  = amoebaGpu->gpuContext;
-    // cSim -> base simulation block
-    cudaError_t result  = cudaMemcpyFromSymbol(&baseGpu->sim, cSim, sizeof(cudaGmxSimulation));
-    RTERROR(result, "GetCalculateAmoebaElectrostaticSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed");
-    // cAmoebaSim -> Amoeba block
-    result              = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));
-    RTERROR(result, "GetCalculateAmoebaElectrostaticSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
-}
-// Positions of the p-, d-, u- and m-scale entries inside the scalingFactors
-// array that the pair-interaction code indexes with these names.
-static const int PScaleIndex            =  0;
-static const int DScaleIndex            =  1;
-static const int UScaleIndex            =  2;
-static const int MScaleIndex            =  3;
-// presumably the number of scale entries above (one past MScaleIndex) -- confirm against users
-static const int LastScalingIndex       =  4;
-// Per-atom record used by the electrostatic pair interaction.
-// Member order and sizes are unchanged: sizeof(ElectrostaticParticle) is passed to
-// getThreadsPerBlock() when the launch configuration is chosen.
-struct ElectrostaticParticle {
-    float x, y, z;                  // coordinates
-    float q;                        // charge
-    float labFrameDipole[3];        // lab frame dipole
-    float labFrameQuadrupole[9];    // lab frame quadrupole (9 entries)
-    float inducedDipole[3];         // induced dipole
-    float inducedDipoleP[3];        // polar induced dipole
-    float thole;                    // Thole parameter
-    float damp;                     // damping factor
-    float force[3];                 // accumulator, cleared by zeroElectrostaticParticle()
-    // disabled members, kept for reference:
-    //float torque[3];
-    //float padding;
-};
-#ifdef Original
-// Helper macros for the reference pair kernel (this region is compiled only when Original is defined).
-// i35 is 9/35 rounded to float precision.
-#define i35 0.257142857f
-// 3-element dot product; arguments are parenthesized before indexing so expressions may be passed.
-#define DOT3_4(u,v) (((u)[0])*((v)[0]) + ((u)[1])*((v)[1]) + ((u)[2])*((v)[2]))
-// Sum of element-wise products of two 9-element arrays.  The whole expansion is wrapped in
-// parentheses so that it stays a single operand when used inside a larger expression
-// (e.g. 2.0f*MATRIXDOT31(a,b)).
-#define MATRIXDOT31(u,v) ((u)[0]*(v)[0] + (u)[1]*(v)[1] + (u)[2]*(v)[2] + \
-  (u)[3]*(v)[3] + (u)[4]*(v)[4] + (u)[5]*(v)[5] + \
-  (u)[6]*(v)[6] + (u)[7]*(v)[7] + (u)[8]*(v)[8])
-// 3-element dot product (same expansion as DOT3_4).
-#define DOT31(u,v) (((u)[0])*((v)[0]) + ((u)[1])*((v)[1]) + ((u)[2])*((v)[2]))
-#define one 1.0f
-/**
- * Reference implementation of the electrostatic interaction between one pair of atoms
- * (compiled only when Original is defined).
- *
- * @param atomI           first atom of the pair
- * @param atomJ           second atom of the pair
- * @param scalingFactors  in/out: p/d/u/m scale factors plus the Scale3.. and Ddsc30/Ddsc50/Ddsc70.. damping
- *                        entries; the damping entries are overwritten here when both atoms have a nonzero
- *                        damp value and r < cAmoebaSim.scalingDistanceCutoff, otherwise they are used as passed in
- * @param outputForce     out: x,y,z receive -(ftm2 + ftm2i), w receives the energy em + ei
- * @param outputTorque    out: [0] receives ttm2 + ttm2i, [1] receives ttm3 + ttm3i (x,y,z only; w is not written);
- *                        presumably the torques on atomI and atomJ respectively -- confirm
- */
-__device__ void calculateElectrostaticPairIxnOrig_kernel( ElectrostaticParticle& atomI,   ElectrostaticParticle& atomJ,
-                                                          float* scalingFactors, float4*  outputForce, float4  outputTorque[2]){
-    float deltaR[3];
-    // ---------------------------------------------------------------------------------------
-    // ---------------------------------------------------------------------------------------
-    // views onto the three derivative-of-damping vectors stored inside scalingFactors
-    float* ddsc3                    =  scalingFactors + Ddsc30Index;
-    float* ddsc5                    =  scalingFactors + Ddsc50Index;
-    float* ddsc7                    =  scalingFactors + Ddsc70Index;
-    deltaR[0]                       = atomJ.x - atomI.x;
-    deltaR[1]                       = atomJ.y - atomI.y;
-    deltaR[2]                       = atomJ.z - atomI.z;
-    float r2                        = DOT31( deltaR, deltaR );
-    float r                         = sqrtf( r2 );
-    // rrN = (N-2)!! / r^N
-    float rr1                       = 1.0f/r;
-    float rr2                       = rr1*rr1;
-    float rr3                       = rr1*rr2;
-    float rr5                       = 3.0f*rr3*rr2;
-    float rr7                       = 5.0f*rr5*rr2;
-    float rr9                       = 7.0f*rr7*rr2;
-    float rr11                      = 9.0f*rr9*rr2;
-    //-------------------------------------------
-    // damping: only applied when both atoms have a nonzero damp value and the pair is inside the cutoff
-    if( atomI.damp != 0.0f && atomJ.damp != 0.0f && r < cAmoebaSim.scalingDistanceCutoff ){
-        float distanceIJ, r2I;
-        distanceIJ                    = r;
-        r2I                           = rr2;
-        float ratio                   = distanceIJ/(atomI.damp*atomJ.damp);
-        // use the smaller of the two thole values
-        float pGamma                  = atomJ.thole > atomI.thole ? atomI.thole : atomJ.thole;
-        float damp                    = ratio*ratio*ratio*pGamma;
-        float dampExp                 = expf( -damp );
-        float damp1                   = damp + one;
-        float damp2                   = damp*damp;
-        float damp3                   = damp2*damp;
-        scalingFactors[Scale3Index]   = one - dampExp;
-        scalingFactors[Scale5Index]   = one - damp1*dampExp;
-        scalingFactors[Scale7Index]   = one - ( damp1 + 0.6f*damp2)*dampExp;
-        scalingFactors[Scale9Index]   = one - ( damp1 + ( 2.0f*damp2 + damp3 )*i35)*dampExp;
-        float factor                  = 3.0f*damp*dampExp*r2I;
-        float factor7                 = -0.2f + 0.6f*damp;
-        for( int ii = 0; ii < 3; ii++ ){
-            scalingFactors[Ddsc30Index + ii] = factor*deltaR[ii];
-            scalingFactors[Ddsc50Index + ii] = scalingFactors[Ddsc30Index + ii]*damp;
-            scalingFactors[Ddsc70Index + ii] = scalingFactors[Ddsc50Index + ii]*factor7;
-        }
-    }
-    // products of the damping entries (Scale3Index, +1, +2) with the u/d/p scale factors
-    float scaleI0 = scalingFactors[Scale3Index]*scalingFactors[UScaleIndex];
-    float dsc0    = scalingFactors[Scale3Index]*scalingFactors[DScaleIndex];
-    float psc0    = scalingFactors[Scale3Index]*scalingFactors[PScaleIndex];
-    float scaleI1 = scalingFactors[Scale3Index+1]*scalingFactors[UScaleIndex];
-    float dsc1    = scalingFactors[Scale3Index+1]*scalingFactors[DScaleIndex];
-    float psc1    = scalingFactors[Scale3Index+1]*scalingFactors[PScaleIndex];
-    float dsc2    = scalingFactors[Scale3Index+2]*scalingFactors[DScaleIndex];
-    float psc2    = scalingFactors[Scale3Index+2]*scalingFactors[PScaleIndex];
-    float qIr[3], qJr[3];
-    amatrixProductVector3( atomJ.labFrameQuadrupole,      deltaR,      qJr);
-    amatrixProductVector3( atomI.labFrameQuadrupole,      deltaR,      qIr);
-    // scalar products between permanent moments, induced dipoles and deltaR
-    float sc2     = DOT3_4(        atomI.labFrameDipole,  atomJ.labFrameDipole );
-    float sc3     = DOT3_4(        atomI.labFrameDipole,  deltaR  );
-    float sc4     = DOT3_4(        atomJ.labFrameDipole,  deltaR  );
-    float sc5     = DOT3_4(        qIr, deltaR  );
-    float sc6     = DOT3_4(        qJr, deltaR  );
-    float sc7     = DOT3_4(        qIr, atomJ.labFrameDipole );
-    float sc8     = DOT3_4(        qJr, atomI.labFrameDipole );
-    float sc9     = DOT3_4(        qIr, qJr );
-    float sc10    = MATRIXDOT31( atomI.labFrameQuadrupole, atomJ.labFrameQuadrupole );
-    float sci1    = DOT3_4(        atomI.inducedDipole,  atomJ.labFrameDipole ) +
-                    DOT3_4(        atomJ.inducedDipole,  atomI.labFrameDipole );
-    float sci3    = DOT3_4(        atomI.inducedDipole,  deltaR  );
-    float sci4    = DOT3_4(        atomJ.inducedDipole,  deltaR  );
-    float sci7    = DOT3_4(        qIr, atomJ.inducedDipole );
-    float sci8    = DOT3_4(        qJr, atomI.inducedDipole );
-    float scip1   = DOT3_4(        atomI.inducedDipoleP, atomJ.labFrameDipole ) +
-                    DOT3_4(        atomJ.inducedDipoleP, atomI.labFrameDipole );
-    float scip2   = DOT3_4(        atomI.inducedDipole,  atomJ.inducedDipoleP) +
-                    DOT3_4(        atomJ.inducedDipole,  atomI.inducedDipoleP);
-    float scip3   = DOT3_4(        atomI.inducedDipoleP, deltaR );
-    float scip4   = DOT3_4(        atomJ.inducedDipoleP, deltaR );
-    float scip7   = DOT3_4(        qIr, atomJ.inducedDipoleP );
-    float scip8   = DOT3_4(        qJr, atomI.inducedDipoleP );
-    float scaleF             = 0.5f*scalingFactors[UScaleIndex];
-    float inducedFactor3     = scip2*rr3*scaleF;
-    float inducedFactor5     = (sci3*scip4+scip3*sci4)*rr5*scaleF;
-    float findmp_0           = inducedFactor3*ddsc3[0] - inducedFactor5*ddsc5[0];
-    float findmp_1           = inducedFactor3*ddsc3[1] - inducedFactor5*ddsc5[1];
-    float findmp_2           = inducedFactor3*ddsc3[2] - inducedFactor5*ddsc5[2];
-    float gli1               = atomJ.q*sci3 - atomI.q*sci4;
-    float gli2               = -sc3*sci4 - sci3*sc4;
-    float gli3               = sci3*sc6 - sci4*sc5;
-    float gli6               = sci1;
-    float gli7               = 2.0f*(sci7-sci8);
-    float glip1              = atomJ.q*scip3 - atomI.q*scip4;
-    float glip2              = -sc3*scip4 - scip3*sc4;
-    float glip3              = scip3*sc6 - scip4*sc5;
-    float glip6              = scip1;
-    float glip7              = 2.0f*(scip7-scip8);
-    float factor3            = rr3*(( gli1  +  gli6)*scalingFactors[PScaleIndex] + (glip1  + glip6)*scalingFactors[DScaleIndex]);
-    float factor5            = rr5*(( gli2  +  gli7)*scalingFactors[PScaleIndex] + (glip2  + glip7)*scalingFactors[DScaleIndex]);
-    float factor7            = rr7*( gli3*scalingFactors[PScaleIndex] + glip3*scalingFactors[DScaleIndex]);
-    float fridmp_0           = 0.5f*(factor3*ddsc3[0] + factor5*ddsc5[0] + factor7*ddsc7[0]);
-    float fridmp_1           = 0.5f*(factor3*ddsc3[1] + factor5*ddsc5[1] + factor7*ddsc7[1]);
-    float fridmp_2           = 0.5f*(factor3*ddsc3[2] + factor5*ddsc5[2] + factor7*ddsc7[2]);
-    float gl0 = atomI.q*atomJ.q;
-    float gl1 = atomJ.q*sc3 - atomI.q*sc4;
-    float gl2 = atomI.q*sc6 + atomJ.q*sc5 - sc3*sc4;
-    float gl3 = sc3*sc6 - sc4*sc5;
-    float gl4 = sc5*sc6;
-    float gl6 = sc2;
-    float gl7 = 2.0f*(sc7-sc8);
-    float gl8 = 2.0f*sc10;
-    float gl5 = -4.0f*sc9;
-    float gf1 = rr3*gl0 + rr5*(gl1+gl6) + rr7*(gl2+gl7+gl8) + rr9*(gl3+gl5) + rr11*gl4;
-    float gf2 = -atomJ.q*rr3 + sc4*rr5 - sc6*rr7;
-    float gf3 =  atomI.q*rr3 + sc3*rr5 + sc5*rr7;
-    float gf4 = 2.0f*rr5;
-    float gf5 = 2.0f*(-atomJ.q*rr5+sc4*rr7-sc6*rr9);
-    float gf6 = 2.0f*(-atomI.q*rr5-sc3*rr7-sc5*rr9);
-    float gf7 = 4.0f*rr7;
-    // energy
-    float em                 = scalingFactors[MScaleIndex]*(rr1*gl0 + rr3*(gl1+gl6) + rr5*(gl2+gl7+gl8) + rr7*(gl3+gl5) + rr9*gl4);
-    float ei                 = 0.5f*(rr3*(gli1+gli6)*psc0 + rr5*(gli2+gli7)*psc1 + rr7*gli3*psc2);
-    outputForce->w           = em+ei;
-    float temp1[3],temp2[3],temp3[3];
-    float qIqJr[3], qJqIr[3], qIdJ[3], qJdI[3];
-    amatrixProductVector3( atomI.labFrameQuadrupole,      atomJ.labFrameDipole,     qIdJ );//MK
-    amatrixProductVector3( atomJ.labFrameQuadrupole,      atomI.labFrameDipole,     qJdI );//MK
-    amatrixProductVector3( atomI.labFrameQuadrupole,      qJr,    qIqJr );//MK
-    amatrixProductVector3( atomJ.labFrameQuadrupole,      qIr,    qJqIr );//MK
-    // temp1 and temp2 repeat the qJqIr and qJdI products; both arrays are reused as scratch further down
-    amatrixProductVector3( atomJ.labFrameQuadrupole,      qIr,    temp1 );
-    amatrixProductVector3( atomJ.labFrameQuadrupole,      atomI.labFrameDipole,     temp2 );
-    float ftm2_0 = gf1*deltaR[0] +
-                     gf2*atomI.labFrameDipole[0] + gf3*atomJ.labFrameDipole[0]  +
-                     gf4*(temp2[0]  - qIdJ[0])   +
-                     gf5*qIr[0]    + gf6*qJr[0]  +
-                     gf7*(qIqJr[0] + temp1[0]);
-    float ftm2_1 = gf1*deltaR[1]                 +
-                     gf2*atomI.labFrameDipole[1] + gf3*atomJ.labFrameDipole[1]  +
-                     gf4*(temp2[1]  - qIdJ[1])   +
-                     gf5*qIr[1]    + gf6*qJr[1]  +
-                     gf7*(qIqJr[1] + temp1[1]);
-    float ftm2_2 = gf1*deltaR[2]                 +
-                     gf2*atomI.labFrameDipole[2] + gf3*atomJ.labFrameDipole[2]  +
-                     gf4*(temp2[2]  - qIdJ[2])   +
-                     gf5*qIr[2]    + gf6*qJr[2]  +
-                     gf7*(qIqJr[2] + temp1[2]);
-    // get the induced force;
-    // intermediate variables for the induced-permanent terms;
-    float gfi1 = rr5*0.5f*((gli1+gli6)*psc0 + (glip1+glip6)*dsc0 + scip2*scaleI0) + rr7*((gli7+gli2)*psc1 + (glip7+glip2)*dsc1 -
-                                                       (sci3*scip4+scip3*sci4)*scaleI1)*0.5f + rr9*(gli3*psc2+glip3*dsc2)*0.5f;
-    float gfi4 = 2.0f*rr5;
-    float gfi5 = rr7* (sci4*psc2 + scip4*dsc2);
-    float gfi6 = -rr7*(sci3*psc2 + scip3*dsc2);
-    float temp4[3];
-    float temp5[3];
-    float temp6[3];
-    float temp7[3];
-    float temp8[3];
-    float temp9[3];
-    float temp10[3];
-    float temp11[3];
-    float temp12[3];
-    float temp13[3];
-    float temp14[3];
-    float temp15[3];
-    float qIuJp[3], qJuIp[3];
-    float qIuJ[3], qJuI[3];
-    // temp4 and temp5 repeat the qJuIp and qJuI products; both arrays are reused as scratch further down
-    amatrixProductVector3(atomJ.labFrameQuadrupole,      atomI.inducedDipoleP,    temp4);
-    amatrixProductVector3(atomI.labFrameQuadrupole,      atomJ.inducedDipoleP,    qIuJp);//MK
-    amatrixProductVector3(atomJ.labFrameQuadrupole,      atomI.inducedDipoleP,    qJuIp);//MK
-    amatrixProductVector3(atomJ.labFrameQuadrupole,      atomI.inducedDipole ,    qJuI);//MK
-    amatrixProductVector3(atomJ.labFrameQuadrupole,      atomI.inducedDipole,    temp5);
-    amatrixProductVector3(atomI.labFrameQuadrupole,      atomJ.inducedDipole ,     qIuJ);//MK
-    float ftm2i_0 = gfi1*deltaR[0] +
-                    0.5f*(-rr3*atomJ.q*(atomI.inducedDipole[0]*psc0 + atomI.inducedDipoleP[0]*dsc0) +
-                    rr5*sc4*(atomI.inducedDipole[0]*psc1 + atomI.inducedDipoleP[0]*dsc1) -
-                    rr7*sc6*(atomI.inducedDipole[0]*psc2 + atomI.inducedDipoleP[0]*dsc2)) +
-                   (rr3*atomI.q*(atomJ.inducedDipole[0]*psc0+atomJ.inducedDipoleP[0]*dsc0) +
-                     rr5*sc3*(atomJ.inducedDipole[0]*psc1 +atomJ.inducedDipoleP[0]*dsc1) +
-                     rr7*sc5*(atomJ.inducedDipole[0]*psc2 +atomJ.inducedDipoleP[0]*dsc2))*0.5f +
-                     rr5*scaleI1*(sci4*atomI.inducedDipoleP[0]+scip4*atomI.inducedDipole[0] +
-                     sci3*atomJ.inducedDipoleP[0]+scip3*atomJ.inducedDipole[0])*0.5f +
-                    0.5f*(sci4*psc1+scip4*dsc1)*rr5*atomI.labFrameDipole[0] +
-                    0.5f*(sci3*psc1+scip3*dsc1)*rr5*atomJ.labFrameDipole[0] +
-                    0.5f*gfi4*((temp5[0]-qIuJ[0])*psc1 +
-                    (temp4[0]-qIuJp[0])*dsc1) + gfi5*qIr[0] + gfi6*qJr[0];
-    float ftm2i_1  = gfi1*deltaR[1] +
-                    0.5f*(-rr3*atomJ.q*(atomI.inducedDipole[1]*psc0 + atomI.inducedDipoleP[1]*dsc0) +
-                    rr5*sc4*(atomI.inducedDipole[1]*psc1 + atomI.inducedDipoleP[1]*dsc1) -
-                    rr7*sc6*(atomI.inducedDipole[1]*psc2 + atomI.inducedDipoleP[1]*dsc2)) +
-                    (rr3*atomI.q*(atomJ.inducedDipole[1]*psc0+atomJ.inducedDipoleP[1]*dsc0) +
-                     rr5*sc3*(atomJ.inducedDipole[1]*psc1 +atomJ.inducedDipoleP[1]*dsc1) +
-                     rr7*sc5*(atomJ.inducedDipole[1]*psc2 +atomJ.inducedDipoleP[1]*dsc2))*0.5f +
-                     rr5*scaleI1*(sci4*atomI.inducedDipoleP[1]+scip4*atomI.inducedDipole[1] +
-                     sci3*atomJ.inducedDipoleP[1]+scip3*atomJ.inducedDipole[1])*0.5f +
-                    0.5f*(sci4*psc1+scip4*dsc1)*rr5*atomI.labFrameDipole[1] +
-                    0.5f*(sci3*psc1+scip3*dsc1)*rr5*atomJ.labFrameDipole[1] +
-                    0.5f*gfi4*((temp5[1]-qIuJ[1])*psc1 +
-                    (temp4[1]-qIuJp[1])*dsc1) + gfi5*qIr[1] + gfi6*qJr[1];
-    float ftm2i_2  = gfi1*deltaR[2] +
-                    0.5f*(-rr3*atomJ.q*(atomI.inducedDipole[2]*psc0 + atomI.inducedDipoleP[2]*dsc0) +
-                    rr5*sc4*(atomI.inducedDipole[2]*psc1 + atomI.inducedDipoleP[2]*dsc1) -
-                    rr7*sc6*(atomI.inducedDipole[2]*psc2 + atomI.inducedDipoleP[2]*dsc2)) +
-                    (rr3*atomI.q*(atomJ.inducedDipole[2]*psc0+atomJ.inducedDipoleP[2]*dsc0) +
-                     rr5*sc3*(atomJ.inducedDipole[2]*psc1 +atomJ.inducedDipoleP[2]*dsc1) +
-                     rr7*sc5*(atomJ.inducedDipole[2]*psc2 +atomJ.inducedDipoleP[2]*dsc2))*0.5f +
-                     rr5*scaleI1*(sci4*atomI.inducedDipoleP[2]+scip4*atomI.inducedDipole[2] +
-                     sci3*atomJ.inducedDipoleP[2]+scip3*atomJ.inducedDipole[2])*0.5f +
-                    0.5f*(sci4*psc1+scip4*dsc1)*rr5*atomI.labFrameDipole[2] +
-                    0.5f*(sci3*psc1+scip3*dsc1)*rr5*atomJ.labFrameDipole[2] +
-                    0.5f*gfi4*((temp5[2]-qIuJ[2])*psc1 +
-                    (temp4[2]-qIuJp[2])*dsc1) + gfi5*qIr[2] + gfi6*qJr[2];
-    // handle of scaling for partially excluded interactions;
-    // correction to convert mutual to direct polarization force;
-    ftm2i_0 -= (fridmp_0 + findmp_0);
-    ftm2i_1 -= (fridmp_1 + findmp_1);
-    ftm2i_2 -= (fridmp_2 + findmp_2);
-    // NOTE(review): a nonzero polarizationType presumably selects direct polarization -- confirm
-    if( cAmoebaSim.polarizationType )
-    {
-        // float literals keep this arithmetic in single precision
-        float gfd     = 0.5f*(rr5*scip2*scaleI0 - rr7*(scip3*sci4+sci3*scip4)*scaleI1);
-        // this scalar temp5 shadows the temp5[3] array declared above for the extent of this block
-        float temp5   = 0.5f*rr5*scaleI1;
-        float fdir_0  = gfd*deltaR[0] + temp5*(sci4*atomI.inducedDipoleP[0] + scip4*atomI.inducedDipole[0] + sci3*atomJ.inducedDipoleP[0] + scip3*atomJ.inducedDipole[0]);
-        float fdir_1  = gfd*deltaR[1] + temp5*(sci4*atomI.inducedDipoleP[1] + scip4*atomI.inducedDipole[1] + sci3*atomJ.inducedDipoleP[1] + scip3*atomJ.inducedDipole[1]);
-        float fdir_2  = gfd*deltaR[2] + temp5*(sci4*atomI.inducedDipoleP[2] + scip4*atomI.inducedDipole[2] + sci3*atomJ.inducedDipoleP[2] + scip3*atomJ.inducedDipole[2]);
-        ftm2i_0      -= fdir_0 - findmp_0;
-        ftm2i_1      -= fdir_1 - findmp_1;
-        ftm2i_2      -= fdir_2 - findmp_2;
-    }
-    // now perform the torque calculation;
-    // intermediate terms for torque between multipoles i and j;
-    float gti2 = 0.5f*(sci4*psc1+scip4*dsc1)*rr5;
-    float gti3 = 0.5f*(sci3*psc1+scip3*dsc1)*rr5;
-    float gti4 = gfi4;
-    float gti5 = gfi5;
-    float gti6 = gfi6;
-    // get the permanent (ttm2, ttm3) and induced interaction torques (ttm2i, ttm3i)
-    acrossProductVector3(atomI.labFrameDipole,      atomJ.labFrameDipole,      temp1);
-    acrossProductVector3(atomI.labFrameDipole,      atomJ.inducedDipole ,      temp2);
-    acrossProductVector3(atomI.labFrameDipole,      atomJ.inducedDipoleP,     temp3);
-    acrossProductVector3(atomI.labFrameDipole,      deltaR,       temp4);
-    acrossProductVector3(deltaR,       qIuJp,   temp5);
-    acrossProductVector3(deltaR,       qIr,     temp6);
-    acrossProductVector3(deltaR,       qIuJ,    temp7);
-    acrossProductVector3(atomJ.inducedDipole ,     qIr,     temp8);
-    acrossProductVector3(atomJ.inducedDipoleP,     qIr,     temp9);
-    acrossProductVector3(atomI.labFrameDipole,     qJr,     temp10);
-    acrossProductVector3(atomJ.labFrameDipole,     qIr,     temp11);
-    acrossProductVector3(deltaR,       qIqJr,   temp12);
-    acrossProductVector3(deltaR,       qIdJ,    temp13);
-    amatrixCrossProductMatrix3(atomI.labFrameQuadrupole,      atomJ.labFrameQuadrupole,      temp14);
-    acrossProductVector3(qJr, qIr,     temp15);
-    float ttm2_0  = -rr3*temp1[0] + gf2*temp4[0]-gf5*temp6[0] + gf4*(temp10[0] + temp11[0] + temp13[0]-2.0f*temp14[0]) - gf7*(temp12[0] + temp15[0]);
-    float ttm2i_0 = -rr3*(temp2[0]*psc0+temp3[0]*dsc0)*0.5f + gti2*temp4[0] + gti4*((temp8[0]+ temp7[0])*psc1 + (temp9[0] + temp5[0])*dsc1)*0.5f - gti5*temp6[0];
-    float ttm2_1  = -rr3*temp1[1] + gf2*temp4[1]-gf5*temp6[1] + gf4*(temp10[1] + temp11[1] + temp13[1]-2.0f*temp14[1]) - gf7*(temp12[1] + temp15[1]);
-    float ttm2i_1 = -rr3*(temp2[1]*psc0+temp3[1]*dsc0)*0.5f + gti2*temp4[1] + gti4*((temp8[1]+ temp7[1])*psc1 + (temp9[1] + temp5[1])*dsc1)*0.5f - gti5*temp6[1];
-    float ttm2_2  = -rr3*temp1[2] + gf2*temp4[2]-gf5*temp6[2] + gf4*(temp10[2] + temp11[2] + temp13[2]-2.0f*temp14[2]) - gf7*(temp12[2] + temp15[2]);
-    float ttm2i_2 = -rr3*(temp2[2]*psc0+temp3[2]*dsc0)*0.5f + gti2*temp4[2] + gti4*((temp8[2]+ temp7[2])*psc1 + (temp9[2] + temp5[2])*dsc1)*0.5f - gti5*temp6[2];
-    // second set of scratch products for ttm3 / ttm3i (temp1 and temp14 are carried over from above)
-    acrossProductVector3(atomJ.labFrameDipole,      deltaR,       temp2  );
-    acrossProductVector3(deltaR,       qJr,     temp3  );
-    acrossProductVector3(atomI.labFrameDipole,      qJr,     temp4  );
-    acrossProductVector3(atomJ.labFrameDipole,      qIr,     temp5  );
-    acrossProductVector3(deltaR,       qJdI,    temp6  );
-    acrossProductVector3(deltaR,       qJqIr,   temp7  );
-    acrossProductVector3(qJr,     qIr,     temp8  ); // _qJrxqIr
-    acrossProductVector3(atomJ.labFrameDipole,      atomI.inducedDipole ,      temp9  ); // _dJxuI
-    acrossProductVector3(atomJ.labFrameDipole,      atomI.inducedDipoleP,     temp10 ); // _dJxuIp
-    acrossProductVector3(atomI.inducedDipoleP,     qJr,     temp11 ); // _uIxqJrp
-    acrossProductVector3(atomI.inducedDipole ,     qJr,     temp12 ); // _uIxqJr
-    acrossProductVector3(deltaR,       qJuIp,   temp13 ); // _rxqJuIp
-    acrossProductVector3(deltaR,       qJuI,    temp15 ); // _rxqJuI
-    float ttm3_0 = rr3*temp1[0] + gf3*temp2[0] - gf6*temp3[0] - gf4*(temp4[0] + temp5[0] + temp6[0] - 2.0f*temp14[0]) - gf7*(temp7[0] - temp8[0]);
-    float ttm3i_0 = -rr3*(temp9[0]*psc0+ temp10[0]*dsc0)*0.5f + gti3*temp2[0] - gti4*((temp12[0] + temp15[0])*psc1 + (temp11[0] + temp13[0])*dsc1)*0.5f - gti6*temp3[0];
-    float ttm3_1 = rr3*temp1[1] + gf3*temp2[1] - gf6*temp3[1] - gf4*(temp4[1] + temp5[1] + temp6[1] - 2.0f*temp14[1]) - gf7*(temp7[1] - temp8[1]);
-    float ttm3i_1 = -rr3*(temp9[1]*psc0+ temp10[1]*dsc0)*0.5f + gti3*temp2[1] - gti4*((temp12[1] + temp15[1])*psc1 + (temp11[1] + temp13[1])*dsc1)*0.5f - gti6*temp3[1];
-    float ttm3_2 = rr3*temp1[2] + gf3*temp2[2] - gf6*temp3[2] - gf4*(temp4[2] + temp5[2] + temp6[2] - 2.0f*temp14[2]) - gf7*(temp7[2] - temp8[2]);
-    float ttm3i_2 = -rr3*(temp9[2]*psc0+ temp10[2]*dsc0)*0.5f + gti3*temp2[2] - gti4*((temp12[2] + temp15[2])*psc1 + (temp11[2] + temp13[2])*dsc1)*0.5f - gti6*temp3[2];
-    // the m-scale factor is applied to the permanent force and torques only when it is below 1
-    if( scalingFactors[MScaleIndex] < 1.0f ){
-        ftm2_0 *= scalingFactors[MScaleIndex];
-        ftm2_1 *= scalingFactors[MScaleIndex];
-        ftm2_2 *= scalingFactors[MScaleIndex];
-        ttm2_0 *= scalingFactors[MScaleIndex];
-        ttm2_1 *= scalingFactors[MScaleIndex];
-        ttm2_2 *= scalingFactors[MScaleIndex];
-        ttm3_0 *= scalingFactors[MScaleIndex];
-        ttm3_1 *= scalingFactors[MScaleIndex];
-        ttm3_2 *= scalingFactors[MScaleIndex];
-    }
-    outputForce->x       = -(ftm2_0 + ftm2i_0);
-    outputForce->y       = -(ftm2_1 + ftm2i_1);
-    outputForce->z       = -(ftm2_2 + ftm2i_2);
-    outputTorque[0].x    =  (ttm2_0 + ttm2i_0);
-    outputTorque[0].y    =  (ttm2_1 + ttm2i_1);
-    outputTorque[0].z    =  (ttm2_2 + ttm2i_2);
-    outputTorque[1].x    =  (ttm3_0 + ttm3i_0);
-    outputTorque[1].y    =  (ttm3_1 + ttm3i_1);
-    outputTorque[1].z    =  (ttm3_2 + ttm3i_2);
-    return;
-}
-#endif
-/**
- * Fill a particle record from the per-atom arrays referenced by cSim and cAmoebaSim.
- * The force members are not touched.
- *
- * @param sA     destination record
- * @param atomI  index of the atom to load
- */
-static __device__ void loadElectrostaticParticle( volatile struct ElectrostaticParticle* sA, unsigned int atomI ){
-    const unsigned int vec3Base  = atomI*3;   // first of 3 consecutive entries per atom
-    const unsigned int mat9Base  = atomI*9;   // first of 9 consecutive entries per atom
-    // coordinates & charge
-    sA->x  = cSim.pPosq[atomI].x;
-    sA->y  = cSim.pPosq[atomI].y;
-    sA->z  = cSim.pPosq[atomI].z;
-    sA->q  = cSim.pPosq[atomI].w;
-    // lab dipole
-    for( unsigned int k = 0; k < 3; k++ ){
-        sA->labFrameDipole[k]      = cAmoebaSim.pLabFrameDipole[vec3Base + k];
-    }
-    // lab quadrupole
-    for( unsigned int k = 0; k < 9; k++ ){
-        sA->labFrameQuadrupole[k]  = cAmoebaSim.pLabFrameQuadrupole[mat9Base + k];
-    }
-    // induced dipole
-    for( unsigned int k = 0; k < 3; k++ ){
-        sA->inducedDipole[k]       = cAmoebaSim.pInducedDipole[vec3Base + k];
-    }
-    // induced dipole polar
-    for( unsigned int k = 0; k < 3; k++ ){
-        sA->inducedDipoleP[k]      = cAmoebaSim.pInducedDipolePolar[vec3Base + k];
-    }
-    // damping factor is stored in .x, thole in .y
-    sA->damp   = cAmoebaSim.pDampingFactorAndThole[atomI].x;
-    sA->thole  = cAmoebaSim.pDampingFactorAndThole[atomI].y;
-}
-/**
- * Reset the three force components of a particle record to zero.
- *
- * @param sA  record whose force[0..2] entries are cleared
- */
-static __device__ void zeroElectrostaticParticle( volatile struct ElectrostaticParticle* sA ){
-    for( int k = 0; k < 3; k++ ){
-        sA->force[k] = 0.0f;
-    }
-}
-// kCalculateAmoebaCudaElectrostatic_b.h is included once per variant tag.  For each inclusion
-// SUB_METHOD_NAME(a, b) pastes the tag between a and b, and the tag itself is defined as an
-// (empty) macro while the header is being processed.
-// --- F1 variant ---
-#undef SUB_METHOD_NAME
-#undef F1
-#define SUB_METHOD_NAME(a, b) a##F1##b
-#define F1
-#include "kCalculateAmoebaCudaElectrostatic_b.h"
-#undef F1
-#undef SUB_METHOD_NAME
-// --- F2 variant (header inclusion is disabled) ---
-#undef F2
-#define SUB_METHOD_NAME(a, b) a##F2##b
-#define F2
-//#include "kCalculateAmoebaCudaElectrostatic_b.h"
-#undef F2
-#undef SUB_METHOD_NAME
-// --- T1 variant ---
-#undef T1
-#define SUB_METHOD_NAME(a, b) a##T1##b
-#define T1
-#include "kCalculateAmoebaCudaElectrostatic_b.h"
-#undef T1
-#undef SUB_METHOD_NAME
-// --- T3 variant ---
-#undef T3
-#define SUB_METHOD_NAME(a, b) a##T3##b
-#define T3
-#include "kCalculateAmoebaCudaElectrostatic_b.h"
-#undef T3
-#undef SUB_METHOD_NAME
-/**
- * Compute force, energy and the two torques for one atom pair.
- *
- * Default path: calls the F1 helper for force and energy, then the T1 and T3 helpers for
- * outputTorque[0] and outputTorque[1].
- * When Orig is defined the work is forwarded to calculateElectrostaticPairIxnOrig_kernel instead;
- * that function is only compiled when Original is defined as well.
- *
- * @param atomI           first atom of the pair
- * @param atomJ           second atom of the pair
- * @param scalingFactors  scaling factors handed through to the helpers
- * @param outputForce     out: x,y,z receive the force, w receives the energy
- * @param outputTorque    out: x,y,z of entries [0] and [1] receive the torques
- * @param forceFactor     not used by this function
- */
-__device__ void calculateElectrostaticPairIxn_kernel( ElectrostaticParticle& atomI,   ElectrostaticParticle& atomJ,
-                                                      float* scalingFactors, float4*  outputForce, float4 outputTorque[2], float forceFactor){
-#ifdef Orig
-    // The previous code called this very function with five arguments, which matches no
-    // declaration; the five-argument reference kernel is the intended target.
-    calculateElectrostaticPairIxnOrig_kernel( atomI, atomJ, scalingFactors, outputForce, outputTorque);
-    return;
-#else
-    float scratch[3];
-    float energy;
-    // force and energy
-    calculateElectrostaticPairIxnF1_kernel( atomI,  atomJ, scalingFactors, &energy, scratch);
-    outputForce->x = scratch[0];
-    outputForce->y = scratch[1];
-    outputForce->z = scratch[2];
-    outputForce->w = energy;
-    // first torque (scratch is reused as the output buffer)
-    calculateElectrostaticPairIxnT1_kernel( atomI,  atomJ, scalingFactors, scratch);
-    outputTorque[0].x = scratch[0];
-    outputTorque[0].y = scratch[1];
-    outputTorque[0].z = scratch[2];
-    // second torque
-    calculateElectrostaticPairIxnT3_kernel( atomI,  atomJ, scalingFactors, scratch);
-    outputTorque[1].x = scratch[0];
-    outputTorque[1].y = scratch[1];
-    outputTorque[1].z = scratch[2];
-    return;
-#endif
-}
-// Generate both N^2 force kernels from kCalculateAmoebaCudaElectrostatic.h.
-// Pass 1: USE_OUTPUT_BUFFER_PER_WARP undefined, names carry the N2 tag.
-#undef USE_OUTPUT_BUFFER_PER_WARP
-#define METHOD_NAME(a, b) a##N2##b
-#include "kCalculateAmoebaCudaElectrostatic.h"
-// Pass 2: USE_OUTPUT_BUFFER_PER_WARP defined, names carry the N2ByWarp tag.
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##N2ByWarp##b
-#define USE_OUTPUT_BUFFER_PER_WARP
-#include "kCalculateAmoebaCudaElectrostatic.h"
-// Launch kReduceFields_kernel to reduce the buffers in psWorkArray_3_1 into psTorque.
-// The element count passed is 3 floats per padded atom; the buffer count is sim.outputBuffers.
-static void kReduceTorque(amoebaGpuContext amoebaGpu ){
-    gpuContext ctx = amoebaGpu->gpuContext;
-    kReduceFields_kernel<<<ctx->sim.nonbond_blocks, ctx->sim.bsf_reduce_threads_per_block>>>(
-                               ctx->sim.paddedNumberOfAtoms*3,
-                               ctx->sim.outputBuffers,
-                               amoebaGpu->psWorkArray_3_1->_pDevData,
-                               amoebaGpu->psTorque->_pDevData,
-                               0 );
-    LAUNCHERROR("kReduceElectrostaticTorque");
-}
-/**---------------------------------------------------------------------------------------
-   Launch the N^2 Amoeba electrostatic force/torque kernel
-   The block size is chosen on the first call and reused by every later call.
-   @param amoebaGpu        amoebaGpu context
-   @param addTorqueToForce if nonzero, reduce the torque buffers and map the torque onto the force array
-   --------------------------------------------------------------------------------------- */
-void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueToForce ){
-    gpuContext gpu = amoebaGpu->gpuContext;
-    // cached across calls; 0 means "not yet computed"
-    static unsigned int threadsPerBlock = 0;
-    if( threadsPerBlock == 0 ){
-        // per-architecture ceiling (384 was tried for SM_20 and replaced by 512)
-        unsigned int maxThreads = (gpu->sm_version >= SM_20) ? 512 :
-                                  (gpu->sm_version >= SM_12) ? 128 : 64;
-        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(ElectrostaticParticle), gpu->sharedMemoryPerBlock), maxThreads);
-    }
-    // clear the work array before the kernel writes torque contributions into it
-    kClearFields_3( amoebaGpu, 1 );
-    // one ElectrostaticParticle of dynamic shared memory per thread
-    const size_t sharedBytes = sizeof(ElectrostaticParticle)*threadsPerBlock;
-    if (gpu->bOutputBufferPerWarp){
-        kCalculateAmoebaCudaElectrostaticN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sharedBytes>>>(
-                                                                           gpu->psWorkUnit->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData );
-    } else {
-        kCalculateAmoebaCudaElectrostaticN2Forces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sharedBytes>>>(
-                                                                           gpu->psWorkUnit->_pDevData, amoebaGpu->psWorkArray_3_1->_pDevData );
-    }
-    LAUNCHERROR("kCalculateAmoebaCudaElectrostaticN2Forces");
-    if( !addTorqueToForce ){
-        return;
-    }
-    kReduceTorque( amoebaGpu );
-    cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpu, amoebaGpu->psTorque );
-}
--- a/plugins/amoeba/platforms/cuda-old/src/kernels/kCalculateAmoebaCudaElectrostatic.h
+++ b/plugins/amoeba/platforms/cuda-old/src/kernels/kCalculateAmoebaCudaElectrostatic.h
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-#include "amoebaScaleFactors.h"
-__global__ /* Amoeba electrostatic force/torque kernel body; METHOD_NAME supplies the variant tag in the name.
-            * Every group of GRID consecutive threads takes a contiguous slice of the work-unit list; each
-            * work unit decodes to an (x, y) tile of GRID x GRID atom pairs.
-            * Forces are added into cSim.pForce4, torques are written to outputTorque and the per-thread
-            * energy is added into cSim.pEnergy.
-            * sA is dynamic shared memory indexed by threadIdx.x, i.e. one ElectrostaticParticle per thread.
-            * NOTE(review): there is no barrier anywhere in this kernel although threads read sA entries
-            * written by other threads of their GRID group; presumably GRID is the warp size and the code
-            * relies on implicit warp lockstep -- confirm before use on hardware with independent thread scheduling.
-            */
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(512, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(128, 1)
-#else
-__launch_bounds__(64, 1)
-#endif
-void METHOD_NAME(kCalculateAmoebaCudaElectrostatic, Forces_kernel)(
-                            unsigned int* workUnit, float* outputTorque){
-    extern __shared__ volatile ElectrostaticParticle sA[];
-    unsigned int totalWarps      = gridDim.x*blockDim.x/GRID; // number of GRID-sized thread groups in the launch
-    unsigned int warp            = (blockIdx.x*blockDim.x+threadIdx.x)/GRID; // index of this thread's group
-    unsigned int numWorkUnits    = cSim.pInteractionCount[0];
-    unsigned int pos             = warp*numWorkUnits/totalWarps; // first work unit of this group's slice
-    unsigned int end             = (warp+1)*numWorkUnits/totalWarps; // one past the last work unit of the slice
-    unsigned int lasty           = 0xFFFFFFFF; // y of the previous off-diagonal tile; sentinel = none yet
-    float totalEnergy            = 0.0f;     // per-thread energy accumulator, scaled once at the end
-    float conversionFactor       = (cAmoebaSim.electric/cAmoebaSim.dielec);
-    float scalingFactors[LastScalingIndex];
-    while (pos < end)
-    {
-        unsigned int x;
-        unsigned int y;
-        bool bExclusionFlag;
-        // Extract the tile's x/y start indices and its exclusion flag from the packed work unit
-        decodeCell( workUnit[pos], &x, &y, &bExclusionFlag );
-        unsigned int tgx              = threadIdx.x & (GRID - 1); // lane within the GRID group
-        unsigned int tbx              = threadIdx.x - tgx; // first thread of the group within the block
-        unsigned int tj               = tgx;
-        volatile ElectrostaticParticle* psA = &sA[tbx]; // this group's GRID shared-memory entries
-        unsigned int atomI            = x + tgx;
-        ElectrostaticParticle localParticle;
-        loadElectrostaticParticle( &localParticle, atomI );
-        zeroElectrostaticParticle( &localParticle );
-        // scale factors start at 1 for every work unit; the masked helpers below may overwrite them
-        scalingFactors[PScaleIndex]   = 1.0f;
-        scalingFactors[DScaleIndex]   = 1.0f;
-        scalingFactors[UScaleIndex]   = 1.0f;
-        scalingFactors[MScaleIndex]   = 1.0f;
-        if (x == y) // Diagonal tile: each thread visits every j, so each pair is computed by both of its atoms
-        {
-            // load shared data (the same atoms as the local particles, since x == y)
-            loadElectrostaticParticle( &(sA[threadIdx.x]), atomI );
-            unsigned int xi       = x >> GRIDBITS;
-            unsigned int cell     = xi + xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2; // index into pScaleIndicesIndex for tile (xi, xi)
-            int  dScaleMask       = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
-            int2 pScaleMask       = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
-            int2 mScaleMask       = cAmoebaSim.pM_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
-            for (unsigned int j = 0; j < GRID; j++)
-            {
-                unsigned int atomJ = y + j;
-                if( (atomI != atomJ) && (atomI < cSim.atoms) && (atomJ < cSim.atoms) ){ // skip self pairs and padding atoms
-                    getMaskedDScaleFactor( j, dScaleMask, scalingFactors + DScaleIndex );
-                    getMaskedPScaleFactor( j, pScaleMask, scalingFactors + PScaleIndex );
-                    getMaskedMScaleFactor( j, mScaleMask, scalingFactors + MScaleIndex );
-                    float force[3];
-                    float energy;
-                    calculateElectrostaticPairIxnF1_kernel( localParticle, psA[j], scalingFactors, &energy, force);
-                    localParticle.force[0]            += force[0];
-                    localParticle.force[1]            += force[1];
-                    localParticle.force[2]            += force[2];
-                    totalEnergy                       += 0.5f*energy; // halved because the pair is also visited from atomJ's thread
-                }
-            }
-            // Write the accumulated force for atomI
-            localParticle.force[0]  *= conversionFactor;
-            localParticle.force[1]  *= conversionFactor;
-            localParticle.force[2]  *= conversionFactor;
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset      = (x + tgx + warp*cSim.paddedNumberOfAtoms); // buffer selected by group index
-#else
-            unsigned int offset      = (x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms); // buffer selected by tile index
-#endif
-            add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 );
-            zeroElectrostaticParticle( &localParticle ); // reuse the force field as the torque accumulator
-            for (unsigned int j = 0; j < GRID; j++)
-            {
-                unsigned int atomJ = y + j;
-                if( (atomI != atomJ) && (atomI < cSim.atoms) && (atomJ < cSim.atoms) ){
-                    getMaskedDScaleFactor( j, dScaleMask, scalingFactors + DScaleIndex );
-                    getMaskedPScaleFactor( j, pScaleMask, scalingFactors + PScaleIndex );
-                    getMaskedMScaleFactor( j, mScaleMask, scalingFactors + MScaleIndex );
-                    float force[3];
-                    calculateElectrostaticPairIxnT1_kernel( localParticle, psA[j], scalingFactors, force);
-                    localParticle.force[0]  += force[0];
-                    localParticle.force[1]  += force[1];
-                    localParticle.force[2]  += force[2];
-                }
-            }
-            localParticle.force[0] *= conversionFactor;
-            localParticle.force[1] *= conversionFactor;
-            localParticle.force[2] *= conversionFactor;
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-            offset                 = (x + tgx + warp*cSim.paddedNumberOfAtoms);
-            add3dArray( 3*offset, localParticle.force, outputTorque ); // 3 floats per atom in outputTorque
-#else
-            offset                 = (x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
-            load3dArray( 3*offset, localParticle.force, outputTorque ); // NOTE(review): name suggests a store rather than an accumulate -- confirm against the helper
-#endif
-        } else {
-            // Off-diagonal tile: x atoms live in localParticle, y atoms in shared memory.
-            // NOTE(review): lasty is not updated by the diagonal branch even though that branch overwrites sA;
-            // skipping the reload presumably depends on the ordering of the work units -- confirm.
-            if( lasty != y ){
-               loadElectrostaticParticle( &(sA[threadIdx.x]), (y+tgx) );
-            }
-            zeroElectrostaticParticle( &(sA[threadIdx.x]) );
-            int  dScaleMask; // the three masks are only assigned and only read when bExclusionFlag is set
-            int2 pScaleMask;
-            int2 mScaleMask;
-            if( bExclusionFlag ){
-                unsigned int xi   = x >> GRIDBITS;
-                unsigned int yi   = y >> GRIDBITS;
-                unsigned int cell = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2; // index into pScaleIndicesIndex for tile (xi, yi)
-                dScaleMask        = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
-                pScaleMask        = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
-                mScaleMask        = cAmoebaSim.pM_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
-            }
-            for (unsigned int j = 0; j < GRID; j++){
-                unsigned int atomJ = y + tj;
-                if( (atomI < cSim.atoms) && (atomJ < cSim.atoms) ){ // skip padding atoms
-                    if( bExclusionFlag ){
-                        getMaskedDScaleFactor( tj, dScaleMask, scalingFactors + DScaleIndex );
-                        getMaskedPScaleFactor( tj, pScaleMask, scalingFactors + PScaleIndex );
-                        getMaskedMScaleFactor( tj, mScaleMask, scalingFactors + MScaleIndex );
-                    }
-                    float force[3];
-                    float energy;
-                    calculateElectrostaticPairIxnF1_kernel( localParticle, psA[tj], scalingFactors, &energy, force);
-                    totalEnergy                       += energy;
-                    localParticle.force[0]            += force[0];
-                    localParticle.force[1]            += force[1];
-                    localParticle.force[2]            += force[2];
-                    psA[tj].force[0]                  -= force[0]; // opposite-sign contribution on the partner atom
-                    psA[tj].force[1]                  -= force[1];
-                    psA[tj].force[2]                  -= force[2];
-                }
-                tj = (tj + 1) & (GRID - 1); // rotate: each lane starts at its own index, so lanes address distinct slots per step
-            }
-            // Write the accumulated forces for the x atom (local) and the y atom (shared)
-            localParticle.force[0]     *= conversionFactor;
-            localParticle.force[1]     *= conversionFactor;
-            localParticle.force[2]     *= conversionFactor;
-            sA[threadIdx.x].force[0]   *= conversionFactor;
-            sA[threadIdx.x].force[1]   *= conversionFactor;
-            sA[threadIdx.x].force[2]   *= conversionFactor;
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset                 = (x + tgx + warp*cSim.paddedNumberOfAtoms);
-            add3dArrayToFloat4(   offset, localParticle.force,   cSim.pForce4 );
-            offset                              = (y + tgx + warp*cSim.paddedNumberOfAtoms);
-            add3dArrayToFloat4(   offset, sA[threadIdx.x].force,   cSim.pForce4 );
-#else
-            unsigned int offset                 = (x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms); // x atoms go to the buffer indexed by the y tile
-            add3dArrayToFloat4(   offset, localParticle.force,   cSim.pForce4 );
-            offset                              = (y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms); // y atoms go to the buffer indexed by the x tile
-            add3dArrayToFloat4( offset, sA[threadIdx.x].force,  cSim.pForce4 );
-#endif
-            zeroElectrostaticParticle( &(sA[threadIdx.x]) ); // reuse both force fields as torque accumulators
-            zeroElectrostaticParticle( &localParticle );
-            tj = tgx;
-            for (unsigned int j = 0; j < GRID; j++){
-                unsigned int atomJ = y + tj;
-                if( (atomI < cSim.atoms) && (atomJ < cSim.atoms) ){
-                    if( bExclusionFlag ){
-                        getMaskedDScaleFactor( tj, dScaleMask, scalingFactors + DScaleIndex );
-                        getMaskedPScaleFactor( tj, pScaleMask, scalingFactors + PScaleIndex );
-                        getMaskedMScaleFactor( tj, mScaleMask, scalingFactors + MScaleIndex );
-                    }
-                    float force[3];
-                    calculateElectrostaticPairIxnT1_kernel( localParticle, psA[tj], scalingFactors, force); // T1 result accumulates on the x atom
-                    localParticle.force[0]           += force[0];
-                    localParticle.force[1]           += force[1];
-                    localParticle.force[2]           += force[2];
-                    calculateElectrostaticPairIxnT3_kernel( localParticle, psA[tj], scalingFactors, force); // T3 result accumulates on the y atom
-                    psA[tj].force[0]                 += force[0];
-                    psA[tj].force[1]                 += force[1];
-                    psA[tj].force[2]                 += force[2];
-                }
-                tj = (tj + 1) & (GRID - 1);
-            }
-            localParticle.force[0]    *= conversionFactor;
-            localParticle.force[1]    *= conversionFactor;
-            localParticle.force[2]    *= conversionFactor;
-            sA[threadIdx.x].force[0]  *= conversionFactor;
-            sA[threadIdx.x].force[1]  *= conversionFactor;
-            sA[threadIdx.x].force[2]  *= conversionFactor;
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-            offset                 = (x + tgx + warp*cSim.paddedNumberOfAtoms);
-            add3dArray( 3*offset, localParticle.force,  outputTorque );
-            offset                 = (y + tgx + warp*cSim.paddedNumberOfAtoms);
-            add3dArray( 3*offset, sA[threadIdx.x].force,  outputTorque );
-#else
-            offset                 = (x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms);
-            load3dArray(         3*offset, localParticle.force, outputTorque );
-            offset                 = (y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
-            load3dArray(       3*offset, sA[threadIdx.x].force, outputTorque );
-#endif
-            lasty = y; // remember which y tile is resident in shared memory
-        }
-        pos++;
-    }
-    cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += (conversionFactor*totalEnergy); // one energy slot per launched thread
-}
--- a/plugins/amoeba/platforms/cuda-old/src/kernels/kCalculateAmoebaCudaElectrostaticPotential.cu
+++ b/plugins/amoeba/platforms/cuda-old/src/kernels/kCalculateAmoebaCudaElectrostaticPotential.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-#include "cudaKernels.h"
-#include "amoebaCudaKernels.h"
-#include "kCalculateAmoebaCudaUtilities.h"
-#include "openmm/OpenMMException.h"
-#include <stdio.h>
-#include <cuda.h>
-#include <cstdlib>
-using namespace std; 
-#define SQRT sqrtf // single-precision square root alias
-static __constant__ cudaGmxSimulation cSim; // file-local constant-memory copy, filled by SetCalculateAmoebaMultipolePotentialSim
-static __constant__ cudaAmoebaGmxSimulation cAmoebaSim; // file-local constant-memory copy, filled by SetCalculateAmoebaMultipolePotentialSim
-extern __global__ void kFindInteractionsWithinBlocksPeriodic_kernel(unsigned int*); // kernel defined in another translation unit
-// Copy the host-side simulation structs (gpu->sim and amoebaGpu->amoebaSim) into this
-// file's __constant__ symbols cSim and cAmoebaSim. Each copy is checked with RTERROR.
-void SetCalculateAmoebaMultipolePotentialSim(amoebaGpuContext amoebaGpu)
-{
-    gpuContext gpu     = amoebaGpu->gpuContext;
-    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
-    RTERROR(status, "SetCalculateAmoebaMultipolePotentialSim: cudaMemcpyToSymbol: SetSim copy to cSim failed");
-    status = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation));
-    RTERROR(status, "SetCalculateAmoebaMultipolePotentialSim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed");
-}
-// Copy this file's __constant__ symbols cSim and cAmoebaSim back into the host-side
-// structs gpu->sim and amoebaGpu->amoebaSim. Each copy is checked with RTERROR.
-void GetCalculateAmoebaMultipolePotentialSim(amoebaGpuContext amoebaGpu)
-{
-    gpuContext gpu     = amoebaGpu->gpuContext;
-    cudaError_t status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
-    RTERROR(status, "GetCalculateAmoebaMultipolePotentialSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed");
-    status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));
-    RTERROR(status, "GetCalculateAmoebaMultipolePotentialSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
-}
-// Per-particle record used by the electrostatic potential kernels.
-// Field order and types define the layout; sizeof() of this struct sizes the shared-memory request.
-struct ElectrostaticPotentialParticle {
-    float x, y, z;                 // coordinates
-    float q;                       // charge
-    float labFrameDipole[3];       // lab frame dipole
-    float labFrameQuadrupole[9];   // lab frame quadrupole, 9 entries
-    float inducedDipole[3];        // induced dipole
-};
-/**---------------------------------------------------------------------------------------
-   Fill a particle record from the global simulation arrays: position and charge,
-   lab-frame dipole and quadrupole, and induced dipole
-   @param sA        record that receives the data
-   @param atomI     index of the particle to read
-   --------------------------------------------------------------------------------------- */
-static __device__ void loadElectrostaticPotentialParticle( volatile struct ElectrostaticPotentialParticle* sA, unsigned int atomI ){
-    // position in x/y/z, charge in w of the same pPosq entry
-    sA->x = cSim.pPosq[atomI].x;
-    sA->y = cSim.pPosq[atomI].y;
-    sA->z = cSim.pPosq[atomI].z;
-    sA->q = cSim.pPosq[atomI].w;
-    // 3 dipole components per particle
-    const unsigned int dipoleBase = atomI*3;
-    for( unsigned int k = 0; k < 3; k++ ){
-        sA->labFrameDipole[k] = cAmoebaSim.pLabFrameDipole[dipoleBase+k];
-    }
-    // 9 quadrupole components per particle
-    const unsigned int quadrupoleBase = atomI*9;
-    for( unsigned int k = 0; k < 9; k++ ){
-        sA->labFrameQuadrupole[k] = cAmoebaSim.pLabFrameQuadrupole[quadrupoleBase+k];
-    }
-    // 3 induced dipole components per particle
-    for( unsigned int k = 0; k < 3; k++ ){
-        sA->inducedDipole[k] = cAmoebaSim.pInducedDipole[dipoleBase+k];
-    }
-}
-/**---------------------------------------------------------------------------------------
-   Potential at one grid point due to one atom: charge, dipole (lab frame + induced)
-   and quadrupole terms. The atom-grid separation is wrapped by the periodic box sizes.
-   Code adapted from TINKER routine potpoint in potpoint.f
-   No guard is applied for a zero separation.
-   @param atomI     atomI's coordinates and multipole moments
-   @param gridPoint grid coordinates
-   @param potential output potential
-   --------------------------------------------------------------------------------------- */
-__device__ void calculateElectrostaticPotentialForAtomGridPoint_kernel( volatile ElectrostaticPotentialParticle& atomI, volatile float4& gridPoint, float* potential ){
-    // separation, atom minus grid point
-    float dx                 = atomI.x - gridPoint.x;
-    float dy                 = atomI.y - gridPoint.y;
-    float dz                 = atomI.z - gridPoint.z;
-    // wrap each component by the periodic box length
-    dx                      -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-    dy                      -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-    dz                      -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-    // inverse powers of the distance
-    float distSq             = dx*dx + dy*dy + dz*dz;
-    float dist               = sqrtf( distSq );
-    float inv1               = 1.0f/dist;
-    float inv2               = inv1*inv1;
-    float inv3               = inv1*inv2;
-    float inv5               = 3.0f*inv3*inv2;
-    // charge term
-    *potential               = atomI.q*inv1;
-    // dipole term: permanent plus induced dipole projected on the separation
-    float permDot            = atomI.labFrameDipole[0]*dx     +  atomI.labFrameDipole[1]*dy    + atomI.labFrameDipole[2]*dz;
-    float indDot             =  atomI.inducedDipole[0]*dx     +   atomI.inducedDipole[1]*dy    +  atomI.inducedDipole[2]*dz;
-    *potential              -= (permDot + indDot)*inv3;
-    // quadrupole term; only entries 0,1,2,4,5,8 of the 9-entry array are read
-    float quadDot            = dx*(atomI.labFrameQuadrupole[0]*dx + atomI.labFrameQuadrupole[1]*dy + atomI.labFrameQuadrupole[2]*dz);
-          quadDot           += dy*(atomI.labFrameQuadrupole[1]*dx + atomI.labFrameQuadrupole[4]*dy + atomI.labFrameQuadrupole[5]*dz);
-          quadDot           += dz*(atomI.labFrameQuadrupole[2]*dx + atomI.labFrameQuadrupole[5]*dy + atomI.labFrameQuadrupole[8]*dz);
-    *potential              += quadDot*inv5;
-}
-// Generate both N x PotentialGridSize kernels from kCalculateAmoebaCudaElectrostaticPotential.h.
-// Pass 1: USE_OUTPUT_BUFFER_PER_WARP undefined, names carry the NxG tag.
-#undef METHOD_NAME 
-#undef USE_OUTPUT_BUFFER_PER_WARP
-#define METHOD_NAME(a, b) a##NxG##b
-#include "kCalculateAmoebaCudaElectrostaticPotential.h"
-// Pass 2: USE_OUTPUT_BUFFER_PER_WARP defined, names carry the NxGByWarp tag.
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##NxGByWarp##b
-#define USE_OUTPUT_BUFFER_PER_WARP
-#include "kCalculateAmoebaCudaElectrostaticPotential.h"
-// Reduction kernel for the potential grid. For every padded grid point the partial
-// potentials of all cSim.outputBuffers buffers (spaced paddedPotentialGridSize apart)
-// are summed, scaled by electric/dielec and stored back into the first buffer.
-// Threads stride over the grid points by the total number of launched threads.
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
-#endif
-void kReducePotential_kernel()
-{
-    float scale = (cAmoebaSim.electric/cAmoebaSim.dielec);
-    for (unsigned int point = (blockIdx.x * blockDim.x + threadIdx.x); point < cAmoebaSim.paddedPotentialGridSize; point += gridDim.x*blockDim.x)
-    {
-        float* slot     = cAmoebaSim.pPotential + point;
-        float* cursor   = slot;
-        float sum       = 0.0f;
-        int remaining   = cSim.outputBuffers;
-        // four buffers per step
-        for (; remaining >= 4; remaining -= 4)
-        {
-            float a  = *cursor;
-            cursor  += cAmoebaSim.paddedPotentialGridSize;
-            float b  = *cursor;
-            cursor  += cAmoebaSim.paddedPotentialGridSize;
-            float c  = *cursor;
-            cursor  += cAmoebaSim.paddedPotentialGridSize;
-            float d  = *cursor;
-            cursor  += cAmoebaSim.paddedPotentialGridSize;
-            sum     += a + b + c + d;
-        }
-        // then a pair, if at least two are left
-        if (remaining >= 2)
-        {
-            float a    = *cursor;
-            cursor    += cAmoebaSim.paddedPotentialGridSize;
-            float b    = *cursor;
-            cursor    += cAmoebaSim.paddedPotentialGridSize;
-            sum       += a + b;
-            remaining -= 2;
-        }
-        // then the last single buffer, if any
-        if (remaining > 0)
-        {
-            sum += *cursor;
-        }
-        sum   *= scale;
-        *slot  = sum;
-    }
-}
-/**---------------------------------------------------------------------------------------
-   Launch kReducePotential_kernel with sim.blocks blocks of
-   sim.bsf_reduce_threads_per_block threads and check the launch
-   @param gpu        gpu context
-   --------------------------------------------------------------------------------------- */
-void kReducePotential(gpuContext gpu)
-{
-    kReducePotential_kernel<<<
-        gpu->sim.blocks,
-        gpu->sim.bsf_reduce_threads_per_block
-    >>>();
-    LAUNCHERROR("kReducePotential");
-}
-/**---------------------------------------------------------------------------------------
-   Launch the Amoeba electrostatic potential kernel, then reduce the potential buffers
-   The block size is chosen on the first call and reused by every later call.
-   @param amoebaGpu        amoebaGpu context
-   --------------------------------------------------------------------------------------- */
-void cudaComputeAmoebaElectrostaticPotential( amoebaGpuContext amoebaGpu ){
-    gpuContext gpu = amoebaGpu->gpuContext;
-    // cached across calls; 0 means "not yet computed"
-    static unsigned int threadsPerBlock = 0;
-    if( threadsPerBlock == 0 ){
-        // per-architecture ceiling (384 was tried for SM_20 and replaced by 512)
-        unsigned int maxThreads = (gpu->sm_version >= SM_20) ? 512 :
-                                  (gpu->sm_version >= SM_12) ? 128 : 64;
-        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(ElectrostaticPotentialParticle), gpu->sharedMemoryPerBlock), maxThreads);
-    }
-    // one ElectrostaticPotentialParticle of dynamic shared memory per thread
-    const size_t sharedBytes = sizeof(ElectrostaticPotentialParticle)*threadsPerBlock;
-    if (gpu->bOutputBufferPerWarp){
-        kCalculateAmoebaCudaElectrostaticPotentialNxGByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sharedBytes>>>( );
-    } else {
-        kCalculateAmoebaCudaElectrostaticPotentialNxG_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sharedBytes>>>( );
-    }
-    LAUNCHERROR("kCalculateAmoebaCudaElectrostaticPotential");
-    kReducePotential( gpu );
-}
-// Entry point for the multipole potential: run the multipole setup step
-// (second argument false), then compute the electrostatic potential.
-void kCalculateAmoebaMultipolePotential(amoebaGpuContext amoebaGpu ) 
-{
-    kSetupAmoebaMultipoleForces( amoebaGpu, false );
-    cudaComputeAmoebaElectrostaticPotential( amoebaGpu );
-}
--- a/plugins/amoeba/platforms/cuda-old/src/kernels/kCalculateAmoebaCudaElectrostaticPotential.h
+++ b/plugins/amoeba/platforms/cuda-old/src/kernels/kCalculateAmoebaCudaElectrostaticPotential.h
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-#include "amoebaScaleFactors.h"
-/*
- * Electrostatic potential kernel.
- *
- * Threads are organised in groups of GRID consecutive threads (the index the
- * code derives from the global thread id divided by GRID).  Every group takes a
- * contiguous slice of cAmoebaSim.pPotentialWorkUnit.  For each work unit a
- * thread owns one grid point (x + lane), loads one particle (y + lane) into its
- * own slot of dynamic shared memory, and then visits all GRID slots of its
- * group, summing the contributions that fall inside the valid atom and
- * grid-point ranges.
- *
- * Dynamic shared memory: one ElectrostaticPotentialParticle per thread.
- * NOTE(review): no barrier separates the shared-memory store from the reads of
- * the other slots in the group; the code appears to rely on the GRID threads
- * of a group executing together -- confirm GRID matches the warp size.
- */
-__global__
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(512, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(128, 1)
-#else
-__launch_bounds__(64, 1)
-#endif
-void METHOD_NAME(kCalculateAmoebaCudaElectrostaticPotential, _kernel)( void ){
-
-    extern __shared__ volatile ElectrostaticPotentialParticle sAPotential[];
-
-    // Position of this thread inside its GRID-wide group, and the group's
-    // first slot in shared memory; neither depends on the work unit.
-    const unsigned int lane       = threadIdx.x & (GRID - 1);
-    const unsigned int groupBase  = threadIdx.x - lane;
-    volatile ElectrostaticPotentialParticle* groupParticles = &sAPotential[groupBase];
-
-    // Slice of the work-unit list handled by this group: [first, lastUnit).
-    const unsigned int groupCount = gridDim.x*blockDim.x/GRID;
-    const unsigned int groupId    = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
-    const unsigned int unitCount  = cAmoebaSim.potentialWorkUnits;
-    const unsigned int lastUnit   = (groupId+1)*unitCount/groupCount;
-    unsigned int* units           = cAmoebaSim.pPotentialWorkUnit;
-
-    for (unsigned int unit = groupId*unitCount/groupCount; unit < lastUnit; unit++){
-
-        // Decode the work unit: x is the base grid-point index, y the base
-        // particle index of the cell.  The exclusion flag is decoded but unused.
-        unsigned int x;
-        unsigned int y;
-        bool bExclusionFlag;
-        decodeCell( units[unit], &x, &y, &bExclusionFlag );
-
-        const unsigned int gridPointIndex = x + lane;
-
-        // Load particle info for particle (y + lane) into this thread's slot.
-        loadElectrostaticPotentialParticle( &(sAPotential[threadIdx.x]), y + lane );
-
-        // The grid-point range test is the same for every slot visited below.
-        const bool gridPointInRange = gridPointIndex < cAmoebaSim.potentialGridSize;
-
-        // Visit all GRID slots of the group, starting at this thread's own
-        // slot and wrapping around with the (GRID - 1) mask.
-        float accumulated = 0.0f;
-        unsigned int slot = lane;
-        for (unsigned int step = 0; step < GRID; step++){
-            float potential;
-            calculateElectrostaticPotentialForAtomGridPoint_kernel( groupParticles[slot], cAmoebaSim.pPotentialGrid[gridPointIndex], &potential );
-
-            // The contribution is always computed, but only counted when both
-            // the particle and the grid point are in range.
-            if( (y + slot) < cSim.atoms && gridPointInRange ){
-                accumulated += potential;
-            }
-            slot = (slot + 1) & (GRID - 1);
-        }
-
-        // Write results
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-        // Output row selected by the group index; results are added to it.
-        const unsigned int offset      = gridPointIndex + groupId*cAmoebaSim.paddedPotentialGridSize;
-        cAmoebaSim.pPotential[offset] += accumulated;
-#else
-        // Output row selected by the particle block (y >> GRIDBITS); results overwrite it.
-        const unsigned int offset      = gridPointIndex + (y >> GRIDBITS)*cAmoebaSim.paddedPotentialGridSize;
-        cAmoebaSim.pPotential[offset]  = accumulated;
-#endif
-    }
-}
--- a/plugins/amoeba/platforms/cuda-old/src/kernels/kCalculateAmoebaCudaElectrostatic_b.h
+++ b/plugins/amoeba/platforms/cuda-old/src/kernels/kCalculateAmoebaCudaElectrostatic_b.h
-__device__ void SUB_METHOD_NAME( calculateElectrostaticPairIxn, _kernel )( ElectrostaticParticle& atomI, volatile ElectrostaticParticle& atomJ,
-                                                                           float* scalingFactors,
-#ifdef F1
-                                                                           float* energy,
-#endif
-                                                                           float* outputForce ){
-#ifdef F1
-    float ddsc3_0            = 0.0f;
-    float ddsc3_1            = 0.0f;
-    float ddsc3_2            = 0.0f;
-    float ddsc5_0            = 0.0f;
-    float ddsc5_1            = 0.0f;
-    float ddsc5_2            = 0.0f;
-    float ddsc7_0            = 0.0f;
-    float ddsc7_1            = 0.0f;
-    float ddsc7_2            = 0.0f;
-#endif
-    float xr                 = atomJ.x - atomI.x;
-    float yr                 = atomJ.y - atomI.y;
-    float zr                 = atomJ.z - atomI.z;
-    float r2                 = xr*xr + yr*yr + zr*zr;
-    float r                  = sqrtf( r2 );
-    float rr1                = 1.0f/r;
-    float rr2                = rr1*rr1;
-    float rr3                = rr1*rr2;
-    float rr5                = 3.0f*rr3*rr2;
-    float rr7                = 5.0f*rr5*rr2;
-    float rr9                = 7.0f*rr7*rr2;
-#ifdef F1
-    float rr11               = 9.0f*rr9*rr2;
-#endif
-    float scale3             = 1.0f;
-    float scale5             = 1.0f;
-    float scale7             = 1.0f;
-    float pdamp              = atomI.damp*atomJ.damp;
-    if( pdamp != 0.0 && r < cAmoebaSim.scalingDistanceCutoff ){
-        float ratio                   = r/pdamp;
-        float pGamma                  = atomJ.thole > atomI.thole ? atomI.thole : atomJ.thole;
-        float damp                    = ratio*ratio*ratio*pGamma;
-        float dampExp                 = expf( -damp );
-        float damp1                   = damp + 1.0f;
-        float damp2                   = damp*damp;
-        scale3                        = 1.0f - dampExp;
-        scale5                        = 1.0f - damp1*dampExp;
-        scale7                        = 1.0f - ( damp1 + 0.6f*damp2)*dampExp;
-#ifdef F1
-        float factor                  = 3.0f*damp*dampExp*rr2;
-        float factor7                 = -0.2f + 0.6f*damp;
-        ddsc3_0                       = factor*xr;
-        ddsc5_0                       = ddsc3_0*damp;
-        ddsc7_0                       = ddsc5_0*factor7;
-        ddsc3_1                       = factor*yr;
-        ddsc5_1                       = ddsc3_1*damp;
-        ddsc7_1                       = ddsc5_1*factor7;
-        ddsc3_2                       = factor*zr;
-        ddsc5_2                       = ddsc3_2*damp;
-        ddsc7_2                       = ddsc5_2*factor7;
-#endif
-    }
-#if defined F1
-    float scale3i            = rr3*scale3*scalingFactors[UScaleIndex];
-    float scale5i            = rr5*scale5*scalingFactors[UScaleIndex];
-#endif
-    float dsc3               = rr3*scale3*scalingFactors[DScaleIndex];
-    float psc3               = rr3*scale3*scalingFactors[PScaleIndex];
-    float dsc5               = rr5*scale5*scalingFactors[DScaleIndex];
-    float psc5               = rr5*scale5*scalingFactors[PScaleIndex];
-    float dsc7               = rr7*scale7*scalingFactors[DScaleIndex];
-    float psc7               = rr7*scale7*scalingFactors[PScaleIndex];
-    float qJr_0              = atomJ.labFrameQuadrupole[0]*xr + atomJ.labFrameQuadrupole[3]*yr + atomJ.labFrameQuadrupole[6]*zr;
-    float qJr_1              = atomJ.labFrameQuadrupole[1]*xr + atomJ.labFrameQuadrupole[4]*yr + atomJ.labFrameQuadrupole[7]*zr;
-    float qJr_2              = atomJ.labFrameQuadrupole[2]*xr + atomJ.labFrameQuadrupole[5]*yr + atomJ.labFrameQuadrupole[8]*zr;
-    float qIr_0              = atomI.labFrameQuadrupole[0]*xr + atomI.labFrameQuadrupole[3]*yr + atomI.labFrameQuadrupole[6]*zr;
-    float qIr_1              = atomI.labFrameQuadrupole[1]*xr + atomI.labFrameQuadrupole[4]*yr + atomI.labFrameQuadrupole[7]*zr;
-    float qIr_2              = atomI.labFrameQuadrupole[2]*xr + atomI.labFrameQuadrupole[5]*yr + atomI.labFrameQuadrupole[8]*zr;
-#if defined F1
-    float sc2                = atomI.labFrameDipole[0]*atomJ.labFrameDipole[0] + atomI.labFrameDipole[1]*atomJ.labFrameDipole[1] + atomI.labFrameDipole[2]*atomJ.labFrameDipole[2];
-#endif
-#if defined F1 || defined T1
-    float sc4                = atomJ.labFrameDipole[0]*xr + atomJ.labFrameDipole[1]*yr + atomJ.labFrameDipole[2]*zr;
-    float sc6                = qJr_0*xr + qJr_1*yr + qJr_2*zr;
-#endif
-#if defined F1 || defined T3
-    float sc3                = atomI.labFrameDipole[0]*xr + atomI.labFrameDipole[1]*yr + atomI.labFrameDipole[2]*zr;
-    float sc5                = qIr_0*xr + qIr_1*yr + qIr_2*zr;
-#endif
-#if defined F1
-    float sc7                = qIr_0*atomJ.labFrameDipole[0] + qIr_1*atomJ.labFrameDipole[1] + qIr_2*atomJ.labFrameDipole[2];
-    float sc8                = qJr_0*atomI.labFrameDipole[0] + qJr_1*atomI.labFrameDipole[1] + qJr_2*atomI.labFrameDipole[2];
-    float sc9                = qIr_0*qJr_0 + qIr_1*qJr_1 + qIr_2*qJr_2;
-	 float sc10               = atomI.labFrameQuadrupole[0]*atomJ.labFrameQuadrupole[0] + atomI.labFrameQuadrupole[1]*atomJ.labFrameQuadrupole[1] + atomI.labFrameQuadrupole[2]*atomJ.labFrameQuadrupole[2] +
-                               atomI.labFrameQuadrupole[3]*atomJ.labFrameQuadrupole[3] + atomI.labFrameQuadrupole[4]*atomJ.labFrameQuadrupole[4] + atomI.labFrameQuadrupole[5]*atomJ.labFrameQuadrupole[5] +
-                               atomI.labFrameQuadrupole[6]*atomJ.labFrameQuadrupole[6] + atomI.labFrameQuadrupole[7]*atomJ.labFrameQuadrupole[7] + atomI.labFrameQuadrupole[8]*atomJ.labFrameQuadrupole[8];
-    float sci1               = atomI.inducedDipole[0]*atomJ.labFrameDipole[0] + atomI.inducedDipole[1]*atomJ.labFrameDipole[1] + atomI.inducedDipole[2]*atomJ.labFrameDipole[2] +
-                               atomJ.inducedDipole[0]*atomI.labFrameDipole[0] + atomJ.inducedDipole[1]*atomI.labFrameDipole[1] + atomJ.inducedDipole[2]*atomI.labFrameDipole[2];
-#endif
-#if defined F1 || defined T3
-    float sci3               = atomI.inducedDipole[0]*xr + atomI.inducedDipole[1]*yr + atomI.inducedDipole[2]*zr;
-#endif
-#if defined F1
-    float sci7               = qIr_0*atomJ.inducedDipole[0] + qIr_1*atomJ.inducedDipole[1] + qIr_2*atomJ.inducedDipole[2];
-    float sci8               = qJr_0*atomI.inducedDipole[0] + qJr_1*atomI.inducedDipole[1] + qJr_2*atomI.inducedDipole[2];
-#endif
-#if defined F1 || defined T1
-    float sci4               = atomJ.inducedDipole[0]*xr + atomJ.inducedDipole[1]*yr + atomJ.inducedDipole[2]*zr;
-#endif
-#if defined F1
-    float scip1              = atomI.inducedDipoleP[0]*atomJ.labFrameDipole[0] + atomI.inducedDipoleP[1]*atomJ.labFrameDipole[1] + atomI.inducedDipoleP[2]*atomJ.labFrameDipole[2] +
-                               atomJ.inducedDipoleP[0]*atomI.labFrameDipole[0] + atomJ.inducedDipoleP[1]*atomI.labFrameDipole[1] + atomJ.inducedDipoleP[2]*atomI.labFrameDipole[2];
-    float scip2              = atomI.inducedDipole[0]*atomJ.inducedDipoleP[0] + atomI.inducedDipole[1]*atomJ.inducedDipoleP[1] + atomI.inducedDipole[2]*atomJ.inducedDipoleP[2] +
-                               atomJ.inducedDipole[0]*atomI.inducedDipoleP[0] + atomJ.inducedDipole[1]*atomI.inducedDipoleP[1] + atomJ.inducedDipole[2]*atomI.inducedDipoleP[2];
-#endif
-#if defined F1 || defined T3
-    float scip3              = ((atomI.inducedDipoleP[0])*(xr) + (atomI.inducedDipoleP[1])*(yr) + (atomI.inducedDipoleP[2])*(zr));
-#endif
-#if defined F1 || defined T1
-    float scip4              = ((atomJ.inducedDipoleP[0])*(xr) + (atomJ.inducedDipoleP[1])*(yr) + (atomJ.inducedDipoleP[2])*(zr));
-#endif
-#ifdef F1
-    float scip7              = ((qIr_0)*(atomJ.inducedDipoleP[0]) + (qIr_1)*(atomJ.inducedDipoleP[1]) + (qIr_2)*(atomJ.inducedDipoleP[2]));
-    float scip8              = ((qJr_0)*(atomI.inducedDipoleP[0]) + (qJr_1)*(atomI.inducedDipoleP[1]) + (qJr_2)*(atomI.inducedDipoleP[2]));
-    float gli1               = atomJ.q*sci3 - atomI.q*sci4;
-    float gli6               = sci1;
-    float glip1              = atomJ.q*scip3 - atomI.q*scip4;
-    float glip6              = scip1;
-    float gli2               = -sc3*sci4 - sci3*sc4;
-    float gli3               = sci3*sc6 - sci4*sc5;
-    float gli7               = 2.0f*(sci7-sci8);
-    float glip2              = -sc3*scip4 - scip3*sc4;
-    float glip3              = scip3*sc6 - scip4*sc5;
-    float glip7              = 2.0f*(scip7-scip8);
-    float factor3            = rr3*(( gli1  +  gli6)*scalingFactors[PScaleIndex] + (glip1  + glip6)*scalingFactors[DScaleIndex]);
-    float factor5            = rr5*(( gli2  +  gli7)*scalingFactors[PScaleIndex] + (glip2  + glip7)*scalingFactors[DScaleIndex]);
-    float factor7            = rr7*( gli3*scalingFactors[PScaleIndex] + glip3*scalingFactors[DScaleIndex]);
-    float ftm2i_0            = -0.5f*(factor3*ddsc3_0 + factor5*ddsc5_0 + factor7*ddsc7_0);
-    float ftm2i_1            = -0.5f*(factor3*ddsc3_1 + factor5*ddsc5_1 + factor7*ddsc7_1);
-    float ftm2i_2            = -0.5f*(factor3*ddsc3_2 + factor5*ddsc5_2 + factor7*ddsc7_2);
-    float gl0                = atomI.q*atomJ.q;
-    float gl1                = atomJ.q*sc3 - atomI.q*sc4;
-    float gl2                = atomI.q*sc6 + atomJ.q*sc5 - sc3*sc4;
-    float gl3                = sc3*sc6 - sc4*sc5;
-    float gl4                = sc5*sc6;
-    float gl6                = sc2;
-    float gl7                = 2.0f*(sc7-sc8);
-    float gl8                = 2.0f*sc10;
-    float gl5                = -4.0f*sc9;
-    float gf1                = rr3*gl0 + rr5*(gl1+gl6) + rr7*(gl2+gl7+gl8) + rr9*(gl3+gl5) + rr11*gl4;
-#endif
-#if defined F1 || defined T1
-    float gf2                = -atomJ.q*rr3 + sc4*rr5 - sc6*rr7;
-    float gf5                = 2.0f*(-atomJ.q*rr5+sc4*rr7-sc6*rr9);
-#endif
-#if defined F1 || defined T3
-    float gf3                =  atomI.q*rr3 + sc3*rr5 + sc5*rr7;
-    float gf6                = 2.0f*(-atomI.q*rr5-sc3*rr7-sc5*rr9);
-#endif
-#ifdef F1
-    float em                 = scalingFactors[MScaleIndex]*(rr1*gl0 + rr3*(gl1+gl6) + rr5*(gl2+gl7+gl8) + rr7*(gl3+gl5) + rr9*gl4);
-    float ei                 = 0.5f*((gli1+gli6)*psc3 + (gli2+gli7)*psc5 + gli3*psc7);
-    *energy                  = em+ei;
-#endif
-#if defined F1 || defined T1
-    float qIdJ_0 = atomI.labFrameQuadrupole[0]*atomJ.labFrameDipole[0] + atomI.labFrameQuadrupole[3]*atomJ.labFrameDipole[1] + atomI.labFrameQuadrupole[6]*atomJ.labFrameDipole[2];
-    float qIdJ_1 = atomI.labFrameQuadrupole[1]*atomJ.labFrameDipole[0] + atomI.labFrameQuadrupole[4]*atomJ.labFrameDipole[1] + atomI.labFrameQuadrupole[7]*atomJ.labFrameDipole[2];
-    float qIdJ_2 = atomI.labFrameQuadrupole[2]*atomJ.labFrameDipole[0] + atomI.labFrameQuadrupole[5]*atomJ.labFrameDipole[1] + atomI.labFrameQuadrupole[8]*atomJ.labFrameDipole[2];
-    float qIqJr_0 = atomI.labFrameQuadrupole[0]*qJr_0 + atomI.labFrameQuadrupole[3]*qJr_1 + atomI.labFrameQuadrupole[6]*qJr_2;
-    float qIqJr_1 = atomI.labFrameQuadrupole[1]*qJr_0 + atomI.labFrameQuadrupole[4]*qJr_1 + atomI.labFrameQuadrupole[7]*qJr_2;
-    float qIqJr_2 = atomI.labFrameQuadrupole[2]*qJr_0 + atomI.labFrameQuadrupole[5]*qJr_1 + atomI.labFrameQuadrupole[8]*qJr_2;
-#endif
-#ifdef F1
-    float qkqir_0 = atomJ.labFrameQuadrupole[0]*qIr_0 + atomJ.labFrameQuadrupole[3]*qIr_1 + atomJ.labFrameQuadrupole[6]*qIr_2;
-    float qkqir_1 = atomJ.labFrameQuadrupole[1]*qIr_0 + atomJ.labFrameQuadrupole[4]*qIr_1 + atomJ.labFrameQuadrupole[7]*qIr_2;
-    float qkqir_2 = atomJ.labFrameQuadrupole[2]*qIr_0 + atomJ.labFrameQuadrupole[5]*qIr_1 + atomJ.labFrameQuadrupole[8]*qIr_2;
-    float qkdi_0 = atomJ.labFrameQuadrupole[0]*atomI.labFrameDipole[0] + atomJ.labFrameQuadrupole[3]*atomI.labFrameDipole[1] + atomJ.labFrameQuadrupole[6]*atomI.labFrameDipole[2];
-    float qkdi_1 = atomJ.labFrameQuadrupole[1]*atomI.labFrameDipole[0] + atomJ.labFrameQuadrupole[4]*atomI.labFrameDipole[1] + atomJ.labFrameQuadrupole[7]*atomI.labFrameDipole[2];
-    float qkdi_2 = atomJ.labFrameQuadrupole[2]*atomI.labFrameDipole[0] + atomJ.labFrameQuadrupole[5]*atomI.labFrameDipole[1] + atomJ.labFrameQuadrupole[8]*atomI.labFrameDipole[2];
-    float ftm2_0   = scalingFactors[MScaleIndex]*(gf1*xr + gf2*atomI.labFrameDipole[0] + gf3*atomJ.labFrameDipole[0] + 2.0f*rr5*(qkdi_0 - qIdJ_0) + gf5*qIr_0 + gf6*qJr_0 + 4.0f*rr7*(qIqJr_0 + qkqir_0));
-    float ftm2_1   = scalingFactors[MScaleIndex]*(gf1*yr + gf2*atomI.labFrameDipole[1] + gf3*atomJ.labFrameDipole[1] + 2.0f*rr5*(qkdi_1 - qIdJ_1) + gf5*qIr_1 + gf6*qJr_1 + 4.0f*rr7*(qIqJr_1 + qkqir_1));
-    float ftm2_2   = scalingFactors[MScaleIndex]*(gf1*zr + gf2*atomI.labFrameDipole[2] + gf3*atomJ.labFrameDipole[2] + 2.0f*rr5*(qkdi_2 - qIdJ_2) + gf5*qIr_2 + gf6*qJr_2 + 4.0f*rr7*(qIqJr_2 + qkqir_2));
-    float gfi1 = rr2*(1.5f*((gli1+gli6)*psc3 + (glip1+glip6)*dsc3 + scip2*scale3i) + 2.5f*((gli7+gli2)*psc5 + (glip7+glip2)*dsc5 - (sci3*scip4+scip3*sci4)*scale5i) + 3.5f*(gli3*psc7+glip3*dsc7));
-    ftm2i_0   += gfi1*xr;
-    ftm2i_1   += gfi1*yr;
-    ftm2i_2   += gfi1*zr;
-#endif
-#if defined F1 || defined T1
-    float gfi5 =  (sci4*psc7 + scip4*dsc7);
-#endif
-#if defined F1 || defined T3
-    float gfi6 = -(sci3*psc7 + scip3*dsc7);
-#endif
-#if defined F1 || defined T1
-    float qIuJ_0 = atomI.labFrameQuadrupole[0]*atomJ.inducedDipole[0]   + atomI.labFrameQuadrupole[3]*atomJ.inducedDipole[1]  + atomI.labFrameQuadrupole[6]*atomJ.inducedDipole[2];
-    float qIuJ_1 = atomI.labFrameQuadrupole[1]*atomJ.inducedDipole[0]   + atomI.labFrameQuadrupole[4]*atomJ.inducedDipole[1]  + atomI.labFrameQuadrupole[7]*atomJ.inducedDipole[2];
-    float qIuJ_2 = atomI.labFrameQuadrupole[2]*atomJ.inducedDipole[0]   + atomI.labFrameQuadrupole[5]*atomJ.inducedDipole[1]  + atomI.labFrameQuadrupole[8]*atomJ.inducedDipole[2];
-    float qIuJp_0 = atomI.labFrameQuadrupole[0]*atomJ.inducedDipoleP[0] + atomI.labFrameQuadrupole[3]*atomJ.inducedDipoleP[1] + atomI.labFrameQuadrupole[6]*atomJ.inducedDipoleP[2];
-    float qIuJp_1 = atomI.labFrameQuadrupole[1]*atomJ.inducedDipoleP[0] + atomI.labFrameQuadrupole[4]*atomJ.inducedDipoleP[1] + atomI.labFrameQuadrupole[7]*atomJ.inducedDipoleP[2];
-    float qIuJp_2 = atomI.labFrameQuadrupole[2]*atomJ.inducedDipoleP[0] + atomI.labFrameQuadrupole[5]*atomJ.inducedDipoleP[1] + atomI.labFrameQuadrupole[8]*atomJ.inducedDipoleP[2];
-#endif
-#if defined T3
-    float qJuIp_0 = atomJ.labFrameQuadrupole[0]*atomI.inducedDipoleP[0] + atomJ.labFrameQuadrupole[3]*atomI.inducedDipoleP[1] + atomJ.labFrameQuadrupole[6]*atomI.inducedDipoleP[2];
-    float qJuIp_1 = atomJ.labFrameQuadrupole[1]*atomI.inducedDipoleP[0] + atomJ.labFrameQuadrupole[4]*atomI.inducedDipoleP[1] + atomJ.labFrameQuadrupole[7]*atomI.inducedDipoleP[2];
-    float qJuIp_2 = atomJ.labFrameQuadrupole[2]*atomI.inducedDipoleP[0] + atomJ.labFrameQuadrupole[5]*atomI.inducedDipoleP[1] + atomJ.labFrameQuadrupole[8]*atomI.inducedDipoleP[2];
-     float qJuI_0 = atomJ.labFrameQuadrupole[0]*atomI.inducedDipole[0] + atomJ.labFrameQuadrupole[3]*atomI.inducedDipole[1] + atomJ.labFrameQuadrupole[6]*atomI.inducedDipole[2];
-     float qJuI_1 = atomJ.labFrameQuadrupole[1]*atomI.inducedDipole[0] + atomJ.labFrameQuadrupole[4]*atomI.inducedDipole[1] + atomJ.labFrameQuadrupole[7]*atomI.inducedDipole[2];
-     float qJuI_2 = atomJ.labFrameQuadrupole[2]*atomI.inducedDipole[0] + atomJ.labFrameQuadrupole[5]*atomI.inducedDipole[1] + atomJ.labFrameQuadrupole[8]*atomI.inducedDipole[2];
-#endif
-#ifdef F1
-    float qkui_0 = atomJ.labFrameQuadrupole[0]*atomI.inducedDipole[0] + atomJ.labFrameQuadrupole[3]*atomI.inducedDipole[1] + atomJ.labFrameQuadrupole[6]*atomI.inducedDipole[2];
-    float qkui_1 = atomJ.labFrameQuadrupole[1]*atomI.inducedDipole[0] + atomJ.labFrameQuadrupole[4]*atomI.inducedDipole[1] + atomJ.labFrameQuadrupole[7]*atomI.inducedDipole[2];
-    float qkui_2 = atomJ.labFrameQuadrupole[2]*atomI.inducedDipole[0] + atomJ.labFrameQuadrupole[5]*atomI.inducedDipole[1] + atomJ.labFrameQuadrupole[8]*atomI.inducedDipole[2];
-    float qkuip_0 = atomJ.labFrameQuadrupole[0]*atomI.inducedDipoleP[0] + atomJ.labFrameQuadrupole[3]*atomI.inducedDipoleP[1] + atomJ.labFrameQuadrupole[6]*atomI.inducedDipoleP[2];
-    float qkuip_1 = atomJ.labFrameQuadrupole[1]*atomI.inducedDipoleP[0] + atomJ.labFrameQuadrupole[4]*atomI.inducedDipoleP[1] + atomJ.labFrameQuadrupole[7]*atomI.inducedDipoleP[2];
-    float qkuip_2 = atomJ.labFrameQuadrupole[2]*atomI.inducedDipoleP[0] + atomJ.labFrameQuadrupole[5]*atomI.inducedDipoleP[1] + atomJ.labFrameQuadrupole[8]*atomI.inducedDipoleP[2];
-    ftm2i_0 += 0.5f*(-atomJ.q*(atomI.inducedDipole[0]*psc3 + atomI.inducedDipoleP[0]*dsc3) +
-                    sc4*(atomI.inducedDipole[0]*psc5 + atomI.inducedDipoleP[0]*dsc5) -
-                    sc6*(atomI.inducedDipole[0]*psc7 + atomI.inducedDipoleP[0]*dsc7)) +
-                   0.5f*(atomI.q*(atomJ.inducedDipole[0]*psc3+atomJ.inducedDipoleP[0]*dsc3) +
-                     sc3*(atomJ.inducedDipole[0]*psc5 +atomJ.inducedDipoleP[0]*dsc5) +
-                     sc5*(atomJ.inducedDipole[0]*psc7 +atomJ.inducedDipoleP[0]*dsc7)) +
-                     scale5i*(sci4*atomI.inducedDipoleP[0]+scip4*atomI.inducedDipole[0] +
-                     sci3*atomJ.inducedDipoleP[0]+scip3*atomJ.inducedDipole[0])*0.5f +
-                    0.5f*(sci4*psc5+scip4*dsc5)*atomI.labFrameDipole[0] +
-                    0.5f*(sci3*psc5+scip3*dsc5)*atomJ.labFrameDipole[0] +
-                    ((qkui_0-qIuJ_0)*psc5 + (qkuip_0-qIuJp_0)*dsc5) +
-                    gfi5*qIr_0 + gfi6*qJr_0;
-    ftm2i_1 += 0.5f*(-atomJ.q*(atomI.inducedDipole[1]*psc3 + atomI.inducedDipoleP[1]*dsc3) +
-                    sc4*(atomI.inducedDipole[1]*psc5 + atomI.inducedDipoleP[1]*dsc5) -
-                    sc6*(atomI.inducedDipole[1]*psc7 + atomI.inducedDipoleP[1]*dsc7)) +
-                    (atomI.q*(atomJ.inducedDipole[1]*psc3+atomJ.inducedDipoleP[1]*dsc3) +
-                     sc3*(atomJ.inducedDipole[1]*psc5 +atomJ.inducedDipoleP[1]*dsc5) +
-                     sc5*(atomJ.inducedDipole[1]*psc7 +atomJ.inducedDipoleP[1]*dsc7))*0.5f +
-                     scale5i*(sci4*atomI.inducedDipoleP[1]+scip4*atomI.inducedDipole[1] + sci3*atomJ.inducedDipoleP[1]+scip3*atomJ.inducedDipole[1])*0.5f +
-                    0.5f*(sci4*psc5+scip4*dsc5)*atomI.labFrameDipole[1] +
-                    0.5f*(sci3*psc5+scip3*dsc5)*atomJ.labFrameDipole[1] +
-                    ((qkui_1-qIuJ_1)*psc5 + (qkuip_1-qIuJp_1)*dsc5) +
-                    gfi5*qIr_1 + gfi6*qJr_1;
-    ftm2i_2 += 0.5f*(-atomJ.q*(atomI.inducedDipole[2]*psc3 + atomI.inducedDipoleP[2]*dsc3) +
-                    sc4*(atomI.inducedDipole[2]*psc5 + atomI.inducedDipoleP[2]*dsc5) -
-                    sc6*(atomI.inducedDipole[2]*psc7 + atomI.inducedDipoleP[2]*dsc7)) +
-                    (atomI.q*(atomJ.inducedDipole[2]*psc3+atomJ.inducedDipoleP[2]*dsc3) +
-                     sc3*(atomJ.inducedDipole[2]*psc5 +atomJ.inducedDipoleP[2]*dsc5) +
-                     sc5*(atomJ.inducedDipole[2]*psc7 +atomJ.inducedDipoleP[2]*dsc7))*0.5f +
-                     scale5i*(sci4*atomI.inducedDipoleP[2]+scip4*atomI.inducedDipole[2] +
-                     sci3*atomJ.inducedDipoleP[2]+scip3*atomJ.inducedDipole[2])*0.5f +
-                    0.5f*(sci4*psc5+scip4*dsc5)*atomI.labFrameDipole[2] +
-                    0.5f*(sci3*psc5+scip3*dsc5)*atomJ.labFrameDipole[2] +
-                    ((qkui_2-qIuJ_2)*psc5 + (qkuip_2-qIuJp_2)*dsc5) +
-                    gfi5*qIr_2 + gfi6*qJr_2;
-    if( cAmoebaSim.polarizationType )
-    {
-        float gfd                 = 0.5*(3.0*rr2*scip2*scale3i - 5.0f*rr2*(scip3*sci4+sci3*scip4)*scale5i);
-        float temp5               = 0.5*scale5i;
-        float fdir_0              = gfd*xr + temp5*(sci4*atomI.inducedDipoleP[0] + scip4*atomI.inducedDipole[0] + sci3*atomJ.inducedDipoleP[0] + scip3*atomJ.inducedDipole[0]);
-        float fdir_1              = gfd*yr + temp5*(sci4*atomI.inducedDipoleP[1] + scip4*atomI.inducedDipole[1] + sci3*atomJ.inducedDipoleP[1] + scip3*atomJ.inducedDipole[1]);
-        float fdir_2              = gfd*zr + temp5*(sci4*atomI.inducedDipoleP[2] + scip4*atomI.inducedDipole[2] + sci3*atomJ.inducedDipoleP[2] + scip3*atomJ.inducedDipole[2]);
-        ftm2i_0                  -= fdir_0;
-        ftm2i_1                  -= fdir_1;
-        ftm2i_2                  -= fdir_2;
-    } else {
-        float scaleF              = 0.5f*scalingFactors[UScaleIndex];
-        float inducedFactor3      = scip2*rr3*scaleF;
-        float inducedFactor5      = (sci3*scip4+scip3*sci4)*rr5*scaleF;
-        float findmp_0            = inducedFactor3*ddsc3_0 - inducedFactor5*ddsc5_0;
-        float findmp_1            = inducedFactor3*ddsc3_1 - inducedFactor5*ddsc5_1;
-        float findmp_2            = inducedFactor3*ddsc3_2 - inducedFactor5*ddsc5_2;
-        ftm2i_0                  -= findmp_0;
-        ftm2i_1                  -= findmp_1;
-        ftm2i_2                  -= findmp_2;
-    }
-#endif
-#if defined T1
-    float gti2 = 0.5f*(sci4*psc5+scip4*dsc5);
-    float gti5 = gfi5;
-#endif
-#if defined T3
-    float gti3 = 0.5f*(sci3*psc5+scip3*dsc5);
-    float gti6 = gfi6;
-#endif
-#if defined T1 || defined T3
-    float dixdk_0 = atomI.labFrameDipole[1]*atomJ.labFrameDipole[2] - atomI.labFrameDipole[2]*atomJ.labFrameDipole[1];
-    float dixdk_1 = atomI.labFrameDipole[2]*atomJ.labFrameDipole[0] - atomI.labFrameDipole[0]*atomJ.labFrameDipole[2];
-    float dixdk_2 = atomI.labFrameDipole[0]*atomJ.labFrameDipole[1] - atomI.labFrameDipole[1]*atomJ.labFrameDipole[0];
-#if defined T1
-    float dixuk_0 = atomI.labFrameDipole[1]*atomJ.inducedDipole[2] - atomI.labFrameDipole[2]*atomJ.inducedDipole[1];
-    float dixuk_1 = atomI.labFrameDipole[2]*atomJ.inducedDipole[0] - atomI.labFrameDipole[0]*atomJ.inducedDipole[2];
-    float dixuk_2 = atomI.labFrameDipole[0]*atomJ.inducedDipole[1] - atomI.labFrameDipole[1]*atomJ.inducedDipole[0];
-#endif
-#endif
-#ifdef T1
-    float dixukp_0 = atomI.labFrameDipole[1]*atomJ.inducedDipoleP[2] - atomI.labFrameDipole[2]*atomJ.inducedDipoleP[1];
-    float dixukp_1 = atomI.labFrameDipole[2]*atomJ.inducedDipoleP[0] - atomI.labFrameDipole[0]*atomJ.inducedDipoleP[2];
-    float dixukp_2 = atomI.labFrameDipole[0]*atomJ.inducedDipoleP[1] - atomI.labFrameDipole[1]*atomJ.inducedDipoleP[0];
-#endif
-#ifdef T1
-    float dixr_0 = atomI.labFrameDipole[1]*zr - atomI.labFrameDipole[2]*yr;
-    float dixr_1 = atomI.labFrameDipole[2]*xr - atomI.labFrameDipole[0]*zr;
-    float dixr_2 = atomI.labFrameDipole[0]*yr - atomI.labFrameDipole[1]*xr;
-#endif
-#ifdef T1
-    float rxqiukp_0 = yr*qIuJp_2 - zr*qIuJp_1;
-    float rxqiukp_1 = zr*qIuJp_0 - xr*qIuJp_2;
-    float rxqiukp_2 = xr*qIuJp_1 - yr*qIuJp_0;
-    float rxqir_0   = yr*qIr_2 - zr*qIr_1;
-    float rxqir_1   = zr*qIr_0 - xr*qIr_2;
-    float rxqir_2   = xr*qIr_1 - yr*qIr_0;
-    float rxqiuk_0 = yr*qIuJ_2 - zr*qIuJ_1;
-    float rxqiuk_1 = zr*qIuJ_0 - xr*qIuJ_2;
-    float rxqiuk_2 = xr*qIuJ_1 - yr*qIuJ_0;
-    float ukxqir_0 = atomJ.inducedDipole[1]*qIr_2 - atomJ.inducedDipole[2]*qIr_1;
-    float ukxqir_1 = atomJ.inducedDipole[2]*qIr_0 - atomJ.inducedDipole[0]*qIr_2;
-    float ukxqir_2 = atomJ.inducedDipole[0]*qIr_1 - atomJ.inducedDipole[1]*qIr_0;
-    float ukxqirp_0 = atomJ.inducedDipoleP[1]*qIr_2 - atomJ.inducedDipoleP[2]*qIr_1;
-    float ukxqirp_1 = atomJ.inducedDipoleP[2]*qIr_0 - atomJ.inducedDipoleP[0]*qIr_2;
-    float ukxqirp_2 = atomJ.inducedDipoleP[0]*qIr_1 - atomJ.inducedDipoleP[1]*qIr_0;
-    float dixqkr_0 = atomI.labFrameDipole[1]*qJr_2 - atomI.labFrameDipole[2]*qJr_1;
-    float dixqkr_1 = atomI.labFrameDipole[2]*qJr_0 - atomI.labFrameDipole[0]*qJr_2;
-    float dixqkr_2 = atomI.labFrameDipole[0]*qJr_1 - atomI.labFrameDipole[1]*qJr_0;
-    float dkxqir_0 = atomJ.labFrameDipole[1]*qIr_2 - atomJ.labFrameDipole[2]*qIr_1;
-    float dkxqir_1 = atomJ.labFrameDipole[2]*qIr_0 - atomJ.labFrameDipole[0]*qIr_2;
-    float dkxqir_2 = atomJ.labFrameDipole[0]*qIr_1 - atomJ.labFrameDipole[1]*qIr_0;
-    float rxqikr_0 = yr*qIqJr_2 - zr*qIqJr_1;
-    float rxqikr_1 = zr*qIqJr_0 - xr*qIqJr_2;
-    float rxqikr_2 = xr*qIqJr_1 - yr*qIqJr_0;
-    float rxqidk_0 = yr*qIdJ_2 - zr*qIdJ_1;
-    float rxqidk_1 = zr*qIdJ_0 - xr*qIdJ_2;
-    float rxqidk_2 = xr*qIdJ_1 - yr*qIdJ_0;
-    float qkrxqir_0 = qJr_1*qIr_2 - qJr_2*qIr_1;
-    float qkrxqir_1 = qJr_2*qIr_0 - qJr_0*qIr_2;
-    float qkrxqir_2 = qJr_0*qIr_1 - qJr_1*qIr_0;
-#endif
-#if defined T1 || defined T3
-    float qixqk_0 = atomI.labFrameQuadrupole[1]*atomJ.labFrameQuadrupole[2] + atomI.labFrameQuadrupole[4]*atomJ.labFrameQuadrupole[5] + atomI.labFrameQuadrupole[7]*atomJ.labFrameQuadrupole[8] -
-                    atomI.labFrameQuadrupole[2]*atomJ.labFrameQuadrupole[1] - atomI.labFrameQuadrupole[5]*atomJ.labFrameQuadrupole[4] - atomI.labFrameQuadrupole[8]*atomJ.labFrameQuadrupole[7];
-    float qixqk_1 = atomI.labFrameQuadrupole[2]*atomJ.labFrameQuadrupole[0] + atomI.labFrameQuadrupole[5]*atomJ.labFrameQuadrupole[3] + atomI.labFrameQuadrupole[8]*atomJ.labFrameQuadrupole[6] -
-                    atomI.labFrameQuadrupole[0]*atomJ.labFrameQuadrupole[2] - atomI.labFrameQuadrupole[3]*atomJ.labFrameQuadrupole[5] - atomI.labFrameQuadrupole[6]*atomJ.labFrameQuadrupole[8];
-    float qixqk_2 = atomI.labFrameQuadrupole[0]*atomJ.labFrameQuadrupole[1] + atomI.labFrameQuadrupole[3]*atomJ.labFrameQuadrupole[4] + atomI.labFrameQuadrupole[6]*atomJ.labFrameQuadrupole[7] -
-                    atomI.labFrameQuadrupole[1]*atomJ.labFrameQuadrupole[0] - atomI.labFrameQuadrupole[4]*atomJ.labFrameQuadrupole[3] - atomI.labFrameQuadrupole[7]*atomJ.labFrameQuadrupole[6];
-#endif
-#ifdef T1
-    float ttm2_0  = -rr3*dixdk_0 + gf2*dixr_0-gf5*rxqir_0 + 2.0f*rr5*(dixqkr_0 + dkxqir_0 + rxqidk_0-2.0f*qixqk_0) - 4.0f*rr7*(rxqikr_0 + qkrxqir_0);
-    float ttm2_1  = -rr3*dixdk_1 + gf2*dixr_1-gf5*rxqir_1 + 2.0f*rr5*(dixqkr_1 + dkxqir_1 + rxqidk_1-2.0f*qixqk_1) - 4.0f*rr7*(rxqikr_1 + qkrxqir_1);
-    float ttm2_2  = -rr3*dixdk_2 + gf2*dixr_2-gf5*rxqir_2 + 2.0f*rr5*(dixqkr_2 + dkxqir_2 + rxqidk_2-2.0f*qixqk_2) - 4.0f*rr7*(rxqikr_2 + qkrxqir_2);
-    float ttm2i_0 = -(dixuk_0*psc3+dixukp_0*dsc3)*0.5f + gti2*dixr_0 + ((ukxqir_0+ rxqiuk_0)*psc5 + (ukxqirp_0 + rxqiukp_0)*dsc5) - gti5*rxqir_0;
-    float ttm2i_1 = -(dixuk_1*psc3+dixukp_1*dsc3)*0.5f + gti2*dixr_1 + ((ukxqir_1+ rxqiuk_1)*psc5 + (ukxqirp_1 + rxqiukp_1)*dsc5) - gti5*rxqir_1;
-    float ttm2i_2 = -(dixuk_2*psc3+dixukp_2*dsc3)*0.5f + gti2*dixr_2 + ((ukxqir_2+ rxqiuk_2)*psc5 + (ukxqirp_2 + rxqiukp_2)*dsc5) - gti5*rxqir_2;
-#endif
-#ifdef T3
-    float qJqIr_0 = atomJ.labFrameQuadrupole[0]*qIr_0 + atomJ.labFrameQuadrupole[3]*qIr_1 + atomJ.labFrameQuadrupole[6]*qIr_2;
-    float qJqIr_1 = atomJ.labFrameQuadrupole[1]*qIr_0 + atomJ.labFrameQuadrupole[4]*qIr_1 + atomJ.labFrameQuadrupole[7]*qIr_2;
-    float qJqIr_2 = atomJ.labFrameQuadrupole[2]*qIr_0 + atomJ.labFrameQuadrupole[5]*qIr_1 + atomJ.labFrameQuadrupole[8]*qIr_2;
-    float qJdI_0 = atomJ.labFrameQuadrupole[0]*atomI.labFrameDipole[0] + atomJ.labFrameQuadrupole[3]*atomI.labFrameDipole[1] + atomJ.labFrameQuadrupole[6]*atomI.labFrameDipole[2];
-    float qJdI_1 = atomJ.labFrameQuadrupole[1]*atomI.labFrameDipole[0] + atomJ.labFrameQuadrupole[4]*atomI.labFrameDipole[1] + atomJ.labFrameQuadrupole[7]*atomI.labFrameDipole[2];
-    float qJdI_2 = atomJ.labFrameQuadrupole[2]*atomI.labFrameDipole[0] + atomJ.labFrameQuadrupole[5]*atomI.labFrameDipole[1] + atomJ.labFrameQuadrupole[8]*atomI.labFrameDipole[2];
-    float dkxr_0 = atomJ.labFrameDipole[1]*zr - atomJ.labFrameDipole[2]*yr;
-    float dkxr_1 = atomJ.labFrameDipole[2]*xr - atomJ.labFrameDipole[0]*zr;
-    float dkxr_2 = atomJ.labFrameDipole[0]*yr - atomJ.labFrameDipole[1]*xr;
-    float rxqkr_0 = yr*qJr_2 - zr*qJr_1;
-    float rxqkr_1 = zr*qJr_0 - xr*qJr_2;
-    float rxqkr_2 = xr*qJr_1 - yr*qJr_0;
-    float dixqkr_0 = atomI.labFrameDipole[1]*qJr_2 - atomI.labFrameDipole[2]*qJr_1;
-    float dixqkr_1 = atomI.labFrameDipole[2]*qJr_0 - atomI.labFrameDipole[0]*qJr_2;
-    float dixqkr_2 = atomI.labFrameDipole[0]*qJr_1 - atomI.labFrameDipole[1]*qJr_0;
-    float dkxqir_0 = atomJ.labFrameDipole[1]*qIr_2 - atomJ.labFrameDipole[2]*qIr_1;
-    float dkxqir_1 = atomJ.labFrameDipole[2]*qIr_0 - atomJ.labFrameDipole[0]*qIr_2;
-    float dkxqir_2 = atomJ.labFrameDipole[0]*qIr_1 - atomJ.labFrameDipole[1]*qIr_0;
-    float rxqkdi_0 = yr*qJdI_2 - zr*qJdI_1;
-    float rxqkdi_1 = zr*qJdI_0 - xr*qJdI_2;
-    float rxqkdi_2 = xr*qJdI_1 - yr*qJdI_0;
-    float rxqkir_0 = yr*qJqIr_2 - zr*qJqIr_1;
-    float rxqkir_1 = zr*qJqIr_0 - xr*qJqIr_2;
-    float rxqkir_2 = xr*qJqIr_1 - yr*qJqIr_0;
-    float qkrxqir_0 = qJr_1*qIr_2 - qJr_2*qIr_1;
-    float qkrxqir_1 = qJr_2*qIr_0 - qJr_0*qIr_2;
-    float qkrxqir_2 = qJr_0*qIr_1 - qJr_1*qIr_0;
-    float dkxui_0 = atomJ.labFrameDipole[1]*atomI.inducedDipole[2] - atomJ.labFrameDipole[2]*atomI.inducedDipole[1];
-    float dkxui_1 = atomJ.labFrameDipole[2]*atomI.inducedDipole[0] - atomJ.labFrameDipole[0]*atomI.inducedDipole[2]; 
-    float dkxui_2 = atomJ.labFrameDipole[0]*atomI.inducedDipole[1] - atomJ.labFrameDipole[1]*atomI.inducedDipole[0];
-    float dkxuip_0 = atomJ.labFrameDipole[1]*atomI.inducedDipoleP[2] - atomJ.labFrameDipole[2]*atomI.inducedDipoleP[1];
-    float dkxuip_1 = atomJ.labFrameDipole[2]*atomI.inducedDipoleP[0] - atomJ.labFrameDipole[0]*atomI.inducedDipoleP[2];
-    float dkxuip_2 = atomJ.labFrameDipole[0]*atomI.inducedDipoleP[1] - atomJ.labFrameDipole[1]*atomI.inducedDipoleP[0];
-    float uixqkrp_0 = atomI.inducedDipoleP[1]*qJr_2 - atomI.inducedDipoleP[2]*qJr_1;
-    float uixqkrp_1 = atomI.inducedDipoleP[2]*qJr_0 - atomI.inducedDipoleP[0]*qJr_2;
-    float uixqkrp_2 = atomI.inducedDipoleP[0]*qJr_1 - atomI.inducedDipoleP[1]*qJr_0;
-    float uixqkr_0 = atomI.inducedDipole[1]*qJr_2 - atomI.inducedDipole[2]*qJr_1;
-    float uixqkr_1 = atomI.inducedDipole[2]*qJr_0 - atomI.inducedDipole[0]*qJr_2;
-    float uixqkr_2 = atomI.inducedDipole[0]*qJr_1 - atomI.inducedDipole[1]*qJr_0;
-    float rxqkuip_0 = yr*qJuIp_2 - zr*qJuIp_1;
-    float rxqkuip_1 = zr*qJuIp_0 - xr*qJuIp_2;
-    float rxqkuip_2 = xr*qJuIp_1 - yr*qJuIp_0;
-    float rxqkui_0 = yr*qJuI_2 - zr*qJuI_1;
-    float rxqkui_1 = zr*qJuI_0 - xr*qJuI_2;
-    float rxqkui_2 = xr*qJuI_1 - yr*qJuI_0;
-    float ttm3_0   =  rr3*dixdk_0 + gf3*dkxr_0 - gf6*rxqkr_0 - 2.0f*rr5*(dixqkr_0 + dkxqir_0 + rxqkdi_0 - 2.0f*qixqk_0) - 4.0f*rr7*(rxqkir_0 - qkrxqir_0);
-    float ttm3_1   =  rr3*dixdk_1 + gf3*dkxr_1 - gf6*rxqkr_1 - 2.0f*rr5*(dixqkr_1 + dkxqir_1 + rxqkdi_1 - 2.0f*qixqk_1) - 4.0f*rr7*(rxqkir_1 - qkrxqir_1);
-    float ttm3_2   =  rr3*dixdk_2 + gf3*dkxr_2 - gf6*rxqkr_2 - 2.0f*rr5*(dixqkr_2 + dkxqir_2 + rxqkdi_2 - 2.0f*qixqk_2) - 4.0f*rr7*(rxqkir_2 - qkrxqir_2);
-    float ttm3i_0  = -(dkxui_0*psc3+ dkxuip_0*dsc3)*0.5f + gti3*dkxr_0 - ((uixqkr_0 + rxqkui_0)*psc5 + (uixqkrp_0 + rxqkuip_0)*dsc5) - gti6*rxqkr_0;
-    float ttm3i_1  = -(dkxui_1*psc3+ dkxuip_1*dsc3)*0.5f + gti3*dkxr_1 - ((uixqkr_1 + rxqkui_1)*psc5 + (uixqkrp_1 + rxqkuip_1)*dsc5) - gti6*rxqkr_1;
-    float ttm3i_2  = -(dkxui_2*psc3+ dkxuip_2*dsc3)*0.5f + gti3*dkxr_2 - ((uixqkr_2 + rxqkui_2)*psc5 + (uixqkrp_2 + rxqkuip_2)*dsc5) - gti6*rxqkr_2;
-#endif
-    if( scalingFactors[MScaleIndex] < 1.0f ){
-#ifdef T1
-        ttm2_0 *= scalingFactors[MScaleIndex];
-        ttm2_1 *= scalingFactors[MScaleIndex];
-        ttm2_2 *= scalingFactors[MScaleIndex];
-#endif
-#ifdef T3
-        ttm3_0 *= scalingFactors[MScaleIndex];
-        ttm3_1 *= scalingFactors[MScaleIndex];
-        ttm3_2 *= scalingFactors[MScaleIndex];
-#endif
-    }
-#ifdef F1
-    outputForce[0]       = -(ftm2_0+ftm2i_0);
-    outputForce[1]       = -(ftm2_1+ftm2i_1);
-    outputForce[2]       = -(ftm2_2+ftm2i_2);
-#endif
-#ifdef T1
-    outputForce[0]       =  (ttm2_0 + ttm2i_0);
-    outputForce[1]       =  (ttm2_1 + ttm2i_1);
-    outputForce[2]       =  (ttm2_2 + ttm2i_2);
-#endif
-#ifdef T3
-    outputForce[0]       =  (ttm3_0 + ttm3i_0);
-    outputForce[1]       =  (ttm3_1 + ttm3i_1);
-    outputForce[2]       =  (ttm3_2 + ttm3i_2);
-#endif
-    return;
-}
--- a/plugins/amoeba/platforms/cuda-old/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.cu
+++ b/plugins/amoeba/platforms/cuda-old/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-#include "amoebaCudaKernels.h"
-#include "kCalculateAmoebaCudaUtilities.h"
-// Constant-memory copies of the two simulation parameter blocks, private to this file (static).
-// They are written by SetCalculateAmoebaCudaFixedEAndGKFieldsSim() through cudaMemcpyToSymbol
-// and read back by GetCalculateAmoebaCudaFixedEAndGKFieldSim() through cudaMemcpyFromSymbol.
-static __constant__ cudaGmxSimulation cSim;
-static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;
-// Upload the host-side simulation parameter blocks into this file's constant-memory
-// symbols (cSim and cAmoebaSim) so that the kernels defined here can read them.
-//
-// @param amoebaGpu  AMOEBA GPU context; provides amoebaSim directly and sim through gpuContext
-//
-// Each copy is checked with RTERROR. The message prefix names this function exactly
-// (plural "Fields") so that a reported failure can be traced to the right routine.
-void SetCalculateAmoebaCudaFixedEAndGKFieldsSim(amoebaGpuContext amoebaGpu)
-{
-    gpuContext gpu = amoebaGpu->gpuContext;
-
-    // base simulation parameters -> cSim
-    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
-    RTERROR(status, "SetCalculateAmoebaCudaFixedEAndGKFieldsSim: cudaMemcpyToSymbol: SetSim copy to cSim failed");
-
-    // AMOEBA-specific simulation parameters -> cAmoebaSim
-    status = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation));
-    RTERROR(status, "SetCalculateAmoebaCudaFixedEAndGKFieldsSim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed");
-}
-// Read this file's constant-memory symbols (cSim and cAmoebaSim) back into the
-// host-side parameter blocks held by the context.
-//
-// @param amoebaGpu  AMOEBA GPU context whose amoebaSim and gpuContext->sim are overwritten
-void GetCalculateAmoebaCudaFixedEAndGKFieldSim(amoebaGpuContext amoebaGpu)
-{
-    // cSim -> base simulation parameters
-    cudaError_t copyStatus = cudaMemcpyFromSymbol(&amoebaGpu->gpuContext->sim, cSim, sizeof(cudaGmxSimulation));
-    RTERROR(copyStatus, "GetCalculateAmoebaCudaFixedEAndGKFieldSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed");
-
-    // cAmoebaSim -> AMOEBA-specific simulation parameters
-    copyStatus = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));
-    RTERROR(copyStatus, "GetCalculateAmoebaCudaFixedEAndGKFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
-}
-// Reduce the three per-buffer work arrays into the final field arrays:
-//   psWorkArray_3_1 -> psE_Field
-//   psWorkArray_3_2 -> psE_FieldPolar
-//   psWorkArray_3_3 -> psGk_Field
-// Each reduction is a separate kReduceFields_kernel launch over paddedNumberOfAtoms*3 values
-// and gpu->sim.outputBuffers buffers, followed by a LAUNCHERROR check.
-// NOTE(review): the meaning of the trailing 0 argument is defined by kReduceFields_kernel,
-// which is not visible here.
-static void kReduceEAndGkFields(amoebaGpuContext amoebaGpu )
-{
-    gpuContext gpu = amoebaGpu->gpuContext;
-    // work array 1 -> E field
-    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
-                               gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
-                               amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psE_Field->_pDevData, 0 );
-    LAUNCHERROR("kReduceEAndGK_Fields1");
-    // work array 2 -> polar E field
-    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
-                               gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
-                               amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData, 0 );
-    LAUNCHERROR("kReduceEAndGK_Fields2");
-    // work array 3 -> GK field
-    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
-                               gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
-                               amoebaGpu->psWorkArray_3_3->_pDevData, amoebaGpu->psGk_Field->_pDevData, 0 );
-    LAUNCHERROR("kReduceEAndGK_Fields3");
-}
-// The included file provides the FixedFieldParticle struct definition, its load/unload helpers,
-// and the kernel body for the fixed E-field.
-// GK is defined only for the duration of this include.
-// NOTE(review): presumably GK switches on the GK-specific parts of the header (Born radius,
-// gkField) -- confirm in kCalculateAmoebaCudaFixedFieldParticle.h.
-#define GK
-#include "kCalculateAmoebaCudaFixedFieldParticle.h"
-#undef GK
-// Generalized Kirkwood (GK) permanent reaction field for a single atom pair (I,J).
-//
-// The separation vector is J - I. The result for both atoms is produced in one call:
-//   outputField[0][0..2]  is built from atom J's charge, dipole and quadrupole
-//   outputField[1][0..2]  is built from atom I's charge, dipole and quadrupole
-//                         (the charge/quadrupole contributions enter with the opposite sign)
-//
-// @param atomCoordinatesI/J      x,y,z give the position; w is read as the monopole term (ci/ck)
-// @param labFrameDipoleI/J       three dipole components (x,y,z)
-// @param labFrameQuadrupoleI/J   nine-entry quadrupole; only entries 0,1,2,4,5,8 are read,
-//                                as xx,xy,xz,yy,yz,zz
-// @param rb2                     length-squared scale used in the exponential term and in the
-//                                GK denominator
-//                                NOTE(review): presumably the product of the two Born radii -- confirm at the call sites
-// @param outputField             receives the two field vectors described above
-//
-// gkc, fc, fd and fq are read from cAmoebaSim.
-// The work arrays keep 1-based style indexing: element 0 of gc, gux, guy, guz and the gq* arrays
-// is never written or read here.
-__device__ void calculateFixedGkFieldPairIxn_kernel( float4 atomCoordinatesI,       float4 atomCoordinatesJ,
-                                                     float* labFrameDipoleI,        float* labFrameDipoleJ,
-                                                     float* labFrameQuadrupoleI,    float* labFrameQuadrupoleJ,
-                                                     float  rb2,
-                                                     float outputField[2][3]
- ){
-    float xi,yi,zi;
-    float xr,yr,zr;
-    float xr2,yr2,zr2;
-    float ci,ck;
-    float uxi,uyi,uzi;
-    float uxk,uyk,uzk;
-    float qxxi,qxyi,qxzi;
-    float qyyi,qyzi,qzzi;
-    float qxxk,qxyk,qxzk;
-    float qyyk,qyzk,qzzk;
-    float r2;
-    float fc,fd,fq;
-    float expterm;
-    float gf,gf2,gf3,gf5;
-    float gf7;
-    float expc,dexpc;
-    float expc1,expcdexpc;
-    float a[4][4];
-    float gc[5];
-    float gux[11],guy[11],guz[11];
-    float gqxx[5],gqxy[5];
-    float gqxz[5],gqyy[5];
-    float gqyz[5],gqzz[5];
-    float gkc;
-    // GK constant and the charge/dipole/quadrupole dielectric factors
-    gkc          = cAmoebaSim.gkc;
-    fc           = cAmoebaSim.fc;
-    fd           = cAmoebaSim.fd;
-    fq           = cAmoebaSim.fq;
-    // atom I: position, monopole, dipole, unique quadrupole components
-    xi           = atomCoordinatesI.x;
-    yi           = atomCoordinatesI.y;
-    zi           = atomCoordinatesI.z;
-    ci           = atomCoordinatesI.w;
-    uxi          = labFrameDipoleI[0];
-    uyi          = labFrameDipoleI[1];
-    uzi          = labFrameDipoleI[2];
-    qxxi         = labFrameQuadrupoleI[0];
-    qxyi         = labFrameQuadrupoleI[1];
-    qxzi         = labFrameQuadrupoleI[2];
-    qyyi         = labFrameQuadrupoleI[4];
-    qyzi         = labFrameQuadrupoleI[5];
-    qzzi         = labFrameQuadrupoleI[8];
-    // separation vector J - I and its squared components
-    xr           = atomCoordinatesJ.x - xi;
-    yr           = atomCoordinatesJ.y - yi;
-    zr           = atomCoordinatesJ.z - zi;
-    ck           = atomCoordinatesJ.w;
-    xr2          = xr*xr;
-    yr2          = yr*yr;
-    zr2          = zr*zr;
-    r2           = xr2 + yr2 + zr2;
-    // atom J: dipole and unique quadrupole components
-    uxk          = labFrameDipoleJ[0];
-    uyk          = labFrameDipoleJ[1];
-    uzk          = labFrameDipoleJ[2];
-    qxxk         = labFrameQuadrupoleJ[0];
-    qxyk         = labFrameQuadrupoleJ[1];
-    qxzk         = labFrameQuadrupoleJ[2];
-    qyyk         = labFrameQuadrupoleJ[4];
-    qyzk         = labFrameQuadrupoleJ[5];
-    qzzk         = labFrameQuadrupoleJ[8];
-    // GK damping: gf = 1/sqrt(r2 + rb2*exp(-r2/(gkc*rb2))), followed by its odd powers
-    expterm      = expf(-r2/(gkc*rb2));
-    expc         = expterm / gkc;
-    dexpc        = -2.0f / (gkc*rb2);
-    gf2          = 1.0f / (r2+rb2*expterm);
-    gf           = sqrtf(gf2);
-    gf3          = gf2 * gf;
-    gf5          = gf3 * gf2;
-    gf7          = gf5 * gf2;
-    // reaction potential auxiliary terms
-    // (a[0][0] is stored but not read again in this function)
-    a[0][0]      = gf;
-    a[1][0]      = -gf3;
-    a[2][0]      = 3.0f * gf5;
-    a[3][0]      = -15.0f * gf7;
-    // reaction potential gradient auxiliary terms
-    expc1        = 1.0f - expc;
-    a[0][1]      = expc1 * a[1][0];
-    a[1][1]      = expc1 * a[2][0];
-    a[2][1]      = expc1 * a[3][0];
-    // dipole second reaction potential gradient auxiliary term
-    expcdexpc    = -expc * dexpc;
-    a[1][2]      = expc1*a[2][1] + expcdexpc*a[2][0];
-    // multiply the auxiliary terms by the dielectric functions
-    // (fc for the charge row, fd for the dipole row, fq for the quadrupole row);
-    // this must come after a[1][2] above, which uses the unscaled a[2][0] and a[2][1]
-    a[0][1]      = fc * a[0][1];
-    a[1][0]      = fd * a[1][0];
-    a[1][1]      = fd * a[1][1];
-    a[1][2]      = fd * a[1][2];
-    a[2][0]      = fq * a[2][0];
-    a[2][1]      = fq * a[2][1];
-    // unweighted dipole reaction potential tensor
-    gux[1]       = xr * a[1][0];
-    guy[1]       = yr * a[1][0];
-    guz[1]       = zr * a[1][0];
-    // unweighted reaction potential gradient tensor
-    // (indices 2,3,4 hold the x,y,z derivative components)
-    gc[2]        = xr * a[0][1];
-    gc[3]        = yr * a[0][1];
-    gc[4]        = zr * a[0][1];
-    gux[2]       = a[1][0] + xr2*a[1][1];
-    gux[3]       = xr * yr * a[1][1];
-    gux[4]       = xr * zr * a[1][1];
-    guy[2]       = gux[3];
-    guy[3]       = a[1][0] + yr2*a[1][1];
-    guy[4]       = yr * zr * a[1][1];
-    guz[2]       = gux[4];
-    guz[3]       = guy[4];
-    guz[4]       = a[1][0] + zr2*a[1][1];
-    gqxx[2]      = xr * (2.0f*a[2][0]+xr2*a[2][1]);
-    gqxx[3]      = yr * xr2*a[2][1];
-    gqxx[4]      = zr * xr2*a[2][1];
-    gqyy[2]      = xr * yr2*a[2][1];
-    gqyy[3]      = yr * (2.0f*a[2][0]+yr2*a[2][1]);
-    gqyy[4]      = zr * yr2 * a[2][1];
-    gqzz[2]      = xr * zr2 * a[2][1];
-    gqzz[3]      = yr * zr2 * a[2][1];
-    gqzz[4]      = zr * (2.0f*a[2][0]+zr2*a[2][1]);
-    gqxy[2]      = yr * (a[2][0]+xr2*a[2][1]);
-    gqxy[3]      = xr * (a[2][0]+yr2*a[2][1]);
-    gqxy[4]      = zr * xr * yr * a[2][1];
-    gqxz[2]      = zr * (a[2][0]+xr2*a[2][1]);
-    gqxz[3]      = gqxy[4];
-    gqxz[4]      = xr * (a[2][0]+zr2*a[2][1]);
-    gqyz[2]      = gqxy[4];
-    gqyz[3]      = zr * (a[2][0]+yr2*a[2][1]);
-    gqyz[4]      = yr * (a[2][0]+zr2*a[2][1]);
-    // unweighted dipole second reaction potential gradient tensor
-    // (indices 5..10 hold the xx,xy,xz,yy,yz,zz second-derivative components)
-    gux[5]       = xr * (3.0f*a[1][1]+xr2*a[1][2]);
-    gux[6]       = yr * (a[1][1]+xr2*a[1][2]);
-    gux[7]       = zr * (a[1][1]+xr2*a[1][2]);
-    gux[8]       = xr * (a[1][1]+yr2*a[1][2]);
-    gux[9]       = zr * xr * yr * a[1][2];
-    gux[10]      = xr * (a[1][1]+zr2*a[1][2]);
-    guy[5]       = yr * (a[1][1]+xr2*a[1][2]);
-    guy[6]       = xr * (a[1][1]+yr2*a[1][2]);
-    guy[7]       = gux[9];
-    guy[8]       = yr * (3.0f*a[1][1]+yr2*a[1][2]);
-    guy[9]       = zr * (a[1][1]+yr2*a[1][2]);
-    guy[10]      = yr * (a[1][1]+zr2*a[1][2]);
-    guz[5]       = zr * (a[1][1]+xr2*a[1][2]);
-    guz[6]       = gux[9];
-    guz[7]       = xr * (a[1][1]+zr2*a[1][2]);
-    guz[8]       = zr * (a[1][1]+yr2*a[1][2]);
-    guz[9]       = yr * (a[1][1]+zr2*a[1][2]);
-    guz[10]      = zr * (3.0f*a[1][1]+zr2*a[1][2]);
-    // generalized Kirkwood permanent reaction field
-    // outputField[0]: contribution built from atom J's multipoles (ck, u*k, q**k)
-    outputField[0][0] = uxk*gux[2] + uyk*gux[3] + uzk*gux[4]
-                                   + 0.5f * (ck*gux[1] + qxxk*gux[5]
-                                   + qyyk*gux[8] + qzzk*gux[10]
-                                   + 2.0f*(qxyk*gux[6]+qxzk*gux[7]
-                                   + qyzk*gux[9]))
-                                   + 0.5f * (ck*gc[2] + qxxk*gqxx[2]
-                                   + qyyk*gqyy[2] + qzzk*gqzz[2]
-                                   + 2.0f*(qxyk*gqxy[2]+qxzk*gqxz[2]
-                                   + qyzk*gqyz[2]));
-    outputField[0][1] = uxk*guy[2] + uyk*guy[3] + uzk*guy[4]
-                                   + 0.5f * (ck*guy[1] + qxxk*guy[5]
-                                   + qyyk*guy[8] + qzzk*guy[10]
-                                   + 2.0f*(qxyk*guy[6]+qxzk*guy[7]
-                                   + qyzk*guy[9]))
-                                   + 0.5f * (ck*gc[3] + qxxk*gqxx[3]
-                                   + qyyk*gqyy[3] + qzzk*gqzz[3]
-                                   + 2.0f*(qxyk*gqxy[3]+qxzk*gqxz[3]
-                                   + qyzk*gqyz[3]));
-    outputField[0][2] = uxk*guz[2] + uyk*guz[3] + uzk*guz[4]
-                                   + 0.5f * (ck*guz[1] + qxxk*guz[5]
-                                   + qyyk*guz[8] + qzzk*guz[10]
-                                   + 2.0f*(qxyk*guz[6]+qxzk*guz[7]
-                                   + qyzk*guz[9]))
-                                   + 0.5f * (ck*gc[4] + qxxk*gqxx[4]
-                                   + qyyk*gqyy[4] + qzzk*gqzz[4]
-                                   + 2.0f*(qxyk*gqxy[4]+qxzk*gqxz[4]
-                                   + qyzk*gqyz[4]));
-    // outputField[1]: contribution built from atom I's multipoles (ci, u*i, q**i);
-    // the two 0.5f groups are subtracted here rather than added
-    outputField[1][0] = uxi*gux[2] + uyi*gux[3] + uzi*gux[4]
-                                   - 0.5f * (ci*gux[1] + qxxi*gux[5]
-                                   + qyyi*gux[8] + qzzi*gux[10]
-                                   + 2.0f*(qxyi*gux[6]+qxzi*gux[7]
-                                   + qyzi*gux[9]))
-                                   - 0.5f * (ci*gc[2] + qxxi*gqxx[2]
-                                   + qyyi*gqyy[2] + qzzi*gqzz[2]
-                                   + 2.0f*(qxyi*gqxy[2]+qxzi*gqxz[2]
-                                   + qyzi*gqyz[2]));
-    outputField[1][1] = uxi*guy[2] + uyi*guy[3] + uzi*guy[4]
-                                   - 0.5f * (ci*guy[1] + qxxi*guy[5]
-                                   + qyyi*guy[8] + qzzi*guy[10]
-                                   + 2.0f*(qxyi*guy[6]+qxzi*guy[7]
-                                   + qyzi*guy[9]))
-                                   - 0.5f * (ci*gc[3]      + qxxi*gqxx[3]
-                                   + qyyi*gqyy[3] + qzzi*gqzz[3]
-                                   + 2.0f*(qxyi*gqxy[3]+qxzi*gqxz[3]
-                                   + qyzi*gqyz[3]));
-    outputField[1][2] = uxi*guz[2] + uyi*guz[3] + uzi*guz[4]
-                                   - 0.5f * (ci*guz[1] + qxxi*guz[5]
-                                   + qyyi*guz[8] + qzzi*guz[10]
-                                   + 2.0f*(qxyi*guz[6]+qxzi*guz[7]
-                                   + qyzi*guz[9]))
-                                   - 0.5f * (ci*gc[4] + qxxi*gqxx[4]
-                                   + qyyi*gqyy[4] + qzzi*gqzz[4]
-                                   + 2.0f*(qxyi*gqxy[4]+qxzi*gqxz[4]
-                                   + qyzi*gqyz[4]));
-}
-// Include versions of the kernels for N^2 calculations.
-// The same header is included twice. METHOD_NAME(a, b) pastes a tag between the prefix a and
-// the suffix b, so each inclusion can give its definitions a different name:
-//   first inclusion:  tag N2,       USE_OUTPUT_BUFFER_PER_WARP not defined by this file
-//   second inclusion: tag N2ByWarp, USE_OUTPUT_BUFFER_PER_WARP defined
-#define METHOD_NAME(a, b) a##N2##b
-#include "kCalculateAmoebaCudaFixedEAndGkFields.h"
-// defined only after the first inclusion so that it affects the second one alone
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##N2ByWarp##b
-#include "kCalculateAmoebaCudaFixedEAndGkFields.h"
-/**---------------------------------------------------------------------------------------
-   Compute the fixed electric field, the polar fixed electric field and the Gk field.
-   Sequence: kClearFields_3, then one N2 pair kernel (the ByWarp variant when
-   gpu->bOutputBufferPerWarp is set), then kReduceEAndGkFields.
-   The block size is determined on the first call and reused on every later call.
-   @param amoebaGpu        amoebaGpu context
-   --------------------------------------------------------------------------------------- */
-void cudaComputeAmoebaFixedEAndGkFields( amoebaGpuContext amoebaGpu )
-{
-    gpuContext gpu = amoebaGpu->gpuContext;
-
-    // threads/block: computed once, cached in a function-local static
-    static unsigned int threadsPerBlock = 0;
-    if( threadsPerBlock == 0 ){
-        // cap by compute capability: 256 (>= SM_20), 128 (>= SM_12), otherwise 64
-        unsigned int maxThreads = 64;
-        if (gpu->sm_version >= SM_20)
-            maxThreads = 256;
-        else if (gpu->sm_version >= SM_12)
-            maxThreads = 128;
-        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle), gpu->sharedMemoryPerBlock ), maxThreads);
-    }
-
-    kClearFields_3( amoebaGpu, 3 );
-
-    // dynamic shared memory: one FixedFieldParticle per thread in the block
-    const size_t sharedBytes = sizeof(FixedFieldParticle)*threadsPerBlock;
-
-    if (!gpu->bOutputBufferPerWarp){
-        kCalculateAmoebaFixedEAndGkFieldN2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sharedBytes>>>(
-            gpu->psWorkUnit->_pDevData,
-            amoebaGpu->psWorkArray_3_1->_pDevData,
-            amoebaGpu->psWorkArray_3_2->_pDevData,
-            amoebaGpu->psWorkArray_3_3->_pDevData );
-    } else {
-        kCalculateAmoebaFixedEAndGkFieldN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sharedBytes>>>(
-            gpu->psWorkUnit->_pDevData,
-            amoebaGpu->psWorkArray_3_1->_pDevData,
-            amoebaGpu->psWorkArray_3_2->_pDevData,
-            amoebaGpu->psWorkArray_3_3->_pDevData );
-    }
-    LAUNCHERROR("kCalculateAmoebaFixedEAndGkFieldN2_kernel");
-
-    // fold the per-buffer work arrays into the final field arrays
-    kReduceEAndGkFields( amoebaGpu );
-}
--- a/plugins/amoeba/platforms/cuda-old/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.h
+++ b/plugins/amoeba/platforms/cuda-old/src/kernels/kCalculateAmoebaCudaFixedEAndGkFields.h
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-#include "amoebaScaleFactors.h"
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(256, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(128, 1)
-#else
-__launch_bounds__(64, 1)
-#endif
-void METHOD_NAME(kCalculateAmoebaFixedEAndGkField, _kernel)(
-                            unsigned int* workUnit,
-                            float* outputEField,
-                            float* outputEFieldPolar,
-                            float* outputGkField){
-    extern __shared__ FixedFieldParticle sA[];
-    unsigned int totalWarps      = gridDim.x*blockDim.x/GRID;
-    unsigned int warp            = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
-    unsigned int numWorkUnits    = cSim.pInteractionCount[0];
-    unsigned int pos             = warp*numWorkUnits/totalWarps;
-    unsigned int end             = (warp+1)*numWorkUnits/totalWarps;
-    unsigned int lasty           = 0xFFFFFFFF;
-    float4* atomCoord            = cSim.pPosq;
-    float* labFrameDipole        = cAmoebaSim.pLabFrameDipole;
-    float* labFrameQuadrupole    = cAmoebaSim.pLabFrameQuadrupole;
-    float* bornRadii             = cSim.pBornRadii;
-    float4 jCoord;
-    float  jBornRadius;
-    float  jDipole[3];    
-    float  jQuadrupole[9];    
-    while (pos < end)
-    {
-        unsigned int x;
-        unsigned int y;
-        bool bExclusionFlag;
-        // Extract cell coordinates
-        decodeCell( workUnit[pos], &x, &y, &bExclusionFlag );
-        unsigned int tgx           = threadIdx.x & (GRID - 1);
-        unsigned int tbx           = threadIdx.x - tgx;
-        unsigned int tj            = tgx;
-        FixedFieldParticle* psA    = &sA[tbx];
-        unsigned int atomI         = x + tgx;
-        FixedFieldParticle localParticle;
-        loadFixedFieldShared( &localParticle, atomI, bornRadii ); 
-        float4 iCoord              = atomCoord[atomI];
-        float eFieldSum[3];
-        float eFieldPolarSum[3];
-        float gkFieldSum[3];
-        eFieldSum[0]               = 0.0f;
-        eFieldSum[1]               = 0.0f;
-        eFieldSum[2]               = 0.0f;
-        eFieldPolarSum[0]          = 0.0f;
-        eFieldPolarSum[1]          = 0.0f;
-        eFieldPolarSum[2]          = 0.0f;
-        gkFieldSum[0]              = 0.0f;
-        gkFieldSum[1]              = 0.0f;
-        gkFieldSum[2]              = 0.0f;
-        if (x == y) // Handle diagonals uniquely at 50% efficiency
-        {
-            // load coordinates, charge, ...
-            loadFixedFieldShared( &(sA[threadIdx.x]), atomI, bornRadii );
-            if (!bExclusionFlag)
-            {
-                // this branch is never exercised since it includes the
-                // interaction between atomI and itself which is always excluded
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    float ijField[2][3];
-                    // load coords, charge, ...
-                    loadFixedFieldParticleData( &(psA[j]), &jCoord, jDipole, jQuadrupole, &jBornRadius );
-                    calculateFixedEFieldPairIxn_kernel( localParticle, psA[j], ijField);
-                    unsigned int match      = (atomI == (y + j)) ? 1 : 0;
-                    // add to field at atomI the field due atomJ's charge/dipole/quadrupole
-                    eFieldSum[0]           += match ? 0.0f : ijField[0][0];
-                    eFieldSum[1]           += match ? 0.0f : ijField[0][1];
-                    eFieldSum[2]           += match ? 0.0f : ijField[0][2];
-                    eFieldPolarSum[0]      += match ? 0.0f : ijField[0][0];
-                    eFieldPolarSum[1]      += match ? 0.0f : ijField[0][1];
-                    eFieldPolarSum[2]      += match ? 0.0f : ijField[0][2];
-                    // GK field
-                    calculateFixedGkFieldPairIxn_kernel( iCoord,                             jCoord,
-                                                         &(labFrameDipole[atomI*3]),         jDipole,
-                                                         &(labFrameQuadrupole[atomI*9]),     jQuadrupole,
-                                                         bornRadii[atomI]*jBornRadius,       ijField);
-                    gkFieldSum[0]          += match ? 0.0f : ijField[0][0];
-                    gkFieldSum[1]          += match ? 0.0f : ijField[0][1];
-                    gkFieldSum[2]          += match ? 0.0f : ijField[0][2];
-                }
-            }
-            else  // bExclusion
-            {
-                unsigned int xi       = x >> GRIDBITS;
-                unsigned int cell     = xi + xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
-                int  dScaleMask       = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
-                int2 pScaleMask       = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    // load coords, charge, ...
-                    float ijField[2][3];
-                    loadFixedFieldParticleData( &(psA[j]), &jCoord, jDipole, jQuadrupole, &jBornRadius );
-                    calculateFixedEFieldPairIxn_kernel( localParticle, psA[j], ijField);
-                    float dScaleVal;
-                    float pScaleVal;
-                    getMaskedDScaleFactor( j, dScaleMask, &dScaleVal );
-                    getMaskedPScaleFactor( j, pScaleMask, &pScaleVal );
-                    // nan*0.0 = nan not 0.0, so explicitly exclude (atomI == atomJ) contribution
-                    // by setting match flag
-                    unsigned int match      = (atomI == (y + j)) ? 1 : 0;
-                    // add to field at atomI the field due atomJ's charge/dipole/quadrupole
-                    eFieldSum[0]           += match ? 0.0f : dScaleVal*ijField[0][0];
-                    eFieldSum[1]           += match ? 0.0f : dScaleVal*ijField[0][1];
-                    eFieldSum[2]           += match ? 0.0f : dScaleVal*ijField[0][2];
-                    eFieldPolarSum[0]      += match ? 0.0f : pScaleVal*ijField[0][0];
-                    eFieldPolarSum[1]      += match ? 0.0f : pScaleVal*ijField[0][1];
-                    eFieldPolarSum[2]      += match ? 0.0f : pScaleVal*ijField[0][2];
-                    // GK field
-                    calculateFixedGkFieldPairIxn_kernel( iCoord,                                 jCoord,
-                                                         &(labFrameDipole[atomI*3]),             jDipole,
-                                                         &(labFrameQuadrupole[atomI*9]),         jQuadrupole,
-                                                         bornRadii[atomI]*jBornRadius,           ijField);
-                    match                   = (atomI >= cSim.atoms) || ((y+tj) >= cSim.atoms) ? 1 : 0;
-                    gkFieldSum[0]          += match ? 0.0f : ijField[0][0];
-                    gkFieldSum[1]          += match ? 0.0f : ijField[0][1];
-                    gkFieldSum[2]          += match ? 0.0f : ijField[0][2];
-                }
-            }
-            // Write results
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset                 = 3*(x + tgx + warp*cSim.paddedNumberOfAtoms);
-            load3dArrayBufferPerWarp( offset, eFieldSum,       outputEField );
-            load3dArrayBufferPerWarp( offset, eFieldPolarSum,  outputEFieldPolar );
-            load3dArrayBufferPerWarp( offset, gkFieldSum,      outputGkField );
-#else
-            unsigned int offset                 = 3*(x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
-            load3dArray( offset, eFieldSum,       outputEField );
-            load3dArray( offset, eFieldPolarSum,  outputEFieldPolar );
-            load3dArray( offset, gkFieldSum,      outputGkField );
-#endif
-        }
-        else        // 100% utilization
-        {
-            // Read fixed atom data into registers and GRF
-            if (lasty != y)
-            {
-                // load coordinates, charge, ...
-                loadFixedFieldShared( &(sA[threadIdx.x]), (y+tgx),  bornRadii );
-            }
-            // zero shared fields
-            zeroFixedFieldParticleSharedField( &(sA[threadIdx.x]) );
-            if (!bExclusionFlag)
-            {
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    float ijField[2][3];
-                    // load coords, charge, ...
-                    loadFixedFieldParticleData( &(psA[tj]),  &jCoord, jDipole, jQuadrupole, &jBornRadius );
-                    calculateFixedEFieldPairIxn_kernel( localParticle, psA[tj], ijField);
-                    // add to field at atomI the field due atomJ's charge/dipole/quadrupole
-                    eFieldSum[0]       += ijField[0][0];
-                    eFieldSum[1]       += ijField[0][1];
-                    eFieldSum[2]       += ijField[0][2];
-                    eFieldPolarSum[0]  += ijField[0][0];
-                    eFieldPolarSum[1]  += ijField[0][1];
-                    eFieldPolarSum[2]  += ijField[0][2];
-                    // add to field at atomJ the field due atomI's charge/dipole/quadrupole
-                    psA[tj].eField[0]  += ijField[1][0];
-                    psA[tj].eField[1]  += ijField[1][1];
-                    psA[tj].eField[2]  += ijField[1][2];
-                    psA[tj].eFieldP[0] += ijField[1][0];
-                    psA[tj].eFieldP[1] += ijField[1][1];
-                    psA[tj].eFieldP[2] += ijField[1][2];
-                    // Gk field
-                    calculateFixedGkFieldPairIxn_kernel( iCoord,                                          jCoord,
-                                                         &(labFrameDipole[atomI*3]),                      jDipole,
-                                                         &(labFrameQuadrupole[atomI*9]),                  jQuadrupole,
-                                                         bornRadii[atomI]*jBornRadius,                    ijField);
-                    gkFieldSum[0]              += ijField[0][0];
-                    gkFieldSum[1]              += ijField[0][1];
-                    gkFieldSum[2]              += ijField[0][2];
-                    psA[tj].gkField[0]         += ijField[1][0];
-                    psA[tj].gkField[1]         += ijField[1][1];
-                    psA[tj].gkField[2]         += ijField[1][2];
-                    tj                  = (tj + 1) & (GRID - 1);
-                }
-            }
-            else  // bExclusion
-            {
-                // Read fixed atom data into registers and GRF
-                unsigned int xi   = x >> GRIDBITS;
-                unsigned int yi   = y >> GRIDBITS;
-                unsigned int cell = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
-                int  dScaleMask   = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
-                int2 pScaleMask   = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    // load coords, charge, ...
-                    float ijField[2][3];
-                    loadFixedFieldParticleData( &(psA[tj]),  &jCoord, jDipole, jQuadrupole, &jBornRadius );
-                    calculateFixedEFieldPairIxn_kernel( localParticle, psA[tj], ijField);
-                    float dScaleVal;
-                    float pScaleVal;
-                    getMaskedDScaleFactor( tj, dScaleMask, &dScaleVal );
-                    getMaskedPScaleFactor( tj, pScaleMask, &pScaleVal );
-                    // add to field at atomI the field due atomJ's charge/dipole/quadrupole
-                    eFieldSum[0]         += dScaleVal*ijField[0][0];
-                    eFieldSum[1]         += dScaleVal*ijField[0][1];
-                    eFieldSum[2]         += dScaleVal*ijField[0][2];
-                    eFieldPolarSum[0]    += pScaleVal*ijField[0][0];
-                    eFieldPolarSum[1]    += pScaleVal*ijField[0][1];
-                    eFieldPolarSum[2]    += pScaleVal*ijField[0][2];
-                    // add to field at atomJ the field due atomI's charge/dipole/quadrupole
-                    psA[tj].eField[0]    += dScaleVal*ijField[1][0];
-                    psA[tj].eField[1]    += dScaleVal*ijField[1][1];
-                    psA[tj].eField[2]    += dScaleVal*ijField[1][2];
-                    psA[tj].eFieldP[0]   += pScaleVal*ijField[1][0];
-                    psA[tj].eFieldP[1]   += pScaleVal*ijField[1][1];
-                    psA[tj].eFieldP[2]   += pScaleVal*ijField[1][2];
-                    // GK field
-                    calculateFixedGkFieldPairIxn_kernel( iCoord,                             jCoord,
-                                                         &(labFrameDipole[atomI*3]),         jDipole,
-                                                         &(labFrameQuadrupole[atomI*9]),     jQuadrupole,
-                                                         bornRadii[atomI]*jBornRadius,       ijField);
-                    if( (atomI < cSim.atoms) && ((y+tj) < cSim.atoms) ){
-                        gkFieldSum[0]        += ijField[0][0];
-                        gkFieldSum[1]        += ijField[0][1];
-                        gkFieldSum[2]        += ijField[0][2];
-                        psA[tj].gkField[0]   += ijField[1][0];
-                        psA[tj].gkField[1]   += ijField[1][1];
-                        psA[tj].gkField[2]   += ijField[1][2];
-                    }
-                    tj                  = (tj + 1) & (GRID - 1);
-                }
-            }
-            // Write results
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset                 = 3*(x + tgx + warp * cSim.paddedNumberOfAtoms);
-            load3dArrayBufferPerWarp( offset, eFieldSum,       outputEField );
-            load3dArrayBufferPerWarp( offset, eFieldPolarSum,  outputEFieldPolar );
-            load3dArrayBufferPerWarp( offset, gkFieldSum,      outputGkField );
-            offset                              = 3*(y + tgx + warp*cSim.paddedNumberOfAtoms);
-            load3dArrayBufferPerWarp( offset, sA[threadIdx.x].eField,  outputEField );
-            load3dArrayBufferPerWarp( offset, sA[threadIdx.x].eFieldP, outputEFieldPolar );
-            load3dArrayBufferPerWarp( offset, sA[threadIdx.x].gkField, outputGkField );
-#else
-            unsigned int offset                 = 3*(x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms);
-            load3dArray( offset, eFieldSum,       outputEField );
-            load3dArray( offset, eFieldPolarSum,  outputEFieldPolar );
-            load3dArray( offset, gkFieldSum,      outputGkField );
-            offset                              = 3*(y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
-            load3dArray( offset, sA[threadIdx.x].eField,  outputEField );
-            load3dArray( offset, sA[threadIdx.x].eFieldP, outputEFieldPolar );
-            load3dArray( offset, sA[threadIdx.x].gkField, outputGkField );
-#endif
-            lasty = y;
-        }
-        pos++;
-    }
-}
--- a/plugins/amoeba/platforms/cuda-old/src/kernels/kCalculateAmoebaCudaFixedEField.cu
+++ b/plugins/amoeba/platforms/cuda-old/src/kernels/kCalculateAmoebaCudaFixedEField.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-#include "amoebaCudaKernels.h"
-#include "kCalculateAmoebaCudaUtilities.h"
-static __constant__ cudaGmxSimulation cSim;  // constant-memory copy of gpu->sim; filled by SetCalculateAmoebaCudaFixedEFieldSim()
-static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;  // constant-memory copy of amoebaGpu->amoebaSim; filled by the same setter
-// Upload the host-side parameter structs (gpu->sim and amoebaGpu->amoebaSim)
-// into this file's constant-memory symbols cSim and cAmoebaSim.
-void SetCalculateAmoebaCudaFixedEFieldSim(amoebaGpuContext amoebaGpu)
-{
-    gpuContext gpu = amoebaGpu->gpuContext;
-    // base simulation parameters -> cSim
-    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cSim));
-    RTERROR(status, "SetCalculateAmoebaCudaFixedEFieldSim: cudaMemcpyToSymbol: SetSim copy to cSim failed");
-    // AMOEBA-specific parameters -> cAmoebaSim
-    status = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cAmoebaSim));
-    RTERROR(status, "SetCalculateAmoebaCudaFixedEFieldSim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed");
-}
-// Read this file's constant-memory symbols cSim and cAmoebaSim back into the
-// host-side structs gpu->sim and amoebaGpu->amoebaSim.
-void GetCalculateAmoebaCudaFixedEFieldSim(amoebaGpuContext amoebaGpu)
-{
-    gpuContext gpu = amoebaGpu->gpuContext;
-    // cSim -> base simulation parameters
-    cudaError_t status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cSim));
-    RTERROR(status, "GetCalculateAmoebaCudaFixedEFieldSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed");
-    // cAmoebaSim -> AMOEBA-specific parameters
-    status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cAmoebaSim));
-    RTERROR(status, "GetCalculateAmoebaCudaFixedEFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
-}
-// Host-side helper: run kReduceFields_kernel twice so that
-//   psWorkArray_3_1 is reduced into psE_Field
-//   psWorkArray_3_2 is reduced into psE_FieldPolar
-// Both launches use the same grid, block size, element count
-// (3 * paddedNumberOfAtoms) and number of output buffers.
-static void kReduceE_Fields_kernel(amoebaGpuContext amoebaGpu )
-{
-    gpuContext gpu = amoebaGpu->gpuContext;
-    // direct field
-    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
-        gpu->sim.paddedNumberOfAtoms*3,
-        gpu->sim.outputBuffers,
-        amoebaGpu->psWorkArray_3_1->_pDevData,
-        amoebaGpu->psE_Field->_pDevData,
-        0 );
-    LAUNCHERROR("kReduceE_Fields1");
-    // polarization field
-    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
-        gpu->sim.paddedNumberOfAtoms*3,
-        gpu->sim.outputBuffers,
-        amoebaGpu->psWorkArray_3_2->_pDevData,
-        amoebaGpu->psE_FieldPolar->_pDevData,
-        0 );
-    LAUNCHERROR("kReduceE_Fields2");
-}
-// Pull in the FixedFieldParticle struct, its load/zero helpers and the pair-interaction
-// body for the fixed E-field; GK is undefined first so the GK-only members and
-// parameters guarded by #ifdef GK in that header are left out.
-#undef GK
-#include "kCalculateAmoebaCudaFixedFieldParticle.h"
-// The kernel header is included twice, with METHOD_NAME pasting a different infix into
-// the kernel name each time: first the N2 variant, ...
-#define METHOD_NAME(a, b) a##N2##b
-#include "kCalculateAmoebaCudaFixedEField.h"
-#define USE_OUTPUT_BUFFER_PER_WARP  // ... then the N2ByWarp variant, which takes the per-warp output path in the header
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##N2ByWarp##b
-#include "kCalculateAmoebaCudaFixedEField.h"
-/**---------------------------------------------------------------------------------------
-   Compute the fixed electric field.
-   Steps: clear the field work arrays (kClearFields_3), launch the N^2 pair kernel that
-   fills psWorkArray_3_1 / psWorkArray_3_2, then reduce those arrays into
-   psE_Field / psE_FieldPolar (kReduceE_Fields_kernel).
-   The block size is worked out on the first call and cached in a function-local static.
-   @param amoebaGpu        amoebaGpu context (its gpuContext member supplies the OpenMM gpu context)
-   --------------------------------------------------------------------------------------- */
-void cudaComputeAmoebaFixedEField( amoebaGpuContext amoebaGpu )
-{
-    gpuContext gpu    = amoebaGpu->gpuContext;
-    kClearFields_3( amoebaGpu, 2 );
-    // threads per block: limited both by the shared memory one FixedFieldParticle per
-    // thread needs and by an architecture-dependent cap
-    static unsigned int threadsPerBlock = 0;
-    if( threadsPerBlock == 0 ){
-        unsigned int maxThreads = 64;
-        if (gpu->sm_version >= SM_20) {
-            maxThreads = 512;
-        } else if (gpu->sm_version >= SM_12) {
-            maxThreads = 128;
-        }
-        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle), gpu->sharedMemoryPerBlock ), maxThreads);
-    }
-    // pick the kernel variant that matches the output-buffer layout in use
-    if (!gpu->bOutputBufferPerWarp){
-        kCalculateAmoebaFixedE_FieldN2Forces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
-            gpu->psWorkUnit->_pDevData,
-            amoebaGpu->psWorkArray_3_1->_pDevData,
-            amoebaGpu->psWorkArray_3_2->_pDevData );
-    } else {
-        kCalculateAmoebaFixedE_FieldN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
-            gpu->psWorkUnit->_pDevData,
-            amoebaGpu->psWorkArray_3_1->_pDevData,
-            amoebaGpu->psWorkArray_3_2->_pDevData );
-    }
-    LAUNCHERROR("kCalculateAmoebaFixedE_FieldN2Forces_kernel");
-    kReduceE_Fields_kernel( amoebaGpu );
-}
--- a/plugins/amoeba/platforms/cuda-old/src/kernels/kCalculateAmoebaCudaFixedEField.h
+++ b/plugins/amoeba/platforms/cuda-old/src/kernels/kCalculateAmoebaCudaFixedEField.h
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-#include "amoebaScaleFactors.h"
-__global__ /* Fixed E-field kernel; METHOD_NAME() supplies the variant-specific name.
- * Work is organised in tiles of GRID x GRID atom pairs listed in workUnit; each group of
- * GRID consecutive threads ("warp" below) processes a contiguous slice of that list.
- * Dynamic shared memory must hold one FixedFieldParticle per thread (sA is indexed by
- * threadIdx.x).
- *   workUnit           encoded tile descriptors, unpacked by decodeCell()
- *   outputEField       destination of the field sums weighted by the d-scale factors
- *   outputEFieldPolar  destination of the field sums weighted by the p-scale factors
- * NOTE(review): shared-memory slots written by one lane are read by other lanes with no
- * explicit barrier in between; this presumably relies on the GRID lanes of a group
- * executing in lockstep -- confirm before running where that is not guaranteed. */
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
-#endif
-void METHOD_NAME(kCalculateAmoebaFixedE_Field, Forces_kernel)(
-                            unsigned int* workUnit,
-                            float* outputEField,
-                            float* outputEFieldPolar){
-    extern __shared__ FixedFieldParticle sA[];  // one slot per thread of the block
-    unsigned int totalWarps      = gridDim.x*blockDim.x/GRID;  // number of GRID-wide thread groups in the launch
-    unsigned int warp            = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;  // global index of this thread's group
-    unsigned int numWorkUnits    = cSim.pInteractionCount[0];
-    unsigned int pos             = warp*numWorkUnits/totalWarps;  // first work unit of this group's slice
-    unsigned int end             = (warp+1)*numWorkUnits/totalWarps;  // one past the last work unit of the slice
-    unsigned int lasty           = 0xFFFFFFFF;  // y of the block currently cached in shared memory; sentinel = none yet
-    while (pos < end)
-    {
-        unsigned int x;
-        unsigned int y;
-        bool bExclusionFlag;
-        // unpack the tile: x and y are the first atom indices of its two blocks,
-        // bExclusionFlag selects the scale-factor (exclusion) path below
-        decodeCell( workUnit[pos], &x, &y, &bExclusionFlag );
-        unsigned int tgx           = threadIdx.x & (GRID - 1);  // lane index within the group
-        unsigned int tbx           = threadIdx.x - tgx;  // index of the group's first thread within the block
-        unsigned int tj            = tgx;  // rotating index into psA used by the off-diagonal loops
-        FixedFieldParticle* psA    = &sA[tbx];  // this group's GRID shared-memory slots
-        unsigned int atomI         = x + tgx;  // atom handled by this lane
-        FixedFieldParticle localParticle;
-        loadFixedFieldShared( &localParticle, atomI );  // per-thread copy of atomI's data
-        float fieldSum[3];  // field accumulated at atomI, d-scaled on the exclusion path
-        float fieldPolarSum[3];  // field accumulated at atomI, p-scaled on the exclusion path
-        fieldSum[0]                = 0.0f;
-        fieldSum[1]                = 0.0f;
-        fieldSum[2]                = 0.0f;
-        fieldPolarSum[0]           = 0.0f;
-        fieldPolarSum[1]           = 0.0f;
-        fieldPolarSum[2]           = 0.0f;
-        if (x == y)  // diagonal tile: both blocks are the same atoms, only the atomI side is accumulated
-        {
-            // publish this lane's atom in shared memory so the other lanes can read it as atomJ
-            loadFixedFieldShared( &(sA[threadIdx.x]), atomI );
-            if (!bExclusionFlag)
-            {
-                // this branch is never exercised since it includes the
-                // interaction between atomI and itself which is always excluded
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    float ijField[2][3];  // [0]: field at atomI due to atomJ, [1]: field at atomJ due to atomI
-                    calculateFixedEFieldPairIxn_kernel( localParticle, psA[j], ijField);
-                    unsigned int match      = (atomI == (y + j)) ? 1 : 0;  // 1 for the self pair, which is skipped
-                    // add to field at atomI the field due atomJ's charge/dipole/quadrupole
-                    fieldSum[0]            += match ? 0.0f : ijField[0][0];
-                    fieldSum[1]            += match ? 0.0f : ijField[0][1];
-                    fieldSum[2]            += match ? 0.0f : ijField[0][2];
-                    fieldPolarSum[0]       += match ? 0.0f : ijField[0][0];
-                    fieldPolarSum[1]       += match ? 0.0f : ijField[0][1];
-                    fieldPolarSum[2]       += match ? 0.0f : ijField[0][2];
-                }
-            }
-            else  // bExclusion
-            {
-                // locate this lane's scale-factor masks for the diagonal tile
-                // NOTE(review): the cell formula looks like an index into a packed triangular tile list -- confirm
-                unsigned int xi       = x >> GRIDBITS;
-                unsigned int cell     = xi + xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
-                int  dScaleMask       = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
-                int2 pScaleMask       = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    // pair field buffer: [0] = field at atomI due to atomJ, [1] = field at atomJ due to atomI
-                    float ijField[2][3];
-                    //loadFixedFieldParticleData( &(psA[j]), &jCoord, jDipole, jQuadrupole );
-                    calculateFixedEFieldPairIxn_kernel( localParticle, psA[j], ijField);
-                    float dScaleVal;
-                    float pScaleVal;
-                    getMaskedDScaleFactor( j, dScaleMask, &dScaleVal );  // d-scale factor for partner j
-                    getMaskedPScaleFactor( j, pScaleMask, &pScaleVal );  // p-scale factor for partner j
-                    // nan*0.0 = nan not 0.0, so explicitly exclude (atomI == atomJ) contribution
-                    // by setting match flag
-                    unsigned int match      = (atomI == (y + j)) ? 1 : 0;
-                    // add to field at atomI the field due atomJ's charge/dipole/quadrupole
-                    fieldSum[0]            += match ? 0.0f : dScaleVal*ijField[0][0];
-                    fieldSum[1]            += match ? 0.0f : dScaleVal*ijField[0][1];
-                    fieldSum[2]            += match ? 0.0f : dScaleVal*ijField[0][2];
-                    fieldPolarSum[0]       += match ? 0.0f : pScaleVal*ijField[0][0];
-                    fieldPolarSum[1]       += match ? 0.0f : pScaleVal*ijField[0][1];
-                    fieldPolarSum[2]       += match ? 0.0f : pScaleVal*ijField[0][2];
-                }
-            }
-            // Write results: only the atomI sums exist for a diagonal tile
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset                 = 3*(x + tgx + warp*cSim.paddedNumberOfAtoms);  // buffer selected by the group index
-            load3dArrayBufferPerWarp( offset, fieldSum,       outputEField );
-            load3dArrayBufferPerWarp( offset, fieldPolarSum,  outputEFieldPolar );
-#else
-            unsigned int offset                 = 3*(x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);  // buffer selected by the tile's block index
-            load3dArray( offset, fieldSum,       outputEField );
-            load3dArray( offset, fieldPolarSum,  outputEFieldPolar );
-#endif
-        }
-        else        // 100% utilization
-        {
-            // Reload the y block into shared memory only when it differs from the cached one.
-            // NOTE(review): the x == y branch above also overwrites sA without updating lasty;
-            // skipping the reload presumably relies on the work-unit ordering -- confirm
-            if (lasty != y)
-            {
-                // load coordinates, charge, ...
-                loadFixedFieldShared( &(sA[threadIdx.x]), (y+tgx) );
-            }
-            // zero shared fields
-            zeroFixedFieldParticleSharedField( &(sA[threadIdx.x]) );
-            if (!bExclusionFlag)
-            {
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    float ijField[2][3];  // [0]: field at atomI due to atomJ, [1]: field at atomJ due to atomI
-                    calculateFixedEFieldPairIxn_kernel( localParticle, psA[tj], ijField);
-                    // add to field at atomI the field due atomJ's charge/dipole/quadrupole
-                    fieldSum[0]        += ijField[0][0];
-                    fieldSum[1]        += ijField[0][1];
-                    fieldSum[2]        += ijField[0][2];
-                    fieldPolarSum[0]   += ijField[0][0];
-                    fieldPolarSum[1]   += ijField[0][1];
-                    fieldPolarSum[2]   += ijField[0][2];
-                    // add to field at atomJ the field due atomI's charge/dipole/quadrupole
-                    psA[tj].eField[0]  += ijField[1][0];
-                    psA[tj].eField[1]  += ijField[1][1];
-                    psA[tj].eField[2]  += ijField[1][2];
-                    psA[tj].eFieldP[0] += ijField[1][0];
-                    psA[tj].eFieldP[1] += ijField[1][1];
-                    psA[tj].eFieldP[2] += ijField[1][2];
-                    tj                  = (tj + 1) & (GRID - 1);  // advance with wrap-around; tj started at this lane's own index
-                }
-            }
-            else  // bExclusion
-            {
-                // locate this lane's scale-factor masks for tile (xi, yi)
-                unsigned int xi   = x >> GRIDBITS;
-                unsigned int yi   = y >> GRIDBITS;
-                unsigned int cell = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
-                int  dScaleMask   = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
-                int2 pScaleMask   = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    // pair field buffer: [0] = field at atomI due to atomJ, [1] = field at atomJ due to atomI
-                    float ijField[2][3];
-                    calculateFixedEFieldPairIxn_kernel( localParticle, psA[tj], ijField);
-                    float dScaleVal;
-                    float pScaleVal;
-                    getMaskedDScaleFactor( tj, dScaleMask, &dScaleVal );
-                    getMaskedPScaleFactor( tj, pScaleMask, &pScaleVal );
-                    // add to field at atomI the field due atomJ's charge/dipole/quadrupole
-                    fieldSum[0]        += dScaleVal*ijField[0][0];
-                    fieldSum[1]        += dScaleVal*ijField[0][1];
-                    fieldSum[2]        += dScaleVal*ijField[0][2];
-                    fieldPolarSum[0]   += pScaleVal*ijField[0][0];
-                    fieldPolarSum[1]   += pScaleVal*ijField[0][1];
-                    fieldPolarSum[2]   += pScaleVal*ijField[0][2];
-                    // add to field at atomJ the field due atomI's charge/dipole/quadrupole
-                    psA[tj].eField[0]  += dScaleVal*ijField[1][0];
-                    psA[tj].eField[1]  += dScaleVal*ijField[1][1];
-                    psA[tj].eField[2]  += dScaleVal*ijField[1][2];
-                    psA[tj].eFieldP[0] += pScaleVal*ijField[1][0];
-                    psA[tj].eFieldP[1] += pScaleVal*ijField[1][1];
-                    psA[tj].eFieldP[2] += pScaleVal*ijField[1][2];
-                    tj                  = (tj + 1) & (GRID - 1);  // advance with wrap-around; tj started at this lane's own index
-                }
-            }
-            // Write results: the atomI sums from registers, then the atomJ sums held in this thread's shared slot
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset                 = 3*(x + tgx + warp*cSim.paddedNumberOfAtoms);
-            load3dArrayBufferPerWarp( offset, fieldSum,       outputEField );
-            load3dArrayBufferPerWarp( offset, fieldPolarSum,  outputEFieldPolar );
-            offset                              = 3*(y + tgx + warp*cSim.paddedNumberOfAtoms);
-            load3dArrayBufferPerWarp( offset, sA[threadIdx.x].eField,  outputEField );
-            load3dArrayBufferPerWarp( offset, sA[threadIdx.x].eFieldP, outputEFieldPolar );
-#else
-            unsigned int offset                 = 3*(x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms);
-            load3dArray( offset, fieldSum,       outputEField );
-            load3dArray( offset, fieldPolarSum,  outputEFieldPolar );
-            offset                              = 3*(y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
-            load3dArray( offset, sA[threadIdx.x].eField,  outputEField );
-            load3dArray( offset, sA[threadIdx.x].eFieldP, outputEFieldPolar );
-#endif
-            lasty = y;  // remember which y block now sits in shared memory
-        }
-        pos++;
-    }
-}
--- a/plugins/amoeba/platforms/cuda-old/src/kernels/kCalculateAmoebaCudaFixedFieldParticle.h
+++ b/plugins/amoeba/platforms/cuda-old/src/kernels/kCalculateAmoebaCudaFixedFieldParticle.h
-struct FixedFieldParticle {  // per-atom record for the fixed-field kernels: multipole inputs plus field accumulators
-    // Cartesian coordinates and charge (loadFixedFieldShared fills them from cSim.pPosq)
-    float x;
-    float y;
-    float z;
-    float q;
-    // lab frame dipole
-    float labFrameDipole_X;
-    float labFrameDipole_Y;
-    float labFrameDipole_Z;
-    // lab frame quadrupole: only six components are stored; loadFixedFieldParticleData
-    // mirrors XY, XZ and YZ when expanding to a 9-element array
-    float labFrameQuadrupole_XX;
-    float labFrameQuadrupole_XY;
-    float labFrameQuadrupole_XZ;
-    float labFrameQuadrupole_YY;
-    float labFrameQuadrupole_YZ;
-    float labFrameQuadrupole_ZZ;
-    // damping inputs read from cAmoebaSim.pDampingFactorAndThole (.x -> damp, .y -> thole)
-    float thole;
-    float damp;
-    // field accumulators; cleared by zeroFixedFieldParticleSharedField, not by the load helper
-    float eField[3];
-    float eFieldP[3];
-#ifdef GK
-    // Born radius
-    float bornR;
-    // GK field
-    float gkField[3];
-#endif
-#ifdef INCLUDE_FIXED_FIELD_BUFFERS
-    float tempBuffer[3];  // NOTE(review): not referenced by the helpers in this header; purpose to be confirmed
-    float tempBufferP[3];
-#endif
-};
-// Fill the input members of *sA for atom atomI from the simulation arrays:
-// position/charge (cSim.pPosq), lab-frame dipole and quadrupole, and the
-// damping factor / thole pair. With GK defined, the Born radius is also
-// copied from bornR[atomI]. The field accumulators (eField, eFieldP, ...)
-// are left untouched.
-//   sA     destination record
-//   atomI  index of the atom to load
-//   bornR  (GK only) array of Born radii indexed by atom
-__device__ static void loadFixedFieldShared( struct FixedFieldParticle* sA, unsigned int atomI 
-#ifdef GK
-    , float* bornR
-#endif
-)
-{
-    // coordinates & charge: one float4 read supplies all four values, so
-    // pPosq[atomI] is not fetched again for y, z and w
-    const float4 posq            = cSim.pPosq[atomI];
-    sA->x                        = posq.x;
-    sA->y                        = posq.y;
-    sA->z                        = posq.z;
-    sA->q                        = posq.w;
-    // lab dipole
-    sA->labFrameDipole_X         = cAmoebaSim.pLabFrameDipole[atomI*3];
-    sA->labFrameDipole_Y         = cAmoebaSim.pLabFrameDipole[atomI*3+1];
-    sA->labFrameDipole_Z         = cAmoebaSim.pLabFrameDipole[atomI*3+2];
-    // lab quadrupole: the source holds 9 values per atom; only entries
-    // 0, 1, 2, 4, 5 and 8 (XX, XY, XZ, YY, YZ, ZZ) are kept
-    sA->labFrameQuadrupole_XX    = cAmoebaSim.pLabFrameQuadrupole[atomI*9];
-    sA->labFrameQuadrupole_XY    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+1];
-    sA->labFrameQuadrupole_XZ    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+2];
-    sA->labFrameQuadrupole_YY    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+4];
-    sA->labFrameQuadrupole_YZ    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+5];
-    sA->labFrameQuadrupole_ZZ    = cAmoebaSim.pLabFrameQuadrupole[atomI*9+8];
-    // damping factor (.x) and thole (.y) arrive packed in one float2
-    const float2 dampingFactorAndThole = cAmoebaSim.pDampingFactorAndThole[atomI];
-    sA->damp                     = dampingFactorAndThole.x;
-    sA->thole                    = dampingFactorAndThole.y;
-#ifdef GK
-    sA->bornR                    = bornR[atomI];
-#endif
-}
-// Unpack a FixedFieldParticle into the separate argument form:
-//   jCoord      receives x, y, z and the charge (in .w)
-//   jDipole     receives the 3 lab-frame dipole components
-//   jQuadrupole receives the quadrupole as 9 values in row order, with the
-//               stored XY, XZ and YZ components mirrored across the diagonal
-//   bornR       (GK only) receives the Born radius
-__device__ static void loadFixedFieldParticleData( struct FixedFieldParticle* sA, 
-                                                   float4* jCoord, float* jDipole, float* jQuadrupole
-#ifdef GK
-, float* bornR
-#endif
-)
-{
-    // position and charge
-    jCoord->x = sA->x;
-    jCoord->y = sA->y;
-    jCoord->z = sA->z;
-    jCoord->w = sA->q;
-    // dipole
-    jDipole[0] = sA->labFrameDipole_X;
-    jDipole[1] = sA->labFrameDipole_Y;
-    jDipole[2] = sA->labFrameDipole_Z;
-    // quadrupole, row by row
-    jQuadrupole[0] = sA->labFrameQuadrupole_XX;  jQuadrupole[1] = sA->labFrameQuadrupole_XY;  jQuadrupole[2] = sA->labFrameQuadrupole_XZ;
-    jQuadrupole[3] = sA->labFrameQuadrupole_XY;  jQuadrupole[4] = sA->labFrameQuadrupole_YY;  jQuadrupole[5] = sA->labFrameQuadrupole_YZ;
-    jQuadrupole[6] = sA->labFrameQuadrupole_XZ;  jQuadrupole[7] = sA->labFrameQuadrupole_YZ;  jQuadrupole[8] = sA->labFrameQuadrupole_ZZ;
-#ifdef GK
-    *bornR = sA->bornR;
-#endif
-}
-// Reset every field accumulator of *sA to zero: eField and eFieldP always,
-// gkField as well when GK is defined.
-__device__ static void zeroFixedFieldParticleSharedField( struct FixedFieldParticle* sA )
-{
-    for (int axis = 0; axis < 3; axis++)
-    {
-        sA->eField[axis]   = 0.0f;
-        sA->eFieldP[axis]  = 0.0f;
-#ifdef GK
-        sA->gkField[axis]  = 0.0f;
-#endif
-    }
-}
-// Pair interaction for the fixed E-field.
-//   field[0]  field at atomI due to atomJ's charge, dipole and quadrupole
-//   field[1]  field at atomJ due to atomI's charge, dipole and quadrupole
-// The inverse-distance factors are damped when the product of the two damp
-// values is nonzero and the separation is below cAmoebaSim.scalingDistanceCutoff.
-// No special handling exists for zero separation; callers skip the self pair.
-__device__ static void calculateFixedEFieldPairIxn_kernel( FixedFieldParticle& atomI, FixedFieldParticle& atomJ,
-                                                           float field[2][3])
-{
-    // separation vector from atomI to atomJ and its length
-    const float dx      = atomJ.x - atomI.x;
-    const float dy      = atomJ.y - atomI.y;
-    const float dz      = atomJ.z - atomI.z;
-    const float dist    = SQRT( dx*dx + dy*dy + dz*dz );
-    // undamped radial factors: 1/r^3, 3/r^5, 15/r^7
-    const float invR    = 1.0f/dist;
-    const float invR2   = invR*invR;
-    float rr3           = invR*invR2;
-    float rr5           = 3.0f*rr3*invR2;
-    float rr7           = 5.0f*rr5*invR2;
-    // damping: uses the smaller of the two thole values; outside the damped
-    // regime dampExp stays 0 and the three factors below are left unchanged
-    float damp          = atomI.damp*atomJ.damp;
-    float dampExp       = 0.0f;
-    if( damp != 0.0f && dist < cAmoebaSim.scalingDistanceCutoff ){
-        const float ratio   = dist/damp;
-        const float pGamma  = atomJ.thole > atomI.thole ? atomI.thole : atomJ.thole; 
-        damp                = ratio*ratio*ratio*pGamma;
-        dampExp             = EXP( -damp );
-    }
-    rr3                *= 1.0f - dampExp;
-    rr5                *= 1.0f - ( 1.0f + damp )*dampExp;
-    rr7                *= 1.0f - ( 1.0f + damp + (0.6f*damp*damp))*dampExp;
-    const float rr5_2   = rr5*2.0f;
-    // ---- field at atomI from atomJ's multipoles ----
-    float qx            = dx*atomJ.labFrameQuadrupole_XX + dy*atomJ.labFrameQuadrupole_XY + dz*atomJ.labFrameQuadrupole_XZ;
-    float qy            = dx*atomJ.labFrameQuadrupole_XY + dy*atomJ.labFrameQuadrupole_YY + dz*atomJ.labFrameQuadrupole_YZ;
-    float qz            = dx*atomJ.labFrameQuadrupole_XZ + dy*atomJ.labFrameQuadrupole_YZ + dz*atomJ.labFrameQuadrupole_ZZ;
-    float dipoleDotR    = dx*atomJ.labFrameDipole_X      + dy*atomJ.labFrameDipole_Y      + dz*atomJ.labFrameDipole_Z;
-    float quadDotR      = dx*qx                          + dy*qy                          + dz*qz;
-    float radial        = -rr3*atomJ.q + rr5*dipoleDotR - rr7*quadDotR;
-    field[0][0]         = dx*radial - rr3*atomJ.labFrameDipole_X + rr5_2*qx;
-    field[0][1]         = dy*radial - rr3*atomJ.labFrameDipole_Y + rr5_2*qy;
-    field[0][2]         = dz*radial - rr3*atomJ.labFrameDipole_Z + rr5_2*qz;
-    // ---- field at atomJ from atomI's multipoles (charge and quadrupole terms change sign) ----
-    qx                  = dx*atomI.labFrameQuadrupole_XX + dy*atomI.labFrameQuadrupole_XY + dz*atomI.labFrameQuadrupole_XZ;
-    qy                  = dx*atomI.labFrameQuadrupole_XY + dy*atomI.labFrameQuadrupole_YY + dz*atomI.labFrameQuadrupole_YZ;
-    qz                  = dx*atomI.labFrameQuadrupole_XZ + dy*atomI.labFrameQuadrupole_YZ + dz*atomI.labFrameQuadrupole_ZZ;
-    dipoleDotR          = dx*atomI.labFrameDipole_X    + dy*atomI.labFrameDipole_Y    + dz*atomI.labFrameDipole_Z;
-    quadDotR            = dx*qx + dy*qy + dz*qz;
-    radial              = rr3*atomI.q + rr5*dipoleDotR + rr7*quadDotR;
-    field[1][0]         = dx*radial - rr3*atomI.labFrameDipole_X - rr5_2*qx;
-    field[1][1]         = dy*radial - rr3*atomI.labFrameDipole_Y - rr5_2*qy;
-    field[1][2]         = dz*radial - rr3*atomI.labFrameDipole_Z - rr5_2*qz;
-}
--- a/plugins/amoeba/platforms/cuda-old/src/kernels/kCalculateAmoebaCudaGrycuk.cu
+++ b/plugins/amoeba/platforms/cuda-old/src/kernels/kCalculateAmoebaCudaGrycuk.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-#include "cudaKernels.h"
-#include "amoebaCudaKernels.h"
-static __constant__ cudaGmxSimulation cSim; // constant-memory copy of gpu->sim; written by SetCalculateAmoebaGrycukSim(), read back by GetCalculateAmoebaGrycukSim()
-static __constant__ cudaAmoebaGmxSimulation cAmoebaSim; // constant-memory copy of amoebaGpu->amoebaSim; written and read back by the same two functions
-/**
- * Copy the host-side simulation structs into this file's __constant__ symbols:
- * amoebaGpu->gpuContext->sim goes to cSim and amoebaGpu->amoebaSim goes to cAmoebaSim.
- * The status of each copy is handed to RTERROR together with a descriptive message.
- *
- * @param amoebaGpu  AMOEBA GPU context that owns both host-side structs
- */
-void SetCalculateAmoebaGrycukSim(amoebaGpuContext amoebaGpu)
-{
-    // base simulation struct -> cSim
-    cudaError_t status = cudaMemcpyToSymbol(cSim, &amoebaGpu->gpuContext->sim, sizeof(cudaGmxSimulation));
-    RTERROR(status, "SetCalculateAmoebaGrycukSim: cudaMemcpyToSymbol: SetSim copy to cSim failed");
-
-    // AMOEBA simulation struct -> cAmoebaSim
-    status = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation));
-    RTERROR(status, "SetCalculateAmoebaGrycukSim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed");
-}
-/**
- * Copy this file's __constant__ symbols back into the host-side structs:
- * cSim goes to amoebaGpu->gpuContext->sim and cAmoebaSim goes to amoebaGpu->amoebaSim.
- * The status of each copy is handed to RTERROR together with a descriptive message.
- *
- * @param amoebaGpu  AMOEBA GPU context whose host-side structs are overwritten
- */
-void GetCalculateAmoebaGrycukSim(amoebaGpuContext amoebaGpu)
-{
-    // cSim -> base simulation struct
-    cudaError_t status = cudaMemcpyFromSymbol(&amoebaGpu->gpuContext->sim, cSim, sizeof(cudaGmxSimulation));
-    RTERROR(status, "GetCalculateAmoebaGrycukSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed");
-
-    // cAmoebaSim -> AMOEBA simulation struct
-    status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));
-    RTERROR(status, "GetCalculateAmoebaGrycukSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
-}
-// Per-atom record used by the Grycuk Born radii kernels; the launcher reserves
-// sizeof(GrycukParticle) bytes of dynamic shared memory per thread.
-struct GrycukParticle {
-    float x, y, z;        // position, read from cSim.pPosq
-    float radius;         // read from cSim.pObcData[].x (TINKER rsolv)
-    float scaledRadius;   // read from cSim.pObcData[].y (TINKER rsolv*shct)
-    float bornSum;        // running Born sum; reset by zeroGrycukParticleSharedField()
-};
-/**
- * Fill the position and radius fields of *sA for atom atomI from the cSim arrays.
- * The bornSum field is left untouched.
- *
- * @param sA     particle record to fill
- * @param atomI  index into cSim.pPosq and cSim.pObcData
- */
-__device__ void loadGrycukShared( struct GrycukParticle* sA, unsigned int atomI )
-{
-    GrycukParticle& particle = *sA;
-
-    // coordinates
-    particle.x            = cSim.pPosq[atomI].x;
-    particle.y            = cSim.pPosq[atomI].y;
-    particle.z            = cSim.pPosq[atomI].z;
-
-    // radius and scaled radius
-    particle.radius       = cSim.pObcData[atomI].x;
-    particle.scaledRadius = cSim.pObcData[atomI].y;
-}
-/**
- * Pairwise Grycuk term: the amount atom J contributes to the Born sum of atom I.
- *
- *   radius:       radius (TINKER rsolv)
- *   scaledRadius: radius*overlap scale factor (TINKER rsolv*shct)
- *
- * @param atomI    atom receiving the term (x, y, z and radius are read)
- * @param atomJ    contributing atom (x, y, z and scaledRadius are read)
- * @param bornSum  output; overwritten (not accumulated into) with this pair's term,
- *                 or set to 0 when atomI.radius is not positive
- */
-__device__ void calculateGrycukBornRadiiPairIxn_kernel( GrycukParticle& atomI, GrycukParticle& atomJ, float*  bornSum ){
-    *bornSum = 0.0f;
-
-    // an atom with a non-positive radius receives no contribution
-    if( atomI.radius <= 0.0f ){
-        return;
-    }
-
-    const float dx  = atomJ.x - atomI.x;
-    const float dy  = atomJ.y - atomI.y;
-    const float dz  = atomJ.z - atomI.z;
-    const float r2  = dx*dx + dy*dy + dz*dz;
-    const float r   = sqrtf(r2);
-    const float sk  = atomJ.scaledRadius;
-    const float sk2 = sk*sk;
-    const float ri  = atomI.radius;
-
-    // integration limits: the upper one is always r + sk, the lower one depends on the overlap case
-    const float upper = r + sk;
-    float lower;
-    if( (ri + r) < sk ){
-        // atom I lies entirely inside atom J's scaled sphere: extra term, then lower limit sk - r
-        const float inner  = ri;
-        const float outer  = sk - r;
-        const float inner3 = inner*inner*inner;
-        const float outer3 = outer*outer*outer;
-        *bornSum -= (1.0f/outer3 - 1.0f/inner3);
-        lower     = sk - r;
-    } else if( r < (ri + sk) ){
-        lower = ri;
-    } else {
-        lower = r - sk;
-    }
-
-    const float l2  = lower*lower;
-    const float l4  = l2*l2;
-    const float lr  = lower*r;
-    const float l4r = l4*r;
-
-    const float u2  = upper*upper;
-    const float u4  = u2*u2;
-    const float ur  = upper*r;
-    const float u4r = u4*r;
-
-    const float term = (3.0f*(r2-sk2)+6.0f*u2-8.0f*ur)/u4r - (3.0f*(r2-sk2)+6.0f*l2-8.0f*lr)/l4r;
-    *bornSum        += term/16.0f;
-}
-/**
- * Reset the bornSum accumulator of *sA to zero; no other field is modified.
- *
- * @param sA  particle record whose accumulator is cleared
- */
-__device__ void zeroGrycukParticleSharedField( struct GrycukParticle* sA )
-{
-    (*sA).bornSum = 0.0f;
-}
-/**
- * Sum each atom's entries across the cSim.nonbondOutputBuffers Born sum buffers
- * (consecutive buffers are cSim.stride elements apart) and store the resulting
- * Born radius in cSim.pBornRadii.
- *
- * Every thread starts at its global index and advances by gridDim.x*blockDim.x
- * until cSim.atoms is reached. Compiled with __launch_bounds__(384, 1).
- */
-__global__ 
-__launch_bounds__(384, 1)
-void kReduceGrycukGbsaBornSum_kernel()
-{
-    const unsigned int step = gridDim.x * blockDim.x;
-    for (unsigned int atom = (blockIdx.x * blockDim.x + threadIdx.x); atom < cSim.atoms; atom += step)
-    {
-        // Gather this atom's partial sums from every output buffer
-        float total    = 0.0f;
-        float* pBuffer = cSim.pBornSum + atom;
-        for (int i = 0; i < cSim.nonbondOutputBuffers; i++)
-        {
-            total   += *pBuffer;
-            pBuffer += cSim.stride;
-        }
-
-        // Born radius = (1/radius^3 - total)^(-1/3); 1000.0f is used when the difference is not positive
-        float radius         = cSim.pObcData[atom].x;
-        radius               = 1.0f/(radius*radius*radius);
-        total                = radius - total;
-        cSim.pBornRadii[atom] = total <= 0.0f ? 1000.0f : powf( total, -1.0f/3.0f );
-    }
-}
-/**---------------------------------------------------------------------------------------
-   Launch kReduceGrycukGbsaBornSum_kernel (sim.blocks blocks of 384 threads) to reduce
-   the Born sum buffers into Born radii
-   @param amoebaGpu        amoebaGpu context
-   --------------------------------------------------------------------------------------- */
-void kReduceGrycukGbsaBornSum( amoebaGpuContext amoebaGpu )
-{
-    gpuContext gpu = amoebaGpu->gpuContext;
-
-    kReduceGrycukGbsaBornSum_kernel<<<gpu->sim.blocks, 384>>>();
-    LAUNCHERROR("kReduceGrycukGbsaBornSum");
-
-    // disabled debug aid: writes the Born radii to a "BornRGry" file, one file id per call
-    if( 0 ){
-        static int callId = 0;
-        std::vector<int> fileId;
-        fileId.push_back( callId++ );
-
-        VectorOfDoubleVectors outputVector;
-        cudaLoadCudaFloatArray( gpu->natoms,  1, gpu->psBornRadii, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-        cudaWriteVectorOfDoubleVectorsToFile( "BornRGry", fileId, outputVector );
-    }
-}
-// Include the Born radii kernel header twice so that two differently named variants of the N^2 kernel are available.
-#undef USE_OUTPUT_BUFFER_PER_WARP
-#define METHOD_NAME(a, b) a##N2##b // pastes "N2" between the two name fragments
-#include "kCalculateAmoebaCudaGrycukBornRadii.h" // first pass, USE_OUTPUT_BUFFER_PER_WARP undefined; presumably defines kCalculateAmoebaGrycukBornRadiiN2_kernel (header not visible here)
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##N2ByWarp##b // pastes "N2ByWarp" between the two name fragments
-#include "kCalculateAmoebaCudaGrycukBornRadii.h" // second pass, USE_OUTPUT_BUFFER_PER_WARP defined; presumably defines kCalculateAmoebaGrycukBornRadiiN2ByWarp_kernel
-/**---------------------------------------------------------------------------------------
-   Compute Born radii using Grycuk algorithm
-
-   Launches the N2ByWarp kernel variant when gpu->bOutputBufferPerWarp is set and the N2
-   variant otherwise, using sim.nonbond_blocks blocks and sizeof(GrycukParticle) bytes of
-   dynamic shared memory per thread.
-
-   @param amoebaGpu        amoebaGpu context
-   --------------------------------------------------------------------------------------- */
-void kCalculateAmoebaGrycukBornRadii( amoebaGpuContext amoebaGpu )
-{
-    gpuContext gpu = amoebaGpu->gpuContext;
-
-    // The block size is chosen on the first call and reused by every later call:
-    // the smaller of what getThreadsPerBlock() reports for one GrycukParticle per thread
-    // and a cap that depends on the device's sm_version.
-    static unsigned int threadsPerBlock = 0;
-    if( threadsPerBlock == 0 ){
-        unsigned int maxThreads;
-        if (gpu->sm_version >= SM_20) {
-            maxThreads = 512;
-        } else if (gpu->sm_version >= SM_12) {
-            maxThreads = 128;
-        } else {
-            maxThreads = 64;
-        }
-        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(GrycukParticle), gpu->sharedMemoryPerBlock ), maxThreads);
-    }
-
-    if (gpu->bOutputBufferPerWarp){
-        kCalculateAmoebaGrycukBornRadiiN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(GrycukParticle)*threadsPerBlock>>>( gpu->psWorkUnit->_pDevData);
-    } else {
-        kCalculateAmoebaGrycukBornRadiiN2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(GrycukParticle)*threadsPerBlock>>>( gpu->psWorkUnit->_pDevData);
-    }
-
-    // Label the launch check with this function's own name. The previous label,
-    // "kCalculateAmoebaCudaGrycukN2Forces", was shared with the chain rule force
-    // launcher, so the label alone could not tell the two launches apart.
-    LAUNCHERROR("kCalculateAmoebaGrycukBornRadii");
-}
-// Per-atom record for the Grycuk Born radius chain rule kernels; the launcher reserves
-// sizeof(GrycukChainRuleParticle) bytes of dynamic shared memory per thread.
-struct GrycukChainRuleParticle {
-    float x, y, z;        // position, read from cSim.pPosq
-    float radius;         // read from cSim.pObcData[].x
-    float scaledRadius;   // read from cSim.pObcData[].y
-    float bornRadius;     // read from cSim.pBornRadii
-    float bornForce;      // read from cSim.pBornForce
-    float force[3];       // force accumulator; reset by zeroGrycukChainRuleParticleSharedField()
-};
-/**
- * Fill every field of *sA except force[] for atom atomI from the cSim arrays.
- *
- * @param sA     particle record to fill
- * @param atomI  index into cSim.pPosq, cSim.pObcData, cSim.pBornRadii and cSim.pBornForce
- */
-__device__ void loadGrycukChainRuleParticleShared( struct GrycukChainRuleParticle* sA, unsigned int atomI )
-{
-    GrycukChainRuleParticle& particle = *sA;
-
-    // coordinates
-    particle.x            = cSim.pPosq[atomI].x;
-    particle.y            = cSim.pPosq[atomI].y;
-    particle.z            = cSim.pPosq[atomI].z;
-
-    // radius and scaled radius
-    particle.radius       = cSim.pObcData[atomI].x;
-    particle.scaledRadius = cSim.pObcData[atomI].y;
-
-    // Born radius and Born force
-    particle.bornRadius   = cSim.pBornRadii[atomI];
-    particle.bornForce    = cSim.pBornForce[atomI];
-}
-/**
- * Clear the three components of sA->force; no other field is modified.
- *
- * @param sA  particle record whose force accumulator is cleared
- */
-__device__ void zeroGrycukChainRuleParticleSharedField( struct GrycukChainRuleParticle* sA )
-{
-    for( int component = 0; component < 3; component++ ){
-        sA->force[component] = 0.0f;
-    }
-}
-/**
- * Pairwise Grycuk chain rule term: force vector arising from the dependence of
- * atom I's Born radius on the separation between atoms I and J.
- *
- * @param atomI  atom whose bornRadius, bornForce, radius and position are read
- * @param atomJ  atom whose scaledRadius and position are read
- * @param force  output; all three components are overwritten with
- *               (atomJ position - atomI position) multiplied by the scalar term
- */
-__device__ void calculateGrycukChainRulePairIxn_kernel( GrycukChainRuleParticle& atomI, GrycukChainRuleParticle& atomJ, float force[3] ){
-    const float pi     = 3.1415926535897f;
-    float third        = 1.0f/3.0f;
-    float pi43         = 4.0f*third*pi;
-
-    // prefactor that depends only on atom I's Born radius
-    float factor       = -powf(pi,third)*powf(6.0f,(2.0f*third))/9.0f;
-    float term         = pi43/(atomI.bornRadius*atomI.bornRadius*atomI.bornRadius);
-    term               = factor/powf( term, (4.0f*third) );
-
-    // pair geometry
-    float dx           = atomJ.x - atomI.x;
-    float dy           = atomJ.y - atomI.y;
-    float dz           = atomJ.z - atomI.z;
-    float sk           = atomJ.scaledRadius;
-    float sk2          = sk*sk;
-    float r2           = dx*dx + dy*dy + dz*dz;
-    float r            = sqrtf(r2);
-
-    // lower-limit contribution; the form depends on how the two spheres overlap
-    float de           = 0.0f;
-    float lower, lower4;
-    if( (atomI.radius + r) < sk ){
-        // atom I lies entirely inside atom J's scaled sphere: this case starts from an extra term
-        lower  = sk - r;
-        lower4 = lower*lower;
-        lower4 = lower4*lower4;
-        de     = -4.0f*pi/lower4;
-        de    += 0.25f*pi*(sk2-4.0f*sk*r+17.0f*r2)/ (r2*lower4);
-    } else if( r < (atomI.radius +sk) ){
-        lower  = atomI.radius;
-        lower4 = lower*lower;
-        lower4 = lower4*lower4;
-        de    += 0.25f*pi*(2.0f*atomI.radius*atomI.radius-sk2-r2)/ (r2*lower4);
-    } else {
-        lower  = r - sk;
-        lower4 = lower*lower;
-        lower4 = lower4*lower4;
-        de    += 0.25f*pi*(sk2-4.0f*sk*r+r2)/ (r2*lower4);
-    }
-
-    // upper-limit contribution
-    float upper        = r + sk;
-    float upper4       = upper*upper;
-    upper4             = upper4*upper4;
-    de                -= 0.25f*pi*(sk2+4.0f*sk*r+r2)/ (r2*upper4);
-
-    // scale by the Born radius prefactor and by atom I's Born force, then project on the pair vector
-    float dbr          = term * de/r;
-    de                 = dbr*atomI.bornForce;
-    force[0]           = dx*de;
-    force[1]           = dy*de;
-    force[2]           = dz*de;
-}
-// Include the chain rule kernel header twice so that two differently named variants of the N^2 kernel are available.
-#undef METHOD_NAME
-#undef USE_OUTPUT_BUFFER_PER_WARP
-#define METHOD_NAME(a, b) a##N2##b // pastes "N2" between the two name fragments
-#include "kCalculateAmoebaCudaGrycukChainRule.h" // first pass, USE_OUTPUT_BUFFER_PER_WARP undefined; presumably defines kCalculateAmoebaGrycukChainRuleN2_kernel (header not visible here)
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##N2ByWarp##b // pastes "N2ByWarp" between the two name fragments
-#include "kCalculateAmoebaCudaGrycukChainRule.h" // second pass, USE_OUTPUT_BUFFER_PER_WARP defined; presumably defines kCalculateAmoebaGrycukChainRuleN2ByWarp_kernel
-/**---------------------------------------------------------------------------------------
-   Compute Grycuk chain rule contribution to force
-
-   Launches the N2ByWarp kernel variant when gpu->bOutputBufferPerWarp is set and the N2
-   variant otherwise, using sim.nonbond_blocks blocks and sizeof(GrycukChainRuleParticle)
-   bytes of dynamic shared memory per thread.
-
-   @param amoebaGpu        amoebaGpu context
-   --------------------------------------------------------------------------------------- */
-void kCalculateGrycukGbsaForces2( amoebaGpuContext amoebaGpu )
-{
-    gpuContext gpu = amoebaGpu->gpuContext;
-
-    // the block size is chosen on the first call and reused by every later call
-    static unsigned int threadsPerBlock = 0;
-    if( threadsPerBlock == 0 ){
-        unsigned int maxThreads = 64;
-        if (gpu->sm_version >= SM_20) {
-            maxThreads = 512;
-        } else if (gpu->sm_version >= SM_12) {
-            maxThreads = 128;
-        }
-        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(GrycukChainRuleParticle), gpu->sharedMemoryPerBlock ), maxThreads);
-    }
-
-    if (!gpu->bOutputBufferPerWarp){
-        kCalculateAmoebaGrycukChainRuleN2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(GrycukChainRuleParticle)*threadsPerBlock>>>( gpu->psWorkUnit->_pDevData);
-    } else {
-        kCalculateAmoebaGrycukChainRuleN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(GrycukChainRuleParticle)*threadsPerBlock>>>( gpu->psWorkUnit->_pDevData);
-    }
-    LAUNCHERROR("kCalculateAmoebaCudaGrycukN2Forces");
-
-    // disabled debug aid: writes reduced forces, Born forces and Born radii to a "GryF" file, one file id per call
-    if( 0 ){
-        static int callId = 0;
-        std::vector<int> fileId;
-        fileId.push_back( callId++ );
-
-        VectorOfDoubleVectors outputVector;
-        //cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,              outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-        //cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psLabFrameDipole,     outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-
-        // temporary stream that receives the reduced forces; released at the end of this block
-        CUDAStream<float>* temp = new CUDAStream<float>(3*gpu->sim.paddedNumberOfAtoms, 1, "Temp1");
-        reduceAndCopyCUDAStreamFloat4( gpu->psForce4, temp, 1.0 );
-
-        cudaLoadCudaFloatArray( gpu->natoms,  3, temp, outputVector, gpu->psAtomIndex->_pSysData, 1.0f/4.184f );
-        cudaLoadCudaFloatArray( gpu->natoms,  1, gpu->psBornForce, outputVector, gpu->psAtomIndex->_pSysData, 1.0f/4.184f );
-        cudaLoadCudaFloatArray( gpu->natoms,  1, gpu->psBornRadii, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
-        cudaWriteVectorOfDoubleVectorsToFile( "GryF", fileId, outputVector );
-
-        delete temp;
-        //exit(0);
-    }
-}