Continuing converting AMOEBA to the new CUDA platform: AmoebaVdwForce

a99a2d0e · Peter Eastman · a90c90e2 · a99a2d0e · a99a2d0e · a99a2d0e
Commit a99a2d0e authored Jul 12, 2012 by Peter Eastman
6 changed files
--- a/plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernelFactory.cpp
+++ b/plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernelFactory.cpp
@@ -102,10 +102,10 @@ KernelImpl* AmoebaCudaKernelFactory::createKernelImpl(std::string name, const Pl
 //
 //    if (name == CalcAmoebaGeneralizedKirkwoodForceKernel::Name())
 //        return new CudaCalcAmoebaGeneralizedKirkwoodForceKernel(name, platform, cu, context.getSystem());
-//
-//    if (name == CalcAmoebaVdwForceKernel::Name())
-//        return new CudaCalcAmoebaVdwForceKernel(name, platform, cu, context.getSystem());
-//
+
+    if (name == CalcAmoebaVdwForceKernel::Name())
+        return new CudaCalcAmoebaVdwForceKernel(name, platform, cu, context.getSystem());
+
 //    if (name == CalcAmoebaWcaDispersionForceKernel::Name())
 //        return new CudaCalcAmoebaWcaDispersionForceKernel(name, platform, cu, context.getSystem());
 //

--- a/plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernels.cpp
@@ -34,6 +34,7 @@
 #include "CudaBondedUtilities.h"
 #include "CudaForceInfo.h"
 #include "CudaKernelSources.h"
+#include "CudaNonbondedUtilities.h"

 #include <cmath>
 #ifdef _MSC_VER
@@ -43,6 +44,13 @@
 using namespace OpenMM;
 using namespace std;

+#define CHECK_RESULT(result) \
+    if (result != CUDA_SUCCESS) { \
+        std::stringstream m; \
+        m<<errorMessage<<": "<<cu.getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
+        throw OpenMMException(m.str());\
+    }
+
 /* -------------------------------------------------------------------------- *
 *                           AmoebaHarmonicBond                               *
 * -------------------------------------------------------------------------- */
@@ -1126,82 +1134,151 @@ double CudaCalcAmoebaTorsionTorsionForceKernel::execute(ContextImpl& context, bo
 //    // Vdw14_7F
 //    kCalculateAmoebaVdw14_7Forces(gpu, data.getUseVdwNeighborList());
 //}
-//
-///* -------------------------------------------------------------------------- *
-// *                           AmoebaVdw                                        *
-// * -------------------------------------------------------------------------- */
-//
-//class CudaCalcAmoebaVdwForceKernel::ForceInfo : public CudaForceInfo {
-//public:
-//    ForceInfo(const AmoebaVdwForce& force) : force(force) {
-//    }
-//    bool areParticlesIdentical(int particle1, int particle2) {
-//        int iv1, iv2, class1, class2;
-//        double sigma1, sigma2, epsilon1, epsilon2, reduction1, reduction2;
-//        force.getParticleParameters(particle1, iv1, class1, sigma1, epsilon1, reduction1);
-//        force.getParticleParameters(particle2, iv2, class2, sigma2, epsilon2, reduction2);
-//        return (class1 == class2 && sigma1 == sigma2 && epsilon1 == epsilon2 && reduction1 == reduction2);
-//    }
-//private:
-//    const AmoebaVdwForce& force;
-//};
-//
-//CudaCalcAmoebaVdwForceKernel::CudaCalcAmoebaVdwForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) :
-//       CalcAmoebaVdwForceKernel(name, platform), cu(cu), system(system) {
-//    data.incrementKernelCount();
-//}
-//
-//CudaCalcAmoebaVdwForceKernel::~CudaCalcAmoebaVdwForceKernel() {
-//    data.decrementKernelCount();
-//}
-//
-//void CudaCalcAmoebaVdwForceKernel::initialize(const System& system, const AmoebaVdwForce& force) {
-//
-//    // per-particle parameters
-//
-//    int numParticles = system.getNumParticles();
-//
-//    std::vector<int> indexIVs(numParticles);
-//    std::vector<int> indexClasses(numParticles);
-//    std::vector< std::vector<int> > allExclusions(numParticles);
-//    std::vector<float> sigmas(numParticles);
-//    std::vector<float> epsilons(numParticles);
-//    std::vector<float> reductions(numParticles);
-//    for( int ii = 0; ii < numParticles; ii++ ){
-//
-//        int indexIV, indexClass;
-//        double sigma, epsilon, reduction;
-//        std::vector<int> exclusions;
-//
-//        force.getParticleParameters( ii, indexIV, indexClass, sigma, epsilon, reduction );
-//        force.getParticleExclusions( ii, exclusions );
-//        for( unsigned int jj = 0; jj < exclusions.size(); jj++ ){
-//           allExclusions[ii].push_back( exclusions[jj] );
-//        }
-//
-//        indexIVs[ii]      = indexIV;
-//        indexClasses[ii]  = indexClass;
-//        sigmas[ii]        = static_cast<float>( sigma );
-//        epsilons[ii]      = static_cast<float>( epsilon );
-//        reductions[ii]    = static_cast<float>( reduction );
-//    }   
-//
-//    gpuSetAmoebaVdwParameters( data.getAmoebaGpu(), indexIVs, indexClasses, sigmas, epsilons, reductions,
-//                               force.getSigmaCombiningRule(), force.getEpsilonCombiningRule(),
-//                               allExclusions, force.getPBC(), static_cast<float>(force.getCutoff()) );
-//    data.getAmoebaGpu()->gpuContext->forces.push_back(new ForceInfo(force));
-//    if( data.getLog() ){
-//        (void) fprintf( data.getLog(), "CudaCalcAmoebaVdwForceKernel PBC=%d getUseNeighborList=%d\n",
-//                        force.getPBC(), force.getUseNeighborList() );
-//    }
-//    data.setUseVdwNeighborList( force.getUseNeighborList() );
-//}
-//
-//double CudaCalcAmoebaVdwForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
-//    computeAmoebaVdwForce( data );
-//    return 0.0;
-//}
-//
+
+/* -------------------------------------------------------------------------- *
+ *                           AmoebaVdw                                        *
+ * -------------------------------------------------------------------------- */
+
+class CudaCalcAmoebaVdwForceKernel::ForceInfo : public CudaForceInfo {
+public:
+    ForceInfo(const AmoebaVdwForce& force) : force(force) {
+    }
+    bool areParticlesIdentical(int particle1, int particle2) {
+        int iv1, iv2, class1, class2;
+        double sigma1, sigma2, epsilon1, epsilon2, reduction1, reduction2;
+        force.getParticleParameters(particle1, iv1, class1, sigma1, epsilon1, reduction1);
+        force.getParticleParameters(particle2, iv2, class2, sigma2, epsilon2, reduction2);
+        return (class1 == class2 && sigma1 == sigma2 && epsilon1 == epsilon2 && reduction1 == reduction2);
+    }
+private:
+    const AmoebaVdwForce& force;
+};
+
+CudaCalcAmoebaVdwForceKernel::CudaCalcAmoebaVdwForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) :
+        CalcAmoebaVdwForceKernel(name, platform), cu(cu), system(system), hasInitializedNonbonded(false), sigmaEpsilon(NULL),
+        bondReductionAtoms(NULL), bondReductionFactors(NULL), tempPosq(NULL), tempForces(NULL), nonbonded(NULL) {
+}
+
+CudaCalcAmoebaVdwForceKernel::~CudaCalcAmoebaVdwForceKernel() {
+    cu.setAsCurrent();
+    if (sigmaEpsilon != NULL)
+        delete sigmaEpsilon;
+    if (bondReductionAtoms != NULL)
+        delete bondReductionAtoms;
+    if (bondReductionFactors != NULL)
+        delete bondReductionFactors;
+    if (tempPosq != NULL)
+        delete tempPosq;
+    if (tempForces != NULL)
+        delete tempForces;
+    if (nonbonded != NULL)
+        delete nonbonded;
+}
+
+void CudaCalcAmoebaVdwForceKernel::initialize(const System& system, const AmoebaVdwForce& force) {
+    cu.setAsCurrent();
+    sigmaEpsilon = CudaArray::create<float2>(cu, cu.getPaddedNumAtoms(), "sigmaEpsilon");
+    bondReductionAtoms = CudaArray::create<int>(cu, cu.getPaddedNumAtoms(), "bondReductionAtoms");
+    bondReductionFactors = CudaArray::create<float>(cu, cu.getPaddedNumAtoms(), "sigmaEpsilon");
+    tempPosq = new CudaArray(cu, cu.getPaddedNumAtoms(), cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4), "tempPosq");
+    tempForces = CudaArray::create<long long>(cu, 3*cu.getPaddedNumAtoms(), "tempForces");
+    
+    // Record atom parameters.
+    
+    vector<float2> sigmaEpsilonVec(cu.getPaddedNumAtoms(), make_float2(0, 1));
+    vector<int> bondReductionAtomsVec(cu.getPaddedNumAtoms(), 0);
+    vector<float> bondReductionFactorsVec(cu.getPaddedNumAtoms(), 0);
+    vector<vector<int> > exclusions(cu.getNumAtoms());
+    for (int i = 0; i < force.getNumParticles(); i++) {
+        int ivIndex, classIndex;
+        double sigma, epsilon, reductionFactor;
+        force.getParticleParameters(i, ivIndex, classIndex, sigma, epsilon, reductionFactor);
+        sigmaEpsilonVec[i] = make_float2((float) sigma, (float) epsilon);
+        bondReductionAtomsVec[i] = ivIndex;
+        bondReductionFactorsVec[i] = (float) reductionFactor;
+        force.getParticleExclusions(i, exclusions[i]);
+        exclusions[i].push_back(i);
+    }
+    sigmaEpsilon->upload(sigmaEpsilonVec);
+    bondReductionAtoms->upload(bondReductionAtomsVec);
+    bondReductionFactors->upload(bondReductionFactorsVec);
+ 
+    // This force is applied based on modified atom positions, where hydrogens have been moved slightly
+    // closer to their parent atoms.  We therefore create a separate CudaNonbondedUtilities just for
+    // this force, so it will have its own neighbor list and interaction kernel.
+    
+    nonbonded = new CudaNonbondedUtilities(cu);
+    nonbonded->addParameter(CudaNonbondedUtilities::ParameterInfo("sigmaEpsilon", "float", 2, sizeof(float2), sigmaEpsilon->getDevicePointer()));
+    
+    // Create the interaction kernel.
+    
+    map<string, string> replacements;
+    string sigmaCombiningRule = force.getSigmaCombiningRule();
+    if (sigmaCombiningRule == "ARITHMETIC")
+        replacements["SIGMA_COMBINING_RULE"] = "1";
+    else if (sigmaCombiningRule == "GEOMETRIC")
+        replacements["SIGMA_COMBINING_RULE"] = "2";
+    else if (sigmaCombiningRule == "CUBIC-MEAN")
+        replacements["SIGMA_COMBINING_RULE"] = "3";
+    else
+        throw OpenMMException("Illegal combining rule for sigma: "+sigmaCombiningRule);
+    string epsilonCombiningRule = force.getEpsilonCombiningRule();
+    if (epsilonCombiningRule == "ARITHMETIC")
+        replacements["EPILON_COMBINING_RULE"] = "1";
+    else if (epsilonCombiningRule == "GEOMETRIC")
+        replacements["EPILON_COMBINING_RULE"] = "2";
+    else if (epsilonCombiningRule == "HARMONIC")
+        replacements["EPILON_COMBINING_RULE"] = "3";
+    else if (epsilonCombiningRule == "HHG")
+        replacements["EPILON_COMBINING_RULE"] = "4";
+    else
+        throw OpenMMException("Illegal combining rule for sigma: "+sigmaCombiningRule);
+    double cutoff = force.getCutoff();
+    double taperCutoff = cutoff*0.9;
+    replacements["CUTOFF_DISTANCE"] = cu.doubleToString(force.getCutoff());
+    replacements["TAPER_CUTOFF"] = cu.doubleToString(taperCutoff);
+    double cutoff2 = cutoff*cutoff;
+    double taperCutoff2 = taperCutoff*taperCutoff;
+    double denom = pow(cutoff-taperCutoff, -5.0);
+    replacements["TAPER_C0"] = cu.doubleToString(cutoff*cutoff2 * (cutoff2-5.0*cutoff*taperCutoff+10.0*taperCutoff2)*denom);
+    replacements["TAPER_C1"] = cu.doubleToString(-30.0*cutoff2*taperCutoff2*denom);
+    replacements["TAPER_C2"] = cu.doubleToString(30.0*(cutoff2*taperCutoff+cutoff*taperCutoff2)*denom);
+    replacements["TAPER_C3"] = cu.doubleToString(-10.0*(cutoff2+4.0*cutoff*taperCutoff+taperCutoff2)*denom);
+    replacements["TAPER_C4"] = cu.doubleToString(15.0*(cutoff+taperCutoff)*denom);
+    replacements["TAPER_C5"] = cu.doubleToString(-6.0*denom);
+    nonbonded->addInteraction(force.getUseNeighborList(), force.getPBC(), true, force.getCutoff(), exclusions,
+        cu.replaceStrings(CudaAmoebaKernelSources::amoebaVdwForce2, replacements), force.getForceGroup());
+    
+    // Create the other kernels.
+    
+    map<string, string> defines;
+    defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
+    CUmodule module = cu.createModule(CudaAmoebaKernelSources::amoebaVdwForce1, defines);
+    prepareKernel = cu.getKernel(module, "prepareToComputeForce");
+    spreadKernel = cu.getKernel(module, "spreadForces");
+    cu.addForce(new ForceInfo(force));
+}
+
+double CudaCalcAmoebaVdwForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    if (!hasInitializedNonbonded) {
+        hasInitializedNonbonded = true;
+        nonbonded->initialize(system);
+    }
+    const char* errorMessage = "Error copying array";
+    CHECK_RESULT(cuMemcpyDtoDAsync(tempPosq->getDevicePointer(), cu.getPosq().getDevicePointer(), tempPosq->getSize()*tempPosq->getElementSize(), 0));
+    CHECK_RESULT(cuMemcpyDtoDAsync(tempForces->getDevicePointer(), cu.getForce().getDevicePointer(), tempForces->getSize()*tempForces->getElementSize(), 0));
+    void* prepareArgs[] = {&cu.getForce().getDevicePointer(), &cu.getPosq().getDevicePointer(), &tempPosq->getDevicePointer(),
+        &bondReductionAtoms->getDevicePointer(), &bondReductionFactors->getDevicePointer()};
+    cu.executeKernel(prepareKernel, prepareArgs, cu.getPaddedNumAtoms());
+    nonbonded->prepareInteractions();
+    nonbonded->computeInteractions();
+    void* spreadArgs[] = {&cu.getForce().getDevicePointer(), &tempForces->getDevicePointer(), &bondReductionAtoms->getDevicePointer(), &bondReductionFactors->getDevicePointer()};
+    cu.executeKernel(spreadKernel, spreadArgs, cu.getPaddedNumAtoms());
+    CHECK_RESULT(cuMemcpyDtoDAsync(cu.getPosq().getDevicePointer(), tempPosq->getDevicePointer(), tempPosq->getSize()*tempPosq->getElementSize(), 0));
+    CHECK_RESULT(cuMemcpyDtoDAsync(cu.getForce().getDevicePointer(), tempForces->getDevicePointer(), tempForces->getSize()*tempForces->getElementSize(), 0));
+    return 0.0;
+}
+
 ///* -------------------------------------------------------------------------- *
 // *                           AmoebaWcaDispersion                              *
 // * -------------------------------------------------------------------------- */

--- a/plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernels.h
+++ b/plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernels.h
@@ -432,6 +432,14 @@ private:
    class ForceInfo;
    CudaContext& cu;
    System& system;
+    bool hasInitializedNonbonded;
+    CudaArray* sigmaEpsilon;
+    CudaArray* bondReductionAtoms;
+    CudaArray* bondReductionFactors;
+    CudaArray* tempPosq;
+    CudaArray* tempForces;
+    CudaNonbondedUtilities* nonbonded;
+    CUfunction prepareKernel, spreadKernel;
 };

 /**

--- a/plugins/amoeba/platforms/cuda2/src/kernels/amoebaVdwForce1.cu
+++ b/plugins/amoeba/platforms/cuda2/src/kernels/amoebaVdwForce1.cu
+/**
+ * Clear the forces, and compute the position to use for each atom based on the bond reduction factors.
+ */
+extern "C" __global__ void prepareToComputeForce(unsigned long long* __restrict__ forceBuffers, real4* __restrict__ posq, const real4* __restrict__ tempPosq,
+        const int* __restrict__ bondReductionAtoms, const float* __restrict__ bondReductionFactors) {
+    for (unsigned int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < PADDED_NUM_ATOMS; atom += blockDim.x*gridDim.x) {
+        forceBuffers[atom] = 0;
+        forceBuffers[atom+PADDED_NUM_ATOMS] = 0;
+        forceBuffers[atom+PADDED_NUM_ATOMS*2] = 0;
+        real4 pos1 = tempPosq[atom];
+        real4 pos2 = tempPosq[bondReductionAtoms[atom]];
+        real factor = (real) bondReductionFactors[atom];
+        posq[atom] = make_real4(factor*pos1.x + (1-factor)*pos2.x,
+                                    factor*pos1.y + (1-factor)*pos2.y,
+                                    factor*pos1.z + (1-factor)*pos2.z, pos1.w);
+    }
+}
+
+/**
+ * Spread the forces between atoms based on the bond reduction factors.
+ */
+extern "C" __global__ void spreadForces(const unsigned long long* __restrict__ forceBuffers, unsigned long long* __restrict__ tempForceBuffers,
+        const int* __restrict__ bondReductionAtoms, const float* __restrict__ bondReductionFactors) {
+    for (unsigned int atom1 = blockIdx.x*blockDim.x+threadIdx.x; atom1 < PADDED_NUM_ATOMS; atom1 += blockDim.x*gridDim.x) {
+        int atom2 = bondReductionAtoms[atom1];
+        long long fx1 = forceBuffers[atom1];
+        long long fy1 = forceBuffers[atom1+PADDED_NUM_ATOMS];
+        long long fz1 = forceBuffers[atom1+PADDED_NUM_ATOMS*2];
+        if (atom1 != atom2) {
+            double factor = (double) bondReductionFactors[atom1];
+            long long fx2 = (long long) ((1-factor)*fx1);
+            long long fy2 = (long long) ((1-factor)*fy1);
+            long long fz2 = (long long) ((1-factor)*fz1);
+            atomicAdd(&tempForceBuffers[atom2], static_cast<unsigned long long>(fx2));
+            atomicAdd(&tempForceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>(fy2));
+            atomicAdd(&tempForceBuffers[atom2+PADDED_NUM_ATOMS*2], static_cast<unsigned long long>(fz2));
+            fx1 = (long long) (factor*fx1);
+            fy1 = (long long) (factor*fy1);
+            fz1 = (long long) (factor*fz1);
+        }
+        atomicAdd(&tempForceBuffers[atom1], static_cast<unsigned long long>(fx1));
+        atomicAdd(&tempForceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>(fy1));
+        atomicAdd(&tempForceBuffers[atom1+PADDED_NUM_ATOMS*2], static_cast<unsigned long long>(fz1));
+    }
+}
--- a/plugins/amoeba/platforms/cuda2/src/kernels/amoebaVdwForce2.cu
+++ b/plugins/amoeba/platforms/cuda2/src/kernels/amoebaVdwForce2.cu
+{
+#ifdef USE_CUTOFF
+    unsigned int includeInteraction = (!isExcluded && r2 < CUTOFF_SQUARED);
+#else
+    unsigned int includeInteraction = (!isExcluded);
+#endif
+    real tempForce = 0.0f;
+#if SIGMA_COMBINING_RULE == 1
+    real sigma = sigmaEpsilon1.x + sigmaEpsilon2.x;
+#elif SIGMA_COMBINING_RULE == 2
+    real sigma = 2*SQRT(sigmaEpsilon1.x*sigmaEpsilon2.x);
+#else
+    real sigma1_2 = sigmaEpsilon1.x*sigmaEpsilon1.x;
+    real sigma2_2 = sigmaEpsilon2.x*sigmaEpsilon2.x;
+    real sigma = 2*(sigmaEpsilon1.x*sigma1_2 + sigmaEpsilon2.x*sigma2_2)/(sigma1_2+sigma2_2);
+#endif
+#if EPSILON_COMBINING_RULE == 1
+    real epsilon = sigmaEpsilon1.y + sigmaEpsilon2.y;
+#elif EPSILON_COMBINING_RULE == 2
+    real epsilon = 2*SQRT(sigmaEpsilon1.y*sigmaEpsilon2.y);
+#elif EPSILON_COMBINING_RULE == 3
+    real epsilon1_2 = sigmaEpsilon1.x*sigmaEpsilon1.x;
+    real epsilon2_2 = sigmaEpsilon2.x*sigmaEpsilon2.x;
+    real epsilon = 2*(sigmaEpsilon1.x*epsilon1_2 + sigmaEpsilon2.x*epsilon2_2)/(epsilon1_2+epsilon2_2);
+#else
+    real epsilon_s = SQRT(sigmaEpsilon1.y) + SQRT(sigmaEpsilon2.y);
+    real epsilon = 4*sigmaEpsilon1.y*sigmaEpsilon2.y/(epsilon_s*epsilon_s);
+#endif
+    real r6 = r2*r2*r2;
+    real r7 = r6*r;
+    real sigma7 = sigma*sigma;
+    sigma7 = sigma7*sigma7*sigma7*sigma;
+    real rho = r7 + sigma7*0.12f;
+    real invRho = RECIP(rho);
+    real tau = 1.07f/(r + 0.07f*sigma);
+    real tau7 = tau*tau*tau;
+    tau7 = tau7*tau7*tau;
+    real dTau = tau/1.07f;
+    real tmp = sigma7*invRho;
+    real gTau = epsilon*tau7*r6*1.12f*tmp*tmp;
+    real termEnergy = epsilon*sigma7*tau7*((sigma7*1.12f*invRho)-2.0f);
+    real deltaE = (-7.0f*(dTau*termEnergy+gTau))*invR;
+    if (r > TAPER_CUTOFF) {
+        real taper = TAPER_C0+r*(TAPER_C1+r*(TAPER_C2+r*(TAPER_C3+r*(TAPER_C4+r*TAPER_C5))));
+        real dtaper = TAPER_C1+r*(2*TAPER_C2+r*(3*TAPER_C3+r*(4*TAPER_C4+r*5*TAPER_C5)));
+        deltaE = termEnergy*dtaper + deltaE*taper;
+        termEnergy *= taper;
+    }
+    tempEnergy += (includeInteraction ? termEnergy : 0);
+    dEdR -= (includeInteraction ? deltaE : 0);
+}
--- a/plugins/amoeba/platforms/cuda2/tests/TestCudaAmoebaVdwForce.cpp
+++ b/plugins/amoeba/platforms/cuda2/tests/TestCudaAmoebaVdwForce.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMMAmoeba                             *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008 Stanford University and the Authors.           *
+ * Authors: Mark Friedrichs                                                   *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+/**
+ * This tests the CUDA implementation of CudaAmoebaVdwForce.
+ */
+
+#include "openmm/internal/AssertionUtilities.h"
+#include "openmm/Context.h"
+#include "OpenMMAmoeba.h"
+#include "AmoebaTinkerParameterFile.h"
+#include "openmm/System.h"
+#include "openmm/AmoebaVdwForce.h"
+#include "openmm/LangevinIntegrator.h"
+#include <iostream>
+#include <vector>
+
+using namespace OpenMM;
+const double TOL = 1e-4;
+
+void testVdw( FILE* log ) {
+
+    System system;
+    int numberOfParticles          = 6;
+    AmoebaVdwForce* amoebaVdwForce = new AmoebaVdwForce();
+    std::string sigmaCombiningRule = std::string("CUBIC-MEAN");
+    amoebaVdwForce->setSigmaCombiningRule( sigmaCombiningRule );
+
+    std::string epsilonCombiningRule = std::string("HHG");
+    amoebaVdwForce->setEpsilonCombiningRule( epsilonCombiningRule );
+
+    for( int ii = 0; ii < numberOfParticles; ii++ ){
+        int indexIV, indexClass;
+        double mass, sigma, epsilon, reduction;
+        std::vector< int > exclusions;
+        if( ii == 0 || ii == 3 ){
+            mass        = 16.0;
+            indexIV     = ii;
+            indexClass  = 70;
+            sigma       = 1.70250E+00;
+            epsilon     = 1.10000E-01;
+            reduction   = 0.0;
+        } else {
+            mass        = 1.0;
+            indexIV     = ii < 3 ? 0 : 3;
+            indexClass  = 71;
+            sigma       = 1.32750E+00;
+            epsilon     = 1.35000E-02;
+            reduction   = 0.91;
+        }
+
+        // exclusions
+
+        if( ii < 3 ){
+            exclusions.push_back ( 0 );
+            exclusions.push_back ( 1 );
+            exclusions.push_back ( 2 );
+        } else {
+            exclusions.push_back ( 3 );
+            exclusions.push_back ( 4 );
+            exclusions.push_back ( 5 );
+        }
+        system.addParticle(mass);
+        amoebaVdwForce->addParticle( indexIV, indexClass, sigma, epsilon, reduction );
+        amoebaVdwForce->setParticleExclusions( ii, exclusions );
+    }
+    LangevinIntegrator integrator(0.0, 0.1, 0.01);
+
+    std::vector<Vec3> positions(numberOfParticles);
+    std::vector<Vec3> expectedForces(numberOfParticles);
+    double expectedEnergy;
+
+    positions[0]          = Vec3( -0.254893450E+02, -0.876646600E+01,  0.174761600E+01 );
+    positions[1]          = Vec3( -0.263489690E+02, -0.907798000E+01,  0.205385100E+01 );
+    positions[2]          = Vec3( -0.252491680E+02, -0.949411200E+01,  0.115017600E+01 );
+    positions[3]          = Vec3(  0.172827200E+01,  0.195873090E+02,  0.100059800E+01 );
+    positions[4]          = Vec3(  0.129370700E+01,  0.190112810E+02,  0.169576300E+01 );
+    positions[5]          = Vec3(  0.256122300E+01,  0.191601930E+02,  0.854382000E+00 );
+
+    double offset         = 27.0;
+    for( int ii = 0; ii < 3; ii++ ){
+       positions[ii][0]      += offset;
+       positions[ii][1]      += offset;
+    }
+
+    expectedForces[0]     = Vec3(  -0.729561040E+03,  0.425828484E+04, -0.769114213E+03 );
+    expectedForces[1]     = Vec3(   0.181000041E+02,  0.328216639E+02, -0.126210511E+02 );
+    expectedForces[2]     = Vec3(  -0.943743014E+00,  0.199728310E+02,  0.884567842E+00 );
+    expectedForces[3]     = Vec3(   0.615734500E+01, -0.747350431E+03,  0.264726489E+03 );
+    expectedForces[4]     = Vec3(   0.735772031E+03, -0.353310112E+04,  0.490066356E+03 );
+    expectedForces[5]     = Vec3(  -0.295245970E+02, -0.306277797E+02,  0.260578506E+02 );
+
+    expectedEnergy        = 0.740688488E+03;
+
+    system.addForce(amoebaVdwForce);
+    std::string platformName;
+    #define AngstromToNm 0.1    
+    #define CalToJoule   4.184    
+    for( int ii = 0; ii < numberOfParticles; ii++ ){
+        positions[ii][0] *= AngstromToNm;
+        positions[ii][1] *= AngstromToNm;
+        positions[ii][2] *= AngstromToNm;
+    }
+    for( int ii = 0; ii < amoebaVdwForce->getNumParticles();  ii++ ){
+        int indexIV, indexClass;
+        double sigma, epsilon, reduction;
+        amoebaVdwForce->getParticleParameters( ii, indexIV, indexClass, sigma, epsilon, reduction );
+        sigma        *= AngstromToNm;
+        epsilon      *= CalToJoule;
+        amoebaVdwForce->setParticleParameters( ii, indexIV, indexClass, sigma, epsilon, reduction );
+    }
+    platformName = "CUDA";
+    Context context(system, integrator, Platform::getPlatformByName( platformName ) );
+
+    context.setPositions(positions);
+    State state                      = context.getState(State::Forces | State::Energy);
+    std::vector<Vec3> forces         = state.getForces();
+    const double conversion          = -AngstromToNm/CalToJoule;
+
+    for( unsigned int ii = 0; ii < forces.size(); ii++ ){
+        forces[ii][0] *= conversion;
+        forces[ii][1] *= conversion;
+        forces[ii][2] *= conversion;
+    }    
+    expectedEnergy *= CalToJoule;
+
+#ifdef AMOEBA_DEBUG
+    if( log ){
+        (void) fprintf( log, "computeAmoebaVdwForces: expected energy=%14.7e %14.7e\n", expectedEnergy, state.getPotentialEnergy() );
+        for( unsigned int ii = 0; ii < forces.size(); ii++ ){
+            (void) fprintf( log, "%6u [%14.7e %14.7e %14.7e]   [%14.7e %14.7e %14.7e]\n", ii,
+                            expectedForces[ii][0], expectedForces[ii][1], expectedForces[ii][2], forces[ii][0], forces[ii][1], forces[ii][2] );
+        }
+        (void) fflush( log );
+    }
+#endif
+
+    double tolerance = 1.0e-03;
+    for( unsigned int ii = 0; ii < forces.size(); ii++ ){
+        ASSERT_EQUAL_VEC( expectedForces[ii], forces[ii], tolerance );
+    }
+    ASSERT_EQUAL_TOL( expectedEnergy, state.getPotentialEnergy(), tolerance );
+}
+
+
+int main( int numberOfArguments, char* argv[] ) {
+
+    try {
+        std::cout << "TestCudaAmoebaVdwForce running test..." << std::endl;
+        registerAmoebaCudaKernelFactories();
+
+        FILE* log = NULL;
+        testVdw( log );
+    } catch(const std::exception& e) {
+        std::cout << "exception: " << e.what() << std::endl;
+        std::cout << "FAIL - ERROR.  Test failed." << std::endl;
+        return 1;
+    }
+    std::cout << "Done" << std::endl;
+    return 0;
+}