Merge branch 'master' into nucleic

fd473eea · Peter Eastman · 0a751b5b · 6a985cfd · fd473eea · fd473eea
Commit fd473eea authored Oct 29, 2015 by Peter Eastman
20 changed files
--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -31,6 +31,7 @@
 #include "openmm/internal/AndersenThermostatImpl.h"
 #include "openmm/internal/CMAPTorsionForceImpl.h"
 #include "openmm/internal/ContextImpl.h"
+#include "openmm/internal/CustomCentroidBondForceImpl.h"
 #include "openmm/internal/CustomCompoundBondForceImpl.h"
 #include "openmm/internal/CustomHbondForceImpl.h"
 #include "openmm/internal/CustomManyParticleForceImpl.h"
@@ -104,14 +105,14 @@ void CudaCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool

 double CudaCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForces, bool includeEnergy, int groups, bool& valid) {
    cu.getBondedUtilities().computeInteractions(groups);
-    cu.getNonbondedUtilities().computeInteractions(groups);
+    cu.getNonbondedUtilities().computeInteractions(groups, includeForces, includeEnergy);
    double sum = 0.0;
    for (vector<CudaContext::ForcePostComputation*>::iterator iter = cu.getPostComputations().begin(); iter != cu.getPostComputations().end(); ++iter)
        sum += (*iter)->computeForceAndEnergy(includeForces, includeEnergy, groups);
    cu.getIntegrationUtilities().distributeForcesFromVirtualSites();
    if (includeEnergy) {
        CudaArray& energyArray = cu.getEnergyBuffer();
-        if (cu.getUseDoublePrecision()) {
+        if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
            double* energy = (double*) cu.getPinnedBuffer();
            energyArray.download(energy);
            for (int i = 0; i < energyArray.getSize(); i++)
@@ -1560,8 +1561,9 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
    }
    posq.upload(&temp[0]);
    sigmaEpsilon->upload(sigmaEpsilonVector);
-    bool useCutoff = (force.getNonbondedMethod() != NonbondedForce::NoCutoff);
-    bool usePeriodic = (force.getNonbondedMethod() != NonbondedForce::NoCutoff && force.getNonbondedMethod() != NonbondedForce::CutoffNonPeriodic);
+    nonbondedMethod = CalcNonbondedForceKernel::NonbondedMethod(force.getNonbondedMethod());
+    bool useCutoff = (nonbondedMethod != NoCutoff);
+    bool usePeriodic = (nonbondedMethod != NoCutoff && nonbondedMethod != CutoffNonPeriodic);
    map<string, string> defines;
    defines["HAS_COULOMB"] = (hasCoulomb ? "1" : "0");
    defines["HAS_LENNARD_JONES"] = (hasLJ ? "1" : "0");
@@ -1589,7 +1591,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
        dispersionCoefficient = 0.0;
    alpha = 0;
    ewaldSelfEnergy = 0.0;
-    if (force.getNonbondedMethod() == NonbondedForce::Ewald) {
+    if (nonbondedMethod == Ewald) {
        // Compute the Ewald parameters.

        int kmaxx, kmaxy, kmaxz;
@@ -1618,11 +1620,9 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
            cosSinSums = new CudaArray(cu, (2*kmaxx-1)*(2*kmaxy-1)*(2*kmaxz-1), elementSize, "cosSinSums");
        }
    }
-    else if (force.getNonbondedMethod() == NonbondedForce::PME) {
+    else if (nonbondedMethod == PME) {
        // Compute the PME parameters.

-        int gridSizeX, gridSizeY, gridSizeZ;
-
        NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSizeX, gridSizeY, gridSizeZ);
        gridSizeX = CudaFFT3D::findLegalDimension(gridSizeX);
        gridSizeY = CudaFFT3D::findLegalDimension(gridSizeY);
@@ -1682,7 +1682,9 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
                pmeAtomRange = CudaArray::create<int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
                pmeAtomGridIndex = CudaArray::create<int2>(cu, numParticles, "pmeAtomGridIndex");
                sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms());
-                useCudaFFT = false; // We might switch back in the future, once Nvidia has all their bugs worked out
+                int cufftVersion;
+                cufftGetVersion(&cufftVersion);
+                useCudaFFT = (cufftVersion >= 7050); // There was a critical bug in version 7.0
                if (useCudaFFT) {
                    cufftResult result = cufftPlan3d(&fftForward, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_D2Z : CUFFT_R2C);
                    if (result != CUFFT_SUCCESS)
@@ -1993,14 +1995,26 @@ void CudaCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context,
    
    // Compute other values.
    
-    NonbondedForce::NonbondedMethod method = force.getNonbondedMethod();
-    if (method == NonbondedForce::Ewald || method == NonbondedForce::PME)
+    if (nonbondedMethod == Ewald || nonbondedMethod == PME)
        ewaldSelfEnergy = (cu.getContextIndex() == 0 ? -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI) : 0.0);
-    if (force.getUseDispersionCorrection() && cu.getContextIndex() == 0 && (method == NonbondedForce::CutoffPeriodic || method == NonbondedForce::Ewald || method == NonbondedForce::PME))
+    if (force.getUseDispersionCorrection() && cu.getContextIndex() == 0 && (nonbondedMethod == CutoffPeriodic || nonbondedMethod == Ewald || nonbondedMethod == PME))
        dispersionCoefficient = NonbondedForceImpl::calcDispersionCorrection(context.getSystem(), force);
    cu.invalidateMolecules();
 }

+void CudaCalcNonbondedForceKernel::getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
+    if (nonbondedMethod != PME)
+        throw OpenMMException("getPMEParametersInContext: This Context is not using PME");
+    if (cu.getPlatformData().useCpuPme)
+        cpuPme.getAs<CalcPmeReciprocalForceKernel>().getPMEParameters(alpha, nx, ny, nz);
+    else {
+        alpha = this->alpha;
+        nx = gridSizeX;
+        ny = gridSizeY;
+        nz = gridSizeZ;
+    }
+}
+
 class CudaCustomNonbondedForceInfo : public CudaForceInfo {
 public:
    CudaCustomNonbondedForceInfo(const CustomNonbondedForce& force) : force(force) {
@@ -2570,8 +2584,8 @@ void CudaCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOBCF
    cutoff = force.getCutoffDistance();
    string source = CudaKernelSources::gbsaObc2;
    nb.addInteraction(useCutoff, usePeriodic, false, cutoff, vector<vector<int> >(), source, force.getForceGroup());
-    nb.addParameter(CudaNonbondedUtilities::ParameterInfo("obcParams", "float", 2, sizeof(float2), params->getDevicePointer()));;
-    nb.addParameter(CudaNonbondedUtilities::ParameterInfo("bornForce", "long long", 1, sizeof(long long), bornForce->getDevicePointer()));;
+    nb.addParameter(CudaNonbondedUtilities::ParameterInfo("obcParams", "float", 2, sizeof(float2), params->getDevicePointer()));
+    nb.addParameter(CudaNonbondedUtilities::ParameterInfo("bornForce", "long long", 1, sizeof(long long), bornForce->getDevicePointer()));
    cu.addForce(new CudaGBSAOBCForceInfo(force));
 }

@@ -3638,7 +3652,9 @@ void CudaCalcCustomExternalForceKernel::initialize(const System& system, const C
        globalParamNames[i] = force.getGlobalParameterName(i);
        globalParamValues[i] = (float) force.getGlobalParameterDefaultValue(i);
    }
-    Lepton::ParsedExpression energyExpression = Lepton::Parser::parse(force.getEnergyFunction()).optimize();
+    map<string, Lepton::CustomFunction*> customFunctions;
+    customFunctions["periodicdistance"] = cu.getExpressionUtilities().getPeriodicDistancePlaceholder();
+    Lepton::ParsedExpression energyExpression = Lepton::Parser::parse(force.getEnergyFunction(), customFunctions).optimize();
    Lepton::ParsedExpression forceExpressionX = energyExpression.differentiate("x").optimize();
    Lepton::ParsedExpression forceExpressionY = energyExpression.differentiate("y").optimize();
    Lepton::ParsedExpression forceExpressionZ = energyExpression.differentiate("z").optimize();
@@ -4242,6 +4258,435 @@ void CudaCalcCustomHbondForceKernel::copyParametersToContext(ContextImpl& contex
    cu.invalidateMolecules();
 }

+class CudaCustomCentroidBondForceInfo : public CudaForceInfo {
+public:
+    CudaCustomCentroidBondForceInfo(const CustomCentroidBondForce& force) : force(force) {
+    }
+    int getNumParticleGroups() {
+        return force.getNumBonds();
+    }
+    void getParticlesInGroup(int index, vector<int>& particles) {
+        vector<double> parameters;
+        vector<int> groups;
+        force.getBondParameters(index, groups, parameters);
+        for (int i = 0; i < groups.size(); i++) {
+            vector<int> groupParticles;
+            vector<double> weights;
+            force.getGroupParameters(groups[i], groupParticles, weights);
+            particles.insert(particles.end(), groupParticles.begin(), groupParticles.end());
+        }
+    }
+    bool areGroupsIdentical(int group1, int group2) {
+        vector<int> groups1, groups2;
+        vector<double> parameters1, parameters2;
+        force.getBondParameters(group1, groups1, parameters1);
+        force.getBondParameters(group2, groups2, parameters2);
+        for (int i = 0; i < (int) parameters1.size(); i++)
+            if (parameters1[i] != parameters2[i])
+                return false;
+        for (int i = 0; i < groups1.size(); i++) {
+            vector<int> groupParticles;
+            vector<double> weights1, weights2;
+            force.getGroupParameters(groups1[i], groupParticles, weights1);
+            force.getGroupParameters(groups2[i], groupParticles, weights2);
+            if (weights1.size() != weights2.size())
+                return false;
+            for (int j = 0; j < weights1.size(); j++)
+                if (weights1[j] != weights2[j])
+                    return false;
+        }
+        return true;
+    }
+private:
+    const CustomCentroidBondForce& force;
+};
+
+CudaCalcCustomCentroidBondForceKernel::~CudaCalcCustomCentroidBondForceKernel() {
+    cu.setAsCurrent();
+    if (params != NULL)
+        delete params;
+    if (globals != NULL)
+        delete globals;
+    if (groupParticles != NULL)
+        delete groupParticles;
+    if (groupWeights != NULL)
+        delete groupWeights;
+    if (groupOffsets != NULL)
+        delete groupOffsets;
+    if (groupForces != NULL)
+        delete groupForces;
+    if (bondGroups != NULL)
+        delete bondGroups;
+    if (centerPositions != NULL)
+        delete centerPositions;
+    for (int i = 0; i < (int) tabulatedFunctions.size(); i++)
+        delete tabulatedFunctions[i];
+}
+
+void CudaCalcCustomCentroidBondForceKernel::initialize(const System& system, const CustomCentroidBondForce& force) {
+    cu.setAsCurrent();
+    numBonds = force.getNumBonds();
+    if (numBonds == 0)
+        return;
+    cu.addForce(new CudaCustomCentroidBondForceInfo(force));
+    
+    // Record the groups.
+    
+    numGroups = force.getNumGroups();
+    vector<int> groupParticleVec;
+    vector<float> groupWeightVecFloat;
+    vector<double> groupWeightVecDouble;
+    vector<int> groupOffsetVec;
+    groupOffsetVec.push_back(0);
+    for (int i = 0; i < numGroups; i++) {
+        vector<int> particles;
+        vector<double> weights;
+        force.getGroupParameters(i, particles, weights);
+        groupParticleVec.insert(groupParticleVec.end(), particles.begin(), particles.end());
+        groupOffsetVec.push_back(groupParticleVec.size());
+    }
+    vector<vector<double> > normalizedWeights;
+    CustomCentroidBondForceImpl::computeNormalizedWeights(force, system, normalizedWeights);
+    if (cu.getUseDoublePrecision()) {
+        for (int i = 0; i < numGroups; i++)
+            groupWeightVecDouble.insert(groupWeightVecDouble.end(), normalizedWeights[i].begin(), normalizedWeights[i].end());
+    }
+    else {
+        for (int i = 0; i < numGroups; i++)
+            for (int j = 0; j < normalizedWeights[i].size(); j++)
+                groupWeightVecFloat.push_back((float) normalizedWeights[i][j]);
+    }
+    groupParticles = CudaArray::create<int>(cu, groupParticleVec.size(), "groupParticles");
+    groupParticles->upload(groupParticleVec);
+    if (cu.getUseDoublePrecision()) {
+        groupWeights = CudaArray::create<double>(cu, groupParticleVec.size(), "groupWeights");
+        groupWeights->upload(groupWeightVecDouble);
+        centerPositions = CudaArray::create<double4>(cu, numGroups, "centerPositions");
+    }
+    else {
+        groupWeights = CudaArray::create<float>(cu, groupParticleVec.size(), "groupWeights");
+        groupWeights->upload(groupWeightVecFloat);
+        centerPositions = CudaArray::create<float4>(cu, numGroups, "centerPositions");
+    }
+    groupOffsets = CudaArray::create<int>(cu, groupOffsetVec.size(), "groupOffsets");
+    groupOffsets->upload(groupOffsetVec);
+    groupForces = CudaArray::create<long long>(cu, numGroups*3, "groupForces");
+    cu.addAutoclearBuffer(*groupForces);
+    
+    // Record the bonds.
+    
+    int groupsPerBond = force.getNumGroupsPerBond();
+    vector<int> bondGroupVec(numBonds*groupsPerBond);
+    params = new CudaParameterSet(cu, force.getNumPerBondParameters(), numBonds, "customCentroidBondParams");
+    vector<vector<float> > paramVector(numBonds);
+    for (int i = 0; i < numBonds; i++) {
+        vector<int> groups;
+        vector<double> parameters;
+        force.getBondParameters(i, groups, parameters);
+        for (int j = 0; j < groups.size(); j++)
+            bondGroupVec[i+j*numBonds] = groups[j];
+        paramVector[i].resize(parameters.size());
+        for (int j = 0; j < (int) parameters.size(); j++)
+            paramVector[i][j] = (float) parameters[j];
+    }
+    params->setParameterValues(paramVector);
+    bondGroups = CudaArray::create<int>(cu, bondGroupVec.size(), "bondGroups");
+    bondGroups->upload(bondGroupVec);
+    
+    // Record the arguments to the force kernel.
+    
+    groupForcesArgs.push_back(&groupForces->getDevicePointer());
+    groupForcesArgs.push_back(NULL); // Energy buffer hasn't been created yet
+    groupForcesArgs.push_back(&centerPositions->getDevicePointer());
+    groupForcesArgs.push_back(&bondGroups->getDevicePointer());
+
+    // Record the tabulated functions.
+
+    map<string, Lepton::CustomFunction*> functions;
+    vector<pair<string, string> > functionDefinitions;
+    vector<const TabulatedFunction*> functionList;
+    stringstream extraArgs;
+    for (int i = 0; i < force.getNumTabulatedFunctions(); i++) {
+        functionList.push_back(&force.getTabulatedFunction(i));
+        string name = force.getTabulatedFunctionName(i);
+        string arrayName = "table"+cu.intToString(i);
+        functionDefinitions.push_back(make_pair(name, arrayName));
+        functions[name] = cu.getExpressionUtilities().getFunctionPlaceholder(force.getTabulatedFunction(i));
+        int width;
+        vector<float> f = cu.getExpressionUtilities().computeFunctionCoefficients(force.getTabulatedFunction(i), width);
+        tabulatedFunctions.push_back(CudaArray::create<float>(cu, f.size(), "TabulatedFunction"));
+        tabulatedFunctions.back()->upload(f);
+        extraArgs << ", const float";
+        if (width > 1)
+            extraArgs << width;
+        extraArgs << "* __restrict__ " << arrayName;
+        groupForcesArgs.push_back(&tabulatedFunctions.back()->getDevicePointer());
+    }
+    
+    // Record information about parameters.
+
+    globalParamNames.resize(force.getNumGlobalParameters());
+    globalParamValues.resize(force.getNumGlobalParameters());
+    for (int i = 0; i < force.getNumGlobalParameters(); i++) {
+        globalParamNames[i] = force.getGlobalParameterName(i);
+        globalParamValues[i] = (float) force.getGlobalParameterDefaultValue(i);
+    }
+    map<string, string> variables;
+    for (int i = 0; i < groupsPerBond; i++) {
+        string index = cu.intToString(i+1);
+        variables["x"+index] = "pos"+index+".x";
+        variables["y"+index] = "pos"+index+".y";
+        variables["z"+index] = "pos"+index+".z";
+    }
+    for (int i = 0; i < force.getNumPerBondParameters(); i++) {
+        const string& name = force.getPerBondParameterName(i);
+        variables[name] = "bondParams"+params->getParameterSuffix(i);
+    }
+    if (force.getNumGlobalParameters() > 0) {
+        globals = CudaArray::create<float>(cu, force.getNumGlobalParameters(), "customCentroidBondGlobals");
+        globals->upload(globalParamValues);
+        extraArgs << ", const float* __restrict__ globals";
+        for (int i = 0; i < force.getNumGlobalParameters(); i++) {
+            const string& name = force.getGlobalParameterName(i);
+            string value = "globals["+cu.intToString(i)+"]";
+            variables[name] = value;
+        }
+        groupForcesArgs.push_back(&globals->getDevicePointer());
+    }
+
+    // Now to generate the kernel.  First, it needs to calculate all distances, angles,
+    // and dihedrals the expression depends on.
+
+    map<string, vector<int> > distances;
+    map<string, vector<int> > angles;
+    map<string, vector<int> > dihedrals;
+    Lepton::ParsedExpression energyExpression = CustomCentroidBondForceImpl::prepareExpression(force, functions, distances, angles, dihedrals);
+    map<string, Lepton::ParsedExpression> forceExpressions;
+    set<string> computedDeltas;
+    vector<string> atomNames, posNames;
+    for (int i = 0; i < groupsPerBond; i++) {
+        string index = cu.intToString(i+1);
+        atomNames.push_back("P"+index);
+        posNames.push_back("pos"+index);
+    }
+    stringstream compute;
+    for (int i = 0; i < groupsPerBond; i++) {
+        compute<<"int group"<<(i+1)<<" = bondGroups[index+"<<(i*numBonds)<<"];\n";
+        compute<<"real4 pos"<<(i+1)<<" = centerPositions[group"<<(i+1)<<"];\n";
+    }
+    int index = 0;
+    for (map<string, vector<int> >::const_iterator iter = distances.begin(); iter != distances.end(); ++iter, ++index) {
+        const vector<int>& groups = iter->second;
+        string deltaName = atomNames[groups[0]]+atomNames[groups[1]];
+        if (computedDeltas.count(deltaName) == 0) {
+            compute<<"real4 delta"<<deltaName<<" = delta("<<posNames[groups[0]]<<", "<<posNames[groups[1]]<<");\n";
+            computedDeltas.insert(deltaName);
+        }
+        compute<<"real r_"<<deltaName<<" = sqrt(delta"<<deltaName<<".w);\n";
+        variables[iter->first] = "r_"+deltaName;
+        forceExpressions["real dEdDistance"+cu.intToString(index)+" = "] = energyExpression.differentiate(iter->first).optimize();
+    }
+    index = 0;
+    for (map<string, vector<int> >::const_iterator iter = angles.begin(); iter != angles.end(); ++iter, ++index) {
+        const vector<int>& groups = iter->second;
+        string deltaName1 = atomNames[groups[1]]+atomNames[groups[0]];
+        string deltaName2 = atomNames[groups[1]]+atomNames[groups[2]];
+        string angleName = "angle_"+atomNames[groups[0]]+atomNames[groups[1]]+atomNames[groups[2]];
+        if (computedDeltas.count(deltaName1) == 0) {
+            compute<<"real4 delta"<<deltaName1<<" = delta("<<posNames[groups[1]]<<", "<<posNames[groups[0]]<<");\n";
+            computedDeltas.insert(deltaName1);
+        }
+        if (computedDeltas.count(deltaName2) == 0) {
+            compute<<"real4 delta"<<deltaName2<<" = delta("<<posNames[groups[1]]<<", "<<posNames[groups[2]]<<");\n";
+            computedDeltas.insert(deltaName2);
+        }
+        compute<<"real "<<angleName<<" = computeAngle(delta"<<deltaName1<<", delta"<<deltaName2<<");\n";
+        variables[iter->first] = angleName;
+        forceExpressions["real dEdAngle"+cu.intToString(index)+" = "] = energyExpression.differentiate(iter->first).optimize();
+    }
+    index = 0;
+    for (map<string, vector<int> >::const_iterator iter = dihedrals.begin(); iter != dihedrals.end(); ++iter, ++index) {
+        const vector<int>& groups = iter->second;
+        string deltaName1 = atomNames[groups[0]]+atomNames[groups[1]];
+        string deltaName2 = atomNames[groups[2]]+atomNames[groups[1]];
+        string deltaName3 = atomNames[groups[2]]+atomNames[groups[3]];
+        string crossName1 = "cross_"+deltaName1+"_"+deltaName2;
+        string crossName2 = "cross_"+deltaName2+"_"+deltaName3;
+        string dihedralName = "dihedral_"+atomNames[groups[0]]+atomNames[groups[1]]+atomNames[groups[2]]+atomNames[groups[3]];
+        if (computedDeltas.count(deltaName1) == 0) {
+            compute<<"real4 delta"<<deltaName1<<" = delta("<<posNames[groups[0]]<<", "<<posNames[groups[1]]<<");\n";
+            computedDeltas.insert(deltaName1);
+        }
+        if (computedDeltas.count(deltaName2) == 0) {
+            compute<<"real4 delta"<<deltaName2<<" = delta("<<posNames[groups[2]]<<", "<<posNames[groups[1]]<<");\n";
+            computedDeltas.insert(deltaName2);
+        }
+        if (computedDeltas.count(deltaName3) == 0) {
+            compute<<"real4 delta"<<deltaName3<<" = delta("<<posNames[groups[2]]<<", "<<posNames[groups[3]]<<");\n";
+            computedDeltas.insert(deltaName3);
+        }
+        compute<<"real4 "<<crossName1<<" = computeCross(delta"<<deltaName1<<", delta"<<deltaName2<<");\n";
+        compute<<"real4 "<<crossName2<<" = computeCross(delta"<<deltaName2<<", delta"<<deltaName3<<");\n";
+        compute<<"real "<<dihedralName<<" = computeAngle("<<crossName1<<", "<<crossName2<<");\n";
+        compute<<dihedralName<<" *= (delta"<<deltaName1<<".x*"<<crossName2<<".x + delta"<<deltaName1<<".y*"<<crossName2<<".y + delta"<<deltaName1<<".z*"<<crossName2<<".z < 0 ? -1 : 1);\n";
+        variables[iter->first] = dihedralName;
+        forceExpressions["real dEdDihedral"+cu.intToString(index)+" = "] = energyExpression.differentiate(iter->first).optimize();
+    }
+
+    // Now evaluate the expressions.
+
+    for (int i = 0; i < (int) params->getBuffers().size(); i++) {
+        CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
+        extraArgs<<", const "<<buffer.getType()<<"* __restrict__ globalParams"<<i;
+        compute<<buffer.getType()<<" bondParams"<<(i+1)<<" = globalParams"<<i<<"[index];\n";
+        groupForcesArgs.push_back(&buffer.getMemory());
+    }
+    forceExpressions["energy += "] = energyExpression;
+    compute << cu.getExpressionUtilities().createExpressions(forceExpressions, variables, functionList, functionDefinitions, "temp");
+
+    // Finally, apply forces to groups.
+
+    vector<string> forceNames;
+    for (int i = 0; i < groupsPerBond; i++) {
+        string istr = cu.intToString(i+1);
+        string forceName = "force"+istr;
+        forceNames.push_back(forceName);
+        compute<<"real3 "<<forceName<<" = make_real3(0);\n";
+        compute<<"{\n";
+        Lepton::ParsedExpression forceExpressionX = energyExpression.differentiate("x"+istr).optimize();
+        Lepton::ParsedExpression forceExpressionY = energyExpression.differentiate("y"+istr).optimize();
+        Lepton::ParsedExpression forceExpressionZ = energyExpression.differentiate("z"+istr).optimize();
+        map<string, Lepton::ParsedExpression> expressions;
+        if (!isZeroExpression(forceExpressionX))
+            expressions[forceName+".x -= "] = forceExpressionX;
+        if (!isZeroExpression(forceExpressionY))
+            expressions[forceName+".y -= "] = forceExpressionY;
+        if (!isZeroExpression(forceExpressionZ))
+            expressions[forceName+".z -= "] = forceExpressionZ;
+        if (expressions.size() > 0)
+            compute<<cu.getExpressionUtilities().createExpressions(expressions, variables, functionList, functionDefinitions, "coordtemp");
+        compute<<"}\n";
+    }
+    index = 0;
+    for (map<string, vector<int> >::const_iterator iter = distances.begin(); iter != distances.end(); ++iter, ++index) {
+        const vector<int>& groups = iter->second;
+        string deltaName = atomNames[groups[0]]+atomNames[groups[1]];
+        string value = "(dEdDistance"+cu.intToString(index)+"/r_"+deltaName+")*trim(delta"+deltaName+")";
+        compute<<forceNames[groups[0]]<<" += "<<"-"<<value<<";\n";
+        compute<<forceNames[groups[1]]<<" += "<<value<<";\n";
+    }
+    index = 0;
+    for (map<string, vector<int> >::const_iterator iter = angles.begin(); iter != angles.end(); ++iter, ++index) {
+        const vector<int>& groups = iter->second;
+        string deltaName1 = atomNames[groups[1]]+atomNames[groups[0]];
+        string deltaName2 = atomNames[groups[1]]+atomNames[groups[2]];
+        compute<<"{\n";
+        compute<<"real3 crossProd = cross(delta"<<deltaName2<<", delta"<<deltaName1<<");\n";
+        compute<<"real lengthCross = max(SQRT(dot(crossProd, crossProd)), 1e-6f);\n";
+        compute<<"real3 deltaCross0 = -cross(trim(delta"<<deltaName1<<"), crossProd)*dEdAngle"<<cu.intToString(index)<<"/(delta"<<deltaName1<<".w*lengthCross);\n";
+        compute<<"real3 deltaCross2 = cross(trim(delta"<<deltaName2<<"), crossProd)*dEdAngle"<<cu.intToString(index)<<"/(delta"<<deltaName2<<".w*lengthCross);\n";
+        compute<<"real3 deltaCross1 = -(deltaCross0+deltaCross2);\n";
+        compute<<forceNames[groups[0]]<<" += deltaCross0;\n";
+        compute<<forceNames[groups[1]]<<" += deltaCross1;\n";
+        compute<<forceNames[groups[2]]<<" += deltaCross2;\n";
+        compute<<"}\n";
+    }
+    index = 0;
+    for (map<string, vector<int> >::const_iterator iter = dihedrals.begin(); iter != dihedrals.end(); ++iter, ++index) {
+        const vector<int>& groups = iter->second;
+        string deltaName1 = atomNames[groups[0]]+atomNames[groups[1]];
+        string deltaName2 = atomNames[groups[2]]+atomNames[groups[1]];
+        string deltaName3 = atomNames[groups[2]]+atomNames[groups[3]];
+        string crossName1 = "cross_"+deltaName1+"_"+deltaName2;
+        string crossName2 = "cross_"+deltaName2+"_"+deltaName3;
+        compute<<"{\n";
+        compute<<"real r = sqrt(delta"<<deltaName2<<".w);\n";
+        compute<<"real4 ff;\n";
+        compute<<"ff.x = (-dEdDihedral"<<cu.intToString(index)<<"*r)/"<<crossName1<<".w;\n";
+        compute<<"ff.y = (delta"<<deltaName1<<".x*delta"<<deltaName2<<".x + delta"<<deltaName1<<".y*delta"<<deltaName2<<".y + delta"<<deltaName1<<".z*delta"<<deltaName2<<".z)/delta"<<deltaName2<<".w;\n";
+        compute<<"ff.z = (delta"<<deltaName3<<".x*delta"<<deltaName2<<".x + delta"<<deltaName3<<".y*delta"<<deltaName2<<".y + delta"<<deltaName3<<".z*delta"<<deltaName2<<".z)/delta"<<deltaName2<<".w;\n";
+        compute<<"ff.w = (dEdDihedral"<<cu.intToString(index)<<"*r)/"<<crossName2<<".w;\n";
+        compute<<"real3 internalF0 = ff.x*trim("<<crossName1<<");\n";
+        compute<<"real3 internalF3 = ff.w*trim("<<crossName2<<");\n";
+        compute<<"real3 s = ff.y*internalF0 - ff.z*internalF3;\n";
+        compute<<forceNames[groups[0]]<<" += internalF0;\n";
+        compute<<forceNames[groups[1]]<<" += s-internalF0;\n";
+        compute<<forceNames[groups[2]]<<" += -s-internalF3;\n";
+        compute<<forceNames[groups[3]]<<" += internalF3;\n";
+        compute<<"}\n";
+    }
+    
+    // Save the forces to global memory.
+    
+    for (int i = 0; i < groupsPerBond; i++) {
+        compute<<"atomicAdd(&groupForce[group"<<(i+1)<<"], static_cast<unsigned long long>((long long) (force"<<(i+1)<<".x*0x100000000)));\n";
+        compute<<"atomicAdd(&groupForce[group"<<(i+1)<<"+NUM_GROUPS], static_cast<unsigned long long>((long long) (force"<<(i+1)<<".y*0x100000000)));\n";
+        compute<<"atomicAdd(&groupForce[group"<<(i+1)<<"+NUM_GROUPS*2], static_cast<unsigned long long>((long long) (force"<<(i+1)<<".z*0x100000000)));\n";
+        compute<<"__threadfence_block();\n";
+    }
+    map<string, string> replacements;
+    replacements["M_PI"] = cu.doubleToString(M_PI);
+    replacements["NUM_GROUPS"] = cu.intToString(numGroups);
+    replacements["NUM_BONDS"] = cu.intToString(numBonds);
+    replacements["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
+    replacements["EXTRA_ARGS"] = extraArgs.str();
+    replacements["COMPUTE_FORCE"] = compute.str();
+    CUmodule module = cu.createModule(CudaKernelSources::vectorOps+cu.replaceStrings(CudaKernelSources::customCentroidBond, replacements));
+    computeCentersKernel = cu.getKernel(module, "computeGroupCenters");
+    groupForcesKernel = cu.getKernel(module, "computeGroupForces");
+    applyForcesKernel = cu.getKernel(module, "applyForcesToAtoms");
+}
+
+double CudaCalcCustomCentroidBondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    if (numBonds == 0)
+        return 0.0;
+    if (globals != NULL) {
+        bool changed = false;
+        for (int i = 0; i < (int) globalParamNames.size(); i++) {
+            float value = (float) context.getParameter(globalParamNames[i]);
+            if (value != globalParamValues[i])
+                changed = true;
+            globalParamValues[i] = value;
+        }
+        if (changed)
+            globals->upload(globalParamValues);
+    }
+    void* computeCentersArgs[] = {&cu.getPosq().getDevicePointer(), &groupParticles->getDevicePointer(), &groupWeights->getDevicePointer(),
+            &groupOffsets->getDevicePointer(), &centerPositions->getDevicePointer()};
+    cu.executeKernel(computeCentersKernel, computeCentersArgs, CudaContext::TileSize*numGroups);
+    groupForcesArgs[1] = &cu.getEnergyBuffer().getDevicePointer();
+    cu.executeKernel(groupForcesKernel, &groupForcesArgs[0], numBonds);
+    void* applyForcesArgs[] = {&groupParticles->getDevicePointer(), &groupWeights->getDevicePointer(), &groupOffsets->getDevicePointer(),
+            &groupForces->getDevicePointer(), &cu.getForce().getDevicePointer()};
+    cu.executeKernel(applyForcesKernel, applyForcesArgs, CudaContext::TileSize*numGroups);
+    return 0.0;
+}
+
+void CudaCalcCustomCentroidBondForceKernel::copyParametersToContext(ContextImpl& context, const CustomCentroidBondForce& force) {
+    cu.setAsCurrent();
+    if (numBonds != force.getNumBonds())
+        throw OpenMMException("updateParametersInContext: The number of bonds has changed");
+    if (numBonds == 0)
+        return;
+    
+    // Record the per-bond parameters.
+    
+    vector<vector<float> > paramVector(numBonds);
+    vector<int> particles;
+    vector<double> parameters;
+    for (int i = 0; i < numBonds; i++) {
+        force.getBondParameters(i, particles, parameters);
+        paramVector[i].resize(parameters.size());
+        for (int j = 0; j < (int) parameters.size(); j++)
+            paramVector[i][j] = (float) parameters[j];
+    }
+    params->setParameterValues(paramVector);
+    
+    // Mark that the current reordering may be invalid.
+    
+    cu.invalidateMolecules();
+}
+
 class CudaCustomCompoundBondForceInfo : public CudaForceInfo {
 public:
    CudaCustomCompoundBondForceInfo(const CustomCompoundBondForce& force) : force(force) {
@@ -4304,7 +4749,6 @@ void CudaCalcCustomCompoundBondForceKernel::initialize(const System& system, con
    map<string, Lepton::CustomFunction*> functions;
    vector<pair<string, string> > functionDefinitions;
    vector<const TabulatedFunction*> functionList;
-    stringstream tableArgs;
    for (int i = 0; i < force.getNumTabulatedFunctions(); i++) {
        functionList.push_back(&force.getTabulatedFunction(i));
        string name = force.getTabulatedFunctionName(i);
@@ -4507,7 +4951,7 @@ void CudaCalcCustomCompoundBondForceKernel::initialize(const System& system, con
    cu.getBondedUtilities().addInteraction(atoms, compute.str(), force.getForceGroup());
    map<string, string> replacements;
    replacements["M_PI"] = cu.doubleToString(M_PI);
-    cu.getBondedUtilities().addPrefixCode(cu.replaceStrings(CudaKernelSources::customCompoundBond, replacements));;
+    cu.getBondedUtilities().addPrefixCode(cu.replaceStrings(CudaKernelSources::customCompoundBond, replacements));
 }

 double CudaCalcCustomCompoundBondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {

--- a/platforms/cuda/src/CudaNonbondedUtilities.cpp
+++ b/platforms/cuda/src/CudaNonbondedUtilities.cpp
@@ -375,32 +375,38 @@ void CudaNonbondedUtilities::prepareInteractions(int forceGroups) {

    if (lastCutoff != kernels.cutoffDistance)
        forceRebuildNeighborList = true;
-    context.executeKernel(kernels.findBlockBoundsKernel, &findBlockBoundsArgs[0], context.getNumAtoms());
-    blockSorter->sort(*sortedBlocks);
-    context.executeKernel(kernels.sortBoxDataKernel, &sortBoxDataArgs[0], context.getNumAtoms());
-    context.executeKernel(kernels.findInteractingBlocksKernel, &findInteractingBlocksArgs[0], context.getNumAtoms(), 256);
-    forceRebuildNeighborList = false;
+    bool rebuild = false;
+    do {
+        context.executeKernel(kernels.findBlockBoundsKernel, &findBlockBoundsArgs[0], context.getNumAtoms());
+        blockSorter->sort(*sortedBlocks);
+        context.executeKernel(kernels.sortBoxDataKernel, &sortBoxDataArgs[0], context.getNumAtoms());
+        context.executeKernel(kernels.findInteractingBlocksKernel, &findInteractingBlocksArgs[0], context.getNumAtoms(), 256);
+        forceRebuildNeighborList = false;
+        if (context.getComputeForceCount() == 1)
+            rebuild = updateNeighborListSize(); // This is the first time step, so check whether our initial guess was large enough.
+    } while(rebuild);
    lastCutoff = kernels.cutoffDistance;
 }

-void CudaNonbondedUtilities::computeInteractions(int forceGroups) {
+void CudaNonbondedUtilities::computeInteractions(int forceGroups, bool includeForces, bool includeEnergy) {
    if ((forceGroups&groupFlags) == 0)
        return;
    KernelSet& kernels = groupKernels[forceGroups];
    if (kernels.hasForces) {
-        context.executeKernel(kernels.forceKernel, &forceArgs[0], numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
-        if (context.getComputeForceCount() == 1)
-            updateNeighborListSize(); // This is the first time step, so check whether our initial guess was large enough.
+        CUfunction& kernel = (includeForces ? (includeEnergy ? kernels.forceEnergyKernel : kernels.forceKernel) : kernels.energyKernel);
+        if (kernel == NULL)
+            kernel = createInteractionKernel(kernels.source, parameters, arguments, true, true, forceGroups, includeForces, includeEnergy);
+        context.executeKernel(kernel, &forceArgs[0], numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
    }
 }

-void CudaNonbondedUtilities::updateNeighborListSize() {
+bool CudaNonbondedUtilities::updateNeighborListSize() {
    if (!useCutoff)
-        return;
+        return false;
    unsigned int* pinnedInteractionCount = (unsigned int*) context.getPinnedBuffer();
    interactionCount->download(pinnedInteractionCount);
    if (pinnedInteractionCount[0] <= (unsigned int) maxTiles)
-        return;
+        return false;

    // The most recent timestep had too many interactions to fit in the arrays.  Make the arrays bigger to prevent
    // this from happening in the future.
@@ -422,6 +428,7 @@ void CudaNonbondedUtilities::updateNeighborListSize() {
        forceArgs[17] = &interactingAtoms->getDevicePointer();
    findInteractingBlocksArgs[7] = &interactingAtoms->getDevicePointer();
    forceRebuildNeighborList = true;
+    return true;
 }

 void CudaNonbondedUtilities::setUsePadding(bool padding) {
@@ -450,8 +457,8 @@ void CudaNonbondedUtilities::createKernelsForGroups(int groups) {
    }
    kernels.hasForces = (source.size() > 0);
    kernels.cutoffDistance = cutoff;
-    if (kernels.hasForces)
-        kernels.forceKernel = createInteractionKernel(source, parameters, arguments, true, true, groups);
+    kernels.source = source;
+    kernels.forceKernel = kernels.energyKernel = kernels.forceEnergyKernel = NULL;
    if (useCutoff) {
        double padding = (usePadding ? 0.1*cutoff : 0.0);
        double paddedCutoff = cutoff+padding;
@@ -474,7 +481,7 @@ void CudaNonbondedUtilities::createKernelsForGroups(int groups) {
    groupKernels[groups] = kernels;
 }

-CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, vector<ParameterInfo>& params, vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric, int groups) {
+CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, vector<ParameterInfo>& params, vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric, int groups, bool includeForces, bool includeEnergy) {
    map<string, string> replacements;
    replacements["COMPUTE_INTERACTION"] = source;
    const string suffixes[] = {"x", "y", "z", "w"};
@@ -650,6 +657,10 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
        defines["USE_SYMMETRIC"] = "1";
    if (useShuffle)
        defines["ENABLE_SHUFFLE"] = "1";
+    if (includeForces)
+        defines["INCLUDE_FORCES"] = "1";
+    if (includeEnergy)
+        defines["INCLUDE_ENERGY"] = "1";
    defines["THREAD_BLOCK_SIZE"] = context.intToString(forceThreadBlockSize);
    double maxCutoff = 0.0;
    for (int i = 0; i < 32; i++) {

--- a/platforms/cuda/src/CudaParallelKernels.cpp
+++ b/platforms/cuda/src/CudaParallelKernels.cpp
@@ -623,6 +623,10 @@ void CudaParallelCalcNonbondedForceKernel::copyParametersToContext(ContextImpl&
        getKernel(i).copyParametersToContext(context, force);
 }

+void CudaParallelCalcNonbondedForceKernel::getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
+    dynamic_cast<const CudaCalcNonbondedForceKernel&>(kernels[0].getImpl()).getPMEParameters(alpha, nx, ny, nz);
+}
+
 class CudaParallelCalcCustomNonbondedForceKernel::Task : public CudaContext::WorkTask {
 public:
    Task(ContextImpl& context, CudaCalcCustomNonbondedForceKernel& kernel, bool includeForce,

--- a/platforms/cuda/src/CudaPlatform.cpp
+++ b/platforms/cuda/src/CudaPlatform.cpp
@@ -80,6 +80,7 @@ CudaPlatform::CudaPlatform() {
    registerKernelFactory(CalcCustomGBForceKernel::Name(), factory);
    registerKernelFactory(CalcCustomExternalForceKernel::Name(), factory);
    registerKernelFactory(CalcCustomHbondForceKernel::Name(), factory);
+    registerKernelFactory(CalcCustomCentroidBondForceKernel::Name(), factory);
    registerKernelFactory(CalcCustomCompoundBondForceKernel::Name(), factory);
    registerKernelFactory(CalcCustomManyParticleForceKernel::Name(), factory);
    registerKernelFactory(IntegrateVerletStepKernel::Name(), factory);

--- a/platforms/cuda/src/kernels/customCentroidBond.cu
+++ b/platforms/cuda/src/kernels/customCentroidBond.cu
+/**
+ * Compute the center of each group.
+ */
+extern "C" __global__ void computeGroupCenters(const real4* __restrict__ posq, const int* __restrict__ groupParticles,
+        const real* __restrict__ groupWeights, const int* __restrict__ groupOffsets, real4* __restrict__ centerPositions) {
+    __shared__ volatile real3 temp[64];
+    for (int group = blockIdx.x; group < NUM_GROUPS; group += gridDim.x) {
+        // The threads in this block work together to compute the center one group.
+        
+        int firstIndex = groupOffsets[group];
+        int lastIndex = groupOffsets[group+1];
+        real3 center = make_real3(0, 0, 0);
+        for (int index = threadIdx.x; index < lastIndex-firstIndex; index += blockDim.x) {
+            int atom = groupParticles[firstIndex+index];
+            real weight = groupWeights[firstIndex+index];
+            real4 pos = posq[atom];
+            center.x += weight*pos.x;
+            center.y += weight*pos.y;
+            center.z += weight*pos.z;
+        }
+        
+        // Sum the values.
+        
+        int thread = threadIdx.x;
+        temp[thread].x = center.x;
+        temp[thread].y = center.y;
+        temp[thread].z = center.z;
+        __syncthreads();
+        if (thread < 32) {
+            temp[thread].x += temp[thread+32].x;
+            temp[thread].y += temp[thread+32].y;
+            temp[thread].z += temp[thread+32].z;
+            if (thread < 16) {
+                temp[thread].x += temp[thread+16].x;
+                temp[thread].y += temp[thread+16].y;
+                temp[thread].z += temp[thread+16].z;
+            }
+            if (thread < 8) {
+                temp[thread].x += temp[thread+8].x;
+                temp[thread].y += temp[thread+8].y;
+                temp[thread].z += temp[thread+8].z;
+            }
+            if (thread < 4) {
+                temp[thread].x += temp[thread+4].x;
+                temp[thread].y += temp[thread+4].y;
+                temp[thread].z += temp[thread+4].z;
+            }
+            if (thread < 2) {
+                temp[thread].x += temp[thread+2].x;
+                temp[thread].y += temp[thread+2].y;
+                temp[thread].z += temp[thread+2].z;
+            }
+        }
+        if (thread == 0)
+            centerPositions[group] = make_real4(temp[0].x+temp[1].x, temp[0].y+temp[1].y, temp[0].z+temp[1].z, 0);
+    }
+}
+
+/**
+ * Convert a real4 to a real3 by removing its last element.
+ */
+inline __device__ real3 trim(real4 v) {
+    return make_real3(v.x, v.y, v.z);
+}
+
+/**
+ * Compute the difference between two vectors, setting the fourth component to the squared magnitude.
+ */
+inline __device__ real4 delta(real4 vec1, real4 vec2) {
+    real4 result = make_real4(vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0);
+    result.w = result.x*result.x + result.y*result.y + result.z*result.z;
+    return result;
+}
+
+/**
+ * Compute the angle between two vectors.  The w component of each vector should contain the squared magnitude.
+ */
+__device__ real computeAngle(real4 vec1, real4 vec2) {
+    real dotProduct = vec1.x*vec2.x + vec1.y*vec2.y + vec1.z*vec2.z;
+    real cosine = dotProduct*RSQRT(vec1.w*vec2.w);
+    real angle;
+    if (cosine > 0.99f || cosine < -0.99f) {
+        // We're close to the singularity in acos(), so take the cross product and use asin() instead.
+
+        real3 crossProduct = cross(vec1, vec2);
+        real scale = vec1.w*vec2.w;
+        angle = ASIN(SQRT(dot(crossProduct, crossProduct)/scale));
+        if (cosine < 0.0f)
+            angle = M_PI-angle;
+    }
+    else
+       angle = ACOS(cosine);
+    return angle;
+}
+
+/**
+ * Compute the cross product of two vectors, setting the fourth component to the squared magnitude.
+ */
+inline __device__ real4 computeCross(real4 vec1, real4 vec2) {
+    real3 cp = cross(vec1, vec2);
+    return make_real4(cp.x, cp.y, cp.z, cp.x*cp.x+cp.y*cp.y+cp.z*cp.z);
+}
+
+/**
+ * Compute the forces on groups based on the bonds.
+ */
+extern "C" __global__ void computeGroupForces(unsigned long long* __restrict__ groupForce, mixed* __restrict__ energyBuffer, const real4* __restrict__ centerPositions,
+        const int* __restrict__ bondGroups
+        EXTRA_ARGS) {
+    mixed energy = 0;
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_BONDS; index += blockDim.x*gridDim.x) {
+        COMPUTE_FORCE
+    }
+    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
+}
+
+/**
+ * Apply the forces from the group centers to the individual atoms.
+ */
+extern "C" __global__ void applyForcesToAtoms(const int* __restrict__ groupParticles, const real* __restrict__ groupWeights, const int* __restrict__ groupOffsets,
+        const long long* __restrict__ groupForce, unsigned long long* __restrict__ atomForce) {
+    for (int group = blockIdx.x; group < NUM_GROUPS; group += gridDim.x) {
+        long long fx = groupForce[group];
+        long long fy = groupForce[group+NUM_GROUPS];
+        long long fz = groupForce[group+NUM_GROUPS*2];
+        int firstIndex = groupOffsets[group];
+        int lastIndex = groupOffsets[group+1];
+        for (int index = threadIdx.x; index < lastIndex-firstIndex; index += blockDim.x) {
+            int atom = groupParticles[firstIndex+index];
+            real weight = groupWeights[firstIndex+index];
+            atomicAdd(&atomForce[atom], static_cast<unsigned long long>((long long) (fx*weight)));
+            atomicAdd(&atomForce[atom+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (fy*weight)));
+            atomicAdd(&atomForce[atom+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (fz*weight)));
+        }
+    }
+}
--- a/platforms/cuda/src/kernels/customGBEnergyN2.cu
+++ b/platforms/cuda/src/kernels/customGBEnergyN2.cu
@@ -13,7 +13,7 @@ typedef struct {
 /**
 * Compute a force based on pair interactions.
 */
-extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer,
+extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer,
        const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions, const ushort2* __restrict__ exclusionTiles,
 #ifdef USE_CUTOFF
        const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
@@ -27,7 +27,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
    const unsigned int tbx = threadIdx.x - tgx;
-    real energy = 0;
+    mixed energy = 0;
    __shared__ AtomData localData[THREAD_BLOCK_SIZE];

    // First loop: process tiles that contain exclusions.

--- a/platforms/cuda/src/kernels/customGBEnergyPerParticle.cu
+++ b/platforms/cuda/src/kernels/customGBEnergyPerParticle.cu
@@ -2,9 +2,9 @@
 * Reduce the derivatives computed in the N^2 energy kernel, and compute all per-particle energy terms.
 */

-extern "C" __global__ void computePerParticleEnergy(long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, const real4* __restrict__ posq
+extern "C" __global__ void computePerParticleEnergy(long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq
        PARAMETER_ARGUMENTS) {
-    real energy = 0;
+    mixed energy = 0;
    for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
        // Load the derivatives


--- a/platforms/cuda/src/kernels/customHbondForce.cu
+++ b/platforms/cuda/src/kernels/customHbondForce.cu
@@ -66,12 +66,12 @@ inline __device__ real4 computeCross(real4 vec1, real4 vec2) {
 /**
 * Compute forces on donors.
 */
-extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ force, real* __restrict__ energyBuffer, const real4* __restrict__ posq,
+extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ force, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq,
        const int4* __restrict__ exclusions, const int4* __restrict__ donorAtoms, const int4* __restrict__ acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
        PARAMETER_ARGUMENTS) {
    extern __shared__ real4 posBuffer[];
-    real energy = 0;
+    mixed energy = 0;
    real3 f1 = make_real3(0);
    real3 f2 = make_real3(0);
    real3 f3 = make_real3(0);
@@ -155,7 +155,7 @@ extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ f
 /**
 * Compute forces on acceptors.
 */
-extern "C" __global__ void computeAcceptorForces(unsigned long long* __restrict__ force, real* __restrict__ energyBuffer, const real4* __restrict__ posq,
+extern "C" __global__ void computeAcceptorForces(unsigned long long* __restrict__ force, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq,
        const int4* __restrict__ exclusions, const int4* __restrict__ donorAtoms, const int4* __restrict__ acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
        PARAMETER_ARGUMENTS) {

--- a/platforms/cuda/src/kernels/customManyParticle.cu
+++ b/platforms/cuda/src/kernels/customManyParticle.cu
@@ -78,7 +78,7 @@ __constant__ float globals[NUM_GLOBALS];
 * Compute the interaction.
 */
 extern "C" __global__ void computeInteraction(
-        unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, const real4* __restrict__ posq,
+        unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq,
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
 #ifdef USE_CUTOFF
        , const int* __restrict__ neighbors, const int* __restrict__ neighborStartIndex
@@ -90,7 +90,7 @@ extern "C" __global__ void computeInteraction(
        , int* __restrict__ exclusions, int* __restrict__ exclusionStartIndex
 #endif
        PARAMETER_ARGUMENTS) {
-    real energy = 0.0f;
+    mixed energy = 0;
    
    // Loop over particles to be the first one in the set.
    

--- a/platforms/cuda/src/kernels/customNonbondedGroups.cu
+++ b/platforms/cuda/src/kernels/customNonbondedGroups.cu
@@ -9,14 +9,14 @@ typedef struct {
 } AtomData;

 extern "C" __global__ void computeInteractionGroups(
-        unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, const real4* __restrict__ posq, const int4* __restrict__ groupData,
+        unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const int4* __restrict__ groupData,
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
        PARAMETER_ARGUMENTS) {
    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; // global warpIndex
    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // index within the warp
    const unsigned int tbx = threadIdx.x - tgx;           // block warpIndex
-    real energy = 0.0f;
+    mixed energy = 0;
    __shared__ AtomData localData[LOCAL_MEMORY_SIZE];

    const unsigned int startTile = FIRST_TILE+warp*(LAST_TILE-FIRST_TILE)/totalWarps;

--- a/platforms/cuda/src/kernels/ewald.cu
+++ b/platforms/cuda/src/kernels/ewald.cu
@@ -6,7 +6,7 @@ __device__ real2 multofReal2(real2 a, real2 b) {
 * Precompute the cosine and sine sums which appear in each force term.
 */

-extern "C" __global__ void calculateEwaldCosSinSums(real* __restrict__ energyBuffer, const real4* __restrict__ posq, real2* __restrict__ cosSinSum, real4 periodicBoxSize) {
+extern "C" __global__ void calculateEwaldCosSinSums(mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, real2* __restrict__ cosSinSum, real4 periodicBoxSize) {
    const unsigned int ksizex = 2*KMAX_X-1;
    const unsigned int ksizey = 2*KMAX_Y-1;
    const unsigned int ksizez = 2*KMAX_Z-1;
@@ -14,7 +14,7 @@ extern "C" __global__ void calculateEwaldCosSinSums(real* __restrict__ energyBuf
    real3 reciprocalBoxSize = make_real3(2*M_PI/periodicBoxSize.x, 2*M_PI/periodicBoxSize.y, 2*M_PI/periodicBoxSize.z);
    real reciprocalCoefficient = ONE_4PI_EPS0*4*M_PI/(periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
    unsigned int index = blockIdx.x*blockDim.x+threadIdx.x;
-    real energy = 0;
+    mixed energy = 0;
    while (index < (KMAX_Y-1)*ksizez+KMAX_Z)
        index += blockDim.x*gridDim.x;
    while (index < totalK) {

--- a/platforms/cuda/src/kernels/gbsaObc1.cu
+++ b/platforms/cuda/src/kernels/gbsaObc1.cu
@@ -33,9 +33,9 @@ extern "C" __global__ void reduceBornSum(float alpha, float beta, float gamma, c
 * Reduce the Born force.
 */

-extern "C" __global__ void reduceBornForce(long long* __restrict__ bornForce, real* __restrict__ energyBuffer,
+extern "C" __global__ void reduceBornForce(long long* __restrict__ bornForce, mixed* __restrict__ energyBuffer,
        const float2* __restrict__ params, const real* __restrict__ bornRadii, const real* __restrict__ obcChain) {
-    real energy = 0;
+    mixed energy = 0;
    for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
        // Get summed Born force

@@ -402,7 +402,7 @@ typedef struct {
 */

 extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ forceBuffers, unsigned long long* __restrict__ global_bornForce,
-        real* __restrict__ energyBuffer, const real4* __restrict__ posq, const real* __restrict__ global_bornRadii,
+        mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const real* __restrict__ global_bornRadii,
 #ifdef USE_CUTOFF
        const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
@@ -415,7 +415,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
    const unsigned int tbx = threadIdx.x - tgx;
-    real energy = 0;
+    mixed energy = 0;
    __shared__ AtomData2 localData[FORCE_WORK_GROUP_SIZE];

    // First loop: process tiles that contain exclusions.

--- a/platforms/cuda/src/kernels/nonbonded.cu
+++ b/platforms/cuda/src/kernels/nonbonded.cu
@@ -100,7 +100,7 @@ static __inline__ __device__ long long real_shfl(long long var, int srcLane) {
 *
 */
 extern "C" __global__ void computeNonbonded(
-        unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, const real4* __restrict__ posq, const tileflags* __restrict__ exclusions,
+        unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const tileflags* __restrict__ exclusions,
        const ushort2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices
 #ifdef USE_CUTOFF
        , const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount,real4 periodicBoxSize, real4 invPeriodicBoxSize, 
@@ -112,7 +112,7 @@ extern "C" __global__ void computeNonbonded(
    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; // global warpIndex
    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // index within the warp
    const unsigned int tbx = threadIdx.x - tgx;           // block warpIndex
-    real energy = 0.0f;
+    mixed energy = 0;
    // used shared memory if the device cannot shuffle
 #ifndef ENABLE_SHUFFLE
    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
@@ -175,6 +175,7 @@ extern "C" __global__ void computeNonbonded(
                real tempEnergy = 0.0f;
                COMPUTE_INTERACTION
                energy += 0.5f*tempEnergy;
+#ifdef INCLUDE_FORCES
 #ifdef USE_SYMMETRIC
                force.x -= delta.x*dEdR;
                force.y -= delta.y*dEdR;
@@ -184,6 +185,7 @@ extern "C" __global__ void computeNonbonded(
                force.y -= dEdR1.y;
                force.z -= dEdR1.z;
 #endif
+#endif
 #ifdef USE_EXCLUSIONS
                excl >>= 1;
 #endif
@@ -241,6 +243,7 @@ extern "C" __global__ void computeNonbonded(
                real tempEnergy = 0.0f;
                COMPUTE_INTERACTION
                energy += tempEnergy;
+#ifdef INCLUDE_FORCES
 #ifdef USE_SYMMETRIC
                delta *= dEdR;
                force.x -= delta.x;
@@ -270,11 +273,12 @@ extern "C" __global__ void computeNonbonded(
                localData[tbx+tj].fz += dEdR2.z;
 #endif 
 #endif // end USE_SYMMETRIC
-#ifdef USE_EXCLUSIONS
-                excl >>= 1;
-#endif
 #ifdef ENABLE_SHUFFLE
                SHUFFLE_WARP_DATA
+#endif
+#endif
+#ifdef USE_EXCLUSIONS
+                excl >>= 1;
 #endif
                // cycles the indices
                // 0 1 2 3 4 5 6 7 -> 1 2 3 4 5 6 7 0
@@ -282,6 +286,7 @@ extern "C" __global__ void computeNonbonded(
            }
            const unsigned int offset = y*TILE_SIZE + tgx;
            // write results for off diagonal tiles
+#ifdef INCLUDE_FORCES
 #ifdef ENABLE_SHUFFLE
            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (shflForce.x*0x100000000)));
            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (shflForce.y*0x100000000)));
@@ -290,13 +295,16 @@ extern "C" __global__ void computeNonbonded(
            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
+#endif
 #endif
        }
        // Write results for on and off diagonal tiles
+#ifdef INCLUDE_FORCES
        const unsigned int offset = x*TILE_SIZE + tgx;
        atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
        atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
        atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
+#endif
    }

    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
@@ -441,6 +449,7 @@ extern "C" __global__ void computeNonbonded(
                    real tempEnergy = 0.0f;
                    COMPUTE_INTERACTION
                    energy += tempEnergy;
+#ifdef INCLUDE_FORCES
 #ifdef USE_SYMMETRIC
                    delta *= dEdR;
                    force.x -= delta.x;
@@ -472,6 +481,7 @@ extern "C" __global__ void computeNonbonded(
 #endif // end USE_SYMMETRIC
 #ifdef ENABLE_SHUFFLE
                    SHUFFLE_WARP_DATA
+#endif
 #endif
                    tj = (tj + 1) & (TILE_SIZE - 1);
                }
@@ -509,6 +519,7 @@ extern "C" __global__ void computeNonbonded(
                    real tempEnergy = 0.0f;
                    COMPUTE_INTERACTION
                    energy += tempEnergy;
+#ifdef INCLUDE_FORCES
 #ifdef USE_SYMMETRIC
                    delta *= dEdR;
                    force.x -= delta.x;
@@ -540,12 +551,14 @@ extern "C" __global__ void computeNonbonded(
 #endif // end USE_SYMMETRIC
 #ifdef ENABLE_SHUFFLE
                    SHUFFLE_WARP_DATA
+#endif
 #endif
                    tj = (tj + 1) & (TILE_SIZE - 1);
                }
            }

            // Write results.
+#ifdef INCLUDE_FORCES
            atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
            atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
            atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
@@ -565,8 +578,11 @@ extern "C" __global__ void computeNonbonded(
                atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
 #endif
            }
+#endif
        }
        pos++;
    }
+#ifdef INCLUDE_ENERGY
    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
+#endif
 }
\ No newline at end of file
--- a/platforms/cuda/src/kernels/pme.cu
+++ b/platforms/cuda/src/kernels/pme.cu
@@ -115,7 +115,7 @@ extern "C" __global__ void finishSpreadCharge(long long* __restrict__ originalPm

 // convolutes on the halfcomplex_pmeGrid, which is of size NX*NY*(NZ/2+1) as F(Q) is conjugate symmetric
 extern "C" __global__ void 
-reciprocalConvolution(real2* __restrict__ halfcomplex_pmeGrid, real* __restrict__ energyBuffer, 
+reciprocalConvolution(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__ energyBuffer, 
                      const real* __restrict__ pmeBsplineModuliX, const real* __restrict__ pmeBsplineModuliY, const real* __restrict__ pmeBsplineModuliZ, 
                      real4 periodicBoxSize, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
    // R2C stores into a half complex matrix where the last dimension is cut by half
@@ -150,14 +150,14 @@ reciprocalConvolution(real2* __restrict__ halfcomplex_pmeGrid, real* __restrict_


 extern "C" __global__ void
-gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, real* __restrict__ energyBuffer,
+gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__ energyBuffer,
                      const real* __restrict__ pmeBsplineModuliX, const real* __restrict__ pmeBsplineModuliY, const real* __restrict__ pmeBsplineModuliZ,
                      real4 periodicBoxSize, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
    // R2C stores into a half complex matrix where the last dimension is cut by half
    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
    const real recipScaleFactor = RECIP(M_PI*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
 
-    real energy = 0;
+    mixed energy = 0;
    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) {
        // real indices
        int kx = index/(GRID_SIZE_Y*(GRID_SIZE_Z));

--- a/platforms/cuda/staticTarget/CMakeLists.txt
+++ b/platforms/cuda/staticTarget/CMakeLists.txt
@@ -19,7 +19,7 @@ SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE
 IF (APPLE)
    SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS} -F/Library/Frameworks -framework CUDA")
 ELSE (APPLE)
-    SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}")
+    SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}")
 ENDIF (APPLE)

 INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${STATIC_TARGET})
--- a/platforms/cuda/tests/CMakeLists.txt
+++ b/platforms/cuda/tests/CMakeLists.txt
@@ -28,7 +28,7 @@ FOREACH(TEST_PROG ${TEST_PROGS})
    IF (APPLE)
        SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS} -F/Library/Frameworks -framework CUDA" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
    ELSE (APPLE)
-        SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
+        SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
    ENDIF (APPLE)

    ADD_TEST(${TEST_ROOT}Single ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT} single)

--- a/platforms/cuda/tests/CudaTests.h
+++ b/platforms/cuda/tests/CudaTests.h
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2015 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#ifdef WIN32
+  #define _USE_MATH_DEFINES // Needed to get M_PI
+#endif
+#include "CudaPlatform.h"
+#include <string>
+
+OpenMM::CudaPlatform platform;
+
+void initializeTests(int argc, char* argv[]) {
+    if (argc > 1)
+        platform.setPropertyDefaultValue("CudaPrecision", std::string(argv[1]));
+}
--- a/platforms/cuda/tests/TestCudaAndersenThermostat.cpp
+++ b/platforms/cuda/tests/TestCudaAndersenThermostat.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2015 Stanford University and the Authors.           *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -29,191 +29,8 @@
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */

-/**
- * This tests the CUDA implementation of AndersenThermostat.
- */
+#include "CudaTests.h"
+#include "TestAndersenThermostat.h"

-#include "openmm/internal/AssertionUtilities.h"
-#include "openmm/AndersenThermostat.h"
-#include "openmm/Context.h"
-#include "CudaPlatform.h"
-#include "openmm/NonbondedForce.h"
-#include "openmm/System.h"
-#include "openmm/VerletIntegrator.h"
-#include "SimTKOpenMMRealType.h"
-#include "sfmt/SFMT.h"
-#include <iostream>
-#include <vector>
-
-using namespace OpenMM;
-using namespace std;
-
-CudaPlatform platform;
-
-void testTemperature() {
-    const int numParticles = 8;
-    const double temp = 100.0;
-    const double collisionFreq = 10.0;
-    const int numSteps = 5000;
-    System system;
-    VerletIntegrator integrator(0.003);
-    NonbondedForce* forceField = new NonbondedForce();
-    for (int i = 0; i < numParticles; ++i) {
-        system.addParticle(2.0);
-        forceField->addParticle((i%2 == 0 ? 1.0 : -1.0), 1.0, 5.0);
-    }
-    system.addForce(forceField);
-    AndersenThermostat* thermstat = new AndersenThermostat(temp, collisionFreq);
-    system.addForce(thermstat);
-    Context context(system, integrator, platform);
-    vector<Vec3> positions(numParticles);
-    for (int i = 0; i < numParticles; ++i)
-        positions[i] = Vec3((i%2 == 0 ? 2 : -2), (i%4 < 2 ? 2 : -2), (i < 4 ? 2 : -2));
-    context.setPositions(positions);
-    context.setVelocitiesToTemperature(temp);
-
-    // Let it equilibrate.
-
-    integrator.step(10000);
-
-    // Now run it for a while and see if the temperature is correct.
-
-    double ke = 0.0;
-    for (int i = 0; i < numSteps; ++i) {
-        State state = context.getState(State::Energy);
-        ke += state.getKineticEnergy();
-        integrator.step(10);
-    }
-    ke /= numSteps;
-    double expected = 0.5*numParticles*3*BOLTZ*temp;
-    ASSERT_USUALLY_EQUAL_TOL(expected, ke, 0.1);
-}
-
-void testConstraints() {
-    const int numParticles = 8;
-    const double temp = 100.0;
-    const double collisionFreq = 10.0;
-    const int numSteps = 15000;
-    System system;
-    VerletIntegrator integrator(0.004);
-    NonbondedForce* forceField = new NonbondedForce();
-    for (int i = 0; i < numParticles; ++i) {
-        system.addParticle(2.0);
-        forceField->addParticle((i%2 == 0 ? 1.0 : -1.0), 1.0, 5.0);
-    }
-    system.addForce(forceField);
-    system.addConstraint(0, 1, 1);
-    system.addConstraint(1, 2, 1);
-    system.addConstraint(2, 3, 1);
-    system.addConstraint(3, 0, 1);
-    system.addConstraint(4, 5, 1);
-    system.addConstraint(5, 6, 1);
-    system.addConstraint(6, 7, 1);
-    system.addConstraint(7, 4, 1);
-    AndersenThermostat* thermstat = new AndersenThermostat(temp, collisionFreq);
-    system.addForce(thermstat);
-    Context context(system, integrator, platform);
-    vector<Vec3> positions(numParticles);
-    positions[0] = Vec3(0, 0, 0);
-    positions[1] = Vec3(1, 0, 0);
-    positions[2] = Vec3(1, 1, 0);
-    positions[3] = Vec3(0, 1, 0);
-    positions[4] = Vec3(1, 0, 1);
-    positions[5] = Vec3(1, 1, 1);
-    positions[6] = Vec3(0, 1, 1);
-    positions[7] = Vec3(0, 0, 1);
-    context.setPositions(positions);
-    context.setVelocitiesToTemperature(temp);
-
-    // Let it equilibrate.
-
-    integrator.step(5000);
-
-    // Now run it for a while and see if the temperature is correct.
-
-    double ke = 0.0;
-    for (int i = 0; i < numSteps; ++i) {
-        State state = context.getState(State::Energy);
-        ke += state.getKineticEnergy();
-        integrator.step(1);
-    }
-    ke /= numSteps;
-    double expected = 0.5*(numParticles*3-system.getNumConstraints())*BOLTZ*temp;
-    ASSERT_USUALLY_EQUAL_TOL(expected, ke, 0.1);
-}
-
-void testRandomSeed() {
-    const int numParticles = 8;
-    const double temp = 100.0;
-    const double collisionFreq = 10.0;
-    System system;
-    VerletIntegrator integrator(0.01);
-    NonbondedForce* forceField = new NonbondedForce();
-    for (int i = 0; i < numParticles; ++i) {
-        system.addParticle(2.0);
-        forceField->addParticle((i%2 == 0 ? 1.0 : -1.0), 1.0, 5.0);
-    }
-    system.addForce(forceField);
-    AndersenThermostat* thermostat = new AndersenThermostat(temp, collisionFreq);
-    system.addForce(thermostat);
-    vector<Vec3> positions(numParticles);
-    vector<Vec3> velocities(numParticles);
-    for (int i = 0; i < numParticles; ++i) {
-        positions[i] = Vec3((i%2 == 0 ? 2 : -2), (i%4 < 2 ? 2 : -2), (i < 4 ? 2 : -2));
-        velocities[i] = Vec3(0, 0, 0);
-    }
-
-    // Try twice with the same random seed.
-
-    thermostat->setRandomNumberSeed(5);
-    Context context(system, integrator, platform);
-    context.setPositions(positions);
-    context.setVelocities(velocities);
-    integrator.step(10);
-    State state1 = context.getState(State::Positions);
-    context.reinitialize();
-    context.setPositions(positions);
-    context.setVelocities(velocities);
-    integrator.step(10);
-    State state2 = context.getState(State::Positions);
-
-    // Try twice with a different random seed.
-
-    thermostat->setRandomNumberSeed(10);
-    context.reinitialize();
-    context.setPositions(positions);
-    context.setVelocities(velocities);
-    integrator.step(10);
-    State state3 = context.getState(State::Positions);
-    context.reinitialize();
-    context.setPositions(positions);
-    context.setVelocities(velocities);
-    integrator.step(10);
-    State state4 = context.getState(State::Positions);
-
-    // Compare the results.
-
-    for (int i = 0; i < numParticles; i++) {
-        for (int j = 0; j < 3; j++) {
-            ASSERT(state1.getPositions()[i][j] == state2.getPositions()[i][j]);
-            ASSERT(state3.getPositions()[i][j] == state4.getPositions()[i][j]);
-            ASSERT(state1.getPositions()[i][j] != state3.getPositions()[i][j]);
-        }
-    }
-}
-
-int main(int argc, char* argv[]) {
-    try {
-        if (argc > 1)
-            platform.setPropertyDefaultValue("CudaPrecision", string(argv[1]));
-        testTemperature();
-        testConstraints();
-        testRandomSeed();
-    }
-    catch(const exception& e) {
-        cout << "exception: " << e.what() << endl;
-        return 1;
-    }
-    cout << "Done" << endl;
-    return 0;
+void runPlatformTests() {
 }
--- a/platforms/cuda/tests/TestCudaBrownianIntegrator.cpp
+++ b/platforms/cuda/tests/TestCudaBrownianIntegrator.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2015 Stanford University and the Authors.           *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -29,252 +29,8 @@
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */

-#include "openmm/System.h"
+#include "CudaTests.h"
+#include "TestBrownianIntegrator.h"

-
-/**
- * This tests the CUDA implementation of BrownianIntegrator.
- */
-
-#include "openmm/internal/AssertionUtilities.h"
-#include "openmm/Context.h"
-#include "CudaPlatform.h"
-#include "openmm/HarmonicBondForce.h"
-#include "openmm/NonbondedForce.h"
-#include "openmm/System.h"
-#include "openmm/BrownianIntegrator.h"
-#include "SimTKOpenMMRealType.h"
-#include "sfmt/SFMT.h"
-#include <iostream>
-#include <vector>
-
-using namespace OpenMM;
-using namespace std;
-
-const double TOL = 1e-5;
-
-CudaPlatform platform;
-
-void testSingleBond() {
-    System system;
-    system.addParticle(2.0);
-    system.addParticle(2.0);
-    double dt = 0.01;
-    BrownianIntegrator integrator(0, 0.1, dt);
-    HarmonicBondForce* forceField = new HarmonicBondForce();
-    forceField->addBond(0, 1, 1.5, 1);
-    system.addForce(forceField);
-    Context context(system, integrator, platform);
-    vector<Vec3> positions(2);
-    positions[0] = Vec3(-1, 0, 0);
-    positions[1] = Vec3(1, 0, 0);
-    context.setPositions(positions);
-
-    // This is simply an overdamped harmonic oscillator, so compare it to the analytical solution.
-
-    double rate = 2*1.0/(0.1*2.0);
-    for (int i = 0; i < 1000; ++i) {
-        State state = context.getState(State::Positions | State::Velocities);
-        double time = state.getTime();
-        double expectedDist = 1.5+0.5*std::exp(-rate*time);
-        ASSERT_EQUAL_VEC(Vec3(-0.5*expectedDist, 0, 0), state.getPositions()[0], 0.02);
-        ASSERT_EQUAL_VEC(Vec3(0.5*expectedDist, 0, 0), state.getPositions()[1], 0.02);
-        if (i > 0) {
-            double expectedSpeed = -0.5*rate*std::exp(-rate*(time-0.5*dt));
-            ASSERT_EQUAL_VEC(Vec3(-0.5*expectedSpeed, 0, 0), state.getVelocities()[0], 0.11);
-            ASSERT_EQUAL_VEC(Vec3(0.5*expectedSpeed, 0, 0), state.getVelocities()[1], 0.11);
-        }
-        integrator.step(1);
-    }
-}
-
-void testTemperature() {
-    const int numParticles = 8;
-    const int numBonds = numParticles-1;
-    const double temp = 10.0;
-    System system;
-    BrownianIntegrator integrator(temp, 2.0, 0.01);
-    HarmonicBondForce* forceField = new HarmonicBondForce();
-    for (int i = 0; i < numParticles; ++i)
-        system.addParticle(2.0);
-    for (int i = 0; i < numBonds; ++i)
-        forceField->addBond(i, i+1, 1.0, 5.0);
-    system.addForce(forceField);
-    Context context(system, integrator, platform);
-    vector<Vec3> positions(numParticles);
-    for (int i = 0; i < numParticles; ++i)
-        positions[i] = Vec3(i, 0, 0);
-    context.setPositions(positions);
-
-    // Let it equilibrate.
-
-    integrator.step(10000);
-
-    // Now run it for a while and see if the temperature is correct.
-
-    double pe = 0.0;
-    const int steps = 50000;
-    for (int i = 0; i < steps; ++i) {
-        State state = context.getState(State::Energy);
-        pe += state.getPotentialEnergy();
-        integrator.step(1);
-    }
-    pe /= steps;
-    double expected = 0.5*numBonds*BOLTZ*temp;
-    ASSERT_USUALLY_EQUAL_TOL(expected, pe, 0.1*expected);
-}
-
-void testConstraints() {
-    const int numParticles = 8;
-    const int numConstraints = 5;
-    const double temp = 20.0;
-    System system;
-    BrownianIntegrator integrator(temp, 2.0, 0.001);
-    integrator.setConstraintTolerance(1e-5);
-    NonbondedForce* forceField = new NonbondedForce();
-    for (int i = 0; i < numParticles; ++i) {
-        system.addParticle(10.0);
-        forceField->addParticle((i%2 == 0 ? 0.2 : -0.2), 0.5, 5.0);
-    }
-    system.addConstraint(0, 1, 1.0);
-    system.addConstraint(1, 2, 1.0);
-    system.addConstraint(2, 3, 1.0);
-    system.addConstraint(4, 5, 1.0);
-    system.addConstraint(6, 7, 1.0);
-    system.addForce(forceField);
-    Context context(system, integrator, platform);
-    vector<Vec3> positions(numParticles);
-    vector<Vec3> velocities(numParticles);
-    OpenMM_SFMT::SFMT sfmt;
-    init_gen_rand(0, sfmt);
-
-    for (int i = 0; i < numParticles; ++i) {
-        positions[i] = Vec3(i, 0, 0);
-        velocities[i] = Vec3(genrand_real2(sfmt)-0.5, genrand_real2(sfmt)-0.5, genrand_real2(sfmt)-0.5);
-    }
-    context.setPositions(positions);
-    context.setVelocities(velocities);
-
-    // Simulate it and see whether the constraints remain satisfied.
-
-    for (int i = 0; i < 1000; ++i) {
-        State state = context.getState(State::Positions);
-        for (int j = 0; j < numConstraints; ++j) {
-            int particle1, particle2;
-            double distance;
-            system.getConstraintParameters(j, particle1, particle2, distance);
-            Vec3 p1 = state.getPositions()[particle1];
-            Vec3 p2 = state.getPositions()[particle2];
-            double dist = std::sqrt((p1[0]-p2[0])*(p1[0]-p2[0])+(p1[1]-p2[1])*(p1[1]-p2[1])+(p1[2]-p2[2])*(p1[2]-p2[2]));
-            ASSERT_EQUAL_TOL(distance, dist, 1e-4);
-        }
-        integrator.step(1);
-    }
-}
-
-void testConstrainedMasslessParticles() {
-    System system;
-    system.addParticle(0.0);
-    system.addParticle(1.0);
-    system.addConstraint(0, 1, 1.5);
-    vector<Vec3> positions(2);
-    positions[0] = Vec3(-1, 0, 0);
-    positions[1] = Vec3(1, 0, 0);
-    BrownianIntegrator integrator(300.0, 2.0, 0.01);
-    bool failed = false;
-    try {
-        // This should throw an exception.
-        
-        Context context(system, integrator, platform);
-    }
-    catch (exception& ex) {
-        failed = true;
-    }
-    ASSERT(failed);
-    
-    // Now make both particles massless, which should work.
-    
-    system.setParticleMass(1, 0.0);
-    Context context(system, integrator, platform);
-    context.setPositions(positions);
-    context.setVelocitiesToTemperature(300.0);
-    integrator.step(1);
-    State state = context.getState(State::Velocities);
-    ASSERT_EQUAL(0.0, state.getVelocities()[0][0]);
-}
-
-void testRandomSeed() {
-    const int numParticles = 8;
-    const double temp = 100.0;
-    const double collisionFreq = 10.0;
-    System system;
-    BrownianIntegrator integrator(temp, 2.0, 0.001);
-    NonbondedForce* forceField = new NonbondedForce();
-    for (int i = 0; i < numParticles; ++i) {
-        system.addParticle(2.0);
-        forceField->addParticle((i%2 == 0 ? 1.0 : -1.0), 1.0, 5.0);
-    }
-    system.addForce(forceField);
-    vector<Vec3> positions(numParticles);
-    vector<Vec3> velocities(numParticles);
-    for (int i = 0; i < numParticles; ++i) {
-        positions[i] = Vec3((i%2 == 0 ? 2 : -2), (i%4 < 2 ? 2 : -2), (i < 4 ? 2 : -2));
-        velocities[i] = Vec3(0, 0, 0);
-    }
-
-    // Try twice with the same random seed.
-
-    integrator.setRandomNumberSeed(5);
-    Context context(system, integrator, platform);
-    context.setPositions(positions);
-    context.setVelocities(velocities);
-    integrator.step(10);
-    State state1 = context.getState(State::Positions);
-    context.reinitialize();
-    context.setPositions(positions);
-    context.setVelocities(velocities);
-    integrator.step(10);
-    State state2 = context.getState(State::Positions);
-
-    // Try twice with a different random seed.
-
-    integrator.setRandomNumberSeed(10);
-    context.reinitialize();
-    context.setPositions(positions);
-    context.setVelocities(velocities);
-    integrator.step(10);
-    State state3 = context.getState(State::Positions);
-    context.reinitialize();
-    context.setPositions(positions);
-    context.setVelocities(velocities);
-    integrator.step(10);
-    State state4 = context.getState(State::Positions);
-
-    // Compare the results.
-
-    for (int i = 0; i < numParticles; i++) {
-        for (int j = 0; j < 3; j++) {
-            ASSERT(state1.getPositions()[i][j] == state2.getPositions()[i][j]);
-            ASSERT(state3.getPositions()[i][j] == state4.getPositions()[i][j]);
-            ASSERT(state1.getPositions()[i][j] != state3.getPositions()[i][j]);
-        }
-    }
-}
-
-int main(int argc, char* argv[]) {
-    try {
-        if (argc > 1)
-            platform.setPropertyDefaultValue("CudaPrecision", string(argv[1]));
-        testSingleBond();
-        testTemperature();
-        testConstraints();
-        testConstrainedMasslessParticles();
-        testRandomSeed();
-    }
-    catch(const exception& e) {
-        cout << "exception: " << e.what() << endl;
-        return 1;
-    }
-    cout << "Done" << endl;
-    return 0;
+void runPlatformTests() {
 }
--- a/platforms/cuda/tests/TestCudaCMAPTorsionForce.cpp
+++ b/platforms/cuda/tests/TestCudaCMAPTorsionForce.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2010-2015 Stanford University and the Authors.      *
+ * Portions copyright (c) 2015 Stanford University and the Authors.           *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -29,149 +29,8 @@
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */

-/**
- * This tests the CUDA implementation of CMAPTorsionForce.
- */
+#include "CudaTests.h"
+#include "TestCMAPTorsionForce.h"

-#include "openmm/internal/AssertionUtilities.h"
-#include "openmm/Context.h"
-#include "CudaPlatform.h"
-#include "openmm/CMAPTorsionForce.h"
-#include "openmm/PeriodicTorsionForce.h"
-#include "openmm/System.h"
-#include "openmm/VerletIntegrator.h"
-#include "SimTKOpenMMRealType.h"
-#include "sfmt/SFMT.h"
-#include <iostream>
-#include <vector>
-
-using namespace OpenMM;
-using namespace std;
-
-const double TOL = 1e-5;
-
-CudaPlatform platform;
-
-void testCMAPTorsions() {
-    const int mapSize = 36;
-
-    // Create two systems: one with a pair of periodic torsions, and one with a CMAP torsion
-    // that approximates the same force.
-
-    System system1;
-    for (int i = 0; i < 5; i++)
-        system1.addParticle(1.0);
-    PeriodicTorsionForce* periodic = new PeriodicTorsionForce();
-    periodic->addTorsion(0, 1, 2, 3, 2, M_PI/4, 1.5);
-    periodic->addTorsion(1, 2, 3, 4, 3, M_PI/3, 2.0);
-    system1.addForce(periodic);
-    System system2;
-    for (int i = 0; i < 5; i++)
-        system2.addParticle(1.0);
-    CMAPTorsionForce* cmap = new CMAPTorsionForce();
-    vector<double> mapEnergy(mapSize*mapSize);
-    for (int i = 0; i < mapSize; i++) {
-        double angle1 = i*2*M_PI/mapSize;
-        double energy1 = 1.5*(1+cos(2*angle1-M_PI/4));
-        for (int j = 0; j < mapSize; j++) {
-            double angle2 = j*2*M_PI/mapSize;
-            double energy2 = 2.0*(1+cos(3*angle2-M_PI/3));
-            mapEnergy[i+j*mapSize] = energy1+energy2;
-        }
-    }
-    cmap->addMap(mapSize, mapEnergy);
-    cmap->addTorsion(0, 0, 1, 2, 3, 1, 2, 3, 4);
-    system2.addForce(cmap);
-
-    // Set the atoms in various positions, and verify that both systems give equal forces and energy.
-
-    OpenMM_SFMT::SFMT sfmt;
-    init_gen_rand(0, sfmt);
-    vector<Vec3> positions(5);
-    VerletIntegrator integrator1(0.01);
-    VerletIntegrator integrator2(0.01);
-    Context c1(system1, integrator1, platform);
-    Context c2(system2, integrator2, platform);
-    for (int i = 0; i < 50; i++) {
-        for (int j = 0; j < (int) positions.size(); j++)
-            positions[j] = Vec3(5.0*genrand_real2(sfmt), 5.0*genrand_real2(sfmt), 5.0*genrand_real2(sfmt));
-        c1.setPositions(positions);
-        c2.setPositions(positions);
-        State s1 = c1.getState(State::Forces | State::Energy);
-        State s2 = c2.getState(State::Forces | State::Energy);
-        for (int i = 0; i < system1.getNumParticles(); i++)
-            ASSERT_EQUAL_VEC(s1.getForces()[i], s2.getForces()[i], 0.05);
-        ASSERT_EQUAL_TOL(s1.getPotentialEnergy(), s2.getPotentialEnergy(), 1e-3);
-    }
-}
-
-void testChangingParameters() {
-    // Create a system with two maps and one torsion.
-
-    const int mapSize = 8;
-    System system;
-    for (int i = 0; i < 5; i++)
-        system.addParticle(1.0);
-    CMAPTorsionForce* cmap = new CMAPTorsionForce();
-    vector<double> mapEnergy1(mapSize*mapSize);
-    vector<double> mapEnergy2(mapSize*mapSize);
-    for (int i = 0; i < mapSize; i++) {
-        double angle1 = i*2*M_PI/mapSize;
-        double energy1 = cos(angle1);
-        for (int j = 0; j < mapSize; j++) {
-            double angle2 = j*2*M_PI/mapSize;
-            double energy2 = 10*sin(angle2);
-            mapEnergy1[i+j*mapSize] = energy1+energy2;
-            mapEnergy2[i+j*mapSize] = energy1-energy2;
-        }
-    }
-    cmap->addMap(mapSize, mapEnergy1);
-    cmap->addMap(mapSize, mapEnergy2);
-    cmap->addTorsion(0, 0, 1, 2, 3, 1, 2, 3, 4);
-    system.addForce(cmap);
-
-    // Set particle positions so angle1=0 and angle2=PI/4.
-
-    vector<Vec3> positions(5);
-    positions[0] = Vec3(0, 0, 1);
-    positions[1] = Vec3(0, 0, 0);
-    positions[2] = Vec3(1, 0, 0);
-    positions[3] = Vec3(1, 0, 1);
-    positions[4] = Vec3(0.5, -0.5, 1);
-    VerletIntegrator integrator(0.01);
-    Context context(system, integrator, platform);
-    context.setPositions(positions);
-
-    // Check that the energy is correct.
-
-    double energy = context.getState(State::Energy).getPotentialEnergy();
-    ASSERT_EQUAL_TOL(1+10*sin(M_PI/4), energy, 1e-5);
-
-    // Modify the parameters.
-
-    cmap->setTorsionParameters(0, 1, 0, 1, 2, 3, 1, 2, 3, 4);
-    for (int i = 0; i < mapSize*mapSize; i++)
-        mapEnergy2[i] *= 2.0;
-    cmap->setMapParameters(1, mapSize, mapEnergy2);
-    cmap->updateParametersInContext(context);
-
-    // See if the results are correct.
-
-    energy = context.getState(State::Energy).getPotentialEnergy();
-    ASSERT_EQUAL_TOL(2-20*sin(M_PI/4), energy, 1e-5);
-}
-
-int main(int argc, char* argv[]) {
-    try {
-        if (argc > 1)
-            platform.setPropertyDefaultValue("CudaPrecision", string(argv[1]));
-        testCMAPTorsions();
-        testChangingParameters();
-    }
-    catch(const exception& e) {
-        cout << "exception: " << e.what() << endl;
-        return 1;
-    }
-    cout << "Done" << endl;
-    return 0;
+void runPlatformTests() {
 }