Began CUDA version of GayBerneForce

222b3bb4 · peastman · a381a3ab · 222b3bb4 · 222b3bb4 · 222b3bb4
Commit 222b3bb4 authored Aug 01, 2016 by peastman
6 changed files
--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -1104,6 +1104,74 @@ private:
    CUevent event;
 };

+/**
+ * This kernel is invoked by GayBerneForce to calculate the forces acting on the system.
+ *
+ * All device arrays are owned by this object: they start out NULL, are allocated in
+ * initialize(), and are released in the destructor.
+ */
+class CudaCalcGayBerneForceKernel : public CalcGayBerneForceKernel {
+public:
+    CudaCalcGayBerneForceKernel(std::string name, const Platform& platform, CudaContext& cu) : CalcGayBerneForceKernel(name, platform), cu(cu),
+            hasInitializedKernels(false), numRealParticles(0), numExceptions(0), maxNeighborBlocks(0),
+            sortedParticles(NULL), axisParticleIndices(NULL), sigParams(NULL), epsParams(NULL), scale(NULL), exceptionParticles(NULL),
+            exceptionParams(NULL), aMatrix(NULL),
+            bMatrix(NULL), gMatrix(NULL), exclusions(NULL), exclusionStartIndex(NULL), blockCenter(NULL), blockBoundingBox(NULL), neighbors(NULL),
+            neighborIndex(NULL), neighborBlockCount(NULL), sortedPos(NULL), torque(NULL), event(NULL) {
+    }
+    ~CudaCalcGayBerneForceKernel();
+    /**
+     * Initialize the kernel.
+     *
+     * @param system     the System this kernel will be applied to
+     * @param force      the GayBerneForce this kernel will be used for
+     */
+    void initialize(const System& system, const GayBerneForce& force);
+    /**
+     * Execute the kernel to calculate the forces and/or energy.
+     *
+     * @param context        the context in which to execute this kernel
+     * @param includeForces  true if forces should be calculated
+     * @param includeEnergy  true if the energy should be calculated
+     * @return the potential energy due to the force
+     */
+    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+    /**
+     * Copy changed parameters over to a context.
+     *
+     * @param context    the context to copy parameters to
+     * @param force      the GayBerneForce to copy the parameters from
+     */
+    void copyParametersToContext(ContextImpl& context, const GayBerneForce& force);
+private:
+    class ReorderListener;
+    /**
+     * Rebuild the sorted particle list and the exception/exclusion tables that depend on it.
+     */
+    void sortAtoms();
+    CudaContext& cu;
+    // True once the kernel argument lists have been built (done lazily on first execute()).
+    bool hasInitializedKernels;
+    int numRealParticles, numExceptions, maxNeighborBlocks;
+    GayBerneForce::NonbondedMethod nonbondedMethod;
+    // Device arrays.  Each is NULL until initialize() allocates it.
+    CudaArray* sortedParticles;
+    CudaArray* axisParticleIndices;
+    CudaArray* sigParams;
+    CudaArray* epsParams;
+    CudaArray* scale;
+    CudaArray* exceptionParticles;
+    CudaArray* exceptionParams;
+    CudaArray* aMatrix;
+    CudaArray* bMatrix;
+    CudaArray* gMatrix;
+    CudaArray* exclusions;
+    CudaArray* exclusionStartIndex;
+    CudaArray* blockCenter;
+    CudaArray* blockBoundingBox;
+    CudaArray* neighbors;
+    CudaArray* neighborIndex;
+    CudaArray* neighborBlockCount;
+    CudaArray* sortedPos;
+    CudaArray* torque;
+    // Host-side bookkeeping recorded by initialize() and reused whenever atoms are re-sorted.
+    std::vector<bool> isRealParticle;
+    std::vector<std::pair<int, int> > exceptionAtoms;
+    std::vector<std::pair<int, int> > excludedPairs;
+    std::vector<void*> framesArgs, blockBoundsArgs, neighborsArgs, forceArgs, torqueArgs;
+    CUfunction framesKernel, blockBoundsKernel, neighborsKernel, forceKernel, torqueKernel;
+    // Only created when a cutoff is in use; NULL otherwise.
+    CUevent event;
+};
+
+
 /**
 * This kernel is invoked by VerletIntegrator to take one time step.
 */

--- a/platforms/cuda/src/CudaKernelFactory.cpp
+++ b/platforms/cuda/src/CudaKernelFactory.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2016 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -110,6 +110,8 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform
        return new CudaCalcCustomCompoundBondForceKernel(name, platform, cu, context.getSystem());
    if (name == CalcCustomManyParticleForceKernel::Name())
        return new CudaCalcCustomManyParticleForceKernel(name, platform, cu, context.getSystem());
+    if (name == CalcGayBerneForceKernel::Name())
+        return new CudaCalcGayBerneForceKernel(name, platform, cu);
    if (name == IntegrateVerletStepKernel::Name())
        return new CudaIntegrateVerletStepKernel(name, platform, cu);
    if (name == IntegrateLangevinStepKernel::Name())

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -5988,6 +5988,431 @@ void CudaCalcCustomManyParticleForceKernel::copyParametersToContext(ContextImpl&
    cu.invalidateMolecules();
 }

+/**
+ * Reports which particles and particle groups of a GayBerneForce carry identical parameters.
+ *
+ * Groups are numbered with the force's exceptions first (one group per exception), followed
+ * by one group per particle containing that particle and any axis particles it references.
+ */
+class CudaGayBerneForceInfo : public CudaForceInfo {
+public:
+    CudaGayBerneForceInfo(const GayBerneForce& force) : force(force) {
+    }
+    /**
+     * Two particles are identical when sigma, epsilon, the three radii and the three
+     * scale factors all match.  The axis particle indices are not compared.
+     */
+    bool areParticlesIdentical(int particle1, int particle2) {
+        int xAxisA, yAxisA, xAxisB, yAxisB;
+        double sigmaA, epsilonA, sxA, syA, szA, exA, eyA, ezA;
+        double sigmaB, epsilonB, sxB, syB, szB, exB, eyB, ezB;
+        force.getParticleParameters(particle1, sigmaA, epsilonA, xAxisA, yAxisA, sxA, syA, szA, exA, eyA, ezA);
+        force.getParticleParameters(particle2, sigmaB, epsilonB, xAxisB, yAxisB, sxB, syB, szB, exB, eyB, ezB);
+        if (sigmaA != sigmaB || epsilonA != epsilonB)
+            return false;
+        if (sxA != sxB || syA != syB || szA != szB)
+            return false;
+        return (exA == exB && eyA == eyB && ezA == ezB);
+    }
+    /**
+     * One group per exception plus one group per particle.
+     */
+    int getNumParticleGroups() {
+        return force.getNumExceptions()+force.getNumParticles();
+    }
+    /**
+     * Fill `particles` with the members of the requested group.
+     */
+    void getParticlesInGroup(int index, vector<int>& particles) {
+        const int exceptionCount = force.getNumExceptions();
+        if (index < exceptionCount) {
+            // Exception group: exactly the two particles of the exception.
+            int first, second;
+            double sigma, epsilon;
+            force.getExceptionParameters(index, first, second, sigma, epsilon);
+            particles.clear();
+            particles.push_back(first);
+            particles.push_back(second);
+            return;
+        }
+
+        // Particle group: the particle itself plus whichever axis particles are defined.
+        const int particle = index-exceptionCount;
+        int xAxis, yAxis;
+        double sigma, epsilon, sx, sy, sz, ex, ey, ez;
+        force.getParticleParameters(particle, sigma, epsilon, xAxis, yAxis, sx, sy, sz, ex, ey, ez);
+        particles.clear();
+        particles.push_back(particle);
+        if (xAxis >= 0)
+            particles.push_back(xAxis);
+        if (yAxis >= 0)
+            particles.push_back(yAxis);
+    }
+    /**
+     * Two exception groups are identical when their sigma and epsilon match.  Any
+     * comparison involving a per-particle group is reported as identical.
+     */
+    bool areGroupsIdentical(int group1, int group2) {
+        const int exceptionCount = force.getNumExceptions();
+        if (group1 >= exceptionCount || group2 >= exceptionCount)
+            return true;
+        int atomA, atomB;
+        double sigmaA, epsilonA, sigmaB, epsilonB;
+        force.getExceptionParameters(group1, atomA, atomB, sigmaA, epsilonA);
+        force.getExceptionParameters(group2, atomA, atomB, sigmaB, epsilonB);
+        return (sigmaA == sigmaB && epsilonA == epsilonB);
+    }
+private:
+    const GayBerneForce& force;
+};
+
+/**
+ * Listener whose execute() asks the owning kernel to rebuild its sorted particle tables.
+ */
+class CudaCalcGayBerneForceKernel::ReorderListener : public CudaContext::ReorderListener {
+public:
+    ReorderListener(CudaCalcGayBerneForceKernel& owner) : kernel(owner) {
+    }
+    void execute() {
+        kernel.sortAtoms();
+    }
+private:
+    // The kernel whose tables must be rebuilt.
+    CudaCalcGayBerneForceKernel& kernel;
+};
+
+/**
+ * Release every device array owned by this kernel.
+ */
+CudaCalcGayBerneForceKernel::~CudaCalcGayBerneForceKernel() {
+    // Arrays that were never allocated are still NULL, and deleting a null pointer is a no-op,
+    // so no per-pointer guard is needed.  The order matches the member declaration order.
+    // NOTE(review): the CUevent created in initialize() for cutoff methods is not destroyed here.
+    CudaArray* const ownedArrays[] = {
+        sortedParticles, axisParticleIndices, sigParams, epsParams, scale,
+        exceptionParticles, exceptionParams, aMatrix, bMatrix, gMatrix,
+        exclusions, exclusionStartIndex, blockCenter, blockBoundingBox,
+        neighbors, neighborIndex, neighborBlockCount, sortedPos, torque
+    };
+    const int numOwnedArrays = sizeof(ownedArrays)/sizeof(ownedArrays[0]);
+    for (int i = 0; i < numOwnedArrays; i++)
+        delete ownedArrays[i];
+}
+
+/**
+ * Allocate and fill the device arrays, record exceptions and exclusions, and compile the kernels.
+ *
+ * @param system     the System this kernel will be applied to
+ * @param force      the GayBerneForce this kernel will be used for
+ */
+void CudaCalcGayBerneForceKernel::initialize(const System& system, const GayBerneForce& force) {
+    // Initialize interactions.
+
+    int numParticles = force.getNumParticles();
+    int paddedNumAtoms = cu.getPaddedNumAtoms();
+    // Size of the device-side "real" type.  Arrays the kernels access as real/real4 must use it.
+    int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
+    nonbondedMethod = force.getNonbondedMethod();
+    bool useCutoff = (nonbondedMethod != GayBerneForce::NoCutoff);
+    bool usePeriodic = (nonbondedMethod == GayBerneForce::CutoffPeriodic);
+    sigParams = CudaArray::create<float4>(cu, paddedNumAtoms, "sigParams");
+    epsParams = CudaArray::create<float2>(cu, paddedNumAtoms, "epsParams");
+    scale = CudaArray::create<float4>(cu, paddedNumAtoms, "scale");
+    axisParticleIndices = CudaArray::create<int2>(cu, paddedNumAtoms, "axisParticleIndices");
+    sortedParticles = CudaArray::create<int>(cu, paddedNumAtoms, "sortedParticles");
+    // computeEllipsoidFrames writes these matrices as "real", so they must be sized by
+    // elementSize rather than always as float; otherwise double precision overruns the buffer.
+    aMatrix = new CudaArray(cu, 9*paddedNumAtoms, elementSize, "aMatrix");
+    bMatrix = new CudaArray(cu, 9*paddedNumAtoms, elementSize, "bMatrix");
+    gMatrix = new CudaArray(cu, 9*paddedNumAtoms, elementSize, "gMatrix");
+    vector<float4> sigParamsVector(paddedNumAtoms, make_float4(0, 0, 0, 0));
+    vector<float2> epsParamsVector(paddedNumAtoms, make_float2(0, 0));
+    vector<float4> scaleVector(paddedNumAtoms, make_float4(0, 0, 0, 0));
+    vector<int2> axisParticleVector(paddedNumAtoms, make_int2(0, 0));
+    isRealParticle.resize(paddedNumAtoms);
+    for (int i = 0; i < numParticles; i++) {
+        int xparticle, yparticle;
+        double sigma, epsilon, sx, sy, sz, ex, ey, ez;
+        force.getParticleParameters(i, sigma, epsilon, xparticle, yparticle, sx, sy, sz, ex, ey, ez);
+        axisParticleVector[i] = make_int2(xparticle, yparticle);
+        sigParamsVector[i] = make_float4((float) (0.5*sigma), (float) (0.25*sx*sx), (float) (0.25*sy*sy), (float) (0.25*sz*sz));
+        epsParamsVector[i] = make_float2((float) sqrt(epsilon), (float) (0.125*(sx*sy + sz*sz)*sqrt(sx*sy)));
+        scaleVector[i] = make_float4((float) (1/sqrt(ex)), (float) (1/sqrt(ey)), (float) (1/sqrt(ez)), 0);
+        // Particles with epsilon == 0 are skipped unless a non-zero exception involves them (below).
+        isRealParticle[i] = (epsilon != 0.0);
+    }
+    sigParams->upload(sigParamsVector);
+    epsParams->upload(epsParamsVector);
+    scale->upload(scaleVector);
+    axisParticleIndices->upload(axisParticleVector);
+
+    // Record exceptions and exclusions.
+
+    vector<float2> exceptionParamsVec;
+    for (int i = 0; i < force.getNumExceptions(); i++) {
+        int particle1, particle2;
+        double sigma, epsilon;
+        force.getExceptionParameters(i, particle1, particle2, sigma, epsilon);
+        if (epsilon != 0.0) {
+            exceptionParamsVec.push_back(make_float2((float) sigma, (float) epsilon));
+            exceptionAtoms.push_back(make_pair(particle1, particle2));
+            isRealParticle[particle1] = true;
+            isRealParticle[particle2] = true;
+        }
+        if (isRealParticle[particle1] && isRealParticle[particle2])
+            excludedPairs.push_back(pair<int, int>(particle1, particle2));
+    }
+    numRealParticles = 0;
+    for (int i = 0; i < (int) isRealParticle.size(); i++)
+        if (isRealParticle[i])
+            numRealParticles++;
+    numExceptions = exceptionParamsVec.size();
+    // Arrays that could be empty are given at least one element.
+    exclusions = CudaArray::create<int>(cu, max(1, (int) excludedPairs.size()), "exclusions");
+    exclusionStartIndex = CudaArray::create<int>(cu, numRealParticles+1, "exclusionStartIndex");
+    exceptionParticles = CudaArray::create<int4>(cu, max(1, numExceptions), "exceptionParticles");
+    exceptionParams = CudaArray::create<float2>(cu, max(1, numExceptions), "exceptionParams");
+    if (numExceptions > 0)
+        exceptionParams->upload(exceptionParamsVec);
+
+    // Create data structures used for the neighbor list.
+
+    int numAtomBlocks = (numRealParticles+31)/32;
+    blockCenter = new CudaArray(cu, numAtomBlocks, 4*elementSize, "blockCenter");
+    blockBoundingBox = new CudaArray(cu, numAtomBlocks, 4*elementSize, "blockBoundingBox");
+    sortedPos = new CudaArray(cu, numRealParticles, 4*elementSize, "sortedPos");
+    // Initial guess for the neighbor list size; execute() enlarges it on overflow.
+    maxNeighborBlocks = numRealParticles*2;
+    neighbors = CudaArray::create<int>(cu, maxNeighborBlocks*32, "neighbors");
+    neighborIndex = CudaArray::create<int>(cu, maxNeighborBlocks, "neighborIndex");
+    neighborBlockCount = CudaArray::create<int>(cu, 1, "neighborBlockCount");
+    if (useCutoff)
+        CHECK_RESULT(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING), "Error creating event for GayBerneForce");
+
+    // Create array for accumulating torques.
+
+    torque = CudaArray::create<long long>(cu, 3*paddedNumAtoms, "torque");
+    cu.addAutoclearBuffer(*torque);
+
+    // Create the kernels.
+
+    map<string, string> defines;
+    defines["USE_SWITCH"] = (useCutoff && force.getUseSwitchingFunction() ? "1" : "0");
+    double cutoff = force.getCutoffDistance();
+    defines["CUTOFF_SQUARED"] = cu.doubleToString(cutoff*cutoff);
+    if (useCutoff) {
+        // Must be the string "1": assigning the integer 1 would store the character '\x01'.
+        defines["USE_CUTOFF"] = "1";
+        if (usePeriodic)
+            defines["USE_PERIODIC"] = "1";
+
+        // Compute the switching coefficients.
+
+        if (force.getUseSwitchingFunction()) {
+            double switchingDistance = force.getSwitchingDistance();
+            defines["SWITCH_CUTOFF"] = cu.doubleToString(switchingDistance);
+            defines["SWITCH_C3"] = cu.doubleToString(10/pow(switchingDistance-cutoff, 3.0));
+            defines["SWITCH_C4"] = cu.doubleToString(15/pow(switchingDistance-cutoff, 4.0));
+            defines["SWITCH_C5"] = cu.doubleToString(6/pow(switchingDistance-cutoff, 5.0));
+        }
+    }
+    defines["PADDED_NUM_ATOMS"] = cu.intToString(paddedNumAtoms);
+    CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::gayBerne, defines);
+    framesKernel = cu.getKernel(module, "computeEllipsoidFrames");
+    blockBoundsKernel = cu.getKernel(module, "findBlockBounds");
+    neighborsKernel = cu.getKernel(module, "findNeighbors");
+    forceKernel = cu.getKernel(module, "computeForce");
+    torqueKernel = cu.getKernel(module, "applyTorques");
+    cu.addForce(new CudaGayBerneForceInfo(force));
+    cu.addReorderListener(new ReorderListener(*this));
+}
+
+/**
+ * Launch the GayBerne kernels: ellipsoid frames, block bounds, (optionally) neighbor list,
+ * pair forces, and finally torque application.
+ *
+ * The kernel argument lists are built once, on the first call.  With a cutoff, the neighbor
+ * list is rebuilt in a loop that enlarges the list arrays until every block fits.
+ *
+ * @param context        the context in which to execute this kernel
+ * @param includeForces  true if forces should be calculated (not consulted here)
+ * @param includeEnergy  true if the energy should be calculated (not consulted here)
+ * @return always 0; the force kernel is handed the context's energy buffer directly
+ */
+double CudaCalcGayBerneForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    if (!hasInitializedKernels) {
+        hasInitializedKernels = true;
+        sortAtoms();
+        framesArgs.push_back(&numRealParticles);
+        framesArgs.push_back(&cu.getPosq().getDevicePointer());
+        framesArgs.push_back(&axisParticleIndices->getDevicePointer());
+        framesArgs.push_back(&sigParams->getDevicePointer());
+        framesArgs.push_back(&scale->getDevicePointer());
+        framesArgs.push_back(&aMatrix->getDevicePointer());
+        framesArgs.push_back(&bMatrix->getDevicePointer());
+        framesArgs.push_back(&gMatrix->getDevicePointer());
+        framesArgs.push_back(&sortedParticles->getDevicePointer());
+        blockBoundsArgs.push_back(&numRealParticles);
+        blockBoundsArgs.push_back(cu.getPeriodicBoxSizePointer());
+        blockBoundsArgs.push_back(cu.getInvPeriodicBoxSizePointer());
+        blockBoundsArgs.push_back(cu.getPeriodicBoxVecXPointer());
+        blockBoundsArgs.push_back(cu.getPeriodicBoxVecYPointer());
+        blockBoundsArgs.push_back(cu.getPeriodicBoxVecZPointer());
+        blockBoundsArgs.push_back(&sortedParticles->getDevicePointer());
+        blockBoundsArgs.push_back(&cu.getPosq().getDevicePointer());
+        blockBoundsArgs.push_back(&sortedPos->getDevicePointer());
+        blockBoundsArgs.push_back(&blockCenter->getDevicePointer());
+        blockBoundsArgs.push_back(&blockBoundingBox->getDevicePointer());
+        blockBoundsArgs.push_back(&neighborBlockCount->getDevicePointer());
+        neighborsArgs.push_back(&numRealParticles);
+        neighborsArgs.push_back(&maxNeighborBlocks);
+        neighborsArgs.push_back(cu.getPeriodicBoxSizePointer());
+        neighborsArgs.push_back(cu.getInvPeriodicBoxSizePointer());
+        neighborsArgs.push_back(cu.getPeriodicBoxVecXPointer());
+        neighborsArgs.push_back(cu.getPeriodicBoxVecYPointer());
+        neighborsArgs.push_back(cu.getPeriodicBoxVecZPointer());
+        neighborsArgs.push_back(&sortedPos->getDevicePointer());
+        neighborsArgs.push_back(&blockCenter->getDevicePointer());
+        neighborsArgs.push_back(&blockBoundingBox->getDevicePointer());
+        neighborsArgs.push_back(&neighbors->getDevicePointer());        // index 10, patched on resize
+        neighborsArgs.push_back(&neighborIndex->getDevicePointer());    // index 11, patched on resize
+        neighborsArgs.push_back(&neighborBlockCount->getDevicePointer());
+        neighborsArgs.push_back(&exclusions->getDevicePointer());
+        neighborsArgs.push_back(&exclusionStartIndex->getDevicePointer());
+        forceArgs.push_back(&cu.getLongForceBuffer().getDevicePointer());
+        forceArgs.push_back(&torque->getDevicePointer());
+        forceArgs.push_back(&numRealParticles);
+        forceArgs.push_back(&numExceptions);
+        forceArgs.push_back(&cu.getEnergyBuffer().getDevicePointer());
+        forceArgs.push_back(&sortedPos->getDevicePointer());
+        forceArgs.push_back(&sigParams->getDevicePointer());
+        forceArgs.push_back(&epsParams->getDevicePointer());
+        forceArgs.push_back(&sortedParticles->getDevicePointer());
+        forceArgs.push_back(&aMatrix->getDevicePointer());
+        forceArgs.push_back(&bMatrix->getDevicePointer());
+        forceArgs.push_back(&gMatrix->getDevicePointer());
+        forceArgs.push_back(&exclusions->getDevicePointer());
+        forceArgs.push_back(&exclusionStartIndex->getDevicePointer());
+        forceArgs.push_back(&exceptionParticles->getDevicePointer());
+        forceArgs.push_back(&exceptionParams->getDevicePointer());
+        if (nonbondedMethod != GayBerneForce::NoCutoff) {
+            forceArgs.push_back(&maxNeighborBlocks);
+            forceArgs.push_back(&neighbors->getDevicePointer());        // index 17, patched on resize
+            forceArgs.push_back(&neighborIndex->getDevicePointer());    // index 18, patched on resize
+            forceArgs.push_back(&neighborBlockCount->getDevicePointer());
+            forceArgs.push_back(cu.getPeriodicBoxSizePointer());
+            forceArgs.push_back(cu.getInvPeriodicBoxSizePointer());
+            forceArgs.push_back(cu.getPeriodicBoxVecXPointer());
+            forceArgs.push_back(cu.getPeriodicBoxVecYPointer());
+            forceArgs.push_back(cu.getPeriodicBoxVecZPointer());
+        }
+        torqueArgs.push_back(&cu.getLongForceBuffer().getDevicePointer());
+        torqueArgs.push_back(&torque->getDevicePointer());
+        torqueArgs.push_back(&numRealParticles);
+        torqueArgs.push_back(&cu.getPosq().getDevicePointer());
+        torqueArgs.push_back(&axisParticleIndices->getDevicePointer());
+        torqueArgs.push_back(&sortedParticles->getDevicePointer());
+    }
+    const int numAtomBlocks = (numRealParticles+31)/32;
+    const int numForceThreads = cu.getNonbondedUtilities().getNumForceThreadBlocks()*cu.getNonbondedUtilities().getForceThreadBlockSize();
+    cu.executeKernel(framesKernel, &framesArgs[0], numRealParticles);
+    cu.executeKernel(blockBoundsKernel, &blockBoundsArgs[0], numAtomBlocks);
+    if (nonbondedMethod == GayBerneForce::NoCutoff) {
+        cu.executeKernel(forceKernel, &forceArgs[0], numForceThreads);
+    }
+    else {
+        while (true) {
+            cu.executeKernel(neighborsKernel, &neighborsArgs[0], numRealParticles);
+            int* count = (int*) cu.getPinnedBuffer();
+            neighborBlockCount->download(count, false);
+
+            // Record the event right after the asynchronous download so that the
+            // synchronize below really waits for *count to be valid.  Without the record,
+            // synchronizing on a never-recorded event returns immediately.
+            // NOTE(review): this records on the default stream; if the context launches on
+            // a different stream, record on that stream instead.
+            CHECK_RESULT(cuEventRecord(event, 0), "Error recording event for GayBerneForce");
+            cu.executeKernel(forceKernel, &forceArgs[0], numForceThreads);
+            CHECK_RESULT(cuEventSynchronize(event), "Error synchronizing on event for GayBerneForce");
+            if (*count <= maxNeighborBlocks)
+                break;
+
+            // There wasn't enough room for the neighbor list, so we need to recreate it.
+            // NOTE(review): the force kernel has already been launched with the truncated
+            // list; this relies on computeForce doing nothing in that case - confirm.
+
+            delete neighbors;
+            neighbors = NULL;
+            delete neighborIndex;
+            neighborIndex = NULL;
+            maxNeighborBlocks = (int) ceil((*count)*1.1);
+            neighbors = CudaArray::create<int>(cu, maxNeighborBlocks*32, "neighbors");
+            neighborIndex = CudaArray::create<int>(cu, maxNeighborBlocks, "neighborIndex");
+            neighborsArgs[10] = &neighbors->getDevicePointer();
+            neighborsArgs[11] = &neighborIndex->getDevicePointer();
+            forceArgs[17] = &neighbors->getDevicePointer();
+            forceArgs[18] = &neighborIndex->getDevicePointer();
+
+            // findBlockBounds is the kernel that zeroes neighborBlockCount.  Run it again so
+            // the rebuilt list starts from block 0 instead of continuing from the old count.
+            cu.executeKernel(blockBoundsKernel, &blockBoundsArgs[0], numAtomBlocks);
+        }
+    }
+    cu.executeKernel(torqueKernel, &torqueArgs[0], numRealParticles);
+    return 0.0;
+}
+
+/**
+ * Re-upload the per-particle and per-exception parameters after the force has been modified.
+ *
+ * The number of particles, the set of non-excluded exceptions, and the set of ignored
+ * particles must all be unchanged since initialize(); otherwise an OpenMMException is thrown.
+ * The axis particle indices are not re-uploaded by this method.
+ *
+ * @param context    the context to copy parameters to
+ * @param force      the GayBerneForce to copy the parameters from
+ */
+void CudaCalcGayBerneForceKernel::copyParametersToContext(ContextImpl& context, const GayBerneForce& force) {
+    // Make sure the new parameters are acceptable.
+
+    if (force.getNumParticles() != cu.getNumAtoms())
+        throw OpenMMException("updateParametersInContext: The number of particles has changed");
+    vector<int> exceptions;
+    for (int i = 0; i < force.getNumExceptions(); i++) {
+        int particle1, particle2;
+        double sigma, epsilon;
+        force.getExceptionParameters(i, particle1, particle2, sigma, epsilon);
+        // Match the force's exceptions, in order, against the ones recorded by initialize().
+        if (exceptionAtoms.size() > exceptions.size() && make_pair(particle1, particle2) == exceptionAtoms[exceptions.size()])
+            exceptions.push_back(i);
+        else if (epsilon != 0.0)
+            throw OpenMMException("updateParametersInContext: The set of non-excluded exceptions has changed");
+    }
+
+    // Every recorded exception must have been matched.  Otherwise the lookup of
+    // exceptions[i] below would read past the end of the vector.
+    if (exceptions.size() != exceptionAtoms.size())
+        throw OpenMMException("updateParametersInContext: The set of non-excluded exceptions has changed");
+    int numExceptions = exceptionAtoms.size();
+
+    // Record the per-particle parameters.
+
+    vector<float4> sigParamsVector(cu.getPaddedNumAtoms(), make_float4(0, 0, 0, 0));
+    vector<float2> epsParamsVector(cu.getPaddedNumAtoms(), make_float2(0, 0));
+    vector<float4> scaleVector(cu.getPaddedNumAtoms(), make_float4(0, 0, 0, 0));
+    for (int i = 0; i < force.getNumParticles(); i++) {
+        int xparticle, yparticle;
+        double sigma, epsilon, sx, sy, sz, ex, ey, ez;
+        force.getParticleParameters(i, sigma, epsilon, xparticle, yparticle, sx, sy, sz, ex, ey, ez);
+        sigParamsVector[i] = make_float4((float) (0.5*sigma), (float) (0.25*sx*sx), (float) (0.25*sy*sy), (float) (0.25*sz*sz));
+        epsParamsVector[i] = make_float2((float) sqrt(epsilon), (float) (0.125*(sx*sy + sz*sz)*sqrt(sx*sy)));
+        scaleVector[i] = make_float4((float) (1/sqrt(ex)), (float) (1/sqrt(ey)), (float) (1/sqrt(ez)), 0);
+        if (epsilon != 0.0 && !isRealParticle[i])
+            throw OpenMMException("updateParametersInContext: The set of ignored particles (ones with epsilon=0) has changed");
+    }
+    sigParams->upload(sigParamsVector);
+    epsParams->upload(epsParamsVector);
+    scale->upload(scaleVector);
+
+    // Record the exceptions.
+
+    if (numExceptions > 0) {
+        vector<float2> exceptionParamsVec(numExceptions);
+        for (int i = 0; i < numExceptions; i++) {
+            int atom1, atom2;
+            double sigma, epsilon;
+            force.getExceptionParameters(exceptions[i], atom1, atom2, sigma, epsilon);
+            exceptionParamsVec[i] = make_float2((float) sigma, (float) epsilon);
+        }
+        exceptionParams->upload(exceptionParamsVec);
+    }
+    cu.invalidateMolecules();
+    sortAtoms();
+}
+
+/**
+ * Rebuild the tables that depend on the current atom order: the compacted list of real
+ * particles, the exception particle indices, and the per-particle exclusion lists.
+ */
+void CudaCalcGayBerneForceKernel::sortAtoms() {
+    // Walk the atoms in the context's current order and keep only the real particles.
+    // sortedToOriginal maps a compacted slot to an atom index; originalToSorted is the
+    // inverse, with -1 for atoms that are not real particles.
+
+    const vector<int>& atomOrder = cu.getAtomIndex();
+    vector<int> sortedToOriginal(cu.getPaddedNumAtoms(), 0);
+    vector<int> originalToSorted(atomOrder.size(), -1);
+    int numSorted = 0;
+    for (int slot = 0; slot < cu.getNumAtoms(); slot++) {
+        const int atom = atomOrder[slot];
+        if (!isRealParticle[atom])
+            continue;
+        originalToSorted[atom] = numSorted;
+        sortedToOriginal[numSorted] = atom;
+        numSorted++;
+    }
+    sortedParticles->upload(sortedToOriginal);
+
+    // Update the list of exception particles: (atom1, atom2, sorted slot 1, sorted slot 2).
+
+    const int exceptionCount = exceptionAtoms.size();
+    if (exceptionCount > 0) {
+        vector<int4> exceptionData(exceptionCount);
+        for (int e = 0; e < exceptionCount; e++) {
+            const int atomA = exceptionAtoms[e].first;
+            const int atomB = exceptionAtoms[e].second;
+            exceptionData[e] = make_int4(atomA, atomB, originalToSorted[atomA], originalToSorted[atomB]);
+        }
+        exceptionParticles->upload(exceptionData);
+    }
+
+    // Rebuild the list of exclusions, grouped per sorted particle.
+    // NOTE(review): each pair is filed under the sorted slot of the atom with the smaller
+    // ORIGINAL index, which need not be the smaller sorted slot, and the per-particle
+    // lists are not sorted.  Confirm this matches what the kernels expect.
+
+    vector<vector<int> > perParticleExclusions(numRealParticles);
+    for (int p = 0; p < (int) excludedPairs.size(); p++) {
+        const int lowAtom = min(excludedPairs[p].first, excludedPairs[p].second);
+        const int highAtom = max(excludedPairs[p].first, excludedPairs[p].second);
+        perParticleExclusions[originalToSorted[lowAtom]].push_back(originalToSorted[highAtom]);
+    }
+
+    // Flatten into one array plus a start-offset table with numRealParticles+1 entries.
+
+    vector<int> flatExclusions(exclusions->getSize());
+    vector<int> startOffsets(exclusionStartIndex->getSize());
+    int offset = 0;
+    for (int particle = 0; particle < numRealParticles; particle++) {
+        startOffsets[particle] = offset;
+        const vector<int>& excluded = perParticleExclusions[particle];
+        for (int j = 0; j < (int) excluded.size(); j++)
+            flatExclusions[offset++] = excluded[j];
+    }
+    startOffsets[numRealParticles] = offset;
+    exclusions->upload(flatExclusions);
+    exclusionStartIndex->upload(startOffsets);
+}
+
 CudaIntegrateVerletStepKernel::~CudaIntegrateVerletStepKernel() {
 }


--- a/platforms/cuda/src/CudaPlatform.cpp
+++ b/platforms/cuda/src/CudaPlatform.cpp
@@ -92,6 +92,7 @@ CudaPlatform::CudaPlatform() {
    registerKernelFactory(CalcCustomCentroidBondForceKernel::Name(), factory);
    registerKernelFactory(CalcCustomCompoundBondForceKernel::Name(), factory);
    registerKernelFactory(CalcCustomManyParticleForceKernel::Name(), factory);
+    registerKernelFactory(CalcGayBerneForceKernel::Name(), factory);
    registerKernelFactory(IntegrateVerletStepKernel::Name(), factory);
    registerKernelFactory(IntegrateLangevinStepKernel::Name(), factory);
    registerKernelFactory(IntegrateBrownianStepKernel::Name(), factory);

--- a/platforms/cuda/src/kernels/gayBerne.cu
+++ b/platforms/cuda/src/kernels/gayBerne.cu
+#define TILE_SIZE 32
+#define NEIGHBOR_BLOCK_SIZE 32
+
+/**
+ * Calculate the ellipsoid coordinate frames and associated matrices.
+ *
+ * Threads run a grid-stride loop over the sorted particle list.  For each particle, three
+ * row-major 3x3 matrices are written at offset 9*sortedIndex:
+ *   aMatrix - rows are the unit x, y and z axes of the particle's local frame (call it A)
+ *   bMatrix - A^T * diag(scale.x, scale.y, scale.z) * A
+ *   gMatrix - A^T * diag(sigParams.y, sigParams.z, sigParams.w) * A
+ *
+ * posq, axisParticleIndices, sigParams and scale are indexed by original atom index;
+ * the output matrices are indexed by sorted index.  An axis particle index of -1 means
+ * "not specified".
+ */
+extern "C" __global__ void computeEllipsoidFrames(int numParticles, const real4* __restrict__ posq, const int2* __restrict__ axisParticleIndices,
+        const float4* __restrict__ sigParams, const float4* __restrict__ scale, real* __restrict__ aMatrix,
+        real* __restrict__ bMatrix, real* __restrict__ gMatrix, const int* sortedParticles) {
+    for (int sortedIndex = blockIdx.x*blockDim.x+threadIdx.x; sortedIndex < numParticles; sortedIndex += blockDim.x*gridDim.x) {
+        // Compute the local coordinate system of the ellipsoid.
+
+        int originalIndex = sortedParticles[sortedIndex];
+        real3 pos = trimTo3(posq[originalIndex]);
+        int2 axisParticles = axisParticleIndices[originalIndex];
+        real3 xdir, ydir, zdir;
+        if (axisParticles.x == -1) {
+            // No axis particles: use the global x and y axes.
+            xdir = make_real3(1, 0, 0);
+            ydir = make_real3(0, 1, 0);
+        }
+        else {
+            xdir = pos-trimTo3(posq[axisParticles.x]);
+            xdir = normalize(xdir);
+            if (axisParticles.y == -1) {
+                // Only the x axis is specified: pick whichever global axis is not
+                // close to parallel with xdir as the starting guess for y.
+                if (xdir.y > -0.5f && xdir.y < 0.5f)
+                    ydir = make_real3(0, 1, 0);
+                else
+                    ydir = make_real3(1, 0, 0);
+            }
+            else
+                ydir = pos-trimTo3(posq[axisParticles.y]);
+            // Remove the component along xdir so the two axes are orthogonal.
+            ydir -= xdir*dot(xdir, ydir);
+            ydir = normalize(ydir);
+        }
+        zdir = cross(xdir, ydir);
+
+        // Compute matrices we will need later.
+
+        real (*a)[3] = (real (*)[3]) (aMatrix+sortedIndex*9);
+        real (*b)[3] = (real (*)[3]) (bMatrix+sortedIndex*9);
+        real (*g)[3] = (real (*)[3]) (gMatrix+sortedIndex*9);
+        a[0][0] = xdir.x;
+        a[0][1] = xdir.y;
+        a[0][2] = xdir.z;
+        a[1][0] = ydir.x;
+        a[1][1] = ydir.y;
+        a[1][2] = ydir.z;
+        a[2][0] = zdir.x;
+        a[2][1] = zdir.y;
+        a[2][2] = zdir.z;
+        float4 sig = sigParams[originalIndex];
+        // CUDA vector types have no swizzle members, so the last three components are
+        // extracted explicitly.
+        float3 r2 = make_float3(sig.y, sig.z, sig.w);
+        float3 e2 = trimTo3(scale[originalIndex]);
+        for (int i = 0; i < 3; i++)
+            for (int j = 0; j < 3; j++) {
+                b[i][j] = a[0][i]*e2.x*a[0][j] + a[1][i]*e2.y*a[1][j] + a[2][i]*e2.z*a[2][j];
+                g[i][j] = a[0][i]*r2.x*a[0][j] + a[1][i]*r2.y*a[1][j] + a[2][i]*r2.z*a[2][j];
+            }
+    }
+}
+
+/**
+ * Find a bounding box for the atoms in each block.
+ *
+ * Each thread handles whole blocks of TILE_SIZE consecutive sorted atoms, striding by the
+ * total thread count.  For every block it copies the atom positions into sortedPos and
+ * stores the box center in blockCenter and the box half-widths in blockBoundingBox.
+ * Global thread 0 also resets *neighborBlockCount to zero.
+ */
+extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
+        const int* sortedAtoms, const real4* __restrict__ posq, real4* __restrict__ sortedPos, real4* __restrict__ blockCenter,
+        real4* __restrict__ blockBoundingBox, int* __restrict__ neighborBlockCount) {
+    const int globalThread = blockIdx.x*blockDim.x+threadIdx.x;
+    for (int block = globalThread; block*TILE_SIZE < numAtoms; block += blockDim.x*gridDim.x) {
+        const int firstAtom = block*TILE_SIZE;
+        const int endAtom = min(firstAtom+TILE_SIZE, numAtoms);
+
+        // The first atom of the block seeds both corners of the box.
+
+        real4 pos = posq[sortedAtoms[firstAtom]];
+        sortedPos[firstAtom] = pos;
+#ifdef USE_PERIODIC
+        APPLY_PERIODIC_TO_POS(pos)
+#endif
+        real4 lowerCorner = pos;
+        real4 upperCorner = pos;
+
+        // Grow the box to include the remaining atoms.  sortedPos receives the position
+        // as read from posq, before any periodic wrapping is applied to the local copy.
+
+        for (int atom = firstAtom+1; atom < endAtom; atom++) {
+            pos = posq[sortedAtoms[atom]];
+            sortedPos[atom] = pos;
+#ifdef USE_PERIODIC
+            real4 center = 0.5f*(upperCorner+lowerCorner);
+            APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)
+#endif
+            lowerCorner = min(lowerCorner, pos);
+            upperCorner = max(upperCorner, pos);
+        }
+        real4 halfWidths = 0.5f*(upperCorner-lowerCorner);
+        blockBoundingBox[block] = halfWidths;
+        blockCenter[block] = 0.5f*(upperCorner+lowerCorner);
+    }
+    if (globalThread == 0)
+        *neighborBlockCount = 0;
+}
+
+/**
+ * This is called by findNeighbors() to write a block to the neighbor list.
+ *
+ * A slot is reserved by atomically incrementing *neighborBlockCount.  The counter is
+ * incremented even when the list is full, so its final value is the number of blocks
+ * that were requested.  When the reserved slot is beyond maxNeighborBlocks nothing is written.
+ *
+ * @param atom1              the atom whose neighbors are being stored
+ * @param neighborBuffer     the neighbor atoms collected so far
+ * @param numAtomsInBuffer   number of valid entries in neighborBuffer
+ * @param maxNeighborBlocks  capacity of the neighbor list, in blocks
+ * @param neighbors          output: NEIGHBOR_BLOCK_SIZE entries per block, padded with -1
+ * @param neighborIndex      output: the atom1 that each block belongs to
+ * @param neighborBlockCount running count of blocks requested
+ */
+__device__ void storeNeighbors(int atom1, int* neighborBuffer, int numAtomsInBuffer, int maxNeighborBlocks, int* __restrict__ neighbors,
+        int* __restrict__ neighborIndex, int* __restrict__ neighborBlockCount) {
+    int blockIndex = atomicAdd(neighborBlockCount, 1);
+    if (blockIndex >= maxNeighborBlocks)
+        return; // We don't have enough room for the neighbor list.
+    neighborIndex[blockIndex] = atom1;
+    int baseIndex = blockIndex*NEIGHBOR_BLOCK_SIZE;
+    for (int i = 0; i < numAtomsInBuffer; i++)
+        neighbors[baseIndex+i] = neighborBuffer[i];
+    // Mark the unused tail of the block as empty.
+    for (int i = numAtomsInBuffer; i < NEIGHBOR_BLOCK_SIZE; i++)
+        neighbors[baseIndex+i] = -1;
+}
+
+/**
+ * Build a list of neighbors for each atom.
+ *
+ * Each thread handles one atom at a time (atom1) and only considers partners with a larger
+ * sorted index, so every pair is recorded once.  Whole blocks of TILE_SIZE atoms are rejected
+ * when atom1 is farther than the cutoff from the block's bounding box.  Neighbors are collected
+ * in a per-thread buffer and flushed with storeNeighbors() whenever it fills up, and once more
+ * at the end if anything is left in it.
+ *
+ * The exclusions of atom1 are the entries exclusions[exclusionStartIndex[atom1]] up to (but not
+ * including) exclusions[exclusionStartIndex[atom1+1]].  They are assumed to be sorted in
+ * ascending order of sorted atom index (TODO confirm against the host code that builds them).
+ */
+extern "C" __global__ void findNeighbors(int numAtoms, int maxNeighborBlocks, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
+        real4* __restrict__ sortedPos, real4* __restrict__ blockCenter, real4* __restrict__ blockBoundingBox, int* __restrict__ neighbors,
+        int* __restrict__ neighborIndex, int* __restrict__ neighborBlockCount, const int* __restrict__ exclusions, const int* __restrict__ exclusionStartIndex) {
+    const int numBlocks = (numAtoms+TILE_SIZE-1)/TILE_SIZE;
+    int neighborBuffer[NEIGHBOR_BLOCK_SIZE];
+    for (int atom1 = blockIdx.x*blockDim.x+threadIdx.x; atom1 < numAtoms; atom1 += blockDim.x*gridDim.x) {
+        int nextExclusion = exclusionStartIndex[atom1];
+        int lastExclusion = exclusionStartIndex[atom1+1];
+        real4 pos = sortedPos[atom1];
+        int nextBufferIndex = 0;
+
+        // Loop over atom blocks and compute the distance of this atom from each one's bounding box.
+
+        for (int block = (atom1+1)/TILE_SIZE; block < numBlocks; block++) {
+            real4 center = blockCenter[block];
+            real4 blockSize = blockBoundingBox[block];
+            real4 blockDelta = center-pos;
+#ifdef USE_PERIODIC
+            APPLY_PERIODIC_TO_DELTA(blockDelta)
+#endif
+            blockDelta.x = max((real) 0, fabs(blockDelta.x)-blockSize.x);
+            blockDelta.y = max((real) 0, fabs(blockDelta.y)-blockSize.y);
+            blockDelta.z = max((real) 0, fabs(blockDelta.z)-blockSize.z);
+            if (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z >= CUTOFF_SQUARED)
+                continue;
+
+            // Loop over atoms within this block.
+
+            int first = max(block*TILE_SIZE, atom1+1);
+            int last = min((block+1)*TILE_SIZE, numAtoms);
+            for (int atom2 = first; atom2 < last; atom2++) {
+                // Skip over excluded interactions.  First discard exclusions that lie before atom2
+                // (they belonged to blocks that were rejected above), then test for an exact match.
+
+                while (nextExclusion < lastExclusion && exclusions[nextExclusion] < atom2)
+                    nextExclusion++;
+                if (nextExclusion < lastExclusion && exclusions[nextExclusion] == atom2) {
+                    nextExclusion++;
+                    continue;
+                }
+                real4 delta = pos-sortedPos[atom2];
+#ifdef USE_PERIODIC
+                APPLY_PERIODIC_TO_DELTA(delta)
+#endif
+                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                if (r2 < CUTOFF_SQUARED) {
+                    neighborBuffer[nextBufferIndex++] = atom2;
+                    if (nextBufferIndex == NEIGHBOR_BLOCK_SIZE) {
+                        storeNeighbors(atom1, neighborBuffer, nextBufferIndex, maxNeighborBlocks, neighbors, neighborIndex, neighborBlockCount);
+                        nextBufferIndex = 0;
+                    }
+                }
+            }
+        }
+
+        // Flush whatever is left in the buffer as a partially filled block.
+
+        if (nextBufferIndex > 0)
+            storeNeighbors(atom1, neighborBuffer, nextBufferIndex, maxNeighborBlocks, neighbors, neighborIndex, neighborBlockCount);
+    }
+}
+
+/**
+ * Everything computeOneInteraction() needs to know about a single particle.
+ * Instances are filled in by loadAtomData().
+ */
+typedef struct {
+    // Entry of sigParams for this particle.  The x components of two particles are summed to
+    // give sigma for a pair; y, z and w scale the rows of the matrix used for the torque.
+    float4 sig;
+    // Entry of epsParams for this particle.  The x components of two particles are multiplied
+    // to give epsilon for a pair; y enters the eta term of the energy.
+    float2 eps;
+    // Position of the particle.
+    real3 pos;
+    // 3x3 matrices copied from the aMatrix, bMatrix and gMatrix arrays.
+    real a[3][3];
+    real b[3][3];
+    real g[3][3];
+} AtomData;
+
+/**
+ * Fill in an AtomData with the values for one particle.
+ *
+ * The parameter arrays (sigParams, epsParams) are indexed by originalIndex, while the position
+ * and the three matrix arrays are indexed by sortedIndex.  Each matrix array stores nine
+ * consecutive values per particle in row-major order.
+ *
+ * @param data           the structure to fill in
+ * @param sortedIndex    index of the particle in pos, aMatrix, bMatrix and gMatrix
+ * @param originalIndex  index of the particle in sigParams and epsParams
+ */
+__device__ void loadAtomData(AtomData* data, int sortedIndex, int originalIndex, const real4* __restrict__ pos, const float4* __restrict__ sigParams,
+        const float2* __restrict__ epsParams, const real* __restrict__ aMatrix, const real* __restrict__ bMatrix, const real* __restrict__ gMatrix) {
+    data->sig = sigParams[originalIndex];
+    data->eps = epsParams[originalIndex];
+    data->pos = trimTo3(pos[sortedIndex]);
+    for (int i = 0; i < 3; i++)
+        for (int j = 0; j < 3; j++) {
+            int k = 9*sortedIndex+3*i+j;
+            data->a[i][j] = aMatrix[k];
+            data->b[i][j] = bMatrix[k];
+            data->g[i][j] = gMatrix[k];
+        }
+}
+
+/**
+ * Compute the product m*v of a 3x3 matrix and a column vector.
+ */
+__device__ real3 matrixVectorProduct(real (*m)[3], real3 v) {
+    return make_real3(m[0][0]*v.x + m[0][1]*v.y + m[0][2]*v.z,
+                      m[1][0]*v.x + m[1][1]*v.y + m[1][2]*v.z,
+                      m[2][0]*v.x + m[2][1]*v.y + m[2][2]*v.z);
+}
+
+/**
+ * Compute the product v*m of a row vector and a 3x3 matrix (equivalently, transpose(m)*v).
+ */
+__device__ real3 vectorMatrixProduct(real3 v, real (*m)[3]) {
+    return make_real3(m[0][0]*v.x + m[1][0]*v.y + m[2][0]*v.z,
+                      m[0][1]*v.x + m[1][1]*v.y + m[2][1]*v.z,
+                      m[0][2]*v.x + m[1][2]*v.y + m[2][2]*v.z);
+}
+
+
+/**
+ * Compute the element-wise sum of two 3x3 matrices.
+ *
+ * @param result  on exit, contains a+b
+ * @param a       the first matrix
+ * @param b       the second matrix
+ */
+__device__ void matrixSum(real (*result)[3], real (*a)[3], real (*b)[3]) {
+    result[0][0] = a[0][0]+b[0][0];
+    result[0][1] = a[0][1]+b[0][1];
+    result[0][2] = a[0][2]+b[0][2];
+    result[1][0] = a[1][0]+b[1][0];
+    result[1][1] = a[1][1]+b[1][1];
+    result[1][2] = a[1][2]+b[1][2];
+    result[2][0] = a[2][0]+b[2][0];
+    result[2][1] = a[2][1]+b[2][1];
+    result[2][2] = a[2][2]+b[2][2];
+}
+
+/**
+ * Compute the determinant of a 3x3 matrix.
+ */
+__device__ real determinant(real (*m)[3]) {
+    return (m[0][0]*m[1][1]*m[2][2] + m[0][1]*m[1][2]*m[2][0] + m[0][2]*m[1][0]*m[2][1] -
+            m[0][0]*m[1][2]*m[2][1] - m[0][1]*m[1][0]*m[2][2] - m[0][2]*m[1][1]*m[2][0]);
+}
+
+
+/**
+ * Compute the inverse of a 3x3 matrix as its adjugate divided by its determinant.
+ * No check is made for a singular matrix.
+ *
+ * @param result  on exit, contains the inverse of m.  It must not be the same array as m,
+ *                since elements of m are still read after elements of result are written.
+ * @param m       the matrix to invert
+ */
+__device__ void matrixInverse(real (*result)[3], real (*m)[3]) {
+    real invDet = RECIP(determinant(m));
+    result[0][0] = invDet*(m[1][1]*m[2][2] - m[1][2]*m[2][1]);
+    result[1][0] = -invDet*(m[1][0]*m[2][2] - m[1][2]*m[2][0]);
+    result[2][0] = invDet*(m[1][0]*m[2][1] - m[1][1]*m[2][0]);
+    result[0][1] = -invDet*(m[0][1]*m[2][2] - m[0][2]*m[2][1]);
+    result[1][1] = invDet*(m[0][0]*m[2][2] - m[0][2]*m[2][0]);
+    result[2][1] = -invDet*(m[0][0]*m[2][1] - m[0][1]*m[2][0]);
+    result[0][2] = invDet*(m[0][1]*m[1][2] - m[0][2]*m[1][1]);
+    result[1][2] = -invDet*(m[0][0]*m[1][2] - m[0][2]*m[1][0]);
+    result[2][2] = invDet*(m[0][0]*m[1][1] - m[0][1]*m[1][0]);
+}
+
+/**
+ * Compute the interaction between one pair of particles and accumulate the results.
+ *
+ * @param data1        the data for the first particle
+ * @param data2        the data for the second particle
+ * @param sigma        the sigma parameter to use for this pair
+ * @param epsilon      the epsilon parameter to use for this pair
+ * @param dr           the displacement between the particles, as computed by the caller
+ * @param r2           the squared length of dr (must be nonzero)
+ * @param force1       the force is added to this
+ * @param force2       the force is subtracted from this
+ * @param torque1      the torque on the first particle is subtracted from this
+ * @param torque2      the torque on the second particle is subtracted from this
+ * @param totalEnergy  the (switched) energy of the pair is added to this
+ */
+__device__ void computeOneInteraction(AtomData* data1, AtomData* data2, real sigma, real epsilon, real3 dr, real r2, real3* force1, real3* force2, real3* torque1, real3* torque2, real *totalEnergy) {
+    real rInv = RSQRT(r2);
+    real r = r2*rInv;
+    real3 drUnit = dr*rInv;
+
+    // Compute the switching function.
+
+    real switchValue = 1, switchDeriv = 0;
+    #if USE_SWITCH
+    if (r > SWITCH_CUTOFF) {
+        real x = r-SWITCH_CUTOFF;
+        switchValue = 1+x*x*x*(SWITCH_C3+x*(SWITCH_C4+x*SWITCH_C5));
+        switchDeriv = x*x*(3*SWITCH_C3+x*(4*SWITCH_C4+x*5*SWITCH_C5));
+    }
+    #endif
+
+    // Compute vectors and matrices we'll be needing.
+
+    real B12[3][3], G12[3][3], B12inv[3][3], G12inv[3][3];
+    matrixSum(B12, data1->b, data2->b);
+    matrixSum(G12, data1->g, data2->g);
+    matrixInverse(B12inv, B12);
+    matrixInverse(G12inv, G12);
+    real detG12 = determinant(G12);
+
+    // Estimate the distance between the ellipsoids and compute the first terms needed for the energy.
+
+    real sigma12 = 1/SQRT(0.5f*dot(drUnit, matrixVectorProduct(G12inv, drUnit)));
+    real h12 = r - sigma12;
+    real rho = sigma/(h12+sigma);
+    real rho2 = rho*rho;
+    real rho6 = rho2*rho2*rho2;
+    real u = 4*epsilon*(rho6*rho6-rho6);
+    real eta = SQRT(2*data1->eps.y*data2->eps.y/detG12);
+    real chi = 2*dot(drUnit, matrixVectorProduct(B12inv, drUnit));
+    chi *= chi;
+    real energy = u*eta*chi;
+
+    // Compute the terms needed for the force.
+
+    real3 kappa = matrixVectorProduct(G12inv, dr);
+    real3 iota = matrixVectorProduct(B12inv, dr);
+    real rInv2 = rInv*rInv;
+    real dUSLJdr = 24*epsilon*(2*rho6-1)*rho6*rho/sigma;
+    real temp = 0.5f*sigma12*sigma12*sigma12*rInv2;
+    real3 dudr = (drUnit + (kappa-drUnit*dot(kappa, drUnit))*temp)*dUSLJdr;
+    real3 dchidr = (iota-drUnit*dot(iota, drUnit))*(-8*rInv2*SQRT(chi));
+    real3 force = (dchidr*u + dudr*chi)*(eta*switchValue) - drUnit*(energy*switchDeriv);
+    *force1 += force;
+    *force2 -= force;
+
+    // Compute the terms needed for the torque.  The two passes of this loop handle the
+    // first and second particle respectively.
+
+    for (int j = 0; j < 2; j++) {
+        real (*a)[3] = (j == 0 ? data1->a : data2->a);
+        real (*b)[3] = (j == 0 ? data1->b : data2->b);
+        real (*g)[3] = (j == 0 ? data1->g : data2->g);
+        float4 sig = (j == 0 ? data1->sig : data2->sig);
+        real3 dudq = cross(vectorMatrixProduct(kappa, g), kappa*(temp*dUSLJdr));
+        real3 dchidq = cross(vectorMatrixProduct(iota, b), iota)*(-4*rInv2);
+        real3 scale = make_real3(sig.y, sig.z, sig.w)*(-0.5f*eta/detG12);
+        real d[3][3];
+        d[0][0] = scale.x*(2*a[0][0]*(G12[1][1]*G12[2][2] - G12[1][2]*G12[2][1]) +
+                             a[0][2]*(G12[1][2]*G12[0][1] + G12[1][0]*G12[2][1] - G12[1][1]*(G12[0][2] + G12[2][0])) +
+                             a[0][1]*(G12[0][2]*G12[2][1] + G12[2][0]*G12[1][2] - G12[2][2]*(G12[0][1] + G12[1][0])));
+        d[0][1] = scale.x*(  a[0][0]*(G12[0][2]*G12[2][1] + G12[2][0]*G12[1][2] - G12[2][2]*(G12[0][1] + G12[1][0])) +
+                           2*a[0][1]*(G12[0][0]*G12[2][2] - G12[2][0]*G12[0][2]) +
+                             a[0][2]*(G12[1][0]*G12[0][2] + G12[2][0]*G12[0][1] - G12[0][0]*(G12[1][2] + G12[2][1])));
+        d[0][2] = scale.x*(  a[0][0]*(G12[0][1]*G12[1][2] + G12[1][0]*G12[2][1] - G12[1][1]*(G12[0][2] + G12[2][0])) +
+                             a[0][1]*(G12[1][0]*G12[0][2] + G12[2][0]*G12[0][1] - G12[0][0]*(G12[1][2] + G12[2][1])) +
+                           2*a[0][2]*(G12[1][1]*G12[0][0] - G12[1][0]*G12[0][1]));
+        d[1][0] = scale.y*(2*a[1][0]*(G12[1][1]*G12[2][2] - G12[1][2]*G12[2][1]) +
+                             a[1][1]*(G12[0][2]*G12[2][1] + G12[2][0]*G12[1][2] - G12[2][2]*(G12[0][1] + G12[1][0])) +
+                             a[1][2]*(G12[1][2]*G12[0][1] + G12[1][0]*G12[2][1] - G12[1][1]*(G12[0][2] + G12[2][0])));
+        d[1][1] = scale.y*(  a[1][0]*(G12[0][2]*G12[2][1] + G12[2][0]*G12[1][2] - G12[2][2]*(G12[0][1] + G12[1][0])) +
+                           2*a[1][1]*(G12[2][2]*G12[0][0] - G12[2][0]*G12[0][2]) +
+                             a[1][2]*(G12[1][0]*G12[0][2] + G12[0][1]*G12[2][0] - G12[0][0]*(G12[1][2] + G12[2][1])));
+        d[1][2] = scale.y*(  a[1][0]*(G12[0][1]*G12[1][2] + G12[1][0]*G12[2][1] - G12[1][1]*(G12[0][2] + G12[2][0])) +
+                             a[1][1]*(G12[1][0]*G12[0][2] + G12[0][1]*G12[2][0] - G12[0][0]*(G12[1][2] + G12[2][1])) +
+                           2*a[1][2]*(G12[1][1]*G12[0][0] - G12[1][0]*G12[0][1]));
+        d[2][0] = scale.z*(2*a[2][0]*(G12[1][1]*G12[2][2] - G12[2][1]*G12[1][2]) +
+                             a[2][1]*(G12[0][2]*G12[2][1] + G12[1][2]*G12[2][0] - G12[2][2]*(G12[0][1] + G12[1][0])) +
+                             a[2][2]*(G12[0][1]*G12[1][2] + G12[2][1]*G12[1][0] - G12[1][1]*(G12[0][2] + G12[2][0])));
+        d[2][1] = scale.z*(  a[2][0]*(G12[0][2]*G12[2][1] + G12[1][2]*G12[2][0] - G12[2][2]*(G12[0][1] + G12[1][0])) +
+                           2*a[2][1]*(G12[0][0]*G12[2][2] - G12[0][2]*G12[2][0]) +
+                             a[2][2]*(G12[1][0]*G12[0][2] + G12[0][1]*G12[2][0] - G12[0][0]*(G12[1][2] + G12[2][1])));
+        d[2][2] = scale.z*(  a[2][0]*(G12[0][1]*G12[1][2] + G12[2][1]*G12[1][0] - G12[1][1]*(G12[0][2] + G12[2][0])) +
+                             a[2][1]*(G12[1][0]*G12[0][2] + G12[2][0]*G12[0][1] - G12[0][0]*(G12[1][2] + G12[2][1])) +
+                           2*a[2][2]*(G12[1][1]*G12[0][0] - G12[1][0]*G12[0][1]));
+
+        // CUDA vector types cannot be initialized from a scalar, so build the zero vector explicitly.
+
+        real3 detadq = make_real3(0, 0, 0);
+        for (int i = 0; i < 3; i++)
+            detadq += cross(make_real3(a[i][0], a[i][1], a[i][2]), make_real3(d[i][0], d[i][1], d[i][2]));
+        real3 torque = (dchidq*(u*eta) + detadq*(u*chi) + dudq*(eta*chi))*switchValue;
+        *(j == 0 ? torque1 : torque2) -= torque;
+    }
+    *totalEnergy += switchValue*energy;
+}
+
+/**
+ * Atomically add a vector to a buffer of 64 bit fixed point values (scaled by 2^32).  The
+ * x, y and z components of element i are stored at i, i+PADDED_NUM_ATOMS and i+2*PADDED_NUM_ATOMS.
+ */
+__device__ void gayBerneAddFixedPoint(unsigned long long* __restrict__ buffer, int index, real3 value) {
+    atomicAdd(&buffer[index], static_cast<unsigned long long>((long long) (value.x*0x100000000)));
+    atomicAdd(&buffer[index+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (value.y*0x100000000)));
+    atomicAdd(&buffer[index+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (value.z*0x100000000)));
+}
+
+/**
+ * Compute the interactions.
+ *
+ * With USE_CUTOFF defined, each thread processes whole blocks of the neighbor list written by
+ * findNeighbors().  If the list overflowed (more blocks were requested than maxNeighborBlocks)
+ * the kernel returns immediately without computing anything.  Without USE_CUTOFF, every pair
+ * atom1 < atom2 is processed except those listed in exclusions.  In both cases the exceptions
+ * are then processed as a separate list of pairs with their own sigma and epsilon.
+ *
+ * Forces and torques are accumulated as 64 bit fixed point values, which must be declared with
+ * a 64 bit type that atomicAdd() accepts.  The energy computed by this thread is added to its
+ * own element of energyBuffer.
+ *
+ * @param forceBuffers        fixed point force accumulators, indexed by original atom index
+ * @param torqueBuffers       fixed point torque accumulators, indexed by original atom index
+ * @param numAtoms            the number of atoms to process
+ * @param numExceptions       the number of entries in exceptionParticles and exceptionParams
+ * @param energyBuffer        per-thread energy accumulators
+ * @param pos                 atom positions, indexed by sorted atom index
+ * @param sortedAtoms         maps sorted atom index to original atom index
+ * @param exceptionParticles  for each exception: x and y are the original indices of the two
+ *                            atoms, z and w are their sorted indices
+ * @param exceptionParams     for each exception: x is sigma, y is epsilon
+ */
+extern "C" __global__ void computeForce(
+        unsigned long long* __restrict__ forceBuffers, unsigned long long* __restrict__ torqueBuffers,
+        int numAtoms, int numExceptions, mixed* __restrict__ energyBuffer, const real4* __restrict__ pos,
+        const float4* __restrict__ sigParams, const float2* __restrict__ epsParams, const int* __restrict__ sortedAtoms,
+        const real* __restrict__ aMatrix, const real* __restrict__ bMatrix, const real* __restrict__ gMatrix,
+        const int* __restrict__ exclusions, const int* __restrict__ exclusionStartIndex,
+        const int4* __restrict__ exceptionParticles, const float2* __restrict__ exceptionParams
+#ifdef USE_CUTOFF
+        , int maxNeighborBlocks, int* __restrict__ neighbors, int* __restrict__ neighborIndex, int* __restrict__ neighborBlockCount,
+        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
+#endif
+        ) {
+    mixed energy = 0;
+#ifdef USE_CUTOFF
+    const int numBlocks = *neighborBlockCount;
+    if (numBlocks > maxNeighborBlocks)
+        return; // There wasn't enough memory for the neighbor list.
+    for (int block = blockIdx.x*blockDim.x+threadIdx.x; block < numBlocks; block += blockDim.x*gridDim.x) {
+        // Load parameters for atom1.
+
+        int atom1 = neighborIndex[block];
+        int index1 = sortedAtoms[atom1];
+        AtomData data1;
+        loadAtomData(&data1, atom1, index1, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix);
+        real3 force1 = make_real3(0, 0, 0);
+        real3 torque1 = make_real3(0, 0, 0);
+        for (int indexInBlock = 0; indexInBlock < NEIGHBOR_BLOCK_SIZE; indexInBlock++) {
+            // Load parameters for atom2.
+
+            int atom2 = neighbors[NEIGHBOR_BLOCK_SIZE*block+indexInBlock];
+            if (atom2 == -1)
+                continue;
+            int index2 = sortedAtoms[atom2];
+            AtomData data2;
+            loadAtomData(&data2, atom2, index2, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix);
+            real3 force2 = make_real3(0, 0, 0);
+            real3 torque2 = make_real3(0, 0, 0);
+
+            // Compute the interaction.
+
+            real3 delta = data1.pos-data2.pos;
+#ifdef USE_PERIODIC
+            APPLY_PERIODIC_TO_DELTA(delta)
+#endif
+            real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+            real sigma = data1.sig.x+data2.sig.x;
+            real epsilon = data1.eps.x*data2.eps.x;
+            computeOneInteraction(&data1, &data2, sigma, epsilon, delta, r2, &force1, &force2, &torque1, &torque2, &energy);
+            gayBerneAddFixedPoint(forceBuffers, index2, force2);
+            gayBerneAddFixedPoint(torqueBuffers, index2, torque2);
+        }
+        gayBerneAddFixedPoint(forceBuffers, index1, force1);
+        gayBerneAddFixedPoint(torqueBuffers, index1, torque1);
+    }
+#else
+    for (int atom1 = blockIdx.x*blockDim.x+threadIdx.x; atom1 < numAtoms; atom1 += blockDim.x*gridDim.x) {
+        // Load parameters for atom1.
+
+        int index1 = sortedAtoms[atom1];
+        AtomData data1;
+        loadAtomData(&data1, atom1, index1, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix);
+        real3 force1 = make_real3(0, 0, 0);
+        real3 torque1 = make_real3(0, 0, 0);
+        int nextExclusion = exclusionStartIndex[atom1];
+        int lastExclusion = exclusionStartIndex[atom1+1];
+        for (int atom2 = atom1+1; atom2 < numAtoms; atom2++) {
+            // Skip over excluded interactions.
+
+            if (nextExclusion < lastExclusion && exclusions[nextExclusion] == atom2) {
+                nextExclusion++;
+                continue;
+            }
+
+            // Load parameters for atom2.
+
+            int index2 = sortedAtoms[atom2];
+            AtomData data2;
+            loadAtomData(&data2, atom2, index2, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix);
+            real3 force2 = make_real3(0, 0, 0);
+            real3 torque2 = make_real3(0, 0, 0);
+
+            // Compute the interaction.
+
+            real3 delta = data1.pos-data2.pos;
+            real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+            real sigma = data1.sig.x+data2.sig.x;
+            real epsilon = data1.eps.x*data2.eps.x;
+            computeOneInteraction(&data1, &data2, sigma, epsilon, delta, r2, &force1, &force2, &torque1, &torque2, &energy);
+            gayBerneAddFixedPoint(forceBuffers, index2, force2);
+            gayBerneAddFixedPoint(torqueBuffers, index2, torque2);
+        }
+        gayBerneAddFixedPoint(forceBuffers, index1, force1);
+        gayBerneAddFixedPoint(torqueBuffers, index1, torque1);
+    }
+#endif
+
+    // Now compute exceptions.
+
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numExceptions; index += blockDim.x*gridDim.x) {
+        int4 atomIndices = exceptionParticles[index];
+        float2 params = exceptionParams[index];
+        int index1 = atomIndices.x, index2 = atomIndices.y;
+        int atom1 = atomIndices.z, atom2 = atomIndices.w;
+        AtomData data1, data2;
+        loadAtomData(&data1, atom1, index1, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix);
+        loadAtomData(&data2, atom2, index2, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix);
+        real3 force1 = make_real3(0, 0, 0), force2 = make_real3(0, 0, 0);
+        real3 torque1 = make_real3(0, 0, 0), torque2 = make_real3(0, 0, 0);
+        real3 delta = data1.pos-data2.pos;
+
+        // Use the same minimum image displacement as for ordinary pairs.  The periodic box
+        // arguments only exist when USE_CUTOFF is defined, hence the double guard.
+
+#if defined(USE_CUTOFF) && defined(USE_PERIODIC)
+        APPLY_PERIODIC_TO_DELTA(delta)
+#endif
+        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+        if (r2 < CUTOFF_SQUARED) {
+#endif
+            computeOneInteraction(&data1, &data2, params.x, params.y, delta, r2, &force1, &force2, &torque1, &torque2, &energy);
+            gayBerneAddFixedPoint(forceBuffers, index1, force1);
+            gayBerneAddFixedPoint(forceBuffers, index2, force2);
+            gayBerneAddFixedPoint(torqueBuffers, index1, torque1);
+            gayBerneAddFixedPoint(torqueBuffers, index2, torque2);
+#ifdef USE_CUTOFF
+        }
+#endif
+    }
+    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
+}
+
+/**
+ * Convert the torques to forces on the connected particles.
+ *
+ * For every particle that has an x axis particle (axisParticleIndices[i].x != -1), the torque
+ * accumulated in torqueBuffers is read back from fixed point and replaced by an equivalent set
+ * of forces: one on the x axis particle, optionally one on the y axis particle, and the
+ * opposite of their sum on the particle itself.  Particles with no x axis particle are skipped.
+ *
+ * torqueBuffers is read as signed 64 bit fixed point (scaled by 2^32); forceBuffers is updated
+ * with atomicAdd(), which requires it to be declared as unsigned long long.
+ *
+ * @param forceBuffers         fixed point force accumulators, indexed by original atom index
+ * @param torqueBuffers        fixed point torques, indexed by original atom index
+ * @param numParticles         the number of entries in sortedParticles
+ * @param posq                 atom positions, indexed by original atom index
+ * @param axisParticleIndices  for each atom, the indices of its x and y axis particles, or -1
+ * @param sortedParticles      maps sorted particle index to original atom index
+ */
+extern "C" __global__ void applyTorques(
+        unsigned long long* __restrict__ forceBuffers, const long long* __restrict__ torqueBuffers,
+        int numParticles, const real4* __restrict__ posq, int2* const __restrict__ axisParticleIndices,
+        const int* sortedParticles) {
+    for (int sortedIndex = blockIdx.x*blockDim.x+threadIdx.x; sortedIndex < numParticles; sortedIndex += blockDim.x*gridDim.x) {
+        int originalIndex = sortedParticles[sortedIndex];
+        real3 pos = trimTo3(posq[originalIndex]);
+        int2 axisParticles = axisParticleIndices[originalIndex];
+        if (axisParticles.x != -1) {
+            // Load the torque.
+
+            real scale = 1/(real) 0x100000000;
+            real3 torque = make_real3(scale*torqueBuffers[originalIndex], scale*torqueBuffers[originalIndex+PADDED_NUM_ATOMS], scale*torqueBuffers[originalIndex+2*PADDED_NUM_ATOMS]);
+            real3 force = make_real3(0, 0, 0), xforce = make_real3(0, 0, 0), yforce = make_real3(0, 0, 0);
+
+            // Apply a force to the x particle.
+
+            real3 dx = trimTo3(posq[axisParticles.x])-pos;
+            real dx2 = dot(dx, dx);
+            real3 f = cross(torque, dx)/dx2;
+            xforce += f;
+            force -= f;
+            if (axisParticles.y != -1) {
+                // Apply a force to the y particle.  This is based on the component of the torque
+                // that was not already applied to the x particle.
+
+                real3 dy = trimTo3(posq[axisParticles.y])-pos;
+                real dy2 = dot(dy, dy);
+                real3 torque2 = dx*dot(torque, dx)/dx2;
+                f = cross(torque2, dy)/dy2;
+                yforce += f;
+                force -= f;
+            }
+            atomicAdd(&forceBuffers[originalIndex], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
+            atomicAdd(&forceBuffers[originalIndex+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
+            atomicAdd(&forceBuffers[originalIndex+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
+            atomicAdd(&forceBuffers[axisParticles.x], static_cast<unsigned long long>((long long) (xforce.x*0x100000000)));
+            atomicAdd(&forceBuffers[axisParticles.x+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (xforce.y*0x100000000)));
+            atomicAdd(&forceBuffers[axisParticles.x+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (xforce.z*0x100000000)));
+            if (axisParticles.y != -1) {
+                atomicAdd(&forceBuffers[axisParticles.y], static_cast<unsigned long long>((long long) (yforce.x*0x100000000)));
+                atomicAdd(&forceBuffers[axisParticles.y+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (yforce.y*0x100000000)));
+                atomicAdd(&forceBuffers[axisParticles.y+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (yforce.z*0x100000000)));
+            }
+        }
+    }
+}
--- a/platforms/cuda/tests/TestCudaGayBerneForce.cpp
+++ b/platforms/cuda/tests/TestCudaGayBerneForce.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2016 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "CudaTests.h"
+#include "TestGayBerneForce.h"
+
+// Hook for tests that are specific to the CUDA platform.  It is intentionally empty: no
+// platform-specific tests are defined here.  (Presumably called by the shared test driver in
+// TestGayBerneForce.h - confirm.)
+void runPlatformTests() {
+}