merge

1ebe88ba · Robert McGibbon · 804cbb22 · a37dbc96 · 1ebe88ba · 1ebe88ba
Commit 1ebe88ba authored Jul 24, 2015 by Robert McGibbon
11 changed files
--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -43,6 +43,7 @@
 #include <algorithm>
 #include <fstream>
 #include <iostream>
+#include <set>
 #include <sstream>
 #include <typeinfo>

@@ -492,13 +493,32 @@ void OpenCLContext::addForce(OpenCLForceInfo* force) {
 }

 string OpenCLContext::replaceStrings(const string& input, const std::map<std::string, std::string>& replacements) const {
+    static set<char> symbolChars;
+    if (symbolChars.size() == 0) {
+        symbolChars.insert('_');
+        for (char c = 'a'; c <= 'z'; c++)
+            symbolChars.insert(c);
+        for (char c = 'A'; c <= 'Z'; c++)
+            symbolChars.insert(c);
+        for (char c = '0'; c <= '9'; c++)
+            symbolChars.insert(c);
+    }
    string result = input;
    for (map<string, string>::const_iterator iter = replacements.begin(); iter != replacements.end(); iter++) {
-        int index = -1;
+        int index = 0;
+        int size = iter->first.size();
        do {
-            index = result.find(iter->first);
-            if (index != result.npos)
-                result.replace(index, iter->first.size(), iter->second);
+            index = result.find(iter->first, index);
+            if (index != result.npos) {
+                if ((index == 0 || symbolChars.find(result[index-1]) == symbolChars.end()) && (index == result.size()-size || symbolChars.find(result[index+size]) == symbolChars.end())) {
+                    // We have found a complete symbol, not part of a longer symbol.
+                    
+                    result.replace(index, size, iter->second);
+                    index += iter->second.size();
+                }
+                else
+                    index++;
+            }
        } while (index != result.npos);
    }
    return result;
@@ -1130,7 +1150,7 @@ void OpenCLContext::reorderAtomsImpl() {
        if (useHilbert)
            binWidth = (Real) (max(max(maxx-minx, maxy-miny), maxz-minz)/255.0);
        else
-            binWidth = (Real) (0.2*nonbonded->getCutoffDistance());
+            binWidth = (Real) (0.2*nonbonded->getMaxCutoffDistance());
        Real invBinWidth = (Real) (1.0/binWidth);
        int xbins = 1 + (int) ((maxx-minx)*invBinWidth);
        int ybins = 1 + (int) ((maxy-miny)*invBinWidth);

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -121,16 +121,13 @@ void OpenCLCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, boo
    for (vector<OpenCLContext::ForcePreComputation*>::iterator iter = cl.getPreComputations().begin(); iter != cl.getPreComputations().end(); ++iter)
        (*iter)->computeForceAndEnergy(includeForces, includeEnergy, groups);
    OpenCLNonbondedUtilities& nb = cl.getNonbondedUtilities();
-    bool includeNonbonded = ((groups&(1<<nb.getForceGroup())) != 0);
    cl.setComputeForceCount(cl.getComputeForceCount()+1);
-    if (includeNonbonded)
-        nb.prepareInteractions();
+    nb.prepareInteractions(groups);
 }

 double OpenCLCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForces, bool includeEnergy, int groups, bool& valid) {
    cl.getBondedUtilities().computeInteractions(groups);
-    if ((groups&(1<<cl.getNonbondedUtilities().getForceGroup())) != 0)
-        cl.getNonbondedUtilities().computeInteractions();
+    cl.getNonbondedUtilities().computeInteractions(groups);
    double sum = 0.0;
    for (vector<OpenCLContext::ForcePostComputation*>::iterator iter = cl.getPostComputations().begin(); iter != cl.getPostComputations().end(); ++iter)
        sum += (*iter)->computeForceAndEnergy(includeForces, includeEnergy, groups);
@@ -2643,8 +2640,9 @@ void OpenCLCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOB
    surfaceAreaFactor = -6.0*4*M_PI*force.getSurfaceAreaEnergy();
    bool useCutoff = (force.getNonbondedMethod() != GBSAOBCForce::NoCutoff);
    bool usePeriodic = (force.getNonbondedMethod() != GBSAOBCForce::NoCutoff && force.getNonbondedMethod() != GBSAOBCForce::CutoffNonPeriodic);
+    cutoff = force.getCutoffDistance();
    string source = OpenCLKernelSources::gbsaObc2;
-    nb.addInteraction(useCutoff, usePeriodic, false, force.getCutoffDistance(), vector<vector<int> >(), source, force.getForceGroup());
+    nb.addInteraction(useCutoff, usePeriodic, false, cutoff, vector<vector<int> >(), source, force.getForceGroup());
    nb.addParameter(OpenCLNonbondedUtilities::ParameterInfo("obcParams", "float", 2, sizeof(cl_float2), params->getDeviceBuffer()));;
    nb.addParameter(OpenCLNonbondedUtilities::ParameterInfo("bornForce", "real", 1, elementSize, bornForce->getDeviceBuffer()));;
    cl.addForce(new OpenCLGBSAOBCForceInfo(nb.getNumForceBuffers(), force));
@@ -2663,8 +2661,8 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
            defines["USE_CUTOFF"] = "1";
        if (nb.getUsePeriodic())
            defines["USE_PERIODIC"] = "1";
-        defines["CUTOFF_SQUARED"] = cl.doubleToString(nb.getCutoffDistance()*nb.getCutoffDistance());
-        defines["CUTOFF"] = cl.doubleToString(nb.getCutoffDistance());
+        defines["CUTOFF_SQUARED"] = cl.doubleToString(cutoff*cutoff);
+        defines["CUTOFF"] = cl.doubleToString(cutoff);
        defines["PREFACTOR"] = cl.doubleToString(prefactor);
        defines["SURFACE_AREA_FACTOR"] = cl.doubleToString(surfaceAreaFactor);
        defines["NUM_ATOMS"] = cl.intToString(cl.getNumAtoms());
@@ -2856,6 +2854,7 @@ OpenCLCalcCustomGBForceKernel::~OpenCLCalcCustomGBForceKernel() {
 void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const CustomGBForce& force) {
    if (cl.getPlatformData().contexts.size() > 1)
        throw OpenMMException("CustomGBForce does not support using multiple OpenCL devices");
+    cutoff = force.getCutoffDistance();
    bool useExclusionsForValue = false;
    numComputedValues = force.getNumComputedValues();
    vector<string> computedValueNames(force.getNumComputedValues());
@@ -3047,7 +3046,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
        if (useExclusionsForValue)
            pairValueDefines["USE_EXCLUSIONS"] = "1";
        pairValueDefines["FORCE_WORK_GROUP_SIZE"] = cl.intToString(cl.getNonbondedUtilities().getForceThreadBlockSize());
-        pairValueDefines["CUTOFF_SQUARED"] = cl.doubleToString(force.getCutoffDistance()*force.getCutoffDistance());
+        pairValueDefines["CUTOFF_SQUARED"] = cl.doubleToString(cutoff*cutoff);
        pairValueDefines["NUM_ATOMS"] = cl.intToString(cl.getNumAtoms());
        pairValueDefines["PADDED_NUM_ATOMS"] = cl.intToString(cl.getPaddedNumAtoms());
        pairValueDefines["NUM_BLOCKS"] = cl.intToString(cl.getNumAtomBlocks());
@@ -3240,7 +3239,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
        if (anyExclusions)
            pairEnergyDefines["USE_EXCLUSIONS"] = "1";
        pairEnergyDefines["FORCE_WORK_GROUP_SIZE"] = cl.intToString(cl.getNonbondedUtilities().getForceThreadBlockSize());
-        pairEnergyDefines["CUTOFF_SQUARED"] = cl.doubleToString(force.getCutoffDistance()*force.getCutoffDistance());
+        pairEnergyDefines["CUTOFF_SQUARED"] = cl.doubleToString(cutoff*cutoff);
        pairEnergyDefines["NUM_ATOMS"] = cl.intToString(cl.getNumAtoms());
        pairEnergyDefines["PADDED_NUM_ATOMS"] = cl.intToString(cl.getPaddedNumAtoms());
        pairEnergyDefines["NUM_BLOCKS"] = cl.intToString(cl.getNumAtomBlocks());
@@ -3492,7 +3491,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
            globals->upload(globalParamValues);
            arguments.push_back(OpenCLNonbondedUtilities::ParameterInfo(prefix+"globals", "float", 1, sizeof(cl_float), globals->getDeviceBuffer()));
        }
-        cl.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, force.getNumExclusions() > 0, force.getCutoffDistance(), exclusionList, source, force.getForceGroup());
+        cl.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, force.getNumExclusions() > 0, cutoff, exclusionList, source, force.getForceGroup());
        for (int i = 0; i < (int) parameters.size(); i++)
            cl.getNonbondedUtilities().addParameter(parameters[i]);
        for (int i = 0; i < (int) arguments.size(); i++)
@@ -3527,7 +3526,7 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
            int endExclusionIndex = (cl.getContextIndex()+1)*numExclusionTiles/numContexts;
            pairValueDefines["FIRST_EXCLUSION_TILE"] = cl.intToString(startExclusionIndex);
            pairValueDefines["LAST_EXCLUSION_TILE"] = cl.intToString(endExclusionIndex);
-            pairValueDefines["CUTOFF"] = cl.doubleToString(nb.getCutoffDistance());
+            pairValueDefines["CUTOFF"] = cl.doubleToString(cutoff);
            cl::Program program = cl.createProgram(pairValueSrc, pairValueDefines);
            pairValueKernel = cl::Kernel(program, "computeN2Value");
            pairValueSrc = "";
@@ -3541,7 +3540,7 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
            int endExclusionIndex = (cl.getContextIndex()+1)*numExclusionTiles/numContexts;
            pairEnergyDefines["FIRST_EXCLUSION_TILE"] = cl.intToString(startExclusionIndex);
            pairEnergyDefines["LAST_EXCLUSION_TILE"] = cl.intToString(endExclusionIndex);
-            pairEnergyDefines["CUTOFF"] = cl.doubleToString(nb.getCutoffDistance());
+            pairEnergyDefines["CUTOFF"] = cl.doubleToString(cutoff);
            cl::Program program = cl.createProgram(pairEnergySrc, pairEnergyDefines);
            pairEnergyKernel = cl::Kernel(program, "computeN2Energy");
            pairEnergySrc = "";

--- a/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+++ b/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
@@ -54,10 +54,10 @@ private:
    bool useDouble;
 };

-OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : context(context), cutoff(-1.0), useCutoff(false), usePeriodic(false), anyExclusions(false), usePadding(true),
+OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : context(context), useCutoff(false), usePeriodic(false), anyExclusions(false), usePadding(true),
        numForceBuffers(0), exclusionIndices(NULL), exclusionRowIndices(NULL), exclusionTiles(NULL), exclusions(NULL), interactingTiles(NULL), interactingAtoms(NULL),
        interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), sortedBlocks(NULL), sortedBlockCenter(NULL), sortedBlockBoundingBox(NULL),
-        oldPositions(NULL), rebuildNeighborList(NULL), blockSorter(NULL), nonbondedForceGroup(0) {
+        oldPositions(NULL), rebuildNeighborList(NULL), blockSorter(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0) {
    // Decide how many thread blocks and force buffers to use.

    deviceIsCpu = (context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
@@ -126,24 +126,28 @@ OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() {
 }

 void OpenCLNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup) {
-    if (cutoff != -1.0) {
+    if (groupCutoff.size() > 0) {
        if (usesCutoff != useCutoff)
            throw OpenMMException("All Forces must agree on whether to use a cutoff");
        if (usesPeriodic != usePeriodic)
            throw OpenMMException("All Forces must agree on whether to use periodic boundary conditions");
-        if (cutoffDistance != cutoff)
-            throw OpenMMException("All Forces must use the same cutoff distance");
-        if (forceGroup != nonbondedForceGroup)
-            throw OpenMMException("All nonbonded forces must be in the same force group");
+        if (usesCutoff && groupCutoff.find(forceGroup) != groupCutoff.end() && groupCutoff[forceGroup] != cutoffDistance)
+            throw OpenMMException("All Forces in a single force group must use the same cutoff distance");
    }
    if (usesExclusions)
        requestExclusions(exclusionList);
    useCutoff = usesCutoff;
    usePeriodic = usesPeriodic;
-    cutoff = cutoffDistance;
-    if (kernel.size() > 0)
-        kernelSource += kernel+"\n";
-    nonbondedForceGroup = forceGroup;
+    groupCutoff[forceGroup] = cutoffDistance;
+    groupFlags |= 1<<forceGroup;
+    if (kernel.size() > 0) {
+        if (groupKernelSource.find(forceGroup) == groupKernelSource.end())
+            groupKernelSource[forceGroup] = "";
+        map<string, string> replacements;
+        replacements["CUTOFF"] = "CUTOFF_"+context.intToString(forceGroup);
+        replacements["CUTOFF_SQUARED"] = "CUTOFF_"+context.intToString(forceGroup)+"_SQUARED";
+        groupKernelSource[forceGroup] += context.replaceStrings(kernel, replacements)+"\n";
+    }
 }

 void OpenCLNonbondedUtilities::addParameter(const ParameterInfo& parameter) {
@@ -228,6 +232,9 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
        exclusionIndicesVec.insert(exclusionIndicesVec.end(), exclusionBlocksForBlock[i].begin(), exclusionBlocksForBlock[i].end());
        exclusionRowIndicesVec[i+1] = exclusionIndicesVec.size();
    }
+    maxExclusions = 0;
+    for (int i = 0; i < (int) exclusionBlocksForBlock.size(); i++)
+        maxExclusions = (maxExclusions > exclusionBlocksForBlock[i].size() ? maxExclusions : exclusionBlocksForBlock[i].size());
    exclusionIndices = OpenCLArray::create<cl_uint>(context, exclusionIndicesVec.size(), "exclusionIndices");
    exclusionRowIndices = OpenCLArray::create<cl_uint>(context, exclusionRowIndicesVec.size(), "exclusionRowIndices");
    exclusionIndices->upload(exclusionIndicesVec);
@@ -287,80 +294,6 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
        vector<cl_uint> count(1, 0);
        interactionCount->upload(count);
    }
-
-    // Create kernels.
-
-    if (kernelSource.size() > 0)
-        forceKernel = createInteractionKernel(kernelSource, parameters, arguments, true, true);
-    if (useCutoff) {
-        double padding = (usePadding ? 0.1*cutoff : 0.0);
-        double paddedCutoff = cutoff+padding;
-        map<string, string> defines;
-        defines["TILE_SIZE"] = context.intToString(OpenCLContext::TileSize);
-        defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms());
-        defines["PADDING"] = context.doubleToString(padding);
-        defines["PADDED_CUTOFF"] = context.doubleToString(paddedCutoff);
-        defines["PADDED_CUTOFF_SQUARED"] = context.doubleToString(paddedCutoff*paddedCutoff);
-        defines["NUM_TILES_WITH_EXCLUSIONS"] = context.intToString(exclusionTiles->getSize());
-        defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
-        defines["SIMD_WIDTH"] = context.intToString(context.getSIMDWidth());
-        if (usePeriodic)
-            defines["USE_PERIODIC"] = "1";
-        int maxExclusions = 0;
-        for (int i = 0; i < (int) exclusionBlocksForBlock.size(); i++)
-            maxExclusions = (maxExclusions > exclusionBlocksForBlock[i].size() ? maxExclusions : exclusionBlocksForBlock[i].size());
-        defines["MAX_EXCLUSIONS"] = context.intToString(maxExclusions);
-        defines["BUFFER_GROUPS"] = (deviceIsCpu ? "4" : "2");
-        string file = (deviceIsCpu ? OpenCLKernelSources::findInteractingBlocks_cpu : OpenCLKernelSources::findInteractingBlocks);
-        int groupSize = (deviceIsCpu || context.getSIMDWidth() < 32 ? 32 : 256);
-        while (true) {
-            defines["GROUP_SIZE"] = context.intToString(groupSize);
-            cl::Program interactingBlocksProgram = context.createProgram(file, defines);
-            findBlockBoundsKernel = cl::Kernel(interactingBlocksProgram, "findBlockBounds");
-            findBlockBoundsKernel.setArg<cl_int>(0, context.getNumAtoms());
-            findBlockBoundsKernel.setArg<cl::Buffer>(6, context.getPosq().getDeviceBuffer());
-            findBlockBoundsKernel.setArg<cl::Buffer>(7, blockCenter->getDeviceBuffer());
-            findBlockBoundsKernel.setArg<cl::Buffer>(8, blockBoundingBox->getDeviceBuffer());
-            findBlockBoundsKernel.setArg<cl::Buffer>(9, rebuildNeighborList->getDeviceBuffer());
-            findBlockBoundsKernel.setArg<cl::Buffer>(10, sortedBlocks->getDeviceBuffer());
-            sortBoxDataKernel = cl::Kernel(interactingBlocksProgram, "sortBoxData");
-            sortBoxDataKernel.setArg<cl::Buffer>(0, sortedBlocks->getDeviceBuffer());
-            sortBoxDataKernel.setArg<cl::Buffer>(1, blockCenter->getDeviceBuffer());
-            sortBoxDataKernel.setArg<cl::Buffer>(2, blockBoundingBox->getDeviceBuffer());
-            sortBoxDataKernel.setArg<cl::Buffer>(3, sortedBlockCenter->getDeviceBuffer());
-            sortBoxDataKernel.setArg<cl::Buffer>(4, sortedBlockBoundingBox->getDeviceBuffer());
-            sortBoxDataKernel.setArg<cl::Buffer>(5, context.getPosq().getDeviceBuffer());
-            sortBoxDataKernel.setArg<cl::Buffer>(6, oldPositions->getDeviceBuffer());
-            sortBoxDataKernel.setArg<cl::Buffer>(7, interactionCount->getDeviceBuffer());
-            sortBoxDataKernel.setArg<cl::Buffer>(8, rebuildNeighborList->getDeviceBuffer());
-            sortBoxDataKernel.setArg<cl_int>(9, true);
-            findInteractingBlocksKernel = cl::Kernel(interactingBlocksProgram, "findBlocksWithInteractions");
-            findInteractingBlocksKernel.setArg<cl::Buffer>(5, interactionCount->getDeviceBuffer());
-            findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactingTiles->getDeviceBuffer());
-            findInteractingBlocksKernel.setArg<cl::Buffer>(7, interactingAtoms->getDeviceBuffer());
-            findInteractingBlocksKernel.setArg<cl::Buffer>(8, context.getPosq().getDeviceBuffer());
-            findInteractingBlocksKernel.setArg<cl_uint>(9, interactingTiles->getSize());
-            findInteractingBlocksKernel.setArg<cl_uint>(10, startBlockIndex);
-            findInteractingBlocksKernel.setArg<cl_uint>(11, numBlocks);
-            findInteractingBlocksKernel.setArg<cl::Buffer>(12, sortedBlocks->getDeviceBuffer());
-            findInteractingBlocksKernel.setArg<cl::Buffer>(13, sortedBlockCenter->getDeviceBuffer());
-            findInteractingBlocksKernel.setArg<cl::Buffer>(14, sortedBlockBoundingBox->getDeviceBuffer());
-            findInteractingBlocksKernel.setArg<cl::Buffer>(15, exclusionIndices->getDeviceBuffer());
-            findInteractingBlocksKernel.setArg<cl::Buffer>(16, exclusionRowIndices->getDeviceBuffer());
-            findInteractingBlocksKernel.setArg<cl::Buffer>(17, oldPositions->getDeviceBuffer());
-            findInteractingBlocksKernel.setArg<cl::Buffer>(18, rebuildNeighborList->getDeviceBuffer());
-            if (findInteractingBlocksKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()) < groupSize) {
-                // The device can't handle this block size, so reduce it.
-                
-                groupSize -= 32;
-                if (groupSize < 32)
-                    throw OpenMMException("Failed to create findInteractingBlocks kernel");
-                continue;
-            }
-            break;
-        }
-        interactingBlocksThreadBlockSize = (deviceIsCpu ? 1 : groupSize);
-    }
 }

 static void setPeriodicBoxArgs(OpenCLContext& cl, cl::Kernel& kernel, int index) {
@@ -380,34 +313,53 @@ static void setPeriodicBoxArgs(OpenCLContext& cl, cl::Kernel& kernel, int index)
    }
 }

-void OpenCLNonbondedUtilities::prepareInteractions() {
+double OpenCLNonbondedUtilities::getMaxCutoffDistance() {
+    double cutoff = 0.0;
+    for (map<int, double>::const_iterator iter = groupCutoff.begin(); iter != groupCutoff.end(); ++iter)
+        cutoff = max(cutoff, iter->second);
+    return cutoff;
+}
+
+void OpenCLNonbondedUtilities::prepareInteractions(int forceGroups) {
+    if ((forceGroups&groupFlags) == 0)
+        return;
+    if (groupKernels.find(forceGroups) == groupKernels.end())
+        createKernelsForGroups(forceGroups);
    if (!useCutoff)
        return;
    if (numTiles == 0)
        return;
+    KernelSet& kernels = groupKernels[forceGroups];
    if (usePeriodic) {
        mm_float4 box = context.getPeriodicBoxSize();
-        double minAllowedSize = 1.999999*cutoff;
+        double minAllowedSize = 1.999999*kernels.cutoffDistance;
        if (box.x < minAllowedSize || box.y < minAllowedSize || box.z < minAllowedSize)
            throw OpenMMException("The periodic box size has decreased to less than twice the nonbonded cutoff.");
    }

    // Compute the neighbor list.

-    setPeriodicBoxArgs(context, findBlockBoundsKernel, 1);
-    context.executeKernel(findBlockBoundsKernel, context.getNumAtoms());
+    if (lastCutoff != kernels.cutoffDistance)
+        forceRebuildNeighborList = true;
+    setPeriodicBoxArgs(context, kernels.findBlockBoundsKernel, 1);
+    context.executeKernel(kernels.findBlockBoundsKernel, context.getNumAtoms());
    blockSorter->sort(*sortedBlocks);
-    context.executeKernel(sortBoxDataKernel, context.getNumAtoms());
-    setPeriodicBoxArgs(context, findInteractingBlocksKernel, 0);
-    context.executeKernel(findInteractingBlocksKernel, context.getNumAtoms(), interactingBlocksThreadBlockSize);
-    sortBoxDataKernel.setArg<cl_int>(9, false);
+    kernels.sortBoxDataKernel.setArg<cl_int>(9, forceRebuildNeighborList);
+    context.executeKernel(kernels.sortBoxDataKernel, context.getNumAtoms());
+    setPeriodicBoxArgs(context, kernels.findInteractingBlocksKernel, 0);
+    context.executeKernel(kernels.findInteractingBlocksKernel, context.getNumAtoms(), interactingBlocksThreadBlockSize);
+    forceRebuildNeighborList = false;
+    lastCutoff = kernels.cutoffDistance;
 }

-void OpenCLNonbondedUtilities::computeInteractions() {
-    if (kernelSource.size() > 0) {
+void OpenCLNonbondedUtilities::computeInteractions(int forceGroups) {
+    if ((forceGroups&groupFlags) == 0)
+        return;
+    KernelSet& kernels = groupKernels[forceGroups];
+    if (kernels.hasForces) {
        if (useCutoff)
-            setPeriodicBoxArgs(context, forceKernel, 9);
-        context.executeKernel(forceKernel, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
+            setPeriodicBoxArgs(context, kernels.forceKernel, 9);
+        context.executeKernel(kernels.forceKernel, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
        if (context.getComputeForceCount() == 1)
            updateNeighborListSize(); // This is the first time step, so check whether our initial guess was large enough.
    }
@@ -434,13 +386,15 @@ void OpenCLNonbondedUtilities::updateNeighborListSize() {
    interactingAtoms = NULL;
    interactingTiles = OpenCLArray::create<cl_int>(context, maxTiles, "interactingTiles");
    interactingAtoms = OpenCLArray::create<cl_int>(context, OpenCLContext::TileSize*maxTiles, "interactingAtoms");
-    forceKernel.setArg<cl::Buffer>(7, interactingTiles->getDeviceBuffer());
-    forceKernel.setArg<cl_uint>(14, maxTiles);
-    forceKernel.setArg<cl::Buffer>(17, interactingAtoms->getDeviceBuffer());
-    findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactingTiles->getDeviceBuffer());
-    findInteractingBlocksKernel.setArg<cl::Buffer>(7, interactingAtoms->getDeviceBuffer());
-    findInteractingBlocksKernel.setArg<cl_uint>(9, maxTiles);
-    sortBoxDataKernel.setArg<cl_int>(9, true);
+    for (map<int, KernelSet>::iterator iter = groupKernels.begin(); iter != groupKernels.end(); ++iter) {
+        iter->second.forceKernel.setArg<cl::Buffer>(7, interactingTiles->getDeviceBuffer());
+        iter->second.forceKernel.setArg<cl_uint>(14, maxTiles);
+        iter->second.forceKernel.setArg<cl::Buffer>(17, interactingAtoms->getDeviceBuffer());
+        iter->second.findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactingTiles->getDeviceBuffer());
+        iter->second.findInteractingBlocksKernel.setArg<cl::Buffer>(7, interactingAtoms->getDeviceBuffer());
+        iter->second.findInteractingBlocksKernel.setArg<cl_uint>(9, maxTiles);
+    }
+    forceRebuildNeighborList = true;
 }

 void OpenCLNonbondedUtilities::setUsePadding(bool padding) {
@@ -454,18 +408,103 @@ void OpenCLNonbondedUtilities::setAtomBlockRange(double startFraction, double en
    int totalTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2;
    startTileIndex = (int) (startFraction*totalTiles);;
    numTiles = (int) (endFraction*totalTiles)-startTileIndex;
-    if (useCutoff && interactingTiles != NULL) {
+    if (useCutoff) {
        // We are using a cutoff, and the kernels have already been created.
        
-        forceKernel.setArg<cl_uint>(5, startTileIndex);
-        forceKernel.setArg<cl_uint>(6, numTiles);
-        findInteractingBlocksKernel.setArg<cl_uint>(10, startBlockIndex);
-        findInteractingBlocksKernel.setArg<cl_uint>(11, numBlocks);
-        sortBoxDataKernel.setArg<cl_int>(9, true);
+        for (map<int, KernelSet>::iterator iter = groupKernels.begin(); iter != groupKernels.end(); ++iter) {
+            iter->second.forceKernel.setArg<cl_uint>(5, startTileIndex);
+            iter->second.forceKernel.setArg<cl_uint>(6, numTiles);
+            iter->second.findInteractingBlocksKernel.setArg<cl_uint>(10, startBlockIndex);
+            iter->second.findInteractingBlocksKernel.setArg<cl_uint>(11, numBlocks);
+        }
+        forceRebuildNeighborList = true;
+    }
+}
+
+void OpenCLNonbondedUtilities::createKernelsForGroups(int groups) {
+    KernelSet kernels;
+    double cutoff = 0.0;
+    string source;
+    for (int i = 0; i < 32; i++) {
+        if ((groups&(1<<i)) != 0) {
+            cutoff = max(cutoff, groupCutoff[i]);
+            source += groupKernelSource[i];
+        }
+    }
+    kernels.hasForces = (source.size() > 0);
+    kernels.cutoffDistance = cutoff;
+    if (kernels.hasForces)
+        kernels.forceKernel = createInteractionKernel(source, parameters, arguments, true, true, groups);
+    if (useCutoff && kernels.hasForces) {
+        double padding = (usePadding ? 0.1*cutoff : 0.0);
+        double paddedCutoff = cutoff+padding;
+        map<string, string> defines;
+        defines["TILE_SIZE"] = context.intToString(OpenCLContext::TileSize);
+        defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms());
+        defines["PADDING"] = context.doubleToString(padding);
+        defines["PADDED_CUTOFF"] = context.doubleToString(paddedCutoff);
+        defines["PADDED_CUTOFF_SQUARED"] = context.doubleToString(paddedCutoff*paddedCutoff);
+        defines["NUM_TILES_WITH_EXCLUSIONS"] = context.intToString(exclusionTiles->getSize());
+        defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
+        defines["SIMD_WIDTH"] = context.intToString(context.getSIMDWidth());
+        if (usePeriodic)
+            defines["USE_PERIODIC"] = "1";
+        defines["MAX_EXCLUSIONS"] = context.intToString(maxExclusions);
+        defines["BUFFER_GROUPS"] = (deviceIsCpu ? "4" : "2");
+        string file = (deviceIsCpu ? OpenCLKernelSources::findInteractingBlocks_cpu : OpenCLKernelSources::findInteractingBlocks);
+        int groupSize = (deviceIsCpu || context.getSIMDWidth() < 32 ? 32 : 256);
+        while (true) {
+            defines["GROUP_SIZE"] = context.intToString(groupSize);
+            cl::Program interactingBlocksProgram = context.createProgram(file, defines);
+            kernels.findBlockBoundsKernel = cl::Kernel(interactingBlocksProgram, "findBlockBounds");
+            kernels.findBlockBoundsKernel.setArg<cl_int>(0, context.getNumAtoms());
+            kernels.findBlockBoundsKernel.setArg<cl::Buffer>(6, context.getPosq().getDeviceBuffer());
+            kernels.findBlockBoundsKernel.setArg<cl::Buffer>(7, blockCenter->getDeviceBuffer());
+            kernels.findBlockBoundsKernel.setArg<cl::Buffer>(8, blockBoundingBox->getDeviceBuffer());
+            kernels.findBlockBoundsKernel.setArg<cl::Buffer>(9, rebuildNeighborList->getDeviceBuffer());
+            kernels.findBlockBoundsKernel.setArg<cl::Buffer>(10, sortedBlocks->getDeviceBuffer());
+            kernels.sortBoxDataKernel = cl::Kernel(interactingBlocksProgram, "sortBoxData");
+            kernels.sortBoxDataKernel.setArg<cl::Buffer>(0, sortedBlocks->getDeviceBuffer());
+            kernels.sortBoxDataKernel.setArg<cl::Buffer>(1, blockCenter->getDeviceBuffer());
+            kernels.sortBoxDataKernel.setArg<cl::Buffer>(2, blockBoundingBox->getDeviceBuffer());
+            kernels.sortBoxDataKernel.setArg<cl::Buffer>(3, sortedBlockCenter->getDeviceBuffer());
+            kernels.sortBoxDataKernel.setArg<cl::Buffer>(4, sortedBlockBoundingBox->getDeviceBuffer());
+            kernels.sortBoxDataKernel.setArg<cl::Buffer>(5, context.getPosq().getDeviceBuffer());
+            kernels.sortBoxDataKernel.setArg<cl::Buffer>(6, oldPositions->getDeviceBuffer());
+            kernels.sortBoxDataKernel.setArg<cl::Buffer>(7, interactionCount->getDeviceBuffer());
+            kernels.sortBoxDataKernel.setArg<cl::Buffer>(8, rebuildNeighborList->getDeviceBuffer());
+            kernels.sortBoxDataKernel.setArg<cl_int>(9, true);
+            kernels.findInteractingBlocksKernel = cl::Kernel(interactingBlocksProgram, "findBlocksWithInteractions");
+            kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(5, interactionCount->getDeviceBuffer());
+            kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactingTiles->getDeviceBuffer());
+            kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(7, interactingAtoms->getDeviceBuffer());
+            kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(8, context.getPosq().getDeviceBuffer());
+            kernels.findInteractingBlocksKernel.setArg<cl_uint>(9, interactingTiles->getSize());
+            kernels.findInteractingBlocksKernel.setArg<cl_uint>(10, startBlockIndex);
+            kernels.findInteractingBlocksKernel.setArg<cl_uint>(11, numBlocks);
+            kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(12, sortedBlocks->getDeviceBuffer());
+            kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(13, sortedBlockCenter->getDeviceBuffer());
+            kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(14, sortedBlockBoundingBox->getDeviceBuffer());
+            kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(15, exclusionIndices->getDeviceBuffer());
+            kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(16, exclusionRowIndices->getDeviceBuffer());
+            kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(17, oldPositions->getDeviceBuffer());
+            kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(18, rebuildNeighborList->getDeviceBuffer());
+            if (kernels.findInteractingBlocksKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()) < groupSize) {
+                // The device can't handle this block size, so reduce it.
+                
+                groupSize -= 32;
+                if (groupSize < 32)
+                    throw OpenMMException("Failed to create findInteractingBlocks kernel");
+                continue;
+            }
+            break;
+        }
+        interactingBlocksThreadBlockSize = (deviceIsCpu ? 1 : groupSize);
    }
+    groupKernels[groups] = kernels;
 }

-cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& source, const vector<ParameterInfo>& params, const vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric) const {
+cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& source, const vector<ParameterInfo>& params, const vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric, int groups) {
    map<string, string> replacements;
    replacements["COMPUTE_INTERACTION"] = source;
    const string suffixes[] = {"x", "y", "z", "w"};
@@ -565,8 +604,16 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
    if (useCutoff && context.getSIMDWidth() < 32)
        defines["PRUNE_BY_CUTOFF"] = "1";
    defines["FORCE_WORK_GROUP_SIZE"] = context.intToString(forceThreadBlockSize);
-    defines["CUTOFF_SQUARED"] = context.doubleToString(cutoff*cutoff);
-    defines["CUTOFF"] = context.doubleToString(cutoff);
+    double maxCutoff = 0.0;
+    for (int i = 0; i < 32; i++) {
+        if ((groups&(1<<i)) != 0) {
+            double cutoff = groupCutoff[i];
+            maxCutoff = max(maxCutoff, cutoff);
+            defines["CUTOFF_"+context.intToString(i)+"_SQUARED"] = context.doubleToString(cutoff*cutoff);
+            defines["CUTOFF_"+context.intToString(i)] = context.doubleToString(cutoff);
+        }
+    }
+    defines["MAX_CUTOFF"] = context.doubleToString(maxCutoff);
    defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms());
    defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
    defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());

--- a/platforms/opencl/src/kernels/nonbonded.cl
+++ b/platforms/opencl/src/kernels/nonbonded.cl
@@ -220,9 +220,9 @@ __kernel void computeNonbonded(
        if (numTiles <= maxTiles) {
            x = tiles[pos];
            real4 blockSizeX = blockSize[x];
-            singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
-                                  0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
-                                  0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
+            singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= MAX_CUTOFF &&
+                                  0.5f*periodicBoxSize.y-blockSizeX.y >= MAX_CUTOFF &&
+                                  0.5f*periodicBoxSize.z-blockSizeX.z >= MAX_CUTOFF);
        }
        else
 #endif

--- a/platforms/opencl/tests/TestOpenCLCustomNonbondedForce.cpp
+++ b/platforms/opencl/tests/TestOpenCLCustomNonbondedForce.cpp
@@ -973,6 +973,62 @@ void testInteractionGroupLongRangeCorrection() {
    ASSERT_EQUAL_TOL(expected, energy2-energy1, 1e-4);
 }

+void testMultipleCutoffs() {
+    System system;
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    VerletIntegrator integrator(0.01);
+    
+    // Add multiple nonbonded forces that have different cutoffs.
+    
+    CustomNonbondedForce* nonbonded1 = new CustomNonbondedForce("2*r");
+    nonbonded1->addParticle(vector<double>());
+    nonbonded1->addParticle(vector<double>());
+    nonbonded1->setNonbondedMethod(CustomNonbondedForce::CutoffNonPeriodic);
+    nonbonded1->setCutoffDistance(2.5);
+    system.addForce(nonbonded1);
+    CustomNonbondedForce* nonbonded2 = new CustomNonbondedForce("3*r");
+    nonbonded2->addParticle(vector<double>());
+    nonbonded2->addParticle(vector<double>());
+    nonbonded2->setNonbondedMethod(CustomNonbondedForce::CutoffNonPeriodic);
+    nonbonded2->setCutoffDistance(2.9);
+    nonbonded2->setForceGroup(1);
+    system.addForce(nonbonded2);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(2);
+    positions[0] = Vec3(0, 0, 0);
+    positions[1] = Vec3(0, 0, 0);
+    for (double r = 2.4; r < 3.2; r += 0.2) {
+        positions[1][1] = r;
+        context.setPositions(positions);
+        double e1 = (r < 2.5 ? 2.0*r : 0.0);
+        double e2 = (r < 2.9 ? 3.0*r : 0.0);
+        double f1 = (r < 2.5 ? 2.0 : 0.0);
+        double f2 = (r < 2.9 ? 3.0 : 0.0);
+        
+        // Check the first force.
+        
+        State state = context.getState(State::Forces | State::Energy, false, 1);
+        ASSERT_EQUAL_VEC(Vec3(0, f1, 0), state.getForces()[0], TOL);
+        ASSERT_EQUAL_VEC(Vec3(0, -f1, 0), state.getForces()[1], TOL);
+        ASSERT_EQUAL_TOL(e1, state.getPotentialEnergy(), TOL);
+        
+        // Check the second force.
+        
+        state = context.getState(State::Forces | State::Energy, false, 2);
+        ASSERT_EQUAL_VEC(Vec3(0, f2, 0), state.getForces()[0], TOL);
+        ASSERT_EQUAL_VEC(Vec3(0, -f2, 0), state.getForces()[1], TOL);
+        ASSERT_EQUAL_TOL(e2, state.getPotentialEnergy(), TOL);
+        
+        // Check the sum of both forces.
+
+        state = context.getState(State::Forces | State::Energy);
+        ASSERT_EQUAL_VEC(Vec3(0, f1+f2, 0), state.getForces()[0], TOL);
+        ASSERT_EQUAL_VEC(Vec3(0, -f1-f2, 0), state.getForces()[1], TOL);
+        ASSERT_EQUAL_TOL(e1+e2, state.getPotentialEnergy(), TOL);
+    }
+}
+
 int main(int argc, char* argv[]) {
    try {
        if (argc > 1)
@@ -997,6 +1053,7 @@ int main(int argc, char* argv[]) {
        testInteractionGroups();
        testLargeInteractionGroup();
        testInteractionGroupLongRangeCorrection();
+        testMultipleCutoffs();
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;

--- a/platforms/reference/src/SimTKReference/ReferenceCCMAAlgorithm.cpp
+++ b/platforms/reference/src/SimTKReference/ReferenceCCMAAlgorithm.cpp
@@ -33,21 +33,19 @@
 #include "openmm/Vec3.h"
 #include "openmm/internal/ThreadPool.h"
 #include <map>
+#include <utility>

-using std::map;
-using std::pair;
-using std::vector;
-using std::set;
 using namespace OpenMM;
+using namespace std;

 // This class extracts columns from the inverse matrix one at a time.  It is done in parallel,
 // since this can be very slow.

 class ExtractMatrixTask : public ThreadPool::Task {
 public:
-    ExtractMatrixTask(int numConstraints, vector<vector<pair<int, RealOpenMM> > >& matrix, const vector<RealOpenMM>& distance, RealOpenMM elementCutoff,
+    ExtractMatrixTask(int numConstraints, vector<vector<pair<int, RealOpenMM> > >& transposedMatrix, const vector<RealOpenMM>& distance, RealOpenMM elementCutoff,
                      const int* qRowStart, const int* qColIndex, const int* rRowStart, const int* rColIndex, const double* qValue, const double* rValue) :
-                numConstraints(numConstraints), matrix(matrix), distance(distance), elementCutoff(elementCutoff), qRowStart(qRowStart), qColIndex(qColIndex),
+                numConstraints(numConstraints), transposedMatrix(transposedMatrix), distance(distance), elementCutoff(elementCutoff), qRowStart(qRowStart), qColIndex(qColIndex),
                rRowStart(rRowStart), rColIndex(rColIndex), qValue(qValue), rValue(rValue) {
    }

@@ -61,15 +59,15 @@ public:
            QUERN_multiply_with_q_transpose(numConstraints, qRowStart, qColIndex, qValue, &rhs[0]);
            QUERN_solve_with_r(numConstraints, rRowStart, rColIndex, rValue, &rhs[0], &rhs[0]);
            for (int j = 0; j < numConstraints; j++) {
-                double value = rhs[j]*distance[j]/distance[i];
+                double value = rhs[j]*distance[i]/distance[j];
                if (FABS((RealOpenMM) value) > elementCutoff)
-                    matrix[i].push_back(pair<int, RealOpenMM>(j, (RealOpenMM) value));
+                    transposedMatrix[i].push_back(pair<int, RealOpenMM>(j, (RealOpenMM) value));
            }
        }
    }
 private:
    int numConstraints;
-    vector<vector<pair<int, RealOpenMM> > >& matrix;
+    vector<vector<pair<int, RealOpenMM> > >& transposedMatrix;
    const vector<RealOpenMM>& distance;
    RealOpenMM elementCutoff;
    const int *qRowStart, *qColIndex, *rRowStart, *rColIndex;
@@ -194,12 +192,21 @@ ReferenceCCMAAlgorithm::ReferenceCCMAAlgorithm(int numberOfAtoms,
        double *qValue, *rValue;
        QUERN_compute_qr(numberOfConstraints, numberOfConstraints, &matrixRowStart[0], &matrixColIndex[0], &matrixValue[0], NULL,
                &qRowStart, &qColIndex, &qValue, &rRowStart, &rColIndex, &rValue);
-        vector<double> rhs(numberOfConstraints);
+        vector<vector<pair<int, RealOpenMM> > > transposedMatrix(numberOfConstraints);
        _matrix.resize(numberOfConstraints);
        ThreadPool threads;
-        ExtractMatrixTask task(numberOfConstraints, _matrix, _distance, _elementCutoff, qRowStart, qColIndex, rRowStart, rColIndex, qValue, rValue);
+        ExtractMatrixTask task(numberOfConstraints, transposedMatrix, _distance, _elementCutoff, qRowStart, qColIndex, rRowStart, rColIndex, qValue, rValue);
        threads.execute(task);
        threads.waitForThreads();
+
+        // For purposes of thread safety we extracted the matrix in transposed form, so we need to transpose it again.
+
+        for (int i = 0; i < numberOfConstraints; i++) {
+            for (int j = 0; j < transposedMatrix[i].size(); j++) {
+                pair<int, RealOpenMM> value = transposedMatrix[i][j];
+                _matrix[value.first].push_back(make_pair(i, value.second));
+            }
+        }
        QUERN_free_result(qRowStart, qColIndex, qValue);
        QUERN_free_result(rRowStart, rColIndex, rValue);
    }

--- a/platforms/reference/tests/TestReferenceCustomNonbondedForce.cpp
+++ b/platforms/reference/tests/TestReferenceCustomNonbondedForce.cpp
@@ -7,7 +7,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2014 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2015 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -906,6 +906,62 @@ void testInteractionGroupLongRangeCorrection() {
    ASSERT_EQUAL_TOL(expected, energy2-energy1, 1e-4);
 }

+void testMultipleCutoffs() {
+    System system;
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    VerletIntegrator integrator(0.01);
+    
+    // Add multiple nonbonded forces that have different cutoffs.
+    
+    CustomNonbondedForce* nonbonded1 = new CustomNonbondedForce("2*r");
+    nonbonded1->addParticle(vector<double>());
+    nonbonded1->addParticle(vector<double>());
+    nonbonded1->setNonbondedMethod(CustomNonbondedForce::CutoffNonPeriodic);
+    nonbonded1->setCutoffDistance(2.5);
+    system.addForce(nonbonded1);
+    CustomNonbondedForce* nonbonded2 = new CustomNonbondedForce("3*r");
+    nonbonded2->addParticle(vector<double>());
+    nonbonded2->addParticle(vector<double>());
+    nonbonded2->setNonbondedMethod(CustomNonbondedForce::CutoffNonPeriodic);
+    nonbonded2->setCutoffDistance(2.9);
+    nonbonded2->setForceGroup(1);
+    system.addForce(nonbonded2);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(2);
+    positions[0] = Vec3(0, 0, 0);
+    positions[1] = Vec3(0, 0, 0);
+    for (double r = 2.4; r < 3.2; r += 0.2) {
+        positions[1][1] = r;
+        context.setPositions(positions);
+        double e1 = (r < 2.5 ? 2.0*r : 0.0);
+        double e2 = (r < 2.9 ? 3.0*r : 0.0);
+        double f1 = (r < 2.5 ? 2.0 : 0.0);
+        double f2 = (r < 2.9 ? 3.0 : 0.0);
+        
+        // Check the first force.
+        
+        State state = context.getState(State::Forces | State::Energy, false, 1);
+        ASSERT_EQUAL_VEC(Vec3(0, f1, 0), state.getForces()[0], TOL);
+        ASSERT_EQUAL_VEC(Vec3(0, -f1, 0), state.getForces()[1], TOL);
+        ASSERT_EQUAL_TOL(e1, state.getPotentialEnergy(), TOL);
+        
+        // Check the second force.
+        
+        state = context.getState(State::Forces | State::Energy, false, 2);
+        ASSERT_EQUAL_VEC(Vec3(0, f2, 0), state.getForces()[0], TOL);
+        ASSERT_EQUAL_VEC(Vec3(0, -f2, 0), state.getForces()[1], TOL);
+        ASSERT_EQUAL_TOL(e2, state.getPotentialEnergy(), TOL);
+        
+        // Check the sum of both forces.
+
+        state = context.getState(State::Forces | State::Energy);
+        ASSERT_EQUAL_VEC(Vec3(0, f1+f2, 0), state.getForces()[0], TOL);
+        ASSERT_EQUAL_VEC(Vec3(0, -f1-f2, 0), state.getForces()[1], TOL);
+        ASSERT_EQUAL_TOL(e1+e2, state.getPotentialEnergy(), TOL);
+    }
+}
+
 int main() {
    try {
        testSimpleExpression();
@@ -926,6 +982,7 @@ int main() {
        testInteractionGroups();
        testLargeInteractionGroup();
        testInteractionGroupLongRangeCorrection();
+        testMultipleCutoffs();
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;

--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
@@ -2407,7 +2407,7 @@ void CudaCalcAmoebaVdwForceKernel::initialize(const System& system, const Amoeba
    replacements["TAPER_C5"] = cu.doubleToString(6/pow(taperCutoff-cutoff, 5.0));
    bool useCutoff = (force.getNonbondedMethod() != AmoebaVdwForce::NoCutoff);
    nonbonded->addInteraction(useCutoff, useCutoff, true, force.getCutoff(), exclusions,
-        cu.replaceStrings(CudaAmoebaKernelSources::amoebaVdwForce2, replacements), force.getForceGroup());
+        cu.replaceStrings(CudaAmoebaKernelSources::amoebaVdwForce2, replacements), 0);
    
    // Create the other kernels.
    
@@ -2429,8 +2429,8 @@ double CudaCalcAmoebaVdwForceKernel::execute(ContextImpl& context, bool includeF
    void* prepareArgs[] = {&cu.getForce().getDevicePointer(), &cu.getPosq().getDevicePointer(), &tempPosq->getDevicePointer(),
        &bondReductionAtoms->getDevicePointer(), &bondReductionFactors->getDevicePointer()};
    cu.executeKernel(prepareKernel, prepareArgs, cu.getPaddedNumAtoms());
-    nonbonded->prepareInteractions();
-    nonbonded->computeInteractions();
+    nonbonded->prepareInteractions(1);
+    nonbonded->computeInteractions(1);
    void* spreadArgs[] = {&cu.getForce().getDevicePointer(), &tempForces->getDevicePointer(), &bondReductionAtoms->getDevicePointer(), &bondReductionFactors->getDevicePointer()};
    cu.executeKernel(spreadKernel, spreadArgs, cu.getPaddedNumAtoms());
    tempPosq->copyTo(cu.getPosq());
@@ -2534,7 +2534,7 @@ void CudaCalcAmoebaWcaDispersionForceKernel::initialize(const System& system, co
    // just so that CudaNonbondedUtilities will keep track of the tiles.
    
    vector<vector<int> > exclusions;
-    cu.getNonbondedUtilities().addInteraction(false, false, false, cu.getNonbondedUtilities().getCutoffDistance(), exclusions, "", force.getForceGroup());
+    cu.getNonbondedUtilities().addInteraction(false, false, false, 1.0, exclusions, "", force.getForceGroup());
    cu.addForce(new ForceInfo(force));
 }


--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -18,6 +18,9 @@ FOREACH(TEST_PROG ${TEST_PROGS})
    IF ((${TEST_ROOT} MATCHES TestVectorize) AND NOT (MSVC OR ANDROID OR PNACL))
        SET(EXTRA_TEST_FLAGS "${EXTRA_COMPILE_FLAGS} -msse4.1")
    ENDIF ((${TEST_ROOT} MATCHES TestVectorize) AND NOT (MSVC OR ANDROID OR PNACL))
+    IF ((${TEST_ROOT} MATCHES TestVectorize8) AND NOT (MSVC OR ANDROID OR PNACL))
+        SET(EXTRA_TEST_FLAGS "${EXTRA_COMPILE_FLAGS} -mavx")
+    ENDIF ((${TEST_ROOT} MATCHES TestVectorize8) AND NOT (MSVC OR ANDROID OR PNACL))
    SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES LINK_FLAGS "${EXTRA_TEST_FLAGS}" COMPILE_FLAGS "${EXTRA_TEST_FLAGS}")
    ADD_TEST(${TEST_ROOT} ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT})
 ENDFOREACH(TEST_PROG ${TEST_PROGS})

--- a/tests/TestVectorize8.cpp
+++ b/tests/TestVectorize8.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2014-2015 Stanford University and the Authors.      *
+ * Authors: Robert T. McGibbon                                                *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+/**
+ * This tests vectorized operations.
+ */
+
+#include "openmm/internal/AssertionUtilities.h"
+#include "openmm/internal/vectorize8.h"
+#include <iostream>
+
+
+#ifndef __AVX__
+bool isVec8Supported() {
+    return false;
+}
+#else
+/**
+ * Check whether 8 component vectors are supported with the current CPU.
+ */
+bool isVec8Supported() {
+    // Make sure the CPU supports AVX.
+
+    int cpuInfo[4];
+    cpuid(cpuInfo, 0);
+    if (cpuInfo[0] >= 1) {
+        cpuid(cpuInfo, 1);
+        return ((cpuInfo[2] & ((int) 1 << 28)) != 0);
+    }
+    return false;
+}
+#endif
+
+using namespace OpenMM;
+using namespace std;
+
+#define ASSERT_VEC4_EQUAL(found, expected0, expected1, expected2, expected3) {if (std::abs((found)[0]-(expected0))>1e-6 || std::abs((found)[1]-(expected1))>1e-6 || std::abs((found)[2]-(expected2))>1e-6 || std::abs((found)[3]-(expected3))>1e-6) {std::stringstream details; details << " Expected ("<<(expected0)<<","<<(expected1)<<","<<(expected2)<<","<<(expected3)<<"), found ("<<(found)[0]<<","<<(found)[1]<<","<<(found)[2]<<","<<(found)[3]<<")"; throwException(__FILE__, __LINE__, details.str());}};
+#define ASSERT_VEC8_EQUAL(found, expected0, expected1, expected2, expected3, expected4, expected5, expected6, expected7) {if (std::abs((found).lowerVec()[0]-(expected0))>1e-6 || std::abs((found).lowerVec()[1]-(expected1))>1e-6 || std::abs((found).lowerVec()[2]-(expected2))>1e-6 || std::abs((found).lowerVec()[3]-(expected3))>1e-6 || std::abs((found).upperVec()[0]-(expected4))>1e-6 || std::abs((found).upperVec()[1]-(expected5))>1e-6 || std::abs((found).upperVec()[2]-(expected6))>1e-6 || std::abs((found).upperVec()[3]-(expected7))>1e-6) {std::stringstream details; details << " Expected ("<<(expected0)<<","<<(expected1)<<","<<(expected2)<<","<<(expected3)<<","<<(expected4)<<","<<(expected5)<<","<<(expected6)<<","<<(expected7)<<"), found ("<<(found).lowerVec()[0]<<","<<(found).lowerVec()[1]<<","<<(found).lowerVec()[2]<<","<<(found).lowerVec()[3]<<","<<(found).upperVec()[0]<<","<<(found).upperVec()[1]<<","<<(found).upperVec()[2]<<","<<(found).upperVec()[3]<<")"; throwException(__FILE__, __LINE__, details.str());}};
+#define ASSERT_VEC8_EQUAL_INT(found, expected0, expected1, expected2, expected3, expected4, expected5, expected6, expected7) {if ((found).lowerVec()[0] != (expected0) || (found).lowerVec()[1] != (expected1) || (found).lowerVec()[2] != (expected2) || (found).lowerVec()[3] != (expected3) || (found).upperVec()[0] != (expected4) || (found).upperVec()[1] != (expected5) ||(found).upperVec()[2] != (expected6) || (found).upperVec()[3] != (expected7)) {std::stringstream details; details << " Expected ("<<(expected0)<<","<<(expected1)<<","<<(expected2)<<","<<(expected3)<<","<<(expected4)<<","<<(expected5)<<","<<(expected6)<<","<<(expected7)<<"), found ("<<(found).lowerVec()[0]<<","<<(found).lowerVec()[1]<<","<<(found).lowerVec()[2]<<","<<(found).lowerVec()[3]<<","<<(found).upperVec()[0]<<","<<(found).upperVec()[1]<<","<<(found).upperVec()[2]<<","<<(found).upperVec()[3]<<")"; throwException(__FILE__, __LINE__, details.str());}};
+
+
+void testLoadStore() {
+    fvec8 f1(2.0);
+    ivec8 i1(3);
+    ASSERT_VEC8_EQUAL(f1, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0);
+    ASSERT_VEC8_EQUAL_INT(i1, 3, 3, 3, 3, 3, 3, 3, 3);
+    fvec8 f2(2.5, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 6.0);
+    ivec8 i2(2, 3, 4, 5, 6, 7, 8, 9);
+    ASSERT_VEC8_EQUAL(f2, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 6.0);
+    ASSERT_VEC8_EQUAL_INT(i2, 2, 3, 4, 5, 6, 7, 8, 9);
+    float farray[8];
+    int iarray[8];
+    f2.store(farray);
+    i2.store(iarray);
+    fvec8 f3(farray);
+    ivec8 i3(iarray);
+    ASSERT_VEC8_EQUAL(f3, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 6.0);
+    ASSERT_VEC8_EQUAL_INT(i3, 2, 3, 4, 5, 6, 7, 8, 9);
+    ASSERT_EQUAL(f3.lowerVec()[0], 2.5);
+    ASSERT_EQUAL(f3.lowerVec()[1], 3.0);
+    ASSERT_EQUAL(f3.lowerVec()[2], 3.5);
+    ASSERT_EQUAL(f3.lowerVec()[3], 4.0);
+    ASSERT_EQUAL(f3.upperVec()[0], 4.5);
+    ASSERT_EQUAL(f3.upperVec()[1], 5.0);
+    ASSERT_EQUAL(f3.upperVec()[2], 5.5);
+    ASSERT_EQUAL(f3.upperVec()[3], 6.0);
+    ASSERT_EQUAL(i3.lowerVec()[0], 2);
+    ASSERT_EQUAL(i3.lowerVec()[1], 3);
+    ASSERT_EQUAL(i3.lowerVec()[2], 4);
+    ASSERT_EQUAL(i3.lowerVec()[3], 5);
+    ASSERT_EQUAL(i3.upperVec()[0], 6);
+    ASSERT_EQUAL(i3.upperVec()[1], 7);
+    ASSERT_EQUAL(i3.upperVec()[2], 8);
+    ASSERT_EQUAL(i3.upperVec()[3], 9);
+}
+
+void testArithmetic() {
+    fvec8 f1(0.5, 1.0, 1.5, 2.0,   2.5, 3.0, 3.5, 4.0);
+    ASSERT_VEC8_EQUAL(f1+fvec8(1, 2, 3, 4, 5, 6, 7, 8), 1.5,   3. ,   4.5,   6. ,   7.5,   9. ,  10.5,  12.);
+    ASSERT_VEC8_EQUAL(f1-fvec8(1, 2, 3, 4, 5, 6, 7, 8), -0.5, -1. , -1.5, -2. , -2.5, -3. , -3.5, -4.);
+    ASSERT_VEC8_EQUAL(f1*fvec8(1, 2, 3, 4, 5, 6, 7, 8), 0.5,   2. ,   4.5,   8. ,  12.5,  18. ,  24.5,  32.);
+    ASSERT_VEC8_EQUAL(f1/fvec8(1, 2, 3, 4, 5, 6, 7, 8), 0.5,  0.5,  0.5,  0.5,  0.5,  0.5,  0.5,  0.5);
+
+    f1 = fvec8(0.5, 1.0, 1.5, 2.0,   2.5, 3.0, 3.5, 4.0);
+    f1 += fvec8(1, 2, 3, 4, 5, 6, 7, 8);
+    ASSERT_VEC8_EQUAL(f1, 1.5,   3. ,   4.5,   6. ,   7.5,   9. ,  10.5,  12.);
+    f1 = fvec8(0.5, 1.0, 1.5, 2.0,   2.5, 3.0, 3.5, 4.0);
+    f1 -= fvec8(1, 2, 3, 4, 5, 6, 7, 8);
+    ASSERT_VEC8_EQUAL(f1, -0.5, -1. , -1.5, -2. , -2.5, -3. , -3.5, -4.);
+    f1 = fvec8(0.5, 1.0, 1.5, 2.0,   2.5, 3.0, 3.5, 4.0);
+    f1 *= fvec8(1, 2, 3, 4, 5, 6, 7, 8);
+    ASSERT_VEC8_EQUAL(f1, 0.5,   2. ,   4.5,   8. ,  12.5,  18. ,  24.5,  32.);
+    f1 = fvec8(0.5, 1.0, 1.5, 2.0,   2.5, 3.0, 3.5, 4.0);
+    f1 /= fvec8(1, 2, 3, 4, 5, 6, 7, 8);
+    ASSERT_VEC8_EQUAL(f1, 0.5,  0.5,  0.5,  0.5,  0.5,  0.5,  0.5,  0.5);
+}
+
+void testLogic() {
+    int allBits = -1;
+    float allBitsf = *((float*) &allBits);
+    ivec8 mask(0, allBits, allBits, 0, 0, allBits, allBits, 0);
+    fvec8 fmask(0, allBitsf, allBitsf, 0, 0, allBitsf, allBitsf, 0);
+    fvec8 f1(0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0);
+    ivec8 i1(1, 2, 3, 4, 5, 6, 7, 8);
+    ASSERT_VEC8_EQUAL(f1&fmask, 0, 1.0, 1.5, 0, 0, 3.0, 3.5, 0.0);
+    fvec8 temp = f1|fmask;
+    ASSERT_EQUAL(0.5, temp.lowerVec()[0]);
+    ASSERT(temp.lowerVec()[1]!= temp.lowerVec()[1]); // All bits set, which is nan
+    ASSERT(temp.lowerVec()[2] != temp.lowerVec()[2]); // All bits set, which is nan
+    ASSERT_EQUAL(2.0, temp.lowerVec()[3]);
+    ASSERT_EQUAL(2.5, temp.upperVec()[0]);
+    ASSERT(temp.upperVec()[1] != temp.upperVec()[1]); // All bits set, which is nan
+    ASSERT(temp.upperVec()[2] != temp.upperVec()[2]); // All bits set, which is nan
+    ASSERT_EQUAL(4.0, temp.upperVec()[3]);
+    ASSERT_VEC8_EQUAL_INT(i1&mask, 0, 2, 3, 0, 0, 6, 7, 0);
+    ASSERT_VEC8_EQUAL_INT(i1|mask, 1, allBits, allBits, 4, 5, allBits, allBits, 8);
+}
+
+void testComparisons() {
+    fvec8 v1(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0);
+    fvec8 v2(1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5);
+    ASSERT_VEC8_EQUAL(blend(v1, v2,
+        fvec8(1.0, 1.5, 3.0, 2.2, 10.0, 10.5, 13.0, 12.2)==fvec8(1.1, 1.5, 3.0, 2.1, 10.1, 10.5, 13.0, 12.1)),
+        0.0, 1.5, 1.5, 0.0, 0.0, 1.5, 1.5, 0.0);
+    ASSERT_VEC8_EQUAL(blend(v1, v2,
+        fvec8(1.0, 1.5, 3.0, 2.2, 10.0, 10.5, 13.0, 12.2)!=fvec8(1.1, 1.5, 3.0, 2.1, 10.1, 10.5, 13.0, 12.1)),
+        1.5, 0.0, 0.0, 1.5, 1.5, 0.0, 0.0, 1.5);
+    ASSERT_VEC8_EQUAL(blend(v1, v2,
+        fvec8(1.0, 1.5, 3.0, 2.2, 10.0, 10.5, 13.0, 12.2)<fvec8(1.1, 1.5, 3.0, 2.1, 10.1, 10.5, 13.0, 12.1)),
+        1.5, 0.0, 0.0, 0.0, 1.5, 0.0, 0.0, 0.0);
+    ASSERT_VEC8_EQUAL(blend(v1, v2,
+        fvec8(1.0, 1.5, 3.0, 2.2, 10.0, 10.5, 13.0, 12.2)>fvec8(1.1, 1.5, 3.0, 2.1, 10.1, 10.5, 13.0, 12.1)),
+        0.0, 0.0, 0.0, 1.5, 0.0, 0.0, 0.0, 1.5);
+    ASSERT_VEC8_EQUAL(blend(v1, v2,
+        fvec8(1.0, 1.5, 3.0, 2.2, 10.0, 10.5, 13.0, 12.2)<=fvec8(1.1, 1.5, 3.0, 2.1, 10.1, 10.5, 13.0, 12.1)),
+        1.5, 1.5, 1.5, 0.0, 1.5, 1.5, 1.5, 0.0);
+    ASSERT_VEC8_EQUAL(blend(v1, v2,
+        fvec8(1.0, 1.5, 3.0, 2.2, 10.0, 10.5, 13.0, 12.2)>=fvec8(1.1, 1.5, 3.0, 2.1, 10.1, 10.5, 13.0, 12.1)),
+        0.0, 1.5, 1.5, 1.5, 0.0, 1.5, 1.5, 1.5);
+}
+
+void testMathFunctions() {
+    fvec8 f1(0.4, 1.9, -1.2, -3.8, 0.4, 1.9, -1.2, -3.8);
+    fvec8 f2(1.1, 1.2, 1.3, -5.0, 1.1, 1.2, 1.3, -5.0);
+    ASSERT_VEC8_EQUAL(floor(f1), 0.0, 1.0, -2.0, -4.0, 0.0, 1.0, -2.0, -4.0);
+    ASSERT_VEC8_EQUAL(ceil(f1), 1.0, 2.0, -1.0, -3.0, 1.0, 2.0, -1.0, -3.0);
+    ASSERT_VEC8_EQUAL(round(f1), 0.0, 2.0, -1.0, -4.0, 0.0, 2.0, -1.0, -4.0);
+    ASSERT_VEC8_EQUAL(abs(f1), 0.4, 1.9, 1.2, 3.8, 0.4, 1.9, 1.2, 3.8);
+    ASSERT_VEC8_EQUAL(min(f1, f2), 0.4, 1.2, -1.2, -5.0, 0.4, 1.2, -1.2, -5.0);
+    ASSERT_VEC8_EQUAL(max(f1, f2), 1.1, 1.9, 1.3, -3.8, 1.1, 1.9, 1.3, -3.8);
+    ASSERT_VEC8_EQUAL(sqrt(fvec8(1.5, 3.1, 4.0, 15.0, 1.5, 3.1, 4.0, 15.0)), sqrt(1.5), sqrt(3.1), sqrt(4.0), sqrt(15.0), sqrt(1.5), sqrt(3.1), sqrt(4.0), sqrt(15.0));
+    ASSERT_VEC8_EQUAL(rsqrt(fvec8(1.5, 3.1, 4.0, 15.0, 1.5, 3.1, 4.0, 15.0)), 1.0/sqrt(1.5), 1.0/sqrt(3.1), 1.0/sqrt(4.0), 1.0/sqrt(15.0), 1.0/sqrt(1.5), 1.0/sqrt(3.1), 1.0/sqrt(4.0), 1.0/sqrt(15.0));
+    ASSERT_EQUAL_TOL(f1.lowerVec()[0]*f2.lowerVec()[0]+f1.lowerVec()[1]*f2.lowerVec()[1]+f1.lowerVec()[2]*f2.lowerVec()[2]+f1.lowerVec()[3]*f2.lowerVec()[3]+f1.upperVec()[0]*f2.upperVec()[0]+f1.upperVec()[1]*f2.upperVec()[1]+f1.upperVec()[2]*f2.upperVec()[2]+f1.upperVec()[3]*f2.upperVec()[3], dot8(f1, f2), 1e-6);
+    ASSERT(any(f1 > 0.5));
+    ASSERT(!any(f1 > 2.0));
+    ASSERT_VEC8_EQUAL(blend(f1, f2, ivec8(-1, 0, -1, 0, -1, 0, -1, 0)), 1.1, 1.9, 1.3, -3.8, 1.1, 1.9, 1.3, -3.8);
+}
+
+void testTranspose() {
+    fvec4 f1(0.0,   1.0,  2.0,  3.0);
+    fvec4 f2(10.0, 11.0, 12.0, 13.0);
+    fvec4 f3(20.0, 21.0, 22.0, 23.0);
+    fvec4 f4(30.0, 31.0, 32.0, 33.0);
+    fvec4 f5(40.0, 41.0, 42.0, 43.0);
+    fvec4 f6(50.0, 51.0, 52.0, 53.0);
+    fvec4 f7(60.0, 61.0, 62.0, 63.0);
+    fvec4 f8(70.0, 71.0, 72.0, 73.0);
+    fvec8 o1, o2, o3, o4;
+
+    transpose(f1, f2, f3, f4, f5, f6, f7, f8, o1, o2, o3, o4);
+    ASSERT_VEC8_EQUAL(o1, 0.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0);
+    ASSERT_VEC8_EQUAL(o2, 1.0, 11.0, 21.0, 31.0, 41.0, 51.0, 61.0, 71.0);
+    ASSERT_VEC8_EQUAL(o3, 2.0, 12.0, 22.0, 32.0, 42.0, 52.0, 62.0, 72.0);
+    ASSERT_VEC8_EQUAL(o4, 3.0, 13.0, 23.0, 33.0, 43.0, 53.0, 63.0, 73.0);
+
+    fvec4 g1, g2, g3, g4, g5, g6, g7, g8;
+    transpose(o1, o2, o3, o4, g1, g2, g3, g4, g5, g6, g7, g8);
+    ASSERT_VEC4_EQUAL(g1, 0.0,   1.0,  2.0,  3.0);
+    ASSERT_VEC4_EQUAL(g2, 10.0, 11.0, 12.0, 13.0);
+    ASSERT_VEC4_EQUAL(g3, 20.0, 21.0, 22.0, 23.0);
+    ASSERT_VEC4_EQUAL(g4, 30.0, 31.0, 32.0, 33.0);
+    ASSERT_VEC4_EQUAL(g5, 40.0, 41.0, 42.0, 43.0);
+    ASSERT_VEC4_EQUAL(g6, 50.0, 51.0, 52.0, 53.0);
+    ASSERT_VEC4_EQUAL(g7, 60.0, 61.0, 62.0, 63.0);
+    ASSERT_VEC4_EQUAL(g8, 70.0, 71.0, 72.0, 73.0);
+
+}
+
+int main(int argc, char* argv[]) {
+    try {
+        if (!isVec8Supported()) {
+            cout << "CPU is not supported.  Exiting." << endl;
+            return 0;
+        }
+        testLoadStore();
+        testArithmetic();
+        testLogic();
+        testComparisons();
+        testMathFunctions();
+        testTranspose();
+    }
+    catch(const exception& e) {
+        cout << "exception: " << e.what() << endl;
+        return 1;
+    }
+    cout << "Done" << endl;
+    return 0;
+}
--- a/wrappers/python/setup.py
+++ b/wrappers/python/setup.py
-__author__ = "Peter Eastman"
-__version__ = "1.0"
-
+"""
+setup.py: Used for building python wrappers for Simbios' OpenMM library.
+"""
 import ast
 import re
 import os
@@ -13,6 +13,8 @@ MINOR_VERSION_NUM='@OPENMM_MINOR_VERSION@'
 BUILD_INFO='@OPENMM_BUILD_VERSION@'
 IS_RELEASED = False

+__author__ = "Peter Eastman"
+__version__ = "%s.%s" % (MAJOR_VERSION_NUM, MINOR_VERSION_NUM)

 def reportError(message):
    sys.stdout.write("ERROR: ")