merge

1ebe88ba · Robert McGibbon · 804cbb22 · a37dbc96 · 1ebe88ba · 1ebe88ba
Commit 1ebe88ba authored Jul 24, 2015 by Robert McGibbon
20 changed files
--- a/devtools/packaging/manifests/source/manifest.txt
+++ b/devtools/packaging/manifests/source/manifest.txt
-cmake_modules
+openmm/cmake_modules
-CMakeLists.txt
+openmm/CMakeLists.txt
-docs-source
+openmm/docs-source
-examples
+install/docs
-libraries
+openmm/examples
-olla
+openmm/libraries
-openmmapi
+openmm/olla
-platforms
+openmm/openmmapi
-plugins
+openmm/platforms
-serialization
+openmm/plugins
-tests
+openmm/serialization
-wrappers
+openmm/tests
+openmm/wrappers
--- a/devtools/packaging/scripts/linux/build.sh
+++ b/devtools/packaging/scripts/linux/build.sh
@@ -24,8 +24,8 @@ CMAKE_FLAGS+=" -DCUDA_NVCC_EXECUTABLE=/usr/local/cuda-7.0/bin/nvcc"
 CMAKE_FLAGS+=" -DCUDA_SDK_ROOT_DIR=/usr/local/cuda-7.0/"
 CMAKE_FLAGS+=" -DCUDA_TOOLKIT_INCLUDE=/usr/local/cuda-7.0/include"
 CMAKE_FLAGS+=" -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-7.0/"
-CMAKE_FLAGS+=" -DOPENCL_INCLUDE_DIR=/usr/local/cuda-7.0/include"
+CMAKE_FLAGS+=" -DOPENCL_INCLUDE_DIR=/opt/AMDAPPSDK-2.9-1/include/"
-CMAKE_FLAGS+=" -DOPENCL_LIBRARY=/usr/local/cuda-7.0/lib64/libOpenCL.so"
+CMAKE_FLAGS+=" -DOPENCL_LIBRARY=/opt/AMDAPPSDK-2.9-1/lib/x86_64/libOpenCL.so"
 # Set location for FFTW3
 PREFIX="$WORKSPACE/miniconda"

--- a/devtools/packaging/scripts/linux/prepare.sh
+++ b/devtools/packaging/scripts/linux/prepare.sh
@@ -6,7 +6,7 @@
 export WORKSPACE=`pwd`
 # Install miniconda
-export VERSION="3.7.0"
+export VERSION="Latest"
 export PLATFORM="Linux"
 export ARCH="x86_64"
 export MINICONDA="Miniconda-$VERSION-$PLATFORM-$ARCH.sh"

--- a/devtools/packaging/scripts/osx/prepare.sh
+++ b/devtools/packaging/scripts/osx/prepare.sh
@@ -6,7 +6,7 @@
 export WORKSPACE=`pwd`
 # Install miniconda
-export VERSION="3.7.0"
+export VERSION="Latest"
 export PLATFORM="MacOSX"
 export ARCH="x86_64"
 export MINICONDA="Miniconda-$VERSION-$PLATFORM-$ARCH.sh"

--- a/devtools/packaging/scripts/source/build.sh
+++ b/devtools/packaging/scripts/source/build.sh
+#!/bin/bash
+# Build script for Linux distribution, for use in automated packaging.
+# Note that this must be run from outside the checked-out openmm/ directory.
+# Set relative workspace path.
+export WORKSPACE=`pwd`
+# Add conda binaries to path.
+PATH=$WORKSPACE/miniconda/bin:$PATH
+INSTALL=`pwd`/install
+if [ -e $INSTALL ]; then
+    rm -rf $INSTALL
+fi
+CMAKE_FLAGS="-DCMAKE_INSTALL_PREFIX=$INSTALL"
+# setting the rpath so that libOpenMMPME.so finds the right libfftw3
+#CMAKE_FLAGS+=" -DCMAKE_INSTALL_RPATH=.."
+CMAKE_FLAGS+=" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++"
+CMAKE_FLAGS+=" -DOPENMM_BUILD_AMOEBA_CUDA_LIB=OFF"
+CMAKE_FLAGS+=" -DOPENMM_BUILD_CPU_LIB=OFF"
+CMAKE_FLAGS+=" -DOPENMM_BUILD_CUDA_COMPILER_PLUGIN=OFF"
+CMAKE_FLAGS+=" -DOPENMM_BUILD_CUDA_LIB=OFF"
+CMAKE_FLAGS+=" -DOPENMM_BUILD_DRUDE_CUDA_LIB=OFF"
+CMAKE_FLAGS+=" -DOPENMM_BUILD_DRUDE_OPENCL_LIB=OFF"
+CMAKE_FLAGS+=" -DOPENMM_BUILD_OPENCL_LIB=OFF"
+CMAKE_FLAGS+=" -DOPENMM_BUILD_RPMD_CUDA_LIB=OFF"
+CMAKE_FLAGS+=" -DOPENMM_BUILD_RPMD_OPENCL_LIB=OFF"
+# Set location for FFTW3
+#PREFIX="$WORKSPACE/miniconda"
+#CMAKE_FLAGS+=" -DFFTW_INCLUDES=$PREFIX/include"
+#CMAKE_FLAGS+=" -DFFTW_LIBRARY=$PREFIX/lib/libfftw3f.so"
+#CMAKE_FLAGS+=" -DFFTW_THREADS_LIBRARY=$PREFIX/lib/libfftw3f_threads.so"
+# Build in subdirectory.
+if [ -e build ]; then
+    rm -rf build
+fi
+mkdir build
+cd build
+cmake ../openmm $CMAKE_FLAGS
+make -j4 all DoxygenApiDocs sphinxpdf
+# Install.
+make install
--- a/devtools/packaging/scripts/source/package.sh
+++ b/devtools/packaging/scripts/source/package.sh
+#!/bin/bash
+# Packaging script for Linux distribution, for use in automated packaging.
+# Note that this must be run from outside the checked-out openmm/ directory.
+# CONFIGURE HERE
+export PACKAGE_DIR="packaging" # directory to stuff packaged source distribution
+export VERSION=$(sed -nr "s/OPENMM_VERSION:STRING=(.*)/\1/p" build/CMakeCache.txt)
+export PACKAGE_SUBDIR="OpenMM-${VERSION}-Source" # directory where distribution will be unpacked
+export DISTRO_PREFIX="OpenMM-${VERSION}-Source" # prefix for source distribution (e.g. ${DISTRIBUTION_NAME}.zip)
+# Perform all work in a work directory.
+cd work
+# Clean up.
+rm -rf $PACKAGE_DIR
+# Make a directory to contain packaged source distribution
+mkdir $PACKAGE_DIR
+mkdir $PACKAGE_DIR/$PACKAGE_SUBDIR
+for filename in $( cat openmm/devtools/packaging/manifests/source/manifest.txt ); do
+   CMD="cp -r $filename $PACKAGE_DIR/$PACKAGE_SUBDIR"
+   echo $CMD
+   `$CMD`
+done
+# Add the install.sh script
+#CMD="cp -r openmm/devtools/packaging/install.sh $PACKAGE_DIR/$PACKAGE_SUBDIR"
+#echo $CMD
+#`$CMD`
+# Make Python source distribution.
+echo "Building Python source distribution..."
+pushd .
+cd build
+make PythonSdist
+cd python/dist
+tar zxf OpenMM-${VERSION}.tar.gz
+mv OpenMM-${VERSION} python
+popd
+cp -r build/python/dist/python $PACKAGE_DIR/$PACKAGE_SUBDIR
+# Create archives.
+cd $PACKAGE_DIR
+mkdir compressed
+tar zcf compressed/${DISTRO_PREFIX}.tgz $PACKAGE_SUBDIR
+zip -r compressed/${DISTRO_PREFIX}.zip $PACKAGE_SUBDIR
+cd ..
--- a/devtools/packaging/scripts/source/prepare.sh
+++ b/devtools/packaging/scripts/source/prepare.sh
+#!/bin/tcsh
+# Prepare for build by ensuring necessary prerequisites are locally installed.
+# Set relative workspace path.
+export WORKSPACE=`pwd`
+# Install miniconda
+export VERSION="Latest"
+export PLATFORM="Linux"
+export ARCH="x86_64"
+export MINICONDA="Miniconda-$VERSION-$PLATFORM-$ARCH.sh"
+if [ -f miniconda ];
+then
+   echo "miniconda already exists"
+else
+   echo "Downloading miniconda..."
+   rm -rf Miniconda-*
+   wget --quiet http://repo.continuum.io/miniconda/${MINICONDA}
+   bash ${MINICONDA} -b -p miniconda
+   PIP_ARGS="-U"
+fi
+# Add to path.
+export PATH=$WORKSPACE/miniconda/bin:$PATH
+# Ensure configuration is up to date.
+conda config --add channels http://conda.binstar.org/omnia
+conda install --yes --quiet swig fftw3f pip
+pip install sphinxcontrib-bibtex
--- a/docs-source/DoxyfileC++.in
+++ b/docs-source/DoxyfileC++.in
@@ -92,6 +92,7 @@ EXCLUDE_SYMLINKS       = NO
 EXCLUDE_PATTERNS       = */tests/* \
                         */openmmapi/src/* \
                         */.svn/* \
+                         */internal/* \
                         */olla/include/openmm/kernels.h \
                         */DrudeKernels.h \
                         */RpmdKernels.h \

--- a/openmmapi/include/openmm/internal/vectorize8.h
+++ b/openmmapi/include/openmm/internal/vectorize8.h
@@ -45,7 +45,7 @@ class ivec8;
 class fvec8 {
 public:
    __m256 val;
    fvec8() {}
    fvec8(float v) : val(_mm256_set1_ps(v)) {}
    fvec8(float v1, float v2, float v3, float v4, float v5, float v6, float v7, float v8) : val(_mm256_set_ps(v8, v7, v6, v5, v4, v3, v2, v1)) {}
@@ -123,7 +123,7 @@ public:
 class ivec8 {
 public:
    __m256i val;
    ivec8() {}
    ivec8(int v) : val(_mm256_set1_epi32(v)) {}
    ivec8(int v1, int v2, int v3, int v4, int v5, int v6, int v7, int v8) : val(_mm256_set_epi32(v8, v7, v6, v5, v4, v3, v2, v1)) {}

--- a/platforms/cpu/CMakeLists.txt
+++ b/platforms/cpu/CMakeLists.txt
@@ -20,6 +20,23 @@ ENDIF(BUILD_TESTING)
 # this CMakeLists file rather than letting CMake visit them as SUBDIRS.
 SET(OPENMM_SOURCE_SUBDIRS .)
+IF(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+  set(CMAKE_COMPILER_IS_CLANGXX 1)
+ENDIF(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+IF(CMAKE_COMPILER_IS_CLANGXX)
+  EXECUTE_PROCESS( COMMAND ${CMAKE_CXX_COMPILER} --version OUTPUT_VARIABLE __clang_full_version_string )
+  string (REGEX REPLACE ".*clang version ([0-9]+\\.[0-9]+).*" "\\1" CLANG_VERSION_STRING ${__clang_full_version_string})
+  unset(__clang_full_version_string)
+ENDIF(CMAKE_COMPILER_IS_CLANGXX)
+IF(CMAKE_COMPILER_IS_CLANGXX AND CLANG_VERSION_STRING STREQUAL "3.6")
+ message(FATAL_ERROR  "The OpenMM CPU platform cannot be built with your current compiler, clang-3.6, due to bugs \
+in the compiler's AVX support. Either downgrade to clang-3.5, upgrade to clang-3.7 or later, switch to an \
+alternative compiler such as GCC, or turn off building of the CPU platform by unsetting the CMake variable \
+OPENMM_BUILD_CPU_LIB.")
+ENDIF(CMAKE_COMPILER_IS_CLANGXX AND CLANG_VERSION_STRING STREQUAL "3.6")
 # Collect up information about the version of the OpenMM library we're building
 # and make it available to the code so it can be built into the binaries.

--- a/platforms/cpu/tests/TestCpuCustomNonbondedForce.cpp
+++ b/platforms/cpu/tests/TestCpuCustomNonbondedForce.cpp
@@ -7,7 +7,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2014 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2015 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -900,6 +900,62 @@ void testInteractionGroupLongRangeCorrection() {
    ASSERT_EQUAL_TOL(expected, energy2-energy1, 1e-4);
 }
+void testMultipleCutoffs() {
+    System system;
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    VerletIntegrator integrator(0.01);
+    // Add multiple nonbonded forces that have different cutoffs.
+    CustomNonbondedForce* nonbonded1 = new CustomNonbondedForce("2*r");
+    nonbonded1->addParticle(vector<double>());
+    nonbonded1->addParticle(vector<double>());
+    nonbonded1->setNonbondedMethod(CustomNonbondedForce::CutoffNonPeriodic);
+    nonbonded1->setCutoffDistance(2.5);
+    system.addForce(nonbonded1);
+    CustomNonbondedForce* nonbonded2 = new CustomNonbondedForce("3*r");
+    nonbonded2->addParticle(vector<double>());
+    nonbonded2->addParticle(vector<double>());
+    nonbonded2->setNonbondedMethod(CustomNonbondedForce::CutoffNonPeriodic);
+    nonbonded2->setCutoffDistance(2.9);
+    nonbonded2->setForceGroup(1);
+    system.addForce(nonbonded2);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(2);
+    positions[0] = Vec3(0, 0, 0);
+    positions[1] = Vec3(0, 0, 0);
+    for (double r = 2.4; r < 3.2; r += 0.2) {
+        positions[1][1] = r;
+        context.setPositions(positions);
+        double e1 = (r < 2.5 ? 2.0*r : 0.0);
+        double e2 = (r < 2.9 ? 3.0*r : 0.0);
+        double f1 = (r < 2.5 ? 2.0 : 0.0);
+        double f2 = (r < 2.9 ? 3.0 : 0.0);
+        // Check the first force.
+        State state = context.getState(State::Forces | State::Energy, false, 1);
+        ASSERT_EQUAL_VEC(Vec3(0, f1, 0), state.getForces()[0], TOL);
+        ASSERT_EQUAL_VEC(Vec3(0, -f1, 0), state.getForces()[1], TOL);
+        ASSERT_EQUAL_TOL(e1, state.getPotentialEnergy(), TOL);
+        // Check the second force.
+        state = context.getState(State::Forces | State::Energy, false, 2);
+        ASSERT_EQUAL_VEC(Vec3(0, f2, 0), state.getForces()[0], TOL);
+        ASSERT_EQUAL_VEC(Vec3(0, -f2, 0), state.getForces()[1], TOL);
+        ASSERT_EQUAL_TOL(e2, state.getPotentialEnergy(), TOL);
+        // Check the sum of both forces.
+        state = context.getState(State::Forces | State::Energy);
+        ASSERT_EQUAL_VEC(Vec3(0, f1+f2, 0), state.getForces()[0], TOL);
+        ASSERT_EQUAL_VEC(Vec3(0, -f1-f2, 0), state.getForces()[1], TOL);
+        ASSERT_EQUAL_TOL(e1+e2, state.getPotentialEnergy(), TOL);
+    }
+}
 int main() {
    try {
        if (!CpuPlatform::isProcessorSupported()) {
@@ -924,6 +980,7 @@ int main() {
        testInteractionGroups();
        testLargeInteractionGroup();
        testInteractionGroupLongRangeCorrection();
+        testMultipleCutoffs();
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;

--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -752,7 +752,7 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const GBSAOBCForce& force);
 private:
-    double prefactor, surfaceAreaFactor;
+    double prefactor, surfaceAreaFactor, cutoff;
    bool hasCreatedKernels;
    int maxTiles;
    CudaContext& cu;
@@ -802,6 +802,7 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CustomGBForce& force);
 private:
+    double cutoff;
    bool hasInitializedKernels, needParameterGradient;
    int maxTiles, numComputedValues;
    CudaContext& cu;

--- a/platforms/cuda/include/CudaNonbondedUtilities.h
+++ b/platforms/cuda/include/CudaNonbondedUtilities.h
@@ -129,31 +129,17 @@ public:
        return forceThreadBlockSize;
    }
    /**
-     * Get the cutoff distance.
+     * Get the maximum cutoff distance used by any force group.
     */
-    double getCutoffDistance() {
+    double getMaxCutoffDistance();
-        return cutoff;
-    }
-    /**
-     * Get whether any interactions have been added.
-     */
-    bool getHasInteractions() {
-        return cutoff != -1.0;
-    }
-    /**
-     * Get the force group in which nonbonded interactions should be computed.
-     */
-    int getForceGroup() {
-        return nonbondedForceGroup;
-    }
    /**
     * Prepare to compute interactions.  This updates the neighbor list.
     */
-    void prepareInteractions();
+    void prepareInteractions(int forceGroups);
    /**
     * Compute the nonbonded interactions.
     */
-    void computeInteractions();
+    void computeInteractions(int forceGroups);
    /**
     * Check to see if the neighbor list arrays are large enough, and make them bigger if necessary.
     */
@@ -246,16 +232,20 @@ public:
     * @param arguments     arrays (other than per-atom parameters) that should be passed as arguments to the kernel
     * @param useExclusions specifies whether exclusions are applied to this interaction
     * @param isSymmetric   specifies whether the interaction is symmetric
+     * @param groups        the set of force groups this kernel is for
     */
-    CUfunction createInteractionKernel(const std::string& source, std::vector<ParameterInfo>& params, std::vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric);
+    CUfunction createInteractionKernel(const std::string& source, std::vector<ParameterInfo>& params, std::vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric, int groups);
+    /**
+     * Create the set of kernels that will be needed for a particular combination of force groups.
+     * 
+     * @param groups    the set of force groups
+     */
+    void createKernelsForGroups(int groups);
 private:
+    class KernelSet;
    class BlockSortTrait;
    CudaContext& context;
-    CUfunction forceKernel;
+    std::map<int, KernelSet> groupKernels;
-    CUfunction findBlockBoundsKernel;
-    CUfunction sortBoxDataKernel;
-    CUfunction findInteractingBlocksKernel;
-    CUfunction findInteractionsWithinBlocksKernel;
    CudaArray* exclusionTiles;
    CudaArray* exclusions;
    CudaArray* exclusionIndices;
@@ -275,11 +265,26 @@ private:
    std::vector<std::vector<int> > atomExclusions;
    std::vector<ParameterInfo> parameters;
    std::vector<ParameterInfo> arguments;
-    std::string kernelSource;
+    std::map<int, double> groupCutoff;
-    std::map<std::string, std::string> kernelDefines;
+    std::map<int, std::string> groupKernelSource;
-    double cutoff;
+    double lastCutoff;
    bool useCutoff, usePeriodic, anyExclusions, usePadding, forceRebuildNeighborList;
-    int startTileIndex, numTiles, startBlockIndex, numBlocks, maxTiles, numForceThreadBlocks, forceThreadBlockSize, nonbondedForceGroup, numAtoms;
+    int startTileIndex, numTiles, startBlockIndex, numBlocks, maxTiles, maxExclusions, numForceThreadBlocks, forceThreadBlockSize, numAtoms, groupFlags;
+};
+/**
+ * This class stores the kernels to execute for a set of force groups.
+ */
+class CudaNonbondedUtilities::KernelSet {
+public:
+    bool hasForces;
+    double cutoffDistance;
+    CUfunction forceKernel;
+    CUfunction findBlockBoundsKernel;
+    CUfunction sortBoxDataKernel;
+    CUfunction findInteractingBlocksKernel;
+    CUfunction findInteractionsWithinBlocksKernel;
 };
 /**

--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -49,6 +49,7 @@
 #include <fstream>
 #include <iomanip>
 #include <iostream>
+#include <set>
 #include <sstream>
 #include <typeinfo>
 #include <cudaProfiler.h>
@@ -405,13 +406,32 @@ void CudaContext::setAsCurrent() {
 }
 string CudaContext::replaceStrings(const string& input, const std::map<std::string, std::string>& replacements) const {
+    static set<char> symbolChars;
+    if (symbolChars.size() == 0) {
+        symbolChars.insert('_');
+        for (char c = 'a'; c <= 'z'; c++)
+            symbolChars.insert(c);
+        for (char c = 'A'; c <= 'Z'; c++)
+            symbolChars.insert(c);
+        for (char c = '0'; c <= '9'; c++)
+            symbolChars.insert(c);
+    }
    string result = input;
    for (map<string, string>::const_iterator iter = replacements.begin(); iter != replacements.end(); iter++) {
-        int index = -1;
+        int index = 0;
+        int size = iter->first.size();
        do {
-            index = result.find(iter->first);
+            index = result.find(iter->first, index);
-            if (index != result.npos)
+            if (index != result.npos) {
-                result.replace(index, iter->first.size(), iter->second);
+                if ((index == 0 || symbolChars.find(result[index-1]) == symbolChars.end()) && (index == result.size()-size || symbolChars.find(result[index+size]) == symbolChars.end())) {
+                    // We have found a complete symbol, not part of a longer symbol.
+                    result.replace(index, size, iter->second);
+                    index += iter->second.size();
+                }
+                else
+                    index++;
+            }
        } while (index != result.npos);
    }
    return result;
@@ -1216,7 +1236,7 @@ void CudaContext::reorderAtomsImpl() {
        if (useHilbert)
            binWidth = (Real) (max(max(maxx-minx, maxy-miny), maxz-minz)/255.0);
        else
-            binWidth = (Real) (0.2*nonbonded->getCutoffDistance());
+            binWidth = (Real) (0.2*nonbonded->getMaxCutoffDistance());
        Real invBinWidth = (Real) (1.0/binWidth);
        int xbins = 1 + (int) ((maxx-minx)*invBinWidth);
        int ybins = 1 + (int) ((maxy-miny)*invBinWidth);

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -98,16 +98,13 @@ void CudaCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool
    for (vector<CudaContext::ForcePreComputation*>::iterator iter = cu.getPreComputations().begin(); iter != cu.getPreComputations().end(); ++iter)
        (*iter)->computeForceAndEnergy(includeForces, includeEnergy, groups);
    CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
-    bool includeNonbonded = ((groups&(1<<nb.getForceGroup())) != 0);
    cu.setComputeForceCount(cu.getComputeForceCount()+1);
-    if (includeNonbonded)
+    nb.prepareInteractions(groups);
-        nb.prepareInteractions();
 }
 double CudaCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForces, bool includeEnergy, int groups, bool& valid) {
    cu.getBondedUtilities().computeInteractions(groups);
-    if ((groups&(1<<cu.getNonbondedUtilities().getForceGroup())) != 0)
+    cu.getNonbondedUtilities().computeInteractions(groups);
-        cu.getNonbondedUtilities().computeInteractions();
    double sum = 0.0;
    for (vector<CudaContext::ForcePostComputation*>::iterator iter = cu.getPostComputations().begin(); iter != cu.getPostComputations().end(); ++iter)
        sum += (*iter)->computeForceAndEnergy(includeForces, includeEnergy, groups);
@@ -2570,8 +2567,9 @@ void CudaCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOBCF
    surfaceAreaFactor = -6.0*4*M_PI*force.getSurfaceAreaEnergy();
    bool useCutoff = (force.getNonbondedMethod() != GBSAOBCForce::NoCutoff);
    bool usePeriodic = (force.getNonbondedMethod() != GBSAOBCForce::NoCutoff && force.getNonbondedMethod() != GBSAOBCForce::CutoffNonPeriodic);
+    cutoff = force.getCutoffDistance();
    string source = CudaKernelSources::gbsaObc2;
-    nb.addInteraction(useCutoff, usePeriodic, false, force.getCutoffDistance(), vector<vector<int> >(), source, force.getForceGroup());
+    nb.addInteraction(useCutoff, usePeriodic, false, cutoff, vector<vector<int> >(), source, force.getForceGroup());
    nb.addParameter(CudaNonbondedUtilities::ParameterInfo("obcParams", "float", 2, sizeof(float2), params->getDevicePointer()));;
    nb.addParameter(CudaNonbondedUtilities::ParameterInfo("bornForce", "long long", 1, sizeof(long long), bornForce->getDevicePointer()));;
    cu.addForce(new CudaGBSAOBCForceInfo(force));
@@ -2591,8 +2589,8 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
            defines["USE_PERIODIC"] = "1";
        if (cu.getComputeCapability() >= 3.0 && !cu.getUseDoublePrecision())
            defines["ENABLE_SHUFFLE"] = "1";
-        defines["CUTOFF_SQUARED"] = cu.doubleToString(nb.getCutoffDistance()*nb.getCutoffDistance());
+        defines["CUTOFF_SQUARED"] = cu.doubleToString(cutoff*cutoff);
-        defines["CUTOFF"] = cu.doubleToString(nb.getCutoffDistance());
+        defines["CUTOFF"] = cu.doubleToString(cutoff);
        defines["PREFACTOR"] = cu.doubleToString(prefactor);
        defines["SURFACE_AREA_FACTOR"] = cu.doubleToString(surfaceAreaFactor);
        defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
@@ -2764,6 +2762,7 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
    cu.setAsCurrent();
    if (cu.getPlatformData().contexts.size() > 1)
        throw OpenMMException("CustomGBForce does not support using multiple CUDA devices");
+    cutoff = force.getCutoffDistance();
    bool useExclusionsForValue = false;
    numComputedValues = force.getNumComputedValues();
    vector<string> computedValueNames(force.getNumComputedValues());
@@ -2956,7 +2955,7 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
            pairValueDefines["NEED_PADDING"] = "1";
        pairValueDefines["WARPS_PER_GROUP"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize()/CudaContext::TileSize);
        pairValueDefines["THREAD_BLOCK_SIZE"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize());
-        pairValueDefines["CUTOFF_SQUARED"] = cu.doubleToString(force.getCutoffDistance()*force.getCutoffDistance());
+        pairValueDefines["CUTOFF_SQUARED"] = cu.doubleToString(cutoff*cutoff);
        pairValueDefines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
        pairValueDefines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
        pairValueDefines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks());
@@ -3123,7 +3122,7 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
            pairEnergyDefines["NEED_PADDING"] = "1";
        pairEnergyDefines["THREAD_BLOCK_SIZE"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize());
        pairEnergyDefines["WARPS_PER_GROUP"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize()/CudaContext::TileSize);
-        pairEnergyDefines["CUTOFF_SQUARED"] = cu.doubleToString(force.getCutoffDistance()*force.getCutoffDistance());
+        pairEnergyDefines["CUTOFF_SQUARED"] = cu.doubleToString(cutoff*cutoff);
        pairEnergyDefines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
        pairEnergyDefines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
        pairEnergyDefines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks());
@@ -3367,7 +3366,7 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
            globals->upload(globalParamValues);
            arguments.push_back(CudaNonbondedUtilities::ParameterInfo(prefix+"globals", "float", 1, sizeof(float), globals->getDevicePointer()));
        }
-        cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, force.getNumExclusions() > 0, force.getCutoffDistance(), exclusionList, source, force.getForceGroup());
+        cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, force.getNumExclusions() > 0, cutoff, exclusionList, source, force.getForceGroup());
        for (int i = 0; i < (int) parameters.size(); i++)
            cu.getNonbondedUtilities().addParameter(parameters[i]);
        for (int i = 0; i < (int) arguments.size(); i++)
@@ -3393,7 +3392,7 @@ double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeFo
            int endExclusionIndex = (cu.getContextIndex()+1)*numExclusionTiles/numContexts;
            pairValueDefines["FIRST_EXCLUSION_TILE"] = cu.intToString(startExclusionIndex);
            pairValueDefines["LAST_EXCLUSION_TILE"] = cu.intToString(endExclusionIndex);
-            pairValueDefines["CUTOFF"] = cu.doubleToString(nb.getCutoffDistance());
+            pairValueDefines["CUTOFF"] = cu.doubleToString(cutoff);
            CUmodule module = cu.createModule(CudaKernelSources::vectorOps+pairValueSrc, pairValueDefines);
            pairValueKernel = cu.getKernel(module, "computeN2Value");
            pairValueSrc = "";
@@ -3407,7 +3406,7 @@ double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeFo
            int endExclusionIndex = (cu.getContextIndex()+1)*numExclusionTiles/numContexts;
            pairEnergyDefines["FIRST_EXCLUSION_TILE"] = cu.intToString(startExclusionIndex);
            pairEnergyDefines["LAST_EXCLUSION_TILE"] = cu.intToString(endExclusionIndex);
-            pairEnergyDefines["CUTOFF"] = cu.doubleToString(nb.getCutoffDistance());
+            pairEnergyDefines["CUTOFF"] = cu.doubleToString(cutoff);
            CUmodule module = cu.createModule(CudaKernelSources::vectorOps+pairEnergySrc, pairEnergyDefines);
            pairEnergyKernel = cu.getKernel(module, "computeN2Energy");
            pairEnergySrc = "";

--- a/platforms/cuda/src/CudaNonbondedUtilities.cpp
+++ b/platforms/cuda/src/CudaNonbondedUtilities.cpp
@@ -62,10 +62,10 @@ private:
    bool useDouble;
 };
-CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(context), cutoff(-1.0), useCutoff(false), usePeriodic(false), anyExclusions(false), usePadding(true),
+CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(context), useCutoff(false), usePeriodic(false), anyExclusions(false), usePadding(true),
        exclusionIndices(NULL), exclusionRowIndices(NULL), exclusionTiles(NULL), exclusions(NULL), interactingTiles(NULL), interactingAtoms(NULL),
        interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), sortedBlocks(NULL), sortedBlockCenter(NULL), sortedBlockBoundingBox(NULL),
-        oldPositions(NULL), rebuildNeighborList(NULL), blockSorter(NULL), nonbondedForceGroup(0), forceRebuildNeighborList(true) {
+        oldPositions(NULL), rebuildNeighborList(NULL), blockSorter(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0) {
    // Decide how many thread blocks to use.
    string errorMessage = "Error initializing nonbonded utilities";
@@ -109,24 +109,28 @@ CudaNonbondedUtilities::~CudaNonbondedUtilities() {
 }
 void CudaNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup) {
-    if (cutoff != -1.0) {
+    if (groupCutoff.size() > 0) {
        if (usesCutoff != useCutoff)
            throw OpenMMException("All Forces must agree on whether to use a cutoff");
        if (usesPeriodic != usePeriodic)
            throw OpenMMException("All Forces must agree on whether to use periodic boundary conditions");
-        if (cutoffDistance != cutoff)
+        if (usesCutoff && groupCutoff.find(forceGroup) != groupCutoff.end() && groupCutoff[forceGroup] != cutoffDistance)
-            throw OpenMMException("All Forces must use the same cutoff distance");
+            throw OpenMMException("All Forces in a single force group must use the same cutoff distance");
-        if (forceGroup != nonbondedForceGroup)
-            throw OpenMMException("All nonbonded forces must be in the same force group");
    }
    if (usesExclusions)
        requestExclusions(exclusionList);
    useCutoff = usesCutoff;
    usePeriodic = usesPeriodic;
-    cutoff = cutoffDistance;
+    groupCutoff[forceGroup] = cutoffDistance;
-    if (kernel.size() > 0)
+    groupFlags |= 1<<forceGroup;
-        kernelSource += kernel+"\n";
+    if (kernel.size() > 0) {
-    nonbondedForceGroup = forceGroup;
+        if (groupKernelSource.find(forceGroup) == groupKernelSource.end())
+            groupKernelSource[forceGroup] = "";
+        map<string, string> replacements;
+        replacements["CUTOFF"] = "CUTOFF_"+context.intToString(forceGroup);
+        replacements["CUTOFF_SQUARED"] = "CUTOFF_"+context.intToString(forceGroup)+"_SQUARED";
+        groupKernelSource[forceGroup] += context.replaceStrings(kernel, replacements)+"\n";
+    }
 }
 void CudaNonbondedUtilities::addParameter(const ParameterInfo& parameter) {
@@ -213,6 +217,9 @@ void CudaNonbondedUtilities::initialize(const System& system) {
        exclusionIndicesVec.insert(exclusionIndicesVec.end(), exclusionBlocksForBlock[i].begin(), exclusionBlocksForBlock[i].end());
        exclusionRowIndicesVec[i+1] = exclusionIndicesVec.size();
    }
+    maxExclusions = 0;
+    for (int i = 0; i < (int) exclusionBlocksForBlock.size(); i++)
+        maxExclusions = (maxExclusions > exclusionBlocksForBlock[i].size() ? maxExclusions : exclusionBlocksForBlock[i].size());
    exclusionIndices = CudaArray::create<unsigned int>(context, exclusionIndicesVec.size(), "exclusionIndices");
    exclusionRowIndices = CudaArray::create<unsigned int>(context, exclusionRowIndicesVec.size(), "exclusionRowIndices");
    exclusionIndices->upload(exclusionIndicesVec);
@@ -270,29 +277,33 @@ void CudaNonbondedUtilities::initialize(const System& system) {
        interactionCount->upload(count);
    }
-    // Create kernels.
+    // Record arguments for kernels.
-    if (kernelSource.size() > 0)
+    forceArgs.push_back(&context.getForce().getDevicePointer());
-        forceKernel = createInteractionKernel(kernelSource, parameters, arguments, true, true);
+    forceArgs.push_back(&context.getEnergyBuffer().getDevicePointer());
+    forceArgs.push_back(&context.getPosq().getDevicePointer());
+    forceArgs.push_back(&exclusions->getDevicePointer());
+    forceArgs.push_back(&exclusionTiles->getDevicePointer());
+    forceArgs.push_back(&startTileIndex);
+    forceArgs.push_back(&numTiles);
+    if (useCutoff) {
+        forceArgs.push_back(&interactingTiles->getDevicePointer());
+        forceArgs.push_back(&interactionCount->getDevicePointer());
+        forceArgs.push_back(context.getPeriodicBoxSizePointer());
+        forceArgs.push_back(context.getInvPeriodicBoxSizePointer());
+        forceArgs.push_back(context.getPeriodicBoxVecXPointer());
+        forceArgs.push_back(context.getPeriodicBoxVecYPointer());
+        forceArgs.push_back(context.getPeriodicBoxVecZPointer());
+        forceArgs.push_back(&maxTiles);
+        forceArgs.push_back(&blockCenter->getDevicePointer());
+        forceArgs.push_back(&blockBoundingBox->getDevicePointer());
+        forceArgs.push_back(&interactingAtoms->getDevicePointer());
+    }
+    for (int i = 0; i < (int) parameters.size(); i++)
+        forceArgs.push_back(&parameters[i].getMemory());
+    for (int i = 0; i < (int) arguments.size(); i++)
+        forceArgs.push_back(&arguments[i].getMemory());
    if (useCutoff) {
-        double padding = (usePadding ? 0.1*cutoff : 0.0);
-        double paddedCutoff = cutoff+padding;
-        map<string, string> defines;
-        defines["TILE_SIZE"] = context.intToString(CudaContext::TileSize);
-        defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
-        defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms());
-        defines["PADDING"] = context.doubleToString(padding);
-        defines["PADDED_CUTOFF"] = context.doubleToString(paddedCutoff);
-        defines["PADDED_CUTOFF_SQUARED"] = context.doubleToString(paddedCutoff*paddedCutoff);
-        defines["NUM_TILES_WITH_EXCLUSIONS"] = context.intToString(exclusionTiles->getSize());
-        if (usePeriodic)
-            defines["USE_PERIODIC"] = "1";
-        int maxExclusions = 0;
-        for (int i = 0; i < (int) exclusionBlocksForBlock.size(); i++)
-            maxExclusions = (maxExclusions > exclusionBlocksForBlock[i].size() ? maxExclusions : exclusionBlocksForBlock[i].size());
-        defines["MAX_EXCLUSIONS"] = context.intToString(maxExclusions);
-        CUmodule interactingBlocksProgram = context.createModule(CudaKernelSources::vectorOps+CudaKernelSources::findInteractingBlocks, defines);
-        findBlockBoundsKernel = context.getKernel(interactingBlocksProgram, "findBlockBounds");
        findBlockBoundsArgs.push_back(&numAtoms);
        findBlockBoundsArgs.push_back(context.getPeriodicBoxSizePointer());
        findBlockBoundsArgs.push_back(context.getInvPeriodicBoxSizePointer());
@@ -304,7 +315,6 @@ void CudaNonbondedUtilities::initialize(const System& system) {
        findBlockBoundsArgs.push_back(&blockBoundingBox->getDevicePointer());
        findBlockBoundsArgs.push_back(&rebuildNeighborList->getDevicePointer());
        findBlockBoundsArgs.push_back(&sortedBlocks->getDevicePointer());
-        sortBoxDataKernel = context.getKernel(interactingBlocksProgram, "sortBoxData");
        sortBoxDataArgs.push_back(&sortedBlocks->getDevicePointer());
        sortBoxDataArgs.push_back(&blockCenter->getDevicePointer());
        sortBoxDataArgs.push_back(&blockBoundingBox->getDevicePointer());
@@ -315,7 +325,6 @@ void CudaNonbondedUtilities::initialize(const System& system) {
        sortBoxDataArgs.push_back(&interactionCount->getDevicePointer());
        sortBoxDataArgs.push_back(&rebuildNeighborList->getDevicePointer());
        sortBoxDataArgs.push_back(&forceRebuildNeighborList);
-        findInteractingBlocksKernel = context.getKernel(interactingBlocksProgram, "findBlocksWithInteractions");
        findInteractingBlocksArgs.push_back(context.getPeriodicBoxSizePointer());
        findInteractingBlocksArgs.push_back(context.getInvPeriodicBoxSizePointer());
        findInteractingBlocksArgs.push_back(context.getPeriodicBoxVecXPointer());
@@ -338,30 +347,48 @@ void CudaNonbondedUtilities::initialize(const System& system) {
    }
 }
-void CudaNonbondedUtilities::prepareInteractions() {
+double CudaNonbondedUtilities::getMaxCutoffDistance() {
+    double cutoff = 0.0;
+    for (map<int, double>::const_iterator iter = groupCutoff.begin(); iter != groupCutoff.end(); ++iter)
+        cutoff = max(cutoff, iter->second);
+    return cutoff;
+}
+void CudaNonbondedUtilities::prepareInteractions(int forceGroups) {
+    if ((forceGroups&groupFlags) == 0)
+        return;
+    if (groupKernels.find(forceGroups) == groupKernels.end())
+        createKernelsForGroups(forceGroups);
    if (!useCutoff)
        return;
    if (numTiles == 0)
        return;
+    KernelSet& kernels = groupKernels[forceGroups];
    if (usePeriodic) {
        double4 box = context.getPeriodicBoxSize();
-        double minAllowedSize = 1.999999*cutoff;
+        double minAllowedSize = 1.999999*kernels.cutoffDistance;
        if (box.x < minAllowedSize || box.y < minAllowedSize || box.z < minAllowedSize)
            throw OpenMMException("The periodic box size has decreased to less than twice the nonbonded cutoff.");
    }
    // Compute the neighbor list.
-    context.executeKernel(findBlockBoundsKernel, &findBlockBoundsArgs[0], context.getNumAtoms());
+    if (lastCutoff != kernels.cutoffDistance)
+        forceRebuildNeighborList = true;
+    context.executeKernel(kernels.findBlockBoundsKernel, &findBlockBoundsArgs[0], context.getNumAtoms());
    blockSorter->sort(*sortedBlocks);
-    context.executeKernel(sortBoxDataKernel, &sortBoxDataArgs[0], context.getNumAtoms());
+    context.executeKernel(kernels.sortBoxDataKernel, &sortBoxDataArgs[0], context.getNumAtoms());
-    context.executeKernel(findInteractingBlocksKernel, &findInteractingBlocksArgs[0], context.getNumAtoms(), 256);
+    context.executeKernel(kernels.findInteractingBlocksKernel, &findInteractingBlocksArgs[0], context.getNumAtoms(), 256);
    forceRebuildNeighborList = false;
+    lastCutoff = kernels.cutoffDistance;
 }
-void CudaNonbondedUtilities::computeInteractions() {
+void CudaNonbondedUtilities::computeInteractions(int forceGroups) {
-    if (kernelSource.size() > 0) {
+    if ((forceGroups&groupFlags) == 0)
-        context.executeKernel(forceKernel, &forceArgs[0], numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
+        return;
+    KernelSet& kernels = groupKernels[forceGroups];
+    if (kernels.hasForces) {
+        context.executeKernel(kernels.forceKernel, &forceArgs[0], numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
        if (context.getComputeForceCount() == 1)
            updateNeighborListSize(); // This is the first time step, so check whether our initial guess was large enough.
    }
@@ -411,7 +438,43 @@ void CudaNonbondedUtilities::setAtomBlockRange(double startFraction, double endF
    forceRebuildNeighborList = true;
 }
-CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, vector<ParameterInfo>& params, vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric) {
+void CudaNonbondedUtilities::createKernelsForGroups(int groups) {
+    KernelSet kernels;
+    double cutoff = 0.0;
+    string source;
+    for (int i = 0; i < 32; i++) {
+        if ((groups&(1<<i)) != 0) {
+            cutoff = max(cutoff, groupCutoff[i]);
+            source += groupKernelSource[i];
+        }
+    }
+    kernels.hasForces = (source.size() > 0);
+    kernels.cutoffDistance = cutoff;
+    if (kernels.hasForces)
+        kernels.forceKernel = createInteractionKernel(source, parameters, arguments, true, true, groups);
+    if (useCutoff && kernels.hasForces) {
+        double padding = (usePadding ? 0.1*cutoff : 0.0);
+        double paddedCutoff = cutoff+padding;
+        map<string, string> defines;
+        defines["TILE_SIZE"] = context.intToString(CudaContext::TileSize);
+        defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
+        defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms());
+        defines["PADDING"] = context.doubleToString(padding);
+        defines["PADDED_CUTOFF"] = context.doubleToString(paddedCutoff);
+        defines["PADDED_CUTOFF_SQUARED"] = context.doubleToString(paddedCutoff*paddedCutoff);
+        defines["NUM_TILES_WITH_EXCLUSIONS"] = context.intToString(exclusionTiles->getSize());
+        if (usePeriodic)
+            defines["USE_PERIODIC"] = "1";
+        defines["MAX_EXCLUSIONS"] = context.intToString(maxExclusions);
+        CUmodule interactingBlocksProgram = context.createModule(CudaKernelSources::vectorOps+CudaKernelSources::findInteractingBlocks, defines);
+        kernels.findBlockBoundsKernel = context.getKernel(interactingBlocksProgram, "findBlockBounds");
+        kernels.sortBoxDataKernel = context.getKernel(interactingBlocksProgram, "sortBoxData");
+        kernels.findInteractingBlocksKernel = context.getKernel(interactingBlocksProgram, "findBlocksWithInteractions");
+    }
+    groupKernels[groups] = kernels;
+}
+CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, vector<ParameterInfo>& params, vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric, int groups) {
    map<string, string> replacements;
    replacements["COMPUTE_INTERACTION"] = source;
    const string suffixes[] = {"x", "y", "z", "w"};
@@ -588,8 +651,16 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
    if (useShuffle)
        defines["ENABLE_SHUFFLE"] = "1";
    defines["THREAD_BLOCK_SIZE"] = context.intToString(forceThreadBlockSize);
-    defines["CUTOFF_SQUARED"] = context.doubleToString(cutoff*cutoff);
+    double maxCutoff = 0.0;
-    defines["CUTOFF"] = context.doubleToString(cutoff);
+    for (int i = 0; i < 32; i++) {
+        if ((groups&(1<<i)) != 0) {
+            double cutoff = groupCutoff[i];
+            maxCutoff = max(maxCutoff, cutoff);
+            defines["CUTOFF_"+context.intToString(i)+"_SQUARED"] = context.doubleToString(cutoff*cutoff);
+            defines["CUTOFF_"+context.intToString(i)] = context.doubleToString(cutoff);
+        }
+    }
+    defines["MAX_CUTOFF"] = context.doubleToString(maxCutoff);
    defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms());
    defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
    defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
@@ -605,33 +676,5 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
        defines["PARAMETER_SIZE_IS_EVEN"] = "1";
    CUmodule program = context.createModule(CudaKernelSources::vectorOps+context.replaceStrings(CudaKernelSources::nonbonded, replacements), defines);
    CUfunction kernel = context.getKernel(program, "computeNonbonded");
-    // Set arguments to the Kernel.
-    int index = 0;
-    forceArgs.push_back(&context.getForce().getDevicePointer());
-    forceArgs.push_back(&context.getEnergyBuffer().getDevicePointer());
-    forceArgs.push_back(&context.getPosq().getDevicePointer());
-    forceArgs.push_back(&exclusions->getDevicePointer());
-    forceArgs.push_back(&exclusionTiles->getDevicePointer());
-    forceArgs.push_back(&startTileIndex);
-    forceArgs.push_back(&numTiles);
-    if (useCutoff) {
-        forceArgs.push_back(&interactingTiles->getDevicePointer());
-        forceArgs.push_back(&interactionCount->getDevicePointer());
-        forceArgs.push_back(context.getPeriodicBoxSizePointer());
-        forceArgs.push_back(context.getInvPeriodicBoxSizePointer());
-        forceArgs.push_back(context.getPeriodicBoxVecXPointer());
-        forceArgs.push_back(context.getPeriodicBoxVecYPointer());
-        forceArgs.push_back(context.getPeriodicBoxVecZPointer());
-        forceArgs.push_back(&maxTiles);
-        forceArgs.push_back(&blockCenter->getDevicePointer());
-        forceArgs.push_back(&blockBoundingBox->getDevicePointer());
-        forceArgs.push_back(&interactingAtoms->getDevicePointer());
-    }
-    for (int i = 0; i < (int) params.size(); i++)
-        forceArgs.push_back(&params[i].getMemory());
-    for (int i = 0; i < (int) arguments.size(); i++)
-        forceArgs.push_back(&arguments[i].getMemory());
    return kernel;
 }
--- a/platforms/cuda/src/kernels/nonbonded.cu
+++ b/platforms/cuda/src/kernels/nonbonded.cu
@@ -330,9 +330,9 @@ extern "C" __global__ void computeNonbonded(
        if (numTiles <= maxTiles) {
            x = tiles[pos];
            real4 blockSizeX = blockSize[x];
-            singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
+            singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= MAX_CUTOFF &&
-                                  0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
+                                  0.5f*periodicBoxSize.y-blockSizeX.y >= MAX_CUTOFF &&
-                                  0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
+                                  0.5f*periodicBoxSize.z-blockSizeX.z >= MAX_CUTOFF);
        }
        else
 #endif

--- a/platforms/cuda/tests/TestCudaCustomNonbondedForce.cpp
+++ b/platforms/cuda/tests/TestCudaCustomNonbondedForce.cpp
@@ -974,6 +974,62 @@ void testInteractionGroupLongRangeCorrection() {
    ASSERT_EQUAL_TOL(expected, energy2-energy1, 1e-4);
 }
+void testMultipleCutoffs() {
+    System system;
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+    VerletIntegrator integrator(0.01);
+    // Add multiple nonbonded forces that have different cutoffs.
+    CustomNonbondedForce* nonbonded1 = new CustomNonbondedForce("2*r");
+    nonbonded1->addParticle(vector<double>());
+    nonbonded1->addParticle(vector<double>());
+    nonbonded1->setNonbondedMethod(CustomNonbondedForce::CutoffNonPeriodic);
+    nonbonded1->setCutoffDistance(2.5);
+    system.addForce(nonbonded1);
+    CustomNonbondedForce* nonbonded2 = new CustomNonbondedForce("3*r");
+    nonbonded2->addParticle(vector<double>());
+    nonbonded2->addParticle(vector<double>());
+    nonbonded2->setNonbondedMethod(CustomNonbondedForce::CutoffNonPeriodic);
+    nonbonded2->setCutoffDistance(2.9);
+    nonbonded2->setForceGroup(1);
+    system.addForce(nonbonded2);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(2);
+    positions[0] = Vec3(0, 0, 0);
+    positions[1] = Vec3(0, 0, 0);
+    for (double r = 2.4; r < 3.2; r += 0.2) {
+        positions[1][1] = r;
+        context.setPositions(positions);
+        double e1 = (r < 2.5 ? 2.0*r : 0.0);
+        double e2 = (r < 2.9 ? 3.0*r : 0.0);
+        double f1 = (r < 2.5 ? 2.0 : 0.0);
+        double f2 = (r < 2.9 ? 3.0 : 0.0);
+        // Check the first force.
+        State state = context.getState(State::Forces | State::Energy, false, 1);
+        ASSERT_EQUAL_VEC(Vec3(0, f1, 0), state.getForces()[0], TOL);
+        ASSERT_EQUAL_VEC(Vec3(0, -f1, 0), state.getForces()[1], TOL);
+        ASSERT_EQUAL_TOL(e1, state.getPotentialEnergy(), TOL);
+        // Check the second force.
+        state = context.getState(State::Forces | State::Energy, false, 2);
+        ASSERT_EQUAL_VEC(Vec3(0, f2, 0), state.getForces()[0], TOL);
+        ASSERT_EQUAL_VEC(Vec3(0, -f2, 0), state.getForces()[1], TOL);
+        ASSERT_EQUAL_TOL(e2, state.getPotentialEnergy(), TOL);
+        // Check the sum of both forces.
+        state = context.getState(State::Forces | State::Energy);
+        ASSERT_EQUAL_VEC(Vec3(0, f1+f2, 0), state.getForces()[0], TOL);
+        ASSERT_EQUAL_VEC(Vec3(0, -f1-f2, 0), state.getForces()[1], TOL);
+        ASSERT_EQUAL_TOL(e1+e2, state.getPotentialEnergy(), TOL);
+    }
+}
 int main(int argc, char* argv[]) {
    try {
        if (argc > 1)
@@ -998,6 +1054,7 @@ int main(int argc, char* argv[]) {
        testInteractionGroups();
        testLargeInteractionGroup();
        testInteractionGroupLongRangeCorrection();
+        testMultipleCutoffs();
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;

--- a/platforms/opencl/include/OpenCLKernels.h
+++ b/platforms/opencl/include/OpenCLKernels.h
@@ -732,7 +732,7 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const GBSAOBCForce& force);
 private:
-    double prefactor, surfaceAreaFactor;
+    double prefactor, surfaceAreaFactor, cutoff;
    bool hasCreatedKernels;
    int maxTiles;
    OpenCLContext& cl;
@@ -783,6 +783,7 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CustomGBForce& force);
 private:
+    double cutoff;
    bool hasInitializedKernels, needParameterGradient;
    int maxTiles, numComputedValues;
    OpenCLContext& cl;

--- a/platforms/opencl/include/OpenCLNonbondedUtilities.h
+++ b/platforms/opencl/include/OpenCLNonbondedUtilities.h
@@ -135,31 +135,23 @@ public:
        return forceThreadBlockSize;
    }
    /**
-     * Get the cutoff distance.
+     * Get the maximum cutoff distance used by any force group.
     */
-    double getCutoffDistance() {
+    double getMaxCutoffDistance();
-        return cutoff;
-    }
    /**
     * Get whether any interactions have been added.
     */
    bool getHasInteractions() {
-        return cutoff != -1.0;
+        return (groupCutoff.size() > 0);
-    }
-    /**
-     * Get the force group in which nonbonded interactions should be computed.
-     */
-    int getForceGroup() {
-        return nonbondedForceGroup;
    }
    /**
     * Prepare to compute interactions.  This updates the neighbor list.
     */
-    void prepareInteractions();
+    void prepareInteractions(int forceGroups);
    /**
     * Compute the nonbonded interactions.
     */
-    void computeInteractions();
+    void computeInteractions(int forceGroups);
    /**
     * Check to see if the neighbor list arrays are large enough, and make them bigger if necessary.
     */
@@ -252,16 +244,20 @@ public:
     * @param arguments     arrays (other than per-atom parameters) that should be passed as arguments to the kernel
     * @param useExclusions specifies whether exclusions are applied to this interaction
     * @param isSymmetric   specifies whether the interaction is symmetric
+     * @param groups        the set of force groups this kernel is for
+     */
+    cl::Kernel createInteractionKernel(const std::string& source, const std::vector<ParameterInfo>& params, const std::vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric, int groups);
+    /**
+     * Create the set of kernels that will be needed for a particular combination of force groups.
+     * 
+     * @param groups    the set of force groups
     */
-    cl::Kernel createInteractionKernel(const std::string& source, const std::vector<ParameterInfo>& params, const std::vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric) const;
+    void createKernelsForGroups(int groups);
 private:
+    class KernelSet;
    class BlockSortTrait;
    OpenCLContext& context;
-    cl::Kernel forceKernel;
+    std::map<int, KernelSet> groupKernels;
-    cl::Kernel findBlockBoundsKernel;
-    cl::Kernel sortBoxDataKernel;
-    cl::Kernel findInteractingBlocksKernel;
-    cl::Kernel findInteractionsWithinBlocksKernel;
    OpenCLArray* exclusionTiles;
    OpenCLArray* exclusions;
    OpenCLArray* exclusionIndices;
@@ -280,12 +276,27 @@ private:
    std::vector<std::vector<int> > atomExclusions;
    std::vector<ParameterInfo> parameters;
    std::vector<ParameterInfo> arguments;
-    std::string kernelSource;
+    std::map<int, double> groupCutoff;
-    std::map<std::string, std::string> kernelDefines;
+    std::map<int, std::string> groupKernelSource;
-    double cutoff;
+    double lastCutoff;
-    bool useCutoff, usePeriodic, deviceIsCpu, anyExclusions, usePadding;
+    bool useCutoff, usePeriodic, deviceIsCpu, anyExclusions, usePadding, forceRebuildNeighborList;
-    int numForceBuffers, startTileIndex, numTiles, startBlockIndex, numBlocks, numForceThreadBlocks;
+    int numForceBuffers, startTileIndex, numTiles, startBlockIndex, numBlocks, maxExclusions, numForceThreadBlocks;
-    int forceThreadBlockSize, interactingBlocksThreadBlockSize, nonbondedForceGroup;
+    int forceThreadBlockSize, interactingBlocksThreadBlockSize, groupFlags;
+};
+/**
+ * This class stores the kernels to execute for a set of force groups.
+ */
+class OpenCLNonbondedUtilities::KernelSet {
+public:
+    bool hasForces;
+    double cutoffDistance;
+    cl::Kernel forceKernel;
+    cl::Kernel findBlockBoundsKernel;
+    cl::Kernel sortBoxDataKernel;
+    cl::Kernel findInteractingBlocksKernel;
+    cl::Kernel findInteractionsWithinBlocksKernel;
 };
 /**