Merge pull request #2014 from peastman/groups

CustomNonbondedForce with interaction groups uses neighbor lists

Merge pull request #2014 from peastman/groups
CustomNonbondedForce with interaction groups uses neighbor lists
c89cbcdb · peastman · GitHub · c6df2891 · 2df35b4e · c89cbcdb
Unverified commit c89cbcdb, authored Mar 14, 2018 by peastman; committed by GitHub on Mar 14, 2018
12 changed files
--- a/platforms/cpu/src/CpuCustomNonbondedForce.cpp
+++ b/platforms/cpu/src/CpuCustomNonbondedForce.cpp
@@ -252,7 +252,9 @@ void CpuCustomNonbondedForce::calculateOneIxn(int ii, int jj, ThreadData& data,
    // accumulate forces

    double dEdR = (includeForce ? data.forceExpression.evaluate()/r : 0.0);
-    double energy = (includeEnergy ? data.energyExpression.evaluate() : 0.0);
+    double energy = 0.0;
+    if (includeEnergy || (useSwitch && r > switchingDistance))
+        energy = data.energyExpression.evaluate();
    double switchValue = 1.0;
    if (useSwitch) {
        if (r > switchingDistance) {

--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -763,15 +763,15 @@ private:
    ForceInfo* info;
    CudaParameterSet* params;
    CudaArray globals;
-    CudaArray interactionGroupData;
-    CUfunction interactionGroupKernel;
-    std::vector<void*> interactionGroupArgs;
+    CudaArray interactionGroupData, filteredGroupData, numGroupTiles;
+    CUfunction interactionGroupKernel, prepareNeighborListKernel, buildNeighborListKernel;
+    std::vector<void*> interactionGroupArgs, prepareNeighborListArgs, buildNeighborListArgs;
    std::vector<std::string> globalParamNames;
    std::vector<float> globalParamValues;
    std::vector<CudaArray> tabulatedFunctions;
    double longRangeCoefficient;
    std::vector<double> longRangeCoefficientDerivs;
-    bool hasInitializedLongRangeCorrection, hasInitializedKernel, hasParamDerivs;
+    bool hasInitializedLongRangeCorrection, hasInitializedKernel, hasParamDerivs, useNeighborList;
    int numGroupThreadBlocks;
    CustomNonbondedForce* forceCopy;
    const System& system;

--- a/platforms/cuda/include/CudaNonbondedUtilities.h
+++ b/platforms/cuda/include/CudaNonbondedUtilities.h
@@ -142,6 +142,11 @@ public:
     * Get the maximum cutoff distance used by any force group.
     */
    double getMaxCutoffDistance();
+    /**
+     * Given a nonbonded cutoff, get the padded cutoff distance used in computing
+     * the neighbor list.
+     *
+     * @param cutoff   the unpadded cutoff distance
+     * @return the cutoff plus 10% padding when padding is in use, otherwise the
+     *         cutoff unchanged
+     */
+    double padCutoff(double cutoff);
    /**
     * Prepare to compute interactions.  This updates the neighbor list.
     */
@@ -220,6 +225,13 @@ public:
    CudaArray& getExclusionRowIndices() {
        return exclusionRowIndices;
    }
+    /**
+     * Get the array containing a flag for whether the neighbor list was rebuilt
+     * on the most recent call to prepareInteractions().
+     *
+     * The array is returned by reference (it is not copied), so a caller can
+     * pass its device pointer to its own kernels.
+     */
+    CudaArray& getRebuildNeighborList() {
+        return rebuildNeighborList;
+    }
    /**
     * Get the index of the first tile this context is responsible for processing.
     */

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -54,6 +54,7 @@
 #include "jama_eig.h"
 #include <algorithm>
 #include <cmath>
+#include <iterator>
 #include <set>

 using namespace OpenMM;
@@ -2429,7 +2430,8 @@ void CudaCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNonbo
    
    vector<vector<int> > atomLists;
    vector<pair<int, int> > tiles;
-    map<pair<int, int>, int> duplicateInteractions;
+    vector<int> tileGroup;
+    vector<vector<int> > duplicateAtomsForGroup;
    for (int group = 0; group < force.getNumInteractionGroups(); group++) {
        // Get the list of atoms in this group and sort them.
        
@@ -2440,6 +2442,10 @@ void CudaCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNonbo
        atoms2.insert(atoms2.begin(), set2.begin(), set2.end());
        sort(atoms1.begin(), atoms1.end());
        sort(atoms2.begin(), atoms2.end());
+        duplicateAtomsForGroup.push_back(vector<int>());
+        set_intersection(set1.begin(), set1.end(), set2.begin(), set2.end(),
+                inserter(duplicateAtomsForGroup[group], duplicateAtomsForGroup[group].begin()));
+        sort(duplicateAtomsForGroup[group].begin(), duplicateAtomsForGroup[group].end());
        
        // Find how many tiles we will create for this group.
        
@@ -2451,9 +2457,12 @@ void CudaCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNonbo
        
        // Add the tiles.
        
+        int firstTile = tiles.size();
        for (int i = 0; i < numBlocks1; i++)
-            for (int j = 0; j < numBlocks2; j++)
+            for (int j = 0; j < numBlocks2; j++) {
                tiles.push_back(make_pair(atomLists.size()+i, atomLists.size()+numBlocks1+j));
+                tileGroup.push_back(group);
+            }
        
        // Add the atom lists.
        
@@ -2473,22 +2482,6 @@ void CudaCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNonbo
                atoms.push_back(atoms2[j]);
            atomLists.push_back(atoms);
        }
-        
-        // If this group contains duplicate interactions, record that we need to skip them once.
-        
-        for (int a1 : atoms1) {
-            if (set2.find(a1) == set2.end())
-                continue;
-            for (int j = 0; j < (int) atoms2.size() && atoms2[j] < a1; j++) {
-                int a2 = atoms2[j];
-                if (set1.find(a2) != set1.end()) {
-                    pair<int, int> key = make_pair(a2, a1);
-                    if (duplicateInteractions.find(key) == duplicateInteractions.end())
-                        duplicateInteractions[key] = 0;
-                    duplicateInteractions[key]++;
-                }
-            }
-        }
    }
    
    // Build a lookup table for quickly identifying excluded interactions.
@@ -2506,15 +2499,18 @@ void CudaCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNonbo
    vector<vector<int> > exclusionFlags(tiles.size());
    vector<pair<int, int> > tileOrder;
    for (int tile = 0; tile < tiles.size(); tile++) {
+        bool swapped = false;
        if (atomLists[tiles[tile].first].size() < atomLists[tiles[tile].second].size()) {
            // For efficiency, we want the first axis to be the larger one.
            
            int swap = tiles[tile].first;
            tiles[tile].first = tiles[tile].second;
            tiles[tile].second = swap;
+            swapped = true;
        }
        vector<int>& atoms1 = atomLists[tiles[tile].first];
        vector<int>& atoms2 = atomLists[tiles[tile].second];
+        vector<int>& duplicateAtoms = duplicateAtomsForGroup[tileGroup[tile]];
        vector<int> flags(atoms1.size(), (int) (1LL<<atoms2.size())-1);
        int numExcluded = 0;
        for (int i = 0; i < (int) atoms1.size(); i++)
@@ -2525,11 +2521,10 @@ void CudaCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNonbo
                pair<int, int> key = make_pair(min(a1, a2), max(a1, a2));
                if (a1 == a2 || exclusions.find(key) != exclusions.end())
                    isExcluded = true; // This is an excluded interaction.
-                else if (duplicateInteractions.find(key) != duplicateInteractions.end() && duplicateInteractions[key] > 0) {
+                else if ((a1 > a2) == swapped && binary_search(duplicateAtoms.begin(), duplicateAtoms.end(), a1) && binary_search(duplicateAtoms.begin(), duplicateAtoms.end(), a2)) {
                    // Both atoms are in both sets, so skip duplicate interactions.
                    
                    isExcluded = true;
-                    duplicateInteractions[key]--;
                }
                if (isExcluded) {
                    flags[i] &= -1-(1<<j);
@@ -2584,6 +2579,16 @@ void CudaCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNonbo
    }
    interactionGroupData.initialize<int4>(cu, groupData.size(), "interactionGroupData");
    interactionGroupData.upload(groupData);
+    numGroupTiles.initialize<int>(cu, 1, "numGroupTiles");
+
+    // Allocate space for a neighbor list, if necessary.
+
+    if (force.getNonbondedMethod() != CustomNonbondedForce::NoCutoff && groupData.size() > cu.getNumThreadBlocks()) {
+        filteredGroupData.initialize<int4>(cu, groupData.size(), "filteredGroupData");
+        interactionGroupData.copyTo(filteredGroupData);
+        int numTiles = groupData.size()/32;
+        numGroupTiles.upload(&numTiles);
+    }
    
    // Create the kernel.
    
@@ -2662,11 +2667,16 @@ void CudaCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNonbo
        defines["USE_CUTOFF"] = "1";
    if (force.getNonbondedMethod() == CustomNonbondedForce::CutoffPeriodic)
        defines["USE_PERIODIC"] = "1";
-    defines["LOCAL_MEMORY_SIZE"] = cu.intToString(max(32, cu.getNonbondedUtilities().getForceThreadBlockSize()));
+    int localMemorySize = max(32, cu.getNonbondedUtilities().getForceThreadBlockSize());
+    defines["LOCAL_MEMORY_SIZE"] = cu.intToString(localMemorySize);
+    defines["WARPS_IN_BLOCK"] = cu.intToString(localMemorySize/32);
    double cutoff = force.getCutoffDistance();
    defines["CUTOFF_SQUARED"] = cu.doubleToString(cutoff*cutoff);
+    double paddedCutoff = cu.getNonbondedUtilities().padCutoff(cutoff);
+    defines["PADDED_CUTOFF_SQUARED"] = cu.doubleToString(paddedCutoff*paddedCutoff);
    defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
    defines["TILE_SIZE"] = "32";
+    defines["NUM_TILES"] = cu.intToString(numTileSets);
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*numTileSets/numContexts;
    int endIndex = (cu.getContextIndex()+1)*numTileSets/numContexts;
@@ -2674,12 +2684,19 @@ void CudaCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNonbo
    defines["LAST_TILE"] = cu.intToString(endIndex);
    if ((localDataSize/4)%2 == 0 && !cu.getUseDoublePrecision())
        defines["PARAMETER_SIZE_IS_EVEN"] = "1";
-    CUmodule program = cu.createModule(CudaKernelSources::vectorOps+cu.replaceStrings(CudaKernelSources::customNonbondedGroups, replacements), defines);
-    interactionGroupKernel = cu.getKernel(program, "computeInteractionGroups");
+    CUmodule module = cu.createModule(CudaKernelSources::vectorOps+cu.replaceStrings(CudaKernelSources::customNonbondedGroups, replacements), defines);
+    interactionGroupKernel = cu.getKernel(module, "computeInteractionGroups");
+    prepareNeighborListKernel = cu.getKernel(module, "prepareToBuildNeighborList");
+    buildNeighborListKernel = cu.getKernel(module, "buildNeighborList");
    numGroupThreadBlocks = cu.getNonbondedUtilities().getNumForceThreadBlocks();
 }

 double CudaCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    useNeighborList = (filteredGroupData.isInitialized() && cu.getNonbondedUtilities().getUseCutoff());
+    if (useNeighborList && cu.getContextIndex() > 0) {
+        // When using a neighbor list, run the whole calculation on a single device.
+        return 0.0;
+    }
    if (globals.isInitialized()) {
        bool changed = false;
        for (int i = 0; i < (int) globalParamNames.size(); i++) {
@@ -2706,7 +2723,9 @@ double CudaCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool in
            interactionGroupArgs.push_back(&cu.getForce().getDevicePointer());
            interactionGroupArgs.push_back(&cu.getEnergyBuffer().getDevicePointer());
            interactionGroupArgs.push_back(&cu.getPosq().getDevicePointer());
-            interactionGroupArgs.push_back(&interactionGroupData.getDevicePointer());
+            interactionGroupArgs.push_back(&(useNeighborList ? filteredGroupData : interactionGroupData).getDevicePointer());
+            interactionGroupArgs.push_back(&numGroupTiles.getDevicePointer());
+            interactionGroupArgs.push_back(&useNeighborList);
            interactionGroupArgs.push_back(cu.getPeriodicBoxSizePointer());
            interactionGroupArgs.push_back(cu.getInvPeriodicBoxSizePointer());
            interactionGroupArgs.push_back(cu.getPeriodicBoxVecXPointer());
@@ -2720,8 +2739,30 @@ double CudaCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool in
                interactionGroupArgs.push_back(&globals.getDevicePointer());
            if (hasParamDerivs)
                interactionGroupArgs.push_back(&cu.getEnergyParamDerivBuffer().getDevicePointer());
+            if (useNeighborList) {
+                // Initialize kernels for building the interaction group neighbor list.
+
+                prepareNeighborListArgs.push_back(&cu.getNonbondedUtilities().getRebuildNeighborList().getDevicePointer());
+                prepareNeighborListArgs.push_back(&numGroupTiles.getDevicePointer());
+                buildNeighborListArgs.push_back(&cu.getNonbondedUtilities().getRebuildNeighborList().getDevicePointer());
+                buildNeighborListArgs.push_back(&numGroupTiles.getDevicePointer());
+                buildNeighborListArgs.push_back(&cu.getPosq().getDevicePointer());
+                buildNeighborListArgs.push_back(&interactionGroupData.getDevicePointer());
+                buildNeighborListArgs.push_back(&filteredGroupData.getDevicePointer());
+                buildNeighborListArgs.push_back(cu.getPeriodicBoxSizePointer());
+                buildNeighborListArgs.push_back(cu.getInvPeriodicBoxSizePointer());
+                buildNeighborListArgs.push_back(cu.getPeriodicBoxVecXPointer());
+                buildNeighborListArgs.push_back(cu.getPeriodicBoxVecYPointer());
+                buildNeighborListArgs.push_back(cu.getPeriodicBoxVecZPointer());
+            }
        }
        int forceThreadBlockSize = cu.getNonbondedUtilities().getForceThreadBlockSize();
+        if (useNeighborList) {
+            // Rebuild the neighbor list, if necessary.
+
+            cu.executeKernel(prepareNeighborListKernel, &prepareNeighborListArgs[0], 1, 1);
+            cu.executeKernel(buildNeighborListKernel, &buildNeighborListArgs[0], numGroupThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
+        }
        cu.executeKernel(interactionGroupKernel, &interactionGroupArgs[0], numGroupThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
    }
    double4 boxSize = cu.getPeriodicBoxSize();

--- a/platforms/cuda/src/CudaNonbondedUtilities.cpp
+++ b/platforms/cuda/src/CudaNonbondedUtilities.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2018 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -266,6 +266,7 @@ void CudaNonbondedUtilities::initialize(const System& system) {
        blockSorter = new CudaSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks);
        vector<unsigned int> count(2, 0);
        interactionCount.upload(count);
+        rebuildNeighborList.upload(count);
    }

    // Record arguments for kernels.
@@ -351,6 +352,11 @@ double CudaNonbondedUtilities::getMaxCutoffDistance() {
    return cutoff;
 }

+double CudaNonbondedUtilities::padCutoff(double cutoff) { // Returns the given cutoff enlarged by the neighbor list padding.
+    double padding = (usePadding ? 0.1*cutoff : 0.0); // 10% of the cutoff when usePadding is set, otherwise no padding
+    return cutoff+padding;
+}
+
 void CudaNonbondedUtilities::prepareInteractions(int forceGroups) {
    if ((forceGroups&groupFlags) == 0)
        return;
@@ -462,13 +468,12 @@ void CudaNonbondedUtilities::createKernelsForGroups(int groups) {
    kernels.source = source;
    kernels.forceKernel = kernels.energyKernel = kernels.forceEnergyKernel = NULL;
    if (useCutoff) {
-        double padding = (usePadding ? 0.1*cutoff : 0.0);
-        double paddedCutoff = cutoff+padding;
+        double paddedCutoff = padCutoff(cutoff);
        map<string, string> defines;
        defines["TILE_SIZE"] = context.intToString(CudaContext::TileSize);
        defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
        defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms());
-        defines["PADDING"] = context.doubleToString(padding);
+        defines["PADDING"] = context.doubleToString(paddedCutoff-cutoff);
        defines["PADDED_CUTOFF"] = context.doubleToString(paddedCutoff);
        defines["PADDED_CUTOFF_SQUARED"] = context.doubleToString(paddedCutoff*paddedCutoff);
        defines["NUM_TILES_WITH_EXCLUSIONS"] = context.intToString(exclusionTiles.getSize());

--- a/platforms/cuda/src/kernels/customNonbondedGroups.cu
+++ b/platforms/cuda/src/kernels/customNonbondedGroups.cu
@@ -10,6 +10,7 @@ typedef struct {

 extern "C" __global__ void computeInteractionGroups(
        unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const int4* __restrict__ groupData,
+        int* __restrict__ numGroupTiles, bool useNeighborList,
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
        PARAMETER_ARGUMENTS) {
    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
@@ -20,8 +21,8 @@ extern "C" __global__ void computeInteractionGroups(
    INIT_DERIVATIVES
    __shared__ AtomData localData[LOCAL_MEMORY_SIZE];

-    const unsigned int startTile = FIRST_TILE+warp*(LAST_TILE-FIRST_TILE)/totalWarps;
-    const unsigned int endTile = FIRST_TILE+(warp+1)*(LAST_TILE-FIRST_TILE)/totalWarps;
+    const unsigned int startTile = (useNeighborList ? warp*numGroupTiles[0]/totalWarps : FIRST_TILE+warp*(LAST_TILE-FIRST_TILE)/totalWarps);
+    const unsigned int endTile = (useNeighborList ? (warp+1)*numGroupTiles[0]/totalWarps : FIRST_TILE+(warp+1)*(LAST_TILE-FIRST_TILE)/totalWarps);
    for (int tile = startTile; tile < endTile; tile++) {
        const int4 atomData = groupData[TILE_SIZE*tile+tgx];
        const int atom1 = atomData.x;
@@ -86,3 +87,75 @@ extern "C" __global__ void computeInteractionGroups(
    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
    SAVE_DERIVATIVES
 }
+
+/**
+ * If the neighbor list needs to be rebuilt, reset the number of tiles to 0.  This is
+ * executed by a single thread.
+ *
+ * rebuildNeighborList: element 0 is the rebuild flag; only the value 1 triggers the reset.
+ * numGroupTiles: element 0 is the running count of tiles in the filtered list.
+ *
+ * NOTE(review): buildNeighborList() proceeds on any nonzero flag value while this
+ * kernel resets the count only when the flag is exactly 1 -- confirm the flag is
+ * only ever 0 or 1.
+ */
+extern "C" __global__  void prepareToBuildNeighborList(int* __restrict__ rebuildNeighborList, int* __restrict__ numGroupTiles) {
+    if (rebuildNeighborList[0] == 1)
+        numGroupTiles[0] = 0;
+}
+
+/**
+ * Filter the list of tiles to include only ones that have interactions within the
+ * padded cutoff.
+ *
+ * Each group of TILE_SIZE consecutive threads (a "warp" below) handles a contiguous
+ * slice of the NUM_TILES input tiles.  For every tile, each thread reads one int4
+ * entry of groupData: x and y are atom indices into posq, z packs rangeStart (low
+ * 16 bits) and rangeEnd (next 16 bits), and w is a bit mask in which a 0 bit marks
+ * an excluded partner.  As soon as any thread finds a non-excluded pair closer than
+ * PADDED_CUTOFF_SQUARED, the whole tile is appended to filteredGroupData at a slot
+ * obtained by atomically incrementing numGroupTiles[0].
+ *
+ * rebuildNeighborList: element 0 is the rebuild flag; 0 means there is nothing to do.
+ * numGroupTiles: element 0 counts the tiles written to filteredGroupData so far.
+ * posq: atom data whose x, y, z components are used as positions.
+ * groupData: the unfiltered tile list, TILE_SIZE entries per tile.
+ * filteredGroupData: output tile list, TILE_SIZE entries per tile.
+ * periodicBox*: not referenced directly in this body; presumably consumed by
+ *     APPLY_PERIODIC_TO_DELTA when USE_PERIODIC is defined -- TODO confirm.
+ *
+ * Because output slots are assigned by atomicAdd, the order of tiles in
+ * filteredGroupData follows the order in which warps reach that call.
+ */
+extern "C" __global__  void buildNeighborList(int* __restrict__ rebuildNeighborList, int* __restrict__ numGroupTiles,
+        const real4* __restrict__ posq, const int4* __restrict__ groupData, int4* __restrict__ filteredGroupData,
+        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
+    
+    // If the neighbor list doesn't need to be rebuilt on this step, return immediately.
+    
+    if (rebuildNeighborList[0] == 0)
+        return;
+
+    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
+    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; // global warpIndex
+    const unsigned int local_warp = threadIdx.x/TILE_SIZE; // local warpIndex
+    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // index within the warp
+    const unsigned int tbx = threadIdx.x - tgx;           // block warpIndex
+
+    __shared__ real4 localPos[LOCAL_MEMORY_SIZE]; // one slot per thread, holding the position of that thread's atom2
+    __shared__ volatile bool anyInteraction[WARPS_IN_BLOCK]; // per-warp flag: has this tile produced a pair inside the padded cutoff?
+    __shared__ volatile int tileIndex[WARPS_IN_BLOCK]; // per-warp output slot, set by the warp's first thread
+
+    const unsigned int startTile = warp*NUM_TILES/totalWarps; // this warp handles tiles [startTile, endTile)
+    const unsigned int endTile = (warp+1)*NUM_TILES/totalWarps;
+    for (int tile = startTile; tile < endTile; tile++) {
+        const int4 atomData = groupData[TILE_SIZE*tile+tgx];
+        const int atom1 = atomData.x;
+        const int atom2 = atomData.y;
+        const int rangeStart = atomData.z&0xFFFF; // low 16 bits of z
+        const int rangeEnd = (atomData.z>>16)&0xFFFF; // next 16 bits of z
+        const int exclusions = atomData.w;
+        real4 posq1 = posq[atom1];
+        localPos[threadIdx.x] = posq[atom2];
+        if (tgx == 0)
+            anyInteraction[local_warp] = false; // reset the flag for this tile
+        int tj = tgx; // each thread starts at its own index within the warp
+        SYNC_WARPS;
+        for (int j = rangeStart; j < rangeEnd && !anyInteraction[local_warp]; j++) { // stop early once any thread has found a pair
+            if (tj < rangeEnd) { // threads whose tj is at or beyond rangeEnd skip the distance test
+                bool isExcluded = (((exclusions>>tj)&1) == 0); // a cleared bit marks an excluded partner
+                int localIndex = tbx+tj;
+                real3 delta = make_real3(localPos[localIndex].x-posq1.x, localPos[localIndex].y-posq1.y, localPos[localIndex].z-posq1.z);
+#ifdef USE_PERIODIC
+                APPLY_PERIODIC_TO_DELTA(delta)
+#endif
+                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                if (!isExcluded && r2 < PADDED_CUTOFF_SQUARED)
+                    anyInteraction[local_warp] = true;
+            }
+            tj = (tj == rangeEnd-1 ? rangeStart : tj+1); // advance, wrapping from rangeEnd-1 back to rangeStart
+            SYNC_WARPS;
+        }
+        if (anyInteraction[local_warp]) {
+            SYNC_WARPS;
+            if (tgx == 0)
+                tileIndex[local_warp] = atomicAdd(numGroupTiles, 1); // reserve the next output slot for this tile
+            SYNC_WARPS;
+            filteredGroupData[TILE_SIZE*tileIndex[local_warp]+tgx] = atomData; // every thread copies its own entry of the tile
+        }
+    }
+}
--- a/platforms/opencl/include/OpenCLKernels.h
+++ b/platforms/opencl/include/OpenCLKernels.h
@@ -742,15 +742,15 @@ private:
    ForceInfo* info;
    OpenCLParameterSet* params;
    OpenCLArray globals;
-    OpenCLArray interactionGroupData;
-    cl::Kernel interactionGroupKernel;
+    OpenCLArray interactionGroupData, filteredGroupData, numGroupTiles;
+    cl::Kernel interactionGroupKernel, prepareNeighborListKernel, buildNeighborListKernel;
    std::vector<void*> interactionGroupArgs;
    std::vector<std::string> globalParamNames;
    std::vector<cl_float> globalParamValues;
    std::vector<OpenCLArray> tabulatedFunctions;
    double longRangeCoefficient;
    std::vector<double> longRangeCoefficientDerivs;
-    bool hasInitializedLongRangeCorrection, hasInitializedKernel, hasParamDerivs;
+    bool hasInitializedLongRangeCorrection, hasInitializedKernel, hasParamDerivs, useNeighborList;
    int numGroupThreadBlocks;
    CustomNonbondedForce* forceCopy;
    const System& system;

--- a/platforms/opencl/include/OpenCLNonbondedUtilities.h
+++ b/platforms/opencl/include/OpenCLNonbondedUtilities.h
@@ -153,6 +153,11 @@ public:
    bool getHasInteractions() {
        return (groupCutoff.size() > 0);
    }
+    /**
+     * Given a nonbonded cutoff, get the padded cutoff distance used in computing
+     * the neighbor list.
+     */
+    double padCutoff(double cutoff);
    /**
     * Prepare to compute interactions.  This updates the neighbor list.
     */
@@ -225,6 +230,13 @@ public:
    OpenCLArray& getExclusionRowIndices() {
        return exclusionRowIndices;
    }
+    /**
+     * Get the array containing a flag for whether the neighbor list was rebuilt
+     * on the most recent call to prepareInteractions().
+     *
+     * The array is returned by reference (it is not copied), so a caller can
+     * pass its device buffer to its own kernels.
+     */
+    OpenCLArray& getRebuildNeighborList() {
+        return rebuildNeighborList;
+    }
    /**
     * Get the index of the first tile this context is responsible for processing.
     */

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -54,6 +54,7 @@
 #include "jama_eig.h"
 #include <algorithm>
 #include <cmath>
+#include <iterator>
 #include <set>

 using namespace OpenMM;
@@ -2550,7 +2551,8 @@ void OpenCLCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNon
    
    vector<vector<int> > atomLists;
    vector<pair<int, int> > tiles;
-    map<pair<int, int>, int> duplicateInteractions;
+    vector<int> tileGroup;
+    vector<vector<int> > duplicateAtomsForGroup;
    for (int group = 0; group < force.getNumInteractionGroups(); group++) {
        // Get the list of atoms in this group and sort them.
        
@@ -2561,6 +2563,10 @@ void OpenCLCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNon
        atoms2.insert(atoms2.begin(), set2.begin(), set2.end());
        sort(atoms1.begin(), atoms1.end());
        sort(atoms2.begin(), atoms2.end());
+        duplicateAtomsForGroup.push_back(vector<int>());
+        set_intersection(set1.begin(), set1.end(), set2.begin(), set2.end(),
+                inserter(duplicateAtomsForGroup[group], duplicateAtomsForGroup[group].begin()));
+        sort(duplicateAtomsForGroup[group].begin(), duplicateAtomsForGroup[group].end());
        
        // Find how many tiles we will create for this group.
        
@@ -2572,9 +2578,12 @@ void OpenCLCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNon
        
        // Add the tiles.
        
+        int firstTile = tiles.size();
        for (int i = 0; i < numBlocks1; i++)
-            for (int j = 0; j < numBlocks2; j++)
+            for (int j = 0; j < numBlocks2; j++) {
                tiles.push_back(make_pair(atomLists.size()+i, atomLists.size()+numBlocks1+j));
+                tileGroup.push_back(group);
+            }
        
        // Add the atom lists.
        
@@ -2594,22 +2603,6 @@ void OpenCLCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNon
                atoms.push_back(atoms2[j]);
            atomLists.push_back(atoms);
        }
-        
-        // If this group contains duplicate interactions, record that we need to skip them once.
-        
-        for (int a1 : atoms1) {
-            if (set2.find(a1) == set2.end())
-                continue;
-            for (int j = 0; j < (int) atoms2.size() && atoms2[j] < a1; j++) {
-                int a2 = atoms2[j];
-                if (set1.find(a2) != set1.end()) {
-                    pair<int, int> key = make_pair(a2, a1);
-                    if (duplicateInteractions.find(key) == duplicateInteractions.end())
-                        duplicateInteractions[key] = 0;
-                    duplicateInteractions[key]++;
-                }
-            }
-        }
    }
    
    // Build a lookup table for quickly identifying excluded interactions.
@@ -2627,15 +2620,18 @@ void OpenCLCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNon
    vector<vector<int> > exclusionFlags(tiles.size());
    vector<pair<int, int> > tileOrder;
    for (int tile = 0; tile < tiles.size(); tile++) {
+        bool swapped = false;
        if (atomLists[tiles[tile].first].size() < atomLists[tiles[tile].second].size()) {
            // For efficiency, we want the first axis to be the larger one.
            
            int swap = tiles[tile].first;
            tiles[tile].first = tiles[tile].second;
            tiles[tile].second = swap;
+            swapped = true;
        }
        vector<int>& atoms1 = atomLists[tiles[tile].first];
        vector<int>& atoms2 = atomLists[tiles[tile].second];
+        vector<int>& duplicateAtoms = duplicateAtomsForGroup[tileGroup[tile]];
        vector<int> flags(atoms1.size(), (int) (1LL<<atoms2.size())-1);
        int numExcluded = 0;
        for (int i = 0; i < (int) atoms1.size(); i++)
@@ -2646,11 +2642,10 @@ void OpenCLCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNon
                pair<int, int> key = make_pair(min(a1, a2), max(a1, a2));
                if (a1 == a2 || exclusions.find(key) != exclusions.end())
                    isExcluded = true; // This is an excluded interaction.
-                else if (duplicateInteractions.find(key) != duplicateInteractions.end() && duplicateInteractions[key] > 0) {
+                else if ((a1 > a2) == swapped && binary_search(duplicateAtoms.begin(), duplicateAtoms.end(), a1) && binary_search(duplicateAtoms.begin(), duplicateAtoms.end(), a2)) {
                    // Both atoms are in both sets, so skip duplicate interactions.
                    
                    isExcluded = true;
-                    duplicateInteractions[key]--;
                }
                if (isExcluded) {
                    flags[i] &= -1-(1<<j);
@@ -2713,6 +2708,16 @@ void OpenCLCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNon
    }
    interactionGroupData.initialize<mm_int4>(cl, groupData.size(), "interactionGroupData");
    interactionGroupData.upload(groupData);
+    numGroupTiles.initialize<cl_int>(cl, 1, "numGroupTiles");
+
+    // Allocate space for a neighbor list, if necessary.
+
+    if (force.getNonbondedMethod() != CustomNonbondedForce::NoCutoff && groupData.size() > cl.getNumThreadBlocks()) {
+        filteredGroupData.initialize<mm_int4>(cl, groupData.size(), "filteredGroupData");
+        interactionGroupData.copyTo(filteredGroupData);
+        int numTiles = groupData.size()/32;
+        numGroupTiles.upload(&numTiles);
+    }
    
    // Create the kernel.
    
@@ -2791,11 +2796,16 @@ void OpenCLCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNon
        defines["USE_CUTOFF"] = "1";
    if (force.getNonbondedMethod() == CustomNonbondedForce::CutoffPeriodic)
        defines["USE_PERIODIC"] = "1";
-    defines["LOCAL_MEMORY_SIZE"] = cl.intToString(max(32, cl.getNonbondedUtilities().getForceThreadBlockSize()));
+    int localMemorySize = max(32, cl.getNonbondedUtilities().getForceThreadBlockSize());
+    defines["LOCAL_MEMORY_SIZE"] = cl.intToString(localMemorySize);
+    defines["WARPS_IN_BLOCK"] = cl.intToString(localMemorySize/32);
    double cutoff = force.getCutoffDistance();
    defines["CUTOFF_SQUARED"] = cl.doubleToString(cutoff*cutoff);
+    double paddedCutoff = cl.getNonbondedUtilities().padCutoff(cutoff);
+    defines["PADDED_CUTOFF_SQUARED"] = cl.doubleToString(paddedCutoff*paddedCutoff);
    defines["PADDED_NUM_ATOMS"] = cl.intToString(cl.getPaddedNumAtoms());
    defines["TILE_SIZE"] = "32";
+    defines["NUM_TILES"] = cl.intToString(numTileSets);
    int numContexts = cl.getPlatformData().contexts.size();
    int startIndex = cl.getContextIndex()*numTileSets/numContexts;
    int endIndex = (cl.getContextIndex()+1)*numTileSets/numContexts;
@@ -2805,10 +2815,17 @@ void OpenCLCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNon
        defines["PARAMETER_SIZE_IS_EVEN"] = "1";
    cl::Program program = cl.createProgram(cl.replaceStrings(OpenCLKernelSources::customNonbondedGroups, replacements), defines);
    interactionGroupKernel = cl::Kernel(program, "computeInteractionGroups");
+    prepareNeighborListKernel = cl::Kernel(program, "prepareToBuildNeighborList");
+    buildNeighborListKernel = cl::Kernel(program, "buildNeighborList");
    numGroupThreadBlocks = cl.getNonbondedUtilities().getNumForceThreadBlocks();
 }

 double OpenCLCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    useNeighborList = (filteredGroupData.isInitialized() && cl.getNonbondedUtilities().getUseCutoff());
+    if (useNeighborList && cl.getContextIndex() > 0) {
+        // When using a neighbor list, run the whole calculation on a single device.
+        return 0.0;
+    }
    if (globals.isInitialized()) {
        bool changed = false;
        for (int i = 0; i < (int) globalParamNames.size(); i++) {
@@ -2837,7 +2854,9 @@ double OpenCLCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool
            interactionGroupKernel.setArg<cl::Buffer>(index++, (useLong ? cl.getLongForceBuffer() : cl.getForceBuffers()).getDeviceBuffer());
            interactionGroupKernel.setArg<cl::Buffer>(index++, cl.getEnergyBuffer().getDeviceBuffer());
            interactionGroupKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
-            interactionGroupKernel.setArg<cl::Buffer>(index++, interactionGroupData.getDeviceBuffer());
+            interactionGroupKernel.setArg<cl::Buffer>(index++, (useNeighborList ? filteredGroupData : interactionGroupData).getDeviceBuffer());
+            interactionGroupKernel.setArg<cl::Buffer>(index++, numGroupTiles.getDeviceBuffer());
+            interactionGroupKernel.setArg<cl_int>(index++, useNeighborList);
            index += 5;
            for (auto& buffer : params->getBuffers())
                interactionGroupKernel.setArg<cl::Memory>(index++, buffer.getMemory());
@@ -2847,9 +2866,27 @@ double OpenCLCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool
                interactionGroupKernel.setArg<cl::Buffer>(index++, globals.getDeviceBuffer());
            if (hasParamDerivs)
                interactionGroupKernel.setArg<cl::Memory>(index++, cl.getEnergyParamDerivBuffer().getDeviceBuffer());
+            if (useNeighborList) {
+                // Initialize kernels for building the interaction group neighbor list.
+                
+                prepareNeighborListKernel.setArg<cl::Buffer>(0, cl.getNonbondedUtilities().getRebuildNeighborList().getDeviceBuffer());
+                prepareNeighborListKernel.setArg<cl::Buffer>(1, numGroupTiles.getDeviceBuffer());
+                buildNeighborListKernel.setArg<cl::Buffer>(0, cl.getNonbondedUtilities().getRebuildNeighborList().getDeviceBuffer());
+                buildNeighborListKernel.setArg<cl::Buffer>(1, numGroupTiles.getDeviceBuffer());
+                buildNeighborListKernel.setArg<cl::Buffer>(2, cl.getPosq().getDeviceBuffer());
+                buildNeighborListKernel.setArg<cl::Buffer>(3, interactionGroupData.getDeviceBuffer());
+                buildNeighborListKernel.setArg<cl::Buffer>(4, filteredGroupData.getDeviceBuffer());
+            }
        }
-        setPeriodicBoxArgs(cl, interactionGroupKernel, 4);
        int forceThreadBlockSize = max(32, cl.getNonbondedUtilities().getForceThreadBlockSize());
+        if (useNeighborList) {
+            // Rebuild the neighbor list, if necessary.
+
+            setPeriodicBoxArgs(cl, buildNeighborListKernel, 5);
+            cl.executeKernel(prepareNeighborListKernel, 1, 1);
+            cl.executeKernel(buildNeighborListKernel, numGroupThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
+        }
+        setPeriodicBoxArgs(cl, interactionGroupKernel, 6);
        cl.executeKernel(interactionGroupKernel, numGroupThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
    }
    mm_double4 boxSize = cl.getPeriodicBoxSizeDouble();

--- a/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+++ b/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2018 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -296,6 +296,7 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
        blockSorter = new OpenCLSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks);
        vector<cl_uint> count(1, 0);
        interactionCount.upload(count);
+        rebuildNeighborList.upload(count);
    }
 }

@@ -323,6 +324,11 @@ double OpenCLNonbondedUtilities::getMaxCutoffDistance() {
    return cutoff;
 }

+double OpenCLNonbondedUtilities::padCutoff(double cutoff) { // Return the padded cutoff: cutoff plus 10% when usePadding is set, otherwise cutoff unchanged.
+    double padding = (usePadding ? 0.1*cutoff : 0.0); // padding is proportional to the cutoff, or zero when padding is disabled
+    return cutoff+padding;
+}
+
 void OpenCLNonbondedUtilities::prepareInteractions(int forceGroups) {
    if ((forceGroups&groupFlags) == 0)
        return;
@@ -464,12 +470,11 @@ void OpenCLNonbondedUtilities::createKernelsForGroups(int groups) {
    kernels.cutoffDistance = cutoff;
    kernels.source = source;
    if (useCutoff) {
-        double padding = (usePadding ? 0.1*cutoff : 0.0);
-        double paddedCutoff = cutoff+padding;
+        double paddedCutoff = padCutoff(cutoff);
        map<string, string> defines;
        defines["TILE_SIZE"] = context.intToString(OpenCLContext::TileSize);
        defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms());
-        defines["PADDING"] = context.doubleToString(padding);
+        defines["PADDING"] = context.doubleToString(paddedCutoff-cutoff);
        defines["PADDED_CUTOFF"] = context.doubleToString(paddedCutoff);
        defines["PADDED_CUTOFF_SQUARED"] = context.doubleToString(paddedCutoff*paddedCutoff);
        defines["NUM_TILES_WITH_EXCLUSIONS"] = context.intToString(exclusionTiles.getSize());

--- a/platforms/opencl/src/kernels/customNonbondedGroups.cl
+++ b/platforms/opencl/src/kernels/customNonbondedGroups.cl
@@ -43,6 +43,7 @@ __kernel void computeInteractionGroups(
        __global real4* restrict forceBuffers,
 #endif
        __global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const int4* restrict groupData,
+        __global int* restrict numGroupTiles, int useNeighborList,
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
        PARAMETER_ARGUMENTS) {
    const unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
@@ -53,8 +54,8 @@ __kernel void computeInteractionGroups(
    INIT_DERIVATIVES
    __local AtomData localData[LOCAL_MEMORY_SIZE];

-    const unsigned int startTile = FIRST_TILE+warp*(LAST_TILE-FIRST_TILE)/totalWarps;
-    const unsigned int endTile = FIRST_TILE+(warp+1)*(LAST_TILE-FIRST_TILE)/totalWarps;
+    const unsigned int startTile = (useNeighborList ? warp*numGroupTiles[0]/totalWarps : FIRST_TILE+warp*(LAST_TILE-FIRST_TILE)/totalWarps);
+    const unsigned int endTile = (useNeighborList ? (warp+1)*numGroupTiles[0]/totalWarps : FIRST_TILE+(warp+1)*(LAST_TILE-FIRST_TILE)/totalWarps);
    for (int tile = startTile; tile < endTile; tile++) {
        const int4 atomData = groupData[TILE_SIZE*tile+tgx];
        const int atom1 = atomData.x;
@@ -129,3 +130,74 @@ __kernel void computeInteractionGroups(
    energyBuffer[get_global_id(0)] += energy;
    SAVE_DERIVATIVES
 }
+
+/**
+ * If the neighbor list needs to be rebuilt, reset the number of tiles to 0.  This is
+ * executed by a single thread.  rebuildNeighborList[0] == 1 requests the rebuild; numGroupTiles[0] is the counter that is zeroed.
+ */
+__kernel void prepareToBuildNeighborList(__global int* restrict rebuildNeighborList, __global int* restrict numGroupTiles) {
+    if (rebuildNeighborList[0] == 1) // NOTE(review): buildNeighborList runs for any nonzero flag, but the reset happens only for exactly 1 - confirm the flag is always 0 or 1
+        numGroupTiles[0] = 0;
+}
+
+/**
+ * Filter the list of tiles to include only ones that have interactions within the
+ * padded cutoff.  Each group of TILE_SIZE consecutive threads (a "warp" below) scans a contiguous share of the NUM_TILES tiles in groupData; a tile is kept as soon as one non-excluded atom pair has a squared distance below PADDED_CUTOFF_SQUARED, and is then copied to filteredGroupData at a slot reserved by atomically incrementing numGroupTiles[0] (so output order follows the order of those increments).  Does nothing when rebuildNeighborList[0] is 0.  Each groupData entry holds atom indices in x and y, a packed column range in z (start in the low 16 bits, end in the next 16) and a bit mask in w where a set bit allows the pair.  The periodic box arguments are not named in this body; presumably they are consumed by APPLY_PERIODIC_TO_DELTA - confirm against the macro definition.
+ */
+__kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __global int* restrict numGroupTiles,
+        __global const real4* restrict posq, __global const int4* restrict groupData, __global int4* restrict filteredGroupData,
+        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
+    
+    // If the neighbor list doesn't need to be rebuilt on this step, return immediately.
+    
+    if (rebuildNeighborList[0] == 0)
+        return;
+
+    const unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
+    const unsigned int warp = get_global_id(0)/TILE_SIZE; // global warpIndex
+    const unsigned int local_warp = get_local_id(0)/TILE_SIZE; // local warpIndex
+    const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); // index within the warp
+    const unsigned int tbx = get_local_id(0) - tgx;           // block warpIndex
+    __local real4 localPos[LOCAL_MEMORY_SIZE]; // one staged atom2 position per thread of the work-group
+    __local volatile bool anyInteraction[WARPS_IN_BLOCK]; // per-warp flag: the current tile has a pair inside the padded cutoff
+    __local volatile int tileIndex[WARPS_IN_BLOCK]; // per-warp output slot reserved for the current tile
+
+    const unsigned int startTile = warp*NUM_TILES/totalWarps; // first tile of this warp's contiguous share
+    const unsigned int endTile = (warp+1)*NUM_TILES/totalWarps; // one past the last tile of this warp's share
+    for (int tile = startTile; tile < endTile; tile++) {
+        const int4 atomData = groupData[TILE_SIZE*tile+tgx]; // this thread's entry of the tile
+        const int atom1 = atomData.x;
+        const int atom2 = atomData.y;
+        const int rangeStart = atomData.z&0xFFFF; // low 16 bits of z
+        const int rangeEnd = (atomData.z>>16)&0xFFFF; // next 16 bits of z
+        const int exclusions = atomData.w;
+        real4 posq1 = posq[atom1];
+        localPos[get_local_id(0)] = posq[atom2]; // stage atom2 so the other threads of the warp can read it
+        if (tgx == 0)
+            anyInteraction[local_warp] = false; // reset the flag for this tile
+        int tj = tgx; // each thread starts at its own column
+        SYNC_WARPS;
+        for (int j = rangeStart; j < rangeEnd && !anyInteraction[local_warp]; j++) { // stops early once any thread of the warp has found a pair
+            if (tj < rangeEnd) {
+                bool isExcluded = (((exclusions>>tj)&1) == 0); // a cleared bit marks the pair as excluded
+                int localIndex = tbx+tj;
+                real4 delta = (real4) (localPos[localIndex].xyz - posq1.xyz, 0);
+#ifdef USE_PERIODIC
+                APPLY_PERIODIC_TO_DELTA(delta)
+#endif
+                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                if (!isExcluded && r2 < PADDED_CUTOFF_SQUARED)
+                    anyInteraction[local_warp] = true;
+            }
+            tj = (tj == rangeEnd-1 ? rangeStart : tj+1); // advance one column, wrapping from rangeEnd-1 back to rangeStart
+            SYNC_WARPS;
+        }
+        if (anyInteraction[local_warp]) {
+            SYNC_WARPS;
+            if (tgx == 0)
+                tileIndex[local_warp] = atomic_add(numGroupTiles, 1); // thread 0 reserves the next output slot for the whole warp
+            SYNC_WARPS;
+            filteredGroupData[TILE_SIZE*tileIndex[local_warp]+tgx] = atomData; // every thread copies its own entry of the kept tile
+        }
+    }
+}
--- a/tests/TestCustomNonbondedForce.h
+++ b/tests/TestCustomNonbondedForce.h
@@ -7,7 +7,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2018 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -978,6 +978,68 @@ void testInteractionGroupTabulatedFunction() {
    }
 }

+void testInteractionGroupWithCutoff() { // Energy-conservation test for a CustomNonbondedForce that combines an interaction group with a periodic cutoff and a switching function.
+    const int numParticles = 1000;
+    const double boxSize = 10.0;
+    System system;
+    system.setDefaultPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
+    NonbondedForce* standard = new NonbondedForce();
+    CustomNonbondedForce* custom = new CustomNonbondedForce("100/(r+0.1)");
+    system.addForce(standard);
+    system.addForce(custom);
+    standard->setNonbondedMethod(NonbondedForce::CutoffPeriodic);
+    custom->setNonbondedMethod(CustomNonbondedForce::CutoffPeriodic);
+    standard->setCutoffDistance(1.0);
+    custom->setCutoffDistance(1.0);
+    standard->setUseSwitchingFunction(true);
+    custom->setUseSwitchingFunction(true);
+    standard->setSwitchingDistance(0.9);
+    custom->setSwitchingDistance(0.8); // the two forces share a cutoff but use different switching distances
+    vector<Vec3> positions(numParticles);
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt); // fixed seed
+    for (int i = 0; i < numParticles; i++) {
+        system.addParticle(10.0);
+        standard->addParticle(0.0, 0.2, 0.1);
+        custom->addParticle();
+        while (true) { // redraw until the new position is at least 0.5 from every earlier particle
+            positions[i] = Vec3(genrand_real2(sfmt), genrand_real2(sfmt), genrand_real2(sfmt))*boxSize;
+            bool tooClose = false;
+            for (int j = 0; j < i; j++) {
+                Vec3 delta = positions[i]-positions[j]; // plain difference; periodic images are not considered here
+                if (delta.dot(delta) < 0.5*0.5)
+                    tooClose = true;
+            }
+            if (!tooClose)
+                break;
+        }
+    }
+    set<int> set1, set2;
+    for (int i = 0; i < 10; i++)
+        set1.insert(2*i); // set1 = the even indices 0, 2, ..., 18
+    for (int i = 0; i < numParticles; i++)
+        set2.insert(i); // set2 = every particle
+    custom->addInteractionGroup(set1, set2);
+    custom->setForceGroup(1);
+    
+    // Try simulating it and see if energy is conserved (indicating that any optimizations
+    // for combining the cutoff with the interaction group are behaving consistently).
+
+    VerletIntegrator integrator(0.001);
+    Context context(system, integrator, platform);
+    context.setPositions(positions);
+    context.setVelocitiesToTemperature(100);
+    ASSERT(context.getState(State::Energy, false, 1<<1).getPotentialEnergy() != 0.0); // 1<<1 presumably selects force group 1 (the custom force only) - confirm against Context::getState
+    State initialState = context.getState(State::Energy);
+    double initialEnergy = initialState.getPotentialEnergy()+initialState.getKineticEnergy();
+    for (int i = 0; i < 100; i++) { // 100 checks, 10 integration steps between each
+        integrator.step(10);
+        State state = context.getState(State::Energy);
+        double energy = state.getPotentialEnergy()+state.getKineticEnergy();
+        ASSERT_EQUAL_TOL(initialEnergy, energy, 0.001); // total energy must stay at its initial value within the 0.001 tolerance
+    }
+}
+
 void testMultipleCutoffs() {
    System system;
    system.addParticle(1.0);
@@ -1253,6 +1315,7 @@ int main(int argc, char* argv[]) {
        testLargeInteractionGroup();
        testInteractionGroupLongRangeCorrection();
        testInteractionGroupTabulatedFunction();
+        testInteractionGroupWithCutoff();
        testMultipleCutoffs();
        testMultipleSwitches();
        testIllegalVariable();