Created CUDA implementation of restricting CustomNonbondedForce to selected interaction groups

09611266 · peastman · 9840ca70 · 09611266 · 09611266 · 09611266
Commit 09611266 authored Aug 07, 2013 by peastman
4 changed files
--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -638,7 +638,7 @@ private:
 class CudaCalcCustomNonbondedForceKernel : public CalcCustomNonbondedForceKernel {
 public:
    CudaCalcCustomNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcCustomNonbondedForceKernel(name, platform),
-            cu(cu), params(NULL), globals(NULL), tabulatedFunctionParams(NULL), forceCopy(NULL), system(system) {
+            cu(cu), params(NULL), globals(NULL), tabulatedFunctionParams(NULL), interactionGroupData(NULL), forceCopy(NULL), system(system), hasInitializedKernel(false) {
    }
    ~CudaCalcCustomNonbondedForceKernel();
    /**
@@ -665,15 +665,20 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CustomNonbondedForce& force);
 private:
+    void initInteractionGroups(const CustomNonbondedForce& force, const std::string& interactionSource);
    CudaContext& cu;
    CudaParameterSet* params;
    CudaArray* globals;
    CudaArray* tabulatedFunctionParams;
+    CudaArray* interactionGroupData;
+    CUfunction interactionGroupKernel;
+    std::vector<void*> interactionGroupArgs;
    std::vector<std::string> globalParamNames;
    std::vector<float> globalParamValues;
    std::vector<CudaArray*> tabulatedFunctions;
    double longRangeCoefficient;
-    bool hasInitializedLongRangeCorrection;
+    bool hasInitializedLongRangeCorrection, hasInitializedKernel;
+    int numGroupThreadBlocks;
    CustomNonbondedForce* forceCopy;
    const System& system;
 };

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -1862,6 +1862,17 @@ void CudaCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context,
 class CudaCustomNonbondedForceInfo : public CudaForceInfo {
 public:
    CudaCustomNonbondedForceInfo(const CustomNonbondedForce& force) : force(force) {
+        if (force.getNumInteractionGroups() > 0) {
+            groupsForParticle.resize(force.getNumParticles());
+            for (int i = 0; i < force.getNumInteractionGroups(); i++) {
+                set<int> set1, set2;
+                force.getInteractionGroupParameters(i, set1, set2);
+                for (set<int>::const_iterator iter = set1.begin(); iter != set1.end(); ++iter)
+                    groupsForParticle[*iter].insert(2*i);
+                for (set<int>::const_iterator iter = set2.begin(); iter != set2.end(); ++iter)
+                    groupsForParticle[*iter].insert(2*i+1);
+            }
+        }
    }
    bool areParticlesIdentical(int particle1, int particle2) {
        vector<double> params1;
@@ -1871,6 +1882,8 @@ public:
        for (int i = 0; i < (int) params1.size(); i++)
            if (params1[i] != params2[i])
                return false;
+        if (groupsForParticle.size() > 0 && groupsForParticle[particle1] != groupsForParticle[particle2])
+            return false;
        return true;
    }
    int getNumParticleGroups() {
@@ -1888,6 +1901,7 @@ public:
    }
 private:
    const CustomNonbondedForce& force;
+    vector<set<int> > groupsForParticle;
 };
 CudaCalcCustomNonbondedForceKernel::~CudaCalcCustomNonbondedForceKernel() {
@@ -1898,6 +1912,8 @@ CudaCalcCustomNonbondedForceKernel::~CudaCalcCustomNonbondedForceKernel() {
        delete globals;
    if (tabulatedFunctionParams != NULL)
        delete tabulatedFunctionParams;
+    if (interactionGroupData != NULL)
+        delete interactionGroupData;
    for (int i = 0; i < (int) tabulatedFunctions.size(); i++)
        delete tabulatedFunctions[i];
    if (forceCopy != NULL)
@@ -1909,7 +1925,7 @@ void CudaCalcCustomNonbondedForceKernel::initialize(const System& system, const
    int forceIndex;
    for (forceIndex = 0; forceIndex < system.getNumForces() && &system.getForce(forceIndex) != &force; ++forceIndex)
        ;
-    string prefix = "custom"+cu.intToString(forceIndex)+"_";
+    string prefix = (force.getNumInteractionGroups() == 0 ? "custom"+cu.intToString(forceIndex)+"_" : "");
    // Record parameters and exclusions.
@@ -2010,6 +2026,9 @@ void CudaCalcCustomNonbondedForceKernel::initialize(const System& system, const
        replacements["SWITCH_C5"] = cu.doubleToString(6/pow(force.getSwitchingDistance()-force.getCutoffDistance(), 5.0));
    }
    string source = cu.replaceStrings(CudaKernelSources::customNonbonded, replacements);
+    if (force.getNumInteractionGroups() > 0)
+        initInteractionGroups(force, source);
+    else {
        cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup());
        for (int i = 0; i < (int) params->getBuffers().size(); i++) {
            CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
@@ -2019,6 +2038,7 @@ void CudaCalcCustomNonbondedForceKernel::initialize(const System& system, const
            globals->upload(globalParamValues);
            cu.getNonbondedUtilities().addArgument(CudaNonbondedUtilities::ParameterInfo(prefix+"globals", "float", 1, sizeof(float), globals->getDevicePointer()));
        }
+    }
    cu.addForce(new CudaCustomNonbondedForceInfo(force));
    // Record information for the long range correction.
@@ -2033,6 +2053,215 @@ void CudaCalcCustomNonbondedForceKernel::initialize(const System& system, const
    }
 }
+void CudaCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNonbondedForce& force, const string& interactionSource) {
+    // Process groups to form tiles.
+    vector<vector<int> > atomLists;
+    vector<pair<int, int> > tiles;
+    for (int group = 0; group < force.getNumInteractionGroups(); group++) {
+        // Get the list of atoms in this group and sort them.
+        set<int> set1, set2;
+        force.getInteractionGroupParameters(group, set1, set2);
+        vector<int> atoms1, atoms2;
+        atoms1.insert(atoms1.begin(), set1.begin(), set1.end());
+        atoms2.insert(atoms2.begin(), set2.begin(), set2.end());
+        sort(atoms1.begin(), atoms1.end());
+        sort(atoms2.begin(), atoms2.end());
+        // Find how many tiles we will create for this group.
+        int tileWidth = min(32, (int) atoms2.size());
+        int numBlocks1 = (atoms1.size()+tileWidth-1)/tileWidth;
+        int numBlocks2 = (atoms2.size()+tileWidth-1)/tileWidth;
+        // Add the tiles.
+        for (int i = 0; i < numBlocks1; i++)
+            for (int j = 0; j < numBlocks2; j++)
+                tiles.push_back(make_pair(atomLists.size()+i, atomLists.size()+numBlocks1+j));
+        // Add the atom lists.
+        for (int i = 0; i < numBlocks1; i++) {
+            vector<int> atoms;
+            int first = i*tileWidth;
+            int last = min((i+1)*tileWidth, (int) atoms1.size());
+            for (int j = first; j < last; j++)
+                atoms.push_back(atoms1[j]);
+            atomLists.push_back(atoms);
+        }
+        for (int i = 0; i < numBlocks2; i++) {
+            vector<int> atoms;
+            int first = i*tileWidth;
+            int last = min((i+1)*tileWidth, (int) atoms2.size());
+            for (int j = first; j < last; j++)
+                atoms.push_back(atoms2[j]);
+            atomLists.push_back(atoms);
+        }
+    }
+    // Build a lookup table for quickly identifying excluded interactions.
+    set<pair<int, int> > exclusions;
+    for (int i = 0; i < force.getNumExclusions(); i++) {
+        int p1, p2;
+        force.getExclusionParticles(i, p1, p2);
+        exclusions.insert(make_pair(p1, p2));
+    }
+    // Build the exclusion flags for each tile.  While we're at it, filter out tiles
+    // where all interactions are excluded, and sort the tiles by size.
+    vector<vector<int> > exclusionFlags(tiles.size());
+    vector<pair<int, int> > tileOrder;
+    for (int tile = 0; tile < tiles.size(); tile++) {
+        if (atomLists[tiles[tile].first].size() < atomLists[tiles[tile].second].size()) {
+            // For efficiency, we want the first axis to be the larger one.
+            int swap = tiles[tile].first;
+            tiles[tile].first = tiles[tile].second;
+            tiles[tile].second = swap;
+        }
+        vector<int>& atoms1 = atomLists[tiles[tile].first];
+        vector<int>& atoms2 = atomLists[tiles[tile].second];
+        vector<int> flags(atoms1.size(), (1<<atoms2.size())-1);
+        int numExcluded = 0;
+        for (int i = 0; i < (int) atoms1.size(); i++)
+            for (int j = 0; j < (int) atoms2.size(); j++) {
+                int a1 = atoms1[i];
+                int a2 = atoms2[j];
+                if (a1 == a2 || exclusions.find(make_pair(a1, a2)) != exclusions.end() || exclusions.find(make_pair(a2, a1)) != exclusions.end()) {
+                    flags[i] &= -1-(1<<j);
+                    numExcluded++;
+                }
+            }
+        if (numExcluded == atoms1.size()*atoms2.size())
+            continue; // All interactions are excluded.
+        tileOrder.push_back(make_pair((int) -atoms2.size(), tile));
+        if (numExcluded > 0)
+            exclusionFlags[tile] = flags;
+    }
+    sort(tileOrder.begin(), tileOrder.end());
+    // Merge tiles to get as close as possible to 32 along the first axis of each one.
+    vector<int> tileSetStart;
+    tileSetStart.push_back(0);
+    int tileSetSize = 0;
+    for (int i = 0; i < tileOrder.size(); i++) {
+        int tile = tileOrder[i].second;
+        int size = atomLists[tiles[tile].first].size();
+        if (tileSetSize+size > 32) {
+            tileSetStart.push_back(i);
+            tileSetSize = 0;
+        }
+        tileSetSize += size;
+    }
+    tileSetStart.push_back(tileOrder.size());
+    // Build the data structures.
+    int numTileSets = tileSetStart.size()-1;
+    vector<int4> groupData;
+    for (int tileSet = 0; tileSet < numTileSets; tileSet++) {
+        int indexInTileSet = 0;
+        for (int i = tileSetStart[tileSet]; i < tileSetStart[tileSet+1]; i++) {
+            int tile = tileOrder[i].second;
+            vector<int>& atoms1 = atomLists[tiles[tile].first];
+            vector<int>& atoms2 = atomLists[tiles[tile].second];
+            int range = indexInTileSet + ((indexInTileSet+atoms1.size())<<16);
+            int allFlags = (1<<atoms2.size())-1;
+            for (int j = 0; j < (int) atoms1.size(); j++) {
+                int a1 = atoms1[j];
+                int a2 = (j < atoms2.size() ? atoms2[j] : 0);
+                int flags = (exclusionFlags[tile].size() > 0 ? exclusionFlags[tile][j] : allFlags);
+                groupData.push_back(make_int4(a1, a2, range, flags<<indexInTileSet));
+            }
+            indexInTileSet += atoms1.size();
+        }
+        for (; indexInTileSet < 32; indexInTileSet++)
+            groupData.push_back(make_int4(0, 0, 0, 0));
+    }
+    interactionGroupData = CudaArray::create<int4>(cu, groupData.size(), "interactionGroupData");
+    interactionGroupData->upload(groupData);
+    // Create the kernel.
+    map<string, string> replacements;
+    replacements["COMPUTE_INTERACTION"] = interactionSource;
+    const string suffixes[] = {"x", "y", "z", "w"};
+    stringstream localData;
+    int localDataSize = 0;
+    vector<CudaNonbondedUtilities::ParameterInfo>& buffers = params->getBuffers(); 
+    for (int i = 0; i < (int) buffers.size(); i++) {
+        if (buffers[i].getNumComponents() == 1)
+            localData<<buffers[i].getComponentType()<<" params"<<(i+1)<<";\n";
+        else {
+            for (int j = 0; j < buffers[i].getNumComponents(); ++j)
+                localData<<buffers[i].getComponentType()<<" params"<<(i+1)<<"_"<<suffixes[j]<<";\n";
+        }
+        localDataSize += buffers[i].getSize();
+    }
+    replacements["ATOM_PARAMETER_DATA"] = localData.str();
+    stringstream args;
+    for (int i = 0; i < (int) buffers.size(); i++)
+        args<<", const "<<buffers[i].getType()<<"* __restrict__ global_params"<<(i+1);
+    if (globals != NULL)
+        args<<", const float* __restrict__ globals";
+    replacements["PARAMETER_ARGUMENTS"] = args.str();
+    stringstream load1;
+    for (int i = 0; i < (int) buffers.size(); i++)
+        load1<<buffers[i].getType()<<" params"<<(i+1)<<"1 = global_params"<<(i+1)<<"[atom1];\n";
+    replacements["LOAD_ATOM1_PARAMETERS"] = load1.str();
+    stringstream loadLocal2;
+    for (int i = 0; i < (int) buffers.size(); i++) {
+        if (buffers[i].getNumComponents() == 1)
+            loadLocal2<<"localData[threadIdx.x].params"<<(i+1)<<" = global_params"<<(i+1)<<"[atom2];\n";
+        else {
+            loadLocal2<<buffers[i].getType()<<" temp_params"<<(i+1)<<" = global_params"<<(i+1)<<"[atom2];\n";
+            for (int j = 0; j < buffers[i].getNumComponents(); ++j)
+                loadLocal2<<"localData[threadIdx.x].params"<<(i+1)<<"_"<<suffixes[j]<<" = temp_params"<<(i+1)<<"."<<suffixes[j]<<";\n";
+        }
+    }
+    replacements["LOAD_LOCAL_PARAMETERS"] = loadLocal2.str();
+    stringstream load2;
+    for (int i = 0; i < (int) buffers.size(); i++) {
+        if (buffers[i].getNumComponents() == 1)
+            load2<<buffers[i].getType()<<" params"<<(i+1)<<"2 = localData[localIndex].params"<<(i+1)<<";\n";
+        else {
+            load2<<buffers[i].getType()<<" params"<<(i+1)<<"2 = make_"<<buffers[i].getType()<<"(";
+            for (int j = 0; j < buffers[i].getNumComponents(); ++j) {
+                if (j > 0)
+                    load2<<", ";
+                load2<<"localData[localIndex].params"<<(i+1)<<"_"<<suffixes[j];
+            }
+            load2<<");\n";
+        }
+    }
+    replacements["LOAD_ATOM2_PARAMETERS"] = load2.str();
+    map<string, string> defines;
+    if (force.getNonbondedMethod() != CustomNonbondedForce::NoCutoff)
+        defines["USE_CUTOFF"] = "1";
+    if (force.getNonbondedMethod() == CustomNonbondedForce::CutoffPeriodic)
+        defines["USE_PERIODIC"] = "1";
+    defines["THREAD_BLOCK_SIZE"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize());
+    double cutoff = force.getCutoffDistance();
+    defines["CUTOFF_SQUARED"] = cu.doubleToString(cutoff*cutoff);
+    defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
+    defines["TILE_SIZE"] = "32";
+    int numContexts = cu.getPlatformData().contexts.size();
+    int startIndex = cu.getContextIndex()*numTileSets/numContexts;
+    int endIndex = (cu.getContextIndex()+1)*numTileSets/numContexts;
+    defines["FIRST_TILE"] = cu.intToString(startIndex);
+    defines["LAST_TILE"] = cu.intToString(endIndex);
+    if ((localDataSize/4)%2 == 0 && !cu.getUseDoublePrecision())
+        defines["PARAMETER_SIZE_IS_EVEN"] = "1";
+    CUmodule program = cu.createModule(CudaKernelSources::vectorOps+cu.replaceStrings(CudaKernelSources::customNonbondedGroups, replacements), defines);
+    interactionGroupKernel = cu.getKernel(program, "computeInteractionGroups");
+    numGroupThreadBlocks = cu.getNonbondedUtilities().getNumForceThreadBlocks();
+}
 double CudaCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
    if (globals != NULL) {
        bool changed = false;
@@ -2054,6 +2283,21 @@ double CudaCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool in
        longRangeCoefficient = CustomNonbondedForceImpl::calcLongRangeCorrection(*forceCopy, context.getOwner());
        hasInitializedLongRangeCorrection = true;
    }
+    if (interactionGroupData != NULL) {
+        if (!hasInitializedKernel) {
+            hasInitializedKernel = true;
+            interactionGroupArgs.push_back(&cu.getForce().getDevicePointer());
+            interactionGroupArgs.push_back(&cu.getEnergyBuffer().getDevicePointer());
+            interactionGroupArgs.push_back(&cu.getPosq().getDevicePointer());
+            interactionGroupArgs.push_back(&interactionGroupData->getDevicePointer());
+            for (int i = 0; i < (int) params->getBuffers().size(); i++)
+                interactionGroupArgs.push_back(&params->getBuffers()[i].getMemory());
+            if (globals != NULL)
+                interactionGroupArgs.push_back(&globals->getDevicePointer());
+        }
+        int forceThreadBlockSize = cu.getNonbondedUtilities().getForceThreadBlockSize();
+        cu.executeKernel(interactionGroupKernel, &interactionGroupArgs[0], numGroupThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
+    }
    double4 boxSize = cu.getPeriodicBoxSize();
    return longRangeCoefficient/(boxSize.x*boxSize.y*boxSize.z);
 }

--- a/platforms/cuda/src/kernels/customNonbondedGroups.cu
+++ b/platforms/cuda/src/kernels/customNonbondedGroups.cu
+#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
+typedef struct {
+    real x, y, z;
+    real q;
+    real fx, fy, fz;
+    ATOM_PARAMETER_DATA
+#ifndef PARAMETER_SIZE_IS_EVEN
+    real padding;
+#endif
+} AtomData;
+extern "C" __global__ void computeInteractionGroups(
+        unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, const real4* __restrict__ posq, const int4* __restrict__ groupData
+        PARAMETER_ARGUMENTS) {
+    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
+    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; // global warpIndex
+    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // index within the warp
+    const unsigned int tbx = threadIdx.x - tgx;           // block warpIndex
+    real energy = 0.0f;
+    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
+    const unsigned int startTile = FIRST_TILE+warp*(LAST_TILE-FIRST_TILE)/totalWarps;
+    const unsigned int endTile = FIRST_TILE+(warp+1)*(LAST_TILE-FIRST_TILE)/totalWarps;
+    for (int tile = startTile; tile < endTile; tile++) {
+        const int4 atomData = groupData[TILE_SIZE*tile+tgx];
+        const int atom1 = atomData.x;
+        const int atom2 = atomData.y;
+        const int rangeStart = atomData.z&0xFFFF;
+        const int rangeEnd = (atomData.z>>16)&0xFFFF;
+        const int exclusions = atomData.w;
+        real4 posq1 = posq[atom1];
+        LOAD_ATOM1_PARAMETERS
+        real3 force = make_real3(0);
+        real4 posq2 = posq[atom2];
+        localData[threadIdx.x].x = posq2.x;
+        localData[threadIdx.x].y = posq2.y;
+        localData[threadIdx.x].z = posq2.z;
+        localData[threadIdx.x].q = posq2.w;
+        LOAD_LOCAL_PARAMETERS
+        localData[threadIdx.x].fx = 0.0f;
+        localData[threadIdx.x].fy = 0.0f;
+        localData[threadIdx.x].fz = 0.0f;
+        int tj = tgx;
+        for (int j = rangeStart; j < rangeEnd; j++) {
+            bool isExcluded = (((exclusions>>tj)&1) == 0);
+            int localIndex = tbx+tj;
+            posq2 = make_real4(localData[localIndex].x, localData[localIndex].y, localData[localIndex].z, localData[localIndex].q);
+            real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+#ifdef USE_PERIODIC
+            delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+            delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+            delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+            real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+            real invR = RSQRT(r2);
+            real r = RECIP(invR);
+            LOAD_ATOM2_PARAMETERS
+            real dEdR = 0.0f;
+            real tempEnergy = 0.0f;
+            COMPUTE_INTERACTION
+            energy += tempEnergy;
+            delta *= dEdR;
+            force.x -= delta.x;
+            force.y -= delta.y;
+            force.z -= delta.z;
+            localData[localIndex].fx += delta.x;
+            localData[localIndex].fy += delta.y;
+            localData[localIndex].fz += delta.z;
+            tj = (tj == rangeEnd-1 ? rangeStart : tj+1);
+        }
+        if (exclusions != 0) {
+            atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
+            atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
+            atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
+            atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
+            atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
+            atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
+        }
+    }
+    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
+}
\ No newline at end of file
--- a/platforms/cuda/tests/TestCudaCustomNonbondedForce.cpp
+++ b/platforms/cuda/tests/TestCudaCustomNonbondedForce.cpp
@@ -538,6 +538,109 @@ void testLongRangeCorrection() {
    ASSERT_EQUAL_TOL(standardEnergy1-standardEnergy2, customEnergy1-customEnergy2, 1e-4);
 }
+void testInteractionGroups() {
+    const int numParticles = 6;
+    System system;
+    VerletIntegrator integrator(0.01);
+    CustomNonbondedForce* nonbonded = new CustomNonbondedForce("v1+v2");
+    nonbonded->addPerParticleParameter("v");
+    vector<double> params(1, 0.001);
+    for (int i = 0; i < numParticles; i++) {
+        system.addParticle(1.0);
+        nonbonded->addParticle(params);
+        params[0] *= 10;
+    }
+    set<int> set1, set2, set3, set4;
+    set1.insert(2);
+    set2.insert(0);
+    set2.insert(1);
+    set2.insert(2);
+    set2.insert(3);
+    set2.insert(4);
+    set2.insert(5);
+    nonbonded->addInteractionGroup(set1, set2); // Particle 2 interacts with every other particle.
+    set3.insert(0);
+    set3.insert(1);
+    set4.insert(4);
+    set4.insert(5);
+    nonbonded->addInteractionGroup(set3, set4); // Particles 0 and 1 interact with 4 and 5.
+    nonbonded->addExclusion(1, 2); // Add an exclusion to make sure it gets skipped.
+    system.addForce(nonbonded);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(numParticles);
+    context.setPositions(positions);
+    State state = context.getState(State::Energy);
+    double expectedEnergy = 331.423; // Each digit is the number of interactions a particle particle is involved in.
+    ASSERT_EQUAL_TOL(expectedEnergy, state.getPotentialEnergy(), TOL);
+}
+void testLargeInteractionGroup() {
+    const int numMolecules = 300;
+    const int numParticles = numMolecules*2;
+    const double boxSize = 20.0;
+    // Create a large system.
+    System system;
+    for (int i = 0; i < numParticles; i++)
+        system.addParticle(1.0);
+    CustomNonbondedForce* nonbonded = new CustomNonbondedForce("4*eps*((sigma/r)^12-(sigma/r)^6)+138.935456*q/r; q=q1*q2; sigma=0.5*(sigma1+sigma2); eps=sqrt(eps1*eps2)");
+    nonbonded->addPerParticleParameter("q");
+    nonbonded->addPerParticleParameter("sigma");
+    nonbonded->addPerParticleParameter("eps");
+    vector<Vec3> positions(numParticles);
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+    vector<double> params(3);
+    for (int i = 0; i < numMolecules; i++) {
+        if (i < numMolecules/2) {
+            params[0] = 1.0;
+            params[1] = 0.2;
+            params[2] = 0.1;
+            nonbonded->addParticle(params);
+            params[0] = -1.0;
+            params[1] = 0.1;
+            nonbonded->addParticle(params);
+        }
+        else {
+            params[0] = 1.0;
+            params[1] = 0.2;
+            params[2] = 0.2;
+            nonbonded->addParticle(params);
+            params[0] = -1.0;
+            params[1] = 0.1;
+            nonbonded->addParticle(params);
+        }
+        positions[2*i] = Vec3(boxSize*genrand_real2(sfmt), boxSize*genrand_real2(sfmt), boxSize*genrand_real2(sfmt));
+        positions[2*i+1] = Vec3(positions[2*i][0]+1.0, positions[2*i][1], positions[2*i][2]);
+        nonbonded->addExclusion(2*i, 2*i+1);
+    }
+    nonbonded->setNonbondedMethod(CustomNonbondedForce::NoCutoff);
+    system.addForce(nonbonded);
+    // Compute the forces.
+    VerletIntegrator integrator(0.01);
+    Context context(system, integrator, platform);
+    context.setPositions(positions);
+    State state1 = context.getState(State::Forces);
+    // Modify the force so only one particle interacts with everything else.
+    set<int> set1, set2;
+    set1.insert(151);
+    for (int i = 0; i < numParticles; i++)
+        set2.insert(i);
+    nonbonded->addInteractionGroup(set1, set2);
+    context.reinitialize();
+    context.setPositions(positions);
+    State state2 = context.getState(State::Forces);
+    // The force on that one particle should be the same.
+    ASSERT_EQUAL_VEC(state1.getForces()[151], state2.getForces()[151], 1e-4);
+}
 int main(int argc, char* argv[]) {
    try {
        if (argc > 1)
@@ -553,6 +656,8 @@ int main(int argc, char* argv[]) {
        testParallelComputation();
        testSwitchingFunction();
        testLongRangeCorrection();
+        testInteractionGroups();
+        testLargeInteractionGroup();
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;