Merge branch 'master' of github.com:leeping/openmm

3cb25ad8 · Lee-Ping Wang · 7bfb75c7 · 24608623 · 3cb25ad8 · 3cb25ad8
Commit 3cb25ad8 authored Aug 25, 2013 by Lee-Ping Wang
20 changed files
--- a/openmmapi/src/MonteCarloAnisotropicBarostat.cpp
+++ b/openmmapi/src/MonteCarloAnisotropicBarostat.cpp
@@ -35,8 +35,8 @@

 using namespace OpenMM;

-MonteCarloAnisotropicBarostat::MonteCarloAnisotropicBarostat(const Vec3& defaultPressure, double temperature, int frequency, bool scaleX, bool scaleY, bool scaleZ) :
-        defaultPressure(defaultPressure), temperature(temperature), frequency(frequency), scaleX(scaleX), scaleY(scaleY), scaleZ(scaleZ) {
+MonteCarloAnisotropicBarostat::MonteCarloAnisotropicBarostat(const Vec3& defaultPressure, double temperature, bool scaleX, bool scaleY, bool scaleZ, int frequency) : // frequency is now the last parameter; it previously came before the scale flags
+        defaultPressure(defaultPressure), temperature(temperature), scaleX(scaleX), scaleY(scaleY), scaleZ(scaleZ), frequency(frequency) { // NOTE(review): members are initialized in declaration order, not in the order listed here -- confirm the header declares them in this order
    setRandomNumberSeed((int) time(NULL));
 }


--- a/platforms/cuda/include/CudaContext.h
+++ b/platforms/cuda/include/CudaContext.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2013 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -489,7 +489,6 @@ private:
    struct MoleculeGroup;
    class VirtualSiteInfo;
    void findMoleculeGroups();
-    static void tagAtomsInMolecule(int atom, int molecule, std::vector<int>& atomMolecule, std::vector<std::vector<int> >& atomBonds);
    /**
     * Ensure that all molecules marked as "identical" really are identical.  This should be
     * called whenever force field parameters change.  If necessary, it will rebuild the list
@@ -515,7 +514,7 @@ private:
    int numAtomBlocks;
    int numThreadBlocks;
    bool useBlockingSync, useDoublePrecision, useMixedPrecision, contextIsValid, atomsWereReordered;
-    std::string compiler, tempDir, gpuArchitecture;
+    std::string compiler, tempDir, cacheDir, gpuArchitecture;
    float4 periodicBoxSizeFloat, invPeriodicBoxSizeFloat;
    double4 periodicBoxSize, invPeriodicBoxSize;
    std::string defaultOptimizationOptions;

--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -638,7 +638,7 @@ private:
 class CudaCalcCustomNonbondedForceKernel : public CalcCustomNonbondedForceKernel {
 public:
    CudaCalcCustomNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcCustomNonbondedForceKernel(name, platform),
-            cu(cu), params(NULL), globals(NULL), tabulatedFunctionParams(NULL), forceCopy(NULL), system(system) {
+            cu(cu), params(NULL), globals(NULL), tabulatedFunctionParams(NULL), interactionGroupData(NULL), forceCopy(NULL), system(system), hasInitializedKernel(false) {
    }
    ~CudaCalcCustomNonbondedForceKernel();
    /**
@@ -665,15 +665,20 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CustomNonbondedForce& force);
 private:
+    void initInteractionGroups(const CustomNonbondedForce& force, const std::string& interactionSource);
    CudaContext& cu;
    CudaParameterSet* params;
    CudaArray* globals;
    CudaArray* tabulatedFunctionParams;
+    CudaArray* interactionGroupData;
+    CUfunction interactionGroupKernel;
+    std::vector<void*> interactionGroupArgs;
    std::vector<std::string> globalParamNames;
    std::vector<float> globalParamValues;
    std::vector<CudaArray*> tabulatedFunctions;
    double longRangeCoefficient;
-    bool hasInitializedLongRangeCorrection;
+    bool hasInitializedLongRangeCorrection, hasInitializedKernel;
+    int numGroupThreadBlocks;
    CustomNonbondedForce* forceCopy;
    const System& system;
 };

--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2013 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -35,19 +35,25 @@
 #include "CudaIntegrationUtilities.h"
 #include "CudaKernelSources.h"
 #include "CudaNonbondedUtilities.h"
+#include "SHA1.h"
 #include "hilbert.h"
 #include "openmm/OpenMMException.h"
 #include "openmm/Platform.h"
 #include "openmm/System.h"
 #include "openmm/VirtualSite.h"
 #include "CudaExpressionUtilities.h"
+#include "openmm/internal/ContextImpl.h"
 #include <algorithm>
 #include <cstdlib>
 #include <fstream>
+#include <iomanip>
 #include <iostream>
 #include <sstream>
 #include <typeinfo>
 #include <cudaProfiler.h>
+#ifndef WIN32
+  #include <unistd.h>
+#endif


 #define CHECK_RESULT(result) CHECK_RESULT2(result, errorMessage);
@@ -87,10 +93,14 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    }
    else
        throw OpenMMException("Illegal value for CudaPrecision: "+precision);
+    char* cacheVariable = getenv("OPENMM_CACHE_DIR");
+    cacheDir = (cacheVariable == NULL ? tempDir : string(cacheVariable));
 #ifdef WIN32
    this->tempDir = tempDir+"\\";
+    cacheDir = cacheDir+"\\";
 #else
    this->tempDir = tempDir+"/";
+    cacheDir = cacheDir+"/";
 #endif
    contextIndex = platformData.contexts.size();
    int numDevices;
@@ -214,6 +224,8 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    compilationDefines["ACOS"] = useDoublePrecision ? "acos" : "acosf";
    compilationDefines["ASIN"] = useDoublePrecision ? "asin" : "asinf";
    compilationDefines["ATAN"] = useDoublePrecision ? "atan" : "atanf";
+    compilationDefines["ERF"] = useDoublePrecision ? "erf" : "erff";
+    compilationDefines["ERFC"] = useDoublePrecision ? "erfc" : "erfcf";
    
    // Create the work thread used for parallelization when running on multiple devices.
    
@@ -347,6 +359,7 @@ static bool compileInWindows(const string &command) {
 #endif

 CUmodule CudaContext::createModule(const string source, const map<string, string>& defines, const char* optimizationFlags) {
+    string bits = intToString(8*sizeof(void*));
    string options = (optimizationFlags == NULL ? defaultOptimizationOptions : string(optimizationFlags));
    stringstream src;
    if (!options.empty())
@@ -394,17 +407,38 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
        src << endl;
    src << source << endl;
    
+    // See whether we already have PTX for this kernel cached.
+    
+    CSHA1 sha1;
+    sha1.Update((const UINT_8*) src.str().c_str(), src.str().size());
+    sha1.Final();
+    UINT_8 hash[20];
+    sha1.GetHash(hash);
+    stringstream cacheFile;
+    cacheFile << cacheDir;
+    cacheFile.flags(ios::hex);
+    for (int i = 0; i < 20; i++)
+        cacheFile << setw(2) << setfill('0') << (int) hash[i];
+    cacheFile << '_' << gpuArchitecture << '_' << bits;
+    CUmodule module;
+    if (cuModuleLoad(&module, cacheFile.str().c_str()) == CUDA_SUCCESS)
+        return module;
+    
    // Write out the source to a temporary file.
    
    stringstream tempFileName;
    tempFileName << "openmmTempKernel" << this; // Include a pointer to this context as part of the filename to avoid collisions.
+#ifdef WIN32
+    tempFileName << "_" << GetCurrentProcessId();
+#else
+    tempFileName << "_" << getpid();
+#endif
    string inputFile = (tempDir+tempFileName.str()+".cu");
    string outputFile = (tempDir+tempFileName.str()+".ptx");
    string logFile = (tempDir+tempFileName.str()+".log");
    ofstream out(inputFile.c_str());
    out << src.str();
    out.close();
-    string bits = intToString(8*sizeof(void*));
 #ifdef WIN32
 #ifdef _DEBUG
    string command = "\""+compiler+"\" --ptx -G -g --machine "+bits+" -arch=sm_"+gpuArchitecture+" -o "+outputFile+" "+options+" "+inputFile+" 2> "+logFile;
@@ -433,7 +467,6 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
            }
            throw OpenMMException(error.str());
        }
-        CUmodule module;
        CUresult result = cuModuleLoad(&module, outputFile.c_str());
        if (result != CUDA_SUCCESS) {
            std::stringstream m;
@@ -441,7 +474,8 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
            throw OpenMMException(m.str());
        }
        remove(inputFile.c_str());
-        remove(outputFile.c_str());
+        if (rename(outputFile.c_str(), cacheFile.str().c_str()) != 0)
+            remove(outputFile.c_str());
        remove(logFile.c_str());
        return module;
    }
@@ -616,15 +650,6 @@ void CudaContext::clearAutoclearBuffers() {
    }
 }

-void CudaContext::tagAtomsInMolecule(int atom, int molecule, vector<int>& atomMolecule, vector<vector<int> >& atomBonds) {
-    // Recursively tag atoms as belonging to a particular molecule.
-
-    atomMolecule[atom] = molecule;
-    for (int i = 0; i < (int) atomBonds[atom].size(); i++)
-        if (atomMolecule[atomBonds[atom][i]] == -1)
-            tagAtomsInMolecule(atomBonds[atom][i], molecule, atomMolecule, atomBonds);
-}
-
 /**
 * This class ensures that atom reordering doesn't break virtual sites.
 */
@@ -719,16 +744,14 @@ void CudaContext::findMoleculeGroups() {
            }
        }

-        // Now tag atoms by which molecule they belong to.
+        // Now identify atoms by which molecule they belong to.

-        vector<int> atomMolecule(numAtoms, -1);
-        int numMolecules = 0;
-        for (int i = 0; i < numAtoms; i++)
-            if (atomMolecule[i] == -1)
-                tagAtomsInMolecule(i, numMolecules++, atomMolecule, atomBonds);
-        vector<vector<int> > atomIndices(numMolecules);
-        for (int i = 0; i < numAtoms; i++)
-            atomIndices[atomMolecule[i]].push_back(i);
+        vector<vector<int> > atomIndices = ContextImpl::findMolecules(numAtoms, atomBonds);
+        int numMolecules = atomIndices.size();
+        vector<int> atomMolecule(numAtoms);
+        for (int i = 0; i < (int) atomIndices.size(); i++)
+            for (int j = 0; j < (int) atomIndices[i].size(); j++)
+                atomMolecule[atomIndices[i][j]] = i;

        // Construct a description of each molecule.


--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
--- a/platforms/cuda/src/kernels/andersenThermostat.cu
+++ b/platforms/cuda/src/kernels/andersenThermostat.cu
@@ -2,11 +2,11 @@
 * Apply the Andersen thermostat to adjust particle velocities.
 */

-extern "C" __global__ void applyAndersenThermostat(float collisionFrequency, float kT, mixed4* velm, const mixed4* __restrict__ stepSize, const float4* __restrict__ random,
+extern "C" __global__ void applyAndersenThermostat(int numAtoms, float collisionFrequency, float kT, mixed4* velm, const mixed4* __restrict__ stepSize, const float4* __restrict__ random,
        unsigned int randomIndex, const int* __restrict__ atomGroups) {
    float collisionProbability = 1.0f-expf(-(float) (collisionFrequency*stepSize[0].y));
    float randomRange = erff(collisionProbability/sqrtf(2.0f));
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
        mixed4 velocity = velm[index];
        float4 selectRand = random[randomIndex+atomGroups[index]];
        float4 velRand = random[randomIndex+index];

--- a/platforms/cuda/src/kernels/brownian.cu
+++ b/platforms/cuda/src/kernels/brownian.cu
@@ -2,16 +2,16 @@
 * Perform the first step of Brownian integration.
 */

-extern "C" __global__ void integrateBrownianPart1(mixed tauDeltaT, mixed noiseAmplitude, const long long* __restrict__ force,
+extern "C" __global__ void integrateBrownianPart1(int numAtoms, int paddedNumAtoms, mixed tauDeltaT, mixed noiseAmplitude, const long long* __restrict__ force,
        mixed4* __restrict__ posDelta, const mixed4* __restrict__ velm, const float4* __restrict__ random, unsigned int randomIndex) {
    randomIndex += blockIdx.x*blockDim.x+threadIdx.x;
    const mixed fscale = tauDeltaT/(mixed) 0x100000000;
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
        mixed invMass = velm[index].w;
        if (invMass != 0) {
            posDelta[index].x = fscale*invMass*force[index] + noiseAmplitude*SQRT(invMass)*random[randomIndex].x;
-            posDelta[index].y = fscale*invMass*force[index+PADDED_NUM_ATOMS] + noiseAmplitude*SQRT(invMass)*random[randomIndex].y;
-            posDelta[index].z = fscale*invMass*force[index+PADDED_NUM_ATOMS*2] + noiseAmplitude*SQRT(invMass)*random[randomIndex].z;
+            posDelta[index].y = fscale*invMass*force[index+paddedNumAtoms] + noiseAmplitude*SQRT(invMass)*random[randomIndex].y;
+            posDelta[index].z = fscale*invMass*force[index+paddedNumAtoms*2] + noiseAmplitude*SQRT(invMass)*random[randomIndex].z;
        }
        randomIndex += blockDim.x*gridDim.x;
    }
@@ -21,9 +21,9 @@ extern "C" __global__ void integrateBrownianPart1(mixed tauDeltaT, mixed noiseAm
 * Perform the second step of Brownian integration.
 */

-extern "C" __global__ void integrateBrownianPart2(mixed deltaT, real4* posq, real4* __restrict__ posqCorrection, mixed4* velm, const mixed4* __restrict__ posDelta) {
+extern "C" __global__ void integrateBrownianPart2(int numAtoms, mixed deltaT, real4* posq, real4* __restrict__ posqCorrection, mixed4* velm, const mixed4* __restrict__ posDelta) {
    const mixed oneOverDeltaT = RECIP(deltaT);
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
        if (velm[index].w != 0) {
            mixed4 delta = posDelta[index];
            velm[index].x = oneOverDeltaT*delta.x;

--- a/platforms/cuda/src/kernels/constraints.cu
+++ b/platforms/cuda/src/kernels/constraints.cu
-extern "C" __global__ void applyPositionDeltas(real4* __restrict__ posq, real4* __restrict__ posqCorrection, mixed4* __restrict__ posDelta) {
-    for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
+extern "C" __global__ void applyPositionDeltas(int numAtoms, real4* __restrict__ posq, real4* __restrict__ posqCorrection, mixed4* __restrict__ posDelta) {
+    for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
 #ifdef USE_MIXED_PRECISION
        real4 pos1 = posq[index];
        real4 pos2 = posqCorrection[index];

--- a/platforms/cuda/src/kernels/coulombLennardJones.cu
+++ b/platforms/cuda/src/kernels/coulombLennardJones.cu
@@ -22,8 +22,11 @@ if ((!isExcluded && r2 < CUTOFF_SQUARED) || needCorrection) {
    if (needCorrection) {
        // Subtract off the part of this interaction that was included in the reciprocal space contribution.

-        tempForce = -prefactor*((1.0f-erfcAlphaR)-alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
-        tempEnergy += -prefactor*(1.0f-erfcAlphaR);
+        if (1-erfcAlphaR > 1e-6) {
+            real erfAlphaR = ERF(alphaR); // Our erfc approximation is not accurate enough when r is very small, which happens with Drude particles.
+            tempForce = -prefactor*(erfAlphaR-alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
+            tempEnergy += -prefactor*erfAlphaR;
+        }
    }
    else {
 #if HAS_LENNARD_JONES

--- a/platforms/cuda/src/kernels/customNonbondedGroups.cu
+++ b/platforms/cuda/src/kernels/customNonbondedGroups.cu
+// Number of TILE_SIZE-wide thread groups contained in one thread block.
+#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
+
+/**
+ * Per-thread record kept in shared memory by computeInteractionGroups().
+ * It holds the staged particle's position (x, y, z), the w component of its posq
+ * entry (q), the force accumulated on it (fx, fy, fz), and whatever fields the
+ * ATOM_PARAMETER_DATA macro expands to.
+ */
+typedef struct {
+    real x;
+    real y;
+    real z;
+    real q;
+    real fx;
+    real fy;
+    real fz;
+    ATOM_PARAMETER_DATA
+#ifndef PARAMETER_SIZE_IS_EVEN
+    // Extra word added only when PARAMETER_SIZE_IS_EVEN is not defined.
+    // NOTE(review): presumably keeps the struct an odd number of words to avoid
+    // shared memory bank conflicts -- confirm against the host code that sets the macro.
+    real padding;
+#endif
+} AtomData;
+
+/**
+ * Compute forces and energy for the interactions described by groupData.
+ *
+ * The tiles in [FIRST_TILE, LAST_TILE) are divided evenly among all groups of TILE_SIZE
+ * consecutive threads (called "warps" below).  For each tile, every lane reads one int4
+ * entry of groupData:
+ *   x - index of the particle ("atom1") whose force this lane accumulates in registers
+ *   y - index of the particle ("atom2") this lane stages in shared memory
+ *   z - low 16 bits: first shared-memory slot to visit; high 16 bits: one past the last
+ *   w - bit mask; bit j set means the interaction with slot j is enabled for this lane
+ *
+ * @param forceBuffers        fixed point (scaled by 2^32) force accumulators; the x, y and z
+ *                            components of particle i are at i, i+PADDED_NUM_ATOMS and
+ *                            i+2*PADDED_NUM_ATOMS
+ * @param energyBuffer        one energy accumulator per thread of the launch
+ * @param posq                particle positions (xyz) plus a fourth per-particle value (w)
+ * @param groupData           TILE_SIZE entries per tile, laid out as described above
+ * @param periodicBoxSize     periodic box dimensions (used only if USE_PERIODIC is defined)
+ * @param invPeriodicBoxSize  reciprocal box dimensions (used only if USE_PERIODIC is defined)
+ */
+extern "C" __global__ void computeInteractionGroups(
+        unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, const real4* __restrict__ posq, const int4* __restrict__ groupData,
+        real4 periodicBoxSize, real4 invPeriodicBoxSize
+        PARAMETER_ARGUMENTS) {
+    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
+    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; // global warpIndex
+    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // index within the warp
+    const unsigned int tbx = threadIdx.x - tgx;           // block warpIndex
+    real energy = 0.0f;
+    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
+
+    // Give each warp a contiguous, evenly sized share of the tiles.
+
+    const unsigned int startTile = FIRST_TILE+warp*(LAST_TILE-FIRST_TILE)/totalWarps;
+    const unsigned int endTile = FIRST_TILE+(warp+1)*(LAST_TILE-FIRST_TILE)/totalWarps;
+    for (int tile = startTile; tile < endTile; tile++) {
+        const int4 atomData = groupData[TILE_SIZE*tile+tgx];
+        const int atom1 = atomData.x;
+        const int atom2 = atomData.y;
+        const int rangeStart = atomData.z&0xFFFF;
+        const int rangeEnd = (atomData.z>>16)&0xFFFF;
+        const int exclusions = atomData.w;
+        real4 posq1 = posq[atom1];
+        LOAD_ATOM1_PARAMETERS
+        real3 force = make_real3(0);
+
+        // Stage this lane's atom2 in shared memory so the other lanes of the warp can read it.
+
+        real4 posq2 = posq[atom2];
+        localData[threadIdx.x].x = posq2.x;
+        localData[threadIdx.x].y = posq2.y;
+        localData[threadIdx.x].z = posq2.z;
+        localData[threadIdx.x].q = posq2.w;
+        LOAD_LOCAL_PARAMETERS
+        localData[threadIdx.x].fx = 0.0f;
+        localData[threadIdx.x].fy = 0.0f;
+        localData[threadIdx.x].fz = 0.0f;
+
+        // NOTE(review): no barrier separates the shared memory stores above from the loads
+        // below that read other lanes' slots, so this relies on all TILE_SIZE lanes of a
+        // warp executing in lockstep.
+
+        int tj = tgx;
+        for (int j = rangeStart; j < rangeEnd; j++) {
+            // A cleared bit in the mask means this lane must skip slot tj.
+
+            bool isExcluded = (((exclusions>>tj)&1) == 0);
+            int localIndex = tbx+tj;
+            posq2 = make_real4(localData[localIndex].x, localData[localIndex].y, localData[localIndex].z, localData[localIndex].q);
+            real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+#ifdef USE_PERIODIC
+            delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+            delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+            delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+            real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+
+            // The exclusion mask must be honored whether or not a cutoff is in use; otherwise
+            // excluded pairs and slots with an empty mask would contribute energy and forces.
+
+#ifdef USE_CUTOFF
+            if (!isExcluded && r2 < CUTOFF_SQUARED) {
+#else
+            if (!isExcluded) {
+#endif
+                real invR = RSQRT(r2);
+                real r = RECIP(invR);
+                LOAD_ATOM2_PARAMETERS
+                real dEdR = 0.0f;
+                real tempEnergy = 0.0f;
+                COMPUTE_INTERACTION
+                energy += tempEnergy;
+                delta *= dEdR;
+                force.x -= delta.x;
+                force.y -= delta.y;
+                force.z -= delta.z;
+                localData[localIndex].fx += delta.x;
+                localData[localIndex].fy += delta.y;
+                localData[localIndex].fz += delta.z;
+            }
+
+            // Move to the next slot, wrapping around inside [rangeStart, rangeEnd).
+
+            tj = (tj == rangeEnd-1 ? rangeStart : tj+1);
+        }
+
+        // A lane with an empty mask had every interaction disabled, so its atom1 force is
+        // zero and the write can be skipped.
+
+        if (exclusions != 0) {
+            atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
+            atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
+            atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
+        }
+
+        // Flush the force that the warp accumulated on this lane's atom2.
+
+        atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
+        atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
+        atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
+    }
+    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
+}
\ No newline at end of file
--- a/platforms/cuda/src/kernels/langevin.cu
+++ b/platforms/cuda/src/kernels/langevin.cu
@@ -4,7 +4,7 @@ enum {VelScale, ForceScale, NoiseScale, MaxParams};
 * Perform the first step of Langevin integration.
 */

-extern "C" __global__ void integrateLangevinPart1(mixed4* __restrict__ velm, const long long* __restrict__ force, mixed4* __restrict__ posDelta,
+extern "C" __global__ void integrateLangevinPart1(int numAtoms, int paddedNumAtoms, mixed4* __restrict__ velm, const long long* __restrict__ force, mixed4* __restrict__ posDelta,
        const mixed* __restrict__ paramBuffer, const mixed2* __restrict__ dt, const float4* __restrict__ random, unsigned int randomIndex) {
    mixed vscale = paramBuffer[VelScale];
    mixed fscale = paramBuffer[ForceScale]/(mixed) 0x100000000;
@@ -12,13 +12,13 @@ extern "C" __global__ void integrateLangevinPart1(mixed4* __restrict__ velm, con
    mixed stepSize = dt[0].y;
    int index = blockIdx.x*blockDim.x+threadIdx.x;
    randomIndex += index;
-    while (index < NUM_ATOMS) {
+    while (index < numAtoms) {
        mixed4 velocity = velm[index];
        if (velocity.w != 0) {
            mixed sqrtInvMass = SQRT(velocity.w);
            velocity.x = vscale*velocity.x + fscale*velocity.w*force[index] + noisescale*sqrtInvMass*random[randomIndex].x;
-            velocity.y = vscale*velocity.y + fscale*velocity.w*force[index+PADDED_NUM_ATOMS] + noisescale*sqrtInvMass*random[randomIndex].y;
-            velocity.z = vscale*velocity.z + fscale*velocity.w*force[index+PADDED_NUM_ATOMS*2] + noisescale*sqrtInvMass*random[randomIndex].z;
+            velocity.y = vscale*velocity.y + fscale*velocity.w*force[index+paddedNumAtoms] + noisescale*sqrtInvMass*random[randomIndex].y;
+            velocity.z = vscale*velocity.z + fscale*velocity.w*force[index+paddedNumAtoms*2] + noisescale*sqrtInvMass*random[randomIndex].z;
            velm[index] = velocity;
            posDelta[index] = make_mixed4(stepSize*velocity.x, stepSize*velocity.y, stepSize*velocity.z, 0);
        }
@@ -31,7 +31,7 @@ extern "C" __global__ void integrateLangevinPart1(mixed4* __restrict__ velm, con
 * Perform the second step of Langevin integration.
 */

-extern "C" __global__ void integrateLangevinPart2(real4* __restrict__ posq, real4* __restrict__ posqCorrection, const mixed4* __restrict__ posDelta, mixed4* __restrict__ velm, const mixed2* __restrict__ dt) {
+extern "C" __global__ void integrateLangevinPart2(int numAtoms, real4* __restrict__ posq, real4* __restrict__ posqCorrection, const mixed4* __restrict__ posDelta, mixed4* __restrict__ velm, const mixed2* __restrict__ dt) {
 #if __CUDA_ARCH__ >= 130
    double invStepSize = 1.0/dt[0].y;
 #else
@@ -39,7 +39,7 @@ extern "C" __global__ void integrateLangevinPart2(real4* __restrict__ posq, real
    float correction = (1.0f-invStepSize*dt[0].y)/dt[0].y;
 #endif
    int index = blockIdx.x*blockDim.x+threadIdx.x;
-    while (index < NUM_ATOMS) {
+    while (index < numAtoms) {
        mixed4 vel = velm[index];
        if (vel.w != 0) {
 #ifdef USE_MIXED_PRECISION
@@ -78,7 +78,7 @@ extern "C" __global__ void integrateLangevinPart2(real4* __restrict__ posq, real
 * Select the step size to use for the next step.
 */

-extern "C" __global__ void selectLangevinStepSize(mixed maxStepSize, mixed errorTol, mixed tau, mixed kT, mixed2* __restrict__ dt,
+extern "C" __global__ void selectLangevinStepSize(int numAtoms, int paddedNumAtoms, mixed maxStepSize, mixed errorTol, mixed tau, mixed kT, mixed2* __restrict__ dt,
        const mixed4* __restrict__ velm, const long long* __restrict__ force, mixed* __restrict__ paramBuffer) {
    // Calculate the error.

@@ -87,8 +87,8 @@ extern "C" __global__ void selectLangevinStepSize(mixed maxStepSize, mixed error
    mixed err = 0;
    unsigned int index = threadIdx.x;
    const mixed scale = RECIP((mixed) 0x100000000);
-    while (index < NUM_ATOMS) {
-        mixed3 f = make_mixed3(scale*force[index], scale*force[index+PADDED_NUM_ATOMS], scale*force[index+PADDED_NUM_ATOMS*2]);
+    while (index < numAtoms) {
+        mixed3 f = make_mixed3(scale*force[index], scale*force[index+paddedNumAtoms], scale*force[index+paddedNumAtoms*2]);
        mixed invMass = velm[index].w;
        err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass;
        index += blockDim.x*gridDim.x;
@@ -106,7 +106,7 @@ extern "C" __global__ void selectLangevinStepSize(mixed maxStepSize, mixed error
    if (blockIdx.x*blockDim.x+threadIdx.x == 0) {
        // Select the new step size.

-        mixed totalError = SQRT(error[0]/(NUM_ATOMS*3));
+        mixed totalError = SQRT(error[0]/(numAtoms*3));
        mixed newStepSize = SQRT(errorTol/totalError);
        mixed oldStepSize = dt[0].y;
        if (oldStepSize > 0.0f)

--- a/platforms/cuda/src/kernels/verlet.cu
+++ b/platforms/cuda/src/kernels/verlet.cu
@@ -2,13 +2,13 @@
 * Perform the first step of Verlet integration.
 */

-extern "C" __global__ void integrateVerletPart1(const mixed2* __restrict__ dt, const real4* __restrict__ posq,
+extern "C" __global__ void integrateVerletPart1(int numAtoms, int paddedNumAtoms, const mixed2* __restrict__ dt, const real4* __restrict__ posq,
        const real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, const long long* __restrict__ force, mixed4* __restrict__ posDelta) {
    const mixed2 stepSize = dt[0];
    const mixed dtPos = stepSize.y;
    const mixed dtVel = 0.5f*(stepSize.x+stepSize.y);
    const mixed scale = dtVel/(mixed) 0x100000000;
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
        mixed4 velocity = velm[index];
        if (velocity.w != 0.0) {
 #ifdef USE_MIXED_PRECISION
@@ -19,8 +19,8 @@ extern "C" __global__ void integrateVerletPart1(const mixed2* __restrict__ dt, c
            real4 pos = posq[index];
 #endif
            velocity.x += scale*force[index]*velocity.w;
-            velocity.y += scale*force[index+PADDED_NUM_ATOMS]*velocity.w;
-            velocity.z += scale*force[index+PADDED_NUM_ATOMS*2]*velocity.w;
+            velocity.y += scale*force[index+paddedNumAtoms]*velocity.w;
+            velocity.z += scale*force[index+paddedNumAtoms*2]*velocity.w;
            pos.x = velocity.x*dtPos;
            pos.y = velocity.y*dtPos;
            pos.z = velocity.z*dtPos;
@@ -34,7 +34,7 @@ extern "C" __global__ void integrateVerletPart1(const mixed2* __restrict__ dt, c
 * Perform the second step of Verlet integration.
 */

-extern "C" __global__ void integrateVerletPart2(mixed2* __restrict__ dt, real4* __restrict__ posq,
+extern "C" __global__ void integrateVerletPart2(int numAtoms, mixed2* __restrict__ dt, real4* __restrict__ posq,
        real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, const mixed4* __restrict__ posDelta) {
    mixed2 stepSize = dt[0];
 #if __CUDA_ARCH__ >= 130
@@ -46,7 +46,7 @@ extern "C" __global__ void integrateVerletPart2(mixed2* __restrict__ dt, real4*
    int index = blockIdx.x*blockDim.x+threadIdx.x;
    if (index == 0)
        dt[0].x = stepSize.y;
-    for (; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
+    for (; index < numAtoms; index += blockDim.x*gridDim.x) {
        mixed4 velocity = velm[index];
        if (velocity.w != 0.0) {
 #ifdef USE_MIXED_PRECISION
@@ -80,14 +80,14 @@ extern "C" __global__ void integrateVerletPart2(mixed2* __restrict__ dt, real4*
 * Select the step size to use for the next step.
 */

-extern "C" __global__ void selectVerletStepSize(mixed maxStepSize, mixed errorTol, mixed2* __restrict__ dt, const mixed4* __restrict__ velm, const long long* __restrict__ force) {
+extern "C" __global__ void selectVerletStepSize(int numAtoms, int paddedNumAtoms, mixed maxStepSize, mixed errorTol, mixed2* __restrict__ dt, const mixed4* __restrict__ velm, const long long* __restrict__ force) {
    // Calculate the error.

    extern __shared__ mixed error[];
    mixed err = 0.0f;
    const mixed scale = RECIP((mixed) 0x100000000);
-    for (int index = threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
-        mixed3 f = make_mixed3(scale*force[index], scale*force[index+PADDED_NUM_ATOMS], scale*force[index+PADDED_NUM_ATOMS*2]);
+    for (int index = threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
+        mixed3 f = make_mixed3(scale*force[index], scale*force[index+paddedNumAtoms], scale*force[index+paddedNumAtoms*2]);
        mixed invMass = velm[index].w;
        err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass;
    }
@@ -102,7 +102,7 @@ extern "C" __global__ void selectVerletStepSize(mixed maxStepSize, mixed errorTo
        __syncthreads();
    }
    if (threadIdx.x == 0) {
-        mixed totalError = SQRT(error[0]/(NUM_ATOMS*3));
+        mixed totalError = SQRT(error[0]/(numAtoms*3));
        mixed newStepSize = SQRT(errorTol/totalError);
        mixed oldStepSize = dt[0].y;
        if (oldStepSize > 0.0f)

--- a/platforms/cuda/tests/TestCudaCustomNonbondedForce.cpp
+++ b/platforms/cuda/tests/TestCudaCustomNonbondedForce.cpp
@@ -34,6 +34,9 @@
 * This tests all the different force terms in the CUDA implementation of CustomNonbondedForce.
 */

+#ifdef WIN32
+  #define _USE_MATH_DEFINES // Needed to get M_PI
+#endif
 #include "openmm/internal/AssertionUtilities.h"
 #include "sfmt/SFMT.h"
 #include "openmm/Context.h"
@@ -42,6 +45,7 @@
 #include "openmm/NonbondedForce.h"
 #include "openmm/System.h"
 #include "openmm/VerletIntegrator.h"
+#include <cmath>
 #include <iostream>
 #include <vector>

@@ -538,6 +542,179 @@ void testLongRangeCorrection() {
    ASSERT_EQUAL_TOL(standardEnergy1-standardEnergy2, customEnergy1-customEnergy2, 1e-4);
 }

+void testInteractionGroups() { // Checks that only pairs selected by interaction groups, minus exclusions, contribute to the energy.
+    const int numParticles = 6;
+    System system;
+    VerletIntegrator integrator(0.01);
+    CustomNonbondedForce* nonbonded = new CustomNonbondedForce("v1+v2"); // Pair energy is the sum of both particles' v; the expression has no r term.
+    nonbonded->addPerParticleParameter("v");
+    vector<double> params(1, 0.001);
+    for (int i = 0; i < numParticles; i++) {
+        system.addParticle(1.0);
+        nonbonded->addParticle(params);
+        params[0] *= 10; // v = 0.001, 0.01, 0.1, 1, 10, 100, so each particle owns one decimal place of the total energy.
+    }
+    set<int> set1, set2, set3, set4;
+    set1.insert(2);
+    set2.insert(0);
+    set2.insert(1);
+    set2.insert(2);
+    set2.insert(3);
+    set2.insert(4);
+    set2.insert(5);
+    nonbonded->addInteractionGroup(set1, set2); // Particle 2 interacts with every other particle.
+    set3.insert(0);
+    set3.insert(1);
+    set4.insert(4);
+    set4.insert(5);
+    nonbonded->addInteractionGroup(set3, set4); // Particles 0 and 1 interact with 4 and 5.
+    nonbonded->addExclusion(1, 2); // Add an exclusion to make sure it gets skipped.
+    system.addForce(nonbonded);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(numParticles); // Positions are left default-constructed.
+    context.setPositions(positions);
+    State state = context.getState(State::Energy);
+    double expectedEnergy = 331.423; // Each digit is the number of interactions one particle is involved in (last digit = particle 0, first digit = particle 5).
+    ASSERT_EQUAL_TOL(expectedEnergy, state.getPotentialEnergy(), TOL);
+}
+
+void testLargeInteractionGroup() { // Compares forces computed through interaction groups with forces from the same force before any group is added.
+    const int numMolecules = 300;
+    const int numParticles = numMolecules*2; // Two particles per molecule.
+    const double boxSize = 20.0;
+    
+    // Create a large system.
+    
+    System system;
+    system.setDefaultPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
+    for (int i = 0; i < numParticles; i++)
+        system.addParticle(1.0);
+    CustomNonbondedForce* nonbonded = new CustomNonbondedForce("4*eps*((sigma/r)^12-(sigma/r)^6)+138.935456*q/r; q=q1*q2; sigma=0.5*(sigma1+sigma2); eps=sqrt(eps1*eps2)");
+    nonbonded->addPerParticleParameter("q");
+    nonbonded->addPerParticleParameter("sigma");
+    nonbonded->addPerParticleParameter("eps");
+    vector<Vec3> positions(numParticles);
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt); // The generator is initialized with the constant 0.
+    vector<double> params(3); // Order matches the parameters added above: q, sigma, eps.
+    for (int i = 0; i < numMolecules; i++) {
+        if (i < numMolecules/2) { // First half of the molecules: eps = 0.1 for both particles.
+            params[0] = 1.0;
+            params[1] = 0.2;
+            params[2] = 0.1;
+            nonbonded->addParticle(params);
+            params[0] = -1.0; // Second particle: q = -1, sigma = 0.1, eps unchanged.
+            params[1] = 0.1;
+            nonbonded->addParticle(params);
+        }
+        else { // Second half of the molecules: identical except eps = 0.2.
+            params[0] = 1.0;
+            params[1] = 0.2;
+            params[2] = 0.2;
+            nonbonded->addParticle(params);
+            params[0] = -1.0;
+            params[1] = 0.1;
+            nonbonded->addParticle(params);
+        }
+        positions[2*i] = Vec3(boxSize*genrand_real2(sfmt), boxSize*genrand_real2(sfmt), boxSize*genrand_real2(sfmt));
+        positions[2*i+1] = Vec3(positions[2*i][0]+1.0, positions[2*i][1], positions[2*i][2]); // Partner particle is offset by 1.0 along x.
+        nonbonded->addExclusion(2*i, 2*i+1); // The two particles of a molecule do not interact with each other.
+    }
+    nonbonded->setNonbondedMethod(CustomNonbondedForce::CutoffPeriodic);
+    system.addForce(nonbonded);
+    
+    // Compute the reference forces while no interaction groups are defined.
+    
+    VerletIntegrator integrator(0.01);
+    Context context(system, integrator, platform);
+    context.setPositions(positions);
+    State state1 = context.getState(State::Forces);
+    
+    // Modify the force so only one particle interacts with everything else.
+    
+    set<int> set1, set2;
+    set1.insert(151);
+    for (int i = 0; i < numParticles; i++)
+        set2.insert(i);
+    nonbonded->addInteractionGroup(set1, set2);
+    context.reinitialize(); // Presumably required for the context to pick up the modified force; positions are set again afterwards.
+    context.setPositions(positions);
+    State state2 = context.getState(State::Forces);
+    
+    // The force on that one particle should be the same.
+    
+    ASSERT_EQUAL_VEC(state1.getForces()[151], state2.getForces()[151], 1e-4);
+    
+    // Modify the interaction group so it includes all interactions.  This should now reproduce the original forces
+    // on all atoms.
+
+    for (int i = 0; i < numParticles; i++)
+        set1.insert(i);
+    nonbonded->setInteractionGroupParameters(0, set1, set2); // Group 0 now has every particle in both sets.
+    context.reinitialize();
+    context.setPositions(positions);
+    State state3 = context.getState(State::Forces);
+    for (int i = 0; i < numParticles; i++)
+        ASSERT_EQUAL_VEC(state1.getForces()[i], state3.getForces()[i], 1e-4);
+}
+
+void testInteractionGroupLongRangeCorrection() { // Checks the energy change from enabling the long range correction when interaction groups are in use.
+    const int numParticles = 10;
+    const double boxSize = 10.0;
+    const double cutoff = 0.5;
+    System system;
+    system.setDefaultPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
+    CustomNonbondedForce* nonbonded = new CustomNonbondedForce("c1*c2*r^-4");
+    nonbonded->addPerParticleParameter("c");
+    vector<Vec3> positions(numParticles);
+    vector<double> params(1);
+    for (int i = 0; i < numParticles; i++) {
+        system.addParticle(1.0);
+        params[0] = (i%2 == 0 ? 1.1 : 2.0); // Even-indexed particles get c = 1.1, odd-indexed get c = 2.0.
+        nonbonded->addParticle(params);
+        positions[i] = Vec3(0.5*i, 0, 0); // Particles lie on the x axis, 0.5 apart.
+    }
+    nonbonded->setNonbondedMethod(CustomNonbondedForce::CutoffPeriodic);
+    nonbonded->setCutoffDistance(cutoff);
+    system.addForce(nonbonded);
+    
+    // Set up the interaction groups.  With A = even-indexed (c = 1.1) and B = odd-indexed (c = 2.0) particles,
+    // they involve 1 interaction of type AA, 2 of type BB, and 5 of type AB.
+    
+    set<int> set1, set2, set3, set4, set5;
+    set1.insert(0);
+    set1.insert(1);
+    set1.insert(2);
+    nonbonded->addInteractionGroup(set1, set1); // {0, 1, 2} with itself: pair (0,2) is AA, pairs (0,1) and (1,2) are AB.
+    set2.insert(3);
+    set3.insert(4);
+    set3.insert(6);
+    set3.insert(8);
+    nonbonded->addInteractionGroup(set2, set3); // Particle 3 (B) with 4, 6, 8 (all A): three AB pairs.
+    set4.insert(5);
+    set5.insert(7);
+    set5.insert(9);
+    nonbonded->addInteractionGroup(set4, set5); // Particle 5 (B) with 7, 9 (both B): two BB pairs.
+    
+    // Compute energy with and without the correction.
+    
+    VerletIntegrator integrator(0.01);
+    Context context(system, integrator, platform);
+    context.setPositions(positions);
+    double energy1 = context.getState(State::Energy).getPotentialEnergy();
+    nonbonded->setUseLongRangeCorrection(true);
+    context.reinitialize();
+    context.setPositions(positions);
+    double energy2 = context.getState(State::Energy).getPotentialEnergy();
+    
+    // Check the result.
+    
+    double sum = (1.1*1.1 + 2*2.0*2.0 + 5*1.1*2.0)*2.0; // c1*c2 summed over the 1 AA, 2 BB and 5 AB pairs; the trailing 2.0 is presumably 1/cutoff from integrating r^-4 beyond the cutoff -- TODO confirm.
+    int numPairs = (numParticles*(numParticles+1))/2; // n*(n+1)/2 = 55 for the 10 particles.
+    double expected = 2*M_PI*numParticles*numParticles*sum/(numPairs*boxSize*boxSize*boxSize);
+    ASSERT_EQUAL_TOL(expected, energy2-energy1, 1e-4);
+}
+
 int main(int argc, char* argv[]) {
    try {
        if (argc > 1)
@@ -553,6 +730,9 @@ int main(int argc, char* argv[]) {
        testParallelComputation();
        testSwitchingFunction();
        testLongRangeCorrection();
+        testInteractionGroups();
+        testLargeInteractionGroup();
+        testInteractionGroupLongRangeCorrection();
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;

--- a/platforms/cuda/tests/TestCudaMonteCarloAnisotropicBarostat.cpp
+++ b/platforms/cuda/tests/TestCudaMonteCarloAnisotropicBarostat.cpp
@@ -53,44 +53,6 @@ using namespace std;

 CudaPlatform platform;

-void testChangingBoxSize() {
-    System system;
-    system.setDefaultPeriodicBoxVectors(Vec3(4, 0, 0), Vec3(0, 5, 0), Vec3(0, 0, 6));
-    system.addParticle(1.0);
-    NonbondedForce* nb = new NonbondedForce();
-    nb->setNonbondedMethod(NonbondedForce::CutoffPeriodic);
-    nb->setCutoffDistance(2.0);
-    nb->addParticle(1, 0.5, 0.5);
-    system.addForce(nb);
-    LangevinIntegrator integrator(300.0, 1.0, 0.01);
-    Context context(system, integrator, platform);
-    vector<Vec3> positions;
-    positions.push_back(Vec3());
-    context.setPositions(positions);
-    Vec3 x, y, z;
-    context.getState(State::Forces).getPeriodicBoxVectors(x, y, z);
-    ASSERT_EQUAL_VEC(Vec3(4, 0, 0), x, 0);
-    ASSERT_EQUAL_VEC(Vec3(0, 5, 0), y, 0);
-    ASSERT_EQUAL_VEC(Vec3(0, 0, 6), z, 0);
-    context.setPeriodicBoxVectors(Vec3(7, 0, 0), Vec3(0, 8, 0), Vec3(0, 0, 9));
-    context.getState(State::Forces).getPeriodicBoxVectors(x, y, z);
-    ASSERT_EQUAL_VEC(Vec3(7, 0, 0), x, 0);
-    ASSERT_EQUAL_VEC(Vec3(0, 8, 0), y, 0);
-    ASSERT_EQUAL_VEC(Vec3(0, 0, 9), z, 0);
-    
-    // Shrinking the box too small should produce an exception.
-    
-    context.setPeriodicBoxVectors(Vec3(7, 0, 0), Vec3(0, 3.9, 0), Vec3(0, 0, 9));
-    bool ok = true;
-    try {
-        context.getState(State::Forces).getPeriodicBoxVectors(x, y, z);
-        ok = false;
-    }
-    catch (exception& ex) {
-    }
-    ASSERT(ok);
-}
-
 void testIdealGas() {
    const int numParticles = 64;
    const int frequency = 10;
@@ -112,7 +74,7 @@ void testIdealGas() {
        system.addParticle(1.0);
        positions[i] = Vec3(initialLength*genrand_real2(sfmt), 0.5*initialLength*genrand_real2(sfmt), 2*initialLength*genrand_real2(sfmt));
    }
-    MonteCarloAnisotropicBarostat* barostat = new MonteCarloAnisotropicBarostat(Vec3(pressure, pressure, pressure), temp[0], frequency);
+    MonteCarloAnisotropicBarostat* barostat = new MonteCarloAnisotropicBarostat(Vec3(pressure, pressure, pressure), temp[0], true, true, true, frequency);
    system.addForce(barostat);
    
    // Test it for three different temperatures.
@@ -170,7 +132,7 @@ void testIdealGasAxis(int axis) {
        system.addParticle(1.0);
        positions[i] = Vec3(initialLength*genrand_real2(sfmt), 0.5*initialLength*genrand_real2(sfmt), 2*initialLength*genrand_real2(sfmt));
    }
-    MonteCarloAnisotropicBarostat* barostat = new MonteCarloAnisotropicBarostat(Vec3(pressure, pressure, pressure), temp[0], frequency, scaleX, scaleY, scaleZ);
+    MonteCarloAnisotropicBarostat* barostat = new MonteCarloAnisotropicBarostat(Vec3(pressure, pressure, pressure), temp[0], scaleX, scaleY, scaleZ, frequency);
    system.addForce(barostat);
    
    // Test it for three different temperatures.
@@ -226,7 +188,7 @@ void testRandomSeed() {
        forceField->addParticle((i%2 == 0 ? 1.0 : -1.0), 1.0, 5.0);
    }
    system.addForce(forceField);
-    MonteCarloAnisotropicBarostat* barostat = new MonteCarloAnisotropicBarostat(Vec3(pressure, pressure, pressure), temp, 1);
+    MonteCarloAnisotropicBarostat* barostat = new MonteCarloAnisotropicBarostat(Vec3(pressure, pressure, pressure), temp, true, true, true, 1);
    system.addForce(barostat);
    vector<Vec3> positions(numParticles);
    vector<Vec3> velocities(numParticles);
@@ -332,7 +294,7 @@ void testEinsteinCrystal() {
            system.addForce(force);
            system.addForce(nb);
            // Create the barostat.
-            MonteCarloAnisotropicBarostat* barostat = new MonteCarloAnisotropicBarostat(Vec3(pres3[p], pres3[p], pres3[p]), temp, frequency, (a==0||a==3), (a==1||a==3), (a==2||a==3));
+            MonteCarloAnisotropicBarostat* barostat = new MonteCarloAnisotropicBarostat(Vec3(pres3[p], pres3[p], pres3[p]), temp, (a==0||a==3), (a==1||a==3), (a==2||a==3), frequency);
            system.addForce(barostat);
            barostat->setTemperature(temp);
            LangevinIntegrator integrator(temp, 0.1, 0.01);
@@ -422,7 +384,6 @@ int main(int argc, char* argv[]) {
    try {
        if (argc > 1)
            platform.setPropertyDefaultValue("CudaPrecision", string(argv[1]));
-        testChangingBoxSize();
        testIdealGas();
        testIdealGasAxis(0);
        testIdealGasAxis(1);

--- a/platforms/opencl/include/OpenCLContext.h
+++ b/platforms/opencl/include/OpenCLContext.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2011 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2013 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -191,6 +191,12 @@ public:
    int getDeviceIndex() {
        return deviceIndex;
    }
+    /**
+     * Get the index of the cl::Platform this context was created on (the platform selected in the OpenCLContext constructor).
+     */
+    int getPlatformIndex() {
+        return platformIndex;
+    }
    /**
     * Get the PlatformData object this context is part of.
     */
@@ -589,7 +595,6 @@ private:
    struct MoleculeGroup;
    class VirtualSiteInfo;
    void findMoleculeGroups();
-    static void tagAtomsInMolecule(int atom, int molecule, std::vector<int>& atomMolecule, std::vector<std::vector<int> >& atomBonds);
    /**
     * Ensure that all molecules marked as "identical" really are identical.  This should be
     * called whenever force field parameters change.  If necessary, it will rebuild the list
@@ -605,6 +610,7 @@ private:
    double time;
    OpenCLPlatform::PlatformData& platformData;
    int deviceIndex;
+    int platformIndex;
    int contextIndex;
    int stepCount;
    int computeForceCount;

--- a/platforms/opencl/include/OpenCLKernels.h
+++ b/platforms/opencl/include/OpenCLKernels.h
@@ -639,7 +639,7 @@ private:
 class OpenCLCalcCustomNonbondedForceKernel : public CalcCustomNonbondedForceKernel {
 public:
    OpenCLCalcCustomNonbondedForceKernel(std::string name, const Platform& platform, OpenCLContext& cl, const System& system) : CalcCustomNonbondedForceKernel(name, platform),
-            cl(cl), params(NULL), globals(NULL), tabulatedFunctionParams(NULL), forceCopy(NULL), system(system) {
+            cl(cl), params(NULL), globals(NULL), tabulatedFunctionParams(NULL), interactionGroupData(NULL), forceCopy(NULL), system(system), hasInitializedKernel(false) {
    }
    ~OpenCLCalcCustomNonbondedForceKernel();
    /**
@@ -666,15 +666,20 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CustomNonbondedForce& force);
 private:
+    void initInteractionGroups(const CustomNonbondedForce& force, const std::string& interactionSource);
    OpenCLContext& cl;
    OpenCLParameterSet* params;
    OpenCLArray* globals;
    OpenCLArray* tabulatedFunctionParams;
+    OpenCLArray* interactionGroupData;
+    cl::Kernel interactionGroupKernel;
+    std::vector<void*> interactionGroupArgs;
    std::vector<std::string> globalParamNames;
    std::vector<cl_float> globalParamValues;
    std::vector<OpenCLArray*> tabulatedFunctions;
    double longRangeCoefficient;
-    bool hasInitializedLongRangeCorrection;
+    bool hasInitializedLongRangeCorrection, hasInitializedKernel;
+    int numGroupThreadBlocks;
    CustomNonbondedForce* forceCopy;
    const System& system;
 };

--- a/platforms/opencl/include/OpenCLParameterSet.h
+++ b/platforms/opencl/include/OpenCLParameterSet.h
@@ -81,6 +81,13 @@ public:
     */
    template <class T>
    void setParameterValues(const std::vector<std::vector<T> >& values);
+    /**
+     * Get a modifiable reference to the set of OpenCLNonbondedUtilities::ParameterInfo objects which describe
+     * the Buffers containing the data.
+     */
+    std::vector<OpenCLNonbondedUtilities::ParameterInfo>& getBuffers() {
+        return buffers;
+    }
    /**
     * Get a set of OpenCLNonbondedUtilities::ParameterInfo objects which describe the Buffers
     * containing the data.

--- a/platforms/opencl/sharedTarget/CMakeLists.txt
+++ b/platforms/opencl/sharedTarget/CMakeLists.txt
@@ -19,6 +19,6 @@ ELSE (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
    SET(MAIN_OPENMM_LIB ${OPENMM_LIBRARY_NAME})
 ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
 TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${MAIN_OPENMM_LIB}  ${OPENCL_LIBRARIES} ${PTHREADS_LIB})
-SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "-DOPENMM_OPENCL_BUILDING_SHARED_LIBRARY")
+SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "-msse2 -DOPENMM_OPENCL_BUILDING_SHARED_LIBRARY")

 INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET})
--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2013 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -39,6 +39,7 @@
 #include "openmm/Platform.h"
 #include "openmm/System.h"
 #include "openmm/VirtualSite.h"
+#include "openmm/internal/ContextImpl.h"
 #include <algorithm>
 #include <fstream>
 #include <iostream>
@@ -87,17 +88,25 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
        contextIndex = platformData.contexts.size();
        std::vector<cl::Platform> platforms;
        cl::Platform::get(&platforms);
-        if (platformIndex < 0 || platformIndex >= (int) platforms.size())
-            throw OpenMMException("Illegal value for OpenCL platform index");
-        string platformVendor = platforms[platformIndex].getInfo<CL_PLATFORM_VENDOR>();
-        vector<cl::Device> devices;
-        platforms[platformIndex].getDevices(CL_DEVICE_TYPE_ALL, &devices);
        const int minThreadBlockSize = 32;
-        if (deviceIndex < 0 || deviceIndex >= (int) devices.size()) {
-            // Try to figure out which device is the fastest.

-            int bestSpeed = -1;
+        int bestSpeed = -1;
+        int bestDevice = -1;
+        int bestPlatform = -1;
+        for (int j = 0; j < platforms.size(); j++) {
+            // if they supplied a valid platformIndex, we only look through that platform
+            if (j != platformIndex && platformIndex >= 0 && platformIndex < (int) platforms.size())
+                continue;
+
+            string platformVendor = platforms[j].getInfo<CL_PLATFORM_VENDOR>();
+            vector<cl::Device> devices;
+            platforms[j].getDevices(CL_DEVICE_TYPE_ALL, &devices);
+
            for (int i = 0; i < (int) devices.size(); i++) {
+                // if they supplied a valid deviceIndex, we only look through that one
+                if (i != deviceIndex && deviceIndex >= 0 && deviceIndex < (int) devices.size())
+                    continue;
+
                if (platformVendor == "Apple" && devices[i].getInfo<CL_DEVICE_VENDOR>() == "AMD")
                    continue; // Don't use AMD GPUs on OS X due to serious bugs.
                int maxSize = devices[i].getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>()[0];
@@ -136,15 +145,26 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
                }
                int speed = devices[i].getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>()*processingElementsPerComputeUnit*devices[i].getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>();
                if (maxSize >= minThreadBlockSize && speed > bestSpeed) {
-                    deviceIndex = i;
+                    bestDevice = i;
                    bestSpeed = speed;
+                    bestPlatform = j;
                }
            }
        }
-        if (deviceIndex == -1)
+
+        if (bestPlatform == -1)
+            throw OpenMMException("No compatible OpenCL platform is available");
+
+        if (bestDevice == -1)
            throw OpenMMException("No compatible OpenCL device is available");
-        device = devices[deviceIndex];
-        this->deviceIndex = deviceIndex;
+
+        vector<cl::Device> devices;
+        platforms[bestPlatform].getDevices(CL_DEVICE_TYPE_ALL, &devices);
+        string platformVendor = platforms[bestPlatform].getInfo<CL_PLATFORM_VENDOR>();
+        device = devices[bestDevice];
+
+        this->deviceIndex = bestDevice;
+        this->platformIndex = bestPlatform;
        if (device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>() < minThreadBlockSize)
            throw OpenMMException("The specified OpenCL device is not compatible with OpenMM");
        compilationDefines["WORK_GROUP_SIZE"] = intToString(ThreadBlockSize);
@@ -226,7 +246,7 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
            compilationDefines["SYNC_WARPS"] = "barrier(CLK_LOCAL_MEM_FENCE)";
        vector<cl::Device> contextDevices;
        contextDevices.push_back(device);
-        cl_context_properties cprops[] = {CL_CONTEXT_PLATFORM, (cl_context_properties) platforms[platformIndex](), 0};
+        cl_context_properties cprops[] = {CL_CONTEXT_PLATFORM, (cl_context_properties) platforms[bestPlatform](), 0};
        context = cl::Context(contextDevices, cprops, errorCallback);
        queue = cl::CommandQueue(context, device);
        numAtoms = system.getNumParticles();
@@ -618,15 +638,6 @@ void OpenCLContext::reduceBuffer(OpenCLArray& array, int numBuffers) {
    executeKernel(reduceReal4Kernel, bufferSize, 128);
 }

-void OpenCLContext::tagAtomsInMolecule(int atom, int molecule, vector<int>& atomMolecule, vector<vector<int> >& atomBonds) {
-    // Recursively tag atoms as belonging to a particular molecule.
-
-    atomMolecule[atom] = molecule;
-    for (int i = 0; i < (int) atomBonds[atom].size(); i++)
-        if (atomMolecule[atomBonds[atom][i]] == -1)
-            tagAtomsInMolecule(atomBonds[atom][i], molecule, atomMolecule, atomBonds);
-}
-
 /**
 * This class ensures that atom reordering doesn't break virtual sites.
 */
@@ -722,16 +733,14 @@ void OpenCLContext::findMoleculeGroups() {
            }
        }

-        // Now tag atoms by which molecule they belong to.
+        // Now identify atoms by which molecule they belong to.

-        vector<int> atomMolecule(numAtoms, -1);
-        int numMolecules = 0;
-        for (int i = 0; i < numAtoms; i++)
-            if (atomMolecule[i] == -1)
-                tagAtomsInMolecule(i, numMolecules++, atomMolecule, atomBonds);
-        vector<vector<int> > atomIndices(numMolecules);
-        for (int i = 0; i < numAtoms; i++)
-            atomIndices[atomMolecule[i]].push_back(i);
+        vector<vector<int> > atomIndices = ContextImpl::findMolecules(numAtoms, atomBonds);
+        int numMolecules = atomIndices.size();
+        vector<int> atomMolecule(numAtoms);
+        for (int i = 0; i < (int) atomIndices.size(); i++)
+            for (int j = 0; j < (int) atomIndices[i].size(); j++)
+                atomMolecule[atomIndices[i][j]] = i;

        // Construct a description of each molecule.


--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -46,6 +46,7 @@
 #include "lepton/ParsedExpression.h"
 #include "SimTKOpenMMRealType.h"
 #include "SimTKOpenMMUtilities.h"
+#include <algorithm>
 #include <cmath>
 #include <set>

@@ -1875,6 +1876,17 @@ void OpenCLCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& contex
 class OpenCLCustomNonbondedForceInfo : public OpenCLForceInfo {
 public:
    OpenCLCustomNonbondedForceInfo(int requiredBuffers, const CustomNonbondedForce& force) : OpenCLForceInfo(requiredBuffers), force(force) {
+        if (force.getNumInteractionGroups() > 0) {
+            groupsForParticle.resize(force.getNumParticles());
+            for (int i = 0; i < force.getNumInteractionGroups(); i++) {
+                set<int> set1, set2;
+                force.getInteractionGroupParameters(i, set1, set2);
+                for (set<int>::const_iterator iter = set1.begin(); iter != set1.end(); ++iter)
+                    groupsForParticle[*iter].insert(2*i);
+                for (set<int>::const_iterator iter = set2.begin(); iter != set2.end(); ++iter)
+                    groupsForParticle[*iter].insert(2*i+1);
+            }
+        }
    }
    bool areParticlesIdentical(int particle1, int particle2) {
        vector<double> params1;
@@ -1884,6 +1896,8 @@ public:
        for (int i = 0; i < (int) params1.size(); i++)
            if (params1[i] != params2[i])
                return false;
+        if (groupsForParticle.size() > 0 && groupsForParticle[particle1] != groupsForParticle[particle2])
+            return false;
        return true;
    }
    int getNumParticleGroups() {
@@ -1901,6 +1915,7 @@ public:
    }
 private:
    const CustomNonbondedForce& force;
+    vector<set<int> > groupsForParticle;
 };

 OpenCLCalcCustomNonbondedForceKernel::~OpenCLCalcCustomNonbondedForceKernel() {
@@ -1910,6 +1925,8 @@ OpenCLCalcCustomNonbondedForceKernel::~OpenCLCalcCustomNonbondedForceKernel() {
        delete globals;
    if (tabulatedFunctionParams != NULL)
        delete tabulatedFunctionParams;
+    if (interactionGroupData != NULL)
+        delete interactionGroupData;
    for (int i = 0; i < (int) tabulatedFunctions.size(); i++)
        delete tabulatedFunctions[i];
    if (forceCopy != NULL)
@@ -1920,7 +1937,7 @@ void OpenCLCalcCustomNonbondedForceKernel::initialize(const System& system, cons
    int forceIndex;
    for (forceIndex = 0; forceIndex < system.getNumForces() && &system.getForce(forceIndex) != &force; ++forceIndex)
        ;
-    string prefix = "custom"+cl.intToString(forceIndex)+"_";
+    string prefix = (force.getNumInteractionGroups() == 0 ? "custom"+cl.intToString(forceIndex)+"_" : "");

    // Record parameters and exclusions.

@@ -2021,14 +2038,18 @@ void OpenCLCalcCustomNonbondedForceKernel::initialize(const System& system, cons
        replacements["SWITCH_C5"] = cl.doubleToString(6/pow(force.getSwitchingDistance()-force.getCutoffDistance(), 5.0));
    }
    string source = cl.replaceStrings(OpenCLKernelSources::customNonbonded, replacements);
-    cl.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup());
-    for (int i = 0; i < (int) params->getBuffers().size(); i++) {
-        const OpenCLNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
-        cl.getNonbondedUtilities().addParameter(OpenCLNonbondedUtilities::ParameterInfo(prefix+"params"+cl.intToString(i+1), buffer.getComponentType(), buffer.getNumComponents(), buffer.getSize(), buffer.getMemory()));
-    }
-    if (globals != NULL) {
-        globals->upload(globalParamValues);
-        cl.getNonbondedUtilities().addArgument(OpenCLNonbondedUtilities::ParameterInfo(prefix+"globals", "float", 1, sizeof(cl_float), globals->getDeviceBuffer()));
+    if (force.getNumInteractionGroups() > 0)
+        initInteractionGroups(force, source);
+    else {
+        cl.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup());
+        for (int i = 0; i < (int) params->getBuffers().size(); i++) {
+            const OpenCLNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
+            cl.getNonbondedUtilities().addParameter(OpenCLNonbondedUtilities::ParameterInfo(prefix+"params"+cl.intToString(i+1), buffer.getComponentType(), buffer.getNumComponents(), buffer.getSize(), buffer.getMemory()));
+        }
+        if (globals != NULL) {
+            globals->upload(globalParamValues);
+            cl.getNonbondedUtilities().addArgument(OpenCLNonbondedUtilities::ParameterInfo(prefix+"globals", "float", 1, sizeof(cl_float), globals->getDeviceBuffer()));
+        }
    }
    cl.addForce(new OpenCLCustomNonbondedForceInfo(cl.getNonbondedUtilities().getNumForceBuffers(), force));
    
@@ -2044,6 +2065,250 @@ void OpenCLCalcCustomNonbondedForceKernel::initialize(const System& system, cons
    }
 }

+/**
+ * Build the device data and compile the kernel used to evaluate a CustomNonbondedForce
+ * whose interactions are restricted to interaction groups.
+ *
+ * The work is done in these stages:
+ *   1. Each interaction group's two atom sets are sorted and cut into blocks of at most
+ *      32 atoms; every (block from set 1, block from set 2) pair becomes a "tile".
+ *   2. A per-tile bit mask records which atom pairs are actually computed.  Pairs are
+ *      masked out if they are the same atom, are listed as an exclusion on the force,
+ *      or are the second occurrence of a pair that appears twice within one group.
+ *   3. Tiles in which every pair is masked out are dropped.  The remaining tiles are
+ *      sorted and greedily packed into "tile sets" whose first-axis sizes sum to at most 32.
+ *   4. Each tile set is written out as exactly 32 mm_int4 entries (x = atom from the first
+ *      list, y = atom from the second list or 0, z = packed index range, w = shifted
+ *      exclusion flags) and uploaded to interactionGroupData.
+ *   5. The customNonbondedGroups kernel source is specialized for the per-particle
+ *      parameter buffers and compiled into interactionGroupKernel.
+ *
+ * @param force              the force whose interaction groups and exclusions are processed
+ * @param interactionSource  the source that is substituted for COMPUTE_INTERACTION in the kernel
+ */
+void OpenCLCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNonbondedForce& force, const string& interactionSource) {
+    // Process groups to form tiles.
+
+    vector<vector<int> > atomLists;
+    vector<pair<int, int> > tiles;
+    map<pair<int, int>, int> duplicateInteractions;
+    for (int group = 0; group < force.getNumInteractionGroups(); group++) {
+        // Get the list of atoms in this group and sort them.
+
+        set<int> set1, set2;
+        force.getInteractionGroupParameters(group, set1, set2);
+        vector<int> atoms1, atoms2;
+        atoms1.insert(atoms1.begin(), set1.begin(), set1.end());
+        atoms2.insert(atoms2.begin(), set2.begin(), set2.end());
+        sort(atoms1.begin(), atoms1.end());
+        sort(atoms2.begin(), atoms2.end());
+
+        // Find how many tiles we will create for this group.
+
+        int tileWidth = min(min(32, (int) atoms1.size()), (int) atoms2.size());
+        if (tileWidth == 0)
+            continue; // One of the sets is empty, so the group defines no interactions.  Skipping it also avoids dividing by zero below.
+        int numBlocks1 = (atoms1.size()+tileWidth-1)/tileWidth;
+        int numBlocks2 = (atoms2.size()+tileWidth-1)/tileWidth;
+
+        // Add the tiles.  The indices refer to the entries that are appended to atomLists
+        // just below: first the numBlocks1 blocks of set 1, then the numBlocks2 blocks of set 2.
+
+        for (int i = 0; i < numBlocks1; i++)
+            for (int j = 0; j < numBlocks2; j++)
+                tiles.push_back(make_pair(atomLists.size()+i, atomLists.size()+numBlocks1+j));
+
+        // Add the atom lists.
+
+        for (int i = 0; i < numBlocks1; i++) {
+            vector<int> atoms;
+            int first = i*tileWidth;
+            int last = min((i+1)*tileWidth, (int) atoms1.size());
+            for (int j = first; j < last; j++)
+                atoms.push_back(atoms1[j]);
+            atomLists.push_back(atoms);
+        }
+        for (int i = 0; i < numBlocks2; i++) {
+            vector<int> atoms;
+            int first = i*tileWidth;
+            int last = min((i+1)*tileWidth, (int) atoms2.size());
+            for (int j = first; j < last; j++)
+                atoms.push_back(atoms2[j]);
+            atomLists.push_back(atoms);
+        }
+
+        // If this group contains duplicate interactions, record that we need to skip them once.
+        // A pair is a duplicate when both of its atoms appear in both sets.  It is keyed as
+        // (smaller index, larger index) and counted once per group in which that happens.
+
+        for (int i = 0; i < (int) atoms1.size(); i++) {
+            int a1 = atoms1[i];
+            if (set2.find(a1) == set2.end())
+                continue;
+            for (int j = 0; j < (int) atoms2.size() && atoms2[j] < a1; j++) {
+                int a2 = atoms2[j];
+                if (set1.find(a2) != set1.end()) {
+                    pair<int, int> key = make_pair(a2, a1);
+                    if (duplicateInteractions.find(key) == duplicateInteractions.end())
+                        duplicateInteractions[key] = 0;
+                    duplicateInteractions[key]++;
+                }
+            }
+        }
+    }
+
+    // Build a lookup table for quickly identifying excluded interactions.
+    // Pairs are stored as (smaller index, larger index).
+
+    set<pair<int, int> > exclusions;
+    for (int i = 0; i < force.getNumExclusions(); i++) {
+        int p1, p2;
+        force.getExclusionParticles(i, p1, p2);
+        exclusions.insert(make_pair(min(p1, p2), max(p1, p2)));
+    }
+
+    // Build the exclusion flags for each tile.  While we're at it, filter out tiles
+    // where all interactions are excluded, and sort the tiles by size.
+
+    vector<vector<int> > exclusionFlags(tiles.size());
+    vector<pair<int, int> > tileOrder;
+    for (int tile = 0; tile < tiles.size(); tile++) {
+        if (atomLists[tiles[tile].first].size() < atomLists[tiles[tile].second].size()) {
+            // For efficiency, we want the first axis to be the larger one.
+
+            int swap = tiles[tile].first;
+            tiles[tile].first = tiles[tile].second;
+            tiles[tile].second = swap;
+        }
+        vector<int>& atoms1 = atomLists[tiles[tile].first];
+        vector<int>& atoms2 = atomLists[tiles[tile].second];
+
+        // Start with one bit set for every atom on the second axis.  The shift is done in
+        // 64 bits so that a list of 32 atoms yields all 32 bits set.
+
+        vector<int> flags(atoms1.size(), (int) (1LL<<atoms2.size())-1);
+        int numExcluded = 0;
+        for (int i = 0; i < (int) atoms1.size(); i++)
+            for (int j = 0; j < (int) atoms2.size(); j++) {
+                int a1 = atoms1[i];
+                int a2 = atoms2[j];
+                bool isExcluded = false;
+                pair<int, int> key = make_pair(min(a1, a2), max(a1, a2));
+                if (a1 == a2 || exclusions.find(key) != exclusions.end())
+                    isExcluded = true; // This is an excluded interaction.
+                else if (duplicateInteractions.find(key) != duplicateInteractions.end() && duplicateInteractions[key] > 0) {
+                    // Both atoms are in both sets, so skip duplicate interactions.
+
+                    isExcluded = true;
+                    duplicateInteractions[key]--;
+                }
+                if (isExcluded) {
+                    flags[i] &= -1-(1<<j); // Clear bit j.
+                    numExcluded++;
+                }
+            }
+        if (numExcluded == atoms1.size()*atoms2.size())
+            continue; // All interactions are excluded.
+
+        // The negated size makes the ascending sort below put tiles with the longest
+        // second axis first.
+
+        tileOrder.push_back(make_pair((int) -atoms2.size(), tile));
+        exclusionFlags[tile] = flags;
+    }
+    sort(tileOrder.begin(), tileOrder.end());
+
+    // Merge tiles to get as close as possible to 32 along the first axis of each one.
+    // tileSetStart[k] is the index into tileOrder of the first tile in tile set k; a
+    // final entry marks the end of the last set.
+
+    vector<int> tileSetStart;
+    tileSetStart.push_back(0);
+    int tileSetSize = 0;
+    for (int i = 0; i < tileOrder.size(); i++) {
+        int tile = tileOrder[i].second;
+        int size = atomLists[tiles[tile].first].size();
+        if (tileSetSize+size > 32) {
+            tileSetStart.push_back(i);
+            tileSetSize = 0;
+        }
+        tileSetSize += size;
+    }
+    tileSetStart.push_back(tileOrder.size());
+
+    // Build the data structures.  Every tile set contributes exactly 32 entries.
+
+    int numTileSets = tileSetStart.size()-1;
+    vector<mm_int4> groupData;
+    for (int tileSet = 0; tileSet < numTileSets; tileSet++) {
+        int indexInTileSet = 0;
+        int minSize = 0;
+        if (cl.getSIMDWidth() < 32) {
+            // We need to include a barrier inside the inner loop, so ensure that all
+            // threads will loop the same number of times.
+
+            for (int i = tileSetStart[tileSet]; i < tileSetStart[tileSet+1]; i++)
+                minSize = max(minSize, (int) atomLists[tiles[tileOrder[i].second].first].size());
+        }
+        for (int i = tileSetStart[tileSet]; i < tileSetStart[tileSet+1]; i++) {
+            int tile = tileOrder[i].second;
+            vector<int>& atoms1 = atomLists[tiles[tile].first];
+            vector<int>& atoms2 = atomLists[tiles[tile].second];
+
+            // Pack the tile's index range within the tile set: the start index goes in the
+            // low 16 bits and the end index in the high 16 bits.
+
+            int range = indexInTileSet + ((indexInTileSet+max(minSize, (int) atoms1.size()))<<16);
+
+            // Fallback mask with one bit per atom on the second axis.  Shift in 64 bits, as is
+            // done for the per-tile flags above, because the list may hold 32 atoms.
+
+            int allFlags = (int) (1LL<<atoms2.size())-1;
+            for (int j = 0; j < (int) atoms1.size(); j++) {
+                int a1 = atoms1[j];
+                int a2 = (j < atoms2.size() ? atoms2[j] : 0);
+                int flags = (exclusionFlags[tile].size() > 0 ? exclusionFlags[tile][j] : allFlags);
+                groupData.push_back(mm_int4(a1, a2, range, flags<<indexInTileSet));
+            }
+            indexInTileSet += atoms1.size();
+        }
+
+        // Pad the tile set out to 32 entries.  The padding entries have no flags set.
+
+        for (; indexInTileSet < 32; indexInTileSet++)
+            groupData.push_back(mm_int4(0, 0, minSize<<16, 0));
+    }
+    // NOTE(review): if no tile survives the filtering above, groupData is empty here;
+    // confirm that OpenCLArray::create accepts a size of zero.
+    interactionGroupData = OpenCLArray::create<mm_int4>(cl, groupData.size(), "interactionGroupData");
+    interactionGroupData->upload(groupData);
+
+    // Create the kernel.
+
+    map<string, string> replacements;
+    replacements["COMPUTE_INTERACTION"] = interactionSource;
+    const string suffixes[] = {"x", "y", "z", "w"};
+
+    // Declare one field per parameter component for the second atom's data.  Multi-component
+    // parameters are stored as separate scalar fields.
+
+    stringstream localData;
+    int localDataSize = 0;
+    vector<OpenCLNonbondedUtilities::ParameterInfo>& buffers = params->getBuffers();
+    for (int i = 0; i < (int) buffers.size(); i++) {
+        if (buffers[i].getNumComponents() == 1)
+            localData<<buffers[i].getComponentType()<<" params"<<(i+1)<<";\n";
+        else {
+            for (int j = 0; j < buffers[i].getNumComponents(); ++j)
+                localData<<buffers[i].getComponentType()<<" params"<<(i+1)<<"_"<<suffixes[j]<<";\n";
+        }
+        localDataSize += buffers[i].getSize();
+    }
+    replacements["ATOM_PARAMETER_DATA"] = localData.str();
+
+    // Extra kernel arguments: one buffer per parameter set, plus the global parameters if any.
+
+    stringstream args;
+    for (int i = 0; i < (int) buffers.size(); i++)
+        args<<", __global const "<<buffers[i].getType()<<"* restrict global_params"<<(i+1);
+    if (globals != NULL)
+        args<<", __global const float* restrict globals";
+    replacements["PARAMETER_ARGUMENTS"] = args.str();
+
+    // Code to load the first atom's parameters directly from the global buffers.
+
+    stringstream load1;
+    for (int i = 0; i < (int) buffers.size(); i++)
+        load1<<buffers[i].getType()<<" params"<<(i+1)<<"1 = global_params"<<(i+1)<<"[atom1];\n";
+    replacements["LOAD_ATOM1_PARAMETERS"] = load1.str();
+
+    // Code to copy the second atom's parameters into this thread's localData entry.
+
+    stringstream loadLocal2;
+    for (int i = 0; i < (int) buffers.size(); i++) {
+        if (buffers[i].getNumComponents() == 1)
+            loadLocal2<<"localData[get_local_id(0)].params"<<(i+1)<<" = global_params"<<(i+1)<<"[atom2];\n";
+        else {
+            loadLocal2<<buffers[i].getType()<<" temp_params"<<(i+1)<<" = global_params"<<(i+1)<<"[atom2];\n";
+            for (int j = 0; j < buffers[i].getNumComponents(); ++j)
+                loadLocal2<<"localData[get_local_id(0)].params"<<(i+1)<<"_"<<suffixes[j]<<" = temp_params"<<(i+1)<<"."<<suffixes[j]<<";\n";
+        }
+    }
+    replacements["LOAD_LOCAL_PARAMETERS"] = loadLocal2.str();
+
+    // Code to read the second atom's parameters back out of localData, reassembling
+    // multi-component values from their scalar fields.
+
+    stringstream load2;
+    for (int i = 0; i < (int) buffers.size(); i++) {
+        if (buffers[i].getNumComponents() == 1)
+            load2<<buffers[i].getType()<<" params"<<(i+1)<<"2 = localData[localIndex].params"<<(i+1)<<";\n";
+        else {
+            load2<<buffers[i].getType()<<" params"<<(i+1)<<"2 = ("<<buffers[i].getType()<<") (";
+            for (int j = 0; j < buffers[i].getNumComponents(); ++j) {
+                if (j > 0)
+                    load2<<", ";
+                load2<<"localData[localIndex].params"<<(i+1)<<"_"<<suffixes[j];
+            }
+            load2<<");\n";
+        }
+    }
+    replacements["LOAD_ATOM2_PARAMETERS"] = load2.str();
+    map<string, string> defines;
+    if (force.getNonbondedMethod() != CustomNonbondedForce::NoCutoff)
+        defines["USE_CUTOFF"] = "1";
+    if (force.getNonbondedMethod() == CustomNonbondedForce::CutoffPeriodic)
+        defines["USE_PERIODIC"] = "1";
+    defines["THREAD_BLOCK_SIZE"] = cl.intToString(cl.getNonbondedUtilities().getForceThreadBlockSize());
+    double cutoff = force.getCutoffDistance();
+    defines["CUTOFF_SQUARED"] = cl.doubleToString(cutoff*cutoff);
+    defines["PADDED_NUM_ATOMS"] = cl.intToString(cl.getPaddedNumAtoms());
+    defines["TILE_SIZE"] = "32";
+
+    // Divide the tile sets evenly among the contexts; this context handles the
+    // range [FIRST_TILE, LAST_TILE).
+
+    int numContexts = cl.getPlatformData().contexts.size();
+    int startIndex = cl.getContextIndex()*numTileSets/numContexts;
+    int endIndex = (cl.getContextIndex()+1)*numTileSets/numContexts;
+    defines["FIRST_TILE"] = cl.intToString(startIndex);
+    defines["LAST_TILE"] = cl.intToString(endIndex);
+    if ((localDataSize/4)%2 == 0 && !cl.getUseDoublePrecision())
+        defines["PARAMETER_SIZE_IS_EVEN"] = "1";
+    cl::Program program = cl.createProgram(cl.replaceStrings(OpenCLKernelSources::customNonbondedGroups, replacements), defines);
+    interactionGroupKernel = cl::Kernel(program, "computeInteractionGroups");
+    numGroupThreadBlocks = cl.getNonbondedUtilities().getNumForceThreadBlocks();
+}
+
 double OpenCLCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
    if (globals != NULL) {
        bool changed = false;
@@ -2065,6 +2330,25 @@ double OpenCLCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool
        longRangeCoefficient = CustomNonbondedForceImpl::calcLongRangeCorrection(*forceCopy, context.getOwner());
        hasInitializedLongRangeCorrection = true;
    }
+    if (interactionGroupData != NULL) {
+        if (!hasInitializedKernel) {
+            hasInitializedKernel = true;
+            int index = 0;
+            bool useLong = cl.getSupports64BitGlobalAtomics();
+            interactionGroupKernel.setArg<cl::Buffer>(index++, (useLong ? cl.getLongForceBuffer() : cl.getForceBuffers()).getDeviceBuffer());
+            interactionGroupKernel.setArg<cl::Buffer>(index++, cl.getEnergyBuffer().getDeviceBuffer());
+            interactionGroupKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
+            interactionGroupKernel.setArg<cl::Buffer>(index++, interactionGroupData->getDeviceBuffer());
+            setPeriodicBoxSizeArg(cl, interactionGroupKernel, index++);
+            setInvPeriodicBoxSizeArg(cl, interactionGroupKernel, index++);
+            for (int i = 0; i < (int) params->getBuffers().size(); i++)
+                interactionGroupKernel.setArg<cl::Memory>(index++, params->getBuffers()[i].getMemory());
+            if (globals != NULL)
+                interactionGroupKernel.setArg<cl::Buffer>(index++, globals->getDeviceBuffer());
+        }
+        int forceThreadBlockSize = max(32, cl.getNonbondedUtilities().getForceThreadBlockSize());
+        cl.executeKernel(interactionGroupKernel, numGroupThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
+    }
    mm_double4 boxSize = cl.getPeriodicBoxSizeDouble();
    return longRangeCoefficient/(boxSize.x*boxSize.y*boxSize.z);
 }