Continuing implementation of nonbonded forces

c3125421 · Peter Eastman · d95e723d · c3125421 · c3125421 · c3125421
Commit c3125421 authored Oct 06, 2009 by Peter Eastman
12 changed files
--- a/platforms/opencl/src/OpenCLArray.h
+++ b/platforms/opencl/src/OpenCLArray.h
@@ -139,6 +139,8 @@ public:
     * Copy the values in the Buffer to a vector.
     */
    void download(std::vector<T>& data) const {
+        if (data.size() != size)
+            data.resize(size);
        try {
            context.getQueue().enqueueReadBuffer(*buffer, CL_TRUE, 0, size*sizeof(T), &data[0]);
        }

--- a/platforms/opencl/src/OpenCLCompact.cpp
+++ b/platforms/opencl/src/OpenCLCompact.cpp
+
+
+/* Code for OpenCL stream compaction. Roughly based on:
+    Billeter M, Olsson O, Assarsson U. Efficient Stream Compaction on Wide SIMD Many-Core Architectures.
+        High Performance Graphics 2009.
+
+    Notes:
+        - paper recommends 128 threads/block, so this is hard coded.
+        - I only implement the prefix-sum based compact primitive, and not the POPC one, as that is more
+          complicated and performs poorly on current hardware
+        - I only implement the scattered- and staged-write variant of phase III as it they have reasonable
+          performance across most of the tested workloads in the paper. The selective variant is not
+          implemented.
+        - The prefix sum of per-block element counts (phase II) is not done in a particularly efficient
+          manner. It is, however, done in a very easy to program manner, and integrated into the top of
+          phase III, reducing the number of kernel invocations required. If one wanted to use existing code,
+          it'd be easy to take the CUDA SDK scanLargeArray sample, and do a prefix sum over dgBlockCounts in
+          a phase II kernel. You could also adapt the existing prescan128 to take an initial value, and scan
+          dgBlockCounts in stages.
+
+  Date:         23 Aug 2009
+  Author:       CUDA version by Imran Haque (ihaque@cs.stanford.edu), converted to OpenCL by Peter Eastman
+  Affiliation:  Stanford University
+  License:      Public Domain
+*/
+
+#include "OpenCLCompact.h"
+
+using namespace OpenMM;
+
+OpenCLCompact::OpenCLCompact(OpenCLContext& context) : context(context), dgBlockCounts(NULL) {
+    dgBlockCounts = new OpenCLArray<cl_uint>(context, context.getNumThreadBlocks(), "dgBlockCounts");
+    cl::Program program = context.createProgram(context.loadSourceFromFile("compact.cl"));
+    countKernel = cl::Kernel(program, "countElts");
+    moveValidKernel = cl::Kernel(program, "moveValidElementsStaged");
+}
+
+OpenCLCompact::~OpenCLCompact() {
+    if (dgBlockCounts != NULL)
+        delete dgBlockCounts;
+}
+
+void OpenCLCompact::compactStream(OpenCLArray<cl_uint>& dOut, OpenCLArray<cl_uint>& dIn, OpenCLArray<cl_uint>& dValid, OpenCLArray<cl_uint>& numValid) {
+    // Figure out # elements per block
+    int len = dIn.getSize();
+    unsigned int numBlocks = context.getNumThreadBlocks();
+    if (numBlocks*128 > len)
+        numBlocks = (len+127)/128;
+    const size_t eltsPerBlock = len/numBlocks + ((len % numBlocks) ? 1 : 0);
+
+    // TODO: implement loop over blocks of 10M
+    // Phase 1: Calculate number of valid elements per thread block
+    countKernel.setArg<cl::Buffer>(0, dgBlockCounts->getDeviceBuffer());
+    countKernel.setArg<cl::Buffer>(1, dValid.getDeviceBuffer());
+    countKernel.setArg<cl_uint>(2, len);
+    countKernel.setArg(3, 128*sizeof(cl_uint), NULL);
+    context.executeKernel(countKernel, len, 128);
+
+    // Phase 2/3: Move valid elements using SIMD compaction
+    moveValidKernel.setArg<cl::Buffer>(0, dIn.getDeviceBuffer());
+    moveValidKernel.setArg<cl::Buffer>(1, dOut.getDeviceBuffer());
+    moveValidKernel.setArg<cl::Buffer>(2, dValid.getDeviceBuffer());
+    moveValidKernel.setArg<cl::Buffer>(3, dgBlockCounts->getDeviceBuffer());
+    moveValidKernel.setArg<cl_uint>(4, len);
+    moveValidKernel.setArg<cl::Buffer>(5, numValid.getDeviceBuffer());
+    moveValidKernel.setArg(6, 128*sizeof(cl_uint), NULL);
+    moveValidKernel.setArg(7, 128*sizeof(cl_uint), NULL);
+    moveValidKernel.setArg(8, 128*sizeof(cl_uint), NULL);
+    context.executeKernel(moveValidKernel, len, 128);
+}
--- a/platforms/opencl/src/OpenCLCompact.h
+++ b/platforms/opencl/src/OpenCLCompact.h
+#ifndef __OPENMM_OPENCLCOMPACT_H__
+#define __OPENMM_OPENCLCOMPACT_H__
+
+/* Code for OPENCL stream compaction. Roughly based on:
+    Billeter M, Olsson O, Assarsson U. Efficient Stream Compaction on Wide SIMD Many-Core Architectures.
+        High Performance Graphics 2009.
+
+    Notes:
+        - paper recommends 128 threads/block, so this is hard coded.
+        - I only implement the prefix-sum based compact primitive, and not the POPC one, as that is more
+          complicated and performs poorly on current hardware
+        - I only implement the scattered- and staged-write variant of phase III as it they have reasonable
+          performance across most of the tested workloads in the paper. The selective variant is not
+          implemented.
+        - The prefix sum of per-block element counts (phase II) is not done in a particularly efficient
+          manner. It is, however, done in a very easy to program manner, and integrated into the top of
+          phase III, reducing the number of kernel invocations required. If one wanted to use existing code,
+          it'd be easy to take the CUDA SDK scanLargeArray sample, and do a prefix sum over dgBlockCounts in
+          a phase II kernel. You could also adapt the existing prescan128 to take an initial value, and scan
+          dgBlockCounts in stages.
+
+  Date:         23 Aug 2009
+  Author:       CUDA version by Imran Haque (ihaque@cs.stanford.edu), converted to OpenCL by Peter Eastman
+  Affiliation:  Stanford University
+  License:      Public Domain
+*/
+
+#include "OpenCLArray.h"
+
+namespace OpenMM {
+
+class OpenCLCompact {
+public:
+    OpenCLCompact(OpenCLContext& context);
+    ~OpenCLCompact();
+    void compactStream(OpenCLArray<cl_uint>& dOut, OpenCLArray<cl_uint>& dIn, OpenCLArray<cl_uint>& dValid, OpenCLArray<cl_uint>& numValid);
+private:
+    OpenCLContext& context;
+    OpenCLArray<cl_uint>* dgBlockCounts;
+    cl::Kernel countKernel;
+    cl::Kernel moveValidKernel;
+};
+
+} // namespace OpenMM
+
+#endif // __OPENMM_OPENCLCOMPACT_H__
\ No newline at end of file
--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -58,6 +58,9 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex) : time(0.0), ste
    device = devices[deviceIndex];
    if (device.getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>()[0] < minThreadBlockSize)
        throw OpenMMException("The specified OpenCL device is not compatible with OpenMM");
+    compilationOptions = "-cl-fast-relaxed-math";
+    if (device.getInfo<CL_DEVICE_VENDOR>() == "NVIDIA")
+        compilationOptions += " -DWARPS_ARE_ATOMIC";
    queue = cl::CommandQueue(context, device);
    numAtoms = numParticles;
    paddedNumAtoms = TileSize*((numParticles+TileSize-1)/TileSize);
@@ -132,21 +135,30 @@ string OpenCLContext::loadSourceFromFile(const string& filename) const {
    return kernel;
 }

-cl::Program OpenCLContext::createProgram(const std::string source) {
+cl::Program OpenCLContext::createProgram(const string source) {
+    return createProgram(source, map<string, string>());
+}
+
+cl::Program OpenCLContext::createProgram(const string source, const map<string, string>& defines) {
    cl::Program::Sources sources(1, make_pair(source.c_str(), source.size()));
    cl::Program program(context, sources);
+    string options = compilationOptions;
+    for (map<string, string>::const_iterator iter = defines.begin(); iter != defines.end(); ++iter)
+        options += " -D"+iter->first+"="+iter->second;
    try {
-        program.build(vector<cl::Device>(1, device));
+        program.build(vector<cl::Device>(1, device), options.c_str());
    } catch (cl::Error err) {
        throw OpenMMException("Error compiling kernel: "+program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device));
    }
    return program;
 }

-void OpenCLContext::executeKernel(cl::Kernel& kernel, int workUnits) {
-    int size = std::min((workUnits+ThreadBlockSize-1)/ThreadBlockSize, numThreadBlocks)*ThreadBlockSize;
+void OpenCLContext::executeKernel(cl::Kernel& kernel, int workUnits, int workUnitSize) {
+    if (workUnitSize == -1)
+        workUnitSize = ThreadBlockSize;
+    int size = std::min((workUnits+workUnitSize-1)/workUnitSize, numThreadBlocks)*workUnitSize;
    try {
-        queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(size), cl::NDRange(ThreadBlockSize));
+        queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(size), cl::NDRange(workUnitSize));
    }
    catch (cl::Error err) {
        stringstream str;

--- a/platforms/opencl/src/OpenCLContext.h
+++ b/platforms/opencl/src/OpenCLContext.h
@@ -28,6 +28,8 @@
 * -------------------------------------------------------------------------- */

 #define __CL_ENABLE_EXCEPTIONS
+#include <map>
+#include <string>
 #include <cl.hpp>

 namespace OpenMM {
@@ -139,13 +141,20 @@ public:
     * Create an OpenCL Program from source code.
     */
    cl::Program createProgram(const std::string source);
+    /**
+     * Create an OpenCL Program from source code.
+     *
+     * @param defines    a set of preprocessor definitions (name, value) to define when compiling the program
+     */
+    cl::Program createProgram(const std::string source, const std::map<std::string, std::string>& defines);
    /**
     * Execute a kernel.
     *
-     * @param kernel    the kernel to execute
-     * @param workUnits the maximum number of work units that should be used
+     * @param kernel       the kernel to execute
+     * @param workUnits    the maximum number of work units that should be used
+     * @param workUnitSize the size of each work unit to use
     */
-    void executeKernel(cl::Kernel& kernel, int workUnits);
+    void executeKernel(cl::Kernel& kernel, int workUnits, int workUnitSize = -1);
    /**
     * Set all elements of an array to 0.
     */
@@ -226,7 +235,7 @@ public:
    /**
     * Get the OpenCLNonbondedUtilities for this context.
     */
-    OpenCLNonbondedUtilities& getNonbondedUtilties() {
+    OpenCLNonbondedUtilities& getNonbondedUtilities() {
        return *nonbonded;
    }
 private:
@@ -237,6 +246,7 @@ private:
    int numAtomBlocks;
    int numThreadBlocks;
    int numForceBuffers;
+    std::string compilationOptions;
    cl::Context context;
    cl::Device device;
    cl::CommandQueue queue;

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -47,17 +47,17 @@ void OpenCLCalcForcesAndEnergyKernel::initialize(const System& system) {

 void OpenCLCalcForcesAndEnergyKernel::beginForceComputation(ContextImpl& context) {
    cl.clearBuffer(cl.getForceBuffers());
-    cl.getNonbondedUtilties().prepareInteractions();
+    cl.getNonbondedUtilities().prepareInteractions();
 }

 void OpenCLCalcForcesAndEnergyKernel::finishForceComputation(ContextImpl& context) {
    cl.reduceBuffer(cl.getForceBuffers(), cl.getNumForceBuffers());
-    cl.getNonbondedUtilties().prepareInteractions();
+    cl.getNonbondedUtilities().prepareInteractions();
 }

 void OpenCLCalcForcesAndEnergyKernel::beginEnergyComputation(ContextImpl& context) {
    cl.clearBuffer(cl.getEnergyBuffer());
-    cl.getNonbondedUtilties().prepareInteractions();
+    cl.getNonbondedUtilities().prepareInteractions();
 }

 double OpenCLCalcForcesAndEnergyKernel::finishEnergyComputation(ContextImpl& context) {
@@ -566,8 +566,8 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
 //    }
 //    data.nonbondedMethod = method;
 //    gpuSetCoulombParameters(gpu, 138.935485f, particle, c6, c12, q, symbol, exclusionList, method);
-    cl.getNonbondedUtilties().addInteraction(useCutoff, usePeriodic, force.getCutoffDistance(), exclusionList);
-    cl.getNonbondedUtilties().addParameter("sigmaEpsilon", "float2", 8, sigmaEpsilon->getDeviceBuffer());
+    cl.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, force.getCutoffDistance(), exclusionList);
+    cl.getNonbondedUtilities().addParameter("sigmaEpsilon", "float2", 8, sigmaEpsilon->getDeviceBuffer());
    cutoffSquared = force.getCutoffDistance()*force.getCutoffDistance();

    // Compute the Ewald self energy.
@@ -582,7 +582,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
    // Initialize the exceptions.

    int numExceptions = exceptions.size();
-    int maxBuffers = 1;
+    int maxBuffers = cl.getNonbondedUtilities().getNumForceBuffers();
    if (numExceptions > 0) {
        exceptionParams = new OpenCLArray<mm_float4>(cl, numExceptions, "exceptionParams");
        exceptionIndices = new OpenCLArray<mm_int4>(cl, numExceptions, "exceptionIndices");
@@ -607,13 +607,13 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
 }

 void OpenCLCalcNonbondedForceKernel::executeForces(ContextImpl& context) {
-    cl.getNonbondedUtilties().computeInteractions();
+    cl.getNonbondedUtilities().computeInteractions();
    if (exceptionIndices != NULL) {
        int numExceptions = exceptionIndices->getSize();
        exceptionsKernel.setArg<cl_int>(0, cl.getPaddedNumAtoms());
        exceptionsKernel.setArg<cl_int>(1, numExceptions);
        exceptionsKernel.setArg<cl_float>(2, cutoffSquared);
-        exceptionsKernel.setArg<mm_float4>(3, cl.getNonbondedUtilties().getPeriodicBoxSize());
+        exceptionsKernel.setArg<mm_float4>(3, cl.getNonbondedUtilities().getPeriodicBoxSize());
        exceptionsKernel.setArg<cl::Buffer>(4, cl.getForceBuffers().getDeviceBuffer());
        exceptionsKernel.setArg<cl::Buffer>(5, cl.getEnergyBuffer().getDeviceBuffer());
        exceptionsKernel.setArg<cl::Buffer>(6, cl.getPosq().getDeviceBuffer());

--- a/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+++ b/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
@@ -26,13 +26,25 @@

 #include "OpenCLNonbondedUtilities.h"
 #include "OpenCLArray.h"
+#include "OpenCLCompact.h"
 #include <map>

 using namespace OpenMM;
 using namespace std;

 OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : context(context), cutoff(-1.0), useCutoff(false),
-        numForceBuffers(0), tiles(NULL), exclusionIndex(NULL), exclusions(NULL) {
+        numForceBuffers(0), tiles(NULL), exclusionIndex(NULL), exclusions(NULL), interactingTiles(NULL), interactionFlags(NULL),
+        interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), compact(NULL) {
+    // Decide how many force buffers to use.
+
+    forceBufferPerAtomBlock = false;
+    numForceBuffers = context.getNumThreadBlocks()*OpenCLContext::ThreadBlockSize/OpenCLContext::TileSize;
+    if (numForceBuffers >= context.getNumAtomBlocks()) {
+        // For small systems, it is more efficient to have one force buffer per block of 32 atoms instead of one per warp.
+
+        forceBufferPerAtomBlock = true;
+        numForceBuffers = context.getNumAtomBlocks();
+    }
 }

 OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() {
@@ -42,6 +54,18 @@ OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() {
        delete exclusionIndex;
    if (exclusions != NULL)
        delete exclusions;
+    if (interactingTiles != NULL)
+        delete interactingTiles;
+    if (interactionFlags != NULL)
+        delete interactionFlags;
+    if (interactionCount != NULL)
+        delete interactionCount;
+    if (blockCenter != NULL)
+        delete blockCenter;
+    if (blockBoundingBox != NULL)
+        delete blockBoundingBox;
+    if (compact != NULL)
+        delete compact;
 }

 void OpenCLNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList) {
@@ -90,21 +114,17 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
        for (unsigned int x = y; x < numAtomBlocks; x++)
            tileVec[count++] = (x << 17) | (y << 2);

-    // Decide how many force buffers to use.
-
-    bool forceBufferPerAtomBlock = false;
-    numForceBuffers = context.getNumThreadBlocks()*OpenCLContext::ThreadBlockSize/OpenCLContext::TileSize;
-    if (numForceBuffers >= numAtomBlocks) {
-        // For small systems, it is more efficient to have one force buffer per block of 32 atoms instead of one per warp.
-
-        forceBufferPerAtomBlock = true;
-        numForceBuffers = numAtomBlocks;
-    }
-
    // Create kernels.

-    cl::Program forceProgram = context.createProgram(context.loadSourceFromFile("nonbonded.cl"));
+    map<string, string> defines;
+    if (forceBufferPerAtomBlock)
+        defines["USE_OUTPUT_BUFFER_PER_BLOCK"] = "true";
+    cl::Program forceProgram = context.createProgram(context.loadSourceFromFile("nonbonded.cl"), defines);
    forceKernel = cl::Kernel(forceProgram, "computeNonbonded");
+    cl::Program interactingBlocksProgram = context.createProgram(context.loadSourceFromFile("findInteractingBlocks.cl"), defines);
+    findBlockBoundsKernel = cl::Kernel(interactingBlocksProgram, "findBlockBounds");
+    findInteractingBlocksKernel = cl::Kernel(interactingBlocksProgram, "findBlocksWithInteractions");
+    findInteractionsWithinBlocksKernel = cl::Kernel(interactingBlocksProgram, "findInteractionsWithinBlocks");

    // Mark which tiles have exclusions.

@@ -182,16 +202,60 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
    tiles->upload(tileVec);
    exclusions->upload(exclusionVec);
    exclusionIndex->upload(exclusionIndexVec);
+
+    // Record the periodic box size.
+
    Vec3 boxVectors[3];
    system.getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
    periodicBoxSize = (mm_float4) {(float) boxVectors[0][0], (float) boxVectors[1][1], (float) boxVectors[2][2], 0.0f};
+
+    // Create data structures for the neighbor list.
+
+    if (useCutoff) {
+        interactingTiles = new OpenCLArray<cl_uint>(context, numTiles, "interactingTiles");
+        interactionFlags = new OpenCLArray<cl_uint>(context, numTiles, "interactionFlags");
+        interactionCount = new OpenCLArray<cl_uint>(context, 1, "interactionCount");
+        blockCenter = new OpenCLArray<mm_float4>(context, numAtomBlocks, "blockCenter");
+        blockBoundingBox = new OpenCLArray<mm_float4>(context, numAtomBlocks, "blockBoundingBox");
+        compact = new OpenCLCompact(context);
+    }
 }

 void OpenCLNonbondedUtilities::prepareInteractions() {
    hasComputedInteractions = false;
    if (!useCutoff)
        return;
-    // TODO compute the neighbor list
+
+    // Compute the neighbor list.
+
+    findBlockBoundsKernel.setArg<cl_int>(0, context.getNumAtoms());
+    findBlockBoundsKernel.setArg<mm_float4>(1, periodicBoxSize);
+    findBlockBoundsKernel.setArg<cl::Buffer>(2, context.getPosq().getDeviceBuffer());
+    findBlockBoundsKernel.setArg<cl::Buffer>(3, blockCenter->getDeviceBuffer());
+    findBlockBoundsKernel.setArg<cl::Buffer>(4, blockBoundingBox->getDeviceBuffer());
+    context.executeKernel(findBlockBoundsKernel, context.getNumAtoms());
+
+    findInteractingBlocksKernel.setArg<cl_int>(0, tiles->getSize());
+    findInteractingBlocksKernel.setArg<cl_float>(1, cutoff*cutoff);
+    findInteractingBlocksKernel.setArg<mm_float4>(2, periodicBoxSize);
+    findInteractingBlocksKernel.setArg<cl::Buffer>(3, tiles->getDeviceBuffer());
+    findInteractingBlocksKernel.setArg<cl::Buffer>(4, blockCenter->getDeviceBuffer());
+    findInteractingBlocksKernel.setArg<cl::Buffer>(5, blockBoundingBox->getDeviceBuffer());
+    findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactionFlags->getDeviceBuffer());
+    context.executeKernel(findInteractingBlocksKernel, context.getNumAtoms());
+
+    compact->compactStream(*interactingTiles, *tiles, *interactionFlags, *interactionCount);
+
+    findInteractionsWithinBlocksKernel.setArg<cl_float>(0, cutoff*cutoff);
+    findInteractionsWithinBlocksKernel.setArg<mm_float4>(1, periodicBoxSize);
+    findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(2, context.getPosq().getDeviceBuffer());
+    findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(3, tiles->getDeviceBuffer());
+    findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(4, blockCenter->getDeviceBuffer());
+    findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(5, blockBoundingBox->getDeviceBuffer());
+    findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(6, interactionFlags->getDeviceBuffer());
+    findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(7, interactionCount->getDeviceBuffer());
+    findInteractionsWithinBlocksKernel.setArg(8, OpenCLContext::ThreadBlockSize*sizeof(cl_uint), NULL);
+    context.executeKernel(findInteractionsWithinBlocksKernel, context.getNumAtoms());
 }

 void OpenCLNonbondedUtilities::computeInteractions() {

--- a/platforms/opencl/src/OpenCLNonbondedUtilities.h
+++ b/platforms/opencl/src/OpenCLNonbondedUtilities.h
@@ -34,6 +34,8 @@

 namespace OpenMM {

+class OpenCLCompact;
+
 /**
 * This class implements features that are used by several different force.  It provides
 * a generic interface for calculating nonbonded interactions.
@@ -86,17 +88,62 @@ public:
     * prepareInteractions().  Additional calls return immediately without doing anything.
     */
    void computeInteractions();
+    /**
+     * Get the array containing the center of each atom block.
+     */
+    OpenCLArray<mm_float4>& getBlockCenters() {
+        return *blockCenter;
+    }
+    /**
+     * Get the array containing the dimensions of each atom block.
+     */
+    OpenCLArray<mm_float4>& getBlockBoundingBoxes() {
+        return *blockBoundingBox;
+    }
+    /**
+     * Get the array containing the full set of tiles.
+     */
+    OpenCLArray<cl_uint>& getTiles() {
+        return *tiles;
+    }
+    /**
+     * Get the array whose first element contains the number of tiles with interactions.
+     */
+    OpenCLArray<cl_uint>& getInteractionCount() {
+        return *interactionCount;
+    }
+    /**
+     * Get the array containing tiles with interactions.
+     */
+    OpenCLArray<cl_uint>& getInteractingTiles() {
+        return *interactingTiles;
+    }
+    /**
+     * Get the array containing flags for tiles with interactions.
+     */
+    OpenCLArray<cl_uint>& getInteractionFlags() {
+        return *interactionFlags;
+    }
 private:
    class ParameterInfo;
    OpenCLContext& context;
    cl::Kernel forceKernel;
+    cl::Kernel findBlockBoundsKernel;
+    cl::Kernel findInteractingBlocksKernel;
+    cl::Kernel findInteractionsWithinBlocksKernel;
    OpenCLArray<cl_uint>* tiles;
    OpenCLArray<cl_uint>* exclusionIndex;
    OpenCLArray<cl_uint>* exclusions;
+    OpenCLArray<cl_uint>* interactingTiles;
+    OpenCLArray<cl_uint>* interactionFlags;
+    OpenCLArray<cl_uint>* interactionCount;
+    OpenCLArray<mm_float4>* blockCenter;
+    OpenCLArray<mm_float4>* blockBoundingBox;
    std::vector<std::vector<int> > atomExclusions;
    std::vector<ParameterInfo> parameters;
+    OpenCLCompact* compact;
    double cutoff;
-    bool useCutoff, usePeriodic, hasComputedInteractions;
+    bool useCutoff, usePeriodic, forceBufferPerAtomBlock, hasComputedInteractions;
    int numForceBuffers;
    mm_float4 periodicBoxSize;
 };

--- a/platforms/opencl/src/kernels/compact.cl
+++ b/platforms/opencl/src/kernels/compact.cl
+/* Code for CUDA stream compaction. Roughly based on:
+    Billeter M, Olsson O, Assarsson U. Efficient Stream Compaction on Wide SIMD Many-Core Architectures.
+        High Performance Graphics 2009.
+
+    Notes:
+        - paper recommends 128 threads/block, so this is hard coded.
+        - I only implement the prefix-sum based compact primitive, and not the POPC one, as that is more
+          complicated and performs poorly on current hardware
+        - I only implement the scattered- and staged-write variant of phase III as it they have reasonable
+          performance across most of the tested workloads in the paper. The selective variant is not
+          implemented.
+        - The prefix sum of per-block element counts (phase II) is not done in a particularly efficient
+          manner. It is, however, done in a very easy to program manner, and integrated into the top of
+          phase III, reducing the number of kernel invocations required. If one wanted to use existing code,
+          it'd be easy to take the CUDA SDK scanLargeArray sample, and do a prefix sum over dgBlockCounts in
+          a phase II kernel. You could also adapt the existing prescan128 to take an initial value, and scan
+          dgBlockCounts in stages.
+
+  Date:         23 Aug 2009
+  Author:       CUDA version by Imran Haque (ihaque@cs.stanford.edu), converted to OpenCL by Peter Eastman
+  Affiliation:  Stanford University
+  License:      Public Domain
+*/
+
+// Phase 1: Count valid elements per thread block
+// Hard-code 128 thd/blk
+unsigned int sumReduce128(__local unsigned int* arr) {
+    // Parallel reduce element counts
+    // Assumes 128 thd/block
+    int thread = get_local_id(0);
+    if (thread < 64) arr[thread] += arr[thread+64];
+    barrier(CLK_LOCAL_MEM_FENCE);
+#ifdef WARPS_ARE_ATOMIC
+    if (thread < 32) {
+        arr[thread] += arr[thread+32];
+        if (thread < 16) arr[thread] += arr[thread+16];
+        if (thread < 8) arr[thread] += arr[thread+8];
+        if (thread < 4) arr[thread] += arr[thread+4];
+        if (thread < 2) arr[thread] += arr[thread+2];
+        if (thread < 1) arr[thread] += arr[thread+1];
+    }
+#else
+    if (thread < 32) arr[thread] += arr[thread+32];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (thread < 16) arr[thread] += arr[thread+16];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (thread < 8) arr[thread] += arr[thread+8];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (thread < 4) arr[thread] += arr[thread+4];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (thread < 2) arr[thread] += arr[thread+2];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (thread < 1) arr[thread] += arr[thread+1];
+#endif
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return arr[0];
+}
+
+__kernel void countElts(__global unsigned int* dgBlockCounts, __global unsigned int* dgValid, const unsigned int len, __local unsigned int* dsCount) {
+    dsCount[get_local_id(0)] = 0;
+    unsigned int ub;
+    const unsigned int eltsPerBlock = len/get_num_groups(0) + ((len % get_num_groups(0)) ? 1 : 0);
+    ub = (len < (get_group_id(0)+1)*eltsPerBlock) ? len : ((get_group_id(0) + 1)*eltsPerBlock);
+    for (int base = get_group_id(0) * eltsPerBlock; base < (get_group_id(0)+1)*eltsPerBlock; base += get_local_size(0)) {
+        if ((base + get_local_id(0)) < ub && dgValid[base+get_local_id(0)])
+            dsCount[get_local_id(0)]++;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    unsigned int blockCount = sumReduce128(dsCount);
+    if (get_local_id(0) == 0) dgBlockCounts[get_group_id(0)] = blockCount;
+    return;
+}
+
+// Phase 2/3: Move valid elements using SIMD compaction (phase 2 is done implicitly at top of __global__ method)
+// Exclusive prefix scan over 128 elements
+// Assumes 128 threads
+// Taken from cuda SDK "scan" sample for naive scan, with small modifications
+int exclusivePrescan128(__local const unsigned int* in, __local unsigned int* outAndTemp) {
+    const int n=128;
+    //TODO: this temp storage could be reduced since we write to shared memory in out anyway, and n is hardcoded
+    //__shared__ int temp[2*n];
+    __local unsigned int* temp = outAndTemp;
+    int pout = 1, pin = 0;
+
+    // load input into temp
+    // This is exclusive scan, so shift right by one and set first elt to 0
+    int thread = get_local_id(0);
+    temp[pout*n + get_local_id(0)] = (get_local_id(0) > 0) ? in[get_local_id(0)-1] : 0;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    for (int offset = 1; offset < n; offset *= 2)
+    {
+        pout = 1 - pout; // swap double buffer indices
+        pin  = 1 - pout;
+        barrier(CLK_LOCAL_MEM_FENCE);
+        temp[pout*n+get_local_id(0)] = temp[pin*n+get_local_id(0)];
+        if (get_local_id(0) >= offset)
+            temp[pout*n+get_local_id(0)] += temp[pin*n+get_local_id(0) - offset];
+    }
+
+    //out[get_local_id(0)] = temp[pout*n+get_local_id(0)]; // write output
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return outAndTemp[127]+in[127]; // Return sum of all elements
+}
+
+int compactSIMDPrefixSum(__local const unsigned int* dsData, __local const unsigned int* dsValid, __local unsigned int* dsCompact, __local unsigned int* dsLocalIndex) {
+    int numValid = exclusivePrescan128(dsValid,dsLocalIndex);
+    int thread = get_local_id(0);
+    if (dsValid[get_local_id(0)]) dsCompact[dsLocalIndex[get_local_id(0)]] = dsData[get_local_id(0)];
+    return numValid;
+}
+
+__kernel void moveValidElementsStaged(__global unsigned int* dgData, __global unsigned int* dgCompact, __global unsigned int* dgValid,
+            __global unsigned int* dgBlockCounts, unsigned int len, __global unsigned int* dNumValidElements,
+            __local unsigned int* inBlock, __local unsigned int* validBlock, __local unsigned int* compactBlock) {
+    __local unsigned int dsLocalIndex[256];
+    int blockOutOffset=0;
+    // Sum up the blockCounts before us to find our offset
+    // This is totally inefficient - lots of repeated work b/w blocks, and uneven balancing.
+    // Paper implements this as a prefix sum kernel in phase II
+    // May still be faster than an extra kernel invocation?
+    int thread = get_local_id(0);
+    for (int base = 0; base < get_group_id(0); base += get_local_size(0)) {
+        // Load up the count of valid elements for each block before us in batches of 128
+        if ((base + get_local_id(0)) < get_group_id(0)) {
+            validBlock[get_local_id(0)] = dgBlockCounts[base+get_local_id(0)];
+        } else {
+            validBlock[get_local_id(0)] = 0;
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+        // Parallel reduce these counts
+        // Accumulate in the final offset variable
+        blockOutOffset += sumReduce128(validBlock);
+    }
+
+    unsigned int ub;
+    const unsigned int eltsPerBlock = len/get_num_groups(0) + ((len % get_num_groups(0)) ? 1 : 0);
+    ub = (len < (get_group_id(0)+1)*eltsPerBlock) ? len : ((get_group_id(0) + 1)*eltsPerBlock);
+    for (int base = get_group_id(0) * eltsPerBlock; base < (get_group_id(0)+1)*eltsPerBlock; base += get_local_size(0)) {
+        if ((base + get_local_id(0)) < ub) {
+            validBlock[get_local_id(0)] = dgValid[base+get_local_id(0)];
+            inBlock[get_local_id(0)] = dgData[base+get_local_id(0)];
+        } else {
+            validBlock[get_local_id(0)] = 0;
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+        int numValidBlock = compactSIMDPrefixSum(inBlock,validBlock,compactBlock,dsLocalIndex);
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if (get_local_id(0) < numValidBlock) {
+            dgCompact[blockOutOffset + get_local_id(0)] = compactBlock[get_local_id(0)];
+        }
+        blockOutOffset += numValidBlock;
+    }
+    if (get_group_id(0) == (get_num_groups(0)-1) && get_local_id(0) == 0) {
+        *dNumValidElements = blockOutOffset;
+    }
+}
--- a/platforms/opencl/src/kernels/findInteractingBlocks.cl
+++ b/platforms/opencl/src/kernels/findInteractingBlocks.cl
+const int BlockSize = 32;
+
+/**
+ * Find a bounding box for the atoms in each block.
+ */
+__kernel void findBlockBounds(int numAtoms, float4 periodicBoxSize, __global float4* posq, __global float4* blockCenter, __global float4* blockBoundingBox) {
+    int index = get_global_id(0);
+    int base = index*BlockSize;
+    while (base < numAtoms) {
+        float4 pos = posq[base];
+#ifdef USE_PERIODIC
+        pos.x -= floor(pos.x/periodicBoxSize.x)*periodicBoxSize.x;
+        pos.y -= floor(pos.y/periodicBoxSize.y)*periodicBoxSize.y;
+        pos.z -= floor(pos.z/periodicBoxSize.z)*periodicBoxSize.z;
+        float4 firstPoint = pos;
+#endif
+        float4 minPos = pos;
+        float4 maxPos = pos;
+        int last = min(base+BlockSize, numAtoms);
+        for (int i = base+1; i < last; i++) {
+            pos = posq[i];
+#ifdef USE_PERIODIC
+            pos.x -= floor((pos.x-firstPoint.x)/periodicBoxSize.x+0.5f)*periodicBoxSize.x;
+            pos.y -= floor((pos.y-firstPoint.y)/periodicBoxSize.y+0.5f)*periodicBoxSize.y;
+            pos.z -= floor((pos.z-firstPoint.z)/periodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+            minPos = min(minPos, pos);
+            maxPos = max(maxPos, pos);
+        }
+        blockBoundingBox[index] = 0.5f*(maxPos-minPos);
+        blockCenter[index] = 0.5f*(maxPos+minPos);
+        index += get_global_size(0);
+        base = index*BlockSize;
+    }
+}
+
+/**
+ * Compare the bounding boxes for each pair of blocks.  If they are sufficiently far apart,
+ * mark them as non-interacting.
+ */
+__kernel void findBlocksWithInteractions(int numTiles, float cutoffSquared, float4 periodicBoxSize, __global unsigned int* tiles, __global float4* blockCenter,
+        __global float4* blockBoundingBox, __global unsigned int* interactionFlag) {
+    int index = get_global_id(0);
+    while (index < numTiles) {
+        // Extract cell coordinates from appropriate work unit
+
+        unsigned int x = tiles[index];
+        unsigned int y = ((x >> 2) & 0x7fff);
+        x = (x >> 17);
+
+        // Find the distance between the bounding boxes of the two cells.
+
+        float4 delta = blockCenter[x]-blockCenter[y];
+#ifdef USE_PERIODIC
+        delta.x -= floor(delta.x/periodicBoxSize.x+0.5f)*periodicBoxSize.x;
+        delta.y -= floor(delta.y/periodicBoxSize.y+0.5f)*periodicBoxSize.y;
+        delta.z -= floor(delta.z/periodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+        float4 boxSizea = blockBoundingBox[x];
+        float4 boxSizeb = blockBoundingBox[y];
+        delta.x = max(0.0f, fabs(delta.x)-boxSizea.x-boxSizeb.x);
+        delta.y = max(0.0f, fabs(delta.y)-boxSizea.y-boxSizeb.y);
+        delta.z = max(0.0f, fabs(delta.z)-boxSizea.z-boxSizeb.z);
+        interactionFlag[index] = (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z > cutoffSquared ? 0 : 1);
+        index += get_global_size(0);
+    }
+}
+
+/**
+ * Compare each atom in one block to the bounding box of another block, and set
+ * flags for which ones are interacting.
+ */
+__kernel void findInteractionsWithinBlocks(float cutoffSquared, float4 periodicBoxSize, __global float4* posq, __global unsigned int* tiles, __global float4* blockCenter,
+            __global float4* blockBoundingBox, __global unsigned int* interactionFlags, __global unsigned int* interactionCount, __local unsigned int* flags) {
+    unsigned int totalWarps = get_global_size(0)/BlockSize;
+    unsigned int warp = get_global_id(0)/BlockSize;
+    unsigned int numTiles = interactionCount[0];
+    unsigned int pos = warp*numTiles/totalWarps;
+    unsigned int end = (warp+1)*numTiles/totalWarps;
+    unsigned int index = get_local_id(0) & (BlockSize - 1);
+
+    unsigned int lasty = 0xFFFFFFFF;
+    float4 apos;
+    while (pos < end)
+    {
+        // Extract cell coordinates from appropriate work unit
+        unsigned int x = tiles[pos];
+        unsigned int y = ((x >> 2) & 0x7fff);
+        bool bExclusionFlag = (x & 0x1);
+        x = (x >> 17);
+        if (x == y || bExclusionFlag)
+        {
+            // Assume this block will be dense.
+
+            if (index == 0)
+                interactionFlags[pos] = 0xFFFFFFFF;
+        }
+        else
+        {
+            // Load the bounding box for x and the atom positions for y.
+
+            float4 center = blockCenter[x];
+            float4 boxSize = blockBoundingBox[x];
+            if (y != lasty)
+                apos = posq[(y*BlockSize)+index];
+
+            // Find the distance of the atom from the bounding box.
+
+            float4 delta = apos-center;
+#ifdef USE_PERIODIC
+            delta.x -= floor(delta.x/periodicBoxSize.x+0.5f)*periodicBoxSize.x;
+            delta.y -= floor(delta.y/periodicBoxSize.y+0.5f)*periodicBoxSize.y;
+            delta.z -= floor(delta.z/periodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+            delta = max((float4) 0.0f, fabs(delta)-boxSize);
+            int thread = get_local_id(0);
+            flags[thread] = (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z > cutoffSquared ? 0 : 1 << index);
+
+            // Sum the flags.
+
+            if (index % 2 == 0)
+                flags[thread] += flags[thread+1];
+            if (index % 4 == 0)
+                flags[thread] += flags[thread+2];
+            if (index % 8 == 0)
+                flags[thread] += flags[thread+4];
+            if (index % 16 == 0)
+                flags[thread] += flags[thread+8];
+            if (index == 0)
+            {
+                unsigned int allFlags = flags[thread] + flags[thread+16];
+
+                // Count how many flags are set, and based on that decide whether to compute all interactions
+                // or only a fraction of them.
+
+                unsigned int bits = (allFlags&0x55555555) + ((allFlags>>1)&0x55555555);
+                bits = (bits&0x33333333) + ((bits>>2)&0x33333333);
+                bits = (bits&0x0F0F0F0F) + ((bits>>4)&0x0F0F0F0F);
+                bits = (bits&0x00FF00FF) + ((bits>>8)&0x00FF00FF);
+                bits = (bits&0x0000FFFF) + ((bits>>16)&0x0000FFFF);
+                interactionFlags[pos] = (bits > 12 ? 0xFFFFFFFF : allFlags);
+            }
+            lasty = y;
+        }
+        pos++;
+    }
+}
--- a/platforms/opencl/src/kernels/nonbonded.cl
+++ b/platforms/opencl/src/kernels/nonbonded.cl
@@ -84,16 +84,16 @@ __kernel void computeNonbonded(int numTiles, int paddedNumAtoms, float cutoffSqu

            // Write results
            float4 of;
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset = x + tgx + warp*paddedNumAtoms;
-            of = forceBuffers[offset];
-            of.xyz += af.xyz;
-            forceBuffers[offset] = of;
-#else
+#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
            of.xyz = af.xyz;
            of.w = 0.0f;
            unsigned int offset = x + tgx + (x/TileSize) * paddedNumAtoms;
            forceBuffers[offset] = of;
+#else
+            unsigned int offset = x + tgx + warp*paddedNumAtoms;
+            of = forceBuffers[offset];
+            of.xyz += af.xyz;
+            forceBuffers[offset] = of;
 #endif
        }
        else {
@@ -169,14 +169,14 @@ __kernel void computeNonbonded(int numTiles, int paddedNumAtoms, float cutoffSqu
                    }
                }
            }
-            else  // bExclusion
+            else
 #endif
            {
                // Read fixed atom data into registers and GRF
                unsigned int xi = x/TileSize;
                unsigned int yi = y/TileSize;
                unsigned int tile = xi+yi*paddedNumAtoms/TileSize-yi*(yi+1)/2;
-                unsigned int excl = exclusions[exclusionIndices[tile]+tgx];
+                unsigned int excl = (hasExclusions ? exclusions[exclusionIndices[tile]+tgx] : 0xFFFFFFFF);
                excl = (excl >> tgx) | (excl << (TileSize - tgx));
                for (unsigned int j = 0; j < TileSize; j++) {
                    bool isExcluded = !(excl & 0x1);
@@ -222,16 +222,7 @@ __kernel void computeNonbonded(int numTiles, int paddedNumAtoms, float cutoffSqu

            // Write results
            float4 of;
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset = x + tgx + warp*paddedNumAtoms;
-            of = forceBuffers[offset];
-            of.xyz += af.xyz;
-            forceBuffers[offset] = of;
-            offset = y + tgx + warp*paddedNumAtoms;
-            of = forceBuffers[offset];
-            of.xyz += local_foce[get_local_id(0)].xyz;
-            forceBuffers[offset] = of;
-#else
+#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
            of.xyz = af.xyz;
            of.w = 0.0f;
            unsigned int offset = x + tgx + (y/TileSize) * paddedNumAtoms;
@@ -239,6 +230,15 @@ __kernel void computeNonbonded(int numTiles, int paddedNumAtoms, float cutoffSqu
            of = local_force[get_local_id(0)];
            offset = y + tgx + (x/TileSize) * paddedNumAtoms;
            forceBuffers[offset] = of;
+#else
+            unsigned int offset = x + tgx + warp*paddedNumAtoms;
+            of = forceBuffers[offset];
+            of.xyz += af.xyz;
+            forceBuffers[offset] = of;
+            offset = y + tgx + warp*paddedNumAtoms;
+            of = forceBuffers[offset];
+            of.xyz += local_force[get_local_id(0)].xyz;
+            forceBuffers[offset] = of;
 #endif
            lasty = y;
        }

--- a/platforms/opencl/tests/TestOpenCLNonbondedForce.cpp
+++ b/platforms/opencl/tests/TestOpenCLNonbondedForce.cpp
@@ -43,7 +43,8 @@
 #include "openmm/LangevinIntegrator.h"
 #include "openmm/VerletIntegrator.h"
 #include "openmm/internal/ContextImpl.h"
-//#include "kernels/gputypes.h"
+#include "OpenCLArray.h"
+#include "OpenCLNonbondedUtilities.h"
 #include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
 #include "../src/sfmt/SFMT.h"
 #include <iostream>
@@ -438,167 +439,176 @@ void testLargeSystem() {
    ASSERT_EQUAL_TOL(clState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol);
 }

-//void testBlockInteractions(bool periodic) {
-//    const int blockSize = 32;
-//    const int numBlocks = 100;
-//    const int numParticles = blockSize*numBlocks;
-//    const double cutoff = 1.0;
-//    const double boxSize = (periodic ? 5.1 : 1.1);
-//    OpenCLPlatform cl;
-//    System system;
-//    VerletIntegrator integrator(0.01);
-//    NonbondedForce* nonbonded = new NonbondedForce();
-//    vector<Vec3> positions(numParticles);
-//    init_gen_rand(0);
-//    for (int i = 0; i < numParticles; i++) {
-//        system.addParticle(1.0);
-//        nonbonded->addParticle(1.0, 0.2, 0.2);
-//        positions[i] = Vec3(boxSize*(3*genrand_real2()-1), boxSize*(3*genrand_real2()-1), boxSize*(3*genrand_real2()-1));
-//    }
-//    nonbonded->setNonbondedMethod(periodic ? NonbondedForce::CutoffPeriodic : NonbondedForce::CutoffNonPeriodic);
-//    nonbonded->setCutoffDistance(cutoff);
-//    system.setPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
-//    system.addForce(nonbonded);
-//    Context context(system, integrator, cl);
-//    context.setPositions(positions);
-//    State state = context.getState(State::Positions | State::Velocities | State::Forces);
-//    ContextImpl* contextImpl = *reinterpret_cast<ContextImpl**>(&context);
-//    OpenCLPlatform::PlatformData& data = *static_cast<OpenCLPlatform::PlatformData*>(contextImpl->getPlatformData());
-//
-//    // Verify that the bounds of each block were calculated correctly.
-//
-//    data.gpu->psPosq4->Download();
-//    data.gpu->psGridBoundingBox->Download();
-//    data.gpu->psGridCenter->Download();
-//    for (int i = 0; i < numBlocks; i++) {
-//        float4 gridSize = (*data.gpu->psGridBoundingBox)[i];
-//        float4 center = (*data.gpu->psGridCenter)[i];
-//        if (periodic) {
-//            ASSERT(gridSize.x < 0.5*boxSize);
-//            ASSERT(gridSize.y < 0.5*boxSize);
-//            ASSERT(gridSize.z < 0.5*boxSize);
-//        }
-//        float minx = 0.0, maxx = 0.0, miny = 0.0, maxy = 0.0, minz = 0.0, maxz = 0.0, radius = 0.0;
-//        for (int j = 0; j < blockSize; j++) {
-//            float4 pos = (*data.gpu->psPosq4)[i*blockSize+j];
-//            float dx = pos.x-center.x;
-//            float dy = pos.y-center.y;
-//            float dz = pos.z-center.z;
-//            if (periodic) {
-//                dx -= (float)(floor(0.5+dx/boxSize)*boxSize);
-//                dy -= (float)(floor(0.5+dy/boxSize)*boxSize);
-//                dz -= (float)(floor(0.5+dz/boxSize)*boxSize);
-//            }
-//            ASSERT(abs(dx) < gridSize.x+TOL);
-//            ASSERT(abs(dy) < gridSize.y+TOL);
-//            ASSERT(abs(dz) < gridSize.z+TOL);
-//            minx = min(minx, dx);
-//            maxx = max(maxx, dx);
-//            miny = min(miny, dy);
-//            maxy = max(maxy, dy);
-//            minz = min(minz, dz);
-//            maxz = max(maxz, dz);
-//        }
-//        ASSERT_EQUAL_TOL(-minx, gridSize.x, TOL);
-//        ASSERT_EQUAL_TOL(maxx, gridSize.x, TOL);
-//        ASSERT_EQUAL_TOL(-miny, gridSize.y, TOL);
-//        ASSERT_EQUAL_TOL(maxy, gridSize.y, TOL);
-//        ASSERT_EQUAL_TOL(-minz, gridSize.z, TOL);
-//        ASSERT_EQUAL_TOL(maxz, gridSize.z, TOL);
-//    }
-//
-//    // Verify that interactions were identified correctly.
-//
-//    data.gpu->psInteractionCount->Download();
-//    int numWithInteractions = (*data.gpu->psInteractionCount)[0];
-//    vector<bool> hasInteractions(data.gpu->sim.workUnits, false);
-//    data.gpu->psInteractingWorkUnit->Download();
-//    data.gpu->psInteractionFlag->Download();
-//    const unsigned int atoms = data.gpu->sim.paddedNumberOfAtoms;
-//    const unsigned int grid = data.gpu->grid;
-//    const unsigned int dim = (atoms+(grid-1))/grid;
-//    for (int i = 0; i < numWithInteractions; i++) {
-//        unsigned int workUnit = (*data.gpu->psInteractingWorkUnit)[i];
-//        unsigned int x = (workUnit >> 17);
-//        unsigned int y = ((workUnit >> 2) & 0x7fff);
-//        int tile = (x > y ? x+y*dim-y*(y+1)/2 : y+x*dim-x*(x+1)/2);
-//        hasInteractions[tile] = true;
-//
-//        // Make sure this tile really should have been flagged based on bounding volumes.
-//
-//        float4 gridSize1 = (*data.gpu->psGridBoundingBox)[x];
-//        float4 gridSize2 = (*data.gpu->psGridBoundingBox)[y];
-//        float4 center1 = (*data.gpu->psGridCenter)[x];
-//        float4 center2 = (*data.gpu->psGridCenter)[y];
-//        float dx = center1.x-center2.x;
-//        float dy = center1.y-center2.y;
-//        float dz = center1.z-center2.z;
-//        if (periodic) {
-//            dx -= (float)(floor(0.5+dx/boxSize)*boxSize);
-//            dy -= (float)(floor(0.5+dy/boxSize)*boxSize);
-//            dz -= (float)(floor(0.5+dz/boxSize)*boxSize);
-//        }
-//        dx = max(0.0f, abs(dx)-gridSize1.x-gridSize2.x);
-//        dy = max(0.0f, abs(dy)-gridSize1.y-gridSize2.y);
-//        dz = max(0.0f, abs(dz)-gridSize1.z-gridSize2.z);
-//        ASSERT(sqrt(dx*dx+dy*dy+dz*dz) < cutoff+TOL);
-//
-//        // Check the interaction flags.
-//
-//        unsigned int flags = (*data.gpu->psInteractionFlag)[i];
-//        for (int atom2 = 0; atom2 < 32; atom2++) {
-//            if ((flags & 1) == 0) {
-//                float4 pos2 = (*data.gpu->psPosq4)[y*blockSize+atom2];
-//                for (int atom1 = 0; atom1 < blockSize; ++atom1) {
-//                    float4 pos1 = (*data.gpu->psPosq4)[x*blockSize+atom1];
-//                    float dx = pos2.x-pos1.x;
-//                    float dy = pos2.y-pos1.y;
-//                    float dz = pos2.z-pos1.z;
-//                    if (periodic) {
-//                        dx -= (float)(floor(0.5+dx/boxSize)*boxSize);
-//                        dy -= (float)(floor(0.5+dy/boxSize)*boxSize);
-//                        dz -= (float)(floor(0.5+dz/boxSize)*boxSize);
-//                    }
-//                    if (dx*dx+dy*dy+dz*dz < cutoff*cutoff) {
-//                        dx = pos2.x-center1.x;
-//                        dy = pos2.y-center1.y;
-//                        dz = pos2.z-center1.z;
-//                        dx = max(0.0f, abs(dx)-gridSize1.x);
-//                        dy = max(0.0f, abs(dy)-gridSize1.y);
-//                        dz = max(0.0f, abs(dz)-gridSize1.z);
-//                    }
-//                    ASSERT(dx*dx+dy*dy+dz*dz > cutoff*cutoff);
-//                }
-//            }
-//            flags >>= 1;
-//        }
-//    }
-//
-//    // Check the tiles that did not have interactions to make sure all atoms are beyond the cutoff.
-//
-//    data.gpu->psWorkUnit->Download();
-//    for (int i = 0; i < (int)hasInteractions.size(); i++)
-//        if (!hasInteractions[i]) {
-//            unsigned int workUnit = (*data.gpu->psWorkUnit)[i];
-//            unsigned int x = (workUnit >> 17);
-//            unsigned int y = ((workUnit >> 2) & 0x7fff);
-//            for (int atom1 = 0; atom1 < blockSize; ++atom1) {
-//                float4 pos1 = (*data.gpu->psPosq4)[x*blockSize+atom1];
-//                for (int atom2 = 0; atom2 < blockSize; ++atom2) {
-//                    float4 pos2 = (*data.gpu->psPosq4)[y*blockSize+atom2];
-//                    float dx = pos1.x-pos2.x;
-//                    float dy = pos1.y-pos2.y;
-//                    float dz = pos1.z-pos2.z;
-//                    if (periodic) {
-//                        dx -= (float)(floor(0.5+dx/boxSize)*boxSize);
-//                        dy -= (float)(floor(0.5+dy/boxSize)*boxSize);
-//                        dz -= (float)(floor(0.5+dz/boxSize)*boxSize);
-//                    }
-//                    ASSERT(dx*dx+dy*dy+dz*dz > cutoff*cutoff);
-//                }
-//            }
-//        }
-//}
+void testBlockInteractions(bool periodic) {
+    const int blockSize = 32;
+    const int numBlocks = 100;
+    const int numParticles = blockSize*numBlocks;
+    const double cutoff = 1.0;
+    const double boxSize = (periodic ? 5.1 : 1.1);
+    OpenCLPlatform cl;
+    System system;
+    VerletIntegrator integrator(0.01);
+    NonbondedForce* nonbonded = new NonbondedForce();
+    vector<Vec3> positions(numParticles);
+    init_gen_rand(0);
+    for (int i = 0; i < numParticles; i++) {
+        system.addParticle(1.0);
+        nonbonded->addParticle(1.0, 0.2, 0.2);
+        positions[i] = Vec3(boxSize*(3*genrand_real2()-1), boxSize*(3*genrand_real2()-1), boxSize*(3*genrand_real2()-1));
+    }
+    nonbonded->setNonbondedMethod(periodic ? NonbondedForce::CutoffPeriodic : NonbondedForce::CutoffNonPeriodic);
+    nonbonded->setCutoffDistance(cutoff);
+    system.setPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
+    system.addForce(nonbonded);
+    Context context(system, integrator, cl);
+    context.setPositions(positions);
+    State state = context.getState(State::Positions | State::Velocities | State::Forces);
+    ContextImpl* contextImpl = *reinterpret_cast<ContextImpl**>(&context);
+    OpenCLPlatform::PlatformData& data = *static_cast<OpenCLPlatform::PlatformData*>(contextImpl->getPlatformData());
+    OpenCLContext& clcontext = *data.context;
+
+    // Verify that the bounds of each block were calculated correctly.
+
+    clcontext.getPosq().download();
+    vector<mm_float4> blockCenters(numBlocks);
+    vector<mm_float4> blockBoundingBoxes(numBlocks);
+    OpenCLNonbondedUtilities& nb = clcontext.getNonbondedUtilities();
+    nb.getBlockCenters().download(blockCenters);
+    nb.getBlockBoundingBoxes().download(blockBoundingBoxes);
+    for (int i = 0; i < numBlocks; i++) {
+        mm_float4 gridSize = blockBoundingBoxes[i];
+        mm_float4 center = blockCenters[i];
+        if (periodic) {
+            ASSERT(gridSize.x < 0.5*boxSize);
+            ASSERT(gridSize.y < 0.5*boxSize);
+            ASSERT(gridSize.z < 0.5*boxSize);
+        }
+        float minx = 0.0, maxx = 0.0, miny = 0.0, maxy = 0.0, minz = 0.0, maxz = 0.0, radius = 0.0;
+        for (int j = 0; j < blockSize; j++) {
+            mm_float4 pos = clcontext.getPosq()[i*blockSize+j];
+            float dx = pos.x-center.x;
+            float dy = pos.y-center.y;
+            float dz = pos.z-center.z;
+            if (periodic) {
+                dx -= (float)(floor(0.5+dx/boxSize)*boxSize);
+                dy -= (float)(floor(0.5+dy/boxSize)*boxSize);
+                dz -= (float)(floor(0.5+dz/boxSize)*boxSize);
+            }
+            ASSERT(abs(dx) < gridSize.x+TOL);
+            ASSERT(abs(dy) < gridSize.y+TOL);
+            ASSERT(abs(dz) < gridSize.z+TOL);
+            minx = min(minx, dx);
+            maxx = max(maxx, dx);
+            miny = min(miny, dy);
+            maxy = max(maxy, dy);
+            minz = min(minz, dz);
+            maxz = max(maxz, dz);
+        }
+        ASSERT_EQUAL_TOL(-minx, gridSize.x, TOL);
+        ASSERT_EQUAL_TOL(maxx, gridSize.x, TOL);
+        ASSERT_EQUAL_TOL(-miny, gridSize.y, TOL);
+        ASSERT_EQUAL_TOL(maxy, gridSize.y, TOL);
+        ASSERT_EQUAL_TOL(-minz, gridSize.z, TOL);
+        ASSERT_EQUAL_TOL(maxz, gridSize.z, TOL);
+    }
+
+    // Verify that interactions were identified correctly.
+
+    vector<cl_uint> interactionCount;
+    vector<cl_uint> interactingTiles;
+    vector<cl_uint> interactionFlags;
+    nb.getInteractionCount().download(interactionCount);
+    int numWithInteractions = interactionCount[0];
+    vector<bool> hasInteractions(nb.getTiles().getSize(), false);
+    nb.getInteractingTiles().download(interactingTiles);
+    nb.getInteractionFlags().download(interactionFlags);
+    const unsigned int atoms = clcontext.getPaddedNumAtoms();
+    const unsigned int grid = OpenCLContext::TileSize;
+    const unsigned int dim = clcontext.getNumAtomBlocks();
+    printf("%d of %d\n", numWithInteractions, hasInteractions.size());
+    for (int i = 0; i < numWithInteractions; i++) {
+        unsigned int tile = interactingTiles[i];
+        unsigned int x = (tile >> 17);
+        unsigned int y = ((tile >> 2) & 0x7fff);
+        int index = (x > y ? x+y*dim-y*(y+1)/2 : y+x*dim-x*(x+1)/2);
+        hasInteractions[index] = true;
+
+        // Make sure this tile really should have been flagged based on bounding volumes.
+
+        mm_float4 gridSize1 = blockBoundingBoxes[x];
+        mm_float4 gridSize2 = blockBoundingBoxes[y];
+        mm_float4 center1 = blockCenters[x];
+        mm_float4 center2 = blockCenters[y];
+        float dx = center1.x-center2.x;
+        float dy = center1.y-center2.y;
+        float dz = center1.z-center2.z;
+        if (periodic) {
+            dx -= (float)(floor(0.5+dx/boxSize)*boxSize);
+            dy -= (float)(floor(0.5+dy/boxSize)*boxSize);
+            dz -= (float)(floor(0.5+dz/boxSize)*boxSize);
+        }
+        dx = max(0.0f, abs(dx)-gridSize1.x-gridSize2.x);
+        dy = max(0.0f, abs(dy)-gridSize1.y-gridSize2.y);
+        dz = max(0.0f, abs(dz)-gridSize1.z-gridSize2.z);
+        ASSERT(sqrt(dx*dx+dy*dy+dz*dz) < cutoff+TOL);
+
+        // Check the interaction flags.
+
+        unsigned int flags = interactionFlags[i];
+        for (int atom2 = 0; atom2 < 32; atom2++) {
+            if ((flags & 1) == 0) {
+                mm_float4 pos2 = clcontext.getPosq()[y*blockSize+atom2];
+                for (int atom1 = 0; atom1 < blockSize; ++atom1) {
+                    mm_float4 pos1 = clcontext.getPosq()[x*blockSize+atom1];
+                    float dx = pos2.x-pos1.x;
+                    float dy = pos2.y-pos1.y;
+                    float dz = pos2.z-pos1.z;
+                    if (periodic) {
+                        dx -= (float)(floor(0.5+dx/boxSize)*boxSize);
+                        dy -= (float)(floor(0.5+dy/boxSize)*boxSize);
+                        dz -= (float)(floor(0.5+dz/boxSize)*boxSize);
+                    }
+                    if (dx*dx+dy*dy+dz*dz < cutoff*cutoff) {
+                        dx = pos2.x-center1.x;
+                        dy = pos2.y-center1.y;
+                        dz = pos2.z-center1.z;
+                        dx = max(0.0f, abs(dx)-gridSize1.x);
+                        dy = max(0.0f, abs(dy)-gridSize1.y);
+                        dz = max(0.0f, abs(dz)-gridSize1.z);
+                    }
+                    ASSERT(dx*dx+dy*dy+dz*dz > cutoff*cutoff);
+                }
+            }
+            flags >>= 1;
+        }
+    }
+
+    // Check the tiles that did not have interactions to make sure all atoms are beyond the cutoff.
+
+    vector<cl_uint> tiles;
+    nb.getTiles().download(tiles);
+    for (int i = 0; i < (int) hasInteractions.size(); i++) 
+        if (!hasInteractions[i]) {
+            unsigned int tile = tiles[i];
+            unsigned int x = (tile >> 17);
+            unsigned int y = ((tile >> 2) & 0x7fff);
+            for (int atom1 = 0; atom1 < blockSize; ++atom1) {
+                mm_float4 pos1 = clcontext.getPosq()[x*blockSize+atom1];
+                for (int atom2 = 0; atom2 < blockSize; ++atom2) {
+                    mm_float4 pos2 = clcontext.getPosq()[y*blockSize+atom2];
+                    float dx = pos1.x-pos2.x;
+                    float dy = pos1.y-pos2.y;
+                    float dz = pos1.z-pos2.z;
+                    if (periodic) {
+                        dx -= (float)(floor(0.5+dx/boxSize)*boxSize);
+                        dy -= (float)(floor(0.5+dy/boxSize)*boxSize);
+                        dz -= (float)(floor(0.5+dz/boxSize)*boxSize);
+                    }
+                    ASSERT(dx*dx+dy*dy+dz*dz > cutoff*cutoff);
+                }
+            }
+        }
+}

 int main() {
    try {
@@ -609,7 +619,7 @@ int main() {
 //        testCutoff14();
 //        testPeriodic();
 //        testLargeSystem();
-//        testBlockInteractions(false);
+        testBlockInteractions(false);
 //        testBlockInteractions(true);
    }
    catch(const exception& e) {