Began implementing nonbonded forces

3976ad91 · Peter Eastman · 4c489434 · 3976ad91 · 3976ad91 · 3976ad91
Commit 3976ad91 authored Oct 01, 2009 by Peter Eastman
10 changed files
--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -28,6 +28,7 @@
 #include "OpenCLArray.h"
 #include "OpenCLForceInfo.h"
 #include "OpenCLIntegrationUtilities.h"
+#include "OpenCLNonbondedUtilities.h"
 #include "openmm/Platform.h"
 #include "openmm/System.h"
 #include <fstream>
@@ -61,8 +62,10 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex) : time(0.0), ste
    numAtoms = numParticles;
    paddedNumAtoms = TileSize*((numParticles+TileSize-1)/TileSize);
    numAtomBlocks = (paddedNumAtoms+(TileSize-1))/TileSize;
-    numTiles = numAtomBlocks*(numAtomBlocks+1)/2;
    numThreadBlocks = device.getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>()[0]/ThreadBlockSize;
+    nonbonded = new OpenCLNonbondedUtilities(*this);
+    posq = new OpenCLArray<mm_float4>(*this, paddedNumAtoms, "posq", true);
+    velm = new OpenCLArray<mm_float4>(*this, paddedNumAtoms, "velm", true);

    // Create utility kernels that are used in multiple places.

@@ -74,26 +77,25 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex) : time(0.0), ste
 OpenCLContext::~OpenCLContext() {
    for (int i = 0; i < (int) forces.size(); i++)
        delete forces[i];
-    delete posq;
-    delete velm;
-    delete force;
-    delete forceBuffers;
-    delete energyBuffer;
-    delete atomIndex;
-    delete integration;
+    if (posq != NULL)
+        delete posq;
+    if (velm != NULL)
+        delete velm;
+    if (force != NULL)
+        delete force;
+    if (forceBuffers != NULL)
+        delete forceBuffers;
+    if (energyBuffer != NULL)
+        delete energyBuffer;
+    if (atomIndex != NULL)
+        delete atomIndex;
+    if (integration != NULL)
+        delete integration;
+    if (nonbonded != NULL)
+        delete nonbonded;
 }

 void OpenCLContext::initialize(const System& system) {
-//    forceBufferPerWarp = true;
-//    numForceBuffers = numThreadBlocks*ThreadBlockSize/TileSize;
-//    if (numForceBuffers >= numAtomBlocks) {
-//        // For small systems, it is more efficient to have one force buffer per block of 32 atoms instead of one per warp.
-//
-//        forceBufferPerWarp = false;
-//        numForceBuffers = numAtomBlocks;
-//    }
-    posq = new OpenCLArray<mm_float4>(*this, paddedNumAtoms, "posq", true);
-    velm = new OpenCLArray<mm_float4>(*this, paddedNumAtoms, "velm", true);
    for (int i = 0; i < numAtoms; i++)
        (*velm)[i].w = (float) (1.0/system.getParticleMass(i));
    velm->upload();
@@ -108,6 +110,7 @@ void OpenCLContext::initialize(const System& system) {
        (*atomIndex)[i] = i;
    atomIndex->upload();
    integration = new OpenCLIntegrationUtilities(*this, system);
+    nonbonded->initialize(system);
 }

 void OpenCLContext::addForce(OpenCLForceInfo* force) {

--- a/platforms/opencl/src/OpenCLContext.h
+++ b/platforms/opencl/src/OpenCLContext.h
@@ -36,6 +36,7 @@ template <class T>
 class OpenCLArray;
 class OpenCLForceInfo;
 class OpenCLIntegrationUtilities;
+class OpenCLNonbondedUtilities;
 class System;

 /**
@@ -210,12 +211,6 @@ public:
    int getNumThreadBlocks() const {
        return numThreadBlocks;
    }
-    /**
-     * Get the total number of tiles used for nonbonded computation.
-     */
-    int getNumTiles() const {
-        return numTiles;
-    }
    /**
     * Get the number of force buffers.
     */
@@ -228,13 +223,18 @@ public:
    OpenCLIntegrationUtilities& getIntegrationUtilties() {
        return *integration;
    }
+    /**
+     * Get the OpenCLNonbondedUtilities for this context.
+     */
+    OpenCLNonbondedUtilities& getNonbondedUtilties() {
+        return *nonbonded;
+    }
 private:
    double time;
    int stepCount;
    int numAtoms;
    int paddedNumAtoms;
    int numAtomBlocks;
-    int numTiles;
    int numThreadBlocks;
    int numForceBuffers;
    cl::Context context;
@@ -251,6 +251,7 @@ private:
    OpenCLArray<cl_float>* energyBuffer;
    OpenCLArray<cl_int>* atomIndex;
    OpenCLIntegrationUtilities* integration;
+    OpenCLNonbondedUtilities* nonbonded;
 };

 } // namespace OpenMM

--- a/platforms/opencl/src/OpenCLKernelFactory.cpp
+++ b/platforms/opencl/src/OpenCLKernelFactory.cpp
@@ -46,8 +46,8 @@ KernelImpl* OpenCLKernelFactory::createKernelImpl(std::string name, const Platfo
        return new OpenCLCalcPeriodicTorsionForceKernel(name, platform, cl, context.getSystem());
    if (name == CalcRBTorsionForceKernel::Name())
        return new OpenCLCalcRBTorsionForceKernel(name, platform, cl, context.getSystem());
-//    if (name == CalcNonbondedForceKernel::Name())
-//        return new OpenCLCalcNonbondedForceKernel(name, platform, cl, context.getSystem());
+    if (name == CalcNonbondedForceKernel::Name())
+        return new OpenCLCalcNonbondedForceKernel(name, platform, cl, context.getSystem());
 //    if (name == CalcCustomNonbondedForceKernel::Name())
 //        return new OpenCLCalcCustomNonbondedForceKernel(name, platform, cl, context.getSystem());
 //    if (name == CalcGBSAOBCForceKernel::Name())

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -30,6 +30,7 @@
 #include "openmm/Context.h"
 #include "openmm/internal/ContextImpl.h"
 #include "OpenCLIntegrationUtilities.h"
+#include "OpenCLNonbondedUtilities.h"
 #include <cmath>

 using namespace OpenMM;
@@ -46,10 +47,12 @@ void OpenCLCalcForcesAndEnergyKernel::initialize(const System& system) {

 void OpenCLCalcForcesAndEnergyKernel::beginForceComputation(ContextImpl& context) {
    cl.clearBuffer(cl.getForceBuffers());
+    cl.getNonbondedUtilties().prepareInteractions();
 }

 void OpenCLCalcForcesAndEnergyKernel::finishForceComputation(ContextImpl& context) {
    cl.reduceBuffer(cl.getForceBuffers(), cl.getNumForceBuffers());
+    cl.getNonbondedUtilties().prepareInteractions();
 }

 void OpenCLCalcForcesAndEnergyKernel::beginEnergyComputation(ContextImpl& context) {
@@ -452,150 +455,127 @@ double OpenCLCalcRBTorsionForceKernel::executeEnergy(ContextImpl& context) {
    return 0.0;
 }

-//OpenCLCalcNonbondedForceKernel::~OpenCLCalcNonbondedForceKernel() {
-//}
-//
-//void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const NonbondedForce& force) {
-//    if (data.primaryKernel == NULL)
-//        data.primaryKernel = this;
-//    data.hasNonbonded = true;
-//    numParticles = force.getNumParticles();
-//    _gpuContext* gpu = data.gpu;
-//
-//    // Identify which exceptions are 1-4 interactions.
-//
-//    vector<pair<int, int> > exclusions;
-//    vector<int> exceptions;
-//    for (int i = 0; i < force.getNumExceptions(); i++) {
-//        int particle1, particle2;
-//        double chargeProd, sigma, epsilon;
-//        force.getExceptionParameters(i, particle1, particle2, chargeProd, sigma, epsilon);
-//        exclusions.push_back(pair<int, int>(particle1, particle2));
-//        if (chargeProd != 0.0 || epsilon != 0.0)
-//            exceptions.push_back(i);
+OpenCLCalcNonbondedForceKernel::~OpenCLCalcNonbondedForceKernel() {
+    if (sigmaEpsilon != NULL)
+        delete sigmaEpsilon;
+}
+
+void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const NonbondedForce& force) {
+
+    // Identify which exceptions are 1-4 interactions.
+
+    vector<pair<int, int> > exclusions;
+    vector<int> exceptions;
+    for (int i = 0; i < force.getNumExceptions(); i++) {
+        int particle1, particle2;
+        double chargeProd, sigma, epsilon;
+        force.getExceptionParameters(i, particle1, particle2, chargeProd, sigma, epsilon);
+        exclusions.push_back(pair<int, int>(particle1, particle2));
+        if (chargeProd != 0.0 || epsilon != 0.0)
+            exceptions.push_back(i);
+    }
+
+    // Initialize nonbonded interactions.
+
+    int numParticles = force.getNumParticles();
+    sigmaEpsilon = new OpenCLArray<mm_float2>(cl, numParticles, "sigmaEpsilon");
+    OpenCLArray<mm_float4>& posq = cl.getPosq();
+    vector<mm_float2> sigmaEpsilonVector(numParticles);
+    vector<vector<int> > exclusionList(numParticles);
+    for (int i = 0; i < numParticles; i++) {
+        double charge, sigma, epsilon;
+        force.getParticleParameters(i, charge, sigma, epsilon);
+        posq[i].w = (float) charge;
+        sigmaEpsilonVector[i] = (mm_float2) {(float) (0.5*sigma), (float) (2.0*sqrt(epsilon))};
+        exclusionList[i].push_back(i);
+    }
+    for (int i = 0; i < (int) exclusions.size(); i++) {
+        exclusionList[exclusions[i].first].push_back(exclusions[i].second);
+        exclusionList[exclusions[i].second].push_back(exclusions[i].first);
+    }
+    posq.upload();
+    sigmaEpsilon->upload(sigmaEpsilonVector);
+    bool useCutoff = (force.getNonbondedMethod() != NonbondedForce::NoCutoff);
+    bool usePeriodic = (force.getNonbondedMethod() != NonbondedForce::NoCutoff && force.getNonbondedMethod() != NonbondedForce::CutoffNonPeriodic);
+//    if (force.getNonbondedMethod() != NonbondedForce::NoCutoff) {
+//        method = CUTOFF;
 //    }
-//
-//    // Initialize nonbonded interactions.
-//
-//    {
-//        vector<int> particle(numParticles);
-//        vector<float> c6(numParticles);
-//        vector<float> c12(numParticles);
-//        vector<float> q(numParticles);
-//        vector<char> symbol;
-//        vector<vector<int> > exclusionList(numParticles);
-//        for (int i = 0; i < numParticles; i++) {
-//            double charge, radius, depth;
-//            force.getParticleParameters(i, charge, radius, depth);
-//            particle[i] = i;
-//            q[i] = (float) charge;
-//            c6[i] = (float) (4*depth*pow(radius, 6.0));
-//            c12[i] = (float) (4*depth*pow(radius, 12.0));
-//            exclusionList[i].push_back(i);
-//        }
-//        for (int i = 0; i < (int)exclusions.size(); i++) {
-//            exclusionList[exclusions[i].first].push_back(exclusions[i].second);
-//            exclusionList[exclusions[i].second].push_back(exclusions[i].first);
-//        }
-//        Vec3 boxVectors[3];
-//        system.getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
-//        gpuSetPeriodicBoxSize(gpu, (float)boxVectors[0][0], (float)boxVectors[1][1], (float)boxVectors[2][2]);
-//        OpenCLNonbondedMethod method = NO_CUTOFF;
-//        if (force.getNonbondedMethod() != NonbondedForce::NoCutoff) {
-//            gpuSetNonbondedCutoff(gpu, (float)force.getCutoffDistance(), force.getReactionFieldDielectric());
-//            method = CUTOFF;
-//        }
-//        if (force.getNonbondedMethod() == NonbondedForce::CutoffPeriodic) {
-//            method = PERIODIC;
-//        }
-//        if (force.getNonbondedMethod() == NonbondedForce::Ewald || force.getNonbondedMethod() == NonbondedForce::PME) {
-//            double ewaldErrorTol = force.getEwaldErrorTolerance();
-//            double alpha = (1.0/force.getCutoffDistance())*std::sqrt(-std::log(ewaldErrorTol));
-//            double mx = boxVectors[0][0]/force.getCutoffDistance();
-//            double my = boxVectors[1][1]/force.getCutoffDistance();
-//            double mz = boxVectors[2][2]/force.getCutoffDistance();
-//            double pi = 3.1415926535897932385;
-//            int kmaxx = (int)std::ceil(-(mx/pi)*std::log(ewaldErrorTol));
-//            int kmaxy = (int)std::ceil(-(my/pi)*std::log(ewaldErrorTol));
-//            int kmaxz = (int)std::ceil(-(mz/pi)*std::log(ewaldErrorTol));
-//            if (force.getNonbondedMethod() == NonbondedForce::Ewald) {
-//                if (kmaxx%2 == 0)
-//                    kmaxx++;
-//                if (kmaxy%2 == 0)
-//                    kmaxy++;
-//                if (kmaxz%2 == 0)
-//                    kmaxz++;
-//                gpuSetEwaldParameters(gpu, (float) alpha, kmaxx, kmaxy, kmaxz);
-//                method = EWALD;
-//            }
-//            else {
-//                int gridSizeX = -0.5*kmaxx*std::log(ewaldErrorTol);
-//                int gridSizeY = -0.5*kmaxy*std::log(ewaldErrorTol);
-//                int gridSizeZ = -0.5*kmaxz*std::log(ewaldErrorTol);
-////                printf("%d %d\n", gridSizeX, (int) (kmaxx*std::sqrt(-std::log(ewaldErrorTol))));
-////                gridSizeX = 0.02*mx*std::pow(-std::log(1.5*ewaldErrorTol), 3);
-////                gridSizeY = 0.02*my*std::pow(-std::log(1.5*ewaldErrorTol), 3);
-////                gridSizeZ = 0.02*mz*std::pow(-std::log(1.5*ewaldErrorTol), 3);
-////                double scale = 0.698*std::pow(ewaldErrorTol, -0.312);
-////                double scale = 0.713*std::pow(ewaldErrorTol, -0.261);
-////                printf("%f\n", scale);
-//                gridSizeX = mx*NonbondedForce::PMEscale;
-//                gridSizeY = my*NonbondedForce::PMEscale;
-//                gridSizeZ = mz*NonbondedForce::PMEscale;
-////                printf("%d %d %d\n", gridSizeX, gridSizeY, gridSizeZ);
-////                gridSizeX = mx*scale;
-////                gridSizeY = my*scale;
-////                gridSizeZ = mz*scale;
-//                gpuSetPMEParameters(gpu, (float) alpha, gridSizeX, gridSizeY, gridSizeZ);
-//                method = PARTICLE_MESH_EWALD;
-//            }
+//    if (force.getNonbondedMethod() == NonbondedForce::CutoffPeriodic) {
+//        method = PERIODIC;
+//    }
+//    if (force.getNonbondedMethod() == NonbondedForce::Ewald || force.getNonbondedMethod() == NonbondedForce::PME) {
+//        double ewaldErrorTol = force.getEwaldErrorTolerance();
+//        double alpha = (1.0/force.getCutoffDistance())*std::sqrt(-std::log(ewaldErrorTol));
+//        double mx = boxVectors[0][0]/force.getCutoffDistance();
+//        double my = boxVectors[1][1]/force.getCutoffDistance();
+//        double mz = boxVectors[2][2]/force.getCutoffDistance();
+//        double pi = 3.1415926535897932385;
+//        int kmaxx = (int)std::ceil(-(mx/pi)*std::log(ewaldErrorTol));
+//        int kmaxy = (int)std::ceil(-(my/pi)*std::log(ewaldErrorTol));
+//        int kmaxz = (int)std::ceil(-(mz/pi)*std::log(ewaldErrorTol));
+//        if (force.getNonbondedMethod() == NonbondedForce::Ewald) {
+//            if (kmaxx%2 == 0)
+//                kmaxx++;
+//            if (kmaxy%2 == 0)
+//                kmaxy++;
+//            if (kmaxz%2 == 0)
+//                kmaxz++;
+//            gpuSetEwaldParameters(gpu, (float) alpha, kmaxx, kmaxy, kmaxz);
+//            method = EWALD;
 //        }
-//        data.nonbondedMethod = method;
-//        gpuSetCoulombParameters(gpu, 138.935485f, particle, c6, c12, q, symbol, exclusionList, method);
-//
-//        // Compute the Ewald self energy.
-//
-//        data.ewaldSelfEnergy = 0.0;
-//        if (force.getNonbondedMethod() == NonbondedForce::Ewald || force.getNonbondedMethod() == NonbondedForce::PME) {
-//            double selfEnergyScale = gpu->sim.epsfac*gpu->sim.alphaEwald/std::sqrt(PI);
-//                for (int i = 0; i < numParticles; i++)
-//                    data.ewaldSelfEnergy -= selfEnergyScale*q[i]*q[i];
+//        else {
+//            int gridSizeX = -0.5*kmaxx*std::log(ewaldErrorTol);
+//            int gridSizeY = -0.5*kmaxy*std::log(ewaldErrorTol);
+//            int gridSizeZ = -0.5*kmaxz*std::log(ewaldErrorTol);
+//            gpuSetPMEParameters(gpu, (float) alpha, gridSizeX, gridSizeY, gridSizeZ);
+//            method = PARTICLE_MESH_EWALD;
 //        }
 //    }
-//
-//    // Initialize 1-4 nonbonded interactions.
-//
-//    {
-//        int numExceptions = exceptions.size();
-//        vector<int> particle1(numExceptions);
-//        vector<int> particle2(numExceptions);
-//        vector<float> c6(numExceptions);
-//        vector<float> c12(numExceptions);
-//        vector<float> q1(numExceptions);
-//        vector<float> q2(numExceptions);
-//        for (int i = 0; i < numExceptions; i++) {
-//            double charge, sig, eps;
-//            force.getExceptionParameters(exceptions[i], particle1[i], particle2[i], charge, sig, eps);
-//            c6[i] = (float) (4*eps*pow(sig, 6.0));
-//            c12[i] = (float) (4*eps*pow(sig, 12.0));
-//            q1[i] = (float) charge;
-//            q2[i] = 1.0f;
-//        }
+//    data.nonbondedMethod = method;
+//    gpuSetCoulombParameters(gpu, 138.935485f, particle, c6, c12, q, symbol, exclusionList, method);
+    cl.getNonbondedUtilties().addInteraction(useCutoff, usePeriodic, force.getCutoffDistance(), exclusionList);
+    cl.getNonbondedUtilties().addParameter("sigmaEpsilon", "float2", 8, sigmaEpsilon->getDeviceBuffer());
+
+    // Compute the Ewald self energy.
+
+    ewaldSelfEnergy = 0.0;
+    if (force.getNonbondedMethod() == NonbondedForce::Ewald || force.getNonbondedMethod() == NonbondedForce::PME) {
+//        double selfEnergyScale = gpu->sim.epsfac*gpu->sim.alphaEwald/std::sqrt(PI);
+//            for (int i = 0; i < numParticles; i++)
+//                ewaldSelfEnergy -= selfEnergyScale*q[i]*q[i];
+    }
+
+    // Initialize 1-4 nonbonded interactions.
+
+    {
+        int numExceptions = exceptions.size();
+        vector<int> particle1(numExceptions);
+        vector<int> particle2(numExceptions);
+        vector<float> c6(numExceptions);
+        vector<float> c12(numExceptions);
+        vector<float> q1(numExceptions);
+        vector<float> q2(numExceptions);
+        for (int i = 0; i < numExceptions; i++) {
+            double charge, sig, eps;
+            force.getExceptionParameters(exceptions[i], particle1[i], particle2[i], charge, sig, eps);
+            c6[i] = (float) (4*eps*pow(sig, 6.0));
+            c12[i] = (float) (4*eps*pow(sig, 12.0));
+            q1[i] = (float) charge;
+            q2[i] = 1.0f;
+        }
 //        gpuSetLJ14Parameters(gpu, 138.935485f, 1.0f, particle1, particle2, c6, c12, q1, q2);
-//    }
-//}
-//
-//void OpenCLCalcNonbondedForceKernel::executeForces(ContextImpl& context) {
-//    if (data.primaryKernel == this)
-//        calcForces(context, data);
-//}
-//
-//double OpenCLCalcNonbondedForceKernel::executeEnergy(ContextImpl& context) {
-//    if (data.primaryKernel == this)
-//        return calcEnergy(context, data, system);
-//    return 0.0;
-//}
-//
+    }
+}
+
+void OpenCLCalcNonbondedForceKernel::executeForces(ContextImpl& context) {
+    cl.getNonbondedUtilties().computeInteractions();
+}
+
+double OpenCLCalcNonbondedForceKernel::executeEnergy(ContextImpl& context) {
+    executeForces(context);
+    return ewaldSelfEnergy;
+}
+
 //OpenCLCalcCustomNonbondedForceKernel::~OpenCLCalcCustomNonbondedForceKernel() {
 //}
 //

--- a/platforms/opencl/src/OpenCLKernels.h
+++ b/platforms/opencl/src/OpenCLKernels.h
@@ -294,40 +294,40 @@ private:
    cl::Kernel kernel;
 };

-///**
-// * This kernel is invoked by NonbondedForce to calculate the forces acting on the system.
-// */
-//class OpenCLCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
-//public:
-//    OpenCLCalcNonbondedForceKernel(std::string name, const Platform& platform, OpenCLContext& cl, System& system) : CalcNonbondedForceKernel(name, platform), cl(cl), system(system) {
-//    }
-//    ~OpenCLCalcNonbondedForceKernel();
-//    /**
-//     * Initialize the kernel.
-//     *
-//     * @param system     the System this kernel will be applied to
-//     * @param force      the NonbondedForce this kernel will be used for
-//     */
-//    void initialize(const System& system, const NonbondedForce& force);
-//    /**
-//     * Execute the kernel to calculate the forces.
-//     *
-//     * @param context    the context in which to execute this kernel
-//     */
-//    void executeForces(ContextImpl& context);
-//    /**
-//     * Execute the kernel to calculate the energy.
-//     *
-//     * @param context    the context in which to execute this kernel
-//     * @return the potential energy due to the NonbondedForce
-//     */
-//    double executeEnergy(ContextImpl& context);
-//private:
-//    OpenCLContext& cl;
-//    int numParticles;
-//    System& system;
-//};
-//
+/**
+ * This kernel is invoked by NonbondedForce to calculate the forces acting on the system.
+ */
+class OpenCLCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
+public:
+    OpenCLCalcNonbondedForceKernel(std::string name, const Platform& platform, OpenCLContext& cl, System& system) : CalcNonbondedForceKernel(name, platform), cl(cl) {
+    }
+    ~OpenCLCalcNonbondedForceKernel();
+    /**
+     * Initialize the kernel.
+     *
+     * @param system     the System this kernel will be applied to
+     * @param force      the NonbondedForce this kernel will be used for
+     */
+    void initialize(const System& system, const NonbondedForce& force);
+    /**
+     * Execute the kernel to calculate the forces.
+     *
+     * @param context    the context in which to execute this kernel
+     */
+    void executeForces(ContextImpl& context);
+    /**
+     * Execute the kernel to calculate the energy.
+     *
+     * @param context    the context in which to execute this kernel
+     * @return the potential energy due to the NonbondedForce
+     */
+    double executeEnergy(ContextImpl& context);
+private:
+    OpenCLContext& cl;
+    OpenCLArray<mm_float2>* sigmaEpsilon;
+    double ewaldSelfEnergy;
+};
+
 ///**
 // * This kernel is invoked by CustomNonbondedForce to calculate the forces acting on the system.
 // */

--- a/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+++ b/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "OpenCLNonbondedUtilities.h"
+#include "OpenCLArray.h"
+#include <map>
+
+using namespace OpenMM;
+using namespace std;
+
+OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : context(context), cutoff(-1.0), useCutoff(false),
+        numForceBuffers(0), tiles(NULL), exclusionIndex(NULL), exclusions(NULL) {
+}
+
+OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() {
+    if (tiles != NULL)
+        delete tiles;
+    if (exclusionIndex != NULL)
+        delete exclusionIndex;
+    if (exclusions != NULL)
+        delete exclusions;
+}
+
+void OpenCLNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList) {
+    if (cutoff != -1.0) {
+        if (usesCutoff != useCutoff)
+            throw OpenMMException("All Forces must agree on whether to use a cutoff");
+        if (usesPeriodic != usePeriodic)
+            throw OpenMMException("All Forces must agree on whether to use periodic boundary conditions");
+        if (cutoffDistance != cutoff)
+            throw OpenMMException("All Forces must use the same cutoff distance");
+        bool sameExclusions = (exclusionList.size() == atomExclusions.size());
+        for (int i = 0; i < exclusionList.size() && sameExclusions; i++) {
+            if (exclusionList[i].size() != atomExclusions[i].size())
+                sameExclusions = false;
+            for (int j = 0; j < exclusionList[i].size(); j++)
+                if (exclusionList[i][j] != atomExclusions[i][j])
+                    sameExclusions = false;
+        }
+        if (!sameExclusions)
+            throw OpenMMException("All Forces must have identical exceptions");
+    }
+    else {
+        useCutoff = usesCutoff;
+        usePeriodic = usesPeriodic;
+        cutoff = cutoffDistance;
+        atomExclusions = exclusionList;
+    }
+}
+
+void OpenCLNonbondedUtilities::addParameter(const string& name, const string& type, int size, cl::Buffer& buffer) {
+    parameters.push_back(ParameterInfo(name, type, size, buffer));
+}
+
+void OpenCLNonbondedUtilities::initialize(const System& system) {
+    if (cutoff == -1.0)
+        return; // There are no nonbonded interactions in the System.
+    
+    // Create the list of tiles.
+
+    int numAtomBlocks = context.getNumAtomBlocks();
+    int numTiles = numAtomBlocks*(numAtomBlocks+1)/2;
+    tiles = new OpenCLArray<cl_uint>(context, numTiles, "tiles");
+    vector<cl_uint> tileVec(tiles->getSize());
+    unsigned int count = 0;
+    for (unsigned int y = 0; y < numAtomBlocks; y++)
+        for (unsigned int x = y; x < numAtomBlocks; x++)
+            tileVec[count++] = (x << 17) | (y << 2);
+
+    // Decide how many force buffers to use.
+
+    bool forceBufferPerAtomBlock = false;
+    numForceBuffers = context.getNumThreadBlocks()*OpenCLContext::ThreadBlockSize/OpenCLContext::TileSize;
+    if (numForceBuffers >= numAtomBlocks) {
+        // For small systems, it is more efficient to have one force buffer per block of 32 atoms instead of one per warp.
+
+        forceBufferPerAtomBlock = true;
+        numForceBuffers = numAtomBlocks;
+    }
+
+    // Create kernels.
+
+    cl::Program forceProgram = context.createProgram(context.loadSourceFromFile("nonbonded.cl"));
+    forceKernel = cl::Kernel(forceProgram, "computeNonbonded");
+
+    // Mark which tiles have exclusions.
+
+    for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) {
+        int x = atom1/OpenCLContext::TileSize;
+        for (int j = 0; j < (int) atomExclusions[atom1].size(); ++j) {
+            int atom2 = atomExclusions[atom1][j];
+            int y = atom2/OpenCLContext::TileSize;
+            int index = (x > y ? x+y*numAtomBlocks-y*(y+1)/2 : y+x*numAtomBlocks-x*(x+1)/2);
+            tileVec[index] |= 1;
+        }
+    }
+    if (context.getPaddedNumAtoms() > context.getNumAtoms()) {
+        int lastTile = context.getNumAtoms()/OpenCLContext::TileSize;
+        for (int i = 0; i < numTiles; ++i) {
+            int x = tileVec[i]>>17;
+            int y = (tileVec[i]>>2)&0x7FFF;
+            if (x == lastTile || y == lastTile)
+                tileVec[i] |= 1;
+        }
+    }
+
+    // Build a list of indices for the tiles with exclusions.
+
+    exclusionIndex = new OpenCLArray<cl_uint>(context, numTiles, "exclusionIndex");
+    vector<cl_uint> exclusionIndexVec(exclusionIndex->getSize());
+    int numWithExclusions = 0;
+    for (int i = 0; i < numTiles; ++i)
+        if ((tileVec[i]&1) == 1)
+            exclusionIndexVec[i] = (numWithExclusions++)*OpenCLContext::TileSize;
+
+    // Record the exclusion data.
+
+    exclusions = new OpenCLArray<cl_uint>(context, numWithExclusions*OpenCLContext::TileSize, "exclusions");
+    vector<cl_uint> exclusionVec(exclusions->getSize());
+    for (int i = 0; i < exclusions->getSize(); ++i)
+        exclusionVec[i] = 0xFFFFFFFF;
+    for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) {
+        int x = atom1/OpenCLContext::TileSize;
+        int offset1 = atom1-x*OpenCLContext::TileSize;
+        for (int j = 0; j < (int) atomExclusions[atom1].size(); ++j) {
+            int atom2 = atomExclusions[atom1][j];
+            int y = atom2/OpenCLContext::TileSize;
+            int offset2 = atom2-y*OpenCLContext::TileSize;
+            if (x > y) {
+                int tile = x+y*numAtomBlocks-y*(y+1)/2;
+                exclusionVec[exclusionIndexVec[tile]+offset1] &= 0xFFFFFFFF-(1<<offset2);
+            }
+            else {
+                int tile = y+x*numAtomBlocks-x*(x+1)/2;
+                exclusionVec[exclusionIndexVec[tile]+offset2] &= 0xFFFFFFFF-(1<<offset1);
+            }
+        }
+    }
+
+    // Mark all interactions that involve a padding atom as being excluded.
+
+    for (int atom1 = context.getNumAtoms(); atom1 < context.getPaddedNumAtoms(); ++atom1) {
+        int x = atom1/OpenCLContext::TileSize;
+        int offset1 = atom1-x*OpenCLContext::TileSize;
+        for (int atom2 = 0; atom2 < context.getPaddedNumAtoms(); ++atom2) {
+            int y = atom2/OpenCLContext::TileSize;
+            int offset2 = atom2-y*OpenCLContext::TileSize;
+            if (x >= y) {
+                int tile = x+y*numAtomBlocks-y*(y+1)/2;
+                exclusionVec[exclusionIndexVec[tile]+offset1] &= 0xFFFFFFFF-(1<<offset2);
+            }
+            if (y >= x) {
+                int tile = y+x*numAtomBlocks-x*(x+1)/2;
+                exclusionVec[exclusionIndexVec[tile]+offset2] &= 0xFFFFFFFF-(1<<offset1);
+            }
+        }
+    }
+    atomExclusions.clear(); // We won't use this again, so free the memory it used
+    tiles->upload(tileVec);
+    exclusions->upload(exclusionVec);
+    exclusionIndex->upload(exclusionIndexVec);
+    Vec3 boxVectors[3];
+    system.getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
+    periodicBoxSize = (mm_float4) {(float) boxVectors[0][0], (float) boxVectors[1][1], (float) boxVectors[2][2], 0.0f};
+}
+
+void OpenCLNonbondedUtilities::prepareInteractions() {
+    hasComputedInteractions = false;
+    if (!useCutoff)
+        return;
+    // TODO compute the neighbor list
+}
+
+void OpenCLNonbondedUtilities::computeInteractions() {
+    if (hasComputedInteractions)
+        return;
+    hasComputedInteractions = true;
+    forceKernel.setArg<cl_int>(0, tiles->getSize());
+    forceKernel.setArg<cl_int>(1, context.getPaddedNumAtoms());
+    forceKernel.setArg<cl_float>(2, cutoff*cutoff);
+    forceKernel.setArg<mm_float4>(3, periodicBoxSize);
+    forceKernel.setArg<cl::Buffer>(4, context.getForceBuffers().getDeviceBuffer());
+    forceKernel.setArg<cl::Buffer>(5, context.getEnergyBuffer().getDeviceBuffer());
+    forceKernel.setArg<cl::Buffer>(6, context.getPosq().getDeviceBuffer());
+    forceKernel.setArg<cl::Buffer>(7, tiles->getDeviceBuffer());
+    forceKernel.setArg<cl::Buffer>(8, exclusions->getDeviceBuffer());
+    forceKernel.setArg<cl::Buffer>(9, exclusionIndex->getDeviceBuffer());
+    forceKernel.setArg(10, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
+    forceKernel.setArg(11, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
+    for (int i = 0; i < (int) parameters.size(); i++) {
+        forceKernel.setArg<cl::Buffer>(i*2+12, *parameters[i].buffer);
+        forceKernel.setArg(i*2+13, OpenCLContext::ThreadBlockSize*parameters[i].size, NULL);
+    }
+    context.executeKernel(forceKernel, tiles->getSize()*OpenCLContext::TileSize);
+}
--- a/platforms/opencl/src/OpenCLNonbondedUtilities.h
+++ b/platforms/opencl/src/OpenCLNonbondedUtilities.h
+#ifndef OPENMM_OPENCLNONBONDEDUTILITIES_H_
+#define OPENMM_OPENCLNONBONDEDUTILITIES_H_
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "OpenCLContext.h"
+#include "openmm/System.h"
+#include <string>
+#include <vector>
+
+namespace OpenMM {
+
+/**
+ * This class implements features that are used by several different force.  It provides
+ * a generic interface for calculating nonbonded interactions.
+ */
+
+class OpenCLNonbondedUtilities {
+public:
+    OpenCLNonbondedUtilities(OpenCLContext& context);
+    ~OpenCLNonbondedUtilities();
+    /**
+     * Add a nonbonded interaction.
+     *
+     * @param usesCutoff     specifies whether a cutoff should be applied to this interaction
+     * @param usesPeriodic   specifies whether periodic boundary conditions should be applied to this interaction
+     * @param cutoffDistance the cutoff distance for this interaction (ignored if usesCutoff is false)
+     * @param exclusionList  for each atom, specifies the list of other atoms whose interactions should be excluded
+     */
+    void addInteraction(bool usesCutoff, bool usesPeriodic, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList);
+    /**
+     * Add a per-atom parameter that interactions may depend on.
+     *
+     * @param name      the name of the parameter
+     * @param type      the data type of the parameter
+     * @param size      the size of the parameter in bytes
+     * @param buffer    the buffer containing the parameter values
+     */
+    void addParameter(const std::string& name, const std::string& type, int size, cl::Buffer& buffer);
+    /**
+     * Initialize this object in preparation for a simulation.
+     */
+    void initialize(const System& system);
+    /**
+     * Get the number of force buffers required for nonbonded forces.
+     */
+    int getNumForceBuffers() {
+        return numForceBuffers;
+    }
+    /**
+     * Prepare to compute interactions.  This updates the neighbor list.
+     */
+    void prepareInteractions();
+    /**
+     * Compute the nonbonded interactions.  This will only be executed once after each call to
+     * prepareInteractions().  Additional calls return immediately without doing anything.
+     */
+    void computeInteractions();
+private:
+    class ParameterInfo;
+    OpenCLContext& context;
+    cl::Kernel forceKernel;
+    OpenCLArray<cl_uint>* tiles;
+    OpenCLArray<cl_uint>* exclusionIndex;
+    OpenCLArray<cl_uint>* exclusions;
+    std::vector<std::vector<int> > atomExclusions;
+    std::vector<ParameterInfo> parameters;
+    double cutoff;
+    bool useCutoff, usePeriodic, hasComputedInteractions;
+    int numForceBuffers;
+    mm_float4 periodicBoxSize;
+};
+
+class OpenCLNonbondedUtilities::ParameterInfo {
+public:
+    ParameterInfo(const std::string& name, const std::string& type, int size, cl::Buffer& buffer) :
+            name(name), type(type), size(size), buffer(&buffer) {
+    }
+    std::string name;
+    std::string type;
+    int size;
+    cl::Buffer* buffer;
+};
+
+} // namespace OpenMM
+
+#endif /*OPENMM_OPENCLNONBONDEDUTILITIES_H_*/
--- a/platforms/opencl/src/OpenCLPlatform.cpp
+++ b/platforms/opencl/src/OpenCLPlatform.cpp
@@ -51,7 +51,7 @@ OpenCLPlatform::OpenCLPlatform() {
    registerKernelFactory(CalcHarmonicAngleForceKernel::Name(), factory);
    registerKernelFactory(CalcPeriodicTorsionForceKernel::Name(), factory);
    registerKernelFactory(CalcRBTorsionForceKernel::Name(), factory);
-//    registerKernelFactory(CalcNonbondedForceKernel::Name(), factory);
+    registerKernelFactory(CalcNonbondedForceKernel::Name(), factory);
 //    registerKernelFactory(CalcCustomNonbondedForceKernel::Name(), factory);
 //    registerKernelFactory(CalcGBSAOBCForceKernel::Name(), factory);
    registerKernelFactory(IntegrateVerletStepKernel::Name(), factory);

--- a/platforms/opencl/src/kernels/nonbonded.cl
+++ b/platforms/opencl/src/kernels/nonbonded.cl
+const unsigned int TileSize = 32;
+const float EpsilonFactor = 138.935485f;
+
+/**
+ * Compute nonbonded interactions.
+ */
+
+__kernel void computeNonbonded(int numTiles, int paddedNumAtoms, float cutoffSquared, float4 periodicBoxSize,
+        __global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* tiles,
+        __global unsigned int* exclusions,  __global unsigned int* exclusionIndices, __local float4* local_posq, __local float4* local_force,
+        __global float2* sigmaEpsilon, __local float2* local_sigmaEpsilon) {
+    unsigned int totalWarps = get_global_size(0)/TileSize;
+    unsigned int warp = get_global_id(0)/TileSize;
+    unsigned int pos = warp*numTiles/totalWarps;
+    unsigned int end = (warp+1)*numTiles/totalWarps;
+    float energy = 0.0f;
+#ifdef USE_CUTOFF
+    float3* tempBuffer = (float3*) &sA[cSim.nonbond_threads_per_block];
+#endif
+    unsigned int lasty = 0xFFFFFFFF;
+
+    while (pos < end) {
+        // Extract tile coordinates from appropriate work unit
+        unsigned int x = tiles[pos];
+        unsigned int y = ((x >> 2) & 0x7fff)*TileSize;
+        bool hasExclusions = (x & 0x1);
+        x = (x>>17)*TileSize;
+        float4 apos;   // Local atom x, y, z, q
+        float4 af = 0.0f;     // Local atom fx, fy, fz
+        unsigned int tgx = get_local_id(0) & (TileSize-1);
+        unsigned int tbx = get_local_id(0) - tgx;
+        unsigned int tj = tgx;
+        unsigned int i = x + tgx;
+        apos = posq[i];
+        float2 a = sigmaEpsilon[i];
+        if (x == y) {
+            // Handle diagonals uniquely at 50% efficiency
+            // Read fixed atom data into registers and GRF
+
+            local_posq[get_local_id(0)] = apos;
+            local_sigmaEpsilon[get_local_id(0)] = a;
+            barrier(CLK_LOCAL_MEM_FENCE);
+            apos.w *= EpsilonFactor;
+            unsigned int xi = x/TileSize;
+            unsigned int tile = xi+xi*paddedNumAtoms/TileSize-xi*(xi+1)/2;
+            unsigned int excl = exclusions[exclusionIndices[tile]+tgx];
+            for (unsigned int j = 0; j < TileSize; j++) {
+                bool isExcluded = !(excl & 0x1);
+                float4 delta = (float4) (local_posq[tbx+j].xyz - apos.xyz, 0.0f);
+#ifdef USE_PERIODIC
+                delta.x -= floor(delta.x/periodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                delta.y -= floor(delta.y/periodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                delta.z -= floor(delta.z/periodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+                float r2 = delta.x * delta.x + delta.y * delta.y + delta.z * delta.z;
+                float invR = 1.0f / sqrt(r2);
+                float sig = a.x + local_sigmaEpsilon[tbx+j].x;
+                float sig2 = invR * sig;
+                sig2 *= sig2;
+                float sig6 = sig2 * sig2 * sig2;
+                float eps = a.y * local_sigmaEpsilon[tbx+j].y;
+                float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
+                float tempEnergy = eps * (sig6 - 1.0f) * sig6;
+#ifdef USE_CUTOFF
+                dEdR += apos.w * local_posq[tbx+j].w * (invR - 2.0f * cSim.reactionFieldK * r2);
+                tempEnergy += apos.w * local_posq[tbx+j].w * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
+#else
+                dEdR += apos.w * local_posq[tbx+j].w * invR;
+                tempEnergy += apos.w * local_posq[tbx+j].w * invR;
+#endif
+                dEdR *= invR * invR;
+#ifdef USE_CUTOFF
+                if (isExcluded || r2 > cutoffSquared) {
+#else
+                if (isExcluded) {
+#endif
+                    dEdR = 0.0f;
+                    tempEnergy  = 0.0f;
+                }
+                energy += 0.5f*tempEnergy;
+                delta.xyz *= dEdR;
+                af.xyz -= delta.xyz;
+                excl >>= 1;
+            }
+
+            // Write results
+            float4 of;
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+            unsigned int offset = x + tgx + warp*paddedNumAtoms;
+            of = forceBuffers[offset];
+            of.xyz += af.xyz;
+            forceBuffers[offset] = of;
+#else
+            of.xyz = af.xyz;
+            of.w = 0.0f;
+            unsigned int offset = x + tgx + (x/TileSize) * paddedNumAtoms;
+            forceBuffers[offset] = of;
+#endif
+        }
+        else {
+            // 100% utilization
+            // Read fixed atom data into registers and GRF
+            if (lasty != y) {
+                unsigned int j = y + tgx;
+                float2 temp1 = sigmaEpsilon[j];
+                local_posq[get_local_id(0)] = posq[j];
+                local_sigmaEpsilon[get_local_id(0)] = sigmaEpsilon[j];
+            }
+            local_force[get_local_id(0)] = 0.0f;
+            barrier(CLK_LOCAL_MEM_FENCE);
+            apos.w *= EpsilonFactor;
+#ifdef USE_CUTOFF
+            unsigned int flags = cSim.pInteractionFlag[pos];
+            if (!hasExclusions && flags != 0xFFFFFFFF) {
+                if (flags == 0) {
+                    // No interactions in this tile.
+                }
+                else {
+                    // Compute only a subset of the interactions in this tile.
+
+                    for (unsigned int j = 0; j < TileSize; j++) {
+                        if ((flags&(1<<j)) != 0) {
+                            bool isExcluded = false;
+                            float4 delta = (float4) (local_posq[tbx+j].xyz - apos.xyz, 0.0f);
+#ifdef USE_PERIODIC
+                            delta.x -= floor(delta.x/periodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                            delta.y -= floor(delta.y/periodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                            delta.z -= floor(delta.z/periodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+                            float r2 = delta.x * delta.x + delta.y * delta.y + delta.z * delta.z;
+                            float invR = 1.0f / sqrt(r2);
+                            float sig = a.x + local_sigmaEpsilon[tbx+j].x;
+                            float sig2 = invR * sig;
+                            sig2 *= sig2;
+                            float sig6 = sig2 * sig2 * sig2;
+                            float eps = a.y * local_sigmaEpsilon[tbx+j].Y;
+                            float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
+			    float tempEnergy = eps * (sig6 - 1.0f) * sig6;
+#ifdef USE_CUTOFF
+                            dEdR += apos.w * local_posq[tbx+j].w * (invR - 2.0f * cSim.reactionFieldK * r2);
+                            tempEnergy += apos.w * local_posq[tbx+j].w * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
+#else
+                            dEdR += apos.w * local_posq[tbx+j].w * invR;
+                            tempEnergy += apos.w * local_posq[tbx+j].w * invR;
+#endif
+                            dEdR *= invR * invR;
+#ifdef USE_CUTOFF
+                            if (r2 > cutoffSquared) {
+                                dEdR = 0.0f;
+				tempEnergy = 0.0f;
+                            }
+#endif
+			    energy += tempEnergy;
+                            delta.xyz *= dEdR;
+                            af.xyz -= delta.xyz;
+                            tempBuffer[get_local_id(0)] = delta;
+
+                            // Sum the forces on atom j.
+
+                            if (tgx % 2 == 0)
+                                tempBuffer[get_local_id(0)].xyz += tempBuffer[get_local_id(0)+1].xyz;
+                            barrier(CLK_LOCAL_MEM_FENCE);
+                            if (tgx % 4 == 0)
+                                tempBuffer[get_local_id(0)].xyz += tempBuffer[get_local_id(0)+2].xyz;
+                            barrier(CLK_LOCAL_MEM_FENCE);
+                            if (tgx % 8 == 0)
+                                tempBuffer[get_local_id(0)].xyz += tempBuffer[get_local_id(0)+4].xyz;
+                            barrier(CLK_LOCAL_MEM_FENCE);
+                            if (tgx % 16 == 0)
+                                tempBuffer[get_local_id(0)].xyz += tempBuffer[get_local_id(0)+8].xyz;
+                            barrier(CLK_LOCAL_MEM_FENCE);
+                            if (tgx == 0)
+                                local_force[tbx+j].xyz += tempBuffer[get_local_id(0)].xyz + tempBuffer[get_local_id(0)+16].xyz;
+                            barrier(CLK_LOCAL_MEM_FENCE);
+                        }
+                    }
+                }
+            }
+            else  // bExclusion
+#endif
+            {
+                // Read fixed atom data into registers and GRF
+                unsigned int xi = x/TileSize;
+                unsigned int yi = y/TileSize;
+                unsigned int tile = xi+yi*paddedNumAtoms/TileSize-yi*(yi+1)/2;
+                unsigned int excl = exclusions[exclusionIndices[tile]+tgx];
+                excl = (excl >> tgx) | (excl << (TileSize - tgx));
+                for (unsigned int j = 0; j < TileSize; j++) {
+                    bool isExcluded = !(excl & 0x1);
+                    float4 delta = (float4) (local_posq[tbx+tj].xyz - apos.xyz, 0.0f);
+#ifdef USE_PERIODIC
+                    delta.x -= floor(delta.x/periodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                    delta.y -= floor(delta.y/periodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                    delta.z -= floor(delta.z/periodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+                    float r2 = delta.x * delta.x + delta.y * delta.y + delta.z * delta.z;
+                    float invR = 1.0f / sqrt(r2);
+                    float sig = a.x + local_sigmaEpsilon[tbx+j].x;
+                    float sig2 = invR * sig;
+                    sig2 *= sig2;
+                    float sig6 = sig2 * sig2 * sig2;
+                    float eps = a.y * local_sigmaEpsilon[tbx+j].y;
+                    float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
+		    float tempEnergy = eps * (sig6 - 1.0f) * sig6;
+#ifdef USE_CUTOFF
+                    dEdR += apos.w * local_posq[tbx+j].w * (invR - 2.0f * cSim.reactionFieldK * r2);
+		    tempEnergy += apos.w * local_posq[tbx+j].w * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
+#else
+                    dEdR += apos.w * local_posq[tbx+j].w * invR;
+                    tempEnergy += apos.w * local_posq[tbx+j].w * invR;
+#endif
+                    dEdR *= invR * invR;
+#ifdef USE_CUTOFF
+                    if (isExcluded || r2 > cutoffSquared) {
+#else
+                    if (isExcluded) {
+#endif
+                        dEdR = 0.0f;
+			tempEnergy  = 0.0f;
+                    }
+		    energy += tempEnergy;
+                    delta.xyz *= dEdR;
+                    af.xyz -= delta.xyz;
+                    local_force[tbx+tj].xyz += delta.xyz;
+                    barrier(CLK_LOCAL_MEM_FENCE);
+                    excl >>= 1;
+                    tj = (tj + 1) & (TileSize - 1);
+                }
+            }
+
+            // Write results
+            float4 of;
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+            unsigned int offset = x + tgx + warp*paddedNumAtoms;
+            of = forceBuffers[offset];
+            of.xyz += af.xyz;
+            forceBuffers[offset] = of;
+            offset = y + tgx + warp*paddedNumAtoms;
+            of = forceBuffers[offset];
+            of.xyz += local_foce[get_local_id(0)].xyz;
+            forceBuffers[offset] = of;
+#else
+            of.xyz = af.xyz;
+            of.w = 0.0f;
+            unsigned int offset = x + tgx + (y/TileSize) * paddedNumAtoms;
+            forceBuffers[offset] = of;
+            of = local_force[get_local_id(0)];
+            offset = y + tgx + (x/TileSize) * paddedNumAtoms;
+            forceBuffers[offset] = of;
+#endif
+            lasty = y;
+        }
+        pos++;
+    }
+    energyBuffer[get_global_id(0)] += energy;
+}
--- a/platforms/opencl/tests/TestOpenCLNonbondedForce.cpp
+++ b/platforms/opencl/tests/TestOpenCLNonbondedForce.cpp