resolved conflict

73183c61 · ChayaSt · 0e218233 · 32e08b87 · 73183c61 · 73183c61
Commit 73183c61 authored May 31, 2016 by ChayaSt
20 changed files
--- a/platforms/cuda/tests/TestCudaSort.cpp
+++ b/platforms/cuda/tests/TestCudaSort.cpp
@@ -66,7 +66,7 @@ void verifySorting(vector<float> array) {
    system.addParticle(0.0);
    CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), "false",
            platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()),
-            platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), 1);
+            platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), "false", 1);
    CudaContext& context = *platformData.contexts[0];
    context.initialize();
    CudaArray data(context, array.size(), 4, "sortData");

--- a/platforms/opencl/include/OpenCLContext.h
+++ b/platforms/opencl/include/OpenCLContext.h
@@ -408,6 +408,18 @@ public:
    void setStepsSinceReorder(int steps) {
        stepsSinceReorder = steps;
    }
+    /**
+     * Get the flag that marks whether the current force evaluation is valid.
+     * OpenCLNonbondedUtilities::updateNeighborListSize() sets the flag to false
+     * after it finds the neighbor list arrays were too small and enlarges them.
+     *
+     * @return the current value of the forcesValid flag
+     */
+    bool getForcesValid() const {
+        return forcesValid;
+    }
+    /**
+     * Set the flag that marks whether the current force evaluation is valid.
+     *
+     * @param valid    the new value of the flag
+     */
+    void setForcesValid(bool valid) {
+        forcesValid = valid;
+    }
    /**
     * Get the number of atoms.
     */
@@ -684,7 +696,7 @@ private:
    int numThreadBlocks;
    int numForceBuffers;
    int simdWidth;
-    bool supports64BitGlobalAtomics, supportsDoublePrecision, useDoublePrecision, useMixedPrecision, atomsWereReordered, boxIsTriclinic;
+    bool supports64BitGlobalAtomics, supportsDoublePrecision, useDoublePrecision, useMixedPrecision, atomsWereReordered, boxIsTriclinic, forcesValid;
    mm_float4 periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ;
    mm_double4 periodicBoxSizeDouble, invPeriodicBoxSizeDouble, periodicBoxVecXDouble, periodicBoxVecYDouble, periodicBoxVecZDouble;
    std::string defaultOptimizationOptions;

--- a/platforms/opencl/include/OpenCLKernels.h
+++ b/platforms/opencl/include/OpenCLKernels.h
@@ -156,7 +156,7 @@ public:
     * @param b      the vector defining the second edge of the periodic box
     * @param c      the vector defining the third edge of the periodic box
     */
-    void setPeriodicBoxVectors(ContextImpl& context, const Vec3& a, const Vec3& b, const Vec3& c) const;
+    void setPeriodicBoxVectors(ContextImpl& context, const Vec3& a, const Vec3& b, const Vec3& c);
    /**
     * Create a checkpoint recording the current state of the Context.
     * 
@@ -698,7 +698,7 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const CustomNonbondedForce& force);
 private:
-    void initInteractionGroups(const CustomNonbondedForce& force, const std::string& interactionSource);
+    void initInteractionGroups(const CustomNonbondedForce& force, const std::string& interactionSource, const std::vector<std::string>& tableTypes);
    OpenCLContext& cl;
    OpenCLParameterSet* params;
    OpenCLArray* globals;

--- a/platforms/opencl/include/OpenCLNonbondedUtilities.h
+++ b/platforms/opencl/include/OpenCLNonbondedUtilities.h
@@ -281,6 +281,9 @@ private:
    OpenCLArray* oldPositions;
    OpenCLArray* rebuildNeighborList;
    OpenCLSort* blockSorter;
+    cl::Event downloadCountEvent;
+    cl::Buffer* pinnedCountBuffer;
+    int* pinnedCountMemory;
    std::vector<std::vector<int> > atomExclusions;
    std::vector<ParameterInfo> parameters;
    std::vector<ParameterInfo> arguments;

--- a/platforms/opencl/include/OpenCLPlatform.h
+++ b/platforms/opencl/include/OpenCLPlatform.h
@@ -58,14 +58,14 @@ public:
     * This is the name of the parameter for selecting which OpenCL device or devices to use.
     */
    static const std::string& OpenCLDeviceIndex() {
-        static const std::string key = "OpenCLDeviceIndex";
+        static const std::string key = "DeviceIndex";
        return key;
    }
    /**
     * This is the name of the parameter that reports the OpenCL device or devices being used.
     * The key was renamed from "OpenCLDeviceName" to "DeviceName"; the OpenCLPlatform
     * constructor registers the old name in deprecatedPropertyReplacements so that it
     * maps to this key.
     */
    static const std::string& OpenCLDeviceName() {
-        static const std::string key = "OpenCLDeviceName";
+        static const std::string key = "DeviceName";
        return key;
    }
    /**
@@ -86,21 +86,21 @@ public:
     * This is the name of the parameter for selecting what numerical precision to use.
     */
    static const std::string& OpenCLPrecision() {
-        static const std::string key = "OpenCLPrecision";
+        static const std::string key = "Precision";
        return key;
    }
    /**
     * This is the name of the parameter for selecting whether to use the CPU based PME calculation.
     * The key was renamed from "OpenCLUseCpuPme" to "UseCpuPme"; the OpenCLPlatform
     * constructor registers the old name in deprecatedPropertyReplacements so that it
     * maps to this key.
     */
    static const std::string& OpenCLUseCpuPme() {
-        static const std::string key = "OpenCLUseCpuPme";
+        static const std::string key = "UseCpuPme";
        return key;
    }
    /**
     * This is the name of the parameter for selecting whether to disable use of a separate stream for PME.
     * The key was renamed from "OpenCLDisablePmeStream" to "DisablePmeStream"; the
     * OpenCLPlatform constructor registers the old name in deprecatedPropertyReplacements
     * so that it maps to this key.
     */
    static const std::string& OpenCLDisablePmeStream() {
-        static const std::string key = "OpenCLDisablePmeStream";
+        static const std::string key = "DisablePmeStream";
        return key;
    }
 };

--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2015 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2016 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -84,7 +84,7 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
        useMixedPrecision = false;
    }
    else
-        throw OpenMMException("Illegal value for OpenCLPrecision: "+precision);
+        throw OpenMMException("Illegal value for Precision: "+precision);
    try {
        contextIndex = platformData.contexts.size();
        std::vector<cl::Platform> platforms;
@@ -105,7 +105,7 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
            vector<cl::Device> devices;
            platforms[j].getDevices(CL_DEVICE_TYPE_ALL, &devices);
            if (deviceIndex < -1 || deviceIndex >= (int) devices.size())
-                throw OpenMMException("Illegal value for OpenCLDeviceIndex: "+intToString(deviceIndex));
+                throw OpenMMException("Illegal value for DeviceIndex: "+intToString(deviceIndex));

            for (int i = 0; i < (int) devices.size(); i++) {
                // If they supplied a valid deviceIndex, we only look through that one
@@ -113,6 +113,11 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
                    continue;
                if (platformVendor == "Apple" && (devices[i].getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU))
                    continue; // The CPU device on OS X won't work correctly.
+                if (useMixedPrecision || useDoublePrecision) {
+                    bool supportsDouble = (devices[i].getInfo<CL_DEVICE_EXTENSIONS>().find("cl_khr_fp64") != string::npos);
+                    if (!supportsDouble)
+                        continue; // This device does not support double precision.
+                }
                int maxSize = devices[i].getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>()[0];
                int processingElementsPerComputeUnit = 8;
                if (devices[i].getInfo<CL_DEVICE_TYPE>() != CL_DEVICE_TYPE_GPU) {
@@ -1047,7 +1052,6 @@ void OpenCLContext::reorderAtoms() {
        reorderAtomsImpl<cl_float, mm_float4, cl_double, mm_double4>();
    else
        reorderAtomsImpl<cl_float, mm_float4, cl_float, mm_float4>();
-    nonbonded->updateNeighborListSize();
 }

 template <class Real, class Real4, class Mixed, class Mixed4>

--- a/platforms/opencl/src/OpenCLExpressionUtilities.cpp
+++ b/platforms/opencl/src/OpenCLExpressionUtilities.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2015 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2016 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -174,8 +174,8 @@ void OpenCLExpressionUtilities::processExpression(stringstream& out, const Expre
                    out << "if (x >= " << paramsFloat[2] << " && x <= " << paramsFloat[3] << " && y >= " << paramsFloat[4] << " && y <= " << paramsFloat[5] << ") {\n";
                    out << "x = (x - " << paramsFloat[2] << ")*" << paramsFloat[6] << ";\n";
                    out << "y = (y - " << paramsFloat[4] << ")*" << paramsFloat[7] << ";\n";
-                    out << "int s = min((int) floor(x), " << paramsInt[0] << ");\n";
-                    out << "int t = min((int) floor(y), " << paramsInt[1] << ");\n";
+                    out << "int s = min((int) floor(x), " << paramsInt[0] << "-1);\n";
+                    out << "int t = min((int) floor(y), " << paramsInt[1] << "-1);\n";
                    out << "int coeffIndex = 4*(s+" << paramsInt[0] << "*t);\n";
                    out << "float4 c[4];\n";
                    for (int j = 0; j < 4; j++)
@@ -217,9 +217,9 @@ void OpenCLExpressionUtilities::processExpression(stringstream& out, const Expre
                    out << "x = (x - " << paramsFloat[3] << ")*" << paramsFloat[9] << ";\n";
                    out << "y = (y - " << paramsFloat[5] << ")*" << paramsFloat[10] << ";\n";
                    out << "z = (z - " << paramsFloat[7] << ")*" << paramsFloat[11] << ";\n";
-                    out << "int s = min((int) floor(x), " << paramsInt[0] << ");\n";
-                    out << "int t = min((int) floor(y), " << paramsInt[1] << ");\n";
-                    out << "int u = min((int) floor(z), " << paramsInt[2] << ");\n";
+                    out << "int s = min((int) floor(x), " << paramsInt[0] << "-1);\n";
+                    out << "int t = min((int) floor(y), " << paramsInt[1] << "-1);\n";
+                    out << "int u = min((int) floor(z), " << paramsInt[2] << "-1);\n";
                    out << "int coeffIndex = 16*(s+" << paramsInt[0] << "*(t+" << paramsInt[1] << "*u));\n";
                    out << "float4 c[16];\n";
                    for (int j = 0; j < 16; j++)
@@ -254,7 +254,7 @@ void OpenCLExpressionUtilities::processExpression(stringstream& out, const Expre
                            for (int k = 3; k >= 0; k--)
                                for (int m = 0; m < 4; m++) {
                                    int base = 4*m;
-                                    string suffix = suffixes[m];
+                                    string suffix = suffixes[k];
                                    out << "derivy[" << m << "] = da*derivy[" << m << "] + (3*c[" << (base+3) << "]" << suffix << "*db + 2*c[" << (base+2) << "]" << suffix << ")*db + c[" << (base+1) << "]" << suffix << ";\n";
                                }
                            out << nodeNames[j] << " = derivy[0] + dc*(derivy[1] + dc*(derivy[2] + dc*derivy[3]));\n";
@@ -271,7 +271,7 @@ void OpenCLExpressionUtilities::processExpression(stringstream& out, const Expre
                            out << nodeNames[j] << " *= " << paramsFloat[11] << ";\n";
                        }
                        else
-                            throw OpenMMException("Unsupported derivative order for Continuous2DFunction");
+                            throw OpenMMException("Unsupported derivative order for Continuous3DFunction");
                    }
                    out << "}\n";
                }

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
--- a/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+++ b/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
@@ -57,7 +57,7 @@ private:
 OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : context(context), useCutoff(false), usePeriodic(false), anyExclusions(false), usePadding(true),
        numForceBuffers(0), exclusionIndices(NULL), exclusionRowIndices(NULL), exclusionTiles(NULL), exclusions(NULL), interactingTiles(NULL), interactingAtoms(NULL),
        interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), sortedBlocks(NULL), sortedBlockCenter(NULL), sortedBlockBoundingBox(NULL),
-        oldPositions(NULL), rebuildNeighborList(NULL), blockSorter(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0) {
+        oldPositions(NULL), rebuildNeighborList(NULL), blockSorter(NULL), pinnedCountBuffer(NULL), pinnedCountMemory(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0) {
    // Decide how many thread blocks and force buffers to use.

    deviceIsCpu = (context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
@@ -90,6 +90,8 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
            numForceBuffers = numForceThreadBlocks*forceThreadBlockSize/OpenCLContext::TileSize;
        }
    }
+    pinnedCountBuffer = new cl::Buffer(context.getContext(), CL_MEM_ALLOC_HOST_PTR, sizeof(int));
+    pinnedCountMemory = (int*) context.getQueue().enqueueMapBuffer(*pinnedCountBuffer, CL_TRUE, CL_MAP_READ, 0, sizeof(int));
 }

 OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() {
@@ -123,6 +125,8 @@ OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() {
        delete rebuildNeighborList;
    if (blockSorter != NULL)
        delete blockSorter;
+    if (pinnedCountBuffer != NULL)
+        delete pinnedCountBuffer;
 }

 void OpenCLNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup) {
@@ -357,20 +361,16 @@ void OpenCLNonbondedUtilities::prepareInteractions(int forceGroups) {

    if (lastCutoff != kernels.cutoffDistance)
        forceRebuildNeighborList = true;
-    bool rebuild = false;
-    do {
-        setPeriodicBoxArgs(context, kernels.findBlockBoundsKernel, 1);
-        context.executeKernel(kernels.findBlockBoundsKernel, context.getNumAtoms());
-        blockSorter->sort(*sortedBlocks);
-        kernels.sortBoxDataKernel.setArg<cl_int>(9, forceRebuildNeighborList);
-        context.executeKernel(kernels.sortBoxDataKernel, context.getNumAtoms());
-        setPeriodicBoxArgs(context, kernels.findInteractingBlocksKernel, 0);
-        context.executeKernel(kernels.findInteractingBlocksKernel, context.getNumAtoms(), interactingBlocksThreadBlockSize);
-        forceRebuildNeighborList = false;
-        if (context.getComputeForceCount() == 1)
-            rebuild = updateNeighborListSize(); // This is the first time step, so check whether our initial guess was large enough.
-    } while (rebuild);
+    setPeriodicBoxArgs(context, kernels.findBlockBoundsKernel, 1);
+    context.executeKernel(kernels.findBlockBoundsKernel, context.getNumAtoms());
+    blockSorter->sort(*sortedBlocks);
+    kernels.sortBoxDataKernel.setArg<cl_int>(9, forceRebuildNeighborList);
+    context.executeKernel(kernels.sortBoxDataKernel, context.getNumAtoms());
+    setPeriodicBoxArgs(context, kernels.findInteractingBlocksKernel, 0);
+    context.executeKernel(kernels.findInteractingBlocksKernel, context.getNumAtoms(), interactingBlocksThreadBlockSize);
+    forceRebuildNeighborList = false;
    lastCutoff = kernels.cutoffDistance;
+    context.getQueue().enqueueReadBuffer(interactionCount->getDeviceBuffer(), CL_FALSE, 0, sizeof(int), pinnedCountMemory, NULL, &downloadCountEvent); 
 }

 void OpenCLNonbondedUtilities::computeInteractions(int forceGroups, bool includeForces, bool includeEnergy) {
@@ -385,20 +385,22 @@ void OpenCLNonbondedUtilities::computeInteractions(int forceGroups, bool include
            setPeriodicBoxArgs(context, kernel, 9);
        context.executeKernel(kernel, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
    }
+    if (useCutoff && numTiles > 0) {
+        downloadCountEvent.wait();
+        updateNeighborListSize();
+    }
 }

 bool OpenCLNonbondedUtilities::updateNeighborListSize() {
    if (!useCutoff)
        return false;
-    unsigned int* pinnedInteractionCount = (unsigned int*) context.getPinnedBuffer();
-    interactionCount->download(pinnedInteractionCount);
-    if (pinnedInteractionCount[0] <= (unsigned int) interactingTiles->getSize())
+    if (pinnedCountMemory[0] <= (unsigned int) interactingTiles->getSize())
        return false;

    // The most recent timestep had too many interactions to fit in the arrays.  Make the arrays bigger to prevent
    // this from happening in the future.

-    int maxTiles = (int) (1.2*pinnedInteractionCount[0]);
+    int maxTiles = (int) (1.2*pinnedCountMemory[0]);
    int totalTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2;
    if (maxTiles > totalTiles)
        maxTiles = totalTiles;
@@ -430,6 +432,7 @@ bool OpenCLNonbondedUtilities::updateNeighborListSize() {
        kernels.findInteractingBlocksKernel.setArg<cl_uint>(9, maxTiles);
    }
    forceRebuildNeighborList = true;
+    context.setForcesValid(false);
    return true;
 }


--- a/platforms/opencl/src/OpenCLPlatform.cpp
+++ b/platforms/opencl/src/OpenCLPlatform.cpp
@@ -56,6 +56,11 @@ extern "C" OPENMM_EXPORT_OPENCL void registerPlatforms() {
 #endif

 OpenCLPlatform::OpenCLPlatform() {
+    deprecatedPropertyReplacements["OpenCLDeviceIndex"] = OpenCLDeviceIndex();
+    deprecatedPropertyReplacements["OpenCLDeviceName"] = OpenCLDeviceName();
+    deprecatedPropertyReplacements["OpenCLPrecision"] = OpenCLPrecision();
+    deprecatedPropertyReplacements["OpenCLUseCpuPme"] = OpenCLUseCpuPme();
+    deprecatedPropertyReplacements["OpenCLDisablePmeStream"] = OpenCLDisablePmeStream();
    OpenCLKernelFactory* factory = new OpenCLKernelFactory();
    registerKernelFactory(CalcForcesAndEnergyKernel::Name(), factory);
    registerKernelFactory(UpdateStateDataKernel::Name(), factory);
@@ -139,7 +144,10 @@ bool OpenCLPlatform::isPlatformSupported() {
 const string& OpenCLPlatform::getPropertyValue(const Context& context, const string& property) const {
    const ContextImpl& impl = getContextImpl(context);
    const PlatformData* data = reinterpret_cast<const PlatformData*>(impl.getPlatformData());
-    map<string, string>::const_iterator value = data->propertyValues.find(property);
+    string propertyName = property;
+    if (deprecatedPropertyReplacements.find(property) != deprecatedPropertyReplacements.end())
+        propertyName = deprecatedPropertyReplacements.find(property)->second;
+    map<string, string>::const_iterator value = data->propertyValues.find(propertyName);
    if (value != data->propertyValues.end())
        return value->second;
    return Platform::getPropertyValue(context, property);

--- a/platforms/opencl/src/kernels/angleForce.cl
+++ b/platforms/opencl/src/kernels/angleForce.cl
 real4 v0 = pos2-pos1;
 real4 v1 = pos2-pos3;
+#if APPLY_PERIODIC
+APPLY_PERIODIC_TO_DELTA(v0)
+APPLY_PERIODIC_TO_DELTA(v1)
+#endif
 real4 cp = cross(v0, v1);
 real rp = cp.x*cp.x + cp.y*cp.y + cp.z*cp.z;
 rp = max(SQRT(rp), (real) 1.0e-06f);

--- a/platforms/opencl/src/kernels/bondForce.cl
+++ b/platforms/opencl/src/kernels/bondForce.cl
 real4 delta = pos2-pos1;
+#if APPLY_PERIODIC
+APPLY_PERIODIC_TO_DELTA(delta)
+#endif
 real r = SQRT(delta.x*delta.x + delta.y*delta.y + delta.z*delta.z);
 COMPUTE_FORCE
 dEdR = (r > 0.0f) ? (dEdR / r) : 0.0f;

--- a/platforms/opencl/src/kernels/cmapTorsionForce.cl
+++ b/platforms/opencl/src/kernels/cmapTorsionForce.cl
@@ -5,6 +5,11 @@ const real PI = 3.14159265358979323846f;
 real4 v0a = (real4) (pos1.xyz-pos2.xyz, 0.0f);
 real4 v1a = (real4) (pos3.xyz-pos2.xyz, 0.0f);
 real4 v2a = (real4) (pos3.xyz-pos4.xyz, 0.0f);
+#if APPLY_PERIODIC
+APPLY_PERIODIC_TO_DELTA(v0a)
+APPLY_PERIODIC_TO_DELTA(v1a)
+APPLY_PERIODIC_TO_DELTA(v2a)
+#endif
 real4 cp0a = cross(v0a, v1a);
 real4 cp1a = cross(v1a, v2a);
 real cosangle = dot(normalize(cp0a), normalize(cp1a));
@@ -28,6 +33,11 @@ angleA = fmod(angleA+2.0f*PI, 2.0f*PI);
 real4 v0b = (real4) (pos5.xyz-pos6.xyz, 0.0f);
 real4 v1b = (real4) (pos7.xyz-pos6.xyz, 0.0f);
 real4 v2b = (real4) (pos7.xyz-pos8.xyz, 0.0f);
+#if APPLY_PERIODIC
+APPLY_PERIODIC_TO_DELTA(v0b)
+APPLY_PERIODIC_TO_DELTA(v1b)
+APPLY_PERIODIC_TO_DELTA(v2b)
+#endif
 real4 cp0b = cross(v0b, v1b);
 real4 cp1b = cross(v1b, v2b);
 cosangle = dot(normalize(cp0b), normalize(cp1b));

--- a/platforms/opencl/src/kernels/customCentroidBond.cl
+++ b/platforms/opencl/src/kernels/customCentroidBond.cl
@@ -70,8 +70,11 @@ __kernel void computeGroupCenters(__global const real4* restrict posq, __global
 /**
 * Compute the difference between two vectors, setting the fourth component to the squared magnitude.
 *
 * The result is vec1-vec2 in x, y, z. When periodic is true, APPLY_PERIODIC_TO_DELTA is
 * applied to the difference before the squared magnitude is stored in w.
 *
 * The periodicBoxSize, invPeriodicBoxSize and periodicBoxVec* arguments are never named
 * in the body; presumably the APPLY_PERIODIC_TO_DELTA expansion reads them by name
 * (TODO confirm against the macro definition).
 *
 * NOTE(review): the unbraced "if (periodic)" guards the whole wrap only if the macro
 * expands to a single statement or a braced block; confirm against its definition.
 */
-real4 delta(real4 vec1, real4 vec2) {
+real4 delta(real4 vec1, real4 vec2, bool periodic, real4 periodicBoxSize, real4 invPeriodicBoxSize, 
+        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
    real4 result = (real4) (vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0);
+    if (periodic)
+        APPLY_PERIODIC_TO_DELTA(result);
    result.w = result.x*result.x + result.y*result.y + result.z*result.z;
    return result;
 }
@@ -110,7 +113,7 @@ real4 computeCross(real4 vec1, real4 vec2) {
 * Compute the forces on groups based on the bonds.
 */
 __kernel void computeGroupForces(__global long* restrict groupForce, __global mixed* restrict energyBuffer, __global const real4* restrict centerPositions,
-        __global const int* restrict bondGroups
+        __global const int* restrict bondGroups, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
        EXTRA_ARGS) {
    mixed energy = 0;
    for (int index = get_global_id(0); index < NUM_BONDS; index += get_global_size(0)) {

--- a/platforms/opencl/src/kernels/customCompoundBond.cl
+++ b/platforms/opencl/src/kernels/customCompoundBond.cl
 /**
 * Compute the difference between two vectors, setting the fourth component to the squared magnitude.
 *
 * The result is vec1-vec2 in x, y, z. When periodic is true, APPLY_PERIODIC_TO_DELTA is
 * applied to the difference before the squared magnitude is stored in w.
 *
 * The periodicBoxSize, invPeriodicBoxSize and periodicBoxVec* arguments are never named
 * in the body; presumably the APPLY_PERIODIC_TO_DELTA expansion reads them by name
 * (TODO confirm against the macro definition).
 *
 * NOTE(review): the unbraced "if (periodic)" guards the whole wrap only if the macro
 * expands to a single statement or a braced block; confirm against its definition.
 */
-real4 ccb_delta(real4 vec1, real4 vec2) {
+real4 ccb_delta(real4 vec1, real4 vec2, bool periodic, real4 periodicBoxSize, real4 invPeriodicBoxSize, 
+        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
    real4 result = (real4) (vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0);
+    if (periodic)
+        APPLY_PERIODIC_TO_DELTA(result);
    result.w = result.x*result.x + result.y*result.y + result.z*result.z;
    return result;
 }

--- a/platforms/opencl/src/kernels/customGBEnergyN2.cl
+++ b/platforms/opencl/src/kernels/customGBEnergyN2.cl
@@ -181,6 +181,8 @@ __kernel void computeN2Energy(

 #ifdef USE_CUTOFF
    unsigned int numTiles = interactionCount[0];
+    if (numTiles > maxTiles)
+        return; // There wasn't enough memory for the neighbor list.
    int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
    int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
 #else
@@ -204,42 +206,38 @@ __kernel void computeN2Energy(
        int x, y;
        bool singlePeriodicCopy = false;
 #ifdef USE_CUTOFF
-        if (numTiles <= maxTiles) {
-            x = tiles[pos];
-            real4 blockSizeX = blockSize[x];
-            singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
-                                  0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
-                                  0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
-        }
-        else
-#endif
-        {
-            y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+        x = tiles[pos];
+        real4 blockSizeX = blockSize[x];
+        singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
+                              0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
+                              0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
+#else
+        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+            y += (x < y ? -1 : 1);
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
-                y += (x < y ? -1 : 1);
-                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-            }
+        }

-            // Skip over tiles that have exclusions, since they were already processed.
+        // Skip over tiles that have exclusions, since they were already processed.

+        SYNC_WARPS;
+        while (skipTiles[tbx+TILE_SIZE-1] < pos) {
            SYNC_WARPS;
-            while (skipTiles[tbx+TILE_SIZE-1] < pos) {
-                SYNC_WARPS;
-                if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
-                    ushort2 tile = exclusionTiles[skipBase+tgx];
-                    skipTiles[get_local_id(0)] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
-                }
-                else
-                    skipTiles[get_local_id(0)] = end;
-                skipBase += TILE_SIZE;            
-                currentSkipIndex = tbx;
-                SYNC_WARPS;
+            if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
+                ushort2 tile = exclusionTiles[skipBase+tgx];
+                skipTiles[get_local_id(0)] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
            }
-            while (skipTiles[currentSkipIndex] < pos)
-                currentSkipIndex++;
-            includeTile = (skipTiles[currentSkipIndex] != pos);
+            else
+                skipTiles[get_local_id(0)] = end;
+            skipBase += TILE_SIZE;            
+            currentSkipIndex = tbx;
+            SYNC_WARPS;
        }
+        while (skipTiles[currentSkipIndex] < pos)
+            currentSkipIndex++;
+        includeTile = (skipTiles[currentSkipIndex] != pos);
+#endif
        if (includeTile) {
            unsigned int atom1 = x*TILE_SIZE + tgx;


--- a/platforms/opencl/src/kernels/customGBEnergyN2_cpu.cl
+++ b/platforms/opencl/src/kernels/customGBEnergyN2_cpu.cl
@@ -201,6 +201,8 @@ __kernel void computeN2Energy(

 #ifdef USE_CUTOFF
    const unsigned int numTiles = interactionCount[0];
+    if (numTiles > maxTiles)
+        return; // There wasn't enough memory for the neighbor list.
    int pos = (int) (get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0));
    int end = (int) ((get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0));
 #else
@@ -220,35 +222,31 @@ __kernel void computeN2Energy(
        int x, y;
        bool singlePeriodicCopy = false;
 #ifdef USE_CUTOFF
-        if (numTiles <= maxTiles) {
-            x = tiles[pos];
-            real4 blockSizeX = blockSize[x];
-            singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
-                                  0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
-                                  0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
-        }
-        else
-#endif
-        {
-            y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+        x = tiles[pos];
+        real4 blockSizeX = blockSize[x];
+        singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
+                              0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
+                              0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
+#else
+        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+            y += (x < y ? -1 : 1);
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
-                y += (x < y ? -1 : 1);
-                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-            }
+        }

-            // Skip over tiles that have exclusions, since they were already processed.
+        // Skip over tiles that have exclusions, since they were already processed.

-            while (nextToSkip < pos) {
-                if (currentSkipIndex < NUM_TILES_WITH_EXCLUSIONS) {
-                    ushort2 tile = exclusionTiles[currentSkipIndex++];
-                    nextToSkip = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
-                }
-                else
-                    nextToSkip = end;
+        while (nextToSkip < pos) {
+            if (currentSkipIndex < NUM_TILES_WITH_EXCLUSIONS) {
+                ushort2 tile = exclusionTiles[currentSkipIndex++];
+                nextToSkip = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
            }
-            includeTile = (nextToSkip != pos);
+            else
+                nextToSkip = end;
        }
+        includeTile = (nextToSkip != pos);
+#endif
        if (includeTile) {
            // Load the data for this tile.


--- a/platforms/opencl/src/kernels/customGBValueN2.cl
+++ b/platforms/opencl/src/kernels/customGBValueN2.cl
@@ -157,6 +157,8 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*

 #ifdef USE_CUTOFF
    unsigned int numTiles = interactionCount[0];
+    if (numTiles > maxTiles)
+        return; // There wasn't enough memory for the neighbor list.
    int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
    int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
 #else
@@ -178,42 +180,38 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
        int x, y;
        bool singlePeriodicCopy = false;
 #ifdef USE_CUTOFF
-        if (numTiles <= maxTiles) {
-            x = tiles[pos];
-            real4 blockSizeX = blockSize[x];
-            singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
-                                  0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
-                                  0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
-        }
-        else
-#endif
-        {
-            y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+        x = tiles[pos];
+        real4 blockSizeX = blockSize[x];
+        singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
+                              0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
+                              0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
+#else
+        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+            y += (x < y ? -1 : 1);
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
-                y += (x < y ? -1 : 1);
-                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-            }
+        }

-            // Skip over tiles that have exclusions, since they were already processed.
+        // Skip over tiles that have exclusions, since they were already processed.

+        SYNC_WARPS;
+        while (skipTiles[tbx+TILE_SIZE-1] < pos) {
            SYNC_WARPS;
-            while (skipTiles[tbx+TILE_SIZE-1] < pos) {
-                SYNC_WARPS;
-                if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
-                    ushort2 tile = exclusionTiles[skipBase+tgx];
-                    skipTiles[get_local_id(0)] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
-                }
-                else
-                    skipTiles[get_local_id(0)] = end;
-                skipBase += TILE_SIZE;            
-                currentSkipIndex = tbx;
-                SYNC_WARPS;
+            if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
+                ushort2 tile = exclusionTiles[skipBase+tgx];
+                skipTiles[get_local_id(0)] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
            }
-            while (skipTiles[currentSkipIndex] < pos)
-                currentSkipIndex++;
-            includeTile = (skipTiles[currentSkipIndex] != pos);
+            else
+                skipTiles[get_local_id(0)] = end;
+            skipBase += TILE_SIZE;            
+            currentSkipIndex = tbx;
+            SYNC_WARPS;
        }
+        while (skipTiles[currentSkipIndex] < pos)
+            currentSkipIndex++;
+        includeTile = (skipTiles[currentSkipIndex] != pos);
+#endif
        if (includeTile) {
            unsigned int atom1 = x*TILE_SIZE + tgx;


--- a/platforms/opencl/src/kernels/customGBValueN2_cpu.cl
+++ b/platforms/opencl/src/kernels/customGBValueN2_cpu.cl
@@ -170,6 +170,8 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*

 #ifdef USE_CUTOFF
    const unsigned int numTiles = interactionCount[0];
+    if (numTiles > maxTiles)
+        return; // There wasn't enough memory for the neighbor list.
    int pos = (int) (get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0));
    int end = (int) ((get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0));
 #else
@@ -188,35 +190,31 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
        int x, y;
        bool singlePeriodicCopy = false;
 #ifdef USE_CUTOFF
-        if (numTiles <= maxTiles) {
-            x = tiles[pos];
-            real4 blockSizeX = blockSize[x];
-            singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
-                                  0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
-                                  0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
-        }
-        else
-#endif
-        {
-            y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+        x = tiles[pos];
+        real4 blockSizeX = blockSize[x];
+        singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
+                              0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
+                              0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
+#else
+        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
+        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+            y += (x < y ? -1 : 1);
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
-                y += (x < y ? -1 : 1);
-                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
-            }
+        }

-            // Skip over tiles that have exclusions, since they were already processed.
+        // Skip over tiles that have exclusions, since they were already processed.

-            while (nextToSkip < pos) {
-                if (currentSkipIndex < NUM_TILES_WITH_EXCLUSIONS) {
-                    ushort2 tile = exclusionTiles[currentSkipIndex++];
-                    nextToSkip = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
-                }
-                else
-                    nextToSkip = end;
+        while (nextToSkip < pos) {
+            if (currentSkipIndex < NUM_TILES_WITH_EXCLUSIONS) {
+                ushort2 tile = exclusionTiles[currentSkipIndex++];
+                nextToSkip = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
            }
-            includeTile = (nextToSkip != pos);
+            else
+                nextToSkip = end;
        }
+        includeTile = (nextToSkip != pos);
+#endif
        if (includeTile) {
            // Load the data for this tile.


--- a/platforms/opencl/src/kernels/gbsaObc.cl
+++ b/platforms/opencl/src/kernels/gbsaObc.cl