Common compute framework to unify CUDA and OpenCL code (#2488)

* Began creating common compute framework to unify code between CUDA and OpenCL * Began OpenCL implementation of common compute framework * Common implementation of CMMotionRemover * CUDA implementation of common compute interface * Converted HarmonicBondForce to common compute API * Converted standard bonded forces to common compute API * Converted ExpressionUtilities to common compute API * Created ComputeParameterSet * Converted custom bonded forces to common compute API * Converted CustomCentroidBondForce to common compute API * Converted CustomManyParticleForce to common compute API * Moved lots of duplicate code from CudaContext and OpenCLContext to ComputeContext * Converted GayBerneForce to common compute API * Removed obsolete kernels * Converted Verlet integrators to common compute API * Converted Langevin and Brownian integrators to common compute API * Converted CustomIntegrator to common compute API * Converted CustomNonbondedForce to common compute API * Removed uses of a deprecated API * Fixed failing test cases * Converted GBSAOBCForce to common compute API * Began converting CustomGBForce to common compute API * Finished converting CustomGBForce to common compute API * Merged duplicated code in CudaIntegrationUtilities and OpenCLIntegrationUtilities * Converted RMSDForce and AndersenThermostat to common compute API * Converted CustomHbondForce to common compute API * Merged scripts for encoding kernel sources * Converted Drude plugin to common compute API * Fixed errors in CMake scripts * Attempt at fixing errors on Windows * Added discussion of common compute API to developer guide * Added Windows export macro for common classes * Fixed error in CMMotionRemover * Updated Travis to newer Ubuntu version * Fixed errors on CPU OpenCL * Fixed Windows linking errors * Added missing pragma for 32 bit atomics * Replaced long long with mm_long * More fixes to Windows linking * Bug fix

Common compute framework to unify CUDA and OpenCL code (#2488)
* Began creating common compute framework to unify code between CUDA and OpenCL * Began OpenCL implementation of common compute framework * Common implementation of CMMotionRemover * CUDA implementation of common compute interface * Converted HarmonicBondForce to common compute API * Converted standard bonded forces to common compute API * Converted ExpressionUtilities to common compute API * Created ComputeParameterSet * Converted custom bonded forces to common compute API * Converted CustomCentroidBondForce to common compute API * Converted CustomManyParticleForce to common compute API * Moved lots of duplicate code from CudaContext and OpenCLContext to ComputeContext * Converted GayBerneForce to common compute API * Removed obsolete kernels * Converted Verlet integrators to common compute API * Converted Langevin and Brownian integrators to common compute API * Converted CustomIntegrator to common compute API * Converted CustomNonbondedForce to common compute API * Removed uses of a deprecated API * Fixed failing test cases * Converted GBSAOBCForce to common compute API * Began converting CustomGBForce to common compute API * Finished converting CustomGBForce to common compute API * Merged duplicated code in CudaIntegrationUtilities and OpenCLIntegrationUtilities * Converted RMSDForce and AndersenThermostat to common compute API * Converted CustomHbondForce to common compute API * Merged scripts for encoding kernel sources * Converted Drude plugin to common compute API * Fixed errors in CMake scripts * Attempt at fixing errors on Windows * Added discussion of common compute API to developer guide * Added Windows export macro for common classes * Fixed error in CMMotionRemover * Updated Travis to newer Ubuntu version * Fixed errors on CPU OpenCL * Fixed Windows linking errors * Added missing pragma for 32 bit atomics * Replaced long long with mm_long * More fixes to Windows linking * Bug fix
edbc8407 · peastman · GitHub · 38beeefe · edbc8407 · edbc8407
Unverified Commit edbc8407 authored Jan 08, 2020 by peastman Committed by GitHub Jan 08, 2020
20 changed files
--- a/platforms/common/include/openmm/common/NonbondedUtilities.h
+++ b/platforms/common/include/openmm/common/NonbondedUtilities.h
+#ifndef OPENMM_NONBONDEDUTILITIES_H_
+#define OPENMM_NONBONDEDUTILITIES_H_
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009-2019 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "openmm/common/ArrayInterface.h"
+#include "openmm/common/ComputeParameterInfo.h"
+#include <string>
+#include <vector>
+
+namespace OpenMM {
+
+/**
+ * This class provides a generic interface for calculating nonbonded interactions.  Clients only need
+ * to provide the code for evaluating a single interaction and the list of parameters it depends on.
+ * A complete kernel is then synthesized using an appropriate algorithm to evaluate all interactions on
+ * all atoms.  Call addInteraction() to define a nonbonded interaction, and addParameter() to define
+ * per-particle parameters that the interaction depends on.
+ *
+ * During each force or energy evaluation, the following sequence of steps takes place:
+ *
+ * 1. Data structures (e.g. neighbor lists) are calculated to allow nonbonded interactions to be evaluated
+ * quickly.
+ *
+ * 2. calcForcesAndEnergy() is called on each ForceImpl in the System.
+ *
+ * 3. Finally, the default interaction kernel is invoked to calculate all interactions that were added
+ * to it.
+ *
+ * This sequence means that the default interaction kernel may depend on quantities that were calculated
+ * by ForceImpls during calcForcesAndEnergy().
+ */
+
+class OPENMM_EXPORT_COMMON NonbondedUtilities {
+public:
+    /**
+     * Virtual destructor, so an implementation can be destroyed through a pointer to this interface.
+     */
+    virtual ~NonbondedUtilities() {
+    }
+    /**
+     * Add a nonbonded interaction to be evaluated by the default interaction kernel.
+     *
+     * @param usesCutoff     specifies whether a cutoff should be applied to this interaction
+     * @param usesPeriodic   specifies whether periodic boundary conditions should be applied to this interaction
+     * @param usesExclusions specifies whether this interaction uses exclusions.  If this is true, it must have identical exclusions to every other interaction.
+     * @param cutoffDistance the cutoff distance for this interaction (ignored if usesCutoff is false)
+     * @param exclusionList  for each atom, specifies the list of other atoms whose interactions should be excluded
+     * @param kernel         the code to evaluate the interaction
+     * @param forceGroup     the force group in which the interaction should be calculated
+     */
+    virtual void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList, const std::string& kernel, int forceGroup) = 0;
+    /**
+     * Add a per-atom parameter that the default interaction kernel may depend on.
+     *
+     * @param parameter   a description of the per-atom parameter (passed by value)
+     */
+    virtual void addParameter(ComputeParameterInfo parameter) = 0;
+    /**
+     * Add an array (other than a per-atom parameter) that should be passed as an argument to the default interaction kernel.
+     *
+     * @param parameter   a description of the array to pass (passed by value)
+     */
+    virtual void addArgument(ComputeParameterInfo parameter) = 0;
+    /**
+     * Register that the interaction kernel will be computing the derivative of the potential energy
+     * with respect to a parameter.
+     *
+     * @param param   the name of the parameter
+     * @return the variable that will be used to accumulate the derivative.  Any code you pass to addInteraction() should
+     * add its contributions to this variable.
+     */
+    virtual std::string addEnergyParameterDerivative(const std::string& param) = 0;
+    /**
+     * Get the number of force buffers required for nonbonded forces.
+     *
+     * NOTE(review): this is the only getter in the interface declared const; the other
+     * getters below are non-const, so they cannot be called through a const reference.
+     */
+    virtual int getNumForceBuffers() const = 0;
+    /**
+     * Get whether a cutoff is being used.
+     *
+     * @return true if a cutoff is being used
+     */
+    virtual bool getUseCutoff() = 0;
+    /**
+     * Get whether periodic boundary conditions are being used.
+     *
+     * @return true if periodic boundary conditions are being used
+     */
+    virtual bool getUsePeriodic() = 0;
+    /**
+     * Get the number of thread blocks used for computing nonbonded forces.
+     */
+    virtual int getNumForceThreadBlocks() = 0;
+    /**
+     * Get the size of each thread block used for computing nonbonded forces.
+     */
+    virtual int getForceThreadBlockSize() = 0;
+    /**
+     * Get the maximum cutoff distance used by any interaction.
+     */
+    virtual double getMaxCutoffDistance() = 0;
+    /**
+     * Given a nonbonded cutoff, get the padded cutoff distance used in computing
+     * the neighbor list.
+     *
+     * @param cutoff   the nonbonded cutoff distance
+     * @return the padded cutoff distance
+     */
+    virtual double padCutoff(double cutoff) = 0;
+    /**
+     * Get the array containing the center of each atom block.
+     */
+    virtual ArrayInterface& getBlockCenters() = 0;
+    /**
+     * Get the array containing the dimensions of each atom block.
+     */
+    virtual ArrayInterface& getBlockBoundingBoxes() = 0;
+    /**
+     * Get the array whose first element contains the number of tiles with interactions.
+     */
+    virtual ArrayInterface& getInteractionCount() = 0;
+    /**
+     * Get the array containing tiles with interactions.
+     */
+    virtual ArrayInterface& getInteractingTiles() = 0;
+    /**
+     * Get the array containing the atoms in each tile with interactions.
+     */
+    virtual ArrayInterface& getInteractingAtoms() = 0;
+    /**
+     * Get the array containing exclusion flags.
+     */
+    virtual ArrayInterface& getExclusions() = 0;
+    /**
+     * Get the array containing tiles with exclusions.
+     */
+    virtual ArrayInterface& getExclusionTiles() = 0;
+    /**
+     * Get the array containing the index into the exclusion array for each tile.
+     */
+    virtual ArrayInterface& getExclusionIndices() = 0;
+    /**
+     * Get the array listing where the exclusion data starts for each row.
+     */
+    virtual ArrayInterface& getExclusionRowIndices() = 0;
+    /**
+     * Get the array containing a flag for whether the neighbor list was rebuilt
+     * on the most recent call to prepareInteractions().
+     *
+     * NOTE(review): prepareInteractions() is not declared in this interface; presumably it is
+     * provided by the platform-specific implementation -- confirm.
+     */
+    virtual ArrayInterface& getRebuildNeighborList() = 0;
+};
+
+} // namespace OpenMM
+
+#endif /*OPENMM_NONBONDEDUTILITIES_H_*/
--- a/platforms/opencl/include/windowsExportOpenCL.h
+++ b/platforms/opencl/include/windowsExportOpenCL.h
-#ifndef OPENMM_WINDOWSEXPORTOPENCL_H_
-#define OPENMM_WINDOWSEXPORTOPENCL_H_
-
-/*
- * Shared libraries are messy in Visual Studio. We have to distinguish three
- * cases:
- *   (1) this header is being used to build the OpenMM shared library
- *       (dllexport)
- *   (2) this header is being used by a *client* of the OpenMM shared
- *       library (dllimport)
- *   (3) we are building the OpenMM static library, or the client is
- *       being compiled with the expectation of linking with the
- *       OpenMM static library (nothing special needed)
- * In the CMake script for building this library, we define one of the symbols
- *     OPENMM_OPENCL_BUILDING_{SHARED|STATIC}_LIBRARY
- * Client code normally has no special symbol defined, in which case we'll
- * assume it wants to use the shared library. However, if the client defines
- * the symbol OPENMM_USE_STATIC_LIBRARIES we'll suppress the dllimport so
- * that the client code can be linked with static libraries. Note that
- * the client symbol is not library dependent, while the library symbols
- * affect only the OpenMM library, meaning that other libraries can
- * be clients of this one. However, we are assuming all-static or all-shared.
- */
-
-#ifdef _MSC_VER
-    // We don't want to hear about how sprintf is "unsafe".
-    #pragma warning(disable:4996)
-    // Keep MS VC++ quiet about lack of dll export of private members.
-    #pragma warning(disable:4251)
-    #if defined(OPENMM_OPENCL_BUILDING_SHARED_LIBRARY)
-        #define OPENMM_EXPORT_OPENCL __declspec(dllexport)
-    #elif defined(OPENMM_OPENCL_BUILDING_STATIC_LIBRARY) || defined(OPENMM_OPENCL_USE_STATIC_LIBRARIES)
-        #define OPENMM_EXPORT_OPENCL
-    #else
-        #define OPENMM_EXPORT_OPENCL __declspec(dllimport)   // i.e., a client of a shared library
-    #endif
-#else
-    #define OPENMM_EXPORT_OPENCL // Linux, Mac
-#endif
-
-#endif // OPENMM_WINDOWSEXPORTOPENCL_H_
+#ifndef OPENMM_WINDOWSEXPORTCOMMON_H_
+#define OPENMM_WINDOWSEXPORTCOMMON_H_
+
+/*
+ * Shared libraries are messy in Visual Studio. We have to distinguish three
+ * cases:
+ *   (1) this header is being used to build the OpenMM shared library
+ *       (dllexport)
+ *   (2) this header is being used by a *client* of the OpenMM shared
+ *       library (dllimport)
+ *   (3) we are building the OpenMM static library, or the client is
+ *       being compiled with the expectation of linking with the
+ *       OpenMM static library (nothing special needed)
+ * In the CMake script for building this library, we define one of the symbols
+ *     OPENMM_COMMON_BUILDING_{SHARED|STATIC}_LIBRARY
+ * Client code normally has no special symbol defined, in which case we'll
+ * assume it wants to use the shared library. However, if the client defines
+ * the symbol OPENMM_USE_STATIC_LIBRARIES we'll suppress the dllimport so
+ * that the client code can be linked with static libraries. Note that
+ * the client symbol is not library dependent, while the library symbols
+ * affect only the OpenMM library, meaning that other libraries can
+ * be clients of this one. However, we are assuming all-static or all-shared.
+ */
+
+#ifdef _MSC_VER
+    // We don't want to hear about how sprintf is "unsafe".
+    #pragma warning(disable:4996)
+    // Keep MS VC++ quiet about lack of dll export of private members.
+    #pragma warning(disable:4251)
+    // Case (1): the shared library itself is being built, so its symbols are exported.
+    #if defined(OPENMM_COMMON_BUILDING_SHARED_LIBRARY)
+        #define OPENMM_EXPORT_COMMON __declspec(dllexport)
+    // Case (3): static library build, or a client linking statically, so no decoration is needed.
+    // NOTE(review): the comment block above names the client symbol OPENMM_USE_STATIC_LIBRARIES,
+    // but the symbol actually tested here is OPENMM_COMMON_USE_STATIC_LIBRARIES -- confirm which is intended.
+    #elif defined(OPENMM_COMMON_BUILDING_STATIC_LIBRARY) || defined(OPENMM_COMMON_USE_STATIC_LIBRARIES)
+        #define OPENMM_EXPORT_COMMON
+    // Case (2): no special symbol is defined, so assume a client of the shared library.
+    #else
+        #define OPENMM_EXPORT_COMMON __declspec(dllimport)   // i.e., a client of a shared library
+    #endif
+#else
+    // Non-MSVC compilers: the macro expands to nothing.
+    #define OPENMM_EXPORT_COMMON // Linux, Mac
+#endif
+
+#endif // OPENMM_WINDOWSEXPORTCOMMON_H_
--- a/plugins/drude/platforms/opencl/src/OpenCLDrudeKernelSources.cpp.in
+++ b/plugins/drude/platforms/opencl/src/OpenCLDrudeKernelSources.cpp.in
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2010 Stanford University and the Authors.           *
+ * Portions copyright (c) 2019 Stanford University and the Authors.           *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -24,7 +24,7 @@
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
 * -------------------------------------------------------------------------- */

-#include "OpenCLDrudeKernelSources.h"
+#include "CommonKernelSources.h"

 using namespace OpenMM;
 using namespace std;

--- a/plugins/drude/platforms/opencl/src/OpenCLDrudeKernelSources.h.in
+++ b/plugins/drude/platforms/opencl/src/OpenCLDrudeKernelSources.h.in
-#ifndef OPENMM_OPENCLDRUDEKERNELSOURCES_H_
-#define OPENMM_OPENCLDRUDEKERNELSOURCES_H_
+#ifndef OPENMM_COMMONKERNELSOURCES_H_
+#define OPENMM_COMMONKERNELSOURCES_H_

 /* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2010 Stanford University and the Authors.           *
+ * Portions copyright (c) 2019 Stanford University and the Authors.           *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -27,21 +27,22 @@
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
 * -------------------------------------------------------------------------- */

+#include "openmm/common/windowsExportCommon.h"
 #include <string>

 namespace OpenMM {

 /**
- * This class is a central holding place for the source code of OpenCL kernels.
- * The CMake build script inserts declarations into it based on the .cu files in the
+ * This class is a central holding place for the source code of common kernels.
+ * The CMake build script inserts declarations into it based on the .cc files in the
 * kernels subfolder.
 */

-class OpenCLDrudeKernelSources {
+class OPENMM_EXPORT_COMMON CommonKernelSources {
 public:
-@CL_FILE_DECLARATIONS@
+@KERNEL_FILE_DECLARATIONS@
 };

 } // namespace OpenMM

-#endif /*OPENMM_OPENCLDRUDEKERNELSOURCES_H_*/
+#endif /*OPENMM_COMMONKERNELSOURCES_H_*/
--- a/platforms/common/src/CommonKernels.cpp
+++ b/platforms/common/src/CommonKernels.cpp
--- a/platforms/common/src/ComputeArray.cpp
+++ b/platforms/common/src/ComputeArray.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2019 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "openmm/common/ComputeArray.h"
+#include "openmm/common/ComputeContext.h"
+
+using namespace OpenMM;
+
+/**
+ * Create a ComputeArray that does not yet have an underlying array.  The methods
+ * that access the array throw an exception until initialize() has been called.
+ */
+ComputeArray::ComputeArray()
+        : impl(NULL) {
+}
+
+/**
+ * Destroy the underlying array, if one was ever created.
+ */
+ComputeArray::~ComputeArray() {
+    // Deleting a null pointer is a no-op, so no explicit check is required.
+    delete impl;
+}
+
+/**
+ * Get the underlying array object this ComputeArray forwards to.
+ * Throws an OpenMMException if initialize() has not been called.
+ */
+ArrayInterface& ComputeArray::getArray() {
+    if (impl != NULL)
+        return *impl;
+    throw OpenMMException("ComputeArray has not been initialized");
+}
+
+/**
+ * Create the underlying array through the context and initialize it.
+ *
+ * The new array is only stored once its own initialize() has succeeded.  If that
+ * call throws, the partially constructed array is deleted and this object stays
+ * uninitialized, so isInitialized() keeps returning false and initialize() can be
+ * called again.
+ *
+ * @param context      the context used to create the underlying array
+ * @param size         forwarded to the underlying array's initialize()
+ * @param elementSize  forwarded to the underlying array's initialize()
+ * @param name         forwarded to the underlying array's initialize()
+ * @throws OpenMMException if this array has already been initialized
+ */
+void ComputeArray::initialize(ComputeContext& context, int size, int elementSize, const std::string& name) {
+    if (impl != NULL)
+        throw OpenMMException("The array "+getName()+" has already been initialized");
+    auto* array = context.createArray();
+    try {
+        array->initialize(context, size, elementSize, name);
+    }
+    catch (...) {
+        // Do not keep (or leak) an array whose initialization failed.
+        delete array;
+        throw;
+    }
+    impl = array;
+}
+
+/**
+ * Forward a resize request to the underlying array.
+ * Throws an OpenMMException if initialize() has not been called.
+ *
+ * @param size   the value passed to the underlying array's resize()
+ */
+void ComputeArray::resize(int size) {
+    if (!impl)
+        throw OpenMMException("ComputeArray has not been initialized");
+    ArrayInterface& array = *impl;
+    array.resize(size);
+}
+
+/**
+ * Report whether an underlying array currently exists.
+ */
+bool ComputeArray::isInitialized() const {
+    const bool hasArray = (impl != NULL);
+    return hasArray;
+}
+
+/**
+ * Return the value reported by the underlying array's getSize().
+ * Throws an OpenMMException if initialize() has not been called.
+ */
+int ComputeArray::getSize() const {
+    if (impl != NULL)
+        return impl->getSize();
+    throw OpenMMException("ComputeArray has not been initialized");
+}
+
+/**
+ * Return the value reported by the underlying array's getElementSize().
+ * Throws an OpenMMException if initialize() has not been called.
+ */
+int ComputeArray::getElementSize() const {
+    if (impl != NULL)
+        return impl->getElementSize();
+    throw OpenMMException("ComputeArray has not been initialized");
+}
+
+/**
+ * Return the name reported by the underlying array's getName().
+ * Throws an OpenMMException if initialize() has not been called.
+ */
+const std::string& ComputeArray::getName() const {
+    if (impl != NULL)
+        return impl->getName();
+    throw OpenMMException("ComputeArray has not been initialized");
+}
+
+/**
+ * Return the context reported by the underlying array's getContext().
+ * Throws an OpenMMException if initialize() has not been called.
+ */
+ComputeContext& ComputeArray::getContext() {
+    if (impl != NULL)
+        return impl->getContext();
+    throw OpenMMException("ComputeArray has not been initialized");
+}
+
+/**
+ * Forward an upload request to the underlying array.
+ * Throws an OpenMMException if initialize() has not been called.
+ *
+ * @param data      passed unchanged to the underlying array's upload()
+ * @param blocking  passed unchanged to the underlying array's upload()
+ */
+void ComputeArray::upload(const void* data, bool blocking) {
+    if (!impl)
+        throw OpenMMException("ComputeArray has not been initialized");
+    ArrayInterface& array = *impl;
+    array.upload(data, blocking);
+}
+
+/**
+ * Forward a download request to the underlying array.
+ * Throws an OpenMMException if initialize() has not been called.
+ *
+ * @param data      passed unchanged to the underlying array's download()
+ * @param blocking  passed unchanged to the underlying array's download()
+ */
+void ComputeArray::download(void* data, bool blocking) const {
+    if (!impl)
+        throw OpenMMException("ComputeArray has not been initialized");
+    const ArrayInterface& array = *impl;
+    array.download(data, blocking);
+}
+
+/**
+ * Forward a copy request to the underlying array.
+ * Throws an OpenMMException if initialize() has not been called.
+ *
+ * @param dest   passed unchanged to the underlying array's copyTo()
+ */
+void ComputeArray::copyTo(ArrayInterface& dest) const {
+    if (!impl)
+        throw OpenMMException("ComputeArray has not been initialized");
+    const ArrayInterface& array = *impl;
+    array.copyTo(dest);
+}
\ No newline at end of file
--- a/platforms/common/src/ComputeContext.cpp
+++ b/platforms/common/src/ComputeContext.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2019 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "openmm/common/ComputeContext.h"
+#include "openmm/System.h"
+#include "openmm/VirtualSite.h"
+#include "openmm/internal/ContextImpl.h"
+#include "openmm/internal/ThreadPool.h"
+#include "hilbert.h"
+#include <algorithm>
+#include <cmath>
+#include <locale>
+#include <set>
+#include <sstream>
+#include <typeinfo>
+#include <utility>
+
+using namespace OpenMM;
+using namespace std;
+
+/**
+ * Create a ComputeContext for a System.  The time and the step/force-evaluation counters
+ * start at zero, stepsSinceReorder starts at 99999, both status flags start out false,
+ * and a WorkThread is created.
+ *
+ * @param system   the System this context refers to
+ */
+ComputeContext::ComputeContext(const System& system) :
+        system(system),
+        time(0.0),
+        stepCount(0),
+        computeForceCount(0),
+        stepsSinceReorder(99999),
+        atomsWereReordered(false),
+        forcesValid(false),
+        thread(NULL) {
+    // The worker thread is created only after every member above has been initialized.
+    thread = new WorkThread();
+}
+
+/**
+ * Destroy the WorkThread created by the constructor.
+ */
+ComputeContext::~ComputeContext() {
+    // Deleting a null pointer is a no-op, so no explicit check is required.
+    delete thread;
+}
+
+/**
+ * Append a ComputeForceInfo to the end of this context's list of forces.
+ *
+ * @param force   the object to append (only the pointer is stored)
+ */
+void ComputeContext::addForce(ComputeForceInfo* force) {
+    forces.insert(forces.end(), force);
+}
+
+/**
+ * Replace whole-symbol occurrences of each key in a string with its mapped value.
+ *
+ * An occurrence of a key is only replaced when it is a complete symbol, i.e. it is not
+ * immediately preceded or followed by a letter, digit, or underscore.  The keys are
+ * processed one at a time in the map's iteration order, each working on the result of
+ * the previous one, so text inserted for one key can be matched by a later key.
+ * Text inserted for a key is never rescanned for that same key.  Empty keys are ignored,
+ * since an empty string is not a symbol and matching it could loop forever.
+ *
+ * @param input         the string to process
+ * @param replacements  maps each symbol to search for to the text to replace it with
+ * @return a copy of the input with all replacements applied
+ */
+string ComputeContext::replaceStrings(const string& input, const std::map<std::string, std::string>& replacements) const {
+    // True for the characters that may appear inside a symbol: [A-Za-z0-9_].
+    auto isSymbolChar = [](char c) {
+        return c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9');
+    };
+    string result = input;
+    for (auto& pair : replacements) {
+        const string& symbol = pair.first;
+        const string& replacement = pair.second;
+        if (symbol.empty())
+            continue;
+        // Use the string's own size type so positions are never truncated and npos compares correctly.
+        string::size_type index = 0;
+        while ((index = result.find(symbol, index)) != string::npos) {
+            const string::size_type end = index+symbol.size();
+            const bool startsSymbol = (index == 0 || !isSymbolChar(result[index-1]));
+            const bool endsSymbol = (end == result.size() || !isSymbolChar(result[end]));
+            if (startsSymbol && endsSymbol) {
+                // We have found a complete symbol, not part of a longer symbol.
+
+                result.replace(index, symbol.size(), replacement);
+                index += replacement.size();
+            }
+            else
+                index++;
+        }
+    }
+    return result;
+}
+
+/**
+ * Convert a number to a string in scientific notation.
+ *
+ * When getUseDoublePrecision() is true the value is written with a precision of 16;
+ * otherwise it is written with a precision of 8 and an "f" suffix is appended.
+ * The classic ("C") locale is used for formatting, so the output does not change if the
+ * application installs a different global C++ locale (for example one that uses ',' as
+ * the decimal separator).
+ *
+ * @param value   the number to convert
+ * @return the formatted number
+ */
+string ComputeContext::doubleToString(double value) const {
+    const bool useDouble = getUseDoublePrecision();
+    stringstream s;
+    s.imbue(std::locale::classic());
+    s.precision(useDouble ? 16 : 8);
+    s << scientific << value;
+    if (!useDouble)
+        s << "f";
+    return s.str();
+}
+
+/**
+ * Convert an integer to its decimal string representation.
+ *
+ * The classic ("C") locale is used for formatting, so the output does not change if the
+ * application installs a different global C++ locale (for example one that inserts
+ * digit-grouping separators).
+ *
+ * @param value   the number to convert
+ * @return the formatted number
+ */
+string ComputeContext::intToString(int value) const {
+    stringstream s;
+    s.imbue(std::locale::classic());
+    s << value;
+    return s.str();
+}
+
+/**
+ * This class ensures that atom reordering doesn't break virtual sites.
+ *
+ * Each virtual site in the System is reported as one particle group made up of the
+ * site itself followed by the particles it depends on.  Two groups are considered
+ * identical when the sites have the same dynamic type and the same recorded weights.
+ */
+class ComputeContext::VirtualSiteInfo : public ComputeForceInfo {
+public:
+    /**
+     * Record the type, particles, and weights of every virtual site in a System.
+     *
+     * @param system   the System whose virtual sites should be recorded
+     */
+    VirtualSiteInfo(const System& system) {
+        for (int i = 0; i < system.getNumParticles(); i++) {
+            if (!system.isVirtualSite(i))
+                continue;
+            const VirtualSite& vsite = system.getVirtualSite(i);
+            siteTypes.push_back(&typeid(vsite));
+
+            // The group is the virtual site itself followed by the particles it depends on.
+
+            vector<int> particles(1, i);
+            for (int j = 0; j < vsite.getNumParticles(); j++)
+                particles.push_back(vsite.getParticle(j));
+            siteParticles.push_back(particles);
+
+            // Record the weights that distinguish one site of this type from another.
+            // NOTE(review): any site type not listed below is recorded with no weights, so two
+            // such sites of the same type always compare as identical -- confirm this is intended.
+
+            vector<double> weights;
+            if (const TwoParticleAverageSite* twoParticle = dynamic_cast<const TwoParticleAverageSite*>(&vsite)) {
+                // A two particle average.
+
+                weights.push_back(twoParticle->getWeight(0));
+                weights.push_back(twoParticle->getWeight(1));
+            }
+            else if (const ThreeParticleAverageSite* threeParticle = dynamic_cast<const ThreeParticleAverageSite*>(&vsite)) {
+                // A three particle average.
+
+                weights.push_back(threeParticle->getWeight(0));
+                weights.push_back(threeParticle->getWeight(1));
+                weights.push_back(threeParticle->getWeight(2));
+            }
+            else if (const OutOfPlaneSite* outOfPlane = dynamic_cast<const OutOfPlaneSite*>(&vsite)) {
+                // An out of plane site.
+
+                weights.push_back(outOfPlane->getWeight12());
+                weights.push_back(outOfPlane->getWeight13());
+                weights.push_back(outOfPlane->getWeightCross());
+            }
+            siteWeights.push_back(weights);
+        }
+    }
+    /**
+     * Get the number of particle groups, which is one per recorded virtual site.
+     */
+    int getNumParticleGroups() {
+        return siteTypes.size();
+    }
+    /**
+     * Get the particles in a group: the virtual site followed by the particles it depends on.
+     *
+     * @param index      the index of the group
+     * @param particles  on exit, the indices of the particles in the group
+     */
+    void getParticlesInGroup(int index, std::vector<int>& particles) {
+        particles = siteParticles[index];
+    }
+    /**
+     * Get whether two groups describe virtual sites of the same type with the same weights.
+     *
+     * @param group1   the index of the first group
+     * @param group2   the index of the second group
+     */
+    bool areGroupsIdentical(int group1, int group2) {
+        // Compare the type_info objects rather than their addresses: the same type is not
+        // guaranteed to always be described by the same type_info instance.
+        if (*siteTypes[group1] != *siteTypes[group2])
+            return false;
+        // Vector equality checks the number of weights and then each weight in turn.
+        return siteWeights[group1] == siteWeights[group2];
+    }
+private:
+    vector<const type_info*> siteTypes;
+    vector<vector<int> > siteParticles;
+    vector<vector<double> > siteWeights;
+};
+
+void ComputeContext::findMoleculeGroups() {
+    // The first time this is called, we need to identify all the molecules in the system.
+
+    if (moleculeGroups.size() == 0) {
+        // Add a ForceInfo that makes sure reordering doesn't break virtual sites.
+
+        addForce(new VirtualSiteInfo(system));
+
+        // First make a list of every other atom to which each atom is connected by a constraint or force group.
+
+        vector<vector<int> > atomBonds(system.getNumParticles());
+        for (int i = 0; i < system.getNumConstraints(); i++) {
+            int particle1, particle2;
+            double distance;
+            system.getConstraintParameters(i, particle1, particle2, distance);
+            atomBonds[particle1].push_back(particle2);
+            atomBonds[particle2].push_back(particle1);
+        }
+        for (auto force : forces) {
+            for (int j = 0; j < force->getNumParticleGroups(); j++) {
+                vector<int> particles;
+                force->getParticlesInGroup(j, particles);
+                for (int k = 0; k < (int) particles.size(); k++)
+                    for (int m = 0; m < (int) particles.size(); m++)
+                        if (k != m)
+                            atomBonds[particles[k]].push_back(particles[m]);
+            }
+        }
+
+        // Now identify atoms by which molecule they belong to.
+
+        vector<vector<int> > atomIndices = ContextImpl::findMolecules(numAtoms, atomBonds);
+        int numMolecules = atomIndices.size();
+        vector<int> atomMolecule(numAtoms);
+        for (int i = 0; i < (int) atomIndices.size(); i++)
+            for (int j = 0; j < (int) atomIndices[i].size(); j++)
+                atomMolecule[atomIndices[i][j]] = i;
+
+        // Construct a description of each molecule.
+
+        molecules.resize(numMolecules);
+        for (int i = 0; i < numMolecules; i++) {
+            molecules[i].atoms = atomIndices[i];
+            molecules[i].groups.resize(forces.size());
+        }
+        for (int i = 0; i < system.getNumConstraints(); i++) {
+            int particle1, particle2;
+            double distance;
+            system.getConstraintParameters(i, particle1, particle2, distance);
+            molecules[atomMolecule[particle1]].constraints.push_back(i);
+        }
+        for (int i = 0; i < (int) forces.size(); i++)
+            for (int j = 0; j < forces[i]->getNumParticleGroups(); j++) {
+                vector<int> particles;
+                forces[i]->getParticlesInGroup(j, particles);
+                if (particles.size() > 0)
+                    molecules[atomMolecule[particles[0]]].groups[i].push_back(j);
+            }
+    }
+
+    // Sort them into groups of identical molecules.
+
+    vector<Molecule> uniqueMolecules;
+    vector<vector<int> > moleculeInstances;
+    vector<vector<int> > moleculeOffsets;
+    for (int molIndex = 0; molIndex < (int) molecules.size(); molIndex++) {
+        Molecule& mol = molecules[molIndex];
+
+        // See if it is identical to another molecule.
+
+        bool isNew = true;
+        for (int j = 0; j < (int) uniqueMolecules.size() && isNew; j++) {
+            Molecule& mol2 = uniqueMolecules[j];
+            bool identical = (mol.atoms.size() == mol2.atoms.size() && mol.constraints.size() == mol2.constraints.size());
+
+            // See if the atoms are identical.
+
+            int atomOffset = mol2.atoms[0]-mol.atoms[0];
+            for (int i = 0; i < (int) mol.atoms.size() && identical; i++) {
+                if (mol.atoms[i] != mol2.atoms[i]-atomOffset || system.getParticleMass(mol.atoms[i]) != system.getParticleMass(mol2.atoms[i]))
+                    identical = false;
+                for (int k = 0; k < (int) forces.size(); k++)
+                    if (!forces[k]->areParticlesIdentical(mol.atoms[i], mol2.atoms[i]))
+                        identical = false;
+            }
+
+            // See if the constraints are identical.
+
+            for (int i = 0; i < (int) mol.constraints.size() && identical; i++) {
+                int c1particle1, c1particle2, c2particle1, c2particle2;
+                double distance1, distance2;
+                system.getConstraintParameters(mol.constraints[i], c1particle1, c1particle2, distance1);
+                system.getConstraintParameters(mol2.constraints[i], c2particle1, c2particle2, distance2);
+                if (c1particle1 != c2particle1-atomOffset || c1particle2 != c2particle2-atomOffset || distance1 != distance2)
+                    identical = false;
+            }
+
+            // See if the force groups are identical.
+
+            for (int i = 0; i < (int) forces.size() && identical; i++) {
+                if (mol.groups[i].size() != mol2.groups[i].size())
+                    identical = false;
+                for (int k = 0; k < (int) mol.groups[i].size() && identical; k++) {
+                    if (!forces[i]->areGroupsIdentical(mol.groups[i][k], mol2.groups[i][k]))
+                        identical = false;
+                    vector<int> p1, p2;
+                    forces[i]->getParticlesInGroup(mol.groups[i][k], p1);
+                    forces[i]->getParticlesInGroup(mol2.groups[i][k], p2);
+                    for (int m = 0; m < p1.size(); m++)
+                        if (p1[m] != p2[m]-atomOffset)
+                            identical = false;
+                }
+            }
+            if (identical) {
+                moleculeInstances[j].push_back(molIndex);
+                moleculeOffsets[j].push_back(mol.atoms[0]);
+                isNew = false;
+            }
+        }
+        if (isNew) {
+            uniqueMolecules.push_back(mol);
+            moleculeInstances.push_back(vector<int>());
+            moleculeInstances[moleculeInstances.size()-1].push_back(molIndex);
+            moleculeOffsets.push_back(vector<int>());
+            moleculeOffsets[moleculeOffsets.size()-1].push_back(mol.atoms[0]);
+        }
+    }
+    moleculeGroups.resize(moleculeInstances.size());
+    for (int i = 0; i < (int) moleculeInstances.size(); i++)
+    {
+        moleculeGroups[i].instances = moleculeInstances[i];
+        moleculeGroups[i].offsets = moleculeOffsets[i];
+        vector<int>& atoms = uniqueMolecules[i].atoms;
+        moleculeGroups[i].atoms.resize(atoms.size());
+        for (int j = 0; j < (int) atoms.size(); j++)
+            moleculeGroups[i].atoms[j] = atoms[j]-atoms[0];
+    }
+}
+
+/**
+ * Check every registered force in order and stop as soon as one of them causes the
+ * molecule definitions to be rebuilt.
+ */
+void ComputeContext::invalidateMolecules() {
+    int index = 0;
+    while (index < forces.size() && !invalidateMolecules(forces[index]))
+        index++;
+}
+
+/**
+ * Check whether the molecules that are currently grouped together are still identical as
+ * far as one force is concerned, and rebuild the grouping if they are not.
+ *
+ * @param force  the force whose particle and group definitions should be re-checked
+ * @return true if the molecule groups were found to be invalid and were rebuilt (in which
+ *         case the atoms were restored to their original order and reordered again),
+ *         false if nothing needed to change or if there are no atoms or no cutoff is in use
+ */
+bool ComputeContext::invalidateMolecules(ComputeForceInfo* force) {
+    if (numAtoms == 0 || !getNonbondedUtilities().getUseCutoff())
+        return false;
+    bool valid = true;
+    int forceIndex = -1;
+    for (int i = 0; i < forces.size(); i++)
+        if (forces[i] == force)
+            forceIndex = i;
+
+    // Compare every instance of each molecule group against the first instance.  The
+    // instances are divided among the worker threads; instance 0 is the reference, so
+    // no thread starts before index 1.
+    // NOTE(review): `valid` is a plain bool shared by all threads.  It is only ever set to
+    // false, but it is not an atomic; confirm that this is acceptable.
+
+    getThreadPool().execute([&] (ThreadPool& threads, int threadIndex) {
+        for (int group = 0; valid && group < (int) moleculeGroups.size(); group++) {
+            MoleculeGroup& mol = moleculeGroups[group];
+            vector<int>& instances = mol.instances;
+            vector<int>& offsets = mol.offsets;
+            vector<int>& atoms = mol.atoms;
+            int numMolecules = instances.size();
+            Molecule& m1 = molecules[instances[0]];
+            int offset1 = offsets[0];
+            int numThreads = threads.getNumThreads();
+            int start = max(1, threadIndex*numMolecules/numThreads);
+            int end = (threadIndex+1)*numMolecules/numThreads;
+            for (int j = start; j < end; j++) {
+                // See if the atoms are identical.
+
+                Molecule& m2 = molecules[instances[j]];
+                int offset2 = offsets[j];
+                for (int i = 0; i < (int) atoms.size() && valid; i++) {
+                    if (!force->areParticlesIdentical(atoms[i]+offset1, atoms[i]+offset2))
+                        valid = false;
+                }
+
+                // See if the force groups are identical.
+
+                if (valid && forceIndex > -1) {
+                    for (int k = 0; k < (int) m1.groups[forceIndex].size() && valid; k++)
+                        if (!force->areGroupsIdentical(m1.groups[forceIndex][k], m2.groups[forceIndex][k]))
+                            valid = false;
+                }
+            }
+        }
+    });
+    getThreadPool().waitForThreads();
+    if (valid)
+        return false;
+
+    // The list of which molecules are identical is no longer valid.  We need to restore the
+    // atoms to their original order, rebuild the list of identical molecules, and sort them
+    // again.
+
+    vector<mm_int4> newCellOffsets(numAtoms);
+    if (getUseDoublePrecision()) {
+        vector<mm_double4> oldPosq(paddedNumAtoms);
+        vector<mm_double4> newPosq(paddedNumAtoms, mm_double4(0,0,0,0));
+        vector<mm_double4> oldVelm(paddedNumAtoms);
+        vector<mm_double4> newVelm(paddedNumAtoms, mm_double4(0,0,0,0));
+        getPosq().download(oldPosq);
+        getVelm().download(oldVelm);
+        for (int i = 0; i < numAtoms; i++) {
+            int index = atomIndex[i];
+            newPosq[index] = oldPosq[i];
+            newVelm[index] = oldVelm[i];
+            newCellOffsets[index] = posCellOffsets[i];
+        }
+        getPosq().upload(newPosq);
+        getVelm().upload(newVelm);
+    }
+    else if (getUseMixedPrecision()) {
+        vector<mm_float4> oldPosq(paddedNumAtoms);
+        vector<mm_float4> newPosq(paddedNumAtoms, mm_float4(0,0,0,0));
+        vector<mm_float4> oldPosqCorrection(paddedNumAtoms);
+        vector<mm_float4> newPosqCorrection(paddedNumAtoms, mm_float4(0,0,0,0));
+        vector<mm_double4> oldVelm(paddedNumAtoms);
+        vector<mm_double4> newVelm(paddedNumAtoms, mm_double4(0,0,0,0));
+        getPosq().download(oldPosq);
+        getVelm().download(oldVelm);
+
+        // The correction terms must be read back too.  Otherwise the upload below would
+        // replace the device data with the contents of a freshly allocated host vector.
+
+        getPosqCorrection().download(oldPosqCorrection);
+        for (int i = 0; i < numAtoms; i++) {
+            int index = atomIndex[i];
+            newPosq[index] = oldPosq[i];
+            newPosqCorrection[index] = oldPosqCorrection[i];
+            newVelm[index] = oldVelm[i];
+            newCellOffsets[index] = posCellOffsets[i];
+        }
+        getPosq().upload(newPosq);
+        getPosqCorrection().upload(newPosqCorrection);
+        getVelm().upload(newVelm);
+    }
+    else {
+        vector<mm_float4> oldPosq(paddedNumAtoms);
+        vector<mm_float4> newPosq(paddedNumAtoms, mm_float4(0,0,0,0));
+        vector<mm_float4> oldVelm(paddedNumAtoms);
+        vector<mm_float4> newVelm(paddedNumAtoms, mm_float4(0,0,0,0));
+        getPosq().download(oldPosq);
+        getVelm().download(oldVelm);
+        for (int i = 0; i < numAtoms; i++) {
+            int index = atomIndex[i];
+            newPosq[index] = oldPosq[i];
+            newVelm[index] = oldVelm[i];
+            newCellOffsets[index] = posCellOffsets[i];
+        }
+        getPosq().upload(newPosq);
+        getVelm().upload(newVelm);
+    }
+
+    // The atoms are now back in their original order, so the index map is the identity.
+
+    for (int i = 0; i < numAtoms; i++) {
+        atomIndex[i] = i;
+        posCellOffsets[i] = newCellOffsets[i];
+    }
+    getAtomIndexArray().upload(atomIndex);
+    findMoleculeGroups();
+    for (auto listener : reorderListeners)
+        listener->execute();
+    reorderAtoms();
+    return true;
+}
+
+/**
+ * Reorder the atoms to improve spatial locality, but only when there are atoms, a cutoff
+ * is in use, and at least 250 calls have passed since the last reordering.  Otherwise
+ * just advance the call counter.  atomsWereReordered records which of the two happened.
+ */
+void ComputeContext::reorderAtoms() {
+    atomsWereReordered = false;
+    const bool timeToReorder = (numAtoms != 0 && getNonbondedUtilities().getUseCutoff() && stepsSinceReorder >= 250);
+    if (!timeToReorder) {
+        stepsSinceReorder++;
+        return;
+    }
+    stepsSinceReorder = 0;
+    atomsWereReordered = true;
+
+    // Select the instantiation that matches the precision mode of the context.
+
+    if (getUseDoublePrecision()) {
+        reorderAtomsImpl<double, mm_double4, double, mm_double4>();
+        return;
+    }
+    if (getUseMixedPrecision()) {
+        reorderAtomsImpl<float, mm_float4, double, mm_double4>();
+        return;
+    }
+    reorderAtomsImpl<float, mm_float4, float, mm_float4>();
+}
+
+/**
+ * Reorder the atoms so that molecules which are close together in space are also close
+ * together in the arrays.  Only molecules within the same group of identical molecules
+ * are exchanged with each other.
+ *
+ * Template parameters: Real/Real4 are the scalar and four component types used for
+ * positions, Mixed/Mixed4 the ones used for velocities (see the instantiations selected
+ * in reorderAtoms()).
+ *
+ * Throws OpenMMException if the x coordinate of a molecule center is NaN.
+ */
+template <class Real, class Real4, class Mixed, class Mixed4>
+void ComputeContext::reorderAtomsImpl() {
+
+    // Find the range of positions and the number of bins along each axis.
+
+    vector<Real4> oldPosq(paddedNumAtoms);
+    vector<Real4> oldPosqCorrection(paddedNumAtoms);
+    vector<Mixed4> oldVelm(paddedNumAtoms);
+    getPosq().download(oldPosq);
+    getVelm().download(oldVelm);
+    if (getUseMixedPrecision())
+        getPosqCorrection().download(oldPosqCorrection);
+    Real minx = oldPosq[0].x, maxx = oldPosq[0].x;
+    Real miny = oldPosq[0].y, maxy = oldPosq[0].y;
+    Real minz = oldPosq[0].z, maxz = oldPosq[0].z;
+    Vec3 periodicBoxX, periodicBoxY, periodicBoxZ;
+    getPeriodicBoxVectors(periodicBoxX, periodicBoxY, periodicBoxZ);
+    Vec3 invPeriodicBoxSize(1.0/periodicBoxX[0], 1.0/periodicBoxY[1], 1.0/periodicBoxZ[2]);
+    if (getNonbondedUtilities().getUsePeriodic()) {
+        // With periodic boundaries the range is taken from the diagonal of the box vectors.
+        minx = miny = minz = 0.0;
+        maxx = periodicBoxX[0];
+        maxy = periodicBoxY[1];
+        maxz = periodicBoxZ[2];
+    }
+    else {
+        // Otherwise use the bounding box of the current atom positions.
+        for (int i = 1; i < numAtoms; i++) {
+            const Real4& pos = oldPosq[i];
+            minx = min(minx, pos.x);
+            maxx = max(maxx, pos.x);
+            miny = min(miny, pos.y);
+            maxy = max(maxy, pos.y);
+            minz = min(minz, pos.z);
+            maxz = max(maxz, pos.z);
+        }
+    }
+
+    // Loop over each group of identical molecules and reorder them.
+
+    
+    vector<int> originalIndex(numAtoms);
+    vector<Real4> newPosq(paddedNumAtoms, Real4(0,0,0,0));
+    vector<Real4> newPosqCorrection(paddedNumAtoms, Real4(0,0,0,0));
+    vector<Mixed4> newVelm(paddedNumAtoms, Mixed4(0,0,0,0));
+    vector<mm_int4> newCellOffsets(numAtoms);
+    for (auto& mol : moleculeGroups) {
+        // Find the center of each molecule.
+
+        int numMolecules = mol.offsets.size();
+        vector<int>& atoms = mol.atoms;
+        vector<Real4> molPos(numMolecules);
+        Real invNumAtoms = (Real) (1.0/atoms.size());
+        for (int i = 0; i < numMolecules; i++) {
+            molPos[i].x = 0.0f;
+            molPos[i].y = 0.0f;
+            molPos[i].z = 0.0f;
+            for (int j = 0; j < (int)atoms.size(); j++) {
+                int atom = atoms[j]+mol.offsets[i];
+                const Real4& pos = oldPosq[atom];
+                molPos[i].x += pos.x;
+                molPos[i].y += pos.y;
+                molPos[i].z += pos.z;
+            }
+            molPos[i].x *= invNumAtoms;
+            molPos[i].y *= invNumAtoms;
+            molPos[i].z *= invNumAtoms;
+            // A value that is not equal to itself is NaN.  Only the x component is tested.
+            if (molPos[i].x != molPos[i].x)
+                throw OpenMMException("Particle coordinate is nan");
+        }
+        if (getNonbondedUtilities().getUsePeriodic()) {
+            // Move each molecule position into the same box.
+
+            for (int i = 0; i < numMolecules; i++) {
+                Real4 center = molPos[i];
+                // Wrap along z first, then y, then x; each step subtracts whole box vectors.
+                int zcell = (int) floor(center.z*invPeriodicBoxSize[2]);
+                center.x -= zcell*periodicBoxZ[0];
+                center.y -= zcell*periodicBoxZ[1];
+                center.z -= zcell*periodicBoxZ[2];
+                int ycell = (int) floor(center.y*invPeriodicBoxSize[1]);
+                center.x -= ycell*periodicBoxY[0];
+                center.y -= ycell*periodicBoxY[1];
+                int xcell = (int) floor(center.x*invPeriodicBoxSize[0]);
+                center.x -= xcell*periodicBoxX[0];
+                if (xcell != 0 || ycell != 0 || zcell != 0) {
+                    // The center moved, so shift every atom of the molecule by the same
+                    // amount and record the change in its cell offsets.
+                    Real dx = molPos[i].x-center.x;
+                    Real dy = molPos[i].y-center.y;
+                    Real dz = molPos[i].z-center.z;
+                    molPos[i] = center;
+                    for (int j = 0; j < (int) atoms.size(); j++) {
+                        int atom = atoms[j]+mol.offsets[i];
+                        Real4 p = oldPosq[atom];
+                        p.x -= dx;
+                        p.y -= dy;
+                        p.z -= dz;
+                        oldPosq[atom] = p;
+                        posCellOffsets[atom].x -= xcell;
+                        posCellOffsets[atom].y -= ycell;
+                        posCellOffsets[atom].z -= zcell;
+                    }
+                }
+            }
+        }
+
+        // Select a bin for each molecule, then sort them by bin.
+
+        bool useHilbert = (numMolecules > 5000 || atoms.size() > 8); // For small systems, a simple zigzag curve works better than a Hilbert curve.
+        Real binWidth;
+        if (useHilbert)
+            binWidth = (Real) (max(max(maxx-minx, maxy-miny), maxz-minz)/255.0);
+        else
+            binWidth = (Real) (0.2*getNonbondedUtilities().getMaxCutoffDistance());
+        Real invBinWidth = (Real) (1.0/binWidth);
+        int xbins = 1 + (int) ((maxx-minx)*invBinWidth);
+        int ybins = 1 + (int) ((maxy-miny)*invBinWidth);
+        vector<pair<int, int> > molBins(numMolecules);
+        bitmask_t coords[3];
+        for (int i = 0; i < numMolecules; i++) {
+            int x = (int) ((molPos[i].x-minx)*invBinWidth);
+            int y = (int) ((molPos[i].y-miny)*invBinWidth);
+            int z = (int) ((molPos[i].z-minz)*invBinWidth);
+            int bin;
+            if (useHilbert) {
+                coords[0] = x;
+                coords[1] = y;
+                coords[2] = z;
+                bin = (int) hilbert_c2i(3, 8, coords);
+            }
+            else {
+                // Zigzag ordering: the direction along y flips for odd z, and the direction
+                // along x flips for odd y.
+                int yodd = y&1;
+                int zodd = z&1;
+                bin = z*xbins*ybins;
+                bin += (zodd ? ybins-y : y)*xbins;
+                bin += (yodd ? xbins-x : x);
+            }
+            molBins[i] = pair<int, int>(bin, i);
+        }
+        sort(molBins.begin(), molBins.end());
+
+        // Reorder the atoms.
+
+        for (int i = 0; i < numMolecules; i++) {
+            for (int atom : atoms) {
+                // The molecule that sorted into position i is copied into the i'th slot of the group.
+                int oldIndex = mol.offsets[molBins[i].second]+atom;
+                int newIndex = mol.offsets[i]+atom;
+                originalIndex[newIndex] = atomIndex[oldIndex];
+                newPosq[newIndex] = oldPosq[oldIndex];
+                if (getUseMixedPrecision())
+                    newPosqCorrection[newIndex] = oldPosqCorrection[oldIndex];
+                newVelm[newIndex] = oldVelm[oldIndex];
+                newCellOffsets[newIndex] = posCellOffsets[oldIndex];
+            }
+        }
+    }
+
+    // Update the arrays.
+
+    for (int i = 0; i < numAtoms; i++) {
+        atomIndex[i] = originalIndex[i];
+        posCellOffsets[i] = newCellOffsets[i];
+    }
+    getPosq().upload(newPosq);
+    if (getUseMixedPrecision())
+        getPosqCorrection().upload(newPosqCorrection);
+    getVelm().upload(newVelm);
+    getAtomIndexArray().upload(atomIndex);
+    // Let every registered listener react to the new atom order.
+    for (auto listener : reorderListeners)
+        listener->execute();
+}
+
+/**
+ * Register a listener.  The registered listeners have execute() called on them after the
+ * atoms have been reordered.
+ */
+void ComputeContext::addReorderListener(ReorderListener* listener) {
+    reorderListeners.emplace_back(listener);
+}
+
+/** Append a computation to the end of the preComputations list. */
+void ComputeContext::addPreComputation(ForcePreComputation* computation) {
+    preComputations.emplace_back(computation);
+}
+
+/** Append a computation to the end of the postComputations list. */
+void ComputeContext::addPostComputation(ForcePostComputation* computation) {
+    postComputations.emplace_back(computation);
+}
+
+/**
+ * The state shared between a WorkThread object and the thread it starts.  Every member is
+ * a reference to a field owned by the WorkThread, so this object only passes them along.
+ */
+struct ComputeContext::WorkThread::ThreadData {
+    ThreadData(std::queue<ComputeContext::WorkTask*>& taskQueue, bool& isWaiting,  bool& isFinished,
+            pthread_mutex_t& lock, pthread_cond_t& taskAvailable, pthread_cond_t& queueDrained) :
+        tasks(taskQueue), waiting(isWaiting), finished(isFinished), queueLock(lock),
+        waitForTaskCondition(taskAvailable), queueEmptyCondition(queueDrained) {
+    }
+    // Tasks that have been queued but not yet started.
+    std::queue<ComputeContext::WorkTask*>& tasks;
+    // Set by the worker when it finds the queue empty, cleared when a task is queued or taken.
+    bool& waiting;
+    // Set by the WorkThread destructor to ask the worker to exit.
+    bool& finished;
+    // Mutex used together with both condition variables.
+    pthread_mutex_t& queueLock;
+    // Signaled when a task is queued or when the thread is asked to finish.
+    pthread_cond_t& waitForTaskCondition;
+    // Signaled by the worker when it has nothing left to do.
+    pthread_cond_t& queueEmptyCondition;
+};
+
+/**
+ * Entry point of the worker thread.  It repeatedly takes the next task from the queue,
+ * executes it and deletes it, sleeping on waitForTaskCondition while the queue is empty.
+ * It exits once `finished` has been set and the queue has been drained.
+ *
+ * All accesses to the shared state (`tasks`, `waiting`, `finished`) happen while holding
+ * queueLock.  In particular the final "nothing left to do" notification is sent under
+ * the lock, so a thread blocked in flush() cannot miss it.
+ *
+ * @param args  pointer to a heap allocated ThreadData; this function takes ownership of it
+ * @return always 0
+ */
+static void* threadBody(void* args) {
+    ComputeContext::WorkThread::ThreadData& data = *reinterpret_cast<ComputeContext::WorkThread::ThreadData*>(args);
+    bool done = false;
+    while (!done) {
+        pthread_mutex_lock(&data.queueLock);
+        while (data.tasks.empty() && !data.finished) {
+            data.waiting = true;
+            pthread_cond_signal(&data.queueEmptyCondition);
+            pthread_cond_wait(&data.waitForTaskCondition, &data.queueLock);
+        }
+        ComputeContext::WorkTask* task = NULL;
+        if (!data.tasks.empty()) {
+            data.waiting = false;
+            task = data.tasks.front();
+            data.tasks.pop();
+        }
+        else {
+            // The queue is empty and we have been asked to finish.  Announce that we are
+            // idle while still holding the lock, then leave the loop.
+            data.waiting = true;
+            pthread_cond_signal(&data.queueEmptyCondition);
+            done = true;
+        }
+        pthread_mutex_unlock(&data.queueLock);
+
+        // Run the task outside the lock so new tasks can be queued in the meantime.
+        if (task != NULL) {
+            task->execute();
+            delete task;
+        }
+    }
+    delete &data;
+    return 0;
+}
+
+/**
+ * Create the synchronization primitives and start the worker thread, which runs
+ * threadBody() on a ThreadData that refers to this object's fields.
+ *
+ * Throws OpenMMException if the thread cannot be created.  In that case everything
+ * allocated here is released first, because the destructor will not run.
+ */
+ComputeContext::WorkThread::WorkThread() : waiting(true), finished(false) {
+    pthread_mutex_init(&queueLock, NULL);
+    pthread_cond_init(&waitForTaskCondition, NULL);
+    pthread_cond_init(&queueEmptyCondition, NULL);
+    ThreadData* data = new ThreadData(tasks, waiting, finished, queueLock, waitForTaskCondition, queueEmptyCondition);
+    if (pthread_create(&thread, NULL, threadBody, data) != 0) {
+        // No thread took ownership of the data, so clean up here.
+        delete data;
+        pthread_mutex_destroy(&queueLock);
+        pthread_cond_destroy(&waitForTaskCondition);
+        pthread_cond_destroy(&queueEmptyCondition);
+        throw OpenMMException("Failed to create worker thread");
+    }
+}
+
+/**
+ * Ask the worker thread to exit, wait for it, and release the synchronization primitives.
+ * The worker keeps running tasks until the queue is empty, so tasks that are still queued
+ * are executed before the join returns.
+ */
+ComputeContext::WorkThread::~WorkThread() {
+    // Set the flag under the lock and wake the worker in case it is waiting for a task.
+    pthread_mutex_lock(&queueLock);
+    finished = true;
+    pthread_cond_broadcast(&waitForTaskCondition);
+    pthread_mutex_unlock(&queueLock);
+    pthread_join(thread, NULL);
+    // The worker has exited, so nothing else uses these any more.
+    pthread_mutex_destroy(&queueLock);
+    pthread_cond_destroy(&waitForTaskCondition);
+    pthread_cond_destroy(&queueEmptyCondition);
+}
+
+/**
+ * Queue a task for the worker thread and wake it up.
+ *
+ * @param task  the task to run; the worker thread deletes it after calling execute()
+ */
+void ComputeContext::WorkThread::addTask(ComputeContext::WorkTask* task) {
+    pthread_mutex_lock(&queueLock);
+    tasks.push(task);
+    // Clear the flag right away so that flush() blocks until this task has been processed.
+    waiting = false;
+    pthread_cond_signal(&waitForTaskCondition);
+    pthread_mutex_unlock(&queueLock);
+}
+
+/**
+ * Get the current value of the `waiting` flag.  The flag is read without taking
+ * queueLock, so the result may already be out of date when it is returned.
+ */
+bool ComputeContext::WorkThread::isWaiting() {
+    return waiting;
+}
+
+/**
+ * Get the current value of the `finished` flag, which the destructor sets when it asks
+ * the worker thread to exit.  The flag is read without taking queueLock.
+ */
+bool ComputeContext::WorkThread::isFinished() {
+    return finished;
+}
+
+/**
+ * Block the calling thread until the `waiting` flag is set, which the worker does when
+ * it finds the task queue empty.
+ */
+void ComputeContext::WorkThread::flush() {
+    pthread_mutex_lock(&queueLock);
+    // Re-check the flag after every wake-up, since condition variables may wake spuriously.
+    while (!waiting)
+       pthread_cond_wait(&queueEmptyCondition, &queueLock);
+    pthread_mutex_unlock(&queueLock);
+}
--- a/platforms/opencl/src/OpenCLForceInfo.cpp
+++ b/platforms/opencl/src/OpenCLForceInfo.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Portions copyright (c) 2012-2019 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -24,23 +24,23 @@
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
 * -------------------------------------------------------------------------- */

-#include "OpenCLForceInfo.h"
+#include "openmm/common/ComputeForceInfo.h"

 using namespace OpenMM;
 using namespace std;

-bool OpenCLForceInfo::areParticlesIdentical(int particle1, int particle2) {
+// Always returns true: this implementation treats every pair of particles as identical.
+bool ComputeForceInfo::areParticlesIdentical(int particle1, int particle2) {
    return true;
 }

-int OpenCLForceInfo::getNumParticleGroups() {
+// Always returns 0: this implementation reports no particle groups.
+int ComputeForceInfo::getNumParticleGroups() {
    return 0;
 }

-void OpenCLForceInfo::getParticlesInGroup(int index, vector<int>& particles) {
+// Does nothing: the output vector is left exactly as the caller passed it in.
+void ComputeForceInfo::getParticlesInGroup(int index, vector<int>& particles) {
    return;
 }

-bool OpenCLForceInfo::areGroupsIdentical(int group1, int group2) {
+// Always returns true: this implementation treats every pair of groups as identical.
+bool ComputeForceInfo::areGroupsIdentical(int group1, int group2) {
    return true;
 }
--- a/platforms/common/src/ComputeParameterSet.cpp
+++ b/platforms/common/src/ComputeParameterSet.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009-2019 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "openmm/common/ComputeParameterSet.h"
+#include "openmm/OpenMMException.h"
+#include <cmath>
+#include <sstream>
+
+using namespace OpenMM;
+using namespace std;
+
+/**
+ * Create the arrays that store numParameters values for each of numObjects objects.
+ *
+ * Unless arrayPerParameter is set, the parameters are packed four to an array element
+ * while more than two remain, then two to an element if at least two remain.  Whatever
+ * is left gets one array per parameter.  The arrays are named "param1", "param2", ... in
+ * the order they are created, and one ComputeParameterInfo is recorded for each.
+ *
+ * @param context             the context used to create and initialize the arrays
+ * @param numParameters       the number of parameter values per object
+ * @param numObjects          the number of objects, which is the length of every array
+ * @param name                a name for this set, stored for use in error messages
+ * @param arrayPerParameter   if true, never pack several parameters into one element
+ * @param useDoublePrecision  if true values are stored as double, otherwise as float
+ */
+ComputeParameterSet::ComputeParameterSet(ComputeContext& context, int numParameters, int numObjects, const string& name, bool arrayPerParameter, bool useDoublePrecision) :
+            context(context), numParameters(numParameters), numObjects(numObjects), name(name) {
+    elementSize = (useDoublePrecision ? sizeof(double) : sizeof(float));
+    string elementType = (useDoublePrecision ? "double" : "float");
+    int remaining = numParameters;
+    int arrayCount = 0;
+
+    // Creates one more array whose elements each hold `width` values.
+    auto appendArray = [&] (int width) {
+        std::stringstream label;
+        label << "param" << (++arrayCount);
+        arrays.push_back(context.createArray());
+        arrays.back()->initialize(context, numObjects, elementSize*width, label.str());
+        remaining -= width;
+    };
+    if (!arrayPerParameter) {
+        // A four wide element is used even for three parameters, leaving one slot unused.
+        while (remaining > 2)
+            appendArray(4);
+        if (remaining > 1)
+            appendArray(2);
+    }
+    while (remaining > 0)
+        appendArray(1);
+    for (ArrayInterface* array : arrays)
+        parameters.push_back(ComputeParameterInfo(*array, array->getName(), elementType, array->getElementSize()/elementSize));
+}
+
+/** Delete every array that was created by the constructor. */
+ComputeParameterSet::~ComputeParameterSet() {
+    for (int i = 0; i < (int) arrays.size(); i++)
+        delete arrays[i];
+}
+
+/**
+ * Download all parameter values.
+ *
+ * @param values  on exit, values[i][j] holds parameter j of object i; the vector is
+ *                resized to numObjects rows of numParameters entries
+ *
+ * Throws OpenMMException if sizeof(T) does not match the element size of this set, or if
+ * an array has an element size that is not 1, 2 or 4 values wide.
+ */
+template <class T>
+void ComputeParameterSet::getParameterValues(vector<vector<T> >& values) {
+    if (sizeof(T) != elementSize)
+        throw OpenMMException("Called getParameterValues() with vector of wrong type");
+    values.resize(numObjects);
+    for (vector<T>& row : values)
+        row.resize(numParameters);
+    int firstParam = 0;
+    for (ArrayInterface* array : arrays) {
+        // Work out how many values are packed into each element of this array.
+        int width;
+        if (array->getElementSize() == 4*elementSize)
+            width = 4;
+        else if (array->getElementSize() == 2*elementSize)
+            width = 2;
+        else if (array->getElementSize() == elementSize)
+            width = 1;
+        else
+            throw OpenMMException("Internal error: Unknown buffer type in ComputeParameterSet");
+        vector<T> packed(width*numObjects);
+        array->download(packed.data());
+        for (int obj = 0; obj < numObjects; obj++) {
+            values[obj][firstParam] = packed[width*obj];
+            // The last array may have unused trailing slots, which are skipped.
+            for (int slot = 1; slot < width; slot++)
+                if (firstParam+slot < numParameters)
+                    values[obj][firstParam+slot] = packed[width*obj+slot];
+        }
+        firstParam += width;
+    }
+}
+
+/**
+ * Upload new parameter values.
+ *
+ * @param values  values[i][j] is parameter j of object i
+ *
+ * Throws OpenMMException if sizeof(T) does not match the element size of this set, or if
+ * an array has an element size that is not 1, 2 or 4 values wide.
+ */
+template <class T>
+void ComputeParameterSet::setParameterValues(const vector<vector<T> >& values) {
+    if (sizeof(T) != elementSize)
+        throw OpenMMException("Called setParameterValues() with vector of wrong type");
+    int firstParam = 0;
+    for (ArrayInterface* array : arrays) {
+        // Work out how many values are packed into each element of this array.
+        int width;
+        if (array->getElementSize() == 4*elementSize)
+            width = 4;
+        else if (array->getElementSize() == 2*elementSize)
+            width = 2;
+        else if (array->getElementSize() == elementSize)
+            width = 1;
+        else
+            throw OpenMMException("Internal error: Unknown buffer type in ComputeParameterSet");
+
+        // Slots that have no parameter keep the value the vector was initialized with.
+        vector<T> packed(width*numObjects);
+        for (int obj = 0; obj < numObjects; obj++) {
+            packed[width*obj] = values[obj][firstParam];
+            for (int slot = 1; slot < width; slot++)
+                if (firstParam+slot < numParameters)
+                    packed[width*obj+slot] = values[obj][firstParam+slot];
+        }
+        array->upload(packed.data());
+        firstParam += width;
+    }
+}
+
+/**
+ * Build the suffix that identifies where one parameter is stored: the one based number
+ * of the array that holds it, followed by extraSuffix, followed by the component
+ * (".x", ".y", ".z" or ".w") if that array packs more than one value per element.
+ *
+ * @param index        the index of the parameter
+ * @param extraSuffix  text to insert between the array number and the component
+ * @return the suffix string
+ *
+ * Throws OpenMMException if index does not refer to a parameter stored in this set,
+ * which includes negative indices.
+ */
+string ComputeParameterSet::getParameterSuffix(int index, const std::string& extraSuffix) const {
+    const string suffixes[] = {".x", ".y", ".z", ".w"};
+    int buffer = -1;
+
+    // A negative index would satisfy the range test for the first array and then be used
+    // to index suffixes[], so it is rejected up front by skipping the search.
+
+    if (index >= 0) {
+        for (int i = 0; buffer == -1 && i < (int) parameters.size(); i++) {
+            if (index*elementSize < parameters[i].getSize())
+                buffer = i;
+            else
+                index -= parameters[i].getSize()/elementSize;
+        }
+    }
+    if (buffer == -1)
+        throw OpenMMException("Internal error: Illegal argument to ComputeParameterSet::getParameterSuffix() ("+name+")");
+
+    // At this point index is the position of the parameter within the selected array.
+
+    stringstream suffix;
+    suffix << (buffer+1) << extraSuffix;
+    if (parameters[buffer].getSize() != elementSize)
+        suffix << suffixes[index];
+    return suffix.str();
+}
+
+/**
+ * Define template instantiations for float and double versions of getParameterValues() and setParameterValues().
+ * The templates are defined in this source file, so these explicit instantiations are what
+ * makes the float and double versions available to code in other files.
+ */
+namespace OpenMM {
+template void ComputeParameterSet::getParameterValues<float>(vector<vector<float> >& values);
+template void ComputeParameterSet::setParameterValues<float>(const vector<vector<float> >& values);
+template void ComputeParameterSet::getParameterValues<double>(vector<vector<double> >& values);
+template void ComputeParameterSet::setParameterValues<double>(const vector<vector<double> >& values);
+}
\ No newline at end of file
--- a/platforms/cuda/src/CudaExpressionUtilities.cpp
+++ b/platforms/cuda/src/CudaExpressionUtilities.cpp
@@ -24,7 +24,7 @@
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
 * -------------------------------------------------------------------------- */

-#include "CudaExpressionUtilities.h"
+#include "openmm/common/ExpressionUtilities.h"
 #include "openmm/OpenMMException.h"
 #include "openmm/internal/SplineFitter.h"
 #include "lepton/Operation.h"
@@ -33,10 +33,10 @@ using namespace OpenMM;
 using namespace Lepton;
 using namespace std;

-CudaExpressionUtilities::CudaExpressionUtilities(CudaContext& context) : context(context), fp1(1), fp2(2), fp3(3), periodicDistance(6) {
+ExpressionUtilities::ExpressionUtilities(ComputeContext& context) : context(context), fp1(1), fp2(2), fp3(3), periodicDistance(6) {
 }

-string CudaExpressionUtilities::createExpressions(const map<string, ParsedExpression>& expressions, const map<string, string>& variables,
+string ExpressionUtilities::createExpressions(const map<string, ParsedExpression>& expressions, const map<string, string>& variables,
        const vector<const TabulatedFunction*>& functions, const vector<pair<string, string> >& functionNames, const string& prefix, const string& tempType) {
    vector<pair<ExpressionTreeNode, string> > variableNodes;
    for (map<string, string>::const_iterator iter = variables.begin(); iter != variables.end(); ++iter)
@@ -44,7 +44,7 @@ string CudaExpressionUtilities::createExpressions(const map<string, ParsedExpres
    return createExpressions(expressions, variableNodes, functions, functionNames, prefix, tempType);
 }

-string CudaExpressionUtilities::createExpressions(const map<string, ParsedExpression>& expressions, const vector<pair<ExpressionTreeNode, string> >& variables,
+string ExpressionUtilities::createExpressions(const map<string, ParsedExpression>& expressions, const vector<pair<ExpressionTreeNode, string> >& variables,
        const vector<const TabulatedFunction*>& functions, const vector<pair<string, string> >& functionNames, const string& prefix, const string& tempType) {
    stringstream out;
    vector<ParsedExpression> allExpressions;
@@ -59,7 +59,7 @@ string CudaExpressionUtilities::createExpressions(const map<string, ParsedExpres
    return out.str();
 }

-void CudaExpressionUtilities::processExpression(stringstream& out, const ExpressionTreeNode& node, vector<pair<ExpressionTreeNode, string> >& temps,
+void ExpressionUtilities::processExpression(stringstream& out, const ExpressionTreeNode& node, vector<pair<ExpressionTreeNode, string> >& temps,
        const vector<const TabulatedFunction*>& functions, const vector<pair<string, string> >& functionNames, const string& prefix, const vector<vector<double> >& functionParams,
        const vector<ParsedExpression>& allExpressions, const string& tempType) {
    for (int i = 0; i < (int) temps.size(); i++)
@@ -662,7 +662,7 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
        temps.push_back(make_pair(node, name));
 }

-string CudaExpressionUtilities::getTempName(const ExpressionTreeNode& node, const vector<pair<ExpressionTreeNode, string> >& temps) {
+string ExpressionUtilities::getTempName(const ExpressionTreeNode& node, const vector<pair<ExpressionTreeNode, string> >& temps) {
    for (int i = 0; i < (int) temps.size(); i++)
        if (temps[i].first == node)
            return temps[i].second;
@@ -671,7 +671,7 @@ string CudaExpressionUtilities::getTempName(const ExpressionTreeNode& node, cons
    throw OpenMMException(out.str());
 }

-void CudaExpressionUtilities::findRelatedCustomFunctions(const ExpressionTreeNode& node, const ExpressionTreeNode& searchNode,
+void ExpressionUtilities::findRelatedCustomFunctions(const ExpressionTreeNode& node, const ExpressionTreeNode& searchNode,
            vector<const Lepton::ExpressionTreeNode*>& nodes) {
    if (searchNode.getOperation().getId() == Operation::CUSTOM && node.getOperation().getName() == searchNode.getOperation().getName()) {
        // Make sure the arguments are identical.
@@ -695,7 +695,7 @@ void CudaExpressionUtilities::findRelatedCustomFunctions(const ExpressionTreeNod
            findRelatedCustomFunctions(node, searchNode.getChildren()[i], nodes);
 }

-void CudaExpressionUtilities::findRelatedPowers(const ExpressionTreeNode& node, const ExpressionTreeNode& searchNode, map<int, const ExpressionTreeNode*>& powers) {
+void ExpressionUtilities::findRelatedPowers(const ExpressionTreeNode& node, const ExpressionTreeNode& searchNode, map<int, const ExpressionTreeNode*>& powers) {
    if (searchNode.getOperation().getId() == Operation::POWER_CONSTANT && node.getChildren()[0] == searchNode.getChildren()[0]) {
        double realPower = dynamic_cast<const Operation::PowerConstant*>(&searchNode.getOperation())->getValue();
        int power = (int) realPower;
@@ -712,7 +712,7 @@ void CudaExpressionUtilities::findRelatedPowers(const ExpressionTreeNode& node,
            findRelatedPowers(node, searchNode.getChildren()[i], powers);
 }

-vector<float> CudaExpressionUtilities::computeFunctionCoefficients(const TabulatedFunction& function, int& width) {
+vector<float> ExpressionUtilities::computeFunctionCoefficients(const TabulatedFunction& function, int& width) {
    if (dynamic_cast<const Continuous1DFunction*>(&function) != NULL) {
        // Compute the spline coefficients.

@@ -827,7 +827,7 @@ vector<float> CudaExpressionUtilities::computeFunctionCoefficients(const Tabulat
    throw OpenMMException("computeFunctionCoefficients: Unknown function type");
 }

-vector<vector<double> > CudaExpressionUtilities::computeFunctionParameters(const vector<const TabulatedFunction*>& functions) {
+vector<vector<double> > ExpressionUtilities::computeFunctionParameters(const vector<const TabulatedFunction*>& functions) {
    vector<vector<double> > params(functions.size());
    for (int i = 0; i < (int) functions.size(); i++) {
        if (dynamic_cast<const Continuous1DFunction*>(functions[i]) != NULL) {
@@ -903,7 +903,7 @@ vector<vector<double> > CudaExpressionUtilities::computeFunctionParameters(const
    return params;
 }

-Lepton::CustomFunction* CudaExpressionUtilities::getFunctionPlaceholder(const TabulatedFunction& function) {
+Lepton::CustomFunction* ExpressionUtilities::getFunctionPlaceholder(const TabulatedFunction& function) {
    if (dynamic_cast<const Continuous1DFunction*>(&function) != NULL)
        return &fp1;
    if (dynamic_cast<const Continuous2DFunction*>(&function) != NULL)
@@ -919,11 +919,11 @@ Lepton::CustomFunction* CudaExpressionUtilities::getFunctionPlaceholder(const Ta
    throw OpenMMException("getFunctionPlaceholder: Unknown function type");
 }

-Lepton::CustomFunction* CudaExpressionUtilities::getPeriodicDistancePlaceholder() {
+Lepton::CustomFunction* ExpressionUtilities::getPeriodicDistancePlaceholder() {
    return &periodicDistance;
 }

-void CudaExpressionUtilities::callFunction(stringstream& out, string singleFn, string doubleFn, const string& arg, const string& tempType) {
+void ExpressionUtilities::callFunction(stringstream& out, string singleFn, string doubleFn, const string& arg, const string& tempType) {
    bool isDouble = (tempType[0] == 'd');
    bool isVector = (tempType[tempType.size()-1] == '3');
    string fn = (isDouble ? doubleFn : singleFn);
@@ -933,7 +933,7 @@ void CudaExpressionUtilities::callFunction(stringstream& out, string singleFn, s
        out<<fn<<"("<<arg<<")";
 }

-void CudaExpressionUtilities::callFunction2(stringstream& out, string singleFn, string doubleFn, const string& arg1, const string& arg2, const string& tempType) {
+void ExpressionUtilities::callFunction2(stringstream& out, string singleFn, string doubleFn, const string& arg1, const string& arg2, const string& tempType) {
    bool isDouble = (tempType[0] == 'd');
    bool isVector = (tempType[tempType.size()-1] == '3');
    string fn = (isDouble ? doubleFn : singleFn);

--- a/platforms/common/src/IntegrationUtilities.cpp
+++ b/platforms/common/src/IntegrationUtilities.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009-2019 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "openmm/common/IntegrationUtilities.h"
+#include "openmm/common/ComputeContext.h"
+#include "CommonKernelSources.h"
+#include "openmm/internal/OSRngSeed.h"
+#include "openmm/HarmonicAngleForce.h"
+#include "openmm/VirtualSite.h"
+#include "quern.h"
+#include "ReferenceCCMAAlgorithm.h"
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+#include <map>
+
+using namespace OpenMM;
+using namespace std;
+
+/**
+ * A candidate SHAKE cluster: one central atom constrained to up to three
+ * peripheral atoms, all at the same distance and with the same peripheral
+ * inverse mass.  A cluster that cannot satisfy those conditions is flagged
+ * by clearing `valid`.
+ */
+struct IntegrationUtilities::ShakeCluster {
+    int centralID;          // index of the central atom (-1 until one is assigned)
+    int peripheralID[3];    // indices of the peripheral atoms; only the first `size` entries are meaningful
+    int size;               // number of peripheral atoms recorded so far
+    bool valid;             // false once the cluster has been rejected
+    double distance;        // constraint distance shared by every peripheral atom
+    double centralInvMass, peripheralInvMass;
+
+    /**
+     * Create an empty placeholder cluster.  Every member is given a defined
+     * value so that a default-constructed cluster (e.g. one created by
+     * map::operator[]) never exposes indeterminate data.
+     */
+    ShakeCluster() : centralID(-1), size(0), valid(true), distance(0.0), centralInvMass(0.0), peripheralInvMass(0.0) {
+        peripheralID[0] = peripheralID[1] = peripheralID[2] = -1;
+    }
+
+    /**
+     * Create an empty cluster around a central atom.
+     *
+     * @param centralID  index of the central atom
+     * @param invMass    inverse mass of the central atom
+     */
+    ShakeCluster(int centralID, double invMass) : centralID(centralID), size(0), valid(true), distance(0.0), centralInvMass(invMass), peripheralInvMass(0.0) {
+        peripheralID[0] = peripheralID[1] = peripheralID[2] = -1;
+    }
+
+    /**
+     * Try to add a peripheral atom.  The cluster is marked invalid, and the
+     * atom is not recorded, if three atoms are already present or if the new
+     * distance or inverse mass differs from the stored one by more than a
+     * relative tolerance of 1e-8.
+     *
+     * @param id       index of the peripheral atom
+     * @param dist     constraint distance to the central atom
+     * @param invMass  inverse mass of the peripheral atom
+     */
+    void addAtom(int id, double dist, double invMass) {
+        // The comparisons only apply once a first atom has fixed the reference values.
+        bool mismatched = (size > 0 && (fabs(dist-distance)/distance > 1e-8 || fabs(invMass-peripheralInvMass)/peripheralInvMass > 1e-8));
+        if (size == 3 || mismatched)
+            valid = false;
+        else {
+            peripheralID[size++] = id;
+            distance = dist;
+            peripheralInvMass = invMass;
+        }
+    }
+
+    /**
+     * Mark this cluster invalid, flag all of its atoms in invalidForShake, and
+     * recursively invalidate any still-valid cluster whose central atom is one
+     * of this cluster's peripheral atoms.
+     *
+     * @param allClusters      every cluster found so far, keyed by central atom index
+     * @param invalidForShake  per-atom flags; set to true for each atom of an invalidated cluster
+     */
+    void markInvalid(map<int, ShakeCluster>& allClusters, vector<bool>& invalidForShake)
+    {
+        // Clearing `valid` first is what stops the recursion from revisiting this cluster.
+        valid = false;
+        invalidForShake[centralID] = true;
+        for (int i = 0; i < size; i++) {
+            invalidForShake[peripheralID[i]] = true;
+            map<int, ShakeCluster>::iterator otherCluster = allClusters.find(peripheralID[i]);
+            if (otherCluster != allClusters.end() && otherCluster->second.valid)
+                otherCluster->second.markInvalid(allClusters, invalidForShake);
+        }
+    }
+};
+
+/**
+ * Strict-weak-ordering comparator used to sort constraints.  The values being
+ * compared are positions in `constraints`; each position is mapped to a
+ * constraint index, and the constraints are ordered by their first atom and
+ * then by their second atom.
+ *
+ * The comparator holds references only, so the three vectors must outlive it.
+ * It deliberately does not derive from std::binary_function, which was
+ * deprecated in C++11 and removed in C++17.
+ */
+struct IntegrationUtilities::ConstraintOrderer {
+    const vector<int>& atom1;
+    const vector<int>& atom2;
+    const vector<int>& constraints;
+    ConstraintOrderer(const vector<int>& atom1, const vector<int>& atom2, const vector<int>& constraints) : atom1(atom1), atom2(atom2), constraints(constraints) {
+    }
+    /**
+     * @param x  position of the first constraint within `constraints`
+     * @param y  position of the second constraint within `constraints`
+     * @return true if constraint x should be ordered before constraint y
+     */
+    bool operator()(int x, int y) const {
+        int ix = constraints[x];
+        int iy = constraints[y];
+        if (atom1[ix] != atom1[iy])
+            return atom1[ix] < atom1[iy];
+        return atom2[ix] < atom2[iy];
+    }
+};
+
+IntegrationUtilities::IntegrationUtilities(ComputeContext& context, const System& system) : context(context),
+        randomPos(0), hasOverlappingVsites(false) {
+    // Create workspace arrays.
+
+    lastStepSize = mm_double2(0.0, 0.0);
+    if (context.getUseDoublePrecision() || context.getUseMixedPrecision()) {
+        posDelta.initialize<mm_double4>(context, context.getPaddedNumAtoms(), "posDelta");
+        vector<mm_double4> deltas(posDelta.getSize(), mm_double4(0.0, 0.0, 0.0, 0.0));
+        posDelta.upload(deltas);
+        stepSize.initialize<mm_double2>(context, 1, "stepSize");
+        stepSize.upload(&lastStepSize);
+    }
+    else {
+        posDelta.initialize<mm_float4>(context, context.getPaddedNumAtoms(), "posDelta");
+        vector<mm_float4> deltas(posDelta.getSize(), mm_float4(0.0f, 0.0f, 0.0f, 0.0f));
+        posDelta.upload(deltas);
+        stepSize.initialize<mm_float2>(context, 1, "stepSize");
+        mm_float2 lastStepSizeFloat = mm_float2(0.0f, 0.0f);
+        stepSize.upload(&lastStepSizeFloat);
+    }
+
+    // Record the set of constraints and how many constraints each atom is involved in.
+
+    vector<int> atom1;
+    vector<int> atom2;
+    vector<double> distance;
+    vector<int> constraintCount(context.getNumAtoms(), 0);
+    for (int i = 0; i < system.getNumConstraints(); i++) {
+        int p1, p2;
+        double d;
+        system.getConstraintParameters(i, p1, p2, d);
+        if (system.getParticleMass(p1) != 0 || system.getParticleMass(p2) != 0) {
+            atom1.push_back(p1);
+            atom2.push_back(p2);
+            distance.push_back(d);
+            constraintCount[p1]++;
+            constraintCount[p2]++;
+        }
+    }
+
+    // Identify clusters of three atoms that can be treated with SETTLE.  First, for every
+    // atom that might be part of such a cluster, make a list of the two other atoms it is
+    // connected to.
+
+    int numAtoms = system.getNumParticles();
+    vector<map<int, float> > settleConstraints(numAtoms);
+    for (int i = 0; i < (int)atom1.size(); i++) {
+        if (constraintCount[atom1[i]] == 2 && constraintCount[atom2[i]] == 2) {
+            settleConstraints[atom1[i]][atom2[i]] = (float) distance[i];
+            settleConstraints[atom2[i]][atom1[i]] = (float) distance[i];
+        }
+    }
+
+    // Now remove the ones that don't actually form closed loops of three atoms.
+
+    vector<int> settleClusters;
+    for (int i = 0; i < (int)settleConstraints.size(); i++) {
+        if (settleConstraints[i].size() == 2) {
+            int partner1 = settleConstraints[i].begin()->first;
+            int partner2 = (++settleConstraints[i].begin())->first;
+            if (settleConstraints[partner1].size() != 2 || settleConstraints[partner2].size() != 2 ||
+                    settleConstraints[partner1].find(partner2) == settleConstraints[partner1].end())
+                settleConstraints[i].clear();
+            else if (i < partner1 && i < partner2)
+                settleClusters.push_back(i);
+        }
+        else
+            settleConstraints[i].clear();
+    }
+
+    // Record the SETTLE clusters.
+
+    vector<bool> isShakeAtom(numAtoms, false);
+    if (settleClusters.size() > 0) {
+        vector<mm_int4> atoms;
+        vector<mm_float2> params;
+        for (int i = 0; i < (int) settleClusters.size(); i++) {
+            int atom1 = settleClusters[i];
+            int atom2 = settleConstraints[atom1].begin()->first;
+            int atom3 = (++settleConstraints[atom1].begin())->first;
+            float dist12 = settleConstraints[atom1].find(atom2)->second;
+            float dist13 = settleConstraints[atom1].find(atom3)->second;
+            float dist23 = settleConstraints[atom2].find(atom3)->second;
+            if (dist12 == dist13) {
+                // atom1 is the central atom
+                atoms.push_back(mm_int4(atom1, atom2, atom3, 0));
+                params.push_back(mm_float2(dist12, dist23));
+            }
+            else if (dist12 == dist23) {
+                // atom2 is the central atom
+                atoms.push_back(mm_int4(atom2, atom1, atom3, 0));
+                params.push_back(mm_float2(dist12, dist13));
+            }
+            else if (dist13 == dist23) {
+                // atom3 is the central atom
+                atoms.push_back(mm_int4(atom3, atom1, atom2, 0));
+                params.push_back(mm_float2(dist13, dist12));
+            }
+            else
+                continue; // We can't handle this with SETTLE
+            isShakeAtom[atom1] = true;
+            isShakeAtom[atom2] = true;
+            isShakeAtom[atom3] = true;
+        }
+        if (atoms.size() > 0) {
+            settleAtoms.initialize<mm_int4>(context, atoms.size(), "settleAtoms");
+            settleParams.initialize<mm_float2>(context, params.size(), "settleParams");
+            settleAtoms.upload(atoms);
+            settleParams.upload(params);
+        }
+    }
+
+    // Find clusters consisting of a central atom with up to three peripheral atoms.
+
+    map<int, ShakeCluster> clusters;
+    vector<bool> invalidForShake(numAtoms, false);
+    for (int i = 0; i < (int) atom1.size(); i++) {
+        if (isShakeAtom[atom1[i]])
+            continue; // This is being taken care of with SETTLE.
+
+        // Determine which is the central atom.
+
+        bool firstIsCentral;
+        if (constraintCount[atom1[i]] > 1)
+            firstIsCentral = true;
+        else if (constraintCount[atom2[i]] > 1)
+            firstIsCentral = false;
+        else if (atom1[i] < atom2[i])
+            firstIsCentral = true;
+        else
+            firstIsCentral = false;
+        int centralID, peripheralID;
+        if (firstIsCentral) {
+            centralID = atom1[i];
+            peripheralID = atom2[i];
+        }
+        else {
+            centralID = atom2[i];
+            peripheralID = atom1[i];
+        }
+
+        // Add it to the cluster.
+
+        if (clusters.find(centralID) == clusters.end()) {
+            clusters[centralID] = ShakeCluster(centralID, 1.0/system.getParticleMass(centralID));
+        }
+        ShakeCluster& cluster = clusters[centralID];
+        cluster.addAtom(peripheralID, distance[i], 1.0/system.getParticleMass(peripheralID));
+        if (constraintCount[peripheralID] != 1 || invalidForShake[atom1[i]] || invalidForShake[atom2[i]]) {
+            cluster.markInvalid(clusters, invalidForShake);
+            map<int, ShakeCluster>::iterator otherCluster = clusters.find(peripheralID);
+            if (otherCluster != clusters.end() && otherCluster->second.valid)
+                otherCluster->second.markInvalid(clusters, invalidForShake);
+        }
+    }
+    int validShakeClusters = 0;
+    for (map<int, ShakeCluster>::iterator iter = clusters.begin(); iter != clusters.end(); ++iter) {
+        ShakeCluster& cluster = iter->second;
+        if (cluster.valid) {
+            cluster.valid = !invalidForShake[cluster.centralID] && cluster.size == constraintCount[cluster.centralID];
+            for (int i = 0; i < cluster.size; i++)
+                if (invalidForShake[cluster.peripheralID[i]])
+                    cluster.valid = false;
+            if (cluster.valid)
+                ++validShakeClusters;
+        }
+    }
+
+    // Record the SHAKE clusters.
+
+    if (validShakeClusters > 0) {
+        vector<mm_int4> atoms;
+        vector<mm_float4> params;
+        int index = 0;
+        for (map<int, ShakeCluster>::const_iterator iter = clusters.begin(); iter != clusters.end(); ++iter) {
+            const ShakeCluster& cluster = iter->second;
+            if (!cluster.valid)
+                continue;
+            atoms.push_back(mm_int4(cluster.centralID, cluster.peripheralID[0], (cluster.size > 1 ? cluster.peripheralID[1] : -1), (cluster.size > 2 ? cluster.peripheralID[2] : -1)));
+            params.push_back(mm_float4((float) cluster.centralInvMass, (float) (0.5/(cluster.centralInvMass+cluster.peripheralInvMass)), (float) (cluster.distance*cluster.distance), (float) cluster.peripheralInvMass));
+            isShakeAtom[cluster.centralID] = true;
+            isShakeAtom[cluster.peripheralID[0]] = true;
+            if (cluster.size > 1)
+                isShakeAtom[cluster.peripheralID[1]] = true;
+            if (cluster.size > 2)
+                isShakeAtom[cluster.peripheralID[2]] = true;
+            ++index;
+        }
+        shakeAtoms.initialize<mm_int4>(context, atoms.size(), "shakeAtoms");
+        shakeParams.initialize<mm_float4>(context, params.size(), "shakeParams");
+        shakeAtoms.upload(atoms);
+        shakeParams.upload(params);
+    }
+
+    // Find connected constraints for CCMA.
+
+    vector<int> ccmaConstraints;
+    for (unsigned i = 0; i < atom1.size(); i++)
+        if (!isShakeAtom[atom1[i]])
+            ccmaConstraints.push_back(i);
+
+    // Record the connections between constraints.
+
+    int numCCMA = (int) ccmaConstraints.size();
+    if (numCCMA > 0) {
+        // Record information needed by ReferenceCCMAAlgorithm.
+        
+        vector<pair<int, int> > refIndices(numCCMA);
+        vector<double> refDistance(numCCMA);
+        for (int i = 0; i < numCCMA; i++) {
+            int index = ccmaConstraints[i];
+            refIndices[i] = make_pair(atom1[index], atom2[index]);
+            refDistance[i] = distance[index];
+        }
+        vector<double> refMasses(numAtoms);
+        for (int i = 0; i < numAtoms; ++i)
+            refMasses[i] = system.getParticleMass(i);
+
+        // Look up angles for CCMA.
+        
+        vector<ReferenceCCMAAlgorithm::AngleInfo> angles;
+        for (int i = 0; i < system.getNumForces(); i++) {
+            const HarmonicAngleForce* force = dynamic_cast<const HarmonicAngleForce*>(&system.getForce(i));
+            if (force != NULL) {
+                for (int j = 0; j < force->getNumAngles(); j++) {
+                    int atom1, atom2, atom3;
+                    double angle, k;
+                    force->getAngleParameters(j, atom1, atom2, atom3, angle, k);
+                    angles.push_back(ReferenceCCMAAlgorithm::AngleInfo(atom1, atom2, atom3, angle));
+                }
+            }
+        }
+        
+        // Create a ReferenceCCMAAlgorithm.  It will build and invert the constraint matrix for us.
+        
+        ReferenceCCMAAlgorithm ccma(numAtoms, numCCMA, refIndices, refDistance, refMasses, angles, 0.1);
+        vector<vector<pair<int, double> > > matrix = ccma.getMatrix();
+        int maxRowElements = 0;
+        for (unsigned i = 0; i < matrix.size(); i++)
+            maxRowElements = max(maxRowElements, (int) matrix[i].size());
+        maxRowElements++;
+
+        // Build the list of constraints for each atom.
+
+        vector<vector<int> > atomConstraints(context.getNumAtoms());
+        for (int i = 0; i < numCCMA; i++) {
+            atomConstraints[atom1[ccmaConstraints[i]]].push_back(i);
+            atomConstraints[atom2[ccmaConstraints[i]]].push_back(i);
+        }
+        int maxAtomConstraints = 0;
+        for (unsigned i = 0; i < atomConstraints.size(); i++)
+            maxAtomConstraints = max(maxAtomConstraints, (int) atomConstraints[i].size());
+
+        // Sort the constraints.
+
+        vector<int> constraintOrder(numCCMA);
+        for (int i = 0; i < numCCMA; ++i)
+            constraintOrder[i] = i;
+        sort(constraintOrder.begin(), constraintOrder.end(), ConstraintOrderer(atom1, atom2, ccmaConstraints));
+        vector<int> inverseOrder(numCCMA);
+        for (int i = 0; i < numCCMA; ++i)
+            inverseOrder[constraintOrder[i]] = i;
+        for (int i = 0; i < (int)matrix.size(); ++i)
+            for (int j = 0; j < (int)matrix[i].size(); ++j)
+                matrix[i][j].first = inverseOrder[matrix[i][j].first];
+
+        // Record the CCMA data structures.
+
+        ccmaAtoms.initialize<mm_int2>(context, numCCMA, "CcmaAtoms");
+        ccmaAtomConstraints.initialize<int>(context, numAtoms*maxAtomConstraints, "CcmaAtomConstraints");
+        ccmaNumAtomConstraints.initialize<int>(context, numAtoms, "CcmaAtomConstraintsIndex");
+        ccmaConstraintMatrixColumn.initialize<int>(context, numCCMA*maxRowElements, "ConstraintMatrixColumn");
+        ccmaConverged.initialize<int>(context, 2, "ccmaConverged");
+        vector<mm_int2> atomsVec(ccmaAtoms.getSize());
+        vector<int> atomConstraintsVec(ccmaAtomConstraints.getSize());
+        vector<int> numAtomConstraintsVec(ccmaNumAtomConstraints.getSize());
+        vector<int> constraintMatrixColumnVec(ccmaConstraintMatrixColumn.getSize());
+        int elementSize = (context.getUseDoublePrecision() || context.getUseMixedPrecision() ? sizeof(double) : sizeof(float));
+        ccmaDistance.initialize(context, numCCMA, 4*elementSize, "CcmaDistance");
+        ccmaDelta1.initialize(context, numCCMA, elementSize, "CcmaDelta1");
+        ccmaDelta2.initialize(context, numCCMA, elementSize, "CcmaDelta2");
+        ccmaReducedMass.initialize(context, numCCMA, elementSize, "CcmaReducedMass");
+        ccmaConstraintMatrixValue.initialize(context, numCCMA*maxRowElements, elementSize, "ConstraintMatrixValue");
+        vector<mm_double4> distanceVec(ccmaDistance.getSize());
+        vector<double> reducedMassVec(ccmaReducedMass.getSize());
+        vector<double> constraintMatrixValueVec(ccmaConstraintMatrixValue.getSize());
+        for (int i = 0; i < numCCMA; i++) {
+            int index = constraintOrder[i];
+            int c = ccmaConstraints[index];
+            atomsVec[i].x = atom1[c];
+            atomsVec[i].y = atom2[c];
+            distanceVec[i].w = distance[c];
+            reducedMassVec[i] = (0.5/(1.0/system.getParticleMass(atom1[c])+1.0/system.getParticleMass(atom2[c])));
+            for (unsigned int j = 0; j < matrix[index].size(); j++) {
+                constraintMatrixColumnVec[i+j*numCCMA] = matrix[index][j].first;
+                constraintMatrixValueVec[i+j*numCCMA] = matrix[index][j].second;
+            }
+            constraintMatrixColumnVec[i+matrix[index].size()*numCCMA] = numCCMA;
+        }
+        ccmaDistance.upload(distanceVec, true);
+        ccmaReducedMass.upload(reducedMassVec, true);
+        ccmaConstraintMatrixValue.upload(constraintMatrixValueVec, true);
+        for (unsigned int i = 0; i < atomConstraints.size(); i++) {
+            numAtomConstraintsVec[i] = atomConstraints[i].size();
+            for (unsigned int j = 0; j < atomConstraints[i].size(); j++) {
+                bool forward = (atom1[ccmaConstraints[atomConstraints[i][j]]] == i);
+                atomConstraintsVec[i+j*numAtoms] = (forward ? inverseOrder[atomConstraints[i][j]]+1 : -inverseOrder[atomConstraints[i][j]]-1);
+            }
+        }
+        ccmaAtoms.upload(atomsVec);
+        ccmaAtomConstraints.upload(atomConstraintsVec);
+        ccmaNumAtomConstraints.upload(numAtomConstraintsVec);
+        ccmaConstraintMatrixColumn.upload(constraintMatrixColumnVec);
+    }
+    
+    // Build the list of virtual sites.
+    
+    vector<mm_int4> vsite2AvgAtomVec;
+    vector<mm_double2> vsite2AvgWeightVec;
+    vector<mm_int4> vsite3AvgAtomVec;
+    vector<mm_double4> vsite3AvgWeightVec;
+    vector<mm_int4> vsiteOutOfPlaneAtomVec;
+    vector<mm_double4> vsiteOutOfPlaneWeightVec;
+    vector<int> vsiteLocalCoordsIndexVec;
+    vector<int> vsiteLocalCoordsAtomVec;
+    vector<int> vsiteLocalCoordsStartVec;
+    vector<double> vsiteLocalCoordsWeightVec;
+    vector<mm_double4> vsiteLocalCoordsPosVec;
+    for (int i = 0; i < numAtoms; i++) {
+        if (system.isVirtualSite(i)) {
+            if (dynamic_cast<const TwoParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
+                // A two particle average.
+                
+                const TwoParticleAverageSite& site = dynamic_cast<const TwoParticleAverageSite&>(system.getVirtualSite(i));
+                vsite2AvgAtomVec.push_back(mm_int4(i, site.getParticle(0), site.getParticle(1), 0));
+                vsite2AvgWeightVec.push_back(mm_double2(site.getWeight(0), site.getWeight(1)));
+            }
+            else if (dynamic_cast<const ThreeParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
+                // A three particle average.
+                
+                const ThreeParticleAverageSite& site = dynamic_cast<const ThreeParticleAverageSite&>(system.getVirtualSite(i));
+                vsite3AvgAtomVec.push_back(mm_int4(i, site.getParticle(0), site.getParticle(1), site.getParticle(2)));
+                vsite3AvgWeightVec.push_back(mm_double4(site.getWeight(0), site.getWeight(1), site.getWeight(2), 0.0));
+            }
+            else if (dynamic_cast<const OutOfPlaneSite*>(&system.getVirtualSite(i)) != NULL) {
+                // An out of plane site.
+                
+                const OutOfPlaneSite& site = dynamic_cast<const OutOfPlaneSite&>(system.getVirtualSite(i));
+                vsiteOutOfPlaneAtomVec.push_back(mm_int4(i, site.getParticle(0), site.getParticle(1), site.getParticle(2)));
+                vsiteOutOfPlaneWeightVec.push_back(mm_double4(site.getWeight12(), site.getWeight13(), site.getWeightCross(), 0.0));
+            }
+            else if (dynamic_cast<const LocalCoordinatesSite*>(&system.getVirtualSite(i)) != NULL) {
+                // A local coordinates site.
+                
+                const LocalCoordinatesSite& site = dynamic_cast<const LocalCoordinatesSite&>(system.getVirtualSite(i));
+                int numParticles = site.getNumParticles();
+                vector<double> origin, x, y;
+                site.getOriginWeights(origin);
+                site.getXWeights(x);
+                site.getYWeights(y);
+                vsiteLocalCoordsIndexVec.push_back(i);
+                vsiteLocalCoordsStartVec.push_back(vsiteLocalCoordsAtomVec.size());
+                for (int j = 0; j < numParticles; j++) {
+                    vsiteLocalCoordsAtomVec.push_back(site.getParticle(j));
+                    vsiteLocalCoordsWeightVec.push_back(origin[j]);
+                    vsiteLocalCoordsWeightVec.push_back(x[j]);
+                    vsiteLocalCoordsWeightVec.push_back(y[j]);
+                }
+                Vec3 pos = site.getLocalPosition();
+                vsiteLocalCoordsPosVec.push_back(mm_double4(pos[0], pos[1], pos[2], 0.0));
+            }
+        }
+    }
+    vsiteLocalCoordsStartVec.push_back(vsiteLocalCoordsAtomVec.size());
+    int num2Avg = vsite2AvgAtomVec.size();
+    int num3Avg = vsite3AvgAtomVec.size();
+    int numOutOfPlane = vsiteOutOfPlaneAtomVec.size();
+    int numLocalCoords = vsiteLocalCoordsPosVec.size();
+    numVsites = num2Avg+num3Avg+numOutOfPlane+numLocalCoords;
+    vsite2AvgAtoms.initialize<mm_int4>(context, max(1, num2Avg), "vsite2AvgAtoms");
+    vsite3AvgAtoms.initialize<mm_int4>(context, max(1, num3Avg), "vsite3AvgAtoms");
+    vsiteOutOfPlaneAtoms.initialize<mm_int4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneAtoms");
+    vsiteLocalCoordsIndex.initialize<int>(context, max(1, (int) vsiteLocalCoordsIndexVec.size()), "vsiteLocalCoordsIndex");
+    vsiteLocalCoordsAtoms.initialize<int>(context, max(1, (int) vsiteLocalCoordsAtomVec.size()), "vsiteLocalCoordsAtoms");
+    vsiteLocalCoordsStartIndex.initialize<int>(context, max(1, (int) vsiteLocalCoordsStartVec.size()), "vsiteLocalCoordsStartIndex");
+    if (num2Avg > 0)
+        vsite2AvgAtoms.upload(vsite2AvgAtomVec);
+    if (num3Avg > 0)
+        vsite3AvgAtoms.upload(vsite3AvgAtomVec);
+    if (numOutOfPlane > 0)
+        vsiteOutOfPlaneAtoms.upload(vsiteOutOfPlaneAtomVec);
+    if (numLocalCoords > 0) {
+        vsiteLocalCoordsIndex.upload(vsiteLocalCoordsIndexVec);
+        vsiteLocalCoordsAtoms.upload(vsiteLocalCoordsAtomVec);
+        vsiteLocalCoordsStartIndex.upload(vsiteLocalCoordsStartVec);
+    }
+    int elementSize = (context.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
+    vsite2AvgWeights.initialize(context, max(1, num2Avg), 2*elementSize, "vsite2AvgWeights");
+    vsite3AvgWeights.initialize(context, max(1, num3Avg), 4*elementSize, "vsite3AvgWeights");
+    vsiteOutOfPlaneWeights.initialize(context, max(1, numOutOfPlane), 4*elementSize, "vsiteOutOfPlaneWeights");
+    vsiteLocalCoordsWeights.initialize(context, max(1, (int) vsiteLocalCoordsWeightVec.size()), elementSize, "vsiteLocalCoordsWeights");
+    vsiteLocalCoordsPos.initialize(context, max(1, (int) vsiteLocalCoordsPosVec.size()), 4*elementSize, "vsiteLocalCoordsPos");
+    if (num2Avg > 0)
+        vsite2AvgWeights.upload(vsite2AvgWeightVec, true);
+    if (num3Avg > 0)
+        vsite3AvgWeights.upload(vsite3AvgWeightVec, true);
+    if (numOutOfPlane > 0)
+        vsiteOutOfPlaneWeights.upload(vsiteOutOfPlaneWeightVec, true);
+    if (numLocalCoords > 0) {
+        vsiteLocalCoordsWeights.upload(vsiteLocalCoordsWeightVec, true);
+        vsiteLocalCoordsPos.upload(vsiteLocalCoordsPosVec, true);
+    }
+
+    // If multiple virtual sites depend on the same particle, make sure the force distribution
+    // can be done safely.
+    
+    vector<int> atomCounts(numAtoms, 0);
+    for (int i = 0; i < numAtoms; i++)
+        if (system.isVirtualSite(i))
+            for (int j = 0; j < system.getVirtualSite(i).getNumParticles(); j++)
+                atomCounts[system.getVirtualSite(i).getParticle(j)]++;
+    for (int i = 0; i < numAtoms; i++)
+        if (atomCounts[i] > 1)
+            hasOverlappingVsites = true;
+    if (hasOverlappingVsites && !context.getSupports64BitGlobalAtomics())
+        throw OpenMMException("This device does not support 64 bit atomics.  Cannot have multiple virtual sites that depend on the same atom.");
+
+    // Create the kernels used by this class.
+
+    map<string, string> defines;
+    defines["NUM_CCMA_CONSTRAINTS"] = context.intToString(numCCMA);
+    defines["NUM_ATOMS"] = context.intToString(numAtoms);
+    defines["NUM_2_AVERAGE"] = context.intToString(num2Avg);
+    defines["NUM_3_AVERAGE"] = context.intToString(num3Avg);
+    defines["NUM_OUT_OF_PLANE"] = context.intToString(numOutOfPlane);
+    defines["NUM_LOCAL_COORDS"] = context.intToString(numLocalCoords);
+    defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
+    if (hasOverlappingVsites)
+        defines["HAS_OVERLAPPING_VSITES"] = "1";
+    ComputeProgram program = context.compileProgram(CommonKernelSources::integrationUtilities, defines);
+    settlePosKernel = program->createKernel("applySettleToPositions");
+    settleVelKernel = program->createKernel("applySettleToVelocities");
+    shakePosKernel = program->createKernel("applyShakeToPositions");
+    shakeVelKernel = program->createKernel("applyShakeToVelocities");
+    ccmaDirectionsKernel = program->createKernel("computeCCMAConstraintDirections");
+    ccmaPosForceKernel = program->createKernel("computeCCMAPositionConstraintForce");
+    ccmaVelForceKernel = program->createKernel("computeCCMAVelocityConstraintForce");
+    ccmaMultiplyKernel = program->createKernel("multiplyByCCMAConstraintMatrix");
+    ccmaUpdateKernel = program->createKernel("updateCCMAAtomPositions");
+    vsitePositionKernel = program->createKernel("computeVirtualSites");
+    vsiteForceKernel = program->createKernel("distributeVirtualSiteForces");
+    vsiteSaveForcesKernel = program->createKernel("saveDistributedForces");
+    randomKernel = program->createKernel("generateRandomNumbers");
+    timeShiftKernel = program->createKernel("timeShiftVelocities");
+
+    // Set arguments for virtual site kernels.
+
+    vsitePositionKernel->addArg(context.getPosq());
+    if (context.getUseMixedPrecision())
+        vsitePositionKernel->addArg(context.getPosqCorrection());
+    else
+        vsitePositionKernel->addArg(NULL);
+    vsitePositionKernel->addArg(vsite2AvgAtoms);
+    vsitePositionKernel->addArg(vsite2AvgWeights);
+    vsitePositionKernel->addArg(vsite3AvgAtoms);
+    vsitePositionKernel->addArg(vsite3AvgWeights);
+    vsitePositionKernel->addArg(vsiteOutOfPlaneAtoms);
+    vsitePositionKernel->addArg(vsiteOutOfPlaneWeights);
+    vsitePositionKernel->addArg(vsiteLocalCoordsIndex);
+    vsitePositionKernel->addArg(vsiteLocalCoordsAtoms);
+    vsitePositionKernel->addArg(vsiteLocalCoordsWeights);
+    vsitePositionKernel->addArg(vsiteLocalCoordsPos);
+    vsitePositionKernel->addArg(vsiteLocalCoordsStartIndex);
+    vsiteForceKernel->addArg(context.getPosq());
+    if (context.getUseMixedPrecision())
+        vsiteForceKernel->addArg(context.getPosqCorrection());
+    else
+        vsiteForceKernel->addArg(NULL);
+    vsiteForceKernel->addArg(); // Skip argument 2: the force array hasn't been created yet.
+    vsiteForceKernel->addArg(vsite2AvgAtoms);
+    vsiteForceKernel->addArg(vsite2AvgWeights);
+    vsiteForceKernel->addArg(vsite3AvgAtoms);
+    vsiteForceKernel->addArg(vsite3AvgWeights);
+    vsiteForceKernel->addArg(vsiteOutOfPlaneAtoms);
+    vsiteForceKernel->addArg(vsiteOutOfPlaneWeights);
+    vsiteForceKernel->addArg(vsiteLocalCoordsIndex);
+    vsiteForceKernel->addArg(vsiteLocalCoordsAtoms);
+    vsiteForceKernel->addArg(vsiteLocalCoordsWeights);
+    vsiteForceKernel->addArg(vsiteLocalCoordsPos);
+    vsiteForceKernel->addArg(vsiteLocalCoordsStartIndex);
+    for (int i = 0; i < 3; i++)
+        vsiteSaveForcesKernel->addArg();
+
+    // Set arguments for constraint kernels.
+
+    if (settleAtoms.isInitialized()) {
+        settlePosKernel->addArg(settleAtoms.getSize());
+        settlePosKernel->addArg();
+        settlePosKernel->addArg(context.getPosq());
+        settlePosKernel->addArg(posDelta);
+        settlePosKernel->addArg(context.getVelm());
+        settlePosKernel->addArg(settleAtoms);
+        settlePosKernel->addArg(settleParams);
+        if (context.getUseMixedPrecision())
+            settlePosKernel->addArg(context.getPosqCorrection());
+        settleVelKernel->addArg(settleAtoms.getSize());
+        settleVelKernel->addArg();
+        settleVelKernel->addArg(context.getPosq());
+        settleVelKernel->addArg(posDelta);
+        settleVelKernel->addArg(context.getVelm());
+        settleVelKernel->addArg(settleAtoms);
+        settleVelKernel->addArg(settleParams);
+        if (context.getUseMixedPrecision())
+            settleVelKernel->addArg(context.getPosqCorrection());
+    }
+    if (shakeAtoms.isInitialized()) {
+        shakePosKernel->addArg(shakeAtoms.getSize());
+        shakePosKernel->addArg();
+        shakePosKernel->addArg(context.getPosq());
+        shakePosKernel->addArg(posDelta);
+        shakePosKernel->addArg(shakeAtoms);
+        shakePosKernel->addArg(shakeParams);
+        if (context.getUseMixedPrecision())
+            shakePosKernel->addArg(context.getPosqCorrection());
+        shakeVelKernel->addArg(shakeAtoms.getSize());
+        shakeVelKernel->addArg();
+        shakeVelKernel->addArg(context.getPosq());
+        shakeVelKernel->addArg(context.getVelm());
+        shakeVelKernel->addArg(shakeAtoms);
+        shakeVelKernel->addArg(shakeParams);
+        if (context.getUseMixedPrecision())
+            shakeVelKernel->addArg(context.getPosqCorrection());
+    }
+    if (ccmaAtoms.isInitialized()) {
+        ccmaDirectionsKernel->addArg(ccmaAtoms);
+        ccmaDirectionsKernel->addArg(ccmaDistance);
+        ccmaDirectionsKernel->addArg(context.getPosq());
+        ccmaDirectionsKernel->addArg(ccmaConverged);
+        if (context.getUseMixedPrecision())
+            ccmaDirectionsKernel->addArg(context.getPosqCorrection());
+        ccmaPosForceKernel->addArg(ccmaAtoms);
+        ccmaPosForceKernel->addArg(ccmaDistance);
+        ccmaPosForceKernel->addArg(posDelta);
+        ccmaPosForceKernel->addArg(ccmaReducedMass);
+        ccmaPosForceKernel->addArg(ccmaDelta1);
+        ccmaPosForceKernel->addArg(ccmaConverged);
+        ccmaPosForceKernel->addArg();
+        ccmaPosForceKernel->addArg();
+        ccmaPosForceKernel->addArg();
+        ccmaVelForceKernel->addArg(ccmaAtoms);
+        ccmaVelForceKernel->addArg(ccmaDistance);
+        ccmaVelForceKernel->addArg(context.getVelm());
+        ccmaVelForceKernel->addArg(ccmaReducedMass);
+        ccmaVelForceKernel->addArg(ccmaDelta1);
+        ccmaVelForceKernel->addArg(ccmaConverged);
+        ccmaVelForceKernel->addArg();
+        ccmaVelForceKernel->addArg();
+        ccmaVelForceKernel->addArg();
+        ccmaMultiplyKernel->addArg(ccmaDelta1);
+        ccmaMultiplyKernel->addArg(ccmaDelta2);
+        ccmaMultiplyKernel->addArg(ccmaConstraintMatrixColumn);
+        ccmaMultiplyKernel->addArg(ccmaConstraintMatrixValue);
+        ccmaMultiplyKernel->addArg(ccmaConverged);
+        ccmaMultiplyKernel->addArg();
+        ccmaUpdateKernel->addArg(ccmaNumAtomConstraints);
+        ccmaUpdateKernel->addArg(ccmaAtomConstraints);
+        ccmaUpdateKernel->addArg(ccmaDistance);
+        ccmaUpdateKernel->addArg();
+        ccmaUpdateKernel->addArg(context.getVelm());
+        ccmaUpdateKernel->addArg(ccmaDelta1);
+        ccmaUpdateKernel->addArg(ccmaDelta2);
+        ccmaUpdateKernel->addArg(ccmaConverged);
+        ccmaUpdateKernel->addArg();
+    }
+
+    // Arguments for time shift kernel will be set later.
+    
+    for (int i = 0; i < 3; i++)
+        timeShiftKernel->addArg();
+}
+
+/**
+ * Record the step size that the next integration step should use.
+ *
+ * Both components of the cached lastStepSize are set to the requested value, and the
+ * value is uploaded to the stepSize array: as an mm_double2 when the context uses
+ * double or mixed precision, otherwise as an mm_float2.  Nothing is uploaded when both
+ * cached components already equal the requested size.
+ *
+ * @param size   the step size to store
+ */
+void IntegrationUtilities::setNextStepSize(double size) {
+    // The cached copy is already current, so there is nothing to send.
+    if (size == lastStepSize.x && size == lastStepSize.y)
+        return;
+    lastStepSize = mm_double2(size, size);
+    const bool useDouble = context.getUseDoublePrecision() || context.getUseMixedPrecision();
+    if (useDouble) {
+        stepSize.upload(&lastStepSize);
+        return;
+    }
+
+    // Single precision: narrow the value before uploading it.
+    mm_float2 stepSizeFloat = mm_float2((float) size, (float) size);
+    stepSize.upload(&stepSizeFloat);
+}
+
+/**
+ * Download the current contents of the stepSize array into the cached lastStepSize.
+ *
+ * The array is read as an mm_double2 when the context uses double or mixed precision,
+ * otherwise as an mm_float2 that is then widened to double.
+ *
+ * @return the y component of the downloaded step size
+ */
+double IntegrationUtilities::getLastStepSize() {
+    const bool useDouble = context.getUseDoublePrecision() || context.getUseMixedPrecision();
+    if (!useDouble) {
+        // Single precision: read the narrow value and widen it into the cache.
+        mm_float2 stepSizeFloat;
+        stepSize.download(&stepSizeFloat);
+        lastStepSize = mm_double2(stepSizeFloat.x, stepSizeFloat.y);
+    }
+    else
+        stepSize.download(&lastStepSize);
+    return lastStepSize.y;
+}
+
+/**
+ * Apply constraints by forwarding to applyConstraintsImpl() with its boolean flag
+ * set to false (applyVelocityConstraints() is the variant that passes true).
+ *
+ * @param tol   the constraint tolerance, passed through unchanged
+ */
+void IntegrationUtilities::applyConstraints(double tol) {
+    const bool constrainVelocities = false;
+    applyConstraintsImpl(constrainVelocities, tol);
+}
+
+/**
+ * Apply constraints by forwarding to applyConstraintsImpl() with its boolean flag
+ * set to true (applyConstraints() is the variant that passes false).
+ *
+ * @param tol   the constraint tolerance, passed through unchanged
+ */
+void IntegrationUtilities::applyVelocityConstraints(double tol) {
+    const bool constrainVelocities = true;
+    applyConstraintsImpl(constrainVelocities, tol);
+}
+
+/**
+ * Run the virtual site position kernel.  This does nothing when numVsites is not
+ * positive; otherwise the kernel is executed with numVsites as its size argument.
+ */
+void IntegrationUtilities::computeVirtualSites() {
+    if (numVsites <= 0)
+        return;
+    vsitePositionKernel->execute(numVsites);
+}
+
+/**
+ * Set up the random number arrays and the per-thread generator seeds.
+ *
+ * The first call allocates the arrays, registers them as arguments of randomKernel and
+ * uploads the seeds.  Later calls only verify that the same seed is requested again.
+ *
+ * @param randomNumberSeed   the seed to use; 0 means take a seed from osrngseed()
+ * @throws OpenMMException if the generator was already initialized with a different seed
+ */
+void IntegrationUtilities::initRandomNumberGenerator(unsigned int randomNumberSeed) {
+    if (random.isInitialized()) {
+        // Already set up: all that remains is to confirm the caller agrees on the seed.
+        if (randomNumberSeed == lastSeed)
+            return;
+        throw OpenMMException("IntegrationUtilities::initRandomNumberGenerator(): Requested two different values for the random number seed");
+    }
+
+    // Allocate the arrays and hand them to the generator kernel.  Starting randomPos at
+    // the end of the array means no values are available until the kernel has run.
+
+    lastSeed = randomNumberSeed;
+    random.initialize<mm_float4>(context, 4*context.getPaddedNumAtoms(), "random");
+    randomSeed.initialize<mm_int4>(context, context.getNumThreadBlocks()*64, "randomSeed");
+    randomPos = random.getSize();
+    randomKernel->addArg(random.getSize());
+    randomKernel->addArg(random);
+    randomKernel->addArg(randomSeed);
+
+    // Fill the seed array from a simple linear congruential generator.
+
+    vector<mm_int4> seed(randomSeed.getSize());
+    unsigned int state = (randomNumberSeed == 0 ? (unsigned int) osrngseed() : randomNumberSeed); // A seed of 0 means use a unique one
+    auto nextSeed = [&state]() -> unsigned int {
+        state = (1664525*state + 1013904223) & 0xFFFFFFFF;
+        return state;
+    };
+    for (mm_int4& element : seed) {
+        element.x = nextSeed();
+        element.y = nextSeed();
+        element.z = nextSeed();
+        element.w = nextSeed();
+    }
+    randomSeed.upload(seed);
+}
+
+/**
+ * Reserve a run of values in the random array, regenerating the array when too few
+ * unused values remain.
+ *
+ * When regeneration is needed, the array is first grown to numValues elements if it is
+ * smaller than that, then randomKernel refills the whole array and the reserved run
+ * starts at index 0.  Any values that were still unused before the refill are discarded.
+ *
+ * @param numValues   the number of array elements the caller is about to consume
+ * @return the index in the random array at which the reserved run starts
+ */
+int IntegrationUtilities::prepareRandomNumbers(int numValues) {
+    const bool enoughRemaining = (randomPos+numValues <= random.getSize());
+    if (!enoughRemaining) {
+        if (random.getSize() < numValues) {
+            // The request does not fit even in a freshly filled array, so enlarge it
+            // and tell the kernel about the new element count.
+            random.resize(numValues);
+            randomKernel->setArg(0, numValues);
+        }
+        randomKernel->execute(random.getSize(), 64);
+        randomPos = numValues;
+        return 0;
+    }
+
+    // Enough unused values are left: hand out the next run and advance the cursor.
+    int start = randomPos;
+    randomPos += numValues;
+    return start;
+}
+
+/**
+ * Write this object's state to a checkpoint stream.
+ *
+ * Layout, in order: the number of Nose-Hoover chains; for each chain its ID, its length
+ * and its state (mm_double2 elements in double or mixed precision, mm_float2 otherwise);
+ * then, only if the random number arrays have been initialized, randomPos followed by
+ * the contents of the random and randomSeed arrays.
+ *
+ * @param stream   the stream to write to
+ */
+void IntegrationUtilities::createCheckpoint(ostream& stream) {
+    const bool useDouble = context.getUseDoublePrecision() || context.getUseMixedPrecision();
+    int numChains = noseHooverChainState.size();
+    stream.write((char*) &numChains, sizeof(int));
+    for (auto& entry : noseHooverChainState) {
+        int chainID = entry.first;
+        int chainLength = entry.second.getSize();
+        stream.write((char*) &chainID, sizeof(int));
+        stream.write((char*) &chainLength, sizeof(int));
+        if (!useDouble) {
+            vector<mm_float2> values;
+            entry.second.download(values);
+            stream.write((char*) values.data(), sizeof(mm_float2)*chainLength);
+        }
+        else {
+            vector<mm_double2> values;
+            entry.second.download(values);
+            stream.write((char*) values.data(), sizeof(mm_double2)*chainLength);
+        }
+    }
+
+    // The random number state is only part of the checkpoint once it exists.
+    if (random.isInitialized()) {
+        stream.write((char*) &randomPos, sizeof(int));
+        vector<mm_float4> randomValues;
+        random.download(randomValues);
+        stream.write((char*) &randomValues[0], sizeof(mm_float4)*random.getSize());
+        vector<mm_int4> seedValues;
+        randomSeed.download(seedValues);
+        stream.write((char*) &seedValues[0], sizeof(mm_int4)*randomSeed.getSize());
+    }
+}
+
+/**
+ * Restore this object's state from a checkpoint stream written by createCheckpoint().
+ *
+ * All existing Nose-Hoover chain states are discarded and recreated from the stream.
+ * The random number section (randomPos, then the random and randomSeed arrays at their
+ * current sizes) is read only if the random number arrays are already initialized.
+ *
+ * @param stream   the stream to read from
+ */
+void IntegrationUtilities::loadCheckpoint(istream& stream) {
+    const bool useDouble = context.getUseDoublePrecision() || context.getUseMixedPrecision();
+    int numChains;
+    stream.read((char*) &numChains, sizeof(int));
+    noseHooverChainState.clear();
+    for (int chain = 0; chain < numChains; chain++) {
+        int chainID, chainLength;
+        stream.read((char*) &chainID, sizeof(int));
+        stream.read((char*) &chainLength, sizeof(int));
+
+        // Start from a fresh array for this chain, then size it for the active precision.
+        noseHooverChainState[chainID] = ComputeArray();
+        auto& chainArray = noseHooverChainState[chainID];
+        const string arrayName = "chainState" + to_string(chainID);
+        if (!useDouble) {
+            chainArray.initialize<mm_float2>(context, chainLength, arrayName);
+            vector<mm_float2> values(chainLength);
+            stream.read((char*) &values[0], sizeof(mm_float2)*chainLength);
+            chainArray.upload(values);
+        }
+        else {
+            chainArray.initialize<mm_double2>(context, chainLength, arrayName);
+            vector<mm_double2> values(chainLength);
+            stream.read((char*) &values[0], sizeof(mm_double2)*chainLength);
+            chainArray.upload(values);
+        }
+    }
+
+    // The random number state is only present when the arrays exist.
+    if (random.isInitialized()) {
+        stream.read((char*) &randomPos, sizeof(int));
+        vector<mm_float4> randomValues(random.getSize());
+        stream.read((char*) &randomValues[0], sizeof(mm_float4)*random.getSize());
+        random.upload(randomValues);
+        vector<mm_int4> seedValues(randomSeed.getSize());
+        stream.read((char*) &seedValues[0], sizeof(mm_int4)*randomSeed.getSize());
+        randomSeed.upload(seedValues);
+    }
+}
+
+/**
+ * Compute the kinetic energy from the velocities, optionally after shifting them in time.
+ *
+ * When timeShift is nonzero the velocities are saved into posDelta (overwriting its
+ * contents), modified by timeShiftKernel, passed through applyConstraintsImpl(true, 1e-4),
+ * and restored from posDelta before returning.  The energy is accumulated on the host
+ * from the downloaded velocity array, skipping entries whose w component is zero.
+ *
+ * @param timeShift   the amount by which to shift the velocities in time; 0 for no shift
+ * @return 0.5 times the sum over particles of (vx^2+vy^2+vz^2)/w
+ */
+double IntegrationUtilities::computeKineticEnergy(double timeShift) {
+    const int numParticles = context.getNumAtoms();
+    const bool shifted = (timeShift != 0);
+    if (shifted) {
+        // Keep a copy of the velocities in posDelta so they can be put back afterwards.
+
+        context.getVelm().copyTo(posDelta);
+
+        // Shift the velocities in time.  The shift is passed as a double only in full
+        // double precision, otherwise as a float.
+
+        timeShiftKernel->setArg(0, context.getVelm());
+        timeShiftKernel->setArg(1, context.getLongForceBuffer());
+        if (!context.getUseDoublePrecision())
+            timeShiftKernel->setArg(2, (float) timeShift);
+        else
+            timeShiftKernel->setArg(2, timeShift);
+        timeShiftKernel->execute(numParticles);
+        applyConstraintsImpl(true, 1e-4);
+    }
+
+    // Accumulate the sum of v^2/w on the host; the factor of 0.5 is applied at the end.
+
+    double twiceEnergy = 0.0;
+    if (context.getUseDoublePrecision() || context.getUseMixedPrecision()) {
+        vector<mm_double4> velocities;
+        context.getVelm().download(velocities);
+        for (int atom = 0; atom < numParticles; atom++) {
+            const mm_double4& v = velocities[atom];
+            if (v.w == 0)
+                continue;
+            twiceEnergy += (v.x*v.x+v.y*v.y+v.z*v.z)/v.w;
+        }
+    }
+    else {
+        vector<mm_float4> velocities;
+        context.getVelm().download(velocities);
+        for (int atom = 0; atom < numParticles; atom++) {
+            const mm_float4& v = velocities[atom];
+            if (v.w == 0)
+                continue;
+            twiceEnergy += (v.x*v.x+v.y*v.y+v.z*v.z)/v.w;
+        }
+    }
+
+    // Put the original velocities back if they were modified above.
+
+    if (shifted)
+        posDelta.copyTo(context.getVelm());
+    return 0.5*twiceEnergy;
+}
--- a/platforms/cuda/src/kernels/andersenThermostat.cu
+++ b/platforms/cuda/src/kernels/andersenThermostat.cu
@@ -2,11 +2,11 @@
 * Apply the Andersen thermostat to adjust particle velocities.
 */

-extern "C" __global__ void applyAndersenThermostat(int numAtoms, float collisionFrequency, float kT, mixed4* velm, const mixed4* __restrict__ stepSize, const float4* __restrict__ random,
-        unsigned int randomIndex, const int* __restrict__ atomGroups) {
-    float collisionProbability = 1.0f-expf(-(float) (collisionFrequency*stepSize[0].y));
-    float randomRange = erff(collisionProbability/sqrtf(2.0f));
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
+KERNEL void applyAndersenThermostat(int numAtoms, float collisionFrequency, float kT, GLOBAL mixed4* velm, real stepSize, GLOBAL const float4* RESTRICT random,
+        unsigned int randomIndex, GLOBAL const int* RESTRICT atomGroups) {
+    float collisionProbability = (float) (1-EXP(-collisionFrequency*stepSize));
+    float randomRange = (float) erf(collisionProbability/SQRT(2.0f));
+    for (int index = GLOBAL_ID; index < numAtoms; index += GLOBAL_SIZE) {
        mixed4 velocity = velm[index];
        float4 selectRand = random[randomIndex+atomGroups[index]];
        float4 velRand = random[randomIndex+index];

--- a/platforms/common/src/kernels/angleForce.cc
+++ b/platforms/common/src/kernels/angleForce.cc
+// Shared geometry for an angle interaction among three positions.  The two vectors
+// below both start from the second position, so it acts as the vertex of the angle.
+real3 v0 = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
+real3 v1 = make_real3(pos2.x-pos3.x, pos2.y-pos3.y, pos2.z-pos3.z);
+#if APPLY_PERIODIC
+APPLY_PERIODIC_TO_DELTA(v0)
+APPLY_PERIODIC_TO_DELTA(v1)
+#endif
+// Length of the cross product, clamped away from zero because it appears in the
+// denominators of the force expressions at the end of this snippet.
+real3 cp = cross(v0, v1);
+real rp = cp.x*cp.x + cp.y*cp.y + cp.z*cp.z;
+rp = max(SQRT(rp), (real) 1.0e-06f);
+// Squared lengths of the two vectors and their dot product.
+real r21 = v0.x*v0.x + v0.y*v0.y + v0.z*v0.z;
+real r23 = v1.x*v1.x + v1.y*v1.y + v1.z*v1.z;
+real dot = v0.x*v1.x + v0.y*v1.y + v0.z*v1.z;
+// The cosine is clamped to [-1, 1] before taking the arc cosine.
+real cosine = min(max(dot*RSQRT(r21*r23), (real) -1), (real) 1);
+real theta = ACOS(cosine);
+// NOTE(review): the next line is a placeholder defined outside this snippet; it is
+// presumably where the energy derivative with respect to the angle gets set - confirm.
+COMPUTE_FORCE
+// Forces on the two outer positions; the vertex receives minus their sum, so the
+// three forces add up to zero.
+real3 force1 = cross(v0, cp)*(dEdAngle/(r21*rp));
+real3 force3 = cross(cp, v1)*(dEdAngle/(r23*rp));
+real3 force2 = -force1-force3;
--- a/platforms/cuda/src/kernels/baoab.cu
+++ b/platforms/cuda/src/kernels/baoab.cu
@@ -4,11 +4,11 @@ enum {VelScale, NoiseScale};
 * Perform the first part of BAOAB integration: velocity half step, then position half step.
 */

-extern "C" __global__ void integrateBAOABPart1(int numAtoms, int paddedNumAtoms, mixed4* __restrict__ velm, const long long* __restrict__ force, mixed4* __restrict__ posDelta,
-        mixed4* __restrict__ oldDelta, const mixed2* __restrict__ dt) {
+KERNEL void integrateBAOABPart1(int numAtoms, int paddedNumAtoms, GLOBAL mixed4* RESTRICT velm, GLOBAL const mm_long* RESTRICT force, GLOBAL mixed4* RESTRICT posDelta,
+        GLOBAL mixed4* RESTRICT oldDelta, GLOBAL const mixed2* RESTRICT dt) {
    mixed halfdt = 0.5*dt[0].y;
    mixed fscale = halfdt/(mixed) 0x100000000;
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
+    for (int index = GLOBAL_ID; index < numAtoms; index += GLOBAL_SIZE) {
        mixed4 velocity = velm[index];
        if (velocity.w != 0.0) {
            velocity.x += fscale*velocity.w*force[index];
@@ -27,13 +27,17 @@ extern "C" __global__ void integrateBAOABPart1(int numAtoms, int paddedNumAtoms,
 * then position half step.
 */

-extern "C" __global__ void integrateBAOABPart2(int numAtoms, real4* __restrict__ posq, real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, mixed4* __restrict__ posDelta,
-        mixed4* __restrict__ oldDelta, const mixed* __restrict__ paramBuffer, const mixed2* __restrict__ dt, const float4* __restrict__ random, unsigned int randomIndex) {
+KERNEL void integrateBAOABPart2(int numAtoms, GLOBAL real4* RESTRICT posq, GLOBAL mixed4* RESTRICT velm, GLOBAL mixed4* RESTRICT posDelta,
+        GLOBAL mixed4* RESTRICT oldDelta, GLOBAL const mixed* RESTRICT paramBuffer, GLOBAL const mixed2* RESTRICT dt, GLOBAL const float4* RESTRICT random, unsigned int randomIndex
+#ifdef USE_MIXED_PRECISION
+        , GLOBAL real4* RESTRICT posqCorrection
+#endif
+        ) {
    mixed vscale = paramBuffer[VelScale];
    mixed noisescale = paramBuffer[NoiseScale];
    mixed halfdt = 0.5*dt[0].y;
    mixed invHalfdt = 1/halfdt;
-    int index = blockIdx.x*blockDim.x+threadIdx.x;
+    int index = GLOBAL_ID;
    randomIndex += index;
    while (index < numAtoms) {
        mixed4 velocity = velm[index];
@@ -67,8 +71,8 @@ extern "C" __global__ void integrateBAOABPart2(int numAtoms, real4* __restrict__
            posDelta[index] = delta;
            oldDelta[index] = delta;
        }
-        randomIndex += blockDim.x*gridDim.x;
-        index += blockDim.x*gridDim.x;
+        randomIndex += GLOBAL_SIZE;
+        index += GLOBAL_SIZE;
    }
 }

@@ -77,11 +81,15 @@ extern "C" __global__ void integrateBAOABPart2(int numAtoms, real4* __restrict__
 * the constrained positions in preparation for computing forces.
 */

-extern "C" __global__ void integrateBAOABPart3(int numAtoms, real4* __restrict__ posq, real4* __restrict__ posqCorrection, mixed4* __restrict__ velm,
-        mixed4* __restrict__ posDelta, mixed4* __restrict__ oldDelta, const mixed2* __restrict__ dt) {
+KERNEL void integrateBAOABPart3(int numAtoms, GLOBAL real4* RESTRICT posq, GLOBAL mixed4* RESTRICT velm,
+         GLOBAL mixed4* RESTRICT posDelta, GLOBAL mixed4* RESTRICT oldDelta, GLOBAL const mixed2* RESTRICT dt
+#ifdef USE_MIXED_PRECISION
+        , GLOBAL real4* RESTRICT posqCorrection
+#endif
+        ) {
    mixed halfdt = 0.5*dt[0].y;
    mixed invHalfdt = 1/halfdt;
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
+    for (int index = GLOBAL_ID; index < numAtoms; index += GLOBAL_SIZE) {
        mixed4 velocity = velm[index];
        if (velocity.w != 0.0) {
            mixed4 delta = posDelta[index];
@@ -113,11 +121,11 @@ extern "C" __global__ void integrateBAOABPart3(int numAtoms, real4* __restrict__
 * Perform the fourth part of BAOAB integration: velocity half step.
 */

-extern "C" __global__ void integrateBAOABPart4(int numAtoms, int paddedNumAtoms, mixed4* __restrict__ velm,
-        const long long* __restrict__ force, const mixed2* __restrict__ dt) {
+KERNEL void integrateBAOABPart4(int numAtoms, int paddedNumAtoms, GLOBAL mixed4* RESTRICT velm,
+        GLOBAL const mm_long* RESTRICT force, GLOBAL const mixed2* RESTRICT dt) {
    mixed halfdt = 0.5*dt[0].y;
    mixed fscale = halfdt/(mixed) 0x100000000;
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
+    for (int index = GLOBAL_ID; index < numAtoms; index += GLOBAL_SIZE) {
        mixed4 velocity = velm[index];
        if (velocity.w != 0.0) {
            velocity.x += fscale*velocity.w*force[index];

--- a/platforms/common/src/kernels/bondForce.cc
+++ b/platforms/common/src/kernels/bondForce.cc
+// Shared geometry for a two-position bond interaction: the displacement from the
+// first position to the second, and its length.
+real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
+#if APPLY_PERIODIC
+APPLY_PERIODIC_TO_DELTA(delta)
+#endif
+real r = SQRT(delta.x*delta.x + delta.y*delta.y + delta.z*delta.z);
+// NOTE(review): the next line is a placeholder defined outside this snippet; it is
+// presumably where the energy derivative with respect to the distance gets set - confirm.
+COMPUTE_FORCE
+// Divide the derivative by the distance, using zero when the distance is zero so
+// that no division by zero occurs.
+dEdR = (r > 0) ? (dEdR / r) : 0;
+// Scale the displacement to get the force pair; the two forces are equal and opposite.
+delta *= dEdR;
+real3 force1 = delta;
+real3 force2 = -delta;
--- a/platforms/cuda/src/kernels/brownian.cu
+++ b/platforms/cuda/src/kernels/brownian.cu
@@ -2,18 +2,18 @@
 * Perform the first step of Brownian integration.
 */

-extern "C" __global__ void integrateBrownianPart1(int numAtoms, int paddedNumAtoms, mixed tauDeltaT, mixed noiseAmplitude, const long long* __restrict__ force,
-        mixed4* __restrict__ posDelta, const mixed4* __restrict__ velm, const float4* __restrict__ random, unsigned int randomIndex) {
-    randomIndex += blockIdx.x*blockDim.x+threadIdx.x;
+KERNEL void integrateBrownianPart1(int numAtoms, int paddedNumAtoms, mixed tauDeltaT, mixed noiseAmplitude, GLOBAL const mm_long* RESTRICT force,
+        GLOBAL mixed4* RESTRICT posDelta, GLOBAL const mixed4* RESTRICT velm, GLOBAL const float4* RESTRICT random, unsigned int randomIndex) {
+    randomIndex += GLOBAL_ID;
    const mixed fscale = tauDeltaT/(mixed) 0x100000000;
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
+    for (int index = GLOBAL_ID; index < numAtoms; index += GLOBAL_SIZE) {
        mixed invMass = velm[index].w;
        if (invMass != 0) {
            posDelta[index].x = fscale*invMass*force[index] + noiseAmplitude*SQRT(invMass)*random[randomIndex].x;
            posDelta[index].y = fscale*invMass*force[index+paddedNumAtoms] + noiseAmplitude*SQRT(invMass)*random[randomIndex].y;
            posDelta[index].z = fscale*invMass*force[index+paddedNumAtoms*2] + noiseAmplitude*SQRT(invMass)*random[randomIndex].z;
        }
-        randomIndex += blockDim.x*gridDim.x;
+        randomIndex += GLOBAL_SIZE;
    }
 }

@@ -21,9 +21,12 @@ extern "C" __global__ void integrateBrownianPart1(int numAtoms, int paddedNumAto
 * Perform the second step of Brownian integration.
 */

-extern "C" __global__ void integrateBrownianPart2(int numAtoms, mixed deltaT, real4* posq, real4* __restrict__ posqCorrection, mixed4* velm, const mixed4* __restrict__ posDelta) {
-    const mixed oneOverDeltaT = RECIP(deltaT);
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
+KERNEL void integrateBrownianPart2(int numAtoms, mixed oneOverDeltaT, GLOBAL real4* posq, GLOBAL mixed4* velm, GLOBAL const mixed4* RESTRICT posDelta
+#ifdef USE_MIXED_PRECISION
+        , GLOBAL real4* RESTRICT posqCorrection
+#endif
+        ) {
+    for (int index = GLOBAL_ID; index < numAtoms; index += GLOBAL_SIZE) {
        if (velm[index].w != 0) {
            mixed4 delta = posDelta[index];
            velm[index].x = oneOverDeltaT*delta.x;

--- a/platforms/cuda/src/kernels/cmapTorsionForce.cu
+++ b/platforms/cuda/src/kernels/cmapTorsionForce.cu
--- a/platforms/opencl/src/kernels/customCentroidBond.cl
+++ b/platforms/opencl/src/kernels/customCentroidBond.cl
-#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
-
 /**
 * Compute the center of each group.
 */
-__kernel void computeGroupCenters(__global const real4* restrict posq, __global const int* restrict groupParticles,
-        __global const real* restrict groupWeights, __global const int* restrict groupOffsets, __global real4* restrict centerPositions) {
-    __local volatile real3 temp[64];
-    for (int group = get_group_id(0); group < NUM_GROUPS; group += get_num_groups(0)) {
+KERNEL void computeGroupCenters(int numParticleGroups, GLOBAL const real4* RESTRICT posq, GLOBAL const int* RESTRICT groupParticles,
+        GLOBAL const real* RESTRICT groupWeights, GLOBAL const int* RESTRICT groupOffsets, GLOBAL real4* RESTRICT centerPositions) {
+    LOCAL volatile real3 temp[64];
+    for (int group = GROUP_ID; group < numParticleGroups; group += NUM_GROUPS) {
        // The threads in this block work together to compute the center one group.

        int firstIndex = groupOffsets[group];
        int lastIndex = groupOffsets[group+1];
-        real3 center = (real3) 0;
-        for (int index = get_local_id(0); index < lastIndex-firstIndex; index += get_local_size(0)) {
+        real3 center = make_real3(0);
+        for (int index = LOCAL_ID; index < lastIndex-firstIndex; index += LOCAL_SIZE) {
            int atom = groupParticles[firstIndex+index];
            real weight = groupWeights[firstIndex+index];
            real4 pos = posq[atom];
@@ -23,18 +21,16 @@ __kernel void computeGroupCenters(__global const real4* restrict posq, __global

        // Sum the values.

-        int thread = get_local_id(0);
+        int thread = LOCAL_ID;
        temp[thread].x = center.x;
        temp[thread].y = center.y;
        temp[thread].z = center.z;
-
-        barrier(CLK_LOCAL_MEM_FENCE);
+        SYNC_THREADS;
        if (thread < 32) {
            temp[thread].x += temp[thread+32].x;
            temp[thread].y += temp[thread+32].y;
            temp[thread].z += temp[thread+32].z;
        }
-
        SYNC_WARPS;
        if (thread < 16) {
            temp[thread].x += temp[thread+16].x;
@@ -47,7 +43,6 @@ __kernel void computeGroupCenters(__global const real4* restrict posq, __global
            temp[thread].y += temp[thread+8].y;
            temp[thread].z += temp[thread+8].z;
        }
-
        SYNC_WARPS;
        if (thread < 4) {
            temp[thread].x += temp[thread+4].x;
@@ -60,19 +55,18 @@ __kernel void computeGroupCenters(__global const real4* restrict posq, __global
            temp[thread].y += temp[thread+2].y;
            temp[thread].z += temp[thread+2].z;
        }
-
        SYNC_WARPS;
        if (thread == 0)
-            centerPositions[group] = (real4) (temp[0].x+temp[1].x, temp[0].y+temp[1].y, temp[0].z+temp[1].z, 0);
+            centerPositions[group] = make_real4(temp[0].x+temp[1].x, temp[0].y+temp[1].y, temp[0].z+temp[1].z, 0);
    }
 }

 /**
 * Compute the difference between two vectors, setting the fourth component to the squared magnitude.
 */
-real4 delta(real4 vec1, real4 vec2, bool periodic, real4 periodicBoxSize, real4 invPeriodicBoxSize, 
+DEVICE real4 delta(real4 vec1, real4 vec2, bool periodic, real4 periodicBoxSize, real4 invPeriodicBoxSize, 
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
-    real4 result = (real4) (vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0);
+    real4 result = make_real4(vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0);
    if (periodic)
        APPLY_PERIODIC_TO_DELTA(result);
    result.w = result.x*result.x + result.y*result.y + result.z*result.z;
@@ -82,65 +76,64 @@ real4 delta(real4 vec1, real4 vec2, bool periodic, real4 periodicBoxSize, real4
 /**
 * Compute the angle between two vectors.  The w component of each vector should contain the squared magnitude.
 */
-real computeAngle(real4 vec1, real4 vec2) {
+DEVICE real computeAngle(real4 vec1, real4 vec2) {
    real dotProduct = vec1.x*vec2.x + vec1.y*vec2.y + vec1.z*vec2.z;
    real cosine = dotProduct*RSQRT(vec1.w*vec2.w);
    real angle;
    if (cosine > 0.99f || cosine < -0.99f) {
        // We're close to the singularity in acos(), so take the cross product and use asin() instead.

-        real4 crossProduct = cross(vec1, vec2);
+        real3 crossProduct = cross(trimTo3(vec1), trimTo3(vec2));
        real scale = vec1.w*vec2.w;
-        angle = asin(SQRT(dot(crossProduct, crossProduct)/scale));
+        angle = ASIN(SQRT(dot(crossProduct, crossProduct)/scale));
        if (cosine < 0)
            angle = M_PI-angle;
    }
    else
-       angle = acos(cosine);
+       angle = ACOS(cosine);
    return angle;
 }

 /**
 * Compute the cross product of two vectors, setting the fourth component to the squared magnitude.
 */
-real4 computeCross(real4 vec1, real4 vec2) {
-    real4 result = cross(vec1, vec2);
-    result.w = result.x*result.x + result.y*result.y + result.z*result.z;
-    return result;
+DEVICE real4 computeCross(real4 vec1, real4 vec2) {
+    real3 cp = cross(trimTo3(vec1), trimTo3(vec2));
+    return make_real4(cp.x, cp.y, cp.z, cp.x*cp.x+cp.y*cp.y+cp.z*cp.z);
 }

 /**
 * Compute the forces on groups based on the bonds.
 */
-__kernel void computeGroupForces(__global long* restrict groupForce, __global mixed* restrict energyBuffer, __global const real4* restrict centerPositions,
-        __global const int* restrict bondGroups, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
+KERNEL void computeGroupForces(int numParticleGroups, GLOBAL mm_ulong* RESTRICT groupForce, GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT centerPositions,
+        GLOBAL const int* RESTRICT bondGroups, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
        EXTRA_ARGS) {
    mixed energy = 0;
    INIT_PARAM_DERIVS
-    for (int index = get_global_id(0); index < NUM_BONDS; index += get_global_size(0)) {
+    for (int index = GLOBAL_ID; index < NUM_BONDS; index += GLOBAL_SIZE) {
        COMPUTE_FORCE
    }
-    energyBuffer[get_global_id(0)] += energy;
+    energyBuffer[GLOBAL_ID] += energy;
    SAVE_PARAM_DERIVS
 }

 /**
 * Apply the forces from the group centers to the individual atoms.
 */
-__kernel void applyForcesToAtoms(__global const int* restrict groupParticles, __global const real* restrict groupWeights, __global const int* restrict groupOffsets,
-        __global const long* restrict groupForce, __global long* restrict atomForce) {
-    for (int group = get_group_id(0); group < NUM_GROUPS; group += get_num_groups(0)) {
-        long fx = groupForce[group];
-        long fy = groupForce[group+NUM_GROUPS];
-        long fz = groupForce[group+NUM_GROUPS*2];
+KERNEL void applyForcesToAtoms(int numParticleGroups, GLOBAL const int* RESTRICT groupParticles, GLOBAL const real* RESTRICT groupWeights, GLOBAL const int* RESTRICT groupOffsets,
+        GLOBAL const mm_long* RESTRICT groupForce, GLOBAL mm_ulong* RESTRICT atomForce) {
+    for (int group = GROUP_ID; group < numParticleGroups; group += NUM_GROUPS) {
+        mm_long fx = groupForce[group];
+        mm_long fy = groupForce[group+numParticleGroups];
+        mm_long fz = groupForce[group+numParticleGroups*2];
        int firstIndex = groupOffsets[group];
        int lastIndex = groupOffsets[group+1];
-        for (int index = get_local_id(0); index < lastIndex-firstIndex; index += get_local_size(0)) {
+        for (int index = LOCAL_ID; index < lastIndex-firstIndex; index += LOCAL_SIZE) {
            int atom = groupParticles[firstIndex+index];
            real weight = groupWeights[firstIndex+index];
-            atom_add(&atomForce[atom], (long) (fx*weight));
-            atom_add(&atomForce[atom+PADDED_NUM_ATOMS], (long) (fy*weight));
-            atom_add(&atomForce[atom+2*PADDED_NUM_ATOMS], (long) (fz*weight));
+            ATOMIC_ADD(&atomForce[atom], (mm_ulong) ((mm_long) (fx*weight)));
+            ATOMIC_ADD(&atomForce[atom+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (fy*weight)));
+            ATOMIC_ADD(&atomForce[atom+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (fz*weight)));
        }
    }
 }
--- a/platforms/cuda/src/kernels/customCompoundBond.cu
+++ b/platforms/cuda/src/kernels/customCompoundBond.cu
-/**
- * Convert a real4 to a real3 by removing its last element.
- */
-inline __device__ real3 ccb_trim(real4 v) {
-    return make_real3(v.x, v.y, v.z);
-}
-
 /**
 * Compute the difference between two vectors, setting the fourth component to the squared magnitude.
 */
-inline __device__ real4 ccb_delta(real4 vec1, real4 vec2, bool periodic, real4 periodicBoxSize, real4 invPeriodicBoxSize, 
+DEVICE real4 ccb_delta(real4 vec1, real4 vec2, bool periodic, real4 periodicBoxSize, real4 invPeriodicBoxSize, 
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
    real4 result = make_real4(vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0);
    if (periodic)
@@ -20,17 +13,17 @@ inline __device__ real4 ccb_delta(real4 vec1, real4 vec2, bool periodic, real4 p
 /**
 * Compute the angle between two vectors.  The w component of each vector should contain the squared magnitude.
 */
-__device__ real ccb_computeAngle(real4 vec1, real4 vec2) {
+DEVICE real ccb_computeAngle(real4 vec1, real4 vec2) {
    real dotProduct = vec1.x*vec2.x + vec1.y*vec2.y + vec1.z*vec2.z;
    real cosine = dotProduct*RSQRT(vec1.w*vec2.w);
    real angle;
    if (cosine > 0.99f || cosine < -0.99f) {
        // We're close to the singularity in acos(), so take the cross product and use asin() instead.

-        real3 crossProduct = cross(vec1, vec2);
+        real3 crossProduct = cross(trimTo3(vec1), trimTo3(vec2));
        real scale = vec1.w*vec2.w;
        angle = ASIN(SQRT(dot(crossProduct, crossProduct)/scale));
-        if (cosine < 0.0f)
+        if (cosine < 0)
            angle = M_PI-angle;
    }
    else
@@ -41,7 +34,8 @@ __device__ real ccb_computeAngle(real4 vec1, real4 vec2) {
 /**
 * Compute the cross product of two vectors, setting the fourth component to the squared magnitude.
 */
-inline __device__ real4 ccb_computeCross(real4 vec1, real4 vec2) {
-    real3 cp = cross(vec1, vec2);
+DEVICE real4 ccb_computeCross(real4 vec1, real4 vec2) {
+    real3 cp = cross(trimTo3(vec1), trimTo3(vec2));
    return make_real4(cp.x, cp.y, cp.z, cp.x*cp.x+cp.y*cp.y+cp.z*cp.z);
 }
+
--- a/platforms/cuda/src/kernels/customExternalForce.cu
+++ b/platforms/cuda/src/kernels/customExternalForce.cu