Beginnings of mixed/double precision support in OpenCL

8d6a2a01 · Peter Eastman · a3d5f834 · 8d6a2a01 · 8d6a2a01 · 8d6a2a01
Commit 8d6a2a01 authored Oct 16, 2012 by Peter Eastman
20 changed files
--- a/platforms/opencl/include/OpenCLPlatform.h
+++ b/platforms/opencl/include/OpenCLPlatform.h
@@ -68,11 +68,18 @@ public:
        static const std::string key = "OpenCLPlatformIndex";
        return key;
    }
+    /**
+     * This is the name of the parameter for selecting what numerical precision to use.
+     * The OpenCLContext constructor accepts the values "single", "mixed" and "double" for
+     * this property and throws an OpenMMException for any other value.
+     */
+    static const std::string& OpenCLPrecision() {
+        static const std::string key = "OpenCLPrecision"; // function-local static, so the returned reference stays valid
+        return key;
+    }
 };

 class OPENMM_EXPORT OpenCLPlatform::PlatformData {
 public:
-    PlatformData(const System& system, const std::string& platformPropValue, const std::string& deviceIndexProperty);
+    PlatformData(const System& system, const std::string& platformPropValue, const std::string& deviceIndexProperty, const std::string& precisionProperty);
    ~PlatformData();
    void initializeContexts(const System& system);
    void syncContexts();

--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -65,10 +65,24 @@ static void CL_CALLBACK errorCallback(const char* errinfo, const void* private_i
    std::cerr << "OpenCL internal error: " << errinfo << std::endl;
 }

-OpenCLContext::OpenCLContext(const System& system, int platformIndex, int deviceIndex, OpenCLPlatform::PlatformData& platformData) :
+OpenCLContext::OpenCLContext(const System& system, int platformIndex, int deviceIndex, const string& precision, OpenCLPlatform::PlatformData& platformData) :
        system(system), time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), atomsWereReordered(false), posq(NULL),
        velm(NULL), forceBuffers(NULL), longForceBuffer(NULL), energyBuffer(NULL), atomIndexDevice(NULL), integration(NULL),
        bonded(NULL), nonbonded(NULL), thread(NULL) {
+    if (precision == "single") {
+        useDoublePrecision = false;
+        useMixedPrecision = false;
+    }
+    else if (precision == "mixed") {
+        useDoublePrecision = false;
+        useMixedPrecision = true;
+    }
+    else if (precision == "double") {
+        useDoublePrecision = true;
+        useMixedPrecision = false;
+    }
+    else
+        throw OpenMMException("Illegal value for OpenCLPrecision: "+precision);
    try {
        contextIndex = platformData.contexts.size();
        std::vector<cl::Platform> platforms;
@@ -217,8 +231,27 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
        numThreadBlocks = numThreadBlocksPerComputeUnit*device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
        bonded = new OpenCLBondedUtilities(*this);
        nonbonded = new OpenCLNonbondedUtilities(*this);
+        // Allocate the position and velocity arrays with the element type that matches the
+        // requested precision, and record the conversion macros the kernels are compiled with.
+        // posqCorrection is only allocated in mixed precision; give it a defined value first so
+        // the pointer is never left indeterminate in the other two modes.
+        posqCorrection = NULL;
+        if (useDoublePrecision) {
+            posq = OpenCLArray::create<mm_double4>(*this, paddedNumAtoms, "posq");
+            velm = OpenCLArray::create<mm_double4>(*this, paddedNumAtoms, "velm");
+            compilationDefines["USE_DOUBLE_PRECISION"] = "1";
+            compilationDefines["convert_real4"] = "convert_double4";
+            compilationDefines["convert_mixed4"] = "convert_double4";
+        }
+        else if (useMixedPrecision) {
+            posq = OpenCLArray::create<mm_float4>(*this, paddedNumAtoms, "posq");
+            // Name the array after the variable, like every other array created here
+            // (it was previously registered under the copy-pasted name "posq").
+            posqCorrection = OpenCLArray::create<mm_float4>(*this, paddedNumAtoms, "posqCorrection");
+            velm = OpenCLArray::create<mm_double4>(*this, paddedNumAtoms, "velm");
+            compilationDefines["USE_MIXED_PRECISION"] = "1";
+            compilationDefines["convert_real4"] = "convert_float4";
+            compilationDefines["convert_mixed4"] = "convert_double4";
+        }
+        else {
            posq = OpenCLArray::create<mm_float4>(*this, paddedNumAtoms, "posq");
            velm = OpenCLArray::create<mm_float4>(*this, paddedNumAtoms, "velm");
+            compilationDefines["convert_real4"] = "convert_float4";
+            compilationDefines["convert_mixed4"] = "convert_float4";
+        }
        posCellOffsets.resize(paddedNumAtoms, mm_int4(0, 0, 0, 0));
    }
    catch (cl::Error err) {
@@ -241,6 +274,7 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device

    // Decide whether native_sqrt(), native_rsqrt(), and native_recip() are sufficiently accurate to use.

+    if (!useDoublePrecision) {
        cl::Kernel accuracyKernel(utilities, "determineNativeAccuracy");
        OpenCLArray valuesArray(*this, 20, sizeof(mm_float8), "values");
        vector<mm_float8> values(valuesArray.getSize());
@@ -269,6 +303,14 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
        compilationDefines["RECIP"] = (maxRecipError < 1e-6) ? "native_recip" : "1.0f/";
        compilationDefines["EXP"] = (maxExpError < 1e-6) ? "native_exp" : "exp";
        compilationDefines["LOG"] = (maxLogError < 1e-6) ? "native_log" : "log";
+    }
+    else {
+        compilationDefines["SQRT"] = "sqrt";
+        compilationDefines["RSQRT"] = "rsqrt";
+        compilationDefines["RECIP"] = "1.0/";
+        compilationDefines["EXP"] = "exp";
+        compilationDefines["LOG"] = "log";
+    }
    
    // Create the work thread used for parallelization when running on multiple devices.
    
@@ -311,18 +353,21 @@ OpenCLContext::~OpenCLContext() {
 }

 void OpenCLContext::initialize() {
-    vector<mm_float4> v(paddedNumAtoms, mm_float4(0, 0, 0, 0));
-    for (int i = 0; i < numAtoms; i++) {
-        double mass = system.getParticleMass(i);
-        v[i].w = (float) (mass == 0.0 ? 0.0 : 1.0/mass);
-    }
-    velm->upload(v);
    bonded->initialize(system);
    numForceBuffers = platformData.contexts.size();
    numForceBuffers = std::max(numForceBuffers, bonded->getNumForceBuffers());
    for (int i = 0; i < (int) forces.size(); i++)
        numForceBuffers = std::max(numForceBuffers, forces[i]->getRequiredForceBuffers());
+    if (useDoublePrecision) {
+        forceBuffers = OpenCLArray::create<mm_double4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers");
+        force = OpenCLArray::create<mm_double4>(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "force");
+        energyBuffer = OpenCLArray::create<cl_double>(*this, max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers()), "energyBuffer");
+    }
+    else {
        forceBuffers = OpenCLArray::create<mm_float4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers");
+        force = OpenCLArray::create<mm_float4>(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "force");
+        energyBuffer = OpenCLArray::create<cl_float>(*this, max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers()), "energyBuffer");
+    }
    if (supports64BitGlobalAtomics) {
        longForceBuffer = OpenCLArray::create<cl_long>(*this, 3*paddedNumAtoms, "longForceBuffer");
        reduceForcesKernel.setArg<cl::Buffer>(0, longForceBuffer->getDeviceBuffer());
@@ -332,12 +377,18 @@ void OpenCLContext::initialize() {
        addAutoclearBuffer(longForceBuffer->getDeviceBuffer(), longForceBuffer->getSize()*2);
    }
    addAutoclearBuffer(forceBuffers->getDeviceBuffer(), forceBuffers->getSize()*4);
-    force = OpenCLArray::create<mm_float4>(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "force");
-    energyBuffer = OpenCLArray::create<cl_float>(*this, max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers()), "energyBuffer");
    addAutoclearBuffer(energyBuffer->getDeviceBuffer(), energyBuffer->getSize());
-    int bufferBytes = max(posq->getSize()*sizeof(mm_float4), energyBuffer->getSize()*sizeof(cl_float));
+    // The pinned staging buffer must hold the largest array that is transferred through it.
+    // velm has to be included: in mixed precision it holds mm_double4 elements while posq
+    // holds mm_float4, so sizing by posq alone can leave the buffer too small for the
+    // velocity initialization below.
+    int bufferBytes = max(max(posq->getSize()*posq->getElementSize(), velm->getSize()*velm->getElementSize()), energyBuffer->getSize()*energyBuffer->getElementSize());
    pinnedBuffer = new cl::Buffer(context, CL_MEM_ALLOC_HOST_PTR, bufferBytes);
    pinnedMemory = queue.enqueueMapBuffer(*pinnedBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, bufferBytes);
+    // Fill velm with zero velocity and the inverse mass in w (0 for massless particles).
+    // Every padded slot is written, not only the first numAtoms, because the freshly mapped
+    // pinned memory has unspecified contents and upload() is presumably given the whole array.
+    for (int i = 0; i < paddedNumAtoms; i++) {
+        double invMass = 0.0;
+        if (i < numAtoms) {
+            double mass = system.getParticleMass(i);
+            invMass = (mass == 0.0 ? 0.0 : 1.0/mass);
+        }
+        if (useDoublePrecision || useMixedPrecision)
+            ((mm_double4*) pinnedMemory)[i] = mm_double4(0.0, 0.0, 0.0, invMass);
+        else
+            ((mm_float4*) pinnedMemory)[i] = mm_float4(0.0f, 0.0f, 0.0f, (cl_float) invMass);
+    }
+    velm->upload(pinnedMemory);
    atomIndexDevice = OpenCLArray::create<cl_int>(*this, paddedNumAtoms, "atomIndexDevice");
    atomIndex.resize(paddedNumAtoms);
    for (int i = 0; i < paddedNumAtoms; ++i)
@@ -382,6 +433,28 @@ cl::Program OpenCLContext::createProgram(const string source, const map<string,
    }
    if (!compilationDefines.empty())
        src << endl;
+    if (supportsDoublePrecision)
+        src << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
+    if (useDoublePrecision) {
+        src << "typedef double real;\n";
+        src << "typedef double2 real2;\n";
+        src << "typedef double4 real4;\n";
+    }
+    else {
+        src << "typedef float real;\n";
+        src << "typedef float2 real2;\n";
+        src << "typedef float4 real4;\n";
+    }
+    if (useDoublePrecision || useMixedPrecision) {
+        src << "typedef double mixed;\n";
+        src << "typedef double2 mixed2;\n";
+        src << "typedef double4 mixed4;\n";
+    }
+    else {
+        src << "typedef float mixed;\n";
+        src << "typedef float2 mixed2;\n";
+        src << "typedef float4 mixed4;\n";
+    }
    for (map<string, string>::const_iterator iter = defines.begin(); iter != defines.end(); ++iter) {
        src << "#define " << iter->first;
        if (!iter->second.empty())
@@ -764,11 +837,47 @@ void OpenCLContext::validateMolecules() {
    // atoms to their original order, rebuild the list of identical molecules, and sort them
    // again.
    
+    vector<mm_int4> newCellOffsets(numAtoms);
+    if (useDoublePrecision) {
+        vector<mm_double4> oldPosq(paddedNumAtoms);
+        vector<mm_double4> newPosq(paddedNumAtoms);
+        vector<mm_double4> oldVelm(paddedNumAtoms);
+        vector<mm_double4> newVelm(paddedNumAtoms);
+        posq->download(oldPosq);
+        velm->download(oldVelm);
+        for (int i = 0; i < numAtoms; i++) {
+            int index = atomIndex[i];
+            newPosq[index] = oldPosq[i];
+            newVelm[index] = oldVelm[i];
+            newCellOffsets[index] = posCellOffsets[i];
+        }
+        posq->upload(newPosq);
+        velm->upload(newVelm);
+    }
+    else if (useMixedPrecision) {
+        // Mixed precision: positions are float4 plus a float4 correction array, velocities are
+        // double4.  All three arrays must be permuted back to the original atom order together,
+        // otherwise the corrections no longer belong to the atoms they are stored next to.
+        vector<mm_float4> oldPosq(paddedNumAtoms);
+        vector<mm_float4> newPosq(paddedNumAtoms);
+        vector<mm_float4> oldPosqCorrection(paddedNumAtoms);
+        vector<mm_float4> newPosqCorrection(paddedNumAtoms);
+        vector<mm_double4> oldVelm(paddedNumAtoms);
+        vector<mm_double4> newVelm(paddedNumAtoms);
+        posq->download(oldPosq);
+        // Fetch the current corrections; previously this vector was permuted without ever
+        // being filled from the device.
+        posqCorrection->download(oldPosqCorrection);
+        velm->download(oldVelm);
+        for (int i = 0; i < numAtoms; i++) {
+            int index = atomIndex[i];
+            newPosq[index] = oldPosq[i];
+            newPosqCorrection[index] = oldPosqCorrection[i];
+            newVelm[index] = oldVelm[i];
+            newCellOffsets[index] = posCellOffsets[i];
+        }
+        posq->upload(newPosq);
+        // Write the permuted corrections back, matching what reorderAtomsImpl() does.
+        posqCorrection->upload(newPosqCorrection);
+        velm->upload(newVelm);
+    }
+    else {
        vector<mm_float4> oldPosq(paddedNumAtoms);
        vector<mm_float4> newPosq(paddedNumAtoms);
        vector<mm_float4> oldVelm(paddedNumAtoms);
        vector<mm_float4> newVelm(paddedNumAtoms);
-    vector<mm_int4> newCellOffsets(numAtoms);
        posq->download(oldPosq);
        velm->download(oldVelm);
        for (int i = 0; i < numAtoms; i++) {
@@ -779,12 +888,11 @@ void OpenCLContext::validateMolecules() {
        }
        posq->upload(newPosq);
        velm->upload(newVelm);
+    }
    for (int i = 0; i < numAtoms; i++) {
        atomIndex[i] = i;
        posCellOffsets[i] = newCellOffsets[i];
    }
-    posq->upload(newPosq);
-    velm->upload(newVelm);
    atomIndexDevice->upload(atomIndex);
    findMoleculeGroups();
    for (int i = 0; i < (int) reorderListeners.size(); i++)
@@ -797,16 +905,29 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
    if (moleculesInvalid)
        validateMolecules();
    atomsWereReordered = true;
+    if (useDoublePrecision)
+        reorderAtomsImpl<cl_double, mm_double4, cl_double, mm_double4>(enforcePeriodic);
+    else if (useMixedPrecision)
+        reorderAtomsImpl<cl_float, mm_float4, cl_double, mm_double4>(enforcePeriodic);
+    else
+        reorderAtomsImpl<cl_float, mm_float4, cl_float, mm_float4>(enforcePeriodic);
+}
+
+template <class Real, class Real4, class Mixed, class Mixed4>
+void OpenCLContext::reorderAtomsImpl(bool enforcePeriodic) {

    // Find the range of positions and the number of bins along each axis.

-    vector<mm_float4> oldPosq(paddedNumAtoms);
-    vector<mm_float4> oldVelm(paddedNumAtoms);
+    vector<Real4> oldPosq(paddedNumAtoms);
+    vector<Real4> oldPosqCorrection(paddedNumAtoms);
+    vector<Mixed4> oldVelm(paddedNumAtoms);
    posq->download(oldPosq);
    velm->download(oldVelm);
-    float minx = oldPosq[0].x, maxx = oldPosq[0].x;
-    float miny = oldPosq[0].y, maxy = oldPosq[0].y;
-    float minz = oldPosq[0].z, maxz = oldPosq[0].z;
+    if (useMixedPrecision)
+        posqCorrection->download(oldPosqCorrection);
+    Real minx = oldPosq[0].x, maxx = oldPosq[0].x;
+    Real miny = oldPosq[0].y, maxy = oldPosq[0].y;
+    Real minz = oldPosq[0].z, maxz = oldPosq[0].z;
    if (nonbonded->getUsePeriodic()) {
        minx = miny = minz = 0.0;
        maxx = periodicBoxSize.x;
@@ -815,7 +936,7 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
    }
    else {
        for (int i = 1; i < numAtoms; i++) {
-            const mm_float4& pos = oldPosq[i];
+            const Real4& pos = oldPosq[i];
            minx = min(minx, pos.x);
            maxx = max(maxx, pos.x);
            miny = min(miny, pos.y);
@@ -828,8 +949,9 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
    // Loop over each group of identical molecules and reorder them.

    vector<int> originalIndex(numAtoms);
-    vector<mm_float4> newPosq(paddedNumAtoms);
-    vector<mm_float4> newVelm(paddedNumAtoms);
+    vector<Real4> newPosq(paddedNumAtoms);
+    vector<Real4> newPosqCorrection(paddedNumAtoms);
+    vector<Mixed4> newVelm(paddedNumAtoms);
    vector<mm_int4> newCellOffsets(numAtoms);
    for (int group = 0; group < (int) moleculeGroups.size(); group++) {
        // Find the center of each molecule.
@@ -837,15 +959,15 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
        MoleculeGroup& mol = moleculeGroups[group];
        int numMolecules = mol.offsets.size();
        vector<int>& atoms = mol.atoms;
-        vector<mm_float4> molPos(numMolecules);
-        float invNumAtoms = 1.0f/atoms.size();
+        vector<Real4> molPos(numMolecules);
+        Real invNumAtoms = (Real) (1.0/atoms.size());
        for (int i = 0; i < numMolecules; i++) {
            molPos[i].x = 0.0f;
            molPos[i].y = 0.0f;
            molPos[i].z = 0.0f;
            for (int j = 0; j < (int)atoms.size(); j++) {
                int atom = atoms[j]+mol.offsets[i];
-                const mm_float4& pos = oldPosq[atom];
+                const Real4& pos = oldPosq[atom];
                molPos[i].x += pos.x;
                molPos[i].y += pos.y;
                molPos[i].z += pos.z;
@@ -861,9 +983,9 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
                int xcell = (int) floor(molPos[i].x*invPeriodicBoxSize.x);
                int ycell = (int) floor(molPos[i].y*invPeriodicBoxSize.y);
                int zcell = (int) floor(molPos[i].z*invPeriodicBoxSize.z);
-                float dx = xcell*periodicBoxSize.x;
-                float dy = ycell*periodicBoxSize.y;
-                float dz = zcell*periodicBoxSize.z;
+                Real dx = xcell*periodicBoxSize.x;
+                Real dy = ycell*periodicBoxSize.y;
+                Real dz = zcell*periodicBoxSize.z;
                if (dx != 0.0f || dy != 0.0f || dz != 0.0f) {
                    molPos[i].x -= dx;
                    molPos[i].y -= dy;
@@ -871,7 +993,7 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
                    if (enforcePeriodic) {
                        for (int j = 0; j < (int) atoms.size(); j++) {
                            int atom = atoms[j]+mol.offsets[i];
-                            mm_float4 p = oldPosq[atom];
+                            Real4 p = oldPosq[atom];
                            p.x -= dx;
                            p.y -= dy;
                            p.z -= dz;
@@ -888,12 +1010,12 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
        // Select a bin for each molecule, then sort them by bin.

        bool useHilbert = (numMolecules > 5000 || atoms.size() > 8); // For small systems, a simple zigzag curve works better than a Hilbert curve.
-        float binWidth;
+        Real binWidth;
        if (useHilbert)
-            binWidth = (float)(max(max(maxx-minx, maxy-miny), maxz-minz)/255.0);
+            binWidth = (Real) (max(max(maxx-minx, maxy-miny), maxz-minz)/255.0);
        else
-            binWidth = (float)(0.2*nonbonded->getCutoffDistance());
-        float invBinWidth = 1.0f/binWidth;
+            binWidth = (Real) (0.2*nonbonded->getCutoffDistance());
+        Real invBinWidth = (Real) (1.0/binWidth);
        int xbins = 1 + (int) ((maxx-minx)*invBinWidth);
        int ybins = 1 + (int) ((maxy-miny)*invBinWidth);
        vector<pair<int, int> > molBins(numMolecules);
@@ -928,6 +1050,8 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
                int newIndex = mol.offsets[i]+atoms[j];
                originalIndex[newIndex] = atomIndex[oldIndex];
                newPosq[newIndex] = oldPosq[oldIndex];
+                if (useMixedPrecision)
+                    newPosqCorrection[newIndex] = oldPosqCorrection[oldIndex];
                newVelm[newIndex] = oldVelm[oldIndex];
                newCellOffsets[newIndex] = posCellOffsets[oldIndex];
            }
@@ -941,6 +1065,8 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
        posCellOffsets[i] = newCellOffsets[i];
    }
    posq->upload(newPosq);
+    if (useMixedPrecision)
+        posqCorrection->upload(newPosqCorrection);
    velm->upload(newVelm);
    atomIndexDevice->upload(atomIndex);
    for (int i = 0; i < (int) reorderListeners.size(); i++)

--- a/platforms/opencl/src/OpenCLContext.h
+++ b/platforms/opencl/src/OpenCLContext.h
@@ -62,7 +62,7 @@ struct mm_float2 {
    mm_float2(cl_float x, cl_float y) : x(x), y(y) {
    }
 };
- struct mm_float4 {
+struct mm_float4 {
    cl_float x, y, z, w;
    mm_float4() {
    }
@@ -87,6 +87,20 @@ struct mm_float16 {
        s8(s8), s9(s9), s10(s10), s11(s11), s12(s12), s13(s13), s14(s14), s15(15) {
    }
 };
+struct mm_double2 { // Host-side pair of cl_double values; double-precision counterpart of mm_float2.
+    cl_double x, y;
+    mm_double2() { // Default constructor: x and y are left uninitialized.
+    }
+    mm_double2(cl_double x, cl_double y) : x(x), y(y) { // Initializes both components from the arguments.
+    }
+};
+struct mm_double4 { // Host-side quadruple of cl_double values; double-precision counterpart of mm_float4.
+    cl_double x, y, z, w;
+    mm_double4() { // Default constructor: all four components are left uninitialized.
+    }
+    mm_double4(cl_double x, cl_double y, cl_double z, cl_double w) : x(x), y(y), z(z), w(w) { // Initializes all four components from the arguments.
+    }
+};
 struct mm_ushort2 {
    cl_ushort x, y;
    mm_ushort2() {
@@ -145,7 +159,7 @@ public:
    class ReorderListener;
    static const int ThreadBlockSize;
    static const int TileSize;
-    OpenCLContext(const System& system, int platformIndex, int deviceIndex, OpenCLPlatform::PlatformData& platformData);
+    OpenCLContext(const System& system, int platformIndex, int deviceIndex, const std::string& precision, OpenCLPlatform::PlatformData& platformData);
    ~OpenCLContext();
    /**
     * This is called to initialize internal data structures after all Forces in the system
@@ -198,6 +212,12 @@ public:
    OpenCLArray& getPosq() {
        return *posq;
    }
+    /**
+     * Get the array which contains a correction to the position of each atom.  This only exists if getUseMixedPrecision() returns true.
+     * The pointer is dereferenced without any check, so callers must test getUseMixedPrecision()
+     * before calling this method.
+     */
+    OpenCLArray& getPosqCorrection() {
+        return *posqCorrection;
+    }
    /**
     * Get the array which contains the velocity (the xyz components) and inverse mass (the w component) of each atom.
     */
@@ -405,18 +425,38 @@ public:
    bool getSupportsDoublePrecision() {
        return supportsDoublePrecision;
    }
+    /**
+     * Get whether double precision is being used.  This is true only when the context
+     * was constructed with the precision string "double".
+     */
+    bool getUseDoublePrecision() {
+        return useDoublePrecision;
+    }
+    /**
+     * Get whether mixed precision is being used.  This is true only when the context
+     * was constructed with the precision string "mixed"; it is false in "double" mode.
+     */
+    bool getUseMixedPrecision() {
+        return useMixedPrecision;
+    }
    /**
     * Get the size of the periodic box, as the single-precision copy stored by
     * setPeriodicBoxSize().
     */
    mm_float4 getPeriodicBoxSize() const {
        return periodicBoxSize;
    }
+    /**
+     * Get the size of the periodic box, as the double-precision copy stored by
+     * setPeriodicBoxSize().
+     */
+    mm_double4 getPeriodicBoxSizeDouble() const {
+        return periodicBoxSizeDouble;
+    }
    /**
     * Set the size of the periodic box.  The size and its component-wise reciprocal are
     * each stored twice, once converted to float and once as double, with the w component
     * set to 0.  The arguments are not validated.
     */
    void setPeriodicBoxSize(double xsize, double ysize, double zsize) {
        periodicBoxSize = mm_float4((float) xsize, (float) ysize, (float) zsize, 0);
        invPeriodicBoxSize = mm_float4((float) (1.0/xsize), (float) (1.0/ysize), (float) (1.0/zsize), 0);
+        periodicBoxSizeDouble = mm_double4(xsize, ysize, zsize, 0);
+        invPeriodicBoxSizeDouble = mm_double4(1.0/xsize, 1.0/ysize, 1.0/zsize, 0);
    }
    /**
     * Get the inverse of the size of the periodic box.
@@ -424,6 +464,12 @@ public:
    mm_float4 getInvPeriodicBoxSize() const {
        return invPeriodicBoxSize;
    }
+    /**
+     * Get the inverse of the size of the periodic box, as the double-precision copy
+     * stored by setPeriodicBoxSize().
+     */
+    mm_double4 getInvPeriodicBoxSizeDouble() const {
+        return invPeriodicBoxSizeDouble;
+    }
    /**
     * Get the OpenCLIntegrationUtilities for this context.
     */
@@ -502,6 +548,11 @@ private:
     * of molecules and resort the atoms.
     */
    void validateMolecules();
+    /**
+     * This is the internal implementation of reorderAtoms(), templatized by the numerical precision in use.
+     */
+    template <class Real, class Real4, class Mixed, class Mixed4>
+    void reorderAtomsImpl(bool enforcePeriodic);
    const System& system;
    double time;
    OpenCLPlatform::PlatformData& platformData;
@@ -515,9 +566,9 @@ private:
    int numThreadBlocks;
    int numForceBuffers;
    int simdWidth;
-    bool supports64BitGlobalAtomics, supportsDoublePrecision, atomsWereReordered, moleculesInvalid;
-    mm_float4 periodicBoxSize;
-    mm_float4 invPeriodicBoxSize;
+    bool supports64BitGlobalAtomics, supportsDoublePrecision, useDoublePrecision, useMixedPrecision, atomsWereReordered, moleculesInvalid;
+    mm_float4 periodicBoxSize, invPeriodicBoxSize;
+    mm_double4 periodicBoxSizeDouble, invPeriodicBoxSizeDouble;
    std::string defaultOptimizationOptions;
    std::map<std::string, std::string> compilationDefines;
    cl::Context context;
@@ -538,6 +589,7 @@ private:
    cl::Buffer* pinnedBuffer;
    void* pinnedMemory;
    OpenCLArray* posq;
+    OpenCLArray* posqCorrection;
    OpenCLArray* velm;
    OpenCLArray* force;
    OpenCLArray* forceBuffers;

--- a/platforms/opencl/src/OpenCLIntegrationUtilities.cpp
+++ b/platforms/opencl/src/OpenCLIntegrationUtilities.cpp
@@ -87,6 +87,13 @@ struct OpenCLIntegrationUtilities::ConstraintOrderer : public binary_function<in
    }
 };

+static void setPosqCorrectionArg(OpenCLContext& cl, cl::Kernel& kernel, int index) { // Sets kernel argument `index` to the position-correction buffer, or to NULL when mixed precision is not in use.
+    if (cl.getUseMixedPrecision())
+        kernel.setArg<cl::Buffer>(index, cl.getPosqCorrection().getDeviceBuffer());
+    else
+        kernel.setArg<void*>(index, NULL); // Outside mixed precision a null pointer is passed, so the argument slot is still set.
+}
+
 OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, const System& system) : context(context),
        posDelta(NULL), settleAtoms(NULL), settleParams(NULL), shakeAtoms(NULL), shakeParams(NULL),
        random(NULL), randomSeed(NULL), randomPos(0), stepSize(NULL), ccmaAtoms(NULL), ccmaDistance(NULL),
@@ -96,12 +103,22 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
        vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL), hasInitializedPosConstraintKernels(false), hasInitializedVelConstraintKernels(false) {
    // Create workspace arrays.

+    if (context.getUseDoublePrecision() || context.getUseMixedPrecision()) {
+        posDelta = OpenCLArray::create<mm_double4>(context, context.getPaddedNumAtoms(), "posDelta");
+        vector<mm_double4> deltas(posDelta->getSize(), mm_double4(0.0, 0.0, 0.0, 0.0));
+        posDelta->upload(deltas);
+        stepSize = OpenCLArray::create<mm_double2>(context, 1, "stepSize");
+        vector<mm_double2> step(1, mm_double2(0.0, 0.0));
+        stepSize->upload(step);
+    }
+    else {
        posDelta = OpenCLArray::create<mm_float4>(context, context.getPaddedNumAtoms(), "posDelta");
-    vector<mm_float4> deltas(posDelta->getSize(), mm_float4(0.0, 0.0, 0.0, 0.0));
+        vector<mm_float4> deltas(posDelta->getSize(), mm_float4(0.0f, 0.0f, 0.0f, 0.0f));
        posDelta->upload(deltas);
        stepSize = OpenCLArray::create<mm_float2>(context, 1, "stepSize");
        vector<mm_float2> step(1, mm_float2(0.0f, 0.0f));
        stepSize->upload(step);
+    }

    // Create kernels for enforcing constraints.

@@ -458,23 +475,57 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
        // Record the CCMA data structures.

        ccmaAtoms = OpenCLArray::create<mm_int2>(context, numCCMA, "CcmaAtoms");
-        ccmaDistance = OpenCLArray::create<mm_float4>(context, numCCMA, "CcmaDistance");
        ccmaAtomConstraints = OpenCLArray::create<cl_int>(context, numAtoms*maxAtomConstraints, "CcmaAtomConstraints");
        ccmaNumAtomConstraints = OpenCLArray::create<cl_int>(context, numAtoms, "CcmaAtomConstraintsIndex");
-        ccmaDelta1 = OpenCLArray::create<cl_float>(context, numCCMA, "CcmaDelta1");
-        ccmaDelta2 = OpenCLArray::create<cl_float>(context, numCCMA, "CcmaDelta2");
+        ccmaConstraintMatrixColumn = OpenCLArray::create<cl_int>(context, numCCMA*maxRowElements, "ConstraintMatrixColumn");
        ccmaConverged = OpenCLArray::create<cl_int>(context, 2, "CcmaConverged");
        ccmaConvergedBuffer = new cl::Buffer(context.getContext(), CL_MEM_ALLOC_HOST_PTR, 2*sizeof(cl_int));
        ccmaConvergedMemory = (cl_int*) context.getQueue().enqueueMapBuffer(*ccmaConvergedBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, 2*sizeof(cl_int));
-        ccmaReducedMass = OpenCLArray::create<cl_float>(context, numCCMA, "CcmaReducedMass");
-        ccmaConstraintMatrixColumn = OpenCLArray::create<cl_int>(context, numCCMA*maxRowElements, "ConstraintMatrixColumn");
-        ccmaConstraintMatrixValue = OpenCLArray::create<cl_float>(context, numCCMA*maxRowElements, "ConstraintMatrixValue");
        vector<mm_int2> atomsVec(ccmaAtoms->getSize());
-        vector<mm_float4> distanceVec(ccmaDistance->getSize());
        vector<cl_int> atomConstraintsVec(ccmaAtomConstraints->getSize());
        vector<cl_int> numAtomConstraintsVec(ccmaNumAtomConstraints->getSize());
-        vector<cl_float> reducedMassVec(ccmaReducedMass->getSize());
        vector<cl_int> constraintMatrixColumnVec(ccmaConstraintMatrixColumn->getSize());
+        if (context.getUseDoublePrecision() || context.getUseMixedPrecision()) {
+            ccmaDistance = OpenCLArray::create<mm_double4>(context, numCCMA, "CcmaDistance");
+            ccmaDelta1 = OpenCLArray::create<cl_double>(context, numCCMA, "CcmaDelta1");
+            ccmaDelta2 = OpenCLArray::create<cl_double>(context, numCCMA, "CcmaDelta2");
+            ccmaReducedMass = OpenCLArray::create<cl_double>(context, numCCMA, "CcmaReducedMass");
+            ccmaConstraintMatrixValue = OpenCLArray::create<cl_double>(context, numCCMA*maxRowElements, "ConstraintMatrixValue");
+            vector<mm_double4> distanceVec(ccmaDistance->getSize());
+            vector<cl_double> reducedMassVec(ccmaReducedMass->getSize());
+            vector<cl_double> constraintMatrixValueVec(ccmaConstraintMatrixValue->getSize());
+            for (int i = 0; i < numCCMA; i++) {
+                int index = constraintOrder[i];
+                int c = ccmaConstraints[index];
+                atomsVec[i].x = atom1[c];
+                atomsVec[i].y = atom2[c];
+                distanceVec[i].w = distance[c];
+                reducedMassVec[i] = (0.5/(1.0/system.getParticleMass(atom1[c])+1.0/system.getParticleMass(atom2[c])));
+                for (unsigned int j = 0; j < matrix[index].size(); j++) {
+                    constraintMatrixColumnVec[i+j*numCCMA] = matrix[index][j].first;
+                    constraintMatrixValueVec[i+j*numCCMA] = matrix[index][j].second;
+                }
+                constraintMatrixColumnVec[i+matrix[index].size()*numCCMA] = numCCMA;
+            }
+            for (unsigned int i = 0; i < atomConstraints.size(); i++) {
+                numAtomConstraintsVec[i] = atomConstraints[i].size();
+                for (unsigned int j = 0; j < atomConstraints[i].size(); j++) {
+                    bool forward = (atom1[ccmaConstraints[atomConstraints[i][j]]] == i);
+                    atomConstraintsVec[i+j*numAtoms] = (forward ? inverseOrder[atomConstraints[i][j]]+1 : -inverseOrder[atomConstraints[i][j]]-1);
+                }
+            }
+            ccmaDistance->upload(distanceVec);
+            ccmaReducedMass->upload(reducedMassVec);
+            ccmaConstraintMatrixValue->upload(constraintMatrixValueVec);
+        }
+        else {
+            ccmaDistance = OpenCLArray::create<mm_float4>(context, numCCMA, "CcmaDistance");
+            ccmaDelta1 = OpenCLArray::create<cl_float>(context, numCCMA, "CcmaDelta1");
+            ccmaDelta2 = OpenCLArray::create<cl_float>(context, numCCMA, "CcmaDelta2");
+            ccmaReducedMass = OpenCLArray::create<cl_float>(context, numCCMA, "CcmaReducedMass");
+            ccmaConstraintMatrixValue = OpenCLArray::create<cl_float>(context, numCCMA*maxRowElements, "ConstraintMatrixValue");
+            vector<mm_float4> distanceVec(ccmaDistance->getSize());
+            vector<cl_float> reducedMassVec(ccmaReducedMass->getSize());
            vector<cl_float> constraintMatrixValueVec(ccmaConstraintMatrixValue->getSize());
            for (int i = 0; i < numCCMA; i++) {
                int index = constraintOrder[i];
@@ -496,13 +547,14 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
                    atomConstraintsVec[i+j*numAtoms] = (forward ? inverseOrder[atomConstraints[i][j]]+1 : -inverseOrder[atomConstraints[i][j]]-1);
                }
            }
-        ccmaAtoms->upload(atomsVec);
            ccmaDistance->upload(distanceVec);
+            ccmaReducedMass->upload(reducedMassVec);
+            ccmaConstraintMatrixValue->upload(constraintMatrixValueVec);
+        }
+        ccmaAtoms->upload(atomsVec);
        ccmaAtomConstraints->upload(atomConstraintsVec);
        ccmaNumAtomConstraints->upload(numAtomConstraintsVec);
-        ccmaReducedMass->upload(reducedMassVec);
        ccmaConstraintMatrixColumn->upload(constraintMatrixColumnVec);
-        ccmaConstraintMatrixValue->upload(constraintMatrixValueVec);

        // Create the CCMA kernels.

@@ -584,21 +636,23 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
    cl::Program vsiteProgram = context.createProgram(OpenCLKernelSources::virtualSites, defines);
    vsitePositionKernel = cl::Kernel(vsiteProgram, "computeVirtualSites");
    vsitePositionKernel.setArg<cl::Buffer>(0, context.getPosq().getDeviceBuffer());
-    vsitePositionKernel.setArg<cl::Buffer>(1, vsite2AvgAtoms->getDeviceBuffer());
-    vsitePositionKernel.setArg<cl::Buffer>(2, vsite2AvgWeights->getDeviceBuffer());
-    vsitePositionKernel.setArg<cl::Buffer>(3, vsite3AvgAtoms->getDeviceBuffer());
-    vsitePositionKernel.setArg<cl::Buffer>(4, vsite3AvgWeights->getDeviceBuffer());
-    vsitePositionKernel.setArg<cl::Buffer>(5, vsiteOutOfPlaneAtoms->getDeviceBuffer());
-    vsitePositionKernel.setArg<cl::Buffer>(6, vsiteOutOfPlaneWeights->getDeviceBuffer());
+    setPosqCorrectionArg(context, vsitePositionKernel, 1);
+    vsitePositionKernel.setArg<cl::Buffer>(2, vsite2AvgAtoms->getDeviceBuffer());
+    vsitePositionKernel.setArg<cl::Buffer>(3, vsite2AvgWeights->getDeviceBuffer());
+    vsitePositionKernel.setArg<cl::Buffer>(4, vsite3AvgAtoms->getDeviceBuffer());
+    vsitePositionKernel.setArg<cl::Buffer>(5, vsite3AvgWeights->getDeviceBuffer());
+    vsitePositionKernel.setArg<cl::Buffer>(6, vsiteOutOfPlaneAtoms->getDeviceBuffer());
+    vsitePositionKernel.setArg<cl::Buffer>(7, vsiteOutOfPlaneWeights->getDeviceBuffer());
    vsiteForceKernel = cl::Kernel(vsiteProgram, "distributeForces");
    vsiteForceKernel.setArg<cl::Buffer>(0, context.getPosq().getDeviceBuffer());
-    // Skip argument 1: the force array hasn't been created yet.
-    vsiteForceKernel.setArg<cl::Buffer>(2, vsite2AvgAtoms->getDeviceBuffer());
-    vsiteForceKernel.setArg<cl::Buffer>(3, vsite2AvgWeights->getDeviceBuffer());
-    vsiteForceKernel.setArg<cl::Buffer>(4, vsite3AvgAtoms->getDeviceBuffer());
-    vsiteForceKernel.setArg<cl::Buffer>(5, vsite3AvgWeights->getDeviceBuffer());
-    vsiteForceKernel.setArg<cl::Buffer>(6, vsiteOutOfPlaneAtoms->getDeviceBuffer());
-    vsiteForceKernel.setArg<cl::Buffer>(7, vsiteOutOfPlaneWeights->getDeviceBuffer());
+    setPosqCorrectionArg(context, vsiteForceKernel, 1);
+    // Skip argument 2: the force array hasn't been created yet.
+    vsiteForceKernel.setArg<cl::Buffer>(3, vsite2AvgAtoms->getDeviceBuffer());
+    vsiteForceKernel.setArg<cl::Buffer>(4, vsite2AvgWeights->getDeviceBuffer());
+    vsiteForceKernel.setArg<cl::Buffer>(5, vsite3AvgAtoms->getDeviceBuffer());
+    vsiteForceKernel.setArg<cl::Buffer>(6, vsite3AvgWeights->getDeviceBuffer());
+    vsiteForceKernel.setArg<cl::Buffer>(7, vsiteOutOfPlaneAtoms->getDeviceBuffer());
+    vsiteForceKernel.setArg<cl::Buffer>(8, vsiteOutOfPlaneWeights->getDeviceBuffer());
    numVsites = num2Avg+num3Avg+numOutOfPlane;
 }

@@ -686,11 +740,18 @@ void OpenCLIntegrationUtilities::applyConstraints(bool constrainVelocities, doub
        if (!hasInitialized) {
            settleKernel.setArg<cl_int>(0, settleAtoms->getSize());
            settleKernel.setArg<cl::Buffer>(2, context.getPosq().getDeviceBuffer());
-            settleKernel.setArg<cl::Buffer>(3, posDelta->getDeviceBuffer());
-            settleKernel.setArg<cl::Buffer>(4, context.getVelm().getDeviceBuffer());
-            settleKernel.setArg<cl::Buffer>(5, settleAtoms->getDeviceBuffer());
-            settleKernel.setArg<cl::Buffer>(6, settleParams->getDeviceBuffer());
-        }
+            if (context.getUseMixedPrecision())
+                settleKernel.setArg<cl::Buffer>(3, context.getPosqCorrection().getDeviceBuffer());
+            else
+                settleKernel.setArg<void*>(3, NULL);
+            settleKernel.setArg<cl::Buffer>(4, posDelta->getDeviceBuffer());
+            settleKernel.setArg<cl::Buffer>(5, context.getVelm().getDeviceBuffer());
+            settleKernel.setArg<cl::Buffer>(6, settleAtoms->getDeviceBuffer());
+            settleKernel.setArg<cl::Buffer>(7, settleParams->getDeviceBuffer());
+        }
+        if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
+            settleKernel.setArg<cl_double>(1, (cl_double) tol);
+        else
            settleKernel.setArg<cl_float>(1, (cl_float) tol);
        context.executeKernel(settleKernel, settleAtoms->getSize());
    }
@@ -698,10 +759,17 @@ void OpenCLIntegrationUtilities::applyConstraints(bool constrainVelocities, doub
        if (!hasInitialized) {
            shakeKernel.setArg<cl_int>(0, shakeAtoms->getSize());
            shakeKernel.setArg<cl::Buffer>(2, context.getPosq().getDeviceBuffer());
-            shakeKernel.setArg<cl::Buffer>(3, constrainVelocities ? context.getVelm().getDeviceBuffer() : posDelta->getDeviceBuffer());
-            shakeKernel.setArg<cl::Buffer>(4, shakeAtoms->getDeviceBuffer());
-            shakeKernel.setArg<cl::Buffer>(5, shakeParams->getDeviceBuffer());
+            if (context.getUseMixedPrecision())
+                shakeKernel.setArg<cl::Buffer>(3, context.getPosqCorrection().getDeviceBuffer());
+            else
+                shakeKernel.setArg<void*>(3, NULL);
+            shakeKernel.setArg<cl::Buffer>(4, constrainVelocities ? context.getVelm().getDeviceBuffer() : posDelta->getDeviceBuffer());
+            shakeKernel.setArg<cl::Buffer>(5, shakeAtoms->getDeviceBuffer());
+            shakeKernel.setArg<cl::Buffer>(6, shakeParams->getDeviceBuffer());
        }
+        if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
+            shakeKernel.setArg<cl_double>(1, (cl_double) tol);
+        else
            shakeKernel.setArg<cl_float>(1, (cl_float) tol);
        context.executeKernel(shakeKernel, shakeAtoms->getSize());
    }
@@ -710,6 +778,10 @@ void OpenCLIntegrationUtilities::applyConstraints(bool constrainVelocities, doub
            ccmaDirectionsKernel.setArg<cl::Buffer>(0, ccmaAtoms->getDeviceBuffer());
            ccmaDirectionsKernel.setArg<cl::Buffer>(1, ccmaDistance->getDeviceBuffer());
            ccmaDirectionsKernel.setArg<cl::Buffer>(2, context.getPosq().getDeviceBuffer());
+            if (context.getUseMixedPrecision())
+                ccmaDirectionsKernel.setArg<cl::Buffer>(3, context.getPosqCorrection().getDeviceBuffer());
+            else
+                ccmaDirectionsKernel.setArg<void*>(3, NULL);
            ccmaForceKernel.setArg<cl::Buffer>(0, ccmaAtoms->getDeviceBuffer());
            ccmaForceKernel.setArg<cl::Buffer>(1, ccmaDistance->getDeviceBuffer());
            ccmaForceKernel.setArg<cl::Buffer>(2, constrainVelocities ? context.getVelm().getDeviceBuffer() : posDelta->getDeviceBuffer());
@@ -730,6 +802,9 @@ void OpenCLIntegrationUtilities::applyConstraints(bool constrainVelocities, doub
            ccmaUpdateKernel.setArg<cl::Buffer>(6, ccmaDelta2->getDeviceBuffer());
            ccmaUpdateKernel.setArg<cl::Buffer>(7, ccmaConverged->getDeviceBuffer());
        }
+        if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
+            ccmaForceKernel.setArg<cl_double>(6, (cl_double) tol);
+        else
            ccmaForceKernel.setArg<cl_float>(6, (cl_float) tol);
        context.executeKernel(ccmaDirectionsKernel, ccmaAtoms->getSize());
        const int checkInterval = 4;
@@ -764,7 +839,7 @@ void OpenCLIntegrationUtilities::computeVirtualSites() {

 void OpenCLIntegrationUtilities::distributeForcesFromVirtualSites() {
    if (numVsites > 0) {
-        vsiteForceKernel.setArg<cl::Buffer>(1, context.getForce().getDeviceBuffer());
+        vsiteForceKernel.setArg<cl::Buffer>(2, context.getForce().getDeviceBuffer()); // bound on every call: the constructor skips argument 2 because the force array does not exist yet
        context.executeKernel(vsiteForceKernel, numVsites);
    }
 }

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -66,6 +66,13 @@ static string intToString(int value) {
    return s.str();
 }

+static void setPosqCorrectionArg(OpenCLContext& cl, cl::Kernel& kernel, int index) { // binds kernel argument `index`, the posq correction slot
+    if (!cl.getUseMixedPrecision())
+        kernel.setArg<void*>(index, NULL); // not mixed precision: the argument is set to NULL
+    else
+        kernel.setArg<cl::Buffer>(index, cl.getPosqCorrection().getDeviceBuffer()); // mixed precision: pass the correction buffer
+}
+
 static bool isZeroExpression(const Lepton::ParsedExpression& expression) {
    const Lepton::Operation& op = expression.getRootNode().getOperation();
    if (op.getId() != Lepton::Operation::CONSTANT)
@@ -139,24 +146,62 @@ void OpenCLUpdateStateDataKernel::setTime(ContextImpl& context, double time) {
 }

 void OpenCLUpdateStateDataKernel::getPositions(ContextImpl& context, vector<Vec3>& positions) {
-    mm_float4* posq = (mm_float4*) cl.getPinnedBuffer();
-    cl.getPosq().download(posq);
    const vector<cl_int>& order = cl.getAtomIndex();
    int numParticles = context.getSystem().getNumParticles();
    positions.resize(numParticles);
-    mm_float4 periodicBoxSize = cl.getPeriodicBoxSize();
+    mm_double4 periodicBoxSize = cl.getPeriodicBoxSizeDouble(); // box size read as doubles; all three branches below use it
+    if (cl.getUseDoublePrecision()) { // double precision: posq entries are read as mm_double4
+        mm_double4* posq = (mm_double4*) cl.getPinnedBuffer();
+        cl.getPosq().download(posq);
+        for (int i = 0; i < numParticles; ++i) {
+            mm_double4 pos = posq[i];
+            mm_int4 offset = cl.getPosCellOffsets()[i];
+            positions[order[i]] = Vec3(pos.x-offset.x*periodicBoxSize.x, pos.y-offset.y*periodicBoxSize.y, pos.z-offset.z*periodicBoxSize.z); // entry i is stored at positions[order[i]], minus offset*boxSize on each axis
+        }
+    }
+    else if (cl.getUseMixedPrecision()) { // mixed precision: posq and posqCorrection are both read as mm_float4
+        mm_float4* posq = (mm_float4*) cl.getPinnedBuffer();
+        vector<mm_float4> posCorrection; // NOTE(review): passed to download() unsized - presumably download() resizes it; confirm in OpenCLArray
+        cl.getPosq().download(posq);
+        cl.getPosqCorrection().download(posCorrection);
+        for (int i = 0; i < numParticles; ++i) {
+            mm_float4 pos1 = posq[i];
+            mm_float4 pos2 = posCorrection[i];
+            mm_int4 offset = cl.getPosCellOffsets()[i];
+            positions[order[i]] = Vec3((double)pos1.x+(double)pos2.x-offset.x*periodicBoxSize.x, (double)pos1.y+(double)pos2.y-offset.y*periodicBoxSize.y, (double)pos1.z+(double)pos2.z-offset.z*periodicBoxSize.z); // base and correction are widened to double before being summed
+        }
+    }
+    else { // single precision: posq entries are read as mm_float4
+        mm_float4* posq = (mm_float4*) cl.getPinnedBuffer();
+        cl.getPosq().download(posq);
        for (int i = 0; i < numParticles; ++i) {
            mm_float4 pos = posq[i];
            mm_int4 offset = cl.getPosCellOffsets()[i];
            positions[order[i]] = Vec3(pos.x-offset.x*periodicBoxSize.x, pos.y-offset.y*periodicBoxSize.y, pos.z-offset.z*periodicBoxSize.z);
        }
+    }
 }

 void OpenCLUpdateStateDataKernel::setPositions(ContextImpl& context, const vector<Vec3>& positions) {
-    mm_float4* posq = (mm_float4*) cl.getPinnedBuffer();
-    cl.getPosq().download(posq);
    const vector<cl_int>& order = cl.getAtomIndex();
    int numParticles = context.getSystem().getNumParticles();
+    if (cl.getUseDoublePrecision()) {
+        mm_double4* posq = (mm_double4*) cl.getPinnedBuffer();
+        cl.getPosq().download(posq);
+        for (int i = 0; i < numParticles; ++i) {
+            mm_double4& pos = posq[i];
+            const Vec3& p = positions[order[i]];
+            pos.x = p[0];
+            pos.y = p[1];
+            pos.z = p[2];
+        }
+        for (int i = numParticles; i < cl.getPaddedNumAtoms(); i++)
+            posq[i] = mm_double4(0.0, 0.0, 0.0, 0.0);
+        cl.getPosq().upload(posq);
+    }
+    else {
+        mm_float4* posq = (mm_float4*) cl.getPinnedBuffer();
+        cl.getPosq().download(posq);
        for (int i = 0; i < numParticles; ++i) {
            mm_float4& pos = posq[i];
            const Vec3& p = positions[order[i]];
@@ -167,53 +212,106 @@ void OpenCLUpdateStateDataKernel::setPositions(ContextImpl& context, const vecto
        for (int i = numParticles; i < cl.getPaddedNumAtoms(); i++)
            posq[i] = mm_float4(0.0f, 0.0f, 0.0f, 0.0f);
        cl.getPosq().upload(posq);
+    }
+    if (cl.getUseMixedPrecision()) {
+        mm_float4* posCorrection = (mm_float4*) cl.getPinnedBuffer();
+        for (int i = 0; i < numParticles; ++i) {
+            mm_float4& c = posCorrection[i];
+            const Vec3& p = positions[order[i]];
+            c.x = (cl_float) (p[0]-(cl_float)p[0]);
+            c.y = (cl_float) (p[1]-(cl_float)p[1]);
+            c.z = (cl_float) (p[2]-(cl_float)p[2]);
+            c.w = 0;
+        }
+        for (int i = numParticles; i < cl.getPaddedNumAtoms(); i++)
+            posCorrection[i] = mm_float4(0.0f, 0.0f, 0.0f, 0.0f);
+        cl.getPosqCorrection().upload(posCorrection);
+    }
    for (int i = 0; i < (int) cl.getPosCellOffsets().size(); i++)
        cl.getPosCellOffsets()[i] = mm_int4(0, 0, 0, 0);
 }

 void OpenCLUpdateStateDataKernel::getVelocities(ContextImpl& context, vector<Vec3>& velocities) {
-    mm_float4* velm = (mm_float4*) cl.getPinnedBuffer();
-    cl.getVelm().download(velm);
    const vector<cl_int>& order = cl.getAtomIndex();
    int numParticles = context.getSystem().getNumParticles();
    velocities.resize(numParticles);
+    if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
+        mm_double4* velm = (mm_double4*) cl.getPinnedBuffer();
+        cl.getVelm().download(velm);
+        for (int i = 0; i < numParticles; ++i) {
+            mm_double4 vel = velm[i];
+            mm_int4 offset = cl.getPosCellOffsets()[i];
+            velocities[order[i]] = Vec3(vel.x, vel.y, vel.z);
+        }
+    }
+    else {
+        mm_float4* velm = (mm_float4*) cl.getPinnedBuffer();
+        cl.getVelm().download(velm);
        for (int i = 0; i < numParticles; ++i) {
            mm_float4 vel = velm[i];
+            mm_int4 offset = cl.getPosCellOffsets()[i];
            velocities[order[i]] = Vec3(vel.x, vel.y, vel.z);
        }
+    }
 }

 void OpenCLUpdateStateDataKernel::setVelocities(ContextImpl& context, const vector<Vec3>& velocities) {
-    mm_float4* velm = (mm_float4*) cl.getPinnedBuffer();
-    cl.getVelm().download(velm);
    const vector<cl_int>& order = cl.getAtomIndex();
    int numParticles = context.getSystem().getNumParticles();
+    if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) { // double and mixed precision: the velocity buffer is handled as mm_double4
+        mm_double4* velm = (mm_double4*) cl.getPinnedBuffer();
+        cl.getVelm().download(velm); // download first: only x, y, z are overwritten below, so each entry's w is kept
+        for (int i = 0; i < numParticles; ++i) {
+            mm_double4& vel = velm[i];
+            const Vec3& p = velocities[order[i]]; // entry i takes the caller's velocity at index order[i]
+            vel.x = p[0];
+            vel.y = p[1];
+            vel.z = p[2];
+        }
+        for (int i = numParticles; i < cl.getPaddedNumAtoms(); i++) // zero every entry from numParticles up to the padded atom count
+            velm[i] = mm_double4(0.0, 0.0, 0.0, 0.0);
+        cl.getVelm().upload(velm);
+    }
+    else { // single precision: the velocity buffer is handled as mm_float4
+        mm_float4* velm = (mm_float4*) cl.getPinnedBuffer();
+        cl.getVelm().download(velm);
        for (int i = 0; i < numParticles; ++i) {
            mm_float4& vel = velm[i];
            const Vec3& p = velocities[order[i]];
-        vel.x = (cl_float) p[0];
-        vel.y = (cl_float) p[1];
-        vel.z = (cl_float) p[2];
+            vel.x = (cl_float) p[0]; // explicit cast: Vec3 components are assigned into float fields here
+            vel.y = (cl_float) p[1];
+            vel.z = (cl_float) p[2];
        }
        for (int i = numParticles; i < cl.getPaddedNumAtoms(); i++)
            velm[i] = mm_float4(0.0f, 0.0f, 0.0f, 0.0f);
        cl.getVelm().upload(velm);
+    }
 }

 void OpenCLUpdateStateDataKernel::getForces(ContextImpl& context, vector<Vec3>& forces) {
-    mm_float4* force = (mm_float4*) cl.getPinnedBuffer();
-    cl.getForce().download(force);
    const vector<cl_int>& order = cl.getAtomIndex();
    int numParticles = context.getSystem().getNumParticles();
    forces.resize(numParticles);
+    if (cl.getUseDoublePrecision()) { // only full double precision reads forces as mm_double4; mixed precision takes the branch below
+        mm_double4* force = (mm_double4*) cl.getPinnedBuffer();
+        cl.getForce().download(force);
+        for (int i = 0; i < numParticles; ++i) {
+            mm_double4 f = force[i];
+            forces[order[i]] = Vec3(f.x, f.y, f.z); // entry i of the download is stored at forces[order[i]]
+        }
+    }
+    else { // single and mixed precision: forces are read as mm_float4
+        mm_float4* force = (mm_float4*) cl.getPinnedBuffer();
+        cl.getForce().download(force);
        for (int i = 0; i < numParticles; ++i) {
            mm_float4 f = force[i];
            forces[order[i]] = Vec3(f.x, f.y, f.z);
        }
+    }
 }

 void OpenCLUpdateStateDataKernel::getPeriodicBoxVectors(ContextImpl& context, Vec3& a, Vec3& b, Vec3& c) const {
-    mm_float4 box = cl.getPeriodicBoxSize();
+    mm_double4 box = cl.getPeriodicBoxSizeDouble();
    a = Vec3(box.x, 0, 0);
    b = Vec3(0, box.y, 0);
    c = Vec3(0, 0, box.z);
@@ -228,6 +326,8 @@ void OpenCLUpdateStateDataKernel::setPeriodicBoxVectors(ContextImpl& context, co
 void OpenCLUpdateStateDataKernel::createCheckpoint(ContextImpl& context, ostream& stream) {
    int version = 1;
    stream.write((char*) &version, sizeof(int));
+    int precision = (cl.getUseDoublePrecision() ? 2 : cl.getUseMixedPrecision() ? 1 : 0);
+    stream.write((char*) &precision, sizeof(int));
    double time = cl.getTime();
    stream.write((char*) &time, sizeof(double));
    int stepCount = cl.getStepCount();
@@ -235,10 +335,14 @@ void OpenCLUpdateStateDataKernel::createCheckpoint(ContextImpl& context, ostream
    int computeForceCount = cl.getComputeForceCount();
    stream.write((char*) &computeForceCount, sizeof(int));
    char* buffer = (char*) cl.getPinnedBuffer();
-    cl.getPosq().download((mm_float4*) buffer);
-    stream.write(buffer, sizeof(mm_float4)*cl.getPosq().getSize());
-    cl.getVelm().download((mm_float4*) buffer);
-    stream.write(buffer, sizeof(mm_float4)*cl.getVelm().getSize());
+    cl.getPosq().download(buffer);
+    stream.write(buffer, cl.getPosq().getSize()*cl.getPosq().getElementSize());
+    if (cl.getUseMixedPrecision()) {
+        cl.getPosqCorrection().download(buffer);
+        stream.write(buffer, cl.getPosqCorrection().getSize()*cl.getPosqCorrection().getElementSize());
+    }
+    cl.getVelm().download(buffer);
+    stream.write(buffer, cl.getVelm().getSize()*cl.getVelm().getElementSize());
    stream.write((char*) &cl.getAtomIndex()[0], sizeof(cl_int)*cl.getAtomIndex().size());
    stream.write((char*) &cl.getPosCellOffsets()[0], sizeof(mm_int4)*cl.getPosCellOffsets().size());
    mm_float4 box = cl.getPeriodicBoxSize();
@@ -252,6 +356,11 @@ void OpenCLUpdateStateDataKernel::loadCheckpoint(ContextImpl& context, istream&
    stream.read((char*) &version, sizeof(int));
    if (version != 1)
        throw OpenMMException("Checkpoint was created with a different version of OpenMM");
+    int precision;
+    stream.read((char*) &precision, sizeof(int));
+    int expectedPrecision = (cl.getUseDoublePrecision() ? 2 : cl.getUseMixedPrecision() ? 1 : 0);
+    if (precision != expectedPrecision)
+        throw OpenMMException("Checkpoint was created with a different numeric precision");
    double time;
    stream.read((char*) &time, sizeof(double));
    int stepCount, computeForceCount;
@@ -264,9 +373,13 @@ void OpenCLUpdateStateDataKernel::loadCheckpoint(ContextImpl& context, istream&
        contexts[i]->setComputeForceCount(computeForceCount);
    }
    char* buffer = (char*) cl.getPinnedBuffer();
-    stream.read(buffer, sizeof(mm_float4)*cl.getPosq().getSize());
+    stream.read(buffer, cl.getPosq().getSize()*cl.getPosq().getElementSize());
    cl.getPosq().upload(buffer);
-    stream.read(buffer, sizeof(mm_float4)*cl.getVelm().getSize());
+    if (cl.getUseMixedPrecision()) {
+        stream.read(buffer, cl.getPosqCorrection().getSize()*cl.getPosqCorrection().getElementSize());
+        cl.getPosqCorrection().upload(buffer);
+    }
+    stream.read(buffer, cl.getVelm().getSize()*cl.getVelm().getElementSize());
    cl.getVelm().upload(buffer);
    stream.read((char*) &cl.getAtomIndex()[0], sizeof(cl_int)*cl.getAtomIndex().size());
    cl.getAtomIndexArray().upload(cl.getAtomIndex());
@@ -292,7 +405,8 @@ void OpenCLApplyConstraintsKernel::apply(ContextImpl& context, double tol) {
        cl::Program program = cl.createProgram(OpenCLKernelSources::constraints, defines);
        applyDeltasKernel = cl::Kernel(program, "applyPositionDeltas");
        applyDeltasKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
-        applyDeltasKernel.setArg<cl::Buffer>(1, cl.getIntegrationUtilities().getPosDelta().getDeviceBuffer());
+        setPosqCorrectionArg(cl, applyDeltasKernel, 1);
+        applyDeltasKernel.setArg<cl::Buffer>(2, cl.getIntegrationUtilities().getPosDelta().getDeviceBuffer());
    }
    OpenCLIntegrationUtilities& integration = cl.getIntegrationUtilities();
    cl.clearBuffer(integration.getPosDelta());
@@ -4000,19 +4114,28 @@ void OpenCLIntegrateVerletStepKernel::execute(ContextImpl& context, const Verlet
        kernel1.setArg<cl_int>(0, numAtoms);
        kernel1.setArg<cl::Buffer>(1, cl.getIntegrationUtilities().getStepSize().getDeviceBuffer());
        kernel1.setArg<cl::Buffer>(2, cl.getPosq().getDeviceBuffer());
-        kernel1.setArg<cl::Buffer>(3, cl.getVelm().getDeviceBuffer());
-        kernel1.setArg<cl::Buffer>(4, cl.getForce().getDeviceBuffer());
-        kernel1.setArg<cl::Buffer>(5, integration.getPosDelta().getDeviceBuffer());
+        setPosqCorrectionArg(cl, kernel1, 3);
+        kernel1.setArg<cl::Buffer>(4, cl.getVelm().getDeviceBuffer());
+        kernel1.setArg<cl::Buffer>(5, cl.getForce().getDeviceBuffer());
+        kernel1.setArg<cl::Buffer>(6, integration.getPosDelta().getDeviceBuffer());
        kernel2.setArg<cl_int>(0, numAtoms);
        kernel2.setArg<cl::Buffer>(1, cl.getIntegrationUtilities().getStepSize().getDeviceBuffer());
        kernel2.setArg<cl::Buffer>(2, cl.getPosq().getDeviceBuffer());
-        kernel2.setArg<cl::Buffer>(3, cl.getVelm().getDeviceBuffer());
-        kernel2.setArg<cl::Buffer>(4, integration.getPosDelta().getDeviceBuffer());
+        setPosqCorrectionArg(cl, kernel2, 3);
+        kernel2.setArg<cl::Buffer>(4, cl.getVelm().getDeviceBuffer());
+        kernel2.setArg<cl::Buffer>(5, integration.getPosDelta().getDeviceBuffer());
    }
    if (dt != prevStepSize) {
+        if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
+            vector<mm_double2> stepSizeVec(1);
+            stepSizeVec[0] = mm_double2(dt, dt);
+            cl.getIntegrationUtilities().getStepSize().upload(stepSizeVec);
+        }
+        else {
            vector<mm_float2> stepSizeVec(1);
            stepSizeVec[0] = mm_float2((cl_float) dt, (cl_float) dt);
            cl.getIntegrationUtilities().getStepSize().upload(stepSizeVec);
+        }
        prevStepSize = dt;
    }

@@ -4055,7 +4178,7 @@ void OpenCLIntegrateLangevinStepKernel::initialize(const System& system, const L
    cl::Program program = cl.createProgram(OpenCLKernelSources::langevin, defines, "");
    kernel1 = cl::Kernel(program, "integrateLangevinPart1");
    kernel2 = cl::Kernel(program, "integrateLangevinPart2");
-    params = OpenCLArray::create<cl_float>(cl, 3, "langevinParams");
+    params = new OpenCLArray(cl, 3, cl.getUseDoublePrecision() || cl.getUseMixedPrecision() ? sizeof(cl_double) : sizeof(cl_float), "langevinParams");
    prevStepSize = -1.0;
 }

@@ -4071,9 +4194,10 @@ void OpenCLIntegrateLangevinStepKernel::execute(ContextImpl& context, const Lang
        kernel1.setArg<cl::Buffer>(4, integration.getStepSize().getDeviceBuffer());
        kernel1.setArg<cl::Buffer>(5, integration.getRandom().getDeviceBuffer());
        kernel2.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
-        kernel2.setArg<cl::Buffer>(1, integration.getPosDelta().getDeviceBuffer());
-        kernel2.setArg<cl::Buffer>(2, cl.getVelm().getDeviceBuffer());
-        kernel2.setArg<cl::Buffer>(3, integration.getStepSize().getDeviceBuffer());
+        setPosqCorrectionArg(cl, kernel2, 1);
+        kernel2.setArg<cl::Buffer>(2, integration.getPosDelta().getDeviceBuffer());
+        kernel2.setArg<cl::Buffer>(3, cl.getVelm().getDeviceBuffer());
+        kernel2.setArg<cl::Buffer>(4, integration.getStepSize().getDeviceBuffer());
    }
    double temperature = integrator.getTemperature();
    double friction = integrator.getFriction();
@@ -4086,6 +4210,16 @@ void OpenCLIntegrateLangevinStepKernel::execute(ContextImpl& context, const Lang
        double vscale = exp(-stepSize/tau);
        double fscale = (1-vscale)*tau;
        double noisescale = sqrt(2*kT/tau)*sqrt(0.5*(1-vscale*vscale)*tau);
+        if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
+            vector<cl_double> p(params->getSize());
+            p[0] = vscale;
+            p[1] = fscale;
+            p[2] = noisescale;
+            params->upload(p);
+            mm_double2 ss = mm_double2(0, stepSize);
+            integration.getStepSize().upload(&ss);
+        }
+        else {
            vector<cl_float> p(params->getSize());
            p[0] = (cl_float) vscale;
            p[1] = (cl_float) fscale;
@@ -4093,6 +4227,7 @@ void OpenCLIntegrateLangevinStepKernel::execute(ContextImpl& context, const Lang
            params->upload(p);
            mm_float2 ss = mm_float2(0, (float) stepSize);
            integration.getStepSize().upload(&ss);
+        }
        prevTemp = temperature;
        prevFriction = friction;
        prevStepSize = stepSize;
@@ -4148,17 +4283,25 @@ void OpenCLIntegrateBrownianStepKernel::execute(ContextImpl& context, const Brow
        kernel1.setArg<cl::Buffer>(4, cl.getVelm().getDeviceBuffer());
        kernel1.setArg<cl::Buffer>(5, integration.getRandom().getDeviceBuffer());
        kernel2.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer());
-        kernel2.setArg<cl::Buffer>(2, cl.getVelm().getDeviceBuffer());
-        kernel2.setArg<cl::Buffer>(3, integration.getPosDelta().getDeviceBuffer());
+        setPosqCorrectionArg(cl, kernel2, 2);
+        kernel2.setArg<cl::Buffer>(3, cl.getVelm().getDeviceBuffer());
+        kernel2.setArg<cl::Buffer>(4, integration.getPosDelta().getDeviceBuffer());
    }
    double temperature = integrator.getTemperature();
    double friction = integrator.getFriction();
    double stepSize = integrator.getStepSize();
    if (temperature != prevTemp || friction != prevFriction || stepSize != prevStepSize) {
        double tau = (friction == 0.0 ? 0.0 : 1.0/friction);
+        if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
+            kernel1.setArg<cl_double>(0, tau*stepSize);
+            kernel1.setArg<cl_double>(1, sqrt(2.0f*BOLTZ*temperature*stepSize*tau));
+            kernel2.setArg<cl_double>(0, 1.0/stepSize);
+        }
+        else {
            kernel1.setArg<cl_float>(0, (cl_float) (tau*stepSize));
            kernel1.setArg<cl_float>(1, (cl_float) (sqrt(2.0f*BOLTZ*temperature*stepSize*tau)));
            kernel2.setArg<cl_float>(0, (cl_float) (1.0/stepSize));
+        }
        prevTemp = temperature;
        prevFriction = friction;
        prevStepSize = stepSize;
@@ -4205,19 +4348,22 @@ void OpenCLIntegrateVariableVerletStepKernel::initialize(const System& system, c
 double OpenCLIntegrateVariableVerletStepKernel::execute(ContextImpl& context, const VariableVerletIntegrator& integrator, double maxTime) {
    OpenCLIntegrationUtilities& integration = cl.getIntegrationUtilities();
    int numAtoms = cl.getNumAtoms();
+    bool useDouble = cl.getUseDoublePrecision() || cl.getUseMixedPrecision();
    if (!hasInitializedKernels) {
        hasInitializedKernels = true;
        kernel1.setArg<cl_int>(0, numAtoms);
        kernel1.setArg<cl::Buffer>(1, cl.getIntegrationUtilities().getStepSize().getDeviceBuffer());
        kernel1.setArg<cl::Buffer>(2, cl.getPosq().getDeviceBuffer());
-        kernel1.setArg<cl::Buffer>(3, cl.getVelm().getDeviceBuffer());
-        kernel1.setArg<cl::Buffer>(4, cl.getForce().getDeviceBuffer());
-        kernel1.setArg<cl::Buffer>(5, integration.getPosDelta().getDeviceBuffer());
+        setPosqCorrectionArg(cl, kernel1, 3);
+        kernel1.setArg<cl::Buffer>(4, cl.getVelm().getDeviceBuffer());
+        kernel1.setArg<cl::Buffer>(5, cl.getForce().getDeviceBuffer());
+        kernel1.setArg<cl::Buffer>(6, integration.getPosDelta().getDeviceBuffer());
        kernel2.setArg<cl_int>(0, numAtoms);
        kernel2.setArg<cl::Buffer>(1, cl.getIntegrationUtilities().getStepSize().getDeviceBuffer());
        kernel2.setArg<cl::Buffer>(2, cl.getPosq().getDeviceBuffer());
-        kernel2.setArg<cl::Buffer>(3, cl.getVelm().getDeviceBuffer());
-        kernel2.setArg<cl::Buffer>(4, integration.getPosDelta().getDeviceBuffer());
+        setPosqCorrectionArg(cl, kernel2, 3);
+        kernel2.setArg<cl::Buffer>(4, cl.getVelm().getDeviceBuffer());
+        kernel2.setArg<cl::Buffer>(5, integration.getPosDelta().getDeviceBuffer());
        selectSizeKernel.setArg<cl_int>(0, numAtoms);
        selectSizeKernel.setArg<cl::Buffer>(3, cl.getIntegrationUtilities().getStepSize().getDeviceBuffer());
        selectSizeKernel.setArg<cl::Buffer>(4, cl.getVelm().getDeviceBuffer());
@@ -4227,9 +4373,16 @@ double OpenCLIntegrateVariableVerletStepKernel::execute(ContextImpl& context, co

    // Select the step size to use.

-    float maxStepSize = (float)(maxTime-cl.getTime());
-    selectSizeKernel.setArg<cl_float>(1, maxStepSize);
+    double maxStepSize = maxTime-cl.getTime();
+    float maxStepSizeFloat = (float) maxStepSize;
+    if (useDouble) {
+        selectSizeKernel.setArg<cl_double>(1, maxStepSize);
+        selectSizeKernel.setArg<cl_double>(2, integrator.getErrorTolerance());
+    }
+    else {
+        selectSizeKernel.setArg<cl_float>(1, maxStepSizeFloat);
        selectSizeKernel.setArg<cl_float>(2, (cl_float) integrator.getErrorTolerance());
+    }
    cl.executeKernel(selectSizeKernel, blockSize, blockSize);

    // Call the first integration kernel.
@@ -4253,12 +4406,23 @@ double OpenCLIntegrateVariableVerletStepKernel::execute(ContextImpl& context, co

    // Update the time and step count.

-    mm_float2 stepSize;
+    double dt, time;
+    if (useDouble) {
+        mm_double2 stepSize;
        cl.getIntegrationUtilities().getStepSize().download(&stepSize);
-    double dt = stepSize.y;
-    double time = cl.getTime()+dt;
+        dt = stepSize.y;
+        time = cl.getTime()+dt;
        if (dt == maxStepSize)
            time = maxTime; // Avoid round-off error
+    }
+    else {
+        mm_float2 stepSize;
+        cl.getIntegrationUtilities().getStepSize().download(&stepSize);
+        dt = stepSize.y;
+        time = cl.getTime()+dt;
+        if (dt == maxStepSizeFloat)
+            time = maxTime; // Avoid round-off error
+    }
    cl.setTime(time);
    cl.setStepCount(cl.getStepCount()+1);
    return dt;
@@ -4279,7 +4443,7 @@ void OpenCLIntegrateVariableLangevinStepKernel::initialize(const System& system,
    kernel1 = cl::Kernel(program, "integrateLangevinPart1");
    kernel2 = cl::Kernel(program, "integrateLangevinPart2");
    selectSizeKernel = cl::Kernel(program, "selectLangevinStepSize");
-    params = OpenCLArray::create<cl_float>(cl, 3, "langevinParams");
+    params = new OpenCLArray(cl, 3, cl.getUseDoublePrecision() || cl.getUseMixedPrecision() ? sizeof(cl_double) : sizeof(cl_float), "langevinParams");
    blockSize = min(256, system.getNumParticles());
    blockSize = max(blockSize, params->getSize());
    blockSize = min(blockSize, (int) cl.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
@@ -4288,6 +4452,7 @@ void OpenCLIntegrateVariableLangevinStepKernel::initialize(const System& system,
 double OpenCLIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, const VariableLangevinIntegrator& integrator, double maxTime) {
    OpenCLIntegrationUtilities& integration = cl.getIntegrationUtilities();
    int numAtoms = cl.getNumAtoms();
+    bool useDouble = cl.getUseDoublePrecision() || cl.getUseMixedPrecision();
    if (!hasInitializedKernels) {
        hasInitializedKernels = true;
        kernel1.setArg<cl::Buffer>(0, cl.getVelm().getDeviceBuffer());
@@ -4297,9 +4462,10 @@ double OpenCLIntegrateVariableLangevinStepKernel::execute(ContextImpl& context,
        kernel1.setArg<cl::Buffer>(4, integration.getStepSize().getDeviceBuffer());
        kernel1.setArg<cl::Buffer>(5, integration.getRandom().getDeviceBuffer());
        kernel2.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
-        kernel2.setArg<cl::Buffer>(1, integration.getPosDelta().getDeviceBuffer());
-        kernel2.setArg<cl::Buffer>(2, cl.getVelm().getDeviceBuffer());
-        kernel2.setArg<cl::Buffer>(3, integration.getStepSize().getDeviceBuffer());
+        setPosqCorrectionArg(cl, kernel2, 1);
+        kernel2.setArg<cl::Buffer>(2, integration.getPosDelta().getDeviceBuffer());
+        kernel2.setArg<cl::Buffer>(3, cl.getVelm().getDeviceBuffer());
+        kernel2.setArg<cl::Buffer>(4, integration.getStepSize().getDeviceBuffer());
        selectSizeKernel.setArg<cl::Buffer>(4, integration.getStepSize().getDeviceBuffer());
        selectSizeKernel.setArg<cl::Buffer>(5, cl.getVelm().getDeviceBuffer());
        selectSizeKernel.setArg<cl::Buffer>(6, cl.getForce().getDeviceBuffer());
@@ -4310,11 +4476,20 @@ double OpenCLIntegrateVariableLangevinStepKernel::execute(ContextImpl& context,

    // Select the step size to use.

-    float maxStepSize = (float)(maxTime-cl.getTime());
-    selectSizeKernel.setArg<cl_float>(0, maxStepSize);
+    double maxStepSize = maxTime-cl.getTime();
+    float maxStepSizeFloat = (float) maxStepSize;
+    if (useDouble) {
+        selectSizeKernel.setArg<cl_double>(0, maxStepSize);
+        selectSizeKernel.setArg<cl_double>(1, integrator.getErrorTolerance());
+        selectSizeKernel.setArg<cl_double>(2, integrator.getFriction() == 0.0 ? 0.0 : 1.0/integrator.getFriction());
+        selectSizeKernel.setArg<cl_double>(3, BOLTZ*integrator.getTemperature());
+    }
+    else {
+        selectSizeKernel.setArg<cl_float>(0, maxStepSizeFloat);
        selectSizeKernel.setArg<cl_float>(1, (cl_float) integrator.getErrorTolerance());
        selectSizeKernel.setArg<cl_float>(2, (cl_float) (integrator.getFriction() == 0.0 ? 0.0 : 1.0/integrator.getFriction()));
        selectSizeKernel.setArg<cl_float>(3, (cl_float) (BOLTZ*integrator.getTemperature()));
+    }
    cl.executeKernel(selectSizeKernel, blockSize, blockSize);

    // Call the first integration kernel.
@@ -4339,12 +4514,23 @@ double OpenCLIntegrateVariableLangevinStepKernel::execute(ContextImpl& context,

    // Update the time and step count.

-    mm_float2 stepSize;
+    double dt, time;
+    if (useDouble) {
+        mm_double2 stepSize;
        cl.getIntegrationUtilities().getStepSize().download(&stepSize);
-    double dt = stepSize.y;
-    double time = cl.getTime()+dt;
+        dt = stepSize.y;
+        time = cl.getTime()+dt;
        if (dt == maxStepSize)
            time = maxTime; // Avoid round-off error
+    }
+    else {
+        mm_float2 stepSize;
+        cl.getIntegrationUtilities().getStepSize().download(&stepSize);
+        dt = stepSize.y;
+        time = cl.getTime()+dt;
+        if (dt == maxStepSizeFloat)
+            time = maxTime; // Avoid round-off error
+    }
    cl.setTime(time);
    cl.setStepCount(cl.getStepCount()+1);
    return dt;
@@ -4352,8 +4538,8 @@ double OpenCLIntegrateVariableLangevinStepKernel::execute(ContextImpl& context,

 class OpenCLIntegrateCustomStepKernel::ReorderListener : public OpenCLContext::ReorderListener {
 public:
-    ReorderListener(OpenCLContext& cl, OpenCLParameterSet& perDofValues, vector<vector<cl_float> >& localPerDofValues, bool& deviceValuesAreCurrent) :
-            cl(cl), perDofValues(perDofValues), localPerDofValues(localPerDofValues), deviceValuesAreCurrent(deviceValuesAreCurrent) {
+    ReorderListener(OpenCLContext& cl, OpenCLParameterSet& perDofValues, vector<vector<cl_float> >& localPerDofValuesFloat, vector<vector<cl_double> >& localPerDofValuesDouble, bool& deviceValuesAreCurrent) :
+            cl(cl), perDofValues(perDofValues), localPerDofValuesFloat(localPerDofValuesFloat), localPerDofValuesDouble(localPerDofValuesDouble), deviceValuesAreCurrent(deviceValuesAreCurrent) {
        int numAtoms = cl.getNumAtoms();
        lastAtomOrder.resize(numAtoms);
        for (int i = 0; i < numAtoms; i++)
@@ -4365,21 +4551,39 @@ public:
        if (perDofValues.getNumParameters() == 0)
            return;
        int numAtoms = cl.getNumAtoms();
+        const vector<int>& order = cl.getAtomIndex();
+        if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
+            if (deviceValuesAreCurrent)
+                perDofValues.getParameterValues(localPerDofValuesDouble);
+            vector<vector<cl_double> > swap(3*numAtoms);
+            for (int i = 0; i < numAtoms; i++) {
+                swap[3*lastAtomOrder[i]] = localPerDofValuesDouble[3*i];
+                swap[3*lastAtomOrder[i]+1] = localPerDofValuesDouble[3*i+1];
+                swap[3*lastAtomOrder[i]+2] = localPerDofValuesDouble[3*i+2];
+            }
+            for (int i = 0; i < numAtoms; i++) {
+                localPerDofValuesDouble[3*i] = swap[3*order[i]];
+                localPerDofValuesDouble[3*i+1] = swap[3*order[i]+1];
+                localPerDofValuesDouble[3*i+2] = swap[3*order[i]+2];
+            }
+            perDofValues.setParameterValues(localPerDofValuesDouble);
+        }
+        else {
            if (deviceValuesAreCurrent)
-            perDofValues.getParameterValues(localPerDofValues);
+                perDofValues.getParameterValues(localPerDofValuesFloat);
            vector<vector<cl_float> > swap(3*numAtoms);
            for (int i = 0; i < numAtoms; i++) {
-            swap[3*lastAtomOrder[i]] = localPerDofValues[3*i];
-            swap[3*lastAtomOrder[i]+1] = localPerDofValues[3*i+1];
-            swap[3*lastAtomOrder[i]+2] = localPerDofValues[3*i+2];
+                swap[3*lastAtomOrder[i]] = localPerDofValuesFloat[3*i];
+                swap[3*lastAtomOrder[i]+1] = localPerDofValuesFloat[3*i+1];
+                swap[3*lastAtomOrder[i]+2] = localPerDofValuesFloat[3*i+2];
            }
-        const vector<cl_int>& order = cl.getAtomIndex();
            for (int i = 0; i < numAtoms; i++) {
-            localPerDofValues[3*i] = swap[3*order[i]];
-            localPerDofValues[3*i+1] = swap[3*order[i]+1];
-            localPerDofValues[3*i+2] = swap[3*order[i]+2];
+                localPerDofValuesFloat[3*i] = swap[3*order[i]];
+                localPerDofValuesFloat[3*i+1] = swap[3*order[i]+1];
+                localPerDofValuesFloat[3*i+2] = swap[3*order[i]+2];
+            }
+            perDofValues.setParameterValues(localPerDofValuesFloat);
        }
-        perDofValues.setParameterValues(localPerDofValues);
        for (int i = 0; i < numAtoms; i++)
            lastAtomOrder[i] = order[i];
        deviceValuesAreCurrent = true;
@@ -4387,7 +4591,8 @@ public:
 private:
    OpenCLContext& cl;
    OpenCLParameterSet& perDofValues;
-    vector<vector<cl_float> >& localPerDofValues;
+    vector<vector<cl_float> >& localPerDofValuesFloat;
+    vector<vector<cl_double> >& localPerDofValuesDouble;
    bool& deviceValuesAreCurrent;
    vector<int> lastAtomOrder;
 };
@@ -4413,11 +4618,12 @@ void OpenCLIntegrateCustomStepKernel::initialize(const System& system, const Cus
    cl.getPlatformData().initializeContexts(system);
    cl.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
    numGlobalVariables = integrator.getNumGlobalVariables();
-    globalValues = OpenCLArray::create<cl_float>(cl, max(1, numGlobalVariables), "globalVariables");
-    sumBuffer = OpenCLArray::create<cl_float>(cl, 3*system.getNumParticles(), "sumBuffer");
-    energy = OpenCLArray::create<cl_float>(cl, 1, "energy");
-    perDofValues = new OpenCLParameterSet(cl, integrator.getNumPerDofVariables(), 3*system.getNumParticles(), "perDofVariables");
-    cl.addReorderListener(new ReorderListener(cl, *perDofValues, localPerDofValues, deviceValuesAreCurrent));
+    int elementSize = (cl.getUseDoublePrecision() || cl.getUseMixedPrecision() ? sizeof(double) : sizeof(float));
+    globalValues = new OpenCLArray(cl, max(1, numGlobalVariables), elementSize, "globalVariables");
+    sumBuffer = new OpenCLArray(cl, 3*system.getNumParticles(), elementSize, "sumBuffer");
+    energy = new OpenCLArray(cl, 1, elementSize, "energy");
+    perDofValues = new OpenCLParameterSet(cl, integrator.getNumPerDofVariables(), 3*system.getNumParticles(), "perDofVariables", false, cl.getUseDoublePrecision() || cl.getUseMixedPrecision());
+    cl.addReorderListener(new ReorderListener(cl, *perDofValues, localPerDofValuesFloat, localPerDofValuesDouble, deviceValuesAreCurrent));
    prevStepSize = -1.0;
    SimTKOpenMMUtilities::setRandomNumberSeed(integrator.getRandomNumberSeed());
 }
@@ -4492,19 +4698,31 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
    OpenCLIntegrationUtilities& integration = cl.getIntegrationUtilities();
    int numAtoms = cl.getNumAtoms();
    int numSteps = integrator.getNumComputations();
+    bool useDouble = cl.getUseDoublePrecision() || cl.getUseMixedPrecision();
    if (!hasInitializedKernels) {
        hasInitializedKernels = true;
        
        // Initialize various data structures.
        
        const map<string, double>& params = context.getParameters();
+        if (useDouble) {
+            contextParameterValues = OpenCLArray::create<cl_double>(cl, max(1, (int) params.size()), "contextParameters");
+            contextValuesDouble.resize(contextParameterValues->getSize());
+            for (map<string, double>::const_iterator iter = params.begin(); iter != params.end(); ++iter) {
+                contextValuesDouble[parameterNames.size()] = iter->second;
+                parameterNames.push_back(iter->first);
+            }
+            contextParameterValues->upload(contextValuesDouble);
+        }
+        else {
            contextParameterValues = OpenCLArray::create<cl_float>(cl, max(1, (int) params.size()), "contextParameters");
-        contextValues.resize(contextParameterValues->getSize());
+            contextValuesFloat.resize(contextParameterValues->getSize());
            for (map<string, double>::const_iterator iter = params.begin(); iter != params.end(); ++iter) {
-            contextValues[parameterNames.size()] = (float) iter->second;
+                contextValuesFloat[parameterNames.size()] = (float) iter->second;
                parameterNames.push_back(iter->first);
            }
-        contextParameterValues->upload(contextValues);
+            contextParameterValues->upload(contextValuesFloat);
+        }
        kernels.resize(integrator.getNumComputations());
        requiredGaussian.resize(integrator.getNumComputations(), 0);
        requiredUniform.resize(integrator.getNumComputations(), 0);
@@ -4644,7 +4862,6 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
                    compute << buffer.getType()<<" perDofy"<<intToString(i+1)<<" = perDofValues"<<intToString(i+1)<<"[3*index+1];\n";
                    compute << buffer.getType()<<" perDofz"<<intToString(i+1)<<" = perDofValues"<<intToString(i+1)<<"[3*index+2];\n";
                }
-                string convert = (cl.getSupportsDoublePrecision() ? "convert_float4(" : "(");
                int numGaussian = 0, numUniform = 0;
                for (int j = step; j < numSteps && (j == step || merged[j]); j++) {
                    compute << "{\n";
@@ -4653,15 +4870,15 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
                    if (variable[j] == "x") {
                        if (storePosAsDelta[j]) {
                            if (cl.getSupportsDoublePrecision())
-                                compute << "posDelta[index] = convert_float4(position-convert_double4(posq[index]));\n";
+                                compute << "posDelta[index] = convert_mixed4(convert_double4(position)-convert_double4(loadPos(posq, posqCorrection, index)));\n";
                            else
                                compute << "posDelta[index] = position-posq[index];\n";
                        }
                        else
-                            compute << "posq[index] = " << convert << "position);\n";
+                            compute << "storePos(posq, posqCorrection, index, position);\n";
                    }
                    else if (variable[j] == "v")
-                        compute << "velm[index] = " << convert << "velocity);\n";
+                        compute << "velm[index] = convert_mixed4(velocity);\n";
                    else {
                        for (int i = 0; i < (int) perDofValues->getBuffers().size(); i++) {
                            const OpenCLNonbondedUtilities::ParameterInfo& buffer = perDofValues->getBuffers()[i];
@@ -4694,6 +4911,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
                requiredUniform[step] = numUniform;
                int index = 0;
                kernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
+                setPosqCorrectionArg(cl, kernel, index++);
                kernel.setArg<cl::Buffer>(index++, integration.getPosDelta().getDeviceBuffer());
                kernel.setArg<cl::Buffer>(index++, cl.getVelm().getDeviceBuffer());
                kernel.setArg<cl::Buffer>(index++, cl.getForce().getDeviceBuffer());
@@ -4711,7 +4929,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
                    // Create a second kernel for this step that sums the values.

                    program = cl.createProgram(OpenCLKernelSources::customIntegrator, defines);
-                    kernel = cl::Kernel(program, "computeSum");
+                    kernel = cl::Kernel(program, useDouble ? "computeDoubleSum" : "computeFloatSum");
                    kernels[step].push_back(kernel);
                    index = 0;
                    kernel.setArg<cl::Buffer>(index++, sumBuffer->getDeviceBuffer());
@@ -4760,6 +4978,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
                kernels[step].push_back(kernel);
                int index = 0;
                kernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
+                setPosqCorrectionArg(cl, kernel, index++);
                kernel.setArg<cl::Buffer>(index++, integration.getPosDelta().getDeviceBuffer());
            }
        }
@@ -4767,7 +4986,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
        // Create the kernel for summing energy.

        cl::Program program = cl.createProgram(OpenCLKernelSources::customIntegrator, defines);
-        sumEnergyKernel = cl::Kernel(program, "computeSum");
+        sumEnergyKernel = cl::Kernel(program, cl.getUseDoublePrecision() ? "computeDoubleSum" : "computeFloatSum");
        int index = 0;
        sumEnergyKernel.setArg<cl::Buffer>(index++, cl.getEnergyBuffer().getDeviceBuffer());
        sumEnergyKernel.setArg<cl::Buffer>(index++, energy->getDeviceBuffer());
@@ -4778,26 +4997,48 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
    // Make sure all values (variables, parameters, etc.) stored on the device are up to date.
    
    if (!deviceValuesAreCurrent) {
-        perDofValues->setParameterValues(localPerDofValues);
+        if (useDouble)
+            perDofValues->setParameterValues(localPerDofValuesDouble);
+        else
+            perDofValues->setParameterValues(localPerDofValuesFloat);
        deviceValuesAreCurrent = true;
    }
    localValuesAreCurrent = false;
    double stepSize = integrator.getStepSize();
    if (stepSize != prevStepSize) {
+        if (useDouble) {
+            mm_double2 ss = mm_double2(0, stepSize);
+            integration.getStepSize().upload(&ss);
+        }
+        else {
            mm_float2 ss = mm_float2(0, (float) stepSize);
            integration.getStepSize().upload(&ss);
+        }
        prevStepSize = stepSize;
    }
    bool paramsChanged = false;
+    if (useDouble) {
+        for (int i = 0; i < (int) parameterNames.size(); i++) {
+            double value = context.getParameter(parameterNames[i]);
+            if (value != contextValuesDouble[i]) {
+                contextValuesDouble[i] = value;
+                paramsChanged = true;
+            }
+        }
+        if (paramsChanged)
+            contextParameterValues->upload(contextValuesDouble);
+    }
+    else {
        for (int i = 0; i < (int) parameterNames.size(); i++) {
            float value = (float) context.getParameter(parameterNames[i]);
-        if (value != contextValues[i]) {
-            contextValues[i] = value;
+            if (value != contextValuesFloat[i]) {
+                contextValuesFloat[i] = value;
                paramsChanged = true;
            }
        }
        if (paramsChanged)
-        contextParameterValues->upload(contextValues);
+            contextParameterValues->upload(contextValuesFloat);
+    }

    // Loop over computation steps in the integrator and execute them.

@@ -4826,7 +5067,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
            forcesAreValid = true;
        }
        if (stepType[i] == CustomIntegrator::ComputePerDof && !merged[i]) {
-            kernels[i][0].setArg<cl_uint>(9, integration.prepareRandomNumbers(requiredGaussian[i]));
+            kernels[i][0].setArg<cl_uint>(10, integration.prepareRandomNumbers(requiredGaussian[i]));
            if (requiredUniform[i] > 0)
                cl.executeKernel(randomKernel, numAtoms);
            cl.executeKernel(kernels[i][0], numAtoms);
@@ -4837,7 +5078,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
            cl.executeKernel(kernels[i][0], 1, 1);
        }
        else if (stepType[i] == CustomIntegrator::ComputeSum) {
-            kernels[i][0].setArg<cl_uint>(9, integration.prepareRandomNumbers(requiredGaussian[i]));
+            kernels[i][0].setArg<cl_uint>(10, integration.prepareRandomNumbers(requiredGaussian[i]));
            if (requiredUniform[i] > 0)
                cl.executeKernel(randomKernel, numAtoms);
            cl.executeKernel(kernels[i][0], numAtoms);
@@ -4875,11 +5116,21 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
 void OpenCLIntegrateCustomStepKernel::recordChangedParameters(ContextImpl& context) {  // Downloads contextParameterValues and writes any value that differs from the context's current one back via setParameter().
    if (!modifiesParameters)  // nothing to write back unless modifiesParameters is set
        return;
-    contextParameterValues->download(contextValues);
+    if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {  // double or mixed precision: values are read into contextValuesDouble
+        contextParameterValues->download(contextValuesDouble);
+        for (int i = 0; i < (int) parameterNames.size(); i++) {
+            double value = context.getParameter(parameterNames[i]);
+            if (value != contextValuesDouble[i])  // only touch parameters whose downloaded value differs
+                context.setParameter(parameterNames[i], contextValuesDouble[i]);
+        }
+    }
+    else {  // otherwise: values are read into contextValuesFloat
+        contextParameterValues->download(contextValuesFloat);
        for (int i = 0; i < (int) parameterNames.size(); i++) {
            float value = (float) context.getParameter(parameterNames[i]);  // cast so the comparison below is done at float precision
-        if (value != contextValues[i])
-            context.setParameter(parameterNames[i], contextValues[i]);
+            if (value != contextValuesFloat[i])
+                context.setParameter(parameterNames[i], contextValuesFloat[i]);
+        }
    }
 }

@@ -4888,43 +5139,72 @@ void OpenCLIntegrateCustomStepKernel::getGlobalVariables(ContextImpl& context, v
        values.resize(0);
        return;
    }
+    if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision())
+        globalValues->download(values);
+    else {
        vector<cl_float> buffer;
        globalValues->download(buffer);
-    values.resize(numGlobalVariables);
        for (int i = 0; i < numGlobalVariables; i++)
            values[i] = buffer[i];
+    }
 }

 void OpenCLIntegrateCustomStepKernel::setGlobalVariables(ContextImpl& context, const vector<double>& values) {  // Uploads the supplied values into the globalValues array.
    if (numGlobalVariables == 0)  // no global variables, so nothing to upload
        return;
-    vector<cl_float> valuesVec(numGlobalVariables);
+    if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision())  // double or mixed precision: upload the doubles unchanged
+        globalValues->upload(values);
+    else {
+        vector<cl_float> buffer(numGlobalVariables);  // staging copy holding the values narrowed to cl_float
        for (int i = 0; i < numGlobalVariables; i++)
-        valuesVec[i] = (float) values[i];
-    globalValues->upload(valuesVec);
+            buffer[i] = (cl_float) values[i];
+        globalValues->upload(buffer);
+    }
 }

 void OpenCLIntegrateCustomStepKernel::getPerDofVariable(ContextImpl& context, int variable, vector<Vec3>& values) const {  // Fills values with per-DOF variable number `variable`, one Vec3 per three consecutive per-DOF entries.
+    values.resize(perDofValues->getNumObjects()/3);  // three per-DOF objects make up one Vec3
+    const vector<int>& order = cl.getAtomIndex();  // local entry i is written to values[order[i]]
+    if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {  // double or mixed precision: use the double local copy
        if (!localValuesAreCurrent) {  // refresh the local copy only when it is flagged stale
-        perDofValues->getParameterValues(localPerDofValues);
+            perDofValues->getParameterValues(localPerDofValuesDouble);
            localValuesAreCurrent = true;
        }
-    values.resize(perDofValues->getNumObjects()/3);
-    const vector<cl_int>& order = cl.getAtomIndex();
        for (int i = 0; i < (int) values.size(); i++)
            for (int j = 0; j < 3; j++)
-            values[order[i]][j] = localPerDofValues[3*i+j][variable];
+                values[order[i]][j] = localPerDofValuesDouble[3*i+j][variable];
+    }
+    else {  // otherwise: same steps using the float local copy
+        if (!localValuesAreCurrent) {
+            perDofValues->getParameterValues(localPerDofValuesFloat);
+            localValuesAreCurrent = true;
+        }
+        for (int i = 0; i < (int) values.size(); i++)
+            for (int j = 0; j < 3; j++)
+                values[order[i]][j] = localPerDofValuesFloat[3*i+j][variable];
+    }
 }

 void OpenCLIntegrateCustomStepKernel::setPerDofVariable(ContextImpl& context, int variable, const vector<Vec3>& values) {  // Stores values as per-DOF variable number `variable` in the local copy and clears deviceValuesAreCurrent.
+    const vector<int>& order = cl.getAtomIndex();  // local entry i is read from values[order[i]]
+    if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {  // double or mixed precision: use the double local copy
        if (!localValuesAreCurrent) {  // load the current values first, since only one variable is overwritten below
-        perDofValues->getParameterValues(localPerDofValues);
+            perDofValues->getParameterValues(localPerDofValuesDouble);
+            localValuesAreCurrent = true;
+        }
+        for (int i = 0; i < (int) values.size(); i++)
+            for (int j = 0; j < 3; j++)
+                localPerDofValuesDouble[3*i+j][variable] = values[order[i]][j];
+    }
+    else {  // otherwise: same steps using the float local copy
+        if (!localValuesAreCurrent) {
+            perDofValues->getParameterValues(localPerDofValuesFloat);
            localValuesAreCurrent = true;
        }
-    const vector<cl_int>& order = cl.getAtomIndex();
        for (int i = 0; i < (int) values.size(); i++)
            for (int j = 0; j < 3; j++)
-            localPerDofValues[3*i+j][variable] = (float) values[order[i]][j];
+                localPerDofValuesFloat[3*i+j][variable] = (float) values[order[i]][j];
+    }
    deviceValuesAreCurrent = false;  // the edited local copy has not been uploaded yet
 }

@@ -5035,14 +5315,24 @@ double OpenCLCalcKineticEnergyKernel::execute(ContextImpl& context) {
    // We don't currently have a GPU kernel to do this, so we retrieve the velocities and calculate the energy
    // on the CPU.

+    const vector<cl_int>& order = cl.getAtomIndex();
+    double energy = 0.0;
+    if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
+        mm_double4* velm = (mm_double4*) cl.getPinnedBuffer();
+        cl.getVelm().download(velm);
+        for (size_t i = 0; i < masses.size(); ++i) {
+            mm_double4 v = velm[i];
+            energy += masses[order[i]]*(v.x*v.x+v.y*v.y+v.z*v.z);
+        }
+    }
+    else {
        mm_float4* velm = (mm_float4*) cl.getPinnedBuffer();
        cl.getVelm().download(velm);
-    double energy = 0.0;
-    const vector<cl_int>& order = cl.getAtomIndex();
        for (size_t i = 0; i < masses.size(); ++i) {
            mm_float4 v = velm[i];
            energy += masses[order[i]]*(v.x*v.x+v.y*v.y+v.z*v.z);
        }
+    }
    return 0.5*energy;
 }


--- a/platforms/opencl/src/OpenCLKernels.h
+++ b/platforms/opencl/src/OpenCLKernels.h
@@ -1145,7 +1145,10 @@ private:
    OpenCLArray* uniformRandoms;
    OpenCLArray* randomSeed;
    OpenCLParameterSet* perDofValues;
-    mutable std::vector<std::vector<cl_float> > localPerDofValues;
+    mutable std::vector<std::vector<cl_float> > localPerDofValuesFloat;
+    mutable std::vector<std::vector<cl_double> > localPerDofValuesDouble;
+    std::vector<float> contextValuesFloat;
+    std::vector<double> contextValuesDouble;
    std::vector<float> contextValues;
    std::vector<std::vector<cl::Kernel> > kernels;
    cl::Kernel sumEnergyKernel, randomKernel;

--- a/platforms/opencl/src/OpenCLParameterSet.cpp
+++ b/platforms/opencl/src/OpenCLParameterSet.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Portions copyright (c) 2009-2012 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -32,32 +32,34 @@
 using namespace OpenMM;
 using namespace std;

-OpenCLParameterSet::OpenCLParameterSet(OpenCLContext& context, int numParameters, int numObjects, const string& name, bool bufferPerParameter) :
+OpenCLParameterSet::OpenCLParameterSet(OpenCLContext& context, int numParameters, int numObjects, const string& name, bool bufferPerParameter, bool useDoublePrecision) :
            context(context), numParameters(numParameters), numObjects(numObjects), name(name) {
    int params = numParameters;
    int bufferCount = 0;
+    elementSize = (useDoublePrecision ? sizeof(double) : sizeof(float));
+    string elementType = (useDoublePrecision ? "double" : "float");
    try {
        if (!bufferPerParameter) {
            while (params > 2) {
-                cl::Buffer* buf = new cl::Buffer(context.getContext(), CL_MEM_READ_WRITE, numObjects*sizeof(mm_float4));
+                cl::Buffer* buf = new cl::Buffer(context.getContext(), CL_MEM_READ_WRITE, numObjects*elementSize*4);
                std::stringstream name;
                name << "param" << (++bufferCount);
-                buffers.push_back(OpenCLNonbondedUtilities::ParameterInfo(name.str(), "float", 4, sizeof(mm_float4), *buf));
+                buffers.push_back(OpenCLNonbondedUtilities::ParameterInfo(name.str(), elementType, 4, elementSize*4, *buf));
                params -= 4;
            }
            if (params > 1) {
-                cl::Buffer* buf = new cl::Buffer(context.getContext(), CL_MEM_READ_WRITE, numObjects*sizeof(mm_float2));
+                cl::Buffer* buf = new cl::Buffer(context.getContext(), CL_MEM_READ_WRITE, numObjects*elementSize*2);
                std::stringstream name;
                name << "param" << (++bufferCount);
-                buffers.push_back(OpenCLNonbondedUtilities::ParameterInfo(name.str(), "float", 2, sizeof(mm_float2), *buf));
+                buffers.push_back(OpenCLNonbondedUtilities::ParameterInfo(name.str(), elementType, 2, elementSize*2, *buf));
                params -= 2;
            }
        }
        while (params > 0) {
-            cl::Buffer* buf = new cl::Buffer(context.getContext(), CL_MEM_READ_WRITE, numObjects*sizeof(cl_float));
+            cl::Buffer* buf = new cl::Buffer(context.getContext(), CL_MEM_READ_WRITE, numObjects*elementSize);
            std::stringstream name;
            name << "param" << (++bufferCount);
-            buffers.push_back(OpenCLNonbondedUtilities::ParameterInfo(name.str(), "float", 1, sizeof(cl_float), *buf));
+            buffers.push_back(OpenCLNonbondedUtilities::ParameterInfo(name.str(), elementType, 1, elementSize, *buf));
            params--;
        }
    }
@@ -73,39 +75,42 @@ OpenCLParameterSet::~OpenCLParameterSet() {
        delete &buffers[i].getMemory();
 }

-void OpenCLParameterSet::getParameterValues(vector<vector<cl_float> >& values) const {
+template <class T>
+void OpenCLParameterSet::getParameterValues(vector<vector<T> >& values) const {
+    if (sizeof(T) != elementSize)
+        throw OpenMMException("Called getParameterValues() with vector of wrong type");
    values.resize(numObjects);
    for (int i = 0; i < numObjects; i++)
        values[i].resize(numParameters);
    try {
        int base = 0;
        for (int i = 0; i < (int) buffers.size(); i++) {
-            if (buffers[i].getType() == "float4") {
-                vector<mm_float4> data(numObjects);
+            if (buffers[i].getSize() == 4*elementSize) {
+                vector<T> data(4*numObjects);
                context.getQueue().enqueueReadBuffer(reinterpret_cast<cl::Buffer&>(buffers[i].getMemory()), CL_TRUE, 0, numObjects*buffers[i].getSize(), &data[0]);
                for (int j = 0; j < numObjects; j++) {
-                    values[j][base] = data[j].x;
+                    values[j][base] = data[4*j];
                    if (base+1 < numParameters)
-                        values[j][base+1] = data[j].y;
+                        values[j][base+1] = data[4*j+1];
                    if (base+2 < numParameters)
-                        values[j][base+2] = data[j].z;
+                        values[j][base+2] = data[4*j+2];
                    if (base+3 < numParameters)
-                        values[j][base+3] = data[j].w;
+                        values[j][base+3] = data[4*j+3];
                }
                base += 4;
            }
-            else if (buffers[i].getType() == "float2") {
-                vector<mm_float2> data(numObjects);
+            else if (buffers[i].getSize() == 2*elementSize) {
+                vector<T> data(2*numObjects);
                context.getQueue().enqueueReadBuffer(reinterpret_cast<cl::Buffer&>(buffers[i].getMemory()), CL_TRUE, 0, numObjects*buffers[i].getSize(), &data[0]);
                for (int j = 0; j < numObjects; j++) {
-                    values[j][base] = data[j].x;
+                    values[j][base] = data[2*j];
                    if (base+1 < numParameters)
-                        values[j][base+1] = data[j].y;
+                        values[j][base+1] = data[2*j+1];
                }
                base += 2;
            }
-            else if (buffers[i].getType() == "float") {
-                vector<cl_float> data(numObjects);
+            else if (buffers[i].getSize() == elementSize) {
+                vector<T> data(numObjects);
                context.getQueue().enqueueReadBuffer(reinterpret_cast<cl::Buffer&>(buffers[i].getMemory()), CL_TRUE, 0, numObjects*buffers[i].getSize(), &data[0]);
                for (int j = 0; j < numObjects; j++)
                    values[j][base] = data[j];
@@ -122,36 +127,39 @@ void OpenCLParameterSet::getParameterValues(vector<vector<cl_float> >& values) c
    }
 }

-void OpenCLParameterSet::setParameterValues(const vector<vector<cl_float> >& values) {
+template <class T>
+void OpenCLParameterSet::setParameterValues(const vector<vector<T> >& values) {
+    if (sizeof(T) != elementSize)
+        throw OpenMMException("Called setParameterValues() with vector of wrong type");
    try {
        int base = 0;
        for (int i = 0; i < (int) buffers.size(); i++) {
-            if (buffers[i].getType() == "float4") {
-                vector<mm_float4> data(numObjects);
+            if (buffers[i].getSize() == 4*elementSize) {
+                vector<T> data(4*numObjects);
                for (int j = 0; j < numObjects; j++) {
-                    data[j].x = values[j][base];
+                    data[4*j] = values[j][base];
                    if (base+1 < numParameters)
-                        data[j].y = values[j][base+1];
+                        data[4*j+1] = values[j][base+1];
                    if (base+2 < numParameters)
-                        data[j].z = values[j][base+2];
+                        data[4*j+2] = values[j][base+2];
                    if (base+3 < numParameters)
-                        data[j].w = values[j][base+3];
+                        data[4*j+3] = values[j][base+3];
                }
                context.getQueue().enqueueWriteBuffer(reinterpret_cast<cl::Buffer&>(buffers[i].getMemory()), CL_TRUE, 0, numObjects*buffers[i].getSize(), &data[0]);
                base += 4;
            }
-            else if (buffers[i].getType() == "float2") {
-                vector<mm_float2> data(numObjects);
+            else if (buffers[i].getSize() == 2*elementSize) {
+                vector<T> data(2*numObjects);
                for (int j = 0; j < numObjects; j++) {
-                    data[j].x = values[j][base];
+                    data[2*j] = values[j][base];
                    if (base+1 < numParameters)
-                        data[j].y = values[j][base+1];
+                        data[2*j+1] = values[j][base+1];
                }
                context.getQueue().enqueueWriteBuffer(reinterpret_cast<cl::Buffer&>(buffers[i].getMemory()), CL_TRUE, 0, numObjects*buffers[i].getSize(), &data[0]);
                base += 2;
            }
-            else if (buffers[i].getType() == "float") {
-                vector<cl_float> data(numObjects);
+            else if (buffers[i].getSize() == elementSize) {
+                vector<T> data(numObjects);
                for (int j = 0; j < numObjects; j++)
                    data[j] = values[j][base];
                context.getQueue().enqueueWriteBuffer(reinterpret_cast<cl::Buffer&>(buffers[i].getMemory()), CL_TRUE, 0, numObjects*buffers[i].getSize(), &data[0]);
@@ -172,16 +180,26 @@ string OpenCLParameterSet::getParameterSuffix(int index, const std::string& extr
    const string suffixes[] = {".x", ".y", ".z", ".w"};
    int buffer = -1;
    for (int i = 0; buffer == -1 && i < (int) buffers.size(); i++) {
-        if (index*sizeof(cl_float) < buffers[i].getSize())
+        if (index*elementSize < buffers[i].getSize())
            buffer = i;
        else
-            index -= buffers[i].getSize()/sizeof(cl_float);
+            index -= buffers[i].getSize()/elementSize;
    }
    if (buffer == -1)
        throw OpenMMException("Internal error: Illegal argument to OpenCLParameterSet::getParameterSuffix() ("+name+")");
    stringstream suffix;
    suffix << (buffer+1) << extraSuffix;
-    if (buffers[buffer].getType() != "float")
+    if (buffers[buffer].getSize() != elementSize)
        suffix << suffixes[index];
    return suffix.str();
 }
+
+/**
+ * Define template instantiations for float and double versions of getParameterValues() and setParameterValues().
+ */
+namespace OpenMM { // the explicit instantiations below are emitted inside the OpenMM namespace
+template void OpenCLParameterSet::getParameterValues<float>(vector<vector<float> >& values) const; // T = float getter
+template void OpenCLParameterSet::setParameterValues<float>(const vector<vector<float> >& values); // T = float setter
+template void OpenCLParameterSet::getParameterValues<double>(vector<vector<double> >& values) const; // T = double getter
+template void OpenCLParameterSet::setParameterValues<double>(const vector<vector<double> >& values); // T = double setter; NOTE(review): presumably no other T is instantiated anywhere, so other element types would fail to link - confirm
+} // namespace OpenMM
\ No newline at end of file
--- a/platforms/opencl/src/OpenCLParameterSet.h
+++ b/platforms/opencl/src/OpenCLParameterSet.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Portions copyright (c) 2009-2012 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -51,8 +51,9 @@ public:
     * @param name             the name of the parameter set
     * @param bufferPerParameter  if true, a separate cl::Buffer is created for each parameter.  If false,
     *                            multiple parameters may be combined into a single buffer.
+     * @param useDoublePrecision  whether values should be stored as single or double precision
     */
-    OpenCLParameterSet(OpenCLContext& context, int numParameters, int numObjects, const std::string& name, bool bufferPerParameter=false);
+    OpenCLParameterSet(OpenCLContext& context, int numParameters, int numObjects, const std::string& name, bool bufferPerParameter=false, bool useDoublePrecision=false);
    ~OpenCLParameterSet();
    /**
     * Get the number of parameters.
@@ -71,13 +72,15 @@ public:
     *
     * @param values on exit, values[i][j] contains the value of parameter j for object i
     */
-    void getParameterValues(std::vector<std::vector<cl_float> >& values) const;
+    template <class T>
+    void getParameterValues(std::vector<std::vector<T> >& values) const;
    /**
     * Set the values of all parameters.
     *
     * @param values values[i][j] contains the value of parameter j for object i
     */
-    void setParameterValues(const std::vector<std::vector<cl_float> >& values);
+    template <class T>
+    void setParameterValues(const std::vector<std::vector<T> >& values);
    /**
     * Get a set of OpenCLNonbondedUtilities::ParameterInfo objects which describe the Buffers
     * containing the data.
@@ -95,8 +98,7 @@ public:
    std::string getParameterSuffix(int index, const std::string& extraSuffix = "") const;
 private:
    OpenCLContext& context;
-    int numParameters;
-    int numObjects;
+    int numParameters, numObjects, elementSize;
    std::string name;
    std::vector<OpenCLNonbondedUtilities::ParameterInfo> buffers;
 };

--- a/platforms/opencl/src/OpenCLPlatform.cpp
+++ b/platforms/opencl/src/OpenCLPlatform.cpp
@@ -76,8 +76,10 @@ OpenCLPlatform::OpenCLPlatform() {
    registerKernelFactory(RemoveCMMotionKernel::Name(), factory);
    platformProperties.push_back(OpenCLDeviceIndex());
    platformProperties.push_back(OpenCLPlatformIndex());
+    platformProperties.push_back(OpenCLPrecision());
    setPropertyDefaultValue(OpenCLDeviceIndex(), "");
    setPropertyDefaultValue(OpenCLPlatformIndex(), "");
+    setPropertyDefaultValue(OpenCLPrecision(), "single");
 }

 bool OpenCLPlatform::supportsDoublePrecision() const {
@@ -101,7 +103,9 @@ void OpenCLPlatform::contextCreated(ContextImpl& context, const map<string, stri
            getPropertyDefaultValue(OpenCLPlatformIndex()) : properties.find(OpenCLPlatformIndex())->second);
    const string& devicePropValue = (properties.find(OpenCLDeviceIndex()) == properties.end() ?
            getPropertyDefaultValue(OpenCLDeviceIndex()) : properties.find(OpenCLDeviceIndex())->second);
-    context.setPlatformData(new PlatformData(context.getSystem(), platformPropValue, devicePropValue));
+    string precisionPropValue = (properties.find(OpenCLPrecision()) == properties.end() ?
+            getPropertyDefaultValue(OpenCLPrecision()) : properties.find(OpenCLPrecision())->second);
+    context.setPlatformData(new PlatformData(context.getSystem(), platformPropValue, devicePropValue, precisionPropValue));
 }

 void OpenCLPlatform::contextDestroyed(ContextImpl& context) const {
@@ -109,7 +113,8 @@ void OpenCLPlatform::contextDestroyed(ContextImpl& context) const {
    delete data;
 }

-OpenCLPlatform::PlatformData::PlatformData(const System& system, const string& platformPropValue, const string& deviceIndexProperty) : removeCM(false), stepCount(0), computeForceCount(0), time(0.0)  {
+OpenCLPlatform::PlatformData::PlatformData(const System& system, const string& platformPropValue, const string& deviceIndexProperty,
+        const string& precisionProperty) : removeCM(false), stepCount(0), computeForceCount(0), time(0.0)  {
    int platformIndex = 0;
    if (platformPropValue.length() > 0)
        stringstream(platformPropValue) >> platformIndex;
@@ -124,11 +129,11 @@ OpenCLPlatform::PlatformData::PlatformData(const System& system, const string& p
        if (devices[i].length() > 0) {
            unsigned int deviceIndex;
            stringstream(devices[i]) >> deviceIndex;
-            contexts.push_back(new OpenCLContext(system, platformIndex, deviceIndex, *this));
+            contexts.push_back(new OpenCLContext(system, platformIndex, deviceIndex, precisionProperty, *this));
        }
    }
    if (contexts.size() == 0)
-        contexts.push_back(new OpenCLContext(system, platformIndex, -1, *this));
+        contexts.push_back(new OpenCLContext(system, platformIndex, -1, precisionProperty, *this));
    stringstream device;
    for (int i = 0; i < (int) contexts.size(); i++) {
        if (i > 0)
@@ -137,6 +142,7 @@ OpenCLPlatform::PlatformData::PlatformData(const System& system, const string& p
    }
    propertyValues[OpenCLPlatform::OpenCLDeviceIndex()] = device.str();
    propertyValues[OpenCLPlatform::OpenCLPlatformIndex()] = OpenCLExpressionUtilities::intToString(platformIndex);
+    propertyValues[OpenCLPlatform::OpenCLPrecision()] = precisionProperty;
    contextEnergy.resize(contexts.size());
 }


--- a/platforms/opencl/src/kernels/andersenThermostat.cl
+++ b/platforms/opencl/src/kernels/andersenThermostat.cl
@@ -2,17 +2,19 @@
 * Apply the Andersen thermostat to adjust particle velocities.
 */

-__kernel void applyAndersenThermostat(float collisionFrequency, float kT, __global float4* velm, __global const float2* restrict stepSize, __global const float4* restrict random,
+__kernel void applyAndersenThermostat(float collisionFrequency, float kT, __global mixed4* velm, __global const mixed2* restrict stepSize, __global const float4* restrict random, // velm and stepSize move to the mixed types; random stays float4
        unsigned int randomIndex, __global const int* restrict atomGroups) {
    float collisionProbability = 1.0f-exp(-collisionFrequency*stepSize[0].y);
    float randomRange = erf(collisionProbability/sqrt(2.0f));
    for (int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
-        float4 velocity = velm[index];
+        mixed4 velocity = velm[index]; // velocity.w feeds sqrt(kT*velocity.w) below and is written back unchanged
        float4 selectRand = random[randomIndex+atomGroups[index]];
        float4 velRand = random[randomIndex+index];
-        float scale = (selectRand.w > -randomRange && selectRand.w < randomRange ? 0.0f : 1.0f);
-        float add = (1.0f-scale)*sqrt(kT*velocity.w);
-        velocity.xyz = scale*velocity.xyz + add*velRand.xyz;
+        real scale = (selectRand.w > -randomRange && selectRand.w < randomRange ? 0 : 1); // 0 when selectRand.w lies strictly inside (-randomRange, randomRange), otherwise 1
+        real add = (1-scale)*sqrt(kT*velocity.w); // non-zero only when scale is 0
+        velocity.x = scale*velocity.x + add*velRand.x; // per-component form of the removed velocity.xyz expression
+        velocity.y = scale*velocity.y + add*velRand.y;
+        velocity.z = scale*velocity.z + add*velRand.z;
        velm[index] = velocity;
    }
 }
--- a/platforms/opencl/src/kernels/brownian.cl
+++ b/platforms/opencl/src/kernels/brownian.cl
@@ -2,13 +2,16 @@
 * Perform the first step of Brownian integration.
 */

-__kernel void integrateBrownianPart1(float tauDeltaT, float noiseAmplitude, __global const float4* restrict force,
-        __global float4* restrict posDelta, __global const float4* restrict velm, __global const float4* restrict random, unsigned int randomIndex) {
+__kernel void integrateBrownianPart1(mixed tauDeltaT, mixed noiseAmplitude, __global const real4* restrict force, // scalar arguments become mixed; force is read as real4
+        __global mixed4* restrict posDelta, __global const mixed4* restrict velm, __global const float4* restrict random, unsigned int randomIndex) { // posDelta and velm become mixed4; random stays float4
    randomIndex += get_global_id(0);
    for (int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
-        float invMass = velm[index].w;
-        if (invMass != 0.0)
-            posDelta[index] = (float4) (tauDeltaT*invMass*force[index].xyz + noiseAmplitude*sqrt(invMass)*random[randomIndex].xyz, 0.0f);
+        mixed invMass = velm[index].w; // w component of velm is used as the inverse mass
+        if (invMass != 0) { // atoms with velm.w == 0 are skipped, so their posDelta entry is not written here
+            posDelta[index] = (mixed4) (tauDeltaT*invMass*force[index].x + noiseAmplitude*sqrt(invMass)*random[randomIndex].x, // each component: tauDeltaT*invMass*force + noiseAmplitude*sqrt(invMass)*random
+                                        tauDeltaT*invMass*force[index].y + noiseAmplitude*sqrt(invMass)*random[randomIndex].y,
+                                        tauDeltaT*invMass*force[index].z + noiseAmplitude*sqrt(invMass)*random[randomIndex].z, 0); // w of the delta is set to 0
+        }
        randomIndex += get_global_size(0);
    }
 }
@@ -17,12 +20,29 @@ __kernel void integrateBrownianPart1(float tauDeltaT, float noiseAmplitude, __gl
 * Perform the second step of Brownian integration.
 */

-__kernel void integrateBrownianPart2(float oneOverDeltaT, __global float4* posq, __global float4* velm, __global const float4* restrict posDelta) {
+__kernel void integrateBrownianPart2(mixed oneOverDeltaT, __global real4* posq, __global real4* posqCorrection, __global mixed4* velm, __global const mixed4* restrict posDelta) { // posqCorrection is only touched under USE_MIXED_PRECISION
    for (int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
-        if (velm[index].w != 0.0) {
-            float4 delta = posDelta[index];
-            velm[index].xyz = oneOverDeltaT*delta.xyz;
-            posq[index].xyz = posq[index].xyz + delta.xyz;
+        if (velm[index].w != 0) { // atoms with velm.w == 0 are left untouched
+            mixed4 delta = posDelta[index];
+            velm[index].x = oneOverDeltaT*delta.x; // velocity xyz = delta * oneOverDeltaT; velm.w is not modified
+            velm[index].y = oneOverDeltaT*delta.y;
+            velm[index].z = oneOverDeltaT*delta.z;
+#ifdef USE_MIXED_PRECISION
+            real4 pos1 = posq[index];
+            real4 pos2 = posqCorrection[index];
+            mixed4 pos = (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w); // xyz = posq + posqCorrection evaluated in mixed; w is taken from posq only
+#else
+            real4 pos = posq[index];
+#endif
+            pos.x += delta.x; // apply the displacement to xyz only
+            pos.y += delta.y;
+            pos.z += delta.z;
+#ifdef USE_MIXED_PRECISION
+            posq[index] = (real4) ((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w); // store the position converted to real
+            posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0); // store what the conversion to real dropped; w of the correction is 0
+#else
+            posq[index] = pos;
+#endif
        }
    }
 }
--- a/platforms/opencl/src/kernels/ccma.cl
+++ b/platforms/opencl/src/kernels/ccma.cl
+mixed4 loadPos(__global const real4* restrict posq, __global const real4* restrict posqCorrection, int index) { // returns the position stored for element index as a mixed4
+#ifdef USE_MIXED_PRECISION
+    real4 pos1 = posq[index];
+    real4 pos2 = posqCorrection[index];
+    return (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w); // xyz = posq + posqCorrection evaluated in mixed; w comes from posq only
+#else
+    return posq[index]; // posqCorrection is not read on this path
+#endif
+}
 /**
 * Compute the direction each constraint is pointing in.  This is called once at the beginning of constraint evaluation.
 */
-__kernel void computeConstraintDirections(__global const int2* restrict constraintAtoms, __global float4* restrict constraintDistance, __global const float4* restrict atomPositions) {
+__kernel void computeConstraintDirections(__global const int2* restrict constraintAtoms, __global mixed4* restrict constraintDistance, __global const real4* restrict atomPositions, __global const real4* restrict posCorrection) {
    for (int index = get_global_id(0); index < NUM_CONSTRAINTS; index += get_global_size(0)) {
        // Compute the direction for this constraint.

        int2 atoms = constraintAtoms[index];
-        float4 dir = constraintDistance[index];
-        float4 oldPos1 = atomPositions[atoms.x];
-        float4 oldPos2 = atomPositions[atoms.y];
+        mixed4 dir = constraintDistance[index];
+        mixed4 oldPos1 = loadPos(atomPositions, posCorrection, atoms.x);
+        mixed4 oldPos2 = loadPos(atomPositions, posCorrection, atoms.y);
        dir.x = oldPos1.x-oldPos2.x;
        dir.y = oldPos1.y-oldPos2.y;
        dir.z = oldPos1.z-oldPos2.z;
@@ -19,8 +28,8 @@ __kernel void computeConstraintDirections(__global const int2* restrict constrai
 /**
 * Compute the force applied by each constraint.
 */
-__kernel void computeConstraintForce(__global const int2* restrict constraintAtoms, __global const float4* restrict constraintDistance, __global const float4* restrict atomPositions,
-        __global const float* restrict reducedMass, __global float* restrict delta1, __global int* restrict converged, float tol, int iteration) {
+__kernel void computeConstraintForce(__global const int2* restrict constraintAtoms, __global const mixed4* restrict constraintDistance, __global const mixed4* restrict atomPositions,
+        __global const mixed* restrict reducedMass, __global mixed* restrict delta1, __global int* restrict converged, mixed tol, int iteration) {
    __local int groupConverged;
    if (converged[1-iteration%2]) {
        if (get_global_id(0) == 0)
@@ -30,21 +39,21 @@ __kernel void computeConstraintForce(__global const int2* restrict constraintAto
    if (get_local_id(0) == 0)
        groupConverged = 1;
    barrier(CLK_LOCAL_MEM_FENCE);
-    float lowerTol = 1.0f-2.0f*tol+tol*tol;
-    float upperTol = 1.0f+2.0f*tol+tol*tol;
+    mixed lowerTol = 1-2*tol+tol*tol;
+    mixed upperTol = 1+2*tol+tol*tol;
    for (int index = get_global_id(0); index < NUM_CONSTRAINTS; index += get_global_size(0)) {
        // Compute the force due to this constraint.

        int2 atoms = constraintAtoms[index];
-        float4 dir = constraintDistance[index];
-        float4 rp_ij = atomPositions[atoms.x]-atomPositions[atoms.y];
+        mixed4 dir = constraintDistance[index];
+        mixed4 rp_ij = atomPositions[atoms.x]-atomPositions[atoms.y];
 #ifndef CONSTRAIN_VELOCITIES
        rp_ij.xyz += dir.xyz;
 #endif
-        float rrpr = rp_ij.x*dir.x + rp_ij.y*dir.y + rp_ij.z*dir.z;
-        float d_ij2 = dir.x*dir.x + dir.y*dir.y + dir.z*dir.z;
+        mixed rrpr = rp_ij.x*dir.x + rp_ij.y*dir.y + rp_ij.z*dir.z;
+        mixed d_ij2 = dir.x*dir.x + dir.y*dir.y + dir.z*dir.z;
 #ifdef CONSTRAIN_VELOCITIES
-        delta1[index] = -2.0f*reducedMass[index]*rrpr/d_ij2;
+        delta1[index] = -2*reducedMass[index]*rrpr/d_ij2;

        // See whether it has converged.

@@ -53,9 +62,9 @@ __kernel void computeConstraintForce(__global const int2* restrict constraintAto
            converged[iteration%2] = 0;
        }
 #else
-        float rp2 = rp_ij.x*rp_ij.x + rp_ij.y*rp_ij.y + rp_ij.z*rp_ij.z;
-        float dist2 = dir.w*dir.w;
-        float diff = dist2 - rp2;
+        mixed rp2 = rp_ij.x*rp_ij.x + rp_ij.y*rp_ij.y + rp_ij.z*rp_ij.z;
+        mixed dist2 = dir.w*dir.w;
+        mixed diff = dist2 - rp2;
        delta1[index] = (rrpr > d_ij2*1e-6f ? reducedMass[index]*diff/rrpr : 0.0f);

        // See whether it has converged.
@@ -71,15 +80,15 @@ __kernel void computeConstraintForce(__global const int2* restrict constraintAto
 /**
 * Multiply the vector of constraint forces by the constraint matrix.
 */
-__kernel void multiplyByConstraintMatrix(__global const float* restrict delta1, __global float* restrict delta2, __global const int* restrict constraintMatrixColumn,
-        __global const float* restrict constraintMatrixValue, __global const int* restrict converged, int iteration) {
+__kernel void multiplyByConstraintMatrix(__global const mixed* restrict delta1, __global mixed* restrict delta2, __global const int* restrict constraintMatrixColumn,
+        __global const mixed* restrict constraintMatrixValue, __global const int* restrict converged, int iteration) {
    if (converged[iteration%2])
        return; // The constraint iteration has already converged.

    // Multiply by the inverse constraint matrix.

    for (int index = get_global_id(0); index < NUM_CONSTRAINTS; index += get_global_size(0)) {
-        float sum = 0.0f;
+        mixed sum = 0;
        for (int i = 0; ; i++) {
            int element = index+i*NUM_CONSTRAINTS;
            int column = constraintMatrixColumn[element];
@@ -94,26 +103,26 @@ __kernel void multiplyByConstraintMatrix(__global const float* restrict delta1,
 /**
 * Update the atom positions based on constraint forces.
 */
-__kernel void updateAtomPositions(__global const int* restrict numAtomConstraints, __global const int* restrict atomConstraints, __global const float4* restrict constraintDistance,
-        __global float4* restrict atomPositions, __global const float4* restrict velm, __global const float* restrict delta1, __global const float* restrict delta2, __global int* restrict converged, int iteration) {
+__kernel void updateAtomPositions(__global const int* restrict numAtomConstraints, __global const int* restrict atomConstraints, __global const mixed4* restrict constraintDistance,
+        __global mixed4* restrict atomPositions, __global const mixed4* restrict velm, __global const mixed* restrict delta1, __global const mixed* restrict delta2, __global int* restrict converged, int iteration) {
    if (get_global_id(0) == 0)
        converged[1-iteration%2] = 1;
    if (converged[iteration%2])
        return; // The constraint iteration has already converged.
-    float damping = (iteration < 2 ? 0.5f : 1.0f);
+    mixed damping = (iteration < 2 ? 0.5f : 1.0f);
    for (int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
        // Compute the new position of this atom.

-        float4 atomPos = atomPositions[index];
-        float invMass = velm[index].w;
+        mixed4 atomPos = atomPositions[index];
+        mixed invMass = velm[index].w;
        int num = numAtomConstraints[index];
        for (int i = 0; i < num; i++) {
            int constraint = atomConstraints[index+i*NUM_ATOMS];
            bool forward = (constraint > 0);
            constraint = (forward ? constraint-1 : -constraint-1);
-            float constraintForce = damping*invMass*delta2[constraint];
+            mixed constraintForce = damping*invMass*delta2[constraint];
            constraintForce = (forward ? constraintForce : -constraintForce);
-            float4 dir = constraintDistance[constraint];
+            mixed4 dir = constraintDistance[constraint];
            atomPos.x += constraintForce*dir.x;
            atomPos.y += constraintForce*dir.y;
            atomPos.z += constraintForce*dir.z;

--- a/platforms/opencl/src/kernels/constraints.cl
+++ b/platforms/opencl/src/kernels/constraints.cl
-__kernel void applyPositionDeltas(__global float4* restrict posq, __global float4* restrict posDelta) {
+__kernel void applyPositionDeltas(__global real4* restrict posq, __global real4* restrict posqCorrection, __global mixed4* restrict posDelta) { // posqCorrection is only touched under USE_MIXED_PRECISION
    for (unsigned int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
-        float4 position = posq[index];
-        position.xyz += posDelta[index].xyz;
-        posq[index] = position;
+#ifdef USE_MIXED_PRECISION
+        real4 pos1 = posq[index];
+        real4 pos2 = posqCorrection[index];
+        mixed4 pos = (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w); // xyz = posq + posqCorrection evaluated in mixed; w is taken from posq only
+#else
+        mixed4 pos = posq[index]; // NOTE(review): a real4 value is assigned to a mixed4 here; presumably both names denote the same type when USE_MIXED_PRECISION is undefined - confirm
+#endif
+        pos.xyz += posDelta[index].xyz; // only xyz is shifted; w is carried through unchanged
+#ifdef USE_MIXED_PRECISION
+        posq[index] = (real4) ((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w); // store the position converted to real
+        posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0); // store what the conversion to real dropped; w of the correction is 0
+#else
+        posq[index] = pos;
+#endif
    }
 }
--- a/platforms/opencl/src/kernels/customIntegrator.cl
+++ b/platforms/opencl/src/kernels/customIntegrator.cl
-__kernel void computeSum(__global const float* restrict sumBuffer, __global float* result, unsigned int outputIndex, int bufferSize) {
+__kernel void computeFloatSum(__global const float* restrict sumBuffer, __global float* result, unsigned int outputIndex, int bufferSize) {
    __local float tempBuffer[WORK_GROUP_SIZE];
    const unsigned int thread = get_local_id(0);
-    float sum = 0.0f;
+    float sum = 0;
    for (unsigned int index = thread; index < bufferSize; index += get_local_size(0))
        sum += sumBuffer[index];
    tempBuffer[thread] = sum;
@@ -14,12 +14,41 @@ __kernel void computeSum(__global const float* restrict sumBuffer, __global floa
        result[outputIndex] = tempBuffer[0];
 }

-__kernel void applyPositionDeltas(__global float4* restrict posq, __global float4* restrict posDelta) {
+#ifdef SUPPORTS_DOUBLE_PRECISION // the double variant is compiled only when this macro is defined
+__kernel void computeDoubleSum(__global const double* restrict sumBuffer, __global double* result, unsigned int outputIndex, int bufferSize) { // sums the first bufferSize entries of sumBuffer into result[outputIndex]
+    __local double tempBuffer[WORK_GROUP_SIZE]; // one partial sum per work-item; NOTE(review): indexing uses only get_local_id, so this presumably runs as a single work-group of WORK_GROUP_SIZE items - confirm
+    const unsigned int thread = get_local_id(0);
+    double sum = 0;
+    for (unsigned int index = thread; index < bufferSize; index += get_local_size(0)) // each work-item accumulates entries thread, thread+local_size, ...
+        sum += sumBuffer[index];
+    tempBuffer[thread] = sum;
+    for (int i = 1; i < WORK_GROUP_SIZE; i *= 2) { // pairwise reduction; the stride i doubles on every pass
+        barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier with a local-memory fence before tempBuffer is read in this pass
+        if (thread%(i*2) == 0 && thread+i < WORK_GROUP_SIZE) // only items at multiples of 2*i add their partner at thread+i, bounds-checked
+            tempBuffer[thread] += tempBuffer[thread+i];
+    }
+    if (thread == 0) // work-item 0 holds the total in tempBuffer[0] and writes it out
+        result[outputIndex] = tempBuffer[0];
+}
+#endif
+
+__kernel void applyPositionDeltas(__global real4* restrict posq, __global real4* restrict posqCorrection, __global mixed4* restrict posDelta) { // posqCorrection is only touched under USE_MIXED_PRECISION
    for (unsigned int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
-        float4 position = posq[index];
-        position.xyz += posDelta[index].xyz;
-        posq[index] = position;
-        posDelta[index] = (float4) 0.0f;
+#ifdef USE_MIXED_PRECISION
+        real4 pos1 = posq[index];
+        real4 pos2 = posqCorrection[index];
+        mixed4 pos = (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w); // xyz = posq + posqCorrection evaluated in mixed; w is taken from posq only
+#else
+        real4 pos = posq[index];
+#endif
+        pos.xyz += posDelta[index].xyz; // only xyz is shifted; w is carried through unchanged
+#ifdef USE_MIXED_PRECISION
+        posq[index] = (real4) ((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w); // store the position converted to real
+        posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0); // store what the conversion to real dropped; w of the correction is 0
+#else
+        posq[index] = pos;
+#endif
+        posDelta[index] = (mixed4) 0; // clear the delta once it has been applied
    }
 }


--- a/platforms/opencl/src/kernels/customIntegratorGlobal.cl
+++ b/platforms/opencl/src/kernels/customIntegratorGlobal.cl
-__kernel void computeGlobal(__global float2* restrict dt, __global float* restrict globals, __global float* restrict params,
-        float uniform, float gaussian, __global const float* restrict energy) {
+__kernel void computeGlobal(__global mixed2* restrict dt, __global mixed* restrict globals, __global mixed* restrict params, // dt, globals and params become mixed-typed buffers
+        float uniform, float gaussian, __global const real* restrict energy) { // uniform and gaussian stay float; energy is read as real; the body is whatever the COMPUTE_STEP macro expands to
    COMPUTE_STEP
 }
--- a/platforms/opencl/src/kernels/customIntegratorPerDof.cl
+++ b/platforms/opencl/src/kernels/customIntegratorPerDof.cl
-#ifdef SUPPORTS_DOUBLE_PRECISION
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+/**
+ * Load the position of a particle as a mixed4 (posq plus posqCorrection when USE_MIXED_PRECISION is defined, posq alone otherwise).
+ */
+mixed4 loadPos(__global const real4* restrict posq, __global const real4* restrict posqCorrection, int index) {
+#ifdef USE_MIXED_PRECISION
+    real4 pos1 = posq[index];
+    real4 pos2 = posqCorrection[index];
+    return (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w); // xyz sums are evaluated in mixed; w comes from posq only
+#else
+    return posq[index]; // posqCorrection is not read on this path
+#endif
+}
+
+/**
+ * Store the position of a particle: under USE_MIXED_PRECISION it is written to posq converted to real, with the remainder written to posqCorrection; otherwise it is written to posq directly.
+ */
+void storePos(__global real4* restrict posq, __global real4* restrict posqCorrection, int index, mixed4 pos) {
+#ifdef USE_MIXED_PRECISION
+    posq[index] = (real4) ((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w); // all four components converted to real
+    posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0); // what the conversion dropped for xyz; w of the correction is 0
+#else
+    posq[index] = pos; // posqCorrection is not written on this path
 #endif
+}

-__kernel void computePerDof(__global float4* restrict posq, __global float4* restrict posDelta, __global float4* restrict velm,
-        __global const float4* restrict force, __global const float2* restrict dt, __global const float* restrict globals,
-        __global const float* restrict params, __global float* restrict sum, __global const float4* restrict gaussianValues,
-        unsigned int randomIndex, __global const float4* restrict uniformValues, __global const float* restrict energy
+__kernel void computePerDof(__global real4* restrict posq, __global real4* restrict posqCorrection, __global mixed4* restrict posDelta,
+        __global mixed4* restrict velm, __global const real4* restrict force, __global const mixed2* restrict dt, __global const mixed* restrict globals,
+        __global const mixed* restrict params, __global mixed* restrict sum, __global const float4* restrict gaussianValues,
+        unsigned int randomIndex, __global const float4* restrict uniformValues, __global const real* restrict energy
        PARAMETER_ARGUMENTS) {
-    float stepSize = dt[0].y;
+    mixed stepSize = dt[0].y;
    int index = get_global_id(0);
    randomIndex += index;
    while (index < NUM_ATOMS) {
-#ifdef SUPPORTS_DOUBLE_PRECISION
 #ifdef LOAD_POS_AS_DELTA
-        double4 position = convert_double4(posq[index]+posDelta[index]);
+        mixed4 position = loadPos(posq, posqCorrection, index)+posDelta[index];
 #else
-        double4 position = convert_double4(posq[index]);
-#endif
-        double4 velocity = convert_double4(velm[index]);
-        double4 f = convert_double4(force[index]);
-        double mass = 1.0/velocity.w;
-#else
-#ifdef LOAD_POS_AS_DELTA
-        float4 position = posq[index]+posDelta[index];
-#else
-        float4 position = posq[index];
-#endif
-        float4 velocity = velm[index];
-        float4 f = force[index];
-        float mass = 1.0f/velocity.w;
+        mixed4 position = loadPos(posq, posqCorrection, index);
 #endif
+        mixed4 velocity = velm[index];
+        real4 f = force[index];
+        mixed mass = 1/velocity.w;
        if (velocity.w != 0.0) {
            float4 gaussian = gaussianValues[randomIndex];
            float4 uniform = uniformValues[index];

--- a/platforms/opencl/src/kernels/ewald.cl
+++ b/platforms/opencl/src/kernels/ewald.cl
-
 float2 multofFloat2(float2 a, float2 b) {
    return (float2) (a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x);
 }

--- a/platforms/opencl/src/kernels/langevin.cl
+++ b/platforms/opencl/src/kernels/langevin.cl
-#ifdef SUPPORTS_DOUBLE_PRECISION
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-#endif
-
 enum {VelScale, ForceScale, NoiseScale, MaxParams};

 /**
 * Perform the first step of Langevin integration.
 */

-__kernel void integrateLangevinPart1(__global float4* restrict velm, __global const float4* restrict force, __global float4* restrict posDelta,
-        __global const float* restrict paramBuffer, __global const float2* restrict dt, __global const float4* restrict random, unsigned int randomIndex) {
-    float vscale = paramBuffer[VelScale];
-    float fscale = paramBuffer[ForceScale];
-    float noisescale = paramBuffer[NoiseScale];
-    float stepSize = dt[0].y;
+__kernel void integrateLangevinPart1(__global mixed4* restrict velm, __global const real4* restrict force, __global mixed4* restrict posDelta,
+        __global const mixed* restrict paramBuffer, __global const mixed2* restrict dt, __global const float4* restrict random, unsigned int randomIndex) {
+    mixed vscale = paramBuffer[VelScale];
+    mixed fscale = paramBuffer[ForceScale];
+    mixed noisescale = paramBuffer[NoiseScale];
+    mixed stepSize = dt[0].y;
    int index = get_global_id(0);
    randomIndex += index;
    while (index < NUM_ATOMS) {
-        float4 velocity = velm[index];
+        mixed4 velocity = velm[index];
        if (velocity.w != 0.0) {
-            float sqrtInvMass = sqrt(velocity.w);
-            velocity.xyz = vscale*velocity.xyz + fscale*velocity.w*force[index].xyz + noisescale*sqrtInvMass*random[randomIndex].xyz;
+            mixed sqrtInvMass = sqrt(velocity.w);
+            velocity.x = vscale*velocity.x + fscale*velocity.w*force[index].x + noisescale*sqrtInvMass*random[randomIndex].x;
+            velocity.y = vscale*velocity.y + fscale*velocity.w*force[index].y + noisescale*sqrtInvMass*random[randomIndex].y;
+            velocity.z = vscale*velocity.z + fscale*velocity.w*force[index].z + noisescale*sqrtInvMass*random[randomIndex].z;
            velm[index] = velocity;
            posDelta[index] = stepSize*velocity;
        }
@@ -33,7 +31,7 @@ __kernel void integrateLangevinPart1(__global float4* restrict velm, __global co
 * Perform the second step of Langevin integration.
 */

-__kernel void integrateLangevinPart2(__global float4* restrict posq, __global const float4* restrict posDelta, __global float4* restrict velm, __global const float2* restrict dt) {
+__kernel void integrateLangevinPart2(__global real4* restrict posq, __global real4* restrict posqCorrection, __global const mixed4* restrict posDelta, __global mixed4* restrict velm, __global const mixed2* restrict dt) {
 #ifdef SUPPORTS_DOUBLE_PRECISION
    double invStepSize = 1.0/dt[0].y;
 #else
@@ -41,17 +39,28 @@ __kernel void integrateLangevinPart2(__global float4* restrict posq, __global co
 #endif
    int index = get_global_id(0);
    while (index < NUM_ATOMS) {
-        float4 vel = velm[index];
+        mixed4 vel = velm[index];
        if (vel.w != 0.0) {
-            float4 pos = posq[index];
-            float4 delta = posDelta[index];
+#ifdef USE_MIXED_PRECISION
+            real4 pos1 = posq[index];
+            real4 pos2 = posqCorrection[index];
+            mixed4 pos = (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
+#else
+            real4 pos = posq[index];
+#endif
+            mixed4 delta = posDelta[index];
            pos.xyz += delta.xyz;
 #ifdef SUPPORTS_DOUBLE_PRECISION
-            vel.xyz = convert_float4(invStepSize*convert_double4(delta)).xyz;
+            vel.xyz = convert_mixed4(invStepSize*convert_double4(delta)).xyz;
 #else
            vel.xyz = invStepSize*delta.xyz;
 #endif
+#ifdef USE_MIXED_PRECISION
+            posq[index] = convert_real4(pos);
+            posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
+#else
            posq[index] = pos;
+#endif
            velm[index] = vel;
        }
        index += get_global_size(0);
@@ -62,15 +71,15 @@ __kernel void integrateLangevinPart2(__global float4* restrict posq, __global co
 * Select the step size to use for the next step.
 */

-__kernel void selectLangevinStepSize(float maxStepSize, float errorTol, float tau, float kT, __global float2* restrict dt,
-        __global const float4* restrict velm, __global const float4* restrict force, __global float* restrict paramBuffer, __local float* restrict params, __local float* restrict error) {
+__kernel void selectLangevinStepSize(mixed maxStepSize, mixed errorTol, mixed tau, mixed kT, __global mixed2* restrict dt,
+        __global const mixed4* restrict velm, __global const real4* restrict force, __global mixed* restrict paramBuffer, __local mixed* restrict params, __local mixed* restrict error) {
    // Calculate the error.

-    float err = 0.0f;
+    mixed err = 0.0f;
    unsigned int index = get_local_id(0);
    while (index < NUM_ATOMS) {
-        float4 f = force[index];
-        float invMass = velm[index].w;
+        real4 f = force[index];
+        mixed invMass = velm[index].w;
        err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass;
        index += get_global_size(0);
    }
@@ -87,9 +96,9 @@ __kernel void selectLangevinStepSize(float maxStepSize, float errorTol, float ta
    if (get_global_id(0) == 0) {
        // Select the new step size.

-        float totalError = sqrt(error[0]/(NUM_ATOMS*3));
-        float newStepSize = sqrt(errorTol/totalError);
-        float oldStepSize = dt[0].y;
+        mixed totalError = sqrt(error[0]/(NUM_ATOMS*3));
+        mixed newStepSize = sqrt(errorTol/totalError);
+        mixed oldStepSize = dt[0].y;
        if (oldStepSize > 0.0f)
            newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase.
        if (newStepSize > oldStepSize && newStepSize < 1.1f*oldStepSize)
@@ -100,9 +109,9 @@ __kernel void selectLangevinStepSize(float maxStepSize, float errorTol, float ta

        // Recalculate the integration parameters.

-        float vscale = exp(-newStepSize/tau);
-        float fscale = (1-vscale)*tau;
-        float noisescale = sqrt(2*kT/tau)*sqrt(0.5f*(1-vscale*vscale)*tau);
+        mixed vscale = exp(-newStepSize/tau);
+        mixed fscale = (1-vscale)*tau;
+        mixed noisescale = sqrt(2*kT/tau)*sqrt(0.5f*(1-vscale*vscale)*tau);
        params[VelScale] = vscale;
        params[ForceScale] = fscale;
        params[NoiseScale] = noisescale;

--- a/platforms/opencl/src/kernels/removeCM.cl
+++ b/platforms/opencl/src/kernels/removeCM.cl
@@ -2,13 +2,16 @@
 * Calculate the center of mass momentum.
 */

-__kernel void calcCenterOfMassMomentum(int numAtoms, __global const float4* restrict velm, __global float4* restrict cmMomentum, __local volatile float4* restrict temp) {
+__kernel void calcCenterOfMassMomentum(int numAtoms, __global const mixed4* restrict velm, __global float4* restrict cmMomentum, __local volatile float4* restrict temp) {
    int index = get_global_id(0);
    float4 cm = 0.0f;
    while (index < numAtoms) {
-        float4 velocity = velm[index];
-        if (velocity.w != 0.0)
-            cm.xyz += velocity.xyz/velocity.w;
+        mixed4 velocity = velm[index];
+        if (velocity.w != 0) {
+            cm.x += velocity.x/velocity.w;
+            cm.y += velocity.y/velocity.w;
+            cm.z += velocity.z/velocity.w;
+        }
        index += get_global_size(0);
    }

@@ -54,7 +57,7 @@ __kernel void calcCenterOfMassMomentum(int numAtoms, __global const float4* rest
 * Remove center of mass motion.
 */

-__kernel void removeCenterOfMassMomentum(unsigned int numAtoms, __global float4* restrict velm, __global const float4* restrict cmMomentum, __local volatile float4* restrict temp) {
+__kernel void removeCenterOfMassMomentum(unsigned int numAtoms, __global mixed4* restrict velm, __global const float4* restrict cmMomentum, __local volatile float4* restrict temp) {
    // First sum all of the momenta that were calculated by individual groups.

    unsigned int index = get_local_id(0);
@@ -101,7 +104,9 @@ __kernel void removeCenterOfMassMomentum(unsigned int numAtoms, __global float4*

    index = get_global_id(0);
    while (index < numAtoms) {
-        velm[index].xyz -= cm.xyz;
+        velm[index].x -= cm.x;
+        velm[index].y -= cm.y;
+        velm[index].z -= cm.z;
        index += get_global_size(0);
    }
 }
--- a/platforms/opencl/src/kernels/settle.cl
+++ b/platforms/opencl/src/kernels/settle.cl
+mixed4 loadPos(__global const real4* restrict posq, __global const real4* restrict posqCorrection, int index) { // Returns the position entry for atom `index` as a mixed4.
+#ifdef USE_MIXED_PRECISION
+    real4 pos1 = posq[index]; // primary stored value; its w component is passed through unchanged below
+    real4 pos2 = posqCorrection[index]; // per-component correction term; presumably the residual lost when the position was rounded to real - confirm against the kernel that writes it
+    return (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w); // the correction is cast to mixed before the add; only x, y, z are corrected
+#else
+    return posq[index]; // posqCorrection is never read on this path; assumes real4 is the same type as mixed4 here - TODO confirm
+#endif
+}
+
 /**
 * Enforce constraints on SETTLE clusters
 */

-__kernel void applySettle(int numClusters, float tol, __global const float4* restrict oldPos, __global float4* restrict posDelta, __global const float4* restrict velm, __global const int4* restrict clusterAtoms, __global const float2* restrict clusterParams) {
+__kernel void applySettle(int numClusters, mixed tol, __global const real4* restrict oldPos, __global const real4* restrict posCorrection, __global mixed4* restrict posDelta, __global const mixed4* restrict velm, __global const int4* restrict clusterAtoms, __global const float2* restrict clusterParams) {
    int index = get_global_id(0);
    while (index < numClusters) {
        // Load the data for this cluster.

        int4 atoms = clusterAtoms[index];
        float2 params = clusterParams[index];
-        float4 apos0 = oldPos[atoms.x];
-        float4 xp0 = posDelta[atoms.x];
-        float4 apos1 = oldPos[atoms.y];
-        float4 xp1 = posDelta[atoms.y];
-        float4 apos2 = oldPos[atoms.z];
-        float4 xp2 = posDelta[atoms.z];
-        float m0 = RECIP(velm[atoms.x].w);
-        float m1 = RECIP(velm[atoms.y].w);
-        float m2 = RECIP(velm[atoms.z].w);
+        mixed4 apos0 = loadPos(oldPos, posCorrection, atoms.x);
+        mixed4 xp0 = posDelta[atoms.x];
+        mixed4 apos1 = loadPos(oldPos, posCorrection, atoms.y);
+        mixed4 xp1 = posDelta[atoms.y];
+        mixed4 apos2 = loadPos(oldPos, posCorrection, atoms.z);
+        mixed4 xp2 = posDelta[atoms.z];
+        mixed m0 = 1/velm[atoms.x].w;
+        mixed m1 = 1/velm[atoms.y].w;
+        mixed m2 = 1/velm[atoms.z].w;

        // Apply the SETTLE algorithm.

-        float xb0 = apos1.x-apos0.x;
-        float yb0 = apos1.y-apos0.y;
-        float zb0 = apos1.z-apos0.z;
-        float xc0 = apos2.x-apos0.x;
-        float yc0 = apos2.y-apos0.y;
-        float zc0 = apos2.z-apos0.z;
-
-        float invTotalMass = 1.0f/(m0+m1+m2);
-        float xcom = (xp0.x*m0 + (xb0+xp1.x)*m1 + (xc0+xp2.x)*m2) * invTotalMass;
-        float ycom = (xp0.y*m0 + (yb0+xp1.y)*m1 + (yc0+xp2.y)*m2) * invTotalMass;
-        float zcom = (xp0.z*m0 + (zb0+xp1.z)*m1 + (zc0+xp2.z)*m2) * invTotalMass;
-
-        float xa1 = xp0.x - xcom;
-        float ya1 = xp0.y - ycom;
-        float za1 = xp0.z - zcom;
-        float xb1 = xb0 + xp1.x - xcom;
-        float yb1 = yb0 + xp1.y - ycom;
-        float zb1 = zb0 + xp1.z - zcom;
-        float xc1 = xc0 + xp2.x - xcom;
-        float yc1 = yc0 + xp2.y - ycom;
-        float zc1 = zc0 + xp2.z - zcom;
-
-        float xaksZd = yb0*zc0 - zb0*yc0;
-        float yaksZd = zb0*xc0 - xb0*zc0;
-        float zaksZd = xb0*yc0 - yb0*xc0;
-        float xaksXd = ya1*zaksZd - za1*yaksZd;
-        float yaksXd = za1*xaksZd - xa1*zaksZd;
-        float zaksXd = xa1*yaksZd - ya1*xaksZd;
-        float xaksYd = yaksZd*zaksXd - zaksZd*yaksXd;
-        float yaksYd = zaksZd*xaksXd - xaksZd*zaksXd;
-        float zaksYd = xaksZd*yaksXd - yaksZd*xaksXd;
-
-        float axlng = sqrt(xaksXd*xaksXd + yaksXd*yaksXd + zaksXd*zaksXd);
-        float aylng = sqrt(xaksYd*xaksYd + yaksYd*yaksYd + zaksYd*zaksYd);
-        float azlng = sqrt(xaksZd*xaksZd + yaksZd*yaksZd + zaksZd*zaksZd);
-        float trns11 = xaksXd / axlng;
-        float trns21 = yaksXd / axlng;
-        float trns31 = zaksXd / axlng;
-        float trns12 = xaksYd / aylng;
-        float trns22 = yaksYd / aylng;
-        float trns32 = zaksYd / aylng;
-        float trns13 = xaksZd / azlng;
-        float trns23 = yaksZd / azlng;
-        float trns33 = zaksZd / azlng;
-
-        float xb0d = trns11*xb0 + trns21*yb0 + trns31*zb0;
-        float yb0d = trns12*xb0 + trns22*yb0 + trns32*zb0;
-        float xc0d = trns11*xc0 + trns21*yc0 + trns31*zc0;
-        float yc0d = trns12*xc0 + trns22*yc0 + trns32*zc0;
-        float za1d = trns13*xa1 + trns23*ya1 + trns33*za1;
-        float xb1d = trns11*xb1 + trns21*yb1 + trns31*zb1;
-        float yb1d = trns12*xb1 + trns22*yb1 + trns32*zb1;
-        float zb1d = trns13*xb1 + trns23*yb1 + trns33*zb1;
-        float xc1d = trns11*xc1 + trns21*yc1 + trns31*zc1;
-        float yc1d = trns12*xc1 + trns22*yc1 + trns32*zc1;
-        float zc1d = trns13*xc1 + trns23*yc1 + trns33*zc1;
+        mixed xb0 = apos1.x-apos0.x;
+        mixed yb0 = apos1.y-apos0.y;
+        mixed zb0 = apos1.z-apos0.z;
+        mixed xc0 = apos2.x-apos0.x;
+        mixed yc0 = apos2.y-apos0.y;
+        mixed zc0 = apos2.z-apos0.z;
+
+        mixed invTotalMass = 1.0f/(m0+m1+m2);
+        mixed xcom = (xp0.x*m0 + (xb0+xp1.x)*m1 + (xc0+xp2.x)*m2) * invTotalMass;
+        mixed ycom = (xp0.y*m0 + (yb0+xp1.y)*m1 + (yc0+xp2.y)*m2) * invTotalMass;
+        mixed zcom = (xp0.z*m0 + (zb0+xp1.z)*m1 + (zc0+xp2.z)*m2) * invTotalMass;
+
+        mixed xa1 = xp0.x - xcom;
+        mixed ya1 = xp0.y - ycom;
+        mixed za1 = xp0.z - zcom;
+        mixed xb1 = xb0 + xp1.x - xcom;
+        mixed yb1 = yb0 + xp1.y - ycom;
+        mixed zb1 = zb0 + xp1.z - zcom;
+        mixed xc1 = xc0 + xp2.x - xcom;
+        mixed yc1 = yc0 + xp2.y - ycom;
+        mixed zc1 = zc0 + xp2.z - zcom;
+
+        mixed xaksZd = yb0*zc0 - zb0*yc0;
+        mixed yaksZd = zb0*xc0 - xb0*zc0;
+        mixed zaksZd = xb0*yc0 - yb0*xc0;
+        mixed xaksXd = ya1*zaksZd - za1*yaksZd;
+        mixed yaksXd = za1*xaksZd - xa1*zaksZd;
+        mixed zaksXd = xa1*yaksZd - ya1*xaksZd;
+        mixed xaksYd = yaksZd*zaksXd - zaksZd*yaksXd;
+        mixed yaksYd = zaksZd*xaksXd - xaksZd*zaksXd;
+        mixed zaksYd = xaksZd*yaksXd - yaksZd*xaksXd;
+
+        mixed axlng = sqrt(xaksXd*xaksXd + yaksXd*yaksXd + zaksXd*zaksXd);
+        mixed aylng = sqrt(xaksYd*xaksYd + yaksYd*yaksYd + zaksYd*zaksYd);
+        mixed azlng = sqrt(xaksZd*xaksZd + yaksZd*yaksZd + zaksZd*zaksZd);
+        mixed trns11 = xaksXd / axlng;
+        mixed trns21 = yaksXd / axlng;
+        mixed trns31 = zaksXd / axlng;
+        mixed trns12 = xaksYd / aylng;
+        mixed trns22 = yaksYd / aylng;
+        mixed trns32 = zaksYd / aylng;
+        mixed trns13 = xaksZd / azlng;
+        mixed trns23 = yaksZd / azlng;
+        mixed trns33 = zaksZd / azlng;
+
+        mixed xb0d = trns11*xb0 + trns21*yb0 + trns31*zb0;
+        mixed yb0d = trns12*xb0 + trns22*yb0 + trns32*zb0;
+        mixed xc0d = trns11*xc0 + trns21*yc0 + trns31*zc0;
+        mixed yc0d = trns12*xc0 + trns22*yc0 + trns32*zc0;
+        mixed za1d = trns13*xa1 + trns23*ya1 + trns33*za1;
+        mixed xb1d = trns11*xb1 + trns21*yb1 + trns31*zb1;
+        mixed yb1d = trns12*xb1 + trns22*yb1 + trns32*zb1;
+        mixed zb1d = trns13*xb1 + trns23*yb1 + trns33*zb1;
+        mixed xc1d = trns11*xc1 + trns21*yc1 + trns31*zc1;
+        mixed yc1d = trns12*xc1 + trns22*yc1 + trns32*zc1;
+        mixed zc1d = trns13*xc1 + trns23*yc1 + trns33*zc1;

        //                                        --- Step2  A2' ---

        float rc = 0.5*params.y;
-        float rb = sqrt(params.x*params.x-rc*rc);
-        float ra = rb*(m1+m2)*invTotalMass;
+        mixed rb = sqrt(params.x*params.x-rc*rc);
+        mixed ra = rb*(m1+m2)*invTotalMass;
        rb -= ra;
-        float sinphi = za1d / ra;
-        float cosphi = sqrt(1.0f - sinphi*sinphi);
-        float sinpsi = (zb1d - zc1d) / (2*rc*cosphi);
-        float cospsi = sqrt(1.0f - sinpsi*sinpsi);
-
-        float ya2d =   ra*cosphi;
-        float xb2d = - rc*cospsi;
-        float yb2d = - rb*cosphi - rc*sinpsi*sinphi;
-        float yc2d = - rb*cosphi + rc*sinpsi*sinphi;
-        float xb2d2 = xb2d*xb2d;
-        float hh2 = 4.0f*xb2d2 + (yb2d-yc2d)*(yb2d-yc2d) + (zb1d-zc1d)*(zb1d-zc1d);
-        float deltx = 2.0f*xb2d + sqrt(4.0f*xb2d2 - hh2 + params.y*params.y);
+        mixed sinphi = za1d / ra;
+        mixed cosphi = sqrt(1.0f - sinphi*sinphi);
+        mixed sinpsi = (zb1d - zc1d) / (2*rc*cosphi);
+        mixed cospsi = sqrt(1.0f - sinpsi*sinpsi);
+
+        mixed ya2d =   ra*cosphi;
+        mixed xb2d = - rc*cospsi;
+        mixed yb2d = - rb*cosphi - rc*sinpsi*sinphi;
+        mixed yc2d = - rb*cosphi + rc*sinpsi*sinphi;
+        mixed xb2d2 = xb2d*xb2d;
+        mixed hh2 = 4.0f*xb2d2 + (yb2d-yc2d)*(yb2d-yc2d) + (zb1d-zc1d)*(zb1d-zc1d);
+        mixed deltx = 2.0f*xb2d + sqrt(4.0f*xb2d2 - hh2 + params.y*params.y);
        xb2d -= deltx*0.5;

        //                                        --- Step3  al,be,ga ---

-        float alpha = (xb2d*(xb0d-xc0d) + yb0d*yb2d + yc0d*yc2d);
-        float beta = (xb2d*(yc0d-yb0d) + xb0d*yb2d + xc0d*yc2d);
-        float gamma = xb0d*yb1d - xb1d*yb0d + xc0d*yc1d - xc1d*yc0d;
+        mixed alpha = (xb2d*(xb0d-xc0d) + yb0d*yb2d + yc0d*yc2d);
+        mixed beta = (xb2d*(yc0d-yb0d) + xb0d*yb2d + xc0d*yc2d);
+        mixed gamma = xb0d*yb1d - xb1d*yb0d + xc0d*yc1d - xc1d*yc0d;

-        float al2be2 = alpha*alpha + beta*beta;
-        float sintheta = (alpha*gamma - beta*sqrt(al2be2 - gamma*gamma)) / al2be2;
+        mixed al2be2 = alpha*alpha + beta*beta;
+        mixed sintheta = (alpha*gamma - beta*sqrt(al2be2 - gamma*gamma)) / al2be2;

        //                                        --- Step4  A3' ---

-        float costheta = sqrt(1.0f - sintheta*sintheta);
-        float xa3d = - ya2d*sintheta;
-        float ya3d =   ya2d*costheta;
-        float za3d = za1d;
-        float xb3d =   xb2d*costheta - yb2d*sintheta;
-        float yb3d =   xb2d*sintheta + yb2d*costheta;
-        float zb3d = zb1d;
-        float xc3d = - xb2d*costheta - yc2d*sintheta;
-        float yc3d = - xb2d*sintheta + yc2d*costheta;
-        float zc3d = zc1d;
+        mixed costheta = sqrt(1.0f - sintheta*sintheta);
+        mixed xa3d = - ya2d*sintheta;
+        mixed ya3d =   ya2d*costheta;
+        mixed za3d = za1d;
+        mixed xb3d =   xb2d*costheta - yb2d*sintheta;
+        mixed yb3d =   xb2d*sintheta + yb2d*costheta;
+        mixed zb3d = zb1d;
+        mixed xc3d = - xb2d*costheta - yc2d*sintheta;
+        mixed yc3d = - xb2d*sintheta + yc2d*costheta;
+        mixed zc3d = zc1d;

        //                                        --- Step5  A3 ---

-        float xa3 = trns11*xa3d + trns12*ya3d + trns13*za3d;
-        float ya3 = trns21*xa3d + trns22*ya3d + trns23*za3d;
-        float za3 = trns31*xa3d + trns32*ya3d + trns33*za3d;
-        float xb3 = trns11*xb3d + trns12*yb3d + trns13*zb3d;
-        float yb3 = trns21*xb3d + trns22*yb3d + trns23*zb3d;
-        float zb3 = trns31*xb3d + trns32*yb3d + trns33*zb3d;
-        float xc3 = trns11*xc3d + trns12*yc3d + trns13*zc3d;
-        float yc3 = trns21*xc3d + trns22*yc3d + trns23*zc3d;
-        float zc3 = trns31*xc3d + trns32*yc3d + trns33*zc3d;
+        mixed xa3 = trns11*xa3d + trns12*ya3d + trns13*za3d;
+        mixed ya3 = trns21*xa3d + trns22*ya3d + trns23*za3d;
+        mixed za3 = trns31*xa3d + trns32*ya3d + trns33*za3d;
+        mixed xb3 = trns11*xb3d + trns12*yb3d + trns13*zb3d;
+        mixed yb3 = trns21*xb3d + trns22*yb3d + trns23*zb3d;
+        mixed zb3 = trns31*xb3d + trns32*yb3d + trns33*zb3d;
+        mixed xc3 = trns11*xc3d + trns12*yc3d + trns13*zc3d;
+        mixed yc3 = trns21*xc3d + trns22*yc3d + trns23*zc3d;
+        mixed zc3 = trns31*xc3d + trns32*yc3d + trns33*zc3d;

        xp0.x = xcom + xa3;
        xp0.y = ycom + ya3;
@@ -155,49 +165,49 @@ __kernel void applySettle(int numClusters, float tol, __global const float4* res
 * Enforce velocity constraints on SETTLE clusters
 */

-__kernel void constrainVelocities(int numClusters, float tol, __global const float4* restrict oldPos, __global float4* restrict posDelta, __global float4* restrict velm, __global const int4* restrict clusterAtoms, __global const float2* restrict clusterParams) {
+__kernel void constrainVelocities(int numClusters, mixed tol, __global const real4* restrict oldPos, __global const real4* restrict posCorrection, __global mixed4* restrict posDelta, __global mixed4* restrict velm, __global const int4* restrict clusterAtoms, __global const float2* restrict clusterParams) {
    for (int index = get_global_id(0); index < numClusters; index += get_global_size(0)) {
        // Load the data for this cluster.

        int4 atoms = clusterAtoms[index];
-        float4 apos0 = oldPos[atoms.x];
-        float4 apos1 = oldPos[atoms.y];
-        float4 apos2 = oldPos[atoms.z];
-        float4 v0 = velm[atoms.x];
-        float4 v1 = velm[atoms.y];
-        float4 v2 = velm[atoms.z];
+        mixed4 apos0 = loadPos(oldPos, posCorrection, atoms.x);
+        mixed4 apos1 = loadPos(oldPos, posCorrection, atoms.y);
+        mixed4 apos2 = loadPos(oldPos, posCorrection, atoms.z);
+        mixed4 v0 = velm[atoms.x];
+        mixed4 v1 = velm[atoms.y];
+        mixed4 v2 = velm[atoms.z];
        
        // Compute intermediate quantities: the atom masses, the bond directions, the relative velocities,
        // and the angle cosines and sines.
        
-        float mA = RECIP(v0.w);
-        float mB = RECIP(v1.w);
-        float mC = RECIP(v2.w);
-        float4 eAB = apos1-apos0;
-        float4 eBC = apos2-apos1;
-        float4 eCA = apos0-apos2;
-        eAB.xyz /= SQRT(eAB.x*eAB.x + eAB.y*eAB.y + eAB.z*eAB.z);
-        eBC.xyz /= SQRT(eBC.x*eBC.x + eBC.y*eBC.y + eBC.z*eBC.z);
-        eCA.xyz /= SQRT(eCA.x*eCA.x + eCA.y*eCA.y + eCA.z*eCA.z);
-        float vAB = (v1.x-v0.x)*eAB.x + (v1.y-v0.y)*eAB.y + (v1.z-v0.z)*eAB.z;
-        float vBC = (v2.x-v1.x)*eBC.x + (v2.y-v1.y)*eBC.y + (v2.z-v1.z)*eBC.z;
-        float vCA = (v0.x-v2.x)*eCA.x + (v0.y-v2.y)*eCA.y + (v0.z-v2.z)*eCA.z;
-        float cA = -(eAB.x*eCA.x + eAB.y*eCA.y + eAB.z*eCA.z);
-        float cB = -(eAB.x*eBC.x + eAB.y*eBC.y + eAB.z*eBC.z);
-        float cC = -(eBC.x*eCA.x + eBC.y*eCA.y + eBC.z*eCA.z);
-        float s2A = 1-cA*cA;
-        float s2B = 1-cB*cB;
-        float s2C = 1-cC*cC;
+        mixed mA = 1/v0.w;
+        mixed mB = 1/v1.w;
+        mixed mC = 1/v2.w;
+        mixed4 eAB = apos1-apos0;
+        mixed4 eBC = apos2-apos1;
+        mixed4 eCA = apos0-apos2;
+        eAB.xyz /= sqrt(eAB.x*eAB.x + eAB.y*eAB.y + eAB.z*eAB.z);
+        eBC.xyz /= sqrt(eBC.x*eBC.x + eBC.y*eBC.y + eBC.z*eBC.z);
+        eCA.xyz /= sqrt(eCA.x*eCA.x + eCA.y*eCA.y + eCA.z*eCA.z);
+        mixed vAB = (v1.x-v0.x)*eAB.x + (v1.y-v0.y)*eAB.y + (v1.z-v0.z)*eAB.z;
+        mixed vBC = (v2.x-v1.x)*eBC.x + (v2.y-v1.y)*eBC.y + (v2.z-v1.z)*eBC.z;
+        mixed vCA = (v0.x-v2.x)*eCA.x + (v0.y-v2.y)*eCA.y + (v0.z-v2.z)*eCA.z;
+        mixed cA = -(eAB.x*eCA.x + eAB.y*eCA.y + eAB.z*eCA.z);
+        mixed cB = -(eAB.x*eBC.x + eAB.y*eBC.y + eAB.z*eBC.z);
+        mixed cC = -(eBC.x*eCA.x + eBC.y*eCA.y + eBC.z*eCA.z);
+        mixed s2A = 1-cA*cA;
+        mixed s2B = 1-cB*cB;
+        mixed s2C = 1-cC*cC;
        
        // Solve the equations.  These are different from those in the SETTLE paper (JCC 13(8), pp. 952-962, 1992), because
        // in going from equations B1 to B2, they make the assumption that mB=mC (but don't bother to mention they're
        // making that assumption).  We allow all three atoms to have different masses.
        
-        float mABCinv = RECIP(mA*mB*mC);
-        float denom = (((s2A*mB+s2B*mA)*mC+(s2A*mB*mB+2*(cA*cB*cC+1)*mA*mB+s2B*mA*mA))*mC+s2C*mA*mB*(mA+mB))*mABCinv;
-        float tab = ((cB*cC*mA-cA*mB-cA*mC)*vCA + (cA*cC*mB-cB*mC-cB*mA)*vBC + (s2C*mA*mA*mB*mB*mABCinv+(mA+mB+mC))*vAB)/denom;
-        float tbc = ((cA*cB*mC-cC*mB-cC*mA)*vCA + (s2A*mB*mB*mC*mC*mABCinv+(mA+mB+mC))*vBC + (cA*cC*mB-cB*mA-cB*mC)*vAB)/denom;
-        float tca = ((s2B*mA*mA*mC*mC*mABCinv+(mA+mB+mC))*vCA + (cA*cB*mC-cC*mB-cC*mA)*vBC + (cB*cC*mA-cA*mB-cA*mC)*vAB)/denom;
+        mixed mABCinv = 1/(mA*mB*mC);
+        mixed denom = (((s2A*mB+s2B*mA)*mC+(s2A*mB*mB+2*(cA*cB*cC+1)*mA*mB+s2B*mA*mA))*mC+s2C*mA*mB*(mA+mB))*mABCinv;
+        mixed tab = ((cB*cC*mA-cA*mB-cA*mC)*vCA + (cA*cC*mB-cB*mC-cB*mA)*vBC + (s2C*mA*mA*mB*mB*mABCinv+(mA+mB+mC))*vAB)/denom;
+        mixed tbc = ((cA*cB*mC-cC*mB-cC*mA)*vCA + (s2A*mB*mB*mC*mC*mABCinv+(mA+mB+mC))*vBC + (cA*cC*mB-cB*mA-cB*mC)*vAB)/denom;
+        mixed tca = ((s2B*mA*mA*mC*mC*mABCinv+(mA+mB+mC))*vCA + (cA*cB*mC-cC*mB-cC*mA)*vBC + (cB*cC*mA-cA*mB-cA*mC)*vAB)/denom;
        v0.xyz += (tab*eAB.xyz - tca*eCA.xyz)*v0.w;
        v1.xyz += (tbc*eBC.xyz - tab*eAB.xyz)*v1.w;
        v2.xyz += (tca*eCA.xyz - tbc*eBC.xyz)*v2.w;