Continuing to implement double precision in OpenCL

c8dac206 · Peter Eastman · 34938e2c · c8dac206 · c8dac206 · c8dac206
Commit c8dac206 authored Oct 17, 2012 by Peter Eastman
20 changed files
--- a/platforms/opencl/src/OpenCLBondedUtilities.cpp
+++ b/platforms/opencl/src/OpenCLBondedUtilities.cpp
@@ -58,7 +58,7 @@ void OpenCLBondedUtilities::addInteraction(const vector<vector<int> >& atoms, co
 std::string OpenCLBondedUtilities::addArgument(cl::Memory& data, const string& type) {
    arguments.push_back(&data);
    argTypes.push_back(type);
-    return "customArg"+OpenCLExpressionUtilities::intToString(arguments.size());
+    return "customArg"+context.intToString(arguments.size());
 }

 void OpenCLBondedUtilities::addPrefixCode(const string& source) {
@@ -164,17 +164,17 @@ void OpenCLBondedUtilities::initialize(const System& system) {
        stringstream s;
        for (int i = 0; i < (int) prefixCode.size(); i++)
            s<<prefixCode[i];
-        s<<"__kernel void computeBondedForces(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __global const float4* restrict posq, int groups";
+        s<<"__kernel void computeBondedForces(__global real4* restrict forceBuffers, __global real* restrict energyBuffer, __global const real4* restrict posq, int groups";
        for (int i = 0; i < setSize; i++) {
            int force = set[i];
-            string indexType = "uint"+(indexWidth[force] == 1 ? "" : OpenCLExpressionUtilities::intToString(indexWidth[force]));
+            string indexType = "uint"+(indexWidth[force] == 1 ? "" : context.intToString(indexWidth[force]));
            s<<", __global const "<<indexType<<"* restrict atomIndices"<<i;
            s<<", __global const "<<indexType<<"* restrict bufferIndices"<<i;
        }
        for (int i = 0; i < (int) arguments.size(); i++)
            s<<", __global "<<argTypes[i]<<"* customArg"<<(i+1);
        s<<") {\n";
-        s<<"float energy = 0.0f;\n";
+        s<<"real energy = 0.0f;\n";
        for (int i = 0; i < setSize; i++) {
            int force = set[i];
            s<<createForceSource(i, forceAtoms[force].size(), forceAtoms[force][0].size(), forceGroup[force], forceSource[force]);
@@ -182,7 +182,7 @@ void OpenCLBondedUtilities::initialize(const System& system) {
        s<<"energyBuffer[get_global_id(0)] += energy;\n";
        s<<"}\n";
        map<string, string> defines;
-        defines["PADDED_NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getPaddedNumAtoms());
+        defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
        cl::Program program = context.createProgram(s.str(), defines);
        kernels.push_back(cl::Kernel(program, "computeBondedForces"));
    }
@@ -206,7 +206,7 @@ string OpenCLBondedUtilities::createForceSource(int forceIndex, int numBonds, in
        suffix = suffix4;
    else
        suffix = suffix16;
-    string indexType = "uint"+(width == 1 ? "" : OpenCLExpressionUtilities::intToString(width));
+    string indexType = "uint"+(width == 1 ? "" : context.intToString(width));
    stringstream s;
    s<<"if ((groups&"<<(1<<group)<<") != 0)\n";
    s<<"for (unsigned int index = get_global_id(0); index < "<<numBonds<<"; index += get_global_size(0)) {\n";
@@ -214,13 +214,13 @@ string OpenCLBondedUtilities::createForceSource(int forceIndex, int numBonds, in
    s<<"    "<<indexType<<" buffers = bufferIndices"<<forceIndex<<"[index];\n";
    for (int i = 0; i < numAtoms; i++) {
        s<<"    unsigned int atom"<<(i+1)<<" = atoms"<<suffix[i]<<";\n";
-        s<<"    float4 pos"<<(i+1)<<" = posq[atom"<<(i+1)<<"];\n";
+        s<<"    real4 pos"<<(i+1)<<" = posq[atom"<<(i+1)<<"];\n";
    }
    s<<computeForce<<"\n";
    for (int i = 0; i < numAtoms; i++) {
        s<<"    {\n";
        s<<"    unsigned int offset = atom"<<(i+1)<<"+buffers"<<suffix[i]<<"*PADDED_NUM_ATOMS;\n";
-        s<<"    float4 force = forceBuffers[offset];\n";
+        s<<"    real4 force = forceBuffers[offset];\n";
        s<<"    force.xyz += force"<<(i+1)<<".xyz;\n";
        s<<"    forceBuffers[offset] = force;\n";
        s<<"    }\n";

--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -68,7 +68,7 @@ static void CL_CALLBACK errorCallback(const char* errinfo, const void* private_i
 OpenCLContext::OpenCLContext(const System& system, int platformIndex, int deviceIndex, const string& precision, OpenCLPlatform::PlatformData& platformData) :
        system(system), time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), atomsWereReordered(false), posq(NULL),
        posqCorrection(NULL), velm(NULL), forceBuffers(NULL), longForceBuffer(NULL), energyBuffer(NULL), atomIndexDevice(NULL), integration(NULL),
-        bonded(NULL), nonbonded(NULL), thread(NULL) {
+        expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
    if (precision == "single") {
        useDoublePrecision = false;
        useMixedPrecision = false;
@@ -145,7 +145,7 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
        this->deviceIndex = deviceIndex;
        if (device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>() < minThreadBlockSize)
            throw OpenMMException("The specified OpenCL device is not compatible with OpenMM");
-        compilationDefines["WORK_GROUP_SIZE"] = OpenCLExpressionUtilities::intToString(ThreadBlockSize);
+        compilationDefines["WORK_GROUP_SIZE"] = intToString(ThreadBlockSize);
        if (platformVendor.size() >= 5 && platformVendor.substr(0, 5) == "Intel")
 			defaultOptimizationOptions = "";
 		else
@@ -269,7 +269,7 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
    clearFourBuffersKernel = cl::Kernel(utilities, "clearFourBuffers");
    clearFiveBuffersKernel = cl::Kernel(utilities, "clearFiveBuffers");
    clearSixBuffersKernel = cl::Kernel(utilities, "clearSixBuffers");
-    reduceFloat4Kernel = cl::Kernel(utilities, "reduceFloat4Buffer");
+    reduceReal4Kernel = cl::Kernel(utilities, "reduceReal4Buffer");
    reduceForcesKernel = cl::Kernel(utilities, "reduceForces");

    // Decide whether native_sqrt(), native_rsqrt(), and native_recip() are sufficiently accurate to use.
@@ -316,9 +316,10 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
    
    thread = new WorkThread();
    
-    // Create the integration utilities object.
+    // Create utilities objects.
    
    integration = new OpenCLIntegrationUtilities(*this, system);
+    expression = new OpenCLExpressionUtilities(*this);
 }

 OpenCLContext::~OpenCLContext() {
@@ -346,6 +347,8 @@ OpenCLContext::~OpenCLContext() {
        delete atomIndexDevice;
    if (integration != NULL)
        delete integration;
+    if (expression != NULL)
+        delete expression;
    if (bonded != NULL)
        delete bonded;
    if (nonbonded != NULL)
@@ -376,10 +379,10 @@ void OpenCLContext::initialize() {
        reduceForcesKernel.setArg<cl::Buffer>(1, forceBuffers->getDeviceBuffer());
        reduceForcesKernel.setArg<cl_int>(2, paddedNumAtoms);
        reduceForcesKernel.setArg<cl_int>(3, numForceBuffers);
-        addAutoclearBuffer(longForceBuffer->getDeviceBuffer(), longForceBuffer->getSize()*2);
+        addAutoclearBuffer(*longForceBuffer);
    }
-    addAutoclearBuffer(forceBuffers->getDeviceBuffer(), forceBuffers->getSize()*4);
-    addAutoclearBuffer(energyBuffer->getDeviceBuffer(), energyBuffer->getSize());
+    addAutoclearBuffer(*forceBuffers);
+    addAutoclearBuffer(*energyBuffer);
    int bufferBytes = max(posq->getSize()*posq->getElementSize(), energyBuffer->getSize()*energyBuffer->getElementSize());
    pinnedBuffer = new cl::Buffer(context, CL_MEM_ALLOC_HOST_PTR, bufferBytes);
    pinnedMemory = queue.enqueueMapBuffer(*pinnedBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, bufferBytes);
@@ -479,6 +482,21 @@ cl::Program OpenCLContext::createProgram(const string source, const map<string,
    return program;
 }

+string OpenCLContext::doubleToString(double value) {
+    stringstream s;
+    s.precision(useDoublePrecision ? 16 : 8);
+    s << scientific << value;
+    if (!useDoublePrecision)
+        s << "f";
+    return s.str();
+}
+
+string OpenCLContext::intToString(int value) {
+    stringstream s;
+    s << value;
+    return s.str();
+}
+
 void OpenCLContext::executeKernel(cl::Kernel& kernel, int workUnits, int blockSize) {
    if (blockSize == -1)
        blockSize = ThreadBlockSize;
@@ -494,18 +512,23 @@ void OpenCLContext::executeKernel(cl::Kernel& kernel, int workUnits, int blockSi
 }

 void OpenCLContext::clearBuffer(OpenCLArray& array) {
-    clearBuffer(array.getDeviceBuffer(), array.getSize()*array.getElementSize()/sizeof(cl_float));
+    clearBuffer(array.getDeviceBuffer(), array.getSize()*array.getElementSize());
 }

 void OpenCLContext::clearBuffer(cl::Memory& memory, int size) {
+    int words = size/4;
    clearBufferKernel.setArg<cl::Memory>(0, memory);
-    clearBufferKernel.setArg<cl_int>(1, size);
-    executeKernel(clearBufferKernel, size, 128);
+    clearBufferKernel.setArg<cl_int>(1, words);
+    executeKernel(clearBufferKernel, words, 128);
+}
+
+void OpenCLContext::addAutoclearBuffer(OpenCLArray& array) {
+    addAutoclearBuffer(array.getDeviceBuffer(), array.getSize()*array.getElementSize());
 }

 void OpenCLContext::addAutoclearBuffer(cl::Memory& memory, int size) {
    autoclearBuffers.push_back(&memory);
-    autoclearBufferSizes.push_back(size);
+    autoclearBufferSizes.push_back(size/4);
 }

 void OpenCLContext::clearAutoclearBuffers() {
@@ -581,10 +604,10 @@ void OpenCLContext::reduceForces() {

 void OpenCLContext::reduceBuffer(OpenCLArray& array, int numBuffers) {
    int bufferSize = array.getSize()/numBuffers;
-    reduceFloat4Kernel.setArg<cl::Buffer>(0, array.getDeviceBuffer());
-    reduceFloat4Kernel.setArg<cl_int>(1, bufferSize);
-    reduceFloat4Kernel.setArg<cl_int>(2, numBuffers);
-    executeKernel(reduceFloat4Kernel, bufferSize, 128);
+    reduceReal4Kernel.setArg<cl::Buffer>(0, array.getDeviceBuffer());
+    reduceReal4Kernel.setArg<cl_int>(1, bufferSize);
+    reduceReal4Kernel.setArg<cl_int>(2, numBuffers);
+    executeKernel(reduceReal4Kernel, bufferSize, 128);
 }

 void OpenCLContext::tagAtomsInMolecule(int atom, int molecule, vector<int>& atomMolecule, vector<vector<int> >& atomBonds) {

--- a/platforms/opencl/src/OpenCLContext.h
+++ b/platforms/opencl/src/OpenCLContext.h
@@ -45,6 +45,7 @@ namespace OpenMM {
 class OpenCLArray;
 class OpenCLForceInfo;
 class OpenCLIntegrationUtilities;
+class OpenCLExpressionUtilities;
 class OpenCLBondedUtilities;
 class OpenCLNonbondedUtilities;
 class System;
@@ -314,14 +315,18 @@ public:
     * Set all elements of an array to 0.
     *
     * @param memory     the Memory to clear
-     * @param size       the number of float elements in the buffer
+     * @param size       the size of the buffer in bytes
     */
    void clearBuffer(cl::Memory& memory, int size);
+    /**
+     * Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
+     */
+    void addAutoclearBuffer(OpenCLArray& array);
    /**
     * Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
     *
     * @param memory     the Memory to clear
-     * @param size       the number of float elements in the buffer
+     * @param size       the size of the buffer in bytes
     */
    void addAutoclearBuffer(cl::Memory& memory, int size);
    /**
@@ -329,7 +334,7 @@ public:
     */
    void clearAutoclearBuffers();
    /**
-     * Given a collection of buffers packed into an array, sum them and store
+     * Given a collection of floating point buffers packed into an array, sum them and store
     * the sum in the first buffer.
     *
     * @param array       the array containing the buffers to reduce
@@ -437,6 +442,15 @@ public:
    bool getUseMixedPrecision() {
        return useMixedPrecision;
    }
+    /**
+     * Convert a number to a string in a format suitable for including in a kernel.
+     * This takes into account whether the context uses single or double precision.
+     */
+    std::string doubleToString(double value);
+    /**
+     * Convert a number to a string in a format suitable for including in a kernel.
+     */
+    std::string intToString(int value);
    /**
     * Get the size of the periodic box.
     */
@@ -476,6 +490,12 @@ public:
    OpenCLIntegrationUtilities& getIntegrationUtilities() {
        return *integration;
    }
+    /**
+     * Get the OpenCLExpressionUtilities for this context.
+     */
+    OpenCLExpressionUtilities& getExpressionUtilities() {
+        return *expression;
+    }
    /**
     * Get the OpenCLBondedUtilities for this context.
     */
@@ -580,7 +600,7 @@ private:
    cl::Kernel clearFourBuffersKernel;
    cl::Kernel clearFiveBuffersKernel;
    cl::Kernel clearSixBuffersKernel;
-    cl::Kernel reduceFloat4Kernel;
+    cl::Kernel reduceReal4Kernel;
    cl::Kernel reduceForcesKernel;
    std::vector<OpenCLForceInfo*> forces;
    std::vector<Molecule> molecules;
@@ -601,6 +621,7 @@ private:
    std::vector<int> autoclearBufferSizes;
    std::vector<ReorderListener*> reorderListeners;
    OpenCLIntegrationUtilities* integration;
+    OpenCLExpressionUtilities* expression;
    OpenCLBondedUtilities* bonded;
    OpenCLNonbondedUtilities* nonbonded;
    WorkThread* thread;

--- a/platforms/opencl/src/OpenCLExpressionUtilities.cpp
+++ b/platforms/opencl/src/OpenCLExpressionUtilities.cpp
@@ -33,19 +33,6 @@ using namespace OpenMM;
 using namespace Lepton;
 using namespace std;

-string OpenCLExpressionUtilities::doubleToString(double value) {
-    stringstream s;
-    s.precision(8);
-    s << scientific << value << "f";
-    return s.str();
-}
-
-string OpenCLExpressionUtilities::intToString(int value) {
-    stringstream s;
-    s << value;
-    return s.str();
-}
-
 string OpenCLExpressionUtilities::createExpressions(const map<string, ParsedExpression>& expressions, const map<string, string>& variables,
        const vector<pair<string, string> >& functions, const string& prefix, const string& functionParams, const string& tempType) {
    vector<pair<ExpressionTreeNode, string> > variableNodes;
@@ -75,13 +62,13 @@ void OpenCLExpressionUtilities::processExpression(stringstream& out, const Expre
            return;
    for (int i = 0; i < (int) node.getChildren().size(); i++)
        processExpression(out, node.getChildren()[i], temps, functions, prefix, functionParams, allExpressions, tempType);
-    string name = prefix+intToString(temps.size());
+    string name = prefix+context.intToString(temps.size());
    bool hasRecordedNode = false;
    
    out << tempType << " " << name << " = ";
    switch (node.getOperation().getId()) {
        case Operation::CONSTANT:
-            out << doubleToString(dynamic_cast<const Operation::Constant*>(&node.getOperation())->getValue());
+            out << context.doubleToString(dynamic_cast<const Operation::Constant*>(&node.getOperation())->getValue());
            break;
        case Operation::VARIABLE:
            throw OpenMMException("Unknown variable in expression: "+node.getOperation().getName());
@@ -107,7 +94,7 @@ void OpenCLExpressionUtilities::processExpression(stringstream& out, const Expre
            string valueName = name;
            string derivName = name;
            if (valueNode != NULL && derivNode != NULL) {
-                string name2 = prefix+intToString(temps.size());
+                string name2 = prefix+context.intToString(temps.size());
                out << tempType << " " << name2 << " = 0.0f;\n";
                if (isDeriv) {
                    valueName = name2;
@@ -236,10 +223,10 @@ void OpenCLExpressionUtilities::processExpression(stringstream& out, const Expre
            out << "RECIP(" << getTempName(node.getChildren()[0], temps) << ")";
            break;
        case Operation::ADD_CONSTANT:
-            out << doubleToString(dynamic_cast<const Operation::AddConstant*>(&node.getOperation())->getValue()) << "+" << getTempName(node.getChildren()[0], temps);
+            out << context.doubleToString(dynamic_cast<const Operation::AddConstant*>(&node.getOperation())->getValue()) << "+" << getTempName(node.getChildren()[0], temps);
            break;
        case Operation::MULTIPLY_CONSTANT:
-            out << doubleToString(dynamic_cast<const Operation::MultiplyConstant*>(&node.getOperation())->getValue()) << "*" << getTempName(node.getChildren()[0], temps);
+            out << context.doubleToString(dynamic_cast<const Operation::MultiplyConstant*>(&node.getOperation())->getValue()) << "*" << getTempName(node.getChildren()[0], temps);
            break;
        case Operation::POWER_CONSTANT:
        {
@@ -266,7 +253,7 @@ void OpenCLExpressionUtilities::processExpression(stringstream& out, const Expre
                for (map<int, const ExpressionTreeNode*>::const_iterator iter = powers.begin(); iter != powers.end(); ++iter) {
                    if (iter->first != exponent) {
                        exponents.push_back(iter->first >= 0 ? iter->first : -iter->first);
-                        string name2 = prefix+intToString(temps.size());
+                        string name2 = prefix+context.intToString(temps.size());
                        names.push_back(name2);
                        temps.push_back(make_pair(*iter->second, name2));
                        out << tempType << " " << name2 << " = 0.0f;\n";
@@ -295,7 +282,7 @@ void OpenCLExpressionUtilities::processExpression(stringstream& out, const Expre
                out << "}";
            }
            else
-                out << "pow(" << getTempName(node.getChildren()[0], temps) << ", " << doubleToString(exponent) << ")";
+                out << "pow(" << getTempName(node.getChildren()[0], temps) << ", " << context.doubleToString(exponent) << ")";
            break;
        }
        case Operation::MIN:

--- a/platforms/opencl/src/OpenCLExpressionUtilities.h
+++ b/platforms/opencl/src/OpenCLExpressionUtilities.h
@@ -45,6 +45,8 @@ namespace OpenMM {

 class OPENMM_EXPORT OpenCLExpressionUtilities {
 public:
+    OpenCLExpressionUtilities(OpenCLContext& context) : context(context) {
+    }
    /**
     * Generate the source code for calculating a set of expressions.
     *
@@ -54,10 +56,10 @@ public:
     * @param functions      defines the variable name for each tabulated function that may appear in the expressions
     * @param prefix         a prefix to put in front of temporary variables
     * @param functionParams the variable name containing the parameters for each tabulated function
-     * @param tempType       the type of value to use for temporary variables (defaults to "float")
+     * @param tempType       the type of value to use for temporary variables (defaults to "real")
     */
-    static std::string createExpressions(const std::map<std::string, Lepton::ParsedExpression>& expressions, const std::map<std::string, std::string>& variables,
-            const std::vector<std::pair<std::string, std::string> >& functions, const std::string& prefix, const std::string& functionParams, const std::string& tempType="float");
+    std::string createExpressions(const std::map<std::string, Lepton::ParsedExpression>& expressions, const std::map<std::string, std::string>& variables,
+            const std::vector<std::pair<std::string, std::string> >& functions, const std::string& prefix, const std::string& functionParams, const std::string& tempType="real");
    /**
     * Generate the source code for calculating a set of expressions.
     *
@@ -69,7 +71,7 @@ public:
     * @param functionParams the variable name containing the parameters for each tabulated function
     * @param tempType       the type of value to use for temporary variables (defaults to "float")
     */
-    static std::string createExpressions(const std::map<std::string, Lepton::ParsedExpression>& expressions, const std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& variables,
+    std::string createExpressions(const std::map<std::string, Lepton::ParsedExpression>& expressions, const std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& variables,
            const std::vector<std::pair<std::string, std::string> >& functions, const std::string& prefix, const std::string& functionParams, const std::string& tempType="float");
    /**
     * Calculate the spline coefficients for a tabulated function that appears in expressions.
@@ -79,26 +81,19 @@ public:
     * @param max            the value of the independent variable corresponding to the last element of values
     * @return the spline coefficients
     */
-    static std::vector<mm_float4> computeFunctionCoefficients(const std::vector<double>& values, double min, double max);
-    /**
-     * Convert a number to a string in a format suitable for including in a kernel.
-     */
-    static std::string doubleToString(double value);
-    /**
-     * Convert a number to a string in a format suitable for including in a kernel.
-     */
-    static std::string intToString(int value);
+    std::vector<mm_float4> computeFunctionCoefficients(const std::vector<double>& values, double min, double max);
    class FunctionPlaceholder;
 private:
-    static void processExpression(std::stringstream& out, const Lepton::ExpressionTreeNode& node,
+    void processExpression(std::stringstream& out, const Lepton::ExpressionTreeNode& node,
            std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& temps,
            const std::vector<std::pair<std::string, std::string> >& functions, const std::string& prefix, const std::string& functionParams,
            const std::vector<Lepton::ParsedExpression>& allExpressions, const std::string& tempType);
-    static std::string getTempName(const Lepton::ExpressionTreeNode& node, const std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& temps);
-    static void findRelatedTabulatedFunctions(const Lepton::ExpressionTreeNode& node, const Lepton::ExpressionTreeNode& searchNode,
+    std::string getTempName(const Lepton::ExpressionTreeNode& node, const std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& temps);
+    void findRelatedTabulatedFunctions(const Lepton::ExpressionTreeNode& node, const Lepton::ExpressionTreeNode& searchNode,
            const Lepton::ExpressionTreeNode*& valueNode, const Lepton::ExpressionTreeNode*& derivNode);
-    static void findRelatedPowers(const Lepton::ExpressionTreeNode& node, const Lepton::ExpressionTreeNode& searchNode,
+    void findRelatedPowers(const Lepton::ExpressionTreeNode& node, const Lepton::ExpressionTreeNode& searchNode,
            std::map<int, const Lepton::ExpressionTreeNode*>& powers);
+    OpenCLContext& context;
 };

 /**

--- a/platforms/opencl/src/OpenCLFFT3D.cpp
+++ b/platforms/opencl/src/OpenCLFFT3D.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2011 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2012 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -47,15 +47,15 @@ void OpenCLFFT3D::execFFT(OpenCLArray& in, OpenCLArray& out, bool forward) {
        maxSize = 1;
    zkernel.setArg<cl::Buffer>(0, in.getDeviceBuffer());
    zkernel.setArg<cl::Buffer>(1, out.getDeviceBuffer());
-    zkernel.setArg<cl_float>(2, forward ? 1.0f : -1.0f);
+    zkernel.setArg<cl_int>(2, forward ? 1 : -1);
    context.executeKernel(zkernel, xsize*ysize*zsize, min(zsize, (int) maxSize));
    xkernel.setArg<cl::Buffer>(0, out.getDeviceBuffer());
    xkernel.setArg<cl::Buffer>(1, in.getDeviceBuffer());
-    xkernel.setArg<cl_float>(2, forward ? 1.0f : -1.0f);
+    xkernel.setArg<cl_int>(2, forward ? 1 : -1);
    context.executeKernel(xkernel, xsize*ysize*zsize, min(xsize, (int) maxSize));
    ykernel.setArg<cl::Buffer>(0, in.getDeviceBuffer());
    ykernel.setArg<cl::Buffer>(1, out.getDeviceBuffer());
-    ykernel.setArg<cl_float>(2, forward ? 1.0f : -1.0f);
+    ykernel.setArg<cl_int>(2, forward ? 1 : -1);
    context.executeKernel(ykernel, xsize*ysize*zsize, min(ysize, (int) maxSize));
 }

@@ -99,23 +99,23 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize) {
                source<<"int i = get_local_id(0);\n";
            }
            source<<"int j = i/"<<m<<";\n";
-            source<<"float2 c0 = data"<<input<<"[i];\n";
-            source<<"float2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
-            source<<"float2 c2 = data"<<input<<"[i+"<<(2*L*m)<<"];\n";
-            source<<"float2 c3 = data"<<input<<"[i+"<<(3*L*m)<<"];\n";
-            source<<"float2 c4 = data"<<input<<"[i+"<<(4*L*m)<<"];\n";
-            source<<"float2 d0 = c1+c4;\n";
-            source<<"float2 d1 = c2+c3;\n";
-            source<<"float2 d2 = "<<OpenCLExpressionUtilities::doubleToString(sin(0.4*M_PI))<<"*(c1-c4);\n";
-            source<<"float2 d3 = "<<OpenCLExpressionUtilities::doubleToString(sin(0.4*M_PI))<<"*(c2-c3);\n";
-            source<<"float2 d4 = d0+d1;\n";
-            source<<"float2 d5 = "<<OpenCLExpressionUtilities::doubleToString(0.25*sqrt(5.0))<<"*(d0-d1);\n";
-            source<<"float2 d6 = c0-0.25f*d4;\n";
-            source<<"float2 d7 = d6+d5;\n";
-            source<<"float2 d8 = d6-d5;\n";
-            string coeff = OpenCLExpressionUtilities::doubleToString(sin(0.2*M_PI)/sin(0.4*M_PI));
-            source<<"float2 d9 = sign*(float2) (d2.y+"<<coeff<<"*d3.y, -d2.x-"<<coeff<<"*d3.x);\n";
-            source<<"float2 d10 = sign*(float2) ("<<coeff<<"*d2.y-d3.y, d3.x-"<<coeff<<"*d2.x);\n";
+            source<<"real2 c0 = data"<<input<<"[i];\n";
+            source<<"real2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
+            source<<"real2 c2 = data"<<input<<"[i+"<<(2*L*m)<<"];\n";
+            source<<"real2 c3 = data"<<input<<"[i+"<<(3*L*m)<<"];\n";
+            source<<"real2 c4 = data"<<input<<"[i+"<<(4*L*m)<<"];\n";
+            source<<"real2 d0 = c1+c4;\n";
+            source<<"real2 d1 = c2+c3;\n";
+            source<<"real2 d2 = "<<context.doubleToString(sin(0.4*M_PI))<<"*(c1-c4);\n";
+            source<<"real2 d3 = "<<context.doubleToString(sin(0.4*M_PI))<<"*(c2-c3);\n";
+            source<<"real2 d4 = d0+d1;\n";
+            source<<"real2 d5 = "<<context.doubleToString(0.25*sqrt(5.0))<<"*(d0-d1);\n";
+            source<<"real2 d6 = c0-0.25f*d4;\n";
+            source<<"real2 d7 = d6+d5;\n";
+            source<<"real2 d8 = d6-d5;\n";
+            string coeff = context.doubleToString(sin(0.2*M_PI)/sin(0.4*M_PI));
+            source<<"real2 d9 = sign*(real2) (d2.y+"<<coeff<<"*d3.y, -d2.x-"<<coeff<<"*d3.x);\n";
+            source<<"real2 d10 = sign*(real2) ("<<coeff<<"*d2.y-d3.y, d3.x-"<<coeff<<"*d2.x);\n";
            source<<"data"<<output<<"[i+4*j*"<<m<<"] = c0+d4;\n";
            source<<"data"<<output<<"[i+(4*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(5*L)<<"], d7+d9);\n";
            source<<"data"<<output<<"[i+(4*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(5*L)<<"], d8+d10);\n";
@@ -134,14 +134,14 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize) {
                source<<"int i = get_local_id(0);\n";
            }
            source<<"int j = i/"<<m<<";\n";
-            source<<"float2 c0 = data"<<input<<"[i];\n";
-            source<<"float2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
-            source<<"float2 c2 = data"<<input<<"[i+"<<(2*L*m)<<"];\n";
-            source<<"float2 c3 = data"<<input<<"[i+"<<(3*L*m)<<"];\n";
-            source<<"float2 d0 = c0+c2;\n";
-            source<<"float2 d1 = c0-c2;\n";
-            source<<"float2 d2 = c1+c3;\n";
-            source<<"float2 d3 = sign*(float2) (c1.y-c3.y, c3.x-c1.x);\n";
+            source<<"real2 c0 = data"<<input<<"[i];\n";
+            source<<"real2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
+            source<<"real2 c2 = data"<<input<<"[i+"<<(2*L*m)<<"];\n";
+            source<<"real2 c3 = data"<<input<<"[i+"<<(3*L*m)<<"];\n";
+            source<<"real2 d0 = c0+c2;\n";
+            source<<"real2 d1 = c0-c2;\n";
+            source<<"real2 d2 = c1+c3;\n";
+            source<<"real2 d3 = sign*(real2) (c1.y-c3.y, c3.x-c1.x);\n";
            source<<"data"<<output<<"[i+3*j*"<<m<<"] = d0+d2;\n";
            source<<"data"<<output<<"[i+(3*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(4*L)<<"], d1+d3);\n";
            source<<"data"<<output<<"[i+(3*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(4*L)<<"], d0-d2);\n";
@@ -159,12 +159,12 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize) {
                source<<"int i = get_local_id(0);\n";
            }
            source<<"int j = i/"<<m<<";\n";
-            source<<"float2 c0 = data"<<input<<"[i];\n";
-            source<<"float2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
-            source<<"float2 c2 = data"<<input<<"[i+"<<(2*L*m)<<"];\n";
-            source<<"float2 d0 = c1+c2;\n";
-            source<<"float2 d1 = c0-0.5f*d0;\n";
-            source<<"float2 d2 = sign*"<<OpenCLExpressionUtilities::doubleToString(sin(M_PI/3.0))<<"*(float2) (c1.y-c2.y, c2.x-c1.x);\n";
+            source<<"real2 c0 = data"<<input<<"[i];\n";
+            source<<"real2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
+            source<<"real2 c2 = data"<<input<<"[i+"<<(2*L*m)<<"];\n";
+            source<<"real2 d0 = c1+c2;\n";
+            source<<"real2 d1 = c0-0.5f*d0;\n";
+            source<<"real2 d2 = sign*"<<context.doubleToString(sin(M_PI/3.0))<<"*(real2) (c1.y-c2.y, c2.x-c1.x);\n";
            source<<"data"<<output<<"[i+2*j*"<<m<<"] = c0+d0;\n";
            source<<"data"<<output<<"[i+(2*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(3*L)<<"], d1+d2);\n";
            source<<"data"<<output<<"[i+(2*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(3*L)<<"], d1-d2);\n";
@@ -181,15 +181,15 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize) {
                source<<"int i = get_local_id(0);\n";
            }
            source<<"int j = i/"<<m<<";\n";
-            source<<"float2 c0 = data"<<input<<"[i];\n";
-            source<<"float2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
+            source<<"real2 c0 = data"<<input<<"[i];\n";
+            source<<"real2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
            source<<"data"<<output<<"[i+j*"<<m<<"] = c0+c1;\n";
            source<<"data"<<output<<"[i+(j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(2*L)<<"], c0-c1);\n";
            source<<"}\n";
            m = m*2;
        }
        else
-            throw OpenMMException("Illegal size for FFT: "+OpenCLExpressionUtilities::intToString(zsize));
+            throw OpenMMException("Illegal size for FFT: "+context.intToString(zsize));
        source<<"barrier(CLK_LOCAL_MEM_FENCE);\n";
        source<<"}\n";
        ++stage;
@@ -205,16 +205,17 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize) {
        source<<"out[y*(ZSIZE*XSIZE)+get_local_id(0)*XSIZE+x] = data"<<(stage%2)<<"[get_local_id(0)];\n";
    source<<"barrier(CLK_GLOBAL_MEM_FENCE);";
    map<string, string> replacements;
-    replacements["XSIZE"] = OpenCLExpressionUtilities::intToString(xsize);
-    replacements["YSIZE"] = OpenCLExpressionUtilities::intToString(ysize);
-    replacements["ZSIZE"] = OpenCLExpressionUtilities::intToString(zsize);
-    replacements["M_PI"] = OpenCLExpressionUtilities::doubleToString(M_PI);
+    replacements["XSIZE"] = context.intToString(xsize);
+    replacements["YSIZE"] = context.intToString(ysize);
+    replacements["ZSIZE"] = context.intToString(zsize);
+    replacements["M_PI"] = context.doubleToString(M_PI);
    replacements["COMPUTE_FFT"] = source.str();
    replacements["LOOP_REQUIRED"] = (loopRequired ? "1" : "0");
    cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::fft, replacements));
    cl::Kernel kernel(program, "execFFT");
-    kernel.setArg(3, zsize*sizeof(mm_float2), NULL);
-    kernel.setArg(4, zsize*sizeof(mm_float2), NULL);
-    kernel.setArg(5, zsize*sizeof(mm_float2), NULL);
+    int bufferSize = zsize*(context.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2));
+    kernel.setArg(3, bufferSize, NULL);
+    kernel.setArg(4, bufferSize, NULL);
+    kernel.setArg(5, bufferSize, NULL);
    return kernel;
 }
--- a/platforms/opencl/src/OpenCLIntegrationUtilities.cpp
+++ b/platforms/opencl/src/OpenCLIntegrationUtilities.cpp
@@ -559,8 +559,8 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
        // Create the CCMA kernels.

        map<string, string> defines;
-        defines["NUM_CONSTRAINTS"] = OpenCLExpressionUtilities::intToString(numCCMA);
-        defines["NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(numAtoms);
+        defines["NUM_CONSTRAINTS"] = context.intToString(numCCMA);
+        defines["NUM_ATOMS"] = context.intToString(numAtoms);
        cl::Program ccmaProgram = context.createProgram(OpenCLKernelSources::ccma, defines);
        ccmaDirectionsKernel = cl::Kernel(ccmaProgram, "computeConstraintDirections");
        ccmaPosForceKernel = cl::Kernel(ccmaProgram, "computeConstraintForce");
@@ -630,9 +630,9 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
    // Create the kernels for virtual sites.

    map<string, string> defines;
-    defines["NUM_2_AVERAGE"] = OpenCLExpressionUtilities::intToString(num2Avg);
-    defines["NUM_3_AVERAGE"] = OpenCLExpressionUtilities::intToString(num3Avg);
-    defines["NUM_OUT_OF_PLANE"] = OpenCLExpressionUtilities::intToString(numOutOfPlane);
+    defines["NUM_2_AVERAGE"] = context.intToString(num2Avg);
+    defines["NUM_3_AVERAGE"] = context.intToString(num3Avg);
+    defines["NUM_OUT_OF_PLANE"] = context.intToString(numOutOfPlane);
    cl::Program vsiteProgram = context.createProgram(OpenCLKernelSources::virtualSites, defines);
    vsitePositionKernel = cl::Kernel(vsiteProgram, "computeVirtualSites");
    vsitePositionKernel.setArg<cl::Buffer>(0, context.getPosq().getDeviceBuffer());

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
--- a/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+++ b/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2011 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2012 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -267,7 +267,7 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
    forceKernel = createInteractionKernel(kernelSource, parameters, arguments, true, true);
    if (useCutoff) {
        map<string, string> defines;
-        defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(context.getNumAtomBlocks());
+        defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
        if (forceBufferPerAtomBlock)
            defines["USE_OUTPUT_BUFFER_PER_BLOCK"] = "1";
        if (usePeriodic)
@@ -281,6 +281,9 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
        findBlockBoundsKernel.setArg<cl::Buffer>(5, blockBoundingBox->getDeviceBuffer());
        findBlockBoundsKernel.setArg<cl::Buffer>(6, interactionCount->getDeviceBuffer());
        findInteractingBlocksKernel = cl::Kernel(interactingBlocksProgram, "findBlocksWithInteractions");
+        if (context.getUseDoublePrecision())
+            findInteractingBlocksKernel.setArg<cl_double>(0, cutoff*cutoff);
+        else
            findInteractingBlocksKernel.setArg<cl_float>(0, (cl_float) (cutoff*cutoff));
        findInteractingBlocksKernel.setArg<cl::Buffer>(3, blockCenter->getDeviceBuffer());
        findInteractingBlocksKernel.setArg<cl::Buffer>(4, blockBoundingBox->getDeviceBuffer());
@@ -293,6 +296,9 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
        findInteractingBlocksKernel.setArg<cl_uint>(11, startTileIndex+numTiles);
        if (context.getSIMDWidth() == 32 && !deviceIsCpu) {
            findInteractionsWithinBlocksKernel = cl::Kernel(interactingBlocksProgram, "findInteractionsWithinBlocks");
+            if (context.getUseDoublePrecision())
+                findInteractionsWithinBlocksKernel.setArg<cl_double>(0, cutoff*cutoff);
+            else
                findInteractionsWithinBlocksKernel.setArg<cl_float>(0, (cl_float) (cutoff*cutoff));
            findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(3, context.getPosq().getDeviceBuffer());
            findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(4, interactingTiles->getDeviceBuffer());
@@ -315,6 +321,20 @@ int OpenCLNonbondedUtilities::findExclusionIndex(int x, int y, const vector<cl_u
    throw OpenMMException("Internal error: exclusion in unexpected tile");
 }

+static void setPeriodicBoxSizeArg(OpenCLContext& cl, cl::Kernel& kernel, int index) {
+    if (cl.getUseDoublePrecision())
+        kernel.setArg<mm_double4>(index, cl.getPeriodicBoxSizeDouble());
+    else
+        kernel.setArg<mm_float4>(index, cl.getPeriodicBoxSize());
+}
+
+static void setInvPeriodicBoxSizeArg(OpenCLContext& cl, cl::Kernel& kernel, int index) {
+    if (cl.getUseDoublePrecision())
+        kernel.setArg<mm_double4>(index, cl.getInvPeriodicBoxSizeDouble());
+    else
+        kernel.setArg<mm_float4>(index, cl.getInvPeriodicBoxSize());
+}
+
 void OpenCLNonbondedUtilities::prepareInteractions() {
    if (!useCutoff)
        return;
@@ -327,15 +347,15 @@ void OpenCLNonbondedUtilities::prepareInteractions() {

    // Compute the neighbor list.

-    findBlockBoundsKernel.setArg<mm_float4>(1, context.getPeriodicBoxSize());
-    findBlockBoundsKernel.setArg<mm_float4>(2, context.getInvPeriodicBoxSize());
+    setPeriodicBoxSizeArg(context, findBlockBoundsKernel, 1);
+    setInvPeriodicBoxSizeArg(context, findBlockBoundsKernel, 2);
    context.executeKernel(findBlockBoundsKernel, context.getNumAtoms());
-    findInteractingBlocksKernel.setArg<mm_float4>(1, context.getPeriodicBoxSize());
-    findInteractingBlocksKernel.setArg<mm_float4>(2, context.getInvPeriodicBoxSize());
+    setPeriodicBoxSizeArg(context, findInteractingBlocksKernel, 1);
+    setInvPeriodicBoxSizeArg(context, findInteractingBlocksKernel, 2);
    context.executeKernel(findInteractingBlocksKernel, context.getNumAtoms(), deviceIsCpu ? 1 : -1);
    if (context.getSIMDWidth() == 32 && !deviceIsCpu) {
-        findInteractionsWithinBlocksKernel.setArg<mm_float4>(1, context.getPeriodicBoxSize());
-        findInteractionsWithinBlocksKernel.setArg<mm_float4>(2, context.getInvPeriodicBoxSize());
+        setPeriodicBoxSizeArg(context, findInteractionsWithinBlocksKernel, 1);
+        setInvPeriodicBoxSizeArg(context, findInteractionsWithinBlocksKernel, 2);
        context.executeKernel(findInteractionsWithinBlocksKernel, context.getNumAtoms(), 128);
    }
 }
@@ -343,8 +363,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
 void OpenCLNonbondedUtilities::computeInteractions() {
    if (cutoff != -1.0) {
        if (useCutoff) {
-            forceKernel.setArg<mm_float4>(10, context.getPeriodicBoxSize());
-            forceKernel.setArg<mm_float4>(11, context.getInvPeriodicBoxSize());
+            setPeriodicBoxSizeArg(context, forceKernel, 10);
+            setInvPeriodicBoxSizeArg(context, forceKernel, 11);
        }
        context.executeKernel(forceKernel, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
    }
@@ -498,11 +518,11 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
        defines["USE_EXCLUSIONS"] = "1";
    if (isSymmetric)
        defines["USE_SYMMETRIC"] = "1";
-    defines["FORCE_WORK_GROUP_SIZE"] = OpenCLExpressionUtilities::intToString(forceThreadBlockSize);
-    defines["CUTOFF_SQUARED"] = OpenCLExpressionUtilities::doubleToString(cutoff*cutoff);
-    defines["NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getNumAtoms());
-    defines["PADDED_NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getPaddedNumAtoms());
-    defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(context.getNumAtomBlocks());
+    defines["FORCE_WORK_GROUP_SIZE"] = context.intToString(forceThreadBlockSize);
+    defines["CUTOFF_SQUARED"] = context.doubleToString(cutoff*cutoff);
+    defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms());
+    defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
+    defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
    if ((localDataSize/4)%2 == 0)
        defines["PARAMETER_SIZE_IS_EVEN"] = "1";
    string file;

--- a/platforms/opencl/src/OpenCLNonbondedUtilities.h
+++ b/platforms/opencl/src/OpenCLNonbondedUtilities.h
@@ -30,6 +30,7 @@
 #include "OpenCLContext.h"
 #include "openmm/System.h"
 #include "OpenCLExpressionUtilities.h"
+#include <sstream>
 #include <string>
 #include <vector>

@@ -287,8 +288,11 @@ public:
            name(name), componentType(componentType), numComponents(numComponents), size(size), memory(&memory) {
        if (numComponents == 1)
            type = componentType;
-        else
-            type = componentType+OpenCLExpressionUtilities::intToString(numComponents);
+        else {
+            std::stringstream s;
+            s << componentType << numComponents;
+            type = s.str();
+        }
    }
    const std::string& getName() const {
        return name;

--- a/platforms/opencl/src/OpenCLParallelKernels.cpp
+++ b/platforms/opencl/src/OpenCLParallelKernels.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2011 Stanford University and the Authors.           *
+ * Portions copyright (c) 2011-2012 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -54,14 +54,14 @@ using namespace std;
 class OpenCLParallelCalcForcesAndEnergyKernel::BeginComputationTask : public OpenCLContext::WorkTask {
 public:
    BeginComputationTask(ContextImpl& context, OpenCLContext& cl, OpenCLCalcForcesAndEnergyKernel& kernel,
-            bool includeForce, bool includeEnergy, int groups, mm_float4* pinnedMemory) : context(context), cl(cl), kernel(kernel),
+            bool includeForce, bool includeEnergy, int groups, void* pinnedMemory) : context(context), cl(cl), kernel(kernel),
            includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), pinnedMemory(pinnedMemory) {
    }
    void execute() {
        // Copy coordinates over to this device and execute the kernel.

        if (cl.getContextIndex() > 0)
-            cl.getQueue().enqueueWriteBuffer(cl.getPosq().getDeviceBuffer(), CL_FALSE, 0, cl.getPaddedNumAtoms()*sizeof(mm_float4), pinnedMemory);
+            cl.getQueue().enqueueWriteBuffer(cl.getPosq().getDeviceBuffer(), CL_FALSE, 0, cl.getPaddedNumAtoms()*cl.getPosq().getElementSize(), pinnedMemory);
        kernel.beginComputation(context, includeForce, includeEnergy, groups);
    }
 private:
@@ -70,13 +70,13 @@ private:
    OpenCLCalcForcesAndEnergyKernel& kernel;
    bool includeForce, includeEnergy;
    int groups;
-    mm_float4* pinnedMemory;
+    void* pinnedMemory;
 };

 class OpenCLParallelCalcForcesAndEnergyKernel::FinishComputationTask : public OpenCLContext::WorkTask {
 public:
    FinishComputationTask(ContextImpl& context, OpenCLContext& cl, OpenCLCalcForcesAndEnergyKernel& kernel,
-            bool includeForce, bool includeEnergy, int groups, double& energy, long long& completionTime, mm_float4* pinnedMemory) :
+            bool includeForce, bool includeEnergy, int groups, double& energy, long long& completionTime, void* pinnedMemory) :
            context(context), cl(cl), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), energy(energy),
            completionTime(completionTime), pinnedMemory(pinnedMemory) {
    }
@@ -87,8 +87,9 @@ public:
        if (includeForce) {
            if (cl.getContextIndex() > 0) {
                int numAtoms = cl.getPaddedNumAtoms();
+                void* dest = (cl.getUseDoublePrecision() ? (void*) &((mm_double4*) pinnedMemory)[(cl.getContextIndex()-1)*numAtoms] : (void*) &((mm_float4*) pinnedMemory)[(cl.getContextIndex()-1)*numAtoms]);
                cl.getQueue().enqueueReadBuffer(cl.getForce().getDeviceBuffer(), CL_TRUE, 0,
-                        numAtoms*sizeof(mm_float4), &pinnedMemory[(cl.getContextIndex()-1)*numAtoms]);
+                        numAtoms*cl.getForce().getElementSize(), dest);
            }
            else
                cl.getQueue().finish();
@@ -103,7 +104,7 @@ private:
    int groups;
    double& energy;
    long long& completionTime;
-    mm_float4* pinnedMemory;
+    void* pinnedMemory;
 };

 OpenCLParallelCalcForcesAndEnergyKernel::OpenCLParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, OpenCLPlatform::PlatformData& data) :
@@ -129,19 +130,20 @@ void OpenCLParallelCalcForcesAndEnergyKernel::initialize(const System& system) {

 void OpenCLParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
    OpenCLContext& cl0 = *data.contexts[0];
+    int elementSize = (cl0.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4));
    if (contextForces == NULL) {
        contextForces = OpenCLArray::create<mm_float4>(cl0, &cl0.getForceBuffers().getDeviceBuffer(),
                data.contexts.size()*cl0.getPaddedNumAtoms(), "contextForces");
-        int bufferBytes = (data.contexts.size()-1)*cl0.getPaddedNumAtoms()*sizeof(mm_float4);
+        int bufferBytes = (data.contexts.size()-1)*cl0.getPaddedNumAtoms()*elementSize;
        pinnedPositionBuffer = new cl::Buffer(cl0.getContext(), CL_MEM_ALLOC_HOST_PTR, bufferBytes);
-        pinnedPositionMemory = (mm_float4*) cl0.getQueue().enqueueMapBuffer(*pinnedPositionBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, bufferBytes);
+        pinnedPositionMemory = cl0.getQueue().enqueueMapBuffer(*pinnedPositionBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, bufferBytes);
        pinnedForceBuffer = new cl::Buffer(cl0.getContext(), CL_MEM_ALLOC_HOST_PTR, bufferBytes);
-        pinnedForceMemory = (mm_float4*) cl0.getQueue().enqueueMapBuffer(*pinnedForceBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, bufferBytes);
+        pinnedForceMemory = cl0.getQueue().enqueueMapBuffer(*pinnedForceBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, bufferBytes);
    }

    // Copy coordinates over to each device and execute the kernel.
    
-    cl0.getQueue().enqueueReadBuffer(cl0.getPosq().getDeviceBuffer(), CL_TRUE, 0, cl0.getPaddedNumAtoms()*sizeof(mm_float4), pinnedPositionMemory);
+    cl0.getQueue().enqueueReadBuffer(cl0.getPosq().getDeviceBuffer(), CL_TRUE, 0, cl0.getPaddedNumAtoms()*elementSize, pinnedPositionMemory);
    for (int i = 0; i < (int) data.contexts.size(); i++) {
        data.contextEnergy[i] = 0.0;
        OpenCLContext& cl = *data.contexts[i];
@@ -165,8 +167,9 @@ double OpenCLParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& c
        
        OpenCLContext& cl = *data.contexts[0];
        int numAtoms = cl.getPaddedNumAtoms();
-        cl.getQueue().enqueueWriteBuffer(contextForces->getDeviceBuffer(), CL_FALSE, numAtoms*sizeof(mm_float4),
-                numAtoms*(data.contexts.size()-1)*sizeof(mm_float4), pinnedForceMemory);
+        int elementSize = (cl.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4));
+        cl.getQueue().enqueueWriteBuffer(contextForces->getDeviceBuffer(), CL_FALSE, numAtoms*elementSize,
+                numAtoms*(data.contexts.size()-1)*elementSize, pinnedForceMemory);
        cl.reduceBuffer(*contextForces, data.contexts.size());
        
        // Balance work between the contexts by transferring a few nonbonded tiles from the context that

--- a/platforms/opencl/src/OpenCLParallelKernels.h
+++ b/platforms/opencl/src/OpenCLParallelKernels.h
@@ -84,8 +84,8 @@ private:
    OpenCLArray* contextForces;
    cl::Buffer* pinnedPositionBuffer;
    cl::Buffer* pinnedForceBuffer;
-    mm_float4* pinnedPositionMemory;
-    mm_float4* pinnedForceMemory;
+    void* pinnedPositionMemory;
+    void* pinnedForceMemory;
 };

 /**

--- a/platforms/opencl/src/OpenCLPlatform.cpp
+++ b/platforms/opencl/src/OpenCLPlatform.cpp
@@ -141,7 +141,7 @@ OpenCLPlatform::PlatformData::PlatformData(const System& system, const string& p
        device << contexts[i]->getDeviceIndex();
    }
    propertyValues[OpenCLPlatform::OpenCLDeviceIndex()] = device.str();
-    propertyValues[OpenCLPlatform::OpenCLPlatformIndex()] = OpenCLExpressionUtilities::intToString(platformIndex);
+    propertyValues[OpenCLPlatform::OpenCLPlatformIndex()] = contexts[0]->intToString(platformIndex);
    propertyValues[OpenCLPlatform::OpenCLPrecision()] = precisionProperty;
    contextEnergy.resize(contexts.size());
 }

--- a/platforms/opencl/src/OpenCLSort.h
+++ b/platforms/opencl/src/OpenCLSort.h
@@ -162,7 +162,7 @@ public:
        // Assign array elements to buckets.

        unsigned int numBuckets = bucketOffset->getSize();
-        context.clearBuffer(bucketOffset->getDeviceBuffer(), numBuckets);
+        context.clearBuffer(*bucketOffset);
        assignElementsKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
        assignElementsKernel.setArg<cl_int>(1, data.getSize());
        assignElementsKernel.setArg<cl_int>(2, numBuckets);

--- a/platforms/opencl/src/kernels/angleForce.cl
+++ b/platforms/opencl/src/kernels/angleForce.cl
-float4 v0 = pos2-pos1;
-float4 v1 = pos2-pos3;
-float4 cp = cross(v0, v1);
-float rp = cp.x*cp.x + cp.y*cp.y + cp.z*cp.z;
-rp = max(SQRT(rp), 1.0e-06f);
-float r21 = v0.x*v0.x + v0.y*v0.y + v0.z*v0.z;
-float r23 = v1.x*v1.x + v1.y*v1.y + v1.z*v1.z;
-float dot = v0.x*v1.x + v0.y*v1.y + v0.z*v1.z;
-float cosine = clamp(dot*RSQRT(r21*r23), -1.0f, 1.0f);
-float theta = acos(cosine);
+real4 v0 = pos2-pos1;
+real4 v1 = pos2-pos3;
+real4 cp = cross(v0, v1);
+real rp = cp.x*cp.x + cp.y*cp.y + cp.z*cp.z;
+rp = max(SQRT(rp), (real) 1.0e-06f);
+real r21 = v0.x*v0.x + v0.y*v0.y + v0.z*v0.z;
+real r23 = v1.x*v1.x + v1.y*v1.y + v1.z*v1.z;
+real dot = v0.x*v1.x + v0.y*v1.y + v0.z*v1.z;
+real cosine = clamp(dot*RSQRT(r21*r23), (real) -1, (real) 1);
+real theta = acos(cosine);
 COMPUTE_FORCE
-float4 force1 = cross(v0, cp)*(dEdAngle/(r21*rp));
-float4 force3 = cross(cp, v1)*(dEdAngle/(r23*rp));
-float4 force2 = -force1-force3;
+real4 force1 = cross(v0, cp)*(dEdAngle/(r21*rp));
+real4 force3 = cross(cp, v1)*(dEdAngle/(r23*rp));
+real4 force2 = -force1-force3;
--- a/platforms/opencl/src/kernels/bondForce.cl
+++ b/platforms/opencl/src/kernels/bondForce.cl
-float4 delta = pos2-pos1;
-float r = SQRT(delta.x*delta.x + delta.y*delta.y + delta.z*delta.z);
+real4 delta = pos2-pos1;
+real r = SQRT(delta.x*delta.x + delta.y*delta.y + delta.z*delta.z);
 COMPUTE_FORCE
 dEdR = (r > 0.0f) ? (dEdR / r) : 0.0f;
 delta.xyz *= dEdR;
-float4 force1 = delta;
-float4 force2 = -delta;
\ No newline at end of file
+real4 force1 = delta;
+real4 force2 = -delta;
\ No newline at end of file
--- a/platforms/opencl/src/kernels/cmapTorsionForce.cl
+++ b/platforms/opencl/src/kernels/cmapTorsionForce.cl
-const float PI = 3.14159265358979323846f;
+const real PI = 3.14159265358979323846f;

 // Compute the first angle.

-float4 v0a = (float4) (pos1.xyz-pos2.xyz, 0.0f);
-float4 v1a = (float4) (pos3.xyz-pos2.xyz, 0.0f);
-float4 v2a = (float4) (pos3.xyz-pos4.xyz, 0.0f);
-float4 cp0a = cross(v0a, v1a);
-float4 cp1a = cross(v1a, v2a);
-float cosangle = dot(normalize(cp0a), normalize(cp1a));
-float angleA;
+real4 v0a = (real4) (pos1.xyz-pos2.xyz, 0.0f);
+real4 v1a = (real4) (pos3.xyz-pos2.xyz, 0.0f);
+real4 v2a = (real4) (pos3.xyz-pos4.xyz, 0.0f);
+real4 cp0a = cross(v0a, v1a);
+real4 cp1a = cross(v1a, v2a);
+real cosangle = dot(normalize(cp0a), normalize(cp1a));
+real angleA;
 if (cosangle > 0.99f || cosangle < -0.99f) {
    // We're close to the singularity in acos(), so take the cross product and use asin() instead.

-    float4 cross_prod = cross(cp0a, cp1a);
-    float scale = dot(cp0a, cp0a)*dot(cp1a, cp1a);
+    real4 cross_prod = cross(cp0a, cp1a);
+    real scale = dot(cp0a, cp0a)*dot(cp1a, cp1a);
    angleA = asin(SQRT(dot(cross_prod, cross_prod)/scale));
    if (cosangle < 0.0f)
        angleA = PI-angleA;
@@ -25,18 +25,18 @@ angleA = fmod(angleA+2.0f*PI, 2.0f*PI);

 // Compute the second angle.

-float4 v0b = (float4) (pos5.xyz-pos6.xyz, 0.0f);
-float4 v1b = (float4) (pos7.xyz-pos6.xyz, 0.0f);
-float4 v2b = (float4) (pos7.xyz-pos8.xyz, 0.0f);
-float4 cp0b = cross(v0b, v1b);
-float4 cp1b = cross(v1b, v2b);
+real4 v0b = (real4) (pos5.xyz-pos6.xyz, 0.0f);
+real4 v1b = (real4) (pos7.xyz-pos6.xyz, 0.0f);
+real4 v2b = (real4) (pos7.xyz-pos8.xyz, 0.0f);
+real4 cp0b = cross(v0b, v1b);
+real4 cp1b = cross(v1b, v2b);
 cosangle = dot(normalize(cp0b), normalize(cp1b));
-float angleB;
+real angleB;
 if (cosangle > 0.99f || cosangle < -0.99f) {
    // We're close to the singularity in acos(), so take the cross product and use asin() instead.

-    float4 cross_prod = cross(cp0b, cp1b);
-    float scale = dot(cp0b, cp0b)*dot(cp1b, cp1b);
+    real4 cross_prod = cross(cp0b, cp1b);
+    real scale = dot(cp0b, cp0b)*dot(cp1b, cp1b);
    angleB = asin(SQRT(dot(cross_prod, cross_prod)/scale));
    if (cosangle < 0.0f)
        angleB = PI-angleB;
@@ -50,7 +50,7 @@ angleB = fmod(angleB+2.0f*PI, 2.0f*PI);

 int2 pos = MAP_POS[MAPS[index]];
 int size = pos.y;
-float delta = 2*PI/size;
+real delta = 2*PI/size;
 int s = (int) (angleA/delta);
 int t = (int) (angleB/delta);
 float4 c[4];
@@ -59,14 +59,14 @@ c[0] = COEFF[coeffIndex];
 c[1] = COEFF[coeffIndex+1];
 c[2] = COEFF[coeffIndex+2];
 c[3] = COEFF[coeffIndex+3];
-float da = angleA/delta-s;
-float db = angleB/delta-t;
+real da = angleA/delta-s;
+real db = angleB/delta-t;

 // Evaluate the spline to determine the energy and gradients.

-float torsionEnergy = 0.0f;
-float dEdA = 0.0f;
-float dEdB = 0.0f;
+real torsionEnergy = 0.0f;
+real dEdA = 0.0f;
+real dEdB = 0.0f;
 torsionEnergy = da*torsionEnergy + ((c[3].w*db + c[3].z)*db + c[3].y)*db + c[3].x;
 dEdA = db*dEdA + (3.0f*c[3].w*da + 2.0f*c[2].w)*da + c[1].w;
 dEdB = da*dEdB + (3.0f*c[3].w*db + 2.0f*c[3].z)*db + c[3].y;
@@ -85,17 +85,17 @@ energy += torsionEnergy;

 // Apply the force to the first torsion.

-float normCross1 = dot(cp0a, cp0a);
-float normSqrBC = dot(v1a, v1a);
-float normBC = SQRT(normSqrBC);
-float normCross2 = dot(cp1a, cp1a);
-float dp = 1.0f/normSqrBC;
-float4 ff = (float4) ((-dEdA*normBC)/normCross1, dot(v0a, v1a)*dp, dot(v2a, v1a)*dp, (dEdA*normBC)/normCross2);
-float4 force1 = ff.x*cp0a;
-float4 force4 = ff.w*cp1a;
-float4 d = ff.y*force1 - ff.z*force4;
-float4 force2 = d-force1;
-float4 force3 = -d-force4;
+real normCross1 = dot(cp0a, cp0a);
+real normSqrBC = dot(v1a, v1a);
+real normBC = SQRT(normSqrBC);
+real normCross2 = dot(cp1a, cp1a);
+real dp = 1.0f/normSqrBC;
+real4 ff = (real4) ((-dEdA*normBC)/normCross1, dot(v0a, v1a)*dp, dot(v2a, v1a)*dp, (dEdA*normBC)/normCross2);
+real4 force1 = ff.x*cp0a;
+real4 force4 = ff.w*cp1a;
+real4 d = ff.y*force1 - ff.z*force4;
+real4 force2 = d-force1;
+real4 force3 = -d-force4;

 // Apply the force to the second torsion.

@@ -104,9 +104,9 @@ normSqrBC = dot(v1b, v1b);
 normBC = SQRT(normSqrBC);
 normCross2 = dot(cp1b, cp1b);
 dp = 1.0f/normSqrBC;
-ff = (float4) ((-dEdB*normBC)/normCross1, dot(v0b, v1b)*dp, dot(v2b, v1b)*dp, (dEdB*normBC)/normCross2);
-float4 force5 = ff.x*cp0b;
-float4 force8 = ff.w*cp1b;
+ff = (real4) ((-dEdB*normBC)/normCross1, dot(v0b, v1b)*dp, dot(v2b, v1b)*dp, (dEdB*normBC)/normCross2);
+real4 force5 = ff.x*cp0b;
+real4 force8 = ff.w*cp1b;
 d = ff.y*force5 - ff.z*force8;
-float4 force6 = d-force5;
-float4 force7 = -d-force8;
+real4 force6 = d-force5;
+real4 force7 = -d-force8;
--- a/platforms/opencl/src/kernels/coulombLennardJones.cl
+++ b/platforms/opencl/src/kernels/coulombLennardJones.cl
 #if USE_EWALD
 bool needCorrection = isExcluded && atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS;
 if (!isExcluded || needCorrection) {
-    float tempForce = 0.0f;
+    real tempForce = 0;
    if (r2 < CUTOFF_SQUARED || needCorrection) {
-        const float alphaR = EWALD_ALPHA*r;
-        const float expAlphaRSqr = EXP(-alphaR*alphaR);
-        const float prefactor = 138.935456f*posq1.w*posq2.w*invR;
+        const real alphaR = EWALD_ALPHA*r;
+        const real expAlphaRSqr = EXP(-alphaR*alphaR);
+        const real prefactor = 138.935456f*posq1.w*posq2.w*invR;

        // This approximation for erfc is from Abramowitz and Stegun (1964) p. 299.  They cite the following as
        // the original source: C. Hastings, Jr., Approximations for Digital Computers (1955).  It has a maximum
        // error of 3e-7.

-        float t = 1.0f+(0.0705230784f+(0.0422820123f+(0.0092705272f+(0.0001520143f+(0.0002765672f+0.0000430638f*alphaR)*alphaR)*alphaR)*alphaR)*alphaR)*alphaR;
+        real t = 1.0f+(0.0705230784f+(0.0422820123f+(0.0092705272f+(0.0001520143f+(0.0002765672f+0.0000430638f*alphaR)*alphaR)*alphaR)*alphaR)*alphaR)*alphaR;
        t *= t;
        t *= t;
        t *= t;
-        const float erfcAlphaR = RECIP(t*t);
+        const real erfcAlphaR = RECIP(t*t);
        if (needCorrection) {
            // Subtract off the part of this interaction that was included in the reciprocal space contribution.

@@ -24,11 +24,11 @@ if (!isExcluded || needCorrection) {
        }
        else {
 #if HAS_LENNARD_JONES
-            float sig = sigmaEpsilon1.x + sigmaEpsilon2.x;
-            float sig2 = invR*sig;
+            real sig = sigmaEpsilon1.x + sigmaEpsilon2.x;
+            real sig2 = invR*sig;
            sig2 *= sig2;
-            float sig6 = sig2*sig2*sig2;
-            float epssig6 = sig6*(sigmaEpsilon1.y*sigmaEpsilon2.y);
+            real sig6 = sig2*sig2*sig2;
+            real epssig6 = sig6*(sigmaEpsilon1.y*sigmaEpsilon2.y);
            tempForce = epssig6*(12.0f*sig6 - 6.0f) + prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
            tempEnergy += epssig6*(sig6 - 1.0f) + prefactor*erfcAlphaR;
 #else
@@ -41,32 +41,37 @@ if (!isExcluded || needCorrection) {
 }
 #else
 {
+#ifdef USE_DOUBLE_PRECISION
+    unsigned long includeInteraction;
+#else
+    unsigned int includeInteraction;
+#endif
 #ifdef USE_CUTOFF
-    unsigned int includeInteraction = (!isExcluded && r2 < CUTOFF_SQUARED);
+    includeInteraction = (!isExcluded && r2 < CUTOFF_SQUARED);
 #else
-    unsigned int includeInteraction = (!isExcluded);
+    includeInteraction = (!isExcluded);
 #endif
-    float tempForce = 0.0f;
+    real tempForce = 0;
  #if HAS_LENNARD_JONES
-    float sig = sigmaEpsilon1.x + sigmaEpsilon2.x;
-    float sig2 = invR*sig;
+    real sig = sigmaEpsilon1.x + sigmaEpsilon2.x;
+    real sig2 = invR*sig;
    sig2 *= sig2;
-    float sig6 = sig2*sig2*sig2;
-    float epssig6 = sig6*(sigmaEpsilon1.y*sigmaEpsilon2.y);
+    real sig6 = sig2*sig2*sig2;
+    real epssig6 = sig6*(sigmaEpsilon1.y*sigmaEpsilon2.y);
    tempForce = epssig6*(12.0f*sig6 - 6.0f);
-    tempEnergy += select(0.0f, epssig6*(sig6 - 1.0f), includeInteraction);
+    tempEnergy += select((real) 0, epssig6*(sig6-1), includeInteraction);
  #endif
 #if HAS_COULOMB
  #ifdef USE_CUTOFF
-    const float prefactor = 138.935456f*posq1.w*posq2.w;
+    const real prefactor = 138.935456f*posq1.w*posq2.w;
    tempForce += prefactor*(invR - 2.0f*REACTION_FIELD_K*r2);
-    tempEnergy += select(0.0f, prefactor*(invR + REACTION_FIELD_K*r2 - REACTION_FIELD_C), includeInteraction);
+    tempEnergy += select((real) 0, prefactor*(invR + REACTION_FIELD_K*r2 - REACTION_FIELD_C), includeInteraction);
  #else
-    const float prefactor = 138.935456f*posq1.w*posq2.w*invR;
+    const real prefactor = 138.935456f*posq1.w*posq2.w*invR;
    tempForce += prefactor;
-    tempEnergy += select(0.0f, prefactor, includeInteraction);
+    tempEnergy += select((real) 0, prefactor, includeInteraction);
  #endif
 #endif
-    dEdR += select(0.0f, tempForce*invR*invR, includeInteraction);
+    dEdR += select((real) 0, tempForce*invR*invR, includeInteraction);
 }
 #endif
\ No newline at end of file
--- a/platforms/opencl/src/kernels/customCompoundBond.cl
+++ b/platforms/opencl/src/kernels/customCompoundBond.cl
 /**
 * Compute the difference between two vectors, setting the fourth component to the squared magnitude.
 */
-float4 ccb_delta(float4 vec1, float4 vec2) {
-    float4 result = (float4) (vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0.0f);
+real4 ccb_delta(real4 vec1, real4 vec2) {
+    real4 result = (real4) (vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0);
    result.w = result.x*result.x + result.y*result.y + result.z*result.z;
    return result;
 }
@@ -10,17 +10,17 @@ float4 ccb_delta(float4 vec1, float4 vec2) {
 /**
 * Compute the angle between two vectors.  The w component of each vector should contain the squared magnitude.
 */
-float ccb_computeAngle(float4 vec1, float4 vec2) {
-    float dotProduct = vec1.x*vec2.x + vec1.y*vec2.y + vec1.z*vec2.z;
-    float cosine = dotProduct*RSQRT(vec1.w*vec2.w);
-    float angle;
+real ccb_computeAngle(real4 vec1, real4 vec2) {
+    real dotProduct = vec1.x*vec2.x + vec1.y*vec2.y + vec1.z*vec2.z;
+    real cosine = dotProduct*RSQRT(vec1.w*vec2.w);
+    real angle;
    if (cosine > 0.99f || cosine < -0.99f) {
        // We're close to the singularity in acos(), so take the cross product and use asin() instead.

-        float4 crossProduct = cross(vec1, vec2);
-        float scale = vec1.w*vec2.w;
+        real4 crossProduct = cross(vec1, vec2);
+        real scale = vec1.w*vec2.w;
        angle = asin(SQRT(dot(crossProduct, crossProduct)/scale));
-        if (cosine < 0.0f)
+        if (cosine < 0)
            angle = M_PI-angle;
    }
    else
@@ -31,8 +31,8 @@ float ccb_computeAngle(float4 vec1, float4 vec2) {
 /**
 * Compute the cross product of two vectors, setting the fourth component to the squared magnitude.
 */
-float4 ccb_computeCross(float4 vec1, float4 vec2) {
-    float4 result = cross(vec1, vec2);
+real4 ccb_computeCross(real4 vec1, real4 vec2) {
+    real4 result = cross(vec1, vec2);
    result.w = result.x*result.x + result.y*result.y + result.z*result.z;
    return result;
 }
--- a/platforms/opencl/src/kernels/customExternalForce.cl
+++ b/platforms/opencl/src/kernels/customExternalForce.cl
 COMPUTE_FORCE
-float4 force1 = (float4) (-dEdX, -dEdY, -dEdZ, 0.0f);
+real4 force1 = (real4) (-dEdX, -dEdY, -dEdZ, 0);