CUDA implementation of vector functions for CustomIntegrator

4c28df55 · Peter Eastman · cb92103e · 4c28df55 · 4c28df55 · 4c28df55
Commit 4c28df55 authored May 04, 2018 by Peter Eastman
6 changed files
--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -1495,9 +1495,8 @@ class CudaIntegrateCustomStepKernel : public IntegrateCustomStepKernel {
 public:
    enum GlobalTargetType {DT, VARIABLE, PARAMETER};
    CudaIntegrateCustomStepKernel(std::string name, const Platform& platform, CudaContext& cu) : IntegrateCustomStepKernel(name, platform), cu(cu),
-            hasInitializedKernels(false), localValuesAreCurrent(false), perDofValues(NULL), needsEnergyParamDerivs(false) {
+            hasInitializedKernels(false), needsEnergyParamDerivs(false) {
    }
-    ~CudaIntegrateCustomStepKernel();
    /**
     * Initialize the kernel.
     * 
@@ -1561,7 +1560,7 @@ private:
    class ReorderListener;
    class GlobalTarget;
    class DerivFunction;
-    std::string createPerDofComputation(const std::string& variable, const Lepton::ParsedExpression& expr, int component, CustomIntegrator& integrator,
+    std::string createPerDofComputation(const std::string& variable, const Lepton::ParsedExpression& expr, CustomIntegrator& integrator,
        const std::string& forceName, const std::string& energyName, std::vector<const TabulatedFunction*>& functions,
        std::vector<std::pair<std::string, std::string> >& functionNames);
    void prepareForComputation(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid);
@@ -1574,21 +1573,21 @@ private:
    double energy;
    float energyFloat;
    int numGlobalVariables, sumWorkGroupSize;
-    bool hasInitializedKernels, deviceValuesAreCurrent, deviceGlobalsAreCurrent, modifiesParameters, keNeedsForce, hasAnyConstraints, needsEnergyParamDerivs;
+    bool hasInitializedKernels, deviceGlobalsAreCurrent, modifiesParameters, keNeedsForce, hasAnyConstraints, needsEnergyParamDerivs;
-    mutable bool localValuesAreCurrent;
+    std::vector<bool> deviceValuesAreCurrent;
+    mutable std::vector<bool> localValuesAreCurrent; 
    CudaArray globalValues;
    CudaArray sumBuffer;
    CudaArray summedValue;
    CudaArray uniformRandoms;
    CudaArray randomSeed;
    CudaArray perDofEnergyParamDerivs;
-    std::vector<CudaArray> tabulatedFunctions;
+    std::vector<CudaArray> tabulatedFunctions, perDofValues;
    std::map<int, double> savedEnergy;
    std::map<int, CudaArray> savedForces;
    std::set<int> validSavedForces;
-    CudaParameterSet* perDofValues;
+    mutable std::vector<std::vector<float4> > localPerDofValuesFloat;
-    mutable std::vector<std::vector<float> > localPerDofValuesFloat;
+    mutable std::vector<std::vector<double4> > localPerDofValuesDouble;
-    mutable std::vector<std::vector<double> > localPerDofValuesDouble;
    std::map<std::string, double> energyParamDerivs;
    std::vector<std::string> perDofEnergyParamDerivNames;
    std::vector<float> localPerDofEnergyParamDerivsFloat;

--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -203,12 +203,13 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
                break;
            }
        }
-        if (this->deviceIndex == -1)
+        if (this->deviceIndex == -1) {
            if (deviceIndex != -1)
                throw OpenMMException("The requested CUDA device could not be loaded");
            else
                throw OpenMMException("No compatible CUDA device is available");
        }
+    }
    else {
        isLinkedContext = true;
        context = originalContext->context;

--- a/platforms/cuda/src/CudaExpressionUtilities.cpp
+++ b/platforms/cuda/src/CudaExpressionUtilities.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2018 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -69,17 +69,24 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
        processExpression(out, node.getChildren()[i], temps, functions, functionNames, prefix, functionParams, allExpressions, tempType);
    string name = prefix+context.intToString(temps.size());
    bool hasRecordedNode = false;
+    bool isVecType = (tempType[tempType.size()-1] == '3');
    out << tempType << " " << name << " = ";
    switch (node.getOperation().getId()) {
        case Operation::CONSTANT:
-            out << context.doubleToString(dynamic_cast<const Operation::Constant*>(&node.getOperation())->getValue());
+        {
+            string value = context.doubleToString(dynamic_cast<const Operation::Constant*>(&node.getOperation())->getValue());
+            if (isVecType)
+                out << "make_" << tempType << "(" << value << ")";
+            else
+                out << value;
            break;
+        }
        case Operation::VARIABLE:
            throw OpenMMException("Unknown variable in expression: "+node.getOperation().getName());
        case Operation::CUSTOM:
        {
-            out << "0.0f;\n";
+            out << "make_" << tempType << "(0);\n";
            temps.push_back(make_pair(node, name));
            hasRecordedNode = true;
@@ -137,6 +144,71 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
                        out << nodeNames[j] << " = (periodicDistance_r2 > 0 ? -periodicDistance_delta.z*periodicDistance_rinv : 0);\n";
                }
            }
+            else if (node.getOperation().getName() == "dot") {
+                for (int j = 0; j < nodes.size(); j++) {
+                    const vector<int>& derivOrder = dynamic_cast<const Operation::Custom*>(&nodes[j]->getOperation())->getDerivOrder();
+                    string child1 = getTempName(node.getChildren()[0], temps);
+                    string child2 = getTempName(node.getChildren()[1], temps);
+                    if (derivOrder[0] == 0 && derivOrder[1] == 0)
+                        out << nodeNames[j] << " = make_" << tempType << "(dot(" << child1 << ", " << child2 << "));\n";
+                    else
+                        throw OpenMMException("Unsupported derivative order for cross()");
+                }
+            }
+            else if (node.getOperation().getName() == "cross") {
+                for (int j = 0; j < nodes.size(); j++) {
+                    const vector<int>& derivOrder = dynamic_cast<const Operation::Custom*>(&nodes[j]->getOperation())->getDerivOrder();
+                    string child1 = getTempName(node.getChildren()[0], temps);
+                    string child2 = getTempName(node.getChildren()[1], temps);
+                    if (derivOrder[0] == 0 && derivOrder[1] == 0)
+                        out << nodeNames[j] << " = cross(" << child1 << ", " << child2 << ");\n";
+                    else
+                        throw OpenMMException("Unsupported derivative order for cross()");
+                }
+            }
+            else if (node.getOperation().getName() == "vector") {
+                for (int j = 0; j < nodes.size(); j++) {
+                    const vector<int>& derivOrder = dynamic_cast<const Operation::Custom*>(&nodes[j]->getOperation())->getDerivOrder();
+                    if (derivOrder[0] == 0 && derivOrder[1] == 0 && derivOrder[2] == 0) {
+                        out << nodeNames[j] << ".x = " << getTempName(node.getChildren()[0], temps) << ".x;\n";
+                        out << nodeNames[j] << ".y = " << getTempName(node.getChildren()[1], temps) << ".y;\n";
+                        out << nodeNames[j] << ".z = " << getTempName(node.getChildren()[2], temps) << ".z;\n";
+                    }
+                    else if (derivOrder[0] == 1 && derivOrder[1] == 0 && derivOrder[2] == 0)
+                        out << nodeNames[j] << ".x = 1;\n";
+                    else if (derivOrder[0] == 0 && derivOrder[1] == 1 && derivOrder[2] == 0)
+                        out << nodeNames[j] << ".y = 1;\n";
+                    else if (derivOrder[0] == 0 && derivOrder[1] == 0 && derivOrder[2] == 1)
+                        out << nodeNames[j] << ".z = 1;\n";
+                }
+            }
+            else if (node.getOperation().getName() == "_x") {
+                for (int j = 0; j < nodes.size(); j++) {
+                    const vector<int>& derivOrder = dynamic_cast<const Operation::Custom*>(&nodes[j]->getOperation())->getDerivOrder();
+                    if (derivOrder[0] == 0)
+                        out << nodeNames[j] << " = make_" << tempType << "(" << getTempName(node.getChildren()[0], temps) << ".x);\n";
+                    else
+                        throw OpenMMException("Unsupported derivative order for _x()");
+                }
+            }
+            else if (node.getOperation().getName() == "_y") {
+                for (int j = 0; j < nodes.size(); j++) {
+                    const vector<int>& derivOrder = dynamic_cast<const Operation::Custom*>(&nodes[j]->getOperation())->getDerivOrder();
+                    if (derivOrder[0] == 0)
+                        out << nodeNames[j] << " = make_" << tempType << "(" << getTempName(node.getChildren()[0], temps) << ".y);\n";
+                    else
+                        throw OpenMMException("Unsupported derivative order for _y()");
+                }
+            }
+            else if (node.getOperation().getName() == "_z") {
+                for (int j = 0; j < nodes.size(); j++) {
+                    const vector<int>& derivOrder = dynamic_cast<const Operation::Custom*>(&nodes[j]->getOperation())->getDerivOrder();
+                    if (derivOrder[0] == 0)
+                        out << nodeNames[j] << " = make_" << tempType << "(" << getTempName(node.getChildren()[0], temps) << ".z);\n";
+                    else
+                        throw OpenMMException("Unsupported derivative order for _z()");
+                }
+            }
            else {
                // This is a tabulated function.
@@ -150,8 +222,19 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
                    paramsFloat.push_back(context.doubleToString(functionParams[i][j]));
                    paramsInt.push_back(context.intToString((int) functionParams[i][j]));
                }
+                vector<string> suffixes;
+                if (isVecType) {
+                    suffixes.push_back(".x");
+                    suffixes.push_back(".y");
+                    suffixes.push_back(".z");
+                }
+                else {
+                    suffixes.push_back("");
+                }
+                for (auto& suffix : suffixes) {
+                    out << "{\n";
                    if (dynamic_cast<const Continuous1DFunction*>(functions[i]) != NULL) {
-                    out << "real x = " << getTempName(node.getChildren()[0], temps) << ";\n";
+                        out << "real x = " << getTempName(node.getChildren()[0], temps) << suffix << ";\n";
                        out << "if (x >= " << paramsFloat[0] << " && x <= " << paramsFloat[1] << ") {\n";
                        out << "x = (x - " << paramsFloat[0] << ")*" << paramsFloat[2] << ";\n";
                        out << "int index = (int) (floor(x));\n";
@@ -162,15 +245,15 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
                        for (int j = 0; j < nodes.size(); j++) {
                            const vector<int>& derivOrder = dynamic_cast<const Operation::Custom*>(&nodes[j]->getOperation())->getDerivOrder();
                            if (derivOrder[0] == 0)
-                            out << nodeNames[j] << " = a*coeff.x+b*coeff.y+((a*a*a-a)*coeff.z+(b*b*b-b)*coeff.w)/(" << paramsFloat[2] << "*" << paramsFloat[2] << ");\n";
+                                out << nodeNames[j] << suffix << " = a*coeff.x+b*coeff.y+((a*a*a-a)*coeff.z+(b*b*b-b)*coeff.w)/(" << paramsFloat[2] << "*" << paramsFloat[2] << ");\n";
                            else
-                            out << nodeNames[j] << " = (coeff.y-coeff.x)*" << paramsFloat[2] << "+((1.0f-3.0f*a*a)*coeff.z+(3.0f*b*b-1.0f)*coeff.w)/" << paramsFloat[2] << ";\n";
+                                out << nodeNames[j] << suffix << " = (coeff.y-coeff.x)*" << paramsFloat[2] << "+((1.0f-3.0f*a*a)*coeff.z+(3.0f*b*b-1.0f)*coeff.w)/" << paramsFloat[2] << ";\n";
                        }
                        out << "}\n";
                    }
                    else if (dynamic_cast<const Continuous2DFunction*>(functions[i]) != NULL) {
-                    out << "real x = " << getTempName(node.getChildren()[0], temps) << ";\n";
+                        out << "real x = " << getTempName(node.getChildren()[0], temps) << suffix << ";\n";
-                    out << "real y = " << getTempName(node.getChildren()[1], temps) << ";\n";
+                        out << "real y = " << getTempName(node.getChildren()[1], temps) << suffix << ";\n";
                        out << "if (x >= " << paramsFloat[2] << " && x <= " << paramsFloat[3] << " && y >= " << paramsFloat[4] << " && y <= " << paramsFloat[5] << ") {\n";
                        out << "x = (x - " << paramsFloat[2] << ")*" << paramsFloat[6] << ";\n";
                        out << "y = (y - " << paramsFloat[4] << ")*" << paramsFloat[7] << ";\n";
@@ -185,24 +268,24 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
                        for (int j = 0; j < nodes.size(); j++) {
                            const vector<int>& derivOrder = dynamic_cast<const Operation::Custom*>(&nodes[j]->getOperation())->getDerivOrder();
                            if (derivOrder[0] == 0 && derivOrder[1] == 0) {
-                            out << nodeNames[j] << " = da*" << nodeNames[j] << " + ((c[3].w*db + c[3].z)*db + c[3].y)*db + c[3].x;\n";
+                                out << nodeNames[j] << suffix << " = da*" << nodeNames[j] << suffix << " + ((c[3].w*db + c[3].z)*db + c[3].y)*db + c[3].x;\n";
-                            out << nodeNames[j] << " = da*" << nodeNames[j] << " + ((c[2].w*db + c[2].z)*db + c[2].y)*db + c[2].x;\n";
+                                out << nodeNames[j] << suffix << " = da*" << nodeNames[j] << suffix << " + ((c[2].w*db + c[2].z)*db + c[2].y)*db + c[2].x;\n";
-                            out << nodeNames[j] << " = da*" << nodeNames[j] << " + ((c[1].w*db + c[1].z)*db + c[1].y)*db + c[1].x;\n";
+                                out << nodeNames[j] << suffix << " = da*" << nodeNames[j] << suffix << " + ((c[1].w*db + c[1].z)*db + c[1].y)*db + c[1].x;\n";
-                            out << nodeNames[j] << " = da*" << nodeNames[j] << " + ((c[0].w*db + c[0].z)*db + c[0].y)*db + c[0].x;\n";
+                                out << nodeNames[j] << suffix << " = da*" << nodeNames[j] << suffix << " + ((c[0].w*db + c[0].z)*db + c[0].y)*db + c[0].x;\n";
                            }
                            else if (derivOrder[0] == 1 && derivOrder[1] == 0) {
-                            out << nodeNames[j] << " = db*" << nodeNames[j] << " + (3.0f*c[3].w*da + 2.0f*c[2].w)*da + c[1].w;\n";
+                                out << nodeNames[j] << suffix << " = db*" << nodeNames[j] << suffix << " + (3.0f*c[3].w*da + 2.0f*c[2].w)*da + c[1].w;\n";
-                            out << nodeNames[j] << " = db*" << nodeNames[j] << " + (3.0f*c[3].z*da + 2.0f*c[2].z)*da + c[1].z;\n";
+                                out << nodeNames[j] << suffix << " = db*" << nodeNames[j] << suffix << " + (3.0f*c[3].z*da + 2.0f*c[2].z)*da + c[1].z;\n";
-                            out << nodeNames[j] << " = db*" << nodeNames[j] << " + (3.0f*c[3].y*da + 2.0f*c[2].y)*da + c[1].y;\n";
+                                out << nodeNames[j] << suffix << " = db*" << nodeNames[j] << suffix << " + (3.0f*c[3].y*da + 2.0f*c[2].y)*da + c[1].y;\n";
-                            out << nodeNames[j] << " = db*" << nodeNames[j] << " + (3.0f*c[3].x*da + 2.0f*c[2].x)*da + c[1].x;\n";
+                                out << nodeNames[j] << suffix << " = db*" << nodeNames[j] << suffix << " + (3.0f*c[3].x*da + 2.0f*c[2].x)*da + c[1].x;\n";
-                            out << nodeNames[j] << " *= " << paramsFloat[6] << ";\n";
+                                out << nodeNames[j] << suffix << " *= " << paramsFloat[6] << ";\n";
                            }
                            else if (derivOrder[0] == 0 && derivOrder[1] == 1) {
-                            out << nodeNames[j] << " = da*" << nodeNames[j] << " + (3.0f*c[3].w*db + 2.0f*c[3].z)*db + c[3].y;\n";
+                                out << nodeNames[j] << suffix << " = da*" << nodeNames[j] << suffix << " + (3.0f*c[3].w*db + 2.0f*c[3].z)*db + c[3].y;\n";
-                            out << nodeNames[j] << " = da*" << nodeNames[j] << " + (3.0f*c[2].w*db + 2.0f*c[2].z)*db + c[2].y;\n";
+                                out << nodeNames[j] << suffix << " = da*" << nodeNames[j] << suffix << " + (3.0f*c[2].w*db + 2.0f*c[2].z)*db + c[2].y;\n";
-                            out << nodeNames[j] << " = da*" << nodeNames[j] << " + (3.0f*c[1].w*db + 2.0f*c[1].z)*db + c[1].y;\n";
+                                out << nodeNames[j] << suffix << " = da*" << nodeNames[j] << suffix << " + (3.0f*c[1].w*db + 2.0f*c[1].z)*db + c[1].y;\n";
-                            out << nodeNames[j] << " = da*" << nodeNames[j] << " + (3.0f*c[0].w*db + 2.0f*c[0].z)*db + c[0].y;\n";
+                                out << nodeNames[j] << suffix << " = da*" << nodeNames[j] << suffix << " + (3.0f*c[0].w*db + 2.0f*c[0].z)*db + c[0].y;\n";
-                            out << nodeNames[j] << " *= " << paramsFloat[7] << ";\n";
+                                out << nodeNames[j] << suffix << " *= " << paramsFloat[7] << ";\n";
                            }
                            else
                                throw OpenMMException("Unsupported derivative order for Continuous2DFunction");
@@ -210,9 +293,9 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
                        out << "}\n";
                    }
                    else if (dynamic_cast<const Continuous3DFunction*>(functions[i]) != NULL) {
-                    out << "real x = " << getTempName(node.getChildren()[0], temps) << ";\n";
+                        out << "real x = " << getTempName(node.getChildren()[0], temps) << suffix << ";\n";
-                    out << "real y = " << getTempName(node.getChildren()[1], temps) << ";\n";
+                        out << "real y = " << getTempName(node.getChildren()[1], temps) << suffix << ";\n";
-                    out << "real z = " << getTempName(node.getChildren()[2], temps) << ";\n";
+                        out << "real z = " << getTempName(node.getChildren()[2], temps) << suffix << ";\n";
                        out << "if (x >= " << paramsFloat[3] << " && x <= " << paramsFloat[4] << " && y >= " << paramsFloat[5] << " && y <= " << paramsFloat[6] << " && z >= " << paramsFloat[7] << " && z <= " << paramsFloat[8] << ") {\n";
                        out << "x = (x - " << paramsFloat[3] << ")*" << paramsFloat[9] << ";\n";
                        out << "y = (y - " << paramsFloat[5] << ")*" << paramsFloat[10] << ";\n";
@@ -236,7 +319,7 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
                                        int base = k + 4*m;
                                        out << "value[" << m << "] = db*value[" << m << "] + ((c[" << base << "].w*da + c[" << base << "].z)*da + c[" << base << "].y)*da + c[" << base << "].x;\n";
                                    }
-                            out << nodeNames[j] << " = value[0] + dc*(value[1] + dc*(value[2] + dc*value[3]));\n";
+                                out << nodeNames[j] << suffix << " = value[0] + dc*(value[1] + dc*(value[2] + dc*value[3]));\n";
                            }
                            else if (derivOrder[0] == 1 && derivOrder[1] == 0 && derivOrder[2] == 0) {
                                out << "real derivx[4] = {0, 0, 0, 0};\n";
@@ -245,8 +328,8 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
                                        int base = k + 4*m;
                                        out << "derivx[" << m << "] = db*derivx[" << m << "] + (3*c[" << base << "].w*da + 2*c[" << base << "].z)*da + c[" << base << "].y;\n";
                                    }
-                            out << nodeNames[j] << " = derivx[0] + dc*(derivx[1] + dc*(derivx[2] + dc*derivx[3]));\n";
+                                out << nodeNames[j] << suffix << " = derivx[0] + dc*(derivx[1] + dc*(derivx[2] + dc*derivx[3]));\n";
-                            out << nodeNames[j] << " *= " << paramsFloat[9] << ";\n";
+                                out << nodeNames[j] << suffix << " *= " << paramsFloat[9] << ";\n";
                            }
                            else if (derivOrder[0] == 0 && derivOrder[1] == 1 && derivOrder[2] == 0) {
                                const string suffixes[] = {".x", ".y", ".z", ".w"};
@@ -257,8 +340,8 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
                                        string suffix = suffixes[k];
                                        out << "derivy[" << m << "] = da*derivy[" << m << "] + (3*c[" << (base+3) << "]" << suffix << "*db + 2*c[" << (base+2) << "]" << suffix << ")*db + c[" << (base+1) << "]" << suffix << ";\n";
                                    }
-                            out << nodeNames[j] << " = derivy[0] + dc*(derivy[1] + dc*(derivy[2] + dc*derivy[3]));\n";
+                                out << nodeNames[j] << suffix << " = derivy[0] + dc*(derivy[1] + dc*(derivy[2] + dc*derivy[3]));\n";
-                            out << nodeNames[j] << " *= " << paramsFloat[10] << ";\n";
+                                out << nodeNames[j] << suffix << " *= " << paramsFloat[10] << ";\n";
                            }
                            else if (derivOrder[0] == 0 && derivOrder[1] == 0 && derivOrder[2] == 1) {
                                out << "real derivz[4] = {0, 0, 0, 0};\n";
@@ -267,8 +350,8 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
                                        int base = k + 4*m;
                                        out << "derivz[" << m << "] = db*derivz[" << m << "] + ((c[" << base << "].w*da + c[" << base << "].z)*da + c[" << base << "].y)*da + c[" << base << "].x;\n";
                                    }
-                            out << nodeNames[j] << " = derivz[1] + dc*(2*derivz[2] + dc*3*derivz[3]);\n";
+                                out << nodeNames[j] << suffix << " = derivz[1] + dc*(2*derivz[2] + dc*3*derivz[3]);\n";
-                            out << nodeNames[j] << " *= " << paramsFloat[11] << ";\n";
+                                out << nodeNames[j] << suffix << " *= " << paramsFloat[11] << ";\n";
                            }
                            else
                                throw OpenMMException("Unsupported derivative order for Continuous3DFunction");
@@ -279,10 +362,10 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
                        for (int j = 0; j < nodes.size(); j++) {
                            const vector<int>& derivOrder = dynamic_cast<const Operation::Custom*>(&nodes[j]->getOperation())->getDerivOrder();
                            if (derivOrder[0] == 0) {
-                            out << "real x = " << getTempName(node.getChildren()[0], temps) << ";\n";
+                                out << "real x = " << getTempName(node.getChildren()[0], temps) << suffix << ";\n";
                                out << "if (x >= 0 && x < " << paramsInt[0] << ") {\n";
                                out << "int index = (int) floor(x+0.5f);\n";
-                            out << nodeNames[j] << " = " << functionNames[i].second << "[index];\n";
+                                out << nodeNames[j] << suffix << " = " << functionNames[i].second << "[index];\n";
                                out << "}\n";
                            }
                        }
@@ -291,13 +374,13 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
                        for (int j = 0; j < nodes.size(); j++) {
                            const vector<int>& derivOrder = dynamic_cast<const Operation::Custom*>(&nodes[j]->getOperation())->getDerivOrder();
                            if (derivOrder[0] == 0 && derivOrder[1] == 0) {
-                            out << "int x = (int) floor(" << getTempName(node.getChildren()[0], temps) << "+0.5f);\n";
+                                out << "int x = (int) floor(" << getTempName(node.getChildren()[0], temps) << suffix << "+0.5f);\n";
-                            out << "int y = (int) floor(" << getTempName(node.getChildren()[1], temps) << "+0.5f);\n";
+                                out << "int y = (int) floor(" << getTempName(node.getChildren()[1], temps) << suffix << "+0.5f);\n";
                                out << "int xsize = (int) " << paramsInt[0] << ";\n";
                                out << "int ysize = (int) " << paramsInt[1] << ";\n";
                                out << "int index = x+y*xsize;\n";
                                out << "if (index >= 0 && index < xsize*ysize)\n";
-                            out << nodeNames[j] << " = " << functionNames[i].second << "[index];\n";
+                                out << nodeNames[j] << suffix << " = " << functionNames[i].second << "[index];\n";
                            }
                        }
                    }
@@ -305,17 +388,19 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
                        for (int j = 0; j < nodes.size(); j++) {
                            const vector<int>& derivOrder = dynamic_cast<const Operation::Custom*>(&nodes[j]->getOperation())->getDerivOrder();
                            if (derivOrder[0] == 0 && derivOrder[1] == 0 && derivOrder[2] == 0) {
-                            out << "int x = (int) floor(" << getTempName(node.getChildren()[0], temps) << "+0.5f);\n";
+                                out << "int x = (int) floor(" << getTempName(node.getChildren()[0], temps) << suffix << "+0.5f);\n";
-                            out << "int y = (int) floor(" << getTempName(node.getChildren()[1], temps) << "+0.5f);\n";
+                                out << "int y = (int) floor(" << getTempName(node.getChildren()[1], temps) << suffix << "+0.5f);\n";
-                            out << "int z = (int) floor(" << getTempName(node.getChildren()[2], temps) << "+0.5f);\n";
+                                out << "int z = (int) floor(" << getTempName(node.getChildren()[2], temps) << suffix << "+0.5f);\n";
                                out << "int xsize = (int) " << paramsInt[0] << ";\n";
                                out << "int ysize = (int) " << paramsInt[1] << ";\n";
                                out << "int zsize = (int) " << paramsInt[2] << ";\n";
                                out << "int index = x+(y+z*ysize)*xsize;\n";
                                out << "if (index >= 0 && index < xsize*ysize*zsize)\n";
-                            out << nodeNames[j] << " = " << functionNames[i].second << "[index];\n";
+                                out << nodeNames[j] << suffix << " = " << functionNames[i].second << "[index];\n";
+                            }
                        }
                    }
+                    out << "}\n";
                }
            }
            out << "}";
@@ -507,8 +592,23 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
            out << "ceil(" << getTempName(node.getChildren()[0], temps) << ")";
            break;
        case Operation::SELECT:
-            out << "(" << getTempName(node.getChildren()[0], temps) << " != 0 ? " << getTempName(node.getChildren()[1], temps) << " : " << getTempName(node.getChildren()[2], temps) << ")";
+        {
+            string compareVal = getTempName(node.getChildren()[0], temps);
+            string val1 = getTempName(node.getChildren()[1], temps);
+            string val2 = getTempName(node.getChildren()[2], temps);
+            if (isVecType) {
+                out << "make_" << tempType << "(0);\n";
+                out << "{\n";
+                out << tempType<<" tempCompareValue = " << compareVal << ";\n";
+                out << name << ".x = (tempCompareValue.x != 0 ? " << val1 << ".x : " << val2 << ".x);\n";
+                out << name << ".y = (tempCompareValue.y != 0 ? " << val1 << ".y : " << val2 << ".y);\n";
+                out << name << ".z = (tempCompareValue.z != 0 ? " << val1 << ".z : " << val2 << ".z);\n";
+                out << "}\n";
+            }
+            else
+                out << "(" << compareVal << " != 0 ? " << val1 << " : " << val2 << ")";
            break;
+        }
        default:
            throw OpenMMException("Internal error: Unknown operation in user-defined expression: "+node.getOperation().getName());
    }

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -7131,7 +7131,7 @@ double CudaIntegrateVariableLangevinStepKernel::computeKineticEnergy(ContextImpl
 class CudaIntegrateCustomStepKernel::ReorderListener : public CudaContext::ReorderListener {
 public:
-    ReorderListener(CudaContext& cu, CudaParameterSet& perDofValues, vector<vector<float> >& localPerDofValuesFloat, vector<vector<double> >& localPerDofValuesDouble, bool& deviceValuesAreCurrent) :
+    ReorderListener(CudaContext& cu, vector<CudaArray>& perDofValues, vector<vector<float4> >& localPerDofValuesFloat, vector<vector<double4> >& localPerDofValuesDouble, vector<bool>& deviceValuesAreCurrent) :
            cu(cu), perDofValues(perDofValues), localPerDofValuesFloat(localPerDofValuesFloat), localPerDofValuesDouble(localPerDofValuesDouble), deviceValuesAreCurrent(deviceValuesAreCurrent) {
        int numAtoms = cu.getNumAtoms();
        lastAtomOrder.resize(numAtoms);
@@ -7141,52 +7141,42 @@ public:
    void execute() {
        // Reorder the per-DOF variables to reflect the new atom order.
-        if (perDofValues.getNumParameters() == 0)
+        if (perDofValues.size() == 0)
            return;
        int numAtoms = cu.getNumAtoms();
        const vector<int>& order = cu.getAtomIndex();
+        for (int index = 0; index < perDofValues.size(); index++) {
            if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
-            if (deviceValuesAreCurrent)
+                if (deviceValuesAreCurrent[index])
-                perDofValues.getParameterValues(localPerDofValuesDouble);
+                    perDofValues[index].download(localPerDofValuesDouble[index]);
-            vector<vector<double> > swap(3*numAtoms);
+                vector<double4> swap(numAtoms);
-            for (int i = 0; i < numAtoms; i++) {
+                for (int i = 0; i < numAtoms; i++)
-                swap[3*lastAtomOrder[i]] = localPerDofValuesDouble[3*i];
+                    swap[lastAtomOrder[i]] = localPerDofValuesDouble[index][i];
-                swap[3*lastAtomOrder[i]+1] = localPerDofValuesDouble[3*i+1];
+                for (int i = 0; i < numAtoms; i++)
-                swap[3*lastAtomOrder[i]+2] = localPerDofValuesDouble[3*i+2];
+                    localPerDofValuesDouble[index][i] = swap[order[i]];
-            }
+                perDofValues[index].upload(localPerDofValuesDouble[index]);
-            for (int i = 0; i < numAtoms; i++) {
-                localPerDofValuesDouble[3*i] = swap[3*order[i]];
-                localPerDofValuesDouble[3*i+1] = swap[3*order[i]+1];
-                localPerDofValuesDouble[3*i+2] = swap[3*order[i]+2];
-            }
-            perDofValues.setParameterValues(localPerDofValuesDouble);
            }
            else {
-            if (deviceValuesAreCurrent)
+                if (deviceValuesAreCurrent[index])
-                perDofValues.getParameterValues(localPerDofValuesFloat);
+                    perDofValues[index].download(localPerDofValuesFloat[index]);
-            vector<vector<float> > swap(3*numAtoms);
+                vector<float4> swap(numAtoms);
-            for (int i = 0; i < numAtoms; i++) {
+                for (int i = 0; i < numAtoms; i++)
-                swap[3*lastAtomOrder[i]] = localPerDofValuesFloat[3*i];
+                    swap[lastAtomOrder[i]] = localPerDofValuesFloat[index][i];
-                swap[3*lastAtomOrder[i]+1] = localPerDofValuesFloat[3*i+1];
+                for (int i = 0; i < numAtoms; i++)
-                swap[3*lastAtomOrder[i]+2] = localPerDofValuesFloat[3*i+2];
+                    localPerDofValuesFloat[index][i] = swap[order[i]];
-            }
+                perDofValues[index].upload(localPerDofValuesFloat[index]);
-            for (int i = 0; i < numAtoms; i++) {
-                localPerDofValuesFloat[3*i] = swap[3*order[i]];
-                localPerDofValuesFloat[3*i+1] = swap[3*order[i]+1];
-                localPerDofValuesFloat[3*i+2] = swap[3*order[i]+2];
            }
-            perDofValues.setParameterValues(localPerDofValuesFloat);
+            deviceValuesAreCurrent[index] = true;
        }
        for (int i = 0; i < numAtoms; i++)
            lastAtomOrder[i] = order[i];
-        deviceValuesAreCurrent = true;
    }
 private:
    CudaContext& cu;
-    CudaParameterSet& perDofValues;
+    vector<CudaArray>& perDofValues;
-    vector<vector<float> >& localPerDofValuesFloat;
+    vector<vector<float4> >& localPerDofValuesFloat;
-    vector<vector<double> >& localPerDofValuesDouble;
+    vector<vector<double4> >& localPerDofValuesDouble;
-    bool& deviceValuesAreCurrent;
+    vector<bool>& deviceValuesAreCurrent;
    vector<int> lastAtomOrder;
 };
@@ -7211,64 +7201,64 @@ private:
    string param;
 };
-CudaIntegrateCustomStepKernel::~CudaIntegrateCustomStepKernel() {
-    cu.setAsCurrent();
-    if (perDofValues != NULL)
-        delete perDofValues;
-}
 void CudaIntegrateCustomStepKernel::initialize(const System& system, const CustomIntegrator& integrator) {
    cu.getPlatformData().initializeContexts(system);
    cu.setAsCurrent();
    cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
    numGlobalVariables = integrator.getNumGlobalVariables();
    int elementSize = (cu.getUseDoublePrecision() || cu.getUseMixedPrecision() ? sizeof(double) : sizeof(float));
-    sumBuffer.initialize(cu, ((3*system.getNumParticles()+3)/4)*4, elementSize, "sumBuffer");
+    sumBuffer.initialize(cu, system.getNumParticles(), elementSize, "sumBuffer");
    summedValue.initialize(cu, 1, elementSize, "summedValue");
-    perDofValues = new CudaParameterSet(cu, integrator.getNumPerDofVariables(), 3*system.getNumParticles(), "perDofVariables", false, cu.getUseDoublePrecision() || cu.getUseMixedPrecision());
+    perDofValues.resize(integrator.getNumPerDofVariables());
-    cu.addReorderListener(new ReorderListener(cu, *perDofValues, localPerDofValuesFloat, localPerDofValuesDouble, deviceValuesAreCurrent));
+    localPerDofValuesFloat.resize(perDofValues.size());
+    localPerDofValuesDouble.resize(perDofValues.size());
+    for (int i = 0; i < perDofValues.size(); i++)
+        perDofValues[i].initialize(cu, system.getNumParticles(), 4*elementSize, "perDofVariables");
+    localValuesAreCurrent.resize(integrator.getNumPerDofVariables(), false);
+    deviceValuesAreCurrent.resize(integrator.getNumPerDofVariables(), false);
+    cu.addReorderListener(new ReorderListener(cu, perDofValues, localPerDofValuesFloat, localPerDofValuesDouble, deviceValuesAreCurrent));
    SimTKOpenMMUtilities::setRandomNumberSeed(integrator.getRandomNumberSeed());
 }
-string CudaIntegrateCustomStepKernel::createPerDofComputation(const string& variable, const Lepton::ParsedExpression& expr, int component, CustomIntegrator& integrator,
+string CudaIntegrateCustomStepKernel::createPerDofComputation(const string& variable, const Lepton::ParsedExpression& expr, CustomIntegrator& integrator,
        const string& forceName, const string& energyName, vector<const TabulatedFunction*>& functions, vector<pair<string, string> >& functionNames) {
-    const string suffixes[] = {".x", ".y", ".z"};
-    string suffix = suffixes[component];
    map<string, Lepton::ParsedExpression> expressions;
-    if (variable == "x")
+    expressions["double3 tempResult = "] = expr;
-        expressions["position"+suffix+" = "] = expr;
-    else if (variable == "v")
-        expressions["velocity"+suffix+" = "] = expr;
-    else if (variable == "")
-        expressions["sum[3*index+"+cu.intToString(component)+"] = "] = expr;
-    else {
-        for (int i = 0; i < integrator.getNumPerDofVariables(); i++)
-            if (variable == integrator.getPerDofVariableName(i))
-                expressions["perDof"+suffix.substr(1)+perDofValues->getParameterSuffix(i)+" = "] = expr;
-    }
-    if (expressions.size() == 0)
-        throw OpenMMException("Unknown per-DOF variable: "+variable);
    map<string, string> variables;
-    variables["x"] = "position"+suffix;
+    variables["x"] = "trimTo3(position)";
-    variables["v"] = "velocity"+suffix;
+    variables["v"] = "trimTo3(velocity)";
-    variables[forceName] = "f"+suffix;
+    variables[forceName] = "trimTo3(f)";
-    variables["gaussian"] = "gaussian"+suffix;
+    variables["gaussian"] = "trimTo3(gaussian)";
-    variables["uniform"] = "uniform"+suffix;
+    variables["uniform"] = "trimTo3(uniform)";
    variables["m"] = "mass";
    variables["dt"] = "stepSize";
    if (energyName != "")
        variables[energyName] = "energy";
    for (int i = 0; i < integrator.getNumGlobalVariables(); i++)
-        variables[integrator.getGlobalVariableName(i)] = "globals["+cu.intToString(globalVariableIndex[i])+"]";
+        variables[integrator.getGlobalVariableName(i)] = "make_double3(globals["+cu.intToString(globalVariableIndex[i])+"])";
    for (int i = 0; i < integrator.getNumPerDofVariables(); i++)
-        variables[integrator.getPerDofVariableName(i)] = "perDof"+suffix.substr(1)+perDofValues->getParameterSuffix(i);
+        variables[integrator.getPerDofVariableName(i)] = "perDof"+cu.intToString(i);
    for (int i = 0; i < (int) parameterNames.size(); i++)
-        variables[parameterNames[i]] = "globals["+cu.intToString(parameterVariableIndex[i])+"]";
+        variables[parameterNames[i]] = "make_double3(globals["+cu.intToString(parameterVariableIndex[i])+"])";
    vector<pair<ExpressionTreeNode, string> > variableNodes;
    findExpressionsForDerivs(expr.getRootNode(), variableNodes);
    for (auto& var : variables)
        variableNodes.push_back(make_pair(ExpressionTreeNode(new Operation::Variable(var.first)), var.second));
-    return cu.getExpressionUtilities().createExpressions(expressions, variableNodes, functions, functionNames, "temp"+cu.intToString(component)+"_", "double");
+    string result = cu.getExpressionUtilities().createExpressions(expressions, variableNodes, functions, functionNames, "temp", "double3");
+    if (variable == "x")
+        result += "position.x = tempResult.x; position.y = tempResult.y; position.z = tempResult.z;\n";
+    else if (variable == "v")
+        result += "velocity.x = tempResult.x; velocity.y = tempResult.y; velocity.z = tempResult.z;\n";
+    else if (variable == "")
+        result += "sum[index] = tempResult.x+tempResult.y+tempResult.z;\n";
+    else {
+        for (int i = 0; i < integrator.getNumPerDofVariables(); i++)
+            if (variable == integrator.getPerDofVariableName(i)) {
+                string varName = "perDof"+cu.intToString(i);
+                result += varName+".x = tempResult.x; "+varName+".y = tempResult.y; "+varName+".z = tempResult.z;\n";
+            }
+    }
+    return result;
 }
 void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid) {
@@ -7277,6 +7267,7 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
    int numAtoms = cu.getNumAtoms();
    int numSteps = integrator.getNumComputations();
    bool useDouble = cu.getUseDoublePrecision() || cu.getUseMixedPrecision();
+    string perDofType = (useDouble ? "double4" : "float4");
    if (!hasInitializedKernels) {
        hasInitializedKernels = true;
@@ -7487,23 +7478,18 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
                // Compute a per-DOF value.
                stringstream compute;
-                for (int i = 0; i < (int) perDofValues->getBuffers().size(); i++) {
+                for (int i = 0; i < perDofValues.size(); i++)
-                    CudaNonbondedUtilities::ParameterInfo& buffer = perDofValues->getBuffers()[i];
+                    compute << "double3 perDof"<<cu.intToString(i)<<" = trimTo3(convertToDouble4(perDofValues"<<cu.intToString(i)<<"[index]));\n";
-                    compute << buffer.getType()<<" perDofx"<<cu.intToString(i+1)<<" = perDofValues"<<cu.intToString(i+1)<<"[3*index];\n";
-                    compute << buffer.getType()<<" perDofy"<<cu.intToString(i+1)<<" = perDofValues"<<cu.intToString(i+1)<<"[3*index+1];\n";
-                    compute << buffer.getType()<<" perDofz"<<cu.intToString(i+1)<<" = perDofValues"<<cu.intToString(i+1)<<"[3*index+2];\n";
-                }
                int numGaussian = 0, numUniform = 0;
                for (int j = step; j < numSteps && (j == step || merged[j]); j++) {
                    numGaussian += numAtoms*usesVariable(expression[j][0], "gaussian");
                    numUniform += numAtoms*usesVariable(expression[j][0], "uniform");
                    compute << "{\n";
                    if (numGaussian > 0)
-                        compute << "float4 gaussian = gaussianValues[gaussianIndex+index];\n";
+                        compute << "double4 gaussian = convertToDouble4(gaussianValues[gaussianIndex+index]);\n";
                    if (numUniform > 0)
-                        compute << "float4 uniform = uniformValues[uniformIndex+index];\n";
+                        compute << "double4 uniform = convertToDouble4(uniformValues[uniformIndex+index]);\n";
-                    for (int i = 0; i < 3; i++)
+                    compute << createPerDofComputation(stepType[j] == CustomIntegrator::ComputePerDof ? variable[j] : "", expression[j][0], integrator, forceName[j], energyName[j], functionList, functionNames);
-                        compute << createPerDofComputation(stepType[j] == CustomIntegrator::ComputePerDof ? variable[j] : "", expression[j][0], i, integrator, forceName[j], energyName[j], functionList, functionNames);
                    if (variable[j] == "x") {
                        if (storePosAsDelta[j])
                            compute << "posDelta[index] = convertFromDouble4(position-convertToDouble4(loadPos(posq, posqCorrection, index)));\n";
@@ -7513,12 +7499,8 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
                    else if (variable[j] == "v")
                        compute << "velm[index] = convertFromDouble4(velocity);\n";
                    else {
-                        for (int i = 0; i < (int) perDofValues->getBuffers().size(); i++) {
+                        for (int i = 0; i < perDofValues.size(); i++)
-                            CudaNonbondedUtilities::ParameterInfo& buffer = perDofValues->getBuffers()[i];
+                            compute << "perDofValues"<<cu.intToString(i)<<"[index] = make_"<<perDofType<<"(perDof"<<cu.intToString(i)<<".x, perDof"<<cu.intToString(i)<<".y, perDof"<<cu.intToString(i)<<".z, 0);\n";
-                            compute << "perDofValues"<<cu.intToString(i+1)<<"[3*index] = perDofx"<<cu.intToString(i+1)<<";\n";
-                            compute << "perDofValues"<<cu.intToString(i+1)<<"[3*index+1] = perDofy"<<cu.intToString(i+1)<<";\n";
-                            compute << "perDofValues"<<cu.intToString(i+1)<<"[3*index+2] = perDofz"<<cu.intToString(i+1)<<";\n";
-                        }
                    }
                    if (numGaussian > 0)
                        compute << "gaussianIndex += NUM_ATOMS;\n";
@@ -7529,10 +7511,9 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
                map<string, string> replacements;
                replacements["COMPUTE_STEP"] = compute.str();
                stringstream args;
-                for (int i = 0; i < (int) perDofValues->getBuffers().size(); i++) {
+                for (int i = 0; i < perDofValues.size(); i++) {
-                    CudaNonbondedUtilities::ParameterInfo& buffer = perDofValues->getBuffers()[i];
+                    string valueName = "perDofValues"+cu.intToString(i);
-                    string valueName = "perDofValues"+cu.intToString(i+1);
+                    args << ", " << perDofType << "* __restrict__ " << valueName;
-                    args << ", " << buffer.getType() << "* __restrict__ " << valueName;
                }
                for (int i = 0; i < (int) tableTypes.size(); i++)
                    args << ", const " << tableTypes[i]<< "* __restrict__ table" << i;
@@ -7563,8 +7544,8 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
                else
                    args1.push_back(&energyFloat);
                args1.push_back(&perDofEnergyParamDerivs.getDevicePointer());
-                for (auto& buffer : perDofValues->getBuffers())
+                for (auto& array : perDofValues)
-                    args1.push_back(&buffer.getMemory());
+                    args1.push_back(&array.getDevicePointer());
                for (auto& array : tabulatedFunctions)
                    args1.push_back(&array.getDevicePointer());
                kernelArgs[step].push_back(args1);
@@ -7574,7 +7555,7 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
                    vector<void*> args2;
                    args2.push_back(&sumBuffer.getDevicePointer());
                    args2.push_back(&summedValue.getDevicePointer());
-                    defines["SUM_BUFFER_SIZE"] = cu.intToString(3*numAtoms);
+                    defines["SUM_BUFFER_SIZE"] = cu.intToString(numAtoms);
                    module = cu.createModule(CudaKernelSources::customIntegrator, defines);
                    kernel = cu.getKernel(module, useDouble ? "computeDoubleSum" : "computeFloatSum");
                    kernels[step].push_back(kernel);
@@ -7621,30 +7602,24 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
        // Create the kernel for computing kinetic energy.
        stringstream computeKE;
-        for (int i = 0; i < (int) perDofValues->getBuffers().size(); i++) {
+        for (int i = 0; i < perDofValues.size(); i++)
-            const CudaNonbondedUtilities::ParameterInfo& buffer = perDofValues->getBuffers()[i];
+            computeKE << "double3 perDof"<<cu.intToString(i)<<" = trimTo3(convertToDouble4(perDofValues"<<cu.intToString(i)<<"[index]));\n";
-            computeKE << buffer.getType()<<" perDofx"<<cu.intToString(i+1)<<" = perDofValues"<<cu.intToString(i+1)<<"[3*index];\n";
-            computeKE << buffer.getType()<<" perDofy"<<cu.intToString(i+1)<<" = perDofValues"<<cu.intToString(i+1)<<"[3*index+1];\n";
-            computeKE << buffer.getType()<<" perDofz"<<cu.intToString(i+1)<<" = perDofValues"<<cu.intToString(i+1)<<"[3*index+2];\n";
-        }
        Lepton::ParsedExpression keExpression = Lepton::Parser::parse(integrator.getKineticEnergyExpression()).optimize();
-        for (int i = 0; i < 3; i++)
+        computeKE << createPerDofComputation("", keExpression, integrator, "f", "", functionList, functionNames);
-            computeKE << createPerDofComputation("", keExpression, i, integrator, "f", "", functionList, functionNames);
        map<string, string> replacements;
        replacements["COMPUTE_STEP"] = computeKE.str();
        stringstream args;
-        for (int i = 0; i < (int) perDofValues->getBuffers().size(); i++) {
+        for (int i = 0; i < perDofValues.size(); i++) {
-            const CudaNonbondedUtilities::ParameterInfo& buffer = perDofValues->getBuffers()[i];
+            string valueName = "perDofValues"+cu.intToString(i);
-            string valueName = "perDofValues"+cu.intToString(i+1);
+            args << ", " << perDofType << "* __restrict__ " << valueName;
-            args << ", " << buffer.getType() << "* __restrict__ " << valueName;
        }
        for (int i = 0; i < (int) tableTypes.size(); i++)
            args << ", const " << tableTypes[i]<< "* __restrict__ table" << i;
        replacements["PARAMETER_ARGUMENTS"] = args.str();
-        defines["SUM_BUFFER_SIZE"] = cu.intToString(3*numAtoms);
+        defines["SUM_BUFFER_SIZE"] = cu.intToString(numAtoms);
        if (defines.find("LOAD_POS_AS_DELTA") != defines.end())
            defines.erase("LOAD_POS_AS_DELTA");
-        CUmodule module = cu.createModule(cu.replaceStrings(CudaKernelSources::customIntegratorPerDof, replacements), defines);
+        CUmodule module = cu.createModule(cu.replaceStrings(CudaKernelSources::vectorOps+CudaKernelSources::customIntegratorPerDof, replacements), defines);
        kineticEnergyKernel = cu.getKernel(module, "computePerDof");
        kineticEnergyArgs.push_back(&cu.getPosq().getDevicePointer());
        kineticEnergyArgs.push_back(NULL);
@@ -7662,15 +7637,15 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
        else
            kineticEnergyArgs.push_back(&energyFloat);
        kineticEnergyArgs.push_back(&perDofEnergyParamDerivs.getDevicePointer());
-        for (int i = 0; i < (int) perDofValues->getBuffers().size(); i++)
+        for (auto& array : perDofValues)
-            kineticEnergyArgs.push_back(&perDofValues->getBuffers()[i].getMemory());
+            kineticEnergyArgs.push_back(&array.getDevicePointer());
        for (auto& array : tabulatedFunctions)
            kineticEnergyArgs.push_back(&array.getDevicePointer());
        keNeedsForce = usesVariable(keExpression, "f");
        // Create a second kernel to sum the values.
-        defines["SUM_BUFFER_SIZE"] = cu.intToString(3*numAtoms);
+        defines["SUM_BUFFER_SIZE"] = cu.intToString(numAtoms);
        module = cu.createModule(CudaKernelSources::customIntegrator, defines);
        sumKineticEnergyKernel = cu.getKernel(module, useDouble ? "computeDoubleSum" : "computeFloatSum");
@@ -7682,14 +7657,16 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
    // Make sure all values (variables, parameters, etc.) are up to date.
-    if (!deviceValuesAreCurrent) {
+    for (int i = 0; i < perDofValues.size(); i++) {
+        if (!deviceValuesAreCurrent[i]) {
            if (useDouble)
-            perDofValues->setParameterValues(localPerDofValuesDouble);
+                perDofValues[i].upload(localPerDofValuesDouble[i]);
            else
-            perDofValues->setParameterValues(localPerDofValuesFloat);
+                perDofValues[i].upload(localPerDofValuesFloat[i]);
-        deviceValuesAreCurrent = true;
+            deviceValuesAreCurrent[i] = true;
+        }
+        localValuesAreCurrent[i] = false;
    }
-    localValuesAreCurrent = false;
    double stepSize = integrator.getStepSize();
    recordGlobalValue(stepSize, GlobalTarget(DT, dtVariableIndex), integrator);
    for (int i = 0; i < (int) parameterNames.size(); i++) {
@@ -7733,7 +7710,7 @@ void CudaIntegrateCustomStepKernel::findExpressionsForDerivs(const ExpressionTre
            ;
        if (index == perDofEnergyParamDerivNames.size())
            perDofEnergyParamDerivNames.push_back(param);
-        variableNodes.push_back(make_pair(node, "energyParamDerivs["+cu.intToString(index)+"]"));
+        variableNodes.push_back(make_pair(node, "make_double3(energyParamDerivs["+cu.intToString(index)+"])"));
        needsEnergyParamDerivs = true;
    }
    else {
@@ -8022,49 +7999,46 @@ void CudaIntegrateCustomStepKernel::setGlobalVariables(ContextImpl& context, con
 }
 void CudaIntegrateCustomStepKernel::getPerDofVariable(ContextImpl& context, int variable, vector<Vec3>& values) const {
-    values.resize(perDofValues->getNumObjects()/3);
+    values.resize(perDofValues[variable].getSize());
    const vector<int>& order = cu.getAtomIndex();
    if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
-        if (!localValuesAreCurrent) {
+        if (!localValuesAreCurrent[variable]) {
-            perDofValues->getParameterValues(localPerDofValuesDouble);
+            perDofValues[variable].download(localPerDofValuesDouble[variable]);
-            localValuesAreCurrent = true;
+            localValuesAreCurrent[variable] = true;
+        }
+        for (int i = 0; i < (int) values.size(); i++) {
+            values[order[i]][0] = localPerDofValuesDouble[variable][i].x;
+            values[order[i]][1] = localPerDofValuesDouble[variable][i].y;
+            values[order[i]][2] = localPerDofValuesDouble[variable][i].z;
        }
-        for (int i = 0; i < (int) values.size(); i++)
-            for (int j = 0; j < 3; j++)
-                values[order[i]][j] = localPerDofValuesDouble[3*i+j][variable];
    }
    else {
-        if (!localValuesAreCurrent) {
+        if (!localValuesAreCurrent[variable]) {
-            perDofValues->getParameterValues(localPerDofValuesFloat);
+            perDofValues[variable].download(localPerDofValuesFloat[variable]);
-            localValuesAreCurrent = true;
+            localValuesAreCurrent[variable] = true;
+        }
+        for (int i = 0; i < (int) values.size(); i++) {
+            values[order[i]][0] = localPerDofValuesFloat[variable][i].x;
+            values[order[i]][1] = localPerDofValuesFloat[variable][i].y;
+            values[order[i]][2] = localPerDofValuesFloat[variable][i].z;
        }
-        for (int i = 0; i < (int) values.size(); i++)
-            for (int j = 0; j < 3; j++)
-                values[order[i]][j] = localPerDofValuesFloat[3*i+j][variable];
    }
 }
 void CudaIntegrateCustomStepKernel::setPerDofVariable(ContextImpl& context, int variable, const vector<Vec3>& values) {
    const vector<int>& order = cu.getAtomIndex();
+    localValuesAreCurrent[variable] = true;
+    deviceValuesAreCurrent[variable] = false;
    if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
-        if (!localValuesAreCurrent) {
+        localPerDofValuesDouble[variable].resize(values.size());
-            perDofValues->getParameterValues(localPerDofValuesDouble);
-            localValuesAreCurrent = true;
-        }
        for (int i = 0; i < (int) values.size(); i++)
-            for (int j = 0; j < 3; j++)
+            localPerDofValuesDouble[variable][i] = make_double4(values[order[i]][0], values[order[i]][1], values[order[i]][2], 0);
-                localPerDofValuesDouble[3*i+j][variable] = values[order[i]][j];
    }
    else {
-        if (!localValuesAreCurrent) {
+        localPerDofValuesFloat[variable].resize(values.size());
-            perDofValues->getParameterValues(localPerDofValuesFloat);
-            localValuesAreCurrent = true;
-        }
        for (int i = 0; i < (int) values.size(); i++)
-            for (int j = 0; j < 3; j++)
+            localPerDofValuesFloat[variable][i] = make_float4(values[order[i]][0], values[order[i]][1], values[order[i]][2], 0);
-                localPerDofValuesFloat[3*i+j][variable] = (float) values[order[i]][j];
    }
-    deviceValuesAreCurrent = false;
 }
 void CudaApplyAndersenThermostatKernel::initialize(const System& system, const AndersenThermostat& thermostat) {

--- a/platforms/cuda/src/kernels/customIntegratorPerDof.cu
+++ b/platforms/cuda/src/kernels/customIntegratorPerDof.cu
@@ -23,10 +23,14 @@ inline __device__ void storePos(real4* __restrict__ posq, real4* __restrict__ po
 #endif
 }
-inline __device__ double4 convertToDouble4(mixed4 a) {
+inline __device__ double4 convertToDouble4(float4 a) {
    return make_double4(a.x, a.y, a.z, a.w);
 }
+inline __device__ double4 convertToDouble4(double4 a) {
+    return a;
+}
 inline __device__ mixed4 convertFromDouble4(double4 a) {
    return make_mixed4(a.x, a.y, a.z, a.w);
 }
@@ -36,7 +40,7 @@ extern "C" __global__ void computePerDof(real4* __restrict__ posq, real4* __rest
        mixed* __restrict__ sum, const float4* __restrict__ gaussianValues, unsigned int gaussianBaseIndex, const float4* __restrict__ uniformValues,
        const mixed energy, mixed* __restrict__ energyParamDerivs
        PARAMETER_ARGUMENTS) {
-    mixed stepSize = dt[0].y;
+    double3 stepSize = make_double3(dt[0].y);
    int index = blockIdx.x*blockDim.x+threadIdx.x;
    const double forceScale = 1.0/0xFFFFFFFF;
    while (index < NUM_ATOMS) {

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -7592,26 +7592,15 @@ void OpenCLIntegrateCustomStepKernel::initialize(const System& system, const Cus
 string OpenCLIntegrateCustomStepKernel::createPerDofComputation(const string& variable, const Lepton::ParsedExpression& expr, CustomIntegrator& integrator,
        const string& forceName, const string& energyName, vector<const TabulatedFunction*>& functions, vector<pair<string, string> >& functionNames) {
    string tempType = (cl.getSupportsDoublePrecision() ? "double3" : "float3");
+    string convert = (cl.getSupportsDoublePrecision() ? "convert_double3" : "");
    map<string, Lepton::ParsedExpression> expressions;
-    if (variable == "x")
+    expressions[tempType+" tempResult = "] = expr;
-        expressions["position.xyz = "] = expr;
-    else if (variable == "v")
-        expressions["velocity.xyz = "] = expr;
-    else if (variable == "")
-        expressions[tempType+" tempSum = "] = expr;
-    else {
-        for (int i = 0; i < integrator.getNumPerDofVariables(); i++)
-            if (variable == integrator.getPerDofVariableName(i))
-                expressions["perDof"+cl.intToString(i)+" = "] = expr;
-    }
-    if (expressions.size() == 0)
-        throw OpenMMException("Unknown per-DOF variable: "+variable);
    map<string, string> variables;
-    variables["x"] = "position.xyz";
+    variables["x"] = convert+"(position.xyz)";
-    variables["v"] = "velocity.xyz";
+    variables["v"] = convert+"(velocity.xyz)";
-    variables[forceName] = "f.xyz";
+    variables[forceName] = convert+"(f.xyz)";
-    variables["gaussian"] = "convert_mixed4(gaussian).xyz";
+    variables["gaussian"] = convert+"(gaussian.xyz)";
-    variables["uniform"] = "convert_mixed4(uniform).xyz";
+    variables["uniform"] = convert+"(uniform.xyz)";
    variables["m"] = "mass";
    variables["dt"] = "stepSize";
    if (energyName != "")
@@ -7619,7 +7608,7 @@ string OpenCLIntegrateCustomStepKernel::createPerDofComputation(const string& va
    for (int i = 0; i < integrator.getNumGlobalVariables(); i++)
        variables[integrator.getGlobalVariableName(i)] = "globals["+cl.intToString(globalVariableIndex[i])+"]";
    for (int i = 0; i < integrator.getNumPerDofVariables(); i++)
-        variables[integrator.getPerDofVariableName(i)] = "perDof"+cl.intToString(i);
+        variables[integrator.getPerDofVariableName(i)] = convert+"(perDof"+cl.intToString(i)+")";
    for (int i = 0; i < (int) parameterNames.size(); i++)
        variables[parameterNames[i]] = "globals["+cl.intToString(parameterVariableIndex[i])+"]";
    vector<pair<ExpressionTreeNode, string> > variableNodes;
@@ -7627,8 +7616,19 @@ string OpenCLIntegrateCustomStepKernel::createPerDofComputation(const string& va
    for (auto& var : variables)
        variableNodes.push_back(make_pair(ExpressionTreeNode(new Operation::Variable(var.first)), var.second));
    string result = cl.getExpressionUtilities().createExpressions(expressions, variableNodes, functions, functionNames, "temp", tempType);
-    if (variable == "")
+    if (variable == "x")
-        result += "sum[index] = tempSum.x+tempSum.y+tempSum.z;\n";
+        result += "position.x = tempResult.x; position.y = tempResult.y; position.z = tempResult.z;\n";
+    else if (variable == "v")
+        result += "velocity.x = tempResult.x; velocity.y = tempResult.y; velocity.z = tempResult.z;\n";
+    else if (variable == "")
+        result += "sum[index] = tempResult.x+tempResult.y+tempResult.z;\n";
+    else {
+        for (int i = 0; i < integrator.getNumPerDofVariables(); i++)
+            if (variable == integrator.getPerDofVariableName(i)) {
+                string varName = "perDof"+cl.intToString(i);
+                result += varName+".x = tempResult.x; "+varName+".y = tempResult.y; "+varName+".z = tempResult.z;\n";
+            }
+    }
    return result;
 }
@@ -7637,7 +7637,7 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
    int numAtoms = cl.getNumAtoms();
    int numSteps = integrator.getNumComputations();
    bool useDouble = cl.getUseDoublePrecision() || cl.getUseMixedPrecision();
-    string tempType = (useDouble ? "double3" : "float3");
+    string tempType = (cl.getSupportsDoublePrecision() ? "double3" : "float3");
    string perDofType = (useDouble ? "double4" : "float4");
    if (!hasInitializedKernels) {
        hasInitializedKernels = true;
@@ -7849,7 +7849,7 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
                stringstream compute;
                for (int i = 0; i < perDofValues.size(); i++)
-                    compute << tempType<<" perDof"<<cl.intToString(i)<<" = perDofValues"<<cl.intToString(i)<<"[index].xyz;\n";
+                    compute << tempType<<" perDof"<<cl.intToString(i)<<" = convert_"<<tempType<<"(perDofValues"<<cl.intToString(i)<<"[index].xyz);\n";
                int numGaussian = 0, numUniform = 0;
                for (int j = step; j < numSteps && (j == step || merged[j]); j++) {
                    numGaussian += numAtoms*usesVariable(expression[j][0], "gaussian");
@@ -7874,7 +7874,7 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
                        compute << "velm[index] = convert_mixed4(velocity);\n";
                    else {
                        for (int i = 0; i < perDofValues.size(); i++)
-                            compute << "perDofValues"<<cl.intToString(i)<<"[index] = ("<<perDofType<<") (perDof"<<cl.intToString(i)<<", 0);\n";
+                            compute << "perDofValues"<<cl.intToString(i)<<"[index] = ("<<perDofType<<") (perDof"<<cl.intToString(i)<<".x, perDof"<<cl.intToString(i)<<".y, perDof"<<cl.intToString(i)<<".z, 0);\n";
                    }
                    if (numGaussian > 0)
                        compute << "gaussianIndex += NUM_ATOMS;\n";
@@ -7971,7 +7971,7 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
        stringstream computeKE;
        for (int i = 0; i < perDofValues.size(); i++)
-            computeKE << tempType<<" perDof"<<cl.intToString(i)<<" = perDofValues"<<cl.intToString(i)<<"[index].xyz;\n";
+            computeKE << tempType<<" perDof"<<cl.intToString(i)<<" = convert_"<<tempType<<"(perDofValues"<<cl.intToString(i)<<"[index].xyz);\n";
        Lepton::ParsedExpression keExpression = Lepton::Parser::parse(integrator.getKineticEnergyExpression()).optimize();
        computeKE << createPerDofComputation("", keExpression, integrator, "f", "", functionList, functionNames);
        map<string, string> replacements;
@@ -8005,7 +8005,7 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
            kineticEnergyKernel.setArg<cl_float>(index++, 0.0f);
        kineticEnergyKernel.setArg<cl::Buffer>(index++, perDofEnergyParamDerivs.getDeviceBuffer());
        for (auto& array : perDofValues)
-            kineticEnergyKernel.setArg<cl::Memory>(index++, array.getDeviceBuffer());
+            kineticEnergyKernel.setArg<cl::Buffer>(index++, array.getDeviceBuffer());
        for (auto& array : tabulatedFunctions)
            kineticEnergyKernel.setArg<cl::Buffer>(index++, array.getDeviceBuffer());
        keNeedsForce = usesVariable(keExpression, "f");