Code cleanup to CUDA platform

b441c4a7 · Peter Eastman · a8020adc · b441c4a7 · b441c4a7 · b441c4a7
Commit b441c4a7 authored Jul 03, 2018 by Peter Eastman
6 changed files
--- a/platforms/cuda/include/CudaArray.h
+++ b/platforms/cuda/include/CudaArray.h
@@ -130,7 +130,27 @@ public:
     * Copy the values in a vector to the device memory.
     */
    template <class T>
-    void upload(const std::vector<T>& data) {
+    void upload(const std::vector<T>& data, bool convert = true) {
+        if (convert && data.size() == size && sizeof(T) != elementSize) {
+            if (sizeof(T) == 2*elementSize) {
+                // Convert values from double to single precision.
+                const double* d = reinterpret_cast<const double*>(&data[0]);
+                std::vector<float> v(elementSize*size/sizeof(float));
+                for (int i = 0; i < v.size(); i++)
+                    v[i] = (float) d[i];
+                upload(&v[0], true);
+                return;
+            }
+            if (2*sizeof(T) == elementSize) {
+                // Convert values from single to double precision.
+                const float* d = reinterpret_cast<const float*>(&data[0]);
+                std::vector<double> v(elementSize*size/sizeof(double));
+                for (int i = 0; i < v.size(); i++)
+                    v[i] = (double) d[i];
+                upload(&v[0], true);
+                return;
+            }
+        }
        if (sizeof(T) != elementSize || data.size() != size)
            throw OpenMMException("Error uploading array "+name+": The specified vector does not match the size of the array");
        upload(&data[0], true);

--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -1592,10 +1592,8 @@ private:
    mutable std::vector<std::vector<double4> > localPerDofValuesDouble;
    std::map<std::string, double> energyParamDerivs;
    std::vector<std::string> perDofEnergyParamDerivNames;
-    std::vector<float> localPerDofEnergyParamDerivsFloat;
+    std::vector<double> localPerDofEnergyParamDerivs;
-    std::vector<double> localPerDofEnergyParamDerivsDouble;
+    std::vector<double> localGlobalValues;
-    std::vector<float> globalValuesFloat;
-    std::vector<double> globalValuesDouble;
    std::vector<double> initialGlobalVariables;
    std::vector<std::vector<CUfunction> > kernels;
    std::vector<std::vector<std::vector<void*> > > kernelArgs;

--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -728,58 +728,9 @@ string CudaContext::intToString(int value) const {
 }
 std::string CudaContext::getErrorString(CUresult result) {
-    switch (result) {
+    const char* message;
-        case CUDA_SUCCESS: return "CUDA_SUCCESS";
+    if (cuGetErrorName(result, &message) == CUDA_SUCCESS)
-        case CUDA_ERROR_INVALID_VALUE: return "CUDA_ERROR_INVALID_VALUE";
+        return string(message);
-        case CUDA_ERROR_OUT_OF_MEMORY: return "CUDA_ERROR_OUT_OF_MEMORY";
-        case CUDA_ERROR_NOT_INITIALIZED: return "CUDA_ERROR_NOT_INITIALIZED";
-        case CUDA_ERROR_DEINITIALIZED: return "CUDA_ERROR_DEINITIALIZED";
-        case CUDA_ERROR_PROFILER_DISABLED: return "CUDA_ERROR_PROFILER_DISABLED";
-        case CUDA_ERROR_PROFILER_NOT_INITIALIZED: return "CUDA_ERROR_PROFILER_NOT_INITIALIZED";
-        case CUDA_ERROR_PROFILER_ALREADY_STARTED: return "CUDA_ERROR_PROFILER_ALREADY_STARTED";
-        case CUDA_ERROR_PROFILER_ALREADY_STOPPED: return "CUDA_ERROR_PROFILER_ALREADY_STOPPED";
-        case CUDA_ERROR_NO_DEVICE: return "CUDA_ERROR_NO_DEVICE";
-        case CUDA_ERROR_INVALID_DEVICE: return "CUDA_ERROR_INVALID_DEVICE";
-        case CUDA_ERROR_INVALID_IMAGE: return "CUDA_ERROR_INVALID_IMAGE";
-        case CUDA_ERROR_INVALID_CONTEXT: return "CUDA_ERROR_INVALID_CONTEXT";
-        case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: return "CUDA_ERROR_CONTEXT_ALREADY_CURRENT";
-        case CUDA_ERROR_MAP_FAILED: return "CUDA_ERROR_MAP_FAILED";
-        case CUDA_ERROR_UNMAP_FAILED: return "CUDA_ERROR_UNMAP_FAILED";
-        case CUDA_ERROR_ARRAY_IS_MAPPED: return "CUDA_ERROR_ARRAY_IS_MAPPED";
-        case CUDA_ERROR_ALREADY_MAPPED: return "CUDA_ERROR_ALREADY_MAPPED";
-        case CUDA_ERROR_NO_BINARY_FOR_GPU: return "CUDA_ERROR_NO_BINARY_FOR_GPU";
-        case CUDA_ERROR_ALREADY_ACQUIRED: return "CUDA_ERROR_ALREADY_ACQUIRED";
-        case CUDA_ERROR_NOT_MAPPED: return "CUDA_ERROR_NOT_MAPPED";
-        case CUDA_ERROR_NOT_MAPPED_AS_ARRAY: return "CUDA_ERROR_NOT_MAPPED_AS_ARRAY";
-        case CUDA_ERROR_NOT_MAPPED_AS_POINTER: return "CUDA_ERROR_NOT_MAPPED_AS_POINTER";
-        case CUDA_ERROR_ECC_UNCORRECTABLE: return "CUDA_ERROR_ECC_UNCORRECTABLE";
-        case CUDA_ERROR_UNSUPPORTED_LIMIT: return "CUDA_ERROR_UNSUPPORTED_LIMIT";
-        case CUDA_ERROR_CONTEXT_ALREADY_IN_USE: return "CUDA_ERROR_CONTEXT_ALREADY_IN_USE";
-        case CUDA_ERROR_PEER_ACCESS_UNSUPPORTED: return "CUDA_ERROR_PEER_ACCESS_UNSUPPORTED";
-        case CUDA_ERROR_INVALID_SOURCE: return "CUDA_ERROR_INVALID_SOURCE";
-        case CUDA_ERROR_FILE_NOT_FOUND: return "CUDA_ERROR_FILE_NOT_FOUND";
-        case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: return "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND";
-        case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: return "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED";
-        case CUDA_ERROR_OPERATING_SYSTEM: return "CUDA_ERROR_OPERATING_SYSTEM";
-        case CUDA_ERROR_INVALID_HANDLE: return "CUDA_ERROR_INVALID_HANDLE";
-        case CUDA_ERROR_NOT_FOUND: return "CUDA_ERROR_NOT_FOUND";
-        case CUDA_ERROR_NOT_READY: return "CUDA_ERROR_NOT_READY";
-        case CUDA_ERROR_LAUNCH_FAILED: return "CUDA_ERROR_LAUNCH_FAILED";
-        case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES";
-        case CUDA_ERROR_LAUNCH_TIMEOUT: return "CUDA_ERROR_LAUNCH_TIMEOUT";
-        case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING";
-        case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED: return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED";
-        case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED: return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED";
-        case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE: return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE";
-        case CUDA_ERROR_CONTEXT_IS_DESTROYED: return "CUDA_ERROR_CONTEXT_IS_DESTROYED";
-        case CUDA_ERROR_ASSERT: return "CUDA_ERROR_ASSERT";
-        case CUDA_ERROR_TOO_MANY_PEERS: return "CUDA_ERROR_TOO_MANY_PEERS";
-        case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED: return "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED";
-        case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED: return "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED";
-        case CUDA_ERROR_NOT_PERMITTED: return "CUDA_ERROR_NOT_PERMITTED";
-        case CUDA_ERROR_NOT_SUPPORTED: return "CUDA_ERROR_NOT_SUPPORTED";
-        case CUDA_ERROR_UNKNOWN: return "CUDA_ERROR_UNKNOWN";
-    }
    return "CUDA error";
 }
@@ -886,18 +837,10 @@ double CudaContext::reduceEnergy() {
 void CudaContext::setCharges(const vector<double>& charges) {
    if (!chargeBuffer.isInitialized())
        chargeBuffer.initialize(*this, numAtoms, useDoublePrecision ? sizeof(double) : sizeof(float), "chargeBuffer");
-    if (getUseDoublePrecision()) {
+    vector<double> c(numAtoms);
-        vector<double> c(numAtoms);
+    for (int i = 0; i < numAtoms; i++)
-        for (int i = 0; i < numAtoms; i++)
+        c[i] = charges[i];
-            c[i] = charges[i];
+    chargeBuffer.upload(c, true);
-        chargeBuffer.upload(c);
-    }
-    else {
-        vector<float> c(numAtoms);
-        for (int i = 0; i < numAtoms; i++)
-            c[i] = (float) charges[i];
-        chargeBuffer.upload(c);
-    }
    void* args[] = {&chargeBuffer.getDevicePointer(), &posq.getDevicePointer(), &atomIndexDevice.getDevicePointer(), &numAtoms};
    executeKernel(setChargesKernel, args, numAtoms);
 }
@@ -916,32 +859,33 @@ public:
    VirtualSiteInfo(const System& system) {
        for (int i = 0; i < system.getNumParticles(); i++) {
            if (system.isVirtualSite(i)) {
-                siteTypes.push_back(&typeid(system.getVirtualSite(i)));
+                const VirtualSite& vsite = system.getVirtualSite(i);
+                siteTypes.push_back(&typeid(vsite));
                vector<int> particles;
                particles.push_back(i);
-                for (int j = 0; j < system.getVirtualSite(i).getNumParticles(); j++)
+                for (int j = 0; j < vsite.getNumParticles(); j++)
-                    particles.push_back(system.getVirtualSite(i).getParticle(j));
+                    particles.push_back(vsite.getParticle(j));
                siteParticles.push_back(particles);
                vector<double> weights;
-                if (dynamic_cast<const TwoParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
+                if (dynamic_cast<const TwoParticleAverageSite*>(&vsite) != NULL) {
                    // A two particle average.
-                    const TwoParticleAverageSite& site = dynamic_cast<const TwoParticleAverageSite&>(system.getVirtualSite(i));
+                    const TwoParticleAverageSite& site = dynamic_cast<const TwoParticleAverageSite&>(vsite);
                    weights.push_back(site.getWeight(0));
                    weights.push_back(site.getWeight(1));
                }
-                else if (dynamic_cast<const ThreeParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
+                else if (dynamic_cast<const ThreeParticleAverageSite*>(&vsite) != NULL) {
                    // A three particle average.
-                    const ThreeParticleAverageSite& site = dynamic_cast<const ThreeParticleAverageSite&>(system.getVirtualSite(i));
+                    const ThreeParticleAverageSite& site = dynamic_cast<const ThreeParticleAverageSite&>(vsite);
                    weights.push_back(site.getWeight(0));
                    weights.push_back(site.getWeight(1));
                    weights.push_back(site.getWeight(2));
                }
-                else if (dynamic_cast<const OutOfPlaneSite*>(&system.getVirtualSite(i)) != NULL) {
+                else if (dynamic_cast<const OutOfPlaneSite*>(&vsite) != NULL) {
                    // An out of plane site.
-                    const OutOfPlaneSite& site = dynamic_cast<const OutOfPlaneSite&>(system.getVirtualSite(i));
+                    const OutOfPlaneSite& site = dynamic_cast<const OutOfPlaneSite&>(vsite);
                    weights.push_back(site.getWeight12());
                    weights.push_back(site.getWeight13());
                    weights.push_back(site.getWeightCross());

--- a/platforms/cuda/src/CudaIntegrationUtilities.cpp
+++ b/platforms/cuda/src/CudaIntegrationUtilities.cpp
@@ -376,58 +376,31 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
        vector<int> atomConstraintsVec(ccmaAtomConstraints.getSize());
        vector<int> numAtomConstraintsVec(ccmaNumAtomConstraints.getSize());
        vector<int> constraintMatrixColumnVec(ccmaConstraintMatrixColumn.getSize());
-        if (context.getUseDoublePrecision() || context.getUseMixedPrecision()) {
+        int elementSize = (context.getUseDoublePrecision() || context.getUseMixedPrecision() ? sizeof(double) : sizeof(float));
-            ccmaDistance.initialize<double4>(context, numCCMA, "CcmaDistance");
+        ccmaDistance.initialize(context, numCCMA, 4*elementSize, "CcmaDistance");
-            ccmaDelta1.initialize<double>(context, numCCMA, "CcmaDelta1");
+        ccmaDelta1.initialize(context, numCCMA, elementSize, "CcmaDelta1");
-            ccmaDelta2.initialize<double>(context, numCCMA, "CcmaDelta2");
+        ccmaDelta2.initialize(context, numCCMA, elementSize, "CcmaDelta2");
-            ccmaReducedMass.initialize<double>(context, numCCMA, "CcmaReducedMass");
+        ccmaReducedMass.initialize(context, numCCMA, elementSize, "CcmaReducedMass");
-            ccmaConstraintMatrixValue.initialize<double>(context, numCCMA*maxRowElements, "ConstraintMatrixValue");
+        ccmaConstraintMatrixValue.initialize(context, numCCMA*maxRowElements, elementSize, "ConstraintMatrixValue");
-            vector<double4> distanceVec(ccmaDistance.getSize());
+        vector<double4> distanceVec(ccmaDistance.getSize());
-            vector<double> reducedMassVec(ccmaReducedMass.getSize());
+        vector<double> reducedMassVec(ccmaReducedMass.getSize());
-            vector<double> constraintMatrixValueVec(ccmaConstraintMatrixValue.getSize());
+        vector<double> constraintMatrixValueVec(ccmaConstraintMatrixValue.getSize());
-            for (int i = 0; i < numCCMA; i++) {
+        for (int i = 0; i < numCCMA; i++) {
-                int index = constraintOrder[i];
+            int index = constraintOrder[i];
-                int c = ccmaConstraints[index];
+            int c = ccmaConstraints[index];
-                atomsVec[i].x = atom1[c];
+            atomsVec[i].x = atom1[c];
-                atomsVec[i].y = atom2[c];
+            atomsVec[i].y = atom2[c];
-                distanceVec[i].w = distance[c];
+            distanceVec[i].w = distance[c];
-                reducedMassVec[i] = (0.5/(1.0/system.getParticleMass(atom1[c])+1.0/system.getParticleMass(atom2[c])));
+            reducedMassVec[i] = (0.5/(1.0/system.getParticleMass(atom1[c])+1.0/system.getParticleMass(atom2[c])));
-                for (unsigned int j = 0; j < matrix[index].size(); j++) {
+            for (unsigned int j = 0; j < matrix[index].size(); j++) {
-                    constraintMatrixColumnVec[i+j*numCCMA] = matrix[index][j].first;
+                constraintMatrixColumnVec[i+j*numCCMA] = matrix[index][j].first;
-                    constraintMatrixValueVec[i+j*numCCMA] = matrix[index][j].second;
+                constraintMatrixValueVec[i+j*numCCMA] = matrix[index][j].second;
-                }
-                constraintMatrixColumnVec[i+matrix[index].size()*numCCMA] = numCCMA;
-            }
-            ccmaDistance.upload(distanceVec);
-            ccmaReducedMass.upload(reducedMassVec);
-            ccmaConstraintMatrixValue.upload(constraintMatrixValueVec);
-        }
-        else {
-            ccmaDistance.initialize<float4>(context, numCCMA, "CcmaDistance");
-            ccmaDelta1.initialize<float>(context, numCCMA, "CcmaDelta1");
-            ccmaDelta2.initialize<float>(context, numCCMA, "CcmaDelta2");
-            ccmaReducedMass.initialize<float>(context, numCCMA, "CcmaReducedMass");
-            ccmaConstraintMatrixValue.initialize<float>(context, numCCMA*maxRowElements, "ConstraintMatrixValue");
-            vector<float4> distanceVec(ccmaDistance.getSize());
-            vector<float> reducedMassVec(ccmaReducedMass.getSize());
-            vector<float> constraintMatrixValueVec(ccmaConstraintMatrixValue.getSize());
-            for (int i = 0; i < numCCMA; i++) {
-                int index = constraintOrder[i];
-                int c = ccmaConstraints[index];
-                atomsVec[i].x = atom1[c];
-                atomsVec[i].y = atom2[c];
-                distanceVec[i].w = (float) distance[c];
-                reducedMassVec[i] = (float) (0.5/(1.0/system.getParticleMass(atom1[c])+1.0/system.getParticleMass(atom2[c])));
-                for (unsigned int j = 0; j < matrix[index].size(); j++) {
-                    constraintMatrixColumnVec[i+j*numCCMA] = matrix[index][j].first;
-                    constraintMatrixValueVec[i+j*numCCMA] = (float) matrix[index][j].second;
-                }
-                constraintMatrixColumnVec[i+matrix[index].size()*numCCMA] = numCCMA;
            }
-            ccmaDistance.upload(distanceVec);
+            constraintMatrixColumnVec[i+matrix[index].size()*numCCMA] = numCCMA;
-            ccmaReducedMass.upload(reducedMassVec);
-            ccmaConstraintMatrixValue.upload(constraintMatrixValueVec);
        }
+        ccmaDistance.upload(distanceVec, true);
+        ccmaReducedMass.upload(reducedMassVec, true);
+        ccmaConstraintMatrixValue.upload(constraintMatrixValueVec, true);
        for (unsigned int i = 0; i < atomConstraints.size(); i++) {
            numAtomConstraintsVec[i] = atomConstraints[i].size();
            for (unsigned int j = 0; j < atomConstraints[i].size(); j++) {
@@ -521,57 +494,21 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
        vsiteLocalCoordsAtoms.upload(vsiteLocalCoordsAtomVec);
        vsiteLocalCoordsStartIndex.upload(vsiteLocalCoordsStartVec);
    }
-    if (context.getUseDoublePrecision()) {
+    int elementSize = (context.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
-        vsite2AvgWeights.initialize<double2>(context, max(1, num2Avg), "vsite2AvgWeights");
+    vsite2AvgWeights.initialize(context, max(1, num2Avg), 2*elementSize, "vsite2AvgWeights");
-        vsite3AvgWeights.initialize<double4>(context, max(1, num3Avg), "vsite3AvgWeights");
+    vsite3AvgWeights.initialize(context, max(1, num3Avg), 4*elementSize, "vsite3AvgWeights");
-        vsiteOutOfPlaneWeights.initialize<double4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneWeights");
+    vsiteOutOfPlaneWeights.initialize(context, max(1, numOutOfPlane), 4*elementSize, "vsiteOutOfPlaneWeights");
-        vsiteLocalCoordsWeights.initialize<double>(context, max(1, (int) vsiteLocalCoordsWeightVec.size()), "vsiteLocalCoordsWeights");
+    vsiteLocalCoordsWeights.initialize(context, max(1, (int) vsiteLocalCoordsWeightVec.size()), elementSize, "vsiteLocalCoordsWeights");
-        vsiteLocalCoordsPos.initialize<double4>(context, max(1, (int) vsiteLocalCoordsPosVec.size()), "vsiteLocalCoordsPos");
+    vsiteLocalCoordsPos.initialize(context, max(1, (int) vsiteLocalCoordsPosVec.size()), 4*elementSize, "vsiteLocalCoordsPos");
-        if (num2Avg > 0)
+    if (num2Avg > 0)
-            vsite2AvgWeights.upload(vsite2AvgWeightVec);
+        vsite2AvgWeights.upload(vsite2AvgWeightVec, true);
-        if (num3Avg > 0)
+    if (num3Avg > 0)
-            vsite3AvgWeights.upload(vsite3AvgWeightVec);
+        vsite3AvgWeights.upload(vsite3AvgWeightVec, true);
-        if (numOutOfPlane > 0)
+    if (numOutOfPlane > 0)
-            vsiteOutOfPlaneWeights.upload(vsiteOutOfPlaneWeightVec);
+        vsiteOutOfPlaneWeights.upload(vsiteOutOfPlaneWeightVec, true);
-        if (numLocalCoords > 0) {
+    if (numLocalCoords > 0) {
-            vsiteLocalCoordsWeights.upload(vsiteLocalCoordsWeightVec);
+        vsiteLocalCoordsWeights.upload(vsiteLocalCoordsWeightVec, true);
-            vsiteLocalCoordsPos.upload(vsiteLocalCoordsPosVec);
+        vsiteLocalCoordsPos.upload(vsiteLocalCoordsPosVec, true);
-        }
-    }
-    else {
-        vsite2AvgWeights.initialize<float2>(context, max(1, num2Avg), "vsite2AvgWeights");
-        vsite3AvgWeights.initialize<float4>(context, max(1, num3Avg), "vsite3AvgWeights");
-        vsiteOutOfPlaneWeights.initialize<float4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneWeights");
-        vsiteLocalCoordsWeights.initialize<float>(context, max(1, (int) vsiteLocalCoordsWeightVec.size()), "vsiteLocalCoordsWeights");
-        vsiteLocalCoordsPos.initialize<float4>(context, max(1, (int) vsiteLocalCoordsPosVec.size()), "vsiteLocalCoordsPos");
-        if (num2Avg > 0) {
-            vector<float2> floatWeights(num2Avg);
-            for (int i = 0; i < num2Avg; i++)
-                floatWeights[i] = make_float2((float) vsite2AvgWeightVec[i].x, (float) vsite2AvgWeightVec[i].y);
-            vsite2AvgWeights.upload(floatWeights);
-        }
-        if (num3Avg > 0) {
-            vector<float4> floatWeights(num3Avg);
-            for (int i = 0; i < num3Avg; i++)
-                floatWeights[i] = make_float4((float) vsite3AvgWeightVec[i].x, (float) vsite3AvgWeightVec[i].y, (float) vsite3AvgWeightVec[i].z, 0.0f);
-            vsite3AvgWeights.upload(floatWeights);
-        }
-        if (numOutOfPlane > 0) {
-            vector<float4> floatWeights(numOutOfPlane);
-            for (int i = 0; i < numOutOfPlane; i++)
-                floatWeights[i] = make_float4((float) vsiteOutOfPlaneWeightVec[i].x, (float) vsiteOutOfPlaneWeightVec[i].y, (float) vsiteOutOfPlaneWeightVec[i].z, 0.0f);
-            vsiteOutOfPlaneWeights.upload(floatWeights);
-        }
-        if (numLocalCoords > 0) {
-            vector<float> floatWeights(vsiteLocalCoordsWeightVec.size());
-            for (int i = 0; i < (int) vsiteLocalCoordsWeightVec.size(); i++)
-                floatWeights[i] = (float) vsiteLocalCoordsWeightVec[i];
-            vsiteLocalCoordsWeights.upload(floatWeights);
-            vector<float4> floatPos(vsiteLocalCoordsPosVec.size());
-            for (int i = 0; i < (int) vsiteLocalCoordsPosVec.size(); i++)
-                floatPos[i] = make_float4((float) vsiteLocalCoordsPosVec[i].x, (float) vsiteLocalCoordsPosVec[i].y, (float) vsiteLocalCoordsPosVec[i].z, 0.0f);
-            vsiteLocalCoordsPos.upload(floatPos);
-        }
    }
    // Create the kernels used by this class.

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -1905,25 +1905,12 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
                        for (int i = 0; i < ndata; i++)
                            if (moduli[i] < 1.0e-7)
                                moduli[i] = (moduli[i-1]+moduli[i+1])*0.5;
-                        if (cu.getUseDoublePrecision()) {
+                        if (dim == 0)
-                            if (dim == 0)
+                            xmoduli->upload(moduli, true);
-                                xmoduli->upload(moduli);
+                        else if (dim == 1)
-                            else if (dim == 1)
+                            ymoduli->upload(moduli, true);
-                                ymoduli->upload(moduli);
+                        else
-                            else
+                            zmoduli->upload(moduli, true);
-                                zmoduli->upload(moduli);
-                        }
-                        else {
-                            vector<float> modulif(ndata);
-                            for (int i = 0; i < ndata; i++)
-                                modulif[i] = (float) moduli[i];
-                            if (dim == 0)
-                                xmoduli->upload(modulif);
-                            else if (dim == 1)
-                                ymoduli->upload(modulif);
-                            else
-                                zmoduli->upload(modulif);
-                        }
                    }
                }
            }
@@ -1941,14 +1928,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
        replacements["CHARGE2"] = "posq2.w";
    }
    else {
-        if (cu.getUseDoublePrecision())
+        charges.upload(chargeVec, true);
-            charges.upload(chargeVec);
-        else {
-            vector<float> c(charges.getSize());
-            for (int i = 0; i < c.size(); i++)
-                c[i] = (float) chargeVec[i];
-            charges.upload(c);
-        }
        replacements["CHARGE1"] = prefix+"charge1";
        replacements["CHARGE2"] = prefix+"charge2";
    }
@@ -2214,16 +2194,8 @@ void CudaCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context,
        sigmaEpsilonVector[i] = make_float2(0,0);
    if (usePosqCharges)
        cu.setCharges(chargeVector);
-    else {
+    else
-        if (cu.getUseDoublePrecision())
+        charges.upload(chargeVector, true);
-            charges.upload(chargeVector);
-        else {
-            vector<float> c(charges.getSize());
-            for (int i = 0; i < c.size(); i++)
-                c[i] = (float) chargeVector[i];
-            charges.upload(c);
-        }
-    }
    sigmaEpsilon.upload(sigmaEpsilonVector);
    // Record the exceptions.
@@ -2889,14 +2861,7 @@ void CudaCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOBCF
        chargeVec[i] = charge;
        paramsVector[i] = make_float2((float) radius, (float) (scalingFactor*radius));
    }
-    if (cu.getUseDoublePrecision())
+    charges.upload(chargeVec, true);
-        charges.upload(chargeVec);
-    else {
-        vector<float> c(charges.getSize());
-        for (int i = 0; i < c.size(); i++)
-            c[i] = (float) chargeVec[i];
-        charges.upload(c);
-    }
    params.upload(paramsVector);
    prefactor = -ONE_4PI_EPS0*((1.0/force.getSoluteDielectric())-(1.0/force.getSolventDielectric()));
    surfaceAreaFactor = -6.0*4*M_PI*force.getSurfaceAreaEnergy();
@@ -3044,14 +3009,7 @@ void CudaCalcGBSAOBCForceKernel::copyParametersToContext(ContextImpl& context, c
    }
    for (int i = numParticles; i < cu.getPaddedNumAtoms(); i++)
        paramsVector[i] = make_float2(1, 1);
-    if (cu.getUseDoublePrecision())
+    charges.upload(chargeVector, true);
-        charges.upload(chargeVector);
-    else {
-        vector<float> c(charges.getSize());
-        for (int i = 0; i < c.size(); i++)
-            c[i] = (float) chargeVector[i];
-        charges.upload(c);
-    }
    params.upload(paramsVector);
    // Mark that the current reordering may be invalid.
@@ -4781,8 +4739,7 @@ void CudaCalcCustomCentroidBondForceKernel::initialize(const System& system, con
    numGroups = force.getNumGroups();
    vector<int> groupParticleVec;
-    vector<float> groupWeightVecFloat;
+    vector<double> groupWeightVec;
-    vector<double> groupWeightVecDouble;
    vector<int> groupOffsetVec;
    groupOffsetVec.push_back(0);
    for (int i = 0; i < numGroups; i++) {
@@ -4794,27 +4751,19 @@ void CudaCalcCustomCentroidBondForceKernel::initialize(const System& system, con
    }
    vector<vector<double> > normalizedWeights;
    CustomCentroidBondForceImpl::computeNormalizedWeights(force, system, normalizedWeights);
-    if (cu.getUseDoublePrecision()) {
+    for (int i = 0; i < numGroups; i++)
-        for (int i = 0; i < numGroups; i++)
+        groupWeightVec.insert(groupWeightVec.end(), normalizedWeights[i].begin(), normalizedWeights[i].end());
-            groupWeightVecDouble.insert(groupWeightVecDouble.end(), normalizedWeights[i].begin(), normalizedWeights[i].end());
-    }
-    else {
-        for (int i = 0; i < numGroups; i++)
-            for (int j = 0; j < normalizedWeights[i].size(); j++)
-                groupWeightVecFloat.push_back((float) normalizedWeights[i][j]);
-    }
    groupParticles.initialize<int>(cu, groupParticleVec.size(), "groupParticles");
    groupParticles.upload(groupParticleVec);
    if (cu.getUseDoublePrecision()) {
        groupWeights.initialize<double>(cu, groupParticleVec.size(), "groupWeights");
-        groupWeights.upload(groupWeightVecDouble);
        centerPositions.initialize<double4>(cu, numGroups, "centerPositions");
    }
    else {
        groupWeights.initialize<float>(cu, groupParticleVec.size(), "groupWeights");
-        groupWeights.upload(groupWeightVecFloat);
        centerPositions.initialize<float4>(cu, numGroups, "centerPositions");
    }
+    groupWeights.upload(groupWeightVec, true);
    groupOffsets.initialize<int>(cu, groupOffsetVec.size(), "groupOffsets");
    groupOffsets.upload(groupOffsetVec);
    groupForces.initialize<long long>(cu, numGroups*3, "groupForces");
@@ -6736,18 +6685,10 @@ void CudaCalcRMSDForceKernel::recordParameters(const RMSDForce& force) {
    // Upload them to the device.
    particles.upload(particleVec);
-    if (cu.getUseDoublePrecision()) {
+    vector<double4> pos;
-        vector<double4> pos;
+    for (Vec3 p : centeredPositions)
-        for (Vec3 p : centeredPositions)
+        pos.push_back(make_double4(p[0], p[1], p[2], 0));
-            pos.push_back(make_double4(p[0], p[1], p[2], 0));
+    referencePos.upload(pos, true);
-        referencePos.upload(pos);
-    }
-    else {
-        vector<float4> pos;
-        for (Vec3 p : centeredPositions)
-            pos.push_back(make_float4(p[0], p[1], p[2], 0));
-        referencePos.upload(pos);
-    }
    // Record the sum of the norms of the reference positions.
@@ -6930,20 +6871,11 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev
        double vscale = exp(-stepSize*friction);
        double fscale = (friction == 0 ? stepSize : (1-vscale)/friction);
        double noisescale = sqrt(kT*(1-vscale*vscale));
-        if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
+        vector<double> p(params.getSize());
-            vector<double> p(params.getSize());
+        p[0] = vscale;
-            p[0] = vscale;
+        p[1] = fscale;
-            p[1] = fscale;
+        p[2] = noisescale;
-            p[2] = noisescale;
+        params.upload(p, true);
-            params.upload(p);
-        }
-        else {
-            vector<float> p(params.getSize());
-            p[0] = (float) vscale;
-            p[1] = (float) fscale;
-            p[2] = (float) noisescale;
-            params.upload(p);
-        }
        prevTemp = temperature;
        prevFriction = friction;
        prevStepSize = stepSize;
@@ -7442,22 +7374,20 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
        // Allocate space for storing global values, both on the host and the device.
-        globalValuesFloat.resize(expressionSet.getNumVariables());
+        localGlobalValues.resize(expressionSet.getNumVariables());
-        globalValuesDouble.resize(expressionSet.getNumVariables());
        int elementSize = (cu.getUseDoublePrecision() || cu.getUseMixedPrecision() ? sizeof(double) : sizeof(float));
        globalValues.initialize(cu, expressionSet.getNumVariables(), elementSize, "globalValues");
        for (int i = 0; i < integrator.getNumGlobalVariables(); i++) {
-            globalValuesDouble[globalVariableIndex[i]] = initialGlobalVariables[i];
+            localGlobalValues[globalVariableIndex[i]] = initialGlobalVariables[i];
            expressionSet.setVariable(globalVariableIndex[i], initialGlobalVariables[i]);
        }
        for (int i = 0; i < (int) parameterVariableIndex.size(); i++) {
            double value = context.getParameter(parameterNames[i]);
-            globalValuesDouble[parameterVariableIndex[i]] = value;
+            localGlobalValues[parameterVariableIndex[i]] = value;
            expressionSet.setVariable(parameterVariableIndex[i], value);
        }
        int numContextParams = context.getParameters().size();
-        localPerDofEnergyParamDerivsFloat.resize(numContextParams);
+        localPerDofEnergyParamDerivs.resize(numContextParams);
-        localPerDofEnergyParamDerivsDouble.resize(numContextParams);
        perDofEnergyParamDerivs.initialize(cu, max(1, numContextParams), elementSize, "perDofEnergyParamDerivs");
        // Record information about the targets of steps that will be stored in global variables.
@@ -7732,8 +7662,8 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
    recordGlobalValue(stepSize, GlobalTarget(DT, dtVariableIndex), integrator);
    for (int i = 0; i < (int) parameterNames.size(); i++) {
        double value = context.getParameter(parameterNames[i]);
-        if (value != globalValuesDouble[parameterVariableIndex[i]]) {
+        if (value != localGlobalValues[parameterVariableIndex[i]]) {
-            globalValuesDouble[parameterVariableIndex[i]] = value;
+            localGlobalValues[parameterVariableIndex[i]] = value;
            deviceGlobalsAreCurrent = false;
        }
    }
@@ -7830,16 +7760,9 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
                if (needsEnergyParamDerivs) {
                    context.getEnergyParameterDerivatives(energyParamDerivs);
                    if (perDofEnergyParamDerivNames.size() > 0) {
-                        if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
+                        for (int i = 0; i < perDofEnergyParamDerivNames.size(); i++)
-                            for (int i = 0; i < perDofEnergyParamDerivNames.size(); i++)
+                            localPerDofEnergyParamDerivs[i] = energyParamDerivs[perDofEnergyParamDerivNames[i]];
-                                localPerDofEnergyParamDerivsDouble[i] = energyParamDerivs[perDofEnergyParamDerivNames[i]];
+                        perDofEnergyParamDerivs.upload(localPerDofEnergyParamDerivs, true);
-                            perDofEnergyParamDerivs.upload(localPerDofEnergyParamDerivsDouble);
-                        }
-                        else {
-                            for (int i = 0; i < perDofEnergyParamDerivNames.size(); i++)
-                                localPerDofEnergyParamDerivsFloat[i] = (float) energyParamDerivs[perDofEnergyParamDerivNames[i]];
-                            perDofEnergyParamDerivs.upload(localPerDofEnergyParamDerivsFloat);
-                        }
                    }
                }
            }
@@ -7852,13 +7775,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
        if (needsGlobals[step] && !deviceGlobalsAreCurrent) {
            // Upload the global values to the device.
-            if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision())
+            globalValues.upload(localGlobalValues, true);
-                globalValues.upload(globalValuesDouble);
-            else {
-                for (int j = 0; j < (int) globalValuesDouble.size(); j++)
-                    globalValuesFloat[j] = (float) globalValuesDouble[j];
-                globalValues.upload(globalValuesFloat);
-            }
        }
        bool stepInvalidatesForces = invalidatesForces[step];
        if (stepType[step] == CustomIntegrator::ComputePerDof && !merged[step]) {
@@ -8005,17 +7922,17 @@ double CudaIntegrateCustomStepKernel::computeKineticEnergy(ContextImpl& context,
 void CudaIntegrateCustomStepKernel::recordGlobalValue(double value, GlobalTarget target, CustomIntegrator& integrator) {
    switch (target.type) {
        case DT:
-            if (value != globalValuesDouble[dtVariableIndex])
+            if (value != localGlobalValues[dtVariableIndex])
                deviceGlobalsAreCurrent = false;
            expressionSet.setVariable(dtVariableIndex, value);
-            globalValuesDouble[dtVariableIndex] = value;
+            localGlobalValues[dtVariableIndex] = value;
            cu.getIntegrationUtilities().setNextStepSize(value);
            integrator.setStepSize(value);
            break;
        case VARIABLE:
        case PARAMETER:
            expressionSet.setVariable(target.variableIndex, value);
-            globalValuesDouble[target.variableIndex] = value;
+            localGlobalValues[target.variableIndex] = value;
            deviceGlobalsAreCurrent = false;
            break;
    }
@@ -8026,8 +7943,8 @@ void CudaIntegrateCustomStepKernel::recordChangedParameters(ContextImpl& context
        return;
    for (int i = 0; i < (int) parameterNames.size(); i++) {
        double value = context.getParameter(parameterNames[i]);
-        if (value != globalValuesDouble[parameterVariableIndex[i]])
+        if (value != localGlobalValues[parameterVariableIndex[i]])
-            context.setParameter(parameterNames[i], globalValuesDouble[parameterVariableIndex[i]]);
+            context.setParameter(parameterNames[i], localGlobalValues[parameterVariableIndex[i]]);
    }
 }
@@ -8040,7 +7957,7 @@ void CudaIntegrateCustomStepKernel::getGlobalVariables(ContextImpl& context, vec
    }
    values.resize(numGlobalVariables);
    for (int i = 0; i < numGlobalVariables; i++)
-        values[i] = globalValuesDouble[globalVariableIndex[i]];
+        values[i] = localGlobalValues[globalVariableIndex[i]];
 }
 void CudaIntegrateCustomStepKernel::setGlobalVariables(ContextImpl& context, const vector<double>& values) {
@@ -8053,7 +7970,7 @@ void CudaIntegrateCustomStepKernel::setGlobalVariables(ContextImpl& context, con
        return;
    }
    for (int i = 0; i < numGlobalVariables; i++) {
-        globalValuesDouble[globalVariableIndex[i]] = values[i];
+        localGlobalValues[globalVariableIndex[i]] = values[i];
        expressionSet.setVariable(globalVariableIndex[i], values[i]);
    }
    deviceGlobalsAreCurrent = false;

--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -794,32 +794,33 @@ public:
    VirtualSiteInfo(const System& system) : OpenCLForceInfo(0) {
        for (int i = 0; i < system.getNumParticles(); i++) {
            if (system.isVirtualSite(i)) {
-                siteTypes.push_back(&typeid(system.getVirtualSite(i)));
+                const VirtualSite& vsite = system.getVirtualSite(i);
+                siteTypes.push_back(&typeid(vsite));
                vector<int> particles;
                particles.push_back(i);
-                for (int j = 0; j < system.getVirtualSite(i).getNumParticles(); j++)
+                for (int j = 0; j < vsite.getNumParticles(); j++)
-                    particles.push_back(system.getVirtualSite(i).getParticle(j));
+                    particles.push_back(vsite.getParticle(j));
                siteParticles.push_back(particles);
                vector<double> weights;
-                if (dynamic_cast<const TwoParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
+                if (dynamic_cast<const TwoParticleAverageSite*>(&vsite) != NULL) {
                    // A two particle average.
-                    const TwoParticleAverageSite& site = dynamic_cast<const TwoParticleAverageSite&>(system.getVirtualSite(i));
+                    const TwoParticleAverageSite& site = dynamic_cast<const TwoParticleAverageSite&>(vsite);
                    weights.push_back(site.getWeight(0));
                    weights.push_back(site.getWeight(1));
                }
-                else if (dynamic_cast<const ThreeParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
+                else if (dynamic_cast<const ThreeParticleAverageSite*>(&vsite) != NULL) {
                    // A three particle average.
-                    const ThreeParticleAverageSite& site = dynamic_cast<const ThreeParticleAverageSite&>(system.getVirtualSite(i));
+                    const ThreeParticleAverageSite& site = dynamic_cast<const ThreeParticleAverageSite&>(vsite);
                    weights.push_back(site.getWeight(0));
                    weights.push_back(site.getWeight(1));
                    weights.push_back(site.getWeight(2));
                }
-                else if (dynamic_cast<const OutOfPlaneSite*>(&system.getVirtualSite(i)) != NULL) {
+                else if (dynamic_cast<const OutOfPlaneSite*>(&vsite) != NULL) {
                    // An out of plane site.
-                    const OutOfPlaneSite& site = dynamic_cast<const OutOfPlaneSite&>(system.getVirtualSite(i));
+                    const OutOfPlaneSite& site = dynamic_cast<const OutOfPlaneSite&>(vsite);
                    weights.push_back(site.getWeight12());
                    weights.push_back(site.getWeight13());
                    weights.push_back(site.getWeightCross());