Merge github.com:SimTk/openmm

19d2885a · Lee-Ping · 99ef4344 · 57a6768e · 19d2885a · 19d2885a
Commit 19d2885a authored Jan 23, 2014 by Lee-Ping
20 changed files
--- a/platforms/cpu/tests/TestCpuSettle.cpp
+++ b/platforms/cpu/tests/TestCpuSettle.cpp
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2013 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+/**
+ * This tests the CPU implementation of the SETTLE algorithm.
+ */
+
+#include "openmm/internal/AssertionUtilities.h"
+#include "openmm/Context.h"
+#include "CpuPlatform.h"
+#include "openmm/NonbondedForce.h"
+#include "openmm/System.h"
+#include "openmm/LangevinIntegrator.h"
+#include "sfmt/SFMT.h"
+#include <iostream>
+#include <vector>
+
+using namespace OpenMM;
+using namespace std;
+
+void testConstraints() {
+    const int numMolecules = 10;
+    const int numParticles = numMolecules*3;
+    const int numConstraints = numMolecules*3;
+    const double temp = 100.0;
+    CpuPlatform platform;
+    System system;
+    LangevinIntegrator integrator(temp, 2.0, 0.001);
+    integrator.setConstraintTolerance(1e-5);
+    NonbondedForce* forceField = new NonbondedForce();
+    for (int i = 0; i < numMolecules; ++i) {
+        system.addParticle(16.0);
+        system.addParticle(1.0);
+        system.addParticle(1.0);
+        forceField->addParticle(-0.82, 0.317, 0.65);
+        forceField->addParticle(0.41, 1.0, 0.0);
+        forceField->addParticle(0.41, 1.0, 0.0);
+        system.addConstraint(i*3, i*3+1, 0.1);
+        system.addConstraint(i*3, i*3+2, 0.1);
+        system.addConstraint(i*3+1, i*3+2, 0.163);
+    }
+    system.addForce(forceField);
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(numParticles);
+    vector<Vec3> velocities(numParticles);
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+
+    for (int i = 0; i < numMolecules; ++i) {
+        positions[i*3] = Vec3((i%4)*0.4, (i/4)*0.4, 0);
+        positions[i*3+1] = positions[i*3]+Vec3(0.1, 0, 0);
+        positions[i*3+2] = positions[i*3]+Vec3(-0.03333, 0.09428, 0);
+        velocities[i*3] = Vec3(genrand_real2(sfmt)-0.5, genrand_real2(sfmt)-0.5, genrand_real2(sfmt)-0.5);
+        velocities[i*3+1] = Vec3(genrand_real2(sfmt)-0.5, genrand_real2(sfmt)-0.5, genrand_real2(sfmt)-0.5);
+        velocities[i*3+2] = Vec3(genrand_real2(sfmt)-0.5, genrand_real2(sfmt)-0.5, genrand_real2(sfmt)-0.5);
+    }
+    context.setPositions(positions);
+    context.setVelocities(velocities);
+
+    // Simulate it and see whether the constraints remain satisfied.
+
+    for (int i = 0; i < 1000; ++i) {
+        integrator.step(1);
+        State state = context.getState(State::Positions | State::Forces);
+        for (int j = 0; j < numConstraints; ++j) {
+            int particle1, particle2;
+            double distance;
+            system.getConstraintParameters(j, particle1, particle2, distance);
+            Vec3 p1 = state.getPositions()[particle1];
+            Vec3 p2 = state.getPositions()[particle2];
+            double dist = std::sqrt((p1[0]-p2[0])*(p1[0]-p2[0])+(p1[1]-p2[1])*(p1[1]-p2[1])+(p1[2]-p2[2])*(p1[2]-p2[2]));
+            ASSERT_EQUAL_TOL(distance, dist, 1e-5);
+        }
+    }
+}
+
+int main(int argc, char* argv[]) {
+    try {
+        if (!CpuPlatform::isProcessorSupported()) {
+            cout << "CPU is not supported.  Exiting." << endl;
+            return 0;
+        }
+        testConstraints();
+    }
+    catch(const exception& e) {
+        cout << "exception: " << e.what() << endl;
+        return 1;
+    }
+    cout << "Done" << endl;
+    return 0;
+}
--- a/platforms/cuda/CMakeLists.txt
+++ b/platforms/cuda/CMakeLists.txt
@@ -14,10 +14,6 @@
 #   libOpenMMCUDA_static[_d].a
 #----------------------------------------------------

-IF (APPLE)
-    SET (CMAKE_OSX_DEPLOYMENT_TARGET "10.6")
-ENDIF (APPLE)
-
 set(OPENMM_BUILD_CUDA_TESTS TRUE CACHE BOOL "Whether to build CUDA test cases")
 if(OPENMM_BUILD_CUDA_TESTS)
    SUBDIRS (tests)
@@ -39,9 +35,9 @@ SET(STATIC_TARGET ${OPENMMCUDA_LIBRARY_NAME}_static)

 # Ensure that debug libraries have "_d" appended to their names.
 # CMake gets this right on Windows automatically with this definition.
-IF (${CMAKE_GENERATOR} MATCHES "Visual Studio")
+IF (MSVC)
    SET(CMAKE_DEBUG_POSTFIX "_d" CACHE INTERNAL "" FORCE)
-ENDIF (${CMAKE_GENERATOR} MATCHES "Visual Studio")
+ENDIF (MSVC)

 # But on Unix or Cygwin we have to add the suffix manually
 IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)

--- a/platforms/cuda/include/CudaContext.h
+++ b/platforms/cuda/include/CudaContext.h
@@ -75,7 +75,7 @@ public:
    static const int ThreadBlockSize;
    static const int TileSize;
    CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const std::string& precision,
-            const std::string& compiler, const std::string& tempDir, CudaPlatform::PlatformData& platformData);
+            const std::string& compiler, const std::string& tempDir, const std::string& hostCompiler, CudaPlatform::PlatformData& platformData);
    ~CudaContext();
    /**
     * This is called to initialize internal data structures after all Forces in the system
@@ -623,6 +623,8 @@ public:
 */
 class CudaContext::ForcePreComputation {
 public:
+    virtual ~ForcePreComputation() {
+    }
    /**
     * @param includeForce  true if forces should be computed
     * @param includeEnergy true if potential energy should be computed
@@ -639,6 +641,8 @@ public:
 */
 class CudaContext::ForcePostComputation {
 public:
+    virtual ~ForcePostComputation() {
+    }
    /**
     * @param includeForce  true if forces should be computed
     * @param includeEnergy true if potential energy should be computed

--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -738,7 +738,7 @@ private:
 class CudaCalcCustomGBForceKernel : public CalcCustomGBForceKernel {
 public:
    CudaCalcCustomGBForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcCustomGBForceKernel(name, platform),
-            hasInitializedKernels(false), cu(cu), params(NULL), computedValues(NULL), energyDerivs(NULL), longEnergyDerivs(NULL), globals(NULL),
+            hasInitializedKernels(false), cu(cu), params(NULL), computedValues(NULL), energyDerivs(NULL), energyDerivChain(NULL), longEnergyDerivs(NULL), globals(NULL),
            valueBuffers(NULL), tabulatedFunctionParams(NULL), system(system) {
    }
    ~CudaCalcCustomGBForceKernel();
@@ -772,6 +772,7 @@ private:
    CudaParameterSet* params;
    CudaParameterSet* computedValues;
    CudaParameterSet* energyDerivs;
+    CudaParameterSet* energyDerivChain;
    CudaArray* longEnergyDerivs;
    CudaArray* globals;
    CudaArray* valueBuffers;

--- a/platforms/cuda/include/CudaPlatform.h
+++ b/platforms/cuda/include/CudaPlatform.h
@@ -95,6 +95,13 @@ public:
        static const std::string key = "CudaCompiler";
        return key;
    }
+    /**
+     * This is the name of the parameter for specifying the host compiler for the CUDA compiler to use.
+     */
+    static const std::string& CudaHostCompiler() {
+        static const std::string key = "CudaHostCompiler";
+        return key;
+    }
    /**
     * This is the name of the parameter for specifying the path to the directory for creating temporary files.
     */
@@ -107,7 +114,7 @@ public:
 class OPENMM_EXPORT_CUDA CudaPlatform::PlatformData {
 public:
    PlatformData(ContextImpl* context, const System& system, const std::string& deviceIndexProperty, const std::string& blockingProperty, const std::string& precisionProperty,
-            const std::string& cpuPmeProperty, const std::string& compilerProperty, const std::string& tempProperty);
+            const std::string& cpuPmeProperty, const std::string& compilerProperty, const std::string& tempProperty, const std::string& hostCompilerProperty);
    ~PlatformData();
    void initializeContexts(const System& system);
    void syncContexts();

--- a/platforms/cuda/sharedTarget/CMakeLists.txt
+++ b/platforms/cuda/sharedTarget/CMakeLists.txt
@@ -19,6 +19,11 @@ ELSE (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
    SET(MAIN_OPENMM_LIB ${OPENMM_LIBRARY_NAME})
 ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
 TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${MAIN_OPENMM_LIB} ${CUDA_CUDA_LIBRARY} ${CUDA_cufft_LIBRARY} ${PTHREADS_LIB})
-SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "-DOPENMM_CUDA_BUILDING_SHARED_LIBRARY")
+SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_CUDA_BUILDING_SHARED_LIBRARY")
+IF (APPLE)
+    SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS} -F/Library/Frameworks -framework CUDA")
+ELSE (APPLE)
+    SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}")
+ENDIF (APPLE)

 INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET})
--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -72,9 +72,12 @@ const int CudaContext::TileSize = sizeof(tileflags)*8;
 bool CudaContext::hasInitializedCuda = false;

 CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler,
-        const string& tempDir, CudaPlatform::PlatformData& platformData) : system(system), compiler(compiler),
+        const string& tempDir, const std::string& hostCompiler, CudaPlatform::PlatformData& platformData) : system(system),
        time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), stepsSinceReorder(99999), contextIsValid(false), atomsWereReordered(false), pinnedBuffer(NULL), posq(NULL),
        posqCorrection(NULL), velm(NULL), force(NULL), energyBuffer(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
+    this->compiler = "\""+compiler+"\"";
+    if (hostCompiler.size() > 0)
+        this->compiler = compiler+" --compiler-bindir "+hostCompiler;
    if (!hasInitializedCuda) {
        CHECK_RESULT2(cuInit(0), "Error initializing CUDA");
        hasInitializedCuda = true;
@@ -153,9 +156,6 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
    int numThreadBlocksPerComputeUnit = 6;
    numThreadBlocks = numThreadBlocksPerComputeUnit*multiprocessors;
-    bonded = new CudaBondedUtilities(*this);
-    nonbonded = new CudaNonbondedUtilities(*this);
-    int numEnergyBuffers = max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers());
    if (useDoublePrecision) {
        posq = CudaArray::create<double4>(*this, paddedNumAtoms, "posq");
        velm = CudaArray::create<double4>(*this, paddedNumAtoms, "velm");
@@ -166,9 +166,6 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
        compilationDefines["make_mixed2"] = "make_double2";
        compilationDefines["make_mixed3"] = "make_double3";
        compilationDefines["make_mixed4"] = "make_double4";
-        energyBuffer = CudaArray::create<double>(*this, numEnergyBuffers, "energyBuffer");
-        int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
-        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
    }
    else if (useMixedPrecision) {
        posq = CudaArray::create<float4>(*this, paddedNumAtoms, "posq");
@@ -181,9 +178,6 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
        compilationDefines["make_mixed2"] = "make_double2";
        compilationDefines["make_mixed3"] = "make_double3";
        compilationDefines["make_mixed4"] = "make_double4";
-        energyBuffer = CudaArray::create<float>(*this, numEnergyBuffers, "energyBuffer");
-        int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
-        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
    }
    else {
        posq = CudaArray::create<float4>(*this, paddedNumAtoms, "posq");
@@ -194,9 +188,6 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
        compilationDefines["make_mixed2"] = "make_float2";
        compilationDefines["make_mixed3"] = "make_float3";
        compilationDefines["make_mixed4"] = "make_float4";
-        energyBuffer = CudaArray::create<float>(*this, numEnergyBuffers, "energyBuffer");
-        int pinnedBufferSize = max(paddedNumAtoms*6, numEnergyBuffers);
-        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(float), 0));
    }
    posCellOffsets.resize(paddedNumAtoms, make_int4(0, 0, 0, 0));

@@ -233,6 +224,8 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    
    // Create utilities objects.
    
+    bonded = new CudaBondedUtilities(*this);
+    nonbonded = new CudaNonbondedUtilities(*this);
    integration = new CudaIntegrationUtilities(*this, system);
    expression = new CudaExpressionUtilities(*this);
 }
@@ -280,6 +273,22 @@ CudaContext::~CudaContext() {
 void CudaContext::initialize() {
    cuCtxSetCurrent(context);
    string errorMessage = "Error initializing Context";
+    int numEnergyBuffers = max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers());
+    if (useDoublePrecision) {
+        energyBuffer = CudaArray::create<double>(*this, numEnergyBuffers, "energyBuffer");
+        int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
+        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
+    }
+    else if (useMixedPrecision) {
+        energyBuffer = CudaArray::create<float>(*this, numEnergyBuffers, "energyBuffer");
+        int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
+        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
+    }
+    else {
+        energyBuffer = CudaArray::create<float>(*this, numEnergyBuffers, "energyBuffer");
+        int pinnedBufferSize = max(paddedNumAtoms*6, numEnergyBuffers);
+        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(float), 0));
+    }
    for (int i = 0; i < numAtoms; i++) {
        double mass = system.getParticleMass(i);
        if (useDoublePrecision || useMixedPrecision)
@@ -441,13 +450,13 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
    out.close();
 #ifdef WIN32
 #ifdef _DEBUG
-    string command = "\""+compiler+"\" --ptx -G -g --machine "+bits+" -arch=sm_"+gpuArchitecture+" -o "+outputFile+" "+options+" "+inputFile+" 2> "+logFile;
+    string command = compiler+" --ptx -G -g --machine "+bits+" -arch=sm_"+gpuArchitecture+" -o "+outputFile+" "+options+" "+inputFile+" 2> "+logFile;
 #else
-    string command = "\""+compiler+"\" --ptx -lineinfo --machine "+bits+" -arch=sm_"+gpuArchitecture+" -o "+outputFile+" "+options+" "+inputFile+" 2> "+logFile;
+    string command = compiler+" --ptx -lineinfo --machine "+bits+" -arch=sm_"+gpuArchitecture+" -o "+outputFile+" "+options+" "+inputFile+" 2> "+logFile;
 #endif
    int res = compileInWindows(command);
 #else
-    string command = "\""+compiler+"\" --ptx --machine "+bits+" -arch=sm_"+gpuArchitecture+" -o \""+outputFile+"\" "+options+" \""+inputFile+"\" 2> \""+logFile+"\"";
+    string command = compiler+" --ptx --machine "+bits+" -arch=sm_"+gpuArchitecture+" -o \""+outputFile+"\" "+options+" \""+inputFile+"\" 2> \""+logFile+"\"";
    int res = std::system(command.c_str());
 #endif
    try {

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -1460,8 +1460,9 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
    int numParticles = force.getNumParticles();
    sigmaEpsilon = CudaArray::create<float2>(cu, cu.getPaddedNumAtoms(), "sigmaEpsilon");
    CudaArray& posq = cu.getPosq();
-    float4* posqf = (float4*) cu.getPinnedBuffer();
-    double4* posqd = (double4*) cu.getPinnedBuffer();
+    vector<double4> temp(posq.getSize());
+    float4* posqf = (float4*) &temp[0];
+    double4* posqd = (double4*) &temp[0];
    vector<float2> sigmaEpsilonVector(cu.getPaddedNumAtoms(), make_float2(0, 0));
    vector<vector<int> > exclusionList(numParticles);
    double sumSquaredCharges = 0.0;
@@ -1486,7 +1487,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
        exclusionList[exclusions[i].first].push_back(exclusions[i].second);
        exclusionList[exclusions[i].second].push_back(exclusions[i].first);
    }
-    posq.upload(cu.getPinnedBuffer());
+    posq.upload(&temp[0]);
    sigmaEpsilon->upload(sigmaEpsilonVector);
    bool useCutoff = (force.getNonbondedMethod() != NonbondedForce::NoCutoff);
    bool usePeriodic = (force.getNonbondedMethod() != NonbondedForce::NoCutoff && force.getNonbondedMethod() != NonbondedForce::CutoffNonPeriodic);
@@ -2410,8 +2411,9 @@ void CudaCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOBCF
    cu.addAutoclearBuffer(*bornSum);
    cu.addAutoclearBuffer(*bornForce);
    CudaArray& posq = cu.getPosq();
-    float4* posqf = (float4*) cu.getPinnedBuffer();
-    double4* posqd = (double4*) cu.getPinnedBuffer();
+    vector<double4> temp(posq.getSize());
+    float4* posqf = (float4*) &temp[0];
+    double4* posqd = (double4*) &temp[0];
    vector<float2> paramsVector(cu.getPaddedNumAtoms(), make_float2(1, 1));
    const double dielectricOffset = 0.009;
    for (int i = 0; i < force.getNumParticles(); i++) {
@@ -2424,7 +2426,7 @@ void CudaCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOBCF
        else
            posqf[i] = make_float4(0, 0, 0, (float) charge);
    }
-    posq.upload(cu.getPinnedBuffer());
+    posq.upload(&temp[0]);
    params->upload(paramsVector);
    prefactor = -ONE_4PI_EPS0*((1.0/force.getSoluteDielectric())-(1.0/force.getSolventDielectric()));
    bool useCutoff = (force.getNonbondedMethod() != GBSAOBCForce::NoCutoff);
@@ -2600,6 +2602,8 @@ CudaCalcCustomGBForceKernel::~CudaCalcCustomGBForceKernel() {
        delete computedValues;
    if (energyDerivs != NULL)
        delete energyDerivs;
+    if (energyDerivChain != NULL)
+        delete energyDerivChain;
    if (longEnergyDerivs != NULL)
        delete longEnergyDerivs;
    if (globals != NULL)
@@ -2743,6 +2747,7 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
    }
    longEnergyDerivs = CudaArray::create<long long>(cu, force.getNumComputedValues()*cu.getPaddedNumAtoms(), "customGBLongEnergyDerivatives");
    energyDerivs = new CudaParameterSet(cu, force.getNumComputedValues(), cu.getPaddedNumAtoms(), "customGBEnergyDerivatives", true);
+    energyDerivChain = new CudaParameterSet(cu, force.getNumComputedValues(), cu.getPaddedNumAtoms(), "customGBEnergyDerivativeChain", true);
 
    // Create the kernels.

@@ -3009,6 +3014,11 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
            extraArgs << ", " << buffer.getType() << "* __restrict__ derivBuffers" << index;
            compute << buffer.getType() << " deriv" << index << " = derivBuffers" << index << "[index];\n";
        }
+        for (int i = 0; i < (int) energyDerivChain->getBuffers().size(); i++) {
+            CudaNonbondedUtilities::ParameterInfo& buffer = energyDerivChain->getBuffers()[i];
+            string index = cu.intToString(i+1);
+            extraArgs << ", " << buffer.getType() << "* __restrict__ derivChain" << index;
+        }
        extraArgs << ", const long long* __restrict__ derivBuffersIn";
        for (int i = 0; i < energyDerivs->getNumParameters(); ++i)
            load << "derivBuffers" << energyDerivs->getParameterSuffix(i, "[index]") <<
@@ -3054,6 +3064,10 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
        
        // Record values.
        
+        for (int i = 0; i < (int) energyDerivs->getBuffers().size(); i++) {
+            string index = cu.intToString(i+1);
+            compute << "derivBuffers" << index << "[index] = deriv" << index << ";\n";
+        }
        compute << "forceBuffers[index] += (long long) (force.x*0x100000000);\n";
        compute << "forceBuffers[index+PADDED_NUM_ATOMS] += (long long) (force.y*0x100000000);\n";
        compute << "forceBuffers[index+PADDED_NUM_ATOMS*2] += (long long) (force.z*0x100000000);\n";
@@ -3066,7 +3080,7 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
        }
        for (int i = 0; i < (int) energyDerivs->getBuffers().size(); i++) {
            string index = cu.intToString(i+1);
-            compute << "derivBuffers" << index << "[index] = deriv" << index << ";\n";
+            compute << "derivChain" << index << "[index] = deriv" << index << ";\n";
        }
        map<string, string> replacements;
        replacements["PARAMETER_ARGUMENTS"] = extraArgs.str()+tableArgs.str();
@@ -3204,9 +3218,9 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
            if (chainStr.find(paramName+"1") != chainStr.npos || chainStr.find(paramName+"2") != chainStr.npos)
                parameters.push_back(CudaNonbondedUtilities::ParameterInfo(paramName, buffer.getComponentType(), buffer.getNumComponents(), buffer.getSize(), buffer.getMemory()));
        }
-        for (int i = 0; i < (int) energyDerivs->getBuffers().size(); i++) {
+        for (int i = 0; i < (int) energyDerivChain->getBuffers().size(); i++) {
            if (needChainForValue[i]) { 
-                CudaNonbondedUtilities::ParameterInfo& buffer = energyDerivs->getBuffers()[i];
+                CudaNonbondedUtilities::ParameterInfo& buffer = energyDerivChain->getBuffers()[i];
                string paramName = prefix+"dEdV"+cu.intToString(i+1);
                parameters.push_back(CudaNonbondedUtilities::ParameterInfo(paramName, buffer.getComponentType(), buffer.getNumComponents(), buffer.getSize(), buffer.getMemory()));
            }
@@ -3352,6 +3366,8 @@ double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeFo
            perParticleEnergyArgs.push_back(&computedValues->getBuffers()[i].getMemory());
        for (int i = 0; i < (int) energyDerivs->getBuffers().size(); i++)
            perParticleEnergyArgs.push_back(&energyDerivs->getBuffers()[i].getMemory());
+        for (int i = 0; i < (int) energyDerivChain->getBuffers().size(); i++)
+            perParticleEnergyArgs.push_back(&energyDerivChain->getBuffers()[i].getMemory());
        perParticleEnergyArgs.push_back(&longEnergyDerivs->getDevicePointer());
        if (tabulatedFunctionParams != NULL) {
            for (int i = 0; i < (int) tabulatedFunctions.size(); i++)

--- a/platforms/cuda/src/CudaPlatform.cpp
+++ b/platforms/cuda/src/CudaPlatform.cpp
@@ -90,6 +90,7 @@ CudaPlatform::CudaPlatform() {
    platformProperties.push_back(CudaUseCpuPme());
    platformProperties.push_back(CudaCompiler());
    platformProperties.push_back(CudaTempDirectory());
+    platformProperties.push_back(CudaHostCompiler());
    setPropertyDefaultValue(CudaDeviceIndex(), "");
    setPropertyDefaultValue(CudaDeviceName(), "");
    setPropertyDefaultValue(CudaUseBlockingSync(), "true");
@@ -114,6 +115,8 @@ CudaPlatform::CudaPlatform() {
    string tmp = (tmpdir == NULL ? string(P_tmpdir) : string(tmpdir));
    setPropertyDefaultValue(CudaTempDirectory(), tmp);
 #endif
+    char* hostCompiler = getenv("CUDA_HOST_COMPILER");
+    setPropertyDefaultValue(CudaHostCompiler(), (hostCompiler == NULL ? "" : string(hostCompiler)));
 }

 double CudaPlatform::getSpeed() const {
@@ -149,6 +152,8 @@ void CudaPlatform::contextCreated(ContextImpl& context, const map<string, string
            getPropertyDefaultValue(CudaCompiler()) : properties.find(CudaCompiler())->second);
    const string& tempPropValue = (properties.find(CudaTempDirectory()) == properties.end() ?
            getPropertyDefaultValue(CudaTempDirectory()) : properties.find(CudaTempDirectory())->second);
+    const string& hostCompilerPropValue = (properties.find(CudaHostCompiler()) == properties.end() ?
+            getPropertyDefaultValue(CudaHostCompiler()) : properties.find(CudaHostCompiler())->second);
    transform(blockingPropValue.begin(), blockingPropValue.end(), blockingPropValue.begin(), ::tolower);
    transform(precisionPropValue.begin(), precisionPropValue.end(), precisionPropValue.begin(), ::tolower);
    transform(cpuPmePropValue.begin(), cpuPmePropValue.end(), cpuPmePropValue.begin(), ::tolower);
@@ -156,7 +161,7 @@ void CudaPlatform::contextCreated(ContextImpl& context, const map<string, string
    pmeKernelName.push_back(CalcPmeReciprocalForceKernel::Name());
    if (!supportsKernels(pmeKernelName))
        cpuPmePropValue = "false";
-    context.setPlatformData(new PlatformData(&context, context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, cpuPmePropValue, compilerPropValue, tempPropValue));
+    context.setPlatformData(new PlatformData(&context, context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, cpuPmePropValue, compilerPropValue, tempPropValue, hostCompilerPropValue));
 }

 void CudaPlatform::contextDestroyed(ContextImpl& context) const {
@@ -165,7 +170,7 @@ void CudaPlatform::contextDestroyed(ContextImpl& context) const {
 }

 CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& system, const string& deviceIndexProperty, const string& blockingProperty, const string& precisionProperty,
-            const string& cpuPmeProperty, const string& compilerProperty, const string& tempProperty) : context(context), removeCM(false), stepCount(0), computeForceCount(0), time(0.0)  {
+            const string& cpuPmeProperty, const string& compilerProperty, const string& tempProperty, const string& hostCompilerProperty) : context(context), removeCM(false), stepCount(0), computeForceCount(0), time(0.0) {
    bool blocking = (blockingProperty == "true");
    vector<string> devices;
    size_t searchPos = 0, nextPos;
@@ -174,15 +179,24 @@ CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& sys
        searchPos = nextPos+1;
    }
    devices.push_back(deviceIndexProperty.substr(searchPos));
-    for (int i = 0; i < (int) devices.size(); i++) {
-        if (devices[i].length() > 0) {
-            unsigned int deviceIndex;
-            stringstream(devices[i]) >> deviceIndex;
-            contexts.push_back(new CudaContext(system, deviceIndex, blocking, precisionProperty, compilerProperty, tempProperty, *this));
+    try {
+        for (int i = 0; i < (int) devices.size(); i++) {
+            if (devices[i].length() > 0) {
+                unsigned int deviceIndex;
+                stringstream(devices[i]) >> deviceIndex;
+                contexts.push_back(new CudaContext(system, deviceIndex, blocking, precisionProperty, compilerProperty, tempProperty, hostCompilerProperty, *this));
+            }
        }
+        if (contexts.size() == 0)
+            contexts.push_back(new CudaContext(system, -1, blocking, precisionProperty, compilerProperty, tempProperty, hostCompilerProperty, *this));
+    }
+    catch (...) {
+        // If an exception was thrown, do our best to clean up memory.
+        
+        for (int i = 0; i < (int) contexts.size(); i++)
+            delete contexts[i];
+        throw;
    }
-    if (contexts.size() == 0)
-        contexts.push_back(new CudaContext(system, -1, blocking, precisionProperty, compilerProperty, tempProperty, *this));
    stringstream deviceIndex, deviceName;
    for (int i = 0; i < (int) contexts.size(); i++) {
        if (i > 0) {
@@ -202,6 +216,7 @@ CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& sys
    propertyValues[CudaPlatform::CudaUseCpuPme()] = useCpuPme ? "true" : "false";
    propertyValues[CudaPlatform::CudaCompiler()] = compilerProperty;
    propertyValues[CudaPlatform::CudaTempDirectory()] = tempProperty;
+    propertyValues[CudaPlatform::CudaHostCompiler()] = hostCompilerProperty;
    contextEnergy.resize(contexts.size());
    
    // Determine whether peer-to-peer copying is supported, and enable it if so.

--- a/platforms/cuda/src/kernels/customGBEnergyN2.cu
+++ b/platforms/cuda/src/kernels/customGBEnergyN2.cu
@@ -65,7 +65,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
                if (r2 < CUTOFF_SQUARED) {
 #endif
                    real invR = RSQRT(r2);
-                    real r = RECIP(invR);
+                    real r = r2*invR;
                    LOAD_ATOM2_PARAMETERS
                    atom2 = y*TILE_SIZE+j;
                    real dEdR = 0;
@@ -117,7 +117,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
                if (r2 < CUTOFF_SQUARED) {
 #endif
                    real invR = RSQRT(r2);
-                    real r = RECIP(invR);
+                    real r = r2*invR;
                    LOAD_ATOM2_PARAMETERS
                    atom2 = y*TILE_SIZE+tj;
                    real dEdR = 0;
@@ -268,7 +268,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
                    if (r2 < CUTOFF_SQUARED) {
 #endif
                        real invR = RSQRT(r2);
-                        real r = RECIP(invR);
+                        real r = r2*invR;
                        LOAD_ATOM2_PARAMETERS
                        atom2 = atomIndices[tbx+tj];
                        real dEdR = 0;
@@ -313,7 +313,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
                    if (r2 < CUTOFF_SQUARED) {
 #endif
                        real invR = RSQRT(r2);
-                        real r = RECIP(invR);
+                        real r = r2*invR;
                        LOAD_ATOM2_PARAMETERS
                        atom2 = atomIndices[tbx+tj];
                        real dEdR = 0;

--- a/platforms/cuda/src/kernels/customGBValueN2.cu
+++ b/platforms/cuda/src/kernels/customGBValueN2.cu
@@ -60,7 +60,7 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
                if (r2 < CUTOFF_SQUARED) {
 #endif
                    real invR = RSQRT(r2);
-                    real r = RECIP(invR);
+                    real r = r2*invR;
                    LOAD_ATOM2_PARAMETERS
                    atom2 = y*TILE_SIZE+j;
                    real tempValue1 = 0;
@@ -108,7 +108,7 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
                if (r2 < CUTOFF_SQUARED) {
 #endif
                    real invR = RSQRT(r2);
-                    real r = RECIP(invR);
+                    real r = r2*invR;
                    LOAD_ATOM2_PARAMETERS
                    atom2 = y*TILE_SIZE+tj;
                    real tempValue1 = 0;
@@ -241,7 +241,7 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                    if (r2 < CUTOFF_SQUARED) {
                        real invR = RSQRT(r2);
-                        real r = RECIP(invR);
+                        real r = r2*invR;
                        LOAD_ATOM2_PARAMETERS
                        atom2 = atomIndices[tbx+tj];
                        real tempValue1 = 0;
@@ -275,7 +275,7 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
                    if (r2 < CUTOFF_SQUARED) {
 #endif
                        real invR = RSQRT(r2);
-                        real r = RECIP(invR);
+                        real r = r2*invR;
                        LOAD_ATOM2_PARAMETERS
                        atom2 = atomIndices[tbx+tj];
                        real tempValue1 = 0;

--- a/platforms/cuda/src/kernels/customNonbondedGroups.cu
+++ b/platforms/cuda/src/kernels/customNonbondedGroups.cu
@@ -58,7 +58,7 @@ extern "C" __global__ void computeInteractionGroups(
            if (!isExcluded && r2 < CUTOFF_SQUARED) {
 #endif
                real invR = RSQRT(r2);
-                real r = RECIP(invR);
+                real r = r2*invR;
                LOAD_ATOM2_PARAMETERS
                real dEdR = 0.0f;
                real tempEnergy = 0.0f;

--- a/platforms/cuda/src/kernels/gbsaObc1.cu
+++ b/platforms/cuda/src/kernels/gbsaObc1.cu
@@ -116,7 +116,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
                if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
 #endif
                    real invR = RSQRT(r2);
-                    real r = RECIP(invR);
+                    real r = r2*invR;
                    float2 params2 = make_float2(localData[tbx+j].radius, localData[tbx+j].scaledRadius);
                    real rScaledRadiusJ = r+params2.y;
                    if ((j != tgx) && (params1.x < rScaledRadiusJ)) {
@@ -163,7 +163,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
                if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
 #endif
                    real invR = RSQRT(r2);
-                    real r = RECIP(invR);
+                    real r = r2*invR;
                    float2 params2 = make_float2(localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
                    real rScaledRadiusJ = r+params2.y;
                    if (params1.x < rScaledRadiusJ) {
@@ -305,7 +305,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
                    int atom2 = atomIndices[tbx+tj];
                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
                        real invR = RSQRT(r2);
-                        real r = RECIP(invR);
+                        real r = r2*invR;
                        float2 params2 = make_float2(localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
                        real rScaledRadiusJ = r+params2.y;
                        if (params1.x < rScaledRadiusJ) {
@@ -355,7 +355,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
 #endif
                        real invR = RSQRT(r2);
-                        real r = RECIP(invR);
+                        real r = r2*invR;
                        float2 params2 = make_float2(localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
                        real rScaledRadiusJ = r+params2.y;
                        if (params1.x < rScaledRadiusJ) {
@@ -461,7 +461,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
                    if (r2 < CUTOFF_SQUARED) {
 #endif
                        real invR = RSQRT(r2);
-                        real r = RECIP(invR);
+                        real r = r2*invR;
                        real bornRadius2 = localData[tbx+j].bornRadius;
                        real alpha2_ij = bornRadius1*bornRadius2;
                        real D_ij = r2*RECIP(4.0f*alpha2_ij);
@@ -518,7 +518,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
                    if (r2 < CUTOFF_SQUARED) {
 #endif
                        real invR = RSQRT(r2);
-                        real r = RECIP(invR);
+                        real r = r2*invR;
                        real bornRadius2 = localData[tbx+tj].bornRadius;
                        real alpha2_ij = bornRadius1*bornRadius2;
                        real D_ij = r2*RECIP(4.0f*alpha2_ij);
@@ -672,7 +672,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                        if (r2 < CUTOFF_SQUARED) {
                            real invR = RSQRT(r2);
-                            real r = RECIP(invR);
+                            real r = r2*invR;
                            real bornRadius2 = localData[tbx+tj].bornRadius;
                            real alpha2_ij = bornRadius1*bornRadius2;
                            real D_ij = r2*RECIP(4.0f*alpha2_ij);
@@ -723,7 +723,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
                        if (r2 < CUTOFF_SQUARED) {
 #endif
                            real invR = RSQRT(r2);
-                            real r = RECIP(invR);
+                            real r = r2*invR;
                            real bornRadius2 = localData[tbx+tj].bornRadius;
                            real alpha2_ij = bornRadius1*bornRadius2;
                            real D_ij = r2*RECIP(4.0f*alpha2_ij);

--- a/platforms/cuda/src/kernels/nonbonded.cu
+++ b/platforms/cuda/src/kernels/nonbonded.cu
@@ -161,7 +161,7 @@ extern "C" __global__ void computeNonbonded(
 #endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                real invR = RSQRT(r2);
-                real r = RECIP(invR);
+                real r = r2*invR;
                LOAD_ATOM2_PARAMETERS
                atom2 = y*TILE_SIZE+j;
 #ifdef USE_SYMMETRIC
@@ -232,7 +232,7 @@ extern "C" __global__ void computeNonbonded(
                if (r2 < CUTOFF_SQUARED) {
 #endif
                    real invR = RSQRT(r2);
-                    real r = RECIP(invR);
+                    real r = r2*invR;
                    LOAD_ATOM2_PARAMETERS
                    atom2 = y*TILE_SIZE+tj;
 #ifdef USE_SYMMETRIC
@@ -433,7 +433,7 @@ extern "C" __global__ void computeNonbonded(
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                    if (r2 < CUTOFF_SQUARED) {
                        real invR = RSQRT(r2);
-                        real r = RECIP(invR);
+                        real r = r2*invR;
                        LOAD_ATOM2_PARAMETERS
                        atom2 = atomIndices[tbx+tj];
 #ifdef USE_SYMMETRIC
@@ -507,7 +507,7 @@ extern "C" __global__ void computeNonbonded(
                    if (r2 < CUTOFF_SQUARED) {
 #endif
                        real invR = RSQRT(r2);
-                        real r = RECIP(invR);
+                        real r = r2*invR;
                        LOAD_ATOM2_PARAMETERS
                        atom2 = atomIndices[tbx+tj];
 #ifdef USE_SYMMETRIC

--- a/platforms/cuda/tests/CMakeLists.txt
+++ b/platforms/cuda/tests/CMakeLists.txt
@@ -25,6 +25,11 @@ FOREACH(TEST_PROG ${TEST_PROGS})
    # Link with shared library
    ADD_EXECUTABLE(${TEST_ROOT} ${TEST_PROG})
    TARGET_LINK_LIBRARIES(${TEST_ROOT} ${SHARED_TARGET})
+    IF (APPLE)
+        SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS} -F/Library/Frameworks -framework CUDA" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
+    ELSE (APPLE)
+        SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
+    ENDIF (APPLE)

    IF( ${TEST_ROOT} STREQUAL "TestCUDAGBSAOBCForce2" )


--- a/platforms/cuda/tests/TestCudaCustomGBForce.cpp
+++ b/platforms/cuda/tests/TestCudaCustomGBForce.cpp
@@ -248,7 +248,7 @@ void testMembrane() {
    for (int i = 0; i < (int) forces.size(); ++i)
        norm += forces[i].dot(forces[i]);
    norm = std::sqrt(norm);
-    const double stepSize = 1e-3;
+    const double stepSize = 1e-2;
    double step = 0.5*stepSize/norm;
    vector<Vec3> positions2(numParticles), positions3(numParticles);
    for (int i = 0; i < (int) positions.size(); ++i) {

--- a/platforms/cuda/tests/TestCudaGBSAOBCForce.cpp
+++ b/platforms/cuda/tests/TestCudaGBSAOBCForce.cpp
@@ -71,7 +71,7 @@ void testSingleParticle() {
    double bornRadius = 0.15-0.009; // dielectric offset
    double eps0 = EPSILON0;
    double bornEnergy = (-0.5*0.5/(8*PI_M*eps0))*(1.0/gbsa->getSoluteDielectric()-1.0/gbsa->getSolventDielectric())/bornRadius;
-    double extendedRadius = bornRadius+0.14; // probe radius
+    double extendedRadius = 0.15+0.14; // probe radius
    double nonpolarEnergy = CAL2JOULE*PI_M*0.0216*(10*extendedRadius)*(10*extendedRadius)*std::pow(0.15/bornRadius, 6.0); // Where did this formula come from?  Just copied it from CpuImplicitSolvent.cpp
    ASSERT_EQUAL_TOL((bornEnergy+nonpolarEnergy), state.getPotentialEnergy(), 0.01);
    

--- a/platforms/cuda/tests/TestCudaRandom.cpp
+++ b/platforms/cuda/tests/TestCudaRandom.cpp
@@ -55,7 +55,8 @@ void testGaussian() {
    for (int i = 0; i < numAtoms; i++)
        system.addParticle(1.0);
    CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), "false",
-            platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()));
+            platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()),
+            platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()));
    CudaContext& context = *platformData.contexts[0];
    context.initialize();
    context.getIntegrationUtilities().initRandomNumberGenerator(0);

--- a/platforms/cuda/tests/TestCudaSort.cpp
+++ b/platforms/cuda/tests/TestCudaSort.cpp
@@ -65,7 +65,8 @@ void verifySorting(vector<float> array) {
    System system;
    system.addParticle(0.0);
    CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), "false",
-            platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()));
+            platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()),
+            platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()));
    CudaContext& context = *platformData.contexts[0];
    context.initialize();
    CudaArray data(context, array.size(), 4, "sortData");

--- a/platforms/opencl/CMakeLists.txt
+++ b/platforms/opencl/CMakeLists.txt
@@ -14,10 +14,6 @@
 #   libOpenMMOpenCL_static[_d].a
 #----------------------------------------------------

-IF (APPLE)
-    SET (CMAKE_OSX_DEPLOYMENT_TARGET "10.6")
-ENDIF (APPLE)
-
 set(OPENMM_BUILD_OPENCL_TESTS TRUE CACHE BOOL "Whether to build OpenCL test cases")
 if(OPENMM_BUILD_OPENCL_TESTS)
    SUBDIRS (tests)
@@ -39,9 +35,9 @@ SET(STATIC_TARGET ${OPENMMOPENCL_LIBRARY_NAME}_static)

 # Ensure that debug libraries have "_d" appended to their names.
 # CMake gets this right on Windows automatically with this definition.
-IF (${CMAKE_GENERATOR} MATCHES "Visual Studio")
+IF (MSVC)
    SET(CMAKE_DEBUG_POSTFIX "_d" CACHE INTERNAL "" FORCE)
-ENDIF (${CMAKE_GENERATOR} MATCHES "Visual Studio")
+ENDIF (MSVC)

 # But on Unix or Cygwin we have to add the suffix manually
 IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)