Merge branch 'master' of https://github.com/SimTk/openmm

8b44287e · Jason Swails · 25580770 · 291484f2 · 8b44287e · 8b44287e
Commit 8b44287e authored Jan 06, 2014 by Jason Swails
15 changed files
--- a/libraries/sfmt/src/SFMT.cpp
+++ b/libraries/sfmt/src/SFMT.cpp
@@ -90,8 +90,10 @@ namespace OpenMM_SFMT {

 class SFMTData {
 public:
+    /** Possibly incorrectly aligned memory for internal state array */
+    char baseData[(N+1)*sizeof(w128_t)];
 	/** the 128-bit internal state array */
-	w128_t sfmt[N];
+	w128_t* sfmt;
 	/** the 32bit integer pointer to the 128-bit internal state array */
 	uint32_t *psfmt32;
 	#if !defined(BIG_ENDIAN64) || defined(ONLY64)
@@ -106,6 +108,9 @@ public:
 	/** a parity check vector which certificate the period of 2^{MEXP} */
 	uint32_t parity[4];
 	SFMTData() {
+        char* offsetData = baseData+15;
+        offsetData -= (long long)offsetData&0xF;
+        sfmt = (w128_t*) offsetData;
 		psfmt32 = &sfmt[0].u[0];
 #if !defined(BIG_ENDIAN64) || defined(ONLY64)
 		psfmt64 = (uint64_t *)&sfmt[0].u[0];

--- a/openmmapi/include/openmm/internal/vectorize.h
+++ b/openmmapi/include/openmm/internal/vectorize.h
@@ -55,8 +55,9 @@ public:
        return val;
    }
    float operator[](int i) const {
-        int resultBits = _mm_extract_ps(val, i);
-        return *((float*) &resultBits);
+        float result[4];
+        store(result);
+        return result[i];
    }
    void store(float* v) const {
        _mm_storeu_ps(v, val);
@@ -131,7 +132,9 @@ public:
        return val;
    }
    int operator[](int i) const {
-        return _mm_extract_epi32(val, i);
+        int result[4];
+        store(result);
+        return result[i];
    }
    void store(int* v) const {
        _mm_storeu_si128((__m128i*) v, val);

--- a/openmmapi/src/ContextImpl.cpp
+++ b/openmmapi/src/ContextImpl.cpp
@@ -39,6 +39,7 @@
 #include "openmm/State.h"
 #include "openmm/VirtualSite.h"
 #include "openmm/Context.h"
+#include <algorithm>
 #include <iostream>
 #include <map>
 #include <utility>
@@ -94,6 +95,8 @@ ContextImpl::ContextImpl(Context& owner, const System& system, Integrator& integ
    vector<string> kernelNames;
    kernelNames.push_back(CalcForcesAndEnergyKernel::Name());
    kernelNames.push_back(UpdateStateDataKernel::Name());
+    kernelNames.push_back(ApplyConstraintsKernel::Name());
+    kernelNames.push_back(VirtualSitesKernel::Name());
    for (int i = 0; i < system.getNumForces(); ++i) {
        forceImpls.push_back(system.getForce(i).createImpl());
        map<string, double> forceParameters = forceImpls[forceImpls.size()-1]->getDefaultParameters();
@@ -104,14 +107,40 @@ ContextImpl::ContextImpl(Context& owner, const System& system, Integrator& integ
    hasInitializedForces = true;
    vector<string> integratorKernels = integrator.getKernelNames();
    kernelNames.insert(kernelNames.begin(), integratorKernels.begin(), integratorKernels.end());
-    if (platform == 0)
-        this->platform = platform = &Platform::findPlatform(kernelNames);
-    else if (!platform->supportsKernels(kernelNames))
+    
+    // Select a platform to use.
+    
+    vector<pair<double, Platform*> > candidatePlatforms;
+    if (platform == NULL) {
+        for (int i = 0; i < Platform::getNumPlatforms(); i++) {
+            Platform& p = Platform::getPlatform(i);
+            if (p.supportsKernels(kernelNames))
+                candidatePlatforms.push_back(make_pair(p.getSpeed(), &p));
+        }
+        if (candidatePlatforms.size() == 0)
+            throw OpenMMException("No Platform supports all the requested kernels");
+        sort(candidatePlatforms.begin(), candidatePlatforms.end());
+    }
+    else {
+        if (!platform->supportsKernels(kernelNames))
            throw OpenMMException("Specified a Platform for a Context which does not support all required kernels");
+        candidatePlatforms.push_back(make_pair(platform->getSpeed(), platform));
+    }
+    for (int i = candidatePlatforms.size()-1; i >= 0; i--) {
+        try {
+            this->platform = platform = candidatePlatforms[i].second;
+            platform->contextCreated(*this, properties);
+            break;
+        }
+        catch (...) {
+            if (i > 0)
+                continue;
+            throw;
+        }
+    }
    
    // Create and initialize kernels and other objects.
    
-    platform->contextCreated(*this, properties);
    initializeForcesKernel = platform->createKernel(CalcForcesAndEnergyKernel::Name(), *this);
    initializeForcesKernel.getAs<CalcForcesAndEnergyKernel>().initialize(system);
    updateStateDataKernel = platform->createKernel(UpdateStateDataKernel::Name(), *this);

--- a/platforms/cpu/src/CpuKernels.cpp
+++ b/platforms/cpu/src/CpuKernels.cpp
@@ -225,8 +225,8 @@ CpuCalcNonbondedForceKernel::~CpuCalcNonbondedForceKernel() {
            delete[] bonded14IndexArray[i];
            delete[] bonded14ParamArray[i];
        }
-        delete bonded14IndexArray;
-        delete bonded14ParamArray;
+        delete[] bonded14IndexArray;
+        delete[] bonded14ParamArray;
    }
    if (nonbonded != NULL)
        delete nonbonded;

--- a/platforms/cpu/src/CpuNonbondedForce.cpp
+++ b/platforms/cpu/src/CpuNonbondedForce.cpp
@@ -57,7 +57,7 @@ public:

   --------------------------------------------------------------------------------------- */

-CpuNonbondedForce::CpuNonbondedForce() : cutoff(false), useSwitch(false), periodic(false), ewald(false), pme(false), tableIsValid(false) {
+CpuNonbondedForce::CpuNonbondedForce() : cutoff(false), useSwitch(false), periodic(false), ewald(false), pme(false), tableIsValid(false), cutoffDistance(0.0f), alphaEwald(0.0f) {
 }

 CpuNonbondedForce::~CpuNonbondedForce() {

--- a/platforms/cuda/include/CudaContext.h
+++ b/platforms/cuda/include/CudaContext.h
@@ -75,7 +75,7 @@ public:
    static const int ThreadBlockSize;
    static const int TileSize;
    CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const std::string& precision,
-            const std::string& compiler, const std::string& tempDir, CudaPlatform::PlatformData& platformData);
+            const std::string& compiler, const std::string& tempDir, const std::string& hostCompiler, CudaPlatform::PlatformData& platformData);
    ~CudaContext();
    /**
     * This is called to initialize internal data structures after all Forces in the system

--- a/platforms/cuda/include/CudaPlatform.h
+++ b/platforms/cuda/include/CudaPlatform.h
@@ -95,6 +95,13 @@ public:
        static const std::string key = "CudaCompiler";
        return key;
    }
+    /**
+     * This is the name of the parameter for specifying the host compiler for the CUDA compiler to use.
+     */
+    static const std::string& CudaHostCompiler() {
+        static const std::string key = "CudaHostCompiler";
+        return key;
+    }
    /**
     * This is the name of the parameter for specifying the path to the directory for creating temporary files.
     */
@@ -107,7 +114,7 @@ public:
 class OPENMM_EXPORT_CUDA CudaPlatform::PlatformData {
 public:
    PlatformData(ContextImpl* context, const System& system, const std::string& deviceIndexProperty, const std::string& blockingProperty, const std::string& precisionProperty,
-            const std::string& cpuPmeProperty, const std::string& compilerProperty, const std::string& tempProperty);
+            const std::string& cpuPmeProperty, const std::string& compilerProperty, const std::string& tempProperty, const std::string& hostCompilerProperty);
    ~PlatformData();
    void initializeContexts(const System& system);
    void syncContexts();

--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -72,9 +72,12 @@ const int CudaContext::TileSize = sizeof(tileflags)*8;
 bool CudaContext::hasInitializedCuda = false;

 CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler,
-        const string& tempDir, CudaPlatform::PlatformData& platformData) : system(system), compiler(compiler),
+        const string& tempDir, const std::string& hostCompiler, CudaPlatform::PlatformData& platformData) : system(system),
        time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), stepsSinceReorder(99999), contextIsValid(false), atomsWereReordered(false), pinnedBuffer(NULL), posq(NULL),
        posqCorrection(NULL), velm(NULL), force(NULL), energyBuffer(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
+    this->compiler = "\""+compiler+"\"";
+    if (hostCompiler.size() > 0)
+        this->compiler = compiler+" --compiler-bindir "+hostCompiler;
    if (!hasInitializedCuda) {
        CHECK_RESULT2(cuInit(0), "Error initializing CUDA");
        hasInitializedCuda = true;
@@ -153,9 +156,6 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
    int numThreadBlocksPerComputeUnit = 6;
    numThreadBlocks = numThreadBlocksPerComputeUnit*multiprocessors;
-    bonded = new CudaBondedUtilities(*this);
-    nonbonded = new CudaNonbondedUtilities(*this);
-    int numEnergyBuffers = max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers());
    if (useDoublePrecision) {
        posq = CudaArray::create<double4>(*this, paddedNumAtoms, "posq");
        velm = CudaArray::create<double4>(*this, paddedNumAtoms, "velm");
@@ -166,9 +166,6 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
        compilationDefines["make_mixed2"] = "make_double2";
        compilationDefines["make_mixed3"] = "make_double3";
        compilationDefines["make_mixed4"] = "make_double4";
-        energyBuffer = CudaArray::create<double>(*this, numEnergyBuffers, "energyBuffer");
-        int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
-        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
    }
    else if (useMixedPrecision) {
        posq = CudaArray::create<float4>(*this, paddedNumAtoms, "posq");
@@ -181,9 +178,6 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
        compilationDefines["make_mixed2"] = "make_double2";
        compilationDefines["make_mixed3"] = "make_double3";
        compilationDefines["make_mixed4"] = "make_double4";
-        energyBuffer = CudaArray::create<float>(*this, numEnergyBuffers, "energyBuffer");
-        int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
-        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
    }
    else {
        posq = CudaArray::create<float4>(*this, paddedNumAtoms, "posq");
@@ -194,9 +188,6 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
        compilationDefines["make_mixed2"] = "make_float2";
        compilationDefines["make_mixed3"] = "make_float3";
        compilationDefines["make_mixed4"] = "make_float4";
-        energyBuffer = CudaArray::create<float>(*this, numEnergyBuffers, "energyBuffer");
-        int pinnedBufferSize = max(paddedNumAtoms*6, numEnergyBuffers);
-        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(float), 0));
    }
    posCellOffsets.resize(paddedNumAtoms, make_int4(0, 0, 0, 0));

@@ -233,6 +224,8 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    
    // Create utilities objects.
    
+    bonded = new CudaBondedUtilities(*this);
+    nonbonded = new CudaNonbondedUtilities(*this);
    integration = new CudaIntegrationUtilities(*this, system);
    expression = new CudaExpressionUtilities(*this);
 }
@@ -280,6 +273,22 @@ CudaContext::~CudaContext() {
 void CudaContext::initialize() {
    cuCtxSetCurrent(context);
    string errorMessage = "Error initializing Context";
+    int numEnergyBuffers = max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers());
+    if (useDoublePrecision) {
+        energyBuffer = CudaArray::create<double>(*this, numEnergyBuffers, "energyBuffer");
+        int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
+        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
+    }
+    else if (useMixedPrecision) {
+        energyBuffer = CudaArray::create<float>(*this, numEnergyBuffers, "energyBuffer");
+        int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
+        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
+    }
+    else {
+        energyBuffer = CudaArray::create<float>(*this, numEnergyBuffers, "energyBuffer");
+        int pinnedBufferSize = max(paddedNumAtoms*6, numEnergyBuffers);
+        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(float), 0));
+    }
    for (int i = 0; i < numAtoms; i++) {
        double mass = system.getParticleMass(i);
        if (useDoublePrecision || useMixedPrecision)
@@ -441,13 +450,13 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
    out.close();
 #ifdef WIN32
 #ifdef _DEBUG
-    string command = "\""+compiler+"\" --ptx -G -g --machine "+bits+" -arch=sm_"+gpuArchitecture+" -o "+outputFile+" "+options+" "+inputFile+" 2> "+logFile;
+    string command = compiler+" --ptx -G -g --machine "+bits+" -arch=sm_"+gpuArchitecture+" -o "+outputFile+" "+options+" "+inputFile+" 2> "+logFile;
 #else
-    string command = "\""+compiler+"\" --ptx -lineinfo --machine "+bits+" -arch=sm_"+gpuArchitecture+" -o "+outputFile+" "+options+" "+inputFile+" 2> "+logFile;
+    string command = compiler+" --ptx -lineinfo --machine "+bits+" -arch=sm_"+gpuArchitecture+" -o "+outputFile+" "+options+" "+inputFile+" 2> "+logFile;
 #endif
    int res = compileInWindows(command);
 #else
-    string command = "\""+compiler+"\" --ptx --machine "+bits+" -arch=sm_"+gpuArchitecture+" -o \""+outputFile+"\" "+options+" \""+inputFile+"\" 2> \""+logFile+"\"";
+    string command = compiler+" --ptx --machine "+bits+" -arch=sm_"+gpuArchitecture+" -o \""+outputFile+"\" "+options+" \""+inputFile+"\" 2> \""+logFile+"\"";
    int res = std::system(command.c_str());
 #endif
    try {

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -1460,8 +1460,9 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
    int numParticles = force.getNumParticles();
    sigmaEpsilon = CudaArray::create<float2>(cu, cu.getPaddedNumAtoms(), "sigmaEpsilon");
    CudaArray& posq = cu.getPosq();
-    float4* posqf = (float4*) cu.getPinnedBuffer();
-    double4* posqd = (double4*) cu.getPinnedBuffer();
+    vector<double4> temp(posq.getSize());
+    float4* posqf = (float4*) &temp[0];
+    double4* posqd = (double4*) &temp[0];
    vector<float2> sigmaEpsilonVector(cu.getPaddedNumAtoms(), make_float2(0, 0));
    vector<vector<int> > exclusionList(numParticles);
    double sumSquaredCharges = 0.0;
@@ -1486,7 +1487,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
        exclusionList[exclusions[i].first].push_back(exclusions[i].second);
        exclusionList[exclusions[i].second].push_back(exclusions[i].first);
    }
-    posq.upload(cu.getPinnedBuffer());
+    posq.upload(&temp[0]);
    sigmaEpsilon->upload(sigmaEpsilonVector);
    bool useCutoff = (force.getNonbondedMethod() != NonbondedForce::NoCutoff);
    bool usePeriodic = (force.getNonbondedMethod() != NonbondedForce::NoCutoff && force.getNonbondedMethod() != NonbondedForce::CutoffNonPeriodic);
@@ -2410,8 +2411,9 @@ void CudaCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOBCF
    cu.addAutoclearBuffer(*bornSum);
    cu.addAutoclearBuffer(*bornForce);
    CudaArray& posq = cu.getPosq();
-    float4* posqf = (float4*) cu.getPinnedBuffer();
-    double4* posqd = (double4*) cu.getPinnedBuffer();
+    vector<double4> temp(posq.getSize());
+    float4* posqf = (float4*) &temp[0];
+    double4* posqd = (double4*) &temp[0];
    vector<float2> paramsVector(cu.getPaddedNumAtoms(), make_float2(1, 1));
    const double dielectricOffset = 0.009;
    for (int i = 0; i < force.getNumParticles(); i++) {
@@ -2424,7 +2426,7 @@ void CudaCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOBCF
        else
            posqf[i] = make_float4(0, 0, 0, (float) charge);
    }
-    posq.upload(cu.getPinnedBuffer());
+    posq.upload(&temp[0]);
    params->upload(paramsVector);
    prefactor = -ONE_4PI_EPS0*((1.0/force.getSoluteDielectric())-(1.0/force.getSolventDielectric()));
    bool useCutoff = (force.getNonbondedMethod() != GBSAOBCForce::NoCutoff);

--- a/platforms/cuda/src/CudaPlatform.cpp
+++ b/platforms/cuda/src/CudaPlatform.cpp
@@ -90,6 +90,7 @@ CudaPlatform::CudaPlatform() {
    platformProperties.push_back(CudaUseCpuPme());
    platformProperties.push_back(CudaCompiler());
    platformProperties.push_back(CudaTempDirectory());
+    platformProperties.push_back(CudaHostCompiler());
    setPropertyDefaultValue(CudaDeviceIndex(), "");
    setPropertyDefaultValue(CudaDeviceName(), "");
    setPropertyDefaultValue(CudaUseBlockingSync(), "true");
@@ -114,6 +115,8 @@ CudaPlatform::CudaPlatform() {
    string tmp = (tmpdir == NULL ? string(P_tmpdir) : string(tmpdir));
    setPropertyDefaultValue(CudaTempDirectory(), tmp);
 #endif
+    char* hostCompiler = getenv("CUDA_HOST_COMPILER");
+    setPropertyDefaultValue(CudaHostCompiler(), (hostCompiler == NULL ? "" : string(hostCompiler)));
 }

 double CudaPlatform::getSpeed() const {
@@ -149,6 +152,8 @@ void CudaPlatform::contextCreated(ContextImpl& context, const map<string, string
            getPropertyDefaultValue(CudaCompiler()) : properties.find(CudaCompiler())->second);
    const string& tempPropValue = (properties.find(CudaTempDirectory()) == properties.end() ?
            getPropertyDefaultValue(CudaTempDirectory()) : properties.find(CudaTempDirectory())->second);
+    const string& hostCompilerPropValue = (properties.find(CudaHostCompiler()) == properties.end() ?
+            getPropertyDefaultValue(CudaHostCompiler()) : properties.find(CudaHostCompiler())->second);
    transform(blockingPropValue.begin(), blockingPropValue.end(), blockingPropValue.begin(), ::tolower);
    transform(precisionPropValue.begin(), precisionPropValue.end(), precisionPropValue.begin(), ::tolower);
    transform(cpuPmePropValue.begin(), cpuPmePropValue.end(), cpuPmePropValue.begin(), ::tolower);
@@ -156,7 +161,7 @@ void CudaPlatform::contextCreated(ContextImpl& context, const map<string, string
    pmeKernelName.push_back(CalcPmeReciprocalForceKernel::Name());
    if (!supportsKernels(pmeKernelName))
        cpuPmePropValue = "false";
-    context.setPlatformData(new PlatformData(&context, context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, cpuPmePropValue, compilerPropValue, tempPropValue));
+    context.setPlatformData(new PlatformData(&context, context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, cpuPmePropValue, compilerPropValue, tempPropValue, hostCompilerPropValue));
 }

 void CudaPlatform::contextDestroyed(ContextImpl& context) const {
@@ -165,7 +170,7 @@ void CudaPlatform::contextDestroyed(ContextImpl& context) const {
 }

 CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& system, const string& deviceIndexProperty, const string& blockingProperty, const string& precisionProperty,
-            const string& cpuPmeProperty, const string& compilerProperty, const string& tempProperty) : context(context), removeCM(false), stepCount(0), computeForceCount(0), time(0.0)  {
+            const string& cpuPmeProperty, const string& compilerProperty, const string& tempProperty, const string& hostCompilerProperty) : context(context), removeCM(false), stepCount(0), computeForceCount(0), time(0.0) {
    bool blocking = (blockingProperty == "true");
    vector<string> devices;
    size_t searchPos = 0, nextPos;
@@ -174,15 +179,24 @@ CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& sys
        searchPos = nextPos+1;
    }
    devices.push_back(deviceIndexProperty.substr(searchPos));
+    try {
        for (int i = 0; i < (int) devices.size(); i++) {
            if (devices[i].length() > 0) {
                unsigned int deviceIndex;
                stringstream(devices[i]) >> deviceIndex;
-            contexts.push_back(new CudaContext(system, deviceIndex, blocking, precisionProperty, compilerProperty, tempProperty, *this));
+                contexts.push_back(new CudaContext(system, deviceIndex, blocking, precisionProperty, compilerProperty, tempProperty, hostCompilerProperty, *this));
            }
        }
        if (contexts.size() == 0)
-        contexts.push_back(new CudaContext(system, -1, blocking, precisionProperty, compilerProperty, tempProperty, *this));
+            contexts.push_back(new CudaContext(system, -1, blocking, precisionProperty, compilerProperty, tempProperty, hostCompilerProperty, *this));
+    }
+    catch (...) {
+        // If an exception was thrown, do our best to clean up memory.
+        
+        for (int i = 0; i < (int) contexts.size(); i++)
+            delete contexts[i];
+        throw;
+    }
    stringstream deviceIndex, deviceName;
    for (int i = 0; i < (int) contexts.size(); i++) {
        if (i > 0) {
@@ -202,6 +216,7 @@ CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& sys
    propertyValues[CudaPlatform::CudaUseCpuPme()] = useCpuPme ? "true" : "false";
    propertyValues[CudaPlatform::CudaCompiler()] = compilerProperty;
    propertyValues[CudaPlatform::CudaTempDirectory()] = tempProperty;
+    propertyValues[CudaPlatform::CudaHostCompiler()] = hostCompilerProperty;
    contextEnergy.resize(contexts.size());
    
    // Determine whether peer-to-peer copying is supported, and enable it if so.

--- a/platforms/cuda/tests/TestCudaRandom.cpp
+++ b/platforms/cuda/tests/TestCudaRandom.cpp
@@ -55,7 +55,8 @@ void testGaussian() {
    for (int i = 0; i < numAtoms; i++)
        system.addParticle(1.0);
    CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), "false",
-            platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()));
+            platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()),
+            platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()));
    CudaContext& context = *platformData.contexts[0];
    context.initialize();
    context.getIntegrationUtilities().initRandomNumberGenerator(0);

--- a/platforms/cuda/tests/TestCudaSort.cpp
+++ b/platforms/cuda/tests/TestCudaSort.cpp
@@ -65,7 +65,8 @@ void verifySorting(vector<float> array) {
    System system;
    system.addParticle(0.0);
    CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), "false",
-            platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()));
+            platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()),
+            platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()));
    CudaContext& context = *platformData.contexts[0];
    context.initialize();
    CudaArray data(context, array.size(), 4, "sortData");

--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -253,8 +253,6 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
        paddedNumAtoms = TileSize*((numAtoms+TileSize-1)/TileSize);
        numAtomBlocks = (paddedNumAtoms+(TileSize-1))/TileSize;
        numThreadBlocks = numThreadBlocksPerComputeUnit*device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
-        bonded = new OpenCLBondedUtilities(*this);
-        nonbonded = new OpenCLNonbondedUtilities(*this);
        if (useDoublePrecision) {
            posq = OpenCLArray::create<mm_double4>(*this, paddedNumAtoms, "posq");
            velm = OpenCLArray::create<mm_double4>(*this, paddedNumAtoms, "velm");
@@ -343,6 +341,8 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
    
    // Create utilities objects.
    
+    bonded = new OpenCLBondedUtilities(*this);
+    nonbonded = new OpenCLNonbondedUtilities(*this);
    integration = new OpenCLIntegrationUtilities(*this, system);
    expression = new OpenCLExpressionUtilities(*this);
 }

--- a/platforms/opencl/src/OpenCLPlatform.cpp
+++ b/platforms/opencl/src/OpenCLPlatform.cpp
@@ -143,6 +143,7 @@ OpenCLPlatform::PlatformData::PlatformData(const System& system, const string& p
        searchPos = nextPos+1;
    }
    devices.push_back(deviceIndexProperty.substr(searchPos));
+    try {
        for (int i = 0; i < (int) devices.size(); i++) {
            if (devices[i].length() > 0) {
                unsigned int deviceIndex;
@@ -152,6 +153,14 @@ OpenCLPlatform::PlatformData::PlatformData(const System& system, const string& p
        }
        if (contexts.size() == 0)
            contexts.push_back(new OpenCLContext(system, platformIndex, -1, precisionProperty, *this));
+    }
+    catch (...) {
+        // If an exception was thrown, do our best to clean up memory.
+        
+        for (int i = 0; i < (int) contexts.size(); i++)
+            delete contexts[i];
+        throw;
+    }
    stringstream deviceIndex, deviceName;
    for (int i = 0; i < (int) contexts.size(); i++) {
        if (i > 0) {

--- a/wrappers/python/simtk/openmm/__init__.py
+++ b/wrappers/python/simtk/openmm/__init__.py
-#
-#
-#
-
 """
 Package simtk.openmm

@@ -13,25 +9,7 @@ It also tries to load any plugin modules it can find.

 __author__ = "Randall J. Radmer"

-import os, sys, glob, os.path
-if sys.platform == "win32":
-    libPrefix=""
-    libExt="dll"
-elif sys.platform == 'darwin':
-    libPrefix="lib"
-    libExt="dylib"
-else:
-    libPrefix="lib"
-    libExt="so"
-    # The following is an evil incantation that is needed to permit
-    # the POSIX "dlopen" function to work.  I do not understand
-    # it.  If a better solution is known, please forward to the
-    # PyOpenMM code maintainers.
-    import ctypes
-    flags = sys.getdlopenflags()
-    sys.setdlopenflags(flags | ctypes.RTLD_GLOBAL)
-
-
+import os, os.path
 from simtk.openmm.openmm import *
 from simtk.openmm.vec3 import Vec3
 from simtk.openmm import version