Merge branch 'master' of https://github.com/pandegroup/openmm into nbfix

208d5240 · ChayaSt · 79e76a4e · 20af24c4 · 208d5240 · 208d5240
Commit 208d5240 authored Feb 19, 2016 by ChayaSt
20 changed files
--- a/.travis.yml
+++ b/.travis.yml
@@ -17,6 +17,7 @@ env:
 matrix:
  include:
    - sudo: required
+      dist: trusty
      env: ==CPU_OPENCL==
           OPENCL=true
           CUDA=false
@@ -33,7 +34,9 @@ matrix:
           -DOPENMM_BUILD_AMOEBA_PLUGIN=OFF
           -DOPENMM_BUILD_PYTHON_WRAPPERS=OFF
           -DOPENMM_BUILD_C_AND_FORTRAN_WRAPPERS=OFF
-           -DOPENMM_BUILD_EXAMPLES=OFF"
+           -DOPENMM_BUILD_EXAMPLES=OFF
+           -DOPENCL_INCLUDE_DIR=$HOME/AMDAPPSDK/include
+           -DOPENCL_LIBRARY=$HOME/AMDAPPSDK/lib/x86_64/libOpenCL.so"
      addons: {apt: {packages: []}}
    - sudo: required
@@ -107,9 +110,16 @@ before_install:
      sudo easy_install pytest;
    fi
  - if [[ "$OPENCL" == "true" ]]; then
-      sudo add-apt-repository "deb http://archive.ubuntu.com/ubuntu $(lsb_release -sc) main universe restricted multiverse";
+      wget https://jenkins.choderalab.org/userContent/AMD-APP-SDKInstaller-v3.0.130.135-GA-linux64.tar.bz2;
-      sudo apt-get -yq update > /dev/null 2>&1 ;
+      tar -xjf AMD-APP-SDK*.tar.bz2;
-      sudo apt-get install -qq fglrx=2:8.960-0ubuntu1 opencl-headers;
+      AMDAPPSDK=${HOME}/AMDAPPSDK;
+      export OPENCL_VENDOR_PATH=${AMDAPPSDK}/etc/OpenCL/vendors;
+      mkdir -p ${OPENCL_VENDOR_PATH};
+      sh AMD-APP-SDK*.sh --tar -xf -C ${AMDAPPSDK};
+      echo libamdocl64.so > ${OPENCL_VENDOR_PATH}/amdocl64.icd;
+      export LD_LIBRARY_PATH=${AMDAPPSDK}/lib/x86_64:${LD_LIBRARY_PATH};
+      chmod +x ${AMDAPPSDK}/bin/x86_64/clinfo;
+      ${AMDAPPSDK}/bin/x86_64/clinfo;
    fi
  # Install swig for Python wrappers. However, when testing CUDA and OpenCL, we
  # skip the Python wrapper for speed. We're not using anaconda python,

--- a/devtools/packaging/scripts/linux/build.sh
+++ b/devtools/packaging/scripts/linux/build.sh
@@ -19,13 +19,14 @@ CMAKE_FLAGS="-DCMAKE_INSTALL_PREFIX=$INSTALL"
 # setting the rpath so that libOpenMMPME.so finds the right libfftw3
 #CMAKE_FLAGS+=" -DCMAKE_INSTALL_RPATH=.."
 CMAKE_FLAGS+=" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++"
-CMAKE_FLAGS+=" -DCUDA_CUDART_LIBRARY=/usr/local/cuda-7.0/lib64/libcudart.so"
+CMAKE_FLAGS+=" -DCUDA_CUDART_LIBRARY=/usr/local/cuda-7.5/lib64/libcudart.so"
-CMAKE_FLAGS+=" -DCUDA_NVCC_EXECUTABLE=/usr/local/cuda-7.0/bin/nvcc"
+CMAKE_FLAGS+=" -DCUDA_NVCC_EXECUTABLE=/usr/local/cuda-7.5/bin/nvcc"
-CMAKE_FLAGS+=" -DCUDA_SDK_ROOT_DIR=/usr/local/cuda-7.0/"
+CMAKE_FLAGS+=" -DCUDA_SDK_ROOT_DIR=/usr/local/cuda-7.5/"
-CMAKE_FLAGS+=" -DCUDA_TOOLKIT_INCLUDE=/usr/local/cuda-7.0/include"
+CMAKE_FLAGS+=" -DCUDA_TOOLKIT_INCLUDE=/usr/local/cuda-7.5/include"
-CMAKE_FLAGS+=" -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-7.0/"
+CMAKE_FLAGS+=" -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-7.5/"
-CMAKE_FLAGS+=" -DOPENCL_INCLUDE_DIR=/opt/AMDAPPSDK-2.9-1/include/"
+CMAKE_FLAGS+=" -DOPENCL_INCLUDE_DIR=/opt/AMDAPPSDK-3.0/include/"
-CMAKE_FLAGS+=" -DOPENCL_LIBRARY=/opt/AMDAPPSDK-2.9-1/lib/x86_64/libOpenCL.so"
+CMAKE_FLAGS+=" -DOPENCL_LIBRARY=/opt/AMDAPPSDK-3.0/lib/x86_64/libOpenCL.so"
+CMAKE_FLAGS+=" -DOPENMM_GENERATE_API_DOCS=ON"
 # Set location for FFTW3
 PREFIX="$WORKSPACE/miniconda"
@@ -40,7 +41,8 @@ fi
 mkdir build
 cd build
 cmake ../openmm $CMAKE_FLAGS
-make -j4 all DoxygenApiDocs sphinxpdf
+make -j4 all install
+make -j4 PythonInstall C++ApiDocs PythonApiDocs sphinxpdf
 # Install.
 make install
--- a/devtools/packaging/scripts/linux/prepare.sh
+++ b/devtools/packaging/scripts/linux/prepare.sh
-#!/bin/tcsh
+#!/bin/bash
 # Prepare for build by ensuring necessary prerequisites are locally installed.
@@ -6,16 +6,16 @@
 export WORKSPACE=`pwd`
 # Install miniconda
-export VERSION="Latest"
+export VERSION="latest"
 export PLATFORM="Linux"
 export ARCH="x86_64"
-export MINICONDA="Miniconda-$VERSION-$PLATFORM-$ARCH.sh"
+export MINICONDA="Miniconda2-$VERSION-$PLATFORM-$ARCH.sh"
 if [ -f miniconda ];
 then
   echo "miniconda already exists"
 else
   echo "Downloading miniconda..."
-   rm -rf Miniconda-*
+   rm -rf Miniconda-* miniconda ~/.condarc
   wget --quiet http://repo.continuum.io/miniconda/${MINICONDA}
   bash ${MINICONDA} -b -p miniconda
   PIP_ARGS="-U"
@@ -25,6 +25,6 @@ fi
 export PATH=$WORKSPACE/miniconda/bin:$PATH
 # Ensure configuration is up to date.
-conda config --add channels http://conda.binstar.org/omnia
+conda config --add channels omnia
-conda install --yes --quiet swig fftw3f pip
+conda install --yes --quiet swig fftw3f pip doxygen sphinx sphinxcontrib-bibtex sphinxcontrib-lunrsearch sphinxcontrib-autodoc_doxygen lxml cmake
-pip install sphinxcontrib-bibtex
--- a/devtools/packaging/scripts/osx/build.sh
+++ b/devtools/packaging/scripts/osx/build.sh
@@ -22,6 +22,7 @@ CMAKE_FLAGS="-DCMAKE_INSTALL_PREFIX=$INSTALL"
 CMAKE_FLAGS+=" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++"
 CMAKE_FLAGS+=" -DCMAKE_OSX_DEPLOYMENT_TARGET=10.9"
 CMAKE_FLAGS+=" -DCMAKE_OSX_SYSROOT=/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.9.sdk"
+CMAKE_FLAGS+=" -DOPENMM_GENERATE_API_DOCS=ON"
 # Build in subdirectory.
 # Set location for FFTW3
@@ -37,7 +38,8 @@ fi
 mkdir build
 cd build
 cmake ../openmm $CMAKE_FLAGS
-make -j4 all DoxygenApiDocs sphinxpdf
+make -j4 all install
+make -j4 PythonInstall C++ApiDocs PythonApiDocs sphinxpdf
 # Install.
 make install
--- a/devtools/packaging/scripts/osx/prepare.sh
+++ b/devtools/packaging/scripts/osx/prepare.sh
@@ -27,4 +27,4 @@ export PATH=$WORKSPACE/miniconda/bin:$PATH
 # Ensure configuration is up to date.
 conda config --add channels http://conda.binstar.org/omnia
 conda install --yes --quiet swig fftw3f pip
-pip install sphinxcontrib-bibtex
+pip install sphinxcontrib-bibtex sphinxcontrib-lunrsearch sphinxcontrib-autodoc_doxygen
--- a/devtools/packaging/scripts/source/build.sh
+++ b/devtools/packaging/scripts/source/build.sh
@@ -28,6 +28,7 @@ CMAKE_FLAGS+=" -DOPENMM_BUILD_DRUDE_OPENCL_LIB=OFF"
 CMAKE_FLAGS+=" -DOPENMM_BUILD_OPENCL_LIB=OFF"
 CMAKE_FLAGS+=" -DOPENMM_BUILD_RPMD_CUDA_LIB=OFF"
 CMAKE_FLAGS+=" -DOPENMM_BUILD_RPMD_OPENCL_LIB=OFF"
+CMAKE_FLAGS+=" -DOPENMM_GENERATE_API_DOCS=ON"
 # Set location for FFTW3
 #PREFIX="$WORKSPACE/miniconda"
@@ -42,7 +43,8 @@ fi
 mkdir build
 cd build
 cmake ../openmm $CMAKE_FLAGS
-make -j4 all DoxygenApiDocs sphinxpdf
+make -j4 all install
+make -j4 PythonInstall C++ApiDocs PythonApiDocs sphinxpdf
 # Install.
 make install
--- a/devtools/packaging/scripts/source/prepare.sh
+++ b/devtools/packaging/scripts/source/prepare.sh
-#!/bin/tcsh
+#!/bin/bash
 # Prepare for build by ensuring necessary prerequisites are locally installed.
 # Set relative workspace path.
 export WORKSPACE=`pwd`
 # Install miniconda
-export VERSION="Latest"
+export VERSION="latest"
 export PLATFORM="Linux"
 export ARCH="x86_64"
-export MINICONDA="Miniconda-$VERSION-$PLATFORM-$ARCH.sh"
+export MINICONDA="Miniconda2-$VERSION-$PLATFORM-$ARCH.sh"
 if [ -f miniconda ];
 then
   echo "miniconda already exists"
 else
   echo "Downloading miniconda..."
-   rm -rf Miniconda-*
+   rm -rf Miniconda-* miniconda ~/.condarc
   wget --quiet http://repo.continuum.io/miniconda/${MINICONDA}
   bash ${MINICONDA} -b -p miniconda
   PIP_ARGS="-U"
@@ -25,6 +26,6 @@ fi
 export PATH=$WORKSPACE/miniconda/bin:$PATH
 # Ensure configuration is up to date.
-conda config --add channels http://conda.binstar.org/omnia
+conda config --add channels omnia
-conda install --yes --quiet swig fftw3f pip
+conda install --yes --quiet swig fftw3f pip doxygen sphinx sphinxcontrib-bibtex sphinxcontrib-lunrsearch sphinxcontrib-autodoc_doxygen lxml cmake
-pip install sphinxcontrib-bibtex
--- a/docs-source/usersguide/application.rst
+++ b/docs-source/usersguide/application.rst
@@ -2070,7 +2070,7 @@ Missing residue templates
 .. CAUTION::
   These features are experimental, and their API is subject to change.
-You can use the :method:`getUnmatchedResidues()` method to get a list of residues
+You can use the :meth:`getUnmatchedResidues()` method to get a list of residues
 in the provided :code:`topology` object that do not currently have a matching
 residue template defined in the :class:`ForceField`.
 ::
@@ -2084,7 +2084,7 @@ with residue template definitions, or identifying which additional residues need
 to be parameterized.
 As a convenience for parameterizing new residues, you can also get a list of
-residues and empty residue templates using :method:`generateTemplatesForUnmatchedResidues`
+residues and empty residue templates using :meth:`generateTemplatesForUnmatchedResidues`
 ::
    pdb = PDBFile('input.pdb')
@@ -2098,7 +2098,7 @@ residues and empty residue templates using :method:`generateTemplatesForUnmatche
        forcefield.registerResidueTemplate(template)
 If you find that templates seem to be incorrectly matched, another useful
-function :method:`getMatchingTemplates()` can help you identify which templates
+function :meth:`getMatchingTemplates()` can help you identify which templates
 are being matched:
 ::

--- a/libraries/irrxml/include/CXMLReaderImpl.h
+++ b/libraries/irrxml/include/CXMLReaderImpl.h
@@ -2,6 +2,8 @@
 // This file is part of the "Irrlicht Engine" and the "irrXML" project.
 // For conditions of distribution and use, see copyright notice in irrlicht.h and/or irrXML.h
+// MODIFIED by Peter Eastman, Feb. 4, 2016, to support numeric escape sequences
 #ifndef __ICXML_READER_IMPL_H_INCLUDED__
 #define __ICXML_READER_IMPL_H_INCLUDED__
@@ -529,10 +531,37 @@ private:
 				pos += SpecialCharacters[specialChar].size();
 			}
 			else
+			{
+				int semicolonPos = origstr.findNext(L';', pos);
+				if (semicolonPos != -1 && origstr.c_str()[pos+1] == L'#')
+				{
+					// it is a numeric character reference
+					int number;
+					core::string<char> numberString;
+					if (origstr.c_str()[pos+2] == L'x')
+					{
+						// hex value
+						for (int i=pos+3; i<semicolonPos; ++i)
+							numberString.append((char) origstr[i]);
+						sscanf(numberString.c_str(), "%x", &number);
+					}
+					else
+					{
+						// decimal value
+						for (int i=pos+2; i<semicolonPos; ++i)
+							numberString.append((char) origstr[i]);
+						sscanf(numberString.c_str(), "%d", &number);
+					}
+					newstr.append(origstr.subString(oldPos, pos - oldPos));
+					newstr.append((char_type) number);
+					pos = semicolonPos+1;
+				}
+				else
 				{
 					newstr.append(origstr.subString(oldPos, pos - oldPos + 1));
 					pos += 1;
 				}
+			}
 			// find next &
 			oldPos = pos;

--- a/openmmapi/include/openmm/internal/timer.h
+++ b/openmmapi/include/openmm/internal/timer.h
+#ifndef OPENMM_TIMER_H_
+#define OPENMM_TIMER_H_
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2016 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+/**
+ * This header provides a static function for querying the current system time in seconds.
+ * It is useful when profiling.
+ */
+#ifdef _MSC_VER
+    #include <Windows.h>
+    static double getCurrentTime() {
+        FILETIME ft;
+        GetSystemTimeAsFileTime(&ft); // 100-nanoseconds since 1-1-1601
+        ULARGE_INTEGER result;
+        result.LowPart = ft.dwLowDateTime;
+        result.HighPart = ft.dwHighDateTime;
+        return 1e-7*result.QuadPart;
+    }
+#else
+    #include <sys/time.h> 
+    static double getCurrentTime() {
+        struct timeval tod;
+        gettimeofday(&tod, 0);
+        return tod.tv_sec+1e-6*tod.tv_usec;
+    }
+#endif
+#endif /*OPENMM_TIMER_H_*/
--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2015 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2016 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -192,6 +192,7 @@ public:
     */
    void loadCheckpoint(ContextImpl& context, std::istream& stream);
 private:
+    class GetPositionsTask;
    CudaContext& cu;
 };

--- a/platforms/cuda/include/CudaPlatform.h
+++ b/platforms/cuda/include/CudaPlatform.h
@@ -29,6 +29,7 @@
 #include "openmm/Platform.h"
 #include "openmm/System.h"
+#include "openmm/internal/ThreadPool.h"
 #include "windowsExportCuda.h"
 namespace OpenMM {
@@ -122,7 +123,7 @@ class OPENMM_EXPORT_CUDA CudaPlatform::PlatformData {
 public:
    PlatformData(ContextImpl* context, const System& system, const std::string& deviceIndexProperty, const std::string& blockingProperty, const std::string& precisionProperty,
            const std::string& cpuPmeProperty, const std::string& compilerProperty, const std::string& tempProperty, const std::string& hostCompilerProperty,
-            const std::string& pmeStreamProperty);
+            const std::string& pmeStreamProperty, int numThreads);
    ~PlatformData();
    void initializeContexts(const System& system);
    void syncContexts();
@@ -134,6 +135,7 @@ public:
    int stepCount, computeForceCount;
    double time;
    std::map<std::string, std::string> propertyValues;
+    ThreadPool threads;
 };
 } // namespace OpenMM

--- a/platforms/cuda/src/CudaExpressionUtilities.cpp
+++ b/platforms/cuda/src/CudaExpressionUtilities.cpp
@@ -351,7 +351,7 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
            break;
        }
        case Operation::POWER:
-            out << "pow(" << getTempName(node.getChildren()[0], temps) << ", " << getTempName(node.getChildren()[1], temps) << ")";
+            out << "pow((" << tempType << ") " << getTempName(node.getChildren()[0], temps) << ", (" << tempType << ") " << getTempName(node.getChildren()[1], temps) << ")";
            break;
        case Operation::NEGATE:
            out << "-" << getTempName(node.getChildren()[0], temps);
@@ -488,14 +488,14 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
                out << "}";
            }
            else
-                out << "pow(" << getTempName(node.getChildren()[0], temps) << ", " << context.doubleToString(exponent) << ")";
+                out << "pow((" << tempType << ") " << getTempName(node.getChildren()[0], temps) << ", (" << tempType << ") " << context.doubleToString(exponent) << ")";
            break;
        }
        case Operation::MIN:
-            out << "min(" << getTempName(node.getChildren()[0], temps) << ", " << getTempName(node.getChildren()[1], temps) << ")";
+            out << "min((" << tempType << ") " << getTempName(node.getChildren()[0], temps) << ", (" << tempType << ") " << getTempName(node.getChildren()[1], temps) << ")";
            break;
        case Operation::MAX:
-            out << "max(" << getTempName(node.getChildren()[0], temps) << ", " << getTempName(node.getChildren()[1], temps) << ")";
+            out << "max((" << tempType << ") " << getTempName(node.getChildren()[0], temps) << ", (" << tempType << ") " << getTempName(node.getChildren()[1], temps) << ")";
            break;
        case Operation::ABS:
            out << "fabs(" << getTempName(node.getChildren()[0], temps) << ")";

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -141,17 +141,23 @@ void CudaUpdateStateDataKernel::setTime(ContextImpl& context, double time) {
        contexts[i]->setTime(time);
 }
-void CudaUpdateStateDataKernel::getPositions(ContextImpl& context, vector<Vec3>& positions) {
+class CudaUpdateStateDataKernel::GetPositionsTask : public ThreadPool::Task {
-    cu.setAsCurrent();
+public:
+    GetPositionsTask(CudaContext& cu, vector<Vec3>& positions, vector<float4>& posCorrection) : cu(cu), positions(positions), posCorrection(posCorrection) {
+    }
+    void execute(ThreadPool& threads, int threadIndex) {
+        // Compute the position of each particle to return to the user.  This is done in parallel for speed.
        const vector<int>& order = cu.getAtomIndex();
-    int numParticles = context.getSystem().getNumParticles();
+        int numParticles = cu.getNumAtoms();
-    positions.resize(numParticles);
        Vec3 boxVectors[3];
        cu.getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
+        int numThreads = threads.getNumThreads();
+        int start = threadIndex*numParticles/numThreads;
+        int end = (threadIndex+1)*numParticles/numThreads;
        if (cu.getUseDoublePrecision()) {
            double4* posq = (double4*) cu.getPinnedBuffer();
-        cu.getPosq().download(posq);
+            for (int i = start; i < end; ++i) {
-        for (int i = 0; i < numParticles; ++i) {
                double4 pos = posq[i];
                int4 offset = cu.getPosCellOffsets()[i];
                positions[order[i]] = Vec3(pos.x, pos.y, pos.z)-boxVectors[0]*offset.x-boxVectors[1]*offset.y-boxVectors[2]*offset.z;
@@ -159,10 +165,7 @@ void CudaUpdateStateDataKernel::getPositions(ContextImpl& context, vector<Vec3>&
        }
        else if (cu.getUseMixedPrecision()) {
            float4* posq = (float4*) cu.getPinnedBuffer();
-        vector<float4> posCorrection;
+            for (int i = start; i < end; ++i) {
-        cu.getPosq().download(posq);
-        cu.getPosqCorrection().download(posCorrection);
-        for (int i = 0; i < numParticles; ++i) {
                float4 pos1 = posq[i];
                float4 pos2 = posCorrection[i];
                int4 offset = cu.getPosCellOffsets()[i];
@@ -171,13 +174,43 @@ void CudaUpdateStateDataKernel::getPositions(ContextImpl& context, vector<Vec3>&
        }
        else {
            float4* posq = (float4*) cu.getPinnedBuffer();
-        cu.getPosq().download(posq);
+            for (int i = start; i < end; ++i) {
-        for (int i = 0; i < numParticles; ++i) {
                float4 pos = posq[i];
                int4 offset = cu.getPosCellOffsets()[i];
                positions[order[i]] = Vec3(pos.x, pos.y, pos.z)-boxVectors[0]*offset.x-boxVectors[1]*offset.y-boxVectors[2]*offset.z;
            }
        }
+    }
+    CudaContext& cu;
+    vector<Vec3>& positions;
+    vector<float4>& posCorrection;
+};
+void CudaUpdateStateDataKernel::getPositions(ContextImpl& context, vector<Vec3>& positions) {
+    cu.setAsCurrent();
+    int numParticles = context.getSystem().getNumParticles();
+    positions.resize(numParticles);
+    vector<float4> posCorrection;
+    if (cu.getUseDoublePrecision()) {
+        double4* posq = (double4*) cu.getPinnedBuffer();
+        cu.getPosq().download(posq);
+    }
+    else if (cu.getUseMixedPrecision()) {
+        float4* posq = (float4*) cu.getPinnedBuffer();
+        cu.getPosq().download(posq, false);
+        posCorrection.resize(numParticles);
+        cu.getPosqCorrection().download(posCorrection);
+    }
+    else {
+        float4* posq = (float4*) cu.getPinnedBuffer();
+        cu.getPosq().download(posq);
+    }
+    // Filling in the output array is done in parallel for speed.
+    GetPositionsTask task(cu, positions, posCorrection);
+    cu.getPlatformData().threads.execute(task);
+    cu.getPlatformData().threads.waitForThreads();
 }
 void CudaUpdateStateDataKernel::setPositions(ContextImpl& context, const vector<Vec3>& positions) {
@@ -6628,12 +6661,12 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
            if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
                double value;
                summedValue->download(&value);
-                globalValuesDouble[stepTarget[step].variableIndex] = value;
+                recordGlobalValue(value, stepTarget[step]);
            }
            else {
                float value;
                summedValue->download(&value);
-                globalValuesDouble[stepTarget[step].variableIndex] = value;
+                recordGlobalValue(value, stepTarget[step]);
            }
        }
        else if (stepType[step] == CustomIntegrator::UpdateContextState) {
@@ -6742,6 +6775,7 @@ void CudaIntegrateCustomStepKernel::recordGlobalValue(double value, GlobalTarget
        case DT:
            if (value != globalValuesDouble[dtVariableIndex])
                deviceGlobalsAreCurrent = false;
+            expressionSet.setVariable(dtVariableIndex, value);
            globalValuesDouble[dtVariableIndex] = value;
            cu.getIntegrationUtilities().setNextStepSize(value);
            break;

--- a/platforms/cuda/src/CudaPlatform.cpp
+++ b/platforms/cuda/src/CudaPlatform.cpp
@@ -29,9 +29,10 @@
 #include "CudaPlatform.h"
 #include "CudaKernelFactory.h"
 #include "CudaKernels.h"
-#include "openmm/internal/ContextImpl.h"
 #include "openmm/Context.h"
 #include "openmm/System.h"
+#include "openmm/internal/ContextImpl.h"
+#include "openmm/internal/hardware.h"
 #include <algorithm>
 #include <cctype>
 #include <sstream>
@@ -175,7 +176,11 @@ void CudaPlatform::contextCreated(ContextImpl& context, const map<string, string
    pmeKernelName.push_back(CalcPmeReciprocalForceKernel::Name());
    if (!supportsKernels(pmeKernelName))
        cpuPmePropValue = "false";
-    context.setPlatformData(new PlatformData(&context, context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, cpuPmePropValue, compilerPropValue, tempPropValue, hostCompilerPropValue, pmeStreamPropValue));
+    int threads = getNumProcessors();
+    char* threadsEnv = getenv("OPENMM_CPU_THREADS");
+    if (threadsEnv != NULL)
+        stringstream(threadsEnv) >> threads;
+    context.setPlatformData(new PlatformData(&context, context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, cpuPmePropValue, compilerPropValue, tempPropValue, hostCompilerPropValue, pmeStreamPropValue, threads));
 }
 void CudaPlatform::contextDestroyed(ContextImpl& context) const {
@@ -184,7 +189,8 @@ void CudaPlatform::contextDestroyed(ContextImpl& context) const {
 }
 CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& system, const string& deviceIndexProperty, const string& blockingProperty, const string& precisionProperty,
-            const string& cpuPmeProperty, const string& compilerProperty, const string& tempProperty, const string& hostCompilerProperty, const string& pmeStreamProperty) : context(context), removeCM(false), stepCount(0), computeForceCount(0), time(0.0), hasInitializedContexts(false) {
+            const string& cpuPmeProperty, const string& compilerProperty, const string& tempProperty, const string& hostCompilerProperty, const string& pmeStreamProperty, int numThreads) :
+                context(context), removeCM(false), stepCount(0), computeForceCount(0), time(0.0), hasInitializedContexts(false), threads(numThreads) {
    bool blocking = (blockingProperty == "true");
    vector<string> devices;
    size_t searchPos = 0, nextPos;

--- a/platforms/cuda/src/kernels/customManyParticle.cu
+++ b/platforms/cuda/src/kernels/customManyParticle.cu
@@ -59,7 +59,7 @@ inline __device__ real4 computeCross(real4 vec1, real4 vec2) {
 /**
 * Determine whether a particular interaction is in the list of exclusions.
 */
-inline __device__ bool isInteractionExcluded(int atom1, int atom2, int* __restrict__ exclusions, int* __restrict__ exclusionStartIndex) {
+inline __device__ bool isInteractionExcluded(int atom1, int atom2, const int* __restrict__ exclusions, const int* __restrict__ exclusionStartIndex) {
    int first = exclusionStartIndex[atom1];
    int last = exclusionStartIndex[atom1+1];
    for (int i = last-1; i >= first; i--) {
@@ -180,7 +180,7 @@ extern "C" __global__ void findNeighbors(real4 periodicBoxSize, real4 invPeriodi
        const real4* __restrict__ posq, const real4* __restrict__ blockCenter, const real4* __restrict__ blockBoundingBox, int2* __restrict__ neighborPairs,
        int* __restrict__ numNeighborPairs, int* __restrict__ numNeighborsForAtom, int maxNeighborPairs
 #ifdef USE_EXCLUSIONS
-        , int* __restrict__ exclusions, int* __restrict__ exclusionStartIndex
+        , const int* __restrict__ exclusions, const int* __restrict__ exclusionStartIndex
 #endif
        ) {
    __shared__ real3 positionCache[FIND_NEIGHBORS_WORKGROUP_SIZE];
@@ -265,6 +265,7 @@ extern "C" __global__ void findNeighbors(real4 periodicBoxSize, real4 invPeriodi
                }
            }
        }
+        if (atom1 < NUM_ATOMS)
            numNeighborsForAtom[atom1] = totalNeighborsForAtom1;
    }
 }
@@ -308,6 +309,7 @@ extern "C" __global__ void computeNeighborStartIndices(int* __restrict__ numNeig
            numNeighborsForAtom[globalIndex] = 0; // Clear this so the next kernel can use it as a counter
        }
        globalOffset += posBuffer[blockDim.x-1];
+        __syncthreads();
    }
    if (threadIdx.x == 0)
        neighborStartIndex[0] = 0;

--- a/platforms/cuda/tests/TestCudaFFT3D.cpp
+++ b/platforms/cuda/tests/TestCudaFFT3D.cpp
@@ -56,7 +56,7 @@ void testTransform(bool realToComplex, int xsize, int ysize, int zsize) {
    system.addParticle(0.0);
    CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), "false",
            platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()),
-            platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()));
+            platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), 1);
    CudaContext& context = *platformData.contexts[0];
    context.initialize();
    OpenMM_SFMT::SFMT sfmt;

--- a/platforms/cuda/tests/TestCudaRandom.cpp
+++ b/platforms/cuda/tests/TestCudaRandom.cpp
@@ -56,7 +56,7 @@ void testGaussian() {
        system.addParticle(1.0);
    CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), "false",
            platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()),
-            platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()));
+            platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), 1);
    CudaContext& context = *platformData.contexts[0];
    context.initialize();
    context.getIntegrationUtilities().initRandomNumberGenerator(0);

--- a/platforms/cuda/tests/TestCudaSort.cpp
+++ b/platforms/cuda/tests/TestCudaSort.cpp
@@ -66,7 +66,7 @@ void verifySorting(vector<float> array) {
    system.addParticle(0.0);
    CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), "false",
            platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()),
-            platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()));
+            platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), 1);
    CudaContext& context = *platformData.contexts[0];
    context.initialize();
    CudaArray data(context, array.size(), 4, "sortData");

--- a/platforms/opencl/include/OpenCLExpressionUtilities.h
+++ b/platforms/opencl/include/OpenCLExpressionUtilities.h
@@ -74,7 +74,7 @@ public:
     */
    std::string createExpressions(const std::map<std::string, Lepton::ParsedExpression>& expressions, const std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& variables,
            const std::vector<const TabulatedFunction*>& functions, const std::vector<std::pair<std::string, std::string> >& functionNames,
-            const std::string& prefix, const std::string& tempType="float");
+            const std::string& prefix, const std::string& tempType="real");
    /**
     * Calculate the spline coefficients for a tabulated function that appears in expressions.
     *