Merge https://github.com/openmm/openmm

5a06df78 · tic20 · 8dd60914 · a9223eea · 5a06df78 · 5a06df78
Commit 5a06df78 authored Mar 04, 2020 by tic20
20 changed files
--- a/.azure-pipelines/azure-pipelines-windows.yml
+++ b/.azure-pipelines/azure-pipelines-windows.yml
+jobs:
+
+  # Configure, build, install, and test job
+  - job: 'windows_build'
+    displayName: 'Windows VS2015'
+    pool:
+      vmImage: 'vs2015-win2012r2'
+    # Upper bound for the whole job: 360 minutes (six hours).
+    timeoutInMinutes: 360
+    variables:
+      # NOTE(review): llvm.version and mkl.version are not referenced by any
+      # step visible in this file -- confirm whether they are still needed.
+      llvm.version: '7.0.1'
+      mkl.version: '2019.1'
+      # Python release requested from conda in the "Install conda packages" step.
+      python.version: '3.6'
+      # Passed to CMake as CMAKE_BUILD_TYPE and as --config when building.
+      cmake.build.type: 'Release'
+    steps:
+      # Install Chocolatey (https://chocolatey.org/install#install-with-powershellexe)
+      # The official install script is downloaded and executed with iex.  The
+      # PATH seen by this PowerShell session is then published as the pipeline
+      # variable PATH through the task.setvariable logging command, and
+      # "choco --version" confirms that the tool can be run.
+      - powershell: |
+          Set-ExecutionPolicy Bypass -Scope Process -Force
+          iex ((New-Object System.Net.WebClient).DownloadString('https://chocolatey.org/install.ps1'))
+          Write-Host "##vso[task.setvariable variable=PATH]$env:PATH"
+          choco --version
+        displayName: "Install Chocolatey"
+      # Install Miniconda and the other command line tools used by later steps
+      # (Doxygen, SWIG, Graphviz, 7-Zip, wget) with Chocolatey, then publish
+      # the extended PATH and LIB as pipeline variables.
+      # The logging commands are echoed without surrounding quotes: cmd.exe
+      # does not strip single quotes, so a closing quote would become part of
+      # the published variable value.
+      - script: |
+          choco install -y miniconda3
+          choco install -y doxygen.install
+          choco install -y swig
+          choco install -y graphviz
+          choco install -y 7zip.install
+          choco install -y wget
+          set PATH=C:\tools\miniconda3\Scripts;C:\tools\miniconda3;C:\tools\miniconda3\Library\bin;%PATH%
+          echo ##vso[task.setvariable variable=PATH]%PATH%
+          set LIB=C:\tools\miniconda3\Library\lib;%LIB%
+          echo ##vso[task.setvariable variable=LIB]%LIB%
+          conda --version
+        displayName: "Install Miniconda"
+      # Configure Miniconda
+      # Setting always_yes lets the conda commands in later steps proceed
+      # without a confirmation prompt; "conda info" prints the resulting
+      # configuration to the build log.
+      - script: |
+          conda config --set always_yes yes
+          conda info
+        displayName: "Configure Miniconda"
+      # Install the build and test dependencies with conda.
+      # No -n/--name option is passed, so the packages are installed into the
+      # environment conda is already using (presumably "base") rather than
+      # into a newly created environment.
+      # Note: conda activate doesn't work here, because it creates a new shell!
+      # The trailing ^ characters are cmd.exe line continuations; the Python
+      # release comes from the python.version pipeline variable.
+      - script: |
+          conda install cmake ^
+                        cython ^
+                        fftw ^
+                        ninja ^
+                        numpy ^
+                        pytest ^
+                        pytest-xdist ^
+                        python=$(python.version)
+          conda list
+        displayName: "Install conda packages"
+      # Download OpenCL Headers and build the ICD loader
+      # The ICD loader sources are unpacked into $(Pipeline.Workspace)\opencl,
+      # the Khronos headers are moved into inc\CL, and the loader is built with
+      # CMake/Ninja inside the lib subdirectory.
+      # Unwanted command output is discarded through NUL, the cmd.exe null
+      # device.  ($null only exists in PowerShell; in a cmd script "> $null"
+      # writes a file literally named "$null" into the working directory.)
+      - script: |
+          setlocal EnableDelayedExpansion
+          call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" x86_amd64
+          mkdir opencl
+          cd opencl
+          wget https://www.khronos.org/registry/cl/specs/opencl-icd-1.2.11.0.tgz -O opencl-icd-1.2.11.0.tgz
+          7z x opencl-icd-1.2.11.0.tgz > NUL
+          7z x opencl-icd-1.2.11.0.tar > NUL
+          robocopy .\icd . /E /MOVE
+          mkdir inc\CL > NUL
+          wget https://github.com/KhronosGroup/OpenCL-Headers/archive/master.zip
+          7z x master.zip
+          move .\OpenCL-Headers-master\CL\*.h .\inc\CL\
+          mkdir lib > NUL
+          cd lib
+          cmake -G Ninja ..
+          cmake --build . ^
+                -- -j %NUMBER_OF_PROCESSORS%
+        displayName: "Download and install OpenCL"
+        workingDirectory: $(Pipeline.Workspace)
+      # Configure
+      # Runs CMake with the Ninja generator from a "build" directory created
+      # under $(Build.BinariesDirectory).  The OpenCL include directory and
+      # OpenCL.lib are taken from $(Pipeline.Workspace)/opencl, the install
+      # prefix is ../install, and the examples and OpenCL tests are disabled.
+      # vcvarsall.bat sets up the Visual Studio 14.0 x86_amd64 toolchain first.
+      - script: |
+          setlocal EnableDelayedExpansion
+          call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" x86_amd64
+          mkdir build & cd build
+          cmake -G Ninja ^
+                -DOPENCL_INCLUDE_DIR=$(Pipeline.Workspace)/opencl/inc ^
+                -DOPENCL_LIBRARY=$(Pipeline.Workspace)/opencl/lib/OpenCL.lib ^
+                -DCMAKE_BUILD_TYPE=$(cmake.build.type) ^
+                -DCMAKE_INSTALL_PREFIX=../install ^
+                -DOPENMM_BUILD_EXAMPLES=OFF ^
+                -DOPENMM_BUILD_OPENCL_TESTS=OFF ^
+                $(Build.SourcesDirectory)
+        displayName: "Configure OpenMM with CMake"
+        workingDirectory: $(Build.BinariesDirectory)
+      # Build
+      # Builds the default target with one job per processor, then runs the
+      # "install" and "PythonInstall" targets from the same build directory.
+      # vcvarsall.bat sets up the Visual Studio 14.0 x86_amd64 toolchain first.
+      - script: |
+          call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" x86_amd64
+          cmake --build . ^
+                --config $(cmake.build.type) ^
+                -- -j %NUMBER_OF_PROCESSORS%
+          cmake --build . --target install
+          cmake --build . --target PythonInstall
+        displayName: "Build OpenMM"
+        workingDirectory: $(Build.BinariesDirectory)/build
+      # Test
+      # Runs devtools/run-ctest.py from the build directory, then the Python
+      # tests in python\tests with py.test, using one worker per processor.
+      # NOTE(review): PYTHONPATH points at D:\tools\miniconda3, while the
+      # "Install Miniconda" step puts C:\tools\miniconda3 on PATH -- confirm
+      # which drive is correct.
+      # NOTE(review): the commands are not chained with error checks, so a
+      # failure in run-ctest.py may not fail the step -- confirm.
+      - script: |
+          python $(Build.SourcesDirectory)\devtools\run-ctest.py --job-duration 50 --parallel %NUMBER_OF_PROCESSORS%
+          cd python\tests
+          python --version
+          set PYTHONPATH=D:\tools\miniconda3\Lib\site-packages
+          dir %PYTHONPATH%
+          py.test -v -n %NUMBER_OF_PROCESSORS%
+        workingDirectory: $(Build.BinariesDirectory)/build
+        displayName: "Run OpenMM tests"
--- a/.travis.yml
+++ b/.travis.yml
@@ -14,12 +14,12 @@ env:
    - CCACHE=$HOME/ccache/lib/ccache/bin


-matrix:
+jobs:
  include:
    - sudo: required
-      dist: trusty
-      env: ==CPU_OPENCL==
-           OPENCL=true
+      dist: xenial
+      name: "CPU OpenCL"
+      env: OPENCL=true
           CUDA=false
           CC=$CCACHE/gcc
           CXX=$CCACHE/g++
@@ -40,9 +40,9 @@ matrix:
      addons: {apt: {packages: []}}

    - sudo: required
-      dist: trusty
-      env: ==CUDA_COMPILE==
-           CUDA=true
+      dist: xenial
+      name: "CUDA Compile"
+      env: CUDA=true
           OPENCL=false
           CUDA_VERSION="7.5-18"
           CC=$CCACHE/gcc
@@ -66,28 +66,28 @@ matrix:
    - language: objective-c
      os: osx
      osx_image: xcode9.3
-      env: ==OSX==
-           OPENCL=false
+      name: "Mac OS"
+      env: OPENCL=false
           CUDA=false
           CMAKE_FLAGS="
             -DOPENMM_BUILD_OPENCL_TESTS=OFF"
      addons: {apt: {packages: []}}

    - sudo: false
-      dist: trusty
-      python: 3.6
-      env: ==STATIC_LIB==
-           OPENCL=false
+      dist: xenial
+      python: "3.6"
+      name: "Static Lib"
+      env: OPENCL=false
           CUDA=false
           CC=$CCACHE/clang
           CXX=$CCACHE/clang++
           CMAKE_FLAGS="-DOPENMM_BUILD_STATIC_LIB=ON"

    - sudo: false
-      dist: trusty
-      python: 3.6
-      env: ==PYTHON_3_6==
-           OPENCL=false
+      dist: xenial
+      python: "3.6"
+      name: "Python 3.6"
+      env: OPENCL=false
           CUDA=false
           CC=$CCACHE/clang
           CXX=$CCACHE/clang++
@@ -96,9 +96,9 @@ matrix:

    - sudo: false
      dist: xenial
-      python: 3.8
-      env: ==PYTHON_3_8==
-           OPENCL=false
+      python: "3.8"
+      name: "Python 3.8"
+      env: OPENCL=false
           CUDA=false
           CC=$CCACHE/gcc
           CXX=$CCACHE/g++

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -341,6 +341,12 @@ IF(OPENMM_BUILD_OPENCL_LIB)
    ADD_SUBDIRECTORY(platforms/opencl)
 ENDIF(OPENMM_BUILD_OPENCL_LIB)

+# Common compute files
+
+IF(CUDA_FOUND OR OPENCL_FOUND)
+    ADD_SUBDIRECTORY(platforms/common)
+ENDIF()
+
 # Optimized CPU platform

 SET(OPENMM_BUILD_CPU_LIB ON CACHE BOOL "Build optimized CPU platform")

--- a/platforms/opencl/EncodeCLFiles.cmake
+++ b/platforms/opencl/EncodeCLFiles.cmake
-FILE(GLOB OPENCL_KERNELS ${CL_SOURCE_DIR}/kernels/*.cl)
-SET(CL_FILE_DECLARATIONS)
-SET(CL_FILE_DEFINITIONS)
-CONFIGURE_FILE(${CL_SOURCE_DIR}/${CL_SOURCE_CLASS}.cpp.in ${CL_KERNELS_CPP})
-FOREACH(file ${OPENCL_KERNELS})
+FILE(GLOB KERNEL_FILES ${KERNEL_SOURCE_DIR}/kernels/*.${KERNEL_FILE_EXTENSION})
+SET(KERNEL_FILE_DECLARATIONS)
+CONFIGURE_FILE(${KERNEL_SOURCE_DIR}/${KERNEL_SOURCE_CLASS}.cpp.in ${KERNELS_CPP})
+FOREACH(file ${KERNEL_FILES})
    # Load the file contents and process it.
    FILE(STRINGS ${file} file_content NEWLINE_CONSUME)
    # Replace all backslashes by double backslashes as they are being put in a C string.
@@ -15,13 +14,13 @@ FOREACH(file ${OPENCL_KERNELS})
    STRING(REPLACE "\n" "\\n\"\n\"" file_content "${file_content}")

    # Determine a name for the variable that will contain this file's contents
-    FILE(RELATIVE_PATH filename ${CL_SOURCE_DIR}/kernels ${file})
+    FILE(RELATIVE_PATH filename ${KERNEL_SOURCE_DIR}/kernels ${file})
    STRING(LENGTH ${filename} filename_length)
    MATH(EXPR filename_length ${filename_length}-3)
    STRING(SUBSTRING ${filename} 0 ${filename_length} variable_name)

    # Record the variable declaration and definition.
-    SET(CL_FILE_DECLARATIONS ${CL_FILE_DECLARATIONS}static\ const\ std::string\ ${variable_name};\n)
-    FILE(APPEND ${CL_KERNELS_CPP} const\ string\ ${CL_SOURCE_CLASS}::${variable_name}\ =\ \"${file_content}\"\;\n)
+    SET(KERNEL_FILE_DECLARATIONS ${KERNEL_FILE_DECLARATIONS}static\ const\ std::string\ ${variable_name};\n)
+    FILE(APPEND ${KERNELS_CPP} const\ string\ ${KERNEL_SOURCE_CLASS}::${variable_name}\ =\ \"${file_content}\"\;\n)
 ENDFOREACH(file)
-CONFIGURE_FILE(${CL_SOURCE_DIR}/${CL_SOURCE_CLASS}.h.in ${CL_KERNELS_H})
+CONFIGURE_FILE(${KERNEL_SOURCE_DIR}/${KERNEL_SOURCE_CLASS}.h.in ${KERNELS_H})
--- a/docs-source/api-c++/Doxyfile.in
+++ b/docs-source/api-c++/Doxyfile.in
@@ -588,7 +588,7 @@ INPUT_ENCODING         = UTF-8
 # *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx
 # *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.py *.f90

-FILE_PATTERNS          =
+FILE_PATTERNS          = *.h

 # The RECURSIVE tag can be used to specify whether or not subdirectories
 # should be searched for input files as well. Possible values are YES and NO.

--- a/docs-source/developerguide/developer.rst
+++ b/docs-source/developerguide/developer.rst
@@ -32,6 +32,8 @@ It is organized as follows:
  information relevant to writing OpenCL implementations of new features.
 * Chapter :ref:`the-cuda-platform` discusses the architecture of the CUDA Platform, providing
  information relevant to writing CUDA implementations of new features.
+* Chapter :ref:`common-compute` describes the Common Compute framework, which lets you
+  write a single implementation of a feature that can be used for both OpenCL and CUDA.


 This guide assumes you are already familiar with the public API and how to use
@@ -214,9 +216,9 @@ plugins happen to be loaded in.
 Creating New Platforms
 **********************

-One common type of plugin defines a new Platform.  There are three such plugins
-that come with OpenMM: one for the CPU Platform, one for the CUDA Platform, and
-one for the OpenCL Platform.
+One common type of plugin defines a new Platform.  There are four such plugins
+that come with OpenMM: one for the Reference Platform, one for the CPU Platform,
+one for the CUDA Platform, and one for the OpenCL Platform.

 To define a new Platform, you must create subclasses of the various abstract
 classes in the OpenMM Low Level API: a subclass of Platform, one or more
@@ -456,15 +458,22 @@ It also defines vector versions of these types (\ :code:`real2`\ ,
 Computing Forces
 ****************

-When forces are computed, they are stored in multiple buffers.  This is done to
-enable multiple work-items or work-groups to compute forces on the same particle
-at the same time; as long as each one writes to a different buffer, there is no
-danger of race conditions.  At the start of a force calculation, all forces in
-all buffers are set to zero.   Each Force is then free to add its contributions
-to any or all of the buffers.  Finally, the buffers are summed to produce the
-total force on each particle.
-
-The size of each buffer is equal to the number of particles, rounded up to the
+When forces are computed, they can be stored in either of two places.  There is
+an array of :code:`long` values storing them as 64 bit fixed point values, and
+a collection of buffers of :code:`real4` values storing them in floating point
+format.  Most GPUs support atomic operations on 64 bit integers, which allows
+many threads to simultaneously record forces without a danger of conflicts.
+Some low end GPUs do not support this, however, especially the embedded GPUs
+found in many laptops.  These devices write to the floating point buffers, with
+careful coordination to make sure two threads will never write to the same
+memory location at the same time.
+
+At the start of a force calculation, all forces in all buffers are set to zero.
+Each Force is then free to add its contributions to any or all of the buffers.
+Finally, the buffers are summed to produce the total force on each particle.
+The total is recorded in both the floating point and fixed point arrays.
+
+The size of each floating point buffer is equal to the number of particles, rounded up to the
 next multiple of 32.  Call :code:`getPaddedNumAtoms()` on the OpenCLContext
 to get that number.  The actual force buffers are obtained by calling 
 :code:`getForceBuffers()`\ .  The first *n* entries (where *n* is the
@@ -473,16 +482,13 @@ represent the second force buffer, and so on.  More generally, the *i*\ ’th
 force buffer’s contribution to the force on particle *j* is stored in
 element :code:`i*context.getPaddedNumAtoms()+j`\ .

-Depending on the device, a buffer may also be created that stores contributions
-to the forces in 64 bit fixed point format.  On devices that support atomic
-operations on 64 bit integers in global memory, this can be a more efficient way
-of accumulating forces than using a large number of force buffers.  To convert a
-value from floating point to fixed point, multiply it by 0x100000000 (2\ :sup:`32`\ ),
-then cast it to a :code:`long`\ .  The fixed point buffer is
-ordered differently from the others.  For atom *i*\ , the x component of its
-force is stored in element :code:`i`\ , the y component in element 
+The fixed point buffer is ordered differently.  For atom *i*\ , the x component
+of its force is stored in element :code:`i`\ , the y component in element 
 :code:`i+context.getPaddedNumAtoms()`\ , and the z component in element 
-:code:`i+2*context.getPaddedNumAtoms()`\ .
+:code:`i+2*context.getPaddedNumAtoms()`\ .  To convert a value from floating
+point to fixed point, multiply it by 0x100000000 (2\ :sup:`32`\ ),
+then cast it to a :code:`long`\ .  Call :code:`getLongForceBuffer()` to get the
+array of fixed point values.

 The potential energy is also accumulated in a set of buffers, but this one is
 simply a list of floating point values.  All of them are set to zero at the
@@ -490,15 +496,10 @@ start of a computation, and they are summed at the end of the computation to
 yield the total energy.

 The OpenCL implementation of each Force object should define a subclass of
-OpenCLForce, and register an instance of it by calling :code:`addForce()` on
-the OpenCLContext.  This serves two purposes:
-
-#. It reports how many force buffers are required when calculating this
-   particular Force.  The OpenCLContext sets the size of its force buffer array
-   based on the largest number of buffers required by any Force.
-#. It implements methods for determining whether particular particles or groups
-   of particles are identical.  This is important when reordering particles, and is
-   discussed below.
+ComputeForceInfo, and register an instance of it by calling :code:`addForce()` on
+the OpenCLContext.  It implements methods for determining whether particular
+particles or groups of particles are identical.  This is important when
+reordering particles, and is discussed below.


 Nonbonded Forces
@@ -586,8 +587,7 @@ where *k* is a per-particle parameter.  First we create a parameter as
 follows
 ::

-    nb.addParameter(OpenCLNonbondedUtilities::ParameterInfo("kparam", "float", 1,
-            sizeof(cl_float), kparam->getDeviceBuffer()));
+    nb.addParameter(ComputeParameterInfo(kparam, "kparam", "float", 1));

 where :code:`nb` is the OpenCLNonbondedUtilities for the context.  Now we
 call :code:`addInteraction()` to define an interaction with the following
@@ -700,7 +700,7 @@ exchanged without affecting the System in any way.

 Every Force can contribute to defining the boundaries of molecules, and to
 determining whether two molecules are identical.  This is done through the
-OpenCLForceInfo it adds to the OpenCLContext.  It can specify two types of
+ComputeForceInfo it adds to the OpenCLContext.  It can specify two types of
 information:

 #. Given a pair of particles, it can say whether those two particles are
@@ -792,3 +792,189 @@ buffer.  In contrast, the CUDA platform uses *only* the fixed point buffer
 the CUDA platform only works on devices that support 64 bit atomic operations
 (compute capability 1.2 or higher).

+
+.. _common-compute:
+
+Common Compute
+##############
+
+Common Compute is not a platform, but it shares many elements of one.  It exists
+to reduce code duplication between the OpenCL and CUDA platforms.  It allows a
+single implementation to be written for most kernels that can be used by both
+platforms.
+
+OpenCL and CUDA are very similar to each other.  Their computational models are
+nearly identical.  For example, each is based around launching kernels that are
+executed in parallel by many threads.  Each of them groups threads into blocks,
+with more communication and synchronization permitted between the threads
+in a block than between ones in different blocks.  They have very similar memory
+hierarchies: high latency global memory, low latency local/shared memory that
+can be used for communication between the threads of a block, and local variables
+that are visible only to a single thread.
+
+Even their languages for writing kernels are very similar.  Here is an OpenCL
+kernel that adds two arrays together, storing the result in a third array.
+::
+
+    __kernel void addArrays(__global const float* restrict a,
+                            __global const float* restrict b,
+                            __global float* restrict c,
+                            int length) {
+        for (int i = get_global_id(0); i < length; i += get_global_size(0))
+            c[i] = a[i]+b[i];
+    }
+
+Here is the corresponding CUDA kernel.
+::
+
+    extern "C" __global__ void addArrays(const float* __restrict__ a,
+                                         const float* __restrict__ b,
+                                         float* __restrict__ c,
+                                         int length) {
+        for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < length; i += blockDim.x*gridDim.x)
+            c[i] = a[i]+b[i];
+    }
+
+The difference between them is largely just a mechanical find-and-replace.
+After many years of writing and maintaining nearly identical kernels by hand,
+it finally occurred to us that the translation could be done automatically by
+the compiler.  Simply by defining a few preprocessor macros, the following
+kernel can be compiled equally well either as OpenCL or as CUDA.
+::
+
+    KERNEL void addArrays(GLOBAL const float* RESTRICT a,
+                          GLOBAL const float* RESTRICT b,
+                          GLOBAL float* RESTRICT c,
+                          int length) {
+        for (int i = GLOBAL_ID; i < length; i += GLOBAL_SIZE)
+            c[i] = a[i]+b[i];
+    }
+
+Writing Device Code
+*******************
+
+When compiling kernels with the Common Compute API, the following macros are
+defined.
+
+.. tabularcolumns:: |l|l|L|
+
++-------------------------------+------------------------------------------------------------+--------------------------------------------+
+|Macro                          |OpenCL Definition                                           |CUDA Definition                             |
++===============================+============================================================+============================================+
+|:code:`KERNEL`                 |:code:`__kernel`                                            |:code:`extern "C" __global__`               |
++-------------------------------+------------------------------------------------------------+--------------------------------------------+
+|:code:`DEVICE`                 |                                                            |:code:`__device__`                          |
++-------------------------------+------------------------------------------------------------+--------------------------------------------+
+|:code:`LOCAL`                  |:code:`__local`                                             |:code:`__shared__`                          |
++-------------------------------+------------------------------------------------------------+--------------------------------------------+
+|:code:`LOCAL_ARG`              |:code:`__local`                                             |                                            |
++-------------------------------+------------------------------------------------------------+--------------------------------------------+
+|:code:`GLOBAL`                 |:code:`__global`                                            |                                            |
++-------------------------------+------------------------------------------------------------+--------------------------------------------+
+|:code:`RESTRICT`               |:code:`restrict`                                            |:code:`__restrict__`                        |
++-------------------------------+------------------------------------------------------------+--------------------------------------------+
+|:code:`LOCAL_ID`               |:code:`get_local_id(0)`                                     |:code:`threadIdx.x`                         |
++-------------------------------+------------------------------------------------------------+--------------------------------------------+
+|:code:`LOCAL_SIZE`             |:code:`get_local_size(0)`                                   |:code:`blockDim.x`                          |
++-------------------------------+------------------------------------------------------------+--------------------------------------------+
+|:code:`GLOBAL_ID`              |:code:`get_global_id(0)`                                    |:code:`(blockIdx.x*blockDim.x+threadIdx.x)` |
++-------------------------------+------------------------------------------------------------+--------------------------------------------+
+|:code:`GLOBAL_SIZE`            |:code:`get_global_size(0)`                                  |:code:`(blockDim.x*gridDim.x)`              |
++-------------------------------+------------------------------------------------------------+--------------------------------------------+
+|:code:`GROUP_ID`               |:code:`get_group_id(0)`                                     |:code:`blockIdx.x`                          |
++-------------------------------+------------------------------------------------------------+--------------------------------------------+
+|:code:`NUM_GROUPS`             |:code:`get_num_groups(0)`                                   |:code:`gridDim.x`                           |
++-------------------------------+------------------------------------------------------------+--------------------------------------------+
+|:code:`SYNC_THREADS`           |:code:`barrier(CLK_LOCAL_MEM_FENCE+CLK_GLOBAL_MEM_FENCE);`  |:code:`__syncthreads();`                    |
++-------------------------------+------------------------------------------------------------+--------------------------------------------+
+|:code:`SYNC_WARPS`             | | if SIMT width >= 32:                                     | | if compute capability >= 7.0:            |
+|                               | | :code:`mem_fence(CLK_LOCAL_MEM_FENCE)`                   | | :code:`__syncwarp();`                    |
+|                               | | otherwise:                                               | | otherwise empty                          |
+|                               | | :code:`barrier(CLK_LOCAL_MEM_FENCE)`                     |                                            |
++-------------------------------+------------------------------------------------------------+--------------------------------------------+
+|:code:`MEM_FENCE`              |:code:`mem_fence(CLK_LOCAL_MEM_FENCE+CLK_GLOBAL_MEM_FENCE);`|:code:`__threadfence_block();`              |
++-------------------------------+------------------------------------------------------------+--------------------------------------------+
+|:code:`ATOMIC_ADD(dest, value)`|:code:`atom_add(dest, value)`                               |:code:`atomicAdd(dest, value)`              |
++-------------------------------+------------------------------------------------------------+--------------------------------------------+
+
+A few other symbols may or may not be defined based on the device you are running on:
+:code:`SUPPORTS_DOUBLE_PRECISION` and :code:`SUPPORTS_64_BIT_ATOMICS`\ .  You
+can use :code:`#ifdef` blocks with these symbols to conditionally compile code
+based on the features supported by the device.  In addition, the CUDA compiler
+defines the symbol :code:`__CUDA_ARCH__`\ , so you can check for this symbol if
+you want to have different code blocks for CUDA and OpenCL.
+
+Both OpenCL and CUDA define vector types like :code:`int2` and :code:`float4`\ .
+The types they support are different but overlapping.  When writing common code,
+use only the vector types that are supported by both OpenCL and CUDA: 2, 3, and 4
+element vectors of type :code:`short`\ , :code:`int`\ , :code:`float`\ , and
+:code:`double`\ .
+
+CUDA uses functions to construct vector values, such as :code:`make_float2(x, y)`\ .
+OpenCL instead uses a typecast like syntax: :code:`(float2) (x, y)`\ .  In common
+code, use the CUDA style :code:`make_` functions.  OpenMM provides definitions
+of these functions when compiling as OpenCL.
+
+In CUDA, vector types are simply data structures.  You can access their elements,
+but not do much more with them.  In contrast, OpenCL's vectors are mathematical
+types.  All standard math operators are defined for them, as well as geometrical
+functions like :code:`dot()` and :code:`cross()`\ .  When compiling kernels as
+CUDA, OpenMM provides definitions of these operators and functions.
+
+OpenCL also supports "swizzle" notation for vectors.  For example, if :code:`f`
+is a :code:`float4` you can construct a vector of its first three elements
+by writing :code:`f.xyz`\ , or you can swap its first two elements by writing
+:code:`f.xy = f.yx`\ .  Unfortunately, there is no practical way to support this
+in CUDA, so swizzle notation cannot be used in common code.  Because stripping
+the final element from a four component vector is such a common operation, OpenMM
+provides a special function for doing it: :code:`trimTo3(f)` returns a vector
+containing the first three elements of :code:`f`\ .
+
+64 bit integers are another data type that needs special handling.  Both OpenCL
+and CUDA support them, but they use different names for them: :code:`long` in OpenCL,
+:code:`long long` in CUDA.  To work around this inconsistency, OpenMM provides
+the typedefs :code:`mm_long` and :code:`mm_ulong` for signed and unsigned 64 bit
+integers in device code.
+
+Writing Host Code
+*****************
+
+Host code for Common Compute is very similar to host code for OpenCL or CUDA.
+In fact, most of the classes provided by the OpenCL and CUDA platforms are
+subclasses of Common Compute classes.  For example, OpenCLContext and
+CudaContext are both subclasses of ComputeContext.  When writing common code,
+each KernelImpl should expect a ComputeContext to be passed to its constructor.
+By using the common API provided by that abstract class, it can be used for
+either OpenCL or CUDA just based on the particular context passed to it at
+runtime.  Similarly, OpenCLNonbondedUtilities and CudaNonbondedUtilities are
+subclasses of the abstract NonbondedUtilities class, and so on.
+
+ArrayInterface is an abstract class defining the interface for arrays stored on
+the device.  OpenCLArray and CudaArray are both subclasses of it.  To simplify
+code that creates and uses arrays, there is also a third subclass called
+ComputeArray.  It acts as a wrapper around an OpenCLArray or CudaArray,
+automatically creating an array of the appropriate type for the current
+platform.  In practice, just follow these rules:
+
+  1. Whenever you need to create an array, make it a ComputeArray.
+
+  2. Whenever you write a function that expects an array to be passed to it,
+     declare the type to be ArrayInterface.
+
+If you do these two things, all differences between platforms will be handled
+automatically.
+
+OpenCL and CUDA have quite different APIs for compiling and invoking kernels.
+To hide these differences, OpenMM provides a set of abstract classes.  To compile
+device code, pass the source code to :code:`compileProgram()` on the ComputeContext.
+This returns a ComputeProgram.  You can then call its :code:`createKernel()`
+method to get a ComputeKernel object, which has methods for setting arguments
+and invoking the kernel.
+
+Sometimes you need to refer to vector types in host code, such as to set the
+value for a kernel argument or to access the elements of an array.  OpenCL and
+CUDA both define types for them, but they have different names, and in any case
+you want to avoid using OpenCL-specific or CUDA-specific types in common code.
+OpenMM therefore defines types for vectors in host code.  They have the same
+names as the corresponding types in device code, only with the prefix :code:`mm_`\ ,
+for example :code:`mm_int2` and :code:`mm_float3`\ .
\ No newline at end of file
--- a/docs-source/developerguide/license.rst
+++ b/docs-source/developerguide/license.rst
-Portions copyright (c) 2011-2017 Stanford University and the Authors
+Portions copyright (c) 2011-2020 Stanford University and the Authors

 Contributors: Peter Eastman


--- a/docs-source/usersguide/application.rst
+++ b/docs-source/usersguide/application.rst
@@ -120,7 +120,7 @@ steps.
        forcefield = ForceField('amber14-all.xml', 'amber14/tip3pfb.xml')
        system = forcefield.createSystem(pdb.topology, nonbondedMethod=PME,
                nonbondedCutoff=1*nanometer, constraints=HBonds)
-        integrator = BAOABLangevinIntegrator(300*kelvin, 1/picosecond, 0.002*picoseconds)
+        integrator = LangevinMiddleIntegrator(300*kelvin, 1/picosecond, 0.004*picoseconds)
        simulation = Simulation(pdb.topology, system, integrator)
        simulation.context.setPositions(pdb.positions)
        simulation.minimizeEnergy()
@@ -210,14 +210,17 @@ convenient and less error-prone.  We could have equivalently specified
 The units system will be described in more detail later, in Section :ref:`units-and-dimensional-analysis`.
 ::

-    integrator = BAOABLangevinIntegrator(300*kelvin, 1/picosecond, 0.002*picoseconds)
+    integrator = LangevinMiddleIntegrator(300*kelvin, 1/picosecond, 0.004*picoseconds)

 This line creates the integrator to use for advancing the equations of motion.
-It specifies a :class:`BAOABLangevinIntegrator`, which performs Langevin dynamics,
+It specifies a :class:`LangevinMiddleIntegrator`, which performs Langevin dynamics,
 and assigns it to a variable called :code:`integrator`\ .  It also specifies
 the values of three parameters that are specific to Langevin dynamics: the
 simulation temperature (300 K), the friction coefficient (1 ps\ :sup:`-1`\ ), and
-the step size (0.002 ps).
+the step size (0.004 ps).  Lots of other integration methods are also available.
+For example, if you wanted to simulate the system at constant energy rather than
+constant temperature you would use a :code:`VerletIntegrator`\ .  The available
+integration methods are listed in Section :ref:`integrators`.
 ::

    simulation = Simulation(pdb.topology, system, integrator)
@@ -295,7 +298,7 @@ found in OpenMM’s :file:`examples` folder with the name :file:`simulateAmber.p
        inpcrd = AmberInpcrdFile('input.inpcrd')
        system = prmtop.createSystem(nonbondedMethod=PME, nonbondedCutoff=1*nanometer,
                constraints=HBonds)
-        integrator = BAOABLangevinIntegrator(300*kelvin, 1/picosecond, 0.002*picoseconds)
+        integrator = LangevinMiddleIntegrator(300*kelvin, 1/picosecond, 0.004*picoseconds)
        simulation = Simulation(prmtop.topology, system, integrator)
        simulation.context.setPositions(inpcrd.positions)
        if inpcrd.boxVectors is not None:
@@ -389,7 +392,7 @@ with the name :file:`simulateGromacs.py`.
                includeDir='/usr/local/gromacs/share/gromacs/top')
        system = top.createSystem(nonbondedMethod=PME, nonbondedCutoff=1*nanometer,
                constraints=HBonds)
-        integrator = BAOABLangevinIntegrator(300*kelvin, 1/picosecond, 0.002*picoseconds)
+        integrator = LangevinMiddleIntegrator(300*kelvin, 1/picosecond, 0.004*picoseconds)
        simulation = Simulation(top.topology, system, integrator)
        simulation.context.setPositions(gro.positions)
        simulation.minimizeEnergy()
@@ -453,7 +456,7 @@ on the :class:`CharmmPsfFile`.
        params = CharmmParameterSet('charmm22.rtf', 'charmm22.prm')
        system = psf.createSystem(params, nonbondedMethod=NoCutoff,
                nonbondedCutoff=1*nanometer, constraints=HBonds)
-        integrator = BAOABLangevinIntegrator(300*kelvin, 1/picosecond, 0.002*picoseconds)
+        integrator = LangevinMiddleIntegrator(300*kelvin, 1/picosecond, 0.004*picoseconds)
        simulation = Simulation(psf.topology, system, integrator)
        simulation.context.setPositions(pdb.positions)
        simulation.minimizeEnergy()
@@ -981,9 +984,10 @@ Value             Meaning

 The main reason to use constraints is that it allows one to use a larger
 integration time step.  With no constraints, one is typically limited to a time
-step of about 1 fs for typical biomolecular force fields like AMBER or CHARMM.  With :code:`HBonds` constraints, this can be increased
-to about 2 fs.  With :code:`HAngles`\ , it can be further increased to 3.5 or
-4 fs.
+step of about 1 fs for typical biomolecular force fields like AMBER or CHARMM.
+With :code:`HBonds` constraints, this can be increased to about 2 fs for Verlet
+dynamics, or about 4 fs for Langevin dynamics.  With :code:`HAngles`\ , it can
+sometimes be increased even further.

 Regardless of the value of this parameter, OpenMM makes water molecules
 completely rigid, constraining both their bond lengths and angles.  You can
@@ -997,7 +1001,9 @@ step size, typically to about 0.5 fs.

 .. note::

-   The AMOEBA forcefield is intended to be used without constraints.
+   The AMOEBA forcefield is designed to be used without constraints, so by
+   default OpenMM makes AMOEBA water flexible.  You can still force it to be
+   rigid by specifying :code:`rigidWater=True`.

 Heavy Hydrogens
 ===============
@@ -1012,39 +1018,45 @@ optionally tell OpenMM to increase the mass of hydrogen atoms.  For example,
 This applies only to hydrogens that are bonded to heavy atoms, and any mass
 added to the hydrogen is subtracted from the heavy atom.  This keeps their total
 mass constant while slowing down the fast motions of hydrogens.  When combined
-with constraints (typically :code:`constraints=AllBonds`\ ), this allows a
+with constraints (typically :code:`constraints=AllBonds`\ ), this often allows a
 further increase in integration step size.

+.. _integrators:
+
 Integrators
 ===========


 OpenMM offers a choice of several different integration methods.  You select
 which one to use by creating an integrator object of the appropriate type.
+Detailed descriptions of all these integrators can be found in Chapter
+:ref:`integrators-theory`.  In addition to these built-in integrators, lots of
+others are available as part of the `OpenMMTools <https://openmmtools.readthedocs.io>`_ package.

-BAOAB Langevin Integrator
-------------------------
+Langevin Middle Integrator
+--------------------------

 In the examples of the previous sections, we used Langevin integration:
 ::

-    integrator = BAOABLangevinIntegrator(300*kelvin, 1/picosecond, 0.002*picoseconds)
+    integrator = LangevinMiddleIntegrator(300*kelvin, 1/picosecond, 0.004*picoseconds)

 The three parameter values in this line are the simulation temperature (300 K),
-the friction coefficient (1 ps\ :sup:`-1`\ ), and the step size (0.002 ps).  You
+the friction coefficient (1 ps\ :sup:`-1`\ ), and the step size (0.004 ps).  You
 are free to change these to whatever values you want.  Be sure to specify units
 on all values.  For example, the step size could be written either as
-:code:`0.002*picoseconds` or :code:`2*femtoseconds`\ .  They are exactly
-equivalent.
+:code:`0.004*picoseconds` or :code:`4*femtoseconds`\ .  They are exactly
+equivalent.  Note that :code:`LangevinMiddleIntegrator` is a leapfrog
+integrator, so the velocities are offset by half a time step from the positions.

 Langevin Integrator
 -------------------

-:code:`LangevinIntegrator` is very similar to :code:`BAOABLangevinIntegrator`,
+:code:`LangevinIntegrator` is very similar to :code:`LangevinMiddleIntegrator`,
 but it uses a different discretization of the Langevin equation.
-:code:`BAOABLangevinIntegrator` tends to produce more accurate configurational
+:code:`LangevinMiddleIntegrator` tends to produce more accurate configurational
 sampling, and therefore is preferred for most applications.  Also note that
-:code:`LangevinIntegrator` (unlike :code:`BAOABLangevinIntegrator`) is a leapfrog
+:code:`LangevinIntegrator`\ , like :code:`LangevinMiddleIntegrator`\ , is a leapfrog
 integrator, so the velocities are offset by half a time step from the positions.

 Leapfrog Verlet Integrator
@@ -1155,7 +1167,7 @@ previous section:
    system = prmtop.createSystem(nonbondedMethod=PME, nonbondedCutoff=1*nanometer,
            constraints=HBonds)
    system.addForce(MonteCarloBarostat(1*bar, 300*kelvin))
-    integrator = BAOABLangevinIntegrator(300*kelvin, 1/picosecond, 0.002*picoseconds)
+    integrator = LangevinMiddleIntegrator(300*kelvin, 1/picosecond, 0.004*picoseconds)
    ...

 The parameters of the Monte Carlo barostat are the pressure (1 bar) and
@@ -1715,7 +1727,7 @@ executing 1000 time steps at each temperature:
        :autonumber:`Example,simulated annealing`

 This code needs very little explanation.  The loop is executed 100 times.  Each
-time through, it adjusts the temperature of the :class:`BAOABLangevinIntegrator` and then
+time through, it adjusts the temperature of the :class:`LangevinMiddleIntegrator` and then
 calls :code:`step(1000)` to take 1000 time steps.

 Applying an External Force to Particles: a Spherical Container
@@ -1747,7 +1759,7 @@ coordinates.  Here is the code to do it:
        system.addForce(force)
        for i in range(system.getNumParticles()):
            force.addParticle(i, [])
-        integrator = BAOABLangevinIntegrator(300*kelvin, 91/picosecond, 0.002*picoseconds)
+        integrator = LangevinMiddleIntegrator(300*kelvin, 91/picosecond, 0.004*picoseconds)
        ...

    .. caption::
@@ -2331,6 +2343,22 @@ second atom has class OS and the third has class P:

    <Proper class1="" class2="OS" class3="P" class4="" periodicity1="3" phase1="0.0" k1="1.046"/>

+The :code:`<PeriodicTorsionForce>` tag also supports an optional
+:code:`ordering` attribute to provide better compatibility with the way
+impropers are assigned in different simulation packages:
+
+ * :code:`ordering="default"` specifies the default behavior if the attribute
+   is omitted. 
+ * :code:`ordering="amber"` produces behavior that replicates the behavior of
+   AmberTools LEaP
+ * :code:`ordering="charmm"` produces behavior more consistent with CHARMM
+ * :code:`ordering="smirnoff"` allows multiple impropers to be added using
+   exact matching to replicate the behavior of `SMIRNOFF <https://open-forcefield-toolkit.readthedocs.io/en/latest/smirnoff.html>`_
+   impropers
+
+Different :code:`<PeriodicTorsionForce>` tags can specify different :code:`ordering`
+values to be used for the sub-elements appearing within their tags.
+
 <RBTorsionForce>
 ================


--- a/docs-source/usersguide/library.rst
+++ b/docs-source/usersguide/library.rst
@@ -243,14 +243,14 @@ simulation might look like:
        angles->addAngle(angle[i].particle1, angle[i].particle2,
            angle[i].particle3, angle[i].angle, angle[i].k);
    // ...create and initialize other force field terms in the same way
-    BAOABLangevinIntegrator integrator(temperature, friction, stepSize);
+    LangevinMiddleIntegrator integrator(temperature, friction, stepSize);
    Context context(system, integrator);
    context.setPositions(initialPositions);
    context.setVelocities(initialVelocities);
    integrator.step(10000);

 We create a System, add various Forces to it, and set parameters on both the
-System and the Forces.  We then create a BAOABLangevinIntegrator, initialize a
+System and the Forces.  We then create a LangevinMiddleIntegrator, initialize a
 Context in which to run a simulation, and instruct the Integrator to advance the
 simulation for 10,000 time steps.

@@ -1297,7 +1297,7 @@ existing MD program.
    static const double SolventDielectric   = 80.;     // typical for water
    static const double SoluteDielectric    = 2.;      // typical for protein

-    static const double StepSizeInFs        = 2;       // integration step size (fs)
+    static const double StepSizeInFs        = 4;       // integration step size (fs)
    static const double ReportIntervalInFs  = 50;      // how often to issue PDB frame (fs)
    static const double SimulationTimeInPs  = 100;     // total simulation time (ps)

@@ -1631,7 +1631,7 @@ along with the handle :code:`omm`\ , back to the calling function.
    // best available Platform. Initialize the configuration from the default
    // positions we collected above. Initial velocities will be zero but could
    // have been set here.
-    omm->integrator = new OpenMM::LangevinIntegrator(temperature,
+    omm->integrator = new OpenMM::LangevinMiddleIntegrator(temperature,
    frictionInPs,
    stepSizeInFs * OpenMM::PsPerFs);
    omm->context    = new OpenMM::Context(*omm->system, *omm->integrator);

--- a/docs-source/usersguide/references.bib
+++ b/docs-source/usersguide/references.bib
@@ -240,19 +240,6 @@
   type = {Journal Article}
 }

-@article{Leimkuhler2013,
-    author = {Leimkuhler, Benedict and Matthews, Charles},
-    title = {Rational Construction of Stochastic Numerical Methods for Molecular Sampling},
-    journal = {Applied Mathematics Research eXpress},
-    volume = {2013},
-    number = {1},
-    pages = {34-56},
-    year = {2012},
-    month = {06},
-    doi = {10.1093/amrx/abs010},
-    type = {Journal Article}
-}
-
 @article{Li2010
   author = {Li, D.W. and Br{\"u}schweiler, R.},
   title = {{NMR}-based protein potentials},
@@ -629,3 +616,15 @@
   year = {2015},
   type = {Journal Article}
 }
+
+@article{Zhang2019,
+    author = {Zhang, Zhijun and Liu, Xinzijian and Yan, Kangyu and Tuckerman, Mark E. and Liu, Jian},
+    title = {Unified Efficient Thermostat Scheme for the Canonical Ensemble with Holonomic or Isokinetic Constraints via Molecular Dynamics},
+    journal = {The Journal of Physical Chemistry A},
+    volume = {123},
+    number = {28},
+    pages = {6056-6079},
+    year = {2019},
+    doi = {10.1021/acs.jpca.9b02771},
+    type = {Journal Article}
+}
--- a/docs-source/usersguide/theory.rst
+++ b/docs-source/usersguide/theory.rst
@@ -1278,6 +1278,8 @@ algorithm.  This can be used to implement algorithms such as lambda-dynamics,
 where a global parameter is integrated as a dynamic variable.


+.. _integrators-theory:
+
 Integrators
 ###########

@@ -1334,24 +1336,60 @@ components are chosen from a normal distribution with mean zero and variance
 :math:`2m_i \gamma k_B T`\ , where *T* is the temperature of
 the heat bath.

-The integration is done using a leap-frog method similar to VerletIntegrator.
-:cite:`Izaguirre2010` The same comments about the offset between positions and
-velocities apply to this integrator as to that one.
+The integration is done using the Langevin leap-frog method. :cite:`Izaguirre2010`
+In each step, the positions and velocities are updated as follows:
+
+
+.. math::
+   \mathbf{v}_{i}(t+\Delta t/2)=\mathbf{v}_{i}(t-\Delta t/2)\alpha+\mathbf{f}_{i}(t)(1-\alpha)/\gamma{m}_{i} + \sqrt{kT(1-\alpha^2)/{m}_{i}}R
+
+
+.. math::
+   \mathbf{r}_{i}(t+\Delta t)=\mathbf{r}_{i}(t)+\mathbf{v}_{i}(t+\Delta t/2)\Delta t
+
+
+where :math:`k` is Boltzmann's constant, :math:`T` is the temperature,
+:math:`\gamma` is the friction coefficient, :math:`R` is a normally distributed
+random number, and :math:`\alpha=\exp(-\gamma\Delta t)`.
+
+The same comments about the offset between positions and velocities apply to
+this integrator as to VerletIntegrator.
+
+LangevinMiddleIntegrator
+************************
+
+This integrator is similar to LangevinIntegrator, but it instead uses the LFMiddle
+discretization. :cite:`Zhang2019` In each step, the positions and velocities
+are updated as follows:
+
+
+.. math::
+   \mathbf{v}_{i}(t+\Delta t/2) = \mathbf{v}_{i}(t-\Delta t/2) + \mathbf{f}_{i}(t)\Delta t/{m}_{i}
+
+
+.. math::
+   \mathbf{r}_{i}(t+\Delta t/2) = \mathbf{r}_{i}(t) + \mathbf{v}_{i}(t+\Delta t/2)\Delta t/2
+
+
+.. math::
+   \mathbf{v'}_{i}(t+\Delta t/2) = \mathbf{v}_{i}(t+\Delta t/2)\alpha + \sqrt{kT(1-\alpha^2)/{m}_{i}}R
+
+
+.. math::
+   \mathbf{r}_{i}(t+\Delta t) = \mathbf{r}_{i}(t+\Delta t/2) + \mathbf{v'}_{i}(t+\Delta t/2)\Delta t/2

-BAOABLangevinIntegrator
-***********************

-This integrator is similar to LangevinIntegerator, but it instead uses the BAOAB
-discretization. :cite:`Leimkuhler2013` This tends to produce more accurate
-sampling of configurational properties (such as free energies), but less
-accurate sampling of kinetic properties (such as mean kinetic energy).  Because
+This tends to produce more accurate sampling of configurational properties (such
+as free energies), but less accurate sampling of kinetic properties.  Because
 configurational properties are much more important than kinetic ones in most
 simulations, this integrator is generally preferred over LangevinIntegrator.  It
 often allows one to use a larger time step while still maintaining similar or
 better accuracy.

-Unlike LangevinIntegrator, this does not use a leap-frog algorithm.  The
-positions and velocities all correspond to the same point in time.
+One disadvantage of this integrator is that it requires applying constraints
+twice per time step, compared to only once for LangevinIntegrator.  This
+can make it slightly slower for systems that involve constraints.  However, this
+usually is more than compensated by allowing you to use a larger time step.

 BrownianIntegrator
 ******************

--- a/examples/HelloSodiumChloride.cpp
+++ b/examples/HelloSodiumChloride.cpp
@@ -28,7 +28,7 @@ static const double FrictionInPerPs     = 91.;     // collisions per picosecond
 static const double SolventDielectric   = 80.;     // typical for water
 static const double SoluteDielectric    = 2.;      // typical for protein

-static const double StepSizeInFs        = 2;       // integration step size (fs)
+static const double StepSizeInFs        = 4;       // integration step size (fs)
 static const double ReportIntervalInFs  = 50;      // how often to issue PDB frame (fs)
 static const double SimulationTimeInPs  = 100;     // total simulation time (ps)

@@ -249,7 +249,7 @@ myInitializeOpenMM( const MyAtomInfo    atoms[],
    // best available Platform. Initialize the configuration from the default
    // positions we collected above. Initial velocities will be zero but could
    // have been set here.
-    omm->integrator = new OpenMM::LangevinIntegrator(temperature, frictionInPs, 
+    omm->integrator = new OpenMM::LangevinMiddleIntegrator(temperature, frictionInPs, 
                                                     stepSizeInFs * OpenMM::PsPerFs);
    omm->context    = new OpenMM::Context(*omm->system, *omm->integrator);
    omm->context->setPositions(initialPosInNm);

--- a/examples/HelloSodiumChlorideInC.c
+++ b/examples/HelloSodiumChlorideInC.c
@@ -28,7 +28,7 @@ static const double FrictionInPerPs     = 91.;    /*collisions per ps*/
 static const double SolventDielectric   = 80.;    /*typical for water    */
 static const double SoluteDielectric    = 2.;     /*typical for protein  */

-static const double StepSizeInFs        = 2;      /*integration step size (fs)  */
+static const double StepSizeInFs        = 4;      /*integration step size (fs)  */
 static const double ReportIntervalInFs  = 50;     /*how often for PDB frame (fs)*/
 static const double SimulationTimeInPs  = 100;    /*total simulation time (ps)  */

@@ -252,7 +252,7 @@ myInitializeOpenMM( const MyAtomInfo    atoms[],
     * best available Platform. Initialize the configuration from the default
     * positions we collected above. Initial velocities will be zero but could
     * have been set here. */
-    omm->integrator = (OpenMM_Integrator*)OpenMM_LangevinIntegrator_create(
+    omm->integrator = (OpenMM_Integrator*)OpenMM_LangevinMiddleIntegrator_create(
                                            temperature, frictionInPerPs, 
                                            stepSizeInFs * OpenMM_PsPerFs);
    omm->context    = OpenMM_Context_create(omm->system, omm->integrator);

--- a/examples/HelloSodiumChlorideInFortran.f90
+++ b/examples/HelloSodiumChlorideInFortran.f90
@@ -36,7 +36,7 @@ MODULE MyAtomInfo
    parameter(SoluteDielectric   = 2)   !typical for protein
    
    real*8 StepSizeInFs, ReportIntervalInFs, SimulationTimeInPs
-    parameter(StepSizeInFs       = 2)   !integration step size (fs)
+    parameter(StepSizeInFs       = 4)   !integration step size (fs)
    parameter(ReportIntervalInFs = 50)  !how often for PDB frame (fs)
    parameter(SimulationTimeInPs = 100) !total simulation time (ps)
    
@@ -171,7 +171,7 @@ SUBROUTINE myInitializeOpenMM(ommHandle, platformName)
    ! These are the objects we'll create here thare are stored in the
    ! Context for later access. Don't forget to delete them at the end.
    type (OpenMM_System)             system
-    type (OpenMM_LangevinIntegrator) langevin
+    type (OpenMM_LangevinMiddleIntegrator) langevin
    type (OpenMM_Context)            context

    ! These are temporary OpenMM objects used and discarded here.
@@ -236,11 +236,11 @@ SUBROUTINE myInitializeOpenMM(ommHandle, platformName)
    ! best available Platform. Initialize the configuration from the default
    ! positions we collected above. Initial velocities will be zero but could
    ! have been set here.
-    call OpenMM_LangevinIntegrator_create(langevin,                     &
+    call OpenMM_LangevinMiddleIntegrator_create(langevin,                     &
                                          Temperature, FrictionInPerPs, &
                                          StepSizeInFs * OpenMM_PsPerFs)

-    ! Convert LangevinIntegrator to generic Integrator type for this call.
+    ! Convert LangevinMiddleIntegrator to generic Integrator type for this call.
    call OpenMM_Context_create(context, system,                         &
                               transfer(langevin, OpenMM_Integrator(0)))
    call OpenMM_Context_setPositions(context, initialPosInNm)

--- a/examples/benchmark.py
+++ b/examples/benchmark.py
@@ -4,7 +4,7 @@ import simtk.openmm as mm
 import simtk.unit as unit
 import sys
 from datetime import datetime
-from optparse import OptionParser
+from argparse import ArgumentParser

 def timeIntegration(context, steps, initialSteps):
    """Integrate a Context for a specified number of steps, then return how many seconds it took."""
@@ -127,35 +127,33 @@ def runOneTest(testName, options):

 # Parse the command line options.

-parser = OptionParser()
+parser = ArgumentParser()
 platformNames = [mm.Platform.getPlatform(i).getName() for i in range(mm.Platform.getNumPlatforms())]
-parser.add_option('--platform', dest='platform', choices=platformNames, help='name of the platform to benchmark')
-parser.add_option('--test', dest='test', choices=('gbsa', 'rf', 'pme', 'apoa1rf', 'apoa1pme', 'apoa1ljpme', 'amoebagk', 'amoebapme'), help='the test to perform: gbsa, rf, pme, apoa1rf, apoa1pme, apoa1ljpme, amoebagk, or amoebapme [default: all]')
-parser.add_option('--pme-cutoff', default='0.9', dest='cutoff', type='float', help='direct space cutoff for PME in nm [default: 0.9]')
-parser.add_option('--seconds', default='60', dest='seconds', type='float', help='target simulation length in seconds [default: 60]')
-parser.add_option('--polarization', default='mutual', dest='polarization', choices=('direct', 'extrapolated', 'mutual'), help='the polarization method for AMOEBA: direct, extrapolated, or mutual [default: mutual]')
-parser.add_option('--mutual-epsilon', default='1e-5', dest='epsilon', type='float', help='mutual induced epsilon for AMOEBA [default: 1e-5]')
-parser.add_option('--heavy-hydrogens', action='store_true', default=False, dest='heavy', help='repartition mass to allow a larger time step')
-parser.add_option('--device', default=None, dest='device', help='device index for CUDA or OpenCL')
-parser.add_option('--precision', default='single', dest='precision', choices=('single', 'mixed', 'double'), help='precision mode for CUDA or OpenCL: single, mixed, or double [default: single]')
-(options, args) = parser.parse_args()
-if len(args) > 0:
-    parser.error('Unknown argument: '+args[0])
-if options.platform is None:
+parser.add_argument('--platform', dest='platform', choices=platformNames, help='name of the platform to benchmark')
+parser.add_argument('--test', dest='test', choices=('gbsa', 'rf', 'pme', 'apoa1rf', 'apoa1pme', 'apoa1ljpme', 'amoebagk', 'amoebapme'), help='the test to perform: gbsa, rf, pme, apoa1rf, apoa1pme, apoa1ljpme, amoebagk, or amoebapme [default: all]')
+parser.add_argument('--pme-cutoff', default=0.9, dest='cutoff', type=float, help='direct space cutoff for PME in nm [default: 0.9]')
+parser.add_argument('--seconds', default=60, dest='seconds', type=float, help='target simulation length in seconds [default: 60]')
+parser.add_argument('--polarization', default='mutual', dest='polarization', choices=('direct', 'extrapolated', 'mutual'), help='the polarization method for AMOEBA: direct, extrapolated, or mutual [default: mutual]')
+parser.add_argument('--mutual-epsilon', default=1e-5, dest='epsilon', type=float, help='mutual induced epsilon for AMOEBA [default: 1e-5]')
+parser.add_argument('--heavy-hydrogens', action='store_true', default=False, dest='heavy', help='repartition mass to allow a larger time step')
+parser.add_argument('--device', default=None, dest='device', help='device index for CUDA or OpenCL')
+parser.add_argument('--precision', default='single', dest='precision', choices=('single', 'mixed', 'double'), help='precision mode for CUDA or OpenCL: single, mixed, or double [default: single]')
+args = parser.parse_args()
+if args.platform is None:
    parser.error('No platform specified')
-print('Platform:', options.platform)
-if options.platform in ('CUDA', 'OpenCL'):
-    print('Precision:', options.precision)
-    if options.device is not None:
-        print('Device:', options.device)
+print('Platform:', args.platform)
+if args.platform in ('CUDA', 'OpenCL'):
+    print('Precision:', args.precision)
+    if args.device is not None:
+        print('Device:', args.device)

 # Run the simulations.

-if options.test is None:
+if args.test is None:
    for test in ('gbsa', 'rf', 'pme', 'apoa1rf', 'apoa1pme', 'apoa1ljpme', 'amoebagk', 'amoebapme'):
        try:
-            runOneTest(test, options)
+            runOneTest(test, args)
        except Exception as ex:
            print('Test failed: %s' % ex.message)
 else:
-    runOneTest(options.test, options)
+    runOneTest(args.test, args)
--- a/examples/simulateAmber.py
+++ b/examples/simulateAmber.py
@@ -6,7 +6,7 @@ from sys import stdout
 prmtop = AmberPrmtopFile('input.prmtop')
 inpcrd = AmberInpcrdFile('input.inpcrd')
 system = prmtop.createSystem(nonbondedMethod=PME, nonbondedCutoff=1*nanometer, constraints=HBonds)
-integrator = BAOABLangevinIntegrator(300*kelvin, 1/picosecond, 0.002*picoseconds)
+integrator = LangevinMiddleIntegrator(300*kelvin, 1/picosecond, 0.004*picoseconds)
 simulation = Simulation(prmtop.topology, system, integrator)
 simulation.context.setPositions(inpcrd.positions)
 if inpcrd.boxVectors is not None:

--- a/examples/simulateCharmm.py
+++ b/examples/simulateCharmm.py
@@ -20,9 +20,8 @@ params = CharmmParameterSet('charmm22.rtf', 'charmm22.par')
 # http://mackerell.umaryland.edu/CHARMM_ff_params.html

 # Instantiate the system
-system = psf.createSystem(params, nonbondedMethod=NoCutoff,
-                          nonbondedCutoff=None)
-integrator = BAOABLangevinIntegrator(300*kelvin, 1/picosecond, 0.002*picoseconds)
+system = psf.createSystem(params, nonbondedMethod=NoCutoff, constraints=HBonds)
+integrator = LangevinMiddleIntegrator(300*kelvin, 1/picosecond, 0.004*picoseconds)
 simulation = Simulation(psf.topology, system, integrator)
 simulation.context.setPositions(pdb.getPositions())
 simulation.minimizeEnergy()

--- a/examples/simulateGromacs.py
+++ b/examples/simulateGromacs.py
@@ -6,7 +6,7 @@ from sys import stdout
 gro = GromacsGroFile('input.gro')
 top = GromacsTopFile('input.top', periodicBoxVectors=gro.getPeriodicBoxVectors())
 system = top.createSystem(nonbondedMethod=PME, nonbondedCutoff=1*nanometer, constraints=HBonds)
-integrator = BAOABLangevinIntegrator(300*kelvin, 1/picosecond, 0.002*picoseconds)
+integrator = LangevinMiddleIntegrator(300*kelvin, 1/picosecond, 0.004*picoseconds)
 simulation = Simulation(top.topology, system, integrator)
 simulation.context.setPositions(gro.positions)
 simulation.minimizeEnergy()

--- a/examples/simulatePdb.py
+++ b/examples/simulatePdb.py
@@ -6,7 +6,7 @@ from sys import stdout
 pdb = PDBFile('input.pdb')
 forcefield = ForceField('amber14-all.xml', 'amber14/tip3pfb.xml')
 system = forcefield.createSystem(pdb.topology, nonbondedMethod=PME, nonbondedCutoff=1*nanometer, constraints=HBonds)
-integrator = BAOABLangevinIntegrator(300*kelvin, 1/picosecond, 0.002*picoseconds)
+integrator = LangevinMiddleIntegrator(300*kelvin, 1/picosecond, 0.004*picoseconds)
 simulation = Simulation(pdb.topology, system, integrator)
 simulation.context.setPositions(pdb.positions)
 simulation.minimizeEnergy()

--- a/libraries/asmjit/asmjit_apibegin.h
+++ b/libraries/asmjit/asmjit_apibegin.h
@@ -53,8 +53,8 @@
 // [GCC]
 #if ASMJIT_CC_GCC
 # pragma GCC diagnostic push
-# pragma GCC diagnostic ignored "-Wbool-operation"
 # if ASMJIT_CC_GCC_GE(8, 0, 0)
+#  pragma GCC diagnostic ignored "-Wbool-operation"
 #  pragma GCC diagnostic ignored "-Wclass-memaccess"
 # endif
 #endif // ASMJIT_CC_GCC