resolved PR #2611 merge conflicts

5270f858 · Charlles Abreu · 697ab72e · eec9cd69 · 5270f858 · 5270f858
Commit 5270f858 authored Jun 25, 2020 by Charlles Abreu
20 changed files
--- a/.travis.yml
+++ b/.travis.yml
@@ -216,7 +216,7 @@ script:
  - python devtools/run-ctest.py --start-time $START_TIME
  - if [[ ! -z "${DOCS_DEPLOY}" && "${DOCS_DEPLOY}" = "true" ]]; then
-      pip install sphinx sphinxcontrib-bibtex sphinxcontrib-lunrsearch sphinxcontrib-autodoc_doxygen;
+      pip install sphinx==2.3.1 sphinxcontrib-bibtex sphinxcontrib-lunrsearch sphinxcontrib-autodoc_doxygen;
      make sphinxhtml;
      make sphinxpdf;
      make C++ApiDocs PythonApiDocs;

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -275,7 +275,7 @@ ENDIF (MSVC)
 IF(OPENMM_BUILD_SHARED_LIB)
    ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
-    SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_BUILDING_SHARED_LIBRARY -DLEPTON_BUILDING_SHARED_LIBRARY -DPTHREAD_BUILDING_SHARED_LIBRARY")
+    SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_BUILDING_SHARED_LIBRARY -DLEPTON_BUILDING_SHARED_LIBRARY -DPTHREAD_BUILDING_SHARED_LIBRARY" SOVERSION "${OPENMM_MAJOR_VERSION}.${OPENMM_MINOR_VERSION}")
 ENDIF(OPENMM_BUILD_SHARED_LIB)
 IF(OPENMM_BUILD_STATIC_LIB)

--- a/docs-source/usersguide/application.rst
+++ b/docs-source/usersguide/application.rst
@@ -650,8 +650,8 @@ such as :file:`charmm36/water.xml`, which specifies the default CHARMM water mod
 .. warning:: Drude polarizable sites and lone pairs are not yet supported
             by `ParmEd <https://github.com/parmed/parmed>`_ and the CHARMM36 forcefields
             that depend on these features are not included in this port.
-             To use the CHARMM 2013 polarizable force field\ :cite:`Lopes2013`,
+             To use the CHARMM 2019 polarizable force field\ :cite:`Lopes2013`,
-             include the single file :file:`charmm_polar_2013.xml`.
+             include the single file :file:`charmm_polar_2019.xml`.
 .. tip:: The solvent model XML files included under the :file:`charmm36/` directory
         include both water *and* ions compatible with that water model, so if you
@@ -712,17 +712,20 @@ recommended for most simulations.
 CHARMM Polarizable Force Field
 ------------------------------
-To use the CHARMM 2013 polarizable force field\ :cite:`Lopes2013`, include the
+To use the CHARMM 2019 polarizable force field\ :cite:`Lopes2013`, include the
-single file :file:`charmm_polar_2013.xml`.  It includes parameters for proteins,
+single file :file:`charmm_polar_2019.xml`.  It includes parameters for proteins, lipids,
 water, and ions.  When using this force field, remember to add extra particles to
 the :class:`Topology` as described in section :ref:`adding-or-removing-extra-particles`.
+This force field also requires that you use one of the special integrators that
+supports Drude particles.  The options are DrudeLangevinIntegrator, DrudeNoseHooverIntegrator,
+and DrudeSCFIntegrator.
-Older Amber Force Fields
+Older Force Fields
------------------------
+------------------
-OpenMM includes several older Amber force fields as well.  For most simulations
+OpenMM includes several older force fields as well.  For most simulations, the
-Amber14 is preferred over any of these, but they are still useful for reproducing
+newer force fields described above are preferred over any of these, but they are
-older results.
+still useful for reproducing older results.
 .. tabularcolumns:: |l|L|
@@ -735,6 +738,7 @@ File                           Force Field
 :code:`amber99sbnmr.xml`       Amber99SB with modifications to fit NMR data\ :cite:`Li2010`
 :code:`amber03.xml`            Amber03\ :cite:`Duan2003`
 :code:`amber10.xml`            Amber10 (documented in the AmberTools_ manual as `ff10`)
+:code:`charmm_polar_2013.xml`  2013 version of the CHARMM polarizable force field\ :cite:`Lopes2013`
 =============================  ================================================================================
 Several of these force fields support implicit solvent.  To enable it, also
@@ -1059,6 +1063,34 @@ sampling, and therefore is preferred for most applications.  Also note that
 :code:`LangevinIntegrator`\ , like :code:`LangevinMiddleIntegrator`\ , is a leapfrog
 integrator, so the velocities are offset by half a time step from the positions.
+Nosé-Hoover Integrator
+----------------------
+The :code:`NoseHooverIntegrator` uses the same "middle" leapfrog propagation
+algorithm as :code:`LangevinMiddleIntegrator`, but replaces the stochastic
+temperature control with a velocity scaling algorithm that produces more
+accurate transport properties :cite:`Basconi2013`.  This velocity scaling
+results from propagating a chain of extra variables, which slightly reduces the
+computational efficiency with respect to :code:`LangevinMiddleIntegrator`.  The
+thermostated integrator is minimally created with syntax analogous to the
+:code:`LangevinMiddleIntegrator` example above::
+    NoseHooverIntegrator integrator(300*kelvin, 1/picosecond,
+                                    0.004*picoseconds);
+The first argument specifies the target temperature.  The second specifies the
+frequency of interaction with the heat bath: a lower value interacts minimally,
+yielding the microcanonical ensemble in the limit of a zero frequency, while a
+larger frequency will perturb the system greater, keeping it closer to the
+target temperature.  The third argument is the integration timestep that, like
+the other arguments, must be specified with units.  For initial equilibration
+to the target temperature, a larger interaction frequency is recommended,
+*e.g.* 25 ps\ :sup:`-1`.
+This integrator supports lots of other options, including the ability to couple
+different parts of the system to thermostats at different temperatures. See the
+API documentation for details.
 Leapfrog Verlet Integrator
 --------------------------

--- a/docs-source/usersguide/library.rst
+++ b/docs-source/usersguide/library.rst
@@ -3634,7 +3634,7 @@ the Drude particle and the spring constant *k* by
 A damped interaction\ :cite:`Thole1981` is used between dipoles that are
 bonded to each other.
-The equations of motion can be integrated with two different methods:
+The equations of motion can be integrated with three different methods:
 #. In the Self Consistent Field (SCF) method, the ordinary particles are first
   updated as usual.  A local energy minimization is then performed to select new
@@ -3644,8 +3644,25 @@ The equations of motion can be integrated with two different methods:
 #. In the extended Lagrangian method, the positions of the Drude particles are
   treated as dynamical variables, just like any other particles.  A small amount
   of mass is transferred from the parent particles to the Drude particles,
-   allowing them to be integrated normally.  A dual Langevin integrator is used to
+   allowing them to be integrated normally.  A dual Langevin or Nose-Hoover integrator is used to
   maintain the center of mass of each Drude particle pair at the system
   temperature, while using a much lower temperature for their relative internal
   motion.  In practice, this produces dipole moments very close to those from the
   SCF solution while being much faster to compute.
+#. The Nosé-Hoover dual thermostat method.  In this approach the motion of
+   non-Drude sites and center of mass motion of Drude pairs are thermostated to
+   the target temperature with one thermostat.  Another thermostat is used to keep
+   relative motion of Drude pairs to a different, typically much lower,
+   temperature to maintain separation of nuclear and electronic degrees of
+   freedom.  The minimal specification is as follows::
+      DrudeNoseHooverIntegrator integrator(temperature, frequency,
+                                           temperatureDrude, frequencyDrude,
+                                           1*femtoseconds)
+   Where the first and third arguments specify the center-of-mass temperature and
+   relative temperature for each Drude pair, respecitvely.  The second and fourth
+   arguments describe the frequency of interaction with the center-of-mass and
+   relative heat baths, respectively, and should be specified with inverse time
+   units.  The fifth argument is the timestep.  The multi-timestep and Nosé-Hoover
+   chain length may also be specified, but sensible defaults are provided.
--- a/docs-source/usersguide/references.bib
+++ b/docs-source/usersguide/references.bib
@@ -41,6 +41,17 @@
  doi = {10.1103/PhysRevLett.100.020603},
 }
+@article{Basconi2013,
+  title = {Effects of Temperature Control Algorithms on Transport Properties and Kinetics in Molecular Dynamics Simulations},
+  author = {Joseph E. Bascon and Michael R. Shirts},
+  journal = {Journal of Chemical Theory and Computation},
+  volume = {9},
+  issue = {7},
+  pages = {2887-2899},
+  year = {2013},
+  doi= {10.1021/ct400109a}
+}
 @article{Berendsen1987
   author = {Berendsen, H. J. C. and Grigera, J. R. and Straatsma, T. P.},
   title = {The missing term in effective pair potentials},
@@ -314,6 +325,16 @@
    journal = {Europhysics Letters ({EPL})},
 }
+@article{Martyna1992,
+   author = {Glenn J. Martyna and Michael L. Klein and Mark Tuckerman},
+   year = 1992,
+   title = {Nos\'{e}–Hoover chains: The canonical ensemble via continuous dynamics},
+   pages = {2635-2643},
+   journal  = {Journal of Chemical Physics},
+   volume = {97},
+   issue = {4},
+}
 @article{Markland2008
   author = {Markland, Thomas E. and Manolopoulos, David E.},
   title = {An efficient ring polymer contraction scheme for imaginary time path integral simulations},

--- a/docs-source/usersguide/theory.rst
+++ b/docs-source/usersguide/theory.rst
@@ -1391,6 +1391,52 @@ twice per time step, compared to only once for LangevinIntegrator.  This
 can make it slightly slower for systems that involve constraints.  However, this
 usually is more than compensated by allowing you to use a larger time step.
+.. _nosehoover-integrators-theory:
+NoseHooverIntegrator
+********************
+Like LangevinMiddleIntegerator, this uses the LFMiddle discretization.
+:cite:`Zhang2019` In each step, the positions and velocities are updated as
+follows:
+.. math::
+   \mathbf{v}_{i}(t+\Delta t/2) = \mathbf{v}_{i}(t-\Delta t/2) + \mathbf{f}_{i}(t)\Delta t/{m}_{i}
+.. math::
+   \mathbf{r}_{i}(t+\Delta t/2) = \mathbf{r}_{i}(t) + \mathbf{v}_{i}(t+\Delta t/2)\Delta t/2
+.. math::
+   \mathbf{v'}_{i}(t+\Delta t/2) = \mathrm{scale}\times\mathbf{v}_{i}(t+\Delta t/2)
+.. math::
+   \mathbf{r}_{i}(t+\Delta t) = \mathbf{r}_{i}(t+\Delta t/2) + \mathbf{v'}_{i}(t+\Delta t/2)\Delta t/2
+The universal scale factor used in the third step is determined by propagating
+auxilliary degrees of freedom alongside the regular particles.  The original
+Nosé-Hoover formulation used a single harmonic oscillator for the heat bath,
+but this is problematic in small or stiff systems, which are non-ergodic, so
+the chain formulation extends this by replacing the single oscillator
+thermostat with a chain of connected oscillators.  :cite:`Martyna1992`  For
+large systems a single oscillator (*i.e.* a chain length of one) will suffice,
+but longer chains are necessary to properly thermostat non-ergodic systems.
+The OpenMM default is to use a chain length of three to cover the latter case,
+but this can be safely reduced to increase efficiency in large systems.
+The heat bath propagation is performed using a multi-timestep algorithm.  Each
+propagation step is discretized into substeps using a factorization from
+Yoshida and Suzuki; the default discretization uses a :math:`\mathcal{O}(\Delta
+t^6)` approach that uses 7 points, but 1, 3 or 5 points may also be used to
+increase performace, at the expense of accuracy.  Each step is further
+subdivided into multi-timesteps with a default of 3 multi time steps per
+propagation; as with the number of Yoshida-Suziki points this value may be
+increase to increase accuracy but with additional computational expense.
 BrownianIntegrator
 ******************
@@ -1623,7 +1669,8 @@ Force Groups
 ************
 It is possible to split the Force objects in a System into groups.  Those groups
-can then be evaluated independently of each other.  Some Force classes also
+can then be evaluated independently of each other.  This is done by calling
+:code:`setForceGroup()` on the Force.  Some Force classes also
 provide finer grained control over grouping.  For example, NonbondedForce allows
 direct space computations to be in one group and reciprocal space computations
 in a different group.
@@ -1631,8 +1678,15 @@ in a different group.
 The most important use of force groups is for implementing multiple time step
 algorithms with CustomIntegrator.  For example, you might evaluate the slowly
 changing nonbonded interactions less frequently than the quickly changing bonded
-ones.  It also is useful if you want the ability to query a subset of the forces
+ones.  This can be done by putting the slow and fast forces into separate
-acting on the system.
+groups, then using a :class:`MTSIntegrator` or :class:`MTSLangevinIntegrator`
+that evaluates the groups at different frequencies.
+Another important use is to define forces that are not used when integrating
+the equations of motion, but can still be queried efficiently.  To do this,
+call :code:`setIntegrationForceGroups()` on the :class:`Integrator`.  Any groups
+omitted will be ignored during simulation, but can be queried at any time by
+calling :code:`getState()`.
 Virtual Sites
 *************

--- a/examples/apoa1.pdb
+++ b/examples/apoa1.pdb
--- a/examples/benchmark.py
+++ b/examples/benchmark.py
@@ -88,12 +88,13 @@ def runOneTest(testName, options):
            dt = 0.005*unit.picoseconds
            constraints = app.AllBonds
            hydrogenMass = 4*unit.amu
+            integ = mm.LangevinIntegrator(300*unit.kelvin, friction, dt)
        else:
-            dt = 0.002*unit.picoseconds
+            dt = 0.004*unit.picoseconds
            constraints = app.HBonds
            hydrogenMass = None
+            integ = mm.LangevinMiddleIntegrator(300*unit.kelvin, friction, dt)
        system = ff.createSystem(pdb.topology, nonbondedMethod=method, nonbondedCutoff=cutoff, constraints=constraints, hydrogenMass=hydrogenMass)
-        integ = mm.LangevinIntegrator(300*unit.kelvin, friction, dt)
    print('Step Size: %g fs' % dt.value_in_unit(unit.femtoseconds))
    properties = {}
    initialSteps = 5

--- a/libraries/pthreads/include/pthread.h
+++ b/libraries/pthreads/include/pthread.h
@@ -53,6 +53,7 @@
 #include <errno.h>
 #include <sys/timeb.h>
 #include <process.h>
+#include <stdint.h>
 #ifndef ETIMEDOUT
 #define ETIMEDOUT	110
@@ -78,7 +79,7 @@
 #define PTHREAD_DEFAULT_ATTR (PTHREAD_CANCEL_ENABLE)
-#define PTHREAD_CANCELED ((void *) 0xDEADBEEF)
+#define PTHREAD_CANCELED ((void *)(uintptr_t) 0xDEADBEEF)
 #define PTHREAD_ONCE_INIT 0
 #define PTHREAD_MUTEX_INITIALIZER {(void*)-1,-1,0,0,0,0}
@@ -1090,7 +1091,7 @@ static int pthread_barrierattr_destroy(void **attr)
 static int pthread_barrierattr_setpshared(void **attr, int s)
 {
-	*attr = (void *) s;
+	*attr = (void *)(intptr_t) s;
 	return 0;
 }

--- a/openmmapi/include/openmm/Integrator.h
+++ b/openmmapi/include/openmm/Integrator.h
@@ -85,6 +85,18 @@ public:
     * @param steps   the number of time steps to take
     */
    virtual void step(int steps) = 0;
+    /**
+     * Get which force groups to use for integration.  By default, all force groups
+     * are included.  This is interpreted as a set of bit flags: the forces from group i
+     * will be included if (groups&(1<<i)) != 0.
+     */
+    virtual int getIntegrationForceGroups() const;
+    /**
+     * Set which force groups to use for integration.  By default, all force groups
+     * are included.  This is interpreted as a set of bit flags: the forces from group i
+     * will be included if (groups&(1<<i)) != 0.
+     */
+    virtual void setIntegrationForceGroups(int groups);
 protected:
    friend class Context;
    friend class ContextImpl;
@@ -166,6 +178,7 @@ protected:
    }
 private:
    double stepSize, constraintTol;
+    int forceGroups;
 };
 } // namespace OpenMM

--- a/openmmapi/include/openmm/LocalEnergyMinimizer.h
+++ b/openmmapi/include/openmm/LocalEnergyMinimizer.h
@@ -43,6 +43,10 @@ namespace OpenMM {
 * force to the potential function.  The strength of the restraining force is steadily increased
 * until the minimum energy configuration satisfies all constraints to within the tolerance
 * specified by the Context's Integrator.
+ * 
+ * Energy minimization is done using the force groups defined by the Integrator.
+ * If you have called setIntegrationForceGroups() on it to restrict the set of forces
+ * used for integration, only the energy of the included forces will be minimized.
 */
 class OPENMM_EXPORT LocalEnergyMinimizer {

--- a/openmmapi/include/openmm/internal/vectorize8.h
+++ b/openmmapi/include/openmm/internal/vectorize8.h
-#ifndef OPENMM_VECTORIZE8_H_
+#ifndef OPENMM_VECTORIZEAVX_H_
-#define OPENMM_VECTORIZE8_H_
+#define OPENMM_VECTORIZEAVX_H_
 /* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
@@ -57,9 +57,7 @@ public:
     * @param table The table from which to do a lookup.
     * @param indexes The indexes to gather.
     */
-    fvec8(const float* table, const int idx[8]) {
+    fvec8(const float* table, const int32_t idx[8]) {
-        // :TODO: Using int32_t explicitly as the index type could allow the real gather instruction to be used.
-        // Use gather and static assert? Conditional code?
        val = _mm256_setr_ps(table[idx[0]], table[idx[1]], table[idx[2]], table[idx[3]], table[idx[4]], table[idx[5]], table[idx[6]], table[idx[7]]);
    }
@@ -75,55 +73,55 @@ public:
    void store(float* v) const {
        _mm256_storeu_ps(v, val);
    }
-    fvec8 operator+(const fvec8& other) const {
+    fvec8 operator+(fvec8 other) const {
        return _mm256_add_ps(val, other);
    }
-    fvec8 operator-(const fvec8& other) const {
+    fvec8 operator-(fvec8 other) const {
        return _mm256_sub_ps(val, other);
    }
-    fvec8 operator*(const fvec8& other) const {
+    fvec8 operator*(fvec8 other) const {
        return _mm256_mul_ps(val, other);
    }
-    fvec8 operator/(const fvec8& other) const {
+    fvec8 operator/(fvec8 other) const {
        return _mm256_div_ps(val, other);
    }
-    void operator+=(const fvec8& other) {
+    void operator+=(fvec8 other) {
        val = _mm256_add_ps(val, other);
    }
-    void operator-=(const fvec8& other) {
+    void operator-=(fvec8 other) {
        val = _mm256_sub_ps(val, other);
    }
-    void operator*=(const fvec8& other) {
+    void operator*=(fvec8 other) {
        val = _mm256_mul_ps(val, other);
    }
-    void operator/=(const fvec8& other) {
+    void operator/=(fvec8 other) {
        val = _mm256_div_ps(val, other);
    }
    fvec8 operator-() const {
        return _mm256_sub_ps(_mm256_set1_ps(0.0f), val);
    }
-    fvec8 operator&(const fvec8& other) const {
+    fvec8 operator&(fvec8 other) const {
        return _mm256_and_ps(val, other);
    }
-    fvec8 operator|(const fvec8& other) const {
+    fvec8 operator|(fvec8& other) const {
        return _mm256_or_ps(val, other);
    }
-    fvec8 operator==(const fvec8& other) const {
+    fvec8 operator==(fvec8 other) const {
        return _mm256_cmp_ps(val, other, _CMP_EQ_OQ);
    }
-    fvec8 operator!=(const fvec8& other) const {
+    fvec8 operator!=(fvec8 other) const {
        return _mm256_cmp_ps(val, other, _CMP_NEQ_OQ);
    }
-    fvec8 operator>(const fvec8& other) const {
+    fvec8 operator>(fvec8 other) const {
        return _mm256_cmp_ps(val, other, _CMP_GT_OQ);
    }
-    fvec8 operator<(const fvec8& other) const {
+    fvec8 operator<(fvec8 other) const {
        return _mm256_cmp_ps(val, other, _CMP_LT_OQ);
    }
-    fvec8 operator>=(const fvec8& other) const {
+    fvec8 operator>=(fvec8 other) const {
        return _mm256_cmp_ps(val, other, _CMP_GE_OQ);
    }
-    fvec8 operator<=(const fvec8& other) const {
+    fvec8 operator<=(fvec8 other) const {
        return _mm256_cmp_ps(val, other, _CMP_LE_OQ);
    }
    operator ivec8() const;
@@ -159,10 +157,10 @@ public:
    void store(int* v) const {
        _mm256_storeu_si256((__m256i*) v, val);
    }
-    ivec8 operator&(const ivec8& other) const {
+    ivec8 operator&(ivec8 other) const {
        return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(val), _mm256_castsi256_ps(other.val)));
    }
-    ivec8 operator|(const ivec8& other) const {
+    ivec8 operator|(ivec8 other) const {
        return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(val), _mm256_castsi256_ps(other.val)));
    }
    operator fvec8() const;
@@ -193,36 +191,36 @@ inline fvec8 fvec8::expandBitsToMask(int bitmask) {
 // Functions that operate on fvec8s.
-static inline fvec8 floor(const fvec8& v) {
+static inline fvec8 floor(fvec8 v) {
    return fvec8(_mm256_round_ps(v.val, 0x09));
 }
-static inline fvec8 ceil(const fvec8& v) {
+static inline fvec8 ceil(fvec8 v) {
    return fvec8(_mm256_round_ps(v.val, 0x0A));
 }
-static inline fvec8 round(const fvec8& v) {
+static inline fvec8 round(fvec8 v) {
    return fvec8(_mm256_round_ps(v.val, _MM_FROUND_TO_NEAREST_INT));
 }
-static inline fvec8 min(const fvec8& v1, const fvec8& v2) {
+static inline fvec8 min(fvec8 v1, fvec8 v2) {
    return fvec8(_mm256_min_ps(v1.val, v2.val));
 }
-static inline fvec8 max(const fvec8& v1, const fvec8& v2) {
+static inline fvec8 max(fvec8 v1, fvec8 v2) {
    return fvec8(_mm256_max_ps(v1.val, v2.val));
 }
-static inline fvec8 abs(const fvec8& v) {
+static inline fvec8 abs(fvec8 v) {
    static const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    return fvec8(_mm256_and_ps(v.val, mask));
 }
-static inline fvec8 sqrt(const fvec8& v) {
+static inline fvec8 sqrt(fvec8 v) {
    return fvec8(_mm256_sqrt_ps(v.val));
 }
-static inline fvec8 rsqrt(const fvec8& v) {
+static inline fvec8 rsqrt(fvec8 v) {
    // Initial estimate of rsqrt().
    fvec8 y(_mm256_rsqrt_ps(v.val));
@@ -234,17 +232,17 @@ static inline fvec8 rsqrt(const fvec8& v) {
    return y;
 }
-static inline float dot8(const fvec8& v1, const fvec8& v2) {
+static inline float dot8(fvec8 v1, fvec8 v2) {
    fvec8 result = _mm256_dp_ps(v1, v2, 0xF1);
    return _mm_cvtss_f32(result.lowerVec())+_mm_cvtss_f32(result.upperVec());
 }
-static inline float reduceAdd(const fvec8 v) {
+static inline float reduceAdd(fvec8 v) {
    // :TODO: There are more efficient ways to do this.
    return dot8(v, fvec8(1.0f));
 }
-static inline void transpose(const fvec4& in1, const fvec4& in2, const fvec4& in3, const fvec4& in4, const fvec4& in5, const fvec4& in6, const fvec4& in7, const fvec4& in8, fvec8& out1, fvec8& out2, fvec8& out3, fvec8& out4) {
+static inline void transpose(fvec4 in1, fvec4 in2, fvec4 in3, fvec4 in4, fvec4 in5, fvec4 in6, fvec4 in7, fvec4 in8, fvec8& out1, fvec8& out2, fvec8& out3, fvec8& out4) {
    fvec4 i1 = in1, i2 = in2, i3 = in3, i4 = in4;
    fvec4 i5 = in5, i6 = in6, i7 = in7, i8 = in8;
    _MM_TRANSPOSE4_PS(i1, i2, i3, i4);
@@ -275,7 +273,7 @@ static inline void transpose(const fvec4 in[8], fvec8& out1, fvec8& out2, fvec8&
    transpose(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], out1, out2, out3, out4);
 }
-static inline void transpose(const fvec8& in1, const fvec8& in2, const fvec8& in3, const fvec8& in4, fvec4& out1, fvec4& out2, fvec4& out3, fvec4& out4, fvec4& out5, fvec4& out6, fvec4& out7, fvec4& out8) {
+static inline void transpose(fvec8 in1, fvec8 in2, fvec8 in3, fvec8 in4, fvec4& out1, fvec4& out2, fvec4& out3, fvec4& out4, fvec4& out5, fvec4& out6, fvec4& out7, fvec4& out8) {
    out1 = in1.lowerVec();
    out2 = in2.lowerVec();
    out3 = in3.lowerVec();
@@ -291,40 +289,40 @@ static inline void transpose(const fvec8& in1, const fvec8& in2, const fvec8& in
 /**
 * Given 4 input vectors of 8 elements, transpose them to form 8 output vectors of 4 elements.
 */
-static inline void transpose(const fvec8& in1, const fvec8& in2, const fvec8& in3, const fvec8& in4, fvec4 out[8]) {
+static inline void transpose(fvec8 in1, fvec8 in2, fvec8 in3, fvec8 in4, fvec4 out[8]) {
    transpose(in1, in2, in3, in4, out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7]);
 }
 // Functions that operate on ivec8s.
-static inline bool any(const ivec8& v) {
+static inline bool any(ivec8 v) {
    return !_mm256_testz_si256(v, _mm256_set1_epi32(0xFFFFFFFF));
 }
 // Mathematical operators involving a scalar and a vector.
-static inline fvec8 operator+(float v1, const fvec8& v2) {
+static inline fvec8 operator+(float v1, fvec8 v2) {
    return fvec8(v1)+v2;
 }
-static inline fvec8 operator-(float v1, const fvec8& v2) {
+static inline fvec8 operator-(float v1, fvec8 v2) {
    return fvec8(v1)-v2;
 }
-static inline fvec8 operator*(float v1, const fvec8& v2) {
+static inline fvec8 operator*(float v1, fvec8 v2) {
    return fvec8(v1)*v2;
 }
-static inline fvec8 operator/(float v1, const fvec8& v2) {
+static inline fvec8 operator/(float v1, fvec8 v2) {
    return fvec8(v1)/v2;
 }
 // Operation for blending fvec8 from a full bitmask.
-static inline fvec8 blend(const fvec8& v1, const fvec8& v2, const fvec8& mask) {
+static inline fvec8 blend(fvec8 v1, fvec8 v2, fvec8 mask) {
    return fvec8(_mm256_blendv_ps(v1.val, v2.val, mask.val));
 }
-static inline fvec8 blendZero(const fvec8 v, const fvec8 mask) {
+static inline fvec8 blendZero(fvec8 v, fvec8 mask) {
    return blend(0.0f, v, mask);
 }
@@ -333,7 +331,7 @@ static inline fvec8 blendZero(const fvec8 v, const fvec8 mask) {
 * of vectors. The first result vector contains the values at the given indexes, and the second
 * result vector contains the values from each respective index+1.
 */
-static inline void gatherVecPair(const float* table, const ivec8 index, fvec8& out0, fvec8& out1) {
+static inline void gatherVecPair(const float* table, ivec8 index, fvec8& out0, fvec8& out1) {
    const auto lower = index.lowerVec();
    const auto upper = index.upperVec();
@@ -368,7 +366,7 @@ static inline void gatherVecPair(const float* table, const ivec8 index, fvec8& o
 *   output[2] = (Z0 + Z1 + Z2 + ...)
 *   output[3] = undefined
 */
-static inline fvec4 reduceToVec3(const fvec8 x, const fvec8 y, const fvec8 z) {
+static inline fvec4 reduceToVec3(fvec8 x, fvec8 y, fvec8 z) {
    // The general strategy for a vector reduce-add operation is to take values from
    // different parts of the vector and overlap them a different part of the vector and then
    // add together. Repeat this several times until all values have been summed. Initially 8
@@ -415,4 +413,4 @@ static inline fvec4 reduceToVec3(const fvec8 x, const fvec8 y, const fvec8 z) {
    return laneResult.lowerVec() + laneResult.upperVec();
 }
-#endif /*OPENMM_VECTORIZE8_H_*/
+#endif /*OPENMM_VECTORIZEAVX_H_*/
--- a/openmmapi/include/openmm/internal/vectorizeAvx2.h
+++ b/openmmapi/include/openmm/internal/vectorizeAvx2.h
+#ifndef OPENMM_VECTORIZE_AVX2_H_
+#define OPENMM_VECTORIZE_AVX2_H_
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013-2014 Stanford University and the Authors.      *
+ * Authors: Daniel Towner                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+#include "vectorizeAvx.h"
+#include <immintrin.h>
+// This file defines classes and functions to simplify vectorizing code with AVX.
+bool isAvx2Supported() {
+    // Provide an alternative implementation of CPUID to support AVX2. On older
+    // non-Windows OSes the hardware.h support for CPUID doesn't set the CX register
+    // properly and gives the wrong answer when detecting AVX2 and beyond. On Windows
+    // the cpuid seems to work as expected so can be used.
+#if !(defined(_WIN32) || defined(WIN32))
+    auto cpuid = [](int output[4], int functionnumber) {
+        int a, b, c, d;
+        __asm("cpuid" : "=a"(a),"=b"(b),"=c"(c),"=d"(d) : "a"(functionnumber), "c"(0) : );
+        output[0] = a;
+        output[1] = b;
+        output[2] = c;
+        output[3] = d;
+    };
+#endif
+    int cpuInfo[4];
+    cpuid(cpuInfo, 0);
+    if (cpuInfo[0] >= 7) {
+        cpuInfo[2] = 0;
+        cpuid(cpuInfo, 7);
+        return ((cpuInfo[1] & ((int) 1 << 5)) != 0);
+    }
+    return false;
+}
+/** 
+ * Derive from fvec8 so that default implementations of everything are provided,
+ * but can be overriden with AVX2-specific variants where possible.
+ */
+class fvecAvx2 : public fvec8 {
+public:
+    fvecAvx2() = default;
+    fvecAvx2(fvec8 v) : fvec8(v) {}
+    fvecAvx2(float v) : fvec8(v) {}
+    fvecAvx2(float v1, float v2, float v3, float v4, float v5, float v6, float v7, float v8) : fvec8(v8, v7, v6, v5, v4, v3, v2, v1) {}
+    fvecAvx2(__m256 v) : fvec8(v) {}
+    fvecAvx2(const float* v) : fvec8(v) {}
+    /** Create a vector by gathering individual indexes of data from a table. Element i of the vector will
+     * be loaded from table[idx[i]].
+     * @param table The table from which to do a lookup.
+     * @param indexes The indexes to gather.
+     */
+    fvecAvx2(const float* table, const int idx[8])
+        : fvec8(_mm256_i32gather_ps(table, _mm256_loadu_si256((const __m256i*)idx), 4)) {}
+    static fvecAvx2 expandBitsToMask(int bitmask);
+};
+inline fvecAvx2 fvecAvx2::expandBitsToMask(int bitmask) {
+    // Put a copy of all bits into each vector element and then shift so that the
+    // appropriate sub-bit becomes the MSB. For masking purposes, only the MSB matters and
+    // the other bits can be completely arbitrary.
+    const auto msb = _mm256_sllv_epi32(_mm256_set1_epi8(bitmask),
+                                       _mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0));
+    return _mm256_castsi256_ps(msb);
+}
+/**
+ * Given a table of floating-point values and a set of indexes, perform a gather read into a pair
+ * of vectors. The first result vector contains the values at the given indexes, and the second
+ * result vector contains the values from each respective index+1.
+ */
+static inline void gatherVecPair(const float* table, ivec8 index, fvecAvx2& out0, fvecAvx2& out1) {
+    const double* tableAsDbl = (const double*)table;
+    // The input is a set of 8 indexes, each of which refers to a pair of floating point
+    // values. The most efficient way to load from indexes in a vector is the gather instruction,
+    // and the 64-bit variant should be used to get the pairs.
+    // Given indexes ABCDEFGH, load the pairs corresponding to A C E G. This gives a set of
+    // 4 pairs. The high indexes (in the upper part of each 64-bit index) are cleared.
+    const auto lowerIdx = _mm256_and_si256(index, _mm256_set1_epi64x(0xFFFFFFFF));
+    const auto lowerGather = _mm256_castpd_ps(_mm256_i64gather_pd(tableAsDbl, lowerIdx, 4));
+    // Load indexes B D F H, this time by shifting the high 32-bit indexes into the lower 32-bits.
+    const auto upperIdx = _mm256_srli_epi64(index, 32);
+    const auto upperGather = _mm256_castpd_ps(_mm256_i64gather_pd(tableAsDbl, upperIdx, 4));
+    // All the first values must now be brought together. The lower values are already in the
+    // correct place, but the upper gather values must be moved over and blended in.
+    const auto swapUpper = _mm256_permute_ps(upperGather, 0b10110001);
+    out0 = fvecAvx2(_mm256_blend_ps(lowerGather, swapUpper, 0b10101010));
+    // And the same for the upper values.
+    const auto swapLower = _mm256_permute_ps(lowerGather, 0b10110001);
+    out1 = fvecAvx2(_mm256_blend_ps(swapLower, upperGather, 0b10101010));
+}
+#endif /*OPENMM_VECTORIZE_AVX2_H_*/
--- a/openmmapi/include/openmm/internal/vectorize_neon.h
+++ b/openmmapi/include/openmm/internal/vectorize_neon.h
@@ -76,10 +76,7 @@ public:
    fvec4() = default;
    fvec4(float v) : val(vdupq_n_f32(v)) {}
-    fvec4(float v1, float v2, float v3, float v4) {
+    fvec4(float v1, float v2, float v3, float v4) : val {v1, v2, v3, v4} {}
-        float v[] = {v1, v2, v3, v4};
-        val = vld1q_f32(v);
-    }
    fvec4(float32x4_t v) : val(v) {}
    fvec4(const float* v) : val(vld1q_f32(v)) {}
    operator float32x4_t() const {
@@ -92,7 +89,7 @@ public:
      * @param table The table from which to do a lookup.
      * @param indexes The indexes to gather.
      */
-    fvec4(const float* table, const int idx[4])
+    fvec4(const float* table, const int32_t idx[4])
        : fvec4(table[idx[0]], table[idx[1]], table[idx[2]], table[idx[3]]) { }
    float operator[](int i) const {
@@ -121,16 +118,16 @@ public:
        v[2] = vgetq_lane_f32(val, 2);
    }
-    fvec4 operator+(const fvec4& other) const {
+    fvec4 operator+(fvec4 other) const {
        return vaddq_f32(val, other);
    }
-    fvec4 operator-(const fvec4& other) const {
+    fvec4 operator-(fvec4 other) const {
        return vsubq_f32(val, other);
    }
-    fvec4 operator*(const fvec4& other) const {
+    fvec4 operator*(fvec4 other) const {
        return vmulq_f32(val, other);
    }
-    fvec4 operator/(const fvec4& other) const {
+    fvec4 operator/(fvec4 other) const {
        // NEON does not have a divide float-point operator, so we get the reciprocal and multiply.
        float32x4_t reciprocal = vrecpeq_f32(other);
@@ -139,45 +136,34 @@ public:
        fvec4 result = vmulq_f32(val,reciprocal);
        return result;
    }
-    void operator+=(const fvec4& other) {
+    void operator+=(fvec4 other) {
        val = vaddq_f32(val, other);
    }
-    void operator-=(const fvec4& other) {
+    void operator-=(fvec4 other) {
        val = vsubq_f32(val, other);
    }
-    void operator*=(const fvec4& other) {
+    void operator*=(fvec4 other) {
        val = vmulq_f32(val, other);
    }
-    void operator/=(const fvec4& other) {
+    void operator/=(fvec4 other) {
        val = *this/other;
    }
    fvec4 operator-() const {
        return vnegq_f32(val);
    }
-    fvec4 operator&(const fvec4& other) const {
+    fvec4 operator&(fvec4 other) const {
        return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(val), vreinterpretq_u32_f32(other)));
    }
-    fvec4 operator|(const fvec4& other) const {
+    fvec4 operator|(fvec4 other) const {
        return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(val), vreinterpretq_u32_f32(other)));
    }
-    fvec4 operator==(const fvec4& other) const {
-        return vcvtq_f32_s32(vreinterpretq_s32_u32(vceqq_f32(val, other)));
+    ivec4 operator==(fvec4 other) const;
-    }
+    ivec4 operator!=(fvec4 other) const;
-    fvec4 operator!=(const fvec4& other) const {
+    ivec4 operator>(fvec4 other) const;
-        return vcvtq_f32_s32(vreinterpretq_s32_u32(vmvnq_u32(vceqq_f32(val, other)))); // not(equals(val, other))
+    ivec4 operator<(fvec4 other) const;
-    }
+    ivec4 operator>=(fvec4 other) const;
-    fvec4 operator>(const fvec4& other) const {
+    ivec4 operator<=(fvec4 other) const;
-        return vcvtq_f32_s32(vreinterpretq_s32_u32(vcgtq_f32(val, other)));
-    }
-    fvec4 operator<(const fvec4& other) const {
-        return vcvtq_f32_s32(vreinterpretq_s32_u32(vcltq_f32(val, other)));
-    }
-    fvec4 operator>=(const fvec4& other) const {
-        return vcvtq_f32_s32(vreinterpretq_s32_u32(vcgeq_f32(val, other)));
-    }
-    fvec4 operator<=(const fvec4& other) const {
-        return vcvtq_f32_s32(vreinterpretq_s32_u32(vcleq_f32(val, other)));
-    }
    operator ivec4() const;
    /**
@@ -198,10 +184,7 @@ public:
    ivec4() {}
    ivec4(int v) : val(vdupq_n_s32(v)) {}
-    ivec4(int v1, int v2, int v3, int v4) {
+    ivec4(int v1, int v2, int v3, int v4) : val {v1, v2, v3, v4} {}
-        int v[] = {v1, v2, v3, v4};
-        val = vld1q_s32(v);
-    }
    ivec4(int32x4_t v) : val(v) {}
    ivec4(const int* v) : val(vld1q_s32(v)) {}
    operator int32x4_t() const {
@@ -223,49 +206,49 @@ public:
    void store(int* v) const {
        vst1q_s32(v, val);
    }
-    ivec4 operator+(const ivec4& other) const {
+    ivec4 operator+(ivec4 other) const {
        return vaddq_s32(val, other);
    }
-    ivec4 operator-(const ivec4& other) const {
+    ivec4 operator-(ivec4 other) const {
        return vsubq_s32(val, other);
    }
-    ivec4 operator*(const ivec4& other) const {
+    ivec4 operator*(ivec4 other) const {
        return vmulq_s32(val, other);
    }
-    void operator+=(const ivec4& other) {
+    void operator+=(ivec4 other) {
        val = vaddq_s32(val, other);
    }
-    void operator-=(const ivec4& other) {
+    void operator-=(ivec4 other) {
        val = vsubq_s32(val, other);
    }
-    void operator*=(const ivec4& other) {
+    void operator*=(ivec4 other) {
        val = vmulq_s32(val, other);
    }
    ivec4 operator-() const {
        return vnegq_s32(val);
    }
-    ivec4 operator&(const ivec4& other) const {
+    ivec4 operator&(ivec4 other) const {
        return vandq_s32(val, other);
    }
-    ivec4 operator|(const ivec4& other) const {
+    ivec4 operator|(ivec4 other) const {
        return vorrq_s32(val, other);
    }
-    ivec4 operator==(const ivec4& other) const {
+    ivec4 operator==(ivec4 other) const {
        return vreinterpretq_s32_u32(vceqq_s32(val, other));
    }
-    ivec4 operator!=(const ivec4& other) const {
+    ivec4 operator!=(ivec4 other) const {
        return vreinterpretq_s32_u32(vmvnq_u32(vceqq_s32(val, other))); // not(equal(val, other))
    }
-    ivec4 operator>(const ivec4& other) const {
+    ivec4 operator>(ivec4 other) const {
        return vreinterpretq_s32_u32(vcgtq_s32(val, other));
    }
-    ivec4 operator<(const ivec4& other) const {
+    ivec4 operator<(ivec4 other) const {
        return vreinterpretq_s32_u32(vcltq_s32(val, other));
    }
-    ivec4 operator>=(const ivec4& other) const {
+    ivec4 operator>=(ivec4 other) const {
        return vreinterpretq_s32_u32(vcgeq_s32(val, other));
    }
-    ivec4 operator<=(const ivec4& other) const {
+    ivec4 operator<=(ivec4 other) const {
        return vreinterpretq_s32_u32(vcleq_s32(val, other));
    }
    operator fvec4() const;
@@ -287,54 +270,84 @@ inline ivec4 fvec4::expandBitsToMask(int bitmask) {
                 bitmask & 4 ? -1 : 0,
                 bitmask & 8 ? -1 : 0);
 }
+// Comparison operators
+inline ivec4 fvec4::operator==(fvec4 other) const {
+    return vreinterpretq_s32_u32(vceqq_f32(val, other));
+}
+inline ivec4 fvec4::operator!=(fvec4 other) const {
+    return vreinterpretq_s32_u32(vmvnq_u32(vceqq_f32(val, other))); // not(equals(val, other))
+}
+inline ivec4 fvec4::operator>(fvec4 other) const {
+    return vreinterpretq_s32_u32(vcgtq_f32(val, other));
+}
+inline ivec4 fvec4::operator<(fvec4 other) const {
+    return vreinterpretq_s32_u32(vcltq_f32(val, other));
+}
+inline ivec4 fvec4::operator>=(fvec4 other) const {
+    return vreinterpretq_s32_u32(vcgeq_f32(val, other));
+}
+inline ivec4 fvec4::operator<=(fvec4 other) const {
+    return vreinterpretq_s32_u32(vcleq_f32(val, other));
+}
 // Functions that operate on fvec4s.
-static inline fvec4 min(const fvec4& v1, const fvec4& v2) {
+static inline fvec4 min(fvec4 v1, fvec4 v2) {
    return vminq_f32(v1, v2);
 }
-static inline fvec4 max(const fvec4& v1, const fvec4& v2) {
+static inline fvec4 max(fvec4 v1, fvec4 v2) {
    return vmaxq_f32(v1, v2);
 }
-static inline fvec4 abs(const fvec4& v) {
+static inline fvec4 abs(fvec4 v) {
    return vabsq_f32(v);
 }
-static inline fvec4 rsqrt(const fvec4& v) {
+static inline fvec4 rsqrt(fvec4 v) {
    float32x4_t recipSqrt = vrsqrteq_f32(v);
    recipSqrt = vmulq_f32(recipSqrt, vrsqrtsq_f32(vmulq_f32(recipSqrt, v), recipSqrt));
    recipSqrt = vmulq_f32(recipSqrt, vrsqrtsq_f32(vmulq_f32(recipSqrt, v), recipSqrt));
    return recipSqrt;
 }
-static inline fvec4 sqrt(const fvec4& v) {
+static inline fvec4 sqrt(fvec4 v) {
    return rsqrt(v)*v;
 }
-static inline fvec4 exp(const fvec4& v) {
+static inline fvec4 exp(fvec4 v) {
    return fvec4(exp_ps(v.val));
 }
-static inline fvec4 log(const fvec4& v) {
+static inline fvec4 log(fvec4 v) {
    return fvec4(log_ps(v.val));
 }
-static inline float dot3(const fvec4& v1, const fvec4& v2) {
+static inline float dot3(fvec4 v1, fvec4 v2) {
    fvec4 result = v1*v2;
    return vgetq_lane_f32(result, 0) + vgetq_lane_f32(result, 1) + vgetq_lane_f32(result, 2);
 }
-static inline float dot4(const fvec4& v1, const fvec4& v2) {
+static inline float dot4(fvec4 v1, fvec4 v2) {
    fvec4 result = v1*v2;
    return vgetq_lane_f32(result, 0) + vgetq_lane_f32(result, 1) + vgetq_lane_f32(result, 2) + vgetq_lane_f32(result,3);
 }
-static inline float reduceAdd(const fvec4 v) {
+static inline float reduceAdd(fvec4 v) {
+#ifdef __ARM64__
+    return vaddvq_f32(v);
+#else
    return dot4(v, fvec4(1.0f));
+#endif
 }
-static inline fvec4 cross(const fvec4& v1, const fvec4& v2) {
+static inline fvec4 cross(fvec4 v1, fvec4 v2) {
    return fvec4(v1[1]*v2[2] - v1[2]*v2[1],
                 v1[2]*v2[0] - v1[0]*v2[2],
                 v1[0]*v2[1] - v1[1]*v2[0], 0);
@@ -362,71 +375,79 @@ static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2,
 /**
 * Out-of-place transpose from named variables into an array.
 */
-static inline void transpose(const fvec4 v0, const fvec4 v1, const fvec4 v2, const fvec4 v3, fvec4 out[4]) {
+static inline void transpose(fvec4 v0, fvec4 v1, fvec4 v2, fvec4 v3, fvec4 out[4]) {
    out[0] = v0; out[1] = v1; out[2] = v2; out[3] = v3;
    transpose(out[0], out[1], out[2], out[3]);
 }
 // Functions that operate on ivec4s.
-static inline ivec4 min(const ivec4& v1, const ivec4& v2) {
+static inline ivec4 min(ivec4 v1, ivec4 v2) {
    return vminq_s32(v1, v2);
 }
-static inline ivec4 max(const ivec4& v1, const ivec4& v2) {
+static inline ivec4 max(ivec4 v1, ivec4 v2) {
    return vmaxq_s32(v1, v2);
 }
-static inline ivec4 abs(const ivec4& v) {
+static inline ivec4 abs(ivec4 v) {
    return vabdq_s32(v, ivec4(0));
 }
-static inline bool any(const ivec4& v) {
+static inline bool any(ivec4 v) {
+#ifdef __ARM64__
+    return (vmaxvq_u32(vreinterpretq_u32_s32(v)) != 0);
+#else
    return (vgetq_lane_s32(v, 0) != 0 || vgetq_lane_s32(v, 1) != 0 || vgetq_lane_s32(v, 2) != 0 || vgetq_lane_s32(v, 3) != 0);
+#endif
 }
 // Mathematical operators involving a scalar and a vector.
-static inline fvec4 operator+(float v1, const fvec4& v2) {
+static inline fvec4 operator+(float v1, fvec4 v2) {
    return fvec4(v1)+v2;
 }
-static inline fvec4 operator-(float v1, const fvec4& v2) {
+static inline fvec4 operator-(float v1, fvec4 v2) {
    return fvec4(v1)-v2;
 }
-static inline fvec4 operator*(float v1, const fvec4& v2) {
+static inline fvec4 operator*(float v1, fvec4 v2) {
    return fvec4(v1)*v2;
 }
-static inline fvec4 operator/(float v1, const fvec4& v2) {
+static inline fvec4 operator/(float v1, fvec4 v2) {
    return fvec4(v1)/v2;
 }
 // Operations for blending fvec4s based on an ivec4.
-static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const ivec4& mask) {
+static inline fvec4 blend(fvec4 v1, fvec4 v2, ivec4 mask) {
    return vbslq_f32(vreinterpretq_u32_s32(mask), v2, v1);
 }
-static inline fvec4 blendZero(const fvec4 v, const ivec4 mask) {
+static inline fvec4 blendZero(fvec4 v, ivec4 mask) {
-    return blend(0.0f, v, mask);
+    return vreinterpretq_f32_s32(vandq_s32(vreinterpretq_s32_f32(v), mask));
+}
+static inline ivec4 blendZero(ivec4 v, ivec4 mask) {
+    return v & mask;
 }
 // These are at the end since they involve other functions defined above.
-static inline fvec4 round(const fvec4& v) {
+static inline fvec4 round(fvec4 v) {
    fvec4 shift(0x1.0p23f);
    fvec4 absResult = (abs(v)+shift)-shift;
    return blend(v, absResult, ivec4(0x7FFFFFFF));
 }
-static inline fvec4 floor(const fvec4& v) {
+static inline fvec4 floor(fvec4 v) {
    fvec4 rounded = round(v);
    return rounded + blend(0.0f, -1.0f, rounded>v);
 }
-static inline fvec4 ceil(const fvec4& v) {
+static inline fvec4 ceil(fvec4 v) {
    fvec4 rounded = round(v);
    return rounded + blend(0.0f, 1.0f, rounded<v);
 }
@@ -435,7 +456,7 @@ static inline fvec4 ceil(const fvec4& v) {
 * of vectors. The first result vector contains the values at the given indexes, and the second
 * result vector contains the values from each respective index+1.
 */
-static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& out0, fvec4& out1) {
+static inline void gatherVecPair(const float* table, ivec4 index, fvec4& out0, fvec4& out1) {
    fvec4 t0(table + index[0]);
    fvec4 t1(table + index[1]);
    fvec4 t2(table + index[2]);
@@ -458,7 +479,7 @@ static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& o
 *   output[2] = (Z0 + Z1 + Z2 + Z3)
 *   output[3] = undefined
 */
-static inline fvec4 reduceToVec3(const fvec4 x, const fvec4 y, const fvec4 z) {
+static inline fvec4 reduceToVec3(fvec4 x, fvec4 y, fvec4 z) {
    const auto nx = reduceAdd(x);
    const auto ny = reduceAdd(y);
    const auto nz = reduceAdd(z);

--- a/openmmapi/include/openmm/internal/vectorize_pnacl.h
+++ b/openmmapi/include/openmm/internal/vectorize_pnacl.h
@@ -74,7 +74,7 @@ public:
     * @param table The table from which to do a lookup.
     * @param indexes The indexes to gather.
     */
-    fvec4(const float* table, const int idx[4])
+    fvec4(const float* table, const int32_t idx[4])
        : fvec4(table[idx[0]], table[idx[1]], table[idx[2]], table[idx[3]]) { }
    operator __m128() const {
@@ -95,45 +95,45 @@ public:
        v[1] = val[1];
        v[2] = val[2];
    }
-    fvec4 operator+(const fvec4& other) const {
+    fvec4 operator+(fvec4 other) const {
        return val+other;
    }
-    fvec4 operator-(const fvec4& other) const {
+    fvec4 operator-(fvec4 other) const {
        return val-other;
    }
-    fvec4 operator*(const fvec4& other) const {
+    fvec4 operator*(fvec4 other) const {
        return val*other;
    }
-    fvec4 operator/(const fvec4& other) const {
+    fvec4 operator/(fvec4 other) const {
        return val/other;
    }
-    void operator+=(const fvec4& other) {
+    void operator+=(fvec4 other) {
        val = val+other;
    }
-    void operator-=(const fvec4& other) {
+    void operator-=(fvec4 other) {
        val = val-other;
    }
-    void operator*=(const fvec4& other) {
+    void operator*=(fvec4 other) {
        val = val*other;
    }
-    void operator/=(const fvec4& other) {
+    void operator/=(fvec4 other) {
        val = val/other;
    }
    fvec4 operator-() const {
        return -val;
    }
-    fvec4 operator&(const fvec4& other) const {
+    fvec4 operator&(fvec4 other) const {
        return (fvec4) (((__m128i)val)&((__m128i)other.val));
    }
-    fvec4 operator|(const fvec4& other) const {
+    fvec4 operator|(fvec4 other) const {
        return (fvec4) (((__m128i)val)|((__m128i)other.val));
    }
-    ivec4 operator==(const fvec4& other) const;
+    ivec4 operator==(fvec4 other) const;
-    ivec4 operator!=(const fvec4& other) const;
+    ivec4 operator!=(fvec4 other) const;
-    ivec4 operator>(const fvec4& other) const;
+    ivec4 operator>(fvec4 other) const;
-    ivec4 operator<(const fvec4& other) const;
+    ivec4 operator<(fvec4 other) const;
-    ivec4 operator>=(const fvec4& other) const;
+    ivec4 operator>=(fvec4 other) const;
-    ivec4 operator<=(const fvec4& other) const;
+    ivec4 operator<=(fvec4 other) const;
    operator ivec4() const;
    /**
@@ -171,49 +171,49 @@ public:
    void store(int* v) const {
        *((__m128*) v) = val;
    }
-    ivec4 operator+(const ivec4& other) const {
+    ivec4 operator+(ivec4 other) const {
        return val+other;
    }
-    ivec4 operator-(const ivec4& other) const {
+    ivec4 operator-(ivec4 other) const {
        return val-other;
    }
-    ivec4 operator*(const ivec4& other) const {
+    ivec4 operator*(ivec4 other) const {
        return val*other;
    }
-    void operator+=(const ivec4& other) {
+    void operator+=(ivec4 other) {
        val = val+other;
    }
-    void operator-=(const ivec4& other) {
+    void operator-=(ivec4 other) {
        val = val-other;
    }
-    void operator*=(const ivec4& other) {
+    void operator*=(ivec4 other) {
        val = val*other;
    }
    ivec4 operator-() const {
        return -val;
    }
-    ivec4 operator&(const ivec4& other) const {
+    ivec4 operator&(ivec4 other) const {
        return val&other.val;
    }
-    ivec4 operator|(const ivec4& other) const {
+    ivec4 operator|(ivec4 other) const {
        return val|other.val;
    }
-    ivec4 operator==(const ivec4& other) const {
+    ivec4 operator==(ivec4 other) const {
        return (val==other.val);
    }
-    ivec4 operator!=(const ivec4& other) const {
+    ivec4 operator!=(ivec4 other) const {
        return (val!=other.val);
    }
-    ivec4 operator>(const ivec4& other) const {
+    ivec4 operator>(ivec4 other) const {
        return (val>other.val);
    }
-    ivec4 operator<(const ivec4& other) const {
+    ivec4 operator<(ivec4 other) const {
        return (val<other.val);
    }
-    ivec4 operator>=(const ivec4& other) const {
+    ivec4 operator>=(ivec4 other) const {
        return (val>=other.val);
    }
-    ivec4 operator<=(const ivec4& other) const {
+    ivec4 operator<=(ivec4 other) const {
        return (val<=other.val);
    }
    operator fvec4() const;
@@ -221,27 +221,27 @@ public:
 // Conversion operators.
-inline ivec4 fvec4::operator==(const fvec4& other) const {
+inline ivec4 fvec4::operator==(fvec4 other) const {
    return (__m128i) (val==other.val);
 }
-inline ivec4 fvec4::operator!=(const fvec4& other) const {
+inline ivec4 fvec4::operator!=(fvec4 other) const {
    return (__m128i) (val!=other.val);
 }
-inline ivec4 fvec4::operator>(const fvec4& other) const {
+inline ivec4 fvec4::operator>(fvec4 other) const {
    return (__m128i) (val>other.val);
 }
-inline ivec4 fvec4::operator<(const fvec4& other) const {
+inline ivec4 fvec4::operator<(fvec4 other) const {
    return (__m128i) (val<other.val);
 }
-inline ivec4 fvec4::operator>=(const fvec4& other) const {
+inline ivec4 fvec4::operator>=(fvec4 other) const {
    return (__m128i) (val>=other.val);
 }
-inline ivec4 fvec4::operator<=(const fvec4& other) const {
+inline ivec4 fvec4::operator<=(fvec4 other) const {
    return (__m128i) (val<=other.val);
 }
@@ -262,34 +262,34 @@ inline ivec4 fvec4::expandBitsToMask(int bitmask) {
 // Functions that operate on fvec4s.
-static inline fvec4 abs(const fvec4& v) {
+static inline fvec4 abs(fvec4 v) {
    return v&(__m128) ivec4(0x7FFFFFFF);
 }
-static inline fvec4 exp(const fvec4& v) {
+static inline fvec4 exp(fvec4 v) {
    return fvec4(expf(v[0]), expf(v[1]), expf(v[2]), expf(v[3]));
 }
-static inline fvec4 log(const fvec4& v) {
+static inline fvec4 log(fvec4 v) {
    return fvec4(logf(v[0]), logf(v[1]), logf(v[2]), logf(v[3]));
 }
-static inline float dot3(const fvec4& v1, const fvec4& v2) {
+static inline float dot3(fvec4 v1, fvec4 v2) {
    fvec4 r = v1*v2;
    return r[0]+r[1]+r[2];
 }
-static inline float dot4(const fvec4& v1, const fvec4& v2) {
+static inline float dot4(fvec4 v1, fvec4 v2) {
    fvec4 r = v1*v2;
    fvec4 temp = __builtin_shufflevector(r.val, r.val, 0, 1, -1, -1)+__builtin_shufflevector(r.val, r.val, 2, 3, -1, -1);
    return temp[0]+temp[1];
 }
-static inline float reduceAdd(const fvec4 v) {
+static inline float reduceAdd(fvec4 v) {
    return dot4(v, fvec4(1.0f));
 }
-static inline fvec4 cross(const fvec4& v1, const fvec4& v2) {
+static inline fvec4 cross(fvec4 v1, fvec4 v2) {
    __m128 temp = v2.val*__builtin_shufflevector(v1.val, v1.val, 2, 0, 1, 3) -
                  v1.val*__builtin_shufflevector(v2.val, v2.val, 2, 0, 1, 3);
    return __builtin_shufflevector(temp, temp, 2, 0, 1, 3);
@@ -317,85 +317,89 @@ static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2,
 /**
 * Out-of-place transpose from named variables into an array.
 */
-static inline void transpose(const fvec4 v0, const fvec4 v1, const fvec4 v2, const fvec4 v3, fvec4 out[4]) {
+static inline void transpose(fvec4 v0, fvec4 v1, fvec4 v2, fvec4 v3, fvec4 out[4]) {
    out[0] = v0; out[1] = v1; out[2] = v2; out[3] = v3;
    transpose(out[0], out[1], out[2], out[3]);
 }
 // Functions that operate on ivec4s.
-static inline ivec4 min(const ivec4& v1, const ivec4& v2) {
+static inline ivec4 min(ivec4 v1, ivec4 v2) {
    return ivec4(std::min(v1[0], v2[0]), std::min(v1[1], v2[1]), std::min(v1[2], v2[2]), std::min(v1[3], v2[3]));
 }
-static inline ivec4 max(const ivec4& v1, const ivec4& v2) {
+static inline ivec4 max(ivec4 v1, ivec4 v2) {
    return ivec4(std::max(v1[0], v2[0]), std::max(v1[1], v2[1]), std::max(v1[2], v2[2]), std::max(v1[3], v2[3]));
 }
-static inline ivec4 abs(const ivec4& v) {
+static inline ivec4 abs(ivec4 v) {
    return ivec4(abs(v[0]), abs(v[1]), abs(v[2]), abs(v[3]));
 }
-static inline bool any(const __m128i& v) {
+static inline bool any(__m128i v) {
    ivec4 temp = __builtin_shufflevector(v, v, 0, 1, -1, -1) | __builtin_shufflevector(v, v, 2, 3, -1, -1);
    return (temp[0] || temp[1]);
 }
 // Mathematical operators involving a scalar and a vector.
-static inline fvec4 operator+(float v1, const fvec4& v2) {
+static inline fvec4 operator+(float v1, fvec4 v2) {
    return fvec4(v1)+v2;
 }
-static inline fvec4 operator-(float v1, const fvec4& v2) {
+static inline fvec4 operator-(float v1, fvec4 v2) {
    return fvec4(v1)-v2;
 }
-static inline fvec4 operator*(float v1, const fvec4& v2) {
+static inline fvec4 operator*(float v1, fvec4 v2) {
    return fvec4(v1)*v2;
 }
-static inline fvec4 operator/(float v1, const fvec4& v2) {
+static inline fvec4 operator/(float v1, fvec4 v2) {
    return fvec4(v1)/v2;
 }
 // Operations for blending fvec4s based on an ivec4.
-static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const __m128i& mask) {
+static inline fvec4 blend(fvec4 v1, fvec4 v2, __m128i mask) {
    return (__m128) ((mask&(__m128i)v2) + ((ivec4(0xFFFFFFFF)-ivec4(mask))&(__m128i)v1));
 }
-static inline fvec4 blendZero(const fvec4 v, const ivec4 mask) {
+static inline fvec4 blendZero(fvec4 v, ivec4 mask) {
    return blend(0.0f, v, mask);
 }
+static inline ivec4 blendZero(ivec4 v, ivec4 mask) {
+    return v & mask;
+}
 // These are at the end since they involve other functions defined above.
-static inline fvec4 min(const fvec4& v1, const fvec4& v2) {
+static inline fvec4 min(fvec4 v1, fvec4 v2) {
    return blend(v1, v2, v1 > v2);
 }
-static inline fvec4 max(const fvec4& v1, const fvec4& v2) {
+static inline fvec4 max(fvec4 v1, fvec4 v2) {
    return blend(v1, v2, v1 < v2);
 }
-static inline fvec4 round(const fvec4& v) {
+static inline fvec4 round(fvec4 v) {
    fvec4 shift(0x1.0p23f);
    fvec4 absResult = (abs(v)+shift)-shift;
    return (__m128) ((ivec4(0x80000000)&(__m128i)v) + (ivec4(0x7FFFFFFF)&(__m128i)absResult));
 }
-static inline fvec4 floor(const fvec4& v) {
+static inline fvec4 floor(fvec4 v) {
    fvec4 truncated = __builtin_convertvector(__builtin_convertvector(v.val, __m128i), __m128);
    return truncated + blend(0.0f, -1.0f, truncated>v);
 }
-static inline fvec4 ceil(const fvec4& v) {
+static inline fvec4 ceil(fvec4 v) {
    fvec4 truncated = __builtin_convertvector(__builtin_convertvector(v.val, __m128i), __m128);
    return truncated + blend(0.0f, 1.0f, truncated<v);
 }
-static inline fvec4 rsqrt(const fvec4& v) {
+static inline fvec4 rsqrt(fvec4 v) {
    // Initial estimate of rsqrt().
    ivec4 i = (__m128i) v;
@@ -411,7 +415,7 @@ static inline fvec4 rsqrt(const fvec4& v) {
    return y;
 }
-static inline fvec4 sqrt(const fvec4& v) {
+static inline fvec4 sqrt(fvec4 v) {
    return rsqrt(v)*v;
 }
@@ -420,7 +424,7 @@ static inline fvec4 sqrt(const fvec4& v) {
 * of vectors. The first result vector contains the values at the given indexes, and the second
 * result vector contains the values from each respective index+1.
 */
-static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& out0, fvec4& out1) {
+static inline void gatherVecPair(const float* table, ivec4 index, fvec4& out0, fvec4& out1) {
    fvec4 t0(table + index[0]);
    fvec4 t1(table + index[1]);
    fvec4 t2(table + index[2]);
@@ -443,7 +447,7 @@ static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& o
 *   output[2] = (Z0 + Z1 + Z2 + Z3)
 *   output[3] = undefined
 */
-static inline fvec4 reduceToVec3(const fvec4 x, const fvec4 y, const fvec4 z) {
+static inline fvec4 reduceToVec3(fvec4 x, fvec4 y, fvec4 z) {
    const auto nx = reduceAdd(x);
    const auto ny = reduceAdd(y);
    const auto nz = reduceAdd(z);

--- a/openmmapi/include/openmm/internal/vectorize_ppc.h
+++ b/openmmapi/include/openmm/internal/vectorize_ppc.h
@@ -97,45 +97,45 @@ public:
        v[2] = val[2];
    }
-    fvec4 operator+(const fvec4& other) const {
+    fvec4 operator+(fvec4 other) const {
        return vec_add(val, other.val);
    }
-    fvec4 operator-(const fvec4& other) const {
+    fvec4 operator-(fvec4 other) const {
        return vec_sub(val, other.val);
    }
-    fvec4 operator*(const fvec4& other) const {
+    fvec4 operator*(fvec4 other) const {
        return vec_mul(val, other.val);
    }
-    fvec4 operator/(const fvec4& other) const {
+    fvec4 operator/(fvec4 other) const {
        return vec_div(val, other.val);
    }
-    void operator+=(const fvec4& other) {
+    void operator+=(fvec4 other) {
        val = vec_add(val, other.val);
    }
-    void operator-=(const fvec4& other) {
+    void operator-=(fvec4 other) {
        val = vec_sub(val, other.val);
    }
-    void operator*=(const fvec4& other) {
+    void operator*=(fvec4 other) {
        val = vec_mul(val, other.val);
    }
-    void operator/=(const fvec4& other) {
+    void operator/=(fvec4 other) {
        val = vec_div(val, other.val);
    }
    fvec4 operator-() const {
        return -val;
    }
-    fvec4 operator&(const fvec4& other) const {
+    fvec4 operator&(fvec4 other) const {
        return vec_and(val, other.val);
    }
-    fvec4 operator|(const fvec4& other) const {
+    fvec4 operator|(fvec4 other) const {
        return vec_or(val, other.val);
    }
-    ivec4 operator==(const fvec4& other) const;
+    ivec4 operator==(fvec4 other) const;
-    ivec4 operator!=(const fvec4& other) const;
+    ivec4 operator!=(fvec4 other) const;
-    ivec4 operator>(const fvec4& other) const;
+    ivec4 operator>(fvec4 other) const;
-    ivec4 operator<(const fvec4& other) const;
+    ivec4 operator<(fvec4 other) const;
-    ivec4 operator>=(const fvec4& other) const;
+    ivec4 operator>=(fvec4 other) const;
-    ivec4 operator<=(const fvec4& other) const;
+    ivec4 operator<=(fvec4 other) const;
    operator ivec4() const;
    /***
@@ -173,49 +173,49 @@ public:
    void store(int* v) const {
        *((__m128i*) v) = val;
    }
-    ivec4 operator+(const ivec4& other) const {
+    ivec4 operator+(ivec4 other) const {
        return vec_add(val, other.val);
    }
-    ivec4 operator-(const ivec4& other) const {
+    ivec4 operator-(ivec4 other) const {
        return vec_sub(val, other.val);
    }
-    ivec4 operator*(const ivec4& other) const {
+    ivec4 operator*(ivec4 other) const {
        return val*other.val;
    }
-    void operator+=(const ivec4& other) {
+    void operator+=(ivec4 other) {
        val = vec_add(val, other.val);
    }
-    void operator-=(const ivec4& other) {
+    void operator-=(ivec4 other) {
        val = vec_sub(val, other.val);
    }
-    void operator*=(const ivec4& other) {
+    void operator*=(ivec4 other) {
        val = val*other.val;
    }
    ivec4 operator-() const {
        return -val;
    }
-    ivec4 operator&(const ivec4& other) const {
+    ivec4 operator&(ivec4 other) const {
        return val&other.val;
    }
-    ivec4 operator|(const ivec4& other) const {
+    ivec4 operator|(ivec4 other) const {
        return val|other.val;
    }
-    ivec4 operator==(const ivec4& other) const {
+    ivec4 operator==(ivec4 other) const {
        return (val==other.val);
    }
-    ivec4 operator!=(const ivec4& other) const {
+    ivec4 operator!=(ivec4 other) const {
        return (val!=other.val);
    }
-    ivec4 operator>(const ivec4& other) const {
+    ivec4 operator>(ivec4 other) const {
        return (val>other.val);
    }
-    ivec4 operator<(const ivec4& other) const {
+    ivec4 operator<(ivec4 other) const {
        return (val<other.val);
    }
-    ivec4 operator>=(const ivec4& other) const {
+    ivec4 operator>=(ivec4 other) const {
        return (val>=other.val);
    }
-    ivec4 operator<=(const ivec4& other) const {
+    ivec4 operator<=(ivec4 other) const {
        return (val<=other.val);
    }
    operator fvec4() const;
@@ -223,27 +223,27 @@ public:
 // Conversion operators.
-inline ivec4 fvec4::operator==(const fvec4& other) const {
+inline ivec4 fvec4::operator==(fvec4 other) const {
    return  (val==other.val);
 }
-inline ivec4 fvec4::operator!=(const fvec4& other) const {
+inline ivec4 fvec4::operator!=(fvec4 other) const {
    return  (val!=other.val);
 }
-inline ivec4 fvec4::operator>(const fvec4& other) const {
+inline ivec4 fvec4::operator>(fvec4 other) const {
    return  (val>other.val);
 }
-inline ivec4 fvec4::operator<(const fvec4& other) const {
+inline ivec4 fvec4::operator<(fvec4 other) const {
    return  (val<other.val);
 }
-inline ivec4 fvec4::operator>=(const fvec4& other) const {
+inline ivec4 fvec4::operator>=(fvec4 other) const {
    return  (val>=other.val);
 }
-inline ivec4 fvec4::operator<=(const fvec4& other) const {
+inline ivec4 fvec4::operator<=(fvec4 other) const {
    return  (val<=other.val);
 }
@@ -264,34 +264,34 @@ inline ivec4 fvec4::expandBitsToMask(int bitmask) {
 // Functions that operate on fvec4s.
-static inline fvec4 abs(const fvec4& v) {
+static inline fvec4 abs(fvec4 v) {
    return vec_abs(v.val);
 }
-static inline fvec4 exp(const fvec4& v) {
+static inline fvec4 exp(fvec4 v) {
    return fvec4(expf(v[0]), expf(v[1]), expf(v[2]), expf(v[3]));
 }
-static inline fvec4 log(const fvec4& v) {
+static inline fvec4 log(fvec4 v) {
    return fvec4(logf(v[0]), logf(v[1]), logf(v[2]), logf(v[3]));
 }
-static inline float dot3(const fvec4& v1, const fvec4& v2) {
+static inline float dot3(fvec4 v1, fvec4 v2) {
    fvec4 r = v1*v2;
    return r[0]+r[1]+r[2];
 }
-static inline float dot4(const fvec4& v1, const fvec4& v2) {
+static inline float dot4(fvec4 v1, fvec4 v2) {
    fvec4 r = v1*v2;
    fvec4 temp = r + vec_sld(r.val, r.val, 8);
    return temp[0]+temp[1];
 }
-static inline float reduceAdd(const fvec4 v) {
+static inline float reduceAdd(fvec4 v) {
    return dot4(v, fvec4(1.0f));
 }
-static inline fvec4 cross(const fvec4& v1, const fvec4& v2) {
+static inline fvec4 cross(fvec4 v1, fvec4 v2) {
    vector unsigned char perm = (vector unsigned char) {8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15};
    __m128 temp = v2.val*vec_perm(v1.val, v1.val, perm) -
                  v1.val*vec_perm(v2.val, v2.val, perm);
@@ -324,80 +324,84 @@ static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2,
 /**
 * Out-of-place transpose from named variables into an array.
 */
-static inline void transpose(const fvec4 v0, const fvec4 v1, const fvec4 v2, const fvec4 v3, fvec4 out[4]) {
+static inline void transpose(fvec4 v0, fvec4 v1, fvec4 v2, fvec4 v3, fvec4 out[4]) {
    out[0] = v0; out[1] = v1; out[2] = v2; out[3] = v3;
    transpose(out[0], out[1], out[2], out[3]);
 }
 // Functions that operate on ivec4s.
-static inline ivec4 min(const ivec4& v1, const ivec4& v2) {
+static inline ivec4 min(ivec4 v1, ivec4 v2) {
    return vec_min(v1.val, v2.val);
 }
-static inline ivec4 max(const ivec4& v1, const ivec4& v2) {
+static inline ivec4 max(ivec4 v1, ivec4 v2) {
    return vec_max(v1.val, v2.val);
 }
-static inline ivec4 abs(const ivec4& v) {
+static inline ivec4 abs(ivec4 v) {
    return vec_abs(v.val);
 }
-static inline bool any(const ivec4 v) {
+static inline bool any(ivec4 v) {
    return !vec_all_eq(v.val, ivec4(0).val);
 }
 // Mathematical operators involving a scalar and a vector.
-static inline fvec4 operator+(float v1, const fvec4& v2) {
+static inline fvec4 operator+(float v1, fvec4 v2) {
    return fvec4(v1)+v2;
 }
-static inline fvec4 operator-(float v1, const fvec4& v2) {
+static inline fvec4 operator-(float v1, fvec4 v2) {
    return fvec4(v1)-v2;
 }
-static inline fvec4 operator*(float v1, const fvec4& v2) {
+static inline fvec4 operator*(float v1, fvec4 v2) {
    return fvec4(v1)*v2;
 }
-static inline fvec4 operator/(float v1, const fvec4& v2) {
+static inline fvec4 operator/(float v1, fvec4 v2) {
    return fvec4(v1)/v2;
 }
 // Operations for blending fvec4s based on an ivec4.
-static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const __m128i& mask) {
+static inline fvec4 blend(fvec4 v1, fvec4 v2, __m128i mask) {
    return (__m128) ((mask&(__m128i)v2.val) + ((ivec4(0xFFFFFFFF)-ivec4(mask))&(__m128i)v1.val).val);
 }
-static inline fvec4 blendZero(const fvec4 v, const ivec4 mask) {
+static inline fvec4 blendZero(fvec4 v, ivec4 mask) {
    return blend(0.0f, v, mask);
 }
+static inline ivec4 blendZero(ivec4 v, ivec4 mask) {
+    return v & mask;
+}
 // These are at the end since they involve other functions defined above.
-static inline fvec4 min(const fvec4& v1, const fvec4& v2) {
+static inline fvec4 min(fvec4 v1, fvec4 v2) {
    return vec_min(v1.val, v2.val);
 }
-static inline fvec4 max(const fvec4& v1, const fvec4& v2) {
+static inline fvec4 max(fvec4 v1, fvec4 v2) {
    return vec_max(v1.val, v2.val);
 }
-static inline fvec4 round(const fvec4& v) {
+static inline fvec4 round(fvec4 v) {
    return vec_round(v.val);
 }
-static inline fvec4 floor(const fvec4& v) {
+static inline fvec4 floor(fvec4 v) {
    return vec_floor(v.val);
 }
-static inline fvec4 ceil(const fvec4& v) {
+static inline fvec4 ceil(fvec4 v) {
    return vec_ceil(v.val);
 }
-static inline fvec4 rsqrt(const fvec4& v) {
+static inline fvec4 rsqrt(fvec4 v) {
    // Initial estimate of rsqrt().
    fvec4 y(vec_rsqrte(v.val));
@@ -409,7 +413,7 @@ static inline fvec4 rsqrt(const fvec4& v) {
    return y;
 }
-static inline fvec4 sqrt(const fvec4& v) {
+static inline fvec4 sqrt(fvec4 v) {
    return vec_sqrt(v.val);
 }
@@ -417,7 +421,7 @@ static inline fvec4 sqrt(const fvec4& v) {
 * of vectors. The first result vector contains the values at the given indexes, and the second
 * result vector contains the values from each respective index+1.
 */
-static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& out0, fvec4& out1) {
+static inline void gatherVecPair(const float* table, ivec4 index, fvec4& out0, fvec4& out1) {
    fvec4 t0(table + index[0]);
    fvec4 t1(table + index[1]);
    fvec4 t2(table + index[2]);
@@ -440,7 +444,7 @@ static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& o
 *   output[2] = (Z0 + Z1 + Z2 + Z3)
 *   output[3] = undefined
 */
-static inline fvec4 reduceToVec3(const fvec4 x, const fvec4 y, const fvec4 z) {
+static inline fvec4 reduceToVec3(fvec4 x, fvec4 y, fvec4 z) {
    const auto nx = reduceAdd(x);
    const auto ny = reduceAdd(y);
    const auto nz = reduceAdd(z);

--- a/openmmapi/include/openmm/internal/vectorize_sse.h
+++ b/openmmapi/include/openmm/internal/vectorize_sse.h
@@ -80,7 +80,7 @@ public:
     * @param table The table from which to do a lookup.
     * @param indexes The indexes to gather.
     */
-    fvec4(const float* table, const int idx[4])
+    fvec4(const float* table, const int32_t idx[4])
        : fvec4(table[idx[0]], table[idx[1]], table[idx[2]], table[idx[3]]) { }
    operator __m128() const {
@@ -108,55 +108,55 @@ public:
 #endif
    }
-    fvec4 operator+(const fvec4& other) const {
+    fvec4 operator+(fvec4 other) const {
        return _mm_add_ps(val, other);
    }
-    fvec4 operator-(const fvec4& other) const {
+    fvec4 operator-(fvec4 other) const {
        return _mm_sub_ps(val, other);
    }
-    fvec4 operator*(const fvec4& other) const {
+    fvec4 operator*(fvec4 other) const {
        return _mm_mul_ps(val, other);
    }
-    fvec4 operator/(const fvec4& other) const {
+    fvec4 operator/(fvec4 other) const {
        return _mm_div_ps(val, other);
    }
-    void operator+=(const fvec4& other) {
+    void operator+=(fvec4 other) {
        val = _mm_add_ps(val, other);
    }
-    void operator-=(const fvec4& other) {
+    void operator-=(fvec4 other) {
        val = _mm_sub_ps(val, other);
    }
-    void operator*=(const fvec4& other) {
+    void operator*=(fvec4 other) {
        val = _mm_mul_ps(val, other);
    }
-    void operator/=(const fvec4& other) {
+    void operator/=(fvec4 other) {
        val = _mm_div_ps(val, other);
    }
    fvec4 operator-() const {
        return _mm_sub_ps(_mm_set1_ps(0.0f), val);
    }
-    fvec4 operator&(const fvec4& other) const {
+    fvec4 operator&(fvec4 other) const {
        return _mm_and_ps(val, other);
    }
-    fvec4 operator|(const fvec4& other) const {
+    fvec4 operator|(fvec4 other) const {
        return _mm_or_ps(val, other);
    }
-    fvec4 operator==(const fvec4& other) const {
+    fvec4 operator==(fvec4 other) const {
        return _mm_cmpeq_ps(val, other);
    }
-    fvec4 operator!=(const fvec4& other) const {
+    fvec4 operator!=(fvec4 other) const {
        return _mm_cmpneq_ps(val, other);
    }
-    fvec4 operator>(const fvec4& other) const {
+    fvec4 operator>(fvec4 other) const {
        return _mm_cmpgt_ps(val, other);
    }
-    fvec4 operator<(const fvec4& other) const {
+    fvec4 operator<(fvec4 other) const {
        return _mm_cmplt_ps(val, other);
    }
-    fvec4 operator>=(const fvec4& other) const {
+    fvec4 operator>=(fvec4 other) const {
        return _mm_cmpge_ps(val, other);
    }
-    fvec4 operator<=(const fvec4& other) const {
+    fvec4 operator<=(fvec4 other) const {
        return _mm_cmple_ps(val, other);
    }
    operator ivec4() const;
@@ -191,49 +191,49 @@ public:
    void store(int* v) const {
        _mm_storeu_si128((__m128i*) v, val);
    }
-    ivec4 operator+(const ivec4& other) const {
+    ivec4 operator+(ivec4 other) const {
        return _mm_add_epi32(val, other);
    }
-    ivec4 operator-(const ivec4& other) const {
+    ivec4 operator-(ivec4 other) const {
        return _mm_sub_epi32(val, other);
    }
-    ivec4 operator*(const ivec4& other) const {
+    ivec4 operator*(ivec4 other) const {
        return _mm_mullo_epi32(val, other);
    }
-    void operator+=(const ivec4& other) {
+    void operator+=(ivec4 other) {
        val = _mm_add_epi32(val, other);
    }
-    void operator-=(const ivec4& other) {
+    void operator-=(ivec4 other) {
        val = _mm_sub_epi32(val, other);
    }
-    void operator*=(const ivec4& other) {
+    void operator*=(ivec4 other) {
        val = _mm_mullo_epi32(val, other);
    }
    ivec4 operator-() const {
        return _mm_sub_epi32(_mm_set1_epi32(0), val);
    }
-    ivec4 operator&(const ivec4& other) const {
+    ivec4 operator&(ivec4 other) const {
        return _mm_and_si128(val, other);
    }
-    ivec4 operator|(const ivec4& other) const {
+    ivec4 operator|(ivec4 other) const {
        return _mm_or_si128(val, other);
    }
-    ivec4 operator==(const ivec4& other) const {
+    ivec4 operator==(ivec4 other) const {
        return _mm_cmpeq_epi32(val, other);
    }
-    ivec4 operator!=(const ivec4& other) const {
+    ivec4 operator!=(ivec4 other) const {
        return _mm_xor_si128(*this==other, _mm_set1_epi32(0xFFFFFFFF));
    }
-    ivec4 operator>(const ivec4& other) const {
+    ivec4 operator>(ivec4 other) const {
        return _mm_cmpgt_epi32(val, other);
    }
-    ivec4 operator<(const ivec4& other) const {
+    ivec4 operator<(ivec4 other) const {
        return _mm_cmplt_epi32(val, other);
    }
-    ivec4 operator>=(const ivec4& other) const {
+    ivec4 operator>=(ivec4 other) const {
        return _mm_xor_si128(_mm_cmplt_epi32(val, other), _mm_set1_epi32(0xFFFFFFFF));
    }
-    ivec4 operator<=(const ivec4& other) const {
+    ivec4 operator<=(ivec4 other) const {
        return _mm_xor_si128(_mm_cmpgt_epi32(val, other), _mm_set1_epi32(0xFFFFFFFF));
    }
    operator fvec4() const;
@@ -258,36 +258,36 @@ inline fvec4 fvec4::expandBitsToMask(int bitmask) {
 // Functions that operate on fvec4s.
-static inline fvec4 floor(const fvec4& v) {
+static inline fvec4 floor(fvec4 v) {
    return fvec4(_mm_floor_ps(v.val));
 }
-static inline fvec4 ceil(const fvec4& v) {
+static inline fvec4 ceil(fvec4 v) {
    return fvec4(_mm_ceil_ps(v.val));
 }
-static inline fvec4 round(const fvec4& v) {
+static inline fvec4 round(fvec4 v) {
    return fvec4(_mm_round_ps(v.val, _MM_FROUND_TO_NEAREST_INT));
 }
-static inline fvec4 min(const fvec4& v1, const fvec4& v2) {
+static inline fvec4 min(fvec4 v1, fvec4 v2) {
    return fvec4(_mm_min_ps(v1.val, v2.val));
 }
-static inline fvec4 max(const fvec4& v1, const fvec4& v2) {
+static inline fvec4 max(fvec4 v1, fvec4 v2) {
    return fvec4(_mm_max_ps(v1.val, v2.val));
 }
-static inline fvec4 abs(const fvec4& v) {
+static inline fvec4 abs(fvec4 v) {
    static const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
    return fvec4(_mm_and_ps(v.val, mask));
 }
-static inline fvec4 sqrt(const fvec4& v) {
+static inline fvec4 sqrt(fvec4 v) {
    return fvec4(_mm_sqrt_ps(v.val));
 }
-static inline fvec4 rsqrt(const fvec4& v) {
+static inline fvec4 rsqrt(fvec4 v) {
    // Initial estimate of rsqrt().
    fvec4 y(_mm_rsqrt_ps(v.val));
@@ -299,27 +299,27 @@ static inline fvec4 rsqrt(const fvec4& v) {
    return y;
 }
-static inline fvec4 exp(const fvec4& v) {
+static inline fvec4 exp(fvec4 v) {
    return fvec4(exp_ps(v.val));
 }
-static inline fvec4 log(const fvec4& v) {
+static inline fvec4 log(fvec4 v) {
    return fvec4(log_ps(v.val));
 }
-static inline float dot3(const fvec4& v1, const fvec4& v2) {
+static inline float dot3(fvec4 v1, fvec4 v2) {
    return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0x71));
 }
-static inline float dot4(const fvec4& v1, const fvec4& v2) {
+static inline float dot4(fvec4 v1, fvec4 v2) {
    return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0xF1));
 }
-static inline float reduceAdd(const fvec4 v) {
+static inline float reduceAdd(fvec4 v) {
    return dot4(v, fvec4(1.0f));
 }
-static inline fvec4 cross(const fvec4& v1, const fvec4& v2) {
+static inline fvec4 cross(fvec4 v1, fvec4 v2) {
    fvec4 temp = fvec4(_mm_mul_ps(v1, _mm_shuffle_ps(v2, v2, _MM_SHUFFLE(3, 0, 2, 1)))) -
                 fvec4(_mm_mul_ps(v2, _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(3, 0, 2, 1))));
    return _mm_shuffle_ps(temp, temp, _MM_SHUFFLE(3, 0, 2, 1));
@@ -340,53 +340,53 @@ static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2,
 /**
 * Out-of-place transpose from named variables into an array.
 */
-static inline void transpose(const fvec4 v0, const fvec4 v1, const fvec4 v2, const fvec4 v3, fvec4 out[4]) {
+static inline void transpose(fvec4 v0, fvec4 v1, fvec4 v2, fvec4 v3, fvec4 out[4]) {
    out[0] = v0; out[1] = v1; out[2] = v2; out[3] = v3;
    transpose(out[0], out[1], out[2], out[3]);
 }
 // Functions that operate on ivec4s.
-static inline ivec4 min(const ivec4& v1, const ivec4& v2) {
+static inline ivec4 min(ivec4 v1, ivec4 v2) {
    return ivec4(_mm_min_epi32(v1.val, v2.val));
 }
-static inline ivec4 max(const ivec4& v1, const ivec4& v2) {
+static inline ivec4 max(ivec4 v1, ivec4 v2) {
    return ivec4(_mm_max_epi32(v1.val, v2.val));
 }
-static inline ivec4 abs(const ivec4& v) {
+static inline ivec4 abs(ivec4 v) {
    return ivec4(_mm_abs_epi32(v.val));
 }
-static inline bool any(const ivec4& v) {
+static inline bool any(ivec4 v) {
    return !_mm_test_all_zeros(v, _mm_set1_epi32(0xFFFFFFFF));
 }
 // Mathematical operators involving a scalar and a vector.
-static inline fvec4 operator+(float v1, const fvec4& v2) {
+static inline fvec4 operator+(float v1, fvec4 v2) {
    return fvec4(v1)+v2;
 }
-static inline fvec4 operator-(float v1, const fvec4& v2) {
+static inline fvec4 operator-(float v1, fvec4 v2) {
    return fvec4(v1)-v2;
 }
-static inline fvec4 operator*(float v1, const fvec4& v2) {
+static inline fvec4 operator*(float v1, fvec4 v2) {
    return fvec4(v1)*v2;
 }
-static inline fvec4 operator/(float v1, const fvec4& v2) {
+static inline fvec4 operator/(float v1, fvec4 v2) {
    return fvec4(v1)/v2;
 }
 // Operations for blending fvec4
-static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const fvec4& mask) {
+static inline fvec4 blend(fvec4 v1, fvec4 v2, fvec4 mask) {
    return fvec4(_mm_blendv_ps(v1.val, v2.val, mask.val));
 }
-static inline fvec4 blendZero(const fvec4 v, const fvec4 mask) {
+static inline fvec4 blendZero(fvec4 v, fvec4 mask) {
    return blend(0.0f, v, mask);
 }
@@ -394,7 +394,7 @@ static inline fvec4 blendZero(const fvec4 v, const fvec4 mask) {
 * of vectors. The first result vector contains the values at the given indexes, and the second
 * result vector contains the values from each respective index+1.
 */
-static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& out0, fvec4& out1) {
+static inline void gatherVecPair(const float* table, ivec4 index, fvec4& out0, fvec4& out1) {
    fvec4 t0(table + index[0]);
    fvec4 t1(table + index[1]);
    fvec4 t2(table + index[2]);
@@ -417,7 +417,7 @@ static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& o
 *   output[2] = (Z0 + Z1 + Z2 + Z3)
 *   output[3] = undefined
 */
-static inline fvec4 reduceToVec3(const fvec4 x, const fvec4 y, const fvec4 z) {
+static inline fvec4 reduceToVec3(fvec4 x, fvec4 y, fvec4 z) {
    // :TODO: Could be made more efficient.
    const auto nx = reduceAdd(x);
    const auto ny = reduceAdd(y);

--- a/openmmapi/src/BrownianIntegrator.cpp
+++ b/openmmapi/src/BrownianIntegrator.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2019 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2020 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -80,7 +80,7 @@ void BrownianIntegrator::step(int steps) {
        throw OpenMMException("This Integrator is not bound to a context!");  
    for (int i = 0; i < steps; ++i) {
        context->updateContextState();
-        context->calcForcesAndEnergy(true, false);
+        context->calcForcesAndEnergy(true, false, getIntegrationForceGroups());
        kernel.getAs<IntegrateBrownianStepKernel>().execute(*context, *this);
    }
 }
--- a/openmmapi/src/Integrator.cpp
+++ b/openmmapi/src/Integrator.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2013-2015 Stanford University and the Authors.      *
+ * Portions copyright (c) 2013-2020 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -39,7 +39,7 @@
 using namespace OpenMM;
-Integrator::Integrator() : owner(NULL), context(NULL) {
+Integrator::Integrator() : owner(NULL), context(NULL), forceGroups(0xFFFFFFFF) {
 }
 Integrator::~Integrator() {
@@ -69,6 +69,14 @@ void Integrator::setConstraintTolerance(double tol) {
    constraintTol = tol;
 }
+int Integrator::getIntegrationForceGroups() const {
+    return forceGroups;
+}
+void Integrator::setIntegrationForceGroups(int groups) {
+    forceGroups = groups;
+}
 std::vector<Vec3> Integrator::getVelocitiesForTemperature(const System &system, double temperature, int randomSeed) const {
    // Generate the list of Gaussian random numbers.
    OpenMM_SFMT::SFMT sfmt;

--- a/openmmapi/src/LangevinIntegrator.cpp
+++ b/openmmapi/src/LangevinIntegrator.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2020 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -76,7 +76,7 @@ void LangevinIntegrator::step(int steps) {
        throw OpenMMException("This Integrator is not bound to a context!");  
    for (int i = 0; i < steps; ++i) {
        context->updateContextState();
-        context->calcForcesAndEnergy(true, false);
+        context->calcForcesAndEnergy(true, false, getIntegrationForceGroups());
        kernel.getAs<IntegrateLangevinStepKernel>().execute(*context, *this);
    }
 }