Merge branch 'master' of https://github.com/SimTk/openmm

caefd490 · Jason Swails · 508f989d · 0b35240d · caefd490 · caefd490
Commit caefd490 authored Dec 21, 2013 by Jason Swails
20 changed files
--- a/examples/benchmark.py
+++ b/examples/benchmark.py
@@ -88,7 +88,7 @@ def runOneTest(testName, options):
        if platform.getName() == 'CUDA':
            properties['CudaPrecision'] = options.precision
        elif platform.getName() == 'OpenCL':
-            properties['OpenCLPrecision'] = options.device
+            properties['OpenCLPrecision'] = options.precision
    
    # Run the simulation.
    

--- a/openmmapi/include/openmm/internal/vectorize8.h
+++ b/openmmapi/include/openmm/internal/vectorize8.h
+#ifndef OPENMM_VECTORIZE8_H_
+#define OPENMM_VECTORIZE8_H_
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "vectorize.h"
+#include <immintrin.h>
+
+// This file defines classes and functions to simplify vectorizing code with AVX.
+
+class ivec8;
+
+/**
+ * An eight element vector of floats.
+ */
+class fvec8 {
+public:
+    __m256 val;
+    
+    fvec8() {}
+    fvec8(float v) : val(_mm256_set1_ps(v)) {}
+    fvec8(float v1, float v2, float v3, float v4, float v5, float v6, float v7, float v8) : val(_mm256_set_ps(v8, v7, v6, v5, v4, v3, v2, v1)) {}
+    fvec8(__m256 v) : val(v) {}
+    fvec8(const float* v) : val(_mm256_loadu_ps(v)) {}
+    operator __m256() const {
+        return val;
+    }
+    fvec4 lowerVec() const {
+        return _mm256_castps256_ps128(val);
+    }
+    fvec4 upperVec() const {
+        return _mm256_extractf128_ps(val, 1);
+    }
+    void store(float* v) const {
+        _mm256_storeu_ps(v, val);
+    }
+    fvec8 operator+(fvec8 other) const {
+        return _mm256_add_ps(val, other);
+    }
+    fvec8 operator-(fvec8 other) const {
+        return _mm256_sub_ps(val, other);
+    }
+    fvec8 operator*(fvec8 other) const {
+        return _mm256_mul_ps(val, other);
+    }
+    fvec8 operator/(fvec8 other) const {
+        return _mm256_div_ps(val, other);
+    }
+    void operator+=(fvec8 other) {
+        val = _mm256_add_ps(val, other);
+    }
+    void operator-=(fvec8 other) {
+        val = _mm256_sub_ps(val, other);
+    }
+    void operator*=(fvec8 other) {
+        val = _mm256_mul_ps(val, other);
+    }
+    void operator/=(fvec8 other) {
+        val = _mm256_div_ps(val, other);
+    }
+    fvec8 operator-() const {
+        return _mm256_sub_ps(_mm256_set1_ps(0.0f), val);
+    }
+    fvec8 operator&(fvec8 other) const {
+        return _mm256_and_ps(val, other);
+    }
+    fvec8 operator|(fvec8 other) const {
+        return _mm256_or_ps(val, other);
+    }
+    fvec8 operator==(fvec8 other) const {
+        return _mm256_cmp_ps(val, other, _CMP_EQ_OQ);
+    }
+    fvec8 operator!=(fvec8 other) const {
+        return _mm256_cmp_ps(val, other, _CMP_NEQ_OQ);
+    }
+    fvec8 operator>(fvec8 other) const {
+        return _mm256_cmp_ps(val, other, _CMP_GT_OQ);
+    }
+    fvec8 operator<(fvec8 other) const {
+        return _mm256_cmp_ps(val, other, _CMP_LT_OQ);
+    }
+    fvec8 operator>=(fvec8 other) const {
+        return _mm256_cmp_ps(val, other, _CMP_GE_OQ);
+    }
+    fvec8 operator<=(fvec8 other) const {
+        return _mm256_cmp_ps(val, other, _CMP_LE_OQ);
+    }
+    operator ivec8() const;
+};
+
+/**
+ * An eight element vector of ints.
+ */
+class ivec8 {
+public:
+    __m256i val;
+    
+    ivec8() {}
+    ivec8(int v) : val(_mm256_set1_epi32(v)) {}
+    ivec8(int v1, int v2, int v3, int v4, int v5, int v6, int v7, int v8) : val(_mm256_set_epi32(v8, v7, v6, v5, v4, v3, v2, v1)) {}
+    ivec8(__m256i v) : val(v) {}
+    ivec8(const int* v) : val(_mm256_loadu_si256((const __m256i*) v)) {}
+    operator __m256i() const {
+        return val;
+    }
+    ivec4 lowerVec() const {
+        return _mm256_castsi256_si128(val);
+    }
+    ivec4 upperVec() const {
+        return _mm256_extractf128_si256(val, 1);
+    }
+    void store(int* v) const {
+        _mm256_storeu_si256((__m256i*) v, val);
+    }
+    ivec8 operator&(ivec8 other) const {
+        return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(val), _mm256_castsi256_ps(other.val)));
+    }
+    ivec8 operator|(ivec8 other) const {
+        return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(val), _mm256_castsi256_ps(other.val)));
+    }
+    operator fvec8() const;
+};
+
+// Conversion operators.
+
+inline fvec8::operator ivec8() const {
+    return _mm256_cvttps_epi32(val);
+}
+
+inline ivec8::operator fvec8() const {
+    return _mm256_cvtepi32_ps(val);
+}
+
+// Functions that operate on fvec8s.
+
+static inline fvec8 floor(fvec8 v) {
+    return fvec8(_mm256_floor_ps(v.val));
+}
+
+static inline fvec8 ceil(fvec8 v) {
+    return fvec8(_mm256_ceil_ps(v.val));
+}
+
+static inline fvec8 round(fvec8 v) {
+    return fvec8(_mm256_round_ps(v.val, _MM_FROUND_TO_NEAREST_INT));
+}
+
+static inline fvec8 min(fvec8 v1, fvec8 v2) {
+    return fvec8(_mm256_min_ps(v1.val, v2.val));
+}
+
+static inline fvec8 max(fvec8 v1, fvec8 v2) {
+    return fvec8(_mm256_max_ps(v1.val, v2.val));
+}
+
+static inline fvec8 abs(fvec8 v) {
+    static const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
+    return fvec8(_mm256_and_ps(v.val, mask));
+}
+
+static inline fvec8 sqrt(fvec8 v) {
+    return fvec8(_mm256_sqrt_ps(v.val));
+}
+
+static inline float dot8(fvec8 v1, fvec8 v2) {
+    fvec8 result = _mm256_dp_ps(v1, v2, 0xF1);
+    return _mm_cvtss_f32(result.lowerVec())+_mm_cvtss_f32(result.upperVec());
+}
+
+static inline void transpose(fvec4 in1, fvec4 in2, fvec4 in3, fvec4 in4, fvec4 in5, fvec4 in6, fvec4 in7, fvec4 in8, fvec8& out1, fvec8& out2, fvec8& out3, fvec8& out4) {
+    _MM_TRANSPOSE4_PS(in1, in2, in3, in4);
+    _MM_TRANSPOSE4_PS(in5, in6, in7, in8);
+    out1 = _mm256_castps128_ps256(in1);
+    out1 = _mm256_insertf128_ps(out1, in5, 1);
+    out2 = _mm256_castps128_ps256(in2);
+    out2 = _mm256_insertf128_ps(out2, in6, 1);
+    out3 = _mm256_castps128_ps256(in3);
+    out3 = _mm256_insertf128_ps(out3, in7, 1);
+    out4 = _mm256_castps128_ps256(in4);
+    out4 = _mm256_insertf128_ps(out4, in8, 1);
+}
+
+static inline void transpose(fvec8 in1, fvec8 in2, fvec8 in3, fvec8 in4, fvec4& out1, fvec4& out2, fvec4& out3, fvec4& out4, fvec4& out5, fvec4& out6, fvec4& out7, fvec4& out8) {
+    out1 = in1.lowerVec();
+    out2 = in2.lowerVec();
+    out3 = in3.lowerVec();
+    out4 = in4.lowerVec();
+    _MM_TRANSPOSE4_PS(out1, out2, out3, out4);
+    out5 = in1.upperVec();
+    out6 = in2.upperVec();
+    out7 = in3.upperVec();
+    out8 = in4.upperVec();
+    _MM_TRANSPOSE4_PS(out5, out6, out7, out8);
+}
+
+// Functions that operate on ivec8s.
+
+static inline bool any(ivec8 v) {
+    return !_mm256_testz_si256(v, _mm256_set1_epi32(0xFFFFFFFF));
+}
+
+// Mathematical operators involving a scalar and a vector.
+
+static inline fvec8 operator+(float v1, fvec8 v2) {
+    return fvec8(v1)+v2;
+}
+
+static inline fvec8 operator-(float v1, fvec8 v2) {
+    return fvec8(v1)-v2;
+}
+
+static inline fvec8 operator*(float v1, fvec8 v2) {
+    return fvec8(v1)*v2;
+}
+
+static inline fvec8 operator/(float v1, fvec8 v2) {
+    return fvec8(v1)/v2;
+}
+
+// Operations for blending fvec8s based on an ivec8.
+
+static inline fvec8 blend(fvec8 v1, fvec8 v2, ivec8 mask) {
+    return fvec8(_mm256_blendv_ps(v1.val, v2.val, _mm256_castsi256_ps(mask.val)));
+}
+
+#endif /*OPENMM_VECTORIZE8_H_*/
--- a/platforms/cpu/include/CpuKernels.h
+++ b/platforms/cpu/include/CpuKernels.h
@@ -33,6 +33,7 @@
 * -------------------------------------------------------------------------- */

 #include "CpuGBSAOBCForce.h"
+#include "CpuLangevinDynamics.h"
 #include "CpuNeighborList.h"
 #include "CpuNonbondedForce.h"
 #include "CpuPlatform.h"
@@ -48,6 +49,7 @@ namespace OpenMM {
 */
 class CpuCalcForcesAndEnergyKernel : public CalcForcesAndEnergyKernel {
 public:
+    class SumForceTask;
    CpuCalcForcesAndEnergyKernel(std::string name, const Platform& platform, CpuPlatform::PlatformData& data, ContextImpl& context);
    /**
     * Initialize the kernel.
@@ -88,9 +90,7 @@ private:
 */
 class CpuCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
 public:
-    CpuCalcNonbondedForceKernel(std::string name, const Platform& platform, CpuPlatform::PlatformData& data) : CalcNonbondedForceKernel(name, platform),
-            data(data), bonded14IndexArray(NULL), bonded14ParamArray(NULL), hasInitializedPme(false) {
-    }
+    CpuCalcNonbondedForceKernel(std::string name, const Platform& platform, CpuPlatform::PlatformData& data);
    ~CpuCalcNonbondedForceKernel();
    /**
     * Initialize the kernel.
@@ -130,8 +130,8 @@ private:
    std::vector<std::pair<float, float> > particleParams;
    std::vector<RealVec> lastPositions;
    NonbondedMethod nonbondedMethod;
-    CpuNeighborList neighborList;
-    CpuNonbondedForce nonbonded;
+    CpuNeighborList* neighborList;
+    CpuNonbondedForce* nonbonded;
    Kernel optimizedPme;
 };

@@ -173,6 +173,43 @@ private:
    CpuGBSAOBCForce obc;
 };

+/**
+ * This kernel is invoked by LangevinIntegrator to take one time step.
+ */
+class CpuIntegrateLangevinStepKernel : public IntegrateLangevinStepKernel {
+public:
+    CpuIntegrateLangevinStepKernel(std::string name, const Platform& platform, CpuPlatform::PlatformData& data) : IntegrateLangevinStepKernel(name, platform),
+        data(data), dynamics(NULL) {
+    }
+    ~CpuIntegrateLangevinStepKernel();
+    /**
+     * Initialize the kernel, setting up the particle masses.
+     * 
+     * @param system     the System this kernel will be applied to
+     * @param integrator the LangevinIntegrator this kernel will be used for
+     */
+    void initialize(const System& system, const LangevinIntegrator& integrator);
+    /**
+     * Execute the kernel.
+     * 
+     * @param context    the context in which to execute this kernel
+     * @param integrator the LangevinIntegrator this kernel is being used for
+     */
+    void execute(ContextImpl& context, const LangevinIntegrator& integrator);
+    /**
+     * Compute the kinetic energy.
+     * 
+     * @param context    the context in which to execute this kernel
+     * @param integrator the LangevinIntegrator this kernel is being used for
+     */
+    double computeKineticEnergy(ContextImpl& context, const LangevinIntegrator& integrator);
+private:
+    CpuPlatform::PlatformData& data;
+    CpuLangevinDynamics* dynamics;
+    std::vector<RealOpenMM> masses;
+    double prevTemp, prevFriction, prevStepSize;
+};
+
 } // namespace OpenMM

 #endif /*OPENMM_CPUKERNELS_H_*/

--- a/platforms/cpu/include/CpuLangevinDynamics.h
+++ b/platforms/cpu/include/CpuLangevinDynamics.h
+
+/* Portions copyright (c) 2013 Stanford University and Simbios.
+ * Authors: Peter Eastman
+ * Contributors: 
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __CPU_LANGEVIN_DYNAMICS_H__
+#define __CPU_LANGEVIN_DYNAMICS_H__
+
+#include "ReferenceStochasticDynamics.h"
+#include "CpuRandom.h"
+#include "openmm/internal/ThreadPool.h"
+#include "sfmt/SFMT.h"
+
+// ---------------------------------------------------------------------------------------
+
+class CpuLangevinDynamics : public ReferenceStochasticDynamics {
+public:
+    class Update1Task;
+    class Update2Task;
+    /**
+     * Constructor.
+     *
+     * @param numberOfAtoms  number of atoms
+     * @param deltaT         delta t for dynamics
+     * @param tau            viscosity
+     * @param temperature    temperature
+     * @param threads        thread pool for parallelizing computation
+     * @param random         random number generator
+     */
+    CpuLangevinDynamics(int numberOfAtoms, RealOpenMM deltaT, RealOpenMM tau, RealOpenMM temperature, OpenMM::ThreadPool& threads, OpenMM::CpuRandom& random);
+
+    /**
+     * Destructor.
+     */
+    ~CpuLangevinDynamics();
+
+    /**
+     * First update step.
+     * 
+     * @param numberOfAtoms       number of atoms
+     * @param atomCoordinates     atom coordinates
+     * @param velocities          velocities
+     * @param forces              forces
+     * @param inverseMasses       inverse atom masses
+     * @param xPrime              xPrime
+     */
+    void updatePart1(int numberOfAtoms, std::vector<OpenMM::RealVec>& atomCoordinates, std::vector<OpenMM::RealVec>& velocities,
+                     std::vector<OpenMM::RealVec>& forces, std::vector<RealOpenMM>& inverseMasses, std::vector<OpenMM::RealVec>& xPrime);
+      
+    /**
+     * Second update step.
+     * 
+     * @param numberOfAtoms       number of atoms
+     * @param atomCoordinates     atom coordinates
+     * @param velocities          velocities
+     * @param forces              forces
+     * @param inverseMasses       inverse atom masses
+     * @param xPrime              xPrime
+     */
+    void updatePart2(int numberOfAtoms, std::vector<OpenMM::RealVec>& atomCoordinates, std::vector<OpenMM::RealVec>& velocities,
+                     std::vector<OpenMM::RealVec>& forces, std::vector<RealOpenMM>& inverseMasses, std::vector<OpenMM::RealVec>& xPrime);
+
+private:
+    void threadUpdate1(int threadIndex);
+    void threadUpdate2(int threadIndex);
+    OpenMM::ThreadPool& threads;
+    OpenMM::CpuRandom& random;
+    std::vector<OpenMM_SFMT::SFMT> threadRandom;
+    // The following variables are used to make information accessible to the individual threads.
+    int numberOfAtoms;
+    OpenMM::RealVec* atomCoordinates;
+    OpenMM::RealVec* velocities;
+    OpenMM::RealVec* forces;
+    RealOpenMM* inverseMasses;
+    OpenMM::RealVec* xPrime;
+};
+
+// ---------------------------------------------------------------------------------------
+
+#endif // __CPU_LANGEVIN_DYNAMICS_H__
--- a/platforms/cpu/include/CpuNeighborList.h
+++ b/platforms/cpu/include/CpuNeighborList.h
@@ -45,8 +45,7 @@ class OPENMM_EXPORT_CPU CpuNeighborList {
 public:
    class ThreadTask;
    class Voxels;
-    static const int BlockSize;
-    CpuNeighborList();
+    CpuNeighborList(int blockSize);
    void computeNeighborList(int numAtoms, const AlignedArray<float>& atomLocations, const std::vector<std::set<int> >& exclusions,
            const float* periodicBoxSize, bool usePeriodic, float maxDistance, ThreadPool& threads);
    int getNumBlocks() const;
@@ -59,6 +58,7 @@ public:
    void threadComputeNeighborList(ThreadPool& threads, int threadIndex);
    void runThread(int index);
 private:
+    int blockSize;
    std::vector<int> sortedAtoms;
    std::vector<std::vector<int> > blockNeighbors;
    std::vector<std::vector<char> > blockExclusions;

--- a/platforms/cpu/include/CpuNonbondedForce.h
+++ b/platforms/cpu/include/CpuNonbondedForce.h
@@ -48,7 +48,13 @@ class CpuNonbondedForce {
         --------------------------------------------------------------------------------------- */

       CpuNonbondedForce();
+       
+        /**
+         * Virtual destructor.
+         */

+        virtual ~CpuNonbondedForce();
+        
      /**---------------------------------------------------------------------------------------
      
         Set the force to use a cutoff.
@@ -151,7 +157,7 @@ class CpuNonbondedForce {
     */
    void threadComputeDirect(ThreadPool& threads, int threadIndex);

-private:
+protected:
        bool cutoff;
        bool useSwitch;
        bool periodic;
@@ -204,7 +210,7 @@ private:
            
         --------------------------------------------------------------------------------------- */
          
-      void calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
+      virtual void calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) = 0;
            
      /**---------------------------------------------------------------------------------------
      
@@ -216,7 +222,7 @@ private:
            
         --------------------------------------------------------------------------------------- */
          
-      void calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
+      virtual void calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) = 0;

      /**
       * Compute the displacement and squared distance between two points, optionally using
@@ -224,26 +230,15 @@ private:
       */
      void getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& deltaR, float& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;

-      /**
-       * Compute the displacement and squared distance between a collection of points, optionally using
-       * periodic boundary conditions.
-       */
-      void getDeltaR(const fvec4& posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;
-
-      /**
-       * Compute a fast approximation to erfc(x).
-       */
-      static fvec4 erfcApprox(fvec4 x);
-
      /**
       * Create a lookup table for the scale factor used with Ewald and PME.
       */
      void tabulateEwaldScaleFactor();
-      
+
      /**
-       * Evaluate the scale factor used with Ewald and PME: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
+       * Compute a fast approximation to erfc(x).
       */
-      fvec4 ewaldScaleFunction(fvec4 x);
+      static float erfcApprox(float x);
 };

 } // namespace OpenMM

--- a/platforms/cpu/include/CpuNonbondedForceVec4.h
+++ b/platforms/cpu/include/CpuNonbondedForceVec4.h
+
+/* Portions copyright (c) 2006-2013 Stanford University and Simbios.
+ * Contributors: Pande Group
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef OPENMM_CPU_NONBONDED_FORCE_VEC4_H__
+#define OPENMM_CPU_NONBONDED_FORCE_VEC4_H__
+
+#include "CpuNonbondedForce.h"
+// ---------------------------------------------------------------------------------------
+
+namespace OpenMM {
+
+class CpuNonbondedForceVec4 : public CpuNonbondedForce {
+public:
+      /**---------------------------------------------------------------------------------------
+      
+         Constructor
+      
+         --------------------------------------------------------------------------------------- */
+
+       CpuNonbondedForceVec4();
+
+protected:
+      /**---------------------------------------------------------------------------------------
+      
+         Calculate all the interactions for one atom block.
+      
+         @param blockIndex       the index of the atom block
+         @param forces           force array (forces added)
+         @param totalEnergy      total energy
+            
+         --------------------------------------------------------------------------------------- */
+          
+      void calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
+            
+      /**---------------------------------------------------------------------------------------
+      
+         Calculate all the interactions for one atom block.
+      
+         @param blockIndex       the index of the atom block
+         @param forces           force array (forces added)
+         @param totalEnergy      total energy
+            
+         --------------------------------------------------------------------------------------- */
+          
+      void calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
+
+      /**
+       * Compute the displacement and squared distance between a collection of points, optionally using
+       * periodic boundary conditions.
+       */
+      void getDeltaR(const float* posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;
+
+      /**
+       * Compute a fast approximation to erfc(x).
+       */
+      static fvec4 erfcApprox(fvec4 x);
+      
+      /**
+       * Evaluate the scale factor used with Ewald and PME: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
+       */
+      fvec4 ewaldScaleFunction(fvec4 x);
+};
+
+} // namespace OpenMM
+
+// ---------------------------------------------------------------------------------------
+
+#endif // OPENMM_CPU_NONBONDED_FORCE_VEC4_H__
--- a/platforms/cpu/include/CpuNonbondedForceVec8.h
+++ b/platforms/cpu/include/CpuNonbondedForceVec8.h
+
+/* Portions copyright (c) 2006-2013 Stanford University and Simbios.
+ * Contributors: Pande Group
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef OPENMM_CPU_NONBONDED_FORCE_VEC8_H__
+#define OPENMM_CPU_NONBONDED_FORCE_VEC8_H__
+
+#ifdef __AVX__
+
+#include "CpuNonbondedForce.h"
+#include "openmm/internal/vectorize8.h"
+
+// ---------------------------------------------------------------------------------------
+
+namespace OpenMM {
+
+class CpuNonbondedForceVec8 : public CpuNonbondedForce {
+public:
+       CpuNonbondedForceVec8();
+
+protected:            
+      /**---------------------------------------------------------------------------------------
+      
+         Calculate all the interactions for one atom block.
+      
+         @param blockIndex       the index of the atom block
+         @param forces           force array (forces added)
+         @param totalEnergy      total energy
+            
+         --------------------------------------------------------------------------------------- */
+          
+      void calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
+            
+      /**---------------------------------------------------------------------------------------
+      
+         Calculate all the interactions for one atom block.
+      
+         @param blockIndex       the index of the atom block
+         @param forces           force array (forces added)
+         @param totalEnergy      total energy
+            
+         --------------------------------------------------------------------------------------- */
+          
+      void calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
+
+      /**
+       * Compute the displacement and squared distance between a collection of points, optionally using
+       * periodic boundary conditions.
+       */
+      void getDeltaR(const float* posI, const fvec8& x, const fvec8& y, const fvec8& z, fvec8& dx, fvec8& dy, fvec8& dz, fvec8& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;
+
+      /**
+       * Compute a fast approximation to erfc(x).
+       */
+      static fvec8 erfcApprox(fvec8 x);
+      
+      /**
+       * Evaluate the scale factor used with Ewald and PME: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
+       */
+      fvec8 ewaldScaleFunction(fvec8 x);
+};
+
+} // namespace OpenMM
+
+// ---------------------------------------------------------------------------------------
+
+#endif // __AVX__
+
+#endif // OPENMM_CPU_NONBONDED_FORCE_VEC8_H__
--- a/platforms/cpu/include/CpuPlatform.h
+++ b/platforms/cpu/include/CpuPlatform.h
@@ -33,6 +33,7 @@
 * -------------------------------------------------------------------------- */

 #include "AlignedArray.h"
+#include "CpuRandom.h"
 #include "ReferencePlatform.h"
 #include "openmm/internal/ContextImpl.h"
 #include "openmm/internal/ThreadPool.h"
@@ -74,6 +75,7 @@ public:
    std::vector<AlignedArray<float> > threadForce;
    ThreadPool threads;
    bool isPeriodic;
+    CpuRandom random;
 };

 } // namespace OpenMM

--- a/platforms/cpu/include/CpuRandom.h
+++ b/platforms/cpu/include/CpuRandom.h
+#ifndef OPENMM_CPURANDOM_H_
+#define OPENMM_CPURANDOM_H_
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "sfmt/SFMT.h"
+#include <vector>
+
+namespace OpenMM {
+
+/**
+ * This class provides a multithreaded random number generator.
+ */
+class OPENMM_EXPORT CpuRandom {
+public:
+    CpuRandom();
+    ~CpuRandom();
+    void initialize(int seed, int numThreads);
+    float getGaussianRandom(int threadIndex);
+    float getUniformRandom(int threadIndex);
+private:
+    bool hasInitialized;
+    int randomSeed;
+    std::vector<OpenMM_SFMT::SFMT*> threadRandom;
+    std::vector<float> nextGaussian;
+    std::vector<int> nextGaussianIsValid;
+};
+
+} // namespace OpenMM
+
+#endif /*OPENMM_CPURANDOM_H_*/
--- a/platforms/cpu/sharedTarget/CMakeLists.txt
+++ b/platforms/cpu/sharedTarget/CMakeLists.txt
-SET_SOURCE_FILES_PROPERTIES(${SOURCE_FILES} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -msse4.1")
+FOREACH(file ${SOURCE_FILES})
+    IF (file MATCHES ".*Vec8.*")
+        SET_SOURCE_FILES_PROPERTIES(${file} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -msse4.1 -mavx")
+    ENDIF (file MATCHES ".*Vec8.*")
+ENDFOREACH(file)
 ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})

 IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
@@ -7,6 +11,6 @@ ELSE (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
    SET(MAIN_OPENMM_LIB ${OPENMM_LIBRARY_NAME})
 ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
 TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${MAIN_OPENMM_LIB} ${PTHREADS_LIB})
-SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_CPU_BUILDING_SHARED_LIBRARY")
+SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -msse4.1 -DOPENMM_CPU_BUILDING_SHARED_LIBRARY")

 INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET})
--- a/platforms/cpu/src/CpuKernelFactory.cpp
+++ b/platforms/cpu/src/CpuKernelFactory.cpp
@@ -45,5 +45,7 @@ KernelImpl* CpuKernelFactory::createKernelImpl(std::string name, const Platform&
        return new CpuCalcNonbondedForceKernel(name, platform, data);
    if (name == CalcGBSAOBCForceKernel::Name())
        return new CpuCalcGBSAOBCForceKernel(name, platform, data);
+    if (name == IntegrateLangevinStepKernel::Name())
+        return new CpuIntegrateLangevinStepKernel(name, platform, data);
    throw OpenMMException((std::string("Tried to create kernel with illegal kernel name '") + name + "'").c_str());
 }
--- a/platforms/cpu/src/CpuKernels.cpp
+++ b/platforms/cpu/src/CpuKernels.cpp
@@ -31,6 +31,7 @@

 #include "CpuKernels.h"
 #include "ReferenceBondForce.h"
+#include "ReferenceConstraints.h"
 #include "ReferenceKernelFactory.h"
 #include "ReferenceKernels.h"
 #include "ReferenceLJCoulomb14.h"
@@ -64,6 +65,71 @@ static RealVec& extractBoxSize(ContextImpl& context) {
    return *(RealVec*) data->periodicBoxSize;
 }

+static ReferenceConstraints& extractConstraints(ContextImpl& context) {
+    ReferencePlatform::PlatformData* data = reinterpret_cast<ReferencePlatform::PlatformData*>(context.getPlatformData());
+    return *(ReferenceConstraints*) data->constraints;
+}
+
+/**
+ * Compute the kinetic energy of the system, possibly shifting the velocities in time to account
+ * for a leapfrog integrator.
+ */
+static double computeShiftedKineticEnergy(ContextImpl& context, vector<double>& masses, double timeShift) {
+    vector<RealVec>& posData = extractPositions(context);
+    vector<RealVec>& velData = extractVelocities(context);
+    vector<RealVec>& forceData = extractForces(context);
+    int numParticles = context.getSystem().getNumParticles();
+    
+    // Compute the shifted velocities.
+    
+    vector<RealVec> shiftedVel(numParticles);
+    for (int i = 0; i < numParticles; ++i) {
+        if (masses[i] > 0)
+            shiftedVel[i] = velData[i]+forceData[i]*(timeShift/masses[i]);
+        else
+            shiftedVel[i] = velData[i];
+    }
+    
+    // Apply constraints to them.
+    
+    vector<double> inverseMasses(numParticles);
+    for (int i = 0; i < numParticles; i++)
+        inverseMasses[i] = (masses[i] == 0 ? 0 : 1/masses[i]);
+    extractConstraints(context).applyToVelocities(posData, shiftedVel, inverseMasses, 1e-4);
+    
+    // Compute the kinetic energy.
+    
+    double energy = 0.0;
+    for (int i = 0; i < numParticles; ++i)
+        if (masses[i] > 0)
+            energy += masses[i]*(shiftedVel[i].dot(shiftedVel[i]));
+    return 0.5*energy;
+}
+
+class CpuCalcForcesAndEnergyKernel::SumForceTask : public ThreadPool::Task {
+public:
+    SumForceTask(int numParticles, vector<RealVec>& forceData, CpuPlatform::PlatformData& data) : numParticles(numParticles), forceData(forceData), data(data) {
+    }
+    void execute(ThreadPool& threads, int threadIndex) {
+        // Sum the contributions to forces that have been calculated by different threads.
+        
+        int numThreads = threads.getNumThreads();
+        int start = threadIndex*numParticles/numThreads;
+        int end = (threadIndex+1)*numParticles/numThreads;
+        for (int i = start; i < end; i++) {
+            fvec4 f(0.0f);
+            for (int j = 0; j < numThreads; j++)
+                f += fvec4(&data.threadForce[j][4*i]);
+            forceData[i][0] += f[0];
+            forceData[i][1] += f[1];
+            forceData[i][2] += f[2];
+        }
+    }
+    int numParticles;
+    vector<RealVec>& forceData;
+    CpuPlatform::PlatformData& data;
+};
+
 CpuCalcForcesAndEnergyKernel::CpuCalcForcesAndEnergyKernel(std::string name, const Platform& platform, CpuPlatform::PlatformData& data, ContextImpl& context) :
        CalcForcesAndEnergyKernel(name, platform), data(data) {
    // Create a Reference platform version of this kernel.
@@ -111,17 +177,9 @@ void CpuCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool i
 double CpuCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
    // Sum the forces from all the threads.
    
-    int numParticles = context.getSystem().getNumParticles();
-    int numThreads = data.threads.getNumThreads();
-    vector<RealVec>& forceData = extractForces(context);
-    for (int i = 0; i < numParticles; i++) {
-        fvec4 f(0.0f);
-        for (int j = 0; j < numThreads; j++)
-            f += fvec4(&data.threadForce[j][4*i]);
-        forceData[i][0] += f[0];
-        forceData[i][1] += f[1];
-        forceData[i][2] += f[2];
-    }
+    SumForceTask task(context.getSystem().getNumParticles(), extractForces(context), data);
+    data.threads.execute(task);
+    data.threads.waitForThreads();
    return referenceKernel.getAs<ReferenceCalcForcesAndEnergyKernel>().finishComputation(context, includeForce, includeEnergy, groups);
 }

@@ -145,6 +203,22 @@ private:
    int numParticles;
 };

+bool isVec8Supported();
+CpuNonbondedForce* createCpuNonbondedForceVec4();
+CpuNonbondedForce* createCpuNonbondedForceVec8();
+
+CpuCalcNonbondedForceKernel::CpuCalcNonbondedForceKernel(string name, const Platform& platform, CpuPlatform::PlatformData& data) : CalcNonbondedForceKernel(name, platform),
+        data(data), bonded14IndexArray(NULL), bonded14ParamArray(NULL), hasInitializedPme(false), neighborList(NULL), nonbonded(NULL) {
+    if (isVec8Supported()) {
+        neighborList = new CpuNeighborList(8);
+        nonbonded = createCpuNonbondedForceVec8();
+    }
+    else {
+        neighborList = new CpuNeighborList(4);
+        nonbonded = createCpuNonbondedForceVec4();
+    }
+}
+
 CpuCalcNonbondedForceKernel::~CpuCalcNonbondedForceKernel() {
    if (bonded14ParamArray != NULL) {
        for (int i = 0; i < num14; i++) {
@@ -154,6 +228,10 @@ CpuCalcNonbondedForceKernel::~CpuCalcNonbondedForceKernel() {
        delete bonded14IndexArray;
        delete bonded14ParamArray;
    }
+    if (nonbonded != NULL)
+        delete nonbonded;
+    if (neighborList != NULL)
+        delete neighborList;
 }

 void CpuCalcNonbondedForceKernel::initialize(const System& system, const NonbondedForce& force) {
@@ -305,26 +383,26 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
                }
        }
        if (needRecompute) {
-            neighborList.computeNeighborList(numParticles, posq, exclusions, floatBoxSize, data.isPeriodic, nonbondedCutoff+padding, data.threads);
+            neighborList->computeNeighborList(numParticles, posq, exclusions, floatBoxSize, data.isPeriodic, nonbondedCutoff+padding, data.threads);
            lastPositions = posData;
        }
-        nonbonded.setUseCutoff(nonbondedCutoff, neighborList, rfDielectric);
+        nonbonded->setUseCutoff(nonbondedCutoff, *neighborList, rfDielectric);
    }
    if (data.isPeriodic) {
        double minAllowedSize = 1.999999*nonbondedCutoff;
        if (boxSize[0] < minAllowedSize || boxSize[1] < minAllowedSize || boxSize[2] < minAllowedSize)
            throw OpenMMException("The periodic box size has decreased to less than twice the nonbonded cutoff.");
-        nonbonded.setPeriodic(floatBoxSize);
+        nonbonded->setPeriodic(floatBoxSize);
    }
    if (ewald)
-        nonbonded.setUseEwald(ewaldAlpha, kmax[0], kmax[1], kmax[2]);
+        nonbonded->setUseEwald(ewaldAlpha, kmax[0], kmax[1], kmax[2]);
    if (pme)
-        nonbonded.setUsePME(ewaldAlpha, gridSize);
+        nonbonded->setUsePME(ewaldAlpha, gridSize);
    if (useSwitchingFunction)
-        nonbonded.setUseSwitchingFunction(switchingDistance);
+        nonbonded->setUseSwitchingFunction(switchingDistance);
    double nonbondedEnergy = 0;
    if (includeDirect)
-        nonbonded.calculateDirectIxn(numParticles, &posq[0], posData, particleParams, exclusions, data.threadForce, includeEnergy ? &nonbondedEnergy : NULL, data.threads);
+        nonbonded->calculateDirectIxn(numParticles, &posq[0], posData, particleParams, exclusions, data.threadForce, includeEnergy ? &nonbondedEnergy : NULL, data.threads);
    if (includeReciprocal) {
        if (useOptimizedPme) {
            PmeIO io(&posq[0], &data.threadForce[0][0], numParticles);
@@ -333,7 +411,7 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
            nonbondedEnergy += optimizedPme.getAs<CalcPmeReciprocalForceKernel>().finishComputation(io);
        }
        else
-            nonbonded.calculateReciprocalIxn(numParticles, &posq[0], posData, particleParams, exclusions, forceData, includeEnergy ? &nonbondedEnergy : NULL);
+            nonbonded->calculateReciprocalIxn(numParticles, &posq[0], posData, particleParams, exclusions, forceData, includeEnergy ? &nonbondedEnergy : NULL);
    }
    energy += nonbondedEnergy;
    if (includeDirect) {
@@ -440,3 +518,45 @@ void CpuCalcGBSAOBCForceKernel::copyParametersToContext(ContextImpl& context, co
    }
    obc.setParticleParameters(particleParams);
 }
+
+CpuIntegrateLangevinStepKernel::~CpuIntegrateLangevinStepKernel() {
+    if (dynamics)
+        delete dynamics;
+}
+
+void CpuIntegrateLangevinStepKernel::initialize(const System& system, const LangevinIntegrator& integrator) {
+    int numParticles = system.getNumParticles();
+    masses.resize(numParticles);
+    for (int i = 0; i < numParticles; ++i)
+        masses[i] = static_cast<RealOpenMM>(system.getParticleMass(i));
+    data.random.initialize(integrator.getRandomNumberSeed(), data.threads.getNumThreads());
+}
+
+void CpuIntegrateLangevinStepKernel::execute(ContextImpl& context, const LangevinIntegrator& integrator) {
+    double temperature = integrator.getTemperature();
+    double friction = integrator.getFriction();
+    double stepSize = integrator.getStepSize();
+    vector<RealVec>& posData = extractPositions(context);
+    vector<RealVec>& velData = extractVelocities(context);
+    vector<RealVec>& forceData = extractForces(context);
+    if (dynamics == 0 || temperature != prevTemp || friction != prevFriction || stepSize != prevStepSize) {
+        // Recreate the computation objects with the new parameters.
+        
+        if (dynamics)
+            delete dynamics;
+        RealOpenMM tau = (friction == 0.0 ? 0.0 : 1.0/friction);
+        dynamics = new CpuLangevinDynamics(context.getSystem().getNumParticles(), stepSize, tau, temperature, data.threads, data.random);
+        dynamics->setReferenceConstraintAlgorithm(&extractConstraints(context));
+        prevTemp = temperature;
+        prevFriction = friction;
+        prevStepSize = stepSize;
+    }
+    dynamics->update(context.getSystem(), posData, velData, forceData, masses, integrator.getConstraintTolerance());
+    ReferencePlatform::PlatformData* refData = reinterpret_cast<ReferencePlatform::PlatformData*>(context.getPlatformData());
+    refData->time += stepSize;
+    refData->stepCount++;
+}
+
+double CpuIntegrateLangevinStepKernel::computeKineticEnergy(ContextImpl& context, const LangevinIntegrator& integrator) {
+    return computeShiftedKineticEnergy(context, masses, 0.5*integrator.getStepSize());
+}
--- a/platforms/cpu/src/CpuLangevinDynamics.cpp
+++ b/platforms/cpu/src/CpuLangevinDynamics.cpp
+
+/* Portions copyright (c) 2006-2013 Stanford University and Simbios.
+ * Authors: Peter Eastman
+ * Contributors: 
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "SimTKOpenMMCommon.h"
+#include "SimTKOpenMMLog.h"
+#include "SimTKOpenMMUtilities.h"
+#include "CpuLangevinDynamics.h"
+
+using namespace OpenMM;
+using namespace std;
+
+class CpuLangevinDynamics::Update1Task : public ThreadPool::Task {
+public:
+    Update1Task(CpuLangevinDynamics& owner) : owner(owner) {
+    }
+    void execute(ThreadPool& threads, int threadIndex) {
+        owner.threadUpdate1(threadIndex);
+    }
+    CpuLangevinDynamics& owner;
+};
+
+class CpuLangevinDynamics::Update2Task : public ThreadPool::Task {
+public:
+    Update2Task(CpuLangevinDynamics& owner) : owner(owner) {
+    }
+    void execute(ThreadPool& threads, int threadIndex) {
+        owner.threadUpdate2(threadIndex);
+    }
+    CpuLangevinDynamics& owner;
+};
+
+CpuLangevinDynamics::CpuLangevinDynamics(int numberOfAtoms, RealOpenMM deltaT, RealOpenMM tau, RealOpenMM temperature, ThreadPool& threads, CpuRandom& random) : 
+           ReferenceStochasticDynamics(numberOfAtoms, deltaT, tau, temperature), threads(threads), random(random) {
+}
+
+CpuLangevinDynamics::~CpuLangevinDynamics() {
+}
+
+void CpuLangevinDynamics::updatePart1(int numberOfAtoms, vector<RealVec>& atomCoordinates, vector<RealVec>& velocities,
+                                      vector<RealVec>& forces, vector<RealOpenMM>& inverseMasses, vector<RealVec>& xPrime) {
+    // Record the parameters for the threads.
+    
+    this->numberOfAtoms = numberOfAtoms;
+    this->atomCoordinates = &atomCoordinates[0];
+    this->velocities = &velocities[0];
+    this->forces = &forces[0];
+    this->inverseMasses = &inverseMasses[0];
+    this->xPrime = &xPrime[0];
+    
+    // Signal the threads to start running and wait for them to finish.
+    
+    Update1Task task(*this);
+    threads.execute(task);
+    threads.waitForThreads();
+}
+
+void CpuLangevinDynamics::updatePart2(int numberOfAtoms, vector<RealVec>& atomCoordinates, vector<RealVec>& velocities,
+                                      vector<RealVec>& forces, vector<RealOpenMM>& inverseMasses, vector<RealVec>& xPrime) {
+    // Record the parameters for the threads.
+    
+    this->numberOfAtoms = numberOfAtoms;
+    this->atomCoordinates = &atomCoordinates[0];
+    this->velocities = &velocities[0];
+    this->forces = &forces[0];
+    this->inverseMasses = &inverseMasses[0];
+    this->xPrime = &xPrime[0];
+    
+    // Signal the threads to start running and wait for them to finish.
+    
+    Update2Task task(*this);
+    threads.execute(task);
+    threads.waitForThreads();
+}
+
+void CpuLangevinDynamics::threadUpdate1(int threadIndex) {
+    const RealOpenMM tau = getTau();
+    const RealOpenMM vscale = EXP(-getDeltaT()/tau);
+    const RealOpenMM fscale = (1-vscale)*tau;
+    const RealOpenMM kT = BOLTZ*getTemperature();
+    const RealOpenMM noisescale = SQRT(2*kT/tau)*SQRT(0.5*(1-vscale*vscale)*tau);
+    int start = threadIndex*numberOfAtoms/threads.getNumThreads();
+    int end = (threadIndex+1)*numberOfAtoms/threads.getNumThreads();
+
+    for (int i = start; i < end; i++) {
+        if (inverseMasses[i] != 0.0) {
+            RealOpenMM sqrtInvMass = SQRT(inverseMasses[i]);
+            RealVec noise(random.getGaussianRandom(threadIndex), random.getGaussianRandom(threadIndex), random.getGaussianRandom(threadIndex));
+            velocities[i]  = velocities[i]*vscale + forces[i]*(fscale*inverseMasses[i]) + noise*(noisescale*sqrtInvMass);
+        }
+   }
+}
+
+void CpuLangevinDynamics::threadUpdate2(int threadIndex) {
+    const RealOpenMM dt = getDeltaT();
+    int start = threadIndex*numberOfAtoms/threads.getNumThreads();
+    int end = (threadIndex+1)*numberOfAtoms/threads.getNumThreads();
+
+    for (int i = start; i < end; i++) {
+        if (inverseMasses[i] != 0.0) {
+            RealOpenMM sqrtInvMass = SQRT(inverseMasses[i]);
+            xPrime[i] = atomCoordinates[i]+velocities[i]*dt;
+        }
+   }
+}
--- a/platforms/cpu/src/CpuNeighborList.cpp
+++ b/platforms/cpu/src/CpuNeighborList.cpp
@@ -43,8 +43,6 @@ using namespace std;

 namespace OpenMM {

-const int CpuNeighborList::BlockSize = 4;
-
 class VoxelIndex 
 {
 public:
@@ -62,8 +60,8 @@ public:
 */
 class CpuNeighborList::Voxels {
 public:
-    Voxels(float vsx, float vsy, float minx, float maxx, float miny, float maxy, const float* periodicBoxSize, bool usePeriodic) :
-            voxelSizeX(vsx), voxelSizeY(vsy), minx(minx), maxx(maxx), miny(miny), maxy(maxy), periodicBoxSize(periodicBoxSize), usePeriodic(usePeriodic) {
+    Voxels(int blockSize, float vsx, float vsy, float minx, float maxx, float miny, float maxy, const float* periodicBoxSize, bool usePeriodic) :
+            blockSize(blockSize), voxelSizeX(vsx), voxelSizeY(vsy), minx(minx), maxx(maxx), miny(miny), maxy(maxy), periodicBoxSize(periodicBoxSize), usePeriodic(usePeriodic) {
        if (usePeriodic) {
            nx = (int) floorf(periodicBoxSize[0]/voxelSizeX+0.5f);
            ny = (int) floorf(periodicBoxSize[1]/voxelSizeY+0.5f);
@@ -156,7 +154,7 @@ public:
        return VoxelIndex(x, y);
    }

-    void getNeighbors(vector<int>& neighbors, int blockIndex, fvec4 blockCenter, fvec4 blockWidth, const vector<int>& sortedAtoms, vector<char>& exclusions, float maxDistance, const vector<int> blockAtoms, const float* atomLocations) const {
+    void getNeighbors(vector<int>& neighbors, int blockIndex, fvec4 blockCenter, fvec4 blockWidth, const vector<int>& sortedAtoms, vector<char>& exclusions, float maxDistance, const vector<int>& blockAtoms, const float* atomLocations, const vector<VoxelIndex>& atomVoxelIndex) const {
        neighbors.resize(0);
        exclusions.resize(0);
        fvec4 boxSize(periodicBoxSize[0], periodicBoxSize[1], periodicBoxSize[2], 0);
@@ -175,9 +173,6 @@ public:
        float centerPos[4];
        blockCenter.store(centerPos);
        VoxelIndex centerVoxelIndex = getVoxelIndex(centerPos);
-        VoxelIndex atomVoxelIndex[BlockSize];
-        for (int i = 0; i < (int) blockAtoms.size(); i++)
-            atomVoxelIndex[i] = getVoxelIndex(&atomLocations[4*blockAtoms[i]]);
        int startx = centerVoxelIndex.x-dIndexX;
        int starty = centerVoxelIndex.y-dIndexY;
        int endx = centerVoxelIndex.x+dIndexX;
@@ -193,7 +188,7 @@ public:
            endx = min(endx, nx-1);
            endy = min(endy, ny-1);
        }
-        int lastSortedIndex = BlockSize*(blockIndex+1);
+        int lastSortedIndex = blockSize*(blockIndex+1);
        VoxelIndex voxelIndex(0, 0);
        for (int x = startx; x <= endx; ++x) {
            voxelIndex.x = x;
@@ -300,10 +295,12 @@ public:
                        // Add this atom to the list of neighbors.
                        
                        neighbors.push_back(sortedAtoms[sortedIndex]);
-                        if (sortedIndex < BlockSize*blockIndex)
+                        if (sortedIndex < blockSize*blockIndex)
                            exclusions.push_back(0);
-                        else
-                            exclusions.push_back(0xF & (0xF<<(sortedIndex-BlockSize*blockIndex)));
+                        else {
+                            int mask = (1<<blockSize)-1;
+                            exclusions.push_back(mask & (mask<<(sortedIndex-blockSize*blockIndex)));
+                        }
                    }
                }
            }
@@ -311,6 +308,7 @@ public:
    }

 private:
+    int blockSize;
    float voxelSizeX, voxelSizeY;
    float minx, maxx, miny, maxy;
    int nx, ny;
@@ -329,12 +327,12 @@ public:
    CpuNeighborList& owner;
 };

-CpuNeighborList::CpuNeighborList() {
+CpuNeighborList::CpuNeighborList(int blockSize) : blockSize(blockSize) {
 }

 void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float>& atomLocations, const vector<set<int> >& exclusions,
            const float* periodicBoxSize, bool usePeriodic, float maxDistance, ThreadPool& threads) {
-    int numBlocks = (numAtoms+BlockSize-1)/BlockSize;
+    int numBlocks = (numAtoms+blockSize-1)/blockSize;
    blockNeighbors.resize(numBlocks);
    blockExclusions.resize(numBlocks);
    sortedAtoms.resize(numAtoms);
@@ -381,7 +379,7 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float
        edgeSizeX = 0.6f*periodicBoxSize[0]/floorf(periodicBoxSize[0]/maxDistance);
        edgeSizeY = 0.6f*periodicBoxSize[1]/floorf(periodicBoxSize[1]/maxDistance);
    }
-    Voxels voxels(edgeSizeX, edgeSizeY, minx, maxx, miny, maxy, periodicBoxSize, usePeriodic);
+    Voxels voxels(blockSize, edgeSizeX, edgeSizeY, minx, maxx, miny, maxy, periodicBoxSize, usePeriodic);
    for (int i = 0; i < numAtoms; i++) {
        int atomIndex = atomBins[i].second;
        sortedAtoms[i] = atomIndex;
@@ -397,9 +395,9 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float
    
    // Add padding atoms to fill up the last block.
    
-    int numPadding = numBlocks*BlockSize-numAtoms;
+    int numPadding = numBlocks*blockSize-numAtoms;
    if (numPadding > 0) {
-        char mask = (0xF0 >> numPadding) & 0xF;
+        char mask = ((0xFFFF-(1<<blockSize)+1) >> numPadding);
        for (int i = 0; i < numPadding; i++)
            sortedAtoms.push_back(0);
        vector<char>& exc = blockExclusions[blockExclusions.size()-1];
@@ -409,7 +407,7 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float
 }

 int CpuNeighborList::getNumBlocks() const {
-    return sortedAtoms.size()/BlockSize;
+    return sortedAtoms.size()/blockSize;
 }

 const std::vector<int>& CpuNeighborList::getSortedAtoms() const {
@@ -446,14 +444,18 @@ void CpuNeighborList::threadComputeNeighborList(ThreadPool& threads, int threadI

    int numBlocks = blockNeighbors.size();
    vector<int> blockAtoms;
+    vector<VoxelIndex> atomVoxelIndex;
    for (int i = threadIndex; i < numBlocks; i += numThreads) {
        // Find the atoms in this block and compute their bounding box.
        
-        int firstIndex = BlockSize*i;
-        int atomsInBlock = min(BlockSize, numAtoms-firstIndex);
+        int firstIndex = blockSize*i;
+        int atomsInBlock = min(blockSize, numAtoms-firstIndex);
        blockAtoms.resize(atomsInBlock);
-        for (int j = 0; j < atomsInBlock; j++)
+        atomVoxelIndex.resize(atomsInBlock);
+        for (int j = 0; j < atomsInBlock; j++) {
            blockAtoms[j] = sortedAtoms[firstIndex+j];
+            atomVoxelIndex[j] = voxels->getVoxelIndex(&atomLocations[4*blockAtoms[j]]);
+        }
        fvec4 minPos(&atomLocations[4*sortedAtoms[firstIndex]]);
        fvec4 maxPos = minPos;
        for (int j = 1; j < atomsInBlock; j++) {
@@ -461,7 +463,7 @@ void CpuNeighborList::threadComputeNeighborList(ThreadPool& threads, int threadI
            minPos = min(minPos, pos);
            maxPos = max(maxPos, pos);
        }
-        voxels->getNeighbors(blockNeighbors[i], i, (maxPos+minPos)*0.5f, (maxPos-minPos)*0.5f, sortedAtoms, blockExclusions[i], maxDistance, blockAtoms, atomLocations);
+        voxels->getNeighbors(blockNeighbors[i], i, (maxPos+minPos)*0.5f, (maxPos-minPos)*0.5f, sortedAtoms, blockExclusions[i], maxDistance, blockAtoms, atomLocations, atomVoxelIndex);

        // Record the exclusions for this block.


--- a/platforms/cpu/src/CpuNonbondedForce.cpp
+++ b/platforms/cpu/src/CpuNonbondedForce.cpp
@@ -29,7 +29,6 @@
 #include "CpuNonbondedForce.h"
 #include "ReferenceForce.h"
 #include "ReferencePME.h"
-#include "openmm/internal/vectorize.h"
 #include "gmx_atomic.h"

 // In case we're using some primitive version of Visual Studio this will
@@ -61,6 +60,9 @@ public:
 CpuNonbondedForce::CpuNonbondedForce() : cutoff(false), useSwitch(false), periodic(false), ewald(false), pme(false), tableIsValid(false) {
 }

+CpuNonbondedForce::~CpuNonbondedForce() {
+}
+
 /**---------------------------------------------------------------------------------------

   Set the force to use a cutoff.
@@ -356,7 +358,7 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
                    float inverseR = 1/r;
                    float chargeProd = ONE_4PI_EPS0*posq[4*i+3]*posq[4*j+3];
                    float alphaR = alphaEwald*r;
-                    float erfcAlphaR = erfcApprox(alphaR)[0];
+                    float erfcAlphaR = erfcApprox(alphaR);
                    float dEdR = (float) (chargeProd * inverseR * inverseR * inverseR);
                    dEdR = (float) (dEdR * (1.0f-erfcAlphaR-TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR)));
                    fvec4 result = deltaR*dEdR;
@@ -446,228 +448,6 @@ void CpuNonbondedForce::calculateOneIxn(int ii, int jj, float* forces, double* t
    (fvec4(forces+4*jj)-result).store(forces+4*jj);
  }

-void CpuNonbondedForce::calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
-    // Load the positions and parameters of the atoms in the block.
-    
-    int blockAtom[4];
-    fvec4 blockAtomPosq[4];
-    fvec4 blockAtomForce[4];
-    for (int i = 0; i < 4; i++) {
-        blockAtom[i] = neighborList->getSortedAtoms()[4*blockIndex+i];
-        blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
-        blockAtomForce[i] = fvec4(0.0f);
-    }
-    fvec4 blockAtomX = fvec4(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0]);
-    fvec4 blockAtomY = fvec4(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1]);
-    fvec4 blockAtomZ = fvec4(blockAtomPosq[0][2], blockAtomPosq[1][2], blockAtomPosq[2][2], blockAtomPosq[3][2]);
-    fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
-    fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
-    fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
-    bool needPeriodic = false;
-    if (periodic) {
-        for (int i = 0; i < 4 && !needPeriodic; i++)
-            for (int j = 0; j < 3; j++)
-                if (blockAtomPosq[i][j]-cutoffDistance < 0.0 || blockAtomPosq[i][j]+cutoffDistance > boxSize[j]) {
-                    needPeriodic = true;
-                    break;
-                }
-    }
-    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
-    
-    // Loop over neighbors for this block.
-    
-    const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
-    const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
-    for (int i = 0; i < (int) neighbors.size(); i++) {
-        // Load the next neighbor.
-        
-        int atom = neighbors[i];
-        fvec4 atomPosq(posq+4*atom);
-        
-        // Compute the distances to the block atoms.
-        
-        fvec4 dx, dy, dz, r2;
-        getDeltaR(atomPosq, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
-        ivec4 include;
-        char excl = exclusions[i];
-        if (excl == 0)
-            include = -1;
-        else
-            include = ivec4(excl&1 ? 0 : -1, excl&2 ? 0 : -1, excl&4 ? 0 : -1, excl&8 ? 0 : -1);
-        include = include & (r2 < cutoffDistance*cutoffDistance);
-        if (!any(include))
-            continue; // No interactions to compute.
-        
-        // Compute the interactions.
-        
-        fvec4 r = sqrt(r2);
-        fvec4 inverseR = fvec4(1.0f)/r;
-        fvec4 energy, dEdR;
-        float atomEpsilon = atomParameters[atom].second;
-        if (atomEpsilon != 0.0f) {
-            fvec4 sig = blockAtomSigma+atomParameters[atom].first;
-            fvec4 sig2 = inverseR*sig;
-            sig2 *= sig2;
-            fvec4 sig6 = sig2*sig2*sig2;
-            fvec4 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
-            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
-            energy = epsSig6*(sig6-1.0f);
-            if (useSwitch) {
-                fvec4 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval);
-                fvec4 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
-                fvec4 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
-                dEdR = switchValue*dEdR - energy*switchDeriv*r;
-                energy *= switchValue;
-            }
-        }
-        else {
-            energy = 0.0f;
-            dEdR = 0.0f;
-        }
-        fvec4 chargeProd = blockAtomCharge*posq[4*atom+3];
-        if (cutoff)
-            dEdR += chargeProd*(inverseR-2.0f*krf*r2);
-        else
-            dEdR += chargeProd*inverseR;
-        dEdR *= inverseR*inverseR;
-
-        // Accumulate energies.
-
-        if (totalEnergy) {
-            if (cutoff)
-                energy += chargeProd*(inverseR+krf*r2-crf);
-            else
-                energy += chargeProd*inverseR;
-            energy = blend(0.0f, energy, include);
-            *totalEnergy += dot4(energy, 1.0f);
-        }
-
-        // Accumulate forces.
-
-        dEdR = blend(0.0f, dEdR, include);
-        fvec4 result[4] = {dx*dEdR, dy*dEdR, dz*dEdR, 0.0f};
-        transpose(result[0], result[1], result[2], result[3]);
-        fvec4 atomForce(forces+4*atom);
-        for (int j = 0; j < 4; j++) {
-            blockAtomForce[j] += result[j];
-            atomForce -= result[j];
-        }
-        atomForce.store(forces+4*atom);
-    }
-    
-    // Record the forces on the block atoms.
-
-    for (int j = 0; j < 4; j++)
-        (fvec4(forces+4*blockAtom[j])+blockAtomForce[j]).store(forces+4*blockAtom[j]);
-  }
-
-void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
-    // Load the positions and parameters of the atoms in the block.
-    
-    int blockAtom[4];
-    fvec4 blockAtomPosq[4];
-    fvec4 blockAtomForce[4];
-    for (int i = 0; i < 4; i++) {
-        blockAtom[i] = neighborList->getSortedAtoms()[4*blockIndex+i];
-        blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
-        blockAtomForce[i] = fvec4(0.0f);
-    }
-    fvec4 blockAtomX = fvec4(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0]);
-    fvec4 blockAtomY = fvec4(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1]);
-    fvec4 blockAtomZ = fvec4(blockAtomPosq[0][2], blockAtomPosq[1][2], blockAtomPosq[2][2], blockAtomPosq[3][2]);
-    fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
-    fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
-    fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
-    bool needPeriodic = false;
-    for (int i = 0; i < 4 && !needPeriodic; i++)
-        for (int j = 0; j < 3; j++)
-            if (blockAtomPosq[i][j]-cutoffDistance < 0.0 || blockAtomPosq[i][j]+cutoffDistance > boxSize[j]) {
-                needPeriodic = true;
-                break;
-            }
-    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
-    
-    // Loop over neighbors for this block.
-    
-    const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
-    const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
-    for (int i = 0; i < (int) neighbors.size(); i++) {
-        // Load the next neighbor.
-        
-        int atom = neighbors[i];
-        fvec4 atomPosq(posq+4*atom);
-        
-        // Compute the distances to the block atoms.
-        
-        fvec4 dx, dy, dz, r2;
-        getDeltaR(atomPosq, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
-        ivec4 include;
-        char excl = exclusions[i];
-        if (excl == 0)
-            include = -1;
-        else
-            include = ivec4(excl&1 ? 0 : -1, excl&2 ? 0 : -1, excl&4 ? 0 : -1, excl&8 ? 0 : -1);
-        include = include & (r2 < cutoffDistance*cutoffDistance);
-        if (!any(include))
-            continue; // No interactions to compute.
-        
-        // Compute the interactions.
-        
-        fvec4 r = sqrt(r2);
-        fvec4 inverseR = fvec4(1.0f)/r;
-        fvec4 energy, dEdR;
-        float atomEpsilon = atomParameters[atom].second;
-        if (atomEpsilon != 0.0f) {
-            fvec4 sig = blockAtomSigma+atomParameters[atom].first;
-            fvec4 sig2 = inverseR*sig;
-            sig2 *= sig2;
-            fvec4 sig6 = sig2*sig2*sig2;
-            fvec4 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
-            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
-            energy = epsSig6*(sig6-1.0f);
-            if (useSwitch) {
-                fvec4 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval);
-                fvec4 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
-                fvec4 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
-                dEdR = switchValue*dEdR - energy*switchDeriv*r;
-                energy *= switchValue;
-            }
-        }
-        else {
-            energy = 0.0f;
-            dEdR = 0.0f;
-        }
-        fvec4 chargeProd = blockAtomCharge*posq[4*atom+3];
-        dEdR += chargeProd*inverseR*ewaldScaleFunction(r);
-        dEdR *= inverseR*inverseR;        
-
-        // Accumulate energies.
-
-        if (totalEnergy) {
-            energy += chargeProd*inverseR*erfcApprox(alphaEwald*r);
-            energy = blend(0.0f, energy, include);
-            *totalEnergy += dot4(energy, 1.0f);
-        }
-
-        // Accumulate forces.
-
-        dEdR = blend(0.0f, dEdR, include);
-        fvec4 result[4] = {dx*dEdR, dy*dEdR, dz*dEdR, 0.0f};
-        transpose(result[0], result[1], result[2], result[3]);
-        fvec4 atomForce(forces+4*atom);
-        for (int j = 0; j < 4; j++) {
-            blockAtomForce[j] += result[j];
-            atomForce -= result[j];
-        }
-        atomForce.store(forces+4*atom);
-    }
-    
-    // Record the forces on the block atoms.
-    
-    for (int j = 0; j < 4; j++)
-        (fvec4(forces+4*blockAtom[j])+blockAtomForce[j]).store(forces+4*blockAtom[j]);
-}
-
 void CpuNonbondedForce::getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& deltaR, float& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
    deltaR = posJ-posI;
    if (periodic) {
@@ -677,41 +457,15 @@ void CpuNonbondedForce::getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& d
    r2 = dot3(deltaR, deltaR);
 }

-void CpuNonbondedForce::getDeltaR(const fvec4& posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
-    dx = x-posI[0];
-    dy = y-posI[1];
-    dz = z-posI[2];
-    if (periodic) {
-        dx -= round(dx*invBoxSize[0])*boxSize[0];
-        dy -= round(dy*invBoxSize[1])*boxSize[1];
-        dz -= round(dz*invBoxSize[2])*boxSize[2];
-    }
-    r2 = dx*dx + dy*dy + dz*dz;
-}
-
-fvec4 CpuNonbondedForce::erfcApprox(fvec4 x) {
+float CpuNonbondedForce::erfcApprox(float x) {
    // This approximation for erfc is from Abramowitz and Stegun (1964) p. 299.  They cite the following as
    // the original source: C. Hastings, Jr., Approximations for Digital Computers (1955).  It has a maximum
    // error of 3e-7.

-    fvec4 t = 1.0f+(0.0705230784f+(0.0422820123f+(0.0092705272f+(0.0001520143f+(0.0002765672f+0.0000430638f*x)*x)*x)*x)*x)*x;
+    float t = 1.0f+(0.0705230784f+(0.0422820123f+(0.0092705272f+(0.0001520143f+(0.0002765672f+0.0000430638f*x)*x)*x)*x)*x)*x;
    t *= t;
    t *= t;
    t *= t;
    return 1.0f/(t*t);
 }

-fvec4 CpuNonbondedForce::ewaldScaleFunction(fvec4 x) {
-    // Compute the tabulated Ewald scale factor: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
-
-    fvec4 x1 = x*ewaldDXInv;
-    ivec4 index = min(floor(x1), NUM_TABLE_POINTS);
-    fvec4 coeff2 = x1-index;
-    fvec4 coeff1 = 1.0f-coeff2;
-    fvec4 t1(&ewaldScaleTable[index[0]]);
-    fvec4 t2(&ewaldScaleTable[index[1]]);
-    fvec4 t3(&ewaldScaleTable[index[2]]);
-    fvec4 t4(&ewaldScaleTable[index[3]]);
-    transpose(t1, t2, t3, t4);
-    return coeff1*t1 + coeff2*t2;
-}
--- a/platforms/cpu/src/CpuNonbondedForceVec4.cpp
+++ b/platforms/cpu/src/CpuNonbondedForceVec4.cpp
+
+/* Portions copyright (c) 2006-2013 Stanford University and Simbios.
+ * Contributors: Pande Group
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "SimTKOpenMMCommon.h"
+#include "SimTKOpenMMUtilities.h"
+#include "CpuNonbondedForceVec4.h"
+
+using namespace std;
+using namespace OpenMM;
+
+/**
+ * Factory method to create a CpuNonbondedForceVec4.
+ */
+CpuNonbondedForce* createCpuNonbondedForceVec4() {
+    return new CpuNonbondedForceVec4();
+}
+
+/**---------------------------------------------------------------------------------------
+
+   CpuNonbondedForceVec4 constructor
+
+   --------------------------------------------------------------------------------------- */
+
+CpuNonbondedForceVec4::CpuNonbondedForceVec4() {
+}
+
+void CpuNonbondedForceVec4::calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
+    // Load the positions and parameters of the atoms in the block.
+    
+    int blockAtom[4];
+    fvec4 blockAtomPosq[4];
+    fvec4 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
+    for (int i = 0; i < 4; i++) {
+        blockAtom[i] = neighborList->getSortedAtoms()[4*blockIndex+i];
+        blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
+    }
+    fvec4 blockAtomX = fvec4(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0]);
+    fvec4 blockAtomY = fvec4(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1]);
+    fvec4 blockAtomZ = fvec4(blockAtomPosq[0][2], blockAtomPosq[1][2], blockAtomPosq[2][2], blockAtomPosq[3][2]);
+    fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
+    fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
+    fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
+    bool needPeriodic = (periodic && (any(blockAtomX < cutoffDistance) || any(blockAtomY < cutoffDistance) || any(blockAtomZ < cutoffDistance) ||
+            any(blockAtomX > boxSize[0]-cutoffDistance) || any(blockAtomY > boxSize[1]-cutoffDistance) || any(blockAtomZ > boxSize[2]-cutoffDistance)));
+    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
+    
+    // Loop over neighbors for this block.
+    
+    const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
+    const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
+    for (int i = 0; i < (int) neighbors.size(); i++) {
+        // Load the next neighbor.
+        
+        int atom = neighbors[i];
+        
+        // Compute the distances to the block atoms.
+        
+        fvec4 dx, dy, dz, r2;
+        getDeltaR(posq+4*atom, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
+        ivec4 include;
+        char excl = exclusions[i];
+        if (excl == 0)
+            include = -1;
+        else
+            include = ivec4(excl&1 ? 0 : -1, excl&2 ? 0 : -1, excl&4 ? 0 : -1, excl&8 ? 0 : -1);
+        include = include & (r2 < cutoffDistance*cutoffDistance);
+        if (!any(include))
+            continue; // No interactions to compute.
+        
+        // Compute the interactions.
+        
+        fvec4 r = sqrt(r2);
+        fvec4 inverseR = fvec4(1.0f)/r;
+        fvec4 energy, dEdR;
+        float atomEpsilon = atomParameters[atom].second;
+        if (atomEpsilon != 0.0f) {
+            fvec4 sig = blockAtomSigma+atomParameters[atom].first;
+            fvec4 sig2 = inverseR*sig;
+            sig2 *= sig2;
+            fvec4 sig6 = sig2*sig2*sig2;
+            fvec4 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
+            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
+            energy = epsSig6*(sig6-1.0f);
+            if (useSwitch) {
+                fvec4 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval);
+                fvec4 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
+                fvec4 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
+                dEdR = switchValue*dEdR - energy*switchDeriv*r;
+                energy *= switchValue;
+            }
+        }
+        else {
+            energy = 0.0f;
+            dEdR = 0.0f;
+        }
+        fvec4 chargeProd = blockAtomCharge*posq[4*atom+3];
+        if (cutoff)
+            dEdR += chargeProd*(inverseR-2.0f*krf*r2);
+        else
+            dEdR += chargeProd*inverseR;
+        dEdR *= inverseR*inverseR;
+
+        // Accumulate energies.
+
+        fvec4 one(1.0f);
+        if (totalEnergy) {
+            if (cutoff)
+                energy += chargeProd*(inverseR+krf*r2-crf);
+            else
+                energy += chargeProd*inverseR;
+            energy = blend(0.0f, energy, include);
+            *totalEnergy += dot4(energy, one);
+        }
+
+        // Accumulate forces.
+
+        dEdR = blend(0.0f, dEdR, include);
+        fvec4 fx = dx*dEdR;
+        fvec4 fy = dy*dEdR;
+        fvec4 fz = dz*dEdR;
+        blockAtomForceX += fx;
+        blockAtomForceY += fy;
+        blockAtomForceZ += fz;
+        float* atomForce = forces+4*atom;
+        atomForce[0] -= dot4(fx, one);
+        atomForce[1] -= dot4(fy, one);
+        atomForce[2] -= dot4(fz, one);
+    }
+    
+    // Record the forces on the block atoms.
+
+    fvec4 f[4] = {blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f};
+    transpose(f[0], f[1], f[2], f[3]);
+    for (int j = 0; j < 4; j++)
+        (fvec4(forces+4*blockAtom[j])+f[j]).store(forces+4*blockAtom[j]);
+  }
+
+void CpuNonbondedForceVec4::calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
+    // Load the positions and parameters of the atoms in the block.
+    
+    int blockAtom[4];
+    fvec4 blockAtomPosq[4];
+    fvec4 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
+    for (int i = 0; i < 4; i++) {
+        blockAtom[i] = neighborList->getSortedAtoms()[4*blockIndex+i];
+        blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
+    }
+    fvec4 blockAtomX = fvec4(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0]);
+    fvec4 blockAtomY = fvec4(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1]);
+    fvec4 blockAtomZ = fvec4(blockAtomPosq[0][2], blockAtomPosq[1][2], blockAtomPosq[2][2], blockAtomPosq[3][2]);
+    fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
+    fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
+    fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
+    bool needPeriodic = (periodic && (any(blockAtomX < cutoffDistance) || any(blockAtomY < cutoffDistance) || any(blockAtomZ < cutoffDistance) ||
+            any(blockAtomX > boxSize[0]-cutoffDistance) || any(blockAtomY > boxSize[1]-cutoffDistance) || any(blockAtomZ > boxSize[2]-cutoffDistance)));
+    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
+    
+    // Loop over neighbors for this block.
+    
+    const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
+    const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
+    for (int i = 0; i < (int) neighbors.size(); i++) {
+        // Load the next neighbor.
+        
+        int atom = neighbors[i];
+        
+        // Compute the distances to the block atoms.
+        
+        fvec4 dx, dy, dz, r2;
+        getDeltaR(posq+4*atom, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
+        ivec4 include;
+        char excl = exclusions[i];
+        if (excl == 0)
+            include = -1;
+        else
+            include = ivec4(excl&1 ? 0 : -1, excl&2 ? 0 : -1, excl&4 ? 0 : -1, excl&8 ? 0 : -1);
+        include = include & (r2 < cutoffDistance*cutoffDistance);
+        if (!any(include))
+            continue; // No interactions to compute.
+        
+        // Compute the interactions.
+        
+        fvec4 r = sqrt(r2);
+        fvec4 inverseR = fvec4(1.0f)/r;
+        fvec4 energy, dEdR;
+        float atomEpsilon = atomParameters[atom].second;
+        if (atomEpsilon != 0.0f) {
+            fvec4 sig = blockAtomSigma+atomParameters[atom].first;
+            fvec4 sig2 = inverseR*sig;
+            sig2 *= sig2;
+            fvec4 sig6 = sig2*sig2*sig2;
+            fvec4 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
+            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
+            energy = epsSig6*(sig6-1.0f);
+            if (useSwitch) {
+                fvec4 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval);
+                fvec4 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
+                fvec4 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
+                dEdR = switchValue*dEdR - energy*switchDeriv*r;
+                energy *= switchValue;
+            }
+        }
+        else {
+            energy = 0.0f;
+            dEdR = 0.0f;
+        }
+        fvec4 chargeProd = blockAtomCharge*posq[4*atom+3];
+        dEdR += chargeProd*inverseR*ewaldScaleFunction(r);
+        dEdR *= inverseR*inverseR;        
+
+        // Accumulate energies.
+
+        fvec4 one(1.0f);
+        if (totalEnergy) {
+            energy += chargeProd*inverseR*erfcApprox(alphaEwald*r);
+            energy = blend(0.0f, energy, include);
+            *totalEnergy += dot4(energy, one);
+        }
+
+        // Accumulate forces.
+
+        dEdR = blend(0.0f, dEdR, include);
+        fvec4 fx = dx*dEdR;
+        fvec4 fy = dy*dEdR;
+        fvec4 fz = dz*dEdR;
+        blockAtomForceX += fx;
+        blockAtomForceY += fy;
+        blockAtomForceZ += fz;
+        float* atomForce = forces+4*atom;
+        atomForce[0] -= dot4(fx, one);
+        atomForce[1] -= dot4(fy, one);
+        atomForce[2] -= dot4(fz, one);
+    }
+    
+    // Record the forces on the block atoms.
+    
+    fvec4 f[4] = {blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f};
+    transpose(f[0], f[1], f[2], f[3]);
+    for (int j = 0; j < 4; j++)
+        (fvec4(forces+4*blockAtom[j])+f[j]).store(forces+4*blockAtom[j]);
+}
+
+void CpuNonbondedForceVec4::getDeltaR(const float* posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
+    dx = x-posI[0];
+    dy = y-posI[1];
+    dz = z-posI[2];
+    if (periodic) {
+        dx -= round(dx*invBoxSize[0])*boxSize[0];
+        dy -= round(dy*invBoxSize[1])*boxSize[1];
+        dz -= round(dz*invBoxSize[2])*boxSize[2];
+    }
+    r2 = dx*dx + dy*dy + dz*dz;
+}
+
+fvec4 CpuNonbondedForceVec4::erfcApprox(fvec4 x) {
+    // This approximation for erfc is from Abramowitz and Stegun (1964) p. 299.  They cite the following as
+    // the original source: C. Hastings, Jr., Approximations for Digital Computers (1955).  It has a maximum
+    // error of 3e-7.
+
+    fvec4 t = 1.0f+(0.0705230784f+(0.0422820123f+(0.0092705272f+(0.0001520143f+(0.0002765672f+0.0000430638f*x)*x)*x)*x)*x)*x;
+    t *= t;
+    t *= t;
+    t *= t;
+    return 1.0f/(t*t);
+}
+
+fvec4 CpuNonbondedForceVec4::ewaldScaleFunction(fvec4 x) {
+    // Compute the tabulated Ewald scale factor: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
+
+    fvec4 x1 = x*ewaldDXInv;
+    ivec4 index = min(floor(x1), NUM_TABLE_POINTS);
+    fvec4 coeff2 = x1-index;
+    fvec4 coeff1 = 1.0f-coeff2;
+    fvec4 t1(&ewaldScaleTable[index[0]]);
+    fvec4 t2(&ewaldScaleTable[index[1]]);
+    fvec4 t3(&ewaldScaleTable[index[2]]);
+    fvec4 t4(&ewaldScaleTable[index[3]]);
+    transpose(t1, t2, t3, t4);
+    return coeff1*t1 + coeff2*t2;
+}
--- a/platforms/cpu/src/CpuNonbondedForceVec8.cpp
+++ b/platforms/cpu/src/CpuNonbondedForceVec8.cpp
+
+/* Portions copyright (c) 2006-2013 Stanford University and Simbios.
+ * Contributors: Pande Group
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "SimTKOpenMMCommon.h"
+#include "SimTKOpenMMUtilities.h"
+#include "CpuNonbondedForceVec8.h"
+#include "openmm/internal/hardware.h"
+
+using namespace std;
+using namespace OpenMM;
+
+#ifndef __AVX__
+bool isVec8Supported() {
+    return false;
+}
+
+CpuNonbondedForce* createCpuNonbondedForceVec8() {
+    throw OpenMMException("Internal error: OpenMM was compiled without AVX support");
+}
+#else
+/**
+ * Check whether 8 component vectors are supported with the current CPU.
+ */
+bool isVec8Supported() {
+    // Make sure the CPU supports AVX.
+    
+    int cpuInfo[4];
+    cpuid(cpuInfo, 0);
+    if (cpuInfo[0] >= 1) {
+        cpuid(cpuInfo, 1);
+        return ((cpuInfo[2] & ((int) 1 << 28)) != 0);
+    }
+    return false;
+}
+
+/**
+ * Factory method to create a CpuNonbondedForceVec8.
+ */
+CpuNonbondedForce* createCpuNonbondedForceVec8() {
+    return new CpuNonbondedForceVec8();
+}
+
+/**---------------------------------------------------------------------------------------
+
+   CpuNonbondedForceVec8 constructor
+
+   --------------------------------------------------------------------------------------- */
+
+CpuNonbondedForceVec8::CpuNonbondedForceVec8() {
+}
+
+void CpuNonbondedForceVec8::calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
+    // Load the positions and parameters of the atoms in the block.
+    
+    int blockAtom[8];
+    fvec4 blockAtomPosq[8];
+    fvec8 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
+    fvec8 blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge;
+    for (int i = 0; i < 8; i++) {
+        blockAtom[i] = neighborList->getSortedAtoms()[8*blockIndex+i];
+        blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
+    }
+    transpose(blockAtomPosq[0], blockAtomPosq[1], blockAtomPosq[2], blockAtomPosq[3], blockAtomPosq[4], blockAtomPosq[5], blockAtomPosq[6], blockAtomPosq[7], blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge);
+    blockAtomCharge *= ONE_4PI_EPS0;
+    fvec8 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first, atomParameters[blockAtom[4]].first, atomParameters[blockAtom[5]].first, atomParameters[blockAtom[6]].first, atomParameters[blockAtom[7]].first);
+    fvec8 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second, atomParameters[blockAtom[4]].second, atomParameters[blockAtom[5]].second, atomParameters[blockAtom[6]].second, atomParameters[blockAtom[7]].second);
+    bool needPeriodic = (periodic && (any(blockAtomX < cutoffDistance) || any(blockAtomY < cutoffDistance) || any(blockAtomZ < cutoffDistance) ||
+            any(blockAtomX > boxSize[0]-cutoffDistance) || any(blockAtomY > boxSize[1]-cutoffDistance) || any(blockAtomZ > boxSize[2]-cutoffDistance)));
+    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
+    
+    // Loop over neighbors for this block.
+    
+    const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
+    const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
+    for (int i = 0; i < (int) neighbors.size(); i++) {
+        // Load the next neighbor.
+        
+        int atom = neighbors[i];
+        
+        // Compute the distances to the block atoms.
+        
+        fvec8 dx, dy, dz, r2;
+        getDeltaR(&posq[4*atom], blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
+        ivec8 include;
+        char excl = exclusions[i];
+        if (excl == 0)
+            include = -1;
+        else
+            include = ivec8(excl&1 ? 0 : -1, excl&2 ? 0 : -1, excl&4 ? 0 : -1, excl&8 ? 0 : -1, excl&16 ? 0 : -1, excl&32 ? 0 : -1, excl&64 ? 0 : -1, excl&128 ? 0 : -1);
+        include = include & (r2 < cutoffDistance*cutoffDistance);
+        if (!any(include))
+            continue; // No interactions to compute.
+        
+        // Compute the interactions.
+        
+        fvec8 r = sqrt(r2);
+        fvec8 inverseR = fvec8(1.0f)/r;
+        fvec8 energy, dEdR;
+        float atomEpsilon = atomParameters[atom].second;
+        if (atomEpsilon != 0.0f) {
+            fvec8 sig = blockAtomSigma+atomParameters[atom].first;
+            fvec8 sig2 = inverseR*sig;
+            sig2 *= sig2;
+            fvec8 sig6 = sig2*sig2*sig2;
+            fvec8 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
+            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
+            energy = epsSig6*(sig6-1.0f);
+            if (useSwitch) {
+                fvec8 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval);
+                fvec8 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
+                fvec8 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
+                dEdR = switchValue*dEdR - energy*switchDeriv*r;
+                energy *= switchValue;
+            }
+        }
+        else {
+            energy = 0.0f;
+            dEdR = 0.0f;
+        }
+        fvec8 chargeProd = blockAtomCharge*posq[4*atom+3];
+        if (cutoff)
+            dEdR += chargeProd*(inverseR-2.0f*krf*r2);
+        else
+            dEdR += chargeProd*inverseR;
+        dEdR *= inverseR*inverseR;
+
+        // Accumulate energies.
+
+        fvec8 one(1.0f);
+        if (totalEnergy) {
+            if (cutoff)
+                energy += chargeProd*(inverseR+krf*r2-crf);
+            else
+                energy += chargeProd*inverseR;
+            energy = blend(0.0f, energy, include);
+            *totalEnergy += dot8(energy, one);
+        }
+
+        // Accumulate forces.
+
+        dEdR = blend(0.0f, dEdR, include);
+        fvec8 fx = dx*dEdR;
+        fvec8 fy = dy*dEdR;
+        fvec8 fz = dz*dEdR;
+        blockAtomForceX += fx;
+        blockAtomForceY += fy;
+        blockAtomForceZ += fz;
+        float* atomForce = forces+4*atom;
+        atomForce[0] -= dot8(fx, one);
+        atomForce[1] -= dot8(fy, one);
+        atomForce[2] -= dot8(fz, one);
+    }
+    
+    // Record the forces on the block atoms.
+
+    fvec4 f[8];
+    transpose(blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f, f[0], f[1], f[2], f[3], f[4], f[5], f[6], f[7]);
+    for (int j = 0; j < 8; j++)
+        (fvec4(forces+4*blockAtom[j])+f[j]).store(forces+4*blockAtom[j]);
+  }
+
+void CpuNonbondedForceVec8::calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
+    // Load the positions and parameters of the atoms in the block.
+    
+    int blockAtom[8];
+    fvec4 blockAtomPosq[8];
+    fvec8 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
+    fvec8 blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge;
+    for (int i = 0; i < 8; i++) {
+        blockAtom[i] = neighborList->getSortedAtoms()[8*blockIndex+i];
+        blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
+    }
+    transpose(blockAtomPosq[0], blockAtomPosq[1], blockAtomPosq[2], blockAtomPosq[3], blockAtomPosq[4], blockAtomPosq[5], blockAtomPosq[6], blockAtomPosq[7], blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge);
+    blockAtomCharge *= ONE_4PI_EPS0;
+    fvec8 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first, atomParameters[blockAtom[4]].first, atomParameters[blockAtom[5]].first, atomParameters[blockAtom[6]].first, atomParameters[blockAtom[7]].first);
+    fvec8 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second, atomParameters[blockAtom[4]].second, atomParameters[blockAtom[5]].second, atomParameters[blockAtom[6]].second, atomParameters[blockAtom[7]].second);
+    bool needPeriodic = (periodic && (any(blockAtomX < cutoffDistance) || any(blockAtomY < cutoffDistance) || any(blockAtomZ < cutoffDistance) ||
+            any(blockAtomX > boxSize[0]-cutoffDistance) || any(blockAtomY > boxSize[1]-cutoffDistance) || any(blockAtomZ > boxSize[2]-cutoffDistance)));
+    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
+    
+    // Loop over neighbors for this block.
+    
+    const vector<int>& neighbors = neighborList->getBlockNeighbors(blockIndex);
+    const vector<char>& exclusions = neighborList->getBlockExclusions(blockIndex);
+    for (int i = 0; i < (int) neighbors.size(); i++) {
+        // Load the next neighbor.
+        
+        int atom = neighbors[i];
+        
+        // Compute the distances to the block atoms.
+        
+        fvec8 dx, dy, dz, r2;
+        getDeltaR(&posq[4*atom], blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
+        ivec8 include;
+        char excl = exclusions[i];
+        if (excl == 0)
+            include = -1;
+        else
+            include = ivec8(excl&1 ? 0 : -1, excl&2 ? 0 : -1, excl&4 ? 0 : -1, excl&8 ? 0 : -1, excl&16 ? 0 : -1, excl&32 ? 0 : -1, excl&64 ? 0 : -1, excl&128 ? 0 : -1);
+        include = include & (r2 < cutoffDistance*cutoffDistance);
+        if (!any(include))
+            continue; // No interactions to compute.
+        
+        // Compute the interactions.
+        
+        fvec8 r = sqrt(r2);
+        fvec8 inverseR = fvec8(1.0f)/r;
+        fvec8 energy, dEdR;
+        float atomEpsilon = atomParameters[atom].second;
+        if (atomEpsilon != 0.0f) {
+            fvec8 sig = blockAtomSigma+atomParameters[atom].first;
+            fvec8 sig2 = inverseR*sig;
+            sig2 *= sig2;
+            fvec8 sig6 = sig2*sig2*sig2;
+            fvec8 epsSig6 = blockAtomEpsilon*atomEpsilon*sig6;
+            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
+            energy = epsSig6*(sig6-1.0f);
+            if (useSwitch) {
+                fvec8 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval);
+                fvec8 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
+                fvec8 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
+                dEdR = switchValue*dEdR - energy*switchDeriv*r;
+                energy *= switchValue;
+            }
+        }
+        else {
+            energy = 0.0f;
+            dEdR = 0.0f;
+        }
+        fvec8 chargeProd = blockAtomCharge*posq[4*atom+3];
+        dEdR += chargeProd*inverseR*ewaldScaleFunction(r);
+        dEdR *= inverseR*inverseR;        
+
+        // Accumulate energies.
+
+        fvec8 one(1.0f);
+        if (totalEnergy) {
+            energy += chargeProd*inverseR*erfcApprox(alphaEwald*r);
+            energy = blend(0.0f, energy, include);
+            *totalEnergy += dot8(energy, one);
+        }
+
+        // Accumulate forces.
+
+        dEdR = blend(0.0f, dEdR, include);
+        fvec8 fx = dx*dEdR;
+        fvec8 fy = dy*dEdR;
+        fvec8 fz = dz*dEdR;
+        blockAtomForceX += fx;
+        blockAtomForceY += fy;
+        blockAtomForceZ += fz;
+        float* atomForce = forces+4*atom;
+        atomForce[0] -= dot8(fx, one);
+        atomForce[1] -= dot8(fy, one);
+        atomForce[2] -= dot8(fz, one);
+    }
+    
+    // Record the forces on the block atoms.
+    
+    fvec4 f[8];
+    transpose(blockAtomForceX, blockAtomForceY, blockAtomForceZ, 0.0f, f[0], f[1], f[2], f[3], f[4], f[5], f[6], f[7]);
+    for (int j = 0; j < 8; j++)
+        (fvec4(forces+4*blockAtom[j])+f[j]).store(forces+4*blockAtom[j]);
+}
+
+void CpuNonbondedForceVec8::getDeltaR(const float* posI, const fvec8& x, const fvec8& y, const fvec8& z, fvec8& dx, fvec8& dy, fvec8& dz, fvec8& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
+    dx = x-posI[0];
+    dy = y-posI[1];
+    dz = z-posI[2];
+    if (periodic) {
+        dx -= round(dx*invBoxSize[0])*boxSize[0];
+        dy -= round(dy*invBoxSize[1])*boxSize[1];
+        dz -= round(dz*invBoxSize[2])*boxSize[2];
+    }
+    r2 = dx*dx + dy*dy + dz*dz;
+}
+
+fvec8 CpuNonbondedForceVec8::erfcApprox(fvec8 x) {
+    // This approximation for erfc is from Abramowitz and Stegun (1964) p. 299.  They cite the following as
+    // the original source: C. Hastings, Jr., Approximations for Digital Computers (1955).  It has a maximum
+    // error of 3e-7.
+
+    fvec8 t = 1.0f+(0.0705230784f+(0.0422820123f+(0.0092705272f+(0.0001520143f+(0.0002765672f+0.0000430638f*x)*x)*x)*x)*x)*x;
+    t *= t;
+    t *= t;
+    t *= t;
+    return 1.0f/(t*t);
+}
+
+fvec8 CpuNonbondedForceVec8::ewaldScaleFunction(fvec8 x) {
+    // Compute the tabulated Ewald scale factor: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
+
+    fvec8 x1 = x*ewaldDXInv;
+    ivec8 index = min(floor(x1), NUM_TABLE_POINTS);
+    fvec8 coeff2 = x1-index;
+    fvec8 coeff1 = 1.0f-coeff2;
+    ivec4 indexLower = index.lowerVec();
+    ivec4 indexUpper = index.upperVec();
+    fvec4 t1(&ewaldScaleTable[indexLower[0]]);
+    fvec4 t2(&ewaldScaleTable[indexLower[1]]);
+    fvec4 t3(&ewaldScaleTable[indexLower[2]]);
+    fvec4 t4(&ewaldScaleTable[indexLower[3]]);
+    fvec4 t5(&ewaldScaleTable[indexUpper[0]]);
+    fvec4 t6(&ewaldScaleTable[indexUpper[1]]);
+    fvec4 t7(&ewaldScaleTable[indexUpper[2]]);
+    fvec4 t8(&ewaldScaleTable[indexUpper[3]]);
+    fvec8 s1, s2, s3, s4;
+    transpose(t1, t2, t3, t4, t5, t6, t7, t8, s1, s2, s3, s4);
+    return coeff1*s1 + coeff2*s2;
+}
+#endif
--- a/platforms/cpu/src/CpuPlatform.cpp
+++ b/platforms/cpu/src/CpuPlatform.cpp
@@ -53,6 +53,7 @@ CpuPlatform::CpuPlatform() {
    registerKernelFactory(CalcForcesAndEnergyKernel::Name(), factory);
    registerKernelFactory(CalcNonbondedForceKernel::Name(), factory);
    registerKernelFactory(CalcGBSAOBCForceKernel::Name(), factory);
+    registerKernelFactory(IntegrateLangevinStepKernel::Name(), factory);
 }

 double CpuPlatform::getSpeed() const {

--- a/platforms/cpu/src/CpuRandom.cpp
+++ b/platforms/cpu/src/CpuRandom.cpp
+/* Portions copyright (c) 2013 Stanford University and Simbios.
+ * Authors: Peter Eastman
+ * Contributors: 
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "CpuRandom.h"
+#include "openmm/OpenMMException.h"
+#include <cmath>
+
+using namespace std;
+using namespace OpenMM;
+
+CpuRandom::CpuRandom() : hasInitialized(false) {
+}
+
+CpuRandom::~CpuRandom() {
+    for (int i = 0; i < threadRandom.size(); i++)
+        delete threadRandom[i];
+}
+
+void CpuRandom::initialize(int seed, int numThreads) {
+    if (hasInitialized) {
+        if (seed == randomSeed)
+            return; // Already initialized with the same seed.
+        throw OpenMMException("Random number generator initialized twice with different seeds");
+    }
+    randomSeed = seed;
+    hasInitialized = true;
+    threadRandom.resize(numThreads);
+    nextGaussian.resize(numThreads);
+    nextGaussianIsValid.resize(numThreads, false);
+
+    // Use a quick and dirty RNG to pick seeds for the real random number generator.
+
+    unsigned int r = (unsigned int) seed;
+    for (int i = 0; i < numThreads; i++) {
+        r = (1664525*r + 1013904223) & 0xFFFFFFFF;
+        threadRandom[i] = new OpenMM_SFMT::SFMT();
+        init_gen_rand(r, *threadRandom[i]);
+    }
+}
+
+float CpuRandom::getGaussianRandom(int threadIndex) {
+    if (nextGaussianIsValid[threadIndex]) {
+        nextGaussianIsValid[threadIndex] = false;
+        return nextGaussian[threadIndex];
+    }
+    
+    // Use the polar form of the Box-Muller transformation to generate two Gaussian random numbers.
+    
+    float x, y, r2;
+    do {
+        x = 2.0f*(float) genrand_real2(*threadRandom[threadIndex])-1.0f;
+        y = 2.0f*(float) genrand_real2(*threadRandom[threadIndex])-1.0f;
+        r2 = x*x + y*y;
+    } while (r2 >= 1.0f || r2 == 0.0f);
+    float multiplier = sqrtf((-2.0f*logf(r2))/r2);
+    nextGaussian[threadIndex] = y*multiplier;
+    nextGaussianIsValid[threadIndex] = true;
+    return x*multiplier;
+}
+
+float CpuRandom::getUniformRandom(int threadIndex) {
+    return genrand_real2(*threadRandom[threadIndex]);
+}