Merge branch 'master' of github.com:SimTk/openmm

99ef4344 · Lee-Ping Wang · 471bea82 · 79fd69c3 · 99ef4344 · 99ef4344
Commit 99ef4344 authored Oct 24, 2013 by Lee-Ping Wang
20 changed files
--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
@@ -1639,6 +1639,24 @@ void CudaCalcAmoebaMultipoleForceKernel::ensureMultipolesValid(ContextImpl& cont
        context.calcForcesAndEnergy(false, false, -1);
 }
+void CudaCalcAmoebaMultipoleForceKernel::getInducedDipoles(ContextImpl& context, vector<Vec3>& dipoles) {
+    ensureMultipolesValid(context);
+    int numParticles = cu.getNumAtoms();
+    dipoles.resize(numParticles);
+    if (cu.getUseDoublePrecision()) {
+        vector<double> d;
+        inducedDipole->download(d);
+        for (int i = 0; i < numParticles; i++)
+            dipoles[i] = Vec3(d[3*i], d[3*i+1], d[3*i+2]);
+    }
+    else {
+        vector<float> d;
+        inducedDipole->download(d);
+        for (int i = 0; i < numParticles; i++)
+            dipoles[i] = Vec3(d[3*i], d[3*i+1], d[3*i+2]);
+    }
+}
 void CudaCalcAmoebaMultipoleForceKernel::getElectrostaticPotential(ContextImpl& context, const vector<Vec3>& inputGrid, vector<double>& outputElectrostaticPotential) {
    ensureMultipolesValid(context);
    int numPoints = inputGrid.size();

--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
@@ -327,6 +327,13 @@ public:
     * @return the potential energy due to the force
     */
    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+    /**
+     * Get the induced dipole moments of all particles.
+     * 
+     * @param context    the Context for which to get the induced dipoles
+     * @param dipoles    the induced dipole moment of particle i is stored into the i'th element
+     */
+    void getInducedDipoles(ContextImpl& context, std::vector<Vec3>& dipoles);
    /**
     * Execute the kernel to calculate the electrostatic potential
     *

--- a/plugins/amoeba/platforms/cuda/tests/TestCudaAmoebaMultipoleForce.cpp
+++ b/plugins/amoeba/platforms/cuda/tests/TestCudaAmoebaMultipoleForce.cpp
@@ -2700,6 +2700,40 @@ static void testPMEMutualPolarizationLargeWater( FILE* log ) {
 }
+// test querying particle induced dipoles
+static void testParticleInducedDipoles() {
+    int numberOfParticles     = 8;
+    int inputPmeGridDimension = 0;
+    double cutoff             = 9000000.0;
+    std::vector<Vec3> forces;
+    double energy;
+    System system;
+    AmoebaMultipoleForce* amoebaMultipoleForce = new AmoebaMultipoleForce();;
+    setupMultipoleAmmonia(system, amoebaMultipoleForce, AmoebaMultipoleForce::NoCutoff, AmoebaMultipoleForce::Mutual, 
+                                             cutoff, inputPmeGridDimension);
+    LangevinIntegrator integrator(0.0, 0.1, 0.01);
+    Context context(system, integrator, Platform::getPlatformByName("CUDA"));
+    getForcesEnergyMultipoleAmmonia(context, forces, energy);
+    std::vector<Vec3> dipole;
+    amoebaMultipoleForce->getInducedDipoles(context, dipole);
+    // Compare to values calculated by TINKER.
+    std::vector<Vec3> expectedDipole(numberOfParticles);
+    expectedDipole[0] = Vec3(0.0031710288, 9.3687453e-7, -0.0006919963);
+    expectedDipole[1] = Vec3(8.0279737504e-5, -0.000279376, 4.778060103e-5);
+    expectedDipole[2] = Vec3(0.000079322, 0.0002789804, 4.8696656126e-5);
+    expectedDipole[3] = Vec3(-0.0001407394, 1.540638116e-6, -0.0007077775);
+    expectedDipole[4] = Vec3(0.0019564439, -1.0409717e-7, 0.0007332188);
+    expectedDipole[5] = Vec3(0.0008213891, -0.0007749618, -0.0003883865);
+    expectedDipole[6] = Vec3(0.0046133992, -7.2868019e-7, 0.0002500622);
+    expectedDipole[7] = Vec3(0.0008204731, 0.0007772727, -0.0003856176);
+    for (int i = 0; i < numberOfParticles; i++)
+        ASSERT_EQUAL_VEC(expectedDipole[i], dipole[i], 1e-4);
+}
 // test computation of system multipole moments
 static void testSystemMultipoleMoments( FILE* log ) {
@@ -2963,6 +2997,10 @@ int main(int argc, char* argv[]) {
        testMultipoleIonsAndWaterPMEMutualPolarization( log );
        testMultipoleIonsAndWaterPMEDirectPolarization( log );
+        // test querying induced dipoles
+        testParticleInducedDipoles();
        // test computation of system multipole moments
        testSystemMultipoleMoments( log );

--- a/plugins/amoeba/platforms/reference/src/AmoebaReferenceKernels.cpp
+++ b/plugins/amoeba/platforms/reference/src/AmoebaReferenceKernels.cpp
@@ -683,6 +683,25 @@ double ReferenceCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bo
    return static_cast<double>(energy);
 }
+void ReferenceCalcAmoebaMultipoleForceKernel::getInducedDipoles(ContextImpl& context, vector<Vec3>& outputDipoles) {
+    int numParticles = context.getSystem().getNumParticles();
+    outputDipoles.resize(numParticles);
+    // Create an AmoebaReferenceMultipoleForce to do the calculation.
+    AmoebaReferenceMultipoleForce* amoebaReferenceMultipoleForce = setupAmoebaReferenceMultipoleForce( context );
+    vector<RealVec>& posData = extractPositions(context);
+    // Retrieve the induced dipoles.
+    vector<RealVec> inducedDipoles;
+    amoebaReferenceMultipoleForce->calculateInducedDipoles(posData, charges, dipoles, quadrupoles, tholes,
+            dampingFactors, polarity, axisTypes, multipoleAtomZs, multipoleAtomXs, multipoleAtomYs, multipoleAtomCovalentInfo, inducedDipoles);
+    for (int i = 0; i < numParticles; i++)
+        outputDipoles[i] = inducedDipoles[i];
+    delete amoebaReferenceMultipoleForce;
+}
 void ReferenceCalcAmoebaMultipoleForceKernel::getElectrostaticPotential(ContextImpl& context, const std::vector< Vec3 >& inputGrid,
                                                                        std::vector< double >& outputElectrostaticPotential ){
@@ -704,8 +723,6 @@ void ReferenceCalcAmoebaMultipoleForceKernel::getElectrostaticPotential(ContextI
    }
    delete amoebaReferenceMultipoleForce;
-    return;
 }
 void ReferenceCalcAmoebaMultipoleForceKernel::getSystemMultipoleMoments(ContextImpl& context, std::vector< double >& outputMultipoleMoments){
@@ -726,8 +743,6 @@ void ReferenceCalcAmoebaMultipoleForceKernel::getSystemMultipoleMoments(ContextI
                                                                          multipoleAtomCovalentInfo, outputMultipoleMoments );
    delete amoebaReferenceMultipoleForce;
-    return;
 }
 void ReferenceCalcAmoebaMultipoleForceKernel::copyParametersToContext(ContextImpl& context, const AmoebaMultipoleForce& force) {

--- a/plugins/amoeba/platforms/reference/src/AmoebaReferenceKernels.h
+++ b/plugins/amoeba/platforms/reference/src/AmoebaReferenceKernels.h
@@ -366,6 +366,13 @@ public:
     * @return the potential energy due to the force
     */
    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+    /**
+     * Get the induced dipole moments of all particles.
+     * 
+     * @param context    the Context for which to get the induced dipoles
+     * @param dipoles    the induced dipole moment of particle i is stored into the i'th element
+     */
+    void getInducedDipoles(ContextImpl& context, std::vector<Vec3>& dipoles);
    /** 
     * Calculate the electrostatic potential given vector of grid coordinates.
     *

--- a/plugins/amoeba/platforms/reference/src/SimTKReference/AmoebaReferenceMultipoleForce.cpp
+++ b/plugins/amoeba/platforms/reference/src/SimTKReference/AmoebaReferenceMultipoleForce.cpp
@@ -1563,6 +1563,28 @@ RealOpenMM AmoebaReferenceMultipoleForce::calculateForceAndEnergy( const std::ve
    return energy;
 }
+void AmoebaReferenceMultipoleForce::calculateInducedDipoles(const std::vector<RealVec>& particlePositions,
+                                                            const std::vector<RealOpenMM>& charges,
+                                                            const std::vector<RealOpenMM>& dipoles,
+                                                            const std::vector<RealOpenMM>& quadrupoles,
+                                                            const std::vector<RealOpenMM>& tholes,
+                                                            const std::vector<RealOpenMM>& dampingFactors,
+                                                            const std::vector<RealOpenMM>& polarity,
+                                                            const std::vector<int>& axisTypes,
+                                                            const std::vector<int>& multipoleAtomZs,
+                                                            const std::vector<int>& multipoleAtomXs,
+                                                            const std::vector<int>& multipoleAtomYs,
+                                                            const std::vector< std::vector< std::vector<int> > >& multipoleAtomCovalentInfo,
+                                                            std::vector<RealVec>& outputInducedDipoles) {
+    // setup, including calculating induced dipoles
+    std::vector<MultipoleParticleData> particleData;
+    setup( particlePositions, charges, dipoles, quadrupoles, tholes,
+           dampingFactors, polarity, axisTypes, multipoleAtomZs, multipoleAtomXs, multipoleAtomYs,
+           multipoleAtomCovalentInfo, particleData );
+    outputInducedDipoles = _inducedDipole;
+}
 void AmoebaReferenceMultipoleForce::calculateAmoebaSystemMultipoleMoments( const std::vector<RealOpenMM>& masses,
                                                                           const std::vector<RealVec>& particlePositions,
                                                                           const std::vector<RealOpenMM>& charges,

--- a/plugins/amoeba/platforms/reference/src/SimTKReference/AmoebaReferenceMultipoleForce.h
+++ b/plugins/amoeba/platforms/reference/src/SimTKReference/AmoebaReferenceMultipoleForce.h
@@ -471,6 +471,38 @@ public:
                                        const std::vector< std::vector< std::vector<int> > >& multipoleAtomCovalentInfo,
                                        std::vector<OpenMM::RealVec>& forces );
+    /**
+     * Calculate particle induced dipoles.
+     *
+     * @param masses                    particle masses
+     * @param particlePositions         Cartesian coordinates of particles
+     * @param charges                   scalar charges for each particle
+     * @param dipoles                   molecular frame dipoles for each particle
+     * @param quadrupoles               molecular frame quadrupoles for each particle
+     * @param tholes                    Thole factors for each particle
+     * @param dampingFactors            damping factors for each particle
+     * @param polarity                  polarity for each particle
+     * @param axisTypes                 axis type (Z-then-X, ... ) for each particle
+     * @param multipoleAtomZs           indices of the particles specifying the molecular frame z-axis for each particle
+     * @param multipoleAtomXs           indices of the particles specifying the molecular frame x-axis for each particle
+     * @param multipoleAtomYs           indices of the particles specifying the molecular frame y-axis for each particle
+     * @param multipoleAtomCovalentInfo covalent info needed to set scaling factors
+     * @param outputInducedDipoles      output induced dipoles
+     */
+    void calculateInducedDipoles(const std::vector<OpenMM::RealVec>& particlePositions,
+                                 const std::vector<RealOpenMM>& charges,
+                                 const std::vector<RealOpenMM>& dipoles,
+                                 const std::vector<RealOpenMM>& quadrupoles,
+                                 const std::vector<RealOpenMM>& tholes,
+                                 const std::vector<RealOpenMM>& dampingFactors,
+                                 const std::vector<RealOpenMM>& polarity,
+                                 const std::vector<int>& axisTypes,
+                                 const std::vector<int>& multipoleAtomZs,
+                                 const std::vector<int>& multipoleAtomXs,
+                                 const std::vector<int>& multipoleAtomYs,
+                                 const std::vector< std::vector< std::vector<int> > >& multipoleAtomCovalentInfo,
+                                 std::vector<RealVec>& outputInducedDipoles);
    /**
     * Calculate system multipole moments.
     *

--- a/plugins/amoeba/platforms/reference/tests/TestReferenceAmoebaMultipoleForce.cpp
+++ b/plugins/amoeba/platforms/reference/tests/TestReferenceAmoebaMultipoleForce.cpp
@@ -2605,6 +2605,40 @@ static void testPMEMutualPolarizationLargeWater( FILE* log ) {
 }
+// test querying particle induced dipoles
+static void testParticleInducedDipoles() {
+    int numberOfParticles     = 8;
+    int inputPmeGridDimension = 0;
+    double cutoff             = 9000000.0;
+    std::vector<Vec3> forces;
+    double energy;
+    System system;
+    AmoebaMultipoleForce* amoebaMultipoleForce = new AmoebaMultipoleForce();;
+    setupMultipoleAmmonia(system, amoebaMultipoleForce, AmoebaMultipoleForce::NoCutoff, AmoebaMultipoleForce::Mutual, 
+                                             cutoff, inputPmeGridDimension);
+    LangevinIntegrator integrator(0.0, 0.1, 0.01);
+    Context context(system, integrator, Platform::getPlatformByName("Reference"));
+    getForcesEnergyMultipoleAmmonia(context, forces, energy);
+    std::vector<Vec3> dipole;
+    amoebaMultipoleForce->getInducedDipoles(context, dipole);
+    // Compare to values calculated by TINKER.
+    std::vector<Vec3> expectedDipole(numberOfParticles);
+    expectedDipole[0] = Vec3(0.0031710288, 9.3687453e-7, -0.0006919963);
+    expectedDipole[1] = Vec3(8.0279737504e-5, -0.000279376, 4.778060103e-5);
+    expectedDipole[2] = Vec3(0.000079322, 0.0002789804, 4.8696656126e-5);
+    expectedDipole[3] = Vec3(-0.0001407394, 1.540638116e-6, -0.0007077775);
+    expectedDipole[4] = Vec3(0.0019564439, -1.0409717e-7, 0.0007332188);
+    expectedDipole[5] = Vec3(0.0008213891, -0.0007749618, -0.0003883865);
+    expectedDipole[6] = Vec3(0.0046133992, -7.2868019e-7, 0.0002500622);
+    expectedDipole[7] = Vec3(0.0008204731, 0.0007772727, -0.0003856176);
+    for (int i = 0; i < numberOfParticles; i++)
+        ASSERT_EQUAL_VEC(expectedDipole[i], dipole[i], 1e-4);
+}
 // test computation of system multipole moments
 static void testSystemMultipoleMoments( FILE* log ) {
@@ -2778,6 +2812,10 @@ int main( int numberOfArguments, char* argv[] ) {
        testMultipoleAmmoniaDirectPolarization( log );
+        // test querying induced dipoles
+        testParticleInducedDipoles();
        // test mutual polarization, no cutoff
        testMultipoleAmmoniaMutualPolarization( log );

--- a/plugins/cpupme/src/CpuPmeKernels.cpp
+++ b/plugins/cpupme/src/CpuPmeKernels.cpp
@@ -34,9 +34,10 @@
 #endif
 #include "CpuPmeKernels.h"
 #include "SimTKOpenMMRealType.h"
+#include "openmm/internal/hardware.h"
+#include "openmm/internal/vectorize.h"
 #include <cmath>
 #include <cstring>
-#include <smmintrin.h>
 using namespace OpenMM;
 using namespace std;
@@ -46,145 +47,73 @@ static const int PME_ORDER = 5;
 bool CpuCalcPmeReciprocalForceKernel::hasInitializedThreads = false;
 int CpuCalcPmeReciprocalForceKernel::numThreads = 0;
-#define EXTRACT_FLOAT(v, element) _mm_cvtss_f32(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, element)))
-// Define function to get the number of processors.
-#ifdef __APPLE__
-   #include <sys/sysctl.h>
-   #include <dlfcn.h>
-#else
-   #ifdef WIN32
-      #include <windows.h>
-   #else
-      #include <dlfcn.h>
-      #include <unistd.h>
-   #endif
-#endif
-static int getNumProcessors() {
-#ifdef __APPLE__
-    int ncpu;
-    size_t len = 4;
-    if (sysctlbyname("hw.logicalcpu", &ncpu, &len, NULL, 0) == 0)
-       return ncpu;
-    else
-       return 1;
-#else
-#ifdef WIN32
-    SYSTEM_INFO siSysInfo;
-    int ncpu;
-    GetSystemInfo(&siSysInfo);
-    ncpu = siSysInfo.dwNumberOfProcessors;
-    if (ncpu < 1)
-        ncpu = 1;
-    return ncpu;
-#else
-    long nProcessorsOnline = sysconf(_SC_NPROCESSORS_ONLN);
-    if (nProcessorsOnline == -1)
-        return 1;
-    else
-        return (int) nProcessorsOnline;
-#endif
-#endif
-}
-// Define a function to check the CPU's capabilities.
-#ifdef _WIN32
-#define cpuid __cpuid
-#else
-static void cpuid(int cpuInfo[4], int infoType){
-#ifdef __LP64__
-    __asm__ __volatile__ (
-        "cpuid":
-        "=a" (cpuInfo[0]),
-        "=b" (cpuInfo[1]),
-        "=c" (cpuInfo[2]),
-        "=d" (cpuInfo[3]) :
-        "a" (infoType)
-    );
-#else
-    __asm__ __volatile__ (
-        "pushl %%ebx\n"
-        "cpuid\n"
-        "movl %%ebx, %1\n"
-        "popl %%ebx\n" :
-        "=a" (cpuInfo[0]),
-        "=r" (cpuInfo[1]),
-        "=c" (cpuInfo[2]),
-        "=d" (cpuInfo[3]) :
-        "a" (infoType)
-    );
-#endif
-}
-#endif
 static void spreadCharge(int start, int end, float* posq, float* grid, int gridx, int gridy, int gridz, int numParticles, Vec3 periodicBoxSize) {
    float temp[4];
-    __m128 boxSize = _mm_set_ps(0, (float) periodicBoxSize[2], (float) periodicBoxSize[1], (float) periodicBoxSize[0]);
+    fvec4 boxSize((float) periodicBoxSize[0], (float) periodicBoxSize[1], (float) periodicBoxSize[2], 0);
-    __m128 invBoxSize = _mm_set_ps(0, (float) (1/periodicBoxSize[2]), (float) (1/periodicBoxSize[1]), (float) (1/periodicBoxSize[0]));
+    fvec4 invBoxSize((float) (1/periodicBoxSize[0]), (float) (1/periodicBoxSize[1]), (float) (1/periodicBoxSize[2]), 0);
-    __m128 gridSize = _mm_set_ps(0, gridz, gridy, gridx);
+    fvec4 gridSize(gridx, gridy, gridz, 0);
-    __m128i gridSizeInt = _mm_set_epi32(0, gridz, gridy, gridx);
+    ivec4 gridSizeInt(gridx, gridy, gridz, 0);
-    __m128 one  = _mm_set1_ps(1);
+    fvec4 one(1);
-    __m128 scale = _mm_set1_ps(1.0f/(PME_ORDER-1));
+    fvec4 scale(1.0f/(PME_ORDER-1));
    const float epsilonFactor = sqrt(ONE_4PI_EPS0);
    memset(grid, 0, sizeof(float)*gridx*gridy*gridz);
    for (int i = start; i < end; i++) {
        // Find the position relative to the nearest grid point.
-        __m128 pos = _mm_loadu_ps(&posq[4*i]);
+        fvec4 pos(&posq[4*i]);
-        __m128 posFloor = _mm_floor_ps(_mm_mul_ps(pos, invBoxSize));
+        fvec4 posFloor = floor(pos*invBoxSize);
-        __m128 posInBox = _mm_sub_ps(pos, _mm_mul_ps(boxSize, posFloor));
+        fvec4 posInBox = pos-boxSize*posFloor;
-        __m128 t = _mm_mul_ps(_mm_mul_ps(posInBox, invBoxSize), gridSize);
+        fvec4 t = posInBox*invBoxSize*gridSize;
-        __m128i ti = _mm_cvttps_epi32(t);
+        ivec4 ti = t;
-        __m128 dr = _mm_sub_ps(t, _mm_cvtepi32_ps(ti));
+        fvec4 dr = t-ti;
-        __m128i gridIndex = _mm_sub_epi32(ti, _mm_and_si128(gridSizeInt, _mm_cmpeq_epi32(ti, gridSizeInt)));
+        ivec4 gridIndex = ti-(gridSizeInt&ti==gridSizeInt);
        // Compute the B-spline coefficients.
-        __m128 data[PME_ORDER];
+        fvec4 data[PME_ORDER];
-        data[PME_ORDER-1] = _mm_setzero_ps();
+        data[PME_ORDER-1] = 0.0f;
        data[1] = dr;
-        data[0] = _mm_sub_ps(one, dr);
+        data[0] = one-dr;
        for (int j = 3; j < PME_ORDER; j++) {
-            __m128 div = _mm_set1_ps(1.0f/(j-1));
+            fvec4 div(1.0f/(j-1));
-            data[j-1] = _mm_mul_ps(_mm_mul_ps(div, dr), data[j-2]);
+            data[j-1] = div*dr*data[j-2];
            for (int k = 1; k < j-1; k++)
-                data[j-k-1] = _mm_mul_ps(div, _mm_add_ps(_mm_mul_ps(_mm_add_ps(dr, _mm_set1_ps(k)), data[j-k-2]), _mm_mul_ps(_mm_sub_ps(_mm_set1_ps(j-k), dr), data[j-k-1])));
+                data[j-k-1] = div*((dr+k)*data[j-k-2]+(fvec4(j-k)-dr)*data[j-k-1]);
-            data[0] = _mm_mul_ps(_mm_mul_ps(div, _mm_sub_ps(one, dr)), data[0]);
+            data[0] = div*(one-dr)*data[0];
        }
-        data[PME_ORDER-1] = _mm_mul_ps(_mm_mul_ps(scale, dr), data[PME_ORDER-2]);
+        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
        for (int j = 1; j < (PME_ORDER-1); j++)
-            data[PME_ORDER-j-1] = _mm_mul_ps(scale, _mm_add_ps(_mm_mul_ps(_mm_add_ps(dr, _mm_set1_ps(j)), data[PME_ORDER-j-2]), _mm_mul_ps(_mm_sub_ps(_mm_set1_ps(PME_ORDER-j), dr), data[PME_ORDER-j-1])));
+            data[PME_ORDER-j-1] = scale*((dr+j)*data[PME_ORDER-j-2]+(fvec4(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
-        data[0] = _mm_mul_ps(_mm_mul_ps(scale, _mm_sub_ps(one, dr)), data[0]);
+        data[0] = scale*(one-dr)*data[0];
        // Spread the charges.
-        int gridIndexX = _mm_extract_epi32(gridIndex, 0);
+        int gridIndexX = gridIndex[0];
-        int gridIndexY = _mm_extract_epi32(gridIndex, 1);
+        int gridIndexY = gridIndex[1];
-        int gridIndexZ = _mm_extract_epi32(gridIndex, 2);
+        int gridIndexZ = gridIndex[2];
+        if (gridIndexX < 0)
+            return; // This happens when a simulation blows up and coordinates become NaN.
        int zindex[PME_ORDER];
        for (int j = 0; j < PME_ORDER; j++) {
            zindex[j] = gridIndexZ+j;
            zindex[j] -= (zindex[j] >= gridz ? gridz : 0);
        }
        float charge = epsilonFactor*posq[4*i+3];
-        __m128 zdata0to3 = _mm_set_ps(EXTRACT_FLOAT(data[3], 2), EXTRACT_FLOAT(data[2], 2), EXTRACT_FLOAT(data[1], 2), EXTRACT_FLOAT(data[0], 2));
+        fvec4 zdata0to3(data[0][2], data[1][2], data[2][2], data[3][2]);
-        float zdata4 = EXTRACT_FLOAT(data[4], 2);
+        float zdata4 = data[4][2];
        if (gridIndexZ+4 < gridz) {
            for (int ix = 0; ix < PME_ORDER; ix++) {
                int xbase = gridIndexX+ix;
                xbase -= (xbase >= gridx ? gridx : 0);
                xbase = xbase*gridy*gridz;
-                float xdata = charge*EXTRACT_FLOAT(data[ix], 0);
+                float xdata = charge*data[ix][0];
                for (int iy = 0; iy < PME_ORDER; iy++) {
                    int ybase = gridIndexY+iy;
                    ybase -= (ybase >= gridy ? gridy : 0);
                    ybase = xbase + ybase*gridz;
-                    float multiplier = xdata*EXTRACT_FLOAT(data[iy], 1);
+                    float multiplier = xdata*data[iy][1];
-                    __m128 add0to3 = _mm_mul_ps(zdata0to3, _mm_set1_ps(multiplier));
+                    fvec4 add0to3 = zdata0to3*multiplier;
-                    _mm_storeu_ps(&grid[ybase+gridIndexZ], _mm_add_ps(_mm_loadu_ps(&grid[ybase+gridIndexZ]), add0to3));
+                    (fvec4(&grid[ybase+gridIndexZ])+add0to3).store(&grid[ybase+gridIndexZ]);
                    grid[ybase+zindex[4]] += multiplier*zdata4;
                }
            }
@@ -194,14 +123,14 @@ static void spreadCharge(int start, int end, float* posq, float* grid, int gridx
                int xbase = gridIndexX+ix;
                xbase -= (xbase >= gridx ? gridx : 0);
                xbase = xbase*gridy*gridz;
-                float xdata = charge*EXTRACT_FLOAT(data[ix], 0);
+                float xdata = charge*data[ix][0];
                for (int iy = 0; iy < PME_ORDER; iy++) {
                    int ybase = gridIndexY+iy;
                    ybase -= (ybase >= gridy ? gridy : 0);
                    ybase = xbase + ybase*gridz;
-                    float multiplier = xdata*EXTRACT_FLOAT(data[iy], 1);
+                    float multiplier = xdata*data[iy][1];
-                    __m128 add0to3 = _mm_mul_ps(zdata0to3, _mm_set1_ps(multiplier));
+                    fvec4 add0to3 = zdata0to3*multiplier;
-                    _mm_store_ps(temp, add0to3);
+                    add0to3.store(temp);
                    grid[ybase+zindex[0]] += temp[0];
                    grid[ybase+zindex[1]] += temp[1];
                    grid[ybase+zindex[2]] += temp[2];
@@ -314,84 +243,86 @@ static void reciprocalConvolution(int start, int end, fftwf_complex* grid, int g
 }
 static void interpolateForces(int start, int end, float* posq, float* force, float* grid, int gridx, int gridy, int gridz, int numParticles, Vec3 periodicBoxSize) {
-    __m128 boxSize = _mm_set_ps(0, (float) periodicBoxSize[2], (float) periodicBoxSize[1], (float) periodicBoxSize[0]);
+    fvec4 boxSize((float) periodicBoxSize[0], (float) periodicBoxSize[1], (float) periodicBoxSize[2], 0);
-    __m128 invBoxSize = _mm_set_ps(0, (float) (1/periodicBoxSize[2]), (float) (1/periodicBoxSize[1]), (float) (1/periodicBoxSize[0]));
+    fvec4 invBoxSize((float) (1/periodicBoxSize[0]), (float) (1/periodicBoxSize[1]), (float) (1/periodicBoxSize[2]), 0);
-    __m128 gridSize = _mm_set_ps(0, gridz, gridy, gridx);
+    fvec4 gridSize(gridx, gridy, gridz, 0);
-    __m128i gridSizeInt = _mm_set_epi32(0, gridz, gridy, gridx);
+    ivec4 gridSizeInt(gridx, gridy, gridz, 0);
-    __m128 one  = _mm_set1_ps(1);
+    fvec4 one(1);
-    __m128 scale = _mm_set1_ps(1.0f/(PME_ORDER-1));
+    fvec4 scale(1.0f/(PME_ORDER-1));
    const float epsilonFactor = sqrt(ONE_4PI_EPS0);
    for (int i = start; i < end; i++) {
        // Find the position relative to the nearest grid point.
-        __m128 pos = _mm_loadu_ps(&posq[4*i]);
+        fvec4 pos(&posq[4*i]);
-        __m128 posFloor = _mm_floor_ps(_mm_mul_ps(pos, invBoxSize));
+        fvec4 posFloor = floor(pos*invBoxSize);
-        __m128 posInBox = _mm_sub_ps(pos, _mm_mul_ps(boxSize, posFloor));
+        fvec4 posInBox = pos-boxSize*posFloor;
-        __m128 t = _mm_mul_ps(_mm_mul_ps(posInBox, invBoxSize), gridSize);
+        fvec4 t = posInBox*invBoxSize*gridSize;
-        __m128i ti = _mm_cvttps_epi32(t);
+        ivec4 ti = t;
-        __m128 dr = _mm_sub_ps(t, _mm_cvtepi32_ps(ti));
+        fvec4 dr = t-ti;
-        __m128i gridIndex = _mm_sub_epi32(ti, _mm_and_si128(gridSizeInt, _mm_cmpeq_epi32(ti, gridSizeInt)));
+        ivec4 gridIndex = ti-(gridSizeInt&ti==gridSizeInt);
        // Compute the B-spline coefficients.
-        __m128 data[PME_ORDER];
+        fvec4 data[PME_ORDER];
-        __m128 ddata[PME_ORDER];
+        fvec4 ddata[PME_ORDER];
-        data[PME_ORDER-1] = _mm_setzero_ps();
+        data[PME_ORDER-1] = 0.0f;
        data[1] = dr;
-        data[0] = _mm_sub_ps(one, dr);
+        data[0] = one-dr;
        for (int j = 3; j < PME_ORDER; j++) {
-            __m128 div = _mm_set1_ps(1.0f/(j-1));
+            fvec4 div(1.0f/(j-1));
-            data[j-1] = _mm_mul_ps(_mm_mul_ps(div, dr), data[j-2]);
+            data[j-1] = div*dr*data[j-2];
            for (int k = 1; k < j-1; k++)
-                data[j-k-1] = _mm_mul_ps(div, _mm_add_ps(_mm_mul_ps(_mm_add_ps(dr, _mm_set1_ps(k)), data[j-k-2]), _mm_mul_ps(_mm_sub_ps(_mm_set1_ps(j-k), dr), data[j-k-1])));
+                data[j-k-1] = div*((dr+k)*data[j-k-2]+(fvec4(j-k)-dr)*data[j-k-1]);
-            data[0] = _mm_mul_ps(_mm_mul_ps(div, _mm_sub_ps(one, dr)), data[0]);
+            data[0] = div*(one-dr)*data[0];
        }
-        ddata[0] = _mm_sub_ps(_mm_set1_ps(0), data[0]);
+        ddata[0] = -data[0];
        for (int j = 1; j < PME_ORDER; j++)
-            ddata[j] = _mm_sub_ps(data[j-1], data[j]);
+            ddata[j] = data[j-1]-data[j];
-        data[PME_ORDER-1] = _mm_mul_ps(_mm_mul_ps(scale, dr), data[PME_ORDER-2]);
+        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
        for (int j = 1; j < (PME_ORDER-1); j++)
-            data[PME_ORDER-j-1] = _mm_mul_ps(scale, _mm_add_ps(_mm_mul_ps(_mm_add_ps(dr, _mm_set1_ps(j)), data[PME_ORDER-j-2]), _mm_mul_ps(_mm_sub_ps(_mm_set1_ps(PME_ORDER-j), dr), data[PME_ORDER-j-1])));
+            data[PME_ORDER-j-1] = scale*((dr+j)*data[PME_ORDER-j-2]+(fvec4(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
-        data[0] = _mm_mul_ps(_mm_mul_ps(scale, _mm_sub_ps(one, dr)), data[0]);
+        data[0] = scale*(one-dr)*data[0];
        // Compute the force on this atom.
-        int gridIndexX = _mm_extract_epi32(gridIndex, 0);
+        int gridIndexX = gridIndex[0];
-        int gridIndexY = _mm_extract_epi32(gridIndex, 1);
+        int gridIndexY = gridIndex[1];
-        int gridIndexZ = _mm_extract_epi32(gridIndex, 2);
+        int gridIndexZ = gridIndex[2];
+        if (gridIndexX < 0)
+            return; // This happens when a simulation blows up and coordinates become NaN.
        int zindex[PME_ORDER];
        for (int j = 0; j < PME_ORDER; j++) {
            zindex[j] = gridIndexZ+j;
            zindex[j] -= (zindex[j] >= gridz ? gridz : 0);
        }
-        __m128 zdata[PME_ORDER];
+        fvec4 zdata[PME_ORDER];
        for (int j = 0; j < PME_ORDER; j++)
-            zdata[j] = _mm_set_ps(0, EXTRACT_FLOAT(ddata[j], 2), EXTRACT_FLOAT(data[j], 2), EXTRACT_FLOAT(data[j], 2));
+            zdata[j] = fvec4(data[j][2], data[j][2], ddata[j][2], 0);
-        __m128 f = _mm_set1_ps(0);
+        fvec4 f = 0.0f;
        for (int ix = 0; ix < PME_ORDER; ix++) {
            int xbase = gridIndexX+ix;
            xbase -= (xbase >= gridx ? gridx : 0);
            xbase = xbase*gridy*gridz;
-            float dx = EXTRACT_FLOAT(data[ix], 0);
+            float dx = data[ix][0];
-            float ddx = EXTRACT_FLOAT(ddata[ix], 0);
+            float ddx = ddata[ix][0];
-            __m128 xdata = _mm_set_ps(0, dx, dx, ddx);
+            fvec4 xdata(ddx, dx, dx, 0);
            for (int iy = 0; iy < PME_ORDER; iy++) {
                int ybase = gridIndexY+iy;
                ybase -= (ybase >= gridy ? gridy : 0);
                ybase = xbase + ybase*gridz;
-                float dy = EXTRACT_FLOAT(data[iy], 1);
+                float dy = data[iy][1];
-                float ddy = EXTRACT_FLOAT(ddata[iy], 1);
+                float ddy = ddata[iy][1];
-                __m128 xydata = _mm_mul_ps(xdata, _mm_set_ps(0, dy, ddy, dy));
+                fvec4 xydata = xdata*fvec4(dy, ddy, dy, 0);
                for (int iz = 0; iz < PME_ORDER; iz++) {
-                    __m128 gridValue = _mm_set1_ps(grid[ybase+zindex[iz]]);
+                    fvec4 gridValue(grid[ybase+zindex[iz]]);
-                    f = _mm_add_ps(f, _mm_mul_ps(xydata, _mm_mul_ps(zdata[iz], gridValue)));
+                    f = f+xydata*zdata[iz]*gridValue;
                }
            }
        }
-        f = _mm_mul_ps(invBoxSize, _mm_mul_ps(gridSize, _mm_mul_ps(f, _mm_set1_ps(-epsilonFactor*posq[4*i+3]))));
+        f = invBoxSize*gridSize*f*(-epsilonFactor*posq[4*i+3]);
-        _mm_store_ps(&force[4*i], f);        
+        f.store(&force[4*i]);
    }
 }
@@ -576,10 +507,10 @@ void CpuCalcPmeReciprocalForceKernel::runThread(int index) {
            threadWait();
            int numGrids = threadData.size();
            for (int i = gridStart; i < gridEnd; i += 4) {
-                __m128 sum = _mm_load_ps(&realGrid[i]);
+                fvec4 sum(&realGrid[i]);
                for (int j = 1; j < numGrids; j++)
-                    sum = _mm_add_ps(sum, _mm_load_ps(&threadData[j]->tempGrid[i]));
+                    sum += fvec4(&threadData[j]->tempGrid[i]);
-                _mm_store_ps(&realGrid[i], sum);
+                sum.store(&realGrid[i]);
            }
            threadWait();
            if (lastBoxSize != periodicBoxSize) {

--- a/plugins/drude/platforms/reference/src/ReferenceDrudeKernels.cpp
+++ b/plugins/drude/platforms/reference/src/ReferenceDrudeKernels.cpp
@@ -34,7 +34,7 @@
 #include "openmm/OpenMMException.h"
 #include "openmm/internal/ContextImpl.h"
 #include "SimTKOpenMMUtilities.h"
-#include "ReferenceCCMAAlgorithm.h"
+#include "ReferenceConstraints.h"
 #include "ReferenceVirtualSites.h"
 #include <set>
@@ -56,20 +56,6 @@ static vector<RealVec>& extractForces(ContextImpl& context) {
    return *((vector<RealVec>*) data->forces);
 }
-static void findAnglesForCCMA(const System& system, vector<ReferenceCCMAAlgorithm::AngleInfo>& angles) {
-    for (int i = 0; i < system.getNumForces(); i++) {
-        const HarmonicAngleForce* force = dynamic_cast<const HarmonicAngleForce*>(&system.getForce(i));
-        if (force != NULL) {
-            for (int j = 0; j < force->getNumAngles(); j++) {
-                int atom1, atom2, atom3;
-                double angle, k;
-                force->getAngleParameters(j, atom1, atom2, atom3, angle, k);
-                angles.push_back(ReferenceCCMAAlgorithm::AngleInfo(atom1, atom2, atom3, (RealOpenMM)angle));
-            }
-        }
-    }
-}
 static double computeShiftedKineticEnergy(ContextImpl& context, vector<double>& inverseMasses, double timeShift, ReferenceConstraintAlgorithm* constraints) {
    const System& system = context.getSystem();
    int numParticles = system.getNumParticles();
@@ -91,7 +77,7 @@ static double computeShiftedKineticEnergy(ContextImpl& context, vector<double>&
    if (constraints != NULL) {
        constraints->setTolerance(1e-4);
-        constraints->applyToVelocities(numParticles, posData, shiftedVel, inverseMasses);
+        constraints->applyToVelocities(posData, shiftedVel, inverseMasses);
    }
    // Compute the kinetic energy.
@@ -271,21 +257,19 @@ void ReferenceIntegrateDrudeLangevinStepKernel::initialize(const System& system,
    // Prepare constraints.
-    int numConstraints = system.getNumConstraints();
+    if (system.getNumConstraints() > 0) {
-    if (numConstraints > 0) {
+        vector<pair<int, int> > constraintIndices;
-        vector<pair<int, int> > constraintIndices(numConstraints);
+        vector<RealOpenMM> constraintDistances;
-        vector<RealOpenMM> constraintDistances(numConstraints);
+        for (int i = 0; i < system.getNumConstraints(); ++i) {
-        for (int i = 0; i < numConstraints; ++i) {
            int particle1, particle2;
            double distance;
            system.getConstraintParameters(i, particle1, particle2, distance);
-            constraintIndices[i].first = particle1;
+            if (system.getParticleMass(particle1) != 0 || system.getParticleMass(particle2) != 0) {
-            constraintIndices[i].second = particle2;
+                constraintIndices.push_back(make_pair(particle1, particle2));
-            constraintDistances[i] = static_cast<RealOpenMM>(distance);
+                constraintDistances.push_back(distance);
+            }
        }
-        vector<ReferenceCCMAAlgorithm::AngleInfo> angles;
+        constraints = new ReferenceConstraints(system, (RealOpenMM) integrator.getConstraintTolerance());
-        findAnglesForCCMA(system, angles);
-        constraints = new ReferenceCCMAAlgorithm(system.getNumParticles(), numConstraints, constraintIndices, constraintDistances, particleMass, angles, (RealOpenMM)integrator.getConstraintTolerance());
    }
 }
@@ -347,7 +331,7 @@ void ReferenceIntegrateDrudeLangevinStepKernel::execute(ContextImpl& context, co
    // Apply constraints.
    if (constraints != NULL)
-        constraints->apply(numParticles, pos, xPrime, particleInvMass);
+        constraints->apply(pos, xPrime, particleInvMass);
    // Record the constrained positions and velocities.
@@ -395,21 +379,19 @@ void ReferenceIntegrateDrudeSCFStepKernel::initialize(const System& system, cons
    // Prepare constraints.
-    int numConstraints = system.getNumConstraints();
+    if (system.getNumConstraints() > 0) {
-    if (numConstraints > 0) {
+        vector<pair<int, int> > constraintIndices;
-        vector<pair<int, int> > constraintIndices(numConstraints);
+        vector<RealOpenMM> constraintDistances;
-        vector<RealOpenMM> constraintDistances(numConstraints);
+        for (int i = 0; i < system.getNumConstraints(); ++i) {
-        for (int i = 0; i < numConstraints; ++i) {
            int particle1, particle2;
            double distance;
            system.getConstraintParameters(i, particle1, particle2, distance);
-            constraintIndices[i].first = particle1;
+            if (system.getParticleMass(particle1) != 0 || system.getParticleMass(particle2) != 0) {
-            constraintIndices[i].second = particle2;
+                constraintIndices.push_back(make_pair(particle1, particle2));
-            constraintDistances[i] = static_cast<RealOpenMM>(distance);
+                constraintDistances.push_back(distance);
+            }
        }
-        vector<ReferenceCCMAAlgorithm::AngleInfo> angles;
+        constraints = new ReferenceConstraints(system, (RealOpenMM) integrator.getConstraintTolerance());
-        findAnglesForCCMA(system, angles);
-        constraints = new ReferenceCCMAAlgorithm(system.getNumParticles(), numConstraints, constraintIndices, constraintDistances, particleMass, angles, (RealOpenMM)integrator.getConstraintTolerance());
    }
    // Initialize the energy minimizer.
@@ -443,7 +425,7 @@ void ReferenceIntegrateDrudeSCFStepKernel::execute(ContextImpl& context, const D
    // Apply constraints.
    if (constraints != NULL)
-        constraints->apply(numParticles, pos, xPrime, particleInvMass);
+        constraints->apply(pos, xPrime, particleInvMass);
    // Record the constrained positions and velocities.

--- a/plugins/rpmd/openmmapi/include/openmm/RPMDIntegrator.h
+++ b/plugins/rpmd/openmmapi/include/openmm/RPMDIntegrator.h
@@ -47,6 +47,9 @@ namespace OpenMM {
 * springs to form a ring.  This allows certain quantum mechanical effects to be efficiently
 * simulated.
 * 
+ * By default this Integrator applies a PILE thermostat to the system to simulate constant
+ * temperature dynamics.  You can disable the thermostat by calling setApplyThermostat(false).
+ * 
 * Because this Integrator simulates many copies of the System at once, it must be used
 * differently from other Integrators.  Instead of setting positions and velocities by
 * calling methods of the Context, you should use the corresponding methods of the Integrator
@@ -127,6 +130,18 @@ public:
    void setFriction(double coeff) {
        friction = coeff;
    }
+    /**
+     * Get whether a thermostat is applied to the system.
+     *
+     * @return true if the thermostat is applied, false if it has been disabled
+     */
+    bool getApplyThermostat() const {
+        return applyThermostat;
+    }
+    /**
+     * Set whether a thermostat is applied to the system.
+     *
+     * @param apply    true to apply the thermostat, false to disable it
+     */
+    void setApplyThermostat(bool apply) {
+        applyThermostat = apply;
+    }
    /**
     * Get the random number seed.  See setRandomNumberSeed() for details.
     */
@@ -213,6 +228,7 @@ protected:
 private:
    double temperature, friction;
    int numCopies, randomNumberSeed;
+    bool applyThermostat;
    std::map<int, int> contractions;
    bool forcesAreValid, hasSetPosition, hasSetVelocity, isFirstStep;
    Kernel kernel;

--- a/plugins/rpmd/openmmapi/src/RPMDIntegrator.cpp
+++ b/plugins/rpmd/openmmapi/src/RPMDIntegrator.cpp
@@ -42,7 +42,7 @@ using namespace OpenMM;
 using namespace std;
 RPMDIntegrator::RPMDIntegrator(int numCopies, double temperature, double frictionCoeff, double stepSize, const map<int, int>& contractions) :
-        numCopies(numCopies), contractions(contractions), forcesAreValid(false), hasSetPosition(false), hasSetVelocity(false), isFirstStep(true) {
+        numCopies(numCopies), applyThermostat(true), contractions(contractions), forcesAreValid(false), hasSetPosition(false), hasSetVelocity(false), isFirstStep(true) {
    setTemperature(temperature);
    setFriction(frictionCoeff);
    setStepSize(stepSize);
@@ -51,7 +51,7 @@ RPMDIntegrator::RPMDIntegrator(int numCopies, double temperature, double frictio
 }
 RPMDIntegrator::RPMDIntegrator(int numCopies, double temperature, double frictionCoeff, double stepSize) :
-        numCopies(numCopies), forcesAreValid(false), hasSetPosition(false), hasSetVelocity(false), isFirstStep(true) {
+        numCopies(numCopies), applyThermostat(true), forcesAreValid(false), hasSetPosition(false), hasSetVelocity(false), isFirstStep(true) {
    setTemperature(temperature);
    setFriction(frictionCoeff);
    setStepSize(stepSize);

--- a/plugins/rpmd/platforms/cuda/src/CudaRpmdKernels.cpp
+++ b/plugins/rpmd/platforms/cuda/src/CudaRpmdKernels.cpp
@@ -205,7 +205,8 @@ void CudaIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegr
    void* frictionPtr = (useDoublePrecision ? (void*) &friction : (void*) &frictionFloat);
    int randomIndex = integration.prepareRandomNumbers(numParticles*numCopies);
    void* pileArgs[] = {&velocities->getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex, dtPtr, kTPtr, frictionPtr};
-    cu.executeKernel(pileKernel, pileArgs, numParticles*numCopies, workgroupSize);
+    if (integrator.getApplyThermostat())
+        cu.executeKernel(pileKernel, pileArgs, numParticles*numCopies, workgroupSize);
    // Update positions and velocities.
@@ -223,8 +224,10 @@ void CudaIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegr
    // Apply the PILE-L thermostat again.
-    randomIndex = integration.prepareRandomNumbers(numParticles*numCopies);
+    if (integrator.getApplyThermostat()) {
-    cu.executeKernel(pileKernel, pileArgs, numParticles*numCopies, workgroupSize);
+        randomIndex = integration.prepareRandomNumbers(numParticles*numCopies);
+        cu.executeKernel(pileKernel, pileArgs, numParticles*numCopies, workgroupSize);
+    }
    // Update the time and step count.

--- a/plugins/rpmd/platforms/cuda/tests/TestCudaRpmd.cpp
+++ b/plugins/rpmd/platforms/cuda/tests/TestCudaRpmd.cpp
@@ -431,6 +431,71 @@ void testContractions() {
    ASSERT_USUALLY_EQUAL_TOL(expectedKE, meanKE, 1e-2);
 }
+/**
+ * Check that the total energy stays constant when the thermostat is disabled.
+ *
+ * Builds a chain of harmonically bonded particles, runs an RPMDIntegrator on the
+ * CUDA platform with setApplyThermostat(false), and after every step compares the
+ * total energy (potential + kinetic energy of every copy, plus the energy of the
+ * springs connecting neighboring copies) against the value recorded after the first step.
+ */
+void testWithoutThermostat() {
+    const int numParticles = 20;
+    const int numCopies = 10;
+    const double temperature = 300.0;
+    const double mass = 2.0;
+    // Create a chain of particles.
+    System system;
+    HarmonicBondForce* bonds = new HarmonicBondForce();
+    system.addForce(bonds);
+    for (int i = 0; i < numParticles; i++) {
+        system.addParticle(mass);
+        if (i > 0)
+            bonds->addBond(i-1, i, 1.0, 1000.0);
+    }
+    // Constructor arguments: number of copies, temperature, friction coefficient, step size.
+    RPMDIntegrator integ(numCopies, temperature, 1.0, 0.001);
+    integ.setApplyThermostat(false);
+    Platform& platform = Platform::getPlatformByName("CUDA");
+    Context context(system, integ, platform);
+    // The generator is seeded with the fixed value 0 (presumably so the starting positions are reproducible).
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+    // Give every copy its own positions: particles spaced 0.95 apart along x, with random y/z offsets scaled by 0.01.
+    vector<vector<Vec3> > positions(numCopies);
+    for (int i = 0; i < numCopies; i++) {
+        positions[i].resize(numParticles);
+        for (int j = 0; j < numParticles; j++)
+            positions[i][j] = Vec3(0.95*j, 0.01*genrand_real2(sfmt), 0.01*genrand_real2(sfmt));
+        integ.setPositions(i, positions[i]);
+    }
+    // Simulate it and see if the energy remains constant.
+    double initialEnergy;
+    int numSteps = 100;
+    // NOTE(review): 1.054571628e-34 looks like hbar in J*s, with the trailing factor converting it to the
+    // unit system used by BOLTZ -- confirm against the kernel's own definition of hbar.
+    const double hbar = 1.054571628e-34*AVOGADRO/(1000*1e-12);
+    // wn = numCopies*kT/hbar; mass*wn^2 is the stiffness of the springs that join neighboring copies.
+    const double wn = numCopies*BOLTZ*temperature/hbar;
+    const double springConstant = mass*wn*wn;
+    for (int i = 0; i < numSteps; i++) {
+        integ.step(1);
+        // Sum the energies of all the copies.
+        double energy = 0.0;
+        for (int j = 0; j < numCopies; j++) {
+            State state = integ.getState(j, State::Positions | State::Energy);
+            positions[j] = state.getPositions();
+            energy += state.getPotentialEnergy()+state.getKineticEnergy();
+        }
+        // Add the energy from the springs connecting copies.
+        for (int j = 0; j < numCopies; j++) {
+            // Copy 0 is connected to the last copy, so the copies form a closed ring.
+            int previous = (j == 0 ? numCopies-1 : j-1);
+            for (int k = 0; k < numParticles; k++) {
+                Vec3 delta = positions[j][k]-positions[previous][k];
+                energy += 0.5*springConstant*delta.dot(delta);
+            }
+        }
+        // The energy after the first step is the reference; every later step is compared against it.
+        if (i == 0)
+            initialEnergy = energy;
+        else
+            ASSERT_EQUAL_TOL(initialEnergy, energy, 1e-4);
+    }
+}
 int main(int argc, char* argv[]) {
    try {
        registerRPMDCudaKernelFactories();
@@ -441,6 +506,7 @@ int main(int argc, char* argv[]) {
        testCMMotionRemoval();
        testVirtualSites();
        testContractions();
+        testWithoutThermostat();
    }
    catch(const std::exception& e) {
        std::cout << "exception: " << e.what() << std::endl;

--- a/plugins/rpmd/platforms/opencl/src/OpenCLRpmdKernels.cpp
+++ b/plugins/rpmd/platforms/opencl/src/OpenCLRpmdKernels.cpp
@@ -223,7 +223,8 @@ void OpenCLIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDInte
        stepKernel.setArg<cl_float>(4, (cl_float) (integrator.getTemperature()*BOLTZ));
        velocitiesKernel.setArg<cl_float>(2, (cl_float) dt);
    }
-    cl.executeKernel(pileKernel, numParticles*numCopies, workgroupSize);
+    if (integrator.getApplyThermostat())
+        cl.executeKernel(pileKernel, numParticles*numCopies, workgroupSize);
    // Update positions and velocities.
@@ -238,8 +239,10 @@ void OpenCLIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDInte
    // Apply the PILE-L thermostat again.
-    pileKernel.setArg<cl_uint>(2, integration.prepareRandomNumbers(numParticles*numCopies));
+    if (integrator.getApplyThermostat()) {
-    cl.executeKernel(pileKernel, numParticles*numCopies, workgroupSize);
+        pileKernel.setArg<cl_uint>(2, integration.prepareRandomNumbers(numParticles*numCopies));
+        cl.executeKernel(pileKernel, numParticles*numCopies, workgroupSize);
+    }
    // Update the time and step count.

--- a/plugins/rpmd/platforms/opencl/tests/TestOpenCLRpmd.cpp
+++ b/plugins/rpmd/platforms/opencl/tests/TestOpenCLRpmd.cpp
@@ -432,6 +432,71 @@ void testContractions() {
    ASSERT_USUALLY_EQUAL_TOL(expectedKE, meanKE, 1e-2);
 }
+/**
+ * Check that the total energy stays constant when the thermostat is disabled.
+ *
+ * Builds a chain of harmonically bonded particles, runs an RPMDIntegrator on the
+ * OpenCL platform with setApplyThermostat(false), and after every step compares the
+ * total energy (potential + kinetic energy of every copy, plus the energy of the
+ * springs connecting neighboring copies) against the value recorded after the first step.
+ */
+void testWithoutThermostat() {
+    const int numParticles = 20;
+    const int numCopies = 10;
+    const double temperature = 300.0;
+    const double mass = 2.0;
+    // Create a chain of particles.
+    System system;
+    HarmonicBondForce* bonds = new HarmonicBondForce();
+    system.addForce(bonds);
+    for (int i = 0; i < numParticles; i++) {
+        system.addParticle(mass);
+        if (i > 0)
+            bonds->addBond(i-1, i, 1.0, 1000.0);
+    }
+    // Constructor arguments: number of copies, temperature, friction coefficient, step size.
+    RPMDIntegrator integ(numCopies, temperature, 1.0, 0.001);
+    integ.setApplyThermostat(false);
+    Platform& platform = Platform::getPlatformByName("OpenCL");
+    Context context(system, integ, platform);
+    // The generator is seeded with the fixed value 0 (presumably so the starting positions are reproducible).
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+    // Give every copy its own positions: particles spaced 0.95 apart along x, with random y/z offsets scaled by 0.01.
+    vector<vector<Vec3> > positions(numCopies);
+    for (int i = 0; i < numCopies; i++) {
+        positions[i].resize(numParticles);
+        for (int j = 0; j < numParticles; j++)
+            positions[i][j] = Vec3(0.95*j, 0.01*genrand_real2(sfmt), 0.01*genrand_real2(sfmt));
+        integ.setPositions(i, positions[i]);
+    }
+    // Simulate it and see if the energy remains constant.
+    double initialEnergy;
+    int numSteps = 100;
+    // NOTE(review): 1.054571628e-34 looks like hbar in J*s, with the trailing factor converting it to the
+    // unit system used by BOLTZ -- confirm against the kernel's own definition of hbar.
+    const double hbar = 1.054571628e-34*AVOGADRO/(1000*1e-12);
+    // wn = numCopies*kT/hbar; mass*wn^2 is the stiffness of the springs that join neighboring copies.
+    const double wn = numCopies*BOLTZ*temperature/hbar;
+    const double springConstant = mass*wn*wn;
+    for (int i = 0; i < numSteps; i++) {
+        integ.step(1);
+        // Sum the energies of all the copies.
+        double energy = 0.0;
+        for (int j = 0; j < numCopies; j++) {
+            State state = integ.getState(j, State::Positions | State::Energy);
+            positions[j] = state.getPositions();
+            energy += state.getPotentialEnergy()+state.getKineticEnergy();
+        }
+        // Add the energy from the springs connecting copies.
+        for (int j = 0; j < numCopies; j++) {
+            // Copy 0 is connected to the last copy, so the copies form a closed ring.
+            int previous = (j == 0 ? numCopies-1 : j-1);
+            for (int k = 0; k < numParticles; k++) {
+                Vec3 delta = positions[j][k]-positions[previous][k];
+                energy += 0.5*springConstant*delta.dot(delta);
+            }
+        }
+        // The energy after the first step is the reference; every later step is compared against it.
+        if (i == 0)
+            initialEnergy = energy;
+        else
+            ASSERT_EQUAL_TOL(initialEnergy, energy, 1e-4);
+    }
+}
 int main(int argc, char* argv[]) {
    try {
        registerRPMDOpenCLKernelFactories();
@@ -442,6 +507,7 @@ int main(int argc, char* argv[]) {
        testCMMotionRemoval();
        testVirtualSites();
        testContractions();
+        testWithoutThermostat();
    }
    catch(const std::exception& e) {
        std::cout << "exception: " << e.what() << std::endl;

--- a/plugins/rpmd/platforms/reference/src/ReferenceRpmdKernels.cpp
+++ b/plugins/rpmd/platforms/reference/src/ReferenceRpmdKernels.cpp
@@ -135,36 +135,38 @@ void ReferenceIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDI
    const RealOpenMM twown = 2.0*nkT/hbar;
    const RealOpenMM c1_0 = exp(-halfdt*integrator.getFriction());
    const RealOpenMM c2_0 = sqrt(1.0-c1_0*c1_0);
-    for (int particle = 0; particle < numParticles; particle++) {
+    if (integrator.getApplyThermostat()) {
-        if (system.getParticleMass(particle) == 0.0)
+        for (int particle = 0; particle < numParticles; particle++) {
-            continue;
+            if (system.getParticleMass(particle) == 0.0)
-        const RealOpenMM c3_0 = c2_0*sqrt(nkT/system.getParticleMass(particle));
+                continue;
-        for (int component = 0; component < 3; component++) {
+            const RealOpenMM c3_0 = c2_0*sqrt(nkT/system.getParticleMass(particle));
-            for (int k = 0; k < numCopies; k++)
+            for (int component = 0; component < 3; component++) {
-                v[k] = t_complex(scale*velocities[k][particle][component], 0.0);
+                for (int k = 0; k < numCopies; k++)
-            fftpack_exec_1d(fft, FFTPACK_FORWARD, &v[0], &v[0]);
+                    v[k] = t_complex(scale*velocities[k][particle][component], 0.0);
+                fftpack_exec_1d(fft, FFTPACK_FORWARD, &v[0], &v[0]);
-            // Apply a local Langevin thermostat to the centroid mode.
-            v[0].re = v[0].re*c1_0 + c3_0*SimTKOpenMMUtilities::getNormallyDistributedRandomNumber();
+                // Apply a local Langevin thermostat to the centroid mode.
-            // Use critical damping white noise for the remaining modes.
+                v[0].re = v[0].re*c1_0 + c3_0*SimTKOpenMMUtilities::getNormallyDistributedRandomNumber();
-            for (int k = 1; k <= numCopies/2; k++) {
+                // Use critical damping white noise for the remaining modes.
-                const bool isCenter = (numCopies%2 == 0 && k == numCopies/2);
-                const RealOpenMM wk = twown*sin(k*M_PI/numCopies);
+                for (int k = 1; k <= numCopies/2; k++) {
-                const RealOpenMM c1 = exp(-2.0*wk*halfdt);
+                    const bool isCenter = (numCopies%2 == 0 && k == numCopies/2);
-                const RealOpenMM c2 = sqrt((1.0-c1*c1)/2) * (isCenter ? sqrt(2.0) : 1.0);
+                    const RealOpenMM wk = twown*sin(k*M_PI/numCopies);
-                const RealOpenMM c3 = c2*sqrt(nkT/system.getParticleMass(particle));
+                    const RealOpenMM c1 = exp(-2.0*wk*halfdt);
-                RealOpenMM rand1 = c3*SimTKOpenMMUtilities::getNormallyDistributedRandomNumber();
+                    const RealOpenMM c2 = sqrt((1.0-c1*c1)/2) * (isCenter ? sqrt(2.0) : 1.0);
-                RealOpenMM rand2 = (isCenter ? 0.0 : c3*SimTKOpenMMUtilities::getNormallyDistributedRandomNumber());
+                    const RealOpenMM c3 = c2*sqrt(nkT/system.getParticleMass(particle));
-                v[k] = v[k]*c1 + t_complex(rand1, rand2);
+                    RealOpenMM rand1 = c3*SimTKOpenMMUtilities::getNormallyDistributedRandomNumber();
-                if (k < numCopies-k)
+                    RealOpenMM rand2 = (isCenter ? 0.0 : c3*SimTKOpenMMUtilities::getNormallyDistributedRandomNumber());
-                    v[numCopies-k] = v[numCopies-k]*c1 + t_complex(rand1, -rand2);
+                    v[k] = v[k]*c1 + t_complex(rand1, rand2);
+                    if (k < numCopies-k)
+                        v[numCopies-k] = v[numCopies-k]*c1 + t_complex(rand1, -rand2);
+                }
+                fftpack_exec_1d(fft, FFTPACK_BACKWARD, &v[0], &v[0]);
+                for (int k = 0; k < numCopies; k++)
+                    velocities[k][particle][component] = scale*v[k].re;
            }
-            fftpack_exec_1d(fft, FFTPACK_BACKWARD, &v[0], &v[0]);
-            for (int k = 0; k < numCopies; k++)
-                velocities[k][particle][component] = scale*v[k].re;
        }
    }
@@ -220,36 +222,38 @@ void ReferenceIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDI
    // Apply the PILE-L thermostat again.
-    for (int particle = 0; particle < numParticles; particle++) {
+    if (integrator.getApplyThermostat()) {
-        if (system.getParticleMass(particle) == 0.0)
+        for (int particle = 0; particle < numParticles; particle++) {
-            continue;
+            if (system.getParticleMass(particle) == 0.0)
-        const RealOpenMM c3_0 = c2_0*sqrt(nkT/system.getParticleMass(particle));
+                continue;
-        for (int component = 0; component < 3; component++) {
+            const RealOpenMM c3_0 = c2_0*sqrt(nkT/system.getParticleMass(particle));
-            for (int k = 0; k < numCopies; k++)
+            for (int component = 0; component < 3; component++) {
-                v[k] = t_complex(scale*velocities[k][particle][component], 0.0);
+                for (int k = 0; k < numCopies; k++)
-            fftpack_exec_1d(fft, FFTPACK_FORWARD, &v[0], &v[0]);
+                    v[k] = t_complex(scale*velocities[k][particle][component], 0.0);
+                fftpack_exec_1d(fft, FFTPACK_FORWARD, &v[0], &v[0]);
-            // Apply a local Langevin thermostat to the centroid mode.
-            v[0].re = v[0].re*c1_0 + c3_0*SimTKOpenMMUtilities::getNormallyDistributedRandomNumber();
+                // Apply a local Langevin thermostat to the centroid mode.
-            // Use critical damping white noise for the remaining modes.
+                v[0].re = v[0].re*c1_0 + c3_0*SimTKOpenMMUtilities::getNormallyDistributedRandomNumber();
-            for (int k = 1; k <= numCopies/2; k++) {
+                // Use critical damping white noise for the remaining modes.
-                const bool isCenter = (numCopies%2 == 0 && k == numCopies/2);
-                const RealOpenMM wk = twown*sin(k*M_PI/numCopies);
+                for (int k = 1; k <= numCopies/2; k++) {
-                const RealOpenMM c1 = exp(-2.0*wk*halfdt);
+                    const bool isCenter = (numCopies%2 == 0 && k == numCopies/2);
-                const RealOpenMM c2 = sqrt((1.0-c1*c1)/2) * (isCenter ? sqrt(2.0) : 1.0);
+                    const RealOpenMM wk = twown*sin(k*M_PI/numCopies);
-                const RealOpenMM c3 = c2*sqrt(nkT/system.getParticleMass(particle));
+                    const RealOpenMM c1 = exp(-2.0*wk*halfdt);
-                RealOpenMM rand1 = c3*SimTKOpenMMUtilities::getNormallyDistributedRandomNumber();
+                    const RealOpenMM c2 = sqrt((1.0-c1*c1)/2) * (isCenter ? sqrt(2.0) : 1.0);
-                RealOpenMM rand2 = (isCenter ? 0.0 : c3*SimTKOpenMMUtilities::getNormallyDistributedRandomNumber());
+                    const RealOpenMM c3 = c2*sqrt(nkT/system.getParticleMass(particle));
-                v[k] = v[k]*c1 + t_complex(rand1, rand2);
+                    RealOpenMM rand1 = c3*SimTKOpenMMUtilities::getNormallyDistributedRandomNumber();
-                if (k < numCopies-k)
+                    RealOpenMM rand2 = (isCenter ? 0.0 : c3*SimTKOpenMMUtilities::getNormallyDistributedRandomNumber());
-                    v[numCopies-k] = v[numCopies-k]*c1 + t_complex(rand1, -rand2);
+                    v[k] = v[k]*c1 + t_complex(rand1, rand2);
+                    if (k < numCopies-k)
+                        v[numCopies-k] = v[numCopies-k]*c1 + t_complex(rand1, -rand2);
+                }
+                fftpack_exec_1d(fft, FFTPACK_BACKWARD, &v[0], &v[0]);
+                for (int k = 0; k < numCopies; k++)
+                    velocities[k][particle][component] = scale*v[k].re;
            }
-            fftpack_exec_1d(fft, FFTPACK_BACKWARD, &v[0], &v[0]);
-            for (int k = 0; k < numCopies; k++)
-                velocities[k][particle][component] = scale*v[k].re;
        }
    }

--- a/plugins/rpmd/platforms/reference/tests/TestReferenceRpmd.cpp
+++ b/plugins/rpmd/platforms/reference/tests/TestReferenceRpmd.cpp
@@ -313,12 +313,78 @@ void testContractions() {
    ASSERT_USUALLY_EQUAL_TOL(expectedKE, meanKE, 1e-2);
 }
+/**
+ * Check that the total energy stays constant when the thermostat is disabled.
+ *
+ * Builds a chain of harmonically bonded particles, runs an RPMDIntegrator on the
+ * Reference platform with setApplyThermostat(false), and after every step compares the
+ * total energy (potential + kinetic energy of every copy, plus the energy of the
+ * springs connecting neighboring copies) against the value recorded after the first step.
+ */
+void testWithoutThermostat() {
+    const int numParticles = 20;
+    const int numCopies = 10;
+    const double temperature = 300.0;
+    const double mass = 2.0;
+    // Create a chain of particles.
+    System system;
+    HarmonicBondForce* bonds = new HarmonicBondForce();
+    system.addForce(bonds);
+    for (int i = 0; i < numParticles; i++) {
+        system.addParticle(mass);
+        if (i > 0)
+            bonds->addBond(i-1, i, 1.0, 1000.0);
+    }
+    // Constructor arguments: number of copies, temperature, friction coefficient, step size.
+    RPMDIntegrator integ(numCopies, temperature, 1.0, 0.001);
+    integ.setApplyThermostat(false);
+    Platform& platform = Platform::getPlatformByName("Reference");
+    Context context(system, integ, platform);
+    // The generator is seeded with the fixed value 0 (presumably so the starting positions are reproducible).
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+    // Give every copy its own positions: particles spaced 0.95 apart along x, with random y/z offsets scaled by 0.01.
+    vector<vector<Vec3> > positions(numCopies);
+    for (int i = 0; i < numCopies; i++) {
+        positions[i].resize(numParticles);
+        for (int j = 0; j < numParticles; j++)
+            positions[i][j] = Vec3(0.95*j, 0.01*genrand_real2(sfmt), 0.01*genrand_real2(sfmt));
+        integ.setPositions(i, positions[i]);
+    }
+    // Simulate it and see if the energy remains constant.
+    double initialEnergy;
+    int numSteps = 100;
+    // NOTE(review): 1.054571628e-34 looks like hbar in J*s, with the trailing factor converting it to the
+    // unit system used by BOLTZ -- confirm against the kernel's own definition of hbar.
+    const double hbar = 1.054571628e-34*AVOGADRO/(1000*1e-12);
+    // wn = numCopies*kT/hbar; mass*wn^2 is the stiffness of the springs that join neighboring copies.
+    const double wn = numCopies*BOLTZ*temperature/hbar;
+    const double springConstant = mass*wn*wn;
+    for (int i = 0; i < numSteps; i++) {
+        integ.step(1);
+        // Sum the energies of all the copies.
+        double energy = 0.0;
+        for (int j = 0; j < numCopies; j++) {
+            State state = integ.getState(j, State::Positions | State::Energy);
+            positions[j] = state.getPositions();
+            energy += state.getPotentialEnergy()+state.getKineticEnergy();
+        }
+        // Add the energy from the springs connecting copies.
+        for (int j = 0; j < numCopies; j++) {
+            // Copy 0 is connected to the last copy, so the copies form a closed ring.
+            int previous = (j == 0 ? numCopies-1 : j-1);
+            for (int k = 0; k < numParticles; k++) {
+                Vec3 delta = positions[j][k]-positions[previous][k];
+                energy += 0.5*springConstant*delta.dot(delta);
+            }
+        }
+        // The energy after the first step is the reference; every later step is compared against it.
+        if (i == 0)
+            initialEnergy = energy;
+        else
+            ASSERT_EQUAL_TOL(initialEnergy, energy, 1e-4);
+    }
+}
 int main() {
    try {
        testFreeParticles();
        testCMMotionRemoval();
        testVirtualSites();
        testContractions();
+        testWithoutThermostat();
    }
    catch(const std::exception& e) {
        std::cout << "exception: " << e.what() << std::endl;

--- a/tests/TestParser.cpp
+++ b/tests/TestParser.cpp
@@ -56,6 +56,12 @@ void verifyEvaluation(const string& expression, double expectedValue) {
    ExpressionProgram program = parsed.createProgram();
    value = program.evaluate();
    ASSERT_EQUAL_TOL(expectedValue, value, 1e-10);
+    // Create a CompiledExpression and see if that also gives the same result.
+    CompiledExpression compiled = parsed.createCompiledExpression();
+    value = compiled.evaluate();
+    ASSERT_EQUAL_TOL(expectedValue, value, 1e-10);
 }
 /**
@@ -86,6 +92,16 @@ void verifyEvaluation(const string& expression, double x, double y, double expec
    value = program.evaluate(variables);
    ASSERT_EQUAL_TOL(expectedValue, value, 1e-10);
+    // Create a CompiledExpression and see if that also gives the same result.
+    CompiledExpression compiled = parsed.createCompiledExpression();
+    if (compiled.getVariables().find("x") != compiled.getVariables().end())
+        compiled.getVariableReference("x") = x;
+    if (compiled.getVariables().find("y") != compiled.getVariables().end())
+        compiled.getVariableReference("y") = y;
+    value = compiled.evaluate();
+    ASSERT_EQUAL_TOL(expectedValue, value, 1e-10);
    // Make sure that variable renaming works.
    variables.clear();

--- a/wrappers/python/setup.py
+++ b/wrappers/python/setup.py
@@ -85,6 +85,7 @@ version = '%(version)s'
 full_version = '%(full_version)s'
 git_revision = '%(git_revision)s'
 release = %(isrelease)s
+openmm_library_path = '%(path)s'
 if not release:
    version = full_version
@@ -113,7 +114,8 @@ if not release:
        a.write(cnt % {'version': version,
                       'full_version' : full_version,
                       'git_revision' : git_revision,
-                       'isrelease': str(IS_RELEASED)})
+                       'isrelease': str(IS_RELEASED),
+                       'path': os.getenv('OPENMM_LIB_PATH')})
    finally:
        a.close()
@@ -197,7 +199,7 @@ def buildKeywordDictionary(major_version_num=MAJOR_VERSION_NUM,
            macVersion = [int(x) for x in platform.mac_ver()[0].split('.')]
            if tuple(macVersion) < (10, 6):
                os.environ['MACOSX_DEPLOYMENT_TARGET']='10.5'
-            extra_link_args.append('-Wl,-rpath,@loader_path/OpenMM')
+            extra_link_args.append('-Wl,-rpath,'+openmm_lib_path)
    library_dirs=[openmm_lib_path]
@@ -209,6 +211,7 @@ def buildKeywordDictionary(major_version_num=MAJOR_VERSION_NUM,
                 include_dirs = include_dirs,
                 define_macros = define_macros,
                 library_dirs = library_dirs,
+                 runtime_library_dirs = library_dirs,
                 libraries = libraries,
                 extra_compile_args=extra_compile_args,
                 extra_link_args=extra_link_args,
@@ -238,8 +241,8 @@ def main():
        uninstall()
    except:
        pass
-    writeVersionPy()
    setupKeywords=buildKeywordDictionary()
+    writeVersionPy()
    setup(**setupKeywords)
 if __name__ == '__main__':