Merge remote-tracking branch 'upstream/master'

71d33617 · John Chodera (MSKCC) · eb232608 · 9da36463 · 71d33617 · 71d33617
Commit 71d33617 authored Apr 24, 2015 by John Chodera (MSKCC)
20 changed files
--- a/openmmapi/include/openmm/internal/vectorize_sse.h
+++ b/openmmapi/include/openmm/internal/vectorize_sse.h
@@ -241,6 +241,18 @@ static inline fvec4 sqrt(const fvec4& v) {
    return fvec4(_mm_sqrt_ps(v.val));
 }

+static inline fvec4 rsqrt(const fvec4& v) {
+    // Initial estimate of rsqrt().
+
+    fvec4 y(_mm_rsqrt_ps(v.val));
+
+    // Perform an iteration of Newton refinement.
+
+    fvec4 x2 = v*0.5f;
+    y *= fvec4(1.5f)-x2*y*y;
+    return y;
+}
+
 static inline float dot3(const fvec4& v1, const fvec4& v2) {
    return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0x71));
 }

--- a/openmmapi/src/CMAPTorsionForce.cpp
+++ b/openmmapi/src/CMAPTorsionForce.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2010 Stanford University and the Authors.           *
+ * Portions copyright (c) 2010-2015 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -96,3 +96,6 @@ ForceImpl* CMAPTorsionForce::createImpl() const {
    return new CMAPTorsionForceImpl(*this);
 }

+void CMAPTorsionForce::updateParametersInContext(Context& context) {
+    dynamic_cast<CMAPTorsionForceImpl&>(getImplInContext(context)).updateParametersInContext(getContextImpl(context));
+}
--- a/openmmapi/src/CMAPTorsionForceImpl.cpp
+++ b/openmmapi/src/CMAPTorsionForceImpl.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2010 Stanford University and the Authors.           *
+ * Portions copyright (c) 2010-2015 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -154,4 +154,8 @@ void CMAPTorsionForceImpl::calcMapDerivatives(int size, const vector<double>& en
            }
        }
    }
-}
\ No newline at end of file
+}
+
+void CMAPTorsionForceImpl::updateParametersInContext(ContextImpl& context) {
+    kernel.getAs<CalcCMAPTorsionForceKernel>().copyParametersToContext(context, owner);
+}
--- a/platforms/cpu/include/CpuNeighborList.h
+++ b/platforms/cpu/include/CpuNeighborList.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Portions copyright (c) 2013-2015 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -61,6 +61,7 @@ public:
 private:
    int blockSize;
    std::vector<int> sortedAtoms;
+    std::vector<float> sortedPositions;
    std::vector<std::vector<int> > blockNeighbors;
    std::vector<std::vector<char> > blockExclusions;
    // The following variables are used to make information accessible to the individual threads.

--- a/platforms/cpu/include/CpuNonbondedForceVec4.h
+++ b/platforms/cpu/include/CpuNonbondedForceVec4.h
@@ -56,8 +56,8 @@ protected:
      /**
       * Templatized implementation of calculateBlockIxn.
       */
-      template <bool TRICLINIC>
-      void calculateBlockIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
+      template <int PERIODIC_TYPE>
+      void calculateBlockIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter);
            
      /**---------------------------------------------------------------------------------------
      
@@ -74,15 +74,15 @@ protected:
      /**
       * Templatized implementation of calculateBlockEwaldIxn.
       */
-      template <bool TRICLINIC>
-      void calculateBlockEwaldIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
+      template <int PERIODIC_TYPE>
+      void calculateBlockEwaldIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter);

      /**
       * Compute the displacement and squared distance between a collection of points, optionally using
       * periodic boundary conditions.
       */
-      template <bool TRICLINIC>
-      void getDeltaR(const float* posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;
+      template <int PERIODIC_TYPE>
+      void getDeltaR(const fvec4& posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;

      /**
       * Compute a fast approximation to erfc(x).

--- a/platforms/cpu/include/CpuNonbondedForceVec8.h
+++ b/platforms/cpu/include/CpuNonbondedForceVec8.h
@@ -55,8 +55,8 @@ protected:
      /**
       * Templatized implementation of calculateBlockIxn.
       */
-      template <bool TRICLINIC>
-      void calculateBlockIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
+      template <int PERIODIC_TYPE>
+      void calculateBlockIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter);
            
      /**---------------------------------------------------------------------------------------
      
@@ -73,15 +73,15 @@ protected:
      /**
       * Templatized implementation of calculateBlockEwaldIxn.
       */
-      template <bool TRICLINIC>
-      void calculateBlockEwaldIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
+      template <int PERIODIC_TYPE>
+      void calculateBlockEwaldIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter);

      /**
       * Compute the displacement and squared distance between a collection of points, optionally using
       * periodic boundary conditions.
       */
-      template <bool TRICLINIC>
-      void getDeltaR(const float* posI, const fvec8& x, const fvec8& y, const fvec8& z, fvec8& dx, fvec8& dy, fvec8& dz, fvec8& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;
+      template <int PERIODIC_TYPE>
+      void getDeltaR(const fvec4& posI, const fvec8& x, const fvec8& y, const fvec8& z, fvec8& dx, fvec8& dy, fvec8& dz, fvec8& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;

      /**
       * Compute a fast approximation to erfc(x).

--- a/platforms/cpu/src/CpuCustomManyParticleForce.cpp
+++ b/platforms/cpu/src/CpuCustomManyParticleForce.cpp
@@ -199,7 +199,7 @@ void CpuCustomManyParticleForce::setUseCutoff(RealOpenMM distance) {
 }

 void CpuCustomManyParticleForce::setPeriodic(RealVec* periodicBoxVectors) {
-    assert(cutoff);
+    assert(useCutoff);
    assert(periodicBoxVectors[0][0] >= 2.0*cutoffDistance);
    assert(periodicBoxVectors[1][1] >= 2.0*cutoffDistance);
    assert(periodicBoxVectors[2][2] >= 2.0*cutoffDistance);

--- a/platforms/cpu/src/CpuNeighborList.cpp
+++ b/platforms/cpu/src/CpuNeighborList.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Portions copyright (c) 2013-2015 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -164,8 +164,8 @@ public:
        
        return VoxelIndex(y, z);
    }
-
-    void getNeighbors(vector<int>& neighbors, int blockIndex, const fvec4& blockCenter, const fvec4& blockWidth, const vector<int>& sortedAtoms, vector<char>& exclusions, float maxDistance, const vector<int>& blockAtoms, const float* atomLocations, const vector<VoxelIndex>& atomVoxelIndex) const {
+        
+    void getNeighbors(vector<int>& neighbors, int blockIndex, const fvec4& blockCenter, const fvec4& blockWidth, const vector<int>& sortedAtoms, vector<char>& exclusions, float maxDistance, const vector<int>& blockAtoms, const vector<float>& blockAtomX, const vector<float>& blockAtomY, const vector<float>& blockAtomZ, const vector<float>& sortedPositions, const vector<VoxelIndex>& atomVoxelIndex) const {
        neighbors.resize(0);
        exclusions.resize(0);
        fvec4 boxSize(periodicBoxSize[0], periodicBoxSize[1], periodicBoxSize[2], 0);
@@ -233,24 +233,34 @@ public:
                
                float minx = centerPos[0];
                float maxx = centerPos[0];
-                fvec4 offset(-xoffset, -yoffset+voxelSizeY*y+(usePeriodic ? 0.0f : miny), voxelSizeZ*z+(usePeriodic ? 0.0f : minz), 0);
-                for (int k = 0; k < (int) blockAtoms.size(); k++) {
-                    const float* atomPos = &atomLocations[4*blockAtoms[k]];
-                    fvec4 posVec(atomPos);
-                    fvec4 delta1 = offset-posVec;
-                    fvec4 delta2 = delta1+fvec4(0, voxelSizeY, voxelSizeZ, 0);
-                    if (usePeriodic) {
-                        delta1 -= round(delta1*invBoxSize)*boxSize;
-                        delta2 -= round(delta2*invBoxSize)*boxSize;
+                float offset[3] = {-xoffset, -yoffset+voxelSizeY*y+(usePeriodic ? 0.0f : miny), voxelSizeZ*z+(usePeriodic ? 0.0f : minz)};
+                for (int k = 0; k < (int) blockAtoms.size(); k += 4) {
+                    fvec4 dist2 = maxDistanceSquared;
+                    if (y != atomVoxelIndex[k].y) {
+                        fvec4 dy1 = offset[1]-fvec4(&blockAtomY[k]);
+                        fvec4 dy2 = dy1+voxelSizeY;
+                        if (usePeriodic) {
+                            dy1 -= round(dy1*invBoxSize[1])*boxSize[1];
+                            dy2 -= round(dy2*invBoxSize[1])*boxSize[1];
+                        }
+                        fvec4 dy = min(abs(dy1), abs(dy2));
+                        dist2 -= dy*dy;
+                    }
+                    if (z != atomVoxelIndex[k].z) {
+                        fvec4 dz1 = offset[2]-fvec4(&blockAtomZ[k]);
+                        fvec4 dz2 = dz1+voxelSizeZ;
+                        if (usePeriodic) {
+                            dz1 -= round(dz1*invBoxSize[2])*boxSize[2];
+                            dz2 -= round(dz2*invBoxSize[2])*boxSize[2];
+                        }
+                        fvec4 dz = min(abs(dz1), abs(dz2));
+                        dist2 -= dz*dz;
                    }
-                    fvec4 delta = min(abs(delta1), abs(delta2));
-                    float dy = (y == atomVoxelIndex[k].y ? 0.0f : delta[1]);
-                    float dz = (z == atomVoxelIndex[k].z ? 0.0f : delta[2]);
-                    float dist2 = maxDistanceSquared-dy*dy-dz*dz;
-                    if (dist2 > 0) {
-                        float dist = sqrtf(dist2);
-                        minx = min(minx, atomPos[0]-dist-xoffset);
-                        maxx = max(maxx, atomPos[0]+dist-xoffset);
+                    fvec4 dist = sqrt(dist2);
+                    int numToCheck = min(4, (int) (blockAtoms.size()-k));
+                    for (int m = 0; m < numToCheck; m++) {
+                        minx = min(minx, blockAtomX[k+m]-dist[m]-xoffset);
+                        maxx = max(maxx, blockAtomX[k+m]+dist[m]-xoffset);
                    }
                }
                if (minx == maxx)
@@ -290,12 +300,10 @@ public:
                        if (sortedIndex >= lastSortedIndex)
                            continue;
                        
-                        fvec4 atomPos(atomLocations+4*sortedAtoms[sortedIndex]);
+                        fvec4 atomPos(&sortedPositions[4*sortedIndex]);
                        fvec4 delta = atomPos-centerPos;
-                        if (periodicRectangular) {
-                            fvec4 base = round(delta*invBoxSize)*boxSize;
-                            delta = delta-base;
-                        }
+                        if (periodicRectangular)
+                            delta -= round(delta*invBoxSize)*boxSize;
                        else if (needPeriodic) {
                            delta -= periodicBoxVec4[2]*floorf(delta[2]*recipBoxSize[2]+0.5f);
                            delta -= periodicBoxVec4[1]*floorf(delta[1]*recipBoxSize[1]+0.5f);
@@ -310,26 +318,34 @@ public:
                            // The distance is large enough that there might not be any actual interactions.
                            // Check individual atom pairs to be sure.
                            
-                            bool any = false;
-                            for (int k = 0; k < (int) blockAtoms.size(); k++) {
-                                fvec4 pos1(&atomLocations[4*blockAtoms[k]]);
-                                delta = atomPos-pos1;
+                            bool anyInteraction = false;
+                            for (int k = 0; k < (int) blockAtoms.size(); k += 4) {
+                                fvec4 dx = fvec4(&blockAtomX[k])-atomPos[0];
+                                fvec4 dy = fvec4(&blockAtomY[k])-atomPos[1];
+                                fvec4 dz = fvec4(&blockAtomZ[k])-atomPos[2];
                                if (periodicRectangular) {
-                                    fvec4 base = round(delta*invBoxSize)*boxSize;
-                                    delta = delta-base;
+                                    dx -= round(dx*invBoxSize[0])*boxSize[0];
+                                    dy -= round(dy*invBoxSize[1])*boxSize[1];
+                                    dz -= round(dz*invBoxSize[2])*boxSize[2];
                                }
                                else if (needPeriodic) {
-                                    delta -= periodicBoxVec4[2]*floorf(delta[2]*recipBoxSize[2]+0.5f);
-                                    delta -= periodicBoxVec4[1]*floorf(delta[1]*recipBoxSize[1]+0.5f);
-                                    delta -= periodicBoxVec4[0]*floorf(delta[0]*recipBoxSize[0]+0.5f);
+                                    fvec4 scale3 = floor(dz*recipBoxSize[2]+0.5f);
+                                    dx -= scale3*periodicBoxVectors[2][0];
+                                    dy -= scale3*periodicBoxVectors[2][1];
+                                    dz -= scale3*periodicBoxVectors[2][2];
+                                    fvec4 scale2 = floor(dy*recipBoxSize[1]+0.5f);
+                                    dx -= scale2*periodicBoxVectors[1][0];
+                                    dy -= scale2*periodicBoxVectors[1][1];
+                                    fvec4 scale1 = floor(dx*recipBoxSize[0]+0.5f);
+                                    dx -= scale1*periodicBoxVectors[0][0];
                                }
-                                float r2 = dot3(delta, delta);
-                                if (r2 < maxDistanceSquared) {
-                                    any = true;
+                                fvec4 r2 = dx*dx + dy*dy + dz*dz;
+                                if (any(r2 < maxDistanceSquared)) {
+                                    anyInteraction = true;
                                    break;
                                }
                            }
-                            if (!any)
+                            if (!anyInteraction)
                                continue;
                        }
                        
@@ -379,6 +395,7 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float
    blockNeighbors.resize(numBlocks);
    blockExclusions.resize(numBlocks);
    sortedAtoms.resize(numAtoms);
+    sortedPositions.resize(4*numAtoms);
    
    // Record the parameters for the threads.
    
@@ -428,6 +445,8 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float
    for (int i = 0; i < numAtoms; i++) {
        int atomIndex = atomBins[i].second;
        sortedAtoms[i] = atomIndex;
+        fvec4 atomPos(&atomLocations[4*atomIndex]);
+        atomPos.store(&sortedPositions[4*i]);
        voxels.insert(i, &atomLocations[4*atomIndex]);
    }
    voxels.sortItems();
@@ -489,6 +508,7 @@ void CpuNeighborList::threadComputeNeighborList(ThreadPool& threads, int threadI

    int numBlocks = blockNeighbors.size();
    vector<int> blockAtoms;
+    vector<float> blockAtomX(blockSize), blockAtomY(blockSize), blockAtomZ(blockSize);
    vector<VoxelIndex> atomVoxelIndex;
    for (int i = threadIndex; i < numBlocks; i += numThreads) {
        // Find the atoms in this block and compute their bounding box.
@@ -501,14 +521,24 @@ void CpuNeighborList::threadComputeNeighborList(ThreadPool& threads, int threadI
            blockAtoms[j] = sortedAtoms[firstIndex+j];
            atomVoxelIndex[j] = voxels->getVoxelIndex(&atomLocations[4*blockAtoms[j]]);
        }
-        fvec4 minPos(&atomLocations[4*sortedAtoms[firstIndex]]);
+        fvec4 minPos(&sortedPositions[4*firstIndex]);
        fvec4 maxPos = minPos;
        for (int j = 1; j < atomsInBlock; j++) {
-            fvec4 pos(&atomLocations[4*sortedAtoms[firstIndex+j]]);
+            fvec4 pos(&sortedPositions[4*(firstIndex+j)]);
            minPos = min(minPos, pos);
            maxPos = max(maxPos, pos);
        }
-        voxels->getNeighbors(blockNeighbors[i], i, (maxPos+minPos)*0.5f, (maxPos-minPos)*0.5f, sortedAtoms, blockExclusions[i], maxDistance, blockAtoms, atomLocations, atomVoxelIndex);
+        for (int j = 0; j < atomsInBlock; j++) {
+            blockAtomX[j] = sortedPositions[4*(firstIndex+j)];
+            blockAtomY[j] = sortedPositions[4*(firstIndex+j)+1];
+            blockAtomZ[j] = sortedPositions[4*(firstIndex+j)+2];
+        }
+        for (int j = atomsInBlock; j < blockSize; j++) {
+            blockAtomX[j] = 1e10;
+            blockAtomY[j] = 1e10;
+            blockAtomZ[j] = 1e10;
+        }
+        voxels->getNeighbors(blockNeighbors[i], i, (maxPos+minPos)*0.5f, (maxPos-minPos)*0.5f, sortedAtoms, blockExclusions[i], maxDistance, blockAtoms, blockAtomX, blockAtomY, blockAtomZ, sortedPositions, atomVoxelIndex);

        // Record the exclusions for this block.


--- a/platforms/cpu/src/CpuNonbondedForceVec4.cpp
+++ b/platforms/cpu/src/CpuNonbondedForceVec4.cpp
@@ -44,30 +44,76 @@ CpuNonbondedForce* createCpuNonbondedForceVec4() {
 CpuNonbondedForceVec4::CpuNonbondedForceVec4() {
 }

+enum PeriodicType {NoPeriodic, PeriodicPerAtom, PeriodicPerInteraction, PeriodicTriclinic};
+
 void CpuNonbondedForceVec4::calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
-    if (triclinic)
-        calculateBlockIxnImpl<true>(blockIndex, forces, totalEnergy, boxSize, invBoxSize);
-    else
-        calculateBlockIxnImpl<false>(blockIndex, forces, totalEnergy, boxSize, invBoxSize);
+    // Determine whether we need to apply periodic boundary conditions.
+    
+    PeriodicType periodicType;
+    fvec4 blockCenter;
+    if (!periodic) {
+        periodicType = NoPeriodic;
+        blockCenter = 0.0f;
+    }
+    else {
+        const int* blockAtom = &neighborList->getSortedAtoms()[4*blockIndex];
+        float minx, maxx, miny, maxy, minz, maxz;
+        minx = maxx = posq[4*blockAtom[0]];
+        miny = maxy = posq[4*blockAtom[0]+1];
+        minz = maxz = posq[4*blockAtom[0]+2];
+        for (int i = 1; i < 4; i++) {
+            minx = min(minx, posq[4*blockAtom[i]]);
+            maxx = max(maxx, posq[4*blockAtom[i]]);
+            miny = min(miny, posq[4*blockAtom[i]+1]);
+            maxy = max(maxy, posq[4*blockAtom[i]+1]);
+            minz = min(minz, posq[4*blockAtom[i]+2]);
+            maxz = max(maxz, posq[4*blockAtom[i]+2]);
+        }
+        blockCenter = fvec4(0.5f*(minx+maxx), 0.5f*(miny+maxy), 0.5f*(minz+maxz), 0.0f);
+        if (!(minx < cutoffDistance || miny < cutoffDistance || minz < cutoffDistance ||
+                maxx > boxSize[0]-cutoffDistance || maxy > boxSize[1]-cutoffDistance || maxz > boxSize[2]-cutoffDistance))
+            periodicType = NoPeriodic;
+        else if (triclinic)
+            periodicType = PeriodicTriclinic;
+        else if (0.5f*(boxSize[0]-(maxx-minx)) >= cutoffDistance &&
+                 0.5f*(boxSize[1]-(maxy-miny)) >= cutoffDistance &&
+                 0.5f*(boxSize[2]-(maxz-minz)) >= cutoffDistance)
+            periodicType = PeriodicPerAtom;
+        else
+            periodicType = PeriodicPerInteraction;
+    }
+    
+    // Call the appropriate version depending on what calculation is required for periodic boundary conditions.
+    
+    if (periodicType == NoPeriodic)
+        calculateBlockIxnImpl<NoPeriodic>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
+    else if (periodicType == PeriodicPerAtom)
+        calculateBlockIxnImpl<PeriodicPerAtom>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
+    else if (periodicType == PeriodicPerInteraction)
+        calculateBlockIxnImpl<PeriodicPerInteraction>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
+    else if (periodicType == PeriodicTriclinic)
+        calculateBlockIxnImpl<PeriodicTriclinic>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
 }

-template <bool TRICLINIC>
-void CpuNonbondedForceVec4::calculateBlockIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
+template <int PERIODIC_TYPE>
+void CpuNonbondedForceVec4::calculateBlockIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter) {
    // Load the positions and parameters of the atoms in the block.
    
    const int* blockAtom = &neighborList->getSortedAtoms()[4*blockIndex];
    fvec4 blockAtomPosq[4];
    fvec4 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
-    for (int i = 0; i < 4; i++)
+    for (int i = 0; i < 4; i++) {
        blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
+        if (PERIODIC_TYPE == PeriodicPerAtom)
+            blockAtomPosq[i] -= floor((blockAtomPosq[i]-blockCenter)*invBoxSize+0.5f)*boxSize;
+    }
    fvec4 blockAtomX = fvec4(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0]);
    fvec4 blockAtomY = fvec4(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1]);
    fvec4 blockAtomZ = fvec4(blockAtomPosq[0][2], blockAtomPosq[1][2], blockAtomPosq[2][2], blockAtomPosq[3][2]);
    fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
    fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
    fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
-    bool needPeriodic = (periodic && (any(blockAtomX < cutoffDistance) || any(blockAtomY < cutoffDistance) || any(blockAtomZ < cutoffDistance) ||
-            any(blockAtomX > boxSize[0]-cutoffDistance) || any(blockAtomY > boxSize[1]-cutoffDistance) || any(blockAtomZ > boxSize[2]-cutoffDistance)));
+    const bool needPeriodic = (PERIODIC_TYPE == PeriodicPerInteraction || PERIODIC_TYPE == PeriodicTriclinic);
    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
    
    // Loop over neighbors for this block.
@@ -82,7 +128,10 @@ void CpuNonbondedForceVec4::calculateBlockIxnImpl(int blockIndex, float* forces,
        // Compute the distances to the block atoms.
        
        fvec4 dx, dy, dz, r2;
-        getDeltaR<TRICLINIC>(posq+4*atom, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
+        fvec4 atomPos(posq+4*atom);
+        if (PERIODIC_TYPE == PeriodicPerAtom)
+            atomPos -= floor((atomPos-blockCenter)*invBoxSize+0.5f)*boxSize;
+        getDeltaR<PERIODIC_TYPE>(atomPos, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
        ivec4 include;
        char excl = exclusions[i];
        if (excl == 0)
@@ -95,8 +144,7 @@ void CpuNonbondedForceVec4::calculateBlockIxnImpl(int blockIndex, float* forces,
        
        // Compute the interactions.
        
-        fvec4 r = sqrt(r2);
-        fvec4 inverseR = fvec4(1.0f)/r;
+        fvec4 inverseR = rsqrt(r2);
        fvec4 energy, dEdR;
        float atomEpsilon = atomParameters[atom].second;
        if (atomEpsilon != 0.0f) {
@@ -108,6 +156,7 @@ void CpuNonbondedForceVec4::calculateBlockIxnImpl(int blockIndex, float* forces,
            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
            energy = epsSig6*(sig6-1.0f);
            if (useSwitch) {
+                fvec4 r = r2*inverseR;
                fvec4 t = blend(0.0f, (r-switchingDistance)*invSwitchingInterval, r>switchingDistance);
                fvec4 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
                fvec4 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
@@ -162,29 +211,73 @@ void CpuNonbondedForceVec4::calculateBlockIxnImpl(int blockIndex, float* forces,
  }

 void CpuNonbondedForceVec4::calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
-    if (triclinic)
-        calculateBlockEwaldIxnImpl<true>(blockIndex, forces, totalEnergy, boxSize, invBoxSize);
-    else
-        calculateBlockEwaldIxnImpl<false>(blockIndex, forces, totalEnergy, boxSize, invBoxSize);
+    // Determine whether we need to apply periodic boundary conditions.
+    
+    PeriodicType periodicType;
+    fvec4 blockCenter;
+    if (!periodic) {
+        periodicType = NoPeriodic;
+        blockCenter = 0.0f;
+    }
+    else {
+        const int* blockAtom = &neighborList->getSortedAtoms()[4*blockIndex];
+        float minx, maxx, miny, maxy, minz, maxz;
+        minx = maxx = posq[4*blockAtom[0]];
+        miny = maxy = posq[4*blockAtom[0]+1];
+        minz = maxz = posq[4*blockAtom[0]+2];
+        for (int i = 1; i < 4; i++) {
+            minx = min(minx, posq[4*blockAtom[i]]);
+            maxx = max(maxx, posq[4*blockAtom[i]]);
+            miny = min(miny, posq[4*blockAtom[i]+1]);
+            maxy = max(maxy, posq[4*blockAtom[i]+1]);
+            minz = min(minz, posq[4*blockAtom[i]+2]);
+            maxz = max(maxz, posq[4*blockAtom[i]+2]);
+        }
+        blockCenter = fvec4(0.5f*(minx+maxx), 0.5f*(miny+maxy), 0.5f*(minz+maxz), 0.0f);
+        if (!(minx < cutoffDistance || miny < cutoffDistance || minz < cutoffDistance ||
+                maxx > boxSize[0]-cutoffDistance || maxy > boxSize[1]-cutoffDistance || maxz > boxSize[2]-cutoffDistance))
+            periodicType = NoPeriodic;
+        else if (triclinic)
+            periodicType = PeriodicTriclinic;
+        else if (0.5f*(boxSize[0]-(maxx-minx)) >= cutoffDistance &&
+                 0.5f*(boxSize[1]-(maxy-miny)) >= cutoffDistance &&
+                 0.5f*(boxSize[2]-(maxz-minz)) >= cutoffDistance)
+            periodicType = PeriodicPerAtom;
+        else
+            periodicType = PeriodicPerInteraction;
+    }
+    
+    // Call the appropriate version depending on what calculation is required for periodic boundary conditions.
+    
+    if (periodicType == NoPeriodic)
+        calculateBlockEwaldIxnImpl<NoPeriodic>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
+    else if (periodicType == PeriodicPerAtom)
+        calculateBlockEwaldIxnImpl<PeriodicPerAtom>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
+    else if (periodicType == PeriodicPerInteraction)
+        calculateBlockEwaldIxnImpl<PeriodicPerInteraction>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
+    else if (periodicType == PeriodicTriclinic)
+        calculateBlockEwaldIxnImpl<PeriodicTriclinic>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
 }

-template <bool TRICLINIC>
-void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
+template <int PERIODIC_TYPE>
+void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter) {
    // Load the positions and parameters of the atoms in the block.
    
    const int* blockAtom = &neighborList->getSortedAtoms()[4*blockIndex];
    fvec4 blockAtomPosq[4];
    fvec4 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
-    for (int i = 0; i < 4; i++)
+    for (int i = 0; i < 4; i++) {
        blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
+        if (PERIODIC_TYPE == PeriodicPerAtom)
+            blockAtomPosq[i] -= floor((blockAtomPosq[i]-blockCenter)*invBoxSize+0.5f)*boxSize;
+    }
    fvec4 blockAtomX = fvec4(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0]);
    fvec4 blockAtomY = fvec4(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1]);
    fvec4 blockAtomZ = fvec4(blockAtomPosq[0][2], blockAtomPosq[1][2], blockAtomPosq[2][2], blockAtomPosq[3][2]);
    fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
    fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
    fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
-    bool needPeriodic = (periodic && (any(blockAtomX < cutoffDistance) || any(blockAtomY < cutoffDistance) || any(blockAtomZ < cutoffDistance) ||
-            any(blockAtomX > boxSize[0]-cutoffDistance) || any(blockAtomY > boxSize[1]-cutoffDistance) || any(blockAtomZ > boxSize[2]-cutoffDistance)));
+    const bool needPeriodic = (PERIODIC_TYPE == PeriodicPerInteraction || PERIODIC_TYPE == PeriodicTriclinic);
    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
    
    // Loop over neighbors for this block.
@@ -199,7 +292,10 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
        // Compute the distances to the block atoms.
        
        fvec4 dx, dy, dz, r2;
-        getDeltaR<TRICLINIC>(posq+4*atom, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
+        fvec4 atomPos(posq+4*atom);
+        if (PERIODIC_TYPE == PeriodicPerAtom)
+            atomPos -= floor((atomPos-blockCenter)*invBoxSize+0.5f)*boxSize;
+        getDeltaR<PERIODIC_TYPE>(atomPos, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
        ivec4 include;
        char excl = exclusions[i];
        if (excl == 0)
@@ -212,8 +308,8 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
        
        // Compute the interactions.
        
-        fvec4 r = sqrt(r2);
-        fvec4 inverseR = fvec4(1.0f)/r;
+        fvec4 inverseR = rsqrt(r2);
+        fvec4 r = r2*inverseR;
        fvec4 energy, dEdR;
        float atomEpsilon = atomParameters[atom].second;
        if (atomEpsilon != 0.0f) {
@@ -272,28 +368,26 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
        (fvec4(forces+4*blockAtom[j])+f[j]).store(forces+4*blockAtom[j]);
 }

-template <bool TRICLINIC>
-void CpuNonbondedForceVec4::getDeltaR(const float* posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
+template <int PERIODIC_TYPE>
+void CpuNonbondedForceVec4::getDeltaR(const fvec4& posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
    dx = x-posI[0];
    dy = y-posI[1];
    dz = z-posI[2];
-    if (periodic) {
-        if (TRICLINIC) {
-            fvec4 scale3 = floor(dz*recipBoxSize[2]+0.5f);
-            dx -= scale3*periodicBoxVectors[2][0];
-            dy -= scale3*periodicBoxVectors[2][1];
-            dz -= scale3*periodicBoxVectors[2][2];
-            fvec4 scale2 = floor(dy*recipBoxSize[1]+0.5f);
-            dx -= scale2*periodicBoxVectors[1][0];
-            dy -= scale2*periodicBoxVectors[1][1];
-            fvec4 scale1 = floor(dx*recipBoxSize[0]+0.5f);
-            dx -= scale1*periodicBoxVectors[0][0];
-        }
-        else {
-            dx -= round(dx*invBoxSize[0])*boxSize[0];
-            dy -= round(dy*invBoxSize[1])*boxSize[1];
-            dz -= round(dz*invBoxSize[2])*boxSize[2];
-        }
+    if (PERIODIC_TYPE == PeriodicTriclinic) {
+        fvec4 scale3 = floor(dz*recipBoxSize[2]+0.5f);
+        dx -= scale3*periodicBoxVectors[2][0];
+        dy -= scale3*periodicBoxVectors[2][1];
+        dz -= scale3*periodicBoxVectors[2][2];
+        fvec4 scale2 = floor(dy*recipBoxSize[1]+0.5f);
+        dx -= scale2*periodicBoxVectors[1][0];
+        dy -= scale2*periodicBoxVectors[1][1];
+        fvec4 scale1 = floor(dx*recipBoxSize[0]+0.5f);
+        dx -= scale1*periodicBoxVectors[0][0];
+    }
+    else if (PERIODIC_TYPE == PeriodicPerInteraction) {
+        dx -= round(dx*invBoxSize[0])*boxSize[0];
+        dy -= round(dy*invBoxSize[1])*boxSize[1];
+        dz -= round(dz*invBoxSize[2])*boxSize[2];
    }
    r2 = dx*dx + dy*dy + dz*dz;
 }

--- a/platforms/cpu/src/CpuNonbondedForceVec8.cpp
+++ b/platforms/cpu/src/CpuNonbondedForceVec8.cpp
@@ -50,7 +50,7 @@ CpuNonbondedForce* createCpuNonbondedForceVec8() {
 */
 bool isVec8Supported() {
    // Make sure the CPU supports AVX.
-    
+
    int cpuInfo[4];
    cpuid(cpuInfo, 0);
    if (cpuInfo[0] >= 1) {
@@ -76,29 +76,75 @@ CpuNonbondedForce* createCpuNonbondedForceVec8() {
 CpuNonbondedForceVec8::CpuNonbondedForceVec8() {
 }

+enum PeriodicType {NoPeriodic, PeriodicPerAtom, PeriodicPerInteraction, PeriodicTriclinic};
+
 void CpuNonbondedForceVec8::calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
-    if (triclinic)
-        calculateBlockIxnImpl<true>(blockIndex, forces, totalEnergy, boxSize, invBoxSize);
-    else
-        calculateBlockIxnImpl<false>(blockIndex, forces, totalEnergy, boxSize, invBoxSize);
+    // Determine whether we need to apply periodic boundary conditions.
+    
+    PeriodicType periodicType;
+    fvec4 blockCenter;
+    if (!periodic) {
+        periodicType = NoPeriodic;
+        blockCenter = 0.0f;
+    }
+    else {
+        const int* blockAtom = &neighborList->getSortedAtoms()[8*blockIndex];
+        float minx, maxx, miny, maxy, minz, maxz;
+        minx = maxx = posq[4*blockAtom[0]];
+        miny = maxy = posq[4*blockAtom[0]+1];
+        minz = maxz = posq[4*blockAtom[0]+2];
+        for (int i = 1; i < 8; i++) {
+            minx = min(minx, posq[4*blockAtom[i]]);
+            maxx = max(maxx, posq[4*blockAtom[i]]);
+            miny = min(miny, posq[4*blockAtom[i]+1]);
+            maxy = max(maxy, posq[4*blockAtom[i]+1]);
+            minz = min(minz, posq[4*blockAtom[i]+2]);
+            maxz = max(maxz, posq[4*blockAtom[i]+2]);
+        }
+        blockCenter = fvec4(0.5f*(minx+maxx), 0.5f*(miny+maxy), 0.5f*(minz+maxz), 0.0f);
+        if (!(minx < cutoffDistance || miny < cutoffDistance || minz < cutoffDistance ||
+                maxx > boxSize[0]-cutoffDistance || maxy > boxSize[1]-cutoffDistance || maxz > boxSize[2]-cutoffDistance))
+            periodicType = NoPeriodic;
+        else if (triclinic)
+            periodicType = PeriodicTriclinic;
+        else if (0.5f*(boxSize[0]-(maxx-minx)) >= cutoffDistance &&
+                 0.5f*(boxSize[1]-(maxy-miny)) >= cutoffDistance &&
+                 0.5f*(boxSize[2]-(maxz-minz)) >= cutoffDistance)
+            periodicType = PeriodicPerAtom;
+        else
+            periodicType = PeriodicPerInteraction;
+    }
+    
+    // Call the appropriate version depending on what calculation is required for periodic boundary conditions.
+    
+    if (periodicType == NoPeriodic)
+        calculateBlockIxnImpl<NoPeriodic>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
+    else if (periodicType == PeriodicPerAtom)
+        calculateBlockIxnImpl<PeriodicPerAtom>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
+    else if (periodicType == PeriodicPerInteraction)
+        calculateBlockIxnImpl<PeriodicPerInteraction>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
+    else if (periodicType == PeriodicTriclinic)
+        calculateBlockIxnImpl<PeriodicTriclinic>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
 }

-template <bool TRICLINIC>
-void CpuNonbondedForceVec8::calculateBlockIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
+template <int PERIODIC_TYPE>
+void CpuNonbondedForceVec8::calculateBlockIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter) {
    // Load the positions and parameters of the atoms in the block.
    
    const int* blockAtom = &neighborList->getSortedAtoms()[8*blockIndex];
    fvec4 blockAtomPosq[8];
    fvec8 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
    fvec8 blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge;
-    for (int i = 0; i < 8; i++)
+    for (int i = 0; i < 8; i++) {
        blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
+        if (PERIODIC_TYPE == PeriodicPerAtom)
+            blockAtomPosq[i] -= floor((blockAtomPosq[i]-blockCenter)*invBoxSize+0.5f)*boxSize;
+    }
    transpose(blockAtomPosq[0], blockAtomPosq[1], blockAtomPosq[2], blockAtomPosq[3], blockAtomPosq[4], blockAtomPosq[5], blockAtomPosq[6], blockAtomPosq[7], blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge);
    blockAtomCharge *= ONE_4PI_EPS0;
    fvec8 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first, atomParameters[blockAtom[4]].first, atomParameters[blockAtom[5]].first, atomParameters[blockAtom[6]].first, atomParameters[blockAtom[7]].first);
    fvec8 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second, atomParameters[blockAtom[4]].second, atomParameters[blockAtom[5]].second, atomParameters[blockAtom[6]].second, atomParameters[blockAtom[7]].second);
-    bool needPeriodic = (periodic && (any(blockAtomX < cutoffDistance) || any(blockAtomY < cutoffDistance) || any(blockAtomZ < cutoffDistance) ||
-            any(blockAtomX > boxSize[0]-cutoffDistance) || any(blockAtomY > boxSize[1]-cutoffDistance) || any(blockAtomZ > boxSize[2]-cutoffDistance)));
+    const bool needPeriodic = (PERIODIC_TYPE == PeriodicPerInteraction || PERIODIC_TYPE == PeriodicTriclinic);
    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
    
    // Loop over neighbors for this block.
@@ -113,7 +159,10 @@ void CpuNonbondedForceVec8::calculateBlockIxnImpl(int blockIndex, float* forces,
        // Compute the distances to the block atoms.
        
        fvec8 dx, dy, dz, r2;
-        getDeltaR<TRICLINIC>(&posq[4*atom], blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
+        fvec4 atomPos(posq+4*atom);
+        if (PERIODIC_TYPE == PeriodicPerAtom)
+            atomPos -= floor((atomPos-blockCenter)*invBoxSize+0.5f)*boxSize;
+        getDeltaR<PERIODIC_TYPE>(atomPos, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
        ivec8 include;
        char excl = exclusions[i];
        if (excl == 0)
@@ -126,8 +175,7 @@ void CpuNonbondedForceVec8::calculateBlockIxnImpl(int blockIndex, float* forces,
        
        // Compute the interactions.
        
-        fvec8 r = sqrt(r2);
-        fvec8 inverseR = fvec8(1.0f)/r;
+        fvec8 inverseR = rsqrt(r2);
        fvec8 energy, dEdR;
        float atomEpsilon = atomParameters[atom].second;
        if (atomEpsilon != 0.0f) {
@@ -139,6 +187,7 @@ void CpuNonbondedForceVec8::calculateBlockIxnImpl(int blockIndex, float* forces,
            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
            energy = epsSig6*(sig6-1.0f);
            if (useSwitch) {
+                fvec8 r = r2*inverseR;
                fvec8 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval);
                fvec8 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
                fvec8 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
@@ -193,28 +242,72 @@ void CpuNonbondedForceVec8::calculateBlockIxnImpl(int blockIndex, float* forces,
  }

 void CpuNonbondedForceVec8::calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
-    if (triclinic)
-        calculateBlockEwaldIxnImpl<true>(blockIndex, forces, totalEnergy, boxSize, invBoxSize);
-    else
-        calculateBlockEwaldIxnImpl<false>(blockIndex, forces, totalEnergy, boxSize, invBoxSize);
+    // Determine whether we need to apply periodic boundary conditions.
+    
+    PeriodicType periodicType;
+    fvec4 blockCenter;
+    if (!periodic) {
+        periodicType = NoPeriodic;
+        blockCenter = 0.0f;
+    }
+    else {
+        const int* blockAtom = &neighborList->getSortedAtoms()[8*blockIndex];
+        float minx, maxx, miny, maxy, minz, maxz;
+        minx = maxx = posq[4*blockAtom[0]];
+        miny = maxy = posq[4*blockAtom[0]+1];
+        minz = maxz = posq[4*blockAtom[0]+2];
+        for (int i = 1; i < 8; i++) {
+            minx = min(minx, posq[4*blockAtom[i]]);
+            maxx = max(maxx, posq[4*blockAtom[i]]);
+            miny = min(miny, posq[4*blockAtom[i]+1]);
+            maxy = max(maxy, posq[4*blockAtom[i]+1]);
+            minz = min(minz, posq[4*blockAtom[i]+2]);
+            maxz = max(maxz, posq[4*blockAtom[i]+2]);
+        }
+        blockCenter = fvec4(0.5f*(minx+maxx), 0.5f*(miny+maxy), 0.5f*(minz+maxz), 0.0f);
+        if (!(minx < cutoffDistance || miny < cutoffDistance || minz < cutoffDistance ||
+                maxx > boxSize[0]-cutoffDistance || maxy > boxSize[1]-cutoffDistance || maxz > boxSize[2]-cutoffDistance))
+            periodicType = NoPeriodic;
+        else if (triclinic)
+            periodicType = PeriodicTriclinic;
+        else if (0.5f*(boxSize[0]-(maxx-minx)) >= cutoffDistance &&
+                 0.5f*(boxSize[1]-(maxy-miny)) >= cutoffDistance &&
+                 0.5f*(boxSize[2]-(maxz-minz)) >= cutoffDistance)
+            periodicType = PeriodicPerAtom;
+        else
+            periodicType = PeriodicPerInteraction;
+    }
+    
+    // Call the appropriate version depending on what calculation is required for periodic boundary conditions.
+    
+    if (periodicType == NoPeriodic)
+        calculateBlockEwaldIxnImpl<NoPeriodic>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
+    else if (periodicType == PeriodicPerAtom)
+        calculateBlockEwaldIxnImpl<PeriodicPerAtom>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
+    else if (periodicType == PeriodicPerInteraction)
+        calculateBlockEwaldIxnImpl<PeriodicPerInteraction>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
+    else if (periodicType == PeriodicTriclinic)
+        calculateBlockEwaldIxnImpl<PeriodicTriclinic>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
 }

-template <bool TRICLINIC>
-void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
+template <int PERIODIC_TYPE>
+void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter) {
    // Load the positions and parameters of the atoms in the block.
    
    const int* blockAtom = &neighborList->getSortedAtoms()[8*blockIndex];
    fvec4 blockAtomPosq[8];
    fvec8 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
    fvec8 blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge;
-    for (int i = 0; i < 8; i++)
+    for (int i = 0; i < 8; i++) {
        blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
+        if (PERIODIC_TYPE == PeriodicPerAtom)
+            blockAtomPosq[i] -= floor((blockAtomPosq[i]-blockCenter)*invBoxSize+0.5f)*boxSize;
+    }
    transpose(blockAtomPosq[0], blockAtomPosq[1], blockAtomPosq[2], blockAtomPosq[3], blockAtomPosq[4], blockAtomPosq[5], blockAtomPosq[6], blockAtomPosq[7], blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge);
    blockAtomCharge *= ONE_4PI_EPS0;
    fvec8 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first, atomParameters[blockAtom[4]].first, atomParameters[blockAtom[5]].first, atomParameters[blockAtom[6]].first, atomParameters[blockAtom[7]].first);
    fvec8 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second, atomParameters[blockAtom[4]].second, atomParameters[blockAtom[5]].second, atomParameters[blockAtom[6]].second, atomParameters[blockAtom[7]].second);
-    bool needPeriodic = (periodic && (any(blockAtomX < cutoffDistance) || any(blockAtomY < cutoffDistance) || any(blockAtomZ < cutoffDistance) ||
-            any(blockAtomX > boxSize[0]-cutoffDistance) || any(blockAtomY > boxSize[1]-cutoffDistance) || any(blockAtomZ > boxSize[2]-cutoffDistance)));
+    const bool needPeriodic = (PERIODIC_TYPE == PeriodicPerInteraction || PERIODIC_TYPE == PeriodicTriclinic);
    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
    
    // Loop over neighbors for this block.
@@ -229,7 +322,10 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
        // Compute the distances to the block atoms.
        
        fvec8 dx, dy, dz, r2;
-        getDeltaR<TRICLINIC>(&posq[4*atom], blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
+        fvec4 atomPos(posq+4*atom);
+        if (PERIODIC_TYPE == PeriodicPerAtom)
+            atomPos -= floor((atomPos-blockCenter)*invBoxSize+0.5f)*boxSize;
+        getDeltaR<PERIODIC_TYPE>(atomPos, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
        ivec8 include;
        char excl = exclusions[i];
        if (excl == 0)
@@ -242,8 +338,8 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
        
        // Compute the interactions.
        
-        fvec8 r = sqrt(r2);
-        fvec8 inverseR = fvec8(1.0f)/r;
+        fvec8 inverseR = rsqrt(r2);
+        fvec8 r = r2*inverseR;
        fvec8 energy, dEdR;
        float atomEpsilon = atomParameters[atom].second;
        if (atomEpsilon != 0.0f) {
@@ -268,7 +364,7 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
        }
        fvec8 chargeProd = blockAtomCharge*posq[4*atom+3];
        dEdR += chargeProd*inverseR*ewaldScaleFunction(r);
-        dEdR *= inverseR*inverseR;        
+        dEdR *= inverseR*inverseR;

        // Accumulate energies.

@@ -302,28 +398,26 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
        (fvec4(forces+4*blockAtom[j])+f[j]).store(forces+4*blockAtom[j]);
 }

-template <bool TRICLINIC>
-void CpuNonbondedForceVec8::getDeltaR(const float* posI, const fvec8& x, const fvec8& y, const fvec8& z, fvec8& dx, fvec8& dy, fvec8& dz, fvec8& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
+template <int PERIODIC_TYPE>
+void CpuNonbondedForceVec8::getDeltaR(const fvec4& posI, const fvec8& x, const fvec8& y, const fvec8& z, fvec8& dx, fvec8& dy, fvec8& dz, fvec8& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
    dx = x-posI[0];
    dy = y-posI[1];
    dz = z-posI[2];
-    if (periodic) {
-        if (TRICLINIC) {
-            fvec8 scale3 = floor(dz*recipBoxSize[2]+0.5f);
-            dx -= scale3*periodicBoxVectors[2][0];
-            dy -= scale3*periodicBoxVectors[2][1];
-            dz -= scale3*periodicBoxVectors[2][2];
-            fvec8 scale2 = floor(dy*recipBoxSize[1]+0.5f);
-            dx -= scale2*periodicBoxVectors[1][0];
-            dy -= scale2*periodicBoxVectors[1][1];
-            fvec8 scale1 = floor(dx*recipBoxSize[0]+0.5f);
-            dx -= scale1*periodicBoxVectors[0][0];
-        }
-        else {
-            dx -= round(dx*invBoxSize[0])*boxSize[0];
-            dy -= round(dy*invBoxSize[1])*boxSize[1];
-            dz -= round(dz*invBoxSize[2])*boxSize[2];
-        }
+    if (PERIODIC_TYPE == PeriodicTriclinic) {
+        fvec8 scale3 = floor(dz*recipBoxSize[2]+0.5f);
+        dx -= scale3*periodicBoxVectors[2][0];
+        dy -= scale3*periodicBoxVectors[2][1];
+        dz -= scale3*periodicBoxVectors[2][2];
+        fvec8 scale2 = floor(dy*recipBoxSize[1]+0.5f);
+        dx -= scale2*periodicBoxVectors[1][0];
+        dy -= scale2*periodicBoxVectors[1][1];
+        fvec8 scale1 = floor(dx*recipBoxSize[0]+0.5f);
+        dx -= scale1*periodicBoxVectors[0][0];
+    }
+    else if (PERIODIC_TYPE == PeriodicPerInteraction) {
+        dx -= round(dx*invBoxSize[0])*boxSize[0];
+        dy -= round(dy*invBoxSize[1])*boxSize[1];
+        dz -= round(dz*invBoxSize[2])*boxSize[2];
    }
    r2 = dx*dx + dy*dy + dz*dz;
 }

--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2013 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2015 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -520,11 +520,19 @@ public:
     * @return the potential energy due to the force
     */
    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+    /**
+     * Copy changed parameters over to a context.
+     *
+     * @param context    the context to copy parameters to
+     * @param force      the CMAPTorsionForce to copy the parameters from
+     */
+    void copyParametersToContext(ContextImpl& context, const CMAPTorsionForce& force);
 private:
    int numTorsions;
    bool hasInitializedKernel;
    CudaContext& cu;
    const System& system;
+    std::vector<int2> mapPositionsVec;
    CudaArray* coefficients;
    CudaArray* mapPositions;
    CudaArray* torsionMaps;
@@ -1274,7 +1282,8 @@ private:
    void prepareForComputation(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid);
    void recordChangedParameters(ContextImpl& context);
    CudaContext& cu;
-    double prevStepSize;
+    double prevStepSize, energy;
+    float energyFloat;
    int numGlobalVariables;
    bool hasInitializedKernels, deviceValuesAreCurrent, modifiesParameters, keNeedsForce;
    mutable bool localValuesAreCurrent;
@@ -1295,7 +1304,7 @@ private:
    std::vector<std::vector<CUfunction> > kernels;
    std::vector<std::vector<std::vector<void*> > > kernelArgs;
    std::vector<void*> kineticEnergyArgs;
-    CUfunction sumPotentialEnergyKernel, randomKernel, kineticEnergyKernel, sumKineticEnergyKernel;
+    CUfunction randomKernel, kineticEnergyKernel, sumKineticEnergyKernel;
    std::vector<CustomIntegrator::ComputationType> stepType;
    std::vector<bool> needsForces;
    std::vector<bool> needsEnergy;

--- a/platforms/cuda/include/CudaParallelKernels.h
+++ b/platforms/cuda/include/CudaParallelKernels.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2011-2013 Stanford University and the Authors.      *
+ * Portions copyright (c) 2011-2015 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -345,6 +345,13 @@ public:
     * @return the potential energy due to the force
     */
    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+    /**
+     * Copy changed parameters over to a context.
+     *
+     * @param context    the context to copy parameters to
+     * @param force      the CMAPTorsionForce to copy the parameters from
+     */
+    void copyParametersToContext(ContextImpl& context, const CMAPTorsionForce& force);
 private:
    class Task;
    CudaPlatform::PlatformData& data;

--- a/platforms/cuda/src/CudaIntegrationUtilities.cpp
+++ b/platforms/cuda/src/CudaIntegrationUtilities.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2014 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2015 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -32,6 +32,7 @@
 #include "openmm/VirtualSite.h"
 #include "quern.h"
 #include "CudaExpressionUtilities.h"
+#include "ReferenceCCMAAlgorithm.h"
 #include <algorithm>
 #include <cmath>
 #include <cstdlib>
@@ -304,157 +305,54 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S

    int numCCMA = (int) ccmaConstraints.size();
    if (numCCMA > 0) {
-        vector<vector<int> > atomConstraints(context.getNumAtoms());
+        // Record information needed by ReferenceCCMAAlgorithm.
+        
+        vector<pair<int, int> > refIndices(numCCMA);
+        vector<RealOpenMM> refDistance(numCCMA);
        for (int i = 0; i < numCCMA; i++) {
-            atomConstraints[atom1[ccmaConstraints[i]]].push_back(i);
-            atomConstraints[atom2[ccmaConstraints[i]]].push_back(i);
+            int index = ccmaConstraints[i];
+            refIndices[i] = make_pair(atom1[index], atom2[index]);
+            refDistance[i] = distance[index];
        }
-        vector<vector<int> > linkedConstraints(numCCMA);
-        for (unsigned atom = 0; atom < atomConstraints.size(); atom++) {
-            for (unsigned i = 0; i < atomConstraints[atom].size(); i++)
-                for (unsigned j = 0; j < i; j++) {
-                    int c1 = atomConstraints[atom][i];
-                    int c2 = atomConstraints[atom][j];
-                    linkedConstraints[c1].push_back(c2);
-                    linkedConstraints[c2].push_back(c1);
+        vector<RealOpenMM> refMasses(numAtoms);
+        for (int i = 0; i < numAtoms; ++i)
+            refMasses[i] = (RealOpenMM) system.getParticleMass(i);
+
+        // Look up angles for CCMA.
+        
+        vector<ReferenceCCMAAlgorithm::AngleInfo> angles;
+        for (int i = 0; i < system.getNumForces(); i++) {
+            const HarmonicAngleForce* force = dynamic_cast<const HarmonicAngleForce*>(&system.getForce(i));
+            if (force != NULL) {
+                for (int j = 0; j < force->getNumAngles(); j++) {
+                    int atom1, atom2, atom3;
+                    double angle, k;
+                    force->getAngleParameters(j, atom1, atom2, atom3, angle, k);
+                    angles.push_back(ReferenceCCMAAlgorithm::AngleInfo(atom1, atom2, atom3, (RealOpenMM) angle));
                }
-        }
-        int maxLinks = 0;
-        for (unsigned i = 0; i < linkedConstraints.size(); i++)
-            maxLinks = max(maxLinks, (int) linkedConstraints[i].size());
-        int maxAtomConstraints = 0;
-        for (unsigned i = 0; i < atomConstraints.size(); i++)
-            maxAtomConstraints = max(maxAtomConstraints, (int) atomConstraints[i].size());
-
-        // Compute the constraint coupling matrix
-
-        vector<vector<int> > atomAngles(numAtoms);
-        HarmonicAngleForce const* angleForce = NULL;
-        for (int i = 0; i < system.getNumForces() && angleForce == NULL; i++)
-            angleForce = dynamic_cast<HarmonicAngleForce const*>(&system.getForce(i));
-        if (angleForce != NULL)
-            for (int i = 0; i < angleForce->getNumAngles(); i++) {
-                int particle1, particle2, particle3;
-                double angle, k;
-                angleForce->getAngleParameters(i, particle1, particle2, particle3, angle, k);
-                atomAngles[particle2].push_back(i);
-            }
-        vector<vector<pair<int, double> > > matrix(numCCMA);
-        for (int j = 0; j < numCCMA; j++) {
-            for (int k = 0; k < numCCMA; k++) {
-                if (j == k) {
-                    matrix[j].push_back(pair<int, double>(j, 1.0));
-                    continue;
-                }
-                double scale;
-                int cj = ccmaConstraints[j];
-                int ck = ccmaConstraints[k];
-                int atomj0 = atom1[cj];
-                int atomj1 = atom2[cj];
-                int atomk0 = atom1[ck];
-                int atomk1 = atom2[ck];
-                int atoma, atomb, atomc;
-                double imj0 = 1.0/system.getParticleMass(atomj0);
-                double imj1 = 1.0/system.getParticleMass(atomj1);
-                if (atomj0 == atomk0) {
-                    atoma = atomj1;
-                    atomb = atomj0;
-                    atomc = atomk1;
-                    scale = imj0/(imj0+imj1);
-                }
-                else if (atomj1 == atomk1) {
-                    atoma = atomj0;
-                    atomb = atomj1;
-                    atomc = atomk0;
-                    scale = imj1/(imj0+imj1);
-                }
-                else if (atomj0 == atomk1) {
-                    atoma = atomj1;
-                    atomb = atomj0;
-                    atomc = atomk0;
-                    scale = imj0/(imj0+imj1);
-                }
-                else if (atomj1 == atomk0) {
-                    atoma = atomj0;
-                    atomb = atomj1;
-                    atomc = atomk1;
-                    scale = imj1/(imj0+imj1);
-                }
-                else
-                    continue; // These constraints are not connected.
-
-                // Look for a third constraint forming a triangle with these two.
-
-                bool foundConstraint = false;
-                for (int m = 0; m < numCCMA; m++) {
-                    int other = ccmaConstraints[m];
-                    if ((atom1[other] == atoma && atom2[other] == atomc) || (atom1[other] == atomc && atom2[other] == atoma)) {
-                        double d1 = distance[cj];
-                        double d2 = distance[ck];
-                        double d3 = distance[other];
-                        matrix[j].push_back(pair<int, double>(k, scale*(d1*d1+d2*d2-d3*d3)/(2.0*d1*d2)));
-                        foundConstraint = true;
-                        break;
-                    }
-                }
-                if (!foundConstraint && angleForce != NULL) {
-                    // We didn't find one, so look for an angle force field term.
-
-                    const vector<int>& angleCandidates = atomAngles[atomb];
-                    for (vector<int>::const_iterator iter = angleCandidates.begin(); iter != angleCandidates.end(); iter++) {
-                        int particle1, particle2, particle3;
-                        double angle, ka;
-                        angleForce->getAngleParameters(*iter, particle1, particle2, particle3, angle, ka);
-                        if ((particle1 == atoma && particle3 == atomc) || (particle3 == atoma && particle1 == atomc)) {
-                            matrix[j].push_back(pair<int, double>(k, scale*cos(angle)));
-                            break;
-                        }
-                    }
-                }
-            }
-        }
-
-        // Invert it using QR.
-
-        vector<int> matrixRowStart;
-        vector<int> matrixColIndex;
-        vector<double> matrixValue;
-        for (int i = 0; i < numCCMA; i++) {
-            matrixRowStart.push_back(matrixValue.size());
-            for (int j = 0; j < (int) matrix[i].size(); j++) {
-                pair<int, double> element = matrix[i][j];
-                matrixColIndex.push_back(element.first);
-                matrixValue.push_back(element.second);
            }
        }
-        matrixRowStart.push_back(matrixValue.size());
-        int *qRowStart, *qColIndex, *rRowStart, *rColIndex;
-        double *qValue, *rValue;
-        int result = QUERN_compute_qr(numCCMA, numCCMA, &matrixRowStart[0], &matrixColIndex[0], &matrixValue[0], NULL,
-                &qRowStart, &qColIndex, &qValue, &rRowStart, &rColIndex, &rValue);
-        vector<double> rhs(numCCMA);
-        matrix.clear();
-        matrix.resize(numCCMA);
-        for (int i = 0; i < numCCMA; i++) {
-            // Extract column i of the inverse matrix.
-
-            for (int j = 0; j < numCCMA; j++)
-                rhs[j] = (i == j ? 1.0 : 0.0);
-            result = QUERN_multiply_with_q_transpose(numCCMA, qRowStart, qColIndex, qValue, &rhs[0]);
-            result = QUERN_solve_with_r(numCCMA, rRowStart, rColIndex, rValue, &rhs[0], &rhs[0]);
-            for (int j = 0; j < numCCMA; j++) {
-                double value = rhs[j]*distance[ccmaConstraints[i]]/distance[ccmaConstraints[j]];
-                if (abs(value) > 0.1)
-                    matrix[j].push_back(pair<int, double>(i, value));
-            }
-        }
-        QUERN_free_result(qRowStart, qColIndex, qValue);
-        QUERN_free_result(rRowStart, rColIndex, rValue);
+        
+        // Create a ReferenceCCMAAlgorithm.  It will build and invert the constraint matrix for us.
+        
+        ReferenceCCMAAlgorithm ccma(numAtoms, numCCMA, refIndices, refDistance, refMasses, angles, 0.1);
+        vector<vector<pair<int, double> > > matrix = ccma.getMatrix();
        int maxRowElements = 0;
        for (unsigned i = 0; i < matrix.size(); i++)
            maxRowElements = max(maxRowElements, (int) matrix[i].size());
        maxRowElements++;

+        // Build the list of constraints for each atom.
+
+        vector<vector<int> > atomConstraints(context.getNumAtoms());
+        for (int i = 0; i < numCCMA; i++) {
+            atomConstraints[atom1[ccmaConstraints[i]]].push_back(i);
+            atomConstraints[atom2[ccmaConstraints[i]]].push_back(i);
+        }
+        int maxAtomConstraints = 0;
+        for (unsigned i = 0; i < atomConstraints.size(); i++)
+            maxAtomConstraints = max(maxAtomConstraints, (int) atomConstraints[i].size());
+
        // Sort the constraints.

        vector<int> constraintOrder(numCCMA);

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -1127,7 +1127,7 @@ void CudaCalcCMAPTorsionForceKernel::initialize(const System& system, const CMAP
        return;
    int numMaps = force.getNumMaps();
    vector<float4> coeffVec;
-    vector<int2> mapPositionsVec(numMaps);
+    mapPositionsVec.resize(numMaps);
    vector<double> energy;
    vector<vector<double> > c;
    int currentPosition = 0;
@@ -1166,6 +1166,49 @@ double CudaCalcCMAPTorsionForceKernel::execute(ContextImpl& context, bool includ
    return 0.0;
 }

+void CudaCalcCMAPTorsionForceKernel::copyParametersToContext(ContextImpl& context, const CMAPTorsionForce& force) {
+    int numMaps = force.getNumMaps();
+    int numContexts = cu.getPlatformData().contexts.size();
+    int startIndex = cu.getContextIndex()*force.getNumTorsions()/numContexts;
+    int endIndex = (cu.getContextIndex()+1)*force.getNumTorsions()/numContexts;
+    numTorsions = endIndex-startIndex;
+    if (mapPositions->getSize() != numMaps)
+        throw OpenMMException("updateParametersInContext: The number of maps has changed");
+    if (torsionMaps->getSize() != numTorsions)
+        throw OpenMMException("updateParametersInContext: The number of CMAP torsions has changed");
+
+    // Update the maps.
+
+    vector<float4> coeffVec;
+    vector<double> energy;
+    vector<vector<double> > c;
+    int currentPosition = 0;
+    for (int i = 0; i < numMaps; i++) {
+        int size;
+        force.getMapParameters(i, size, energy);
+        if (size != mapPositionsVec[i].y)
+            throw OpenMMException("updateParametersInContext: The size of a map has changed");
+        CMAPTorsionForceImpl::calcMapDerivatives(size, energy, c);
+        currentPosition += 4*size*size;
+        for (int j = 0; j < size*size; j++) {
+            coeffVec.push_back(make_float4((float) c[j][0], (float) c[j][1], (float) c[j][2], (float) c[j][3]));
+            coeffVec.push_back(make_float4((float) c[j][4], (float) c[j][5], (float) c[j][6], (float) c[j][7]));
+            coeffVec.push_back(make_float4((float) c[j][8], (float) c[j][9], (float) c[j][10], (float) c[j][11]));
+            coeffVec.push_back(make_float4((float) c[j][12], (float) c[j][13], (float) c[j][14], (float) c[j][15]));
+        }
+    }
+    coefficients->upload(coeffVec);
+
+    // Update the indices.
+
+    vector<int> torsionMapsVec(numTorsions);
+    for (int i = 0; i < numTorsions; i++) {
+        int index[8];
+        force.getTorsionParameters(i, torsionMapsVec[i], index[0], index[1], index[2], index[3], index[4], index[5], index[6], index[7]);
+    }
+    torsionMaps->upload(torsionMapsVec);
+}
+
 class CudaCustomTorsionForceInfo : public CudaForceInfo {
 public:
    CudaCustomTorsionForceInfo(const CustomTorsionForce& force) : force(force) {
@@ -5688,7 +5731,7 @@ string CudaIntegrateCustomStepKernel::createGlobalComputation(const string& vari
    variables["dt"] = "dt[0].y";
    variables["uniform"] = "uniform";
    variables["gaussian"] = "gaussian";
-    variables[energyName] = "energy[0]";
+    variables[energyName] = "energy";
    for (int i = 0; i < integrator.getNumGlobalVariables(); i++)
        variables[integrator.getGlobalVariableName(i)] = "globals["+cu.intToString(i)+"]";
    for (int i = 0; i < (int) parameterNames.size(); i++)
@@ -5724,7 +5767,7 @@ string CudaIntegrateCustomStepKernel::createPerDofComputation(const string& vari
    variables["m"] = "mass";
    variables["dt"] = "stepSize";
    if (energyName != "")
-        variables[energyName] = "energy[0]";
+        variables[energyName] = "energy";
    for (int i = 0; i < integrator.getNumGlobalVariables(); i++)
        variables[integrator.getGlobalVariableName(i)] = "globals["+cu.intToString(i)+"]";
    for (int i = 0; i < integrator.getNumPerDofVariables(); i++)
@@ -5957,7 +6000,10 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
                args1.push_back(NULL);
                args1.push_back(NULL);
                args1.push_back(NULL);
-                args1.push_back(&potentialEnergy->getDevicePointer());
+                if (cu.getUseDoublePrecision())
+                    args1.push_back(&energy);
+                else
+                    args1.push_back(&energyFloat);
                for (int i = 0; i < (int) perDofValues->getBuffers().size(); i++)
                    args1.push_back(&perDofValues->getBuffers()[i].getMemory());
                kernelArgs[step].push_back(args1);
@@ -6006,7 +6052,10 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
                args.push_back(&contextParameterValues->getDevicePointer());
                args.push_back(NULL);
                args.push_back(NULL);
-                args.push_back(&potentialEnergy->getDevicePointer());
+                if (cu.getUseDoublePrecision())
+                    args.push_back(&energy);
+                else
+                    args.push_back(&energyFloat);
                kernelArgs[step].push_back(args);
            }
            else if (stepType[step] == CustomIntegrator::ConstrainPositions) {
@@ -6046,13 +6095,6 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
        CUmodule randomProgram = cu.createModule(CudaKernelSources::customIntegrator, defines);
        randomKernel = cu.getKernel(randomProgram, "generateRandomNumbers");
        
-        // Create the kernel for summing the potential energy.
-
-        defines["SUM_OUTPUT_INDEX"] = "0";
-        defines["SUM_BUFFER_SIZE"] = cu.intToString(cu.getEnergyBuffer().getSize());
-        CUmodule module = cu.createModule(CudaKernelSources::customIntegrator, defines);
-        sumPotentialEnergyKernel = cu.getKernel(module, cu.getUseDoublePrecision() ? "computeDoubleSum" : "computeFloatSum");
-        
        // Create the kernel for computing kinetic energy.

        stringstream computeKE;
@@ -6074,10 +6116,11 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
            args << ", " << buffer.getType() << "* __restrict__ " << valueName;
        }
        replacements["PARAMETER_ARGUMENTS"] = args.str();
+        defines["SUM_OUTPUT_INDEX"] = "0";
        defines["SUM_BUFFER_SIZE"] = cu.intToString(3*numAtoms);
        if (defines.find("LOAD_POS_AS_DELTA") != defines.end())
            defines.erase("LOAD_POS_AS_DELTA");
-        module = cu.createModule(cu.replaceStrings(CudaKernelSources::customIntegratorPerDof, replacements), defines);
+        CUmodule module = cu.createModule(cu.replaceStrings(CudaKernelSources::customIntegratorPerDof, replacements), defines);
        kineticEnergyKernel = cu.getKernel(module, "computePerDof");
        kineticEnergyArgs.push_back(&cu.getPosq().getDevicePointer());
        kineticEnergyArgs.push_back(NULL);
@@ -6091,7 +6134,10 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
        kineticEnergyArgs.push_back(NULL);
        kineticEnergyArgs.push_back(NULL);
        kineticEnergyArgs.push_back(&uniformRandoms->getDevicePointer());
-        kineticEnergyArgs.push_back(&potentialEnergy->getDevicePointer());
+        if (cu.getUseDoublePrecision())
+            kineticEnergyArgs.push_back(&energy);
+        else
+            kineticEnergyArgs.push_back(&energyFloat);
        for (int i = 0; i < (int) perDofValues->getBuffers().size(); i++)
            kineticEnergyArgs.push_back(&perDofValues->getBuffers()[i].getMemory());
        keNeedsForce = usesVariable(keExpression, "f");
@@ -6197,11 +6243,8 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
            }
            else {
                recordChangedParameters(context);
-                context.calcForcesAndEnergy(computeForce, computeEnergy, forceGroup[i]);
-                if (computeEnergy) {
-                    void* args[] = {&cu.getEnergyBuffer().getDevicePointer(), &potentialEnergy->getDevicePointer()};
-                    cu.executeKernel(sumPotentialEnergyKernel, &args[0], CudaContext::ThreadBlockSize, CudaContext::ThreadBlockSize);
-                }
+                energy = context.calcForcesAndEnergy(computeForce, computeEnergy, forceGroup[i]);
+                energyFloat = (float) energy;
            }
            forcesAreValid = true;
        }
@@ -6272,11 +6315,8 @@ double CudaIntegrateCustomStepKernel::computeKineticEnergy(ContextImpl& context,
        bool willNeedEnergy = false;
        for (int i = 0; i < integrator.getNumComputations(); i++)
            willNeedEnergy |= needsEnergy[i];
-        context.calcForcesAndEnergy(true, willNeedEnergy, -1);
-        if (willNeedEnergy) {
-            void* args[] = {&cu.getEnergyBuffer().getDevicePointer(), &potentialEnergy->getDevicePointer()};
-            cu.executeKernel(sumPotentialEnergyKernel, &args[0], CudaContext::ThreadBlockSize, CudaContext::ThreadBlockSize);
-        }
+        energy = context.calcForcesAndEnergy(true, willNeedEnergy, -1);
+        energyFloat = (float) energy;
        forcesAreValid = true;
    }
    CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);

--- a/platforms/cuda/src/CudaParallelKernels.cpp
+++ b/platforms/cuda/src/CudaParallelKernels.cpp
@@ -536,6 +536,11 @@ double CudaParallelCalcCMAPTorsionForceKernel::execute(ContextImpl& context, boo
    return 0.0;
 }

+void CudaParallelCalcCMAPTorsionForceKernel::copyParametersToContext(ContextImpl& context, const CMAPTorsionForce& force) {
+    for (int i = 0; i < (int) kernels.size(); i++)
+        getKernel(i).copyParametersToContext(context, force);
+}
+
 class CudaParallelCalcCustomTorsionForceKernel::Task : public CudaContext::WorkTask {
 public:
    Task(ContextImpl& context, CudaCalcCustomTorsionForceKernel& kernel, bool includeForce,

--- a/platforms/cuda/src/kernels/customIntegratorGlobal.cu
+++ b/platforms/cuda/src/kernels/customIntegratorGlobal.cu
 extern "C" __global__ void computeGlobal(mixed2* __restrict__ dt, mixed* __restrict__ globals, mixed* __restrict__ params,
-        float uniform, float gaussian, const real* __restrict__ energy) {
+        float uniform, float gaussian, const real energy) {
    COMPUTE_STEP
 }
--- a/platforms/cuda/src/kernels/customIntegratorPerDof.cu
+++ b/platforms/cuda/src/kernels/customIntegratorPerDof.cu
@@ -34,7 +34,7 @@ inline __device__ mixed4 convertFromDouble4(double4 a) {
 extern "C" __global__ void computePerDof(real4* __restrict__ posq, real4* __restrict__ posqCorrection, mixed4* __restrict__ posDelta,
        mixed4* __restrict__ velm, const long long* __restrict__ force, const mixed2* __restrict__ dt, const mixed* __restrict__ globals,
        const mixed* __restrict__ params, mixed* __restrict__ sum, const float4* __restrict__ gaussianValues,
-        unsigned int gaussianBaseIndex, const float4* __restrict__ uniformValues, const real* __restrict__ energy
+        unsigned int gaussianBaseIndex, const float4* __restrict__ uniformValues, const real energy
        PARAMETER_ARGUMENTS) {
    mixed stepSize = dt[0].y;
    int index = blockIdx.x*blockDim.x+threadIdx.x;

--- a/platforms/cuda/tests/TestCudaCMAPTorsionForce.cpp
+++ b/platforms/cuda/tests/TestCudaCMAPTorsionForce.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2010-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2010-2015 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -105,11 +105,68 @@ void testCMAPTorsions() {
    }
 }

+void testChangingParameters() {
+    // Create a system with two maps and one torsion.
+
+    const int mapSize = 8;
+    System system;
+    for (int i = 0; i < 5; i++)
+        system.addParticle(1.0);
+    CMAPTorsionForce* cmap = new CMAPTorsionForce();
+    vector<double> mapEnergy1(mapSize*mapSize);
+    vector<double> mapEnergy2(mapSize*mapSize);
+    for (int i = 0; i < mapSize; i++) {
+        double angle1 = i*2*M_PI/mapSize;
+        double energy1 = cos(angle1);
+        for (int j = 0; j < mapSize; j++) {
+            double angle2 = j*2*M_PI/mapSize;
+            double energy2 = 10*sin(angle2);
+            mapEnergy1[i+j*mapSize] = energy1+energy2;
+            mapEnergy2[i+j*mapSize] = energy1-energy2;
+        }
+    }
+    cmap->addMap(mapSize, mapEnergy1);
+    cmap->addMap(mapSize, mapEnergy2);
+    cmap->addTorsion(0, 0, 1, 2, 3, 1, 2, 3, 4);
+    system.addForce(cmap);
+
+    // Set particle positions so angle1=0 and angle2=PI/4.
+
+    vector<Vec3> positions(5);
+    positions[0] = Vec3(0, 0, 1);
+    positions[1] = Vec3(0, 0, 0);
+    positions[2] = Vec3(1, 0, 0);
+    positions[3] = Vec3(1, 0, 1);
+    positions[4] = Vec3(0.5, -0.5, 1);
+    VerletIntegrator integrator(0.01);
+    Context context(system, integrator, platform);
+    context.setPositions(positions);
+
+    // Check that the energy is correct.
+
+    double energy = context.getState(State::Energy).getPotentialEnergy();
+    ASSERT_EQUAL_TOL(1+10*sin(M_PI/4), energy, 1e-5);
+
+    // Modify the parameters.
+
+    cmap->setTorsionParameters(0, 1, 0, 1, 2, 3, 1, 2, 3, 4);
+    for (int i = 0; i < mapSize*mapSize; i++)
+        mapEnergy2[i] *= 2.0;
+    cmap->setMapParameters(1, mapSize, mapEnergy2);
+    cmap->updateParametersInContext(context);
+
+    // See if the results are correct.
+
+    energy = context.getState(State::Energy).getPotentialEnergy();
+    ASSERT_EQUAL_TOL(2-20*sin(M_PI/4), energy, 1e-5);
+}
+
 int main(int argc, char* argv[]) {
    try {
        if (argc > 1)
            platform.setPropertyDefaultValue("CudaPrecision", string(argv[1]));
        testCMAPTorsions();
+        testChangingParameters();
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;

--- a/platforms/opencl/include/OpenCLFFT3D.h
+++ b/platforms/opencl/include/OpenCLFFT3D.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Portions copyright (c) 2009-2015 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -40,7 +40,7 @@ namespace OpenMM {
 * 15, 207–228 (2000).
 * <p>
 * This class places certain restrictions on the allowed dimensions of the grid.  First,
- * the size of each dimension may have no prime factors other than 2, 3, and 5.  You
+ * the size of each dimension may have no prime factors other than 2, 3, 5, and 7.  You
 * can call findLegalDimension() to determine the smallest size that satisfies this
 * requirement and is greater than or equal to a specified minimum size.  Second, the size
 * of each dimension must be small enough to compute each 1D transform entirely in local
@@ -61,12 +61,17 @@ public:
     * @param xsize   the first dimension of the data sets on which FFTs will be performed
     * @param ysize   the second dimension of the data sets on which FFTs will be performed
     * @param zsize   the third dimension of the data sets on which FFTs will be performed
+     * @param realToComplex  if true, a real-to-complex transform will be done.  Otherwise, it is complex-to-complex.
     */
-    OpenCLFFT3D(OpenCLContext& context, int xsize, int ysize, int zsize);
+    OpenCLFFT3D(OpenCLContext& context, int xsize, int ysize, int zsize, bool realToComplex=false);
    /**
     * Perform a Fourier transform.  The transform cannot be done in-place: the input and output
     * arrays must be different.  Also, the input array is used as workspace, so its contents
-     * are destroyed.
+     * are destroyed.  This also means that both arrays must be large enough to hold complex values,
+     * even when performing a real-to-complex transform.
+     * <p>
+     * When performing a real-to-complex transform, the output data is of size xsize*ysize*(zsize/2+1)
+     * and contains only the non-redundant elements.
     *
     * @param in       the data to transform, ordered such that in[x*ysize*zsize + y*zsize + z] contains element (x, y, z)
     * @param out      on exit, this contains the transformed data
@@ -75,17 +80,20 @@ public:
    void execFFT(OpenCLArray& in, OpenCLArray& out, bool forward = true);
    /**
     * Get the smallest legal size for a dimension of the grid (that is, a size with no prime
-     * factors other than 2, 3, and 5).
+     * factors other than 2, 3, 5, and 7).
     *
     * @param minimum   the minimum size the return value must be greater than or equal to
     */
    static int findLegalDimension(int minimum);
 private:
-    cl::Kernel createKernel(int xsize, int ysize, int zsize, int& threads);
+    cl::Kernel createKernel(int xsize, int ysize, int zsize, int& threads, int axis, bool forward, bool inputIsReal);
    int xsize, ysize, zsize;
    int xthreads, ythreads, zthreads;
+    bool packRealAsComplex;
    OpenCLContext& context;
    cl::Kernel xkernel, ykernel, zkernel;
+    cl::Kernel invxkernel, invykernel, invzkernel;
+    cl::Kernel packForwardKernel, unpackForwardKernel, packBackwardKernel, unpackBackwardKernel;
 };

 } // namespace OpenMM

--- a/platforms/opencl/include/OpenCLKernels.h
+++ b/platforms/opencl/include/OpenCLKernels.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2013 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2015 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -498,11 +498,19 @@ public:
     * @return the potential energy due to the force
     */
    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+    /**
+     * Copy changed parameters over to a context.
+     *
+     * @param context    the context to copy parameters to
+     * @param force      the CMAPTorsionForce to copy the parameters from
+     */
+    void copyParametersToContext(ContextImpl& context, const CMAPTorsionForce& force);
 private:
    int numTorsions;
    bool hasInitializedKernel;
    OpenCLContext& cl;
    const System& system;
+    std::vector<mm_int2> mapPositionsVec;
    OpenCLArray* coefficients;
    OpenCLArray* mapPositions;
    OpenCLArray* torsionMaps;
@@ -631,6 +639,7 @@ private:
    cl::Kernel pmeSpreadChargeKernel;
    cl::Kernel pmeFinishSpreadChargeKernel;
    cl::Kernel pmeConvolutionKernel;
+    cl::Kernel pmeEvalEnergyKernel;
    cl::Kernel pmeInterpolateForceKernel;
    std::map<std::string, std::string> pmeDefines;
    std::vector<std::pair<int, int> > exceptionAtoms;
@@ -1264,7 +1273,7 @@ private:
    void prepareForComputation(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid);
    void recordChangedParameters(ContextImpl& context);
    OpenCLContext& cl;
-    double prevStepSize;
+    double prevStepSize, energy;
    int numGlobalVariables;
    bool hasInitializedKernels, deviceValuesAreCurrent, modifiesParameters, keNeedsForce;
    mutable bool localValuesAreCurrent;
@@ -1284,7 +1293,7 @@ private:
    std::vector<double> contextValuesDouble;
    std::vector<float> contextValues;
    std::vector<std::vector<cl::Kernel> > kernels;
-    cl::Kernel sumPotentialEnergyKernel, randomKernel, kineticEnergyKernel, sumKineticEnergyKernel;
+    cl::Kernel randomKernel, kineticEnergyKernel, sumKineticEnergyKernel;
    std::vector<CustomIntegrator::ComputationType> stepType;
    std::vector<bool> needsForces;
    std::vector<bool> needsEnergy;