Merge branch 'master' into psfinscode

b21e3182 · Jason Swails · 7b30da6e · 3946c025 · b21e3182 · b21e3182
Commit b21e3182 authored Apr 20, 2015 by Jason Swails
20 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -296,7 +296,9 @@ SET(OPENMM_BUILD_SHARED_LIB ON CACHE BOOL "Whether to build shared OpenMM librar

 SET(EXTRA_LINK_FLAGS ${EXTRA_COMPILE_FLAGS})
 IF (CMAKE_SYSTEM_NAME MATCHES "Linux")
-    SET(EXTRA_LINK_FLAGS "${EXTRA_LINK_FLAGS} -Wl,--no-as-needed -lrt")
+    IF (NOT ANDROID)
+        SET(EXTRA_LINK_FLAGS "${EXTRA_LINK_FLAGS} -Wl,--no-as-needed -lrt")
+    ENDIF (NOT ANDROID)
 ENDIF (CMAKE_SYSTEM_NAME MATCHES "Linux")

 IF(OPENMM_BUILD_SHARED_LIB)
@@ -403,11 +405,11 @@ MARK_AS_ADVANCED(CUDA_BUILD_CUBIN)
 MARK_AS_ADVANCED(CUDA_BUILD_EMULATION)

 FIND_PACKAGE(OpenCL QUIET)
-IF(OPENCL_FOUND AND NOT APPLE)
+IF(OPENCL_FOUND)
    SET(OPENMM_BUILD_OPENCL_LIB ON CACHE BOOL "Build OpenMMOpenCL library")
-ELSE(OPENCL_FOUND AND NOT APPLE)
+ELSE(OPENCL_FOUND)
    SET(OPENMM_BUILD_OPENCL_LIB OFF CACHE BOOL "Build OpenMMOpenCL library")
-ENDIF(OPENCL_FOUND AND NOT APPLE)
+ENDIF(OPENCL_FOUND)
 IF(OPENMM_BUILD_OPENCL_LIB)
    ADD_SUBDIRECTORY(platforms/opencl)
 ENDIF(OPENMM_BUILD_OPENCL_LIB)

--- a/openmmapi/include/openmm/internal/vectorize8.h
+++ b/openmmapi/include/openmm/internal/vectorize8.h
@@ -191,6 +191,18 @@ static inline fvec8 sqrt(const fvec8& v) {
    return fvec8(_mm256_sqrt_ps(v.val));
 }

+static inline fvec8 rsqrt(const fvec8& v) {
+    // Initial estimate of rsqrt().
+
+    fvec8 y(_mm256_rsqrt_ps(v.val));
+
+    // Perform an iteration of Newton refinement.
+
+    fvec8 x2 = v*0.5f;
+    y *= fvec8(1.5f)-x2*y*y;
+    return y;
+}
+
 static inline float dot8(const fvec8& v1, const fvec8& v2) {
    fvec8 result = _mm256_dp_ps(v1, v2, 0xF1);
    return _mm_cvtss_f32(result.lowerVec())+_mm_cvtss_f32(result.upperVec());

--- a/openmmapi/include/openmm/internal/vectorize_neon.h
+++ b/openmmapi/include/openmm/internal/vectorize_neon.h
@@ -251,11 +251,15 @@ static inline fvec4 abs(const fvec4& v) {
    return vabsq_f32(v);
 }

-static inline fvec4 sqrt(const fvec4& v) {
+static inline fvec4 rsqrt(const fvec4& v) {
    float32x4_t recipSqrt = vrsqrteq_f32(v);
    recipSqrt = vmulq_f32(recipSqrt, vrsqrtsq_f32(vmulq_f32(recipSqrt, v), recipSqrt));
    recipSqrt = vmulq_f32(recipSqrt, vrsqrtsq_f32(vmulq_f32(recipSqrt, v), recipSqrt));
-    return vmulq_f32(v, recipSqrt);
+    return recipSqrt;
+}
+
+static inline fvec4 sqrt(const fvec4& v) {
+    return rsqrt(v)*v;
 }

 static inline float dot3(const fvec4& v1, const fvec4& v2) {

--- a/openmmapi/include/openmm/internal/vectorize_pnacl.h
+++ b/openmmapi/include/openmm/internal/vectorize_pnacl.h
@@ -330,7 +330,7 @@ static inline fvec4 ceil(const fvec4& v) {
    return truncated + blend(0.0f, 1.0f, truncated<v);
 }

-static inline fvec4 sqrt(const fvec4& v) {
+static inline fvec4 rsqrt(const fvec4& v) {
    // Initial estimate of rsqrt().

    ivec4 i = (__m128i) v;
@@ -343,7 +343,11 @@ static inline fvec4 sqrt(const fvec4& v) {
    y *= 1.5f-x2*y*y;
    y *= 1.5f-x2*y*y;
    y *= 1.5f-x2*y*y;
-    return y*v;
+    return y;
+}
+
+static inline fvec4 sqrt(const fvec4& v) {
+    return rsqrt(v)*v;
 }

 #endif /*OPENMM_VECTORIZE_PNACL_H_*/

--- a/openmmapi/include/openmm/internal/vectorize_sse.h
+++ b/openmmapi/include/openmm/internal/vectorize_sse.h
@@ -241,6 +241,18 @@ static inline fvec4 sqrt(const fvec4& v) {
    return fvec4(_mm_sqrt_ps(v.val));
 }

+static inline fvec4 rsqrt(const fvec4& v) {
+    // Initial estimate of rsqrt().
+
+    fvec4 y(_mm_rsqrt_ps(v.val));
+
+    // Perform an iteration of Newton refinement.
+
+    fvec4 x2 = v*0.5f;
+    y *= fvec4(1.5f)-x2*y*y;
+    return y;
+}
+
 static inline float dot3(const fvec4& v1, const fvec4& v2) {
    return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0x71));
 }

--- a/platforms/cpu/include/CpuNeighborList.h
+++ b/platforms/cpu/include/CpuNeighborList.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Portions copyright (c) 2013-2015 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -61,6 +61,7 @@ public:
 private:
    int blockSize;
    std::vector<int> sortedAtoms;
+    std::vector<float> sortedPositions;
    std::vector<std::vector<int> > blockNeighbors;
    std::vector<std::vector<char> > blockExclusions;
    // The following variables are used to make information accessible to the individual threads.

--- a/platforms/cpu/include/CpuNonbondedForceVec4.h
+++ b/platforms/cpu/include/CpuNonbondedForceVec4.h
@@ -56,8 +56,8 @@ protected:
      /**
       * Templatized implementation of calculateBlockIxn.
       */
-      template <bool TRICLINIC>
-      void calculateBlockIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
+      template <int PERIODIC_TYPE>
+      void calculateBlockIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter);
            
      /**---------------------------------------------------------------------------------------
      
@@ -74,15 +74,15 @@ protected:
      /**
       * Templatized implementation of calculateBlockEwaldIxn.
       */
-      template <bool TRICLINIC>
-      void calculateBlockEwaldIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
+      template <int PERIODIC_TYPE>
+      void calculateBlockEwaldIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter);

      /**
       * Compute the displacement and squared distance between a collection of points, optionally using
       * periodic boundary conditions.
       */
-      template <bool TRICLINIC>
-      void getDeltaR(const float* posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;
+      template <int PERIODIC_TYPE>
+      void getDeltaR(const fvec4& posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;

      /**
       * Compute a fast approximation to erfc(x).

--- a/platforms/cpu/include/CpuNonbondedForceVec8.h
+++ b/platforms/cpu/include/CpuNonbondedForceVec8.h
@@ -55,8 +55,8 @@ protected:
      /**
       * Templatized implementation of calculateBlockIxn.
       */
-      template <bool TRICLINIC>
-      void calculateBlockIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
+      template <int PERIODIC_TYPE>
+      void calculateBlockIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter);
            
      /**---------------------------------------------------------------------------------------
      
@@ -73,15 +73,15 @@ protected:
      /**
       * Templatized implementation of calculateBlockEwaldIxn.
       */
-      template <bool TRICLINIC>
-      void calculateBlockEwaldIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
+      template <int PERIODIC_TYPE>
+      void calculateBlockEwaldIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter);

      /**
       * Compute the displacement and squared distance between a collection of points, optionally using
       * periodic boundary conditions.
       */
-      template <bool TRICLINIC>
-      void getDeltaR(const float* posI, const fvec8& x, const fvec8& y, const fvec8& z, fvec8& dx, fvec8& dy, fvec8& dz, fvec8& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;
+      template <int PERIODIC_TYPE>
+      void getDeltaR(const fvec4& posI, const fvec8& x, const fvec8& y, const fvec8& z, fvec8& dx, fvec8& dy, fvec8& dz, fvec8& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;

      /**
       * Compute a fast approximation to erfc(x).

--- a/platforms/cpu/src/CpuCustomManyParticleForce.cpp
+++ b/platforms/cpu/src/CpuCustomManyParticleForce.cpp
@@ -199,7 +199,7 @@ void CpuCustomManyParticleForce::setUseCutoff(RealOpenMM distance) {
 }

 void CpuCustomManyParticleForce::setPeriodic(RealVec* periodicBoxVectors) {
-    assert(cutoff);
+    assert(useCutoff);
    assert(periodicBoxVectors[0][0] >= 2.0*cutoffDistance);
    assert(periodicBoxVectors[1][1] >= 2.0*cutoffDistance);
    assert(periodicBoxVectors[2][2] >= 2.0*cutoffDistance);

--- a/platforms/cpu/src/CpuNeighborList.cpp
+++ b/platforms/cpu/src/CpuNeighborList.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Portions copyright (c) 2013-2015 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -164,8 +164,8 @@ public:
        
        return VoxelIndex(y, z);
    }
-
-    void getNeighbors(vector<int>& neighbors, int blockIndex, const fvec4& blockCenter, const fvec4& blockWidth, const vector<int>& sortedAtoms, vector<char>& exclusions, float maxDistance, const vector<int>& blockAtoms, const float* atomLocations, const vector<VoxelIndex>& atomVoxelIndex) const {
+        
+    void getNeighbors(vector<int>& neighbors, int blockIndex, const fvec4& blockCenter, const fvec4& blockWidth, const vector<int>& sortedAtoms, vector<char>& exclusions, float maxDistance, const vector<int>& blockAtoms, const vector<float>& blockAtomX, const vector<float>& blockAtomY, const vector<float>& blockAtomZ, const vector<float>& sortedPositions, const vector<VoxelIndex>& atomVoxelIndex) const {
        neighbors.resize(0);
        exclusions.resize(0);
        fvec4 boxSize(periodicBoxSize[0], periodicBoxSize[1], periodicBoxSize[2], 0);
@@ -233,24 +233,34 @@ public:
                
                float minx = centerPos[0];
                float maxx = centerPos[0];
-                fvec4 offset(-xoffset, -yoffset+voxelSizeY*y+(usePeriodic ? 0.0f : miny), voxelSizeZ*z+(usePeriodic ? 0.0f : minz), 0);
-                for (int k = 0; k < (int) blockAtoms.size(); k++) {
-                    const float* atomPos = &atomLocations[4*blockAtoms[k]];
-                    fvec4 posVec(atomPos);
-                    fvec4 delta1 = offset-posVec;
-                    fvec4 delta2 = delta1+fvec4(0, voxelSizeY, voxelSizeZ, 0);
-                    if (usePeriodic) {
-                        delta1 -= round(delta1*invBoxSize)*boxSize;
-                        delta2 -= round(delta2*invBoxSize)*boxSize;
+                float offset[3] = {-xoffset, -yoffset+voxelSizeY*y+(usePeriodic ? 0.0f : miny), voxelSizeZ*z+(usePeriodic ? 0.0f : minz)};
+                for (int k = 0; k < (int) blockAtoms.size(); k += 4) {
+                    fvec4 dist2 = maxDistanceSquared;
+                    if (y != atomVoxelIndex[k].y) {
+                        fvec4 dy1 = offset[1]-fvec4(&blockAtomY[k]);
+                        fvec4 dy2 = dy1+voxelSizeY;
+                        if (usePeriodic) {
+                            dy1 -= round(dy1*invBoxSize[1])*boxSize[1];
+                            dy2 -= round(dy2*invBoxSize[1])*boxSize[1];
+                        }
+                        fvec4 dy = min(abs(dy1), abs(dy2));
+                        dist2 -= dy*dy;
+                    }
+                    if (z != atomVoxelIndex[k].z) {
+                        fvec4 dz1 = offset[2]-fvec4(&blockAtomZ[k]);
+                        fvec4 dz2 = dz1+voxelSizeZ;
+                        if (usePeriodic) {
+                            dz1 -= round(dz1*invBoxSize[2])*boxSize[2];
+                            dz2 -= round(dz2*invBoxSize[2])*boxSize[2];
+                        }
+                        fvec4 dz = min(abs(dz1), abs(dz2));
+                        dist2 -= dz*dz;
                    }
-                    fvec4 delta = min(abs(delta1), abs(delta2));
-                    float dy = (y == atomVoxelIndex[k].y ? 0.0f : delta[1]);
-                    float dz = (z == atomVoxelIndex[k].z ? 0.0f : delta[2]);
-                    float dist2 = maxDistanceSquared-dy*dy-dz*dz;
-                    if (dist2 > 0) {
-                        float dist = sqrtf(dist2);
-                        minx = min(minx, atomPos[0]-dist-xoffset);
-                        maxx = max(maxx, atomPos[0]+dist-xoffset);
+                    fvec4 dist = sqrt(dist2);
+                    int numToCheck = min(4, (int) (blockAtoms.size()-k));
+                    for (int m = 0; m < numToCheck; m++) {
+                        minx = min(minx, blockAtomX[k+m]-dist[m]-xoffset);
+                        maxx = max(maxx, blockAtomX[k+m]+dist[m]-xoffset);
                    }
                }
                if (minx == maxx)
@@ -290,12 +300,10 @@ public:
                        if (sortedIndex >= lastSortedIndex)
                            continue;
                        
-                        fvec4 atomPos(atomLocations+4*sortedAtoms[sortedIndex]);
+                        fvec4 atomPos(&sortedPositions[4*sortedIndex]);
                        fvec4 delta = atomPos-centerPos;
-                        if (periodicRectangular) {
-                            fvec4 base = round(delta*invBoxSize)*boxSize;
-                            delta = delta-base;
-                        }
+                        if (periodicRectangular)
+                            delta -= round(delta*invBoxSize)*boxSize;
                        else if (needPeriodic) {
                            delta -= periodicBoxVec4[2]*floorf(delta[2]*recipBoxSize[2]+0.5f);
                            delta -= periodicBoxVec4[1]*floorf(delta[1]*recipBoxSize[1]+0.5f);
@@ -310,26 +318,34 @@ public:
                            // The distance is large enough that there might not be any actual interactions.
                            // Check individual atom pairs to be sure.
                            
-                            bool any = false;
-                            for (int k = 0; k < (int) blockAtoms.size(); k++) {
-                                fvec4 pos1(&atomLocations[4*blockAtoms[k]]);
-                                delta = atomPos-pos1;
+                            bool anyInteraction = false;
+                            for (int k = 0; k < (int) blockAtoms.size(); k += 4) {
+                                fvec4 dx = fvec4(&blockAtomX[k])-atomPos[0];
+                                fvec4 dy = fvec4(&blockAtomY[k])-atomPos[1];
+                                fvec4 dz = fvec4(&blockAtomZ[k])-atomPos[2];
                                if (periodicRectangular) {
-                                    fvec4 base = round(delta*invBoxSize)*boxSize;
-                                    delta = delta-base;
+                                    dx -= round(dx*invBoxSize[0])*boxSize[0];
+                                    dy -= round(dy*invBoxSize[1])*boxSize[1];
+                                    dz -= round(dz*invBoxSize[2])*boxSize[2];
                                }
                                else if (needPeriodic) {
-                                    delta -= periodicBoxVec4[2]*floorf(delta[2]*recipBoxSize[2]+0.5f);
-                                    delta -= periodicBoxVec4[1]*floorf(delta[1]*recipBoxSize[1]+0.5f);
-                                    delta -= periodicBoxVec4[0]*floorf(delta[0]*recipBoxSize[0]+0.5f);
+                                    fvec4 scale3 = floor(dz*recipBoxSize[2]+0.5f);
+                                    dx -= scale3*periodicBoxVectors[2][0];
+                                    dy -= scale3*periodicBoxVectors[2][1];
+                                    dz -= scale3*periodicBoxVectors[2][2];
+                                    fvec4 scale2 = floor(dy*recipBoxSize[1]+0.5f);
+                                    dx -= scale2*periodicBoxVectors[1][0];
+                                    dy -= scale2*periodicBoxVectors[1][1];
+                                    fvec4 scale1 = floor(dx*recipBoxSize[0]+0.5f);
+                                    dx -= scale1*periodicBoxVectors[0][0];
                                }
-                                float r2 = dot3(delta, delta);
-                                if (r2 < maxDistanceSquared) {
-                                    any = true;
+                                fvec4 r2 = dx*dx + dy*dy + dz*dz;
+                                if (any(r2 < maxDistanceSquared)) {
+                                    anyInteraction = true;
                                    break;
                                }
                            }
-                            if (!any)
+                            if (!anyInteraction)
                                continue;
                        }
                        
@@ -379,6 +395,7 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float
    blockNeighbors.resize(numBlocks);
    blockExclusions.resize(numBlocks);
    sortedAtoms.resize(numAtoms);
+    sortedPositions.resize(4*numAtoms);
    
    // Record the parameters for the threads.
    
@@ -428,6 +445,8 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float
    for (int i = 0; i < numAtoms; i++) {
        int atomIndex = atomBins[i].second;
        sortedAtoms[i] = atomIndex;
+        fvec4 atomPos(&atomLocations[4*atomIndex]);
+        atomPos.store(&sortedPositions[4*i]);
        voxels.insert(i, &atomLocations[4*atomIndex]);
    }
    voxels.sortItems();
@@ -489,6 +508,7 @@ void CpuNeighborList::threadComputeNeighborList(ThreadPool& threads, int threadI

    int numBlocks = blockNeighbors.size();
    vector<int> blockAtoms;
+    vector<float> blockAtomX(blockSize), blockAtomY(blockSize), blockAtomZ(blockSize);
    vector<VoxelIndex> atomVoxelIndex;
    for (int i = threadIndex; i < numBlocks; i += numThreads) {
        // Find the atoms in this block and compute their bounding box.
@@ -501,14 +521,24 @@ void CpuNeighborList::threadComputeNeighborList(ThreadPool& threads, int threadI
            blockAtoms[j] = sortedAtoms[firstIndex+j];
            atomVoxelIndex[j] = voxels->getVoxelIndex(&atomLocations[4*blockAtoms[j]]);
        }
-        fvec4 minPos(&atomLocations[4*sortedAtoms[firstIndex]]);
+        fvec4 minPos(&sortedPositions[4*firstIndex]);
        fvec4 maxPos = minPos;
        for (int j = 1; j < atomsInBlock; j++) {
-            fvec4 pos(&atomLocations[4*sortedAtoms[firstIndex+j]]);
+            fvec4 pos(&sortedPositions[4*(firstIndex+j)]);
            minPos = min(minPos, pos);
            maxPos = max(maxPos, pos);
        }
-        voxels->getNeighbors(blockNeighbors[i], i, (maxPos+minPos)*0.5f, (maxPos-minPos)*0.5f, sortedAtoms, blockExclusions[i], maxDistance, blockAtoms, atomLocations, atomVoxelIndex);
+        for (int j = 0; j < atomsInBlock; j++) {
+            blockAtomX[j] = sortedPositions[4*(firstIndex+j)];
+            blockAtomY[j] = sortedPositions[4*(firstIndex+j)+1];
+            blockAtomZ[j] = sortedPositions[4*(firstIndex+j)+2];
+        }
+        for (int j = atomsInBlock; j < blockSize; j++) {
+            blockAtomX[j] = 1e10;
+            blockAtomY[j] = 1e10;
+            blockAtomZ[j] = 1e10;
+        }
+        voxels->getNeighbors(blockNeighbors[i], i, (maxPos+minPos)*0.5f, (maxPos-minPos)*0.5f, sortedAtoms, blockExclusions[i], maxDistance, blockAtoms, blockAtomX, blockAtomY, blockAtomZ, sortedPositions, atomVoxelIndex);

        // Record the exclusions for this block.


--- a/platforms/cpu/src/CpuNonbondedForceVec4.cpp
+++ b/platforms/cpu/src/CpuNonbondedForceVec4.cpp
@@ -44,30 +44,76 @@ CpuNonbondedForce* createCpuNonbondedForceVec4() {
 CpuNonbondedForceVec4::CpuNonbondedForceVec4() {
 }

+enum PeriodicType {NoPeriodic, PeriodicPerAtom, PeriodicPerInteraction, PeriodicTriclinic};
+
 void CpuNonbondedForceVec4::calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
-    if (triclinic)
-        calculateBlockIxnImpl<true>(blockIndex, forces, totalEnergy, boxSize, invBoxSize);
-    else
-        calculateBlockIxnImpl<false>(blockIndex, forces, totalEnergy, boxSize, invBoxSize);
+    // Determine whether we need to apply periodic boundary conditions.
+    
+    PeriodicType periodicType;
+    fvec4 blockCenter;
+    if (!periodic) {
+        periodicType = NoPeriodic;
+        blockCenter = 0.0f;
+    }
+    else {
+        const int* blockAtom = &neighborList->getSortedAtoms()[4*blockIndex];
+        float minx, maxx, miny, maxy, minz, maxz;
+        minx = maxx = posq[4*blockAtom[0]];
+        miny = maxy = posq[4*blockAtom[0]+1];
+        minz = maxz = posq[4*blockAtom[0]+2];
+        for (int i = 1; i < 4; i++) {
+            minx = min(minx, posq[4*blockAtom[i]]);
+            maxx = max(maxx, posq[4*blockAtom[i]]);
+            miny = min(miny, posq[4*blockAtom[i]+1]);
+            maxy = max(maxy, posq[4*blockAtom[i]+1]);
+            minz = min(minz, posq[4*blockAtom[i]+2]);
+            maxz = max(maxz, posq[4*blockAtom[i]+2]);
+        }
+        blockCenter = fvec4(0.5f*(minx+maxx), 0.5f*(miny+maxy), 0.5f*(minz+maxz), 0.0f);
+        if (!(minx < cutoffDistance || miny < cutoffDistance || minz < cutoffDistance ||
+                maxx > boxSize[0]-cutoffDistance || maxy > boxSize[1]-cutoffDistance || maxz > boxSize[2]-cutoffDistance))
+            periodicType = NoPeriodic;
+        else if (triclinic)
+            periodicType = PeriodicTriclinic;
+        else if (0.5f*(boxSize[0]-(maxx-minx)) >= cutoffDistance &&
+                 0.5f*(boxSize[1]-(maxy-miny)) >= cutoffDistance &&
+                 0.5f*(boxSize[2]-(maxz-minz)) >= cutoffDistance)
+            periodicType = PeriodicPerAtom;
+        else
+            periodicType = PeriodicPerInteraction;
+    }
+    
+    // Call the appropriate version depending on what calculation is required for periodic boundary conditions.
+    
+    if (periodicType == NoPeriodic)
+        calculateBlockIxnImpl<NoPeriodic>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
+    else if (periodicType == PeriodicPerAtom)
+        calculateBlockIxnImpl<PeriodicPerAtom>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
+    else if (periodicType == PeriodicPerInteraction)
+        calculateBlockIxnImpl<PeriodicPerInteraction>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
+    else if (periodicType == PeriodicTriclinic)
+        calculateBlockIxnImpl<PeriodicTriclinic>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
 }

-template <bool TRICLINIC>
-void CpuNonbondedForceVec4::calculateBlockIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
+template <int PERIODIC_TYPE>
+void CpuNonbondedForceVec4::calculateBlockIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter) {
    // Load the positions and parameters of the atoms in the block.
    
    const int* blockAtom = &neighborList->getSortedAtoms()[4*blockIndex];
    fvec4 blockAtomPosq[4];
    fvec4 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
-    for (int i = 0; i < 4; i++)
+    for (int i = 0; i < 4; i++) {
        blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
+        if (PERIODIC_TYPE == PeriodicPerAtom)
+            blockAtomPosq[i] -= floor((blockAtomPosq[i]-blockCenter)*invBoxSize+0.5f)*boxSize;
+    }
    fvec4 blockAtomX = fvec4(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0]);
    fvec4 blockAtomY = fvec4(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1]);
    fvec4 blockAtomZ = fvec4(blockAtomPosq[0][2], blockAtomPosq[1][2], blockAtomPosq[2][2], blockAtomPosq[3][2]);
    fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
    fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
    fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
-    bool needPeriodic = (periodic && (any(blockAtomX < cutoffDistance) || any(blockAtomY < cutoffDistance) || any(blockAtomZ < cutoffDistance) ||
-            any(blockAtomX > boxSize[0]-cutoffDistance) || any(blockAtomY > boxSize[1]-cutoffDistance) || any(blockAtomZ > boxSize[2]-cutoffDistance)));
+    const bool needPeriodic = (PERIODIC_TYPE == PeriodicPerInteraction || PERIODIC_TYPE == PeriodicTriclinic);
    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
    
    // Loop over neighbors for this block.
@@ -82,7 +128,10 @@ void CpuNonbondedForceVec4::calculateBlockIxnImpl(int blockIndex, float* forces,
        // Compute the distances to the block atoms.
        
        fvec4 dx, dy, dz, r2;
-        getDeltaR<TRICLINIC>(posq+4*atom, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
+        fvec4 atomPos(posq+4*atom);
+        if (PERIODIC_TYPE == PeriodicPerAtom)
+            atomPos -= floor((atomPos-blockCenter)*invBoxSize+0.5f)*boxSize;
+        getDeltaR<PERIODIC_TYPE>(atomPos, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
        ivec4 include;
        char excl = exclusions[i];
        if (excl == 0)
@@ -95,8 +144,7 @@ void CpuNonbondedForceVec4::calculateBlockIxnImpl(int blockIndex, float* forces,
        
        // Compute the interactions.
        
-        fvec4 r = sqrt(r2);
-        fvec4 inverseR = fvec4(1.0f)/r;
+        fvec4 inverseR = rsqrt(r2);
        fvec4 energy, dEdR;
        float atomEpsilon = atomParameters[atom].second;
        if (atomEpsilon != 0.0f) {
@@ -108,6 +156,7 @@ void CpuNonbondedForceVec4::calculateBlockIxnImpl(int blockIndex, float* forces,
            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
            energy = epsSig6*(sig6-1.0f);
            if (useSwitch) {
+                fvec4 r = r2*inverseR;
                fvec4 t = blend(0.0f, (r-switchingDistance)*invSwitchingInterval, r>switchingDistance);
                fvec4 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
                fvec4 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
@@ -162,29 +211,73 @@ void CpuNonbondedForceVec4::calculateBlockIxnImpl(int blockIndex, float* forces,
  }

 void CpuNonbondedForceVec4::calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
-    if (triclinic)
-        calculateBlockEwaldIxnImpl<true>(blockIndex, forces, totalEnergy, boxSize, invBoxSize);
-    else
-        calculateBlockEwaldIxnImpl<false>(blockIndex, forces, totalEnergy, boxSize, invBoxSize);
+    // Determine whether we need to apply periodic boundary conditions.
+    
+    PeriodicType periodicType;
+    fvec4 blockCenter;
+    if (!periodic) {
+        periodicType = NoPeriodic;
+        blockCenter = 0.0f;
+    }
+    else {
+        const int* blockAtom = &neighborList->getSortedAtoms()[4*blockIndex];
+        float minx, maxx, miny, maxy, minz, maxz;
+        minx = maxx = posq[4*blockAtom[0]];
+        miny = maxy = posq[4*blockAtom[0]+1];
+        minz = maxz = posq[4*blockAtom[0]+2];
+        for (int i = 1; i < 4; i++) {
+            minx = min(minx, posq[4*blockAtom[i]]);
+            maxx = max(maxx, posq[4*blockAtom[i]]);
+            miny = min(miny, posq[4*blockAtom[i]+1]);
+            maxy = max(maxy, posq[4*blockAtom[i]+1]);
+            minz = min(minz, posq[4*blockAtom[i]+2]);
+            maxz = max(maxz, posq[4*blockAtom[i]+2]);
+        }
+        blockCenter = fvec4(0.5f*(minx+maxx), 0.5f*(miny+maxy), 0.5f*(minz+maxz), 0.0f);
+        if (!(minx < cutoffDistance || miny < cutoffDistance || minz < cutoffDistance ||
+                maxx > boxSize[0]-cutoffDistance || maxy > boxSize[1]-cutoffDistance || maxz > boxSize[2]-cutoffDistance))
+            periodicType = NoPeriodic;
+        else if (triclinic)
+            periodicType = PeriodicTriclinic;
+        else if (0.5f*(boxSize[0]-(maxx-minx)) >= cutoffDistance &&
+                 0.5f*(boxSize[1]-(maxy-miny)) >= cutoffDistance &&
+                 0.5f*(boxSize[2]-(maxz-minz)) >= cutoffDistance)
+            periodicType = PeriodicPerAtom;
+        else
+            periodicType = PeriodicPerInteraction;
+    }
+    
+    // Call the appropriate version depending on what calculation is required for periodic boundary conditions.
+    
+    if (periodicType == NoPeriodic)
+        calculateBlockEwaldIxnImpl<NoPeriodic>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
+    else if (periodicType == PeriodicPerAtom)
+        calculateBlockEwaldIxnImpl<PeriodicPerAtom>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
+    else if (periodicType == PeriodicPerInteraction)
+        calculateBlockEwaldIxnImpl<PeriodicPerInteraction>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
+    else if (periodicType == PeriodicTriclinic)
+        calculateBlockEwaldIxnImpl<PeriodicTriclinic>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
 }

-template <bool TRICLINIC>
-void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
+template <int PERIODIC_TYPE>
+void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter) {
    // Load the positions and parameters of the atoms in the block.
    
    const int* blockAtom = &neighborList->getSortedAtoms()[4*blockIndex];
    fvec4 blockAtomPosq[4];
    fvec4 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
-    for (int i = 0; i < 4; i++)
+    for (int i = 0; i < 4; i++) {
        blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
+        if (PERIODIC_TYPE == PeriodicPerAtom)
+            blockAtomPosq[i] -= floor((blockAtomPosq[i]-blockCenter)*invBoxSize+0.5f)*boxSize;
+    }
    fvec4 blockAtomX = fvec4(blockAtomPosq[0][0], blockAtomPosq[1][0], blockAtomPosq[2][0], blockAtomPosq[3][0]);
    fvec4 blockAtomY = fvec4(blockAtomPosq[0][1], blockAtomPosq[1][1], blockAtomPosq[2][1], blockAtomPosq[3][1]);
    fvec4 blockAtomZ = fvec4(blockAtomPosq[0][2], blockAtomPosq[1][2], blockAtomPosq[2][2], blockAtomPosq[3][2]);
    fvec4 blockAtomCharge = fvec4(ONE_4PI_EPS0)*fvec4(blockAtomPosq[0][3], blockAtomPosq[1][3], blockAtomPosq[2][3], blockAtomPosq[3][3]);
    fvec4 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first);
    fvec4 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second);
-    bool needPeriodic = (periodic && (any(blockAtomX < cutoffDistance) || any(blockAtomY < cutoffDistance) || any(blockAtomZ < cutoffDistance) ||
-            any(blockAtomX > boxSize[0]-cutoffDistance) || any(blockAtomY > boxSize[1]-cutoffDistance) || any(blockAtomZ > boxSize[2]-cutoffDistance)));
+    const bool needPeriodic = (PERIODIC_TYPE == PeriodicPerInteraction || PERIODIC_TYPE == PeriodicTriclinic);
    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
    
    // Loop over neighbors for this block.
@@ -199,7 +292,10 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
        // Compute the distances to the block atoms.
        
        fvec4 dx, dy, dz, r2;
-        getDeltaR<TRICLINIC>(posq+4*atom, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
+        fvec4 atomPos(posq+4*atom);
+        if (PERIODIC_TYPE == PeriodicPerAtom)
+            atomPos -= floor((atomPos-blockCenter)*invBoxSize+0.5f)*boxSize;
+        getDeltaR<PERIODIC_TYPE>(atomPos, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
        ivec4 include;
        char excl = exclusions[i];
        if (excl == 0)
@@ -212,8 +308,8 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
        
        // Compute the interactions.
        
-        fvec4 r = sqrt(r2);
-        fvec4 inverseR = fvec4(1.0f)/r;
+        fvec4 inverseR = rsqrt(r2);
+        fvec4 r = r2*inverseR;
        fvec4 energy, dEdR;
        float atomEpsilon = atomParameters[atom].second;
        if (atomEpsilon != 0.0f) {
@@ -272,28 +368,26 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
        (fvec4(forces+4*blockAtom[j])+f[j]).store(forces+4*blockAtom[j]);
 }

-template <bool TRICLINIC>
-void CpuNonbondedForceVec4::getDeltaR(const float* posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
+template <int PERIODIC_TYPE>
+void CpuNonbondedForceVec4::getDeltaR(const fvec4& posI, const fvec4& x, const fvec4& y, const fvec4& z, fvec4& dx, fvec4& dy, fvec4& dz, fvec4& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
    dx = x-posI[0];
    dy = y-posI[1];
    dz = z-posI[2];
-    if (periodic) {
-        if (TRICLINIC) {
-            fvec4 scale3 = floor(dz*recipBoxSize[2]+0.5f);
-            dx -= scale3*periodicBoxVectors[2][0];
-            dy -= scale3*periodicBoxVectors[2][1];
-            dz -= scale3*periodicBoxVectors[2][2];
-            fvec4 scale2 = floor(dy*recipBoxSize[1]+0.5f);
-            dx -= scale2*periodicBoxVectors[1][0];
-            dy -= scale2*periodicBoxVectors[1][1];
-            fvec4 scale1 = floor(dx*recipBoxSize[0]+0.5f);
-            dx -= scale1*periodicBoxVectors[0][0];
-        }
-        else {
-            dx -= round(dx*invBoxSize[0])*boxSize[0];
-            dy -= round(dy*invBoxSize[1])*boxSize[1];
-            dz -= round(dz*invBoxSize[2])*boxSize[2];
-        }
+    if (PERIODIC_TYPE == PeriodicTriclinic) {
+        fvec4 scale3 = floor(dz*recipBoxSize[2]+0.5f);
+        dx -= scale3*periodicBoxVectors[2][0];
+        dy -= scale3*periodicBoxVectors[2][1];
+        dz -= scale3*periodicBoxVectors[2][2];
+        fvec4 scale2 = floor(dy*recipBoxSize[1]+0.5f);
+        dx -= scale2*periodicBoxVectors[1][0];
+        dy -= scale2*periodicBoxVectors[1][1];
+        fvec4 scale1 = floor(dx*recipBoxSize[0]+0.5f);
+        dx -= scale1*periodicBoxVectors[0][0];
+    }
+    else if (PERIODIC_TYPE == PeriodicPerInteraction) {
+        dx -= round(dx*invBoxSize[0])*boxSize[0];
+        dy -= round(dy*invBoxSize[1])*boxSize[1];
+        dz -= round(dz*invBoxSize[2])*boxSize[2];
    }
    r2 = dx*dx + dy*dy + dz*dz;
 }

--- a/platforms/cpu/src/CpuNonbondedForceVec8.cpp
+++ b/platforms/cpu/src/CpuNonbondedForceVec8.cpp
@@ -50,7 +50,7 @@ CpuNonbondedForce* createCpuNonbondedForceVec8() {
 */
 bool isVec8Supported() {
    // Make sure the CPU supports AVX.
-    
+
    int cpuInfo[4];
    cpuid(cpuInfo, 0);
    if (cpuInfo[0] >= 1) {
@@ -76,29 +76,75 @@ CpuNonbondedForce* createCpuNonbondedForceVec8() {
 CpuNonbondedForceVec8::CpuNonbondedForceVec8() {
 }

+enum PeriodicType {NoPeriodic, PeriodicPerAtom, PeriodicPerInteraction, PeriodicTriclinic};
+
 void CpuNonbondedForceVec8::calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
-    if (triclinic)
-        calculateBlockIxnImpl<true>(blockIndex, forces, totalEnergy, boxSize, invBoxSize);
-    else
-        calculateBlockIxnImpl<false>(blockIndex, forces, totalEnergy, boxSize, invBoxSize);
+    // Determine whether we need to apply periodic boundary conditions.
+    
+    PeriodicType periodicType;
+    fvec4 blockCenter;
+    if (!periodic) {
+        periodicType = NoPeriodic;
+        blockCenter = 0.0f;
+    }
+    else {
+        const int* blockAtom = &neighborList->getSortedAtoms()[8*blockIndex];
+        float minx, maxx, miny, maxy, minz, maxz;
+        minx = maxx = posq[4*blockAtom[0]];
+        miny = maxy = posq[4*blockAtom[0]+1];
+        minz = maxz = posq[4*blockAtom[0]+2];
+        for (int i = 1; i < 8; i++) {
+            minx = min(minx, posq[4*blockAtom[i]]);
+            maxx = max(maxx, posq[4*blockAtom[i]]);
+            miny = min(miny, posq[4*blockAtom[i]+1]);
+            maxy = max(maxy, posq[4*blockAtom[i]+1]);
+            minz = min(minz, posq[4*blockAtom[i]+2]);
+            maxz = max(maxz, posq[4*blockAtom[i]+2]);
+        }
+        blockCenter = fvec4(0.5f*(minx+maxx), 0.5f*(miny+maxy), 0.5f*(minz+maxz), 0.0f);
+        if (!(minx < cutoffDistance || miny < cutoffDistance || minz < cutoffDistance ||
+                maxx > boxSize[0]-cutoffDistance || maxy > boxSize[1]-cutoffDistance || maxz > boxSize[2]-cutoffDistance))
+            periodicType = NoPeriodic;
+        else if (triclinic)
+            periodicType = PeriodicTriclinic;
+        else if (0.5f*(boxSize[0]-(maxx-minx)) >= cutoffDistance &&
+                 0.5f*(boxSize[1]-(maxy-miny)) >= cutoffDistance &&
+                 0.5f*(boxSize[2]-(maxz-minz)) >= cutoffDistance)
+            periodicType = PeriodicPerAtom;
+        else
+            periodicType = PeriodicPerInteraction;
+    }
+    
+    // Call the appropriate version depending on what calculation is required for periodic boundary conditions.
+    
+    if (periodicType == NoPeriodic)
+        calculateBlockIxnImpl<NoPeriodic>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
+    else if (periodicType == PeriodicPerAtom)
+        calculateBlockIxnImpl<PeriodicPerAtom>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
+    else if (periodicType == PeriodicPerInteraction)
+        calculateBlockIxnImpl<PeriodicPerInteraction>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
+    else if (periodicType == PeriodicTriclinic)
+        calculateBlockIxnImpl<PeriodicTriclinic>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
 }

-template <bool TRICLINIC>
-void CpuNonbondedForceVec8::calculateBlockIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
+template <int PERIODIC_TYPE>
+void CpuNonbondedForceVec8::calculateBlockIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter) {
    // Load the positions and parameters of the atoms in the block.
    
    const int* blockAtom = &neighborList->getSortedAtoms()[8*blockIndex];
    fvec4 blockAtomPosq[8];
    fvec8 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
    fvec8 blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge;
-    for (int i = 0; i < 8; i++)
+    for (int i = 0; i < 8; i++) {
        blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
+        if (PERIODIC_TYPE == PeriodicPerAtom)
+            blockAtomPosq[i] -= floor((blockAtomPosq[i]-blockCenter)*invBoxSize+0.5f)*boxSize;
+    }
    transpose(blockAtomPosq[0], blockAtomPosq[1], blockAtomPosq[2], blockAtomPosq[3], blockAtomPosq[4], blockAtomPosq[5], blockAtomPosq[6], blockAtomPosq[7], blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge);
    blockAtomCharge *= ONE_4PI_EPS0;
    fvec8 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first, atomParameters[blockAtom[4]].first, atomParameters[blockAtom[5]].first, atomParameters[blockAtom[6]].first, atomParameters[blockAtom[7]].first);
    fvec8 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second, atomParameters[blockAtom[4]].second, atomParameters[blockAtom[5]].second, atomParameters[blockAtom[6]].second, atomParameters[blockAtom[7]].second);
-    bool needPeriodic = (periodic && (any(blockAtomX < cutoffDistance) || any(blockAtomY < cutoffDistance) || any(blockAtomZ < cutoffDistance) ||
-            any(blockAtomX > boxSize[0]-cutoffDistance) || any(blockAtomY > boxSize[1]-cutoffDistance) || any(blockAtomZ > boxSize[2]-cutoffDistance)));
+    const bool needPeriodic = (PERIODIC_TYPE == PeriodicPerInteraction || PERIODIC_TYPE == PeriodicTriclinic);
    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
    
    // Loop over neighbors for this block.
@@ -113,7 +159,10 @@ void CpuNonbondedForceVec8::calculateBlockIxnImpl(int blockIndex, float* forces,
        // Compute the distances to the block atoms.
        
        fvec8 dx, dy, dz, r2;
-        getDeltaR<TRICLINIC>(&posq[4*atom], blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
+        fvec4 atomPos(posq+4*atom);
+        if (PERIODIC_TYPE == PeriodicPerAtom)
+            atomPos -= floor((atomPos-blockCenter)*invBoxSize+0.5f)*boxSize;
+        getDeltaR<PERIODIC_TYPE>(atomPos, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
        ivec8 include;
        char excl = exclusions[i];
        if (excl == 0)
@@ -126,8 +175,7 @@ void CpuNonbondedForceVec8::calculateBlockIxnImpl(int blockIndex, float* forces,
        
        // Compute the interactions.
        
-        fvec8 r = sqrt(r2);
-        fvec8 inverseR = fvec8(1.0f)/r;
+        fvec8 inverseR = rsqrt(r2);
        fvec8 energy, dEdR;
        float atomEpsilon = atomParameters[atom].second;
        if (atomEpsilon != 0.0f) {
@@ -139,6 +187,7 @@ void CpuNonbondedForceVec8::calculateBlockIxnImpl(int blockIndex, float* forces,
            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
            energy = epsSig6*(sig6-1.0f);
            if (useSwitch) {
+                fvec8 r = r2*inverseR;
                fvec8 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval);
                fvec8 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
                fvec8 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
@@ -193,28 +242,72 @@ void CpuNonbondedForceVec8::calculateBlockIxnImpl(int blockIndex, float* forces,
  }

 void CpuNonbondedForceVec8::calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
-    if (triclinic)
-        calculateBlockEwaldIxnImpl<true>(blockIndex, forces, totalEnergy, boxSize, invBoxSize);
-    else
-        calculateBlockEwaldIxnImpl<false>(blockIndex, forces, totalEnergy, boxSize, invBoxSize);
+    // Determine whether we need to apply periodic boundary conditions.
+    
+    PeriodicType periodicType;
+    fvec4 blockCenter;
+    if (!periodic) {
+        periodicType = NoPeriodic;
+        blockCenter = 0.0f;
+    }
+    else {
+        const int* blockAtom = &neighborList->getSortedAtoms()[8*blockIndex];
+        float minx, maxx, miny, maxy, minz, maxz;
+        minx = maxx = posq[4*blockAtom[0]];
+        miny = maxy = posq[4*blockAtom[0]+1];
+        minz = maxz = posq[4*blockAtom[0]+2];
+        for (int i = 1; i < 8; i++) {
+            minx = min(minx, posq[4*blockAtom[i]]);
+            maxx = max(maxx, posq[4*blockAtom[i]]);
+            miny = min(miny, posq[4*blockAtom[i]+1]);
+            maxy = max(maxy, posq[4*blockAtom[i]+1]);
+            minz = min(minz, posq[4*blockAtom[i]+2]);
+            maxz = max(maxz, posq[4*blockAtom[i]+2]);
+        }
+        blockCenter = fvec4(0.5f*(minx+maxx), 0.5f*(miny+maxy), 0.5f*(minz+maxz), 0.0f);
+        if (!(minx < cutoffDistance || miny < cutoffDistance || minz < cutoffDistance ||
+                maxx > boxSize[0]-cutoffDistance || maxy > boxSize[1]-cutoffDistance || maxz > boxSize[2]-cutoffDistance))
+            periodicType = NoPeriodic;
+        else if (triclinic)
+            periodicType = PeriodicTriclinic;
+        else if (0.5f*(boxSize[0]-(maxx-minx)) >= cutoffDistance &&
+                 0.5f*(boxSize[1]-(maxy-miny)) >= cutoffDistance &&
+                 0.5f*(boxSize[2]-(maxz-minz)) >= cutoffDistance)
+            periodicType = PeriodicPerAtom;
+        else
+            periodicType = PeriodicPerInteraction;
+    }
+    
+    // Call the appropriate version depending on what calculation is required for periodic boundary conditions.
+    
+    if (periodicType == NoPeriodic)
+        calculateBlockEwaldIxnImpl<NoPeriodic>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
+    else if (periodicType == PeriodicPerAtom)
+        calculateBlockEwaldIxnImpl<PeriodicPerAtom>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
+    else if (periodicType == PeriodicPerInteraction)
+        calculateBlockEwaldIxnImpl<PeriodicPerInteraction>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
+    else if (periodicType == PeriodicTriclinic)
+        calculateBlockEwaldIxnImpl<PeriodicTriclinic>(blockIndex, forces, totalEnergy, boxSize, invBoxSize, blockCenter);
 }

-template <bool TRICLINIC>
-void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize) {
+template <int PERIODIC_TYPE>
+void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize, const fvec4& blockCenter) {
    // Load the positions and parameters of the atoms in the block.
    
    const int* blockAtom = &neighborList->getSortedAtoms()[8*blockIndex];
    fvec4 blockAtomPosq[8];
    fvec8 blockAtomForceX(0.0f), blockAtomForceY(0.0f), blockAtomForceZ(0.0f);
    fvec8 blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge;
-    for (int i = 0; i < 8; i++)
+    for (int i = 0; i < 8; i++) {
        blockAtomPosq[i] = fvec4(posq+4*blockAtom[i]);
+        if (PERIODIC_TYPE == PeriodicPerAtom)
+            blockAtomPosq[i] -= floor((blockAtomPosq[i]-blockCenter)*invBoxSize+0.5f)*boxSize;
+    }
    transpose(blockAtomPosq[0], blockAtomPosq[1], blockAtomPosq[2], blockAtomPosq[3], blockAtomPosq[4], blockAtomPosq[5], blockAtomPosq[6], blockAtomPosq[7], blockAtomX, blockAtomY, blockAtomZ, blockAtomCharge);
    blockAtomCharge *= ONE_4PI_EPS0;
    fvec8 blockAtomSigma(atomParameters[blockAtom[0]].first, atomParameters[blockAtom[1]].first, atomParameters[blockAtom[2]].first, atomParameters[blockAtom[3]].first, atomParameters[blockAtom[4]].first, atomParameters[blockAtom[5]].first, atomParameters[blockAtom[6]].first, atomParameters[blockAtom[7]].first);
    fvec8 blockAtomEpsilon(atomParameters[blockAtom[0]].second, atomParameters[blockAtom[1]].second, atomParameters[blockAtom[2]].second, atomParameters[blockAtom[3]].second, atomParameters[blockAtom[4]].second, atomParameters[blockAtom[5]].second, atomParameters[blockAtom[6]].second, atomParameters[blockAtom[7]].second);
-    bool needPeriodic = (periodic && (any(blockAtomX < cutoffDistance) || any(blockAtomY < cutoffDistance) || any(blockAtomZ < cutoffDistance) ||
-            any(blockAtomX > boxSize[0]-cutoffDistance) || any(blockAtomY > boxSize[1]-cutoffDistance) || any(blockAtomZ > boxSize[2]-cutoffDistance)));
+    const bool needPeriodic = (PERIODIC_TYPE == PeriodicPerInteraction || PERIODIC_TYPE == PeriodicTriclinic);
    const float invSwitchingInterval = 1/(cutoffDistance-switchingDistance);
    
    // Loop over neighbors for this block.
@@ -229,7 +322,10 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
        // Compute the distances to the block atoms.
        
        fvec8 dx, dy, dz, r2;
-        getDeltaR<TRICLINIC>(&posq[4*atom], blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
+        fvec4 atomPos(posq+4*atom);
+        if (PERIODIC_TYPE == PeriodicPerAtom)
+            atomPos -= floor((atomPos-blockCenter)*invBoxSize+0.5f)*boxSize;
+        getDeltaR<PERIODIC_TYPE>(atomPos, blockAtomX, blockAtomY, blockAtomZ, dx, dy, dz, r2, needPeriodic, boxSize, invBoxSize);
        ivec8 include;
        char excl = exclusions[i];
        if (excl == 0)
@@ -242,8 +338,8 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
        
        // Compute the interactions.
        
-        fvec8 r = sqrt(r2);
-        fvec8 inverseR = fvec8(1.0f)/r;
+        fvec8 inverseR = rsqrt(r2);
+        fvec8 r = r2*inverseR;
        fvec8 energy, dEdR;
        float atomEpsilon = atomParameters[atom].second;
        if (atomEpsilon != 0.0f) {
@@ -268,7 +364,7 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
        }
        fvec8 chargeProd = blockAtomCharge*posq[4*atom+3];
        dEdR += chargeProd*inverseR*ewaldScaleFunction(r);
-        dEdR *= inverseR*inverseR;        
+        dEdR *= inverseR*inverseR;

        // Accumulate energies.

@@ -302,28 +398,26 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
        (fvec4(forces+4*blockAtom[j])+f[j]).store(forces+4*blockAtom[j]);
 }

-template <bool TRICLINIC>
-void CpuNonbondedForceVec8::getDeltaR(const float* posI, const fvec8& x, const fvec8& y, const fvec8& z, fvec8& dx, fvec8& dy, fvec8& dz, fvec8& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
+template <int PERIODIC_TYPE>
+void CpuNonbondedForceVec8::getDeltaR(const fvec4& posI, const fvec8& x, const fvec8& y, const fvec8& z, fvec8& dx, fvec8& dy, fvec8& dz, fvec8& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const {
    dx = x-posI[0];
    dy = y-posI[1];
    dz = z-posI[2];
-    if (periodic) {
-        if (TRICLINIC) {
-            fvec8 scale3 = floor(dz*recipBoxSize[2]+0.5f);
-            dx -= scale3*periodicBoxVectors[2][0];
-            dy -= scale3*periodicBoxVectors[2][1];
-            dz -= scale3*periodicBoxVectors[2][2];
-            fvec8 scale2 = floor(dy*recipBoxSize[1]+0.5f);
-            dx -= scale2*periodicBoxVectors[1][0];
-            dy -= scale2*periodicBoxVectors[1][1];
-            fvec8 scale1 = floor(dx*recipBoxSize[0]+0.5f);
-            dx -= scale1*periodicBoxVectors[0][0];
-        }
-        else {
-            dx -= round(dx*invBoxSize[0])*boxSize[0];
-            dy -= round(dy*invBoxSize[1])*boxSize[1];
-            dz -= round(dz*invBoxSize[2])*boxSize[2];
-        }
+    if (PERIODIC_TYPE == PeriodicTriclinic) {
+        fvec8 scale3 = floor(dz*recipBoxSize[2]+0.5f);
+        dx -= scale3*periodicBoxVectors[2][0];
+        dy -= scale3*periodicBoxVectors[2][1];
+        dz -= scale3*periodicBoxVectors[2][2];
+        fvec8 scale2 = floor(dy*recipBoxSize[1]+0.5f);
+        dx -= scale2*periodicBoxVectors[1][0];
+        dy -= scale2*periodicBoxVectors[1][1];
+        fvec8 scale1 = floor(dx*recipBoxSize[0]+0.5f);
+        dx -= scale1*periodicBoxVectors[0][0];
+    }
+    else if (PERIODIC_TYPE == PeriodicPerInteraction) {
+        dx -= round(dx*invBoxSize[0])*boxSize[0];
+        dy -= round(dy*invBoxSize[1])*boxSize[1];
+        dz -= round(dz*invBoxSize[2])*boxSize[2];
    }
    r2 = dx*dx + dy*dy + dz*dz;
 }

--- a/platforms/opencl/include/OpenCLFFT3D.h
+++ b/platforms/opencl/include/OpenCLFFT3D.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Portions copyright (c) 2009-2015 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -40,7 +40,7 @@ namespace OpenMM {
 * 15, 207–228 (2000).
 * <p>
 * This class places certain restrictions on the allowed dimensions of the grid.  First,
- * the size of each dimension may have no prime factors other than 2, 3, and 5.  You
+ * the size of each dimension may have no prime factors other than 2, 3, 5, and 7.  You
 * can call findLegalDimension() to determine the smallest size that satisfies this
 * requirement and is greater than or equal to a specified minimum size.  Second, the size
 * of each dimension must be small enough to compute each 1D transform entirely in local
@@ -61,12 +61,17 @@ public:
     * @param xsize   the first dimension of the data sets on which FFTs will be performed
     * @param ysize   the second dimension of the data sets on which FFTs will be performed
     * @param zsize   the third dimension of the data sets on which FFTs will be performed
+     * @param realToComplex  if true, a real-to-complex transform will be done.  Otherwise, it is complex-to-complex.
     */
-    OpenCLFFT3D(OpenCLContext& context, int xsize, int ysize, int zsize);
+    OpenCLFFT3D(OpenCLContext& context, int xsize, int ysize, int zsize, bool realToComplex=false);
    /**
     * Perform a Fourier transform.  The transform cannot be done in-place: the input and output
     * arrays must be different.  Also, the input array is used as workspace, so its contents
-     * are destroyed.
+     * are destroyed.  This also means that both arrays must be large enough to hold complex values,
+     * even when performing a real-to-complex transform.
+     * <p>
+     * When performing a real-to-complex transform, the output data is of size xsize*ysize*(zsize/2+1)
+     * and contains only the non-redundant elements.
     *
     * @param in       the data to transform, ordered such that in[x*ysize*zsize + y*zsize + z] contains element (x, y, z)
     * @param out      on exit, this contains the transformed data
@@ -75,17 +80,20 @@ public:
    void execFFT(OpenCLArray& in, OpenCLArray& out, bool forward = true);
    /**
     * Get the smallest legal size for a dimension of the grid (that is, a size with no prime
-     * factors other than 2, 3, and 5).
+     * factors other than 2, 3, 5, and 7).
     *
     * @param minimum   the minimum size the return value must be greater than or equal to
     */
    static int findLegalDimension(int minimum);
 private:
-    cl::Kernel createKernel(int xsize, int ysize, int zsize, int& threads);
+    cl::Kernel createKernel(int xsize, int ysize, int zsize, int& threads, int axis, bool forward, bool inputIsReal);
    int xsize, ysize, zsize;
    int xthreads, ythreads, zthreads;
+    bool packRealAsComplex;
    OpenCLContext& context;
    cl::Kernel xkernel, ykernel, zkernel;
+    cl::Kernel invxkernel, invykernel, invzkernel;
+    cl::Kernel packForwardKernel, unpackForwardKernel, packBackwardKernel, unpackBackwardKernel;
 };

 } // namespace OpenMM

--- a/platforms/opencl/include/OpenCLKernels.h
+++ b/platforms/opencl/include/OpenCLKernels.h
@@ -639,6 +639,7 @@ private:
    cl::Kernel pmeSpreadChargeKernel;
    cl::Kernel pmeFinishSpreadChargeKernel;
    cl::Kernel pmeConvolutionKernel;
+    cl::Kernel pmeEvalEnergyKernel;
    cl::Kernel pmeInterpolateForceKernel;
    std::map<std::string, std::string> pmeDefines;
    std::vector<std::pair<int, int> > exceptionAtoms;

--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -106,9 +106,8 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
                // if they supplied a valid deviceIndex, we only look through that one
                if (i != deviceIndex && deviceIndex >= 0 && deviceIndex < (int) devices.size())
                    continue;
-
-                if (platformVendor == "Apple" && (devices[i].getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU || devices[i].getInfo<CL_DEVICE_VENDOR>() == "AMD"))
-                    continue; // The CPU device on OS X won't work correctly, and there are serious bugs using AMD GPUs.
+                if (platformVendor == "Apple" && (devices[i].getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU))
+                    continue; // The CPU device on OS X won't work correctly.
                int maxSize = devices[i].getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>()[0];
                int processingElementsPerComputeUnit = 8;
                if (devices[i].getInfo<CL_DEVICE_TYPE>() != CL_DEVICE_TYPE_GPU) {
@@ -170,6 +169,8 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
        compilationDefines["WORK_GROUP_SIZE"] = intToString(ThreadBlockSize);
        if (platformVendor.size() >= 5 && platformVendor.substr(0, 5) == "Intel")
            defaultOptimizationOptions = "";
+        else if (platformVendor == "Apple")
+            defaultOptimizationOptions = "-cl-mad-enable -cl-no-signed-zeros";
        else
            defaultOptimizationOptions = "-cl-fast-relaxed-math";
        supports64BitGlobalAtomics = (device.getInfo<CL_DEVICE_EXTENSIONS>().find("cl_khr_int64_base_atomics") != string::npos);
@@ -241,8 +242,6 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
        }
        else
            simdWidth = 1;
-        if (platformVendor == "Apple" && vendor == "AMD")
-            compilationDefines["MAC_AMD_WORKAROUND"] = "";
        if (supports64BitGlobalAtomics)
            compilationDefines["SUPPORTS_64_BIT_ATOMICS"] = "";
        if (supportsDoublePrecision)

--- a/platforms/opencl/src/OpenCLFFT3D.cpp
+++ b/platforms/opencl/src/OpenCLFFT3D.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2015 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -35,25 +35,109 @@
 using namespace OpenMM;
 using namespace std;

-OpenCLFFT3D::OpenCLFFT3D(OpenCLContext& context, int xsize, int ysize, int zsize) : context(context), xsize(xsize), ysize(ysize), zsize(zsize) {
-    zkernel = createKernel(xsize, ysize, zsize, zthreads);
-    xkernel = createKernel(ysize, zsize, xsize, xthreads);
-    ykernel = createKernel(zsize, xsize, ysize, ythreads);
+OpenCLFFT3D::OpenCLFFT3D(OpenCLContext& context, int xsize, int ysize, int zsize, bool realToComplex) :
+        context(context), xsize(xsize), ysize(ysize), zsize(zsize) {
+    packRealAsComplex = false;
+    int packedXSize = xsize;
+    int packedYSize = ysize;
+    int packedZSize = zsize;
+    if (realToComplex) {
+        // If any axis size is even, we can pack the real values into a complex grid that is only half as large.
+        // Look for an appropriate axis.
+        
+        packRealAsComplex = true;
+        int packedAxis, bufferSize;
+        if (xsize%2 == 0) {
+            packedAxis = 0;
+            packedXSize /= 2;
+            bufferSize = packedXSize;
+        }
+        else if (ysize%2 == 0) {
+            packedAxis = 1;
+            packedYSize /= 2;
+            bufferSize = packedYSize;
+        }
+        else if (zsize%2 == 0) {
+            packedAxis = 2;
+            packedZSize /= 2;
+            bufferSize = packedZSize;
+        }
+        else
+            packRealAsComplex = false;
+        if (packRealAsComplex) {
+            // Build the kernels for packing and unpacking the data.
+            
+            map<string, string> defines;
+            defines["XSIZE"] = context.intToString(xsize);
+            defines["YSIZE"] = context.intToString(ysize);
+            defines["ZSIZE"] = context.intToString(zsize);
+            defines["PACKED_AXIS"] = context.intToString(packedAxis);
+            defines["PACKED_XSIZE"] = context.intToString(packedXSize);
+            defines["PACKED_YSIZE"] = context.intToString(packedYSize);
+            defines["PACKED_ZSIZE"] = context.intToString(packedZSize);
+            defines["M_PI"] = context.doubleToString(M_PI);
+            cl::Program program = context.createProgram(OpenCLKernelSources::fftR2C, defines);
+            packForwardKernel = cl::Kernel(program, "packForwardData");
+            unpackForwardKernel = cl::Kernel(program, "unpackForwardData");
+            unpackForwardKernel.setArg(2, bufferSize*(context.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2)), NULL);
+            packBackwardKernel = cl::Kernel(program, "packBackwardData");
+            packBackwardKernel.setArg(2, bufferSize*(context.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2)), NULL);
+            unpackBackwardKernel = cl::Kernel(program, "unpackBackwardData");
+        }
+    }
+    bool inputIsReal = (realToComplex && !packRealAsComplex);
+    zkernel = createKernel(packedXSize, packedYSize, packedZSize, zthreads, 0, true, inputIsReal);
+    xkernel = createKernel(packedYSize, packedZSize, packedXSize, xthreads, 1, true, inputIsReal);
+    ykernel = createKernel(packedZSize, packedXSize, packedYSize, ythreads, 2, true, inputIsReal);
+    invzkernel = createKernel(packedXSize, packedYSize, packedZSize, zthreads, 0, false, inputIsReal);
+    invxkernel = createKernel(packedYSize, packedZSize, packedXSize, xthreads, 1, false, inputIsReal);
+    invykernel = createKernel(packedZSize, packedXSize, packedYSize, ythreads, 2, false, inputIsReal);
 }

 void OpenCLFFT3D::execFFT(OpenCLArray& in, OpenCLArray& out, bool forward) {
-    zkernel.setArg<cl::Buffer>(0, in.getDeviceBuffer());
-    zkernel.setArg<cl::Buffer>(1, out.getDeviceBuffer());
-    zkernel.setArg<cl_int>(2, forward ? 1 : -1);
-    context.executeKernel(zkernel, xsize*ysize*zsize, zthreads);
-    xkernel.setArg<cl::Buffer>(0, out.getDeviceBuffer());
-    xkernel.setArg<cl::Buffer>(1, in.getDeviceBuffer());
-    xkernel.setArg<cl_int>(2, forward ? 1 : -1);
-    context.executeKernel(xkernel, xsize*ysize*zsize, xthreads);
-    ykernel.setArg<cl::Buffer>(0, in.getDeviceBuffer());
-    ykernel.setArg<cl::Buffer>(1, out.getDeviceBuffer());
-    ykernel.setArg<cl_int>(2, forward ? 1 : -1);
-    context.executeKernel(ykernel, xsize*ysize*zsize, ythreads);
+    cl::Kernel kernel1 = (forward ? zkernel : invzkernel);
+    cl::Kernel kernel2 = (forward ? xkernel : invxkernel);
+    cl::Kernel kernel3 = (forward ? ykernel : invykernel);
+    if (packRealAsComplex) {
+        cl::Kernel packKernel = (forward ? packForwardKernel : packBackwardKernel);
+        cl::Kernel unpackKernel = (forward ? unpackForwardKernel : unpackBackwardKernel);
+        int gridSize = xsize*ysize*zsize/2;
+
+        // Pack the data into a half sized grid.
+        
+        packKernel.setArg<cl::Buffer>(0, in.getDeviceBuffer());
+        packKernel.setArg<cl::Buffer>(1, out.getDeviceBuffer());
+        context.executeKernel(packKernel, gridSize);
+        
+        // Perform the FFT.
+        
+        kernel1.setArg<cl::Buffer>(0, out.getDeviceBuffer());
+        kernel1.setArg<cl::Buffer>(1, in.getDeviceBuffer());
+        context.executeKernel(kernel1, gridSize, zthreads);
+        kernel2.setArg<cl::Buffer>(0, in.getDeviceBuffer());
+        kernel2.setArg<cl::Buffer>(1, out.getDeviceBuffer());
+        context.executeKernel(kernel2, gridSize, xthreads);
+        kernel3.setArg<cl::Buffer>(0, out.getDeviceBuffer());
+        kernel3.setArg<cl::Buffer>(1, in.getDeviceBuffer());
+        context.executeKernel(kernel3, gridSize, ythreads);
+        
+        // Unpack the data.
+        
+        unpackKernel.setArg<cl::Buffer>(0, in.getDeviceBuffer());
+        unpackKernel.setArg<cl::Buffer>(1, out.getDeviceBuffer());
+        context.executeKernel(unpackKernel, gridSize);
+    }
+    else {
+        kernel1.setArg<cl::Buffer>(0, in.getDeviceBuffer());
+        kernel1.setArg<cl::Buffer>(1, out.getDeviceBuffer());
+        context.executeKernel(kernel1, xsize*ysize*zsize, zthreads);
+        kernel2.setArg<cl::Buffer>(0, out.getDeviceBuffer());
+        kernel2.setArg<cl::Buffer>(1, in.getDeviceBuffer());
+        context.executeKernel(kernel2, xsize*ysize*zsize, xthreads);
+        kernel3.setArg<cl::Buffer>(0, in.getDeviceBuffer());
+        kernel3.setArg<cl::Buffer>(1, out.getDeviceBuffer());
+        context.executeKernel(kernel3, xsize*ysize*zsize, ythreads);
+    }
 }

 int OpenCLFFT3D::findLegalDimension(int minimum) {
@@ -73,8 +157,10 @@ int OpenCLFFT3D::findLegalDimension(int minimum) {
    }
 }

-cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads) {
+cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads, int axis, bool forward, bool inputIsReal) {
    int maxThreads = std::min(256, (int) context.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
+    while (maxThreads > 128 && maxThreads-64 >= zsize)
+        maxThreads -= 64;
    bool isCPU = context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU;
    while (true) {
        bool loopRequired = (zsize > maxThreads || isCPU);
@@ -137,10 +223,10 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
                source<<"real2 b2 = "<<context.doubleToString((2*cos(2*M_PI/7)-cos(4*M_PI/7)-cos(6*M_PI/7))/3)<<"*(d0-d4);\n";
                source<<"real2 b3 = "<<context.doubleToString((cos(2*M_PI/7)-2*cos(4*M_PI/7)+cos(6*M_PI/7))/3)<<"*(d4-d2);\n";
                source<<"real2 b4 = "<<context.doubleToString((cos(2*M_PI/7)+cos(4*M_PI/7)-2*cos(6*M_PI/7))/3)<<"*(d2-d0);\n";
-                source<<"real2 b5 = -sign*"<<context.doubleToString((sin(2*M_PI/7)+sin(4*M_PI/7)-sin(6*M_PI/7))/3)<<"*(d7+d1);\n";
-                source<<"real2 b6 = -sign*"<<context.doubleToString((2*sin(2*M_PI/7)-sin(4*M_PI/7)+sin(6*M_PI/7))/3)<<"*(d1-d5);\n";
-                source<<"real2 b7 = -sign*"<<context.doubleToString((sin(2*M_PI/7)-2*sin(4*M_PI/7)-sin(6*M_PI/7))/3)<<"*(d5-d3);\n";
-                source<<"real2 b8 = -sign*"<<context.doubleToString((sin(2*M_PI/7)+sin(4*M_PI/7)+2*sin(6*M_PI/7))/3)<<"*(d3-d1);\n";
+                source<<"real2 b5 = -(SIGN)*"<<context.doubleToString((sin(2*M_PI/7)+sin(4*M_PI/7)-sin(6*M_PI/7))/3)<<"*(d7+d1);\n";
+                source<<"real2 b6 = -(SIGN)*"<<context.doubleToString((2*sin(2*M_PI/7)-sin(4*M_PI/7)+sin(6*M_PI/7))/3)<<"*(d1-d5);\n";
+                source<<"real2 b7 = -(SIGN)*"<<context.doubleToString((sin(2*M_PI/7)-2*sin(4*M_PI/7)-sin(6*M_PI/7))/3)<<"*(d5-d3);\n";
+                source<<"real2 b8 = -(SIGN)*"<<context.doubleToString((sin(2*M_PI/7)+sin(4*M_PI/7)+2*sin(6*M_PI/7))/3)<<"*(d3-d1);\n";
                source<<"real2 t0 = b0+b1;\n";
                source<<"real2 t1 = b2+b3;\n";
                source<<"real2 t2 = b4-b3;\n";
@@ -178,8 +264,8 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
                source<<"real2 d7 = d6+d5;\n";
                source<<"real2 d8 = d6-d5;\n";
                string coeff = context.doubleToString(sin(0.2*M_PI)/sin(0.4*M_PI));
-                source<<"real2 d9 = sign*(real2) (d2.y+"<<coeff<<"*d3.y, -d2.x-"<<coeff<<"*d3.x);\n";
-                source<<"real2 d10 = sign*(real2) ("<<coeff<<"*d2.y-d3.y, d3.x-"<<coeff<<"*d2.x);\n";
+                source<<"real2 d9 = (SIGN)*(real2) (d2.y+"<<coeff<<"*d3.y, -d2.x-"<<coeff<<"*d3.x);\n";
+                source<<"real2 d10 = (SIGN)*(real2) ("<<coeff<<"*d2.y-d3.y, d3.x-"<<coeff<<"*d2.x);\n";
                source<<"data"<<output<<"[base+4*j*"<<m<<"] = c0+d4;\n";
                source<<"data"<<output<<"[base+(4*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(5*L)<<"], d7+d9);\n";
                source<<"data"<<output<<"[base+(4*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(5*L)<<"], d8+d10);\n";
@@ -194,7 +280,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
                source<<"real2 d0 = c0+c2;\n";
                source<<"real2 d1 = c0-c2;\n";
                source<<"real2 d2 = c1+c3;\n";
-                source<<"real2 d3 = sign*(real2) (c1.y-c3.y, c3.x-c1.x);\n";
+                source<<"real2 d3 = (SIGN)*(real2) (c1.y-c3.y, c3.x-c1.x);\n";
                source<<"data"<<output<<"[base+3*j*"<<m<<"] = d0+d2;\n";
                source<<"data"<<output<<"[base+(3*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(4*L)<<"], d1+d3);\n";
                source<<"data"<<output<<"[base+(3*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(4*L)<<"], d0-d2);\n";
@@ -206,7 +292,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
                source<<"real2 c2 = data"<<input<<"[base+"<<(2*L*m)<<"];\n";
                source<<"real2 d0 = c1+c2;\n";
                source<<"real2 d1 = c0-0.5f*d0;\n";
-                source<<"real2 d2 = sign*"<<context.doubleToString(sin(M_PI/3.0))<<"*(real2) (c1.y-c2.y, c2.x-c1.x);\n";
+                source<<"real2 d2 = (SIGN)*"<<context.doubleToString(sin(M_PI/3.0))<<"*(real2) (c1.y-c2.y, c2.x-c1.x);\n";
                source<<"data"<<output<<"[base+2*j*"<<m<<"] = c0+d0;\n";
                source<<"data"<<output<<"[base+(2*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(3*L)<<"], d1+d2);\n";
                source<<"data"<<output<<"[base+(2*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(3*L)<<"], d1-d2);\n";
@@ -226,13 +312,27 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa

        // Create the kernel.

+        bool outputIsReal = (inputIsReal && axis == 2 && !forward);
+        bool outputIsPacked = (inputIsReal && axis == 2 && forward);
+        string outputSuffix = (outputIsReal ? ".x" : "");
        if (loopRequired) {
+            if (outputIsPacked)
+                source<<"if (x < XSIZE/2+1)\n";
            source<<"for (int z = get_local_id(0); z < ZSIZE; z += get_local_size(0))\n";
-            source<<"out[y*(ZSIZE*XSIZE)+z*XSIZE+x] = data"<<(stage%2)<<"[z];\n";
+            if (outputIsPacked)
+                source<<"out[y*(ZSIZE*(XSIZE/2+1))+z*(XSIZE/2+1)+x] = data"<<(stage%2)<<"[z]"<<outputSuffix<<";\n";
+            else
+                source<<"out[y*(ZSIZE*XSIZE)+z*XSIZE+x] = data"<<(stage%2)<<"[z]"<<outputSuffix<<";\n";
        }
        else {
-            source<<"if (index < XSIZE*YSIZE)\n";
-            source<<"out[y*(ZSIZE*XSIZE)+(get_local_id(0)%ZSIZE)*XSIZE+x] = data"<<(stage%2)<<"[get_local_id(0)];\n";
+            if (outputIsPacked) {
+                source<<"if (index < XSIZE*YSIZE && x < XSIZE/2+1)\n";
+                source<<"out[y*(ZSIZE*(XSIZE/2+1))+(get_local_id(0)%ZSIZE)*(XSIZE/2+1)+x] = data"<<(stage%2)<<"[get_local_id(0)]"<<outputSuffix<<";\n";
+            }
+            else {
+                source<<"if (index < XSIZE*YSIZE)\n";
+                source<<"out[y*(ZSIZE*XSIZE)+(get_local_id(0)%ZSIZE)*XSIZE+x] = data"<<(stage%2)<<"[get_local_id(0)]"<<outputSuffix<<";\n";
+            }
        }
        map<string, string> replacements;
        replacements["XSIZE"] = context.intToString(xsize);
@@ -242,6 +342,12 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
        replacements["M_PI"] = context.doubleToString(M_PI);
        replacements["COMPUTE_FFT"] = source.str();
        replacements["LOOP_REQUIRED"] = (loopRequired ? "1" : "0");
+        replacements["SIGN"] = (forward ? "1" : "-1");
+        replacements["INPUT_TYPE"] = (inputIsReal && axis == 0 && forward ? "real" : "real2");
+        replacements["OUTPUT_TYPE"] = (outputIsReal ? "real" : "real2");
+        replacements["INPUT_IS_REAL"] = (inputIsReal && axis == 0 && forward ? "1" : "0");
+        replacements["INPUT_IS_PACKED"] = (inputIsReal && axis == 0 && !forward ? "1" : "0");
+        replacements["OUTPUT_IS_PACKED"] = (outputIsPacked ? "1" : "0");
        cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::fft, replacements));
        cl::Kernel kernel(program, "execFFT");
        threads = (isCPU ? 1 : blocksPerGroup*zsize);
@@ -253,9 +359,9 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
            continue;
        }
        int bufferSize = blocksPerGroup*zsize*(context.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2));
+        kernel.setArg(2, bufferSize, NULL);
        kernel.setArg(3, bufferSize, NULL);
        kernel.setArg(4, bufferSize, NULL);
-        kernel.setArg(5, bufferSize, NULL);
        return kernel;
    }
 }
--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -1628,6 +1628,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
            pmeDefines["GRID_SIZE_Y"] = cl.intToString(gridSizeY);
            pmeDefines["GRID_SIZE_Z"] = cl.intToString(gridSizeZ);
            pmeDefines["EPSILON_FACTOR"] = cl.doubleToString(sqrt(ONE_4PI_EPS0));
+            pmeDefines["M_PI"] = cl.doubleToString(M_PI);
            bool deviceIsCpu = (cl.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
            if (deviceIsCpu)
                pmeDefines["DEVICE_IS_CPU"] = "1";
@@ -1652,8 +1653,11 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb

                int elementSize = (cl.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
                pmeGrid = new OpenCLArray(cl, gridSizeX*gridSizeY*gridSizeZ, 2*elementSize, "pmeGrid");
-                cl.addAutoclearBuffer(*pmeGrid);
                pmeGrid2 = new OpenCLArray(cl, gridSizeX*gridSizeY*gridSizeZ, 2*elementSize, "pmeGrid2");
+                if (cl.getSupports64BitGlobalAtomics())
+                    cl.addAutoclearBuffer(*pmeGrid2);
+                else
+                    cl.addAutoclearBuffer(*pmeGrid);
                pmeBsplineModuliX = new OpenCLArray(cl, gridSizeX, elementSize, "pmeBsplineModuliX");
                pmeBsplineModuliY = new OpenCLArray(cl, gridSizeY, elementSize, "pmeBsplineModuliY");
                pmeBsplineModuliZ = new OpenCLArray(cl, gridSizeZ, elementSize, "pmeBsplineModuliZ");
@@ -1661,9 +1665,12 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
                pmeAtomRange = OpenCLArray::create<cl_int>(cl, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
                pmeAtomGridIndex = OpenCLArray::create<mm_int2>(cl, numParticles, "pmeAtomGridIndex");
                sort = new OpenCLSort(cl, new SortTrait(), cl.getNumAtoms());
-                fft = new OpenCLFFT3D(cl, gridSizeX, gridSizeY, gridSizeZ);
+                fft = new OpenCLFFT3D(cl, gridSizeX, gridSizeY, gridSizeZ, true);
                string vendor = cl.getDevice().getInfo<CL_DEVICE_VENDOR>();
-                usePmeQueue = (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA");
+                bool isNvidia = (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA");
+                if (isNvidia)
+                    pmeDefines["USE_ALTERNATE_MEMORY_ACCESS_PATTERN"] = "1";
+                usePmeQueue = isNvidia;
                if (usePmeQueue) {
                    pmeQueue = cl::CommandQueue(cl.getContext(), cl.getDevice());
                    int recipForceGroup = force.getReciprocalSpaceForceGroup();
@@ -1800,6 +1807,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
            pmeZIndexKernel = cl::Kernel(program, "recordZIndex");
            pmeSpreadChargeKernel = cl::Kernel(program, "gridSpreadCharge");
            pmeConvolutionKernel = cl::Kernel(program, "reciprocalConvolution");
+            pmeEvalEnergyKernel = cl::Kernel(program, "gridEvaluateEnergy");
            pmeInterpolateForceKernel = cl::Kernel(program, "gridInterpolateForce");
            int elementSize = (cl.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4));
            pmeUpdateBsplinesKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
@@ -1814,20 +1822,28 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
            pmeSpreadChargeKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
            pmeSpreadChargeKernel.setArg<cl::Buffer>(1, pmeAtomGridIndex->getDeviceBuffer());
            pmeSpreadChargeKernel.setArg<cl::Buffer>(2, pmeAtomRange->getDeviceBuffer());
-            pmeSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid->getDeviceBuffer());
+            if (cl.getSupports64BitGlobalAtomics())
+                pmeSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid2->getDeviceBuffer());
+            else
+                pmeSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid->getDeviceBuffer());
            pmeSpreadChargeKernel.setArg<cl::Buffer>(4, pmeBsplineTheta->getDeviceBuffer());
            pmeConvolutionKernel.setArg<cl::Buffer>(0, pmeGrid2->getDeviceBuffer());
-            pmeConvolutionKernel.setArg<cl::Buffer>(1, cl.getEnergyBuffer().getDeviceBuffer());
-            pmeConvolutionKernel.setArg<cl::Buffer>(2, pmeBsplineModuliX->getDeviceBuffer());
-            pmeConvolutionKernel.setArg<cl::Buffer>(3, pmeBsplineModuliY->getDeviceBuffer());
-            pmeConvolutionKernel.setArg<cl::Buffer>(4, pmeBsplineModuliZ->getDeviceBuffer());
+            pmeConvolutionKernel.setArg<cl::Buffer>(1, pmeBsplineModuliX->getDeviceBuffer());
+            pmeConvolutionKernel.setArg<cl::Buffer>(2, pmeBsplineModuliY->getDeviceBuffer());
+            pmeConvolutionKernel.setArg<cl::Buffer>(3, pmeBsplineModuliZ->getDeviceBuffer());
+            pmeEvalEnergyKernel.setArg<cl::Buffer>(0, pmeGrid2->getDeviceBuffer());
+            pmeEvalEnergyKernel.setArg<cl::Buffer>(1, cl.getEnergyBuffer().getDeviceBuffer());
+            pmeEvalEnergyKernel.setArg<cl::Buffer>(2, pmeBsplineModuliX->getDeviceBuffer());
+            pmeEvalEnergyKernel.setArg<cl::Buffer>(3, pmeBsplineModuliY->getDeviceBuffer());
+            pmeEvalEnergyKernel.setArg<cl::Buffer>(4, pmeBsplineModuliZ->getDeviceBuffer());
            pmeInterpolateForceKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
            pmeInterpolateForceKernel.setArg<cl::Buffer>(1, cl.getForceBuffers().getDeviceBuffer());
            pmeInterpolateForceKernel.setArg<cl::Buffer>(2, pmeGrid->getDeviceBuffer());
            pmeInterpolateForceKernel.setArg<cl::Buffer>(7, pmeAtomGridIndex->getDeviceBuffer());
            if (cl.getSupports64BitGlobalAtomics()) {
                pmeFinishSpreadChargeKernel = cl::Kernel(program, "finishSpreadCharge");
-                pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(0, pmeGrid->getDeviceBuffer());
+                pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(0, pmeGrid2->getDeviceBuffer());
+                pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid->getDeviceBuffer());
            }
       }
    }
@@ -1851,7 +1867,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
        cl.executeKernel(ewaldForcesKernel, cl.getNumAtoms());
    }
    if (pmeGrid != NULL && includeReciprocal) {
-        if (usePmeQueue)
+        if (usePmeQueue && !includeEnergy)
            cl.setQueue(pmeQueue);
        
        // Invert the periodic box vectors.
@@ -1926,19 +1942,24 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
        }
        fft->execFFT(*pmeGrid, *pmeGrid2, true);
        mm_double4 boxSize = cl.getPeriodicBoxSizeDouble();
-        double scaleFactor = 1.0/(M_PI*boxSize.x*boxSize.y*boxSize.z);
        if (cl.getUseDoublePrecision()) {
-            pmeConvolutionKernel.setArg<mm_double4>(5, recipBoxVectors[0]);
-            pmeConvolutionKernel.setArg<mm_double4>(6, recipBoxVectors[1]);
-            pmeConvolutionKernel.setArg<mm_double4>(7, recipBoxVectors[2]);
-            pmeConvolutionKernel.setArg<cl_double>(8, scaleFactor);
+            pmeConvolutionKernel.setArg<mm_double4>(4, recipBoxVectors[0]);
+            pmeConvolutionKernel.setArg<mm_double4>(5, recipBoxVectors[1]);
+            pmeConvolutionKernel.setArg<mm_double4>(6, recipBoxVectors[2]);
+            pmeEvalEnergyKernel.setArg<mm_double4>(5, recipBoxVectors[0]);
+            pmeEvalEnergyKernel.setArg<mm_double4>(6, recipBoxVectors[1]);
+            pmeEvalEnergyKernel.setArg<mm_double4>(7, recipBoxVectors[2]);
        }
        else {
-            pmeConvolutionKernel.setArg<mm_float4>(5, recipBoxVectorsFloat[0]);
-            pmeConvolutionKernel.setArg<mm_float4>(6, recipBoxVectorsFloat[1]);
-            pmeConvolutionKernel.setArg<mm_float4>(7, recipBoxVectorsFloat[2]);
-            pmeConvolutionKernel.setArg<cl_float>(8, (float) scaleFactor);
-        }
+            pmeConvolutionKernel.setArg<mm_float4>(4, recipBoxVectorsFloat[0]);
+            pmeConvolutionKernel.setArg<mm_float4>(5, recipBoxVectorsFloat[1]);
+            pmeConvolutionKernel.setArg<mm_float4>(6, recipBoxVectorsFloat[2]);
+            pmeEvalEnergyKernel.setArg<mm_float4>(5, recipBoxVectorsFloat[0]);
+            pmeEvalEnergyKernel.setArg<mm_float4>(6, recipBoxVectorsFloat[1]);
+            pmeEvalEnergyKernel.setArg<mm_float4>(7, recipBoxVectorsFloat[2]);
+        }
+        if (includeEnergy)
+            cl.executeKernel(pmeEvalEnergyKernel, cl.getNumAtoms());
        cl.executeKernel(pmeConvolutionKernel, cl.getNumAtoms());
        fft->execFFT(*pmeGrid2, *pmeGrid, false);
        setPeriodicBoxSizeArg(cl, pmeInterpolateForceKernel, 3);

--- a/platforms/opencl/src/OpenCLPlatform.cpp
+++ b/platforms/opencl/src/OpenCLPlatform.cpp
@@ -109,7 +109,7 @@ bool OpenCLPlatform::supportsDoublePrecision() const {

 bool OpenCLPlatform::isPlatformSupported() {
    // Return false for OpenCL implementations that are known
-    // to be buggy (Apple OSX since 10.7.5)
+    // to be buggy (Apple OS X prior to 10.10).

 #ifdef __APPLE__
    char str[256];
@@ -122,12 +122,10 @@ bool OpenCLPlatform::isPlatformSupported() {
    if (sscanf(str, "%d.%d.%d", &major, &minor, &micro) != 3)
        return false;

-    if ((major > 11) || (major == 11 && minor > 4) || (major == 11 && minor == 4 && micro >= 2))
-        // 11.4.2 is the darwin release corresponding to OSX 10.7.5, which is the
-        // point at which a number of serious bugs were introduced into the
-        // Apple OpenCL libraries, resulting in catistrophically incorrect MD simulations
-        // (see https://github.com/SimTk/openmm/issues/395 for example). Once a fix is released,
-        // this version check should be updated.
+    if (major < 14 || (major == 14 && minor < 3))
+        // 14.3.0 is the darwin release corresponding to OS X 10.10.3. Versions prior to that
+        // contained a number of serious bugs in the Apple OpenCL libraries.
+        // (See https://github.com/SimTk/openmm/issues/395 for example.)
        return false;
 #endif


--- a/platforms/opencl/src/OpenCLSort.cpp
+++ b/platforms/opencl/src/OpenCLSort.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2010-2013 Stanford University and the Authors.      *
+ * Portions copyright (c) 2010-2015 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -42,7 +42,6 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
    replacements["MIN_KEY"] = trait->getMinKey();
    replacements["MAX_KEY"] = trait->getMaxKey();
    replacements["MAX_VALUE"] = trait->getMaxValue();
-    replacements["VALUE_IS_INT2"] = (trait->getDataType() == std::string("int2") ? "1" : "0");
    cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::sort, replacements));
    shortListKernel = cl::Kernel(program, "sortShortList");
    computeRangeKernel = cl::Kernel(program, "computeRange");
@@ -59,7 +58,11 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
    unsigned int maxRangeSize = std::min(maxGroupSize, (unsigned int) computeRangeKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()));
    unsigned int maxPositionsSize = std::min(maxGroupSize, (unsigned int) computeBucketPositionsKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()));
    unsigned int maxShortListSize = shortListKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice());
-    isShortList = (length <= maxLocalBuffer && length < maxShortListSize);
+    // On Qualcomm's OpenCL, it's essential to check against maxShortListSize.  Otherwise you get a crash.
+    // But AMD's OpenCL returns an inappropriately small value for it that is much shorter than the actual
+    // maximum, so including the check hurts performance.  For the moment I'm going to just comment it out.
+    // If we officially support Qualcomm in the future, we'll need to do something better.
+    isShortList = (length <= maxLocalBuffer/* && length < maxShortListSize*/);
    for (rangeKernelSize = 1; rangeKernelSize*2 <= maxRangeSize; rangeKernelSize *= 2)
        ;
    positionsKernelSize = std::min(rangeKernelSize, maxPositionsSize);

--- a/platforms/opencl/src/kernels/customManyParticle.cl
+++ b/platforms/opencl/src/kernels/customManyParticle.cl
@@ -227,7 +227,9 @@ __kernel void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, rea
                    int start = block2*TILE_SIZE;
                    int included[TILE_SIZE];
                    int numIncluded = 0;
+                    SYNC_WARPS;
                    positionCache[get_local_id(0)] = posq[start+indexInWarp];
+                    SYNC_WARPS;
                    if (atom1 < NUM_ATOMS) {
                        for (int j = 0; j < 32; j++) {
                            int atom2 = start+j;
@@ -287,7 +289,7 @@ __kernel void computeNeighborStartIndices(__global int* restrict numNeighborsFor

        unsigned int globalIndex = startAtom+get_local_id(0);
        posBuffer[get_local_id(0)] = (globalIndex < NUM_ATOMS ? numNeighborsForAtom[globalIndex] : 0);
-    barrier(CLK_LOCAL_MEM_FENCE);
+        barrier(CLK_LOCAL_MEM_FENCE);

        // Perform a parallel prefix sum.