Merge remote-tracking branch 'upstream/master'

bf6d95c2 · John Chodera (MSKCC) · 896413aa · 227c86bf · bf6d95c2 · bf6d95c2
Commit bf6d95c2 authored Jun 05, 2015 by John Chodera (MSKCC)
20 changed files
--- a/platforms/cuda/include/CudaContext.h
+++ b/platforms/cuda/include/CudaContext.h
@@ -30,12 +30,12 @@
 #include <map>
 #include <queue>
 #include <string>
-#include <pthread.h>
 #define __CL_ENABLE_EXCEPTIONS
 #ifdef _MSC_VER
    // Prevent Windows from defining macros that interfere with other code.
    #define NOMINMAX
 #endif
+#include <pthread.h>
 #include <cuda.h>
 #include <builtin_types.h>
 #include <vector_functions.h>

--- a/platforms/cuda/include/CudaFFT3D.h
+++ b/platforms/cuda/include/CudaFFT3D.h
+#ifndef __OPENMM_CUDAFFT3D_H__
+#define __OPENMM_CUDAFFT3D_H__
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009-2015 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "CudaArray.h"
+
+namespace OpenMM {
+
+/**
+ * This class performs three dimensional Fast Fourier Transforms.  It is based on the
+ * mixed radix algorithm described in
+ * <p>
+ * Takahashi, D. and Kanada, Y., "High-Performance Radix-2, 3 and 5 Parallel 1-D Complex
+ * FFT Algorithms for Distributed-Memory Parallel Computers."  Journal of Supercomputing,
+ * 15, 207–228 (2000).
+ * <p>
+ * This class places certain restrictions on the allowed dimensions of the grid.  First,
+ * the size of each dimension may have no prime factors other than 2, 3, 5, and 7.  You
+ * can call findLegalDimension() to determine the smallest size that satisfies this
+ * requirement and is greater than or equal to a specified minimum size.  Second, the size
+ * of each dimension must be small enough to compute each 1D transform entirely in local
+ * memory with one work unit per data point.  This will vary between platforms, but is
+ * typically at least 512.
+ * <p>
+ * Note that this class performs an unnormalized transform.  That means that if you perform
+ * a forward transform followed immediately by an inverse transform, the effect is to
+ * multiply every value of the original data set by the total number of data points.
+ */
+
+class OPENMM_EXPORT_CUDA CudaFFT3D {
+public:
+    /**
+     * Create an CudaFFT3D object for performing transforms of a particular size.
+     *
+     * @param context the context in which to perform calculations
+     * @param xsize   the first dimension of the data sets on which FFTs will be performed
+     * @param ysize   the second dimension of the data sets on which FFTs will be performed
+     * @param zsize   the third dimension of the data sets on which FFTs will be performed
+     * @param realToComplex  if true, a real-to-complex transform will be done.  Otherwise, it is complex-to-complex.
+     */
+    CudaFFT3D(CudaContext& context, int xsize, int ysize, int zsize, bool realToComplex=false);
+    /**
+     * Perform a Fourier transform.  The transform cannot be done in-place: the input and output
+     * arrays must be different.  Also, the input array is used as workspace, so its contents
+     * are destroyed.  This also means that both arrays must be large enough to hold complex values,
+     * even when performing a real-to-complex transform.
+     * <p>
+     * When performing a real-to-complex transform, the output data is of size xsize*ysize*(zsize/2+1)
+     * and contains only the non-redundant elements.
+     *
+     * @param in       the data to transform, ordered such that in[x*ysize*zsize + y*zsize + z] contains element (x, y, z)
+     * @param out      on exit, this contains the transformed data
+     * @param forward  true to perform a forward transform, false to perform an inverse transform
+     */
+    void execFFT(CudaArray& in, CudaArray& out, bool forward = true);
+    /**
+     * Get the smallest legal size for a dimension of the grid (that is, a size with no prime
+     * factors other than 2, 3, 5, and 7).
+     *
+     * @param minimum   the minimum size the return value must be greater than or equal to
+     */
+    static int findLegalDimension(int minimum);
+private:
+    CUfunction createKernel(int xsize, int ysize, int zsize, int& threads, int axis, bool forward, bool inputIsReal);
+    int xsize, ysize, zsize;
+    int xthreads, ythreads, zthreads;
+    bool packRealAsComplex;
+    CudaContext& context;
+    CUfunction xkernel, ykernel, zkernel;
+    CUfunction invxkernel, invykernel, invzkernel;
+    CUfunction packForwardKernel, unpackForwardKernel, packBackwardKernel, unpackBackwardKernel;
+};
+
+} // namespace OpenMM
+
+#endif // __OPENMM_CUDAFFT3D_H__
--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -30,6 +30,7 @@
 #include "CudaPlatform.h"
 #include "CudaArray.h"
 #include "CudaContext.h"
+#include "CudaFFT3D.h"
 #include "CudaParameterSet.h"
 #include "CudaSort.h"
 #include "openmm/kernels.h"
@@ -588,7 +589,7 @@ class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
 public:
    CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcNonbondedForceKernel(name, platform),
            cu(cu), hasInitializedFFT(false), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), directPmeGrid(NULL), reciprocalPmeGrid(NULL),
-            pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL),  pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL), pmeio(NULL) {
+            pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL),  pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL), fft(NULL), pmeio(NULL) {
    }
    ~CudaCalcNonbondedForceKernel();
    /**
@@ -649,6 +650,7 @@ private:
    PmeIO* pmeio;
    CUstream pmeStream;
    CUevent pmeSyncEvent;
+    CudaFFT3D* fft;
    cufftHandle fftForward;
    cufftHandle fftBackward;
    CUfunction ewaldSumsKernel;
@@ -663,7 +665,7 @@ private:
    std::vector<std::pair<int, int> > exceptionAtoms;
    double ewaldSelfEnergy, dispersionCoefficient, alpha;
    int interpolateForceThreads;
-    bool hasCoulomb, hasLJ, usePmeStream;
+    bool hasCoulomb, hasLJ, usePmeStream, useCudaFFT;
    static const int PmeOrder = 5;
 };


--- a/platforms/cuda/src/CudaFFT3D.cpp
+++ b/platforms/cuda/src/CudaFFT3D.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009-2015 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "CudaFFT3D.h"
+#include "CudaContext.h"
+#include "CudaKernelSources.h"
+#include "SimTKOpenMMRealType.h"
+#include <map>
+#include <sstream>
+#include <string>
+
+using namespace OpenMM;
+using namespace std;
+
+CudaFFT3D::CudaFFT3D(CudaContext& context, int xsize, int ysize, int zsize, bool realToComplex) :
+        context(context), xsize(xsize), ysize(ysize), zsize(zsize) {
+    packRealAsComplex = false;
+    int packedXSize = xsize;
+    int packedYSize = ysize;
+    int packedZSize = zsize;
+    if (realToComplex) {
+        // If any axis size is even, we can pack the real values into a complex grid that is only half as large.
+        // Look for an appropriate axis.
+        
+        packRealAsComplex = true;
+        int packedAxis, bufferSize;
+        if (xsize%2 == 0) {
+            packedAxis = 0;
+            packedXSize /= 2;
+            bufferSize = packedXSize;
+        }
+        else if (ysize%2 == 0) {
+            packedAxis = 1;
+            packedYSize /= 2;
+            bufferSize = packedYSize;
+        }
+        else if (zsize%2 == 0) {
+            packedAxis = 2;
+            packedZSize /= 2;
+            bufferSize = packedZSize;
+        }
+        else
+            packRealAsComplex = false;
+        if (packRealAsComplex) {
+            // Build the kernels for packing and unpacking the data.
+            
+            map<string, string> defines;
+            defines["XSIZE"] = context.intToString(xsize);
+            defines["YSIZE"] = context.intToString(ysize);
+            defines["ZSIZE"] = context.intToString(zsize);
+            defines["PACKED_AXIS"] = context.intToString(packedAxis);
+            defines["PACKED_XSIZE"] = context.intToString(packedXSize);
+            defines["PACKED_YSIZE"] = context.intToString(packedYSize);
+            defines["PACKED_ZSIZE"] = context.intToString(packedZSize);
+            CUmodule module = context.createModule(CudaKernelSources::vectorOps+CudaKernelSources::fftR2C, defines);
+            packForwardKernel = context.getKernel(module, "packForwardData");
+            unpackForwardKernel = context.getKernel(module, "unpackForwardData");
+            packBackwardKernel = context.getKernel(module, "packBackwardData");
+            unpackBackwardKernel = context.getKernel(module, "unpackBackwardData");
+        }
+    }
+    bool inputIsReal = (realToComplex && !packRealAsComplex);
+    zkernel = createKernel(packedXSize, packedYSize, packedZSize, zthreads, 0, true, inputIsReal);
+    xkernel = createKernel(packedYSize, packedZSize, packedXSize, xthreads, 1, true, inputIsReal);
+    ykernel = createKernel(packedZSize, packedXSize, packedYSize, ythreads, 2, true, inputIsReal);
+    invzkernel = createKernel(packedXSize, packedYSize, packedZSize, zthreads, 0, false, inputIsReal);
+    invxkernel = createKernel(packedYSize, packedZSize, packedXSize, xthreads, 1, false, inputIsReal);
+    invykernel = createKernel(packedZSize, packedXSize, packedYSize, ythreads, 2, false, inputIsReal);
+}
+
+void CudaFFT3D::execFFT(CudaArray& in, CudaArray& out, bool forward) {
+    CUfunction kernel1 = (forward ? zkernel : invzkernel);
+    CUfunction kernel2 = (forward ? xkernel : invxkernel);
+    CUfunction kernel3 = (forward ? ykernel : invykernel);
+    void* args1[] = {&in.getDevicePointer(), &out.getDevicePointer()};
+    void* args2[] = {&out.getDevicePointer(), &in.getDevicePointer()};
+    if (packRealAsComplex) {
+        CUfunction packKernel = (forward ? packForwardKernel : packBackwardKernel);
+        CUfunction unpackKernel = (forward ? unpackForwardKernel : unpackBackwardKernel);
+        int gridSize = xsize*ysize*zsize/2;
+
+        // Pack the data into a half sized grid.
+        
+        context.executeKernel(packKernel, args1, gridSize, 128);
+        
+        // Perform the FFT.
+        
+        context.executeKernel(kernel1, args2, gridSize, zthreads);
+        context.executeKernel(kernel2, args1, gridSize, xthreads);
+        context.executeKernel(kernel3, args2, gridSize, ythreads);
+        
+        // Unpack the data.
+        
+        context.executeKernel(unpackKernel, args1, gridSize, 128);
+    }
+    else {
+        context.executeKernel(kernel1, args1, xsize*ysize*zsize, zthreads);
+        context.executeKernel(kernel2, args2, xsize*ysize*zsize, xthreads);
+        context.executeKernel(kernel3, args1, xsize*ysize*zsize, ythreads);
+    }
+}
+
+int CudaFFT3D::findLegalDimension(int minimum) {
+    if (minimum < 1)
+        return 1;
+    while (true) {
+        // Attempt to factor the current value.
+
+        int unfactored = minimum;
+        for (int factor = 2; factor < 8; factor++) {
+            while (unfactored > 1 && unfactored%factor == 0)
+                unfactored /= factor;
+        }
+        if (unfactored == 1)
+            return minimum;
+        minimum++;
+    }
+}
+
+
+static int getSmallestRadix(int size) {
+    int minRadix = 1;
+    int unfactored = size;
+    while (unfactored%7 == 0) {
+        minRadix = 7;
+        unfactored /= 7;
+    }
+    while (unfactored%5 == 0) {
+        minRadix = 5;
+        unfactored /= 5;
+    }
+    while (unfactored%4 == 0) {
+        minRadix = 4;
+        unfactored /= 4;
+    }
+    while (unfactored%3 == 0) {
+        minRadix = 3;
+        unfactored /= 3;
+    }
+    while (unfactored%2 == 0) {
+        minRadix = 2;
+        unfactored /= 2;
+    }
+    return minRadix;
+}
+
+CUfunction CudaFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads, int axis, bool forward, bool inputIsReal) {
+    int maxThreads = 256;
+//    while (maxThreads > 128 && maxThreads-64 >= zsize)
+//        maxThreads -= 64;
+    int threadsPerBlock = zsize/getSmallestRadix(zsize);
+    stringstream source;
+    int blocksPerGroup = max(1, maxThreads/threadsPerBlock);
+    int stage = 0;
+    int L = zsize;
+    int m = 1;
+
+    // Factor zsize, generating an appropriate block of code for each factor.
+
+    while (L > 1) {
+        int input = stage%2;
+        int output = 1-input;
+        int radix;
+        if (L%7 == 0)
+            radix = 7;
+        else if (L%5 == 0)
+            radix = 5;
+        else if (L%4 == 0)
+            radix = 4;
+        else if (L%3 == 0)
+            radix = 3;
+        else if (L%2 == 0)
+            radix = 2;
+        else
+            throw OpenMMException("Illegal size for FFT: "+context.intToString(zsize));
+        source<<"{\n";
+        L = L/radix;
+        source<<"// Pass "<<(stage+1)<<" (radix "<<radix<<")\n";
+        if (L*m < threadsPerBlock)
+            source<<"if (threadIdx.x < "<<(blocksPerGroup*L*m)<<") {\n";
+        else
+            source<<"{\n";
+        source<<"int block = threadIdx.x/"<<(L*m)<<";\n";
+        source<<"int i = threadIdx.x-block*"<<(L*m)<<";\n";
+        source<<"int base = i+block*"<<zsize<<";\n";
+        source<<"int j = i/"<<m<<";\n";
+        if (radix == 7) {
+            source<<"real2 c0 = data"<<input<<"[base];\n";
+            source<<"real2 c1 = data"<<input<<"[base+"<<(L*m)<<"];\n";
+            source<<"real2 c2 = data"<<input<<"[base+"<<(2*L*m)<<"];\n";
+            source<<"real2 c3 = data"<<input<<"[base+"<<(3*L*m)<<"];\n";
+            source<<"real2 c4 = data"<<input<<"[base+"<<(4*L*m)<<"];\n";
+            source<<"real2 c5 = data"<<input<<"[base+"<<(5*L*m)<<"];\n";
+            source<<"real2 c6 = data"<<input<<"[base+"<<(6*L*m)<<"];\n";
+            source<<"real2 d0 = c1+c6;\n";
+            source<<"real2 d1 = c1-c6;\n";
+            source<<"real2 d2 = c2+c5;\n";
+            source<<"real2 d3 = c2-c5;\n";
+            source<<"real2 d4 = c4+c3;\n";
+            source<<"real2 d5 = c4-c3;\n";
+            source<<"real2 d6 = d2+d0;\n";
+            source<<"real2 d7 = d5+d3;\n";
+            source<<"real2 b0 = c0+d6+d4;\n";
+            source<<"real2 b1 = "<<context.doubleToString((cos(2*M_PI/7)+cos(4*M_PI/7)+cos(6*M_PI/7))/3-1)<<"*(d6+d4);\n";
+            source<<"real2 b2 = "<<context.doubleToString((2*cos(2*M_PI/7)-cos(4*M_PI/7)-cos(6*M_PI/7))/3)<<"*(d0-d4);\n";
+            source<<"real2 b3 = "<<context.doubleToString((cos(2*M_PI/7)-2*cos(4*M_PI/7)+cos(6*M_PI/7))/3)<<"*(d4-d2);\n";
+            source<<"real2 b4 = "<<context.doubleToString((cos(2*M_PI/7)+cos(4*M_PI/7)-2*cos(6*M_PI/7))/3)<<"*(d2-d0);\n";
+            source<<"real2 b5 = -(SIGN)*"<<context.doubleToString((sin(2*M_PI/7)+sin(4*M_PI/7)-sin(6*M_PI/7))/3)<<"*(d7+d1);\n";
+            source<<"real2 b6 = -(SIGN)*"<<context.doubleToString((2*sin(2*M_PI/7)-sin(4*M_PI/7)+sin(6*M_PI/7))/3)<<"*(d1-d5);\n";
+            source<<"real2 b7 = -(SIGN)*"<<context.doubleToString((sin(2*M_PI/7)-2*sin(4*M_PI/7)-sin(6*M_PI/7))/3)<<"*(d5-d3);\n";
+            source<<"real2 b8 = -(SIGN)*"<<context.doubleToString((sin(2*M_PI/7)+sin(4*M_PI/7)+2*sin(6*M_PI/7))/3)<<"*(d3-d1);\n";
+            source<<"real2 t0 = b0+b1;\n";
+            source<<"real2 t1 = b2+b3;\n";
+            source<<"real2 t2 = b4-b3;\n";
+            source<<"real2 t3 = -b2-b4;\n";
+            source<<"real2 t4 = b6+b7;\n";
+            source<<"real2 t5 = b8-b7;\n";
+            source<<"real2 t6 = -b8-b6;\n";
+            source<<"real2 t7 = t0+t1;\n";
+            source<<"real2 t8 = t0+t2;\n";
+            source<<"real2 t9 = t0+t3;\n";
+            source<<"real2 t10 = make_real2(t4.y+b5.y, -(t4.x+b5.x));\n";
+            source<<"real2 t11 = make_real2(t5.y+b5.y, -(t5.x+b5.x));\n";
+            source<<"real2 t12 = make_real2(t6.y+b5.y, -(t6.x+b5.x));\n";
+            source<<"data"<<output<<"[base+6*j*"<<m<<"] = b0;\n";
+            source<<"data"<<output<<"[base+(6*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(7*L)<<"], t7-t10);\n";
+            source<<"data"<<output<<"[base+(6*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(7*L)<<"], t9-t12);\n";
+            source<<"data"<<output<<"[base+(6*j+3)*"<<m<<"] = multiplyComplex(w[j*"<<(3*zsize)<<"/"<<(7*L)<<"], t8+t11);\n";
+            source<<"data"<<output<<"[base+(6*j+4)*"<<m<<"] = multiplyComplex(w[j*"<<(4*zsize)<<"/"<<(7*L)<<"], t8-t11);\n";
+            source<<"data"<<output<<"[base+(6*j+5)*"<<m<<"] = multiplyComplex(w[j*"<<(5*zsize)<<"/"<<(7*L)<<"], t9+t12);\n";
+            source<<"data"<<output<<"[base+(6*j+6)*"<<m<<"] = multiplyComplex(w[j*"<<(6*zsize)<<"/"<<(7*L)<<"], t7+t10);\n";
+        }
+        else if (radix == 5) {
+            source<<"real2 c0 = data"<<input<<"[base];\n";
+            source<<"real2 c1 = data"<<input<<"[base+"<<(L*m)<<"];\n";
+            source<<"real2 c2 = data"<<input<<"[base+"<<(2*L*m)<<"];\n";
+            source<<"real2 c3 = data"<<input<<"[base+"<<(3*L*m)<<"];\n";
+            source<<"real2 c4 = data"<<input<<"[base+"<<(4*L*m)<<"];\n";
+            source<<"real2 d0 = c1+c4;\n";
+            source<<"real2 d1 = c2+c3;\n";
+            source<<"real2 d2 = "<<context.doubleToString(sin(0.4*M_PI))<<"*(c1-c4);\n";
+            source<<"real2 d3 = "<<context.doubleToString(sin(0.4*M_PI))<<"*(c2-c3);\n";
+            source<<"real2 d4 = d0+d1;\n";
+            source<<"real2 d5 = "<<context.doubleToString(0.25*sqrt(5.0))<<"*(d0-d1);\n";
+            source<<"real2 d6 = c0-0.25f*d4;\n";
+            source<<"real2 d7 = d6+d5;\n";
+            source<<"real2 d8 = d6-d5;\n";
+            string coeff = context.doubleToString(sin(0.2*M_PI)/sin(0.4*M_PI));
+            source<<"real2 d9 = (SIGN)*make_real2(d2.y+"<<coeff<<"*d3.y, -d2.x-"<<coeff<<"*d3.x);\n";
+            source<<"real2 d10 = (SIGN)*make_real2("<<coeff<<"*d2.y-d3.y, d3.x-"<<coeff<<"*d2.x);\n";
+            source<<"data"<<output<<"[base+4*j*"<<m<<"] = c0+d4;\n";
+            source<<"data"<<output<<"[base+(4*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(5*L)<<"], d7+d9);\n";
+            source<<"data"<<output<<"[base+(4*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(5*L)<<"], d8+d10);\n";
+            source<<"data"<<output<<"[base+(4*j+3)*"<<m<<"] = multiplyComplex(w[j*"<<(3*zsize)<<"/"<<(5*L)<<"], d8-d10);\n";
+            source<<"data"<<output<<"[base+(4*j+4)*"<<m<<"] = multiplyComplex(w[j*"<<(4*zsize)<<"/"<<(5*L)<<"], d7-d9);\n";
+        }
+        else if (radix == 4) {
+            source<<"real2 c0 = data"<<input<<"[base];\n";
+            source<<"real2 c1 = data"<<input<<"[base+"<<(L*m)<<"];\n";
+            source<<"real2 c2 = data"<<input<<"[base+"<<(2*L*m)<<"];\n";
+            source<<"real2 c3 = data"<<input<<"[base+"<<(3*L*m)<<"];\n";
+            source<<"real2 d0 = c0+c2;\n";
+            source<<"real2 d1 = c0-c2;\n";
+            source<<"real2 d2 = c1+c3;\n";
+            source<<"real2 d3 = (SIGN)*make_real2(c1.y-c3.y, c3.x-c1.x);\n";
+            source<<"data"<<output<<"[base+3*j*"<<m<<"] = d0+d2;\n";
+            source<<"data"<<output<<"[base+(3*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(4*L)<<"], d1+d3);\n";
+            source<<"data"<<output<<"[base+(3*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(4*L)<<"], d0-d2);\n";
+            source<<"data"<<output<<"[base+(3*j+3)*"<<m<<"] = multiplyComplex(w[j*"<<(3*zsize)<<"/"<<(4*L)<<"], d1-d3);\n";
+        }
+        else if (radix == 3) {
+            source<<"real2 c0 = data"<<input<<"[base];\n";
+            source<<"real2 c1 = data"<<input<<"[base+"<<(L*m)<<"];\n";
+            source<<"real2 c2 = data"<<input<<"[base+"<<(2*L*m)<<"];\n";
+            source<<"real2 d0 = c1+c2;\n";
+            source<<"real2 d1 = c0-0.5f*d0;\n";
+            source<<"real2 d2 = (SIGN)*"<<context.doubleToString(sin(M_PI/3.0))<<"*make_real2(c1.y-c2.y, c2.x-c1.x);\n";
+            source<<"data"<<output<<"[base+2*j*"<<m<<"] = c0+d0;\n";
+            source<<"data"<<output<<"[base+(2*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(3*L)<<"], d1+d2);\n";
+            source<<"data"<<output<<"[base+(2*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(3*L)<<"], d1-d2);\n";
+        }
+        else if (radix == 2) {
+            source<<"real2 c0 = data"<<input<<"[base];\n";
+            source<<"real2 c1 = data"<<input<<"[base+"<<(L*m)<<"];\n";
+            source<<"data"<<output<<"[base+j*"<<m<<"] = c0+c1;\n";
+            source<<"data"<<output<<"[base+(j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(2*L)<<"], c0-c1);\n";
+        }
+        source<<"}\n";
+        m = m*radix;
+        source<<"__syncthreads();\n";
+        source<<"}\n";
+        ++stage;
+    }
+
+    // Create the kernel.
+
+    bool outputIsReal = (inputIsReal && axis == 2 && !forward);
+    bool outputIsPacked = (inputIsReal && axis == 2 && forward);
+    string outputSuffix = (outputIsReal ? ".x" : "");
+    if (outputIsPacked)
+        source<<"if (index < XSIZE*YSIZE && x < XSIZE/2+1)\n";
+    else
+        source<<"if (index < XSIZE*YSIZE)\n";
+    source<<"for (int i = threadIdx.x-block*THREADS_PER_BLOCK; i < ZSIZE; i += THREADS_PER_BLOCK)\n";
+    if (outputIsPacked)
+        source<<"out[y*(ZSIZE*(XSIZE/2+1))+i*(XSIZE/2+1)+x] = data"<<(stage%2)<<"[i+block*ZSIZE]"<<outputSuffix<<";\n";
+    else
+            source<<"out[y*(ZSIZE*XSIZE)+i*XSIZE+x] = data"<<(stage%2)<<"[i+block*ZSIZE]"<<outputSuffix<<";\n";
+    map<string, string> replacements;
+    replacements["XSIZE"] = context.intToString(xsize);
+    replacements["YSIZE"] = context.intToString(ysize);
+    replacements["ZSIZE"] = context.intToString(zsize);
+    replacements["BLOCKS_PER_GROUP"] = context.intToString(blocksPerGroup);
+    replacements["THREADS_PER_BLOCK"] = context.intToString(threadsPerBlock);
+    replacements["M_PI"] = context.doubleToString(M_PI);
+    replacements["COMPUTE_FFT"] = source.str();
+    replacements["SIGN"] = (forward ? "1" : "-1");
+    replacements["INPUT_TYPE"] = (inputIsReal && axis == 0 && forward ? "real" : "real2");
+    replacements["OUTPUT_TYPE"] = (outputIsReal ? "real" : "real2");
+    replacements["INPUT_IS_REAL"] = (inputIsReal && axis == 0 && forward ? "1" : "0");
+    replacements["INPUT_IS_PACKED"] = (inputIsReal && axis == 0 && !forward ? "1" : "0");
+    replacements["OUTPUT_IS_PACKED"] = (outputIsPacked ? "1" : "0");
+    CUmodule module = context.createModule(CudaKernelSources::vectorOps+context.replaceStrings(CudaKernelSources::fft, replacements));
+    CUfunction kernel = context.getKernel(module, "execFFT");
+    threads = blocksPerGroup*threadsPerBlock;
+    return kernel;
+}
--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -1497,11 +1497,15 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
        delete pmeAtomGridIndex;
    if (sort != NULL)
        delete sort;
+    if (fft != NULL)
+        delete fft;
    if (pmeio != NULL)
        delete pmeio;
    if (hasInitializedFFT) {
-        cufftDestroy(fftForward);
-        cufftDestroy(fftBackward);
+        if (useCudaFFT) {
+            cufftDestroy(fftForward);
+            cufftDestroy(fftBackward);
+        }
        if (usePmeStream) {
            cuStreamDestroy(pmeStream);
            cuEventDestroy(pmeSyncEvent);
@@ -1509,26 +1513,6 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
    }
 }

-/**
- * Select a size for an FFT that is a multiple of 2, 3, 5, and 7.
- */
-static int findFFTDimension(int minimum) {
-    if (minimum < 1)
-        return 1;
-    while (true) {
-        // Attempt to factor the current value.
-
-        int unfactored = minimum;
-        for (int factor = 2; factor < 8; factor++) {
-            while (unfactored > 1 && unfactored%factor == 0)
-                unfactored /= factor;
-        }
-        if (unfactored == 1)
-            return minimum;
-        minimum++;
-    }
-}
-
 void CudaCalcNonbondedForceKernel::initialize(const System& system, const NonbondedForce& force) {
    cu.setAsCurrent();

@@ -1643,9 +1627,9 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
        int gridSizeX, gridSizeY, gridSizeZ;

        NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSizeX, gridSizeY, gridSizeZ);
-        gridSizeX = findFFTDimension(gridSizeX);
-        gridSizeY = findFFTDimension(gridSizeY);
-        gridSizeZ = findFFTDimension(gridSizeZ);
+        gridSizeX = CudaFFT3D::findLegalDimension(gridSizeX);
+        gridSizeY = CudaFFT3D::findLegalDimension(gridSizeY);
+        gridSizeZ = CudaFFT3D::findLegalDimension(gridSizeZ);

        defines["EWALD_ALPHA"] = cu.doubleToString(alpha);
        defines["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI));
@@ -1692,38 +1676,40 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
                // Create required data structures.

                int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
-
-                directPmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*gridSizeZ, cu.getComputeCapability() >= 2.0 ? elementSize : sizeof(long long), "originalPmeGrid");
-                reciprocalPmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*(gridSizeZ/2+1), 2*elementSize, "reciprocalPmeGrid");
-
+                directPmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*gridSizeZ, cu.getComputeCapability() >= 2.0 ? 2*elementSize : 2*sizeof(long long), "originalPmeGrid");
+                reciprocalPmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*gridSizeZ, 2*elementSize, "reciprocalPmeGrid");
                cu.addAutoclearBuffer(*directPmeGrid);
-
                pmeBsplineModuliX = new CudaArray(cu, gridSizeX, elementSize, "pmeBsplineModuliX");
                pmeBsplineModuliY = new CudaArray(cu, gridSizeY, elementSize, "pmeBsplineModuliY");
                pmeBsplineModuliZ = new CudaArray(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
                pmeAtomRange = CudaArray::create<int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
                pmeAtomGridIndex = CudaArray::create<int2>(cu, numParticles, "pmeAtomGridIndex");
                sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms());
-
-                cufftResult result = cufftPlan3d(&fftForward, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_D2Z : CUFFT_R2C);
-                if (result != CUFFT_SUCCESS)
-                    throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
-                result = cufftPlan3d(&fftBackward, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_Z2D : CUFFT_C2R);
-                if (result != CUFFT_SUCCESS)
-                    throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
-
-                cufftSetCompatibilityMode(fftForward, CUFFT_COMPATIBILITY_NATIVE);
-                cufftSetCompatibilityMode(fftBackward, CUFFT_COMPATIBILITY_NATIVE);
+                useCudaFFT = false; // We might switch back in the future, once Nvidia has all their bugs worked out
+                if (useCudaFFT) {
+                    cufftResult result = cufftPlan3d(&fftForward, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_D2Z : CUFFT_R2C);
+                    if (result != CUFFT_SUCCESS)
+                        throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
+                    result = cufftPlan3d(&fftBackward, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_Z2D : CUFFT_C2R);
+                    if (result != CUFFT_SUCCESS)
+                        throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
+                    cufftSetCompatibilityMode(fftForward, CUFFT_COMPATIBILITY_NATIVE);
+                    cufftSetCompatibilityMode(fftBackward, CUFFT_COMPATIBILITY_NATIVE);
+                }
+                else
+                    fft = new CudaFFT3D(cu, gridSizeX, gridSizeY, gridSizeZ, true);
                
                // Prepare for doing PME on its own stream.
                
-                int cufftVersion;
-                cufftGetVersion(&cufftVersion);
-                usePmeStream = (cu.getComputeCapability() < 5.0 && numParticles < 130000 && cufftVersion >= 6000); // Workarounds for various CUDA bugs
+                char deviceName[100];
+                cuDeviceGetName(deviceName, 100, cu.getDevice());
+                usePmeStream = (string(deviceName) != "GeForce GTX 980"); // Using a separate stream is slower on GTX 980
                if (usePmeStream) {
                    cuStreamCreate(&pmeStream, CU_STREAM_NON_BLOCKING);
-                    cufftSetStream(fftForward, pmeStream);
-                    cufftSetStream(fftBackward, pmeStream);
+                    if (useCudaFFT) {
+                        cufftSetStream(fftForward, pmeStream);
+                        cufftSetStream(fftBackward, pmeStream);
+                    }
                    CHECK_RESULT(cuEventCreate(&pmeSyncEvent, CU_EVENT_DISABLE_TIMING), "Error creating event for NonbondedForce");
                    int recipForceGroup = force.getReciprocalSpaceForceGroup();
                    if (recipForceGroup < 0)
@@ -1893,10 +1879,15 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
            cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, directPmeGrid->getSize());
        }

-        if (cu.getUseDoublePrecision())
-            cufftExecD2Z(fftForward, (double*) directPmeGrid->getDevicePointer(), (double2*) reciprocalPmeGrid->getDevicePointer());
-        else
-            cufftExecR2C(fftForward, (float*) directPmeGrid->getDevicePointer(), (float2*) reciprocalPmeGrid->getDevicePointer());
+        if (useCudaFFT) {
+            if (cu.getUseDoublePrecision())
+                cufftExecD2Z(fftForward, (double*) directPmeGrid->getDevicePointer(), (double2*) reciprocalPmeGrid->getDevicePointer());
+            else
+                cufftExecR2C(fftForward, (float*) directPmeGrid->getDevicePointer(), (float2*) reciprocalPmeGrid->getDevicePointer());
+        }
+        else {
+            fft->execFFT(*directPmeGrid, *reciprocalPmeGrid, true);
+        }

        if (includeEnergy) {
            void* computeEnergyArgs[] = {&reciprocalPmeGrid->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
@@ -1910,11 +1901,15 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
                cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
        cu.executeKernel(pmeConvolutionKernel, convolutionArgs, cu.getNumAtoms());

-        if (cu.getUseDoublePrecision())
-            cufftExecZ2D(fftBackward, (double2*) reciprocalPmeGrid->getDevicePointer(), (double*) directPmeGrid->getDevicePointer());
-        else
-            cufftExecC2R(fftBackward, (float2*) reciprocalPmeGrid->getDevicePointer(), (float*)  directPmeGrid->getDevicePointer());
-
+        if (useCudaFFT) {
+            if (cu.getUseDoublePrecision())
+                cufftExecZ2D(fftBackward, (double2*) reciprocalPmeGrid->getDevicePointer(), (double*) directPmeGrid->getDevicePointer());
+            else
+                cufftExecC2R(fftBackward, (float2*) reciprocalPmeGrid->getDevicePointer(), (float*)  directPmeGrid->getDevicePointer());
+        }
+        else {
+            fft->execFFT(*reciprocalPmeGrid, *directPmeGrid, false);
+        }

        void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &directPmeGrid->getDevicePointer(),
                cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex->getDevicePointer()};

--- a/platforms/cuda/src/kernels/fft.cu
+++ b/platforms/cuda/src/kernels/fft.cu
+static __inline__ __device__ real2 multiplyComplex(real2 c1, real2 c2) {
+    return make_real2(c1.x*c2.x-c1.y*c2.y, c1.x*c2.y+c1.y*c2.x);
+}
+
+/**
+ * Load a value from the half-complex grid produces by a real-to-complex transform.
+ */
+static __inline__ __device__ real2 loadComplexValue(const real2* __restrict__ in, int x, int y, int z) {
+    const int inputZSize = ZSIZE/2+1;
+    if (z < inputZSize)
+        return in[x*YSIZE*inputZSize+y*inputZSize+z];
+    int xp = (x == 0 ? 0 : XSIZE-x);
+    int yp = (y == 0 ? 0 : YSIZE-y);
+    real2 value = in[xp*YSIZE*inputZSize+yp*inputZSize+(ZSIZE-z)];
+    return make_real2(value.x, -value.y);
+}
+
+/**
+ * Perform a 1D FFT on each row along one axis.
+ */
+
+extern "C" __global__ void execFFT(const INPUT_TYPE* __restrict__ in, OUTPUT_TYPE* __restrict__ out) {
+    __shared__ real2 w[ZSIZE];
+    __shared__ real2 data0[BLOCKS_PER_GROUP*ZSIZE];
+    __shared__ real2 data1[BLOCKS_PER_GROUP*ZSIZE];
+    for (int i = threadIdx.x; i < ZSIZE; i += blockDim.x)
+        w[i] = make_real2(cos(-(SIGN)*i*2*M_PI/ZSIZE), sin(-(SIGN)*i*2*M_PI/ZSIZE));
+    __syncthreads();
+    
+    const int block = threadIdx.x/THREADS_PER_BLOCK;
+    for (int baseIndex = blockIdx.x*BLOCKS_PER_GROUP; baseIndex < XSIZE*YSIZE; baseIndex += gridDim.x*BLOCKS_PER_GROUP) {
+        int index = baseIndex+block;
+        int x = index/YSIZE;
+        int y = index-x*YSIZE;
+#if OUTPUT_IS_PACKED
+        if (x < XSIZE/2+1) {
+#endif
+        if (index < XSIZE*YSIZE)
+            for (int i = threadIdx.x-block*THREADS_PER_BLOCK; i < ZSIZE; i += THREADS_PER_BLOCK)
+    #if INPUT_IS_REAL
+                data0[i+block*ZSIZE] = make_real2(in[x*(YSIZE*ZSIZE)+y*ZSIZE+i], 0);
+    #elif INPUT_IS_PACKED
+                data0[i+block*ZSIZE] = loadComplexValue(in, x, y, i);
+    #else
+                data0[i+block*ZSIZE] = in[x*(YSIZE*ZSIZE)+y*ZSIZE+i];
+    #endif
+#if OUTPUT_IS_PACKED
+        }
+#endif
+        __syncthreads();
+        COMPUTE_FFT
+    }
+}
--- a/platforms/cuda/src/kernels/fftR2C.cu
+++ b/platforms/cuda/src/kernels/fftR2C.cu
+/**
+ * Combine the two halves of a real grid into a complex grid that is half as large.
+ */
+extern "C" __global__ void packForwardData(const real* __restrict__ in, real2* __restrict__ out) {
+    const int gridSize = PACKED_XSIZE*PACKED_YSIZE*PACKED_ZSIZE;
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) {
+        int x = index/(PACKED_YSIZE*PACKED_ZSIZE);
+        int remainder = index-x*(PACKED_YSIZE*PACKED_ZSIZE);
+        int y = remainder/PACKED_ZSIZE;
+        int z = remainder-y*PACKED_ZSIZE;
+#if PACKED_AXIS == 0
+        real2 value = make_real2(in[2*x*YSIZE*ZSIZE+y*ZSIZE+z], in[(2*x+1)*YSIZE*ZSIZE+y*ZSIZE+z]);
+#elif PACKED_AXIS == 1
+        real2 value = make_real2(in[x*YSIZE*ZSIZE+2*y*ZSIZE+z], in[x*YSIZE*ZSIZE+(2*y+1)*ZSIZE+z]);
+#else
+        real2 value = make_real2(in[x*YSIZE*ZSIZE+y*ZSIZE+2*z], in[x*YSIZE*ZSIZE+y*ZSIZE+(2*z+1)]);
+#endif
+        out[index] = value;
+    }
+}
+
+/**
+ * Split the transformed data back into a full sized, symmetric grid.
+ */
+extern "C" __global__ void unpackForwardData(const real2* __restrict__ in, real2* __restrict__ out) {
+    // Compute the phase factors.
+    
+#if PACKED_AXIS == 0
+    __shared__ real2 w[PACKED_XSIZE];
+    for (int i = threadIdx.x; i < PACKED_XSIZE; i += blockDim.x)
+        w[i] = make_real2(sin(i*2*M_PI/XSIZE), cos(i*2*M_PI/XSIZE));
+#elif PACKED_AXIS == 1
+    __shared__ real2 w[PACKED_YSIZE];
+    for (int i = threadIdx.x; i < PACKED_YSIZE; i += blockDim.x)
+        w[i] = make_real2(sin(i*2*M_PI/YSIZE), cos(i*2*M_PI/YSIZE));
+#else
+    __shared__ real2 w[PACKED_ZSIZE];
+    for (int i = threadIdx.x; i < PACKED_ZSIZE; i += blockDim.x)
+        w[i] = make_real2(sin(i*2*M_PI/ZSIZE), cos(i*2*M_PI/ZSIZE));
+#endif
+    __syncthreads();
+
+    // Transform the data.
+    
+    const int gridSize = PACKED_XSIZE*PACKED_YSIZE*PACKED_ZSIZE;
+    const int outputZSize = ZSIZE/2+1;
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) {
+        int x = index/(PACKED_YSIZE*PACKED_ZSIZE);
+        int remainder = index-x*(PACKED_YSIZE*PACKED_ZSIZE);
+        int y = remainder/PACKED_ZSIZE;
+        int z = remainder-y*PACKED_ZSIZE;
+        int xp = (x == 0 ? 0 : PACKED_XSIZE-x);
+        int yp = (y == 0 ? 0 : PACKED_YSIZE-y);
+        int zp = (z == 0 ? 0 : PACKED_ZSIZE-z);
+        real2 z1 = in[x*PACKED_YSIZE*PACKED_ZSIZE+y*PACKED_ZSIZE+z];
+        real2 z2 = in[xp*PACKED_YSIZE*PACKED_ZSIZE+yp*PACKED_ZSIZE+zp];
+#if PACKED_AXIS == 0
+        real2 wfac = w[x];
+#elif PACKED_AXIS == 1
+        real2 wfac = w[y];
+#else
+        real2 wfac = w[z];
+#endif
+        real2 output = make_real2((z1.x+z2.x - wfac.x*(z1.x-z2.x) + wfac.y*(z1.y+z2.y))/2, (z1.y-z2.y - wfac.y*(z1.x-z2.x) - wfac.x*(z1.y+z2.y))/2);
+        if (z < outputZSize)
+            out[x*YSIZE*outputZSize+y*outputZSize+z] = output;
+        xp = (x == 0 ? 0 : XSIZE-x);
+        yp = (y == 0 ? 0 : YSIZE-y);
+        zp = (z == 0 ? 0 : ZSIZE-z);
+        if (zp < outputZSize) {
+#if PACKED_AXIS == 0
+            if (x == 0)
+                out[PACKED_XSIZE*YSIZE*outputZSize+yp*outputZSize+zp] = make_real2((z1.x-z1.y+z2.x-z2.y)/2, (-z1.x-z1.y+z2.x+z2.y)/2);
+#elif PACKED_AXIS == 1
+            if (y == 0)
+                out[xp*YSIZE*outputZSize+PACKED_YSIZE*outputZSize+zp] = make_real2((z1.x-z1.y+z2.x-z2.y)/2, (-z1.x-z1.y+z2.x+z2.y)/2);
+#else
+            if (z == 0)
+                out[xp*YSIZE*outputZSize+yp*outputZSize+PACKED_ZSIZE] = make_real2((z1.x-z1.y+z2.x-z2.y)/2, (-z1.x-z1.y+z2.x+z2.y)/2);
+#endif
+            else
+                out[xp*YSIZE*outputZSize+yp*outputZSize+zp] = make_real2(output.x, -output.y);
+        }
+    }
+}
+
+/**
+ * Load a value from the half-complex grid produced by a real-to-complex transform.
+ */
+static __inline__ __device__ real2 loadComplexValue(const real2* __restrict__ in, int x, int y, int z) {
+    const int inputZSize = ZSIZE/2+1;
+    if (z < inputZSize)
+        return in[x*YSIZE*inputZSize+y*inputZSize+z];
+    int xp = (x == 0 ? 0 : XSIZE-x);
+    int yp = (y == 0 ? 0 : YSIZE-y);
+    real2 value = in[xp*YSIZE*inputZSize+yp*inputZSize+(ZSIZE-z)];
+    return make_real2(value.x, -value.y);
+}
+
+/**
+ * Repack the symmetric complex grid into one half as large in preparation for doing an inverse complex-to-real transform.
+ */
+extern "C" __global__ void packBackwardData(const real2* __restrict__ in, real2* __restrict__ out) {
+    // Compute the phase factors.
+    
+#if PACKED_AXIS == 0
+    __shared__ real2 w[PACKED_XSIZE];
+    for (int i = threadIdx.x; i < PACKED_XSIZE; i += blockDim.x)
+        w[i] = make_real2(cos(i*2*M_PI/XSIZE), sin(i*2*M_PI/XSIZE));
+#elif PACKED_AXIS == 1
+    __shared__ real2 w[PACKED_YSIZE];
+    for (int i = threadIdx.x; i < PACKED_YSIZE; i += blockDim.x)
+        w[i] = make_real2(cos(i*2*M_PI/YSIZE), sin(i*2*M_PI/YSIZE));
+#else
+    __shared__ real2 w[PACKED_ZSIZE];
+    for (int i = threadIdx.x; i < PACKED_ZSIZE; i += blockDim.x)
+        w[i] = make_real2(cos(i*2*M_PI/ZSIZE), sin(i*2*M_PI/ZSIZE));
+#endif
+    __syncthreads();
+
+    // Transform the data.
+    
+    const int gridSize = PACKED_XSIZE*PACKED_YSIZE*PACKED_ZSIZE;
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) {
+        int x = index/(PACKED_YSIZE*PACKED_ZSIZE);
+        int remainder = index-x*(PACKED_YSIZE*PACKED_ZSIZE);
+        int y = remainder/PACKED_ZSIZE;
+        int z = remainder-y*PACKED_ZSIZE;
+        int xp = (x == 0 ? 0 : PACKED_XSIZE-x);
+        int yp = (y == 0 ? 0 : PACKED_YSIZE-y);
+        int zp = (z == 0 ? 0 : PACKED_ZSIZE-z);
+        real2 z1 = loadComplexValue(in, x, y, z);
+#if PACKED_AXIS == 0
+        real2 wfac = w[x];
+        real2 z2 = loadComplexValue(in, PACKED_XSIZE-x, yp, zp);
+#elif PACKED_AXIS == 1
+        real2 wfac = w[y];
+        real2 z2 = loadComplexValue(in, xp, PACKED_YSIZE-y, zp);
+#else
+        real2 wfac = w[z];
+        real2 z2 = loadComplexValue(in, xp, yp, PACKED_ZSIZE-z);
+#endif
+        real2 even = make_real2((z1.x+z2.x)/2, (z1.y-z2.y)/2);
+        real2 odd = make_real2((z1.x-z2.x)/2, (z1.y+z2.y)/2);
+        odd = make_real2(odd.x*wfac.x-odd.y*wfac.y, odd.y*wfac.x+odd.x*wfac.y);
+        out[x*PACKED_YSIZE*PACKED_ZSIZE+y*PACKED_ZSIZE+z] = make_real2(even.x-odd.y, even.y+odd.x);
+    }
+}
+
+/**
+ * Split the data back into a full sized, real grid after an inverse transform.
+ */
+extern "C" __global__ void unpackBackwardData(const real2* __restrict__ in, real* __restrict__ out) {
+    const int gridSize = PACKED_XSIZE*PACKED_YSIZE*PACKED_ZSIZE;
+    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) {
+        int x = index/(PACKED_YSIZE*PACKED_ZSIZE);
+        int remainder = index-x*(PACKED_YSIZE*PACKED_ZSIZE);
+        int y = remainder/PACKED_ZSIZE;
+        int z = remainder-y*PACKED_ZSIZE;
+        real2 value = 2*in[index];
+#if PACKED_AXIS == 0
+        out[2*x*YSIZE*ZSIZE+y*ZSIZE+z] = value.x;
+        out[(2*x+1)*YSIZE*ZSIZE+y*ZSIZE+z] = value.y;
+#elif PACKED_AXIS == 1
+        out[x*YSIZE*ZSIZE+2*y*ZSIZE+z] = value.x;
+        out[x*YSIZE*ZSIZE+(2*y+1)*ZSIZE+z] = value.y;
+#else
+        out[x*YSIZE*ZSIZE+y*ZSIZE+2*z] = value.x;
+        out[x*YSIZE*ZSIZE+y*ZSIZE+(2*z+1)] = value.y;
+#endif
+    }
+}
--- a/platforms/cuda/src/kernels/pme.cu
+++ b/platforms/cuda/src/kernels/pme.cu
@@ -271,9 +271,9 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __
        real forceX = -q*(force.x*GRID_SIZE_X*recipBoxVecX.x);
        real forceY = -q*(force.x*GRID_SIZE_X*recipBoxVecY.x+force.y*GRID_SIZE_Y*recipBoxVecY.y);
        real forceZ = -q*(force.x*GRID_SIZE_X*recipBoxVecZ.x+force.y*GRID_SIZE_Y*recipBoxVecZ.y+force.z*GRID_SIZE_Z*recipBoxVecZ.z);
-        forceBuffers[atom] += static_cast<unsigned long long>((long long) (forceX*0x100000000));
-        forceBuffers[atom+PADDED_NUM_ATOMS] += static_cast<unsigned long long>((long long) (forceY*0x100000000));
-        forceBuffers[atom+2*PADDED_NUM_ATOMS] += static_cast<unsigned long long>((long long) (forceZ*0x100000000));
+        atomicAdd(&forceBuffers[atom], static_cast<unsigned long long>((long long) (forceX*0x100000000)));
+        atomicAdd(&forceBuffers[atom+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (forceY*0x100000000)));
+        atomicAdd(&forceBuffers[atom+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (forceZ*0x100000000)));
    }
 }


--- a/platforms/cuda/tests/TestCudaFFT3D.cpp
+++ b/platforms/cuda/tests/TestCudaFFT3D.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2011-2015 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+/**
+ * This tests the CUDA implementation of sorting.
+ */
+
+#include "openmm/internal/AssertionUtilities.h"
+#include "CudaArray.h"
+#include "CudaContext.h"
+#include "CudaFFT3D.h"
+#include "CudaSort.h"
+#include "fftpack.h"
+#include "sfmt/SFMT.h"
+#include "openmm/System.h"
+#include <iostream>
+#include <cmath>
+#include <set>
+
+using namespace OpenMM;
+using namespace std;
+
+static CudaPlatform platform;
+
+template <class Real2>
+void testTransform(bool realToComplex, int xsize, int ysize, int zsize) {
+    System system;
+    system.addParticle(0.0);
+    CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), "false",
+            platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()),
+            platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()));
+    CudaContext& context = *platformData.contexts[0];
+    context.initialize();
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+    vector<Real2> original(xsize*ysize*zsize);
+    vector<t_complex> reference(original.size());
+    for (int i = 0; i < (int) original.size(); i++) {
+        Real2 value;
+        value.x = (float) genrand_real2(sfmt);
+        value.y = (float) genrand_real2(sfmt);
+        original[i] = value;
+        reference[i] = t_complex(value.x, value.y);
+    }
+    for (int i = 0; i < (int) reference.size(); i++) {
+        if (realToComplex)
+            reference[i] = t_complex(i%2 == 0 ? original[i/2].x : original[i/2].y, 0);
+        else
+            reference[i] = t_complex(original[i].x, original[i].y);
+    }
+    CudaArray grid1(context, original.size(), sizeof(Real2), "grid1");
+    CudaArray grid2(context, original.size(), sizeof(Real2), "grid2");
+    grid1.upload(original);
+    CudaFFT3D fft(context, xsize, ysize, zsize, realToComplex);
+
+    // Perform a forward FFT, then verify the result is correct.
+
+    fft.execFFT(grid1, grid2, true);
+    vector<Real2> result;
+    grid2.download(result);
+    fftpack_t plan;
+    fftpack_init_3d(&plan, xsize, ysize, zsize);
+    fftpack_exec_3d(plan, FFTPACK_FORWARD, &reference[0], &reference[0]);
+    int outputZSize = (realToComplex ? zsize/2+1 : zsize);
+    for (int x = 0; x < xsize; x++)
+        for (int y = 0; y < ysize; y++)
+            for (int z = 0; z < outputZSize; z++) {
+                int index1 = x*ysize*zsize + y*zsize + z;
+                int index2 = x*ysize*outputZSize + y*outputZSize + z;
+                ASSERT_EQUAL_TOL(reference[index1].re, result[index2].x, 1e-3);
+                ASSERT_EQUAL_TOL(reference[index1].im, result[index2].y, 1e-3);
+            }
+    fftpack_destroy(plan);
+
+    // Perform a backward transform and see if we get the original values.
+
+    fft.execFFT(grid2, grid1, false);
+    grid1.download(result);
+    double scale = 1.0/(xsize*ysize*zsize);
+    int valuesToCheck = (realToComplex ? original.size()/2 : original.size());
+    for (int i = 0; i < valuesToCheck; ++i) {
+        ASSERT_EQUAL_TOL(original[i].x, scale*result[i].x, 1e-4);
+        ASSERT_EQUAL_TOL(original[i].y, scale*result[i].y, 1e-4);
+    }
+}
+
+int main(int argc, char* argv[]) {
+    try {
+        if (argc > 1)
+            platform.setPropertyDefaultValue("CudaPrecision", string(argv[1]));
+        if (platform.getPropertyDefaultValue("CudaPrecision") == "double") {
+            testTransform<double2>(false, 28, 25, 30);
+            testTransform<double2>(true, 28, 25, 25);
+            testTransform<double2>(true, 25, 28, 25);
+            testTransform<double2>(true, 25, 25, 28);
+            testTransform<double2>(true, 21, 25, 27);
+        }
+        else {
+            testTransform<float2>(false, 28, 25, 30);
+            testTransform<float2>(true, 28, 25, 25);
+            testTransform<float2>(true, 25, 28, 25);
+            testTransform<float2>(true, 25, 25, 28);
+            testTransform<float2>(true, 21, 25, 27);
+        }
+    }
+    catch(const exception& e) {
+        cout << "exception: " << e.what() << endl;
+        return 1;
+    }
+    cout << "Done" << endl;
+    return 0;
+}
--- a/platforms/opencl/include/OpenCLContext.h
+++ b/platforms/opencl/include/OpenCLContext.h
@@ -30,13 +30,13 @@
 #include <map>
 #include <queue>
 #include <string>
-#include <pthread.h>
 #define __CL_ENABLE_EXCEPTIONS
 #define CL_USE_DEPRECATED_OPENCL_1_1_APIS
 #ifdef _MSC_VER
    // Prevent Windows from defining macros that interfere with other code.
    #define NOMINMAX
 #endif
+#include <pthread.h>
 #include <cl.hpp>
 #include "windowsExportOpenCL.h"
 #include "OpenCLPlatform.h"

--- a/plugins/cpupme/src/CpuPmeKernels.cpp
+++ b/plugins/cpupme/src/CpuPmeKernels.cpp
@@ -177,12 +177,12 @@ static void computeReciprocalEterm(int start, int end, int gridx, int gridy, int
    }
 }

-static float reciprocalEnergy(int start, int end, fftwf_complex* grid, int gridx, int gridy, int gridz, double alpha, vector<float>* bsplineModuli, Vec3* periodicBoxVectors, Vec3* recipBoxVectors) {
+static double reciprocalEnergy(int start, int end, fftwf_complex* grid, int gridx, int gridy, int gridz, double alpha, vector<float>* bsplineModuli, Vec3* periodicBoxVectors, Vec3* recipBoxVectors) {
    const unsigned int zsizeHalf = gridz/2+1;
    const unsigned int yzsizeHalf = gridy*zsizeHalf;
    const float scaleFactor = (float) (M_PI*periodicBoxVectors[0][0]*periodicBoxVectors[1][1]*periodicBoxVectors[2][2]);
    const float recipExpFactor = (float) (M_PI*M_PI/(alpha*alpha));
-    float energy = 0.0f;
+    double energy = 0.0;

    int firstz = (start == 0 ? 1 : 0);
    for (int kx = start; kx < end; kx++) {
@@ -220,7 +220,7 @@ static float reciprocalEnergy(int start, int end, fftwf_complex* grid, int gridx
            firstz = 0;
        }
    }
-    return 0.5f*energy;
+    return 0.5*energy;
 }

 static void reciprocalConvolution(int start, int end, fftwf_complex* grid, int gridx, int gridy, int gridz, vector<float>& recipEterm) {

--- a/plugins/cpupme/src/CpuPmeKernels.h
+++ b/plugins/cpupme/src/CpuPmeKernels.h
@@ -32,6 +32,7 @@
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */

+#define NOMINMAX
 #include "internal/windowsExportPme.h"
 #include "openmm/kernels.h"
 #include "openmm/Vec3.h"

--- a/wrappers/python/simtk/openmm/__init__.py
+++ b/wrappers/python/simtk/openmm/__init__.py
@@ -20,6 +20,8 @@ if sys.platform == 'win32':

 from simtk.openmm.openmm import *
 from simtk.openmm.vec3 import Vec3
+from simtk.openmm.mtsintegrator import MTSIntegrator
+from simtk.openmm.amd import AMDIntegrator, AMDForceGroupIntegrator, DualAMDIntegrator

 if os.getenv('OPENMM_PLUGIN_DIR') is None and os.path.isdir(version.openmm_library_path):
    pluginLoadedLibNames = Platform.loadPluginsFromDirectory(os.path.join(version.openmm_library_path, 'plugins'))

--- a/wrappers/python/simtk/openmm/amd.py
+++ b/wrappers/python/simtk/openmm/amd.py
+"""
+amd.py: Implements the aMD integration method.
+
+This is part of the OpenMM molecular simulation toolkit originating from
+Simbios, the NIH National Center for Physics-Based Simulation of
+Biological Structures at Stanford, funded under the NIH Roadmap for
+Medical Research, grant U54 GM072970. See https://simtk.org.
+
+Portions copyright (c) 2012 Stanford University and the Authors.
+Authors: Peter Eastman, Steffen Lindert
+Contributors:
+
+Permission is hereby granted, free of charge, to any person obtaining a 
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, sublicense,
+and/or sell copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+USE OR OTHER DEALINGS IN THE SOFTWARE.
+"""
+__author__ = "Peter Eastman"
+__version__ = "1.0"
+
+from simtk.openmm import CustomIntegrator
+from simtk.unit import kilojoules_per_mole, is_quantity
+
+class AMDIntegrator(CustomIntegrator):
+    """AMDIntegrator implements the aMD integration algorithm.
+    
+    The system is integrated based on a modified potential.  Whenever the energy V(r) is less than a
+    cutoff value E, the following effective potential is used:
+    
+    V*(r) = V(r) + (E-V(r))^2 / (alpha+E-V(r))
+    
+    For details, see Hamelberg et al., J. Chem. Phys. 127, 155102 (2007).
+    """
+    
+    def __init__(self, dt, alpha, E):
+        """Create an AMDIntegrator.
+        
+        Parameters:
+         - dt (time) The integration time step to use
+         - alpha (energy) The alpha parameter to use
+         - E (energy) The energy cutoff to use
+        """
+        CustomIntegrator.__init__(self, dt)
+        self.addGlobalVariable("alpha", alpha)
+        self.addGlobalVariable("E", E)
+        self.addPerDofVariable("oldx", 0)
+        self.addUpdateContextState();
+        self.addComputePerDof("v", "v+dt*fprime/m; fprime=f*((1-modify) + modify*(alpha/(alpha+E-energy))^2); modify=step(E-energy)")
+        self.addComputePerDof("oldx", "x")
+        self.addComputePerDof("x", "x+dt*v")
+        self.addConstrainPositions()
+        self.addComputePerDof("v", "(x-oldx)/dt")
+    
+    def getAlpha(self):
+        """Get the value of alpha for the integrator."""
+        return self.getGlobalVariable(0)*kilojoules_per_mole
+    
+    def setAlpha(self, alpha):
+        """Set the value of alpha for the integrator."""
+        self.setGlobalVariable(0, alpha)
+    
+    def getE(self):
+        """Get the energy threshold E for the integrator."""
+        return self.getGlobalVariable(1)*kilojoules_per_mole
+    
+    def setE(self, E):
+        """Set the energy threshold E for the integrator."""
+        self.setGlobalVariable(1, E)
+    
+    def getEffectiveEnergy(self, energy):
+        """Given the actual potential energy of the system, return the value of the effective potential."""
+        alpha = self.getAlpha()
+        E = self.getE()
+        if not is_quantity(energy):
+            energy = energy*kilojoules_per_mole # Assume kJ/mole
+        if (energy > E):
+            return energy
+        return energy+(E-energy)*(E-energy)/(alpha+E-energy)
+
+
+class AMDForceGroupIntegrator(CustomIntegrator):
+    """AMDForceGroupIntegrator implements a single boost aMD integration algorithm.
+    
+    This is similar to AMDIntegrator, but is applied based on the energy of a single force group
+    (typically representing torsions).
+    
+    For details, see Hamelberg et al., J. Chem. Phys. 127, 155102 (2007).
+    """
+    
+    def __init__(self, dt, group, alphaGroup, EGroup):
+        """Create a AMDForceGroupIntegrator.
+        
+        Parameters:
+         - dt (time) The integration time step to use
+         - group (int) The force group to apply the boost to
+         - alphaGroup (energy) The alpha parameter to use for the boosted force group
+         - EGroup (energy) The energy cutoff to use for the boosted force group
+        """
+        CustomIntegrator.__init__(self, dt)
+        self.addGlobalVariable("alphaGroup", alphaGroup)
+        self.addGlobalVariable("EGroup", EGroup)
+        self.addGlobalVariable("groupEnergy", 0)
+        self.addPerDofVariable("oldx", 0)
+        self.addPerDofVariable("fg", 0)
+        self.addUpdateContextState();
+        self.addComputeGlobal("groupEnergy", "energy"+str(group))
+        self.addComputePerDof("fg", "f"+str(group))
+        self.addComputePerDof("v", "v+dt*fprime/m; fprime=fother + fg*((1-modify) + modify*(alphaGroup/(alphaGroup+EGroup-groupEnergy))^2); fother=f-fg; modify=step(EGroup-groupEnergy)")
+        self.addComputePerDof("oldx", "x")
+        self.addComputePerDof("x", "x+dt*v")
+        self.addConstrainPositions()
+        self.addComputePerDof("v", "(x-oldx)/dt")
+    
+    def getAlphaGroup(self):
+        """Get the value of alpha for the boosted force group."""
+        return self.getGlobalVariable(0)*kilojoules_per_mole
+    
+    def setAlphaGroup(self, alpha):
+        """Set the value of alpha for the boosted force group."""
+        self.setGlobalVariable(0, alpha)
+    
+    def getEGroup(self):
+        """Get the energy threshold E for the boosted force group."""
+        return self.getGlobalVariable(1)*kilojoules_per_mole
+    
+    def setEGroup(self, E):
+        """Set the energy threshold E for the boosted force group."""
+        self.setGlobalVariable(1, E)
+    
+    def getEffectiveEnergy(self, groupEnergy):
+        """Given the actual group energy of the system, return the value of the effective potential.
+        
+        Parameters:
+          - groupEnergy (energy): the actual potential energy of the boosted force group
+        Returns: the value of the effective potential
+        """
+        alphaGroup = self.getAlphaGroup()
+        EGroup = self.getEGroup()
+        if not is_quantity(groupEnergy):
+            groupEnergy = groupEnergy*kilojoules_per_mole # Assume kJ/mole
+        dE = 0.0*kilojoules_per_mole
+        if (groupEnergy < EGroup):
+            dE = dE + (EGroup-groupEnergy)*(EGroup-groupEnergy)/(alphaGroup+EGroup-groupEnergy)
+        return groupEnergy+dE
+
+
+
+class DualAMDIntegrator(CustomIntegrator):
+    """DualAMDIntegrator implements a dual boost aMD integration algorithm.
+    
+    This is similar to AMDIntegrator, but two different boosts are applied to the potential:
+    one based on the total energy, and one based on the energy of a single force group
+    (typically representing torsions).
+
+    For details, see Hamelberg et al., J. Chem. Phys. 127, 155102 (2007).
+    """
+    
+    def __init__(self, dt, group, alphaTotal, ETotal, alphaGroup, EGroup):
+        """Create a DualAMDIntegrator.
+
+        Parameters:
+         - dt (time) The integration time step to use
+         - group (int) The force group to apply the second boost to
+         - alphaTotal (energy) The alpha parameter to use for the total energy
+         - ETotal (energy) The energy cutoff to use for the total energy
+         - alphaGroup (energy) The alpha parameter to use for the boosted force group
+         - EGroup (energy) The energy cutoff to use for the boosted force group
+        """
+        CustomIntegrator.__init__(self, dt)
+        self.addGlobalVariable("alphaTotal", alphaTotal)
+        self.addGlobalVariable("ETotal", ETotal)
+        self.addGlobalVariable("alphaGroup", alphaGroup)
+        self.addGlobalVariable("EGroup", EGroup)
+        self.addGlobalVariable("groupEnergy", 0)
+        self.addPerDofVariable("oldx", 0)
+        self.addPerDofVariable("fg", 0)
+        self.addUpdateContextState();
+        self.addComputeGlobal("groupEnergy", "energy"+str(group))
+        self.addComputePerDof("fg", "f"+str(group))
+        self.addComputePerDof("v", """v+dt*fprime/m;
+                                      fprime=fprime1 + fprime2;
+                                      fprime2=fg*((1-modifyGroup) + modifyGroup*(alphaGroup/(alphaGroup+EGroup-groupEnergy))^2);
+                                      fprime1=fother*((1-modifyTotal) + modifyTotal*(alphaTotal/(alphaTotal+ETotal-energy))^2);
+                                      fother=f-fg;
+                                      modifyTotal=step(ETotal-energy); modifyGroup=step(EGroup-groupEnergy)""")
+        self.addComputePerDof("oldx", "x")
+        self.addComputePerDof("x", "x+dt*v")
+        self.addConstrainPositions()
+        self.addComputePerDof("v", "(x-oldx)/dt")
+    
+    def getAlphaTotal(self):
+        """Get the value of alpha for the total energy."""
+        return self.getGlobalVariable(0)*kilojoules_per_mole
+    
+    def setAlphaTotal(self, alpha):
+        """Set the value of alpha for the total energy."""
+        self.setGlobalVariable(0, alpha)
+    
+    def getETotal(self):
+        """Get the energy threshold E for the total energy."""
+        return self.getGlobalVariable(1)*kilojoules_per_mole
+    
+    def setETotal(self, E):
+        """Set the energy threshold E for the total energy."""
+        self.setGlobalVariable(1, E)
+    
+    def getAlphaGroup(self):
+        """Get the value of alpha for the boosted force group."""
+        return self.getGlobalVariable(2)*kilojoules_per_mole
+    
+    def setAlphaGroup(self, alpha):
+        """Set the value of alpha for the boosted force group."""
+        self.setGlobalVariable(2, alpha)
+    
+    def getEGroup(self):
+        """Get the energy threshold E for the boosted force group."""
+        return self.getGlobalVariable(3)*kilojoules_per_mole
+    
+    def setEGroup(self, E):
+        """Set the energy threshold E for the boosted force group."""
+        self.setGlobalVariable(3, E)
+    
+    def getEffectiveEnergy(self, totalEnergy, groupEnergy):
+        """Given the actual potential energy of the system, return the value of the effective potential.
+        
+        Parameters:
+         - totalEnergy (energy): the actual potential energy of the whole system
+         - groupEnergy (energy): the actual potential energy of the boosted force group
+        Returns: the value of the effective potential
+        """
+        alphaTotal = self.getAlphaTotal()
+        ETotal = self.getETotal()
+        alphaGroup = self.getAlphaGroup()
+        EGroup = self.getEGroup()
+        if not is_quantity(totalEnergy):
+            totalEnergy = totalEnergy*kilojoules_per_mole # Assume kJ/mole
+        if not is_quantity(groupEnergy):
+            groupEnergy = groupEnergy*kilojoules_per_mole # Assume kJ/mole
+        dE = 0.0*kilojoules_per_mole
+        if (totalEnergy < ETotal):
+            dE = dE + (ETotal-totalEnergy)*(ETotal-totalEnergy)/(alphaTotal+ETotal-totalEnergy)
+        if (groupEnergy < EGroup):
+            dE = dE + (EGroup-groupEnergy)*(EGroup-groupEnergy)/(alphaGroup+EGroup-groupEnergy)
+        return totalEnergy+dE
--- a/wrappers/python/simtk/openmm/app/__init__.py
+++ b/wrappers/python/simtk/openmm/app/__init__.py
@@ -4,7 +4,7 @@ OpenMM Application
 __docformat__ = "epytext en"

 __author__ = "Peter Eastman"
-__copyright__ = "Copyright 2011, Stanford University and Peter Eastman"
+__copyright__ = "Copyright 2015, Stanford University and Peter Eastman"
 __credits__ = []
 __license__ = "MIT"
 __maintainer__ = "Peter Eastman"

--- a/wrappers/python/simtk/openmm/app/gromacstopfile.py
+++ b/wrappers/python/simtk/openmm/app/gromacstopfile.py
@@ -8,7 +8,7 @@ Medical Research, grant U54 GM072970. See https://simtk.org.

 Portions copyright (c) 2012-2015 Stanford University and the Authors.
 Authors: Peter Eastman
-Contributors:
+Contributors: Jason Swails

 Permission is hereby granted, free of charge, to any person obtaining a
 copy of this software and associated documentation files (the "Software"),
@@ -40,7 +40,9 @@ import simtk.unit as unit
 import simtk.openmm as mm
 import math
 import os
+import re
 import distutils.spawn
+from collections import OrderedDict

 HBonds = ff.HBonds
 AllBonds = ff.AllBonds
@@ -48,6 +50,57 @@ HAngles = ff.HAngles

 OBC2 = prmtop.OBC2

+novarcharre = re.compile(r'\W')
+
+def _find_all_instances_in_string(string, substr):
+    """ Find indices of all instances of substr in string """
+    indices = []
+    idx = string.find(substr, 0)
+    while idx > -1:
+        indices.append(idx)
+        idx = string.find(substr, idx+1)
+    return indices
+
+def _replace_defines(line, defines):
+    """ Replaces defined tokens in a given line """
+    if not defines: return line
+    for define in reversed(defines):
+        value = defines[define]
+        indices = _find_all_instances_in_string(line, define)
+        if not indices: continue
+        # Check to see if it's inside of quotes
+        inside = ''
+        idx = 0
+        n_to_skip = 0
+        new_line = []
+        for i, char in enumerate(line):
+            if n_to_skip:
+                n_to_skip -= 1
+                continue
+            if char in ('\'"'):
+                if not inside:
+                    inside = char
+                else:
+                    if inside == char:
+                        inside = ''
+            if idx < len(indices) and i == indices[idx]:
+                if inside:
+                    new_line.append(char)
+                    idx += 1
+                    continue
+                if i == 0 or novarcharre.match(line[i-1]):
+                    endidx = indices[idx] + len(define)
+                    if endidx >= len(line) or novarcharre.match(line[endidx]):
+                        new_line.extend(list(value))
+                        n_to_skip = len(define) - 1
+                        idx += 1
+                        continue
+                idx += 1
+            new_line.append(char)
+        line = ''.join(new_line)
+
+    return line
+
 class GromacsTopFile(object):
    """GromacsTopFile parses a Gromacs top file and constructs a Topology and (optionally) an OpenMM System from it."""

@@ -61,6 +114,7 @@ class GromacsTopFile(object):
            self.exclusions = []
            self.pairs = []
            self.cmaps = []
+            self.has_virtual_sites = False

    def _processFile(self, file):
        append = ''
@@ -113,6 +167,7 @@ class GromacsTopFile(object):
                name = fields[1]
                valueStart = stripped.find(name, len(command))+len(name)+1
                value = line[valueStart:].strip()
+                value = value or '1' # Default define is 1
                self._defines[name] = value
            elif command == '#ifdef':
                # See whether this block should be ignored.
@@ -121,6 +176,12 @@ class GromacsTopFile(object):
                name = fields[1]
                self._ifStack.append(name in self._defines)
                self._elseStack.append(False)
+            elif command == '#undef':
+                # Un-define a variable
+                if len(fields) < 2:
+                    raise ValueError('Illegal line in .top file: '+line)
+                if fields[1] in self._defines:
+                    self._defines.pop(fields[1])
            elif command == '#ifndef':
                # See whether this block should be ignored.
                if len(fields) < 2:
@@ -145,6 +206,11 @@ class GromacsTopFile(object):
                self._elseStack[-1] = True

        elif not ignore:
+            # Gromacs occasionally uses #define's to introduce specific
+            # parameters for individual terms (for instance, this is how
+            # ff99SB-ILDN is implemented). So make sure we do the appropriate
+            # pre-processor replacements necessary
+            line = _replace_defines(line, self._defines)
            # A line of data for the current category
            if self._currentCategory is None:
                raise ValueError('Unexpected line in .top file: '+line)
@@ -182,6 +248,11 @@ class GromacsTopFile(object):
                self._processPairType(line)
            elif self._currentCategory == 'cmaptypes':
                self._processCmapType(line)
+            elif self._currentCategory.startswith('virtual_sites'):
+                if self._currentMoleculeType is None:
+                    raise ValueError('Found %s before [ moleculetype ]' %
+                                     self._currentCategory)
+                self._currentMoleculeType.has_virtual_sites = True

    def _processDefaults(self, line):
        """Process the [ defaults ] line."""
@@ -193,7 +264,7 @@ class GromacsTopFile(object):
        if fields[1] != '2':
            raise ValueError('Unsupported combination rule: '+fields[1])
        if fields[2].lower() == 'no':
-            raise ValueError('gen_pairs=no is not supported')
+            self._genpairs = False
        self._defaults = fields

    def _processMoleculeType(self, line):
@@ -379,9 +450,12 @@ class GromacsTopFile(object):
        # unless the preprocessor #define FLEXIBLE is given, don't define
        # bonds between the water hydrogen and oxygens, but only give the
        # constraint distances and exclusions.
-        self._defines = {'FLEXIBLE': True}
+        self._defines = OrderedDict()
+        self._defines['FLEXIBLE'] = True
+        self._genpairs = True
        if defines is not None:
-            self._defines.update(defines)
+            for define, value in defines.iteritems():
+                self._defines[define] = value

        # Parse the file.

@@ -416,6 +490,8 @@ class GromacsTopFile(object):
            if moleculeName not in self._moleculeTypes:
                raise ValueError("Unknown molecule type: "+moleculeName)
            moleculeType = self._moleculeTypes[moleculeName]
+            if moleculeCount > 0 and moleculeType.has_virtual_sites:
+                raise ValueError('Virtual sites not yet supported by Gromacs parsers')

            # Create the specified number of molecules of this type.

@@ -747,6 +823,9 @@ class GromacsTopFile(object):
                        params = self._pairTypes[types][3:5]
                    elif types[::-1] in self._pairTypes:
                        params = self._pairTypes[types[::-1]][3:5]
+                    elif not self._genpairs:
+                        raise ValueError('No pair parameters defined for atom '
+                                         'types %s and gen-pairs is "no"' % types)
                    else:
                        continue # We'll use the automatically generated parameters
                    atom1params = nb.getParticleParameters(baseAtomIndex+atoms[0])
@@ -796,8 +875,8 @@ class GromacsTopFile(object):
 def _defaultGromacsIncludeDir():
    """Find the location where gromacs #include files are referenced from, by
    searching for (1) gromacs environment variables, (2) for the gromacs binary
-    'pdb2gmx' in the PATH, or (3) just using the default gromacs install
-    location, /usr/local/gromacs/share/gromacs/top """
+    'pdb2gmx' or 'gmx' in the PATH, or (3) just using the default gromacs
+    install location, /usr/local/gromacs/share/gromacs/top """
    if 'GMXDATA' in os.environ:
        return os.path.join(os.environ['GMXDATA'], 'top')
    if 'GMXBIN' in os.environ:
@@ -806,5 +885,9 @@ def _defaultGromacsIncludeDir():
    pdb2gmx_path = distutils.spawn.find_executable('pdb2gmx')
    if pdb2gmx_path is not None:
        return os.path.abspath(os.path.join(os.path.dirname(pdb2gmx_path), '..', 'share', 'gromacs', 'top'))
+    else:
+        gmx_path = distutils.spawn.find_executable('gmx')
+        if gmx_path is not None:
+            return os.path.abspath(os.path.join(os.path.dirname(gmx_path), '..', 'share', 'gromacs', 'top'))

    return '/usr/local/gromacs/share/gromacs/top'
--- a/wrappers/python/simtk/openmm/app/internal/amber_file_parser.py
+++ b/wrappers/python/simtk/openmm/app/internal/amber_file_parser.py
@@ -1363,6 +1363,7 @@ class AmberNetcdfRestart(object):
            if 'velocities' in ncfile.variables:
                vels = ncfile.variables['velocities']
                self.velocities = np.array(vels[:]) * vels.scale_factor
+                del vels # Get rid of reference to variable to avoid warnings
            if ('cell_lengths' in ncfile.variables and
                'cell_angles' in ncfile.variables):
                self.boxVectors = np.zeros((3,3), np.float32)
@@ -1372,6 +1373,7 @@ class AmberNetcdfRestart(object):
                        units.degrees)
                self.boxVectors = computePeriodicBoxVectors(leng[0], leng[1],
                        leng[2], angl[0], angl[1], angl[2])
+                del leng, angl # Avoid warnings
            if 'time' in ncfile.variables:
                self.time = ncfile.variables['time'].getValue()
        finally:

--- a/wrappers/python/simtk/openmm/app/internal/customgbforces.py
+++ b/wrappers/python/simtk/openmm/app/internal/customgbforces.py
@@ -6,7 +6,7 @@ Simbios, the NIH National Center for Physics-Based Simulation of
 Biological Structures at Stanford, funded under the NIH Roadmap for
 Medical Research, grant U54 GM072970. See https://simtk.org.

-Portions copyright (c) 2012-2014 University of Virginia and the Authors.
+Portions copyright (c) 2012-2015 University of Virginia and the Authors.
 Authors: Christoph Klein, Michael R. Shirts
 Contributors: Jason M. Swails, Peter Eastman

@@ -31,7 +31,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.

 from __future__ import division

-from simtk.openmm import CustomGBForce, Discrete1DFunction
+from simtk.openmm import CustomGBForce, Continuous2DFunction

 d0=[2.26685,2.32548,2.38397,2.44235,2.50057,2.55867,2.61663,2.67444,
    2.73212,2.78965,2.84705,2.9043,2.96141,3.0184,3.07524,3.13196,
@@ -308,12 +308,11 @@ def GBSAGBnForce(solventDielectric=78.5, soluteDielectric=1, SA=None,
    custom.addPerParticleParameter("or") # Offset radius
    custom.addPerParticleParameter("sr") # Scaled offset radius

-    custom.addTabulatedFunction("getd0", Discrete1DFunction(d0))
-    custom.addTabulatedFunction("getm0", Discrete1DFunction(m0))
+    custom.addTabulatedFunction("getd0", Continuous2DFunction(21, 21, d0, 0.1, 0.2, 0.1, 0.2))
+    custom.addTabulatedFunction("getm0", Continuous2DFunction(21, 21, m0, 0.1, 0.2, 0.1, 0.2))

    custom.addComputedValue("I",  "Ivdw+neckScale*Ineck;"
-                                  "Ineck=step(radius1+radius2+neckCut-r)*getm0(index)/(1+100*(r-getd0(index))^2+0.3*1000000*(r-getd0(index))^6);"
-                                  "index = (radius2*200-20)*21 + (radius1*200-20);"
+                                  "Ineck=step(radius1+radius2+neckCut-r)*getm0(radius1,radius2)/(1+100*(r-getd0(radius1,radius2))^2+0.3*1000000*(r-getd0(radius1,radius2))^6);"
                                  "Ivdw=step(r+sr2-or1)*0.5*(1/L-1/U+0.25*(r-sr2^2/r)*(1/(U^2)-1/(L^2))+0.5*log(L/U)/r);"
                                  "U=r+sr2;"
                                  "L=max(or1, D);"
@@ -350,12 +349,11 @@ def GBSAGBn2Force(solventDielectric=78.5, soluteDielectric=1, SA=None,
    custom.addPerParticleParameter("beta")
    custom.addPerParticleParameter("gamma")

-    custom.addTabulatedFunction("getd0", Discrete1DFunction(d0))
-    custom.addTabulatedFunction("getm0", Discrete1DFunction(m0))
+    custom.addTabulatedFunction("getd0", Continuous2DFunction(21, 21, d0, 0.1, 0.2, 0.1, 0.2))
+    custom.addTabulatedFunction("getm0", Continuous2DFunction(21, 21, m0, 0.1, 0.2, 0.1, 0.2))

    custom.addComputedValue("I",  "Ivdw+neckScale*Ineck;"
-                                  "Ineck=step(radius1+radius2+neckCut-r)*getm0(index)/(1+100*(r-getd0(index))^2+0.3*1000000*(r-getd0(index))^6);"
-                                  "index = (radius2*200-20)*21 + (radius1*200-20);"
+                                  "Ineck=step(radius1+radius2+neckCut-r)*getm0(radius1,radius2)/(1+100*(r-getd0(radius1,radius2))^2+0.3*1000000*(r-getd0(radius1,radius2))^6);"
                                  "Ivdw=step(r+sr2-or1)*0.5*(1/L-1/U+0.25*(r-sr2^2/r)*(1/(U^2)-1/(L^2))+0.5*log(L/U)/r);"
                                  "U=r+sr2;"
                                  "L=max(or1, D);"

--- a/wrappers/python/simtk/openmm/app/simulation.py
+++ b/wrappers/python/simtk/openmm/app/simulation.py
--- a/wrappers/python/simtk/openmm/app/statedatareporter.py
+++ b/wrappers/python/simtk/openmm/app/statedatareporter.py