Use PocketFFT (#3667)

* Use PocketFFT instead of FFTW * Minor cleanup * Use PocketFFT instead of fftpack for reference platform * Remove FFTW as a dependency * Converted a test case to use PocketFFT * Fixed an incorrect comment

Use PocketFFT (#3667)
* Use PocketFFT instead of FFTW * Minor cleanup * Use PocketFFT instead of fftpack for reference platform * Remove FFTW as a dependency * Converted a test case to use PocketFFT * Fixed an incorrect comment
1dac981a · Peter Eastman · GitHub · 583471a6 · 1dac981a · 583471a6
Unverified Commit 1dac981a authored Jun 30, 2022 by Peter Eastman Committed by GitHub Jun 30, 2022
13 changed files
--- a/platforms/opencl/tests/TestOpenCLFFT.cpp
+++ b/platforms/opencl/tests/TestOpenCLFFT.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2011-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2011-2022 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -30,7 +30,7 @@
 * -------------------------------------------------------------------------- */
 /**
- * This tests the OpenCL implementation of sorting.
+ * This tests the OpenCL implementation of FFT.
 */
 #include "openmm/internal/AssertionUtilities.h"
@@ -38,12 +38,16 @@
 #include "OpenCLContext.h"
 #include "OpenCLFFT3D.h"
 #include "OpenCLSort.h"
-#include "fftpack.h"
 #include "sfmt/SFMT.h"
 #include "openmm/System.h"
+#include <complex>
 #include <iostream>
 #include <cmath>
 #include <set>
+#ifdef _MSC_VER
+  #define POCKETFFT_NO_VECTORS
+#endif
+#include "pocketfft_hdronly.h"
 using namespace OpenMM;
 using namespace std;
@@ -60,17 +64,17 @@ void testTransform(bool realToComplex, int xsize, int ysize, int zsize) {
    OpenMM_SFMT::SFMT sfmt;
    init_gen_rand(0, sfmt);
    vector<Real2> original(xsize*ysize*zsize);
-    vector<t_complex> reference(original.size());
+    vector<complex<double> > reference(original.size());
    for (int i = 0; i < (int) original.size(); i++) {
        Real2 value = Real2((cl_float) genrand_real2(sfmt), (cl_float) genrand_real2(sfmt));
        original[i] = value;
-        reference[i] = t_complex(value.x, value.y);
+        reference[i] = complex<double>(value.x, value.y);
    }
    for (int i = 0; i < (int) reference.size(); i++) {
        if (realToComplex)
-            reference[i] = t_complex(i%2 == 0 ? original[i/2].x : original[i/2].y, 0);
+            reference[i] = complex<double>(i%2 == 0 ? original[i/2].x : original[i/2].y, 0);
        else
-            reference[i] = t_complex(original[i].x, original[i].y);
+            reference[i] = complex<double>(original[i].x, original[i].y);
    }
    OpenCLArray grid1(context, original.size(), sizeof(Real2), "grid1");
    OpenCLArray grid2(context, original.size(), sizeof(Real2), "grid2");
@@ -82,19 +86,21 @@ void testTransform(bool realToComplex, int xsize, int ysize, int zsize) {
    fft.execFFT(grid1, grid2, true);
    vector<Real2> result;
    grid2.download(result);
-    fftpack_t plan;
+    vector<size_t> shape = {(size_t) xsize, (size_t) ysize, (size_t) zsize};
-    fftpack_init_3d(&plan, xsize, ysize, zsize);
+    vector<size_t> axes = {0, 1, 2};
-    fftpack_exec_3d(plan, FFTPACK_FORWARD, &reference[0], &reference[0]);
+    vector<ptrdiff_t> stride = {(ptrdiff_t) (ysize*zsize*sizeof(complex<double>)),
+                                (ptrdiff_t) (zsize*sizeof(complex<double>)),
+                                (ptrdiff_t) sizeof(complex<double>)};
+    pocketfft::c2c(shape, stride, stride, axes, true, reference.data(), reference.data(), 1.0);
    int outputZSize = (realToComplex ? zsize/2+1 : zsize);
    for (int x = 0; x < xsize; x++)
        for (int y = 0; y < ysize; y++)
            for (int z = 0; z < outputZSize; z++) {
                int index1 = x*ysize*zsize + y*zsize + z;
                int index2 = x*ysize*outputZSize + y*outputZSize + z;
-                ASSERT_EQUAL_TOL(reference[index1].re, result[index2].x, 1e-3);
+                ASSERT_EQUAL_TOL(reference[index1].real(), result[index2].x, 1e-3);
-                ASSERT_EQUAL_TOL(reference[index1].im, result[index2].y, 1e-3);
+                ASSERT_EQUAL_TOL(reference[index1].imag(), result[index2].y, 1e-3);
            }
-    fftpack_destroy(plan);
    // Perform a backward transform and see if we get the original values.

--- a/platforms/reference/include/fftpack.h
+++ b/platforms/reference/include/fftpack.h
-/*
-* This file contains a Fortran to C translation of the 1D transformations
-* based on the original FFTPACK, written by paul n swarztrauber
-* at the national center for atmospheric research and available
-* at www.netlib.org. FFTPACK is in the public domain.
-*
-* Higher-dimension transforms written by Erik Lindahl, 2008-2009.
-* Just as FFTPACK, this file may be redistributed freely, and can be
-* considered to be in the public domain. 
-*
-* Any errors in this (threadsafe, but not threaded) C version
-* are probably due to the f2c translator, or hacks by Erik Lindahl,
-* rather than FFTPACK. If you find a bug, it would be great if you could
-* drop a line to lindahl@cbr.su.se and let me know about it!
-*/
-#ifndef _FFTPACK_H_
-#define _FFTPACK_H_
-#include <stdio.h>
-#include "openmm/internal/windowsExport.h"
-#ifdef __cplusplus
-extern "C" {
-#endif
-#if 0
-} /* fixes auto-indentation problems */
-#endif
-class t_complex {
-public:
-    double re;
-    double im;
-    t_complex() : re(0.0), im(0.0) {
-    }
-    t_complex(double re, double im) : re(re), im(im) {
-    }
-    t_complex(const t_complex& c) : re(c.re), im(c.im) {
-    }
-    t_complex operator*(double r) {
-        return t_complex(re*r, im*r);
-    }
-    t_complex operator+(const t_complex& c) const {
-        return t_complex(re+c.re, im+c.im);
-    }
-    t_complex operator-(const t_complex& c) const {
-        return t_complex(re-c.re, im-c.im);
-    }
-    t_complex& operator+=(const t_complex& c) {
-        re += c.re;
-        im += c.im;
-        return *this;
-    }
-    t_complex& operator-=(const t_complex& c) {
-        re -= c.re;
-        im -= c.im;
-        return *this;
-    }
-};
-/*! \brief Datatype for FFT setup
- *
- *  The fftpack_t type contains all the setup information, e.g. twiddle
- *  factors, necessary to perform an FFT. Internally it is mapped to
- *  whatever FFT library we are using, or the built-in FFTPACK if no fast
- *  external library is available.
- */
-typedef struct fftpack *
-fftpack_t;
-/*! \brief Specifier for FFT direction.
- *
- *  The definition of the 1D forward transform from input x[] to output y[] is
- *  \f[
- *  y_{k} = \sum_{j=0}^{N-1} x_{j} \exp{-i 2 \pi j k /N}
- *  \f]
- *
- *  while the corresponding backward transform is
- *
- *  \f[
- *  y_{k} = \sum_{j=0}^{N-1} x_{j} \exp{i 2 \pi j k /N}
- *  \f]
- *
- *  A forward-backward transform pair will this result in data scaled by N.
- *
- */
-typedef enum fftpack_direction
-{
-    FFTPACK_FORWARD,         /*!< Forward complex-to-complex transform  */
-    FFTPACK_BACKWARD,        /*!< Backward complex-to-complex transform */
-} fftpack_direction;
-/*! \brief Setup a 1-dimensional complex-to-complex transform
- *
- *  \param fft    Pointer to opaque Gromacs FFT datatype
- *  \param nx     Length of transform
- *
- *  \return status - 0 or a standard error message.
- */
-int
-OPENMM_EXPORT
-fftpack_init_1d        (fftpack_t *       fft,
-                        int               nx);
-/*! \brief Setup a 2-dimensional complex-to-complex transform
- *
- *  \param fft    Pointer to opaque Gromacs FFT datatype
- *  \param nx     Length of transform in first dimension
- *  \param ny     Length of transform in second dimension
- *
- *  \return status - 0 or a standard error message.
- *
- */
-int
-OPENMM_EXPORT
-fftpack_init_2d        (fftpack_t *         fft,
-                        int                 nx,
-                        int                 ny);
-/*! \brief Setup a 3-dimensional complex-to-complex transform
- *
- *  \param fft    Pointer to opaque Gromacs FFT datatype
- *  \param nx     Length of transform in first dimension
- *  \param ny     Length of transform in second dimension
- *  \param nz     Length of transform in third dimension
- *
- *  \return status - 0 or a standard error message.
- *
- */
-int
-OPENMM_EXPORT
-fftpack_init_3d        (fftpack_t *         fft,
-                        int                 nx,
-                        int                 ny,
-                        int                 nz);
-/*! \brief Perform a 1-dimensional complex-to-complex transform
- *
- *  Performs an instance of a transform previously initiated.
- *
- *  \param setup     Setup returned from fftpack_init_1d()
- *  \param dir       Forward or Backward
- *  \param in_data   Input grid data.
- *  \param out_data  Output grid data.
- *                   You can provide the same pointer for in_data and out_data
- *                   to perform an in-place transform.
- *
- * \return 0 on success, or an error code.
- *
- * \note Data pointers are declared as void, to avoid casting pointers
- *       depending on your grid type.
- */
-int
-OPENMM_EXPORT
-fftpack_exec_1d          (fftpack_t                  setup,
-                          enum fftpack_direction     dir,
-                          t_complex *                in_data,
-                          t_complex *                out_data);
-/*! \brief Perform a 2-dimensional complex-to-complex transform
- *
- *  Performs an instance of a transform previously initiated.
- *
- *  \param setup     Setup returned from fftpack_init_1d()
- *  \param dir       Forward or Backward
- *  \param in_data   Input grid data.
- *  \param out_data  Output grid data.
- *                   You can provide the same pointer for in_data and out_data
- *                   to perform an in-place transform.
- *
- * \return 0 on success, or an error code.
- *
- * \note Data pointers are declared as void, to avoid casting pointers
- *       depending on your grid type.
- */
-int
-OPENMM_EXPORT
-fftpack_exec_2d          (fftpack_t                  setup,
-                          enum fftpack_direction     dir,
-                          t_complex *                in_data,
-                          t_complex *                out_data);
-/*! \brief Perform a 3-dimensional complex-to-complex transform
- *
- *  Performs an instance of a transform previously initiated.
- *
- *  \param setup     Setup returned from fftpack_init_1d()
- *  \param dir       Forward or Backward
- *  \param in_data   Input grid data.
- *  \param out_data  Output grid data.
- *                   You can provide the same pointer for in_data and out_data
- *                   to perform an in-place transform.
- *
- * \return 0 on success, or an error code.
- *
- * \note Data pointers are declared as void, to avoid casting pointers
- *       depending on your grid type.
- */
-int
-OPENMM_EXPORT
-fftpack_exec_3d          (fftpack_t                  setup,
-                          enum fftpack_direction     dir,
-                          t_complex *                in_data,
-                          t_complex *                out_data);
-/*! \brief Release an FFT setup structure
- *
- *  Destroy setup and release all allocated memory.
- *
- *  \param setup Setup returned from fftpack_init_1d(), or one
- *         of the other initializers.
- *
- */
-void
-OPENMM_EXPORT
-fftpack_destroy          (fftpack_t                 setup);
-#ifdef __cplusplus
-}
-#endif
-#endif /* _FFTPACK_H_ */
--- a/platforms/reference/src/SimTKReference/ReferencePME.cpp
+++ b/platforms/reference/src/SimTKReference/ReferencePME.cpp
 /*
 * Reference implementation of PME reciprocal space interactions.
 *
- * Copyright (c) 2009, Erik Lindahl, Rossen Apostolov, Szilard Pall
+ * Copyright (c) 2009-2022, Erik Lindahl, Rossen Apostolov, Szilard Pall, Peter Eastman
 * All rights reserved.
 * Contact: lindahl@cbr.su.se Stockholm University, Sweden.
 *
@@ -32,12 +32,17 @@
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <complex>
 #include "ReferencePME.h"
-#include "fftpack.h"
 #include "SimTKOpenMMRealType.h"
-using std::vector;
+#ifdef _MSC_VER
+  #define POCKETFFT_NO_VECTORS
+#endif
+#include "pocketfft_hdronly.h"
+using namespace std;
 typedef int    ivec[3];
@@ -48,12 +53,11 @@ struct pme
    int          natoms;
    double       ewaldcoeff;
-    t_complex *  grid;                 /* Memory for the grid we spread charges on.
+    complex<double>* grid;             /* Memory for the grid we spread charges on.
                                        * Element (i,j,k) is accessed as:
                                        * grid[i*ngrid[1]*ngrid[2] + j*ngrid[2] + k]
                                        */
    int          ngrid[3];             /* Total grid dimensions (all data is complex!) */
-    fftpack_t    fftplan;              /* Handle to fourier transform setup  */
    int          order;                /* PME interpolation order. Almost always 4 */
@@ -346,7 +350,7 @@ pme_grid_spread_charge(pme_t pme, const vector<double>& charges)
    /* Reset the grid */
    for (i=0;i<pme->ngrid[0]*pme->ngrid[1]*pme->ngrid[2];i++)
    {
-        pme->grid[i].re = pme->grid[i].im = 0;
+        pme->grid[i] = complex<double>(0, 0);
    }
    for (i=0;i<pme->natoms;i++)
@@ -393,11 +397,11 @@ pme_grid_spread_charge(pme_t pme, const vector<double>& charges)
                for (iz=0;iz<order;iz++)
                {
                    /* Can be optimized, but we keep it simple here */
-                    zindex               = (z0index + iz) % pme->ngrid[2];
+                    zindex            = (z0index + iz) % pme->ngrid[2];
                    /* Calculate index in the charge grid */
-                    index                = xindex*pme->ngrid[1]*pme->ngrid[2] + yindex*pme->ngrid[2] + zindex;
+                    index             = xindex*pme->ngrid[1]*pme->ngrid[2] + yindex*pme->ngrid[2] + zindex;
                    /* Add the charge times the bspline spread/interpolation factors to this grid position */
-                    pme->grid[index].re += q*thetax[ix]*thetay[iy]*thetaz[iz];
+                    pme->grid[index] += q*thetax[ix]*thetay[iy]*thetaz[iz];
                }
            }
        }
@@ -427,7 +431,7 @@ pme_reciprocal_convolution(pme_t     pme,
    double boxfactor;
    double maxkx,maxky,maxkz;
-    t_complex *ptr;
+    complex<double> *ptr;
    nx = pme->ngrid[0];
    ny = pme->ngrid[1];
@@ -486,8 +490,8 @@ pme_reciprocal_convolution(pme_t     pme,
                ptr       = pme->grid + kx*ny*nz + ky*nz + kz;
                /* Get grid data for this frequency */
-                d1        = ptr->re;
+                d1        = ptr->real();
-                d2        = ptr->im;
+                d2        = ptr->imag();
                /* Calculate the convolution - see the Essman/Darden paper for the equation! */
                m2        = mhx*mhx+mhy*mhy+mhz*mhz;
@@ -497,8 +501,8 @@ pme_reciprocal_convolution(pme_t     pme,
                eterm     = one_4pi_eps*exp(-factor*m2)/denom;
                /* write back convolution data to grid */
-                ptr->re   = d1*eterm;
+                ptr->real(d1*eterm);
-                ptr->im   = d2*eterm;
+                ptr->imag(d2*eterm);
                struct2   = (d1*d1+d2*d2);
@@ -532,7 +536,7 @@ dpme_reciprocal_convolution(pme_t pme,
    double boxfactor;
    double maxkx,maxky,maxkz;
-    t_complex *ptr;
+    complex<double> *ptr;
    nx = pme->ngrid[0];
    ny = pme->ngrid[1];
@@ -580,8 +584,8 @@ dpme_reciprocal_convolution(pme_t pme,
                ptr       = pme->grid + kx*ny*nz + ky*nz + kz;
                /* Get grid data for this frequency */
-                d1        = ptr->re;
+                d1        = ptr->real();
-                d2        = ptr->im;
+                d2        = ptr->imag();
                /* Calculate the convolution - see the Essman/Darden paper for the equation! */
                m2        = mhx*mhx+mhy*mhy+mhz*mhz;
@@ -598,8 +602,8 @@ dpme_reciprocal_convolution(pme_t pme,
                eterm     = (fac1*erfcterm*m3 + expterm*(fac2 + fac3*m2)) * denom;
                /* write back convolution data to grid */
-                ptr->re   = d1*eterm;
+                ptr->real(d1*eterm);
-                ptr->im   = d2*eterm;
+                ptr->imag(d2*eterm);
                struct2   = (d1*d1+d2*d2);
@@ -696,7 +700,7 @@ pme_grid_interpolate_force(pme_t pme,
                    /* Get the fft+convoluted+ifft:d data from the grid, which must be real by definition */
                    /* Checking that the imaginary part is indeed zero might be a good check :-) */
-                    gridvalue            = pme->grid[index].re;
+                    gridvalue            = pme->grid[index].real();
                    /* The d component of the force is calculated by taking the derived bspline in dimension d, normal bsplines in the other two */
                    fx                  += dtx*ty*tz*gridvalue;
@@ -745,9 +749,7 @@ pme_init(pme_t *       ppme,
    pme->particleindex    = (ivec *)malloc(sizeof(ivec)*natoms);
    /* Allocate charge grid storage */
-    pme->grid        = (t_complex *)malloc(sizeof(t_complex)*ngrid[0]*ngrid[1]*ngrid[2]);
+    pme->grid        = (complex<double> *)malloc(sizeof(complex<double>)*ngrid[0]*ngrid[1]*ngrid[2]);
-    fftpack_init_3d(&pme->fftplan,ngrid[0],ngrid[1],ngrid[2]);
    /* Setup bspline moduli (see Essman paper) */
    pme_calculate_bsplines_moduli(pme);
@@ -790,13 +792,18 @@ int pme_exec(pme_t       pme,
    pme_grid_spread_charge(pme, charges);
    /* do 3d-fft */
-    fftpack_exec_3d(pme->fftplan,FFTPACK_FORWARD,pme->grid,pme->grid);
+    vector<size_t> shape = {(size_t) pme->ngrid[0], (size_t) pme->ngrid[1], (size_t) pme->ngrid[2]};
+    vector<size_t> axes = {0, 1, 2};
+    vector<ptrdiff_t> stride = {(ptrdiff_t) (pme->ngrid[1]*pme->ngrid[2]*sizeof(complex<double>)),
+                                (ptrdiff_t) (pme->ngrid[2]*sizeof(complex<double>)),
+                                (ptrdiff_t) sizeof(complex<double>)};
+    pocketfft::c2c(shape, stride, stride, axes, true, pme->grid, pme->grid, 1.0, 0);
    /* solve in k-space */
    pme_reciprocal_convolution(pme,periodicBoxVectors,recipBoxVectors,energy);
    /* do 3d-invfft */
-    fftpack_exec_3d(pme->fftplan,FFTPACK_BACKWARD,pme->grid,pme->grid);
+    pocketfft::c2c(shape, stride, stride, axes, false, pme->grid, pme->grid, 1.0, 0);
    /* Get the particle forces from the grid and bsplines in the pme structure */
    pme_grid_interpolate_force(pme,recipBoxVectors,charges,forces);
@@ -834,13 +841,18 @@ int pme_exec_dpme(pme_t       pme,
    pme_grid_spread_charge(pme, c6s);
    /* do 3d-fft */
-    fftpack_exec_3d(pme->fftplan,FFTPACK_FORWARD,pme->grid,pme->grid);
+    vector<size_t> shape = {(size_t) pme->ngrid[0], (size_t) pme->ngrid[1], (size_t) pme->ngrid[2]};
+    vector<size_t> axes = {0, 1, 2};
+    vector<ptrdiff_t> stride = {(ptrdiff_t) (pme->ngrid[1]*pme->ngrid[2]*sizeof(complex<double>)),
+                                (ptrdiff_t) (pme->ngrid[2]*sizeof(complex<double>)),
+                                (ptrdiff_t) sizeof(complex<double>)};
+    pocketfft::c2c(shape, stride, stride, axes, true, pme->grid, pme->grid, 1.0, 0);
    /* solve in k-space */
    dpme_reciprocal_convolution(pme,periodicBoxVectors,recipBoxVectors,energy);
    /* do 3d-invfft */
-    fftpack_exec_3d(pme->fftplan,FFTPACK_BACKWARD,pme->grid,pme->grid);
+    pocketfft::c2c(shape, stride, stride, axes, false, pme->grid, pme->grid, 1.0, 0);
    /* Get the particle forces from the grid and bsplines in the pme structure */
    pme_grid_interpolate_force(pme,recipBoxVectors,c6s,forces);
@@ -866,8 +878,6 @@ pme_destroy(pme_t    pme)
    free(pme->particlefraction);
    free(pme->particleindex);
-    fftpack_destroy(pme->fftplan);
    /* destroy structure itself */
    free(pme);

--- a/platforms/reference/src/SimTKReference/fftpack.cpp
+++ b/platforms/reference/src/SimTKReference/fftpack.cpp
-/*
-* This file contains a Fortran to C translation of the 1D transformations
-* based on the original FFTPACK, written by paul n swarztrauber
-* at the national center for atmospheric research and available
-* at www.netlib.org. FFTPACK is in the public domain.
-*
-* Higher-dimension transforms written by Erik Lindahl, 2008-2009.
-* Just as FFTPACK, this file may be redistributed freely, and can be
-* considered to be in the public domain. 
-*
-* Any errors in this (threadsafe, but not threaded) C version
-* are probably due to the f2c translator, or hacks by Erik Lindahl,
-* rather than FFTPACK. If you find a bug, it would be great if you could
-* drop a line to lindahl@cbr.su.se and let me know about it!
-*/
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-#include <errno.h>
-#include <stdio.h>
-#include "fftpack.h"
-/** Contents of the FFTPACK fft datatype.
- *
- *  FFTPACK only does 1d transforms, so we use a pointers to another fft for
- *  the transform in the next dimension.
- * Thus, a 3d-structure contains a pointer to a 2d one, which in turns contains
- * a pointer to a 1d. The 1d structure has next==NULL.
- */
-struct fftpack
-{
-    int                    ndim;     /**< Dimensions, including our subdimensions.  */
-    int                    n;        /**< Number of points in this dimension.       */
-    int                    ifac[15]; /**< 15 bytes needed for cfft and rfft         */
-    struct fftpack *       next;     /**< Pointer to next dimension, or NULL.       */
-    double *               work;     /**< 1st 4n reserved for cfft, 1st 2n for rfft */
-};
-static void
-fftpack_passf2(int    ido,
-               int    l1,
-               double cc[],
-               double ch[],
-               double wa1[],
-               int    isign)
-{
-    int i, k, ah, ac;
-    double ti2, tr2;
-    if (ido <= 2)
-    {
-        for (k=0; k<l1; k++)
-        {
-            ah = k*ido;
-            ac = 2*k*ido;
-            ch[ah]              = cc[ac]   + cc[ac + ido];
-            ch[ah + ido*l1]     = cc[ac]   - cc[ac + ido];
-            ch[ah+1]            = cc[ac+1] + cc[ac + ido + 1];
-            ch[ah + ido*l1 + 1] = cc[ac+1] - cc[ac + ido + 1];
-        }
-    }
-    else
-    {
-        for (k=0; k<l1; k++)
-        {
-            for (i=0; i<ido-1; i+=2)
-            {
-                ah              = i + k*ido;
-                ac              = i + 2*k*ido;
-                ch[ah]          = cc[ac] + cc[ac + ido];
-                tr2             = cc[ac] - cc[ac + ido];
-                ch[ah+1]        = cc[ac+1] + cc[ac + 1 + ido];
-                ti2             = cc[ac+1] - cc[ac + 1 + ido];
-                ch[ah+l1*ido+1] = wa1[i]*ti2 + isign*wa1[i+1]*tr2;
-                ch[ah+l1*ido]   = wa1[i]*tr2 - isign*wa1[i+1]*ti2;
-            }
-        }
-    }
-}
-static void
-fftpack_passf3(int    ido,
-               int    l1,
-               double cc[],
-               double ch[],
-               double wa1[],
-               double wa2[],
-               int    isign)
-{
-    const double taur = -0.5;
-    const double taui = 0.866025403784439;
-    int i, k, ac, ah;
-    double ci2, ci3, di2, di3, cr2, cr3, dr2, dr3, ti2, tr2;
-    if (ido == 2)
-    {
-        for (k=1; k<=l1; k++)
-        {
-            ac = (3*k - 2)*ido;
-            tr2 = cc[ac] + cc[ac + ido];
-            cr2 = cc[ac - ido] + taur*tr2;
-            ah = (k - 1)*ido;
-            ch[ah] = cc[ac - ido] + tr2;
-            ti2 = cc[ac + 1] + cc[ac + ido + 1];
-            ci2 = cc[ac - ido + 1] + taur*ti2;
-            ch[ah + 1] = cc[ac - ido + 1] + ti2;
-            cr3 = isign*taui*(cc[ac] - cc[ac + ido]);
-            ci3 = isign*taui*(cc[ac + 1] - cc[ac + ido + 1]);
-            ch[ah + l1*ido] = cr2 - ci3;
-            ch[ah + 2*l1*ido] = cr2 + ci3;
-            ch[ah + l1*ido + 1] = ci2 + cr3;
-            ch[ah + 2*l1*ido + 1] = ci2 - cr3;
-        }
-    }
-    else
-    {
-        for (k=1; k<=l1; k++)
-        {
-            for (i=0; i<ido-1; i+=2)
-            {
-                ac = i + (3*k - 2)*ido;
-                tr2 = cc[ac] + cc[ac + ido];
-                cr2 = cc[ac - ido] + taur*tr2;
-                ah = i + (k-1)*ido;
-                ch[ah] = cc[ac - ido] + tr2;
-                ti2 = cc[ac + 1] + cc[ac + ido + 1];
-                ci2 = cc[ac - ido + 1] + taur*ti2;
-                ch[ah + 1] = cc[ac - ido + 1] + ti2;
-                cr3 = isign*taui*(cc[ac] - cc[ac + ido]);
-                ci3 = isign*taui*(cc[ac + 1] - cc[ac + ido + 1]);
-                dr2 = cr2 - ci3;
-                dr3 = cr2 + ci3;
-                di2 = ci2 + cr3;
-                di3 = ci2 - cr3;
-                ch[ah + l1*ido + 1] = wa1[i]*di2 + isign*wa1[i+1]*dr2;
-                ch[ah + l1*ido] = wa1[i]*dr2 - isign*wa1[i+1]*di2;
-                ch[ah + 2*l1*ido + 1] = wa2[i]*di3 + isign*wa2[i+1]*dr3;
-                ch[ah + 2*l1*ido] = wa2[i]*dr3 - isign*wa2[i+1]*di3;
-            }
-        }
-    }
-}
-static void
-fftpack_passf4(int    ido,
-               int    l1,
-               double cc[],
-               double ch[],
-               double wa1[],
-               double wa2[],
-               double wa3[],
-               int    isign)
-{
-    int i, k, ac, ah;
-    double ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
-    if (ido == 2)
-    {
-        for (k=0; k<l1; k++)
-        {
-            ac = 4*k*ido + 1;
-            ti1 = cc[ac] - cc[ac + 2*ido];
-            ti2 = cc[ac] + cc[ac + 2*ido];
-            tr4 = cc[ac + 3*ido] - cc[ac + ido];
-            ti3 = cc[ac + ido] + cc[ac + 3*ido];
-            tr1 = cc[ac - 1] - cc[ac + 2*ido - 1];
-            tr2 = cc[ac - 1] + cc[ac + 2*ido - 1];
-            ti4 = cc[ac + ido - 1] - cc[ac + 3*ido - 1];
-            tr3 = cc[ac + ido - 1] + cc[ac + 3*ido - 1];
-            ah = k*ido;
-            ch[ah] = tr2 + tr3;
-            ch[ah + 2*l1*ido] = tr2 - tr3;
-            ch[ah + 1] = ti2 + ti3;
-            ch[ah + 2*l1*ido + 1] = ti2 - ti3;
-            ch[ah + l1*ido] = tr1 + isign*tr4;
-            ch[ah + 3*l1*ido] = tr1 - isign*tr4;
-            ch[ah + l1*ido + 1] = ti1 + isign*ti4;
-            ch[ah + 3*l1*ido + 1] = ti1 - isign*ti4;
-        }
-    }
-    else
-    {
-        for (k=0; k<l1; k++)
-        {
-            for (i=0; i<ido-1; i+=2)
-            {
-                ac = i + 1 + 4*k*ido;
-                ti1 = cc[ac] - cc[ac + 2*ido];
-                ti2 = cc[ac] + cc[ac + 2*ido];
-                ti3 = cc[ac + ido] + cc[ac + 3*ido];
-                tr4 = cc[ac + 3*ido] - cc[ac + ido];
-                tr1 = cc[ac - 1] - cc[ac + 2*ido - 1];
-                tr2 = cc[ac - 1] + cc[ac + 2*ido - 1];
-                ti4 = cc[ac + ido - 1] - cc[ac + 3*ido - 1];
-                tr3 = cc[ac + ido - 1] + cc[ac + 3*ido - 1];
-                ah = i + k*ido;
-                ch[ah] = tr2 + tr3;
-                cr3 = tr2 - tr3;
-                ch[ah + 1] = ti2 + ti3;
-                ci3 = ti2 - ti3;
-                cr2 = tr1 + isign*tr4;
-                cr4 = tr1 - isign*tr4;
-                ci2 = ti1 + isign*ti4;
-                ci4 = ti1 - isign*ti4;
-                ch[ah + l1*ido] = wa1[i]*cr2 - isign*wa1[i + 1]*ci2;
-                ch[ah + l1*ido + 1] = wa1[i]*ci2 + isign*wa1[i + 1]*cr2;
-                ch[ah + 2*l1*ido] = wa2[i]*cr3 - isign*wa2[i + 1]*ci3;
-                ch[ah + 2*l1*ido + 1] = wa2[i]*ci3 + isign*wa2[i + 1]*cr3;
-                ch[ah + 3*l1*ido] = wa3[i]*cr4 -isign*wa3[i + 1]*ci4;
-                ch[ah + 3*l1*ido + 1] = wa3[i]*ci4 + isign*wa3[i + 1]*cr4;
-            }
-        }
-    }
-}
-static void
-fftpack_passf5(int    ido,
-               int    l1,
-               double cc[],
-               double ch[],
-               double wa1[],
-               double wa2[],
-               double wa3[],
-               double wa4[],
-               int    isign)
-{
-    const double tr11 = 0.309016994374947;
-    const double ti11 = 0.951056516295154;
-    const double tr12 = -0.809016994374947;
-    const double ti12 = 0.587785252292473;
-    int i, k, ac, ah;
-    double ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
-        ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
-    if (ido == 2)
-    {
-        for (k = 1; k <= l1; ++k)
-        {
-            ac = (5*k - 4)*ido + 1;
-            ti5 = cc[ac] - cc[ac + 3*ido];
-            ti2 = cc[ac] + cc[ac + 3*ido];
-            ti4 = cc[ac + ido] - cc[ac + 2*ido];
-            ti3 = cc[ac + ido] + cc[ac + 2*ido];
-            tr5 = cc[ac - 1] - cc[ac + 3*ido - 1];
-            tr2 = cc[ac - 1] + cc[ac + 3*ido - 1];
-            tr4 = cc[ac + ido - 1] - cc[ac + 2*ido - 1];
-            tr3 = cc[ac + ido - 1] + cc[ac + 2*ido - 1];
-            ah = (k - 1)*ido;
-            ch[ah] = cc[ac - ido - 1] + tr2 + tr3;
-            ch[ah + 1] = cc[ac - ido] + ti2 + ti3;
-            cr2 = cc[ac - ido - 1] + tr11*tr2 + tr12*tr3;
-            ci2 = cc[ac - ido] + tr11*ti2 + tr12*ti3;
-            cr3 = cc[ac - ido - 1] + tr12*tr2 + tr11*tr3;
-            ci3 = cc[ac - ido] + tr12*ti2 + tr11*ti3;
-            cr5 = isign*(ti11*tr5 + ti12*tr4);
-            ci5 = isign*(ti11*ti5 + ti12*ti4);
-            cr4 = isign*(ti12*tr5 - ti11*tr4);
-            ci4 = isign*(ti12*ti5 - ti11*ti4);
-            ch[ah + l1*ido] = cr2 - ci5;
-            ch[ah + 4*l1*ido] = cr2 + ci5;
-            ch[ah + l1*ido + 1] = ci2 + cr5;
-            ch[ah + 2*l1*ido + 1] = ci3 + cr4;
-            ch[ah + 2*l1*ido] = cr3 - ci4;
-            ch[ah + 3*l1*ido] = cr3 + ci4;
-            ch[ah + 3*l1*ido + 1] = ci3 - cr4;
-            ch[ah + 4*l1*ido + 1] = ci2 - cr5;
-        }
-    }
-    else
-    {
-        for (k=1; k<=l1; k++)
-        {
-            for (i=0; i<ido-1; i+=2)
-            {
-                ac = i + 1 + (k*5 - 4)*ido;
-                ti5 = cc[ac] - cc[ac + 3*ido];
-                ti2 = cc[ac] + cc[ac + 3*ido];
-                ti4 = cc[ac + ido] - cc[ac + 2*ido];
-                ti3 = cc[ac + ido] + cc[ac + 2*ido];
-                tr5 = cc[ac - 1] - cc[ac + 3*ido - 1];
-                tr2 = cc[ac - 1] + cc[ac + 3*ido - 1];
-                tr4 = cc[ac + ido - 1] - cc[ac + 2*ido - 1];
-                tr3 = cc[ac + ido - 1] + cc[ac + 2*ido - 1];
-                ah = i + (k - 1)*ido;
-                ch[ah] = cc[ac - ido - 1] + tr2 + tr3;
-                ch[ah + 1] = cc[ac - ido] + ti2 + ti3;
-                cr2 = cc[ac - ido - 1] + tr11*tr2 + tr12*tr3;
-                ci2 = cc[ac - ido] + tr11*ti2 + tr12*ti3;
-                cr3 = cc[ac - ido - 1] + tr12*tr2 + tr11*tr3;
-                ci3 = cc[ac - ido] + tr12*ti2 + tr11*ti3;
-                cr5 = isign*(ti11*tr5 + ti12*tr4);
-                ci5 = isign*(ti11*ti5 + ti12*ti4);
-                cr4 = isign*(ti12*tr5 - ti11*tr4);
-                ci4 = isign*(ti12*ti5 - ti11*ti4);
-                dr3 = cr3 - ci4;
-                dr4 = cr3 + ci4;
-                di3 = ci3 + cr4;
-                di4 = ci3 - cr4;
-                dr5 = cr2 + ci5;
-                dr2 = cr2 - ci5;
-                di5 = ci2 - cr5;
-                di2 = ci2 + cr5;
-                ch[ah + l1*ido] = wa1[i]*dr2 - isign*wa1[i+1]*di2;
-                ch[ah + l1*ido + 1] = wa1[i]*di2 + isign*wa1[i+1]*dr2;
-                ch[ah + 2*l1*ido] = wa2[i]*dr3 - isign*wa2[i+1]*di3;
-                ch[ah + 2*l1*ido + 1] = wa2[i]*di3 + isign*wa2[i+1]*dr3;
-                ch[ah + 3*l1*ido] = wa3[i]*dr4 - isign*wa3[i+1]*di4;
-                ch[ah + 3*l1*ido + 1] = wa3[i]*di4 + isign*wa3[i+1]*dr4;
-                ch[ah + 4*l1*ido] = wa4[i]*dr5 - isign*wa4[i+1]*di5;
-                ch[ah + 4*l1*ido + 1] = wa4[i]*di5 + isign*wa4[i+1]*dr5;
-            }
-        }
-    }
-}
-static void
-fftpack_passf(int*   nac,
-              int    ido,
-              int    ip,
-              int    l1,
-              int    idl1,
-              double cc[],
-              double ch[],
-              double wa[],
-              int    isign)
-{
-    int idij, idlj, idot, ipph, i, j, k, l, jc, lc, ik, nt, idj, idl, inc,idp;
-    double wai, war;
-    idot = ido / 2;
-    nt = ip*idl1;
-    ipph = (ip + 1) / 2;
-    idp = ip*ido;
-    if (ido >= l1)
-    {
-        for (j=1; j<ipph; j++)
-        {
-            jc = ip - j;
-            for (k=0; k<l1; k++)
-            {
-                for (i=0; i<ido; i++)
-                {
-                    ch[i + (k + j*l1)*ido]  = cc[i + (j + k*ip)*ido] + cc[i + (jc + k*ip)*ido];
-                    ch[i + (k + jc*l1)*ido] = cc[i + (j + k*ip)*ido] - cc[i + (jc + k*ip)*ido];
-                }
-            }
-        }
-        for (k=0; k<l1; k++)
-            for (i=0; i<ido; i++)
-                ch[i + k*ido] = cc[i + k*ip*ido];
-    }
-    else
-    {
-        for (j=1; j<ipph; j++)
-        {
-            jc = ip - j;
-            for (i=0; i<ido; i++)
-            {
-                for (k=0; k<l1; k++)
-                {
-                    ch[i + (k + j*l1)*ido] =  cc[i + (j + k*ip)*ido] + cc[i + (jc + k*ip)*ido];
-                    ch[i + (k + jc*l1)*ido] = cc[i + (j + k*ip)*ido] - cc[i + (jc + k*ip)*ido];
-                }
-            }
-        }
-        for (i=0; i<ido; i++)
-            for (k=0; k<l1; k++)
-                ch[i + k*ido] = cc[i + k*ip*ido];
-    }
-    idl = 2 - ido;
-    inc = 0;
-    for (l=1; l<ipph; l++)
-    {
-        lc = ip - l;
-        idl += ido;
-        for (ik=0; ik<idl1; ik++)
-        {
-            cc[ik + l*idl1] = ch[ik] + wa[idl - 2]*ch[ik + idl1];
-            cc[ik + lc*idl1] = isign*wa[idl-1]*ch[ik + (ip-1)*idl1];
-        }
-        idlj = idl;
-        inc += ido;
-        for (j=2; j<ipph; j++)
-        {
-            jc = ip - j;
-            idlj += inc;
-            if (idlj > idp) idlj -= idp;
-            war = wa[idlj - 2];
-            wai = wa[idlj-1];
-            for (ik=0; ik<idl1; ik++)
-            {
-                cc[ik + l*idl1] += war*ch[ik + j*idl1];
-                cc[ik + lc*idl1] += isign*wai*ch[ik + jc*idl1];
-            }
-        }
-    }
-    for (j=1; j<ipph; j++)
-        for (ik=0; ik<idl1; ik++)
-            ch[ik] += ch[ik + j*idl1];
-    for (j=1; j<ipph; j++)
-    {
-        jc = ip - j;
-        for (ik=1; ik<idl1; ik+=2)
-        {
-            ch[ik - 1 + j*idl1] = cc[ik - 1 + j*idl1] - cc[ik + jc*idl1];
-            ch[ik - 1 + jc*idl1] = cc[ik - 1 + j*idl1] + cc[ik + jc*idl1];
-            ch[ik + j*idl1] = cc[ik + j*idl1] + cc[ik - 1 + jc*idl1];
-            ch[ik + jc*idl1] = cc[ik + j*idl1] - cc[ik - 1 + jc*idl1];
-        }
-    }
-    *nac = 1;
-    if (ido == 2)
-        return;
-    *nac = 0;
-    for (ik=0; ik<idl1; ik++)
-    {
-        cc[ik] = ch[ik];
-    }
-    for (j=1; j<ip; j++)
-    {
-        for (k=0; k<l1; k++)
-        {
-            cc[(k + j*l1)*ido + 0] = ch[(k + j*l1)*ido + 0];
-            cc[(k + j*l1)*ido + 1] = ch[(k + j*l1)*ido + 1];
-        }
-    }
-    if (idot <= l1)
-    {
-        idij = 0;
-        for (j=1; j<ip; j++)
-        {
-            idij += 2;
-            for (i=3; i<ido; i+=2)
-            {
-                idij += 2;
-                for (k=0; k<l1; k++)
-                {
-                    cc[i - 1 + (k + j*l1)*ido] =
-                    wa[idij - 2]*ch[i - 1 + (k + j*l1)*ido] -
-                    isign*wa[idij-1]*ch[i + (k + j*l1)*ido];
-                    cc[i + (k + j*l1)*ido] =
-                        wa[idij - 2]*ch[i + (k + j*l1)*ido] +
-                        isign*wa[idij-1]*ch[i - 1 + (k + j*l1)*ido];
-                }
-            }
-        }
-    }
-    else
-    {
-        idj = 2 - ido;
-        for (j=1; j<ip; j++)
-        {
-            idj += ido;
-            for (k = 0; k < l1; k++)
-            {
-                idij = idj;
-                for (i=3; i<ido; i+=2)
-                {
-                    idij += 2;
-                    cc[i - 1 + (k + j*l1)*ido] =
-                        wa[idij - 2]*ch[i - 1 + (k + j*l1)*ido] -
-                        isign*wa[idij-1]*ch[i + (k + j*l1)*ido];
-                    cc[i + (k + j*l1)*ido] =
-                        wa[idij - 2]*ch[i + (k + j*l1)*ido] +
-                        isign*wa[idij-1]*ch[i - 1 + (k + j*l1)*ido];
-                }
-            }
-        }
-    }
-}
-static void
-fftpack_cfftf1(int    n,
-               double c[],
-               double ch[],
-               double wa[],
-               int    ifac[15],
-               int    isign)
-{
-    int idot, i;
-    int k1, l1, l2;
-    int na, nf, ip, iw, ix2, ix3, ix4, nac, ido, idl1;
-    double *cinput, *coutput;
-    nf = ifac[1];
-    na = 0;
-    l1 = 1;
-    iw = 0;
-    for (k1=2; k1<=nf+1; k1++)
-    {
-        ip = ifac[k1];
-        l2 = ip*l1;
-        ido = n / l2;
-        idot = ido + ido;
-        idl1 = idot*l1;
-        if (na)
-        {
-            cinput = ch;
-            coutput = c;
-        }
-        else
-        {
-            cinput = c;
-            coutput = ch;
-        }
-        switch (ip)
-        {
-            case 4:
-                ix2 = iw + idot;
-                ix3 = ix2 + idot;
-                fftpack_passf4(idot, l1, cinput, coutput, &wa[iw], &wa[ix2], &wa[ix3], isign);
-                na = !na;
-                break;
-            case 2:
-                fftpack_passf2(idot, l1, cinput, coutput, &wa[iw], isign);
-                na = !na;
-                break;
-            case 3:
-                ix2 = iw + idot;
-                fftpack_passf3(idot, l1, cinput, coutput, &wa[iw], &wa[ix2], isign);
-                na = !na;
-                break;
-            case 5:
-                ix2 = iw + idot;
-                ix3 = ix2 + idot;
-                ix4 = ix3 + idot;
-                fftpack_passf5(idot, l1, cinput, coutput, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
-                na = !na;
-                break;
-            default:
-                fftpack_passf(&nac, idot, ip, l1, idl1, cinput, coutput, &wa[iw], isign);
-                if (nac != 0) na = !na;
-        }
-        l1 = l2;
-        iw += (ip - 1)*idot;
-    }
-    if (na == 0)
-        return;
-    for (i=0; i<2*n; i++)
-        c[i] = ch[i];
-}
-static void
-fftpack_factorize(int    n,
-                  int    ifac[15])
-{
-    static const int ntryh[4] = { 3,4,2,5 };
-    int ntry=3, i, j=0, ib, nf=0, nl=n, nq, nr;
-startloop:
-    if (j < 4)
-        ntry = ntryh[j];
-    else
-        ntry+= 2;
-    j++;
-    do
-    {
-        nq = nl / ntry;
-        nr = nl - ntry*nq;
-        if (nr != 0) goto startloop;
-        nf++;
-        ifac[nf + 1] = ntry;
-        nl = nq;
-        if (ntry == 2 && nf != 1)
-        {
-            for (i=2; i<=nf; i++)
-            {
-                ib = nf - i + 2;
-                ifac[ib + 1] = ifac[ib];
-            }
-            ifac[2] = 2;
-        }
-    }
-    while (nl != 1);
-    ifac[0] = n;
-    ifac[1] = nf;
-}
-static void
-fftpack_cffti1(int    n,
-               double wa[],
-               int    ifac[15])
-{
-    const double twopi = 6.28318530717959;
-    double arg, argh, argld, fi;
-    int idot, i, j;
-    int i1, k1, l1, l2;
-    int ld, ii, nf, ip;
-    int ido, ipm;
-    fftpack_factorize(n,ifac);
-    nf = ifac[1];
-    argh = twopi/n;
-    i = 1;
-    l1 = 1;
-    for (k1=1; k1<=nf; k1++)
-    {
-        ip = ifac[k1+1];
-        ld = 0;
-        l2 = l1*ip;
-        ido = n / l2;
-        idot = ido + ido + 2;
-        ipm = ip - 1;
-        for (j=1; j<=ipm; j++)
-        {
-            i1 = i;
-            wa[i-1] = 1;
-            wa[i] = 0;
-            ld += l1;
-            fi = 0;
-            argld = ld*argh;
-            for (ii=4; ii<=idot; ii+=2)
-            {
-                i+= 2;
-                fi+= 1;
-                arg = fi*argld;
-                wa[i-1] = cos(arg);
-                wa[i] = sin(arg);
-            }
-            if (ip > 5)
-            {
-                wa[i1-1] = wa[i-1];
-                wa[i1] = wa[i];
-            }
-        }
-        l1 = l2;
-    }
-}
-static int
-fftpack_transpose_2d(t_complex *          in_data,
-                     t_complex *          out_data,
-                     int                  nx,
-                     int                  ny)
-{
-    t_complex *  src;
-    int          i,j;
-    if (nx<2 || ny<2)
-    {
-        if (in_data != out_data)
-        {
-            memcpy(out_data,in_data,sizeof(t_complex)*nx*ny);
-        }
-        return 0;
-    }
-    if (in_data == out_data)
-    {
-        src = (t_complex *)malloc(sizeof(t_complex)*nx*ny);
-        memcpy(src,in_data,sizeof(t_complex)*nx*ny);
-    }
-    else
-    {
-        src = in_data;
-    }
-    for (i=0;i<nx;i++)
-    {
-        for (j=0;j<ny;j++)
-        {
-            out_data[j*nx+i].re = src[i*ny+j].re;
-            out_data[j*nx+i].im = src[i*ny+j].im;
-        }
-    }
-    if (src != in_data)
-    {
-        free(src);
-    }
-    return 0;
-}
-static int
-fftpack_transpose_2d_nelem(t_complex *          in_data,
-                           t_complex *          out_data,
-                           int                  nx,
-                           int                  ny,
-                           int                  nelem)
-{
-    t_complex *   src;
-    int           ncopy;
-    int           i,j;
-    ncopy = nelem*sizeof(t_complex);
-    if (nx<2 || ny<2)
-    {
-        if (in_data != out_data)
-        {
-            memcpy(out_data,in_data,nx*ny*ncopy);
-        }
-        return 0;
-    }
-    if (in_data == out_data)
-    {
-        src = (t_complex *)malloc(nx*ny*ncopy);
-        memcpy(src,in_data,nx*ny*ncopy);
-    }
-    else
-    {
-        src = in_data;
-    }
-    for (i=0;i<nx;i++)
-    {
-        for (j=0;j<ny;j++)
-        {
-            memcpy(out_data + (j*nx+i)*nelem , src + (i*ny+j)*nelem , ncopy);
-        }
-    }
-    if (src != in_data)
-    {
-        free(src);
-    }
-    return 0;
-}
-int
-fftpack_init_1d(fftpack_t *        pfft,
-                int                nx)
-{
-    fftpack_t    fft;
-    if (pfft==NULL)
-    {
-        fprintf(stderr,"Fatal error - Invalid FFT opaque type pointer.");
-        return EINVAL;
-    }
-    *pfft = NULL;
-    if ((fft = (struct fftpack *)malloc(sizeof(struct fftpack))) == NULL)
-    {
-        return ENOMEM;
-    }
-    fft->next = NULL;
-    fft->n    = nx;
-    /* Need 4*n storage for 1D complex FFT */
-    if ((fft->work = (double *)malloc(sizeof(double)*(4*nx))) == NULL)
-    {
-        free(fft);
-        return ENOMEM;
-    }
-    if (fft->n>1)
-        fftpack_cffti1(nx,fft->work,fft->ifac);
-    *pfft = fft;
-    return 0;
-};
-int
-fftpack_init_2d(fftpack_t *        pfft,
-                int                nx,
-                int                ny)
-{
-    fftpack_t     fft;
-    int           rc;
-    if (pfft==NULL)
-    {
-        fprintf(stderr,"Fatal error - Invalid FFT opaque type pointer.");
-        return EINVAL;
-    }
-    *pfft = NULL;
-    /* Create the X transform */
-    if ((rc = fftpack_init_1d(&fft,nx)) != 0)
-    {
-        return rc;
-    }
-    /* Create Y transform as a link from X */
-    if ((rc=fftpack_init_1d(&(fft->next),ny)) != 0)
-    {
-        free(fft);
-        return rc;
-    }
-    *pfft = fft;
-    return 0;
-};
-int
-fftpack_init_3d(fftpack_t *        pfft,
-                int                nx,
-                int                ny,
-                int                nz)
-{
-    fftpack_t     fft;
-    int           rc;
-    if (pfft==NULL)
-    {
-        fprintf(stderr,"Fatal error - Invalid FFT opaque type pointer.");
-        return EINVAL;
-    }
-    *pfft = NULL;
-    /* Create the X transform */
-    if ((fft = (struct fftpack *)malloc(sizeof(struct fftpack))) == NULL)
-    {
-        return ENOMEM;
-    }
-    fft->n    = nx;
-    /* Need 4*nx storage for 1D complex FFT.
-     */
-    if ((fft->work = (double *)malloc(sizeof(double)*(4*nx))) == NULL)
-    {
-        free(fft);
-        return ENOMEM;
-    }
-    fftpack_cffti1(nx,fft->work,fft->ifac);
-    /* Create 2D Y/Z transforms as a link from X */
-    if ((rc=fftpack_init_2d(&(fft->next),ny,nz)) != 0)
-    {
-        free(fft);
-        return rc;
-    }
-    *pfft = fft;
-    return 0;
-};
-int
-fftpack_exec_1d          (fftpack_t                  fft,
-                          enum fftpack_direction     dir,
-                          t_complex *                in_data,
-                          t_complex *                out_data)
-{
-    int i, n;
-    double* p1;
-    double* p2;
-    n=fft->n;
-    if (n==1)
-    {
-        p1 = (double *)in_data;
-        p2 = (double *)out_data;
-        p2[0] = p1[0];
-        p2[1] = p1[1];
-    }
-    /* FFTPACK only does in-place transforms, so emulate out-of-place
-     * by copying data to the output array first.
-     */
-    if (in_data != out_data)
-    {
-        p1 = (double *)in_data;
-        p2 = (double *)out_data;
-        /* n complex = 2*n double elements */
-        for (i=0;i<2*n;i++)
-        {
-            p2[i] = p1[i];
-        }
-    }
-    /* Elements 0   .. 2*n-1 in work are used for ffac values,
-     * Elements 2*n .. 4*n-1 are internal FFTPACK work space.
-     */
-    if (dir == FFTPACK_FORWARD)
-    {
-        fftpack_cfftf1(n,(double *)out_data,fft->work+2*n,fft->work,fft->ifac, -1);
-    }
-    else if (dir == FFTPACK_BACKWARD)
-    {
-        fftpack_cfftf1(n,(double *)out_data,fft->work+2*n,fft->work,fft->ifac, 1);
-    }
-    else
-    {
-        fprintf(stderr,"FFT plan mismatch - bad plan or direction.");
-        return EINVAL;
-    }
-    return 0;
-}
-int
-fftpack_exec_2d          (fftpack_t                  fft,
-                          enum fftpack_direction     dir,
-                          t_complex *                in_data,
-                          t_complex *                out_data)
-{
-    int                i,nx,ny;
-    t_complex *    data;
-    nx = fft->n;
-    ny = fft->next->n;
-    /* FFTPACK only does in-place transforms, so emulate out-of-place
-     * by copying data to the output array first.
-     * For 2D there is likely enough data to benefit from memcpy().
-     */
-    if (in_data != out_data)
-    {
-        memcpy(out_data,in_data,sizeof(t_complex)*nx*ny);
-    }
-    /* Much easier to do pointer arithmetic when base has the correct type */
-    data = (t_complex *)out_data;
-    /* y transforms */
-    for (i=0;i<nx;i++)
-    {
-        fftpack_exec_1d(fft->next,dir,data+i*ny,data+i*ny);
-    }
-    /* Transpose in-place to get data in place for x transform now */
-    fftpack_transpose_2d(data,data,nx,ny);
-    /* x transforms */
-    for (i=0;i<ny;i++)
-    {
-        fftpack_exec_1d(fft,dir,data+i*nx,data+i*nx);
-    }
-    /* Transpose in-place to get data back in original order */
-    fftpack_transpose_2d(data,data,ny,nx);
-    return 0;
-}
-int
-fftpack_exec_3d     (fftpack_t                  fft,
-                     enum fftpack_direction     dir,
-                     t_complex *                in_data,
-                     t_complex *                out_data)
-{
-    int              i,nx,ny,nz,rc;
-    t_complex *      data;
-    nx=fft->n;
-    ny=fft->next->n;
-    nz=fft->next->next->n;
-    /* FFTPACK only does in-place transforms, so emulate out-of-place
-     * by copying data to the output array first.
-     * For 3D there is likely enough data to benefit from memcpy().
-     */
-    if (in_data != out_data)
-    {
-        memcpy(out_data,in_data,sizeof(t_complex)*nx*ny*nz);
-    }
-    /* Much easier to do pointer arithmetic when base has the correct type */
-    data = (t_complex *)out_data;
-    /* Perform z transforms */
-    for (i=0;i<nx*ny;i++)
-        fftpack_exec_1d(fft->next->next,dir,data+i*nz,data+i*nz);
-    /* For each X slice, transpose the y & z dimensions inside the slice */
-    for (i=0;i<nx;i++)
-    {
-        fftpack_transpose_2d(data+i*ny*nz,data+i*ny*nz,ny,nz);
-    }
-    /* Array is now (nx,nz,ny) - perform y transforms */
-    for (i=0;i<nx*nz;i++)
-    {
-        fftpack_exec_1d(fft->next,dir,data+i*ny,data+i*ny);
-    }
-    /* Transpose back to (nx,ny,nz) */
-    for (i=0;i<nx;i++)
-    {
-        fftpack_transpose_2d(data+i*ny*nz,data+i*ny*nz,nz,ny);
-    }
-    /* Transpose entire x & y slices to go from
-     * (nx,ny,nz) to (ny,nx,nz).
-     */
-    rc=fftpack_transpose_2d_nelem(data,data,nx,ny,nz);
-    if (rc != 0)
-    {
-        fprintf(stderr,"Fatal error - cannot transpose X & Y/Z in fftpack_exec_3d().");
-        return rc;
-    }
-    /* Then go from (ny,nx,nz) to (ny,nz,nx) */
-    for (i=0;i<ny;i++)
-    {
-        fftpack_transpose_2d(data+i*nx*nz,data+i*nx*nz,nx,nz);
-    }
-    /* Perform x transforms */
-    for (i=0;i<ny*nz;i++)
-    {
-        fftpack_exec_1d(fft,dir,data+i*nx,data+i*nx);
-    }
-    /* Transpose back from (ny,nz,nx) to (ny,nx,nz) */
-    for (i=0;i<ny;i++)
-    {
-        fftpack_transpose_2d(data+i*nz*nx,data+i*nz*nx,nz,nx);
-    }
-    /* Transpose from (ny,nx,nz) to (nx,ny,nz).
-     */
-    rc = fftpack_transpose_2d_nelem(data,data,ny,nx,nz);
-    if (rc != 0)
-    {
-        fprintf(stderr,"Fatal error - cannot transpose Y/Z & X in fftpack_exec_3d().");
-        return rc;
-    }
-    return 0;
-}
-void
-fftpack_destroy(fftpack_t      fft)
-{
-    if (fft != NULL)
-    {
-        free(fft->work);
-        if (fft->next != NULL)
-            fftpack_destroy(fft->next);
-        free(fft);
-    }
-}
--- a/plugins/amoeba/platforms/reference/src/SimTKReference/AmoebaReferenceHippoNonbondedForce.cpp
+++ b/plugins/amoeba/platforms/reference/src/SimTKReference/AmoebaReferenceHippoNonbondedForce.cpp
-/* Portions copyright (c) 2006-2019 Stanford University and Simbios.
+/* Portions copyright (c) 2006-2022 Stanford University and Simbios.
 * Contributors: Pande Group
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -30,6 +30,10 @@
 #include "SimTKOpenMMRealType.h"
 #include "jama_svd.h"
 #include <algorithm>
+#ifdef _MSC_VER
+  #define POCKETFFT_NO_VECTORS
+#endif
+#include "pocketfft_hdronly.h"
 // In case we're using some primitive version of Visual Studio this will
 // make sure that erf() and erfc() are defined.
@@ -1396,7 +1400,6 @@ const double AmoebaReferencePmeHippoNonbondedForce::SQRT_PI = sqrt(M_PI);
 AmoebaReferencePmeHippoNonbondedForce::AmoebaReferencePmeHippoNonbondedForce(const HippoNonbondedForce& force, const System& system) :
               AmoebaReferenceHippoNonbondedForce(force) {
-    _fftplan = NULL;
    force.getPMEParameters(_alphaEwald, _pmeGridDimensions[0], _pmeGridDimensions[1], _pmeGridDimensions[2]);
    force.getDPMEParameters(_dalphaEwald, _dpmeGridDimensions[0], _dpmeGridDimensions[1], _dpmeGridDimensions[2]);
    if (_alphaEwald == 0.0 || _dalphaEwald == 0.0) {
@@ -1408,15 +1411,9 @@ AmoebaReferencePmeHippoNonbondedForce::AmoebaReferencePmeHippoNonbondedForce(con
        if (_dalphaEwald == 0.0)
            NonbondedForceImpl::calcPMEParameters(system, nb, _dalphaEwald, _dpmeGridDimensions[0], _dpmeGridDimensions[1], _dpmeGridDimensions[2], true);
    }
-    fftpack_init_3d(&_fftplan, _pmeGridDimensions[0], _pmeGridDimensions[1], _pmeGridDimensions[2]);
    initializeBSplineModuli();
 }
-AmoebaReferencePmeHippoNonbondedForce::~AmoebaReferencePmeHippoNonbondedForce() {
-    if (_fftplan != NULL)
-        fftpack_destroy(_fftplan);
-};
 double AmoebaReferencePmeHippoNonbondedForce::getCutoffDistance() const {
     return _cutoffDistance;
 };
@@ -1451,11 +1448,6 @@ void AmoebaReferencePmeHippoNonbondedForce::setPmeGridDimensions(vector<int>& pm
        (pmeGridDimensions[2] == _pmeGridDimensions[2]))
        return;
-    if (_fftplan) {
-        fftpack_destroy(_fftplan);
-    }
-    fftpack_init_3d(&_fftplan,pmeGridDimensions[0], pmeGridDimensions[1], pmeGridDimensions[2]);
    _pmeGridDimensions[0] = pmeGridDimensions[0];
    _pmeGridDimensions[1] = pmeGridDimensions[1];
    _pmeGridDimensions[2] = pmeGridDimensions[2];
@@ -1469,11 +1461,6 @@ void AmoebaReferencePmeHippoNonbondedForce::setDispersionPmeGridDimensions(vecto
        (pmeGridDimensions[2] == _dpmeGridDimensions[2]))
        return;
-    if (_fftplan) {
-        fftpack_destroy(_fftplan);
-    }
-    fftpack_init_3d(&_fftplan,pmeGridDimensions[0], pmeGridDimensions[1], pmeGridDimensions[2]);
    _dpmeGridDimensions[0] = pmeGridDimensions[0];
    _dpmeGridDimensions[1] = pmeGridDimensions[1];
    _dpmeGridDimensions[2] = pmeGridDimensions[2];
@@ -1516,7 +1503,7 @@ void AmoebaReferencePmeHippoNonbondedForce::resizePmeArrays() {
 void AmoebaReferencePmeHippoNonbondedForce::initializePmeGrid() {
    for (int jj = 0; jj < _pmeGrid.size(); jj++)
-        _pmeGrid[jj].re = _pmeGrid[jj].im = 0.0;
+        _pmeGrid[jj] = complex<double>(0, 0);
 }
 void AmoebaReferencePmeHippoNonbondedForce::getPeriodicDelta(Vec3& deltaR) const {
@@ -1674,9 +1661,14 @@ void AmoebaReferencePmeHippoNonbondedForce::calculateFixedMultipoleField() {
    computeAmoebaBsplines(particleData);
    initializePmeGrid();
    spreadFixedMultipolesOntoGrid(particleData);
-    fftpack_exec_3d(_fftplan, FFTPACK_FORWARD, _pmeGrid.data(), _pmeGrid.data());
+    vector<size_t> shape = {(size_t) _pmeGridDimensions[0], (size_t) _pmeGridDimensions[1], (size_t) _pmeGridDimensions[2]};
+    vector<size_t> axes = {0, 1, 2};
+    vector<ptrdiff_t> stride = {(ptrdiff_t) (_pmeGridDimensions[1]*_pmeGridDimensions[2]*sizeof(complex<double>)),
+                                (ptrdiff_t) (_pmeGridDimensions[2]*sizeof(complex<double>)),
+                                (ptrdiff_t) sizeof(complex<double>)};
+    pocketfft::c2c(shape, stride, stride, axes, true, _pmeGrid.data(), _pmeGrid.data(), 1.0, 0);
    performAmoebaReciprocalConvolution();
-    fftpack_exec_3d(_fftplan, FFTPACK_BACKWARD, _pmeGrid.data(), _pmeGrid.data());
+    pocketfft::c2c(shape, stride, stride, axes, false, _pmeGrid.data(), _pmeGrid.data(), 1.0, 0);
    computeFixedPotentialFromGrid();
    recordFixedMultipoleField();
@@ -1875,7 +1867,7 @@ void AmoebaReferencePmeHippoNonbondedForce::spreadFixedMultipolesOntoGrid(const
    // Clear the grid.
    for (int gridIndex = 0; gridIndex < _pmeGrid.size(); gridIndex++)
-        _pmeGrid[gridIndex] = t_complex(0, 0);
+        _pmeGrid[gridIndex] = complex<double>(0, 0);
    // Loop over atoms and spread them on the grid.
@@ -1904,8 +1896,8 @@ void AmoebaReferencePmeHippoNonbondedForce::spreadFixedMultipolesOntoGrid(const
                for (int iz = 0; iz < AMOEBA_PME_ORDER; iz++) {
                    int z = (gridPoint[2]+iz) % _pmeGridDimensions[2];
                    HippoDouble4 v = _thetai[2][atomIndex*AMOEBA_PME_ORDER+iz];
-                    t_complex& gridValue = _pmeGrid[x*_pmeGridDimensions[1]*_pmeGridDimensions[2]+y*_pmeGridDimensions[2]+z];
+                    complex<double>& gridValue = _pmeGrid[x*_pmeGridDimensions[1]*_pmeGridDimensions[2]+y*_pmeGridDimensions[2]+z];
-                    gridValue.re += term0*v[0] + term1*v[1] + term2*v[2];
+                    gridValue += term0*v[0] + term1*v[1] + term2*v[2];
                }
            }
        }
@@ -1923,7 +1915,7 @@ void AmoebaReferencePmeHippoNonbondedForce::performAmoebaReciprocalConvolution()
        int kz = remainder-ky*_pmeGridDimensions[2];
        if (kx == 0 && ky == 0 && kz == 0) {
-            _pmeGrid[index].re = _pmeGrid[index].im = 0.0;
+            _pmeGrid[index] = complex<double>(0, 0);
            continue;
        }
@@ -1943,8 +1935,7 @@ void AmoebaReferencePmeHippoNonbondedForce::performAmoebaReciprocalConvolution()
        double denom = m2*bx*by*bz;
        double eterm = scaleFactor*exp(-expFactor*m2)/denom;
-        _pmeGrid[index].re *= eterm;
+        _pmeGrid[index] *= eterm;
-        _pmeGrid[index].im *= eterm;
    }
 }
@@ -1993,7 +1984,7 @@ void AmoebaReferencePmeHippoNonbondedForce::computeFixedPotentialFromGrid() {
                for (int ix = 0; ix < AMOEBA_PME_ORDER; ix++) {
                    int i = gridPoint[0]+ix-(gridPoint[0]+ix >= _pmeGridDimensions[0] ? _pmeGridDimensions[0] : 0);
                    int gridIndex = i*_pmeGridDimensions[1]*_pmeGridDimensions[2] + j*_pmeGridDimensions[2] + k;
-                    double tq = _pmeGrid[gridIndex].re;
+                    double tq = _pmeGrid[gridIndex].real();
                    HippoDouble4 tadd = _thetai[0][m*AMOEBA_PME_ORDER+ix];
                    t[0] += tq*tadd[0];
                    t[1] += tq*tadd[1];
@@ -2066,7 +2057,7 @@ void AmoebaReferencePmeHippoNonbondedForce::spreadInducedDipolesOnGrid(const vec
    // Clear the grid.
    for (int gridIndex = 0; gridIndex < _pmeGrid.size(); gridIndex++)
-        _pmeGrid[gridIndex] = t_complex(0, 0);
+        _pmeGrid[gridIndex] = complex<double>(0, 0);
    // Loop over atoms and spread them on the grid.
@@ -2086,8 +2077,8 @@ void AmoebaReferencePmeHippoNonbondedForce::spreadInducedDipolesOnGrid(const vec
                for (int iz = 0; iz < AMOEBA_PME_ORDER; iz++) {
                    int z = (gridPoint[2]+iz) % _pmeGridDimensions[2];
                    HippoDouble4 v = _thetai[2][atomIndex*AMOEBA_PME_ORDER+iz];
-                    t_complex& gridValue = _pmeGrid[x*_pmeGridDimensions[1]*_pmeGridDimensions[2]+y*_pmeGridDimensions[2]+z];
+                    complex<double>& gridValue = _pmeGrid[x*_pmeGridDimensions[1]*_pmeGridDimensions[2]+y*_pmeGridDimensions[2]+z];
-                    gridValue.re += term01*v[0] + term11*v[1];
+                    gridValue += term01*v[0] + term11*v[1];
                }
            }
        }
@@ -2142,12 +2133,12 @@ void AmoebaReferencePmeHippoNonbondedForce::computeInducedPotentialFromGrid() {
                for (int ix = 0; ix < AMOEBA_PME_ORDER; ix++) {
                    int i = gridPoint[0]+ix-(gridPoint[0]+ix >= _pmeGridDimensions[0] ? _pmeGridDimensions[0] : 0);
                    int gridIndex = i*_pmeGridDimensions[1]*_pmeGridDimensions[2] + j*_pmeGridDimensions[2] + k;
-                    t_complex tq = _pmeGrid[gridIndex];
+                    complex<double> tq = _pmeGrid[gridIndex];
                    HippoDouble4 tadd = _thetai[0][m*AMOEBA_PME_ORDER+ix];
-                    t0 += tq.re*tadd[0];
+                    t0 += tq.real()*tadd[0];
-                    t1 += tq.re*tadd[1];
+                    t1 += tq.real()*tadd[1];
-                    t2 += tq.re*tadd[2];
+                    t2 += tq.real()*tadd[2];
-                    t3 += tq.re*tadd[3];
+                    t3 += tq.real()*tadd[3];
                }
                tu00 += t0*u[0];
                tu10 += t1*u[0];
@@ -2412,9 +2403,14 @@ void AmoebaReferencePmeHippoNonbondedForce::calculateReciprocalSpaceInducedDipol
    initializePmeGrid();
    spreadInducedDipolesOnGrid(_inducedDipole);
-    fftpack_exec_3d(_fftplan, FFTPACK_FORWARD, _pmeGrid.data(), _pmeGrid.data());
+    vector<size_t> shape = {(size_t) _pmeGridDimensions[0], (size_t) _pmeGridDimensions[1], (size_t) _pmeGridDimensions[2]};
+    vector<size_t> axes = {0, 1, 2};
+    vector<ptrdiff_t> stride = {(ptrdiff_t) (_pmeGridDimensions[1]*_pmeGridDimensions[2]*sizeof(complex<double>)),
+                                (ptrdiff_t) (_pmeGridDimensions[2]*sizeof(complex<double>)),
+                                (ptrdiff_t) sizeof(complex<double>)};
+    pocketfft::c2c(shape, stride, stride, axes, true, _pmeGrid.data(), _pmeGrid.data(), 1.0, 0);
    performAmoebaReciprocalConvolution();
-    fftpack_exec_3d(_fftplan, FFTPACK_BACKWARD, _pmeGrid.data(), _pmeGrid.data());
+    pocketfft::c2c(shape, stride, stride, axes, false, _pmeGrid.data(), _pmeGrid.data(), 1.0, 0);
    computeInducedPotentialFromGrid();
    recordInducedDipoleField(_inducedDipoleField);
 }

--- a/plugins/amoeba/platforms/reference/src/SimTKReference/AmoebaReferenceHippoNonbondedForce.h
+++ b/plugins/amoeba/platforms/reference/src/SimTKReference/AmoebaReferenceHippoNonbondedForce.h
-/* Portions copyright (c) 2006-2019 Stanford University and Simbios.
+/* Portions copyright (c) 2006-2022 Stanford University and Simbios.
 * Contributors: Pande Group
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -31,7 +31,6 @@
 #include <map>
 #include <utility>
 #include <vector>
-#include "fftpack.h"
 #include <complex>
 namespace OpenMM {
@@ -109,12 +108,6 @@ public:
     * 
     */
    AmoebaReferenceHippoNonbondedForce(const HippoNonbondedForce& force);
-    /**
-     * Destructor
-     * 
-     */
-    virtual ~AmoebaReferenceHippoNonbondedForce() {};
    /**
     * Get nonbonded method.
@@ -655,9 +648,7 @@ private:
    int _pmeGridDimensions[3];
    int _dpmeGridDimensions[3];
-    fftpack_t   _fftplan;
+    std::vector<std::complex<double> > _pmeGrid;
-    std::vector<t_complex> _pmeGrid;
    std::vector<double> _pmeBsplineModuli[3];
    std::vector<HippoDouble4> _thetai[3];

--- a/plugins/amoeba/platforms/reference/src/SimTKReference/AmoebaReferenceMultipoleForce.cpp
+++ b/plugins/amoeba/platforms/reference/src/SimTKReference/AmoebaReferenceMultipoleForce.cpp
-/* Portions copyright (c) 2006-2015 Stanford University and Simbios.
+/* Portions copyright (c) 2006-222 Stanford University and Simbios.
 * Contributors: Pande Group
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -26,6 +26,10 @@
 #include "SimTKOpenMMRealType.h"
 #include "jama_svd.h"
 #include <algorithm>
+#ifdef _MSC_VER
+  #define POCKETFFT_NO_VECTORS
+#endif
+#include "pocketfft_hdronly.h"
 // In case we're using some primitive version of Visual Studio this will
 // make sure that erf() and erfc() are defined.
@@ -4809,16 +4813,12 @@ AmoebaReferencePmeMultipoleForce::AmoebaReferencePmeMultipoleForce() :
               _pmeGridSize(0), _totalGridSize(0), _alphaEwald(0.0)
 {
-    _fftplan = NULL;
    _pmeGrid = NULL;
    _pmeGridDimensions = IntVec(-1, -1, -1);
 }
 AmoebaReferencePmeMultipoleForce::~AmoebaReferencePmeMultipoleForce()
 {
-    if (_fftplan) {
-        fftpack_destroy(_fftplan);
-    }
    if (_pmeGrid) {
        delete[] _pmeGrid;
    }
@@ -4863,11 +4863,6 @@ void AmoebaReferencePmeMultipoleForce::setPmeGridDimensions(vector<int>& pmeGrid
        (pmeGridDimensions[2] == _pmeGridDimensions[2]))
        return;
-    if (_fftplan) {
-        fftpack_destroy(_fftplan);
-    }
-    fftpack_init_3d(&_fftplan,pmeGridDimensions[0], pmeGridDimensions[1], pmeGridDimensions[2]);
    _pmeGridDimensions[0] = pmeGridDimensions[0];
    _pmeGridDimensions[1] = pmeGridDimensions[1];
    _pmeGridDimensions[2] = pmeGridDimensions[2];
@@ -4908,7 +4903,7 @@ void AmoebaReferencePmeMultipoleForce::resizePmeArrays()
        if (_pmeGrid) {
            delete[] _pmeGrid;
        }
-        _pmeGrid      = new t_complex[_totalGridSize];
+        _pmeGrid      = new complex<double>[_totalGridSize];
        _pmeGridSize  = _totalGridSize;
    }
@@ -4930,7 +4925,7 @@ void AmoebaReferencePmeMultipoleForce::initializePmeGrid()
        return;
    for (int jj = 0; jj < _totalGridSize; jj++)
-        _pmeGrid[jj].re = _pmeGrid[jj].im = 0.0;
+        _pmeGrid[jj] = complex<double>(0, 0);
 }
 void AmoebaReferencePmeMultipoleForce::getPeriodicDelta(Vec3& deltaR) const
@@ -5175,9 +5170,14 @@ void AmoebaReferencePmeMultipoleForce::calculateFixedMultipoleField(const vector
    computeAmoebaBsplines(particleData);
    initializePmeGrid();
    spreadFixedMultipolesOntoGrid(particleData);
-    fftpack_exec_3d(_fftplan, FFTPACK_FORWARD, _pmeGrid, _pmeGrid);
+    vector<size_t> shape = {(size_t) _pmeGridDimensions[0], (size_t) _pmeGridDimensions[1], (size_t) _pmeGridDimensions[2]};
+    vector<size_t> axes = {0, 1, 2};
+    vector<ptrdiff_t> stride = {(ptrdiff_t) (_pmeGridDimensions[1]*_pmeGridDimensions[2]*sizeof(complex<double>)),
+                                (ptrdiff_t) (_pmeGridDimensions[2]*sizeof(complex<double>)),
+                                (ptrdiff_t) sizeof(complex<double>)};
+    pocketfft::c2c(shape, stride, stride, axes, true, _pmeGrid, _pmeGrid, 1.0, 0);
    performAmoebaReciprocalConvolution();
-    fftpack_exec_3d(_fftplan, FFTPACK_BACKWARD, _pmeGrid, _pmeGrid);
+    pocketfft::c2c(shape, stride, stride, axes, false, _pmeGrid, _pmeGrid, 1.0, 0);
    computeFixedPotentialFromGrid();
    recordFixedMultipoleField();
@@ -5385,7 +5385,7 @@ void AmoebaReferencePmeMultipoleForce::spreadFixedMultipolesOntoGrid(const vecto
    // Clear the grid.
    for (int gridIndex = 0; gridIndex < _totalGridSize; gridIndex++)
-        _pmeGrid[gridIndex] = t_complex(0, 0);
+        _pmeGrid[gridIndex] = complex<double>(0, 0);
    // Loop over atoms and spread them on the grid.
@@ -5414,8 +5414,8 @@ void AmoebaReferencePmeMultipoleForce::spreadFixedMultipolesOntoGrid(const vecto
                for (int iz = 0; iz < AMOEBA_PME_ORDER; iz++) {
                    int z = (gridPoint[2]+iz) % _pmeGridDimensions[2];
                    double4 v = _thetai[2][atomIndex*AMOEBA_PME_ORDER+iz];
-                    t_complex& gridValue = _pmeGrid[x*_pmeGridDimensions[1]*_pmeGridDimensions[2]+y*_pmeGridDimensions[2]+z];
+                    complex<double>& gridValue = _pmeGrid[x*_pmeGridDimensions[1]*_pmeGridDimensions[2]+y*_pmeGridDimensions[2]+z];
-                    gridValue.re += term0*v[0] + term1*v[1] + term2*v[2];
+                    gridValue += term0*v[0] + term1*v[1] + term2*v[2];
                }
            }
        }
@@ -5436,7 +5436,7 @@ void AmoebaReferencePmeMultipoleForce::performAmoebaReciprocalConvolution()
        int kz = remainder-ky*_pmeGridDimensions[2];
        if (kx == 0 && ky == 0 && kz == 0) {
-            _pmeGrid[index].re = _pmeGrid[index].im = 0.0;
+            _pmeGrid[index] = complex<double>(0, 0);
            continue;
        }
@@ -5456,8 +5456,7 @@ void AmoebaReferencePmeMultipoleForce::performAmoebaReciprocalConvolution()
        double denom = m2*bx*by*bz;
        double eterm = scaleFactor*exp(-expFactor*m2)/denom;
-        _pmeGrid[index].re *= eterm;
+        _pmeGrid[index] *= eterm;
-        _pmeGrid[index].im *= eterm;
    }
 }
@@ -5507,7 +5506,7 @@ void AmoebaReferencePmeMultipoleForce::computeFixedPotentialFromGrid()
                for (int ix = 0; ix < AMOEBA_PME_ORDER; ix++) {
                    int i = gridPoint[0]+ix-(gridPoint[0]+ix >= _pmeGridDimensions[0] ? _pmeGridDimensions[0] : 0);
                    int gridIndex = i*_pmeGridDimensions[1]*_pmeGridDimensions[2] + j*_pmeGridDimensions[2] + k;
-                    double tq = _pmeGrid[gridIndex].re;
+                    double tq = _pmeGrid[gridIndex].real();
                    double4 tadd = _thetai[0][m*AMOEBA_PME_ORDER+ix];
                    t[0] += tq*tadd[0];
                    t[1] += tq*tadd[1];
@@ -5581,7 +5580,7 @@ void AmoebaReferencePmeMultipoleForce::spreadInducedDipolesOnGrid(const vector<V
    // Clear the grid.
    for (int gridIndex = 0; gridIndex < _totalGridSize; gridIndex++)
-        _pmeGrid[gridIndex] = t_complex(0, 0);
+        _pmeGrid[gridIndex] = complex<double>(0, 0);
    // Loop over atoms and spread them on the grid.
@@ -5606,9 +5605,8 @@ void AmoebaReferencePmeMultipoleForce::spreadInducedDipolesOnGrid(const vector<V
                for (int iz = 0; iz < AMOEBA_PME_ORDER; iz++) {
                    int z = (gridPoint[2]+iz) % _pmeGridDimensions[2];
                    double4 v = _thetai[2][atomIndex*AMOEBA_PME_ORDER+iz];
-                    t_complex& gridValue = _pmeGrid[x*_pmeGridDimensions[1]*_pmeGridDimensions[2]+y*_pmeGridDimensions[2]+z];
+                    complex<double>& gridValue = _pmeGrid[x*_pmeGridDimensions[1]*_pmeGridDimensions[2]+y*_pmeGridDimensions[2]+z];
-                    gridValue.re += term01*v[0] + term11*v[1];
+                    gridValue += complex<double>(term01*v[0] + term11*v[1], term02*v[0] + term12*v[1]);
-                    gridValue.im += term02*v[0] + term12*v[1];
                }
            }
        }
@@ -5697,15 +5695,15 @@ void AmoebaReferencePmeMultipoleForce::computeInducedPotentialFromGrid()
                for (int ix = 0; ix < AMOEBA_PME_ORDER; ix++) {
                    int i = gridPoint[0]+ix-(gridPoint[0]+ix >= _pmeGridDimensions[0] ? _pmeGridDimensions[0] : 0);
                    int gridIndex = i*_pmeGridDimensions[1]*_pmeGridDimensions[2] + j*_pmeGridDimensions[2] + k;
-                    t_complex tq = _pmeGrid[gridIndex];
+                    complex<double> tq = _pmeGrid[gridIndex];
                    double4 tadd = _thetai[0][m*AMOEBA_PME_ORDER+ix];
-                    t0_1 += tq.re*tadd[0];
+                    t0_1 += tq.real()*tadd[0];
-                    t1_1 += tq.re*tadd[1];
+                    t1_1 += tq.real()*tadd[1];
-                    t2_1 += tq.re*tadd[2];
+                    t2_1 += tq.real()*tadd[2];
-                    t0_2 += tq.im*tadd[0];
+                    t0_2 += tq.imag()*tadd[0];
-                    t1_2 += tq.im*tadd[1];
+                    t1_2 += tq.imag()*tadd[1];
-                    t2_2 += tq.im*tadd[2];
+                    t2_2 += tq.imag()*tadd[2];
-                    t3 += (tq.re+tq.im)*tadd[3];
+                    t3 += (tq.real()+tq.imag())*tadd[3];
                }
                tu00_1 += t0_1*u[0];
                tu10_1 += t1_1*u[0];
@@ -6049,9 +6047,14 @@ void AmoebaReferencePmeMultipoleForce::calculateReciprocalSpaceInducedDipoleFiel
    initializePmeGrid();
    spreadInducedDipolesOnGrid(*updateInducedDipoleFields[0].inducedDipoles, *updateInducedDipoleFields[1].inducedDipoles);
-    fftpack_exec_3d(_fftplan, FFTPACK_FORWARD, _pmeGrid, _pmeGrid);
+    vector<size_t> shape = {(size_t) _pmeGridDimensions[0], (size_t) _pmeGridDimensions[1], (size_t) _pmeGridDimensions[2]};
+    vector<size_t> axes = {0, 1, 2};
+    vector<ptrdiff_t> stride = {(ptrdiff_t) (_pmeGridDimensions[1]*_pmeGridDimensions[2]*sizeof(complex<double>)),
+                                (ptrdiff_t) (_pmeGridDimensions[2]*sizeof(complex<double>)),
+                                (ptrdiff_t) sizeof(complex<double>)};
+    pocketfft::c2c(shape, stride, stride, axes, true, _pmeGrid, _pmeGrid, 1.0, 0);
    performAmoebaReciprocalConvolution();
-    fftpack_exec_3d(_fftplan, FFTPACK_BACKWARD, _pmeGrid, _pmeGrid);
+    pocketfft::c2c(shape, stride, stride, axes, false, _pmeGrid, _pmeGrid, 1.0, 0);
    computeInducedPotentialFromGrid();
    recordInducedDipoleField(updateInducedDipoleFields[0].inducedDipoleField, updateInducedDipoleFields[1].inducedDipoleField);
 }

--- a/plugins/amoeba/platforms/reference/src/SimTKReference/AmoebaReferenceMultipoleForce.h
+++ b/plugins/amoeba/platforms/reference/src/SimTKReference/AmoebaReferenceMultipoleForce.h
-/* Portions copyright (c) 2006-2015 Stanford University and Simbios.
+/* Portions copyright (c) 2006-2022 Stanford University and Simbios.
 * Contributors: Pande Group
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -28,7 +28,6 @@
 #include "openmm/Vec3.h"
 #include "AmoebaReferenceGeneralizedKirkwoodForce.h"
 #include <map>
-#include "fftpack.h"
 #include <complex>
 namespace OpenMM {
@@ -1474,10 +1473,8 @@ private:
    int _totalGridSize;
    IntVec _pmeGridDimensions;
-    fftpack_t   _fftplan;
    unsigned int _pmeGridSize;
-    t_complex* _pmeGrid;
+    std::complex<double>* _pmeGrid;
    std::vector<double> _pmeBsplineModuli[3];
    std::vector<double4> _thetai[3];

--- a/plugins/cpupme/CMakeLists.txt
+++ b/plugins/cpupme/CMakeLists.txt
@@ -64,17 +64,14 @@ IF(NOT MSVC)
    ENDIF()
 ENDIF()
-# Include FFTW related files.
+# Include PocketFFT.
-INCLUDE_DIRECTORIES(${FFTW_INCLUDES})
+INCLUDE_DIRECTORIES("${CMAKE_SOURCE_DIR}/libraries/include/pocketfft")
 # Build the shared plugin library.
 IF (OPENMM_BUILD_SHARED_LIB)
    ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_INCLUDE_FILES})
-    TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${OPENMM_LIBRARY_NAME} ${PTHREADS_LIB} ${FFTW_LIBRARY})
+    TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${OPENMM_LIBRARY_NAME} ${PTHREADS_LIB})
-    IF (FFTW_THREADS_LIBRARY)
-        TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${FFTW_THREADS_LIBRARY})
-    ENDIF (FFTW_THREADS_LIBRARY)
    SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_PME_BUILDING_SHARED_LIBRARY")
    INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET})
@@ -84,10 +81,7 @@ ENDIF (OPENMM_BUILD_SHARED_LIB)
 IF(OPENMM_BUILD_STATIC_LIB)
    ADD_LIBRARY(${STATIC_TARGET} STATIC ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_INCLUDE_FILES})
-    TARGET_LINK_LIBRARIES(${STATIC_TARGET} ${OPENMM_LIBRARY_NAME}_static ${PTHREADS_LIB} ${FFTW_LIBRARY})
+    TARGET_LINK_LIBRARIES(${STATIC_TARGET} ${OPENMM_LIBRARY_NAME}_static ${PTHREADS_LIB})
-    IF (FFTW_THREADS_LIBRARY)
-        TARGET_LINK_LIBRARIES(${STATIC_TARGET} ${FFTW_THREADS_LIBRARY})
-    ENDIF (FFTW_THREADS_LIBRARY)
    SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_PME_BUILDING_STATIC_LIBRARY")
    INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${STATIC_TARGET})

--- a/plugins/cpupme/src/CpuPmeKernels.cpp
+++ b/plugins/cpupme/src/CpuPmeKernels.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2013-2018 Stanford University and the Authors.      *
+ * Portions copyright (c) 2013-2022 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -32,11 +32,16 @@
 #ifdef WIN32
  #define _USE_MATH_DEFINES // Needed to get M_PI
 #endif
+#ifdef _MSC_VER
+  #define POCKETFFT_NO_VECTORS
+#endif
+#define POCKETFFT_CACHE_SIZE 4
 #include "CpuPmeKernels.h"
 #include "SimTKOpenMMRealType.h"
 #include "openmm/internal/hardware.h"
 #include "openmm/internal/vectorize.h"
 #include "openmm/OpenMMException.h"
+#include "pocketfft_hdronly.h"
 #include <cmath>
 #include <algorithm>
 #include <cstring>
@@ -51,7 +56,7 @@ static const int PME_ORDER = 5;
 bool CpuCalcDispersionPmeReciprocalForceKernel::hasInitializedThreads = false;
 int CpuCalcDispersionPmeReciprocalForceKernel::numThreads = 0;
-static void spreadCharge(float* posq, float* grid, int gridx, int gridy, int gridz, int numParticles, Vec3* periodicBoxVectors, Vec3* recipBoxVectors,
+static void spreadCharge(float* posq, vector<float>& grid, int gridx, int gridy, int gridz, int numParticles, Vec3* periodicBoxVectors, Vec3* recipBoxVectors,
        atomic<int>& atomicCounter, const float epsilonFactor, int threadIndex, int numThreads, bool deterministic) {
    float temp[4];
    fvec4 boxSize((float) periodicBoxVectors[0][0], (float) periodicBoxVectors[1][1], (float) periodicBoxVectors[2][2], 0);
@@ -64,7 +69,7 @@ static void spreadCharge(float* posq, float* grid, int gridx, int gridy, int gri
    fvec4 one(1);
    fvec4 scale(1.0f/(PME_ORDER-1));
    float posInBox[4] = {0,0,0,0};
-    memset(grid, 0, sizeof(float)*gridx*gridy*gridz);
+    memset(grid.data(), 0, sizeof(float)*gridx*gridy*gridz);
    const int groupSize = max(1, numParticles / (10 * numThreads));
    int start = groupSize * threadIndex;
@@ -244,7 +249,7 @@ static void computeReciprocalEterm(int start, int end, int gridx, int gridy, int
    }
 }
-static double reciprocalEnergy(int start, int end, fftwf_complex* grid, vector<float>& recipEterm, int gridx, int gridy, int gridz, double alpha, vector<float>* bsplineModuli, Vec3* periodicBoxVectors, Vec3* recipBoxVectors) {
+static double reciprocalEnergy(int start, int end, vector<complex<float> >& grid, vector<float>& recipEterm, int gridx, int gridy, int gridz, double alpha, vector<float>* bsplineModuli, Vec3* periodicBoxVectors, Vec3* recipBoxVectors) {
    const unsigned int zsizeHalf = gridz/2+1;
    const unsigned int yzsizeHalf = gridy*zsizeHalf;
@@ -266,8 +271,8 @@ static double reciprocalEnergy(int start, int end, fftwf_complex* grid, vector<f
                    kz1 = kz;
                }
                int index = kx1*yzsizeHalf + ky1*zsizeHalf + kz1;
-                float gridReal = grid[index][0];
+                float gridReal = grid[index].real();
-                float gridImag = grid[index][1];
+                float gridImag = grid[index].imag();
                energy += recipEterm[index]*(gridReal*gridReal+gridImag*gridImag);
            }
            firstz = 0;
@@ -277,7 +282,7 @@ static double reciprocalEnergy(int start, int end, fftwf_complex* grid, vector<f
 }
-static double reciprocalDispersionEnergy(int start, int end, fftwf_complex* grid, const vector<float>& recipEterm, int gridx, int gridy, int gridz, double alpha, vector<float>* bsplineModuli, Vec3* periodicBoxVectors, Vec3* recipBoxVectors) {
+static double reciprocalDispersionEnergy(int start, int end, vector<complex<float> >& grid, const vector<float>& recipEterm, int gridx, int gridy, int gridz, double alpha, vector<float>* bsplineModuli, Vec3* periodicBoxVectors, Vec3* recipBoxVectors) {
    const unsigned int zsizeHalf = gridz/2+1;
    const unsigned int yzsizeHalf = gridy*zsizeHalf;
@@ -298,8 +303,8 @@ static double reciprocalDispersionEnergy(int start, int end, fftwf_complex* grid
                    kz1 = kz;
                }
                int index = kx1*yzsizeHalf + ky1*zsizeHalf + kz1;
-                float gridReal = grid[index][0];
+                float gridReal = grid[index].real();
-                float gridImag = grid[index][1];
+                float gridImag = grid[index].imag();
                energy += recipEterm[index]*(gridReal*gridReal+gridImag*gridImag);
            }
        }
@@ -308,15 +313,14 @@ static double reciprocalDispersionEnergy(int start, int end, fftwf_complex* grid
 }
-static void reciprocalConvolution(int start, int end, fftwf_complex* grid, vector<float>& recipEterm) {
+static void reciprocalConvolution(int start, int end, vector<complex<float> >& grid, vector<float>& recipEterm) {
    for (int index = start; index < end; index++) {
        float eterm = recipEterm[index];
-        grid[index][0] *= eterm;
+        grid[index] *= eterm;
-        grid[index][1] *= eterm;
    }
 }
-static void interpolateForces(float* posq, float* force, float* grid, int gridx, int gridy, int gridz, int numParticles, Vec3* periodicBoxVectors, Vec3* recipBoxVectors, atomic<int>& atomicCounter, const float epsilonFactor, int numThreads) {
+static void interpolateForces(float* posq, vector<float>& force, vector<float>& grid, int gridx, int gridy, int gridz, int numParticles, Vec3* periodicBoxVectors, Vec3* recipBoxVectors, atomic<int>& atomicCounter, const float epsilonFactor, int numThreads) {
    fvec4 boxSize((float) periodicBoxVectors[0][0], (float) periodicBoxVectors[1][1], (float) periodicBoxVectors[2][2], 0);
    fvec4 invBoxSize((float) recipBoxVectors[0][0], (float) recipBoxVectors[1][1], (float) recipBoxVectors[2][2], 0);
    fvec4 recipBoxVec0((float) recipBoxVectors[0][0], (float) recipBoxVectors[0][1], (float) recipBoxVectors[0][2], 0);
@@ -429,13 +433,24 @@ void CpuCalcPmeReciprocalForceKernel::initialize(int xsize, int ysize, int zsize
        char* threadsEnv = getenv("OPENMM_CPU_THREADS");
        if (threadsEnv != NULL)
            stringstream(threadsEnv) >> numThreads;
-        fftwf_init_threads();
        hasInitializedThreads = true;
    }
    threadEnergy.resize(numThreads);
-    gridx = findFFTDimension(xsize, false);
+    gridx = findFFTDimension(xsize);
-    gridy = findFFTDimension(ysize, false);
+    gridy = findFFTDimension(ysize);
-    gridz = findFFTDimension(zsize, true);
+    gridz = findFFTDimension(zsize);
+    gridShape.push_back(gridx);
+    gridShape.push_back(gridy);
+    gridShape.push_back(gridz);
+    fftAxes.push_back(0);
+    fftAxes.push_back(1);
+    fftAxes.push_back(2);
+    realGridStride.push_back(gridy*gridz*sizeof(float));
+    realGridStride.push_back(gridz*sizeof(float));
+    realGridStride.push_back(sizeof(float));
+    complexGridStride.push_back(gridy*(gridz/2+1)*sizeof(complex<float>));
+    complexGridStride.push_back((gridz/2+1)*sizeof(complex<float>));
+    complexGridStride.push_back(sizeof(complex<float>));
    this->numParticles = numParticles;
    this->alpha = alpha;
    this->deterministic = deterministic;
@@ -457,16 +472,10 @@ void CpuCalcPmeReciprocalForceKernel::initialize(int xsize, int ysize, int zsize
        pthread_cond_wait(&endCondition, &lock);
    pthread_mutex_unlock(&lock);
-    // Initialize FFTW.
+    // Initialize the FFT grids.
-    for (int i = 0; i < numThreads; i++)
+    realGrids.resize(numThreads, vector<float>(gridx*gridy*gridz+3));
-        tempGrid.push_back((float*) fftwf_malloc(sizeof(float)*(gridx*gridy*gridz+3)));
+    complexGrid.resize(gridx*gridy*(gridz/2+1));
-    realGrid = tempGrid[0];
-    complexGrid = (fftwf_complex*) fftwf_malloc(sizeof(fftwf_complex)*gridx*gridy*(gridz/2+1));
-    fftwf_plan_with_nthreads(numThreads);
-    forwardFFT = fftwf_plan_dft_r2c_3d(gridx, gridy, gridz, realGrid, complexGrid, FFTW_MEASURE);
-    backwardFFT = fftwf_plan_dft_c2r_3d(gridx, gridy, gridz, complexGrid, realGrid, FFTW_MEASURE);
-    hasCreatedPlan = true;
    // Initialize the b-spline moduli.
@@ -533,14 +542,6 @@ CpuCalcPmeReciprocalForceKernel::~CpuCalcPmeReciprocalForceKernel() {
    pthread_mutex_destroy(&lock);
    pthread_cond_destroy(&startCondition);
    pthread_cond_destroy(&endCondition);
-    for (auto grid : tempGrid)
-        fftwf_free(grid);
-    if (complexGrid != NULL)
-        fftwf_free(complexGrid);
-    if (hasCreatedPlan) {
-        fftwf_destroy_plan(forwardFFT);
-        fftwf_destroy_plan(backwardFFT);
-    }
 }
 void CpuCalcPmeReciprocalForceKernel::runMainThread() {
@@ -562,7 +563,7 @@ void CpuCalcPmeReciprocalForceKernel::runMainThread() {
        threads.waitForThreads();
        threads.resumeThreads(); // Signal threads to sum the charge grids.
        threads.waitForThreads();
-        fftwf_execute_dft_r2c(forwardFFT, realGrid, complexGrid);
+        pocketfft::r2c(gridShape, realGridStride, complexGridStride, fftAxes, true, realGrids[0].data(), complexGrid.data(), 1.0f, 0);
        if (lastBoxVectors[0] != periodicBoxVectors[0] || lastBoxVectors[1] != periodicBoxVectors[1] || lastBoxVectors[2] != periodicBoxVectors[2]) {
            threads.resumeThreads(); // Signal threads to compute the reciprocal scale factors.
            threads.waitForThreads();
@@ -575,7 +576,7 @@ void CpuCalcPmeReciprocalForceKernel::runMainThread() {
        }
        threads.resumeThreads(); // Signal threads to perform reciprocal convolution.
        threads.waitForThreads();
-        fftwf_execute_dft_c2r(backwardFFT, complexGrid, realGrid);
+        pocketfft::c2r(gridShape, complexGridStride, realGridStride, fftAxes, false, complexGrid.data(), realGrids[0].data(), 1.0f, 0);
        atomicCounter = 0;
        threads.resumeThreads(); // Signal threads to interpolate forces.
        threads.waitForThreads();
@@ -598,14 +599,14 @@ void CpuCalcPmeReciprocalForceKernel::runWorkerThread(ThreadPool& threads, int i
    int complexStart = std::max(1, ((index*complexSize)/numThreads));
    int complexEnd = (((index+1)*complexSize)/numThreads);
    const float epsilonFactor = sqrt(ONE_4PI_EPS0);
-    spreadCharge(posq, tempGrid[index], gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors, atomicCounter, epsilonFactor, index, numThreads, deterministic);
+    spreadCharge(posq, realGrids[index], gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors, atomicCounter, epsilonFactor, index, numThreads, deterministic);
    threads.syncThreads();
-    int numGrids = tempGrid.size();
+    int numGrids = realGrids.size();
    for (int i = gridStart; i < gridEnd; i += 4) {
-        fvec4 sum(&realGrid[i]);
+        fvec4 sum(&realGrids[0][i]);
        for (int j = 1; j < numGrids; j++)
-            sum += fvec4(&tempGrid[j][i]);
+            sum += fvec4(&realGrids[j][i]);
-        sum.store(&realGrid[i]);
+        sum.store(&realGrids[0][i]);
    }
    threads.syncThreads();
    if (lastBoxVectors[0] != periodicBoxVectors[0] || lastBoxVectors[1] != periodicBoxVectors[1] || lastBoxVectors[2] != periodicBoxVectors[2]) {
@@ -618,7 +619,7 @@ void CpuCalcPmeReciprocalForceKernel::runWorkerThread(ThreadPool& threads, int i
    }
    reciprocalConvolution(complexStart, complexEnd, complexGrid, recipEterm);
    threads.syncThreads();
-    interpolateForces(posq, &force[0], realGrid, gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors, atomicCounter, epsilonFactor, numThreads);
+    interpolateForces(posq, force, realGrids[0], gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors, atomicCounter, epsilonFactor, numThreads);
 }
 void CpuCalcPmeReciprocalForceKernel::beginComputation(IO& io, const Vec3* periodicBoxVectors, bool includeEnergy) {
@@ -666,24 +667,18 @@ void CpuCalcPmeReciprocalForceKernel::getPMEParameters(double& alpha, int& nx, i
    nz = gridz;
 }
-int CpuCalcPmeReciprocalForceKernel::findFFTDimension(int minimum, bool isZ) {
+int CpuCalcPmeReciprocalForceKernel::findFFTDimension(int minimum) {
    if (minimum < 1)
        return 1;
    while (true) {
        // Attempt to factor the current value.
-        if (isZ && minimum%2 == 1) {
-            // Force the last dimension to be even, since this produces better performance in FFTW.
-            minimum++;
-            continue;
-        }
        int unfactored = minimum;
-        for (int factor = 2; factor < 8; factor++) {
+        for (int factor = 2; factor < 9; factor++) {
            while (unfactored > 1 && unfactored%factor == 0)
                unfactored /= factor;
        }
-        if (unfactored == 1 || unfactored == 11 || unfactored == 13)
+        if (unfactored == 1 || unfactored == 11)
            return minimum;
        minimum++;
    }
@@ -720,13 +715,24 @@ void CpuCalcDispersionPmeReciprocalForceKernel::initialize(int xsize, int ysize,
        char* threadsEnv = getenv("OPENMM_CPU_THREADS");
        if (threadsEnv != NULL)
            stringstream(threadsEnv) >> numThreads;
-        fftwf_init_threads();
        hasInitializedThreads = true;
    }
    threadEnergy.resize(numThreads);
-    gridx = findFFTDimension(xsize, false);
+    gridx = findFFTDimension(xsize);
-    gridy = findFFTDimension(ysize, false);
+    gridy = findFFTDimension(ysize);
-    gridz = findFFTDimension(zsize, true);
+    gridz = findFFTDimension(zsize);
+    gridShape.push_back(gridx);
+    gridShape.push_back(gridy);
+    gridShape.push_back(gridz);
+    fftAxes.push_back(0);
+    fftAxes.push_back(1);
+    fftAxes.push_back(2);
+    realGridStride.push_back(gridy*gridz*sizeof(float));
+    realGridStride.push_back(gridz*sizeof(float));
+    realGridStride.push_back(sizeof(float));
+    complexGridStride.push_back(gridy*(gridz/2+1)*sizeof(complex<float>));
+    complexGridStride.push_back((gridz/2+1)*sizeof(complex<float>));
+    complexGridStride.push_back(sizeof(complex<float>));
    this->numParticles = numParticles;
    this->alpha = alpha;
    this->deterministic = deterministic;
@@ -748,16 +754,11 @@ void CpuCalcDispersionPmeReciprocalForceKernel::initialize(int xsize, int ysize,
        pthread_cond_wait(&endCondition, &lock);
    pthread_mutex_unlock(&lock);
-    // Initialize FFTW.
+    // Initialize the FFT grids.
-    for (int i = 0; i < numThreads; i++)
-        tempGrid.push_back((float*) fftwf_malloc(sizeof(float)*(gridx*gridy*gridz+3)));
+    realGrids.resize(numThreads, vector<float>(gridx*gridy*gridz+3));
-    realGrid = tempGrid[0];
+    complexGrid.resize(gridx*gridy*(gridz/2+1));
-    complexGrid = (fftwf_complex*) fftwf_malloc(sizeof(fftwf_complex)*gridx*gridy*(gridz/2+1));
-    fftwf_plan_with_nthreads(numThreads);
-    forwardFFT = fftwf_plan_dft_r2c_3d(gridx, gridy, gridz, realGrid, complexGrid, FFTW_MEASURE);
-    backwardFFT = fftwf_plan_dft_c2r_3d(gridx, gridy, gridz, complexGrid, realGrid, FFTW_MEASURE);
-    hasCreatedPlan = true;
    // Initialize the b-spline moduli.
@@ -824,14 +825,6 @@ CpuCalcDispersionPmeReciprocalForceKernel::~CpuCalcDispersionPmeReciprocalForceK
    pthread_mutex_destroy(&lock);
    pthread_cond_destroy(&startCondition);
    pthread_cond_destroy(&endCondition);
-    for (auto grid : tempGrid)
-        fftwf_free(grid);
-    if (complexGrid != NULL)
-        fftwf_free(complexGrid);
-    if (hasCreatedPlan) {
-        fftwf_destroy_plan(forwardFFT);
-        fftwf_destroy_plan(backwardFFT);
-    }
 }
 void CpuCalcDispersionPmeReciprocalForceKernel::runMainThread() {
@@ -854,7 +847,7 @@ void CpuCalcDispersionPmeReciprocalForceKernel::runMainThread() {
        threads.waitForThreads();
        threads.resumeThreads(); // Signal threads to sum the charge grids.
        threads.waitForThreads();
-        fftwf_execute_dft_r2c(forwardFFT, realGrid, complexGrid);
+        pocketfft::r2c(gridShape, realGridStride, complexGridStride, fftAxes, true, realGrids[0].data(), complexGrid.data(), 1.0f, 0);
        if (lastBoxVectors[0] != periodicBoxVectors[0] || lastBoxVectors[1] != periodicBoxVectors[1] || lastBoxVectors[2] != periodicBoxVectors[2]) {
            threads.resumeThreads(); // Signal threads to compute the reciprocal scale factors.
            threads.waitForThreads();
@@ -867,7 +860,7 @@ void CpuCalcDispersionPmeReciprocalForceKernel::runMainThread() {
        }
        threads.resumeThreads(); // Signal threads to perform reciprocal convolution.
        threads.waitForThreads();
-        fftwf_execute_dft_c2r(backwardFFT, complexGrid, realGrid);
+        pocketfft::c2r(gridShape, complexGridStride, realGridStride, fftAxes, false, complexGrid.data(), realGrids[0].data(), 1.0f, 0);
        atomicCounter = 0;
        threads.resumeThreads(); // Signal threads to interpolate forces.
        threads.waitForThreads();
@@ -890,14 +883,14 @@ void CpuCalcDispersionPmeReciprocalForceKernel::runWorkerThread(ThreadPool& thre
    int complexStart = std::max(1, ((index*complexSize)/numThreads));
    int complexEnd = (((index+1)*complexSize)/numThreads);
    const float epsilonFactor = 1.0f;
-    spreadCharge(posq, tempGrid[index], gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors, atomicCounter, epsilonFactor, index, numThreads, deterministic);
+    spreadCharge(posq, realGrids[index], gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors, atomicCounter, epsilonFactor, index, numThreads, deterministic);
    threads.syncThreads();
-    int numGrids = tempGrid.size();
+    int numGrids = realGrids.size();
    for (int i = gridStart; i < gridEnd; i += 4) {
-        fvec4 sum(&realGrid[i]);
+        fvec4 sum(&realGrids[0][i]);
        for (int j = 1; j < numGrids; j++)
-            sum += fvec4(&tempGrid[j][i]);
+            sum += fvec4(&realGrids[j][i]);
-        sum.store(&realGrid[i]);
+        sum.store(&realGrids[0][i]);
    }
    threads.syncThreads();
    if (lastBoxVectors[0] != periodicBoxVectors[0] || lastBoxVectors[1] != periodicBoxVectors[1] || lastBoxVectors[2] != periodicBoxVectors[2]) {
@@ -912,7 +905,7 @@ void CpuCalcDispersionPmeReciprocalForceKernel::runWorkerThread(ThreadPool& thre
    complexStart = (index*complexSize)/numThreads;
    reciprocalConvolution(complexStart, complexEnd, complexGrid, recipEterm);
    threads.syncThreads();
-    interpolateForces(posq, &force[0], realGrid, gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors, atomicCounter, epsilonFactor, numThreads);
+    interpolateForces(posq, force, realGrids[0], gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors, atomicCounter, epsilonFactor, numThreads);
 }
 void CpuCalcDispersionPmeReciprocalForceKernel::beginComputation(CalcPmeReciprocalForceKernel::IO& io, const Vec3* periodicBoxVectors, bool includeEnergy) {
@@ -960,24 +953,18 @@ void CpuCalcDispersionPmeReciprocalForceKernel::getPMEParameters(double& alpha,
    nz = gridz;
 }
-int CpuCalcDispersionPmeReciprocalForceKernel::findFFTDimension(int minimum, bool isZ) {
+int CpuCalcDispersionPmeReciprocalForceKernel::findFFTDimension(int minimum) {
    if (minimum < 1)
        return 1;
    while (true) {
        // Attempt to factor the current value.
-        if (isZ && minimum%2 == 1) {
-            // Force the last dimension to be even, since this produces better performance in FFTW.
-            minimum++;
-            continue;
-        }
        int unfactored = minimum;
-        for (int factor = 2; factor < 8; factor++) {
+        for (int factor = 2; factor < 9; factor++) {
            while (unfactored > 1 && unfactored%factor == 0)
                unfactored /= factor;
        }
-        if (unfactored == 1 || unfactored == 11 || unfactored == 13)
+        if (unfactored == 1 || unfactored == 11)
            return minimum;
        minimum++;
    }

--- a/plugins/cpupme/src/CpuPmeKernels.h
+++ b/plugins/cpupme/src/CpuPmeKernels.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2013-2018 Stanford University and the Authors.      *
+ * Portions copyright (c) 2013-2022 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -38,7 +38,7 @@
 #include "openmm/Vec3.h"
 #include "openmm/internal/ThreadPool.h"
 #include <atomic>
-#include <fftw3.h>
+#include <complex>
 #include <pthread.h>
 #include <vector>
@@ -46,13 +46,13 @@ namespace OpenMM {
 /**
 * This is an optimized CPU implementation of CalcPmeReciprocalForceKernel.  It is both
- * vectorized (requiring SSE 4.1) and multithreaded.  It uses FFTW to perform the FFTs.
+ * vectorized (requiring SSE 4.1) and multithreaded.  It uses PocketFFT to perform the FFTs.
 */
 class OPENMM_EXPORT_PME CpuCalcPmeReciprocalForceKernel : public CalcPmeReciprocalForceKernel {
 public:
    CpuCalcPmeReciprocalForceKernel(const std::string& name, const Platform& platform) : CalcPmeReciprocalForceKernel(name, platform),
-            hasCreatedPlan(false), isDeleted(false), realGrid(NULL), complexGrid(NULL) {
+            isDeleted(false) {
    }
    /**
     * Initialize the kernel.
@@ -104,24 +104,24 @@ public:
    void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
 private:
    /**
-     * Select a size for one grid dimension that FFTW can handle efficiently.
+     * Select a size for one grid dimension that PocketFFT can handle efficiently.
     */
-    int findFFTDimension(int minimum, bool isZ);
+    int findFFTDimension(int minimum);
    static bool hasInitializedThreads;
    static int numThreads;
    int gridx, gridy, gridz, numParticles;
    double alpha;
    bool deterministic;
-    bool hasCreatedPlan, isFinished, isDeleted;
+    bool isFinished, isDeleted;
    std::vector<float> force;
    std::vector<float> bsplineModuli[3];
    std::vector<float> recipEterm;
    Vec3 lastBoxVectors[3];
    std::vector<float> threadEnergy;
-    std::vector<float*> tempGrid;
+    std::vector<std::vector<float> > realGrids;
-    float* realGrid;
+    std::vector<std::complex<float> > complexGrid;
-    fftwf_complex* complexGrid;
+    std::vector<std::size_t> gridShape, fftAxes;
-    fftwf_plan forwardFFT, backwardFFT;
+    std::vector<std::ptrdiff_t> realGridStride, complexGridStride;
    int waitCount;
    pthread_cond_t startCondition, endCondition;
    pthread_mutex_t lock;
@@ -139,13 +139,13 @@ private:
 /**
 * This is an optimized CPU implementation of CalcDispersionPmeReciprocalForceKernel.  It is both
- * vectorized (requiring SSE 4.1) and multithreaded.  It uses FFTW to perform the FFTs.
+ * vectorized (requiring SSE 4.1) and multithreaded.  It uses PocketFFT to perform the FFTs.
 */
 class OPENMM_EXPORT_PME CpuCalcDispersionPmeReciprocalForceKernel : public CalcDispersionPmeReciprocalForceKernel {
 public:
    CpuCalcDispersionPmeReciprocalForceKernel(const std::string& name, const Platform& platform) : CalcDispersionPmeReciprocalForceKernel(name, platform),
-            hasCreatedPlan(false), isDeleted(false), realGrid(NULL), complexGrid(NULL)  {
+            isDeleted(false) {
    }
    /**
     * Initialize the kernel.
@@ -198,24 +198,24 @@ public:
 private:
    class ComputeTask;
    /**
-     * Select a size for one grid dimension that FFTW can handle efficiently.
+     * Select a size for one grid dimension that PocketFFT can handle efficiently.
     */
-    int findFFTDimension(int minimum, bool isZ);
+    int findFFTDimension(int minimum);
    static bool hasInitializedThreads;
    static int numThreads;
    int gridx, gridy, gridz, numParticles;
    double alpha;
    bool deterministic;
-    bool hasCreatedPlan, isFinished, isDeleted;
+    bool isFinished, isDeleted;
    std::vector<float> force;
    std::vector<float> bsplineModuli[3];
    std::vector<float> recipEterm;
    Vec3 lastBoxVectors[3];
    std::vector<float> threadEnergy;
-    std::vector<float*> tempGrid;
+    std::vector<std::vector<float> > realGrids;
-    float* realGrid;
+    std::vector<std::complex<float> > complexGrid;
-    fftwf_complex* complexGrid;
+    std::vector<std::size_t> gridShape, fftAxes;
-    fftwf_plan forwardFFT, backwardFFT;
+    std::vector<std::ptrdiff_t> realGridStride, complexGridStride;
    int waitCount;
    pthread_cond_t startCondition, endCondition;
    pthread_mutex_t lock;

--- a/plugins/rpmd/platforms/reference/src/ReferenceRpmdKernels.cpp
+++ b/plugins/rpmd/platforms/reference/src/ReferenceRpmdKernels.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2011-2020 Stanford University and the Authors.      *
+ * Portions copyright (c) 2011-2022 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -33,6 +33,11 @@
 #include "openmm/OpenMMException.h"
 #include "openmm/internal/ContextImpl.h"
 #include "SimTKOpenMMUtilities.h"
+#ifdef _MSC_VER
+  #define POCKETFFT_NO_VECTORS
+#endif
+#include "pocketfft_hdronly.h"
+#include <complex>
 using namespace OpenMM;
 using namespace std;
@@ -52,14 +57,6 @@ static vector<Vec3>& extractForces(ContextImpl& context) {
    return *data->forces;
 }
-ReferenceIntegrateRPMDStepKernel::~ReferenceIntegrateRPMDStepKernel() {
-    if (fft != NULL)
-        fftpack_destroy(fft);
-    for (auto& c : contractionFFT)
-        if (c.second != NULL)
-            fftpack_destroy(c.second);
-}
 void ReferenceIntegrateRPMDStepKernel::initialize(const System& system, const RPMDIntegrator& integrator) {
    int numCopies = integrator.getNumCopies();
    int numParticles = system.getNumParticles();
@@ -71,7 +68,6 @@ void ReferenceIntegrateRPMDStepKernel::initialize(const System& system, const RP
        velocities[i].resize(numParticles);
        forces[i].resize(numParticles);
    }
-    fftpack_init_1d(&fft, numCopies);
    SimTKOpenMMUtilities::setRandomNumberSeed((unsigned int) integrator.getRandomNumberSeed());
    // Build a list of contractions.
@@ -89,8 +85,6 @@ void ReferenceIntegrateRPMDStepKernel::initialize(const System& system, const RP
        if (copies != numCopies) {
            if (groupsByCopies.find(copies) == groupsByCopies.end()) {
                groupsByCopies[copies] = 1<<group;
-                contractionFFT[copies] = NULL;
-                fftpack_init_1d(&contractionFFT[copies], copies);
                if (copies > maxContractedCopies)
                    maxContractedCopies = copies;
            }
@@ -128,8 +122,8 @@ void ReferenceIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDI
    // Apply the PILE-L thermostat.
-    vector<t_complex> v(numCopies);
+    vector<complex<double>> v(numCopies);
-    vector<t_complex> q(numCopies);
+    vector<complex<double>> q(numCopies);
    const double hbar = 1.054571628e-34*AVOGADRO/(1000*1e-12);
    const double scale = 1.0/sqrt((double) numCopies);
    const double nkT = numCopies*BOLTZ*integrator.getTemperature();
@@ -143,12 +137,12 @@ void ReferenceIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDI
            const double c3_0 = c2_0*sqrt(nkT/system.getParticleMass(particle));
            for (int component = 0; component < 3; component++) {
                for (int k = 0; k < numCopies; k++)
-                    v[k] = t_complex(scale*velocities[k][particle][component], 0.0);
+                    v[k] = complex<double>(scale*velocities[k][particle][component], 0.0);
-                fftpack_exec_1d(fft, FFTPACK_FORWARD, &v[0], &v[0]);
+                pocketfft::c2c({(size_t) numCopies}, {sizeof(complex<double>)}, {sizeof(complex<double>)}, {0}, true, v.data(), v.data(), 1.0, 0);
                // Apply a local Langevin thermostat to the centroid mode.
-                v[0].re = v[0].re*c1_0 + c3_0*SimTKOpenMMUtilities::getNormallyDistributedRandomNumber();
+                v[0].real(v[0].real()*c1_0 + c3_0*SimTKOpenMMUtilities::getNormallyDistributedRandomNumber());
                // Use critical damping white noise for the remaining modes.
@@ -160,13 +154,13 @@ void ReferenceIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDI
                    const double c3 = c2*sqrt(nkT/system.getParticleMass(particle));
                    double rand1 = c3*SimTKOpenMMUtilities::getNormallyDistributedRandomNumber();
                    double rand2 = (isCenter ? 0.0 : c3*SimTKOpenMMUtilities::getNormallyDistributedRandomNumber());
-                    v[k] = v[k]*c1 + t_complex(rand1, rand2);
+                    v[k] = v[k]*c1 + complex<double>(rand1, rand2);
                    if (k < numCopies-k)
-                        v[numCopies-k] = v[numCopies-k]*c1 + t_complex(rand1, -rand2);
+                        v[numCopies-k] = v[numCopies-k]*c1 + complex<double>(rand1, -rand2);
                }
-                fftpack_exec_1d(fft, FFTPACK_BACKWARD, &v[0], &v[0]);
+                pocketfft::c2c({(size_t) numCopies}, {sizeof(complex<double>)}, {sizeof(complex<double>)}, {0}, false, v.data(), v.data(), 1.0, 0);
                for (int k = 0; k < numCopies; k++)
-                    velocities[k][particle][component] = scale*v[k].re;
+                    velocities[k][particle][component] = scale*v[k].real();
            }
        }
    }
@@ -185,11 +179,11 @@ void ReferenceIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDI
            continue;
        for (int component = 0; component < 3; component++) {
            for (int k = 0; k < numCopies; k++) {
-                q[k] = t_complex(scale*positions[k][particle][component], 0.0);
+                q[k] = complex<double>(scale*positions[k][particle][component], 0.0);
-                v[k] = t_complex(scale*velocities[k][particle][component], 0.0);
+                v[k] = complex<double>(scale*velocities[k][particle][component], 0.0);
            }
-            fftpack_exec_1d(fft, FFTPACK_FORWARD, &q[0], &q[0]);
+            pocketfft::c2c({(size_t) numCopies}, {sizeof(complex<double>)}, {sizeof(complex<double>)}, {0}, true, q.data(), q.data(), 1.0, 0);
-            fftpack_exec_1d(fft, FFTPACK_FORWARD, &v[0], &v[0]);
+            pocketfft::c2c({(size_t) numCopies}, {sizeof(complex<double>)}, {sizeof(complex<double>)}, {0}, true, v.data(), v.data(), 1.0, 0);
            q[0] += v[0]*dt;
            for (int k = 1; k < numCopies; k++) {
                const double wk = twown*sin(k*M_PI/numCopies);
@@ -197,15 +191,15 @@ void ReferenceIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDI
                const double coswt = cos(wt);
                const double sinwt = sin(wt);
                const double wm = wk*system.getParticleMass(particle);
-                const t_complex vprime = v[k]*coswt - q[k]*(wk*sinwt); // Advance velocity from t to t+dt
+                const complex<double> vprime = v[k]*coswt - q[k]*(wk*sinwt); // Advance velocity from t to t+dt
                q[k] = v[k]*(sinwt/wk) + q[k]*coswt; // Advance position from t to t+dt
                v[k] = vprime;
            }
-            fftpack_exec_1d(fft, FFTPACK_BACKWARD, &q[0], &q[0]);
+            pocketfft::c2c({(size_t) numCopies}, {sizeof(complex<double>)}, {sizeof(complex<double>)}, {0}, false, q.data(), q.data(), 1.0, 0);
-            fftpack_exec_1d(fft, FFTPACK_BACKWARD, &v[0], &v[0]);
+            pocketfft::c2c({(size_t) numCopies}, {sizeof(complex<double>)}, {sizeof(complex<double>)}, {0}, false, v.data(), v.data(), 1.0, 0);
            for (int k = 0; k < numCopies; k++) {
-                positions[k][particle][component] = scale*q[k].re;
+                positions[k][particle][component] = scale*q[k].real();
-                velocities[k][particle][component] = scale*v[k].re;
+                velocities[k][particle][component] = scale*v[k].real();
            }
        }
    }
@@ -230,12 +224,12 @@ void ReferenceIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDI
            const double c3_0 = c2_0*sqrt(nkT/system.getParticleMass(particle));
            for (int component = 0; component < 3; component++) {
                for (int k = 0; k < numCopies; k++)
-                    v[k] = t_complex(scale*velocities[k][particle][component], 0.0);
+                    v[k] = complex<double>(scale*velocities[k][particle][component], 0.0);
-                fftpack_exec_1d(fft, FFTPACK_FORWARD, &v[0], &v[0]);
+                pocketfft::c2c({(size_t) numCopies}, {sizeof(complex<double>)}, {sizeof(complex<double>)}, {0}, true, v.data(), v.data(), 1.0, 0);
                // Apply a local Langevin thermostat to the centroid mode.
-                v[0].re = v[0].re*c1_0 + c3_0*SimTKOpenMMUtilities::getNormallyDistributedRandomNumber();
+                v[0].real(v[0].real()*c1_0 + c3_0*SimTKOpenMMUtilities::getNormallyDistributedRandomNumber());
                // Use critical damping white noise for the remaining modes.
@@ -247,13 +241,13 @@ void ReferenceIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDI
                    const double c3 = c2*sqrt(nkT/system.getParticleMass(particle));
                    double rand1 = c3*SimTKOpenMMUtilities::getNormallyDistributedRandomNumber();
                    double rand2 = (isCenter ? 0.0 : c3*SimTKOpenMMUtilities::getNormallyDistributedRandomNumber());
-                    v[k] = v[k]*c1 + t_complex(rand1, rand2);
+                    v[k] = v[k]*c1 + complex<double>(rand1, rand2);
                    if (k < numCopies-k)
-                        v[numCopies-k] = v[numCopies-k]*c1 + t_complex(rand1, -rand2);
+                        v[numCopies-k] = v[numCopies-k]*c1 + complex<double>(rand1, -rand2);
                }
-                fftpack_exec_1d(fft, FFTPACK_BACKWARD, &v[0], &v[0]);
+                pocketfft::c2c({(size_t) numCopies}, {sizeof(complex<double>)}, {sizeof(complex<double>)}, {0}, false, v.data(), v.data(), 1.0, 0);
                for (int k = 0; k < numCopies; k++)
-                    velocities[k][particle][component] = scale*v[k].re;
+                    velocities[k][particle][component] = scale*v[k].real();
            }
        }
    }
@@ -294,28 +288,27 @@ void ReferenceIntegrateRPMDStepKernel::computeForces(ContextImpl& context, const
    for (auto& g : groupsByCopies) {
        int copies = g.first;
        int groupFlags = g.second;
-        fftpack* shortFFT = contractionFFT[copies];
        // Find the contracted positions.
-        vector<t_complex> q(totalCopies);
+        vector<complex<double>> q(totalCopies);
        const double scale1 = 1.0/totalCopies;
        for (int particle = 0; particle < numParticles; particle++) {
            for (int component = 0; component < 3; component++) {
                // Transform to the frequency domain, set high frequency components to zero, and transform back.
                for (int k = 0; k < totalCopies; k++)
-                    q[k] = t_complex(positions[k][particle][component], 0.0);
+                    q[k] = complex<double>(positions[k][particle][component], 0.0);
-                fftpack_exec_1d(fft, FFTPACK_FORWARD, &q[0], &q[0]);
+                pocketfft::c2c({(size_t) totalCopies}, {sizeof(complex<double>)}, {sizeof(complex<double>)}, {0}, true, q.data(), q.data(), 1.0, 0);
                if (copies > 1) {
                    int start = (copies+1)/2;
                    int end = totalCopies-copies+start;
                    for (int k = end; k < totalCopies; k++)
                        q[k-(totalCopies-copies)] = q[k];
-                    fftpack_exec_1d(shortFFT, FFTPACK_BACKWARD, &q[0], &q[0]);
+                    pocketfft::c2c({(size_t) copies}, {sizeof(complex<double>)}, {sizeof(complex<double>)}, {0}, false, q.data(), q.data(), 1.0, 0);
                }
                for (int k = 0; k < copies; k++)
-                    contractedPositions[k][particle][component] = scale1*q[k].re;
+                    contractedPositions[k][particle][component] = scale1*q[k].real();
            }
        }
@@ -336,18 +329,18 @@ void ReferenceIntegrateRPMDStepKernel::computeForces(ContextImpl& context, const
                // Transform to the frequency domain, pad with zeros, and transform back.
                for (int k = 0; k < copies; k++)
-                    q[k] = t_complex(contractedForces[k][particle][component], 0.0);
+                    q[k] = complex<double>(contractedForces[k][particle][component], 0.0);
                if (copies > 1)
-                    fftpack_exec_1d(shortFFT, FFTPACK_FORWARD, &q[0], &q[0]);
+                    pocketfft::c2c({(size_t) copies}, {sizeof(complex<double>)}, {sizeof(complex<double>)}, {0}, true, q.data(), q.data(), 1.0, 0);
                int start = (copies+1)/2;
                int end = totalCopies-copies+start;
                for (int k = end; k < totalCopies; k++)
                    q[k] = q[k-(totalCopies-copies)];
                for (int k = start; k < end; k++)
-                    q[k] = t_complex(0, 0);
+                    q[k] = complex<double>(0, 0);
-                fftpack_exec_1d(fft, FFTPACK_BACKWARD, &q[0], &q[0]);
+                pocketfft::c2c({(size_t) totalCopies}, {sizeof(complex<double>)}, {sizeof(complex<double>)}, {0}, false, q.data(), q.data(), 1.0, 0);
                for (int k = 0; k < totalCopies; k++)
-                    forces[k][particle][component] += scale2*q[k].re;
+                    forces[k][particle][component] += scale2*q[k].real();
            }
        }
    }

--- a/plugins/rpmd/platforms/reference/src/ReferenceRpmdKernels.h
+++ b/plugins/rpmd/platforms/reference/src/ReferenceRpmdKernels.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2011-2013 Stanford University and the Authors.      *
+ * Portions copyright (c) 2011-2022 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -35,7 +35,6 @@
 #include "ReferencePlatform.h"
 #include "openmm/RpmdKernels.h"
 #include "openmm/Vec3.h"
-#include "fftpack.h"
 namespace OpenMM {
@@ -46,9 +45,8 @@ namespace OpenMM {
 class ReferenceIntegrateRPMDStepKernel : public IntegrateRPMDStepKernel {
 public:
    ReferenceIntegrateRPMDStepKernel(const std::string& name, const Platform& platform) :
-            IntegrateRPMDStepKernel(name, platform), fft(NULL) {
+            IntegrateRPMDStepKernel(name, platform) {
    }
-    ~ReferenceIntegrateRPMDStepKernel();
    /**
     * Initialize the kernel.
     *
@@ -93,8 +91,6 @@ private:
    std::vector<std::vector<Vec3> > contractedForces;
    std::map<int, int> groupsByCopies;
    int groupsNotContracted;
-    fftpack* fft;
-    std::map<int, fftpack*> contractionFFT;
 };
 } // namespace OpenMM