Merge pull request #14 from peastman/contraction

Implemented RPMD ring polymer contraction

Merge pull request #14 from peastman/contraction
Implemented RPMD ring polymer contraction
bd4876cc · Yutong Zhao · 5db397d0 · ffdce362 · bd4876cc · bd4876cc
Commit bd4876cc authored Jun 03, 2013 by Yutong Zhao
16 changed files
--- a/plugins/rpmd/openmmapi/include/openmm/RPMDIntegrator.h
+++ b/plugins/rpmd/openmmapi/include/openmm/RPMDIntegrator.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2013 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -37,6 +37,7 @@
 #include "openmm/State.h"
 #include "openmm/Vec3.h"
 #include "openmm/internal/windowsExportRpmd.h"
+#include <map>
 namespace OpenMM {
@@ -52,6 +53,14 @@ namespace OpenMM {
 * to set them for specific copies of the System.  Similarly, you should retrieve state information
 * for particular copies by calling getState() on the Integrator.  Do not query the Context for
 * state information.
+ * 
+ * You can optionally specify a set of "ring polymer contractions", by which different force
+ * groups are evaluated on different numbers of copies, instead of computing every force on
+ * every copy.  This can be much more efficient, since different forces may vary widely in
+ * how many times they must be evaluated to produce sufficient accuracy.  For example, you
+ * might simulate a 32 copy ring polymer and evaluate bonded forces on every copy, but contract
+ * it down to only 6 copies for computing nonbonded interactions, and down to only a single
+ * copy (the centroid) for computing the reciprocal space part of PME.
 */
 class OPENMM_EXPORT_RPMD RPMDIntegrator : public Integrator {
@@ -65,6 +74,19 @@ public:
     * @param stepSize       the step size with which to integrator the system (in picoseconds)
     */
    RPMDIntegrator(int numCopies, double temperature, double frictionCoeff, double stepSize);
+    /**
+     * Create a RPMDIntegrator.
+     *
+     * @param numCopies      the number of copies of the system that should be simulated
+     * @param temperature    the temperature of the heat bath (in Kelvin)
+     * @param frictionCoeff  the friction coefficient which couples the system to the heat bath (in inverse picoseconds)
+     * @param stepSize       the step size with which to integrator the system (in picoseconds)
+     * @param contractions   the ring polymer contractions to use for evaluating different force groups.  Each key in the
+     *                       map is the index of a force group, and the corresponding value is the number of copies to evaluate
+     *                       that force group on.  If no entry is provided for a force group (the default), it is evaluated
+     *                       independently on every copy.
+     */
+    RPMDIntegrator(int numCopies, double temperature, double frictionCoeff, double stepSize, const std::map<int, int>& contractions);
    /**
     * Get the number of copies of the system being simulated.
     */
@@ -122,6 +144,15 @@ public:
    void setRandomNumberSeed(int seed) {
        randomNumberSeed = seed;
    }
+    /**
+     * Get the ring polymer contractions to use for evaluating different force groups.  Each key in the
+     * map is the index of a force group, and the corresponding value is the number of copies to evaluate
+     * that force group on.  If no entry is provided for a force group, it is evaluated independently on
+     * every copy.
+     */
+    const std::map<int, int>& getContractions() const {
+        return contractions;
+    }
    /**
     * Set the positions of all particles in one copy of the system.
     * 
@@ -182,6 +213,7 @@ protected:
 private:
    double temperature, friction;
    int numCopies, randomNumberSeed;
+    std::map<int, int> contractions;
    bool forcesAreValid, hasSetPosition, hasSetVelocity, isFirstStep;
    Kernel kernel;
 };

--- a/plugins/rpmd/openmmapi/src/RPMDIntegrator.cpp
+++ b/plugins/rpmd/openmmapi/src/RPMDIntegrator.cpp
@@ -39,15 +39,23 @@
 #include <string>
 using namespace OpenMM;
-using std::string;
+using namespace std;
-using std::vector;
+RPMDIntegrator::RPMDIntegrator(int numCopies, double temperature, double frictionCoeff, double stepSize, const map<int, int>& contractions) :
+        numCopies(numCopies), contractions(contractions), forcesAreValid(false), hasSetPosition(false), hasSetVelocity(false), isFirstStep(true) {
+    setTemperature(temperature);
+    setFriction(frictionCoeff);
+    setStepSize(stepSize);
+    setConstraintTolerance(1e-5);
+    setRandomNumberSeed((int) time(NULL));
+}
 RPMDIntegrator::RPMDIntegrator(int numCopies, double temperature, double frictionCoeff, double stepSize) :
        numCopies(numCopies), forcesAreValid(false), hasSetPosition(false), hasSetVelocity(false), isFirstStep(true) {
    setTemperature(temperature);
    setFriction(frictionCoeff);
    setStepSize(stepSize);
-    setConstraintTolerance(1e-4);
+    setConstraintTolerance(1e-5);
    setRandomNumberSeed((int) time(NULL));
 }

--- a/plugins/rpmd/platforms/cuda/src/CudaRpmdKernels.cpp
+++ b/plugins/rpmd/platforms/cuda/src/CudaRpmdKernels.cpp
@@ -69,6 +69,10 @@ CudaIntegrateRPMDStepKernel::~CudaIntegrateRPMDStepKernel() {
        delete positions;
    if (velocities != NULL)
        delete velocities;
+    if (contractedForces != NULL)
+        delete contractedForces;
+    if (contractedPositions != NULL)
+        delete contractedPositions;
 }
 void CudaIntegrateRPMDStepKernel::initialize(const System& system, const RPMDIntegrator& integrator) {
@@ -106,6 +110,34 @@ void CudaIntegrateRPMDStepKernel::initialize(const System& system, const RPMDInt
            temp[i] = make_float4(0, 0, 0, 1);
        velocities->upload(temp);
    }
+    // Build a list of contractions.
+    groupsNotContracted = -1;
+    const map<int, int>& contractions = integrator.getContractions();
+    int maxContractedCopies = 0;
+    for (map<int, int>::const_iterator iter = contractions.begin(); iter != contractions.end(); ++iter) {
+        int group = iter->first;
+        int copies = iter->second;
+        if (group < 0 || group > 31)
+            throw OpenMMException("RPMDIntegrator: Force group must be between 0 and 31");
+        if (copies < 0 || copies > numCopies)
+            throw OpenMMException("RPMDIntegrator: Number of copies for contraction cannot be greater than the total number of copies being simulated");
+        if (copies != numCopies) {
+            if (groupsByCopies.find(copies) == groupsByCopies.end()) {
+                groupsByCopies[copies] = 1<<group;
+                groupsNotContracted -= 1<<group;
+                if (copies > maxContractedCopies)
+                    maxContractedCopies = copies;
+            }
+            else
+                groupsByCopies[copies] |= 1<<group;
+        }
+    }
+    if (maxContractedCopies > 0) {
+        contractedForces = CudaArray::create<long long>(cu, maxContractedCopies*paddedParticles*3, "rpmdContractedForces");
+        contractedPositions = new CudaArray(cu, maxContractedCopies*paddedParticles, elementSize, "rpmdContractedPositions");
+    }
    // Create kernels.
@@ -129,6 +161,23 @@ void CudaIntegrateRPMDStepKernel::initialize(const System& system, const RPMDInt
    copyToContextKernel = cu.getKernel(module, "copyDataToContext");
    copyFromContextKernel = cu.getKernel(module, "copyDataFromContext");
    translateKernel = cu.getKernel(module, "applyCellTranslations");
+    // Create kernels for doing contractions.
+    for (map<int, int>::const_iterator iter = groupsByCopies.begin(); iter != groupsByCopies.end(); ++iter) {
+        int copies = iter->first;
+        replacements.clear();
+        replacements["NUM_CONTRACTED_COPIES"] = cu.intToString(copies);
+        replacements["POS_SCALE"] = cu.doubleToString(1.0/numCopies);
+        replacements["FORCE_SCALE"] = cu.doubleToString(0x100000000/(double) copies);
+        replacements["FFT_Q_FORWARD"] = createFFT(numCopies, "q", true);
+        replacements["FFT_Q_BACKWARD"] = createFFT(copies, "q", false);
+        replacements["FFT_F_FORWARD"] = createFFT(copies, "f", true);
+        replacements["FFT_F_BACKWARD"] = createFFT(numCopies, "f", false);
+        module = cu.createModule(cu.replaceStrings(CudaKernelSources::vectorOps+CudaRpmdKernelSources::rpmdContraction, replacements), defines, "");
+        positionContractionKernels[copies] = cu.getKernel(module, "contractPositions");
+        forceContractionKernels[copies] = cu.getKernel(module, "contractForces");
+    }
 }
 void CudaIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegrator& integrator, bool forcesAreValid) {
@@ -191,17 +240,49 @@ void CudaIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegr
 }
 void CudaIntegrateRPMDStepKernel::computeForces(ContextImpl& context) {
+    // Compute forces from all groups that didn't have a specified contraction.
    for (int i = 0; i < numCopies; i++) {
        void* copyToContextArgs[] = {&velocities->getDevicePointer(), &cu.getVelm().getDevicePointer(), &positions->getDevicePointer(),
                &cu.getPosq().getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &i};
        cu.executeKernel(copyToContextKernel, copyToContextArgs, cu.getNumAtoms());
        context.computeVirtualSites();
        context.updateContextState();
-        context.calcForcesAndEnergy(true, false);
+        context.calcForcesAndEnergy(true, false, groupsNotContracted);
        void* copyFromContextArgs[] = {&cu.getForce().getDevicePointer(), &forces->getDevicePointer(), &cu.getVelm().getDevicePointer(),
                &velocities->getDevicePointer(), &cu.getPosq().getDevicePointer(), &positions->getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &i};
        cu.executeKernel(copyFromContextKernel, copyFromContextArgs, cu.getNumAtoms());
    }
+    // Now loop over contractions and compute forces from them.
+    for (map<int, int>::const_iterator iter = groupsByCopies.begin(); iter != groupsByCopies.end(); ++iter) {
+        int copies = iter->first;
+        int groupFlags = iter->second;
+        // Find the contracted positions.
+        void* contractPosArgs[] = {&positions->getDevicePointer(), &contractedPositions->getDevicePointer()};
+        cu.executeKernel(positionContractionKernels[copies], contractPosArgs, numParticles*numCopies, workgroupSize);
+        // Compute forces.
+        for (int i = 0; i < copies; i++) {
+            void* copyToContextArgs[] = {&velocities->getDevicePointer(), &cu.getVelm().getDevicePointer(), &contractedPositions->getDevicePointer(),
+                    &cu.getPosq().getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &i};
+            cu.executeKernel(copyToContextKernel, copyToContextArgs, cu.getNumAtoms());
+            context.computeVirtualSites();
+            context.calcForcesAndEnergy(true, false, groupFlags);
+            void* copyFromContextArgs[] = {&cu.getForce().getDevicePointer(), &contractedForces->getDevicePointer(), &cu.getVelm().getDevicePointer(),
+                   &velocities->getDevicePointer(), &cu.getPosq().getDevicePointer(), &contractedPositions->getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &i};
+            cu.executeKernel(copyFromContextKernel, copyFromContextArgs, cu.getNumAtoms());
+        }
+        // Apply the forces to the original copies.
+        void* contractForceArgs[] = {&forces->getDevicePointer(), &contractedForces->getDevicePointer()};
+        cu.executeKernel(forceContractionKernels[copies], contractForceArgs, numParticles*numCopies, workgroupSize);
+    }
 }
 double CudaIntegrateRPMDStepKernel::computeKineticEnergy(ContextImpl& context, const RPMDIntegrator& integrator) {

--- a/plugins/rpmd/platforms/cuda/src/CudaRpmdKernels.h
+++ b/plugins/rpmd/platforms/cuda/src/CudaRpmdKernels.h
@@ -35,6 +35,7 @@
 #include "openmm/RpmdKernels.h"
 #include "CudaContext.h"
 #include "CudaArray.h"
+#include <map>
 namespace OpenMM {
@@ -45,7 +46,7 @@ namespace OpenMM {
 class CudaIntegrateRPMDStepKernel : public IntegrateRPMDStepKernel {
 public:
    CudaIntegrateRPMDStepKernel(std::string name, const Platform& platform, CudaContext& cu) :
-            IntegrateRPMDStepKernel(name, platform), cu(cu), forces(NULL), positions(NULL), velocities(NULL) {
+            IntegrateRPMDStepKernel(name, platform), cu(cu), forces(NULL), positions(NULL), velocities(NULL), contractedForces(NULL), contractedPositions(NULL) {
    }
    ~CudaIntegrateRPMDStepKernel();
    /**
@@ -88,10 +89,16 @@ private:
    std::string createFFT(int size, const std::string& variable, bool forward);
    CudaContext& cu;
    int numCopies, numParticles, workgroupSize;
+    std::map<int, int> groupsByCopies;
+    int groupsNotContracted;
    CudaArray* forces;
    CudaArray* positions;
    CudaArray* velocities;
+    CudaArray* contractedForces;
+    CudaArray* contractedPositions;
    CUfunction pileKernel, stepKernel, velocitiesKernel, copyToContextKernel, copyFromContextKernel, translateKernel;
+    std::map<int, CUfunction> positionContractionKernels;
+    std::map<int, CUfunction> forceContractionKernels;
 };
 } // namespace OpenMM

--- a/plugins/rpmd/platforms/cuda/src/kernels/rpmdContraction.cu
+++ b/plugins/rpmd/platforms/cuda/src/kernels/rpmdContraction.cu
+__device__ mixed3 multiplyComplexRealPart(mixed2 c1, mixed3 c2r, mixed3 c2i) {
+    return c1.x*c2r-c1.y*c2i;
+}
+__device__ mixed3 multiplyComplexImagPart(mixed2 c1, mixed3 c2r, mixed3 c2i) {
+    return c1.x*c2i+c1.y*c2r;
+}
+__device__ mixed3 multiplyComplexRealPartConj(mixed2 c1, mixed3 c2r, mixed3 c2i) {
+    return c1.x*c2r+c1.y*c2i;
+}
+__device__ mixed3 multiplyComplexImagPartConj(mixed2 c1, mixed3 c2r, mixed3 c2i) {
+    return c1.x*c2i-c1.y*c2r;
+}
+/**
+ * Compute the contracted positions
+ */
+extern "C" __global__ void contractPositions(mixed4* posq, mixed4* contracted) {
+    const int numBlocks = (blockDim.x*gridDim.x)/NUM_COPIES;
+    const int blockStart = NUM_COPIES*(threadIdx.x/NUM_COPIES);
+    const int indexInBlock = threadIdx.x-blockStart;
+    __shared__ mixed3 q[2*THREAD_BLOCK_SIZE];
+    __shared__ mixed3 temp[2*THREAD_BLOCK_SIZE];
+    __shared__ mixed2 w[NUM_COPIES];
+    mixed3* qreal = &q[blockStart];
+    mixed3* qimag = &q[blockStart+blockDim.x];
+    mixed3* tempreal = &temp[blockStart];
+    mixed3* tempimag = &temp[blockStart+blockDim.x];
+    if (threadIdx.x < NUM_COPIES)
+        w[indexInBlock] = make_mixed2(cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES));
+    __syncthreads();
+    for (int particle = (blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
+        // Load the particle position.
+        mixed4 particlePosq = posq[particle+indexInBlock*PADDED_NUM_ATOMS];
+        qreal[indexInBlock] = make_mixed3(particlePosq.x, particlePosq.y, particlePosq.z);
+        qimag[indexInBlock] = make_mixed3(0);
+        // Forward FFT.
+        __syncthreads();
+        FFT_Q_FORWARD
+        if (NUM_CONTRACTED_COPIES > 1) {
+            // Compress the data to remove high frequencies.
+            int start = (NUM_CONTRACTED_COPIES+1)/2;
+            tempreal[indexInBlock] = qreal[indexInBlock];
+            tempimag[indexInBlock] = qimag[indexInBlock];
+            __syncthreads();
+            if (indexInBlock < NUM_CONTRACTED_COPIES) {
+                qreal[indexInBlock] = tempreal[indexInBlock < start ? indexInBlock : indexInBlock+(NUM_COPIES-NUM_CONTRACTED_COPIES)];
+                qimag[indexInBlock] = tempimag[indexInBlock < start ? indexInBlock : indexInBlock+(NUM_COPIES-NUM_CONTRACTED_COPIES)];
+            }
+            __syncthreads();
+            FFT_Q_BACKWARD
+        }
+        // Store results.
+        if (indexInBlock < NUM_CONTRACTED_COPIES)
+            contracted[particle+indexInBlock*PADDED_NUM_ATOMS] = make_mixed4(POS_SCALE*qreal[indexInBlock].x, POS_SCALE*qreal[indexInBlock].y, POS_SCALE*qreal[indexInBlock].z, particlePosq.w);
+    }
+}
+/**
+ * Apply the contracted forces to all copies.
+ */
+extern "C" __global__ void contractForces(long long* force, long long* contracted) {
+    const int numBlocks = (blockDim.x*gridDim.x)/NUM_COPIES;
+    const int blockStart = NUM_COPIES*(threadIdx.x/NUM_COPIES);
+    const int indexInBlock = threadIdx.x-blockStart;
+    const mixed forceScale = 1/(mixed) 0x100000000;
+    __shared__ mixed3 f[2*THREAD_BLOCK_SIZE];
+    __shared__ mixed3 temp[2*THREAD_BLOCK_SIZE];
+    __shared__ mixed2 w[NUM_COPIES];
+    mixed3* freal = &f[blockStart];
+    mixed3* fimag = &f[blockStart+blockDim.x];
+    mixed3* tempreal = &temp[blockStart];
+    mixed3* tempimag = &temp[blockStart+blockDim.x];
+    if (threadIdx.x < NUM_COPIES)
+        w[indexInBlock] = make_mixed2(cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES));
+    __syncthreads();
+    for (int particle = (blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
+        // Load the force.
+        int forceIndex = particle+indexInBlock*PADDED_NUM_ATOMS*3;
+        if (indexInBlock < NUM_CONTRACTED_COPIES) {
+            freal[indexInBlock] = make_mixed3(contracted[forceIndex]*forceScale, contracted[forceIndex+PADDED_NUM_ATOMS]*forceScale, contracted[forceIndex+PADDED_NUM_ATOMS*2]*forceScale);
+            fimag[indexInBlock] = make_mixed3(0);
+        }
+        __syncthreads();
+        // Forward FFT.
+        if (NUM_CONTRACTED_COPIES > 1) {
+            FFT_F_FORWARD
+        }
+        // Set the high frequency components to 0.
+        int start = (NUM_CONTRACTED_COPIES+1)/2;
+        int end = NUM_COPIES-NUM_CONTRACTED_COPIES+start;
+        tempreal[indexInBlock] = freal[indexInBlock];
+        tempimag[indexInBlock] = fimag[indexInBlock];
+        __syncthreads();
+        if (indexInBlock >= start) {
+            freal[indexInBlock] = (indexInBlock < end ? make_mixed3(0) : tempreal[indexInBlock-(NUM_COPIES-NUM_CONTRACTED_COPIES)]);
+            fimag[indexInBlock] = (indexInBlock < end ? make_mixed3(0) : tempimag[indexInBlock-(NUM_COPIES-NUM_CONTRACTED_COPIES)]);
+        }
+        __syncthreads();
+        FFT_F_BACKWARD
+        // Store results.
+        force[forceIndex] = (long long) (FORCE_SCALE*freal[indexInBlock].x);
+        force[forceIndex+PADDED_NUM_ATOMS] = (long long) (FORCE_SCALE*freal[indexInBlock].y);
+        force[forceIndex+PADDED_NUM_ATOMS*2] = (long long) (FORCE_SCALE*freal[indexInBlock].z);
+    }
+}
--- a/plugins/rpmd/platforms/cuda/tests/TestCudaRpmd.cpp
+++ b/plugins/rpmd/platforms/cuda/tests/TestCudaRpmd.cpp
@@ -355,6 +355,82 @@ void testVirtualSites() {
    ASSERT_USUALLY_EQUAL_TOL(expectedKE, meanKE, 1e-2);
 }
+void testContractions() {
+    const int gridSize = 3;
+    const int numMolecules = gridSize*gridSize*gridSize;
+    const int numParticles = numMolecules*2;
+    const int numCopies = 10;
+    const double spacing = 2.0;
+    const double cutoff = 3.0;
+    const double boxSize = spacing*(gridSize+1);
+    const double temperature = 300.0;
+    System system;
+    system.setDefaultPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
+    HarmonicBondForce* bonds = new HarmonicBondForce();
+    system.addForce(bonds);
+    NonbondedForce* nonbonded = new NonbondedForce();
+    nonbonded->setCutoffDistance(cutoff);
+    nonbonded->setNonbondedMethod(NonbondedForce::PME);
+    nonbonded->setForceGroup(1);
+    nonbonded->setReciprocalSpaceForceGroup(2);
+    system.addForce(nonbonded);
+    // Create a cloud of molecules.
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+    vector<Vec3> positions(numParticles);
+    for (int i = 0; i < numMolecules; i++) {
+        system.addParticle(1.0);
+        system.addParticle(1.0);
+        nonbonded->addParticle(-0.2, 0.2, 0.2);
+        nonbonded->addParticle(0.2, 0.2, 0.2);
+        nonbonded->addException(2*i, 2*i+1, 0, 1, 0);
+        bonds->addBond(2*i, 2*i+1, 1.0, 10000.0);
+    }
+    map<int, int> contractions;
+    contractions[1] = 3;
+    contractions[2] = 1;
+    RPMDIntegrator integ(numCopies, temperature, 10.0, 0.001, contractions);
+    Platform& platform = Platform::getPlatformByName("CUDA");
+    Context context(system, integ, platform);
+    for (int copy = 0; copy < numCopies; copy++) {
+        for (int i = 0; i < gridSize; i++)
+            for (int j = 0; j < gridSize; j++)
+                for (int k = 0; k < gridSize; k++) {
+                    Vec3 pos = Vec3(spacing*(i+0.02*genrand_real2(sfmt)), spacing*(j+0.02*genrand_real2(sfmt)), spacing*(k+0.02*genrand_real2(sfmt)));
+                    int index = k+gridSize*(j+gridSize*i);
+                    positions[2*index] = pos;
+                    positions[2*index+1] = Vec3(pos[0]+1.0, pos[1], pos[2]);
+                }
+        integ.setPositions(copy, positions);
+    }
+    // Check the temperature.
+    const int numSteps = 1000;
+    integ.step(1000);
+    vector<double> ke(numCopies, 0.0);
+    for (int i = 0; i < numSteps; i++) {
+        integ.step(1);
+        vector<State> state(numCopies);
+        for (int j = 0; j < numCopies; j++)
+            state[j] = integ.getState(j, State::Velocities, true);
+        for (int j = 0; j < numParticles; j++) {
+            for (int k = 0; k < numCopies; k++) {
+                Vec3 v = state[k].getVelocities()[j];
+                ke[k] += 0.5*system.getParticleMass(j)*v.dot(v);
+            }
+        }
+    }
+    double meanKE = 0.0;
+    for (int i = 0; i < numCopies; i++)
+        meanKE += ke[i];
+    meanKE /= numSteps*numCopies;
+    double expectedKE = 0.5*numCopies*numParticles*3*BOLTZ*temperature;
+    ASSERT_USUALLY_EQUAL_TOL(expectedKE, meanKE, 1e-2);
+}
 int main(int argc, char* argv[]) {
    try {
        registerRPMDCudaKernelFactories();
@@ -364,6 +440,7 @@ int main(int argc, char* argv[]) {
        testParaHydrogen();
        testCMMotionRemoval();
        testVirtualSites();
+        testContractions();
    }
    catch(const std::exception& e) {
        std::cout << "exception: " << e.what() << std::endl;

--- a/plugins/rpmd/platforms/opencl/src/OpenCLRpmdKernels.cpp
+++ b/plugins/rpmd/platforms/opencl/src/OpenCLRpmdKernels.cpp
@@ -48,6 +48,10 @@ OpenCLIntegrateRPMDStepKernel::~OpenCLIntegrateRPMDStepKernel() {
        delete positions;
    if (velocities != NULL)
        delete velocities;
+    if (contractedForces != NULL)
+        delete contractedForces;
+    if (contractedPositions != NULL)
+        delete contractedPositions;
 }
 void OpenCLIntegrateRPMDStepKernel::initialize(const System& system, const RPMDIntegrator& integrator) {
@@ -86,6 +90,34 @@ void OpenCLIntegrateRPMDStepKernel::initialize(const System& system, const RPMDI
            temp[i] = mm_float4(0, 0, 0, 1);
        velocities->upload(temp);
    }
+    // Build a list of contractions.
+    groupsNotContracted = -1;
+    const map<int, int>& contractions = integrator.getContractions();
+    int maxContractedCopies = 0;
+    for (map<int, int>::const_iterator iter = contractions.begin(); iter != contractions.end(); ++iter) {
+        int group = iter->first;
+        int copies = iter->second;
+        if (group < 0 || group > 31)
+            throw OpenMMException("RPMDIntegrator: Force group must be between 0 and 31");
+        if (copies < 0 || copies > numCopies)
+            throw OpenMMException("RPMDIntegrator: Number of copies for contraction cannot be greater than the total number of copies being simulated");
+        if (copies != numCopies) {
+            if (groupsByCopies.find(copies) == groupsByCopies.end()) {
+                groupsByCopies[copies] = 1<<group;
+                groupsNotContracted -= 1<<group;
+                if (copies > maxContractedCopies)
+                    maxContractedCopies = copies;
+            }
+            else
+                groupsByCopies[copies] |= 1<<group;
+        }
+    }
+    if (maxContractedCopies > 0) {
+        contractedForces = new OpenCLArray(cl, maxContractedCopies*paddedParticles, forceElementSize, "rpmdContractedForces");
+        contractedPositions = new OpenCLArray(cl, maxContractedCopies*paddedParticles, elementSize, "rpmdContractedPositions");
+    }
    // Create kernels.
@@ -109,6 +141,23 @@ void OpenCLIntegrateRPMDStepKernel::initialize(const System& system, const RPMDI
    copyToContextKernel = cl::Kernel(program, "copyDataToContext");
    copyFromContextKernel = cl::Kernel(program, "copyDataFromContext");
    translateKernel = cl::Kernel(program, "applyCellTranslations");
+    // Create kernels for doing contractions.
+    for (map<int, int>::const_iterator iter = groupsByCopies.begin(); iter != groupsByCopies.end(); ++iter) {
+        int copies = iter->first;
+        replacements.clear();
+        replacements["NUM_CONTRACTED_COPIES"] = cl.intToString(copies);
+        replacements["POS_SCALE"] = cl.doubleToString(1.0/numCopies);
+        replacements["FORCE_SCALE"] = cl.doubleToString(1.0/copies);
+        replacements["FFT_Q_FORWARD"] = createFFT(numCopies, "q", true);
+        replacements["FFT_Q_BACKWARD"] = createFFT(copies, "q", false);
+        replacements["FFT_F_FORWARD"] = createFFT(copies, "f", true);
+        replacements["FFT_F_BACKWARD"] = createFFT(numCopies, "f", false);
+        program = cl.createProgram(cl.replaceStrings(OpenCLRpmdKernelSources::rpmdContraction, replacements), defines, "");
+        positionContractionKernels[copies] = cl::Kernel(program, "contractPositions");
+        forceContractionKernels[copies] = cl::Kernel(program, "contractForces");
+    }
 }
 void OpenCLIntegrateRPMDStepKernel::initializeKernels(ContextImpl& context) {
@@ -124,16 +173,20 @@ void OpenCLIntegrateRPMDStepKernel::initializeKernels(ContextImpl& context) {
    translateKernel.setArg<cl::Buffer>(2, cl.getAtomIndexArray().getDeviceBuffer());
    copyToContextKernel.setArg<cl::Buffer>(0, velocities->getDeviceBuffer());
    copyToContextKernel.setArg<cl::Buffer>(1, cl.getVelm().getDeviceBuffer());
-    copyToContextKernel.setArg<cl::Buffer>(2, positions->getDeviceBuffer());
    copyToContextKernel.setArg<cl::Buffer>(3, cl.getPosq().getDeviceBuffer());
    copyToContextKernel.setArg<cl::Buffer>(4, cl.getAtomIndexArray().getDeviceBuffer());
    copyFromContextKernel.setArg<cl::Buffer>(0, cl.getForce().getDeviceBuffer());
-    copyFromContextKernel.setArg<cl::Buffer>(1, forces->getDeviceBuffer());
    copyFromContextKernel.setArg<cl::Buffer>(2, cl.getVelm().getDeviceBuffer());
    copyFromContextKernel.setArg<cl::Buffer>(3, velocities->getDeviceBuffer());
    copyFromContextKernel.setArg<cl::Buffer>(4, cl.getPosq().getDeviceBuffer());
-    copyFromContextKernel.setArg<cl::Buffer>(5, positions->getDeviceBuffer());
    copyFromContextKernel.setArg<cl::Buffer>(6, cl.getAtomIndexArray().getDeviceBuffer());
+    for (map<int, int>::const_iterator iter = groupsByCopies.begin(); iter != groupsByCopies.end(); ++iter) {
+        int copies = iter->first;
+        positionContractionKernels[copies].setArg<cl::Buffer>(0, positions->getDeviceBuffer());
+        positionContractionKernels[copies].setArg<cl::Buffer>(1, contractedPositions->getDeviceBuffer());
+        forceContractionKernels[copies].setArg<cl::Buffer>(0, forces->getDeviceBuffer());
+        forceContractionKernels[copies].setArg<cl::Buffer>(1, contractedForces->getDeviceBuffer());
+    }
 }
 void OpenCLIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegrator& integrator, bool forcesAreValid) {
@@ -201,15 +254,51 @@ void OpenCLIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDInte
 }
 void OpenCLIntegrateRPMDStepKernel::computeForces(ContextImpl& context) {
+    // Compute forces from all groups that didn't have a specified contraction.
+    copyToContextKernel.setArg<cl::Buffer>(2, positions->getDeviceBuffer());
+    copyFromContextKernel.setArg<cl::Buffer>(1, forces->getDeviceBuffer());
+    copyFromContextKernel.setArg<cl::Buffer>(5, positions->getDeviceBuffer());
    for (int i = 0; i < numCopies; i++) {
        copyToContextKernel.setArg<cl_int>(5, i);
        cl.executeKernel(copyToContextKernel, cl.getNumAtoms());
        context.computeVirtualSites();
        context.updateContextState();
-        context.calcForcesAndEnergy(true, false);
+        context.calcForcesAndEnergy(true, false, groupsNotContracted);
        copyFromContextKernel.setArg<cl_int>(7, i);
        cl.executeKernel(copyFromContextKernel, cl.getNumAtoms());
    }
+    // Now loop over contractions and compute forces from them.
+    if (groupsByCopies.size() > 0) {
+        copyToContextKernel.setArg<cl::Buffer>(2, contractedPositions->getDeviceBuffer());
+        copyFromContextKernel.setArg<cl::Buffer>(1, contractedForces->getDeviceBuffer());
+        copyFromContextKernel.setArg<cl::Buffer>(5, contractedPositions->getDeviceBuffer());
+        for (map<int, int>::const_iterator iter = groupsByCopies.begin(); iter != groupsByCopies.end(); ++iter) {
+            int copies = iter->first;
+            int groupFlags = iter->second;
+            // Find the contracted positions.
+            cl.executeKernel(positionContractionKernels[copies], numParticles*numCopies, workgroupSize);
+            // Compute forces.
+            for (int i = 0; i < copies; i++) {
+                copyToContextKernel.setArg<cl_int>(5, i);
+                cl.executeKernel(copyToContextKernel, cl.getNumAtoms());
+                context.computeVirtualSites();
+                context.calcForcesAndEnergy(true, false, groupFlags);
+                copyFromContextKernel.setArg<cl_int>(7, i);
+                cl.executeKernel(copyFromContextKernel, cl.getNumAtoms());
+            }
+            // Apply the forces to the original copies.
+            cl.executeKernel(forceContractionKernels[copies], numParticles*numCopies, workgroupSize);
+        }
+    }
 }
 double OpenCLIntegrateRPMDStepKernel::computeKineticEnergy(ContextImpl& context, const RPMDIntegrator& integrator) {
@@ -269,6 +358,7 @@ void OpenCLIntegrateRPMDStepKernel::setVelocities(int copy, const vector<Vec3>&
 void OpenCLIntegrateRPMDStepKernel::copyToContext(int copy, ContextImpl& context) {
    if (!hasInitializedKernel)
        initializeKernels(context);
+    copyToContextKernel.setArg<cl::Buffer>(2, positions->getDeviceBuffer());
    copyToContextKernel.setArg<cl_int>(5, copy);
    cl.executeKernel(copyToContextKernel, cl.getNumAtoms());
 }

--- a/plugins/rpmd/platforms/opencl/src/OpenCLRpmdKernels.h
+++ b/plugins/rpmd/platforms/opencl/src/OpenCLRpmdKernels.h
@@ -35,7 +35,7 @@
 #include "openmm/RpmdKernels.h"
 #include "OpenCLContext.h"
 #include "OpenCLArray.h"
+#include <map>
 namespace OpenMM {
 /**
@@ -45,7 +45,7 @@ namespace OpenMM {
 class OpenCLIntegrateRPMDStepKernel : public IntegrateRPMDStepKernel {
 public:
    OpenCLIntegrateRPMDStepKernel(std::string name, const Platform& platform, OpenCLContext& cl) :
-            IntegrateRPMDStepKernel(name, platform), cl(cl), hasInitializedKernel(false), forces(NULL), positions(NULL), velocities(NULL) {
+            IntegrateRPMDStepKernel(name, platform), cl(cl), hasInitializedKernel(false), forces(NULL), positions(NULL), velocities(NULL), contractedForces(NULL), contractedPositions(NULL) {
    }
    ~OpenCLIntegrateRPMDStepKernel();
    /**
@@ -90,10 +90,16 @@ private:
    OpenCLContext& cl;
    bool hasInitializedKernel;
    int numCopies, numParticles, workgroupSize;
+    std::map<int, int> groupsByCopies;
+    int groupsNotContracted;
    OpenCLArray* forces;
    OpenCLArray* positions;
    OpenCLArray* velocities;
+    OpenCLArray* contractedForces;
+    OpenCLArray* contractedPositions;
    cl::Kernel pileKernel, stepKernel, velocitiesKernel, copyToContextKernel, copyFromContextKernel, translateKernel;
+    std::map<int, cl::Kernel> positionContractionKernels;
+    std::map<int, cl::Kernel> forceContractionKernels;
 };
 } // namespace OpenMM

--- a/plugins/rpmd/platforms/opencl/src/kernels/rpmdContraction.cl
+++ b/plugins/rpmd/platforms/opencl/src/kernels/rpmdContraction.cl
+mixed4 multiplyComplexRealPart(mixed2 c1, mixed4 c2r, mixed4 c2i) {
+    return c1.x*c2r-c1.y*c2i;
+}
+mixed4 multiplyComplexImagPart(mixed2 c1, mixed4 c2r, mixed4 c2i) {
+    return c1.x*c2i+c1.y*c2r;
+}
+mixed4 multiplyComplexRealPartConj(mixed2 c1, mixed4 c2r, mixed4 c2i) {
+    return c1.x*c2r+c1.y*c2i;
+}
+mixed4 multiplyComplexImagPartConj(mixed2 c1, mixed4 c2r, mixed4 c2i) {
+    return c1.x*c2i-c1.y*c2r;
+}
+/**
+ * Compute the contracted positions
+ */
+__kernel void contractPositions(__global mixed4* posq, __global mixed4* contracted) {
+    const int numBlocks = get_global_size(0)/NUM_COPIES;
+    const int blockStart = NUM_COPIES*(get_local_id(0)/NUM_COPIES);
+    const int indexInBlock = get_local_id(0)-blockStart;
+    __local mixed4 q[2*THREAD_BLOCK_SIZE];
+    __local mixed4 temp[2*THREAD_BLOCK_SIZE];
+    __local mixed2 w[NUM_COPIES];
+    __local mixed4* qreal = &q[blockStart];
+    __local mixed4* qimag = &q[blockStart+get_local_size(0)];
+    __local mixed4* tempreal = &temp[blockStart];
+    __local mixed4* tempimag = &temp[blockStart+get_local_size(0)];
+    if (get_local_id(0) < NUM_COPIES)
+        w[indexInBlock] = (mixed2) (cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES));
+    barrier(CLK_LOCAL_MEM_FENCE);
+    for (int particle = get_global_id(0)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
+        // Load the particle position.
+        mixed4 particlePosq = convert_mixed4(posq[particle+indexInBlock*PADDED_NUM_ATOMS]);
+        qreal[indexInBlock] = particlePosq;
+        qimag[indexInBlock] = (mixed4) (0.0f, 0.0f, 0.0f, 0.0f);
+        // Forward FFT.
+        barrier(CLK_LOCAL_MEM_FENCE);
+        FFT_Q_FORWARD
+        if (NUM_CONTRACTED_COPIES > 1) {
+            // Compress the data to remove high frequencies.
+            int start = (NUM_CONTRACTED_COPIES+1)/2;
+            tempreal[indexInBlock] = qreal[indexInBlock];
+            tempimag[indexInBlock] = qimag[indexInBlock];
+            barrier(CLK_LOCAL_MEM_FENCE);
+            if (indexInBlock < NUM_CONTRACTED_COPIES) {
+                qreal[indexInBlock] = tempreal[indexInBlock < start ? indexInBlock : indexInBlock+(NUM_COPIES-NUM_CONTRACTED_COPIES)];
+                qimag[indexInBlock] = tempimag[indexInBlock < start ? indexInBlock : indexInBlock+(NUM_COPIES-NUM_CONTRACTED_COPIES)];
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+            FFT_Q_BACKWARD
+        }
+        // Store results.
+        if (indexInBlock < NUM_CONTRACTED_COPIES)
+            contracted[particle+indexInBlock*PADDED_NUM_ATOMS] = (mixed4) (POS_SCALE*qreal[indexInBlock].x, POS_SCALE*qreal[indexInBlock].y, POS_SCALE*qreal[indexInBlock].z, particlePosq.w);
+    }
+}
+/**
+ * Apply the contracted forces to all copies.
+ */
+__kernel void contractForces(__global real4* force, __global real4* contracted) {
+    const int numBlocks = get_global_size(0)/NUM_COPIES;
+    const int blockStart = NUM_COPIES*(get_local_id(0)/NUM_COPIES);
+    const int indexInBlock = get_local_id(0)-blockStart;
+    __local mixed4 f[2*THREAD_BLOCK_SIZE];
+    __local mixed4 temp[2*THREAD_BLOCK_SIZE];
+    __local mixed2 w[NUM_COPIES];
+    __local mixed4* freal = &f[blockStart];
+    __local mixed4* fimag = &f[blockStart+get_local_size(0)];
+    __local mixed4* tempreal = &temp[blockStart];
+    __local mixed4* tempimag = &temp[blockStart+get_local_size(0)];
+    if (get_local_id(0) < NUM_COPIES)
+        w[indexInBlock] = (mixed2) (cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES));
+    barrier(CLK_LOCAL_MEM_FENCE);
+    for (int particle = get_global_id(0)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
+        // Load the force.
+        int index = particle+indexInBlock*PADDED_NUM_ATOMS;
+        if (indexInBlock < NUM_CONTRACTED_COPIES) {
+            freal[indexInBlock] = convert_mixed4(contracted[index]);
+            fimag[indexInBlock] = (mixed4) (0.0f, 0.0f, 0.0f, 0.0f);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+        // Forward FFT.
+        if (NUM_CONTRACTED_COPIES > 1) {
+            FFT_F_FORWARD
+        }
+        // Set the high frequency components to 0.
+        int start = (NUM_CONTRACTED_COPIES+1)/2;
+        int end = NUM_COPIES-NUM_CONTRACTED_COPIES+start;
+        tempreal[indexInBlock] = freal[indexInBlock];
+        tempimag[indexInBlock] = fimag[indexInBlock];
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if (indexInBlock >= start) {
+            freal[indexInBlock] = (indexInBlock < end ? (mixed4) (0.0f, 0.0f, 0.0f, 0.0f) : tempreal[indexInBlock-(NUM_COPIES-NUM_CONTRACTED_COPIES)]);
+            fimag[indexInBlock] = (indexInBlock < end ? (mixed4) (0.0f, 0.0f, 0.0f, 0.0f) : tempimag[indexInBlock-(NUM_COPIES-NUM_CONTRACTED_COPIES)]);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+        FFT_F_BACKWARD
+        // Store results.
+        force[index] = FORCE_SCALE*freal[indexInBlock];
+    }
+}
--- a/plugins/rpmd/platforms/opencl/tests/TestOpenCLRpmd.cpp
+++ b/plugins/rpmd/platforms/opencl/tests/TestOpenCLRpmd.cpp
@@ -356,6 +356,82 @@ void testVirtualSites() {
    ASSERT_USUALLY_EQUAL_TOL(expectedKE, meanKE, 1e-2);
 }
+void testContractions() {
+    const int gridSize = 3;
+    const int numMolecules = gridSize*gridSize*gridSize;
+    const int numParticles = numMolecules*2;
+    const int numCopies = 10;
+    const double spacing = 2.0;
+    const double cutoff = 3.0;
+    const double boxSize = spacing*(gridSize+1);
+    const double temperature = 300.0;
+    System system;
+    system.setDefaultPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
+    HarmonicBondForce* bonds = new HarmonicBondForce();
+    system.addForce(bonds);
+    NonbondedForce* nonbonded = new NonbondedForce();
+    nonbonded->setCutoffDistance(cutoff);
+    nonbonded->setNonbondedMethod(NonbondedForce::PME);
+    nonbonded->setForceGroup(1);
+    nonbonded->setReciprocalSpaceForceGroup(2);
+    system.addForce(nonbonded);
+    // Create a cloud of molecules.
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+    vector<Vec3> positions(numParticles);
+    for (int i = 0; i < numMolecules; i++) {
+        system.addParticle(1.0);
+        system.addParticle(1.0);
+        nonbonded->addParticle(-0.2, 0.2, 0.2);
+        nonbonded->addParticle(0.2, 0.2, 0.2);
+        nonbonded->addException(2*i, 2*i+1, 0, 1, 0);
+        bonds->addBond(2*i, 2*i+1, 1.0, 10000.0);
+    }
+    map<int, int> contractions;
+    contractions[1] = 3;
+    contractions[2] = 1;
+    RPMDIntegrator integ(numCopies, temperature, 10.0, 0.001, contractions);
+    Platform& platform = Platform::getPlatformByName("OpenCL");
+    Context context(system, integ, platform);
+    for (int copy = 0; copy < numCopies; copy++) {
+        for (int i = 0; i < gridSize; i++)
+            for (int j = 0; j < gridSize; j++)
+                for (int k = 0; k < gridSize; k++) {
+                    Vec3 pos = Vec3(spacing*(i+0.02*genrand_real2(sfmt)), spacing*(j+0.02*genrand_real2(sfmt)), spacing*(k+0.02*genrand_real2(sfmt)));
+                    int index = k+gridSize*(j+gridSize*i);
+                    positions[2*index] = pos;
+                    positions[2*index+1] = Vec3(pos[0]+1.0, pos[1], pos[2]);
+                }
+        integ.setPositions(copy, positions);
+    }
+    // Check the temperature.
+    const int numSteps = 1000;
+    integ.step(1000);
+    vector<double> ke(numCopies, 0.0);
+    for (int i = 0; i < numSteps; i++) {
+        integ.step(1);
+        vector<State> state(numCopies);
+        for (int j = 0; j < numCopies; j++)
+            state[j] = integ.getState(j, State::Velocities, true);
+        for (int j = 0; j < numParticles; j++) {
+            for (int k = 0; k < numCopies; k++) {
+                Vec3 v = state[k].getVelocities()[j];
+                ke[k] += 0.5*system.getParticleMass(j)*v.dot(v);
+            }
+        }
+    }
+    double meanKE = 0.0;
+    for (int i = 0; i < numCopies; i++)
+        meanKE += ke[i];
+    meanKE /= numSteps*numCopies;
+    double expectedKE = 0.5*numCopies*numParticles*3*BOLTZ*temperature;
+    ASSERT_USUALLY_EQUAL_TOL(expectedKE, meanKE, 1e-2);
+}
 int main(int argc, char* argv[]) {
    try {
        registerRPMDOpenCLKernelFactories();
@@ -365,6 +441,7 @@ int main(int argc, char* argv[]) {
        testParaHydrogen();
        testCMMotionRemoval();
        testVirtualSites();
+        testContractions();
    }
    catch(const std::exception& e) {
        std::cout << "exception: " << e.what() << std::endl;

--- a/plugins/rpmd/platforms/reference/src/ReferenceRpmdKernels.cpp
+++ b/plugins/rpmd/platforms/reference/src/ReferenceRpmdKernels.cpp
@@ -30,6 +30,7 @@
 * -------------------------------------------------------------------------- */
 #include "ReferenceRpmdKernels.h"
+#include "openmm/OpenMMException.h"
 #include "openmm/internal/ContextImpl.h"
 #include "SimTKUtilities/SimTKOpenMMUtilities.h"
@@ -54,7 +55,11 @@ static vector<RealVec>& extractForces(ContextImpl& context) {
 ReferenceIntegrateRPMDStepKernel::~ReferenceIntegrateRPMDStepKernel() {
    if (fft != NULL)
        fftpack_destroy(fft);
+    for (map<int, fftpack*>::const_iterator iter = contractionFFT.begin(); iter != contractionFFT.end(); ++iter)
+        if (iter->second != NULL)
+            fftpack_destroy(iter->second);
 }
 void ReferenceIntegrateRPMDStepKernel::initialize(const System& system, const RPMDIntegrator& integrator) {
    int numCopies = integrator.getNumCopies();
    int numParticles = system.getNumParticles();
@@ -68,6 +73,41 @@ void ReferenceIntegrateRPMDStepKernel::initialize(const System& system, const RP
    }
    fftpack_init_1d(&fft, numCopies);
    SimTKOpenMMUtilities::setRandomNumberSeed((unsigned int) integrator.getRandomNumberSeed());
+    // Build a list of contractions.
+    groupsNotContracted = -1;
+    const map<int, int>& contractions = integrator.getContractions();
+    int maxContractedCopies = 0;
+    for (map<int, int>::const_iterator iter = contractions.begin(); iter != contractions.end(); ++iter) {
+        int group = iter->first;
+        int copies = iter->second;
+        if (group < 0 || group > 31)
+            throw OpenMMException("RPMDIntegrator: Force group must be between 0 and 31");
+        if (copies < 0 || copies > numCopies)
+            throw OpenMMException("RPMDIntegrator: Number of copies for contraction cannot be greater than the total number of copies being simulated");
+        if (copies != numCopies) {
+            if (groupsByCopies.find(copies) == groupsByCopies.end()) {
+                groupsByCopies[copies] = 1<<group;
+                groupsNotContracted -= 1<<group;
+                contractionFFT[copies] = NULL;
+                fftpack_init_1d(&contractionFFT[copies], copies);
+                if (copies > maxContractedCopies)
+                    maxContractedCopies = copies;
+            }
+            else
+                groupsByCopies[copies] |= 1<<group;
+        }
+    }
+    // Create workspace for doing contractions.
+    contractedPositions.resize(maxContractedCopies);
+    contractedForces.resize(maxContractedCopies);
+    for (int i = 0; i < maxContractedCopies; i++) {
+        contractedPositions[i].resize(numParticles);
+        contractedForces[i].resize(numParticles);
+    }
 }
 void ReferenceIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegrator& integrator, bool forcesAreValid) {
@@ -82,15 +122,8 @@ void ReferenceIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDI
    // Loop over copies and compute the force on each one.
-    if (!forcesAreValid) {
+    if (!forcesAreValid)
-        for (int i = 0; i < numCopies; i++) {
+        computeForces(context, integrator);
-            pos = positions[i];
-            vel = velocities[i];
-            context.computeVirtualSites();
-            context.calcForcesAndEnergy(true, false);
-            forces[i] = f;
-        }
-    }
    // Apply the PILE-L thermostat.
@@ -176,16 +209,7 @@ void ReferenceIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDI
    // Calculate forces based on the updated positions.
-    for (int i = 0; i < numCopies; i++) {
+    computeForces(context, integrator);
-        pos = positions[i];
-        vel = velocities[i];
-        context.computeVirtualSites();
-        context.updateContextState();
-        positions[i] = pos;
-        velocities[i] = vel;
-        context.calcForcesAndEnergy(true, false);
-        forces[i] = f;
-    }
    // Update velocities.
@@ -234,6 +258,90 @@ void ReferenceIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDI
    context.setTime(context.getTime()+dt);
 }
+void ReferenceIntegrateRPMDStepKernel::computeForces(ContextImpl& context, const RPMDIntegrator& integrator) {
+    const int totalCopies = positions.size();
+    const int numParticles = positions[0].size();
+    vector<RealVec>& pos = extractPositions(context);
+    vector<RealVec>& vel = extractVelocities(context);
+    vector<RealVec>& f = extractForces(context);
+    // Compute forces from all groups that didn't have a specified contraction.
+    for (int i = 0; i < totalCopies; i++) {
+        pos = positions[i];
+        vel = velocities[i];
+        context.computeVirtualSites();
+        context.updateContextState();
+        positions[i] = pos;
+        velocities[i] = vel;
+        context.calcForcesAndEnergy(true, false, groupsNotContracted);
+        forces[i] = f;
+    }
+    // Now loop over contractions and compute forces from them.
+    for (map<int, int>::const_iterator iter = groupsByCopies.begin(); iter != groupsByCopies.end(); ++iter) {
+        int copies = iter->first;
+        int groupFlags = iter->second;
+        fftpack* shortFFT = contractionFFT[copies];
+        // Find the contracted positions.
+        vector<t_complex> q(totalCopies);
+        const RealOpenMM scale1 = 1.0/totalCopies;
+        for (int particle = 0; particle < numParticles; particle++) {
+            for (int component = 0; component < 3; component++) {
+                // Transform to the frequency domain, set high frequency components to zero, and transform back.
+                for (int k = 0; k < totalCopies; k++)
+                    q[k] = t_complex(positions[k][particle][component], 0.0);
+                fftpack_exec_1d(fft, FFTPACK_FORWARD, &q[0], &q[0]);
+                if (copies > 1) {
+                    int start = (copies+1)/2;
+                    int end = totalCopies-copies+start;
+                    for (int k = end; k < totalCopies; k++)
+                        q[k-(totalCopies-copies)] = q[k];
+                    fftpack_exec_1d(shortFFT, FFTPACK_BACKWARD, &q[0], &q[0]);
+                }
+                for (int k = 0; k < copies; k++)
+                    contractedPositions[k][particle][component] = scale1*q[k].re;
+            }
+        }
+        // Compute forces.
+        for (int i = 0; i < copies; i++) {
+            pos = contractedPositions[i];
+            context.computeVirtualSites();
+            context.calcForcesAndEnergy(true, false, groupFlags);
+            contractedForces[i] = f;
+        }
+        // Apply the forces to the original copies.
+        const RealOpenMM scale2 = 1.0/copies;
+        for (int particle = 0; particle < numParticles; particle++) {
+            for (int component = 0; component < 3; component++) {
+                // Transform to the frequency domain, pad with zeros, and transform back.
+                for (int k = 0; k < copies; k++)
+                    q[k] = t_complex(contractedForces[k][particle][component], 0.0);
+                if (copies > 1)
+                    fftpack_exec_1d(shortFFT, FFTPACK_FORWARD, &q[0], &q[0]);
+                int start = (copies+1)/2;
+                int end = totalCopies-copies+start;
+                for (int k = end; k < totalCopies; k++)
+                    q[k] = q[k-(totalCopies-copies)];
+                for (int k = start; k < end; k++)
+                    q[k] = t_complex(0, 0);
+                fftpack_exec_1d(fft, FFTPACK_BACKWARD, &q[0], &q[0]);
+                for (int k = 0; k < totalCopies; k++)
+                    forces[k][particle][component] = scale2*q[k].re;
+            }
+        }
+    }
+}
 double ReferenceIntegrateRPMDStepKernel::computeKineticEnergy(ContextImpl& context, const RPMDIntegrator& integrator) {
    const System& system = context.getSystem();
    int numParticles = system.getNumParticles();

--- a/plugins/rpmd/platforms/reference/src/ReferenceRpmdKernels.h
+++ b/plugins/rpmd/platforms/reference/src/ReferenceRpmdKernels.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2011 Stanford University and the Authors.           *
+ * Portions copyright (c) 2011-2013 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -85,10 +85,16 @@ public:
     */
    void copyToContext(int copy, ContextImpl& context);
 private:
+    void computeForces(ContextImpl& context, const RPMDIntegrator& integrator);
    std::vector<std::vector<RealVec> > positions;
    std::vector<std::vector<RealVec> > velocities;
    std::vector<std::vector<RealVec> > forces;
+    std::vector<std::vector<RealVec> > contractedPositions;
+    std::vector<std::vector<RealVec> > contractedForces;
+    std::map<int, int> groupsByCopies;
+    int groupsNotContracted;
    fftpack* fft;
+    std::map<int, fftpack*> contractionFFT;
 };
 } // namespace OpenMM

--- a/plugins/rpmd/platforms/reference/tests/TestReferenceRpmd.cpp
+++ b/plugins/rpmd/platforms/reference/tests/TestReferenceRpmd.cpp
@@ -237,11 +237,88 @@ void testVirtualSites() {
    ASSERT_USUALLY_EQUAL_TOL(expectedKE, meanKE, 1e-2);
 }
+void testContractions() {
+    const int gridSize = 3;
+    const int numMolecules = gridSize*gridSize*gridSize;
+    const int numParticles = numMolecules*2;
+    const int numCopies = 10;
+    const double spacing = 2.0;
+    const double cutoff = 3.0;
+    const double boxSize = spacing*(gridSize+1);
+    const double temperature = 300.0;
+    System system;
+    system.setDefaultPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
+    HarmonicBondForce* bonds = new HarmonicBondForce();
+    system.addForce(bonds);
+    NonbondedForce* nonbonded = new NonbondedForce();
+    nonbonded->setCutoffDistance(cutoff);
+    nonbonded->setNonbondedMethod(NonbondedForce::PME);
+    nonbonded->setForceGroup(1);
+    nonbonded->setReciprocalSpaceForceGroup(2);
+    system.addForce(nonbonded);
+    // Create a cloud of molecules.
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+    vector<Vec3> positions(numParticles);
+    for (int i = 0; i < numMolecules; i++) {
+        system.addParticle(1.0);
+        system.addParticle(1.0);
+        nonbonded->addParticle(-0.2, 0.2, 0.2);
+        nonbonded->addParticle(0.2, 0.2, 0.2);
+        nonbonded->addException(2*i, 2*i+1, 0, 1, 0);
+        bonds->addBond(2*i, 2*i+1, 1.0, 10000.0);
+    }
+    map<int, int> contractions;
+    contractions[1] = 3;
+    contractions[2] = 1;
+    RPMDIntegrator integ(numCopies, temperature, 10.0, 0.001, contractions);
+    Platform& platform = Platform::getPlatformByName("Reference");
+    Context context(system, integ, platform);
+    for (int copy = 0; copy < numCopies; copy++) {
+        for (int i = 0; i < gridSize; i++)
+            for (int j = 0; j < gridSize; j++)
+                for (int k = 0; k < gridSize; k++) {
+                    Vec3 pos = Vec3(spacing*(i+0.02*genrand_real2(sfmt)), spacing*(j+0.02*genrand_real2(sfmt)), spacing*(k+0.02*genrand_real2(sfmt)));
+                    int index = k+gridSize*(j+gridSize*i);
+                    positions[2*index] = pos;
+                    positions[2*index+1] = Vec3(pos[0]+1.0, pos[1], pos[2]);
+                }
+        integ.setPositions(copy, positions);
+    }
+    // Check the temperature.
+    const int numSteps = 1000;
+    integ.step(1000);
+    vector<double> ke(numCopies, 0.0);
+    for (int i = 0; i < numSteps; i++) {
+        integ.step(1);
+        vector<State> state(numCopies);
+        for (int j = 0; j < numCopies; j++)
+            state[j] = integ.getState(j, State::Velocities, true);
+        for (int j = 0; j < numParticles; j++) {
+            for (int k = 0; k < numCopies; k++) {
+                Vec3 v = state[k].getVelocities()[j];
+                ke[k] += 0.5*system.getParticleMass(j)*v.dot(v);
+            }
+        }
+    }
+    double meanKE = 0.0;
+    for (int i = 0; i < numCopies; i++)
+        meanKE += ke[i];
+    meanKE /= numSteps*numCopies;
+    double expectedKE = 0.5*numCopies*numParticles*3*BOLTZ*temperature;
+    ASSERT_USUALLY_EQUAL_TOL(expectedKE, meanKE, 1e-2);
+}
 int main() {
    try {
        testFreeParticles();
        testCMMotionRemoval();
        testVirtualSites();
+        testContractions();
    }
    catch(const std::exception& e) {
        std::cout << "exception: " << e.what() << std::endl;

--- a/wrappers/python/src/swig_doxygen/OpenMM.i
+++ b/wrappers/python/src/swig_doxygen/OpenMM.i
@@ -31,6 +31,7 @@ namespace std {
  %template(vectorstring) vector<string>;
  %template(mapstringstring) map<string,string>;
  %template(mapstringdouble) map<string,double>;
+  %template(mapii) map<int,int>;
 };
 %include "windows.i"

--- a/wrappers/python/src/swig_doxygen/swigInputConfig.py
+++ b/wrappers/python/src/swig_doxygen/swigInputConfig.py
@@ -446,5 +446,6 @@ UNITS = {
 ("DrudeLangevinIntegrator", "getDrudeTemperature") : ("unit.kelvin", ()),
 ("DrudeLangevinIntegrator", "getDrudeFriction") : ("1/unit.picosecond", ()),
 ("DrudeSCFIntegrator", "getMinimizationErrorTolerance") : ("unit.kilojoules_per_mole/unit.nanometer", ()),
+("RPMDIntegrator", "getContractions") : (None, ()),
 }
--- a/wrappers/python/src/swig_doxygen/swig_lib/python/pythoncode.i
+++ b/wrappers/python/src/swig_doxygen/swig_lib/python/pythoncode.i
@@ -282,16 +282,9 @@ def stripUnits(args):
            # JDC: End workaround.
            #arg=arg.value_in_unit_system(unit.md_unit_system)
        elif isinstance(arg, dict):
-            newArg = {}
+            newKeys = stripUnits(arg.keys())
-            for key in arg:
+            newValues = stripUnits(arg.values())
-                newKey = key
+            arg = dict(zip(newKeys, newValues))
-                newValue = arg[key]
-                if not _is_string(newKey):
-                    newKey = stripUnits(newKey)
-                if not _is_string(newValue):
-                    newValue = stripUnits(newValue)
-                newArg[newKey] = newValue
-            arg = newArg
        elif not _is_string(arg):
            try:
                iter(arg)