Use lambdas for thread pool tasks

859bfd6c · peastman · 2c559596 · 859bfd6c · 859bfd6c · 859bfd6c
Commit 859bfd6c authored Feb 15, 2017 by peastman
12 changed files
--- a/platforms/cpu/src/CpuKernels.cpp
+++ b/platforms/cpu/src/CpuKernels.cpp
@@ -137,35 +137,27 @@ static double computeShiftedKineticEnergy(ContextImpl& context, vector<double>&
    return 0.5*energy;
 }

-class CpuCalcForcesAndEnergyKernel::SumForceTask : public ThreadPool::Task {
-public:
-    SumForceTask(int numParticles, vector<RealVec>& forceData, CpuPlatform::PlatformData& data) : numParticles(numParticles), forceData(forceData), data(data) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        // Sum the contributions to forces that have been calculated by different threads.
+CpuCalcForcesAndEnergyKernel::CpuCalcForcesAndEnergyKernel(std::string name, const Platform& platform, CpuPlatform::PlatformData& data, ContextImpl& context) :
+        CalcForcesAndEnergyKernel(name, platform), data(data) {
+    // Create a Reference platform version of this kernel.
    
-        int numThreads = threads.getNumThreads();
-        int start = threadIndex*numParticles/numThreads;
-        int end = (threadIndex+1)*numParticles/numThreads;
-        for (int i = start; i < end; i++) {
-            fvec4 f(0.0f);
-            for (int j = 0; j < numThreads; j++)
-                f += fvec4(&data.threadForce[j][4*i]);
-            forceData[i][0] += f[0];
-            forceData[i][1] += f[1];
-            forceData[i][2] += f[2];
-        }
-    }
-    int numParticles;
-    vector<RealVec>& forceData;
-    CpuPlatform::PlatformData& data;
-};
+    ReferenceKernelFactory referenceFactory;
+    referenceKernel = Kernel(referenceFactory.createKernelImpl(name, platform, context));
+}

-class CpuCalcForcesAndEnergyKernel::InitForceTask : public ThreadPool::Task {
-public:
-    InitForceTask(int numParticles, ContextImpl& context, CpuPlatform::PlatformData& data) : numParticles(numParticles), positionsValid(true), context(context), data(data) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
+void CpuCalcForcesAndEnergyKernel::initialize(const System& system) {
+    referenceKernel.getAs<ReferenceCalcForcesAndEnergyKernel>().initialize(system);
+    lastPositions.resize(system.getNumParticles(), Vec3(1e10, 1e10, 1e10));
+}
+
+void CpuCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
+    referenceKernel.getAs<ReferenceCalcForcesAndEnergyKernel>().beginComputation(context, includeForce, includeEnergy, groups);
+    
+    // Convert positions to single precision and clear the forces.
+
+    int numParticles = context.getSystem().getNumParticles();
+    bool positionsValid = true;
+    data.threads.execute([&] (ThreadPool& threads, int threadIndex) {
        // Convert the positions to single precision and apply periodic boundary conditions

        AlignedArray<float>& posq = data.posq;
@@ -218,36 +210,9 @@ public:
        fvec4 zero(0.0f);
        for (int j = 0; j < numParticles; j++)
            zero.store(&data.threadForce[threadIndex][j*4]);
-    }
-    int numParticles;
-    bool positionsValid;
-    ContextImpl& context;
-    CpuPlatform::PlatformData& data;
-};
-
-CpuCalcForcesAndEnergyKernel::CpuCalcForcesAndEnergyKernel(std::string name, const Platform& platform, CpuPlatform::PlatformData& data, ContextImpl& context) :
-        CalcForcesAndEnergyKernel(name, platform), data(data) {
-    // Create a Reference platform version of this kernel.
-    
-    ReferenceKernelFactory referenceFactory;
-    referenceKernel = Kernel(referenceFactory.createKernelImpl(name, platform, context));
-}
-
-void CpuCalcForcesAndEnergyKernel::initialize(const System& system) {
-    referenceKernel.getAs<ReferenceCalcForcesAndEnergyKernel>().initialize(system);
-    lastPositions.resize(system.getNumParticles(), Vec3(1e10, 1e10, 1e10));
-}
-
-void CpuCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
-    referenceKernel.getAs<ReferenceCalcForcesAndEnergyKernel>().beginComputation(context, includeForce, includeEnergy, groups);
-    
-    // Convert positions to single precision and clear the forces.
-
-    int numParticles = context.getSystem().getNumParticles();
-    InitForceTask task(numParticles, context, data);
-    data.threads.execute(task);
+    });
    data.threads.waitForThreads();
-    if (!task.positionsValid)
+    if (!positionsValid)
        throw OpenMMException("Particle coordinate is nan");

    // Determine whether we need to recompute the neighbor list.
@@ -302,8 +267,23 @@ void CpuCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool i
 double CpuCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups, bool& valid) {
    // Sum the forces from all the threads.
    
-    SumForceTask task(context.getSystem().getNumParticles(), extractForces(context), data);
-    data.threads.execute(task);
+    data.threads.execute([&] (ThreadPool& threads, int threadIndex) {
+        // Sum the contributions to forces that have been calculated by different threads.
+        
+        int numParticles = context.getSystem().getNumParticles();
+        int numThreads = threads.getNumThreads();
+        int start = threadIndex*numParticles/numThreads;
+        int end = (threadIndex+1)*numParticles/numThreads;
+        vector<RealVec>& forceData = extractForces(context);
+        for (int i = start; i < end; i++) {
+            fvec4 f(0.0f);
+            for (int j = 0; j < numThreads; j++)
+                f += fvec4(&data.threadForce[j][4*i]);
+            forceData[i][0] += f[0];
+            forceData[i][1] += f[1];
+            forceData[i][2] += f[2];
+        }
+    });
    data.threads.waitForThreads();
    return referenceKernel.getAs<ReferenceCalcForcesAndEnergyKernel>().finishComputation(context, includeForce, includeEnergy, groups, valid);
 }

--- a/platforms/cpu/src/CpuLangevinDynamics.cpp
+++ b/platforms/cpu/src/CpuLangevinDynamics.cpp

-/* Portions copyright (c) 2006-2016 Stanford University and Simbios.
+/* Portions copyright (c) 2006-2017 Stanford University and Simbios.
 * Authors: Peter Eastman
 * Contributors: 
 *
@@ -29,36 +29,6 @@
 using namespace OpenMM;
 using namespace std;

-class CpuLangevinDynamics::Update1Task : public ThreadPool::Task {
-public:
-    Update1Task(CpuLangevinDynamics& owner) : owner(owner) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadUpdate1(threadIndex);
-    }
-    CpuLangevinDynamics& owner;
-};
-
-class CpuLangevinDynamics::Update2Task : public ThreadPool::Task {
-public:
-    Update2Task(CpuLangevinDynamics& owner) : owner(owner) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadUpdate2(threadIndex);
-    }
-    CpuLangevinDynamics& owner;
-};
-
-class CpuLangevinDynamics::Update3Task : public ThreadPool::Task {
-public:
-    Update3Task(CpuLangevinDynamics& owner) : owner(owner) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadUpdate3(threadIndex);
-    }
-    CpuLangevinDynamics& owner;
-};
-
 CpuLangevinDynamics::CpuLangevinDynamics(int numberOfAtoms, RealOpenMM deltaT, RealOpenMM friction, RealOpenMM temperature, ThreadPool& threads, CpuRandom& random) : 
           ReferenceStochasticDynamics(numberOfAtoms, deltaT, friction, temperature), threads(threads), random(random) {
 }
@@ -79,8 +49,7 @@ void CpuLangevinDynamics::updatePart1(int numberOfAtoms, vector<RealVec>& atomCo
    
    // Signal the threads to start running and wait for them to finish.
    
-    Update1Task task(*this);
-    threads.execute(task);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadUpdate1(threadIndex); });
    threads.waitForThreads();
 }

@@ -97,8 +66,7 @@ void CpuLangevinDynamics::updatePart2(int numberOfAtoms, vector<RealVec>& atomCo
    
    // Signal the threads to start running and wait for them to finish.
    
-    Update2Task task(*this);
-    threads.execute(task);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadUpdate2(threadIndex); });
    threads.waitForThreads();
 }

@@ -114,8 +82,7 @@ void CpuLangevinDynamics::updatePart3(int numberOfAtoms, vector<RealVec>& atomCo
    
    // Signal the threads to start running and wait for them to finish.
    
-    Update3Task task(*this);
-    threads.execute(task);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadUpdate3(threadIndex); });
    threads.waitForThreads();
 }


--- a/platforms/cpu/src/CpuNeighborList.cpp
+++ b/platforms/cpu/src/CpuNeighborList.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2013-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2013-2017 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -409,16 +409,6 @@ private:
    vector<vector<vector<pair<float, int> > > > bins;
 };

-class CpuNeighborList::ThreadTask : public ThreadPool::Task {
-public:
-    ThreadTask(CpuNeighborList& owner) : owner(owner) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadComputeNeighborList(threads, threadIndex);
-    }
-    CpuNeighborList& owner;
-};
-
 CpuNeighborList::CpuNeighborList(int blockSize) : blockSize(blockSize) {
 }

@@ -460,8 +450,7 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float
    // Sort the atoms based on a Hilbert curve.
    
    atomBins.resize(numAtoms);
-    ThreadTask task(*this);
-    threads.execute(task);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadComputeNeighborList(threads, threadIndex); });
    threads.waitForThreads();
    sort(atomBins.begin(), atomBins.end());


--- a/platforms/cpu/src/CpuNonbondedForce.cpp
+++ b/platforms/cpu/src/CpuNonbondedForce.cpp

-/* Portions copyright (c) 2006-2015 Stanford University and Simbios.
+/* Portions copyright (c) 2006-2017 Stanford University and Simbios.
 * Contributors: Pande Group
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -41,16 +41,6 @@ using namespace OpenMM;
 const float CpuNonbondedForce::TWO_OVER_SQRT_PI = (float) (2/sqrt(PI_M));
 const int CpuNonbondedForce::NUM_TABLE_POINTS = 2048;

-class CpuNonbondedForce::ComputeDirectTask : public ThreadPool::Task {
-public:
-    ComputeDirectTask(CpuNonbondedForce& owner) : owner(owner) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.threadComputeDirect(threads, threadIndex);
-    }
-    CpuNonbondedForce& owner;
-};
-
 /**---------------------------------------------------------------------------------------

   CpuNonbondedForce constructor
@@ -318,8 +308,7 @@ void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const
    
    // Signal the threads to start running and wait for them to finish.
    
-    ComputeDirectTask task(*this);
-    threads.execute(task);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadComputeDirect(threads, threadIndex); });
    threads.waitForThreads();
    
    // Signal the threads to subtract the exclusions.

--- a/platforms/cpu/src/CpuSETTLE.cpp
+++ b/platforms/cpu/src/CpuSETTLE.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2013-2015 Stanford University and the Authors.      *
+ * Portions copyright (c) 2013-2017 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -35,52 +35,6 @@
 using namespace OpenMM;
 using namespace std;

-class CpuSETTLE::ApplyToPositionsTask : public ThreadPool::Task {
-public:
-    ApplyToPositionsTask(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& atomCoordinatesP, vector<RealOpenMM>& inverseMasses,
-            RealOpenMM tolerance, vector<ReferenceSETTLEAlgorithm*>& threadSettle) : atomCoordinates(atomCoordinates), atomCoordinatesP(atomCoordinatesP),
-            inverseMasses(inverseMasses), tolerance(tolerance), threadSettle(threadSettle) {
-        gmx_atomic_set(&atomicCounter, 0);
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        while (true) {
-            int index = gmx_atomic_fetch_add(&atomicCounter, 1);
-            if (index >= threadSettle.size())
-                break;
-            threadSettle[index]->apply(atomCoordinates, atomCoordinatesP, inverseMasses, tolerance);
-        }
-    }
-    vector<OpenMM::RealVec>& atomCoordinates;
-    vector<OpenMM::RealVec>& atomCoordinatesP;
-    vector<RealOpenMM>& inverseMasses;
-    RealOpenMM tolerance;
-    vector<ReferenceSETTLEAlgorithm*>& threadSettle;
-    gmx_atomic_t atomicCounter;
-};
-
-class CpuSETTLE::ApplyToVelocitiesTask : public ThreadPool::Task {
-public:
-    ApplyToVelocitiesTask(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& velocities, vector<RealOpenMM>& inverseMasses,
-            RealOpenMM tolerance, vector<ReferenceSETTLEAlgorithm*>& threadSettle) : atomCoordinates(atomCoordinates), velocities(velocities),
-            inverseMasses(inverseMasses), tolerance(tolerance), threadSettle(threadSettle) {
-        gmx_atomic_set(&atomicCounter, 0);
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        while (true) {
-            int index = gmx_atomic_fetch_add(&atomicCounter, 1);
-            if (index >= threadSettle.size())
-                break;
-            threadSettle[index]->applyToVelocities(atomCoordinates, velocities, inverseMasses, tolerance);
-        }
-    }
-    vector<OpenMM::RealVec>& atomCoordinates;
-    vector<OpenMM::RealVec>& velocities;
-    vector<RealOpenMM>& inverseMasses;
-    RealOpenMM tolerance;
-    vector<ReferenceSETTLEAlgorithm*>& threadSettle;
-    gmx_atomic_t atomicCounter;
-};
-
 CpuSETTLE::CpuSETTLE(const System& system, const ReferenceSETTLEAlgorithm& settle, ThreadPool& threads) : threads(threads) {
    int numBlocks = 10*threads.getNumThreads();
    int numClusters = settle.getNumClusters();
@@ -107,13 +61,29 @@ CpuSETTLE::~CpuSETTLE() {
 }

 void CpuSETTLE::apply(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& atomCoordinatesP, vector<RealOpenMM>& inverseMasses, RealOpenMM tolerance) {
-    ApplyToPositionsTask task(atomCoordinates, atomCoordinatesP, inverseMasses, tolerance, threadSettle);
-    threads.execute(task);
+    gmx_atomic_t atomicCounter;
+    gmx_atomic_set(&atomicCounter, 0);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) {
+        while (true) {
+            int index = gmx_atomic_fetch_add(&atomicCounter, 1);
+            if (index >= threadSettle.size())
+                break;
+            threadSettle[index]->apply(atomCoordinates, atomCoordinatesP, inverseMasses, tolerance);
+        }
+    });
    threads.waitForThreads();
 }

 void CpuSETTLE::applyToVelocities(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& velocities, vector<RealOpenMM>& inverseMasses, RealOpenMM tolerance) {
-    ApplyToVelocitiesTask task(atomCoordinates, velocities, inverseMasses, tolerance, threadSettle);
-    threads.execute(task);
+    gmx_atomic_t atomicCounter;
+    gmx_atomic_set(&atomicCounter, 0);
+    threads.execute([&] (ThreadPool& threads, int threadIndex) {
+        while (true) {
+            int index = gmx_atomic_fetch_add(&atomicCounter, 1);
+            if (index >= threadSettle.size())
+                break;
+            threadSettle[index]->applyToVelocities(atomCoordinates, velocities, inverseMasses, tolerance);
+        }
+    });
    threads.waitForThreads();
 }
--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -198,7 +198,6 @@ public:
     */
    void loadCheckpoint(ContextImpl& context, std::istream& stream);
 private:
-    class GetPositionsTask;
    CudaContext& cu;
 };


--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -147,11 +147,29 @@ void CudaUpdateStateDataKernel::setTime(ContextImpl& context, double time) {
        contexts[i]->setTime(time);
 }

-class CudaUpdateStateDataKernel::GetPositionsTask : public ThreadPool::Task {
-public:
-    GetPositionsTask(CudaContext& cu, vector<Vec3>& positions, vector<float4>& posCorrection) : cu(cu), positions(positions), posCorrection(posCorrection) {
+void CudaUpdateStateDataKernel::getPositions(ContextImpl& context, vector<Vec3>& positions) {
+    cu.setAsCurrent();
+    int numParticles = context.getSystem().getNumParticles();
+    positions.resize(numParticles);
+    vector<float4> posCorrection;
+    if (cu.getUseDoublePrecision()) {
+        double4* posq = (double4*) cu.getPinnedBuffer();
+        cu.getPosq().download(posq);
+    }
+    else if (cu.getUseMixedPrecision()) {
+        float4* posq = (float4*) cu.getPinnedBuffer();
+        cu.getPosq().download(posq, false);
+        posCorrection.resize(numParticles);
+        cu.getPosqCorrection().download(posCorrection);
+    }
+    else {
+        float4* posq = (float4*) cu.getPinnedBuffer();
+        cu.getPosq().download(posq);
    }
-    void execute(ThreadPool& threads, int threadIndex) {
+    
+    // Filling in the output array is done in parallel for speed.
+    
+    cu.getPlatformData().threads.execute([&] (ThreadPool& threads, int threadIndex) {
        // Compute the position of each particle to return to the user.  This is done in parallel for speed.
        
        const vector<int>& order = cu.getAtomIndex();
@@ -186,36 +204,7 @@ public:
                positions[order[i]] = Vec3(pos.x, pos.y, pos.z)-boxVectors[0]*offset.x-boxVectors[1]*offset.y-boxVectors[2]*offset.z;
            }
        }
-    }
-    CudaContext& cu;
-    vector<Vec3>& positions;
-    vector<float4>& posCorrection;
-};
-
-void CudaUpdateStateDataKernel::getPositions(ContextImpl& context, vector<Vec3>& positions) {
-    cu.setAsCurrent();
-    int numParticles = context.getSystem().getNumParticles();
-    positions.resize(numParticles);
-    vector<float4> posCorrection;
-    if (cu.getUseDoublePrecision()) {
-        double4* posq = (double4*) cu.getPinnedBuffer();
-        cu.getPosq().download(posq);
-    }
-    else if (cu.getUseMixedPrecision()) {
-        float4* posq = (float4*) cu.getPinnedBuffer();
-        cu.getPosq().download(posq, false);
-        posCorrection.resize(numParticles);
-        cu.getPosqCorrection().download(posCorrection);
-    }
-    else {
-        float4* posq = (float4*) cu.getPinnedBuffer();
-        cu.getPosq().download(posq);
-    }
-    
-    // Filling in the output array is done in parallel for speed.
-    
-    GetPositionsTask task(cu, positions, posCorrection);
-    cu.getPlatformData().threads.execute(task);
+    });
    cu.getPlatformData().threads.waitForThreads();
 }


--- a/platforms/opencl/include/OpenCLKernels.h
+++ b/platforms/opencl/include/OpenCLKernels.h
@@ -176,7 +176,6 @@ public:
     */
    void loadCheckpoint(ContextImpl& context, std::istream& stream);
 private:
-    class GetPositionsTask;
    OpenCLContext& cl;
 };


--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -171,11 +171,28 @@ void OpenCLUpdateStateDataKernel::setTime(ContextImpl& context, double time) {
        contexts[i]->setTime(time);
 }

-class OpenCLUpdateStateDataKernel::GetPositionsTask : public ThreadPool::Task {
-public:
-    GetPositionsTask(OpenCLContext& cl, vector<Vec3>& positions, vector<mm_float4>& posCorrection) : cl(cl), positions(positions), posCorrection(posCorrection) {
+void OpenCLUpdateStateDataKernel::getPositions(ContextImpl& context, vector<Vec3>& positions) {
+    int numParticles = context.getSystem().getNumParticles();
+    positions.resize(numParticles);
+    vector<mm_float4> posCorrection;
+    if (cl.getUseDoublePrecision()) {
+        mm_double4* posq = (mm_double4*) cl.getPinnedBuffer();
+        cl.getPosq().download(posq);
    }
-    void execute(ThreadPool& threads, int threadIndex) {
+    else if (cl.getUseMixedPrecision()) {
+        mm_float4* posq = (mm_float4*) cl.getPinnedBuffer();
+        cl.getPosq().download(posq, false);
+        posCorrection.resize(numParticles);
+        cl.getPosqCorrection().download(posCorrection);
+    }
+    else {
+        mm_float4* posq = (mm_float4*) cl.getPinnedBuffer();
+        cl.getPosq().download(posq);
+    }
+    
+    // Filling in the output array is done in parallel for speed.
+    
+    cl.getPlatformData().threads.execute([&] (ThreadPool& threads, int threadIndex) {
        // Compute the position of each particle to return to the user.  This is done in parallel for speed.
        
        const vector<int>& order = cl.getAtomIndex();
@@ -210,35 +227,7 @@ public:
                positions[order[i]] = Vec3(pos.x, pos.y, pos.z)-boxVectors[0]*offset.x-boxVectors[1]*offset.y-boxVectors[2]*offset.z;
            }
        }
-    }
-    OpenCLContext& cl;
-    vector<Vec3>& positions;
-    vector<mm_float4>& posCorrection;
-};
-
-void OpenCLUpdateStateDataKernel::getPositions(ContextImpl& context, vector<Vec3>& positions) {
-    int numParticles = context.getSystem().getNumParticles();
-    positions.resize(numParticles);
-    vector<mm_float4> posCorrection;
-    if (cl.getUseDoublePrecision()) {
-        mm_double4* posq = (mm_double4*) cl.getPinnedBuffer();
-        cl.getPosq().download(posq);
-    }
-    else if (cl.getUseMixedPrecision()) {
-        mm_float4* posq = (mm_float4*) cl.getPinnedBuffer();
-        cl.getPosq().download(posq, false);
-        posCorrection.resize(numParticles);
-        cl.getPosqCorrection().download(posCorrection);
-    }
-    else {
-        mm_float4* posq = (mm_float4*) cl.getPinnedBuffer();
-        cl.getPosq().download(posq);
-    }
-    
-    // Filling in the output array is done in parallel for speed.
-    
-    GetPositionsTask task(cl, positions, posCorrection);
-    cl.getPlatformData().threads.execute(task);
+    });
    cl.getPlatformData().threads.waitForThreads();
 }


--- a/platforms/reference/src/SimTKReference/ReferenceCCMAAlgorithm.cpp
+++ b/platforms/reference/src/SimTKReference/ReferenceCCMAAlgorithm.cpp

-/* Portions copyright (c) 2006-2015 Stanford University and Simbios.
+/* Portions copyright (c) 2006-2017 Stanford University and Simbios.
 * Contributors: Peter Eastman, Pande Group
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -38,42 +38,6 @@
 using namespace OpenMM;
 using namespace std;

-// This class extracts columns from the inverse matrix one at a time.  It is done in parallel,
-// since this can be very slow.
-
-class ExtractMatrixTask : public ThreadPool::Task {
-public:
-    ExtractMatrixTask(int numConstraints, vector<vector<pair<int, RealOpenMM> > >& transposedMatrix, const vector<RealOpenMM>& distance, RealOpenMM elementCutoff,
-                      const int* qRowStart, const int* qColIndex, const int* rRowStart, const int* rColIndex, const double* qValue, const double* rValue) :
-                numConstraints(numConstraints), transposedMatrix(transposedMatrix), distance(distance), elementCutoff(elementCutoff), qRowStart(qRowStart), qColIndex(qColIndex),
-                rRowStart(rRowStart), rColIndex(rColIndex), qValue(qValue), rValue(rValue) {
-    }
-
-    void execute(ThreadPool& pool, int threadIndex) {
-        vector<double> rhs(numConstraints);
-        for (int i = threadIndex; i < numConstraints; i += pool.getNumThreads()) {
-            // Extract column i of the inverse matrix.
-
-            for (int j = 0; j < numConstraints; j++)
-                rhs[j] = (i == j ? 1.0 : 0.0);
-            QUERN_multiply_with_q_transpose(numConstraints, qRowStart, qColIndex, qValue, &rhs[0]);
-            QUERN_solve_with_r(numConstraints, rRowStart, rColIndex, rValue, &rhs[0], &rhs[0]);
-            for (int j = 0; j < numConstraints; j++) {
-                double value = rhs[j]*distance[i]/distance[j];
-                if (FABS((RealOpenMM) value) > elementCutoff)
-                    transposedMatrix[i].push_back(pair<int, RealOpenMM>(j, (RealOpenMM) value));
-            }
-        }
-    }
-private:
-    int numConstraints;
-    vector<vector<pair<int, RealOpenMM> > >& transposedMatrix;
-    const vector<RealOpenMM>& distance;
-    RealOpenMM elementCutoff;
-    const int *qRowStart, *qColIndex, *rRowStart, *rColIndex;
-    const double *qValue, *rValue;
-};
-
 ReferenceCCMAAlgorithm::ReferenceCCMAAlgorithm(int numberOfAtoms,
                                               int numberOfConstraints,
                                               const vector<pair<int, int> >& atomIndices,
@@ -194,9 +158,27 @@ ReferenceCCMAAlgorithm::ReferenceCCMAAlgorithm(int numberOfAtoms,
                &qRowStart, &qColIndex, &qValue, &rRowStart, &rColIndex, &rValue);
        vector<vector<pair<int, RealOpenMM> > > transposedMatrix(numberOfConstraints);
        _matrix.resize(numberOfConstraints);
+
+        // Extract columns from the inverse matrix one at a time.  It is done in parallel,
+        // since this can be very slow.
+        
        ThreadPool threads;
-        ExtractMatrixTask task(numberOfConstraints, transposedMatrix, _distance, _elementCutoff, qRowStart, qColIndex, rRowStart, rColIndex, qValue, rValue);
-        threads.execute(task);
+        threads.execute([&] (ThreadPool& pool, int threadIndex) {
+            vector<double> rhs(numberOfConstraints);
+            for (int i = threadIndex; i < numberOfConstraints; i += pool.getNumThreads()) {
+                // Extract column i of the inverse matrix.
+
+                for (int j = 0; j < numberOfConstraints; j++)
+                    rhs[j] = (i == j ? 1.0 : 0.0);
+                QUERN_multiply_with_q_transpose(numberOfConstraints, qRowStart, qColIndex, qValue, &rhs[0]);
+                QUERN_solve_with_r(numberOfConstraints, rRowStart, rColIndex, rValue, &rhs[0], &rhs[0]);
+                for (int j = 0; j < numberOfConstraints; j++) {
+                    double value = rhs[j]*distance[i]/distance[j];
+                    if (FABS((RealOpenMM) value) > elementCutoff)
+                        transposedMatrix[i].push_back(pair<int, RealOpenMM>(j, (RealOpenMM) value));
+                }
+            }
+        });
        threads.waitForThreads();

        // For purposes of thread safety we extracted the matrix in transposed form, so we need to transpose it again.

--- a/plugins/cpupme/src/CpuPmeKernels.cpp
+++ b/plugins/cpupme/src/CpuPmeKernels.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2013-2015 Stanford University and the Authors.      *
+ * Portions copyright (c) 2013-2017 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -334,16 +334,6 @@ static void interpolateForces(float* posq, float* force, float* grid, int gridx,
    }
 }

-class CpuCalcPmeReciprocalForceKernel::ComputeTask : public ThreadPool::Task {
-public:
-    ComputeTask(CpuCalcPmeReciprocalForceKernel& owner) : owner(owner) {
-    }
-    void execute(ThreadPool& threads, int threadIndex) {
-        owner.runWorkerThread(threads, threadIndex);
-    }
-    CpuCalcPmeReciprocalForceKernel& owner;
-};
-
 static void* threadBody(void* args) {
    CpuCalcPmeReciprocalForceKernel& owner = *reinterpret_cast<CpuCalcPmeReciprocalForceKernel*>(args);
    owner.runMainThread();
@@ -483,9 +473,8 @@ void CpuCalcPmeReciprocalForceKernel::runMainThread() {
        if (isDeleted)
            break;
        posq = io->getPosq();
-        ComputeTask task(*this);
        gmx_atomic_set(&atomicCounter, 0);
-        threads.execute(task); // Signal threads to perform charge spreading.
+        threads.execute([&] (ThreadPool& threads, int threadIndex) { runWorkerThread(threads, threadIndex); }); // Signal threads to perform charge spreading.
        threads.waitForThreads();
        threads.resumeThreads(); // Signal threads to sum the charge grids.
        threads.waitForThreads();

--- a/plugins/cpupme/src/CpuPmeKernels.h
+++ b/plugins/cpupme/src/CpuPmeKernels.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2013-2015 Stanford University and the Authors.      *
+ * Portions copyright (c) 2013-2017 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -102,7 +102,6 @@ public:
     */
    void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
 private:
-    class ComputeTask;
    /**
     * Select a size for one grid dimension that FFTW can handle efficiently.
     */