Merge pull request #1381 from peastman/parallelpositions

Parallelized the code for downloading positions

Merge pull request #1381 from peastman/parallelpositions
Parallelized the code for downloading positions
1085085d · peastman · 73a59c33 · 8d293880 · 1085085d · 1085085d
Commit 1085085d authored Feb 04, 2016 by peastman
15 changed files
--- a/openmmapi/include/openmm/internal/timer.h
+++ b/openmmapi/include/openmm/internal/timer.h
+#ifndef OPENMM_TIMER_H_
+#define OPENMM_TIMER_H_
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2016 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+/**
+ * This header provides a static function for querying the current system time in seconds.
+ * It is useful when profiling.
+ */
+
+#ifdef _MSC_VER
+    #include <Windows.h>
+    static double getCurrentTime() {
+        FILETIME ft;
+        GetSystemTimeAsFileTime(&ft); // 100-nanoseconds since 1-1-1601
+        ULARGE_INTEGER result;
+        result.LowPart = ft.dwLowDateTime;
+        result.HighPart = ft.dwHighDateTime;
+        return 1e-7*result.QuadPart;
+    }
+#else
+    #include <sys/time.h> 
+    static double getCurrentTime() {
+        struct timeval tod;
+        gettimeofday(&tod, 0);
+        return tod.tv_sec+1e-6*tod.tv_usec;
+    }
+#endif
+
+#endif /*OPENMM_TIMER_H_*/
--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2015 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2016 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -192,6 +192,7 @@ public:
     */
    void loadCheckpoint(ContextImpl& context, std::istream& stream);
 private:
+    class GetPositionsTask;
    CudaContext& cu;
 };


--- a/platforms/cuda/include/CudaPlatform.h
+++ b/platforms/cuda/include/CudaPlatform.h
@@ -29,6 +29,7 @@

 #include "openmm/Platform.h"
 #include "openmm/System.h"
+#include "openmm/internal/ThreadPool.h"
 #include "windowsExportCuda.h"

 namespace OpenMM {
@@ -122,7 +123,7 @@ class OPENMM_EXPORT_CUDA CudaPlatform::PlatformData {
 public:
    PlatformData(ContextImpl* context, const System& system, const std::string& deviceIndexProperty, const std::string& blockingProperty, const std::string& precisionProperty,
            const std::string& cpuPmeProperty, const std::string& compilerProperty, const std::string& tempProperty, const std::string& hostCompilerProperty,
-            const std::string& pmeStreamProperty);
+            const std::string& pmeStreamProperty, int numThreads);
    ~PlatformData();
    void initializeContexts(const System& system);
    void syncContexts();
@@ -134,6 +135,7 @@ public:
    int stepCount, computeForceCount;
    double time;
    std::map<std::string, std::string> propertyValues;
+    ThreadPool threads;
 };

 } // namespace OpenMM

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -141,43 +141,76 @@ void CudaUpdateStateDataKernel::setTime(ContextImpl& context, double time) {
        contexts[i]->setTime(time);
 }

+class CudaUpdateStateDataKernel::GetPositionsTask : public ThreadPool::Task {
+public:
+    GetPositionsTask(CudaContext& cu, vector<Vec3>& positions, vector<float4>& posCorrection) : cu(cu), positions(positions), posCorrection(posCorrection) {
+    }
+    void execute(ThreadPool& threads, int threadIndex) {
+        // Compute the position of each particle to return to the user.  This is done in parallel for speed.
+        
+        const vector<int>& order = cu.getAtomIndex();
+        int numParticles = cu.getNumAtoms();
+        Vec3 boxVectors[3];
+        cu.getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
+        int numThreads = threads.getNumThreads();
+        int start = threadIndex*numParticles/numThreads;
+        int end = (threadIndex+1)*numParticles/numThreads;
+        if (cu.getUseDoublePrecision()) {
+            double4* posq = (double4*) cu.getPinnedBuffer();
+            for (int i = start; i < end; ++i) {
+                double4 pos = posq[i];
+                int4 offset = cu.getPosCellOffsets()[i];
+                positions[order[i]] = Vec3(pos.x, pos.y, pos.z)-boxVectors[0]*offset.x-boxVectors[1]*offset.y-boxVectors[2]*offset.z;
+            }
+        }
+        else if (cu.getUseMixedPrecision()) {
+            float4* posq = (float4*) cu.getPinnedBuffer();
+            for (int i = start; i < end; ++i) {
+                float4 pos1 = posq[i];
+                float4 pos2 = posCorrection[i];
+                int4 offset = cu.getPosCellOffsets()[i];
+                positions[order[i]] = Vec3((double)pos1.x+(double)pos2.x, (double)pos1.y+(double)pos2.y, (double)pos1.z+(double)pos2.z)-boxVectors[0]*offset.x-boxVectors[1]*offset.y-boxVectors[2]*offset.z;
+            }
+        }
+        else {
+            float4* posq = (float4*) cu.getPinnedBuffer();
+            for (int i = start; i < end; ++i) {
+                float4 pos = posq[i];
+                int4 offset = cu.getPosCellOffsets()[i];
+                positions[order[i]] = Vec3(pos.x, pos.y, pos.z)-boxVectors[0]*offset.x-boxVectors[1]*offset.y-boxVectors[2]*offset.z;
+            }
+        }
+    }
+    CudaContext& cu;
+    vector<Vec3>& positions;
+    vector<float4>& posCorrection;
+};
+
 void CudaUpdateStateDataKernel::getPositions(ContextImpl& context, vector<Vec3>& positions) {
    cu.setAsCurrent();
-    const vector<int>& order = cu.getAtomIndex();
    int numParticles = context.getSystem().getNumParticles();
    positions.resize(numParticles);
-    Vec3 boxVectors[3];
-    cu.getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
+    vector<float4> posCorrection;
    if (cu.getUseDoublePrecision()) {
        double4* posq = (double4*) cu.getPinnedBuffer();
        cu.getPosq().download(posq);
-        for (int i = 0; i < numParticles; ++i) {
-            double4 pos = posq[i];
-            int4 offset = cu.getPosCellOffsets()[i];
-            positions[order[i]] = Vec3(pos.x, pos.y, pos.z)-boxVectors[0]*offset.x-boxVectors[1]*offset.y-boxVectors[2]*offset.z;
-        }
    }
    else if (cu.getUseMixedPrecision()) {
        float4* posq = (float4*) cu.getPinnedBuffer();
-        vector<float4> posCorrection;
-        cu.getPosq().download(posq);
+        cu.getPosq().download(posq, false);
+        posCorrection.resize(numParticles);
        cu.getPosqCorrection().download(posCorrection);
-        for (int i = 0; i < numParticles; ++i) {
-            float4 pos1 = posq[i];
-            float4 pos2 = posCorrection[i];
-            int4 offset = cu.getPosCellOffsets()[i];
-            positions[order[i]] = Vec3((double)pos1.x+(double)pos2.x, (double)pos1.y+(double)pos2.y, (double)pos1.z+(double)pos2.z)-boxVectors[0]*offset.x-boxVectors[1]*offset.y-boxVectors[2]*offset.z;
-        }
    }
    else {
        float4* posq = (float4*) cu.getPinnedBuffer();
        cu.getPosq().download(posq);
-        for (int i = 0; i < numParticles; ++i) {
-            float4 pos = posq[i];
-            int4 offset = cu.getPosCellOffsets()[i];
-            positions[order[i]] = Vec3(pos.x, pos.y, pos.z)-boxVectors[0]*offset.x-boxVectors[1]*offset.y-boxVectors[2]*offset.z;
-        }
    }
+    
+    // Filling in the output array is done in parallel for speed.
+    
+    GetPositionsTask task(cu, positions, posCorrection);
+    cu.getPlatformData().threads.execute(task);
+    cu.getPlatformData().threads.waitForThreads();
 }

 void CudaUpdateStateDataKernel::setPositions(ContextImpl& context, const vector<Vec3>& positions) {

--- a/platforms/cuda/src/CudaPlatform.cpp
+++ b/platforms/cuda/src/CudaPlatform.cpp
@@ -29,9 +29,10 @@
 #include "CudaPlatform.h"
 #include "CudaKernelFactory.h"
 #include "CudaKernels.h"
-#include "openmm/internal/ContextImpl.h"
 #include "openmm/Context.h"
 #include "openmm/System.h"
+#include "openmm/internal/ContextImpl.h"
+#include "openmm/internal/hardware.h"
 #include <algorithm>
 #include <cctype>
 #include <sstream>
@@ -175,7 +176,11 @@ void CudaPlatform::contextCreated(ContextImpl& context, const map<string, string
    pmeKernelName.push_back(CalcPmeReciprocalForceKernel::Name());
    if (!supportsKernels(pmeKernelName))
        cpuPmePropValue = "false";
-    context.setPlatformData(new PlatformData(&context, context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, cpuPmePropValue, compilerPropValue, tempPropValue, hostCompilerPropValue, pmeStreamPropValue));
+    int threads = getNumProcessors();
+    char* threadsEnv = getenv("OPENMM_CPU_THREADS");
+    if (threadsEnv != NULL)
+        stringstream(threadsEnv) >> threads;
+    context.setPlatformData(new PlatformData(&context, context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, cpuPmePropValue, compilerPropValue, tempPropValue, hostCompilerPropValue, pmeStreamPropValue, threads));
 }

 void CudaPlatform::contextDestroyed(ContextImpl& context) const {
@@ -184,7 +189,8 @@ void CudaPlatform::contextDestroyed(ContextImpl& context) const {
 }

 CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& system, const string& deviceIndexProperty, const string& blockingProperty, const string& precisionProperty,
-            const string& cpuPmeProperty, const string& compilerProperty, const string& tempProperty, const string& hostCompilerProperty, const string& pmeStreamProperty) : context(context), removeCM(false), stepCount(0), computeForceCount(0), time(0.0), hasInitializedContexts(false) {
+            const string& cpuPmeProperty, const string& compilerProperty, const string& tempProperty, const string& hostCompilerProperty, const string& pmeStreamProperty, int numThreads) :
+                context(context), removeCM(false), stepCount(0), computeForceCount(0), time(0.0), hasInitializedContexts(false), threads(numThreads) {
    bool blocking = (blockingProperty == "true");
    vector<string> devices;
    size_t searchPos = 0, nextPos;

--- a/platforms/cuda/tests/TestCudaFFT3D.cpp
+++ b/platforms/cuda/tests/TestCudaFFT3D.cpp
@@ -56,7 +56,7 @@ void testTransform(bool realToComplex, int xsize, int ysize, int zsize) {
    system.addParticle(0.0);
    CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), "false",
            platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()),
-            platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()));
+            platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), 1);
    CudaContext& context = *platformData.contexts[0];
    context.initialize();
    OpenMM_SFMT::SFMT sfmt;

--- a/platforms/cuda/tests/TestCudaRandom.cpp
+++ b/platforms/cuda/tests/TestCudaRandom.cpp
@@ -56,7 +56,7 @@ void testGaussian() {
        system.addParticle(1.0);
    CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), "false",
            platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()),
-            platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()));
+            platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), 1);
    CudaContext& context = *platformData.contexts[0];
    context.initialize();
    context.getIntegrationUtilities().initRandomNumberGenerator(0);

--- a/platforms/cuda/tests/TestCudaSort.cpp
+++ b/platforms/cuda/tests/TestCudaSort.cpp
@@ -66,7 +66,7 @@ void verifySorting(vector<float> array) {
    system.addParticle(0.0);
    CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), "false",
            platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()),
-            platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()));
+            platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), 1);
    CudaContext& context = *platformData.contexts[0];
    context.initialize();
    CudaArray data(context, array.size(), 4, "sortData");

--- a/platforms/opencl/include/OpenCLKernels.h
+++ b/platforms/opencl/include/OpenCLKernels.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008-2015 Stanford University and the Authors.      *
+ * Portions copyright (c) 2008-2016 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -170,6 +170,7 @@ public:
     */
    void loadCheckpoint(ContextImpl& context, std::istream& stream);
 private:
+    class GetPositionsTask;
    OpenCLContext& cl;
 };


--- a/platforms/opencl/include/OpenCLPlatform.h
+++ b/platforms/opencl/include/OpenCLPlatform.h
@@ -29,6 +29,7 @@

 #include "openmm/Platform.h"
 #include "openmm/System.h"
+#include "openmm/internal/ThreadPool.h"
 #include "windowsExportOpenCL.h"
 namespace OpenMM {
    
@@ -107,7 +108,7 @@ public:
 class OPENMM_EXPORT_OPENCL OpenCLPlatform::PlatformData {
 public:
    PlatformData(const System& system, const std::string& platformPropValue, const std::string& deviceIndexProperty, const std::string& precisionProperty,
-            const std::string& cpuPmeProperty, const std::string& pmeStreamProperty);
+            const std::string& cpuPmeProperty, const std::string& pmeStreamProperty, int numThreads);
    ~PlatformData();
    void initializeContexts(const System& system);
    void syncContexts();
@@ -119,6 +120,7 @@ public:
    int stepCount, computeForceCount;
    double time;
    std::map<std::string, std::string> propertyValues;
+    ThreadPool threads;
 };

 } // namespace OpenMM

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -165,42 +165,75 @@ void OpenCLUpdateStateDataKernel::setTime(ContextImpl& context, double time) {
        contexts[i]->setTime(time);
 }

+class OpenCLUpdateStateDataKernel::GetPositionsTask : public ThreadPool::Task {
+public:
+    GetPositionsTask(OpenCLContext& cl, vector<Vec3>& positions, vector<mm_float4>& posCorrection) : cl(cl), positions(positions), posCorrection(posCorrection) {
+    }
+    void execute(ThreadPool& threads, int threadIndex) {
+        // Compute the position of each particle to return to the user.  This is done in parallel for speed.
+        
+        const vector<int>& order = cl.getAtomIndex();
+        int numParticles = cl.getNumAtoms();
+        Vec3 boxVectors[3];
+        cl.getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
+        int numThreads = threads.getNumThreads();
+        int start = threadIndex*numParticles/numThreads;
+        int end = (threadIndex+1)*numParticles/numThreads;
+        if (cl.getUseDoublePrecision()) {
+            mm_double4* posq = (mm_double4*) cl.getPinnedBuffer();
+            for (int i = start; i < end; ++i) {
+                mm_double4 pos = posq[i];
+                mm_int4 offset = cl.getPosCellOffsets()[i];
+                positions[order[i]] = Vec3(pos.x, pos.y, pos.z)-boxVectors[0]*offset.x-boxVectors[1]*offset.y-boxVectors[2]*offset.z;
+            }
+        }
+        else if (cl.getUseMixedPrecision()) {
+            mm_float4* posq = (mm_float4*) cl.getPinnedBuffer();
+            for (int i = start; i < end; ++i) {
+                mm_float4 pos1 = posq[i];
+                mm_float4 pos2 = posCorrection[i];
+                mm_int4 offset = cl.getPosCellOffsets()[i];
+                positions[order[i]] = Vec3((double)pos1.x+(double)pos2.x, (double)pos1.y+(double)pos2.y, (double)pos1.z+(double)pos2.z)-boxVectors[0]*offset.x-boxVectors[1]*offset.y-boxVectors[2]*offset.z;
+            }
+        }
+        else {
+            mm_float4* posq = (mm_float4*) cl.getPinnedBuffer();
+            for (int i = start; i < end; ++i) {
+                mm_float4 pos = posq[i];
+                mm_int4 offset = cl.getPosCellOffsets()[i];
+                positions[order[i]] = Vec3(pos.x, pos.y, pos.z)-boxVectors[0]*offset.x-boxVectors[1]*offset.y-boxVectors[2]*offset.z;
+            }
+        }
+    }
+    OpenCLContext& cl;
+    vector<Vec3>& positions;
+    vector<mm_float4>& posCorrection;
+};
+
 void OpenCLUpdateStateDataKernel::getPositions(ContextImpl& context, vector<Vec3>& positions) {
-    const vector<cl_int>& order = cl.getAtomIndex();
    int numParticles = context.getSystem().getNumParticles();
    positions.resize(numParticles);
-    Vec3 boxVectors[3];
-    cl.getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
+    vector<mm_float4> posCorrection;
    if (cl.getUseDoublePrecision()) {
        mm_double4* posq = (mm_double4*) cl.getPinnedBuffer();
        cl.getPosq().download(posq);
-        for (int i = 0; i < numParticles; ++i) {
-            mm_double4 pos = posq[i];
-            mm_int4 offset = cl.getPosCellOffsets()[i];
-            positions[order[i]] = Vec3(pos.x, pos.y, pos.z)-boxVectors[0]*offset.x-boxVectors[1]*offset.y-boxVectors[2]*offset.z;
-        }
    }
    else if (cl.getUseMixedPrecision()) {
        mm_float4* posq = (mm_float4*) cl.getPinnedBuffer();
-        vector<mm_float4> posCorrection;
-        cl.getPosq().download(posq);
+        cl.getPosq().download(posq, false);
+        posCorrection.resize(numParticles);
        cl.getPosqCorrection().download(posCorrection);
-        for (int i = 0; i < numParticles; ++i) {
-            mm_float4 pos1 = posq[i];
-            mm_float4 pos2 = posCorrection[i];
-            mm_int4 offset = cl.getPosCellOffsets()[i];
-            positions[order[i]] = Vec3((double)pos1.x+(double)pos2.x, (double)pos1.y+(double)pos2.y, (double)pos1.z+(double)pos2.z)-boxVectors[0]*offset.x-boxVectors[1]*offset.y-boxVectors[2]*offset.z;
-        }
    }
    else {
        mm_float4* posq = (mm_float4*) cl.getPinnedBuffer();
        cl.getPosq().download(posq);
-        for (int i = 0; i < numParticles; ++i) {
-            mm_float4 pos = posq[i];
-            mm_int4 offset = cl.getPosCellOffsets()[i];
-            positions[order[i]] = Vec3(pos.x, pos.y, pos.z)-boxVectors[0]*offset.x-boxVectors[1]*offset.y-boxVectors[2]*offset.z;
-        }
    }
+    
+    // Filling in the output array is done in parallel for speed.
+    
+    GetPositionsTask task(cl, positions, posCorrection);
+    cl.getPlatformData().threads.execute(task);
+    cl.getPlatformData().threads.waitForThreads();
 }

 void OpenCLUpdateStateDataKernel::setPositions(ContextImpl& context, const vector<Vec3>& positions) {

--- a/platforms/opencl/src/OpenCLPlatform.cpp
+++ b/platforms/opencl/src/OpenCLPlatform.cpp
@@ -28,9 +28,10 @@
 #include "OpenCLPlatform.h"
 #include "OpenCLKernelFactory.h"
 #include "OpenCLKernels.h"
-#include "openmm/internal/ContextImpl.h"
 #include "openmm/Context.h"
 #include "openmm/System.h"
+#include "openmm/internal/ContextImpl.h"
+#include "openmm/internal/hardware.h"
 #include <algorithm>
 #include <cctype>
 #include <sstream>
@@ -165,7 +166,11 @@ void OpenCLPlatform::contextCreated(ContextImpl& context, const map<string, stri
    pmeKernelName.push_back(CalcPmeReciprocalForceKernel::Name());
    if (!supportsKernels(pmeKernelName))
        cpuPmePropValue = "false";
-    context.setPlatformData(new PlatformData(context.getSystem(), platformPropValue, devicePropValue, precisionPropValue, cpuPmePropValue, pmeStreamPropValue));
+    int threads = getNumProcessors();
+    char* threadsEnv = getenv("OPENMM_CPU_THREADS");
+    if (threadsEnv != NULL)
+        stringstream(threadsEnv) >> threads;
+    context.setPlatformData(new PlatformData(context.getSystem(), platformPropValue, devicePropValue, precisionPropValue, cpuPmePropValue, pmeStreamPropValue, threads));
 }

 void OpenCLPlatform::contextDestroyed(ContextImpl& context) const {
@@ -174,7 +179,8 @@ void OpenCLPlatform::contextDestroyed(ContextImpl& context) const {
 }

 OpenCLPlatform::PlatformData::PlatformData(const System& system, const string& platformPropValue, const string& deviceIndexProperty,
-        const string& precisionProperty, const string& cpuPmeProperty, const string& pmeStreamProperty) : removeCM(false), stepCount(0), computeForceCount(0), time(0.0), hasInitializedContexts(false)  {
+        const string& precisionProperty, const string& cpuPmeProperty, const string& pmeStreamProperty, int numThreads) :
+            removeCM(false), stepCount(0), computeForceCount(0), time(0.0), hasInitializedContexts(false), threads(numThreads)  {
    int platformIndex = -1;
    if (platformPropValue.length() > 0)
        stringstream(platformPropValue) >> platformIndex;

--- a/platforms/opencl/tests/TestOpenCLFFT.cpp
+++ b/platforms/opencl/tests/TestOpenCLFFT.cpp
@@ -54,7 +54,7 @@ template <class Real2>
 void testTransform(bool realToComplex, int xsize, int ysize, int zsize) {
    System system;
    system.addParticle(0.0);
-    OpenCLPlatform::PlatformData platformData(system, "", "", platform.getPropertyDefaultValue("OpenCLPrecision"), "false", "false");
+    OpenCLPlatform::PlatformData platformData(system, "", "", platform.getPropertyDefaultValue("OpenCLPrecision"), "false", "false", 1);
    OpenCLContext& context = *platformData.contexts[0];
    context.initialize();
    OpenMM_SFMT::SFMT sfmt;

--- a/platforms/opencl/tests/TestOpenCLRandom.cpp
+++ b/platforms/opencl/tests/TestOpenCLRandom.cpp
@@ -54,7 +54,7 @@ void testGaussian() {
    System system;
    for (int i = 0; i < numAtoms; i++)
        system.addParticle(1.0);
-    OpenCLPlatform::PlatformData platformData(system, "", "", platform.getPropertyDefaultValue("OpenCLPrecision"), "false", "false");
+    OpenCLPlatform::PlatformData platformData(system, "", "", platform.getPropertyDefaultValue("OpenCLPrecision"), "false", "false", 1);
    OpenCLContext& context = *platformData.contexts[0];
    context.initialize();
    context.getIntegrationUtilities().initRandomNumberGenerator(0);

--- a/platforms/opencl/tests/TestOpenCLSort.cpp
+++ b/platforms/opencl/tests/TestOpenCLSort.cpp
@@ -64,7 +64,7 @@ void verifySorting(vector<float> array) {

    System system;
    system.addParticle(0.0);
-    OpenCLPlatform::PlatformData platformData(system, "", "", platform.getPropertyDefaultValue("OpenCLPrecision"), "false", "false");
+    OpenCLPlatform::PlatformData platformData(system, "", "", platform.getPropertyDefaultValue("OpenCLPrecision"), "false", "false", 1);
    OpenCLContext& context = *platformData.contexts[0];
    context.initialize();
    OpenCLArray data(context, array.size(), sizeof(float), "sortData");