Commit 1085085d authored by peastman's avatar peastman
Browse files

Merge pull request #1381 from peastman/parallelpositions

Parallelized the code for downloading positions
parents 73a59c33 8d293880
#ifndef OPENMM_TIMER_H_
#define OPENMM_TIMER_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2016 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
/**
* This header provides a static function for querying the current system time in seconds.
* It is useful when profiling.
*/
#ifdef _MSC_VER
#include <Windows.h>
static double getCurrentTime() {
FILETIME ft;
GetSystemTimeAsFileTime(&ft); // 100-nanoseconds since 1-1-1601
ULARGE_INTEGER result;
result.LowPart = ft.dwLowDateTime;
result.HighPart = ft.dwHighDateTime;
return 1e-7*result.QuadPart;
}
#else
#include <sys/time.h>
static double getCurrentTime() {
struct timeval tod;
gettimeofday(&tod, 0);
return tod.tv_sec+1e-6*tod.tv_usec;
}
#endif
#endif /*OPENMM_TIMER_H_*/
......@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2015 Stanford University and the Authors. *
* Portions copyright (c) 2008-2016 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -192,6 +192,7 @@ public:
*/
void loadCheckpoint(ContextImpl& context, std::istream& stream);
private:
class GetPositionsTask;
CudaContext& cu;
};
......
......@@ -29,6 +29,7 @@
#include "openmm/Platform.h"
#include "openmm/System.h"
#include "openmm/internal/ThreadPool.h"
#include "windowsExportCuda.h"
namespace OpenMM {
......@@ -122,7 +123,7 @@ class OPENMM_EXPORT_CUDA CudaPlatform::PlatformData {
public:
PlatformData(ContextImpl* context, const System& system, const std::string& deviceIndexProperty, const std::string& blockingProperty, const std::string& precisionProperty,
const std::string& cpuPmeProperty, const std::string& compilerProperty, const std::string& tempProperty, const std::string& hostCompilerProperty,
const std::string& pmeStreamProperty);
const std::string& pmeStreamProperty, int numThreads);
~PlatformData();
void initializeContexts(const System& system);
void syncContexts();
......@@ -134,6 +135,7 @@ public:
int stepCount, computeForceCount;
double time;
std::map<std::string, std::string> propertyValues;
ThreadPool threads;
};
} // namespace OpenMM
......
......@@ -141,17 +141,23 @@ void CudaUpdateStateDataKernel::setTime(ContextImpl& context, double time) {
contexts[i]->setTime(time);
}
void CudaUpdateStateDataKernel::getPositions(ContextImpl& context, vector<Vec3>& positions) {
cu.setAsCurrent();
class CudaUpdateStateDataKernel::GetPositionsTask : public ThreadPool::Task {
public:
GetPositionsTask(CudaContext& cu, vector<Vec3>& positions, vector<float4>& posCorrection) : cu(cu), positions(positions), posCorrection(posCorrection) {
}
void execute(ThreadPool& threads, int threadIndex) {
// Compute the position of each particle to return to the user. This is done in parallel for speed.
const vector<int>& order = cu.getAtomIndex();
int numParticles = context.getSystem().getNumParticles();
positions.resize(numParticles);
int numParticles = cu.getNumAtoms();
Vec3 boxVectors[3];
cu.getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
int numThreads = threads.getNumThreads();
int start = threadIndex*numParticles/numThreads;
int end = (threadIndex+1)*numParticles/numThreads;
if (cu.getUseDoublePrecision()) {
double4* posq = (double4*) cu.getPinnedBuffer();
cu.getPosq().download(posq);
for (int i = 0; i < numParticles; ++i) {
for (int i = start; i < end; ++i) {
double4 pos = posq[i];
int4 offset = cu.getPosCellOffsets()[i];
positions[order[i]] = Vec3(pos.x, pos.y, pos.z)-boxVectors[0]*offset.x-boxVectors[1]*offset.y-boxVectors[2]*offset.z;
......@@ -159,10 +165,7 @@ void CudaUpdateStateDataKernel::getPositions(ContextImpl& context, vector<Vec3>&
}
else if (cu.getUseMixedPrecision()) {
float4* posq = (float4*) cu.getPinnedBuffer();
vector<float4> posCorrection;
cu.getPosq().download(posq);
cu.getPosqCorrection().download(posCorrection);
for (int i = 0; i < numParticles; ++i) {
for (int i = start; i < end; ++i) {
float4 pos1 = posq[i];
float4 pos2 = posCorrection[i];
int4 offset = cu.getPosCellOffsets()[i];
......@@ -171,13 +174,43 @@ void CudaUpdateStateDataKernel::getPositions(ContextImpl& context, vector<Vec3>&
}
else {
float4* posq = (float4*) cu.getPinnedBuffer();
cu.getPosq().download(posq);
for (int i = 0; i < numParticles; ++i) {
for (int i = start; i < end; ++i) {
float4 pos = posq[i];
int4 offset = cu.getPosCellOffsets()[i];
positions[order[i]] = Vec3(pos.x, pos.y, pos.z)-boxVectors[0]*offset.x-boxVectors[1]*offset.y-boxVectors[2]*offset.z;
}
}
}
CudaContext& cu;
vector<Vec3>& positions;
vector<float4>& posCorrection;
};
void CudaUpdateStateDataKernel::getPositions(ContextImpl& context, vector<Vec3>& positions) {
cu.setAsCurrent();
int numParticles = context.getSystem().getNumParticles();
positions.resize(numParticles);
vector<float4> posCorrection;
if (cu.getUseDoublePrecision()) {
double4* posq = (double4*) cu.getPinnedBuffer();
cu.getPosq().download(posq);
}
else if (cu.getUseMixedPrecision()) {
float4* posq = (float4*) cu.getPinnedBuffer();
cu.getPosq().download(posq, false);
posCorrection.resize(numParticles);
cu.getPosqCorrection().download(posCorrection);
}
else {
float4* posq = (float4*) cu.getPinnedBuffer();
cu.getPosq().download(posq);
}
// Filling in the output array is done in parallel for speed.
GetPositionsTask task(cu, positions, posCorrection);
cu.getPlatformData().threads.execute(task);
cu.getPlatformData().threads.waitForThreads();
}
void CudaUpdateStateDataKernel::setPositions(ContextImpl& context, const vector<Vec3>& positions) {
......
......@@ -29,9 +29,10 @@
#include "CudaPlatform.h"
#include "CudaKernelFactory.h"
#include "CudaKernels.h"
#include "openmm/internal/ContextImpl.h"
#include "openmm/Context.h"
#include "openmm/System.h"
#include "openmm/internal/ContextImpl.h"
#include "openmm/internal/hardware.h"
#include <algorithm>
#include <cctype>
#include <sstream>
......@@ -175,7 +176,11 @@ void CudaPlatform::contextCreated(ContextImpl& context, const map<string, string
pmeKernelName.push_back(CalcPmeReciprocalForceKernel::Name());
if (!supportsKernels(pmeKernelName))
cpuPmePropValue = "false";
context.setPlatformData(new PlatformData(&context, context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, cpuPmePropValue, compilerPropValue, tempPropValue, hostCompilerPropValue, pmeStreamPropValue));
int threads = getNumProcessors();
char* threadsEnv = getenv("OPENMM_CPU_THREADS");
if (threadsEnv != NULL)
stringstream(threadsEnv) >> threads;
context.setPlatformData(new PlatformData(&context, context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, cpuPmePropValue, compilerPropValue, tempPropValue, hostCompilerPropValue, pmeStreamPropValue, threads));
}
void CudaPlatform::contextDestroyed(ContextImpl& context) const {
......@@ -184,7 +189,8 @@ void CudaPlatform::contextDestroyed(ContextImpl& context) const {
}
CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& system, const string& deviceIndexProperty, const string& blockingProperty, const string& precisionProperty,
const string& cpuPmeProperty, const string& compilerProperty, const string& tempProperty, const string& hostCompilerProperty, const string& pmeStreamProperty) : context(context), removeCM(false), stepCount(0), computeForceCount(0), time(0.0), hasInitializedContexts(false) {
const string& cpuPmeProperty, const string& compilerProperty, const string& tempProperty, const string& hostCompilerProperty, const string& pmeStreamProperty, int numThreads) :
context(context), removeCM(false), stepCount(0), computeForceCount(0), time(0.0), hasInitializedContexts(false), threads(numThreads) {
bool blocking = (blockingProperty == "true");
vector<string> devices;
size_t searchPos = 0, nextPos;
......
......@@ -56,7 +56,7 @@ void testTransform(bool realToComplex, int xsize, int ysize, int zsize) {
system.addParticle(0.0);
CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), "false",
platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()),
platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()));
platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), 1);
CudaContext& context = *platformData.contexts[0];
context.initialize();
OpenMM_SFMT::SFMT sfmt;
......
......@@ -56,7 +56,7 @@ void testGaussian() {
system.addParticle(1.0);
CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), "false",
platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()),
platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()));
platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), 1);
CudaContext& context = *platformData.contexts[0];
context.initialize();
context.getIntegrationUtilities().initRandomNumberGenerator(0);
......
......@@ -66,7 +66,7 @@ void verifySorting(vector<float> array) {
system.addParticle(0.0);
CudaPlatform::PlatformData platformData(NULL, system, "", "true", platform.getPropertyDefaultValue("CudaPrecision"), "false",
platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()),
platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()));
platform.getPropertyDefaultValue(CudaPlatform::CudaHostCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaDisablePmeStream()), 1);
CudaContext& context = *platformData.contexts[0];
context.initialize();
CudaArray data(context, array.size(), 4, "sortData");
......
......@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2015 Stanford University and the Authors. *
* Portions copyright (c) 2008-2016 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -170,6 +170,7 @@ public:
*/
void loadCheckpoint(ContextImpl& context, std::istream& stream);
private:
class GetPositionsTask;
OpenCLContext& cl;
};
......
......@@ -29,6 +29,7 @@
#include "openmm/Platform.h"
#include "openmm/System.h"
#include "openmm/internal/ThreadPool.h"
#include "windowsExportOpenCL.h"
namespace OpenMM {
......@@ -107,7 +108,7 @@ public:
class OPENMM_EXPORT_OPENCL OpenCLPlatform::PlatformData {
public:
PlatformData(const System& system, const std::string& platformPropValue, const std::string& deviceIndexProperty, const std::string& precisionProperty,
const std::string& cpuPmeProperty, const std::string& pmeStreamProperty);
const std::string& cpuPmeProperty, const std::string& pmeStreamProperty, int numThreads);
~PlatformData();
void initializeContexts(const System& system);
void syncContexts();
......@@ -119,6 +120,7 @@ public:
int stepCount, computeForceCount;
double time;
std::map<std::string, std::string> propertyValues;
ThreadPool threads;
};
} // namespace OpenMM
......
......@@ -165,16 +165,23 @@ void OpenCLUpdateStateDataKernel::setTime(ContextImpl& context, double time) {
contexts[i]->setTime(time);
}
void OpenCLUpdateStateDataKernel::getPositions(ContextImpl& context, vector<Vec3>& positions) {
const vector<cl_int>& order = cl.getAtomIndex();
int numParticles = context.getSystem().getNumParticles();
positions.resize(numParticles);
class OpenCLUpdateStateDataKernel::GetPositionsTask : public ThreadPool::Task {
public:
GetPositionsTask(OpenCLContext& cl, vector<Vec3>& positions, vector<mm_float4>& posCorrection) : cl(cl), positions(positions), posCorrection(posCorrection) {
}
void execute(ThreadPool& threads, int threadIndex) {
// Compute the position of each particle to return to the user. This is done in parallel for speed.
const vector<int>& order = cl.getAtomIndex();
int numParticles = cl.getNumAtoms();
Vec3 boxVectors[3];
cl.getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
int numThreads = threads.getNumThreads();
int start = threadIndex*numParticles/numThreads;
int end = (threadIndex+1)*numParticles/numThreads;
if (cl.getUseDoublePrecision()) {
mm_double4* posq = (mm_double4*) cl.getPinnedBuffer();
cl.getPosq().download(posq);
for (int i = 0; i < numParticles; ++i) {
for (int i = start; i < end; ++i) {
mm_double4 pos = posq[i];
mm_int4 offset = cl.getPosCellOffsets()[i];
positions[order[i]] = Vec3(pos.x, pos.y, pos.z)-boxVectors[0]*offset.x-boxVectors[1]*offset.y-boxVectors[2]*offset.z;
......@@ -182,10 +189,7 @@ void OpenCLUpdateStateDataKernel::getPositions(ContextImpl& context, vector<Vec3
}
else if (cl.getUseMixedPrecision()) {
mm_float4* posq = (mm_float4*) cl.getPinnedBuffer();
vector<mm_float4> posCorrection;
cl.getPosq().download(posq);
cl.getPosqCorrection().download(posCorrection);
for (int i = 0; i < numParticles; ++i) {
for (int i = start; i < end; ++i) {
mm_float4 pos1 = posq[i];
mm_float4 pos2 = posCorrection[i];
mm_int4 offset = cl.getPosCellOffsets()[i];
......@@ -194,13 +198,42 @@ void OpenCLUpdateStateDataKernel::getPositions(ContextImpl& context, vector<Vec3
}
else {
mm_float4* posq = (mm_float4*) cl.getPinnedBuffer();
cl.getPosq().download(posq);
for (int i = 0; i < numParticles; ++i) {
for (int i = start; i < end; ++i) {
mm_float4 pos = posq[i];
mm_int4 offset = cl.getPosCellOffsets()[i];
positions[order[i]] = Vec3(pos.x, pos.y, pos.z)-boxVectors[0]*offset.x-boxVectors[1]*offset.y-boxVectors[2]*offset.z;
}
}
}
OpenCLContext& cl;
vector<Vec3>& positions;
vector<mm_float4>& posCorrection;
};
void OpenCLUpdateStateDataKernel::getPositions(ContextImpl& context, vector<Vec3>& positions) {
int numParticles = context.getSystem().getNumParticles();
positions.resize(numParticles);
vector<mm_float4> posCorrection;
if (cl.getUseDoublePrecision()) {
mm_double4* posq = (mm_double4*) cl.getPinnedBuffer();
cl.getPosq().download(posq);
}
else if (cl.getUseMixedPrecision()) {
mm_float4* posq = (mm_float4*) cl.getPinnedBuffer();
cl.getPosq().download(posq, false);
posCorrection.resize(numParticles);
cl.getPosqCorrection().download(posCorrection);
}
else {
mm_float4* posq = (mm_float4*) cl.getPinnedBuffer();
cl.getPosq().download(posq);
}
// Filling in the output array is done in parallel for speed.
GetPositionsTask task(cl, positions, posCorrection);
cl.getPlatformData().threads.execute(task);
cl.getPlatformData().threads.waitForThreads();
}
void OpenCLUpdateStateDataKernel::setPositions(ContextImpl& context, const vector<Vec3>& positions) {
......
......@@ -28,9 +28,10 @@
#include "OpenCLPlatform.h"
#include "OpenCLKernelFactory.h"
#include "OpenCLKernels.h"
#include "openmm/internal/ContextImpl.h"
#include "openmm/Context.h"
#include "openmm/System.h"
#include "openmm/internal/ContextImpl.h"
#include "openmm/internal/hardware.h"
#include <algorithm>
#include <cctype>
#include <sstream>
......@@ -165,7 +166,11 @@ void OpenCLPlatform::contextCreated(ContextImpl& context, const map<string, stri
pmeKernelName.push_back(CalcPmeReciprocalForceKernel::Name());
if (!supportsKernels(pmeKernelName))
cpuPmePropValue = "false";
context.setPlatformData(new PlatformData(context.getSystem(), platformPropValue, devicePropValue, precisionPropValue, cpuPmePropValue, pmeStreamPropValue));
int threads = getNumProcessors();
char* threadsEnv = getenv("OPENMM_CPU_THREADS");
if (threadsEnv != NULL)
stringstream(threadsEnv) >> threads;
context.setPlatformData(new PlatformData(context.getSystem(), platformPropValue, devicePropValue, precisionPropValue, cpuPmePropValue, pmeStreamPropValue, threads));
}
void OpenCLPlatform::contextDestroyed(ContextImpl& context) const {
......@@ -174,7 +179,8 @@ void OpenCLPlatform::contextDestroyed(ContextImpl& context) const {
}
OpenCLPlatform::PlatformData::PlatformData(const System& system, const string& platformPropValue, const string& deviceIndexProperty,
const string& precisionProperty, const string& cpuPmeProperty, const string& pmeStreamProperty) : removeCM(false), stepCount(0), computeForceCount(0), time(0.0), hasInitializedContexts(false) {
const string& precisionProperty, const string& cpuPmeProperty, const string& pmeStreamProperty, int numThreads) :
removeCM(false), stepCount(0), computeForceCount(0), time(0.0), hasInitializedContexts(false), threads(numThreads) {
int platformIndex = -1;
if (platformPropValue.length() > 0)
stringstream(platformPropValue) >> platformIndex;
......
......@@ -54,7 +54,7 @@ template <class Real2>
void testTransform(bool realToComplex, int xsize, int ysize, int zsize) {
System system;
system.addParticle(0.0);
OpenCLPlatform::PlatformData platformData(system, "", "", platform.getPropertyDefaultValue("OpenCLPrecision"), "false", "false");
OpenCLPlatform::PlatformData platformData(system, "", "", platform.getPropertyDefaultValue("OpenCLPrecision"), "false", "false", 1);
OpenCLContext& context = *platformData.contexts[0];
context.initialize();
OpenMM_SFMT::SFMT sfmt;
......
......@@ -54,7 +54,7 @@ void testGaussian() {
System system;
for (int i = 0; i < numAtoms; i++)
system.addParticle(1.0);
OpenCLPlatform::PlatformData platformData(system, "", "", platform.getPropertyDefaultValue("OpenCLPrecision"), "false", "false");
OpenCLPlatform::PlatformData platformData(system, "", "", platform.getPropertyDefaultValue("OpenCLPrecision"), "false", "false", 1);
OpenCLContext& context = *platformData.contexts[0];
context.initialize();
context.getIntegrationUtilities().initRandomNumberGenerator(0);
......
......@@ -64,7 +64,7 @@ void verifySorting(vector<float> array) {
System system;
system.addParticle(0.0);
OpenCLPlatform::PlatformData platformData(system, "", "", platform.getPropertyDefaultValue("OpenCLPrecision"), "false", "false");
OpenCLPlatform::PlatformData platformData(system, "", "", platform.getPropertyDefaultValue("OpenCLPrecision"), "false", "false", 1);
OpenCLContext& context = *platformData.contexts[0];
context.initialize();
OpenCLArray data(context, array.size(), sizeof(float), "sortData");
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment