Unverified Commit dbf25377 authored by Peter Eastman's avatar Peter Eastman Committed by GitHub
Browse files

Improved load balancing between GPUs (#4013)

parent ae42c911
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2011-2019 Stanford University and the Authors. * * Portions copyright (c) 2011-2023 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -84,6 +84,7 @@ private: ...@@ -84,6 +84,7 @@ private:
std::vector<Kernel> kernels; std::vector<Kernel> kernels;
std::vector<long long> completionTimes; std::vector<long long> completionTimes;
std::vector<double> contextNonbondedFractions; std::vector<double> contextNonbondedFractions;
bool loadBalance;
int2* interactionCounts; int2* interactionCounts;
CudaArray contextForces; CudaArray contextForces;
void* pinnedPositionBuffer; void* pinnedPositionBuffer;
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2011-2021 Stanford University and the Authors. * * Portions copyright (c) 2011-2023 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -95,17 +95,17 @@ class CudaParallelCalcForcesAndEnergyKernel::FinishComputationTask : public Cuda ...@@ -95,17 +95,17 @@ class CudaParallelCalcForcesAndEnergyKernel::FinishComputationTask : public Cuda
public: public:
FinishComputationTask(ContextImpl& context, CudaContext& cu, CudaCalcForcesAndEnergyKernel& kernel, FinishComputationTask(ContextImpl& context, CudaContext& cu, CudaCalcForcesAndEnergyKernel& kernel,
bool includeForce, bool includeEnergy, int groups, double& energy, long long& completionTime, long long* pinnedMemory, CudaArray& contextForces, bool includeForce, bool includeEnergy, int groups, double& energy, long long& completionTime, long long* pinnedMemory, CudaArray& contextForces,
bool& valid, int2& interactionCount, CUstream stream, CUevent event, CUevent localEvent) : bool& valid, int2& interactionCount, CUstream stream, CUevent event, CUevent localEvent, bool loadBalance) :
context(context), cu(cu), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), energy(energy), context(context), cu(cu), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), energy(energy),
completionTime(completionTime), pinnedMemory(pinnedMemory), contextForces(contextForces), valid(valid), interactionCount(interactionCount), completionTime(completionTime), pinnedMemory(pinnedMemory), contextForces(contextForces), valid(valid), interactionCount(interactionCount),
stream(stream), event(event), localEvent(localEvent) { stream(stream), event(event), localEvent(localEvent), loadBalance(loadBalance) {
} }
void execute() { void execute() {
// Execute the kernel, then download forces. // Execute the kernel, then download forces.
ContextSelector selector(cu); ContextSelector selector(cu);
energy += kernel.finishComputation(context, includeForce, includeEnergy, groups, valid); energy += kernel.finishComputation(context, includeForce, includeEnergy, groups, valid);
if (cu.getComputeForceCount() < 200) { if (loadBalance) {
// Record timing information for load balancing. Since this takes time, only do it at the start of the simulation. // Record timing information for load balancing. Since this takes time, only do it at the start of the simulation.
CHECK_RESULT(cuCtxSynchronize(), "Error synchronizing CUDA context"); CHECK_RESULT(cuCtxSynchronize(), "Error synchronizing CUDA context");
...@@ -137,7 +137,7 @@ private: ...@@ -137,7 +137,7 @@ private:
ContextImpl& context; ContextImpl& context;
CudaContext& cu; CudaContext& cu;
CudaCalcForcesAndEnergyKernel& kernel; CudaCalcForcesAndEnergyKernel& kernel;
bool includeForce, includeEnergy; bool includeForce, includeEnergy, loadBalance;
int groups; int groups;
double& energy; double& energy;
long long& completionTime; long long& completionTime;
...@@ -182,8 +182,11 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) { ...@@ -182,8 +182,11 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
int numContexts = data.contexts.size(); int numContexts = data.contexts.size();
for (int i = 0; i < numContexts; i++) for (int i = 0; i < numContexts; i++)
getKernel(i).initialize(system); getKernel(i).initialize(system);
for (int i = 0; i < numContexts; i++) for (int i = 0; i < contextNonbondedFractions.size(); i++) {
contextNonbondedFractions[i] = 1/(double) numContexts; double x0 = i/(double) contextNonbondedFractions.size();
double x1 = (i+1)/(double) contextNonbondedFractions.size();
contextNonbondedFractions[i] = x1*x1 - x0*x0;
}
CHECK_RESULT(cuEventCreate(&event, cu.getEventFlags()), "Error creating event"); CHECK_RESULT(cuEventCreate(&event, cu.getEventFlags()), "Error creating event");
peerCopyEvent.resize(numContexts); peerCopyEvent.resize(numContexts);
peerCopyEventLocal.resize(numContexts); peerCopyEventLocal.resize(numContexts);
...@@ -208,6 +211,7 @@ void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& contex ...@@ -208,6 +211,7 @@ void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& contex
CHECK_RESULT(cuMemHostAlloc((void**) &pinnedForceBuffer, 3*(data.contexts.size()-1)*cu.getPaddedNumAtoms()*sizeof(long long), CU_MEMHOSTALLOC_PORTABLE), "Error allocating pinned memory"); CHECK_RESULT(cuMemHostAlloc((void**) &pinnedForceBuffer, 3*(data.contexts.size()-1)*cu.getPaddedNumAtoms()*sizeof(long long), CU_MEMHOSTALLOC_PORTABLE), "Error allocating pinned memory");
CHECK_RESULT(cuMemHostAlloc(&pinnedPositionBuffer, cu.getPaddedNumAtoms()*(cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4)), CU_MEMHOSTALLOC_PORTABLE), "Error allocating pinned memory"); CHECK_RESULT(cuMemHostAlloc(&pinnedPositionBuffer, cu.getPaddedNumAtoms()*(cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4)), CU_MEMHOSTALLOC_PORTABLE), "Error allocating pinned memory");
} }
loadBalance = (cu.getComputeForceCount() < 200 || cu.getComputeForceCount()%30 == 0);
// Copy coordinates over to each device and execute the kernel. // Copy coordinates over to each device and execute the kernel.
...@@ -239,7 +243,7 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con ...@@ -239,7 +243,7 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
CudaContext& cu = *data.contexts[i]; CudaContext& cu = *data.contexts[i];
ComputeContext::WorkThread& thread = cu.getWorkThread(); ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new FinishComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, data.contextEnergy[i], completionTimes[i], thread.addTask(new FinishComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, data.contextEnergy[i], completionTimes[i],
pinnedForceBuffer, contextForces, valid, interactionCounts[i], peerCopyStream[i], peerCopyEvent[i], peerCopyEventLocal[i])); pinnedForceBuffer, contextForces, valid, interactionCounts[i], peerCopyStream[i], peerCopyEvent[i], peerCopyEventLocal[i], loadBalance));
} }
data.syncContexts(); data.syncContexts();
CudaContext& cu = *data.contexts[0]; CudaContext& cu = *data.contexts[0];
...@@ -263,7 +267,7 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con ...@@ -263,7 +267,7 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
// Balance work between the contexts by transferring a little nonbonded work from the context that // Balance work between the contexts by transferring a little nonbonded work from the context that
// finished last to the one that finished first. // finished last to the one that finished first.
if (cu.getComputeForceCount() < 200) { if (loadBalance) {
int firstIndex = 0, lastIndex = 0; int firstIndex = 0, lastIndex = 0;
for (int i = 0; i < (int) completionTimes.size(); i++) { for (int i = 0; i < (int) completionTimes.size(); i++) {
if (completionTimes[i] < completionTimes[firstIndex]) if (completionTimes[i] < completionTimes[firstIndex])
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2011-2019 Stanford University and the Authors. * * Portions copyright (c) 2011-2023 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -133,8 +133,11 @@ OpenCLParallelCalcForcesAndEnergyKernel::~OpenCLParallelCalcForcesAndEnergyKerne ...@@ -133,8 +133,11 @@ OpenCLParallelCalcForcesAndEnergyKernel::~OpenCLParallelCalcForcesAndEnergyKerne
void OpenCLParallelCalcForcesAndEnergyKernel::initialize(const System& system) { void OpenCLParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
for (int i = 0; i < (int) kernels.size(); i++) for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).initialize(system); getKernel(i).initialize(system);
for (int i = 0; i < (int) contextNonbondedFractions.size(); i++) for (int i = 0; i < contextNonbondedFractions.size(); i++) {
contextNonbondedFractions[i] = 1/(double) contextNonbondedFractions.size(); double x0 = i/(double) contextNonbondedFractions.size();
double x1 = (i+1)/(double) contextNonbondedFractions.size();
contextNonbondedFractions[i] = x1*x1 - x0*x0;
}
} }
void OpenCLParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) { void OpenCLParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
...@@ -184,7 +187,7 @@ double OpenCLParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& c ...@@ -184,7 +187,7 @@ double OpenCLParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& c
// Balance work between the contexts by transferring a little nonbonded work from the context that // Balance work between the contexts by transferring a little nonbonded work from the context that
// finished last to the one that finished first. // finished last to the one that finished first.
if (cl.getComputeForceCount() < 200) { if (cl.getComputeForceCount() < 200 || cl.getComputeForceCount()%30 == 0) {
int firstIndex = 0, lastIndex = 0; int firstIndex = 0, lastIndex = 0;
for (int i = 0; i < (int) completionTimes.size(); i++) { for (int i = 0; i < (int) completionTimes.size(); i++) {
if (completionTimes[i] < completionTimes[firstIndex]) if (completionTimes[i] < completionTimes[firstIndex])
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment