Commit 8f12d8fc authored by peastman
Browse files

Further improvements to multi-GPU scaling

parent 67a3b4c0
......@@ -98,6 +98,12 @@ public:
// Execute the kernel, then download forces.
energy += kernel.finishComputation(context, includeForce, includeEnergy, groups);
if (cu.getComputeForceCount() < 200) {
// Record timing information for load balancing. Since this takes time, only do it at the start of the simulation.
CHECK_RESULT(cuCtxSynchronize(), "Error synchronizing CUDA context");
completionTime = getTime();
}
if (includeForce) {
if (cu.getContextIndex() > 0) {
int numAtoms = cu.getPaddedNumAtoms();
......@@ -110,13 +116,8 @@ public:
else
cu.getForce().download(&pinnedMemory[(cu.getContextIndex()-1)*numAtoms*3]);
}
else {
// In principle this should make the load balancing more accurate, but in practice it just seems to make things slower.
//CHECK_RESULT(cuCtxSynchronize(), "Error synchronizing CUDA context");
}
}
completionTime = getTime();
}
private:
ContextImpl& context;
CudaContext& cu;
......@@ -192,6 +193,7 @@ void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& contex
}
}
#include <cstdio>
double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
CudaContext& cu = *data.contexts[i];
......@@ -216,6 +218,7 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
// Balance work between the contexts by transferring a little nonbonded work from the context that
// finished last to the one that finished first.
if (cu.getComputeForceCount() < 200) {
int firstIndex = 0, lastIndex = 0;
for (int i = 0; i < (int) completionTimes.size(); i++) {
if (completionTimes[i] < completionTimes[firstIndex])
......@@ -223,7 +226,7 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
if (completionTimes[i] > completionTimes[lastIndex])
lastIndex = i;
}
double fractionToTransfer = min(0.001, contextNonbondedFractions[lastIndex]);
double fractionToTransfer = min(0.01, contextNonbondedFractions[lastIndex]);
contextNonbondedFractions[firstIndex] += fractionToTransfer;
contextNonbondedFractions[lastIndex] -= fractionToTransfer;
double startFraction = 0.0;
......@@ -235,6 +238,7 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
startFraction = endFraction;
}
}
}
return energy;
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment