Commit 8f12d8fc authored by peastman
Browse files

Further improvements to multi-GPU scaling

parent 67a3b4c0
...@@ -98,6 +98,12 @@ public: ...@@ -98,6 +98,12 @@ public:
// Execute the kernel, then download forces. // Execute the kernel, then download forces.
energy += kernel.finishComputation(context, includeForce, includeEnergy, groups); energy += kernel.finishComputation(context, includeForce, includeEnergy, groups);
if (cu.getComputeForceCount() < 200) {
// Record timing information for load balancing. Since this takes time, only do it at the start of the simulation.
CHECK_RESULT(cuCtxSynchronize(), "Error synchronizing CUDA context");
completionTime = getTime();
}
if (includeForce) { if (includeForce) {
if (cu.getContextIndex() > 0) { if (cu.getContextIndex() > 0) {
int numAtoms = cu.getPaddedNumAtoms(); int numAtoms = cu.getPaddedNumAtoms();
...@@ -110,13 +116,8 @@ public: ...@@ -110,13 +116,8 @@ public:
else else
cu.getForce().download(&pinnedMemory[(cu.getContextIndex()-1)*numAtoms*3]); cu.getForce().download(&pinnedMemory[(cu.getContextIndex()-1)*numAtoms*3]);
} }
else {
// In principle this should make the load balancing more accurate, but in practice it just seems to make things slower.
//CHECK_RESULT(cuCtxSynchronize(), "Error synchronizing CUDA context");
} }
} }
completionTime = getTime();
}
private: private:
ContextImpl& context; ContextImpl& context;
CudaContext& cu; CudaContext& cu;
...@@ -192,6 +193,7 @@ void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& contex ...@@ -192,6 +193,7 @@ void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& contex
} }
} }
#include <cstdio>
double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) { double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
for (int i = 0; i < (int) data.contexts.size(); i++) { for (int i = 0; i < (int) data.contexts.size(); i++) {
CudaContext& cu = *data.contexts[i]; CudaContext& cu = *data.contexts[i];
...@@ -216,6 +218,7 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con ...@@ -216,6 +218,7 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
// Balance work between the contexts by transferring a little nonbonded work from the context that // Balance work between the contexts by transferring a little nonbonded work from the context that
// finished last to the one that finished first. // finished last to the one that finished first.
if (cu.getComputeForceCount() < 200) {
int firstIndex = 0, lastIndex = 0; int firstIndex = 0, lastIndex = 0;
for (int i = 0; i < (int) completionTimes.size(); i++) { for (int i = 0; i < (int) completionTimes.size(); i++) {
if (completionTimes[i] < completionTimes[firstIndex]) if (completionTimes[i] < completionTimes[firstIndex])
...@@ -223,7 +226,7 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con ...@@ -223,7 +226,7 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
if (completionTimes[i] > completionTimes[lastIndex]) if (completionTimes[i] > completionTimes[lastIndex])
lastIndex = i; lastIndex = i;
} }
double fractionToTransfer = min(0.001, contextNonbondedFractions[lastIndex]); double fractionToTransfer = min(0.01, contextNonbondedFractions[lastIndex]);
contextNonbondedFractions[firstIndex] += fractionToTransfer; contextNonbondedFractions[firstIndex] += fractionToTransfer;
contextNonbondedFractions[lastIndex] -= fractionToTransfer; contextNonbondedFractions[lastIndex] -= fractionToTransfer;
double startFraction = 0.0; double startFraction = 0.0;
...@@ -235,6 +238,7 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con ...@@ -235,6 +238,7 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
startFraction = endFraction; startFraction = endFraction;
} }
} }
}
return energy; return energy;
} }
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment