"wrappers/python/vscode:/vscode.git/clone" did not exist on "4e4db1f174ed500ce0d32fb29f73a246848aaa50"
Commit 924af739 authored by peastman
Browse files

Merge pull request #701 from peastman/multigpu

Improvements to multi-GPU performance
parents 6269454b f32c804b
...@@ -342,6 +342,8 @@ void CudaNonbondedUtilities::initialize(const System& system) { ...@@ -342,6 +342,8 @@ void CudaNonbondedUtilities::initialize(const System& system) {
void CudaNonbondedUtilities::prepareInteractions() { void CudaNonbondedUtilities::prepareInteractions() {
if (!useCutoff) if (!useCutoff)
return; return;
if (numTiles == 0)
return;
if (usePeriodic) { if (usePeriodic) {
double4 box = context.getPeriodicBoxSize(); double4 box = context.getPeriodicBoxSize();
double minAllowedSize = 1.999999*cutoff; double minAllowedSize = 1.999999*cutoff;
......
...@@ -71,12 +71,7 @@ public: ...@@ -71,12 +71,7 @@ public:
cu.setAsCurrent(); cu.setAsCurrent();
if (cu.getContextIndex() > 0) { if (cu.getContextIndex() > 0) {
if (cu.getPlatformData().peerAccessSupported && false) { // Why is the peer-to-peer copy slower??? if (!cu.getPlatformData().peerAccessSupported) {
CudaContext& context0 = *cu.getPlatformData().contexts[0];
int numBytes = cu.getPosq().getSize()*cu.getPosq().getElementSize();
CHECK_RESULT(cuMemcpyAsync(cu.getPosq().getDevicePointer(), context0.getPosq().getDevicePointer(), numBytes, 0), "Error copying positions");
}
else {
cuStreamWaitEvent(cu.getCurrentStream(), event, 0); cuStreamWaitEvent(cu.getCurrentStream(), event, 0);
cu.getPosq().upload(pinnedMemory, false); cu.getPosq().upload(pinnedMemory, false);
} }
...@@ -117,7 +112,8 @@ public: ...@@ -117,7 +112,8 @@ public:
cu.getForce().download(&pinnedMemory[(cu.getContextIndex()-1)*numAtoms*3]); cu.getForce().download(&pinnedMemory[(cu.getContextIndex()-1)*numAtoms*3]);
} }
else { else {
CHECK_RESULT(cuCtxSynchronize(), "Error synchronizing CUDA context"); // In principle this should make the load balancing more accurate, but in practice it just seems to make things slower.
//CHECK_RESULT(cuCtxSynchronize(), "Error synchronizing CUDA context");
} }
} }
completionTime = getTime(); completionTime = getTime();
...@@ -175,10 +171,18 @@ void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& contex ...@@ -175,10 +171,18 @@ void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& contex
// Copy coordinates over to each device and execute the kernel. // Copy coordinates over to each device and execute the kernel.
if (!(cu.getPlatformData().peerAccessSupported && false)) { // Why is this faster than a peer-to-peer copy??? if (!cu.getPlatformData().peerAccessSupported) {
cu.getPosq().download(pinnedPositionBuffer, false); cu.getPosq().download(pinnedPositionBuffer, false);
cuEventRecord(event, cu.getCurrentStream()); cuEventRecord(event, cu.getCurrentStream());
} }
else {
int numBytes = cu.getPosq().getSize()*cu.getPosq().getElementSize();
for (int i = 1; i < (int) data.contexts.size(); i++) {
data.contexts[i]->setAsCurrent();
CHECK_RESULT(cuMemcpyAsync(data.contexts[i]->getPosq().getDevicePointer(), cu.getPosq().getDevicePointer(), numBytes, 0), "Error copying positions");
}
cu.setAsCurrent();
}
for (int i = 0; i < (int) data.contexts.size(); i++) { for (int i = 0; i < (int) data.contexts.size(); i++) {
data.contextEnergy[i] = 0.0; data.contextEnergy[i] = 0.0;
CudaContext& cu = *data.contexts[i]; CudaContext& cu = *data.contexts[i];
......
...@@ -386,6 +386,8 @@ static void setInvPeriodicBoxSizeArg(OpenCLContext& cl, cl::Kernel& kernel, int ...@@ -386,6 +386,8 @@ static void setInvPeriodicBoxSizeArg(OpenCLContext& cl, cl::Kernel& kernel, int
void OpenCLNonbondedUtilities::prepareInteractions() { void OpenCLNonbondedUtilities::prepareInteractions() {
if (!useCutoff) if (!useCutoff)
return; return;
if (numTiles == 0)
return;
if (usePeriodic) { if (usePeriodic) {
mm_float4 box = context.getPeriodicBoxSize(); mm_float4 box = context.getPeriodicBoxSize();
double minAllowedSize = 1.999999*cutoff; double minAllowedSize = 1.999999*cutoff;
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment