Commit 55bae85f authored by peastman's avatar peastman
Browse files

Merge pull request #668 from peastman/multi

Optimizations to multi-GPU calculations
parents e19cefde 390e0a6b
...@@ -85,6 +85,7 @@ private: ...@@ -85,6 +85,7 @@ private:
void* pinnedPositionBuffer; void* pinnedPositionBuffer;
long long* pinnedForceBuffer; long long* pinnedForceBuffer;
CUfunction sumKernel; CUfunction sumKernel;
CUevent event;
}; };
/** /**
......
...@@ -154,6 +154,16 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking ...@@ -154,6 +154,16 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
CHECK_RESULT(cuCtxCreate(&context, flags, device)); CHECK_RESULT(cuCtxCreate(&context, flags, device));
contextIsValid = true; contextIsValid = true;
CHECK_RESULT(cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_SHARED)); CHECK_RESULT(cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_SHARED));
if (contextIndex > 0) {
int canAccess;
cuDeviceCanAccessPeer(&canAccess, getDevice(), platformData.contexts[0]->getDevice());
if (canAccess) {
platformData.contexts[0]->setAsCurrent();
CHECK_RESULT(cuCtxEnablePeerAccess(getContext(), 0));
setAsCurrent();
CHECK_RESULT(cuCtxEnablePeerAccess(platformData.contexts[0]->getContext(), 0));
}
}
numAtoms = system.getNumParticles(); numAtoms = system.getNumParticles();
paddedNumAtoms = TileSize*((numAtoms+TileSize-1)/TileSize); paddedNumAtoms = TileSize*((numAtoms+TileSize-1)/TileSize);
numAtomBlocks = (paddedNumAtoms+(TileSize-1))/TileSize; numAtomBlocks = (paddedNumAtoms+(TileSize-1))/TileSize;
......
...@@ -63,22 +63,24 @@ if (result != CUDA_SUCCESS) { \ ...@@ -63,22 +63,24 @@ if (result != CUDA_SUCCESS) { \
class CudaParallelCalcForcesAndEnergyKernel::BeginComputationTask : public CudaContext::WorkTask { class CudaParallelCalcForcesAndEnergyKernel::BeginComputationTask : public CudaContext::WorkTask {
public: public:
BeginComputationTask(ContextImpl& context, CudaContext& cu, CudaCalcForcesAndEnergyKernel& kernel, BeginComputationTask(ContextImpl& context, CudaContext& cu, CudaCalcForcesAndEnergyKernel& kernel,
bool includeForce, bool includeEnergy, int groups, void* pinnedMemory) : context(context), cu(cu), kernel(kernel), bool includeForce, bool includeEnergy, int groups, void* pinnedMemory, CUevent event) : context(context), cu(cu), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), pinnedMemory(pinnedMemory) { includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), pinnedMemory(pinnedMemory), event(event) {
} }
void execute() { void execute() {
// Copy coordinates over to this device and execute the kernel. // Copy coordinates over to this device and execute the kernel.
cu.setAsCurrent(); cu.setAsCurrent();
if (cu.getContextIndex() > 0) { if (cu.getContextIndex() > 0) {
if (cu.getPlatformData().peerAccessSupported && cu.getPlatformData().contexts.size() < 3) { if (cu.getPlatformData().peerAccessSupported && false) { // Why is the peer-to-peer copy slower???
CudaContext& context0 = *cu.getPlatformData().contexts[0]; CudaContext& context0 = *cu.getPlatformData().contexts[0];
int numBytes = cu.getPosq().getSize()*cu.getPosq().getElementSize(); int numBytes = cu.getPosq().getSize()*cu.getPosq().getElementSize();
CHECK_RESULT(cuMemcpyPeerAsync(cu.getPosq().getDevicePointer(), cu.getContext(), context0.getPosq().getDevicePointer(), context0.getContext(), numBytes, 0), "Error copying positions"); CHECK_RESULT(cuMemcpyAsync(cu.getPosq().getDevicePointer(), context0.getPosq().getDevicePointer(), numBytes, 0), "Error copying positions");
} }
else else {
cuStreamWaitEvent(cu.getCurrentStream(), event, 0);
cu.getPosq().upload(pinnedMemory, false); cu.getPosq().upload(pinnedMemory, false);
} }
}
kernel.beginComputation(context, includeForce, includeEnergy, groups); kernel.beginComputation(context, includeForce, includeEnergy, groups);
} }
private: private:
...@@ -88,6 +90,7 @@ private: ...@@ -88,6 +90,7 @@ private:
bool includeForce, includeEnergy; bool includeForce, includeEnergy;
int groups; int groups;
void* pinnedMemory; void* pinnedMemory;
CUevent event;
}; };
class CudaParallelCalcForcesAndEnergyKernel::FinishComputationTask : public CudaContext::WorkTask { class CudaParallelCalcForcesAndEnergyKernel::FinishComputationTask : public CudaContext::WorkTask {
...@@ -108,7 +111,7 @@ public: ...@@ -108,7 +111,7 @@ public:
int numBytes = numAtoms*3*sizeof(long long); int numBytes = numAtoms*3*sizeof(long long);
int offset = (cu.getContextIndex()-1)*numBytes; int offset = (cu.getContextIndex()-1)*numBytes;
CudaContext& context0 = *cu.getPlatformData().contexts[0]; CudaContext& context0 = *cu.getPlatformData().contexts[0];
CHECK_RESULT(cuMemcpyPeer(contextForces.getDevicePointer()+offset, context0.getContext(), cu.getForce().getDevicePointer(), cu.getContext(), numBytes), "Error copying forces"); CHECK_RESULT(cuMemcpy(contextForces.getDevicePointer()+offset, cu.getForce().getDevicePointer(), numBytes), "Error copying forces");
} }
else else
cu.getForce().download(&pinnedMemory[(cu.getContextIndex()-1)*numAtoms*3]); cu.getForce().download(&pinnedMemory[(cu.getContextIndex()-1)*numAtoms*3]);
...@@ -146,6 +149,7 @@ CudaParallelCalcForcesAndEnergyKernel::~CudaParallelCalcForcesAndEnergyKernel() ...@@ -146,6 +149,7 @@ CudaParallelCalcForcesAndEnergyKernel::~CudaParallelCalcForcesAndEnergyKernel()
cuMemFreeHost(pinnedPositionBuffer); cuMemFreeHost(pinnedPositionBuffer);
if (pinnedForceBuffer != NULL) if (pinnedForceBuffer != NULL)
cuMemFreeHost(pinnedForceBuffer); cuMemFreeHost(pinnedForceBuffer);
cuEventDestroy(event);
} }
void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) { void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
...@@ -157,6 +161,7 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) { ...@@ -157,6 +161,7 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
getKernel(i).initialize(system); getKernel(i).initialize(system);
for (int i = 0; i < (int) contextNonbondedFractions.size(); i++) for (int i = 0; i < (int) contextNonbondedFractions.size(); i++)
contextNonbondedFractions[i] = 1/(double) contextNonbondedFractions.size(); contextNonbondedFractions[i] = 1/(double) contextNonbondedFractions.size();
CHECK_RESULT(cuEventCreate(&event, 0), "Error creating event");
} }
void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) { void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
...@@ -170,13 +175,15 @@ void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& contex ...@@ -170,13 +175,15 @@ void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& contex
// Copy coordinates over to each device and execute the kernel. // Copy coordinates over to each device and execute the kernel.
if (!(cu.getPlatformData().peerAccessSupported && cu.getPlatformData().contexts.size() < 3)) if (!(cu.getPlatformData().peerAccessSupported && false)) { // Why is this faster than a peer-to-peer copy???
cu.getPosq().download(pinnedPositionBuffer); cu.getPosq().download(pinnedPositionBuffer, false);
cuEventRecord(event, cu.getCurrentStream());
}
for (int i = 0; i < (int) data.contexts.size(); i++) { for (int i = 0; i < (int) data.contexts.size(); i++) {
data.contextEnergy[i] = 0.0; data.contextEnergy[i] = 0.0;
CudaContext& cu = *data.contexts[i]; CudaContext& cu = *data.contexts[i];
CudaContext::WorkThread& thread = cu.getWorkThread(); CudaContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new BeginComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, pinnedPositionBuffer)); thread.addTask(new BeginComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, pinnedPositionBuffer, event));
} }
} }
......
...@@ -229,22 +229,13 @@ CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& sys ...@@ -229,22 +229,13 @@ CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& sys
// Determine whether peer-to-peer copying is supported, and enable it if so. // Determine whether peer-to-peer copying is supported, and enable it if so.
peerAccessSupported = false; // Disable until I figure out why it usually makes things slower peerAccessSupported = true;
// peerAccessSupported = true;
// for (int i = 1; i < contexts.size(); i++) {
// int canAccess;
// cuDeviceCanAccessPeer(&canAccess, contexts[i]->getDevice(), contexts[0]->getDevice());
// if (!canAccess) {
// peerAccessSupported = false;
// break;
// }
// }
if (peerAccessSupported) {
for (int i = 1; i < contexts.size(); i++) { for (int i = 1; i < contexts.size(); i++) {
contexts[0]->setAsCurrent(); int canAccess;
CHECK_RESULT(cuCtxEnablePeerAccess(contexts[i]->getContext(), 0), "Error enabling peer access"); cuDeviceCanAccessPeer(&canAccess, contexts[i]->getDevice(), contexts[0]->getDevice());
contexts[i]->setAsCurrent(); if (!canAccess) {
CHECK_RESULT(cuCtxEnablePeerAccess(contexts[0]->getContext(), 0), "Error enabling peer access"); peerAccessSupported = false;
break;
} }
} }
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment