Commit 832e7f04 authored by peastman
Browse files

CUDA platform does PME on a separate stream

parent b8bae04c
......@@ -133,6 +133,18 @@ public:
int getContextIndex() const {
return contextIndex;
}
/**
* Get the stream currently being used for execution.
*
* This is the stream that executeKernel() launches kernels on, and the one
* CudaArray passes to the driver for its non-blocking upload()/download()
* and for copyTo().  A value of 0 denotes the default stream, which is what
* the context uses until setCurrentStream() is called.
*
* @return the stream handle most recently selected for this context
*/
CUstream getCurrentStream();
/**
* Set the stream to use for execution.
*
* The handle is simply recorded; no synchronization with the previously
* selected stream is performed by this call.  The selection stays in effect
* until it is changed again or restoreDefaultStream() is called.
*
* @param stream    the stream on which subsequent work should be issued
*/
void setCurrentStream(CUstream stream);
/**
* Reset the context to using the default stream for execution.
*
* Equivalent to calling setCurrentStream(0).  Code that temporarily
* switches streams is expected to call this once it has finished issuing
* work on the other stream.
*/
void restoreDefaultStream();
/**
* Get the array which contains the position (the xyz components) and charge (the w component) of each atom.
*/
......@@ -521,6 +533,7 @@ private:
std::map<std::string, std::string> compilationDefines;
CUcontext context;
CUdevice device;
CUstream currentStream;
CUfunction clearBufferKernel;
CUfunction clearTwoBuffersKernel;
CUfunction clearThreeBuffersKernel;
......
......@@ -599,6 +599,8 @@ private:
class PmeIO;
class PmePreComputation;
class PmePostComputation;
class SyncStreamPreComputation;
class SyncStreamPostComputation;
CudaContext& cu;
bool hasInitializedFFT;
CudaArray* sigmaEpsilon;
......@@ -614,6 +616,8 @@ private:
CudaSort* sort;
Kernel cpuPme;
PmeIO* pmeio;
CUstream pmeStream;
CUevent pmeSyncEvent;
cufftHandle fftForward;
cufftHandle fftBackward;
CUfunction ewaldSumsKernel;
......
......@@ -58,7 +58,7 @@ void CudaArray::upload(const void* data, bool blocking) {
if (blocking)
result = cuMemcpyHtoD(pointer, data, size*elementSize);
else
result = cuMemcpyHtoDAsync(pointer, data, size*elementSize, 0);
result = cuMemcpyHtoDAsync(pointer, data, size*elementSize, context.getCurrentStream());
if (result != CUDA_SUCCESS) {
std::stringstream str;
str<<"Error uploading array "<<name<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";
......@@ -71,7 +71,7 @@ void CudaArray::download(void* data, bool blocking) const {
if (blocking)
result = cuMemcpyDtoH(data, pointer, size*elementSize);
else
result = cuMemcpyDtoHAsync(data, pointer, size*elementSize, 0);
result = cuMemcpyDtoHAsync(data, pointer, size*elementSize, context.getCurrentStream());
if (result != CUDA_SUCCESS) {
std::stringstream str;
str<<"Error downloading array "<<name<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";
......@@ -82,7 +82,7 @@ void CudaArray::download(void* data, bool blocking) const {
void CudaArray::copyTo(CudaArray& dest) const {
if (dest.getSize() != size || dest.getElementSize() != elementSize)
throw OpenMMException("Error copying array "+name+" to "+dest.getName()+": The destination array does not match the size of the array");
CUresult result = cuMemcpyDtoDAsync(dest.getDevicePointer(), pointer, size*elementSize, 0);
CUresult result = cuMemcpyDtoDAsync(dest.getDevicePointer(), pointer, size*elementSize, context.getCurrentStream());
if (result != CUDA_SUCCESS) {
std::stringstream str;
str<<"Error copying array "<<name<<" to "<<dest.getName()<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";
......
......@@ -72,7 +72,7 @@ const int CudaContext::TileSize = sizeof(tileflags)*8;
bool CudaContext::hasInitializedCuda = false;
CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler,
const string& tempDir, const std::string& hostCompiler, CudaPlatform::PlatformData& platformData) : system(system),
const string& tempDir, const std::string& hostCompiler, CudaPlatform::PlatformData& platformData) : system(system), currentStream(0),
time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), stepsSinceReorder(99999), contextIsValid(false), atomsWereReordered(false), pinnedBuffer(NULL), posq(NULL),
posqCorrection(NULL), velm(NULL), force(NULL), energyBuffer(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
this->compiler = "\""+compiler+"\"";
......@@ -507,6 +507,18 @@ CUfunction CudaContext::getKernel(CUmodule& module, const string& name) {
return function;
}
CUstream CudaContext::getCurrentStream() {
    // Report whichever stream was most recently selected for this context.
    return this->currentStream;
}
void CudaContext::setCurrentStream(CUstream stream) {
    // Only the handle is recorded here; nothing is synchronized.
    this->currentStream = stream;
}
void CudaContext::restoreDefaultStream() {
setCurrentStream(0);
}
string CudaContext::doubleToString(double value) {
stringstream s;
s.precision(useDoublePrecision ? 16 : 8);
......@@ -575,7 +587,7 @@ void CudaContext::executeKernel(CUfunction kernel, void** arguments, int threads
if (blockSize == -1)
blockSize = ThreadBlockSize;
int gridSize = std::min((threads+blockSize-1)/blockSize, numThreadBlocks);
CUresult result = cuLaunchKernel(kernel, gridSize, 1, 1, blockSize, 1, 1, sharedSize, 0, arguments, NULL);
CUresult result = cuLaunchKernel(kernel, gridSize, 1, 1, blockSize, 1, 1, sharedSize, currentStream, arguments, NULL);
if (result != CUDA_SUCCESS) {
stringstream str;
str<<"Error invoking kernel: "<<getErrorString(result)<<" ("<<result<<")";
......
......@@ -1398,6 +1398,31 @@ private:
CalcPmeReciprocalForceKernel::IO& io;
};
/**
 * Pre-computation step that makes a secondary stream wait for everything
 * already queued on the default stream (stream 0).
 */
class CudaCalcNonbondedForceKernel::SyncStreamPreComputation : public CudaContext::ForcePreComputation {
public:
    /**
     * @param stream    the stream that must wait for the default stream
     * @param event     the event used to mark the synchronization point
     */
    SyncStreamPreComputation(CUstream stream, CUevent event) : waitingStream(stream), syncEvent(event) {
    }
    /**
     * Record the event on stream 0, then make the secondary stream wait on it.
     * The arguments are not used.
     */
    void computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
        const CUstream defaultStream = 0;
        // NOTE(review): the CUresult values of both driver calls are discarded.
        cuEventRecord(syncEvent, defaultStream);
        cuStreamWaitEvent(waitingStream, syncEvent, 0);
    }
private:
    CUstream waitingStream;
    CUevent syncEvent;
};
/**
 * Post-computation step that makes the default stream (stream 0) wait on the
 * synchronization event before it continues.
 */
class CudaCalcNonbondedForceKernel::SyncStreamPostComputation : public CudaContext::ForcePostComputation {
public:
    /**
     * @param event     the event the default stream should wait on
     */
    SyncStreamPostComputation(CUevent event) : syncEvent(event) {
    }
    /**
     * Queue a wait for the event on stream 0.  The arguments are not used.
     *
     * @return always 0.0; this step contributes no energy
     */
    double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
        const CUstream defaultStream = 0;
        // NOTE(review): the CUresult of the driver call is discarded.
        cuStreamWaitEvent(defaultStream, syncEvent, 0);
        const double energyContribution = 0.0;
        return energyContribution;
    }
private:
    CUevent syncEvent;
};
CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
cu.setAsCurrent();
if (sigmaEpsilon != NULL)
......@@ -1427,6 +1452,8 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
if (hasInitializedFFT) {
cufftDestroy(fftForward);
cufftDestroy(fftBackward);
cuStreamDestroy(pmeStream);
cuEventDestroy(pmeSyncEvent);
}
}
......@@ -1636,6 +1663,14 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
cufftSetCompatibilityMode(fftForward, CUFFT_COMPATIBILITY_NATIVE);
cufftSetCompatibilityMode(fftBackward, CUFFT_COMPATIBILITY_NATIVE);
// Prepare for doing PME on its own stream.
cuStreamCreate(&pmeStream, CU_STREAM_NON_BLOCKING);
cufftSetStream(fftForward, pmeStream);
cufftSetStream(fftBackward, pmeStream);
CHECK_RESULT(cuEventCreate(&pmeSyncEvent, CU_EVENT_DISABLE_TIMING), "Error creating event for NonbondedForce");
cu.addPreComputation(new SyncStreamPreComputation(pmeStream, pmeSyncEvent));
cu.addPostComputation(new SyncStreamPostComputation(pmeSyncEvent));
hasInitializedFFT = true;
// Initialize the b-spline moduli.
......@@ -1752,6 +1787,7 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
cu.executeKernel(ewaldForcesKernel, forcesArgs, cu.getNumAtoms());
}
if (directPmeGrid != NULL && includeReciprocal) {
cu.setCurrentStream(pmeStream);
void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
cu.executeKernel(pmeGridIndexKernel, gridIndexArgs, cu.getNumAtoms());
......@@ -1788,7 +1824,8 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &directPmeGrid->getDevicePointer(),
cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer(), &pmeAtomGridIndex->getDevicePointer()};
cu.executeKernel(pmeInterpolateForceKernel, interpolateArgs, cu.getNumAtoms(), 128);
cuEventRecord(pmeSyncEvent, pmeStream);
cu.restoreDefaultStream();
}
double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0);
if (dispersionCoefficient != 0.0 && includeDirect) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment