Commit 8b69b9c0 authored by peastman's avatar peastman
Browse files

Merge pull request #647 from peastman/stream

Compute PME on a separate stream
parents b8bae04c 19841146
...@@ -133,6 +133,18 @@ public: ...@@ -133,6 +133,18 @@ public:
int getContextIndex() const { int getContextIndex() const {
return contextIndex; return contextIndex;
} }
/**
* Get the stream currently being used for execution.
*/
CUstream getCurrentStream();
/**
* Set the stream to use for execution.
*/
void setCurrentStream(CUstream stream);
/**
* Reset the context to using the default stream for execution.
*/
void restoreDefaultStream();
/** /**
* Get the array which contains the position (the xyz components) and charge (the w component) of each atom. * Get the array which contains the position (the xyz components) and charge (the w component) of each atom.
*/ */
...@@ -521,6 +533,7 @@ private: ...@@ -521,6 +533,7 @@ private:
std::map<std::string, std::string> compilationDefines; std::map<std::string, std::string> compilationDefines;
CUcontext context; CUcontext context;
CUdevice device; CUdevice device;
CUstream currentStream;
CUfunction clearBufferKernel; CUfunction clearBufferKernel;
CUfunction clearTwoBuffersKernel; CUfunction clearTwoBuffersKernel;
CUfunction clearThreeBuffersKernel; CUfunction clearThreeBuffersKernel;
......
...@@ -599,6 +599,8 @@ private: ...@@ -599,6 +599,8 @@ private:
class PmeIO; class PmeIO;
class PmePreComputation; class PmePreComputation;
class PmePostComputation; class PmePostComputation;
class SyncStreamPreComputation;
class SyncStreamPostComputation;
CudaContext& cu; CudaContext& cu;
bool hasInitializedFFT; bool hasInitializedFFT;
CudaArray* sigmaEpsilon; CudaArray* sigmaEpsilon;
...@@ -614,6 +616,8 @@ private: ...@@ -614,6 +616,8 @@ private:
CudaSort* sort; CudaSort* sort;
Kernel cpuPme; Kernel cpuPme;
PmeIO* pmeio; PmeIO* pmeio;
CUstream pmeStream;
CUevent pmeSyncEvent;
cufftHandle fftForward; cufftHandle fftForward;
cufftHandle fftBackward; cufftHandle fftBackward;
CUfunction ewaldSumsKernel; CUfunction ewaldSumsKernel;
......
...@@ -58,7 +58,7 @@ void CudaArray::upload(const void* data, bool blocking) { ...@@ -58,7 +58,7 @@ void CudaArray::upload(const void* data, bool blocking) {
if (blocking) if (blocking)
result = cuMemcpyHtoD(pointer, data, size*elementSize); result = cuMemcpyHtoD(pointer, data, size*elementSize);
else else
result = cuMemcpyHtoDAsync(pointer, data, size*elementSize, 0); result = cuMemcpyHtoDAsync(pointer, data, size*elementSize, context.getCurrentStream());
if (result != CUDA_SUCCESS) { if (result != CUDA_SUCCESS) {
std::stringstream str; std::stringstream str;
str<<"Error uploading array "<<name<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")"; str<<"Error uploading array "<<name<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";
...@@ -71,7 +71,7 @@ void CudaArray::download(void* data, bool blocking) const { ...@@ -71,7 +71,7 @@ void CudaArray::download(void* data, bool blocking) const {
if (blocking) if (blocking)
result = cuMemcpyDtoH(data, pointer, size*elementSize); result = cuMemcpyDtoH(data, pointer, size*elementSize);
else else
result = cuMemcpyDtoHAsync(data, pointer, size*elementSize, 0); result = cuMemcpyDtoHAsync(data, pointer, size*elementSize, context.getCurrentStream());
if (result != CUDA_SUCCESS) { if (result != CUDA_SUCCESS) {
std::stringstream str; std::stringstream str;
str<<"Error downloading array "<<name<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")"; str<<"Error downloading array "<<name<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";
...@@ -82,7 +82,7 @@ void CudaArray::download(void* data, bool blocking) const { ...@@ -82,7 +82,7 @@ void CudaArray::download(void* data, bool blocking) const {
void CudaArray::copyTo(CudaArray& dest) const { void CudaArray::copyTo(CudaArray& dest) const {
if (dest.getSize() != size || dest.getElementSize() != elementSize) if (dest.getSize() != size || dest.getElementSize() != elementSize)
throw OpenMMException("Error copying array "+name+" to "+dest.getName()+": The destination array does not match the size of the array"); throw OpenMMException("Error copying array "+name+" to "+dest.getName()+": The destination array does not match the size of the array");
CUresult result = cuMemcpyDtoDAsync(dest.getDevicePointer(), pointer, size*elementSize, 0); CUresult result = cuMemcpyDtoDAsync(dest.getDevicePointer(), pointer, size*elementSize, context.getCurrentStream());
if (result != CUDA_SUCCESS) { if (result != CUDA_SUCCESS) {
std::stringstream str; std::stringstream str;
str<<"Error copying array "<<name<<" to "<<dest.getName()<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")"; str<<"Error copying array "<<name<<" to "<<dest.getName()<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";
......
...@@ -72,7 +72,7 @@ const int CudaContext::TileSize = sizeof(tileflags)*8; ...@@ -72,7 +72,7 @@ const int CudaContext::TileSize = sizeof(tileflags)*8;
bool CudaContext::hasInitializedCuda = false; bool CudaContext::hasInitializedCuda = false;
CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler, CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler,
const string& tempDir, const std::string& hostCompiler, CudaPlatform::PlatformData& platformData) : system(system), const string& tempDir, const std::string& hostCompiler, CudaPlatform::PlatformData& platformData) : system(system), currentStream(0),
time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), stepsSinceReorder(99999), contextIsValid(false), atomsWereReordered(false), pinnedBuffer(NULL), posq(NULL), time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), stepsSinceReorder(99999), contextIsValid(false), atomsWereReordered(false), pinnedBuffer(NULL), posq(NULL),
posqCorrection(NULL), velm(NULL), force(NULL), energyBuffer(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) { posqCorrection(NULL), velm(NULL), force(NULL), energyBuffer(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
this->compiler = "\""+compiler+"\""; this->compiler = "\""+compiler+"\"";
...@@ -507,6 +507,18 @@ CUfunction CudaContext::getKernel(CUmodule& module, const string& name) { ...@@ -507,6 +507,18 @@ CUfunction CudaContext::getKernel(CUmodule& module, const string& name) {
return function; return function;
} }
CUstream CudaContext::getCurrentStream() {
return currentStream;
}
void CudaContext::setCurrentStream(CUstream stream) {
currentStream = stream;
}
void CudaContext::restoreDefaultStream() {
setCurrentStream(0);
}
string CudaContext::doubleToString(double value) { string CudaContext::doubleToString(double value) {
stringstream s; stringstream s;
s.precision(useDoublePrecision ? 16 : 8); s.precision(useDoublePrecision ? 16 : 8);
...@@ -575,7 +587,7 @@ void CudaContext::executeKernel(CUfunction kernel, void** arguments, int threads ...@@ -575,7 +587,7 @@ void CudaContext::executeKernel(CUfunction kernel, void** arguments, int threads
if (blockSize == -1) if (blockSize == -1)
blockSize = ThreadBlockSize; blockSize = ThreadBlockSize;
int gridSize = std::min((threads+blockSize-1)/blockSize, numThreadBlocks); int gridSize = std::min((threads+blockSize-1)/blockSize, numThreadBlocks);
CUresult result = cuLaunchKernel(kernel, gridSize, 1, 1, blockSize, 1, 1, sharedSize, 0, arguments, NULL); CUresult result = cuLaunchKernel(kernel, gridSize, 1, 1, blockSize, 1, 1, sharedSize, currentStream, arguments, NULL);
if (result != CUDA_SUCCESS) { if (result != CUDA_SUCCESS) {
stringstream str; stringstream str;
str<<"Error invoking kernel: "<<getErrorString(result)<<" ("<<result<<")"; str<<"Error invoking kernel: "<<getErrorString(result)<<" ("<<result<<")";
......
...@@ -1398,6 +1398,36 @@ private: ...@@ -1398,6 +1398,36 @@ private:
CalcPmeReciprocalForceKernel::IO& io; CalcPmeReciprocalForceKernel::IO& io;
}; };
class CudaCalcNonbondedForceKernel::SyncStreamPreComputation : public CudaContext::ForcePreComputation {
public:
SyncStreamPreComputation(CUstream stream, CUevent event, int forceGroup) : stream(stream), event(event), forceGroup(forceGroup) {
}
void computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
if ((groups&(1<<forceGroup)) != 0) {
cuEventRecord(event, 0);
cuStreamWaitEvent(stream, event, 0);
}
}
private:
CUstream stream;
CUevent event;
int forceGroup;
};
class CudaCalcNonbondedForceKernel::SyncStreamPostComputation : public CudaContext::ForcePostComputation {
public:
SyncStreamPostComputation(CUevent event, int forceGroup) : event(event), forceGroup(forceGroup) {
}
double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
if ((groups&(1<<forceGroup)) != 0)
cuStreamWaitEvent(0, event, 0);
return 0.0;
}
private:
CUevent event;
int forceGroup;
};
CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() { CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
cu.setAsCurrent(); cu.setAsCurrent();
if (sigmaEpsilon != NULL) if (sigmaEpsilon != NULL)
...@@ -1427,6 +1457,8 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() { ...@@ -1427,6 +1457,8 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
if (hasInitializedFFT) { if (hasInitializedFFT) {
cufftDestroy(fftForward); cufftDestroy(fftForward);
cufftDestroy(fftBackward); cufftDestroy(fftBackward);
cuStreamDestroy(pmeStream);
cuEventDestroy(pmeSyncEvent);
} }
} }
...@@ -1635,7 +1667,18 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1635,7 +1667,18 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
cufftSetCompatibilityMode(fftForward, CUFFT_COMPATIBILITY_NATIVE); cufftSetCompatibilityMode(fftForward, CUFFT_COMPATIBILITY_NATIVE);
cufftSetCompatibilityMode(fftBackward, CUFFT_COMPATIBILITY_NATIVE); cufftSetCompatibilityMode(fftBackward, CUFFT_COMPATIBILITY_NATIVE);
// Prepare for doing PME on its own stream.
cuStreamCreate(&pmeStream, CU_STREAM_NON_BLOCKING);
cufftSetStream(fftForward, pmeStream);
cufftSetStream(fftBackward, pmeStream);
CHECK_RESULT(cuEventCreate(&pmeSyncEvent, CU_EVENT_DISABLE_TIMING), "Error creating event for NonbondedForce");
int recipForceGroup = force.getReciprocalSpaceForceGroup();
if (recipForceGroup < 0)
recipForceGroup = force.getForceGroup();
cu.addPreComputation(new SyncStreamPreComputation(pmeStream, pmeSyncEvent, recipForceGroup));
cu.addPostComputation(new SyncStreamPostComputation(pmeSyncEvent, recipForceGroup));
hasInitializedFFT = true; hasInitializedFFT = true;
// Initialize the b-spline moduli. // Initialize the b-spline moduli.
...@@ -1752,6 +1795,7 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF ...@@ -1752,6 +1795,7 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
cu.executeKernel(ewaldForcesKernel, forcesArgs, cu.getNumAtoms()); cu.executeKernel(ewaldForcesKernel, forcesArgs, cu.getNumAtoms());
} }
if (directPmeGrid != NULL && includeReciprocal) { if (directPmeGrid != NULL && includeReciprocal) {
cu.setCurrentStream(pmeStream);
void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()}; void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
cu.executeKernel(pmeGridIndexKernel, gridIndexArgs, cu.getNumAtoms()); cu.executeKernel(pmeGridIndexKernel, gridIndexArgs, cu.getNumAtoms());
...@@ -1788,7 +1832,8 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF ...@@ -1788,7 +1832,8 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &directPmeGrid->getDevicePointer(), void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &directPmeGrid->getDevicePointer(),
cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer(), &pmeAtomGridIndex->getDevicePointer()}; cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer(), &pmeAtomGridIndex->getDevicePointer()};
cu.executeKernel(pmeInterpolateForceKernel, interpolateArgs, cu.getNumAtoms(), 128); cu.executeKernel(pmeInterpolateForceKernel, interpolateArgs, cu.getNumAtoms(), 128);
cuEventRecord(pmeSyncEvent, pmeStream);
cu.restoreDefaultStream();
} }
double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0); double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0);
if (dispersionCoefficient != 0.0 && includeDirect) { if (dispersionCoefficient != 0.0 && includeDirect) {
......
...@@ -211,11 +211,17 @@ public: ...@@ -211,11 +211,17 @@ public:
return contextIndex; return contextIndex;
} }
/** /**
* Get the cl::CommandQueue associated with this object. * Get the cl::CommandQueue currently being used for execution.
*/ */
cl::CommandQueue& getQueue() { cl::CommandQueue& getQueue();
return queue; /**
} * Set the cl::ComandQueue to use for execution.
*/
void setQueue(cl::CommandQueue& queue);
/**
* Reset the context to using the default queue for execution.
*/
void restoreDefaultQueue();
/** /**
* Get the array which contains the position (the xyz components) and charge (the w component) of each atom. * Get the array which contains the position (the xyz components) and charge (the w component) of each atom.
*/ */
...@@ -629,7 +635,7 @@ private: ...@@ -629,7 +635,7 @@ private:
std::map<std::string, std::string> compilationDefines; std::map<std::string, std::string> compilationDefines;
cl::Context context; cl::Context context;
cl::Device device; cl::Device device;
cl::CommandQueue queue; cl::CommandQueue defaultQueue, currentQueue;
cl::Kernel clearBufferKernel; cl::Kernel clearBufferKernel;
cl::Kernel clearTwoBuffersKernel; cl::Kernel clearTwoBuffersKernel;
cl::Kernel clearThreeBuffersKernel; cl::Kernel clearThreeBuffersKernel;
......
...@@ -599,6 +599,8 @@ private: ...@@ -599,6 +599,8 @@ private:
class PmeIO; class PmeIO;
class PmePreComputation; class PmePreComputation;
class PmePostComputation; class PmePostComputation;
class SyncQueuePreComputation;
class SyncQueuePostComputation;
OpenCLContext& cl; OpenCLContext& cl;
bool hasInitializedKernel; bool hasInitializedKernel;
OpenCLArray* sigmaEpsilon; OpenCLArray* sigmaEpsilon;
...@@ -613,6 +615,8 @@ private: ...@@ -613,6 +615,8 @@ private:
OpenCLArray* pmeAtomRange; OpenCLArray* pmeAtomRange;
OpenCLArray* pmeAtomGridIndex; OpenCLArray* pmeAtomGridIndex;
OpenCLSort* sort; OpenCLSort* sort;
cl::CommandQueue pmeQueue;
cl::Event pmeSyncEvent;
OpenCLFFT3D* fft; OpenCLFFT3D* fft;
Kernel cpuPme; Kernel cpuPme;
PmeIO* pmeio; PmeIO* pmeio;
......
...@@ -248,7 +248,8 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device ...@@ -248,7 +248,8 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
contextDevices.push_back(device); contextDevices.push_back(device);
cl_context_properties cprops[] = {CL_CONTEXT_PLATFORM, (cl_context_properties) platforms[bestPlatform](), 0}; cl_context_properties cprops[] = {CL_CONTEXT_PLATFORM, (cl_context_properties) platforms[bestPlatform](), 0};
context = cl::Context(contextDevices, cprops, errorCallback); context = cl::Context(contextDevices, cprops, errorCallback);
queue = cl::CommandQueue(context, device); defaultQueue = cl::CommandQueue(context, device);
currentQueue = defaultQueue;
numAtoms = system.getNumParticles(); numAtoms = system.getNumParticles();
paddedNumAtoms = TileSize*((numAtoms+TileSize-1)/TileSize); paddedNumAtoms = TileSize*((numAtoms+TileSize-1)/TileSize);
numAtomBlocks = (paddedNumAtoms+(TileSize-1))/TileSize; numAtomBlocks = (paddedNumAtoms+(TileSize-1))/TileSize;
...@@ -414,7 +415,7 @@ void OpenCLContext::initialize() { ...@@ -414,7 +415,7 @@ void OpenCLContext::initialize() {
addAutoclearBuffer(*energyBuffer); addAutoclearBuffer(*energyBuffer);
int bufferBytes = max(velm->getSize()*velm->getElementSize(), energyBuffer->getSize()*energyBuffer->getElementSize()); int bufferBytes = max(velm->getSize()*velm->getElementSize(), energyBuffer->getSize()*energyBuffer->getElementSize());
pinnedBuffer = new cl::Buffer(context, CL_MEM_ALLOC_HOST_PTR, bufferBytes); pinnedBuffer = new cl::Buffer(context, CL_MEM_ALLOC_HOST_PTR, bufferBytes);
pinnedMemory = queue.enqueueMapBuffer(*pinnedBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, bufferBytes); pinnedMemory = currentQueue.enqueueMapBuffer(*pinnedBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, bufferBytes);
for (int i = 0; i < numAtoms; i++) { for (int i = 0; i < numAtoms; i++) {
double mass = system.getParticleMass(i); double mass = system.getParticleMass(i);
if (useDoublePrecision || useMixedPrecision) if (useDoublePrecision || useMixedPrecision)
...@@ -514,6 +515,18 @@ cl::Program OpenCLContext::createProgram(const string source, const map<string, ...@@ -514,6 +515,18 @@ cl::Program OpenCLContext::createProgram(const string source, const map<string,
return program; return program;
} }
cl::CommandQueue& OpenCLContext::getQueue() {
return currentQueue;
}
void OpenCLContext::setQueue(cl::CommandQueue& queue) {
currentQueue = queue;
}
void OpenCLContext::restoreDefaultQueue() {
currentQueue = defaultQueue;
}
string OpenCLContext::doubleToString(double value) { string OpenCLContext::doubleToString(double value) {
stringstream s; stringstream s;
s.precision(useDoublePrecision ? 16 : 8); s.precision(useDoublePrecision ? 16 : 8);
...@@ -534,7 +547,7 @@ void OpenCLContext::executeKernel(cl::Kernel& kernel, int workUnits, int blockSi ...@@ -534,7 +547,7 @@ void OpenCLContext::executeKernel(cl::Kernel& kernel, int workUnits, int blockSi
blockSize = ThreadBlockSize; blockSize = ThreadBlockSize;
int size = std::min((workUnits+blockSize-1)/blockSize, numThreadBlocks)*blockSize; int size = std::min((workUnits+blockSize-1)/blockSize, numThreadBlocks)*blockSize;
try { try {
queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(size), cl::NDRange(blockSize)); currentQueue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(size), cl::NDRange(blockSize));
} }
catch (cl::Error err) { catch (cl::Error err) {
stringstream str; stringstream str;
......
...@@ -1384,6 +1384,41 @@ private: ...@@ -1384,6 +1384,41 @@ private:
CalcPmeReciprocalForceKernel::IO& io; CalcPmeReciprocalForceKernel::IO& io;
}; };
class OpenCLCalcNonbondedForceKernel::SyncQueuePreComputation : public OpenCLContext::ForcePreComputation {
public:
SyncQueuePreComputation(OpenCLContext& cl, cl::CommandQueue queue, int forceGroup) : cl(cl), queue(queue), events(1), forceGroup(forceGroup) {
}
void computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
if ((groups&(1<<forceGroup)) != 0) {
cl.getQueue().enqueueMarker(&events[0]);
queue.enqueueWaitForEvents(events);
}
}
private:
OpenCLContext& cl;
cl::CommandQueue queue;
vector<cl::Event> events;
int forceGroup;
};
class OpenCLCalcNonbondedForceKernel::SyncQueuePostComputation : public OpenCLContext::ForcePostComputation {
public:
SyncQueuePostComputation(OpenCLContext& cl, cl::Event& event, int forceGroup) : cl(cl), event(event), events(1), forceGroup(forceGroup) {
}
double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
if ((groups&(1<<forceGroup)) != 0) {
events[0] = event;
cl.getQueue().enqueueWaitForEvents(events);
}
return 0.0;
}
private:
OpenCLContext& cl;
cl::Event& event;
vector<cl::Event> events;
int forceGroup;
};
OpenCLCalcNonbondedForceKernel::~OpenCLCalcNonbondedForceKernel() { OpenCLCalcNonbondedForceKernel::~OpenCLCalcNonbondedForceKernel() {
if (sigmaEpsilon != NULL) if (sigmaEpsilon != NULL)
delete sigmaEpsilon; delete sigmaEpsilon;
...@@ -1574,6 +1609,12 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb ...@@ -1574,6 +1609,12 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
pmeAtomGridIndex = OpenCLArray::create<mm_int2>(cl, numParticles, "pmeAtomGridIndex"); pmeAtomGridIndex = OpenCLArray::create<mm_int2>(cl, numParticles, "pmeAtomGridIndex");
sort = new OpenCLSort(cl, new SortTrait(), cl.getNumAtoms()); sort = new OpenCLSort(cl, new SortTrait(), cl.getNumAtoms());
fft = new OpenCLFFT3D(cl, gridSizeX, gridSizeY, gridSizeZ); fft = new OpenCLFFT3D(cl, gridSizeX, gridSizeY, gridSizeZ);
pmeQueue = cl::CommandQueue(cl.getContext(), cl.getDevice());
int recipForceGroup = force.getReciprocalSpaceForceGroup();
if (recipForceGroup < 0)
recipForceGroup = force.getForceGroup();
cl.addPreComputation(new SyncQueuePreComputation(cl, pmeQueue, recipForceGroup));
cl.addPostComputation(new SyncQueuePostComputation(cl, pmeSyncEvent, recipForceGroup));
// Initialize the b-spline moduli. // Initialize the b-spline moduli.
...@@ -1753,6 +1794,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -1753,6 +1794,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
cl.executeKernel(ewaldForcesKernel, cl.getNumAtoms()); cl.executeKernel(ewaldForcesKernel, cl.getNumAtoms());
} }
if (pmeGrid != NULL && includeReciprocal) { if (pmeGrid != NULL && includeReciprocal) {
cl.setQueue(pmeQueue);
setPeriodicBoxSizeArg(cl, pmeUpdateBsplinesKernel, 4); setPeriodicBoxSizeArg(cl, pmeUpdateBsplinesKernel, 4);
setInvPeriodicBoxSizeArg(cl, pmeUpdateBsplinesKernel, 5); setInvPeriodicBoxSizeArg(cl, pmeUpdateBsplinesKernel, 5);
cl.executeKernel(pmeUpdateBsplinesKernel, cl.getNumAtoms()); cl.executeKernel(pmeUpdateBsplinesKernel, cl.getNumAtoms());
...@@ -1795,6 +1837,8 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -1795,6 +1837,8 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
cl.executeKernel(pmeInterpolateForceKernel, 2*cl.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(), 1); cl.executeKernel(pmeInterpolateForceKernel, 2*cl.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(), 1);
else else
cl.executeKernel(pmeInterpolateForceKernel, cl.getNumAtoms()); cl.executeKernel(pmeInterpolateForceKernel, cl.getNumAtoms());
pmeQueue.enqueueMarker(&pmeSyncEvent);
cl.restoreDefaultQueue();
} }
double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0); double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0);
if (dispersionCoefficient != 0.0 && includeDirect) { if (dispersionCoefficient != 0.0 && includeDirect) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment