Commit 9ab32c50 authored by Peter Eastman's avatar Peter Eastman
Browse files

Reduced the number of kernel executions to clear buffers

parent 61c50862
...@@ -148,6 +148,8 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex, OpenCLPlatform:: ...@@ -148,6 +148,8 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex, OpenCLPlatform::
clearTwoBuffersKernel = cl::Kernel(utilities, "clearTwoBuffers"); clearTwoBuffersKernel = cl::Kernel(utilities, "clearTwoBuffers");
clearThreeBuffersKernel = cl::Kernel(utilities, "clearThreeBuffers"); clearThreeBuffersKernel = cl::Kernel(utilities, "clearThreeBuffers");
clearFourBuffersKernel = cl::Kernel(utilities, "clearFourBuffers"); clearFourBuffersKernel = cl::Kernel(utilities, "clearFourBuffers");
clearFiveBuffersKernel = cl::Kernel(utilities, "clearFiveBuffers");
clearSixBuffersKernel = cl::Kernel(utilities, "clearSixBuffers");
reduceFloat4Kernel = cl::Kernel(utilities, "reduceFloat4Buffer"); reduceFloat4Kernel = cl::Kernel(utilities, "reduceFloat4Buffer");
reduceForcesKernel = cl::Kernel(utilities, "reduceForces"); reduceForcesKernel = cl::Kernel(utilities, "reduceForces");
...@@ -351,7 +353,36 @@ void OpenCLContext::addAutoclearBuffer(cl::Memory& memory, int size) { ...@@ -351,7 +353,36 @@ void OpenCLContext::addAutoclearBuffer(cl::Memory& memory, int size) {
void OpenCLContext::clearAutoclearBuffers() { void OpenCLContext::clearAutoclearBuffers() {
int base = 0; int base = 0;
int total = autoclearBufferSizes.size(); int total = autoclearBufferSizes.size();
while (total-base >= 4) { while (total-base >= 6) {
clearSixBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]);
clearSixBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]);
clearSixBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]);
clearSixBuffersKernel.setArg<cl_int>(3, autoclearBufferSizes[base+1]);
clearSixBuffersKernel.setArg<cl::Memory>(4, *autoclearBuffers[base+2]);
clearSixBuffersKernel.setArg<cl_int>(5, autoclearBufferSizes[base+2]);
clearSixBuffersKernel.setArg<cl::Memory>(6, *autoclearBuffers[base+3]);
clearSixBuffersKernel.setArg<cl_int>(7, autoclearBufferSizes[base+3]);
clearSixBuffersKernel.setArg<cl::Memory>(8, *autoclearBuffers[base+4]);
clearSixBuffersKernel.setArg<cl_int>(9, autoclearBufferSizes[base+4]);
clearSixBuffersKernel.setArg<cl::Memory>(10, *autoclearBuffers[base+5]);
clearSixBuffersKernel.setArg<cl_int>(11, autoclearBufferSizes[base+5]);
executeKernel(clearSixBuffersKernel, max(max(max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), autoclearBufferSizes[base+4]), autoclearBufferSizes[base+5]), 128);
base += 6;
}
if (total-base == 5) {
clearFiveBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]);
clearFiveBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]);
clearFiveBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]);
clearFiveBuffersKernel.setArg<cl_int>(3, autoclearBufferSizes[base+1]);
clearFiveBuffersKernel.setArg<cl::Memory>(4, *autoclearBuffers[base+2]);
clearFiveBuffersKernel.setArg<cl_int>(5, autoclearBufferSizes[base+2]);
clearFiveBuffersKernel.setArg<cl::Memory>(6, *autoclearBuffers[base+3]);
clearFiveBuffersKernel.setArg<cl_int>(7, autoclearBufferSizes[base+3]);
clearFiveBuffersKernel.setArg<cl::Memory>(8, *autoclearBuffers[base+4]);
clearFiveBuffersKernel.setArg<cl_int>(9, autoclearBufferSizes[base+4]);
executeKernel(clearFiveBuffersKernel, max(max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), autoclearBufferSizes[base+4]), 128);
}
else if (total-base == 4) {
clearFourBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]); clearFourBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]);
clearFourBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]); clearFourBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]);
clearFourBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]); clearFourBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]);
...@@ -361,9 +392,8 @@ void OpenCLContext::clearAutoclearBuffers() { ...@@ -361,9 +392,8 @@ void OpenCLContext::clearAutoclearBuffers() {
clearFourBuffersKernel.setArg<cl::Memory>(6, *autoclearBuffers[base+3]); clearFourBuffersKernel.setArg<cl::Memory>(6, *autoclearBuffers[base+3]);
clearFourBuffersKernel.setArg<cl_int>(7, autoclearBufferSizes[base+3]); clearFourBuffersKernel.setArg<cl_int>(7, autoclearBufferSizes[base+3]);
executeKernel(clearFourBuffersKernel, max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), 128); executeKernel(clearFourBuffersKernel, max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), 128);
base += 4;
} }
if (total-base == 3) { else if (total-base == 3) {
clearThreeBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]); clearThreeBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]);
clearThreeBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]); clearThreeBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]);
clearThreeBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]); clearThreeBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]);
......
...@@ -472,6 +472,8 @@ private: ...@@ -472,6 +472,8 @@ private:
cl::Kernel clearTwoBuffersKernel; cl::Kernel clearTwoBuffersKernel;
cl::Kernel clearThreeBuffersKernel; cl::Kernel clearThreeBuffersKernel;
cl::Kernel clearFourBuffersKernel; cl::Kernel clearFourBuffersKernel;
cl::Kernel clearFiveBuffersKernel;
cl::Kernel clearSixBuffersKernel;
cl::Kernel reduceFloat4Kernel; cl::Kernel reduceFloat4Kernel;
cl::Kernel reduceForcesKernel; cl::Kernel reduceForcesKernel;
std::vector<OpenCLForceInfo*> forces; std::vector<OpenCLForceInfo*> forces;
......
...@@ -1266,6 +1266,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb ...@@ -1266,6 +1266,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
// Create required data structures. // Create required data structures.
pmeGrid = new OpenCLArray<mm_float2>(cl, gridSizeX*gridSizeY*gridSizeZ, "pmeGrid"); pmeGrid = new OpenCLArray<mm_float2>(cl, gridSizeX*gridSizeY*gridSizeZ, "pmeGrid");
cl.addAutoclearBuffer(pmeGrid->getDeviceBuffer(), pmeGrid->getSize()*2);
pmeGrid2 = new OpenCLArray<mm_float2>(cl, gridSizeX*gridSizeY*gridSizeZ, "pmeGrid2"); pmeGrid2 = new OpenCLArray<mm_float2>(cl, gridSizeX*gridSizeY*gridSizeZ, "pmeGrid2");
pmeBsplineModuliX = new OpenCLArray<cl_float>(cl, gridSizeX, "pmeBsplineModuliX"); pmeBsplineModuliX = new OpenCLArray<cl_float>(cl, gridSizeX, "pmeBsplineModuliX");
pmeBsplineModuliY = new OpenCLArray<cl_float>(cl, gridSizeY, "pmeBsplineModuliY"); pmeBsplineModuliY = new OpenCLArray<cl_float>(cl, gridSizeY, "pmeBsplineModuliY");
...@@ -1464,7 +1465,6 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -1464,7 +1465,6 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeAtomRangeKernel.setArg<mm_float4>(4, invBoxSize); pmeAtomRangeKernel.setArg<mm_float4>(4, invBoxSize);
cl.executeKernel(pmeAtomRangeKernel, cl.getNumAtoms()); cl.executeKernel(pmeAtomRangeKernel, cl.getNumAtoms());
if (cl.getSupports64BitGlobalAtomics()) { if (cl.getSupports64BitGlobalAtomics()) {
cl.clearBuffer(pmeGrid->getDeviceBuffer(), pmeGrid->getSize()*2);
pmeSpreadChargeKernel.setArg<mm_float4>(5, boxSize); pmeSpreadChargeKernel.setArg<mm_float4>(5, boxSize);
pmeSpreadChargeKernel.setArg<mm_float4>(6, invBoxSize); pmeSpreadChargeKernel.setArg<mm_float4>(6, invBoxSize);
cl.executeKernel(pmeSpreadChargeKernel, cl.getNumAtoms(), PmeOrder*PmeOrder*PmeOrder); cl.executeKernel(pmeSpreadChargeKernel, cl.getNumAtoms(), PmeOrder*PmeOrder*PmeOrder);
......
...@@ -42,6 +42,29 @@ __kernel void clearFourBuffers(__global int* buffer1, int size1, __global int* b ...@@ -42,6 +42,29 @@ __kernel void clearFourBuffers(__global int* buffer1, int size1, __global int* b
clearBuffer(buffer4, size4); clearBuffer(buffer4, size4);
} }
/**
* Fill five buffers with 0.
*/
__kernel void clearFiveBuffers(__global int* buffer1, int size1, __global int* buffer2, int size2, __global int* buffer3, int size3, __global int* buffer4, int size4, __global int* buffer5, int size5) {
clearBuffer(buffer1, size1);
clearBuffer(buffer2, size2);
clearBuffer(buffer3, size3);
clearBuffer(buffer4, size4);
clearBuffer(buffer5, size5);
}
/**
* Fill six buffers with 0.
*/
__kernel void clearSixBuffers(__global int* buffer1, int size1, __global int* buffer2, int size2, __global int* buffer3, int size3, __global int* buffer4, int size4, __global int* buffer5, int size5, __global int* buffer6, int size6) {
clearBuffer(buffer1, size1);
clearBuffer(buffer2, size2);
clearBuffer(buffer3, size3);
clearBuffer(buffer4, size4);
clearBuffer(buffer5, size5);
clearBuffer(buffer6, size6);
}
/** /**
* Sum a collection of buffers into the first one. * Sum a collection of buffers into the first one.
*/ */
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment