"vscode:/vscode.git/clone" did not exist on "9e2349833e994cb829b77cc08f1aacc6ab6d2458"
Commit 990e28c9 authored by Peter Eastman's avatar Peter Eastman
Browse files

Merge branch 'qc' of https://github.com/peastman/openmm into qc

parents 1e857c31 a3fd6c23
...@@ -284,7 +284,8 @@ private: ...@@ -284,7 +284,8 @@ private:
std::map<std::string, std::string> kernelDefines; std::map<std::string, std::string> kernelDefines;
double cutoff; double cutoff;
bool useCutoff, usePeriodic, deviceIsCpu, anyExclusions, usePadding; bool useCutoff, usePeriodic, deviceIsCpu, anyExclusions, usePadding;
int numForceBuffers, startTileIndex, numTiles, startBlockIndex, numBlocks, numForceThreadBlocks, forceThreadBlockSize, nonbondedForceGroup; int numForceBuffers, startTileIndex, numTiles, startBlockIndex, numBlocks, numForceThreadBlocks;
int forceThreadBlockSize, interactingBlocksThreadBlockSize, nonbondedForceGroup;
}; };
/** /**
......
...@@ -74,9 +74,12 @@ int OpenCLFFT3D::findLegalDimension(int minimum) { ...@@ -74,9 +74,12 @@ int OpenCLFFT3D::findLegalDimension(int minimum) {
} }
cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads) { cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads) {
bool loopRequired = (context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU); int maxThreads = std::min(256, (int) context.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
bool isCPU = context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU;
while (true) {
bool loopRequired = (zsize > maxThreads || isCPU);
stringstream source; stringstream source;
int blocksPerGroup = (loopRequired ? 1 : max(1, 256/zsize)); int blocksPerGroup = (loopRequired ? 1 : max(1, maxThreads/zsize));
int stage = 0; int stage = 0;
int L = zsize; int L = zsize;
int m = 1; int m = 1;
...@@ -242,10 +245,18 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa ...@@ -242,10 +245,18 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
replacements["LOOP_REQUIRED"] = (loopRequired ? "1" : "0"); replacements["LOOP_REQUIRED"] = (loopRequired ? "1" : "0");
cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::fft, replacements)); cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::fft, replacements));
cl::Kernel kernel(program, "execFFT"); cl::Kernel kernel(program, "execFFT");
threads = (isCPU ? 1 : blocksPerGroup*zsize);
int kernelMaxThreads = kernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice());
if (threads > kernelMaxThreads) {
// The device can't handle this block size, so reduce it.
maxThreads = kernelMaxThreads;
continue;
}
int bufferSize = blocksPerGroup*zsize*(context.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2)); int bufferSize = blocksPerGroup*zsize*(context.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2));
kernel.setArg(3, bufferSize, NULL); kernel.setArg(3, bufferSize, NULL);
kernel.setArg(4, bufferSize, NULL); kernel.setArg(4, bufferSize, NULL);
kernel.setArg(5, bufferSize, NULL); kernel.setArg(5, bufferSize, NULL);
threads = (loopRequired ? 1 : blocksPerGroup*zsize);
return kernel; return kernel;
}
} }
...@@ -317,9 +317,11 @@ void OpenCLNonbondedUtilities::initialize(const System& system) { ...@@ -317,9 +317,11 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
for (int i = 0; i < (int) exclusionBlocksForBlock.size(); i++) for (int i = 0; i < (int) exclusionBlocksForBlock.size(); i++)
maxExclusions = (maxExclusions > exclusionBlocksForBlock[i].size() ? maxExclusions : exclusionBlocksForBlock[i].size()); maxExclusions = (maxExclusions > exclusionBlocksForBlock[i].size() ? maxExclusions : exclusionBlocksForBlock[i].size());
defines["MAX_EXCLUSIONS"] = context.intToString(maxExclusions); defines["MAX_EXCLUSIONS"] = context.intToString(maxExclusions);
defines["GROUP_SIZE"] = (deviceIsCpu ? "32" : "128");
defines["BUFFER_GROUPS"] = (deviceIsCpu ? "4" : "2"); defines["BUFFER_GROUPS"] = (deviceIsCpu ? "4" : "2");
string file = (deviceIsCpu ? OpenCLKernelSources::findInteractingBlocks_cpu : OpenCLKernelSources::findInteractingBlocks); string file = (deviceIsCpu ? OpenCLKernelSources::findInteractingBlocks_cpu : OpenCLKernelSources::findInteractingBlocks);
int groupSize = (deviceIsCpu ? 32 : 128);
while (true) {
defines["GROUP_SIZE"] = context.intToString(groupSize);
cl::Program interactingBlocksProgram = context.createProgram(file, defines); cl::Program interactingBlocksProgram = context.createProgram(file, defines);
findBlockBoundsKernel = cl::Kernel(interactingBlocksProgram, "findBlockBounds"); findBlockBoundsKernel = cl::Kernel(interactingBlocksProgram, "findBlockBounds");
findBlockBoundsKernel.setArg<cl_int>(0, context.getNumAtoms()); findBlockBoundsKernel.setArg<cl_int>(0, context.getNumAtoms());
...@@ -353,6 +355,17 @@ void OpenCLNonbondedUtilities::initialize(const System& system) { ...@@ -353,6 +355,17 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
findInteractingBlocksKernel.setArg<cl::Buffer>(13, exclusionRowIndices->getDeviceBuffer()); findInteractingBlocksKernel.setArg<cl::Buffer>(13, exclusionRowIndices->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(14, oldPositions->getDeviceBuffer()); findInteractingBlocksKernel.setArg<cl::Buffer>(14, oldPositions->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(15, rebuildNeighborList->getDeviceBuffer()); findInteractingBlocksKernel.setArg<cl::Buffer>(15, rebuildNeighborList->getDeviceBuffer());
if (findInteractingBlocksKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()) < groupSize) {
// The device can't handle this block size, so reduce it.
groupSize -= 32;
if (groupSize < 32)
throw OpenMMException("Failed to create findInteractingBlocks kernel");
continue;
}
break;
}
interactingBlocksThreadBlockSize = (deviceIsCpu ? 1 : groupSize);
} }
} }
...@@ -389,7 +402,7 @@ void OpenCLNonbondedUtilities::prepareInteractions() { ...@@ -389,7 +402,7 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
context.executeKernel(sortBoxDataKernel, context.getNumAtoms()); context.executeKernel(sortBoxDataKernel, context.getNumAtoms());
setPeriodicBoxSizeArg(context, findInteractingBlocksKernel, 0); setPeriodicBoxSizeArg(context, findInteractingBlocksKernel, 0);
setInvPeriodicBoxSizeArg(context, findInteractingBlocksKernel, 1); setInvPeriodicBoxSizeArg(context, findInteractingBlocksKernel, 1);
context.executeKernel(findInteractingBlocksKernel, context.getNumAtoms(), deviceIsCpu ? 1 : 128); context.executeKernel(findInteractingBlocksKernel, context.getNumAtoms(), interactingBlocksThreadBlockSize);
} }
void OpenCLNonbondedUtilities::computeInteractions() { void OpenCLNonbondedUtilities::computeInteractions() {
......
...@@ -56,10 +56,13 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le ...@@ -56,10 +56,13 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
unsigned int maxGroupSize = std::min(256, (int) context.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>()); unsigned int maxGroupSize = std::min(256, (int) context.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
int maxSharedMem = context.getDevice().getInfo<CL_DEVICE_LOCAL_MEM_SIZE>(); int maxSharedMem = context.getDevice().getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
unsigned int maxLocalBuffer = (unsigned int) ((maxSharedMem/trait->getDataSize())/2); unsigned int maxLocalBuffer = (unsigned int) ((maxSharedMem/trait->getDataSize())/2);
isShortList = (length <= maxLocalBuffer); unsigned int maxRangeSize = std::min(maxGroupSize, (unsigned int) computeRangeKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()));
for (rangeKernelSize = 1; rangeKernelSize*2 <= maxGroupSize; rangeKernelSize *= 2) unsigned int maxPositionsSize = std::min(maxGroupSize, (unsigned int) computeBucketPositionsKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()));
unsigned int maxShortListSize = shortListKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice());
isShortList = (length <= maxLocalBuffer && length < maxShortListSize);
for (rangeKernelSize = 1; rangeKernelSize*2 <= maxRangeSize; rangeKernelSize *= 2)
; ;
positionsKernelSize = rangeKernelSize; positionsKernelSize = std::min(rangeKernelSize, maxPositionsSize);
sortKernelSize = (isShortList ? rangeKernelSize : rangeKernelSize/2); sortKernelSize = (isShortList ? rangeKernelSize : rangeKernelSize/2);
if (rangeKernelSize > length) if (rangeKernelSize > length)
rangeKernelSize = length; rangeKernelSize = length;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment