Minor optimizations

b3af19cd · peastman · 143fe36d · b3af19cd · b3af19cd · b3af19cd
Commit b3af19cd authored May 16, 2018 by peastman
3 changed files
--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -218,7 +218,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    int major, minor;
    CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
-    int numThreadBlocksPerComputeUnit = (major >= 6 ? 4 : 6);
+    int numThreadBlocksPerComputeUnit = (major == 6 ? 4 : 6);
    if (cudaDriverVersion < 7000) {
        // This is a workaround to support GTX 980 with CUDA 6.5.  It reports
        // its compute capability as 5.2, but the compiler doesn't support

--- a/platforms/cuda/src/CudaSort.cpp
+++ b/platforms/cuda/src/CudaSort.cpp
@@ -26,6 +26,7 @@
 #include "CudaSort.h"
 #include "CudaKernelSources.h"
+#include <algorithm>
 #include <map>
 using namespace OpenMM;
@@ -56,8 +57,9 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
    cuDeviceGetAttribute(&maxBlockSize, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, context.getDevice());
    int maxSharedMem;
    cuDeviceGetAttribute(&maxSharedMem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, context.getDevice());
-    unsigned int maxLocalBuffer = (unsigned int) ((maxSharedMem/trait->getDataSize())/2);
+    int maxLocalBuffer = (maxSharedMem/trait->getDataSize())/2;
-    isShortList = (length <= maxLocalBuffer || length <= CudaContext::ThreadBlockSize*context.getNumThreadBlocks());
+    int maxShortList = min(8192, max(maxLocalBuffer, CudaContext::ThreadBlockSize*context.getNumThreadBlocks()));
+    isShortList = (length <= maxShortList);
    for (rangeKernelSize = 1; rangeKernelSize*2 <= maxBlockSize; rangeKernelSize *= 2)
        ;
    positionsKernelSize = rangeKernelSize;

--- a/platforms/opencl/src/OpenCLSort.cpp
+++ b/platforms/opencl/src/OpenCLSort.cpp
@@ -59,15 +59,16 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
    unsigned int maxGroupSize = std::min(256, (int) context.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
    int maxSharedMem = context.getDevice().getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
-    unsigned int maxLocalBuffer = (unsigned int) ((maxSharedMem/trait->getDataSize())/2);
    unsigned int maxRangeSize = std::min(maxGroupSize, (unsigned int) computeRangeKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()));
    unsigned int maxPositionsSize = std::min(maxGroupSize, (unsigned int) computeBucketPositionsKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()));
-    unsigned int maxShortListSize = shortListKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice());
+    int maxLocalBuffer = (maxSharedMem/trait->getDataSize())/2;
-    // On Qualcomm's OpenCL, it's essential to check against maxShortListSize.  Otherwise you get a crash.
+    unsigned int maxShortList = min(8192, max(maxLocalBuffer, (int) OpenCLContext::ThreadBlockSize*context.getNumThreadBlocks()));
+    // On Qualcomm's OpenCL, it's essential to check against CL_KERNEL_WORK_GROUP_SIZE.  Otherwise you get a crash.
    // But AMD's OpenCL returns an inappropriately small value for it that is much shorter than the actual
    // maximum, so including the check hurts performance.  For the moment I'm going to just comment it out.
    // If we officially support Qualcomm in the future, we'll need to do something better.
-    isShortList = (length <= maxLocalBuffer/* && length < maxShortListSize*/ || length <= OpenCLContext::ThreadBlockSize*context.getNumThreadBlocks());
+    //maxShortList = min(maxShortList, shortListKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()));
+    isShortList = (length <= maxShortList);
    for (rangeKernelSize = 1; rangeKernelSize*2 <= maxRangeSize; rangeKernelSize *= 2)
        ;
    positionsKernelSize = std::min(rangeKernelSize, maxPositionsSize);