[macOS GPU Support] Tune dispatching of persistent threads for Apple silicon GPUs (#3978)

* Use 768 instead of 384 threads in generic kernels.
* Use 1536 instead of 1024 threads in force kernels.

[macOS GPU Support] Tune dispatching of persistent threads for Apple silicon GPUs (#3978)
* Use 768 instead of 384 threads in generic kernels.
* Use 1536 instead of 1024 threads in force kernels.
fa893467 · Philip Turner · GitHub · aa363660 · fa893467 · fa893467
Unverified Commit fa893467 authored Feb 24, 2023 by Philip Turner Committed by GitHub Feb 24, 2023
Showing with 11 additions and 2 deletions

platforms/opencl/src/OpenCLContext.cpp +3 -0

platforms/opencl/src/OpenCLNonbondedUtilities.cpp +8 -2

No files found.
--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -218,6 +218,9 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
      
        if (vendor.size() >= 5 && vendor.substr(0, 5) == "Apple") {
            simdWidth = 32;
+
+            // 768 threads per GPU core.
+            numThreadBlocksPerComputeUnit = 12;
        }
        else if (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA") {
            compilationDefines["WARPS_ARE_ATOMIC"] = "";

--- a/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+++ b/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
@@ -65,8 +65,14 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
        forceThreadBlockSize = 1;
    }
    else if (context.getSIMDWidth() == 32) {
-            numForceThreadBlocks = 4*context.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
-            forceThreadBlockSize = 256;
+        int blocksPerComputeUnit = 4;
+        std::string vendor = context.getDevice().getInfo<CL_DEVICE_VENDOR>();
+        if (vendor.size() >= 5 && vendor.substr(0, 5) == "Apple") {
+            // 1536 threads per GPU core.
+            blocksPerComputeUnit = 6;
+        }
+        numForceThreadBlocks = blocksPerComputeUnit*context.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
+        forceThreadBlockSize = 256;
    }
    else {
        numForceThreadBlocks = context.getNumThreadBlocks();