"vscode:/vscode.git/clone" did not exist on "fade85c4d8bf40640b1821a3b5fbbd2d8f13d67b"
Unverified commit fa893467, authored by Philip Turner and committed by GitHub
Browse files

[macOS GPU Support] Tune dispatching of persistent threads for Apple silicon GPUs (#3978)

* Use 768 instead of 384 threads in generic kernels

* Use 1536 instead of 1024 threads in force kernels.
parent aa363660
......@@ -218,6 +218,9 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
if (vendor.size() >= 5 && vendor.substr(0, 5) == "Apple") {
simdWidth = 32;
// 768 threads per GPU core.
numThreadBlocksPerComputeUnit = 12;
}
else if (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA") {
compilationDefines["WARPS_ARE_ATOMIC"] = "";
......
......@@ -65,8 +65,14 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
forceThreadBlockSize = 1;
}
else if (context.getSIMDWidth() == 32) {
numForceThreadBlocks = 4*context.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
forceThreadBlockSize = 256;
int blocksPerComputeUnit = 4;
std::string vendor = context.getDevice().getInfo<CL_DEVICE_VENDOR>();
if (vendor.size() >= 5 && vendor.substr(0, 5) == "Apple") {
// 1536 threads per GPU core.
blocksPerComputeUnit = 6;
}
numForceThreadBlocks = blocksPerComputeUnit*context.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
forceThreadBlockSize = 256;
}
else {
numForceThreadBlocks = context.getNumThreadBlocks();
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment