"wrappers/vscode:/vscode.git/clone" did not exist on "1d3804dfc381ce4d65b2fb65ce5d81014fa132df"
Commit 385c1475 authored by Peter Eastman's avatar Peter Eastman
Browse files

Tony's patch to set number of work groups better on AMD processors

parent 801c43ee
...@@ -85,12 +85,37 @@ OpenCLContext::OpenCLContext(int numParticles, int platformIndex, int deviceInde ...@@ -85,12 +85,37 @@ OpenCLContext::OpenCLContext(int numParticles, int platformIndex, int deviceInde
int bestSpeed = -1; int bestSpeed = -1;
for (int i = 0; i < (int) devices.size(); i++) { for (int i = 0; i < (int) devices.size(); i++) {
int maxSize = devices[i].getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>()[0]; int maxSize = devices[i].getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>()[0];
int processingElementsPerComputeUnit = (devices[i].getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_GPU ? 8 : 1); int processingElementsPerComputeUnit = 8;
if (devices[i].getInfo<CL_DEVICE_EXTENSIONS>().find("cl_nv_device_attribute_query") != string::npos) { if (devices[i].getInfo<CL_DEVICE_TYPE>() != CL_DEVICE_TYPE_GPU) {
processingElementsPerComputeUnit = 1;
}
else if (devices[i].getInfo<CL_DEVICE_EXTENSIONS>().find("cl_nv_device_attribute_query") != string::npos) {
cl_uint computeCapabilityMajor; cl_uint computeCapabilityMajor;
clGetDeviceInfo(devices[i](), CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof(cl_uint), &computeCapabilityMajor, NULL); clGetDeviceInfo(devices[i](), CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof(cl_uint), &computeCapabilityMajor, NULL);
processingElementsPerComputeUnit = (computeCapabilityMajor < 2 ? 8 : 32); processingElementsPerComputeUnit = (computeCapabilityMajor < 2 ? 8 : 32);
} }
else if (devices[i].getInfo<CL_DEVICE_EXTENSIONS>().find("cl_amd_device_attribute_query") != string::npos) {
// This attribute does not ensure that all queries are supported by the runtime (it may be an older runtime,
// or the CPU device) so still have to check for errors.
try {
processingElementsPerComputeUnit =
// AMD GPUs either have a single VLIW SIMD or multiple scalar SIMDs.
// The SIMD width is the number of threads the SIMD executes per cycle.
// This will be less than the wavefront width since it takes several
// cycles to execute the full wavefront.
// The SIMD instruction width is the VLIW instruction width (or 1 for scalar),
// this is the number of ALUs that can be executing per instruction per thread.
devices[i].getInfo<CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD>() *
devices[i].getInfo<CL_DEVICE_SIMD_WIDTH_AMD>() *
devices[i].getInfo<CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD>();
// Just in case any of the queries return 0.
if (processingElementsPerComputeUnit <= 0)
processingElementsPerComputeUnit = 1;
}
catch (cl::Error err) {
// Runtime does not support the queries so use default.
}
}
int speed = devices[i].getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>()*processingElementsPerComputeUnit*devices[i].getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>(); int speed = devices[i].getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>()*processingElementsPerComputeUnit*devices[i].getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>();
if (maxSize >= minThreadBlockSize && speed > bestSpeed) { if (maxSize >= minThreadBlockSize && speed > bestSpeed) {
deviceIndex = i; deviceIndex = i;
...@@ -109,6 +134,7 @@ OpenCLContext::OpenCLContext(int numParticles, int platformIndex, int deviceInde ...@@ -109,6 +134,7 @@ OpenCLContext::OpenCLContext(int numParticles, int platformIndex, int deviceInde
supports64BitGlobalAtomics = (device.getInfo<CL_DEVICE_EXTENSIONS>().find("cl_khr_int64_base_atomics") != string::npos); supports64BitGlobalAtomics = (device.getInfo<CL_DEVICE_EXTENSIONS>().find("cl_khr_int64_base_atomics") != string::npos);
supportsDoublePrecision = (device.getInfo<CL_DEVICE_EXTENSIONS>().find("cl_khr_fp64") != string::npos); supportsDoublePrecision = (device.getInfo<CL_DEVICE_EXTENSIONS>().find("cl_khr_fp64") != string::npos);
string vendor = device.getInfo<CL_DEVICE_VENDOR>(); string vendor = device.getInfo<CL_DEVICE_VENDOR>();
int numThreadBlocksPerComputeUnit = 6;
if (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA") { if (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA") {
compilationDefines["WARPS_ARE_ATOMIC"] = ""; compilationDefines["WARPS_ARE_ATOMIC"] = "";
simdWidth = 32; simdWidth = 32;
...@@ -124,12 +150,46 @@ OpenCLContext::OpenCLContext(int numParticles, int platformIndex, int deviceInde ...@@ -124,12 +150,46 @@ OpenCLContext::OpenCLContext(int numParticles, int platformIndex, int deviceInde
} }
} }
else if (vendor.size() >= 28 && vendor.substr(0, 28) == "Advanced Micro Devices, Inc.") { else if (vendor.size() >= 28 && vendor.substr(0, 28) == "Advanced Micro Devices, Inc.") {
// AMD APP SDK 2.4 has a performance problem with atomics. Enable the work around. if (device.getInfo<CL_DEVICE_TYPE>() != CL_DEVICE_TYPE_GPU) {
compilationDefines["AMD_ATOMIC_WORK_AROUND"] = ""; /// \todo Is 6 a good value for the OpenCL CPU device?
// AMD has both 32 and 64 width SIMDs. To determine need to create a kernel to query. // numThreadBlocksPerComputeUnit = ?;
// For now default to 1 which will use the default kernels.
simdWidth = 1; simdWidth = 1;
} }
else {
bool amdPostSdk2_4 = false;
// Default to 1 which will use the default kernels.
simdWidth = 1;
if (device.getInfo<CL_DEVICE_EXTENSIONS>().find("cl_amd_device_attribute_query") != string::npos) {
// This attribute does not ensure that all queries are supported by the runtime so still have to
// check for errors.
try {
// AMD has both 32 and 64 width SIMDs. Can determine by using:
// simdWidth = device.getInfo<CL_DEVICE_WAVEFRONT_WIDTH_AMD>();
// Must catch cl::Error, as the query will fail if the runtime does not support it.
// However, the 32 width NVIDIA kernels do not have all the necessary
// barriers and so will not work for AMD.
// So for now leave default of 1 which will use the default kernels.
cl_uint simdPerComputeUnit = device.getInfo<CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD>();
// If the GPU has multiple SIMDs per compute unit then it uses the scalar instruction
// set instead of the VLIW instruction set. It therefore needs more thread blocks per
// compute unit to hide memory latency.
if (simdPerComputeUnit > 1)
numThreadBlocksPerComputeUnit = 4 * simdPerComputeUnit;
// If the queries are supported then must be newer than SDK 2.4.
amdPostSdk2_4 = true;
}
catch (cl::Error err) {
// Runtime does not support the query so is unlikely to be the newer scalar GPU.
// Stay with the default simdWidth and numThreadBlocksPerComputeUnit.
}
}
// AMD APP SDK 2.4 has a performance problem with atomics. Enable the work around. This is fixed after SDK 2.4.
if (!amdPostSdk2_4)
compilationDefines["AMD_ATOMIC_WORK_AROUND"] = "";
}
}
else else
simdWidth = 1; simdWidth = 1;
if (platforms[0].getInfo<CL_PLATFORM_VENDOR>() == "Apple" && vendor == "AMD") if (platforms[0].getInfo<CL_PLATFORM_VENDOR>() == "Apple" && vendor == "AMD")
...@@ -142,7 +202,7 @@ OpenCLContext::OpenCLContext(int numParticles, int platformIndex, int deviceInde ...@@ -142,7 +202,7 @@ OpenCLContext::OpenCLContext(int numParticles, int platformIndex, int deviceInde
numAtoms = numParticles; numAtoms = numParticles;
paddedNumAtoms = TileSize*((numParticles+TileSize-1)/TileSize); paddedNumAtoms = TileSize*((numParticles+TileSize-1)/TileSize);
numAtomBlocks = (paddedNumAtoms+(TileSize-1))/TileSize; numAtomBlocks = (paddedNumAtoms+(TileSize-1))/TileSize;
numThreadBlocks = 6*device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(); numThreadBlocks = numThreadBlocksPerComputeUnit*device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
bonded = new OpenCLBondedUtilities(*this); bonded = new OpenCLBondedUtilities(*this);
nonbonded = new OpenCLNonbondedUtilities(*this); nonbonded = new OpenCLNonbondedUtilities(*this);
posq = new OpenCLArray<mm_float4>(*this, paddedNumAtoms, "posq", true); posq = new OpenCLArray<mm_float4>(*this, paddedNumAtoms, "posq", true);
......
...@@ -182,6 +182,38 @@ ...@@ -182,6 +182,38 @@
#include <cstring> #include <cstring>
#include <cstdlib> #include <cstdlib>
// Defines from cl_ext.h that may not be present in the installed version.
// Each token is wrapped in #ifndef, so a definition already supplied by the
// installed OpenCL headers takes precedence; the value given here is only a
// fallback. The tokens are the AMD device-info queries (consecutive values
// 0x4039 through 0x4048) that this header registers with
// __CL_DECLARE_PARAM_TRAITS(cl_device_info, ...) further down.
// NOTE(review): the numeric values are assumed to match the vendor's
// cl_ext.h -- confirm against the AMD APP SDK headers when updating.
#ifndef CL_DEVICE_GLOBAL_FREE_MEMORY_AMD
#define CL_DEVICE_GLOBAL_FREE_MEMORY_AMD 0x4039
#endif
#ifndef CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD
#define CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD 0x4040
#endif
#ifndef CL_DEVICE_SIMD_WIDTH_AMD
#define CL_DEVICE_SIMD_WIDTH_AMD 0x4041
#endif
#ifndef CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD
#define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD 0x4042
#endif
#ifndef CL_DEVICE_WAVEFRONT_WIDTH_AMD
#define CL_DEVICE_WAVEFRONT_WIDTH_AMD 0x4043
#endif
#ifndef CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD
#define CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD 0x4044
#endif
#ifndef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD
#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD 0x4045
#endif
#ifndef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD
#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD 0x4046
#endif
#ifndef CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD
#define CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD 0x4047
#endif
#ifndef CL_DEVICE_LOCAL_MEM_BANKS_AMD
#define CL_DEVICE_LOCAL_MEM_BANKS_AMD 0x4048
#endif
/*! \namespace cl /*! \namespace cl
* *
* \brief The OpenCL C++ bindings are defined within this namespace. * \brief The OpenCL C++ bindings are defined within this namespace.
...@@ -988,6 +1020,36 @@ __CL_DECLARE_PARAM_TRAITS(cl_platform_info, CL_PLATFORM_ICD_SUFFIX_KHR, STRING_C ...@@ -988,6 +1020,36 @@ __CL_DECLARE_PARAM_TRAITS(cl_platform_info, CL_PLATFORM_ICD_SUFFIX_KHR, STRING_C
#ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD #ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD
__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_PROFILING_TIMER_OFFSET_AMD, cl_ulong) __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_PROFILING_TIMER_OFFSET_AMD, cl_ulong)
#endif #endif
// Parameter-traits entries for the AMD device-attribute queries. Each entry
// pairs a cl_device_info token with the type named in its third argument;
// presumably this is the type getInfo<TOKEN>() yields for that query
// (__CL_DECLARE_PARAM_TRAITS is defined elsewhere in this header -- confirm).
// Every entry is guarded by #ifdef so it is only declared when the token is
// defined. All queries are declared as cl_uint except
// CL_DEVICE_GLOBAL_FREE_MEMORY_AMD, which is declared as a vector of ::size_t.
#ifdef CL_DEVICE_GLOBAL_FREE_MEMORY_AMD
__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_FREE_MEMORY_AMD, VECTOR_CLASS< ::size_t>)
#endif
#ifdef CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD
__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD, cl_uint)
#endif
#ifdef CL_DEVICE_SIMD_WIDTH_AMD
__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_WIDTH_AMD, cl_uint)
#endif
#ifdef CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD
__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD, cl_uint)
#endif
#ifdef CL_DEVICE_WAVEFRONT_WIDTH_AMD
__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_WAVEFRONT_WIDTH_AMD, cl_uint)
#endif
#ifdef CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD
__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD, cl_uint)
#endif
#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD
__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD, cl_uint)
#endif
#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD
__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD, cl_uint)
#endif
#ifdef CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD
__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD, cl_uint)
#endif
#ifdef CL_DEVICE_LOCAL_MEM_BANKS_AMD
__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_LOCAL_MEM_BANKS_AMD, cl_uint)
#endif
#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV #ifdef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, cl_uint) __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, cl_uint)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment