Commit 385c1475 authored by Peter Eastman's avatar Peter Eastman
Browse files

Tony's patch to set number of work groups better on AMD processors

parent 801c43ee
...@@ -85,12 +85,37 @@ OpenCLContext::OpenCLContext(int numParticles, int platformIndex, int deviceInde ...@@ -85,12 +85,37 @@ OpenCLContext::OpenCLContext(int numParticles, int platformIndex, int deviceInde
int bestSpeed = -1; int bestSpeed = -1;
for (int i = 0; i < (int) devices.size(); i++) { for (int i = 0; i < (int) devices.size(); i++) {
int maxSize = devices[i].getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>()[0]; int maxSize = devices[i].getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>()[0];
int processingElementsPerComputeUnit = (devices[i].getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_GPU ? 8 : 1); int processingElementsPerComputeUnit = 8;
if (devices[i].getInfo<CL_DEVICE_EXTENSIONS>().find("cl_nv_device_attribute_query") != string::npos) { if (devices[i].getInfo<CL_DEVICE_TYPE>() != CL_DEVICE_TYPE_GPU) {
processingElementsPerComputeUnit = 1;
}
else if (devices[i].getInfo<CL_DEVICE_EXTENSIONS>().find("cl_nv_device_attribute_query") != string::npos) {
cl_uint computeCapabilityMajor; cl_uint computeCapabilityMajor;
clGetDeviceInfo(devices[i](), CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof(cl_uint), &computeCapabilityMajor, NULL); clGetDeviceInfo(devices[i](), CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof(cl_uint), &computeCapabilityMajor, NULL);
processingElementsPerComputeUnit = (computeCapabilityMajor < 2 ? 8 : 32); processingElementsPerComputeUnit = (computeCapabilityMajor < 2 ? 8 : 32);
} }
else if (devices[i].getInfo<CL_DEVICE_EXTENSIONS>().find("cl_amd_device_attribute_query") != string::npos) {
// This attribute does not ensure that all queries are supported by the runtime (it may be an older runtime,
// or the CPU device) so still have to check for errors.
try {
processingElementsPerComputeUnit =
// AMD GPUs either have a single VLIW SIMD or multiple scalar SIMDs.
// The SIMD width is the number of threads the SIMD executes per cycle.
// This will be less than the wavefront width since it takes several
// cycles to execute the full wavefront.
// The SIMD instruction width is the VLIW instruction width (or 1 for scalar),
// this is the number of ALUs that can be executing per instruction per thread.
devices[i].getInfo<CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD>() *
devices[i].getInfo<CL_DEVICE_SIMD_WIDTH_AMD>() *
devices[i].getInfo<CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD>();
// Just in case any of the queries return 0.
if (processingElementsPerComputeUnit <= 0)
processingElementsPerComputeUnit = 1;
}
catch (cl::Error err) {
// Runtime does not support the queries so use default.
}
}
int speed = devices[i].getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>()*processingElementsPerComputeUnit*devices[i].getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>(); int speed = devices[i].getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>()*processingElementsPerComputeUnit*devices[i].getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>();
if (maxSize >= minThreadBlockSize && speed > bestSpeed) { if (maxSize >= minThreadBlockSize && speed > bestSpeed) {
deviceIndex = i; deviceIndex = i;
...@@ -109,6 +134,7 @@ OpenCLContext::OpenCLContext(int numParticles, int platformIndex, int deviceInde ...@@ -109,6 +134,7 @@ OpenCLContext::OpenCLContext(int numParticles, int platformIndex, int deviceInde
supports64BitGlobalAtomics = (device.getInfo<CL_DEVICE_EXTENSIONS>().find("cl_khr_int64_base_atomics") != string::npos); supports64BitGlobalAtomics = (device.getInfo<CL_DEVICE_EXTENSIONS>().find("cl_khr_int64_base_atomics") != string::npos);
supportsDoublePrecision = (device.getInfo<CL_DEVICE_EXTENSIONS>().find("cl_khr_fp64") != string::npos); supportsDoublePrecision = (device.getInfo<CL_DEVICE_EXTENSIONS>().find("cl_khr_fp64") != string::npos);
string vendor = device.getInfo<CL_DEVICE_VENDOR>(); string vendor = device.getInfo<CL_DEVICE_VENDOR>();
int numThreadBlocksPerComputeUnit = 6;
if (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA") { if (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA") {
compilationDefines["WARPS_ARE_ATOMIC"] = ""; compilationDefines["WARPS_ARE_ATOMIC"] = "";
simdWidth = 32; simdWidth = 32;
...@@ -124,11 +150,45 @@ OpenCLContext::OpenCLContext(int numParticles, int platformIndex, int deviceInde ...@@ -124,11 +150,45 @@ OpenCLContext::OpenCLContext(int numParticles, int platformIndex, int deviceInde
} }
} }
else if (vendor.size() >= 28 && vendor.substr(0, 28) == "Advanced Micro Devices, Inc.") { else if (vendor.size() >= 28 && vendor.substr(0, 28) == "Advanced Micro Devices, Inc.") {
// AMD APP SDK 2.4 has a performance problem with atomics. Enable the work around. if (device.getInfo<CL_DEVICE_TYPE>() != CL_DEVICE_TYPE_GPU) {
compilationDefines["AMD_ATOMIC_WORK_AROUND"] = ""; /// \todo Is 6 a good value for the OpenCL CPU device?
// AMD has both 32 and 64 width SIMDs. To determine need to create a kernel to query. // numThreadBlocksPerComputeUnit = ?;
// For now default to 1 which will use the default kernels. simdWidth = 1;
simdWidth = 1; }
else {
bool amdPostSdk2_4 = false;
// Default to 1 which will use the default kernels.
simdWidth = 1;
if (device.getInfo<CL_DEVICE_EXTENSIONS>().find("cl_amd_device_attribute_query") != string::npos) {
// This attribute does not ensure that all queries are supported by the runtime so still have to
// check for errors.
try {
// AMD has both 32 and 64 width SIMDs. Can determine by using:
// simdWidth = device.getInfo<CL_DEVICE_WAVEFRONT_WIDTH_AMD>();
// Must catch cl:Error as will fail if runtime does not support queries.
// However, the 32 width NVIDIA kernels do not have all the necessary
// barriers and so will not work for AMD.
// So for now leave default of 1 which will use the default kernels.
cl_uint simdPerComputeUnit = device.getInfo<CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD>();
// If the GPU has multiple SIMDs per compute unit then it is uses the scalar instruction
// set instead of the VLIW instruction set. It therefore needs more thread blocks per
// compute unit to hide memory latency.
if (simdPerComputeUnit > 1)
numThreadBlocksPerComputeUnit = 4 * simdPerComputeUnit;
// If the queries are supported then must be newer than SDK 2.4.
amdPostSdk2_4 = true;
}
catch (cl::Error err) {
// Runtime does not support the query so is unlikely to be the newer scalar GPU.
// Stay with the default simdWidth and numThreadBlocksPerComputeUnit.
}
}
// AMD APP SDK 2.4 has a performance problem with atomics. Enable the work around. This is fixed after SDK 2.4.
if (!amdPostSdk2_4)
compilationDefines["AMD_ATOMIC_WORK_AROUND"] = "";
}
} }
else else
simdWidth = 1; simdWidth = 1;
...@@ -142,7 +202,7 @@ OpenCLContext::OpenCLContext(int numParticles, int platformIndex, int deviceInde ...@@ -142,7 +202,7 @@ OpenCLContext::OpenCLContext(int numParticles, int platformIndex, int deviceInde
numAtoms = numParticles; numAtoms = numParticles;
paddedNumAtoms = TileSize*((numParticles+TileSize-1)/TileSize); paddedNumAtoms = TileSize*((numParticles+TileSize-1)/TileSize);
numAtomBlocks = (paddedNumAtoms+(TileSize-1))/TileSize; numAtomBlocks = (paddedNumAtoms+(TileSize-1))/TileSize;
numThreadBlocks = 6*device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(); numThreadBlocks = numThreadBlocksPerComputeUnit*device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
bonded = new OpenCLBondedUtilities(*this); bonded = new OpenCLBondedUtilities(*this);
nonbonded = new OpenCLNonbondedUtilities(*this); nonbonded = new OpenCLNonbondedUtilities(*this);
posq = new OpenCLArray<mm_float4>(*this, paddedNumAtoms, "posq", true); posq = new OpenCLArray<mm_float4>(*this, paddedNumAtoms, "posq", true);
......
...@@ -182,6 +182,38 @@ ...@@ -182,6 +182,38 @@
#include <cstring> #include <cstring>
#include <cstdlib> #include <cstdlib>
// Defines from cl_ext.h that may not be present in the installed version.
#ifndef CL_DEVICE_GLOBAL_FREE_MEMORY_AMD
#define CL_DEVICE_GLOBAL_FREE_MEMORY_AMD 0x4039
#endif
#ifndef CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD
#define CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD 0x4040
#endif
#ifndef CL_DEVICE_SIMD_WIDTH_AMD
#define CL_DEVICE_SIMD_WIDTH_AMD 0x4041
#endif
#ifndef CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD
#define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD 0x4042
#endif
#ifndef CL_DEVICE_WAVEFRONT_WIDTH_AMD
#define CL_DEVICE_WAVEFRONT_WIDTH_AMD 0x4043
#endif
#ifndef CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD
#define CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD 0x4044
#endif
#ifndef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD
#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD 0x4045
#endif
#ifndef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD
#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD 0x4046
#endif
#ifndef CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD
#define CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD 0x4047
#endif
#ifndef CL_DEVICE_LOCAL_MEM_BANKS_AMD
#define CL_DEVICE_LOCAL_MEM_BANKS_AMD 0x4048
#endif
/*! \namespace cl /*! \namespace cl
* *
* \brief The OpenCL C++ bindings are defined within this namespace. * \brief The OpenCL C++ bindings are defined within this namespace.
...@@ -988,6 +1020,36 @@ __CL_DECLARE_PARAM_TRAITS(cl_platform_info, CL_PLATFORM_ICD_SUFFIX_KHR, STRING_C ...@@ -988,6 +1020,36 @@ __CL_DECLARE_PARAM_TRAITS(cl_platform_info, CL_PLATFORM_ICD_SUFFIX_KHR, STRING_C
#ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD #ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD
__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_PROFILING_TIMER_OFFSET_AMD, cl_ulong) __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_PROFILING_TIMER_OFFSET_AMD, cl_ulong)
#endif #endif
#ifdef CL_DEVICE_GLOBAL_FREE_MEMORY_AMD
__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_FREE_MEMORY_AMD, VECTOR_CLASS< ::size_t>)
#endif
#ifdef CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD
__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD, cl_uint)
#endif
#ifdef CL_DEVICE_SIMD_WIDTH_AMD
__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_WIDTH_AMD, cl_uint)
#endif
#ifdef CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD
__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD, cl_uint)
#endif
#ifdef CL_DEVICE_WAVEFRONT_WIDTH_AMD
__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_WAVEFRONT_WIDTH_AMD, cl_uint)
#endif
#ifdef CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD
__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD, cl_uint)
#endif
#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD
__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD, cl_uint)
#endif
#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD
__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD, cl_uint)
#endif
#ifdef CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD
__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD, cl_uint)
#endif
#ifdef CL_DEVICE_LOCAL_MEM_BANKS_AMD
__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_LOCAL_MEM_BANKS_AMD, cl_uint)
#endif
#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV #ifdef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, cl_uint) __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, cl_uint)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment