Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
385c1475
Commit
385c1475
authored
Mar 02, 2012
by
Peter Eastman
Browse files
Tony's patch to set number of work groups better on AMD processors
parent
801c43ee
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
130 additions
and
8 deletions
+130
-8
platforms/opencl/src/OpenCLContext.cpp
platforms/opencl/src/OpenCLContext.cpp
+68
-8
platforms/opencl/src/cl.hpp
platforms/opencl/src/cl.hpp
+62
-0
No files found.
platforms/opencl/src/OpenCLContext.cpp
View file @
385c1475
...
@@ -85,12 +85,37 @@ OpenCLContext::OpenCLContext(int numParticles, int platformIndex, int deviceInde
...
@@ -85,12 +85,37 @@ OpenCLContext::OpenCLContext(int numParticles, int platformIndex, int deviceInde
int
bestSpeed
=
-
1
;
int
bestSpeed
=
-
1
;
for
(
int
i
=
0
;
i
<
(
int
)
devices
.
size
();
i
++
)
{
for
(
int
i
=
0
;
i
<
(
int
)
devices
.
size
();
i
++
)
{
int
maxSize
=
devices
[
i
].
getInfo
<
CL_DEVICE_MAX_WORK_ITEM_SIZES
>
()[
0
];
int
maxSize
=
devices
[
i
].
getInfo
<
CL_DEVICE_MAX_WORK_ITEM_SIZES
>
()[
0
];
int
processingElementsPerComputeUnit
=
(
devices
[
i
].
getInfo
<
CL_DEVICE_TYPE
>
()
==
CL_DEVICE_TYPE_GPU
?
8
:
1
);
int
processingElementsPerComputeUnit
=
8
;
if
(
devices
[
i
].
getInfo
<
CL_DEVICE_EXTENSIONS
>
().
find
(
"cl_nv_device_attribute_query"
)
!=
string
::
npos
)
{
if
(
devices
[
i
].
getInfo
<
CL_DEVICE_TYPE
>
()
!=
CL_DEVICE_TYPE_GPU
)
{
processingElementsPerComputeUnit
=
1
;
}
else
if
(
devices
[
i
].
getInfo
<
CL_DEVICE_EXTENSIONS
>
().
find
(
"cl_nv_device_attribute_query"
)
!=
string
::
npos
)
{
cl_uint
computeCapabilityMajor
;
cl_uint
computeCapabilityMajor
;
clGetDeviceInfo
(
devices
[
i
](),
CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
,
sizeof
(
cl_uint
),
&
computeCapabilityMajor
,
NULL
);
clGetDeviceInfo
(
devices
[
i
](),
CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
,
sizeof
(
cl_uint
),
&
computeCapabilityMajor
,
NULL
);
processingElementsPerComputeUnit
=
(
computeCapabilityMajor
<
2
?
8
:
32
);
processingElementsPerComputeUnit
=
(
computeCapabilityMajor
<
2
?
8
:
32
);
}
}
else
if
(
devices
[
i
].
getInfo
<
CL_DEVICE_EXTENSIONS
>
().
find
(
"cl_amd_device_attribute_query"
)
!=
string
::
npos
)
{
// This attribute does not ensure that all queries are supported by the runtime (it may be an older runtime,
// or the CPU device) so still have to check for errors.
try
{
processingElementsPerComputeUnit
=
// AMD GPUs either have a single VLIW SIMD or multiple scalar SIMDs.
// The SIMD width is the number of threads the SIMD executes per cycle.
// This will be less than the wavefront width since it takes several
// cycles to execute the full wavefront.
// The SIMD instruction width is the VLIW instruction width (or 1 for scalar),
// this is the number of ALUs that can be executing per instruction per thread.
devices
[
i
].
getInfo
<
CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD
>
()
*
devices
[
i
].
getInfo
<
CL_DEVICE_SIMD_WIDTH_AMD
>
()
*
devices
[
i
].
getInfo
<
CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD
>
();
// Just in case any of the queries return 0.
if
(
processingElementsPerComputeUnit
<=
0
)
processingElementsPerComputeUnit
=
1
;
}
catch
(
cl
::
Error
err
)
{
// Runtime does not support the queries so use default.
}
}
int
speed
=
devices
[
i
].
getInfo
<
CL_DEVICE_MAX_COMPUTE_UNITS
>
()
*
processingElementsPerComputeUnit
*
devices
[
i
].
getInfo
<
CL_DEVICE_MAX_CLOCK_FREQUENCY
>
();
int
speed
=
devices
[
i
].
getInfo
<
CL_DEVICE_MAX_COMPUTE_UNITS
>
()
*
processingElementsPerComputeUnit
*
devices
[
i
].
getInfo
<
CL_DEVICE_MAX_CLOCK_FREQUENCY
>
();
if
(
maxSize
>=
minThreadBlockSize
&&
speed
>
bestSpeed
)
{
if
(
maxSize
>=
minThreadBlockSize
&&
speed
>
bestSpeed
)
{
deviceIndex
=
i
;
deviceIndex
=
i
;
...
@@ -109,6 +134,7 @@ OpenCLContext::OpenCLContext(int numParticles, int platformIndex, int deviceInde
...
@@ -109,6 +134,7 @@ OpenCLContext::OpenCLContext(int numParticles, int platformIndex, int deviceInde
supports64BitGlobalAtomics
=
(
device
.
getInfo
<
CL_DEVICE_EXTENSIONS
>
().
find
(
"cl_khr_int64_base_atomics"
)
!=
string
::
npos
);
supports64BitGlobalAtomics
=
(
device
.
getInfo
<
CL_DEVICE_EXTENSIONS
>
().
find
(
"cl_khr_int64_base_atomics"
)
!=
string
::
npos
);
supportsDoublePrecision
=
(
device
.
getInfo
<
CL_DEVICE_EXTENSIONS
>
().
find
(
"cl_khr_fp64"
)
!=
string
::
npos
);
supportsDoublePrecision
=
(
device
.
getInfo
<
CL_DEVICE_EXTENSIONS
>
().
find
(
"cl_khr_fp64"
)
!=
string
::
npos
);
string
vendor
=
device
.
getInfo
<
CL_DEVICE_VENDOR
>
();
string
vendor
=
device
.
getInfo
<
CL_DEVICE_VENDOR
>
();
int
numThreadBlocksPerComputeUnit
=
6
;
if
(
vendor
.
size
()
>=
6
&&
vendor
.
substr
(
0
,
6
)
==
"NVIDIA"
)
{
if
(
vendor
.
size
()
>=
6
&&
vendor
.
substr
(
0
,
6
)
==
"NVIDIA"
)
{
compilationDefines
[
"WARPS_ARE_ATOMIC"
]
=
""
;
compilationDefines
[
"WARPS_ARE_ATOMIC"
]
=
""
;
simdWidth
=
32
;
simdWidth
=
32
;
...
@@ -124,11 +150,45 @@ OpenCLContext::OpenCLContext(int numParticles, int platformIndex, int deviceInde
...
@@ -124,11 +150,45 @@ OpenCLContext::OpenCLContext(int numParticles, int platformIndex, int deviceInde
}
}
}
}
else
if
(
vendor
.
size
()
>=
28
&&
vendor
.
substr
(
0
,
28
)
==
"Advanced Micro Devices, Inc."
)
{
else
if
(
vendor
.
size
()
>=
28
&&
vendor
.
substr
(
0
,
28
)
==
"Advanced Micro Devices, Inc."
)
{
// AMD APP SDK 2.4 has a performance problem with atomics. Enable the work around.
if
(
device
.
getInfo
<
CL_DEVICE_TYPE
>
()
!=
CL_DEVICE_TYPE_GPU
)
{
compilationDefines
[
"AMD_ATOMIC_WORK_AROUND"
]
=
""
;
/// \todo Is 6 a good value for the OpenCL CPU device?
// AMD has both 32 and 64 width SIMDs. To determine which, a kernel must be created to query it.
// numThreadBlocksPerComputeUnit = ?;
// For now default to 1 which will use the default kernels.
simdWidth
=
1
;
simdWidth
=
1
;
}
else
{
bool
amdPostSdk2_4
=
false
;
// Default to 1 which will use the default kernels.
simdWidth
=
1
;
if
(
device
.
getInfo
<
CL_DEVICE_EXTENSIONS
>
().
find
(
"cl_amd_device_attribute_query"
)
!=
string
::
npos
)
{
// This attribute does not ensure that all queries are supported by the runtime so still have to
// check for errors.
try
{
// AMD has both 32 and 64 width SIMDs. Can determine by using:
// simdWidth = device.getInfo<CL_DEVICE_WAVEFRONT_WIDTH_AMD>();
// Must catch cl::Error, as the query will fail if the runtime does not support it.
// However, the 32 width NVIDIA kernels do not have all the necessary
// barriers and so will not work for AMD.
// So for now leave default of 1 which will use the default kernels.
cl_uint
simdPerComputeUnit
=
device
.
getInfo
<
CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD
>
();
// If the GPU has multiple SIMDs per compute unit then it uses the scalar instruction
// set instead of the VLIW instruction set. It therefore needs more thread blocks per
// compute unit to hide memory latency.
if
(
simdPerComputeUnit
>
1
)
numThreadBlocksPerComputeUnit
=
4
*
simdPerComputeUnit
;
// If the queries are supported then must be newer than SDK 2.4.
amdPostSdk2_4
=
true
;
}
catch
(
cl
::
Error
err
)
{
// Runtime does not support the query so is unlikely to be the newer scalar GPU.
// Stay with the default simdWidth and numThreadBlocksPerComputeUnit.
}
}
// AMD APP SDK 2.4 has a performance problem with atomics. Enable the work around. This is fixed after SDK 2.4.
if
(
!
amdPostSdk2_4
)
compilationDefines
[
"AMD_ATOMIC_WORK_AROUND"
]
=
""
;
}
}
}
else
else
simdWidth
=
1
;
simdWidth
=
1
;
...
@@ -142,7 +202,7 @@ OpenCLContext::OpenCLContext(int numParticles, int platformIndex, int deviceInde
...
@@ -142,7 +202,7 @@ OpenCLContext::OpenCLContext(int numParticles, int platformIndex, int deviceInde
numAtoms
=
numParticles
;
numAtoms
=
numParticles
;
paddedNumAtoms
=
TileSize
*
((
numParticles
+
TileSize
-
1
)
/
TileSize
);
paddedNumAtoms
=
TileSize
*
((
numParticles
+
TileSize
-
1
)
/
TileSize
);
numAtomBlocks
=
(
paddedNumAtoms
+
(
TileSize
-
1
))
/
TileSize
;
numAtomBlocks
=
(
paddedNumAtoms
+
(
TileSize
-
1
))
/
TileSize
;
numThreadBlocks
=
6
*
device
.
getInfo
<
CL_DEVICE_MAX_COMPUTE_UNITS
>
();
numThreadBlocks
=
numThreadBlocksPerComputeUnit
*
device
.
getInfo
<
CL_DEVICE_MAX_COMPUTE_UNITS
>
();
bonded
=
new
OpenCLBondedUtilities
(
*
this
);
bonded
=
new
OpenCLBondedUtilities
(
*
this
);
nonbonded
=
new
OpenCLNonbondedUtilities
(
*
this
);
nonbonded
=
new
OpenCLNonbondedUtilities
(
*
this
);
posq
=
new
OpenCLArray
<
mm_float4
>
(
*
this
,
paddedNumAtoms
,
"posq"
,
true
);
posq
=
new
OpenCLArray
<
mm_float4
>
(
*
this
,
paddedNumAtoms
,
"posq"
,
true
);
...
...
platforms/opencl/src/cl.hpp
View file @
385c1475
...
@@ -182,6 +182,38 @@
...
@@ -182,6 +182,38 @@
#include <cstring>
#include <cstring>
#include <cstdlib>
#include <cstdlib>
// Defines from cl_ext.h that may not be present in the installed version.
#ifndef CL_DEVICE_GLOBAL_FREE_MEMORY_AMD
#define CL_DEVICE_GLOBAL_FREE_MEMORY_AMD 0x4039
#endif
#ifndef CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD
#define CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD 0x4040
#endif
#ifndef CL_DEVICE_SIMD_WIDTH_AMD
#define CL_DEVICE_SIMD_WIDTH_AMD 0x4041
#endif
#ifndef CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD
#define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD 0x4042
#endif
#ifndef CL_DEVICE_WAVEFRONT_WIDTH_AMD
#define CL_DEVICE_WAVEFRONT_WIDTH_AMD 0x4043
#endif
#ifndef CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD
#define CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD 0x4044
#endif
#ifndef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD
#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD 0x4045
#endif
#ifndef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD
#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD 0x4046
#endif
#ifndef CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD
#define CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD 0x4047
#endif
#ifndef CL_DEVICE_LOCAL_MEM_BANKS_AMD
#define CL_DEVICE_LOCAL_MEM_BANKS_AMD 0x4048
#endif
/*! \namespace cl
/*! \namespace cl
*
*
* \brief The OpenCL C++ bindings are defined within this namespace.
* \brief The OpenCL C++ bindings are defined within this namespace.
...
@@ -988,6 +1020,36 @@ __CL_DECLARE_PARAM_TRAITS(cl_platform_info, CL_PLATFORM_ICD_SUFFIX_KHR, STRING_C
...
@@ -988,6 +1020,36 @@ __CL_DECLARE_PARAM_TRAITS(cl_platform_info, CL_PLATFORM_ICD_SUFFIX_KHR, STRING_C
#ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD
#ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD
__CL_DECLARE_PARAM_TRAITS
(
cl_device_info
,
CL_DEVICE_PROFILING_TIMER_OFFSET_AMD
,
cl_ulong
)
__CL_DECLARE_PARAM_TRAITS
(
cl_device_info
,
CL_DEVICE_PROFILING_TIMER_OFFSET_AMD
,
cl_ulong
)
#endif
#endif
#ifdef CL_DEVICE_GLOBAL_FREE_MEMORY_AMD
__CL_DECLARE_PARAM_TRAITS
(
cl_device_info
,
CL_DEVICE_GLOBAL_FREE_MEMORY_AMD
,
VECTOR_CLASS
<
::
size_t
>
)
#endif
#ifdef CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD
__CL_DECLARE_PARAM_TRAITS
(
cl_device_info
,
CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD
,
cl_uint
)
#endif
#ifdef CL_DEVICE_SIMD_WIDTH_AMD
__CL_DECLARE_PARAM_TRAITS
(
cl_device_info
,
CL_DEVICE_SIMD_WIDTH_AMD
,
cl_uint
)
#endif
#ifdef CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD
__CL_DECLARE_PARAM_TRAITS
(
cl_device_info
,
CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD
,
cl_uint
)
#endif
#ifdef CL_DEVICE_WAVEFRONT_WIDTH_AMD
__CL_DECLARE_PARAM_TRAITS
(
cl_device_info
,
CL_DEVICE_WAVEFRONT_WIDTH_AMD
,
cl_uint
)
#endif
#ifdef CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD
__CL_DECLARE_PARAM_TRAITS
(
cl_device_info
,
CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD
,
cl_uint
)
#endif
#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD
__CL_DECLARE_PARAM_TRAITS
(
cl_device_info
,
CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD
,
cl_uint
)
#endif
#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD
__CL_DECLARE_PARAM_TRAITS
(
cl_device_info
,
CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD
,
cl_uint
)
#endif
#ifdef CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD
__CL_DECLARE_PARAM_TRAITS
(
cl_device_info
,
CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD
,
cl_uint
)
#endif
#ifdef CL_DEVICE_LOCAL_MEM_BANKS_AMD
__CL_DECLARE_PARAM_TRAITS
(
cl_device_info
,
CL_DEVICE_LOCAL_MEM_BANKS_AMD
,
cl_uint
)
#endif
#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
__CL_DECLARE_PARAM_TRAITS
(
cl_device_info
,
CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
,
cl_uint
)
__CL_DECLARE_PARAM_TRAITS
(
cl_device_info
,
CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
,
cl_uint
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment