Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
c35407a8
Commit
c35407a8
authored
Jan 13, 2012
by
Peter Eastman
Browse files
Tony Tye's optimizations to sorting. Also a couple of other very minor fixes.
parent
052aea3e
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
145 additions
and
89 deletions
+145
-89
platforms/opencl/src/OpenCLArray.h
platforms/opencl/src/OpenCLArray.h
+2
-2
platforms/opencl/src/OpenCLContext.cpp
platforms/opencl/src/OpenCLContext.cpp
+1
-1
platforms/opencl/src/OpenCLKernels.cpp
platforms/opencl/src/OpenCLKernels.cpp
+2
-2
platforms/opencl/src/OpenCLKernels.h
platforms/opencl/src/OpenCLKernels.h
+11
-1
platforms/opencl/src/OpenCLSort.cpp
platforms/opencl/src/OpenCLSort.cpp
+0
-2
platforms/opencl/src/OpenCLSort.h
platforms/opencl/src/OpenCLSort.h
+57
-28
platforms/opencl/src/kernels/sort.cl
platforms/opencl/src/kernels/sort.cl
+60
-52
platforms/opencl/tests/TestOpenCLSort.cpp
platforms/opencl/tests/TestOpenCLSort.cpp
+12
-1
No files found.
platforms/opencl/src/OpenCLArray.h
View file @
c35407a8
...
@@ -90,13 +90,13 @@ public:
...
@@ -90,13 +90,13 @@ public:
/**
/**
* Get the size of the array.
* Get the size of the array.
*/
*/
int
getSize
()
{
int
getSize
()
const
{
return
size
;
return
size
;
}
}
/**
/**
* Get the name of the array.
* Get the name of the array.
*/
*/
const
std
::
string
&
getName
()
{
const
std
::
string
&
getName
()
const
{
return
name
;
return
name
;
}
}
/**
/**
...
...
platforms/opencl/src/OpenCLContext.cpp
View file @
c35407a8
...
@@ -131,7 +131,7 @@ OpenCLContext::OpenCLContext(int numParticles, int platformIndex, int deviceInde
...
@@ -131,7 +131,7 @@ OpenCLContext::OpenCLContext(int numParticles, int platformIndex, int deviceInde
else
else
simdWidth
=
1
;
simdWidth
=
1
;
if
(
platforms
[
0
].
getInfo
<
CL_PLATFORM_VENDOR
>
()
==
"Apple"
&&
vendor
==
"AMD"
)
if
(
platforms
[
0
].
getInfo
<
CL_PLATFORM_VENDOR
>
()
==
"Apple"
&&
vendor
==
"AMD"
)
compilationDefines
[
"MAC_AMD_WORKAROUND"
]
=
=
""
;
compilationDefines
[
"MAC_AMD_WORKAROUND"
]
=
""
;
if
(
supports64BitGlobalAtomics
)
if
(
supports64BitGlobalAtomics
)
compilationDefines
[
"SUPPORTS_64_BIT_ATOMICS"
]
=
""
;
compilationDefines
[
"SUPPORTS_64_BIT_ATOMICS"
]
=
""
;
if
(
supportsDoublePrecision
)
if
(
supportsDoublePrecision
)
...
...
platforms/opencl/src/OpenCLKernels.cpp
View file @
c35407a8
...
@@ -1077,10 +1077,10 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
...
@@ -1077,10 +1077,10 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
pmeBsplineTheta
=
new
OpenCLArray
<
mm_float4
>
(
cl
,
PmeOrder
*
numParticles
,
"pmeBsplineTheta"
);
pmeBsplineTheta
=
new
OpenCLArray
<
mm_float4
>
(
cl
,
PmeOrder
*
numParticles
,
"pmeBsplineTheta"
);
bool
deviceIsCpu
=
(
cl
.
getDevice
().
getInfo
<
CL_DEVICE_TYPE
>
()
==
CL_DEVICE_TYPE_CPU
);
bool
deviceIsCpu
=
(
cl
.
getDevice
().
getInfo
<
CL_DEVICE_TYPE
>
()
==
CL_DEVICE_TYPE_CPU
);
if
(
deviceIsCpu
)
if
(
deviceIsCpu
)
pmeBsplineDTheta
=
new
OpenCLArray
<
mm_float4
>
(
cl
,
PmeOrder
*
numParticles
,
"pmeBsplineTheta"
);
pmeBsplineDTheta
=
new
OpenCLArray
<
mm_float4
>
(
cl
,
PmeOrder
*
numParticles
,
"pmeBspline
D
Theta"
);
pmeAtomRange
=
new
OpenCLArray
<
cl_int
>
(
cl
,
gridSizeX
*
gridSizeY
*
gridSizeZ
+
1
,
"pmeAtomRange"
);
pmeAtomRange
=
new
OpenCLArray
<
cl_int
>
(
cl
,
gridSizeX
*
gridSizeY
*
gridSizeZ
+
1
,
"pmeAtomRange"
);
pmeAtomGridIndex
=
new
OpenCLArray
<
mm_int2
>
(
cl
,
numParticles
,
"pmeAtomGridIndex"
);
pmeAtomGridIndex
=
new
OpenCLArray
<
mm_int2
>
(
cl
,
numParticles
,
"pmeAtomGridIndex"
);
sort
=
new
OpenCLSort
<
mm_int2
>
(
cl
,
cl
.
getNumAtoms
()
,
"int2"
,
"value.y"
);
sort
=
new
OpenCLSort
<
SortTrait
>
(
cl
,
cl
.
getNumAtoms
());
fft
=
new
OpenCLFFT3D
(
cl
,
gridSizeX
,
gridSizeY
,
gridSizeZ
);
fft
=
new
OpenCLFFT3D
(
cl
,
gridSizeX
,
gridSizeY
,
gridSizeZ
);
// Initialize the b-spline moduli.
// Initialize the b-spline moduli.
...
...
platforms/opencl/src/OpenCLKernels.h
View file @
c35407a8
...
@@ -481,6 +481,16 @@ public:
...
@@ -481,6 +481,16 @@ public:
*/
*/
double
execute
(
ContextImpl
&
context
,
bool
includeForces
,
bool
includeEnergy
);
double
execute
(
ContextImpl
&
context
,
bool
includeForces
,
bool
includeEnergy
);
private:
private:
struct
SortTrait
{
typedef
mm_int2
DataType
;
typedef
cl_int
KeyType
;
static
const
char
*
clDataType
()
{
return
"int2"
;}
static
const
char
*
clKeyType
()
{
return
"int"
;}
static
const
char
*
clMinKey
()
{
return
"INT_MIN"
;}
static
const
char
*
clMaxKey
()
{
return
"INT_MAX"
;}
static
const
char
*
clMaxValue
()
{
return
"(int2) (INT_MAX, INT_MAX)"
;}
static
const
char
*
clSortKey
()
{
return
"value.y"
;}
};
OpenCLContext
&
cl
;
OpenCLContext
&
cl
;
bool
hasInitializedKernel
;
bool
hasInitializedKernel
;
OpenCLArray
<
mm_float2
>*
sigmaEpsilon
;
OpenCLArray
<
mm_float2
>*
sigmaEpsilon
;
...
@@ -495,7 +505,7 @@ private:
...
@@ -495,7 +505,7 @@ private:
OpenCLArray
<
mm_float4
>*
pmeBsplineDTheta
;
OpenCLArray
<
mm_float4
>*
pmeBsplineDTheta
;
OpenCLArray
<
cl_int
>*
pmeAtomRange
;
OpenCLArray
<
cl_int
>*
pmeAtomRange
;
OpenCLArray
<
mm_int2
>*
pmeAtomGridIndex
;
OpenCLArray
<
mm_int2
>*
pmeAtomGridIndex
;
OpenCLSort
<
mm_int2
>*
sort
;
OpenCLSort
<
SortTrait
>*
sort
;
OpenCLFFT3D
*
fft
;
OpenCLFFT3D
*
fft
;
cl
::
Kernel
ewaldSumsKernel
;
cl
::
Kernel
ewaldSumsKernel
;
cl
::
Kernel
ewaldForcesKernel
;
cl
::
Kernel
ewaldForcesKernel
;
...
...
platforms/opencl/src/OpenCLSort.cpp
View file @
c35407a8
#include "OpenCLSort.h"
#include "OpenCLSort.h"
template
class
OpenMM
::
OpenCLSort
<
float
>;
platforms/opencl/src/OpenCLSort.h
View file @
c35407a8
...
@@ -37,6 +37,28 @@ namespace OpenMM {
...
@@ -37,6 +37,28 @@ namespace OpenMM {
/**
/**
* This class sorts arrays of values. It supports any type of values, not just scalars,
* This class sorts arrays of values. It supports any type of values, not just scalars,
* so long as an appropriate sorting key can be defined by which to sort them.
* so long as an appropriate sorting key can be defined by which to sort them.
*
* The class is templatized by a "trait" class that defines the type of data to
* sort and the key for sorting it. Here is an example of a trait class for
* sorting floats:
*
* struct FloatTrait {
* // The names of the data and key types being sorted.
* // Both the host type and the OpenCL type are required.
* // For primitive types they will be the same.
* typedef cl_float DataType;
* typedef cl_float KeyType;
* static const char* clDataType() {return "float";}
* static const char* clKeyType() {return "float";}
* // The minimum value a key can take.
* static const char* clMinKey() {return "-MAXFLOAT";}
* // The maximum value a key can take.
* static const char* clMaxKey() {return "MAXFLOAT";}
* // A value whose key is guaranteed to equal clMaxKey().
* static const char* clMaxValue() {return "MAXFLOAT";}
* // The OpenCL code to select the key from the data value.
* static const char* clSortKey() {return "value";}
* };
*
*
* The algorithm used is a bucket sort, followed by a bitonic sort within each bucket
* The algorithm used is a bucket sort, followed by a bitonic sort within each bucket
* (in local memory when possible, in global memory otherwise). This is similar to
* (in local memory when possible, in global memory otherwise). This is similar to
...
@@ -51,8 +73,8 @@ namespace OpenMM {
...
@@ -51,8 +73,8 @@ namespace OpenMM {
* good performance with the array sizes we typically work with (10,000 to 100,000
* good performance with the array sizes we typically work with (10,000 to 100,000
* elements).
* elements).
*/
*/
template
<
class
T
YPE
>
template
<
class
T
RAIT
>
class
OPENMM_EXPORT
OpenCLSort
{
class
OPENMM_EXPORT
OpenCLSort
{
public:
public:
/**
/**
...
@@ -60,17 +82,18 @@ public:
...
@@ -60,17 +82,18 @@ public:
*
*
* @param context the context in which to perform calculations
* @param context the context in which to perform calculations
* @param length the length of the arrays this object will be used to sort
* @param length the length of the arrays this object will be used to sort
* @param typeName the name of the data type being sorting (e.g. "float")
* @param sortKey an expression that returns the value by which the variable "value" should be sorted.
* For primitive types, this will simply be "value".
*/
*/
OpenCLSort
(
OpenCLContext
&
context
,
int
length
,
const
std
::
string
&
typeName
,
const
std
::
string
&
sortKey
)
:
context
(
context
),
OpenCLSort
(
OpenCLContext
&
context
,
unsigned
int
length
)
:
context
(
context
),
dataRange
(
NULL
),
bucketOfElement
(
NULL
),
offsetInBucket
(
NULL
),
bucketOffset
(
NULL
),
buckets
(
NULL
)
{
dataRange
(
NULL
),
bucketOfElement
(
NULL
),
offsetInBucket
(
NULL
),
bucketOffset
(
NULL
),
buckets
(
NULL
)
{
// Create kernels.
// Create kernels.
std
::
map
<
std
::
string
,
std
::
string
>
replacements
;
std
::
map
<
std
::
string
,
std
::
string
>
replacements
;
replacements
[
"TYPE"
]
=
typeName
;
replacements
[
"DATA_TYPE"
]
=
TRAIT
::
clDataType
();
replacements
[
"SORT_KEY"
]
=
sortKey
;
replacements
[
"KEY_TYPE"
]
=
TRAIT
::
clKeyType
();
replacements
[
"SORT_KEY"
]
=
TRAIT
::
clSortKey
();
replacements
[
"MIN_KEY"
]
=
TRAIT
::
clMinKey
();
replacements
[
"MAX_KEY"
]
=
TRAIT
::
clMaxKey
();
replacements
[
"MAX_VALUE"
]
=
TRAIT
::
clMaxValue
();
cl
::
Program
program
=
context
.
createProgram
(
context
.
replaceStrings
(
OpenCLKernelSources
::
sort
,
replacements
));
cl
::
Program
program
=
context
.
createProgram
(
context
.
replaceStrings
(
OpenCLKernelSources
::
sort
,
replacements
));
computeRangeKernel
=
cl
::
Kernel
(
program
,
"computeRange"
);
computeRangeKernel
=
cl
::
Kernel
(
program
,
"computeRange"
);
assignElementsKernel
=
cl
::
Kernel
(
program
,
"assignElementsToBuckets"
);
assignElementsKernel
=
cl
::
Kernel
(
program
,
"assignElementsToBuckets"
);
...
@@ -80,18 +103,18 @@ public:
...
@@ -80,18 +103,18 @@ public:
// Work out the work group sizes for various kernels.
// Work out the work group sizes for various kernels.
int
maxGroupSize
=
std
::
min
(
256
,
(
int
)
context
.
getDevice
().
getInfo
<
CL_DEVICE_MAX_WORK_GROUP_SIZE
>
());
unsigned
int
maxGroupSize
=
std
::
min
(
256
,
(
int
)
context
.
getDevice
().
getInfo
<
CL_DEVICE_MAX_WORK_GROUP_SIZE
>
());
for
(
rangeKernelSize
=
1
;
rangeKernelSize
*
2
<=
maxGroupSize
;
rangeKernelSize
*=
2
)
for
(
rangeKernelSize
=
1
;
rangeKernelSize
*
2
<=
maxGroupSize
;
rangeKernelSize
*=
2
)
;
;
positionsKernelSize
=
rangeKernelSize
;
positionsKernelSize
=
rangeKernelSize
;
sortKernelSize
=
rangeKernelSize
/
2
;
sortKernelSize
=
rangeKernelSize
/
2
;
if
(
rangeKernelSize
>
length
)
if
(
rangeKernelSize
>
length
)
rangeKernelSize
=
length
;
rangeKernelSize
=
length
;
int
maxLocalBuffer
=
(
int
)
((
context
.
getDevice
().
getInfo
<
CL_DEVICE_LOCAL_MEM_SIZE
>
()
/
sizeof
(
TYPE
))
/
2
);
unsigned
int
maxLocalBuffer
=
(
unsigned
int
)
((
context
.
getDevice
().
getInfo
<
CL_DEVICE_LOCAL_MEM_SIZE
>
()
/
sizeof
(
typename
TRAIT
::
DataType
))
/
2
);
if
(
sortKernelSize
>
maxLocalBuffer
)
if
(
sortKernelSize
>
maxLocalBuffer
)
sortKernelSize
=
maxLocalBuffer
;
sortKernelSize
=
maxLocalBuffer
;
int
targetBucketSize
=
sortKernelSize
/
2
;
unsigned
int
targetBucketSize
=
sortKernelSize
/
2
;
int
numBuckets
=
length
/
targetBucketSize
;
unsigned
int
numBuckets
=
length
/
targetBucketSize
;
if
(
numBuckets
<
1
)
if
(
numBuckets
<
1
)
numBuckets
=
1
;
numBuckets
=
1
;
if
(
positionsKernelSize
>
numBuckets
)
if
(
positionsKernelSize
>
numBuckets
)
...
@@ -99,11 +122,11 @@ public:
...
@@ -99,11 +122,11 @@ public:
// Create workspace arrays.
// Create workspace arrays.
dataRange
=
new
OpenCLArray
<
mm_float2
>
(
context
,
1
,
"sortDataRange"
);
dataRange
=
new
OpenCLArray
<
typename
TRAIT
::
KeyType
>
(
context
,
2
,
"sortDataRange"
);
bucketOffset
=
new
OpenCLArray
<
cl_int
>
(
context
,
numBuckets
,
"bucketOffset"
);
bucketOffset
=
new
OpenCLArray
<
cl_
u
int
>
(
context
,
numBuckets
,
"bucketOffset"
);
bucketOfElement
=
new
OpenCLArray
<
cl_int
>
(
context
,
length
,
"bucketOfElement"
);
bucketOfElement
=
new
OpenCLArray
<
cl_
u
int
>
(
context
,
length
,
"bucketOfElement"
);
offsetInBucket
=
new
OpenCLArray
<
cl_int
>
(
context
,
length
,
"offsetInBucket"
);
offsetInBucket
=
new
OpenCLArray
<
cl_
u
int
>
(
context
,
length
,
"offsetInBucket"
);
buckets
=
new
OpenCLArray
<
TYPE
>
(
context
,
length
,
"buckets"
);
buckets
=
new
OpenCLArray
<
typename
TRAIT
::
DataType
>
(
context
,
length
,
"buckets"
);
}
}
~
OpenCLSort
()
{
~
OpenCLSort
()
{
if
(
dataRange
!=
NULL
)
if
(
dataRange
!=
NULL
)
...
@@ -120,18 +143,24 @@ public:
...
@@ -120,18 +143,24 @@ public:
/**
/**
* Sort an array.
* Sort an array.
*/
*/
void
sort
(
OpenCLArray
<
TYPE
>&
data
)
{
void
sort
(
OpenCLArray
<
typename
TRAIT
::
DataType
>&
data
)
{
if
(
data
.
getSize
()
!=
bucketOfElement
->
getSize
())
throw
OpenMMException
(
"OpenCLSort called with different data size"
);
if
(
data
.
getSize
()
==
0
)
return
;
// Compute the range of data values.
// Compute the range of data values.
computeRangeKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
data
.
getDeviceBuffer
());
computeRangeKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
data
.
getDeviceBuffer
());
computeRangeKernel
.
setArg
<
cl_int
>
(
1
,
data
.
getSize
());
computeRangeKernel
.
setArg
<
cl_
u
int
>
(
1
,
data
.
getSize
());
computeRangeKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
dataRange
->
getDeviceBuffer
());
computeRangeKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
dataRange
->
getDeviceBuffer
());
computeRangeKernel
.
setArg
(
3
,
rangeKernelSize
*
sizeof
(
cl_float
),
NULL
);
computeRangeKernel
.
setArg
(
3
,
rangeKernelSize
*
sizeof
(
typename
TRAIT
::
KeyType
),
NULL
);
context
.
executeKernel
(
computeRangeKernel
,
rangeKernelSize
,
rangeKernelSize
);
context
.
executeKernel
(
computeRangeKernel
,
rangeKernelSize
,
rangeKernelSize
);
// Assign array elements to buckets.
// Assign array elements to buckets.
int
numBuckets
=
bucketOffset
->
getSize
();
unsigned
int
numBuckets
=
bucketOffset
->
getSize
();
context
.
clearBuffer
(
bucketOffset
->
getDeviceBuffer
(),
numBuckets
);
context
.
clearBuffer
(
bucketOffset
->
getDeviceBuffer
(),
numBuckets
);
assignElementsKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
data
.
getDeviceBuffer
());
assignElementsKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
data
.
getDeviceBuffer
());
assignElementsKernel
.
setArg
<
cl_int
>
(
1
,
data
.
getSize
());
assignElementsKernel
.
setArg
<
cl_int
>
(
1
,
data
.
getSize
());
...
@@ -165,18 +194,18 @@ public:
...
@@ -165,18 +194,18 @@ public:
sortBucketsKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
buckets
->
getDeviceBuffer
());
sortBucketsKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
buckets
->
getDeviceBuffer
());
sortBucketsKernel
.
setArg
<
cl_int
>
(
2
,
numBuckets
);
sortBucketsKernel
.
setArg
<
cl_int
>
(
2
,
numBuckets
);
sortBucketsKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
bucketOffset
->
getDeviceBuffer
());
sortBucketsKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
bucketOffset
->
getDeviceBuffer
());
sortBucketsKernel
.
setArg
(
4
,
sortKernelSize
*
sizeof
(
TYPE
),
NULL
);
sortBucketsKernel
.
setArg
(
4
,
sortKernelSize
*
sizeof
(
typename
TRAIT
::
DataType
),
NULL
);
context
.
executeKernel
(
sortBucketsKernel
,
((
data
.
getSize
()
+
sortKernelSize
-
1
)
/
sortKernelSize
)
*
sortKernelSize
,
sortKernelSize
);
context
.
executeKernel
(
sortBucketsKernel
,
((
data
.
getSize
()
+
sortKernelSize
-
1
)
/
sortKernelSize
)
*
sortKernelSize
,
sortKernelSize
);
}
}
private:
private:
OpenCLContext
&
context
;
OpenCLContext
&
context
;
OpenCLArray
<
mm_float2
>*
dataRange
;
OpenCLArray
<
typename
TRAIT
::
KeyType
>*
dataRange
;
OpenCLArray
<
cl_int
>*
bucketOfElement
;
OpenCLArray
<
cl_
u
int
>*
bucketOfElement
;
OpenCLArray
<
cl_int
>*
offsetInBucket
;
OpenCLArray
<
cl_
u
int
>*
offsetInBucket
;
OpenCLArray
<
cl_int
>*
bucketOffset
;
OpenCLArray
<
cl_
u
int
>*
bucketOffset
;
OpenCLArray
<
TYPE
>*
buckets
;
OpenCLArray
<
typename
TRAIT
::
DataType
>*
buckets
;
cl
::
Kernel
computeRangeKernel
,
assignElementsKernel
,
computeBucketPositionsKernel
,
copyToBucketsKernel
,
sortBucketsKernel
;
cl
::
Kernel
computeRangeKernel
,
assignElementsKernel
,
computeBucketPositionsKernel
,
copyToBucketsKernel
,
sortBucketsKernel
;
int
rangeKernelSize
,
positionsKernelSize
,
sortKernelSize
;
unsigned
int
rangeKernelSize
,
positionsKernelSize
,
sortKernelSize
;
};
};
}
// namespace OpenMM
}
// namespace OpenMM
...
...
platforms/opencl/src/kernels/sort.cl
View file @
c35407a8
#
pragma
OPENCL
EXTENSION
cl_khr_global_int32_base_atomics
:
enable
#
pragma
OPENCL
EXTENSION
cl_khr_global_int32_base_atomics
:
enable
float
getValue
(
TYPE
value
)
{
KEY_TYPE
getValue
(
DATA_
TYPE
value
)
{
return
SORT_KEY
;
return
SORT_KEY
;
}
}
...
@@ -8,14 +8,14 @@ float getValue(TYPE value) {
...
@@ -8,14 +8,14 @@ float getValue(TYPE value) {
*
Calculate
the
minimum
and
maximum
value
in
the
array
to
be
sorted.
This
kernel
*
Calculate
the
minimum
and
maximum
value
in
the
array
to
be
sorted.
This
kernel
*
is
executed
as
a
single
work
group.
*
is
executed
as
a
single
work
group.
*/
*/
__kernel
void
computeRange
(
__global
const
TYPE*
restrict
data,
int
length,
__global
float2
*
restrict
range,
__local
float
*
restrict
buffer
)
{
__kernel
void
computeRange
(
__global
const
DATA_
TYPE*
restrict
data,
u
int
length,
__global
KEY_TYPE
*
restrict
range,
__local
KEY_TYPE
*
restrict
buffer
)
{
float
minimum
=
MAX
FLOAT
;
KEY_TYPE
minimum
=
MAX
_KEY
;
float
maximum
=
-MAXFLOAT
;
KEY_TYPE
maximum
=
MIN_KEY
;
//
Each
thread
calculates
the
range
of
a
subset
of
values.
//
Each
thread
calculates
the
range
of
a
subset
of
values.
for
(
int
index
=
get_local_id
(
0
)
; index < length; index += get_local_size(0)) {
for
(
u
int
index
=
get_local_id
(
0
)
; index < length; index += get_local_size(0)) {
float
value
=
getValue
(
data[index]
)
;
KEY_TYPE
value
=
getValue
(
data[index]
)
;
minimum
=
min
(
minimum,
value
)
;
minimum
=
min
(
minimum,
value
)
;
maximum
=
max
(
maximum,
value
)
;
maximum
=
max
(
maximum,
value
)
;
}
}
...
@@ -24,7 +24,7 @@ __kernel void computeRange(__global const TYPE* restrict data, int length, __glo
...
@@ -24,7 +24,7 @@ __kernel void computeRange(__global const TYPE* restrict data, int length, __glo
buffer[get_local_id
(
0
)
]
=
minimum
;
buffer[get_local_id
(
0
)
]
=
minimum
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
int
step
=
1
; step < get_local_size(0); step *= 2) {
for
(
u
int
step
=
1
; step < get_local_size(0); step *= 2) {
if
(
get_local_id
(
0
)
+step
<
get_local_size
(
0
)
&&
get_local_id
(
0
)
%
(
2*step
)
==
0
)
if
(
get_local_id
(
0
)
+step
<
get_local_size
(
0
)
&&
get_local_id
(
0
)
%
(
2*step
)
==
0
)
buffer[get_local_id
(
0
)
]
=
min
(
buffer[get_local_id
(
0
)
],
buffer[get_local_id
(
0
)
+step]
)
;
buffer[get_local_id
(
0
)
]
=
min
(
buffer[get_local_id
(
0
)
],
buffer[get_local_id
(
0
)
+step]
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
...
@@ -32,21 +32,23 @@ __kernel void computeRange(__global const TYPE* restrict data, int length, __glo
...
@@ -32,21 +32,23 @@ __kernel void computeRange(__global const TYPE* restrict data, int length, __glo
minimum
=
buffer[0]
;
minimum
=
buffer[0]
;
buffer[get_local_id
(
0
)
]
=
maximum
;
buffer[get_local_id
(
0
)
]
=
maximum
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
int
step
=
1
; step < get_local_size(0); step *= 2) {
for
(
u
int
step
=
1
; step < get_local_size(0); step *= 2) {
if
(
get_local_id
(
0
)
+step
<
get_local_size
(
0
)
&&
get_local_id
(
0
)
%
(
2*step
)
==
0
)
if
(
get_local_id
(
0
)
+step
<
get_local_size
(
0
)
&&
get_local_id
(
0
)
%
(
2*step
)
==
0
)
buffer[get_local_id
(
0
)
]
=
max
(
buffer[get_local_id
(
0
)
],
buffer[get_local_id
(
0
)
+step]
)
;
buffer[get_local_id
(
0
)
]
=
max
(
buffer[get_local_id
(
0
)
],
buffer[get_local_id
(
0
)
+step]
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
}
maximum
=
buffer[0]
;
maximum
=
buffer[0]
;
if
(
get_local_id
(
0
)
==
0
)
if
(
get_local_id
(
0
)
==
0
)
{
range[0]
=
(
float2
)
(
minimum,
maximum
)
;
range[0]
=
minimum
;
range[1]
=
maximum
;
}
}
}
/**
/**
*
Assign
elements
to
buckets.
*
Assign
elements
to
buckets.
*/
*/
__kernel
void
assignElementsToBuckets
(
__global
const
TYPE*
restrict
data,
int
length,
int
numBuckets,
__global
const
float2
*
restrict
range,
__kernel
void
assignElementsToBuckets
(
__global
const
DATA_
TYPE*
restrict
data,
u
int
length,
u
int
numBuckets,
__global
const
KEY_TYPE
*
restrict
range,
__global
int*
bucketOffset,
__global
int*
restrict
bucketOfElement,
__global
int*
restrict
offsetInBucket
)
{
__global
u
int*
bucketOffset,
__global
u
int*
restrict
bucketOfElement,
__global
u
int*
restrict
offsetInBucket
)
{
#
ifdef
AMD_ATOMIC_WORK_AROUND
#
ifdef
AMD_ATOMIC_WORK_AROUND
//
Do
a
byte
write
to
force
all
memory
accesses
to
interactionCount
to
use
the
complete
path.
//
Do
a
byte
write
to
force
all
memory
accesses
to
interactionCount
to
use
the
complete
path.
//
This
avoids
the
atomic
access
from
causing
all
word
accesses
to
other
buffers
from
using
the
slow
complete
path.
//
This
avoids
the
atomic
access
from
causing
all
word
accesses
to
other
buffers
from
using
the
slow
complete
path.
...
@@ -55,19 +57,18 @@ __kernel void assignElementsToBuckets(__global const TYPE* restrict data, int le
...
@@ -55,19 +57,18 @@ __kernel void assignElementsToBuckets(__global const TYPE* restrict data, int le
if
(
get_global_id
(
0
)
==
get_local_id
(
0
)
+1
)
if
(
get_global_id
(
0
)
==
get_local_id
(
0
)
+1
)
((
__global
char*
)
bucketOffset
)
[sizeof
(
int
)
*numBuckets+1]
=
0
;
((
__global
char*
)
bucketOffset
)
[sizeof
(
int
)
*numBuckets+1]
=
0
;
#
endif
#
endif
float2
dataRange
=
range[0]
;
float
minValue
=
(
float
)
(
range[0]
)
;
float
minValue
=
dataRange.x
;
float
maxValue
=
(
float
)
(
range[1]
)
;
float
maxValue
=
dataRange.y
;
float
bucketWidth
=
(
maxValue-minValue
)
/numBuckets
;
float
bucketWidth
=
(
maxValue-minValue
)
/numBuckets
;
for
(
int
index
=
get_global_id
(
0
)
; index < length; index += get_global_size(0)) {
for
(
u
int
index
=
get_global_id
(
0
)
; index < length; index += get_global_size(0)) {
#
ifdef
MAC_AMD_WORKAROUND
#
ifdef
MAC_AMD_WORKAROUND
__global
int*
d
=
(
__global
int*
)
data
;
__global
int*
d
=
(
__global
int*
)
data
;
int2
element
=
(
int2
)
(
d[2*index],
d[2*index+1]
)
;
int2
element
=
(
int2
)
(
d[2*index],
d[2*index+1]
)
;
float
key
=
(
float
)
getValue
(
element
)
;
#
else
#
else
TYPE
element
=
data[index]
;
float
key
=
(
float
)
getValue
(
data[index]
)
;
#
endif
#
endif
float
value
=
getValue
(
element
)
;
uint
bucketIndex
=
min
((
uint
)
((
key-minValue
)
/bucketWidth
)
,
numBuckets-1
)
;
int
bucketIndex
=
min
((
int
)
((
value-minValue
)
/bucketWidth
)
,
numBuckets-1
)
;
offsetInBucket[index]
=
atom_inc
(
&bucketOffset[bucketIndex]
)
;
offsetInBucket[index]
=
atom_inc
(
&bucketOffset[bucketIndex]
)
;
bucketOfElement[index]
=
bucketIndex
;
bucketOfElement[index]
=
bucketIndex
;
}
}
...
@@ -77,19 +78,19 @@ __kernel void assignElementsToBuckets(__global const TYPE* restrict data, int le
...
@@ -77,19 +78,19 @@ __kernel void assignElementsToBuckets(__global const TYPE* restrict data, int le
*
Sum
the
bucket
sizes
to
compute
the
start
position
of
each
bucket.
This
kernel
*
Sum
the
bucket
sizes
to
compute
the
start
position
of
each
bucket.
This
kernel
*
is
executed
as
a
single
work
group.
*
is
executed
as
a
single
work
group.
*/
*/
__kernel
void
computeBucketPositions
(
int
numBuckets,
__global
int*
restrict
bucketOffset,
__local
int*
restrict
buffer
)
{
__kernel
void
computeBucketPositions
(
u
int
numBuckets,
__global
u
int*
restrict
bucketOffset,
__local
u
int*
restrict
buffer
)
{
int
globalOffset
=
0
;
u
int
globalOffset
=
0
;
for
(
int
startBucket
=
0
; startBucket < numBuckets; startBucket += get_local_size(0)) {
for
(
u
int
startBucket
=
0
; startBucket < numBuckets; startBucket += get_local_size(0)) {
//
Load
the
bucket
sizes
into
local
memory.
//
Load
the
bucket
sizes
into
local
memory.
int
globalIndex
=
startBucket+get_local_id
(
0
)
;
u
int
globalIndex
=
startBucket+get_local_id
(
0
)
;
buffer[get_local_id
(
0
)
]
=
(
globalIndex
<
numBuckets
?
bucketOffset[globalIndex]
:
0
)
;
buffer[get_local_id
(
0
)
]
=
(
globalIndex
<
numBuckets
?
bucketOffset[globalIndex]
:
0
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
Perform
a
parallel
prefix
sum.
//
Perform
a
parallel
prefix
sum.
for
(
int
step
=
1
; step < get_local_size(0); step *= 2) {
for
(
u
int
step
=
1
; step < get_local_size(0); step *= 2) {
int
add
=
(
get_local_id
(
0
)
>=
step
?
buffer[get_local_id
(
0
)
-step]
:
0
)
;
u
int
add
=
(
get_local_id
(
0
)
>=
step
?
buffer[get_local_id
(
0
)
-step]
:
0
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
buffer[get_local_id
(
0
)
]
+=
add
;
buffer[get_local_id
(
0
)
]
+=
add
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
...
@@ -106,11 +107,11 @@ __kernel void computeBucketPositions(int numBuckets, __global int* restrict buck
...
@@ -106,11 +107,11 @@ __kernel void computeBucketPositions(int numBuckets, __global int* restrict buck
/**
/**
*
Copy
the
input
data
into
the
buckets
for
sorting.
*
Copy
the
input
data
into
the
buckets
for
sorting.
*/
*/
__kernel
void
copyDataToBuckets
(
__global
const
TYPE*
restrict
data,
__global
TYPE*
restrict
buckets,
int
length,
__global
const
int*
restrict
bucketOffset,
__global
const
int*
restrict
bucketOfElement,
__global
const
int*
restrict
offsetInBucket
)
{
__kernel
void
copyDataToBuckets
(
__global
const
DATA_
TYPE*
restrict
data,
__global
DATA_
TYPE*
restrict
buckets,
u
int
length,
__global
const
u
int*
restrict
bucketOffset,
__global
const
u
int*
restrict
bucketOfElement,
__global
const
u
int*
restrict
offsetInBucket
)
{
for
(
int
index
=
get_global_id
(
0
)
; index < length; index += get_global_size(0)) {
for
(
u
int
index
=
get_global_id
(
0
)
; index < length; index += get_global_size(0)) {
TYPE
element
=
data[index]
;
DATA_
TYPE
element
=
data[index]
;
int
bucketIndex
=
bucketOfElement[index]
;
u
int
bucketIndex
=
bucketOfElement[index]
;
int
offset
=
(
bucketIndex
==
0
?
0
:
bucketOffset[bucketIndex-1]
)
;
u
int
offset
=
(
bucketIndex
==
0
?
0
:
bucketOffset[bucketIndex-1]
)
;
buckets[offset+offsetInBucket[index]]
=
element
;
buckets[offset+offsetInBucket[index]]
=
element
;
}
}
}
}
...
@@ -118,28 +119,34 @@ __kernel void copyDataToBuckets(__global const TYPE* restrict data, __global TYP
...
@@ -118,28 +119,34 @@ __kernel void copyDataToBuckets(__global const TYPE* restrict data, __global TYP
/**
/**
*
Sort
the
data
in
each
bucket.
*
Sort
the
data
in
each
bucket.
*/
*/
__kernel
void
sortBuckets
(
__global
TYPE*
data,
__global
const
TYPE*
buckets,
int
numBuckets,
__global
const
int*
restrict
bucketOffset,
__local
TYPE*
buffer
)
{
__kernel
void
sortBuckets
(
__global
DATA_
TYPE*
restrict
data,
__global
const
DATA_
TYPE*
restrict
buckets,
u
int
numBuckets,
__global
const
u
int*
restrict
bucketOffset,
__local
DATA_TYPE*
restrict
buffer
)
{
for
(
int
index
=
get_group_id
(
0
)
; index < numBuckets; index += get_num_groups(0)) {
for
(
u
int
index
=
get_group_id
(
0
)
; index < numBuckets; index += get_num_groups(0)) {
int
startIndex
=
(
index
==
0
?
0
:
bucketOffset[index-1]
)
;
u
int
startIndex
=
(
index
==
0
?
0
:
bucketOffset[index-1]
)
;
int
endIndex
=
bucketOffset[index]
;
u
int
endIndex
=
bucketOffset[index]
;
int
length
=
endIndex-startIndex
;
u
int
length
=
endIndex-startIndex
;
if
(
length
<=
get_local_size
(
0
))
{
if
(
length
<=
get_local_size
(
0
))
{
//
Load
the
data
into
local
memory.
//
Load
the
data
into
local
memory.
buffer[get_local_id
(
0
)
]
=
(
get_local_id
(
0
)
<
length
?
buckets[startIndex+get_local_id
(
0
)
]
:
(
TYPE
)
MAXFLOAT
)
;
if
(
get_local_id
(
0
)
<
length
)
buffer[get_local_id
(
0
)
]
=
buckets[startIndex+get_local_id
(
0
)
]
;
else
buffer[get_local_id
(
0
)
]
=
MAX_VALUE
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
Perform
a
bitonic
sort
in
local
memory.
//
Perform
a
bitonic
sort
in
local
memory.
for
(
int
k
=
2
; k <= get_local_size(0); k *= 2) {
for
(
u
int
k
=
2
; k <= get_local_size(0); k *= 2) {
for
(
int
j
=
k/2
; j > 0; j /= 2) {
for
(
u
int
j
=
k/2
; j > 0; j /= 2) {
int
ixj
=
get_local_id
(
0
)
^j
;
int
ixj
=
get_local_id
(
0
)
^j
;
if
(
ixj
>
get_local_id
(
0
))
{
if
(
ixj
>
get_local_id
(
0
))
{
if
(((
get_local_id
(
0
)
&k
)
==
0
&&
getValue
(
buffer[get_local_id
(
0
)
]
)
>
getValue
(
buffer[ixj]
))
|
|
DATA_TYPE
value1
=
buffer[get_local_id
(
0
)
]
;
((get_local_id(0)&k) != 0 && getValue(buffer[get_local_id(0)]) < getValue(buffer[ixj]))) {
DATA_TYPE
value2
=
buffer[ixj]
;
TYPE temp = buffer[get_local_id(0)];
bool
ascending
=
(
get_local_id
(
0
)
&k
)
==
0
;
buffer[get_local_id(0)] = buffer[ixj];
KEY_TYPE
lowKey
=
(
ascending
?
getValue
(
value1
)
:
getValue
(
value2
))
;
buffer[ixj] = temp;
KEY_TYPE
highKey
=
(
ascending
?
getValue
(
value2
)
:
getValue
(
value1
))
;
if
(
lowKey
>
highKey
)
{
buffer[get_local_id
(
0
)
]
=
value2
;
buffer[ixj]
=
value1
;
}
}
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
...
@@ -154,24 +161,25 @@ __kernel void sortBuckets(__global TYPE* data, __global const TYPE* buckets, int
...
@@ -154,24 +161,25 @@ __kernel void sortBuckets(__global TYPE* data, __global const TYPE* buckets, int
else
{
else
{
//
Copy
the
bucket
data
over
to
the
output
array.
//
Copy
the
bucket
data
over
to
the
output
array.
for (int i = get_local_id(0); i < length; i += get_local_size(0))
for
(
u
int
i
=
get_local_id
(
0
)
; i < length; i += get_local_size(0))
data[startIndex+i]
=
buckets[startIndex+i]
;
data[startIndex+i]
=
buckets[startIndex+i]
;
barrier
(
CLK_GLOBAL_MEM_FENCE
)
;
barrier
(
CLK_GLOBAL_MEM_FENCE
)
;
//
Perform
a
bitonic
sort
in
global
memory.
//
Perform
a
bitonic
sort
in
global
memory.
for (int k = 2; k < 2*length; k *= 2) {
for
(
u
int
k
=
2
; k < 2*length; k *= 2) {
for (int j = k/2; j > 0; j /= 2) {
for
(
u
int
j
=
k/2
; j > 0; j /= 2) {
for (int i = get_local_id(0); i < length; i += get_local_size(0)) {
for
(
u
int
i
=
get_local_id
(
0
)
; i < length; i += get_local_size(0)) {
int
ixj
=
i^j
;
int
ixj
=
i^j
;
if
(
ixj
>
i
&&
ixj
<
length
)
{
if
(
ixj
>
i
&&
ixj
<
length
)
{
TYPE value1 = data[startIndex+i];
DATA_
TYPE
value1
=
data[startIndex+i]
;
TYPE value2 = data[startIndex+ixj];
DATA_
TYPE
value2
=
data[startIndex+ixj]
;
bool
ascending
=
((
i&k
)
==
0
)
;
bool
ascending
=
((
i&k
)
==
0
)
;
for (int mask = k*2; mask < 2*length; mask *= 2)
for
(
u
int
mask
=
k*2
; mask < 2*length; mask *= 2)
ascending
=
((
i&mask
)
==
0
?
!ascending
:
ascending
)
;
ascending
=
((
i&mask
)
==
0
?
!ascending
:
ascending
)
;
if ((ascending && getValue(value1) > getValue(value2)) |
|
KEY_TYPE
lowKey
=
(
ascending
?
getValue
(
value1
)
:
getValue
(
value2
))
;
(
!ascending
&&
getValue
(
value1
)
<
getValue
(
value2
)))
{
KEY_TYPE
highKey
=
(
ascending
?
getValue
(
value2
)
:
getValue
(
value1
))
;
if
(
lowKey
>
highKey
)
{
data[startIndex+i]
=
value2
;
data[startIndex+i]
=
value2
;
data[startIndex+ixj]
=
value1
;
data[startIndex+ixj]
=
value1
;
}
}
...
...
platforms/opencl/tests/TestOpenCLSort.cpp
View file @
c35407a8
...
@@ -46,6 +46,17 @@
...
@@ -46,6 +46,17 @@
using
namespace
OpenMM
;
using
namespace
OpenMM
;
using
namespace
std
;
using
namespace
std
;
struct
SortTrait
{
typedef
cl_float
DataType
;
typedef
cl_float
KeyType
;
static
const
char
*
clDataType
()
{
return
"float"
;}
static
const
char
*
clKeyType
()
{
return
"float"
;}
static
const
char
*
clMinKey
()
{
return
"-MAXFLOAT"
;}
static
const
char
*
clMaxKey
()
{
return
"MAXFLOAT"
;}
static
const
char
*
clMaxValue
()
{
return
"MAXFLOAT"
;}
static
const
char
*
clSortKey
()
{
return
"value"
;}
};
void
verifySorting
(
vector
<
float
>
array
)
{
void
verifySorting
(
vector
<
float
>
array
)
{
// Sort the array.
// Sort the array.
...
@@ -56,7 +67,7 @@ void verifySorting(vector<float> array) {
...
@@ -56,7 +67,7 @@ void verifySorting(vector<float> array) {
context
.
initialize
(
system
);
context
.
initialize
(
system
);
OpenCLArray
<
float
>
data
(
context
,
array
.
size
(),
"sortData"
);
OpenCLArray
<
float
>
data
(
context
,
array
.
size
(),
"sortData"
);
data
.
upload
(
array
);
data
.
upload
(
array
);
OpenCLSort
<
floa
t
>
sort
(
context
,
array
.
size
()
,
"float"
,
"value"
);
OpenCLSort
<
SortTrai
t
>
sort
(
context
,
array
.
size
());
sort
.
sort
(
data
);
sort
.
sort
(
data
);
vector
<
float
>
sorted
;
vector
<
float
>
sorted
;
data
.
download
(
sorted
);
data
.
download
(
sorted
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment