Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
c35407a8
"vscode:/vscode.git/clone" did not exist on "56766760fe15e1ee7ca572998b32c25d5d2abed7"
Commit
c35407a8
authored
Jan 13, 2012
by
Peter Eastman
Browse files
Tony Tye's optimizations to sorting. Also a couple of other very minor fixes.
parent
052aea3e
Changes
8
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
145 additions
and
89 deletions
+145
-89
platforms/opencl/src/OpenCLArray.h
platforms/opencl/src/OpenCLArray.h
+2
-2
platforms/opencl/src/OpenCLContext.cpp
platforms/opencl/src/OpenCLContext.cpp
+1
-1
platforms/opencl/src/OpenCLKernels.cpp
platforms/opencl/src/OpenCLKernels.cpp
+2
-2
platforms/opencl/src/OpenCLKernels.h
platforms/opencl/src/OpenCLKernels.h
+11
-1
platforms/opencl/src/OpenCLSort.cpp
platforms/opencl/src/OpenCLSort.cpp
+0
-2
platforms/opencl/src/OpenCLSort.h
platforms/opencl/src/OpenCLSort.h
+57
-28
platforms/opencl/src/kernels/sort.cl
platforms/opencl/src/kernels/sort.cl
+60
-52
platforms/opencl/tests/TestOpenCLSort.cpp
platforms/opencl/tests/TestOpenCLSort.cpp
+12
-1
No files found.
platforms/opencl/src/OpenCLArray.h
View file @
c35407a8
...
@@ -90,13 +90,13 @@ public:
...
@@ -90,13 +90,13 @@ public:
/**
/**
* Get the size of the array.
* Get the size of the array.
*/
*/
int
getSize
()
{
int
getSize
()
const
{
return
size
;
return
size
;
}
}
/**
/**
* Get the name of the array.
* Get the name of the array.
*/
*/
const
std
::
string
&
getName
()
{
const
std
::
string
&
getName
()
const
{
return
name
;
return
name
;
}
}
/**
/**
...
...
platforms/opencl/src/OpenCLContext.cpp
View file @
c35407a8
...
@@ -131,7 +131,7 @@ OpenCLContext::OpenCLContext(int numParticles, int platformIndex, int deviceInde
...
@@ -131,7 +131,7 @@ OpenCLContext::OpenCLContext(int numParticles, int platformIndex, int deviceInde
else
else
simdWidth
=
1
;
simdWidth
=
1
;
if
(
platforms
[
0
].
getInfo
<
CL_PLATFORM_VENDOR
>
()
==
"Apple"
&&
vendor
==
"AMD"
)
if
(
platforms
[
0
].
getInfo
<
CL_PLATFORM_VENDOR
>
()
==
"Apple"
&&
vendor
==
"AMD"
)
compilationDefines
[
"MAC_AMD_WORKAROUND"
]
=
=
""
;
compilationDefines
[
"MAC_AMD_WORKAROUND"
]
=
""
;
if
(
supports64BitGlobalAtomics
)
if
(
supports64BitGlobalAtomics
)
compilationDefines
[
"SUPPORTS_64_BIT_ATOMICS"
]
=
""
;
compilationDefines
[
"SUPPORTS_64_BIT_ATOMICS"
]
=
""
;
if
(
supportsDoublePrecision
)
if
(
supportsDoublePrecision
)
...
...
platforms/opencl/src/OpenCLKernels.cpp
View file @
c35407a8
...
@@ -1077,10 +1077,10 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
...
@@ -1077,10 +1077,10 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
pmeBsplineTheta
=
new
OpenCLArray
<
mm_float4
>
(
cl
,
PmeOrder
*
numParticles
,
"pmeBsplineTheta"
);
pmeBsplineTheta
=
new
OpenCLArray
<
mm_float4
>
(
cl
,
PmeOrder
*
numParticles
,
"pmeBsplineTheta"
);
bool
deviceIsCpu
=
(
cl
.
getDevice
().
getInfo
<
CL_DEVICE_TYPE
>
()
==
CL_DEVICE_TYPE_CPU
);
bool
deviceIsCpu
=
(
cl
.
getDevice
().
getInfo
<
CL_DEVICE_TYPE
>
()
==
CL_DEVICE_TYPE_CPU
);
if
(
deviceIsCpu
)
if
(
deviceIsCpu
)
pmeBsplineDTheta
=
new
OpenCLArray
<
mm_float4
>
(
cl
,
PmeOrder
*
numParticles
,
"pmeBsplineTheta"
);
pmeBsplineDTheta
=
new
OpenCLArray
<
mm_float4
>
(
cl
,
PmeOrder
*
numParticles
,
"pmeBspline
D
Theta"
);
pmeAtomRange
=
new
OpenCLArray
<
cl_int
>
(
cl
,
gridSizeX
*
gridSizeY
*
gridSizeZ
+
1
,
"pmeAtomRange"
);
pmeAtomRange
=
new
OpenCLArray
<
cl_int
>
(
cl
,
gridSizeX
*
gridSizeY
*
gridSizeZ
+
1
,
"pmeAtomRange"
);
pmeAtomGridIndex
=
new
OpenCLArray
<
mm_int2
>
(
cl
,
numParticles
,
"pmeAtomGridIndex"
);
pmeAtomGridIndex
=
new
OpenCLArray
<
mm_int2
>
(
cl
,
numParticles
,
"pmeAtomGridIndex"
);
sort
=
new
OpenCLSort
<
mm_int2
>
(
cl
,
cl
.
getNumAtoms
()
,
"int2"
,
"value.y"
);
sort
=
new
OpenCLSort
<
SortTrait
>
(
cl
,
cl
.
getNumAtoms
());
fft
=
new
OpenCLFFT3D
(
cl
,
gridSizeX
,
gridSizeY
,
gridSizeZ
);
fft
=
new
OpenCLFFT3D
(
cl
,
gridSizeX
,
gridSizeY
,
gridSizeZ
);
// Initialize the b-spline moduli.
// Initialize the b-spline moduli.
...
...
platforms/opencl/src/OpenCLKernels.h
View file @
c35407a8
...
@@ -481,6 +481,16 @@ public:
...
@@ -481,6 +481,16 @@ public:
*/
*/
double
execute
(
ContextImpl
&
context
,
bool
includeForces
,
bool
includeEnergy
);
double
execute
(
ContextImpl
&
context
,
bool
includeForces
,
bool
includeEnergy
);
private:
private:
struct
SortTrait
{
typedef
mm_int2
DataType
;
typedef
cl_int
KeyType
;
static
const
char
*
clDataType
()
{
return
"int2"
;}
static
const
char
*
clKeyType
()
{
return
"int"
;}
static
const
char
*
clMinKey
()
{
return
"INT_MIN"
;}
static
const
char
*
clMaxKey
()
{
return
"INT_MAX"
;}
static
const
char
*
clMaxValue
()
{
return
"(int2) (INT_MAX, INT_MAX)"
;}
static
const
char
*
clSortKey
()
{
return
"value.y"
;}
};
OpenCLContext
&
cl
;
OpenCLContext
&
cl
;
bool
hasInitializedKernel
;
bool
hasInitializedKernel
;
OpenCLArray
<
mm_float2
>*
sigmaEpsilon
;
OpenCLArray
<
mm_float2
>*
sigmaEpsilon
;
...
@@ -495,7 +505,7 @@ private:
...
@@ -495,7 +505,7 @@ private:
OpenCLArray
<
mm_float4
>*
pmeBsplineDTheta
;
OpenCLArray
<
mm_float4
>*
pmeBsplineDTheta
;
OpenCLArray
<
cl_int
>*
pmeAtomRange
;
OpenCLArray
<
cl_int
>*
pmeAtomRange
;
OpenCLArray
<
mm_int2
>*
pmeAtomGridIndex
;
OpenCLArray
<
mm_int2
>*
pmeAtomGridIndex
;
OpenCLSort
<
mm_int2
>*
sort
;
OpenCLSort
<
SortTrait
>*
sort
;
OpenCLFFT3D
*
fft
;
OpenCLFFT3D
*
fft
;
cl
::
Kernel
ewaldSumsKernel
;
cl
::
Kernel
ewaldSumsKernel
;
cl
::
Kernel
ewaldForcesKernel
;
cl
::
Kernel
ewaldForcesKernel
;
...
...
platforms/opencl/src/OpenCLSort.cpp
View file @
c35407a8
#include "OpenCLSort.h"
#include "OpenCLSort.h"
template
class
OpenMM
::
OpenCLSort
<
float
>;
platforms/opencl/src/OpenCLSort.h
View file @
c35407a8
...
@@ -38,6 +38,28 @@ namespace OpenMM {
...
@@ -38,6 +38,28 @@ namespace OpenMM {
* This class sorts arrays of values. It supports any type of values, not just scalars,
* This class sorts arrays of values. It supports any type of values, not just scalars,
* so long as an appropriate sorting key can be defined by which to sort them.
* so long as an appropriate sorting key can be defined by which to sort them.
*
*
* The class is templatized by a "trait" class that defines the type of data to
* sort and the key for sorting it. Here is an example of a trait class for
* sorting floats:
*
* struct FloatTrait {
* // The name of the data and key types being sorted.
 *     // Both the host type and the OpenCL type are required.
* // For primitive types they will be the same.
* typedef cl_float DataType;
* typedef cl_float KeyType;
* static const char* clDataType() {return "float";}
* static const char* clKeyType() {return "float";}
* // The minimum value a key can take.
* static const char* clMinKey() {return "-MAXFLOAT";}
* // The maximum value a key can take.
* static const char* clMaxKey() {return "MAXFLOAT";}
* // A value whose key is guaranteed to equal clMaxKey().
* static const char* clMaxValue() {return "MAXFLOAT";}
* // The OpenCL code to select the key from the data value.
* static const char* clSortKey() {return "value";}
* };
*
* The algorithm used is a bucket sort, followed by a bitonic sort within each bucket
* The algorithm used is a bucket sort, followed by a bitonic sort within each bucket
* (in local memory when possible, in global memory otherwise). This is similar to
* (in local memory when possible, in global memory otherwise). This is similar to
* the algorithm described in
* the algorithm described in
...
@@ -52,7 +74,7 @@ namespace OpenMM {
...
@@ -52,7 +74,7 @@ namespace OpenMM {
* elements).
* elements).
*/
*/
template
<
class
T
YPE
>
template
<
class
T
RAIT
>
class
OPENMM_EXPORT
OpenCLSort
{
class
OPENMM_EXPORT
OpenCLSort
{
public:
public:
/**
/**
...
@@ -60,17 +82,18 @@ public:
...
@@ -60,17 +82,18 @@ public:
*
*
* @param context the context in which to perform calculations
* @param context the context in which to perform calculations
* @param length the length of the arrays this object will be used to sort
* @param length the length of the arrays this object will be used to sort
 * @param typeName the name of the data type being sorted (e.g. "float")
* @param sortKey an expression that returns the value by which the variable "value" should be sorted.
* For primitive types, this will simply be "value".
*/
*/
OpenCLSort
(
OpenCLContext
&
context
,
int
length
,
const
std
::
string
&
typeName
,
const
std
::
string
&
sortKey
)
:
context
(
context
),
OpenCLSort
(
OpenCLContext
&
context
,
unsigned
int
length
)
:
context
(
context
),
dataRange
(
NULL
),
bucketOfElement
(
NULL
),
offsetInBucket
(
NULL
),
bucketOffset
(
NULL
),
buckets
(
NULL
)
{
dataRange
(
NULL
),
bucketOfElement
(
NULL
),
offsetInBucket
(
NULL
),
bucketOffset
(
NULL
),
buckets
(
NULL
)
{
// Create kernels.
// Create kernels.
std
::
map
<
std
::
string
,
std
::
string
>
replacements
;
std
::
map
<
std
::
string
,
std
::
string
>
replacements
;
replacements
[
"TYPE"
]
=
typeName
;
replacements
[
"DATA_TYPE"
]
=
TRAIT
::
clDataType
();
replacements
[
"SORT_KEY"
]
=
sortKey
;
replacements
[
"KEY_TYPE"
]
=
TRAIT
::
clKeyType
();
replacements
[
"SORT_KEY"
]
=
TRAIT
::
clSortKey
();
replacements
[
"MIN_KEY"
]
=
TRAIT
::
clMinKey
();
replacements
[
"MAX_KEY"
]
=
TRAIT
::
clMaxKey
();
replacements
[
"MAX_VALUE"
]
=
TRAIT
::
clMaxValue
();
cl
::
Program
program
=
context
.
createProgram
(
context
.
replaceStrings
(
OpenCLKernelSources
::
sort
,
replacements
));
cl
::
Program
program
=
context
.
createProgram
(
context
.
replaceStrings
(
OpenCLKernelSources
::
sort
,
replacements
));
computeRangeKernel
=
cl
::
Kernel
(
program
,
"computeRange"
);
computeRangeKernel
=
cl
::
Kernel
(
program
,
"computeRange"
);
assignElementsKernel
=
cl
::
Kernel
(
program
,
"assignElementsToBuckets"
);
assignElementsKernel
=
cl
::
Kernel
(
program
,
"assignElementsToBuckets"
);
...
@@ -80,18 +103,18 @@ public:
...
@@ -80,18 +103,18 @@ public:
// Work out the work group sizes for various kernels.
// Work out the work group sizes for various kernels.
int
maxGroupSize
=
std
::
min
(
256
,
(
int
)
context
.
getDevice
().
getInfo
<
CL_DEVICE_MAX_WORK_GROUP_SIZE
>
());
unsigned
int
maxGroupSize
=
std
::
min
(
256
,
(
int
)
context
.
getDevice
().
getInfo
<
CL_DEVICE_MAX_WORK_GROUP_SIZE
>
());
for
(
rangeKernelSize
=
1
;
rangeKernelSize
*
2
<=
maxGroupSize
;
rangeKernelSize
*=
2
)
for
(
rangeKernelSize
=
1
;
rangeKernelSize
*
2
<=
maxGroupSize
;
rangeKernelSize
*=
2
)
;
;
positionsKernelSize
=
rangeKernelSize
;
positionsKernelSize
=
rangeKernelSize
;
sortKernelSize
=
rangeKernelSize
/
2
;
sortKernelSize
=
rangeKernelSize
/
2
;
if
(
rangeKernelSize
>
length
)
if
(
rangeKernelSize
>
length
)
rangeKernelSize
=
length
;
rangeKernelSize
=
length
;
int
maxLocalBuffer
=
(
int
)
((
context
.
getDevice
().
getInfo
<
CL_DEVICE_LOCAL_MEM_SIZE
>
()
/
sizeof
(
TYPE
))
/
2
);
unsigned
int
maxLocalBuffer
=
(
unsigned
int
)
((
context
.
getDevice
().
getInfo
<
CL_DEVICE_LOCAL_MEM_SIZE
>
()
/
sizeof
(
typename
TRAIT
::
DataType
))
/
2
);
if
(
sortKernelSize
>
maxLocalBuffer
)
if
(
sortKernelSize
>
maxLocalBuffer
)
sortKernelSize
=
maxLocalBuffer
;
sortKernelSize
=
maxLocalBuffer
;
int
targetBucketSize
=
sortKernelSize
/
2
;
unsigned
int
targetBucketSize
=
sortKernelSize
/
2
;
int
numBuckets
=
length
/
targetBucketSize
;
unsigned
int
numBuckets
=
length
/
targetBucketSize
;
if
(
numBuckets
<
1
)
if
(
numBuckets
<
1
)
numBuckets
=
1
;
numBuckets
=
1
;
if
(
positionsKernelSize
>
numBuckets
)
if
(
positionsKernelSize
>
numBuckets
)
...
@@ -99,11 +122,11 @@ public:
...
@@ -99,11 +122,11 @@ public:
// Create workspace arrays.
// Create workspace arrays.
dataRange
=
new
OpenCLArray
<
mm_float2
>
(
context
,
1
,
"sortDataRange"
);
dataRange
=
new
OpenCLArray
<
typename
TRAIT
::
KeyType
>
(
context
,
2
,
"sortDataRange"
);
bucketOffset
=
new
OpenCLArray
<
cl_int
>
(
context
,
numBuckets
,
"bucketOffset"
);
bucketOffset
=
new
OpenCLArray
<
cl_
u
int
>
(
context
,
numBuckets
,
"bucketOffset"
);
bucketOfElement
=
new
OpenCLArray
<
cl_int
>
(
context
,
length
,
"bucketOfElement"
);
bucketOfElement
=
new
OpenCLArray
<
cl_
u
int
>
(
context
,
length
,
"bucketOfElement"
);
offsetInBucket
=
new
OpenCLArray
<
cl_int
>
(
context
,
length
,
"offsetInBucket"
);
offsetInBucket
=
new
OpenCLArray
<
cl_
u
int
>
(
context
,
length
,
"offsetInBucket"
);
buckets
=
new
OpenCLArray
<
TYPE
>
(
context
,
length
,
"buckets"
);
buckets
=
new
OpenCLArray
<
typename
TRAIT
::
DataType
>
(
context
,
length
,
"buckets"
);
}
}
~
OpenCLSort
()
{
~
OpenCLSort
()
{
if
(
dataRange
!=
NULL
)
if
(
dataRange
!=
NULL
)
...
@@ -120,18 +143,24 @@ public:
...
@@ -120,18 +143,24 @@ public:
/**
/**
* Sort an array.
* Sort an array.
*/
*/
void
sort
(
OpenCLArray
<
TYPE
>&
data
)
{
void
sort
(
OpenCLArray
<
typename
TRAIT
::
DataType
>&
data
)
{
if
(
data
.
getSize
()
!=
bucketOfElement
->
getSize
())
throw
OpenMMException
(
"OpenCLSort called with different data size"
);
if
(
data
.
getSize
()
==
0
)
return
;
// Compute the range of data values.
// Compute the range of data values.
computeRangeKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
data
.
getDeviceBuffer
());
computeRangeKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
data
.
getDeviceBuffer
());
computeRangeKernel
.
setArg
<
cl_int
>
(
1
,
data
.
getSize
());
computeRangeKernel
.
setArg
<
cl_
u
int
>
(
1
,
data
.
getSize
());
computeRangeKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
dataRange
->
getDeviceBuffer
());
computeRangeKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
dataRange
->
getDeviceBuffer
());
computeRangeKernel
.
setArg
(
3
,
rangeKernelSize
*
sizeof
(
cl_float
),
NULL
);
computeRangeKernel
.
setArg
(
3
,
rangeKernelSize
*
sizeof
(
typename
TRAIT
::
KeyType
),
NULL
);
context
.
executeKernel
(
computeRangeKernel
,
rangeKernelSize
,
rangeKernelSize
);
context
.
executeKernel
(
computeRangeKernel
,
rangeKernelSize
,
rangeKernelSize
);
// Assign array elements to buckets.
// Assign array elements to buckets.
int
numBuckets
=
bucketOffset
->
getSize
();
unsigned
int
numBuckets
=
bucketOffset
->
getSize
();
context
.
clearBuffer
(
bucketOffset
->
getDeviceBuffer
(),
numBuckets
);
context
.
clearBuffer
(
bucketOffset
->
getDeviceBuffer
(),
numBuckets
);
assignElementsKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
data
.
getDeviceBuffer
());
assignElementsKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
data
.
getDeviceBuffer
());
assignElementsKernel
.
setArg
<
cl_int
>
(
1
,
data
.
getSize
());
assignElementsKernel
.
setArg
<
cl_int
>
(
1
,
data
.
getSize
());
...
@@ -165,18 +194,18 @@ public:
...
@@ -165,18 +194,18 @@ public:
sortBucketsKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
buckets
->
getDeviceBuffer
());
sortBucketsKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
buckets
->
getDeviceBuffer
());
sortBucketsKernel
.
setArg
<
cl_int
>
(
2
,
numBuckets
);
sortBucketsKernel
.
setArg
<
cl_int
>
(
2
,
numBuckets
);
sortBucketsKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
bucketOffset
->
getDeviceBuffer
());
sortBucketsKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
bucketOffset
->
getDeviceBuffer
());
sortBucketsKernel
.
setArg
(
4
,
sortKernelSize
*
sizeof
(
TYPE
),
NULL
);
sortBucketsKernel
.
setArg
(
4
,
sortKernelSize
*
sizeof
(
typename
TRAIT
::
DataType
),
NULL
);
context
.
executeKernel
(
sortBucketsKernel
,
((
data
.
getSize
()
+
sortKernelSize
-
1
)
/
sortKernelSize
)
*
sortKernelSize
,
sortKernelSize
);
context
.
executeKernel
(
sortBucketsKernel
,
((
data
.
getSize
()
+
sortKernelSize
-
1
)
/
sortKernelSize
)
*
sortKernelSize
,
sortKernelSize
);
}
}
private:
private:
OpenCLContext
&
context
;
OpenCLContext
&
context
;
OpenCLArray
<
mm_float2
>*
dataRange
;
OpenCLArray
<
typename
TRAIT
::
KeyType
>*
dataRange
;
OpenCLArray
<
cl_int
>*
bucketOfElement
;
OpenCLArray
<
cl_
u
int
>*
bucketOfElement
;
OpenCLArray
<
cl_int
>*
offsetInBucket
;
OpenCLArray
<
cl_
u
int
>*
offsetInBucket
;
OpenCLArray
<
cl_int
>*
bucketOffset
;
OpenCLArray
<
cl_
u
int
>*
bucketOffset
;
OpenCLArray
<
TYPE
>*
buckets
;
OpenCLArray
<
typename
TRAIT
::
DataType
>*
buckets
;
cl
::
Kernel
computeRangeKernel
,
assignElementsKernel
,
computeBucketPositionsKernel
,
copyToBucketsKernel
,
sortBucketsKernel
;
cl
::
Kernel
computeRangeKernel
,
assignElementsKernel
,
computeBucketPositionsKernel
,
copyToBucketsKernel
,
sortBucketsKernel
;
int
rangeKernelSize
,
positionsKernelSize
,
sortKernelSize
;
unsigned
int
rangeKernelSize
,
positionsKernelSize
,
sortKernelSize
;
};
};
}
// namespace OpenMM
}
// namespace OpenMM
...
...
platforms/opencl/src/kernels/sort.cl
View file @
c35407a8
#
pragma
OPENCL
EXTENSION
cl_khr_global_int32_base_atomics
:
enable
#
pragma
OPENCL
EXTENSION
cl_khr_global_int32_base_atomics
:
enable
float
getValue
(
TYPE
value
)
{
KEY_TYPE
getValue
(
DATA_
TYPE
value
)
{
return
SORT_KEY
;
return
SORT_KEY
;
}
}
...
@@ -8,14 +8,14 @@ float getValue(TYPE value) {
...
@@ -8,14 +8,14 @@ float getValue(TYPE value) {
*
Calculate
the
minimum
and
maximum
value
in
the
array
to
be
sorted.
This
kernel
*
Calculate
the
minimum
and
maximum
value
in
the
array
to
be
sorted.
This
kernel
*
is
executed
as
a
single
work
group.
*
is
executed
as
a
single
work
group.
*/
*/
__kernel
void
computeRange
(
__global
const
TYPE*
restrict
data,
int
length,
__global
float2
*
restrict
range,
__local
float
*
restrict
buffer
)
{
__kernel
void
computeRange
(
__global
const
DATA_
TYPE*
restrict
data,
u
int
length,
__global
KEY_TYPE
*
restrict
range,
__local
KEY_TYPE
*
restrict
buffer
)
{
float
minimum
=
MAX
FLOAT
;
KEY_TYPE
minimum
=
MAX
_KEY
;
float
maximum
=
-MAXFLOAT
;
KEY_TYPE
maximum
=
MIN_KEY
;
//
Each
thread
calculates
the
range
of
a
subset
of
values.
//
Each
thread
calculates
the
range
of
a
subset
of
values.
for
(
int
index
=
get_local_id
(
0
)
; index < length; index += get_local_size(0)) {
for
(
u
int
index
=
get_local_id
(
0
)
; index < length; index += get_local_size(0)) {
float
value
=
getValue
(
data[index]
)
;
KEY_TYPE
value
=
getValue
(
data[index]
)
;
minimum
=
min
(
minimum,
value
)
;
minimum
=
min
(
minimum,
value
)
;
maximum
=
max
(
maximum,
value
)
;
maximum
=
max
(
maximum,
value
)
;
}
}
...
@@ -24,7 +24,7 @@ __kernel void computeRange(__global const TYPE* restrict data, int length, __glo
...
@@ -24,7 +24,7 @@ __kernel void computeRange(__global const TYPE* restrict data, int length, __glo
buffer[get_local_id
(
0
)
]
=
minimum
;
buffer[get_local_id
(
0
)
]
=
minimum
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
int
step
=
1
; step < get_local_size(0); step *= 2) {
for
(
u
int
step
=
1
; step < get_local_size(0); step *= 2) {
if
(
get_local_id
(
0
)
+step
<
get_local_size
(
0
)
&&
get_local_id
(
0
)
%
(
2*step
)
==
0
)
if
(
get_local_id
(
0
)
+step
<
get_local_size
(
0
)
&&
get_local_id
(
0
)
%
(
2*step
)
==
0
)
buffer[get_local_id
(
0
)
]
=
min
(
buffer[get_local_id
(
0
)
],
buffer[get_local_id
(
0
)
+step]
)
;
buffer[get_local_id
(
0
)
]
=
min
(
buffer[get_local_id
(
0
)
],
buffer[get_local_id
(
0
)
+step]
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
...
@@ -32,21 +32,23 @@ __kernel void computeRange(__global const TYPE* restrict data, int length, __glo
...
@@ -32,21 +32,23 @@ __kernel void computeRange(__global const TYPE* restrict data, int length, __glo
minimum
=
buffer[0]
;
minimum
=
buffer[0]
;
buffer[get_local_id
(
0
)
]
=
maximum
;
buffer[get_local_id
(
0
)
]
=
maximum
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
int
step
=
1
; step < get_local_size(0); step *= 2) {
for
(
u
int
step
=
1
; step < get_local_size(0); step *= 2) {
if
(
get_local_id
(
0
)
+step
<
get_local_size
(
0
)
&&
get_local_id
(
0
)
%
(
2*step
)
==
0
)
if
(
get_local_id
(
0
)
+step
<
get_local_size
(
0
)
&&
get_local_id
(
0
)
%
(
2*step
)
==
0
)
buffer[get_local_id
(
0
)
]
=
max
(
buffer[get_local_id
(
0
)
],
buffer[get_local_id
(
0
)
+step]
)
;
buffer[get_local_id
(
0
)
]
=
max
(
buffer[get_local_id
(
0
)
],
buffer[get_local_id
(
0
)
+step]
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
}
maximum
=
buffer[0]
;
maximum
=
buffer[0]
;
if
(
get_local_id
(
0
)
==
0
)
if
(
get_local_id
(
0
)
==
0
)
{
range[0]
=
(
float2
)
(
minimum,
maximum
)
;
range[0]
=
minimum
;
range[1]
=
maximum
;
}
}
}
/**
/**
*
Assign
elements
to
buckets.
*
Assign
elements
to
buckets.
*/
*/
__kernel
void
assignElementsToBuckets
(
__global
const
TYPE*
restrict
data,
int
length,
int
numBuckets,
__global
const
float2
*
restrict
range,
__kernel
void
assignElementsToBuckets
(
__global
const
DATA_
TYPE*
restrict
data,
u
int
length,
u
int
numBuckets,
__global
const
KEY_TYPE
*
restrict
range,
__global
int*
bucketOffset,
__global
int*
restrict
bucketOfElement,
__global
int*
restrict
offsetInBucket
)
{
__global
u
int*
bucketOffset,
__global
u
int*
restrict
bucketOfElement,
__global
u
int*
restrict
offsetInBucket
)
{
#
ifdef
AMD_ATOMIC_WORK_AROUND
#
ifdef
AMD_ATOMIC_WORK_AROUND
//
Do
a
byte
write
to
force
all
memory
accesses
to
bucketOffset
to
use
the
complete
path.
//
Do
a
byte
write
to
force
all
memory
accesses
to
bucketOffset
to
use
the
complete
path.
//
This
avoids
the
atomic
access
from
causing
all
word
accesses
to
other
buffers
from
using
the
slow
complete
path.
//
This
avoids
the
atomic
access
from
causing
all
word
accesses
to
other
buffers
from
using
the
slow
complete
path.
...
@@ -55,19 +57,18 @@ __kernel void assignElementsToBuckets(__global const TYPE* restrict data, int le
...
@@ -55,19 +57,18 @@ __kernel void assignElementsToBuckets(__global const TYPE* restrict data, int le
if
(
get_global_id
(
0
)
==
get_local_id
(
0
)
+1
)
if
(
get_global_id
(
0
)
==
get_local_id
(
0
)
+1
)
((
__global
char*
)
bucketOffset
)
[sizeof
(
int
)
*numBuckets+1]
=
0
;
((
__global
char*
)
bucketOffset
)
[sizeof
(
int
)
*numBuckets+1]
=
0
;
#
endif
#
endif
float2
dataRange
=
range[0]
;
float
minValue
=
(
float
)
(
range[0]
)
;
float
minValue
=
dataRange.x
;
float
maxValue
=
(
float
)
(
range[1]
)
;
float
maxValue
=
dataRange.y
;
float
bucketWidth
=
(
maxValue-minValue
)
/numBuckets
;
float
bucketWidth
=
(
maxValue-minValue
)
/numBuckets
;
for
(
int
index
=
get_global_id
(
0
)
; index < length; index += get_global_size(0)) {
for
(
u
int
index
=
get_global_id
(
0
)
; index < length; index += get_global_size(0)) {
#
ifdef
MAC_AMD_WORKAROUND
#
ifdef
MAC_AMD_WORKAROUND
__global
int*
d
=
(
__global
int*
)
data
;
__global
int*
d
=
(
__global
int*
)
data
;
int2
element
=
(
int2
)
(
d[2*index],
d[2*index+1]
)
;
int2
element
=
(
int2
)
(
d[2*index],
d[2*index+1]
)
;
float
key
=
(
float
)
getValue
(
element
)
;
#
else
#
else
TYPE
element
=
data[index]
;
float
key
=
(
float
)
getValue
(
data[index]
)
;
#
endif
#
endif
float
value
=
getValue
(
element
)
;
uint
bucketIndex
=
min
((
uint
)
((
key-minValue
)
/bucketWidth
)
,
numBuckets-1
)
;
int
bucketIndex
=
min
((
int
)
((
value-minValue
)
/bucketWidth
)
,
numBuckets-1
)
;
offsetInBucket[index]
=
atom_inc
(
&bucketOffset[bucketIndex]
)
;
offsetInBucket[index]
=
atom_inc
(
&bucketOffset[bucketIndex]
)
;
bucketOfElement[index]
=
bucketIndex
;
bucketOfElement[index]
=
bucketIndex
;
}
}
...
@@ -77,19 +78,19 @@ __kernel void assignElementsToBuckets(__global const TYPE* restrict data, int le
...
@@ -77,19 +78,19 @@ __kernel void assignElementsToBuckets(__global const TYPE* restrict data, int le
*
Sum
the
bucket
sizes
to
compute
the
start
position
of
each
bucket.
This
kernel
*
Sum
the
bucket
sizes
to
compute
the
start
position
of
each
bucket.
This
kernel
*
is
executed
as
a
single
work
group.
*
is
executed
as
a
single
work
group.
*/
*/
__kernel
void
computeBucketPositions
(
int
numBuckets,
__global
int*
restrict
bucketOffset,
__local
int*
restrict
buffer
)
{
__kernel
void
computeBucketPositions
(
u
int
numBuckets,
__global
u
int*
restrict
bucketOffset,
__local
u
int*
restrict
buffer
)
{
int
globalOffset
=
0
;
u
int
globalOffset
=
0
;
for
(
int
startBucket
=
0
; startBucket < numBuckets; startBucket += get_local_size(0)) {
for
(
u
int
startBucket
=
0
; startBucket < numBuckets; startBucket += get_local_size(0)) {
//
Load
the
bucket
sizes
into
local
memory.
//
Load
the
bucket
sizes
into
local
memory.
int
globalIndex
=
startBucket+get_local_id
(
0
)
;
u
int
globalIndex
=
startBucket+get_local_id
(
0
)
;
buffer[get_local_id
(
0
)
]
=
(
globalIndex
<
numBuckets
?
bucketOffset[globalIndex]
:
0
)
;
buffer[get_local_id
(
0
)
]
=
(
globalIndex
<
numBuckets
?
bucketOffset[globalIndex]
:
0
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
Perform
a
parallel
prefix
sum.
//
Perform
a
parallel
prefix
sum.
for
(
int
step
=
1
; step < get_local_size(0); step *= 2) {
for
(
u
int
step
=
1
; step < get_local_size(0); step *= 2) {
int
add
=
(
get_local_id
(
0
)
>=
step
?
buffer[get_local_id
(
0
)
-step]
:
0
)
;
u
int
add
=
(
get_local_id
(
0
)
>=
step
?
buffer[get_local_id
(
0
)
-step]
:
0
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
buffer[get_local_id
(
0
)
]
+=
add
;
buffer[get_local_id
(
0
)
]
+=
add
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
...
@@ -106,11 +107,11 @@ __kernel void computeBucketPositions(int numBuckets, __global int* restrict buck
...
@@ -106,11 +107,11 @@ __kernel void computeBucketPositions(int numBuckets, __global int* restrict buck
/**
/**
*
Copy
the
input
data
into
the
buckets
for
sorting.
*
Copy
the
input
data
into
the
buckets
for
sorting.
*/
*/
__kernel
void
copyDataToBuckets
(
__global
const
TYPE*
restrict
data,
__global
TYPE*
restrict
buckets,
int
length,
__global
const
int*
restrict
bucketOffset,
__global
const
int*
restrict
bucketOfElement,
__global
const
int*
restrict
offsetInBucket
)
{
__kernel
void
copyDataToBuckets
(
__global
const
DATA_
TYPE*
restrict
data,
__global
DATA_
TYPE*
restrict
buckets,
u
int
length,
__global
const
u
int*
restrict
bucketOffset,
__global
const
u
int*
restrict
bucketOfElement,
__global
const
u
int*
restrict
offsetInBucket
)
{
for
(
int
index
=
get_global_id
(
0
)
; index < length; index += get_global_size(0)) {
for
(
u
int
index
=
get_global_id
(
0
)
; index < length; index += get_global_size(0)) {
TYPE
element
=
data[index]
;
DATA_
TYPE
element
=
data[index]
;
int
bucketIndex
=
bucketOfElement[index]
;
u
int
bucketIndex
=
bucketOfElement[index]
;
int
offset
=
(
bucketIndex
==
0
?
0
:
bucketOffset[bucketIndex-1]
)
;
u
int
offset
=
(
bucketIndex
==
0
?
0
:
bucketOffset[bucketIndex-1]
)
;
buckets[offset+offsetInBucket[index]]
=
element
;
buckets[offset+offsetInBucket[index]]
=
element
;
}
}
}
}
...
@@ -118,28 +119,34 @@ __kernel void copyDataToBuckets(__global const TYPE* restrict data, __global TYP
...
@@ -118,28 +119,34 @@ __kernel void copyDataToBuckets(__global const TYPE* restrict data, __global TYP
/**
/**
*
Sort
the
data
in
each
bucket.
*
Sort
the
data
in
each
bucket.
*/
*/
__kernel
void
sortBuckets
(
__global
TYPE*
data,
__global
const
TYPE*
buckets,
int
numBuckets,
__global
const
int*
restrict
bucketOffset,
__local
TYPE*
buffer
)
{
__kernel
void
sortBuckets
(
__global
DATA_
TYPE*
restrict
data,
__global
const
DATA_
TYPE*
restrict
buckets,
u
int
numBuckets,
__global
const
u
int*
restrict
bucketOffset,
__local
DATA_TYPE*
restrict
buffer
)
{
for
(
int
index
=
get_group_id
(
0
)
; index < numBuckets; index += get_num_groups(0)) {
for
(
u
int
index
=
get_group_id
(
0
)
; index < numBuckets; index += get_num_groups(0)) {
int
startIndex
=
(
index
==
0
?
0
:
bucketOffset[index-1]
)
;
u
int
startIndex
=
(
index
==
0
?
0
:
bucketOffset[index-1]
)
;
int
endIndex
=
bucketOffset[index]
;
u
int
endIndex
=
bucketOffset[index]
;
int
length
=
endIndex-startIndex
;
u
int
length
=
endIndex-startIndex
;
if
(
length
<=
get_local_size
(
0
))
{
if
(
length
<=
get_local_size
(
0
))
{
//
Load
the
data
into
local
memory.
//
Load
the
data
into
local
memory.
buffer[get_local_id
(
0
)
]
=
(
get_local_id
(
0
)
<
length
?
buckets[startIndex+get_local_id
(
0
)
]
:
(
TYPE
)
MAXFLOAT
)
;
if
(
get_local_id
(
0
)
<
length
)
buffer[get_local_id
(
0
)
]
=
buckets[startIndex+get_local_id
(
0
)
]
;
else
buffer[get_local_id
(
0
)
]
=
MAX_VALUE
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
Perform
a
bitonic
sort
in
local
memory.
//
Perform
a
bitonic
sort
in
local
memory.
for
(
int
k
=
2
; k <= get_local_size(0); k *= 2) {
for
(
u
int
k
=
2
; k <= get_local_size(0); k *= 2) {
for
(
int
j
=
k/2
; j > 0; j /= 2) {
for
(
u
int
j
=
k/2
; j > 0; j /= 2) {
int
ixj
=
get_local_id
(
0
)
^j
;
int
ixj
=
get_local_id
(
0
)
^j
;
if
(
ixj
>
get_local_id
(
0
))
{
if
(
ixj
>
get_local_id
(
0
))
{
if
(((
get_local_id
(
0
)
&k
)
==
0
&&
getValue
(
buffer[get_local_id
(
0
)
]
)
>
getValue
(
buffer[ixj]
))
|
|
DATA_TYPE
value1
=
buffer[get_local_id
(
0
)
]
;
((get_local_id(0)&k) != 0 && getValue(buffer[get_local_id(0)]) < getValue(buffer[ixj]))) {
DATA_TYPE
value2
=
buffer[ixj]
;
TYPE temp = buffer[get_local_id(0)];
bool
ascending
=
(
get_local_id
(
0
)
&k
)
==
0
;
buffer[get_local_id(0)] = buffer[ixj];
KEY_TYPE
lowKey
=
(
ascending
?
getValue
(
value1
)
:
getValue
(
value2
))
;
buffer[ixj] = temp;
KEY_TYPE
highKey
=
(
ascending
?
getValue
(
value2
)
:
getValue
(
value1
))
;
if
(
lowKey
>
highKey
)
{
buffer[get_local_id
(
0
)
]
=
value2
;
buffer[ixj]
=
value1
;
}
}
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
...
@@ -154,24 +161,25 @@ __kernel void sortBuckets(__global TYPE* data, __global const TYPE* buckets, int
...
@@ -154,24 +161,25 @@ __kernel void sortBuckets(__global TYPE* data, __global const TYPE* buckets, int
else
{
else
{
//
Copy
the
bucket
data
over
to
the
output
array.
//
Copy
the
bucket
data
over
to
the
output
array.
for (int i = get_local_id(0); i < length; i += get_local_size(0))
for
(
u
int
i
=
get_local_id
(
0
)
; i < length; i += get_local_size(0))
data[startIndex+i]
=
buckets[startIndex+i]
;
data[startIndex+i]
=
buckets[startIndex+i]
;
barrier
(
CLK_GLOBAL_MEM_FENCE
)
;
barrier
(
CLK_GLOBAL_MEM_FENCE
)
;
//
Perform
a
bitonic
sort
in
global
memory.
//
Perform
a
bitonic
sort
in
global
memory.
for (int k = 2; k < 2*length; k *= 2) {
for
(
u
int
k
=
2
; k < 2*length; k *= 2) {
for (int j = k/2; j > 0; j /= 2) {
for
(
u
int
j
=
k/2
; j > 0; j /= 2) {
for (int i = get_local_id(0); i < length; i += get_local_size(0)) {
for
(
u
int
i
=
get_local_id
(
0
)
; i < length; i += get_local_size(0)) {
int
ixj
=
i^j
;
int
ixj
=
i^j
;
if
(
ixj
>
i
&&
ixj
<
length
)
{
if
(
ixj
>
i
&&
ixj
<
length
)
{
TYPE value1 = data[startIndex+i];
DATA_
TYPE
value1
=
data[startIndex+i]
;
TYPE value2 = data[startIndex+ixj];
DATA_
TYPE
value2
=
data[startIndex+ixj]
;
bool
ascending
=
((
i&k
)
==
0
)
;
bool
ascending
=
((
i&k
)
==
0
)
;
for (int mask = k*2; mask < 2*length; mask *= 2)
for
(
u
int
mask
=
k*2
; mask < 2*length; mask *= 2)
ascending
=
((
i&mask
)
==
0
?
!ascending
:
ascending
)
;
ascending
=
((
i&mask
)
==
0
?
!ascending
:
ascending
)
;
if ((ascending && getValue(value1) > getValue(value2)) |
|
KEY_TYPE
lowKey
=
(
ascending
?
getValue
(
value1
)
:
getValue
(
value2
))
;
(
!ascending
&&
getValue
(
value1
)
<
getValue
(
value2
)))
{
KEY_TYPE
highKey
=
(
ascending
?
getValue
(
value2
)
:
getValue
(
value1
))
;
if
(
lowKey
>
highKey
)
{
data[startIndex+i]
=
value2
;
data[startIndex+i]
=
value2
;
data[startIndex+ixj]
=
value1
;
data[startIndex+ixj]
=
value1
;
}
}
...
...
platforms/opencl/tests/TestOpenCLSort.cpp
View file @
c35407a8
...
@@ -46,6 +46,17 @@
...
@@ -46,6 +46,17 @@
using
namespace
OpenMM
;
using
namespace
OpenMM
;
using
namespace
std
;
using
namespace
std
;
struct
SortTrait
{
typedef
cl_float
DataType
;
typedef
cl_float
KeyType
;
static
const
char
*
clDataType
()
{
return
"float"
;}
static
const
char
*
clKeyType
()
{
return
"float"
;}
static
const
char
*
clMinKey
()
{
return
"-MAXFLOAT"
;}
static
const
char
*
clMaxKey
()
{
return
"MAXFLOAT"
;}
static
const
char
*
clMaxValue
()
{
return
"MAXFLOAT"
;}
static
const
char
*
clSortKey
()
{
return
"value"
;}
};
void
verifySorting
(
vector
<
float
>
array
)
{
void
verifySorting
(
vector
<
float
>
array
)
{
// Sort the array.
// Sort the array.
...
@@ -56,7 +67,7 @@ void verifySorting(vector<float> array) {
...
@@ -56,7 +67,7 @@ void verifySorting(vector<float> array) {
context
.
initialize
(
system
);
context
.
initialize
(
system
);
OpenCLArray
<
float
>
data
(
context
,
array
.
size
(),
"sortData"
);
OpenCLArray
<
float
>
data
(
context
,
array
.
size
(),
"sortData"
);
data
.
upload
(
array
);
data
.
upload
(
array
);
OpenCLSort
<
floa
t
>
sort
(
context
,
array
.
size
()
,
"float"
,
"value"
);
OpenCLSort
<
SortTrai
t
>
sort
(
context
,
array
.
size
());
sort
.
sort
(
data
);
sort
.
sort
(
data
);
vector
<
float
>
sorted
;
vector
<
float
>
sorted
;
data
.
download
(
sorted
);
data
.
download
(
sorted
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment