Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
6c542539
Commit
6c542539
authored
Jan 20, 2010
by
Peter Eastman
Browse files
Created OpenCL sorting algorithm
parent
07073906
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
457 additions
and
0 deletions
+457
-0
platforms/opencl/src/OpenCLSort.h
platforms/opencl/src/OpenCLSort.h
+182
-0
platforms/opencl/src/kernels/sort.cl
platforms/opencl/src/kernels/sort.cl
+169
-0
platforms/opencl/tests/TestOpenCLSort.cpp
platforms/opencl/tests/TestOpenCLSort.cpp
+106
-0
No files found.
platforms/opencl/src/OpenCLSort.h
0 → 100644
View file @
6c542539
#ifndef __OPENMM_OPENCLSORT_H__
#define __OPENMM_OPENCLSORT_H__
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "OpenCLArray.h"
#include "OpenCLKernelSources.h"
#include "openmm/internal/windowsExport.h"
#include <map>
namespace
OpenMM
{
/**
* This class sorts arrays of values. It supports any type of values, not just scalars,
* so long as an appropriate sorting key can be defined by which to sort them.
*
* The algorithm used is a bucket sort, followed by a bitonic sort within each bucket
* (in local memory when possible, in global memory otherwise). This is similar to
* the algorithm described in
*
* Shifu Chen, Jing Qin, Yongming Xie, Junping Zhao, and Pheng-Ann Heng. "An Efficient
* Sorting Algorithm with CUDA" Journal of the Chinese Institute of Engineers, 32(7),
* pp. 915-921 (2009)
*
* but with many modifications and simplifications. In particular, this algorithm
* involves much less communication between host and device, which is critical to get
* good performance with the array sizes we typically work with (10,000 to 100,000
* elements).
*/
template
<
class
TYPE
>
class
OPENMM_EXPORT
OpenCLSort
{
public:
/**
* Create an OpenCLSort object for sorting data of a particular type.
*
* @param context the context in which to perform calculations
* @param length the length of the arrays this object will be used to sort
* @param typeName the name of the data type being sorting (e.g. "float")
* @param sortKey an expression that returns the value by which the variable "value" should be sorted.
* For primitive types, this will simply be "value".
*/
OpenCLSort
(
OpenCLContext
&
context
,
int
length
,
const
std
::
string
&
typeName
,
const
std
::
string
&
sortKey
)
:
context
(
context
),
dataRange
(
NULL
),
bucketOfElement
(
NULL
),
offsetInBucket
(
NULL
),
bucketOffset
(
NULL
),
buckets
(
NULL
)
{
// Create kernels.
std
::
map
<
std
::
string
,
std
::
string
>
replacements
;
replacements
[
"TYPE"
]
=
typeName
;
replacements
[
"SORT_KEY"
]
=
sortKey
;
cl
::
Program
program
=
context
.
createProgram
(
context
.
replaceStrings
(
OpenCLKernelSources
::
sort
,
replacements
));
computeRangeKernel
=
cl
::
Kernel
(
program
,
"computeRange"
);
assignElementsKernel
=
cl
::
Kernel
(
program
,
"assignElementsToBuckets"
);
computeBucketPositionsKernel
=
cl
::
Kernel
(
program
,
"computeBucketPositions"
);
copyToBucketsKernel
=
cl
::
Kernel
(
program
,
"copyDataToBuckets"
);
sortBucketsKernel
=
cl
::
Kernel
(
program
,
"sortBuckets"
);
// Work out the work group sizes for various kernels.
int
maxGroupSize
=
context
.
getDevice
().
getInfo
<
CL_DEVICE_MAX_WORK_GROUP_SIZE
>
();
for
(
rangeKernelSize
=
1
;
rangeKernelSize
*
2
<=
maxGroupSize
;
rangeKernelSize
*=
2
)
;
positionsKernelSize
=
rangeKernelSize
;
sortKernelSize
=
rangeKernelSize
/
2
;
if
(
rangeKernelSize
>
length
)
rangeKernelSize
=
length
;
int
maxLocalBuffer
=
(
context
.
getDevice
().
getInfo
<
CL_DEVICE_LOCAL_MEM_SIZE
>
()
/
sizeof
(
TYPE
))
/
2
;
if
(
sortKernelSize
>
maxLocalBuffer
)
sortKernelSize
=
maxLocalBuffer
;
int
targetBucketSize
=
sortKernelSize
/
2
;
int
numBuckets
=
length
/
targetBucketSize
;
if
(
positionsKernelSize
>
numBuckets
)
positionsKernelSize
=
numBuckets
;
// Create workspace arrays.
dataRange
=
new
OpenCLArray
<
mm_float2
>
(
context
,
1
,
"sortDataRange"
);
bucketOffset
=
new
OpenCLArray
<
cl_int
>
(
context
,
numBuckets
,
"bucketOffset"
);
bucketOfElement
=
new
OpenCLArray
<
cl_int
>
(
context
,
length
,
"bucketOfElement"
);
offsetInBucket
=
new
OpenCLArray
<
cl_int
>
(
context
,
length
,
"offsetInBucket"
);
buckets
=
new
OpenCLArray
<
TYPE
>
(
context
,
length
,
"buckets"
);
}
~
OpenCLSort
()
{
if
(
dataRange
!=
NULL
)
delete
dataRange
;
if
(
bucketOfElement
!=
NULL
)
delete
bucketOfElement
;
if
(
offsetInBucket
!=
NULL
)
delete
offsetInBucket
;
if
(
bucketOffset
!=
NULL
)
delete
bucketOffset
;
if
(
buckets
!=
NULL
)
delete
buckets
;
}
/**
* Sort an array.
*/
void
sort
(
OpenCLArray
<
TYPE
>&
data
)
{
// Compute the range of data values.
computeRangeKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
data
.
getDeviceBuffer
());
computeRangeKernel
.
setArg
<
cl_int
>
(
1
,
data
.
getSize
());
computeRangeKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
dataRange
->
getDeviceBuffer
());
computeRangeKernel
.
setArg
(
3
,
rangeKernelSize
*
sizeof
(
cl_float
),
NULL
);
context
.
executeKernel
(
computeRangeKernel
,
rangeKernelSize
,
rangeKernelSize
);
// Assign array elements to buckets.
int
numBuckets
=
bucketOffset
->
getSize
();
context
.
clearBuffer
(
bucketOffset
->
getDeviceBuffer
(),
numBuckets
);
assignElementsKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
data
.
getDeviceBuffer
());
assignElementsKernel
.
setArg
<
cl_int
>
(
1
,
data
.
getSize
());
assignElementsKernel
.
setArg
<
cl_int
>
(
2
,
numBuckets
);
assignElementsKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
dataRange
->
getDeviceBuffer
());
assignElementsKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
bucketOffset
->
getDeviceBuffer
());
assignElementsKernel
.
setArg
<
cl
::
Buffer
>
(
5
,
bucketOfElement
->
getDeviceBuffer
());
assignElementsKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
offsetInBucket
->
getDeviceBuffer
());
context
.
executeKernel
(
assignElementsKernel
,
data
.
getSize
());
// Compute the position of each bucket.
computeBucketPositionsKernel
.
setArg
<
cl_int
>
(
0
,
numBuckets
);
computeBucketPositionsKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
bucketOffset
->
getDeviceBuffer
());
computeBucketPositionsKernel
.
setArg
(
2
,
positionsKernelSize
*
sizeof
(
cl_int
),
NULL
);
context
.
executeKernel
(
computeBucketPositionsKernel
,
positionsKernelSize
,
positionsKernelSize
);
// Copy the data into the buckets.
copyToBucketsKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
data
.
getDeviceBuffer
());
copyToBucketsKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
buckets
->
getDeviceBuffer
());
copyToBucketsKernel
.
setArg
<
cl_int
>
(
2
,
data
.
getSize
());
copyToBucketsKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
bucketOffset
->
getDeviceBuffer
());
copyToBucketsKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
bucketOfElement
->
getDeviceBuffer
());
copyToBucketsKernel
.
setArg
<
cl
::
Buffer
>
(
5
,
offsetInBucket
->
getDeviceBuffer
());
context
.
executeKernel
(
copyToBucketsKernel
,
data
.
getSize
());
// Sort each bucket.
sortBucketsKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
data
.
getDeviceBuffer
());
sortBucketsKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
buckets
->
getDeviceBuffer
());
sortBucketsKernel
.
setArg
<
cl_int
>
(
2
,
numBuckets
);
sortBucketsKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
bucketOffset
->
getDeviceBuffer
());
sortBucketsKernel
.
setArg
(
4
,
sortKernelSize
*
sizeof
(
mm_float2
),
NULL
);
context
.
executeKernel
(
sortBucketsKernel
,
((
data
.
getSize
()
+
sortKernelSize
-
1
)
/
sortKernelSize
)
*
sortKernelSize
,
sortKernelSize
);
}
private:
OpenCLContext
&
context
;
OpenCLArray
<
mm_float2
>*
dataRange
;
OpenCLArray
<
cl_int
>*
bucketOfElement
;
OpenCLArray
<
cl_int
>*
offsetInBucket
;
OpenCLArray
<
cl_int
>*
bucketOffset
;
OpenCLArray
<
TYPE
>*
buckets
;
cl
::
Kernel
computeRangeKernel
,
assignElementsKernel
,
computeBucketPositionsKernel
,
copyToBucketsKernel
,
sortBucketsKernel
;
int
rangeKernelSize
,
positionsKernelSize
,
sortKernelSize
;
};
}
// namespace OpenMM
#endif // __OPENMM_OPENCLSORT_H__
\ No newline at end of file
platforms/opencl/src/kernels/sort.cl
0 → 100644
View file @
6c542539
#
pragma
OPENCL
EXTENSION
cl_khr_global_int32_base_atomics
:
enable
float
getValue
(
TYPE
value
)
{
return
SORT_KEY
;
}
/**
*
Calculate
the
minimum
and
maximum
value
in
the
array
to
be
sorted.
This
kernel
*
is
executed
as
a
single
work
group.
*/
__kernel
void
computeRange
(
__global
TYPE*
data,
int
length,
__global
float2*
range,
__local
float*
buffer
)
{
float
minimum
=
MAXFLOAT
;
float
maximum
=
-MAXFLOAT
;
//
Each
thread
calculates
the
range
of
a
subset
of
values.
for
(
int
index
=
get_local_id
(
0
)
; index < length; index += get_local_size(0)) {
float
value
=
getValue
(
data[index]
)
;
minimum
=
min
(
minimum,
value
)
;
maximum
=
max
(
maximum,
value
)
;
}
//
Now
reduce
them.
buffer[get_local_id
(
0
)
]
=
minimum
;
for
(
int
step
=
1
; step < get_local_size(0); step *= 2) {
if
(
get_local_id
(
0
)
+step
<
get_local_size
(
0
)
&&
get_local_id
(
0
)
%
(
2*step
)
==
0
)
buffer[get_local_id
(
0
)
]
=
min
(
buffer[get_local_id
(
0
)
],
buffer[get_local_id
(
0
)
+step]
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
minimum
=
buffer[0]
;
buffer[get_local_id
(
0
)
]
=
maximum
;
for
(
int
step
=
1
; step < get_local_size(0); step *= 2) {
if
(
get_local_id
(
0
)
+step
<
get_local_size
(
0
)
&&
get_local_id
(
0
)
%
(
2*step
)
==
0
)
buffer[get_local_id
(
0
)
]
=
max
(
buffer[get_local_id
(
0
)
],
buffer[get_local_id
(
0
)
+step]
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
maximum
=
buffer[0]
;
if
(
get_local_id
(
0
)
==
0
)
range[0]
=
(
float2
)
(
minimum,
maximum
)
;
}
/**
*
Assign
elements
to
buckets.
*/
__kernel
void
assignElementsToBuckets
(
__global
TYPE*
data,
int
length,
int
numBuckets,
__global
float2*
range,
__global
int*
bucketOffset,
__global
int*
bucketOfElement,
__global
int*
offsetInBucket
)
{
float2
dataRange
=
range[0]
;
float
minValue
=
dataRange.x
;
float
maxValue
=
dataRange.y
;
float
bucketWidth
=
(
maxValue-minValue
)
/numBuckets
;
for
(
int
index
=
get_global_id
(
0
)
; index < length; index += get_global_size(0)) {
TYPE
element
=
data[index]
;
float
value
=
getValue
(
element
)
;
int
bucketIndex
=
min
((
int
)
((
value-minValue
)
/bucketWidth
)
,
numBuckets-1
)
;
offsetInBucket[index]
=
atom_inc
(
&bucketOffset[bucketIndex]
)
;
bucketOfElement[index]
=
bucketIndex
;
}
}
/**
*
Sum
the
bucket
sizes
to
compute
the
start
position
of
each
bucket.
This
kernel
*
is
executed
as
a
single
work
group.
*/
__kernel
void
computeBucketPositions
(
int
numBuckets,
__global
int*
bucketOffset,
__local
int*
buffer
)
{
int
globalOffset
=
0
;
for
(
int
startBucket
=
0
; startBucket < numBuckets; startBucket += get_local_size(0)) {
//
Load
the
bucket
sizes
into
local
memory.
int
globalIndex
=
startBucket+get_local_id
(
0
)
;
buffer[get_local_id
(
0
)
]
=
(
globalIndex
<
numBuckets
?
bucketOffset[globalIndex]
:
0
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
Perform
a
parallel
prefix
sum.
for
(
int
step
=
1
; step < get_local_size(0); step *= 2) {
if
(
get_local_id
(
0
)
>=
step
)
buffer[get_local_id
(
0
)
]
+=
buffer[get_local_id
(
0
)
-step]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
//
Write
the
results
back
to
global
memory.
if
(
globalIndex
<
numBuckets
)
bucketOffset[globalIndex]
=
buffer[get_local_id
(
0
)
]+globalOffset
;
globalOffset
+=
buffer[get_local_size
(
0
)
-1]
;
}
}
/**
*
Copy
the
input
data
into
the
buckets
for
sorting.
*/
__kernel
void
copyDataToBuckets
(
__global
TYPE*
data,
__global
TYPE*
buckets,
int
length,
__global
int*
bucketOffset,
__global
int*
bucketOfElement,
__global
int*
offsetInBucket
)
{
for
(
int
index
=
get_global_id
(
0
)
; index < length; index += get_global_size(0)) {
TYPE
element
=
data[index]
;
int
bucketIndex
=
bucketOfElement[index]
;
int
offset
=
(
bucketIndex
==
0
?
0
:
bucketOffset[bucketIndex-1]
)
;
buckets[offset+offsetInBucket[index]]
=
element
;
}
}
/**
*
Sort
the
data
in
each
bucket.
*/
__kernel
void
sortBuckets
(
__global
TYPE*
data,
__global
TYPE*
buckets,
int
numBuckets,
__global
int*
bucketOffset,
__local
TYPE*
buffer
)
{
for
(
int
index
=
get_group_id
(
0
)
; index < numBuckets; index += get_num_groups(0)) {
int
startIndex
=
(
index
==
0
?
0
:
bucketOffset[index-1]
)
;
int
endIndex
=
bucketOffset[index]
;
int
length
=
endIndex-startIndex
;
if
(
length
<=
get_local_size
(
0
))
{
//
Load
the
data
into
local
memory.
buffer[get_local_id
(
0
)
]
=
(
get_local_id
(
0
)
<
length
?
buckets[startIndex+get_local_id
(
0
)
]
:
MAXFLOAT
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
Perform
a
bitonic
sort
in
local
memory.
for
(
int
k
=
2
; k <= get_local_size(0); k *= 2) {
for
(
int
j
=
k/2
; j > 0; j /= 2) {
int
ixj
=
get_local_id
(
0
)
^j
;
if
(
ixj
>
get_local_id
(
0
))
{
if
(((
get_local_id
(
0
)
&k
)
==
0
&&
getValue
(
buffer[get_local_id
(
0
)
]
)
>
getValue
(
buffer[ixj]
))
|
|
((get_local_id(0)&k) != 0 && getValue(buffer[get_local_id(0)]) < getValue(buffer[ixj]))) {
TYPE temp = buffer[get_local_id(0)];
buffer[get_local_id(0)] = buffer[ixj];
buffer[ixj] = temp;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
// Write the data to the sorted array.
if (get_local_id(0) < length)
data[startIndex+get_local_id(0)] = buffer[get_local_id(0)];
}
else {
// Copy the bucket data over to the output array.
for (int i = get_local_id(0); i < length; i += get_local_size(0))
data[startIndex+i] = buckets[startIndex+i];
barrier(CLK_GLOBAL_MEM_FENCE);
// Perform a bitonic sort in global memory.
for (int k = 2; k < 2*length; k *= 2) {
for (int j = k/2; j > 0; j /= 2) {
for (int i = get_local_id(0); i < length; i += get_local_size(0)) {
int ixj = i^j;
if (ixj > i && ixj < length) {
TYPE value1 = data[startIndex+i];
TYPE value2 = data[startIndex+ixj];
bool ascending = ((i&k) == 0);
for (int mask = k*2; mask < 2*length; mask *= 2)
ascending = ((i&mask) == 0 ? !ascending : ascending);
if ((ascending && getValue(value1) > getValue(value2)) |
|
(
!ascending
&&
getValue
(
value1
)
<
getValue
(
value2
)))
{
data[startIndex+i]
=
value2
;
data[startIndex+ixj]
=
value1
;
}
}
}
barrier
(
CLK_GLOBAL_MEM_FENCE
)
;
}
}
}
}
}
\ No newline at end of file
platforms/opencl/tests/TestOpenCLSort.cpp
0 → 100644
View file @
6c542539
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2009 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
/**
* This tests the OpenCL implementation of sorting.
*/
#include "../../../tests/AssertionUtilities.h"
#include "../src/OpenCLArray.h"
#include "../src/OpenCLContext.h"
#include "../src/OpenCLSort.h"
#include "../src/sfmt/SFMT.h"
#include "openmm/System.h"
#include <iostream>
#include <cmath>
#include <set>
using
namespace
OpenMM
;
using
namespace
std
;
void
verifySorting
(
vector
<
float
>
array
)
{
// Sort the array.
System
system
;
system
.
addParticle
(
0.0
);
OpenCLContext
context
(
1
,
-
1
);
context
.
initialize
(
system
);
OpenCLArray
<
float
>
data
(
context
,
array
.
size
(),
"sortData"
);
data
.
upload
(
array
);
OpenCLSort
<
float
>
sort
(
context
,
array
.
size
(),
"float"
,
"value"
);
sort
.
sort
(
data
);
vector
<
float
>
sorted
;
data
.
download
(
sorted
);
// Verify that it is in sorted order.
for
(
int
i
=
1
;
i
<
(
int
)
sorted
.
size
();
i
++
)
ASSERT
(
sorted
[
i
-
1
]
<=
sorted
[
i
]);
// Make sure the sorted array contains the same values as the original one.
multiset
<
float
>
elements1
(
array
.
begin
(),
array
.
end
());
multiset
<
float
>
elements2
(
sorted
.
begin
(),
sorted
.
end
());
ASSERT
(
elements1
==
elements2
);
}
void
testUniformValues
()
{
init_gen_rand
(
0
);
vector
<
float
>
array
(
10000
);
for
(
int
i
=
0
;
i
<
(
int
)
array
.
size
();
i
++
)
array
[
i
]
=
(
float
)
genrand_real2
();
verifySorting
(
array
);
}
void
testLogValues
()
{
init_gen_rand
(
0
);
vector
<
float
>
array
(
10000
);
for
(
int
i
=
0
;
i
<
(
int
)
array
.
size
();
i
++
)
array
[
i
]
=
(
float
)
log
(
genrand_real2
());
verifySorting
(
array
);
}
int
main
()
{
try
{
testUniformValues
();
testLogValues
();
}
catch
(
const
exception
&
e
)
{
cout
<<
"exception: "
<<
e
.
what
()
<<
endl
;
return
1
;
}
cout
<<
"Done"
<<
endl
;
return
0
;
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment