Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
143fe36d
Unverified
Commit
143fe36d
authored
May 04, 2018
by
peastman
Committed by
GitHub
May 04, 2018
Browse files
Merge pull request #2060 from peastman/opt
Minor optimizations
parents
38e14267
f3c55c28
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
87 additions
and
16 deletions
+87
-16
platforms/cuda/include/CudaSort.h
platforms/cuda/include/CudaSort.h
+1
-1
platforms/cuda/src/CudaSort.cpp
platforms/cuda/src/CudaSort.cpp
+13
-5
platforms/cuda/src/kernels/findInteractingBlocks.cu
platforms/cuda/src/kernels/findInteractingBlocks.cu
+1
-1
platforms/cuda/src/kernels/sort.cu
platforms/cuda/src/kernels/sort.cu
+27
-0
platforms/opencl/include/OpenCLSort.h
platforms/opencl/include/OpenCLSort.h
+1
-1
platforms/opencl/src/OpenCLSort.cpp
platforms/opencl/src/OpenCLSort.cpp
+18
-8
platforms/opencl/src/kernels/sort.cl
platforms/opencl/src/kernels/sort.cl
+26
-0
No files found.
platforms/cuda/include/CudaSort.h
View file @
143fe36d
...
@@ -92,7 +92,7 @@ private:
...
@@ -92,7 +92,7 @@ private:
CudaArray
offsetInBucket
;
CudaArray
offsetInBucket
;
CudaArray
bucketOffset
;
CudaArray
bucketOffset
;
CudaArray
buckets
;
CudaArray
buckets
;
CUfunction
shortListKernel
,
computeRangeKernel
,
assignElementsKernel
,
computeBucketPositionsKernel
,
copyToBucketsKernel
,
sortBucketsKernel
;
CUfunction
shortListKernel
,
shortList2Kernel
,
computeRangeKernel
,
assignElementsKernel
,
computeBucketPositionsKernel
,
copyToBucketsKernel
,
sortBucketsKernel
;
unsigned
int
dataLength
,
rangeKernelSize
,
positionsKernelSize
,
sortKernelSize
;
unsigned
int
dataLength
,
rangeKernelSize
,
positionsKernelSize
,
sortKernelSize
;
bool
isShortList
;
bool
isShortList
;
};
};
...
...
platforms/cuda/src/CudaSort.cpp
View file @
143fe36d
...
@@ -43,6 +43,7 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
...
@@ -43,6 +43,7 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
replacements
[
"MAX_VALUE"
]
=
trait
->
getMaxValue
();
replacements
[
"MAX_VALUE"
]
=
trait
->
getMaxValue
();
CUmodule
module
=
context
.
createModule
(
context
.
replaceStrings
(
CudaKernelSources
::
sort
,
replacements
));
CUmodule
module
=
context
.
createModule
(
context
.
replaceStrings
(
CudaKernelSources
::
sort
,
replacements
));
shortListKernel
=
context
.
getKernel
(
module
,
"sortShortList"
);
shortListKernel
=
context
.
getKernel
(
module
,
"sortShortList"
);
shortList2Kernel
=
context
.
getKernel
(
module
,
"sortShortList2"
);
computeRangeKernel
=
context
.
getKernel
(
module
,
"computeRange"
);
computeRangeKernel
=
context
.
getKernel
(
module
,
"computeRange"
);
assignElementsKernel
=
context
.
getKernel
(
module
,
"assignElementsToBuckets"
);
assignElementsKernel
=
context
.
getKernel
(
module
,
"assignElementsToBuckets"
);
computeBucketPositionsKernel
=
context
.
getKernel
(
module
,
"computeBucketPositions"
);
computeBucketPositionsKernel
=
context
.
getKernel
(
module
,
"computeBucketPositions"
);
...
@@ -56,7 +57,7 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
...
@@ -56,7 +57,7 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
int
maxSharedMem
;
int
maxSharedMem
;
cuDeviceGetAttribute
(
&
maxSharedMem
,
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK
,
context
.
getDevice
());
cuDeviceGetAttribute
(
&
maxSharedMem
,
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK
,
context
.
getDevice
());
unsigned
int
maxLocalBuffer
=
(
unsigned
int
)
((
maxSharedMem
/
trait
->
getDataSize
())
/
2
);
unsigned
int
maxLocalBuffer
=
(
unsigned
int
)
((
maxSharedMem
/
trait
->
getDataSize
())
/
2
);
isShortList
=
(
length
<=
maxLocalBuffer
);
isShortList
=
(
length
<=
maxLocalBuffer
||
length
<=
CudaContext
::
ThreadBlockSize
*
context
.
getNumThreadBlocks
()
);
for
(
rangeKernelSize
=
1
;
rangeKernelSize
*
2
<=
maxBlockSize
;
rangeKernelSize
*=
2
)
for
(
rangeKernelSize
=
1
;
rangeKernelSize
*
2
<=
maxBlockSize
;
rangeKernelSize
*=
2
)
;
;
positionsKernelSize
=
rangeKernelSize
;
positionsKernelSize
=
rangeKernelSize
;
...
@@ -79,8 +80,8 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
...
@@ -79,8 +80,8 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
bucketOffset
.
initialize
<
uint1
>
(
context
,
numBuckets
,
"bucketOffset"
);
bucketOffset
.
initialize
<
uint1
>
(
context
,
numBuckets
,
"bucketOffset"
);
bucketOfElement
.
initialize
<
uint1
>
(
context
,
length
,
"bucketOfElement"
);
bucketOfElement
.
initialize
<
uint1
>
(
context
,
length
,
"bucketOfElement"
);
offsetInBucket
.
initialize
<
uint1
>
(
context
,
length
,
"offsetInBucket"
);
offsetInBucket
.
initialize
<
uint1
>
(
context
,
length
,
"offsetInBucket"
);
buckets
.
initialize
(
context
,
length
,
trait
->
getDataSize
(),
"buckets"
);
}
}
buckets
.
initialize
(
context
,
length
,
trait
->
getDataSize
(),
"buckets"
);
}
}
CudaSort
::~
CudaSort
()
{
CudaSort
::~
CudaSort
()
{
...
@@ -93,10 +94,17 @@ void CudaSort::sort(CudaArray& data) {
...
@@ -93,10 +94,17 @@ void CudaSort::sort(CudaArray& data) {
if
(
data
.
getSize
()
==
0
)
if
(
data
.
getSize
()
==
0
)
return
;
return
;
if
(
isShortList
)
{
if
(
isShortList
)
{
// We can use a simpler sort kernel that does the entire operation
at once in local memory
.
// We can use a simpler sort kernel that does the entire operation
in one kernel
.
void
*
sortArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
dataLength
};
if
(
dataLength
<=
CudaContext
::
ThreadBlockSize
*
context
.
getNumThreadBlocks
())
{
context
.
executeKernel
(
shortListKernel
,
sortArgs
,
sortKernelSize
,
sortKernelSize
,
dataLength
*
trait
->
getDataSize
());
void
*
sortArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
buckets
.
getDevicePointer
(),
&
dataLength
};
context
.
executeKernel
(
shortList2Kernel
,
sortArgs
,
dataLength
);
buckets
.
copyTo
(
data
);
}
else
{
void
*
sortArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
dataLength
};
context
.
executeKernel
(
shortListKernel
,
sortArgs
,
sortKernelSize
,
sortKernelSize
,
dataLength
*
trait
->
getDataSize
());
}
}
}
else
{
else
{
// Compute the range of data values.
// Compute the range of data values.
...
...
platforms/cuda/src/kernels/findInteractingBlocks.cu
View file @
143fe36d
...
@@ -195,8 +195,8 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
...
@@ -195,8 +195,8 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
__shared__
int
warpExclusions
[
MAX_EXCLUSIONS
*
(
GROUP_SIZE
/
32
)];
__shared__
int
warpExclusions
[
MAX_EXCLUSIONS
*
(
GROUP_SIZE
/
32
)];
__shared__
real3
posBuffer
[
GROUP_SIZE
];
__shared__
real3
posBuffer
[
GROUP_SIZE
];
__shared__
volatile
int
workgroupTileIndex
[
GROUP_SIZE
/
32
];
__shared__
volatile
int
workgroupTileIndex
[
GROUP_SIZE
/
32
];
__shared__
int
sumBuffer
[
GROUP_SIZE
];
__shared__
int
worksgroupPairStartIndex
[
GROUP_SIZE
/
32
];
__shared__
int
worksgroupPairStartIndex
[
GROUP_SIZE
/
32
];
int
*
sumBuffer
=
(
int
*
)
posBuffer
;
// Reuse the same buffer to save memory
int
*
buffer
=
workgroupBuffer
+
BUFFER_SIZE
*
(
warpStart
/
32
);
int
*
buffer
=
workgroupBuffer
+
BUFFER_SIZE
*
(
warpStart
/
32
);
int
*
flagsBuffer
=
workgroupFlagsBuffer
+
BUFFER_SIZE
*
(
warpStart
/
32
);
int
*
flagsBuffer
=
workgroupFlagsBuffer
+
BUFFER_SIZE
*
(
warpStart
/
32
);
int
*
exclusionsForX
=
warpExclusions
+
MAX_EXCLUSIONS
*
(
warpStart
/
32
);
int
*
exclusionsForX
=
warpExclusions
+
MAX_EXCLUSIONS
*
(
warpStart
/
32
);
...
...
platforms/cuda/src/kernels/sort.cu
View file @
143fe36d
...
@@ -46,6 +46,33 @@ __global__ void sortShortList(DATA_TYPE* __restrict__ data, unsigned int length)
...
@@ -46,6 +46,33 @@ __global__ void sortShortList(DATA_TYPE* __restrict__ data, unsigned int length)
data
[
index
]
=
dataBuffer
[
index
];
data
[
index
]
=
dataBuffer
[
index
];
}
}
/**
* An alternate kernel for sorting short lists. In this version every thread does a full
* scan through the data to select the destination for one element. This involves more
* work, but also parallelizes much better.
*/
__global__
void
sortShortList2
(
const
DATA_TYPE
*
__restrict__
dataIn
,
DATA_TYPE
*
__restrict__
dataOut
,
unsigned
int
length
)
{
__shared__
DATA_TYPE
dataBuffer
[
64
];
int
globalId
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
DATA_TYPE
value
=
dataIn
[
globalId
<
length
?
globalId
:
0
];
KEY_TYPE
key
=
getValue
(
value
);
int
count
=
0
;
for
(
int
blockStart
=
0
;
blockStart
<
length
;
blockStart
+=
blockDim
.
x
)
{
int
numInBlock
=
min
(
blockDim
.
x
,
length
-
blockStart
);
__syncthreads
();
if
(
threadIdx
.
x
<
numInBlock
)
dataBuffer
[
threadIdx
.
x
]
=
dataIn
[
blockStart
+
threadIdx
.
x
];
__syncthreads
();
for
(
int
i
=
0
;
i
<
numInBlock
;
i
++
)
{
KEY_TYPE
otherKey
=
getValue
(
dataBuffer
[
i
]);
if
(
otherKey
<
key
||
(
otherKey
==
key
&&
blockStart
+
i
<
globalId
))
count
++
;
}
}
if
(
globalId
<
length
)
dataOut
[
count
]
=
value
;
}
/**
/**
* Calculate the minimum and maximum value in the array to be sorted. This kernel
* Calculate the minimum and maximum value in the array to be sorted. This kernel
* is executed as a single work group.
* is executed as a single work group.
...
...
platforms/opencl/include/OpenCLSort.h
View file @
143fe36d
...
@@ -92,7 +92,7 @@ private:
...
@@ -92,7 +92,7 @@ private:
OpenCLArray
offsetInBucket
;
OpenCLArray
offsetInBucket
;
OpenCLArray
bucketOffset
;
OpenCLArray
bucketOffset
;
OpenCLArray
buckets
;
OpenCLArray
buckets
;
cl
::
Kernel
shortListKernel
,
computeRangeKernel
,
assignElementsKernel
,
computeBucketPositionsKernel
,
copyToBucketsKernel
,
sortBucketsKernel
;
cl
::
Kernel
shortListKernel
,
shortList2Kernel
,
computeRangeKernel
,
assignElementsKernel
,
computeBucketPositionsKernel
,
copyToBucketsKernel
,
sortBucketsKernel
;
unsigned
int
dataLength
,
rangeKernelSize
,
positionsKernelSize
,
sortKernelSize
;
unsigned
int
dataLength
,
rangeKernelSize
,
positionsKernelSize
,
sortKernelSize
;
bool
isShortList
;
bool
isShortList
;
};
};
...
...
platforms/opencl/src/OpenCLSort.cpp
View file @
143fe36d
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* *
* Portions copyright (c) 2010-201
5
Stanford University and the Authors. *
* Portions copyright (c) 2010-201
8
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Authors: Peter Eastman *
* Contributors: *
* Contributors: *
* *
* *
...
@@ -48,6 +48,7 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
...
@@ -48,6 +48,7 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
replacements
[
"MAX_VALUE"
]
=
trait
->
getMaxValue
();
replacements
[
"MAX_VALUE"
]
=
trait
->
getMaxValue
();
cl
::
Program
program
=
context
.
createProgram
(
context
.
replaceStrings
(
OpenCLKernelSources
::
sort
,
replacements
));
cl
::
Program
program
=
context
.
createProgram
(
context
.
replaceStrings
(
OpenCLKernelSources
::
sort
,
replacements
));
shortListKernel
=
cl
::
Kernel
(
program
,
"sortShortList"
);
shortListKernel
=
cl
::
Kernel
(
program
,
"sortShortList"
);
shortList2Kernel
=
cl
::
Kernel
(
program
,
"sortShortList2"
);
computeRangeKernel
=
cl
::
Kernel
(
program
,
"computeRange"
);
computeRangeKernel
=
cl
::
Kernel
(
program
,
"computeRange"
);
assignElementsKernel
=
cl
::
Kernel
(
program
,
"assignElementsToBuckets"
);
assignElementsKernel
=
cl
::
Kernel
(
program
,
"assignElementsToBuckets"
);
computeBucketPositionsKernel
=
cl
::
Kernel
(
program
,
"computeBucketPositions"
);
computeBucketPositionsKernel
=
cl
::
Kernel
(
program
,
"computeBucketPositions"
);
...
@@ -66,7 +67,7 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
...
@@ -66,7 +67,7 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
// But AMD's OpenCL returns an inappropriately small value for it that is much shorter than the actual
// But AMD's OpenCL returns an inappropriately small value for it that is much shorter than the actual
// maximum, so including the check hurts performance. For the moment I'm going to just comment it out.
// maximum, so including the check hurts performance. For the moment I'm going to just comment it out.
// If we officially support Qualcomm in the future, we'll need to do something better.
// If we officially support Qualcomm in the future, we'll need to do something better.
isShortList
=
(
length
<=
maxLocalBuffer
/* && length < maxShortListSize*/
);
isShortList
=
(
length
<=
maxLocalBuffer
/* && length < maxShortListSize*/
||
length
<=
OpenCLContext
::
ThreadBlockSize
*
context
.
getNumThreadBlocks
()
);
for
(
rangeKernelSize
=
1
;
rangeKernelSize
*
2
<=
maxRangeSize
;
rangeKernelSize
*=
2
)
for
(
rangeKernelSize
=
1
;
rangeKernelSize
*
2
<=
maxRangeSize
;
rangeKernelSize
*=
2
)
;
;
positionsKernelSize
=
std
::
min
(
rangeKernelSize
,
maxPositionsSize
);
positionsKernelSize
=
std
::
min
(
rangeKernelSize
,
maxPositionsSize
);
...
@@ -89,8 +90,8 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
...
@@ -89,8 +90,8 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
bucketOffset
.
initialize
<
cl_uint
>
(
context
,
numBuckets
,
"bucketOffset"
);
bucketOffset
.
initialize
<
cl_uint
>
(
context
,
numBuckets
,
"bucketOffset"
);
bucketOfElement
.
initialize
<
cl_uint
>
(
context
,
length
,
"bucketOfElement"
);
bucketOfElement
.
initialize
<
cl_uint
>
(
context
,
length
,
"bucketOfElement"
);
offsetInBucket
.
initialize
<
cl_uint
>
(
context
,
length
,
"offsetInBucket"
);
offsetInBucket
.
initialize
<
cl_uint
>
(
context
,
length
,
"offsetInBucket"
);
buckets
.
initialize
(
context
,
length
,
trait
->
getDataSize
(),
"buckets"
);
}
}
buckets
.
initialize
(
context
,
length
,
trait
->
getDataSize
(),
"buckets"
);
}
}
OpenCLSort
::~
OpenCLSort
()
{
OpenCLSort
::~
OpenCLSort
()
{
...
@@ -103,12 +104,21 @@ void OpenCLSort::sort(OpenCLArray& data) {
...
@@ -103,12 +104,21 @@ void OpenCLSort::sort(OpenCLArray& data) {
if
(
data
.
getSize
()
==
0
)
if
(
data
.
getSize
()
==
0
)
return
;
return
;
if
(
isShortList
)
{
if
(
isShortList
)
{
// We can use a simpler sort kernel that does the entire operation
at once in local memory
.
// We can use a simpler sort kernel that does the entire operation
in one kernel
.
shortListKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
data
.
getDeviceBuffer
());
if
(
dataLength
<=
OpenCLContext
::
ThreadBlockSize
*
context
.
getNumThreadBlocks
())
{
shortListKernel
.
setArg
<
cl_uint
>
(
1
,
dataLength
);
shortList2Kernel
.
setArg
<
cl
::
Buffer
>
(
0
,
data
.
getDeviceBuffer
());
shortListKernel
.
setArg
(
2
,
dataLength
*
trait
->
getDataSize
(),
NULL
);
shortList2Kernel
.
setArg
<
cl
::
Buffer
>
(
1
,
buckets
.
getDeviceBuffer
());
context
.
executeKernel
(
shortListKernel
,
sortKernelSize
,
sortKernelSize
);
shortList2Kernel
.
setArg
<
cl_int
>
(
2
,
dataLength
);
context
.
executeKernel
(
shortList2Kernel
,
dataLength
);
buckets
.
copyTo
(
data
);
}
else
{
shortListKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
data
.
getDeviceBuffer
());
shortListKernel
.
setArg
<
cl_uint
>
(
1
,
dataLength
);
shortListKernel
.
setArg
(
2
,
dataLength
*
trait
->
getDataSize
(),
NULL
);
context
.
executeKernel
(
shortListKernel
,
sortKernelSize
,
sortKernelSize
);
}
}
}
else
{
else
{
// Compute the range of data values.
// Compute the range of data values.
...
...
platforms/opencl/src/kernels/sort.cl
View file @
143fe36d
...
@@ -45,6 +45,32 @@ __kernel void sortShortList(__global DATA_TYPE* restrict data, uint length, __lo
...
@@ -45,6 +45,32 @@ __kernel void sortShortList(__global DATA_TYPE* restrict data, uint length, __lo
data[index]
=
dataBuffer[index]
;
data[index]
=
dataBuffer[index]
;
}
}
/**
*
An
alternate
kernel
for
sorting
short
lists.
In
this
version
every
thread
does
a
full
*
scan
through
the
data
to
select
the
destination
for
one
element.
This
involves
more
*
work,
but
also
parallelizes
much
better.
*/
__kernel
void
sortShortList2
(
__global
const
DATA_TYPE*
restrict
dataIn,
__global
DATA_TYPE*
restrict
dataOut,
int
length
)
{
__local
DATA_TYPE
dataBuffer[64]
;
DATA_TYPE
value
=
dataIn[get_global_id
(
0
)
<
length
?
get_global_id
(
0
)
:
0]
;
KEY_TYPE
key
=
getValue
(
value
)
;
int
count
=
0
;
for
(
int
blockStart
=
0
; blockStart < length; blockStart += get_local_size(0)) {
int
numInBlock
=
min
((
int
)
get_local_size
(
0
)
,
length-blockStart
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
numInBlock
)
dataBuffer[get_local_id
(
0
)
]
=
dataIn[blockStart+get_local_id
(
0
)
]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
int
i
=
0
; i < numInBlock; i++) {
KEY_TYPE
otherKey
=
getValue
(
dataBuffer[i]
)
;
if
(
otherKey
<
key
||
(
otherKey
==
key
&&
blockStart+i
<
get_global_id
(
0
)))
count++
;
}
}
if
(
get_global_id
(
0
)
<
length
)
dataOut[count]
=
value
;
}
/**
/**
*
Calculate
the
minimum
and
maximum
value
in
the
array
to
be
sorted.
This
kernel
*
Calculate
the
minimum
and
maximum
value
in
the
array
to
be
sorted.
This
kernel
*
is
executed
as
a
single
work
group.
*
is
executed
as
a
single
work
group.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment