Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
f3c55c28
"wrappers/python/src/vscode:/vscode.git/clone" did not exist on "529bf0f2f11bb352eff7b0bcdecb095801da1bdb"
Commit
f3c55c28
authored
May 03, 2018
by
peastman
Browse files
Minor optimizations
parent
72def6fb
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
87 additions
and
16 deletions
+87
-16
platforms/cuda/include/CudaSort.h
platforms/cuda/include/CudaSort.h
+1
-1
platforms/cuda/src/CudaSort.cpp
platforms/cuda/src/CudaSort.cpp
+13
-5
platforms/cuda/src/kernels/findInteractingBlocks.cu
platforms/cuda/src/kernels/findInteractingBlocks.cu
+1
-1
platforms/cuda/src/kernels/sort.cu
platforms/cuda/src/kernels/sort.cu
+27
-0
platforms/opencl/include/OpenCLSort.h
platforms/opencl/include/OpenCLSort.h
+1
-1
platforms/opencl/src/OpenCLSort.cpp
platforms/opencl/src/OpenCLSort.cpp
+18
-8
platforms/opencl/src/kernels/sort.cl
platforms/opencl/src/kernels/sort.cl
+26
-0
No files found.
platforms/cuda/include/CudaSort.h
View file @
f3c55c28
...
...
@@ -92,7 +92,7 @@ private:
CudaArray
offsetInBucket
;
CudaArray
bucketOffset
;
CudaArray
buckets
;
CUfunction
shortListKernel
,
computeRangeKernel
,
assignElementsKernel
,
computeBucketPositionsKernel
,
copyToBucketsKernel
,
sortBucketsKernel
;
CUfunction
shortListKernel
,
shortList2Kernel
,
computeRangeKernel
,
assignElementsKernel
,
computeBucketPositionsKernel
,
copyToBucketsKernel
,
sortBucketsKernel
;
unsigned
int
dataLength
,
rangeKernelSize
,
positionsKernelSize
,
sortKernelSize
;
bool
isShortList
;
};
...
...
platforms/cuda/src/CudaSort.cpp
View file @
f3c55c28
...
...
@@ -43,6 +43,7 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
replacements
[
"MAX_VALUE"
]
=
trait
->
getMaxValue
();
CUmodule
module
=
context
.
createModule
(
context
.
replaceStrings
(
CudaKernelSources
::
sort
,
replacements
));
shortListKernel
=
context
.
getKernel
(
module
,
"sortShortList"
);
shortList2Kernel
=
context
.
getKernel
(
module
,
"sortShortList2"
);
computeRangeKernel
=
context
.
getKernel
(
module
,
"computeRange"
);
assignElementsKernel
=
context
.
getKernel
(
module
,
"assignElementsToBuckets"
);
computeBucketPositionsKernel
=
context
.
getKernel
(
module
,
"computeBucketPositions"
);
...
...
@@ -56,7 +57,7 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
int
maxSharedMem
;
cuDeviceGetAttribute
(
&
maxSharedMem
,
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK
,
context
.
getDevice
());
unsigned
int
maxLocalBuffer
=
(
unsigned
int
)
((
maxSharedMem
/
trait
->
getDataSize
())
/
2
);
isShortList
=
(
length
<=
maxLocalBuffer
);
isShortList
=
(
length
<=
maxLocalBuffer
||
length
<=
CudaContext
::
ThreadBlockSize
*
context
.
getNumThreadBlocks
()
);
for
(
rangeKernelSize
=
1
;
rangeKernelSize
*
2
<=
maxBlockSize
;
rangeKernelSize
*=
2
)
;
positionsKernelSize
=
rangeKernelSize
;
...
...
@@ -79,8 +80,8 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
bucketOffset
.
initialize
<
uint1
>
(
context
,
numBuckets
,
"bucketOffset"
);
bucketOfElement
.
initialize
<
uint1
>
(
context
,
length
,
"bucketOfElement"
);
offsetInBucket
.
initialize
<
uint1
>
(
context
,
length
,
"offsetInBucket"
);
buckets
.
initialize
(
context
,
length
,
trait
->
getDataSize
(),
"buckets"
);
}
buckets
.
initialize
(
context
,
length
,
trait
->
getDataSize
(),
"buckets"
);
}
CudaSort
::~
CudaSort
()
{
...
...
@@ -93,10 +94,17 @@ void CudaSort::sort(CudaArray& data) {
if
(
data
.
getSize
()
==
0
)
return
;
if
(
isShortList
)
{
// We can use a simpler sort kernel that does the entire operation
at once in local memory
.
// We can use a simpler sort kernel that does the entire operation
in one kernel
.
void
*
sortArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
dataLength
};
context
.
executeKernel
(
shortListKernel
,
sortArgs
,
sortKernelSize
,
sortKernelSize
,
dataLength
*
trait
->
getDataSize
());
if
(
dataLength
<=
CudaContext
::
ThreadBlockSize
*
context
.
getNumThreadBlocks
())
{
void
*
sortArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
buckets
.
getDevicePointer
(),
&
dataLength
};
context
.
executeKernel
(
shortList2Kernel
,
sortArgs
,
dataLength
);
buckets
.
copyTo
(
data
);
}
else
{
void
*
sortArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
dataLength
};
context
.
executeKernel
(
shortListKernel
,
sortArgs
,
sortKernelSize
,
sortKernelSize
,
dataLength
*
trait
->
getDataSize
());
}
}
else
{
// Compute the range of data values.
...
...
platforms/cuda/src/kernels/findInteractingBlocks.cu
View file @
f3c55c28
...
...
@@ -195,8 +195,8 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
__shared__
int
warpExclusions
[
MAX_EXCLUSIONS
*
(
GROUP_SIZE
/
32
)];
__shared__
real3
posBuffer
[
GROUP_SIZE
];
__shared__
volatile
int
workgroupTileIndex
[
GROUP_SIZE
/
32
];
__shared__
int
sumBuffer
[
GROUP_SIZE
];
__shared__
int
worksgroupPairStartIndex
[
GROUP_SIZE
/
32
];
int
*
sumBuffer
=
(
int
*
)
posBuffer
;
// Reuse the same buffer to save memory
int
*
buffer
=
workgroupBuffer
+
BUFFER_SIZE
*
(
warpStart
/
32
);
int
*
flagsBuffer
=
workgroupFlagsBuffer
+
BUFFER_SIZE
*
(
warpStart
/
32
);
int
*
exclusionsForX
=
warpExclusions
+
MAX_EXCLUSIONS
*
(
warpStart
/
32
);
...
...
platforms/cuda/src/kernels/sort.cu
View file @
f3c55c28
...
...
@@ -46,6 +46,33 @@ __global__ void sortShortList(DATA_TYPE* __restrict__ data, unsigned int length)
data
[
index
]
=
dataBuffer
[
index
];
}
/**
* An alternate kernel for sorting short lists. In this version every thread does a full
* scan through the data to select the destination for one element. This involves more
* work, but also parallelizes much better.
*/
__global__
void
sortShortList2
(
const
DATA_TYPE
*
__restrict__
dataIn
,
DATA_TYPE
*
__restrict__
dataOut
,
unsigned
int
length
)
{
__shared__
DATA_TYPE
dataBuffer
[
64
];
int
globalId
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
DATA_TYPE
value
=
dataIn
[
globalId
<
length
?
globalId
:
0
];
KEY_TYPE
key
=
getValue
(
value
);
int
count
=
0
;
for
(
int
blockStart
=
0
;
blockStart
<
length
;
blockStart
+=
blockDim
.
x
)
{
int
numInBlock
=
min
(
blockDim
.
x
,
length
-
blockStart
);
__syncthreads
();
if
(
threadIdx
.
x
<
numInBlock
)
dataBuffer
[
threadIdx
.
x
]
=
dataIn
[
blockStart
+
threadIdx
.
x
];
__syncthreads
();
for
(
int
i
=
0
;
i
<
numInBlock
;
i
++
)
{
KEY_TYPE
otherKey
=
getValue
(
dataBuffer
[
i
]);
if
(
otherKey
<
key
||
(
otherKey
==
key
&&
blockStart
+
i
<
globalId
))
count
++
;
}
}
if
(
globalId
<
length
)
dataOut
[
count
]
=
value
;
}
/**
* Calculate the minimum and maximum value in the array to be sorted. This kernel
* is executed as a single work group.
...
...
platforms/opencl/include/OpenCLSort.h
View file @
f3c55c28
...
...
@@ -92,7 +92,7 @@ private:
OpenCLArray
offsetInBucket
;
OpenCLArray
bucketOffset
;
OpenCLArray
buckets
;
cl
::
Kernel
shortListKernel
,
computeRangeKernel
,
assignElementsKernel
,
computeBucketPositionsKernel
,
copyToBucketsKernel
,
sortBucketsKernel
;
cl
::
Kernel
shortListKernel
,
shortList2Kernel
,
computeRangeKernel
,
assignElementsKernel
,
computeBucketPositionsKernel
,
copyToBucketsKernel
,
sortBucketsKernel
;
unsigned
int
dataLength
,
rangeKernelSize
,
positionsKernelSize
,
sortKernelSize
;
bool
isShortList
;
};
...
...
platforms/opencl/src/OpenCLSort.cpp
View file @
f3c55c28
...
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-201
5
Stanford University and the Authors. *
* Portions copyright (c) 2010-201
8
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -48,6 +48,7 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
replacements
[
"MAX_VALUE"
]
=
trait
->
getMaxValue
();
cl
::
Program
program
=
context
.
createProgram
(
context
.
replaceStrings
(
OpenCLKernelSources
::
sort
,
replacements
));
shortListKernel
=
cl
::
Kernel
(
program
,
"sortShortList"
);
shortList2Kernel
=
cl
::
Kernel
(
program
,
"sortShortList2"
);
computeRangeKernel
=
cl
::
Kernel
(
program
,
"computeRange"
);
assignElementsKernel
=
cl
::
Kernel
(
program
,
"assignElementsToBuckets"
);
computeBucketPositionsKernel
=
cl
::
Kernel
(
program
,
"computeBucketPositions"
);
...
...
@@ -66,7 +67,7 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
// But AMD's OpenCL returns an inappropriately small value for it that is much shorter than the actual
// maximum, so including the check hurts performance. For the moment I'm going to just comment it out.
// If we officially support Qualcomm in the future, we'll need to do something better.
isShortList
=
(
length
<=
maxLocalBuffer
/* && length < maxShortListSize*/
);
isShortList
=
(
length
<=
maxLocalBuffer
/* && length < maxShortListSize*/
||
length
<=
OpenCLContext
::
ThreadBlockSize
*
context
.
getNumThreadBlocks
()
);
for
(
rangeKernelSize
=
1
;
rangeKernelSize
*
2
<=
maxRangeSize
;
rangeKernelSize
*=
2
)
;
positionsKernelSize
=
std
::
min
(
rangeKernelSize
,
maxPositionsSize
);
...
...
@@ -89,8 +90,8 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
bucketOffset
.
initialize
<
cl_uint
>
(
context
,
numBuckets
,
"bucketOffset"
);
bucketOfElement
.
initialize
<
cl_uint
>
(
context
,
length
,
"bucketOfElement"
);
offsetInBucket
.
initialize
<
cl_uint
>
(
context
,
length
,
"offsetInBucket"
);
buckets
.
initialize
(
context
,
length
,
trait
->
getDataSize
(),
"buckets"
);
}
buckets
.
initialize
(
context
,
length
,
trait
->
getDataSize
(),
"buckets"
);
}
OpenCLSort
::~
OpenCLSort
()
{
...
...
@@ -103,12 +104,21 @@ void OpenCLSort::sort(OpenCLArray& data) {
if
(
data
.
getSize
()
==
0
)
return
;
if
(
isShortList
)
{
// We can use a simpler sort kernel that does the entire operation
at once in local memory
.
// We can use a simpler sort kernel that does the entire operation
in one kernel
.
shortListKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
data
.
getDeviceBuffer
());
shortListKernel
.
setArg
<
cl_uint
>
(
1
,
dataLength
);
shortListKernel
.
setArg
(
2
,
dataLength
*
trait
->
getDataSize
(),
NULL
);
context
.
executeKernel
(
shortListKernel
,
sortKernelSize
,
sortKernelSize
);
if
(
dataLength
<=
OpenCLContext
::
ThreadBlockSize
*
context
.
getNumThreadBlocks
())
{
shortList2Kernel
.
setArg
<
cl
::
Buffer
>
(
0
,
data
.
getDeviceBuffer
());
shortList2Kernel
.
setArg
<
cl
::
Buffer
>
(
1
,
buckets
.
getDeviceBuffer
());
shortList2Kernel
.
setArg
<
cl_int
>
(
2
,
dataLength
);
context
.
executeKernel
(
shortList2Kernel
,
dataLength
);
buckets
.
copyTo
(
data
);
}
else
{
shortListKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
data
.
getDeviceBuffer
());
shortListKernel
.
setArg
<
cl_uint
>
(
1
,
dataLength
);
shortListKernel
.
setArg
(
2
,
dataLength
*
trait
->
getDataSize
(),
NULL
);
context
.
executeKernel
(
shortListKernel
,
sortKernelSize
,
sortKernelSize
);
}
}
else
{
// Compute the range of data values.
...
...
platforms/opencl/src/kernels/sort.cl
View file @
f3c55c28
...
...
@@ -45,6 +45,32 @@ __kernel void sortShortList(__global DATA_TYPE* restrict data, uint length, __lo
data[index]
=
dataBuffer[index]
;
}
/**
*
An
alternate
kernel
for
sorting
short
lists.
In
this
version
every
thread
does
a
full
*
scan
through
the
data
to
select
the
destination
for
one
element.
This
involves
more
*
work,
but
also
parallelizes
much
better.
*/
__kernel
void
sortShortList2
(
__global
const
DATA_TYPE*
restrict
dataIn,
__global
DATA_TYPE*
restrict
dataOut,
int
length
)
{
__local
DATA_TYPE
dataBuffer[64]
;
DATA_TYPE
value
=
dataIn[get_global_id
(
0
)
<
length
?
get_global_id
(
0
)
:
0]
;
KEY_TYPE
key
=
getValue
(
value
)
;
int
count
=
0
;
for
(
int
blockStart
=
0
; blockStart < length; blockStart += get_local_size(0)) {
int
numInBlock
=
min
((
int
)
get_local_size
(
0
)
,
length-blockStart
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
numInBlock
)
dataBuffer[get_local_id
(
0
)
]
=
dataIn[blockStart+get_local_id
(
0
)
]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
int
i
=
0
; i < numInBlock; i++) {
KEY_TYPE
otherKey
=
getValue
(
dataBuffer[i]
)
;
if
(
otherKey
<
key
||
(
otherKey
==
key
&&
blockStart+i
<
get_global_id
(
0
)))
count++
;
}
}
if
(
get_global_id
(
0
)
<
length
)
dataOut[count]
=
value
;
}
/**
*
Calculate
the
minimum
and
maximum
value
in
the
array
to
be
sorted.
This
kernel
*
is
executed
as
a
single
work
group.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment