Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
fce26088
"platforms/vscode:/vscode.git/clone" did not exist on "0b22cac142bbab27530bab10b5801c00b711cb3a"
Unverified
Commit
fce26088
authored
Oct 08, 2020
by
peastman
Committed by
GitHub
Oct 08, 2020
Browse files
Reduced cutoff for short list sorting kernel (#2878)
parent
94d7225b
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
9 additions
and
11 deletions
+9
-11
platforms/cuda/src/CudaSort.cpp
platforms/cuda/src/CudaSort.cpp
+1
-1
platforms/opencl/src/OpenCLSort.cpp
platforms/opencl/src/OpenCLSort.cpp
+8
-10
No files found.
platforms/cuda/src/CudaSort.cpp
View file @
fce26088
...
@@ -58,7 +58,7 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
...
@@ -58,7 +58,7 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
int
maxSharedMem
;
int
maxSharedMem
;
cuDeviceGetAttribute
(
&
maxSharedMem
,
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK
,
context
.
getDevice
());
cuDeviceGetAttribute
(
&
maxSharedMem
,
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK
,
context
.
getDevice
());
int
maxLocalBuffer
=
(
maxSharedMem
/
trait
->
getDataSize
())
/
2
;
int
maxLocalBuffer
=
(
maxSharedMem
/
trait
->
getDataSize
())
/
2
;
int
maxShortList
=
min
(
8192
,
max
(
maxLocalBuffer
,
CudaContext
::
ThreadBlockSize
*
context
.
getNumThreadBlocks
()));
int
maxShortList
=
min
(
3000
,
max
(
maxLocalBuffer
,
CudaContext
::
ThreadBlockSize
*
context
.
getNumThreadBlocks
()));
isShortList
=
(
length
<=
maxShortList
);
isShortList
=
(
length
<=
maxShortList
);
for
(
rangeKernelSize
=
1
;
rangeKernelSize
*
2
<=
maxBlockSize
;
rangeKernelSize
*=
2
)
for
(
rangeKernelSize
=
1
;
rangeKernelSize
*
2
<=
maxBlockSize
;
rangeKernelSize
*=
2
)
;
;
...
...
platforms/opencl/src/OpenCLSort.cpp
View file @
fce26088
...
@@ -63,19 +63,17 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
...
@@ -63,19 +63,17 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
unsigned
int
maxRangeSize
=
std
::
min
(
maxGroupSize
,
(
unsigned
int
)
computeRangeKernel
.
getWorkGroupInfo
<
CL_KERNEL_WORK_GROUP_SIZE
>
(
context
.
getDevice
()));
unsigned
int
maxRangeSize
=
std
::
min
(
maxGroupSize
,
(
unsigned
int
)
computeRangeKernel
.
getWorkGroupInfo
<
CL_KERNEL_WORK_GROUP_SIZE
>
(
context
.
getDevice
()));
unsigned
int
maxPositionsSize
=
std
::
min
(
maxGroupSize
,
(
unsigned
int
)
computeBucketPositionsKernel
.
getWorkGroupInfo
<
CL_KERNEL_WORK_GROUP_SIZE
>
(
context
.
getDevice
()));
unsigned
int
maxPositionsSize
=
std
::
min
(
maxGroupSize
,
(
unsigned
int
)
computeBucketPositionsKernel
.
getWorkGroupInfo
<
CL_KERNEL_WORK_GROUP_SIZE
>
(
context
.
getDevice
()));
int
maxLocalBuffer
=
(
maxSharedMem
/
trait
->
getDataSize
())
/
2
;
int
maxLocalBuffer
=
(
maxSharedMem
/
trait
->
getDataSize
())
/
2
;
unsigned
int
maxShortList
=
min
(
8192
,
max
(
maxLocalBuffer
,
(
int
)
OpenCLContext
::
ThreadBlockSize
*
context
.
getNumThreadBlocks
()));
int
maxShortList
=
max
(
maxLocalBuffer
,
(
int
)
OpenCLContext
::
ThreadBlockSize
*
context
.
getNumThreadBlocks
());
// The following line checks CL_KERNEL_WORK_GROUP_SIZE to make sure we don't create too large a workgroup.
// Unfortunately, AMD's OpenCL returns an inappropriately small value for it that is much shorter than the actual
// maximum, so including the check hurts performance. For the moment I'm just leaving it commented out.
// If the workgroup size turns out to be too large, we catch the exception and switch back to the standard
// sorting kernels.
//maxShortList = min(maxShortList, shortListKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()));
isShortList
=
(
length
<=
maxShortList
);
string
vendor
=
context
.
getDevice
().
getInfo
<
CL_DEVICE_VENDOR
>
();
string
vendor
=
context
.
getDevice
().
getInfo
<
CL_DEVICE_VENDOR
>
();
if
(
vendor
.
size
()
>=
6
&&
vendor
.
substr
(
0
,
6
)
==
"NVIDIA"
)
if
(
vendor
.
size
()
>=
6
&&
vendor
.
substr
(
0
,
6
)
==
"NVIDIA"
)
{
maxShortList
=
min
(
3000
,
maxShortList
);
useShortList2
=
(
dataLength
<=
OpenCLContext
::
ThreadBlockSize
*
context
.
getNumThreadBlocks
());
useShortList2
=
(
dataLength
<=
OpenCLContext
::
ThreadBlockSize
*
context
.
getNumThreadBlocks
());
else
}
else
{
maxShortList
=
min
(
1024
,
maxShortList
);
useShortList2
=
false
;
useShortList2
=
false
;
}
isShortList
=
(
length
<=
maxShortList
);
for
(
rangeKernelSize
=
1
;
rangeKernelSize
*
2
<=
maxRangeSize
;
rangeKernelSize
*=
2
)
for
(
rangeKernelSize
=
1
;
rangeKernelSize
*
2
<=
maxRangeSize
;
rangeKernelSize
*=
2
)
;
;
positionsKernelSize
=
std
::
min
(
rangeKernelSize
,
maxPositionsSize
);
positionsKernelSize
=
std
::
min
(
rangeKernelSize
,
maxPositionsSize
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment