Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
b3af19cd
"vscode:/vscode.git/clone" did not exist on "ebbc40e3f3eadc208d994762cfe689f06c86ae7e"
Commit
b3af19cd
authored
May 16, 2018
by
peastman
Browse files
Minor optimizations
parent
143fe36d
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
10 additions
and
7 deletions
+10
-7
platforms/cuda/src/CudaContext.cpp
platforms/cuda/src/CudaContext.cpp
+1
-1
platforms/cuda/src/CudaSort.cpp
platforms/cuda/src/CudaSort.cpp
+4
-2
platforms/opencl/src/OpenCLSort.cpp
platforms/opencl/src/OpenCLSort.cpp
+5
-4
No files found.
platforms/cuda/src/CudaContext.cpp
View file @
b3af19cd
...
@@ -218,7 +218,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
...
@@ -218,7 +218,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
int
major
,
minor
;
int
major
,
minor
;
CHECK_RESULT
(
cuDeviceComputeCapability
(
&
major
,
&
minor
,
device
));
CHECK_RESULT
(
cuDeviceComputeCapability
(
&
major
,
&
minor
,
device
));
int
numThreadBlocksPerComputeUnit
=
(
major
>
=
6
?
4
:
6
);
int
numThreadBlocksPerComputeUnit
=
(
major
=
=
6
?
4
:
6
);
if
(
cudaDriverVersion
<
7000
)
{
if
(
cudaDriverVersion
<
7000
)
{
// This is a workaround to support GTX 980 with CUDA 6.5. It reports
// This is a workaround to support GTX 980 with CUDA 6.5. It reports
// its compute capability as 5.2, but the compiler doesn't support
// its compute capability as 5.2, but the compiler doesn't support
...
...
platforms/cuda/src/CudaSort.cpp
View file @
b3af19cd
...
@@ -26,6 +26,7 @@
...
@@ -26,6 +26,7 @@
#include "CudaSort.h"
#include "CudaSort.h"
#include "CudaKernelSources.h"
#include "CudaKernelSources.h"
#include <algorithm>
#include <map>
#include <map>
using
namespace
OpenMM
;
using
namespace
OpenMM
;
...
@@ -56,8 +57,9 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
...
@@ -56,8 +57,9 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
cuDeviceGetAttribute
(
&
maxBlockSize
,
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X
,
context
.
getDevice
());
cuDeviceGetAttribute
(
&
maxBlockSize
,
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X
,
context
.
getDevice
());
int
maxSharedMem
;
int
maxSharedMem
;
cuDeviceGetAttribute
(
&
maxSharedMem
,
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK
,
context
.
getDevice
());
cuDeviceGetAttribute
(
&
maxSharedMem
,
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK
,
context
.
getDevice
());
unsigned
int
maxLocalBuffer
=
(
unsigned
int
)
((
maxSharedMem
/
trait
->
getDataSize
())
/
2
);
int
maxLocalBuffer
=
(
maxSharedMem
/
trait
->
getDataSize
())
/
2
;
isShortList
=
(
length
<=
maxLocalBuffer
||
length
<=
CudaContext
::
ThreadBlockSize
*
context
.
getNumThreadBlocks
());
int
maxShortList
=
min
(
8192
,
max
(
maxLocalBuffer
,
CudaContext
::
ThreadBlockSize
*
context
.
getNumThreadBlocks
()));
isShortList
=
(
length
<=
maxShortList
);
for
(
rangeKernelSize
=
1
;
rangeKernelSize
*
2
<=
maxBlockSize
;
rangeKernelSize
*=
2
)
for
(
rangeKernelSize
=
1
;
rangeKernelSize
*
2
<=
maxBlockSize
;
rangeKernelSize
*=
2
)
;
;
positionsKernelSize
=
rangeKernelSize
;
positionsKernelSize
=
rangeKernelSize
;
...
...
platforms/opencl/src/OpenCLSort.cpp
View file @
b3af19cd
...
@@ -59,15 +59,16 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
...
@@ -59,15 +59,16 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
unsigned
int
maxGroupSize
=
std
::
min
(
256
,
(
int
)
context
.
getDevice
().
getInfo
<
CL_DEVICE_MAX_WORK_GROUP_SIZE
>
());
unsigned
int
maxGroupSize
=
std
::
min
(
256
,
(
int
)
context
.
getDevice
().
getInfo
<
CL_DEVICE_MAX_WORK_GROUP_SIZE
>
());
int
maxSharedMem
=
context
.
getDevice
().
getInfo
<
CL_DEVICE_LOCAL_MEM_SIZE
>
();
int
maxSharedMem
=
context
.
getDevice
().
getInfo
<
CL_DEVICE_LOCAL_MEM_SIZE
>
();
unsigned
int
maxLocalBuffer
=
(
unsigned
int
)
((
maxSharedMem
/
trait
->
getDataSize
())
/
2
);
unsigned
int
maxRangeSize
=
std
::
min
(
maxGroupSize
,
(
unsigned
int
)
computeRangeKernel
.
getWorkGroupInfo
<
CL_KERNEL_WORK_GROUP_SIZE
>
(
context
.
getDevice
()));
unsigned
int
maxRangeSize
=
std
::
min
(
maxGroupSize
,
(
unsigned
int
)
computeRangeKernel
.
getWorkGroupInfo
<
CL_KERNEL_WORK_GROUP_SIZE
>
(
context
.
getDevice
()));
unsigned
int
maxPositionsSize
=
std
::
min
(
maxGroupSize
,
(
unsigned
int
)
computeBucketPositionsKernel
.
getWorkGroupInfo
<
CL_KERNEL_WORK_GROUP_SIZE
>
(
context
.
getDevice
()));
unsigned
int
maxPositionsSize
=
std
::
min
(
maxGroupSize
,
(
unsigned
int
)
computeBucketPositionsKernel
.
getWorkGroupInfo
<
CL_KERNEL_WORK_GROUP_SIZE
>
(
context
.
getDevice
()));
unsigned
int
maxShortListSize
=
shortListKernel
.
getWorkGroupInfo
<
CL_KERNEL_WORK_GROUP_SIZE
>
(
context
.
getDevice
());
int
maxLocalBuffer
=
(
maxSharedMem
/
trait
->
getDataSize
())
/
2
;
// On Qualcomm's OpenCL, it's essential to check against maxShortListSize. Otherwise you get a crash.
unsigned
int
maxShortList
=
min
(
8192
,
max
(
maxLocalBuffer
,
(
int
)
OpenCLContext
::
ThreadBlockSize
*
context
.
getNumThreadBlocks
()));
// On Qualcomm's OpenCL, it's essential to check against CL_KERNEL_WORK_GROUP_SIZE. Otherwise you get a crash.
// But AMD's OpenCL returns an inappropriately small value for it that is much shorter than the actual
// But AMD's OpenCL returns an inappropriately small value for it that is much shorter than the actual
// maximum, so including the check hurts performance. For the moment I'm going to just comment it out.
// maximum, so including the check hurts performance. For the moment I'm going to just comment it out.
// If we officially support Qualcomm in the future, we'll need to do something better.
// If we officially support Qualcomm in the future, we'll need to do something better.
isShortList
=
(
length
<=
maxLocalBuffer
/* && length < maxShortListSize*/
||
length
<=
OpenCLContext
::
ThreadBlockSize
*
context
.
getNumThreadBlocks
());
//maxShortList = min(maxShortList, shortListKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()));
isShortList
=
(
length
<=
maxShortList
);
for
(
rangeKernelSize
=
1
;
rangeKernelSize
*
2
<=
maxRangeSize
;
rangeKernelSize
*=
2
)
for
(
rangeKernelSize
=
1
;
rangeKernelSize
*
2
<=
maxRangeSize
;
rangeKernelSize
*=
2
)
;
;
positionsKernelSize
=
std
::
min
(
rangeKernelSize
,
maxPositionsSize
);
positionsKernelSize
=
std
::
min
(
rangeKernelSize
,
maxPositionsSize
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment