Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
b3af19cd
Commit
b3af19cd
authored
May 16, 2018
by
peastman
Browse files
Minor optimizations
parent
143fe36d
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
10 additions
and
7 deletions
+10
-7
platforms/cuda/src/CudaContext.cpp
platforms/cuda/src/CudaContext.cpp
+1
-1
platforms/cuda/src/CudaSort.cpp
platforms/cuda/src/CudaSort.cpp
+4
-2
platforms/opencl/src/OpenCLSort.cpp
platforms/opencl/src/OpenCLSort.cpp
+5
-4
No files found.
platforms/cuda/src/CudaContext.cpp
View file @
b3af19cd
...
@@ -218,7 +218,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
...
@@ -218,7 +218,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
int
major
,
minor
;
int
major
,
minor
;
CHECK_RESULT
(
cuDeviceComputeCapability
(
&
major
,
&
minor
,
device
));
CHECK_RESULT
(
cuDeviceComputeCapability
(
&
major
,
&
minor
,
device
));
int
numThreadBlocksPerComputeUnit
=
(
major
>=
6
?
4
:
6
);
int
numThreadBlocksPerComputeUnit
=
(
major
==
6
?
4
:
6
);
if
(
cudaDriverVersion
<
7000
)
{
if
(
cudaDriverVersion
<
7000
)
{
// This is a workaround to support GTX 980 with CUDA 6.5. It reports
// This is a workaround to support GTX 980 with CUDA 6.5. It reports
// its compute capability as 5.2, but the compiler doesn't support
// its compute capability as 5.2, but the compiler doesn't support
...
...
platforms/cuda/src/CudaSort.cpp
View file @
b3af19cd
...
@@ -26,6 +26,7 @@
...
@@ -26,6 +26,7 @@
#include "CudaSort.h"
#include "CudaSort.h"
#include "CudaKernelSources.h"
#include "CudaKernelSources.h"
#include <algorithm>
#include <map>
#include <map>
using
namespace
OpenMM
;
using
namespace
OpenMM
;
...
@@ -56,8 +57,9 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
...
@@ -56,8 +57,9 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
cuDeviceGetAttribute
(
&
maxBlockSize
,
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X
,
context
.
getDevice
());
cuDeviceGetAttribute
(
&
maxBlockSize
,
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X
,
context
.
getDevice
());
int
maxSharedMem
;
int
maxSharedMem
;
cuDeviceGetAttribute
(
&
maxSharedMem
,
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK
,
context
.
getDevice
());
cuDeviceGetAttribute
(
&
maxSharedMem
,
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK
,
context
.
getDevice
());
unsigned
int
maxLocalBuffer
=
(
unsigned
int
)
((
maxSharedMem
/
trait
->
getDataSize
())
/
2
);
int
maxLocalBuffer
=
(
maxSharedMem
/
trait
->
getDataSize
())
/
2
;
isShortList
=
(
length
<=
maxLocalBuffer
||
length
<=
CudaContext
::
ThreadBlockSize
*
context
.
getNumThreadBlocks
());
int
maxShortList
=
min
(
8192
,
max
(
maxLocalBuffer
,
CudaContext
::
ThreadBlockSize
*
context
.
getNumThreadBlocks
()));
isShortList
=
(
length
<=
maxShortList
);
for
(
rangeKernelSize
=
1
;
rangeKernelSize
*
2
<=
maxBlockSize
;
rangeKernelSize
*=
2
)
for
(
rangeKernelSize
=
1
;
rangeKernelSize
*
2
<=
maxBlockSize
;
rangeKernelSize
*=
2
)
;
;
positionsKernelSize
=
rangeKernelSize
;
positionsKernelSize
=
rangeKernelSize
;
...
...
platforms/opencl/src/OpenCLSort.cpp
View file @
b3af19cd
...
@@ -59,15 +59,16 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
...
@@ -59,15 +59,16 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
unsigned
int
maxGroupSize
=
std
::
min
(
256
,
(
int
)
context
.
getDevice
().
getInfo
<
CL_DEVICE_MAX_WORK_GROUP_SIZE
>
());
unsigned
int
maxGroupSize
=
std
::
min
(
256
,
(
int
)
context
.
getDevice
().
getInfo
<
CL_DEVICE_MAX_WORK_GROUP_SIZE
>
());
int
maxSharedMem
=
context
.
getDevice
().
getInfo
<
CL_DEVICE_LOCAL_MEM_SIZE
>
();
int
maxSharedMem
=
context
.
getDevice
().
getInfo
<
CL_DEVICE_LOCAL_MEM_SIZE
>
();
unsigned
int
maxLocalBuffer
=
(
unsigned
int
)
((
maxSharedMem
/
trait
->
getDataSize
())
/
2
);
unsigned
int
maxRangeSize
=
std
::
min
(
maxGroupSize
,
(
unsigned
int
)
computeRangeKernel
.
getWorkGroupInfo
<
CL_KERNEL_WORK_GROUP_SIZE
>
(
context
.
getDevice
()));
unsigned
int
maxRangeSize
=
std
::
min
(
maxGroupSize
,
(
unsigned
int
)
computeRangeKernel
.
getWorkGroupInfo
<
CL_KERNEL_WORK_GROUP_SIZE
>
(
context
.
getDevice
()));
unsigned
int
maxPositionsSize
=
std
::
min
(
maxGroupSize
,
(
unsigned
int
)
computeBucketPositionsKernel
.
getWorkGroupInfo
<
CL_KERNEL_WORK_GROUP_SIZE
>
(
context
.
getDevice
()));
unsigned
int
maxPositionsSize
=
std
::
min
(
maxGroupSize
,
(
unsigned
int
)
computeBucketPositionsKernel
.
getWorkGroupInfo
<
CL_KERNEL_WORK_GROUP_SIZE
>
(
context
.
getDevice
()));
unsigned
int
maxShortListSize
=
shortListKernel
.
getWorkGroupInfo
<
CL_KERNEL_WORK_GROUP_SIZE
>
(
context
.
getDevice
());
int
maxLocalBuffer
=
(
maxSharedMem
/
trait
->
getDataSize
())
/
2
;
// On Qualcomm's OpenCL, it's essential to check against maxShortListSize. Otherwise you get a crash.
unsigned
int
maxShortList
=
min
(
8192
,
max
(
maxLocalBuffer
,
(
int
)
OpenCLContext
::
ThreadBlockSize
*
context
.
getNumThreadBlocks
()));
// On Qualcomm's OpenCL, it's essential to check against CL_KERNEL_WORK_GROUP_SIZE. Otherwise you get a crash.
// But AMD's OpenCL returns an inappropriately small value for it that is much shorter than the actual
// But AMD's OpenCL returns an inappropriately small value for it that is much shorter than the actual
// maximum, so including the check hurts performance. For the moment I'm going to just comment it out.
// maximum, so including the check hurts performance. For the moment I'm going to just comment it out.
// If we officially support Qualcomm in the future, we'll need to do something better.
// If we officially support Qualcomm in the future, we'll need to do something better.
isShortList
=
(
length
<=
maxLocalBuffer
/* && length < maxShortListSize*/
||
length
<=
OpenCLContext
::
ThreadBlockSize
*
context
.
getNumThreadBlocks
());
//maxShortList = min(maxShortList, shortListKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()));
isShortList
=
(
length
<=
maxShortList
);
for
(
rangeKernelSize
=
1
;
rangeKernelSize
*
2
<=
maxRangeSize
;
rangeKernelSize
*=
2
)
for
(
rangeKernelSize
=
1
;
rangeKernelSize
*
2
<=
maxRangeSize
;
rangeKernelSize
*=
2
)
;
;
positionsKernelSize
=
std
::
min
(
rangeKernelSize
,
maxPositionsSize
);
positionsKernelSize
=
std
::
min
(
rangeKernelSize
,
maxPositionsSize
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment