Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
56b199be
Commit
56b199be
authored
Jan 24, 2018
by
Peter Eastman
Browse files
Optimizations to RMSDForce
parent
f4dc3110
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
18 additions
and
8 deletions
+18
-8
platforms/cuda/src/CudaKernels.cpp
platforms/cuda/src/CudaKernels.cpp
+1
-1
platforms/cuda/src/kernels/rmsd.cu
platforms/cuda/src/kernels/rmsd.cu
+8
-3
platforms/opencl/src/OpenCLKernels.cpp
platforms/opencl/src/OpenCLKernels.cpp
+1
-1
platforms/opencl/src/kernels/rmsd.cl
platforms/opencl/src/kernels/rmsd.cl
+8
-3
No files found.
platforms/cuda/src/CudaKernels.cpp
View file @
56b199be
...
...
@@ -6887,7 +6887,7 @@ double CudaCalcRMSDForceKernel::executeImpl(ContextImpl& context) {
// Execute the first kernel.
int numParticles = particles->getSize();
int
blockSize
=
128
;
int blockSize =
256
;
void* args1[] = {&numParticles, &cu.getPosq().getDevicePointer(), &referencePos->getDevicePointer(),
&particles->getDevicePointer(), &buffer->getDevicePointer()};
cu.executeKernel(kernel1, args1, blockSize, blockSize, blockSize*sizeof(REAL));
...
...
platforms/cuda/src/kernels/rmsd.cu
View file @
56b199be
...
...
@@ -4,11 +4,16 @@
/**
* Sum a value over all threads.
*/
__device__
real
reduceValue
(
real
value
,
real
*
temp
)
{
__device__
real
reduceValue
(
real
value
,
volatile
real
*
temp
)
{
const
int
thread
=
threadIdx
.
x
;
temp
[
thread
]
=
value
;
__syncthreads
();
for
(
uint
step
=
1
;
step
<
blockDim
.
x
;
step
*=
2
)
{
for
(
uint
step
=
1
;
step
<
32
;
step
*=
2
)
{
if
(
thread
+
step
<
blockDim
.
x
&&
thread
%
(
2
*
step
)
==
0
)
temp
[
thread
]
=
temp
[
thread
]
+
temp
[
thread
+
step
];
SYNC_WARPS
}
for
(
uint
step
=
32
;
step
<
blockDim
.
x
;
step
*=
2
)
{
if
(
thread
+
step
<
blockDim
.
x
&&
thread
%
(
2
*
step
)
==
0
)
temp
[
thread
]
=
temp
[
thread
]
+
temp
[
thread
+
step
];
__syncthreads
();
...
...
@@ -21,7 +26,7 @@ __device__ real reduceValue(real value, real* temp) {
*/
extern
"C"
__global__
void
computeRMSDPart1
(
int
numParticles
,
const
real4
*
__restrict__
posq
,
const
real4
*
__restrict__
referencePos
,
const
int
*
__restrict__
particles
,
real
*
buffer
)
{
extern
__shared__
real
temp
[];
extern
__shared__
volatile
real
temp
[];
// Compute the center of the particle positions.
...
...
platforms/opencl/src/OpenCLKernels.cpp
View file @
56b199be
...
...
@@ -7167,7 +7167,7 @@ double OpenCLCalcRMSDForceKernel::executeImpl(ContextImpl& context) {
// Execute the first kernel.
int numParticles = particles->getSize();
int blockSize =
128
;
int blockSize =
min(256, (int) kernel1.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(cl.getDevice()))
;
kernel1.setArg<cl_int>(0, numParticles);
kernel1.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer());
kernel1.setArg<cl::Buffer>(2, referencePos->getDeviceBuffer());
...
...
platforms/opencl/src/kernels/rmsd.cl
View file @
56b199be
...
...
@@ -4,11 +4,16 @@
/**
*
Sum
a
value
over
all
threads.
*/
real
reduceValue
(
real
value,
__local
real*
temp
)
{
real
reduceValue
(
real
value,
__local
volatile
real*
temp
)
{
const
int
thread
=
get_local_id
(
0
)
;
temp[thread]
=
value
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
uint
step
=
1
; step < get_local_size(0); step *= 2) {
for
(
uint
step
=
1
; step < 32; step *= 2) {
if
(
thread+step
<
get_local_size
(
0
)
&&
thread%
(
2*step
)
==
0
)
temp[thread]
=
temp[thread]
+
temp[thread+step]
;
SYNC_WARPS
}
for
(
uint
step
=
32
; step < get_local_size(0); step *= 2) {
if
(
thread+step
<
get_local_size
(
0
)
&&
thread%
(
2*step
)
==
0
)
temp[thread]
=
temp[thread]
+
temp[thread+step]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
...
...
@@ -20,7 +25,7 @@ real reduceValue(real value, __local real* temp) {
*
Perform
the
first
step
of
computing
the
RMSD.
This
is
executed
as
a
single
work
group.
*/
__kernel
void
computeRMSDPart1
(
int
numParticles,
__global
const
real4*
restrict
posq,
__global
const
real4*
restrict
referencePos,
__global
const
int*
restrict
particles,
__global
real*
buffer,
__local
real*
restrict
temp
)
{
__global
const
int*
restrict
particles,
__global
real*
buffer,
__local
volatile
real*
restrict
temp
)
{
//
Compute
the
center
of
the
particle
positions.
real3
center
=
(
real3
)
0
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment