Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
2975f44b
Unverified
Commit
2975f44b
authored
Jan 31, 2023
by
Peter Eastman
Committed by
GitHub
Jan 31, 2023
Browse files
Optimized reducing energy (#3902)
parent
e0c80069
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
34 additions
and
24 deletions
+34
-24
platforms/cuda/src/CudaContext.cpp
platforms/cuda/src/CudaContext.cpp
+17
-9
platforms/cuda/src/kernels/utilities.cu
platforms/cuda/src/kernels/utilities.cu
+2
-2
platforms/opencl/src/OpenCLContext.cpp
platforms/opencl/src/OpenCLContext.cpp
+13
-11
platforms/opencl/src/kernels/utilities.cl
platforms/opencl/src/kernels/utilities.cl
+2
-2
No files found.
platforms/cuda/src/CudaContext.cpp
View file @
2975f44b
...
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2021 Stanford University and the Authors. *
* Portions copyright (c) 2009-2023 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -435,21 +435,23 @@ void CudaContext::initialize() {
ContextSelector
selector
(
*
this
);
string
errorMessage
=
"Error initializing Context"
;
int
numEnergyBuffers
=
max
(
numThreadBlocks
*
ThreadBlockSize
,
nonbonded
->
getNumEnergyBuffers
());
int
multiprocessors
;
CHECK_RESULT2
(
cuDeviceGetAttribute
(
&
multiprocessors
,
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
,
device
),
"Error checking GPU properties"
);
if
(
useDoublePrecision
)
{
energyBuffer
.
initialize
<
double
>
(
*
this
,
numEnergyBuffers
,
"energyBuffer"
);
energySum
.
initialize
<
double
>
(
*
this
,
1
,
"energySum"
);
energySum
.
initialize
<
double
>
(
*
this
,
multiprocessors
,
"energySum"
);
int
pinnedBufferSize
=
max
(
paddedNumAtoms
*
4
,
numEnergyBuffers
);
CHECK_RESULT
(
cuMemHostAlloc
(
&
pinnedBuffer
,
pinnedBufferSize
*
sizeof
(
double
),
0
));
}
else
if
(
useMixedPrecision
)
{
energyBuffer
.
initialize
<
double
>
(
*
this
,
numEnergyBuffers
,
"energyBuffer"
);
energySum
.
initialize
<
double
>
(
*
this
,
1
,
"energySum"
);
energySum
.
initialize
<
double
>
(
*
this
,
multiprocessors
,
"energySum"
);
int
pinnedBufferSize
=
max
(
paddedNumAtoms
*
4
,
numEnergyBuffers
);
CHECK_RESULT
(
cuMemHostAlloc
(
&
pinnedBuffer
,
pinnedBufferSize
*
sizeof
(
double
),
0
));
}
else
{
energyBuffer
.
initialize
<
float
>
(
*
this
,
numEnergyBuffers
,
"energyBuffer"
);
energySum
.
initialize
<
float
>
(
*
this
,
1
,
"energySum"
);
energySum
.
initialize
<
float
>
(
*
this
,
multiprocessors
,
"energySum"
);
int
pinnedBufferSize
=
max
(
paddedNumAtoms
*
6
,
numEnergyBuffers
);
CHECK_RESULT
(
cuMemHostAlloc
(
&
pinnedBuffer
,
pinnedBufferSize
*
sizeof
(
float
),
0
));
}
...
...
@@ -820,12 +822,18 @@ double CudaContext::reduceEnergy() {
int
bufferSize
=
energyBuffer
.
getSize
();
int
workGroupSize
=
512
;
void
*
args
[]
=
{
&
energyBuffer
.
getDevicePointer
(),
&
energySum
.
getDevicePointer
(),
&
bufferSize
,
&
workGroupSize
};
executeKernel
(
reduceEnergyKernel
,
args
,
workGroupSize
,
workGroupSize
,
workGroupSize
*
energyBuffer
.
getElementSize
());
executeKernel
(
reduceEnergyKernel
,
args
,
workGroupSize
*
energySum
.
getSize
()
,
workGroupSize
,
workGroupSize
*
energyBuffer
.
getElementSize
());
energySum
.
download
(
pinnedBuffer
);
if
(
getUseDoublePrecision
()
||
getUseMixedPrecision
())
return
*
((
double
*
)
pinnedBuffer
);
else
return
*
((
float
*
)
pinnedBuffer
);
double
result
=
0
;
if
(
getUseDoublePrecision
()
||
getUseMixedPrecision
())
{
for
(
int
i
=
0
;
i
<
energySum
.
getSize
();
i
++
)
result
+=
((
double
*
)
pinnedBuffer
)[
i
];
}
else
{
for
(
int
i
=
0
;
i
<
energySum
.
getSize
();
i
++
)
result
+=
((
float
*
)
pinnedBuffer
)[
i
];
}
return
result
;
}
void
CudaContext
::
setCharges
(
const
vector
<
double
>&
charges
)
{
...
...
platforms/cuda/src/kernels/utilities.cu
View file @
2975f44b
...
...
@@ -80,7 +80,7 @@ __global__ void reduceEnergy(const mixed* __restrict__ energyBuffer, mixed* __re
extern
__shared__
mixed
tempBuffer
[];
const
unsigned
int
thread
=
threadIdx
.
x
;
mixed
sum
=
0
;
for
(
unsigned
int
index
=
thread
;
index
<
bufferSize
;
index
+=
blockDim
.
x
)
for
(
unsigned
int
index
=
blockDim
.
x
*
blockIdx
.
x
+
thread
Idx
.
x
;
index
<
bufferSize
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
sum
+=
energyBuffer
[
index
];
tempBuffer
[
thread
]
=
sum
;
for
(
int
i
=
1
;
i
<
workGroupSize
;
i
*=
2
)
{
...
...
@@ -89,7 +89,7 @@ __global__ void reduceEnergy(const mixed* __restrict__ energyBuffer, mixed* __re
tempBuffer
[
thread
]
+=
tempBuffer
[
thread
+
i
];
}
if
(
thread
==
0
)
*
result
=
tempBuffer
[
0
];
result
[
blockIdx
.
x
]
=
tempBuffer
[
0
];
}
/**
...
...
platforms/opencl/src/OpenCLContext.cpp
View file @
2975f44b
...
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2020 Stanford University and the Authors. *
* Portions copyright (c) 2009-2023 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -497,23 +497,24 @@ void OpenCLContext::initialize() {
bonded
->
initialize
(
system
);
numForceBuffers
=
std
::
max
(
numForceBuffers
,
(
int
)
platformData
.
contexts
.
size
());
int
energyBufferSize
=
max
(
numThreadBlocks
*
ThreadBlockSize
,
nonbonded
->
getNumEnergyBuffers
());
int
numComputeUnits
=
device
.
getInfo
<
CL_DEVICE_MAX_COMPUTE_UNITS
>
();
if
(
useDoublePrecision
)
{
forceBuffers
.
initialize
<
mm_double4
>
(
*
this
,
paddedNumAtoms
*
numForceBuffers
,
"forceBuffers"
);
force
.
initialize
<
mm_double4
>
(
*
this
,
&
forceBuffers
.
getDeviceBuffer
(),
paddedNumAtoms
,
"force"
);
energyBuffer
.
initialize
<
cl_double
>
(
*
this
,
energyBufferSize
,
"energyBuffer"
);
energySum
.
initialize
<
cl_double
>
(
*
this
,
1
,
"energySum"
);
energySum
.
initialize
<
cl_double
>
(
*
this
,
numComputeUnits
,
"energySum"
);
}
else
if
(
useMixedPrecision
)
{
forceBuffers
.
initialize
<
mm_float4
>
(
*
this
,
paddedNumAtoms
*
numForceBuffers
,
"forceBuffers"
);
force
.
initialize
<
mm_float4
>
(
*
this
,
&
forceBuffers
.
getDeviceBuffer
(),
paddedNumAtoms
,
"force"
);
energyBuffer
.
initialize
<
cl_double
>
(
*
this
,
energyBufferSize
,
"energyBuffer"
);
energySum
.
initialize
<
cl_double
>
(
*
this
,
1
,
"energySum"
);
energySum
.
initialize
<
cl_double
>
(
*
this
,
numComputeUnits
,
"energySum"
);
}
else
{
forceBuffers
.
initialize
<
mm_float4
>
(
*
this
,
paddedNumAtoms
*
numForceBuffers
,
"forceBuffers"
);
force
.
initialize
<
mm_float4
>
(
*
this
,
&
forceBuffers
.
getDeviceBuffer
(),
paddedNumAtoms
,
"force"
);
energyBuffer
.
initialize
<
cl_float
>
(
*
this
,
energyBufferSize
,
"energyBuffer"
);
energySum
.
initialize
<
cl_float
>
(
*
this
,
1
,
"energySum"
);
energySum
.
initialize
<
cl_float
>
(
*
this
,
numComputeUnits
,
"energySum"
);
}
reduceForcesKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
longForceBuffer
.
getDeviceBuffer
());
reduceForcesKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
forceBuffers
.
getDeviceBuffer
());
...
...
@@ -798,17 +799,18 @@ double OpenCLContext::reduceEnergy() {
reduceEnergyKernel
.
setArg
<
cl_int
>
(
2
,
energyBuffer
.
getSize
());
reduceEnergyKernel
.
setArg
<
cl_int
>
(
3
,
workGroupSize
);
reduceEnergyKernel
.
setArg
(
4
,
workGroupSize
*
energyBuffer
.
getElementSize
(),
NULL
);
executeKernel
(
reduceEnergyKernel
,
workGroupSize
,
workGroupSize
);
executeKernel
(
reduceEnergyKernel
,
workGroupSize
*
energySum
.
getSize
(),
workGroupSize
);
energySum
.
download
(
pinnedMemory
);
double
result
=
0
;
if
(
getUseDoublePrecision
()
||
getUseMixedPrecision
())
{
double
energy
;
energySum
.
download
(
&
energy
);
return
energy
;
for
(
int
i
=
0
;
i
<
energySum
.
getSize
();
i
++
)
result
+=
((
double
*
)
pinnedMemory
)[
i
];
}
else
{
float
energy
;
energySum
.
download
(
&
energy
);
return
energy
;
for
(
int
i
=
0
;
i
<
energySum
.
getSize
();
i
++
)
result
+=
((
float
*
)
pinnedMemory
)[
i
];
}
return
result
;
}
void
OpenCLContext
::
setCharges
(
const
vector
<
double
>&
charges
)
{
...
...
platforms/opencl/src/kernels/utilities.cl
View file @
2975f44b
...
...
@@ -108,7 +108,7 @@ __kernel void reduceForces(__global long* restrict longBuffer, __global real4* r
__kernel
void
reduceEnergy
(
__global
const
mixed*
restrict
energyBuffer,
__global
mixed*
restrict
result,
int
bufferSize,
int
workGroupSize,
__local
mixed*
tempBuffer
)
{
const
unsigned
int
thread
=
get_local_id
(
0
)
;
mixed
sum
=
0
;
for
(
unsigned
int
index
=
thread
; index < bufferSize; index += get_lo
c
al_size(0))
for
(
unsigned
int
index
=
get_global_id
(
0
)
; index < bufferSize; index += get_
g
lo
b
al_size(0))
sum
+=
energyBuffer[index]
;
tempBuffer[thread]
=
sum
;
for
(
int
i
=
1
; i < workGroupSize; i *= 2) {
...
...
@@ -117,7 +117,7 @@ __kernel void reduceEnergy(__global const mixed* restrict energyBuffer, __global
tempBuffer[thread]
+=
tempBuffer[thread+i]
;
}
if
(
thread
==
0
)
*
result
=
tempBuffer[0]
;
result
[get_group_id
(
0
)
]
=
tempBuffer[0]
;
}
/**
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment