Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
73760081
Commit
73760081
authored
Jul 13, 2018
by
Peter Eastman
Browse files
Optimizations to PME in OpenCL
parent
36b9db84
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
72 additions
and
50 deletions
+72
-50
platforms/opencl/include/OpenCLKernels.h
platforms/opencl/include/OpenCLKernels.h
+1
-1
platforms/opencl/src/OpenCLKernels.cpp
platforms/opencl/src/OpenCLKernels.cpp
+24
-22
platforms/opencl/src/kernels/pme.cl
platforms/opencl/src/kernels/pme.cl
+47
-27
No files found.
platforms/opencl/include/OpenCLKernels.h
View file @
73760081
...
...
@@ -663,7 +663,7 @@ private:
OpenCLArray
exceptionOffsetIndices
;
OpenCLArray
globalParams
;
OpenCLArray
cosSinSums
;
OpenCLArray
pmeGrid
;
OpenCLArray
pmeGrid
1
;
OpenCLArray
pmeGrid2
;
OpenCLArray
pmeBsplineModuliX
;
OpenCLArray
pmeBsplineModuliY
;
...
...
platforms/opencl/src/OpenCLKernels.cpp
View file @
73760081
...
...
@@ -1726,15 +1726,18 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
defines["MULTSHIFT6"] = cl.doubleToString(multShift6);
}
int elementSize = (cl.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
int gridElements = gridSizeX*gridSizeY*gridSizeZ;
if (doLJPME)
gridElements = max(gridElements, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ);
pmeGrid.initialize(cl, gridElements, 2*elementSize, "pmeGrid");
int roundedZSize = PmeOrder*(int) ceil(gridSizeZ/(double) PmeOrder);
int gridElements = gridSizeX*gridSizeY*roundedZSize;
if (doLJPME) {
roundedZSize = PmeOrder*(int) ceil(dispersionGridSizeZ/(double) PmeOrder);
gridElements = max(gridElements, dispersionGridSizeX*dispersionGridSizeY*roundedZSize);
}
pmeGrid1.initialize(cl, gridElements, 2*elementSize, "pmeGrid1");
pmeGrid2.initialize(cl, gridElements, 2*elementSize, "pmeGrid2");
if (cl.getSupports64BitGlobalAtomics())
cl.addAutoclearBuffer(pmeGrid2);
else
cl.addAutoclearBuffer(pmeGrid);
cl.addAutoclearBuffer(pmeGrid
1
);
pmeBsplineModuliX.initialize(cl, gridSizeX, elementSize, "pmeBsplineModuliX");
pmeBsplineModuliY.initialize(cl, gridSizeY, elementSize, "pmeBsplineModuliY");
pmeBsplineModuliZ.initialize(cl, gridSizeZ, elementSize, "pmeBsplineModuliZ");
...
...
@@ -1755,8 +1758,6 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
dispersionFft = new OpenCLFFT3D(cl, dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ, true);
string vendor = cl.getDevice().getInfo<CL_DEVICE_VENDOR>();
bool isNvidia = (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA");
if (isNvidia)
pmeDefines["USE_ALTERNATE_MEMORY_ACCESS_PATTERN"] = "1";
usePmeQueue = (!cl.getPlatformData().disablePmeStream && isNvidia);
if (usePmeQueue) {
pmeDefines["USE_PME_STREAM"] = "1";
...
...
@@ -2006,7 +2007,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
ewaldForcesKernel.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer());
ewaldForcesKernel.setArg<cl::Buffer>(2, cosSinSums.getDeviceBuffer());
}
if (pmeGrid.isInitialized()) {
if (pmeGrid
1
.isInitialized()) {
// Create kernels for Coulomb PME.
map<string, string> replacements;
...
...
@@ -2036,7 +2037,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
if (cl.getSupports64BitGlobalAtomics())
pmeSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid2.getDeviceBuffer());
else
pmeSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid.getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid
1
.getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(4, pmeBsplineTheta.getDeviceBuffer());
if (deviceIsCpu || cl.getSupports64BitGlobalAtomics())
pmeSpreadChargeKernel.setArg<cl::Buffer>(13, charges.getDeviceBuffer());
...
...
@@ -2053,13 +2054,13 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeEvalEnergyKernel.setArg<cl::Buffer>(4, pmeBsplineModuliZ.getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(1, cl.getForceBuffers().getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(2, pmeGrid.getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(2, pmeGrid
1
.getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(11, pmeAtomGridIndex.getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(12, charges.getDeviceBuffer());
if (cl.getSupports64BitGlobalAtomics()) {
pmeFinishSpreadChargeKernel = cl::Kernel(program, "finishSpreadCharge");
pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(0, pmeGrid2.getDeviceBuffer());
pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid.getDeviceBuffer());
pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid
1
.getDeviceBuffer());
}
if (usePmeQueue)
syncQueue->setKernel(cl::Kernel(program, "addEnergy"));
...
...
@@ -2099,7 +2100,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
if (cl.getSupports64BitGlobalAtomics())
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid2.getDeviceBuffer());
else
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid.getDeviceBuffer());
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid
1
.getDeviceBuffer());
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(4, pmeBsplineTheta.getDeviceBuffer());
if (deviceIsCpu || cl.getSupports64BitGlobalAtomics())
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(13, sigmaEpsilon.getDeviceBuffer());
...
...
@@ -2116,13 +2117,13 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeDispersionEvalEnergyKernel.setArg<cl::Buffer>(4, pmeDispersionBsplineModuliZ.getDeviceBuffer());
pmeDispersionInterpolateForceKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
pmeDispersionInterpolateForceKernel.setArg<cl::Buffer>(1, cl.getForceBuffers().getDeviceBuffer());
pmeDispersionInterpolateForceKernel.setArg<cl::Buffer>(2, pmeGrid.getDeviceBuffer());
pmeDispersionInterpolateForceKernel.setArg<cl::Buffer>(2, pmeGrid
1
.getDeviceBuffer());
pmeDispersionInterpolateForceKernel.setArg<cl::Buffer>(11, pmeAtomGridIndex.getDeviceBuffer());
pmeDispersionInterpolateForceKernel.setArg<cl::Buffer>(12, sigmaEpsilon.getDeviceBuffer());
if (cl.getSupports64BitGlobalAtomics()) {
pmeDispersionFinishSpreadChargeKernel = cl::Kernel(program, "finishSpreadCharge");
pmeDispersionFinishSpreadChargeKernel.setArg<cl::Buffer>(0, pmeGrid2.getDeviceBuffer());
pmeDispersionFinishSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid.getDeviceBuffer());
pmeDispersionFinishSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid
1
.getDeviceBuffer());
}
}
}
...
...
@@ -2176,7 +2177,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
cl.executeKernel(ewaldSumsKernel, cosSinSums.getSize());
cl.executeKernel(ewaldForcesKernel, cl.getNumAtoms());
}
if (pmeGrid.isInitialized() && includeReciprocal) {
if (pmeGrid
1
.isInitialized() && includeReciprocal) {
if (usePmeQueue && !includeEnergy)
cl.setQueue(pmeQueue);
...
...
@@ -2251,7 +2252,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
cl.executeKernel(pmeSpreadChargeKernel, cl.getNumAtoms());
}
}
fft->execFFT(pmeGrid, pmeGrid2, true);
fft->execFFT(pmeGrid
1
, pmeGrid2, true);
mm_double4 boxSize = cl.getPeriodicBoxSizeDouble();
if (cl.getUseDoublePrecision()) {
pmeConvolutionKernel.setArg<mm_double4>(4, recipBoxVectors[0]);
...
...
@@ -2272,7 +2273,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
if (includeEnergy)
cl.executeKernel(pmeEvalEnergyKernel, gridSizeX*gridSizeY*gridSizeZ);
cl.executeKernel(pmeConvolutionKernel, gridSizeX*gridSizeY*gridSizeZ);
fft->execFFT(pmeGrid2, pmeGrid, false);
fft->execFFT(pmeGrid2, pmeGrid
1
, false);
setPeriodicBoxArgs(cl, pmeInterpolateForceKernel, 3);
if (cl.getUseDoublePrecision()) {
pmeInterpolateForceKernel.setArg<mm_double4>(8, recipBoxVectors[0]);
...
...
@@ -2304,7 +2305,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
}
cl.executeKernel(pmeDispersionUpdateBsplinesKernel, cl.getNumAtoms());
if (deviceIsCpu && !cl.getSupports64BitGlobalAtomics()) {
cl.clearBuffer(pmeGrid);
cl.clearBuffer(pmeGrid
1
);
setPeriodicBoxArgs(cl, pmeDispersionSpreadChargeKernel, 5);
if (cl.getUseDoublePrecision()) {
pmeDispersionSpreadChargeKernel.setArg<mm_double4>(10, recipBoxVectors[0]);
...
...
@@ -2319,6 +2320,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
cl.executeKernel(pmeDispersionSpreadChargeKernel, 2*cl.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(), 1);
}
else {
if (!hasCoulomb)
sort->sort(pmeAtomGridIndex);
if (cl.getSupports64BitGlobalAtomics()) {
cl.clearBuffer(pmeGrid2);
...
...
@@ -2337,7 +2339,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
cl.executeKernel(pmeDispersionFinishSpreadChargeKernel, gridSizeX*gridSizeY*gridSizeZ);
}
else {
cl.clearBuffer(pmeGrid);
cl.clearBuffer(pmeGrid
1
);
cl.executeKernel(pmeDispersionAtomRangeKernel, cl.getNumAtoms());
setPeriodicBoxSizeArg(cl, pmeDispersionZIndexKernel, 2);
if (cl.getUseDoublePrecision())
...
...
@@ -2348,7 +2350,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
cl.executeKernel(pmeDispersionSpreadChargeKernel, cl.getNumAtoms());
}
}
dispersionFft->execFFT(pmeGrid, pmeGrid2, true);
dispersionFft->execFFT(pmeGrid
1
, pmeGrid2, true);
mm_double4 boxSize = cl.getPeriodicBoxSizeDouble();
if (cl.getUseDoublePrecision()) {
pmeDispersionConvolutionKernel.setArg<mm_double4>(4, recipBoxVectors[0]);
...
...
@@ -2369,7 +2371,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
if (includeEnergy)
cl.executeKernel(pmeDispersionEvalEnergyKernel, gridSizeX*gridSizeY*gridSizeZ);
cl.executeKernel(pmeDispersionConvolutionKernel, gridSizeX*gridSizeY*gridSizeZ);
dispersionFft->execFFT(pmeGrid2, pmeGrid, false);
dispersionFft->execFFT(pmeGrid2, pmeGrid
1
, false);
setPeriodicBoxArgs(cl, pmeDispersionInterpolateForceKernel, 3);
if (cl.getUseDoublePrecision()) {
pmeDispersionInterpolateForceKernel.setArg<mm_double4>(8, recipBoxVectors[0]);
...
...
platforms/opencl/src/kernels/pme.cl
View file @
73760081
...
...
@@ -105,12 +105,25 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
,
__global
const
real*
restrict
charges
#
endif
)
{
const
real
scale
=
1/
(
real
)
(
PME_ORDER-1
)
;
real4
data[PME_ORDER]
;
//
To
improve
memory
efficiency,
we
divide
indices
along
the
z
axis
into
//
PME_ORDER
blocks,
where
the
data
for
each
block
is
stored
together.
We
//
can
ensure
that
all
threads
write
to
the
same
block
at
the
same
time,
//
which
leads
to
better
coalescing
of
writes.
__local
int
zindexTable[GRID_SIZE_Z+PME_ORDER]
;
int
blockSize
=
(
int
)
ceil
(
GRID_SIZE_Z/
(
real
)
PME_ORDER
)
;
for
(
int
i
=
get_local_id
(
0
)
; i < GRID_SIZE_Z+PME_ORDER; i += get_local_size(0)) {
int
zindex
=
i
%
GRID_SIZE_Z
;
int
block
=
zindex
%
PME_ORDER
;
zindexTable[i]
=
zindex/PME_ORDER
+
block*GRID_SIZE_X*GRID_SIZE_Y*blockSize
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
Process
the
atoms
in
spatially
sorted
order.
This
improves
efficiency
when
writing
//
the
grid
values.
const
real
scale
=
1/
(
real
)
(
PME_ORDER-1
)
;
real4
data[PME_ORDER]
;
for
(
int
i
=
get_global_id
(
0
)
; i < NUM_ATOMS; i += get_global_size(0)) {
int
atom
=
pmeAtomGridIndex[i].x
;
real4
pos
=
posq[atom]
;
...
...
@@ -118,7 +131,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
const
float2
sigEps
=
sigmaEpsilon[atom]
;
const
real
charge
=
8*sigEps.x*sigEps.x*sigEps.x*sigEps.y
;
#
else
const
real
charge
=
CHARGE
;
const
real
charge
=
(
CHARGE
)
*EPSILON_FACTOR
;
#
endif
if
(
charge
==
0
)
continue
;
...
...
@@ -154,40 +167,47 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
//
Spread
the
charge
from
this
atom
onto
each
grid
point.
int
izoffset
=
(
PME_ORDER-
(
gridIndex.z%PME_ORDER
))
%
PME_ORDER
;
for
(
int
ix
=
0
; ix < PME_ORDER; ix++) {
int
xindex
=
gridIndex.x+ix
;
xindex
-=
(
xindex
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
)
;
int
xbase
=
gridIndex.x+ix
;
xbase
-=
(
xbase
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
)
;
xbase
=
xbase*GRID_SIZE_Y
;
real
dx
=
charge*data[ix].x
;
for
(
int
iy
=
0
; iy < PME_ORDER; iy++) {
int
yindex
=
gridIndex.y+iy
;
yindex
-=
(
yindex
>=
GRID_SIZE_Y
?
GRID_SIZE_Y
:
0
)
;
for
(
int
iz
=
0
; iz < PME_ORDER; iz++) {
int
ybase
=
gridIndex.y+iy
;
ybase
-=
(
ybase
>=
GRID_SIZE_Y
?
GRID_SIZE_Y
:
0
)
;
ybase
=
(
xbase+ybase
)
*blockSize
;
real
dxdy
=
dx*data[iy].y
;
for
(
int
i
=
0
; i < PME_ORDER; i++) {
int
iz
=
(
i+izoffset
)
%
PME_ORDER
;
int
zindex
=
gridIndex.z+iz
;
zindex
-=
(
zindex
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
)
;
int
index
=
xindex*GRID_SIZE_Y*GRID_SIZE_Z
+
yindex*GRID_SIZE_Z
+
zindex
;
real
add
=
charge*data[ix].x*data[iy].y*data[iz].z
;
#
ifdef
USE_ALTERNATE_MEMORY_ACCESS_PATTERN
//
On
Nvidia
devices
(
at
least
Maxwell
anyway
)
,
this
split
ordering
produces
much
higher
performance.
Why?
//
I
have
no
idea!
And
of
course
on
AMD
it
produces
slower
performance.
GPUs
are
not
meant
to
be
understood.
atom_add
(
&pmeGrid[index%2
==
0
?
index/2
:
(
index+GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z
)
/2],
(
long
)
(
add*0x100000000
))
;
#
else
int
index
=
ybase
+
zindexTable[zindex]
;
real
add
=
dxdy*data[iz].z
;
atom_add
(
&pmeGrid[index],
(
long
)
(
add*0x100000000
))
;
#
endif
}
}
}
}
}
__kernel
void
finishSpreadCharge
(
__global
long*
restrict
fixedGrid,
__global
real*
restrict
realGrid
)
{
__kernel
void
finishSpreadCharge
(
__global
long*
restrict
grid1,
__global
real*
restrict
grid2
)
{
//
During
charge
spreading,
we
shuffled
the
order
of
indices
along
the
z
//
axis
to
make
memory
access
more
efficient.
We
now
need
to
unshuffle
//
them
and
convert
fixed
point
values
to
floating
point.
__local
int
zindexTable[GRID_SIZE_Z]
;
int
blockSize
=
(
int
)
ceil
(
GRID_SIZE_Z/
(
real
)
PME_ORDER
)
;
for
(
int
i
=
get_local_id
(
0
)
; i < GRID_SIZE_Z; i += get_local_size(0)) {
int
block
=
i
%
PME_ORDER
;
zindexTable[i]
=
i/PME_ORDER
+
block*GRID_SIZE_X*GRID_SIZE_Y*blockSize
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
const
unsigned
int
gridSize
=
GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z
;
real
scale
=
EPSILON_FACTOR
/
(
real
)
0x100000000
;
real
scale
=
1
/
(
real
)
0x100000000
;
for
(
int
index
=
get_global_id
(
0
)
; index < gridSize; index += get_global_size(0)) {
#
ifdef
USE_ALTERNATE_MEMORY_ACCESS_PATTERN
long
value
=
fixedGrid[index%2
==
0
?
index/2
:
(
index+gridSize
)
/2]
;
#
else
long
value
=
fixedGrid[index]
;
#
endif
realGrid[index]
=
(
real
)
(
value*scale
)
;
int
zindex
=
index%GRID_SIZE_Z
;
int
loadIndex
=
zindexTable[zindex]
+
blockSize*
(
int
)
(
index/GRID_SIZE_Z
)
;
grid2[index]
=
scale*grid1[loadIndex]
;
}
}
#
elif
defined
(
DEVICE_IS_CPU
)
...
...
@@ -230,7 +250,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
const
float2
sigEps
=
sigmaEpsilon[atom]
;
const
real
charge
=
8*sigEps.x*sigEps.x*sigEps.x*sigEps.y
;
#
else
const
real
charge
=
CHARGE
;
const
real
charge
=
(
CHARGE
)
*EPSILON_FACTOR
;
#
endif
if
(
charge
==
0
)
continue
;
...
...
@@ -269,7 +289,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
int zindex = gridIndex.z+iz;
zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
pmeGrid[index] +=
EPSILON_FACTOR*
charge*data[ix].x*data[iy].y*data[iz].z;
pmeGrid[index] += charge*data[ix].x*data[iy].y*data[iz].z;
}
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment