Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
4885a268
Unverified
Commit
4885a268
authored
Jul 16, 2018
by
peastman
Committed by
GitHub
Jul 16, 2018
Browse files
Merge pull request #2121 from peastman/pme
Optimizations to PME charge spreading
parents
38ae2563
73760081
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
171 additions
and
127 deletions
+171
-127
platforms/cuda/include/CudaKernels.h
platforms/cuda/include/CudaKernels.h
+2
-2
platforms/cuda/src/CudaKernels.cpp
platforms/cuda/src/CudaKernels.cpp
+42
-42
platforms/cuda/src/kernels/pme.cu
platforms/cuda/src/kernels/pme.cu
+55
-33
platforms/opencl/include/OpenCLKernels.h
platforms/opencl/include/OpenCLKernels.h
+1
-1
platforms/opencl/src/OpenCLKernels.cpp
platforms/opencl/src/OpenCLKernels.cpp
+24
-22
platforms/opencl/src/kernels/pme.cl
platforms/opencl/src/kernels/pme.cl
+47
-27
No files found.
platforms/cuda/include/CudaKernels.h
View file @
4885a268
...
...
@@ -686,8 +686,8 @@ private:
CudaArray
exceptionOffsetIndices
;
CudaArray
globalParams
;
CudaArray
cosSinSums
;
CudaArray
directP
meGrid
;
CudaArray
reciprocalP
meGrid
;
CudaArray
p
meGrid
1
;
CudaArray
p
meGrid
2
;
CudaArray
pmeBsplineModuliX
;
CudaArray
pmeBsplineModuliY
;
CudaArray
pmeBsplineModuliZ
;
...
...
platforms/cuda/src/CudaKernels.cpp
View file @
4885a268
...
...
@@ -1749,7 +1749,6 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
pmeDefines
[
"GRID_SIZE_X"
]
=
cu
.
intToString
(
dispersionGridSizeX
);
pmeDefines
[
"GRID_SIZE_Y"
]
=
cu
.
intToString
(
dispersionGridSizeY
);
pmeDefines
[
"GRID_SIZE_Z"
]
=
cu
.
intToString
(
dispersionGridSizeZ
);
pmeDefines["EPSILON_FACTOR"] = "1";
pmeDefines
[
"RECIP_EXP_FACTOR"
]
=
cu
.
doubleToString
(
M_PI
*
M_PI
/
(
dispersionAlpha
*
dispersionAlpha
));
pmeDefines
[
"USE_LJPME"
]
=
"1"
;
double
invRCut6
=
pow
(
force
.
getCutoffDistance
(),
-
6
);
...
...
@@ -1772,12 +1771,15 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
// Create required data structures.
int
elementSize
=
(
cu
.
getUseDoublePrecision
()
?
sizeof
(
double
)
:
sizeof
(
float
));
int gridElements = gridSizeX*gridSizeY*gridSizeZ;
if (doLJPME)
gridElements = max(gridElements, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ);
directPmeGrid.initialize(cu, gridElements, cu.getComputeCapability() >= 2.0 ? 2*elementSize : 2*sizeof(long long), "originalPmeGrid");
reciprocalPmeGrid.initialize(cu, gridElements, 2*elementSize, "reciprocalPmeGrid");
cu.addAutoclearBuffer(directPmeGrid);
int
roundedZSize
=
PmeOrder
*
(
int
)
ceil
(
gridSizeZ
/
(
double
)
PmeOrder
);
int
gridElements
=
gridSizeX
*
gridSizeY
*
roundedZSize
;
if
(
doLJPME
)
{
roundedZSize
=
PmeOrder
*
(
int
)
ceil
(
dispersionGridSizeZ
/
(
double
)
PmeOrder
);
gridElements
=
max
(
gridElements
,
dispersionGridSizeX
*
dispersionGridSizeY
*
roundedZSize
);
}
pmeGrid1
.
initialize
(
cu
,
gridElements
,
2
*
elementSize
,
"pmeGrid1"
);
pmeGrid2
.
initialize
(
cu
,
gridElements
,
2
*
elementSize
,
"pmeGrid2"
);
cu
.
addAutoclearBuffer
(
pmeGrid2
);
pmeBsplineModuliX
.
initialize
(
cu
,
gridSizeX
,
elementSize
,
"pmeBsplineModuliX"
);
pmeBsplineModuliY
.
initialize
(
cu
,
gridSizeY
,
elementSize
,
"pmeBsplineModuliY"
);
pmeBsplineModuliZ
.
initialize
(
cu
,
gridSizeZ
,
elementSize
,
"pmeBsplineModuliZ"
);
...
...
@@ -2093,7 +2095,7 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
void
*
forcesArgs
[]
=
{
&
cu
.
getForce
().
getDevicePointer
(),
&
cu
.
getPosq
().
getDevicePointer
(),
&
cosSinSums
.
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
()};
cu
.
executeKernel
(
ewaldForcesKernel
,
forcesArgs
,
cu
.
getNumAtoms
());
}
if (
directP
meGrid.isInitialized() && includeReciprocal) {
if
(
p
meGrid
1
.
isInitialized
()
&&
includeReciprocal
)
{
if
(
usePmeStream
)
cu
.
setCurrentStream
(
pmeStream
);
...
...
@@ -2133,50 +2135,48 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
sort
->
sort
(
pmeAtomGridIndex
);
void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &
directP
meGrid.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
void
*
spreadArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
p
meGrid
2
.
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
cu
.
getInvPeriodicBoxSizePointer
(),
cu
.
getPeriodicBoxVecXPointer
(),
cu
.
getPeriodicBoxVecYPointer
(),
cu
.
getPeriodicBoxVecZPointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
],
&
pmeAtomGridIndex
.
getDevicePointer
(),
&
charges
.
getDevicePointer
()};
cu
.
executeKernel
(
pmeSpreadChargeKernel
,
spreadArgs
,
cu
.
getNumAtoms
(),
128
);
if (cu.getUseDoublePrecision() || cu.getComputeCapability() < 2.0 || cu.getPlatformData().deterministicForces) {
void* finishSpreadArgs[] = {&directPmeGrid.getDevicePointer()};
cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
}
void
*
finishSpreadArgs
[]
=
{
&
pmeGrid2
.
getDevicePointer
(),
&
pmeGrid1
.
getDevicePointer
()};
cu
.
executeKernel
(
pmeFinishSpreadChargeKernel
,
finishSpreadArgs
,
gridSizeX
*
gridSizeY
*
gridSizeZ
,
256
);
if
(
useCudaFFT
)
{
if
(
cu
.
getUseDoublePrecision
())
cufftExecD2Z(fftForward, (double*)
directP
meGrid.getDevicePointer(), (double2*)
reciprocalP
meGrid.getDevicePointer());
cufftExecD2Z
(
fftForward
,
(
double
*
)
p
meGrid
1
.
getDevicePointer
(),
(
double2
*
)
p
meGrid
2
.
getDevicePointer
());
else
cufftExecR2C(fftForward, (float*)
directP
meGrid.getDevicePointer(), (float2*)
reciprocalP
meGrid.getDevicePointer());
cufftExecR2C
(
fftForward
,
(
float
*
)
p
meGrid
1
.
getDevicePointer
(),
(
float2
*
)
p
meGrid
2
.
getDevicePointer
());
}
else
{
fft->execFFT(
directP
meGrid,
reciprocalP
meGrid, true);
fft
->
execFFT
(
p
meGrid
1
,
p
meGrid
2
,
true
);
}
if
(
includeEnergy
)
{
void* computeEnergyArgs[] = {&
reciprocalP
meGrid.getDevicePointer(), usePmeStream ? &pmeEnergyBuffer.getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(),
void
*
computeEnergyArgs
[]
=
{
&
p
meGrid
2
.
getDevicePointer
(),
usePmeStream
?
&
pmeEnergyBuffer
.
getDevicePointer
()
:
&
cu
.
getEnergyBuffer
().
getDevicePointer
(),
&
pmeBsplineModuliX
.
getDevicePointer
(),
&
pmeBsplineModuliY
.
getDevicePointer
(),
&
pmeBsplineModuliZ
.
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
cu
.
executeKernel
(
pmeEvalEnergyKernel
,
computeEnergyArgs
,
gridSizeX
*
gridSizeY
*
gridSizeZ
);
}
void* convolutionArgs[] = {&
reciprocalP
meGrid.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
void
*
convolutionArgs
[]
=
{
&
p
meGrid
2
.
getDevicePointer
(),
&
cu
.
getEnergyBuffer
().
getDevicePointer
(),
&
pmeBsplineModuliX
.
getDevicePointer
(),
&
pmeBsplineModuliY
.
getDevicePointer
(),
&
pmeBsplineModuliZ
.
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
cu
.
executeKernel
(
pmeConvolutionKernel
,
convolutionArgs
,
gridSizeX
*
gridSizeY
*
gridSizeZ
,
256
);
if
(
useCudaFFT
)
{
if
(
cu
.
getUseDoublePrecision
())
cufftExecZ2D(fftBackward, (double2*)
reciprocalP
meGrid.getDevicePointer(), (double*)
directP
meGrid.getDevicePointer());
cufftExecZ2D
(
fftBackward
,
(
double2
*
)
p
meGrid
2
.
getDevicePointer
(),
(
double
*
)
p
meGrid
1
.
getDevicePointer
());
else
cufftExecC2R(fftBackward, (float2*)
reciprocalP
meGrid.getDevicePointer(), (float*)
directP
meGrid.getDevicePointer());
cufftExecC2R
(
fftBackward
,
(
float2
*
)
p
meGrid
2
.
getDevicePointer
(),
(
float
*
)
p
meGrid
1
.
getDevicePointer
());
}
else
{
fft->execFFT(
reciprocalP
meGrid,
directP
meGrid, false);
fft
->
execFFT
(
p
meGrid
2
,
p
meGrid
1
,
false
);
}
void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &
directP
meGrid.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
void
*
interpolateArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
cu
.
getForce
().
getDevicePointer
(),
&
p
meGrid
1
.
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
cu
.
getInvPeriodicBoxSizePointer
(),
cu
.
getPeriodicBoxVecXPointer
(),
cu
.
getPeriodicBoxVecYPointer
(),
cu
.
getPeriodicBoxVecZPointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
],
&
pmeAtomGridIndex
.
getDevicePointer
(),
&
charges
.
getDevicePointer
()};
...
...
@@ -2186,58 +2186,58 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
// As written, we check only the Electrostatic grid pointer to get here. We could separate them out, but for
// now we assume that LJPME can only be used if electrostatic PME is also active.
if
(
doLJPME
&&
hasLJ
)
{
void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeDispersionGridIndexKernel, gridIndexArgs, cu.getNumAtoms());
if
(
!
hasCoulomb
)
{
void
*
gridIndexArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
pmeAtomGridIndex
.
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
cu
.
getInvPeriodicBoxSizePointer
(),
cu
.
getPeriodicBoxVecXPointer
(),
cu
.
getPeriodicBoxVecYPointer
(),
cu
.
getPeriodicBoxVecZPointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
cu
.
executeKernel
(
pmeDispersionGridIndexKernel
,
gridIndexArgs
,
cu
.
getNumAtoms
());
sort->sort(pmeAtomGridIndex);
sort
->
sort
(
pmeAtomGridIndex
);
}
cu.clearBuffer(
directP
meGrid);
void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &
directP
meGrid.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu
.
clearBuffer
(
p
meGrid
2
);
void
*
spreadArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
p
meGrid
2
.
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
cu
.
getInvPeriodicBoxSizePointer
(),
cu
.
getPeriodicBoxVecXPointer
(),
cu
.
getPeriodicBoxVecYPointer
(),
cu
.
getPeriodicBoxVecZPointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
],
&
pmeAtomGridIndex
.
getDevicePointer
(),
&
sigmaEpsilon
.
getDevicePointer
()};
cu
.
executeKernel
(
pmeDispersionSpreadChargeKernel
,
spreadArgs
,
cu
.
getNumAtoms
(),
128
);
if (cu.getUseDoublePrecision() || cu.getComputeCapability() < 2.0 || cu.getPlatformData().deterministicForces) {
void* finishSpreadArgs[] = {&directPmeGrid.getDevicePointer()};
cu.executeKernel(pmeDispersionFinishSpreadChargeKernel, finishSpreadArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
}
void
*
finishSpreadArgs
[]
=
{
&
pmeGrid2
.
getDevicePointer
(),
&
pmeGrid1
.
getDevicePointer
()};
cu
.
executeKernel
(
pmeDispersionFinishSpreadChargeKernel
,
finishSpreadArgs
,
dispersionGridSizeX
*
dispersionGridSizeY
*
dispersionGridSizeZ
,
256
);
if
(
useCudaFFT
)
{
if
(
cu
.
getUseDoublePrecision
())
cufftExecD2Z(dispersionFftForward, (double*)
directP
meGrid.getDevicePointer(), (double2*)
reciprocalP
meGrid.getDevicePointer());
cufftExecD2Z
(
dispersionFftForward
,
(
double
*
)
p
meGrid
1
.
getDevicePointer
(),
(
double2
*
)
p
meGrid
2
.
getDevicePointer
());
else
cufftExecR2C(dispersionFftForward, (float*)
directP
meGrid.getDevicePointer(), (float2*)
reciprocalP
meGrid.getDevicePointer());
cufftExecR2C
(
dispersionFftForward
,
(
float
*
)
p
meGrid
1
.
getDevicePointer
(),
(
float2
*
)
p
meGrid
2
.
getDevicePointer
());
}
else
{
dispersionFft->execFFT(
directP
meGrid,
reciprocalP
meGrid, true);
dispersionFft
->
execFFT
(
p
meGrid
1
,
p
meGrid
2
,
true
);
}
if
(
includeEnergy
)
{
void* computeEnergyArgs[] = {&
reciprocalP
meGrid.getDevicePointer(), usePmeStream ? &pmeEnergyBuffer.getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(),
void
*
computeEnergyArgs
[]
=
{
&
p
meGrid
2
.
getDevicePointer
(),
usePmeStream
?
&
pmeEnergyBuffer
.
getDevicePointer
()
:
&
cu
.
getEnergyBuffer
().
getDevicePointer
(),
&
pmeDispersionBsplineModuliX
.
getDevicePointer
(),
&
pmeDispersionBsplineModuliY
.
getDevicePointer
(),
&
pmeDispersionBsplineModuliZ
.
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
cu
.
executeKernel
(
pmeEvalDispersionEnergyKernel
,
computeEnergyArgs
,
dispersionGridSizeX
*
dispersionGridSizeY
*
dispersionGridSizeZ
);
}
void* convolutionArgs[] = {&
reciprocalP
meGrid.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
void
*
convolutionArgs
[]
=
{
&
p
meGrid
2
.
getDevicePointer
(),
&
cu
.
getEnergyBuffer
().
getDevicePointer
(),
&
pmeDispersionBsplineModuliX
.
getDevicePointer
(),
&
pmeDispersionBsplineModuliY
.
getDevicePointer
(),
&
pmeDispersionBsplineModuliZ
.
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
cu
.
executeKernel
(
pmeDispersionConvolutionKernel
,
convolutionArgs
,
dispersionGridSizeX
*
dispersionGridSizeY
*
dispersionGridSizeZ
,
256
);
if
(
useCudaFFT
)
{
if
(
cu
.
getUseDoublePrecision
())
cufftExecZ2D(dispersionFftBackward, (double2*)
reciprocalP
meGrid.getDevicePointer(), (double*)
directP
meGrid.getDevicePointer());
cufftExecZ2D
(
dispersionFftBackward
,
(
double2
*
)
p
meGrid
2
.
getDevicePointer
(),
(
double
*
)
p
meGrid
1
.
getDevicePointer
());
else
cufftExecC2R(dispersionFftBackward, (float2*)
reciprocalP
meGrid.getDevicePointer(), (float*)
directP
meGrid.getDevicePointer());
cufftExecC2R
(
dispersionFftBackward
,
(
float2
*
)
p
meGrid
2
.
getDevicePointer
(),
(
float
*
)
p
meGrid
1
.
getDevicePointer
());
}
else
{
dispersionFft->execFFT(
reciprocalP
meGrid,
directP
meGrid, false);
dispersionFft
->
execFFT
(
p
meGrid
2
,
p
meGrid
1
,
false
);
}
void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &
directP
meGrid.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
void
*
interpolateArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
cu
.
getForce
().
getDevicePointer
(),
&
p
meGrid
1
.
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
cu
.
getInvPeriodicBoxSizePointer
(),
cu
.
getPeriodicBoxVecXPointer
(),
cu
.
getPeriodicBoxVecYPointer
(),
cu
.
getPeriodicBoxVecZPointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
],
&
pmeAtomGridIndex
.
getDevicePointer
(),
&
sigmaEpsilon
.
getDevicePointer
()};
...
...
platforms/cuda/src/kernels/pme.cu
View file @
4885a268
...
...
@@ -28,12 +28,25 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
,
const
real
*
__restrict__
charges
#endif
)
{
real3
data
[
PME_ORDER
];
const
real
scale
=
RECIP
(
PME_ORDER
-
1
);
// To improve memory efficiency, we divide indices along the z axis into
// PME_ORDER blocks, where the data for each block is stored together. We
// can ensure that all threads write to the same block at the same time,
// which leads to better coalescing of writes.
__shared__
int
zindexTable
[
GRID_SIZE_Z
+
PME_ORDER
];
int
blockSize
=
(
int
)
ceil
(
GRID_SIZE_Z
/
(
real
)
PME_ORDER
);
for
(
int
i
=
threadIdx
.
x
;
i
<
GRID_SIZE_Z
+
PME_ORDER
;
i
+=
blockDim
.
x
)
{
int
zindex
=
i
%
GRID_SIZE_Z
;
int
block
=
zindex
%
PME_ORDER
;
zindexTable
[
i
]
=
zindex
/
PME_ORDER
+
block
*
GRID_SIZE_X
*
GRID_SIZE_Y
*
blockSize
;
}
__syncthreads
();
// Process the atoms in spatially sorted order. This improves efficiency when writing
// the grid values.
real3
data
[
PME_ORDER
];
const
real
scale
=
RECIP
(
PME_ORDER
-
1
);
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
atom
=
pmeAtomGridIndex
[
i
].
x
;
real4
pos
=
posq
[
atom
];
...
...
@@ -41,7 +54,7 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
const
float2
sigEps
=
sigmaEpsilon
[
atom
];
const
real
charge
=
8
*
sigEps
.
x
*
sigEps
.
x
*
sigEps
.
x
*
sigEps
.
y
;
#else
const
real
charge
=
CHARGE
;
const
real
charge
=
(
CHARGE
)
*
EPSILON_FACTOR
;
#endif
if
(
charge
==
0
)
continue
;
...
...
@@ -76,35 +89,28 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
data
[
0
]
=
scale
*
(
make_real3
(
1
)
-
dr
)
*
data
[
0
];
// Spread the charge from this atom onto each grid point.
int
izoffset
=
(
PME_ORDER
-
(
gridIndex
.
z
%
PME_ORDER
))
%
PME_ORDER
;
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
int
xbase
=
gridIndex
.
x
+
ix
;
xbase
-=
(
xbase
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
);
xbase
=
xbase
*
GRID_SIZE_Y
*
GRID_SIZE_Z
;
real
dx
=
data
[
ix
].
x
;
xbase
=
xbase
*
GRID_SIZE_Y
;
real
dx
=
charge
*
data
[
ix
].
x
;
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
int
ybase
=
gridIndex
.
y
+
iy
;
ybase
-=
(
ybase
>=
GRID_SIZE_Y
?
GRID_SIZE_Y
:
0
);
ybase
=
xbase
+
ybase
*
GRID_SIZE_Z
;
real
dy
=
data
[
iy
].
y
;
for
(
int
iz
=
0
;
iz
<
PME_ORDER
;
iz
++
)
{
ybase
=
(
xbase
+
ybase
)
*
blockSize
;
real
dx
dy
=
dx
*
data
[
iy
].
y
;
for
(
int
i
=
0
;
i
<
PME_ORDER
;
i
++
)
{
int
iz
=
(
i
+
izoffset
)
%
PME_ORDER
;
int
zindex
=
gridIndex
.
z
+
iz
;
zindex
-=
(
zindex
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
);
int
index
=
ybase
+
zindex
;
real
add
=
charge
*
dx
*
dy
*
data
[
iz
].
z
;
#ifdef USE_DOUBLE_PRECISION
int
index
=
ybase
+
zindexTable
[
zindex
];
real
add
=
dxdy
*
data
[
iz
].
z
;
#if defined(USE_DOUBLE_PRECISION) || defined(USE_DETERMINISTIC_FORCES)
unsigned
long
long
*
ulonglong_p
=
(
unsigned
long
long
*
)
originalPmeGrid
;
atomicAdd
(
&
ulonglong_p
[
index
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
add
*
0x100000000
)));
#elif __CUDA_ARCH__ < 200 || defined(USE_DETERMINISTIC_FORCES)
unsigned
long
long
*
ulonglong_p
=
(
unsigned
long
long
*
)
originalPmeGrid
;
int
gridIndex
=
index
;
gridIndex
=
(
gridIndex
%
2
==
0
?
gridIndex
/
2
:
(
gridIndex
+
GRID_SIZE_X
*
GRID_SIZE_Y
*
GRID_SIZE_Z
)
/
2
);
atomicAdd
(
&
ulonglong_p
[
gridIndex
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
add
*
0x100000000
)));
#else
atomicAdd
(
&
originalPmeGrid
[
index
],
add
*
EPSILON_FACTOR
);
atomicAdd
(
&
originalPmeGrid
[
index
],
add
);
#endif
}
}
...
...
@@ -112,20 +118,36 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
}
}
extern
"C"
__global__
void
finishSpreadCharge
(
long
long
*
__restrict__
originalPmeGrid
)
{
real
*
floatGrid
=
(
real
*
)
originalPmeGrid
;
const
unsigned
int
gridSize
=
GRID_SIZE_X
*
GRID_SIZE_Y
*
GRID_SIZE_Z
;
real
scale
=
EPSILON_FACTOR
/
(
real
)
0x100000000
;
#ifdef USE_DOUBLE_PRECISION
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
gridSize
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
floatGrid
[
index
]
=
scale
*
originalPmeGrid
[
index
];
extern
"C"
__global__
void
finishSpreadCharge
(
#if defined(USE_DOUBLE_PRECISION) || defined(USE_DETERMINISTIC_FORCES)
const
long
long
*
__restrict__
grid1
,
#else
for
(
int
index
=
2
*
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
);
index
<
gridSize
;
index
+=
2
*
blockDim
.
x
*
gridDim
.
x
)
{
floatGrid
[
index
]
=
scale
*
originalPmeGrid
[
index
/
2
];
if
(
index
+
1
<
gridSize
)
floatGrid
[
index
+
1
]
=
scale
*
originalPmeGrid
[(
index
+
gridSize
+
1
)
/
2
];
const
real
*
__restrict__
grid1
,
#endif
real
*
__restrict__
grid2
)
{
// During charge spreading, we shuffled the order of indices along the z
// axis to make memory access more efficient. We now need to unshuffle
// them. If the values were accumulated as fixed point, we also need to
// convert them to floating point.
__shared__
int
zindexTable
[
GRID_SIZE_Z
];
int
blockSize
=
(
int
)
ceil
(
GRID_SIZE_Z
/
(
real
)
PME_ORDER
);
for
(
int
i
=
threadIdx
.
x
;
i
<
GRID_SIZE_Z
;
i
+=
blockDim
.
x
)
{
int
block
=
i
%
PME_ORDER
;
zindexTable
[
i
]
=
i
/
PME_ORDER
+
block
*
GRID_SIZE_X
*
GRID_SIZE_Y
*
blockSize
;
}
__syncthreads
();
const
unsigned
int
gridSize
=
GRID_SIZE_X
*
GRID_SIZE_Y
*
GRID_SIZE_Z
;
real
scale
=
1
/
(
real
)
0x100000000
;
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
gridSize
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
zindex
=
index
%
GRID_SIZE_Z
;
int
loadIndex
=
zindexTable
[
zindex
]
+
blockSize
*
(
int
)
(
index
/
GRID_SIZE_Z
);
#if defined(USE_DOUBLE_PRECISION) || defined(USE_DETERMINISTIC_FORCES)
grid2
[
index
]
=
scale
*
grid1
[
loadIndex
];
#else
grid2
[
index
]
=
grid1
[
loadIndex
];
#endif
}
}
// convolutes on the halfcomplex_pmeGrid, which is of size NX*NY*(NZ/2+1) as F(Q) is conjugate symmetric
...
...
platforms/opencl/include/OpenCLKernels.h
View file @
4885a268
...
...
@@ -663,7 +663,7 @@ private:
OpenCLArray
exceptionOffsetIndices
;
OpenCLArray
globalParams
;
OpenCLArray
cosSinSums
;
OpenCLArray
pmeGrid
;
OpenCLArray
pmeGrid
1
;
OpenCLArray
pmeGrid2
;
OpenCLArray
pmeBsplineModuliX
;
OpenCLArray
pmeBsplineModuliY
;
...
...
platforms/opencl/src/OpenCLKernels.cpp
View file @
4885a268
...
...
@@ -1726,15 +1726,18 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
defines["MULTSHIFT6"] = cl.doubleToString(multShift6);
}
int elementSize = (cl.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
int gridElements = gridSizeX*gridSizeY*gridSizeZ;
if (doLJPME)
gridElements = max(gridElements, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ);
pmeGrid.initialize(cl, gridElements, 2*elementSize, "pmeGrid");
int roundedZSize = PmeOrder*(int) ceil(gridSizeZ/(double) PmeOrder);
int gridElements = gridSizeX*gridSizeY*roundedZSize;
if (doLJPME) {
roundedZSize = PmeOrder*(int) ceil(dispersionGridSizeZ/(double) PmeOrder);
gridElements = max(gridElements, dispersionGridSizeX*dispersionGridSizeY*roundedZSize);
}
pmeGrid1.initialize(cl, gridElements, 2*elementSize, "pmeGrid1");
pmeGrid2.initialize(cl, gridElements, 2*elementSize, "pmeGrid2");
if (cl.getSupports64BitGlobalAtomics())
cl.addAutoclearBuffer(pmeGrid2);
else
cl.addAutoclearBuffer(pmeGrid);
cl.addAutoclearBuffer(pmeGrid
1
);
pmeBsplineModuliX.initialize(cl, gridSizeX, elementSize, "pmeBsplineModuliX");
pmeBsplineModuliY.initialize(cl, gridSizeY, elementSize, "pmeBsplineModuliY");
pmeBsplineModuliZ.initialize(cl, gridSizeZ, elementSize, "pmeBsplineModuliZ");
...
...
@@ -1755,8 +1758,6 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
dispersionFft = new OpenCLFFT3D(cl, dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ, true);
string vendor = cl.getDevice().getInfo<CL_DEVICE_VENDOR>();
bool isNvidia = (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA");
if (isNvidia)
pmeDefines["USE_ALTERNATE_MEMORY_ACCESS_PATTERN"] = "1";
usePmeQueue = (!cl.getPlatformData().disablePmeStream && isNvidia);
if (usePmeQueue) {
pmeDefines["USE_PME_STREAM"] = "1";
...
...
@@ -2006,7 +2007,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
ewaldForcesKernel.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer());
ewaldForcesKernel.setArg<cl::Buffer>(2, cosSinSums.getDeviceBuffer());
}
if (pmeGrid.isInitialized()) {
if (pmeGrid
1
.isInitialized()) {
// Create kernels for Coulomb PME.
map<string, string> replacements;
...
...
@@ -2036,7 +2037,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
if (cl.getSupports64BitGlobalAtomics())
pmeSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid2.getDeviceBuffer());
else
pmeSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid.getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid
1
.getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(4, pmeBsplineTheta.getDeviceBuffer());
if (deviceIsCpu || cl.getSupports64BitGlobalAtomics())
pmeSpreadChargeKernel.setArg<cl::Buffer>(13, charges.getDeviceBuffer());
...
...
@@ -2053,13 +2054,13 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeEvalEnergyKernel.setArg<cl::Buffer>(4, pmeBsplineModuliZ.getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(1, cl.getForceBuffers().getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(2, pmeGrid.getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(2, pmeGrid
1
.getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(11, pmeAtomGridIndex.getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(12, charges.getDeviceBuffer());
if (cl.getSupports64BitGlobalAtomics()) {
pmeFinishSpreadChargeKernel = cl::Kernel(program, "finishSpreadCharge");
pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(0, pmeGrid2.getDeviceBuffer());
pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid.getDeviceBuffer());
pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid
1
.getDeviceBuffer());
}
if (usePmeQueue)
syncQueue->setKernel(cl::Kernel(program, "addEnergy"));
...
...
@@ -2099,7 +2100,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
if (cl.getSupports64BitGlobalAtomics())
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid2.getDeviceBuffer());
else
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid.getDeviceBuffer());
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid
1
.getDeviceBuffer());
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(4, pmeBsplineTheta.getDeviceBuffer());
if (deviceIsCpu || cl.getSupports64BitGlobalAtomics())
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(13, sigmaEpsilon.getDeviceBuffer());
...
...
@@ -2116,13 +2117,13 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeDispersionEvalEnergyKernel.setArg<cl::Buffer>(4, pmeDispersionBsplineModuliZ.getDeviceBuffer());
pmeDispersionInterpolateForceKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
pmeDispersionInterpolateForceKernel.setArg<cl::Buffer>(1, cl.getForceBuffers().getDeviceBuffer());
pmeDispersionInterpolateForceKernel.setArg<cl::Buffer>(2, pmeGrid.getDeviceBuffer());
pmeDispersionInterpolateForceKernel.setArg<cl::Buffer>(2, pmeGrid
1
.getDeviceBuffer());
pmeDispersionInterpolateForceKernel.setArg<cl::Buffer>(11, pmeAtomGridIndex.getDeviceBuffer());
pmeDispersionInterpolateForceKernel.setArg<cl::Buffer>(12, sigmaEpsilon.getDeviceBuffer());
if (cl.getSupports64BitGlobalAtomics()) {
pmeDispersionFinishSpreadChargeKernel = cl::Kernel(program, "finishSpreadCharge");
pmeDispersionFinishSpreadChargeKernel.setArg<cl::Buffer>(0, pmeGrid2.getDeviceBuffer());
pmeDispersionFinishSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid.getDeviceBuffer());
pmeDispersionFinishSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid
1
.getDeviceBuffer());
}
}
}
...
...
@@ -2176,7 +2177,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
cl.executeKernel(ewaldSumsKernel, cosSinSums.getSize());
cl.executeKernel(ewaldForcesKernel, cl.getNumAtoms());
}
if (pmeGrid.isInitialized() && includeReciprocal) {
if (pmeGrid
1
.isInitialized() && includeReciprocal) {
if (usePmeQueue && !includeEnergy)
cl.setQueue(pmeQueue);
...
...
@@ -2251,7 +2252,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
cl.executeKernel(pmeSpreadChargeKernel, cl.getNumAtoms());
}
}
fft->execFFT(pmeGrid, pmeGrid2, true);
fft->execFFT(pmeGrid
1
, pmeGrid2, true);
mm_double4 boxSize = cl.getPeriodicBoxSizeDouble();
if (cl.getUseDoublePrecision()) {
pmeConvolutionKernel.setArg<mm_double4>(4, recipBoxVectors[0]);
...
...
@@ -2272,7 +2273,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
if (includeEnergy)
cl.executeKernel(pmeEvalEnergyKernel, gridSizeX*gridSizeY*gridSizeZ);
cl.executeKernel(pmeConvolutionKernel, gridSizeX*gridSizeY*gridSizeZ);
fft->execFFT(pmeGrid2, pmeGrid, false);
fft->execFFT(pmeGrid2, pmeGrid
1
, false);
setPeriodicBoxArgs(cl, pmeInterpolateForceKernel, 3);
if (cl.getUseDoublePrecision()) {
pmeInterpolateForceKernel.setArg<mm_double4>(8, recipBoxVectors[0]);
...
...
@@ -2304,7 +2305,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
}
cl.executeKernel(pmeDispersionUpdateBsplinesKernel, cl.getNumAtoms());
if (deviceIsCpu && !cl.getSupports64BitGlobalAtomics()) {
cl.clearBuffer(pmeGrid);
cl.clearBuffer(pmeGrid
1
);
setPeriodicBoxArgs(cl, pmeDispersionSpreadChargeKernel, 5);
if (cl.getUseDoublePrecision()) {
pmeDispersionSpreadChargeKernel.setArg<mm_double4>(10, recipBoxVectors[0]);
...
...
@@ -2319,7 +2320,8 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
cl.executeKernel(pmeDispersionSpreadChargeKernel, 2*cl.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(), 1);
}
else {
sort->sort(pmeAtomGridIndex);
if (!hasCoulomb)
sort->sort(pmeAtomGridIndex);
if (cl.getSupports64BitGlobalAtomics()) {
cl.clearBuffer(pmeGrid2);
setPeriodicBoxArgs(cl, pmeDispersionSpreadChargeKernel, 5);
...
...
@@ -2337,7 +2339,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
cl.executeKernel(pmeDispersionFinishSpreadChargeKernel, gridSizeX*gridSizeY*gridSizeZ);
}
else {
cl.clearBuffer(pmeGrid);
cl.clearBuffer(pmeGrid
1
);
cl.executeKernel(pmeDispersionAtomRangeKernel, cl.getNumAtoms());
setPeriodicBoxSizeArg(cl, pmeDispersionZIndexKernel, 2);
if (cl.getUseDoublePrecision())
...
...
@@ -2348,7 +2350,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
cl.executeKernel(pmeDispersionSpreadChargeKernel, cl.getNumAtoms());
}
}
dispersionFft->execFFT(pmeGrid, pmeGrid2, true);
dispersionFft->execFFT(pmeGrid
1
, pmeGrid2, true);
mm_double4 boxSize = cl.getPeriodicBoxSizeDouble();
if (cl.getUseDoublePrecision()) {
pmeDispersionConvolutionKernel.setArg<mm_double4>(4, recipBoxVectors[0]);
...
...
@@ -2369,7 +2371,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
if (includeEnergy)
cl.executeKernel(pmeDispersionEvalEnergyKernel, gridSizeX*gridSizeY*gridSizeZ);
cl.executeKernel(pmeDispersionConvolutionKernel, gridSizeX*gridSizeY*gridSizeZ);
dispersionFft->execFFT(pmeGrid2, pmeGrid, false);
dispersionFft->execFFT(pmeGrid2, pmeGrid
1
, false);
setPeriodicBoxArgs(cl, pmeDispersionInterpolateForceKernel, 3);
if (cl.getUseDoublePrecision()) {
pmeDispersionInterpolateForceKernel.setArg<mm_double4>(8, recipBoxVectors[0]);
...
...
platforms/opencl/src/kernels/pme.cl
View file @
4885a268
...
...
@@ -105,12 +105,25 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
,
__global
const
real*
restrict
charges
#
endif
)
{
const
real
scale
=
1/
(
real
)
(
PME_ORDER-1
)
;
real4
data[PME_ORDER]
;
//
To
improve
memory
efficiency,
we
divide
indices
along
the
z
axis
into
//
PME_ORDER
blocks,
where
the
data
for
each
block
is
stored
together.
We
//
can
ensure
that
all
threads
write
to
the
same
block
at
the
same
time,
//
which
leads
to
better
coalescing
of
writes.
__local
int
zindexTable[GRID_SIZE_Z+PME_ORDER]
;
int
blockSize
=
(
int
)
ceil
(
GRID_SIZE_Z/
(
real
)
PME_ORDER
)
;
for
(
int
i
=
get_local_id
(
0
)
; i < GRID_SIZE_Z+PME_ORDER; i += get_local_size(0)) {
int
zindex
=
i
%
GRID_SIZE_Z
;
int
block
=
zindex
%
PME_ORDER
;
zindexTable[i]
=
zindex/PME_ORDER
+
block*GRID_SIZE_X*GRID_SIZE_Y*blockSize
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
Process
the
atoms
in
spatially
sorted
order.
This
improves
efficiency
when
writing
//
the
grid
values.
const
real
scale
=
1/
(
real
)
(
PME_ORDER-1
)
;
real4
data[PME_ORDER]
;
for
(
int
i
=
get_global_id
(
0
)
; i < NUM_ATOMS; i += get_global_size(0)) {
int
atom
=
pmeAtomGridIndex[i].x
;
real4
pos
=
posq[atom]
;
...
...
@@ -118,7 +131,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
const
float2
sigEps
=
sigmaEpsilon[atom]
;
const
real
charge
=
8*sigEps.x*sigEps.x*sigEps.x*sigEps.y
;
#
else
const
real
charge
=
CHARGE
;
const
real
charge
=
(
CHARGE
)
*EPSILON_FACTOR
;
#
endif
if
(
charge
==
0
)
continue
;
...
...
@@ -154,40 +167,47 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
//
Spread
the
charge
from
this
atom
onto
each
grid
point.
int
izoffset
=
(
PME_ORDER-
(
gridIndex.z%PME_ORDER
))
%
PME_ORDER
;
for
(
int
ix
=
0
; ix < PME_ORDER; ix++) {
int
xindex
=
gridIndex.x+ix
;
xindex
-=
(
xindex
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
)
;
int
xbase
=
gridIndex.x+ix
;
xbase
-=
(
xbase
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
)
;
xbase
=
xbase*GRID_SIZE_Y
;
real
dx
=
charge*data[ix].x
;
for
(
int
iy
=
0
; iy < PME_ORDER; iy++) {
int
yindex
=
gridIndex.y+iy
;
yindex
-=
(
yindex
>=
GRID_SIZE_Y
?
GRID_SIZE_Y
:
0
)
;
for
(
int
iz
=
0
; iz < PME_ORDER; iz++) {
int
ybase
=
gridIndex.y+iy
;
ybase
-=
(
ybase
>=
GRID_SIZE_Y
?
GRID_SIZE_Y
:
0
)
;
ybase
=
(
xbase+ybase
)
*blockSize
;
real
dxdy
=
dx*data[iy].y
;
for
(
int
i
=
0
; i < PME_ORDER; i++) {
int
iz
=
(
i+izoffset
)
%
PME_ORDER
;
int
zindex
=
gridIndex.z+iz
;
zindex
-=
(
zindex
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
)
;
int
index
=
xindex*GRID_SIZE_Y*GRID_SIZE_Z
+
yindex*GRID_SIZE_Z
+
zindex
;
real
add
=
charge*data[ix].x*data[iy].y*data[iz].z
;
#
ifdef
USE_ALTERNATE_MEMORY_ACCESS_PATTERN
//
On
Nvidia
devices
(
at
least
Maxwell
anyway
)
,
this
split
ordering
produces
much
higher
performance.
Why?
//
I
have
no
idea!
And
of
course
on
AMD
it
produces
slower
performance.
GPUs
are
not
meant
to
be
understood.
atom_add
(
&pmeGrid[index%2
==
0
?
index/2
:
(
index+GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z
)
/2],
(
long
)
(
add*0x100000000
))
;
#
else
int
index
=
ybase
+
zindexTable[zindex]
;
real
add
=
dxdy*data[iz].z
;
atom_add
(
&pmeGrid[index],
(
long
)
(
add*0x100000000
))
;
#
endif
}
}
}
}
}
__kernel
void
finishSpreadCharge
(
__global
long*
restrict
fixedGrid,
__global
real*
restrict
realGrid
)
{
__kernel
void
finishSpreadCharge
(
__global
long*
restrict
grid1,
__global
real*
restrict
grid2
)
{
//
During
charge
spreading,
we
shuffled
the
order
of
indices
along
the
z
//
axis
to
make
memory
access
more
efficient.
We
now
need
to
unshuffle
//
them
and
convert
fixed
point
values
to
floating
point.
__local
int
zindexTable[GRID_SIZE_Z]
;
int
blockSize
=
(
int
)
ceil
(
GRID_SIZE_Z/
(
real
)
PME_ORDER
)
;
for
(
int
i
=
get_local_id
(
0
)
; i < GRID_SIZE_Z; i += get_local_size(0)) {
int
block
=
i
%
PME_ORDER
;
zindexTable[i]
=
i/PME_ORDER
+
block*GRID_SIZE_X*GRID_SIZE_Y*blockSize
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
const
unsigned
int
gridSize
=
GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z
;
real
scale
=
EPSILON_FACTOR
/
(
real
)
0x100000000
;
real
scale
=
1
/
(
real
)
0x100000000
;
for
(
int
index
=
get_global_id
(
0
)
; index < gridSize; index += get_global_size(0)) {
#
ifdef
USE_ALTERNATE_MEMORY_ACCESS_PATTERN
long
value
=
fixedGrid[index%2
==
0
?
index/2
:
(
index+gridSize
)
/2]
;
#
else
long
value
=
fixedGrid[index]
;
#
endif
realGrid[index]
=
(
real
)
(
value*scale
)
;
int
zindex
=
index%GRID_SIZE_Z
;
int
loadIndex
=
zindexTable[zindex]
+
blockSize*
(
int
)
(
index/GRID_SIZE_Z
)
;
grid2[index]
=
scale*grid1[loadIndex]
;
}
}
#
elif
defined
(
DEVICE_IS_CPU
)
...
...
@@ -230,7 +250,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
const
float2
sigEps
=
sigmaEpsilon[atom]
;
const
real
charge
=
8*sigEps.x*sigEps.x*sigEps.x*sigEps.y
;
#
else
const
real
charge
=
CHARGE
;
const
real
charge
=
(
CHARGE
)
*EPSILON_FACTOR
;
#
endif
if
(
charge
==
0
)
continue
;
...
...
@@ -269,7 +289,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
int zindex = gridIndex.z+iz;
zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
pmeGrid[index] +=
EPSILON_FACTOR*
charge*data[ix].x*data[iy].y*data[iz].z;
pmeGrid[index] += charge*data[ix].x*data[iy].y*data[iz].z;
}
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment