Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
1b12ede8
Commit
1b12ede8
authored
Jul 12, 2018
by
Peter Eastman
Browse files
Optimizations to PME in CUDA
parent
dea16a26
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
103 additions
and
77 deletions
+103
-77
platforms/cuda/include/CudaKernels.h
platforms/cuda/include/CudaKernels.h
+2
-2
platforms/cuda/src/CudaKernels.cpp
platforms/cuda/src/CudaKernels.cpp
+42
-41
platforms/cuda/src/kernels/pme.cu
platforms/cuda/src/kernels/pme.cu
+59
-34
No files found.
platforms/cuda/include/CudaKernels.h
View file @
1b12ede8
...
...
@@ -686,8 +686,8 @@ private:
CudaArray
exceptionOffsetIndices
;
CudaArray
globalParams
;
CudaArray
cosSinSums
;
CudaArray
directP
meGrid
;
CudaArray
reciprocalP
meGrid
;
CudaArray
p
meGrid
1
;
CudaArray
p
meGrid
2
;
CudaArray
pmeBsplineModuliX
;
CudaArray
pmeBsplineModuliY
;
CudaArray
pmeBsplineModuliZ
;
...
...
platforms/cuda/src/CudaKernels.cpp
View file @
1b12ede8
...
...
@@ -1772,12 +1772,15 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
// Create required data structures.
int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
int gridElements = gridSizeX*gridSizeY*gridSizeZ;
if (doLJPME)
gridElements = max(gridElements, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ);
directPmeGrid.initialize(cu, gridElements, cu.getComputeCapability() >= 2.0 ? 2*elementSize : 2*sizeof(long long), "originalPmeGrid");
reciprocalPmeGrid.initialize(cu, gridElements, 2*elementSize, "reciprocalPmeGrid");
cu.addAutoclearBuffer(directPmeGrid);
int roundedZSize = PmeOrder*(int) ceil(gridSizeZ/(double) PmeOrder);
int gridElements = gridSizeX*gridSizeY*roundedZSize;
if (doLJPME) {
roundedZSize = PmeOrder*(int) ceil(dispersionGridSizeZ/(double) PmeOrder);
gridElements = max(gridElements, dispersionGridSizeX*dispersionGridSizeY*roundedZSize);
}
pmeGrid1.initialize(cu, gridElements, 2*elementSize, "originalPmeGrid");
pmeGrid2.initialize(cu, gridElements, 2*elementSize, "pmeGrid2");
cu.addAutoclearBuffer(pmeGrid2);
pmeBsplineModuliX.initialize(cu, gridSizeX, elementSize, "pmeBsplineModuliX");
pmeBsplineModuliY.initialize(cu, gridSizeY, elementSize, "pmeBsplineModuliY");
pmeBsplineModuliZ.initialize(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
...
...
@@ -2093,7 +2096,7 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
void* forcesArgs[] = {&cu.getForce().getDevicePointer(), &cu.getPosq().getDevicePointer(), &cosSinSums.getDevicePointer(), cu.getPeriodicBoxSizePointer()};
cu.executeKernel(ewaldForcesKernel, forcesArgs, cu.getNumAtoms());
}
if (
directP
meGrid.isInitialized() && includeReciprocal) {
if (
p
meGrid
1
.isInitialized() && includeReciprocal) {
if (usePmeStream)
cu.setCurrentStream(pmeStream);
...
...
@@ -2133,50 +2136,48 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
sort->sort(pmeAtomGridIndex);
void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &
directP
meGrid.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &
p
meGrid
2
.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&charges.getDevicePointer()};
cu.executeKernel(pmeSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128);
if (cu.getUseDoublePrecision() || cu.getComputeCapability() < 2.0 || cu.getPlatformData().deterministicForces) {
void* finishSpreadArgs[] = {&directPmeGrid.getDevicePointer()};
cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
}
void* finishSpreadArgs[] = {&pmeGrid2.getDevicePointer(), &pmeGrid1.getDevicePointer()};
cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
if (useCudaFFT) {
if (cu.getUseDoublePrecision())
cufftExecD2Z(fftForward, (double*)
directP
meGrid.getDevicePointer(), (double2*)
reciprocalP
meGrid.getDevicePointer());
cufftExecD2Z(fftForward, (double*)
p
meGrid
1
.getDevicePointer(), (double2*)
p
meGrid
2
.getDevicePointer());
else
cufftExecR2C(fftForward, (float*)
directP
meGrid.getDevicePointer(), (float2*)
reciprocalP
meGrid.getDevicePointer());
cufftExecR2C(fftForward, (float*)
p
meGrid
1
.getDevicePointer(), (float2*)
p
meGrid
2
.getDevicePointer());
}
else {
fft->execFFT(
directP
meGrid,
reciprocalP
meGrid, true);
fft->execFFT(
p
meGrid
1
,
p
meGrid
2
, true);
}
if (includeEnergy) {
void* computeEnergyArgs[] = {&
reciprocalP
meGrid.getDevicePointer(), usePmeStream ? &pmeEnergyBuffer.getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(),
void* computeEnergyArgs[] = {&
p
meGrid
2
.getDevicePointer(), usePmeStream ? &pmeEnergyBuffer.getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(),
&pmeBsplineModuliX.getDevicePointer(), &pmeBsplineModuliY.getDevicePointer(), &pmeBsplineModuliZ.getDevicePointer(),
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeEvalEnergyKernel, computeEnergyArgs, gridSizeX*gridSizeY*gridSizeZ);
}
void* convolutionArgs[] = {&
reciprocalP
meGrid.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
void* convolutionArgs[] = {&
p
meGrid
2
.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
&pmeBsplineModuliX.getDevicePointer(), &pmeBsplineModuliY.getDevicePointer(), &pmeBsplineModuliZ.getDevicePointer(),
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeConvolutionKernel, convolutionArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
if (useCudaFFT) {
if (cu.getUseDoublePrecision())
cufftExecZ2D(fftBackward, (double2*)
reciprocalP
meGrid.getDevicePointer(), (double*)
directP
meGrid.getDevicePointer());
cufftExecZ2D(fftBackward, (double2*)
p
meGrid
2
.getDevicePointer(), (double*)
p
meGrid
1
.getDevicePointer());
else
cufftExecC2R(fftBackward, (float2*)
reciprocalP
meGrid.getDevicePointer(), (float*)
directP
meGrid.getDevicePointer());
cufftExecC2R(fftBackward, (float2*)
p
meGrid
2
.getDevicePointer(), (float*)
p
meGrid
1
.getDevicePointer());
}
else {
fft->execFFT(
reciprocalP
meGrid,
directP
meGrid, false);
fft->execFFT(
p
meGrid
2
,
p
meGrid
1
, false);
}
void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &
directP
meGrid.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &
p
meGrid
1
.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&charges.getDevicePointer()};
...
...
@@ -2186,58 +2187,58 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
// As written, we reach this point by checking only the electrostatic grid pointer. We could check the two grids separately, but for
// now we assume that LJPME can only be used if electrostatic PME is also active.
if (doLJPME && hasLJ) {
void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeDispersionGridIndexKernel, gridIndexArgs, cu.getNumAtoms());
if (!hasCoulomb) {
void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeDispersionGridIndexKernel, gridIndexArgs, cu.getNumAtoms());
sort->sort(pmeAtomGridIndex);
sort->sort(pmeAtomGridIndex);
}
cu.clearBuffer(
directP
meGrid);
void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &
directP
meGrid.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.clearBuffer(
p
meGrid
1
);
void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &
p
meGrid
1
.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&sigmaEpsilon.getDevicePointer()};
cu.executeKernel(pmeDispersionSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128);
if (cu.getUseDoublePrecision() || cu.getComputeCapability() < 2.0 || cu.getPlatformData().deterministicForces) {
void* finishSpreadArgs[] = {&directPmeGrid.getDevicePointer()};
cu.executeKernel(pmeDispersionFinishSpreadChargeKernel, finishSpreadArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
}
void* finishSpreadArgs[] = {&pmeGrid1.getDevicePointer()};
cu.executeKernel(pmeDispersionFinishSpreadChargeKernel, finishSpreadArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
if (useCudaFFT) {
if (cu.getUseDoublePrecision())
cufftExecD2Z(dispersionFftForward, (double*)
directP
meGrid.getDevicePointer(), (double2*)
reciprocalP
meGrid.getDevicePointer());
cufftExecD2Z(dispersionFftForward, (double*)
p
meGrid
1
.getDevicePointer(), (double2*)
p
meGrid
2
.getDevicePointer());
else
cufftExecR2C(dispersionFftForward, (float*)
directP
meGrid.getDevicePointer(), (float2*)
reciprocalP
meGrid.getDevicePointer());
cufftExecR2C(dispersionFftForward, (float*)
p
meGrid
1
.getDevicePointer(), (float2*)
p
meGrid
2
.getDevicePointer());
}
else {
dispersionFft->execFFT(
directP
meGrid,
reciprocalP
meGrid, true);
dispersionFft->execFFT(
p
meGrid
1
,
p
meGrid
2
, true);
}
if (includeEnergy) {
void* computeEnergyArgs[] = {&
reciprocalP
meGrid.getDevicePointer(), usePmeStream ? &pmeEnergyBuffer.getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(),
void* computeEnergyArgs[] = {&
p
meGrid
2
.getDevicePointer(), usePmeStream ? &pmeEnergyBuffer.getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(),
&pmeDispersionBsplineModuliX.getDevicePointer(), &pmeDispersionBsplineModuliY.getDevicePointer(), &pmeDispersionBsplineModuliZ.getDevicePointer(),
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeEvalDispersionEnergyKernel, computeEnergyArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ);
}
void* convolutionArgs[] = {&
reciprocalP
meGrid.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
void* convolutionArgs[] = {&
p
meGrid
2
.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
&pmeDispersionBsplineModuliX.getDevicePointer(), &pmeDispersionBsplineModuliY.getDevicePointer(), &pmeDispersionBsplineModuliZ.getDevicePointer(),
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeDispersionConvolutionKernel, convolutionArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
if (useCudaFFT) {
if (cu.getUseDoublePrecision())
cufftExecZ2D(dispersionFftBackward, (double2*)
reciprocalP
meGrid.getDevicePointer(), (double*)
directP
meGrid.getDevicePointer());
cufftExecZ2D(dispersionFftBackward, (double2*)
p
meGrid
2
.getDevicePointer(), (double*)
p
meGrid
1
.getDevicePointer());
else
cufftExecC2R(dispersionFftBackward, (float2*)
reciprocalP
meGrid.getDevicePointer(), (float*)
directP
meGrid.getDevicePointer());
cufftExecC2R(dispersionFftBackward, (float2*)
p
meGrid
2
.getDevicePointer(), (float*)
p
meGrid
1
.getDevicePointer());
}
else {
dispersionFft->execFFT(
reciprocalP
meGrid,
directP
meGrid, false);
dispersionFft->execFFT(
p
meGrid
2
,
p
meGrid
1
, false);
}
void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &
directP
meGrid.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &
p
meGrid
1
.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&sigmaEpsilon.getDevicePointer()};
...
...
platforms/cuda/src/kernels/pme.cu
View file @
1b12ede8
...
...
@@ -28,20 +28,33 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
,
const
real
*
__restrict__
charges
#endif
)
{
real3
data
[
PME_ORDER
];
const
real
scale
=
RECIP
(
PME_ORDER
-
1
);
// To improve memory efficiency, we divide indices along the z axis into
// PME_ORDER blocks, where the data for each block is stored together. We
// can ensure that all threads write to the same block at the same time,
// which leads to better coalescing of writes.
__shared__
int
zindexTable
[
GRID_SIZE_Z
+
PME_ORDER
];
int
blockSize
=
(
int
)
ceil
(
GRID_SIZE_Z
/
(
real
)
PME_ORDER
);
for
(
int
i
=
threadIdx
.
x
;
i
<
GRID_SIZE_Z
+
PME_ORDER
;
i
+=
blockDim
.
x
)
{
int
zindex
=
i
%
GRID_SIZE_Z
;
int
block
=
zindex
%
PME_ORDER
;
zindexTable
[
i
]
=
zindex
/
PME_ORDER
+
block
*
GRID_SIZE_X
*
GRID_SIZE_Y
*
blockSize
;
}
__syncthreads
();
// Process the atoms in spatially sorted order. This improves efficiency when writing
// the grid values.
real3
data
[
PME_ORDER
];
const
real
scale
=
RECIP
(
PME_ORDER
-
1
);
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
atom
=
pmeAtomGridIndex
[
i
].
x
;
real4
pos
=
posq
[
atom
];
#ifdef USE_LJPME
const
float2
sigEps
=
sigmaEpsilon
[
atom
];
const
real
charge
=
8
*
sigEps
.
x
*
sigEps
.
x
*
sigEps
.
x
*
sigEps
.
y
;
const
real
charge
=
8
*
sigEps
.
x
*
sigEps
.
x
*
sigEps
.
x
*
sigEps
.
y
*
EPSILON_FACTOR
;
#else
const
real
charge
=
CHARGE
;
const
real
charge
=
(
CHARGE
)
*
EPSILON_FACTOR
;
#endif
if
(
charge
==
0
)
continue
;
...
...
@@ -76,35 +89,28 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
data
[
0
]
=
scale
*
(
make_real3
(
1
)
-
dr
)
*
data
[
0
];
// Spread the charge from this atom onto each grid point.
int
izoffset
=
(
PME_ORDER
-
(
gridIndex
.
z
%
PME_ORDER
))
%
PME_ORDER
;
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
int
xbase
=
gridIndex
.
x
+
ix
;
xbase
-=
(
xbase
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
);
xbase
=
xbase
*
GRID_SIZE_Y
*
GRID_SIZE_Z
;
real
dx
=
data
[
ix
].
x
;
xbase
=
xbase
*
GRID_SIZE_Y
;
real
dx
=
charge
*
data
[
ix
].
x
;
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
int
ybase
=
gridIndex
.
y
+
iy
;
ybase
-=
(
ybase
>=
GRID_SIZE_Y
?
GRID_SIZE_Y
:
0
);
ybase
=
xbase
+
ybase
*
GRID_SIZE_Z
;
real
dy
=
data
[
iy
].
y
;
for
(
int
iz
=
0
;
iz
<
PME_ORDER
;
iz
++
)
{
ybase
=
(
xbase
+
ybase
)
*
blockSize
;
real
dx
dy
=
dx
*
data
[
iy
].
y
;
for
(
int
i
=
0
;
i
<
PME_ORDER
;
i
++
)
{
int
iz
=
(
i
+
izoffset
)
%
PME_ORDER
;
int
zindex
=
gridIndex
.
z
+
iz
;
zindex
-=
(
zindex
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
);
int
index
=
ybase
+
zindex
;
real
add
=
charge
*
dx
*
dy
*
data
[
iz
].
z
;
#ifdef USE_DOUBLE_PRECISION
int
index
=
ybase
+
zindexTable
[
zindex
];
real
add
=
dxdy
*
data
[
iz
].
z
;
#if defined(USE_DOUBLE_PRECISION) || defined(USE_DETERMINISTIC_FORCES)
unsigned
long
long
*
ulonglong_p
=
(
unsigned
long
long
*
)
originalPmeGrid
;
atomicAdd
(
&
ulonglong_p
[
index
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
add
*
0x100000000
)));
#elif __CUDA_ARCH__ < 200 || defined(USE_DETERMINISTIC_FORCES)
unsigned
long
long
*
ulonglong_p
=
(
unsigned
long
long
*
)
originalPmeGrid
;
int
gridIndex
=
index
;
gridIndex
=
(
gridIndex
%
2
==
0
?
gridIndex
/
2
:
(
gridIndex
+
GRID_SIZE_X
*
GRID_SIZE_Y
*
GRID_SIZE_Z
)
/
2
);
atomicAdd
(
&
ulonglong_p
[
gridIndex
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
add
*
0x100000000
)));
#else
atomicAdd
(
&
originalPmeGrid
[
index
],
add
*
EPSILON_FACTOR
);
atomicAdd
(
&
originalPmeGrid
[
index
],
add
);
#endif
}
}
...
...
@@ -112,20 +118,39 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
}
}
extern
"C"
__global__
void
finishSpreadCharge
(
long
long
*
__restrict__
originalPmeGrid
)
{
real
*
floatGrid
=
(
real
*
)
originalPmeGrid
;
const
unsigned
int
gridSize
=
GRID_SIZE_X
*
GRID_SIZE_Y
*
GRID_SIZE_Z
;
real
scale
=
EPSILON_FACTOR
/
(
real
)
0x100000000
;
#ifdef USE_DOUBLE_PRECISION
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
gridSize
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
floatGrid
[
index
]
=
scale
*
originalPmeGrid
[
index
];
extern
"C"
__global__
void
finishSpreadCharge
(
#if defined(USE_DOUBLE_PRECISION) || defined(USE_DETERMINISTIC_FORCES)
const
long
long
*
__restrict__
grid1
,
#else
for
(
int
index
=
2
*
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
);
index
<
gridSize
;
index
+=
2
*
blockDim
.
x
*
gridDim
.
x
)
{
floatGrid
[
index
]
=
scale
*
originalPmeGrid
[
index
/
2
];
if
(
index
+
1
<
gridSize
)
floatGrid
[
index
+
1
]
=
scale
*
originalPmeGrid
[(
index
+
gridSize
+
1
)
/
2
];
const
real
*
__restrict__
grid1
,
#endif
real
*
__restrict__
grid2
)
{
// During charge spreading, we shuffled the order of indices along the z
// axis to make memory access more efficient. We now need to unshuffle
// them. If the values were accumulated as fixed point, we also need to
// convert them to floating point.
__shared__
int
zindexTable
[
GRID_SIZE_Z
];
int
blockSize
=
(
int
)
ceil
(
GRID_SIZE_Z
/
(
real
)
PME_ORDER
);
for
(
int
i
=
threadIdx
.
x
;
i
<
GRID_SIZE_Z
;
i
+=
blockDim
.
x
)
{
int
block
=
i
%
PME_ORDER
;
zindexTable
[
i
]
=
i
/
PME_ORDER
+
block
*
GRID_SIZE_X
*
GRID_SIZE_Y
*
blockSize
;
}
__syncthreads
();
const
unsigned
int
gridSize
=
GRID_SIZE_X
*
GRID_SIZE_Y
*
GRID_SIZE_Z
;
real
scale
=
1
/
(
real
)
0x100000000
;
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
gridSize
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
xindex
=
index
/
(
GRID_SIZE_Y
*
GRID_SIZE_Z
);
int
remainder
=
index
-
xindex
*
GRID_SIZE_Y
*
GRID_SIZE_Z
;
int
yindex
=
remainder
/
GRID_SIZE_Z
;
int
zindex
=
remainder
-
yindex
*
GRID_SIZE_Z
;
int
loadIndex
=
zindexTable
[
zindex
]
+
(
xindex
*
GRID_SIZE_Y
+
yindex
)
*
blockSize
;
#if defined(USE_DOUBLE_PRECISION) || defined(USE_DETERMINISTIC_FORCES)
grid2
[
index
]
=
scale
*
grid1
[
loadIndex
];
#else
grid2
[
index
]
=
grid1
[
loadIndex
];
#endif
}
}
// Convolves the halfcomplex_pmeGrid, which has size NX*NY*(NZ/2+1) because F(Q) is conjugate symmetric
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment