Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
4885a268
Unverified
Commit
4885a268
authored
Jul 16, 2018
by
peastman
Committed by
GitHub
Jul 16, 2018
Browse files
Merge pull request #2121 from peastman/pme
Optimizations to PME charge spreading
parents
38ae2563
73760081
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
171 additions
and
127 deletions
+171
-127
platforms/cuda/include/CudaKernels.h
platforms/cuda/include/CudaKernels.h
+2
-2
platforms/cuda/src/CudaKernels.cpp
platforms/cuda/src/CudaKernels.cpp
+42
-42
platforms/cuda/src/kernels/pme.cu
platforms/cuda/src/kernels/pme.cu
+55
-33
platforms/opencl/include/OpenCLKernels.h
platforms/opencl/include/OpenCLKernels.h
+1
-1
platforms/opencl/src/OpenCLKernels.cpp
platforms/opencl/src/OpenCLKernels.cpp
+24
-22
platforms/opencl/src/kernels/pme.cl
platforms/opencl/src/kernels/pme.cl
+47
-27
No files found.
platforms/cuda/include/CudaKernels.h
View file @
4885a268
...
@@ -686,8 +686,8 @@ private:
...
@@ -686,8 +686,8 @@ private:
CudaArray
exceptionOffsetIndices
;
CudaArray
exceptionOffsetIndices
;
CudaArray
globalParams
;
CudaArray
globalParams
;
CudaArray
cosSinSums
;
CudaArray
cosSinSums
;
CudaArray
directP
meGrid
;
CudaArray
p
meGrid
1
;
CudaArray
reciprocalP
meGrid
;
CudaArray
p
meGrid
2
;
CudaArray
pmeBsplineModuliX
;
CudaArray
pmeBsplineModuliX
;
CudaArray
pmeBsplineModuliY
;
CudaArray
pmeBsplineModuliY
;
CudaArray
pmeBsplineModuliZ
;
CudaArray
pmeBsplineModuliZ
;
...
...
platforms/cuda/src/CudaKernels.cpp
View file @
4885a268
...
@@ -1749,7 +1749,6 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
...
@@ -1749,7 +1749,6 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
pmeDefines["GRID_SIZE_X"] = cu.intToString(dispersionGridSizeX);
pmeDefines["GRID_SIZE_X"] = cu.intToString(dispersionGridSizeX);
pmeDefines["GRID_SIZE_Y"] = cu.intToString(dispersionGridSizeY);
pmeDefines["GRID_SIZE_Y"] = cu.intToString(dispersionGridSizeY);
pmeDefines["GRID_SIZE_Z"] = cu.intToString(dispersionGridSizeZ);
pmeDefines["GRID_SIZE_Z"] = cu.intToString(dispersionGridSizeZ);
pmeDefines["EPSILON_FACTOR"] = "1";
pmeDefines["RECIP_EXP_FACTOR"] = cu.doubleToString(M_PI*M_PI/(dispersionAlpha*dispersionAlpha));
pmeDefines["RECIP_EXP_FACTOR"] = cu.doubleToString(M_PI*M_PI/(dispersionAlpha*dispersionAlpha));
pmeDefines["USE_LJPME"] = "1";
pmeDefines["USE_LJPME"] = "1";
double invRCut6 = pow(force.getCutoffDistance(), -6);
double invRCut6 = pow(force.getCutoffDistance(), -6);
...
@@ -1772,12 +1771,15 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
...
@@ -1772,12 +1771,15 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
// Create required data structures.
// Create required data structures.
int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
int gridElements = gridSizeX*gridSizeY*gridSizeZ;
int roundedZSize = PmeOrder*(int) ceil(gridSizeZ/(double) PmeOrder);
if (doLJPME)
int gridElements = gridSizeX*gridSizeY*roundedZSize;
gridElements = max(gridElements, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ);
if (doLJPME) {
directPmeGrid.initialize(cu, gridElements, cu.getComputeCapability() >= 2.0 ? 2*elementSize : 2*sizeof(long long), "originalPmeGrid");
roundedZSize = PmeOrder*(int) ceil(dispersionGridSizeZ/(double) PmeOrder);
reciprocalPmeGrid.initialize(cu, gridElements, 2*elementSize, "reciprocalPmeGrid");
gridElements = max(gridElements, dispersionGridSizeX*dispersionGridSizeY*roundedZSize);
cu.addAutoclearBuffer(directPmeGrid);
}
pmeGrid1.initialize(cu, gridElements, 2*elementSize, "pmeGrid1");
pmeGrid2.initialize(cu, gridElements, 2*elementSize, "pmeGrid2");
cu.addAutoclearBuffer(pmeGrid2);
pmeBsplineModuliX.initialize(cu, gridSizeX, elementSize, "pmeBsplineModuliX");
pmeBsplineModuliX.initialize(cu, gridSizeX, elementSize, "pmeBsplineModuliX");
pmeBsplineModuliY.initialize(cu, gridSizeY, elementSize, "pmeBsplineModuliY");
pmeBsplineModuliY.initialize(cu, gridSizeY, elementSize, "pmeBsplineModuliY");
pmeBsplineModuliZ.initialize(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
pmeBsplineModuliZ.initialize(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
...
@@ -2093,7 +2095,7 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
...
@@ -2093,7 +2095,7 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
void* forcesArgs[] = {&cu.getForce().getDevicePointer(), &cu.getPosq().getDevicePointer(), &cosSinSums.getDevicePointer(), cu.getPeriodicBoxSizePointer()};
void* forcesArgs[] = {&cu.getForce().getDevicePointer(), &cu.getPosq().getDevicePointer(), &cosSinSums.getDevicePointer(), cu.getPeriodicBoxSizePointer()};
cu.executeKernel(ewaldForcesKernel, forcesArgs, cu.getNumAtoms());
cu.executeKernel(ewaldForcesKernel, forcesArgs, cu.getNumAtoms());
}
}
if (
directP
meGrid.isInitialized() && includeReciprocal) {
if (
p
meGrid
1
.isInitialized() && includeReciprocal) {
if (usePmeStream)
if (usePmeStream)
cu.setCurrentStream(pmeStream);
cu.setCurrentStream(pmeStream);
...
@@ -2133,50 +2135,48 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
...
@@ -2133,50 +2135,48 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
sort->sort(pmeAtomGridIndex);
sort->sort(pmeAtomGridIndex);
void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &
directP
meGrid.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &
p
meGrid
2
.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&charges.getDevicePointer()};
&charges.getDevicePointer()};
cu.executeKernel(pmeSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128);
cu.executeKernel(pmeSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128);
if (cu.getUseDoublePrecision() || cu.getComputeCapability() < 2.0 || cu.getPlatformData().deterministicForces) {
void* finishSpreadArgs[] = {&pmeGrid2.getDevicePointer(), &pmeGrid1.getDevicePointer()};
void* finishSpreadArgs[] = {&directPmeGrid.getDevicePointer()};
cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
}
if (useCudaFFT) {
if (useCudaFFT) {
if (cu.getUseDoublePrecision())
if (cu.getUseDoublePrecision())
cufftExecD2Z(fftForward, (double*)
directP
meGrid.getDevicePointer(), (double2*)
reciprocalP
meGrid.getDevicePointer());
cufftExecD2Z(fftForward, (double*)
p
meGrid
1
.getDevicePointer(), (double2*)
p
meGrid
2
.getDevicePointer());
else
else
cufftExecR2C(fftForward, (float*)
directP
meGrid.getDevicePointer(), (float2*)
reciprocalP
meGrid.getDevicePointer());
cufftExecR2C(fftForward, (float*)
p
meGrid
1
.getDevicePointer(), (float2*)
p
meGrid
2
.getDevicePointer());
}
}
else {
else {
fft->execFFT(
directP
meGrid,
reciprocalP
meGrid, true);
fft->execFFT(
p
meGrid
1
,
p
meGrid
2
, true);
}
}
if (includeEnergy) {
if (includeEnergy) {
void* computeEnergyArgs[] = {&
reciprocalP
meGrid.getDevicePointer(), usePmeStream ? &pmeEnergyBuffer.getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(),
void* computeEnergyArgs[] = {&
p
meGrid
2
.getDevicePointer(), usePmeStream ? &pmeEnergyBuffer.getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(),
&pmeBsplineModuliX.getDevicePointer(), &pmeBsplineModuliY.getDevicePointer(), &pmeBsplineModuliZ.getDevicePointer(),
&pmeBsplineModuliX.getDevicePointer(), &pmeBsplineModuliY.getDevicePointer(), &pmeBsplineModuliZ.getDevicePointer(),
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeEvalEnergyKernel, computeEnergyArgs, gridSizeX*gridSizeY*gridSizeZ);
cu.executeKernel(pmeEvalEnergyKernel, computeEnergyArgs, gridSizeX*gridSizeY*gridSizeZ);
}
}
void* convolutionArgs[] = {&
reciprocalP
meGrid.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
void* convolutionArgs[] = {&
p
meGrid
2
.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
&pmeBsplineModuliX.getDevicePointer(), &pmeBsplineModuliY.getDevicePointer(), &pmeBsplineModuliZ.getDevicePointer(),
&pmeBsplineModuliX.getDevicePointer(), &pmeBsplineModuliY.getDevicePointer(), &pmeBsplineModuliZ.getDevicePointer(),
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeConvolutionKernel, convolutionArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
cu.executeKernel(pmeConvolutionKernel, convolutionArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
if (useCudaFFT) {
if (useCudaFFT) {
if (cu.getUseDoublePrecision())
if (cu.getUseDoublePrecision())
cufftExecZ2D(fftBackward, (double2*)
reciprocalP
meGrid.getDevicePointer(), (double*)
directP
meGrid.getDevicePointer());
cufftExecZ2D(fftBackward, (double2*)
p
meGrid
2
.getDevicePointer(), (double*)
p
meGrid
1
.getDevicePointer());
else
else
cufftExecC2R(fftBackward, (float2*)
reciprocalP
meGrid.getDevicePointer(), (float*)
directP
meGrid.getDevicePointer());
cufftExecC2R(fftBackward, (float2*)
p
meGrid
2
.getDevicePointer(), (float*)
p
meGrid
1
.getDevicePointer());
}
}
else {
else {
fft->execFFT(
reciprocalP
meGrid,
directP
meGrid, false);
fft->execFFT(
p
meGrid
2
,
p
meGrid
1
, false);
}
}
void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &
directP
meGrid.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &
p
meGrid
1
.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&charges.getDevicePointer()};
&charges.getDevicePointer()};
...
@@ -2186,58 +2186,58 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
...
@@ -2186,58 +2186,58 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
// As written, we check only the Electrostatic grid pointer to get here. We could separate them out, but for
// As written, we check only the Electrostatic grid pointer to get here. We could separate them out, but for
// now we assume that LJPME can only be used if electrostatic PME is also active.
// now we assume that LJPME can only be used if electrostatic PME is also active.
if (doLJPME && hasLJ) {
if (doLJPME && hasLJ) {
void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
if (!hasCoulomb) {
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
cu.executeKernel(pmeDispersionGridIndexKernel, gridIndexArgs, cu.getNumAtoms());
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeDispersionGridIndexKernel, gridIndexArgs, cu.getNumAtoms());
sort->sort(pmeAtomGridIndex);
sort->sort(pmeAtomGridIndex);
}
cu.clearBuffer(
directP
meGrid);
cu.clearBuffer(
p
meGrid
2
);
void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &
directP
meGrid.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &
p
meGrid
2
.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&sigmaEpsilon.getDevicePointer()};
&sigmaEpsilon.getDevicePointer()};
cu.executeKernel(pmeDispersionSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128);
cu.executeKernel(pmeDispersionSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128);
if (cu.getUseDoublePrecision() || cu.getComputeCapability() < 2.0 || cu.getPlatformData().deterministicForces) {
void* finishSpreadArgs[] = {&pmeGrid2.getDevicePointer(), &pmeGrid1.getDevicePointer()};
void* finishSpreadArgs[] = {&directPmeGrid.getDevicePointer()};
cu.executeKernel(pmeDispersionFinishSpreadChargeKernel, finishSpreadArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
cu.executeKernel(pmeDispersionFinishSpreadChargeKernel, finishSpreadArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
}
if (useCudaFFT) {
if (useCudaFFT) {
if (cu.getUseDoublePrecision())
if (cu.getUseDoublePrecision())
cufftExecD2Z(dispersionFftForward, (double*)
directP
meGrid.getDevicePointer(), (double2*)
reciprocalP
meGrid.getDevicePointer());
cufftExecD2Z(dispersionFftForward, (double*)
p
meGrid
1
.getDevicePointer(), (double2*)
p
meGrid
2
.getDevicePointer());
else
else
cufftExecR2C(dispersionFftForward, (float*)
directP
meGrid.getDevicePointer(), (float2*)
reciprocalP
meGrid.getDevicePointer());
cufftExecR2C(dispersionFftForward, (float*)
p
meGrid
1
.getDevicePointer(), (float2*)
p
meGrid
2
.getDevicePointer());
}
}
else {
else {
dispersionFft->execFFT(
directP
meGrid,
reciprocalP
meGrid, true);
dispersionFft->execFFT(
p
meGrid
1
,
p
meGrid
2
, true);
}
}
if (includeEnergy) {
if (includeEnergy) {
void* computeEnergyArgs[] = {&
reciprocalP
meGrid.getDevicePointer(), usePmeStream ? &pmeEnergyBuffer.getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(),
void* computeEnergyArgs[] = {&
p
meGrid
2
.getDevicePointer(), usePmeStream ? &pmeEnergyBuffer.getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(),
&pmeDispersionBsplineModuliX.getDevicePointer(), &pmeDispersionBsplineModuliY.getDevicePointer(), &pmeDispersionBsplineModuliZ.getDevicePointer(),
&pmeDispersionBsplineModuliX.getDevicePointer(), &pmeDispersionBsplineModuliY.getDevicePointer(), &pmeDispersionBsplineModuliZ.getDevicePointer(),
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeEvalDispersionEnergyKernel, computeEnergyArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ);
cu.executeKernel(pmeEvalDispersionEnergyKernel, computeEnergyArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ);
}
}
void* convolutionArgs[] = {&
reciprocalP
meGrid.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
void* convolutionArgs[] = {&
p
meGrid
2
.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
&pmeDispersionBsplineModuliX.getDevicePointer(), &pmeDispersionBsplineModuliY.getDevicePointer(), &pmeDispersionBsplineModuliZ.getDevicePointer(),
&pmeDispersionBsplineModuliX.getDevicePointer(), &pmeDispersionBsplineModuliY.getDevicePointer(), &pmeDispersionBsplineModuliZ.getDevicePointer(),
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeDispersionConvolutionKernel, convolutionArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
cu.executeKernel(pmeDispersionConvolutionKernel, convolutionArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
if (useCudaFFT) {
if (useCudaFFT) {
if (cu.getUseDoublePrecision())
if (cu.getUseDoublePrecision())
cufftExecZ2D(dispersionFftBackward, (double2*)
reciprocalP
meGrid.getDevicePointer(), (double*)
directP
meGrid.getDevicePointer());
cufftExecZ2D(dispersionFftBackward, (double2*)
p
meGrid
2
.getDevicePointer(), (double*)
p
meGrid
1
.getDevicePointer());
else
else
cufftExecC2R(dispersionFftBackward, (float2*)
reciprocalP
meGrid.getDevicePointer(), (float*)
directP
meGrid.getDevicePointer());
cufftExecC2R(dispersionFftBackward, (float2*)
p
meGrid
2
.getDevicePointer(), (float*)
p
meGrid
1
.getDevicePointer());
}
}
else {
else {
dispersionFft->execFFT(
reciprocalP
meGrid,
directP
meGrid, false);
dispersionFft->execFFT(
p
meGrid
2
,
p
meGrid
1
, false);
}
}
void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &
directP
meGrid.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &
p
meGrid
1
.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&sigmaEpsilon.getDevicePointer()};
&sigmaEpsilon.getDevicePointer()};
...
...
platforms/cuda/src/kernels/pme.cu
View file @
4885a268
...
@@ -28,12 +28,25 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
...
@@ -28,12 +28,25 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
,
const
real
*
__restrict__
charges
,
const
real
*
__restrict__
charges
#endif
#endif
)
{
)
{
real3
data
[
PME_ORDER
];
// To improve memory efficiency, we divide indices along the z axis into
const
real
scale
=
RECIP
(
PME_ORDER
-
1
);
// PME_ORDER blocks, where the data for each block is stored together. We
// can ensure that all threads write to the same block at the same time,
// which leads to better coalescing of writes.
__shared__
int
zindexTable
[
GRID_SIZE_Z
+
PME_ORDER
];
int
blockSize
=
(
int
)
ceil
(
GRID_SIZE_Z
/
(
real
)
PME_ORDER
);
for
(
int
i
=
threadIdx
.
x
;
i
<
GRID_SIZE_Z
+
PME_ORDER
;
i
+=
blockDim
.
x
)
{
int
zindex
=
i
%
GRID_SIZE_Z
;
int
block
=
zindex
%
PME_ORDER
;
zindexTable
[
i
]
=
zindex
/
PME_ORDER
+
block
*
GRID_SIZE_X
*
GRID_SIZE_Y
*
blockSize
;
}
__syncthreads
();
// Process the atoms in spatially sorted order. This improves efficiency when writing
// Process the atoms in spatially sorted order. This improves efficiency when writing
// the grid values.
// the grid values.
real3
data
[
PME_ORDER
];
const
real
scale
=
RECIP
(
PME_ORDER
-
1
);
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
atom
=
pmeAtomGridIndex
[
i
].
x
;
int
atom
=
pmeAtomGridIndex
[
i
].
x
;
real4
pos
=
posq
[
atom
];
real4
pos
=
posq
[
atom
];
...
@@ -41,7 +54,7 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
...
@@ -41,7 +54,7 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
const
float2
sigEps
=
sigmaEpsilon
[
atom
];
const
float2
sigEps
=
sigmaEpsilon
[
atom
];
const
real
charge
=
8
*
sigEps
.
x
*
sigEps
.
x
*
sigEps
.
x
*
sigEps
.
y
;
const
real
charge
=
8
*
sigEps
.
x
*
sigEps
.
x
*
sigEps
.
x
*
sigEps
.
y
;
#else
#else
const
real
charge
=
CHARGE
;
const
real
charge
=
(
CHARGE
)
*
EPSILON_FACTOR
;
#endif
#endif
if
(
charge
==
0
)
if
(
charge
==
0
)
continue
;
continue
;
...
@@ -76,35 +89,28 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
...
@@ -76,35 +89,28 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
data
[
0
]
=
scale
*
(
make_real3
(
1
)
-
dr
)
*
data
[
0
];
data
[
0
]
=
scale
*
(
make_real3
(
1
)
-
dr
)
*
data
[
0
];
// Spread the charge from this atom onto each grid point.
// Spread the charge from this atom onto each grid point.
int
izoffset
=
(
PME_ORDER
-
(
gridIndex
.
z
%
PME_ORDER
))
%
PME_ORDER
;
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
int
xbase
=
gridIndex
.
x
+
ix
;
int
xbase
=
gridIndex
.
x
+
ix
;
xbase
-=
(
xbase
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
);
xbase
-=
(
xbase
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
);
xbase
=
xbase
*
GRID_SIZE_Y
*
GRID_SIZE_Z
;
xbase
=
xbase
*
GRID_SIZE_Y
;
real
dx
=
data
[
ix
].
x
;
real
dx
=
charge
*
data
[
ix
].
x
;
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
int
ybase
=
gridIndex
.
y
+
iy
;
int
ybase
=
gridIndex
.
y
+
iy
;
ybase
-=
(
ybase
>=
GRID_SIZE_Y
?
GRID_SIZE_Y
:
0
);
ybase
-=
(
ybase
>=
GRID_SIZE_Y
?
GRID_SIZE_Y
:
0
);
ybase
=
xbase
+
ybase
*
GRID_SIZE_Z
;
ybase
=
(
xbase
+
ybase
)
*
blockSize
;
real
dy
=
data
[
iy
].
y
;
real
dx
dy
=
dx
*
data
[
iy
].
y
;
for
(
int
i
=
0
;
i
<
PME_ORDER
;
i
++
)
{
for
(
int
iz
=
0
;
iz
<
PME_ORDER
;
iz
++
)
{
int
iz
=
(
i
+
izoffset
)
%
PME_ORDER
;
int
zindex
=
gridIndex
.
z
+
iz
;
int
zindex
=
gridIndex
.
z
+
iz
;
zindex
-=
(
zindex
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
);
int
index
=
ybase
+
zindexTable
[
zindex
];
int
index
=
ybase
+
zindex
;
real
add
=
dxdy
*
data
[
iz
].
z
;
#if defined(USE_DOUBLE_PRECISION) || defined(USE_DETERMINISTIC_FORCES)
real
add
=
charge
*
dx
*
dy
*
data
[
iz
].
z
;
#ifdef USE_DOUBLE_PRECISION
unsigned
long
long
*
ulonglong_p
=
(
unsigned
long
long
*
)
originalPmeGrid
;
unsigned
long
long
*
ulonglong_p
=
(
unsigned
long
long
*
)
originalPmeGrid
;
atomicAdd
(
&
ulonglong_p
[
index
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
add
*
0x100000000
)));
atomicAdd
(
&
ulonglong_p
[
index
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
add
*
0x100000000
)));
#elif __CUDA_ARCH__ < 200 || defined(USE_DETERMINISTIC_FORCES)
unsigned
long
long
*
ulonglong_p
=
(
unsigned
long
long
*
)
originalPmeGrid
;
int
gridIndex
=
index
;
gridIndex
=
(
gridIndex
%
2
==
0
?
gridIndex
/
2
:
(
gridIndex
+
GRID_SIZE_X
*
GRID_SIZE_Y
*
GRID_SIZE_Z
)
/
2
);
atomicAdd
(
&
ulonglong_p
[
gridIndex
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
add
*
0x100000000
)));
#else
#else
atomicAdd
(
&
originalPmeGrid
[
index
],
add
*
EPSILON_FACTOR
);
atomicAdd
(
&
originalPmeGrid
[
index
],
add
);
#endif
#endif
}
}
}
}
...
@@ -112,20 +118,36 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
...
@@ -112,20 +118,36 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
}
}
}
}
extern
"C"
__global__
void
finishSpreadCharge
(
long
long
*
__restrict__
originalPmeGrid
)
{
extern
"C"
__global__
void
finishSpreadCharge
(
real
*
floatGrid
=
(
real
*
)
originalPmeGrid
;
#if defined(USE_DOUBLE_PRECISION) || defined(USE_DETERMINISTIC_FORCES)
const
unsigned
int
gridSize
=
GRID_SIZE_X
*
GRID_SIZE_Y
*
GRID_SIZE_Z
;
const
long
long
*
__restrict__
grid1
,
real
scale
=
EPSILON_FACTOR
/
(
real
)
0x100000000
;
#ifdef USE_DOUBLE_PRECISION
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
gridSize
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
floatGrid
[
index
]
=
scale
*
originalPmeGrid
[
index
];
#else
#else
for
(
int
index
=
2
*
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
);
index
<
gridSize
;
index
+=
2
*
blockDim
.
x
*
gridDim
.
x
)
{
const
real
*
__restrict__
grid1
,
floatGrid
[
index
]
=
scale
*
originalPmeGrid
[
index
/
2
];
#endif
if
(
index
+
1
<
gridSize
)
real
*
__restrict__
grid2
)
{
floatGrid
[
index
+
1
]
=
scale
*
originalPmeGrid
[(
index
+
gridSize
+
1
)
/
2
];
// During charge spreading, we shuffled the order of indices along the z
// axis to make memory access more efficient. We now need to unshuffle
// them. If the values were accumulated as fixed point, we also need to
// convert them to floating point.
__shared__
int
zindexTable
[
GRID_SIZE_Z
];
int
blockSize
=
(
int
)
ceil
(
GRID_SIZE_Z
/
(
real
)
PME_ORDER
);
for
(
int
i
=
threadIdx
.
x
;
i
<
GRID_SIZE_Z
;
i
+=
blockDim
.
x
)
{
int
block
=
i
%
PME_ORDER
;
zindexTable
[
i
]
=
i
/
PME_ORDER
+
block
*
GRID_SIZE_X
*
GRID_SIZE_Y
*
blockSize
;
}
}
__syncthreads
();
const
unsigned
int
gridSize
=
GRID_SIZE_X
*
GRID_SIZE_Y
*
GRID_SIZE_Z
;
real
scale
=
1
/
(
real
)
0x100000000
;
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
gridSize
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
zindex
=
index
%
GRID_SIZE_Z
;
int
loadIndex
=
zindexTable
[
zindex
]
+
blockSize
*
(
int
)
(
index
/
GRID_SIZE_Z
);
#if defined(USE_DOUBLE_PRECISION) || defined(USE_DETERMINISTIC_FORCES)
grid2
[
index
]
=
scale
*
grid1
[
loadIndex
];
#else
grid2
[
index
]
=
grid1
[
loadIndex
];
#endif
#endif
}
}
}
// convolutes on the halfcomplex_pmeGrid, which is of size NX*NY*(NZ/2+1) as F(Q) is conjugate symmetric
// convolutes on the halfcomplex_pmeGrid, which is of size NX*NY*(NZ/2+1) as F(Q) is conjugate symmetric
...
...
platforms/opencl/include/OpenCLKernels.h
View file @
4885a268
...
@@ -663,7 +663,7 @@ private:
...
@@ -663,7 +663,7 @@ private:
OpenCLArray
exceptionOffsetIndices
;
OpenCLArray
exceptionOffsetIndices
;
OpenCLArray
globalParams
;
OpenCLArray
globalParams
;
OpenCLArray
cosSinSums
;
OpenCLArray
cosSinSums
;
OpenCLArray
pmeGrid
;
OpenCLArray
pmeGrid
1
;
OpenCLArray
pmeGrid2
;
OpenCLArray
pmeGrid2
;
OpenCLArray
pmeBsplineModuliX
;
OpenCLArray
pmeBsplineModuliX
;
OpenCLArray
pmeBsplineModuliY
;
OpenCLArray
pmeBsplineModuliY
;
...
...
platforms/opencl/src/OpenCLKernels.cpp
View file @
4885a268
...
@@ -1726,15 +1726,18 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
...
@@ -1726,15 +1726,18 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
defines["MULTSHIFT6"] = cl.doubleToString(multShift6);
defines["MULTSHIFT6"] = cl.doubleToString(multShift6);
}
}
int elementSize = (cl.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
int elementSize = (cl.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
int gridElements = gridSizeX*gridSizeY*gridSizeZ;
int roundedZSize = PmeOrder*(int) ceil(gridSizeZ/(double) PmeOrder);
if (doLJPME)
int gridElements = gridSizeX*gridSizeY*roundedZSize;
gridElements = max(gridElements, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ);
if (doLJPME) {
pmeGrid.initialize(cl, gridElements, 2*elementSize, "pmeGrid");
roundedZSize = PmeOrder*(int) ceil(dispersionGridSizeZ/(double) PmeOrder);
gridElements = max(gridElements, dispersionGridSizeX*dispersionGridSizeY*roundedZSize);
}
pmeGrid1.initialize(cl, gridElements, 2*elementSize, "pmeGrid1");
pmeGrid2.initialize(cl, gridElements, 2*elementSize, "pmeGrid2");
pmeGrid2.initialize(cl, gridElements, 2*elementSize, "pmeGrid2");
if (cl.getSupports64BitGlobalAtomics())
if (cl.getSupports64BitGlobalAtomics())
cl.addAutoclearBuffer(pmeGrid2);
cl.addAutoclearBuffer(pmeGrid2);
else
else
cl.addAutoclearBuffer(pmeGrid);
cl.addAutoclearBuffer(pmeGrid
1
);
pmeBsplineModuliX.initialize(cl, gridSizeX, elementSize, "pmeBsplineModuliX");
pmeBsplineModuliX.initialize(cl, gridSizeX, elementSize, "pmeBsplineModuliX");
pmeBsplineModuliY.initialize(cl, gridSizeY, elementSize, "pmeBsplineModuliY");
pmeBsplineModuliY.initialize(cl, gridSizeY, elementSize, "pmeBsplineModuliY");
pmeBsplineModuliZ.initialize(cl, gridSizeZ, elementSize, "pmeBsplineModuliZ");
pmeBsplineModuliZ.initialize(cl, gridSizeZ, elementSize, "pmeBsplineModuliZ");
...
@@ -1755,8 +1758,6 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
...
@@ -1755,8 +1758,6 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
dispersionFft = new OpenCLFFT3D(cl, dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ, true);
dispersionFft = new OpenCLFFT3D(cl, dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ, true);
string vendor = cl.getDevice().getInfo<CL_DEVICE_VENDOR>();
string vendor = cl.getDevice().getInfo<CL_DEVICE_VENDOR>();
bool isNvidia = (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA");
bool isNvidia = (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA");
if (isNvidia)
pmeDefines["USE_ALTERNATE_MEMORY_ACCESS_PATTERN"] = "1";
usePmeQueue = (!cl.getPlatformData().disablePmeStream && isNvidia);
usePmeQueue = (!cl.getPlatformData().disablePmeStream && isNvidia);
if (usePmeQueue) {
if (usePmeQueue) {
pmeDefines["USE_PME_STREAM"] = "1";
pmeDefines["USE_PME_STREAM"] = "1";
...
@@ -2006,7 +2007,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
...
@@ -2006,7 +2007,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
ewaldForcesKernel.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer());
ewaldForcesKernel.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer());
ewaldForcesKernel.setArg<cl::Buffer>(2, cosSinSums.getDeviceBuffer());
ewaldForcesKernel.setArg<cl::Buffer>(2, cosSinSums.getDeviceBuffer());
}
}
if (pmeGrid.isInitialized()) {
if (pmeGrid
1
.isInitialized()) {
// Create kernels for Coulomb PME.
// Create kernels for Coulomb PME.
map<string, string> replacements;
map<string, string> replacements;
...
@@ -2036,7 +2037,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
...
@@ -2036,7 +2037,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
if (cl.getSupports64BitGlobalAtomics())
if (cl.getSupports64BitGlobalAtomics())
pmeSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid2.getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid2.getDeviceBuffer());
else
else
pmeSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid.getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid
1
.getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(4, pmeBsplineTheta.getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(4, pmeBsplineTheta.getDeviceBuffer());
if (deviceIsCpu || cl.getSupports64BitGlobalAtomics())
if (deviceIsCpu || cl.getSupports64BitGlobalAtomics())
pmeSpreadChargeKernel.setArg<cl::Buffer>(13, charges.getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(13, charges.getDeviceBuffer());
...
@@ -2053,13 +2054,13 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
...
@@ -2053,13 +2054,13 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeEvalEnergyKernel.setArg<cl::Buffer>(4, pmeBsplineModuliZ.getDeviceBuffer());
pmeEvalEnergyKernel.setArg<cl::Buffer>(4, pmeBsplineModuliZ.getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(1, cl.getForceBuffers().getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(1, cl.getForceBuffers().getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(2, pmeGrid.getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(2, pmeGrid
1
.getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(11, pmeAtomGridIndex.getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(11, pmeAtomGridIndex.getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(12, charges.getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(12, charges.getDeviceBuffer());
if (cl.getSupports64BitGlobalAtomics()) {
if (cl.getSupports64BitGlobalAtomics()) {
pmeFinishSpreadChargeKernel = cl::Kernel(program, "finishSpreadCharge");
pmeFinishSpreadChargeKernel = cl::Kernel(program, "finishSpreadCharge");
pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(0, pmeGrid2.getDeviceBuffer());
pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(0, pmeGrid2.getDeviceBuffer());
pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid.getDeviceBuffer());
pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid
1
.getDeviceBuffer());
}
}
if (usePmeQueue)
if (usePmeQueue)
syncQueue->setKernel(cl::Kernel(program, "addEnergy"));
syncQueue->setKernel(cl::Kernel(program, "addEnergy"));
...
@@ -2099,7 +2100,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
...
@@ -2099,7 +2100,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
if (cl.getSupports64BitGlobalAtomics())
if (cl.getSupports64BitGlobalAtomics())
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid2.getDeviceBuffer());
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid2.getDeviceBuffer());
else
else
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid.getDeviceBuffer());
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid
1
.getDeviceBuffer());
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(4, pmeBsplineTheta.getDeviceBuffer());
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(4, pmeBsplineTheta.getDeviceBuffer());
if (deviceIsCpu || cl.getSupports64BitGlobalAtomics())
if (deviceIsCpu || cl.getSupports64BitGlobalAtomics())
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(13, sigmaEpsilon.getDeviceBuffer());
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(13, sigmaEpsilon.getDeviceBuffer());
...
@@ -2116,13 +2117,13 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
...
@@ -2116,13 +2117,13 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeDispersionEvalEnergyKernel.setArg<cl::Buffer>(4, pmeDispersionBsplineModuliZ.getDeviceBuffer());
pmeDispersionEvalEnergyKernel.setArg<cl::Buffer>(4, pmeDispersionBsplineModuliZ.getDeviceBuffer());
pmeDispersionInterpolateForceKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
pmeDispersionInterpolateForceKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
pmeDispersionInterpolateForceKernel.setArg<cl::Buffer>(1, cl.getForceBuffers().getDeviceBuffer());
pmeDispersionInterpolateForceKernel.setArg<cl::Buffer>(1, cl.getForceBuffers().getDeviceBuffer());
pmeDispersionInterpolateForceKernel.setArg<cl::Buffer>(2, pmeGrid.getDeviceBuffer());
pmeDispersionInterpolateForceKernel.setArg<cl::Buffer>(2, pmeGrid
1
.getDeviceBuffer());
pmeDispersionInterpolateForceKernel.setArg<cl::Buffer>(11, pmeAtomGridIndex.getDeviceBuffer());
pmeDispersionInterpolateForceKernel.setArg<cl::Buffer>(11, pmeAtomGridIndex.getDeviceBuffer());
pmeDispersionInterpolateForceKernel.setArg<cl::Buffer>(12, sigmaEpsilon.getDeviceBuffer());
pmeDispersionInterpolateForceKernel.setArg<cl::Buffer>(12, sigmaEpsilon.getDeviceBuffer());
if (cl.getSupports64BitGlobalAtomics()) {
if (cl.getSupports64BitGlobalAtomics()) {
pmeDispersionFinishSpreadChargeKernel = cl::Kernel(program, "finishSpreadCharge");
pmeDispersionFinishSpreadChargeKernel = cl::Kernel(program, "finishSpreadCharge");
pmeDispersionFinishSpreadChargeKernel.setArg<cl::Buffer>(0, pmeGrid2.getDeviceBuffer());
pmeDispersionFinishSpreadChargeKernel.setArg<cl::Buffer>(0, pmeGrid2.getDeviceBuffer());
pmeDispersionFinishSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid.getDeviceBuffer());
pmeDispersionFinishSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid
1
.getDeviceBuffer());
}
}
}
}
}
}
...
@@ -2176,7 +2177,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
...
@@ -2176,7 +2177,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
cl.executeKernel(ewaldSumsKernel, cosSinSums.getSize());
cl.executeKernel(ewaldSumsKernel, cosSinSums.getSize());
cl.executeKernel(ewaldForcesKernel, cl.getNumAtoms());
cl.executeKernel(ewaldForcesKernel, cl.getNumAtoms());
}
}
if (pmeGrid.isInitialized() && includeReciprocal) {
if (pmeGrid
1
.isInitialized() && includeReciprocal) {
if (usePmeQueue && !includeEnergy)
if (usePmeQueue && !includeEnergy)
cl.setQueue(pmeQueue);
cl.setQueue(pmeQueue);
...
@@ -2251,7 +2252,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
...
@@ -2251,7 +2252,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
cl.executeKernel(pmeSpreadChargeKernel, cl.getNumAtoms());
cl.executeKernel(pmeSpreadChargeKernel, cl.getNumAtoms());
}
}
}
}
fft->execFFT(pmeGrid, pmeGrid2, true);
fft->execFFT(pmeGrid
1
, pmeGrid2, true);
mm_double4 boxSize = cl.getPeriodicBoxSizeDouble();
mm_double4 boxSize = cl.getPeriodicBoxSizeDouble();
if (cl.getUseDoublePrecision()) {
if (cl.getUseDoublePrecision()) {
pmeConvolutionKernel.setArg<mm_double4>(4, recipBoxVectors[0]);
pmeConvolutionKernel.setArg<mm_double4>(4, recipBoxVectors[0]);
...
@@ -2272,7 +2273,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
...
@@ -2272,7 +2273,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
if (includeEnergy)
if (includeEnergy)
cl.executeKernel(pmeEvalEnergyKernel, gridSizeX*gridSizeY*gridSizeZ);
cl.executeKernel(pmeEvalEnergyKernel, gridSizeX*gridSizeY*gridSizeZ);
cl.executeKernel(pmeConvolutionKernel, gridSizeX*gridSizeY*gridSizeZ);
cl.executeKernel(pmeConvolutionKernel, gridSizeX*gridSizeY*gridSizeZ);
fft->execFFT(pmeGrid2, pmeGrid, false);
fft->execFFT(pmeGrid2, pmeGrid
1
, false);
setPeriodicBoxArgs(cl, pmeInterpolateForceKernel, 3);
setPeriodicBoxArgs(cl, pmeInterpolateForceKernel, 3);
if (cl.getUseDoublePrecision()) {
if (cl.getUseDoublePrecision()) {
pmeInterpolateForceKernel.setArg<mm_double4>(8, recipBoxVectors[0]);
pmeInterpolateForceKernel.setArg<mm_double4>(8, recipBoxVectors[0]);
...
@@ -2304,7 +2305,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
...
@@ -2304,7 +2305,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
}
}
cl.executeKernel(pmeDispersionUpdateBsplinesKernel, cl.getNumAtoms());
cl.executeKernel(pmeDispersionUpdateBsplinesKernel, cl.getNumAtoms());
if (deviceIsCpu && !cl.getSupports64BitGlobalAtomics()) {
if (deviceIsCpu && !cl.getSupports64BitGlobalAtomics()) {
cl.clearBuffer(pmeGrid);
cl.clearBuffer(pmeGrid
1
);
setPeriodicBoxArgs(cl, pmeDispersionSpreadChargeKernel, 5);
setPeriodicBoxArgs(cl, pmeDispersionSpreadChargeKernel, 5);
if (cl.getUseDoublePrecision()) {
if (cl.getUseDoublePrecision()) {
pmeDispersionSpreadChargeKernel.setArg<mm_double4>(10, recipBoxVectors[0]);
pmeDispersionSpreadChargeKernel.setArg<mm_double4>(10, recipBoxVectors[0]);
...
@@ -2319,7 +2320,8 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
...
@@ -2319,7 +2320,8 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
cl.executeKernel(pmeDispersionSpreadChargeKernel, 2*cl.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(), 1);
cl.executeKernel(pmeDispersionSpreadChargeKernel, 2*cl.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(), 1);
}
}
else {
else {
sort->sort(pmeAtomGridIndex);
if (!hasCoulomb)
sort->sort(pmeAtomGridIndex);
if (cl.getSupports64BitGlobalAtomics()) {
if (cl.getSupports64BitGlobalAtomics()) {
cl.clearBuffer(pmeGrid2);
cl.clearBuffer(pmeGrid2);
setPeriodicBoxArgs(cl, pmeDispersionSpreadChargeKernel, 5);
setPeriodicBoxArgs(cl, pmeDispersionSpreadChargeKernel, 5);
...
@@ -2337,7 +2339,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
...
@@ -2337,7 +2339,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
cl.executeKernel(pmeDispersionFinishSpreadChargeKernel, gridSizeX*gridSizeY*gridSizeZ);
cl.executeKernel(pmeDispersionFinishSpreadChargeKernel, gridSizeX*gridSizeY*gridSizeZ);
}
}
else {
else {
cl.clearBuffer(pmeGrid);
cl.clearBuffer(pmeGrid
1
);
cl.executeKernel(pmeDispersionAtomRangeKernel, cl.getNumAtoms());
cl.executeKernel(pmeDispersionAtomRangeKernel, cl.getNumAtoms());
setPeriodicBoxSizeArg(cl, pmeDispersionZIndexKernel, 2);
setPeriodicBoxSizeArg(cl, pmeDispersionZIndexKernel, 2);
if (cl.getUseDoublePrecision())
if (cl.getUseDoublePrecision())
...
@@ -2348,7 +2350,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
...
@@ -2348,7 +2350,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
cl.executeKernel(pmeDispersionSpreadChargeKernel, cl.getNumAtoms());
cl.executeKernel(pmeDispersionSpreadChargeKernel, cl.getNumAtoms());
}
}
}
}
dispersionFft->execFFT(pmeGrid, pmeGrid2, true);
dispersionFft->execFFT(pmeGrid
1
, pmeGrid2, true);
mm_double4 boxSize = cl.getPeriodicBoxSizeDouble();
mm_double4 boxSize = cl.getPeriodicBoxSizeDouble();
if (cl.getUseDoublePrecision()) {
if (cl.getUseDoublePrecision()) {
pmeDispersionConvolutionKernel.setArg<mm_double4>(4, recipBoxVectors[0]);
pmeDispersionConvolutionKernel.setArg<mm_double4>(4, recipBoxVectors[0]);
...
@@ -2369,7 +2371,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
...
@@ -2369,7 +2371,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
if (includeEnergy)
if (includeEnergy)
cl.executeKernel(pmeDispersionEvalEnergyKernel, gridSizeX*gridSizeY*gridSizeZ);
cl.executeKernel(pmeDispersionEvalEnergyKernel, gridSizeX*gridSizeY*gridSizeZ);
cl.executeKernel(pmeDispersionConvolutionKernel, gridSizeX*gridSizeY*gridSizeZ);
cl.executeKernel(pmeDispersionConvolutionKernel, gridSizeX*gridSizeY*gridSizeZ);
dispersionFft->execFFT(pmeGrid2, pmeGrid, false);
dispersionFft->execFFT(pmeGrid2, pmeGrid
1
, false);
setPeriodicBoxArgs(cl, pmeDispersionInterpolateForceKernel, 3);
setPeriodicBoxArgs(cl, pmeDispersionInterpolateForceKernel, 3);
if (cl.getUseDoublePrecision()) {
if (cl.getUseDoublePrecision()) {
pmeDispersionInterpolateForceKernel.setArg<mm_double4>(8, recipBoxVectors[0]);
pmeDispersionInterpolateForceKernel.setArg<mm_double4>(8, recipBoxVectors[0]);
...
...
platforms/opencl/src/kernels/pme.cl
View file @
4885a268
...
@@ -105,12 +105,25 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
...
@@ -105,12 +105,25 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
,
__global
const
real*
restrict
charges
,
__global
const
real*
restrict
charges
#
endif
#
endif
)
{
)
{
const
real
scale
=
1/
(
real
)
(
PME_ORDER-1
)
;
//
To
improve
memory
efficiency,
we
divide
indices
along
the
z
axis
into
real4
data[PME_ORDER]
;
//
PME_ORDER
blocks,
where
the
data
for
each
block
is
stored
together.
We
//
can
ensure
that
all
threads
write
to
the
same
block
at
the
same
time,
//
which
leads
to
better
coalescing
of
writes.
__local
int
zindexTable[GRID_SIZE_Z+PME_ORDER]
;
int
blockSize
=
(
int
)
ceil
(
GRID_SIZE_Z/
(
real
)
PME_ORDER
)
;
for
(
int
i
=
get_local_id
(
0
)
; i < GRID_SIZE_Z+PME_ORDER; i += get_local_size(0)) {
int
zindex
=
i
%
GRID_SIZE_Z
;
int
block
=
zindex
%
PME_ORDER
;
zindexTable[i]
=
zindex/PME_ORDER
+
block*GRID_SIZE_X*GRID_SIZE_Y*blockSize
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
Process
the
atoms
in
spatially
sorted
order.
This
improves
efficiency
when
writing
//
Process
the
atoms
in
spatially
sorted
order.
This
improves
efficiency
when
writing
//
the
grid
values.
//
the
grid
values.
const
real
scale
=
1/
(
real
)
(
PME_ORDER-1
)
;
real4
data[PME_ORDER]
;
for
(
int
i
=
get_global_id
(
0
)
; i < NUM_ATOMS; i += get_global_size(0)) {
for
(
int
i
=
get_global_id
(
0
)
; i < NUM_ATOMS; i += get_global_size(0)) {
int
atom
=
pmeAtomGridIndex[i].x
;
int
atom
=
pmeAtomGridIndex[i].x
;
real4
pos
=
posq[atom]
;
real4
pos
=
posq[atom]
;
...
@@ -118,7 +131,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
...
@@ -118,7 +131,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
const
float2
sigEps
=
sigmaEpsilon[atom]
;
const
float2
sigEps
=
sigmaEpsilon[atom]
;
const
real
charge
=
8*sigEps.x*sigEps.x*sigEps.x*sigEps.y
;
const
real
charge
=
8*sigEps.x*sigEps.x*sigEps.x*sigEps.y
;
#
else
#
else
const
real
charge
=
CHARGE
;
const
real
charge
=
(
CHARGE
)
*EPSILON_FACTOR
;
#
endif
#
endif
if
(
charge
==
0
)
if
(
charge
==
0
)
continue
;
continue
;
...
@@ -154,40 +167,47 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
...
@@ -154,40 +167,47 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
//
Spread
the
charge
from
this
atom
onto
each
grid
point.
//
Spread
the
charge
from
this
atom
onto
each
grid
point.
int
izoffset
=
(
PME_ORDER-
(
gridIndex.z%PME_ORDER
))
%
PME_ORDER
;
for
(
int
ix
=
0
; ix < PME_ORDER; ix++) {
for
(
int
ix
=
0
; ix < PME_ORDER; ix++) {
int
xindex
=
gridIndex.x+ix
;
int
xbase
=
gridIndex.x+ix
;
xindex
-=
(
xindex
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
)
;
xbase
-=
(
xbase
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
)
;
xbase
=
xbase*GRID_SIZE_Y
;
real
dx
=
charge*data[ix].x
;
for
(
int
iy
=
0
; iy < PME_ORDER; iy++) {
for
(
int
iy
=
0
; iy < PME_ORDER; iy++) {
int
yindex
=
gridIndex.y+iy
;
int
ybase
=
gridIndex.y+iy
;
yindex
-=
(
yindex
>=
GRID_SIZE_Y
?
GRID_SIZE_Y
:
0
)
;
ybase
-=
(
ybase
>=
GRID_SIZE_Y
?
GRID_SIZE_Y
:
0
)
;
for
(
int
iz
=
0
; iz < PME_ORDER; iz++) {
ybase
=
(
xbase+ybase
)
*blockSize
;
real
dxdy
=
dx*data[iy].y
;
for
(
int
i
=
0
; i < PME_ORDER; i++) {
int
iz
=
(
i+izoffset
)
%
PME_ORDER
;
int
zindex
=
gridIndex.z+iz
;
int
zindex
=
gridIndex.z+iz
;
zindex
-=
(
zindex
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
)
;
int
index
=
ybase
+
zindexTable[zindex]
;
int
index
=
xindex*GRID_SIZE_Y*GRID_SIZE_Z
+
yindex*GRID_SIZE_Z
+
zindex
;
real
add
=
dxdy*data[iz].z
;
real
add
=
charge*data[ix].x*data[iy].y*data[iz].z
;
#
ifdef
USE_ALTERNATE_MEMORY_ACCESS_PATTERN
//
On
Nvidia
devices
(
at
least
Maxwell
anyway
)
,
this
split
ordering
produces
much
higher
performance.
Why?
//
I
have
no
idea!
And
of
course
on
AMD
it
produces
slower
performance.
GPUs
are
not
meant
to
be
understood.
atom_add
(
&pmeGrid[index%2
==
0
?
index/2
:
(
index+GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z
)
/2],
(
long
)
(
add*0x100000000
))
;
#
else
atom_add
(
&pmeGrid[index],
(
long
)
(
add*0x100000000
))
;
atom_add
(
&pmeGrid[index],
(
long
)
(
add*0x100000000
))
;
#
endif
}
}
}
}
}
}
}
}
}
}
__kernel
void
finishSpreadCharge
(
__global
long*
restrict
fixedGrid,
__global
real*
restrict
realGrid
)
{
__kernel
void
finishSpreadCharge
(
__global
long*
restrict
grid1,
__global
real*
restrict
grid2
)
{
//
During
charge
spreading,
we
shuffled
the
order
of
indices
along
the
z
//
axis
to
make
memory
access
more
efficient.
We
now
need
to
unshuffle
//
them
and
convert
fixed
point
values
to
floating
point.
__local
int
zindexTable[GRID_SIZE_Z]
;
int
blockSize
=
(
int
)
ceil
(
GRID_SIZE_Z/
(
real
)
PME_ORDER
)
;
for
(
int
i
=
get_local_id
(
0
)
; i < GRID_SIZE_Z; i += get_local_size(0)) {
int
block
=
i
%
PME_ORDER
;
zindexTable[i]
=
i/PME_ORDER
+
block*GRID_SIZE_X*GRID_SIZE_Y*blockSize
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
const
unsigned
int
gridSize
=
GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z
;
const
unsigned
int
gridSize
=
GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z
;
real
scale
=
EPSILON_FACTOR
/
(
real
)
0x100000000
;
real
scale
=
1
/
(
real
)
0x100000000
;
for
(
int
index
=
get_global_id
(
0
)
; index < gridSize; index += get_global_size(0)) {
for
(
int
index
=
get_global_id
(
0
)
; index < gridSize; index += get_global_size(0)) {
#
ifdef
USE_ALTERNATE_MEMORY_ACCESS_PATTERN
int
zindex
=
index%GRID_SIZE_Z
;
long
value
=
fixedGrid[index%2
==
0
?
index/2
:
(
index+gridSize
)
/2]
;
int
loadIndex
=
zindexTable[zindex]
+
blockSize*
(
int
)
(
index/GRID_SIZE_Z
)
;
#
else
grid2[index]
=
scale*grid1[loadIndex]
;
long
value
=
fixedGrid[index]
;
#
endif
realGrid[index]
=
(
real
)
(
value*scale
)
;
}
}
}
}
#
elif
defined
(
DEVICE_IS_CPU
)
#
elif
defined
(
DEVICE_IS_CPU
)
...
@@ -230,7 +250,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
...
@@ -230,7 +250,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
const
float2
sigEps
=
sigmaEpsilon[atom]
;
const
float2
sigEps
=
sigmaEpsilon[atom]
;
const
real
charge
=
8*sigEps.x*sigEps.x*sigEps.x*sigEps.y
;
const
real
charge
=
8*sigEps.x*sigEps.x*sigEps.x*sigEps.y
;
#
else
#
else
const
real
charge
=
CHARGE
;
const
real
charge
=
(
CHARGE
)
*EPSILON_FACTOR
;
#
endif
#
endif
if
(
charge
==
0
)
if
(
charge
==
0
)
continue
;
continue
;
...
@@ -269,7 +289,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
...
@@ -269,7 +289,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
int zindex = gridIndex.z+iz;
int zindex = gridIndex.z+iz;
zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
pmeGrid[index] +=
EPSILON_FACTOR*
charge*data[ix].x*data[iy].y*data[iz].z;
pmeGrid[index] += charge*data[ix].x*data[iy].y*data[iz].z;
}
}
}
}
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment