Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
f7a102fb
"vscode:/vscode.git/clone" did not exist on "3d2b90ba1a23fae39947981a6a12295851408aa0"
Commit
f7a102fb
authored
Jan 25, 2017
by
Peter Eastman
Browse files
Reduced duplication in the code for Coulomb and LJ PME
parent
dbcc494f
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
91 additions
and
364 deletions
+91
-364
platforms/cuda/include/CudaKernels.h
platforms/cuda/include/CudaKernels.h
+0
-1
platforms/cuda/src/CudaKernels.cpp
platforms/cuda/src/CudaKernels.cpp
+25
-28
platforms/cuda/src/kernels/ljpme.cu
platforms/cuda/src/kernels/ljpme.cu
+0
-327
platforms/cuda/src/kernels/pme.cu
platforms/cuda/src/kernels/pme.cu
+66
-8
No files found.
platforms/cuda/include/CudaKernels.h
View file @
f7a102fb
...
@@ -701,7 +701,6 @@ private:
...
@@ -701,7 +701,6 @@ private:
CUfunction
pmeDispersionConvolutionKernel
;
CUfunction
pmeDispersionConvolutionKernel
;
CUfunction
pmeInterpolateForceKernel
;
CUfunction
pmeInterpolateForceKernel
;
CUfunction
pmeInterpolateDispersionForceKernel
;
CUfunction
pmeInterpolateDispersionForceKernel
;
std
::
map
<
std
::
string
,
std
::
string
>
pmeDefines
;
std
::
vector
<
std
::
pair
<
int
,
int
>
>
exceptionAtoms
;
std
::
vector
<
std
::
pair
<
int
,
int
>
>
exceptionAtoms
;
double
ewaldSelfEnergy
,
dispersionCoefficient
,
alpha
,
dispersionAlpha
;
double
ewaldSelfEnergy
,
dispersionCoefficient
,
alpha
,
dispersionAlpha
;
int
interpolateForceThreads
;
int
interpolateForceThreads
;
...
...
platforms/cuda/src/CudaKernels.cpp
View file @
f7a102fb
...
@@ -1763,19 +1763,11 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
...
@@ -1763,19 +1763,11 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
}
}
defines
[
"EWALD_ALPHA"
]
=
cu
.
doubleToString
(
alpha
);
defines
[
"EWALD_ALPHA"
]
=
cu
.
doubleToString
(
alpha
);
if (doLJPME) {
defines["EWALD_DISPERSION_ALPHA"] = cu.doubleToString(dispersionAlpha);
double invRCut6 = pow(force.getCutoffDistance(), -6);
double dalphaR = dispersionAlpha * force.getCutoffDistance();
double dar2 = dalphaR*dalphaR;
double dar4 = dar2*dar2;
double multShift6 = -invRCut6*(1.0 - exp(-dar2) * (1.0 + dar2 + 0.5*dar4));
defines["INVCUT6"] = cu.doubleToString(invRCut6);
defines["MULTSHIFT6"] = cu.doubleToString(multShift6);
}
defines
[
"TWO_OVER_SQRT_PI"
]
=
cu
.
doubleToString
(
2.0
/
sqrt
(
M_PI
));
defines
[
"TWO_OVER_SQRT_PI"
]
=
cu
.
doubleToString
(
2.0
/
sqrt
(
M_PI
));
defines
[
"USE_EWALD"
]
=
"1"
;
defines
[
"USE_EWALD"
]
=
"1"
;
defines
[
"DO_LJPME"
]
=
doLJPME
?
"1"
:
"0"
;
defines
[
"DO_LJPME"
]
=
doLJPME
?
"1"
:
"0"
;
if
(
doLJPME
)
defines
[
"EWALD_DISPERSION_ALPHA"
]
=
cu
.
doubleToString
(
dispersionAlpha
);
if
(
cu
.
getContextIndex
()
==
0
)
{
if
(
cu
.
getContextIndex
()
==
0
)
{
ewaldSelfEnergy
=
-
ONE_4PI_EPS0
*
alpha
*
sumSquaredCharges
/
sqrt
(
M_PI
);
ewaldSelfEnergy
=
-
ONE_4PI_EPS0
*
alpha
*
sumSquaredCharges
/
sqrt
(
M_PI
);
if
(
doLJPME
)
if
(
doLJPME
)
...
@@ -1784,6 +1776,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
...
@@ -1784,6 +1776,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
char
deviceName
[
100
];
char
deviceName
[
100
];
cuDeviceGetName
(
deviceName
,
100
,
cu
.
getDevice
());
cuDeviceGetName
(
deviceName
,
100
,
cu
.
getDevice
());
usePmeStream
=
(
!
cu
.
getPlatformData
().
disablePmeStream
&&
string
(
deviceName
)
!=
"GeForce GTX 980"
);
// Using a separate stream is slower on GTX 980
usePmeStream
=
(
!
cu
.
getPlatformData
().
disablePmeStream
&&
string
(
deviceName
)
!=
"GeForce GTX 980"
);
// Using a separate stream is slower on GTX 980
map
<
string
,
string
>
pmeDefines
;
pmeDefines
[
"PME_ORDER"
]
=
cu
.
intToString
(
PmeOrder
);
pmeDefines
[
"PME_ORDER"
]
=
cu
.
intToString
(
PmeOrder
);
pmeDefines
[
"NUM_ATOMS"
]
=
cu
.
intToString
(
numParticles
);
pmeDefines
[
"NUM_ATOMS"
]
=
cu
.
intToString
(
numParticles
);
pmeDefines
[
"PADDED_NUM_ATOMS"
]
=
cu
.
intToString
(
cu
.
getPaddedNumAtoms
());
pmeDefines
[
"PADDED_NUM_ATOMS"
]
=
cu
.
intToString
(
cu
.
getPaddedNumAtoms
());
...
@@ -1799,18 +1792,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
...
@@ -1799,18 +1792,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
pmeDefines
[
"USE_PME_STREAM"
]
=
"1"
;
pmeDefines
[
"USE_PME_STREAM"
]
=
"1"
;
if
(
cu
.
getPlatformData
().
deterministicForces
)
if
(
cu
.
getPlatformData
().
deterministicForces
)
pmeDefines
[
"USE_DETERMINISTIC_FORCES"
]
=
"1"
;
pmeDefines
[
"USE_DETERMINISTIC_FORCES"
]
=
"1"
;
if (doLJPME) {
CUmodule
module
=
cu
.
createModule
(
CudaKernelSources
::
vectorOps
+
CudaKernelSources
::
pme
,
pmeDefines
);
pmeDefines["EWALD_DISPERSION_ALPHA"] = cu.doubleToString(dispersionAlpha);
pmeDefines["DISPERSION_GRID_SIZE_X"] = cu.intToString(dispersionGridSizeX);
pmeDefines["DISPERSION_GRID_SIZE_Y"] = cu.intToString(dispersionGridSizeY);
pmeDefines["DISPERSION_GRID_SIZE_Z"] = cu.intToString(dispersionGridSizeZ);
pmeDefines["RECIP_DISPERSION_EXP_FACTOR"] = cu.doubleToString(M_PI*M_PI/(dispersionAlpha*dispersionAlpha));
}
CUmodule module;
if (doLJPME)
module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::pme+CudaKernelSources::ljpme, pmeDefines);
else
module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::pme, pmeDefines);
if
(
cu
.
getPlatformData
().
useCpuPme
)
{
if
(
cu
.
getPlatformData
().
useCpuPme
)
{
// Create the CPU PME kernel.
// Create the CPU PME kernel.
...
@@ -1843,12 +1825,27 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
...
@@ -1843,12 +1825,27 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
cuFuncSetCacheConfig
(
pmeSpreadChargeKernel
,
CU_FUNC_CACHE_PREFER_L1
);
cuFuncSetCacheConfig
(
pmeSpreadChargeKernel
,
CU_FUNC_CACHE_PREFER_L1
);
cuFuncSetCacheConfig
(
pmeInterpolateForceKernel
,
CU_FUNC_CACHE_PREFER_L1
);
cuFuncSetCacheConfig
(
pmeInterpolateForceKernel
,
CU_FUNC_CACHE_PREFER_L1
);
if
(
doLJPME
)
{
if
(
doLJPME
)
{
pmeDispersionFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadC6");
pmeDefines
[
"EWALD_ALPHA"
]
=
cu
.
doubleToString
(
dispersionAlpha
);
pmeDispersionGridIndexKernel = cu.getKernel(module, "findAtomDispersionGridIndex");
pmeDefines
[
"GRID_SIZE_X"
]
=
cu
.
intToString
(
dispersionGridSizeX
);
pmeDispersionSpreadChargeKernel = cu.getKernel(module, "gridSpreadC6");
pmeDefines
[
"GRID_SIZE_Y"
]
=
cu
.
intToString
(
dispersionGridSizeY
);
pmeDispersionConvolutionKernel = cu.getKernel(module, "reciprocalDispersionConvolution");
pmeDefines
[
"GRID_SIZE_Z"
]
=
cu
.
intToString
(
dispersionGridSizeZ
);
pmeEvalDispersionEnergyKernel = cu.getKernel(module, "gridEvaluateDispersionEnergy");
pmeDefines
[
"EPSILON_FACTOR"
]
=
"1"
;
pmeInterpolateDispersionForceKernel = cu.getKernel(module, "gridInterpolateDispersionForce");
pmeDefines
[
"RECIP_EXP_FACTOR"
]
=
cu
.
doubleToString
(
M_PI
*
M_PI
/
(
dispersionAlpha
*
dispersionAlpha
));
pmeDefines
[
"USE_LJPME"
]
=
"1"
;
double
invRCut6
=
pow
(
force
.
getCutoffDistance
(),
-
6
);
double
dalphaR
=
dispersionAlpha
*
force
.
getCutoffDistance
();
double
dar2
=
dalphaR
*
dalphaR
;
double
dar4
=
dar2
*
dar2
;
double
multShift6
=
-
invRCut6
*
(
1.0
-
exp
(
-
dar2
)
*
(
1.0
+
dar2
+
0.5
*
dar4
));
defines
[
"INVCUT6"
]
=
cu
.
doubleToString
(
invRCut6
);
defines
[
"MULTSHIFT6"
]
=
cu
.
doubleToString
(
multShift6
);
module
=
cu
.
createModule
(
CudaKernelSources
::
vectorOps
+
CudaKernelSources
::
pme
,
pmeDefines
);
pmeDispersionFinishSpreadChargeKernel
=
cu
.
getKernel
(
module
,
"finishSpreadCharge"
);
pmeDispersionGridIndexKernel
=
cu
.
getKernel
(
module
,
"findAtomGridIndex"
);
pmeDispersionSpreadChargeKernel
=
cu
.
getKernel
(
module
,
"gridSpreadCharge"
);
pmeDispersionConvolutionKernel
=
cu
.
getKernel
(
module
,
"reciprocalConvolution"
);
pmeEvalDispersionEnergyKernel
=
cu
.
getKernel
(
module
,
"gridEvaluateEnergy"
);
pmeInterpolateDispersionForceKernel
=
cu
.
getKernel
(
module
,
"gridInterpolateForce"
);
cuFuncSetCacheConfig
(
pmeDispersionSpreadChargeKernel
,
CU_FUNC_CACHE_PREFER_L1
);
cuFuncSetCacheConfig
(
pmeDispersionSpreadChargeKernel
,
CU_FUNC_CACHE_PREFER_L1
);
}
}
...
...
platforms/cuda/src/kernels/ljpme.cu
deleted
100644 → 0
View file @
dbcc494f
extern
"C"
__global__
void
findAtomDispersionGridIndex
(
const
real4
*
__restrict__
posq
,
int2
*
__restrict__
pmeAtomGridIndex
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
// Compute the index of the grid point each atom is associated with.
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
real4
pos
=
posq
[
i
];
APPLY_PERIODIC_TO_POS
(
pos
)
real3
t
=
make_real3
(
pos
.
x
*
recipBoxVecX
.
x
+
pos
.
y
*
recipBoxVecY
.
x
+
pos
.
z
*
recipBoxVecZ
.
x
,
pos
.
y
*
recipBoxVecY
.
y
+
pos
.
z
*
recipBoxVecZ
.
y
,
pos
.
z
*
recipBoxVecZ
.
z
);
t
.
x
=
(
t
.
x
-
floor
(
t
.
x
))
*
DISPERSION_GRID_SIZE_X
;
t
.
y
=
(
t
.
y
-
floor
(
t
.
y
))
*
DISPERSION_GRID_SIZE_Y
;
t
.
z
=
(
t
.
z
-
floor
(
t
.
z
))
*
DISPERSION_GRID_SIZE_Z
;
int3
gridIndex
=
make_int3
(((
int
)
t
.
x
)
%
DISPERSION_GRID_SIZE_X
,
((
int
)
t
.
y
)
%
DISPERSION_GRID_SIZE_Y
,
((
int
)
t
.
z
)
%
DISPERSION_GRID_SIZE_Z
);
pmeAtomGridIndex
[
i
]
=
make_int2
(
i
,
gridIndex
.
x
*
DISPERSION_GRID_SIZE_Y
*
DISPERSION_GRID_SIZE_Z
+
gridIndex
.
y
*
DISPERSION_GRID_SIZE_Z
+
gridIndex
.
z
);
}
}
extern
"C"
__global__
void
gridSpreadC6
(
const
real4
*
__restrict__
posq
,
real
*
__restrict__
originalPmeGrid
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
,
const
int2
*
__restrict__
pmeAtomGridIndex
,
const
real2
*
__restrict__
sigmaEpsilon
)
{
real3
data
[
PME_ORDER
];
const
real
scale
=
RECIP
(
PME_ORDER
-
1
);
// Process the atoms in spatially sorted order. This improves efficiency when writing
// the grid values.
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
atom
=
pmeAtomGridIndex
[
i
].
x
;
real4
pos
=
posq
[
atom
];
APPLY_PERIODIC_TO_POS
(
pos
)
real3
t
=
make_real3
(
pos
.
x
*
recipBoxVecX
.
x
+
pos
.
y
*
recipBoxVecY
.
x
+
pos
.
z
*
recipBoxVecZ
.
x
,
pos
.
y
*
recipBoxVecY
.
y
+
pos
.
z
*
recipBoxVecZ
.
y
,
pos
.
z
*
recipBoxVecZ
.
z
);
t
.
x
=
(
t
.
x
-
floor
(
t
.
x
))
*
DISPERSION_GRID_SIZE_X
;
t
.
y
=
(
t
.
y
-
floor
(
t
.
y
))
*
DISPERSION_GRID_SIZE_Y
;
t
.
z
=
(
t
.
z
-
floor
(
t
.
z
))
*
DISPERSION_GRID_SIZE_Z
;
int3
gridIndex
=
make_int3
(((
int
)
t
.
x
)
%
DISPERSION_GRID_SIZE_X
,
((
int
)
t
.
y
)
%
DISPERSION_GRID_SIZE_Y
,
((
int
)
t
.
z
)
%
DISPERSION_GRID_SIZE_Z
);
// Since we need the full set of thetas, it's faster to compute them here than load them
// from global memory.
real3
dr
=
make_real3
(
t
.
x
-
(
int
)
t
.
x
,
t
.
y
-
(
int
)
t
.
y
,
t
.
z
-
(
int
)
t
.
z
);
data
[
PME_ORDER
-
1
]
=
make_real3
(
0
);
data
[
1
]
=
dr
;
data
[
0
]
=
make_real3
(
1
)
-
dr
;
for
(
int
j
=
3
;
j
<
PME_ORDER
;
j
++
)
{
real
div
=
RECIP
(
j
-
1
);
data
[
j
-
1
]
=
div
*
dr
*
data
[
j
-
2
];
for
(
int
k
=
1
;
k
<
(
j
-
1
);
k
++
)
data
[
j
-
k
-
1
]
=
div
*
((
dr
+
make_real3
(
k
))
*
data
[
j
-
k
-
2
]
+
(
make_real3
(
j
-
k
)
-
dr
)
*
data
[
j
-
k
-
1
]);
data
[
0
]
=
div
*
(
make_real3
(
1
)
-
dr
)
*
data
[
0
];
}
data
[
PME_ORDER
-
1
]
=
scale
*
dr
*
data
[
PME_ORDER
-
2
];
for
(
int
j
=
1
;
j
<
(
PME_ORDER
-
1
);
j
++
)
data
[
PME_ORDER
-
j
-
1
]
=
scale
*
((
dr
+
make_real3
(
j
))
*
data
[
PME_ORDER
-
j
-
2
]
+
(
make_real3
(
PME_ORDER
-
j
)
-
dr
)
*
data
[
PME_ORDER
-
j
-
1
]);
data
[
0
]
=
scale
*
(
make_real3
(
1
)
-
dr
)
*
data
[
0
];
// Spread the charge from this atom onto each grid point.
const
real2
sigEps
=
sigmaEpsilon
[
atom
];
const
real
c6
=
8
*
sigEps
.
x
*
sigEps
.
x
*
sigEps
.
x
*
sigEps
.
y
;
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
int
xbase
=
gridIndex
.
x
+
ix
;
xbase
-=
(
xbase
>=
DISPERSION_GRID_SIZE_X
?
DISPERSION_GRID_SIZE_X
:
0
);
xbase
=
xbase
*
DISPERSION_GRID_SIZE_Y
*
DISPERSION_GRID_SIZE_Z
;
real
dx
=
data
[
ix
].
x
;
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
int
ybase
=
gridIndex
.
y
+
iy
;
ybase
-=
(
ybase
>=
DISPERSION_GRID_SIZE_Y
?
DISPERSION_GRID_SIZE_Y
:
0
);
ybase
=
xbase
+
ybase
*
DISPERSION_GRID_SIZE_Z
;
real
dy
=
data
[
iy
].
y
;
for
(
int
iz
=
0
;
iz
<
PME_ORDER
;
iz
++
)
{
int
zindex
=
gridIndex
.
z
+
iz
;
zindex
-=
(
zindex
>=
DISPERSION_GRID_SIZE_Z
?
DISPERSION_GRID_SIZE_Z
:
0
);
int
index
=
ybase
+
zindex
;
real
add
=
c6
*
dx
*
dy
*
data
[
iz
].
z
;
#ifdef USE_DOUBLE_PRECISION
unsigned
long
long
*
ulonglong_p
=
(
unsigned
long
long
*
)
originalPmeGrid
;
atomicAdd
(
&
ulonglong_p
[
index
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
add
*
0x100000000
)));
#elif __CUDA_ARCH__ < 200 || defined(USE_DETERMINISTIC_FORCES)
unsigned
long
long
*
ulonglong_p
=
(
unsigned
long
long
*
)
originalPmeGrid
;
int
gridIndex
=
index
;
gridIndex
=
(
gridIndex
%
2
==
0
?
gridIndex
/
2
:
(
gridIndex
+
DISPERSION_GRID_SIZE_X
*
DISPERSION_GRID_SIZE_Y
*
DISPERSION_GRID_SIZE_Z
)
/
2
);
atomicAdd
(
&
ulonglong_p
[
gridIndex
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
add
*
0x100000000
)));
#else
atomicAdd
(
&
originalPmeGrid
[
index
],
add
);
#endif
}
}
}
}
}
extern
"C"
__global__
void
finishSpreadC6
(
long
long
*
__restrict__
originalPmeGrid
)
{
real
*
floatGrid
=
(
real
*
)
originalPmeGrid
;
const
unsigned
int
gridSize
=
DISPERSION_GRID_SIZE_X
*
DISPERSION_GRID_SIZE_Y
*
DISPERSION_GRID_SIZE_Z
;
real
scale
=
1.0
f
/
(
real
)
0x100000000
;
#ifdef USE_DOUBLE_PRECISION
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
gridSize
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
floatGrid
[
index
]
=
scale
*
originalPmeGrid
[
index
];
#else
for
(
int
index
=
2
*
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
);
index
<
gridSize
;
index
+=
2
*
blockDim
.
x
*
gridDim
.
x
)
{
floatGrid
[
index
]
=
scale
*
originalPmeGrid
[
index
/
2
];
if
(
index
+
1
<
gridSize
)
floatGrid
[
index
+
1
]
=
scale
*
originalPmeGrid
[(
index
+
gridSize
+
1
)
/
2
];
}
#endif
}
// convolutes the dispersion grid on the halfcomplex_pmeGrid, which is of size NX*NY*(NZ/2+1) as F(Q) is conjugate symmetric
extern
"C"
__global__
void
reciprocalDispersionConvolution
(
real2
*
__restrict__
halfcomplex_pmeGrid
,
mixed
*
__restrict__
energyBuffer
,
const
real
*
__restrict__
pmeBsplineModuliX
,
const
real
*
__restrict__
pmeBsplineModuliY
,
const
real
*
__restrict__
pmeBsplineModuliZ
,
real4
periodicBoxSize
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
// R2C stores into a half complex matrix where the last dimension is cut by half
const
unsigned
int
gridSize
=
GRID_SIZE_X
*
GRID_SIZE_Y
*
(
GRID_SIZE_Z
/
2
+
1
);
const
real
scaleFactor
=
-
2
*
M_PI
*
SQRT
(
M_PI
)
*
RECIP
(
6
*
periodicBoxSize
.
x
*
periodicBoxSize
.
y
*
periodicBoxSize
.
z
);
const
real
alpha
=
EWALD_DISPERSION_ALPHA
;
real
bfac
=
M_PI
/
alpha
;
real
fac1
=
2
*
M_PI
*
M_PI
*
M_PI
*
SQRT
(
M_PI
);
real
fac2
=
alpha
*
alpha
*
alpha
;
real
fac3
=
-
2
*
alpha
*
M_PI
*
M_PI
;
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
gridSize
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
// real indices
int
kx
=
index
/
(
GRID_SIZE_Y
*
(
GRID_SIZE_Z
/
2
+
1
));
int
remainder
=
index
-
kx
*
GRID_SIZE_Y
*
(
GRID_SIZE_Z
/
2
+
1
);
int
ky
=
remainder
/
(
GRID_SIZE_Z
/
2
+
1
);
int
kz
=
remainder
-
ky
*
(
GRID_SIZE_Z
/
2
+
1
);
int
mx
=
(
kx
<
(
GRID_SIZE_X
+
1
)
/
2
)
?
kx
:
(
kx
-
GRID_SIZE_X
);
int
my
=
(
ky
<
(
GRID_SIZE_Y
+
1
)
/
2
)
?
ky
:
(
ky
-
GRID_SIZE_Y
);
int
mz
=
(
kz
<
(
GRID_SIZE_Z
+
1
)
/
2
)
?
kz
:
(
kz
-
GRID_SIZE_Z
);
real
mhx
=
mx
*
recipBoxVecX
.
x
;
real
mhy
=
mx
*
recipBoxVecY
.
x
+
my
*
recipBoxVecY
.
y
;
real
mhz
=
mx
*
recipBoxVecZ
.
x
+
my
*
recipBoxVecZ
.
y
+
mz
*
recipBoxVecZ
.
z
;
real
bx
=
pmeBsplineModuliX
[
kx
];
real
by
=
pmeBsplineModuliY
[
ky
];
real
bz
=
pmeBsplineModuliZ
[
kz
];
real
denom
=
scaleFactor
/
(
bx
*
by
*
bz
);
real2
grid
=
halfcomplex_pmeGrid
[
index
];
real
m2
=
mhx
*
mhx
+
mhy
*
mhy
+
mhz
*
mhz
;
real
m
=
SQRT
(
m2
);
real
m3
=
m
*
m2
;
real
b
=
bfac
*
m
;
real
expfac
=
-
b
*
b
;
real
expterm
=
EXP
(
expfac
);
#if FAST_ERFC
// This approximation for erfc is from Abramowitz and Stegun (1964) p. 299. They cite the following as
// the original source: C. Hastings, Jr., Approximations for Digital Computers (1955). It has a maximum
// error of 1.5e-7. Stolen by ACS from the CUDA platform's AMOEBA plugin.
real
t
=
1.0
f
/
(
1.0
f
+
0.3275911
f
*
b
);
real
erfcterm
=
(
0.254829592
f
+
(
-
0.284496736
f
+
(
1.421413741
f
+
(
-
1.453152027
f
+
1.061405429
f
*
t
)
*
t
)
*
t
)
*
t
)
*
t
*
expterm
;
#else
real
erfcterm
=
ERFC
(
b
);
#endif
real
eterm
=
(
fac1
*
erfcterm
*
m3
+
expterm
*
(
fac2
+
fac3
*
m2
))
*
denom
;
halfcomplex_pmeGrid
[
index
]
=
make_real2
(
grid
.
x
*
eterm
,
grid
.
y
*
eterm
);
}
}
extern
"C"
__global__
void
gridEvaluateDispersionEnergy
(
real2
*
__restrict__
halfcomplex_pmeGrid
,
mixed
*
__restrict__
energyBuffer
,
const
real
*
__restrict__
pmeBsplineModuliX
,
const
real
*
__restrict__
pmeBsplineModuliY
,
const
real
*
__restrict__
pmeBsplineModuliZ
,
real4
periodicBoxSize
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
// R2C stores into a half complex matrix where the last dimension is cut by half
const
unsigned
int
gridSize
=
DISPERSION_GRID_SIZE_X
*
DISPERSION_GRID_SIZE_Y
*
DISPERSION_GRID_SIZE_Z
;
const
real
scaleFactor
=
-
2
*
M_PI
*
SQRT
(
M_PI
)
*
RECIP
(
6
*
periodicBoxSize
.
x
*
periodicBoxSize
.
y
*
periodicBoxSize
.
z
);
const
real
alpha
=
EWALD_DISPERSION_ALPHA
;
real
bfac
=
M_PI
/
alpha
;
real
fac1
=
2
*
M_PI
*
M_PI
*
M_PI
*
SQRT
(
M_PI
);
real
fac2
=
alpha
*
alpha
*
alpha
;
real
fac3
=
-
2
*
alpha
*
M_PI
*
M_PI
;
mixed
energy
=
0
;
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
gridSize
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
// real indices
int
kx
=
index
/
(
DISPERSION_GRID_SIZE_Y
*
(
DISPERSION_GRID_SIZE_Z
));
int
remainder
=
index
-
kx
*
DISPERSION_GRID_SIZE_Y
*
(
DISPERSION_GRID_SIZE_Z
);
int
ky
=
remainder
/
(
DISPERSION_GRID_SIZE_Z
);
int
kz
=
remainder
-
ky
*
(
DISPERSION_GRID_SIZE_Z
);
int
mx
=
(
kx
<
(
DISPERSION_GRID_SIZE_X
+
1
)
/
2
)
?
kx
:
(
kx
-
DISPERSION_GRID_SIZE_X
);
int
my
=
(
ky
<
(
DISPERSION_GRID_SIZE_Y
+
1
)
/
2
)
?
ky
:
(
ky
-
DISPERSION_GRID_SIZE_Y
);
int
mz
=
(
kz
<
(
DISPERSION_GRID_SIZE_Z
+
1
)
/
2
)
?
kz
:
(
kz
-
DISPERSION_GRID_SIZE_Z
);
real
mhx
=
mx
*
recipBoxVecX
.
x
;
real
mhy
=
mx
*
recipBoxVecY
.
x
+
my
*
recipBoxVecY
.
y
;
real
mhz
=
mx
*
recipBoxVecZ
.
x
+
my
*
recipBoxVecZ
.
y
+
mz
*
recipBoxVecZ
.
z
;
real
m2
=
mhx
*
mhx
+
mhy
*
mhy
+
mhz
*
mhz
;
real
bx
=
pmeBsplineModuliX
[
kx
];
real
by
=
pmeBsplineModuliY
[
ky
];
real
bz
=
pmeBsplineModuliZ
[
kz
];
real
denom
=
scaleFactor
/
(
bx
*
by
*
bz
);
real
m
=
SQRT
(
m2
);
real
m3
=
m
*
m2
;
real
b
=
bfac
*
m
;
real
expfac
=
-
b
*
b
;
real
expterm
=
EXP
(
expfac
);
#if FAST_ERFC
// This approximation for erfc is from Abramowitz and Stegun (1964) p. 299. They cite the following as
// the original source: C. Hastings, Jr., Approximations for Digital Computers (1955). It has a maximum
// error of 1.5e-7. Stolen by ACS from the CUDA platform's AMOEBA plugin.
real
t
=
1.0
f
/
(
1.0
f
+
0.3275911
f
*
b
);
real
erfcterm
=
(
0.254829592
f
+
(
-
0.284496736
f
+
(
1.421413741
f
+
(
-
1.453152027
f
+
1.061405429
f
*
t
)
*
t
)
*
t
)
*
t
)
*
t
*
expterm
;
#else
real
erfcterm
=
ERFC
(
b
);
#endif
real
eterm
=
(
fac1
*
erfcterm
*
m3
+
expterm
*
(
fac2
+
fac3
*
m2
))
*
denom
;
if
(
kz
>=
(
DISPERSION_GRID_SIZE_Z
/
2
+
1
))
{
kx
=
((
kx
==
0
)
?
kx
:
DISPERSION_GRID_SIZE_X
-
kx
);
ky
=
((
ky
==
0
)
?
ky
:
DISPERSION_GRID_SIZE_Y
-
ky
);
kz
=
DISPERSION_GRID_SIZE_Z
-
kz
;
}
int
indexInHalfComplexGrid
=
kz
+
ky
*
(
DISPERSION_GRID_SIZE_Z
/
2
+
1
)
+
kx
*
(
DISPERSION_GRID_SIZE_Y
*
(
DISPERSION_GRID_SIZE_Z
/
2
+
1
));
real2
grid
=
halfcomplex_pmeGrid
[
indexInHalfComplexGrid
];
// N.B. We inlcude the 0,0,0 point for dispersion
energy
+=
eterm
*
(
grid
.
x
*
grid
.
x
+
grid
.
y
*
grid
.
y
);
}
#ifdef USE_PME_STREAM
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
=
0.5
f
*
energy
;
#else
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
0.5
f
*
energy
;
#endif
}
extern
"C"
__global__
void
gridInterpolateDispersionForce
(
const
real4
*
__restrict__
posq
,
unsigned
long
long
*
__restrict__
forceBuffers
,
const
real
*
__restrict__
originalPmeGrid
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
,
const
int2
*
__restrict__
pmeAtomGridIndex
,
const
real2
*
__restrict__
sigmaEpsilon
)
{
real3
data
[
PME_ORDER
];
real3
ddata
[
PME_ORDER
];
const
real
scale
=
RECIP
(
PME_ORDER
-
1
);
// Process the atoms in spatially sorted order. This improves cache performance when loading
// the grid values.
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
atom
=
pmeAtomGridIndex
[
i
].
x
;
real3
force
=
make_real3
(
0
);
real4
pos
=
posq
[
atom
];
APPLY_PERIODIC_TO_POS
(
pos
)
real3
t
=
make_real3
(
pos
.
x
*
recipBoxVecX
.
x
+
pos
.
y
*
recipBoxVecY
.
x
+
pos
.
z
*
recipBoxVecZ
.
x
,
pos
.
y
*
recipBoxVecY
.
y
+
pos
.
z
*
recipBoxVecZ
.
y
,
pos
.
z
*
recipBoxVecZ
.
z
);
t
.
x
=
(
t
.
x
-
floor
(
t
.
x
))
*
DISPERSION_GRID_SIZE_X
;
t
.
y
=
(
t
.
y
-
floor
(
t
.
y
))
*
DISPERSION_GRID_SIZE_Y
;
t
.
z
=
(
t
.
z
-
floor
(
t
.
z
))
*
DISPERSION_GRID_SIZE_Z
;
int3
gridIndex
=
make_int3
(((
int
)
t
.
x
)
%
DISPERSION_GRID_SIZE_X
,
((
int
)
t
.
y
)
%
DISPERSION_GRID_SIZE_Y
,
((
int
)
t
.
z
)
%
DISPERSION_GRID_SIZE_Z
);
// Since we need the full set of thetas, it's faster to compute them here than load them
// from global memory.
real3
dr
=
make_real3
(
t
.
x
-
(
int
)
t
.
x
,
t
.
y
-
(
int
)
t
.
y
,
t
.
z
-
(
int
)
t
.
z
);
data
[
PME_ORDER
-
1
]
=
make_real3
(
0
);
data
[
1
]
=
dr
;
data
[
0
]
=
make_real3
(
1
)
-
dr
;
for
(
int
j
=
3
;
j
<
PME_ORDER
;
j
++
)
{
real
div
=
RECIP
(
j
-
1
);
data
[
j
-
1
]
=
div
*
dr
*
data
[
j
-
2
];
for
(
int
k
=
1
;
k
<
(
j
-
1
);
k
++
)
data
[
j
-
k
-
1
]
=
div
*
((
dr
+
make_real3
(
k
))
*
data
[
j
-
k
-
2
]
+
(
make_real3
(
j
-
k
)
-
dr
)
*
data
[
j
-
k
-
1
]);
data
[
0
]
=
div
*
(
make_real3
(
1
)
-
dr
)
*
data
[
0
];
}
ddata
[
0
]
=
-
data
[
0
];
for
(
int
j
=
1
;
j
<
PME_ORDER
;
j
++
)
ddata
[
j
]
=
data
[
j
-
1
]
-
data
[
j
];
data
[
PME_ORDER
-
1
]
=
scale
*
dr
*
data
[
PME_ORDER
-
2
];
for
(
int
j
=
1
;
j
<
(
PME_ORDER
-
1
);
j
++
)
data
[
PME_ORDER
-
j
-
1
]
=
scale
*
((
dr
+
make_real3
(
j
))
*
data
[
PME_ORDER
-
j
-
2
]
+
(
make_real3
(
PME_ORDER
-
j
)
-
dr
)
*
data
[
PME_ORDER
-
j
-
1
]);
data
[
0
]
=
scale
*
(
make_real3
(
1
)
-
dr
)
*
data
[
0
];
// Compute the force on this atom.
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
int
xbase
=
gridIndex
.
x
+
ix
;
xbase
-=
(
xbase
>=
DISPERSION_GRID_SIZE_X
?
DISPERSION_GRID_SIZE_X
:
0
);
xbase
=
xbase
*
DISPERSION_GRID_SIZE_Y
*
DISPERSION_GRID_SIZE_Z
;
real
dx
=
data
[
ix
].
x
;
real
ddx
=
ddata
[
ix
].
x
;
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
int
ybase
=
gridIndex
.
y
+
iy
;
ybase
-=
(
ybase
>=
DISPERSION_GRID_SIZE_Y
?
DISPERSION_GRID_SIZE_Y
:
0
);
ybase
=
xbase
+
ybase
*
DISPERSION_GRID_SIZE_Z
;
real
dy
=
data
[
iy
].
y
;
real
ddy
=
ddata
[
iy
].
y
;
for
(
int
iz
=
0
;
iz
<
PME_ORDER
;
iz
++
)
{
int
zindex
=
gridIndex
.
z
+
iz
;
zindex
-=
(
zindex
>=
DISPERSION_GRID_SIZE_Z
?
DISPERSION_GRID_SIZE_Z
:
0
);
int
index
=
ybase
+
zindex
;
real
gridvalue
=
originalPmeGrid
[
index
];
force
.
x
+=
ddx
*
dy
*
data
[
iz
].
z
*
gridvalue
;
force
.
y
+=
dx
*
ddy
*
data
[
iz
].
z
*
gridvalue
;
force
.
z
+=
dx
*
dy
*
ddata
[
iz
].
z
*
gridvalue
;
}
}
}
const
real2
sigEps
=
sigmaEpsilon
[
atom
];
real
q
=
8
*
sigEps
.
x
*
sigEps
.
x
*
sigEps
.
x
*
sigEps
.
y
;
real
forceX
=
-
q
*
(
force
.
x
*
DISPERSION_GRID_SIZE_X
*
recipBoxVecX
.
x
);
real
forceY
=
-
q
*
(
force
.
x
*
DISPERSION_GRID_SIZE_X
*
recipBoxVecY
.
x
+
force
.
y
*
DISPERSION_GRID_SIZE_Y
*
recipBoxVecY
.
y
);
real
forceZ
=
-
q
*
(
force
.
x
*
DISPERSION_GRID_SIZE_X
*
recipBoxVecZ
.
x
+
force
.
y
*
DISPERSION_GRID_SIZE_Y
*
recipBoxVecZ
.
y
+
force
.
z
*
DISPERSION_GRID_SIZE_Z
*
recipBoxVecZ
.
z
);
atomicAdd
(
&
forceBuffers
[
atom
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
forceX
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
forceY
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
forceZ
*
0x100000000
)));
}
}
platforms/cuda/src/kernels/pme.cu
View file @
f7a102fb
...
@@ -21,7 +21,11 @@ extern "C" __global__ void findAtomGridIndex(const real4* __restrict__ posq, int
...
@@ -21,7 +21,11 @@ extern "C" __global__ void findAtomGridIndex(const real4* __restrict__ posq, int
extern
"C"
__global__
void
gridSpreadCharge
(
const
real4
*
__restrict__
posq
,
real
*
__restrict__
originalPmeGrid
,
extern
"C"
__global__
void
gridSpreadCharge
(
const
real4
*
__restrict__
posq
,
real
*
__restrict__
originalPmeGrid
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
,
const
int2
*
__restrict__
pmeAtomGridIndex
)
{
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
,
const
int2
*
__restrict__
pmeAtomGridIndex
#ifdef USE_LJPME
,
const
real2
*
__restrict__
sigmaEpsilon
#endif
)
{
real3
data
[
PME_ORDER
];
real3
data
[
PME_ORDER
];
const
real
scale
=
RECIP
(
PME_ORDER
-
1
);
const
real
scale
=
RECIP
(
PME_ORDER
-
1
);
...
@@ -63,6 +67,12 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
...
@@ -63,6 +67,12 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
// Spread the charge from this atom onto each grid point.
// Spread the charge from this atom onto each grid point.
#ifdef USE_LJPME
const
real2
sigEps
=
sigmaEpsilon
[
atom
];
const
real
charge
=
8
*
sigEps
.
x
*
sigEps
.
x
*
sigEps
.
x
*
sigEps
.
y
;
#else
const
real
charge
=
pos
.
w
;
#endif
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
int
xbase
=
gridIndex
.
x
+
ix
;
int
xbase
=
gridIndex
.
x
+
ix
;
xbase
-=
(
xbase
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
);
xbase
-=
(
xbase
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
);
...
@@ -80,7 +90,7 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
...
@@ -80,7 +90,7 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
zindex
-=
(
zindex
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
);
zindex
-=
(
zindex
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
);
int
index
=
ybase
+
zindex
;
int
index
=
ybase
+
zindex
;
real
add
=
pos
.
w
*
dx
*
dy
*
data
[
iz
].
z
;
real
add
=
charge
*
dx
*
dy
*
data
[
iz
].
z
;
#ifdef USE_DOUBLE_PRECISION
#ifdef USE_DOUBLE_PRECISION
unsigned
long
long
*
ulonglong_p
=
(
unsigned
long
long
*
)
originalPmeGrid
;
unsigned
long
long
*
ulonglong_p
=
(
unsigned
long
long
*
)
originalPmeGrid
;
atomicAdd
(
&
ulonglong_p
[
index
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
add
*
0x100000000
)));
atomicAdd
(
&
ulonglong_p
[
index
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
add
*
0x100000000
)));
...
@@ -121,7 +131,15 @@ reciprocalConvolution(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict
...
@@ -121,7 +131,15 @@ reciprocalConvolution(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict
real4
periodicBoxSize
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
real4
periodicBoxSize
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
// R2C stores into a half complex matrix where the last dimension is cut by half
// R2C stores into a half complex matrix where the last dimension is cut by half
const
unsigned
int
gridSize
=
GRID_SIZE_X
*
GRID_SIZE_Y
*
(
GRID_SIZE_Z
/
2
+
1
);
const
unsigned
int
gridSize
=
GRID_SIZE_X
*
GRID_SIZE_Y
*
(
GRID_SIZE_Z
/
2
+
1
);
#ifdef USE_LJPME
const
real
recipScaleFactor
=
-
2
*
M_PI
*
SQRT
(
M_PI
)
*
RECIP
(
6
*
periodicBoxSize
.
x
*
periodicBoxSize
.
y
*
periodicBoxSize
.
z
);
real
bfac
=
M_PI
/
EWALD_ALPHA
;
real
fac1
=
2
*
M_PI
*
M_PI
*
M_PI
*
SQRT
(
M_PI
);
real
fac2
=
EWALD_ALPHA
*
EWALD_ALPHA
*
EWALD_ALPHA
;
real
fac3
=
-
2
*
EWALD_ALPHA
*
M_PI
*
M_PI
;
#else
const
real
recipScaleFactor
=
RECIP
(
M_PI
*
periodicBoxSize
.
x
*
periodicBoxSize
.
y
*
periodicBoxSize
.
z
);
const
real
recipScaleFactor
=
RECIP
(
M_PI
*
periodicBoxSize
.
x
*
periodicBoxSize
.
y
*
periodicBoxSize
.
z
);
#endif
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
gridSize
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
gridSize
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
// real indices
// real indices
...
@@ -140,12 +158,23 @@ reciprocalConvolution(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict
...
@@ -140,12 +158,23 @@ reciprocalConvolution(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict
real
bz
=
pmeBsplineModuliZ
[
kz
];
real
bz
=
pmeBsplineModuliZ
[
kz
];
real2
grid
=
halfcomplex_pmeGrid
[
index
];
real2
grid
=
halfcomplex_pmeGrid
[
index
];
real
m2
=
mhx
*
mhx
+
mhy
*
mhy
+
mhz
*
mhz
;
real
m2
=
mhx
*
mhx
+
mhy
*
mhy
+
mhz
*
mhz
;
#ifdef USE_LJPME
real
denom
=
recipScaleFactor
/
(
bx
*
by
*
bz
);
real
m
=
SQRT
(
m2
);
real
m3
=
m
*
m2
;
real
b
=
bfac
*
m
;
real
expfac
=
-
b
*
b
;
real
expterm
=
EXP
(
expfac
);
real
erfcterm
=
ERFC
(
b
);
real
eterm
=
(
fac1
*
erfcterm
*
m3
+
expterm
*
(
fac2
+
fac3
*
m2
))
*
denom
;
halfcomplex_pmeGrid
[
index
]
=
make_real2
(
grid
.
x
*
eterm
,
grid
.
y
*
eterm
);
#else
real
denom
=
m2
*
bx
*
by
*
bz
;
real
denom
=
m2
*
bx
*
by
*
bz
;
real
eterm
=
recipScaleFactor
*
EXP
(
-
RECIP_EXP_FACTOR
*
m2
)
/
denom
;
real
eterm
=
recipScaleFactor
*
EXP
(
-
RECIP_EXP_FACTOR
*
m2
)
/
denom
;
if
(
kx
!=
0
||
ky
!=
0
||
kz
!=
0
)
{
if
(
kx
!=
0
||
ky
!=
0
||
kz
!=
0
)
{
halfcomplex_pmeGrid
[
index
]
=
make_real2
(
grid
.
x
*
eterm
,
grid
.
y
*
eterm
);
halfcomplex_pmeGrid
[
index
]
=
make_real2
(
grid
.
x
*
eterm
,
grid
.
y
*
eterm
);
}
}
#endif
}
}
}
}
...
@@ -156,7 +185,15 @@ gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__
...
@@ -156,7 +185,15 @@ gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__
real4
periodicBoxSize
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
real4
periodicBoxSize
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
// R2C stores into a half complex matrix where the last dimension is cut by half
// R2C stores into a half complex matrix where the last dimension is cut by half
const
unsigned
int
gridSize
=
GRID_SIZE_X
*
GRID_SIZE_Y
*
GRID_SIZE_Z
;
const
unsigned
int
gridSize
=
GRID_SIZE_X
*
GRID_SIZE_Y
*
GRID_SIZE_Z
;
#ifdef USE_LJPME
const
real
recipScaleFactor
=
-
2
*
M_PI
*
SQRT
(
M_PI
)
*
RECIP
(
6
*
periodicBoxSize
.
x
*
periodicBoxSize
.
y
*
periodicBoxSize
.
z
);
real
bfac
=
M_PI
/
EWALD_ALPHA
;
real
fac1
=
2
*
M_PI
*
M_PI
*
M_PI
*
SQRT
(
M_PI
);
real
fac2
=
EWALD_ALPHA
*
EWALD_ALPHA
*
EWALD_ALPHA
;
real
fac3
=
-
2
*
EWALD_ALPHA
*
M_PI
*
M_PI
;
#else
const
real
recipScaleFactor
=
RECIP
(
M_PI
*
periodicBoxSize
.
x
*
periodicBoxSize
.
y
*
periodicBoxSize
.
z
);
const
real
recipScaleFactor
=
RECIP
(
M_PI
*
periodicBoxSize
.
x
*
periodicBoxSize
.
y
*
periodicBoxSize
.
z
);
#endif
mixed
energy
=
0
;
mixed
energy
=
0
;
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
gridSize
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
gridSize
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
...
@@ -175,8 +212,19 @@ gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__
...
@@ -175,8 +212,19 @@ gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__
real
bx
=
pmeBsplineModuliX
[
kx
];
real
bx
=
pmeBsplineModuliX
[
kx
];
real
by
=
pmeBsplineModuliY
[
ky
];
real
by
=
pmeBsplineModuliY
[
ky
];
real
bz
=
pmeBsplineModuliZ
[
kz
];
real
bz
=
pmeBsplineModuliZ
[
kz
];
#ifdef USE_LJPME
real
denom
=
recipScaleFactor
/
(
bx
*
by
*
bz
);
real
m
=
SQRT
(
m2
);
real
m3
=
m
*
m2
;
real
b
=
bfac
*
m
;
real
expfac
=
-
b
*
b
;
real
expterm
=
EXP
(
expfac
);
real
erfcterm
=
ERFC
(
b
);
real
eterm
=
(
fac1
*
erfcterm
*
m3
+
expterm
*
(
fac2
+
fac3
*
m2
))
*
denom
;
#else
real
denom
=
m2
*
bx
*
by
*
bz
;
real
denom
=
m2
*
bx
*
by
*
bz
;
real
eterm
=
recipScaleFactor
*
EXP
(
-
RECIP_EXP_FACTOR
*
m2
)
/
denom
;
real
eterm
=
recipScaleFactor
*
EXP
(
-
RECIP_EXP_FACTOR
*
m2
)
/
denom
;
#endif
if
(
kz
>=
(
GRID_SIZE_Z
/
2
+
1
))
{
if
(
kz
>=
(
GRID_SIZE_Z
/
2
+
1
))
{
kx
=
((
kx
==
0
)
?
kx
:
GRID_SIZE_X
-
kx
);
kx
=
((
kx
==
0
)
?
kx
:
GRID_SIZE_X
-
kx
);
...
@@ -185,10 +233,11 @@ gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__
...
@@ -185,10 +233,11 @@ gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__
}
}
int
indexInHalfComplexGrid
=
kz
+
ky
*
(
GRID_SIZE_Z
/
2
+
1
)
+
kx
*
(
GRID_SIZE_Y
*
(
GRID_SIZE_Z
/
2
+
1
));
int
indexInHalfComplexGrid
=
kz
+
ky
*
(
GRID_SIZE_Z
/
2
+
1
)
+
kx
*
(
GRID_SIZE_Y
*
(
GRID_SIZE_Z
/
2
+
1
));
real2
grid
=
halfcomplex_pmeGrid
[
indexInHalfComplexGrid
];
real2
grid
=
halfcomplex_pmeGrid
[
indexInHalfComplexGrid
];
if
(
kx
!=
0
||
ky
!=
0
||
kz
!=
0
)
{
#ifndef USE_LJPME
if
(
kx
!=
0
||
ky
!=
0
||
kz
!=
0
)
#endif
energy
+=
eterm
*
(
grid
.
x
*
grid
.
x
+
grid
.
y
*
grid
.
y
);
energy
+=
eterm
*
(
grid
.
x
*
grid
.
x
+
grid
.
y
*
grid
.
y
);
}
}
}
#ifdef USE_PME_STREAM
#ifdef USE_PME_STREAM
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
=
0.5
f
*
energy
;
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
=
0.5
f
*
energy
;
#else
#else
...
@@ -199,7 +248,11 @@ gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__
...
@@ -199,7 +248,11 @@ gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__
extern
"C"
__global__
extern
"C"
__global__
void
gridInterpolateForce
(
const
real4
*
__restrict__
posq
,
unsigned
long
long
*
__restrict__
forceBuffers
,
const
real
*
__restrict__
originalPmeGrid
,
void
gridInterpolateForce
(
const
real4
*
__restrict__
posq
,
unsigned
long
long
*
__restrict__
forceBuffers
,
const
real
*
__restrict__
originalPmeGrid
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
,
const
int2
*
__restrict__
pmeAtomGridIndex
)
{
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
,
const
int2
*
__restrict__
pmeAtomGridIndex
#ifdef USE_LJPME
,
const
real2
*
__restrict__
sigmaEpsilon
#endif
)
{
real3
data
[
PME_ORDER
];
real3
data
[
PME_ORDER
];
real3
ddata
[
PME_ORDER
];
real3
ddata
[
PME_ORDER
];
const
real
scale
=
RECIP
(
PME_ORDER
-
1
);
const
real
scale
=
RECIP
(
PME_ORDER
-
1
);
...
@@ -271,7 +324,12 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __
...
@@ -271,7 +324,12 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __
}
}
}
}
}
}
#ifdef USE_LJPME
const
real2
sigEps
=
sigmaEpsilon
[
atom
];
real
q
=
8
*
sigEps
.
x
*
sigEps
.
x
*
sigEps
.
x
*
sigEps
.
y
;
#else
real
q
=
pos
.
w
*
EPSILON_FACTOR
;
real
q
=
pos
.
w
*
EPSILON_FACTOR
;
#endif
real
forceX
=
-
q
*
(
force
.
x
*
GRID_SIZE_X
*
recipBoxVecX
.
x
);
real
forceX
=
-
q
*
(
force
.
x
*
GRID_SIZE_X
*
recipBoxVecX
.
x
);
real
forceY
=
-
q
*
(
force
.
x
*
GRID_SIZE_X
*
recipBoxVecY
.
x
+
force
.
y
*
GRID_SIZE_Y
*
recipBoxVecY
.
y
);
real
forceY
=
-
q
*
(
force
.
x
*
GRID_SIZE_X
*
recipBoxVecY
.
x
+
force
.
y
*
GRID_SIZE_Y
*
recipBoxVecY
.
y
);
real
forceZ
=
-
q
*
(
force
.
x
*
GRID_SIZE_X
*
recipBoxVecZ
.
x
+
force
.
y
*
GRID_SIZE_Y
*
recipBoxVecZ
.
y
+
force
.
z
*
GRID_SIZE_Z
*
recipBoxVecZ
.
z
);
real
forceZ
=
-
q
*
(
force
.
x
*
GRID_SIZE_X
*
recipBoxVecZ
.
x
+
force
.
y
*
GRID_SIZE_Y
*
recipBoxVecZ
.
y
+
force
.
z
*
GRID_SIZE_Z
*
recipBoxVecZ
.
z
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment