Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
fb62fecb
Commit
fb62fecb
authored
Sep 03, 2010
by
Peter Eastman
Browse files
Beginning to implement PME reciprocal space calculation
parent
cf335495
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
406 additions
and
1 deletion
+406
-1
plugins/amoeba/platforms/cuda/src/kernels/AmoebaGpu.cpp
plugins/amoeba/platforms/cuda/src/kernels/AmoebaGpu.cpp
+135
-1
plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
...ins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
+4
-0
plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaTypes.h
plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaTypes.h
+9
-0
plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
+8
-0
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPME.cu
...eba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPME.cu
+250
-0
No files found.
plugins/amoeba/platforms/cuda/src/kernels/AmoebaGpu.cpp
View file @
fb62fecb
...
@@ -29,6 +29,9 @@
...
@@ -29,6 +29,9 @@
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
* -------------------------------------------------------------------------- */
#ifdef WIN32
#define _USE_MATH_DEFINES
/* M_PI */
#endif
#include "openmm/OpenMMException.h"
#include "openmm/OpenMMException.h"
#include "cudaKernels.h"
#include "cudaKernels.h"
#include "amoebaCudaKernels.h"
#include "amoebaCudaKernels.h"
...
@@ -37,9 +40,11 @@
...
@@ -37,9 +40,11 @@
extern
void
OPENMMCUDA_EXPORT
SetCalculateObcGbsaForces2Sim
(
gpuContext
gpu
);
extern
void
OPENMMCUDA_EXPORT
SetCalculateObcGbsaForces2Sim
(
gpuContext
gpu
);
extern
void
OPENMMCUDA_EXPORT
SetForcesSim
(
gpuContext
gpu
);
extern
void
OPENMMCUDA_EXPORT
SetForcesSim
(
gpuContext
gpu
);
#include <cmath>
#include <sstream>
#include <sstream>
#include <limits>
#include <limits>
#include <cstring>
#include <cstring>
#include <vector>
#ifdef WIN32
#ifdef WIN32
// #include <windows.h>
// #include <windows.h>
...
@@ -49,6 +54,8 @@ extern void OPENMMCUDA_EXPORT SetForcesSim(gpuContext gpu);
...
@@ -49,6 +54,8 @@ extern void OPENMMCUDA_EXPORT SetForcesSim(gpuContext gpu);
//#define AMOEBA_DEBUG
//#define AMOEBA_DEBUG
#undef AMOEBA_DEBUG
#undef AMOEBA_DEBUG
using
std
::
vector
;
extern
"C"
extern
"C"
amoebaGpuContext
amoebaGpuInit
(
_gpuContext
*
gpu
)
amoebaGpuContext
amoebaGpuInit
(
_gpuContext
*
gpu
)
{
{
...
@@ -68,6 +75,12 @@ amoebaGpuContext amoebaGpuInit( _gpuContext* gpu )
...
@@ -68,6 +75,12 @@ amoebaGpuContext amoebaGpuInit( _gpuContext* gpu )
amoebaGpu
->
amoebaSim
.
numberOfAtoms
=
gpu
->
natoms
;
amoebaGpu
->
amoebaSim
.
numberOfAtoms
=
gpu
->
natoms
;
amoebaGpu
->
amoebaSim
.
paddedNumberOfAtoms
=
gpu
->
sim
.
paddedNumberOfAtoms
;
amoebaGpu
->
amoebaSim
.
paddedNumberOfAtoms
=
gpu
->
sim
.
paddedNumberOfAtoms
;
amoebaGpu
->
psThetai1
=
NULL
;
amoebaGpu
->
psThetai2
=
NULL
;
amoebaGpu
->
psThetai3
=
NULL
;
amoebaGpu
->
psQfac
=
NULL
;
amoebaGpu
->
psIgrid
=
NULL
;
return
amoebaGpu
;
return
amoebaGpu
;
}
}
...
@@ -2176,6 +2189,118 @@ void gpuSetAmoebaVdwParameters( amoebaGpuContext amoebaGpu,
...
@@ -2176,6 +2189,118 @@ void gpuSetAmoebaVdwParameters( amoebaGpuContext amoebaGpu,
}
}
extern
"C"
void
gpuSetAmoebaPMEParameters
(
amoebaGpuContext
amoebaGpu
,
float
alpha
,
int
gridSizeX
,
int
gridSizeY
,
int
gridSizeZ
)
{
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
gpu
->
sim
.
alphaEwald
=
alpha
;
int3
gridSize
=
make_int3
(
gridSizeX
,
gridSizeY
,
gridSizeZ
);
gpu
->
sim
.
pmeGridSize
=
gridSize
;
int3
groupSize
=
make_int3
(
2
,
4
,
4
);
gpu
->
sim
.
pmeGroupSize
=
groupSize
;
const
int3
numGroups
=
make_int3
((
gridSize
.
x
+
groupSize
.
x
-
1
)
/
groupSize
.
x
,
(
gridSize
.
y
+
groupSize
.
y
-
1
)
/
groupSize
.
y
,
(
gridSize
.
z
+
groupSize
.
z
-
1
)
/
groupSize
.
z
);
const
unsigned
int
totalGroups
=
numGroups
.
x
*
numGroups
.
y
*
numGroups
.
z
;
cufftPlan3d
(
&
gpu
->
fftplan
,
gridSize
.
x
,
gridSize
.
y
,
gridSize
.
z
,
CUFFT_C2C
);
gpu
->
psPmeGrid
=
new
CUDAStream
<
cufftComplex
>
(
gridSize
.
x
*
gridSize
.
y
*
gridSize
.
z
,
1
,
"PmeGrid"
);
gpu
->
sim
.
pPmeGrid
=
gpu
->
psPmeGrid
->
_pDevData
;
gpu
->
psPmeBsplineModuli
[
0
]
=
new
CUDAStream
<
float
>
(
gridSize
.
x
,
1
,
"PmeBsplineModuli0"
);
gpu
->
sim
.
pPmeBsplineModuli
[
0
]
=
gpu
->
psPmeBsplineModuli
[
0
]
->
_pDevData
;
gpu
->
psPmeBsplineModuli
[
1
]
=
new
CUDAStream
<
float
>
(
gridSize
.
y
,
1
,
"PmeBsplineModuli1"
);
gpu
->
sim
.
pPmeBsplineModuli
[
1
]
=
gpu
->
psPmeBsplineModuli
[
1
]
->
_pDevData
;
gpu
->
psPmeBsplineModuli
[
2
]
=
new
CUDAStream
<
float
>
(
gridSize
.
z
,
1
,
"PmeBsplineModuli2"
);
gpu
->
sim
.
pPmeBsplineModuli
[
2
]
=
gpu
->
psPmeBsplineModuli
[
2
]
->
_pDevData
;
amoebaGpu
->
psQfac
=
new
CUDAStream
<
float
>
(
gridSize
.
x
*
gridSize
.
y
*
gridSize
.
z
,
1
,
"qfac"
);
amoebaGpu
->
amoebaSim
.
pQfac
=
amoebaGpu
->
psQfac
->
_pDevData
;
amoebaGpu
->
psThetai1
=
new
CUDAStream
<
float4
>
(
AMOEBA_PME_ORDER
*
gpu
->
natoms
,
1
,
"thetai1"
);
amoebaGpu
->
amoebaSim
.
pThetai1
=
amoebaGpu
->
psThetai1
->
_pDevData
;
amoebaGpu
->
psThetai2
=
new
CUDAStream
<
float4
>
(
AMOEBA_PME_ORDER
*
gpu
->
natoms
,
1
,
"thetai2"
);
amoebaGpu
->
amoebaSim
.
pThetai2
=
amoebaGpu
->
psThetai2
->
_pDevData
;
amoebaGpu
->
psThetai3
=
new
CUDAStream
<
float4
>
(
AMOEBA_PME_ORDER
*
gpu
->
natoms
,
1
,
"thetai3"
);
amoebaGpu
->
amoebaSim
.
pThetai3
=
amoebaGpu
->
psThetai3
->
_pDevData
;
amoebaGpu
->
psIgrid
=
new
CUDAStream
<
int4
>
(
gpu
->
natoms
,
1
,
"igrid"
);
amoebaGpu
->
amoebaSim
.
pIgrid
=
amoebaGpu
->
psIgrid
->
_pDevData
;
gpu
->
psPmeAtomGridIndex
=
new
CUDAStream
<
int2
>
(
gpu
->
natoms
,
1
,
"PmeAtomGridIndex"
);
gpu
->
sim
.
pPmeAtomGridIndex
=
gpu
->
psPmeAtomGridIndex
->
_pDevData
;
gpu
->
psPmeBsplineTheta
=
new
CUDAStream
<
float4
>
(
1
,
1
,
"PmeBsplineTheta"
);
// Not actually uesd
gpu
->
psPmeBsplineDtheta
=
new
CUDAStream
<
float4
>
(
1
,
1
,
"PmeBsplineDtheta"
);
// Not actually used
// Initialize the b-spline moduli.
double
array
[
AMOEBA_PME_ORDER
];
double
x
=
0.0
;
array
[
0
]
=
1.0
-
x
;
array
[
1
]
=
x
;
for
(
int
k
=
2
;
k
<
AMOEBA_PME_ORDER
;
k
++
)
{
double
denom
=
1.0
/
k
;
array
[
k
]
=
x
*
array
[
k
-
1
]
*
denom
;
for
(
int
i
=
1
;
i
<
k
;
i
++
)
array
[
k
-
i
]
=
((
x
+
i
)
*
array
[
k
-
i
-
1
]
+
((
k
-
i
+
1
)
-
x
)
*
array
[
k
-
i
])
*
denom
;
array
[
0
]
=
(
1.0
-
x
)
*
array
[
0
]
*
denom
;
}
int
maxSize
=
std
::
max
(
std
::
max
(
gridSize
.
x
,
gridSize
.
y
),
gridSize
.
z
);
vector
<
double
>
bsarray
(
maxSize
+
1
,
0.0
);
for
(
int
i
=
2
;
i
<=
AMOEBA_PME_ORDER
+
1
;
i
++
)
bsarray
[
i
]
=
array
[
i
-
2
];
for
(
int
dim
=
0
;
dim
<
3
;
dim
++
)
{
float
*
bsmod
=
gpu
->
psPmeBsplineModuli
[
dim
]
->
_pSysData
;
int
size
=
gpu
->
psPmeBsplineModuli
[
dim
]
->
_length
;
// get the modulus of the discrete Fourier transform
double
factor
=
2.0
*
M_PI
/
size
;
for
(
int
i
=
0
;
i
<
size
;
i
++
)
{
double
sum1
=
0.0
;
double
sum2
=
0.0
;
for
(
int
j
=
1
;
j
<=
size
;
j
++
)
{
double
arg
=
factor
*
i
*
(
j
-
1
);
sum1
=
sum1
+
bsarray
[
j
]
*
cos
(
arg
);
sum2
=
sum2
+
bsarray
[
j
]
*
sin
(
arg
);
}
bsmod
[
i
]
=
sum1
*
sum1
+
sum2
*
sum2
;
}
// fix for exponential Euler spline interpolation failure
double
eps
=
1.0e-7
;
if
(
bsmod
[
0
]
<
eps
)
bsmod
[
0
]
=
0.5
*
bsmod
[
1
];
for
(
int
i
=
1
;
i
<
size
-
1
;
i
++
)
if
(
bsmod
[
i
]
<
eps
)
bsmod
[
i
]
=
0.5
*
(
bsmod
[
i
-
1
]
+
bsmod
[
i
+
1
]);
if
(
bsmod
[
size
-
1
]
<
eps
)
bsmod
[
size
-
1
]
=
0.5
*
bsmod
[
size
-
2
];
// compute and apply the optimal zeta coefficient
int
jcut
=
50
;
for
(
int
i
=
1
;
i
<=
size
;
i
++
)
{
int
k
=
i
-
1
;
if
(
i
>
size
/
2
)
k
=
k
-
size
;
double
zeta
;
if
(
k
==
0
)
zeta
=
1.0
;
else
{
double
sum1
=
1.0
;
double
sum2
=
1.0
;
factor
=
M_PI
*
k
/
size
;
for
(
int
j
=
1
;
j
<=
jcut
;
j
++
)
{
double
arg
=
factor
/
(
factor
+
M_PI
*
j
);
sum1
=
sum1
+
pow
(
arg
,
AMOEBA_PME_ORDER
);
sum2
=
sum2
+
pow
(
arg
,
2
*
AMOEBA_PME_ORDER
);
}
for
(
int
j
=
1
;
j
<=
jcut
;
j
++
)
{
double
arg
=
factor
/
(
factor
-
M_PI
*
j
);
sum1
=
sum1
+
pow
(
arg
,
AMOEBA_PME_ORDER
);
sum2
=
sum2
+
pow
(
arg
,
2
*
AMOEBA_PME_ORDER
);
}
zeta
=
sum2
/
sum1
;
}
bsmod
[
i
-
1
]
=
bsmod
[
i
-
1
]
*
zeta
*
zeta
;
}
}
}
extern
"C"
extern
"C"
void
amoebaGpuBuildVdwExclusionList
(
amoebaGpuContext
amoebaGpu
,
const
std
::
vector
<
std
::
vector
<
int
>
>&
exclusions
)
void
amoebaGpuBuildVdwExclusionList
(
amoebaGpuContext
amoebaGpu
,
const
std
::
vector
<
std
::
vector
<
int
>
>&
exclusions
)
{
{
...
@@ -2573,6 +2698,14 @@ void amoebaGpuShutDown(amoebaGpuContext gpu)
...
@@ -2573,6 +2698,14 @@ void amoebaGpuShutDown(amoebaGpuContext gpu)
delete
gpu
->
ps_P_ScaleIndices
;
delete
gpu
->
ps_P_ScaleIndices
;
delete
gpu
->
ps_M_ScaleIndices
;
delete
gpu
->
ps_M_ScaleIndices
;
if
(
gpu
->
psThetai1
)
{
delete
gpu
->
psThetai1
;
delete
gpu
->
psThetai2
;
delete
gpu
->
psThetai3
;
delete
gpu
->
psQfac
;
delete
gpu
->
psIgrid
;
}
if
(
gpu
->
pMapArray
){
if
(
gpu
->
pMapArray
){
for
(
unsigned
int
ii
=
0
;
ii
<
gpu
->
paddedNumberOfAtoms
;
ii
++
){
for
(
unsigned
int
ii
=
0
;
ii
<
gpu
->
paddedNumberOfAtoms
;
ii
++
){
delete
gpu
->
pMapArray
[
ii
];
delete
gpu
->
pMapArray
[
ii
];
...
@@ -2626,6 +2759,7 @@ void amoebaGpuSetConstants(amoebaGpuContext amoebaGpu)
...
@@ -2626,6 +2759,7 @@ void amoebaGpuSetConstants(amoebaGpuContext amoebaGpu)
SetCalculateAmoebaCudaFixedEAndGKFieldsSim
(
amoebaGpu
);
SetCalculateAmoebaCudaFixedEAndGKFieldsSim
(
amoebaGpu
);
SetCalculateAmoebaCudaMutualInducedAndGkFieldsSim
(
amoebaGpu
);
SetCalculateAmoebaCudaMutualInducedAndGkFieldsSim
(
amoebaGpu
);
SetCalculateObcGbsaForces2Sim
(
amoebaGpu
->
gpuContext
);
SetCalculateObcGbsaForces2Sim
(
amoebaGpu
->
gpuContext
);
SetCalculateAmoebaPMESim
(
amoebaGpu
);
}
}
extern
"C"
extern
"C"
...
...
plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
View file @
fb62fecb
...
@@ -156,5 +156,9 @@ extern unsigned int getThreadsPerBlock( amoebaGpuContext amoebaGpu, unsigned int
...
@@ -156,5 +156,9 @@ extern unsigned int getThreadsPerBlock( amoebaGpuContext amoebaGpu, unsigned int
//extern int isNanOrInfinity( double number );
//extern int isNanOrInfinity( double number );
extern
void
trackMutualInducedIterations
(
amoebaGpuContext
amoebaGpu
,
int
iteration
);
extern
void
trackMutualInducedIterations
(
amoebaGpuContext
amoebaGpu
,
int
iteration
);
// PME
extern
void
SetCalculateAmoebaPMESim
(
amoebaGpuContext
amoebaGpu
);
#endif //__AMOEBA_GPU_TYPES_H__
#endif //__AMOEBA_GPU_TYPES_H__
plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaTypes.h
View file @
fb62fecb
...
@@ -47,6 +47,8 @@ enum CudaAmoebaNonbondedMethod
...
@@ -47,6 +47,8 @@ enum CudaAmoebaNonbondedMethod
AMOEBA_PARTICLE_MESH_EWALD
AMOEBA_PARTICLE_MESH_EWALD
};
};
static
const
int
AMOEBA_PME_ORDER
=
5
;
struct
cudaAmoebaGmxSimulation
{
struct
cudaAmoebaGmxSimulation
{
// Constants
// Constants
...
@@ -177,6 +179,13 @@ struct cudaAmoebaGmxSimulation {
...
@@ -177,6 +179,13 @@ struct cudaAmoebaGmxSimulation {
float
fd
;
// electric * 2.0f * (1.0f-dwater)/(1.0f+2.0f*dwater);
float
fd
;
// electric * 2.0f * (1.0f-dwater)/(1.0f+2.0f*dwater);
float
fq
;
// electric * 3.0f * (1.0f-dwater)/(2.0f+3.0f*dwater);
float
fq
;
// electric * 3.0f * (1.0f-dwater)/(2.0f+3.0f*dwater);
// PME arrays
float
*
pQfac
;
float4
*
pThetai1
;
float4
*
pThetai2
;
float4
*
pThetai3
;
int4
*
pIgrid
;
};
};
#endif
#endif
plugins/amoeba/platforms/cuda/src/kernels/amoebaGpuTypes.h
View file @
fb62fecb
...
@@ -219,6 +219,14 @@ struct _amoebaGpuContext {
...
@@ -219,6 +219,14 @@ struct _amoebaGpuContext {
// Wca dispersion fields
// Wca dispersion fields
CUDAStream
<
float2
>*
psWcaDispersionRadiusEpsilon
;
CUDAStream
<
float2
>*
psWcaDispersionRadiusEpsilon
;
// PME fields
CUDAStream
<
float
>*
psQfac
;
CUDAStream
<
float4
>*
psThetai1
;
CUDAStream
<
float4
>*
psThetai2
;
CUDAStream
<
float4
>*
psThetai3
;
CUDAStream
<
int4
>*
psIgrid
;
};
};
typedef
struct
_amoebaGpuContext
*
amoebaGpuContext
;
typedef
struct
_amoebaGpuContext
*
amoebaGpuContext
;
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPME.cu
0 → 100644
View file @
fb62fecb
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
#include "amoebaGpuTypes.h"
#include "amoebaCudaKernels.h"
//#define AMOEBA_DEBUG
static
__constant__
cudaGmxSimulation
cSim
;
static
__constant__
cudaAmoebaGmxSimulation
cAmoebaSim
;
void
SetCalculateAmoebaPMESim
(
amoebaGpuContext
amoebaGpu
)
{
cudaError_t
status
;
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
status
=
cudaMemcpyToSymbol
(
cSim
,
&
gpu
->
sim
,
sizeof
(
cudaGmxSimulation
));
RTERROR
(
status
,
"SetCalculateAmoebaPMESim: cudaMemcpyToSymbol: SetSim copy to cSim failed"
);
status
=
cudaMemcpyToSymbol
(
cAmoebaSim
,
&
amoebaGpu
->
amoebaSim
,
sizeof
(
cudaAmoebaGmxSimulation
));
RTERROR
(
status
,
"SetCalculateAmoebaPMESim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed"
);
}
#define ARRAY(x,y) array[(x)-1+((y)-1)*AMOEBA_PME_ORDER]
/**
* This is called from computeBsplines(). It calculates the spline coefficients for a single atom along a single axis.
*/
__device__
void
computeBSplinePoint
(
float4
*
thetai
,
float
w
,
float
*
array
)
{
// real*8 thetai(4,AMOEBA_PME_ORDER)
// real*8 array(AMOEBA_PME_ORDER,AMOEBA_PME_ORDER)
// initialization to get to 2nd order recursion
ARRAY
(
2
,
2
)
=
w
;
ARRAY
(
2
,
1
)
=
1.0
f
-
w
;
// perform one pass to get to 3rd order recursion
ARRAY
(
3
,
3
)
=
0.5
f
*
w
*
ARRAY
(
2
,
2
);
ARRAY
(
3
,
2
)
=
0.5
f
*
((
1.0
f
+
w
)
*
ARRAY
(
2
,
1
)
+
(
2.0
f
-
w
)
*
ARRAY
(
2
,
2
));
ARRAY
(
3
,
1
)
=
0.5
f
*
(
1.0
f
-
w
)
*
ARRAY
(
2
,
1
);
// compute standard B-spline recursion to desired order
for
(
int
i
=
4
;
i
<=
AMOEBA_PME_ORDER
;
i
++
)
{
int
k
=
i
-
1
;
float
denom
=
1.0
f
/
k
;
ARRAY
(
i
,
i
)
=
denom
*
w
*
ARRAY
(
k
,
k
);
for
(
int
j
=
1
;
j
<=
i
-
2
;
j
++
)
ARRAY
(
i
,
i
-
j
)
=
denom
*
((
w
+
j
)
*
ARRAY
(
k
,
i
-
j
-
1
)
+
(
i
-
j
-
w
)
*
ARRAY
(
k
,
i
-
j
));
ARRAY
(
i
,
1
)
=
denom
*
(
1.0
f
-
w
)
*
ARRAY
(
k
,
1
);
}
// get coefficients for the B-spline first derivative
int
k
=
AMOEBA_PME_ORDER
-
1
;
ARRAY
(
k
,
AMOEBA_PME_ORDER
)
=
ARRAY
(
k
,
AMOEBA_PME_ORDER
-
1
);
for
(
int
i
=
AMOEBA_PME_ORDER
-
1
;
i
>=
2
;
i
--
)
ARRAY
(
k
,
i
)
=
ARRAY
(
k
,
i
-
1
)
-
ARRAY
(
k
,
i
);
ARRAY
(
k
,
1
)
=
-
ARRAY
(
k
,
1
);
// get coefficients for the B-spline second derivative
k
=
AMOEBA_PME_ORDER
-
2
;
ARRAY
(
k
,
AMOEBA_PME_ORDER
-
1
)
=
ARRAY
(
k
,
AMOEBA_PME_ORDER
-
2
);
for
(
int
i
=
AMOEBA_PME_ORDER
-
2
;
i
>=
2
;
i
--
)
ARRAY
(
k
,
i
)
=
ARRAY
(
k
,
i
-
1
)
-
ARRAY
(
k
,
i
);
ARRAY
(
k
,
1
)
=
-
ARRAY
(
k
,
1
);
ARRAY
(
k
,
AMOEBA_PME_ORDER
)
=
ARRAY
(
k
,
AMOEBA_PME_ORDER
-
1
);
for
(
int
i
=
AMOEBA_PME_ORDER
-
1
;
i
>=
2
;
i
--
)
ARRAY
(
k
,
i
)
=
ARRAY
(
k
,
i
-
1
)
-
ARRAY
(
k
,
i
);
ARRAY
(
k
,
1
)
=
-
ARRAY
(
k
,
1
);
// get coefficients for the B-spline third derivative
k
=
AMOEBA_PME_ORDER
-
3
;
ARRAY
(
k
,
AMOEBA_PME_ORDER
-
2
)
=
ARRAY
(
k
,
AMOEBA_PME_ORDER
-
3
);
for
(
int
i
=
AMOEBA_PME_ORDER
-
3
;
i
>=
2
;
i
--
)
ARRAY
(
k
,
i
)
=
ARRAY
(
k
,
i
-
1
)
-
ARRAY
(
k
,
i
);
ARRAY
(
k
,
1
)
=
-
ARRAY
(
k
,
1
);
ARRAY
(
k
,
AMOEBA_PME_ORDER
-
1
)
=
ARRAY
(
k
,
AMOEBA_PME_ORDER
-
2
);
for
(
int
i
=
AMOEBA_PME_ORDER
-
2
;
i
>=
2
;
i
--
)
ARRAY
(
k
,
i
)
=
ARRAY
(
k
,
i
-
1
)
-
ARRAY
(
k
,
i
);
ARRAY
(
k
,
1
)
=
-
ARRAY
(
k
,
1
);
ARRAY
(
k
,
AMOEBA_PME_ORDER
)
=
ARRAY
(
k
,
AMOEBA_PME_ORDER
-
1
);
for
(
int
i
=
AMOEBA_PME_ORDER
-
1
;
i
>=
2
;
i
--
)
ARRAY
(
k
,
i
)
=
ARRAY
(
k
,
i
-
1
)
-
ARRAY
(
k
,
i
);
ARRAY
(
k
,
1
)
=
-
ARRAY
(
k
,
1
);
// copy coefficients from temporary to permanent storage
for
(
int
i
=
1
;
i
<=
AMOEBA_PME_ORDER
;
i
++
)
thetai
[
i
]
=
make_float4
(
ARRAY
(
AMOEBA_PME_ORDER
,
i
),
ARRAY
(
AMOEBA_PME_ORDER
-
1
,
i
),
ARRAY
(
AMOEBA_PME_ORDER
-
2
,
i
),
ARRAY
(
AMOEBA_PME_ORDER
-
3
,
i
));
}
/**
* Compute bspline coefficients.
*/
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
256
,
1
)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
256
,
1
)
#else
__launch_bounds__
(
128
,
1
)
#endif
void
computeBsplines_kernel
()
{
extern
__shared__
float
bsplines_cache
[];
// size = block_size*pme_order*pme_order
float
*
array
=
&
bsplines_cache
[
threadIdx
.
x
*
AMOEBA_PME_ORDER
*
AMOEBA_PME_ORDER
];
// get the B-spline coefficients for each multipole site
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
cSim
.
atoms
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
float4
posq
=
cSim
.
pPosq
[
i
];
posq
.
x
-=
floor
(
posq
.
x
*
cSim
.
invPeriodicBoxSizeX
)
*
cSim
.
periodicBoxSizeX
;
posq
.
y
-=
floor
(
posq
.
y
*
cSim
.
invPeriodicBoxSizeY
)
*
cSim
.
periodicBoxSizeY
;
posq
.
z
-=
floor
(
posq
.
z
*
cSim
.
invPeriodicBoxSizeZ
)
*
cSim
.
periodicBoxSizeZ
;
float3
t
=
make_float3
((
posq
.
x
*
cSim
.
invPeriodicBoxSizeX
)
*
cSim
.
pmeGridSize
.
x
,
(
posq
.
y
*
cSim
.
invPeriodicBoxSizeY
)
*
cSim
.
pmeGridSize
.
y
,
(
posq
.
z
*
cSim
.
invPeriodicBoxSizeZ
)
*
cSim
.
pmeGridSize
.
z
);
// First axis.
float
w
=
posq
.
x
*
cSim
.
invPeriodicBoxSizeX
;
float
fr
=
cSim
.
pmeGridSize
.
x
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
int
ifr
=
(
int
)
fr
;
w
=
fr
-
ifr
;
int
igrid1
=
ifr
-
AMOEBA_PME_ORDER
;
computeBSplinePoint
(
&
cAmoebaSim
.
pThetai1
[
i
*
AMOEBA_PME_ORDER
],
w
,
array
);
// Second axis.
w
=
posq
.
y
*
cSim
.
invPeriodicBoxSizeY
;
fr
=
cSim
.
pmeGridSize
.
y
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
ifr
=
(
int
)
fr
;
w
=
fr
-
ifr
;
int
igrid2
=
ifr
-
AMOEBA_PME_ORDER
;
computeBSplinePoint
(
&
cAmoebaSim
.
pThetai2
[
i
*
AMOEBA_PME_ORDER
],
w
,
array
);
// Third axis.
w
=
posq
.
z
*
cSim
.
invPeriodicBoxSizeZ
;
fr
=
cSim
.
pmeGridSize
.
z
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
ifr
=
(
int
)
fr
;
w
=
fr
-
ifr
;
int
igrid3
=
ifr
-
AMOEBA_PME_ORDER
;
computeBSplinePoint
(
&
cAmoebaSim
.
pThetai3
[
i
*
AMOEBA_PME_ORDER
],
w
,
array
);
cAmoebaSim
.
pIgrid
[
i
]
=
make_int4
(
igrid1
,
igrid2
,
igrid3
,
0
);
cSim
.
pPmeAtomGridIndex
[
i
]
=
make_int2
(
i
,
igrid1
*
cSim
.
pmeGridSize
.
y
*
cSim
.
pmeGridSize
.
z
+
igrid2
*
cSim
.
pmeGridSize
.
z
+
igrid3
);
}
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
1024
,
1
)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__
(
512
,
1
)
#else
__launch_bounds__
(
256
,
1
)
#endif
void
kGridSpreadMultipoles_kernel
()
{
unsigned
int
numGridPoints
=
cSim
.
pmeGridSize
.
x
*
cSim
.
pmeGridSize
.
y
*
cSim
.
pmeGridSize
.
z
;
unsigned
int
numThreads
=
gridDim
.
x
*
blockDim
.
x
;
for
(
int
gridIndex
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
gridIndex
<
numGridPoints
;
gridIndex
+=
numThreads
)
{
int3
gridPoint
;
gridPoint
.
x
=
gridIndex
/
(
cSim
.
pmeGridSize
.
y
*
cSim
.
pmeGridSize
.
z
);
int
remainder
=
gridIndex
-
gridPoint
.
x
*
cSim
.
pmeGridSize
.
y
*
cSim
.
pmeGridSize
.
z
;
gridPoint
.
y
=
remainder
/
cSim
.
pmeGridSize
.
z
;
gridPoint
.
z
=
remainder
-
gridPoint
.
y
*
cSim
.
pmeGridSize
.
z
;
float
result
=
0.0
f
;
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
++
ix
)
{
int
x
=
gridPoint
.
x
-
ix
+
(
gridPoint
.
x
>=
ix
?
0
:
cSim
.
pmeGridSize
.
x
);
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
++
iy
)
{
int
y
=
gridPoint
.
y
-
iy
+
(
gridPoint
.
y
>=
iy
?
0
:
cSim
.
pmeGridSize
.
y
);
int
z1
=
gridPoint
.
z
-
PME_ORDER
+
1
;
z1
+=
(
z1
>=
0
?
0
:
cSim
.
pmeGridSize
.
z
);
int
z2
=
(
z1
<
gridPoint
.
z
?
gridPoint
.
z
:
cSim
.
pmeGridSize
.
z
-
1
);
int
gridIndex1
=
x
*
cSim
.
pmeGridSize
.
y
*
cSim
.
pmeGridSize
.
z
+
y
*
cSim
.
pmeGridSize
.
z
+
z1
;
int
gridIndex2
=
x
*
cSim
.
pmeGridSize
.
y
*
cSim
.
pmeGridSize
.
z
+
y
*
cSim
.
pmeGridSize
.
z
+
z2
;
int
firstAtom
=
cSim
.
pPmeAtomRange
[
gridIndex1
];
int
lastAtom
=
cSim
.
pPmeAtomRange
[
gridIndex2
+
1
];
for
(
int
i
=
firstAtom
;
i
<
lastAtom
;
++
i
)
{
int2
atomData
=
cSim
.
pPmeAtomGridIndex
[
i
];
int
atomIndex
=
atomData
.
x
;
int
z
=
atomData
.
y
;
int
iz
=
gridPoint
.
z
-
z
+
(
gridPoint
.
z
>=
z
?
0
:
cSim
.
pmeGridSize
.
z
);
float
atomCharge
=
cSim
.
pPosq
[
atomIndex
].
w
;
float
atomDipoleX
=
cAmoebaSim
.
pLabFrameDipole
[
atomIndex
+
3
];
float
atomDipoleY
=
cAmoebaSim
.
pLabFrameDipole
[
atomIndex
+
3
+
1
];
float
atomDipoleZ
=
cAmoebaSim
.
pLabFrameDipole
[
atomIndex
+
3
+
2
];
float
atomQuadrupoleXX
=
cAmoebaSim
.
pLabFrameQuadrupole
[
atomIndex
+
9
];
float
atomQuadrupoleXY
=
2
*
cAmoebaSim
.
pLabFrameQuadrupole
[
atomIndex
+
9
+
1
];
float
atomQuadrupoleXZ
=
2
*
cAmoebaSim
.
pLabFrameQuadrupole
[
atomIndex
+
9
+
2
];
float
atomQuadrupoleYY
=
cAmoebaSim
.
pLabFrameQuadrupole
[
atomIndex
+
9
+
4
];
float
atomQuadrupoleYZ
=
2
*
cAmoebaSim
.
pLabFrameQuadrupole
[
atomIndex
+
9
+
5
];
float
atomQuadrupoleZZ
=
cAmoebaSim
.
pLabFrameQuadrupole
[
atomIndex
+
9
+
8
];
float4
t
=
cAmoebaSim
.
pThetai1
[
atomIndex
+
ix
*
cSim
.
atoms
];
float4
u
=
cAmoebaSim
.
pThetai2
[
atomIndex
+
iy
*
cSim
.
atoms
];
float4
v
=
cAmoebaSim
.
pThetai3
[
atomIndex
+
iz
*
cSim
.
atoms
];
float
term0
=
atomCharge
*
u
.
x
*
v
.
x
+
atomDipoleY
*
u
.
y
*
v
.
x
+
atomDipoleZ
*
u
.
x
*
v
.
y
+
atomQuadrupoleYY
*
u
.
z
*
v
.
x
+
atomQuadrupoleZZ
*
u
.
x
*
v
.
z
+
atomQuadrupoleYZ
*
u
.
y
*
v
.
y
;
float
term1
=
atomDipoleX
*
u
.
x
*
v
.
x
+
atomQuadrupoleXY
*
u
.
y
*
v
.
x
+
atomQuadrupoleXZ
*
u
.
x
*
v
.
y
;
float
term2
=
atomQuadrupoleXX
*
u
.
x
*
v
.
x
;
result
+=
term0
*
t
.
x
+
term1
*
t
.
y
+
term2
*
t
.
z
;
}
if
(
z1
>
gridPoint
.
z
)
{
gridIndex1
=
x
*
cSim
.
pmeGridSize
.
y
*
cSim
.
pmeGridSize
.
z
+
y
*
cSim
.
pmeGridSize
.
z
;
gridIndex2
=
x
*
cSim
.
pmeGridSize
.
y
*
cSim
.
pmeGridSize
.
z
+
y
*
cSim
.
pmeGridSize
.
z
+
gridPoint
.
z
;
firstAtom
=
cSim
.
pPmeAtomRange
[
gridIndex1
];
lastAtom
=
cSim
.
pPmeAtomRange
[
gridIndex2
+
1
];
for
(
int
i
=
firstAtom
;
i
<
lastAtom
;
++
i
)
{
int2
atomData
=
cSim
.
pPmeAtomGridIndex
[
i
];
int
atomIndex
=
atomData
.
x
;
int
z
=
atomData
.
y
;
int
iz
=
gridPoint
.
z
-
z
+
(
gridPoint
.
z
>=
z
?
0
:
cSim
.
pmeGridSize
.
z
);
float
atomCharge
=
cSim
.
pPosq
[
atomIndex
].
w
;
float
atomDipoleX
=
cAmoebaSim
.
pLabFrameDipole
[
atomIndex
+
3
];
float
atomDipoleY
=
cAmoebaSim
.
pLabFrameDipole
[
atomIndex
+
3
+
1
];
float
atomDipoleZ
=
cAmoebaSim
.
pLabFrameDipole
[
atomIndex
+
3
+
2
];
float
atomQuadrupoleXX
=
cAmoebaSim
.
pLabFrameQuadrupole
[
atomIndex
+
9
];
float
atomQuadrupoleXY
=
2
*
cAmoebaSim
.
pLabFrameQuadrupole
[
atomIndex
+
9
+
1
];
float
atomQuadrupoleXZ
=
2
*
cAmoebaSim
.
pLabFrameQuadrupole
[
atomIndex
+
9
+
2
];
float
atomQuadrupoleYY
=
cAmoebaSim
.
pLabFrameQuadrupole
[
atomIndex
+
9
+
4
];
float
atomQuadrupoleYZ
=
2
*
cAmoebaSim
.
pLabFrameQuadrupole
[
atomIndex
+
9
+
5
];
float
atomQuadrupoleZZ
=
cAmoebaSim
.
pLabFrameQuadrupole
[
atomIndex
+
9
+
8
];
float4
t
=
cAmoebaSim
.
pThetai1
[
atomIndex
+
ix
*
cSim
.
atoms
];
float4
u
=
cAmoebaSim
.
pThetai2
[
atomIndex
+
iy
*
cSim
.
atoms
];
float4
v
=
cAmoebaSim
.
pThetai3
[
atomIndex
+
iz
*
cSim
.
atoms
];
float
term0
=
atomCharge
*
u
.
x
*
v
.
x
+
atomDipoleY
*
u
.
y
*
v
.
x
+
atomDipoleZ
*
u
.
x
*
v
.
y
+
atomQuadrupoleYY
*
u
.
z
*
v
.
x
+
atomQuadrupoleZZ
*
u
.
x
*
v
.
z
+
atomQuadrupoleYZ
*
u
.
y
*
v
.
y
;
float
term1
=
atomDipoleX
*
u
.
x
*
v
.
x
+
atomQuadrupoleXY
*
u
.
y
*
v
.
x
+
atomQuadrupoleXZ
*
u
.
x
*
v
.
y
;
float
term2
=
atomQuadrupoleXX
*
u
.
x
*
v
.
x
;
result
+=
term0
*
t
.
x
+
term1
*
t
.
y
+
term2
*
t
.
z
;
}
}
}
}
cSim
.
pPmeGrid
[
gridIndex
]
=
make_cuComplex
(
result
*
sqrt
(
cSim
.
epsfac
),
0.0
f
);
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment