Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
7b66ba19
Commit
7b66ba19
authored
Jan 19, 2017
by
Andy Simmonett
Browse files
Added initial CUDA LJPME code, and modified cpupme plugin to handle LJPME as a standalone method.
parent
94cb8614
Changes
16
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
16 changed files
with
1145 additions
and
78 deletions
+1145
-78
olla/include/openmm/kernels.h
olla/include/openmm/kernels.h
+51
-0
platforms/cpu/src/CpuKernels.cpp
platforms/cpu/src/CpuKernels.cpp
+3
-2
platforms/cuda/include/CudaKernels.h
platforms/cuda/include/CudaKernels.h
+34
-6
platforms/cuda/include/CudaParallelKernels.h
platforms/cuda/include/CudaParallelKernels.h
+9
-0
platforms/cuda/src/CudaKernels.cpp
platforms/cuda/src/CudaKernels.cpp
+264
-22
platforms/cuda/src/CudaParallelKernels.cpp
platforms/cuda/src/CudaParallelKernels.cpp
+4
-0
platforms/cuda/src/CudaPlatform.cpp
platforms/cuda/src/CudaPlatform.cpp
+4
-0
platforms/cuda/src/kernels/coulombLennardJones.cu
platforms/cuda/src/kernels/coulombLennardJones.cu
+44
-1
platforms/cuda/src/kernels/ljpme.cu
platforms/cuda/src/kernels/ljpme.cu
+325
-0
platforms/opencl/src/OpenCLKernels.cpp
platforms/opencl/src/OpenCLKernels.cpp
+6
-5
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+1
-1
plugins/cpupme/src/CpuPmeKernelFactory.cpp
plugins/cpupme/src/CpuPmeKernelFactory.cpp
+2
-0
plugins/cpupme/src/CpuPmeKernels.cpp
plugins/cpupme/src/CpuPmeKernels.cpp
+303
-31
plugins/cpupme/src/CpuPmeKernels.h
plugins/cpupme/src/CpuPmeKernels.h
+91
-7
plugins/cpupme/tests/TestCpuPme.cpp
plugins/cpupme/tests/TestCpuPme.cpp
+1
-2
tests/TestNonbondedForce.h
tests/TestNonbondedForce.h
+3
-1
No files found.
olla/include/openmm/kernels.h
View file @
7b66ba19
...
...
@@ -1345,6 +1345,57 @@ public:
};
/**
 * Abstract kernel for the reciprocal space part of the LJPME dispersion calculation.
 *
 * CalcNonbondedForceKernel normally performs this work itself, in which case this kernel
 * is not needed.  It exists so that the work can be handed off to a different kernel; in
 * particular, GPU based platforms sometimes use a CPU based implementation provided by a
 * separate plugin.
 */
class CalcDispersionPmeReciprocalForceKernel : public KernelImpl {
public:
    /** Helper type used by the computation methods to coordinate data transfer (declared only here). */
    class IO;

    /**
     * @return the name under which this kernel is registered
     */
    static std::string Name() {
        return "CalcDispersionPmeReciprocalForce";
    }

    /**
     * Construct the kernel, passing both arguments straight through to KernelImpl.
     *
     * @param name      the name of the kernel
     * @param platform  the Platform the kernel belongs to
     */
    CalcDispersionPmeReciprocalForceKernel(std::string name, const Platform& platform)
        : KernelImpl(name, platform) {
    }

    /**
     * Initialize the kernel.
     *
     * @param gridx        the x size of the PME grid
     * @param gridy        the y size of the PME grid
     * @param gridz        the z size of the PME grid
     * @param numParticles the number of particles in the system
     * @param alpha        the Ewald blending parameter
     */
    virtual void initialize(int gridx, int gridy, int gridz, int numParticles, double alpha) = 0;

    /**
     * Begin computing the force and energy.
     *
     * @param io                 an object that coordinates data transfer
     * @param periodicBoxVectors the vectors defining the periodic box (measured in nm)
     * @param includeEnergy      true if potential energy should be computed
     */
    virtual void beginComputation(IO& io, const Vec3* periodicBoxVectors, bool includeEnergy) = 0;

    /**
     * Finish computing the force and energy.
     *
     * @param io   an object that coordinates data transfer
     * @return the potential energy due to the PME reciprocal space interactions
     */
    virtual double finishComputation(IO& io) = 0;

    /**
     * Get the parameters being used for PME.
     *
     * @param alpha   the separation parameter
     * @param nx      the number of grid points along the X axis
     * @param ny      the number of grid points along the Y axis
     * @param nz      the number of grid points along the Z axis
     */
    virtual void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const = 0;
};
}
// namespace OpenMM
#endif
/*OPENMM_KERNELS_H_*/
platforms/cpu/src/CpuKernels.cpp
View file @
7b66ba19
...
...
@@ -674,8 +674,9 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
if
(
useOptimizedPme
)
{
optimizedPme
=
getPlatform
().
createKernel
(
CalcPmeReciprocalForceKernel
::
Name
(),
context
);
optimizedPme
.
getAs
<
CalcPmeReciprocalForceKernel
>
().
initialize
(
gridSize
[
0
],
gridSize
[
1
],
gridSize
[
2
],
numParticles
,
ewaldAlpha
);
optimizedDispersionPme
=
getPlatform
().
createKernel
(
CalcPmeReciprocalForceKernel
::
Name
(),
context
);
optimizedDispersionPme
.
getAs
<
CalcPmeReciprocalForceKernel
>
().
initialize
(
dispersionGridSize
[
0
],
dispersionGridSize
[
1
],
dispersionGridSize
[
2
],
numParticles
,
ewaldDispersionAlpha
);
optimizedDispersionPme
=
getPlatform
().
createKernel
(
CalcDispersionPmeReciprocalForceKernel
::
Name
(),
context
);
optimizedDispersionPme
.
getAs
<
CalcDispersionPmeReciprocalForceKernel
>
().
initialize
(
dispersionGridSize
[
0
],
dispersionGridSize
[
1
],
dispersionGridSize
[
2
],
numParticles
,
ewaldDispersionAlpha
);
}
}
}
...
...
platforms/cuda/include/CudaKernels.h
View file @
7b66ba19
...
...
@@ -598,8 +598,10 @@ private:
class
CudaCalcNonbondedForceKernel
:
public
CalcNonbondedForceKernel
{
public:
CudaCalcNonbondedForceKernel
(
std
::
string
name
,
const
Platform
&
platform
,
CudaContext
&
cu
,
const
System
&
system
)
:
CalcNonbondedForceKernel
(
name
,
platform
),
cu
(
cu
),
hasInitializedFFT
(
false
),
sigmaEpsilon
(
NULL
),
exceptionParams
(
NULL
),
cosSinSums
(
NULL
),
directPmeGrid
(
NULL
),
reciprocalPmeGrid
(
NULL
),
pmeBsplineModuliX
(
NULL
),
pmeBsplineModuliY
(
NULL
),
pmeBsplineModuliZ
(
NULL
),
pmeAtomRange
(
NULL
),
pmeAtomGridIndex
(
NULL
),
pmeEnergyBuffer
(
NULL
),
sort
(
NULL
),
fft
(
NULL
),
pmeio
(
NULL
)
{
cu
(
cu
),
hasInitializedFFT
(
false
),
sigmaEpsilon
(
NULL
),
C6s
(
NULL
),
exceptionParams
(
NULL
),
cosSinSums
(
NULL
),
directPmeGrid
(
NULL
),
reciprocalPmeGrid
(
NULL
),
directDispersionPmeGrid
(
NULL
),
reciprocalDispersionPmeGrid
(
NULL
),
pmeBsplineModuliX
(
NULL
),
pmeBsplineModuliY
(
NULL
),
pmeBsplineModuliZ
(
NULL
),
pmeAtomRange
(
NULL
),
pmeAtomGridIndex
(
NULL
),
pmeAtomDispersionGridIndex
(
NULL
),
pmeEnergyBuffer
(
NULL
),
dispersionPmeEnergyBuffer
(
NULL
),
sort
(
NULL
),
dispersionFft
(
NULL
),
fft
(
NULL
),
pmeio
(
NULL
),
dispersionPmeio
(
NULL
)
{
}
~
CudaCalcNonbondedForceKernel
();
/**
...
...
@@ -636,6 +638,15 @@ public:
* @param nz the number of grid points along the Z axis
*/
void
getPMEParameters
(
double
&
alpha
,
int
&
nx
,
int
&
ny
,
int
&
nz
)
const
;
/**
* Get the PME parameters being used for the dispersion term in LJPME.
*
* @param alpha the separation parameter
* @param nx the number of grid points along the X axis
* @param ny the number of grid points along the Y axis
* @param nz the number of grid points along the Z axis
*/
void
getLJPMEParameters
(
double
&
alpha
,
int
&
nx
,
int
&
ny
,
int
&
nz
)
const
;
private:
class
SortTrait
:
public
CudaSort
::
SortTrait
{
int
getDataSize
()
const
{
return
8
;}
...
...
@@ -655,38 +666,55 @@ private:
CudaContext
&
cu
;
bool
hasInitializedFFT
;
CudaArray
*
sigmaEpsilon
;
CudaArray
*
C6s
;
CudaArray
*
exceptionParams
;
CudaArray
*
cosSinSums
;
CudaArray
*
directPmeGrid
;
CudaArray
*
reciprocalPmeGrid
;
CudaArray
*
directDispersionPmeGrid
;
CudaArray
*
reciprocalDispersionPmeGrid
;
CudaArray
*
pmeBsplineModuliX
;
CudaArray
*
pmeBsplineModuliY
;
CudaArray
*
pmeBsplineModuliZ
;
CudaArray
*
pmeAtomRange
;
CudaArray
*
pmeAtomGridIndex
;
CudaArray
*
pmeAtomDispersionGridIndex
;
CudaArray
*
pmeEnergyBuffer
;
CudaArray
*
dispersionPmeEnergyBuffer
;
CudaSort
*
sort
;
Kernel
cpuPme
;
Kernel
cpuDispersionPme
;
PmeIO
*
pmeio
;
CUstream
pmeStream
;
CUevent
pmeSyncEvent
;
PmeIO
*
dispersionPmeio
;
CUstream
pmeStream
,
dispersionPmeStream
;
CUevent
pmeSyncEvent
,
dispersionPmeSyncEvent
;
CudaFFT3D
*
fft
;
cufftHandle
fftForward
;
cufftHandle
fftBackward
;
CudaFFT3D
*
dispersionFft
;
cufftHandle
dispersionFftForward
;
cufftHandle
dispersionFftBackward
;
CUfunction
ewaldSumsKernel
;
CUfunction
ewaldForcesKernel
;
CUfunction
pmeGridIndexKernel
;
CUfunction
pmeDispersionGridIndexKernel
;
CUfunction
pmeSpreadChargeKernel
;
CUfunction
pmeDispersionSpreadChargeKernel
;
CUfunction
pmeFinishSpreadChargeKernel
;
CUfunction
pmeDispersionFinishSpreadChargeKernel
;
CUfunction
pmeEvalEnergyKernel
;
CUfunction
pmeEvalDispersionEnergyKernel
;
CUfunction
pmeConvolutionKernel
;
CUfunction
pmeDispersionConvolutionKernel
;
CUfunction
pmeInterpolateForceKernel
;
CUfunction
pmeInterpolateDispersionForceKernel
;
std
::
map
<
std
::
string
,
std
::
string
>
pmeDefines
;
std
::
vector
<
std
::
pair
<
int
,
int
>
>
exceptionAtoms
;
double
ewaldSelfEnergy
,
dispersion
Coefficient
,
a
lpha
;
double
ewaldSelfEnergy
,
dispersion
SelfEnergy
,
dispersionCoefficient
,
alpha
,
dispersionA
lpha
;
int
interpolateForceThreads
;
int
gridSizeX
,
gridSizeY
,
gridSizeZ
;
bool
hasCoulomb
,
hasLJ
,
usePmeStream
,
useCudaFFT
;
int
dispersionGridSizeX
,
dispersionGridSizeY
,
dispersionGridSizeZ
;
bool
hasCoulomb
,
hasLJ
,
usePmeStream
,
useCudaFFT
,
doLJPME
;
NonbondedMethod
nonbondedMethod
;
static
const
int
PmeOrder
=
5
;
};
...
...
platforms/cuda/include/CudaParallelKernels.h
View file @
7b66ba19
...
...
@@ -439,6 +439,15 @@ public:
* @param nz the number of grid points along the Z axis
*/
void
getPMEParameters
(
double
&
alpha
,
int
&
nx
,
int
&
ny
,
int
&
nz
)
const
;
/**
* Get the PME parameters being used for the dispersion term in LJPME.
*
* @param alpha the separation parameter
* @param nx the number of grid points along the X axis
* @param ny the number of grid points along the Y axis
* @param nz the number of grid points along the Z axis
*/
void
getLJPMEParameters
(
double
&
alpha
,
int
&
nx
,
int
&
ny
,
int
&
nz
)
const
;
private:
class
Task
;
CudaPlatform
::
PlatformData
&
data
;
...
...
platforms/cuda/src/CudaKernels.cpp
View file @
7b66ba19
This diff is collapsed.
Click to expand it.
platforms/cuda/src/CudaParallelKernels.cpp
View file @
7b66ba19
...
...
@@ -628,6 +628,10 @@ void CudaParallelCalcNonbondedForceKernel::getPMEParameters(double& alpha, int&
dynamic_cast
<
const
CudaCalcNonbondedForceKernel
&>
(
kernels
[
0
].
getImpl
()).
getPMEParameters
(
alpha
,
nx
,
ny
,
nz
);
}
void
CudaParallelCalcNonbondedForceKernel
::
getLJPMEParameters
(
double
&
alpha
,
int
&
nx
,
int
&
ny
,
int
&
nz
)
const
{
dynamic_cast
<
const
CudaCalcNonbondedForceKernel
&>
(
kernels
[
0
].
getImpl
()).
getLJPMEParameters
(
alpha
,
nx
,
ny
,
nz
);
}
class
CudaParallelCalcCustomNonbondedForceKernel
::
Task
:
public
CudaContext
::
WorkTask
{
public:
Task
(
ContextImpl
&
context
,
CudaCalcCustomNonbondedForceKernel
&
kernel
,
bool
includeForce
,
...
...
platforms/cuda/src/CudaPlatform.cpp
View file @
7b66ba19
...
...
@@ -247,6 +247,10 @@ CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& sys
CHECK_RESULT
(
cuDeviceGetName
(
name
,
1000
,
contexts
[
i
]
->
getDevice
()),
"Error querying device name"
);
deviceName
<<
name
;
}
size_t
printfsize
;
cuCtxGetLimit
(
&
printfsize
,
CU_LIMIT_PRINTF_FIFO_SIZE
);
cuCtxSetLimit
(
CU_LIMIT_PRINTF_FIFO_SIZE
,
10
*
printfsize
);
useCpuPme
=
(
cpuPmeProperty
==
"true"
&&
!
contexts
[
0
]
->
getUseDoublePrecision
());
disablePmeStream
=
(
pmeStreamProperty
==
"true"
);
deterministicForces
=
(
deterministicForcesProperty
==
"true"
);
...
...
platforms/cuda/src/kernels/coulombLennardJones.cu
View file @
7b66ba19
...
...
@@ -17,6 +17,25 @@
const
real
erfcAlphaR
=
(
0.254829592
f
+
(
-
0.284496736
f
+
(
1.421413741
f
+
(
-
1.453152027
f
+
1.061405429
f
*
t
)
*
t
)
*
t
)
*
t
)
*
t
*
expAlphaRSqr
;
#endif
real
tempForce
=
0.0
f
;
#if HAS_LENNARD_JONES
// The multiplicative term to correct for the multiplicative terms that are always
// present in reciprocal space. The real terms have an additive contribution
// added in, but for excluded terms the multiplicative term is just subtracted.
// These factors are needed in both clauses of the needCorrection statement, so
// I declare them up here.
#if DO_LJPME
const
real
dispersionAlphaR
=
EWALD_DISPERSION_ALPHA
*
r
;
const
real
dar2
=
dispersionAlphaR
*
dispersionAlphaR
;
const
real
dar4
=
dar2
*
dar2
;
const
real
dar6
=
dar4
*
dar2
;
const
real
invR2
=
invR
*
invR
;
const
real
expDar2
=
EXP
(
-
dar2
);
const
real
c6
=
C6s1
*
C6s2
;
const
real
coef
=
invR2
*
invR2
*
invR2
*
c6
;
const
real
eprefac
=
1.0
f
+
dar2
+
0.5
f
*
dar4
;
const
real
dprefac
=
eprefac
+
dar6
/
6.0
f
;
#endif
#endif
if
(
needCorrection
)
{
// Subtract off the part of this interaction that was included in the reciprocal space contribution.
...
...
@@ -29,6 +48,13 @@
includeInteraction
=
false
;
tempEnergy
-=
TWO_OVER_SQRT_PI
*
EWALD_ALPHA
*
138.935456
f
*
posq1
.
w
*
posq2
.
w
;
}
#if HAS_LENNARD_JONES
#if DO_LJPME
// The multiplicative grid term
tempEnergy
+=
coef
*
(
1.0
f
-
expDar2
*
eprefac
);
tempForce
+=
6.0
f
*
coef
*
(
1.0
f
-
expDar2
*
dprefac
);
#endif
#endif
}
else
{
#if HAS_LENNARD_JONES
...
...
@@ -36,7 +62,8 @@
real
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
real
sig6
=
sig2
*
sig2
*
sig2
;
real
epssig6
=
sig6
*
(
sigmaEpsilon1
.
y
*
sigmaEpsilon2
.
y
);
real
eps
=
sigmaEpsilon1
.
y
*
sigmaEpsilon2
.
y
;
real
epssig6
=
sig6
*
eps
;
tempForce
=
epssig6
*
(
12.0
f
*
sig6
-
6.0
f
);
real
ljEnergy
=
epssig6
*
(
sig6
-
1.0
f
);
#if USE_LJ_SWITCH
...
...
@@ -48,6 +75,22 @@
ljEnergy
*=
switchValue
;
}
#endif
#if DO_LJPME
// The multiplicative grid term
ljEnergy
+=
coef
*
(
1.0
f
-
expDar2
*
eprefac
);
tempForce
+=
6.0
f
*
coef
*
(
1.0
f
-
expDar2
*
dprefac
);
// The potential shift accounts for the step at the cutoff introduced by the
// transition from additive to multiplicative combination rules and is only
// needed for the real (not excluded) terms.  By adding these terms to ljEnergy
// instead of tempEnergy here, the includeInteraction mask is correctly applied.
sig2
=
sig
*
sig
;
sig6
=
sig2
*
sig2
*
sig2
*
INVCUT6
;
epssig6
=
eps
*
sig6
;
// The additive part of the potential shift
ljEnergy
+=
epssig6
*
(
1.0
f
-
sig6
);
// The multiplicative part of the potential shift
ljEnergy
+=
MULTSHIFT6
*
c6
;
#endif
tempForce
+=
prefactor
*
(
erfcAlphaR
+
alphaR
*
expAlphaRSqr
*
TWO_OVER_SQRT_PI
);
tempEnergy
+=
includeInteraction
?
ljEnergy
+
prefactor
*
erfcAlphaR
:
0
;
#else
...
...
platforms/cuda/src/kernels/ljpme.cu
0 → 100644
View file @
7b66ba19
/**
 * For every atom, determine which cell of the dispersion PME grid it falls in and store
 * the pair (atom index, flattened cell index) in pmeAtomGridIndex.
 *
 * The periodic box arguments are not referenced directly in this body; presumably they
 * are consumed by APPLY_PERIODIC_TO_POS, which is defined elsewhere.
 */
extern "C" __global__ void findAtomDispersionGridIndex(const real4* __restrict__ posq, int2* __restrict__ pmeAtomGridIndex,
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
    const int firstAtom = blockIdx.x*blockDim.x+threadIdx.x;
    const int atomStride = blockDim.x*gridDim.x;

    for (int atom = firstAtom; atom < NUM_ATOMS; atom += atomStride) {
        real4 pos = posq[atom];
        APPLY_PERIODIC_TO_POS(pos)

        // Project the position onto the reciprocal box vectors, keep only the fractional
        // part, and scale it to grid units.
        real3 frac = make_real3(pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x,
                                pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y,
                                pos.z*recipBoxVecZ.z);
        frac.x = (frac.x-floor(frac.x))*DISPERSION_GRID_SIZE_X;
        frac.y = (frac.y-floor(frac.y))*DISPERSION_GRID_SIZE_Y;
        frac.z = (frac.z-floor(frac.z))*DISPERSION_GRID_SIZE_Z;

        // The modulo keeps each component inside the grid even if rounding pushed the
        // scaled coordinate up to the grid size.
        const int cellX = ((int) frac.x) % DISPERSION_GRID_SIZE_X;
        const int cellY = ((int) frac.y) % DISPERSION_GRID_SIZE_Y;
        const int cellZ = ((int) frac.z) % DISPERSION_GRID_SIZE_Z;
        const int flatCell = cellX*DISPERSION_GRID_SIZE_Y*DISPERSION_GRID_SIZE_Z+cellY*DISPERSION_GRID_SIZE_Z+cellZ;
        pmeAtomGridIndex[atom] = make_int2(atom, flatCell);
    }
}
/**
 * Spread each atom's C6 coefficient onto the dispersion PME grid, weighting the
 * PME_ORDER^3 surrounding grid points with B-spline coefficients.
 *
 * Atoms are visited in the order given by pmeAtomGridIndex (presumably sorted by grid
 * cell before this kernel runs, which would make neighbouring threads write to
 * neighbouring grid points).  Depending on the build flags the contribution is added
 * either as a floating point atomicAdd or as a 64 bit fixed point value scaled by 2^32.
 */
extern "C" __global__ void gridSpreadC6(const real4* __restrict__ posq, real* __restrict__ originalPmeGrid,
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ, const int2* __restrict__ pmeAtomGridIndex,
        const real* __restrict__ C6s) {
    real3 theta[PME_ORDER];
    const real scale = RECIP(PME_ORDER-1);
    const int firstSlot = blockIdx.x*blockDim.x+threadIdx.x;
    const int slotStride = blockDim.x*gridDim.x;

    for (int slot = firstSlot; slot < NUM_ATOMS; slot += slotStride) {
        const int atom = pmeAtomGridIndex[slot].x;
        const real c6 = C6s[atom];
        real4 pos = posq[atom];
        APPLY_PERIODIC_TO_POS(pos)

        // Fractional position in grid units and the grid cell that contains it.
        real3 frac = make_real3(pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x,
                                pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y,
                                pos.z*recipBoxVecZ.z);
        frac.x = (frac.x-floor(frac.x))*DISPERSION_GRID_SIZE_X;
        frac.y = (frac.y-floor(frac.y))*DISPERSION_GRID_SIZE_Y;
        frac.z = (frac.z-floor(frac.z))*DISPERSION_GRID_SIZE_Z;
        const int3 cell = make_int3(((int) frac.x) % DISPERSION_GRID_SIZE_X,
                                    ((int) frac.y) % DISPERSION_GRID_SIZE_Y,
                                    ((int) frac.z) % DISPERSION_GRID_SIZE_Z);

        // The full set of B-spline weights is needed, so they are recomputed here rather
        // than loaded from global memory.  The recurrence raises the spline order one
        // step at a time; the final step uses the precomputed 1/(PME_ORDER-1).
        real3 dr = make_real3(frac.x-(int) frac.x, frac.y-(int) frac.y, frac.z-(int) frac.z);
        theta[PME_ORDER-1] = make_real3(0);
        theta[1] = dr;
        theta[0] = make_real3(1)-dr;
        for (int order = 3; order < PME_ORDER; order++) {
            real div = RECIP(order-1);
            theta[order-1] = div*dr*theta[order-2];
            for (int k = 1; k < (order-1); k++)
                theta[order-k-1] = div*((dr+make_real3(k))*theta[order-k-2] + (make_real3(order-k)-dr)*theta[order-k-1]);
            theta[0] = div*(make_real3(1)-dr)*theta[0];
        }
        theta[PME_ORDER-1] = scale*dr*theta[PME_ORDER-2];
        for (int k = 1; k < (PME_ORDER-1); k++)
            theta[PME_ORDER-k-1] = scale*((dr+make_real3(k))*theta[PME_ORDER-k-2] + (make_real3(PME_ORDER-k)-dr)*theta[PME_ORDER-k-1]);
        theta[0] = scale*(make_real3(1)-dr)*theta[0];

        // Add this atom's weighted C6 coefficient to every grid point it touches,
        // wrapping indices that run past the end of the grid.
        for (int ix = 0; ix < PME_ORDER; ix++) {
            int xbase = cell.x+ix;
            xbase -= (xbase >= DISPERSION_GRID_SIZE_X ? DISPERSION_GRID_SIZE_X : 0);
            xbase = xbase*DISPERSION_GRID_SIZE_Y*DISPERSION_GRID_SIZE_Z;
            const real wx = theta[ix].x;

            for (int iy = 0; iy < PME_ORDER; iy++) {
                int ybase = cell.y+iy;
                ybase -= (ybase >= DISPERSION_GRID_SIZE_Y ? DISPERSION_GRID_SIZE_Y : 0);
                ybase = xbase + ybase*DISPERSION_GRID_SIZE_Z;
                const real wy = theta[iy].y;

                for (int iz = 0; iz < PME_ORDER; iz++) {
                    int zindex = cell.z+iz;
                    zindex -= (zindex >= DISPERSION_GRID_SIZE_Z ? DISPERSION_GRID_SIZE_Z : 0);
                    const int index = ybase + zindex;
                    const real contribution = c6*wx*wy*theta[iz].z;
#ifdef USE_DOUBLE_PRECISION
                    // Grid elements are viewed as 64 bit integers and accumulated in fixed point.
                    unsigned long long* fixedGrid = (unsigned long long*) originalPmeGrid;
                    atomicAdd(&fixedGrid[index], static_cast<unsigned long long>((long long) (contribution*0x100000000)));
#elif __CUDA_ARCH__ < 200 || defined(USE_DETERMINISTIC_FORCES)
                    // The grid is viewed as 64 bit integers: even points map to slot index/2,
                    // odd points to slot (index+gridSize)/2.
                    unsigned long long* fixedGrid = (unsigned long long*) originalPmeGrid;
                    const int packedIndex = (index%2 == 0 ? index/2 : (index+DISPERSION_GRID_SIZE_X*DISPERSION_GRID_SIZE_Y*DISPERSION_GRID_SIZE_Z)/2);
                    atomicAdd(&fixedGrid[packedIndex], static_cast<unsigned long long>((long long) (contribution*0x100000000)));
#else
                    atomicAdd(&originalPmeGrid[index], contribution);
#endif
                }
            }
        }
    }
}
/**
 * Convert the dispersion PME grid in place from 64 bit fixed point values (scaled by
 * 2^32) to real values.
 *
 * With USE_DOUBLE_PRECISION each grid point has its own 64 bit slot.  Otherwise even
 * points are read from slot index/2 and odd points from slot (index+gridSize+1)/2.
 */
extern "C" __global__ void finishSpreadC6(long long* __restrict__ originalPmeGrid) {
    const unsigned int gridSize = DISPERSION_GRID_SIZE_X*DISPERSION_GRID_SIZE_Y*DISPERSION_GRID_SIZE_Z;
    real* floatGrid = (real*) originalPmeGrid;
    real scale = 1.0f/(real) 0x100000000;
#ifdef USE_DOUBLE_PRECISION
    int index = blockIdx.x*blockDim.x+threadIdx.x;
    while (index < gridSize) {
        floatGrid[index] = scale*originalPmeGrid[index];
        index += blockDim.x*gridDim.x;
    }
#else
    // Each thread handles a pair of adjacent real values per iteration.
    int index = 2*(blockIdx.x*blockDim.x+threadIdx.x);
    while (index < gridSize) {
        floatGrid[index] = scale*originalPmeGrid[index/2];
        if (index+1 < gridSize)
            floatGrid[index+1] = scale*originalPmeGrid[(index+gridSize+1)/2];
        index += 2*blockDim.x*gridDim.x;
    }
#endif
}
/**
 * Convolve the dispersion grid in place on halfcomplex_pmeGrid, which is of size
 * NX*NY*(NZ/2+1) because F(Q) is conjugate symmetric.  Every element is multiplied by
 * the reciprocal space dispersion kernel divided by the B-spline moduli.
 *
 * All grid dimensions are the DISPERSION_GRID_SIZE_* values, matching the other kernels
 * in this file that operate on the dispersion grid; the electrostatic GRID_SIZE_* values
 * must not be used here because the two grids may differ in size.
 *
 * @param halfcomplex_pmeGrid  the transformed dispersion grid, modified in place
 * @param energyBuffer         not referenced by this kernel
 * @param pmeBsplineModuliX    B-spline moduli along X, indexed by kx
 * @param pmeBsplineModuliY    B-spline moduli along Y, indexed by ky
 * @param pmeBsplineModuliZ    B-spline moduli along Z, indexed by kz
 * @param periodicBoxSize      its x, y and z components are multiplied to give the box volume
 * @param recipBoxVecX         reciprocal box vector components used to build the wave vector
 * @param recipBoxVecY         reciprocal box vector components used to build the wave vector
 * @param recipBoxVecZ         reciprocal box vector components used to build the wave vector
 */
extern "C" __global__ void reciprocalDispersionConvolution(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__ energyBuffer,
        const real* __restrict__ pmeBsplineModuliX, const real* __restrict__ pmeBsplineModuliY, const real* __restrict__ pmeBsplineModuliZ,
        real4 periodicBoxSize, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
    // R2C stores into a half complex matrix where the last dimension is cut by half
    const unsigned int gridSize = DISPERSION_GRID_SIZE_X*DISPERSION_GRID_SIZE_Y*(DISPERSION_GRID_SIZE_Z/2+1);
    const real scaleFactor = -2*M_PI*SQRT(M_PI)*RECIP(6*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
    const real alpha = EWALD_DISPERSION_ALPHA;
    real bfac = M_PI / alpha;
    real fac1 = 2*M_PI*M_PI*M_PI*SQRT(M_PI);
    real fac2 = alpha*alpha*alpha;
    real fac3 = -2*alpha*M_PI*M_PI;

    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) {
        // Decode the flat index into (kx, ky, kz) on the half complex dispersion grid.
        int kx = index/(DISPERSION_GRID_SIZE_Y*(DISPERSION_GRID_SIZE_Z/2+1));
        int remainder = index-kx*DISPERSION_GRID_SIZE_Y*(DISPERSION_GRID_SIZE_Z/2+1);
        int ky = remainder/(DISPERSION_GRID_SIZE_Z/2+1);
        int kz = remainder-ky*(DISPERSION_GRID_SIZE_Z/2+1);

        // Map indices in the upper half of each dimension to negative frequencies.
        int mx = (kx < (DISPERSION_GRID_SIZE_X+1)/2) ? kx : (kx-DISPERSION_GRID_SIZE_X);
        int my = (ky < (DISPERSION_GRID_SIZE_Y+1)/2) ? ky : (ky-DISPERSION_GRID_SIZE_Y);
        int mz = (kz < (DISPERSION_GRID_SIZE_Z+1)/2) ? kz : (kz-DISPERSION_GRID_SIZE_Z);
        real mhx = mx*recipBoxVecX.x;
        real mhy = mx*recipBoxVecY.x+my*recipBoxVecY.y;
        real mhz = mx*recipBoxVecZ.x+my*recipBoxVecZ.y+mz*recipBoxVecZ.z;
        real bx = pmeBsplineModuliX[kx];
        real by = pmeBsplineModuliY[ky];
        real bz = pmeBsplineModuliZ[kz];
        real denom = scaleFactor/(bx*by*bz);
        real2 grid = halfcomplex_pmeGrid[index];
        real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
        real m = SQRT(m2);
        real m3 = m*m2;
        real b = bfac*m;
        real expfac = -b*b;
        real expterm = EXP(expfac);
#if FAST_ERFC
        // This approximation for erfc is from Abramowitz and Stegun (1964) p. 299.  They cite the following as
        // the original source: C. Hastings, Jr., Approximations for Digital Computers (1955).  It has a maximum
        // error of 1.5e-7.  Taken from the CUDA platform's AMOEBA plugin.
        real t = 1.0f/(1.0f+0.3275911f*b);
        real erfcterm = (0.254829592f+(-0.284496736f+(1.421413741f+(-1.453152027f+1.061405429f*t)*t)*t)*t)*t*expterm;
#else
        real erfcterm = ERFC(b);
#endif
        // The (0,0,0) point is not skipped: with m = 0 the term reduces to fac2*denom.
        real eterm = (fac1*erfcterm*m3 + expterm*(fac2 + fac3*m2)) * denom;
        halfcomplex_pmeGrid[index] = make_real2(grid.x*eterm, grid.y*eterm);
    }
}
/**
 * Accumulate the reciprocal space dispersion energy.
 *
 * The loop runs over every point of the FULL NX*NY*NZ dispersion grid.  Points whose kz
 * lies beyond the stored half (kz >= NZ/2+1) are read from the half complex grid at the
 * mirrored index (-kx, -ky, -kz).  Each thread writes half of its partial sum to its own
 * slot of energyBuffer: overwriting it when USE_PME_STREAM is defined, adding to it
 * otherwise.
 */
extern "C" __global__ void gridEvaluateDispersionEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__ energyBuffer,
        const real* __restrict__ pmeBsplineModuliX, const real* __restrict__ pmeBsplineModuliY, const real* __restrict__ pmeBsplineModuliZ,
        real4 periodicBoxSize, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
    const unsigned int gridSize = DISPERSION_GRID_SIZE_X*DISPERSION_GRID_SIZE_Y*DISPERSION_GRID_SIZE_Z;
    const int halfZ = DISPERSION_GRID_SIZE_Z/2+1;
    const real scaleFactor = -2*M_PI*SQRT(M_PI)*RECIP(6*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
    const real alpha = EWALD_DISPERSION_ALPHA;
    real bfac = M_PI / alpha;
    real fac1 = 2*M_PI*M_PI*M_PI*SQRT(M_PI);
    real fac2 = alpha*alpha*alpha;
    real fac3 = -2*alpha*M_PI*M_PI;
    mixed threadEnergy = 0;

    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) {
        // Decode the flat index into (kx, ky, kz) on the full grid.
        const int kx = index/(DISPERSION_GRID_SIZE_Y*(DISPERSION_GRID_SIZE_Z));
        const int remainder = index-kx*DISPERSION_GRID_SIZE_Y*(DISPERSION_GRID_SIZE_Z);
        const int ky = remainder/(DISPERSION_GRID_SIZE_Z);
        const int kz = remainder-ky*(DISPERSION_GRID_SIZE_Z);

        // Map indices in the upper half of each dimension to negative frequencies.
        const int mx = (kx < (DISPERSION_GRID_SIZE_X+1)/2) ? kx : (kx-DISPERSION_GRID_SIZE_X);
        const int my = (ky < (DISPERSION_GRID_SIZE_Y+1)/2) ? ky : (ky-DISPERSION_GRID_SIZE_Y);
        const int mz = (kz < (DISPERSION_GRID_SIZE_Z+1)/2) ? kz : (kz-DISPERSION_GRID_SIZE_Z);
        real mhx = mx*recipBoxVecX.x;
        real mhy = mx*recipBoxVecY.x+my*recipBoxVecY.y;
        real mhz = mx*recipBoxVecZ.x+my*recipBoxVecZ.y+mz*recipBoxVecZ.z;
        real m2 = mhx*mhx+mhy*mhy+mhz*mhz;

        // The moduli are looked up with the unmirrored indices.
        real bx = pmeBsplineModuliX[kx];
        real by = pmeBsplineModuliY[ky];
        real bz = pmeBsplineModuliZ[kz];
        real denom = scaleFactor/(bx*by*bz);
        real m = SQRT(m2);
        real m3 = m*m2;
        real b = bfac*m;
        real expfac = -b*b;
        real expterm = EXP(expfac);
#if FAST_ERFC
        // This approximation for erfc is from Abramowitz and Stegun (1964) p. 299.  They cite the following as
        // the original source: C. Hastings, Jr., Approximations for Digital Computers (1955).  It has a maximum
        // error of 1.5e-7.  Taken from the CUDA platform's AMOEBA plugin.
        real t = 1.0f/(1.0f+0.3275911f*b);
        real erfcterm = (0.254829592f+(-0.284496736f+(1.421413741f+(-1.453152027f+1.061405429f*t)*t)*t)*t)*t*expterm;
#else
        real erfcterm = ERFC(b);
#endif
        real eterm = (fac1*erfcterm*m3 + expterm*(fac2 + fac3*m2)) * denom;

        // Locate this point in the half complex grid, mirroring it when kz is in the
        // half that is not stored.
        int hx = kx;
        int hy = ky;
        int hz = kz;
        if (kz >= halfZ) {
            hx = ((kx == 0) ? kx : DISPERSION_GRID_SIZE_X-kx);
            hy = ((ky == 0) ? ky : DISPERSION_GRID_SIZE_Y-ky);
            hz = DISPERSION_GRID_SIZE_Z-kz;
        }
        const int indexInHalfComplexGrid = hz + hy*(DISPERSION_GRID_SIZE_Z/2+1)+hx*(DISPERSION_GRID_SIZE_Y*(DISPERSION_GRID_SIZE_Z/2+1));
        real2 grid = halfcomplex_pmeGrid[indexInHalfComplexGrid];

        // N.B. We include the 0,0,0 point for dispersion
        threadEnergy += eterm*(grid.x*grid.x + grid.y*grid.y);
    }
#ifdef USE_PME_STREAM
    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] = 0.5f*threadEnergy;
#else
    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += 0.5f*threadEnergy;
#endif
}
/**
 * Interpolate the dispersion force on each atom from the PME grid.
 *
 * For every atom the B-spline weights and their derivatives are rebuilt, the gradient of
 * the grid values is accumulated over the PME_ORDER^3 surrounding points, and the result
 * (scaled by the atom's C6 coefficient and converted with the reciprocal box vectors) is
 * added to forceBuffers as 64 bit fixed point values scaled by 2^32.  The x, y and z
 * components are stored PADDED_NUM_ATOMS entries apart.
 *
 * Atoms are visited in the order given by pmeAtomGridIndex (presumably sorted by grid
 * cell, which would improve cache behaviour when loading grid values).
 */
extern "C" __global__ void gridInterpolateDispersionForce(const real4* __restrict__ posq, unsigned long long* __restrict__ forceBuffers,
        const real* __restrict__ originalPmeGrid,
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ, const int2* __restrict__ pmeAtomGridIndex,
        const real* __restrict__ C6s) {
    real3 theta[PME_ORDER];
    real3 dtheta[PME_ORDER];
    const real scale = RECIP(PME_ORDER-1);
    const int firstSlot = blockIdx.x*blockDim.x+threadIdx.x;
    const int slotStride = blockDim.x*gridDim.x;

    for (int slot = firstSlot; slot < NUM_ATOMS; slot += slotStride) {
        const int atom = pmeAtomGridIndex[slot].x;
        real3 gradient = make_real3(0);
        real4 pos = posq[atom];
        APPLY_PERIODIC_TO_POS(pos)

        // Fractional position in grid units and the grid cell that contains it.
        real3 frac = make_real3(pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x,
                                pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y,
                                pos.z*recipBoxVecZ.z);
        frac.x = (frac.x-floor(frac.x))*DISPERSION_GRID_SIZE_X;
        frac.y = (frac.y-floor(frac.y))*DISPERSION_GRID_SIZE_Y;
        frac.z = (frac.z-floor(frac.z))*DISPERSION_GRID_SIZE_Z;
        const int3 cell = make_int3(((int) frac.x) % DISPERSION_GRID_SIZE_X,
                                    ((int) frac.y) % DISPERSION_GRID_SIZE_Y,
                                    ((int) frac.z) % DISPERSION_GRID_SIZE_Z);

        // The full set of B-spline weights is needed, so they are recomputed here rather
        // than loaded from global memory.
        real3 dr = make_real3(frac.x-(int) frac.x, frac.y-(int) frac.y, frac.z-(int) frac.z);
        theta[PME_ORDER-1] = make_real3(0);
        theta[1] = dr;
        theta[0] = make_real3(1)-dr;
        for (int order = 3; order < PME_ORDER; order++) {
            real div = RECIP(order-1);
            theta[order-1] = div*dr*theta[order-2];
            for (int k = 1; k < (order-1); k++)
                theta[order-k-1] = div*((dr+make_real3(k))*theta[order-k-2] + (make_real3(order-k)-dr)*theta[order-k-1]);
            theta[0] = div*(make_real3(1)-dr)*theta[0];
        }

        // The derivatives are differences of the weights one order below the final one,
        // so they must be taken before the last recurrence step overwrites theta.
        dtheta[0] = -theta[0];
        for (int k = 1; k < PME_ORDER; k++)
            dtheta[k] = theta[k-1]-theta[k];

        theta[PME_ORDER-1] = scale*dr*theta[PME_ORDER-2];
        for (int k = 1; k < (PME_ORDER-1); k++)
            theta[PME_ORDER-k-1] = scale*((dr+make_real3(k))*theta[PME_ORDER-k-2] + (make_real3(PME_ORDER-k)-dr)*theta[PME_ORDER-k-1]);
        theta[0] = scale*(make_real3(1)-dr)*theta[0];

        // Accumulate the gradient over every grid point this atom touches, wrapping
        // indices that run past the end of the grid.
        for (int ix = 0; ix < PME_ORDER; ix++) {
            int xbase = cell.x+ix;
            xbase -= (xbase >= DISPERSION_GRID_SIZE_X ? DISPERSION_GRID_SIZE_X : 0);
            xbase = xbase*DISPERSION_GRID_SIZE_Y*DISPERSION_GRID_SIZE_Z;
            const real wx = theta[ix].x;
            const real dwx = dtheta[ix].x;

            for (int iy = 0; iy < PME_ORDER; iy++) {
                int ybase = cell.y+iy;
                ybase -= (ybase >= DISPERSION_GRID_SIZE_Y ? DISPERSION_GRID_SIZE_Y : 0);
                ybase = xbase + ybase*DISPERSION_GRID_SIZE_Z;
                const real wy = theta[iy].y;
                const real dwy = dtheta[iy].y;

                for (int iz = 0; iz < PME_ORDER; iz++) {
                    int zindex = cell.z+iz;
                    zindex -= (zindex >= DISPERSION_GRID_SIZE_Z ? DISPERSION_GRID_SIZE_Z : 0);
                    const int index = ybase + zindex;
                    const real gridvalue = originalPmeGrid[index];
                    gradient.x += dwx*wy*theta[iz].z*gridvalue;
                    gradient.y += wx*dwy*theta[iz].z*gridvalue;
                    gradient.z += wx*wy*dtheta[iz].z*gridvalue;
                }
            }
        }

        // Scale by the C6 coefficient and convert from grid to Cartesian components.
        const real c6 = C6s[atom];
        real forceX = -c6*(gradient.x*DISPERSION_GRID_SIZE_X*recipBoxVecX.x);
        real forceY = -c6*(gradient.x*DISPERSION_GRID_SIZE_X*recipBoxVecY.x+gradient.y*DISPERSION_GRID_SIZE_Y*recipBoxVecY.y);
        real forceZ = -c6*(gradient.x*DISPERSION_GRID_SIZE_X*recipBoxVecZ.x+gradient.y*DISPERSION_GRID_SIZE_Y*recipBoxVecZ.y+gradient.z*DISPERSION_GRID_SIZE_Z*recipBoxVecZ.z);
        atomicAdd(&forceBuffers[atom], static_cast<unsigned long long>((long long) (forceX*0x100000000)));
        atomicAdd(&forceBuffers[atom+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (forceY*0x100000000)));
        atomicAdd(&forceBuffers[atom+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (forceZ*0x100000000)));
    }
}
platforms/opencl/src/OpenCLKernels.cpp
View file @
7b66ba19
...
...
@@ -2209,12 +2209,13 @@ void OpenCLCalcNonbondedForceKernel::getLJPMEParameters(double& alpha, int& nx,
if (nonbondedMethod != LJPME)
throw OpenMMException("getPMEParametersInContext: This Context is not using PME");
if (cl.getPlatformData().useCpuPme)
cpuPme.getAs<CalcPmeReciprocalForceKernel>().getPMEParameters(alpha, nx, ny, nz);
//cpuPme.getAs<CalcPmeReciprocalForceKernel>().getLJPMEParameters(alpha, nx, ny, nz);
throw OpenMMException("getPMEParametersInContext: CPUPME has not been implemented for LJPME yet.");
else {
alpha = this->
a
lpha;
nx =
g
ridSizeX;
ny =
g
ridSizeY;
nz =
g
ridSizeZ;
alpha = this->
dispersionA
lpha;
nx =
dispersionG
ridSizeX;
ny =
dispersionG
ridSizeY;
nz =
dispersionG
ridSizeZ;
}
}
...
...
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
View file @
7b66ba19
...
...
@@ -1155,7 +1155,7 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
NonbondedForce
nb
;
nb
.
setEwaldErrorTolerance
(
force
.
getEwaldErrorTolerance
());
nb
.
setCutoffDistance
(
force
.
getCutoffDistance
());
NonbondedForceImpl
::
calcPMEParameters
(
system
,
nb
,
alpha
,
gridSizeX
,
gridSizeY
,
gridSizeZ
);
NonbondedForceImpl
::
calcPMEParameters
(
system
,
nb
,
alpha
,
gridSizeX
,
gridSizeY
,
gridSizeZ
,
false
);
gridSizeX
=
CudaFFT3D
::
findLegalDimension
(
gridSizeX
);
gridSizeY
=
CudaFFT3D
::
findLegalDimension
(
gridSizeY
);
gridSizeZ
=
CudaFFT3D
::
findLegalDimension
(
gridSizeZ
);
...
...
plugins/cpupme/src/CpuPmeKernelFactory.cpp
View file @
7b66ba19
...
...
@@ -55,5 +55,7 @@ extern "C" OPENMM_EXPORT_PME void registerPlatforms() {
/**
 * Create the kernel implementation registered under the given name.
 *
 * Two kernel names are recognised: the electrostatic PME reciprocal kernel and
 * the dispersion PME reciprocal kernel.  Any other name raises an OpenMMException.
 * The context argument is not used by this factory.
 */
KernelImpl* CpuPmeKernelFactory::createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const {
    if (name == CalcPmeReciprocalForceKernel::Name()) {
        return new CpuCalcPmeReciprocalForceKernel(name, platform);
    }
    else if (name == CalcDispersionPmeReciprocalForceKernel::Name()) {
        return new CpuCalcDispersionPmeReciprocalForceKernel(name, platform);
    }

    // Unknown kernel name: report it verbatim to the caller.
    std::string message("Tried to create kernel with illegal kernel name '");
    message = message + name + "'";
    throw OpenMMException(message.c_str());
}
plugins/cpupme/src/CpuPmeKernels.cpp
View file @
7b66ba19
...
...
@@ -48,8 +48,8 @@ using namespace std;
static
const
int
PME_ORDER
=
5
;
bool
CpuCalcPmeReciprocalForceKernel
::
hasInitializedThreads
=
false
;
int
CpuCalcPmeReciprocalForceKernel
::
numThreads
=
0
;
bool
CpuCalc
Dispersion
PmeReciprocalForceKernel
::
hasInitializedThreads
=
false
;
int
CpuCalc
Dispersion
PmeReciprocalForceKernel
::
numThreads
=
0
;
static
void
spreadCharge
(
float
*
posq
,
float
*
grid
,
int
gridx
,
int
gridy
,
int
gridz
,
int
numParticles
,
Vec3
*
periodicBoxVectors
,
Vec3
*
recipBoxVectors
,
gmx_atomic_t
&
atomicCounter
,
const
float
epsilonFactor
)
{
float
temp
[
4
];
...
...
@@ -590,7 +590,7 @@ void CpuCalcPmeReciprocalForceKernel::runWorkerThread(ThreadPool& threads, int i
int
complexSize
=
gridx
*
gridy
*
(
gridz
/
2
+
1
);
int
complexStart
=
std
::
max
(
1
,
((
index
*
complexSize
)
/
numThreads
));
int
complexEnd
=
(((
index
+
1
)
*
complexSize
)
/
numThreads
);
const
float
epsilonFactor
=
calculationType
==
Electrostatic
?
sqrt
(
ONE_4PI_EPS0
)
:
1.0
f
;
const
float
epsilonFactor
=
sqrt
(
ONE_4PI_EPS0
);
spreadCharge
(
posq
,
tempGrid
[
index
],
gridx
,
gridy
,
gridz
,
numParticles
,
periodicBoxVectors
,
recipBoxVectors
,
atomicCounter
,
epsilonFactor
);
threads
.
syncThreads
();
int
numGrids
=
tempGrid
.
size
();
...
...
@@ -601,37 +601,16 @@ void CpuCalcPmeReciprocalForceKernel::runWorkerThread(ThreadPool& threads, int i
sum
.
store
(
&
realGrid
[
i
]);
}
threads
.
syncThreads
();
switch
(
calculationType
){
case
Electrostatic
:
if
(
lastBoxVectors
[
0
]
!=
periodicBoxVectors
[
0
]
||
lastBoxVectors
[
1
]
!=
periodicBoxVectors
[
1
]
||
lastBoxVectors
[
2
]
!=
periodicBoxVectors
[
2
])
{
computeReciprocalEterm
(
gridxStart
,
gridxEnd
,
gridx
,
gridy
,
gridz
,
recipEterm
,
alpha
,
bsplineModuli
,
periodicBoxVectors
,
recipBoxVectors
);
threads
.
syncThreads
();
}
if
(
includeEnergy
)
{
threadEnergy
[
index
]
=
reciprocalEnergy
(
gridxStart
,
gridxEnd
,
complexGrid
,
recipEterm
,
gridx
,
gridy
,
gridz
,
alpha
,
bsplineModuli
,
periodicBoxVectors
,
recipBoxVectors
);
threads
.
syncThreads
();
}
reciprocalConvolution
(
complexStart
,
complexEnd
,
complexGrid
,
recipEterm
);
if
(
lastBoxVectors
[
0
]
!=
periodicBoxVectors
[
0
]
||
lastBoxVectors
[
1
]
!=
periodicBoxVectors
[
1
]
||
lastBoxVectors
[
2
]
!=
periodicBoxVectors
[
2
])
{
computeReciprocalEterm
(
gridxStart
,
gridxEnd
,
gridx
,
gridy
,
gridz
,
recipEterm
,
alpha
,
bsplineModuli
,
periodicBoxVectors
,
recipBoxVectors
);
threads
.
syncThreads
();
break
;
case
Dispersion
:
if
(
lastBoxVectors
[
0
]
!=
periodicBoxVectors
[
0
]
||
lastBoxVectors
[
1
]
!=
periodicBoxVectors
[
1
]
||
lastBoxVectors
[
2
]
!=
periodicBoxVectors
[
2
])
{
computeReciprocalDispersionEterm
(
gridxStart
,
gridxEnd
,
gridx
,
gridy
,
gridz
,
recipEterm
,
alpha
,
bsplineModuli
,
periodicBoxVectors
,
recipBoxVectors
);
threads
.
syncThreads
();
}
if
(
includeEnergy
)
{
threadEnergy
[
index
]
=
reciprocalDispersionEnergy
(
gridxStart
,
gridxEnd
,
complexGrid
,
recipEterm
,
gridx
,
gridy
,
gridz
,
alpha
,
bsplineModuli
,
periodicBoxVectors
,
recipBoxVectors
);
threads
.
syncThreads
();
}
// For dispersion, we include the {0,0,0} term, so the start point needs to be redefined
complexStart
=
(
index
*
complexSize
)
/
numThreads
;
reciprocalConvolution
(
complexStart
,
complexEnd
,
complexGrid
,
recipEterm
);
}
if
(
includeEnergy
)
{
threadEnergy
[
index
]
=
reciprocalEnergy
(
gridxStart
,
gridxEnd
,
complexGrid
,
recipEterm
,
gridx
,
gridy
,
gridz
,
alpha
,
bsplineModuli
,
periodicBoxVectors
,
recipBoxVectors
);
threads
.
syncThreads
();
break
;
default:
throw
OpenMMException
(
"Unimplemented convolution type"
);
}
reciprocalConvolution
(
complexStart
,
complexEnd
,
complexGrid
,
recipEterm
);
threads
.
syncThreads
();
interpolateForces
(
posq
,
&
force
[
0
],
realGrid
,
gridx
,
gridy
,
gridz
,
numParticles
,
periodicBoxVectors
,
recipBoxVectors
,
atomicCounter
,
epsilonFactor
);
}
...
...
@@ -702,3 +681,296 @@ int CpuCalcPmeReciprocalForceKernel::findFFTDimension(int minimum, bool isZ) {
minimum
++
;
}
}
/*
* Everything below here is just a clone of the above, but to handle the dispersion term
* instead of electrostatics.
*/
// Definitions of the static members of the electrostatic kernel class.
// NOTE(review): although these follow the "clone ... for dispersion" banner,
// they belong to CpuCalcPmeReciprocalForceKernel; the corresponding statics of
// CpuCalcDispersionPmeReciprocalForceKernel are defined near the top of this file.
// hasInitializedThreads starts out false and numThreads starts at 0; presumably
// both are filled in on first initialize(), as the dispersion class does -- confirm.
bool
CpuCalcPmeReciprocalForceKernel
::
hasInitializedThreads
=
false
;
int
CpuCalcPmeReciprocalForceKernel
::
numThreads
=
0
;
/**
 * Thread pool task that forwards each worker invocation to the
 * runWorkerThread() method of the dispersion PME kernel that created it.
 */
class CpuCalcDispersionPmeReciprocalForceKernel::ComputeTask : public ThreadPool::Task {
public:
    /**
     * @param kernel  the kernel whose runWorkerThread() will be invoked
     */
    ComputeTask(CpuCalcDispersionPmeReciprocalForceKernel& kernel) : owner(kernel) {
    }
    /**
     * Called by the pool on each worker; hands the pool and the worker's index
     * straight through to the owning kernel.
     */
    void execute(ThreadPool& pool, int workerIndex) {
        owner.runWorkerThread(pool, workerIndex);
    }
    // The kernel this task works on behalf of (held by reference, not owned).
    CpuCalcDispersionPmeReciprocalForceKernel& owner;
};
static
void
*
dispersionThreadBody
(
void
*
args
)
{
CpuCalcDispersionPmeReciprocalForceKernel
&
owner
=
*
reinterpret_cast
<
CpuCalcDispersionPmeReciprocalForceKernel
*>
(
args
);
owner
.
runMainThread
();
return
0
;
}
void
CpuCalcDispersionPmeReciprocalForceKernel
::
initialize
(
int
xsize
,
int
ysize
,
int
zsize
,
int
numParticles
,
double
alpha
)
{
if
(
!
hasInitializedThreads
)
{
numThreads
=
getNumProcessors
();
char
*
threadsEnv
=
getenv
(
"OPENMM_CPU_THREADS"
);
if
(
threadsEnv
!=
NULL
)
stringstream
(
threadsEnv
)
>>
numThreads
;
fftwf_init_threads
();
hasInitializedThreads
=
true
;
}
threadEnergy
.
resize
(
numThreads
);
gridx
=
findFFTDimension
(
xsize
,
false
);
gridy
=
findFFTDimension
(
ysize
,
false
);
gridz
=
findFFTDimension
(
zsize
,
true
);
this
->
numParticles
=
numParticles
;
this
->
alpha
=
alpha
;
force
.
resize
(
4
*
numParticles
);
recipEterm
.
resize
(
gridx
*
gridy
*
gridz
);
// Initialize threads.
isFinished
=
false
;
pthread_cond_init
(
&
startCondition
,
NULL
);
pthread_cond_init
(
&
endCondition
,
NULL
);
pthread_mutex_init
(
&
lock
,
NULL
);
pthread_create
(
&
mainThread
,
NULL
,
dispersionThreadBody
,
this
);
// Wait until the main thread is up and running.
pthread_mutex_lock
(
&
lock
);
while
(
!
isFinished
)
pthread_cond_wait
(
&
endCondition
,
&
lock
);
pthread_mutex_unlock
(
&
lock
);
// Initialize FFTW.
for
(
int
i
=
0
;
i
<
numThreads
;
i
++
)
tempGrid
.
push_back
((
float
*
)
fftwf_malloc
(
sizeof
(
float
)
*
(
gridx
*
gridy
*
gridz
+
3
)));
realGrid
=
tempGrid
[
0
];
complexGrid
=
(
fftwf_complex
*
)
fftwf_malloc
(
sizeof
(
fftwf_complex
)
*
gridx
*
gridy
*
(
gridz
/
2
+
1
));
fftwf_plan_with_nthreads
(
numThreads
);
forwardFFT
=
fftwf_plan_dft_r2c_3d
(
gridx
,
gridy
,
gridz
,
realGrid
,
complexGrid
,
FFTW_MEASURE
);
backwardFFT
=
fftwf_plan_dft_c2r_3d
(
gridx
,
gridy
,
gridz
,
complexGrid
,
realGrid
,
FFTW_MEASURE
);
hasCreatedPlan
=
true
;
// Initialize the b-spline moduli.
int
maxSize
=
std
::
max
(
std
::
max
(
gridx
,
gridy
),
gridz
);
vector
<
double
>
data
(
PME_ORDER
);
vector
<
double
>
ddata
(
PME_ORDER
);
vector
<
double
>
bsplinesData
(
maxSize
);
data
[
PME_ORDER
-
1
]
=
0.0
;
data
[
1
]
=
0.0
;
data
[
0
]
=
1.0
;
for
(
int
i
=
3
;
i
<
PME_ORDER
;
i
++
)
{
double
div
=
1.0
/
(
i
-
1.0
);
data
[
i
-
1
]
=
0.0
;
for
(
int
j
=
1
;
j
<
(
i
-
1
);
j
++
)
data
[
i
-
j
-
1
]
=
div
*
(
j
*
data
[
i
-
j
-
2
]
+
(
i
-
j
)
*
data
[
i
-
j
-
1
]);
data
[
0
]
=
div
*
data
[
0
];
}
// Differentiate.
ddata
[
0
]
=
-
data
[
0
];
for
(
int
i
=
1
;
i
<
PME_ORDER
;
i
++
)
ddata
[
i
]
=
data
[
i
-
1
]
-
data
[
i
];
double
div
=
1.0
/
(
PME_ORDER
-
1
);
data
[
PME_ORDER
-
1
]
=
0.0
;
for
(
int
i
=
1
;
i
<
(
PME_ORDER
-
1
);
i
++
)
data
[
PME_ORDER
-
i
-
1
]
=
div
*
(
i
*
data
[
PME_ORDER
-
i
-
2
]
+
(
PME_ORDER
-
i
)
*
data
[
PME_ORDER
-
i
-
1
]);
data
[
0
]
=
div
*
data
[
0
];
for
(
int
i
=
0
;
i
<
maxSize
;
i
++
)
bsplinesData
[
i
]
=
0.0
;
for
(
int
i
=
1
;
i
<=
PME_ORDER
;
i
++
)
bsplinesData
[
i
]
=
data
[
i
-
1
];
// Evaluate the actual bspline moduli for X/Y/Z.
bsplineModuli
[
0
].
resize
(
gridx
);
bsplineModuli
[
1
].
resize
(
gridy
);
bsplineModuli
[
2
].
resize
(
gridz
);
for
(
int
dim
=
0
;
dim
<
3
;
dim
++
)
{
int
ndata
=
bsplineModuli
[
dim
].
size
();
vector
<
float
>&
moduli
=
bsplineModuli
[
dim
];
for
(
int
i
=
0
;
i
<
ndata
;
i
++
)
{
double
sc
=
0.0
;
double
ss
=
0.0
;
for
(
int
j
=
0
;
j
<
ndata
;
j
++
)
{
double
arg
=
(
2.0
*
M_PI
*
i
*
j
)
/
ndata
;
sc
+=
bsplinesData
[
j
]
*
cos
(
arg
);
ss
+=
bsplinesData
[
j
]
*
sin
(
arg
);
}
moduli
[
i
]
=
(
float
)
(
sc
*
sc
+
ss
*
ss
);
}
for
(
int
i
=
0
;
i
<
ndata
;
i
++
)
if
(
moduli
[
i
]
<
1.0e-7
f
)
moduli
[
i
]
=
(
moduli
[
i
-
1
]
+
moduli
[
i
+
1
])
*
0.5
f
;
}
}
/**
 * Shut down the coordinating thread and release the synchronisation objects,
 * grids and FFTW plans.
 *
 * NOTE(review): the mutex, condition variables and thread are created in
 * initialize(); this destructor uses them unconditionally, so it appears to
 * assume initialize() has been called -- confirm against callers.
 */
CpuCalcDispersionPmeReciprocalForceKernel::~CpuCalcDispersionPmeReciprocalForceKernel() {
    // Raise the shutdown flag while holding the lock: the coordinating thread
    // reads isDeleted with the lock held, so writing it here avoids an
    // unsynchronised access to the flag.
    pthread_mutex_lock(&lock);
    isDeleted = true;
    pthread_cond_broadcast(&startCondition);
    pthread_mutex_unlock(&lock);
    pthread_join(mainThread, NULL);
    pthread_mutex_destroy(&lock);
    pthread_cond_destroy(&startCondition);
    pthread_cond_destroy(&endCondition);

    // Release the per-thread real grids (realGrid aliases tempGrid[0], so it is
    // freed here as well) and the complex grid.
    for (int i = 0; i < (int) tempGrid.size(); i++)
        fftwf_free(tempGrid[i]);
    if (complexGrid != NULL)
        fftwf_free(complexGrid);
    if (hasCreatedPlan) {
        fftwf_destroy_plan(forwardFFT);
        fftwf_destroy_plan(backwardFFT);
    }
}
/**
 * Body of the coordinating thread.
 *
 * The thread holds the lock except while it is blocked on startCondition.  It
 * first reports that it is running (isFinished = true, signal endCondition),
 * then repeatedly waits for a computation to be requested, drives the worker
 * pool through the stages of the reciprocal space calculation, runs the forward
 * and backward FFTs between stages, and finally signals endCondition.  It exits
 * when isDeleted is set.
 */
void CpuCalcDispersionPmeReciprocalForceKernel::runMainThread() {
    // This is the main thread that coordinates all the other ones.

    pthread_mutex_lock(&lock);
    isFinished = true;
    pthread_cond_signal(&endCondition);
    ThreadPool threads(numThreads);
    while (true) {
        // Wait for the signal to start.  Condition variables may wake up
        // spuriously, so keep waiting until a computation has actually been
        // requested (beginComputation() clears isFinished) or the kernel is
        // being destroyed.

        while (isFinished && !isDeleted)
            pthread_cond_wait(&startCondition, &lock);
        if (isDeleted)
            break;
        posq = io->getPosq();
        ComputeTask task(*this);
        gmx_atomic_set(&atomicCounter, 0);
        threads.execute(task); // Signal threads to perform charge spreading.
        threads.waitForThreads();
        threads.resumeThreads(); // Signal threads to sum the charge grids.
        threads.waitForThreads();
        fftwf_execute_dft_r2c(forwardFFT, realGrid, complexGrid);
        if (lastBoxVectors[0] != periodicBoxVectors[0] || lastBoxVectors[1] != periodicBoxVectors[1] || lastBoxVectors[2] != periodicBoxVectors[2]) {
            threads.resumeThreads(); // Signal threads to compute the reciprocal scale factors.
            threads.waitForThreads();
        }
        if (includeEnergy) {
            threads.resumeThreads(); // Signal threads to compute energy.
            threads.waitForThreads();
            for (int i = 0; i < (int) threadEnergy.size(); i++)
                energy += threadEnergy[i];
        }
        threads.resumeThreads(); // Signal threads to perform reciprocal convolution.
        threads.waitForThreads();
        fftwf_execute_dft_c2r(backwardFFT, complexGrid, realGrid);
        gmx_atomic_set(&atomicCounter, 0);
        threads.resumeThreads(); // Signal threads to interpolate forces.
        threads.waitForThreads();
        isFinished = true;

        // Remember the box used for this step so the reciprocal scale factors are
        // only recomputed when the box changes.
        lastBoxVectors[0] = periodicBoxVectors[0];
        lastBoxVectors[1] = periodicBoxVectors[1];
        lastBoxVectors[2] = periodicBoxVectors[2];
        pthread_cond_signal(&endCondition);
    }
    pthread_mutex_unlock(&lock);
}
/**
 * Work performed by one worker thread for a single dispersion PME evaluation.
 *
 * Each stage ends with threads.syncThreads(); the number and order of these
 * synchronisation points must stay in step with the resume/wait sequence in
 * runMainThread().
 *
 * @param threads  the pool this worker belongs to
 * @param index    this worker's index, used to pick its share of each grid
 */
void CpuCalcDispersionPmeReciprocalForceKernel::runWorkerThread(ThreadPool& threads, int index) {
    // This worker's slab of x planes.
    const int xBegin = (index*gridx)/numThreads;
    const int xEnd = ((index+1)*gridx)/numThreads;

    // This worker's share of the real grid, counted in blocks of four floats.
    const int numBlocks = (gridx*gridy*gridz+3)/4;
    const int realBegin = 4*((index*numBlocks)/numThreads);
    const int realEnd = 4*(((index+1)*numBlocks)/numThreads);

    // This worker's share of the complex grid.  For dispersion, the {0,0,0} term
    // is included, so the first worker's range starts at element 0.
    const int numComplex = gridx*gridy*(gridz/2+1);
    const int complexBegin = (index*numComplex)/numThreads;
    const int complexStop = ((index+1)*numComplex)/numThreads;

    const float epsilonFactor = 1.0f;

    // Stage 1: spread onto this worker's private grid.
    spreadCharge(posq, tempGrid[index], gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors, atomicCounter, epsilonFactor);
    threads.syncThreads();

    // Stage 2: accumulate all private grids into realGrid, four floats at a time.
    const int gridCount = tempGrid.size();
    for (int offset = realBegin; offset < realEnd; offset += 4) {
        fvec4 total(&realGrid[offset]);
        for (int g = 1; g < gridCount; g++)
            total += fvec4(&tempGrid[g][offset]);
        total.store(&realGrid[offset]);
    }
    threads.syncThreads();

    // Stage 3 (only when the box differs from the previous evaluation): refresh
    // the reciprocal scale factors.
    const bool boxUnchanged = !(lastBoxVectors[0] != periodicBoxVectors[0])
                           && !(lastBoxVectors[1] != periodicBoxVectors[1])
                           && !(lastBoxVectors[2] != periodicBoxVectors[2]);
    if (!boxUnchanged) {
        computeReciprocalDispersionEterm(xBegin, xEnd, gridx, gridy, gridz, recipEterm, alpha, bsplineModuli, periodicBoxVectors, recipBoxVectors);
        threads.syncThreads();
    }

    // Stage 4 (only when energy was requested): this worker's energy contribution.
    if (includeEnergy) {
        threadEnergy[index] = reciprocalDispersionEnergy(xBegin, xEnd, complexGrid, recipEterm, gridx, gridy, gridz, alpha, bsplineModuli, periodicBoxVectors, recipBoxVectors);
        threads.syncThreads();
    }

    // Stage 5: reciprocal space convolution.
    reciprocalConvolution(complexBegin, complexStop, complexGrid, recipEterm);
    threads.syncThreads();

    // Stage 6: interpolate forces from the back-transformed grid.
    interpolateForces(posq, &force[0], realGrid, gridx, gridy, gridz, numParticles, periodicBoxVectors, recipBoxVectors, atomicCounter, epsilonFactor);
}
/**
 * Begin computing the force and energy.
 *
 * Records the inputs for this evaluation, resets the accumulated energy,
 * computes the reciprocal box vectors, and wakes the coordinating thread.
 *
 * @param io                  an object that coordinates data transfer
 * @param periodicBoxVectors  the vectors defining the periodic box
 * @param includeEnergy       true if potential energy should be computed
 */
void CpuCalcDispersionPmeReciprocalForceKernel::beginComputation(IO& io, const Vec3* periodicBoxVectors, bool includeEnergy) {
    this->io = &io;
    for (int axis = 0; axis < 3; axis++)
        this->periodicBoxVectors[axis] = periodicBoxVectors[axis];
    this->includeEnergy = includeEnergy;
    energy = 0.0;

    // Invert the box vectors.  The determinant is taken as the product of the
    // diagonal elements only.

    const Vec3& a = periodicBoxVectors[0];
    const Vec3& b = periodicBoxVectors[1];
    const Vec3& c = periodicBoxVectors[2];
    double determinant = a[0]*b[1]*c[2];
    double scale = 1.0/determinant;
    recipBoxVectors[0] = Vec3(b[1]*c[2], 0, 0)*scale;
    recipBoxVectors[1] = Vec3(-b[0]*c[2], a[0]*c[2], 0)*scale;
    recipBoxVectors[2] = Vec3(b[0]*c[1]-b[1]*c[0], -a[0]*c[1], a[0]*b[1])*scale;

    // Do the calculation: mark it as pending and wake the coordinating thread.

    pthread_mutex_lock(&lock);
    isFinished = false;
    pthread_cond_signal(&startCondition);
    pthread_mutex_unlock(&lock);
}
/**
 * Finish computing the force and energy.
 *
 * Blocks until the coordinating thread reports the evaluation as finished,
 * then hands the force buffer to the IO object.
 *
 * @param io  an object that coordinates data transfer
 * @return the energy accumulated for this evaluation
 */
double CpuCalcDispersionPmeReciprocalForceKernel::finishComputation(IO& io) {
    pthread_mutex_lock(&lock);
    for (; !isFinished; )
        pthread_cond_wait(&endCondition, &lock);
    pthread_mutex_unlock(&lock);

    float* forceData = &force[0];
    io.setForce(forceData);
    return energy;
}
/**
 * Get whether the current CPU supports all features needed by this kernel.
 * The answer is whatever isVec4Supported() reports.
 */
bool CpuCalcDispersionPmeReciprocalForceKernel::isProcessorSupported() {
    const bool vec4Available = isVec4Supported();
    return vec4Available;
}
/**
 * Get the parameters being used for PME.
 *
 * @param alpha  receives the separation parameter stored by initialize()
 * @param nx     receives the number of grid points along the X axis
 * @param ny     receives the number of grid points along the Y axis
 * @param nz     receives the number of grid points along the Z axis
 */
void CpuCalcDispersionPmeReciprocalForceKernel::getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
    // The grid sizes reported are the ones actually in use (after rounding up to
    // an FFT-friendly size in initialize()), not the sizes originally requested.
    nx = this->gridx;
    ny = this->gridy;
    nz = this->gridz;
    alpha = this->alpha;
}
/**
 * Select a size for one grid dimension that FFTW can handle efficiently.
 *
 * Returns the smallest value >= minimum that, after removing all factors of
 * 2, 3, 5 and 7, leaves 1, 11 or 13.  When isZ is true only even values are
 * considered.  Any minimum below 1 yields 1.
 *
 * @param minimum  the smallest acceptable size
 * @param isZ      true if this is the last (z) dimension
 */
int CpuCalcDispersionPmeReciprocalForceKernel::findFFTDimension(int minimum, bool isZ) {
    if (minimum < 1)
        return 1;
    static const int smallPrimes[] = {2, 3, 5, 7};
    for (int candidate = minimum; ; candidate++) {
        // Force the last dimension to be even, since this produces better performance in FFTW.
        if (isZ && candidate%2 == 1)
            continue;

        // Strip every small prime factor from the candidate.
        int remainder = candidate;
        for (int p = 0; p < 4; p++) {
            const int prime = smallPrimes[p];
            while (remainder > 1 && remainder%prime == 0)
                remainder /= prime;
        }
        if (remainder == 1 || remainder == 11 || remainder == 13)
            return candidate;
    }
}
plugins/cpupme/src/CpuPmeKernels.h
View file @
7b66ba19
...
...
@@ -51,10 +51,8 @@ namespace OpenMM {
class
OPENMM_EXPORT_PME
CpuCalcPmeReciprocalForceKernel
:
public
CalcPmeReciprocalForceKernel
{
public:
enum
CalculationType
{
Electrostatic
=
0
,
Dispersion
=
1
};
CpuCalcPmeReciprocalForceKernel
(
std
::
string
name
,
const
Platform
&
platform
)
:
CalcPmeReciprocalForceKernel
(
name
,
platform
),
hasCreatedPlan
(
false
),
isDeleted
(
false
),
realGrid
(
NULL
),
complexGrid
(
NULL
)
,
calculationType
(
Electrostatic
)
{
hasCreatedPlan
(
false
),
isDeleted
(
false
),
realGrid
(
NULL
),
complexGrid
(
NULL
)
{
}
/**
* Initialize the kernel.
...
...
@@ -103,11 +101,98 @@ public:
* @param nz the number of grid points along the Z axis
*/
void
getPMEParameters
(
double
&
alpha
,
int
&
nx
,
int
&
ny
,
int
&
nz
)
const
;
private:
class
ComputeTask
;
/**
* Select a size for one grid dimension that FFTW can handle efficiently.
*/
int
findFFTDimension
(
int
minimum
,
bool
isZ
);
static
bool
hasInitializedThreads
;
static
int
numThreads
;
int
gridx
,
gridy
,
gridz
,
numParticles
;
double
alpha
;
bool
hasCreatedPlan
,
isFinished
,
isDeleted
;
std
::
vector
<
float
>
force
;
std
::
vector
<
float
>
bsplineModuli
[
3
];
std
::
vector
<
float
>
recipEterm
;
Vec3
lastBoxVectors
[
3
];
std
::
vector
<
float
>
threadEnergy
;
std
::
vector
<
float
*>
tempGrid
;
float
*
realGrid
;
fftwf_complex
*
complexGrid
;
fftwf_plan
forwardFFT
,
backwardFFT
;
int
waitCount
;
pthread_cond_t
startCondition
,
endCondition
;
pthread_mutex_t
lock
;
pthread_t
mainThread
;
// The following variables are used to store information about the calculation currently being performed.
IO
*
io
;
float
energy
;
float
*
posq
;
Vec3
periodicBoxVectors
[
3
],
recipBoxVectors
[
3
];
bool
includeEnergy
;
gmx_atomic_t
atomicCounter
;
};
/**
* This is an optimized CPU implementation of CalcDispersionPmeReciprocalForceKernel. It is both
* vectorized (requiring SSE 4.1) and multithreaded. It uses FFTW to perform the FFTs.
*/
class
OPENMM_EXPORT_PME
CpuCalcDispersionPmeReciprocalForceKernel
:
public
CalcPmeReciprocalForceKernel
{
public:
CpuCalcDispersionPmeReciprocalForceKernel
(
std
::
string
name
,
const
Platform
&
platform
)
:
CalcPmeReciprocalForceKernel
(
name
,
platform
),
hasCreatedPlan
(
false
),
isDeleted
(
false
),
realGrid
(
NULL
),
complexGrid
(
NULL
)
{
}
/**
* Initialize the kernel.
*
* @param gridx the x size of the PME grid
* @param gridy the y size of the PME grid
* @param gridz the z size of the PME grid
* @param numParticles the number of particles in the system
* @param alpha the Ewald blending parameter
*/
void
initialize
(
int
xsize
,
int
ysize
,
int
zsize
,
int
numParticles
,
double
alpha
);
~
CpuCalcDispersionPmeReciprocalForceKernel
();
/**
* Begin computing the force and energy.
*
* @param io an object that coordinates data transfer
* @param periodicBoxVectors the vectors defining the periodic box (measured in nm)
* @param includeEnergy true if potential energy should be computed
*/
void
beginComputation
(
IO
&
io
,
const
Vec3
*
periodicBoxVectors
,
bool
includeEnergy
);
/**
* Sets the type of reciprocal space computation to perform (Electrostatic or Dispersion).
* @param type The type of computation
* Finish computing the force and energy.
*
* @param io an object that coordinates data transfer
* @return the potential energy due to the PME reciprocal space interactions
*/
void
setCalculationType
(
CalculationType
type
)
{
calculationType
=
type
;
}
double
finishComputation
(
IO
&
io
);
/**
* This routine contains the code executed by the main thread.
*/
void
runMainThread
();
/**
* This routine contains the code executed by each worker thread.
*/
void
runWorkerThread
(
ThreadPool
&
threads
,
int
index
);
/**
* Get whether the current CPU supports all features needed by this kernel.
*/
static
bool
isProcessorSupported
();
/**
* Get the parameters being used for PME.
*
* @param alpha the separation parameter
* @param nx the number of grid points along the X axis
* @param ny the number of grid points along the Y axis
* @param nz the number of grid points along the Z axis
*/
void
getPMEParameters
(
double
&
alpha
,
int
&
nx
,
int
&
ny
,
int
&
nz
)
const
;
private:
class
ComputeTask
;
/**
...
...
@@ -138,7 +223,6 @@ private:
float
*
posq
;
Vec3
periodicBoxVectors
[
3
],
recipBoxVectors
[
3
];
bool
includeEnergy
;
CalculationType
calculationType
;
gmx_atomic_t
atomicCounter
;
};
...
...
plugins/cpupme/tests/TestCpuPme.cpp
View file @
7b66ba19
...
...
@@ -523,8 +523,7 @@ void test_water2_dpme_energies_forces_no_exclusions() {
const
vector
<
Vec3
>&
refforces
=
state
.
getForces
();
// Optimized CPU calculation
CpuCalcPmeReciprocalForceKernel
pme
(
CalcPmeReciprocalForceKernel
::
Name
(),
platform
);
pme
.
setCalculationType
(
CpuCalcPmeReciprocalForceKernel
::
Dispersion
);
CpuCalcDispersionPmeReciprocalForceKernel
pme
(
CalcPmeReciprocalForceKernel
::
Name
(),
platform
);
IO
io
;
double
selfEwaldEnergy
=
0
;
double
dalpha6
=
pow
(
dalpha
,
6.0
);
...
...
tests/TestNonbondedForce.h
View file @
7b66ba19
...
...
@@ -1695,8 +1695,10 @@ void test_water125_dpme_vs_long_cutoff_with_exclusions() {
ASSERT_EQUAL_TOL
(
refenergy
,
energy
,
5E-4
);
ASSERT_EQUAL_TOL
(
gromacs_energy
,
energy
,
5E-5
);
// Forces accumulated in single precision are tested to a more permissive criterion; the double
// precision platform can match to 5E-5.
for
(
int
n
=
0
;
n
<
NATOMS
;
++
n
)
ASSERT_EQUAL_VEC
(
refforces
[
n
],
forces
[
n
],
5
E-
5
);
ASSERT_EQUAL_VEC
(
refforces
[
n
],
forces
[
n
],
1
E-
4
);
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment