Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
98c30a3f
Commit
98c30a3f
authored
Jun 10, 2010
by
Peter Eastman
Browse files
Major optimizations to PME performance
parent
b880b5d9
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
120 additions
and
65 deletions
+120
-65
platforms/cuda/src/kernels/bbsort.cu
platforms/cuda/src/kernels/bbsort.cu
+3
-3
platforms/cuda/src/kernels/bbsort_kernel.cu
platforms/cuda/src/kernels/bbsort_kernel.cu
+2
-2
platforms/cuda/src/kernels/cudatypes.h
platforms/cuda/src/kernels/cudatypes.h
+1
-1
platforms/cuda/src/kernels/gpu.cpp
platforms/cuda/src/kernels/gpu.cpp
+1
-1
platforms/cuda/src/kernels/gputypes.h
platforms/cuda/src/kernels/gputypes.h
+1
-1
platforms/cuda/src/kernels/kCalculatePME.cu
platforms/cuda/src/kernels/kCalculatePME.cu
+46
-21
platforms/opencl/src/OpenCLKernels.cpp
platforms/opencl/src/OpenCLKernels.cpp
+7
-6
platforms/opencl/src/OpenCLKernels.h
platforms/opencl/src/OpenCLKernels.h
+2
-2
platforms/opencl/src/kernels/pme.cl
platforms/opencl/src/kernels/pme.cl
+57
-28
No files found.
platforms/cuda/src/kernels/bbsort.cu
View file @
98c30a3f
...
...
@@ -17,7 +17,7 @@
#include "bbsort_kernel.cu"
floa
t
getValue
(
floa
t2
v
){
in
t
getValue
(
in
t2
v
){
return
v
.
y
;
}
...
...
@@ -115,7 +115,7 @@ void reduceMinMax(T* dData,int size,float& result,bool isMax)
CUDA_SAFE_CALL
(
cudaMemcpy
(
&
originalResult
,
dData
,
sizeof
(
T
),
cudaMemcpyDeviceToHost
));
result
=
(
floa
t
)
getValue
(
originalResult
);
result
=
(
in
t
)
getValue
(
originalResult
);
}
template
<
typename
T
>
...
...
@@ -305,7 +305,7 @@ Also note that you need to use 1.3 capbility (use arch=sm_13 in your compile com
*************************************************************************************/
template
<
>
void
bbSort
(
floa
t2
*
dData
,
int
size
,
int
listOrder
)
void
bbSort
(
in
t2
*
dData
,
int
size
,
int
listOrder
)
{
bbSortBody
(
dData
,
size
,
listOrder
);
...
...
platforms/cuda/src/kernels/bbsort_kernel.cu
View file @
98c30a3f
...
...
@@ -19,7 +19,7 @@ texture<unsigned int, 1, cudaReadModeElementType> tBucketOffsets;
texture
<
unsigned
int
,
1
,
cudaReadModeElementType
>
tBucketOfSlices
;
texture
<
unsigned
int
,
1
,
cudaReadModeElementType
>
tSliceOffsetInBucket
;
__device__
floa
t
dGetValue
(
floa
t2
v
){
__device__
in
t
dGetValue
(
in
t2
v
){
return
v
.
y
;
}
...
...
@@ -29,7 +29,7 @@ __device__ T dGetValue(T v){
}
__device__
void
dPad
(
floa
t2
&
v
){
__device__
void
dPad
(
in
t2
&
v
){
v
.
x
=
0x3fffffff
;
v
.
y
=
0x4fffffff
;
}
...
...
platforms/cuda/src/kernels/cudatypes.h
View file @
98c30a3f
...
...
@@ -420,7 +420,7 @@ struct cudaGmxSimulation {
float4
*
pPmeBsplineTheta
;
float4
*
pPmeBsplineDtheta
;
int
*
pPmeAtomRange
;
// The range of sorted atoms at each grid point
floa
t2
*
pPmeAtomGridIndex
;
// The grid point each atom is at
in
t2
*
pPmeAtomGridIndex
;
// The grid point each atom is at
unsigned
int
bonds
;
// Number of bonds
int4
*
pBondID
;
// Bond atom and output buffer IDs
float2
*
pBondParameter
;
// Bond parameters
...
...
platforms/cuda/src/kernels/gpu.cpp
View file @
98c30a3f
...
...
@@ -994,7 +994,7 @@ void gpuSetPMEParameters(gpuContext gpu, float alpha, int gridSizeX, int gridSiz
gpu
->
sim
.
pPmeBsplineDtheta
=
gpu
->
psPmeBsplineDtheta
->
_pDevData
;
gpu
->
psPmeAtomRange
=
new
CUDAStream
<
int
>
(
gridSize
.
x
*
gridSize
.
y
*
gridSize
.
z
+
1
,
1
,
"PmeAtomRange"
);
gpu
->
sim
.
pPmeAtomRange
=
gpu
->
psPmeAtomRange
->
_pDevData
;
gpu
->
psPmeAtomGridIndex
=
new
CUDAStream
<
floa
t2
>
(
gpu
->
natoms
,
1
,
"PmeAtomGridIndex"
);
gpu
->
psPmeAtomGridIndex
=
new
CUDAStream
<
in
t2
>
(
gpu
->
natoms
,
1
,
"PmeAtomGridIndex"
);
gpu
->
sim
.
pPmeAtomGridIndex
=
gpu
->
psPmeAtomGridIndex
->
_pDevData
;
tabulateErfc
(
gpu
);
...
...
platforms/cuda/src/kernels/gputypes.h
View file @
98c30a3f
...
...
@@ -122,7 +122,7 @@ struct _gpuContext {
CUDAStream
<
float4
>*
psPmeBsplineTheta
;
CUDAStream
<
float4
>*
psPmeBsplineDtheta
;
CUDAStream
<
int
>*
psPmeAtomRange
;
// The range of sorted atoms at each grid point
CUDAStream
<
floa
t2
>*
psPmeAtomGridIndex
;
// The grid point each atom is at
CUDAStream
<
in
t2
>*
psPmeAtomGridIndex
;
// The grid point each atom is at
CUDAStream
<
float2
>*
psObcData
;
CUDAStream
<
float4
>*
psGBVIData
;
CUDAStream
<
float
>*
psObcChain
;
...
...
platforms/cuda/src/kernels/kCalculatePME.cu
View file @
98c30a3f
...
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors.
*
* Portions copyright (c) 2009
-2010
Stanford University and the Authors. *
* Authors: Erik Lindahl, Rossen Apostolov, Szilard Pall, Peter Eastman *
* Contributors: *
* *
...
...
@@ -117,7 +117,7 @@ void kUpdateGridIndexAndFraction_kernel()
int3
gridIndex
=
make_int3
(((
int
)
t
.
x
)
%
cSim
.
pmeGridSize
.
x
,
((
int
)
t
.
y
)
%
cSim
.
pmeGridSize
.
y
,
((
int
)
t
.
z
)
%
cSim
.
pmeGridSize
.
z
);
cSim
.
pPmeAtomGridIndex
[
i
]
=
make_
floa
t2
(
i
,
gridIndex
.
x
*
cSim
.
pmeGridSize
.
y
*
cSim
.
pmeGridSize
.
z
+
gridIndex
.
y
*
cSim
.
pmeGridSize
.
z
+
gridIndex
.
z
);
cSim
.
pPmeAtomGridIndex
[
i
]
=
make_
in
t2
(
i
,
gridIndex
.
x
*
cSim
.
pmeGridSize
.
y
*
cSim
.
pmeGridSize
.
z
+
gridIndex
.
y
*
cSim
.
pmeGridSize
.
z
+
gridIndex
.
z
);
}
}
...
...
@@ -141,7 +141,7 @@ void kFindAtomRangeForGrid_kernel()
int
last
=
(
start
==
0
?
-
1
:
cSim
.
pPmeAtomGridIndex
[
start
-
1
].
y
);
for
(
int
i
=
start
;
i
<
end
;
++
i
)
{
floa
t2
atomData
=
cSim
.
pPmeAtomGridIndex
[
i
];
in
t2
atomData
=
cSim
.
pPmeAtomGridIndex
[
i
];
int
gridIndex
=
atomData
.
y
;
if
(
gridIndex
!=
last
)
{
...
...
@@ -150,10 +150,13 @@ void kFindAtomRangeForGrid_kernel()
last
=
gridIndex
;
}
// The grid index won't be needed again. Reuse that component to hold the
atom charge
, thus saving
//
an extra load operation
in the charge spreading kernel.
// The grid index won't be needed again. Reuse that component to hold the
z index
, thus saving
//
some work
in the charge spreading kernel.
cSim
.
pPmeAtomGridIndex
[
i
].
y
=
cSim
.
pPosq
[(
int
)
atomData
.
x
].
w
;
float
posz
=
cSim
.
pPosq
[
atomData
.
x
].
z
;
posz
-=
floor
(
posz
*
cSim
.
invPeriodicBoxSizeZ
)
*
cSim
.
periodicBoxSizeZ
;
int
z
=
((
int
)
((
posz
*
cSim
.
invPeriodicBoxSizeZ
)
*
cSim
.
pmeGridSize
.
z
))
%
cSim
.
pmeGridSize
.
z
;
cSim
.
pPmeAtomGridIndex
[
i
].
y
=
z
;
}
// Fill in values beyond the last atom.
...
...
@@ -266,28 +269,47 @@ void kGridSpreadCharge_kernel()
int
remainder
=
gridIndex
-
gridPoint
.
x
*
cSim
.
pmeGridSize
.
y
*
cSim
.
pmeGridSize
.
z
;
gridPoint
.
y
=
remainder
/
cSim
.
pmeGridSize
.
z
;
gridPoint
.
z
=
remainder
-
gridPoint
.
y
*
cSim
.
pmeGridSize
.
z
;
gridPoint
.
x
+=
cSim
.
pmeGridSize
.
x
;
gridPoint
.
y
+=
cSim
.
pmeGridSize
.
y
;
gridPoint
.
z
+=
cSim
.
pmeGridSize
.
z
;
float
result
=
0.0
f
;
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
++
ix
)
{
int
x
=
gridPoint
.
x
-
ix
+
(
gridPoint
.
x
>=
ix
?
0
:
cSim
.
pmeGridSize
.
x
);
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
++
iy
)
for
(
int
iz
=
0
;
iz
<
PME_ORDER
;
++
iz
)
{
int
y
=
gridPoint
.
y
-
iy
+
(
gridPoint
.
y
>=
iy
?
0
:
cSim
.
pmeGridSize
.
y
);
int
z1
=
gridPoint
.
z
-
PME_ORDER
+
1
;
z1
+=
(
z1
>=
0
?
0
:
cSim
.
pmeGridSize
.
z
);
int
z2
=
(
z1
<
gridPoint
.
z
?
gridPoint
.
z
:
cSim
.
pmeGridSize
.
z
-
1
);
int
gridIndex1
=
x
*
cSim
.
pmeGridSize
.
y
*
cSim
.
pmeGridSize
.
z
+
y
*
cSim
.
pmeGridSize
.
z
+
z1
;
int
gridIndex2
=
x
*
cSim
.
pmeGridSize
.
y
*
cSim
.
pmeGridSize
.
z
+
y
*
cSim
.
pmeGridSize
.
z
+
z2
;
int
firstAtom
=
cSim
.
pPmeAtomRange
[
gridIndex1
];
int
lastAtom
=
cSim
.
pPmeAtomRange
[
gridIndex2
+
1
];
for
(
int
i
=
firstAtom
;
i
<
lastAtom
;
++
i
)
{
int2
atomData
=
cSim
.
pPmeAtomGridIndex
[
i
];
int
atomIndex
=
atomData
.
x
;
int
z
=
atomData
.
y
;
int
iz
=
gridPoint
.
z
-
z
+
(
gridPoint
.
z
>=
z
?
0
:
cSim
.
pmeGridSize
.
z
);
float
atomCharge
=
cSim
.
pPosq
[
atomIndex
].
w
;
result
+=
atomCharge
*
tex1Dfetch
(
bsplineThetaRef
,
atomIndex
+
ix
*
cSim
.
atoms
).
x
*
tex1Dfetch
(
bsplineThetaRef
,
atomIndex
+
iy
*
cSim
.
atoms
).
y
*
tex1Dfetch
(
bsplineThetaRef
,
atomIndex
+
iz
*
cSim
.
atoms
).
z
;
}
if
(
z1
>
gridPoint
.
z
)
{
int
x
=
(
gridPoint
.
x
-
ix
)
%
cSim
.
pmeGridSize
.
x
;
int
y
=
(
gridPoint
.
y
-
iy
)
%
cSim
.
pmeGridSize
.
y
;
int
z
=
(
gridPoint
.
z
-
iz
)
%
cSim
.
pmeGridSize
.
z
;
int
gridIndex
=
x
*
cSim
.
pmeGridSize
.
y
*
cSim
.
pmeGridSize
.
z
+
y
*
cSim
.
pmeGridSize
.
z
+
z
;
int
firstAtom
=
cSim
.
pPmeAtomRange
[
gridIndex
];
int
lastAtom
=
cSim
.
pPmeAtomRange
[
gridIndex
+
1
];
gridIndex1
=
x
*
cSim
.
pmeGridSize
.
y
*
cSim
.
pmeGridSize
.
z
+
y
*
cSim
.
pmeGridSize
.
z
;
gridIndex2
=
x
*
cSim
.
pmeGridSize
.
y
*
cSim
.
pmeGridSize
.
z
+
y
*
cSim
.
pmeGridSize
.
z
+
gridPoint
.
z
;
firstAtom
=
cSim
.
pPmeAtomRange
[
gridIndex1
];
lastAtom
=
cSim
.
pPmeAtomRange
[
gridIndex2
+
1
];
for
(
int
i
=
firstAtom
;
i
<
lastAtom
;
++
i
)
{
floa
t2
atomData
=
cSim
.
pPmeAtomGridIndex
[
i
];
in
t2
atomData
=
cSim
.
pPmeAtomGridIndex
[
i
];
int
atomIndex
=
atomData
.
x
;
float
atomCharge
=
atomData
.
y
;
int
z
=
atomData
.
y
;
int
iz
=
gridPoint
.
z
-
z
+
(
gridPoint
.
z
>=
z
?
0
:
cSim
.
pmeGridSize
.
z
);
float
atomCharge
=
cSim
.
pPosq
[
atomIndex
].
w
;
result
+=
atomCharge
*
tex1Dfetch
(
bsplineThetaRef
,
atomIndex
+
ix
*
cSim
.
atoms
).
x
*
tex1Dfetch
(
bsplineThetaRef
,
atomIndex
+
iy
*
cSim
.
atoms
).
y
*
tex1Dfetch
(
bsplineThetaRef
,
atomIndex
+
iz
*
cSim
.
atoms
).
z
;
}
}
}
}
cSim
.
pPmeGrid
[
gridIndex
]
=
make_cuComplex
(
result
*
sqrt
(
cSim
.
epsfac
),
0.0
f
);
}
}
...
...
@@ -358,17 +380,20 @@ void kGridInterpolateForce_kernel()
((
int
)
t
.
z
)
%
cSim
.
pmeGridSize
.
z
);
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
int
xindex
=
(
gridIndex
.
x
+
ix
)
%
cSim
.
pmeGridSize
.
x
;
int
xindex
=
gridIndex
.
x
+
ix
;
xindex
-=
(
xindex
>=
cSim
.
pmeGridSize
.
x
?
cSim
.
pmeGridSize
.
x
:
0
);
float
tx
=
cSim
.
pPmeBsplineTheta
[
atom
+
ix
*
cSim
.
atoms
].
x
;
float
dtx
=
cSim
.
pPmeBsplineDtheta
[
atom
+
ix
*
cSim
.
atoms
].
x
;
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
int
yindex
=
(
gridIndex
.
y
+
iy
)
%
cSim
.
pmeGridSize
.
y
;
int
yindex
=
gridIndex
.
y
+
iy
;
yindex
-=
(
yindex
>=
cSim
.
pmeGridSize
.
y
?
cSim
.
pmeGridSize
.
y
:
0
);
float
ty
=
cSim
.
pPmeBsplineTheta
[
atom
+
iy
*
cSim
.
atoms
].
y
;
float
dty
=
cSim
.
pPmeBsplineDtheta
[
atom
+
iy
*
cSim
.
atoms
].
y
;
for
(
int
iz
=
0
;
iz
<
PME_ORDER
;
iz
++
)
{
int
zindex
=
(
gridIndex
.
z
+
iz
)
%
cSim
.
pmeGridSize
.
z
;
int
zindex
=
gridIndex
.
z
+
iz
;
zindex
-=
(
zindex
>=
cSim
.
pmeGridSize
.
z
?
cSim
.
pmeGridSize
.
z
:
0
);
float
tz
=
cSim
.
pPmeBsplineTheta
[
atom
+
iz
*
cSim
.
atoms
].
z
;
float
dtz
=
cSim
.
pPmeBsplineDtheta
[
atom
+
iz
*
cSim
.
atoms
].
z
;
int
index
=
xindex
*
cSim
.
pmeGridSize
.
y
*
cSim
.
pmeGridSize
.
z
+
yindex
*
cSim
.
pmeGridSize
.
z
+
zindex
;
...
...
platforms/opencl/src/OpenCLKernels.cpp
View file @
98c30a3f
...
...
@@ -1151,8 +1151,8 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
pmeBsplineTheta
=
new
OpenCLArray
<
mm_float4
>
(
cl
,
PmeOrder
*
numParticles
,
"pmeBsplineTheta"
);
pmeBsplineDtheta
=
new
OpenCLArray
<
mm_float4
>
(
cl
,
PmeOrder
*
numParticles
,
"pmeBsplineDtheta"
);
pmeAtomRange
=
new
OpenCLArray
<
cl_int
>
(
cl
,
gridSizeX
*
gridSizeY
*
gridSizeZ
+
1
,
"pmeAtomRange"
);
pmeAtomGridIndex
=
new
OpenCLArray
<
mm_
floa
t2
>
(
cl
,
numParticles
,
"pmeAtomGridIndex"
);
sort
=
new
OpenCLSort
<
mm_
floa
t2
>
(
cl
,
cl
.
getNumAtoms
(),
"
floa
t2"
,
"value.y"
);
pmeAtomGridIndex
=
new
OpenCLArray
<
mm_
in
t2
>
(
cl
,
numParticles
,
"pmeAtomGridIndex"
);
sort
=
new
OpenCLSort
<
mm_
in
t2
>
(
cl
,
cl
.
getNumAtoms
(),
"
in
t2"
,
"value.y"
);
fft
=
new
OpenCLFFT3D
(
cl
,
gridSizeX
,
gridSizeY
,
gridSizeZ
);
// Initialize the b-spline moduli.
...
...
@@ -1303,10 +1303,11 @@ void OpenCLCalcNonbondedForceKernel::executeForces(ContextImpl& context) {
pmeUpdateBsplinesKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
pmeBsplineDtheta
->
getDeviceBuffer
());
pmeUpdateBsplinesKernel
.
setArg
(
3
,
2
*
OpenCLContext
::
ThreadBlockSize
*
PmeOrder
*
sizeof
(
mm_float4
),
NULL
);
pmeUpdateBsplinesKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
pmeAtomGridIndex
->
getDeviceBuffer
());
pmeSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
pmeAtomGridIndex
->
getDeviceBuffer
());
pmeSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
pmeAtomRange
->
getDeviceBuffer
());
pmeSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
pmeGrid
->
getDeviceBuffer
());
pmeSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
pmeBsplineTheta
->
getDeviceBuffer
());
pmeSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
cl
.
getPosq
().
getDeviceBuffer
());
pmeSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
pmeAtomGridIndex
->
getDeviceBuffer
());
pmeSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
pmeAtomRange
->
getDeviceBuffer
());
pmeSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
pmeGrid
->
getDeviceBuffer
());
pmeSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
pmeBsplineTheta
->
getDeviceBuffer
());
pmeConvolutionKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
pmeGrid
->
getDeviceBuffer
());
pmeConvolutionKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
cl
.
getEnergyBuffer
().
getDeviceBuffer
());
pmeConvolutionKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
pmeBsplineModuliX
->
getDeviceBuffer
());
...
...
platforms/opencl/src/OpenCLKernels.h
View file @
98c30a3f
...
...
@@ -491,9 +491,9 @@ private:
OpenCLArray
<
mm_float4
>*
pmeBsplineTheta
;
OpenCLArray
<
mm_float4
>*
pmeBsplineDtheta
;
OpenCLArray
<
cl_int
>*
pmeAtomRange
;
OpenCLArray
<
mm_
floa
t2
>*
pmeAtomGridIndex
;
OpenCLArray
<
mm_
in
t2
>*
pmeAtomGridIndex
;
OpenCLArray
<
cl_float
>*
erfcTable
;
OpenCLSort
<
mm_
floa
t2
>*
sort
;
OpenCLSort
<
mm_
in
t2
>*
sort
;
OpenCLFFT3D
*
fft
;
cl
::
Kernel
exceptionsKernel
;
cl
::
Kernel
ewaldSumsKernel
;
...
...
platforms/opencl/src/kernels/pme.cl
View file @
98c30a3f
__kernel
void
updateGridIndexAndFraction
(
__global
float4*
posq,
__global
floa
t2*
pmeAtomGridIndex,
float4
periodicBoxSize,
float4
invPeriodicBoxSize
)
{
__kernel
void
updateGridIndexAndFraction
(
__global
float4*
posq,
__global
in
t2*
pmeAtomGridIndex,
float4
periodicBoxSize,
float4
invPeriodicBoxSize
)
{
for
(
int
i
=
get_global_id
(
0
)
; i < NUM_ATOMS; i += get_global_size(0)) {
float4
pos
=
posq[i]
;
pos.x
-=
floor
(
pos.x*invPeriodicBoxSize.x
)
*periodicBoxSize.x
;
...
...
@@ -10,7 +10,7 @@ __kernel void updateGridIndexAndFraction(__global float4* posq, __global float2*
int4
gridIndex
=
(
int4
)
(((
int
)
t.x
)
%
GRID_SIZE_X,
((
int
)
t.y
)
%
GRID_SIZE_Y,
((
int
)
t.z
)
%
GRID_SIZE_Z,
0
)
;
pmeAtomGridIndex[i]
=
(
floa
t2
)
(
i,
gridIndex.x*GRID_SIZE_Y*GRID_SIZE_Z+gridIndex.y*GRID_SIZE_Z+gridIndex.z
)
;
pmeAtomGridIndex[i]
=
(
in
t2
)
(
i,
gridIndex.x*GRID_SIZE_Y*GRID_SIZE_Z+gridIndex.y*GRID_SIZE_Z+gridIndex.z
)
;
}
}
...
...
@@ -18,12 +18,12 @@ __kernel void updateGridIndexAndFraction(__global float4* posq, __global float2*
*
For
each
grid
point,
find
the
range
of
sorted
atoms
associated
with
that
point.
*/
__kernel
void
findAtomRangeForGrid
(
__global
floa
t2*
pmeAtomGridIndex,
__global
int*
pmeAtomRange
)
{
__kernel
void
findAtomRangeForGrid
(
__global
in
t2*
pmeAtomGridIndex,
__global
int*
pmeAtomRange
)
{
int
start
=
(
NUM_ATOMS*get_global_id
(
0
))
/get_global_size
(
0
)
;
int
end
=
(
NUM_ATOMS*
(
get_global_id
(
0
)
+1
))
/get_global_size
(
0
)
;
int
last
=
(
start
==
0
?
-1
:
pmeAtomGridIndex[start-1].y
)
;
for
(
int
i
=
start
; i < end; ++i) {
floa
t2
atomData
=
pmeAtomGridIndex[i]
;
in
t2
atomData
=
pmeAtomGridIndex[i]
;
int
gridIndex
=
atomData.y
;
if
(
gridIndex
!=
last
)
{
for
(
int
j
=
last+1
; j <= gridIndex; ++j)
...
...
@@ -41,7 +41,7 @@ __kernel void findAtomRangeForGrid(__global float2* pmeAtomGridIndex, __global i
}
}
__kernel
void
updateBsplines
(
__global
float4*
posq,
__global
float4*
pmeBsplineTheta,
__global
float4*
pmeBsplineDTheta,
__local
float4*
bsplinesCache,
__global
floa
t2*
pmeAtomGridIndex,
float4
periodicBoxSize,
float4
invPeriodicBoxSize
)
{
__kernel
void
updateBsplines
(
__global
float4*
posq,
__global
float4*
pmeBsplineTheta,
__global
float4*
pmeBsplineDTheta,
__local
float4*
bsplinesCache,
__global
in
t2*
pmeAtomGridIndex,
float4
periodicBoxSize,
float4
invPeriodicBoxSize
)
{
const
float4
scale
=
1.0f/
(
PME_ORDER-1
)
;
for
(
int
i
=
get_global_id
(
0
)
; i < NUM_ATOMS; i += get_global_size(0)) {
__local
float4*
data
=
&bsplinesCache[get_local_id
(
0
)
*PME_ORDER]
;
...
...
@@ -81,45 +81,71 @@ __kernel void updateBsplines(__global float4* posq, __global float4* pmeBsplineT
}
}
//
The
grid
index
won
't
be
needed
again.
Reuse
that
component
to
hold
the
atom
charge
,
thus
saving
//
an
extra
load
operation
in
the
charge
spreading
kernel.
//
The
grid
index
won
't
be
needed
again.
Reuse
that
component
to
hold
the
z
index
,
thus
saving
//
some
work
in
the
charge
spreading
kernel.
int
start
=
(
NUM_ATOMS*get_global_id
(
0
))
/get_global_size
(
0
)
;
int
end
=
(
NUM_ATOMS*
(
get_global_id
(
0
)
+1
))
/get_global_size
(
0
)
;
for
(
int
i
=
start
; i < end; ++i) {
float2
atomData
=
pmeAtomGridIndex[i]
;
pmeAtomGridIndex[i].y
=
posq[
(
int
)
atomData.x].w
;
float
posz
=
posq[pmeAtomGridIndex[i].x].z
;
posz
-=
floor
(
posz*invPeriodicBoxSize.z
)
*periodicBoxSize.z
;
int
z
=
((
int
)
((
posz*invPeriodicBoxSize.z
)
*GRID_SIZE_Z
))
%
GRID_SIZE_Z
;
pmeAtomGridIndex[i].y
=
z
;
}
}
__kernel
void
gridSpreadCharge
(
__global
float2*
pmeAtomGridIndex,
__global
int*
pmeAtomRange,
__global
float2*
pmeGrid,
__global
float4*
pmeBsplineTheta
)
{
__kernel
void
gridSpreadCharge
(
__global
float
4*
posq,
__global
int
2*
pmeAtomGridIndex,
__global
int*
pmeAtomRange,
__global
float2*
pmeGrid,
__global
float4*
pmeBsplineTheta
)
{
unsigned
int
numGridPoints
=
GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z
;
for
(
int
gridIndex
=
get_global_id
(
0
)
; gridIndex < numGridPoints; gridIndex += get_global_size(0)) {
//
Compute
the
charge
on
a
grid
point.
int4
gridPoint
;
gridPoint.x
=
gridIndex/
(
GRID_SIZE_Y*GRID_SIZE_Z
)
;
int
remainder
=
gridIndex-gridPoint.x*GRID_SIZE_Y*GRID_SIZE_Z
;
gridPoint.y
=
remainder/GRID_SIZE_Z
;
gridPoint.z
=
remainder-gridPoint.y*GRID_SIZE_Z
;
gridPoint.x
+=
GRID_SIZE_X
;
gridPoint.y
+=
GRID_SIZE_Y
;
gridPoint.z
+=
GRID_SIZE_Z
;
float
result
=
0.0f
;
for
(
int
ix
=
0
; ix < PME_ORDER; ++ix)
for
(
int
iy
=
0
; iy < PME_ORDER; ++iy)
for
(
int
iz
=
0
; iz < PME_ORDER; ++iz) {
int
x
=
(
gridPoint.x-ix
)
%GRID_SIZE_X
;
int
y
=
(
gridPoint.y-iy
)
%GRID_SIZE_Y
;
int
z
=
(
gridPoint.z-iz
)
%GRID_SIZE_Z
;
int
gridIndex
=
x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z
;
int
firstAtom
=
pmeAtomRange[gridIndex]
;
int
lastAtom
=
pmeAtomRange[gridIndex+1]
;
for
(
int
i
=
firstAtom
; i < lastAtom; ++i) {
float2
atomData
=
pmeAtomGridIndex[i]
;
//
Loop
over
all
atoms
that
affect
this
grid
point.
for
(
int
ix
=
0
; ix < PME_ORDER; ++ix) {
int
x
=
gridPoint.x-ix+
(
gridPoint.x
>=
ix
?
0
:
GRID_SIZE_X
)
;
for
(
int
iy
=
0
; iy < PME_ORDER; ++iy) {
int
y
=
gridPoint.y-iy+
(
gridPoint.y
>=
iy
?
0
:
GRID_SIZE_Y
)
;
int
z1
=
gridPoint.z-PME_ORDER+1
;
z1
+=
(
z1
>=
0
?
0
:
GRID_SIZE_Z
)
;
int
z2
=
(
z1
<
gridPoint.z
?
gridPoint.z
:
GRID_SIZE_Z-1
)
;
int
gridIndex1
=
x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z1
;
int
gridIndex2
=
x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z2
;
int
firstAtom
=
pmeAtomRange[gridIndex1]
;
int
lastAtom
=
pmeAtomRange[gridIndex2+1]
;
for
(
int
i
=
firstAtom
; i < lastAtom; ++i)
{
int2
atomData
=
pmeAtomGridIndex[i]
;
int
atomIndex
=
atomData.x
;
int
z
=
atomData.y
;
int
iz
=
gridPoint.z-z+
(
gridPoint.z
>=
z
?
0
:
GRID_SIZE_Z
)
;
float
atomCharge
=
posq[atomIndex].w
;
result
+=
atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z
;
}
if
(
z1
>
gridPoint.z
)
{
gridIndex1
=
x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z
;
gridIndex2
=
x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+gridPoint.z
;
firstAtom
=
pmeAtomRange[gridIndex1]
;
lastAtom
=
pmeAtomRange[gridIndex2+1]
;
for
(
int
i
=
firstAtom
; i < lastAtom; ++i)
{
int2
atomData
=
pmeAtomGridIndex[i]
;
int
atomIndex
=
atomData.x
;
float
atomCharge
=
atomData.y
;
int
z
=
atomData.y
;
int
iz
=
gridPoint.z-z+
(
gridPoint.z
>=
z
?
0
:
GRID_SIZE_Z
)
;
float
atomCharge
=
posq[atomIndex].w
;
result
+=
atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z
;
}
}
}
}
pmeGrid[gridIndex]
=
(
float2
)
(
result*EPSILON_FACTOR,
0.0f
)
;
}
}
...
...
@@ -168,15 +194,18 @@ __kernel void gridInterpolateForce(__global float4* posq, __global float4* force
((
int
)
t.y
)
%
GRID_SIZE_Y,
((
int
)
t.z
)
%
GRID_SIZE_Z,
0
)
;
for
(
int
ix
=
0
; ix < PME_ORDER; ix++) {
int
xindex
=
(
gridIndex.x
+
ix
)
%
GRID_SIZE_X
;
int
xindex
=
gridIndex.x+ix
;
xindex
-=
(
xindex
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
)
;
float
tx
=
pmeBsplineTheta[atom+ix*NUM_ATOMS].x
;
float
dtx
=
pmeBsplineDTheta[atom+ix*NUM_ATOMS].x
;
for
(
int
iy
=
0
; iy < PME_ORDER; iy++) {
int
yindex
=
(
gridIndex.y
+
iy
)
%
GRID_SIZE_Y
;
int
yindex
=
gridIndex.y+iy
;
yindex
-=
(
yindex
>=
GRID_SIZE_Y
?
GRID_SIZE_Y
:
0
)
;
float
ty
=
pmeBsplineTheta[atom+iy*NUM_ATOMS].y
;
float
dty
=
pmeBsplineDTheta[atom+iy*NUM_ATOMS].y
;
for
(
int
iz
=
0
; iz < PME_ORDER; iz++) {
int
zindex
=
(
gridIndex.z
+
iz
)
%
GRID_SIZE_Z
;
int
zindex
=
gridIndex.z+iz
;
zindex
-=
(
zindex
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
)
;
float
tz
=
pmeBsplineTheta[atom+iz*NUM_ATOMS].z
;
float
dtz
=
pmeBsplineDTheta[atom+iz*NUM_ATOMS].z
;
int
index
=
xindex*GRID_SIZE_Y*GRID_SIZE_Z
+
yindex*GRID_SIZE_Z
+
zindex
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment