Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
98c30a3f
"platforms/common/src/BondedUtilities.cpp" did not exist on "eb64fa2f15b33dbad5b4de818d3e93bec8e6298e"
Commit
98c30a3f
authored
Jun 10, 2010
by
Peter Eastman
Browse files
Major optimizations to PME performance
parent
b880b5d9
Changes
9
Show whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
120 additions
and
65 deletions
+120
-65
platforms/cuda/src/kernels/bbsort.cu
platforms/cuda/src/kernels/bbsort.cu
+3
-3
platforms/cuda/src/kernels/bbsort_kernel.cu
platforms/cuda/src/kernels/bbsort_kernel.cu
+2
-2
platforms/cuda/src/kernels/cudatypes.h
platforms/cuda/src/kernels/cudatypes.h
+1
-1
platforms/cuda/src/kernels/gpu.cpp
platforms/cuda/src/kernels/gpu.cpp
+1
-1
platforms/cuda/src/kernels/gputypes.h
platforms/cuda/src/kernels/gputypes.h
+1
-1
platforms/cuda/src/kernels/kCalculatePME.cu
platforms/cuda/src/kernels/kCalculatePME.cu
+46
-21
platforms/opencl/src/OpenCLKernels.cpp
platforms/opencl/src/OpenCLKernels.cpp
+7
-6
platforms/opencl/src/OpenCLKernels.h
platforms/opencl/src/OpenCLKernels.h
+2
-2
platforms/opencl/src/kernels/pme.cl
platforms/opencl/src/kernels/pme.cl
+57
-28
No files found.
platforms/cuda/src/kernels/bbsort.cu
View file @
98c30a3f
...
...
@@ -17,7 +17,7 @@
#include "bbsort_kernel.cu"
floa
t
getValue
(
floa
t2
v
){
in
t
getValue
(
in
t2
v
){
return
v
.
y
;
}
...
...
@@ -115,7 +115,7 @@ void reduceMinMax(T* dData,int size,float& result,bool isMax)
CUDA_SAFE_CALL
(
cudaMemcpy
(
&
originalResult
,
dData
,
sizeof
(
T
),
cudaMemcpyDeviceToHost
));
result
=
(
floa
t
)
getValue
(
originalResult
);
result
=
(
in
t
)
getValue
(
originalResult
);
}
template
<
typename
T
>
...
...
@@ -305,7 +305,7 @@ Also note that you need to use 1.3 capbility (use arch=sm_13 in your compile com
*************************************************************************************/
template
<
>
void
bbSort
(
floa
t2
*
dData
,
int
size
,
int
listOrder
)
void
bbSort
(
in
t2
*
dData
,
int
size
,
int
listOrder
)
{
bbSortBody
(
dData
,
size
,
listOrder
);
...
...
platforms/cuda/src/kernels/bbsort_kernel.cu
View file @
98c30a3f
...
...
@@ -19,7 +19,7 @@ texture<unsigned int, 1, cudaReadModeElementType> tBucketOffsets;
texture
<
unsigned
int
,
1
,
cudaReadModeElementType
>
tBucketOfSlices
;
texture
<
unsigned
int
,
1
,
cudaReadModeElementType
>
tSliceOffsetInBucket
;
__device__
floa
t
dGetValue
(
floa
t2
v
){
__device__
in
t
dGetValue
(
in
t2
v
){
return
v
.
y
;
}
...
...
@@ -29,7 +29,7 @@ __device__ T dGetValue(T v){
}
__device__
void
dPad
(
floa
t2
&
v
){
__device__
void
dPad
(
in
t2
&
v
){
v
.
x
=
0x3fffffff
;
v
.
y
=
0x4fffffff
;
}
...
...
platforms/cuda/src/kernels/cudatypes.h
View file @
98c30a3f
...
...
@@ -420,7 +420,7 @@ struct cudaGmxSimulation {
float4
*
pPmeBsplineTheta
;
float4
*
pPmeBsplineDtheta
;
int
*
pPmeAtomRange
;
// The range of sorted atoms at each grid point
floa
t2
*
pPmeAtomGridIndex
;
// The grid point each atom is at
in
t2
*
pPmeAtomGridIndex
;
// The grid point each atom is at
unsigned
int
bonds
;
// Number of bonds
int4
*
pBondID
;
// Bond atom and output buffer IDs
float2
*
pBondParameter
;
// Bond parameters
...
...
platforms/cuda/src/kernels/gpu.cpp
View file @
98c30a3f
...
...
@@ -994,7 +994,7 @@ void gpuSetPMEParameters(gpuContext gpu, float alpha, int gridSizeX, int gridSiz
gpu
->
sim
.
pPmeBsplineDtheta
=
gpu
->
psPmeBsplineDtheta
->
_pDevData
;
gpu
->
psPmeAtomRange
=
new
CUDAStream
<
int
>
(
gridSize
.
x
*
gridSize
.
y
*
gridSize
.
z
+
1
,
1
,
"PmeAtomRange"
);
gpu
->
sim
.
pPmeAtomRange
=
gpu
->
psPmeAtomRange
->
_pDevData
;
gpu
->
psPmeAtomGridIndex
=
new
CUDAStream
<
floa
t2
>
(
gpu
->
natoms
,
1
,
"PmeAtomGridIndex"
);
gpu
->
psPmeAtomGridIndex
=
new
CUDAStream
<
in
t2
>
(
gpu
->
natoms
,
1
,
"PmeAtomGridIndex"
);
gpu
->
sim
.
pPmeAtomGridIndex
=
gpu
->
psPmeAtomGridIndex
->
_pDevData
;
tabulateErfc
(
gpu
);
...
...
platforms/cuda/src/kernels/gputypes.h
View file @
98c30a3f
...
...
@@ -122,7 +122,7 @@ struct _gpuContext {
CUDAStream
<
float4
>*
psPmeBsplineTheta
;
CUDAStream
<
float4
>*
psPmeBsplineDtheta
;
CUDAStream
<
int
>*
psPmeAtomRange
;
// The range of sorted atoms at each grid point
CUDAStream
<
floa
t2
>*
psPmeAtomGridIndex
;
// The grid point each atom is at
CUDAStream
<
in
t2
>*
psPmeAtomGridIndex
;
// The grid point each atom is at
CUDAStream
<
float2
>*
psObcData
;
CUDAStream
<
float4
>*
psGBVIData
;
CUDAStream
<
float
>*
psObcChain
;
...
...
platforms/cuda/src/kernels/kCalculatePME.cu
View file @
98c30a3f
...
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors.
*
* Portions copyright (c) 2009
-2010
Stanford University and the Authors. *
* Authors: Erik Lindahl, Rossen Apostolov, Szilard Pall, Peter Eastman *
* Contributors: *
* *
...
...
@@ -117,7 +117,7 @@ void kUpdateGridIndexAndFraction_kernel()
int3
gridIndex
=
make_int3
(((
int
)
t
.
x
)
%
cSim
.
pmeGridSize
.
x
,
((
int
)
t
.
y
)
%
cSim
.
pmeGridSize
.
y
,
((
int
)
t
.
z
)
%
cSim
.
pmeGridSize
.
z
);
cSim
.
pPmeAtomGridIndex
[
i
]
=
make_
floa
t2
(
i
,
gridIndex
.
x
*
cSim
.
pmeGridSize
.
y
*
cSim
.
pmeGridSize
.
z
+
gridIndex
.
y
*
cSim
.
pmeGridSize
.
z
+
gridIndex
.
z
);
cSim
.
pPmeAtomGridIndex
[
i
]
=
make_
in
t2
(
i
,
gridIndex
.
x
*
cSim
.
pmeGridSize
.
y
*
cSim
.
pmeGridSize
.
z
+
gridIndex
.
y
*
cSim
.
pmeGridSize
.
z
+
gridIndex
.
z
);
}
}
...
...
@@ -141,7 +141,7 @@ void kFindAtomRangeForGrid_kernel()
int
last
=
(
start
==
0
?
-
1
:
cSim
.
pPmeAtomGridIndex
[
start
-
1
].
y
);
for
(
int
i
=
start
;
i
<
end
;
++
i
)
{
floa
t2
atomData
=
cSim
.
pPmeAtomGridIndex
[
i
];
in
t2
atomData
=
cSim
.
pPmeAtomGridIndex
[
i
];
int
gridIndex
=
atomData
.
y
;
if
(
gridIndex
!=
last
)
{
...
...
@@ -150,10 +150,13 @@ void kFindAtomRangeForGrid_kernel()
last
=
gridIndex
;
}
// The grid index won't be needed again. Reuse that component to hold the
atom charge
, thus saving
//
an extra load operation
in the charge spreading kernel.
// The grid index won't be needed again. Reuse that component to hold the
z index
, thus saving
//
some work
in the charge spreading kernel.
cSim
.
pPmeAtomGridIndex
[
i
].
y
=
cSim
.
pPosq
[(
int
)
atomData
.
x
].
w
;
float
posz
=
cSim
.
pPosq
[
atomData
.
x
].
z
;
posz
-=
floor
(
posz
*
cSim
.
invPeriodicBoxSizeZ
)
*
cSim
.
periodicBoxSizeZ
;
int
z
=
((
int
)
((
posz
*
cSim
.
invPeriodicBoxSizeZ
)
*
cSim
.
pmeGridSize
.
z
))
%
cSim
.
pmeGridSize
.
z
;
cSim
.
pPmeAtomGridIndex
[
i
].
y
=
z
;
}
// Fill in values beyond the last atom.
...
...
@@ -266,27 +269,46 @@ void kGridSpreadCharge_kernel()
int
remainder
=
gridIndex
-
gridPoint
.
x
*
cSim
.
pmeGridSize
.
y
*
cSim
.
pmeGridSize
.
z
;
gridPoint
.
y
=
remainder
/
cSim
.
pmeGridSize
.
z
;
gridPoint
.
z
=
remainder
-
gridPoint
.
y
*
cSim
.
pmeGridSize
.
z
;
gridPoint
.
x
+=
cSim
.
pmeGridSize
.
x
;
gridPoint
.
y
+=
cSim
.
pmeGridSize
.
y
;
gridPoint
.
z
+=
cSim
.
pmeGridSize
.
z
;
float
result
=
0.0
f
;
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
++
ix
)
{
int
x
=
gridPoint
.
x
-
ix
+
(
gridPoint
.
x
>=
ix
?
0
:
cSim
.
pmeGridSize
.
x
);
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
++
iy
)
for
(
int
iz
=
0
;
iz
<
PME_ORDER
;
++
iz
)
{
int
x
=
(
gridPoint
.
x
-
ix
)
%
cSim
.
pmeGridSize
.
x
;
int
y
=
(
gridPoint
.
y
-
iy
)
%
cSim
.
pmeGridSize
.
y
;
int
z
=
(
gridPoint
.
z
-
iz
)
%
cSim
.
pmeGridSize
.
z
;
int
gridIndex
=
x
*
cSim
.
pmeGridSize
.
y
*
cSim
.
pmeGridSize
.
z
+
y
*
cSim
.
pmeGridSize
.
z
+
z
;
int
firstAtom
=
cSim
.
pPmeAtomRange
[
gridIndex
];
int
lastAtom
=
cSim
.
pPmeAtomRange
[
gridIndex
+
1
];
int
y
=
gridPoint
.
y
-
iy
+
(
gridPoint
.
y
>=
iy
?
0
:
cSim
.
pmeGridSize
.
y
);
int
z1
=
gridPoint
.
z
-
PME_ORDER
+
1
;
z1
+=
(
z1
>=
0
?
0
:
cSim
.
pmeGridSize
.
z
);
int
z2
=
(
z1
<
gridPoint
.
z
?
gridPoint
.
z
:
cSim
.
pmeGridSize
.
z
-
1
);
int
gridIndex1
=
x
*
cSim
.
pmeGridSize
.
y
*
cSim
.
pmeGridSize
.
z
+
y
*
cSim
.
pmeGridSize
.
z
+
z1
;
int
gridIndex2
=
x
*
cSim
.
pmeGridSize
.
y
*
cSim
.
pmeGridSize
.
z
+
y
*
cSim
.
pmeGridSize
.
z
+
z2
;
int
firstAtom
=
cSim
.
pPmeAtomRange
[
gridIndex1
];
int
lastAtom
=
cSim
.
pPmeAtomRange
[
gridIndex2
+
1
];
for
(
int
i
=
firstAtom
;
i
<
lastAtom
;
++
i
)
{
floa
t2
atomData
=
cSim
.
pPmeAtomGridIndex
[
i
];
in
t2
atomData
=
cSim
.
pPmeAtomGridIndex
[
i
];
int
atomIndex
=
atomData
.
x
;
float
atomCharge
=
atomData
.
y
;
int
z
=
atomData
.
y
;
int
iz
=
gridPoint
.
z
-
z
+
(
gridPoint
.
z
>=
z
?
0
:
cSim
.
pmeGridSize
.
z
);
float
atomCharge
=
cSim
.
pPosq
[
atomIndex
].
w
;
result
+=
atomCharge
*
tex1Dfetch
(
bsplineThetaRef
,
atomIndex
+
ix
*
cSim
.
atoms
).
x
*
tex1Dfetch
(
bsplineThetaRef
,
atomIndex
+
iy
*
cSim
.
atoms
).
y
*
tex1Dfetch
(
bsplineThetaRef
,
atomIndex
+
iz
*
cSim
.
atoms
).
z
;
}
if
(
z1
>
gridPoint
.
z
)
{
gridIndex1
=
x
*
cSim
.
pmeGridSize
.
y
*
cSim
.
pmeGridSize
.
z
+
y
*
cSim
.
pmeGridSize
.
z
;
gridIndex2
=
x
*
cSim
.
pmeGridSize
.
y
*
cSim
.
pmeGridSize
.
z
+
y
*
cSim
.
pmeGridSize
.
z
+
gridPoint
.
z
;
firstAtom
=
cSim
.
pPmeAtomRange
[
gridIndex1
];
lastAtom
=
cSim
.
pPmeAtomRange
[
gridIndex2
+
1
];
for
(
int
i
=
firstAtom
;
i
<
lastAtom
;
++
i
)
{
int2
atomData
=
cSim
.
pPmeAtomGridIndex
[
i
];
int
atomIndex
=
atomData
.
x
;
int
z
=
atomData
.
y
;
int
iz
=
gridPoint
.
z
-
z
+
(
gridPoint
.
z
>=
z
?
0
:
cSim
.
pmeGridSize
.
z
);
float
atomCharge
=
cSim
.
pPosq
[
atomIndex
].
w
;
result
+=
atomCharge
*
tex1Dfetch
(
bsplineThetaRef
,
atomIndex
+
ix
*
cSim
.
atoms
).
x
*
tex1Dfetch
(
bsplineThetaRef
,
atomIndex
+
iy
*
cSim
.
atoms
).
y
*
tex1Dfetch
(
bsplineThetaRef
,
atomIndex
+
iz
*
cSim
.
atoms
).
z
;
}
}
}
}
cSim
.
pPmeGrid
[
gridIndex
]
=
make_cuComplex
(
result
*
sqrt
(
cSim
.
epsfac
),
0.0
f
);
}
...
...
@@ -358,17 +380,20 @@ void kGridInterpolateForce_kernel()
((
int
)
t
.
z
)
%
cSim
.
pmeGridSize
.
z
);
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
int
xindex
=
(
gridIndex
.
x
+
ix
)
%
cSim
.
pmeGridSize
.
x
;
int
xindex
=
gridIndex
.
x
+
ix
;
xindex
-=
(
xindex
>=
cSim
.
pmeGridSize
.
x
?
cSim
.
pmeGridSize
.
x
:
0
);
float
tx
=
cSim
.
pPmeBsplineTheta
[
atom
+
ix
*
cSim
.
atoms
].
x
;
float
dtx
=
cSim
.
pPmeBsplineDtheta
[
atom
+
ix
*
cSim
.
atoms
].
x
;
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
int
yindex
=
(
gridIndex
.
y
+
iy
)
%
cSim
.
pmeGridSize
.
y
;
int
yindex
=
gridIndex
.
y
+
iy
;
yindex
-=
(
yindex
>=
cSim
.
pmeGridSize
.
y
?
cSim
.
pmeGridSize
.
y
:
0
);
float
ty
=
cSim
.
pPmeBsplineTheta
[
atom
+
iy
*
cSim
.
atoms
].
y
;
float
dty
=
cSim
.
pPmeBsplineDtheta
[
atom
+
iy
*
cSim
.
atoms
].
y
;
for
(
int
iz
=
0
;
iz
<
PME_ORDER
;
iz
++
)
{
int
zindex
=
(
gridIndex
.
z
+
iz
)
%
cSim
.
pmeGridSize
.
z
;
int
zindex
=
gridIndex
.
z
+
iz
;
zindex
-=
(
zindex
>=
cSim
.
pmeGridSize
.
z
?
cSim
.
pmeGridSize
.
z
:
0
);
float
tz
=
cSim
.
pPmeBsplineTheta
[
atom
+
iz
*
cSim
.
atoms
].
z
;
float
dtz
=
cSim
.
pPmeBsplineDtheta
[
atom
+
iz
*
cSim
.
atoms
].
z
;
int
index
=
xindex
*
cSim
.
pmeGridSize
.
y
*
cSim
.
pmeGridSize
.
z
+
yindex
*
cSim
.
pmeGridSize
.
z
+
zindex
;
...
...
platforms/opencl/src/OpenCLKernels.cpp
View file @
98c30a3f
...
...
@@ -1151,8 +1151,8 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
pmeBsplineTheta
=
new
OpenCLArray
<
mm_float4
>
(
cl
,
PmeOrder
*
numParticles
,
"pmeBsplineTheta"
);
pmeBsplineDtheta
=
new
OpenCLArray
<
mm_float4
>
(
cl
,
PmeOrder
*
numParticles
,
"pmeBsplineDtheta"
);
pmeAtomRange
=
new
OpenCLArray
<
cl_int
>
(
cl
,
gridSizeX
*
gridSizeY
*
gridSizeZ
+
1
,
"pmeAtomRange"
);
pmeAtomGridIndex
=
new
OpenCLArray
<
mm_
floa
t2
>
(
cl
,
numParticles
,
"pmeAtomGridIndex"
);
sort
=
new
OpenCLSort
<
mm_
floa
t2
>
(
cl
,
cl
.
getNumAtoms
(),
"
floa
t2"
,
"value.y"
);
pmeAtomGridIndex
=
new
OpenCLArray
<
mm_
in
t2
>
(
cl
,
numParticles
,
"pmeAtomGridIndex"
);
sort
=
new
OpenCLSort
<
mm_
in
t2
>
(
cl
,
cl
.
getNumAtoms
(),
"
in
t2"
,
"value.y"
);
fft
=
new
OpenCLFFT3D
(
cl
,
gridSizeX
,
gridSizeY
,
gridSizeZ
);
// Initialize the b-spline moduli.
...
...
@@ -1303,10 +1303,11 @@ void OpenCLCalcNonbondedForceKernel::executeForces(ContextImpl& context) {
pmeUpdateBsplinesKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
pmeBsplineDtheta
->
getDeviceBuffer
());
pmeUpdateBsplinesKernel
.
setArg
(
3
,
2
*
OpenCLContext
::
ThreadBlockSize
*
PmeOrder
*
sizeof
(
mm_float4
),
NULL
);
pmeUpdateBsplinesKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
pmeAtomGridIndex
->
getDeviceBuffer
());
pmeSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
pmeAtomGridIndex
->
getDeviceBuffer
());
pmeSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
pmeAtomRange
->
getDeviceBuffer
());
pmeSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
pmeGrid
->
getDeviceBuffer
());
pmeSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
pmeBsplineTheta
->
getDeviceBuffer
());
pmeSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
cl
.
getPosq
().
getDeviceBuffer
());
pmeSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
pmeAtomGridIndex
->
getDeviceBuffer
());
pmeSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
pmeAtomRange
->
getDeviceBuffer
());
pmeSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
pmeGrid
->
getDeviceBuffer
());
pmeSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
pmeBsplineTheta
->
getDeviceBuffer
());
pmeConvolutionKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
pmeGrid
->
getDeviceBuffer
());
pmeConvolutionKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
cl
.
getEnergyBuffer
().
getDeviceBuffer
());
pmeConvolutionKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
pmeBsplineModuliX
->
getDeviceBuffer
());
...
...
platforms/opencl/src/OpenCLKernels.h
View file @
98c30a3f
...
...
@@ -491,9 +491,9 @@ private:
OpenCLArray
<
mm_float4
>*
pmeBsplineTheta
;
OpenCLArray
<
mm_float4
>*
pmeBsplineDtheta
;
OpenCLArray
<
cl_int
>*
pmeAtomRange
;
OpenCLArray
<
mm_
floa
t2
>*
pmeAtomGridIndex
;
OpenCLArray
<
mm_
in
t2
>*
pmeAtomGridIndex
;
OpenCLArray
<
cl_float
>*
erfcTable
;
OpenCLSort
<
mm_
floa
t2
>*
sort
;
OpenCLSort
<
mm_
in
t2
>*
sort
;
OpenCLFFT3D
*
fft
;
cl
::
Kernel
exceptionsKernel
;
cl
::
Kernel
ewaldSumsKernel
;
...
...
platforms/opencl/src/kernels/pme.cl
View file @
98c30a3f
__kernel
void
updateGridIndexAndFraction
(
__global
float4*
posq,
__global
floa
t2*
pmeAtomGridIndex,
float4
periodicBoxSize,
float4
invPeriodicBoxSize
)
{
__kernel
void
updateGridIndexAndFraction
(
__global
float4*
posq,
__global
in
t2*
pmeAtomGridIndex,
float4
periodicBoxSize,
float4
invPeriodicBoxSize
)
{
for
(
int
i
=
get_global_id
(
0
)
; i < NUM_ATOMS; i += get_global_size(0)) {
float4
pos
=
posq[i]
;
pos.x
-=
floor
(
pos.x*invPeriodicBoxSize.x
)
*periodicBoxSize.x
;
...
...
@@ -10,7 +10,7 @@ __kernel void updateGridIndexAndFraction(__global float4* posq, __global float2*
int4
gridIndex
=
(
int4
)
(((
int
)
t.x
)
%
GRID_SIZE_X,
((
int
)
t.y
)
%
GRID_SIZE_Y,
((
int
)
t.z
)
%
GRID_SIZE_Z,
0
)
;
pmeAtomGridIndex[i]
=
(
floa
t2
)
(
i,
gridIndex.x*GRID_SIZE_Y*GRID_SIZE_Z+gridIndex.y*GRID_SIZE_Z+gridIndex.z
)
;
pmeAtomGridIndex[i]
=
(
in
t2
)
(
i,
gridIndex.x*GRID_SIZE_Y*GRID_SIZE_Z+gridIndex.y*GRID_SIZE_Z+gridIndex.z
)
;
}
}
...
...
@@ -18,12 +18,12 @@ __kernel void updateGridIndexAndFraction(__global float4* posq, __global float2*
*
For
each
grid
point,
find
the
range
of
sorted
atoms
associated
with
that
point.
*/
__kernel
void
findAtomRangeForGrid
(
__global
floa
t2*
pmeAtomGridIndex,
__global
int*
pmeAtomRange
)
{
__kernel
void
findAtomRangeForGrid
(
__global
in
t2*
pmeAtomGridIndex,
__global
int*
pmeAtomRange
)
{
int
start
=
(
NUM_ATOMS*get_global_id
(
0
))
/get_global_size
(
0
)
;
int
end
=
(
NUM_ATOMS*
(
get_global_id
(
0
)
+1
))
/get_global_size
(
0
)
;
int
last
=
(
start
==
0
?
-1
:
pmeAtomGridIndex[start-1].y
)
;
for
(
int
i
=
start
; i < end; ++i) {
floa
t2
atomData
=
pmeAtomGridIndex[i]
;
in
t2
atomData
=
pmeAtomGridIndex[i]
;
int
gridIndex
=
atomData.y
;
if
(
gridIndex
!=
last
)
{
for
(
int
j
=
last+1
; j <= gridIndex; ++j)
...
...
@@ -41,7 +41,7 @@ __kernel void findAtomRangeForGrid(__global float2* pmeAtomGridIndex, __global i
}
}
__kernel
void
updateBsplines
(
__global
float4*
posq,
__global
float4*
pmeBsplineTheta,
__global
float4*
pmeBsplineDTheta,
__local
float4*
bsplinesCache,
__global
floa
t2*
pmeAtomGridIndex,
float4
periodicBoxSize,
float4
invPeriodicBoxSize
)
{
__kernel
void
updateBsplines
(
__global
float4*
posq,
__global
float4*
pmeBsplineTheta,
__global
float4*
pmeBsplineDTheta,
__local
float4*
bsplinesCache,
__global
in
t2*
pmeAtomGridIndex,
float4
periodicBoxSize,
float4
invPeriodicBoxSize
)
{
const
float4
scale
=
1.0f/
(
PME_ORDER-1
)
;
for
(
int
i
=
get_global_id
(
0
)
; i < NUM_ATOMS; i += get_global_size(0)) {
__local
float4*
data
=
&bsplinesCache[get_local_id
(
0
)
*PME_ORDER]
;
...
...
@@ -81,45 +81,71 @@ __kernel void updateBsplines(__global float4* posq, __global float4* pmeBsplineT
}
}
//
The
grid
index
won
't
be
needed
again.
Reuse
that
component
to
hold
the
atom
charge
,
thus
saving
//
an
extra
load
operation
in
the
charge
spreading
kernel.
//
The
grid
index
won
't
be
needed
again.
Reuse
that
component
to
hold
the
z
index
,
thus
saving
//
some
work
in
the
charge
spreading
kernel.
int
start
=
(
NUM_ATOMS*get_global_id
(
0
))
/get_global_size
(
0
)
;
int
end
=
(
NUM_ATOMS*
(
get_global_id
(
0
)
+1
))
/get_global_size
(
0
)
;
for
(
int
i
=
start
; i < end; ++i) {
float2
atomData
=
pmeAtomGridIndex[i]
;
pmeAtomGridIndex[i].y
=
posq[
(
int
)
atomData.x].w
;
float
posz
=
posq[pmeAtomGridIndex[i].x].z
;
posz
-=
floor
(
posz*invPeriodicBoxSize.z
)
*periodicBoxSize.z
;
int
z
=
((
int
)
((
posz*invPeriodicBoxSize.z
)
*GRID_SIZE_Z
))
%
GRID_SIZE_Z
;
pmeAtomGridIndex[i].y
=
z
;
}
}
__kernel
void
gridSpreadCharge
(
__global
float2*
pmeAtomGridIndex,
__global
int*
pmeAtomRange,
__global
float2*
pmeGrid,
__global
float4*
pmeBsplineTheta
)
{
__kernel
void
gridSpreadCharge
(
__global
float
4*
posq,
__global
int
2*
pmeAtomGridIndex,
__global
int*
pmeAtomRange,
__global
float2*
pmeGrid,
__global
float4*
pmeBsplineTheta
)
{
unsigned
int
numGridPoints
=
GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z
;
for
(
int
gridIndex
=
get_global_id
(
0
)
; gridIndex < numGridPoints; gridIndex += get_global_size(0)) {
//
Compute
the
charge
on
a
grid
point.
int4
gridPoint
;
gridPoint.x
=
gridIndex/
(
GRID_SIZE_Y*GRID_SIZE_Z
)
;
int
remainder
=
gridIndex-gridPoint.x*GRID_SIZE_Y*GRID_SIZE_Z
;
gridPoint.y
=
remainder/GRID_SIZE_Z
;
gridPoint.z
=
remainder-gridPoint.y*GRID_SIZE_Z
;
gridPoint.x
+=
GRID_SIZE_X
;
gridPoint.y
+=
GRID_SIZE_Y
;
gridPoint.z
+=
GRID_SIZE_Z
;
float
result
=
0.0f
;
for
(
int
ix
=
0
; ix < PME_ORDER; ++ix)
for
(
int
iy
=
0
; iy < PME_ORDER; ++iy)
for
(
int
iz
=
0
; iz < PME_ORDER; ++iz) {
int
x
=
(
gridPoint.x-ix
)
%GRID_SIZE_X
;
int
y
=
(
gridPoint.y-iy
)
%GRID_SIZE_Y
;
int
z
=
(
gridPoint.z-iz
)
%GRID_SIZE_Z
;
int
gridIndex
=
x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z
;
int
firstAtom
=
pmeAtomRange[gridIndex]
;
int
lastAtom
=
pmeAtomRange[gridIndex+1]
;
for
(
int
i
=
firstAtom
; i < lastAtom; ++i) {
float2
atomData
=
pmeAtomGridIndex[i]
;
//
Loop
over
all
atoms
that
affect
this
grid
point.
for
(
int
ix
=
0
; ix < PME_ORDER; ++ix) {
int
x
=
gridPoint.x-ix+
(
gridPoint.x
>=
ix
?
0
:
GRID_SIZE_X
)
;
for
(
int
iy
=
0
; iy < PME_ORDER; ++iy) {
int
y
=
gridPoint.y-iy+
(
gridPoint.y
>=
iy
?
0
:
GRID_SIZE_Y
)
;
int
z1
=
gridPoint.z-PME_ORDER+1
;
z1
+=
(
z1
>=
0
?
0
:
GRID_SIZE_Z
)
;
int
z2
=
(
z1
<
gridPoint.z
?
gridPoint.z
:
GRID_SIZE_Z-1
)
;
int
gridIndex1
=
x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z1
;
int
gridIndex2
=
x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z2
;
int
firstAtom
=
pmeAtomRange[gridIndex1]
;
int
lastAtom
=
pmeAtomRange[gridIndex2+1]
;
for
(
int
i
=
firstAtom
; i < lastAtom; ++i)
{
int2
atomData
=
pmeAtomGridIndex[i]
;
int
atomIndex
=
atomData.x
;
int
z
=
atomData.y
;
int
iz
=
gridPoint.z-z+
(
gridPoint.z
>=
z
?
0
:
GRID_SIZE_Z
)
;
float
atomCharge
=
posq[atomIndex].w
;
result
+=
atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z
;
}
if
(
z1
>
gridPoint.z
)
{
gridIndex1
=
x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z
;
gridIndex2
=
x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+gridPoint.z
;
firstAtom
=
pmeAtomRange[gridIndex1]
;
lastAtom
=
pmeAtomRange[gridIndex2+1]
;
for
(
int
i
=
firstAtom
; i < lastAtom; ++i)
{
int2
atomData
=
pmeAtomGridIndex[i]
;
int
atomIndex
=
atomData.x
;
float
atomCharge
=
atomData.y
;
int
z
=
atomData.y
;
int
iz
=
gridPoint.z-z+
(
gridPoint.z
>=
z
?
0
:
GRID_SIZE_Z
)
;
float
atomCharge
=
posq[atomIndex].w
;
result
+=
atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z
;
}
}
}
}
pmeGrid[gridIndex]
=
(
float2
)
(
result*EPSILON_FACTOR,
0.0f
)
;
}
}
...
...
@@ -168,15 +194,18 @@ __kernel void gridInterpolateForce(__global float4* posq, __global float4* force
((
int
)
t.y
)
%
GRID_SIZE_Y,
((
int
)
t.z
)
%
GRID_SIZE_Z,
0
)
;
for
(
int
ix
=
0
; ix < PME_ORDER; ix++) {
int
xindex
=
(
gridIndex.x
+
ix
)
%
GRID_SIZE_X
;
int
xindex
=
gridIndex.x+ix
;
xindex
-=
(
xindex
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
)
;
float
tx
=
pmeBsplineTheta[atom+ix*NUM_ATOMS].x
;
float
dtx
=
pmeBsplineDTheta[atom+ix*NUM_ATOMS].x
;
for
(
int
iy
=
0
; iy < PME_ORDER; iy++) {
int
yindex
=
(
gridIndex.y
+
iy
)
%
GRID_SIZE_Y
;
int
yindex
=
gridIndex.y+iy
;
yindex
-=
(
yindex
>=
GRID_SIZE_Y
?
GRID_SIZE_Y
:
0
)
;
float
ty
=
pmeBsplineTheta[atom+iy*NUM_ATOMS].y
;
float
dty
=
pmeBsplineDTheta[atom+iy*NUM_ATOMS].y
;
for
(
int
iz
=
0
; iz < PME_ORDER; iz++) {
int
zindex
=
(
gridIndex.z
+
iz
)
%
GRID_SIZE_Z
;
int
zindex
=
gridIndex.z+iz
;
zindex
-=
(
zindex
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
)
;
float
tz
=
pmeBsplineTheta[atom+iz*NUM_ATOMS].z
;
float
dtz
=
pmeBsplineDTheta[atom+iz*NUM_ATOMS].z
;
int
index
=
xindex*GRID_SIZE_Y*GRID_SIZE_Z
+
yindex*GRID_SIZE_Z
+
zindex
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment