Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
592dc5a9
"platforms/cpu/include/AlignedArray.h" did not exist on "47e03b07cd37464b93db2c69c2a36932eae60b97"
Commit
592dc5a9
authored
Sep 08, 2011
by
Peter Eastman
Browse files
Optimizations to PME reciprocal space calculation
parent
c08d8a53
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
86 additions
and
52 deletions
+86
-52
platforms/opencl/src/OpenCLFFT3D.cpp
platforms/opencl/src/OpenCLFFT3D.cpp
+1
-1
platforms/opencl/src/OpenCLKernels.cpp
platforms/opencl/src/OpenCLKernels.cpp
+9
-14
platforms/opencl/src/OpenCLKernels.h
platforms/opencl/src/OpenCLKernels.h
+2
-3
platforms/opencl/src/kernels/pme.cl
platforms/opencl/src/kernels/pme.cl
+74
-34
No files found.
platforms/opencl/src/OpenCLFFT3D.cpp
View file @
592dc5a9
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* *
* Portions copyright (c) 2009 Stanford University and the Authors.
*
* Portions copyright (c) 2009
-2011
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Authors: Peter Eastman *
* Contributors: *
* Contributors: *
* *
* *
...
...
platforms/opencl/src/OpenCLKernels.cpp
View file @
592dc5a9
...
@@ -1149,8 +1149,6 @@ OpenCLCalcNonbondedForceKernel::~OpenCLCalcNonbondedForceKernel() {
...
@@ -1149,8 +1149,6 @@ OpenCLCalcNonbondedForceKernel::~OpenCLCalcNonbondedForceKernel() {
delete
pmeBsplineModuliZ
;
delete
pmeBsplineModuliZ
;
if
(
pmeBsplineTheta
!=
NULL
)
if
(
pmeBsplineTheta
!=
NULL
)
delete
pmeBsplineTheta
;
delete
pmeBsplineTheta
;
if
(
pmeBsplineDtheta
!=
NULL
)
delete
pmeBsplineDtheta
;
if
(
pmeAtomRange
!=
NULL
)
if
(
pmeAtomRange
!=
NULL
)
delete
pmeAtomRange
;
delete
pmeAtomRange
;
if
(
pmeAtomGridIndex
!=
NULL
)
if
(
pmeAtomGridIndex
!=
NULL
)
...
@@ -1273,7 +1271,6 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
...
@@ -1273,7 +1271,6 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
pmeBsplineModuliY
=
new
OpenCLArray
<
cl_float
>
(
cl
,
gridSizeY
,
"pmeBsplineModuliY"
);
pmeBsplineModuliY
=
new
OpenCLArray
<
cl_float
>
(
cl
,
gridSizeY
,
"pmeBsplineModuliY"
);
pmeBsplineModuliZ
=
new
OpenCLArray
<
cl_float
>
(
cl
,
gridSizeZ
,
"pmeBsplineModuliZ"
);
pmeBsplineModuliZ
=
new
OpenCLArray
<
cl_float
>
(
cl
,
gridSizeZ
,
"pmeBsplineModuliZ"
);
pmeBsplineTheta
=
new
OpenCLArray
<
mm_float4
>
(
cl
,
PmeOrder
*
numParticles
,
"pmeBsplineTheta"
);
pmeBsplineTheta
=
new
OpenCLArray
<
mm_float4
>
(
cl
,
PmeOrder
*
numParticles
,
"pmeBsplineTheta"
);
pmeBsplineDtheta
=
new
OpenCLArray
<
mm_float4
>
(
cl
,
PmeOrder
*
numParticles
,
"pmeBsplineDtheta"
);
pmeAtomRange
=
new
OpenCLArray
<
cl_int
>
(
cl
,
gridSizeX
*
gridSizeY
*
gridSizeZ
+
1
,
"pmeAtomRange"
);
pmeAtomRange
=
new
OpenCLArray
<
cl_int
>
(
cl
,
gridSizeX
*
gridSizeY
*
gridSizeZ
+
1
,
"pmeAtomRange"
);
pmeAtomGridIndex
=
new
OpenCLArray
<
mm_int2
>
(
cl
,
numParticles
,
"pmeAtomGridIndex"
);
pmeAtomGridIndex
=
new
OpenCLArray
<
mm_int2
>
(
cl
,
numParticles
,
"pmeAtomGridIndex"
);
sort
=
new
OpenCLSort
<
mm_int2
>
(
cl
,
cl
.
getNumAtoms
(),
"int2"
,
"value.y"
);
sort
=
new
OpenCLSort
<
mm_int2
>
(
cl
,
cl
.
getNumAtoms
(),
"int2"
,
"value.y"
);
...
@@ -1411,9 +1408,8 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
...
@@ -1411,9 +1408,8 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeInterpolateForceKernel
=
cl
::
Kernel
(
program
,
"gridInterpolateForce"
);
pmeInterpolateForceKernel
=
cl
::
Kernel
(
program
,
"gridInterpolateForce"
);
pmeUpdateBsplinesKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
cl
.
getPosq
().
getDeviceBuffer
());
pmeUpdateBsplinesKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
cl
.
getPosq
().
getDeviceBuffer
());
pmeUpdateBsplinesKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
pmeBsplineTheta
->
getDeviceBuffer
());
pmeUpdateBsplinesKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
pmeBsplineTheta
->
getDeviceBuffer
());
pmeUpdateBsplinesKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
pmeBsplineDtheta
->
getDeviceBuffer
());
pmeUpdateBsplinesKernel
.
setArg
(
2
,
OpenCLContext
::
ThreadBlockSize
*
PmeOrder
*
sizeof
(
mm_float4
),
NULL
);
pmeUpdateBsplinesKernel
.
setArg
(
3
,
2
*
OpenCLContext
::
ThreadBlockSize
*
PmeOrder
*
sizeof
(
mm_float4
),
NULL
);
pmeUpdateBsplinesKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
pmeAtomGridIndex
->
getDeviceBuffer
());
pmeUpdateBsplinesKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
pmeAtomGridIndex
->
getDeviceBuffer
());
pmeAtomRangeKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
pmeAtomGridIndex
->
getDeviceBuffer
());
pmeAtomRangeKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
pmeAtomGridIndex
->
getDeviceBuffer
());
pmeAtomRangeKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
pmeAtomRange
->
getDeviceBuffer
());
pmeAtomRangeKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
pmeAtomRange
->
getDeviceBuffer
());
pmeAtomRangeKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
cl
.
getPosq
().
getDeviceBuffer
());
pmeAtomRangeKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
cl
.
getPosq
().
getDeviceBuffer
());
...
@@ -1429,9 +1425,8 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
...
@@ -1429,9 +1425,8 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeConvolutionKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
pmeBsplineModuliZ
->
getDeviceBuffer
());
pmeConvolutionKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
pmeBsplineModuliZ
->
getDeviceBuffer
());
pmeInterpolateForceKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
cl
.
getPosq
().
getDeviceBuffer
());
pmeInterpolateForceKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
cl
.
getPosq
().
getDeviceBuffer
());
pmeInterpolateForceKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
cl
.
getForceBuffers
().
getDeviceBuffer
());
pmeInterpolateForceKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
cl
.
getForceBuffers
().
getDeviceBuffer
());
pmeInterpolateForceKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
pmeBsplineTheta
->
getDeviceBuffer
());
pmeInterpolateForceKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
pmeGrid
->
getDeviceBuffer
());
pmeInterpolateForceKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
pmeBsplineDtheta
->
getDeviceBuffer
());
pmeInterpolateForceKernel
.
setArg
(
5
,
2
*
128
*
PmeOrder
*
sizeof
(
mm_float4
),
NULL
);
pmeInterpolateForceKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
pmeGrid
->
getDeviceBuffer
());
if
(
cl
.
getSupports64BitGlobalAtomics
())
{
if
(
cl
.
getSupports64BitGlobalAtomics
())
{
pmeFinishSpreadChargeKernel
=
cl
::
Kernel
(
program
,
"finishSpreadCharge"
);
pmeFinishSpreadChargeKernel
=
cl
::
Kernel
(
program
,
"finishSpreadCharge"
);
pmeFinishSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
pmeGrid
->
getDeviceBuffer
());
pmeFinishSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
pmeGrid
->
getDeviceBuffer
());
...
@@ -1454,8 +1449,8 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
...
@@ -1454,8 +1449,8 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
if
(
pmeGrid
!=
NULL
&&
cl
.
getContextIndex
()
==
0
)
{
if
(
pmeGrid
!=
NULL
&&
cl
.
getContextIndex
()
==
0
)
{
mm_float4
boxSize
=
cl
.
getPeriodicBoxSize
();
mm_float4
boxSize
=
cl
.
getPeriodicBoxSize
();
mm_float4
invBoxSize
=
cl
.
getInvPeriodicBoxSize
();
mm_float4
invBoxSize
=
cl
.
getInvPeriodicBoxSize
();
pmeUpdateBsplinesKernel
.
setArg
<
mm_float4
>
(
5
,
boxSize
);
pmeUpdateBsplinesKernel
.
setArg
<
mm_float4
>
(
4
,
boxSize
);
pmeUpdateBsplinesKernel
.
setArg
<
mm_float4
>
(
6
,
invBoxSize
);
pmeUpdateBsplinesKernel
.
setArg
<
mm_float4
>
(
5
,
invBoxSize
);
cl
.
executeKernel
(
pmeUpdateBsplinesKernel
,
cl
.
getNumAtoms
());
cl
.
executeKernel
(
pmeUpdateBsplinesKernel
,
cl
.
getNumAtoms
());
if
(
deviceIsCpu
)
{
if
(
deviceIsCpu
)
{
pmeSpreadChargeKernel
.
setArg
<
mm_float4
>
(
5
,
boxSize
);
pmeSpreadChargeKernel
.
setArg
<
mm_float4
>
(
5
,
boxSize
);
...
@@ -1482,9 +1477,9 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
...
@@ -1482,9 +1477,9 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeConvolutionKernel
.
setArg
<
cl_float
>
(
6
,
(
float
)
(
1.0
/
(
M_PI
*
boxSize
.
x
*
boxSize
.
y
*
boxSize
.
z
)));
pmeConvolutionKernel
.
setArg
<
cl_float
>
(
6
,
(
float
)
(
1.0
/
(
M_PI
*
boxSize
.
x
*
boxSize
.
y
*
boxSize
.
z
)));
cl
.
executeKernel
(
pmeConvolutionKernel
,
cl
.
getNumAtoms
());
cl
.
executeKernel
(
pmeConvolutionKernel
,
cl
.
getNumAtoms
());
fft
->
execFFT
(
*
pmeGrid2
,
*
pmeGrid
,
false
);
fft
->
execFFT
(
*
pmeGrid2
,
*
pmeGrid
,
false
);
pmeInterpolateForceKernel
.
setArg
<
mm_float4
>
(
5
,
boxSize
);
pmeInterpolateForceKernel
.
setArg
<
mm_float4
>
(
3
,
boxSize
);
pmeInterpolateForceKernel
.
setArg
<
mm_float4
>
(
6
,
invBoxSize
);
pmeInterpolateForceKernel
.
setArg
<
mm_float4
>
(
4
,
invBoxSize
);
cl
.
executeKernel
(
pmeInterpolateForceKernel
,
cl
.
getNumAtoms
());
cl
.
executeKernel
(
pmeInterpolateForceKernel
,
cl
.
getNumAtoms
()
,
128
);
}
}
double
energy
=
ewaldSelfEnergy
;
double
energy
=
ewaldSelfEnergy
;
if
(
dispersionCoefficient
!=
0.0
)
{
if
(
dispersionCoefficient
!=
0.0
)
{
...
...
platforms/opencl/src/OpenCLKernels.h
View file @
592dc5a9
...
@@ -475,8 +475,8 @@ private:
...
@@ -475,8 +475,8 @@ private:
class
OpenCLCalcNonbondedForceKernel
:
public
CalcNonbondedForceKernel
{
class
OpenCLCalcNonbondedForceKernel
:
public
CalcNonbondedForceKernel
{
public:
public:
OpenCLCalcNonbondedForceKernel
(
std
::
string
name
,
const
Platform
&
platform
,
OpenCLContext
&
cl
,
System
&
system
)
:
CalcNonbondedForceKernel
(
name
,
platform
),
OpenCLCalcNonbondedForceKernel
(
std
::
string
name
,
const
Platform
&
platform
,
OpenCLContext
&
cl
,
System
&
system
)
:
CalcNonbondedForceKernel
(
name
,
platform
),
hasInitializedKernel
(
false
),
cl
(
cl
),
sigmaEpsilon
(
NULL
),
exceptionParams
(
NULL
),
exceptionIndices
(
NULL
),
cosSinSums
(
NULL
),
pmeGrid
(
NULL
),
pmeGrid2
(
NULL
),
hasInitializedKernel
(
false
),
cl
(
cl
),
sigmaEpsilon
(
NULL
),
exceptionParams
(
NULL
),
exceptionIndices
(
NULL
),
cosSinSums
(
NULL
),
pmeGrid
(
NULL
),
pmeBsplineModuliX
(
NULL
),
pmeBsplineModuliY
(
NULL
),
pmeBsplineModuliZ
(
NULL
),
pmeBsplineTheta
(
NULL
),
pmeBsplineDtheta
(
NULL
),
pmeAtomRange
(
NULL
),
pmeGrid2
(
NULL
),
pmeBsplineModuliX
(
NULL
),
pmeBsplineModuliY
(
NULL
),
pmeBsplineModuliZ
(
NULL
),
pmeBsplineTheta
(
NULL
),
pmeAtomRange
(
NULL
),
pmeAtomGridIndex
(
NULL
),
sort
(
NULL
),
fft
(
NULL
)
{
pmeAtomGridIndex
(
NULL
),
sort
(
NULL
),
fft
(
NULL
)
{
}
}
~
OpenCLCalcNonbondedForceKernel
();
~
OpenCLCalcNonbondedForceKernel
();
...
@@ -509,7 +509,6 @@ private:
...
@@ -509,7 +509,6 @@ private:
OpenCLArray
<
cl_float
>*
pmeBsplineModuliY
;
OpenCLArray
<
cl_float
>*
pmeBsplineModuliY
;
OpenCLArray
<
cl_float
>*
pmeBsplineModuliZ
;
OpenCLArray
<
cl_float
>*
pmeBsplineModuliZ
;
OpenCLArray
<
mm_float4
>*
pmeBsplineTheta
;
OpenCLArray
<
mm_float4
>*
pmeBsplineTheta
;
OpenCLArray
<
mm_float4
>*
pmeBsplineDtheta
;
OpenCLArray
<
cl_int
>*
pmeAtomRange
;
OpenCLArray
<
cl_int
>*
pmeAtomRange
;
OpenCLArray
<
mm_int2
>*
pmeAtomGridIndex
;
OpenCLArray
<
mm_int2
>*
pmeAtomGridIndex
;
OpenCLSort
<
mm_int2
>*
sort
;
OpenCLSort
<
mm_int2
>*
sort
;
...
...
platforms/opencl/src/kernels/pme.cl
View file @
592dc5a9
__kernel
void
updateBsplines
(
__global
float4*
posq,
__global
float4*
pmeBsplineTheta,
__global
float4*
pmeBsplineDTheta,
__local
float4*
bsplinesCache,
__global
int2*
pmeAtomGridIndex,
float4
periodicBoxSize,
float4
invPeriodicBoxSize
)
{
__kernel
void
updateBsplines
(
__global
float4*
posq,
__global
float4*
pmeBsplineTheta,
__local
float4*
bsplinesCache,
__global
int2*
pmeAtomGridIndex,
float4
periodicBoxSize,
float4
invPeriodicBoxSize
)
{
const
float4
scale
=
1.0f/
(
PME_ORDER-1
)
;
const
float4
scale
=
1.0f/
(
PME_ORDER-1
)
;
for
(
int
i
=
get_global_id
(
0
)
; i < NUM_ATOMS; i += get_global_size(0)) {
for
(
int
i
=
get_global_id
(
0
)
; i < NUM_ATOMS; i += get_global_size(0)) {
__local
float4*
data
=
&bsplinesCache[get_local_id
(
0
)
*PME_ORDER]
;
__local
float4*
data
=
&bsplinesCache[get_local_id
(
0
)
*PME_ORDER]
;
__local
float4*
ddata
=
&bsplinesCache[get_local_id
(
0
)
*PME_ORDER
+
get_local_size
(
0
)
*PME_ORDER]
;
for
(
int
j
=
0
; j < PME_ORDER; j++) {
data[j]
=
0.0f
;
ddata[j]
=
0.0f
;
}
float4
pos
=
posq[i]
;
float4
pos
=
posq[i]
;
pos.x
-=
floor
(
pos.x*invPeriodicBoxSize.x
)
*periodicBoxSize.x
;
pos.x
-=
floor
(
pos.x*invPeriodicBoxSize.x
)
*periodicBoxSize.x
;
pos.y
-=
floor
(
pos.y*invPeriodicBoxSize.y
)
*periodicBoxSize.y
;
pos.y
-=
floor
(
pos.y*invPeriodicBoxSize.y
)
*periodicBoxSize.y
;
...
@@ -29,9 +24,6 @@ __kernel void updateBsplines(__global float4* posq, __global float4* pmeBsplineT
...
@@ -29,9 +24,6 @@ __kernel void updateBsplines(__global float4* posq, __global float4* pmeBsplineT
data[j-k-1]
=
div*
((
dr+
(
float4
)
k
)
*data[j-k-2]
+
(
-dr+
(
float4
)
(
j-k
))
*data[j-k-1]
)
;
data[j-k-1]
=
div*
((
dr+
(
float4
)
k
)
*data[j-k-2]
+
(
-dr+
(
float4
)
(
j-k
))
*data[j-k-1]
)
;
data[0]
=
div*
(
-
dr+1.0f
)
*data[0]
;
data[0]
=
div*
(
-
dr+1.0f
)
*data[0]
;
}
}
ddata[0]
=
-data[0]
;
for
(
int
j
=
1
; j < PME_ORDER; j++)
ddata[j]
=
data[j-1]-data[j]
;
data[PME_ORDER-1]
=
scale*dr*data[PME_ORDER-2]
;
data[PME_ORDER-1]
=
scale*dr*data[PME_ORDER-2]
;
for
(
int
j
=
1
; j < (PME_ORDER-1); j++)
for
(
int
j
=
1
; j < (PME_ORDER-1); j++)
data[PME_ORDER-j-1]
=
scale*
((
dr+
(
float4
)
j
)
*data[PME_ORDER-j-2]
+
(
-dr+
(
float4
)
(
PME_ORDER-j
))
*data[PME_ORDER-j-1]
)
;
data[PME_ORDER-j-1]
=
scale*
((
dr+
(
float4
)
j
)
*data[PME_ORDER-j-2]
+
(
-dr+
(
float4
)
(
PME_ORDER-j
))
*data[PME_ORDER-j-1]
)
;
...
@@ -39,7 +31,6 @@ __kernel void updateBsplines(__global float4* posq, __global float4* pmeBsplineT
...
@@ -39,7 +31,6 @@ __kernel void updateBsplines(__global float4* posq, __global float4* pmeBsplineT
for
(
int
j
=
0
; j < PME_ORDER; j++) {
for
(
int
j
=
0
; j < PME_ORDER; j++) {
data[j].w
=
pos.w
; // Storing the charge here improves cache coherency in the charge spreading kernel
data[j].w
=
pos.w
; // Storing the charge here improves cache coherency in the charge spreading kernel
pmeBsplineTheta[i+j*NUM_ATOMS]
=
data[j]
;
pmeBsplineTheta[i+j*NUM_ATOMS]
=
data[j]
;
pmeBsplineDTheta[i+j*NUM_ATOMS]
=
ddata[j]
;
}
}
}
}
}
}
...
@@ -81,28 +72,54 @@ __kernel void findAtomRangeForGrid(__global int2* pmeAtomGridIndex, __global int
...
@@ -81,28 +72,54 @@ __kernel void findAtomRangeForGrid(__global int2* pmeAtomGridIndex, __global int
#
ifdef
SUPPORTS_64_BIT_ATOMICS
#
ifdef
SUPPORTS_64_BIT_ATOMICS
#
pragma
OPENCL
EXTENSION
cl_khr_int64_base_atomics
:
enable
#
pragma
OPENCL
EXTENSION
cl_khr_int64_base_atomics
:
enable
#
define
BUFFER_SIZE
(
PME_ORDER*PME_ORDER*PME_ORDER
)
__kernel
__attribute__
((
reqd_work_group_size
(
BUFFER_SIZE,
1
,
1
)))
__kernel
void
gridSpreadCharge
(
__global
float4*
posq,
__global
int2*
pmeAtomGridIndex,
__global
int*
pmeAtomRange,
__global
long*
pmeGrid,
__global
float4*
pmeBsplineTheta,
float4
periodicBoxSize,
float4
invPeriodicBoxSize
)
{
__kernel
void
gridSpreadCharge
(
__global
float4*
posq,
__global
int2*
pmeAtomGridIndex,
__global
int*
pmeAtomRange,
__global
long*
pmeGrid,
__global
float4*
pmeBsplineTheta,
float4
periodicBoxSize,
float4
invPeriodicBoxSize
)
{
int
ix
=
get_local_id
(
0
)
/
(
PME_ORDER*PME_ORDER
)
;
int
ix
=
get_local_id
(
0
)
/
(
PME_ORDER*PME_ORDER
)
;
int
remainder
=
get_local_id
(
0
)
-ix*PME_ORDER*PME_ORDER
;
int
remainder
=
get_local_id
(
0
)
-ix*PME_ORDER*PME_ORDER
;
int
iy
=
remainder/PME_ORDER
;
int
iy
=
remainder/PME_ORDER
;
int
iz
=
remainder-iy*PME_ORDER
;
int
iz
=
remainder-iy*PME_ORDER
;
__local
float4
theta[PME_ORDER]
;
__local
float
charge[BUFFER_SIZE]
;
__local
int
basex[BUFFER_SIZE]
;
__local
int
basey[BUFFER_SIZE]
;
__local
int
basez[BUFFER_SIZE]
;
if
(
ix
<
PME_ORDER
)
{
if
(
ix
<
PME_ORDER
)
{
for
(
int
atomIndex
=
get_group_id
(
0
)
; atomIndex < NUM_ATOMS; atomIndex += get_num_groups(0)) {
for
(
int
baseIndex
=
get_group_id
(
0
)
*BUFFER_SIZE
; baseIndex < NUM_ATOMS; baseIndex += get_num_groups(0)*BUFFER_SIZE) {
//
Load
the
next
block
of
atoms
into
the
buffers.
if
(
get_local_id
(
0
)
<
BUFFER_SIZE
)
{
int
atomIndex
=
baseIndex+get_local_id
(
0
)
;
if
(
atomIndex
<
NUM_ATOMS
)
{
float4
pos
=
posq[atomIndex]
;
float4
pos
=
posq[atomIndex]
;
float
atomCharge
=
pos.w
;
charge[get_local_id
(
0
)
]
=
pos.w
;
float
add
=
atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z
;
pos.x
-=
floor
(
pos.x*invPeriodicBoxSize.x
)
*periodicBoxSize.x
;
pos.x
-=
floor
(
pos.x*invPeriodicBoxSize.x
)
*periodicBoxSize.x
;
pos.y
-=
floor
(
pos.y*invPeriodicBoxSize.y
)
*periodicBoxSize.y
;
pos.y
-=
floor
(
pos.y*invPeriodicBoxSize.y
)
*periodicBoxSize.y
;
pos.z
-=
floor
(
pos.z*invPeriodicBoxSize.z
)
*periodicBoxSize.z
;
pos.z
-=
floor
(
pos.z*invPeriodicBoxSize.z
)
*periodicBoxSize.z
;
int
x
=
(
int
)
((
pos.x*invPeriodicBoxSize.x
)
*GRID_SIZE_X
)
+ix
;
basex[get_local_id
(
0
)
]
=
(
int
)
((
pos.x*invPeriodicBoxSize.x
)
*GRID_SIZE_X
)
;
int
y
=
(
int
)
((
pos.y*invPeriodicBoxSize.y
)
*GRID_SIZE_Y
)
+iy
;
basey[get_local_id
(
0
)
]
=
(
int
)
((
pos.y*invPeriodicBoxSize.y
)
*GRID_SIZE_Y
)
;
int
z
=
(
int
)
((
pos.z*invPeriodicBoxSize.z
)
*GRID_SIZE_Z
)
+iz
;
basez[get_local_id
(
0
)
]
=
(
int
)
((
pos.z*invPeriodicBoxSize.z
)
*GRID_SIZE_Z
)
;
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
int
lastIndex
=
min
(
BUFFER_SIZE,
NUM_ATOMS-baseIndex
)
;
for
(
int
index
=
0
; index < lastIndex; index++) {
int
atomIndex
=
index+baseIndex
;
if
(
get_local_id
(
0
)
<
PME_ORDER
)
theta[get_local_id
(
0
)
]
=
pmeBsplineTheta[atomIndex+get_local_id
(
0
)
*NUM_ATOMS]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
float
add
=
charge[index]*theta[ix].x*theta[iy].y*theta[iz].z
;
int
x
=
basex[index]+ix
;
int
y
=
basey[index]+iy
;
int
z
=
basez[index]+iz
;
x
-=
(
x
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
)
;
x
-=
(
x
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
)
;
y
-=
(
y
>=
GRID_SIZE_Y
?
GRID_SIZE_Y
:
0
)
;
y
-=
(
y
>=
GRID_SIZE_Y
?
GRID_SIZE_Y
:
0
)
;
z
-=
(
z
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
)
;
z
-=
(
z
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
)
;
atom_add
(
&pmeGrid[x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z],
(
long
)
(
add*0xFFFFFFFF
))
;
atom_add
(
&pmeGrid[x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z],
(
long
)
(
add*0xFFFFFFFF
))
;
}
}
}
}
}
}
}
__kernel
void
finishSpreadCharge
(
__global
long*
pmeGrid
)
{
__kernel
void
finishSpreadCharge
(
__global
long*
pmeGrid
)
{
...
@@ -203,7 +220,11 @@ __kernel void reciprocalConvolution(__global float2* pmeGrid, __global float* en
...
@@ -203,7 +220,11 @@ __kernel void reciprocalConvolution(__global float2* pmeGrid, __global float* en
energyBuffer[get_global_id
(
0
)
]
+=
0.5f*energy
;
energyBuffer[get_global_id
(
0
)
]
+=
0.5f*energy
;
}
}
__kernel
void
gridInterpolateForce
(
__global
float4*
posq,
__global
float4*
forceBuffers,
__global
float4*
pmeBsplineTheta,
__global
float4*
pmeBsplineDTheta,
__global
float2*
pmeGrid,
float4
periodicBoxSize,
float4
invPeriodicBoxSize
)
{
__kernel
__attribute__
((
reqd_work_group_size
(
128
,
1
,
1
)))
__kernel
void
gridInterpolateForce
(
__global
float4*
posq,
__global
float4*
forceBuffers,
__global
float2*
pmeGrid,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
__local
float4*
bsplinesCache
)
{
const
float4
scale
=
1.0f/
(
PME_ORDER-1
)
;
__local
float4*
data
=
&bsplinesCache[get_local_id
(
0
)
*PME_ORDER]
;
__local
float4*
ddata
=
&bsplinesCache[get_local_id
(
0
)
*PME_ORDER
+
get_local_size
(
0
)
*PME_ORDER]
;
for
(
int
atom
=
get_global_id
(
0
)
; atom < NUM_ATOMS; atom += get_global_size(0)) {
for
(
int
atom
=
get_global_id
(
0
)
; atom < NUM_ATOMS; atom += get_global_size(0)) {
float4
force
=
0.0f
;
float4
force
=
0.0f
;
float4
pos
=
posq[atom]
;
float4
pos
=
posq[atom]
;
...
@@ -216,26 +237,45 @@ __kernel void gridInterpolateForce(__global float4* posq, __global float4* force
...
@@ -216,26 +237,45 @@ __kernel void gridInterpolateForce(__global float4* posq, __global float4* force
int4
gridIndex
=
(
int4
)
(((
int
)
t.x
)
%
GRID_SIZE_X,
int4
gridIndex
=
(
int4
)
(((
int
)
t.x
)
%
GRID_SIZE_X,
((
int
)
t.y
)
%
GRID_SIZE_Y,
((
int
)
t.y
)
%
GRID_SIZE_Y,
((
int
)
t.z
)
%
GRID_SIZE_Z,
0
)
;
((
int
)
t.z
)
%
GRID_SIZE_Z,
0
)
;
//
Since
we
need
the
full
set
of
thetas,
it
's
faster
to
compute
them
here
than
load
them
//
from
global
memory.
float4
dr
=
(
float4
)
(
t.x-
(
int
)
t.x,
t.y-
(
int
)
t.y,
t.z-
(
int
)
t.z,
0.0f
)
;
data[PME_ORDER-1]
=
0.0f
;
data[1]
=
dr
;
data[0]
=
1.0f-dr
;
for
(
int
j
=
3
; j < PME_ORDER; j++) {
float
div
=
1.0f/
(
j-1.0f
)
;
data[j-1]
=
div*dr*data[j-2]
;
for
(
int
k
=
1
; k < (j-1); k++)
data[j-k-1]
=
div*
((
dr+
(
float4
)
k
)
*data[j-k-2]
+
(
-dr+
(
float4
)
(
j-k
))
*data[j-k-1]
)
;
data[0]
=
div*
(
-
dr+1.0f
)
*data[0]
;
}
ddata[0]
=
-data[0]
;
for
(
int
j
=
1
; j < PME_ORDER; j++)
ddata[j]
=
data[j-1]-data[j]
;
data[PME_ORDER-1]
=
scale*dr*data[PME_ORDER-2]
;
for
(
int
j
=
1
; j < (PME_ORDER-1); j++)
data[PME_ORDER-j-1]
=
scale*
((
dr+
(
float4
)
j
)
*data[PME_ORDER-j-2]
+
(
-dr+
(
float4
)
(
PME_ORDER-j
))
*data[PME_ORDER-j-1]
)
;
data[0]
=
scale*
(
-dr+1.0f
)
*data[0]
;
//
Compute
the
force
on
this
atom.
for
(
int
ix
=
0
; ix < PME_ORDER; ix++) {
for
(
int
ix
=
0
; ix < PME_ORDER; ix++) {
int
xindex
=
gridIndex.x+ix
;
int
xindex
=
gridIndex.x+ix
;
xindex
-=
(
xindex
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
)
;
xindex
-=
(
xindex
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
)
;
float
tx
=
pmeBsplineTheta[atom+ix*NUM_ATOMS].x
;
float
dtx
=
pmeBsplineDTheta[atom+ix*NUM_ATOMS].x
;
for
(
int
iy
=
0
; iy < PME_ORDER; iy++) {
for
(
int
iy
=
0
; iy < PME_ORDER; iy++) {
int
yindex
=
gridIndex.y+iy
;
int
yindex
=
gridIndex.y+iy
;
yindex
-=
(
yindex
>=
GRID_SIZE_Y
?
GRID_SIZE_Y
:
0
)
;
yindex
-=
(
yindex
>=
GRID_SIZE_Y
?
GRID_SIZE_Y
:
0
)
;
float
ty
=
pmeBsplineTheta[atom+iy*NUM_ATOMS].y
;
float
dty
=
pmeBsplineDTheta[atom+iy*NUM_ATOMS].y
;
for
(
int
iz
=
0
; iz < PME_ORDER; iz++) {
for
(
int
iz
=
0
; iz < PME_ORDER; iz++) {
int
zindex
=
gridIndex.z+iz
;
int
zindex
=
gridIndex.z+iz
;
zindex
-=
(
zindex
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
)
;
zindex
-=
(
zindex
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
)
;
float
tz
=
pmeBsplineTheta[atom+iz*NUM_ATOMS].z
;
float
dtz
=
pmeBsplineDTheta[atom+iz*NUM_ATOMS].z
;
int
index
=
xindex*GRID_SIZE_Y*GRID_SIZE_Z
+
yindex*GRID_SIZE_Z
+
zindex
;
int
index
=
xindex*GRID_SIZE_Y*GRID_SIZE_Z
+
yindex*GRID_SIZE_Z
+
zindex
;
float
gridvalue
=
pmeGrid[index].x
;
float
gridvalue
=
pmeGrid[index].x
;
force.x
+=
d
tx*ty*t
z*gridvalue
;
force.x
+=
d
data[ix].x*data[iy].y*data[iz].
z*gridvalue
;
force.y
+=
tx*dty*t
z*gridvalue
;
force.y
+=
data[ix].x*ddata[iy].y*data[iz].
z*gridvalue
;
force.z
+=
tx*ty*dt
z*gridvalue
;
force.z
+=
data[ix].x*data[iy].y*ddata[iz].
z*gridvalue
;
}
}
}
}
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment